From eaf57b45611f3f4926883e7751641e715f315c77 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Morten=20L=C3=B8nskov?=
Date: Thu, 11 Jun 2026 10:22:31 +0200
Subject: [PATCH 1/2] Add a generation for sitemap to the doc site

---
 build-docs.py                      |   9 +-
 build_scripts/config_loader.py     |  33 +++++
 build_scripts/gen_redirects.py     |  10 ++
 build_scripts/gen_sitemap_index.py | 197 +++++++++++++++++++++++++++++
 docfx-template.json                |   5 +
 metadata/build-config.json         |  15 ++-
 6 files changed, 267 insertions(+), 2 deletions(-)
 create mode 100644 build_scripts/gen_sitemap_index.py

diff --git a/build-docs.py b/build-docs.py
index 1b7d517bf..05b0f8b26 100644
--- a/build-docs.py
+++ b/build-docs.py
@@ -329,7 +329,14 @@ def main() -> int:
         [sys.executable, "build_scripts/gen_staticwebapp_config.py"],
         "Generating staticwebapp.config.json"
     )
-    
+
+    # Generate site-wide sitemap index (references each language's sitemap.xml)
+    # and robots.txt. Runs last so all per-language sitemaps already exist.
+    run_command(
+        [sys.executable, "build_scripts/gen_sitemap_index.py"],
+        "Generating sitemap index and robots.txt"
+    )
+
     print(f"\n{'='*60}")
     print(" Build complete!")
     print(f"{'='*60}")
diff --git a/build_scripts/config_loader.py b/build_scripts/config_loader.py
index ec69f3dae..a3bd7c228 100644
--- a/build_scripts/config_loader.py
+++ b/build_scripts/config_loader.py
@@ -152,6 +152,39 @@ def get_default_language(config: dict | None = None) -> str:
     return config.get("defaultLanguage", "en")
 
 
+def get_base_url(config: dict | None = None) -> str:
+    """Get the canonical base URL of the published site, without trailing slash.
+
+    Used for canonical/hreflang tags, per-language sitemap baseUrls, and the
+    site-wide sitemap index. Languages are served under /{code}/ beneath this.
+    """
+    if config is None:
+        config = load_build_config()
+    return config.get("baseUrl", "https://docs.tabulareditor.com").rstrip("/")
+
+
+def get_sitemap_downrank(config: dict | None = None) -> list[dict]:
+    """Get sitemap downrank rules: a list of {match, priority} dicts.
+
+    Each rule lowers the <priority> of sitemap URLs whose path contains the
+    'match' substring. Rules are evaluated in order; the first match wins.
+    """
+    if config is None:
+        config = load_build_config()
+    return config.get("sitemap", {}).get("downrank", [])
+
+
+def get_sitemap_exclude(config: dict | None = None) -> list[dict]:
+    """Get sitemap exclude rules: a list of {match} dicts.
+
+    Each rule removes sitemap URLs whose path contains the 'match' substring.
+    Used to drop legacy Tabular Editor 2-only pages from the published sitemap.
+    """
+    if config is None:
+        config = load_build_config()
+    return config.get("sitemap", {}).get("exclude", [])
+
+
 def compute_file_hash(file_path: Path | str) -> str:
     """Compute SHA256 hash of a file's contents.
 
diff --git a/build_scripts/gen_redirects.py b/build_scripts/gen_redirects.py
index 7f7eaf3e3..1645abb00 100644
--- a/build_scripts/gen_redirects.py
+++ b/build_scripts/gen_redirects.py
@@ -19,6 +19,8 @@ import sys
 
 import traceback
 
+from config_loader import get_base_url, get_default_language
+
 
 def get_available_languages() -> list[str]:
     """Scan localizedContent/ folder and return list of language codes (excluding 'en')."""
@@ -69,6 +71,10 @@ def generate_localized_config(template: dict, lang: str) -> dict:
     # Set output destination to language subfolder (relative to project root)
     # From localizedContent/{lang}/, we go up twice to reach project root
     build["dest"] = f"../../_site/{lang}"
+
+    # The published sitemap covers the default (English) language only, so
+    # non-default languages do not emit their own sitemap.xml.
+    build.pop("sitemap", None)
 
     # Update template paths - need to go up two levels to reach project root
     if "template" in build:
@@ -113,6 +119,10 @@ def generate_redirects_config(template: dict) -> dict:
     # Set English output destination (relative to localizedContent/en/)
     config["build"]["dest"] = "../../_site/en"
 
+    # Point the sitemap at the default language's URL prefix
+    if "sitemap" in config["build"]:
+        config["build"]["sitemap"]["baseUrl"] = f"{get_base_url()}/{get_default_language()}"
+
     # Update template paths - need to go up two levels to reach project root
     if "template" in config["build"]:
         new_templates = []
diff --git a/build_scripts/gen_sitemap_index.py b/build_scripts/gen_sitemap_index.py
new file mode 100644
index 000000000..c438e4bc4
--- /dev/null
+++ b/build_scripts/gen_sitemap_index.py
@@ -0,0 +1,197 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+Post-process the English sitemap and generate the site-wide entry point.
+
+DocFX emits a sitemap for the default (English) language at
+_site/{default_lang}/sitemap.xml (configured via build.sitemap in the generated
+docfx.json). Non-default languages do not emit a sitemap. This script:
+
+  1. Removes Tabular Editor 2-only URLs from the English sitemap per the
+     'exclude' rules in build-config.json.
+  2. Downranks selected URLs in the English sitemap (API reference pages) per
+     the 'downrank' rules in build-config.json.
+  3. Writes _site/sitemap.xml: a <sitemapindex> pointing at the English sitemap
+     (the published sitemap covers English only).
+  4. Writes _site/robots.txt: allows all crawlers and advertises the index.
+
+Run after the English build so _site/{default_lang}/sitemap.xml exists.
+
+Usage:
+    python gen_sitemap_index.py              # Process _site/
+    python gen_sitemap_index.py --dry-run    # Preview without writing
+"""
+
+import argparse
+import xml.etree.ElementTree as ET
+from pathlib import Path
+
+from config_loader import (
+    get_base_url,
+    get_default_language,
+    get_sitemap_downrank,
+    get_sitemap_exclude,
+)
+
+
+SITEMAP_NS = "http://www.sitemaps.org/schemas/sitemap/0.9"
+
+
+def apply_exclude(sitemap_path: Path, rules: list[dict], dry_run: bool = False) -> int:
+    """Remove <url> entries whose <loc> matches an exclude rule. Returns count removed.
+
+    A rule matches when its 'match' substring appears anywhere in the URL's
+    <loc>. Used to drop Tabular Editor 2-only pages from the published sitemap.
+    """
+    if not rules or not sitemap_path.exists():
+        return 0
+
+    ET.register_namespace("", SITEMAP_NS)
+    tree = ET.parse(sitemap_path)
+    root = tree.getroot()
+
+    matches = [m for r in rules if (m := r.get("match"))]
+
+    removed = 0
+    for url in list(root.findall(f"{{{SITEMAP_NS}}}url")):
+        loc_el = url.find(f"{{{SITEMAP_NS}}}loc")
+        if loc_el is None or not loc_el.text:
+            continue
+        if any(m in loc_el.text for m in matches):
+            root.remove(url)
+            removed += 1
+
+    if removed and not dry_run:
+        tree.write(sitemap_path, encoding="utf-8", xml_declaration=True)
+
+    return removed
+
+
+def apply_downrank(sitemap_path: Path, rules: list[dict], dry_run: bool = False) -> int:
+    """Lower <priority> for URLs matching downrank rules. Returns count changed.
+
+    A rule matches when its 'match' substring appears anywhere in the URL's
+    <loc>. Rules are evaluated in order; the first match wins.
+    """
+    if not rules or not sitemap_path.exists():
+        return 0
+
+    ET.register_namespace("", SITEMAP_NS)
+    tree = ET.parse(sitemap_path)
+    root = tree.getroot()
+
+    changed = 0
+    for url in root.findall(f"{{{SITEMAP_NS}}}url"):
+        loc_el = url.find(f"{{{SITEMAP_NS}}}loc")
+        if loc_el is None or not loc_el.text:
+            continue
+        loc = loc_el.text
+        for rule in rules:
+            if (m := rule.get("match")) and m in loc:
+                priority_el = url.find(f"{{{SITEMAP_NS}}}priority")
+                if priority_el is None:
+                    priority_el = ET.SubElement(url, f"{{{SITEMAP_NS}}}priority")
+                priority_el.text = f"{float(rule['priority']):.1f}"
+                changed += 1
+                break
+
+    if changed and not dry_run:
+        tree.write(sitemap_path, encoding="utf-8", xml_declaration=True)
+
+    return changed
+
+
+def build_index_xml(base_url: str, default_lang: str, site_dir: Path) -> tuple[str, bool]:
+    """Build the sitemap index XML referencing only the default language.
+
+    Returns (xml_string, included) where included is True if the default
+    language sitemap exists.
+    """
+    included = (site_dir / default_lang / "sitemap.xml").exists()
+
+    lines = [
+        '<?xml version="1.0" encoding="UTF-8"?>',
+        '<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">',
+    ]
+    if included:
+        lines.append("  <sitemap>")
+        lines.append(f"    <loc>{base_url}/{default_lang}/sitemap.xml</loc>")
+        lines.append("  </sitemap>")
+    lines.append("</sitemapindex>")
+    return "\n".join(lines) + "\n", included
+
+
+def build_robots_txt(base_url: str) -> str:
+    """Build a robots.txt that allows all crawlers and advertises the sitemap index."""
+    return (
+        "User-agent: *\n"
+        "Allow: /\n\n"
+        f"Sitemap: {base_url}/sitemap.xml\n"
+    )
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(
+        description="Post-process the English sitemap and generate the site-wide index"
+    )
+    parser.add_argument(
+        "--site-dir", "-s",
+        default="_site",
+        help="Site output directory (default: _site)"
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Preview output without writing files"
+    )
+
+    args = parser.parse_args()
+
+    site_dir = Path(args.site_dir)
+    if not site_dir.exists():
+        print(f"Error: site directory {site_dir} does not exist")
+        return 1
+
+    base_url = get_base_url()
+    default_lang = get_default_language()
+    en_sitemap = site_dir / default_lang / "sitemap.xml"
+
+    print(f"Base URL: {base_url}")
+    print(f"Default language: {default_lang}")
+
+    # 1. Remove Tabular Editor 2-only pages from the English sitemap
+    exclude_rules = get_sitemap_exclude()
+    removed = apply_exclude(en_sitemap, exclude_rules, dry_run=args.dry_run)
+    print(f"Exclude rules: {len(exclude_rules)}; URLs removed: {removed}")
+
+    # 2. Downrank API reference pages in the English sitemap
+    rules = get_sitemap_downrank()
+    changed = apply_downrank(en_sitemap, rules, dry_run=args.dry_run)
+    print(f"Downrank rules: {len(rules)}; URLs adjusted: {changed}")
+
+    # 3. Build the English-only sitemap index
+    index_xml, included = build_index_xml(base_url, default_lang, site_dir)
+    if not included:
+        print(f"Warning: {en_sitemap} not found - sitemap index will be empty. "
+              "Check that build.sitemap is set in the English docfx.json.")
+
+    # 4. robots.txt
+    robots_txt = build_robots_txt(base_url)
+
+    if args.dry_run:
+        print("\n--- _site/sitemap.xml ---")
+        print(index_xml)
+        print("--- _site/robots.txt ---")
+        print(robots_txt)
+        return 0
+
+    (site_dir / "sitemap.xml").write_text(index_xml, encoding="utf-8")
+    (site_dir / "robots.txt").write_text(robots_txt, encoding="utf-8")
+
+    print(f"\nGenerated: {site_dir / 'sitemap.xml'} (English only)")
+    print(f"Generated: {site_dir / 'robots.txt'}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/docfx-template.json b/docfx-template.json
index 7158659a3..c7513a0b5 100644
--- a/docfx-template.json
+++ b/docfx-template.json
@@ -49,6 +49,11 @@
       "_disableContribution": true
     },
     "markdownEngineName": "markdig",
+    "sitemap": {
+      "baseUrl": "https://docs.tabulareditor.com",
+      "changefreq": "weekly",
+      "priority": 0.5
+    },
     "dest": "_site",
     "xrefService": [ "https://xref.docs.microsoft.com/query?uid={uid}" ]
   } }
diff --git a/metadata/build-config.json b/metadata/build-config.json
index aea98afd8..3cd1e63e4 100644
--- a/metadata/build-config.json
+++ b/metadata/build-config.json
@@ -1,6 +1,18 @@
 {
   "_comment": "Shared build configuration for all build scripts. Edit this file to add/remove content directories.",
   "defaultLanguage": "en",
+  "baseUrl": "https://docs.tabulareditor.com",
+  "sitemap": {
+    "_comment": "Published sitemap covers the default language only. 'exclude' removes URLs whose path contains 'match' entirely. 'downrank' lowers <priority> for URLs whose path contains 'match' (first matching rule wins). Exclude is applied first.",
+    "exclude": [
+      { "match": "/features/Command-line-Options.html" },
+      { "match": "/getting-started/Getting-Started-te2.html" },
+      { "match": "/references/user-settings-files-te2.html" }
+    ],
+    "downrank": [
+      { "match": "/api/", "priority": 0.1 }
+    ]
+  },
   "contentDirectories": {
     "_comment": "Directories that contain translatable content (markdown and HTML files)",
     "directories": [
@@ -12,7 +24,8 @@
       "security",
       "troubleshooting",
       "tutorials",
-      "whats-new"
+      "whats-new",
+      "includes"
     ]
   },
   "sharedDirectories": {
From 924a094932f91b5472fa96bcc937d9d376387e8b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Morten=20L=C3=B8nskov?=
Date: Thu, 11 Jun 2026 13:50:20 +0200
Subject: [PATCH 2/2] remove includes from build config

---
 metadata/build-config.json | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/metadata/build-config.json b/metadata/build-config.json
index 3cd1e63e4..77456735f 100644
--- a/metadata/build-config.json
+++ b/metadata/build-config.json
@@ -24,8 +24,7 @@
       "security",
       "troubleshooting",
       "tutorials",
-      "whats-new",
-      "includes"
+      "whats-new"
     ]
   },
   "sharedDirectories": {