Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion build-docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,14 @@ def main() -> int:
[sys.executable, "build_scripts/gen_staticwebapp_config.py"],
"Generating staticwebapp.config.json"
)


# Generate the site-wide sitemap index (references the default language's
# sitemap.xml) and robots.txt. Runs last so that sitemap already exists.
run_command(
Comment on lines +333 to +335
[sys.executable, "build_scripts/gen_sitemap_index.py"],
"Generating sitemap index and robots.txt"
)

print(f"\n{'='*60}")
print(" Build complete!")
print(f"{'='*60}")
Expand Down
33 changes: 33 additions & 0 deletions build_scripts/config_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,39 @@ def get_default_language(config: dict | None = None) -> str:
return config.get("defaultLanguage", "en")


def get_base_url(config: dict | None = None) -> str:
"""Get the canonical base URL of the published site, without trailing slash.

Used for canonical/hreflang tags, per-language sitemap baseUrls, and the
site-wide sitemap index. Languages are served under /{code}/ beneath this.
"""
if config is None:
config = load_build_config()
return config.get("baseUrl", "https://docs.tabulareditor.com").rstrip("/")

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Defaulting to "https://docs.tabulareditor.com" here is not ideal (this also applies to the SEO injection), because we should not hardcode these strings. We now have the same URL stored in three different places: the build config, inject SEO, and this config loader.



def get_sitemap_downrank(config: dict | None = None) -> list[dict]:
"""Get sitemap downrank rules: a list of {match, priority} dicts.

Each rule lowers the <priority> of sitemap URLs whose path contains the
'match' substring. Rules are evaluated in order; the first match wins.
"""
if config is None:
config = load_build_config()
return config.get("sitemap", {}).get("downrank", [])


def get_sitemap_exclude(config: dict | None = None) -> list[dict]:
"""Get sitemap exclude rules: a list of {match} dicts.

Each rule removes sitemap URLs whose path contains the 'match' substring.
Used to drop legacy Tabular Editor 2-only pages from the published sitemap.
"""
if config is None:
config = load_build_config()
return config.get("sitemap", {}).get("exclude", [])


def compute_file_hash(file_path: Path | str) -> str:
"""Compute SHA256 hash of a file's contents.

Expand Down
10 changes: 10 additions & 0 deletions build_scripts/gen_redirects.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
import sys
import traceback

from config_loader import get_base_url, get_default_language


def get_available_languages() -> list[str]:
"""Scan localizedContent/ folder and return list of language codes (excluding 'en')."""
Expand Down Expand Up @@ -69,6 +71,10 @@ def generate_localized_config(template: dict, lang: str) -> dict:
# Set output destination to language subfolder (relative to project root)
# From localizedContent/{lang}/, we go up twice to reach project root
build["dest"] = f"../../_site/{lang}"

# The published sitemap covers the default (English) language only, so
# non-default languages do not emit their own sitemap.xml.
build.pop("sitemap", None)

# Update template paths - need to go up two levels to reach project root
if "template" in build:
Expand Down Expand Up @@ -113,6 +119,10 @@ def generate_redirects_config(template: dict) -> dict:
# Set English output destination (relative to localizedContent/en/)
config["build"]["dest"] = "../../_site/en"

# Point the sitemap at the default language's URL prefix
if "sitemap" in config["build"]:
config["build"]["sitemap"]["baseUrl"] = f"{get_base_url()}/{get_default_language()}"

Comment on lines 119 to +125
# Update template paths - need to go up two levels to reach project root
if "template" in config["build"]:
new_templates = []
Expand Down
197 changes: 197 additions & 0 deletions build_scripts/gen_sitemap_index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Post-process the English sitemap and generate the site-wide entry point.

DocFX emits a sitemap for the default (English) language at
_site/{default_lang}/sitemap.xml (configured via build.sitemap in the generated
docfx.json). Non-default languages do not emit a sitemap. This script:

1. Removes Tabular Editor 2-only URLs from the English sitemap per the
'exclude' rules in build-config.json.
2. Downranks selected URLs in the English sitemap (API reference pages) per
the 'downrank' rules in build-config.json.
3. Writes _site/sitemap.xml: a <sitemapindex> pointing at the English sitemap
(the published sitemap covers English only).
4. Writes _site/robots.txt: allows all crawlers and advertises the index.

Run after the English build so _site/{default_lang}/sitemap.xml exists.

Usage:
python gen_sitemap_index.py # Process _site/
python gen_sitemap_index.py --dry-run # Preview without writing
"""

import argparse
import xml.etree.ElementTree as ET
from pathlib import Path

from config_loader import (
get_base_url,
get_default_language,
get_sitemap_downrank,
get_sitemap_exclude,
)


# XML namespace used to qualify element names (url, loc, priority) when
# searching and editing the sitemap with ElementTree.
SITEMAP_NS = "http://www.sitemaps.org/schemas/sitemap/0.9"


def apply_exclude(sitemap_path: Path, rules: list[dict], dry_run: bool = False) -> int:
    """Drop every <url> entry whose <loc> matches an exclude rule.

    A rule matches when its 'match' substring occurs anywhere in the text of
    the entry's <loc>. Rules without a 'match' value are ignored, as are
    entries without a non-empty <loc>.

    Args:
        sitemap_path: Sitemap file to filter; rewritten in place.
        rules: Exclude rules, each a dict with a 'match' key.
        dry_run: When True, count the matches but leave the file untouched.

    Returns:
        Number of <url> entries removed (0 when there are no rules or the
        file does not exist).
    """
    if not (rules and sitemap_path.exists()):
        return 0

    # Keep the sitemap namespace as the default one so the rewritten file
    # does not gain "ns0:" prefixes.
    ET.register_namespace("", SITEMAP_NS)
    document = ET.parse(sitemap_path)
    urlset = document.getroot()

    needles = []
    for rule in rules:
        needle = rule.get("match")
        if needle:
            needles.append(needle)

    def is_excluded(entry) -> bool:
        # findtext yields None (no <loc>) or "" (empty <loc>); both are skipped.
        loc = entry.findtext(f"{{{SITEMAP_NS}}}loc")
        return bool(loc) and any(needle in loc for needle in needles)

    # Collect first, then remove, so the tree is never mutated mid-iteration.
    doomed = [
        entry
        for entry in urlset.findall(f"{{{SITEMAP_NS}}}url")
        if is_excluded(entry)
    ]
    for entry in doomed:
        urlset.remove(entry)

    if doomed and not dry_run:
        document.write(sitemap_path, encoding="utf-8", xml_declaration=True)

    return len(doomed)


def apply_downrank(sitemap_path: Path, rules: list[dict], dry_run: bool = False) -> int:
    """Lower <priority> for URLs matching downrank rules. Returns count changed.

    A rule matches when its 'match' substring appears anywhere in the URL's
    <loc>. Rules are evaluated in order; the first match wins. A rule is
    ignored (with a printed warning) when it has no non-empty string 'match',
    or when its 'priority' is missing, non-numeric, or outside 0.0-1.0, the
    range the sitemap protocol allows.

    Args:
        sitemap_path: Sitemap file to adjust; rewritten in place.
        rules: Downrank rules, each a dict with 'match' and 'priority' keys.
        dry_run: When True, count the matches but leave the file untouched.

    Returns:
        Number of <url> entries whose <priority> was set (0 when there are no
        usable rules or the file does not exist).
    """
    if not rules or not sitemap_path.exists():
        return 0

    # Validate and pre-format every rule once, instead of once per URL.
    usable: list[tuple[str, str]] = []
    for rule in rules:
        needle = rule.get("match")
        try:
            priority = float(rule.get("priority"))
        except (TypeError, ValueError):
            priority = None
        if (
            not isinstance(needle, str)
            or not needle  # an empty substring would match every URL
            or priority is None
            or not 0.0 <= priority <= 1.0  # also rejects NaN
        ):
            print(f"Warning: ignoring invalid downrank rule: {rule!r}")
            continue
        usable.append((needle, f"{priority:.1f}"))

    if not usable:
        return 0

    # Keep the sitemap namespace as the default one so the rewritten file
    # does not gain "ns0:" prefixes.
    ET.register_namespace("", SITEMAP_NS)
    tree = ET.parse(sitemap_path)
    root = tree.getroot()

    changed = 0
    for url in root.findall(f"{{{SITEMAP_NS}}}url"):
        loc = url.findtext(f"{{{SITEMAP_NS}}}loc")
        if not loc:
            continue
        # First matching rule wins.
        priority_text = next(
            (text for needle, text in usable if needle in loc), None
        )
        if priority_text is None:
            continue
        priority_el = url.find(f"{{{SITEMAP_NS}}}priority")
        if priority_el is None:
            priority_el = ET.SubElement(url, f"{{{SITEMAP_NS}}}priority")
        priority_el.text = priority_text
        changed += 1

    if changed and not dry_run:
        tree.write(sitemap_path, encoding="utf-8", xml_declaration=True)

    return changed


def build_index_xml(base_url: str, default_lang: str, site_dir: Path) -> tuple[str, bool]:
    """Build the sitemap index XML referencing only the default language.

    Args:
        base_url: Canonical site URL; a trailing slash is tolerated.
        default_lang: Language code whose sitemap the index points at.
        site_dir: Site output directory that is checked for
            {default_lang}/sitemap.xml.

    Returns:
        (xml_string, included) where included is True if the default
        language sitemap exists. When it does not, the index contains no
        <sitemap> entry.
    """
    # Local import keeps this fix self-contained within the function.
    from xml.sax.saxutils import escape

    included = (site_dir / default_lang / "sitemap.xml").exists()

    lines = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">',
    ]
    if included:
        # Strip a trailing slash so the URL never contains "//", and escape
        # the value because sitemap URLs must be entity-escaped (e.g. "&").
        loc = escape(f"{base_url.rstrip('/')}/{default_lang}/sitemap.xml")
        lines.append(" <sitemap>")
        lines.append(f" <loc>{loc}</loc>")
        lines.append(" </sitemap>")
    lines.append("</sitemapindex>")
    return "\n".join(lines) + "\n", included


def build_robots_txt(base_url: str) -> str:
    """Build a robots.txt that allows all crawlers and advertises the sitemap index.

    Args:
        base_url: Canonical site URL; a trailing slash is tolerated.

    Returns:
        The robots.txt content, ending with a newline.
    """
    # Strip a trailing slash so the advertised URL never contains "//".
    sitemap_url = f"{base_url.rstrip('/')}/sitemap.xml"
    return (
        "User-agent: *\n"
        "Allow: /\n\n"
        f"Sitemap: {sitemap_url}\n"
    )


def main() -> int:
    """Post-process the default-language sitemap, then emit the index and robots.txt.

    Steps, in order: apply the exclude rules, apply the downrank rules, build
    the sitemap index, build robots.txt. With --dry-run nothing is written and
    the generated index and robots.txt are printed instead.

    Returns:
        Process exit code: 1 when the site directory does not exist,
        otherwise 0.
    """
    parser = argparse.ArgumentParser(
        description="Post-process the English sitemap and generate the site-wide index"
    )
    parser.add_argument(
        "--site-dir", "-s",
        default="_site",
        help="Site output directory (default: _site)",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Preview output without writing files",
    )

    args = parser.parse_args()

    site_dir = Path(args.site_dir)
    if not site_dir.exists():
        print(f"Error: site directory {site_dir} does not exist")
        return 1

    base_url = get_base_url()
    default_lang = get_default_language()
    en_sitemap = site_dir / default_lang / "sitemap.xml"
    index_path = site_dir / "sitemap.xml"
    robots_path = site_dir / "robots.txt"

    print(f"Base URL: {base_url}")
    print(f"Default language: {default_lang}")

    # 1. Remove Tabular Editor 2-only pages from the English sitemap
    exclude_rules = get_sitemap_exclude()
    removed = apply_exclude(en_sitemap, exclude_rules, dry_run=args.dry_run)
    print(f"Exclude rules: {len(exclude_rules)}; URLs removed: {removed}")

    # 2. Downrank API reference pages in the English sitemap
    rules = get_sitemap_downrank()
    changed = apply_downrank(en_sitemap, rules, dry_run=args.dry_run)
    print(f"Downrank rules: {len(rules)}; URLs adjusted: {changed}")

    # 3. Build the English-only sitemap index
    index_xml, included = build_index_xml(base_url, default_lang, site_dir)
    if not included:
        print(f"Warning: {en_sitemap} not found - sitemap index will be empty. "
              "Check that build.sitemap is set in the English docfx.json.")

    # 4. robots.txt
    robots_txt = build_robots_txt(base_url)

    if args.dry_run:
        # Label the preview with the real target paths so the output stays
        # accurate when --site-dir is not the default "_site".
        print(f"\n--- {index_path} ---")
        print(index_xml)
        print(f"--- {robots_path} ---")
        print(robots_txt)
        return 0

    index_path.write_text(index_xml, encoding="utf-8")
    robots_path.write_text(robots_txt, encoding="utf-8")

    print(f"\nGenerated: {index_path} (English only)")
    print(f"Generated: {robots_path}")
    return 0


if __name__ == "__main__":
    # The bare `exit` helper is installed by the `site` module for interactive
    # use and is missing under `python -S`; SystemExit is always available and
    # propagates main()'s return value as the process exit code.
    raise SystemExit(main())
Comment on lines +196 to +197
5 changes: 5 additions & 0 deletions docfx-template.json
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,11 @@
"_disableContribution": true
},
"markdownEngineName": "markdig",
"sitemap": {
"baseUrl": "https://docs.tabulareditor.com",
"changefreq": "weekly",
"priority": 0.5
},
"dest": "_site",
"xrefService": [ "https://xref.docs.microsoft.com/query?uid={uid}" ]
}
Expand Down
12 changes: 12 additions & 0 deletions metadata/build-config.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,18 @@
{
"_comment": "Shared build configuration for all build scripts. Edit this file to add/remove content directories.",
"defaultLanguage": "en",
"baseUrl": "https://docs.tabulareditor.com",
"sitemap": {
"_comment": "Published sitemap covers the default language only. 'exclude' removes URLs whose path contains 'match' entirely. 'downrank' lowers <priority> for URLs whose path contains 'match' (first matching rule wins). Exclude is applied first.",
"exclude": [
{ "match": "/features/Command-line-Options.html" },
{ "match": "/getting-started/Getting-Started-te2.html" },
{ "match": "/references/user-settings-files-te2.html" }
],
"downrank": [
{ "match": "/api/", "priority": 0.1 }
]
},
"contentDirectories": {
"_comment": "Directories that contain translatable content (markdown and HTML files)",
"directories": [
Expand Down
Loading