From e2353d0f25fd1a33c28b6237e35d8399d51f4e9e Mon Sep 17 00:00:00 2001 From: Caio Pizzol Date: Tue, 12 May 2026 08:22:20 -0300 Subject: [PATCH 1/2] feat(xsd-ingest): broaden default entrypoints to the full Transitional bundle The default ingest only walked wml.xsd's import closure (12 of 26 XSDs). SML, PML, VML, and several standalone shared schemas - including shared-customXmlDataProperties.xsd (the home of ds:datastoreItem) - never reached the schema graph, so structural tools failed on anything outside WordprocessingML. Default entrypoints become an explicit list of 9 roots whose union closure covers all 26 files in data/xsd-cache/ecma-376-transitional/. Explicit over glob so a stray file in the cache directory can't quietly land in production ingest. No code changes to vocabulary.ts: every targetNamespace the broader set declares is already registered. No spec-prose vs XSD URI alias is added - that's a separate concern. Adds a smoke test that ingests the full closure and asserts (a) 26 documents parsed, (b) ds:datastoreItem resolves under the customXml namespace, (c) SML / PML top-level elements land in their vocabularies, and (d) no unresolved child / group / attrGroup edges. Floors are set above today's WML-only baseline so a regression that drops a vocabulary fails the test. This PR ships code only; the production DB is not mutated as part of merging. See the PR body for the post-merge runbook (run xsd:ingest, expected deltas, smoke checks). --- scripts/ingest-xsd/ingest.ts | 37 ++++++++++++++++- tests/ingest-xsd/ingest.test.ts | 72 +++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+), 1 deletion(-) diff --git a/scripts/ingest-xsd/ingest.ts b/scripts/ingest-xsd/ingest.ts index caa2859..acee3d0 100644 --- a/scripts/ingest-xsd/ingest.ts +++ b/scripts/ingest-xsd/ingest.ts @@ -1130,6 +1130,41 @@ interface CliArgs { sourceName: string; } +/** + * Default entrypoints for the ECMA-376 Transitional XSD bundle. 
+ * + * Union closure of these 9 files covers all 26 .xsd files in the bundle. + * Picked as a minimal explicit list rather than a glob so a stray file in + * data/xsd-cache/ (download cruft, intermediate artifacts) can't sneak + * into the ingest. Each entry is here because it's either a top-level + * vocabulary or a root that's never imported by another XSD in the set. + * + * wml.xsd WML + dml-wp + drawingML closure + * sml.xsd SML + dml-spreadsheetDrawing + * pml.xsd PML + * vml-main.xsd all 5 VML files (chains imports) + * shared-additionalCharacteristics.xsd standalone, no importers + * shared-bibliography.xsd standalone, no importers + * shared-customXmlDataProperties.xsd standalone; targets .../customXml + * (motivating case for ds:datastoreItem) + * shared-documentPropertiesCustom.xsd pulls in shared-docPropsVTypes + * shared-documentPropertiesExtended.xsd + * + * shared-customXmlSchemaProperties.xsd / shared-math.xsd / shared-rel.xsd / + * shared-commonSimpleTypes.xsd are reached transitively by the above. + */ +const DEFAULT_ENTRYPOINTS = [ + "wml.xsd", + "sml.xsd", + "pml.xsd", + "vml-main.xsd", + "shared-additionalCharacteristics.xsd", + "shared-bibliography.xsd", + "shared-customXmlDataProperties.xsd", + "shared-documentPropertiesCustom.xsd", + "shared-documentPropertiesExtended.xsd", +]; + function parseCliArgs(): CliArgs { const argv = process.argv.slice(2); let schemaDir = "./data/xsd-cache/ecma-376-transitional"; @@ -1143,7 +1178,7 @@ function parseCliArgs(): CliArgs { else if (a === "--profile") profileName = argv[++i] ?? profileName; else if (a === "--source") sourceName = argv[++i] ?? 
sourceName; } - if (entrypoints.length === 0) entrypoints.push("wml.xsd"); + if (entrypoints.length === 0) entrypoints.push(...DEFAULT_ENTRYPOINTS); return { schemaDir, entrypoints, profileName, sourceName }; } diff --git a/tests/ingest-xsd/ingest.test.ts b/tests/ingest-xsd/ingest.test.ts index 58aacef..4ea5344 100644 --- a/tests/ingest-xsd/ingest.test.ts +++ b/tests/ingest-xsd/ingest.test.ts @@ -521,3 +521,75 @@ test.skipIf(!realCacheReady)( }, 30_000, ); + +test.skipIf(!realCacheReady)( + "smoke: ingest the full Transitional bundle via default entrypoints", + async () => { + // Default entrypoint list (9 roots) is the union closure of the 26 + // Transitional XSDs. Calling ingestSchemaSet directly with the same + // list verifies the closure resolves to 26 documents and that the + // previously-unreached namespaces (customXml, SML, PML, VML, doc-prop + // shareds) actually contribute symbols. + const stats = await ingestSchemaSet({ + schemaDir: REAL_CACHE_DIR, + entrypoints: [ + "wml.xsd", + "sml.xsd", + "pml.xsd", + "vml-main.xsd", + "shared-additionalCharacteristics.xsd", + "shared-bibliography.xsd", + "shared-customXmlDataProperties.xsd", + "shared-documentPropertiesCustom.xsd", + "shared-documentPropertiesExtended.xsd", + ], + profileName: "transitional", + sourceName: "ecma-376-transitional", + db, + }); + + expect(stats.documents).toBe(26); + // WML alone landed >1300 symbols; the full bundle is materially larger. + expect(stats.symbolsInserted).toBeGreaterThan(3500); + + // ds:datastoreItem - the motivating case. Lives in shared-customXmlDataProperties.xsd. 
+ const datastoreItem = await db.sql` + SELECT s.local_name, s.kind, ns.uri AS namespace_uri + FROM xsd_symbols s + JOIN xsd_symbol_profiles sp ON sp.symbol_id = s.id + JOIN xsd_namespaces ns ON ns.id = sp.namespace_id + WHERE s.local_name = 'datastoreItem' AND s.kind = 'element' + `; + expect(datastoreItem).toHaveLength(1); + expect(datastoreItem[0].namespace_uri).toBe( + "http://schemas.openxmlformats.org/officeDocument/2006/customXml", + ); + + // SML / PML top-level elements should also be present. + const sml = await db.sql` + SELECT s.local_name FROM xsd_symbols s + JOIN xsd_symbol_profiles sp ON sp.symbol_id = s.id + WHERE s.vocabulary_id = 'sml-main' AND s.kind = 'element' AND s.parent_symbol_id IS NULL + `; + expect(sml.length).toBeGreaterThan(0); + + const pml = await db.sql` + SELECT s.local_name FROM xsd_symbols s + JOIN xsd_symbol_profiles sp ON sp.symbol_id = s.id + WHERE s.vocabulary_id = 'pml-main' AND s.kind = 'element' AND s.parent_symbol_id IS NULL + `; + expect(pml.length).toBeGreaterThan(0); + + // Same overall sanity checks as the WML-only test: nothing should be + // left unresolved after the broader ingest. Regression guard against + // import-closure gaps. + expect(stats.childEdgesUnresolved).toBe(0); + expect(stats.groupRefsUnresolved).toBe(0); + expect(stats.attrGroupRefsUnresolved).toBe(0); + // xml:space / xml:lang and a handful of other xml-namespace attrs are + // still expected to be unresolved (we don't ingest the xml namespace + // XSD); the ceiling is loose to absorb that. + expect(stats.attrEdgesUnresolved).toBeLessThan(20); + }, + 60_000, +); From 97dd589086ad2bc245e0ddc450dd971d83967418 Mon Sep 17 00:00:00 2001 From: Caio Pizzol Date: Tue, 12 May 2026 08:31:34 -0300 Subject: [PATCH 2/2] fix(xsd-ingest): refresh README + tighten full-bundle smoke-test gate Two P3 findings from PR review: - scripts/ingest-xsd/README.md still claimed the default ingest walked wml.xsd's 12-document closure. 
Updated to describe the 9-root / 26-XSD default and how to narrow it back when needed. - tests/ingest-xsd/ingest.test.ts gated the new full-bundle smoke test on just wml.xsd. A dev with a partial cache (e.g. someone who fetched WML only for hand-testing) would have the test attempt to readFile a missing root and fail. Now gates on all 9 default entrypoints; the existing WML-only smoke test keeps its narrow wml.xsd-only gate. --- scripts/ingest-xsd/README.md | 21 +++++++++++++-------- tests/ingest-xsd/ingest.test.ts | 33 +++++++++++++++++++++------------ 2 files changed, 34 insertions(+), 20 deletions(-) diff --git a/scripts/ingest-xsd/README.md b/scripts/ingest-xsd/README.md index a417f5a..d8ca838 100644 --- a/scripts/ingest-xsd/README.md +++ b/scripts/ingest-xsd/README.md @@ -46,16 +46,21 @@ bun run xsd:fetch -- --expected-sha256 # override hash bun run xsd:ingest ``` -By default it walks `wml.xsd` plus its import closure (12 documents) and -populates: `xsd_profiles`, `xsd_namespaces`, `xsd_symbols`, -`xsd_symbol_profiles`, `xsd_inheritance_edges`, `xsd_compositors`, -`xsd_child_edges`, `xsd_group_edges`, `xsd_attr_edges`, `xsd_enums`. Wraps -the whole thing in a single transaction; idempotent across runs. - -To ingest a different working set: +By default it walks 9 roots whose union closure covers all 26 XSDs in +the Transitional bundle (`wml.xsd`, `sml.xsd`, `pml.xsd`, `vml-main.xsd`, +and the standalone shared schemas for additionalCharacteristics, +bibliography, customXmlDataProperties, documentPropertiesCustom, and +documentPropertiesExtended). Populates: `xsd_profiles`, `xsd_namespaces`, +`xsd_symbols`, `xsd_symbol_profiles`, `xsd_inheritance_edges`, +`xsd_compositors`, `xsd_child_edges`, `xsd_group_edges`, `xsd_attr_edges`, +`xsd_enums`. Wraps the whole thing in a single transaction; idempotent +across runs. 
+ +To ingest a narrower working set (overrides the default list): ```bash -bun run xsd:ingest --entrypoint dml-main.xsd +bun run xsd:ingest --entrypoint wml.xsd # WML only +bun run xsd:ingest --entrypoint sml.xsd --entrypoint pml.xsd # SML + PML bun run xsd:ingest --schema-dir --entrypoint \ --profile --source ``` diff --git a/tests/ingest-xsd/ingest.test.ts b/tests/ingest-xsd/ingest.test.ts index 4ea5344..7d17dc5 100644 --- a/tests/ingest-xsd/ingest.test.ts +++ b/tests/ingest-xsd/ingest.test.ts @@ -13,7 +13,26 @@ import { createDbClient, type DbClient } from "../../packages/shared/src/db/inde const FIXTURES_DIR = join(import.meta.dir, "fixtures"); const REAL_CACHE_DIR = "./data/xsd-cache/ecma-376-transitional"; + +// The WML-only smoke test just needs wml.xsd + its import closure on disk. +// The full-bundle test needs all 9 default entrypoints; partial caches +// (e.g. someone fetched a subset for hand-testing) must skip it cleanly +// instead of failing in readFile. const realCacheReady = existsSync(join(REAL_CACHE_DIR, "wml.xsd")); +const FULL_BUNDLE_ROOTS = [ + "wml.xsd", + "sml.xsd", + "pml.xsd", + "vml-main.xsd", + "shared-additionalCharacteristics.xsd", + "shared-bibliography.xsd", + "shared-customXmlDataProperties.xsd", + "shared-documentPropertiesCustom.xsd", + "shared-documentPropertiesExtended.xsd", +]; +const fullBundleCacheReady = FULL_BUNDLE_ROOTS.every((f) => + existsSync(join(REAL_CACHE_DIR, f)), +); import { getTestDatabaseUrl } from "../test-db.ts"; @@ -522,7 +541,7 @@ test.skipIf(!realCacheReady)( 30_000, ); -test.skipIf(!realCacheReady)( +test.skipIf(!fullBundleCacheReady)( "smoke: ingest the full Transitional bundle via default entrypoints", async () => { // Default entrypoint list (9 roots) is the union closure of the 26 @@ -532,17 +551,7 @@ test.skipIf(!realCacheReady)( // shareds) actually contribute symbols. 
const stats = await ingestSchemaSet({ schemaDir: REAL_CACHE_DIR, - entrypoints: [ - "wml.xsd", - "sml.xsd", - "pml.xsd", - "vml-main.xsd", - "shared-additionalCharacteristics.xsd", - "shared-bibliography.xsd", - "shared-customXmlDataProperties.xsd", - "shared-documentPropertiesCustom.xsd", - "shared-documentPropertiesExtended.xsd", - ], + entrypoints: FULL_BUNDLE_ROOTS, profileName: "transitional", sourceName: "ecma-376-transitional", db,