diff --git a/scripts/ingest-xsd/README.md b/scripts/ingest-xsd/README.md index a417f5a..d8ca838 100644 --- a/scripts/ingest-xsd/README.md +++ b/scripts/ingest-xsd/README.md @@ -46,16 +46,21 @@ bun run xsd:fetch -- --expected-sha256 # override hash bun run xsd:ingest ``` -By default it walks `wml.xsd` plus its import closure (12 documents) and -populates: `xsd_profiles`, `xsd_namespaces`, `xsd_symbols`, -`xsd_symbol_profiles`, `xsd_inheritance_edges`, `xsd_compositors`, -`xsd_child_edges`, `xsd_group_edges`, `xsd_attr_edges`, `xsd_enums`. Wraps -the whole thing in a single transaction; idempotent across runs. - -To ingest a different working set: +By default it walks 9 roots whose union closure covers all 26 XSDs in +the Transitional bundle (`wml.xsd`, `sml.xsd`, `pml.xsd`, `vml-main.xsd`, +and the standalone shared schemas for additionalCharacteristics, +bibliography, customXmlDataProperties, documentPropertiesCustom, and +documentPropertiesExtended). Populates: `xsd_profiles`, `xsd_namespaces`, +`xsd_symbols`, `xsd_symbol_profiles`, `xsd_inheritance_edges`, +`xsd_compositors`, `xsd_child_edges`, `xsd_group_edges`, `xsd_attr_edges`, +`xsd_enums`. Wraps the whole thing in a single transaction; idempotent +across runs. + +To ingest a narrower working set (overrides the default list): ```bash -bun run xsd:ingest --entrypoint dml-main.xsd +bun run xsd:ingest --entrypoint wml.xsd # WML only +bun run xsd:ingest --entrypoint sml.xsd --entrypoint pml.xsd # SML + PML bun run xsd:ingest --schema-dir --entrypoint \ --profile --source ``` diff --git a/scripts/ingest-xsd/ingest.ts b/scripts/ingest-xsd/ingest.ts index caa2859..acee3d0 100644 --- a/scripts/ingest-xsd/ingest.ts +++ b/scripts/ingest-xsd/ingest.ts @@ -1130,6 +1130,41 @@ interface CliArgs { sourceName: string; } +/** + * Default entrypoints for the ECMA-376 Transitional XSD bundle. + * + * Union closure of these 9 files covers all 26 .xsd files in the bundle. 
+ * Picked as a minimal explicit list rather than a glob so a stray file in + * data/xsd-cache/ (download cruft, intermediate artifacts) can't sneak + * into the ingest. Each entry is here because it's either a top-level + * vocabulary or a root that's never imported by another XSD in the set. + * + * wml.xsd WML + dml-wp + drawingML closure + * sml.xsd SML + dml-spreadsheetDrawing + * pml.xsd PML + * vml-main.xsd all 5 VML files (chains imports) + * shared-additionalCharacteristics.xsd standalone, no importers + * shared-bibliography.xsd standalone, no importers + * shared-customXmlDataProperties.xsd standalone; targets .../customXml + * (motivating case for ds:datastoreItem) + * shared-documentPropertiesCustom.xsd pulls in shared-docPropsVTypes + * shared-documentPropertiesExtended.xsd + * + * shared-customXmlSchemaProperties.xsd / shared-math.xsd / shared-rel.xsd / + * shared-commonSimpleTypes.xsd are reached transitively by the above. + */ +const DEFAULT_ENTRYPOINTS = [ + "wml.xsd", + "sml.xsd", + "pml.xsd", + "vml-main.xsd", + "shared-additionalCharacteristics.xsd", + "shared-bibliography.xsd", + "shared-customXmlDataProperties.xsd", + "shared-documentPropertiesCustom.xsd", + "shared-documentPropertiesExtended.xsd", +]; + function parseCliArgs(): CliArgs { const argv = process.argv.slice(2); let schemaDir = "./data/xsd-cache/ecma-376-transitional"; @@ -1143,7 +1178,7 @@ function parseCliArgs(): CliArgs { else if (a === "--profile") profileName = argv[++i] ?? profileName; else if (a === "--source") sourceName = argv[++i] ?? 
sourceName; } - if (entrypoints.length === 0) entrypoints.push("wml.xsd"); + if (entrypoints.length === 0) entrypoints.push(...DEFAULT_ENTRYPOINTS); return { schemaDir, entrypoints, profileName, sourceName }; } diff --git a/tests/ingest-xsd/ingest.test.ts b/tests/ingest-xsd/ingest.test.ts index 58aacef..7d17dc5 100644 --- a/tests/ingest-xsd/ingest.test.ts +++ b/tests/ingest-xsd/ingest.test.ts @@ -13,7 +13,26 @@ import { createDbClient, type DbClient } from "../../packages/shared/src/db/inde const FIXTURES_DIR = join(import.meta.dir, "fixtures"); const REAL_CACHE_DIR = "./data/xsd-cache/ecma-376-transitional"; + +// The WML-only smoke test just needs wml.xsd + its import closure on disk. +// The full-bundle test needs all 9 default entrypoints; partial caches +// (e.g. someone fetched a subset for hand-testing) must skip it cleanly +// instead of failing in readFile. const realCacheReady = existsSync(join(REAL_CACHE_DIR, "wml.xsd")); +const FULL_BUNDLE_ROOTS = [ + "wml.xsd", + "sml.xsd", + "pml.xsd", + "vml-main.xsd", + "shared-additionalCharacteristics.xsd", + "shared-bibliography.xsd", + "shared-customXmlDataProperties.xsd", + "shared-documentPropertiesCustom.xsd", + "shared-documentPropertiesExtended.xsd", +]; +const fullBundleCacheReady = FULL_BUNDLE_ROOTS.every((f) => + existsSync(join(REAL_CACHE_DIR, f)), +); import { getTestDatabaseUrl } from "../test-db.ts"; @@ -521,3 +540,65 @@ test.skipIf(!realCacheReady)( }, 30_000, ); + +test.skipIf(!fullBundleCacheReady)( + "smoke: ingest the full Transitional bundle via default entrypoints", + async () => { + // The union closure of the 9 default entrypoints covers all 26 + // Transitional XSDs. Calling ingestSchemaSet directly with the same + // list verifies the closure resolves to 26 documents and that the + // previously-unreached namespaces (customXml, SML, PML, VML, shared + // doc-prop schemas) actually contribute symbols. 
+ const stats = await ingestSchemaSet({ + schemaDir: REAL_CACHE_DIR, + entrypoints: FULL_BUNDLE_ROOTS, + profileName: "transitional", + sourceName: "ecma-376-transitional", + db, + }); + + expect(stats.documents).toBe(26); + // WML alone landed >1300 symbols; the full bundle is materially larger. + expect(stats.symbolsInserted).toBeGreaterThan(3500); + + // ds:datastoreItem - the motivating case. Lives in shared-customXmlDataProperties.xsd. + const datastoreItem = await db.sql` + SELECT s.local_name, s.kind, ns.uri AS namespace_uri + FROM xsd_symbols s + JOIN xsd_symbol_profiles sp ON sp.symbol_id = s.id + JOIN xsd_namespaces ns ON ns.id = sp.namespace_id + WHERE s.local_name = 'datastoreItem' AND s.kind = 'element' + `; + expect(datastoreItem).toHaveLength(1); + expect(datastoreItem[0].namespace_uri).toBe( + "http://schemas.openxmlformats.org/officeDocument/2006/customXml", + ); + + // SML / PML top-level elements should also be present. + const sml = await db.sql` + SELECT s.local_name FROM xsd_symbols s + JOIN xsd_symbol_profiles sp ON sp.symbol_id = s.id + WHERE s.vocabulary_id = 'sml-main' AND s.kind = 'element' AND s.parent_symbol_id IS NULL + `; + expect(sml.length).toBeGreaterThan(0); + + const pml = await db.sql` + SELECT s.local_name FROM xsd_symbols s + JOIN xsd_symbol_profiles sp ON sp.symbol_id = s.id + WHERE s.vocabulary_id = 'pml-main' AND s.kind = 'element' AND s.parent_symbol_id IS NULL + `; + expect(pml.length).toBeGreaterThan(0); + + // Same overall sanity checks as the WML-only test: nothing should be + // left unresolved after the broader ingest. Regression guard against + // import-closure gaps. + expect(stats.childEdgesUnresolved).toBe(0); + expect(stats.groupRefsUnresolved).toBe(0); + expect(stats.attrGroupRefsUnresolved).toBe(0); + // xml:space / xml:lang and a handful of other xml-namespace attrs are + // still expected to be unresolved (we don't ingest the xml namespace + // XSD); the ceiling is loose to absorb that. 
+ expect(stats.attrEdgesUnresolved).toBeLessThan(20); + }, + 60_000, +);