diff --git a/CLAUDE.md b/CLAUDE.md index cdb8617..a12c21c 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -104,7 +104,7 @@ The XML you provide is wrapped in a minimal `w:document > w:body` structure auto ## MCP Server -Cloudflare Worker exposing two tool families over MCP, backed by the same database. +Cloudflare Worker exposing three tool families over MCP. The prose and schema-lookup tools are backed by the database; the package-metadata tool reads a curated static dataset bundled with the worker. Prose search over the spec PDFs (powered by `spec_content`): @@ -120,6 +120,10 @@ Structural queries over the XSD schema graph (powered by `xsd_*` tables): - `ooxml_enum` - simpleType enumeration values - `ooxml_namespace` - vocabularies and per-profile symbol counts for a namespace URI +OPC package metadata (powered by the curated `opc-parts.ts` dataset): + +- `ooxml_package_part` - part-type info by content type, source relationship type, or query substring + Uses PostgreSQL with pgvector (Neon serverless in production, Docker locally). ## Data Pipelines diff --git a/README.md b/README.md index e990e2b..38cd821 100644 --- a/README.md +++ b/README.md @@ -54,10 +54,11 @@ url = "https://api.ooxml.dev/mcp" } ``` -Two tool families share one server: +Three tool families share one server: - **Prose search** (over the spec PDFs): `ooxml_search`, `ooxml_section`, `ooxml_parts` - **Schema lookup** (over the parsed XSDs): `ooxml_element`, `ooxml_type`, `ooxml_children`, `ooxml_attributes`, `ooxml_enum`, `ooxml_namespace` +- **Package metadata** (curated from Part 1 §11.3.x / §12.3.x / §13.3.x / §15.x): `ooxml_package_part` ## Development diff --git a/apps/mcp-server/README.md b/apps/mcp-server/README.md index 64cfd60..8e289ba 100644 --- a/apps/mcp-server/README.md +++ b/apps/mcp-server/README.md @@ -1,9 +1,10 @@ # OOXML Reference MCP Server -Cloudflare Worker that exposes ECMA-376 (Office Open XML) over the Model Context Protocol. 
Two tool families share one server: +Cloudflare Worker that exposes ECMA-376 (Office Open XML) over the Model Context Protocol. Three tool families share one server: - **Prose search** — semantic search across the four ECMA-376 part PDFs (~18,000 chunks, embedded with Voyage, queried with pgvector). - **Schema lookup** — deterministic queries over the parsed XSD graph (profiles, namespaces, symbols, content models, attributes, enums). +- **Package metadata** — curated OPC part-type reference (content types, source relationship types, root namespaces, typical paths in the package). Hosted at `https://api.ooxml.dev/mcp`. @@ -69,6 +70,14 @@ Any MCP-compatible client that speaks Streamable HTTP can connect to the endpoin Default profile is `transitional`. Future profiles will compose Transitional with Office extension schemas. +### Package metadata + +| Tool | Returns | +| --- | --- | +| `ooxml_package_part` | OPC part type by content type, source relationship type, or query substring (Word / Excel / PowerPoint + cross-cutting parts) | + +Curated from ECMA-376 Part 1 §11.3.x / §12.3.x / §13.3.x / §15.x. Answers package-level questions the schema graph and prose corpus don't cover (e.g. "what kind of part is `/customXml/item1.xml`?"). 
+ ## Development ```bash diff --git a/apps/mcp-server/src/index.ts b/apps/mcp-server/src/index.ts index 41fa8bf..3641e54 100644 --- a/apps/mcp-server/src/index.ts +++ b/apps/mcp-server/src/index.ts @@ -1,10 +1,12 @@ /** * OOXML Reference MCP Server * - * Cloudflare Worker exposing two tool families over MCP: - * - prose search over ECMA-376 PDFs (ooxml_search, ooxml_section, ooxml_parts) - * - schema lookup over the parsed XSD graph (ooxml_element, ooxml_type, - * ooxml_children, ooxml_attributes, ooxml_enum, ooxml_namespace) + * Cloudflare Worker exposing three tool families over MCP: + * - prose search over ECMA-376 PDFs (ooxml_search, ooxml_section, ooxml_parts) + * - schema lookup over the parsed XSD graph (ooxml_element, ooxml_type, + * ooxml_children, ooxml_attributes, ooxml_enum, ooxml_namespace) + * - package metadata curated from Part 1 §11.3.x / §12.3.x / §13.3.x / §15.x + * (ooxml_package_part) */ import { createDb } from "./db"; diff --git a/apps/mcp-server/src/mcp.ts b/apps/mcp-server/src/mcp.ts index 0621c4c..90d5b28 100644 --- a/apps/mcp-server/src/mcp.ts +++ b/apps/mcp-server/src/mcp.ts @@ -140,7 +140,7 @@ function handleInitialize(id: number | string | null): JsonRpcResponse { version: "0.1.0", }, instructions: - "OOXML (ECMA-376 / Office Open XML) reference server. Two tool families: prose search over the spec PDFs (ooxml_search, ooxml_section, ooxml_parts) and deterministic schema lookup over the parsed XSDs (ooxml_element, ooxml_type, ooxml_children, ooxml_attributes, ooxml_enum, ooxml_namespace).", + "OOXML (ECMA-376 / Office Open XML) reference server. Three tool families: (1) prose search over the spec PDFs (ooxml_search, ooxml_section, ooxml_parts); (2) deterministic schema lookup over the parsed XSDs (ooxml_element, ooxml_type, ooxml_children, ooxml_attributes, ooxml_enum, ooxml_namespace); (3) OPC package metadata curated from Part 1 §11.3.x / §12.3.x / §13.3.x / §15.x (ooxml_package_part). 
The three corpora can disagree about URIs for the same concept (custom XML data storage is the canonical example); each tool surface notes when it keys on the XSD URI vs the spec-prose URI.", }, }; } diff --git a/apps/mcp-server/src/ooxml-tools.ts b/apps/mcp-server/src/ooxml-tools.ts index 30450ed..47825ec 100644 --- a/apps/mcp-server/src/ooxml-tools.ts +++ b/apps/mcp-server/src/ooxml-tools.ts @@ -31,6 +31,13 @@ import { parseQName, type SymbolHit, } from "./ooxml-queries"; +import { + contentTypesOf, + findPartByContentType, + findPartsByRelationshipType, + type OpcPart, + searchParts, +} from "./opc-parts"; export const DEFAULT_PROFILE = "transitional"; @@ -130,6 +137,33 @@ export const OOXML_TOOL_DEFS: ToolDef[] = [ }, }, }, + { + name: "ooxml_package_part", + description: + "Look up OPC (Open Packaging Conventions) part types: content type, source relationship type, root namespace and element, typical paths in the package. Answers 'what kind of part is /customXml/item1.xml?' — package metadata that the schema graph doesn't capture. Four modes: " + + "(1) `content_type` exact match (e.g. 'application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml'); " + + "(2) `relationship_type` exact match (e.g. '.../officeDocument/2006/relationships/customXmlProps'); " + + "(3) `query` case-insensitive substring across name, key, content type, relationship type, root namespace, root element and notes; " + + "(4) no args → list every curated part. 
Curated from ECMA-376 Part 1 §11.3.x / §12.3.x / §13.3.x / §15.x; covers Word / Excel / PowerPoint plus cross-cutting (properties, theme, image, custom XML).", + inputSchema: { + type: "object" as const, + properties: { + content_type: { + type: "string", + description: "Exact OPC content type (Content_Types.xml value).", + }, + relationship_type: { + type: "string", + description: "Exact source relationship type URI.", + }, + query: { + type: "string", + description: + "Case-insensitive substring across name, content type, relationship type, namespace, element, notes.", + }, + }, + }, + }, ]; export type OoxmlToolName = @@ -138,7 +172,8 @@ export type OoxmlToolName = | "ooxml_children" | "ooxml_attributes" | "ooxml_enum" - | "ooxml_namespace"; + | "ooxml_namespace" + | "ooxml_package_part"; const OOXML_TOOL_NAMES: ReadonlySet = new Set(OOXML_TOOL_DEFS.map((t) => t.name)); @@ -318,6 +353,39 @@ export async function runOoxmlTool( }); } + case "ooxml_package_part": { + const contentType = typeof args.content_type === "string" ? args.content_type.trim() : ""; + const relationshipType = + typeof args.relationship_type === "string" ? args.relationship_type.trim() : ""; + const query = typeof args.query === "string" ? args.query.trim() : ""; + + if (contentType) { + const hit = findPartByContentType(contentType); + if (hit) return formatPackagePartReport(hit); + return formatPackagePartNotFound("content type", contentType); + } + if (relationshipType) { + const hits = findPartsByRelationshipType(relationshipType); + if (hits.length === 1) return formatPackagePartReport(hits[0]); + if (hits.length > 1) { + // Shared rels (officeDocument across WML/SML/PML, customXml + // across families) intentionally hit multiple parts. + return formatPackagePartList(hits, { + title: `Package parts using relationship '${relationshipType}'`, + query: "", + footer: + "This relationship type is shared across package families. 
Disambiguate by the source part (the package's main part determines whether `.../relationships/officeDocument` points at a Word, Excel, or PowerPoint main part).", + }); + } + return formatPackagePartNotFound("relationship type", relationshipType); + } + const matches = searchParts(query); + return formatPackagePartList(matches, { + title: query ? `Package parts matching '${query}'` : "Curated OPC package parts", + query, + }); + } + default: { const _exhaustive: never = name; throw new Error(`Unhandled OOXML tool: ${_exhaustive}`); @@ -518,3 +586,81 @@ function formatNotFound(what: string, profile?: string, extras?: NotFoundExtras) lines.push("- a different profile (currently only `transitional` is populated)"); return lines.join("\n"); } + +function formatPackagePartReport(p: OpcPart): string { + const lines: string[] = []; + lines.push(`## OPC Part: ${p.name}`); + lines.push(""); + lines.push(`- key: \`${p.key}\``); + const cts = contentTypesOf(p); + if (cts.length === 1) { + lines.push(`- content type: \`${cts[0]}\``); + } else { + lines.push(`- content types: ${cts.map((c) => `\`${c}\``).join(", ")}`); + } + lines.push( + `- source relationship: ${p.relationshipType ? `\`${p.relationshipType}\`` : "_(implicit, none)_"}`, + ); + lines.push( + `- root namespace: ${p.rootNamespace ? `\`${p.rootNamespace}\`` : "_(none; binary or arbitrary-XML payload)_"}`, + ); + lines.push(`- root element: ${p.rootElement ? 
`\`${p.rootElement}\`` : "_(none)_"}`); + lines.push(`- typical paths: ${p.typicalPaths.map((t) => `\`${t}\``).join(", ")}`); + lines.push(`- package families: ${p.packageFamilies.join(", ")}`); + lines.push(`- spec: ${p.sourceSections.join("; ")}`); + if (p.notes) { + lines.push(""); + lines.push(`**Notes**: ${p.notes}`); + } + return lines.join("\n"); +} + +function formatPackagePartList( + matches: readonly OpcPart[], + opts: { title: string; query: string; footer?: string }, +): string { + const lines: string[] = []; + lines.push(`## ${opts.title}`); + lines.push(""); + if (matches.length === 0) { + lines.push("_(no matches)_"); + lines.push(""); + lines.push( + "Try `ooxml_package_part` with no args to see the full list, or `ooxml_search` for prose references.", + ); + return lines.join("\n"); + } + lines.push("| key | name | content type | families |"); + lines.push("| --- | --- | --- | --- |"); + for (const p of matches) { + const cts = contentTypesOf(p); + // Show first canonical type plus a "+N" indicator if there are more, + // so the table stays compact for image/* and similar enumerated sets. + const ctCell = cts.length === 1 ? `\`${cts[0]}\`` : `\`${cts[0]}\` _(+${cts.length - 1} more)_`; + lines.push(`| \`${p.key}\` | ${p.name} | ${ctCell} | ${p.packageFamilies.join(", ")} |`); + } + lines.push(""); + lines.push( + opts.footer ?? + "Pass an exact `content_type` or `relationship_type` for the full report on a single part.", + ); + return lines.join("\n"); +} + +function formatPackagePartNotFound( + kind: "content type" | "relationship type", + value: string, +): string { + const lines: string[] = []; + lines.push(`## Not found: OPC part with ${kind} '${value}'`); + lines.push(""); + lines.push("Try one of:"); + lines.push( + "- `ooxml_package_part` with a `query` substring (e.g. 
'styles', 'customXml', 'theme')", + ); + lines.push("- `ooxml_package_part` with no args to list every curated part"); + lines.push( + "- `ooxml_search` if the part type is documented in spec prose but not yet curated here", + ); + return lines.join("\n"); +} diff --git a/apps/mcp-server/src/opc-parts.ts b/apps/mcp-server/src/opc-parts.ts new file mode 100644 index 0000000..a6daf48 --- /dev/null +++ b/apps/mcp-server/src/opc-parts.ts @@ -0,0 +1,442 @@ +/** + * Curated reference for OPC (Open Packaging Conventions) package parts. + * + * The schema graph (`xsd_*`) answers "what elements are legal inside this XML + * body?" The prose corpus (`spec_content`) answers "what does this section + * say?" Neither answers "what kind of OPC part is `/customXml/item1.xml`?" + * That is package metadata: content type, source relationship type, root + * namespace, typical path - all defined in ECMA-376 Part 1 §15.x and the + * vocabulary-specific Parts (§11.3.x WML, §12.3.x SML, §13.3.x PML). + * + * Static typed data here, no DB. The set is small (~25 records), static + * across ECMA editions, and curated; PR diff is the right audit primitive. + * Add a new entry by appending to OPC_PARTS - the lookup index is rebuilt + * lazily from the literal array on first access. + * + * Where the spec prose and the XSD targetNamespace disagree (custom XML + * data storage properties), this file pins `rootNamespace` to the XSD URI + * so the value resolves through the schema-graph tools too. + */ + +export type PackageFamily = "wordprocessing" | "spreadsheet" | "presentation"; + +export interface OpcPart { + /** Human-readable name as it appears in the spec heading. */ + name: string; + /** Stable, machine-readable id (kebab-case). Stable across spec edits. */ + key: string; + /** + * OPC content type(s) (Content_Types.xml `Override` or `Default` value). 
+ * Most parts have a single canonical value; binary parts that accept any + * media type in a family (image, embedded font) carry the enumerated set + * called out in the spec so exact lookups against [Content_Types].xml + * resolve. List views show the first entry; lookups index every entry. + */ + contentType: string | string[]; + /** + * Source relationship type URI, or `null` when a part type has none. Note + * that implicit relationships (e.g. core properties) still carry a type. + */ + relationshipType: string | null; + /** + * Target namespace of the XML root, or `null` for binary / arbitrary-XML + * parts (image data, custom XML data storage body). + */ + rootNamespace: string | null; + /** + * Local name of the XML root element, or `null` when the part has no + * fixed root (image data, arbitrary-XML payload). + */ + rootElement: string | null; + /** Typical part-name paths inside the package (informative; varies). */ + typicalPaths: string[]; + /** Source sections in ECMA-376 Part 1. */ + sourceSections: string[]; + /** Implementation notes; spec divergences, common gotchas. */ + notes?: string; + /** Which document families use this part type. 
*/ + packageFamilies: PackageFamily[]; +} + +export const OPC_PARTS: readonly OpcPart[] = [ + // --- WordprocessingML ---------------------------------------------------- + { + name: "Main Document Part", + key: "wml-document", + contentType: "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml", + relationshipType: + "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument", + rootNamespace: "http://schemas.openxmlformats.org/wordprocessingml/2006/main", + rootElement: "document", + typicalPaths: ["word/document.xml"], + sourceSections: ["Part 1, §11.3.10"], + packageFamilies: ["wordprocessing"], + }, + { + name: "Style Definitions Part", + key: "wml-styles", + contentType: "application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml", + relationshipType: "http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles", + rootNamespace: "http://schemas.openxmlformats.org/wordprocessingml/2006/main", + rootElement: "styles", + typicalPaths: ["word/styles.xml"], + sourceSections: ["Part 1, §11.3.12"], + packageFamilies: ["wordprocessing"], + }, + { + name: "Settings Part", + key: "wml-settings", + contentType: "application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml", + relationshipType: + "http://schemas.openxmlformats.org/officeDocument/2006/relationships/settings", + rootNamespace: "http://schemas.openxmlformats.org/wordprocessingml/2006/main", + rootElement: "settings", + typicalPaths: ["word/settings.xml"], + sourceSections: ["Part 1, §11.3.3"], + packageFamilies: ["wordprocessing"], + }, + { + name: "Numbering Definitions Part", + key: "wml-numbering", + contentType: "application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml", + relationshipType: + "http://schemas.openxmlformats.org/officeDocument/2006/relationships/numbering", + rootNamespace: "http://schemas.openxmlformats.org/wordprocessingml/2006/main", + rootElement: "numbering", + 
typicalPaths: ["word/numbering.xml"], + sourceSections: ["Part 1, §11.3.11"], + packageFamilies: ["wordprocessing"], + }, + { + name: "Comments Part", + key: "wml-comments", + contentType: "application/vnd.openxmlformats-officedocument.wordprocessingml.comments+xml", + relationshipType: + "http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments", + rootNamespace: "http://schemas.openxmlformats.org/wordprocessingml/2006/main", + rootElement: "comments", + typicalPaths: ["word/comments.xml"], + sourceSections: ["Part 1, §11.3.2"], + packageFamilies: ["wordprocessing"], + }, + { + name: "Footnotes Part", + key: "wml-footnotes", + contentType: "application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml", + relationshipType: + "http://schemas.openxmlformats.org/officeDocument/2006/relationships/footnotes", + rootNamespace: "http://schemas.openxmlformats.org/wordprocessingml/2006/main", + rootElement: "footnotes", + typicalPaths: ["word/footnotes.xml"], + sourceSections: ["Part 1, §11.3.7"], + packageFamilies: ["wordprocessing"], + }, + { + name: "Endnotes Part", + key: "wml-endnotes", + contentType: "application/vnd.openxmlformats-officedocument.wordprocessingml.endnotes+xml", + relationshipType: + "http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes", + rootNamespace: "http://schemas.openxmlformats.org/wordprocessingml/2006/main", + rootElement: "endnotes", + typicalPaths: ["word/endnotes.xml"], + sourceSections: ["Part 1, §11.3.5"], + packageFamilies: ["wordprocessing"], + }, + { + name: "Header Part", + key: "wml-header", + contentType: "application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml", + relationshipType: "http://schemas.openxmlformats.org/officeDocument/2006/relationships/header", + rootNamespace: "http://schemas.openxmlformats.org/wordprocessingml/2006/main", + rootElement: "hdr", + typicalPaths: ["word/header1.xml", "word/header2.xml"], + sourceSections: ["Part 1, §11.3.9"], 
+ notes: + "A package can contain multiple header parts; each is referenced from sectPr in the main document.", + packageFamilies: ["wordprocessing"], + }, + { + name: "Footer Part", + key: "wml-footer", + contentType: "application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml", + relationshipType: "http://schemas.openxmlformats.org/officeDocument/2006/relationships/footer", + rootNamespace: "http://schemas.openxmlformats.org/wordprocessingml/2006/main", + rootElement: "ftr", + typicalPaths: ["word/footer1.xml", "word/footer2.xml"], + sourceSections: ["Part 1, §11.3.6"], + notes: + "A package can contain multiple footer parts; each is referenced from sectPr in the main document.", + packageFamilies: ["wordprocessing"], + }, + + // --- SpreadsheetML ------------------------------------------------------- + { + name: "Workbook Part", + key: "sml-workbook", + contentType: "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml", + relationshipType: + "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument", + rootNamespace: "http://schemas.openxmlformats.org/spreadsheetml/2006/main", + rootElement: "workbook", + typicalPaths: ["xl/workbook.xml"], + sourceSections: ["Part 1, §12.3.23"], + packageFamilies: ["spreadsheet"], + }, + { + name: "Worksheet Part", + key: "sml-worksheet", + contentType: "application/vnd.openxmlformats-officedocument.spreadsheetml.worksheet+xml", + relationshipType: + "http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet", + rootNamespace: "http://schemas.openxmlformats.org/spreadsheetml/2006/main", + rootElement: "worksheet", + typicalPaths: ["xl/worksheets/sheet1.xml"], + sourceSections: ["Part 1, §12.3.24"], + packageFamilies: ["spreadsheet"], + }, + { + name: "Shared String Table Part", + key: "sml-shared-strings", + contentType: "application/vnd.openxmlformats-officedocument.spreadsheetml.sharedStrings+xml", + relationshipType: + 
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/sharedStrings", + rootNamespace: "http://schemas.openxmlformats.org/spreadsheetml/2006/main", + rootElement: "sst", + typicalPaths: ["xl/sharedStrings.xml"], + sourceSections: ["Part 1, §12.3.15"], + packageFamilies: ["spreadsheet"], + }, + + // --- PresentationML ----------------------------------------------------- + { + name: "Presentation Part", + key: "pml-presentation", + contentType: + "application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml", + relationshipType: + "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument", + rootNamespace: "http://schemas.openxmlformats.org/presentationml/2006/main", + rootElement: "presentation", + typicalPaths: ["ppt/presentation.xml"], + sourceSections: ["Part 1, §13.3.6"], + packageFamilies: ["presentation"], + }, + { + name: "Slide Part", + key: "pml-slide", + contentType: "application/vnd.openxmlformats-officedocument.presentationml.slide+xml", + relationshipType: "http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide", + rootNamespace: "http://schemas.openxmlformats.org/presentationml/2006/main", + rootElement: "sld", + typicalPaths: ["ppt/slides/slide1.xml"], + sourceSections: ["Part 1, §13.3.8"], + packageFamilies: ["presentation"], + }, + { + name: "Slide Layout Part", + key: "pml-slide-layout", + contentType: "application/vnd.openxmlformats-officedocument.presentationml.slideLayout+xml", + relationshipType: + "http://schemas.openxmlformats.org/officeDocument/2006/relationships/slideLayout", + rootNamespace: "http://schemas.openxmlformats.org/presentationml/2006/main", + rootElement: "sldLayout", + typicalPaths: ["ppt/slideLayouts/slideLayout1.xml"], + sourceSections: ["Part 1, §13.3.9"], + packageFamilies: ["presentation"], + }, + { + name: "Slide Master Part", + key: "pml-slide-master", + contentType: 
"application/vnd.openxmlformats-officedocument.presentationml.slideMaster+xml", + relationshipType: + "http://schemas.openxmlformats.org/officeDocument/2006/relationships/slideMaster", + rootNamespace: "http://schemas.openxmlformats.org/presentationml/2006/main", + rootElement: "sldMaster", + typicalPaths: ["ppt/slideMasters/slideMaster1.xml"], + sourceSections: ["Part 1, §13.3.10"], + packageFamilies: ["presentation"], + }, + + // --- Cross-cutting ------------------------------------------------------ + { + name: "Core File Properties Part", + key: "core-properties", + contentType: "application/vnd.openxmlformats-package.core-properties+xml", + relationshipType: + "http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties", + rootNamespace: "http://schemas.openxmlformats.org/package/2006/metadata/core-properties", + rootElement: "coreProperties", + typicalPaths: ["docProps/core.xml"], + sourceSections: ["Part 1, §15.2.12.1"], + notes: + "Targeted by an implicit relationship from the package root, not from the main document.", + packageFamilies: ["wordprocessing", "spreadsheet", "presentation"], + }, + { + name: "Extended File Properties Part", + key: "extended-properties", + contentType: "application/vnd.openxmlformats-officedocument.extended-properties+xml", + relationshipType: + "http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties", + rootNamespace: "http://schemas.openxmlformats.org/officeDocument/2006/extended-properties", + rootElement: "Properties", + typicalPaths: ["docProps/app.xml"], + sourceSections: ["Part 1, §15.2.12.3"], + packageFamilies: ["wordprocessing", "spreadsheet", "presentation"], + }, + { + name: "Custom File Properties Part", + key: "custom-properties", + contentType: "application/vnd.openxmlformats-officedocument.custom-properties+xml", + relationshipType: + "http://schemas.openxmlformats.org/officeDocument/2006/relationships/custom-properties", + rootNamespace: 
"http://schemas.openxmlformats.org/officeDocument/2006/custom-properties", + rootElement: "Properties", + typicalPaths: ["docProps/custom.xml"], + sourceSections: ["Part 1, §15.2.12.2"], + packageFamilies: ["wordprocessing", "spreadsheet", "presentation"], + }, + { + name: "Theme Part", + key: "theme", + contentType: "application/vnd.openxmlformats-officedocument.theme+xml", + relationshipType: "http://schemas.openxmlformats.org/officeDocument/2006/relationships/theme", + rootNamespace: "http://schemas.openxmlformats.org/drawingml/2006/main", + rootElement: "theme", + typicalPaths: ["word/theme/theme1.xml", "xl/theme/theme1.xml", "ppt/theme/theme1.xml"], + sourceSections: ["Part 1, §14.2.7.10"], + packageFamilies: ["wordprocessing", "spreadsheet", "presentation"], + }, + { + name: "Image Part", + key: "image", + // Enumerated set called out in Part 1 §15.2.13. Real [Content_Types].xml + // entries use a specific media type per image, not a wildcard; agents + // looking up image/png must resolve to this record. + contentType: [ + "image/png", + "image/jpeg", + "image/gif", + "image/tiff", + "image/x-emf", + "image/x-wmf", + "image/bmp", + ], + relationshipType: "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image", + rootNamespace: null, + rootElement: null, + typicalPaths: ["word/media/image1.png", "xl/media/image1.png", "ppt/media/image1.png"], + sourceSections: ["Part 1, §15.2.13"], + notes: + "Binary part; the content type recorded in [Content_Types].xml is the specific image media type. Each image becomes its own part. 
Other image/* media types may appear in practice; only the spec-enumerated set is indexed here.", + packageFamilies: ["wordprocessing", "spreadsheet", "presentation"], + }, + { + name: "Custom XML Data Storage Part", + key: "custom-xml-data", + contentType: "application/xml", + relationshipType: + "http://schemas.openxmlformats.org/officeDocument/2006/relationships/customXml", + rootNamespace: null, + rootElement: null, + typicalPaths: ["customXml/item1.xml", "customXml/item2.xml"], + sourceSections: ["Part 1, §15.2.5"], + notes: + "Arbitrary XML payload; root namespace and element are whatever the consumer puts there. Each storage part has a sibling Custom XML Data Storage Properties Part that identifies it.", + packageFamilies: ["wordprocessing", "spreadsheet", "presentation"], + }, + { + name: "Custom XML Data Storage Properties Part", + key: "custom-xml-data-properties", + contentType: "application/vnd.openxmlformats-officedocument.customXmlProperties+xml", + relationshipType: + "http://schemas.openxmlformats.org/officeDocument/2006/relationships/customXmlProps", + rootNamespace: "http://schemas.openxmlformats.org/officeDocument/2006/customXml", + rootElement: "datastoreItem", + typicalPaths: ["customXml/itemProps1.xml", "customXml/itemProps2.xml"], + sourceSections: ["Part 1, §15.2.6", "Part 1, §22.5.2.1"], + notes: + "Spec/XSD divergence: ECMA-376 Part 1 §15.2.6 names the root namespace as `.../officeDocument/customXmlDataProps`, but the shipped XSD targets `.../officeDocument/2006/customXml`. The schema-graph URI (used here) is what real packages use and what `ooxml_element` keys on.", + packageFamilies: ["wordprocessing", "spreadsheet", "presentation"], + }, +]; + +// --- Lookup helpers -------------------------------------------------------- + +/** Normalize contentType (string | string[]) to a stable array view. */ +export function contentTypesOf(p: OpcPart): readonly string[] { + return Array.isArray(p.contentType) ? 
p.contentType : [p.contentType]; +} + +let byContentType: Map<string, OpcPart> | null = null; +let byRelationshipType: Map<string, OpcPart[]> | null = null; + +function indexes(): { + byContentType: Map<string, OpcPart>; + byRelationshipType: Map<string, OpcPart[]>; +} { + if (!byContentType || !byRelationshipType) { + byContentType = new Map(); + byRelationshipType = new Map(); + for (const p of OPC_PARTS) { + // Content type is unique per record by construction (binary parts + // enumerate their accepted media types; XML parts have one each). + // Index every alias. + for (const ct of contentTypesOf(p)) byContentType.set(ct, p); + // Relationship type is intentionally non-unique: the .../relationships/ + // officeDocument rel points at the main part for WML / SML / PML, + // the .../relationships/customXml rel can target any custom XML + // storage part regardless of family. Group hits per URI so the + // caller can disambiguate. + if (p.relationshipType) { + const bucket = byRelationshipType.get(p.relationshipType); + if (bucket) bucket.push(p); + else byRelationshipType.set(p.relationshipType, [p]); + } + } + } + return { byContentType, byRelationshipType }; +} + +/** Exact-match lookup by OPC content type. */ +export function findPartByContentType(contentType: string): OpcPart | null { + return indexes().byContentType.get(contentType) ?? null; +} + +/** + * Lookup by source relationship type URI. Returns every part that uses + * the URI: the `.../relationships/officeDocument` rel, for example, + * points at three different main parts (Word / Excel / PowerPoint), and + * the caller has to disambiguate by package family. Returns an empty + * array on miss. + */ +export function findPartsByRelationshipType(relationshipType: string): readonly OpcPart[] { + return indexes().byRelationshipType.get(relationshipType) ?? []; +} + +/** + * Case-insensitive substring search over name, key, content type(s), + * relationship type, root namespace, root element, and notes. 
Returns + * matches in their declared order in OPC_PARTS (which groups by family). + */ +export function searchParts(query: string): OpcPart[] { + const q = query.trim().toLowerCase(); + if (!q) return [...OPC_PARTS]; + const hits: OpcPart[] = []; + for (const p of OPC_PARTS) { + const haystack = [ + p.name, + p.key, + ...contentTypesOf(p), + p.relationshipType ?? "", + p.rootNamespace ?? "", + p.rootElement ?? "", + p.notes ?? "", + ] + .join(" ") + .toLowerCase(); + if (haystack.includes(q)) hits.push(p); + } + return hits; +} diff --git a/apps/web/public/llms.txt b/apps/web/public/llms.txt index e5e65f2..f1c8c99 100644 --- a/apps/web/public/llms.txt +++ b/apps/web/public/llms.txt @@ -33,7 +33,7 @@ Every page combines XML structure, live rendered previews, and implementation no ## MCP Server -OOXML reference for AI assistants. Two tool families: prose search across 18,000+ ECMA-376 spec chunks, and deterministic schema lookup over the parsed XSDs. +OOXML reference for AI assistants. Three tool families: prose search across 18,000+ ECMA-376 spec chunks, deterministic schema lookup over the parsed XSDs, and curated OPC package metadata. Prose search (over the spec PDFs): - `ooxml_search`: Semantic search — ask questions in natural language @@ -47,6 +47,9 @@ Schema lookup (over the parsed XSDs): - `ooxml_enum`: Enumeration values for a simpleType - `ooxml_namespace`: Vocabularies and symbol counts for a namespace URI +Package metadata (curated from ECMA-376 Part 1 §11.3.x / §12.3.x / §13.3.x / §15.x): +- `ooxml_package_part`: OPC part type by content type, source relationship type, or query substring + ## About ooxml.dev is built by SuperDoc — DOCX editing and tooling (https://superdoc.dev). SuperDoc is a document engine that renders OOXML natively in the browser. The implementation notes on ooxml.dev come from building that engine against thousands of real-world documents. 
diff --git a/apps/web/src/pages/Mcp.tsx b/apps/web/src/pages/Mcp.tsx index 2a4d32e..5712e28 100644 --- a/apps/web/src/pages/Mcp.tsx +++ b/apps/web/src/pages/Mcp.tsx @@ -57,6 +57,14 @@ const SCHEMA_TOOLS = [ }, ]; +const PACKAGE_TOOLS = [ + { + name: "ooxml_package_part", + description: + "Look up OPC part types by content type, source relationship type, or query substring. Covers Word, Excel, PowerPoint, and cross-cutting parts (properties, theme, image, custom XML).", + }, +]; + const EXAMPLE_QUERIES = [ "How do I add borders to a table cell?", "How does numbering work in WordprocessingML?", @@ -96,8 +104,9 @@ export function Mcp() {

OOXML reference for AI assistants

- Two tool families: prose search across 18,000+ spec chunks, and deterministic schema - lookup over the parsed XSDs. Ask in natural language, or query the structure directly. + Three tool families: prose search across 18,000+ spec chunks, deterministic schema + lookup over the parsed XSDs, and curated OPC package metadata. Ask in natural language, + or query the structure directly.

@@ -262,6 +271,27 @@ export function Mcp() { + {/* Package metadata tools */} +
+

Package metadata

+

+ Curated OPC part-type reference: content types, source relationship types, root + namespaces, and typical paths in the package. +

+
+ {PACKAGE_TOOLS.map((tool) => ( +
+
+ + {tool.name} + +

{tool.description}

+
+
+ ))} +
+
+ {/* Example Queries */}

Example Queries

@@ -295,9 +325,9 @@ export function Mcp() { tools.

- By connecting to this MCP server, your AI assistant gains both prose search across the - ECMA-376 specification and deterministic schema lookup over the parsed XSDs—making it - much easier to work with Office Open XML. + By connecting to this MCP server, your AI assistant gains prose search across the + ECMA-376 specification, deterministic schema lookup over the parsed XSDs, and curated + OPC package metadata—making it much easier to work with Office Open XML.

diff --git a/brand.md b/brand.md index 56dd73f..8cded6b 100644 --- a/brand.md +++ b/brand.md @@ -52,7 +52,7 @@ The commercial document vendors (Aspose, Syncfusion, TX Text Control, Nutrient) **Structural differentiators**: - **Live previews** — Every XML example renders in real-time via SuperDoc. No other OOXML reference shows you what the XML actually produces. - **Implementation notes from production** — Not spec commentary. Notes from building a shipping document engine against real-world documents. -- **AI-native reference** — MCP server with two tool families: prose search across 18,000+ spec chunks (ask questions in natural language) and deterministic schema lookup over the parsed XSDs (legal children, attribute lists, enum values, namespaces — exact answers, no hallucination). +- **AI-native reference** — MCP server with three tool families: prose search across 18,000+ spec chunks (ask questions in natural language), deterministic schema lookup over the parsed XSDs (legal children, attribute lists, enum values, namespaces — exact answers, no hallucination), and curated OPC package metadata (content types, relationship types, root namespaces for every part type that shows up in a real .docx / .xlsx / .pptx). - **Real document corpus** — Backed by docx-corpus (1M+ real documents). Observations are tested against actual documents in the wild, not just spec examples. - **Format-first, tool-agnostic** — Useful whether you're building on SuperDoc, Aspose, your own renderer, or just trying to understand a .docx file. @@ -133,7 +133,7 @@ _Use on homepage hero, social bios, link previews._ **Slogans for different contexts**: - Developer discovery: "5,000 pages of spec. The 200 that matter. The notes you actually need." -- AI/MCP context: "Ask the spec anything, or query the schema directly. Two tool families, one server." +- AI/MCP context: "Ask the spec anything, query the schema directly, or look up any OPC part type. One server." 
- Community pitch: "Hard-won OOXML knowledge, shared freely." - SuperDoc connection: "Built by SuperDoc — DOCX editing and tooling. Open to everyone." - Credibility: "Every example is a working document." diff --git a/tests/mcp-server/opc-parts.test.ts b/tests/mcp-server/opc-parts.test.ts new file mode 100644 index 0000000..3afe3d3 --- /dev/null +++ b/tests/mcp-server/opc-parts.test.ts @@ -0,0 +1,236 @@ +/** + * Tests for `ooxml_package_part` and the curated OPC dataset. + * + * Static data, no DB. We exercise both the lookup helpers directly and + * the tool dispatch in runOoxmlTool. SQL arg to the dispatch is a stub + * because this tool's case never reaches the database. + */ + +import { expect, test } from "bun:test"; +import { + contentTypesOf, + findPartByContentType, + findPartsByRelationshipType, + OPC_PARTS, + type OpcPart, + searchParts, +} from "../../apps/mcp-server/src/opc-parts.ts"; +import { runOoxmlTool } from "../../apps/mcp-server/src/ooxml-tools.ts"; + +// runOoxmlTool's sql arg isn't touched by the ooxml_package_part case; an +// empty stub keeps the type happy without dragging in a DB. +const sqlStub = (() => { + throw new Error("sql should not be called for ooxml_package_part"); +}) as unknown as Parameters[2]; + +test("OPC_PARTS dataset has unique keys and non-empty required fields", () => { + const keys = new Set(); + for (const p of OPC_PARTS) { + expect(keys.has(p.key)).toBe(false); + keys.add(p.key); + expect(p.name.length).toBeGreaterThan(0); + expect(contentTypesOf(p).length).toBeGreaterThan(0); + for (const ct of contentTypesOf(p)) expect(ct.length).toBeGreaterThan(0); + expect(p.typicalPaths.length).toBeGreaterThan(0); + expect(p.sourceSections.length).toBeGreaterThan(0); + expect(p.packageFamilies.length).toBeGreaterThan(0); + } + // Sanity floor: every major Office family should be represented. 
+ const families = new Set(OPC_PARTS.flatMap((p) => p.packageFamilies)); + expect(families).toEqual(new Set(["wordprocessing", "spreadsheet", "presentation"])); +}); + +test("findPartByContentType: exact match for Word styles part", () => { + const hit = findPartByContentType( + "application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml", + ); + expect(hit?.key).toBe("wml-styles"); + expect(hit?.rootElement).toBe("styles"); +}); + +test("findPartByContentType: returns null on miss", () => { + expect(findPartByContentType("application/x-not-real")).toBeNull(); +}); + +test("findPartByContentType: every enumerated image media type resolves to the Image Part", () => { + // Image Part stores multiple content types (image/png, image/jpeg, ...). + // Each must resolve via exact lookup so real [Content_Types].xml entries + // match without the caller stripping a wildcard. + for (const ct of ["image/png", "image/jpeg", "image/gif", "image/tiff", "image/x-emf"]) { + const hit = findPartByContentType(ct); + expect(hit?.key).toBe("image"); + } + // An image media type the spec doesn't enumerate (image/avif, image/webp) + // will fall through to null. That's the expected behavior; the not-found + // path guides the agent to query for "image" instead. + expect(findPartByContentType("image/avif")).toBeNull(); +}); + +test("findPartsByRelationshipType: unique rel returns a single-element array", () => { + const hits = findPartsByRelationshipType( + "http://schemas.openxmlformats.org/officeDocument/2006/relationships/customXmlProps", + ); + expect(hits).toHaveLength(1); + expect(hits[0].key).toBe("custom-xml-data-properties"); + // Spec/XSD URI policy: rootNamespace pins the XSD URI, not the + // spec-prose .../customXmlDataProps URI. 
+ expect(hits[0].rootNamespace).toBe( + "http://schemas.openxmlformats.org/officeDocument/2006/customXml", + ); +}); + +test("findPartsByRelationshipType: shared officeDocument rel returns all three main parts", () => { + // The .../relationships/officeDocument rel points at the main part for + // WML / SML / PML. Collapsing them in the lookup hides two of three. + const hits = findPartsByRelationshipType( + "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument", + ); + const keys = hits.map((h) => h.key).sort(); + expect(keys).toEqual(["pml-presentation", "sml-workbook", "wml-document"]); +}); + +test("findPartsByRelationshipType: miss returns empty array", () => { + expect(findPartsByRelationshipType("http://example.invalid/not-a-rel")).toEqual([]); +}); + +test("searchParts: empty query returns the full set", () => { + const all = searchParts(""); + expect(all.length).toBe(OPC_PARTS.length); +}); + +test("searchParts: case-insensitive across name, namespace, notes", () => { + const theme = searchParts("THEME"); + expect(theme.map((p) => p.key)).toContain("theme"); + + // 'customXml' appears in name AND namespace AND notes for multiple parts. 
+ const customXml = searchParts("customXml"); + const keys = customXml.map((p) => p.key); + expect(keys).toContain("custom-xml-data"); + expect(keys).toContain("custom-xml-data-properties"); +}); + +test("ooxml_package_part: exact content_type returns full report", async () => { + const out = await runOoxmlTool( + "ooxml_package_part", + { + content_type: + "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml", + }, + sqlStub, + ); + expect(out).toContain("## OPC Part: Main Document Part"); + expect(out).toContain("`wml-document`"); + expect(out).toContain("word/document.xml"); + expect(out).toContain("Part 1, §11.3.10"); + expect(out).toContain("wordprocessing"); +}); + +test("ooxml_package_part: exact relationship_type matches the customXmlProps part", async () => { + const out = await runOoxmlTool( + "ooxml_package_part", + { + relationship_type: + "http://schemas.openxmlformats.org/officeDocument/2006/relationships/customXmlProps", + }, + sqlStub, + ); + expect(out).toContain("## OPC Part: Custom XML Data Storage Properties Part"); + // The spec/XSD divergence note is surfaced. + expect(out).toContain("XSD targets"); +}); + +test("ooxml_package_part: shared officeDocument rel renders all three main parts", async () => { + const out = await runOoxmlTool( + "ooxml_package_part", + { + relationship_type: + "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument", + }, + sqlStub, + ); + expect(out).toContain("Package parts using relationship"); + // All three main parts must appear; the previous Map + // regression caused only the last-inserted (Presentation) to surface. 
+ expect(out).toContain("`wml-document`"); + expect(out).toContain("`sml-workbook`"); + expect(out).toContain("`pml-presentation`"); + expect(out).toContain("shared across package families"); +}); + +test("ooxml_package_part: image/png exact content_type resolves to the Image Part", async () => { + const out = await runOoxmlTool( + "ooxml_package_part", + { content_type: "image/png" }, + sqlStub, + ); + expect(out).toContain("## OPC Part: Image Part"); + // Multi-content-type record renders as plural label. + expect(out).toContain("content types:"); + expect(out).toContain("`image/png`"); +}); + +test("ooxml_package_part: query substring returns a list table", async () => { + const out = await runOoxmlTool("ooxml_package_part", { query: "slide" }, sqlStub); + expect(out).toContain("Package parts matching 'slide'"); + expect(out).toContain("`pml-slide`"); + expect(out).toContain("`pml-slide-layout`"); + expect(out).toContain("`pml-slide-master`"); + // WML parts should be filtered out. + expect(out).not.toContain("`wml-styles`"); +}); + +test("ooxml_package_part: no args lists the full curated set", async () => { + const out = await runOoxmlTool("ooxml_package_part", {}, sqlStub); + expect(out).toContain("Curated OPC package parts"); + // Spot-check entries from each family. 
+ expect(out).toContain("`wml-document`"); + expect(out).toContain("`sml-workbook`"); + expect(out).toContain("`pml-presentation`"); + expect(out).toContain("`core-properties`"); +}); + +test("ooxml_package_part: content_type miss surfaces helpful next steps", async () => { + const out = await runOoxmlTool( + "ooxml_package_part", + { content_type: "application/x-not-real" }, + sqlStub, + ); + expect(out).toContain("Not found: OPC part with content type 'application/x-not-real'"); + expect(out).toContain("`ooxml_package_part` with a `query` substring"); + expect(out).toContain("`ooxml_search`"); +}); + +test("ooxml_package_part: relationship_type miss surfaces helpful next steps", async () => { + const out = await runOoxmlTool( + "ooxml_package_part", + { relationship_type: "http://example.invalid/rel" }, + sqlStub, + ); + expect(out).toContain("Not found: OPC part with relationship type 'http://example.invalid/rel'"); +}); + +test("ooxml_package_part: empty query result surfaces 'no matches'", async () => { + const out = await runOoxmlTool( + "ooxml_package_part", + { query: "this-should-not-match-anything" }, + sqlStub, + ); + expect(out).toContain("(no matches)"); + expect(out).toContain("ooxml_package_part` with no args"); +}); + +test("OPC_PARTS keys follow the documented kebab-case shape", () => { + for (const p of OPC_PARTS) { + expect(p.key).toMatch(/^[a-z][a-z0-9-]*$/); + } +}); + +test("OPC_PARTS family-specific keys are prefixed", () => { + for (const p of OPC_PARTS as readonly OpcPart[]) { + if (p.packageFamilies.length === 1) { + const fam = p.packageFamilies[0]; + const expectedPrefix = fam === "wordprocessing" ? "wml-" : fam === "spreadsheet" ? 
"sml-" : "pml-"; + expect(p.key.startsWith(expectedPrefix)).toBe(true); + } + } +}); diff --git a/tests/mcp-server/tools-list.test.ts b/tests/mcp-server/tools-list.test.ts index ae02a94..ea679a7 100644 --- a/tests/mcp-server/tools-list.test.ts +++ b/tests/mcp-server/tools-list.test.ts @@ -21,6 +21,8 @@ const EXPECTED_TOOL_NAMES = [ "ooxml_attributes", "ooxml_enum", "ooxml_namespace", + // OPC package metadata (over the curated opc-parts dataset) + "ooxml_package_part", ] as const; interface JsonRpcResponse {