-
Notifications
You must be signed in to change notification settings - Fork 139
fix: normalize root-level docx parts on import #3254
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,6 +21,66 @@ const FONT_CONTENT_TYPES = { | |
| otf: 'application/vnd.ms-opentype', | ||
| }; | ||
|
|
||
| const OFFICE_DOCUMENT_RELATIONSHIP_TYPE = | ||
| 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument'; | ||
|
|
||
| const normalizePackagePath = (path) => { | ||
| if (typeof path !== 'string') return null; | ||
| return path.replace(/^\/+/, '').replace(/^\.\//, ''); | ||
| }; | ||
|
|
||
| const getRelationshipElements = (relationshipsXml) => { | ||
| if (!relationshipsXml) return []; | ||
|
|
||
| try { | ||
| const parsed = xmljs.xml2js(relationshipsXml, { compact: false }); | ||
| const relationships = parsed.elements?.find((el) => el.name === 'Relationships'); | ||
| return relationships?.elements ?? []; | ||
| } catch { | ||
| return []; | ||
| } | ||
| }; | ||
|
|
||
| const getMainDocumentPath = async (zip) => { | ||
| const packageRels = zip.file('_rels/.rels'); | ||
| if (!packageRels) return null; | ||
|
|
||
| const relsXml = ensureXmlString(await packageRels.async('uint8array')); | ||
| const officeDocumentRel = getRelationshipElements(relsXml).find((rel) => { | ||
| const attrs = rel?.attributes ?? {}; | ||
| return attrs.Type === OFFICE_DOCUMENT_RELATIONSHIP_TYPE && attrs.TargetMode !== 'External'; | ||
| }); | ||
|
|
||
| return normalizePackagePath(officeDocumentRel?.attributes?.Target); | ||
| }; | ||
|
|
||
| const isRootLevelMainDocumentPackage = (mainDocumentPath) => { | ||
| return Boolean(mainDocumentPath) && !mainDocumentPath.includes('/'); | ||
| }; | ||
|
|
||
| const normalizeRootLevelWordPartPath = (name, mainDocumentPath) => { | ||
| if (!isRootLevelMainDocumentPackage(mainDocumentPath)) return name; | ||
| if (name === '[Content_Types].xml' || name === '_rels/.rels') return name; | ||
| if (name.startsWith('word/')) return name; | ||
|
|
||
| const mainDocumentRelsPath = `_rels/${mainDocumentPath}.rels`; | ||
| if (name === mainDocumentRelsPath) return 'word/_rels/document.xml.rels'; | ||
|
|
||
| if (name.startsWith('_rels/') && name.endsWith('.rels')) { | ||
| return `word/${name}`; | ||
| } | ||
|
|
||
| if (!name.includes('/') && name.endsWith('.xml')) { | ||
| return `word/${name}`; | ||
| } | ||
|
|
||
| if (name.startsWith('images/')) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. important: only verified by adding a fixture with root-level likely fix: invert the rewrite logic - normalize any non- |
||
| return `word/${name}`; | ||
| } | ||
|
Comment on lines
+77
to
+79
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
When Useful? React with 👍 / 👎.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @financialvice same issue flagged in the inline review at line 77, extends to |
||
|
|
||
| return name; | ||
| }; | ||
|
|
||
| /** | ||
| * Class to handle unzipping and zipping of docx files | ||
| */ | ||
|
|
@@ -111,10 +171,11 @@ class DocxZipper { | |
| // If caller supplied a password but the file isn't encrypted, ignore it. | ||
|
|
||
| const extractedFiles = await this.unzip(fileData); | ||
| const mainDocumentPath = await getMainDocumentPath(extractedFiles); | ||
| const files = Object.entries(extractedFiles.files); | ||
|
|
||
| for (const [, zipEntry] of files) { | ||
| const name = zipEntry.name; | ||
| const name = normalizeRootLevelWordPartPath(zipEntry.name, mainDocumentPath); | ||
|
|
||
| if (isXmlLike(name)) { | ||
| // Read raw bytes and decode (handles UTF-8 & UTF-16) | ||
|
|
@@ -123,7 +184,9 @@ class DocxZipper { | |
| this.files.push({ name, content }); | ||
| } else if ( | ||
| (name.startsWith('word/media') && name !== 'word/media/') || | ||
| (name.startsWith('word/images') && name !== 'word/images/') || | ||
| (zipEntry.name.startsWith('media') && zipEntry.name !== 'media/') || | ||
| (zipEntry.name.startsWith('images') && zipEntry.name !== 'images/') || | ||
| (name.startsWith('media') && name !== 'media/') || | ||
| (name.startsWith('word/embeddings') && name !== 'word/embeddings/') | ||
| ) { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
important: this hardcodes
word/_rels/document.xml.relsas the rewrite target, but the main document part itself falls through to the generic root-XML branch and ends up atword/${name}. For a package whose main part is named anything other thandocument.xml(e.g.main.xml), the part lands atword/main.xmlwhile its rels land atword/_rels/document.xml.rels, and the rest of the importer expects the main doc atword/document.xml.ECMA-376 Part 1 §11.3.10 doesn't require the main document part to be named
document.xml- only that it be the target of the package-level officeDocument relationship.likely fix: when
name === mainDocumentPath, return'word/document.xml'so the part and its rels stay in sync regardless of the producer's chosen name.