Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 64 additions & 1 deletion packages/super-editor/src/editors/v1/core/DocxZipper.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,66 @@ const FONT_CONTENT_TYPES = {
otf: 'application/vnd.ms-opentype',
};

// Relationship `Type` value that getMainDocumentPath matches against the entries of the
// package-level `_rels/.rels` file to locate the main document part.
const OFFICE_DOCUMENT_RELATIONSHIP_TYPE =
'http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument';

/**
 * Turn a relationship target into a path relative to the package root.
 *
 * All leading slashes are dropped first; after that a single leading `./` is dropped.
 *
 * @param {unknown} path - Candidate target value.
 * @returns {string|null} The trimmed path, or `null` when `path` is not a string.
 */
const normalizePackagePath = (path) => {
  if (typeof path === 'string') {
    const withoutLeadingSlashes = path.replace(/^\/+/, '');
    return withoutLeadingSlashes.startsWith('./') ? withoutLeadingSlashes.slice(2) : withoutLeadingSlashes;
  }

  return null;
};

/**
 * Extract the child nodes of the top-level `Relationships` element from a rels XML string.
 *
 * Parsing is best-effort: any error raised while parsing or searching yields an empty list.
 *
 * @param {string} relationshipsXml - Contents of a `.rels` part.
 * @returns {Array<object>} Children of `Relationships` as produced by `xmljs.xml2js` in
 *   non-compact mode, or `[]` when the input is falsy, unparsable, or has no such element.
 */
const getRelationshipElements = (relationshipsXml) => {
  const noRelationships = [];
  if (!relationshipsXml) return noRelationships;

  try {
    const { elements: topLevelNodes } = xmljs.xml2js(relationshipsXml, { compact: false });
    const root = topLevelNodes?.find((node) => node.name === 'Relationships');
    const children = root?.elements;
    return children ?? noRelationships;
  } catch {
    return noRelationships;
  }
};

/**
 * Look up the main document part of a package via its package-level relationships.
 *
 * Reads `_rels/.rels`, picks the first relationship whose `Type` is the officeDocument
 * relationship type and whose `TargetMode` is not `External`, and returns its `Target`
 * run through `normalizePackagePath`.
 *
 * @param {object} zip - Archive object exposing `file(name)`; entries expose `async(type)`.
 * @returns {Promise<string|null>} Normalized target path, or `null` when `_rels/.rels` is
 *   absent or no matching relationship carries a string `Target`.
 */
const getMainDocumentPath = async (zip) => {
  const relsEntry = zip.file('_rels/.rels');
  if (!relsEntry) {
    return null;
  }

  const rawBytes = await relsEntry.async('uint8array');
  const relsXml = ensureXmlString(rawBytes);

  const isInternalOfficeDocument = (rel) => {
    const { Type, TargetMode } = rel?.attributes ?? {};
    return Type === OFFICE_DOCUMENT_RELATIONSHIP_TYPE && TargetMode !== 'External';
  };

  const match = getRelationshipElements(relsXml).find(isInternalOfficeDocument);
  return normalizePackagePath(match?.attributes?.Target);
};

/**
 * Tell whether the main document path has no folder component.
 *
 * @param {string|null|undefined} mainDocumentPath - Path as returned by `getMainDocumentPath`.
 * @returns {boolean} `true` for a non-empty path that contains no `/`; `false` otherwise.
 */
const isRootLevelMainDocumentPackage = (mainDocumentPath) => {
  if (!mainDocumentPath) return false;

  const hasFolderSeparator = mainDocumentPath.includes('/');
  return !hasFolderSeparator;
};

const normalizeRootLevelWordPartPath = (name, mainDocumentPath) => {
if (!isRootLevelMainDocumentPackage(mainDocumentPath)) return name;
if (name === '[Content_Types].xml' || name === '_rels/.rels') return name;
if (name.startsWith('word/')) return name;

const mainDocumentRelsPath = `_rels/${mainDocumentPath}.rels`;
if (name === mainDocumentRelsPath) return 'word/_rels/document.xml.rels';
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

important: this hardcodes word/_rels/document.xml.rels as the rewrite target, but the main document part itself falls through to the generic root-XML branch and ends up at word/${name}. For a package whose main part is named anything other than document.xml (e.g. main.xml), the part lands at word/main.xml while its rels land at word/_rels/document.xml.rels, and the rest of the importer expects the main doc at word/document.xml.

ECMA-376 Part 1 §11.3.10 doesn't require the main document part to be named document.xml - only that it be the target of the package-level officeDocument relationship.

likely fix: when name === mainDocumentPath, return 'word/document.xml' so the part and its rels stay in sync regardless of the producer's chosen name.


if (name.startsWith('_rels/') && name.endsWith('.rels')) {
return `word/${name}`;
}

if (!name.includes('/') && name.endsWith('.xml')) {
return `word/${name}`;
}

if (name.startsWith('images/')) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

important: only images/ is rewritten, but spec-typical root-level packages also use media/, theme/, embeddings/, fonts/. After moving _rels/document.xml.rels to word/_rels/..., the rels file's base URI shifts from / to /word/, so Target="theme/theme1.xml" and Target="media/image1.png" (the shapes shown in Part 1 §14.2.7 and §15.2.14) now resolve to word/theme/... and word/media/... - but those subdirectories aren't normalized here, so the files stay at root and downstream lookups miss them.

verified by adding a fixture with root-level theme/theme1.xml and media/image1.png and running getDocxData: mediaFiles came back as { 'media/image1.png': ... } (expected 'word/media/image1.png'), and the file list didn't contain 'word/theme/theme1.xml'.

likely fix: invert the rewrite logic - normalize any non-word/, non-[Content_Types].xml, non-_rels/.rels, non-docProps/ path by prepending word/. Or rewrite relationship Target values inside the rels XML when relocating the rels file.

return `word/${name}`;
}
Comment on lines +77 to +79
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Normalize root-level media assets during path rewrite

When document.xml is rewritten to word/document.xml, image relationship targets like media/image1.jpeg are later normalized to word/media/... by the importer (normalizeTargetPath), but this path rewriter only remaps images/ and leaves root-level media/ entries untouched. That causes DocxZipper to store bytes under media/... while image resolution looks for word/media/..., so embedded images disappear for root-level packages that use media/ targets.

Useful? React with 👍 / 👎.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@financialvice same issue flagged in the inline review at line 77, extends to theme/, embeddings/, and fonts/ too.


return name;
};

/**
* Class to handle unzipping and zipping of docx files
*/
Expand Down Expand Up @@ -111,10 +171,11 @@ class DocxZipper {
// If caller supplied a password but the file isn't encrypted, ignore it.

const extractedFiles = await this.unzip(fileData);
const mainDocumentPath = await getMainDocumentPath(extractedFiles);
const files = Object.entries(extractedFiles.files);

for (const [, zipEntry] of files) {
const name = zipEntry.name;
const name = normalizeRootLevelWordPartPath(zipEntry.name, mainDocumentPath);

if (isXmlLike(name)) {
// Read raw bytes and decode (handles UTF-8 & UTF-16)
Expand All @@ -123,7 +184,9 @@ class DocxZipper {
this.files.push({ name, content });
} else if (
(name.startsWith('word/media') && name !== 'word/media/') ||
(name.startsWith('word/images') && name !== 'word/images/') ||
(zipEntry.name.startsWith('media') && zipEntry.name !== 'media/') ||
(zipEntry.name.startsWith('images') && zipEntry.name !== 'images/') ||
(name.startsWith('media') && name !== 'media/') ||
(name.startsWith('word/embeddings') && name !== 'word/embeddings/')
) {
Expand Down
57 changes: 57 additions & 0 deletions packages/super-editor/src/editors/v1/core/DocxZipper.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,63 @@ describe('DocxZipper - file extraction', () => {
const documentXml = unzippedXml.find((file) => file.name === 'word/document.xml');
expect(documentXml).toBeTruthy();
});

// Builds an in-memory package whose parts all sit at the zip root (no `word/` folder) and
// checks that getDocxData reports them under `word/`-prefixed names instead.
it('normalizes root-level WordprocessingML parts to canonical word paths', async () => {
const zip = new JSZip();
// Content types reference the parts by their root-level part names.
const contentTypes = `<?xml version="1.0" encoding="UTF-8"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
<Default Extension="xml" ContentType="application/xml"/>
<Override PartName="/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
<Override PartName="/settings.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml"/>
<Override PartName="/header1.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml"/>
</Types>`;
zip.file('[Content_Types].xml', contentTypes);
// Package-level relationships: the officeDocument target has no folder prefix.
zip.file(
'_rels/.rels',
`<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="document.xml"/>
</Relationships>`,
);
// Relationships of the main document, stored next to it at the root; the image target uses `images/`.
zip.file(
'_rels/document.xml.rels',
`<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/settings" Target="settings.xml"/>
<Relationship Id="rId3" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/header" Target="header1.xml"/>
<Relationship Id="rId4" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="images/1.png"/>
</Relationships>`,
);
zip.file(
'document.xml',
`<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body><w:p><w:r><w:t>Hello from root</w:t></w:r></w:p></w:body>
</w:document>`,
);
zip.file(
'settings.xml',
'<?xml version="1.0" encoding="UTF-8"?><w:settings xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"/>',
);
zip.file(
'header1.xml',
'<?xml version="1.0" encoding="UTF-8"?><w:hdr xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"/>',
);
// Only the first four bytes of the PNG signature; the entry just needs to be a non-XML file.
zip.file('images/1.png', Buffer.from([0x89, 0x50, 0x4e, 0x47]));

const buf = await zip.generateAsync({ type: 'nodebuffer' });
const files = await zipper.getDocxData(buf, true);
const names = files.map((file) => file.name);

// XML parts must be listed under their `word/` names...
expect(names).toContain('word/document.xml');
expect(names).toContain('word/settings.xml');
expect(names).toContain('word/header1.xml');
expect(names).toContain('word/_rels/document.xml.rels');
// ...and no longer under the root-level names they had inside the zip.
expect(names).not.toContain('document.xml');
expect(names).not.toContain('_rels/document.xml.rels');
// The image bytes must be keyed by the `word/`-prefixed path in mediaFiles.
expect(zipper.mediaFiles['word/images/1.png']).toBeTruthy();
});
});

// Helper to build a UTF-16LE Buffer with BOM
Expand Down
Loading