Skip to content

Commit 234f273

Browse files
committed
Update AI threshold in benchmark and add check_clip test for character width calculations
- Increased the AI threshold from 0.90 to 0.97 in the benchmark script to improve accuracy in visual comparisons.
- Added a new test file `check_clip.cs` to simulate Classic44 tests, demonstrating character width calculations and output for various strings.
1 parent 586f265 commit 234f273

17 files changed

Lines changed: 1919 additions & 2563 deletions

README.md

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# MiniPdf
1+
# MiniPdf
22

33
<div align="center">
44
<p>
@@ -68,33 +68,25 @@ MiniPdf output is compared against LibreOffice as the reference renderer across
6868

6969
### Visual Comparison
7070

71-
All 90 test cases comparing MiniPdf output (left) vs LibreOffice reference (right). Page 1 shown for multi-page results.
71+
Sample output comparing MiniPdf (left) vs LibreOffice reference (right).
7272

7373
<table>
74-
<tr><th>Test Case</th><th>MiniPdf</th><th>LibreOffice (Reference)</th><th>Score</th></tr>
74+
<tr><th>MiniPdf</th><th>LibreOffice (Reference)</th></tr>
7575
<tr>
76-
<td><b>classic01</b><br/>Basic table with headers</td>
7776
<td><img src="tests/MiniPdf.Benchmark/reports/images/classic01_basic_table_with_headers_p1_minipdf.png" width="320"/></td>
7877
<td><img src="tests/MiniPdf.Benchmark/reports/images/classic01_basic_table_with_headers_p1_reference.png" width="320"/></td>
79-
<td>🟢 99.8%</td>
8078
</tr>
8179
<tr>
82-
<td><b>classic02</b><br/>Multiple worksheets</td>
8380
<td><img src="tests/MiniPdf.Benchmark/reports/images/classic02_multiple_worksheets_p1_minipdf.png" width="320"/></td>
8481
<td><img src="tests/MiniPdf.Benchmark/reports/images/classic02_multiple_worksheets_p1_reference.png" width="320"/></td>
85-
<td>🟢 99.5%</td>
8682
</tr>
8783
<tr>
88-
<td><b>classic03</b><br/>Empty workbook</td>
8984
<td><img src="tests/MiniPdf.Benchmark/reports/images/classic03_empty_workbook_p1_minipdf.png" width="320"/></td>
9085
<td><img src="tests/MiniPdf.Benchmark/reports/images/classic03_empty_workbook_p1_reference.png" width="320"/></td>
91-
<td>🟢 100.0%</td>
9286
</tr>
9387
<tr>
94-
<td><b>classic04</b><br/>Single cell</td>
9588
<td><img src="tests/MiniPdf.Benchmark/reports/images/classic04_single_cell_p1_minipdf.png" width="320"/></td>
9689
<td><img src="tests/MiniPdf.Benchmark/reports/images/classic04_single_cell_p1_reference.png" width="320"/></td>
97-
<td>🟢 100.0%</td>
9890
</tr>
9991
<tr>
10092
<td><b>classic05</b><br/>Wide table</td>

analyze_overflow.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
#!/usr/bin/env python3
2+
"""Check xlsx structure to understand overflow vs clip."""
3+
import zipfile, re, os, sys
4+
sys.stdout.reconfigure(encoding='utf-8')
5+
6+
output_dir = r'd:\git\MiniPdf\tests\MiniPdf.Scripts\output'
7+
8+
def analyze_xlsx(xlsx_path):
    """Summarise which columns are occupied in the first rows of a workbook.

    Args:
        xlsx_path: Path to the ``.xlsx`` (zip) archive to inspect.

    Returns:
        A ``(row_cols, strings)`` tuple. ``row_cols`` maps a row number to the
        set of column letters referenced by ``<c r="...">`` elements in that
        row, limited to the first six rows encountered in the sheet XML.
        ``strings`` holds up to ten ``<t>`` values from
        ``xl/sharedStrings.xml``. Both are empty when the archive contains no
        worksheet part, so callers can always unpack two values.
    """
    from collections import defaultdict

    with zipfile.ZipFile(xlsx_path) as z:
        members = z.namelist()
        names = [
            n for n in members
            if n.startswith('xl/worksheets/sheet') and n.endswith('.xml')
        ]
        if not names:
            # Keep the return shape identical to the normal path; a bare {}
            # here made ``row_cols, strings = analyze_xlsx(...)`` raise.
            return {}, []

        # Only the first worksheet part listed in the archive is inspected.
        with z.open(names[0]) as f:
            sheet_xml = f.read().decode('utf-8')

        sharedstrings = []
        if 'xl/sharedStrings.xml' in members:
            with z.open('xl/sharedStrings.xml') as f:
                sst = f.read().decode('utf-8')
            sharedstrings = re.findall(r'<t[^>]*>([^<]*)</t>', sst)

    # Collect (column letters, row number) pairs from every cell reference and
    # group the column letters by row.
    row_cols = defaultdict(set)
    for col, row in re.findall(r'<c r="([A-Z]+)(\d+)"', sheet_xml):
        row_cols[int(row)].add(col)

    # Report only the first six rows (in document order) and ten strings.
    return dict(list(row_cols.items())[:6]), sharedstrings[:10]
30+
31+
cases = [
32+
('classic35_explicit_row_heights', 'CLIPS'),
33+
('classic44_employee_roster', 'CLIPS'),
34+
('classic49_contact_list', 'CLIPS'),
35+
('classic24_red_text', 'NO CLIP'),
36+
('classic38_hyperlink_cell', 'NO CLIP'),
37+
('classic06_tall_table', 'NO CLIP'),
38+
('classic01_basic_table_with_headers', 'NO CLIP'),
39+
('classic36_merged_cells', 'NO CLIP - merged'),
40+
]
41+
42+
# Print, for every case, the occupied columns of the first rows and a sample of
# the shared strings, next to the clip/no-clip label recorded in ``cases``.
for name, expected in cases:
    path = os.path.join(output_dir, name + '.xlsx')
    # Skip workbooks that have not been generated instead of aborting the whole
    # report with FileNotFoundError on the first missing file.
    if not os.path.exists(path):
        print(f"{name} ({expected}): NOT FOUND")
        continue
    row_cols, strings = analyze_xlsx(path)
    print(f"{name} ({expected}):")
    for rnum, cols in row_cols.items():
        print(f" row {rnum}: {sorted(cols)}")
    print(f" strings: {strings[:5]}")
    print()

analyze_xlsx.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
#!/usr/bin/env python3
2+
"""Analyze xlsx files to understand column width requirements."""
3+
import zipfile, re, os, sys
4+
5+
output_dir = r'd:\git\MiniPdf\tests\MiniPdf.Scripts\output'
6+
7+
def get_col_content_widths(xlsx_path):
    """Collect column-width related markup and shared strings from a workbook.

    Args:
        xlsx_path: Path to the ``.xlsx`` (zip) archive to inspect.

    Returns:
        A ``(custom_cols, fmt_str, strings)`` tuple:

        * ``custom_cols`` - raw ``<col .../>`` tags carrying
          ``customWidth="1"``.
        * ``fmt_str`` - the raw ``<sheetFormatPr .../>`` tag, or ``None`` when
          the sheet has none.
        * ``strings`` - non-empty ``<t>`` values from
          ``xl/sharedStrings.xml`` (empty list when that part is absent).

        When the archive has no worksheet part the result is
        ``([], None, [])`` so callers can always unpack three values.
    """
    with zipfile.ZipFile(xlsx_path) as z:
        names = [
            n for n in z.namelist()
            if n.startswith('xl/worksheets/sheet') and n.endswith('.xml')
        ]
        if not names:
            # Same arity as the normal return; the previous two-element result
            # broke ``cols, fmt, strings = get_col_content_widths(...)``.
            return [], None, []

        # Only the first worksheet part listed in the archive is inspected.
        with z.open(names[0]) as f:
            sheet_xml = f.read().decode('utf-8')

        # Shared strings are optional; ZipFile.open raises KeyError when the
        # member does not exist. Other errors are left to propagate.
        try:
            with z.open('xl/sharedStrings.xml') as f:
                sst_xml = f.read().decode('utf-8')
        except KeyError:
            strings = []
        else:
            strings = re.findall(r'<t[^>]*>([^<]+)</t>', sst_xml)

    # Explicit column definitions whose width was set by the author.
    cols = re.findall(r'<col\s[^/]*/>', sheet_xml)
    custom_cols = [c for c in cols if 'customWidth="1"' in c]

    # Sheet-wide formatting defaults, if declared.
    fmt = re.search(r'<sheetFormatPr[^/]*/>', sheet_xml)
    fmt_str = fmt.group(0) if fmt else None

    return custom_cols, fmt_str, strings
34+
35+
target_files = [
36+
# Tests that reference clips but we used to NOT clip -> now clipping correctly
37+
'classic35_explicit_row_heights.xlsx',
38+
'classic44_employee_roster.xlsx',
39+
'classic49_contact_list.xlsx',
40+
# Tests that now WRONGLY clip (regressions)
41+
'classic06_tall_table.xlsx',
42+
'classic18_large_dataset.xlsx',
43+
'classic60_large_wide_table.xlsx',
44+
'classic12_sparse_columns.xlsx',
45+
'classic05_wide_table.xlsx',
46+
'classic36_merged_cells.xlsx',
47+
'classic24_red_text.xlsx',
48+
'classic38_hyperlink_cell.xlsx',
49+
'classic01_basic_table_with_headers.xlsx',
50+
]
51+
52+
# Report custom column tags, sheet defaults and string lengths per workbook.
for fname in target_files:
    workbook_path = os.path.join(output_dir, fname)
    if os.path.exists(workbook_path):
        cols, fmt, strings = get_col_content_widths(workbook_path)
        lengths = [len(s) for s in strings]
        max_len = max(lengths) if lengths else 0
        print(f"{fname}")
        print(f" custom_cols: {cols[:3]}")
        print(f" sheetFormatPr: {fmt}")
        print(f" max_string_len: {max_len}")
        # Show at most three strings longer than ten characters, if any exist.
        long_strings = []
        for s in strings:
            if len(s) > 10:
                long_strings.append(s)
        long_strings = long_strings[:3]
        if long_strings:
            print(f" long strings (>10): {long_strings}")
        print()
    else:
        print(f"{fname}: NOT FOUND")

check_reference.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
#!/usr/bin/env python3
2+
"""Extract text from reference PDFs to understand what LibreOffice renders."""
3+
import os, sys
4+
sys.path.insert(0, r'd:\git\MiniPdf\tests\MiniPdf.Benchmark')
5+
6+
try:
7+
import fitz # pymupdf
8+
except ImportError:
9+
print("pymupdf not installed")
10+
sys.exit(1)
11+
12+
ref_dir = r'd:\git\MiniPdf\tests\MiniPdf.Benchmark\reference_pdfs'
13+
14+
tests = [
15+
('classic35_explicit_row_heights', ['Tall Header', 'Tall HeadeValue', 'Tall Heade']),
16+
('classic44_employee_roster', ['Engineering', 'Engineerin']),
17+
('classic49_contact_list', ['alice@example.com', 'alice@exam']),
18+
('classic24_red_text', ['Something went wrong']),
19+
('classic38_hyperlink_cell', ['https://github.com', 'https://gi']),
20+
('classic06_tall_table', ['Contact', 'Customer']),
21+
('classic01_basic_table_with_headers', ['Product', 'Description']),
22+
('classic36_merged_cells', ['Merged Header', 'Merged Hea']),
23+
]
24+
25+
# For each reference PDF, report whether the expected (full or clipped) strings
# appear in the text of its first page.
for name, look_for in tests:
    path = os.path.join(ref_dir, name + '.pdf')
    if not os.path.exists(path):
        print(f"{name}: PDF not found")
        continue
    # Use the document as a context manager so the file handle is released
    # after each PDF instead of staying open until the script exits.
    with fitz.open(path) as doc:
        text = doc[0].get_text()
    print(f"\n{name}:")
    for s in look_for:
        found = s in text
        print(f" '{s}': {'FOUND' if found else 'NOT FOUND'}")
    # Print the first 20 whitespace-separated words of the page text.
    words = text.strip().split()
    print(f" First 20 words: {words[:20]}")

fix_code.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
#!/usr/bin/env python3
2+
"""Apply targeted fixes to MiniPdf source files."""
3+
import re
4+
5+
def fix_file(path, old, new, expect_found=True):
    """Replace the first occurrence of ``old`` with ``new`` in a text file.

    Args:
        path: File to patch; read and written as UTF-8.
        old: Exact text to look for.
        new: Replacement text.
        expect_found: When true, a missing pattern counts as a failure.

    Returns:
        ``True`` when the replacement was written, or when the pattern is
        absent and ``expect_found`` is false. ``False`` when the pattern is
        absent and ``expect_found`` is true.
    """
    with open(path, encoding='utf-8') as f:
        content = f.read()

    if old not in content:
        print(f" WARNING: Pattern not found in {path}")
        # Nothing to replace: leave the file untouched and do not report a
        # replacement that never happened. The return values match the
        # previous behaviour (False if the pattern was required, else True).
        return not expect_found

    content = content.replace(old, new, 1)
    with open(path, 'w', encoding='utf-8') as f:
        f.write(content)
    print(f" OK: replaced in {path}")
    return True
18+
19+
20+
# ── 1. PdfWriter.cs: en/em-dash -> WinAnsiEncoding bytes ─────────────────────
21+
fix_file(
22+
'src/MiniPdf/PdfWriter.cs',
23+
r"'\u2013' or '\u2014' or '\u2012' => '-', // en-dash, em-dash",
24+
r"'\u2013' or '\u2012' => (char)0x96, // en-dash -> WinAnsiEncoding 0x96" + "\n" +
25+
r" '\u2014' => (char)0x97, // em-dash -> WinAnsiEncoding 0x97"
26+
)
27+
28+
# ── 2. ExcelToPdfConverter.cs: Replace WrapCellText with clip ────────────────
29+
old_wrap = """\
30+
var maxChars = Math.Max(1, (int)(colWidths[i] / avgCharWidth));
31+
var wrapped = WrapCellText(cellText, maxChars);
32+
cellLines[i] = wrapped;
33+
if (wrapped.Length > maxLinesInRow) maxLinesInRow = wrapped.Length;"""
34+
35+
new_clip = """\
36+
var maxChars = Math.Max(1, (int)(colWidths[i] / avgCharWidth));
37+
// Clip to column width - matches LibreOffice default cell-overflow behaviour.
38+
// Long text is truncated at the boundary (no word-wrap; wrap_text not set).
39+
var clipped = cellText.Length > maxChars ? cellText[..maxChars] : cellText;
40+
cellLines[i] = new[] { clipped };"""
41+
42+
fix_file('src/MiniPdf/ExcelToPdfConverter.cs', old_wrap, new_clip)
43+
44+
print("All done.")

src/MiniPdf/ExcelReader.cs

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,11 @@ private static List<List<ExcelCell>> ReadSheet(ZipArchiveEntry entry, List<strin
284284
{
285285
text = string.Concat(cell.Descendants(ns + "t").Select(t => t.Value));
286286
}
287+
else if (type == "b")
288+
{
289+
// Boolean: Excel stores "1"/"0", render as TRUE/FALSE to match LibreOffice
290+
text = value == "1" ? "TRUE" : "FALSE";
291+
}
287292
else
288293
{
289294
text = value;
@@ -570,8 +575,8 @@ internal sealed class ExcelSheet
570575

571576
/// <summary>Converts Excel character-unit column width to PDF points.</summary>
572577
public static float CharUnitsToPoints(float charUnits)
573-
// Helvetica 10pt: digit "0" is ~5.5pt wide, plus ~5pt padding
574-
=> charUnits * 5.5f + 5f;
578+
// Calibrated against LibreOffice reference PDFs: 8.43 char-units → 47.4pt
579+
=> charUnits * 5.62f;
575580

576581
internal ExcelSheet(string name, List<List<ExcelCell>> rows,
577582
List<ExcelEmbeddedImage>? images = null,

src/MiniPdf/ExcelToPdfConverter.cs

Lines changed: 38 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@ internal sealed class ConversionOptions
2828
/// <summary>Page bottom margin in points (default: 50).</summary>
2929
public float MarginBottom { get; set; } = 50;
3030

31-
/// <summary>Padding between columns in points (default: 20).</summary>
32-
public float ColumnPadding { get; set; } = 20;
31+
/// <summary>Padding between columns in points (default: 4).</summary>
32+
public float ColumnPadding { get; set; } = 4;
3333

3434
/// <summary>Line spacing multiplier (default: 1.6).</summary>
3535
public float LineSpacing { get; set; } = 1.6f;
@@ -119,7 +119,7 @@ private static void RenderSheet(PdfDocument doc, ExcelSheet sheet, ConversionOpt
119119
var pageWidth = options.PageWidth;
120120
var pageHeight = options.PageHeight;
121121
var usableWidth = pageWidth - options.MarginLeft - options.MarginRight;
122-
var avgCharWidth = options.FontSize * 0.5f;
122+
var avgCharWidth = options.FontSize * 0.47f;
123123

124124
// Determine column widths first to decide on layout strategy
125125
var columnPadding = options.ColumnPadding;
@@ -274,10 +274,27 @@ void EnsurePage()
274274
var cellText = row[col].Text;
275275
if (!string.IsNullOrEmpty(cellText))
276276
{
277-
var maxChars = Math.Max(1, (int)(colWidths[i] / avgCharWidth));
278-
var wrapped = WrapCellText(cellText, maxChars);
279-
cellLines[i] = wrapped;
280-
if (wrapped.Length > maxLinesInRow) maxLinesInRow = wrapped.Length;
277+
// Check if the next column in this row has content.
278+
// LibreOffice behaviour: clip when right-neighbour is non-empty,
279+
// otherwise let text overflow, unwrapped on a single line, into the empty space.
280+
var nextCol = col + 1;
281+
var nextHasContent = nextCol < row.Count && !string.IsNullOrEmpty(row[nextCol].Text);
282+
283+
if (nextHasContent)
284+
{
285+
// Clip to column width — adjacent cell would be obscured otherwise.
286+
var maxChars = Math.Max(1, (int)(colWidths[i] / avgCharWidth));
287+
var clipped = cellText.Length > maxChars ? cellText[..maxChars] : cellText;
288+
cellLines[i] = new[] { clipped };
289+
}
290+
else
291+
{
292+
// Overflow allowed: render the full text on one line without clipping.
293+
// This matches LibreOffice behaviour where text flows into adjacent empty cells
294+
// (or to the right margin when this is the last column).
295+
cellLines[i] = new[] { cellText };
296+
}
297+
maxLinesInRow = Math.Max(maxLinesInRow, cellLines[i].Length);
281298
}
282299
else
283300
{
@@ -396,6 +413,9 @@ void EnsurePage()
396413
/// <summary>
397414
/// Wrap a single cell text into multiple lines at word boundaries.
398415
/// </summary>
416+
private static string[] WrapCellText(string text, float widthPts, float avgCharWidth)
417+
=> WrapCellText(text, Math.Max(1, (int)(widthPts / avgCharWidth)));
418+
399419
private static string[] WrapCellText(string text, int maxCharsPerLine)
400420
{
401421
if (maxCharsPerLine <= 0) maxCharsPerLine = 1;
@@ -460,7 +480,7 @@ private static bool IsDefaultSheetName(string name)
460480
/// </summary>
461481
private static float[] CalculateNaturalColumnWidths(ExcelSheet sheet, int maxCols, float usableWidth, ConversionOptions options)
462482
{
463-
var avgCharWidth = options.FontSize * 0.5f;
483+
var avgCharWidth = options.FontSize * 0.47f;
464484
var colMaxLengths = new int[maxCols];
465485

466486
foreach (var row in sheet.Rows)
@@ -493,9 +513,18 @@ private static float[] CalculateNaturalColumnWidths(ExcelSheet sheet, int maxCol
493513
// Clamp to reasonable bounds but respect the spreadsheet's intent
494514
widths[i] = Math.Clamp(excelPts, minColWidth, maxColWidth);
495515
}
516+
else if (maxCols >= 2)
517+
{
518+
// No explicit column widths — use Excel's default column width (8.43 char units).
519+
// This matches LibreOffice/Excel behaviour where unset multi-column sheets use the
520+
// workbook default, producing text clipping identical to the reference PDF.
521+
var defaultPts = ExcelSheet.CharUnitsToPoints(8.43f);
522+
widths[i] = Math.Clamp(defaultPts, minColWidth, maxColWidth);
523+
}
496524
else
497525
{
498-
// Fall back to content-based width
526+
// Single-column sheet: use content-based width so the column fills the page
527+
// (LibreOffice expands 1-column sheets to page width).
499528
var natural = (Math.Max(colMaxLengths[i], 3) + 2) * avgCharWidth;
500529
widths[i] = Math.Clamp(natural, minColWidth, maxColWidth);
501530
}

0 commit comments

Comments
 (0)