mini-software
diff --git a/‎README.md‎
Lines changed: 3 additions & 11 deletions b/‎README.md‎
Lines changed: 3 additions & 11 deletions
diff --git a/‎analyze_overflow.py‎
Lines changed: 49 additions & 0 deletions b/‎analyze_overflow.py‎
Lines changed: 49 additions & 0 deletions
diff --git a/‎analyze_xlsx.py‎
Lines changed: 67 additions & 0 deletions b/‎analyze_xlsx.py‎
Lines changed: 67 additions & 0 deletions
diff --git a/‎check_reference.py‎
Lines changed: 38 additions & 0 deletions b/‎check_reference.py‎
Lines changed: 38 additions & 0 deletions
diff --git a/‎fix_code.py‎
Lines changed: 44 additions & 0 deletions b/‎fix_code.py‎
Lines changed: 44 additions & 0 deletions
diff --git a/‎src/MiniPdf/ExcelReader.cs‎
Lines changed: 7 additions & 2 deletions b/‎src/MiniPdf/ExcelReader.cs‎
Lines changed: 7 additions & 2 deletions
diff --git a/‎src/MiniPdf/ExcelToPdfConverter.cs‎
Lines changed: 38 additions & 9 deletions b/‎src/MiniPdf/ExcelToPdfConverter.cs‎
Lines changed: 38 additions & 9 deletions
@@ -1,4 +1,4 @@
-# MiniPdf
+# MiniPdf
 
 <div align="center">
 <p>
@@ -68,33 +68,25 @@ MiniPdf output is compared against LibreOffice as the reference renderer across
 
 ### Visual Comparison
 
-All 90 test cases comparing MiniPdf output (left) vs LibreOffice reference (right). Page 1 shown for multi-page results.
+Sample output comparing MiniPdf (left) vs LibreOffice reference (right).
 
 <table>
-<tr><th>Test Case</th><th>MiniPdf</th><th>LibreOffice (Reference)</th><th>Score</th></tr>
+<tr><th>MiniPdf</th><th>LibreOffice (Reference)</th></tr>
 <tr>
-  <td><b>classic01</b><br/>Basic table with headers</td>
   <td><img src="tests/MiniPdf.Benchmark/reports/images/classic01_basic_table_with_headers_p1_minipdf.png" width="320"/></td>
   <td><img src="tests/MiniPdf.Benchmark/reports/images/classic01_basic_table_with_headers_p1_reference.png" width="320"/></td>
-  <td>🟢 99.8%</td>
 </tr>
 <tr>
-  <td><b>classic02</b><br/>Multiple worksheets</td>
   <td><img src="tests/MiniPdf.Benchmark/reports/images/classic02_multiple_worksheets_p1_minipdf.png" width="320"/></td>
   <td><img src="tests/MiniPdf.Benchmark/reports/images/classic02_multiple_worksheets_p1_reference.png" width="320"/></td>
-  <td>🟢 99.5%</td>
 </tr>
 <tr>
-  <td><b>classic03</b><br/>Empty workbook</td>
   <td><img src="tests/MiniPdf.Benchmark/reports/images/classic03_empty_workbook_p1_minipdf.png" width="320"/></td>
   <td><img src="tests/MiniPdf.Benchmark/reports/images/classic03_empty_workbook_p1_reference.png" width="320"/></td>
-  <td>🟢 100.0%</td>
 </tr>
 <tr>
-  <td><b>classic04</b><br/>Single cell</td>
   <td><img src="tests/MiniPdf.Benchmark/reports/images/classic04_single_cell_p1_minipdf.png" width="320"/></td>
   <td><img src="tests/MiniPdf.Benchmark/reports/images/classic04_single_cell_p1_reference.png" width="320"/></td>
-  <td>🟢 100.0%</td>
 </tr>
 <tr>
   <td><b>classic05</b><br/>Wide table</td>
 
@@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+"""Check xlsx structure to understand overflow vs clip."""
+import zipfile, re, os, sys
+sys.stdout.reconfigure(encoding='utf-8')
+
+output_dir = r'd:\git\MiniPdf\tests\MiniPdf.Scripts\output'
+
+def analyze_xlsx(xlsx_path):
+    """Summarise which columns are occupied in a workbook's first worksheet part.
+
+    Returns a ``(row_cols, shared_strings)`` tuple:
+
+    * ``row_cols`` maps a row number to the set of column letters that have a
+      ``<c r="...">`` element in that row, limited to the first six rows found.
+    * ``shared_strings`` holds at most the first ten ``<t>`` texts found in
+      ``xl/sharedStrings.xml`` (empty when the part is absent).
+
+    A workbook without any ``xl/worksheets/sheet*.xml`` part yields ``({}, [])``
+    so callers can always unpack two values.
+    """
+    from collections import defaultdict
+
+    with zipfile.ZipFile(xlsx_path) as z:
+        members = z.namelist()
+        names = [n for n in members if n.startswith('xl/worksheets/sheet') and n.endswith('.xml')]
+        if not names:
+            # Same shape as the normal return value; a bare {} cannot be
+            # unpacked into two names by the caller.
+            return {}, []
+        sheet_xml = z.read(names[0]).decode('utf-8')
+        sharedstrings = []
+        if 'xl/sharedStrings.xml' in members:
+            sst = z.read('xl/sharedStrings.xml').decode('utf-8')
+            sharedstrings = re.findall(r'<t[^>]*>([^<]*)</t>', sst)
+
+    # Group the referenced cells by row so each row lists its occupied columns.
+    row_cols = defaultdict(set)
+    for col, row in re.findall(r'<c r="([A-Z]+)(\d+)"', sheet_xml):
+        row_cols[int(row)].add(col)
+
+    return dict(list(row_cols.items())[:6]), sharedstrings[:10]
+
+# (workbook base name, clipping behaviour observed in the reference output)
+cases = [
+    ('classic35_explicit_row_heights', 'CLIPS'),
+    ('classic44_employee_roster', 'CLIPS'),
+    ('classic49_contact_list', 'CLIPS'),
+    ('classic24_red_text', 'NO CLIP'),
+    ('classic38_hyperlink_cell', 'NO CLIP'),
+    ('classic06_tall_table', 'NO CLIP'),
+    ('classic01_basic_table_with_headers', 'NO CLIP'),
+    ('classic36_merged_cells', 'NO CLIP - merged'),
+]
+
+for name, expected in cases:
+    path = os.path.join(output_dir, name + '.xlsx')
+    if not os.path.exists(path):
+        # A missing fixture should not abort the report for the remaining cases.
+        print(f"{name} ({expected}): NOT FOUND")
+        print()
+        continue
+    row_cols, strings = analyze_xlsx(path)
+    print(f"{name} ({expected}):")
+    for rnum, cols in row_cols.items():
+        print(f"  row {rnum}: {sorted(cols)}")
+    print(f"  strings: {strings[:5]}")
+    print()
@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+"""Analyze xlsx files to understand column width requirements."""
+import zipfile, re, os, sys
+
+output_dir = r'd:\git\MiniPdf\tests\MiniPdf.Scripts\output'
+
+def get_col_content_widths(xlsx_path):
+    """Collect column-width related facts from a workbook's first worksheet part.
+
+    Returns a ``(custom_cols, fmt_str, strings)`` tuple:
+
+    * ``custom_cols`` - the raw ``<col .../>`` elements carrying ``customWidth="1"``.
+    * ``fmt_str`` - the raw ``<sheetFormatPr .../>`` element, or ``None`` if absent.
+    * ``strings`` - every non-empty ``<t>`` text from ``xl/sharedStrings.xml``
+      (empty list when the workbook has no shared-strings part).
+
+    A workbook without any ``xl/worksheets/sheet*.xml`` part yields
+    ``([], None, [])`` so callers can always unpack three values.
+    """
+    with zipfile.ZipFile(xlsx_path) as z:
+        # Locate the worksheet parts
+        names = [n for n in z.namelist() if n.startswith('xl/worksheets/sheet') and n.endswith('.xml')]
+        if not names:
+            # Same arity as the normal return value below.
+            return [], None, []
+        with z.open(names[0]) as f:
+            sheet_xml = f.read().decode('utf-8')
+
+        # Shared strings are optional; ZipFile.open raises KeyError when the
+        # member does not exist, which is the only case treated as "no strings".
+        try:
+            with z.open('xl/sharedStrings.xml') as f:
+                sst_xml = f.read().decode('utf-8')
+        except KeyError:
+            strings = []
+        else:
+            strings = re.findall(r'<t[^>]*>([^<]+)</t>', sst_xml)
+
+        # Column definitions with an explicit width
+        cols = re.findall(r'<col\s[^/]*/>', sheet_xml)
+        custom_cols = [c for c in cols if 'customWidth="1"' in c]
+
+        # Sheet-level formatting defaults
+        fmt = re.search(r'<sheetFormatPr[^/]*/>', sheet_xml)
+        fmt_str = fmt.group(0) if fmt else None
+
+        return custom_cols, fmt_str, strings
+
+# Workbooks to inspect, grouped by how the current output compares to the reference.
+target_files = [
+    # Tests that reference clips but we used to NOT clip -> now clipping correctly
+    'classic35_explicit_row_heights.xlsx',
+    'classic44_employee_roster.xlsx',
+    'classic49_contact_list.xlsx',
+    # Tests that now WRONGLY clip (regressions)
+    'classic06_tall_table.xlsx',
+    'classic18_large_dataset.xlsx',
+    'classic60_large_wide_table.xlsx',
+    'classic12_sparse_columns.xlsx',
+    'classic05_wide_table.xlsx',
+    'classic36_merged_cells.xlsx',
+    'classic24_red_text.xlsx',
+    'classic38_hyperlink_cell.xlsx',
+    'classic01_basic_table_with_headers.xlsx',
+]
+
+for fname in target_files:
+    path = os.path.join(output_dir, fname)
+    if os.path.exists(path):
+        cols, fmt, strings = get_col_content_widths(path)
+        # Longest shared string, 0 when the workbook has none.
+        max_len = max(map(len, strings), default=0)
+        print(f"{fname}")
+        print(f"  custom_cols: {cols[:3]}")
+        print(f"  sheetFormatPr: {fmt}")
+        print(f"  max_string_len: {max_len}")
+        # Show up to three examples of strings longer than ten characters.
+        long_strings = [s for s in strings if len(s) > 10][:3]
+        if long_strings:
+            print(f"  long strings (>10): {long_strings}")
+        print()
+    else:
+        print(f"{fname}: NOT FOUND")
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+"""Extract text from reference PDFs to understand what LibreOffice renders."""
+import os, sys
+sys.path.insert(0, r'd:\git\MiniPdf\tests\MiniPdf.Benchmark')
+
+try:
+    import fitz  # pymupdf
+except ImportError:
+    print("pymupdf not installed")
+    sys.exit(1)
+
+ref_dir = r'd:\git\MiniPdf\tests\MiniPdf.Benchmark\reference_pdfs'
+
+# (reference PDF base name, text fragments to look for on its first page)
+tests = [
+    ('classic35_explicit_row_heights', ['Tall Header', 'Tall HeadeValue', 'Tall Heade']),
+    ('classic44_employee_roster', ['Engineering', 'Engineerin']),
+    ('classic49_contact_list', ['alice@example.com', 'alice@exam']),
+    ('classic24_red_text', ['Something went wrong']),
+    ('classic38_hyperlink_cell', ['https://github.com', 'https://gi']),
+    ('classic06_tall_table', ['Contact', 'Customer']),
+    ('classic01_basic_table_with_headers', ['Product', 'Description']),
+    ('classic36_merged_cells', ['Merged Header', 'Merged Hea']),
+]
+
+for name, look_for in tests:
+    path = os.path.join(ref_dir, name + '.pdf')
+    if not os.path.exists(path):
+        print(f"{name}: PDF not found")
+        continue
+    # Open through the context manager so the document handle is released
+    # before moving on to the next file, even if text extraction raises.
+    with fitz.open(path) as doc:
+        text = doc[0].get_text()
+    print(f"\n{name}:")
+    for s in look_for:
+        found = s in text
+        print(f"  '{s}': {'FOUND' if found else 'NOT FOUND'}")
+    # Print the leading whitespace-separated words of the first page
+    words = text.strip().split()
+    print(f"  First 20 words: {words[:20]}")
@@ -0,0 +1,44 @@
+#!/usr/bin/env python3
+"""Apply targeted fixes to MiniPdf source files."""
+import re
+
+def fix_file(path, old, new, expect_found=True):
+    """Replace the first occurrence of ``old`` with ``new`` in the file at ``path``.
+
+    The file is read and rewritten as UTF-8 and only touched when ``old`` is
+    present. Returns ``True`` after a successful replacement. When ``old`` is
+    missing a warning is printed and the result is ``False`` if the pattern was
+    expected (``expect_found``) or ``True`` if its absence is acceptable.
+    """
+    with open(path, encoding='utf-8') as src:
+        original = src.read()
+
+    if old in original:
+        patched = original.replace(old, new, 1)
+        with open(path, 'w', encoding='utf-8') as dst:
+            dst.write(patched)
+        print(f"  OK: replaced in {path}")
+        return True
+
+    print(f"  WARNING: Pattern not found in {path}")
+    return not expect_found
+
+
+# Outcome of every fix_file call, so the final message reflects what happened.
+results = []
+
+# ── 1. PdfWriter.cs: en/em-dash -> WinAnsiEncoding bytes ─────────────────────
+results.append(fix_file(
+    'src/MiniPdf/PdfWriter.cs',
+    r"'\u2013' or '\u2014' or '\u2012' => '-',   // en-dash, em-dash",
+    r"'\u2013' or '\u2012' => (char)0x96,  // en-dash -> WinAnsiEncoding 0x96" + "\n" +
+    r"                '\u2014' => (char)0x97,                // em-dash -> WinAnsiEncoding 0x97"
+))
+
+# ── 2. ExcelToPdfConverter.cs: Replace WrapCellText with clip ────────────────
+old_wrap = """\
+                        var maxChars = Math.Max(1, (int)(colWidths[i] / avgCharWidth));
+                        var wrapped = WrapCellText(cellText, maxChars);
+                        cellLines[i] = wrapped;
+                        if (wrapped.Length > maxLinesInRow) maxLinesInRow = wrapped.Length;"""
+
+new_clip = """\
+                        var maxChars = Math.Max(1, (int)(colWidths[i] / avgCharWidth));
+                        // Clip to column width - matches LibreOffice default cell-overflow behaviour.
+                        // Long text is truncated at the boundary (no word-wrap; wrap_text not set).
+                        var clipped = cellText.Length > maxChars ? cellText[..maxChars] : cellText;
+                        cellLines[i] = new[] { clipped };"""
+
+results.append(fix_file('src/MiniPdf/ExcelToPdfConverter.cs', old_wrap, new_clip))
+
+# Only claim success when every replacement was applied; fix_file returns
+# False when an expected pattern was not found.
+if all(results):
+    print("All done.")
+else:
+    print(f"Finished with {results.count(False)} fix(es) NOT applied - see warnings above.")
@@ -284,6 +284,11 @@ private static List<List<ExcelCell>> ReadSheet(ZipArchiveEntry entry, List<strin
                 {
                     text = string.Concat(cell.Descendants(ns + "t").Select(t => t.Value));
                 }
+                else if (type == "b")
+                {
+                    // Boolean: Excel stores "1"/"0", render as TRUE/FALSE to match LibreOffice
+                    text = value == "1" ? "TRUE" : "FALSE";
+                }
                 else
                 {
                     text = value;
@@ -570,8 +575,8 @@ internal sealed class ExcelSheet
 
     /// <summary>Converts Excel character-unit column width to PDF points.</summary>
     public static float CharUnitsToPoints(float charUnits)
-        // Helvetica 10pt: digit "0" is ~5.5pt wide, plus ~5pt padding
-        => charUnits * 5.5f + 5f;
+        // Calibrated against LibreOffice reference PDFs: 8.43 char-units → 47.4pt
+        => charUnits * 5.62f;
 
     internal ExcelSheet(string name, List<List<ExcelCell>> rows,
         List<ExcelEmbeddedImage>? images = null,
 
@@ -28,8 +28,8 @@ internal sealed class ConversionOptions
         /// <summary>Page bottom margin in points (default: 50).</summary>
         public float MarginBottom { get; set; } = 50;
 
-        /// <summary>Padding between columns in points (default: 20).</summary>
-        public float ColumnPadding { get; set; } = 20;
+        /// <summary>Padding between columns in points (default: 4).</summary>
+        public float ColumnPadding { get; set; } = 4;
 
         /// <summary>Line spacing multiplier (default: 1.6).</summary>
         public float LineSpacing { get; set; } = 1.6f;
@@ -119,7 +119,7 @@ private static void RenderSheet(PdfDocument doc, ExcelSheet sheet, ConversionOpt
         var pageWidth = options.PageWidth;
         var pageHeight = options.PageHeight;
         var usableWidth = pageWidth - options.MarginLeft - options.MarginRight;
-        var avgCharWidth = options.FontSize * 0.5f;
+        var avgCharWidth = options.FontSize * 0.47f;
 
         // Determine column widths first to decide on layout strategy
         var columnPadding = options.ColumnPadding;
@@ -274,10 +274,27 @@ void EnsurePage()
                     var cellText = row[col].Text;
                     if (!string.IsNullOrEmpty(cellText))
                     {
-                        var maxChars = Math.Max(1, (int)(colWidths[i] / avgCharWidth));
-                        var wrapped = WrapCellText(cellText, maxChars);
-                        cellLines[i] = wrapped;
-                        if (wrapped.Length > maxLinesInRow) maxLinesInRow = wrapped.Length;
+                        // Check if the next column in this row has content.
+                        // LibreOffice behaviour: clip when right-neighbour is non-empty,
+                        // otherwise let the text overflow on a single line (no wrapping) into the empty space.
+                        var nextCol = col + 1;
+                        var nextHasContent = nextCol < row.Count && !string.IsNullOrEmpty(row[nextCol].Text);
+
+                        if (nextHasContent)
+                        {
+                            // Clip to column width — adjacent cell would be obscured otherwise.
+                            var maxChars = Math.Max(1, (int)(colWidths[i] / avgCharWidth));
+                            var clipped = cellText.Length > maxChars ? cellText[..maxChars] : cellText;
+                            cellLines[i] = new[] { clipped };
+                        }
+                        else
+                        {
+                            // Overflow allowed: render the full text on one line without clipping.
+                            // This matches LibreOffice behaviour where text flows into adjacent empty cells
+                            // (or to the right margin when this is the last column).
+                            cellLines[i] = new[] { cellText };
+                        }
+                        maxLinesInRow = Math.Max(maxLinesInRow, cellLines[i].Length);
                     }
                     else
                     {
@@ -396,6 +413,9 @@ void EnsurePage()
     /// <summary>
     /// Wrap a single cell text into multiple lines at word boundaries.
     /// </summary>
+    private static string[] WrapCellText(string text, float widthPts, float avgCharWidth)
+        => WrapCellText(text, Math.Max(1, (int)(widthPts / avgCharWidth)));
+
     private static string[] WrapCellText(string text, int maxCharsPerLine)
     {
         if (maxCharsPerLine <= 0) maxCharsPerLine = 1;
@@ -460,7 +480,7 @@ private static bool IsDefaultSheetName(string name)
     /// </summary>
     private static float[] CalculateNaturalColumnWidths(ExcelSheet sheet, int maxCols, float usableWidth, ConversionOptions options)
     {
-        var avgCharWidth = options.FontSize * 0.5f;
+        var avgCharWidth = options.FontSize * 0.47f;
         var colMaxLengths = new int[maxCols];
 
         foreach (var row in sheet.Rows)
@@ -493,9 +513,18 @@ private static float[] CalculateNaturalColumnWidths(ExcelSheet sheet, int maxCol
                 // Clamp to reasonable bounds but respect the spreadsheet's intent
                 widths[i] = Math.Clamp(excelPts, minColWidth, maxColWidth);
             }
+            else if (maxCols >= 2)
+            {
+                // No explicit column widths — use Excel's default column width (8.43 char units).
+                // This matches LibreOffice/Excel behaviour where unset multi-column sheets use the
+                // workbook default, producing text clipping identical to the reference PDF.
+                var defaultPts = ExcelSheet.CharUnitsToPoints(8.43f);
+                widths[i] = Math.Clamp(defaultPts, minColWidth, maxColWidth);
+            }
             else
             {
-                // Fall back to content-based width
+                // Single-column sheet: use content-based width so the column fills the page
+                // (LibreOffice expands 1-column sheets to page width).
                 var natural = (Math.Max(colMaxLengths[i], 3) + 2) * avgCharWidth;
                 widths[i] = Math.Clamp(natural, minColWidth, maxColWidth);
             }