From 50aab66c0dab242274cce0f54c1f9e6345d0e9cb Mon Sep 17 00:00:00 2001
From: saccharin98 <xinyanzhou938@gmail.com>
Date: Fri, 24 Apr 2026 14:35:47 +0800
Subject: [PATCH 1/4] fix: make markdown parser robust

---
 pageindex/parser/markdown.py  | 135 +++++++++++++++++++++++-----
 tests/test_markdown_parser.py | 162 ++++++++++++++++++++++++++++++++++
 2 files changed, 275 insertions(+), 22 deletions(-)

diff --git a/pageindex/parser/markdown.py b/pageindex/parser/markdown.py
index f62013c4c..e71bd5a56 100644
--- a/pageindex/parser/markdown.py
+++ b/pageindex/parser/markdown.py
@@ -3,6 +3,13 @@
 from .protocol import ContentNode, ParsedDocument
 from ..index.utils import count_tokens
 
+# Patterns
+_ATX_HEADER = re.compile(r"^(#{1,6})\s+(.+?)(?:\s+#+\s*)?$")
+_SETEXT_H1 = re.compile(r"^={3,}\s*$")
+_SETEXT_H2 = re.compile(r"^-{3,}\s*$")
+_FENCE_OPEN = re.compile(r"^(`{3,}|~{3,})")
+_FRONTMATTER_FENCE = re.compile(r"^---\s*$")
+
 
 class MarkdownParser:
     def supported_extensions(self) -> list[str]:
@@ -16,44 +23,128 @@ def parse(self, file_path: str, **kwargs) -> ParsedDocument:
             content = f.read()
 
         lines = content.split("\n")
+        lines, metadata = self._strip_frontmatter(lines)
         headers = self._extract_headers(lines)
         nodes = self._build_nodes(headers, lines, model)
 
-        return ParsedDocument(doc_name=path.stem, nodes=nodes)
+        return ParsedDocument(
+            doc_name=path.stem,
+            nodes=nodes,
+            metadata=metadata,
+        )
+
+    @staticmethod
+    def _strip_frontmatter(lines: list[str]) -> tuple[list[str], dict | None]:
+        """Strip YAML frontmatter (--- delimited) from the beginning of the file.
+
+        Returns the remaining lines and raw frontmatter as metadata.
+        """
+        if not lines or not _FRONTMATTER_FENCE.match(lines[0]):
+            return lines, None
+
+        for i in range(1, len(lines)):
+            if _FRONTMATTER_FENCE.match(lines[i]):
+                raw = "\n".join(lines[1:i])
+                remaining = lines[i + 1:]
+                return remaining, {"frontmatter": raw}
+
+        # No closing fence found — not valid frontmatter, return as-is
+        return lines, None
 
     def _extract_headers(self, lines: list[str]) -> list[dict]:
-        header_pattern = r"^(#{1,6})\s+(.+)$"
-        code_block_pattern = r"^```"
+        """Extract all ATX and setext headers, respecting fenced code blocks."""
         headers = []
-        in_code_block = False
+        in_fence = False
+        fence_pattern = None  # tracks the char and min length to close
 
         for line_num, line in enumerate(lines, 1):
             stripped = line.strip()
-            if re.match(code_block_pattern, stripped):
-                in_code_block = not in_code_block
+
+            # Track fenced code blocks
+            fence_match = _FENCE_OPEN.match(stripped)
+            if fence_match:
+                marker = fence_match.group(1)
+                if not in_fence:
+                    in_fence = True
+                    fence_char = marker[0]
+                    fence_len = len(marker)
+                    fence_pattern = (fence_char, fence_len)
+                elif stripped[0] == fence_pattern[0] and len(marker) >= fence_pattern[1]:
+                    # Only close if same char and at least as many
+                    in_fence = False
+                    fence_pattern = None
+                continue
+
+            if in_fence:
                 continue
-            if not in_code_block and stripped:
-                match = re.match(header_pattern, stripped)
-                if match:
-                    headers.append({
-                        "title": match.group(2).strip(),
-                        "level": len(match.group(1)),
-                        "line_num": line_num,
-                    })
+
+            # ATX headers: # Title
+            atx = _ATX_HEADER.match(stripped)
+            if atx:
+                headers.append({
+                    "title": atx.group(2).strip(),
+                    "level": len(atx.group(1)),
+                    "line_num": line_num,
+                })
+                continue
+
+            # Setext headers: underline on next line detected by looking back
+            # We check if *this* line is an underline and the previous line is text
+            if line_num >= 2:
+                prev = lines[line_num - 2].strip()  # previous line (0-indexed)
+                if prev and not _FENCE_OPEN.match(prev) and not _ATX_HEADER.match(prev):
+                    if _SETEXT_H1.match(stripped):
+                        headers.append({
+                            "title": prev,
+                            "level": 1,
+                            "line_num": line_num - 1,
+                        })
+                        continue
+                    if _SETEXT_H2.match(stripped):
+                        headers.append({
+                            "title": prev,
+                            "level": 2,
+                            "line_num": line_num - 1,
+                        })
+                        continue
+
         return headers
 
-    def _build_nodes(self, headers: list[dict], lines: list[str], model: str | None) -> list[ContentNode]:
+    def _build_nodes(
+        self, headers: list[dict], lines: list[str], model: str | None
+    ) -> list[ContentNode]:
+        if not headers:
+            # No headers — entire content becomes a single node
+            text = "\n".join(lines).strip()
+            if not text:
+                return []
+            tokens = count_tokens(text, model=model)
+            return [ContentNode(content=text, tokens=tokens, index=1)]
+
         nodes = []
+
+        # Content before the first header → preamble node
+        first_header_line = headers[0]["line_num"]
+        if first_header_line > 1:
+            preamble = "\n".join(lines[: first_header_line - 1]).strip()
+            if preamble:
+                tokens = count_tokens(preamble, model=model)
+                nodes.append(ContentNode(content=preamble, tokens=tokens, index=1))
+
+        # One node per header section
         for i, header in enumerate(headers):
             start = header["line_num"] - 1
             end = headers[i + 1]["line_num"] - 1 if i + 1 < len(headers) else len(lines)
             text = "\n".join(lines[start:end]).strip()
             tokens = count_tokens(text, model=model)
-            nodes.append(ContentNode(
-                content=text,
-                tokens=tokens,
-                title=header["title"],
-                index=header["line_num"],
-                level=header["level"],
-            ))
+            nodes.append(
+                ContentNode(
+                    content=text,
+                    tokens=tokens,
+                    title=header["title"],
+                    index=header["line_num"],
+                    level=header["level"],
+                )
+            )
+
         return nodes
diff --git a/tests/test_markdown_parser.py b/tests/test_markdown_parser.py
index cbd06af99..ff75099a3 100644
--- a/tests/test_markdown_parser.py
+++ b/tests/test_markdown_parser.py
@@ -53,3 +53,165 @@ def test_parse_nodes_have_index(sample_md):
     result = parser.parse(sample_md)
     for node in result.nodes:
         assert node.index is not None
+
+
+# --- Edge cases: preamble, frontmatter, setext headers, fenced code ---
+
+def test_preamble_before_first_header(tmp_path):
+    """Content before the first header should become a preamble node."""
+    md = tmp_path / "preamble.md"
+    md.write_text("""This is a preamble paragraph.
+
+# First Header
+Body text.
+""")
+    parser = MarkdownParser()
+    result = parser.parse(str(md))
+    assert len(result.nodes) == 2
+    # Preamble node has no level/title
+    assert result.nodes[0].level is None
+    assert result.nodes[0].title is None
+    assert "preamble paragraph" in result.nodes[0].content
+    # Header node is normal
+    assert result.nodes[1].level == 1
+    assert result.nodes[1].title == "First Header"
+
+
+def test_headerless_file(tmp_path):
+    """A file with no headers should produce a single node."""
+    md = tmp_path / "plain.md"
+    md.write_text("Just some plain text\nwith multiple lines.\n")
+    parser = MarkdownParser()
+    result = parser.parse(str(md))
+    assert len(result.nodes) == 1
+    assert result.nodes[0].level is None
+    assert "plain text" in result.nodes[0].content
+
+
+def test_empty_file(tmp_path):
+    """An empty file should produce zero nodes."""
+    md = tmp_path / "empty.md"
+    md.write_text("")
+    parser = MarkdownParser()
+    result = parser.parse(str(md))
+    assert len(result.nodes) == 0
+
+
+def test_yaml_frontmatter_stripped(tmp_path):
+    """YAML frontmatter should be stripped and stored as metadata."""
+    md = tmp_path / "front.md"
+    md.write_text("""---
+title: My Doc
+author: Alice
+---
+
+# Introduction
+Hello world.
+""")
+    parser = MarkdownParser()
+    result = parser.parse(str(md))
+    assert result.metadata is not None
+    assert "title: My Doc" in result.metadata["frontmatter"]
+    # Frontmatter should not appear in node content
+    for node in result.nodes:
+        assert "title: My Doc" not in node.content
+
+
+def test_setext_h1(tmp_path):
+    """Setext-style H1 (=== underline) should be recognized."""
+    md = tmp_path / "setext.md"
+    md.write_text("""Main Title
+==========
+
+Some content here.
+
+Sub Title
+---------
+
+More content.
+""")
+    parser = MarkdownParser()
+    result = parser.parse(str(md))
+    assert len(result.nodes) == 2
+    assert result.nodes[0].title == "Main Title"
+    assert result.nodes[0].level == 1
+    assert result.nodes[1].title == "Sub Title"
+    assert result.nodes[1].level == 2
+
+
+def test_headers_inside_code_blocks_ignored(tmp_path):
+    """Headers inside fenced code blocks should not be detected."""
+    md = tmp_path / "code.md"
+    md.write_text("""# Real Header
+
+```
+# Not a header
+## Also not a header
+```
+
+# Another Real Header
+More text.
+""")
+    parser = MarkdownParser()
+    result = parser.parse(str(md))
+    titles = [n.title for n in result.nodes if n.title]
+    assert "Real Header" in titles
+    assert "Another Real Header" in titles
+    assert "Not a header" not in titles
+    assert "Also not a header" not in titles
+
+
+def test_tilde_code_fences(tmp_path):
+    """Tilde fences (~~~) should also be respected."""
+    md = tmp_path / "tilde.md"
+    md.write_text("""# Header
+
+~~~
+# Fake header inside tilde fence
+~~~
+
+# Real Header After
+Text.
+""")
+    parser = MarkdownParser()
+    result = parser.parse(str(md))
+    titles = [n.title for n in result.nodes if n.title]
+    assert "Header" in titles
+    assert "Real Header After" in titles
+    assert "Fake header inside tilde fence" not in titles
+
+
+def test_long_fence_requires_matching_close(tmp_path):
+    """A ```` (4-backtick) opening fence should not close with ``` (3-backtick)."""
+    md = tmp_path / "longfence.md"
+    md.write_text("""# Before
+
+````
+# Inside fence
+```
+# Still inside — 3 backticks can't close 4-backtick fence
+```
+````
+
+# After
+Done.
+""")
+    parser = MarkdownParser()
+    result = parser.parse(str(md))
+    titles = [n.title for n in result.nodes if n.title]
+    assert "Before" in titles
+    assert "After" in titles
+    assert "Inside fence" not in titles
+    assert "Still inside" not in titles
+
+
+def test_atx_closing_hashes(tmp_path):
+    """ATX headers with closing hashes like '## Title ##' should parse cleanly."""
+    md = tmp_path / "closing.md"
+    md.write_text("""## Title ##
+Content.
+""")
+    parser = MarkdownParser()
+    result = parser.parse(str(md))
+    assert result.nodes[0].title == "Title"
+    assert result.nodes[0].level == 2

From a349946dcda04c9fb126fe7834a3b1b75081adb3 Mon Sep 17 00:00:00 2001
From: saccharin98 <xinyanzhou938@gmail.com>
Date: Fri, 24 Apr 2026 17:27:28 +0800
Subject: [PATCH 2/4] fix: keep original line numbers, reject non-paragraph setext titles

---
 pageindex/parser/markdown.py  | 61 ++++++++++++++++++++++++++++-------
 tests/test_markdown_parser.py | 58 +++++++++++++++++++++++++++++++++
 2 files changed, 107 insertions(+), 12 deletions(-)

diff --git a/pageindex/parser/markdown.py b/pageindex/parser/markdown.py
index e71bd5a56..bb08e4d7f 100644
--- a/pageindex/parser/markdown.py
+++ b/pageindex/parser/markdown.py
@@ -1,5 +1,7 @@
 import re
 from pathlib import Path
+import yaml
+
 from .protocol import ContentNode, ParsedDocument
 from ..index.utils import count_tokens
 
@@ -9,6 +11,9 @@
 _SETEXT_H2 = re.compile(r"^-{3,}\s*$")
 _FENCE_OPEN = re.compile(r"^(`{3,}|~{3,})")
 _FRONTMATTER_FENCE = re.compile(r"^---\s*$")
+_BLOCKQUOTE = re.compile(r"^>")
+_LIST_ITEM = re.compile(r"^(?:[-+*]|\d{1,9}[.)])(?:\s+|$)")
+_TABLE_ROW = re.compile(r"^\|.*\|$|^[^|]+\|[^|]+$")
 
 
 class MarkdownParser:
@@ -23,9 +28,9 @@ def parse(self, file_path: str, **kwargs) -> ParsedDocument:
             content = f.read()
 
         lines = content.split("\n")
-        lines, metadata = self._strip_frontmatter(lines)
+        lines, metadata, line_offset = self._strip_frontmatter(lines)
         headers = self._extract_headers(lines)
-        nodes = self._build_nodes(headers, lines, model)
+        nodes = self._build_nodes(headers, lines, model, line_offset)
 
         return ParsedDocument(
             doc_name=path.stem,
@@ -34,22 +39,29 @@ def parse(self, file_path: str, **kwargs) -> ParsedDocument:
         )
 
     @staticmethod
-    def _strip_frontmatter(lines: list[str]) -> tuple[list[str], dict | None]:
+    def _strip_frontmatter(lines: list[str]) -> tuple[list[str], dict | None, int]:
         """Strip YAML frontmatter (--- delimited) from the beginning of the file.
 
-        Returns the remaining lines and raw frontmatter as metadata.
+        Returns the remaining lines, raw frontmatter metadata, and removed line count.
         """
         if not lines or not _FRONTMATTER_FENCE.match(lines[0]):
-            return lines, None
+            return lines, None, 0
 
         for i in range(1, len(lines)):
             if _FRONTMATTER_FENCE.match(lines[i]):
                 raw = "\n".join(lines[1:i])
+                try:
+                    parsed = yaml.safe_load(raw)
+                except yaml.YAMLError:
+                    return lines, None, 0
+                if not isinstance(parsed, dict):
+                    return lines, None, 0
+
                 remaining = lines[i + 1:]
-                return remaining, {"frontmatter": raw}
+                return remaining, {"frontmatter": raw}, i + 1
 
         # No closing fence found — not valid frontmatter, return as-is
-        return lines, None
+        return lines, None, 0
 
     def _extract_headers(self, lines: list[str]) -> list[dict]:
         """Extract all ATX and setext headers, respecting fenced code blocks."""
@@ -92,7 +104,7 @@ def _extract_headers(self, lines: list[str]) -> list[dict]:
             # We check if *this* line is an underline and the previous line is text
             if line_num >= 2:
                 prev = lines[line_num - 2].strip()  # previous line (0-indexed)
-                if prev and not _FENCE_OPEN.match(prev) and not _ATX_HEADER.match(prev):
+                if self._is_setext_paragraph_candidate(prev):
                     if _SETEXT_H1.match(stripped):
                         headers.append({
                             "title": prev,
@@ -110,8 +122,21 @@ def _extract_headers(self, lines: list[str]) -> list[dict]:
 
         return headers
 
+    @staticmethod
+    def _is_setext_paragraph_candidate(line: str) -> bool:
+        return bool(line
+                    and not _FENCE_OPEN.match(line)
+                    and not _ATX_HEADER.match(line)
+                    and not _BLOCKQUOTE.match(line)
+                    and not _LIST_ITEM.match(line)
+                    and not _TABLE_ROW.match(line))
+
     def _build_nodes(
-        self, headers: list[dict], lines: list[str], model: str | None
+        self,
+        headers: list[dict],
+        lines: list[str],
+        model: str | None,
+        line_offset: int = 0,
     ) -> list[ContentNode]:
         if not headers:
             # No headers — entire content becomes a single node
@@ -119,7 +144,8 @@ def _build_nodes(
             if not text:
                 return []
             tokens = count_tokens(text, model=model)
-            return [ContentNode(content=text, tokens=tokens, index=1)]
+            index = self._first_content_line(lines, line_offset)
+            return [ContentNode(content=text, tokens=tokens, index=index)]
 
         nodes = []
 
@@ -129,7 +155,11 @@ def _build_nodes(
             preamble = "\n".join(lines[: first_header_line - 1]).strip()
             if preamble:
                 tokens = count_tokens(preamble, model=model)
-                nodes.append(ContentNode(content=preamble, tokens=tokens, index=1))
+                index = self._first_content_line(
+                    lines[: first_header_line - 1],
+                    line_offset,
+                )
+                nodes.append(ContentNode(content=preamble, tokens=tokens, index=index))
 
         # One node per header section
         for i, header in enumerate(headers):
@@ -142,9 +172,16 @@ def _build_nodes(
                     content=text,
                     tokens=tokens,
                     title=header["title"],
-                    index=header["line_num"],
+                    index=header["line_num"] + line_offset,
                     level=header["level"],
                 )
             )
 
         return nodes
+
+    @staticmethod
+    def _first_content_line(lines: list[str], line_offset: int) -> int:
+        for i, line in enumerate(lines, 1):
+            if line.strip():
+                return i + line_offset
+        return line_offset + 1
diff --git a/tests/test_markdown_parser.py b/tests/test_markdown_parser.py
index ff75099a3..721ac4593 100644
--- a/tests/test_markdown_parser.py
+++ b/tests/test_markdown_parser.py
@@ -117,6 +117,45 @@ def test_yaml_frontmatter_stripped(tmp_path):
         assert "title: My Doc" not in node.content
 
 
+def test_yaml_frontmatter_preserves_original_line_numbers(tmp_path):
+    """Node indexes should use original file line numbers after frontmatter."""
+    md = tmp_path / "front_lines.md"
+    md.write_text("""---
+title: My Doc
+---
+
+# Introduction
+Hello world.
+""")
+    parser = MarkdownParser()
+    result = parser.parse(str(md))
+    assert result.nodes[0].title == "Introduction"
+    assert result.nodes[0].index == 5
+
+
+def test_thematic_break_at_start_not_stripped_as_frontmatter(tmp_path):
+    """Markdown that starts with thematic breaks should not be stripped."""
+    md = tmp_path / "thematic.md"
+    md.write_text("""---
+
+# Introduction
+Hello world.
+
+---
+
+# Second
+More text.
+""")
+    parser = MarkdownParser()
+    result = parser.parse(str(md))
+    assert result.metadata is None
+    assert result.nodes[0].level is None
+    assert result.nodes[0].content == "---"
+    assert result.nodes[1].title == "Introduction"
+    assert result.nodes[1].index == 3
+    assert result.nodes[2].title == "Second"
+
+
 def test_setext_h1(tmp_path):
     """Setext-style H1 (=== underline) should be recognized."""
     md = tmp_path / "setext.md"
@@ -139,6 +178,25 @@ def test_setext_h1(tmp_path):
     assert result.nodes[1].level == 2
 
 
+@pytest.mark.parametrize("prefix", ["- item", "1. item", "> quote", "| A | B |"])
+def test_setext_requires_paragraph_previous_line(tmp_path, prefix):
+    """Setext underline should not turn list/quote/table lines into headers."""
+    md = tmp_path / "not_setext.md"
+    md.write_text(f"""{prefix}
+---
+
+# Real Header
+Content.
+""")
+    parser = MarkdownParser()
+    result = parser.parse(str(md))
+    titles = [n.title for n in result.nodes if n.title]
+    assert prefix not in titles
+    assert "Real Header" in titles
+    assert result.nodes[0].level is None
+    assert prefix in result.nodes[0].content
+
+
 def test_headers_inside_code_blocks_ignored(tmp_path):
     """Headers inside fenced code blocks should not be detected."""
     md = tmp_path / "code.md"

From 3dbc5bbc1f2afdae8160576ea9d73cab0580efbe Mon Sep 17 00:00:00 2001
From: saccharin98 <xinyanzhou938@gmail.com>
Date: Fri, 24 Apr 2026 17:38:01 +0800
Subject: [PATCH 3/4] fix: handle markdown fence and setext edge cases

---
 pageindex/parser/markdown.py  | 20 +++++++++++++---
 tests/test_markdown_parser.py | 43 +++++++++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+), 3 deletions(-)

diff --git a/pageindex/parser/markdown.py b/pageindex/parser/markdown.py
index bb08e4d7f..507f3c574 100644
--- a/pageindex/parser/markdown.py
+++ b/pageindex/parser/markdown.py
@@ -7,9 +7,10 @@
 
 # Patterns
 _ATX_HEADER = re.compile(r"^(#{1,6})\s+(.+?)(?:\s+#+\s*)?$")
-_SETEXT_H1 = re.compile(r"^={3,}\s*$")
-_SETEXT_H2 = re.compile(r"^-{3,}\s*$")
+_SETEXT_H1 = re.compile(r"^=+\s*$")
+_SETEXT_H2 = re.compile(r"^-+\s*$")
 _FENCE_OPEN = re.compile(r"^(`{3,}|~{3,})")
+_FENCE_CLOSE = re.compile(r"^(`{3,}|~{3,})\s*$")
 _FRONTMATTER_FENCE = re.compile(r"^---\s*$")
 _BLOCKQUOTE = re.compile(r"^>")
 _LIST_ITEM = re.compile(r"^(?:[-+*]|\d{1,9}[.)])(?:\s+|$)")
@@ -81,7 +82,7 @@ def _extract_headers(self, lines: list[str]) -> list[dict]:
                     fence_char = marker[0]
                     fence_len = len(marker)
                     fence_pattern = (fence_char, fence_len)
-                elif stripped[0] == fence_pattern[0] and len(marker) >= fence_pattern[1]:
+                elif self._is_closing_fence(stripped, fence_pattern):
                     # Only close if same char and at least as many
                     in_fence = False
                     fence_pattern = None
@@ -122,6 +123,19 @@ def _extract_headers(self, lines: list[str]) -> list[dict]:
 
         return headers
 
+    @staticmethod
+    def _is_closing_fence(line: str, fence_pattern: tuple[str, int] | None) -> bool:
+        if fence_pattern is None:
+            return False
+
+        close_match = _FENCE_CLOSE.match(line)
+        if not close_match:
+            return False
+
+        marker = close_match.group(1)
+        fence_char, fence_len = fence_pattern
+        return marker[0] == fence_char and len(marker) >= fence_len
+
     @staticmethod
     def _is_setext_paragraph_candidate(line: str) -> bool:
         return bool(line
diff --git a/tests/test_markdown_parser.py b/tests/test_markdown_parser.py
index 721ac4593..7c73f73ae 100644
--- a/tests/test_markdown_parser.py
+++ b/tests/test_markdown_parser.py
@@ -178,6 +178,28 @@ def test_setext_h1(tmp_path):
     assert result.nodes[1].level == 2
 
 
+@pytest.mark.parametrize(
+    ("underline", "level"),
+    [
+        ("=", 1),
+        ("-", 2),
+    ],
+)
+def test_setext_single_character_underline(tmp_path, underline, level):
+    """Setext underlines may be a single marker character."""
+    md = tmp_path / "setext_single.md"
+    md.write_text(f"""Title
+{underline}
+
+Body text.
+""")
+    parser = MarkdownParser()
+    result = parser.parse(str(md))
+    assert len(result.nodes) == 1
+    assert result.nodes[0].title == "Title"
+    assert result.nodes[0].level == level
+
+
 @pytest.mark.parametrize("prefix", ["- item", "1. item", "> quote", "| A | B |"])
 def test_setext_requires_paragraph_previous_line(tmp_path, prefix):
     """Setext underline should not turn list/quote/table lines into headers."""
@@ -263,6 +285,27 @@ def test_long_fence_requires_matching_close(tmp_path):
     assert "Still inside" not in titles
 
 
+def test_fence_close_requires_only_marker_and_spaces(tmp_path):
+    """Fence-like content with trailing text should not close the code block."""
+    md = tmp_path / "fence_close.md"
+    md.write_text("""# Before
+
+```
+```not a closing fence
+# Still code
+```
+
+# After
+Done.
+""")
+    parser = MarkdownParser()
+    result = parser.parse(str(md))
+    titles = [n.title for n in result.nodes if n.title]
+    assert "Before" in titles
+    assert "After" in titles
+    assert "Still code" not in titles
+
+
 def test_atx_closing_hashes(tmp_path):
     """ATX headers with closing hashes like '## Title ##' should parse cleanly."""
     md = tmp_path / "closing.md"

From c2442494dca177f4fda71751443e8fbc8bd4bb80 Mon Sep 17 00:00:00 2001
From: saccharin98 <xinyanzhou938@gmail.com>
Date: Fri, 24 Apr 2026 17:42:13 +0800
Subject: [PATCH 4/4] fix: ignore headings inside indented code blocks

---
 pageindex/parser/markdown.py  | 27 +++++++++++++++++++++----
 tests/test_markdown_parser.py | 37 +++++++++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+), 4 deletions(-)

diff --git a/pageindex/parser/markdown.py b/pageindex/parser/markdown.py
index 507f3c574..dc2fbed50 100644
--- a/pageindex/parser/markdown.py
+++ b/pageindex/parser/markdown.py
@@ -71,10 +71,12 @@ def _extract_headers(self, lines: list[str]) -> list[dict]:
         fence_pattern = None  # tracks the char and min length to close
 
         for line_num, line in enumerate(lines, 1):
-            stripped = line.strip()
+            indent = self._indent_width(line)
+            content = line.lstrip(" \t")
+            stripped = content.strip()
 
             # Track fenced code blocks
-            fence_match = _FENCE_OPEN.match(stripped)
+            fence_match = _FENCE_OPEN.match(content) if indent <= 3 else None
             if fence_match:
                 marker = fence_match.group(1)
                 if not in_fence:
@@ -91,6 +93,10 @@ def _extract_headers(self, lines: list[str]) -> list[dict]:
             if in_fence:
                 continue
 
+            # Lines indented by 4+ columns (tabs expanded) are indented code, not headers.
+            if indent >= 4:
+                continue
+
             # ATX headers: # Title
             atx = _ATX_HEADER.match(stripped)
             if atx:
@@ -104,8 +110,9 @@ def _extract_headers(self, lines: list[str]) -> list[dict]:
             # Setext headers: underline on next line detected by looking back
             # We check if *this* line is an underline and the previous line is text
             if line_num >= 2:
-                prev = lines[line_num - 2].strip()  # previous line (0-indexed)
-                if self._is_setext_paragraph_candidate(prev):
+                prev_line = lines[line_num - 2]
+                prev = prev_line.strip()  # previous line (0-indexed)
+                if self._indent_width(prev_line) < 4 and self._is_setext_paragraph_candidate(prev):
                     if _SETEXT_H1.match(stripped):
                         headers.append({
                             "title": prev,
@@ -123,6 +130,18 @@ def _extract_headers(self, lines: list[str]) -> list[dict]:
 
         return headers
 
+    @staticmethod
+    def _indent_width(line: str) -> int:
+        width = 0
+        for char in line:
+            if char == " ":
+                width += 1
+            elif char == "\t":
+                width += 4 - (width % 4)
+            else:
+                break
+        return width
+
     @staticmethod
     def _is_closing_fence(line: str, fence_pattern: tuple[str, int] | None) -> bool:
         if fence_pattern is None:
diff --git a/tests/test_markdown_parser.py b/tests/test_markdown_parser.py
index 7c73f73ae..e502047ce 100644
--- a/tests/test_markdown_parser.py
+++ b/tests/test_markdown_parser.py
@@ -241,6 +241,43 @@ def test_headers_inside_code_blocks_ignored(tmp_path):
     assert "Also not a header" not in titles
 
 
+def test_indented_code_block_headers_ignored(tmp_path):
+    """Headers inside four-space indented code blocks should not be detected."""
+    md = tmp_path / "indented_code.md"
+    md.write_text("""# Before
+
+    # Not a header
+    text
+
+# After
+Done.
+""")
+    parser = MarkdownParser()
+    result = parser.parse(str(md))
+    titles = [n.title for n in result.nodes if n.title]
+    assert "Before" in titles
+    assert "After" in titles
+    assert "Not a header" not in titles
+
+
+def test_three_space_indented_headers_allowed(tmp_path):
+    """Headers indented up to three spaces should still be recognized."""
+    md = tmp_path / "indented_header.md"
+    md.write_text("""# Before
+
+   # Still a header
+
+# After
+Done.
+""")
+    parser = MarkdownParser()
+    result = parser.parse(str(md))
+    titles = [n.title for n in result.nodes if n.title]
+    assert "Before" in titles
+    assert "Still a header" in titles
+    assert "After" in titles
+
+
 def test_tilde_code_fences(tmp_path):
     """Tilde fences (~~~) should also be respected."""
     md = tmp_path / "tilde.md"