Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 16 additions & 3 deletions pageindex/page_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -414,6 +414,15 @@ def calculate_page_offset(pairs):
return most_common

def add_page_offset_to_toc_json(data, offset):
"""Apply a page offset to convert TOC page numbers to physical indices.

When ``offset`` is ``None`` (e.g. because no matching title/page pairs
were found in the document), the function returns ``data`` unchanged
rather than crashing with a ``TypeError``. Callers should handle the
resulting items-without-``physical_index`` through ``process_none_page_numbers``.
"""
if offset is None:
return data
for i in range(len(data)):
if data[i].get('page') is not None and isinstance(data[i]['page'], int):
data[i]['physical_index'] = data[i]['page'] + offset
Expand Down Expand Up @@ -654,18 +663,22 @@ def process_toc_with_page_numbers(toc_content, toc_page_list, page_list, toc_che

##check if needed to process none page numbers
def process_none_page_numbers(toc_items, page_list, start_index=1, model=None):
end_index = len(page_list) + start_index - 1
for i, item in enumerate(toc_items):
if "physical_index" not in item:
# logger.info(f"fix item: {item}")
# Find previous physical_index
prev_physical_index = 0 # Default if no previous item exists
prev_physical_index = start_index # Default: start of document
for j in range(i - 1, -1, -1):
if toc_items[j].get('physical_index') is not None:
prev_physical_index = toc_items[j]['physical_index']
break

# Find next physical_index
next_physical_index = -1 # Default if no next item exists
# Find next physical_index.
# Default is end_index (last page of document) so that the last
# TOC item — which has no successor — still gets a valid search
# window instead of an empty range(prev, 0).
next_physical_index = end_index
for j in range(i + 1, len(toc_items)):
if toc_items[j].get('physical_index') is not None:
next_physical_index = toc_items[j]['physical_index']
Expand Down
Empty file added tests/__init__.py
Empty file.
220 changes: 220 additions & 0 deletions tests/test_toc_offset_and_page_range.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,220 @@
"""
Tests for two related bugs in TOC processing:

Bug 1 — add_page_offset_to_toc_json crashes with TypeError when offset is None.
Root cause: calculate_page_offset() returns None when no title/page pairs can be
matched (e.g. the visible TOC pages don't overlap with the first N pages of content).
add_page_offset_to_toc_json() then tried ``data[i]['page'] + None``, raising TypeError.

Bug 2 — process_none_page_numbers produces an empty page-content window for the *last*
TOC item when it has no physical_index.
Root cause: ``next_physical_index`` defaulted to ``-1`` (meaning "no next item"),
and ``range(prev, -1 + 1)`` == ``range(prev, 0)`` is always empty. The LLM was
called with an empty string and could not locate the section.
Secondary: ``prev_physical_index`` defaulted to ``0``, which produces list_index ``-1``
and silently skips the first page (bounds check catches it, but page 1 is excluded).
"""
import copy
import pytest

from pageindex.page_index import (
add_page_offset_to_toc_json,
calculate_page_offset,
process_none_page_numbers,
)


# ---------------------------------------------------------------------------
# Bug 1: add_page_offset_to_toc_json with None offset
# ---------------------------------------------------------------------------

class TestAddPageOffsetToTocJson:
"""Covers the None-offset guard added to add_page_offset_to_toc_json."""

def test_none_offset_returns_data_unchanged(self):
"""When offset is None the input list must be returned unmodified."""
data = [
{"structure": "1", "title": "Introduction", "page": 5},
{"structure": "2", "title": "Methods", "page": 10},
]
original = copy.deepcopy(data)
result = add_page_offset_to_toc_json(data, None)
assert result == original, "Data should be unchanged when offset is None"

def test_none_offset_no_crash(self):
"""No exception must be raised when offset is None."""
data = [{"structure": "1", "title": "Intro", "page": 3}]
try:
add_page_offset_to_toc_json(data, None)
except TypeError as exc:
pytest.fail(f"TypeError raised with None offset: {exc}")

def test_integer_offset_applied_correctly(self):
"""A valid integer offset must still compute physical_index = page + offset."""
data = [
{"structure": "1", "title": "Intro", "page": 5},
{"structure": "2", "title": "Methods", "page": 10},
]
result = add_page_offset_to_toc_json(data, 3)
assert result[0]["physical_index"] == 8
assert result[1]["physical_index"] == 13
assert "page" not in result[0]
assert "page" not in result[1]

def test_zero_offset_applied_correctly(self):
"""Offset of 0 means page number == physical_index."""
data = [{"structure": "1", "title": "Intro", "page": 7}]
result = add_page_offset_to_toc_json(data, 0)
assert result[0]["physical_index"] == 7
assert "page" not in result[0]

def test_negative_offset_applied_correctly(self):
"""Negative offsets (e.g. roman-numeral front matter) must work."""
data = [{"structure": "1", "title": "Chapter 1", "page": 10}]
result = add_page_offset_to_toc_json(data, -2)
assert result[0]["physical_index"] == 8

def test_none_page_value_skipped(self):
"""Items with page=None must not be modified even with a valid offset."""
data = [{"structure": "1", "title": "Unknown", "page": None}]
result = add_page_offset_to_toc_json(data, 3)
assert result[0].get("page") is None
assert "physical_index" not in result[0]

def test_item_without_page_key_skipped(self):
"""Items that already have a physical_index (no 'page' key) are untouched."""
data = [{"structure": "1", "title": "Intro", "physical_index": 5}]
result = add_page_offset_to_toc_json(data, 3)
assert result[0]["physical_index"] == 5 # unchanged

def test_empty_list_none_offset(self):
"""Empty input list with None offset must return empty list."""
assert add_page_offset_to_toc_json([], None) == []

def test_empty_list_integer_offset(self):
"""Empty input list with integer offset must return empty list."""
assert add_page_offset_to_toc_json([], 5) == []

def test_calculate_page_offset_returns_none_for_empty_pairs(self):
"""calculate_page_offset([]) must return None (documents the contract)."""
assert calculate_page_offset([]) is None

def test_pipeline_none_offset_does_not_crash(self):
"""
End-to-end simulation: matching_pairs empty → offset None →
add_page_offset_to_toc_json must not crash.
"""
toc = [
{"structure": "1", "title": "Intro", "page": 1},
{"structure": "2", "title": "Methods", "page": 5},
]
pairs = [] # No matches found → offset will be None
offset = calculate_page_offset(pairs)
assert offset is None
# Must not raise
result = add_page_offset_to_toc_json(toc, offset)
# All items still have 'page' key (unchanged)
assert all("page" in item for item in result)


# ---------------------------------------------------------------------------
# Bug 2: process_none_page_numbers — empty range for last/first items
# ---------------------------------------------------------------------------

class TestProcessNonePageNumbers:
"""Covers the empty-range bug in process_none_page_numbers."""

@staticmethod
def _make_page_list(n):
"""Return a synthetic page_list of length n."""
return [(f"Content of page {i + 1}", 50) for i in range(n)]

def test_last_item_gets_non_empty_range(self):
"""
The *last* TOC item (no successor) used to produce range(prev, 0) which is
always empty — the LLM received an empty string. After the fix, the default
next_physical_index is end_index = len(page_list) + start_index - 1.
"""
# Simulate: the last entry has no physical_index yet
# We can verify the window by checking that page_contents is non-empty
# when we manually replicate the loop logic from process_none_page_numbers.
page_list = self._make_page_list(5)
start_index = 1
end_index = len(page_list) + start_index - 1 # = 5

# Scenario: item at index 1 has no physical_index, no next item
prev_physical_index = 2 # from item at index 0
next_physical_index = end_index # new default

page_range = list(range(prev_physical_index, next_physical_index + 1))
assert page_range == [2, 3, 4, 5], (
f"Expected [2,3,4,5] but got {page_range} — last item would get empty context"
)

def test_first_item_prev_default_is_start_index(self):
"""
The *first* TOC item (no predecessor) used to default prev_physical_index to 0.
With start_index=1, list_index = 0 - 1 = -1, which is silently skipped by the
bounds check. While the effective window was the same (page 1 still appeared
due to the subsequent loop), the new default start_index avoids computing
a negative list_index entirely — a cleaner, less error-prone starting point.
"""
start_index = 1

# Old code attempted page 0, list_index = 0 - 1 = -1, bounds check skips it.
# Effective window started at page 1 anyway.
old_attempted_list_index = 0 - start_index # = -1
assert old_attempted_list_index < 0, "Old code produced list_index=-1 (bounds-checked away)"

# New code: prev = start_index = 1, list_index = 1 - 1 = 0 (valid)
new_first_list_index = start_index - start_index # = 0
assert new_first_list_index == 0, "New code starts at list_index=0 (no negative index attempted)"

def test_single_page_document_last_item(self):
"""A 1-page document whose sole TOC item has no physical_index should get range [1,1]."""
page_list = self._make_page_list(1)
start_index = 1
end_index = 1

prev_physical_index = start_index # new default (no predecessor)
next_physical_index = end_index # new default (no successor)

page_range = list(range(prev_physical_index, next_physical_index + 1))
assert page_range == [1]

def test_middle_item_range_uses_neighbors(self):
"""A middle item should still use actual neighbor physical indices."""
# item at index 1 (between items 0 and 2 with known physical_index)
page_list = self._make_page_list(10)
start_index = 1
prev_physical_index = 3 # from item 0
next_physical_index = 7 # from item 2

page_range = list(range(prev_physical_index, next_physical_index + 1))
assert page_range == [3, 4, 5, 6, 7]

def test_regression_old_next_default_minus_one(self):
"""
Regression: range(prev, -1 + 1) == range(prev, 0) must be empty.
Confirms the old bug and validates the fix direction.
"""
prev = 3
old_next = -1 # old default
new_next = 5 # new default (end_index for a 5-page doc)

assert list(range(prev, old_next + 1)) == [], "Old code produced empty range"
assert list(range(prev, new_next + 1)) == [3, 4, 5], "New code produces valid range"

def test_all_items_have_physical_index_no_op(self):
"""When every item already has physical_index, function must return unchanged."""
page_list = self._make_page_list(5)
toc_items = [
{"structure": "1", "title": "Intro", "physical_index": 1},
{"structure": "2", "title": "Methods", "physical_index": 3},
{"structure": "3", "title": "Results", "physical_index": 5},
]
original = copy.deepcopy(toc_items)
result = process_none_page_numbers(toc_items, page_list)
# physical_index values must remain the same (no LLM called, no changes)
for orig, res in zip(original, result):
assert res["physical_index"] == orig["physical_index"]