Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 30 additions & 14 deletions pageindex/page_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ def toc_detector_single_page(content, model=None):
response = llm_completion(model=model, prompt=prompt)
# print('response', response)
json_content = extract_json(response)
return json_content['toc_detected']
return json_content.get('toc_detected', 'no')
Comment on lines 119 to +122
Copy link

Copilot AI Apr 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

extract_json() can return a non-dict (e.g., a JSON list when the LLM outputs [...]). In that case, calling .get(...) will raise AttributeError and reintroduce a crash path. Consider guarding with isinstance(json_content, dict) (and defaulting to 'no') before accessing toc_detected.

Copilot uses AI. Check for mistakes.


def check_if_toc_extraction_is_complete(content, toc, model=None):
Expand All @@ -137,7 +137,7 @@ def check_if_toc_extraction_is_complete(content, toc, model=None):
prompt = prompt + '\n Document:\n' + content + '\n Table of contents:\n' + toc
response = llm_completion(model=model, prompt=prompt)
json_content = extract_json(response)
return json_content['completed']
return json_content.get('completed', 'no')
Comment on lines 137 to +140
Copy link

Copilot AI Apr 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

extract_json() may return a list if the model outputs a JSON array; in that case json_content.get('completed', ...) will raise AttributeError. Add an isinstance(json_content, dict) guard (and default to 'no') so malformed-but-parseable outputs don't crash this completeness check.

Copilot uses AI. Check for mistakes.


def check_if_toc_transformation_is_complete(content, toc, model=None):
Expand All @@ -155,7 +155,7 @@ def check_if_toc_transformation_is_complete(content, toc, model=None):
prompt = prompt + '\n Raw Table of contents:\n' + content + '\n Cleaned Table of contents:\n' + toc
response = llm_completion(model=model, prompt=prompt)
json_content = extract_json(response)
return json_content['completed']
return json_content.get('completed', 'no')
Comment on lines 155 to +158
Copy link

Copilot AI Apr 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same issue as above: extract_json() can return a list (valid JSON), and json_content.get('completed', ...) would then raise AttributeError. Guard with isinstance(json_content, dict) (or handle list outputs explicitly) before accessing completed.

Copilot uses AI. Check for mistakes.

def extract_toc_content(content, model=None):
prompt = f"""
Expand Down Expand Up @@ -217,7 +217,7 @@ def detect_page_index(toc_content, model=None):

response = llm_completion(model=model, prompt=prompt)
json_content = extract_json(response)
return json_content['page_index_given_in_toc']
return json_content.get('page_index_given_in_toc', 'no')
Copy link

Copilot AI Apr 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

extract_json() can successfully parse a JSON list if the model responds with [...]. In that case, json_content.get('page_index_given_in_toc', ...) will raise AttributeError. Add an isinstance(json_content, dict) guard (default 'no') to avoid crashing on parseable-but-unexpected outputs.

Suggested change
return json_content.get('page_index_given_in_toc', 'no')
if isinstance(json_content, dict):
return json_content.get('page_index_given_in_toc', 'no')
return 'no'

Copilot uses AI. Check for mistakes.

def toc_extractor(page_list, toc_page_list, model):
def transform_dots_to_colon(text):
Expand Down Expand Up @@ -296,7 +296,7 @@ def toc_transformer(toc_content, model=None):
if_complete = check_if_toc_transformation_is_complete(toc_content, last_complete, model)
if if_complete == "yes" and finish_reason == "finished":
last_complete = extract_json(last_complete)
cleaned_response=convert_page_to_int(last_complete['table_of_contents'])
cleaned_response=convert_page_to_int(last_complete.get('table_of_contents', []) if isinstance(last_complete, dict) else [])
return cleaned_response
Comment on lines 296 to 300
Copy link

Copilot AI Apr 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

convert_page_to_int() assumes each element is a dict and will misbehave or crash if table_of_contents contains non-dicts (e.g., list of strings from malformed LLM output). Since toc_transformer() returns this value and downstream code assumes dict items, consider validating that table_of_contents is a list of dicts (filtering or coercing) before calling convert_page_to_int() / returning.

Copilot uses AI. Check for mistakes.

last_complete = get_json_content(last_complete)
Expand All @@ -323,7 +323,7 @@ def toc_transformer(toc_content, model=None):

new_complete, finish_reason = llm_completion(model=model, prompt=prompt, return_finish_reason=True)

if new_complete.startswith('```json'):
if new_complete is not None and isinstance(new_complete, str) and new_complete.startswith('```json'):
new_complete = get_json_content(new_complete)
last_complete = last_complete+new_complete

Expand All @@ -332,7 +332,7 @@ def toc_transformer(toc_content, model=None):

last_complete = extract_json(last_complete)

cleaned_response=convert_page_to_int(last_complete['table_of_contents'])
cleaned_response=convert_page_to_int(last_complete.get('table_of_contents', []) if isinstance(last_complete, dict) else [])
return cleaned_response


Expand Down Expand Up @@ -414,6 +414,15 @@ def calculate_page_offset(pairs):
return most_common

def add_page_offset_to_toc_json(data, offset):
"""Apply a page offset to convert TOC page numbers to physical indices.

When ``offset`` is ``None`` (e.g. because no matching title/page pairs
were found in the document), the function returns ``data`` unchanged
rather than crashing with a ``TypeError``. Callers should handle the
resulting items-without-``physical_index`` through ``process_none_page_numbers``.
"""
if offset is None:
return data
for i in range(len(data)):
if data[i].get('page') is not None and isinstance(data[i]['page'], int):
data[i]['physical_index'] = data[i]['page'] + offset
Expand Down Expand Up @@ -584,9 +593,12 @@ def process_no_toc(page_list, start_index=1, model=None, logger=None):
logger.info(f'len(group_texts): {len(group_texts)}')

toc_with_page_number= generate_toc_init(group_texts[0], model)
if not isinstance(toc_with_page_number, list):
toc_with_page_number = []
for group_text in group_texts[1:]:
toc_with_page_number_additional = generate_toc_continue(toc_with_page_number, group_text, model)
toc_with_page_number.extend(toc_with_page_number_additional)
toc_with_page_number_additional = generate_toc_continue(toc_with_page_number, group_text, model)
if isinstance(toc_with_page_number_additional, list):
toc_with_page_number.extend(toc_with_page_number_additional)
logger.info(f'generate_toc: {toc_with_page_number}')

toc_with_page_number = convert_physical_index_to_int(toc_with_page_number)
Expand Down Expand Up @@ -654,18 +666,22 @@ def process_toc_with_page_numbers(toc_content, toc_page_list, page_list, toc_che

##check if needed to process none page numbers
def process_none_page_numbers(toc_items, page_list, start_index=1, model=None):
end_index = len(page_list) + start_index - 1
for i, item in enumerate(toc_items):
if "physical_index" not in item:
# logger.info(f"fix item: {item}")
Comment on lines 668 to 672
Copy link

Copilot AI Apr 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

process_none_page_numbers() still assumes each TOC item is a dict and that a 'page' key exists (later del item_copy['page'] / del item['page']). With malformed LLM output (missing keys or non-dict items), this can raise (e.g., TypeError/KeyError) before the later meta_processor dict-filter runs. Consider adding isinstance(item, dict) checks and using pop('page', None) instead of unconditional del.

Copilot uses AI. Check for mistakes.
# Find previous physical_index
prev_physical_index = 0 # Default if no previous item exists
prev_physical_index = start_index # Default: start of document
for j in range(i - 1, -1, -1):
if toc_items[j].get('physical_index') is not None:
prev_physical_index = toc_items[j]['physical_index']
break

# Find next physical_index
next_physical_index = -1 # Default if no next item exists
# Find next physical_index.
# Default is end_index (last page of document) so that the last
# TOC item — which has no successor — still gets a valid search
# window instead of an empty range(prev, 0).
next_physical_index = end_index
for j in range(i + 1, len(toc_items)):
if toc_items[j].get('physical_index') is not None:
next_physical_index = toc_items[j]['physical_index']
Expand Down Expand Up @@ -753,7 +769,7 @@ async def single_toc_item_index_fixer(section_title, content, model=None):
prompt = toc_extractor_prompt + '\nSection Title:\n' + str(section_title) + '\nDocument pages:\n' + content
response = await llm_acompletion(model=model, prompt=prompt)
json_content = extract_json(response)
return convert_physical_index_to_int(json_content['physical_index'])
return convert_physical_index_to_int(json_content.get('physical_index'))
Comment on lines 769 to +772
Copy link

Copilot AI Apr 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

extract_json() can return a non-dict (e.g., a JSON list), in which case json_content.get('physical_index') will raise AttributeError. Add an isinstance(json_content, dict) guard (defaulting to None) before accessing physical_index to keep this fixer resilient to malformed LLM output.

Copilot uses AI. Check for mistakes.



Expand Down Expand Up @@ -967,7 +983,7 @@ async def meta_processor(page_list, mode=None, toc_content=None, toc_page_list=N
else:
toc_with_page_number = process_no_toc(page_list, start_index=start_index, model=opt.model, logger=logger)

toc_with_page_number = [item for item in toc_with_page_number if item.get('physical_index') is not None]
toc_with_page_number = [item for item in toc_with_page_number if isinstance(item, dict) and item.get('physical_index') is not None]

toc_with_page_number = validate_and_truncate_physical_indices(
toc_with_page_number,
Expand Down
Loading