diff --git a/openkb/cli.py b/openkb/cli.py index 658b3781..f86b2ce1 100644 --- a/openkb/cli.py +++ b/openkb/cli.py @@ -26,7 +26,7 @@ from dotenv import load_dotenv from openkb.config import DEFAULT_CONFIG, load_config, save_config, load_global_config, register_kb -from openkb.converter import convert_document +from openkb.converter import _registry_path, convert_document from openkb.log import append_log from openkb.schema import AGENTS_MD @@ -160,14 +160,14 @@ def add_single_file(file_path: Path, kb_dir: Path) -> None: click.echo(f" [SKIP] Already in knowledge base: {file_path.name}") return - doc_name = file_path.stem + doc_name = result.doc_name or file_path.stem # 3/4. Index and compile if result.is_long_doc: click.echo(f" Long document detected — indexing with PageIndex...") try: from openkb.indexer import index_long_document - index_result = index_long_document(result.raw_path, kb_dir) + index_result = index_long_document(result.raw_path, kb_dir, doc_name=doc_name) except Exception as exc: click.echo(f" [ERROR] Indexing failed: {exc}") logger.debug("Indexing traceback:", exc_info=True) @@ -208,7 +208,18 @@ def add_single_file(file_path: Path, kb_dir: Path) -> None: # Register hash only after successful compilation if result.file_hash: doc_type = "long_pdf" if result.is_long_doc else file_path.suffix.lstrip(".") - registry.add(result.file_hash, {"name": file_path.name, "type": doc_type}) + metadata = { + "name": file_path.name, + "doc_name": doc_name, + "type": doc_type, + "path": _registry_path(file_path, kb_dir), + } + if result.raw_path is not None: + metadata["raw_path"] = _registry_path(result.raw_path, kb_dir) + if result.source_path is not None: + metadata["source_path"] = _registry_path(result.source_path, kb_dir) + registry.remove_by_doc_name(doc_name) + registry.add(result.file_hash, metadata) append_log(kb_dir / "wiki", "ingest", file_path.name) click.echo(f" [OK] {file_path.name} added to knowledge base.") diff --git a/openkb/converter.py b/openkb/converter.py index 3f5f5299..31ea5145 100644 --- a/openkb/converter.py +++ b/openkb/converter.py @@ -1,9 +1,12 @@ """Document conversion pipeline for OpenKB.""" from __future__ import annotations +import hashlib import logging +import re import shutil -from dataclasses import dataclass, field +import unicodedata +from dataclasses import dataclass from pathlib import Path import pymupdf @@ -25,6 +28,29 @@ class ConvertResult: is_long_doc: bool = False skipped: bool = False file_hash: str | None = None # For deferred hash registration + doc_name: str | None = None + + +_SAFE_STEM_RE = re.compile(r"[^\w\-]+") +_DOC_HASH_LEN = 12 + + +def _registry_path(path: Path, kb_dir: Path) -> str: + """Return a portable path string for registry lookups.""" + resolved_path = path.resolve() + resolved_kb = kb_dir.resolve() + if resolved_path.is_relative_to(resolved_kb): + return resolved_path.relative_to(resolved_kb).as_posix() + return resolved_path.as_posix() + + +def _make_doc_name(src: Path, kb_dir: Path) -> str: + """Return a stable, collision-resistant document name for a path.""" + stem = unicodedata.normalize("NFKC", src.stem) + safe_stem = _SAFE_STEM_RE.sub("-", stem).strip("-") or "document" + path_key = _registry_path(src, kb_dir) + path_hash = hashlib.sha256(path_key.encode("utf-8")).hexdigest()[:_DOC_HASH_LEN] + return f"{safe_stem}-{path_hash}" def get_pdf_page_count(path: Path) -> int: @@ -56,17 +82,27 @@ def convert_document(src: Path, kb_dir: Path) -> ConvertResult: # 1. Hash check # ------------------------------------------------------------------ file_hash = HashRegistry.hash_file(src) + path_key = _registry_path(src, kb_dir) + path_metadata = registry.get_by_path(path_key) or {} + doc_name = path_metadata.get("doc_name") or _make_doc_name(src, kb_dir) if registry.is_known(file_hash): logger.info("Skipping already-known file: %s", src.name) - return ConvertResult(skipped=True) + metadata = registry.get(file_hash) or {} + return ConvertResult( + skipped=True, + file_hash=file_hash, + doc_name=metadata.get("doc_name") or doc_name, + ) # ------------------------------------------------------------------ # 2. Copy to raw/ # ------------------------------------------------------------------ raw_dir = kb_dir / "raw" raw_dir.mkdir(parents=True, exist_ok=True) - raw_dest = raw_dir / src.name - if raw_dest.resolve() != src.resolve(): + if src.resolve().is_relative_to(raw_dir.resolve()): + raw_dest = src + else: + raw_dest = raw_dir / f"{doc_name}{src.suffix.lower()}" shutil.copy2(src, raw_dest) # ------------------------------------------------------------------ @@ -81,18 +117,21 @@ def convert_document(src: Path, kb_dir: Path) -> ConvertResult: threshold, src.name, ) - return ConvertResult(raw_path=raw_dest, is_long_doc=True, file_hash=file_hash) + return ConvertResult( + raw_path=raw_dest, + doc_name=doc_name, + is_long_doc=True, + file_hash=file_hash, + ) # ------------------------------------------------------------------ # 4/5. Convert to Markdown # ------------------------------------------------------------------ sources_dir = kb_dir / "wiki" / "sources" sources_dir.mkdir(parents=True, exist_ok=True) - images_dir = kb_dir / "wiki" / "sources" / "images" / src.stem + images_dir = kb_dir / "wiki" / "sources" / "images" / doc_name images_dir.mkdir(parents=True, exist_ok=True) - doc_name = src.stem - if src.suffix.lower() == ".md": markdown = src.read_text(encoding="utf-8") markdown = copy_relative_images(markdown, src.parent, doc_name, images_dir) @@ -109,4 +148,9 @@ def convert_document(src: Path, kb_dir: Path) -> ConvertResult: dest_md = sources_dir / f"{doc_name}.md" dest_md.write_text(markdown, encoding="utf-8") - return ConvertResult(raw_path=raw_dest, source_path=dest_md, file_hash=file_hash) + return ConvertResult( + raw_path=raw_dest, + source_path=dest_md, + doc_name=doc_name, + file_hash=file_hash, + ) diff --git a/openkb/indexer.py b/openkb/indexer.py index 6ea9d73f..68e16327 100644 --- a/openkb/indexer.py +++ b/openkb/indexer.py @@ -26,7 +26,7 @@ class IndexResult: tree: dict -def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult: +def index_long_document(pdf_path: Path, kb_dir: Path, doc_name: str | None = None) -> IndexResult: """Index a long PDF document using PageIndex and write wiki pages.""" openkb_dir = kb_dir / ".openkb" config = load_config(openkb_dir / "config.yaml") @@ -63,16 +63,17 @@ def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult: # Fetch complete document (metadata + structure + text) doc = col.get_document(doc_id, include_text=True) - doc_name: str = doc.get("doc_name", pdf_path.stem) + indexed_doc_name: str = doc.get("doc_name", pdf_path.stem) description: str = doc.get("doc_description", "") structure: list = doc.get("structure", []) + source_name = doc_name or pdf_path.stem # Debug: print doc keys and page_count to diagnose get_page_content range logger.info("Doc keys: %s", list(doc.keys())) logger.info("page_count from doc: %s", doc.get("page_count", "NOT PRESENT")) tree = { - "doc_name": doc_name, + "doc_name": indexed_doc_name, "doc_description": description, "structure": structure, } @@ -80,7 +81,7 @@ def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult: # Write wiki/sources/ — per-page content sources_dir = kb_dir / "wiki" / "sources" sources_dir.mkdir(parents=True, exist_ok=True) - images_dir = sources_dir / "images" / pdf_path.stem + images_dir = sources_dir / "images" / source_name from openkb.images import convert_pdf_to_pages @@ -98,16 +99,16 @@ def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult: if not all_pages: if pageindex_api_key: logger.warning("Cloud returned no pages for %s; falling back to local pymupdf", pdf_path.name) - all_pages = convert_pdf_to_pages(pdf_path, pdf_path.stem, images_dir) + all_pages = convert_pdf_to_pages(pdf_path, source_name, images_dir) - (sources_dir / f"{pdf_path.stem}.json").write_text( + (sources_dir / f"{source_name}.json").write_text( json_mod.dumps(all_pages, ensure_ascii=False, indent=2), encoding="utf-8", ) # Write wiki/summaries/ (no images, just summaries) summaries_dir = kb_dir / "wiki" / "summaries" summaries_dir.mkdir(parents=True, exist_ok=True) - summary_md = render_summary_md(tree, pdf_path.stem, doc_id) - (summaries_dir / f"{pdf_path.stem}.md").write_text(summary_md, encoding="utf-8") + summary_md = render_summary_md(tree, source_name, doc_id) + (summaries_dir / f"{source_name}.md").write_text(summary_md, encoding="utf-8") return IndexResult(doc_id=doc_id, description=description, tree=tree) diff --git a/openkb/state.py b/openkb/state.py index 93816066..7d17dbcc 100644 --- a/openkb/state.py +++ b/openkb/state.py @@ -32,6 +32,17 @@ def all_entries(self) -> dict[str, dict]: """Return a shallow copy of all hash -> metadata entries.""" return dict(self._data) + def get_by_path(self, path: str) -> dict | None: + """Return metadata registered for a source, raw, or wiki path.""" + for metadata in self._data.values(): + if path in { + metadata.get("path"), + metadata.get("raw_path"), + metadata.get("source_path"), + }: + return metadata + return None + # ------------------------------------------------------------------ # Mutation # ------------------------------------------------------------------ @@ -41,6 +52,19 @@ def add(self, file_hash: str, metadata: dict) -> None: self._data[file_hash] = metadata self._persist() + def remove_by_doc_name(self, doc_name: str) -> None: + """Remove older content-hash entries for the same document.""" + stale_hashes = [ + file_hash + for file_hash, metadata in self._data.items() + if metadata.get("doc_name") == doc_name + ] + if not stale_hashes: + return + for file_hash in stale_hashes: + del self._data[file_hash] + self._persist() + # ------------------------------------------------------------------ # Internal # ------------------------------------------------------------------ diff --git a/tests/test_add_command.py b/tests/test_add_command.py index 8d62ea24..06af4428 100644 --- a/tests/test_add_command.py +++ b/tests/test_add_command.py @@ -139,7 +139,9 @@ def test_add_short_doc_runs_compiler(self, tmp_path): mock_result = ConvertResult( raw_path=kb_dir / "raw" / "test.md", source_path=source_path, + doc_name="test-deadbeef00", is_long_doc=False, + file_hash="deadbeef00" * 8, ) runner = CliRunner() @@ -149,3 +151,9 @@ def test_add_short_doc_runs_compiler(self, tmp_path): result = runner.invoke(cli, ["add", str(doc)]) mock_arun.assert_called_once() assert "OK" in result.output + + hashes = json.loads((kb_dir / ".openkb" / "hashes.json").read_text()) + assert hashes[mock_result.file_hash]["doc_name"] == "test-deadbeef00" + assert hashes[mock_result.file_hash]["path"] == "test.md" + assert hashes[mock_result.file_hash]["raw_path"] == "raw/test.md" + assert hashes[mock_result.file_hash]["source_path"] == "wiki/sources/test.md" diff --git a/tests/test_converter.py b/tests/test_converter.py index 6c184fd6..d21ca482 100644 --- a/tests/test_converter.py +++ b/tests/test_converter.py @@ -7,7 +7,7 @@ import pytest -from openkb.converter import ConvertResult, convert_document, get_pdf_page_count +from openkb.converter import ConvertResult, _make_doc_name, convert_document, get_pdf_page_count # --------------------------------------------------------------------------- @@ -42,7 +42,9 @@ def test_md_file_copied_to_wiki_sources(self, kb_dir): assert result.skipped is False assert result.is_long_doc is False + assert result.doc_name == _make_doc_name(src, kb_dir) assert result.source_path is not None + assert result.source_path.name == f"{result.doc_name}.md" assert result.source_path.exists() assert result.source_path.read_text(encoding="utf-8").startswith("# Notes") @@ -72,8 +74,59 @@ def test_md_raw_file_copied(self, kb_dir): result = convert_document(src, kb_dir) assert result.raw_path is not None + assert result.raw_path.name == f"{result.doc_name}.md" assert result.raw_path.exists() + def test_same_filename_different_paths_get_distinct_outputs(self, kb_dir): + """Same basename in different folders must not overwrite outputs.""" + first_dir = kb_dir / "inputs" / "first" + second_dir = kb_dir / "inputs" / "second" + first_dir.mkdir(parents=True) + second_dir.mkdir(parents=True) + first = first_dir / "report.md" + second = second_dir / "report.md" + first.write_text("# First\n\nAlpha content.", encoding="utf-8") + second.write_text("# Second\n\nBeta content.", encoding="utf-8") + + first_result = convert_document(first, kb_dir) + second_result = convert_document(second, kb_dir) + + assert first_result.doc_name != second_result.doc_name + assert first_result.source_path != second_result.source_path + assert first_result.raw_path != second_result.raw_path + assert first_result.source_path.read_text(encoding="utf-8").startswith("# First") + assert second_result.source_path.read_text(encoding="utf-8").startswith("# Second") + + def test_raw_copy_keeps_doc_name_when_content_changes(self, kb_dir): + """A watched raw copy keeps the original document identity across edits.""" + from openkb.state import HashRegistry + + src = kb_dir / "inputs" / "notes.md" + src.parent.mkdir(parents=True) + src.write_text("# Notes\n\nOld content.", encoding="utf-8") + + first_result = convert_document(src, kb_dir) + registry = HashRegistry(kb_dir / ".openkb" / "hashes.json") + registry.add( + first_result.file_hash, + { + "name": src.name, + "doc_name": first_result.doc_name, + "path": "inputs/notes.md", + "raw_path": f"raw/{first_result.doc_name}.md", + "source_path": f"wiki/sources/{first_result.doc_name}.md", + "type": "md", + }, + ) + + first_result.raw_path.write_text("# Notes\n\nNew content.", encoding="utf-8") + second_result = convert_document(first_result.raw_path, kb_dir) + + assert second_result.skipped is False + assert second_result.doc_name == first_result.doc_name + assert second_result.source_path == first_result.source_path + assert second_result.source_path.read_text(encoding="utf-8").startswith("# Notes\n\nNew content.") + # --------------------------------------------------------------------------- # convert_document — PDF short doc @@ -104,6 +157,29 @@ def test_short_pdf_converted_via_pymupdf(self, kb_dir, tmp_path): assert result.source_path is not None assert result.source_path.exists() + def test_same_stem_different_extensions_get_distinct_outputs(self, kb_dir, tmp_path): + """report.md and report.pdf must not share the same internal name.""" + md_src = tmp_path / "report.md" + pdf_src = tmp_path / "report.pdf" + md_src.write_text("# Markdown report", encoding="utf-8") + pdf_src.write_bytes(b"%PDF-1.4 fake content") + + md_result = convert_document(md_src, kb_dir) + with ( + patch("openkb.converter.pymupdf.open") as mock_mu, + patch("openkb.converter.convert_pdf_with_images", return_value="# PDF report"), + ): + fake_doc = MagicMock() + fake_doc.page_count = 5 + fake_doc.__enter__ = MagicMock(return_value=fake_doc) + fake_doc.__exit__ = MagicMock(return_value=False) + mock_mu.return_value = fake_doc + pdf_result = convert_document(pdf_src, kb_dir) + + assert md_result.doc_name != pdf_result.doc_name + assert md_result.source_path != pdf_result.source_path + assert md_result.raw_path != pdf_result.raw_path + # --------------------------------------------------------------------------- # convert_document — PDF long doc diff --git a/tests/test_indexer.py b/tests/test_indexer.py index 3dbb6772..35ba2753 100644 --- a/tests/test_indexer.py +++ b/tests/test_indexer.py @@ -102,6 +102,25 @@ def test_summary_page_written(self, kb_dir, sample_tree, tmp_path): assert "doc_type: pageindex" in content assert "Summary:" in content + def test_explicit_doc_name_controls_output_paths(self, kb_dir, sample_tree, tmp_path): + """Long doc outputs should use the converter's collision-safe doc name.""" + doc_id = "abc-123" + fake_col = self._make_fake_collection(doc_id, sample_tree) + + fake_client = MagicMock() + fake_client.collection.return_value = fake_col + + pdf_path = tmp_path / "sample.pdf" + pdf_path.write_bytes(b"%PDF-1.4 fake") + + with patch("openkb.indexer.PageIndexClient", return_value=fake_client), \ + patch("openkb.images.convert_pdf_to_pages", return_value=self._fake_pages()): + index_long_document(pdf_path, kb_dir, doc_name="sample-deadbeef00") + + assert (kb_dir / "wiki" / "sources" / "sample-deadbeef00.json").exists() + assert (kb_dir / "wiki" / "summaries" / "sample-deadbeef00.md").exists() + assert not (kb_dir / "wiki" / "sources" / "sample.json").exists() + def test_localclient_called_with_index_config(self, kb_dir, sample_tree, tmp_path): """LocalClient must be created with the correct IndexConfig flags.""" doc_id = "xyz-456"