Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 15 additions & 4 deletions openkb/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from dotenv import load_dotenv

from openkb.config import DEFAULT_CONFIG, load_config, save_config, load_global_config, register_kb
from openkb.converter import convert_document
from openkb.converter import _registry_path, convert_document
from openkb.log import append_log
from openkb.schema import AGENTS_MD

Expand Down Expand Up @@ -160,14 +160,14 @@ def add_single_file(file_path: Path, kb_dir: Path) -> None:
click.echo(f" [SKIP] Already in knowledge base: {file_path.name}")
return

doc_name = file_path.stem
doc_name = result.doc_name or file_path.stem

# 3/4. Index and compile
if result.is_long_doc:
click.echo(f" Long document detected — indexing with PageIndex...")
try:
from openkb.indexer import index_long_document
index_result = index_long_document(result.raw_path, kb_dir)
index_result = index_long_document(result.raw_path, kb_dir, doc_name=doc_name)
except Exception as exc:
click.echo(f" [ERROR] Indexing failed: {exc}")
logger.debug("Indexing traceback:", exc_info=True)
Expand Down Expand Up @@ -208,7 +208,18 @@ def add_single_file(file_path: Path, kb_dir: Path) -> None:
# Register hash only after successful compilation
if result.file_hash:
doc_type = "long_pdf" if result.is_long_doc else file_path.suffix.lstrip(".")
registry.add(result.file_hash, {"name": file_path.name, "type": doc_type})
metadata = {
"name": file_path.name,
"doc_name": doc_name,
"type": doc_type,
"path": _registry_path(file_path, kb_dir),
}
if result.raw_path is not None:
metadata["raw_path"] = _registry_path(result.raw_path, kb_dir)
if result.source_path is not None:
metadata["source_path"] = _registry_path(result.source_path, kb_dir)
registry.remove_by_doc_name(doc_name)
registry.add(result.file_hash, metadata)

append_log(kb_dir / "wiki", "ingest", file_path.name)
click.echo(f" [OK] {file_path.name} added to knowledge base.")
Expand Down
62 changes: 53 additions & 9 deletions openkb/converter.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
"""Document conversion pipeline for OpenKB."""
from __future__ import annotations

import hashlib
import logging
import re
import shutil
from dataclasses import dataclass, field
import unicodedata
from dataclasses import dataclass
from pathlib import Path

import pymupdf
Expand All @@ -25,6 +28,29 @@ class ConvertResult:
is_long_doc: bool = False
skipped: bool = False
file_hash: str | None = None # For deferred hash registration
doc_name: str | None = None


_SAFE_STEM_RE = re.compile(r"[^\w\-]+")
_DOC_HASH_LEN = 12


def _registry_path(path: Path, kb_dir: Path) -> str:
"""Return a portable path string for registry lookups."""
resolved_path = path.resolve()
resolved_kb = kb_dir.resolve()
if resolved_path.is_relative_to(resolved_kb):
return resolved_path.relative_to(resolved_kb).as_posix()
return resolved_path.as_posix()


def _make_doc_name(src: Path, kb_dir: Path) -> str:
    """Return a stable, collision-resistant document name for a path.

    The name combines a filesystem-safe version of the file stem with a
    short hash of the registry path, so equal basenames in different
    folders (or different extensions) never collide.
    """
    # NFKC-normalize so visually identical Unicode stems map to one name.
    normalized_stem = unicodedata.normalize("NFKC", src.stem)
    safe_stem = _SAFE_STEM_RE.sub("-", normalized_stem).strip("-")
    if not safe_stem:
        safe_stem = "document"
    # Hash the portable registry path, not the raw OS path, for stability.
    digest = hashlib.sha256(_registry_path(src, kb_dir).encode("utf-8")).hexdigest()
    return f"{safe_stem}-{digest[:_DOC_HASH_LEN]}"


def get_pdf_page_count(path: Path) -> int:
Expand Down Expand Up @@ -56,17 +82,27 @@ def convert_document(src: Path, kb_dir: Path) -> ConvertResult:
# 1. Hash check
# ------------------------------------------------------------------
file_hash = HashRegistry.hash_file(src)
path_key = _registry_path(src, kb_dir)
path_metadata = registry.get_by_path(path_key) or {}
doc_name = path_metadata.get("doc_name") or _make_doc_name(src, kb_dir)
if registry.is_known(file_hash):
logger.info("Skipping already-known file: %s", src.name)
return ConvertResult(skipped=True)
metadata = registry.get(file_hash) or {}
return ConvertResult(
skipped=True,
file_hash=file_hash,
doc_name=metadata.get("doc_name") or doc_name,
)

# ------------------------------------------------------------------
# 2. Copy to raw/
# ------------------------------------------------------------------
raw_dir = kb_dir / "raw"
raw_dir.mkdir(parents=True, exist_ok=True)
raw_dest = raw_dir / src.name
if raw_dest.resolve() != src.resolve():
if src.resolve().is_relative_to(raw_dir.resolve()):
raw_dest = src
else:
raw_dest = raw_dir / f"{doc_name}{src.suffix.lower()}"
shutil.copy2(src, raw_dest)

# ------------------------------------------------------------------
Expand All @@ -81,18 +117,21 @@ def convert_document(src: Path, kb_dir: Path) -> ConvertResult:
threshold,
src.name,
)
return ConvertResult(raw_path=raw_dest, is_long_doc=True, file_hash=file_hash)
return ConvertResult(
raw_path=raw_dest,
doc_name=doc_name,
is_long_doc=True,
file_hash=file_hash,
)

# ------------------------------------------------------------------
# 4/5. Convert to Markdown
# ------------------------------------------------------------------
sources_dir = kb_dir / "wiki" / "sources"
sources_dir.mkdir(parents=True, exist_ok=True)
images_dir = kb_dir / "wiki" / "sources" / "images" / src.stem
images_dir = kb_dir / "wiki" / "sources" / "images" / doc_name
images_dir.mkdir(parents=True, exist_ok=True)

doc_name = src.stem

if src.suffix.lower() == ".md":
markdown = src.read_text(encoding="utf-8")
markdown = copy_relative_images(markdown, src.parent, doc_name, images_dir)
Expand All @@ -109,4 +148,9 @@ def convert_document(src: Path, kb_dir: Path) -> ConvertResult:
dest_md = sources_dir / f"{doc_name}.md"
dest_md.write_text(markdown, encoding="utf-8")

return ConvertResult(raw_path=raw_dest, source_path=dest_md, file_hash=file_hash)
return ConvertResult(
raw_path=raw_dest,
source_path=dest_md,
doc_name=doc_name,
file_hash=file_hash,
)
17 changes: 9 additions & 8 deletions openkb/indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ class IndexResult:
tree: dict


def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult:
def index_long_document(pdf_path: Path, kb_dir: Path, doc_name: str | None = None) -> IndexResult:
"""Index a long PDF document using PageIndex and write wiki pages."""
openkb_dir = kb_dir / ".openkb"
config = load_config(openkb_dir / "config.yaml")
Expand Down Expand Up @@ -63,24 +63,25 @@ def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult:

# Fetch complete document (metadata + structure + text)
doc = col.get_document(doc_id, include_text=True)
doc_name: str = doc.get("doc_name", pdf_path.stem)
indexed_doc_name: str = doc.get("doc_name", pdf_path.stem)
description: str = doc.get("doc_description", "")
structure: list = doc.get("structure", [])
source_name = doc_name or pdf_path.stem

# Debug: print doc keys and page_count to diagnose get_page_content range
logger.info("Doc keys: %s", list(doc.keys()))
logger.info("page_count from doc: %s", doc.get("page_count", "NOT PRESENT"))

tree = {
"doc_name": doc_name,
"doc_name": indexed_doc_name,
"doc_description": description,
"structure": structure,
}

# Write wiki/sources/ — per-page content
sources_dir = kb_dir / "wiki" / "sources"
sources_dir.mkdir(parents=True, exist_ok=True)
images_dir = sources_dir / "images" / pdf_path.stem
images_dir = sources_dir / "images" / source_name

from openkb.images import convert_pdf_to_pages

Expand All @@ -98,16 +99,16 @@ def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult:
if not all_pages:
if pageindex_api_key:
logger.warning("Cloud returned no pages for %s; falling back to local pymupdf", pdf_path.name)
all_pages = convert_pdf_to_pages(pdf_path, pdf_path.stem, images_dir)
all_pages = convert_pdf_to_pages(pdf_path, source_name, images_dir)

(sources_dir / f"{pdf_path.stem}.json").write_text(
(sources_dir / f"{source_name}.json").write_text(
json_mod.dumps(all_pages, ensure_ascii=False, indent=2), encoding="utf-8",
)

# Write wiki/summaries/ (no images, just summaries)
summaries_dir = kb_dir / "wiki" / "summaries"
summaries_dir.mkdir(parents=True, exist_ok=True)
summary_md = render_summary_md(tree, pdf_path.stem, doc_id)
(summaries_dir / f"{pdf_path.stem}.md").write_text(summary_md, encoding="utf-8")
summary_md = render_summary_md(tree, source_name, doc_id)
(summaries_dir / f"{source_name}.md").write_text(summary_md, encoding="utf-8")

return IndexResult(doc_id=doc_id, description=description, tree=tree)
24 changes: 24 additions & 0 deletions openkb/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,17 @@ def all_entries(self) -> dict[str, dict]:
"""Return a shallow copy of all hash -> metadata entries."""
return dict(self._data)

def get_by_path(self, path: str) -> dict | None:
"""Return metadata registered for a source, raw, or wiki path."""
for metadata in self._data.values():
if path in {
metadata.get("path"),
metadata.get("raw_path"),
metadata.get("source_path"),
}:
return metadata
return None

# ------------------------------------------------------------------
# Mutation
# ------------------------------------------------------------------
Expand All @@ -41,6 +52,19 @@ def add(self, file_hash: str, metadata: dict) -> None:
self._data[file_hash] = metadata
self._persist()

def remove_by_doc_name(self, doc_name: str) -> None:
"""Remove older content-hash entries for the same document."""
stale_hashes = [
file_hash
for file_hash, metadata in self._data.items()
if metadata.get("doc_name") == doc_name
]
if not stale_hashes:
return
for file_hash in stale_hashes:
del self._data[file_hash]
self._persist()

# ------------------------------------------------------------------
# Internal
# ------------------------------------------------------------------
Expand Down
8 changes: 8 additions & 0 deletions tests/test_add_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,9 @@ def test_add_short_doc_runs_compiler(self, tmp_path):
mock_result = ConvertResult(
raw_path=kb_dir / "raw" / "test.md",
source_path=source_path,
doc_name="test-deadbeef00",
is_long_doc=False,
file_hash="deadbeef00" * 8,
)

runner = CliRunner()
Expand All @@ -149,3 +151,9 @@ def test_add_short_doc_runs_compiler(self, tmp_path):
result = runner.invoke(cli, ["add", str(doc)])
mock_arun.assert_called_once()
assert "OK" in result.output

hashes = json.loads((kb_dir / ".openkb" / "hashes.json").read_text())
assert hashes[mock_result.file_hash]["doc_name"] == "test-deadbeef00"
assert hashes[mock_result.file_hash]["path"] == "test.md"
assert hashes[mock_result.file_hash]["raw_path"] == "raw/test.md"
assert hashes[mock_result.file_hash]["source_path"] == "wiki/sources/test.md"
78 changes: 77 additions & 1 deletion tests/test_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

import pytest

from openkb.converter import ConvertResult, convert_document, get_pdf_page_count
from openkb.converter import ConvertResult, _make_doc_name, convert_document, get_pdf_page_count


# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -42,7 +42,9 @@ def test_md_file_copied_to_wiki_sources(self, kb_dir):

assert result.skipped is False
assert result.is_long_doc is False
assert result.doc_name == _make_doc_name(src, kb_dir)
assert result.source_path is not None
assert result.source_path.name == f"{result.doc_name}.md"
assert result.source_path.exists()
assert result.source_path.read_text(encoding="utf-8").startswith("# Notes")

Expand Down Expand Up @@ -72,8 +74,59 @@ def test_md_raw_file_copied(self, kb_dir):
result = convert_document(src, kb_dir)

assert result.raw_path is not None
assert result.raw_path.name == f"{result.doc_name}.md"
assert result.raw_path.exists()

def test_same_filename_different_paths_get_distinct_outputs(self, kb_dir):
"""Same basename in different folders must not overwrite outputs."""
first_dir = kb_dir / "inputs" / "first"
second_dir = kb_dir / "inputs" / "second"
first_dir.mkdir(parents=True)
second_dir.mkdir(parents=True)
first = first_dir / "report.md"
second = second_dir / "report.md"
first.write_text("# First\n\nAlpha content.", encoding="utf-8")
second.write_text("# Second\n\nBeta content.", encoding="utf-8")

first_result = convert_document(first, kb_dir)
second_result = convert_document(second, kb_dir)

assert first_result.doc_name != second_result.doc_name
assert first_result.source_path != second_result.source_path
assert first_result.raw_path != second_result.raw_path
assert first_result.source_path.read_text(encoding="utf-8").startswith("# First")
assert second_result.source_path.read_text(encoding="utf-8").startswith("# Second")

def test_raw_copy_keeps_doc_name_when_content_changes(self, kb_dir):
"""A watched raw copy keeps the original document identity across edits."""
from openkb.state import HashRegistry

src = kb_dir / "inputs" / "notes.md"
src.parent.mkdir(parents=True)
src.write_text("# Notes\n\nOld content.", encoding="utf-8")

first_result = convert_document(src, kb_dir)
registry = HashRegistry(kb_dir / ".openkb" / "hashes.json")
registry.add(
first_result.file_hash,
{
"name": src.name,
"doc_name": first_result.doc_name,
"path": "inputs/notes.md",
"raw_path": f"raw/{first_result.doc_name}.md",
"source_path": f"wiki/sources/{first_result.doc_name}.md",
"type": "md",
},
)

first_result.raw_path.write_text("# Notes\n\nNew content.", encoding="utf-8")
second_result = convert_document(first_result.raw_path, kb_dir)

assert second_result.skipped is False
assert second_result.doc_name == first_result.doc_name
assert second_result.source_path == first_result.source_path
assert second_result.source_path.read_text(encoding="utf-8").startswith("# Notes\n\nNew content.")


# ---------------------------------------------------------------------------
# convert_document — PDF short doc
Expand Down Expand Up @@ -104,6 +157,29 @@ def test_short_pdf_converted_via_pymupdf(self, kb_dir, tmp_path):
assert result.source_path is not None
assert result.source_path.exists()

def test_same_stem_different_extensions_get_distinct_outputs(self, kb_dir, tmp_path):
"""report.md and report.pdf must not share the same internal name."""
md_src = tmp_path / "report.md"
pdf_src = tmp_path / "report.pdf"
md_src.write_text("# Markdown report", encoding="utf-8")
pdf_src.write_bytes(b"%PDF-1.4 fake content")

md_result = convert_document(md_src, kb_dir)
with (
patch("openkb.converter.pymupdf.open") as mock_mu,
patch("openkb.converter.convert_pdf_with_images", return_value="# PDF report"),
):
fake_doc = MagicMock()
fake_doc.page_count = 5
fake_doc.__enter__ = MagicMock(return_value=fake_doc)
fake_doc.__exit__ = MagicMock(return_value=False)
mock_mu.return_value = fake_doc
pdf_result = convert_document(pdf_src, kb_dir)

assert md_result.doc_name != pdf_result.doc_name
assert md_result.source_path != pdf_result.source_path
assert md_result.raw_path != pdf_result.raw_path


# ---------------------------------------------------------------------------
# convert_document — PDF long doc
Expand Down
Loading