Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 15 additions & 4 deletions openkb/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from dotenv import load_dotenv

from openkb.config import DEFAULT_CONFIG, load_config, save_config, load_global_config, register_kb
from openkb.converter import convert_document
from openkb.converter import _registry_path, convert_document
from openkb.log import append_log
from openkb.schema import AGENTS_MD

Expand Down Expand Up @@ -160,14 +160,14 @@ def add_single_file(file_path: Path, kb_dir: Path) -> None:
click.echo(f" [SKIP] Already in knowledge base: {file_path.name}")
return

doc_name = file_path.stem
doc_name = result.doc_name or file_path.stem

# 3/4. Index and compile
if result.is_long_doc:
click.echo(f" Long document detected — indexing with PageIndex...")
try:
from openkb.indexer import index_long_document
index_result = index_long_document(result.raw_path, kb_dir)
index_result = index_long_document(result.raw_path, kb_dir, doc_name=doc_name)
except Exception as exc:
click.echo(f" [ERROR] Indexing failed: {exc}")
logger.debug("Indexing traceback:", exc_info=True)
Expand Down Expand Up @@ -208,7 +208,18 @@ def add_single_file(file_path: Path, kb_dir: Path) -> None:
# Register hash only after successful compilation
if result.file_hash:
doc_type = "long_pdf" if result.is_long_doc else file_path.suffix.lstrip(".")
registry.add(result.file_hash, {"name": file_path.name, "type": doc_type})
metadata = {
"name": file_path.name,
"doc_name": doc_name,
"type": doc_type,
"path": _registry_path(file_path, kb_dir),
}
if result.raw_path is not None:
metadata["raw_path"] = _registry_path(result.raw_path, kb_dir)
if result.source_path is not None:
metadata["source_path"] = _registry_path(result.source_path, kb_dir)
registry.remove_by_doc_name(doc_name)
registry.add(result.file_hash, metadata)

append_log(kb_dir / "wiki", "ingest", file_path.name)
click.echo(f" [OK] {file_path.name} added to knowledge base.")
Expand Down
62 changes: 53 additions & 9 deletions openkb/converter.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
"""Document conversion pipeline for OpenKB."""
from __future__ import annotations

import hashlib
import logging
import re
import shutil
from dataclasses import dataclass, field
import unicodedata
from dataclasses import dataclass
from pathlib import Path

import pymupdf
Expand All @@ -25,6 +28,29 @@ class ConvertResult:
is_long_doc: bool = False
skipped: bool = False
file_hash: str | None = None # For deferred hash registration
doc_name: str | None = None


_SAFE_STEM_RE = re.compile(r"[^\w\-]+")
_DOC_HASH_LEN = 12


def _registry_path(path: Path, kb_dir: Path) -> str:
"""Return a portable path string for registry lookups."""
resolved_path = path.resolve()
resolved_kb = kb_dir.resolve()
if resolved_path.is_relative_to(resolved_kb):
return resolved_path.relative_to(resolved_kb).as_posix()
return resolved_path.as_posix()


def _make_doc_name(src: Path, kb_dir: Path) -> str:
    """Return a stable, collision-resistant document name for a path.

    The name combines a filesystem-safe version of the file stem with a
    short hash of the registry path, so equal basenames in different
    folders (or different extensions) never collide.
    """
    # NFKC-normalize so visually identical Unicode stems map to one name.
    normalized_stem = unicodedata.normalize("NFKC", src.stem)
    safe_stem = _SAFE_STEM_RE.sub("-", normalized_stem).strip("-")
    if not safe_stem:
        safe_stem = "document"
    # Hash the portable registry path, not the raw OS path, for stability.
    digest = hashlib.sha256(_registry_path(src, kb_dir).encode("utf-8")).hexdigest()
    return f"{safe_stem}-{digest[:_DOC_HASH_LEN]}"


def get_pdf_page_count(path: Path) -> int:
Expand Down Expand Up @@ -56,17 +82,27 @@ def convert_document(src: Path, kb_dir: Path) -> ConvertResult:
# 1. Hash check
# ------------------------------------------------------------------
file_hash = HashRegistry.hash_file(src)
path_key = _registry_path(src, kb_dir)
path_metadata = registry.get_by_path(path_key) or {}
doc_name = path_metadata.get("doc_name") or _make_doc_name(src, kb_dir)
if registry.is_known(file_hash):
logger.info("Skipping already-known file: %s", src.name)
return ConvertResult(skipped=True)
metadata = registry.get(file_hash) or {}
return ConvertResult(
skipped=True,
file_hash=file_hash,
doc_name=metadata.get("doc_name") or doc_name,
)

# ------------------------------------------------------------------
# 2. Copy to raw/
# ------------------------------------------------------------------
raw_dir = kb_dir / "raw"
raw_dir.mkdir(parents=True, exist_ok=True)
raw_dest = raw_dir / src.name
if raw_dest.resolve() != src.resolve():
if src.resolve().is_relative_to(raw_dir.resolve()):
raw_dest = src
else:
raw_dest = raw_dir / f"{doc_name}{src.suffix.lower()}"
shutil.copy2(src, raw_dest)

# ------------------------------------------------------------------
Expand All @@ -81,18 +117,21 @@ def convert_document(src: Path, kb_dir: Path) -> ConvertResult:
threshold,
src.name,
)
return ConvertResult(raw_path=raw_dest, is_long_doc=True, file_hash=file_hash)
return ConvertResult(
raw_path=raw_dest,
doc_name=doc_name,
is_long_doc=True,
file_hash=file_hash,
)

# ------------------------------------------------------------------
# 4/5. Convert to Markdown
# ------------------------------------------------------------------
sources_dir = kb_dir / "wiki" / "sources"
sources_dir.mkdir(parents=True, exist_ok=True)
images_dir = kb_dir / "wiki" / "sources" / "images" / src.stem
images_dir = kb_dir / "wiki" / "sources" / "images" / doc_name
images_dir.mkdir(parents=True, exist_ok=True)

doc_name = src.stem

if src.suffix.lower() == ".md":
markdown = src.read_text(encoding="utf-8")
markdown = copy_relative_images(markdown, src.parent, doc_name, images_dir)
Expand All @@ -109,4 +148,9 @@ def convert_document(src: Path, kb_dir: Path) -> ConvertResult:
dest_md = sources_dir / f"{doc_name}.md"
dest_md.write_text(markdown, encoding="utf-8")

return ConvertResult(raw_path=raw_dest, source_path=dest_md, file_hash=file_hash)
return ConvertResult(
raw_path=raw_dest,
source_path=dest_md,
doc_name=doc_name,
file_hash=file_hash,
)
17 changes: 9 additions & 8 deletions openkb/indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ class IndexResult:
tree: dict


def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult:
def index_long_document(pdf_path: Path, kb_dir: Path, doc_name: str | None = None) -> IndexResult:
"""Index a long PDF document using PageIndex and write wiki pages."""
openkb_dir = kb_dir / ".openkb"
config = load_config(openkb_dir / "config.yaml")
Expand Down Expand Up @@ -63,24 +63,25 @@ def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult:

# Fetch complete document (metadata + structure + text)
doc = col.get_document(doc_id, include_text=True)
doc_name: str = doc.get("doc_name", pdf_path.stem)
indexed_doc_name: str = doc.get("doc_name", pdf_path.stem)
description: str = doc.get("doc_description", "")
structure: list = doc.get("structure", [])
source_name = doc_name or pdf_path.stem

# Debug: print doc keys and page_count to diagnose get_page_content range
logger.info("Doc keys: %s", list(doc.keys()))
logger.info("page_count from doc: %s", doc.get("page_count", "NOT PRESENT"))

tree = {
"doc_name": doc_name,
"doc_name": indexed_doc_name,
"doc_description": description,
"structure": structure,
}

# Write wiki/sources/ — per-page content
sources_dir = kb_dir / "wiki" / "sources"
sources_dir.mkdir(parents=True, exist_ok=True)
images_dir = sources_dir / "images" / pdf_path.stem
images_dir = sources_dir / "images" / source_name

from openkb.images import convert_pdf_to_pages

Expand All @@ -98,16 +99,16 @@ def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult:
if not all_pages:
if pageindex_api_key:
logger.warning("Cloud returned no pages for %s; falling back to local pymupdf", pdf_path.name)
all_pages = convert_pdf_to_pages(pdf_path, pdf_path.stem, images_dir)
all_pages = convert_pdf_to_pages(pdf_path, source_name, images_dir)

(sources_dir / f"{pdf_path.stem}.json").write_text(
(sources_dir / f"{source_name}.json").write_text(
json_mod.dumps(all_pages, ensure_ascii=False, indent=2), encoding="utf-8",
)

# Write wiki/summaries/ (no images, just summaries)
summaries_dir = kb_dir / "wiki" / "summaries"
summaries_dir.mkdir(parents=True, exist_ok=True)
summary_md = render_summary_md(tree, pdf_path.stem, doc_id)
(summaries_dir / f"{pdf_path.stem}.md").write_text(summary_md, encoding="utf-8")
summary_md = render_summary_md(tree, source_name, doc_id)
(summaries_dir / f"{source_name}.md").write_text(summary_md, encoding="utf-8")

return IndexResult(doc_id=doc_id, description=description, tree=tree)
24 changes: 24 additions & 0 deletions openkb/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,17 @@ def all_entries(self) -> dict[str, dict]:
"""Return a shallow copy of all hash -> metadata entries."""
return dict(self._data)

def get_by_path(self, path: str) -> dict | None:
"""Return metadata registered for a source, raw, or wiki path."""
for metadata in self._data.values():
if path in {
metadata.get("path"),
metadata.get("raw_path"),
metadata.get("source_path"),
}:
return metadata
return None

# ------------------------------------------------------------------
# Mutation
# ------------------------------------------------------------------
Expand All @@ -41,6 +52,19 @@ def add(self, file_hash: str, metadata: dict) -> None:
self._data[file_hash] = metadata
self._persist()

def remove_by_doc_name(self, doc_name: str) -> None:
"""Remove older content-hash entries for the same document."""
stale_hashes = [
file_hash
for file_hash, metadata in self._data.items()
if metadata.get("doc_name") == doc_name
]
if not stale_hashes:
return
for file_hash in stale_hashes:
del self._data[file_hash]
self._persist()

# ------------------------------------------------------------------
# Internal
# ------------------------------------------------------------------
Expand Down
8 changes: 8 additions & 0 deletions tests/test_add_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,9 @@ def test_add_short_doc_runs_compiler(self, tmp_path):
mock_result = ConvertResult(
raw_path=kb_dir / "raw" / "test.md",
source_path=source_path,
doc_name="test-deadbeef00",
is_long_doc=False,
file_hash="deadbeef00" * 8,
)

runner = CliRunner()
Expand All @@ -149,3 +151,9 @@ def test_add_short_doc_runs_compiler(self, tmp_path):
result = runner.invoke(cli, ["add", str(doc)])
mock_arun.assert_called_once()
assert "OK" in result.output

hashes = json.loads((kb_dir / ".openkb" / "hashes.json").read_text())
assert hashes[mock_result.file_hash]["doc_name"] == "test-deadbeef00"
assert hashes[mock_result.file_hash]["path"] == "test.md"
assert hashes[mock_result.file_hash]["raw_path"] == "raw/test.md"
assert hashes[mock_result.file_hash]["source_path"] == "wiki/sources/test.md"
78 changes: 77 additions & 1 deletion tests/test_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

import pytest

from openkb.converter import ConvertResult, convert_document, get_pdf_page_count
from openkb.converter import ConvertResult, _make_doc_name, convert_document, get_pdf_page_count


# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -42,7 +42,9 @@ def test_md_file_copied_to_wiki_sources(self, kb_dir):

assert result.skipped is False
assert result.is_long_doc is False
assert result.doc_name == _make_doc_name(src, kb_dir)
assert result.source_path is not None
assert result.source_path.name == f"{result.doc_name}.md"
assert result.source_path.exists()
assert result.source_path.read_text(encoding="utf-8").startswith("# Notes")

Expand Down Expand Up @@ -72,8 +74,59 @@ def test_md_raw_file_copied(self, kb_dir):
result = convert_document(src, kb_dir)

assert result.raw_path is not None
assert result.raw_path.name == f"{result.doc_name}.md"
assert result.raw_path.exists()

def test_same_filename_different_paths_get_distinct_outputs(self, kb_dir):
"""Same basename in different folders must not overwrite outputs."""
first_dir = kb_dir / "inputs" / "first"
second_dir = kb_dir / "inputs" / "second"
first_dir.mkdir(parents=True)
second_dir.mkdir(parents=True)
first = first_dir / "report.md"
second = second_dir / "report.md"
first.write_text("# First\n\nAlpha content.", encoding="utf-8")
second.write_text("# Second\n\nBeta content.", encoding="utf-8")

first_result = convert_document(first, kb_dir)
second_result = convert_document(second, kb_dir)

assert first_result.doc_name != second_result.doc_name
assert first_result.source_path != second_result.source_path
assert first_result.raw_path != second_result.raw_path
assert first_result.source_path.read_text(encoding="utf-8").startswith("# First")
assert second_result.source_path.read_text(encoding="utf-8").startswith("# Second")

def test_raw_copy_keeps_doc_name_when_content_changes(self, kb_dir):
"""A watched raw copy keeps the original document identity across edits."""
from openkb.state import HashRegistry

src = kb_dir / "inputs" / "notes.md"
src.parent.mkdir(parents=True)
src.write_text("# Notes\n\nOld content.", encoding="utf-8")

first_result = convert_document(src, kb_dir)
registry = HashRegistry(kb_dir / ".openkb" / "hashes.json")
registry.add(
first_result.file_hash,
{
"name": src.name,
"doc_name": first_result.doc_name,
"path": "inputs/notes.md",
"raw_path": f"raw/{first_result.doc_name}.md",
"source_path": f"wiki/sources/{first_result.doc_name}.md",
"type": "md",
},
)

first_result.raw_path.write_text("# Notes\n\nNew content.", encoding="utf-8")
second_result = convert_document(first_result.raw_path, kb_dir)

assert second_result.skipped is False
assert second_result.doc_name == first_result.doc_name
assert second_result.source_path == first_result.source_path
assert second_result.source_path.read_text(encoding="utf-8").startswith("# Notes\n\nNew content.")


# ---------------------------------------------------------------------------
# convert_document — PDF short doc
Expand Down Expand Up @@ -104,6 +157,29 @@ def test_short_pdf_converted_via_pymupdf(self, kb_dir, tmp_path):
assert result.source_path is not None
assert result.source_path.exists()

def test_same_stem_different_extensions_get_distinct_outputs(self, kb_dir, tmp_path):
"""report.md and report.pdf must not share the same internal name."""
md_src = tmp_path / "report.md"
pdf_src = tmp_path / "report.pdf"
md_src.write_text("# Markdown report", encoding="utf-8")
pdf_src.write_bytes(b"%PDF-1.4 fake content")

md_result = convert_document(md_src, kb_dir)
with (
patch("openkb.converter.pymupdf.open") as mock_mu,
patch("openkb.converter.convert_pdf_with_images", return_value="# PDF report"),
):
fake_doc = MagicMock()
fake_doc.page_count = 5
fake_doc.__enter__ = MagicMock(return_value=fake_doc)
fake_doc.__exit__ = MagicMock(return_value=False)
mock_mu.return_value = fake_doc
pdf_result = convert_document(pdf_src, kb_dir)

assert md_result.doc_name != pdf_result.doc_name
assert md_result.source_path != pdf_result.source_path
assert md_result.raw_path != pdf_result.raw_path


# ---------------------------------------------------------------------------
# convert_document — PDF long doc
Expand Down
Loading