From 2247032c81cf596dbe48f2f59809ad2c2a7af70e Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Tue, 21 Apr 2026 09:54:01 +0800 Subject: [PATCH 1/3] docs(README): update project documentation with detailed architecture explanation - rewrite principles section with clearer descriptions - add comprehensive "Why Vectorless" comparison table - document four-artifact index architecture with diagrams - explain agent-based retrieval workflow and navigation commands - update features list with multi-agent retrieval and tree navigation details - reorganize sections from "What It's For" to "Key Features" and "Supported Documents" - add resources section with links to documentation and packages docs(CLAUDE): refine project principles with better wording --- CLAUDE.md | 6 ++-- README.md | 105 ++++++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 97 insertions(+), 14 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 1a2b8d3..e423081 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -4,9 +4,9 @@ Vectorless is a reasoning-native document intelligence engine written in Rust. ## Principles -- **Reason, don't vector.** — Every retrieval decision is an LLM decision. -- **Model fails, we fail.** — No silent degradation. No heuristic fallbacks. -- **No thought, no answer.** — Only LLM-reasoned output counts as an answer. +- **Reason, don't vector.** Retrieval is a reasoning act, not a similarity computation. +- **Model fails, we fail.** No heuristic fallbacks, no silent degradation. +- **No thought, no answer.** Only reasoned output counts as an answer. ## Project Structure diff --git a/README.md b/README.md index 8e682a3..4d835c2 100644 --- a/README.md +++ b/README.md @@ -13,10 +13,82 @@ -**Reason, don't vector.** +**Reason, don't vector.** · **Structure, not chunks.** · **Agents, not embeddings.** · **Exact, not synthesized.** -**Vectorless** is a reasoning-based document engine with the core written in Rust. It will reason through any of your structured documents — **PDFs, Markdown, reports, contracts** — and retrieve only what's relevant. Nothing more, nothing less. +**Vectorless** is a reasoning-native document engine written in Rust. It compiles documents into navigable trees, then dispatches **multiple agents** to find exactly what's relevant across your **PDFs, Markdown, reports, contracts**. No embeddings, no chunking, no approximate nearest neighbors. Every retrieval is a **reasoning** act. +Three rules govern every decision in this system. **No exceptions**: + +- **Reason, don't vector.** Retrieval is a reasoning act, not a similarity computation. +- **Model fails, we fail.** No heuristic fallbacks, no silent degradation. +- **No thought, no answer.** Only reasoned output counts as an answer. + +Light up a star and shine with us! ⭐ + +## Why Vectorless + +Traditional RAG systems split documents into chunks, embed them into vectors, and retrieve by similarity. Vectorless takes a different approach: it preserves document structure as a navigable tree and lets agents reason through it. + +| | Embedding-Based RAG | Vectorless | +|---|---|---| +| **Indexing** | Chunk → embed → vector store | Parse → compile → document tree | +| **Retrieval** | Cosine similarity (approximate) | Multi-agent navigation (exact) | +| **Structure** | Destroyed by chunking | Preserved as first-class tree | +| **Query handling** | Keyword/similarity match | Intent classification + decomposition | +| **Multi-hop reasoning** | Not supported | Orchestrator replans dynamically | +| **Output** | Retrieved chunks | Original text passages, exact | +| **Failure mode** | Silent degradation | Explicit — no reasoning, no answer | + +## How It Works + +### Four-Artifact Index Architecture + +When a document is indexed, the compile pipeline builds four artifacts: + +``` +Content Layer Navigation Layer Reasoning Index Document Card +DocumentTree NavigationIndex ReasoningIndex DocCard +(TreeNode) (NavEntry, ChildRoute) (topic_paths, hot_nodes) (title, overview, + │ │ │ question hints) + │ │ │ │ + Agent reads Agent reads every Agent's targeted Orchestrator reads + only on cat decision round search tool (grep) for multi-doc routing +``` + +- **Content Layer** — The raw document tree. The agent only accesses this when reading specific paragraphs (`cat`). +- **Navigation Layer** — Each non-leaf node stores an overview, question hints, and child routes (title + description). The agent reads this every round to decide where to go next. +- **Reasoning Index** — Keyword-topic mappings with weights. Provides the agent's `grep` tool with structured keyword data for targeted search within a document. +- **DocCard** — A compact document-level summary. The Orchestrator reads DocCards to decide which documents to navigate in multi-document queries, without loading full documents. + +This separation means the agent makes routing decisions from lightweight metadata, not by scanning full content. + +### Agent-Based Retrieval + +``` +Engine.query("What drove the revenue decline?") + │ + ├─ Query Understanding ── intent, concepts, strategy (LLM) + │ + ├─ Orchestrator ── analyzes query, dispatches Workers + │ │ + │ ├─ Worker 1 ── ls → cd "Financials" → ls → cd "Revenue" → cat + │ └─ Worker 2 ── ls → cd "Risk Factors" → grep "decline" → cat + │ │ + │ └─ evaluate ── insufficient? → replan → dispatch new paths → loop + │ + └─ Fusion ── dedup, LLM-scored relevance, return with source attribution +``` + +Worker navigation commands: + +| Command | Action | Reads | +|---------|--------|-------| +| `ls` | List child sections | Navigation Layer (ChildRoute) | +| `cd` | Enter a child section | Navigation Layer | +| `cat` | Read content at current node | Content Layer (DocumentTree) | +| `grep` | Search by keyword | Reasoning Index (topic_paths) | + +The Orchestrator evaluates Worker results after each round. If evidence is insufficient, it **replans** — adjusting strategy, dispatching new paths, or deepening exploration. This continues until enough evidence is collected. ## Quick Start @@ -44,19 +116,30 @@ async def main(): asyncio.run(main()) ``` -## What It's For +## Key Features + +- **Rust Core** — The entire engine (indexing, retrieval, agent, storage) is implemented in Rust for performance and reliability. Python SDK via PyO3 bindings and a CLI are also provided. +- **Multi-Agent Retrieval** — Every query is handled by multiple cooperating agents: an Orchestrator plans and evaluates, Workers navigate documents. Each retrieval is a reasoning act — not a similarity score, but a sequence of LLM decisions about where to look, what to read, and when to stop. +- **Zero Vectors** — No embedding model, no vector store, no similarity search. This eliminates a class of failure modes: wrong chunk boundaries, stale embeddings, and similarity-score false positives. +- **Tree Navigation** — Documents are compiled into hierarchical trees that preserve the original structure — headings, sections, paragraphs, lists. Workers navigate this tree the way a human would: scan the table of contents, jump to the relevant section, read the passage. +- **Document-Exact Output** — Returns original text passages from the source document. No synthesis, no rewriting, no hallucinated content. What you get is what was written. +- **Multi-Document Orchestration** — Query across multiple documents with a single call. The Orchestrator dispatches Workers, evaluates evidence, and fuses results. When one document is insufficient, it replans and expands the search scope. +- **Query Understanding** — Every query passes through LLM-based intent classification, concept extraction, and strategy selection. Complex queries are decomposed into sub-queries. The system adapts its navigation strategy based on whether the query is factual, analytical, comparative, or navigational. +- **Checkpointable Pipeline** — The 8-stage compile pipeline writes checkpoints at each stage. If indexing is interrupted (LLM rate limit, network failure), it resumes from the last completed stage — no wasted work. +- **Incremental Updates** — Content fingerprinting detects changes at the node level. Re-indexing a modified document only recompiles the changed sections and their dependents. -Vectorless is designed for applications that need **precise** document retrieval: +## Supported Documents -- **Financial analysis** — Extract specific figures from reports, compare across filings -- **Legal research** — Find relevant clauses, trace definitions across documents -- **Technical documentation** — Navigate large manuals, locate specific procedures -- **Academic research** — Cross-reference findings across papers -- **Compliance** — Audit trails with source references for every answer +- **PDF** — Full text extraction with page metadata +- **Markdown** — Structure-aware parsing (headings, lists, code blocks) -## Examples +## Resources -See [examples/](examples/) for complete usage patterns. +- [Documentation](https://vectorless.dev) — Guides, architecture, API reference +- [Rust API Docs](https://docs.rs/vectorless) — Auto-generated crate documentation +- [PyPI](https://pypi.org/project/vectorless/) — Python package +- [Crates.io](https://crates.io/crates/vectorless) — Rust crate +- [Examples](examples/) — Complete usage patterns for Python and Rust ## Contributing From b387150be7ebacdad22e61063058fc4ab1c0f4c1 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Tue, 21 Apr 2026 09:56:27 +0800 Subject: [PATCH 2/3] docs(README): add core principles and restructure content - Add core principles banner at the top of README - Reorganize content with clear "Three Rules" section - Move project description after principles - Maintain star call-to-action in updated structure --- README.md | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 4d835c2..e979851 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,7 @@ Vectorless

Reasoning-based Document Engine

+**Reason, don't vector.** · **Structure, not chunks.** · **Agents, not embeddings.** · **Exact, not synthesized.** [![PyPI](https://img.shields.io/pypi/v/vectorless.svg)](https://pypi.org/project/vectorless/) [![PyPI Downloads](https://static.pepy.tech/badge/vectorless/month)](https://pepy.tech/projects/vectorless) @@ -13,18 +14,15 @@ -**Reason, don't vector.** · **Structure, not chunks.** · **Agents, not embeddings.** · **Exact, not synthesized.** - **Vectorless** is a reasoning-native document engine written in Rust. It compiles documents into navigable trees, then dispatches **multiple agents** to find exactly what's relevant across your **PDFs, Markdown, reports, contracts**. No embeddings, no chunking, no approximate nearest neighbors. Every retrieval is a **reasoning** act. -Three rules govern every decision in this system. **No exceptions**: +Light up a star and shine with us! ⭐ +## Three Rules - **Reason, don't vector.** Retrieval is a reasoning act, not a similarity computation. - **Model fails, we fail.** No heuristic fallbacks, no silent degradation. - **No thought, no answer.** Only reasoned output counts as an answer. -Light up a star and shine with us! ⭐ - ## Why Vectorless Traditional RAG systems split documents into chunks, embed them into vectors, and retrieve by similarity. Vectorless takes a different approach: it preserves document structure as a navigable tree and lets agents reason through it. From 40829e228bd8dca085e416591b8a3848826eda65 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Tue, 21 Apr 2026 10:29:19 +0800 Subject: [PATCH 3/3] feat(docs): update homepage with three core principles and key features - Add new "Three rules" section highlighting core principles - Implement responsive rule cards component with design styling - Create key features grid displaying six main capabilities - Update hero tagline to emphasize reasoning acts - Simplify code examples showing minimal usage - Remove detailed output information from terminal displays - Add format pills for supported document types - Update tagline across documentation to "Reasoning-based" - Refine how-it-works section descriptions for clarity --- README.md | 2 +- docs/docusaurus.config.ts | 2 +- docs/src/pages/index.module.css | 126 +++++++++++++++++++++++++++++- docs/src/pages/index.tsx | 134 +++++++++++++++++++++++--------- 4 files changed, 222 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index e979851..ad6de21 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ Vectorless

Reasoning-based Document Engine

-**Reason, don't vector.** · **Structure, not chunks.** · **Agents, not embeddings.** · **Exact, not synthesized.** +
Reason, don't vector · Structure, not chunks · Agents, not embeddings · Exact, not synthesized
[![PyPI](https://img.shields.io/pypi/v/vectorless.svg)](https://pypi.org/project/vectorless/) [![PyPI Downloads](https://static.pepy.tech/badge/vectorless/month)](https://pepy.tech/projects/vectorless) diff --git a/docs/docusaurus.config.ts b/docs/docusaurus.config.ts index b798b05..759c457 100644 --- a/docs/docusaurus.config.ts +++ b/docs/docusaurus.config.ts @@ -6,7 +6,7 @@ import type * as Preset from '@docusaurus/preset-classic'; const config: Config = { title: 'Vectorless', - tagline: 'Reasoning-native Document Intelligence Engine', + tagline: 'Reasoning-based Document Engine', favicon: 'img/favicon.ico', future: { diff --git a/docs/src/pages/index.module.css b/docs/src/pages/index.module.css index 5b963e9..c3025fa 100644 --- a/docs/src/pages/index.module.css +++ b/docs/src/pages/index.module.css @@ -78,6 +78,45 @@ flex-wrap: wrap; } +/* ===== Three Rules ===== */ +.rulesRow { + display: flex; + gap: 1.5rem; + justify-content: center; + flex-wrap: wrap; + max-width: 1000px; + margin: 0 auto; +} + +.ruleCard { + flex: 1; + min-width: 240px; + max-width: 320px; + background: var(--card-bg); + border: 1px solid var(--border); + border-radius: 16px; + padding: 2.25rem 2rem; + text-align: center; +} + +.ruleTitle { + font-size: 1.1rem; + font-weight: 700; + color: var(--primary-dark); + margin-bottom: 0.75rem; + font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif; +} + +[data-theme='dark'] .ruleTitle { + color: var(--primary); +} + +.ruleDesc { + font-size: 0.92rem; + line-height: 1.65; + color: var(--text-light); +} + /* GitHub Star button */ .githubStarButton { display: inline-flex; @@ -392,10 +431,72 @@ background: #D97706; } +/* ===== Format Pills ===== */ +.formatPills { + display: flex; + justify-content: center; + gap: 0.75rem; + margin-bottom: 2rem; +} + +.formatPill { + display: inline-flex; + align-items: center; + padding: 0.35rem 1rem; + border-radius: 20px; + font-size: 0.8rem; + font-weight: 600; + font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif; + letter-spacing: -0.2px; + background: var(--primary-soft); + color: var(--primary-dark); + border: 1px solid var(--primary); +} + +[data-theme='dark'] .formatPill { + color: var(--primary); +} + +/* ===== Key Features Grid ===== */ +.featureGrid { + display: grid; + grid-template-columns: repeat(3, 1fr); + gap: 1.5rem; + max-width: 1100px; + margin: 0 auto; +} + +.featureCard { + background: var(--card-bg); + border: 1px solid var(--border); + border-radius: 16px; + padding: 2rem 1.75rem; + transition: border-color 0.2s, box-shadow 0.2s; +} + +.featureCard:hover { + border-color: var(--primary); + box-shadow: 0 4px 20px rgba(245, 158, 11, 0.08); +} + +.featureTitle { + font-size: 1.1rem; + font-weight: 700; + color: var(--text); + margin: 0 0 0.75rem; +} + +.featureDesc { + font-size: 0.92rem; + line-height: 1.65; + color: var(--text-light); + margin: 0; +} + /* ===== Navigation Theater ===== */ .narrativeDemo { - background: var(--code-bg); - border: 1px solid var(--border); + background: #161A1F; + border: 1px solid #252A30; border-radius: 16px; padding: 2rem 2.5rem; max-width: 780px; @@ -440,7 +541,7 @@ top: 24px; bottom: 24px; width: 2px; - background: #2A3040; + background: #252A30; border-radius: 1px; } @@ -473,7 +574,7 @@ height: 10px; border-radius: 50%; background: var(--primary); - border: 2px solid var(--code-bg); + border: 2px solid #161A1F; z-index: 1; } @@ -787,6 +888,10 @@ .section { padding: 3.5rem 1.5rem; } + + .featureGrid { + grid-template-columns: repeat(2, 1fr); + } } @media screen and (max-width: 600px) { @@ -826,4 +931,17 @@ .sectionTitle { font-size: 1.5rem; } + + .featureGrid { + grid-template-columns: 1fr; + } + + .rulesRow { + flex-direction: column; + align-items: center; + } + + .ruleCard { + max-width: 100%; + } } diff --git a/docs/src/pages/index.tsx b/docs/src/pages/index.tsx index 5bad087..ca30948 100644 --- a/docs/src/pages/index.tsx +++ b/docs/src/pages/index.tsx @@ -20,7 +20,7 @@ function HomepageHeader() { Vectorless will reason through any of your structured documents — PDFs, Markdown, reports, contracts,
- and retrieve only what's relevant. Nothing more, nothing less. + and retrieve only what's relevant. Every retrieval is a reasoning act.

vectorless::Result<()> { let result = engine.index(IndexContext::from_path("./report.pdf")).await?; let doc_id = result.doc_id().unwrap(); - // Query with evidence and metrics + // Query let result = engine.query( QueryContext::new("What is the total revenue?") .with_doc_ids(vec![doc_id.to_string()]) ).await?; - let item = result.single().unwrap(); - println!("Answer: {}", item.content); - println!("Score: {:.2} Confidence: {:?}", item.score, item.confidence); - for ev in &item.evidence { - println!(" [{}] {}", ev.title, ev.path); - } - if let Some(m) = &item.metrics { - println!("LLM calls: {} Rounds: {}", m.llm_calls, m.rounds_used); - } + println!("{}", result.content); Ok(()) }`; @@ -156,6 +141,35 @@ function RustCode() { return
{nodes}
; } +function SectionThreeRules() { + return ( +
+
+ + Three rules. No exceptions. + +

+ Every decision in this system follows these principles. +

+
+
+
Reason, don't vector
+
Every retrieval is a reasoning act, not a similarity computation.
+
+
+
Model fails, we fail
+
No heuristic fallbacks. No silent degradation.
+
+
+
No thought, no answer
+
Only reasoned output counts as an answer.
+
+
+
+
+ ); +} + function SectionGetStarted() { const [activeTab, setActiveTab] = useState<'python' | 'rust'>('python'); const [copyLabel, setCopyLabel] = useState('Copy'); @@ -183,8 +197,12 @@ function SectionGetStarted() { Get Started

- Just a few lines of code to get up and running. + Three lines to index. One line to query.

+
+ PDF + Markdown +
{/* Tabs */}
@@ -214,11 +232,7 @@ function SectionGetStarted() {
$ python demo.py
- Answer:  The total revenue for fiscal year 2024 was $2.3 billion, a 15% increase YoY.
- Score:   0.91   Confidence: high
-   [Revenue Summary] Root/Financial Overview/Q3 2024
-   [Revenue Breakdown] Root/Financial Overview/Q3 2024
- LLM calls: 4   Rounds: 3
+ The total revenue for fiscal year 2024 was $2.3 billion, a 15% increase YoY.
@@ -238,11 +252,7 @@ function SectionGetStarted() {
$ cargo run
- Answer:  The total revenue for fiscal year 2024 was $2.3 billion, a 15% increase YoY.
- Score:   0.91   Confidence: High
-   [Revenue Summary] Root/Financial Overview/Q3 2024
-   [Revenue Breakdown] Root/Financial Overview/Q3 2024
- LLM calls: 4   Rounds: 3
+ The total revenue for fiscal year 2024 was $2.3 billion, a 15% increase YoY.
@@ -269,7 +279,7 @@ function SectionHowItWorks() { How does Vectorless work?

- You declare a few lines of code. We do everything else. + Documents are compiled into navigable trees. Multiple agents reason through them.

@@ -286,7 +296,7 @@ function SectionHowItWorks() { Index
- 3 documents indexed → hierarchical trees + NavigationIndex + ReasoningIndex built + 3 documents indexed → hierarchical trees + NavigationIndex + ReasoningIndex + DocCards built
{/* Step 2: Query */} @@ -304,7 +314,7 @@ function SectionHowItWorks() { Orchestrator · Analyze
- LLM understands query intent (complex, analytical) → reads DocCards → dispatches Worker to doc #1 + LLM understands query intent (complex, analytical) → reads DocCards → dispatches Worker to doc #1
{/* Step 4: Bird's-eye view */} @@ -346,7 +356,7 @@ function SectionHowItWorks() { {/* Step 8: Rerank + Synthesize */}
- Rerank pipeline: dedup → BM25 scoring (score: 0.87, confidence: high) → synthesis LLM generates cross-referenced answer. + Rerank pipeline: dedup → LLM-scored relevance (score: 0.87, confidence: high) → return original passages with source attribution.
{/* Step 9: Final Answer */}
@@ -364,6 +374,56 @@ function SectionHowItWorks() { ); } +const KEY_FEATURES = [ + { + title: 'Rust Core', + desc: 'The entire engine is implemented in Rust for performance and reliability. Python SDK and CLI also provided.', + }, + { + title: 'Multi-Agent Retrieval', + desc: 'An Orchestrator plans and evaluates. Workers navigate documents. Each retrieval is a reasoning act.', + }, + { + title: 'Zero Vectors', + desc: 'No embedding model, no vector store, no similarity search. Eliminates wrong chunk boundaries and stale embeddings.', + }, + { + title: 'Tree Navigation', + desc: 'Documents compiled into hierarchical trees. Workers navigate like a human: scan TOC, jump to section, read passage.', + }, + { + title: 'Document-Exact Output', + desc: 'Returns original text passages. No synthesis, no rewriting, no hallucinated content.', + }, + { + title: 'Incremental Updates', + desc: 'Content fingerprinting detects changes. Only recompiles modified sections. Checkpointable 8-stage pipeline.', + }, +]; + +function SectionKeyFeatures() { + return ( +
+
+ + Key Features + +

+ Reasoning-native, from the ground up. +

+
+ {KEY_FEATURES.map((f, i) => ( +
+ {f.title} +

{f.desc}

+
+ ))} +
+
+
+ ); +} + const USE_CASES = [ { title: 'Financial reports', @@ -535,11 +595,13 @@ export default function Home(): ReactNode { return ( + description="Reasoning-based document engine. No embeddings, no chunking. Multiple agents navigate your documents to find exactly what's relevant.">
+ +