From c51cd07b521817b8545d040f2c54ce32234cd3e3 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Wed, 22 Apr 2026 16:12:09 +0800 Subject: [PATCH 1/2] docs(README): update project description and simplify documentation - Change title from "Reasoning-based Document Engine" to "Document Understanding Engine for AI" - Update subtitle to reflect new focus on "Think, then answer" - Replace detailed comparison table with concise description of core functionality - Rename "Agent-Based Retrieval" to "Agent-Based Understanding" - Remove redundant features and supported documents sections to streamline documentation --- README.md | 41 +++++------------------------------------ 1 file changed, 5 insertions(+), 36 deletions(-) diff --git a/README.md b/README.md index c8391d1..613c70e 100644 --- a/README.md +++ b/README.md @@ -2,8 +2,8 @@ Vectorless -

Reasoning-based Document Engine

-
Reason, don't vector · Structure, not chunks · Agents, not embeddings
+

Document Understanding Engine for AI

+
Reason, don't vector · Structure, not chunks · Think, then answer
[![PyPI](https://img.shields.io/pypi/v/vectorless.svg)](https://pypi.org/project/vectorless/) [![PyPI Downloads](https://static.pepy.tech/badge/vectorless/month)](https://pepy.tech/projects/vectorless) @@ -14,29 +14,15 @@ -**Vectorless** is a reasoning-native document engine written in Rust. It compiles documents into navigable trees, then dispatches **multiple agents** to find exactly what's relevant across your **PDFs, Markdown, reports, contracts**. No embeddings, no chunking, no approximate nearest neighbors. Every retrieval is a **reasoning** act. +**Vectorless** is a document understanding engine for AI. It reads documents as structured trees of meaning, then dispatches multiple agents to reason through headings, sections, and paragraphs — understanding how each part relates to the whole. The problem it solves is not "where to look", but "what does this mean in context". Every understanding is a **reasoning** act. Light up a star and shine with us! ⭐ ## Three Rules -- **Reason, don't vector.** Retrieval is a reasoning act, not a similarity computation. +- **Reason, don't vector.** Understanding is reasoning, not similarity. - **Model fails, we fail.** No heuristic fallbacks, no silent degradation. - **No thought, no answer.** Only reasoned output counts as an answer. -## Why Vectorless - -Traditional RAG systems split documents into chunks, embed them into vectors, and retrieve by similarity. Vectorless takes a different approach: it preserves document structure as a navigable tree and lets agents reason through it. - -| | Embedding-Based RAG | Vectorless | -|---|---|---| -| **Indexing** | Chunk → embed → vector store | Parse → compile → document tree | -| **Retrieval** | Cosine similarity (approximate) | Multi-agent navigation (exact) | -| **Structure** | Destroyed by chunking | Preserved as first-class tree | -| **Query handling** | Keyword/similarity match | Intent classification + decomposition | -| **Multi-hop reasoning** | Not supported | Orchestrator replans dynamically | -| **Output** | Retrieved chunks | Original text passages, exact | -| **Failure mode** | Silent degradation | Explicit — no reasoning, no answer | - ## How It Works ### Four-Artifact Index Architecture @@ -60,7 +46,7 @@ DocumentTree NavigationIndex ReasoningIndex Do This separation means the agent makes routing decisions from lightweight metadata, not by scanning full content. -### Agent-Based Retrieval +### Agent-Based Understanding ``` Engine.query("What drove the revenue decline?") @@ -114,23 +100,6 @@ async def main(): asyncio.run(main()) ``` -## Key Features - -- **Rust Core** — The entire engine (indexing, retrieval, agent, storage) is implemented in Rust for performance and reliability. Python SDK via PyO3 bindings and a CLI are also provided. -- **Multi-Agent Retrieval** — Every query is handled by multiple cooperating agents: an Orchestrator plans and evaluates, Workers navigate documents. Each retrieval is a reasoning act — not a similarity score, but a sequence of LLM decisions about where to look, what to read, and when to stop. -- **Zero Vectors** — No embedding model, no vector store, no similarity search. This eliminates a class of failure modes: wrong chunk boundaries, stale embeddings, and similarity-score false positives. -- **Tree Navigation** — Documents are compiled into hierarchical trees that preserve the original structure — headings, sections, paragraphs, lists. Workers navigate this tree the way a human would: scan the table of contents, jump to the relevant section, read the passage. -- **Document-Exact Output** — Returns original text passages from the source document. No synthesis, no rewriting, no hallucinated content. What you get is what was written. -- **Multi-Document Orchestration** — Query across multiple documents with a single call. The Orchestrator dispatches Workers, evaluates evidence, and fuses results. When one document is insufficient, it replans and expands the search scope. -- **Query Understanding** — Every query passes through LLM-based intent classification, concept extraction, and strategy selection. Complex queries are decomposed into sub-queries. The system adapts its navigation strategy based on whether the query is factual, analytical, comparative, or navigational. -- **Checkpointable Pipeline** — The 8-stage compile pipeline writes checkpoints at each stage. If indexing is interrupted (LLM rate limit, network failure), it resumes from the last completed stage — no wasted work. -- **Incremental Updates** — Content fingerprinting detects changes at the node level. Re-indexing a modified document only recompiles the changed sections and their dependents. - -## Supported Documents - -- **PDF** — Full text extraction with page metadata -- **Markdown** — Structure-aware parsing (headings, lists, code blocks) - ## Resources - [Documentation](https://vectorless.dev) — Guides, architecture, API reference From 9ca2fc35d322c59174402c6b403e2b406703e097 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Wed, 22 Apr 2026 16:34:45 +0800 Subject: [PATCH 2/2] docs: update project descriptions and terminology - Replace "reasoning-native" and "reasoning-based" with "document understanding" throughout documentation - Update README.md to clarify that Vectorless compiles documents into structured trees and provides reasoning acts, not retrieval results - Change "Fusion" to "Synthesis" in architecture diagram - Update blog post tags from [vectorless, rag, llm, announcement] to [vectorless, document-understanding, llm, ai, announcement] - Modify docusaurus config and homepage header/description to reflect new positioning as "Document Understanding Engine for AI" - Streamline feature descriptions and remove redundant phrases --- README.md | 6 +++--- docs/blog/2026-04-12-welcome/index.mdx | 10 +++++----- docs/docs/intro.mdx | 8 +++----- docs/docusaurus.config.ts | 2 +- docs/src/pages/index.tsx | 4 ++-- 5 files changed, 14 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 613c70e..b42dc24 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ Vectorless

Document Understanding Engine for AI

-
Reason, don't vector · Structure, not chunks · Think, then answer
+

Reason, don't vector · Structure, not chunks · Think, then answer

[![PyPI](https://img.shields.io/pypi/v/vectorless.svg)](https://pypi.org/project/vectorless/) [![PyPI Downloads](https://static.pepy.tech/badge/vectorless/month)](https://pepy.tech/projects/vectorless) @@ -14,7 +14,7 @@ -**Vectorless** is a document understanding engine for AI. It reads documents as structured trees of meaning, then dispatches multiple agents to reason through headings, sections, and paragraphs — understanding how each part relates to the whole. The problem it solves is not "where to look", but "what does this mean in context". Every understanding is a **reasoning** act. +**Vectorless** is a document understanding engine for AI. It compiles documents into structured trees of meaning, then dispatches multiple agents to reason through headings, sections, and paragraphs — evaluating how each part relates to the whole. The problem it solves is not "where to look", but "what does this mean in context". Every answer is a reasoning act, not a retrieval result. Light up a star and shine with us! ⭐ @@ -60,7 +60,7 @@ Engine.query("What drove the revenue decline?") │ │ │ └─ evaluate ── insufficient? → replan → dispatch new paths → loop │ - └─ Fusion ── dedup, LLM-scored relevance, return with source attribution + └─ Synthesis ── dedup, evidence scoring, reasoned answer with source chain ``` Worker navigation commands: diff --git a/docs/blog/2026-04-12-welcome/index.mdx b/docs/blog/2026-04-12-welcome/index.mdx index fea3e94..bd30147 100644 --- a/docs/blog/2026-04-12-welcome/index.mdx +++ b/docs/blog/2026-04-12-welcome/index.mdx @@ -2,22 +2,22 @@ slug: welcome title: Welcome to Vectorless authors: [zTgx] -tags: [vectorless, rag, llm, announcement] +tags: [vectorless, document-understanding, llm, ai, announcement] --- -Vectorless is a reasoning-native document intelligence engine written in Rust — **no vector database, no embeddings, no similarity search**. +Vectorless is a document understanding engine for AI. It compiles documents into structured trees of meaning, then dispatches multiple agents to reason through headings, sections, and paragraphs — evaluating how each part relates to the whole. The problem it solves is not "where to look", but "what does this mean in context". Every answer is a reasoning act, not a retrieval result. {/* truncate */} ## Why Vectorless? -Traditional RAG systems rely on vector embeddings and similarity search. This approach loses document structure, requires a vector database, and often returns chunks that lack context. +Understanding a document requires more than finding keywords — it requires navigating structure, cross-referencing sections, and evaluating whether the evidence is sufficient. Vectorless agents do exactly this: they reason through documents the way a human expert would. -Vectorless takes a different path: +Key capabilities: - **Hierarchical Semantic Trees** — Documents are parsed into a tree of sections, preserving structure and relationships. - **LLM Agent Navigation** — Queries are resolved by agents that navigate the tree using commands (ls, cd, cat, find, grep), making every decision through LLM reasoning. -- **Zero Infrastructure** — No vector DB, no embedding models, no similarity search. Just an LLM API key. +- **Zero Infrastructure** — Just an LLM API key, nothing else to deploy. ## Quick Start diff --git a/docs/docs/intro.mdx b/docs/docs/intro.mdx index beb3c30..eb13c61 100644 --- a/docs/docs/intro.mdx +++ b/docs/docs/intro.mdx @@ -4,9 +4,7 @@ sidebar_position: 1 # Introduction -**Vectorless** is a reasoning-native document intelligence engine written in Rust — **no vector database, no embeddings, no similarity search**. - -It transforms documents into hierarchical semantic trees and uses LLMs to navigate the structure, retrieving the most relevant content through deep contextual understanding instead of vector math. +**Vectorless** is a document understanding engine for AI. It compiles documents into structured trees of meaning, then dispatches multiple agents to reason through headings, sections, and paragraphs — evaluating how each part relates to the whole. The problem it solves is not "where to look", but "what does this mean in context". Every answer is a reasoning act, not a retrieval result. ## How It Works @@ -76,7 +74,7 @@ async fn main() -> vectorless::Result<()> { ## Features - **Hierarchical Semantic Trees** — Preserves document structure, not flat chunks -- **LLM-Powered Agent Navigation** — Worker agents navigate the tree using commands (ls, cd, cat, find, grep), making every retrieval decision through LLM reasoning +- **LLM-Powered Agent Navigation** — Worker agents navigate the tree using commands (ls, cd, cat, find, grep), making every decision through LLM reasoning - **Cross-Reference Resolution** — Automatically resolves "see Section 2.1", "Appendix G" references during indexing - **Synonym Expansion** — LLM-generated synonyms for indexed keywords improve recall for differently-worded queries - **Orchestrator Supervisor Loop** — Multi-document queries are coordinated by an LLM supervisor that dispatches Workers, evaluates evidence, and replans when needed @@ -84,4 +82,4 @@ async fn main() -> vectorless::Result<()> { - **Incremental Indexing** — Content fingerprinting skips unchanged files - **DocCard Catalog** — Lightweight document metadata index enables fast multi-document analysis without loading full documents - **Multi-Format** — Markdown and PDF support -- **Zero Infrastructure** — No vector DB, no embedding models, just an LLM API key +- **Zero Infrastructure** — Just an LLM API key, nothing else to deploy diff --git a/docs/docusaurus.config.ts b/docs/docusaurus.config.ts index 105e09c..76f4f87 100644 --- a/docs/docusaurus.config.ts +++ b/docs/docusaurus.config.ts @@ -6,7 +6,7 @@ import type * as Preset from '@docusaurus/preset-classic'; const config: Config = { title: 'Vectorless', - tagline: 'Reasoning-based Document Engine', + tagline: 'Document Understanding Engine for AI', favicon: 'img/favicon.ico', future: { diff --git a/docs/src/pages/index.tsx b/docs/src/pages/index.tsx index e9863ec..d506d45 100644 --- a/docs/src/pages/index.tsx +++ b/docs/src/pages/index.tsx @@ -42,7 +42,7 @@ function HomepageHeader() { {/* Left: Brand + Features */}

Vectorless

-

Reasoning-native Document Engine

+

Document Understanding Engine for AI

@@ -119,7 +119,7 @@ export default function Home(): ReactNode { return ( + description="Document understanding engine for AI. Agents reason through your documents — navigating structure, reading passages, cross-referencing across sections.">