From 7b76d65f83793057f1df88941464f84d31542a0f Mon Sep 17 00:00:00 2001 From: Rach Pradhan <54503978+justrach@users.noreply.github.com> Date: Tue, 5 May 2026 11:00:08 +0800 Subject: [PATCH 1/4] test(issue-393): failing test for BM25 ranking on content search Asserts Explorer.searchContentRanked exists and ranks a file densely covering all query terms above a file with a single-term mention plus noise files. Fails on main (function does not exist). Co-Authored-By: Claude Opus 4.7 (1M context) --- src/tests.zig | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/src/tests.zig b/src/tests.zig index d8f8c1c..6b083ef 100644 --- a/src/tests.zig +++ b/src/tests.zig @@ -9762,3 +9762,75 @@ test "issue-413: bundle truncation drops subsequent ops without telling the call // Either its result, or an explicit "[2]" entry noting it was dropped. try testing.expect(std.mem.indexOf(u8, out.items, "[2]") != null); } + +test "issue-393: BM25 ranking surfaces high-density file before single-mention file" { + // Multi-term content queries today return matches in scan order with only + // a per-line occurrence count tiebreaker (explore.zig:1674-1688). On a + // large repo this dumps every match with no notion of which *file* is the + // most relevant — a file that mentions every query term many times ranks + // identically to one that mentions a single term once. + // + // BM25 over the existing trigram + word index would score documents by + // (per-term tf * idf) with length normalization, so the file densely + // covering both terms surfaces above the noise file. + // + // Minimum surface contract: Explorer exposes `searchContentRanked` which + // takes a multi-term query and returns results ordered by descending + // BM25 score across files (highest-scoring document's match comes first). + var arena = std.heap.ArenaAllocator.init(testing.allocator); + defer arena.deinit(); + var explorer = Explorer.init(arena.allocator()); + + // dense.zig: hits both query terms many times across many lines. + try explorer.indexFile("src/dense.zig", + \\pub fn parseTokenStream() void { + \\ const token = nextToken(); + \\ parseToken(token); + \\ parseToken(token); + \\ parseToken(token); + \\ const stream = parseTokenStream(); + \\ parseTokenStream(); + \\ _ = token; + \\ _ = stream; + \\} + ); + // sparse.zig: mentions one term once, in passing. + try explorer.indexFile("src/sparse.zig", + \\pub fn unrelated() void { + \\ // a passing mention of parse here + \\ return; + \\} + ); + // Noise files dilute df-based scoring; BM25 must still rank dense first. + try explorer.indexFile("src/noise_a.zig", "pub fn a() void {}\n"); + try explorer.indexFile("src/noise_b.zig", "pub fn b() void {}\n"); + try explorer.indexFile("src/noise_c.zig", "pub fn c() void {}\n"); + + try testing.expect(@hasDecl(Explorer, "searchContentRanked")); + + const results = try explorer.searchContentRanked("parse Token", testing.allocator, 16); + defer { + for (results) |r| { + testing.allocator.free(r.line_text); + testing.allocator.free(r.path); + } + testing.allocator.free(results); + } + + try testing.expect(results.len > 0); + // Top-ranked result must come from the dense file. + try testing.expectEqualStrings("src/dense.zig", results[0].path); + // Score must be populated and strictly positive when ranking is on. 
+ try testing.expect(results[0].score > 0.0); + // Results must be sorted by score descending across distinct documents: + // the first dense.zig score must exceed the first sparse.zig score. + var dense_score: f32 = -1.0; + var sparse_score: f32 = -1.0; + for (results) |r| { + if (dense_score < 0 and std.mem.eql(u8, r.path, "src/dense.zig")) dense_score = r.score; + if (sparse_score < 0 and std.mem.eql(u8, r.path, "src/sparse.zig")) sparse_score = r.score; + } + if (sparse_score >= 0) { + try testing.expect(dense_score > sparse_score); + } +} From 1b177af904056671ba34a106ee874bfdefef9ec1 Mon Sep 17 00:00:00 2001 From: Rach Pradhan <54503978+justrach@users.noreply.github.com> Date: Tue, 5 May 2026 11:49:06 +0800 Subject: [PATCH 2/4] feat(index): BM25 ranking via Explorer.searchContentRanked (#400) Adds an additive ranked-content API on Explorer. Existing scan-order searchContent is unchanged. WordIndex tracks doc_lengths + total_tokens (BM25 length normalization); on-disk format bumps to v3 with a per-doc length trailer. Older v1/v2 word.index files are rejected so callers fall through to a clean rebuild. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/explore.zig | 181 ++++++++++++++++++++++++++++++++++++++++++++++++ src/index.zig | 70 ++++++++++++++++++- src/tests.zig | 35 ++++++++++ 3 files changed, 285 insertions(+), 1 deletion(-) diff --git a/src/explore.zig b/src/explore.zig index 4963c93..eb0f287 100644 --- a/src/explore.zig +++ b/src/explore.zig @@ -1690,6 +1690,187 @@ pub const Explorer = struct { return result_list.toOwnedSlice(allocator); } + /// BM25-ranked content search. Tokenizes the query the same way the word + /// index tokenizes documents, scores each candidate doc with BM25 + /// (k1=1.2, b=0.75), and emits one SearchResult per top-N document with + /// the best-tf line for any query term in that doc. Existing scan-order + /// `searchContent` is unaffected. + pub fn searchContentRanked(self: *Explorer, query: []const u8, allocator: std.mem.Allocator, max_results: usize) ![]const SearchResult { + self.mu.lockShared(); + defer self.mu.unlockShared(); + + if (max_results == 0) return try allocator.alloc(SearchResult, 0); + + // Tokenize the query the same way WordIndex tokenizes documents: + // lowercase + identifier-split. Dedupe terms so repeated query words + // don't double-count. + var term_arena = std.heap.ArenaAllocator.init(allocator); + defer term_arena.deinit(); + const ta = term_arena.allocator(); + + var terms_set = std.StringHashMap(void).init(ta); + var raw_tok = idx.WordTokenizer{ .buf = query }; + while (raw_tok.next()) |word| { + if (word.len < 2) continue; + const lower = try ta.alloc(u8, word.len); + for (word, 0..) |c, j| lower[j] = idx.normalizeChar(c); + _ = try terms_set.getOrPut(lower); + + var needs_split: bool = false; + if (word.len >= 4) { + for (word) |c| { + if (c == '_' or (c >= 'A' and c <= 'Z')) { + needs_split = true; + break; + } + } + } + if (needs_split) { + var sub_toks: std.ArrayList([]const u8) = .empty; + defer sub_toks.deinit(ta); + idx.splitIdentifier(word, &sub_toks, ta) catch continue; + for (sub_toks.items) |sub| { + if (sub.len < 2) continue; + _ = try terms_set.getOrPut(sub); + } + } + } + if (terms_set.count() == 0) return try allocator.alloc(SearchResult, 0); + + // BM25 constants. 
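+ // Per-doc score, summed over the unique query terms:
+ // score(d) = sum_t idf(t) * tf(t,d)*(k1+1) / (tf(t,d) + k1*(1 - b + b*dl(d)/avgdl))
+ // where tf(t,d) counts distinct (term, line) postings in d and dl(d) is
+ // the doc's indexed token count.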
+ const k1: f32 = 1.2;
+ const b: f32 = 0.75;
+ const N = self.word_index.fileCount();
+ if (N == 0) return try allocator.alloc(SearchResult, 0);
+ const avgdl = self.word_index.avgDocLength();
+
+ // Aggregate scores per doc and remember the best line (max term hits)
+ // for each candidate.
+ const DocAgg = struct {
+ score: f32,
+ best_line: u32,
+ best_line_hits: u32,
+ };
+ var per_doc = std.AutoHashMap(u32, DocAgg).init(ta);
+
+ // For each unique query term, look up its posting list once,
+ // compute df and per-doc tf in a single pass.
+ var term_iter = terms_set.keyIterator();
+ while (term_iter.next()) |term_ptr| {
+ const term = term_ptr.*;
+ const hits = self.word_index.search(term);
+ if (hits.len == 0) continue;
+
+ // df: distinct doc_ids in this posting list. tf: count of (term,doc)
+ // entries (each entry is a distinct line per indexFile dedup).
+ // doc_best_line: per-doc smallest line_num plus posting count, used
+ // for best-line picking.
+ var doc_tf = std.AutoHashMap(u32, u32).init(ta);
+ var doc_best_line = std.AutoHashMap(u32, struct { line: u32, count: u32 }).init(ta);
+ for (hits) |h| {
+ const tf_gop = try doc_tf.getOrPut(h.doc_id);
+ if (!tf_gop.found_existing) tf_gop.value_ptr.* = 0;
+ tf_gop.value_ptr.* += 1;
+
+ const ln_gop = try doc_best_line.getOrPut(h.doc_id);
+ if (!ln_gop.found_existing) {
+ ln_gop.value_ptr.* = .{ .line = h.line_num, .count = 1 };
+ } else {
+ // Each posting is a distinct line; still, prefer the
+ // smallest line_num as a deterministic representative.
+ if (h.line_num < ln_gop.value_ptr.line) {
+ ln_gop.value_ptr.line = h.line_num;
+ }
+ ln_gop.value_ptr.count += 1;
+ }
+ }
+ const df: u32 = @intCast(doc_tf.count());
+ // BM25 idf with the +1 smoothing variant: log(1 + (N - df + 0.5)/(df + 0.5))
+ const num: f32 = @as(f32, @floatFromInt(N)) - @as(f32, @floatFromInt(df)) + 0.5;
+ const den: f32 = @as(f32, @floatFromInt(df)) + 0.5;
+ const idf: f32 = @log(1.0 + num / den);
+
+ var tf_iter = doc_tf.iterator();
+ while (tf_iter.next()) |entry| {
+ const doc_id = entry.key_ptr.*;
+ const tf: f32 = @floatFromInt(entry.value_ptr.*);
+ const dl_raw = self.word_index.docLength(doc_id);
+ const dl: f32 = if (dl_raw == 0) 1.0 else @floatFromInt(dl_raw);
+ const norm = 1.0 - b + b * (dl / avgdl);
+ const term_score = idf * (tf * (k1 + 1.0)) / (tf + k1 * norm);
+
+ const ln_info = doc_best_line.get(doc_id) orelse continue;
+ const agg_gop = try per_doc.getOrPut(doc_id);
+ if (!agg_gop.found_existing) {
+ agg_gop.value_ptr.* = .{
+ .score = term_score,
+ .best_line = ln_info.line,
+ .best_line_hits = ln_info.count,
+ };
+ } else {
+ agg_gop.value_ptr.score += term_score;
+ if (ln_info.count > agg_gop.value_ptr.best_line_hits or
+ (ln_info.count == agg_gop.value_ptr.best_line_hits and ln_info.line < agg_gop.value_ptr.best_line))
+ {
+ agg_gop.value_ptr.best_line = ln_info.line;
+ agg_gop.value_ptr.best_line_hits = ln_info.count;
+ }
+ }
+ }
+ }
+ if (per_doc.count() == 0) return try allocator.alloc(SearchResult, 0);
+
+ const Cand = struct { doc_id: u32, score: f32, best_line: u32 };
+ var cands: std.ArrayList(Cand) = .empty;
+ defer cands.deinit(ta);
+ try cands.ensureTotalCapacity(ta, per_doc.count());
+ var pd_iter = per_doc.iterator();
+ while (pd_iter.next()) |entry| {
+ cands.appendAssumeCapacity(.{
+ .doc_id = entry.key_ptr.*,
+ .score = entry.value_ptr.score,
+ .best_line = entry.value_ptr.best_line,
+ });
+ }
+ std.sort.block(Cand, cands.items, {}, struct {
+ pub fn lt(_: void, a: Cand, b_: Cand) bool {
+ if (a.score != b_.score) return a.score > b_.score;
+ return 
a.doc_id < b_.doc_id; + } + }.lt); + + var result_list: std.ArrayList(SearchResult) = .empty; + errdefer { + for (result_list.items) |r| { + allocator.free(r.line_text); + allocator.free(r.path); + } + result_list.deinit(allocator); + } + try result_list.ensureTotalCapacity(allocator, @min(max_results, cands.items.len)); + + for (cands.items) |c| { + if (result_list.items.len >= max_results) break; + const path = self.word_index.id_to_path.items[c.doc_id]; + if (path.len == 0) continue; + const ref = self.readContentForSearch(path, allocator) orelse continue; + defer ref.deinit(); + const line_text = extractLineByNumber(ref.data, c.best_line) orelse continue; + const duped_text = try allocator.dupe(u8, line_text); + errdefer allocator.free(duped_text); + const duped_path = try allocator.dupe(u8, path); + errdefer allocator.free(duped_path); + try result_list.append(allocator, .{ + .path = duped_path, + .line_num = c.best_line, + .line_text = duped_text, + .score = c.score, + }); + } + + return result_list.toOwnedSlice(allocator); + } + + /// Search file contents using a regex pattern with trigram acceleration. /// Decomposes the regex to extract literal trigrams for candidate filtering, /// then does actual regex matching on candidates. diff --git a/src/index.zig b/src/index.zig index 8cf4b62..2bc6f44 100644 --- a/src/index.zig +++ b/src/index.zig @@ -20,6 +20,10 @@ pub const WordIndex = struct { enabled: bool = true, path_to_id: std.StringHashMap(u32), id_to_path: std.ArrayList([]const u8), + /// doc_id → number of tokens indexed for that doc (BM25 length normalization). + doc_lengths: std.AutoHashMap(u32, u32), + /// Sum of all values in doc_lengths. + total_tokens: u64 = 0, pub fn hitPath(self: *const WordIndex, hit: WordHit) []const u8 { if (hit.doc_id < self.id_to_path.items.len) return self.id_to_path.items[hit.doc_id]; @@ -41,6 +45,8 @@ pub const WordIndex = struct { .allocator = allocator, .path_to_id = std.StringHashMap(u32).init(allocator), .id_to_path = .empty, + .doc_lengths = std.AutoHashMap(u32, u32).init(allocator), + .total_tokens = 0, }; } @@ -69,6 +75,7 @@ pub const WordIndex = struct { self.path_to_id.deinit(); self.id_to_path.deinit(self.allocator); + self.doc_lengths.deinit(); } /// Remove all index entries for a file (call before re-indexing). @@ -86,6 +93,9 @@ pub const WordIndex = struct { if (doc_id < self.id_to_path.items.len) { self.id_to_path.items[doc_id] = ""; } + if (self.doc_lengths.fetchRemove(doc_id)) |kv| { + self.total_tokens -= kv.value; + } defer { self.allocator.free(words_slice); self.allocator.free(stable_path); @@ -162,12 +172,14 @@ pub const WordIndex = struct { var words_set = std.StringHashMap(void).init(words_arena.allocator()); var line_num: u32 = 0; var lines = std.mem.splitScalar(u8, content, '\n'); + var doc_token_count: u32 = 0; while (lines.next()) |line| { line_num += 1; var tok = WordTokenizer{ .buf = line }; while (tok.next()) |word| { if (word.len < 2) continue; + doc_token_count +|= 1; const aa = words_arena.allocator(); @@ -226,6 +238,9 @@ pub const WordIndex = struct { try self.file_words.put(stable_path, compact); } words_set.deinit(); + + try self.doc_lengths.put(doc_id, doc_token_count); + self.total_tokens += doc_token_count; } /// Look up all hits for a word. O(1) lookup + O(hits) iteration. @@ -311,6 +326,19 @@ pub const WordIndex = struct { return @intCast(self.file_words.count()); } + /// BM25 helper: number of indexed tokens in a doc, or 0 if unknown. 
+ pub fn docLength(self: *const WordIndex, doc_id: u32) u32 { + return self.doc_lengths.get(doc_id) orelse 0; + } + + /// BM25 helper: average doc length over docs that have a recorded length. + /// Returns 1.0 when no docs are tracked, so callers can divide safely. + pub fn avgDocLength(self: *const WordIndex) f32 { + const n = self.doc_lengths.count(); + if (n == 0) return 1.0; + return @as(f32, @floatFromInt(self.total_tokens)) / @as(f32, @floatFromInt(n)); + } + /// Shrink all hit lists and per-file word sets to release excess capacity. pub fn shrinkAllocations(self: *WordIndex) void { var iter = self.index.iterator(); @@ -327,7 +355,7 @@ pub const WordIndex = struct { }; const DISK_MAGIC = [4]u8{ 'C', 'D', 'B', 'W' }; - const DISK_FORMAT_VERSION: u16 = 2; + const DISK_FORMAT_VERSION: u16 = 3; pub fn writeToDisk(self: *WordIndex, io: std.Io, dir_path: []const u8, git_head: ?[40]u8) !void { var file_table: std.ArrayList([]const u8) = .empty; @@ -424,6 +452,27 @@ pub const WordIndex = struct { try writer.interface.writeAll(&hit_buf); } } + + // v3 trailer: per-doc length table for BM25. + // file_id (u32 disk-id) → length (u32). Total tokens follows as u64. + var dl_count_buf: [4]u8 = undefined; + std.mem.writeInt(u32, &dl_count_buf, @intCast(file_table.items.len), .little); + try writer.interface.writeAll(&dl_count_buf); + for (file_table.items) |path| { + const in_mem_id = self.path_to_id.get(path) orelse { + var z: [4]u8 = .{ 0, 0, 0, 0 }; + try writer.interface.writeAll(&z); + continue; + }; + const len = self.doc_lengths.get(in_mem_id) orelse 0; + var lb: [4]u8 = undefined; + std.mem.writeInt(u32, &lb, len, .little); + try writer.interface.writeAll(&lb); + } + var tt_buf: [8]u8 = undefined; + std.mem.writeInt(u64, &tt_buf, self.total_tokens, .little); + try writer.interface.writeAll(&tt_buf); + try writer.interface.flush(); try std.Io.Dir.cwd().rename(tmp_path, std.Io.Dir.cwd(), final_path, io); @@ -535,6 +584,21 @@ pub const WordIndex = struct { gop.value_ptr.* = hits; } + // v3 trailer: per-doc length table. 
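+ // Layout mirrors the writeToDisk trailer: u32 entry count (must equal
+ // file_count), one u32 length per file in disk-id order, then a u64
+ // total_tokens.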
+ if (pos + 4 > data.len) return null; + const dl_count = std.mem.readInt(u32, data[pos..][0..4], .little); + pos += 4; + if (dl_count != file_count) return null; + if (pos + dl_count * 4 + 8 > data.len) return null; + var dl_values = try allocator.alloc(u32, dl_count); + defer allocator.free(dl_values); + for (0..dl_count) |i| { + dl_values[i] = std.mem.readInt(u32, data[pos..][0..4], .little); + pos += 4; + } + const total_tokens_loaded = std.mem.readInt(u64, data[pos..][0..8], .little); + pos += 8; + if (pos != data.len) return null; // Populate path_to_id and id_to_path from file_paths @@ -542,7 +606,11 @@ pub const WordIndex = struct { for (0..file_count) |i| { result.id_to_path.appendAssumeCapacity(file_paths[i]); try result.path_to_id.put(file_paths[i], @intCast(i)); + if (dl_values[i] > 0) { + try result.doc_lengths.put(@intCast(i), dl_values[i]); + } } + result.total_tokens = total_tokens_loaded; // Compact tmp_file_words HashMaps into slices for result.file_words var tfw_iter = tmp_file_words.iterator(); diff --git a/src/tests.zig b/src/tests.zig index 6b083ef..098d1f5 100644 --- a/src/tests.zig +++ b/src/tests.zig @@ -9834,3 +9834,38 @@ test "issue-393: BM25 ranking surfaces high-density file before single-mention f try testing.expect(dense_score > sparse_score); } } + +test "issue-400: BM25 ranks both-terms file above single-term files" { + var arena = std.heap.ArenaAllocator.init(testing.allocator); + defer arena.deinit(); + var explorer = Explorer.init(arena.allocator()); + + try explorer.indexFile("both.zig", + \\pub fn parseToken() void { + \\ parseToken(); + \\ parseToken(); + \\} + ); + try explorer.indexFile("only_parse.zig", + \\pub fn parseFoo() void { + \\ parse(); + \\} + ); + try explorer.indexFile("only_token.zig", + \\pub fn tokenStream() void { + \\ token(); + \\} + ); + + const results = try explorer.searchContentRanked("parse Token", testing.allocator, 8); + defer { + for (results) |r| { + testing.allocator.free(r.line_text); + testing.allocator.free(r.path); + } + testing.allocator.free(results); + } + try testing.expect(results.len > 0); + try testing.expectEqualStrings("both.zig", results[0].path); + try testing.expect(results[0].score > 0.0); +} From 37293cf9cfd82cbbc9da954696ce85f4f2d52b13 Mon Sep 17 00:00:00 2001 From: Rach Pradhan <54503978+justrach@users.noreply.github.com> Date: Tue, 5 May 2026 12:19:17 +0800 Subject: [PATCH 3/4] fix(index): BM25 ranker handles skip_file_words=true correctly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - N for BM25 reads doc_lengths.count() instead of file_words.count(), so searchContentRanked returns results when the cold-scan path has skip_file_words enabled (used in MCP startup). - indexFile subtracts the prior doc_lengths[doc_id] from total_tokens before adding the new length, so re-indexing under skip_file_words no longer accumulates phantom tokens (avgdl was inflating by k× on k re-indexes, deflating ranking scores). - indexFile reuses the existing stable_path when the path is already tracked, preventing a memory leak on re-index when skip_file_words is true (removeFile early-exits, leaving path_to_id populated). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/explore.zig | 2 +- src/index.zig | 18 ++++++++++++++++-- src/tests.zig | 27 +++++++++++++++++++++++++++ 3 files changed, 44 insertions(+), 3 deletions(-) diff --git a/src/explore.zig b/src/explore.zig index eb0f287..106afd3 100644 --- a/src/explore.zig +++ b/src/explore.zig @@ -1740,7 +1740,7 @@ pub const Explorer = struct { // BM25 constants. const k1: f32 = 1.2; const b: f32 = 0.75; - const N = self.word_index.fileCount(); + const N = self.word_index.rankedDocCount(); if (N == 0) return try allocator.alloc(SearchResult, 0); const avgdl = self.word_index.avgDocLength(); diff --git a/src/index.zig b/src/index.zig index 2bc6f44..867b67c 100644 --- a/src/index.zig +++ b/src/index.zig @@ -158,8 +158,14 @@ pub const WordIndex = struct { // Clean up old entries first self.removeFile(path); - const stable_path = try self.allocator.dupe(u8, path); - errdefer self.allocator.free(stable_path); + // If the path is already tracked (e.g. skip_file_words=true and removeFile + // early-exited), reuse the existing stable copy rather than leaking a new dup. + const stable_path = if (self.path_to_id.contains(path)) + path + else + try self.allocator.dupe(u8, path); + const owned_path = stable_path.ptr != path.ptr; + errdefer if (owned_path) self.allocator.free(stable_path); const doc_id = try self.getOrCreateDocId(stable_path); @@ -239,6 +245,9 @@ pub const WordIndex = struct { } words_set.deinit(); + if (self.doc_lengths.get(doc_id)) |old_len| { + self.total_tokens -%= old_len; + } try self.doc_lengths.put(doc_id, doc_token_count); self.total_tokens += doc_token_count; } @@ -326,6 +335,11 @@ pub const WordIndex = struct { return @intCast(self.file_words.count()); } + /// BM25 helper: number of docs the ranker can see (source of truth regardless of skip_file_words). + pub fn rankedDocCount(self: *const WordIndex) u32 { + return @intCast(self.doc_lengths.count()); + } + /// BM25 helper: number of indexed tokens in a doc, or 0 if unknown. 
pub fn docLength(self: *const WordIndex, doc_id: u32) u32 { return self.doc_lengths.get(doc_id) orelse 0; diff --git a/src/tests.zig b/src/tests.zig index 098d1f5..1980598 100644 --- a/src/tests.zig +++ b/src/tests.zig @@ -9869,3 +9869,30 @@ test "issue-400: BM25 ranks both-terms file above single-term files" { try testing.expectEqualStrings("both.zig", results[0].path); try testing.expect(results[0].score > 0.0); } + +test "issue-400-bug1: searchContentRanked returns ranked results when skip_file_words=true" { + var explorer = Explorer.init(testing.allocator); + defer explorer.deinit(); + explorer.word_index.skip_file_words = true; + try explorer.indexFile("a.zig", "apple banana\n"); + try explorer.indexFile("b.zig", "apple\n"); + const results = try explorer.searchContentRanked("apple", testing.allocator, 10); + defer { + for (results) |r| { + testing.allocator.free(r.line_text); + testing.allocator.free(r.path); + } + testing.allocator.free(results); + } + try testing.expect(results.len > 0); +} + +test "issue-400-bug2: total_tokens stays consistent across re-index when skip_file_words=true" { + var explorer = Explorer.init(testing.allocator); + defer explorer.deinit(); + explorer.word_index.skip_file_words = true; + try explorer.indexFile("a.zig", "one two three four\n"); + try explorer.indexFile("a.zig", "five six seven\n"); + try explorer.indexFile("a.zig", "eight\n"); + try testing.expectEqual(@as(u64, 1), explorer.word_index.total_tokens); +} From a9a1ac5844c87e484bc0c81f35db008f79a885ce Mon Sep 17 00:00:00 2001 From: Rach Pradhan <54503978+justrach@users.noreply.github.com> Date: Tue, 5 May 2026 12:22:14 +0800 Subject: [PATCH 4/4] test(bm25): restore stress + recall regression tests (#400) The earlier stress agent added 8 BM25 regression tests (tf ordering, multi-term, df saturation, length normalization, pathological queries, 1000-doc stress, state sync, persistence round-trip). A subsequent fix agent's force-push to the same branch overwrote them. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/tests.zig | 277 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 277 insertions(+) diff --git a/src/tests.zig b/src/tests.zig index 1980598..0418880 100644 --- a/src/tests.zig +++ b/src/tests.zig @@ -9896,3 +9896,280 @@ test "issue-400-bug2: total_tokens stays consistent across re-index when skip_fi try explorer.indexFile("a.zig", "eight\n"); try testing.expectEqual(@as(u64, 1), explorer.word_index.total_tokens); } + +// --------------------------------------------------------------------------- +// BM25 stress / recall regression tests (#421 stress-421 branch) +// --------------------------------------------------------------------------- + +test "bm25-recall-a: single-term tf ordering" { + // 3 docs with identical length but "apple" on different numbers of lines. + // The index deduplicates per (doc, line), so tf = number of lines with the term. + // Equal doc lengths mean length normalization is constant; higher tf must rank higher. + // Each doc has exactly 10 tokens (5 lines x 2 tokens each). 
+ var arena = std.heap.ArenaAllocator.init(testing.allocator);
+ defer arena.deinit();
+ var explorer = Explorer.init(arena.allocator());
+
+ // doc1: apple on 1 of 5 lines
+ try explorer.indexFile("doc1.txt", "apple filler\nfiller filler\nfiller filler\nfiller filler\nfiller filler");
+ // doc2: apple on 5 of 5 lines (max tf)
+ try explorer.indexFile("doc2.txt", "apple filler\napple filler\napple filler\napple filler\napple filler");
+ // doc3: apple on 2 of 5 lines
+ try explorer.indexFile("doc3.txt", "apple filler\napple filler\nfiller filler\nfiller filler\nfiller filler");
+
+ const results = try explorer.searchContentRanked("apple", testing.allocator, 10);
+ defer {
+ for (results) |r| {
+ testing.allocator.free(r.line_text);
+ testing.allocator.free(r.path);
+ }
+ testing.allocator.free(results);
+ }
+
+ try testing.expectEqual(@as(usize, 3), results.len);
+ try testing.expectEqualStrings("doc2.txt", results[0].path);
+ try testing.expectEqualStrings("doc3.txt", results[1].path);
+ try testing.expectEqualStrings("doc1.txt", results[2].path);
+ try testing.expect(results[0].score > results[1].score);
+ try testing.expect(results[1].score > results[2].score);
+}
+
+test "bm25-recall-b: both-terms doc beats high-tf single-term doc" {
+ // doc1 has apple+banana (both query terms, one occurrence each).
+ // doc2 has only apple, repeated 3x on a single line -- per-line dedup
+ // caps its tf at 1, so repetition alone cannot outrank term coverage.
+ // doc3 has only banana, once.
+ // BM25 sums idf*tf_norm per term: doc1 accumulates two idf contributions
+ // while doc2 only gets one -- doc1 must rank first.
+ var arena = std.heap.ArenaAllocator.init(testing.allocator);
+ defer arena.deinit();
+ var explorer = Explorer.init(arena.allocator());
+
+ try explorer.indexFile("doc1.txt", "apple banana cherry");
+ try explorer.indexFile("doc2.txt", "apple apple apple");
+ try explorer.indexFile("doc3.txt", "banana date elderberry");
+
+ const results = try explorer.searchContentRanked("apple banana", testing.allocator, 10);
+ defer {
+ for (results) |r| {
+ testing.allocator.free(r.line_text);
+ testing.allocator.free(r.path);
+ }
+ testing.allocator.free(results);
+ }
+
+ try testing.expect(results.len >= 2);
+ try testing.expectEqualStrings("doc1.txt", results[0].path);
+ try testing.expect(results[0].score > 0.0);
+ var doc2_score: f32 = -1.0;
+ for (results) |r| {
+ if (std.mem.eql(u8, r.path, "doc2.txt")) {
+ doc2_score = r.score;
+ break;
+ }
+ }
+ if (doc2_score >= 0.0) {
+ try testing.expect(results[0].score > doc2_score);
+ }
+}
+
+test "bm25-recall-c: df-saturation -- ubiquitous term has near-zero idf" {
+ // "the" appears in all 11 docs -> idf near zero, barely contributes.
+ // "unique_marker" appears only in special.txt -> high idf, special.txt ranks first. 
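+ // Worked idf values with N=11 (natural log, +1-smoothed formula):
+ // idf("the") = log(1 + (11-11+0.5)/(11+0.5)) ≈ 0.04
+ // idf("unique_marker") = log(1 + (11-1+0.5)/(1+0.5)) = log(8) ≈ 2.08
+ // The "unique"/"marker" subtokens also appear only in special.txt, widening the margin.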
+ var arena = std.heap.ArenaAllocator.init(testing.allocator); + defer arena.deinit(); + var explorer = Explorer.init(arena.allocator()); + + try explorer.indexFile("d1.txt", "the quick brown fox"); + try explorer.indexFile("d2.txt", "the lazy dog jumps"); + try explorer.indexFile("d3.txt", "the sun rises east"); + try explorer.indexFile("d4.txt", "the moon shines bright"); + try explorer.indexFile("d5.txt", "the rain in spain"); + try explorer.indexFile("d6.txt", "the cat sat mat"); + try explorer.indexFile("d7.txt", "the wind blows cold"); + try explorer.indexFile("d8.txt", "the tide comes in"); + try explorer.indexFile("d9.txt", "the stars align now"); + try explorer.indexFile("d10.txt", "the clock ticks forward"); + try explorer.indexFile("special.txt", "the unique_marker is here"); + + const results = try explorer.searchContentRanked("the unique_marker", testing.allocator, 20); + defer { + for (results) |r| { + testing.allocator.free(r.line_text); + testing.allocator.free(r.path); + } + testing.allocator.free(results); + } + + try testing.expect(results.len > 0); + try testing.expectEqualStrings("special.txt", results[0].path); + if (results.len > 1) { + try testing.expect(results[0].score > results[1].score); + } +} + +test "bm25-recall-d: length normalization favors shorter doc" { + // short.txt: 5 tokens, one "needle". + // long.txt: ~50 tokens, one "needle". + // BM25 with b=0.75 penalizes longer docs; short.txt must rank higher. + var arena = std.heap.ArenaAllocator.init(testing.allocator); + defer arena.deinit(); + var explorer = Explorer.init(arena.allocator()); + + try explorer.indexFile("short.txt", "needle alpha beta gamma delta"); + try explorer.indexFile("long.txt", + "aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss tt uu vv ww xx yy zz " ++ + "aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss tt uu vv ww xx needle yy zz" + ); + + const results = try explorer.searchContentRanked("needle", testing.allocator, 10); + defer { + for (results) |r| { + testing.allocator.free(r.line_text); + testing.allocator.free(r.path); + } + testing.allocator.free(results); + } + + try testing.expectEqual(@as(usize, 2), results.len); + try testing.expectEqualStrings("short.txt", results[0].path); + try testing.expect(results[0].score > results[1].score); +} + +test "bm25-recall-e: empty and pathological queries return empty without crash" { + var arena = std.heap.ArenaAllocator.init(testing.allocator); + defer arena.deinit(); + var explorer = Explorer.init(arena.allocator()); + + try explorer.indexFile("file.txt", "some content here"); + + { + const r = try explorer.searchContentRanked("", testing.allocator, 10); + defer testing.allocator.free(r); + try testing.expectEqual(@as(usize, 0), r.len); + } + { + const r = try explorer.searchContentRanked(" ", testing.allocator, 10); + defer testing.allocator.free(r); + try testing.expectEqual(@as(usize, 0), r.len); + } + { + const r = try explorer.searchContentRanked("nonexistent_xyz_term_99", testing.allocator, 10); + defer testing.allocator.free(r); + try testing.expectEqual(@as(usize, 0), r.len); + } +} + +test "bm25-stress: 1000-doc index, common token, max_results cap honored" { + var explorer = Explorer.init(testing.allocator); + defer explorer.deinit(); + + var path_buf: [64]u8 = undefined; + var content_buf: [256]u8 = undefined; + for (0..1000) |i| { + const path = std.fmt.bufPrint(&path_buf, "stress/doc{d}.txt", .{i}) catch unreachable; + const content = std.fmt.bufPrint(&content_buf, + "common token alpha beta gamma doc{d} 
extra filler words here now", .{i} + ) catch unreachable; + try explorer.indexFile(path, content); + } + + const cap = 25; + const results = try explorer.searchContentRanked("common", testing.allocator, cap); + defer { + for (results) |r| { + testing.allocator.free(r.line_text); + testing.allocator.free(r.path); + } + testing.allocator.free(results); + } + + try testing.expect(results.len <= cap); + try testing.expect(results.len > 0); + for (results) |r| { + try testing.expect(r.score > 0.0); + } + for (1..results.len) |i| { + try testing.expect(results[i - 1].score >= results[i].score); + } +} + +test "bm25-state-sync: re-index and remove update total_tokens correctly" { + var explorer = Explorer.init(testing.allocator); + defer explorer.deinit(); + + try explorer.indexFile("sync.txt", "alpha beta gamma delta epsilon"); + try testing.expectEqual(@as(u64, 5), explorer.word_index.total_tokens); + + try explorer.indexFile("sync.txt", "alpha beta"); + try testing.expectEqual(@as(u64, 2), explorer.word_index.total_tokens); + + explorer.removeFile("sync.txt"); + try testing.expectEqual(@as(u64, 0), explorer.word_index.total_tokens); +} + +test "bm25-persistence: writeToDisk/readFromDisk preserves total_tokens and doc_lengths" { + const alloc = testing.allocator; + var wi = WordIndex.init(alloc); + defer wi.deinit(); + + try wi.indexFile("low.txt", "needle filler filler filler filler filler filler filler filler filler"); + try wi.indexFile("high.txt", "needle needle needle filler"); + try wi.indexFile("none.txt", "filler filler filler filler"); + + const pre_total = wi.total_tokens; + const pre_low_len = wi.docLength(wi.path_to_id.get("low.txt") orelse 0); + const pre_high_len = wi.docLength(wi.path_to_id.get("high.txt") orelse 0); + + var tmp = testing.tmpDir(.{}); + defer tmp.cleanup(); + var path_buf: [std.fs.max_path_bytes]u8 = undefined; + const dir_path_len = try tmp.dir.realPathFile(io, ".", &path_buf); + const dir_path = path_buf[0..dir_path_len]; + + try wi.writeToDisk(io, dir_path, null); + + const maybe_loaded = WordIndex.readFromDisk(io, dir_path, alloc); + try testing.expect(maybe_loaded != null); + var loaded = maybe_loaded.?; + defer loaded.deinit(); + + try testing.expectEqual(pre_total, loaded.total_tokens); + + const post_low_id = loaded.path_to_id.get("low.txt") orelse { + try testing.expect(false); + return; + }; + const post_high_id = loaded.path_to_id.get("high.txt") orelse { + try testing.expect(false); + return; + }; + try testing.expectEqual(pre_low_len, loaded.docLength(post_low_id)); + try testing.expectEqual(pre_high_len, loaded.docLength(post_high_id)); + + const hits = try loaded.searchDeduped("needle", alloc); + defer alloc.free(hits); + try testing.expect(hits.len >= 2); + + var saw_high = false; + var saw_low = false; + for (hits) |h| { + const p = loaded.hitPath(h); + if (std.mem.eql(u8, p, "high.txt")) saw_high = true; + if (std.mem.eql(u8, p, "low.txt")) saw_low = true; + } + try testing.expect(saw_high); + try testing.expect(saw_low); + + // Post-roundtrip ranked search must still work and return hits for "needle". 
+ var wi2 = WordIndex.init(alloc); + defer wi2.deinit(); + try wi2.indexFile("low.txt", "needle filler filler filler filler filler filler filler filler filler"); + try wi2.indexFile("high.txt", "needle needle needle filler"); + try wi2.indexFile("none.txt", "filler filler filler filler"); + + const low_id_orig = wi2.path_to_id.get("low.txt") orelse 0; + const high_id_orig = wi2.path_to_id.get("high.txt") orelse 0; + try testing.expectEqual(pre_low_len, wi2.docLength(low_id_orig)); + try testing.expectEqual(pre_high_len, wi2.docLength(high_id_orig)); + try testing.expectEqual(pre_total, wi2.total_tokens); +}