From 7b76d65f83793057f1df88941464f84d31542a0f Mon Sep 17 00:00:00 2001 From: Rach Pradhan <54503978+justrach@users.noreply.github.com> Date: Tue, 5 May 2026 11:00:08 +0800 Subject: [PATCH 1/4] test(issue-393): failing test for BM25 ranking on content search Asserts Explorer.searchContentRanked exists and ranks a file densely covering all query terms above a file with a single-term mention plus noise files. Fails on main (function does not exist). Co-Authored-By: Claude Opus 4.7 (1M context) --- src/tests.zig | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/src/tests.zig b/src/tests.zig index d8f8c1c..6b083ef 100644 --- a/src/tests.zig +++ b/src/tests.zig @@ -9762,3 +9762,75 @@ test "issue-413: bundle truncation drops subsequent ops without telling the call // Either its result, or an explicit "[2]" entry noting it was dropped. try testing.expect(std.mem.indexOf(u8, out.items, "[2]") != null); } + +test "issue-393: BM25 ranking surfaces high-density file before single-mention file" { + // Multi-term content queries today return matches in scan order with only + // a per-line occurrence count tiebreaker (explore.zig:1674-1688). On a + // large repo this dumps every match with no notion of which *file* is the + // most relevant — a file that mentions every query term many times ranks + // identically to one that mentions a single term once. + // + // BM25 over the existing trigram + word index would score documents by + // (per-term tf * idf) with length normalization, so the file densely + // covering both terms surfaces above the noise file. + // + // Minimum surface contract: Explorer exposes `searchContentRanked` which + // takes a multi-term query and returns results ordered by descending + // BM25 score across files (highest-scoring document's match comes first). + var arena = std.heap.ArenaAllocator.init(testing.allocator); + defer arena.deinit(); + var explorer = Explorer.init(arena.allocator()); + + // dense.zig: hits both query terms many times across many lines. + try explorer.indexFile("src/dense.zig", + \\pub fn parseTokenStream() void { + \\ const token = nextToken(); + \\ parseToken(token); + \\ parseToken(token); + \\ parseToken(token); + \\ const stream = parseTokenStream(); + \\ parseTokenStream(); + \\ _ = token; + \\ _ = stream; + \\} + ); + // sparse.zig: mentions one term once, in passing. + try explorer.indexFile("src/sparse.zig", + \\pub fn unrelated() void { + \\ // a passing mention of parse here + \\ return; + \\} + ); + // Noise files dilute df-based scoring; BM25 must still rank dense first. + try explorer.indexFile("src/noise_a.zig", "pub fn a() void {}\n"); + try explorer.indexFile("src/noise_b.zig", "pub fn b() void {}\n"); + try explorer.indexFile("src/noise_c.zig", "pub fn c() void {}\n"); + + try testing.expect(@hasDecl(Explorer, "searchContentRanked")); + + const results = try explorer.searchContentRanked("parse Token", testing.allocator, 16); + defer { + for (results) |r| { + testing.allocator.free(r.line_text); + testing.allocator.free(r.path); + } + testing.allocator.free(results); + } + + try testing.expect(results.len > 0); + // Top-ranked result must come from the dense file. + try testing.expectEqualStrings("src/dense.zig", results[0].path); + // Score must be populated and strictly positive when ranking is on. 
+ try testing.expect(results[0].score > 0.0); + // Results must be sorted by score descending across distinct documents: + // the first dense.zig score must exceed the first sparse.zig score. + var dense_score: f32 = -1.0; + var sparse_score: f32 = -1.0; + for (results) |r| { + if (dense_score < 0 and std.mem.eql(u8, r.path, "src/dense.zig")) dense_score = r.score; + if (sparse_score < 0 and std.mem.eql(u8, r.path, "src/sparse.zig")) sparse_score = r.score; + } + if (sparse_score >= 0) { + try testing.expect(dense_score > sparse_score); + } +} From 1b177af904056671ba34a106ee874bfdefef9ec1 Mon Sep 17 00:00:00 2001 From: Rach Pradhan <54503978+justrach@users.noreply.github.com> Date: Tue, 5 May 2026 11:49:06 +0800 Subject: [PATCH 2/4] feat(index): BM25 ranking via Explorer.searchContentRanked (#400) Adds an additive ranked-content API on Explorer. Existing scan-order searchContent is unchanged. WordIndex tracks doc_lengths + total_tokens (BM25 length normalization); on-disk format bumps to v3 with a per-doc length trailer. Older v1/v2 word.index files are rejected so callers fall through to a clean rebuild. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/explore.zig | 181 ++++++++++++++++++++++++++++++++++++++++++++++++ src/index.zig | 70 ++++++++++++++++++- src/tests.zig | 35 ++++++++++ 3 files changed, 285 insertions(+), 1 deletion(-) diff --git a/src/explore.zig b/src/explore.zig index 4963c93..eb0f287 100644 --- a/src/explore.zig +++ b/src/explore.zig @@ -1690,6 +1690,187 @@ pub const Explorer = struct { return result_list.toOwnedSlice(allocator); } + /// BM25-ranked content search. Tokenizes the query the same way the word + /// index tokenizes documents, scores each candidate doc with BM25 + /// (k1=1.2, b=0.75), and emits one SearchResult per top-N document with + /// the best-tf line for any query term in that doc. Existing scan-order + /// `searchContent` is unaffected. + pub fn searchContentRanked(self: *Explorer, query: []const u8, allocator: std.mem.Allocator, max_results: usize) ![]const SearchResult { + self.mu.lockShared(); + defer self.mu.unlockShared(); + + if (max_results == 0) return try allocator.alloc(SearchResult, 0); + + // Tokenize the query the same way WordIndex tokenizes documents: + // lowercase + identifier-split. Dedupe terms so repeated query words + // don't double-count. + var term_arena = std.heap.ArenaAllocator.init(allocator); + defer term_arena.deinit(); + const ta = term_arena.allocator(); + + var terms_set = std.StringHashMap(void).init(ta); + var raw_tok = idx.WordTokenizer{ .buf = query }; + while (raw_tok.next()) |word| { + if (word.len < 2) continue; + const lower = try ta.alloc(u8, word.len); + for (word, 0..) |c, j| lower[j] = idx.normalizeChar(c); + _ = try terms_set.getOrPut(lower); + + var needs_split: bool = false; + if (word.len >= 4) { + for (word) |c| { + if (c == '_' or (c >= 'A' and c <= 'Z')) { + needs_split = true; + break; + } + } + } + if (needs_split) { + var sub_toks: std.ArrayList([]const u8) = .empty; + defer sub_toks.deinit(ta); + idx.splitIdentifier(word, &sub_toks, ta) catch continue; + for (sub_toks.items) |sub| { + if (sub.len < 2) continue; + _ = try terms_set.getOrPut(sub); + } + } + } + if (terms_set.count() == 0) return try allocator.alloc(SearchResult, 0); + + // BM25 constants. 
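+ // Per-doc score, summed over the unique query terms:
+ // score(d) = sum_t idf(t) * tf(t,d)*(k1+1) / (tf(t,d) + k1*(1 - b + b*dl(d)/avgdl))
+ // where tf(t,d) counts distinct (term, line) postings in d and dl(d) is
+ // the doc's indexed token count.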
+ const k1: f32 = 1.2;
+ const b: f32 = 0.75;
+ const N = self.word_index.fileCount();
+ if (N == 0) return try allocator.alloc(SearchResult, 0);
+ const avgdl = self.word_index.avgDocLength();
+
+ // Aggregate scores per doc and remember the best line (max term hits)
+ // for each candidate.
+ const DocAgg = struct {
+ score: f32,
+ best_line: u32,
+ best_line_hits: u32,
+ };
+ var per_doc = std.AutoHashMap(u32, DocAgg).init(ta);
+
+ // For each unique query term, look up its posting list once,
+ // compute df and per-doc tf in a single pass.
+ var term_iter = terms_set.keyIterator();
+ while (term_iter.next()) |term_ptr| {
+ const term = term_ptr.*;
+ const hits = self.word_index.search(term);
+ if (hits.len == 0) continue;
+
+ // df: distinct doc_ids in this posting list. tf: count of (term,doc)
+ // entries (each entry is a distinct line per indexFile dedup).
+ // doc_best_line: per-doc smallest line_num plus posting count, used
+ // for best-line picking.
+ var doc_tf = std.AutoHashMap(u32, u32).init(ta);
+ var doc_best_line = std.AutoHashMap(u32, struct { line: u32, count: u32 }).init(ta);
+ for (hits) |h| {
+ const tf_gop = try doc_tf.getOrPut(h.doc_id);
+ if (!tf_gop.found_existing) tf_gop.value_ptr.* = 0;
+ tf_gop.value_ptr.* += 1;
+
+ const ln_gop = try doc_best_line.getOrPut(h.doc_id);
+ if (!ln_gop.found_existing) {
+ ln_gop.value_ptr.* = .{ .line = h.line_num, .count = 1 };
+ } else {
+ // Each posting is a distinct line; still, prefer the
+ // smallest line_num as a deterministic representative.
+ if (h.line_num < ln_gop.value_ptr.line) {
+ ln_gop.value_ptr.line = h.line_num;
+ }
+ ln_gop.value_ptr.count += 1;
+ }
+ }
+ const df: u32 = @intCast(doc_tf.count());
+ // BM25 idf with the +1 smoothing variant: log(1 + (N - df + 0.5)/(df + 0.5))
+ const num: f32 = @as(f32, @floatFromInt(N)) - @as(f32, @floatFromInt(df)) + 0.5;
+ const den: f32 = @as(f32, @floatFromInt(df)) + 0.5;
+ const idf: f32 = @log(1.0 + num / den);
+
+ var tf_iter = doc_tf.iterator();
+ while (tf_iter.next()) |entry| {
+ const doc_id = entry.key_ptr.*;
+ const tf: f32 = @floatFromInt(entry.value_ptr.*);
+ const dl_raw = self.word_index.docLength(doc_id);
+ const dl: f32 = if (dl_raw == 0) 1.0 else @floatFromInt(dl_raw);
+ const norm = 1.0 - b + b * (dl / avgdl);
+ const term_score = idf * (tf * (k1 + 1.0)) / (tf + k1 * norm);
+
+ const ln_info = doc_best_line.get(doc_id) orelse continue;
+ const agg_gop = try per_doc.getOrPut(doc_id);
+ if (!agg_gop.found_existing) {
+ agg_gop.value_ptr.* = .{
+ .score = term_score,
+ .best_line = ln_info.line,
+ .best_line_hits = ln_info.count,
+ };
+ } else {
+ agg_gop.value_ptr.score += term_score;
+ if (ln_info.count > agg_gop.value_ptr.best_line_hits or
+ (ln_info.count == agg_gop.value_ptr.best_line_hits and ln_info.line < agg_gop.value_ptr.best_line))
+ {
+ agg_gop.value_ptr.best_line = ln_info.line;
+ agg_gop.value_ptr.best_line_hits = ln_info.count;
+ }
+ }
+ }
+ }
+ if (per_doc.count() == 0) return try allocator.alloc(SearchResult, 0);
+
+ const Cand = struct { doc_id: u32, score: f32, best_line: u32 };
+ var cands: std.ArrayList(Cand) = .empty;
+ defer cands.deinit(ta);
+ try cands.ensureTotalCapacity(ta, per_doc.count());
+ var pd_iter = per_doc.iterator();
+ while (pd_iter.next()) |entry| {
+ cands.appendAssumeCapacity(.{
+ .doc_id = entry.key_ptr.*,
+ .score = entry.value_ptr.score,
+ .best_line = entry.value_ptr.best_line,
+ });
+ }
+ std.sort.block(Cand, cands.items, {}, struct {
+ pub fn lt(_: void, a: Cand, b_: Cand) bool {
+ if (a.score != b_.score) return a.score > b_.score;
+ return 
a.doc_id < b_.doc_id; + } + }.lt); + + var result_list: std.ArrayList(SearchResult) = .empty; + errdefer { + for (result_list.items) |r| { + allocator.free(r.line_text); + allocator.free(r.path); + } + result_list.deinit(allocator); + } + try result_list.ensureTotalCapacity(allocator, @min(max_results, cands.items.len)); + + for (cands.items) |c| { + if (result_list.items.len >= max_results) break; + const path = self.word_index.id_to_path.items[c.doc_id]; + if (path.len == 0) continue; + const ref = self.readContentForSearch(path, allocator) orelse continue; + defer ref.deinit(); + const line_text = extractLineByNumber(ref.data, c.best_line) orelse continue; + const duped_text = try allocator.dupe(u8, line_text); + errdefer allocator.free(duped_text); + const duped_path = try allocator.dupe(u8, path); + errdefer allocator.free(duped_path); + try result_list.append(allocator, .{ + .path = duped_path, + .line_num = c.best_line, + .line_text = duped_text, + .score = c.score, + }); + } + + return result_list.toOwnedSlice(allocator); + } + + /// Search file contents using a regex pattern with trigram acceleration. /// Decomposes the regex to extract literal trigrams for candidate filtering, /// then does actual regex matching on candidates. diff --git a/src/index.zig b/src/index.zig index 8cf4b62..2bc6f44 100644 --- a/src/index.zig +++ b/src/index.zig @@ -20,6 +20,10 @@ pub const WordIndex = struct { enabled: bool = true, path_to_id: std.StringHashMap(u32), id_to_path: std.ArrayList([]const u8), + /// doc_id → number of tokens indexed for that doc (BM25 length normalization). + doc_lengths: std.AutoHashMap(u32, u32), + /// Sum of all values in doc_lengths. + total_tokens: u64 = 0, pub fn hitPath(self: *const WordIndex, hit: WordHit) []const u8 { if (hit.doc_id < self.id_to_path.items.len) return self.id_to_path.items[hit.doc_id]; @@ -41,6 +45,8 @@ pub const WordIndex = struct { .allocator = allocator, .path_to_id = std.StringHashMap(u32).init(allocator), .id_to_path = .empty, + .doc_lengths = std.AutoHashMap(u32, u32).init(allocator), + .total_tokens = 0, }; } @@ -69,6 +75,7 @@ pub const WordIndex = struct { self.path_to_id.deinit(); self.id_to_path.deinit(self.allocator); + self.doc_lengths.deinit(); } /// Remove all index entries for a file (call before re-indexing). @@ -86,6 +93,9 @@ pub const WordIndex = struct { if (doc_id < self.id_to_path.items.len) { self.id_to_path.items[doc_id] = ""; } + if (self.doc_lengths.fetchRemove(doc_id)) |kv| { + self.total_tokens -= kv.value; + } defer { self.allocator.free(words_slice); self.allocator.free(stable_path); @@ -162,12 +172,14 @@ pub const WordIndex = struct { var words_set = std.StringHashMap(void).init(words_arena.allocator()); var line_num: u32 = 0; var lines = std.mem.splitScalar(u8, content, '\n'); + var doc_token_count: u32 = 0; while (lines.next()) |line| { line_num += 1; var tok = WordTokenizer{ .buf = line }; while (tok.next()) |word| { if (word.len < 2) continue; + doc_token_count +|= 1; const aa = words_arena.allocator(); @@ -226,6 +238,9 @@ pub const WordIndex = struct { try self.file_words.put(stable_path, compact); } words_set.deinit(); + + try self.doc_lengths.put(doc_id, doc_token_count); + self.total_tokens += doc_token_count; } /// Look up all hits for a word. O(1) lookup + O(hits) iteration. @@ -311,6 +326,19 @@ pub const WordIndex = struct { return @intCast(self.file_words.count()); } + /// BM25 helper: number of indexed tokens in a doc, or 0 if unknown. 
+ pub fn docLength(self: *const WordIndex, doc_id: u32) u32 { + return self.doc_lengths.get(doc_id) orelse 0; + } + + /// BM25 helper: average doc length over docs that have a recorded length. + /// Returns 1.0 when no docs are tracked, so callers can divide safely. + pub fn avgDocLength(self: *const WordIndex) f32 { + const n = self.doc_lengths.count(); + if (n == 0) return 1.0; + return @as(f32, @floatFromInt(self.total_tokens)) / @as(f32, @floatFromInt(n)); + } + /// Shrink all hit lists and per-file word sets to release excess capacity. pub fn shrinkAllocations(self: *WordIndex) void { var iter = self.index.iterator(); @@ -327,7 +355,7 @@ pub const WordIndex = struct { }; const DISK_MAGIC = [4]u8{ 'C', 'D', 'B', 'W' }; - const DISK_FORMAT_VERSION: u16 = 2; + const DISK_FORMAT_VERSION: u16 = 3; pub fn writeToDisk(self: *WordIndex, io: std.Io, dir_path: []const u8, git_head: ?[40]u8) !void { var file_table: std.ArrayList([]const u8) = .empty; @@ -424,6 +452,27 @@ pub const WordIndex = struct { try writer.interface.writeAll(&hit_buf); } } + + // v3 trailer: per-doc length table for BM25. + // file_id (u32 disk-id) → length (u32). Total tokens follows as u64. + var dl_count_buf: [4]u8 = undefined; + std.mem.writeInt(u32, &dl_count_buf, @intCast(file_table.items.len), .little); + try writer.interface.writeAll(&dl_count_buf); + for (file_table.items) |path| { + const in_mem_id = self.path_to_id.get(path) orelse { + var z: [4]u8 = .{ 0, 0, 0, 0 }; + try writer.interface.writeAll(&z); + continue; + }; + const len = self.doc_lengths.get(in_mem_id) orelse 0; + var lb: [4]u8 = undefined; + std.mem.writeInt(u32, &lb, len, .little); + try writer.interface.writeAll(&lb); + } + var tt_buf: [8]u8 = undefined; + std.mem.writeInt(u64, &tt_buf, self.total_tokens, .little); + try writer.interface.writeAll(&tt_buf); + try writer.interface.flush(); try std.Io.Dir.cwd().rename(tmp_path, std.Io.Dir.cwd(), final_path, io); @@ -535,6 +584,21 @@ pub const WordIndex = struct { gop.value_ptr.* = hits; } + // v3 trailer: per-doc length table. 
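+ // Layout mirrors the writeToDisk trailer: u32 entry count (must equal
+ // file_count), one u32 length per file in disk-id order, then a u64
+ // total_tokens.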
+ if (pos + 4 > data.len) return null; + const dl_count = std.mem.readInt(u32, data[pos..][0..4], .little); + pos += 4; + if (dl_count != file_count) return null; + if (pos + dl_count * 4 + 8 > data.len) return null; + var dl_values = try allocator.alloc(u32, dl_count); + defer allocator.free(dl_values); + for (0..dl_count) |i| { + dl_values[i] = std.mem.readInt(u32, data[pos..][0..4], .little); + pos += 4; + } + const total_tokens_loaded = std.mem.readInt(u64, data[pos..][0..8], .little); + pos += 8; + if (pos != data.len) return null; // Populate path_to_id and id_to_path from file_paths @@ -542,7 +606,11 @@ pub const WordIndex = struct { for (0..file_count) |i| { result.id_to_path.appendAssumeCapacity(file_paths[i]); try result.path_to_id.put(file_paths[i], @intCast(i)); + if (dl_values[i] > 0) { + try result.doc_lengths.put(@intCast(i), dl_values[i]); + } } + result.total_tokens = total_tokens_loaded; // Compact tmp_file_words HashMaps into slices for result.file_words var tfw_iter = tmp_file_words.iterator(); diff --git a/src/tests.zig b/src/tests.zig index 6b083ef..098d1f5 100644 --- a/src/tests.zig +++ b/src/tests.zig @@ -9834,3 +9834,38 @@ test "issue-393: BM25 ranking surfaces high-density file before single-mention f try testing.expect(dense_score > sparse_score); } } + +test "issue-400: BM25 ranks both-terms file above single-term files" { + var arena = std.heap.ArenaAllocator.init(testing.allocator); + defer arena.deinit(); + var explorer = Explorer.init(arena.allocator()); + + try explorer.indexFile("both.zig", + \\pub fn parseToken() void { + \\ parseToken(); + \\ parseToken(); + \\} + ); + try explorer.indexFile("only_parse.zig", + \\pub fn parseFoo() void { + \\ parse(); + \\} + ); + try explorer.indexFile("only_token.zig", + \\pub fn tokenStream() void { + \\ token(); + \\} + ); + + const results = try explorer.searchContentRanked("parse Token", testing.allocator, 8); + defer { + for (results) |r| { + testing.allocator.free(r.line_text); + testing.allocator.free(r.path); + } + testing.allocator.free(results); + } + try testing.expect(results.len > 0); + try testing.expectEqualStrings("both.zig", results[0].path); + try testing.expect(results[0].score > 0.0); +} From 37293cf9cfd82cbbc9da954696ce85f4f2d52b13 Mon Sep 17 00:00:00 2001 From: Rach Pradhan <54503978+justrach@users.noreply.github.com> Date: Tue, 5 May 2026 12:19:17 +0800 Subject: [PATCH 3/4] fix(index): BM25 ranker handles skip_file_words=true correctly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - N for BM25 reads doc_lengths.count() instead of file_words.count(), so searchContentRanked returns results when the cold-scan path has skip_file_words enabled (used in MCP startup). - indexFile subtracts the prior doc_lengths[doc_id] from total_tokens before adding the new length, so re-indexing under skip_file_words no longer accumulates phantom tokens (avgdl was inflating by k× on k re-indexes, deflating ranking scores). - indexFile reuses the existing stable_path when the path is already tracked, preventing a memory leak on re-index when skip_file_words is true (removeFile early-exits, leaving path_to_id populated). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/explore.zig | 2 +- src/index.zig | 18 ++++++++++++++++-- src/tests.zig | 27 +++++++++++++++++++++++++++ 3 files changed, 44 insertions(+), 3 deletions(-) diff --git a/src/explore.zig b/src/explore.zig index eb0f287..106afd3 100644 --- a/src/explore.zig +++ b/src/explore.zig @@ -1740,7 +1740,7 @@ pub const Explorer = struct { // BM25 constants. const k1: f32 = 1.2; const b: f32 = 0.75; - const N = self.word_index.fileCount(); + const N = self.word_index.rankedDocCount(); if (N == 0) return try allocator.alloc(SearchResult, 0); const avgdl = self.word_index.avgDocLength(); diff --git a/src/index.zig b/src/index.zig index 2bc6f44..867b67c 100644 --- a/src/index.zig +++ b/src/index.zig @@ -158,8 +158,14 @@ pub const WordIndex = struct { // Clean up old entries first self.removeFile(path); - const stable_path = try self.allocator.dupe(u8, path); - errdefer self.allocator.free(stable_path); + // If the path is already tracked (e.g. skip_file_words=true and removeFile + // early-exited), reuse the existing stable copy rather than leaking a new dup. + const stable_path = if (self.path_to_id.contains(path)) + path + else + try self.allocator.dupe(u8, path); + const owned_path = stable_path.ptr != path.ptr; + errdefer if (owned_path) self.allocator.free(stable_path); const doc_id = try self.getOrCreateDocId(stable_path); @@ -239,6 +245,9 @@ pub const WordIndex = struct { } words_set.deinit(); + if (self.doc_lengths.get(doc_id)) |old_len| { + self.total_tokens -%= old_len; + } try self.doc_lengths.put(doc_id, doc_token_count); self.total_tokens += doc_token_count; } @@ -326,6 +335,11 @@ pub const WordIndex = struct { return @intCast(self.file_words.count()); } + /// BM25 helper: number of docs the ranker can see (source of truth regardless of skip_file_words). + pub fn rankedDocCount(self: *const WordIndex) u32 { + return @intCast(self.doc_lengths.count()); + } + /// BM25 helper: number of indexed tokens in a doc, or 0 if unknown. 
pub fn docLength(self: *const WordIndex, doc_id: u32) u32 { return self.doc_lengths.get(doc_id) orelse 0; diff --git a/src/tests.zig b/src/tests.zig index 098d1f5..1980598 100644 --- a/src/tests.zig +++ b/src/tests.zig @@ -9869,3 +9869,30 @@ test "issue-400: BM25 ranks both-terms file above single-term files" { try testing.expectEqualStrings("both.zig", results[0].path); try testing.expect(results[0].score > 0.0); } + +test "issue-400-bug1: searchContentRanked returns ranked results when skip_file_words=true" { + var explorer = Explorer.init(testing.allocator); + defer explorer.deinit(); + explorer.word_index.skip_file_words = true; + try explorer.indexFile("a.zig", "apple banana\n"); + try explorer.indexFile("b.zig", "apple\n"); + const results = try explorer.searchContentRanked("apple", testing.allocator, 10); + defer { + for (results) |r| { + testing.allocator.free(r.line_text); + testing.allocator.free(r.path); + } + testing.allocator.free(results); + } + try testing.expect(results.len > 0); +} + +test "issue-400-bug2: total_tokens stays consistent across re-index when skip_file_words=true" { + var explorer = Explorer.init(testing.allocator); + defer explorer.deinit(); + explorer.word_index.skip_file_words = true; + try explorer.indexFile("a.zig", "one two three four\n"); + try explorer.indexFile("a.zig", "five six seven\n"); + try explorer.indexFile("a.zig", "eight\n"); + try testing.expectEqual(@as(u64, 1), explorer.word_index.total_tokens); +} From a9a1ac5844c87e484bc0c81f35db008f79a885ce Mon Sep 17 00:00:00 2001 From: Rach Pradhan <54503978+justrach@users.noreply.github.com> Date: Tue, 5 May 2026 12:22:14 +0800 Subject: [PATCH 4/4] test(bm25): restore stress + recall regression tests (#400) The earlier stress agent added 8 BM25 regression tests (tf ordering, multi-term, df saturation, length normalization, pathological queries, 1000-doc stress, state sync, persistence round-trip). A subsequent fix agent's force-push to the same branch overwrote them. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/tests.zig | 277 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 277 insertions(+) diff --git a/src/tests.zig b/src/tests.zig index 1980598..0418880 100644 --- a/src/tests.zig +++ b/src/tests.zig @@ -9896,3 +9896,280 @@ test "issue-400-bug2: total_tokens stays consistent across re-index when skip_fi try explorer.indexFile("a.zig", "eight\n"); try testing.expectEqual(@as(u64, 1), explorer.word_index.total_tokens); } + +// --------------------------------------------------------------------------- +// BM25 stress / recall regression tests (#421 stress-421 branch) +// --------------------------------------------------------------------------- + +test "bm25-recall-a: single-term tf ordering" { + // 3 docs with identical length but "apple" on different numbers of lines. + // The index deduplicates per (doc, line), so tf = number of lines with the term. + // Equal doc lengths mean length normalization is constant; higher tf must rank higher. + // Each doc has exactly 10 tokens (5 lines x 2 tokens each). 
+ var arena = std.heap.ArenaAllocator.init(testing.allocator);
+ defer arena.deinit();
+ var explorer = Explorer.init(arena.allocator());
+
+ // doc1: apple on 1 of 5 lines
+ try explorer.indexFile("doc1.txt", "apple filler\nfiller filler\nfiller filler\nfiller filler\nfiller filler");
+ // doc2: apple on 5 of 5 lines (max tf)
+ try explorer.indexFile("doc2.txt", "apple filler\napple filler\napple filler\napple filler\napple filler");
+ // doc3: apple on 2 of 5 lines
+ try explorer.indexFile("doc3.txt", "apple filler\napple filler\nfiller filler\nfiller filler\nfiller filler");
+
+ const results = try explorer.searchContentRanked("apple", testing.allocator, 10);
+ defer {
+ for (results) |r| {
+ testing.allocator.free(r.line_text);
+ testing.allocator.free(r.path);
+ }
+ testing.allocator.free(results);
+ }
+
+ try testing.expectEqual(@as(usize, 3), results.len);
+ try testing.expectEqualStrings("doc2.txt", results[0].path);
+ try testing.expectEqualStrings("doc3.txt", results[1].path);
+ try testing.expectEqualStrings("doc1.txt", results[2].path);
+ try testing.expect(results[0].score > results[1].score);
+ try testing.expect(results[1].score > results[2].score);
+}
+
+test "bm25-recall-b: both-terms doc beats high-tf single-term doc" {
+ // doc1 has apple+banana (both query terms, one occurrence each).
+ // doc2 has only apple, repeated 3x on a single line -- per-line dedup
+ // caps its tf at 1, so repetition alone cannot outrank term coverage.
+ // doc3 has only banana, once.
+ // BM25 sums idf*tf_norm per term: doc1 accumulates two idf contributions
+ // while doc2 only gets one -- doc1 must rank first.
+ var arena = std.heap.ArenaAllocator.init(testing.allocator);
+ defer arena.deinit();
+ var explorer = Explorer.init(arena.allocator());
+
+ try explorer.indexFile("doc1.txt", "apple banana cherry");
+ try explorer.indexFile("doc2.txt", "apple apple apple");
+ try explorer.indexFile("doc3.txt", "banana date elderberry");
+
+ const results = try explorer.searchContentRanked("apple banana", testing.allocator, 10);
+ defer {
+ for (results) |r| {
+ testing.allocator.free(r.line_text);
+ testing.allocator.free(r.path);
+ }
+ testing.allocator.free(results);
+ }
+
+ try testing.expect(results.len >= 2);
+ try testing.expectEqualStrings("doc1.txt", results[0].path);
+ try testing.expect(results[0].score > 0.0);
+ var doc2_score: f32 = -1.0;
+ for (results) |r| {
+ if (std.mem.eql(u8, r.path, "doc2.txt")) {
+ doc2_score = r.score;
+ break;
+ }
+ }
+ if (doc2_score >= 0.0) {
+ try testing.expect(results[0].score > doc2_score);
+ }
+}
+
+test "bm25-recall-c: df-saturation -- ubiquitous term has near-zero idf" {
+ // "the" appears in all 11 docs -> idf near zero, barely contributes.
+ // "unique_marker" appears only in special.txt -> high idf, special.txt ranks first. 
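+ // Worked idf values with N=11 (natural log, +1-smoothed formula):
+ // idf("the") = log(1 + (11-11+0.5)/(11+0.5)) ≈ 0.04
+ // idf("unique_marker") = log(1 + (11-1+0.5)/(1+0.5)) = log(8) ≈ 2.08
+ // The "unique"/"marker" subtokens also appear only in special.txt, widening the margin.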
+ var arena = std.heap.ArenaAllocator.init(testing.allocator); + defer arena.deinit(); + var explorer = Explorer.init(arena.allocator()); + + try explorer.indexFile("d1.txt", "the quick brown fox"); + try explorer.indexFile("d2.txt", "the lazy dog jumps"); + try explorer.indexFile("d3.txt", "the sun rises east"); + try explorer.indexFile("d4.txt", "the moon shines bright"); + try explorer.indexFile("d5.txt", "the rain in spain"); + try explorer.indexFile("d6.txt", "the cat sat mat"); + try explorer.indexFile("d7.txt", "the wind blows cold"); + try explorer.indexFile("d8.txt", "the tide comes in"); + try explorer.indexFile("d9.txt", "the stars align now"); + try explorer.indexFile("d10.txt", "the clock ticks forward"); + try explorer.indexFile("special.txt", "the unique_marker is here"); + + const results = try explorer.searchContentRanked("the unique_marker", testing.allocator, 20); + defer { + for (results) |r| { + testing.allocator.free(r.line_text); + testing.allocator.free(r.path); + } + testing.allocator.free(results); + } + + try testing.expect(results.len > 0); + try testing.expectEqualStrings("special.txt", results[0].path); + if (results.len > 1) { + try testing.expect(results[0].score > results[1].score); + } +} + +test "bm25-recall-d: length normalization favors shorter doc" { + // short.txt: 5 tokens, one "needle". + // long.txt: ~50 tokens, one "needle". + // BM25 with b=0.75 penalizes longer docs; short.txt must rank higher. + var arena = std.heap.ArenaAllocator.init(testing.allocator); + defer arena.deinit(); + var explorer = Explorer.init(arena.allocator()); + + try explorer.indexFile("short.txt", "needle alpha beta gamma delta"); + try explorer.indexFile("long.txt", + "aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss tt uu vv ww xx yy zz " ++ + "aa bb cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss tt uu vv ww xx needle yy zz" + ); + + const results = try explorer.searchContentRanked("needle", testing.allocator, 10); + defer { + for (results) |r| { + testing.allocator.free(r.line_text); + testing.allocator.free(r.path); + } + testing.allocator.free(results); + } + + try testing.expectEqual(@as(usize, 2), results.len); + try testing.expectEqualStrings("short.txt", results[0].path); + try testing.expect(results[0].score > results[1].score); +} + +test "bm25-recall-e: empty and pathological queries return empty without crash" { + var arena = std.heap.ArenaAllocator.init(testing.allocator); + defer arena.deinit(); + var explorer = Explorer.init(arena.allocator()); + + try explorer.indexFile("file.txt", "some content here"); + + { + const r = try explorer.searchContentRanked("", testing.allocator, 10); + defer testing.allocator.free(r); + try testing.expectEqual(@as(usize, 0), r.len); + } + { + const r = try explorer.searchContentRanked(" ", testing.allocator, 10); + defer testing.allocator.free(r); + try testing.expectEqual(@as(usize, 0), r.len); + } + { + const r = try explorer.searchContentRanked("nonexistent_xyz_term_99", testing.allocator, 10); + defer testing.allocator.free(r); + try testing.expectEqual(@as(usize, 0), r.len); + } +} + +test "bm25-stress: 1000-doc index, common token, max_results cap honored" { + var explorer = Explorer.init(testing.allocator); + defer explorer.deinit(); + + var path_buf: [64]u8 = undefined; + var content_buf: [256]u8 = undefined; + for (0..1000) |i| { + const path = std.fmt.bufPrint(&path_buf, "stress/doc{d}.txt", .{i}) catch unreachable; + const content = std.fmt.bufPrint(&content_buf, + "common token alpha beta gamma doc{d} 
extra filler words here now", .{i} + ) catch unreachable; + try explorer.indexFile(path, content); + } + + const cap = 25; + const results = try explorer.searchContentRanked("common", testing.allocator, cap); + defer { + for (results) |r| { + testing.allocator.free(r.line_text); + testing.allocator.free(r.path); + } + testing.allocator.free(results); + } + + try testing.expect(results.len <= cap); + try testing.expect(results.len > 0); + for (results) |r| { + try testing.expect(r.score > 0.0); + } + for (1..results.len) |i| { + try testing.expect(results[i - 1].score >= results[i].score); + } +} + +test "bm25-state-sync: re-index and remove update total_tokens correctly" { + var explorer = Explorer.init(testing.allocator); + defer explorer.deinit(); + + try explorer.indexFile("sync.txt", "alpha beta gamma delta epsilon"); + try testing.expectEqual(@as(u64, 5), explorer.word_index.total_tokens); + + try explorer.indexFile("sync.txt", "alpha beta"); + try testing.expectEqual(@as(u64, 2), explorer.word_index.total_tokens); + + explorer.removeFile("sync.txt"); + try testing.expectEqual(@as(u64, 0), explorer.word_index.total_tokens); +} + +test "bm25-persistence: writeToDisk/readFromDisk preserves total_tokens and doc_lengths" { + const alloc = testing.allocator; + var wi = WordIndex.init(alloc); + defer wi.deinit(); + + try wi.indexFile("low.txt", "needle filler filler filler filler filler filler filler filler filler"); + try wi.indexFile("high.txt", "needle needle needle filler"); + try wi.indexFile("none.txt", "filler filler filler filler"); + + const pre_total = wi.total_tokens; + const pre_low_len = wi.docLength(wi.path_to_id.get("low.txt") orelse 0); + const pre_high_len = wi.docLength(wi.path_to_id.get("high.txt") orelse 0); + + var tmp = testing.tmpDir(.{}); + defer tmp.cleanup(); + var path_buf: [std.fs.max_path_bytes]u8 = undefined; + const dir_path_len = try tmp.dir.realPathFile(io, ".", &path_buf); + const dir_path = path_buf[0..dir_path_len]; + + try wi.writeToDisk(io, dir_path, null); + + const maybe_loaded = WordIndex.readFromDisk(io, dir_path, alloc); + try testing.expect(maybe_loaded != null); + var loaded = maybe_loaded.?; + defer loaded.deinit(); + + try testing.expectEqual(pre_total, loaded.total_tokens); + + const post_low_id = loaded.path_to_id.get("low.txt") orelse { + try testing.expect(false); + return; + }; + const post_high_id = loaded.path_to_id.get("high.txt") orelse { + try testing.expect(false); + return; + }; + try testing.expectEqual(pre_low_len, loaded.docLength(post_low_id)); + try testing.expectEqual(pre_high_len, loaded.docLength(post_high_id)); + + const hits = try loaded.searchDeduped("needle", alloc); + defer alloc.free(hits); + try testing.expect(hits.len >= 2); + + var saw_high = false; + var saw_low = false; + for (hits) |h| { + const p = loaded.hitPath(h); + if (std.mem.eql(u8, p, "high.txt")) saw_high = true; + if (std.mem.eql(u8, p, "low.txt")) saw_low = true; + } + try testing.expect(saw_high); + try testing.expect(saw_low); + + // Post-roundtrip ranked search must still work and return hits for "needle". 
+ var wi2 = WordIndex.init(alloc); + defer wi2.deinit(); + try wi2.indexFile("low.txt", "needle filler filler filler filler filler filler filler filler filler"); + try wi2.indexFile("high.txt", "needle needle needle filler"); + try wi2.indexFile("none.txt", "filler filler filler filler"); + + const low_id_orig = wi2.path_to_id.get("low.txt") orelse 0; + const high_id_orig = wi2.path_to_id.get("high.txt") orelse 0; + try testing.expectEqual(pre_low_len, wi2.docLength(low_id_orig)); + try testing.expectEqual(pre_high_len, wi2.docLength(high_id_orig)); + try testing.expectEqual(pre_total, wi2.total_tokens); +}