181 changes: 181 additions & 0 deletions src/explore.zig
@@ -1690,6 +1690,187 @@ pub const Explorer = struct {
return result_list.toOwnedSlice(allocator);
}

/// BM25-ranked content search. Tokenizes the query the same way the word
/// index tokenizes documents, scores each candidate doc with BM25
/// (k1=1.2, b=0.75), and emits one SearchResult for each of the top-N
/// documents, using the line with the most query-term hits as the
/// representative line. The existing scan-order `searchContent` is unaffected.
pub fn searchContentRanked(self: *Explorer, query: []const u8, allocator: std.mem.Allocator, max_results: usize) ![]const SearchResult {
self.mu.lockShared();
defer self.mu.unlockShared();

if (max_results == 0) return try allocator.alloc(SearchResult, 0);

// Tokenize the query the same way WordIndex tokenizes documents:
// lowercase + identifier-split. Dedupe terms so repeated query words
// don't double-count.
var term_arena = std.heap.ArenaAllocator.init(allocator);
defer term_arena.deinit();
const ta = term_arena.allocator();

var terms_set = std.StringHashMap(void).init(ta);
var raw_tok = idx.WordTokenizer{ .buf = query };
while (raw_tok.next()) |word| {
if (word.len < 2) continue;
const lower = try ta.alloc(u8, word.len);
for (word, 0..) |c, j| lower[j] = idx.normalizeChar(c);
_ = try terms_set.getOrPut(lower);

var needs_split: bool = false;
if (word.len >= 4) {
for (word) |c| {
if (c == '_' or (c >= 'A' and c <= 'Z')) {
needs_split = true;
break;
}
}
}
if (needs_split) {
var sub_toks: std.ArrayList([]const u8) = .empty;
defer sub_toks.deinit(ta);
idx.splitIdentifier(word, &sub_toks, ta) catch continue;
for (sub_toks.items) |sub| {
if (sub.len < 2) continue;
_ = try terms_set.getOrPut(sub);
}
}
}
if (terms_set.count() == 0) return try allocator.alloc(SearchResult, 0);

// BM25 constants.
const k1: f32 = 1.2;
const b: f32 = 0.75;
const N = self.word_index.rankedDocCount();
if (N == 0) return try allocator.alloc(SearchResult, 0);
const avgdl = self.word_index.avgDocLength();

// Aggregate scores per doc and remember the best line (max term hits)
// for each candidate.
const DocAgg = struct {
score: f32,
best_line: u32,
best_line_hits: u32,
};
var per_doc = std.AutoHashMap(u32, DocAgg).init(ta);

// For each unique query term, look up its posting list once,
// compute df and per-doc tf in a single pass.
var term_iter = terms_set.keyIterator();
while (term_iter.next()) |term_ptr| {
const term = term_ptr.*;
const hits = self.word_index.search(term);
if (hits.len == 0) continue;

// df: distinct doc_ids in this posting list. tf: number of postings for
// the (term, doc) pair (each posting is a distinct line, per indexFile's dedup).
// doc_best_line: per-doc smallest line_num plus hit count, for best-line picking.
var doc_tf = std.AutoHashMap(u32, u32).init(ta);
var doc_best_line = std.AutoHashMap(u32, struct { line: u32, count: u32 }).init(ta);
for (hits) |h| {
const tf_gop = try doc_tf.getOrPut(h.doc_id);
if (!tf_gop.found_existing) tf_gop.value_ptr.* = 0;
tf_gop.value_ptr.* += 1;

const ln_gop = try doc_best_line.getOrPut(h.doc_id);
if (!ln_gop.found_existing) {
ln_gop.value_ptr.* = .{ .line = h.line_num, .count = 1 };
} else {
// Each posting is a distinct line; still, prefer the
// smallest line_num as a deterministic representative.
if (h.line_num < ln_gop.value_ptr.line) {
ln_gop.value_ptr.line = h.line_num;
}
ln_gop.value_ptr.count += 1;
}
}
const df: u32 = @intCast(doc_tf.count());
// BM25 idf with the +1 smoothing variant: log(1 + (N - df + 0.5)/(df + 0.5))
const num: f32 = @as(f32, @floatFromInt(N)) - @as(f32, @floatFromInt(df)) + 0.5;
const den: f32 = @as(f32, @floatFromInt(df)) + 0.5;
const idf: f32 = @log(1.0 + num / den);

var tf_iter = doc_tf.iterator();
while (tf_iter.next()) |entry| {
const doc_id = entry.key_ptr.*;
const tf: f32 = @floatFromInt(entry.value_ptr.*);
const dl_raw = self.word_index.docLength(doc_id);
const dl: f32 = if (dl_raw == 0) 1.0 else @floatFromInt(dl_raw);
const norm = 1.0 - b + b * (dl / avgdl);
const term_score = idf * (tf * (k1 + 1.0)) / (tf + k1 * norm);

const ln_info = doc_best_line.get(doc_id) orelse continue;
const agg_gop = try per_doc.getOrPut(doc_id);
if (!agg_gop.found_existing) {
agg_gop.value_ptr.* = .{
.score = term_score,
.best_line = ln_info.line,
.best_line_hits = ln_info.count,
};
} else {
agg_gop.value_ptr.score += term_score;
if (ln_info.count > agg_gop.value_ptr.best_line_hits or
(ln_info.count == agg_gop.value_ptr.best_line_hits and ln_info.line < agg_gop.value_ptr.best_line))
{
agg_gop.value_ptr.best_line = ln_info.line;
agg_gop.value_ptr.best_line_hits = ln_info.count;
}
}
}
}
if (per_doc.count() == 0) return try allocator.alloc(SearchResult, 0);

const Cand = struct { doc_id: u32, score: f32, best_line: u32 };
var cands: std.ArrayList(Cand) = .empty;
defer cands.deinit(ta);
try cands.ensureTotalCapacity(ta, per_doc.count());
var pd_iter = per_doc.iterator();
while (pd_iter.next()) |entry| {
cands.appendAssumeCapacity(.{
.doc_id = entry.key_ptr.*,
.score = entry.value_ptr.score,
.best_line = entry.value_ptr.best_line,
});
}
std.sort.block(Cand, cands.items, {}, struct {
pub fn lt(_: void, a: Cand, b_: Cand) bool {
if (a.score != b_.score) return a.score > b_.score;
return a.doc_id < b_.doc_id;
}
}.lt);

var result_list: std.ArrayList(SearchResult) = .empty;
errdefer {
for (result_list.items) |r| {
allocator.free(r.line_text);
allocator.free(r.path);
}
result_list.deinit(allocator);
}
try result_list.ensureTotalCapacity(allocator, @min(max_results, cands.items.len));

for (cands.items) |c| {
if (result_list.items.len >= max_results) break;
const path = self.word_index.id_to_path.items[c.doc_id];
if (path.len == 0) continue;
const ref = self.readContentForSearch(path, allocator) orelse continue;
defer ref.deinit();
const line_text = extractLineByNumber(ref.data, c.best_line) orelse continue;
const duped_text = try allocator.dupe(u8, line_text);
errdefer allocator.free(duped_text);
const duped_path = try allocator.dupe(u8, path);
errdefer allocator.free(duped_path);
try result_list.append(allocator, .{
.path = duped_path,
.line_num = c.best_line,
.line_text = duped_text,
.score = c.score,
});
}

return result_list.toOwnedSlice(allocator);
}
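
For readers who want to sanity-check the math, here is a minimal standalone Zig sketch of the per-term score used above (k1 = 1.2, b = 0.75, idf = log(1 + (N - df + 0.5)/(df + 0.5))). The corpus numbers (N, df, tf, and document lengths) are made up purely for illustration:

const std = @import("std");

/// Mirrors the per-term contribution used in searchContentRanked:
/// idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * dl / avgdl)).
fn bm25Term(tf: f32, df: f32, n_docs: f32, dl: f32, avgdl: f32) f32 {
    const k1: f32 = 1.2;
    const b: f32 = 0.75;
    const idf = @log(1.0 + (n_docs - df + 0.5) / (df + 0.5));
    const norm = 1.0 - b + b * (dl / avgdl);
    return idf * (tf * (k1 + 1.0)) / (tf + k1 * norm);
}

pub fn main() void {
    // Toy corpus: 1000 docs, average length 120 tokens. A rare term
    // (df = 10) appearing 3 times in a 200-token doc outscores a common
    // term (df = 600) with the same tf, because idf dominates.
    const rare = bm25Term(3.0, 10.0, 1000.0, 200.0, 120.0);
    const common = bm25Term(3.0, 600.0, 1000.0, 200.0, 120.0);
    std.debug.print("rare={d:.3} common={d:.3}\n", .{ rare, common });
}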


/// Search file contents using a regex pattern with trigram acceleration.
/// Decomposes the regex to extract literal trigrams for candidate filtering,
/// then does actual regex matching on candidates.
88 changes: 85 additions & 3 deletions src/index.zig
@@ -20,6 +20,10 @@ pub const WordIndex = struct {
enabled: bool = true,
path_to_id: std.StringHashMap(u32),
id_to_path: std.ArrayList([]const u8),
/// doc_id → number of tokens indexed for that doc (BM25 length normalization).
doc_lengths: std.AutoHashMap(u32, u32),
/// Sum of all values in doc_lengths.
total_tokens: u64 = 0,

pub fn hitPath(self: *const WordIndex, hit: WordHit) []const u8 {
if (hit.doc_id < self.id_to_path.items.len) return self.id_to_path.items[hit.doc_id];
@@ -41,6 +45,8 @@
.allocator = allocator,
.path_to_id = std.StringHashMap(u32).init(allocator),
.id_to_path = .empty,
.doc_lengths = std.AutoHashMap(u32, u32).init(allocator),
.total_tokens = 0,
};
}

@@ -69,6 +75,7 @@

self.path_to_id.deinit();
self.id_to_path.deinit(self.allocator);
self.doc_lengths.deinit();
}

/// Remove all index entries for a file (call before re-indexing).
@@ -86,6 +93,9 @@
if (doc_id < self.id_to_path.items.len) {
self.id_to_path.items[doc_id] = "";
}
if (self.doc_lengths.fetchRemove(doc_id)) |kv| {
self.total_tokens -= kv.value;
}
defer {
self.allocator.free(words_slice);
self.allocator.free(stable_path);
@@ -148,8 +158,14 @@
// Clean up old entries first
self.removeFile(path);

const stable_path = try self.allocator.dupe(u8, path);
errdefer self.allocator.free(stable_path);
// If the path is already tracked (e.g. skip_file_words=true and removeFile
// early-exited), reuse the existing stable copy rather than leaking a new dup.
const stable_path = if (self.path_to_id.contains(path))
path
else
try self.allocator.dupe(u8, path);
const owned_path = stable_path.ptr != path.ptr;
errdefer if (owned_path) self.allocator.free(stable_path);

const doc_id = try self.getOrCreateDocId(stable_path);

@@ -162,12 +178,14 @@
var words_set = std.StringHashMap(void).init(words_arena.allocator());
var line_num: u32 = 0;
var lines = std.mem.splitScalar(u8, content, '\n');
var doc_token_count: u32 = 0;

while (lines.next()) |line| {
line_num += 1;
var tok = WordTokenizer{ .buf = line };
while (tok.next()) |word| {
if (word.len < 2) continue;
doc_token_count +|= 1;

const aa = words_arena.allocator();

@@ -226,6 +244,12 @@
try self.file_words.put(stable_path, compact);
}
words_set.deinit();

if (self.doc_lengths.get(doc_id)) |old_len| {
self.total_tokens -%= old_len;
}
try self.doc_lengths.put(doc_id, doc_token_count);
self.total_tokens += doc_token_count;
}

/// Look up all hits for a word. O(1) lookup + O(hits) iteration.
@@ -311,6 +335,24 @@
return @intCast(self.file_words.count());
}

/// BM25 helper: number of docs the ranker can see (source of truth regardless of skip_file_words).
pub fn rankedDocCount(self: *const WordIndex) u32 {
return @intCast(self.doc_lengths.count());
}

/// BM25 helper: number of indexed tokens in a doc, or 0 if unknown.
pub fn docLength(self: *const WordIndex, doc_id: u32) u32 {
return self.doc_lengths.get(doc_id) orelse 0;
}

/// BM25 helper: average doc length over docs that have a recorded length.
/// Returns 1.0 when no docs are tracked, so callers can divide safely.
pub fn avgDocLength(self: *const WordIndex) f32 {
const n = self.doc_lengths.count();
if (n == 0) return 1.0;
return @as(f32, @floatFromInt(self.total_tokens)) / @as(f32, @floatFromInt(n));
}

/// Shrink all hit lists and per-file word sets to release excess capacity.
pub fn shrinkAllocations(self: *WordIndex) void {
var iter = self.index.iterator();
@@ -327,7 +369,7 @@
};

const DISK_MAGIC = [4]u8{ 'C', 'D', 'B', 'W' };
const DISK_FORMAT_VERSION: u16 = 2;
const DISK_FORMAT_VERSION: u16 = 3;

pub fn writeToDisk(self: *WordIndex, io: std.Io, dir_path: []const u8, git_head: ?[40]u8) !void {
var file_table: std.ArrayList([]const u8) = .empty;
@@ -424,6 +466,27 @@
try writer.interface.writeAll(&hit_buf);
}
}

// v3 trailer: per-doc length table for BM25. One u32 length per file-table
// entry, written in file-table (disk-id) order; total_tokens follows as a u64.
var dl_count_buf: [4]u8 = undefined;
std.mem.writeInt(u32, &dl_count_buf, @intCast(file_table.items.len), .little);
try writer.interface.writeAll(&dl_count_buf);
for (file_table.items) |path| {
const in_mem_id = self.path_to_id.get(path) orelse {
var z: [4]u8 = .{ 0, 0, 0, 0 };
try writer.interface.writeAll(&z);
continue;
};
const len = self.doc_lengths.get(in_mem_id) orelse 0;
var lb: [4]u8 = undefined;
std.mem.writeInt(u32, &lb, len, .little);
try writer.interface.writeAll(&lb);
}
var tt_buf: [8]u8 = undefined;
std.mem.writeInt(u64, &tt_buf, self.total_tokens, .little);
try writer.interface.writeAll(&tt_buf);

try writer.interface.flush();

try std.Io.Dir.cwd().rename(tmp_path, std.Io.Dir.cwd(), final_path, io);
@@ -535,14 +598,33 @@
gop.value_ptr.* = hits;
}

// v3 trailer: per-doc length table.
if (pos + 4 > data.len) return null;
const dl_count = std.mem.readInt(u32, data[pos..][0..4], .little);
pos += 4;
if (dl_count != file_count) return null;
if (pos + dl_count * 4 + 8 > data.len) return null;
const dl_values = try allocator.alloc(u32, dl_count);
defer allocator.free(dl_values);
for (0..dl_count) |i| {
dl_values[i] = std.mem.readInt(u32, data[pos..][0..4], .little);
pos += 4;
}
const total_tokens_loaded = std.mem.readInt(u64, data[pos..][0..8], .little);
pos += 8;

if (pos != data.len) return null;

// Populate path_to_id and id_to_path from file_paths
try result.id_to_path.ensureTotalCapacity(allocator, file_count);
for (0..file_count) |i| {
result.id_to_path.appendAssumeCapacity(file_paths[i]);
try result.path_to_id.put(file_paths[i], @intCast(i));
if (dl_values[i] > 0) {
try result.doc_lengths.put(@intCast(i), dl_values[i]);
}
}
result.total_tokens = total_tokens_loaded;

// Compact tmp_file_words HashMaps into slices for result.file_words
var tfw_iter = tmp_file_words.iterator();
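
For reference, a minimal standalone sketch of the v3 trailer layout that writeToDisk appends and the loader reads back: a little-endian u32 count (expected to match the file-table length), then one u32 length per file-table entry, then total_tokens as a u64. The document lengths below are made up for illustration:

const std = @import("std");

pub fn main() void {
    // Toy trailer for three docs with lengths 12, 0, 47.
    const lengths = [_]u32{ 12, 0, 47 };

    // Encode: u32 count, count * u32 lengths, u64 total — all little-endian,
    // mirroring the v3 trailer written by writeToDisk.
    var buf: [4 + lengths.len * 4 + 8]u8 = undefined;
    var pos: usize = 0;
    std.mem.writeInt(u32, buf[pos..][0..4], @intCast(lengths.len), .little);
    pos += 4;
    var total: u64 = 0;
    for (lengths) |l| {
        std.mem.writeInt(u32, buf[pos..][0..4], l, .little);
        pos += 4;
        total += l;
    }
    std.mem.writeInt(u64, buf[pos..][0..8], total, .little);

    // Decode, mirroring the loader's reads.
    var rpos: usize = 0;
    const count = std.mem.readInt(u32, buf[rpos..][0..4], .little);
    rpos += 4;
    var sum: u64 = 0;
    for (0..count) |_| {
        sum += std.mem.readInt(u32, buf[rpos..][0..4], .little);
        rpos += 4;
    }
    const total_tokens = std.mem.readInt(u64, buf[rpos..][0..8], .little);
    std.debug.print("docs={d} sum={d} total={d}\n", .{ count, sum, total_tokens });
}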