181 changes: 181 additions & 0 deletions src/explore.zig
@@ -1690,6 +1690,187 @@ pub const Explorer = struct {
return result_list.toOwnedSlice(allocator);
}

/// BM25-ranked content search. Tokenizes the query the same way the word
/// index tokenizes documents, scores each candidate doc with BM25
/// (k1=1.2, b=0.75), and emits one SearchResult for each of the top-N
/// documents, using the line with the most query-term hits as the
/// representative line. The existing scan-order `searchContent` is unaffected.
pub fn searchContentRanked(self: *Explorer, query: []const u8, allocator: std.mem.Allocator, max_results: usize) ![]const SearchResult {
self.mu.lockShared();
defer self.mu.unlockShared();

if (max_results == 0) return try allocator.alloc(SearchResult, 0);

// Tokenize the query the same way WordIndex tokenizes documents:
// lowercase + identifier-split. Dedupe terms so repeated query words
// don't double-count.
var term_arena = std.heap.ArenaAllocator.init(allocator);
defer term_arena.deinit();
const ta = term_arena.allocator();

var terms_set = std.StringHashMap(void).init(ta);
var raw_tok = idx.WordTokenizer{ .buf = query };
while (raw_tok.next()) |word| {
if (word.len < 2) continue;
const lower = try ta.alloc(u8, word.len);
for (word, 0..) |c, j| lower[j] = idx.normalizeChar(c);
_ = try terms_set.getOrPut(lower);

var needs_split: bool = false;
if (word.len >= 4) {
for (word) |c| {
if (c == '_' or (c >= 'A' and c <= 'Z')) {
needs_split = true;
break;
}
}
}
if (needs_split) {
var sub_toks: std.ArrayList([]const u8) = .empty;
defer sub_toks.deinit(ta);
idx.splitIdentifier(word, &sub_toks, ta) catch continue;
for (sub_toks.items) |sub| {
if (sub.len < 2) continue;
_ = try terms_set.getOrPut(sub);
}
}
}
if (terms_set.count() == 0) return try allocator.alloc(SearchResult, 0);

// BM25 constants.
const k1: f32 = 1.2;
const b: f32 = 0.75;
const N = self.word_index.rankedDocCount();
if (N == 0) return try allocator.alloc(SearchResult, 0);
const avgdl = self.word_index.avgDocLength();

// Aggregate scores per doc and remember the best line (max term hits)
// for each candidate.
const DocAgg = struct {
score: f32,
best_line: u32,
best_line_hits: u32,
};
var per_doc = std.AutoHashMap(u32, DocAgg).init(ta);

// For each unique query term, look up its posting list once,
// compute df and per-doc tf in a single pass.
var term_iter = terms_set.keyIterator();
while (term_iter.next()) |term_ptr| {
const term = term_ptr.*;
const hits = self.word_index.search(term);
if (hits.len == 0) continue;

// df: distinct doc_ids in this posting list. tf: number of postings for
// the (term, doc) pair (each posting is a distinct line, per indexFile's dedup).
// doc_best_line: per-doc smallest line_num plus hit count, for best-line picking.
var doc_tf = std.AutoHashMap(u32, u32).init(ta);
var doc_best_line = std.AutoHashMap(u32, struct { line: u32, count: u32 }).init(ta);
for (hits) |h| {
const tf_gop = try doc_tf.getOrPut(h.doc_id);
if (!tf_gop.found_existing) tf_gop.value_ptr.* = 0;
tf_gop.value_ptr.* += 1;

const ln_gop = try doc_best_line.getOrPut(h.doc_id);
if (!ln_gop.found_existing) {
ln_gop.value_ptr.* = .{ .line = h.line_num, .count = 1 };
} else {
// Each posting is a distinct line; still, prefer the
// smallest line_num as a deterministic representative.
if (h.line_num < ln_gop.value_ptr.line) {
ln_gop.value_ptr.line = h.line_num;
}
ln_gop.value_ptr.count += 1;
}
}
const df: u32 = @intCast(doc_tf.count());
// BM25 idf with the +1 smoothing variant: log(1 + (N - df + 0.5)/(df + 0.5))
const num: f32 = @as(f32, @floatFromInt(N)) - @as(f32, @floatFromInt(df)) + 0.5;
const den: f32 = @as(f32, @floatFromInt(df)) + 0.5;
const idf: f32 = @log(1.0 + num / den);

var tf_iter = doc_tf.iterator();
while (tf_iter.next()) |entry| {
const doc_id = entry.key_ptr.*;
const tf: f32 = @floatFromInt(entry.value_ptr.*);
const dl_raw = self.word_index.docLength(doc_id);
const dl: f32 = if (dl_raw == 0) 1.0 else @floatFromInt(dl_raw);
const norm = 1.0 - b + b * (dl / avgdl);
const term_score = idf * (tf * (k1 + 1.0)) / (tf + k1 * norm);

const ln_info = doc_best_line.get(doc_id) orelse continue;
const agg_gop = try per_doc.getOrPut(doc_id);
if (!agg_gop.found_existing) {
agg_gop.value_ptr.* = .{
.score = term_score,
.best_line = ln_info.line,
.best_line_hits = ln_info.count,
};
} else {
agg_gop.value_ptr.score += term_score;
if (ln_info.count > agg_gop.value_ptr.best_line_hits or
(ln_info.count == agg_gop.value_ptr.best_line_hits and ln_info.line < agg_gop.value_ptr.best_line))
{
agg_gop.value_ptr.best_line = ln_info.line;
agg_gop.value_ptr.best_line_hits = ln_info.count;
}
}
}
}
if (per_doc.count() == 0) return try allocator.alloc(SearchResult, 0);

const Cand = struct { doc_id: u32, score: f32, best_line: u32 };
var cands: std.ArrayList(Cand) = .empty;
defer cands.deinit(ta);
try cands.ensureTotalCapacity(ta, per_doc.count());
var pd_iter = per_doc.iterator();
while (pd_iter.next()) |entry| {
cands.appendAssumeCapacity(.{
.doc_id = entry.key_ptr.*,
.score = entry.value_ptr.score,
.best_line = entry.value_ptr.best_line,
});
}
std.sort.block(Cand, cands.items, {}, struct {
pub fn lt(_: void, a: Cand, b_: Cand) bool {
if (a.score != b_.score) return a.score > b_.score;
return a.doc_id < b_.doc_id;
}
}.lt);

var result_list: std.ArrayList(SearchResult) = .empty;
errdefer {
for (result_list.items) |r| {
allocator.free(r.line_text);
allocator.free(r.path);
}
result_list.deinit(allocator);
}
try result_list.ensureTotalCapacity(allocator, @min(max_results, cands.items.len));

for (cands.items) |c| {
if (result_list.items.len >= max_results) break;
const path = self.word_index.id_to_path.items[c.doc_id];
if (path.len == 0) continue;
const ref = self.readContentForSearch(path, allocator) orelse continue;
defer ref.deinit();
const line_text = extractLineByNumber(ref.data, c.best_line) orelse continue;
const duped_text = try allocator.dupe(u8, line_text);
errdefer allocator.free(duped_text);
const duped_path = try allocator.dupe(u8, path);
errdefer allocator.free(duped_path);
try result_list.append(allocator, .{
.path = duped_path,
.line_num = c.best_line,
.line_text = duped_text,
.score = c.score,
});
}

return result_list.toOwnedSlice(allocator);
}
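
For readers who want to sanity-check the math, here is a minimal standalone Zig sketch of the per-term score used above (k1 = 1.2, b = 0.75, idf = log(1 + (N - df + 0.5)/(df + 0.5))). The corpus numbers (N, df, tf, and document lengths) are made up purely for illustration:

const std = @import("std");

/// Mirrors the per-term contribution used in searchContentRanked:
/// idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * dl / avgdl)).
fn bm25Term(tf: f32, df: f32, n_docs: f32, dl: f32, avgdl: f32) f32 {
    const k1: f32 = 1.2;
    const b: f32 = 0.75;
    const idf = @log(1.0 + (n_docs - df + 0.5) / (df + 0.5));
    const norm = 1.0 - b + b * (dl / avgdl);
    return idf * (tf * (k1 + 1.0)) / (tf + k1 * norm);
}

pub fn main() void {
    // Toy corpus: 1000 docs, average length 120 tokens. A rare term
    // (df = 10) appearing 3 times in a 200-token doc outscores a common
    // term (df = 600) with the same tf, because idf dominates.
    const rare = bm25Term(3.0, 10.0, 1000.0, 200.0, 120.0);
    const common = bm25Term(3.0, 600.0, 1000.0, 200.0, 120.0);
    std.debug.print("rare={d:.3} common={d:.3}\n", .{ rare, common });
}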


/// Search file contents using a regex pattern with trigram acceleration.
/// Decomposes the regex to extract literal trigrams for candidate filtering,
/// then does actual regex matching on candidates.
88 changes: 85 additions & 3 deletions src/index.zig
@@ -20,6 +20,10 @@ pub const WordIndex = struct {
enabled: bool = true,
path_to_id: std.StringHashMap(u32),
id_to_path: std.ArrayList([]const u8),
/// doc_id → number of tokens indexed for that doc (BM25 length normalization).
doc_lengths: std.AutoHashMap(u32, u32),
/// Sum of all values in doc_lengths.
total_tokens: u64 = 0,

pub fn hitPath(self: *const WordIndex, hit: WordHit) []const u8 {
if (hit.doc_id < self.id_to_path.items.len) return self.id_to_path.items[hit.doc_id];
@@ -41,6 +45,8 @@
.allocator = allocator,
.path_to_id = std.StringHashMap(u32).init(allocator),
.id_to_path = .empty,
.doc_lengths = std.AutoHashMap(u32, u32).init(allocator),
.total_tokens = 0,
};
}

@@ -69,6 +75,7 @@

self.path_to_id.deinit();
self.id_to_path.deinit(self.allocator);
self.doc_lengths.deinit();
}

/// Remove all index entries for a file (call before re-indexing).
@@ -86,6 +93,9 @@
if (doc_id < self.id_to_path.items.len) {
self.id_to_path.items[doc_id] = "";
}
if (self.doc_lengths.fetchRemove(doc_id)) |kv| {
self.total_tokens -= kv.value;
}
defer {
self.allocator.free(words_slice);
self.allocator.free(stable_path);
@@ -148,8 +158,14 @@
// Clean up old entries first
self.removeFile(path);

const stable_path = try self.allocator.dupe(u8, path);
errdefer self.allocator.free(stable_path);
// If the path is already tracked (e.g. skip_file_words=true and removeFile
// early-exited), reuse the existing stable copy rather than leaking a new dup.
const stable_path = if (self.path_to_id.contains(path))
path
else
try self.allocator.dupe(u8, path);
const owned_path = stable_path.ptr != path.ptr;
errdefer if (owned_path) self.allocator.free(stable_path);

const doc_id = try self.getOrCreateDocId(stable_path);

@@ -162,12 +178,14 @@
var words_set = std.StringHashMap(void).init(words_arena.allocator());
var line_num: u32 = 0;
var lines = std.mem.splitScalar(u8, content, '\n');
var doc_token_count: u32 = 0;

while (lines.next()) |line| {
line_num += 1;
var tok = WordTokenizer{ .buf = line };
while (tok.next()) |word| {
if (word.len < 2) continue;
doc_token_count +|= 1;

const aa = words_arena.allocator();

@@ -226,6 +244,12 @@
try self.file_words.put(stable_path, compact);
}
words_set.deinit();

if (self.doc_lengths.get(doc_id)) |old_len| {
self.total_tokens -%= old_len;
}
try self.doc_lengths.put(doc_id, doc_token_count);
self.total_tokens += doc_token_count;
}

/// Look up all hits for a word. O(1) lookup + O(hits) iteration.
@@ -311,6 +335,24 @@
return @intCast(self.file_words.count());
}

/// BM25 helper: number of docs the ranker can see (source of truth regardless of skip_file_words).
pub fn rankedDocCount(self: *const WordIndex) u32 {
return @intCast(self.doc_lengths.count());
}

/// BM25 helper: number of indexed tokens in a doc, or 0 if unknown.
pub fn docLength(self: *const WordIndex, doc_id: u32) u32 {
return self.doc_lengths.get(doc_id) orelse 0;
}

/// BM25 helper: average doc length over docs that have a recorded length.
/// Returns 1.0 when no docs are tracked, so callers can divide safely.
pub fn avgDocLength(self: *const WordIndex) f32 {
const n = self.doc_lengths.count();
if (n == 0) return 1.0;
return @as(f32, @floatFromInt(self.total_tokens)) / @as(f32, @floatFromInt(n));
}

/// Shrink all hit lists and per-file word sets to release excess capacity.
pub fn shrinkAllocations(self: *WordIndex) void {
var iter = self.index.iterator();
@@ -327,7 +369,7 @@
};

const DISK_MAGIC = [4]u8{ 'C', 'D', 'B', 'W' };
const DISK_FORMAT_VERSION: u16 = 2;
const DISK_FORMAT_VERSION: u16 = 3;

pub fn writeToDisk(self: *WordIndex, io: std.Io, dir_path: []const u8, git_head: ?[40]u8) !void {
var file_table: std.ArrayList([]const u8) = .empty;
@@ -424,6 +466,27 @@
try writer.interface.writeAll(&hit_buf);
}
}

// v3 trailer: per-doc length table for BM25. One u32 length per file-table
// entry, written in file-table (disk-id) order; total_tokens follows as a u64.
var dl_count_buf: [4]u8 = undefined;
std.mem.writeInt(u32, &dl_count_buf, @intCast(file_table.items.len), .little);
try writer.interface.writeAll(&dl_count_buf);
for (file_table.items) |path| {
const in_mem_id = self.path_to_id.get(path) orelse {
var z: [4]u8 = .{ 0, 0, 0, 0 };
try writer.interface.writeAll(&z);
continue;
};
const len = self.doc_lengths.get(in_mem_id) orelse 0;
var lb: [4]u8 = undefined;
std.mem.writeInt(u32, &lb, len, .little);
try writer.interface.writeAll(&lb);
}
var tt_buf: [8]u8 = undefined;
std.mem.writeInt(u64, &tt_buf, self.total_tokens, .little);
try writer.interface.writeAll(&tt_buf);

try writer.interface.flush();

try std.Io.Dir.cwd().rename(tmp_path, std.Io.Dir.cwd(), final_path, io);
@@ -535,14 +598,33 @@
gop.value_ptr.* = hits;
}

// v3 trailer: per-doc length table.
if (pos + 4 > data.len) return null;
const dl_count = std.mem.readInt(u32, data[pos..][0..4], .little);
pos += 4;
if (dl_count != file_count) return null;
if (pos + dl_count * 4 + 8 > data.len) return null;
const dl_values = try allocator.alloc(u32, dl_count);
defer allocator.free(dl_values);
for (0..dl_count) |i| {
dl_values[i] = std.mem.readInt(u32, data[pos..][0..4], .little);
pos += 4;
}
const total_tokens_loaded = std.mem.readInt(u64, data[pos..][0..8], .little);
pos += 8;

if (pos != data.len) return null;

// Populate path_to_id and id_to_path from file_paths
try result.id_to_path.ensureTotalCapacity(allocator, file_count);
for (0..file_count) |i| {
result.id_to_path.appendAssumeCapacity(file_paths[i]);
try result.path_to_id.put(file_paths[i], @intCast(i));
if (dl_values[i] > 0) {
try result.doc_lengths.put(@intCast(i), dl_values[i]);
}
}
result.total_tokens = total_tokens_loaded;

// Compact tmp_file_words HashMaps into slices for result.file_words
var tfw_iter = tmp_file_words.iterator();
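
For reference, a minimal standalone sketch of the v3 trailer layout that writeToDisk appends and the loader reads back: a little-endian u32 count (expected to match the file-table length), then one u32 length per file-table entry, then total_tokens as a u64. The document lengths below are made up for illustration:

const std = @import("std");

pub fn main() void {
    // Toy trailer for three docs with lengths 12, 0, 47.
    const lengths = [_]u32{ 12, 0, 47 };

    // Encode: u32 count, count * u32 lengths, u64 total — all little-endian,
    // mirroring the v3 trailer written by writeToDisk.
    var buf: [4 + lengths.len * 4 + 8]u8 = undefined;
    var pos: usize = 0;
    std.mem.writeInt(u32, buf[pos..][0..4], @intCast(lengths.len), .little);
    pos += 4;
    var total: u64 = 0;
    for (lengths) |l| {
        std.mem.writeInt(u32, buf[pos..][0..4], l, .little);
        pos += 4;
        total += l;
    }
    std.mem.writeInt(u64, buf[pos..][0..8], total, .little);

    // Decode, mirroring the loader's reads.
    var rpos: usize = 0;
    const count = std.mem.readInt(u32, buf[rpos..][0..4], .little);
    rpos += 4;
    var sum: u64 = 0;
    for (0..count) |_| {
        sum += std.mem.readInt(u32, buf[rpos..][0..4], .little);
        rpos += 4;
    }
    const total_tokens = std.mem.readInt(u64, buf[rpos..][0..8], .little);
    std.debug.print("docs={d} sum={d} total={d}\n", .{ count, sum, total_tokens });
}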