Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions src/deep/candidate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -378,7 +378,7 @@ fn overlaps_any(
#[cfg(test)]
mod tests {
use super::*;
use crate::types::{AuthCategory, Confidence, Language, ScanPass};
use crate::types::{AuthCategory, Confidence, Language, ScanPass, Surface};
use std::fs;
use tempfile::tempdir;

Expand All @@ -396,6 +396,7 @@ mod tests {
pattern_rule: None,
rego_stub: None,
pass: ScanPass::Structural,
surface: Surface::Backend,
}
}

Expand Down Expand Up @@ -769,7 +770,7 @@ mod tests {
// Regression: a structural finding pointing at a deleted file used to
// propagate `DeepError::Io` through `?`, killing the entire deep pass
// even though deep mode is otherwise best-effort.
use crate::types::{AuthCategory, Confidence, Finding, ScanPass};
use crate::types::{AuthCategory, Confidence, Finding, ScanPass, Surface};
let dir = tempdir().unwrap();
// One escalation finding pointing at a file that doesn't exist.
let bad = Finding {
Expand All @@ -785,6 +786,7 @@ mod tests {
pattern_rule: None,
rego_stub: None,
pass: ScanPass::Structural,
surface: Surface::Backend,
};
// Should NOT propagate Io; should return Ok with the bad escalation
// skipped. (No cold-region files either, so result is empty.)
Expand Down
3 changes: 2 additions & 1 deletion src/deep/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ fn expand_inner(
#[cfg(test)]
mod tests {
use super::*;
use crate::types::{AuthCategory, Confidence, ScanPass};
use crate::types::{AuthCategory, Confidence, ScanPass, Surface};
use std::fs;
use std::path::PathBuf;
use tempfile::tempdir;
Expand All @@ -214,6 +214,7 @@ mod tests {
pattern_rule: None,
rego_stub: None,
pass: ScanPass::Structural,
surface: Surface::Backend,
}
}

Expand Down
154 changes: 147 additions & 7 deletions src/deep/finding.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

use crate::deep::candidate::Candidate;
use crate::scanner::matcher::compute_finding_id;
use crate::types::{AuthCategory, Confidence, Finding, ScanPass};
use crate::types::{AuthCategory, Confidence, Finding, ScanPass, Surface};
use serde::Deserialize;
use std::path::Path;

Expand Down Expand Up @@ -32,8 +32,14 @@ pub struct SemanticFinding {
///
/// `scan_root` is required to read the file at `candidate.file` (relative)
/// to populate `code_snippet` from the lines the model identified. If the
/// file is unreadable (e.g. moved between scan and analyze), `code_snippet`
/// falls back to the empty string — best-effort, do not fail the finding.
/// file is unreadable (e.g. moved between scan and analyze), the snippet
/// falls back to slicing [`Candidate::source_snippet`] for the same line
/// range — that buffer was loaded by the structural pass so it's already
/// in memory and represents the same source the model just analyzed.
/// Empty `code_snippet` remains the last resort; downstream tools that
/// surface "what was flagged" need *something* on every finding (corpus
/// shakedown turned up 28 semantic findings with `code_snippet: ""`,
/// which made deep-only buckets unreviewable).
pub fn into_finding(
sem: SemanticFinding,
candidate: &Candidate,
Expand Down Expand Up @@ -85,8 +91,14 @@ pub fn into_finding(
_ => format!("semantic-{}", sem.category.slug()),
};

let code_snippet =
extract_lines(scan_root, &candidate.file, sem.line_start, sem.line_end).unwrap_or_default();
// Try the filesystem first — that's the source of truth and produces the
// same byte range a structural finding would. Fall back to slicing the
// candidate's expanded snippet (already in memory) when the file moved,
// permissions changed, or extract_lines bailed for any other reason.
// Last resort: empty string. We never *fail* a finding on snippet read.
let code_snippet = extract_lines(scan_root, &candidate.file, sem.line_start, sem.line_end)
.or_else(|| slice_candidate_snippet(candidate, sem.line_start, sem.line_end))
.unwrap_or_default();
Comment thread
coderabbitai[bot] marked this conversation as resolved.

let id = compute_finding_id(
&rule_id,
Expand All @@ -112,9 +124,53 @@ pub fn into_finding(
pattern_rule: Some(rule_id),
rego_stub: None, // structural-only; semantic findings have no rego template
pass: ScanPass::Semantic,
// Surface follows the source file, not the pass — same path
// heuristic as structural findings so a deep-pass `web/src/foo.ts`
// finding is tagged Frontend just like its structural twin would be.
surface: Surface::classify(&candidate.file),
}
}

/// Fallback snippet extraction: cut the model-reported line range out of
/// [`candidate.source_snippet`], converting absolute (1-based, file-relative)
/// line numbers into offsets inside the snippet buffer.
///
/// Returns `None` when the buffer is empty, the reported range is not fully
/// inside the candidate window, or the starting offset falls beyond the end
/// of the buffer (possible when the snippet was truncated to honor
/// `max_prompt_chars`). A range whose tail runs past a truncated buffer is
/// clamped rather than rejected.
///
/// Callers normally arrive here after `clamp_to_candidate` (in `deep::mod`)
/// has already bounded `sem_start`/`sem_end`; the window check is repeated
/// here defensively so a future unclamped caller gets `None`, not a panic.
fn slice_candidate_snippet(
    candidate: &crate::deep::candidate::Candidate,
    sem_start: usize,
    sem_end: usize,
) -> Option<String> {
    // Refuse empty buffers and any range that is zero-based, reversed, or
    // not wholly contained in [line_start, line_end].
    let in_window = sem_start >= 1
        && sem_start <= sem_end
        && sem_start >= candidate.line_start
        && sem_end <= candidate.line_end;
    if candidate.source_snippet.is_empty() || !in_window {
        return None;
    }
    // Snippet offset 0 corresponds to `candidate.line_start` (1-based).
    let from = sem_start - candidate.line_start;
    let to = sem_end - candidate.line_start;
    let buffer: Vec<&str> = candidate.source_snippet.lines().collect();
    // `get` yields `None` when `from` is past the buffer; an empty tail
    // (from == len) means truncation swallowed the whole requested range.
    let tail = buffer.get(from..)?;
    if tail.is_empty() {
        return None;
    }
    // Clamp the inclusive end to whatever survived truncation.
    let wanted = to - from + 1;
    let picked: Vec<&str> = tail.iter().take(wanted).copied().collect();
    Some(picked.join("\n"))
}

/// Read the file at `scan_root.join(relative)` and return lines `[start, end]`
/// joined by `\n`. Returns `None` on read error or out-of-range input.
fn extract_lines(scan_root: &Path, relative: &Path, start: usize, end: usize) -> Option<String> {
Expand All @@ -126,7 +182,16 @@ fn extract_lines(scan_root: &Path, relative: &Path, start: usize, end: usize) ->
if lines.is_empty() {
return None;
}
let s = (start - 1).min(lines.len() - 1);
// Bail (instead of clamping to the last line) when the requested range
// starts past the end of the file. Otherwise the caller's fallback chain
// — `extract_lines(...).or_else(slice_candidate_snippet(...))` in
// `build_finding_from_semantic` — never reaches the candidate snippet on
// a shrunken/replaced file, and we'd silently attach an unrelated
// last-line excerpt to the finding.
if start > lines.len() {
return None;
}
let s = start - 1;
let e = end.min(lines.len()).max(s + 1);
Some(lines[s..e].join("\n"))
}
Expand Down Expand Up @@ -180,6 +245,7 @@ mod tests {
pattern_rule: pattern_rule.map(String::from),
rego_stub: None,
pass: ScanPass::Structural,
surface: Surface::Backend,
}
}

Expand Down Expand Up @@ -317,7 +383,8 @@ mod tests {
#[test]
fn into_finding_falls_back_to_empty_snippet_on_read_error() {
let dir = tempdir().unwrap();
// File doesn't exist.
// File doesn't exist AND the candidate has no source_snippet to
// fall back to → last-resort empty string.
let cand = make_candidate("nonexistent.ts", Language::TypeScript);
let f = into_finding(make_semantic(1, 5), &cand, None, dir.path());
assert_eq!(f.code_snippet, "");
Expand All @@ -327,6 +394,79 @@ mod tests {
assert_eq!(f.line_end, 5);
}

#[test]
fn into_finding_falls_back_to_candidate_snippet_when_file_unreadable() {
    // Regression for corpus shakedown: semantic findings used to ship
    // with `code_snippet: ""` whenever the filesystem read failed and no
    // fallback existed. The candidate's `source_snippet` is the very
    // source the model analyzed, so it must be sliced instead of
    // dropping the snippet entirely.
    let dir = tempdir().unwrap();
    // The file is deliberately never created — extract_lines must fail.
    let mut cand = make_candidate("missing.ts", Language::TypeScript);
    cand.line_start = 10;
    cand.line_end = 14;
    cand.source_snippet = ["line 10", "line 11", "line 12", "line 13", "line 14"].join("\n");

    // The model flags lines 11-12, inside the candidate window.
    let f = into_finding(make_semantic(11, 12), &cand, None, dir.path());

    for wanted in ["line 11", "line 12"] {
        assert!(f.code_snippet.contains(wanted));
    }
    for unwanted in ["line 10", "line 13"] {
        assert!(!f.code_snippet.contains(unwanted));
    }
}

#[test]
fn slice_candidate_snippet_rejects_ranges_outside_window() {
    let cand = Candidate {
        kind: CandidateKind::ColdRegion,
        file: PathBuf::from("a.ts"),
        language: Language::TypeScript,
        line_start: 10,
        line_end: 14,
        source_snippet: "line 10\nline 11\nline 12\nline 13\nline 14".to_string(),
        imports: Vec::new(),
        original_finding_id: None,
        seed_category: None,
    };
    // Every out-of-window shape must be refused: fully below the window,
    // fully above it, reversed, and the defensive zero start (which
    // clamp_to_candidate normally filters before we're called).
    for (lo, hi) in [(5, 8), (20, 22), (12, 11), (0, 12)] {
        assert!(slice_candidate_snippet(&cand, lo, hi).is_none());
    }
    // An empty snippet buffer yields nothing even for an in-window range.
    let mut no_snippet = cand.clone();
    no_snippet.source_snippet = String::new();
    assert!(slice_candidate_snippet(&no_snippet, 11, 12).is_none());
}

#[test]
fn slice_candidate_snippet_clamps_when_snippet_was_truncated() {
    // Truncation at `max_prompt_chars` can leave the buffer holding fewer
    // lines than `[candidate.line_start, candidate.line_end]` advertises.
    // A range whose tail runs past the buffer must clamp — never panic on
    // an out-of-bounds index.
    let cand = Candidate {
        kind: CandidateKind::ColdRegion,
        file: PathBuf::from("a.ts"),
        language: Language::TypeScript,
        line_start: 10,
        line_end: 20, // the window advertises 11 lines …
        source_snippet: "line 10\nline 11\nline 12".to_string(), // … the buffer holds 3
        imports: Vec::new(),
        original_finding_id: None,
        seed_category: None,
    };
    // The model points at lines 11-15; only 11 and 12 survived truncation,
    // so those two come back and nothing blows up.
    let got = slice_candidate_snippet(&cand, 11, 15).unwrap();
    for line in ["line 11", "line 12"] {
        assert!(got.contains(line));
    }
}

#[test]
fn ranges_overlap_covers_inclusive_boundaries() {
// Inclusive on both ends: touching at a single line counts as overlap.
Expand Down
3 changes: 2 additions & 1 deletion src/deep/merge.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ fn range_overlap_fraction(a_start: usize, a_end: usize, b_start: usize, b_end: u
#[cfg(test)]
mod tests {
use super::*;
use crate::types::{AuthCategory, Confidence, Language, ScanPass};
use crate::types::{AuthCategory, Confidence, Language, ScanPass, Surface};
use std::path::PathBuf;

fn finding(
Expand All @@ -79,6 +79,7 @@ mod tests {
pattern_rule: None,
rego_stub: None,
pass,
surface: Surface::Backend,
}
}

Expand Down
3 changes: 2 additions & 1 deletion src/deep/prompt.rs
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,7 @@ fn language_fence(lang: Language) -> &'static str {
mod tests {
use super::*;
use crate::deep::candidate::{Candidate, CandidateKind};
use crate::types::{AuthCategory, Confidence, ScanPass};
use crate::types::{AuthCategory, Confidence, ScanPass, Surface};
use std::path::PathBuf;

fn candidate_with_imports(language: Language, imports: Vec<String>) -> Candidate {
Expand Down Expand Up @@ -336,6 +336,7 @@ mod tests {
pattern_rule: Some("ts-custom-1".into()),
rego_stub: None,
pass: ScanPass::Structural,
surface: Surface::Backend,
}
}

Expand Down
3 changes: 2 additions & 1 deletion src/mcp/tools.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ use crate::rego::templates::{apply_confidence_wrapping, generate_default_stub, r
use crate::rego::validator::validate_rego;
use crate::rules::PatternRule;
use crate::scanner;
use crate::types::{AuthCategory, Confidence, Finding, Language, ScanPass};
use crate::types::{AuthCategory, Confidence, Finding, Language, ScanPass, Surface};

const DEFAULT_MAX_PROMPT_CHARS: usize = 16_000;

Expand Down Expand Up @@ -607,6 +607,7 @@ fn analyze_snippet(_ctx: &ServerContext, args: &Value) -> Result<Value, String>
pattern_rule: s.pattern_rule.clone(),
rego_stub: None,
pass: ScanPass::Structural,
surface: Surface::classify(&PathBuf::from(&parsed.file)),
});

let rendered = render(&PromptInputs {
Expand Down
13 changes: 11 additions & 2 deletions src/output/text.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use std::io::Write;
use std::path::Path;

use crate::error::Result;
use crate::types::Finding;
use crate::types::{Finding, Surface};

pub fn print(
findings: &[Finding],
Expand All @@ -21,13 +21,22 @@ pub fn print(
writeln!(writer)?;
}

// Tag frontend-surface findings inline so reviewers can scan past
// the UI-state noise (e.g. `web/src/foo.ts` ownership matches that
// are "is this me?" checks, not security gates) without diving into
// the JSON. Backend is the default — no tag, less visual noise.
let surface_tag = match finding.surface {
Surface::Frontend => " [frontend]",
Surface::Backend => "",
};
writeln!(
writer,
" {}:{} [{}] {}",
" {}:{} [{}] {}{}",
finding.file.display(),
finding.line_start,
finding.category,
finding.confidence,
surface_tag,
)?;

writeln!(writer, " {}", finding.description)?;
Expand Down
3 changes: 3 additions & 0 deletions src/rego/grouping.rs
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,7 @@ mod tests {
pattern_rule: Some("test-rule".into()),
rego_stub: None,
pass: ScanPass::Structural,
surface: Surface::Backend,
}];

let files = group_findings(&findings, "app", Path::new("./policies"));
Expand All @@ -260,6 +261,7 @@ mod tests {
pattern_rule: None,
rego_stub: None,
pass: ScanPass::Structural,
surface: Surface::Backend,
},
Finding {
id: "b".into(),
Expand All @@ -274,6 +276,7 @@ mod tests {
pattern_rule: None,
rego_stub: None,
pass: ScanPass::Structural,
surface: Surface::Backend,
},
];

Expand Down
4 changes: 3 additions & 1 deletion src/scanner/matcher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use tree_sitter::{Query, QueryCursor, Tree};

use crate::error::{Result, ZiftError};
use crate::rules::{CrossPredicate, PatternRule, Predicate};
use crate::types::{Confidence, Finding, Language, ScanPass};
use crate::types::{Confidence, Finding, Language, ScanPass, Surface};

pub struct CompiledRule<'a> {
pub rule: &'a PatternRule,
Expand Down Expand Up @@ -163,6 +163,7 @@ pub fn execute_query(
crate::rego::render_template(tmpl, &owned)
}),
pass: ScanPass::Structural,
surface: Surface::classify(file_path),
});
}

Expand Down Expand Up @@ -1277,6 +1278,7 @@ match = ".*"
pattern_rule: None,
rego_stub: None,
pass: ScanPass::Structural,
surface: Surface::Backend,
};
let findings = vec![f.clone(), f];
let deduped = dedup_findings(findings);
Expand Down
Loading