From 045aa7602dbd86bae0da8eb4357b7fdcd0625e0f Mon Sep 17 00:00:00 2001 From: Brad Anderson Date: Mon, 4 May 2026 20:02:24 -0400 Subject: [PATCH 1/5] feat: add `surface: backend|frontend` field to findings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Path-derived classifier that tags findings as `frontend` (UI/client) or `backend` (server/API) based on simple directory-name heuristics — `web/`, `webapp/`, `client/`, `frontend/`, `ui/`, `public/`, `static/`, anywhere in the path, segment-bounded, case-insensitive. Default is `backend`, which is also what's used for old JSON without the field (`#[serde(default)]`). Surfaced so consumers can filter out frontend ownership-style noise (e.g. Zulip's `web/src/...` `user.user_id === current_user.user_id` matches — UI state checks, not security gates) post-hoc, instead of baking that exclusion into rule predicates. Text output tags `(frontend)` inline next to the file:line header; JSON exposes the field via serde. Closes P2 #10 from plans/05-corpus-shakedown-followups. 
--- src/deep/candidate.rs | 6 +- src/deep/context.rs | 3 +- src/deep/finding.rs | 7 +- src/deep/merge.rs | 3 +- src/deep/prompt.rs | 3 +- src/mcp/tools.rs | 3 +- src/output/text.rs | 13 ++- src/rego/grouping.rs | 3 + src/scanner/matcher.rs | 4 +- src/types.rs | 147 ++++++++++++++++++++++++++- tests/deep_http_integration.rs | 3 +- tests/deep_subprocess_integration.rs | 3 +- 12 files changed, 185 insertions(+), 13 deletions(-) diff --git a/src/deep/candidate.rs b/src/deep/candidate.rs index 97393fa..26ded77 100644 --- a/src/deep/candidate.rs +++ b/src/deep/candidate.rs @@ -378,7 +378,7 @@ fn overlaps_any( #[cfg(test)] mod tests { use super::*; - use crate::types::{AuthCategory, Confidence, Language, ScanPass}; + use crate::types::{AuthCategory, Confidence, Language, ScanPass, Surface}; use std::fs; use tempfile::tempdir; @@ -396,6 +396,7 @@ mod tests { pattern_rule: None, rego_stub: None, pass: ScanPass::Structural, + surface: Surface::Backend, } } @@ -769,7 +770,7 @@ mod tests { // Regression: a structural finding pointing at a deleted file used to // propagate `DeepError::Io` through `?`, killing the entire deep pass // even though deep mode is otherwise best-effort. - use crate::types::{AuthCategory, Confidence, Finding, ScanPass}; + use crate::types::{AuthCategory, Confidence, Finding, ScanPass, Surface}; let dir = tempdir().unwrap(); // One escalation finding pointing at a file that doesn't exist. let bad = Finding { @@ -785,6 +786,7 @@ mod tests { pattern_rule: None, rego_stub: None, pass: ScanPass::Structural, + surface: Surface::Backend, }; // Should NOT propagate Io; should return Ok with the bad escalation // skipped. (No cold-region files either, so result is empty.) 
diff --git a/src/deep/context.rs b/src/deep/context.rs index 93e034a..25ddce8 100644 --- a/src/deep/context.rs +++ b/src/deep/context.rs @@ -195,7 +195,7 @@ fn expand_inner( #[cfg(test)] mod tests { use super::*; - use crate::types::{AuthCategory, Confidence, ScanPass}; + use crate::types::{AuthCategory, Confidence, ScanPass, Surface}; use std::fs; use std::path::PathBuf; use tempfile::tempdir; @@ -214,6 +214,7 @@ mod tests { pattern_rule: None, rego_stub: None, pass: ScanPass::Structural, + surface: Surface::Backend, } } diff --git a/src/deep/finding.rs b/src/deep/finding.rs index c6ed431..9fe1926 100644 --- a/src/deep/finding.rs +++ b/src/deep/finding.rs @@ -2,7 +2,7 @@ use crate::deep::candidate::Candidate; use crate::scanner::matcher::compute_finding_id; -use crate::types::{AuthCategory, Confidence, Finding, ScanPass}; +use crate::types::{AuthCategory, Confidence, Finding, ScanPass, Surface}; use serde::Deserialize; use std::path::Path; @@ -112,6 +112,10 @@ pub fn into_finding( pattern_rule: Some(rule_id), rego_stub: None, // structural-only; semantic findings have no rego template pass: ScanPass::Semantic, + // Surface follows the source file, not the pass — same path + // heuristic as structural findings so a deep-pass `web/src/foo.ts` + // finding is tagged Frontend just like its structural twin would be. 
+ surface: Surface::classify(&candidate.file), } } @@ -180,6 +184,7 @@ mod tests { pattern_rule: pattern_rule.map(String::from), rego_stub: None, pass: ScanPass::Structural, + surface: Surface::Backend, } } diff --git a/src/deep/merge.rs b/src/deep/merge.rs index 51843cc..044c9e8 100644 --- a/src/deep/merge.rs +++ b/src/deep/merge.rs @@ -56,7 +56,7 @@ fn range_overlap_fraction(a_start: usize, a_end: usize, b_start: usize, b_end: u #[cfg(test)] mod tests { use super::*; - use crate::types::{AuthCategory, Confidence, Language, ScanPass}; + use crate::types::{AuthCategory, Confidence, Language, ScanPass, Surface}; use std::path::PathBuf; fn finding( @@ -79,6 +79,7 @@ mod tests { pattern_rule: None, rego_stub: None, pass, + surface: Surface::Backend, } } diff --git a/src/deep/prompt.rs b/src/deep/prompt.rs index 375ca26..d91e4dc 100644 --- a/src/deep/prompt.rs +++ b/src/deep/prompt.rs @@ -305,7 +305,7 @@ fn language_fence(lang: Language) -> &'static str { mod tests { use super::*; use crate::deep::candidate::{Candidate, CandidateKind}; - use crate::types::{AuthCategory, Confidence, ScanPass}; + use crate::types::{AuthCategory, Confidence, ScanPass, Surface}; use std::path::PathBuf; fn candidate_with_imports(language: Language, imports: Vec) -> Candidate { @@ -336,6 +336,7 @@ mod tests { pattern_rule: Some("ts-custom-1".into()), rego_stub: None, pass: ScanPass::Structural, + surface: Surface::Backend, } } diff --git a/src/mcp/tools.rs b/src/mcp/tools.rs index 947e868..8a495b4 100644 --- a/src/mcp/tools.rs +++ b/src/mcp/tools.rs @@ -25,7 +25,7 @@ use crate::rego::templates::{apply_confidence_wrapping, generate_default_stub, r use crate::rego::validator::validate_rego; use crate::rules::PatternRule; use crate::scanner; -use crate::types::{AuthCategory, Confidence, Finding, Language, ScanPass}; +use crate::types::{AuthCategory, Confidence, Finding, Language, ScanPass, Surface}; const DEFAULT_MAX_PROMPT_CHARS: usize = 16_000; @@ -607,6 +607,7 @@ fn analyze_snippet(_ctx: 
&ServerContext, args: &Value) -> Result pattern_rule: s.pattern_rule.clone(), rego_stub: None, pass: ScanPass::Structural, + surface: Surface::classify(&PathBuf::from(&parsed.file)), }); let rendered = render(&PromptInputs { diff --git a/src/output/text.rs b/src/output/text.rs index 8b52f65..de09e48 100644 --- a/src/output/text.rs +++ b/src/output/text.rs @@ -2,7 +2,7 @@ use std::io::Write; use std::path::Path; use crate::error::Result; -use crate::types::Finding; +use crate::types::{Finding, Surface}; pub fn print( findings: &[Finding], @@ -21,13 +21,22 @@ pub fn print( writeln!(writer)?; } + // Tag frontend-surface findings inline so reviewers can scan past + // the UI-state noise (e.g. `web/src/foo.ts` ownership matches that + // are "is this me?" checks, not security gates) without diving into + // the JSON. Backend is the default — no tag, less visual noise. + let surface_tag = match finding.surface { + Surface::Frontend => " (frontend)", + Surface::Backend => "", + }; writeln!( writer, - " {}:{} [{}] {}", + " {}:{} [{}] {}{}", finding.file.display(), finding.line_start, finding.category, finding.confidence, + surface_tag, )?; writeln!(writer, " {}", finding.description)?; diff --git a/src/rego/grouping.rs b/src/rego/grouping.rs index f6da247..9d71a61 100644 --- a/src/rego/grouping.rs +++ b/src/rego/grouping.rs @@ -234,6 +234,7 @@ mod tests { pattern_rule: Some("test-rule".into()), rego_stub: None, pass: ScanPass::Structural, + surface: Surface::Backend, }]; let files = group_findings(&findings, "app", Path::new("./policies")); @@ -260,6 +261,7 @@ mod tests { pattern_rule: None, rego_stub: None, pass: ScanPass::Structural, + surface: Surface::Backend, }, Finding { id: "b".into(), @@ -274,6 +276,7 @@ mod tests { pattern_rule: None, rego_stub: None, pass: ScanPass::Structural, + surface: Surface::Backend, }, ]; diff --git a/src/scanner/matcher.rs b/src/scanner/matcher.rs index 8483525..9a2bebc 100644 --- a/src/scanner/matcher.rs +++ b/src/scanner/matcher.rs @@ 
-8,7 +8,7 @@ use tree_sitter::{Query, QueryCursor, Tree}; use crate::error::{Result, ZiftError}; use crate::rules::{CrossPredicate, PatternRule, Predicate}; -use crate::types::{Confidence, Finding, Language, ScanPass}; +use crate::types::{Confidence, Finding, Language, ScanPass, Surface}; pub struct CompiledRule<'a> { pub rule: &'a PatternRule, @@ -163,6 +163,7 @@ pub fn execute_query( crate::rego::render_template(tmpl, &owned) }), pass: ScanPass::Structural, + surface: Surface::classify(file_path), }); } @@ -1277,6 +1278,7 @@ match = ".*" pattern_rule: None, rego_stub: None, pass: ScanPass::Structural, + surface: Surface::Backend, }; let findings = vec![f.clone(), f]; let deduped = dedup_findings(findings); diff --git a/src/types.rs b/src/types.rs index a27c1f5..7825a5b 100644 --- a/src/types.rs +++ b/src/types.rs @@ -1,5 +1,7 @@ -use std::path::PathBuf; +use std::path::{Path, PathBuf}; +use std::sync::OnceLock; +use regex::Regex; use serde::{Deserialize, Serialize}; #[derive(Debug, Clone, Serialize, Deserialize)] @@ -16,6 +18,63 @@ pub struct Finding { pub pattern_rule: Option, pub rego_stub: Option, pub pass: ScanPass, + /// Where in a typical app this finding lives — frontend (UI/client) or + /// backend (server/API). Inferred from the file path via simple + /// directory-name heuristics in [`Surface::classify`]. Surfaced so + /// downstream consumers can filter out frontend `ownership`-style noise + /// (e.g. Zulip's `web/src/...` `user.user_id === current_user.user_id` + /// matches, which are UI state checks rather than security gates) without + /// us having to bake that filter into rule predicates. Defaults to + /// `Backend` when the path doesn't match any frontend-shaped directory, + /// which is the right default for the scanner's primary target (backend + /// authz code). 
+ #[serde(default)] + pub surface: Surface, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default, clap::ValueEnum)] +#[serde(rename_all = "lowercase")] +pub enum Surface { + #[default] + Backend, + Frontend, +} + +impl Surface { + /// Classify a finding's surface from its file path. Heuristic-only — + /// looks for a small set of well-known frontend directory names anywhere + /// in the path (`web`, `webapp`, `client`, `frontend`, `ui`, `public`, + /// `static`). Conservative on purpose: false negatives (frontend code + /// classified as backend) leave the existing behavior unchanged, while + /// false positives (backend code classified as frontend) would silently + /// downgrade real findings — so when in doubt, return `Backend`. + /// + /// The match is case-insensitive and segment-bounded (we want + /// `app/web/foo.ts` but not `apps/network/foo.ts` or `webhooks/foo.ts`). + pub fn classify(path: &Path) -> Self { + static FRONTEND_RE: OnceLock = OnceLock::new(); + let re = FRONTEND_RE.get_or_init(|| { + // Anchored at a path separator (or string start) and followed by + // a separator so we don't accidentally match `webhooks/`, + // `clientservice/`, etc. `(?i)` makes it case-insensitive. 
+ Regex::new(r"(?i)(^|[\\/])(web|webapp|client|frontend|ui|public|static)[\\/]") + .expect("static regex compiles") + }); + if re.is_match(&path.to_string_lossy()) { + Surface::Frontend + } else { + Surface::Backend + } + } +} + +impl std::fmt::Display for Surface { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Surface::Backend => write!(f, "backend"), + Surface::Frontend => write!(f, "frontend"), + } + } } #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, clap::ValueEnum)] @@ -142,3 +201,89 @@ impl std::fmt::Display for Confidence { } } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn surface_classifies_frontend_directories() { + assert_eq!( + Surface::classify(Path::new("web/src/foo.ts")), + Surface::Frontend + ); + assert_eq!( + Surface::classify(Path::new("apps/web/src/page.tsx")), + Surface::Frontend + ); + assert_eq!( + Surface::classify(Path::new("packages/client/index.ts")), + Surface::Frontend + ); + assert_eq!( + Surface::classify(Path::new("frontend/components/Foo.tsx")), + Surface::Frontend + ); + assert_eq!( + Surface::classify(Path::new("apps/ui/Settings.tsx")), + Surface::Frontend + ); + assert_eq!( + Surface::classify(Path::new("public/assets/main.js")), + Surface::Frontend + ); + // Case-insensitive — matches `Web/` as well as `web/`. + assert_eq!( + Surface::classify(Path::new("Apps/Web/Foo.tsx")), + Surface::Frontend + ); + } + + #[test] + fn surface_does_not_misclassify_backend_paths() { + // Adjacent-name traps: `webhooks/` is not `web/`, `clientservice/` + // is not `client/` — segment boundaries matter. 
+ assert_eq!( + Surface::classify(Path::new("internal/webhooks/handler.go")), + Surface::Backend + ); + assert_eq!( + Surface::classify(Path::new("services/clientservice/main.go")), + Surface::Backend + ); + assert_eq!( + Surface::classify(Path::new("server/api/users.py")), + Surface::Backend + ); + assert_eq!( + Surface::classify(Path::new("zerver/views/auth.py")), + Surface::Backend + ); + assert_eq!( + Surface::classify(Path::new("models/perm/access/role.go")), + Surface::Backend + ); + } + + #[test] + fn surface_default_round_trips_through_serde() { + // Old JSON output predates the `surface` field — deserialization + // must default to Backend rather than failing. + let no_surface = r#"{ + "id": "x", + "file": "a.ts", + "line_start": 1, + "line_end": 1, + "code_snippet": "", + "language": "typescript", + "category": "rbac", + "confidence": "low", + "description": "", + "pattern_rule": null, + "rego_stub": null, + "pass": "structural" + }"#; + let f: Finding = serde_json::from_str(no_surface).unwrap(); + assert_eq!(f.surface, Surface::Backend); + } +} diff --git a/tests/deep_http_integration.rs b/tests/deep_http_integration.rs index 4cb3deb..f3d0ddf 100644 --- a/tests/deep_http_integration.rs +++ b/tests/deep_http_integration.rs @@ -463,7 +463,7 @@ fn missing_usage_field_defaults_to_zero() { use std::fs; use std::path::PathBuf; use tempfile::tempdir; -use zift::types::{Finding, ScanPass}; +use zift::types::{Finding, ScanPass, Surface}; fn structural_finding(file: &str, line: usize) -> Finding { Finding { @@ -479,6 +479,7 @@ fn structural_finding(file: &str, line: usize) -> Finding { pattern_rule: Some("ts-custom".into()), rego_stub: None, pass: ScanPass::Structural, + surface: Surface::Backend, } } diff --git a/tests/deep_subprocess_integration.rs b/tests/deep_subprocess_integration.rs index 213feca..6799ae2 100644 --- a/tests/deep_subprocess_integration.rs +++ b/tests/deep_subprocess_integration.rs @@ -21,7 +21,7 @@ use serde_json::json; use tempfile::tempdir; 
use zift::deep::config::{DeepMode, DeepRuntime}; -use zift::types::{AuthCategory, Confidence, Finding, Language, ScanPass}; +use zift::types::{AuthCategory, Confidence, Finding, Language, ScanPass, Surface}; /// Build a minimal subprocess-mode runtime pointing at `cmd`. fn subprocess_runtime(cmd: &str, timeout_secs: u64) -> DeepRuntime { @@ -59,6 +59,7 @@ fn structural_finding(file: &str, line: usize) -> Finding { pattern_rule: Some("ts-custom".into()), rego_stub: None, pass: ScanPass::Structural, + surface: Surface::Backend, } } From 8c7db4cc26081016427391c19b196fa4c6c40d18 Mon Sep 17 00:00:00 2001 From: Brad Anderson Date: Mon, 4 May 2026 20:02:42 -0400 Subject: [PATCH 2/5] fix(deep): populate code_snippet on semantic findings via candidate-snippet fallback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `into_finding` previously left `code_snippet` as the empty string when the filesystem read failed (file moved between structural and semantic passes, permission errors, etc.). Corpus shakedown turned up 28 deep findings shipping with `code_snippet: ""`, which made deep-only diff buckets unreviewable — downstream tooling that surfaces "what was flagged" had nothing to render. Add `slice_candidate_snippet` as a fallback: the candidate's `source_snippet` is the same expanded source the model just analyzed and is already in memory, so we can slice it for the model-reported line range when the filesystem path is unavailable. Empty string remains the last resort. Also handles the truncated-snippet case (when the candidate hit `max_prompt_chars`) by clamping at the snippet's actual length instead of panicking. Closes P3 #11 from plans/05-corpus-shakedown-followups. 
--- src/deep/finding.rs | 136 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 131 insertions(+), 5 deletions(-) diff --git a/src/deep/finding.rs b/src/deep/finding.rs index 9fe1926..dbf279e 100644 --- a/src/deep/finding.rs +++ b/src/deep/finding.rs @@ -32,8 +32,14 @@ pub struct SemanticFinding { /// /// `scan_root` is required to read the file at `candidate.file` (relative) /// to populate `code_snippet` from the lines the model identified. If the -/// file is unreadable (e.g. moved between scan and analyze), `code_snippet` -/// falls back to the empty string — best-effort, do not fail the finding. +/// file is unreadable (e.g. moved between scan and analyze), the snippet +/// falls back to slicing [`Candidate::source_snippet`] for the same line +/// range — that buffer was loaded by the structural pass so it's already +/// in memory and represents the same source the model just analyzed. +/// Empty `code_snippet` remains the last resort; downstream tools that +/// surface "what was flagged" need *something* on every finding (corpus +/// shakedown turned up 28 semantic findings with `code_snippet: ""`, +/// which made deep-only buckets unreviewable). pub fn into_finding( sem: SemanticFinding, candidate: &Candidate, @@ -85,8 +91,14 @@ pub fn into_finding( _ => format!("semantic-{}", sem.category.slug()), }; - let code_snippet = - extract_lines(scan_root, &candidate.file, sem.line_start, sem.line_end).unwrap_or_default(); + // Try the filesystem first — that's the source of truth and produces the + // same byte range a structural finding would. Fall back to slicing the + // candidate's expanded snippet (already in memory) when the file moved, + // permissions changed, or extract_lines bailed for any other reason. + // Last resort: empty string. We never *fail* a finding on snippet read. 
+ let code_snippet = extract_lines(scan_root, &candidate.file, sem.line_start, sem.line_end) + .or_else(|| slice_candidate_snippet(candidate, sem.line_start, sem.line_end)) + .unwrap_or_default(); let id = compute_finding_id( &rule_id, @@ -119,6 +131,46 @@ pub fn into_finding( } } +/// Slice [`candidate.source_snippet`] to the model-reported line range, +/// translating absolute (1-based, file-relative) line numbers into offsets +/// within the snippet. Returns `None` if the snippet is empty, the model's +/// range falls outside the candidate window, or the requested offsets land +/// past the end of the snippet (which can happen when the snippet was +/// truncated to fit `max_prompt_chars`). +/// +/// `clamp_to_candidate` (in `deep::mod`) already keeps `sem` inside the +/// candidate window before this gets called, but we re-check defensively +/// rather than panic if a future caller skips the clamp. +fn slice_candidate_snippet( + candidate: &crate::deep::candidate::Candidate, + sem_start: usize, + sem_end: usize, +) -> Option { + if candidate.source_snippet.is_empty() { + return None; + } + if sem_start == 0 + || sem_end < sem_start + || sem_start < candidate.line_start + || sem_end > candidate.line_end + { + return None; + } + let lines: Vec<&str> = candidate.source_snippet.lines().collect(); + if lines.is_empty() { + return None; + } + // Translate file-relative 1-based line numbers into snippet-relative + // 0-based offsets. Snippet line 0 corresponds to `candidate.line_start`. + let start_idx = sem_start - candidate.line_start; + let end_idx = sem_end - candidate.line_start; + if start_idx >= lines.len() { + return None; + } + let end_inclusive = end_idx.min(lines.len() - 1); + Some(lines[start_idx..=end_inclusive].join("\n")) +} + /// Read the file at `scan_root.join(relative)` and return lines `[start, end]` /// joined by `\n`. Returns `None` on read error or out-of-range input. 
fn extract_lines(scan_root: &Path, relative: &Path, start: usize, end: usize) -> Option { @@ -322,7 +374,8 @@ mod tests { #[test] fn into_finding_falls_back_to_empty_snippet_on_read_error() { let dir = tempdir().unwrap(); - // File doesn't exist. + // File doesn't exist AND the candidate has no source_snippet to + // fall back to → last-resort empty string. let cand = make_candidate("nonexistent.ts", Language::TypeScript); let f = into_finding(make_semantic(1, 5), &cand, None, dir.path()); assert_eq!(f.code_snippet, ""); @@ -332,6 +385,79 @@ mod tests { assert_eq!(f.line_end, 5); } + #[test] + fn into_finding_falls_back_to_candidate_snippet_when_file_unreadable() { + // Regression for corpus shakedown: 28 semantic findings shipped with + // `code_snippet: ""` because filesystem read failed and we had no + // fallback. The candidate's `source_snippet` is the same source the + // model just analyzed — slice it instead of dropping the snippet. + let dir = tempdir().unwrap(); + // Note: file is *not* created — extract_lines must fail. + let mut cand = make_candidate("missing.ts", Language::TypeScript); + cand.line_start = 10; + cand.line_end = 14; + cand.source_snippet = "line 10\nline 11\nline 12\nline 13\nline 14".to_string(); + + // Model reports lines 11-12 within the candidate window. 
+ let sem = make_semantic(11, 12); + let f = into_finding(sem, &cand, None, dir.path()); + + assert!(f.code_snippet.contains("line 11")); + assert!(f.code_snippet.contains("line 12")); + assert!(!f.code_snippet.contains("line 10")); + assert!(!f.code_snippet.contains("line 13")); + } + + #[test] + fn slice_candidate_snippet_rejects_ranges_outside_window() { + let cand = Candidate { + kind: CandidateKind::ColdRegion, + file: PathBuf::from("a.ts"), + language: Language::TypeScript, + line_start: 10, + line_end: 14, + source_snippet: "line 10\nline 11\nline 12\nline 13\nline 14".to_string(), + imports: Vec::new(), + original_finding_id: None, + seed_category: None, + }; + // Below window. + assert!(slice_candidate_snippet(&cand, 5, 8).is_none()); + // Above window. + assert!(slice_candidate_snippet(&cand, 20, 22).is_none()); + // Reversed. + assert!(slice_candidate_snippet(&cand, 12, 11).is_none()); + // Zero start (defensive — clamp_to_candidate normally drops these). + assert!(slice_candidate_snippet(&cand, 0, 12).is_none()); + // Empty snippet. + let mut empty = cand.clone(); + empty.source_snippet.clear(); + assert!(slice_candidate_snippet(&empty, 11, 12).is_none()); + } + + #[test] + fn slice_candidate_snippet_clamps_when_snippet_was_truncated() { + // Truncation at `max_prompt_chars` can leave the snippet shorter + // than `[candidate.line_start, candidate.line_end]` would imply. + // Tail offsets must clamp instead of panicking on out-of-bounds. + let cand = Candidate { + kind: CandidateKind::ColdRegion, + file: PathBuf::from("a.ts"), + language: Language::TypeScript, + line_start: 10, + line_end: 20, // candidate window claims 11 lines … + source_snippet: "line 10\nline 11\nline 12".to_string(), // … but snippet has 3 + imports: Vec::new(), + original_finding_id: None, + seed_category: None, + }; + // Model points at lines 11-15 — only 11 and 12 are in the truncated + // snippet, so we should get those two and not panic. 
+ let got = slice_candidate_snippet(&cand, 11, 15).unwrap(); + assert!(got.contains("line 11")); + assert!(got.contains("line 12")); + } + #[test] fn ranges_overlap_covers_inclusive_boundaries() { // Inclusive on both ends: touching at a single line counts as overlap. From b0d40136dd102e919e013db8abcc01ad7b324499 Mon Sep 17 00:00:00 2001 From: Brad Anderson Date: Mon, 4 May 2026 20:02:58 -0400 Subject: [PATCH 3/5] test(scanner): pin enforcement_points metric and clarify corpus 0 reading MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Corpus shakedown showed `summary.enforcement_points: 0` on every run across five languages — flagged as suspicious in the follow-up plan (metric broken, or scanner isn't recognizing externalized policy at all?). The metric is fine: import-statement detection only fires for TS/JS today (see `find_policy_imports`), and none of the five corpora ship policy-engine imports anyway, so 0 was the correct answer. Pin that with two end-to-end scans against synthetic fixtures: one imports `authorize` from a path containing `authz` and confirms the counter increments to 1 with the would-be inline finding suppressed; the counterpart imports the same name from `utils` and confirms the counter stays 0 with the inline finding intact. Plus a doc-comment on the counter declaration explaining the TS/JS-only limitation and pointing at the test for future audits. Also moves plans/todo/05-corpus-shakedown-followups.md → plans/done/ now that all P0–P3 items from the doc have landed (P0–P1 in #50, P2 and P3 here; P3 #13 was already aligned with the corpus README). Closes P3 #12 from plans/05-corpus-shakedown-followups. 
--- .../05-corpus-shakedown-followups.md | 0 src/scanner/mod.rs | 12 ++ tests/scanner_enforcement_points.rs | 128 ++++++++++++++++++ 3 files changed, 140 insertions(+) rename plans/{todo => done}/05-corpus-shakedown-followups.md (100%) create mode 100644 tests/scanner_enforcement_points.rs diff --git a/plans/todo/05-corpus-shakedown-followups.md b/plans/done/05-corpus-shakedown-followups.md similarity index 100% rename from plans/todo/05-corpus-shakedown-followups.md rename to plans/done/05-corpus-shakedown-followups.md diff --git a/src/scanner/mod.rs b/src/scanner/mod.rs index 13d4be3..8be8acb 100644 --- a/src/scanner/mod.rs +++ b/src/scanner/mod.rs @@ -73,6 +73,18 @@ pub fn scan( let mut ts_parser = tree_sitter::Parser::new(); let mut all_findings = Vec::new(); + // `enforcement_points` counts call sites that *would* have matched a + // structural rule, but resolve to a name imported from a path containing + // a policy-engine indicator (`authz`, `opa`, `policy`, `rego`, + // `enforce`, `open-policy-agent` — see `scanner::imports`). Those calls + // are already routed through a policy engine, so we suppress the inline + // finding and count them here instead — that's what feeds + // `summary.externalized_pct` in the JSON output. Note: import-statement + // detection is TS/JS-only today (see `find_policy_imports`), so the + // counter is currently a no-op for Go/Java/Python codebases. Most + // open-source corpora we've tried also ship zero externalized policy, + // so a 0 here is usually correct rather than buggy. Pinned by + // `tests/scanner_enforcement_points.rs`. let mut enforcement_points: usize = 0; let mut seen_enforcement: std::collections::HashSet<(std::path::PathBuf, usize)> = std::collections::HashSet::new(); diff --git a/tests/scanner_enforcement_points.rs b/tests/scanner_enforcement_points.rs new file mode 100644 index 0000000..02e532a --- /dev/null +++ b/tests/scanner_enforcement_points.rs @@ -0,0 +1,128 @@ +//! 
Audit for `summary.enforcement_points` (PR-followup #12). +//! +//! Corpus shakedown reported `enforcement_points: 0` on every run. The +//! follow-up plan called for verifying the metric on a fixture that *does* +//! ship a policy-engine import — the corpora it reported 0 against simply +//! had no externalized authz, so 0 was correct, but we wanted a regression +//! test pinning the behavior in place. +//! +//! These tests exercise the real `scanner::scan` end-to-end: a synthetic +//! file that imports from a path containing `authz` and calls the imported +//! name should produce one `enforcement_points` increment and zero inline +//! findings (the call is already routed through a policy engine, so we +//! don't flag it again). + +use std::fs; + +use tempfile::tempdir; + +use zift::cli::ScanArgs; +use zift::config::ZiftConfig; +use zift::rules; +use zift::scanner; + +#[test] +fn enforcement_points_increments_when_call_routes_through_policy_import() { + let dir = tempdir().unwrap(); + // Import path contains "authz" → `find_policy_imports` captures `authorize` + // as a policy-bound name. The subsequent `authorize("orders:read")` call + // matches `ts-authorize-function-call` structurally, but because + // `is_enforcement_point` sees `authorize` from the policy import, the + // finding is rerouted into the enforcement-point counter rather than the + // findings list. 
+ fs::write( + dir.path().join("orders.ts"), + r#"import { authorize } from '../lib/authz'; + +export function listOrders(user: User) { + if (authorize("orders:read")) { + return db.orders.find(); + } + return null; +} +"#, + ) + .unwrap(); + + let config = ZiftConfig::default(); + let loaded_rules = rules::load_rules(None, &config).expect("embedded rules load"); + let args = ScanArgs { + path: dir.path().to_path_buf(), + ..ScanArgs::default() + }; + + let result = scanner::scan(dir.path(), &loaded_rules, &args, &config).unwrap(); + + assert_eq!( + result.enforcement_points, + 1, + "expected the policy-imported authorize() call to count as an enforcement point; \ + got {} (findings: {:?})", + result.enforcement_points, + result + .findings + .iter() + .map(|f| (f.pattern_rule.clone(), f.line_start)) + .collect::>(), + ); + // The same call should NOT also appear as an inline finding — that's + // the whole point of the enforcement-point shortcut. (Other rules in + // unrelated files could still fire; we only assert the single rule the + // fixture targets is gone.) + assert!( + !result + .findings + .iter() + .any(|f| f.pattern_rule.as_deref() == Some("ts-authorize-function-call")), + "policy-routed call leaked into findings: {:?}", + result.findings, + ); +} + +#[test] +fn enforcement_points_is_zero_without_policy_imports() { + // Counterpart to the test above: identical call site, but the import + // path doesn't look like a policy module (`utils` instead of `authz`). + // The call should fall through to the structural finding path and the + // enforcement-point counter should stay at 0. This pins the behavior + // the corpus shakedown actually saw — none of the five corpora shipped + // policy imports, so 0 was the correct answer. 
+ let dir = tempdir().unwrap(); + fs::write( + dir.path().join("orders.ts"), + r#"import { authorize } from '../lib/utils'; + +export function listOrders(user: User) { + if (authorize("orders:read")) { + return db.orders.find(); + } + return null; +} +"#, + ) + .unwrap(); + + let config = ZiftConfig::default(); + let loaded_rules = rules::load_rules(None, &config).expect("embedded rules load"); + let args = ScanArgs { + path: dir.path().to_path_buf(), + ..ScanArgs::default() + }; + + let result = scanner::scan(dir.path(), &loaded_rules, &args, &config).unwrap(); + + assert_eq!(result.enforcement_points, 0); + assert!( + result + .findings + .iter() + .any(|f| f.pattern_rule.as_deref() == Some("ts-authorize-function-call")), + "structural rule should have fired without the policy-import shortcut; \ + got: {:?}", + result + .findings + .iter() + .map(|f| f.pattern_rule.clone()) + .collect::>(), + ); +} From 56f3d10c00ea664a0b92f1e33e3ede2ca2f8f061 Mon Sep 17 00:00:00 2001 From: Brad Anderson Date: Mon, 4 May 2026 20:24:01 -0400 Subject: [PATCH 4/5] refactor(surface): tighten frontend heuristic and tag style MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop `public` and `static` from the strong frontend-marker list in `Surface::classify`. They're too generic on their own — `lib/public/api.go` and `services/static/registry.rs` are real backend trees, and treating them as Frontend would silently downgrade real findings. They only matter when accompanied by a strong marker (`web`, `client`, `frontend`, `ui`, `webapp`), at which point the strong marker already classifies the path as Frontend, making them redundant in the regex. Tests updated to pin `lib/public/api.go` and `services/static/registry.rs` as Backend, and to demonstrate `apps/web/public/main.js` still classifies as Frontend via the strong marker. 
Also switch the text-output frontend tag from `(frontend)` to `[frontend]` so it matches the bracket style of the surrounding `[rbac]`/`[high]` metadata. --- src/output/text.rs | 2 +- src/types.rs | 39 +++++++++++++++++++++++++++++++-------- 2 files changed, 32 insertions(+), 9 deletions(-) diff --git a/src/output/text.rs b/src/output/text.rs index de09e48..be229b5 100644 --- a/src/output/text.rs +++ b/src/output/text.rs @@ -26,7 +26,7 @@ pub fn print( // are "is this me?" checks, not security gates) without diving into // the JSON. Backend is the default — no tag, less visual noise. let surface_tag = match finding.surface { - Surface::Frontend => " (frontend)", + Surface::Frontend => " [frontend]", Surface::Backend => "", }; writeln!( diff --git a/src/types.rs b/src/types.rs index 7825a5b..7e964d8 100644 --- a/src/types.rs +++ b/src/types.rs @@ -42,12 +42,21 @@ pub enum Surface { impl Surface { /// Classify a finding's surface from its file path. Heuristic-only — - /// looks for a small set of well-known frontend directory names anywhere - /// in the path (`web`, `webapp`, `client`, `frontend`, `ui`, `public`, - /// `static`). Conservative on purpose: false negatives (frontend code - /// classified as backend) leave the existing behavior unchanged, while - /// false positives (backend code classified as frontend) would silently - /// downgrade real findings — so when in doubt, return `Backend`. + /// looks for a small set of well-known *strong* frontend directory names + /// anywhere in the path (`web`, `webapp`, `client`, `frontend`, `ui`). + /// Generic tokens like `public/` and `static/` are intentionally **not** + /// in the strong list — they show up in plenty of backend trees + /// (`lib/public/api.go`, `services/static/registry.rs`) and treating + /// them as Frontend on their own would silently downgrade real findings. + /// They only count when a strong marker is also present in the path + /// (e.g. 
`apps/web/public/main.js`), which already classifies as Frontend + /// via the strong marker — so dropping them from the regex is equivalent + /// to "weak markers require a nearby strong marker." + /// + /// Conservative on purpose: false negatives (frontend code classified as + /// backend) leave the existing behavior unchanged, while false positives + /// (backend code classified as frontend) would silently downgrade real + /// findings — so when in doubt, return `Backend`. /// /// The match is case-insensitive and segment-bounded (we want /// `app/web/foo.ts` but not `apps/network/foo.ts` or `webhooks/foo.ts`). @@ -57,7 +66,7 @@ impl Surface { // Anchored at a path separator (or string start) and followed by // a separator so we don't accidentally match `webhooks/`, // `clientservice/`, etc. `(?i)` makes it case-insensitive. - Regex::new(r"(?i)(^|[\\/])(web|webapp|client|frontend|ui|public|static)[\\/]") + Regex::new(r"(?i)(^|[\\/])(web|webapp|client|frontend|ui)[\\/]") .expect("static regex compiles") }); if re.is_match(&path.to_string_lossy()) { @@ -228,8 +237,10 @@ mod tests { Surface::classify(Path::new("apps/ui/Settings.tsx")), Surface::Frontend ); + // `public/` co-occurring with a strong marker (`web`) is still + // Frontend — the strong marker carries the classification. assert_eq!( - Surface::classify(Path::new("public/assets/main.js")), + Surface::classify(Path::new("apps/web/public/main.js")), Surface::Frontend ); // Case-insensitive — matches `Web/` as well as `web/`. @@ -263,6 +274,18 @@ mod tests { Surface::classify(Path::new("models/perm/access/role.go")), Surface::Backend ); + // Generic tokens without a strong marker stay Backend, by design — + // false-positive avoidance for trees like `lib/public/api.go` or + // `services/static/registry.rs`. Accepts a false negative on + // bare `public/assets/main.js`-style trees in exchange. 
+ assert_eq!( + Surface::classify(Path::new("lib/public/api.go")), + Surface::Backend + ); + assert_eq!( + Surface::classify(Path::new("services/static/registry.rs")), + Surface::Backend + ); } #[test] From 70f8e337271f4ca43170daaac5b712bce89194cc Mon Sep 17 00:00:00 2001 From: Brad Anderson Date: Mon, 4 May 2026 20:35:08 -0400 Subject: [PATCH 5/5] fix: address CodeRabbit review feedback on PR #55 - finding.rs: extract_lines now returns None when start > lines.len() instead of clamping to the last line, so the fallback chain reaches slice_candidate_snippet on shrunken/replaced files rather than attaching unrelated last-line text to the finding. - types.rs: Surface::classify now requires a frontend file extension in addition to a strong directory marker. Previously a backend Java/Rust/Go module under web/, client/, or ui/ would misclassify as Frontend and silently downgrade real findings for consumers that filter frontend results. Adds regression tests for backend extensions in frontend-marker dirs and a case-insensitive .TSX case. --- src/deep/finding.rs | 11 ++++- src/types.rs | 113 ++++++++++++++++++++++++++++++++++++-------- 2 files changed, 103 insertions(+), 21 deletions(-) diff --git a/src/deep/finding.rs b/src/deep/finding.rs index dbf279e..75a1839 100644 --- a/src/deep/finding.rs +++ b/src/deep/finding.rs @@ -182,7 +182,16 @@ fn extract_lines(scan_root: &Path, relative: &Path, start: usize, end: usize) -> if lines.is_empty() { return None; } - let s = (start - 1).min(lines.len() - 1); + // Bail (instead of clamping to the last line) when the requested range + // starts past the end of the file. Otherwise the caller's fallback chain + // — `extract_lines(...).or_else(slice_candidate_snippet(...))` in + // `build_finding_from_semantic` — never reaches the candidate snippet on + // a shrunken/replaced file, and we'd silently attach an unrelated + // last-line excerpt to the finding. 
+ if start > lines.len() { + return None; + } + let s = start - 1; let e = end.min(lines.len()).max(s + 1); Some(lines[s..e].join("\n")) } diff --git a/src/types.rs b/src/types.rs index 7e964d8..ab42472 100644 --- a/src/types.rs +++ b/src/types.rs @@ -42,24 +42,35 @@ pub enum Surface { impl Surface { /// Classify a finding's surface from its file path. Heuristic-only — - /// looks for a small set of well-known *strong* frontend directory names - /// anywhere in the path (`web`, `webapp`, `client`, `frontend`, `ui`). - /// Generic tokens like `public/` and `static/` are intentionally **not** - /// in the strong list — they show up in plenty of backend trees - /// (`lib/public/api.go`, `services/static/registry.rs`) and treating - /// them as Frontend on their own would silently downgrade real findings. - /// They only count when a strong marker is also present in the path - /// (e.g. `apps/web/public/main.js`), which already classifies as Frontend - /// via the strong marker — so dropping them from the regex is equivalent - /// to "weak markers require a nearby strong marker." + /// requires **both** a well-known *strong* frontend directory name + /// (`web`, `webapp`, `client`, `frontend`, `ui`) AND a frontend file + /// extension on the leaf. Generic tokens like `public/` and `static/` + /// are intentionally **not** in the strong list — they show up in + /// plenty of backend trees (`lib/public/api.go`, + /// `services/static/registry.rs`) and treating them as Frontend on + /// their own would silently downgrade real findings. They only count + /// when a strong marker is also present in the path (e.g. + /// `apps/web/public/main.js`), which already classifies as Frontend + /// via the strong marker — so dropping them from the regex is + /// equivalent to "weak markers require a nearby strong marker." 
/// - /// Conservative on purpose: false negatives (frontend code classified as - /// backend) leave the existing behavior unchanged, while false positives - /// (backend code classified as frontend) would silently downgrade real - /// findings — so when in doubt, return `Backend`. + /// The extension gate covers the inverse trap: a Java/Rust/Go backend + /// project may legitimately use a `web/`, `client/`, or `ui/` + /// subdirectory for its server-side HTTP/UI-glue layer + /// (`src/main/java/com/x/web/UserService.java`, + /// `crates/server/src/web/handler.rs`). Without an extension check the + /// directory marker alone would misclassify those as Frontend and + /// silently suppress real backend findings for consumers that filter + /// frontend results. /// - /// The match is case-insensitive and segment-bounded (we want - /// `app/web/foo.ts` but not `apps/network/foo.ts` or `webhooks/foo.ts`). + /// Conservative on purpose: false negatives (frontend code classified + /// as backend) leave the existing behavior unchanged, while false + /// positives (backend code classified as frontend) would silently + /// downgrade real findings — so when in doubt, return `Backend`. + /// + /// The directory match is case-insensitive and segment-bounded (we + /// want `app/web/foo.ts` but not `apps/network/foo.ts` or + /// `webhooks/foo.ts`). The extension match is also case-insensitive. 
 pub fn classify(path: &Path) -> Self {
         static FRONTEND_RE: OnceLock<Regex> = OnceLock::new();
         let re = FRONTEND_RE.get_or_init(|| {
@@ -69,14 +80,34 @@ impl Surface {
             Regex::new(r"(?i)(^|[\\/])(web|webapp|client|frontend|ui)[\\/]")
                 .expect("static regex compiles")
         });
-        if re.is_match(&path.to_string_lossy()) {
-            Surface::Frontend
-        } else {
-            Surface::Backend
+        if !re.is_match(&path.to_string_lossy()) {
+            return Surface::Backend;
+        }
+        if !has_frontend_extension(path) {
+            return Surface::Backend;
         }
+        Surface::Frontend
     }
 }
 
+/// Frontend-asset file extensions we recognize when gating
+/// [`Surface::classify`]. Lowercase, no leading dot. Kept narrow on
+/// purpose: a backend `web/` package full of `.java`/`.go`/`.py` files
+/// must not slip through. Extensionless files (no extension at all)
+/// stay Backend by design — the directory marker on its own isn't a
+/// strong enough signal to override the safe default.
+fn has_frontend_extension(path: &Path) -> bool {
+    const FRONTEND_EXTS: &[&str] = &[
+        "js", "jsx", "ts", "tsx", "mjs", "cjs", "html", "htm", "css", "scss", "sass", "less",
+        "vue", "svelte", "astro",
+    ];
+    let Some(ext) = path.extension().and_then(|s| s.to_str()) else {
+        return false;
+    };
+    let lower = ext.to_ascii_lowercase();
+    FRONTEND_EXTS.iter().any(|e| *e == lower)
+}
+
 impl std::fmt::Display for Surface {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         match self {
@@ -288,6 +319,48 @@ mod tests {
         );
     }
 
+    #[test]
+    fn surface_requires_frontend_extension_even_with_strong_marker() {
+        // Strong directory marker present, but the leaf file is a backend
+        // language — must classify as Backend. Java/Rust/Go projects
+        // legitimately use `web/`, `client/`, `ui/` packages for their
+        // server-side HTTP/UI-glue layer, and treating those as Frontend
+        // would silently downgrade real findings for consumers that filter
+        // frontend results.
+ assert_eq!( + Surface::classify(Path::new("src/main/java/com/x/web/UserService.java")), + Surface::Backend + ); + assert_eq!( + Surface::classify(Path::new("crates/server/src/web/handler.rs")), + Surface::Backend + ); + assert_eq!( + Surface::classify(Path::new("internal/client/auth.go")), + Surface::Backend + ); + assert_eq!( + Surface::classify(Path::new("services/ui/templating.py")), + Surface::Backend + ); + // Extensionless files stay Backend — directory marker alone isn't + // strong enough signal to override the safe default. + assert_eq!( + Surface::classify(Path::new("apps/web/Makefile")), + Surface::Backend + ); + } + + #[test] + fn surface_extension_gate_is_case_insensitive() { + // Same as `Apps/Web/Foo.tsx` upstream, but capitalised extension — + // the `.TSX` should still trip the frontend gate. + assert_eq!( + Surface::classify(Path::new("apps/web/Component.TSX")), + Surface::Frontend + ); + } + #[test] fn surface_default_round_trips_through_serde() { // Old JSON output predates the `surface` field — deserialization