diff --git a/src/scanner/imports.rs b/src/scanner/imports.rs index b4412a6..a88d682 100644 --- a/src/scanner/imports.rs +++ b/src/scanner/imports.rs @@ -1,4 +1,5 @@ use std::collections::HashSet; +use std::path::Path; use streaming_iterator::StreamingIterator; @@ -23,6 +24,36 @@ fn is_policy_path(source: &str) -> bool { POLICY_INDICATORS.iter().any(|ind| lower.contains(ind)) } +/// Check if a file's path places it *inside* a policy-engine implementation +/// module — e.g. `internal/authz/authz_test.go` lives under a directory whose +/// name contains `authz`. Files like these are themselves the policy engine, +/// so the structural rules (which flag *consumers* of authz primitives) would +/// only produce noise. The scanner uses this to skip such files entirely. +/// +/// Heuristic: any directory component along the path (excluding the filename +/// itself) contains a `POLICY_INDICATORS` substring, case-insensitive. The +/// filename is intentionally excluded — a top-level `authz.go` could plausibly +/// be either the implementation or a consumer; if it really is the +/// implementation, the user can exclude it via config. Bypassing here is +/// reserved for the unambiguous "in a policy directory" shape. +/// +/// False-positive risk: the indicator set is shared with import-path matching +/// and includes broad terms (`enforce`, `opa`, `policy`). Consumer directories +/// like `services/enforcement/` or any path with `opa` as a substring will be +/// silently skipped — no findings, no enforcement points. The scanner surfaces +/// a `warn!` per skipped file so accidental bypasses are visible in logs; if a +/// legitimate consumer directory is being dropped, the user can either rename +/// it or exclude it via config (and we can narrow the indicator set later). +pub fn is_policy_implementation_path(rel_path: &Path) -> bool { + let Some(parent) = rel_path.parent() else { + return false; + }; + parent.components().any(|component| { + let s = component.as_os_str().to_string_lossy().to_lowercase(); + POLICY_INDICATORS.iter().any(|ind| s.contains(ind)) + }) +} + /// Tree-sitter query for named imports: `import { foo } from 'bar'` /// and aliased: `import { foo as bar } from 'baz'`. /// Captures the binding actually used in code (the alias when renamed, @@ -148,6 +179,46 @@ pub fn find_policy_imports( bindings } +/// Compute policy bindings for a Go package by treating every `.go` file in +/// the package directory as one propagation domain. Same shape as +/// `find_policy_imports` for the single-file case — collect imports, then +/// (if any) collect propagation edges and run to fixed point — but with +/// imports and edges unioned across every file in the package. +/// +/// Why: Go's idiomatic DI shape wires a policy primitive in one file and +/// calls it from another. The OCP corpus repo is the motivating case — +/// `internal/database/database.go` does +/// `&Database{accessFactory: authz.NewAccess}`, and 40+ call sites in +/// sibling files (`bundle_status.go`, etc.) reach the same field via +/// `d.accessFactory()...`. Per-file propagation never sees the +/// `accessFactory: authz.NewAccess` edge from the consumer's perspective, +/// so the call site looks like raw embedded authz. Unioning at the package +/// level matches Go's own scoping rules: package-private identifiers are +/// visible across files in the same directory, not across packages. +pub fn find_go_package_policy_imports<'a, I>(files: I) -> HashSet +where + I: IntoIterator, +{ + // Materialize so we can do two passes — bindings first, then edges + // only when there's something to propagate. + let files: Vec<(&tree_sitter::Tree, &[u8])> = files.into_iter().collect(); + + let mut bindings: HashSet = HashSet::new(); + for (tree, source) in &files { + bindings.extend(find_go_policy_imports(tree, source)); + } + if bindings.is_empty() { + return bindings; + } + + let mut edges: Vec<(String, String)> = Vec::new(); + for (tree, source) in &files { + edges.extend(extract_propagation_edges(tree, source, Language::Go)); + } + propagate_to_fixed_point(&mut bindings, &edges); + bindings +} + /// Iteratively grow `bindings` by adding any edge LHS whose RHS textually /// mentions a known binding. Stops when no edge changed the set. /// @@ -1777,4 +1848,132 @@ const cached = handler; let imports = find_policy_imports(&tree, source.as_bytes(), Language::TypeScript); assert!(imports.is_empty(), "got: {imports:?}"); } + + // ---------- Go package-scoped propagation ---------- + + #[test] + fn go_package_propagates_across_files() { + // OCP-shaped: one file in the package imports `authz` and stashes + // its constructor on a struct field; a sibling file with no policy + // imports of its own calls the field via the struct receiver. The + // per-file pass would miss the consumer entirely; the package pass + // must surface `accessFactory` as a binding visible to both files. + let producer = r#" +package database + +import "github.com/example/authz" + +type Database struct { + accessFactory func() Access +} + +func New() *Database { + return &Database{accessFactory: authz.NewAccess} +} +"#; + let consumer = r#" +package database + +func (d *Database) checkBundle() { + d.accessFactory().WithPrincipal("bob") +} +"#; + let producer_tree = parse_lang(producer, Language::Go); + let consumer_tree = parse_lang(consumer, Language::Go); + + let bindings = find_go_package_policy_imports([ + (&producer_tree, producer.as_bytes()), + (&consumer_tree, consumer.as_bytes()), + ]); + + assert!(bindings.contains("authz")); + assert!( + bindings.contains("accessFactory"), + "expected the cross-file edge to propagate; got: {bindings:?}" + ); + assert!(is_enforcement_point( + "d.accessFactory().WithPrincipal(\"bob\")", + &bindings, + )); + } + + #[test] + fn go_package_returns_empty_when_no_file_imports_policy() { + // If no file in the package has a policy import, propagation has no + // seed and the result must be empty even when files reference each + // other through edges. + let a = r#" +package svc + +import "github.com/example/utils" + +func wire() any { + return utils.New +} +"#; + let b = r#" +package svc + +var helper = wire +"#; + let at = parse_lang(a, Language::Go); + let bt = parse_lang(b, Language::Go); + + let bindings = find_go_package_policy_imports([(&at, a.as_bytes()), (&bt, b.as_bytes())]); + assert!(bindings.is_empty(), "got: {bindings:?}"); + } + + // ---------- Policy-implementation path bypass ---------- + + #[test] + fn implementation_path_matches_policy_directory_components() { + // The shapes that motivated this check: files that live under a + // policy-engine module directory, where structural rules would + // only flag the engine's own internals. + assert!(is_policy_implementation_path(Path::new( + "internal/authz/authz_test.go" + ))); + assert!(is_policy_implementation_path(Path::new( + "pkg/policy/check.go" + ))); + assert!(is_policy_implementation_path(Path::new("src/opa/eval.ts"))); + assert!(is_policy_implementation_path(Path::new( + "lib/cedar/engine.ts" + ))); + // Indicator-bearing component anywhere along the path counts — + // not just the immediate parent directory. + assert!(is_policy_implementation_path(Path::new( + "services/policy/src/check.ts" + ))); + } + + #[test] + fn implementation_path_is_case_insensitive() { + assert!(is_policy_implementation_path(Path::new( + "Internal/Authz/Handler.cs" + ))); + assert!(is_policy_implementation_path(Path::new( + "src/Policy/Engine.java" + ))); + } + + #[test] + fn implementation_path_ignores_filename_only_match() { + // A top-level `authz.go` shouldn't auto-bypass just because the + // filename mentions a policy term — the directory placement is the + // signal, not the basename. Users with a top-level implementation + // file can exclude it via config. + assert!(!is_policy_implementation_path(Path::new("authz.go"))); + assert!(!is_policy_implementation_path(Path::new("policy.ts"))); + } + + #[test] + fn implementation_path_skips_non_policy_directories() { + assert!(!is_policy_implementation_path(Path::new( + "src/handlers/orders.ts" + ))); + assert!(!is_policy_implementation_path(Path::new( + "internal/database/bundle_status.go" + ))); + } } diff --git a/src/scanner/mod.rs b/src/scanner/mod.rs index 7c55649..43b7a04 100644 --- a/src/scanner/mod.rs +++ b/src/scanner/mod.rs @@ -3,8 +3,8 @@ pub mod imports; pub mod matcher; pub mod parser; -use std::collections::HashMap; -use std::path::Path; +use std::collections::{HashMap, HashSet}; +use std::path::{Path, PathBuf}; use crate::cli::ScanArgs; use crate::config::ZiftConfig; @@ -71,6 +71,14 @@ pub fn scan( compiled_cache.insert((*lang, *is_tsx_jsx), compiled_rules); } + // Pre-compute per-Go-package policy bindings. Go's scoping makes the + // package directory the natural propagation domain — bindings declared + // in one file are visible to siblings via package-private identifiers. + // Without this pass, a consumer file with no policy imports of its own + // would never recognize a call like `d.accessFactory()` even when a + // sibling file wired `accessFactory: authz.NewAccess`. + let go_package_bindings = build_go_package_bindings(root, &files); + let mut ts_parser = tree_sitter::Parser::new(); let mut all_findings = Vec::new(); // `enforcement_points` counts call sites that *would* have matched a @@ -90,6 +98,25 @@ pub fn scan( std::collections::HashSet::new(); for file in &files { + let rel_path = file.path.strip_prefix(root).unwrap_or(&file.path); + + // Files that live *inside* a policy-engine implementation directory + // (`internal/authz/**`, `pkg/policy/**`, etc.) are themselves the + // policy engine, not consumers of one. Structural rules flag the + // embedded-authz shape, so running them here is guaranteed noise. + // Skip the file entirely — these don't count as findings or as + // enforcement points; they don't participate in the externalization + // metric at all. Done before read/parse so we don't pay I/O or + // tree-sitter cost for files we'll immediately drop. + if imports::is_policy_implementation_path(rel_path) { + tracing::warn!( + "skipping policy implementation file: {} (matched policy-indicator directory; \ + exclude via config if this is consumer code)", + rel_path.display(), + ); + continue; + } + let source = match std::fs::read_to_string(&file.path) { Ok(s) => s, Err(e) => { @@ -115,10 +142,20 @@ pub fn scan( tracing::debug!("parse errors in {}, scanning anyway", file.path.display()); } - let rel_path = file.path.strip_prefix(root).unwrap_or(&file.path); - - // Check for policy-engine imports in this file - let policy_imports = imports::find_policy_imports(&tree, source.as_bytes(), file.language); + // Check for policy-engine imports. For Go we use the package-level + // binding set computed in the pre-pass (a superset of this file's + // local bindings, including any propagated from sibling files). For + // every other language we stay per-file — cross-file flow there + // needs a project-wide symbol table that isn't built yet. + let policy_imports = if file.language == Language::Go { + file.path + .parent() + .and_then(|d| go_package_bindings.get(d)) + .cloned() + .unwrap_or_default() + } else { + imports::find_policy_imports(&tree, source.as_bytes(), file.language) + }; let compiled_rules = &compiled_cache[&(file.language, file.is_tsx_jsx)]; for compiled in compiled_rules { @@ -182,3 +219,84 @@ pub fn scan( enforcement_points, }) } + +/// Group Go files by their parent directory (=Go package) and compute the +/// union of imports + propagation edges across each group, returning the +/// per-package binding set keyed by directory. +/// +/// Files inside policy-engine implementation directories (`internal/authz/` +/// etc.) are skipped here for the same reason the main loop skips them: the +/// files themselves *are* the policy engine, and any bindings they contribute +/// are noise relative to consumer-side detection. +/// +/// We re-parse Go files here rather than threading a tree cache through the +/// main loop. The double parse is cheap (tree-sitter Go is fast and the +/// number of `.go` files in real repos is bounded) and the simpler control +/// flow is worth it; if profiling later flags this, the obvious next step +/// is to pre-parse once and pass the trees through. +/// +/// Same-directory ≠ same Go package: `_test.go` files can declare +/// `package foo_test` alongside `package foo`, and build constraints can +/// gate files to different OS/arch combos. We deliberately union bindings +/// across all of them. The union is monotonic for consumer detection +/// (extra bindings only reduce false embedded findings; they can't turn a +/// real finding into a false enforcement point), so this is safe today. +/// Revisit if propagation ever gains non-monotonic logic (scoring, +/// subtraction, etc.). +fn build_go_package_bindings( + root: &Path, + files: &[discovery::DiscoveredFile], +) -> HashMap> { + let mut by_dir: HashMap> = HashMap::new(); + for file in files { + if file.language != Language::Go { + continue; + } + let rel = file.path.strip_prefix(root).unwrap_or(&file.path); + if imports::is_policy_implementation_path(rel) { + continue; + } + let Some(dir) = file.path.parent() else { + continue; + }; + by_dir + .entry(dir.to_path_buf()) + .or_default() + .push(&file.path); + } + + let mut result: HashMap> = HashMap::new(); + let mut ts_parser = tree_sitter::Parser::new(); + + for (dir, paths) in by_dir { + let mut parsed: Vec<(tree_sitter::Tree, Vec)> = Vec::with_capacity(paths.len()); + for path in paths { + let source = match std::fs::read_to_string(path) { + Ok(s) => s, + Err(e) => { + tracing::warn!("skipping {} during package scan: {}", path.display(), e); + continue; + } + }; + let tree = match parser::parse_source( + &mut ts_parser, + source.as_bytes(), + Language::Go, + false, + ) { + Ok(t) => t, + Err(e) => { + tracing::warn!("skipping {} during package scan: {}", path.display(), e); + continue; + } + }; + parsed.push((tree, source.into_bytes())); + } + let bindings = + imports::find_go_package_policy_imports(parsed.iter().map(|(t, s)| (t, s.as_slice()))); + if !bindings.is_empty() { + result.insert(dir, bindings); + } + } + result +} diff --git a/tests/scanner_enforcement_points.rs b/tests/scanner_enforcement_points.rs index 2eb5b0e..6b0d7b2 100644 --- a/tests/scanner_enforcement_points.rs +++ b/tests/scanner_enforcement_points.rs @@ -226,6 +226,88 @@ func check() { ); } +#[test] +fn go_package_propagation_reroutes_cross_file_consumer() { + // OCP-shaped: one file in `package database` imports `authz` and wires + // its constructor onto a struct field; a sibling consumer file with no + // imports of its own calls the field via the receiver. Per-file scan + // sees nothing in the consumer. Package-scoped propagation must surface + // `accessFactory` as a policy binding so the consumer's call counts as + // an enforcement point, not an embedded finding. + let dir = tempdir().unwrap(); + let pkg_dir = dir.path().join("internal").join("database"); + fs::create_dir_all(&pkg_dir).unwrap(); + fs::write( + pkg_dir.join("database.go"), + r#"package database + +import "github.com/example/authz" + +type Database struct { + accessFactory func() any +} + +func New() *Database { + return &Database{accessFactory: authz.NewAccess} +} +"#, + ) + .unwrap(); + fs::write( + pkg_dir.join("bundle_status.go"), + r#"package database + +func (d *Database) checkBundle() { + _ = d.accessFactory().WithPrincipal("bob").WithResource("bundles") +} +"#, + ) + .unwrap(); + + let config = ZiftConfig::default(); + let loaded_rules = rules::load_rules(None, &config).expect("embedded rules load"); + let args = ScanArgs { + path: dir.path().to_path_buf(), + ..ScanArgs::default() + }; + + let result = scanner::scan(dir.path(), &loaded_rules, &args, &config).unwrap(); + + assert!( + result.enforcement_points >= 1, + "expected the consumer's d.accessFactory().WithPrincipal(...) chain to \ + count as an enforcement point via cross-file propagation; got {} (findings: {:?})", + result.enforcement_points, + result + .findings + .iter() + .map(|f| ( + f.pattern_rule.clone(), + f.file.display().to_string(), + f.line_start + )) + .collect::>(), + ); + assert!( + !result + .findings + .iter() + .any(|f| f.file.ends_with("bundle_status.go") + && f.pattern_rule.as_deref() == Some("go-access-descriptor-builder")), + "consumer file leaked an embedded finding — package-scoped propagation \ + should have rerouted the call: {:?}", + result + .findings + .iter() + .map(|f| ( + f.pattern_rule.clone(), + f.file.display().to_string(), + f.line_start + )) + .collect::>(), + ); +} + #[test] fn enforcement_points_increments_for_python_authz_import() { // Use a `check_*_permission` shape so it actually trips a structural rule @@ -336,6 +418,52 @@ public class OrderController { ); } +#[test] +fn in_package_policy_implementation_file_is_skipped() { + // OCP case: `internal/authz/authz_test.go` lives in `package authz` and + // calls policy constructors directly (no import — same package). The + // structural rules still match those call sites, but the file *is* the + // policy engine, not a consumer. The path-based bypass should drop it + // before structural matching runs: no findings, no enforcement points. + let dir = tempdir().unwrap(); + let policy_dir = dir.path().join("internal").join("authz"); + fs::create_dir_all(&policy_dir).unwrap(); + fs::write( + policy_dir.join("authz_test.go"), + r#"package authz + +func TestNewAccess(t *testing.T) { + a := NewAccess() + _ = a.WithPrincipal("bob").WithResource("doc").Allow() +} +"#, + ) + .unwrap(); + + let config = ZiftConfig::default(); + let loaded_rules = rules::load_rules(None, &config).expect("embedded rules load"); + let args = ScanArgs { + path: dir.path().to_path_buf(), + ..ScanArgs::default() + }; + + let result = scanner::scan(dir.path(), &loaded_rules, &args, &config).unwrap(); + + assert!( + result.findings.is_empty(), + "in-package policy implementation file leaked findings: {:?}", + result + .findings + .iter() + .map(|f| (f.pattern_rule.clone(), f.line_start)) + .collect::>(), + ); + assert_eq!( + result.enforcement_points, 0, + "in-package policy implementation file should not be counted as an enforcement point", + ); +} + #[test] fn externalized_rules_count_without_policy_import_shortcut() { let cases = [