Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
199 changes: 199 additions & 0 deletions src/scanner/imports.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use std::collections::HashSet;
use std::path::Path;

use streaming_iterator::StreamingIterator;

Expand All @@ -23,6 +24,36 @@ fn is_policy_path(source: &str) -> bool {
POLICY_INDICATORS.iter().any(|ind| lower.contains(ind))
}

/// Check if a file's path places it *inside* a policy-engine implementation
/// module — e.g. `internal/authz/authz_test.go` lives under a directory whose
/// name contains `authz`. Files like these are themselves the policy engine,
/// so the structural rules (which flag *consumers* of authz primitives) would
/// only produce noise. The scanner uses this to skip such files entirely.
///
/// Heuristic: any directory component along the path (excluding the filename
/// itself) contains a `POLICY_INDICATORS` substring, case-insensitive. The
/// filename is intentionally excluded — a top-level `authz.go` could plausibly
/// be either the implementation or a consumer; if it really is the
/// implementation, the user can exclude it via config. Bypassing here is
/// reserved for the unambiguous "in a policy directory" shape.
///
/// False-positive risk: the indicator set is shared with import-path matching
/// and includes broad terms (`enforce`, `opa`, `policy`). Consumer directories
/// like `services/enforcement/` or any path with `opa` as a substring will be
/// silently skipped — no findings, no enforcement points. The scanner surfaces
/// a `warn!` per skipped file so accidental bypasses are visible in logs; if a
/// legitimate consumer directory is being dropped, the user can either rename
/// it or exclude it via config (and we can narrow the indicator set later).
pub fn is_policy_implementation_path(rel_path: &Path) -> bool {
let Some(parent) = rel_path.parent() else {
return false;
};
parent.components().any(|component| {
let s = component.as_os_str().to_string_lossy().to_lowercase();
POLICY_INDICATORS.iter().any(|ind| s.contains(ind))
})
}

/// Tree-sitter query for named imports: `import { foo } from 'bar'`
/// and aliased: `import { foo as bar } from 'baz'`.
/// Captures the binding actually used in code (the alias when renamed,
Expand Down Expand Up @@ -148,6 +179,46 @@ pub fn find_policy_imports(
bindings
}

/// Compute policy bindings for a Go package by treating every `.go` file in
/// the package directory as one propagation domain. Same shape as
/// `find_policy_imports` for the single-file case — collect imports, then
/// (if any) collect propagation edges and run to fixed point — but with
/// imports and edges unioned across every file in the package.
///
/// Why: Go's idiomatic DI shape wires a policy primitive in one file and
/// calls it from another. The OCP corpus repo is the motivating case —
/// `internal/database/database.go` does
/// `&Database{accessFactory: authz.NewAccess}`, and 40+ call sites in
/// sibling files (`bundle_status.go`, etc.) reach the same field via
/// `d.accessFactory()...`. Per-file propagation never sees the
/// `accessFactory: authz.NewAccess` edge from the consumer's perspective,
/// so the call site looks like raw embedded authz. Unioning at the package
/// level matches Go's own scoping rules: package-private identifiers are
/// visible across files in the same directory, not across packages.
pub fn find_go_package_policy_imports<'a, I>(files: I) -> HashSet<String>
where
I: IntoIterator<Item = (&'a tree_sitter::Tree, &'a [u8])>,
{
// Materialize so we can do two passes — bindings first, then edges
// only when there's something to propagate.
let files: Vec<(&tree_sitter::Tree, &[u8])> = files.into_iter().collect();

let mut bindings: HashSet<String> = HashSet::new();
for (tree, source) in &files {
bindings.extend(find_go_policy_imports(tree, source));
}
if bindings.is_empty() {
return bindings;
}

let mut edges: Vec<(String, String)> = Vec::new();
for (tree, source) in &files {
edges.extend(extract_propagation_edges(tree, source, Language::Go));
}
propagate_to_fixed_point(&mut bindings, &edges);
bindings
}

/// Iteratively grow `bindings` by adding any edge LHS whose RHS textually
/// mentions a known binding. Stops when no edge changed the set.
///
Expand Down Expand Up @@ -1777,4 +1848,132 @@ const cached = handler;
let imports = find_policy_imports(&tree, source.as_bytes(), Language::TypeScript);
assert!(imports.is_empty(), "got: {imports:?}");
}

// ---------- Go package-scoped propagation ----------

#[test]
fn go_package_propagates_across_files() {
// OCP-shaped: one file in the package imports `authz` and stashes
// its constructor on a struct field; a sibling file with no policy
// imports of its own calls the field via the struct receiver. The
// per-file pass would miss the consumer entirely; the package pass
// must surface `accessFactory` as a binding visible to both files.
let producer = r#"
package database

import "github.com/example/authz"

type Database struct {
accessFactory func() Access
}

func New() *Database {
return &Database{accessFactory: authz.NewAccess}
}
"#;
let consumer = r#"
package database

func (d *Database) checkBundle() {
d.accessFactory().WithPrincipal("bob")
}
"#;
let producer_tree = parse_lang(producer, Language::Go);
let consumer_tree = parse_lang(consumer, Language::Go);

let bindings = find_go_package_policy_imports([
(&producer_tree, producer.as_bytes()),
(&consumer_tree, consumer.as_bytes()),
]);

assert!(bindings.contains("authz"));
assert!(
bindings.contains("accessFactory"),
"expected the cross-file edge to propagate; got: {bindings:?}"
);
assert!(is_enforcement_point(
"d.accessFactory().WithPrincipal(\"bob\")",
&bindings,
));
}

#[test]
fn go_package_returns_empty_when_no_file_imports_policy() {
// If no file in the package has a policy import, propagation has no
// seed and the result must be empty even when files reference each
// other through edges.
let a = r#"
package svc

import "github.com/example/utils"

func wire() any {
return utils.New
}
"#;
let b = r#"
package svc

var helper = wire
"#;
let at = parse_lang(a, Language::Go);
let bt = parse_lang(b, Language::Go);

let bindings = find_go_package_policy_imports([(&at, a.as_bytes()), (&bt, b.as_bytes())]);
assert!(bindings.is_empty(), "got: {bindings:?}");
}

// ---------- Policy-implementation path bypass ----------

#[test]
fn implementation_path_matches_policy_directory_components() {
// The shapes that motivated this check: files that live under a
// policy-engine module directory, where structural rules would
// only flag the engine's own internals.
assert!(is_policy_implementation_path(Path::new(
"internal/authz/authz_test.go"
)));
assert!(is_policy_implementation_path(Path::new(
"pkg/policy/check.go"
)));
assert!(is_policy_implementation_path(Path::new("src/opa/eval.ts")));
assert!(is_policy_implementation_path(Path::new(
"lib/cedar/engine.ts"
)));
// Indicator-bearing component anywhere along the path counts —
// not just the immediate parent directory.
assert!(is_policy_implementation_path(Path::new(
"services/policy/src/check.ts"
)));
}

#[test]
fn implementation_path_is_case_insensitive() {
assert!(is_policy_implementation_path(Path::new(
"Internal/Authz/Handler.cs"
)));
assert!(is_policy_implementation_path(Path::new(
"src/Policy/Engine.java"
)));
}

#[test]
fn implementation_path_ignores_filename_only_match() {
// A top-level `authz.go` shouldn't auto-bypass just because the
// filename mentions a policy term — the directory placement is the
// signal, not the basename. Users with a top-level implementation
// file can exclude it via config.
assert!(!is_policy_implementation_path(Path::new("authz.go")));
assert!(!is_policy_implementation_path(Path::new("policy.ts")));
}

#[test]
fn implementation_path_skips_non_policy_directories() {
assert!(!is_policy_implementation_path(Path::new(
"src/handlers/orders.ts"
)));
assert!(!is_policy_implementation_path(Path::new(
"internal/database/bundle_status.go"
)));
}
}
130 changes: 124 additions & 6 deletions src/scanner/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ pub mod imports;
pub mod matcher;
pub mod parser;

use std::collections::HashMap;
use std::path::Path;
use std::collections::{HashMap, HashSet};
use std::path::{Path, PathBuf};

use crate::cli::ScanArgs;
use crate::config::ZiftConfig;
Expand Down Expand Up @@ -71,6 +71,14 @@ pub fn scan(
compiled_cache.insert((*lang, *is_tsx_jsx), compiled_rules);
}

// Pre-compute per-Go-package policy bindings. Go's scoping makes the
// package directory the natural propagation domain — bindings declared
// in one file are visible to siblings via package-private identifiers.
// Without this pass, a consumer file with no policy imports of its own
// would never recognize a call like `d.accessFactory()` even when a
// sibling file wired `accessFactory: authz.NewAccess`.
let go_package_bindings = build_go_package_bindings(root, &files);

let mut ts_parser = tree_sitter::Parser::new();
let mut all_findings = Vec::new();
// `enforcement_points` counts call sites that *would* have matched a
Expand All @@ -90,6 +98,25 @@ pub fn scan(
std::collections::HashSet::new();

for file in &files {
let rel_path = file.path.strip_prefix(root).unwrap_or(&file.path);

// Files that live *inside* a policy-engine implementation directory
// (`internal/authz/**`, `pkg/policy/**`, etc.) are themselves the
// policy engine, not consumers of one. Structural rules flag the
// embedded-authz shape, so running them here is guaranteed noise.
// Skip the file entirely — these don't count as findings or as
// enforcement points; they don't participate in the externalization
// metric at all. Done before read/parse so we don't pay I/O or
// tree-sitter cost for files we'll immediately drop.
if imports::is_policy_implementation_path(rel_path) {
tracing::warn!(
"skipping policy implementation file: {} (matched policy-indicator directory; \
exclude via config if this is consumer code)",
rel_path.display(),
);
continue;
}

let source = match std::fs::read_to_string(&file.path) {
Ok(s) => s,
Err(e) => {
Expand All @@ -115,10 +142,20 @@ pub fn scan(
tracing::debug!("parse errors in {}, scanning anyway", file.path.display());
}

let rel_path = file.path.strip_prefix(root).unwrap_or(&file.path);

// Check for policy-engine imports in this file
let policy_imports = imports::find_policy_imports(&tree, source.as_bytes(), file.language);
// Check for policy-engine imports. For Go we use the package-level
// binding set computed in the pre-pass (a superset of this file's
// local bindings, including any propagated from sibling files). For
// every other language we stay per-file — cross-file flow there
// needs a project-wide symbol table that isn't built yet.
let policy_imports = if file.language == Language::Go {
file.path
.parent()
.and_then(|d| go_package_bindings.get(d))
.cloned()
.unwrap_or_default()
} else {
imports::find_policy_imports(&tree, source.as_bytes(), file.language)
};

let compiled_rules = &compiled_cache[&(file.language, file.is_tsx_jsx)];
for compiled in compiled_rules {
Expand Down Expand Up @@ -182,3 +219,84 @@ pub fn scan(
enforcement_points,
})
}

/// Group Go files by their parent directory (=Go package) and compute the
/// union of imports + propagation edges across each group, returning the
/// per-package binding set keyed by directory.
///
/// Files inside policy-engine implementation directories (`internal/authz/`
/// etc.) are skipped here for the same reason the main loop skips them: the
/// files themselves *are* the policy engine, and any bindings they contribute
/// are noise relative to consumer-side detection.
///
/// We re-parse Go files here rather than threading a tree cache through the
/// main loop. The double parse is cheap (tree-sitter Go is fast and the
/// number of `.go` files in real repos is bounded) and the simpler control
/// flow is worth it; if profiling later flags this, the obvious next step
/// is to pre-parse once and pass the trees through.
///
/// Same-directory ≠ same Go package: `_test.go` files can declare
/// `package foo_test` alongside `package foo`, and build constraints can
/// gate files to different OS/arch combos. We deliberately union bindings
/// across all of them. The union is monotonic for consumer detection
/// (extra bindings only reduce false embedded findings; they can't turn a
/// real finding into a false enforcement point), so this is safe today.
/// Revisit if propagation ever gains non-monotonic logic (scoring,
/// subtraction, etc.).
fn build_go_package_bindings(
root: &Path,
files: &[discovery::DiscoveredFile],
) -> HashMap<PathBuf, HashSet<String>> {
let mut by_dir: HashMap<PathBuf, Vec<&Path>> = HashMap::new();
for file in files {
if file.language != Language::Go {
continue;
}
let rel = file.path.strip_prefix(root).unwrap_or(&file.path);
if imports::is_policy_implementation_path(rel) {
continue;
}
let Some(dir) = file.path.parent() else {
continue;
};
by_dir
.entry(dir.to_path_buf())
.or_default()
.push(&file.path);
}

let mut result: HashMap<PathBuf, HashSet<String>> = HashMap::new();
let mut ts_parser = tree_sitter::Parser::new();

for (dir, paths) in by_dir {
let mut parsed: Vec<(tree_sitter::Tree, Vec<u8>)> = Vec::with_capacity(paths.len());
for path in paths {
let source = match std::fs::read_to_string(path) {
Ok(s) => s,
Err(e) => {
tracing::warn!("skipping {} during package scan: {}", path.display(), e);
continue;
}
};
let tree = match parser::parse_source(
&mut ts_parser,
source.as_bytes(),
Language::Go,
false,
) {
Ok(t) => t,
Err(e) => {
tracing::warn!("skipping {} during package scan: {}", path.display(), e);
continue;
}
};
parsed.push((tree, source.into_bytes()));
}
let bindings =
imports::find_go_package_policy_imports(parsed.iter().map(|(t, s)| (t, s.as_slice())));
if !bindings.is_empty() {
result.insert(dir, bindings);
}
}
result
}
Loading