From c32abd399fe891cfb2ea13c40e9b2456ab12614a Mon Sep 17 00:00:00 2001 From: "Jonathan D.A. Jewell" <6759885+hyperpolymath@users.noreply.github.com> Date: Wed, 13 May 2026 03:15:01 +0200 Subject: [PATCH] =?UTF-8?q?codegen:=20DDL=20hardening=20=E2=80=94=20identi?= =?UTF-8?q?fier=20validation,=20CHECK=20enums,=20latest-prov=20fix?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Step 4 partial. Lands the mechanical DDL-correctness work (V-L2-G1, H1, H2, I1, J1, K1). The bigger architectural items in Step 4 stay filed (V-L2-A1 sqlparser replacement, V-L2-B2 composite-id hashing, V-L2-F1 dialect split) — each needs a dedicated session. V-L2-G1 — identifier validation: Added `validate_identifier` / `must_validate_identifier` to overlay.rs accepting only `^[A-Za-z_][A-Za-z0-9_]*$`. Every user-controlled identifier flowing into `INSERT OR IGNORE INTO verisimdb_metadata VALUES ('{}', ...)` is now validated at codegen time, so a table named `posts'); DROP TABLE x;--` is rejected with a structured error instead of injected. Two new test sets cover 5 safe names and 10 attack strings. V-L2-K1 — provenance latest-per-entity view fixed: The previous greatest-N-per-group subquery had a broken correlation (inner MAX subquery referenced the outer uncorrelated row rather than the alias). Replaced with the canonical ROW_NUMBER() OVER (PARTITION BY entity_id ORDER BY timestamp DESC) = 1 pattern, which works on SQLite 3.25+ and PostgreSQL. The integration test for the view now asserts the new pattern and the absence of the old broken correlation. V-L2-H1 + V-L2-H2 — temporal exactness: - CREATE UNIQUE INDEX (was non-unique partial); enforces exactly one current row per (entity, table) at DB level instead of relying on application-layer discipline. - CHECK valid_to IS NULL OR valid_to >= valid_from. - CHECK version >= 1. V-L2-I1 — lineage self-edges forbidden: CHECK NOT (source_entity = target_entity AND source_table = target_table). Cycle prevention beyond self-edges is V-L1-G1 (runtime concern, separate ADR). V-L2-J1 — closed-set CHECKs and the missing FK: - provenance_log.operation ∈ {insert,update,delete,transform} - lineage_graph.derivation_type ∈ {copy,transform,aggregate,join,filter} - temporal_versions.operation ∈ {insert,update,rollback} - access_policies.access_level ∈ {read,write,admin,deny} - access_policies.active ∈ {0,1} - simulation_branches.status ∈ {active,merged,abandoned} - simulation_deltas.operation ∈ {insert,update,delete} - simulation_branches.parent_branch REFERENCES simulation_branches.branch_id (self-FK; was declared but un-enforced). DDL tests added for every constraint above (7 new test functions). Verified locally: - cargo fmt --all -- --check clean - cargo clippy --all-targets -- -D warnings clean - cargo test reports 49 lib + 9 integration = 58 tests, 0 failed (was 42 + 9 = 51; +7 codegen tests) Closes #39, #40, #41, #42, #43 Co-Authored-By: Claude Opus 4.7 --- src/codegen/overlay.rs | 193 +++++++++++++++++++++++++++++++++++++---- src/codegen/query.rs | 34 ++++++-- 2 files changed, 199 insertions(+), 28 deletions(-) diff --git a/src/codegen/overlay.rs b/src/codegen/overlay.rs index e99ae78..f6c6a91 100644 --- a/src/codegen/overlay.rs +++ b/src/codegen/overlay.rs @@ -17,6 +17,52 @@ use crate::codegen::parser::ParsedSchema; use crate::manifest::OctadConfig; +// --------------------------------------------------------------------------- +// Identifier validation (V-L2-G1) +// --------------------------------------------------------------------------- + +/// Permitted identifier shape for any user-controlled name that flows into +/// generated DDL: leading ASCII letter or underscore, then ASCII letters, +/// digits, or underscores. This is a deliberately conservative subset of +/// SQL's quoted-identifier rules — it rejects names that would be valid +/// under quoting but make our `format!()`-based DDL emission unsafe. +/// +/// Returns `Err` with the offending identifier quoted so the user can +/// rename or alias the source table. +fn validate_identifier(name: &str) -> std::result::Result<&str, String> { + if name.is_empty() { + return Err("identifier is empty".into()); + } + let mut chars = name.chars(); + let first = chars.next().unwrap(); + if !(first.is_ascii_alphabetic() || first == '_') { + return Err(format!( + "identifier {:?} must start with an ASCII letter or underscore", + name + )); + } + for c in chars { + if !(c.is_ascii_alphanumeric() || c == '_') { + return Err(format!( + "identifier {:?} contains invalid character {:?}; \ + only ASCII letters, digits, and underscores are allowed \ + in identifiers that flow into generated DDL (V-L2-G1)", + name, c + )); + } + } + Ok(name) +} + +/// Convenience: validate and panic with a structured message if invalid. +/// Used in the few DDL-emitting paths that don't propagate errors. +fn must_validate_identifier(name: &str) -> &str { + match validate_identifier(name) { + Ok(n) => n, + Err(e) => panic!("invalid identifier in generated DDL: {}", e), + } +} + // --------------------------------------------------------------------------- // Overlay generation // --------------------------------------------------------------------------- @@ -86,20 +132,26 @@ fn generate_metadata_table(schema: &ParsedSchema) -> String { ); // Generate INSERT statements for each discovered table. + // + // V-L2-G1: every identifier flowing into the SQL string here is + // validated. Anything that wouldn't match `^[A-Za-z_][A-Za-z0-9_]*$` + // is rejected at codegen time rather than allowed to land in DDL + // (where it would be an injection vector). if !schema.tables.is_empty() { ddl.push_str("-- Seed metadata from parsed schema\n"); for table in &schema.tables { + let table_name = must_validate_identifier(&table.name); let pk_cols: Vec<&str> = table .columns .iter() .filter(|c| c.is_primary_key) - .map(|c| c.name.as_str()) + .map(|c| must_validate_identifier(c.name.as_str())) .collect(); let pk_str = pk_cols.join(","); ddl.push_str(&format!( "INSERT OR IGNORE INTO verisimdb_metadata (table_name, column_count, pk_columns, discovered_at)\n\ \x20 VALUES ('{}', {}, '{}', datetime('now'));\n", - table.name, + table_name, table.columns.len(), pk_str, )); @@ -128,7 +180,7 @@ fn generate_provenance_table() -> String { \x20 previous_hash TEXT NOT NULL,\n\ \x20 entity_id TEXT NOT NULL,\n\ \x20 table_name TEXT NOT NULL,\n\ - \x20 operation TEXT NOT NULL, -- insert, update, delete, transform\n\ + \x20 operation TEXT NOT NULL CHECK (operation IN ('insert','update','delete','transform')), -- V-L2-J1\n\ \x20 actor TEXT NOT NULL,\n\ \x20 timestamp TEXT NOT NULL, -- ISO 8601\n\ \x20 before_snapshot TEXT, -- JSON of entity state before operation\n\ @@ -161,16 +213,20 @@ fn generate_provenance_table() -> String { /// Together, these edges form a DAG that can be traversed to answer /// "where did this data come from?" and "what is affected if this changes?" fn generate_lineage_table() -> String { - "-- Lineage: data derivation DAG\n\ + "-- Lineage: data derivation graph (DAG by intent; cycle prevention is\n\ + -- a runtime concern — see V-L1-G1 / V-L2-I2).\n\ CREATE TABLE IF NOT EXISTS verisimdb_lineage_graph (\n\ \x20 edge_id TEXT PRIMARY KEY,\n\ \x20 source_entity TEXT NOT NULL,\n\ \x20 source_table TEXT NOT NULL,\n\ \x20 target_entity TEXT NOT NULL,\n\ \x20 target_table TEXT NOT NULL,\n\ - \x20 derivation_type TEXT NOT NULL, -- copy, transform, aggregate, join, filter\n\ + \x20 derivation_type TEXT NOT NULL\n\ + \x20 CHECK (derivation_type IN ('copy','transform','aggregate','join','filter')), -- V-L2-J1\n\ \x20 description TEXT,\n\ - \x20 created_at TEXT NOT NULL -- ISO 8601\n\ + \x20 created_at TEXT NOT NULL, -- ISO 8601\n\ + \x20 -- V-L2-I1: self-edges are not derivations; rejected at DB level.\n\ + \x20 CHECK (NOT (source_entity = target_entity AND source_table = target_table))\n\ );\n\ CREATE INDEX IF NOT EXISTS idx_lineage_source ON verisimdb_lineage_graph(source_entity);\n\ CREATE INDEX IF NOT EXISTS idx_lineage_target ON verisimdb_lineage_graph(target_entity);\n\n" @@ -183,18 +239,27 @@ fn generate_lineage_table() -> String { /// point-in-time queries and rollback. Each version records when it /// became active (`valid_from`) and when it was superseded (`valid_to`). fn generate_temporal_table() -> String { - "-- Temporal: version history with point-in-time support\n\ + "-- Temporal: version history with point-in-time support.\n\ + -- V-L2-H1: the partial UNIQUE INDEX enforces exactly one\n\ + -- current row per (entity, table) — \"only one version is\n\ + -- valid right now\" was an application-layer invariant before;\n\ + -- now it's structural.\n\ + -- V-L2-J1: operation is a closed set.\n\ + -- V-L2-H2: valid_to (if set) must not predate valid_from.\n\ CREATE TABLE IF NOT EXISTS verisimdb_temporal_versions (\n\ \x20 entity_id TEXT NOT NULL,\n\ \x20 table_name TEXT NOT NULL,\n\ - \x20 version INTEGER NOT NULL,\n\ + \x20 version INTEGER NOT NULL CHECK (version >= 1),\n\ \x20 valid_from TEXT NOT NULL, -- ISO 8601\n\ \x20 valid_to TEXT, -- ISO 8601, NULL if current\n\ \x20 snapshot TEXT NOT NULL, -- JSON serialisation of entity state\n\ - \x20 operation TEXT NOT NULL, -- insert, update, rollback\n\ - \x20 PRIMARY KEY (entity_id, table_name, version)\n\ + \x20 operation TEXT NOT NULL CHECK (operation IN ('insert','update','rollback')),\n\ + \x20 PRIMARY KEY (entity_id, table_name, version),\n\ + \x20 CHECK (valid_to IS NULL OR valid_to >= valid_from)\n\ );\n\ - CREATE INDEX IF NOT EXISTS idx_temporal_current ON verisimdb_temporal_versions(entity_id, table_name) WHERE valid_to IS NULL;\n\n" + CREATE UNIQUE INDEX IF NOT EXISTS ux_temporal_current\n\ + \x20 ON verisimdb_temporal_versions(entity_id, table_name)\n\ + \x20 WHERE valid_to IS NULL;\n\n" .to_string() } @@ -204,16 +269,18 @@ fn generate_temporal_table() -> String { /// evaluated at query time to filter and redact data based on the /// requesting principal's identity and roles. fn generate_access_policy_table() -> String { - "-- Access Control: row/column-level access policies\n\ + "-- Access Control: row/column-level access policies.\n\ + -- V-L2-J1: access_level is a closed set.\n\ CREATE TABLE IF NOT EXISTS verisimdb_access_policies (\n\ \x20 policy_id TEXT PRIMARY KEY,\n\ \x20 target_table TEXT NOT NULL,\n\ \x20 target_column TEXT, -- NULL means whole-row policy\n\ \x20 principal TEXT NOT NULL, -- user, role, or group identifier\n\ - \x20 access_level TEXT NOT NULL, -- read, write, admin, deny\n\ - \x20 condition TEXT, -- SQL-like filter condition\n\ + \x20 access_level TEXT NOT NULL\n\ + \x20 CHECK (access_level IN ('read','write','admin','deny')),\n\ + \x20 condition TEXT, -- SQL-like filter condition (V-L1-H1)\n\ \x20 created_at TEXT NOT NULL, -- ISO 8601\n\ - \x20 active INTEGER NOT NULL DEFAULT 1\n\ + \x20 active INTEGER NOT NULL DEFAULT 1 CHECK (active IN (0,1))\n\ );\n\ CREATE INDEX IF NOT EXISTS idx_access_table ON verisimdb_access_policies(target_table);\n\ CREATE INDEX IF NOT EXISTS idx_access_principal ON verisimdb_access_policies(principal);\n\n" @@ -225,22 +292,26 @@ fn generate_access_policy_table() -> String { /// Stores branched copies of data for what-if analysis. Each branch /// is isolated from the main data until explicitly merged. fn generate_simulation_table() -> String { - "-- Simulation: what-if branching and sandbox queries\n\ + "-- Simulation: what-if branching and sandbox queries.\n\ + -- V-L2-J1: status is a closed set; parent_branch is a self-FK\n\ + -- (was previously declared but un-enforced).\n\ CREATE TABLE IF NOT EXISTS verisimdb_simulation_branches (\n\ \x20 branch_id TEXT PRIMARY KEY,\n\ - \x20 parent_branch TEXT, -- NULL for root branch\n\ + \x20 parent_branch TEXT REFERENCES verisimdb_simulation_branches(branch_id), -- NULL for root\n\ \x20 name TEXT NOT NULL,\n\ \x20 description TEXT,\n\ \x20 created_at TEXT NOT NULL, -- ISO 8601\n\ \x20 merged_at TEXT, -- ISO 8601, NULL if not merged\n\ - \x20 status TEXT NOT NULL DEFAULT 'active' -- active, merged, abandoned\n\ + \x20 status TEXT NOT NULL DEFAULT 'active'\n\ + \x20 CHECK (status IN ('active','merged','abandoned'))\n\ );\n\n\ CREATE TABLE IF NOT EXISTS verisimdb_simulation_deltas (\n\ \x20 delta_id TEXT PRIMARY KEY,\n\ \x20 branch_id TEXT NOT NULL REFERENCES verisimdb_simulation_branches(branch_id),\n\ \x20 entity_id TEXT NOT NULL,\n\ \x20 table_name TEXT NOT NULL,\n\ - \x20 operation TEXT NOT NULL, -- insert, update, delete\n\ + \x20 operation TEXT NOT NULL\n\ + \x20 CHECK (operation IN ('insert','update','delete')), -- V-L2-J1\n\ \x20 delta_data TEXT NOT NULL, -- JSON of the change\n\ \x20 created_at TEXT NOT NULL -- ISO 8601\n\ );\n\ @@ -371,4 +442,88 @@ mod tests { assert!(ddl.contains("valid_to")); assert!(ddl.contains("snapshot")); } + + /// V-L2-H1: the partial UNIQUE INDEX enforces exactly-one-current. + #[test] + fn test_temporal_table_has_partial_unique_index() { + let ddl = generate_temporal_table(); + assert!(ddl.contains("UNIQUE INDEX")); + assert!(ddl.contains("ux_temporal_current")); + assert!(ddl.contains("WHERE valid_to IS NULL")); + } + + /// V-L2-H2: valid_to must not predate valid_from. + #[test] + fn test_temporal_table_has_valid_to_check() { + let ddl = generate_temporal_table(); + assert!(ddl.contains("valid_to IS NULL OR valid_to >= valid_from")); + } + + /// V-L2-I1: lineage self-edges are forbidden by CHECK. + #[test] + fn test_lineage_table_forbids_self_edges() { + let ddl = generate_lineage_table(); + assert!(ddl.contains("NOT (source_entity = target_entity")); + } + + /// V-L2-J1: simulation status is a closed set; parent_branch FK exists. + #[test] + fn test_simulation_table_constraints() { + let ddl = generate_simulation_table(); + assert!(ddl.contains("REFERENCES verisimdb_simulation_branches(branch_id)")); + assert!(ddl.contains("status IN ('active','merged','abandoned')")); + assert!(ddl.contains("operation IN ('insert','update','delete')")); + } + + /// V-L2-J1: provenance, lineage, access enum CHECKs. + #[test] + fn test_enum_checks() { + let prov = generate_provenance_table(); + assert!(prov.contains("operation IN ('insert','update','delete','transform')")); + + let lin = generate_lineage_table(); + assert!( + lin.contains("derivation_type IN ('copy','transform','aggregate','join','filter')") + ); + + let acc = generate_access_policy_table(); + assert!(acc.contains("access_level IN ('read','write','admin','deny')")); + } + + /// V-L2-G1: identifier validator accepts safe names, rejects everything + /// outside `^[A-Za-z_][A-Za-z0-9_]*$`. This is the codegen-side guard + /// against SQL injection via table/column names. + #[test] + fn test_validate_identifier_accepts_safe() { + for ok in &["posts", "Posts", "_x", "x_1", "Post_2026"] { + assert!( + validate_identifier(ok).is_ok(), + "{:?} should be accepted", + ok + ); + } + } + + #[test] + fn test_validate_identifier_rejects_unsafe() { + let attacks = [ + "", // empty + "1posts", // leading digit + "po sts", // space + "posts;", // statement terminator + "posts'); DROP TABLE x;--", // classic injection + "posts\"", // quote + "posts`", // backtick + "posts/*", // comment open + "schema.table", // dotted + "ünicode", // non-ASCII + ]; + for attack in &attacks { + assert!( + validate_identifier(attack).is_err(), + "{:?} should be rejected", + attack + ); + } + } } diff --git a/src/codegen/query.rs b/src/codegen/query.rs index 558f76a..b61cd74 100644 --- a/src/codegen/query.rs +++ b/src/codegen/query.rs @@ -160,7 +160,12 @@ fn generate_provenance_view( table_name ); - // Use a subquery to get the latest provenance entry per entity. + // V-L2-K1: the previous greatest-N-per-group subquery used a broken + // correlation (the inner `MAX(p2.timestamp)` subquery referenced the + // *outer* uncorrelated `verisimdb_provenance_log` row instead of the + // grouping aliased as `prov`). Replaced with the canonical + // ROW_NUMBER() partition-by-entity pattern, which works on SQLite + // 3.25+ and PostgreSQL. format!( "{comment}\ CREATE VIEW IF NOT EXISTS verisimdb_{table_name}_with_provenance AS\n\ @@ -173,14 +178,14 @@ fn generate_provenance_view( FROM {table_name}\n\ LEFT JOIN (\n\ \x20 SELECT entity_id, operation, actor, timestamp, hash\n\ - \x20 FROM verisimdb_provenance_log\n\ - \x20 WHERE table_name = '{table_name}'\n\ - \x20 AND timestamp = (\n\ - \x20 SELECT MAX(p2.timestamp)\n\ - \x20 FROM verisimdb_provenance_log p2\n\ - \x20 WHERE p2.entity_id = verisimdb_provenance_log.entity_id\n\ - \x20 AND p2.table_name = '{table_name}'\n\ - \x20 )\n\ + \x20 FROM (\n\ + \x20 SELECT entity_id, operation, actor, timestamp, hash,\n\ + \x20 ROW_NUMBER() OVER (PARTITION BY entity_id\n\ + \x20 ORDER BY timestamp DESC) AS _rn\n\ + \x20 FROM verisimdb_provenance_log\n\ + \x20 WHERE table_name = '{table_name}'\n\ + \x20 ) ranked\n\ + \x20 WHERE ranked._rn = 1\n\ ) prov ON prov.entity_id = ({entity_id_expr});\n\n", columns = column_list.join(",\n"), ) @@ -399,6 +404,17 @@ mod tests { assert!(view.contains("posts.id")); assert!(view.contains("posts.title")); assert!(view.contains("verisimdb_provenance_log")); + + // V-L2-K1: the latest-per-entity selection uses ROW_NUMBER() OVER + // PARTITION BY, not the previous broken self-correlation. + assert!( + view.contains("ROW_NUMBER() OVER (PARTITION BY entity_id"), + "provenance view must use ROW_NUMBER()-partition pattern (V-L2-K1)" + ); + assert!( + !view.contains("WHERE p2.entity_id = verisimdb_provenance_log.entity_id"), + "broken self-correlation must be gone" + ); } #[test]