diff --git a/src/codegen/ident.rs b/src/codegen/ident.rs new file mode 100644 index 0000000..26c5cfe --- /dev/null +++ b/src/codegen/ident.rs @@ -0,0 +1,141 @@ +// SPDX-License-Identifier: PMPL-1.0-or-later +// Copyright (c) 2026 Jonathan D.A. Jewell (hyperpolymath) +// +// Identifier validation for any user-controlled name flowing into +// generated DDL. The codegen layer interpolates table and column names +// directly into SQL via `format!()`; an unchecked name like +// `posts'); DROP TABLE x;--` would be injected verbatim. Closes #39. + +use anyhow::{Result, bail}; + +/// Allowed: leading letter or underscore, then letters/digits/underscores. +/// Same shape as standard SQL identifiers without quoting. Length 1..=63 +/// (Postgres' NAMEDATALEN minus the null terminator). +const MAX_IDENT_LEN: usize = 63; + +/// Validate a single identifier (table or column name) for safe +/// interpolation into generated DDL. +/// +/// Returns `Ok(ident)` if `ident` matches `^[A-Za-z_][A-Za-z0-9_]*$` and +/// is within the length bound; otherwise an `anyhow::Error` whose message +/// includes the offending identifier (truncated) and the kind label so +/// users can pinpoint the offending schema input. +pub fn validate_identifier<'a>(ident: &'a str, kind: &str) -> Result<&'a str> { + if ident.is_empty() { + bail!("invalid {kind}: empty identifier"); + } + if ident.len() > MAX_IDENT_LEN { + bail!( + "invalid {kind} '{}': identifier too long ({} > {} chars)", + display_truncated(ident), + ident.len(), + MAX_IDENT_LEN, + ); + } + let mut chars = ident.chars(); + let first = chars.next().expect("non-empty checked above"); + if !(first.is_ascii_alphabetic() || first == '_') { + bail!( + "invalid {kind} '{}': must start with a letter or underscore", + display_truncated(ident), + ); + } + for c in chars { + if !(c.is_ascii_alphanumeric() || c == '_') { + bail!( + "invalid {kind} '{}': contains disallowed character {:?} \ + (only [A-Za-z0-9_] permitted)", + display_truncated(ident), + c, + ); + } + } + Ok(ident) +} + +fn display_truncated(s: &str) -> String { + const LIMIT: usize = 60; + if s.len() <= LIMIT { + s.to_string() + } else { + format!("{}…", &s[..LIMIT]) + } +} + +#[cfg(test)] +mod tests { + use super::validate_identifier; + + #[test] + fn valid_identifiers_pass() { + for ok in [ + "posts", + "users", + "Order", + "_internal", + "user_2024", + "T", + "a_b_c_d", + ] { + validate_identifier(ok, "table").unwrap_or_else(|e| panic!("{ok} should pass: {e}")); + } + } + + /// Attack strings that the validator must reject. At least 10 per the + /// V-L2-G1 acceptance criteria. + #[test] + fn injection_strings_rejected() { + let attacks = [ + "posts'); DROP TABLE x;--", + "posts; DROP TABLE x;", + "posts--", + "1posts", // leading digit + "", // empty + "posts table", // space + "posts;", // semicolon + "posts'", // single quote + "posts\"x\"", // double quote + "posts/*x*/", // comment + "posts\nx", // newline + "posts\tx", // tab + "posts UNION SELECT 1", + "ünicode", // non-ASCII + "posts.col", // dot + "posts(", // paren + ]; + for attack in attacks { + let result = validate_identifier(attack, "table"); + assert!( + result.is_err(), + "injection string {attack:?} must be rejected" + ); + } + } + + /// Long identifiers (over 63 chars) must be rejected. + #[test] + fn overlong_identifiers_rejected() { + let long = "a".repeat(64); + let err = validate_identifier(&long, "column").unwrap_err(); + assert!( + err.to_string().contains("too long"), + "expected length error, got: {err}" + ); + } + + /// The error message must include the offending identifier so users + /// can find it in their schema. + #[test] + fn error_names_offending_identifier() { + let err = validate_identifier("bad name", "table").unwrap_err(); + let msg = err.to_string(); + assert!( + msg.contains("bad name"), + "error must name the offending identifier; got: {msg}" + ); + assert!( + msg.contains("table"), + "error must name the kind; got: {msg}" + ); + } +} diff --git a/src/codegen/mod.rs b/src/codegen/mod.rs index 9cbbcb5..0f4f627 100644 --- a/src/codegen/mod.rs +++ b/src/codegen/mod.rs @@ -13,6 +13,7 @@ // - overlay: Generate sidecar schema DDL for enabled octad dimensions // - query: Generate query interceptor SQL for octad enrichment +pub mod ident; pub mod overlay; pub mod parser; pub mod query; diff --git a/src/codegen/overlay.rs b/src/codegen/overlay.rs index 7dd4a29..9707986 100644 --- a/src/codegen/overlay.rs +++ b/src/codegen/overlay.rs @@ -34,8 +34,24 @@ use crate::manifest::OctadConfig; /// dimension tables to generate. /// /// # Returns -/// A string containing the complete DDL for the sidecar database. -pub fn generate_sidecar_schema(schema: &ParsedSchema, octad: &OctadConfig) -> String { +/// `Ok(String)` with the complete DDL on success. `Err` if any table or +/// column name in `schema` is not a valid SQL identifier per +/// [`crate::codegen::ident::validate_identifier`] — guards against SQL +/// injection via parsed schema input. Closes #39. +pub fn generate_sidecar_schema( + schema: &ParsedSchema, + octad: &OctadConfig, +) -> anyhow::Result { + use crate::codegen::ident::validate_identifier; + + // Fail fast on any unsafe identifier flowing into generated DDL. + for table in &schema.tables { + validate_identifier(&table.name, "table name")?; + for column in &table.columns { + validate_identifier(&column.name, "column name")?; + } + } + let mut ddl = String::new(); ddl.push_str("-- SPDX-License-Identifier: PMPL-1.0-or-later\n"); @@ -65,7 +81,7 @@ pub fn generate_sidecar_schema(schema: &ParsedSchema, octad: &OctadConfig) -> St ddl.push_str(&generate_simulation_table()); } - ddl + Ok(ddl) } /// Generate the metadata table that tracks which target tables are augmented. @@ -284,7 +300,7 @@ mod tests { enable_constraints: true, enable_simulation: true, }; - let ddl = generate_sidecar_schema(&schema, &octad); + let ddl = generate_sidecar_schema(&schema, &octad).expect("test schema must validate"); assert!(ddl.contains("verisimdb_metadata")); assert!(ddl.contains("verisimdb_provenance_log")); @@ -308,7 +324,7 @@ mod tests { enable_constraints: true, enable_simulation: true, }; - let ddl = generate_sidecar_schema(&schema, &octad); + let ddl = generate_sidecar_schema(&schema, &octad).expect("test schema must validate"); // Self-referencing FK on parent_branch. assert!( @@ -351,7 +367,7 @@ mod tests { enable_constraints: false, enable_simulation: false, }; - let ddl = generate_sidecar_schema(&schema, &octad); + let ddl = generate_sidecar_schema(&schema, &octad).expect("test schema must validate"); assert!(ddl.contains("verisimdb_temporal_versions")); assert!( ddl.contains( @@ -379,7 +395,7 @@ mod tests { enable_constraints: false, enable_simulation: false, }; - let ddl = generate_sidecar_schema(&schema, &octad); + let ddl = generate_sidecar_schema(&schema, &octad).expect("test schema must validate"); assert!(ddl.contains("verisimdb_lineage_graph")); // The exact CHECK clause must be present in the emitted DDL. assert!( @@ -399,7 +415,7 @@ mod tests { enable_constraints: false, enable_simulation: false, }; - let ddl = generate_sidecar_schema(&schema, &octad); + let ddl = generate_sidecar_schema(&schema, &octad).expect("test schema must validate"); // Metadata is always generated. assert!(ddl.contains("verisimdb_metadata")); @@ -415,7 +431,7 @@ mod tests { fn test_metadata_seeds_table_info() { let schema = test_schema(); let octad = OctadConfig::default(); - let ddl = generate_sidecar_schema(&schema, &octad); + let ddl = generate_sidecar_schema(&schema, &octad).expect("test schema must validate"); assert!(ddl.contains("INSERT OR IGNORE INTO verisimdb_metadata")); assert!(ddl.contains("'posts'")); diff --git a/src/main.rs b/src/main.rs index eea1f78..1b5c707 100644 --- a/src/main.rs +++ b/src/main.rs @@ -141,8 +141,10 @@ fn main() -> Result<()> { // Create output directory. std::fs::create_dir_all(&output)?; - // Generate sidecar overlay schema. - let overlay_ddl = codegen::overlay::generate_sidecar_schema(&schema, &m.octad); + // Generate sidecar overlay schema. Errors here surface invalid + // table/column identifiers in the parsed schema before they + // reach disk. + let overlay_ddl = codegen::overlay::generate_sidecar_schema(&schema, &m.octad)?; let overlay_path = format!("{}/sidecar_schema.sql", output); std::fs::write(&overlay_path, &overlay_ddl)?; println!("Generated sidecar schema: {}", overlay_path); diff --git a/tests/integration_test.rs b/tests/integration_test.rs index 30243b1..9adac57 100644 --- a/tests/integration_test.rs +++ b/tests/integration_test.rs @@ -78,7 +78,8 @@ fn test_full_pipeline_blog_schema() { enable_constraints: true, enable_simulation: false, }; - let overlay_ddl = overlay::generate_sidecar_schema(&schema, &octad); + let overlay_ddl = + overlay::generate_sidecar_schema(&schema, &octad).expect("schema is valid"); // Verify all expected sidecar tables are present. assert!( @@ -476,7 +477,8 @@ path = ".verisim/test.db" assert_eq!(schema.tables[0].name, "articles"); // Generate overlay. - let overlay_ddl = overlay::generate_sidecar_schema(&schema, &manifest.octad); + let overlay_ddl = overlay::generate_sidecar_schema(&schema, &manifest.octad) + .expect("schema is valid"); assert!(overlay_ddl.contains("verisim_provenance_log")); assert!(overlay_ddl.contains("verisim_temporal_versions")); assert!(