Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 141 additions & 0 deletions src/codegen/ident.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
// SPDX-License-Identifier: PMPL-1.0-or-later
// Copyright (c) 2026 Jonathan D.A. Jewell (hyperpolymath) <j.d.a.jewell@open.ac.uk>
//
// Identifier validation for any user-controlled name flowing into
// generated DDL. The codegen layer interpolates table and column names
// directly into SQL via `format!()`; an unchecked name like
// `posts'); DROP TABLE x;--` would be injected verbatim. Closes #39.

use anyhow::{Result, bail};

/// Allowed: leading letter or underscore, then letters/digits/underscores.
/// Same shape as standard SQL identifiers without quoting. Length 1..=63
/// (Postgres' NAMEDATALEN minus the null terminator).
const MAX_IDENT_LEN: usize = 63;

/// Validate a single identifier (table or column name) for safe
/// interpolation into generated DDL.
///
/// Returns `Ok(ident)` if `ident` matches `^[A-Za-z_][A-Za-z0-9_]*$` and
/// is within the length bound; otherwise an `anyhow::Error` whose message
/// includes the offending identifier (truncated) and the kind label so
/// users can pinpoint the offending schema input.
pub fn validate_identifier<'a>(ident: &'a str, kind: &str) -> Result<&'a str> {
if ident.is_empty() {
bail!("invalid {kind}: empty identifier");
}
if ident.len() > MAX_IDENT_LEN {
bail!(
"invalid {kind} '{}': identifier too long ({} > {} chars)",
display_truncated(ident),
ident.len(),
MAX_IDENT_LEN,
);
}
let mut chars = ident.chars();
let first = chars.next().expect("non-empty checked above");
if !(first.is_ascii_alphabetic() || first == '_') {
bail!(
"invalid {kind} '{}': must start with a letter or underscore",
display_truncated(ident),
);
}
for c in chars {
if !(c.is_ascii_alphanumeric() || c == '_') {
bail!(
"invalid {kind} '{}': contains disallowed character {:?} \
(only [A-Za-z0-9_] permitted)",
display_truncated(ident),
c,
);
}
}
Ok(ident)
}

fn display_truncated(s: &str) -> String {
const LIMIT: usize = 60;
if s.len() <= LIMIT {
s.to_string()
} else {
format!("{}…", &s[..LIMIT])
}
}

#[cfg(test)]
mod tests {
use super::validate_identifier;

#[test]
fn valid_identifiers_pass() {
for ok in [
"posts",
"users",
"Order",
"_internal",
"user_2024",
"T",
"a_b_c_d",
] {
validate_identifier(ok, "table").unwrap_or_else(|e| panic!("{ok} should pass: {e}"));
}
}

/// Attack strings that the validator must reject. At least 10 per the
/// V-L2-G1 acceptance criteria.
#[test]
fn injection_strings_rejected() {
let attacks = [
"posts'); DROP TABLE x;--",
"posts; DROP TABLE x;",
"posts--",
"1posts", // leading digit
"", // empty
"posts table", // space
"posts;", // semicolon
"posts'", // single quote
"posts\"x\"", // double quote
"posts/*x*/", // comment
"posts\nx", // newline
"posts\tx", // tab
"posts UNION SELECT 1",
"ünicode", // non-ASCII
"posts.col", // dot
"posts(", // paren
];
for attack in attacks {
let result = validate_identifier(attack, "table");
assert!(
result.is_err(),
"injection string {attack:?} must be rejected"
);
}
}

/// Long identifiers (over 63 chars) must be rejected.
#[test]
fn overlong_identifiers_rejected() {
let long = "a".repeat(64);
let err = validate_identifier(&long, "column").unwrap_err();
assert!(
err.to_string().contains("too long"),
"expected length error, got: {err}"
);
}

/// The error message must include the offending identifier so users
/// can find it in their schema.
#[test]
fn error_names_offending_identifier() {
let err = validate_identifier("bad name", "table").unwrap_err();
let msg = err.to_string();
assert!(
msg.contains("bad name"),
"error must name the offending identifier; got: {msg}"
);
assert!(
msg.contains("table"),
"error must name the kind; got: {msg}"
);
}
}
1 change: 1 addition & 0 deletions src/codegen/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
// - overlay: Generate sidecar schema DDL for enabled octad dimensions
// - query: Generate query interceptor SQL for octad enrichment

pub mod ident;
pub mod overlay;
pub mod parser;
pub mod query;
34 changes: 25 additions & 9 deletions src/codegen/overlay.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,24 @@ use crate::manifest::OctadConfig;
/// dimension tables to generate.
///
/// # Returns
/// A string containing the complete DDL for the sidecar database.
pub fn generate_sidecar_schema(schema: &ParsedSchema, octad: &OctadConfig) -> String {
/// `Ok(String)` with the complete DDL on success. `Err` if any table or
/// column name in `schema` is not a valid SQL identifier per
/// [`crate::codegen::ident::validate_identifier`] — guards against SQL
/// injection via parsed schema input. Closes #39.
pub fn generate_sidecar_schema(
schema: &ParsedSchema,
octad: &OctadConfig,
) -> anyhow::Result<String> {
use crate::codegen::ident::validate_identifier;

// Fail fast on any unsafe identifier flowing into generated DDL.
for table in &schema.tables {
validate_identifier(&table.name, "table name")?;
for column in &table.columns {
validate_identifier(&column.name, "column name")?;
}
}

let mut ddl = String::new();

ddl.push_str("-- SPDX-License-Identifier: PMPL-1.0-or-later\n");
Expand Down Expand Up @@ -65,7 +81,7 @@ pub fn generate_sidecar_schema(schema: &ParsedSchema, octad: &OctadConfig) -> St
ddl.push_str(&generate_simulation_table());
}

ddl
Ok(ddl)
}

/// Generate the metadata table that tracks which target tables are augmented.
Expand Down Expand Up @@ -284,7 +300,7 @@ mod tests {
enable_constraints: true,
enable_simulation: true,
};
let ddl = generate_sidecar_schema(&schema, &octad);
let ddl = generate_sidecar_schema(&schema, &octad).expect("test schema must validate");

assert!(ddl.contains("verisimdb_metadata"));
assert!(ddl.contains("verisimdb_provenance_log"));
Expand All @@ -308,7 +324,7 @@ mod tests {
enable_constraints: true,
enable_simulation: true,
};
let ddl = generate_sidecar_schema(&schema, &octad);
let ddl = generate_sidecar_schema(&schema, &octad).expect("test schema must validate");

// Self-referencing FK on parent_branch.
assert!(
Expand Down Expand Up @@ -351,7 +367,7 @@ mod tests {
enable_constraints: false,
enable_simulation: false,
};
let ddl = generate_sidecar_schema(&schema, &octad);
let ddl = generate_sidecar_schema(&schema, &octad).expect("test schema must validate");
assert!(ddl.contains("verisimdb_temporal_versions"));
assert!(
ddl.contains(
Expand Down Expand Up @@ -379,7 +395,7 @@ mod tests {
enable_constraints: false,
enable_simulation: false,
};
let ddl = generate_sidecar_schema(&schema, &octad);
let ddl = generate_sidecar_schema(&schema, &octad).expect("test schema must validate");
assert!(ddl.contains("verisimdb_lineage_graph"));
// The exact CHECK clause must be present in the emitted DDL.
assert!(
Expand All @@ -399,7 +415,7 @@ mod tests {
enable_constraints: false,
enable_simulation: false,
};
let ddl = generate_sidecar_schema(&schema, &octad);
let ddl = generate_sidecar_schema(&schema, &octad).expect("test schema must validate");

// Metadata is always generated.
assert!(ddl.contains("verisimdb_metadata"));
Expand All @@ -415,7 +431,7 @@ mod tests {
fn test_metadata_seeds_table_info() {
let schema = test_schema();
let octad = OctadConfig::default();
let ddl = generate_sidecar_schema(&schema, &octad);
let ddl = generate_sidecar_schema(&schema, &octad).expect("test schema must validate");

assert!(ddl.contains("INSERT OR IGNORE INTO verisimdb_metadata"));
assert!(ddl.contains("'posts'"));
Expand Down
6 changes: 4 additions & 2 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -141,8 +141,10 @@ fn main() -> Result<()> {
// Create output directory.
std::fs::create_dir_all(&output)?;

// Generate sidecar overlay schema.
let overlay_ddl = codegen::overlay::generate_sidecar_schema(&schema, &m.octad);
// Generate sidecar overlay schema. Errors here surface invalid
// table/column identifiers in the parsed schema before they
// reach disk.
let overlay_ddl = codegen::overlay::generate_sidecar_schema(&schema, &m.octad)?;
let overlay_path = format!("{}/sidecar_schema.sql", output);
std::fs::write(&overlay_path, &overlay_ddl)?;
println!("Generated sidecar schema: {}", overlay_path);
Expand Down
6 changes: 4 additions & 2 deletions tests/integration_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,8 @@ fn test_full_pipeline_blog_schema() {
enable_constraints: true,
enable_simulation: false,
};
let overlay_ddl = overlay::generate_sidecar_schema(&schema, &octad);
let overlay_ddl =
overlay::generate_sidecar_schema(&schema, &octad).expect("schema is valid");

// Verify all expected sidecar tables are present.
assert!(
Expand Down Expand Up @@ -476,7 +477,8 @@ path = ".verisim/test.db"
assert_eq!(schema.tables[0].name, "articles");

// Generate overlay.
let overlay_ddl = overlay::generate_sidecar_schema(&schema, &manifest.octad);
let overlay_ddl = overlay::generate_sidecar_schema(&schema, &manifest.octad)
.expect("schema is valid");
assert!(overlay_ddl.contains("verisim_provenance_log"));
assert!(overlay_ddl.contains("verisim_temporal_versions"));
assert!(
Expand Down
Loading