diff --git a/.gitignore b/.gitignore index 5f899ff8d..fd42dac32 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,9 @@ +# Benchmark generated output +benchmarks/results/ +benchmarks/figures/ + ################# ## Eclipse ################# diff --git a/benchmarks/data_generator.py b/benchmarks/data_generator.py new file mode 100644 index 000000000..2cc646b3a --- /dev/null +++ b/benchmarks/data_generator.py @@ -0,0 +1,230 @@ +"""Generate synthetic and real HED strings/Series for benchmarking. + +Usage:: + + from data_generator import DataGenerator + gen = DataGenerator() # loads schema 8.4.0 + s = gen.make_string(n_tags=10, n_groups=2, depth=1) + series = gen.make_series(n_rows=1000, n_tags=10, n_groups=2, depth=1) + real = gen.load_real_data(tile_to=5000) +""" + +from __future__ import annotations + +import os +import random + +import pandas as pd + +from hed.schema import load_schema_version +from hed.models.schema_lookup import generate_schema_lookup +from hed.models.tabular_input import TabularInput +from hed.models.df_util import convert_to_form + + +class DataGenerator: + """Build synthetic and real HED data for benchmarking.""" + + def __init__(self, schema_version="8.4.0", seed=42): + self.schema = load_schema_version(schema_version) + self.lookup = generate_schema_lookup(self.schema) + self._rng = random.Random(seed) + + # Collect real tag short names from the schema for realistic generation + self._all_tags = [] + for name, entry in self.schema.tags.items(): + if name.endswith("/#"): + continue + short = getattr(entry, "short_tag_name", name.rsplit("/", 1)[-1]) + self._all_tags.append(short) + + # Separate leaf vs non-leaf for variety + self._tags = list(self._all_tags) + + # ------------------------------------------------------------------ + # Single string generation + # ------------------------------------------------------------------ + + def _pick_tags(self, n, repeats=0): + """Pick *n* unique tags, then append *repeats* duplicates of the first.""" 
+ chosen = self._rng.sample(self._tags, min(n, len(self._tags))) + if repeats and chosen: + chosen.extend([chosen[0]] * repeats) + return chosen + + def make_string(self, n_tags=5, n_groups=0, depth=0, repeats=0, form="short"): + """Build a single synthetic HED string. + + Parameters: + n_tags: Total number of tag tokens (spread across top-level and groups). + n_groups: Number of parenthesised groups to create. + depth: Maximum nesting depth inside groups. + repeats: Number of duplicate copies of the first tag to append. + form: 'short' | 'long' — tag form. + + Returns: + str: A raw HED string. + """ + tags = self._pick_tags(n_tags, repeats=repeats) + if form == "long": + tags = self._to_long(tags) + + if n_groups == 0 or depth == 0: + return ", ".join(tags) + + # Distribute tags across top-level and groups + top_count = max(1, n_tags - n_groups * 2) + top_tags = tags[:top_count] + remaining = tags[top_count:] + + parts = list(top_tags) + for i in range(n_groups): + group_tags = remaining[i * 2 : i * 2 + 2] if i * 2 + 2 <= len(remaining) else remaining[i * 2 :] + if not group_tags: + group_tags = [self._rng.choice(self._tags)] + parts.append(self._wrap_group(group_tags, depth)) + + return ", ".join(parts) + + def _wrap_group(self, tags, depth): + """Recursively nest *tags* to the given *depth*.""" + inner = ", ".join(tags) + result = f"({inner})" + for _ in range(depth - 1): + extra = self._rng.choice(self._tags) + result = f"({extra}, {result})" + return result + + def make_deeply_nested_string(self, depth, tags_per_level=2): + """Build a string with deep nesting: (A, (B, (C, ...))). + + Parameters: + depth: Number of nesting levels. + tags_per_level: Tags at each level. + + Returns: + str: Deeply nested HED string. 
+ """ + tags = self._pick_tags(depth * tags_per_level + 2) + # Build inside-out + inner = ", ".join(tags[:tags_per_level]) + for i in range(depth): + level_tags = tags[tags_per_level + i * tags_per_level : tags_per_level + (i + 1) * tags_per_level] + if not level_tags: + level_tags = [self._rng.choice(self._tags)] + inner = f"({', '.join(level_tags)}, ({inner}))" + return f"Event, Action, {inner}" + + def make_string_with_specific_tags(self, target_tags, n_extra=5, n_groups=2, depth=1, repeats=0): + """Build a string guaranteed to contain specific tags. + + Parameters: + target_tags: List of tag names to include. + n_extra: Number of random extra tags. + n_groups: Number of groups. + depth: Nesting depth. + repeats: How many times to repeat the first target tag. + + Returns: + str: HED string containing the target tags. + """ + extra = self._pick_tags(n_extra) + all_tags = list(target_tags) + extra + [target_tags[0]] * repeats + self._rng.shuffle(all_tags) + + if n_groups == 0 or depth == 0: + return ", ".join(all_tags) + + top_count = max(1, len(all_tags) - n_groups * 2) + top_tags = all_tags[:top_count] + remaining = all_tags[top_count:] + + parts = list(top_tags) + for i in range(n_groups): + group_tags = remaining[i * 2 : i * 2 + 2] if i * 2 + 2 <= len(remaining) else remaining[i * 2 :] + if not group_tags: + group_tags = [self._rng.choice(self._tags)] + parts.append(self._wrap_group(group_tags, depth)) + + return ", ".join(parts) + + def _to_long(self, short_tags): + """Convert short tag names to long form via the schema.""" + from hed.models.hed_tag import HedTag + + out = [] + for t in short_tags: + try: + out.append(HedTag(t, self.schema).long_tag) + except Exception: + out.append(t) + return out + + # ------------------------------------------------------------------ + # Series generation + # ------------------------------------------------------------------ + + def make_series(self, n_rows, *, n_tags=5, n_groups=0, depth=0, repeats=0, form="short", 
heterogeneous=False): + """Build a pd.Series of HED strings. + + Parameters: + n_rows: Number of rows. + n_tags, n_groups, depth, repeats, form: Passed to make_string. + heterogeneous: If True, randomise parameters per row. + """ + if heterogeneous: + rows = [] + for _ in range(n_rows): + nt = self._rng.choice([3, 5, 10, 15, 25]) + ng = self._rng.choice([0, 1, 2, 5]) + d = self._rng.choice([0, 1, 2]) + rows.append(self.make_string(n_tags=nt, n_groups=ng, depth=d, form=form)) + return pd.Series(rows) + else: + # Homogeneous: one template, tiled + template = self.make_string(n_tags=n_tags, n_groups=n_groups, depth=depth, repeats=repeats, form=form) + return pd.Series([template] * n_rows) + + # ------------------------------------------------------------------ + # Real data + # ------------------------------------------------------------------ + + def load_real_data(self, tile_to=None, form="short"): + """Load the FacePerception BIDS events and return a HED Series. + + Parameters: + tile_to: If set, tile the series up to this many rows. + form: 'short' | 'long'. + + Returns: + pd.Series of HED strings. 
+ """ + bids_root = os.path.realpath( + os.path.join(os.path.dirname(__file__), "..", "tests", "data", "bids_tests", "eeg_ds003645s_hed") + ) + sidecar = os.path.join(bids_root, "task-FacePerception_events.json") + events = os.path.join(bids_root, "sub-002", "eeg", "sub-002_task-FacePerception_run-1_events.tsv") + tab = TabularInput(events, sidecar) + series = tab.series_filtered + + if form == "long": + df = series.copy() + convert_to_form(df, self.schema, "long_tag") + series = df + + if tile_to and tile_to > len(series): + reps = (tile_to // len(series)) + 1 + series = pd.Series(list(series) * reps).iloc[:tile_to].reset_index(drop=True) + + return series + + +# Quick self-test +if __name__ == "__main__": + gen = DataGenerator() + print(f"Schema tags available: {len(gen._tags)}") + print(f"Sample string (5 tags): {gen.make_string(5)}") + print(f"Sample string (10 tags, 2 groups, depth 2): {gen.make_string(10, 2, 2)}") + print(f"Sample string (5 tags, 3 repeats): {gen.make_string(5, repeats=3)}") + print(f"Real data rows: {len(gen.load_real_data())}") + print(f"Tiled to 500: {len(gen.load_real_data(tile_to=500))}") diff --git a/benchmarks/report.py b/benchmarks/report.py new file mode 100644 index 000000000..4d4f1dee6 --- /dev/null +++ b/benchmarks/report.py @@ -0,0 +1,747 @@ +"""Generate analysis report from benchmark results. 
+ +Reads the latest JSON results file and produces: + - Console summary tables + - Matplotlib figures saved to benchmarks/figures/{stem}/ + - A Markdown report in benchmarks/results/ + +Usage:: + + python report.py # latest results + python report.py results/benchmark_20260407_120000.json # specific file +""" + +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path + +import matplotlib + +matplotlib.use("Agg") # must be set before importing pyplot +import matplotlib.pyplot as plt # noqa: E402 +import pandas as pd + +RESULTS_DIR = Path(__file__).parent / "results" +_FIGURES_BASE = Path(__file__).parent / "figures" +_FIGURES_BASE.mkdir(exist_ok=True) + +# Consistent colours per engine +ENGINE_COLORS = { + "basic_search": "#1f77b4", + "QueryHandler": "#ff7f0e", + "QueryHandler_loop": "#ff7f0e", + "SQH_no_lookup": "#2ca02c", + "SQH_with_lookup": "#d62728", + "search_series_no_lookup": "#2ca02c", + "search_series_with_lookup": "#d62728", + "StringQueryHandler": "#9467bd", + "search_series": "#8c564b", + "StringQueryHandler_no_lookup": "#2ca02c", + "StringQueryHandler_with_lookup": "#d62728", +} + + +def load_results(path=None): + """Load benchmark results from JSON.""" + if path is None: + files = sorted(RESULTS_DIR.glob("benchmark_*.json")) + if not files: + print("No results files found in", RESULTS_DIR) + sys.exit(1) + path = files[-1] + else: + path = Path(path) + print(f"Loading results from {path}") + return json.loads(path.read_text(encoding="utf-8")), path.stem + + +# ====================================================================== +# Console summary +# ====================================================================== + + +def print_single_string_summary(data): + """Print a pivoted summary table of single-string results.""" + records = data.get("single_string", []) + if not records: + return + df = pd.DataFrame(records) + print("\n" + "=" * 80) + print("SINGLE-STRING BENCHMARK SUMMARY (median 
seconds)") + print("=" * 80) + pivot = df.pivot_table( + index=["config_label", "query_label"], + columns="engine", + values="total_time", + aggfunc="first", + ) + # Convert to milliseconds for readability + pivot_ms = pivot * 1000 + pd.set_option("display.float_format", "{:.4f}".format) + pd.set_option("display.max_columns", 20) + pd.set_option("display.width", 200) + print(pivot_ms.to_string()) + print() + + +def print_series_summary(data): + """Print series-level benchmark summary.""" + records = data.get("series", []) + if not records: + return + df = pd.DataFrame(records) + print("\n" + "=" * 80) + print("SERIES BENCHMARK SUMMARY (median seconds)") + print("=" * 80) + pivot = df.pivot_table( + index=["config_label", "query_label"], + columns="engine", + values="total_time", + aggfunc="first", + ) + pivot_ms = pivot * 1000 + print(pivot_ms.to_string()) + print() + + +def print_sweep_summary(data): + """Print factor sweep summary.""" + records = data.get("factor_sweeps", []) + if not records: + return + df = pd.DataFrame(records) + print("\n" + "=" * 80) + print("FACTOR SWEEP SUMMARY") + print("=" * 80) + for factor in df["factor"].unique(): + sub = df[df["factor"] == factor] + pivot = sub.pivot_table(index="level", columns="engine", values="time", aggfunc="first") + pivot_ms = pivot * 1000 + print(f"\n--- {factor} (ms) ---") + print(pivot_ms.to_string()) + + +def print_real_data_summary(data): + """Print real-data benchmark summary.""" + records = data.get("real_data", []) + if not records: + return + df = pd.DataFrame(records) + print("\n" + "=" * 80) + print(f"REAL DATA BENCHMARK ({records[0].get('n_rows', '?')} rows)") + print("=" * 80) + pivot = df.pivot_table(index="query_label", columns="engine", values="total_time", aggfunc="first") + pivot_ms = pivot * 1000 + print(pivot_ms.to_string()) + print() + + +# ====================================================================== +# Plots +# 
====================================================================== + + +def _color(engine): + return ENGINE_COLORS.get(engine, "#333333") + + +def plot_factor_sweep(data, stem): + """One figure per factor sweep with engines as separate lines.""" + records = data.get("factor_sweeps", []) + if not records: + return + df = pd.DataFrame(records) + + for factor in df["factor"].unique(): + sub = df[df["factor"] == factor].copy() + + fig, ax = plt.subplots(figsize=(8, 5)) + for engine in sub["engine"].unique(): + edf = sub[sub["engine"] == engine].sort_values("level") + ax.plot(range(len(edf)), edf["time"].values * 1000, marker="o", label=engine, color=_color(engine)) + ax.set_xticks(range(len(edf))) + ax.set_xticklabels(edf["level"].astype(str), rotation=45, ha="right") + + ax.set_xlabel(factor) + ax.set_ylabel("Time (ms)") + ax.set_title(f"Factor sweep: {factor}") + ax.legend(fontsize=8) + ax.grid(True, alpha=0.3) + fig.tight_layout() + fig.savefig(_figures_dir(stem) / f"benchmark_sweep_{factor}.png", dpi=150) + plt.close(fig) + print(f" Saved figures/{stem}/benchmark_sweep_{factor}.png") + + +def plot_series_scaling(data, stem): + """Plot total time vs series size for each engine.""" + records = data.get("factor_sweeps", []) + if not records: + return + df = pd.DataFrame(records) + sub = df[df["factor"] == "series_size"] + if sub.empty: + return + + fig, axes = plt.subplots(1, 2, figsize=(14, 5)) + + # Total time + ax = axes[0] + for engine in sub["engine"].unique(): + edf = sub[sub["engine"] == engine].sort_values("level") + ax.plot(edf["level"], edf["time"] * 1000, marker="o", label=engine, color=_color(engine)) + ax.set_xlabel("Series size (rows)") + ax.set_ylabel("Total time (ms)") + ax.set_title("Series search: total time") + ax.legend(fontsize=8) + ax.grid(True, alpha=0.3) + + # Per-row time + ax = axes[1] + for engine in sub["engine"].unique(): + edf = sub[sub["engine"] == engine].sort_values("level") + if "per_row" in edf.columns: + ax.plot(edf["level"], 
edf["per_row"] * 1000, marker="o", label=engine, color=_color(engine)) + ax.set_xlabel("Series size (rows)") + ax.set_ylabel("Per-row time (ms)") + ax.set_title("Series search: per-row amortized cost") + ax.legend(fontsize=8) + ax.grid(True, alpha=0.3) + + fig.tight_layout() + fig.savefig(_figures_dir(stem) / "benchmark_series_scaling.png", dpi=150) + plt.close(fig) + print(f" Saved figures/{stem}/benchmark_series_scaling.png") + + +def plot_compile_vs_search(data, stem): + """Bar chart comparing compilation time to per-search time.""" + records = data.get("factor_sweeps", []) + if not records: + return + df = pd.DataFrame(records) + sub = df[df["factor"] == "compile_vs_search"] + if sub.empty: + return + + fig, ax = plt.subplots(figsize=(8, 5)) + engines = sub["engine"].unique() + levels = sub["level"].unique() # compile, search + x = range(len(engines)) + width = 0.35 + + for i, level in enumerate(levels): + vals = [] + for eng in engines: + row = sub[(sub["engine"] == eng) & (sub["level"] == level)] + vals.append(row["time"].values[0] * 1000 if len(row) else 0) + offset = (i - 0.5) * width + ax.bar([xi + offset for xi in x], vals, width, label=level) + + ax.set_xticks(x) + ax.set_xticklabels(engines, rotation=15) + ax.set_ylabel("Time (ms)") + ax.set_title("Compilation vs per-search cost") + ax.legend() + ax.grid(True, alpha=0.3, axis="y") + fig.tight_layout() + fig.savefig(_figures_dir(stem) / "benchmark_compile_vs_search.png", dpi=150) + plt.close(fig) + print(f" Saved figures/{stem}/benchmark_compile_vs_search.png") + + +def plot_query_complexity_heatmap(data, stem): + """Heatmap of query complexity vs engine (single-string results).""" + records = data.get("single_string", []) + if not records: + return + df = pd.DataFrame(records) + # Pick one config for clarity + config = df["config_label"].unique()[len(df["config_label"].unique()) // 2] + sub = df[df["config_label"] == config] + + pivot = sub.pivot_table(index="query_label", columns="engine", 
values="total_time", aggfunc="first") + pivot_ms = pivot * 1000 + + fig, ax = plt.subplots(figsize=(12, 6)) + im = ax.imshow(pivot_ms.values, aspect="auto", cmap="YlOrRd") + ax.set_xticks(range(len(pivot_ms.columns))) + ax.set_xticklabels(pivot_ms.columns, rotation=45, ha="right", fontsize=8) + ax.set_yticks(range(len(pivot_ms.index))) + ax.set_yticklabels(pivot_ms.index, fontsize=8) + ax.set_title(f"Query × Engine time (ms) — config: {config}") + fig.colorbar(im, ax=ax, label="Time (ms)") + + # Annotate cells + for i in range(len(pivot_ms.index)): + for j in range(len(pivot_ms.columns)): + val = pivot_ms.values[i, j] + if pd.notna(val): + ax.text( + j, + i, + f"{val:.2f}", + ha="center", + va="center", + fontsize=7, + color="white" if val > pivot_ms.values[pd.notna(pivot_ms.values)].mean() else "black", + ) + + fig.tight_layout() + fig.savefig(_figures_dir(stem) / "benchmark_query_heatmap.png", dpi=150) + plt.close(fig) + print(f" Saved figures/{stem}/benchmark_query_heatmap.png") + + +def plot_real_data(data, stem): + """Bar chart of real-data results.""" + records = data.get("real_data", []) + if not records: + return + df = pd.DataFrame(records) + + pivot = df.pivot_table(index="query_label", columns="engine", values="total_time", aggfunc="first") + pivot_ms = pivot * 1000 + + fig, ax = plt.subplots(figsize=(10, 5)) + pivot_ms.plot(kind="bar", ax=ax, color=[_color(c) for c in pivot_ms.columns]) + ax.set_ylabel("Total time (ms)") + ax.set_title(f"Real BIDS data ({records[0].get('n_rows', '?')} rows)") + ax.legend(fontsize=8) + ax.grid(True, alpha=0.3, axis="y") + plt.xticks(rotation=45, ha="right") + fig.tight_layout() + fig.savefig(_figures_dir(stem) / "benchmark_real_data.png", dpi=150) + plt.close(fig) + print(f" Saved figures/{stem}/benchmark_real_data.png") + + +# ====================================================================== +# Markdown report +# ====================================================================== + + +def _pivot_to_md(pivot_ms, 
float_fmt=".3f"): + """Convert a pandas pivot table (in ms) to a Markdown table string.""" + lines = [] + headers = [""] + [str(c) for c in pivot_ms.columns] + lines.append("| " + " | ".join(headers) + " |") + lines.append("| " + " | ".join(["---"] * len(headers)) + " |") + for idx, row in pivot_ms.iterrows(): + label = str(idx) if not isinstance(idx, tuple) else " / ".join(str(x) for x in idx) + cells = [label] + for v in row: + cells.append(f"{v:{float_fmt}}" if pd.notna(v) else "—") + lines.append("| " + " | ".join(cells) + " |") + return "\n".join(lines) + + +def _engine_summary_table(data): + """Build a comparison table of the three search engines.""" + return ( + "| Feature | basic_search | QueryHandler | StringQueryHandler |\n" + "| --- | --- | --- | --- |\n" + "| Input type | `pd.Series[str]` | `HedString` objects | Raw strings (`str`) |\n" + "| Schema required | No | Yes | Optional (via `schema_lookup`) |\n" + "| Series-native | Yes (`find_matching`) | No (manual loop) | Yes (`search_series`) |\n" + "| Boolean AND | `word1, word2` | `term1 && term2` | same as QH |\n" + "| Boolean OR | — | `term1 || term2` | same as QH |\n" + "| Negation | `~word` | `~term` | same as QH |\n" + "| Exact group `{}` | — | `{term1, term2}` | same as QH |\n" + "| Optional exact `{:}` | — | `{term1, term2:}` | same as QH |\n" + "| Logical group `[]` | — | `[term1, term2]` | same as QH |\n" + "| Wildcard `?/?? 
/???` | — | Yes | same as QH |\n" + "| Descendant wildcard | `*` suffix | `*` suffix | same as QH |\n" + '| Quoted exact match | — | `"Exact-tag"` | same as QH |\n' + "| Implementation | Regex on text | Recursive tree on parsed nodes | Recursive tree on StringNode |\n" + ) + + +def _figures_dir(stem: str) -> Path: + """Return (and create) the per-run figures subdirectory.""" + d = _FIGURES_BASE / stem + d.mkdir(parents=True, exist_ok=True) + return d + + +def generate_markdown_report(data, stem): + """Write a comprehensive Markdown report with tables, plots, and analysis.""" + mode = "quick" if data.get("quick") else "full" + lines = [] + + def h1(t): + lines.extend([f"# {t}", ""]) + + def h2(t): + lines.extend([f"## {t}", ""]) + + def h3(t): + lines.extend([f"### {t}", ""]) + + def p(t): + lines.extend([t, ""]) + + def img(alt, path): + lines.extend([f"![{alt}]({path})", ""]) + + def table(md): + lines.extend([md, ""]) + + # ------------------------------------------------------------------ + # Title and overview + # ------------------------------------------------------------------ + h1("HED search benchmark report") + p(f"**Run:** {data.get('timestamp', 'unknown')} ") + p(f"**Mode:** {mode}") + + h2("Overview") + p("This report compares the performance of the three HED string search engines provided by the `hedtools` package:") + p( + "1. **basic_search** (`hed.models.basic_search.find_matching`) — regex-based pattern matching " + "that operates directly on a `pd.Series` of raw HED strings. No schema required. " + "Supports simple boolean AND (`@`), negation (`~`), wildcards (`*`), and parenthesised groups.\n" + "2. **QueryHandler** (`hed.models.query_handler.QueryHandler`) — full expression-tree search " + "that operates on parsed `HedString` objects. Requires a loaded HED schema. 
" + "Supports AND, OR, negation, exact groups `{}`, optional exact `{:}`, logical groups `[]`, " + "wildcard child `?`/`??`/`???`, descendant wildcards, and quoted exact matches.\n" + "3. **StringQueryHandler** (`hed.models.string_search.StringQueryHandler`) — lightweight " + "tree-based search that operates on raw strings via `StringNode` duck-typing. Schema is " + "optional (via `schema_lookup` dict for ancestor queries). Provides `search_series()` " + "convenience function for `pd.Series` input. Same query syntax as QueryHandler." + ) + + h3("Engine capability matrix") + table(_engine_summary_table(data)) + + # ------------------------------------------------------------------ + # Key findings (populated from data) + # ------------------------------------------------------------------ + h2("Key findings") + findings = [] + + # Series speed — use series_size sweep so query and config are consistent; + # report ratio at the largest row count tested. + series_recs = data.get("series", []) + _sweep_recs = data.get("factor_sweeps", []) + if _sweep_recs: + swdf_series = pd.DataFrame(_sweep_recs) + ss = swdf_series[swdf_series["factor"] == "series_size"] + if not ss.empty: + max_level = ss["level"].max() + at_max = ss[ss["level"] == max_level] + bs_row = at_max[at_max["engine"] == "basic_search"]["time"] + qh_row = at_max[at_max["engine"] == "QueryHandler_loop"]["time"] + if not bs_row.empty and not qh_row.empty and bs_row.values[0] > 0: + ratio = qh_row.values[0] / bs_row.values[0] + findings.append( + f"**Series throughput:** `basic_search` is ~{ratio:.0f}× faster than " + f"`QueryHandler` in a row-by-row loop at {max_level:,} rows, " + f"because it leverages vectorised pandas `str.contains` regex matching." + ) + elif series_recs: + sdf = pd.DataFrame(series_recs) + # Group by engine + n_rows, then take the median across queries at each row count; + # report the ratio at the largest row count to avoid mixing incomparable workloads. 
+ per_nrows = sdf.groupby(["engine", "n_rows"])["total_time"].median().reset_index() + max_nrows = per_nrows["n_rows"].max() + at_max = per_nrows[per_nrows["n_rows"] == max_nrows] + bs_row = at_max[at_max["engine"] == "basic_search"]["total_time"] + qh_row = at_max[at_max["engine"] == "QueryHandler_loop"]["total_time"] + if not bs_row.empty and not qh_row.empty and bs_row.values[0] > 0: + ratio = qh_row.values[0] / bs_row.values[0] + findings.append( + f"**Series throughput:** `basic_search` is ~{ratio:.0f}× faster than " + f"`QueryHandler` in a row-by-row loop at {max_nrows:,} rows, " + f"because it leverages vectorised pandas `str.contains` regex matching." + ) + + # SQH vs QH per string + single_recs = data.get("single_string", []) + if single_recs: + ssdf = pd.DataFrame(single_recs) + qh_avg = ssdf[ssdf["engine"] == "QueryHandler"]["total_time"].mean() + sqh_avg = ssdf[ssdf["engine"] == "StringQueryHandler_no_lookup"]["total_time"].mean() + if qh_avg > 0 and sqh_avg > 0: + pct = (1 - sqh_avg / qh_avg) * 100 + findings.append( + f"**Single-string speed:** `StringQueryHandler` (no lookup) is ~{pct:.0f}% " + f"faster than `QueryHandler` per string because it avoids schema-based " + f"`HedString` construction and uses lightweight string parsing." + ) + + # Schema lookup cost + sweeps = data.get("factor_sweeps", []) + if sweeps: + swdf = pd.DataFrame(sweeps) + lu = swdf[swdf["factor"] == "schema_lookup"] + if not lu.empty: + with_lu = lu[lu["level"] == "with_lookup"]["time"].mean() + no_lu = lu[lu["level"] == "no_lookup"]["time"].mean() + if no_lu > 0: + lu_pct = ((with_lu / no_lu) - 1) * 100 + if abs(lu_pct) < 5: + findings.append( + "**Schema-lookup overhead:** Enabling `schema_lookup` in " + "`StringQueryHandler` has negligible overhead for simple queries " + "(cost comes from queries that actually use ancestor matching)." 
+ ) + else: + findings.append( + f"**Schema-lookup overhead:** Enabling `schema_lookup` in " + f"`StringQueryHandler` adds ~{lu_pct:.0f}% overhead for " + f"ancestor-based queries." + ) + + # Deep nesting + if sweeps: + nest_df = swdf[swdf["factor"] == "nesting_depth"] + if not nest_df.empty: + for eng in ["QueryHandler", "SQH_with_lookup"]: + edf = nest_df[nest_df["engine"] == eng].sort_values("level") + if len(edf) >= 2: + t0 = edf.iloc[0]["time"] + t_last = edf.iloc[-1]["time"] + if t0 > 0: + ratio = t_last / t0 + findings.append( + f"**Nesting depth ({eng}):** At depth {edf.iloc[-1]['level']}, " + f"search time is ~{ratio:.1f}× the flat-string time." + ) + + # basic_search operation limitations + if sweeps: + po = swdf[swdf["factor"] == "per_operation"] + if not po.empty: + total = po["level"].nunique() + bs_supported = po[po["engine"] == "basic_search"]["level"].nunique() + unsupported = total - bs_supported + if unsupported > 0: + findings.append( + f"**Operation coverage:** `basic_search` supports " + f"{bs_supported} of {total} tested operations. " + f"The remaining {unsupported} operations (OR, exact groups, logical groups, " + f"wildcards `?`/`??`/`???`, quoted terms) require `QueryHandler` or " + f"`StringQueryHandler`." + ) + + for f in findings: + p(f"- {f}") + + # ------------------------------------------------------------------ + # Single-string results + # ------------------------------------------------------------------ + if single_recs: + h2("Single-string performance") + p( + "Each query was applied to a single HED string of varying complexity. " + "Times are medians of repeated runs, in milliseconds." 
+ ) + ssdf = pd.DataFrame(single_recs) + pivot = ( + ssdf.pivot_table( + index=["config_label", "query_label"], columns="engine", values="total_time", aggfunc="first" + ) + * 1000 + ) + table(_pivot_to_md(pivot)) + + img("Query × Engine heatmap", f"../figures/{stem}/benchmark_query_heatmap.png") + + # ------------------------------------------------------------------ + # Series results + # ------------------------------------------------------------------ + if series_recs: + h2("Series performance") + p( + "Whole-series search: each engine processes all rows of a `pd.Series` for a " + "given query. `basic_search` uses vectorised regex; `search_series` uses " + "`StringQueryHandler.search()` per row; `QueryHandler_loop` parses each row " + "into a `HedString` then searches. Times in milliseconds." + ) + sdf = pd.DataFrame(series_recs) + pivot = ( + sdf.pivot_table( + index=["config_label", "query_label"], columns="engine", values="total_time", aggfunc="first" + ) + * 1000 + ) + table(_pivot_to_md(pivot)) + + img("Series scaling", f"../figures/{stem}/benchmark_series_scaling.png") + + # ------------------------------------------------------------------ + # Factor sweeps + # ------------------------------------------------------------------ + h2("Factor sweeps") + p("Each sweep varies a single factor while holding others constant, measuring how performance degrades.") + + factor_descriptions = { + "tag_count": ( + "Number of tags in the HED string (1 to 100). basic_search time is dominated by " + "regex compilation overhead and stays roughly constant; tree-based engines scale " + "linearly with the number of nodes to traverse." + ), + "nesting_depth": ( + "Parenthesisation depth from 0 (flat) to 20. Deeper nesting increases the tree " + "walk for QueryHandler/StringQueryHandler. basic_search sees variable cost because " + "deeper nesting means more delimiter positions for its cartesian-product verification." 
+ ), + "repeated_tags": ( + "Repetitions of a target tag (0 to 40). basic_search's `verify_search_delimiters` " + "uses `itertools.product` over delimiter positions; repeated tags multiply the " + "search space. Tree-based engines are unaffected." + ), + "group_count": ( + "Number of parenthesised groups (1 to 20). More groups mean more children at the " + "top level for tree traversal." + ), + "series_size": ( + "Number of rows in the Series (10 to 5000). basic_search scales sub-linearly " + "thanks to vectorised pandas regex. All other engines scale linearly (per-row cost " + "is fixed)." + ), + "query_complexity": ( + "Query expression complexity from a bare term to a multi-clause composite. " + "More clauses = more expression-tree nodes to evaluate per candidate." + ), + "schema_lookup": ( + "StringQueryHandler with vs without the `schema_lookup` dictionary. The lookup " + "enables ancestor-based matching (e.g. `Event` matches `Sensory-event`) at a cost." + ), + "string_form": ( + "Short-form vs long-form HED strings. Long-form strings have fully expanded " + "paths (e.g. `Event/Sensory-event`) and are longer, increasing regex and parse cost." + ), + "compile_vs_search": ( + "Decomposition of one-time query compilation cost vs per-string search cost. " + "Compilation is cheap for both engines; the per-search cost dominates." + ), + "per_operation": ( + "Individual operation types tested in isolation. Shows which operations are " + "expensive for each engine. basic_search shows NaN/— for unsupported operations." + ), + } + + # Deep nesting sub-sweeps + for rec in sweeps: + factor = rec["factor"] + if factor.startswith("deep_nest_") and factor not in factor_descriptions: + query_type = factor.replace("deep_nest_", "").replace("_", " ") + factor_descriptions[factor] = ( + f"Deep nesting sweep for *{query_type}* queries at depths 1–20. " + f"Shows how nesting interacts with specific query patterns." 
+ ) + + factors = sorted({rec["factor"] for rec in sweeps}) + for factor in factors: + h3(factor.replace("_", " ").title()) + desc = factor_descriptions.get(factor, "") + if desc: + p(desc) + + # Inline table for this factor + sub = pd.DataFrame([r for r in sweeps if r["factor"] == factor]) + pivot = sub.pivot_table(index="level", columns="engine", values="time", aggfunc="first") * 1000 + table(_pivot_to_md(pivot)) + + img(factor, f"../figures/{stem}/benchmark_sweep_{factor}.png") + + # ------------------------------------------------------------------ + # Real data + # ------------------------------------------------------------------ + real_recs = data.get("real_data", []) + if real_recs: + h2("Real BIDS data") + n_rows = real_recs[0].get("n_rows", "?") + p( + f"Search over {n_rows} rows of real BIDS event data " + f"(`eeg_ds003645s_hed` test dataset, HED_column values). " + f"Times in milliseconds." + ) + rdf = pd.DataFrame(real_recs) + pivot = rdf.pivot_table(index="query_label", columns="engine", values="total_time", aggfunc="first") * 1000 + table(_pivot_to_md(pivot)) + img("Real BIDS data", f"../figures/{stem}/benchmark_real_data.png") + + # ------------------------------------------------------------------ + # Recommendations + # ------------------------------------------------------------------ + h2("Recommendations") + p( + "**Choose `basic_search` when:** You need the fastest possible series-level search, " + "your queries use only simple terms, AND, negation, or descendant wildcards (`*`), " + "and you don't need schema-aware matching. Ideal for filtering event files where " + "speed matters and queries are simple." + ) + p( + "**Choose `StringQueryHandler` when:** You need the full query language (OR, exact " + "groups, logical groups, wildcards) but want to avoid the overhead of parsing every " + "HED string through the schema. `search_series()` is the best general-purpose " + "option when operating on raw strings from tabular files." 
+ ) + p( + "**Choose `QueryHandler` when:** You already have parsed `HedString` objects (e.g. " + "from validation pipelines), or you need exact schema-validated matching. The " + "additional overhead comes from `HedString` construction, not the search itself." + ) + + # ------------------------------------------------------------------ + # Methodology + # ------------------------------------------------------------------ + h2("Methodology") + p( + f"- **Timing:** `timeit` with {20 if not data.get('quick') else 10} iterations " + f"(single-string), {5 if not data.get('quick') else 3} iterations (series), " + f"{10 if not data.get('quick') else 5} iterations (sweeps). Median of all iterations reported.\n" + f"- **Schema:** HED 8.4.0 loaded once and reused across all benchmarks.\n" + f"- **Data generation:** Synthetic strings built from real schema tags with controlled " + f"tag count, nesting depth, group count, and tag repetition.\n" + f"- **schema_lookup:** Generated via `generate_schema_lookup(schema)` — a dict mapping " + f"each short tag to its ancestor tuple.\n" + f"- **Environment:** Results depend on hardware; relative ratios between engines are " + f"the meaningful comparison." 
+ ) + + # Write + report_path = RESULTS_DIR / f"{stem}_report.md" + report_path.write_text("\n".join(lines), encoding="utf-8") + print(f" Saved {report_path}") + + +# ====================================================================== +# Main +# ====================================================================== + + +def main(path=None): + data, stem = load_results(path) + + # Console summaries + print_single_string_summary(data) + print_series_summary(data) + print_sweep_summary(data) + print_real_data_summary(data) + + # Plots + print("\nGenerating plots…") + plot_factor_sweep(data, stem) + plot_series_scaling(data, stem) + plot_compile_vs_search(data, stem) + plot_query_complexity_heatmap(data, stem) + plot_real_data(data, stem) + + # Markdown + print("\nGenerating Markdown report…") + generate_markdown_report(data, stem) + + print("\nDone.") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Generate benchmark report") + parser.add_argument("results_file", nargs="?", default=None, help="Path to results JSON") + args = parser.parse_args() + main(args.results_file) diff --git a/benchmarks/search_benchmark.py b/benchmarks/search_benchmark.py new file mode 100644 index 000000000..3cefdfd0c --- /dev/null +++ b/benchmarks/search_benchmark.py @@ -0,0 +1,747 @@ +"""HED search performance benchmark harness. + +Measures compilation time, single-string search time, and series search time +for all three HED search engines across a matrix of query types + data configs. 
+ +Usage:: + + python search_benchmark.py # full benchmark + python search_benchmark.py --quick # fast smoke-test +""" + +from __future__ import annotations + +import argparse +import json +import os +import sys +import timeit +import tracemalloc +from datetime import datetime +from pathlib import Path + +import pandas as pd + +# Ensure the repo root is importable when running the script directly +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +from hed import HedString, QueryHandler # noqa: E402 +from hed.models.basic_search import find_matching # noqa: E402 +from hed.models.string_search import StringQueryHandler, search_series # noqa: E402 + +from data_generator import DataGenerator # noqa: E402 + +RESULTS_DIR = Path(__file__).parent / "results" +RESULTS_DIR.mkdir(exist_ok=True) + + +# ====================================================================== +# Timing helpers +# ====================================================================== + + +def time_it(func, n_runs=5): + """Return (median_seconds, all_times) for calling *func* n_runs times.""" + times = [] + for _ in range(n_runs): + t = timeit.timeit(func, number=1) + times.append(t) + times.sort() + median = times[len(times) // 2] + return median, times + + +def measure_memory(func): + """Return peak memory (bytes) used by *func*.""" + tracemalloc.start() + func() + _, peak = tracemalloc.get_traced_memory() + tracemalloc.stop() + return peak + + +# ====================================================================== +# Query definitions — (label, basic_search_query, qh_query) +# basic_search_query = None means "not supported by basic_search" +# ====================================================================== + +QUERIES = [ + # --- Simple terms --- + ("single_bare_term", "@Event", "Event"), + ("single_exact_term", None, '"Event"'), + ("single_wildcard", "Def/*", "Def/*"), + # --- Boolean --- + ("two_term_and", "@Event, @Action", "Event && Action"), + ("two_term_or", None, 
"Event || Action"), + ("negation", "~Event", "~Event"), + # --- Groups --- + ("group_nesting", "(Event, Action)", "[Event && Action]"), + ("exact_group", None, "{Event && Action}"), + ("exact_group_optional", None, "{Event && Action: Agent}"), + ("wildcard_child", None, "{Event, ?}"), + # --- Complex --- + ("three_term_and", "@Event, @Action, @Agent", "Event && Action && Agent"), + ("complex_composite", None, "{(Onset || Offset), (Def || {Def-expand}): ???}"), +] + + +# ====================================================================== +# Single-string benchmarks +# ====================================================================== + + +class SingleStringBenchmark: + """Benchmark each engine on a single HED string.""" + + def __init__(self, gen: DataGenerator, n_runs=20): + self.gen = gen + self.schema = gen.schema + self.lookup = gen.lookup + self.n_runs = n_runs + + def run_all(self, string_configs): + """Run all queries against all string configurations. + + Parameters: + string_configs: list of dicts with keys matching DataGenerator.make_string params + plus a 'label' key for identification. + + Returns: + list[dict]: One record per (query, config, engine) combination. 
+ """ + records = [] + for cfg in string_configs: + label = cfg.pop("label") + raw = self.gen.make_string(**cfg) + cfg["label"] = label # restore + + for q_label, bs_query, qh_query in QUERIES: + # --- basic_search --- + if bs_query is not None: + rec = self._bench_basic(raw, bs_query, label, q_label) + records.append(rec) + + # --- QueryHandler --- + rec = self._bench_query_handler(raw, qh_query, label, q_label) + records.append(rec) + + # --- StringQueryHandler (no lookup) --- + rec = self._bench_string_qh(raw, qh_query, label, q_label, schema_lookup=None, suffix="no_lookup") + records.append(rec) + + # --- StringQueryHandler (with lookup) --- + rec = self._bench_string_qh( + raw, qh_query, label, q_label, schema_lookup=self.lookup, suffix="with_lookup" + ) + records.append(rec) + + return records + + def _bench_basic(self, raw, query, cfg_label, q_label): + series = pd.Series([raw]) + # Compilation (regex build is inside find_matching, not separable easily) + med, _ = time_it(lambda: find_matching(series, query), self.n_runs) + matches = int(find_matching(series, query).sum()) + return { + "engine": "basic_search", + "query_label": q_label, + "config_label": cfg_label, + "query": query, + "compile_time": None, # not separable + "search_time": med, + "total_time": med, + "matches": matches, + } + + def _bench_query_handler(self, raw, query, cfg_label, q_label): + # Compilation + comp_med, _ = time_it(lambda: QueryHandler(query), self.n_runs) + qh = QueryHandler(query) + + # Need to parse HedString each time (part of the cost) + def do_search(): + hs = HedString(raw, self.schema) + return qh.search(hs) + + search_med, _ = time_it(do_search, self.n_runs) + result = do_search() + return { + "engine": "QueryHandler", + "query_label": q_label, + "config_label": cfg_label, + "query": query, + "compile_time": comp_med, + "search_time": search_med, + "total_time": comp_med + search_med, + "matches": len(result), + } + + def _bench_string_qh(self, raw, query, cfg_label, 
q_label, schema_lookup, suffix): + comp_med, _ = time_it(lambda: StringQueryHandler(query), self.n_runs) + sqh = StringQueryHandler(query) + search_med, _ = time_it(lambda: sqh.search(raw, schema_lookup=schema_lookup), self.n_runs) + result = sqh.search(raw, schema_lookup=schema_lookup) + return { + "engine": f"StringQueryHandler_{suffix}", + "query_label": q_label, + "config_label": cfg_label, + "query": query, + "compile_time": comp_med, + "search_time": search_med, + "total_time": comp_med + search_med, + "matches": len(result), + } + + +# ====================================================================== +# Series benchmarks +# ====================================================================== + + +class SeriesBenchmark: + """Benchmark each engine on a pd.Series of HED strings.""" + + def __init__(self, gen: DataGenerator, n_runs=5): + self.gen = gen + self.schema = gen.schema + self.lookup = gen.lookup + self.n_runs = n_runs + + def run_all(self, series_configs): + """Run selected queries against series of varying size. + + Parameters: + series_configs: list of dicts with keys 'label', 'n_rows', plus + DataGenerator.make_series params. + + Returns: + list[dict]: One record per (query, config, engine) combination. 
+ """ + records = [] + for cfg in series_configs: + label = cfg.pop("label") + n_rows = cfg["n_rows"] + series = self.gen.make_series(**cfg) + cfg["label"] = label # restore + + # Use a subset of queries for series (too slow to run all × all) + # For small series test all; for large ones test representative subset + queries_to_test = QUERIES if n_rows <= 500 else QUERIES[:6] + for q_label, bs_query, qh_query in queries_to_test: + print(f" Series {label} | {q_label} ({n_rows} rows)…") + + # --- basic_search --- + if bs_query is not None: + rec = self._bench_basic_series(series, bs_query, label, q_label, n_rows) + records.append(rec) + + # --- search_series (StringQueryHandler) no lookup --- + rec = self._bench_search_series(series, qh_query, label, q_label, n_rows, None, "no_lookup") + records.append(rec) + + # --- search_series (StringQueryHandler) with lookup --- + rec = self._bench_search_series(series, qh_query, label, q_label, n_rows, self.lookup, "with_lookup") + records.append(rec) + + # --- QueryHandler loop --- + rec = self._bench_qh_loop(series, qh_query, label, q_label, n_rows) + records.append(rec) + + return records + + def _bench_basic_series(self, series, query, cfg_label, q_label, n_rows): + med, _ = time_it(lambda: find_matching(series, query), self.n_runs) + matches = int(find_matching(series, query).sum()) + return { + "engine": "basic_search", + "query_label": q_label, + "config_label": cfg_label, + "n_rows": n_rows, + "total_time": med, + "per_row": med / n_rows, + "matches": matches, + } + + def _bench_search_series(self, series, query, cfg_label, q_label, n_rows, lookup, suffix): + med, _ = time_it(lambda: search_series(series, query, schema_lookup=lookup), self.n_runs) + matches = int(search_series(series, query, schema_lookup=lookup).sum()) + return { + "engine": f"search_series_{suffix}", + "query_label": q_label, + "config_label": cfg_label, + "n_rows": n_rows, + "total_time": med, + "per_row": med / n_rows, + "matches": matches, + } + + 
def _bench_qh_loop(self, series, query, cfg_label, q_label, n_rows): + qh = QueryHandler(query) + schema = self.schema + + def do_all(): + for s in series: + if pd.notna(s) and s: + hs = HedString(s, schema) + qh.search(hs) + + med, _ = time_it(do_all, self.n_runs) + # count matches + count = 0 + for s in series: + if pd.notna(s) and s: + hs = HedString(s, schema) + if qh.search(hs): + count += 1 + return { + "engine": "QueryHandler_loop", + "query_label": q_label, + "config_label": cfg_label, + "n_rows": n_rows, + "total_time": med, + "per_row": med / n_rows, + "matches": count, + } + + +# ====================================================================== +# Factor sweeps +# ====================================================================== + + +class FactorSweep: + """Isolate the effect of one variable on performance.""" + + def __init__(self, gen: DataGenerator, n_runs=10): + self.gen = gen + self.schema = gen.schema + self.lookup = gen.lookup + self.n_runs = n_runs + + def sweep_tag_count(self, tag_counts=(1, 5, 10, 25, 50, 100)): + """Vary number of tags per string, fixed simple query.""" + query = "Event" + bs_query = "@Event" + records = [] + for nt in tag_counts: + raw = self.gen.make_string(n_tags=nt) + for engine, med in self._bench_all_engines(raw, query, bs_query): + records.append({"factor": "tag_count", "level": nt, "engine": engine, "time": med}) + return records + + def sweep_nesting_depth(self, depths=(0, 1, 2, 3, 5, 10, 15, 20)): + """Vary nesting depth using deeply nested strings.""" + query = "Event" + bs_query = "@Event" + records = [] + for d in depths: + if d == 0: + raw = self.gen.make_string(n_tags=10) + else: + raw = self.gen.make_deeply_nested_string(depth=d, tags_per_level=2) + for engine, med in self._bench_all_engines(raw, query, bs_query): + records.append({"factor": "nesting_depth", "level": d, "engine": engine, "time": med}) + return records + + def sweep_repeated_tags(self, repeat_counts=(0, 3, 5, 10, 20, 40)): + """Vary 
duplicate tag count — stresses basic_search cartesian product. + + Uses strings that actually contain 'Event' and 'Action' as the repeated + tags so the group query ``(Event, Action)`` triggers combinatorial matching. + """ + query = "(Event, Action)" + bs_query = "(Event, Action)" + records = [] + for r in repeat_counts: + raw = self.gen.make_string_with_specific_tags( + ["Event", "Action"], n_extra=3, n_groups=1, depth=1, repeats=r + ) + for engine, med in self._bench_all_engines(raw, query, bs_query): + records.append({"factor": "repeated_tags", "level": r, "engine": engine, "time": med}) + return records + + def sweep_group_count(self, group_counts=(0, 1, 5, 10, 20)): + """Vary number of groups per string.""" + query = "Event" + bs_query = "@Event" + records = [] + for ng in group_counts: + raw = self.gen.make_string(n_tags=max(10, ng * 2 + 3), n_groups=ng, depth=1) + for engine, med in self._bench_all_engines(raw, query, bs_query): + records.append({"factor": "group_count", "level": ng, "engine": engine, "time": med}) + return records + + def sweep_series_size(self, sizes=(10, 100, 500, 1000, 5000)): + """Vary series length.""" + query = "Event" + bs_query = "@Event" + records = [] + for n in sizes: + series = self.gen.make_series(n_rows=n, n_tags=10, n_groups=2, depth=1) + for engine, med in self._bench_series_engines(series, query, bs_query, n): + records.append({"factor": "series_size", "level": n, "engine": engine, "time": med, "per_row": med / n}) + return records + + def sweep_query_complexity(self): + """Compare queries of increasing complexity.""" + raw = self.gen.make_string(n_tags=20, n_groups=5, depth=2) + complexity_queries = [ + ("1_single_term", "@Event", "Event"), + ("2_two_and", "@Event, @Action", "Event && Action"), + ("3_three_and", "@Event, @Action, @Agent", "Event && Action && Agent"), + ("4_or", None, "Event || Action"), + ("5_negation", "~Event", "~Event"), + ("6_group", "(Event, Action)", "[Event && Action]"), + ("7_exact", None, "{Event 
&& Action}"), + ("8_complex", None, "{(Onset || Offset), (Def || {Def-expand}): ???}"), + ] + records = [] + for clabel, bs_q, qh_q in complexity_queries: + for engine, med in self._bench_all_engines(raw, qh_q, bs_q): + records.append({"factor": "query_complexity", "level": clabel, "engine": engine, "time": med}) + return records + + def sweep_schema_lookup(self): + """Compare StringQueryHandler with vs without schema_lookup.""" + raw = self.gen.make_string(n_tags=15, n_groups=3, depth=1) + query = "Event" + sqh = StringQueryHandler(query) + records = [] + for with_lookup in [False, True]: + lk = self.lookup if with_lookup else None + label = "with_lookup" if with_lookup else "no_lookup" + med, _ = time_it(lambda lk=lk: sqh.search(raw, schema_lookup=lk), self.n_runs) + records.append({"factor": "schema_lookup", "level": label, "engine": "StringQueryHandler", "time": med}) + return records + + def sweep_string_form(self): + """Compare short vs long form strings.""" + query = "Event" + bs_query = "@Event" + records = [] + for form in ["short", "long"]: + raw = self.gen.make_string(n_tags=15, n_groups=3, depth=1, form=form) + for engine, med in self._bench_all_engines(raw, query, bs_query): + records.append({"factor": "string_form", "level": form, "engine": engine, "time": med}) + return records + + def sweep_compilation_vs_search(self): + """Separate compilation cost from per-search cost.""" + raw = self.gen.make_string(n_tags=15, n_groups=3, depth=1) + query = "Event" + records = [] + + # QueryHandler + comp, _ = time_it(lambda: QueryHandler(query), self.n_runs) + qh = QueryHandler(query) + + def qh_search(): + hs = HedString(raw, self.schema) + qh.search(hs) + + search_med, _ = time_it(qh_search, self.n_runs) + records.append({"factor": "compile_vs_search", "level": "compile", "engine": "QueryHandler", "time": comp}) + records.append({"factor": "compile_vs_search", "level": "search", "engine": "QueryHandler", "time": search_med}) + + # StringQueryHandler + comp2, _ 
= time_it(lambda: StringQueryHandler(query), self.n_runs) + sqh = StringQueryHandler(query) + search_med2, _ = time_it(lambda: sqh.search(raw, schema_lookup=self.lookup), self.n_runs) + records.append( + {"factor": "compile_vs_search", "level": "compile", "engine": "StringQueryHandler", "time": comp2} + ) + records.append( + {"factor": "compile_vs_search", "level": "search", "engine": "StringQueryHandler", "time": search_med2} + ) + + return records + + def sweep_per_operation(self): + """Test every query operation type on the same string. + + Uses a string with enough structure to exercise all operations: + groups, nested groups, Def tags, Onset, etc. + """ + # Build a string with structure that can match all query types + raw = ( + "Sensory-event, Action, Agent, " + "(Event, (Onset, (Def/MyDef))), " + "(Offset, Item, (Def-expand/MyDef, (Red, Blue))), " + "(Visual-presentation, Square, Green)" + ) + + operation_queries = [ + # (label, basic_search_query, qh_query) + ("bare_term", "@Event", "Event"), + ("exact_quoted", None, '"Sensory-event"'), + ("wildcard_prefix", "Def/*", "Def/*"), + ("and_2", "@Event, @Action", "Event && Action"), + ("and_3", "@Event, @Action, @Agent", "Event && Action && Agent"), + ("or", None, "Event || Action"), + ("negation", "~Event", "~Event"), + ("nested_group_[]", "(Event, Action)", "[Event && Action]"), + ("exact_group_{}", None, "{Event && Action}"), + ("exact_optional_{:}", None, "{Event && Action: Agent}"), + ("wildcard_?", None, "{Event, ?}"), + ("wildcard_??", None, "{Event, ??}"), + ("wildcard_???", None, "{Event, ???}"), + ("descendant_nested", None, "[Def && Onset]"), + ("complex_onset_def", None, "{(Onset || Offset), (Def || {Def-expand}): ???}"), + ("deep_and_chain", "@Event, @Action, @Agent, @Item, @Red", "Event && Action && Agent && Item && Red"), + ("nested_or_and", None, "(Event || Sensory-event) && (Action || Agent)"), + ("double_negation", None, "~(~Event)"), + ] + + records = [] + for op_label, bs_q, qh_q in 
operation_queries: + for engine, med in self._bench_all_engines(raw, qh_q, bs_q): + records.append({"factor": "per_operation", "level": op_label, "engine": engine, "time": med}) + return records + + def sweep_deep_nesting_by_query(self): + """Test how different query types perform on deeply nested strings.""" + depths = [1, 5, 10, 20] + queries = [ + ("bare_term", "@Event", "Event"), + ("two_and", "@Event, @Action", "Event && Action"), + ("group_match", "(Event, Action)", "[Event && Action]"), + ("exact_group", None, "{Event && Action}"), + ("negation", "~Event", "~Event"), + ] + records = [] + for d in depths: + raw = self.gen.make_deeply_nested_string(depth=d, tags_per_level=2) + for q_label, bs_q, qh_q in queries: + for engine, med in self._bench_all_engines(raw, qh_q, bs_q): + records.append( + { + "factor": f"deep_nest_{q_label}", + "level": d, + "engine": engine, + "time": med, + } + ) + return records + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _bench_all_engines(self, raw, qh_query, bs_query=None): + """Yield (engine_name, median_time) for all engines on a single string.""" + series1 = pd.Series([raw]) + + # basic_search + if bs_query is not None: + med, _ = time_it(lambda: find_matching(series1, bs_query), self.n_runs) + yield "basic_search", med + + # QueryHandler + qh = QueryHandler(qh_query) + + def qh_search(): + hs = HedString(raw, self.schema) + qh.search(hs) + + med, _ = time_it(qh_search, self.n_runs) + yield "QueryHandler", med + + # StringQueryHandler no lookup + sqh = StringQueryHandler(qh_query) + med, _ = time_it(lambda: sqh.search(raw, schema_lookup=None), self.n_runs) + yield "SQH_no_lookup", med + + # StringQueryHandler with lookup + med, _ = time_it(lambda: sqh.search(raw, schema_lookup=self.lookup), self.n_runs) + yield "SQH_with_lookup", med + + def _bench_series_engines(self, series, qh_query, bs_query, 
n_rows): + """Yield (engine_name, median_time) for series-level engines.""" + # basic_search + if bs_query is not None: + med, _ = time_it(lambda: find_matching(series, bs_query), max(3, self.n_runs // 2)) + yield "basic_search", med + + # search_series no lookup + med, _ = time_it(lambda: search_series(series, qh_query, schema_lookup=None), max(3, self.n_runs // 2)) + yield "search_series_no_lookup", med + + # search_series with lookup + med, _ = time_it(lambda: search_series(series, qh_query, schema_lookup=self.lookup), max(3, self.n_runs // 2)) + yield "search_series_with_lookup", med + + # QueryHandler loop + qh = QueryHandler(qh_query) + schema = self.schema + + def qh_loop(): + for s in series: + if pd.notna(s) and s: + hs = HedString(s, schema) + qh.search(hs) + + med, _ = time_it(qh_loop, max(3, self.n_runs // 2)) + yield "QueryHandler_loop", med + + +# ====================================================================== +# Main orchestrator +# ====================================================================== + + +def run_full_benchmark(quick=False): + """Run the complete benchmark suite and save results.""" + print("Initialising DataGenerator (loading schema)…") + gen = DataGenerator() + + n_single = 10 if quick else 20 + n_series = 3 if quick else 5 + n_sweep = 5 if quick else 10 + + # ------------------------------------------------------------------ + # 1. 
Single-string benchmark + # ------------------------------------------------------------------ + print("\n=== Single-string benchmarks ===") + ssb = SingleStringBenchmark(gen, n_runs=n_single) + + string_configs = [ + {"label": "tiny_1tag", "n_tags": 1}, + {"label": "small_5tag", "n_tags": 5}, + {"label": "medium_10tag", "n_tags": 10, "n_groups": 2, "depth": 1}, + {"label": "large_25tag", "n_tags": 25, "n_groups": 5, "depth": 2}, + {"label": "xlarge_50tag", "n_tags": 50, "n_groups": 10, "depth": 2}, + ] + if not quick: + string_configs.append({"label": "xxlarge_100tag", "n_tags": 100, "n_groups": 15, "depth": 3}) + single_results = ssb.run_all(string_configs) + print(f" Collected {len(single_results)} single-string records.") + + # ------------------------------------------------------------------ + # 2. Series benchmark + # ------------------------------------------------------------------ + print("\n=== Series benchmarks ===") + sb = SeriesBenchmark(gen, n_runs=n_series) + + if quick: + series_sizes = [10, 100, 500] + else: + series_sizes = [10, 100, 500, 1000, 5000] + + series_configs = [] + for n in series_sizes: + series_configs.append({"label": f"homo_{n}", "n_rows": n, "n_tags": 10, "n_groups": 2, "depth": 1}) + for n in [100, 1000] if not quick else [100]: + series_configs.append({"label": f"hetero_{n}", "n_rows": n, "n_tags": 10, "heterogeneous": True}) + + series_results = sb.run_all(series_configs) + print(f" Collected {len(series_results)} series records.") + + # ------------------------------------------------------------------ + # 3. 
Factor sweeps + # ------------------------------------------------------------------ + print("\n=== Factor sweeps ===") + fs = FactorSweep(gen, n_runs=n_sweep) + + sweep_results = [] + for name, method in [ + ("tag_count", fs.sweep_tag_count), + ("nesting_depth", fs.sweep_nesting_depth), + ("repeated_tags", fs.sweep_repeated_tags), + ("group_count", fs.sweep_group_count), + ("series_size", fs.sweep_series_size), + ("query_complexity", fs.sweep_query_complexity), + ("schema_lookup", fs.sweep_schema_lookup), + ("string_form", fs.sweep_string_form), + ("compile_vs_search", fs.sweep_compilation_vs_search), + ("per_operation", fs.sweep_per_operation), + ("deep_nesting_by_query", fs.sweep_deep_nesting_by_query), + ]: + print(f" Sweep: {name}") + sweep_results.extend(method()) + + print(f" Collected {len(sweep_results)} sweep records.") + + # ------------------------------------------------------------------ + # 4. Real data benchmark + # ------------------------------------------------------------------ + print("\n=== Real data benchmark ===") + real_series = gen.load_real_data() + real_n = len(real_series) + print(f" Real data: {real_n} rows") + + real_results = [] + for q_label, bs_query, qh_query in QUERIES: + if bs_query is not None: + med, _ = time_it(lambda bs_query=bs_query: find_matching(real_series, bs_query), n_series) + real_results.append( + { + "engine": "basic_search", + "query_label": q_label, + "total_time": med, + "per_row": med / real_n, + "n_rows": real_n, + } + ) + + med, _ = time_it( + lambda qh_query=qh_query: search_series(real_series, qh_query, schema_lookup=gen.lookup), n_series + ) + real_results.append( + { + "engine": "search_series", + "query_label": q_label, + "total_time": med, + "per_row": med / real_n, + "n_rows": real_n, + } + ) + + qh = QueryHandler(qh_query) + schema = gen.schema + + def qh_loop(qh=qh, schema=schema): + for s in real_series: + if pd.notna(s) and s: + hs = HedString(s, schema) + qh.search(hs) + + med, _ = 
time_it(qh_loop, n_series) + real_results.append( + { + "engine": "QueryHandler_loop", + "query_label": q_label, + "total_time": med, + "per_row": med / real_n, + "n_rows": real_n, + } + ) + + print(f" Collected {len(real_results)} real-data records.") + + # ------------------------------------------------------------------ + # Save + # ------------------------------------------------------------------ + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + output = { + "timestamp": timestamp, + "quick": quick, + "single_string": single_results, + "series": series_results, + "factor_sweeps": sweep_results, + "real_data": real_results, + } + out_path = RESULTS_DIR / f"benchmark_{timestamp}.json" + out_path.write_text(json.dumps(output, indent=2, default=str), encoding="utf-8") + print(f"\nResults saved to {out_path}") + return output + + +# ====================================================================== +# Entry point +# ====================================================================== + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="HED search benchmark") + parser.add_argument("--quick", action="store_true", help="Reduced run for smoke testing") + args = parser.parse_args() + run_full_benchmark(quick=args.quick) diff --git a/docs/_static/images/benchmark_compile_vs_search.png b/docs/_static/images/benchmark_compile_vs_search.png new file mode 100644 index 000000000..a8a317963 Binary files /dev/null and b/docs/_static/images/benchmark_compile_vs_search.png differ diff --git a/docs/_static/images/benchmark_query_heatmap.png b/docs/_static/images/benchmark_query_heatmap.png new file mode 100644 index 000000000..993bb0040 Binary files /dev/null and b/docs/_static/images/benchmark_query_heatmap.png differ diff --git a/docs/_static/images/benchmark_real_data.png b/docs/_static/images/benchmark_real_data.png new file mode 100644 index 000000000..5fa4e3cf2 Binary files /dev/null and b/docs/_static/images/benchmark_real_data.png 
differ diff --git a/docs/_static/images/benchmark_series_scaling.png b/docs/_static/images/benchmark_series_scaling.png new file mode 100644 index 000000000..2372b0c0a Binary files /dev/null and b/docs/_static/images/benchmark_series_scaling.png differ diff --git a/docs/_static/images/benchmark_sweep_compile_vs_search.png b/docs/_static/images/benchmark_sweep_compile_vs_search.png new file mode 100644 index 000000000..afb71a4b8 Binary files /dev/null and b/docs/_static/images/benchmark_sweep_compile_vs_search.png differ diff --git a/docs/_static/images/benchmark_sweep_deep_nest_bare_term.png b/docs/_static/images/benchmark_sweep_deep_nest_bare_term.png new file mode 100644 index 000000000..bd4d580e9 Binary files /dev/null and b/docs/_static/images/benchmark_sweep_deep_nest_bare_term.png differ diff --git a/docs/_static/images/benchmark_sweep_deep_nest_exact_group.png b/docs/_static/images/benchmark_sweep_deep_nest_exact_group.png new file mode 100644 index 000000000..b8098dd19 Binary files /dev/null and b/docs/_static/images/benchmark_sweep_deep_nest_exact_group.png differ diff --git a/docs/_static/images/benchmark_sweep_deep_nest_group_match.png b/docs/_static/images/benchmark_sweep_deep_nest_group_match.png new file mode 100644 index 000000000..91595e58f Binary files /dev/null and b/docs/_static/images/benchmark_sweep_deep_nest_group_match.png differ diff --git a/docs/_static/images/benchmark_sweep_deep_nest_negation.png b/docs/_static/images/benchmark_sweep_deep_nest_negation.png new file mode 100644 index 000000000..a510be20b Binary files /dev/null and b/docs/_static/images/benchmark_sweep_deep_nest_negation.png differ diff --git a/docs/_static/images/benchmark_sweep_deep_nest_two_and.png b/docs/_static/images/benchmark_sweep_deep_nest_two_and.png new file mode 100644 index 000000000..c12bd8f7a Binary files /dev/null and b/docs/_static/images/benchmark_sweep_deep_nest_two_and.png differ diff --git a/docs/_static/images/benchmark_sweep_group_count.png 
b/docs/_static/images/benchmark_sweep_group_count.png new file mode 100644 index 000000000..19bca25fa Binary files /dev/null and b/docs/_static/images/benchmark_sweep_group_count.png differ diff --git a/docs/_static/images/benchmark_sweep_nesting_depth.png b/docs/_static/images/benchmark_sweep_nesting_depth.png new file mode 100644 index 000000000..bae381e6e Binary files /dev/null and b/docs/_static/images/benchmark_sweep_nesting_depth.png differ diff --git a/docs/_static/images/benchmark_sweep_per_operation.png b/docs/_static/images/benchmark_sweep_per_operation.png new file mode 100644 index 000000000..715786af9 Binary files /dev/null and b/docs/_static/images/benchmark_sweep_per_operation.png differ diff --git a/docs/_static/images/benchmark_sweep_query_complexity.png b/docs/_static/images/benchmark_sweep_query_complexity.png new file mode 100644 index 000000000..efb4a06e9 Binary files /dev/null and b/docs/_static/images/benchmark_sweep_query_complexity.png differ diff --git a/docs/_static/images/benchmark_sweep_repeated_tags.png b/docs/_static/images/benchmark_sweep_repeated_tags.png new file mode 100644 index 000000000..83375fe70 Binary files /dev/null and b/docs/_static/images/benchmark_sweep_repeated_tags.png differ diff --git a/docs/_static/images/benchmark_sweep_schema_lookup.png b/docs/_static/images/benchmark_sweep_schema_lookup.png new file mode 100644 index 000000000..9818123e1 Binary files /dev/null and b/docs/_static/images/benchmark_sweep_schema_lookup.png differ diff --git a/docs/_static/images/benchmark_sweep_series_size.png b/docs/_static/images/benchmark_sweep_series_size.png new file mode 100644 index 000000000..e92b8c68e Binary files /dev/null and b/docs/_static/images/benchmark_sweep_series_size.png differ diff --git a/docs/_static/images/benchmark_sweep_string_form.png b/docs/_static/images/benchmark_sweep_string_form.png new file mode 100644 index 000000000..8c777d3d9 Binary files /dev/null and 
b/docs/_static/images/benchmark_sweep_string_form.png differ diff --git a/docs/_static/images/benchmark_sweep_tag_count.png b/docs/_static/images/benchmark_sweep_tag_count.png new file mode 100644 index 000000000..4efce2e38 Binary files /dev/null and b/docs/_static/images/benchmark_sweep_tag_count.png differ diff --git a/docs/api/models.rst b/docs/api/models.rst index 91dc10d26..01d662de2 100644 --- a/docs/api/models.rst +++ b/docs/api/models.rst @@ -181,7 +181,7 @@ String-based search ------------------- Search functions that operate on raw HED strings without requiring pre-parsed ``HedString`` objects -or a loaded schema. See also :doc:`/search_implementation` for a full comparison of all three +or a loaded schema. See also :doc:`/search_details` for a full comparison of all three search implementations. StringQueryHandler diff --git a/docs/index.rst b/docs/index.rst index 530937629..d65a14b27 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -27,7 +27,7 @@ in various formats: :maxdepth: 2 User guide - Search implementations + Search implementations API * :ref:`genindex` diff --git a/docs/search_details.md b/docs/search_details.md new file mode 100644 index 000000000..35ebcdecf --- /dev/null +++ b/docs/search_details.md @@ -0,0 +1,443 @@ +--- +html_meta: + description: HED search details — implementation comparison and performance benchmarks for basic_search, QueryHandler, and StringQueryHandler in hedtools + keywords: HED search, string search, query handler, basic search, performance, benchmarks, hedtools, pattern matching +--- + +```{index} search, string search, query, QueryHandler, StringQueryHandler, basic_search +``` + +# HED search details + +HEDtools provides three distinct mechanisms for searching HED-annotated data. This page covers their design and query languages ({ref}`implementations `) and measured performance characteristics ({ref}`performance `). 
+ +(hed-search-implementations)= + +## HED search implementations + +The three implementations share a common goal — "does this HED string match this query?" — but differ substantially in their inputs, capabilities, schema requirements, and performance characteristics. Choosing the right implementation depends on whether you need schema-aware ancestor matching, full group-structural queries, or raw throughput on unannotated strings. + +### Overview of the three implementations + +#### `basic_search` — regex-based flat matching + +Located in {mod}`hed.models.basic_search`, the `find_matching()` function operates directly on a `pd.Series` of raw HED strings using compiled regular expressions. It requires no schema and no parsing step, making it the fastest option for bulk row filtering. + +Key characteristics: + +- Input is a `pd.Series` of raw strings; output is a `pd.Series[bool]` mask. +- The query is compiled once into a regex and applied with `Series.str.contains`. +- Matches are purely literal — `Event` does not match `Sensory-event`. +- `@A` in a basic-search query means A **must be present** anywhere in the string (note: this is the **opposite** of what `@A` means in `QueryHandler`/`StringQueryHandler`). +- `~A` means A must not appear anywhere (global negation). +- `(A, B)` syntax checks that A and B appear at the same nesting level. +- Wildcard `A*` expands to the regex `A.*?`, which can span `/` and match mid-token substrings. + +Use `basic_search` when you are working with a large series of raw strings, don't need ancestor matching, and want maximum throughput. See {func}`hed.models.basic_search.find_matching`. + +#### `QueryHandler` — schema-backed object search + +Located in {mod}`hed.models.query_handler`, `QueryHandler` is the full-featured search engine. It compiles a query string into an expression tree once, then evaluates that tree against `HedString` objects that have already been parsed against a loaded `HedSchema`. 
+ +Key characteristics: + +- Input is a `HedString` object; a full `HedSchema` is required. +- Output is a `list[SearchResult]` containing `HedTag` / `HedGroup` object references, useful for tag-level introspection (not just row filtering). +- Supports the complete query language: `&&`, `||`, `~`, `@`, `{}`, `[]`, `{:}`, `?`, `??`, `???`. +- `@A` means A must **not** appear anywhere in the string. +- Ancestor matching is exact — the schema normalises both query and string tags to short form, so `Event` matches `Sensory-event` because the schema knows `Sensory-event` descends from `Event`. +- Per-string cost includes a full HedString parse and schema tag resolution. + +Use `QueryHandler` when you need schema-aware ancestor matching, or when you want object references (e.g., to retrieve the matched group for further processing). See {class}`hed.models.query_handler.QueryHandler`. + +#### `StringQueryHandler` — tree-based schema-optional search + +Located in {mod}`hed.models.string_search`, `StringQueryHandler` is a new middle-ground implementation that inherits from `QueryHandler` and reuses the full expression-tree compiler, but operates on raw strings rather than pre-parsed `HedString` objects. + +It parses each raw HED string into a lightweight {class}`~hed.models.string_search.StringNode` tree that duck-types the `HedGroup`/`HedTag` interfaces expected by the existing expression evaluators — so all `QueryHandler` query syntax works unchanged. + +Key characteristics: + +- Input is a raw string (or a `pd.Series` via {func}`~hed.models.string_search.search_series`). +- Schema is **optional**: pass a `schema_lookup` dict (see {mod}`hed.models.schema_lookup`) to enable ancestor matching for short-form strings (e.g. `Event` matching `Sensory-event`); omit it for purely literal matching. +- Output is a list (truthy/falsy) — row-filtering only, no object references. +- Supports the same full query syntax as `QueryHandler` (`&&`, `||`, `~`, `@`, `{}`, etc.). 
+- `@A` carries the same semantics as `QueryHandler` — A must **not** be present. +- Long-form strings (`Event/Sensory-event`) support ancestor matching via slash-splitting even without a lookup. Short-form strings (`Sensory-event`) require a `schema_lookup` for ancestor matching; without one, matching is purely literal. +- Parse cost is a lightweight recursive split — much cheaper than a full HedString + schema parse. + +Use `StringQueryHandler` when you have raw strings (not `HedString` objects), need the full `QueryHandler` query syntax, and either don't have a schema available or want faster processing at the cost of losing full schema-aware ancestor matching. See {class}`hed.models.string_search.StringQueryHandler`. + +#### Generating a schema lookup + +If you want `StringQueryHandler` to resolve ancestors for short-form strings (e.g. query `Event` matching `Sensory-event`) without a full schema parse per row, you can pre-generate a lookup dictionary from a `HedSchema`: + +```python +from hed import load_schema_version +from hed import generate_schema_lookup, save_schema_lookup, load_schema_lookup + +schema = load_schema_version("8.4.0") +lookup = generate_schema_lookup(schema) # {short_name_casefold: tag_terms_tuple} + +# Persist for reuse +save_schema_lookup(lookup, "hed840_lookup.json") +lookup = load_schema_lookup("hed840_lookup.json") +``` + +See {func}`hed.models.schema_lookup.generate_schema_lookup`. 
+ +______________________________________________________________________ + +### Comparison tables + +#### Core characteristics + +| Property | `basic_search` | `QueryHandler` | `StringQueryHandler` | +| --------------------- | -------------------------- | -------------------------------------------------- | ----------------------------------------------- | +| **Input** | `pd.Series` of raw strings | `HedString` object | Raw string or `pd.Series` (via `search_series`) | +| **Schema required** | No | Yes — full `HedSchema` for tag parsing | No; optional `schema_lookup` dict | +| **Output** | `pd.Series[bool]` mask | `list[SearchResult]` with `HedTag`/`HedGroup` refs | `list` (truthy/falsy); `StringNode` refs | +| **Result usable for** | Row filtering | Row filtering + tag/group introspection | Row filtering only | +| **Batch API** | Native (`series`) | Manual loop | `search_series(series, query)` | +| **Parse cost** | Regex compilation once | Full `HedString` + schema parse per string | Lightweight tree parse per string | +| **Unrecognised tags** | Matched literally | Silent match failure (`tag_terms = ()`) | Matched literally | + +#### Query syntax + +| Feature | `basic_search` query syntax | `QueryHandler` / `StringQueryHandler` query syntax | +| ---------------------------- | --------------------------------------------------- | -------------------------------------------------- | +| **AND** | Space or comma between terms (context-dependent) | `A && B` or `A, B` | +| **OR** | Not supported | `A \|\| B` | +| **Absent from string (`@`)** | ⚠️ `@A` means A **must be present** anywhere | `@A` means A must **not** appear anywhere | +| **Must-not-appear (`~`)** | `~A` — A must not appear anywhere (global) | `~A` — negation within group context (local) | +| **Prefix wildcard** | `A*` → regex `A.*?` (spans `/`, matches substrings) | `A*` → prefix on short form only | +| **Full regex per term** | Yes (`regex=True` mode) | No | +| **Quoted exact match** | No | `"A"` — 
exact match, no ancestor search | +| **Implicit default** | If no `(` or `@`: all terms become "anywhere" | No implicit conversion — must be explicit | + +#### Group / structural operators + +| Feature | `basic_search` | `QueryHandler` | `StringQueryHandler` | +| --------------------------------- | ----------------------------------------- | -------------------------------------------- | ---------------------- | +| **Same nesting level** | `(A, B)` — A and B at same relative level | N/A — use `{A, B}` | N/A — use `{A, B}` | +| **Same parenthesised group `{}`** | No | `{A, B}` — must share a direct parent group | Same as `QueryHandler` | +| **Exact group `{:}`** | No | `{A, B:}` — same group, no other children | Same | +| **Optional exact group** | No | `{A, B: C}` — A and B required, C optional | Same | +| **Descendant group `[]`** | No | `[A, B]` — both in same subtree at any depth | Same | +| **Any child `?`** | No | `?` — any tag or group child | Same | +| **Any tag child `??`** | No | `??` — any leaf (non-group) child | Same | +| **Any group child `???`** | No | `???` — any parenthesised group child | Same | +| **Nested query operators** | No | Yes — full recursive composition | Same | + +#### Ancestor / cross-form search + +| Scenario | `basic_search` | `QueryHandler` | `StringQueryHandler` | +| ------------------------------------------------------- | ------------------------------------------------- | --------------------------------------- | ---------------------------------------------------------------- | +| Query `Event`, string `Sensory-event` (short form) | ❌ literal only | ✅ `tag_terms` from schema | ✅ with `schema_lookup`; ❌ without | +| Query `Event`, string `Event/Sensory-event` (long form) | ❌ `Event` ≠ `Event/Sensory-event` | ✅ schema normalises | ✅ slash-split produces `tag_terms = ("event", "sensory-event")` | +| Query `Event/Sensory-event`, string `Sensory-event` | ❌ | ✅ schema normalises both to short form | ❌ no schema to normalise | +| 
Schema-free ancestor search | `convert_query()` + long-form series (workaround) | N/A — schema always required | ✅ works natively for long-form strings | +| Tag `Def/Name` matched by query `Def` | ❌ literal prefix mismatch | ✅ `short_base_tag = "Def"` | ✅ `tag_terms` contains `"def"` | + +#### Critical semantic traps + +These differences are silent — no error, just wrong answers if you mix up query strings across implementations: + +| Operator | `basic_search` | `QueryHandler` / `StringQueryHandler` | +| ----------------- | -------------------------------------------------------- | ----------------------------------------------------------------------------------- | +| `@A` | A **must** appear anywhere in the string | A must **not** appear anywhere in the string | +| `~A` | A must not appear **anywhere** (global) | A must not appear in any group that also matches the rest of the expression (local) | +| `*` wildcard | Regex `.*?` — spans `/` and matches mid-token substrings | Strict prefix on the tag's short form — anchored to start | +| No-operator `A B` | Both present anywhere (implicit `@@`) | Parse error — `&&` required | + +______________________________________________________________________ + +(hed-search-performance)= + +## HED search performance + +Benchmarks were run using HED 8.4.0 with `timeit` on both synthetic strings and real BIDS event data. All times are medians in milliseconds. Relative ratios between engines are more meaningful than absolute values, which depend on hardware. + +### Key findings + +- **Series throughput:** `basic_search` is ~16× faster than a `QueryHandler` row-by-row loop at 5 000 rows because it leverages vectorised pandas `str.contains` regex matching. +- **Single-string speed:** `StringQueryHandler` (no lookup) is ~39% faster than `QueryHandler` per string because it avoids schema-based `HedString` construction. 
+- **Schema-lookup overhead:** Enabling `schema_lookup` in `StringQueryHandler` has negligible overhead for most queries; cost appears only when ancestor matching is actually invoked.
+- **Nesting depth:** At depth 20, `QueryHandler` is ~16× slower than on a flat string; `StringQueryHandler` scales more gently (~8×).
+- **Operation coverage:** `basic_search` supports 7 of 18 tested operation types. The remaining 11 (OR, exact groups, logical groups, `?`/`??`/`???` wildcards, quoted terms) require `QueryHandler` or `StringQueryHandler`.
+
+### Series throughput
+
+Whole-series search over a `pd.Series` of HED strings. `basic_search` uses vectorised regex; `search_series` uses `StringQueryHandler.search()` per row; `QueryHandler_loop` constructs a `HedString` per row then searches. Query: `single_bare_term`.
+
+| Rows | QueryHandler_loop (ms) | basic_search (ms) | search_series (ms) |
+| ----: | ---------------------: | ----------------: | -----------------: |
+| 10 | 0.34 | 0.20 | 0.30 |
+| 100 | 3.43 | 0.40 | 2.41 |
+| 500 | 16.7 | 2.25 | 13.1 |
+| 1 000 | 29.8 | 1.91 | 19.5 |
+| 5 000 | 164 | 11.7 | 114 |
+
+All three engines scale linearly with row count. `basic_search` is roughly 7–16× faster than `QueryHandler_loop` at 100+ rows; `search_series` is roughly 1.4× faster than `QueryHandler_loop`.
+
+![Series search time vs row count](_static/images/benchmark_sweep_series_size.png)
+
+### Single-string timing
+
+Per-string median search time (ms) across string sizes. Tag counts: tiny = 1, small = 5, medium = 10, large = 25, xlarge = 50, xxlarge = 100. Query: `single_bare_term`.
+ +| String size | QueryHandler (ms) | SQH_no_lookup (ms) | basic_search (ms) | +| ------------------ | ----------------: | -----------------: | ----------------: | +| tiny (1 tag) | 0.012 | 0.007 | 0.131 | +| small (5 tags) | 0.020 | 0.014 | 0.197 | +| medium (10 tags) | 0.041 | 0.021 | 0.123 | +| large (25 tags) | 0.132 | 0.102 | 0.157 | +| xlarge (50 tags) | 0.176 | 0.113 | 0.131 | +| xxlarge (100 tags) | 0.329 | 0.248 | 0.154 | + +`basic_search` regex overhead dominates on small strings; `QueryHandler` and `StringQueryHandler` dominate on large strings. The crossover occurs around 25–50 tags. + +![Median search time per query × engine (ms)](_static/images/benchmark_query_heatmap.png) + +### Operation coverage and cost + +Per-operation timing on a 10-tag string. `basic_search` returns no results (not an error) for unsupported constructs, so queries using those operations will silently produce incorrect results. + +| Operation | QueryHandler (ms) | SQH (ms) | basic_search | +| ------------------------------ | ----------------: | -------: | ------------- | +| `bare_term` | 0.061 | 0.037 | 0.278 ms | +| `and_2` | 0.063 | 0.041 | 0.321 ms | +| `and_3` | 0.067 | 0.045 | 0.355 ms | +| `negation` | 0.083 | 0.043 | 0.160 ms | +| `wildcard_prefix` (`*` suffix) | 0.046 | 0.037 | 0.204 ms | +| `nested_group_[]` | 0.057 | 0.039 | 0.634 ms | +| `deep_and_chain` | 0.117 | 0.059 | 0.515 ms | +| `or` | 0.058 | 0.037 | — unsupported | +| `exact_group_{}` | 0.052 | 0.030 | — unsupported | +| `exact_optional_{:}` | 0.071 | 0.043 | — unsupported | +| `exact_quoted` | 0.062 | 0.030 | — unsupported | +| `wildcard_?` | 0.086 | 0.047 | — unsupported | +| `wildcard_??` | 0.068 | 0.041 | — unsupported | +| `wildcard_???` | 0.074 | 0.041 | — unsupported | +| `descendant_nested` | 0.138 | 0.086 | — unsupported | +| `double_negation` | 0.057 | 0.035 | — unsupported | +| `complex_onset_def` | 0.113 | 0.068 | — unsupported | +| `nested_or_and` | 0.080 | 0.057 | — unsupported | + 
+`StringQueryHandler` supports all 18 operation types.
+
+![Per-operation timing across all three engines](_static/images/benchmark_sweep_per_operation.png)
+
+### Nesting depth
+
+Parenthesisation depth from 0 (flat) to 20. Deeper nesting increases the tree walk for `QueryHandler` and `StringQueryHandler`. `basic_search` shows no consistent depth trend because its cost depends on delimiter count, not recursion depth.
+
+| Depth | QueryHandler (ms) | SQH_no_lookup (ms) | basic_search (ms) |
+| ----: | ----------------: | -----------------: | ----------------: |
+| 0 | 0.026 | 0.017 | 0.125 |
+| 1 | 0.022 | 0.013 | 0.256 |
+| 2 | 0.028 | 0.019 | 0.218 |
+| 3 | 0.034 | 0.023 | 0.215 |
+| 5 | 0.110 | 0.076 | 0.531 |
+| 10 | 0.094 | 0.060 | 0.385 |
+| 15 | 0.116 | 0.082 | 0.226 |
+| 20 | 0.409 | 0.140 | 0.200 |
+
+At depth 20, `QueryHandler` is ~16× slower than at depth 0; `SQH` is ~8× slower.
+
+![Nesting depth sweep](_static/images/benchmark_sweep_nesting_depth.png)
+
+#### Deep nesting by query type
+
+The nesting cost depends on query type. For group-structural queries (`group_match`, `two_term_and`) the engines must evaluate all candidate groups at each level, and `QueryHandler` shows a pronounced cost spike at depth 10 while `StringQueryHandler` stays flatter. All values in ms at depths 1–20.
+ +**Bare term:** + +| Depth | QueryHandler | SQH_no_lookup | basic_search | +| ----: | -----------: | ------------: | -----------: | +| 1 | 0.030 | 0.019 | 0.204 | +| 5 | 0.045 | 0.031 | 0.198 | +| 10 | 0.087 | 0.059 | 0.209 | +| 20 | 0.141 | 0.154 | 0.212 | + +![Deep nesting — bare term](_static/images/benchmark_sweep_deep_nest_bare_term.png) + +**Exact group `{}`:** + +| Depth | QueryHandler | SQH_no_lookup | +| ----: | -----------: | ------------: | +| 1 | 0.025 | 0.018 | +| 5 | 0.053 | 0.036 | +| 10 | 0.105 | 0.072 | +| 20 | 0.209 | 0.146 | + +![Deep nesting — exact group](_static/images/benchmark_sweep_deep_nest_exact_group.png) + +**Group match `[]`:** + +| Depth | QueryHandler | SQH_no_lookup | basic_search | +| ----: | -----------: | ------------: | -----------: | +| 1 | 0.032 | 0.020 | 0.520 | +| 5 | 0.054 | 0.038 | 0.551 | +| 10 | 0.181 | 0.063 | 0.536 | +| 20 | 0.324 | 0.118 | 0.658 | + +![Deep nesting — group match](_static/images/benchmark_sweep_deep_nest_group_match.png) + +`QueryHandler` at depth 10 is 5.7× its depth-1 cost; `StringQueryHandler` is only 3.2×. + +**Negation:** + +| Depth | QueryHandler | SQH_no_lookup | basic_search | +| ----: | -----------: | ------------: | -----------: | +| 1 | 0.021 | 0.014 | 0.128 | +| 5 | 0.055 | 0.037 | 0.163 | +| 10 | 0.101 | 0.072 | 0.121 | +| 20 | 0.177 | 0.129 | 0.112 | + +![Deep nesting — negation](_static/images/benchmark_sweep_deep_nest_negation.png) + +**Two-term AND:** + +| Depth | QueryHandler | SQH_no_lookup | basic_search | +| ----: | -----------: | ------------: | -----------: | +| 1 | 0.043 | 0.024 | 0.422 | +| 5 | 0.065 | 0.052 | 0.395 | +| 10 | 0.205 | 0.070 | 0.274 | +| 20 | 0.320 | 0.109 | 0.355 | + +![Deep nesting — two-term AND](_static/images/benchmark_sweep_deep_nest_two_and.png) + +### Repeated tags + +Repeating a target tag N times in the string. 
`basic_search`'s `verify_search_delimiters` uses `itertools.product` over delimiter positions; repeated instances multiply the internal search space. Tree-based engines are linear in the number of candidates and are not affected. + +| Occurrences | QueryHandler (ms) | SQH_no_lookup (ms) | basic_search (ms) | +| ----------: | ----------------: | -----------------: | ----------------: | +| 0 | 0.034 | 0.022 | 0.544 | +| 5 | 0.151 | 0.084 | 0.791 | +| 10 | 0.093 | 0.073 | 0.940 | +| 20 | 0.182 | 0.138 | 0.668 | +| 40 | 0.200 | 0.195 | 0.654 | + +![Repeated target tag sweep](_static/images/benchmark_sweep_repeated_tags.png) + +### Compile vs. search + +Query compilation is a one-time cost; subsequent searches against different strings reuse the compiled expression. Reusing a compiled handler across many strings amortises compilation cost to near zero. + +| Phase | QueryHandler (ms) | StringQueryHandler (ms) | +| ------- | ----------------: | ----------------------: | +| Compile | 0.004 | 0.005 | +| Search | 0.053 | 0.036 | + +![Compile vs. search cost breakdown](_static/images/benchmark_compile_vs_search.png) + +### Real BIDS data + +Search over 200 rows of the `eeg_ds003645s_hed` BIDS test dataset. 
+ +| Query | QueryHandler_loop (ms) | basic_search (ms) | search_series (ms) | +| ---------------------- | ---------------------: | ----------------: | -----------------: | +| `single_bare_term` | 9.0 | 2.5 | 6.5 | +| `single_wildcard` | 8.2 | 0.6 | 4.9 | +| `negation` | 8.0 | 0.9 | 6.8 | +| `two_term_and` | 8.8 | 1.2 | 4.9 | +| `three_term_and` | 8.5 | 1.9 | 5.1 | +| `group_nesting` | 7.9 | 0.3 | 7.8 | +| `two_term_or` | 7.9 | — | 6.8 | +| `exact_group` | 9.3 | — | 6.6 | +| `exact_group_optional` | 11.7 | — | 5.8 | +| `single_exact_term` | 8.1 | — | 5.5 | +| `wildcard_child` | 12.6 | — | 8.9 | +| `complex_composite` | 14.2 | — | 9.5 | + +![Real BIDS data — 200-row search times](_static/images/benchmark_real_data.png) + +### Tag count + +Number of tags in the HED string (1 to 100). `basic_search` time is dominated by regex compilation overhead and stays roughly constant; tree-based engines scale linearly with the number of nodes to traverse. + +| Tags | QueryHandler (ms) | SQH_no_lookup (ms) | basic_search (ms) | +| ---: | ----------------: | -----------------: | ----------------: | +| 1 | 0.014 | 0.004 | 0.294 | +| 5 | 0.019 | 0.013 | 0.163 | +| 10 | 0.031 | 0.018 | 0.150 | +| 25 | 0.061 | 0.080 | 0.124 | +| 50 | 0.149 | 0.160 | 0.184 | +| 100 | 0.287 | 0.167 | 0.271 | + +![Tag count sweep](_static/images/benchmark_sweep_tag_count.png) + +The tree-based crossover with `basic_search` occurs around 25–50 tags, where traversal cost meets the regex setup cost. + +### String form + +Short-form vs long-form HED strings. Long-form strings use fully expanded paths (e.g. `Event/Sensory-event`), increasing string length and parse cost. `basic_search` is largely unaffected because it matches on short tag names via word-boundary patterns. 
+ +| Form | QueryHandler (ms) | SQH_no_lookup (ms) | basic_search (ms) | +| ----- | ----------------: | -----------------: | ----------------: | +| short | 0.044 | 0.029 | 0.124 | +| long | 0.074 | 0.063 | 0.121 | + +![String form sweep](_static/images/benchmark_sweep_string_form.png) + +`QueryHandler` is 1.7× slower on long-form strings; `StringQueryHandler` is 2.2× slower. + +### Schema lookup overhead + +`StringQueryHandler` can be used with or without a `schema_lookup` dictionary. The dictionary enables ancestor-based matching (e.g. `Event` matches `Sensory-event`) at negligible per-call overhead. + +| Mode | StringQueryHandler (ms) | +| ----------- | ----------------------: | +| no_lookup | 0.030 | +| with_lookup | 0.029 | + +![Schema lookup overhead](_static/images/benchmark_sweep_schema_lookup.png) + +### Group count and query complexity + +More top-level parenthesised groups increase the number of children the tree must inspect. Query complexity (more AND/OR clauses) adds expression-tree nodes to evaluate per candidate. 
+ +**Group count** (0–20 single-level groups): + +| Groups | QueryHandler (ms) | SQH_no_lookup (ms) | basic_search (ms) | +| -----: | ----------------: | -----------------: | ----------------: | +| 0 | 0.032 | 0.022 | 0.139 | +| 1 | 0.028 | 0.019 | 0.129 | +| 5 | 0.045 | 0.030 | 0.114 | +| 10 | 0.080 | 0.053 | 0.135 | +| 20 | 0.140 | 0.085 | 0.136 | + +![Group count sweep](_static/images/benchmark_sweep_group_count.png) + +**Query complexity** (1-clause bare term → 8-clause composite): + +| Complexity | QueryHandler (ms) | SQH_no_lookup (ms) | basic_search | +| --------------------- | ----------------: | -----------------: | ------------ | +| 1 — single term | 0.134 | 0.100 | 0.247 ms | +| 2 — two AND | 0.152 | 0.093 | 0.405 ms | +| 3 — three AND | 0.158 | 0.103 | 0.460 ms | +| 4 — OR | 0.134 | 0.071 | — | +| 5 — negation | 0.088 | 0.056 | 0.286 ms | +| 6 — group `[]` | 0.138 | 0.094 | 0.361 ms | +| 7 — exact group `{}` | 0.120 | 0.078 | — | +| 8 — complex composite | 0.106 | 0.078 | — | + +![Query complexity sweep](_static/images/benchmark_sweep_query_complexity.png) + +### Choosing an implementation + +**Use `basic_search`** when you need the fastest possible series-level filter, your queries can be expressed with simple terms, AND, negation, or descendant wildcards (`*`), and schema-aware ancestor matching is not required. Ideal for quick event file filtering when query simplicity is acceptable. + +**Use `StringQueryHandler`** (via `search_series()`) when you need the full query language (OR, exact groups, logical groups, `?`/`??`/`???` wildcards) and are working with raw strings from tabular files or sidecars. This is the best general-purpose choice — it is ~40 % faster than a `QueryHandler` loop per string and close to `basic_search` on large strings. 
+ +**Use `QueryHandler`** when you already have parsed `HedString` objects (for example from a validation pipeline), or when you need results as structured `HedString`/`HedTag` objects rather than boolean matches. The additional overhead relative to `StringQueryHandler` comes from `HedString` construction, not from search expression evaluation, so reusing pre-parsed objects avoids the cost entirely. + +### Benchmark methodology + +- **Timing:** `timeit` — 20 iterations (single-string), 5 iterations (series), 10 iterations (sweeps). Median reported. +- **Schema:** HED 8.4.0, loaded once and reused. +- **Synthetic data:** Strings built from real schema tags with controlled tag count, nesting depth, group count, and tag repetition. +- **`schema_lookup`:** Generated via `generate_schema_lookup(schema)` — a dict mapping each short tag to its ancestor tuple, enabling ancestor-based matching in `StringQueryHandler` without a full schema load per string. +- **Hardware note:** Absolute timings depend on hardware; relative ratios between engines are the meaningful comparison. diff --git a/docs/search_implementation.md b/docs/search_implementation.md deleted file mode 100644 index f00f1fa69..000000000 --- a/docs/search_implementation.md +++ /dev/null @@ -1,157 +0,0 @@ ---- -html_meta: - description: Comparison of the three HED string search implementations in hedtools - basic_search, QueryHandler, and StringQueryHandler - keywords: HED search, string search, query handler, basic search, performance, hedtools, pattern matching ---- - -```{index} search, string search, query, QueryHandler, StringQueryHandler, basic_search -``` - -# HED search implementations - -HEDtools provides three distinct mechanisms for searching HED-annotated data. They share a common goal — "does this HED string match this query?" — but differ substantially in their inputs, capabilities, schema requirements, and performance characteristics. 
Choosing the right implementation depends on whether you need schema-aware ancestor matching, full group-structural queries, or raw throughput on unannotated strings. - -## Overview of the three implementations - -### `basic_search` — regex-based flat matching - -Located in {mod}`hed.models.basic_search`, the `find_matching()` function operates directly on a `pd.Series` of raw HED strings using compiled regular expressions. It requires no schema and no parsing step, making it the fastest option for bulk row filtering. - -Key characteristics: - -- Input is a `pd.Series` of raw strings; output is a `pd.Series[bool]` mask. -- The query is compiled once into a regex and applied with `Series.str.contains`. -- Matches are purely literal — `Event` does not match `Sensory-event`. -- `@A` in a basic-search query means A **must be present** anywhere in the string (note: this is the **opposite** of what `@A` means in `QueryHandler`/`StringQueryHandler`). -- `~A` means A must not appear anywhere (global negation). -- `(A, B)` syntax checks that A and B appear at the same nesting level. -- Wildcard `A*` expands to the regex `A.*?`, which can span `/` and match mid-token substrings. - -Use `basic_search` when you are working with a large series of raw strings, don't need ancestor matching, and want maximum throughput. See {func}`hed.models.basic_search.find_matching`. - -### `QueryHandler` — schema-backed object search - -Located in {mod}`hed.models.query_handler`, `QueryHandler` is the full-featured search engine. It compiles a query string into an expression tree once, then evaluates that tree against `HedString` objects that have already been parsed against a loaded `HedSchema`. - -Key characteristics: - -- Input is a `HedString` object; a full `HedSchema` is required. -- Output is a `list[SearchResult]` containing `HedTag` / `HedGroup` object references, useful for tag-level introspection (not just row filtering). 
-- Supports the complete query language: `&&`, `||`, `~`, `@`, `{}`, `[]`, `{:}`, `?`, `??`, `???`. -- `@A` means A must **not** appear anywhere in the string. -- Ancestor matching is exact — the schema normalises both query and string tags to short form, so `Event` matches `Sensory-event` because the schema knows `Sensory-event` descends from `Event`. -- Per-string cost includes a full HedString parse and schema tag resolution. - -Use `QueryHandler` when you need schema-aware ancestor matching, or when you want object references (e.g., to retrieve the matched group for further processing). See {class}`hed.models.query_handler.QueryHandler`. - -### `StringQueryHandler` — tree-based schema-optional search - -Located in {mod}`hed.models.string_search`, `StringQueryHandler` is a new middle-ground implementation that inherits from `QueryHandler` and reuses the full expression-tree compiler, but operates on raw strings rather than pre-parsed `HedString` objects. - -It parses each raw HED string into a lightweight {class}`~hed.models.string_search.StringNode` tree that duck-types the `HedGroup`/`HedTag` interfaces expected by the existing expression evaluators — so all `QueryHandler` query syntax works unchanged. - -Key characteristics: - -- Input is a raw string (or a `pd.Series` via {func}`~hed.models.string_search.search_series`). -- Schema is **optional**: pass a `schema_lookup` dict (see {mod}`hed.models.schema_lookup`) to enable ancestor matching for short-form strings (e.g. `Event` matching `Sensory-event`); omit it for purely literal matching. -- Output is a list (truthy/falsy) — row-filtering only, no object references. -- Supports the same full query syntax as `QueryHandler` (`&&`, `||`, `~`, `@`, `{}`, etc.). -- `@A` carries the same semantics as `QueryHandler` — A must **not** be present. -- Long-form strings (`Event/Sensory-event`) support ancestor matching via slash-splitting even without a lookup. 
Short-form strings (`Sensory-event`) require a `schema_lookup` for ancestor matching; without one, matching is purely literal. -- Parse cost is a lightweight recursive split — much cheaper than a full HedString + schema parse. - -Use `StringQueryHandler` when you have raw strings (not `HedString` objects), need the full `QueryHandler` query syntax, and either don't have a schema available or want faster processing at the cost of losing full schema-aware ancestor matching. See {class}`hed.models.string_search.StringQueryHandler`. - -### Generating a schema lookup - -If you want `StringQueryHandler` to resolve ancestors for short-form strings (e.g. query `Event` matching `Sensory-event`) without a full schema parse per row, you can pre-generate a lookup dictionary from a `HedSchema`: - -```python -from hed import load_schema_version -from hed import generate_schema_lookup, save_schema_lookup, load_schema_lookup - -schema = load_schema_version("8.4.0") -lookup = generate_schema_lookup(schema) # {short_name_casefold: tag_terms_tuple} - -# Persist for reuse -save_schema_lookup(lookup, "hed840_lookup.json") -lookup = load_schema_lookup("hed840_lookup.json") -``` - -See {func}`hed.models.schema_lookup.generate_schema_lookup`. 
- -______________________________________________________________________ - -## Comparison tables - -### Core characteristics - -| Property | `basic_search` | `QueryHandler` | `StringQueryHandler` | -| --------------------- | -------------------------- | -------------------------------------------------- | ----------------------------------------------- | -| **Input** | `pd.Series` of raw strings | `HedString` object | Raw string or `pd.Series` (via `search_series`) | -| **Schema required** | No | Yes — full `HedSchema` for tag parsing | No; optional `schema_lookup` dict | -| **Output** | `pd.Series[bool]` mask | `list[SearchResult]` with `HedTag`/`HedGroup` refs | `list` (truthy/falsy); `StringNode` refs | -| **Result usable for** | Row filtering | Row filtering + tag/group introspection | Row filtering only | -| **Batch API** | Native (`series`) | Manual loop | `search_series(series, query)` | -| **Parse cost** | Regex compilation once | Full `HedString` + schema parse per string | Lightweight tree parse per string | -| **Unrecognised tags** | Matched literally | Silent match failure (`tag_terms = ()`) | Matched literally | - -### Query syntax - -| Feature | `basic_search` query syntax | `QueryHandler` / `StringQueryHandler` query syntax | -| ---------------------------- | --------------------------------------------------- | -------------------------------------------------- | -| **AND** | Space or comma between terms (context-dependent) | `A && B` or `A, B` | -| **OR** | Not supported | `A \|\| B` | -| **Absent from string (`@`)** | ⚠️ `@A` means A **must be present** anywhere | `@A` means A must **not** appear anywhere | -| **Must-not-appear (`~`)** | `~A` — A must not appear anywhere (global) | `~A` — negation within group context (local) | -| **Prefix wildcard** | `A*` → regex `A.*?` (spans `/`, matches substrings) | `A*` → prefix on short form only | -| **Full regex per term** | Yes (`regex=True` mode) | No | -| **Quoted exact match** | No | `"A"` — exact 
match, no ancestor search | -| **Implicit default** | If no `(` or `@`: all terms become "anywhere" | No implicit conversion — must be explicit | - -### Group / structural operators - -| Feature | `basic_search` | `QueryHandler` | `StringQueryHandler` | -| --------------------------------- | ----------------------------------------- | -------------------------------------------- | ---------------------- | -| **Same nesting level** | `(A, B)` — A and B at same relative level | N/A — use `{A, B}` | N/A — use `{A, B}` | -| **Same parenthesised group `{}`** | No | `{A, B}` — must share a direct parent group | Same as `QueryHandler` | -| **Exact group `{:}`** | No | `{A, B:}` — same group, no other children | Same | -| **Optional exact group** | No | `{A, B: C}` — A and B required, C optional | Same | -| **Descendant group `[]`** | No | `[A, B]` — both in same subtree at any depth | Same | -| **Any child `?`** | No | `?` — any tag or group child | Same | -| **Any tag child `??`** | No | `??` — any leaf (non-group) child | Same | -| **Any group child `???`** | No | `???` — any parenthesised group child | Same | -| **Nested query operators** | No | Yes — full recursive composition | Same | - -### Ancestor / cross-form search - -| Scenario | `basic_search` | `QueryHandler` | `StringQueryHandler` | -| ------------------------------------------------------- | ------------------------------------------------- | --------------------------------------- | ---------------------------------------------------------------- | -| Query `Event`, string `Sensory-event` (short form) | ❌ literal only | ✅ `tag_terms` from schema | ✅ with `schema_lookup`; ❌ without | -| Query `Event`, string `Event/Sensory-event` (long form) | ❌ `Event` ≠ `Event/Sensory-event` | ✅ schema normalises | ✅ slash-split produces `tag_terms = ("event", "sensory-event")` | -| Query `Event/Sensory-event`, string `Sensory-event` | ❌ | ✅ schema normalises both to short form | ❌ no schema to normalise | -| Schema-free 
ancestor search | `convert_query()` + long-form series (workaround) | N/A — schema always required | ✅ works natively for long-form strings | -| Tag `Def/Name` matched by query `Def` | ❌ literal prefix mismatch | ✅ `short_base_tag = "Def"` | ✅ `tag_terms` contains `"def"` | - -### Critical semantic traps - -These differences are silent — no error, just wrong answers if you mix up query strings across implementations: - -| Operator | `basic_search` | `QueryHandler` / `StringQueryHandler` | -| ----------------- | -------------------------------------------------------- | ----------------------------------------------------------------------------------- | -| `@A` | A **must** appear anywhere in the string | A must **not** appear anywhere in the string | -| `~A` | A must not appear **anywhere** (global) | A must not appear in any group that also matches the rest of the expression (local) | -| `*` wildcard | Regex `.*?` — spans `/` and matches mid-token substrings | Strict prefix on the tag's short form — anchored to start | -| No-operator `A B` | Both present anywhere (implicit `@@`) | Parse error — `&&` required | - -______________________________________________________________________ - -## Performance - -*Performance benchmarks will be added here.* - -Preliminary guidance: - -- For large-scale row filtering on raw strings where schema awareness is not needed, `basic_search` is likely fastest due to vectorised regex on the full series with no per-row parsing. -- `StringQueryHandler` trades some throughput for full query-language support and optional ancestor matching; parse cost per row is a lightweight recursive split. -- `QueryHandler` has the highest per-string cost because it requires a pre-parsed `HedString` (including schema tag resolution), but provides the richest result objects.