From 389a328a5ca0ce4430955e2bdf4a5bb533b828c6 Mon Sep 17 00:00:00 2001 From: KevinMLanderos Date: Fri, 15 May 2026 16:22:25 -0400 Subject: [PATCH] fix: filter non-productive TCRs, dynamic gene family columns, single-cell quality filters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three bugs were fixed: 1. bin/sample_calc.py — Gene family CSVs (v_family, d_family, j_family) were built with hard-coded maximum indices (TRBV: 30, TRBD: 2, TRBJ: 2), silently dropping any gene with a number above those limits. The max index is now derived dynamically from genes observed in each sample. Samples with no valid calls for a gene type write a sample-only row instead of crashing. 2. modules/local/annotate/main.nf — ANNOTATE_PROCESS did not filter for productive TCRs, so non-productive rearrangements propagated into every downstream file: concatenated_cdr3_sorted.tsv, OLGA pgen calculation, TCRSHARING, and the patient workflow (GIANA, GLIPH2, overlap metrics). The process now reads the 'productive' column when present and retains only productive entries before writing per-sample _cdr3.tsv files, fixing all downstream analyses in one place. 3. bin/pseudobulk.py — Cell Ranger AIRR output was not filtered for cell or contig quality before pseudobulking. is_cell, high_confidence, and productive filters are now applied in both pseudobulk() and pseudobulk_phenotype() when those columns are present, ensuring background barcodes, low-confidence assemblies, and non-productive contigs are excluded from single-cell input. Co-Authored-By: Claude Sonnet 4.6 --- bin/pseudobulk.py | 16 ++++++++++++++-- bin/sample_calc.py | 19 +++++++++++++------ modules/local/annotate/main.nf | 20 ++++++++++++-------- 3 files changed, 39 insertions(+), 16 deletions(-) diff --git a/bin/pseudobulk.py b/bin/pseudobulk.py index 2ae8902..d1ee544 100755 --- a/bin/pseudobulk.py +++ b/bin/pseudobulk.py @@ -69,8 +69,14 @@ def normalize_bool(val): return pd.NA def pseudobulk(input_df, basename, airr_schema): - # Load data, filter out TRA calls df = input_df + + # Standard Cell Ranger quality filters (applied only when columns are present) + for col in ["is_cell", "high_confidence", "productive"]: + if col in df.columns: + df = df[df[col].apply(normalize_bool) == "true"] + + # Load data, filter out TRA calls df_trb = df[df["v_call"].str.startswith("TRB") & df["j_call"].str.startswith("TRB")] # Define required aggregations @@ -173,8 +179,14 @@ def add_phenotype_info(input_df, basename, sobject_gex): return merged_df def pseudobulk_phenotype(input_df, basename, airr_schema): - # Load data, filter out TRA calls df = input_df + + # Standard Cell Ranger quality filters (applied only when columns are present) + for col in ["is_cell", "high_confidence", "productive"]: + if col in df.columns: + df = df[df[col].apply(normalize_bool) == "true"] + + # Load data, filter out TRA calls df_trb = df[df["v_call"].str.startswith("TRB") & df["j_call"].str.startswith("TRB")] # Define required aggregations diff --git a/bin/sample_calc.py b/bin/sample_calc.py index 644c2bd..7fc4b52 100755 --- a/bin/sample_calc.py +++ b/bin/sample_calc.py @@ -22,14 +22,21 @@ def extract_trb_family(allele): match = re.match(r'(TRB[V|D|J])(\d+)', allele) return f"{match.group(1)}{match.group(2)}" if match else None -def calc_gene_family(sample_name, counts, gene_column, family_prefix, max_index, output_file): - # Build list of all possible family names +def calc_gene_family(sample_name, counts, gene_column, family_prefix, output_file): + families = counts[gene_column].apply(extract_trb_family).dropna() + + if families.empty: + pd.DataFrame([{'sample': sample_name}]).to_csv(output_file, header=True, index=False) + return + + # Derive max family index from observed genes + max_index = families.str.extract(r'(\d+)$')[0].astype(int).max() all_fams = [f'{family_prefix}{i}' for i in range(1, max_index + 1)] # Count usage fam_df = counts[gene_column].apply(extract_trb_family).value_counts(dropna=False).to_frame().T - # Reindex to include all families + # Reindex to include all families from 1 to observed max fam_df = pd.DataFrame([fam_df.reindex(columns=all_fams, fill_value=0).iloc[0]]).reset_index(drop=True) # Add sample column @@ -122,9 +129,9 @@ def main(): # Read in the counts file counts = pd.read_csv(args.count_table, sep='\t') - calc_gene_family(sample, counts, 'v_call', 'TRBV', 30, f'v_family_{sample}.csv') - calc_gene_family(sample, counts, 'd_call', 'TRBD', 2, f'd_family_{sample}.csv') - calc_gene_family(sample, counts, 'j_call', 'TRBJ', 2, f'j_family_{sample}.csv') + calc_gene_family(sample, counts, 'v_call', 'TRBV', f'v_family_{sample}.csv') + calc_gene_family(sample, counts, 'd_call', 'TRBD', f'd_family_{sample}.csv') + calc_gene_family(sample, counts, 'j_call', 'TRBJ', f'j_family_{sample}.csv') calc_sample_stats(sample, counts, f'sample_stats_{sample}.csv') diff --git a/modules/local/annotate/main.nf b/modules/local/annotate/main.nf index 086fdbd..0f24d97 100644 --- a/modules/local/annotate/main.nf +++ b/modules/local/annotate/main.nf @@ -14,12 +14,7 @@ process ANNOTATE_PROCESS { python - <