From 389a328a5ca0ce4430955e2bdf4a5bb533b828c6 Mon Sep 17 00:00:00 2001
From: KevinMLanderos <kevinmezalanderos@gmail.com>
Date: Fri, 15 May 2026 16:22:25 -0400
Subject: [PATCH] fix: filter non-productive TCRs, dynamic gene family columns,
 single-cell quality filters
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three bugs were fixed:

  1. bin/sample_calc.py — Gene family CSVs (v_family, d_family, j_family) were
     built with hard-coded maximum indices (TRBV: 30, TRBD: 2, TRBJ: 2), silently
     dropping any gene with a number above those limits. The max index is now
     derived dynamically from genes observed in each sample. Samples with no valid
     calls for a gene type write a sample-only row instead of crashing.

  2. modules/local/annotate/main.nf — ANNOTATE_PROCESS did not filter for
     productive TCRs, so non-productive rearrangements propagated into every
     downstream file: concatenated_cdr3_sorted.tsv, OLGA pgen calculation,
     TCRSHARING, and the patient workflow (GIANA, GLIPH2, overlap metrics).
     The process now reads the 'productive' column when present and retains only
     productive entries before writing per-sample _cdr3.tsv files, fixing all
     downstream analyses in one place.

  3. bin/pseudobulk.py — Cell Ranger AIRR output was not filtered for cell or
     contig quality before pseudobulking. is_cell, high_confidence, and productive
     filters are now applied in both pseudobulk() and pseudobulk_phenotype() when
     those columns are present, ensuring background barcodes, low-confidence
     assemblies, and non-productive contigs are excluded from single-cell input.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 bin/pseudobulk.py              | 16 ++++++++++++++--
 bin/sample_calc.py             | 19 +++++++++++++------
 modules/local/annotate/main.nf | 20 ++++++++++++--------
 3 files changed, 39 insertions(+), 16 deletions(-)

diff --git a/bin/pseudobulk.py b/bin/pseudobulk.py
index 2ae8902..d1ee544 100755
--- a/bin/pseudobulk.py
+++ b/bin/pseudobulk.py
@@ -69,8 +69,14 @@ def normalize_bool(val):
     return pd.NA 
 
 def pseudobulk(input_df, basename, airr_schema):
-    # Load data, filter out TRA calls
     df = input_df
+
+    # Standard Cell Ranger quality filters (applied only when columns are present)
+    for col in ["is_cell", "high_confidence", "productive"]:
+        if col in df.columns:
+            df = df[df[col].apply(normalize_bool) == "true"]
+
+    # Load data, filter out TRA calls
     df_trb = df[df["v_call"].str.startswith("TRB") & df["j_call"].str.startswith("TRB")]
 
     # Define required aggregations
@@ -173,8 +179,14 @@ def add_phenotype_info(input_df, basename, sobject_gex):
     return merged_df
 
 def pseudobulk_phenotype(input_df, basename, airr_schema):
-    # Load data, filter out TRA calls
     df = input_df
+
+    # Standard Cell Ranger quality filters (applied only when columns are present)
+    for col in ["is_cell", "high_confidence", "productive"]:
+        if col in df.columns:
+            df = df[df[col].apply(normalize_bool) == "true"]
+
+    # Load data, filter out TRA calls
     df_trb = df[df["v_call"].str.startswith("TRB") & df["j_call"].str.startswith("TRB")]
 
     # Define required aggregations
diff --git a/bin/sample_calc.py b/bin/sample_calc.py
index 644c2bd..7fc4b52 100755
--- a/bin/sample_calc.py
+++ b/bin/sample_calc.py
@@ -22,14 +22,21 @@ def extract_trb_family(allele):
     match = re.match(r'(TRB[V|D|J])(\d+)', allele)
     return f"{match.group(1)}{match.group(2)}" if match else None
 
-def calc_gene_family(sample_name, counts, gene_column, family_prefix, max_index, output_file):
-    # Build list of all possible family names
+def calc_gene_family(sample_name, counts, gene_column, family_prefix, output_file):
+    families = counts[gene_column].apply(extract_trb_family).dropna()
+
+    if families.empty:
+        pd.DataFrame([{'sample': sample_name}]).to_csv(output_file, header=True, index=False)
+        return
+
+    # Derive max family index from observed genes
+    max_index = families.str.extract(r'(\d+)$')[0].astype(int).max()
     all_fams = [f'{family_prefix}{i}' for i in range(1, max_index + 1)]
 
     # Count usage
     fam_df = counts[gene_column].apply(extract_trb_family).value_counts(dropna=False).to_frame().T
 
-    # Reindex to include all families
+    # Reindex to include all families from 1 to observed max
     fam_df = pd.DataFrame([fam_df.reindex(columns=all_fams, fill_value=0).iloc[0]]).reset_index(drop=True)
 
     # Add sample column
@@ -122,9 +129,9 @@ def main():
     # Read in the counts file
     counts = pd.read_csv(args.count_table, sep='\t')
 
-    calc_gene_family(sample, counts, 'v_call', 'TRBV', 30, f'v_family_{sample}.csv')
-    calc_gene_family(sample, counts, 'd_call', 'TRBD', 2, f'd_family_{sample}.csv')
-    calc_gene_family(sample, counts, 'j_call', 'TRBJ', 2, f'j_family_{sample}.csv')
+    calc_gene_family(sample, counts, 'v_call', 'TRBV', f'v_family_{sample}.csv')
+    calc_gene_family(sample, counts, 'd_call', 'TRBD', f'd_family_{sample}.csv')
+    calc_gene_family(sample, counts, 'j_call', 'TRBJ', f'j_family_{sample}.csv')
 
     calc_sample_stats(sample, counts, f'sample_stats_{sample}.csv')
 
diff --git a/modules/local/annotate/main.nf b/modules/local/annotate/main.nf
index 086fdbd..0f24d97 100644
--- a/modules/local/annotate/main.nf
+++ b/modules/local/annotate/main.nf
@@ -14,12 +14,7 @@ process ANNOTATE_PROCESS {
     python - <<EOF
 import pandas as pd
 
-USECOLS = [
-    "junction_aa",
-    "v_call",
-    "j_call",
-    "duplicate_count"
-]
+WANTED_COLS = ["junction_aa", "v_call", "j_call", "duplicate_count", "productive"]
 
 COLMAP = {
     "junction_aa": "CDR3b",
@@ -28,10 +23,14 @@ COLMAP = {
     "duplicate_count": "counts"
 }
 
+# Only read columns that are actually present in the file
+available = pd.read_csv("${count_table}", sep='\t', nrows=0).columns.tolist()
+usecols = [c for c in WANTED_COLS if c in available]
+
 df = pd.read_csv(
     "${count_table}",
     sep='\t',
-    usecols=USECOLS,
+    usecols=usecols,
     dtype={
         "junction_aa": "string",
         "v_call": "string",
@@ -39,8 +38,13 @@ df = pd.read_csv(
         "duplicate_count": "int"
     })
 
+df = df[df.junction_aa.notna()]
+
+if "productive" in df.columns:
+    df = df[df["productive"].astype(str).str.lower().isin(["t", "true", "1", "yes"])]
+
 df = (
-    df[df.junction_aa.notna()]
+    df
     .rename(columns=COLMAP)
     [["CDR3b", "TRBV", "TRBJ", "counts"]]
 )