KarchinLab · KevinMLanderos · May 15, 2026
diff --git a/bin/pseudobulk.py b/bin/pseudobulk.py
@@ -69,8 +69,14 @@ def normalize_bool(val):
     return pd.NA 
 
 def pseudobulk(input_df, basename, airr_schema):
-    # Load data, filter out TRA calls
     df = input_df
+
+    # Standard Cell Ranger quality filters (applied only when columns are present)
+    for col in ["is_cell", "high_confidence", "productive"]:
+        if col in df.columns:
+            df = df[df[col].apply(normalize_bool) == "true"]
+
+    # Load data, filter out TRA calls
     df_trb = df[df["v_call"].str.startswith("TRB") & df["j_call"].str.startswith("TRB")]
 
     # Define required aggregations
@@ -173,8 +179,14 @@ def add_phenotype_info(input_df, basename, sobject_gex):
     return merged_df
 
 def pseudobulk_phenotype(input_df, basename, airr_schema):
-    # Load data, filter out TRA calls
     df = input_df
+
+    # Standard Cell Ranger quality filters (applied only when columns are present)
+    for col in ["is_cell", "high_confidence", "productive"]:
+        if col in df.columns:
+            df = df[df[col].apply(normalize_bool) == "true"]
+
+    # Load data, filter out TRA calls
     df_trb = df[df["v_call"].str.startswith("TRB") & df["j_call"].str.startswith("TRB")]
 
     # Define required aggregations

diff --git a/bin/sample_calc.py b/bin/sample_calc.py
@@ -22,14 +22,21 @@ def extract_trb_family(allele):
     match = re.match(r'(TRB[V|D|J])(\d+)', allele)
     return f"{match.group(1)}{match.group(2)}" if match else None
 
-def calc_gene_family(sample_name, counts, gene_column, family_prefix, max_index, output_file):
-    # Build list of all possible family names
+def calc_gene_family(sample_name, counts, gene_column, family_prefix, output_file):
+    families = counts[gene_column].apply(extract_trb_family).dropna()
+
+    if families.empty:
+        pd.DataFrame([{'sample': sample_name}]).to_csv(output_file, header=True, index=False)
+        return
+
+    # Derive max family index from observed genes
+    max_index = families.str.extract(r'(\d+)$')[0].astype(int).max()
     all_fams = [f'{family_prefix}{i}' for i in range(1, max_index + 1)]
 
     # Count usage
     fam_df = counts[gene_column].apply(extract_trb_family).value_counts(dropna=False).to_frame().T
 
-    # Reindex to include all families
+    # Reindex to include all families from 1 to observed max
     fam_df = pd.DataFrame([fam_df.reindex(columns=all_fams, fill_value=0).iloc[0]]).reset_index(drop=True)
 
     # Add sample column
@@ -122,9 +129,9 @@ def main():
     # Read in the counts file
     counts = pd.read_csv(args.count_table, sep='\t')
 
-    calc_gene_family(sample, counts, 'v_call', 'TRBV', 30, f'v_family_{sample}.csv')
-    calc_gene_family(sample, counts, 'd_call', 'TRBD', 2, f'd_family_{sample}.csv')
-    calc_gene_family(sample, counts, 'j_call', 'TRBJ', 2, f'j_family_{sample}.csv')
+    calc_gene_family(sample, counts, 'v_call', 'TRBV', f'v_family_{sample}.csv')
+    calc_gene_family(sample, counts, 'd_call', 'TRBD', f'd_family_{sample}.csv')
+    calc_gene_family(sample, counts, 'j_call', 'TRBJ', f'j_family_{sample}.csv')
 
     calc_sample_stats(sample, counts, f'sample_stats_{sample}.csv')
 

diff --git a/modules/local/annotate/main.nf b/modules/local/annotate/main.nf
@@ -14,12 +14,7 @@ process ANNOTATE_PROCESS {
     python - <<EOF
 import pandas as pd
 
-USECOLS = [
-    "junction_aa",
-    "v_call",
-    "j_call",
-    "duplicate_count"
-]
+WANTED_COLS = ["junction_aa", "v_call", "j_call", "duplicate_count", "productive"]
 
 COLMAP = {
     "junction_aa": "CDR3b",
@@ -28,19 +23,28 @@ COLMAP = {
     "duplicate_count": "counts"
 }
 
+# Only read columns that are actually present in the file
+available = pd.read_csv("${count_table}", sep='\t', nrows=0).columns.tolist()
+usecols = [c for c in WANTED_COLS if c in available]
+
 df = pd.read_csv(
     "${count_table}",
     sep='\t',
-    usecols=USECOLS,
+    usecols=usecols,
     dtype={
         "junction_aa": "string",
         "v_call": "string",
         "j_call": "string",
         "duplicate_count": "int"
     })
 
+df = df[df.junction_aa.notna()]
+
+if "productive" in df.columns:
+    df = df[df["productive"].astype(str).str.lower().isin(["t", "true", "1", "yes"])]
+
 df = (
-    df[df.junction_aa.notna()]
+    df
     .rename(columns=COLMAP)
     [["CDR3b", "TRBV", "TRBJ", "counts"]]
 )