diff --git a/bin/pseudobulk.py b/bin/pseudobulk.py index 2ae8902..d1ee544 100755 --- a/bin/pseudobulk.py +++ b/bin/pseudobulk.py @@ -69,8 +69,14 @@ def normalize_bool(val): return pd.NA def pseudobulk(input_df, basename, airr_schema): - # Load data, filter out TRA calls df = input_df + + # Standard Cell Ranger quality filters (applied only when columns are present) + for col in ["is_cell", "high_confidence", "productive"]: + if col in df.columns: + df = df[df[col].apply(normalize_bool) == "true"] + + # Load data, filter out TRA calls df_trb = df[df["v_call"].str.startswith("TRB") & df["j_call"].str.startswith("TRB")] # Define required aggregations @@ -173,8 +179,14 @@ def add_phenotype_info(input_df, basename, sobject_gex): return merged_df def pseudobulk_phenotype(input_df, basename, airr_schema): - # Load data, filter out TRA calls df = input_df + + # Standard Cell Ranger quality filters (applied only when columns are present) + for col in ["is_cell", "high_confidence", "productive"]: + if col in df.columns: + df = df[df[col].apply(normalize_bool) == "true"] + + # Load data, filter out TRA calls df_trb = df[df["v_call"].str.startswith("TRB") & df["j_call"].str.startswith("TRB")] # Define required aggregations diff --git a/bin/sample_calc.py b/bin/sample_calc.py index 644c2bd..7fc4b52 100755 --- a/bin/sample_calc.py +++ b/bin/sample_calc.py @@ -22,14 +22,21 @@ def extract_trb_family(allele): match = re.match(r'(TRB[V|D|J])(\d+)', allele) return f"{match.group(1)}{match.group(2)}" if match else None -def calc_gene_family(sample_name, counts, gene_column, family_prefix, max_index, output_file): - # Build list of all possible family names +def calc_gene_family(sample_name, counts, gene_column, family_prefix, output_file): + families = counts[gene_column].apply(extract_trb_family).dropna() + + if families.empty: + pd.DataFrame([{'sample': sample_name}]).to_csv(output_file, header=True, index=False) + return + + # Derive max family index from observed genes + max_index = families.str.extract(r'(\d+)$')[0].astype(int).max() all_fams = [f'{family_prefix}{i}' for i in range(1, max_index + 1)] # Count usage fam_df = counts[gene_column].apply(extract_trb_family).value_counts(dropna=False).to_frame().T - # Reindex to include all families + # Reindex to include all families from 1 to observed max fam_df = pd.DataFrame([fam_df.reindex(columns=all_fams, fill_value=0).iloc[0]]).reset_index(drop=True) # Add sample column @@ -122,9 +129,9 @@ def main(): # Read in the counts file counts = pd.read_csv(args.count_table, sep='\t') - calc_gene_family(sample, counts, 'v_call', 'TRBV', 30, f'v_family_{sample}.csv') - calc_gene_family(sample, counts, 'd_call', 'TRBD', 2, f'd_family_{sample}.csv') - calc_gene_family(sample, counts, 'j_call', 'TRBJ', 2, f'j_family_{sample}.csv') + calc_gene_family(sample, counts, 'v_call', 'TRBV', f'v_family_{sample}.csv') + calc_gene_family(sample, counts, 'd_call', 'TRBD', f'd_family_{sample}.csv') + calc_gene_family(sample, counts, 'j_call', 'TRBJ', f'j_family_{sample}.csv') calc_sample_stats(sample, counts, f'sample_stats_{sample}.csv') diff --git a/modules/local/annotate/main.nf b/modules/local/annotate/main.nf index 086fdbd..0f24d97 100644 --- a/modules/local/annotate/main.nf +++ b/modules/local/annotate/main.nf @@ -14,12 +14,7 @@ process ANNOTATE_PROCESS { python - <