Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions bin/pseudobulk.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,14 @@ def normalize_bool(val):
return pd.NA

def pseudobulk(input_df, basename, airr_schema):
# Load data, filter out TRA calls
df = input_df

# Standard Cell Ranger quality filters (applied only when columns are present)
for col in ["is_cell", "high_confidence", "productive"]:
if col in df.columns:
df = df[df[col].apply(normalize_bool) == "true"]

# Load data, filter out TRA calls
df_trb = df[df["v_call"].str.startswith("TRB") & df["j_call"].str.startswith("TRB")]

# Define required aggregations
Expand Down Expand Up @@ -173,8 +179,14 @@ def add_phenotype_info(input_df, basename, sobject_gex):
return merged_df

def pseudobulk_phenotype(input_df, basename, airr_schema):
# Load data, filter out TRA calls
df = input_df

# Standard Cell Ranger quality filters (applied only when columns are present)
for col in ["is_cell", "high_confidence", "productive"]:
if col in df.columns:
df = df[df[col].apply(normalize_bool) == "true"]

# Load data, filter out TRA calls
df_trb = df[df["v_call"].str.startswith("TRB") & df["j_call"].str.startswith("TRB")]

# Define required aggregations
Expand Down
19 changes: 13 additions & 6 deletions bin/sample_calc.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,21 @@ def extract_trb_family(allele):
match = re.match(r'(TRB[V|D|J])(\d+)', allele)
return f"{match.group(1)}{match.group(2)}" if match else None

def calc_gene_family(sample_name, counts, gene_column, family_prefix, max_index, output_file):
# Build list of all possible family names
def calc_gene_family(sample_name, counts, gene_column, family_prefix, output_file):
families = counts[gene_column].apply(extract_trb_family).dropna()

if families.empty:
pd.DataFrame([{'sample': sample_name}]).to_csv(output_file, header=True, index=False)
return

# Derive max family index from observed genes
max_index = families.str.extract(r'(\d+)$')[0].astype(int).max()
all_fams = [f'{family_prefix}{i}' for i in range(1, max_index + 1)]

# Count usage
fam_df = counts[gene_column].apply(extract_trb_family).value_counts(dropna=False).to_frame().T

# Reindex to include all families
# Reindex to include all families from 1 to observed max
fam_df = pd.DataFrame([fam_df.reindex(columns=all_fams, fill_value=0).iloc[0]]).reset_index(drop=True)

# Add sample column
Expand Down Expand Up @@ -122,9 +129,9 @@ def main():
# Read in the counts file
counts = pd.read_csv(args.count_table, sep='\t')

calc_gene_family(sample, counts, 'v_call', 'TRBV', 30, f'v_family_{sample}.csv')
calc_gene_family(sample, counts, 'd_call', 'TRBD', 2, f'd_family_{sample}.csv')
calc_gene_family(sample, counts, 'j_call', 'TRBJ', 2, f'j_family_{sample}.csv')
calc_gene_family(sample, counts, 'v_call', 'TRBV', f'v_family_{sample}.csv')
calc_gene_family(sample, counts, 'd_call', 'TRBD', f'd_family_{sample}.csv')
calc_gene_family(sample, counts, 'j_call', 'TRBJ', f'j_family_{sample}.csv')

calc_sample_stats(sample, counts, f'sample_stats_{sample}.csv')

Expand Down
20 changes: 12 additions & 8 deletions modules/local/annotate/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,7 @@ process ANNOTATE_PROCESS {
python - <<EOF
import pandas as pd

USECOLS = [
"junction_aa",
"v_call",
"j_call",
"duplicate_count"
]
WANTED_COLS = ["junction_aa", "v_call", "j_call", "duplicate_count", "productive"]

COLMAP = {
"junction_aa": "CDR3b",
Expand All @@ -28,19 +23,28 @@ COLMAP = {
"duplicate_count": "counts"
}

# Only read columns that are actually present in the file
available = pd.read_csv("${count_table}", sep='\t', nrows=0).columns.tolist()
usecols = [c for c in WANTED_COLS if c in available]

df = pd.read_csv(
"${count_table}",
sep='\t',
usecols=USECOLS,
usecols=usecols,
dtype={
"junction_aa": "string",
"v_call": "string",
"j_call": "string",
"duplicate_count": "int"
})

df = df[df.junction_aa.notna()]

if "productive" in df.columns:
df = df[df["productive"].astype(str).str.lower().isin(["t", "true", "1", "yes"])]

df = (
df[df.junction_aa.notna()]
df
.rename(columns=COLMAP)
[["CDR3b", "TRBV", "TRBJ", "counts"]]
)
Expand Down
Loading