From 856736f8f729b6aa8581ed9963778b949ada332a Mon Sep 17 00:00:00 2001 From: brysonbennett Date: Thu, 13 Mar 2025 10:25:05 -0700 Subject: [PATCH 1/2] Added ability to read in a TSV with retention times, and add them to the output .msp. Also found that the LysoCL class had a class specific generate fragments function, which I removed as it was redudntant. --- calicolipidlibrary/LysoCL.py | 38 ++++++++++----------- calicolipidlibrary/lipidRules.py | 43 +++++++++++------------- generateDB.py | 57 +++++++++++++++++++++++++++----- 3 files changed, 86 insertions(+), 52 deletions(-) diff --git a/calicolipidlibrary/LysoCL.py b/calicolipidlibrary/LysoCL.py index 406aae8..36889ab 100644 --- a/calicolipidlibrary/LysoCL.py +++ b/calicolipidlibrary/LysoCL.py @@ -96,25 +96,25 @@ def theoreticalDigest(self): return FRAGMENTS - def generateLibrary(self, target=None, mode="pos"): - if target: - handle = open(target, "a+") - if mode == "pos": - adduct_set = self.pos_adduct_set - elif mode == "neg": - adduct_set = self.neg_adduct_set - # parent = self.__bases__[0] - class_name = self.__class__.__name__ - for c in self.chain_sets: - for adduct in adduct_set: - self.set_chains_and_adduct(class_name, c[:3], adduct=adduct) - content = self.printNist() - if target: - handle.write(content) - else: - sys.stdout.write(content) - if target: - handle.close() + # def generateLibrary(self, target=None, mode="pos", retention_times): + # if target: + # handle = open(target, "a+") + # if mode == "pos": + # adduct_set = self.pos_adduct_set + # elif mode == "neg": + # adduct_set = self.neg_adduct_set + # # parent = self.__bases__[0] + # class_name = self.__class__.__name__ + # for c in self.chain_sets: + # for adduct in adduct_set: + # self.set_chains_and_adduct(class_name, c[:3], adduct=adduct) + # content = self.printNist(retention_times) + # if target: + # handle.write(content) + # else: + # sys.stdout.write(content) + # if target: + # handle.close() # x = LysoCL("LysoCL",[[16,0,0], [18,1,0], [18,2,0]], adduct="[M+H]+") diff --git a/calicolipidlibrary/lipidRules.py b/calicolipidlibrary/lipidRules.py index b450900..12bd6fd 100644 --- a/calicolipidlibrary/lipidRules.py +++ b/calicolipidlibrary/lipidRules.py @@ -1,19 +1,15 @@ import re import sys -from types import NoneType - from create_smiles import * -from rdkit import Chem -from rdkit.Chem import Descriptors -from rdkit.Chem import rdMolDescriptors -H2O = 18.01056 -NH3 = 17.00274 + +H2O = 18.010564683 +NH3 = 17.026549100 PROTON = 1.00727645229 HYDROGEN = 1.0078250321 OXYGEN = 15.9949146221 -ELECTRON = 0.0005489 +ELECTRON = 0.00054857990 ADDUCT = { "[M]": 0.0, @@ -559,7 +555,7 @@ def prettyFormula(self): return formulaStr - def printNist(self): + def printNist(self, retention_times): bbType = LIPID_BACKBONES[self.lipidclass][0] Nchains = len(self.chains) totalC = 0 @@ -622,6 +618,7 @@ def printNist(self): ChainStrings = str(self.chains[0][0]) FullName = self.lipidclass + ChainStrings FullNameWithAdduct = FullName + " " + self.adduct + SumName = FullName atoms = self.MF() MASS = MW_list(atoms) @@ -731,21 +728,18 @@ def printNist(self): RECORD.append("SMILES: " + get_lipid_smiles(self.lipidclass, LIPID_BACKBONES[self.lipidclass][0], LIPID_BACKBONES[self.lipidclass][1], self.chains) + "\n") RECORD.append("NumPeaks: " + str(len(FRAGMENTS)) + "\n") + #Add retention times if they are available + if retention_times is not None: + this_rt_entry = retention_times[retention_times["name"] == FullName] + if not this_rt_entry.empty: + if "rt" in this_rt_entry.columns.tolist(): + RECORD.append(f"RT: {this_rt_entry.iloc[0]["rt"]:.2f}\n") + if "rt_min" in this_rt_entry.columns.tolist(): + RECORD.append(f"RT_min: {this_rt_entry.iloc[0]["rt_min"]:.2f}\n") + if "rt_max" in this_rt_entry.columns.tolist(): + RECORD.append(f"RT_max: {this_rt_entry.iloc[0]["rt_max"]:.2f}\n") - # smiles = get_lipid_smiles(self.lipidclass, LIPID_BACKBONES[self.lipidclass][0], - # LIPID_BACKBONES[self.lipidclass][1], self.chains) - - # Temporary checks to make sure that molecular weight from smiles matches that calculated from headgroup+sn - # mol = Chem.MolFromSmiles(smiles) - # smiles_mw = Chem.Descriptors.ExactMolWt(mol) - # if smiles == "" or smiles_mw == 0.0 or (abs(MASS - smiles_mw) > .0001): - # print(FullName) - # print("SMILES MF: " + str(Chem.rdMolDescriptors.CalcMolFormula(mol))) - - #RECORD.append("SMILES MF: " + str(Chem.rdMolDescriptors.CalcMolFormula(mol)) + "\n") - #end of temporary check - for f in FRAGMENTS: RECORD.append(str(f[0]) + " " + str(f[1]) + " " + str(f[2]) + "\n") RECORD.append("\n\n") @@ -770,19 +764,20 @@ def theoreticalDigest(self): # FRAGMENTS.append( [PREC, 1000, "pre"] ) return FRAGMENTS - def generateLibrary(self, target=None, mode="pos"): + def generateLibrary(self, target=None, mode="pos", retention_times=None): if target: handle = open(target, "a+") if mode == "pos": adduct_set = self.pos_adduct_set elif mode == "neg": adduct_set = self.neg_adduct_set + #self.rt = # parent = self.__bases__[0] class_name = self.__class__.__name__ for c in self.chain_sets: for adduct in adduct_set: self.set_chains_and_adduct(class_name, c, adduct=adduct) - content = self.printNist() + content = self.printNist(retention_times) if target and content is not None: handle.write(content) elif content is not None: diff --git a/generateDB.py b/generateDB.py index 6d17923..29b9472 100644 --- a/generateDB.py +++ b/generateDB.py @@ -1,6 +1,9 @@ import datetime import argparse import os +import pandas as pd +from nltk import rte_features + from calicolipidlibrary import * @@ -38,6 +41,13 @@ def cli_parser(): default="", ) + parser.add_argument( + "-r", + "--retention-times", + help="retention time file name", + dest="retention_time_file_name", + default="", + ) return parser @@ -64,6 +74,13 @@ def cli_parser(): print("AVAILABLE LIPID CLASSES:") for lipid_class in ALL_LIPID_CLASSES: print(lipid_class) + print("-r :") + print("\tSupply path and filename for a .tsv containing known retention times\n") + #Needs Name RT or Name and RT+rt_min+rt_max or Name and rt_min_rt_max? + print("\tFile must contain a column with \"name\" and \"rt\" and/or\n") + print("\t\"name\" and both of \"rt_max\" and \"rt_min\"\n") + print("\tDEFAULT: No retention times will be associated.\n") + exit(0) args = cli_parser().parse_args() @@ -86,6 +103,30 @@ def cli_parser(): exit(1) + + + if args.retention_time_file_name != "": + if not os.path.exists(args.retention_time_file_name): + print("Supplied retention time files does not exist. Exiting program.") + exit(1) + + retention_times_df = pd.read_csv(args.retention_time_file_name, delimiter = '\t') + retention_times_df.columns = retention_times_df.columns.str.lower() #make column names lower case + rt_col_names = retention_times_df.columns.tolist() + + #check that we have Name and RT or both of RT_min and RT_max (or all three) + if ("name" not in rt_col_names) and (("rt" in rt_col_names) or ("rt_max" not in rt_col_names and "rt_min" not in rt_col_names)): + print("Supplied retention time file is improperly formatted.\n") + print("\tSupply path and filename for a .tsv containing known retention times\n") + print("\tFile must contain a column with \"name\" and \"rt\" and/or\n") + print("\tboth of \"rt_max\" and \"rt_min\"\n") + print("\tExiting program.") + exit(1) + else: + retention_times_df = retention_times_df.loc[:, retention_times_df.columns.isin(["name", "rt", "rt_min", "rt_max"])] + else: + retention_times_df = None + lipid_classes = [] if len(args.lipid_classes) == 0: lipid_classes = ALL_LIPID_CLASSES.copy() @@ -128,18 +169,16 @@ def cli_parser(): open(output_msp_file, "w") - for lipid_class in lipid_classes: + lipid_classes_list = list(lipid_classes) + lipid_classes_list.sort() #put lipid classes in alphabetical order + for lipid_class in lipid_classes_list: msg = "Enumerating spectra for " + lipid_class + " ... " instantiate_str = lipid_class + " = " + lipid_class + "()" sys.stdout.write(msg) - eval_str = ( - lipid_class - + ".generateLibrary(target='" - + output_msp_file - + "', mode='" - + ion - + "')" - ) + eval_str = (f"{lipid_class}.generateLibrary(target='{output_msp_file}', mode='{ion}', retention_times=retention_times_df)" + ) + + exec(instantiate_str) eval(eval_str) sys.stdout.write(" DONE\n") From 7c3604f47ec315472dddfc0234c66eb0095a170a Mon Sep 17 00:00:00 2001 From: brysonbennett Date: Wed, 29 Apr 2026 08:36:17 -0700 Subject: [PATCH 2/2] CLL issue #17, mass_spec issue #1920. Add two double bond LCB sepcies to Ceramides, fix LCB mass (from 262 to 264 for SMs --- calicolipidlibrary/Ceramide.py | 8 +++++--- calicolipidlibrary/SM.py | 4 ++-- calicolipidlibrary/lipidRules.py | 15 ++++++++------- calicolipidlibrary/utils.py | 2 +- 4 files changed, 16 insertions(+), 13 deletions(-) diff --git a/calicolipidlibrary/Ceramide.py b/calicolipidlibrary/Ceramide.py index 26837ed..3d4a71d 100644 --- a/calicolipidlibrary/Ceramide.py +++ b/calicolipidlibrary/Ceramide.py @@ -4,11 +4,13 @@ # need to fix nomenclature class Ceramide(SphingoLipid): +# h is one less than nomenclature as the first OH is defined as part of the head group +# so h = 1 is a d-LCB species chain1_ranges = [] for c in range(14, 23): - for d in range(0, 2): - for h in range(1, 3): - if (h > 1 and d > 0): + for d in range(0, 3): + for h in range(0, 3): + if (h > 1 and d > 1): continue chain1_ranges.append([c, d, h]) diff --git a/calicolipidlibrary/SM.py b/calicolipidlibrary/SM.py index 4c027c8..ba643b2 100644 --- a/calicolipidlibrary/SM.py +++ b/calicolipidlibrary/SM.py @@ -27,7 +27,7 @@ def theoreticalDigest(self): MW_list( [ self.chains[0][0], - (2 * self.chains[0][0] - 2 * self.chains[0][1] - 1), + (2 * self.chains[0][0] - 2 * self.chains[0][1] + 1), 1, self.chains[0][2], 0, @@ -45,7 +45,7 @@ def theoreticalDigest(self): MW_list( [ self.chains[0][0], - (2 * self.chains[0][0] - 2 * self.chains[0][1] - 1), + (2 * self.chains[0][0] - 2 * self.chains[0][1] + 1), 1, self.chains[0][2], 0, diff --git a/calicolipidlibrary/lipidRules.py b/calicolipidlibrary/lipidRules.py index 12bd6fd..3392f64 100644 --- a/calicolipidlibrary/lipidRules.py +++ b/calicolipidlibrary/lipidRules.py @@ -727,7 +727,6 @@ def printNist(self, retention_times): RECORD.append("FORMULA: " + self.prettyFormula() + "\n") RECORD.append("SMILES: " + get_lipid_smiles(self.lipidclass, LIPID_BACKBONES[self.lipidclass][0], LIPID_BACKBONES[self.lipidclass][1], self.chains) + "\n") - RECORD.append("NumPeaks: " + str(len(FRAGMENTS)) + "\n") #Add retention times if they are available if retention_times is not None: this_rt_entry = retention_times[retention_times["name"] == FullName] @@ -739,6 +738,8 @@ def printNist(self, retention_times): if "rt_max" in this_rt_entry.columns.tolist(): RECORD.append(f"RT_max: {this_rt_entry.iloc[0]["rt_max"]:.2f}\n") + RECORD.append("NumPeaks: " + str(len(FRAGMENTS)) + "\n") + for f in FRAGMENTS: RECORD.append(str(f[0]) + " " + str(f[1]) + " " + str(f[2]) + "\n") @@ -883,10 +884,8 @@ class NAcylGPL(Lipid): for c1 in chain1_ranges: for c2 in chain2_ranges: for c3 in chain3_ranges: - if c2[0] < c1[0]: - continue -# if ((c2[0] == c1[0]) and (c2[1] < c1[0])): -# continue + if c2[0] < c1[0]: continue + if (c2[0] == c1[0] and c2[1] < c1[1]): continue chain_sets.append([c1, c2, c3]) @@ -900,7 +899,7 @@ class SphingoLipid(Lipid): chain1_ranges = [] for c in range(14, 23): - for d in range(0, 2): + for d in range(0, 3): for h in range(1, 2): if (h > 1 and d > 0): continue @@ -982,6 +981,8 @@ class AcylSphingoLipid(Lipid): chain_sets = [] for c1 in chain1_ranges: for c2 in chain2_ranges: + if c2[0] < c1[0]: continue + if (c2[0] == c1[0] and c2[1] < c1[1]): continue for c3 in chain3_ranges: chain_sets.append([c1, c2, c3]) @@ -1090,7 +1091,7 @@ class CardioLipin(Lipid): if c3[0] < c2[0]: continue if (c3[0] == c2[0] and c3[1] < c2[1]): continue if c4[0] < c3[0]: continue - if (c4[0] == c3[0] and c3[1] < c2[1]): continue + if (c4[0] == c3[0] and c4[1] < c3[1]): continue # if c2[0] < c1[0]: continue # if (c2[0] == c1[0] and c2[1] < c1[0]): continue diff --git a/calicolipidlibrary/utils.py b/calicolipidlibrary/utils.py index bc88909..0f3e938 100644 --- a/calicolipidlibrary/utils.py +++ b/calicolipidlibrary/utils.py @@ -74,7 +74,7 @@ def print_spectrum(lipid_class, chains=[], adduct=""): if lipid_class not in ALL_LIPID_CLASSES: print("Lipid Class '" + lipid_class + "' is not a supported lipid class.") - print("Suported lipid classes are:") + print("Supported lipid classes are:") for lipid in ALL_LIPID_CLASSES: print(lipid) return