From 856736f8f729b6aa8581ed9963778b949ada332a Mon Sep 17 00:00:00 2001
From: brysonbennett <bryson@bryson-mbp-2172.calicolabs.local>
Date: Thu, 13 Mar 2025 10:25:05 -0700
Subject: [PATCH 1/2] Added ability to read in a TSV with retention times, and
 add them to the output .msp.  Also found that the LysoCL class had a class
 specific generate fragments function, which I removed as it was redudntant.

---
 calicolipidlibrary/LysoCL.py     | 38 ++++++++++-----------
 calicolipidlibrary/lipidRules.py | 43 +++++++++++-------------
 generateDB.py                    | 57 +++++++++++++++++++++++++++-----
 3 files changed, 86 insertions(+), 52 deletions(-)

diff --git a/calicolipidlibrary/LysoCL.py b/calicolipidlibrary/LysoCL.py
index 406aae8..36889ab 100644
--- a/calicolipidlibrary/LysoCL.py
+++ b/calicolipidlibrary/LysoCL.py
@@ -96,25 +96,25 @@ def theoreticalDigest(self):
 
         return FRAGMENTS
 
-    def generateLibrary(self, target=None, mode="pos"):
-        if target:
-            handle = open(target, "a+")
-        if mode == "pos":
-            adduct_set = self.pos_adduct_set
-        elif mode == "neg":
-            adduct_set = self.neg_adduct_set
-        # parent = self.__bases__[0]
-        class_name = self.__class__.__name__
-        for c in self.chain_sets:
-            for adduct in adduct_set:
-                self.set_chains_and_adduct(class_name, c[:3], adduct=adduct)
-                content = self.printNist()
-                if target:
-                    handle.write(content)
-                else:
-                    sys.stdout.write(content)
-        if target:
-            handle.close()
+    # def generateLibrary(self, target=None, mode="pos", retention_times):
+    #     if target:
+    #         handle = open(target, "a+")
+    #     if mode == "pos":
+    #         adduct_set = self.pos_adduct_set
+    #     elif mode == "neg":
+    #         adduct_set = self.neg_adduct_set
+    #     # parent = self.__bases__[0]
+    #     class_name = self.__class__.__name__
+    #     for c in self.chain_sets:
+    #         for adduct in adduct_set:
+    #             self.set_chains_and_adduct(class_name, c[:3], adduct=adduct)
+    #             content = self.printNist(retention_times)
+    #             if target:
+    #                 handle.write(content)
+    #             else:
+    #                 sys.stdout.write(content)
+    #     if target:
+    #         handle.close()
 
 
 # x  = LysoCL("LysoCL",[[16,0,0], [18,1,0], [18,2,0]],  adduct="[M+H]+")
diff --git a/calicolipidlibrary/lipidRules.py b/calicolipidlibrary/lipidRules.py
index b450900..12bd6fd 100644
--- a/calicolipidlibrary/lipidRules.py
+++ b/calicolipidlibrary/lipidRules.py
@@ -1,19 +1,15 @@
 import re
 import sys
-from types import NoneType
-
 from create_smiles import *
-from rdkit import Chem
-from rdkit.Chem import Descriptors
-from rdkit.Chem import rdMolDescriptors
 
 
-H2O = 18.01056
-NH3 = 17.00274
+
+H2O = 18.010564683
+NH3 = 17.026549100
 PROTON = 1.00727645229
 HYDROGEN = 1.0078250321
 OXYGEN = 15.9949146221
-ELECTRON = 0.0005489
+ELECTRON = 0.00054857990
 
 ADDUCT = {
     "[M]": 0.0,
@@ -559,7 +555,7 @@ def prettyFormula(self):
 
         return formulaStr
 
-    def printNist(self):
+    def printNist(self, retention_times):
         bbType = LIPID_BACKBONES[self.lipidclass][0]
         Nchains = len(self.chains)
         totalC = 0
@@ -622,6 +618,7 @@ def printNist(self):
             ChainStrings = str(self.chains[0][0])
             FullName = self.lipidclass + ChainStrings
             FullNameWithAdduct = FullName + " " + self.adduct
+            SumName = FullName
 
         atoms = self.MF()
         MASS = MW_list(atoms)
@@ -731,21 +728,18 @@ def printNist(self):
         RECORD.append("SMILES: " + get_lipid_smiles(self.lipidclass, LIPID_BACKBONES[self.lipidclass][0],
                                                     LIPID_BACKBONES[self.lipidclass][1], self.chains) + "\n")
         RECORD.append("NumPeaks: " + str(len(FRAGMENTS)) + "\n")
+        #Add retention times if they are available
+        if retention_times is not None:
+            this_rt_entry = retention_times[retention_times["name"] == FullName]
+            if not this_rt_entry.empty:
+                if "rt" in this_rt_entry.columns.tolist():
+                    RECORD.append(f"RT: {this_rt_entry.iloc[0]["rt"]:.2f}\n")
+                if "rt_min" in this_rt_entry.columns.tolist():
+                    RECORD.append(f"RT_min: {this_rt_entry.iloc[0]["rt_min"]:.2f}\n")
+                if "rt_max" in this_rt_entry.columns.tolist():
+                    RECORD.append(f"RT_max: {this_rt_entry.iloc[0]["rt_max"]:.2f}\n")
 
 
-        # smiles = get_lipid_smiles(self.lipidclass, LIPID_BACKBONES[self.lipidclass][0],
-        #                                              LIPID_BACKBONES[self.lipidclass][1], self.chains)
-
-        # Temporary checks to make sure that molecular weight from smiles matches that calculated from headgroup+sn
-        # mol = Chem.MolFromSmiles(smiles)
-        # smiles_mw = Chem.Descriptors.ExactMolWt(mol)
-        # if smiles == "" or smiles_mw == 0.0  or (abs(MASS - smiles_mw) > .0001):
-        #     print(FullName)
-        #     print("SMILES MF: " + str(Chem.rdMolDescriptors.CalcMolFormula(mol)))
-
-        #RECORD.append("SMILES MF: " + str(Chem.rdMolDescriptors.CalcMolFormula(mol)) + "\n")
-        #end of temporary check
-
         for f in FRAGMENTS:
             RECORD.append(str(f[0]) + " " + str(f[1]) + " " + str(f[2]) + "\n")
         RECORD.append("\n\n")
@@ -770,19 +764,20 @@ def theoreticalDigest(self):
         # FRAGMENTS.append( [PREC, 1000, "pre"] )
         return FRAGMENTS
 
-    def generateLibrary(self, target=None, mode="pos"):
+    def generateLibrary(self, target=None, mode="pos", retention_times=None):
         if target:
             handle = open(target, "a+")
         if mode == "pos":
             adduct_set = self.pos_adduct_set
         elif mode == "neg":
             adduct_set = self.neg_adduct_set
+        #self.rt =
         # parent = self.__bases__[0]
         class_name = self.__class__.__name__
         for c in self.chain_sets:
             for adduct in adduct_set:
                 self.set_chains_and_adduct(class_name, c, adduct=adduct)
-                content = self.printNist()
+                content = self.printNist(retention_times)
                 if target and content is not None:
                     handle.write(content)
                 elif content is not None:
diff --git a/generateDB.py b/generateDB.py
index 6d17923..29b9472 100644
--- a/generateDB.py
+++ b/generateDB.py
@@ -1,6 +1,9 @@
 import datetime
 import argparse
 import os
+import pandas as pd
+from nltk import rte_features
+
 from calicolipidlibrary import *
 
 
@@ -38,6 +41,13 @@ def cli_parser():
         default="",
     )
 
+    parser.add_argument(
+        "-r",
+        "--retention-times",
+        help="retention time file name",
+        dest="retention_time_file_name",
+        default="",
+    )
     return parser
 
 
@@ -64,6 +74,13 @@ def cli_parser():
         print("AVAILABLE LIPID CLASSES:")
         for lipid_class in ALL_LIPID_CLASSES:
             print(lipid_class)
+        print("-r <retention times>:")
+        print("\tSupply path and filename for a .tsv containing known retention times\n")
+        #Needs Name RT or Name and RT+rt_min+rt_max or Name and rt_min_rt_max?
+        print("\tFile must contain a column with \"name\" and \"rt\" and/or\n")
+        print("\t\"name\" and both of \"rt_max\" and \"rt_min\"\n")
+        print("\tDEFAULT: No retention times will be associated.\n")
+
         exit(0)
 
     args = cli_parser().parse_args()
@@ -86,6 +103,30 @@ def cli_parser():
 
         exit(1)
 
+
+
+    if args.retention_time_file_name != "":
+        if not os.path.exists(args.retention_time_file_name):
+            print("Supplied retention time files does not exist. Exiting program.")
+            exit(1)
+
+        retention_times_df = pd.read_csv(args.retention_time_file_name, delimiter = '\t')
+        retention_times_df.columns = retention_times_df.columns.str.lower() #make column names lower case
+        rt_col_names = retention_times_df.columns.tolist()
+
+        #check that we have Name and RT or both of RT_min and RT_max (or all three)
+        if ("name" not in rt_col_names) and (("rt" in rt_col_names) or ("rt_max" not in rt_col_names and "rt_min" not in rt_col_names)):
+            print("Supplied retention time file is improperly formatted.\n")
+            print("\tSupply path and filename for a .tsv containing known retention times\n")
+            print("\tFile must contain a column with \"name\" and \"rt\" and/or\n")
+            print("\tboth of \"rt_max\" and \"rt_min\"\n")
+            print("\tExiting program.")
+            exit(1)
+        else:
+            retention_times_df = retention_times_df.loc[:, retention_times_df.columns.isin(["name", "rt", "rt_min", "rt_max"])]
+    else:
+        retention_times_df = None
+
     lipid_classes = []
     if len(args.lipid_classes) == 0:
         lipid_classes = ALL_LIPID_CLASSES.copy()
@@ -128,18 +169,16 @@ def cli_parser():
 
     open(output_msp_file, "w")
 
-    for lipid_class in lipid_classes:
+    lipid_classes_list = list(lipid_classes)
+    lipid_classes_list.sort() #put lipid classes in alphabetical order
+    for lipid_class in lipid_classes_list:
         msg = "Enumerating spectra for " + lipid_class + "  ... "
         instantiate_str = lipid_class + " = " + lipid_class + "()"
         sys.stdout.write(msg)
-        eval_str = (
-            lipid_class
-            + ".generateLibrary(target='"
-            + output_msp_file
-            + "', mode='"
-            + ion
-            + "')"
-        )
+        eval_str = (f"{lipid_class}.generateLibrary(target='{output_msp_file}', mode='{ion}', retention_times=retention_times_df)"
+                    )
+
+
         exec(instantiate_str)
         eval(eval_str)
         sys.stdout.write(" DONE\n")

From 7c3604f47ec315472dddfc0234c66eb0095a170a Mon Sep 17 00:00:00 2001
From: brysonbennett <bryson@bryson-mbp-2172.calicolabs.local>
Date: Wed, 29 Apr 2026 08:36:17 -0700
Subject: [PATCH 2/2] CLL issue #17, mass_spec issue #1920. Add two double bond
 LCB sepcies to Ceramides, fix LCB mass (from 262 to 264 for SMs

---
 calicolipidlibrary/Ceramide.py   |  8 +++++---
 calicolipidlibrary/SM.py         |  4 ++--
 calicolipidlibrary/lipidRules.py | 15 ++++++++-------
 calicolipidlibrary/utils.py      |  2 +-
 4 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/calicolipidlibrary/Ceramide.py b/calicolipidlibrary/Ceramide.py
index 26837ed..3d4a71d 100644
--- a/calicolipidlibrary/Ceramide.py
+++ b/calicolipidlibrary/Ceramide.py
@@ -4,11 +4,13 @@
 # need to fix nomenclature
 class Ceramide(SphingoLipid):
 
+# h is one less than nomenclature as the first OH is defined as part of the head group
+# so h = 1 is a d-LCB species
     chain1_ranges = []
     for c in range(14, 23):
-        for d in range(0, 2):
-            for h in range(1, 3):
-                if (h > 1 and d > 0):
+        for d in range(0, 3):
+            for h in range(0, 3):
+                if (h > 1 and d > 1):
                     continue
                 chain1_ranges.append([c, d, h])
 
diff --git a/calicolipidlibrary/SM.py b/calicolipidlibrary/SM.py
index 4c027c8..ba643b2 100644
--- a/calicolipidlibrary/SM.py
+++ b/calicolipidlibrary/SM.py
@@ -27,7 +27,7 @@ def theoreticalDigest(self):
                         MW_list(
                             [
                                 self.chains[0][0],
-                                (2 * self.chains[0][0] - 2 * self.chains[0][1] - 1),
+                                (2 * self.chains[0][0] - 2 * self.chains[0][1] + 1),
                                 1,
                                 self.chains[0][2],
                                 0,
@@ -45,7 +45,7 @@ def theoreticalDigest(self):
                             MW_list(
                                 [
                                     self.chains[0][0],
-                                    (2 * self.chains[0][0] - 2 * self.chains[0][1] - 1),
+                                    (2 * self.chains[0][0] - 2 * self.chains[0][1] + 1),
                                     1,
                                     self.chains[0][2],
                                     0,
diff --git a/calicolipidlibrary/lipidRules.py b/calicolipidlibrary/lipidRules.py
index 12bd6fd..3392f64 100644
--- a/calicolipidlibrary/lipidRules.py
+++ b/calicolipidlibrary/lipidRules.py
@@ -727,7 +727,6 @@ def printNist(self, retention_times):
         RECORD.append("FORMULA: " + self.prettyFormula() + "\n")
         RECORD.append("SMILES: " + get_lipid_smiles(self.lipidclass, LIPID_BACKBONES[self.lipidclass][0],
                                                     LIPID_BACKBONES[self.lipidclass][1], self.chains) + "\n")
-        RECORD.append("NumPeaks: " + str(len(FRAGMENTS)) + "\n")
         #Add retention times if they are available
         if retention_times is not None:
             this_rt_entry = retention_times[retention_times["name"] == FullName]
@@ -739,6 +738,8 @@ def printNist(self, retention_times):
                 if "rt_max" in this_rt_entry.columns.tolist():
                     RECORD.append(f"RT_max: {this_rt_entry.iloc[0]["rt_max"]:.2f}\n")
 
+        RECORD.append("NumPeaks: " + str(len(FRAGMENTS)) + "\n")
+
 
         for f in FRAGMENTS:
             RECORD.append(str(f[0]) + " " + str(f[1]) + " " + str(f[2]) + "\n")
@@ -883,10 +884,8 @@ class NAcylGPL(Lipid):
     for c1 in chain1_ranges:
         for c2 in chain2_ranges:
             for c3 in chain3_ranges:
-                if c2[0] < c1[0]:
-                    continue
-#                if ((c2[0] == c1[0]) and (c2[1] < c1[0])):
-#                    continue
+                if c2[0] < c1[0]: continue
+                if (c2[0] == c1[0] and c2[1] < c1[1]): continue
                 chain_sets.append([c1, c2, c3])
 
 
@@ -900,7 +899,7 @@ class SphingoLipid(Lipid):
 
     chain1_ranges = []
     for c in range(14, 23):
-        for d in range(0, 2):
+        for d in range(0, 3):
             for h in range(1, 2):
                 if (h > 1 and d > 0):
                     continue
@@ -982,6 +981,8 @@ class AcylSphingoLipid(Lipid):
     chain_sets = []
     for c1 in chain1_ranges:
         for c2 in chain2_ranges:
+            if c2[0] < c1[0]: continue
+            if (c2[0] == c1[0] and c2[1] < c1[1]): continue
             for c3 in chain3_ranges:
                 chain_sets.append([c1, c2, c3])
 
@@ -1090,7 +1091,7 @@ class CardioLipin(Lipid):
                     if c3[0] < c2[0]: continue
                     if (c3[0] == c2[0] and c3[1] < c2[1]): continue
                     if c4[0] < c3[0]: continue
-                    if (c4[0] == c3[0] and c3[1] < c2[1]): continue
+                    if (c4[0] == c3[0] and c4[1] < c3[1]): continue
 
                     # if c2[0] < c1[0]: continue
                     # if (c2[0] == c1[0] and c2[1] < c1[0]): continue
diff --git a/calicolipidlibrary/utils.py b/calicolipidlibrary/utils.py
index bc88909..0f3e938 100644
--- a/calicolipidlibrary/utils.py
+++ b/calicolipidlibrary/utils.py
@@ -74,7 +74,7 @@ def print_spectrum(lipid_class, chains=[], adduct=""):
 
     if lipid_class not in ALL_LIPID_CLASSES:
         print("Lipid Class '" + lipid_class + "' is not a supported lipid class.")
-        print("Suported lipid classes are:")
+        print("Supported lipid classes are:")
         for lipid in ALL_LIPID_CLASSES:
             print(lipid)
         return