diff --git a/README.md b/README.md index 6b15b7e8..8d25b47f 100644 --- a/README.md +++ b/README.md @@ -200,8 +200,7 @@ Copyright (c) 2022-2024, Carlos Bueno, Hana Jaafari ## Acknowledgements -This work was supported by National Science Foundation (NSF) Center for Theoretical Biological Physics (NSF PHY-2019745) and NSF grants PHY-1522550. -Project skeleton based on the [Computational Molecular Science Python Cookiecutter] (https://github.com/molssi/cookiecutter-cms) version 1.6. +Carlos Bueno was supported by the MolSSI Software Fellowship, under the mentorship of Jessica Nash. We thank AMD (Advanced Micro Devices, Inc.) for the donation of high-performance computing hardware and HPC resources. This project is also supported by the Center for Theoretical Biological Physics (NSF Grants PHY-2019745 and PHY-1522550), with additional support from the D.R. Bullard Welch Chair at Rice University (Grant No. C-0016 to PGW). Project skeleton based on the [Computational Molecular Science Python Cookiecutter] (https://github.com/molssi/cookiecutter-cms) version 1.6. ## References diff --git a/frustratometer/align/align.py b/frustratometer/align/align.py index f79594c8..7f4733b5 100644 --- a/frustratometer/align/align.py +++ b/frustratometer/align/align.py @@ -1,30 +1,31 @@ import subprocess from pathlib import Path import tempfile -from typing import Union +import typing -def jackhmmer(sequence, - output_file, - database, - log=subprocess.DEVNULL, +def jackhmmer(sequence: str, + output_file: typing.Union[Path,str], + database: typing.Union[Path,str], + log = subprocess.DEVNULL, dry_run: bool = False, - **kwargs)->Path: + **kwargs) -> Path: """ - Generates alignment using jackhmmer + Generates alignment in stockholm format using jackhmmer. Parameters ---------- sequence : str Protein sequence output_file : str - If True, will record alignment output messages; - Arguments: True OR False (Default is False) + Path to save the alignment file database: str - Location of the sequence database used to generation MSA + Location of the sequence database used to generation MSA. + The database should be in fasta format. + Databases can be downloaded from: https://www.uniprot.org/help/downloads log: File handle - jackhmmer output Default:DEVNULL + Path to write the output of the program. Default:DEVNULL dry_run: bool (default: False) - Save the temporary fasta_file and print the command instead of running + Creates a fasta file with the sequence and prints the command needed to run jackhmmer **kwargs: Other arguments that can be passed to jackhmmer. More information can be found by executing `jackhmmer -h` @@ -32,8 +33,6 @@ def jackhmmer(sequence, Common kwargs: N: number of iterations E: E-value threshold - - Returns ------- output_file: Path @@ -98,7 +97,7 @@ def extract_sequences_from_alignment(alignment_file, import Bio.SeqIO alignment = Bio.AlignIO.read(alignment_file,'stockholm') - database = Bio.SeqIO.parse('/home/cb/Development/DCA_Frustratometer/dca_frustratometer/databases/Uniprot/uniprot_sprot.fasta','fasta') + database = Bio.SeqIO.parse(database,'fasta') alignment_records=[] for record in alignment: alignment_records+=[record.name] diff --git a/frustratometer/classes/AWSEM.py b/frustratometer/classes/AWSEM.py index 770f4ad0..a25290c7 100644 --- a/frustratometer/classes/AWSEM.py +++ b/frustratometer/classes/AWSEM.py @@ -341,7 +341,7 @@ def compute_configurational_energies(self): # decoy_data_columns=['decoy_i','i_resno','j_resno','ires_type','jres_type','aa1','aa2','rij','rho_i','rho_j','water_energy','burial_energy_i','burial_energy_j','electrostatic_energy','total_energies'] # decoy_data=[] - configurational_energies=np.zeros((n,n)) + configurational_energies=np.full((n,n), np.nan) # masked pairs will be left as nan for c in range(n_contacts): n1=indices1[c] n2=indices2[c] @@ -366,4 +366,4 @@ def compute_configurational_energies(self): def configurational_frustration(self,aa_freq=None, correction=0, n_decoys=4000): mean_decoy_energy, std_decoy_energy = self.compute_configurational_decoy_statistics(n_decoys=n_decoys,aa_freq=aa_freq) - return -(self.compute_configurational_energies()-mean_decoy_energy)/(std_decoy_energy+correction) \ No newline at end of file + return -(self.compute_configurational_energies()-mean_decoy_energy)/(std_decoy_energy+correction) diff --git a/frustratometer/classes/Gamma.py b/frustratometer/classes/Gamma.py index 821bffea..6f9ab658 100644 --- a/frustratometer/classes/Gamma.py +++ b/frustratometer/classes/Gamma.py @@ -408,7 +408,8 @@ def plot_gamma(self, new_order=None): # Plot setup f, axes = plt.subplots(2, 2, figsize=(18, 16)) - titles = ['Burial Gammas', 'Direct Gammas', 'Water Gammas', 'Protein Gammas'] + f.subplots_adjust(hspace=50) # fix overlap between axis ticks of upper subplots and titles of lower subplots + titles = ['Burial Gammas', 'Direct Gammas', 'Protein Gammas', 'Water Gammas'] for i, (title, name) in enumerate(zip(titles, segments)): ax = axes[i // 2, i % 2] @@ -416,8 +417,12 @@ def plot_gamma(self, new_order=None): ax.set_title(title) ax.set_xticks(np.arange(len(self.alphabet)) + 0.5) ax.set_xticklabels(self.alphabet) - ax.set_yticks(np.arange(segments[name].shape[0] // 20) + 0.5) - ax.set_yticklabels(range(segments[name].shape[0] // 20)) + if i==0: # burial + ax.set_yticks([0.5,1.5,2.5]) + ax.set_yticklabels(['low','medium','high'], rotation=45, size=12) + else: # direct, prot, or wat + ax.set_yticks(np.arange(len(self.alphabet)) + 0.5) + ax.set_yticklabels(self.alphabet, rotation=0) plt.tight_layout() plt.show() @@ -648,4 +653,4 @@ class O(): self.gamma1 = Gamma(np.arange(0,1260,1)) self.gamma2 = Gamma(np.arange(0,1260,1)*5+10) - self.gamma3 = Gamma(np.arange(1260,0,-1)*2-4) \ No newline at end of file + self.gamma3 = Gamma(np.arange(1260,0,-1)*2-4) diff --git a/frustratometer/frustration/frustration.py b/frustratometer/frustration/frustration.py index 81ec8801..eaddd834 100644 --- a/frustratometer/frustration/frustration.py +++ b/frustratometer/frustration/frustration.py @@ -1,3 +1,21 @@ +""" +Utilities for sequence energies, decoys, and frustration metrics from Potts models. + +Conventions +----------- +- Alphabet: `_AA = '-ACDEFGHIKLMNPQRSTVWY'` (gap '-' at index 0) +- Sequence length: L +- Fields and couplings: + * potts_model['h'] has shape (L, Q) + * potts_model['J'] has shape (L, L, Q, Q) +- Energies: A positive value in the potts model or a negative value in the energy is a favorable contribution. +- Masks: boolean array of shape (L, L). True values indicate included residue pairs. + +Notes +----- +All functions assume `len(seq) == potts_model['h'].shape[0] == potts_model['J'].shape[0]`. +""" + import prody import scipy.spatial.distance as sdist import numpy as np @@ -10,25 +28,24 @@ def compute_mask(distance_matrix: np.array, maximum_contact_distance: Union[float, None] = None, minimum_sequence_separation: Union[int, None] = None) -> np.array: """ - Computes a 2D Boolean mask from a given distance matrix based on a distance cutoff and a sequence separation cutoff. + Computes a 2D Boolean mask (L, L) for pairwise interactions from a given distance matrix using a distance cutoff and/or a minimum sequence-separation cutoff. + Both cutoffs are inclusive. True indicates that the residue pair meets the criteria. Parameters ---------- - distance_matrix : np.array + distance_matrix : np.array (L,L) A 2D array where the element at index [i, j] represents the spatial distance - between residues i and j. This matrix is assumed to be symmetric. + between residues i and j (e.g., Ca-Ca, Cb-Cb). This matrix is assumed to be symmetric. maximum_contact_distance : float, optional - The maximum distance of a contact. Pairs of residues with distances less than this - threshold are marked as True in the mask. If None, the spatial distance criterion - is ignored and all distances are included. Default is None. + The maximum distance of a contact. Include i,j if distance_matrix[i,j] <= maximum_contact_distance. + If None, no distance filtering is applied. Default is None. minimum_sequence_separation : int, optional - A minimum sequence distance threshold. Pairs of residues with sequence indices - differing by at least this value are marked as True in the mask. If None, - the sequence distance criterion is ignored. Default is None. + A minimum sequence separation threshold. Include i,j if |i-j| >= minimum_sequence_separation. + If None, no sequence separation is applied . Default is None. Returns ------- - mask : np.array + mask : np.array (L,L) A 2D Boolean array of the same dimensions as `distance_matrix`. Elements of the mask are True where the residue pairs meet the specified `distance_cutoff` and `sequence_distance_cutoff` criteria.