Source code for qp.structure.missing

"""Build missing residues and atoms using Modeller

**Usage**::

    >>> from qp.structure import missing
    >>> pdb = "1lnh"

    # Parse PDB file, filling in missing residues
    >>> residues = missing.get_residues("path/to/PDB.pdb")

    # Write alignment file
    >>> missing.write_alignment(
    ...     residues,
    ...     pdb,
    ...     "path/to/PDB.pdb",
    ...     "path/to/ALI.ali"
    ... )
    >P1;1lnh (template from original PDB file)
    structureX:1lnh.pdb:FIRST:@ END::::::
    --------GHKIKGTVVLMRKNVLDVNSVTSV-------------TLDTLTAFLGRSVSLQLISAT...
    >P1;1lnh_fill (full sequence)
    sequence:::::::::
    MLGGLLHRGHKIKGTVVLMRKNVLDVNSVTSVGGIIGQGLDLVGSTLDTLTAFLGRSVSLQLISAT...

    # Run Modeller with the given alignment file
    >>> missing.build_model(
    ...     residues,
    ...     pdb,
    ...     "path/to/PDB.pdb",
    ...     "path/to/ALI.ali",
    ...     "path/to/OUT.pdb"
    ... )

Optimization level (``optimize`` argument in ``missing.build_model``):

* 0. No optimization. Missing coordinates filled in using Modeller's topology library.
* 1. Optimize missing residues and residues with missing atoms only. (Default)
* 2. Optimize the entire structure. Hetero atoms are included but treated as rigid bodies.

.. note::
   Non-standard residues (substrates, cofactors, and other HETATM
   entries) are preserved as rigid bodies by Modeller and any missing
   atoms in these residues will not be modeled.  A warning is emitted
   listing the non-standard residues encountered during parsing.
"""

import os
import warnings
from modeller import log, Environ, Selection
from modeller.automodel import AutoModel
from modeller import alignment, model
from Bio import PDB

log.none()

#: Amino acid 3 to 1 lookup table
[docs]def define_residues(): """ Access a look up table of standard and non-standard amino acids. Returns ------- AA: dict Dictionary of standard and non-standard amino acids. """ AA = { "CYS": "C", "ASP": "D", "SER": "S", "GLN": "Q", "LYS": "K", "ILE": "I", "PRO": "P", "THR": "T", "PHE": "F", "ASN": "N", "GLY": "G", "HIS": "H", "LEU": "L", "ARG": "R", "TRP": "W", "ALA": "A", "VAL": "V", "GLU": "E", "TYR": "Y", "MET": "M", "MSE": "M", } return AA
[docs]def get_residues(path, AA): """Extract residues from a PDB file, including missing residue annotations. Parses REMARK 465 (missing residues) and REMARK 470 (missing atoms) records to identify gaps in the structure that need to be filled by Modeller. Non-standard residues (substrates, cofactors, and other HETATM entries except water and selenomethionine) are treated as rigid bodies by Modeller. Their existing coordinates are preserved, but any missing atoms in these residues will not be modeled. A warning is emitted listing the non-standard residue names encountered. Parameters ---------- path : str Path to the PDB file. AA : dict Amino acid lookup table mapping 3-letter codes to 1-letter codes (from :func:`define_residues`). Returns ------- list of list Residues separated by chain. Each residue is a tuple of ``((sequence_number, insertion_code), one_letter_code, flag)``, where ``flag`` is ``'R'`` for completely absent residues, ``'A'`` for residues with missing atoms, or ``''`` for complete. """ with open(path, "r") as f: p = f.read().splitlines() missing_residues = {} ind = {} seen = {} residues = [] cur = None # Track non-standard residues to warn the user after parsing nonstandard_residues = set() for line in p: if line.startswith("ENDMDL"): break elif line.startswith("REMARK 465 "): # missing residues res = line[15:18] if res in AA and (line[13] == " " or line[13] == "1"): # want only the first model chain = line[19] rid = int(line[21:26]) ic = line[26] missing_residues.setdefault(chain, set()).add(((rid, ic), AA[res], "R")) elif line.startswith("REMARK 470 "): # missing atoms res = line[15:18] if res in AA and (line[13] == " " or line[13] == "1"): chain = line[19] rid = int(line[20:24]) ic = line[24] missing_residues.setdefault(chain, set()).add(((rid, ic), AA[res], "A")) elif line.startswith("ATOM") or line.startswith("HETATM"): res = line[17:20] chain = line[21] rid = int(line[22:26]) ic = line[26] if chain != cur: residues.append([]) if chain in missing_residues and chain not in ind: missing_residues[chain] = sorted(missing_residues[chain]) ind[chain] = 0 if chain not in seen: seen[chain] = set() cur = chain if chain in missing_residues: while ( ind[chain] < len(missing_residues[chain]) and (rid, ic) >= missing_residues[chain][ind[chain]][0] ): residues[-1].append(missing_residues[chain][ind[chain]]) ind[chain] += 1 seen[chain].add(residues[-1][-1][:2]) if line.startswith("HETATM") and res != "MSE": resname = "w" if res == "HOH" else "." # Track non-standard residues (exclude water and MSE) if res != "HOH": nonstandard_residues.add(res) else: resname = AA.get(res, ".") if ((rid, ic), resname) not in seen[chain]: residues[-1].append(((rid, ic), resname, "")) seen[chain].add(residues[-1][-1][:2]) elif line.startswith("TER"): if len(line) > 21: chain = line[21] else: chain = cur if chain in missing_residues: while ind[chain] < len(missing_residues[chain]): residues[-1].append(missing_residues[chain][ind[chain]]) ind[chain] += 1 # Warn user about non-standard residues that Modeller treats as rigid bodies if nonstandard_residues: names = ", ".join(sorted(nonstandard_residues)) warnings.warn( f"Non-standard residues detected: {names}. These residues are " f"preserved as rigid bodies by Modeller and any missing atoms " f"will not be modeled.", stacklevel=2, ) return residues
[docs]def clean_termini(residues): """ Remove unresolve amino acids from the N- and C-termini. Most proteins will have long sequences of missing residues at the ends. Adding these with Modeller is unreliable and are removed with this function. Parameters ---------- residues: list of lists of tuples Residues by chain [[((),)],[((),)]] Returns ------- residues: list of lists of tuples Residues by chain. Stored as a tuple without unresolved termini """ stripped_residues = [] for chain in residues: # Strip 'R' tuples from the beginning of the chain while chain and chain[0][-1] == 'R': chain.pop(0) # Strip 'R' tuples from the end of the chain while chain and chain[-1][-1] == 'R': chain.pop() # Add the processed chain to the stripped protein if chain: # Only add non-empty chains stripped_residues.append(chain) return stripped_residues
[docs]def write_alignment(residues, pdb, path, out): """Write a Modeller alignment file for filling missing residues. Creates a PIR-format alignment file with two sequences: the template (with gaps for missing residues) and the target (complete sequence). Parameters ---------- residues : list of list Residues separated by chain, as returned by :func:`get_residues`. pdb : str PDB code (used as sequence identifier). path : str Path to the template PDB file. out : str Path to the output alignment file (``.ali``). See Also -------- https://salilab.org/modeller/10.4/manual/node501.html Modeller alignment file format documentation. https://salilab.org/modeller/wiki/Missing_residues Modeller missing residues tutorial. """ seq = "/".join( "".join(res[1] if res[2] != "R" else "-" for res in chain) for chain in residues ) seq_fill = "/".join("".join(res[1] for res in chain) for chain in residues) os.makedirs(os.path.dirname(os.path.abspath(out)), exist_ok=True) with open(out, "w") as f: f.write(f">P1;{pdb}\nstructureX:{path}:FIRST:@ END::::::\n{seq}*\n") f.write(f">P1;{pdb}_fill\nsequence:::::::::\n{seq_fill}*\n")
[docs]def transfer_numbering(e, ali, path, out): """Transfer residue numbers and chain IDs from template to built model. Modeller restarts residue numbering at 1 and reassigns chain IDs. This function restores the original numbering from the template PDB file. Insertion codes introduced by the transfer are corrected by :func:`fix_numbering`. Parameters ---------- e : modeller.Environ Modeller environment object. ali : str Path to the alignment file. path : str Path to the original template PDB file. out : str Path to the output PDB file (will be overwritten in place). See Also -------- fix_numbering : Corrects insertion code issues after transfer. """ # Read the alignment for the transfer aln = alignment(e, file=ali) # Read the template and target models: mdl_reference = model(e, file=path) mdl_built = model(e, file=os.path.basename(out)) # Transfer the residue and chain ids and write out the modified MODEL: mdl_built.res_num_from(mdl_reference, aln) file=os.path.basename(out) mdl_built.write(file) with open(file, 'r') as f: content = f.read() corrected_content = fix_numbering(content) with open(file, 'w') as f: f.write(corrected_content)
[docs]def fix_numbering(pdb_content): """ Handles insertion codes from Modeller. Modeller resets the residue indices, which we fix with transfer_numbering(). However, transfer_numbering() names residues with insertion codes. For example, 5,5A,5B,5C instead of 5,6,7,8. This function fixes is residue numbering behavior. Takes the generated PDB, fixes the residue numbers and overwrites the PDB. Parameters ---------- pdb_content : str Contents of a PDB as a str with line breaks. Returns ------- pdb_content :str Contents of a PDB as a str with line breaks. """ lines = pdb_content.split('\n') new_lines = [] prev_chain = None prev_residue_number = None seen_residues = set() for line in lines: if line.startswith("ATOM") or line.startswith("HETATM"): current_chain = line[21] residue_number_with_letter = line[22:27].strip() # Residue number along with potential insertion code has_char = line[26] != ' ' # If we encounter a new chain or a residue we haven't seen, reset/update values if current_chain != prev_chain: prev_chain = current_chain prev_residue_number = None seen_residues = set() if residue_number_with_letter not in seen_residues: if has_char: prev_residue_number = prev_residue_number + 1 if prev_residue_number is not None else int(residue_number_with_letter[:-1]) else: prev_residue_number = int(residue_number_with_letter) seen_residues.add(residue_number_with_letter) # If there's an insertion code, replace the residue number and remove the insertion code if has_char: line = line[:22] + f"{prev_residue_number:4} " + line[27:] new_lines.append(line) else: new_lines.append(line) pdb_content = '\n'.join(new_lines) return pdb_content
[docs]def build_model(residues, pdb, path, ali, out, optimize=1): """Run Modeller to build missing residues and atoms. Constructs a complete model by filling in missing residues and atoms using Modeller's homology modeling capabilities. The level of structure optimization can be controlled. Parameters ---------- residues : list of list Residues separated by chain, as returned by :func:`get_residues`. pdb : str PDB code (must match the alignment file identifiers). path : str Path to the template PDB file. ali : str Path to the Modeller alignment file. out : str Path to the output PDB file. optimize : int, optional Optimization level (default 1): - 0: No optimization; coordinates from topology library only. - 1: Optimize only missing residues and residues with missing atoms. - 2: Optimize the entire structure (heteroatoms as rigid bodies). Notes ----- The output PDB file will have residue numbering transferred from the template. Modeller intermediate files are cleaned up automatically. """ ali = os.path.abspath(ali) cwd = os.getcwd() dir = os.path.dirname(os.path.abspath(out)) os.makedirs(dir, exist_ok=True) os.chdir(dir) # Modeller only supports writing files to the current working directory class CustomModel(AutoModel): def get_model_filename(self, root_name, id1, id2, file_ext): return os.path.basename(out) def special_patches(self, aln): # renumber residues to avoid hybrid-36 notation with large models chain_ids = [] for i in range(26): chain_ids.append(chr(ord("A") + i)) for i in range(10): chain_ids.append(str(i)) for i in range(26): chain_ids.append(chr(ord("a") + i)) n = len(residues) self.rename_segments(chain_ids[:n], [1] * n) missing_residues = [] for i, c in enumerate(residues): chain = chr(ord("A") + i) ind = None for j, res in enumerate(c): if res[2] and ind is None: ind = j + 1 elif not res[2] and ind is not None: missing_residues.append((f"{ind}:{chain}", f"{j}:{chain}")) ind = None if ind is not None: missing_residues.append((f"{ind}:{chain}", f"{len(c)}:{chain}")) if optimize == 1 and missing_residues: CustomModel.select_atoms = lambda self: Selection( *[self.residue_range(x, y) for x, y in missing_residues] ) e = Environ() e.io.hetatm = True e.io.water = True a = CustomModel(e, alnfile=ali, knowns=pdb, sequence=f"{pdb}_fill") a.starting_model = 1 a.ending_model = 1 if not optimize or (optimize == 1 and not missing_residues): a.make(exit_stage=2) os.rename(f"{pdb}_fill.ini", os.path.basename(out)) transfer_numbering(e, ali, pdb, out) else: a.make() for ext in ["ini", "rsr", "sch", "D00000001", "V99990001"]: os.remove(f"{pdb}_fill.{ext}") transfer_numbering(e, ali, path, out) os.chdir(cwd)
[docs]def get_chain_order(pdb_path): """ Get the order of chains in a PDB file. Parameters ---------- pdb_path: str Path to the PDB file. Returns ------- chain_order: dict Dictionary mapping chain IDs to their indices. """ chain_order = {} with open(pdb_path, "r") as f: for line in f: if line.startswith("ATOM") or line.startswith("HETATM"): chain_id = line[21] if chain_id not in chain_order: chain_order[chain_id] = len(chain_order) return chain_order
[docs]def delete_clashes(pdb_path, residues_with_clashes): """ Delete residues with clashes from a PDB file and rewrite the PDB file. Parameters ---------- pdb_path: str Path to the PDB file. residues_with_clashes: set of tuples Set of residues with clashes. Each tuple contains (res_id, one_letter_code, chain_index, chain). Returns ------- output_pdb_path: str Path to the new PDB file with residues removed. """ # Read the PDB file with open(pdb_path, "r") as file: lines = file.readlines() # Create a set for quick lookup of residues to delete residues_to_delete = {(chain, res_id) for res_id, _, _, chain, _ in residues_with_clashes} # Write the new PDB file, excluding the residues to delete with open(pdb_path, "w") as file: for line in lines: if line.startswith("ATOM") or line.startswith("HETATM"): chain_id = line[21] res_id = int(line[22:26]) if (chain_id, res_id) not in residues_to_delete: file.write(line) else: file.write(line) print(f"> Updated PDB {os.path.basename(pdb_path)} by removing residues with clashes")