Source code for qp.structure.missing

"""Build missing residues and atoms using Modeller

**Usage**::

    >>> from qp.structure import missing
    >>> pdb = "1lnh"

    # Parse PDB file, filling in missing residues
    >>> residues = missing.get_residues("path/to/PDB.pdb")

    # Write alignment file
    >>> missing.write_alignment(
    ...     residues,
    ...     pdb,
    ...     "path/to/PDB.pdb",
    ...     "path/to/ALI.ali"
    ... )
    >P1;1lnh (template from original PDB file)
    structureX:1lnh.pdb:FIRST:@ END::::::
    --------GHKIKGTVVLMRKNVLDVNSVTSV-------------TLDTLTAFLGRSVSLQLISAT...
    >P1;1lnh_fill (full sequence)
    sequence:::::::::
    MLGGLLHRGHKIKGTVVLMRKNVLDVNSVTSVGGIIGQGLDLVGSTLDTLTAFLGRSVSLQLISAT...

    # Run Modeller with the given alignment file
    >>> missing.build_model(
    ...     residues,
    ...     pdb,
    ...     "path/to/PDB.pdb",
    ...     "path/to/ALI.ali",
    ...     "path/to/OUT.pdb"
    ... )

Optimization level (``optimize`` argument in ``missing.build_model``):

* 0. No optimization. Missing coordinates filled in using Modeller's topology library.
* 1. Optimize missing residues and residues with missing atoms only. (Default)
* 2. Optimize the entire structure. Hetero atoms are included but treated as rigid bodies.

.. note::
   Non-standard residues (substrates, cofactors, and other HETATM
   entries) are preserved as rigid bodies by Modeller and any missing
   atoms in these residues will not be modeled.  A warning is emitted
   listing the non-standard residues encountered during parsing.
"""

import os
import warnings
from modeller import log, Environ, Selection
from modeller.automodel import AutoModel
from modeller import alignment, model
from Bio import PDB

log.none()

#: Amino acid 3 to 1 lookup table
[docs]def define_residues():
    """
    Access a look up table of standard and non-standard amino acids.

    Returns
    -------
    AA: dict
        Dictionary of standard and non-standard amino acids.
    """
    AA = {
        "CYS": "C",
        "ASP": "D",
        "SER": "S",
        "GLN": "Q",
        "LYS": "K",
        "ILE": "I",
        "PRO": "P",
        "THR": "T",
        "PHE": "F",
        "ASN": "N",
        "GLY": "G",
        "HIS": "H",
        "LEU": "L",
        "ARG": "R",
        "TRP": "W",
        "ALA": "A",
        "VAL": "V",
        "GLU": "E",
        "TYR": "Y",
        "MET": "M",
        "MSE": "M",
    }

    return AA


[docs]def get_residues(path, AA):
    """Extract residues from a PDB file, including missing residue annotations.

    Parses REMARK 465 (missing residues) and REMARK 470 (missing atoms)
    records to identify gaps in the structure that need to be filled by
    Modeller.

    Non-standard residues (substrates, cofactors, and other HETATM
    entries except water and selenomethionine) are treated as rigid
    bodies by Modeller.  Their existing coordinates are preserved, but
    any missing atoms in these residues will not be modeled.  A warning
    is emitted listing the non-standard residue names encountered.

    Parameters
    ----------
    path : str
        Path to the PDB file.
    AA : dict
        Amino acid lookup table mapping 3-letter codes to 1-letter codes
        (from :func:`define_residues`).

    Returns
    -------
    list of list
        Residues separated by chain. Each residue is a tuple of
        ``((sequence_number, insertion_code), one_letter_code, flag)``,
        where ``flag`` is ``'R'`` for completely absent residues,
        ``'A'`` for residues with missing atoms, or ``''`` for complete.
    """
    with open(path, "r") as f:
        p = f.read().splitlines()

    missing_residues = {}
    ind = {}
    seen = {}
    residues = []
    cur = None
    # Track non-standard residues to warn the user after parsing
    nonstandard_residues = set()

    for line in p:
        if line.startswith("ENDMDL"):
            break

        elif line.startswith("REMARK 465   "):  # missing residues
            res = line[15:18]
            if res in AA and (line[13] == " " or line[13] == "1"): # want only the first model
                chain = line[19]
                rid = int(line[21:26])
                ic = line[26]
                missing_residues.setdefault(chain, set()).add(((rid, ic), AA[res], "R"))

        elif line.startswith("REMARK 470   "):  # missing atoms
            res = line[15:18]
            if res in AA and (line[13] == " " or line[13] == "1"):
                chain = line[19]
                rid = int(line[20:24])
                ic = line[24]
                missing_residues.setdefault(chain, set()).add(((rid, ic), AA[res], "A"))

        elif line.startswith("ATOM") or line.startswith("HETATM"):
            res = line[17:20]
            chain = line[21]
            rid = int(line[22:26])
            ic = line[26]

            if chain != cur:
                residues.append([])
                if chain in missing_residues and chain not in ind:
                    missing_residues[chain] = sorted(missing_residues[chain])
                    ind[chain] = 0
                if chain not in seen:
                    seen[chain] = set()
                cur = chain

            if chain in missing_residues:
                while (
                    ind[chain] < len(missing_residues[chain])
                    and (rid, ic) >= missing_residues[chain][ind[chain]][0]
                ):
                    residues[-1].append(missing_residues[chain][ind[chain]])
                    ind[chain] += 1
                    seen[chain].add(residues[-1][-1][:2])

            if line.startswith("HETATM") and res != "MSE":
                resname = "w" if res == "HOH" else "."
                # Track non-standard residues (exclude water and MSE)
                if res != "HOH":
                    nonstandard_residues.add(res)
            else:
                resname = AA.get(res, ".")
            if ((rid, ic), resname) not in seen[chain]:
                residues[-1].append(((rid, ic), resname, ""))
                seen[chain].add(residues[-1][-1][:2])

        elif line.startswith("TER"):
            if len(line) > 21:
                chain = line[21]
            else:
                chain = cur
            if chain in missing_residues:
                while ind[chain] < len(missing_residues[chain]):
                    residues[-1].append(missing_residues[chain][ind[chain]])
                    ind[chain] += 1

    # Warn user about non-standard residues that Modeller treats as rigid bodies
    if nonstandard_residues:
        names = ", ".join(sorted(nonstandard_residues))
        warnings.warn(
            f"Non-standard residues detected: {names}. These residues are "
            f"preserved as rigid bodies by Modeller and any missing atoms "
            f"will not be modeled.",
            stacklevel=2,
        )

    return residues

[docs]def clean_termini(residues):
    """
    Remove unresolve amino acids from the N- and C-termini.

    Most proteins will have long sequences of missing residues at the ends.
    Adding these with Modeller is unreliable and are removed with this function.

    Parameters
    ----------
    residues: list of lists of tuples
        Residues by chain [[((),)],[((),)]]
    
    Returns
    -------
    residues: list of lists of tuples
        Residues by chain. Stored as a tuple without unresolved termini

    """
    stripped_residues = []
    for chain in residues:
        # Strip 'R' tuples from the beginning of the chain
        while chain and chain[0][-1] == 'R':
            chain.pop(0)

        # Strip 'R' tuples from the end of the chain
        while chain and chain[-1][-1] == 'R':
            chain.pop()

        # Add the processed chain to the stripped protein
        if chain:  # Only add non-empty chains
            stripped_residues.append(chain)

    return stripped_residues
    

[docs]def write_alignment(residues, pdb, path, out):
    """Write a Modeller alignment file for filling missing residues.

    Creates a PIR-format alignment file with two sequences: the template
    (with gaps for missing residues) and the target (complete sequence).

    Parameters
    ----------
    residues : list of list
        Residues separated by chain, as returned by :func:`get_residues`.
    pdb : str
        PDB code (used as sequence identifier).
    path : str
        Path to the template PDB file.
    out : str
        Path to the output alignment file (``.ali``).

    See Also
    --------
    https://salilab.org/modeller/10.4/manual/node501.html
        Modeller alignment file format documentation.
    https://salilab.org/modeller/wiki/Missing_residues
        Modeller missing residues tutorial.
    """
    seq = "/".join(
        "".join(res[1] if res[2] != "R" else "-" for res in chain) for chain in residues
    )
    seq_fill = "/".join("".join(res[1] for res in chain) for chain in residues)

    os.makedirs(os.path.dirname(os.path.abspath(out)), exist_ok=True)
    with open(out, "w") as f:
        f.write(f">P1;{pdb}\nstructureX:{path}:FIRST:@ END::::::\n{seq}*\n")
        f.write(f">P1;{pdb}_fill\nsequence:::::::::\n{seq_fill}*\n")


[docs]def transfer_numbering(e, ali, path, out):
    """Transfer residue numbers and chain IDs from template to built model.

    Modeller restarts residue numbering at 1 and reassigns chain IDs. This
    function restores the original numbering from the template PDB file.
    Insertion codes introduced by the transfer are corrected by
    :func:`fix_numbering`.

    Parameters
    ----------
    e : modeller.Environ
        Modeller environment object.
    ali : str
        Path to the alignment file.
    path : str
        Path to the original template PDB file.
    out : str
        Path to the output PDB file (will be overwritten in place).

    See Also
    --------
    fix_numbering : Corrects insertion code issues after transfer.
    """
    # Read the alignment for the transfer
    aln = alignment(e, file=ali)

    # Read the template and target models:
    mdl_reference = model(e, file=path)
    mdl_built = model(e, file=os.path.basename(out))

    # Transfer the residue and chain ids and write out the modified MODEL:
    mdl_built.res_num_from(mdl_reference, aln)
    file=os.path.basename(out)
    mdl_built.write(file)

    with open(file, 'r') as f:
        content = f.read()

    corrected_content = fix_numbering(content)
    with open(file, 'w') as f:
        f.write(corrected_content)


[docs]def fix_numbering(pdb_content):
    """
    Handles insertion codes from Modeller.

    Modeller resets the residue indices, which we fix with transfer_numbering().
    However, transfer_numbering() names residues with insertion codes.
    For example, 5,5A,5B,5C instead of 5,6,7,8.
    This function fixes is residue numbering behavior.
    Takes the generated PDB, fixes the residue numbers and overwrites the PDB.

    Parameters
    ----------
    pdb_content : str
        Contents of a PDB as a str with line breaks.

    Returns
    -------
    pdb_content :str
        Contents of a PDB as a str with line breaks.
    """

    lines = pdb_content.split('\n')
    new_lines = []

    prev_chain = None
    prev_residue_number = None
    seen_residues = set()

    for line in lines:
        if line.startswith("ATOM") or line.startswith("HETATM"):
            current_chain = line[21]
            residue_number_with_letter = line[22:27].strip()  # Residue number along with potential insertion code
            has_char = line[26] != ' '

            # If we encounter a new chain or a residue we haven't seen, reset/update values
            if current_chain != prev_chain:
                prev_chain = current_chain
                prev_residue_number = None
                seen_residues = set()
            if residue_number_with_letter not in seen_residues:
                if has_char:
                    prev_residue_number = prev_residue_number + 1 if prev_residue_number is not None else int(residue_number_with_letter[:-1])
                else:
                    prev_residue_number = int(residue_number_with_letter)
                seen_residues.add(residue_number_with_letter)

            # If there's an insertion code, replace the residue number and remove the insertion code
            if has_char:
                line = line[:22] + f"{prev_residue_number:4} " + line[27:]

            new_lines.append(line)
        else:
            new_lines.append(line)
    
    pdb_content = '\n'.join(new_lines)
    return pdb_content


[docs]def build_model(residues, pdb, path, ali, out, optimize=1):
    """Run Modeller to build missing residues and atoms.

    Constructs a complete model by filling in missing residues and atoms
    using Modeller's homology modeling capabilities. The level of
    structure optimization can be controlled.

    Parameters
    ----------
    residues : list of list
        Residues separated by chain, as returned by :func:`get_residues`.
    pdb : str
        PDB code (must match the alignment file identifiers).
    path : str
        Path to the template PDB file.
    ali : str
        Path to the Modeller alignment file.
    out : str
        Path to the output PDB file.
    optimize : int, optional
        Optimization level (default 1):

        - 0: No optimization; coordinates from topology library only.
        - 1: Optimize only missing residues and residues with missing atoms.
        - 2: Optimize the entire structure (heteroatoms as rigid bodies).

    Notes
    -----
    The output PDB file will have residue numbering transferred from the
    template. Modeller intermediate files are cleaned up automatically.
    """
    ali = os.path.abspath(ali)
    cwd = os.getcwd()
    dir = os.path.dirname(os.path.abspath(out))
    os.makedirs(dir, exist_ok=True)
    os.chdir(dir) # Modeller only supports writing files to the current working directory

    class CustomModel(AutoModel):
        def get_model_filename(self, root_name, id1, id2, file_ext):
            return os.path.basename(out)

        def special_patches(self, aln): # renumber residues to avoid hybrid-36 notation with large models
            chain_ids = []
            for i in range(26):
                chain_ids.append(chr(ord("A") + i))
            for i in range(10):
                chain_ids.append(str(i))
            for i in range(26):
                chain_ids.append(chr(ord("a") + i))
            n = len(residues)
            self.rename_segments(chain_ids[:n], [1] * n)

    missing_residues = []
    for i, c in enumerate(residues):
        chain = chr(ord("A") + i)
        ind = None
        for j, res in enumerate(c):
            if res[2] and ind is None:
                ind = j + 1
            elif not res[2] and ind is not None:
                missing_residues.append((f"{ind}:{chain}", f"{j}:{chain}"))
                ind = None
        if ind is not None:
            missing_residues.append((f"{ind}:{chain}", f"{len(c)}:{chain}"))

    if optimize == 1 and missing_residues:
        CustomModel.select_atoms = lambda self: Selection(
            *[self.residue_range(x, y) for x, y in missing_residues]
        )

    e = Environ()
    e.io.hetatm = True
    e.io.water = True
    a = CustomModel(e, alnfile=ali, knowns=pdb, sequence=f"{pdb}_fill")
    a.starting_model = 1
    a.ending_model = 1

    if not optimize or (optimize == 1 and not missing_residues):
        a.make(exit_stage=2)
        os.rename(f"{pdb}_fill.ini", os.path.basename(out))
        transfer_numbering(e, ali, pdb, out)
    else:
        a.make()
        for ext in ["ini", "rsr", "sch", "D00000001", "V99990001"]:
            os.remove(f"{pdb}_fill.{ext}")
        transfer_numbering(e, ali, path, out)

    os.chdir(cwd)

[docs]def get_chain_order(pdb_path):
    """
    Get the order of chains in a PDB file.

    Parameters
    ----------
    pdb_path: str
        Path to the PDB file.

    Returns
    -------
    chain_order: dict
        Dictionary mapping chain IDs to their indices.
    """
    chain_order = {}
    with open(pdb_path, "r") as f:
        for line in f:
            if line.startswith("ATOM") or line.startswith("HETATM"):
                chain_id = line[21]
                if chain_id not in chain_order:
                    chain_order[chain_id] = len(chain_order)
    return chain_order


[docs]def delete_clashes(pdb_path, residues_with_clashes):
    """
    Delete residues with clashes from a PDB file and rewrite the PDB file.

    Parameters
    ----------
    pdb_path: str
        Path to the PDB file.
    residues_with_clashes: set of tuples
        Set of residues with clashes.
        Each tuple contains (res_id, one_letter_code, chain_index, chain).

    Returns
    -------
    output_pdb_path: str
        Path to the new PDB file with residues removed.
    """
    
    # Read the PDB file
    with open(pdb_path, "r") as file:
        lines = file.readlines()

    # Create a set for quick lookup of residues to delete
    residues_to_delete = {(chain, res_id) for res_id, _, _, chain, _ in residues_with_clashes}

    # Write the new PDB file, excluding the residues to delete
    with open(pdb_path, "w") as file:
        for line in lines:
            if line.startswith("ATOM") or line.startswith("HETATM"):
                chain_id = line[21]
                res_id = int(line[22:26])
                if (chain_id, res_id) not in residues_to_delete:
                    file.write(line)
            else:
                file.write(line)

    print(f"> Updated PDB {os.path.basename(pdb_path)} by removing residues with clashes")