Source code for qp.manager.charge_embedding

"""Generate charge embedding ptchrges.xyz file for TeraChem input.

By default, partial charges are taken from the built-in AMBER ff14SB
dictionary.  Users can supply a custom JSON file of charges via the
``charge_embedding_charges`` configuration option to support other
force fields.
"""

import os
import json
import shutil
import warnings
import numpy as np
from qp.manager import ff14SB_dict
from scipy.spatial import KDTree

[docs]def calculate_centroid(coords):
    """Calculate the geometric centroid of a set of coordinates.

    Parameters
    ----------
    coords : array-like of shape (N, 3)
        Cartesian coordinates of N atoms.

    Returns
    -------
    numpy.ndarray of shape (3,)
        The centroid (mean position) of the input coordinates.
    """
    return np.mean(coords, axis=0)

[docs]def rename_and_clean_resnames(input_pdb, output_pdb):
    """Rename histidine residues for ff14SB compatibility and convert HETATM to ATOM.

    Protoss outputs all histidines as HIS, but ff14SB requires specific
    protonation states: HIP (doubly protonated), HID (delta-protonated),
    or HIE (epsilon-protonated). This function determines the correct
    name based on hydrogen atoms present.

    Also handles N-terminal (NXXX) and C-terminal (CXXX) residue naming,
    and converts all HETATM records to ATOM for force field assignment.

    Parameters
    ----------
    input_pdb : str
        Path to the input PDB file (typically Protoss output).
    output_pdb : str
        Path to write the renamed PDB file.

    Notes
    -----
    The four-letter residue names (NALA, CALA, etc.) may cause column
    alignment issues in strict PDB format parsers.
    """
    with open(input_pdb, 'r') as infile, open(output_pdb, 'w') as outfile:
        lines = infile.readlines()

        # Identify residues to rename based on the specified conditions
        residue_info = {}
        for line in lines:
            if line.startswith('ATOM') or line.startswith('HETATM'):
                atom_name = line[12:16].strip()
                residue_name = line[17:20].strip()
                chain_id = line[21].strip()
                residue_id = line[22:26].strip()
                key = (residue_name, chain_id, residue_id)

                if key not in residue_info:
                    residue_info[key] = {
                        'residue_name': residue_name,
                        'atoms': set()
                    }
                residue_info[key]['atoms'].add(atom_name)

        # Determine the new residue names based on the conditions
        for key, info in residue_info.items():
            atoms = info['atoms']
            if info['residue_name'] == 'HIS':
                if 'HD1' in atoms and 'HE2' in atoms:
                    info['new_name'] = 'HIP'
                elif 'HD1' in atoms and 'HE2' not in atoms:
                    info['new_name'] = 'HID'
                elif 'HE2' in atoms and 'HD1' not in atoms:
                    info['new_name'] = 'HIE'
                else:
                    info['new_name'] = info['residue_name']
            else:
                info['new_name'] = info['residue_name']

            if 'OXT' in atoms:
                info['new_name'] = 'C' + info['new_name']

            if {'N', 'H1', 'H2', 'H3'} <= atoms:
                info['new_name'] = 'N' + info['new_name']

        # Write the modified PDB file with the new residue names
        for line in lines:
            if line.startswith('ATOM') or line.startswith('HETATM'):
                residue_name = line[17:20].strip()
                chain_id = line[21].strip()
                residue_id = line[22:26].strip()
                key = (residue_name, chain_id, residue_id)
                if key in residue_info:
                    new_residue_name = residue_info[key]['new_name']
                    line = line[:17] + f"{new_residue_name:>3}" + line[20:]
                if line.startswith('HETATM'):
                    line = 'ATOM  ' + line[6:]
            outfile.write(line)

[docs]def parse_pdb(input_pdb, output_pdb, ff_dict):
    """Write ff14SB partial charges into the B-factor column of a PDB file.

    Only ATOM lines whose residue and atom names match entries in ``ff_dict``
    are written to the output file.

    Parameters
    ----------
    input_pdb : str
        Path to the input PDB file.
    output_pdb : str
        Path to the output PDB file with charges in the B-factor column.
    ff_dict : dict
        Nested dict from :func:`~qp.manager.ff14SB_dict.get_ff14SB_dict`.
    """
    skipped_residues = set()
    with open(input_pdb, 'r') as infile, open(output_pdb, 'w') as outfile:
        for line in infile:
            if line.startswith('ATOM'):
                atom_name = line[12:16].strip()
                residue_name = line[17:20].strip()
                if residue_name not in ff_dict:
                    skipped_residues.add(residue_name)
                    continue
                if atom_name in ff_dict[residue_name]:
                    # Dump information into the B-factor column (columns 61-66)
                    ff_value = ff_dict[residue_name][atom_name]
                    line = line[:60] + f"{ff_value:>6.2f}" + line[66:]
                    outfile.write(line)
    if skipped_residues:
        warnings.warn(
            f"The following residues were not found in the force field "
            f"dictionary and were excluded from the MM point charge "
            f"embedding: {', '.join(sorted(skipped_residues))}. To include "
            f"these residues, supply custom charges via the "
            f"'charge_embedding_charges' configuration option.",
            stacklevel=2,
        )

[docs]def read_pdb(file_path):
    """Read a PDB file and extract ATOM record lines.

    Parameters
    ----------
    file_path : str
        Path to the PDB file.

    Returns
    -------
    list of str
        Lines starting with 'ATOM'.
    """
    atom_lines = []
    with open(file_path, 'r') as pdb_file:
        for line in pdb_file:
            if line.startswith('ATOM'):
                atom_lines.append(line)
    return atom_lines

[docs]def read_xyz(file_path):
    """Read an XYZ file and extract atomic coordinates.

    Parameters
    ----------
    file_path : str
        Path to the XYZ file.

    Returns
    -------
    numpy.ndarray of shape (N, 3)
        Cartesian coordinates of N atoms.
    """
    coordinates = []
    with open(file_path, 'r') as xyz_file:
        for line in xyz_file:
            parts = line.split()
            if len(parts) == 4:
                x = float(parts[1])
                y = float(parts[2])
                z = float(parts[3])
                coordinates.append([x, y, z])
    return np.array(coordinates)

[docs]def remove_atoms_from_pdb(pdb_lines, xyz_coords, threshold):
    """Filter PDB lines to remove atoms close to XYZ coordinates.

    Uses a KD-tree for efficient spatial queries to identify and remove
    atoms within the threshold distance of any atom in the XYZ file.

    Parameters
    ----------
    pdb_lines : list of str
        ATOM record lines from a PDB file.
    xyz_coords : numpy.ndarray of shape (M, 3)
        Reference coordinates (typically QM cluster atoms).
    threshold : float
        Distance threshold in angstroms.

    Returns
    -------
    list of str
        PDB lines with nearby atoms removed.
    """
    threshold_sq = threshold ** 2
    xyz_tree = KDTree(xyz_coords)

    new_pdb_lines = []
    for line in pdb_lines:
        if line.startswith('ATOM'):
            x, y, z = map(float, [line[30:38], line[38:46], line[46:54]])
            pdb_coord = np.array([x, y, z])
            distances, _ = xyz_tree.query(pdb_coord, k=1)
            if distances ** 2 > threshold_sq:
                new_pdb_lines.append(line)
        else:
            new_pdb_lines.append(line)
    return new_pdb_lines

[docs]def write_pdb(output_path, pdb_lines):
    """Write PDB lines to a file.

    Parameters
    ----------
    output_path : str
        Path to the output PDB file.
    pdb_lines : list of str
        PDB record lines to write.
    """
    with open(output_path, 'w') as pdb_file:
        for line in pdb_lines:
            pdb_file.write(line)

[docs]def remove_qm_atoms(pdb_file, xyz_file, output_pdb_file, threshold=0.5):
    """Remove QM cluster atoms from a PDB file based on coordinate matching.

    Atoms in the PDB whose coordinates are within ``threshold`` angstroms
    of any atom in the XYZ file are removed.

    Parameters
    ----------
    pdb_file : str
        Path to the full-protein PDB file.
    xyz_file : str
        Path to the QM cluster XYZ file.
    output_pdb_file : str
        Path to write the PDB with QM atoms removed.
    threshold : float, optional
        Distance threshold in angstroms for matching atoms (default 0.5).
    """
    pdb_lines = read_pdb(pdb_file)
    xyz_coords = read_xyz(xyz_file)

    remaining_lines = remove_atoms_from_pdb(pdb_lines, xyz_coords, threshold)

    write_pdb(output_pdb_file, remaining_lines)

[docs]def parse_pdb_to_xyz(pdb_file_path, output_file_path, qm_centroid, cutoff_distance):
    """Convert PDB with charges to TeraChem point charge XYZ format.

    Reads the B-factor column (containing ff14SB partial charges) from
    the PDB file and writes atoms within the cutoff distance of the
    QM centroid to a point charge file.

    Selection is residue-based: if any atom in a residue falls within
    the cutoff distance of the QM centroid, all atoms of that residue
    are included.  This prevents partial residues with non-integer
    charge contributions.

    Parameters
    ----------
    pdb_file_path : str
        Path to PDB file with charges in B-factor column.
    output_file_path : str
        Path for the output point charge file (``ptchrges.xyz``).
    qm_centroid : array-like of shape (3,)
        Centroid of the QM cluster for distance calculations.
    cutoff_distance : float
        Maximum distance in angstroms from centroid to include.
        If any atom of a residue is within this distance, the
        entire residue is included.
    """
    cutoff_distance_sq = cutoff_distance ** 2

    with open(pdb_file_path, 'r') as pdb_file:
        lines = pdb_file.readlines()

    # Pass 1: identify residues with at least one atom within cutoff
    included_residues = set()
    for line in lines:
        if line.startswith('ATOM'):
            x_coord = float(line[30:38])
            y_coord = float(line[38:46])
            z_coord = float(line[46:54])
            atom_coord = np.array([x_coord, y_coord, z_coord])
            if np.sum((atom_coord - qm_centroid) ** 2) <= cutoff_distance_sq:
                residue_name = line[17:20].strip()
                chain_id = line[21].strip()
                residue_id = line[22:26].strip()
                included_residues.add((residue_name, chain_id, residue_id))

    # Pass 2: collect all atoms belonging to included residues
    atom_lines = []
    for line in lines:
        if line.startswith('ATOM'):
            residue_name = line[17:20].strip()
            chain_id = line[21].strip()
            residue_id = line[22:26].strip()
            if (residue_name, chain_id, residue_id) in included_residues:
                atom_lines.append(line)

    atom_count = len(atom_lines)

    with open(output_file_path, 'w') as output_file:
        output_file.write(f"{atom_count}\nGenerated from PDB file\n")
        for line in atom_lines:
            charges = float(line[60:66])
            x_coord = float(line[30:38])
            y_coord = float(line[38:46])
            z_coord = float(line[46:54])
            output_file.write(f"{charges} {x_coord} {y_coord} {z_coord}\n")

[docs]def load_custom_charges(filepath):
    """Load custom partial charges from a JSON file.

    The JSON file must contain a nested dictionary keyed first by residue
    name, then by atom name, with partial charges as float values.

    Parameters
    ----------
    filepath : str
        Path to the JSON file.

    Returns
    -------
    dict
        Nested ``{residue_name: {atom_name: charge}}`` dictionary, in the
        same format returned by
        :func:`~qp.manager.ff14SB_dict.get_ff14SB_dict`.

    Raises
    ------
    FileNotFoundError
        If *filepath* does not exist.
    json.JSONDecodeError
        If the file contents are not valid JSON.

    Examples
    --------
    A minimal JSON file might look like::

        {
            "ALA": {"N": -0.4157, "H": 0.2719, "CA": 0.0337},
            "GLY": {"N": -0.4157, "H": 0.2719}
        }
    """
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"Custom charges file not found: {filepath}")

    with open(filepath, 'r') as f:
        charges = json.load(f)

    return charges


[docs]def get_charges(charge_embedding_cutoff, charge_embedding_charges=None):
    """Generate the MM point charge embedding file (``ptchrges.xyz``).

    Orchestrates the full charge embedding pipeline: renames residues for
    ff14SB compatibility, assigns partial charges into the B-factor column,
    removes QM cluster atoms, and writes the remaining MM atoms within the
    cutoff distance as a point charge XYZ file.

    Parameters
    ----------
    charge_embedding_cutoff : float
        Distance cutoff in angstroms from the QM cluster centroid for
        including MM point charges.
    charge_embedding_charges : str, optional
        Path to a JSON file containing custom partial charges. When
        ``None`` (default), the built-in AMBER ff14SB charges are used.
        See :func:`load_custom_charges` for the expected file format.
    """
    # Setup a temporary directory to store files
    temporary_files_dir = "ptchrges_temp"
    # Check if the directory exists
    if os.path.exists(temporary_files_dir):
        shutil.rmtree(temporary_files_dir)
    os.mkdir(temporary_files_dir)

    pdb_name = os.getcwd().split('/')[-3]
    protoss_pdb_name = f'{pdb_name}_protoss.pdb'
    protoss_pdb_path = os.path.join("/".join(os.getcwd().split('/')[:-2]),"Protoss",protoss_pdb_name)
    chain_name = os.getcwd().split('/')[-2]
    renamed_his_pdb_file = f'{temporary_files_dir}/{chain_name}_rename_his.pdb'
    if charge_embedding_charges is not None:
        ff_dict = load_custom_charges(charge_embedding_charges)
    else:
        ff_dict = ff14SB_dict.get_ff14SB_dict()
    charges_pdb = f'{temporary_files_dir}/{chain_name}_added_charges.pdb'
    xyz_file = f'{chain_name}.xyz'
    pdb_no_qm_atoms = f'{temporary_files_dir}/{chain_name}_without_qm_atoms.pdb'
    final_point_charges_file = "ptchrges.xyz"

    # Rename histidines
    rename_and_clean_resnames(protoss_pdb_path, renamed_his_pdb_file)

    # Parse the PDB file and dump charge information into the B-factor column
    parse_pdb(renamed_his_pdb_file, charges_pdb, ff_dict)

    # Remove QM atoms
    remove_qm_atoms(charges_pdb, xyz_file, pdb_no_qm_atoms, threshold=0.5)

    # Calculate the centroid of the QM region
    qm_coords = read_xyz(xyz_file)
    qm_centroid = calculate_centroid(qm_coords)
    
    # Parse the PDB to XYZ with distance cutoff
    parse_pdb_to_xyz(pdb_no_qm_atoms, final_point_charges_file, qm_centroid, charge_embedding_cutoff)

    shutil.rmtree(temporary_files_dir)

if __name__ == "__main__":
    get_charges()