Source code for qp.manager.charge_embedding

"""Generate charge embedding ptchrges.xyz file for TeraChem input.

By default, partial charges are taken from the built-in AMBER ff14SB
dictionary.  Users can supply a custom JSON file of charges via the
``charge_embedding_charges`` configuration option to support other
force fields.
"""

import os
import json
import shutil
import warnings
import numpy as np
from qp.manager import ff14SB_dict
from scipy.spatial import KDTree

[docs]def calculate_centroid(coords): """Calculate the geometric centroid of a set of coordinates. Parameters ---------- coords : array-like of shape (N, 3) Cartesian coordinates of N atoms. Returns ------- numpy.ndarray of shape (3,) The centroid (mean position) of the input coordinates. """ return np.mean(coords, axis=0)
[docs]def rename_and_clean_resnames(input_pdb, output_pdb): """Rename histidine residues for ff14SB compatibility and convert HETATM to ATOM. Protoss outputs all histidines as HIS, but ff14SB requires specific protonation states: HIP (doubly protonated), HID (delta-protonated), or HIE (epsilon-protonated). This function determines the correct name based on hydrogen atoms present. Also handles N-terminal (NXXX) and C-terminal (CXXX) residue naming, and converts all HETATM records to ATOM for force field assignment. Parameters ---------- input_pdb : str Path to the input PDB file (typically Protoss output). output_pdb : str Path to write the renamed PDB file. Notes ----- The four-letter residue names (NALA, CALA, etc.) may cause column alignment issues in strict PDB format parsers. """ with open(input_pdb, 'r') as infile, open(output_pdb, 'w') as outfile: lines = infile.readlines() # Identify residues to rename based on the specified conditions residue_info = {} for line in lines: if line.startswith('ATOM') or line.startswith('HETATM'): atom_name = line[12:16].strip() residue_name = line[17:20].strip() chain_id = line[21].strip() residue_id = line[22:26].strip() key = (residue_name, chain_id, residue_id) if key not in residue_info: residue_info[key] = { 'residue_name': residue_name, 'atoms': set() } residue_info[key]['atoms'].add(atom_name) # Determine the new residue names based on the conditions for key, info in residue_info.items(): atoms = info['atoms'] if info['residue_name'] == 'HIS': if 'HD1' in atoms and 'HE2' in atoms: info['new_name'] = 'HIP' elif 'HD1' in atoms and 'HE2' not in atoms: info['new_name'] = 'HID' elif 'HE2' in atoms and 'HD1' not in atoms: info['new_name'] = 'HIE' else: info['new_name'] = info['residue_name'] else: info['new_name'] = info['residue_name'] if 'OXT' in atoms: info['new_name'] = 'C' + info['new_name'] if {'N', 'H1', 'H2', 'H3'} <= atoms: info['new_name'] = 'N' + info['new_name'] # Write the modified PDB file with the new residue names for line in lines: if line.startswith('ATOM') or line.startswith('HETATM'): residue_name = line[17:20].strip() chain_id = line[21].strip() residue_id = line[22:26].strip() key = (residue_name, chain_id, residue_id) if key in residue_info: new_residue_name = residue_info[key]['new_name'] line = line[:17] + f"{new_residue_name:>3}" + line[20:] if line.startswith('HETATM'): line = 'ATOM ' + line[6:] outfile.write(line)
[docs]def parse_pdb(input_pdb, output_pdb, ff_dict): """Write ff14SB partial charges into the B-factor column of a PDB file. Only ATOM lines whose residue and atom names match entries in ``ff_dict`` are written to the output file. Parameters ---------- input_pdb : str Path to the input PDB file. output_pdb : str Path to the output PDB file with charges in the B-factor column. ff_dict : dict Nested dict from :func:`~qp.manager.ff14SB_dict.get_ff14SB_dict`. """ skipped_residues = set() with open(input_pdb, 'r') as infile, open(output_pdb, 'w') as outfile: for line in infile: if line.startswith('ATOM'): atom_name = line[12:16].strip() residue_name = line[17:20].strip() if residue_name not in ff_dict: skipped_residues.add(residue_name) continue if atom_name in ff_dict[residue_name]: # Dump information into the B-factor column (columns 61-66) ff_value = ff_dict[residue_name][atom_name] line = line[:60] + f"{ff_value:>6.2f}" + line[66:] outfile.write(line) if skipped_residues: warnings.warn( f"The following residues were not found in the force field " f"dictionary and were excluded from the MM point charge " f"embedding: {', '.join(sorted(skipped_residues))}. To include " f"these residues, supply custom charges via the " f"'charge_embedding_charges' configuration option.", stacklevel=2, )
[docs]def read_pdb(file_path): """Read a PDB file and extract ATOM record lines. Parameters ---------- file_path : str Path to the PDB file. Returns ------- list of str Lines starting with 'ATOM'. """ atom_lines = [] with open(file_path, 'r') as pdb_file: for line in pdb_file: if line.startswith('ATOM'): atom_lines.append(line) return atom_lines
[docs]def read_xyz(file_path): """Read an XYZ file and extract atomic coordinates. Parameters ---------- file_path : str Path to the XYZ file. Returns ------- numpy.ndarray of shape (N, 3) Cartesian coordinates of N atoms. """ coordinates = [] with open(file_path, 'r') as xyz_file: for line in xyz_file: parts = line.split() if len(parts) == 4: x = float(parts[1]) y = float(parts[2]) z = float(parts[3]) coordinates.append([x, y, z]) return np.array(coordinates)
[docs]def remove_atoms_from_pdb(pdb_lines, xyz_coords, threshold): """Filter PDB lines to remove atoms close to XYZ coordinates. Uses a KD-tree for efficient spatial queries to identify and remove atoms within the threshold distance of any atom in the XYZ file. Parameters ---------- pdb_lines : list of str ATOM record lines from a PDB file. xyz_coords : numpy.ndarray of shape (M, 3) Reference coordinates (typically QM cluster atoms). threshold : float Distance threshold in angstroms. Returns ------- list of str PDB lines with nearby atoms removed. """ threshold_sq = threshold ** 2 xyz_tree = KDTree(xyz_coords) new_pdb_lines = [] for line in pdb_lines: if line.startswith('ATOM'): x, y, z = map(float, [line[30:38], line[38:46], line[46:54]]) pdb_coord = np.array([x, y, z]) distances, _ = xyz_tree.query(pdb_coord, k=1) if distances ** 2 > threshold_sq: new_pdb_lines.append(line) else: new_pdb_lines.append(line) return new_pdb_lines
[docs]def write_pdb(output_path, pdb_lines): """Write PDB lines to a file. Parameters ---------- output_path : str Path to the output PDB file. pdb_lines : list of str PDB record lines to write. """ with open(output_path, 'w') as pdb_file: for line in pdb_lines: pdb_file.write(line)
[docs]def remove_qm_atoms(pdb_file, xyz_file, output_pdb_file, threshold=0.5): """Remove QM cluster atoms from a PDB file based on coordinate matching. Atoms in the PDB whose coordinates are within ``threshold`` angstroms of any atom in the XYZ file are removed. Parameters ---------- pdb_file : str Path to the full-protein PDB file. xyz_file : str Path to the QM cluster XYZ file. output_pdb_file : str Path to write the PDB with QM atoms removed. threshold : float, optional Distance threshold in angstroms for matching atoms (default 0.5). """ pdb_lines = read_pdb(pdb_file) xyz_coords = read_xyz(xyz_file) remaining_lines = remove_atoms_from_pdb(pdb_lines, xyz_coords, threshold) write_pdb(output_pdb_file, remaining_lines)
[docs]def parse_pdb_to_xyz(pdb_file_path, output_file_path, qm_centroid, cutoff_distance): """Convert PDB with charges to TeraChem point charge XYZ format. Reads the B-factor column (containing ff14SB partial charges) from the PDB file and writes atoms within the cutoff distance of the QM centroid to a point charge file. Selection is residue-based: if any atom in a residue falls within the cutoff distance of the QM centroid, all atoms of that residue are included. This prevents partial residues with non-integer charge contributions. Parameters ---------- pdb_file_path : str Path to PDB file with charges in B-factor column. output_file_path : str Path for the output point charge file (``ptchrges.xyz``). qm_centroid : array-like of shape (3,) Centroid of the QM cluster for distance calculations. cutoff_distance : float Maximum distance in angstroms from centroid to include. If any atom of a residue is within this distance, the entire residue is included. """ cutoff_distance_sq = cutoff_distance ** 2 with open(pdb_file_path, 'r') as pdb_file: lines = pdb_file.readlines() # Pass 1: identify residues with at least one atom within cutoff included_residues = set() for line in lines: if line.startswith('ATOM'): x_coord = float(line[30:38]) y_coord = float(line[38:46]) z_coord = float(line[46:54]) atom_coord = np.array([x_coord, y_coord, z_coord]) if np.sum((atom_coord - qm_centroid) ** 2) <= cutoff_distance_sq: residue_name = line[17:20].strip() chain_id = line[21].strip() residue_id = line[22:26].strip() included_residues.add((residue_name, chain_id, residue_id)) # Pass 2: collect all atoms belonging to included residues atom_lines = [] for line in lines: if line.startswith('ATOM'): residue_name = line[17:20].strip() chain_id = line[21].strip() residue_id = line[22:26].strip() if (residue_name, chain_id, residue_id) in included_residues: atom_lines.append(line) atom_count = len(atom_lines) with open(output_file_path, 'w') as output_file: output_file.write(f"{atom_count}\nGenerated from PDB file\n") for line in atom_lines: charges = float(line[60:66]) x_coord = float(line[30:38]) y_coord = float(line[38:46]) z_coord = float(line[46:54]) output_file.write(f"{charges} {x_coord} {y_coord} {z_coord}\n")
[docs]def load_custom_charges(filepath): """Load custom partial charges from a JSON file. The JSON file must contain a nested dictionary keyed first by residue name, then by atom name, with partial charges as float values. Parameters ---------- filepath : str Path to the JSON file. Returns ------- dict Nested ``{residue_name: {atom_name: charge}}`` dictionary, in the same format returned by :func:`~qp.manager.ff14SB_dict.get_ff14SB_dict`. Raises ------ FileNotFoundError If *filepath* does not exist. json.JSONDecodeError If the file contents are not valid JSON. Examples -------- A minimal JSON file might look like:: { "ALA": {"N": -0.4157, "H": 0.2719, "CA": 0.0337}, "GLY": {"N": -0.4157, "H": 0.2719} } """ if not os.path.exists(filepath): raise FileNotFoundError(f"Custom charges file not found: {filepath}") with open(filepath, 'r') as f: charges = json.load(f) return charges
[docs]def get_charges(charge_embedding_cutoff, charge_embedding_charges=None): """Generate the MM point charge embedding file (``ptchrges.xyz``). Orchestrates the full charge embedding pipeline: renames residues for ff14SB compatibility, assigns partial charges into the B-factor column, removes QM cluster atoms, and writes the remaining MM atoms within the cutoff distance as a point charge XYZ file. Parameters ---------- charge_embedding_cutoff : float Distance cutoff in angstroms from the QM cluster centroid for including MM point charges. charge_embedding_charges : str, optional Path to a JSON file containing custom partial charges. When ``None`` (default), the built-in AMBER ff14SB charges are used. See :func:`load_custom_charges` for the expected file format. """ # Setup a temporary directory to store files temporary_files_dir = "ptchrges_temp" # Check if the directory exists if os.path.exists(temporary_files_dir): shutil.rmtree(temporary_files_dir) os.mkdir(temporary_files_dir) pdb_name = os.getcwd().split('/')[-3] protoss_pdb_name = f'{pdb_name}_protoss.pdb' protoss_pdb_path = os.path.join("/".join(os.getcwd().split('/')[:-2]),"Protoss",protoss_pdb_name) chain_name = os.getcwd().split('/')[-2] renamed_his_pdb_file = f'{temporary_files_dir}/{chain_name}_rename_his.pdb' if charge_embedding_charges is not None: ff_dict = load_custom_charges(charge_embedding_charges) else: ff_dict = ff14SB_dict.get_ff14SB_dict() charges_pdb = f'{temporary_files_dir}/{chain_name}_added_charges.pdb' xyz_file = f'{chain_name}.xyz' pdb_no_qm_atoms = f'{temporary_files_dir}/{chain_name}_without_qm_atoms.pdb' final_point_charges_file = "ptchrges.xyz" # Rename histidines rename_and_clean_resnames(protoss_pdb_path, renamed_his_pdb_file) # Parse the PDB file and dump charge information into the B-factor column parse_pdb(renamed_his_pdb_file, charges_pdb, ff_dict) # Remove QM atoms remove_qm_atoms(charges_pdb, xyz_file, pdb_no_qm_atoms, threshold=0.5) # Calculate the centroid of the QM region qm_coords = read_xyz(xyz_file) qm_centroid = calculate_centroid(qm_coords) # Parse the PDB to XYZ with distance cutoff parse_pdb_to_xyz(pdb_no_qm_atoms, final_point_charges_file, qm_centroid, charge_embedding_cutoff) shutil.rmtree(temporary_files_dir)
if __name__ == "__main__": get_charges()