"""Fix protonation errors in metal coordination sites."""
import numpy as np
from typing import List
from queue import Queue
from Bio.PDB.Atom import Atom
from Bio.PDB.Polypeptide import is_aa
from Bio.PDB.NeighborSearch import NeighborSearch
from Bio.PDB.Residue import Residue
from Bio.PDB import PDBParser, PDBIO, Select, Polypeptide
from qp.cluster.spheres import CenterResidue
[docs]def res_priority(res, res_info, center_residues):
"""Compute a priority score for a residue during partial occupancy resolution.
Higher scores mean higher priority (kept over lower-priority residues).
Center residues get the highest priority, followed by standard amino acids,
then non-standard residues scored by atom count and average occupancy.
Parameters
----------
res : Bio.PDB.Residue.Residue
The residue to score.
res_info : dict
Partial occupancy info dict with ``'avg_occupancy'`` and ``'num_atom'`` keys.
center_residues : CenterResidue
The center residue definition for the current structure.
Returns
-------
float
Priority score.
"""
if res in center_residues:
return 8e7
elif Polypeptide.is_aa(res, standard=True):
return 4e7
else:
return res_info[res]["num_atom"] + \
int(res_info[res]["avg_occupancy"] * 100) * 1e5 + \
int(Polypeptide.is_aa(res)) * 2e7
[docs]def partial_res_info(res, res_info):
"""Format a residue's partial occupancy information as a human-readable string.
Parameters
----------
res : Bio.PDB.Residue.Residue
The residue to describe.
res_info : dict
Partial occupancy info dict with ``'avg_occupancy'`` and ``'num_atom'`` keys.
Returns
-------
str
Formatted string, e.g. ``'residue FE_A199 with avg. occupancy 0.75 and 1 atoms'``.
"""
_, _, chain, code = res.get_full_id()
_, resid, _ = code
resname = res.get_resname()
return f'residue {resname}_{chain}{resid} with avg. occupancy {res_info[res]["avg_occupancy"]:.2f} and {res_info[res]["num_atom"]} atoms'
[docs]def clean_occupancy(path: str, center_residues: CenterResidue) -> bool:
"""Resolve partial occupancy conflicts by selecting a self-consistent coordinate set.
Iterates over all residues with partial occupancy and removes lower-priority
residues that clash spatially. Priority order: center residues > standard
amino acids > non-standard residues > higher average occupancy > more atoms.
Distance criteria are 1.0 A for center residues (allowing coordination bonds)
and 2.5 A for all others.
The PDB file at ``path`` is overwritten in place with the cleaned structure.
Parameters
----------
path : str
Path to the PDB file (modified in place).
center_residues : CenterResidue
The center residue definition for priority assignment.
Returns
-------
bool
True if partial occupancy was detected and cleaned, False otherwise.
"""
parser = PDBParser(QUIET=True)
io = PDBIO()
structure = parser.get_structure("PDB", path)
io.set_structure(structure)
kept_partial_res = dict()
partial_atom_list = []
for model in structure:
for chain in model:
for res in chain.get_residues():
partial_in_res = False
total_occupancy = 0
# Check if there are partial occupied atoms in res
for atom in res:
atom_occupancy = atom.get_occupancy()
total_occupancy += atom_occupancy
if atom_occupancy < 1.0:
partial_in_res = True
partial_atom_list.append(atom)
# If so, add res to the partial res dict
if partial_in_res:
num_atom = len(res)
kept_partial_res[res] = {
"avg_occupancy": total_occupancy / num_atom,
"num_atom": num_atom,
"kept": True,
"search_radius": 1.0 if res in center_residues else 2.5 # allow for bonding to center, otherwise vdw
}
# no partial occupancy detected
if not partial_atom_list:
return False
check_queue = Queue()
for res in kept_partial_res:
check_queue.put(res)
search = NeighborSearch(partial_atom_list)
while not check_queue.empty():
res = check_queue.get()
# don't check deleted res
if not kept_partial_res[res]["kept"]:
continue
recheck_res = set()
for atom in res:
for clash_res in search.search(center=atom.get_coord(), radius=kept_partial_res[res]["search_radius"], level="R"):
# don't delete self and deleted res
if (
clash_res == res or # self
not kept_partial_res[clash_res]["kept"] # deleted res
):
continue
# if center is in clash with current atom
# reconsider this clash from the center's point of view with tighter search radius
if clash_res in center_residues:
# if center doesn't contain partial occupancy, put it into the queue for rechecking,
if clash_res not in kept_partial_res:
kept_partial_res[clash_res] = {
"avg_occupancy": 1.0,
"num_atom": len(clash_res),
"kept": True,
"search_radius": 1.0
}
recheck_res.add(clash_res)
# otherwise the center would have been checked
continue
# compare priority and delete the lower one
p1 = res_priority(res, kept_partial_res, center_residues)
p2 = res_priority(clash_res, kept_partial_res, center_residues)
if p1 > p2:
kept_partial_res[clash_res]["kept"] = False
print(f"> Deleting {partial_res_info(clash_res, kept_partial_res)}, keeping {partial_res_info(res, kept_partial_res)} in search radius {kept_partial_res[res]['search_radius']}Å")
elif p1 < p2:
kept_partial_res[res]["kept"] = False
print(f"> Deleting {partial_res_info(res, kept_partial_res)}, keeping {partial_res_info(clash_res, kept_partial_res)} in search radius {kept_partial_res[res]['search_radius']}Å")
break
for res in recheck_res:
print("Recheck", res)
check_queue.put(res)
class ResSelect(Select):
def accept_residue(self, residue):
return (residue not in kept_partial_res) or kept_partial_res[residue]["kept"]
# Creating a backup copy is unnecessary although it could be useful so I'm commenting it
# shutil.copy(path, f"{path[:-4]}_old.pdb")
io.save(path, ResSelect())
print("> Partial occupancy cleaning finished")
return True
[docs]def flip_coordinated_HIS(points, res):
"""Flip a histidine imidazole ring to orient nitrogen toward a metal.
Protoss assigns histidine tautomers and conformations, but sometimes
the coordinating nitrogen (NE2 or ND1) is oriented away from the metal.
This function detects when a carbon (CE1 or CD2) is closer to the metal
than its adjacent nitrogen and flips the ring around the CB--CG axis
to restore proper coordination geometry.
Parameters
----------
points : list of Bio.PDB.Atom.Atom
Metal atoms to check coordination against.
res : Bio.PDB.Residue.Residue
A histidine residue (modified in place).
Notes
-----
The residue is modified in place. Hydrogen positions are adjusted
to match the new ring orientation.
"""
flip_flag = ""
try:
CE1 = res["CE1"]
NE2 = res["NE2"]
CD2 = res["CD2"]
ND1 = res["ND1"]
CB = res["CB"]
CG = res["CG"]
except IndexError:
print("> Non standard atom names of HIS")
return
closest = sorted(points, key=lambda p: min(p - x for x in [CE1, NE2, CD2, ND1]))[0]
dist_CE1_metal = closest - CE1
dist_NE2_metal = closest - NE2
dist_CD2_metal = closest - CD2
dist_ND1_metal = closest - ND1
if dist_CE1_metal < dist_NE2_metal and dist_CE1_metal < 3.5 and dist_CE1_metal < dist_ND1_metal:
flip_flag = "E"
elif dist_CD2_metal < dist_ND1_metal and dist_CD2_metal < 3.5 and dist_CD2_metal < dist_NE2_metal:
flip_flag = "D"
if flip_flag:
old_coords = dict()
for atom_name in ["HD2", "HE1", "HD1", "HE2", "CD2", "ND1", "CE1", "NE2", "CG", "CB"]:
if atom_name in res:
old_coords[atom_name] = res[atom_name].get_coord()
coord_CB, coord_CG = old_coords["CB"], old_coords["CG"]
axis = coord_CG - coord_CB
for atom_name in ["HD2", "HE1", "HD1", "HE2", "CD2", "ND1", "CE1", "NE2"]:
if atom_name in res:
coord = old_coords[atom_name]
loc = coord - coord_CB
proj = axis * np.dot(axis, loc) / np.dot(axis, axis)
flipped_loc = 2 * proj - loc
flipped_coord = flipped_loc + coord_CB
res[atom_name].set_coord(flipped_coord)
if flip_flag == "E" and "HE2" in res and "HD1" not in res:
coord_CE1 = res["CE1"].get_coord()
coord_ND1 = res["ND1"].get_coord()
vec_ND1_CE1 = coord_CE1 - coord_ND1
vec_ND1_CG = coord_CG - coord_ND1
bisector = vec_ND1_CE1 + vec_ND1_CG
vec_ND1_HD1 = - bisector / np.linalg.norm(bisector) # * 1.00
res["HE2"].set_coord(vec_ND1_HD1 + coord_ND1)
res["HE2"].name = "HD1"
if flip_flag == "D" and "HD1" in res and "HE2" not in res:
coord_CE1 = res["CE1"].get_coord()
coord_NE2 = res["NE2"].get_coord()
coord_CD2 = res["CD2"].get_coord()
vec_NE2_CE1 = coord_CE1 - coord_NE2
vec_NE2_CD2 = coord_CD2 - coord_NE2
bisector = vec_NE2_CE1 + vec_NE2_CD2
vec_NE2_HE2 = - bisector / np.linalg.norm(bisector) # * 1.00
res["HD1"].set_coord(vec_NE2_HE2 + coord_NE2)
res["HD1"].name = "HE2"
[docs]def add_hydrogen_CSO(res, structure):
"""Add hydrogens to cysteine sulfenic acid (CSO) residues.
CSO is a post-translational modification not recognized by Protoss.
This function adds hydrogens to CA (HA), CB (HB1, HB2), and the
hydroxyl oxygen (HD) using ideal geometry. The hydroxyl hydrogen
is omitted if a TAN (2,3,3-trichloroallyl alcohol) ligand is
covalently bonded to the OD atom.
Parameters
----------
res : Bio.PDB.Residue.Residue
A CSO residue (modified in place).
structure : Bio.PDB.Structure.Structure
The full protein structure (used to detect TAN--CSO bonds).
Notes
-----
The residue is modified in place. Existing hydrogens are not replaced.
"""
coords = dict()
try:
for atom_name in ["C", "N", "CA", "CB", "SG", "OD"]:
coords[atom_name] = res[atom_name].get_coord()
except IndexError:
print("> Non standard atom names of CSO")
return
if "H_1" not in res and "HA" not in res:
vec_CA_C = coords["C"] - coords["CA"]
vec_CA_N = coords["N"] - coords["CA"]
vec_CA_CB = coords["CB"] - coords["CA"]
vec_sum = vec_CA_C + vec_CA_N + vec_CA_CB
vec_CA_HA = - vec_sum / np.linalg.norm(vec_sum) # * 1.00
coord_HA = vec_CA_HA + coords["CA"]
res.add(Atom("HA", coord_HA, 0, 1, " ", "HA", None, "H"))
if "H_2" not in res and "H_3" not in res and "HB1" not in res and "HB2" not in res:
ANGLE_HCH = 109.51 / 180 * np.pi
vec_CB_SG = coords["SG"] - coords["CB"]
vec_CB_CA = coords["CA"] - coords["CB"]
vec_xy = vec_CB_SG + vec_CB_CA
vec_xy = - vec_xy / np.linalg.norm(vec_xy) * np.cos(ANGLE_HCH / 2) # * 1.00
vec_z = np.cross(vec_CB_SG, vec_CB_CA)
vec_z = vec_z / np.linalg.norm(vec_z) * np.sin(ANGLE_HCH / 2) # * 1.00
coord_HB1 = coords["CB"] + vec_xy + vec_z
coord_HB2 = coords["CB"] + vec_xy - vec_z
res.add(Atom("HB1", coord_HB1, 0, 1, " ", "HB1", None, "H"))
res.add(Atom("HB2", coord_HB2, 0, 1, " ", "HB2", None, "H"))
if "H_4" not in res and "HD" not in res:
for res2 in structure[0].get_residues():
if res2.get_resname() == "TAN":
try:
C = res2["C"]
except:
print("> Non standard atom names of TAN")
if C - res["OD"] < 1.6 and "H1" in res2:
# TAN reacting with CSO, ref. 3X20, 3X24, 3X25
print("> TAN reacting with CSO!")
return
vec_CB_SG = coords["SG"] - coords["CB"]
vec_OD_HD = vec_CB_SG / np.linalg.norm(vec_CB_SG) # * 1.00
coord_HD = coords["OD"] + vec_OD_HD
res.add(Atom("HD", coord_HD, 0, 1, " ", "HD", None, "H"))
[docs]def fix_OXT(path: str) -> None:
"""Fix corrupted C-terminal OXT atoms by reflecting their coordinates.
When the OXT and O atoms of a C-terminal residue are too close (< 1.8 A),
the O atom is reflected across the CA--C bond axis to restore correct
geometry. The PDB file is overwritten in place.
Parameters
----------
path : str
Path to the PDB file (modified in place).
"""
parser = PDBParser(QUIET=True)
io = PDBIO()
structure = parser.get_structure("PDB", path)
io.set_structure(structure)
for model in structure:
for chain in model:
for res in chain.get_residues():
res: Residue
OXT_flag = True
for name in {"OXT", "O", "CA", "C"}:
if name not in res:
OXT_flag = False
break
if not OXT_flag:
continue
OXT: Atom = res["OXT"]
O: Atom = res["O"]
CA: Atom = res["CA"]
C: Atom = res["C"]
if OXT - O > 1.8:
break
print("> Fixing corrupted C terminus")
vec_C_CA = CA.get_coord() - C.get_coord()
vec_C_OXT = OXT.get_coord() - C.get_coord()
proj_vec_C_OXT = np.dot(vec_C_CA, vec_C_OXT) / np.dot(vec_C_CA, vec_C_CA) * vec_C_CA
dist_vec_C_OXT = proj_vec_C_OXT - vec_C_OXT
reflect_vec_C_OXT = vec_C_OXT + 2 * dist_vec_C_OXT
O.set_coord(C.get_coord() + reflect_vec_C_OXT)
io.save(path, Select())
[docs]def adjust_activesites(path, center_residue: CenterResidue):
"""Deprotonate metal-coordinating residues incorrectly protonated by Protoss.
Removes hydrogens from residues coordinating to metal centers:
- Tyrosine OH groups within 3 A of a metal
- Cysteine SG groups within 3 A of a metal
- Backbone N atoms within 3 A of a metal
- All hydrogens on NO ligands (protonated as hydroxylamine by Protoss)
- Any ligand H atoms within 1.5 A of a metal
Also flips incorrectly oriented histidine rings and adds hydrogens to
CSO (cysteine sulfenic acid) residues.
Parameters
----------
path : str
Path to the Protoss output PDB file (modified in place).
center_residue : CenterResidue
Center residue definition identifying metal atoms.
Notes
-----
The file at ``path`` is overwritten with the corrected structure.
"""
parser = PDBParser(QUIET=True)
structure = parser.get_structure("PDB", path)
points = []
for res in structure[0].get_residues():
if res in center_residue:
atoms = res.get_unpacked_list()
if len(atoms) > 1:
continue # skip if not metal-like
points.append(atoms[0])
for res in structure[0].get_residues():
if res.get_resname() == "HIS" and points:
flip_coordinated_HIS(points, res)
if res.get_resname() == "CSO":
add_hydrogen_CSO(res, structure)
class AtomSelect(Select):
def accept_atom(self, atom):
res = atom.get_parent()
if res.get_resname() == "NO":
if "H" in atom.get_name():
return False
coord = None
if atom.get_name() == "HH" and res.get_resname() == "TYR":
coord = res["OH"]
elif atom.get_name() == "HG" and res.get_resname() == "CYS":
coord = res["SG"]
elif is_aa(res) and atom.get_name() in ["H", "H2"] and "N" in res:
coord = res["N"]
if coord:
for p in points:
if p - coord < 3:
return False
if atom.element == "H" and res.get_resname() != "HOH" and not is_aa(res):
for p in points:
if p - atom < 1.5:
return False
return True
io = PDBIO()
io.set_structure(structure)
io.save(path, AtomSelect())