Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 45 additions & 9 deletions src/poli/core/proteins/foldx_black_box.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
parse_pdb_as_residue_strings,
parse_pdb_as_residues,
)
from poli.core.util.proteins.foldx import FoldxInterface

# This is the folder where all the files
# generated by FoldX will be stored.
Expand All @@ -31,6 +32,7 @@ def __init__(
alphabet: List[str] = None,
experiment_id: str = None,
tmp_folder: Path = None,
eager_repair: bool = False,
):
"""
TODO: Document
Expand All @@ -52,6 +54,13 @@ def __init__(
num_workers=num_workers,
)

# Defining the experiment id
if experiment_id is None:
experiment_id = f"{int(time())}_{str(uuid4())[:8]}"
self.experiment_id = experiment_id

self.tmp_folder = tmp_folder if tmp_folder is not None else DEFAULT_TMP_PATH

if alphabet is None:
alphabet = info.alphabet

Expand All @@ -61,26 +70,53 @@ def __init__(
if isinstance(wildtype_pdb_path, Path):
wildtype_pdb_path = [wildtype_pdb_path]

self.wildtype_pdb_paths = wildtype_pdb_path
if isinstance(wildtype_pdb_path, list):
_wildtype_pdb_path = []
for pdb_file in wildtype_pdb_path:
if isinstance(pdb_file, str):
pdb_file = Path(pdb_file.strip())
assert isinstance(
pdb_file, Path
), f"Expected a Path object or a string, but got {type(pdb_file)}."
_wildtype_pdb_path.append(pdb_file)

wildtype_pdb_path = _wildtype_pdb_path

# At this point, wildtype_pdb_path is a list of Path objects.
# We need to ensure that these are repaired pdb files.
# We do this by creating a temporary folder and repairing
# the pdbs there.
if eager_repair:
path_for_repairing_pdbs = self.tmp_folder / "foldx_tmp_files_for_repair"
path_for_repairing_pdbs.mkdir(exist_ok=True, parents=True)
foldx_interface_for_repairing = FoldxInterface(path_for_repairing_pdbs)

# Re-writing wildtype_pdb_path to be the list of repaired pdb files.
repaired_wildtype_pdb_files = [
foldx_interface_for_repairing._repair_if_necessary_and_provide_path(
pdb_file
)
for pdb_file in wildtype_pdb_path
]

# From here on, the black box works with the repaired pdb files.
self.wildtype_pdb_paths = repaired_wildtype_pdb_files
else:
self.wildtype_pdb_paths = wildtype_pdb_path

self.wildtype_resiudes = [
parse_pdb_as_residues(pdb_file) for pdb_file in wildtype_pdb_path
parse_pdb_as_residues(pdb_file) for pdb_file in self.wildtype_pdb_paths
]

self.wildtype_amino_acids = [
parse_pdb_as_residue_strings(pdb_file) for pdb_file in wildtype_pdb_path
parse_pdb_as_residue_strings(pdb_file)
for pdb_file in self.wildtype_pdb_paths
]

self.wildtype_residue_strings = [
"".join(amino_acids) for amino_acids in self.wildtype_amino_acids
]

if experiment_id is None:
experiment_id = f"{int(time())}_{str(uuid4())[:8]}"
self.experiment_id = experiment_id

self.tmp_folder = tmp_folder if tmp_folder is not None else DEFAULT_TMP_PATH

def create_working_directory(self) -> Path:
"""
TODO: document.
Expand Down
56 changes: 54 additions & 2 deletions src/poli/core/util/proteins/foldx.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,14 @@
import shutil
import subprocess
import os
import logging

from Bio.PDB.Residue import Residue
from Bio.PDB import SASA
from Bio.SeqUtils import seq1

from pdbtools.pdb_delhetatm import run as pdb_delhetatm_run

from poli.core.util.proteins.mutations import (
mutations_from_wildtype_residues_and_mutant,
)
Expand Down Expand Up @@ -57,7 +60,11 @@ def __init__(self, working_dir: Union[Path, str]):
self.working_dir = working_dir

def repair(
self, pdb_file: Union[str, Path], remove_and_rename: bool = False
self,
pdb_file: Union[str, Path],
remove_and_rename: bool = False,
pH: float = 7.0,
remove_heteroatoms: bool = True,
) -> None:
"""
This method repairs a PDB file with FoldX, overwriting
Expand All @@ -77,10 +84,20 @@ def repair(
"--command=RepairPDB",
"--pdb",
f"{pdb_file.stem}.pdb",
"--water",
"-CRYSTAL",
"--pH",
f"{pH}",
]

# Running it in the working directory
subprocess.run(command, cwd=self.working_dir)
try:
subprocess.run(command, cwd=self.working_dir, check=True)
except subprocess.CalledProcessError as e:
raise RuntimeError(
f"FoldX failed to repair the pdb file {pdb_file}. "
f"Please check the working directory: {self.working_dir}. "
) from e

# Checking that the file was generated
repaired_pdb_file = self.working_dir / f"{pdb_file.stem}_Repair.pdb"
Expand All @@ -89,6 +106,19 @@ def repair(
f"Please check the working directory: {self.working_dir}. "
)

# If remove heteroatoms is True, we remove them
# using pdbtools
if remove_heteroatoms:
# We load up the repaired file
with open(repaired_pdb_file) as f:
lines = f.readlines()

deleting_heteroatoms_result = pdb_delhetatm_run(lines)

# We write the result to the same file
with open(repaired_pdb_file, "w") as f:
f.writelines(deleting_heteroatoms_result)

# Removing the old pdb file, and renaming the repaired one
if remove_and_rename:
shutil.rmtree(self.working_dir / f"{pdb_file.stem}.pdb")
Expand All @@ -97,6 +127,28 @@ def repair(
self.working_dir / f"{pdb_file.stem}.pdb",
)

def _repair_if_necessary_and_provide_path(self, pdb_file: Path) -> Path:
    """
    Return the path of a repaired version of ``pdb_file``.

    The lookup proceeds in this order:

    1. If ``<stem>_Repair.pdb`` already exists inside the working
       directory, a warning is logged and that path is returned
       without calling ``repair``.
    2. If the file name contains "_Repair", the input path is
       returned unchanged.
    3. Otherwise ``self.repair(pdb_file)`` is called and the path
       ``<working_dir>/<stem>_Repair.pdb`` is returned.
    """
    repaired_path = self.working_dir / f"{pdb_file.stem}_Repair.pdb"

    # The working directory is usually a cache, so a repaired copy
    # may already be sitting there; reuse it rather than repair again.
    if repaired_path.exists():
        logging.warning(
            f"Found a repaired pdb file in the cache for {pdb_file.stem}. Using it instead of repairing."
        )
        return repaired_path

    # Files that are not marked as repaired go through repair;
    # files already marked as repaired are handed back untouched.
    needs_repair = "_Repair" not in pdb_file.name
    if needs_repair:
        self.repair(pdb_file)
        return repaired_path

    return pdb_file

def _simulate_mutations(self, pdb_file: Path, mutations: List[str] = None) -> None:
"""
This method simulates mutations on a PDB file with FoldX.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,6 @@ def clean_pdb(pdb_input_filename: str, out_dir: str, reduce_executable: str):
first_model = _step_1_reduce(
reduce_executable, pdb_input_filename, pdbid, temp1
)

# Step 2: NonHetSelector filter
with tempfile.NamedTemporaryFile(mode="wt", delete=True) as temp2:
PDBIO.set_structure(first_model)
Expand All @@ -192,7 +191,7 @@ def clean_pdb(pdb_input_filename: str, out_dir: str, reduce_executable: str):

# Step 3: Replace altloc chars to " " and use pdbfixer
with tempfile.NamedTemporaryFile(mode="wt", delete=True) as temp3:
temp_3, fixer = _step_3_pdbfixer(first_model, temp3)
temp3, fixer = _step_3_pdbfixer(first_model, temp3)

# Step 4: Correct for pdbfixer not preserving insertion codes
with tempfile.NamedTemporaryFile(mode="wt", delete=True) as temp4:
Expand Down
2 changes: 1 addition & 1 deletion src/poli/objective_repository/foldx_sasa/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@ channels:
- conda-forge
dependencies:
- python=3.9
- gcc
- pip
- pip:
- biopython
- python-levenshtein
- numpy
- pdb-tools
- "git+https://github.com/MachineLearningLifeScience/poli.git@master"
24 changes: 16 additions & 8 deletions src/poli/objective_repository/foldx_sasa/register.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,16 +37,18 @@ def __init__(
alphabet: List[str] = None,
experiment_id: str = None,
tmp_folder: Path = None,
eager_repair: bool = False,
):
super().__init__(
info,
batch_size,
parallelize,
num_workers,
wildtype_pdb_path,
alphabet,
experiment_id,
tmp_folder,
info=info,
batch_size=batch_size,
parallelize=parallelize,
num_workers=num_workers,
wildtype_pdb_path=wildtype_pdb_path,
alphabet=alphabet,
experiment_id=experiment_id,
tmp_folder=tmp_folder,
eager_repair=eager_repair,
)

def _black_box(self, x: np.ndarray, context: None) -> np.ndarray:
Expand Down Expand Up @@ -121,6 +123,9 @@ def create(
num_workers: int = None,
wildtype_pdb_path: Union[Path, List[Path]] = None,
alphabet: List[str] = None,
experiment_id: str = None,
tmp_folder: Path = None,
eager_repair: bool = False,
) -> Tuple[AbstractBlackBox, np.ndarray, np.ndarray]:
"""
TODO: document
Expand Down Expand Up @@ -165,6 +170,9 @@ def create(
num_workers=num_workers,
wildtype_pdb_path=wildtype_pdb_path,
alphabet=alphabet,
experiment_id=experiment_id,
tmp_folder=tmp_folder,
eager_repair=eager_repair,
)

# We need to compute the initial values of all wildtypes
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@ channels:
- conda-forge
dependencies:
- python=3.9
- gcc
- pip
- pip:
- biopython
- python-levenshtein
- numpy
- pdb-tools
- "git+https://github.com/MachineLearningLifeScience/poli.git@master"
35 changes: 25 additions & 10 deletions src/poli/objective_repository/foldx_stability/register.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
This script registers FoldX stability as an objective function.
"""
from pathlib import Path
from typing import Dict, List, Tuple, Union
from typing import List, Tuple, Union

import numpy as np

Expand Down Expand Up @@ -46,16 +46,18 @@ def __init__(
alphabet: List[str] = None,
experiment_id: str = None,
tmp_folder: Path = None,
eager_repair: bool = False,
):
super().__init__(
info,
batch_size,
parallelize,
num_workers,
wildtype_pdb_path,
alphabet,
experiment_id,
tmp_folder,
info=info,
batch_size=batch_size,
parallelize=parallelize,
num_workers=num_workers,
wildtype_pdb_path=wildtype_pdb_path,
alphabet=alphabet,
experiment_id=experiment_id,
tmp_folder=tmp_folder,
eager_repair=eager_repair,
)

def _black_box(self, x: np.ndarray, context: None) -> np.ndarray:
Expand Down Expand Up @@ -129,6 +131,9 @@ def create(
num_workers: int = None,
wildtype_pdb_path: Union[Path, List[Path]] = None,
alphabet: List[str] = None,
experiment_id: str = None,
tmp_folder: Path = None,
eager_repair: bool = False,
) -> Tuple[AbstractBlackBox, np.ndarray, np.ndarray]:
seed_numpy(seed)
seed_python(seed)
Expand Down Expand Up @@ -169,14 +174,24 @@ def create(
num_workers=num_workers,
wildtype_pdb_path=wildtype_pdb_path,
alphabet=alphabet,
experiment_id=experiment_id,
tmp_folder=tmp_folder,
eager_repair=eager_repair,
)

# During the creation of the black box,
# we might have repaired the PDB files.
# Thus, we need to compute the initial
# values of all wildtypes in wildtype_pdb_path
# using the repaired PDB files instead.
repaired_wildtype_pdb_paths = f.wildtype_pdb_paths

# We need to compute the initial values of all wildtypes
# in wildtype_pdb_path. For this, we need to specify x0,
# a vector of wildtype sequences. These are padded to
# match the maximum length with empty strings.
wildtype_amino_acids_ = []
for pdb_file in wildtype_pdb_path:
for pdb_file in repaired_wildtype_pdb_paths:
wildtype_residues = parse_pdb_as_residues(pdb_file)
wildtype_amino_acids_.append(
[
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@ channels:
- conda-forge
dependencies:
- python=3.9
- gcc
- pip
- pip:
- biopython
- python-levenshtein
- numpy
- pdb-tools
- "git+https://github.com/MachineLearningLifeScience/poli.git@master"
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ def __init__(
alphabet: List[str] = None,
experiment_id: str = None,
tmp_folder: Path = None,
eager_repair: bool = False,
):
super().__init__(
info=info,
Expand All @@ -45,6 +46,7 @@ def __init__(
alphabet=alphabet,
experiment_id=experiment_id,
tmp_folder=tmp_folder,
eager_repair=eager_repair,
)

def _black_box(self, x: np.ndarray, context: None) -> np.ndarray:
Expand Down Expand Up @@ -121,6 +123,9 @@ def create(
num_workers: int = None,
wildtype_pdb_path: Union[Path, List[Path]] = None,
alphabet: List[str] = None,
experiment_id: str = None,
tmp_folder: Path = None,
eager_repair: bool = False,
) -> Tuple[AbstractBlackBox, np.ndarray, np.ndarray]:
seed_numpy(seed)
seed_python(seed)
Expand Down Expand Up @@ -160,6 +165,9 @@ def create(
num_workers=num_workers,
wildtype_pdb_path=wildtype_pdb_path,
alphabet=alphabet,
experiment_id=experiment_id,
tmp_folder=tmp_folder,
eager_repair=eager_repair,
)

# We need to compute the initial values of all wildtypes
Expand Down
Loading