diff --git a/src/poli/core/proteins/foldx_black_box.py b/src/poli/core/proteins/foldx_black_box.py index 58bfb04a..3bf32a25 100644 --- a/src/poli/core/proteins/foldx_black_box.py +++ b/src/poli/core/proteins/foldx_black_box.py @@ -10,6 +10,7 @@ parse_pdb_as_residue_strings, parse_pdb_as_residues, ) +from poli.core.util.proteins.foldx import FoldxInterface # This is the folder where all the files # generated by FoldX will be stored. @@ -31,6 +32,7 @@ def __init__( alphabet: List[str] = None, experiment_id: str = None, tmp_folder: Path = None, + eager_repair: bool = False, ): """ TODO: Document @@ -52,6 +54,13 @@ def __init__( num_workers=num_workers, ) + # Defining the experiment id + if experiment_id is None: + experiment_id = f"{int(time())}_{str(uuid4())[:8]}" + self.experiment_id = experiment_id + + self.tmp_folder = tmp_folder if tmp_folder is not None else DEFAULT_TMP_PATH + if alphabet is None: alphabet = info.alphabet @@ -61,26 +70,53 @@ def __init__( if isinstance(wildtype_pdb_path, Path): wildtype_pdb_path = [wildtype_pdb_path] - self.wildtype_pdb_paths = wildtype_pdb_path + if isinstance(wildtype_pdb_path, list): + _wildtype_pdb_path = [] + for pdb_file in wildtype_pdb_path: + if isinstance(pdb_file, str): + pdb_file = Path(pdb_file.strip()) + assert isinstance( + pdb_file, Path + ), f"Expected a Path object or a string, but got {type(pdb_file)}." + _wildtype_pdb_path.append(pdb_file) + + wildtype_pdb_path = _wildtype_pdb_path + + # At this point, wildtype_pdb_path is a list of Path objects. + # We need to ensure that these are repaired pdb files. + # We do this by creating a temporary folder and repairing + # the pdbs there. + if eager_repair: + path_for_repairing_pdbs = self.tmp_folder / "foldx_tmp_files_for_repair" + path_for_repairing_pdbs.mkdir(exist_ok=True, parents=True) + foldx_interface_for_repairing = FoldxInterface(path_for_repairing_pdbs) + + # Re-writing wildtype_pdb_path to be the list of repaired pdb files. + repaired_wildtype_pdb_files = [ + foldx_interface_for_repairing._repair_if_necessary_and_provide_path( + pdb_file + ) + for pdb_file in wildtype_pdb_path + ] + + # At this point, wildtype_pdb_path is a list of Path objects. + self.wildtype_pdb_paths = repaired_wildtype_pdb_files + else: + self.wildtype_pdb_paths = wildtype_pdb_path self.wildtype_resiudes = [ - parse_pdb_as_residues(pdb_file) for pdb_file in wildtype_pdb_path + parse_pdb_as_residues(pdb_file) for pdb_file in self.wildtype_pdb_paths ] self.wildtype_amino_acids = [ - parse_pdb_as_residue_strings(pdb_file) for pdb_file in wildtype_pdb_path + parse_pdb_as_residue_strings(pdb_file) + for pdb_file in self.wildtype_pdb_paths ] self.wildtype_residue_strings = [ "".join(amino_acids) for amino_acids in self.wildtype_amino_acids ] - if experiment_id is None: - experiment_id = f"{int(time())}_{str(uuid4())[:8]}" - self.experiment_id = experiment_id - - self.tmp_folder = tmp_folder if tmp_folder is not None else DEFAULT_TMP_PATH - def create_working_directory(self) -> Path: """ TODO: document. diff --git a/src/poli/core/util/proteins/foldx.py b/src/poli/core/util/proteins/foldx.py index 125ddea2..ed545aed 100644 --- a/src/poli/core/util/proteins/foldx.py +++ b/src/poli/core/util/proteins/foldx.py @@ -8,11 +8,14 @@ import shutil import subprocess import os +import logging from Bio.PDB.Residue import Residue from Bio.PDB import SASA from Bio.SeqUtils import seq1 +from pdbtools.pdb_delhetatm import run as pdb_delhetatm_run + from poli.core.util.proteins.mutations import ( mutations_from_wildtype_residues_and_mutant, ) @@ -57,7 +60,11 @@ def __init__(self, working_dir: Union[Path, str]): self.working_dir = working_dir def repair( - self, pdb_file: Union[str, Path], remove_and_rename: bool = False + self, + pdb_file: Union[str, Path], + remove_and_rename: bool = False, + pH: float = 7.0, + remove_heteroatoms: bool = True, ) -> None: """ This method repairs a PDB file with FoldX, overwriting @@ -77,10 +84,20 @@ def repair( "--command=RepairPDB", "--pdb", f"{pdb_file.stem}.pdb", + "--water", + "-CRYSTAL", + "--pH", + f"{pH}", ] # Running it in the working directory - subprocess.run(command, cwd=self.working_dir) + try: + subprocess.run(command, cwd=self.working_dir, check=True) + except subprocess.CalledProcessError as e: + raise RuntimeError( + f"FoldX failed to repair the pdb file {pdb_file}. " + f"Please check the working directory: {self.working_dir}. " + ) from e # Checking that the file was generated repaired_pdb_file = self.working_dir / f"{pdb_file.stem}_Repair.pdb" @@ -89,6 +106,19 @@ def repair( f"Please check the working directory: {self.working_dir}. " ) + # If remove heteroatoms is True, we remove them + # using pdbtools + if remove_heteroatoms: + # We load up the repaired file + with open(repaired_pdb_file) as f: + lines = f.readlines() + + deleting_heteroatoms_result = pdb_delhetatm_run(lines) + + # We write the result to the same file + with open(repaired_pdb_file, "w") as f: + f.writelines(deleting_heteroatoms_result) + # Removing the old pdb file, and renaming the repaired one if remove_and_rename: shutil.rmtree(self.working_dir / f"{pdb_file.stem}.pdb") @@ -97,6 +127,28 @@ def repair( self.working_dir / f"{pdb_file.stem}.pdb", ) + def _repair_if_necessary_and_provide_path(self, pdb_file: Path) -> Path: + """ + If the pdb_file's name doesn't end in "_Repair.pdb", + then we repair it and return the path of the repaired + pdb. Otherwise, we return the same path as the input. + """ + # Make sure that we don't have a repaired pdb file + # in the working directory (which is usually a cache) + if (self.working_dir / f"{pdb_file.stem}_Repair.pdb").exists(): + logging.warning( + f"Found a repaired pdb file in the cache for {pdb_file.stem}. Using it instead of repairing." + ) + return self.working_dir / f"{pdb_file.stem}_Repair.pdb" + + # If the file's already fixed, then we don't need to + # do anything. Else, we repair it. + if "_Repair" in pdb_file.name: + return pdb_file + else: + self.repair(pdb_file) + return self.working_dir / f"{pdb_file.stem}_Repair.pdb" + def _simulate_mutations(self, pdb_file: Path, mutations: List[str] = None) -> None: """ This method simulates mutations on a PDB file with FoldX. diff --git a/src/poli/core/util/proteins/rasp/inner_rasp/pdb_parser_scripts/clean_pdb.py b/src/poli/core/util/proteins/rasp/inner_rasp/pdb_parser_scripts/clean_pdb.py index 7a2ca7bf..a9a4e0d0 100644 --- a/src/poli/core/util/proteins/rasp/inner_rasp/pdb_parser_scripts/clean_pdb.py +++ b/src/poli/core/util/proteins/rasp/inner_rasp/pdb_parser_scripts/clean_pdb.py @@ -182,7 +182,6 @@ def clean_pdb(pdb_input_filename: str, out_dir: str, reduce_executable: str): first_model = _step_1_reduce( reduce_executable, pdb_input_filename, pdbid, temp1 ) - # Step 2: NonHetSelector filter with tempfile.NamedTemporaryFile(mode="wt", delete=True) as temp2: PDBIO.set_structure(first_model) @@ -192,7 +191,7 @@ def clean_pdb(pdb_input_filename: str, out_dir: str, reduce_executable: str): # Step 3: Replace altloc chars to " " and use pdbfixer with tempfile.NamedTemporaryFile(mode="wt", delete=True) as temp3: - temp_3, fixer = _step_3_pdbfixer(first_model, temp3) + temp3, fixer = _step_3_pdbfixer(first_model, temp3) # Step 4: Correct for pdbfixer not preserving insertion codes with tempfile.NamedTemporaryFile(mode="wt", delete=True) as temp4: diff --git a/src/poli/objective_repository/foldx_sasa/environment.yml b/src/poli/objective_repository/foldx_sasa/environment.yml index 472e4bd3..2af88bae 100644 --- a/src/poli/objective_repository/foldx_sasa/environment.yml +++ b/src/poli/objective_repository/foldx_sasa/environment.yml @@ -4,10 +4,10 @@ channels: - conda-forge dependencies: - python=3.9 - - gcc - pip - pip: - biopython - python-levenshtein - numpy + - pdb-tools - "git+https://github.com/MachineLearningLifeScience/poli.git@master" diff --git a/src/poli/objective_repository/foldx_sasa/register.py b/src/poli/objective_repository/foldx_sasa/register.py index cc2ecb78..b0ff9e53 100644 --- a/src/poli/objective_repository/foldx_sasa/register.py +++ b/src/poli/objective_repository/foldx_sasa/register.py @@ -37,16 +37,18 @@ def __init__( alphabet: List[str] = None, experiment_id: str = None, tmp_folder: Path = None, + eager_repair: bool = False, ): super().__init__( - info, - batch_size, - parallelize, - num_workers, - wildtype_pdb_path, - alphabet, - experiment_id, - tmp_folder, + info=info, + batch_size=batch_size, + parallelize=parallelize, + num_workers=num_workers, + wildtype_pdb_path=wildtype_pdb_path, + alphabet=alphabet, + experiment_id=experiment_id, + tmp_folder=tmp_folder, + eager_repair=eager_repair, ) def _black_box(self, x: np.ndarray, context: None) -> np.ndarray: @@ -121,6 +123,9 @@ def create( num_workers: int = None, wildtype_pdb_path: Union[Path, List[Path]] = None, alphabet: List[str] = None, + experiment_id: str = None, + tmp_folder: Path = None, + eager_repair: bool = False, ) -> Tuple[AbstractBlackBox, np.ndarray, np.ndarray]: """ TODO: document @@ -165,6 +170,9 @@ def create( num_workers=num_workers, wildtype_pdb_path=wildtype_pdb_path, alphabet=alphabet, + experiment_id=experiment_id, + tmp_folder=tmp_folder, + eager_repair=eager_repair, ) # We need to compute the initial values of all wildtypes diff --git a/src/poli/objective_repository/foldx_stability/environment.yml b/src/poli/objective_repository/foldx_stability/environment.yml index 472e4bd3..2af88bae 100644 --- a/src/poli/objective_repository/foldx_stability/environment.yml +++ b/src/poli/objective_repository/foldx_stability/environment.yml @@ -4,10 +4,10 @@ channels: - conda-forge dependencies: - python=3.9 - - gcc - pip - pip: - biopython - python-levenshtein - numpy + - pdb-tools - "git+https://github.com/MachineLearningLifeScience/poli.git@master" diff --git a/src/poli/objective_repository/foldx_stability/register.py b/src/poli/objective_repository/foldx_stability/register.py index 49996768..c081548c 100644 --- a/src/poli/objective_repository/foldx_stability/register.py +++ b/src/poli/objective_repository/foldx_stability/register.py @@ -2,7 +2,7 @@ This script registers FoldX stability as an objective function. """ from pathlib import Path -from typing import Dict, List, Tuple, Union +from typing import List, Tuple, Union import numpy as np @@ -46,16 +46,18 @@ def __init__( alphabet: List[str] = None, experiment_id: str = None, tmp_folder: Path = None, + eager_repair: bool = False, ): super().__init__( - info, - batch_size, - parallelize, - num_workers, - wildtype_pdb_path, - alphabet, - experiment_id, - tmp_folder, + info=info, + batch_size=batch_size, + parallelize=parallelize, + num_workers=num_workers, + wildtype_pdb_path=wildtype_pdb_path, + alphabet=alphabet, + experiment_id=experiment_id, + tmp_folder=tmp_folder, + eager_repair=eager_repair, ) def _black_box(self, x: np.ndarray, context: None) -> np.ndarray: @@ -129,6 +131,9 @@ def create( num_workers: int = None, wildtype_pdb_path: Union[Path, List[Path]] = None, alphabet: List[str] = None, + experiment_id: str = None, + tmp_folder: Path = None, + eager_repair: bool = False, ) -> Tuple[AbstractBlackBox, np.ndarray, np.ndarray]: seed_numpy(seed) seed_python(seed) @@ -169,14 +174,24 @@ def create( num_workers=num_workers, wildtype_pdb_path=wildtype_pdb_path, alphabet=alphabet, + experiment_id=experiment_id, + tmp_folder=tmp_folder, + eager_repair=eager_repair, ) + # During the creation of the black box, + # we might have repaired the PDB files. + # Thus, we need to compute the initial + # values of all wildtypes in wildtype_pdb_path + # using the repaired PDB files instead. + repaired_wildtype_pdb_paths = f.wildtype_pdb_paths + # We need to compute the initial values of all wildtypes # in wildtype_pdb_path. For this, we need to specify x0, # a vector of wildtype sequences. These are padded to # match the maximum length with empty strings. wildtype_amino_acids_ = [] - for pdb_file in wildtype_pdb_path: + for pdb_file in repaired_wildtype_pdb_paths: wildtype_residues = parse_pdb_as_residues(pdb_file) wildtype_amino_acids_.append( [ diff --git a/src/poli/objective_repository/foldx_stability_and_sasa/environment.yml b/src/poli/objective_repository/foldx_stability_and_sasa/environment.yml index 472e4bd3..2af88bae 100644 --- a/src/poli/objective_repository/foldx_stability_and_sasa/environment.yml +++ b/src/poli/objective_repository/foldx_stability_and_sasa/environment.yml @@ -4,10 +4,10 @@ channels: - conda-forge dependencies: - python=3.9 - - gcc - pip - pip: - biopython - python-levenshtein - numpy + - pdb-tools - "git+https://github.com/MachineLearningLifeScience/poli.git@master" diff --git a/src/poli/objective_repository/foldx_stability_and_sasa/register.py b/src/poli/objective_repository/foldx_stability_and_sasa/register.py index a8e08dc3..af81e7dd 100644 --- a/src/poli/objective_repository/foldx_stability_and_sasa/register.py +++ b/src/poli/objective_repository/foldx_stability_and_sasa/register.py @@ -35,6 +35,7 @@ def __init__( alphabet: List[str] = None, experiment_id: str = None, tmp_folder: Path = None, + eager_repair: bool = False, ): super().__init__( info=info, @@ -45,6 +46,7 @@ def __init__( alphabet=alphabet, experiment_id=experiment_id, tmp_folder=tmp_folder, + eager_repair=eager_repair, ) def _black_box(self, x: np.ndarray, context: None) -> np.ndarray: @@ -121,6 +123,9 @@ def create( num_workers: int = None, wildtype_pdb_path: Union[Path, List[Path]] = None, alphabet: List[str] = None, + experiment_id: str = None, + tmp_folder: Path = None, + eager_repair: bool = False, ) -> Tuple[AbstractBlackBox, np.ndarray, np.ndarray]: seed_numpy(seed) seed_python(seed) @@ -160,6 +165,9 @@ def create( num_workers=num_workers, wildtype_pdb_path=wildtype_pdb_path, alphabet=alphabet, + experiment_id=experiment_id, + tmp_folder=tmp_folder, + eager_repair=eager_repair, ) # We need to compute the initial values of all wildtypes diff --git a/src/poli/objective_repository/rasp/environment.yml b/src/poli/objective_repository/rasp/environment.yml index 71fe85a8..c9e5df0c 100644 --- a/src/poli/objective_repository/rasp/environment.yml +++ b/src/poli/objective_repository/rasp/environment.yml @@ -13,7 +13,6 @@ dependencies: - clang - cmake - make - - gcc - pip - pip: - "git+https://github.com/MachineLearningLifeScience/poli.git@dev" diff --git a/src/poli/tests/registry/proteins/test_foldx.py b/src/poli/tests/registry/proteins/test_foldx.py index a5737656..c9745bf1 100644 --- a/src/poli/tests/registry/proteins/test_foldx.py +++ b/src/poli/tests/registry/proteins/test_foldx.py @@ -7,6 +7,17 @@ THIS_DIR = Path(__file__).parent.resolve() +HOME_DIR = Path().home().resolve() +PATH_TO_FOLDX_FILES = HOME_DIR / "foldx" +if not PATH_TO_FOLDX_FILES.exists(): + pytest.skip("FoldX is not installed. ", allow_module_level=True) + +if not (PATH_TO_FOLDX_FILES / "foldx").exists(): + pytest.skip("FoldX is not compiled. ", allow_module_level=True) + +if not (PATH_TO_FOLDX_FILES / "rotabase.txt").exists(): + pytest.skip("rotabase.txt is not in the foldx directory. ", allow_module_level=True) + def test_foldx_stability_is_available(): """ @@ -148,17 +159,6 @@ def test_registering_foldx_stability_and_sasa(): Testing whether we can register the logp problem if biopython and python-levenshtein are installed. """ - HOME_DIR = Path().home().resolve() - PATH_TO_FOLDX_FILES = HOME_DIR / "foldx" - if not PATH_TO_FOLDX_FILES.exists(): - pytest.skip("FoldX is not installed. ") - - if not (PATH_TO_FOLDX_FILES / "foldx").exists(): - pytest.skip("FoldX is not compiled. ") - - if not (PATH_TO_FOLDX_FILES / "rotabase.txt").exists(): - pytest.skip("rotabase.txt is not in the foldx directory. ") - _ = pytest.importorskip("Bio") _ = pytest.importorskip("Levenshtein") @@ -169,3 +169,40 @@ def test_registering_foldx_stability_and_sasa(): assert np.isclose(y0[:, 0], 32.4896).all() assert np.isclose(y0[:, 1], 8411.45578009).all() + + +def test_foldx_from_non_repaired_file(): + """ + In this test, we check whether foldx properly + repairs a file if it doesn't contain _Repair. + + TODO: mock the behavior of the repair function + inside the foldx interface. Otherwise, this test + takes 4min to run. + """ + wildtype_pdb_path = THIS_DIR / "3ned.pdb" + _, f, _, y0, _ = objective_factory.create( + name="foldx_stability", + wildtype_pdb_path=wildtype_pdb_path, + eager_repair=True, + ) + + assert np.isclose(y0, 32.6135).all() + + +def test_foldx_from_repaired_file(): + """ + In this test, we check whether no repair is + performed if the file already contains _Repair. + """ + wildtype_pdb_path = THIS_DIR / "101m_Repair.pdb" + _, f, _, y0, _ = objective_factory.create( + name="foldx_stability", + wildtype_pdb_path=wildtype_pdb_path, + ) + + assert np.isclose(y0, 32.4896).all() + + +if __name__ == "__main__": + test_foldx_from_non_repaired_file() diff --git a/src/poli/tests/registry/proteins/test_rasp.py b/src/poli/tests/registry/proteins/test_rasp.py index a301a731..23fb942b 100644 --- a/src/poli/tests/registry/proteins/test_rasp.py +++ b/src/poli/tests/registry/proteins/test_rasp.py @@ -52,8 +52,6 @@ def test_rasp_on_3ned_against_notebooks_results_isolated(): """ We test forceful registration of the RaSP problem. """ - # If the previous import was successful, we can - # create a RaSP problem: _, f, x0, _, _ = objective_factory.create( name="rasp", wildtype_pdb_path=THIS_DIR / "3ned.pdb",