MachineLearningLifeScience · miguelgondu · Nov 6, 2023 · Nov 2, 2023 · Nov 6, 2023
diff --git a/src/poli/core/proteins/foldx_black_box.py b/src/poli/core/proteins/foldx_black_box.py
@@ -10,6 +10,7 @@
     parse_pdb_as_residue_strings,
     parse_pdb_as_residues,
 )
+from poli.core.util.proteins.foldx import FoldxInterface
 
 # This is the folder where all the files
 # generated by FoldX will be stored.
@@ -31,6 +32,7 @@ def __init__(
         alphabet: List[str] = None,
         experiment_id: str = None,
         tmp_folder: Path = None,
+        eager_repair: bool = False,
     ):
         """
         TODO: Document
@@ -52,6 +54,13 @@ def __init__(
             num_workers=num_workers,
         )
 
+        # Defining the experiment id
+        if experiment_id is None:
+            experiment_id = f"{int(time())}_{str(uuid4())[:8]}"
+        self.experiment_id = experiment_id
+
+        self.tmp_folder = tmp_folder if tmp_folder is not None else DEFAULT_TMP_PATH
+
         if alphabet is None:
             alphabet = info.alphabet
 
@@ -61,26 +70,53 @@ def __init__(
         if isinstance(wildtype_pdb_path, Path):
             wildtype_pdb_path = [wildtype_pdb_path]
 
-        self.wildtype_pdb_paths = wildtype_pdb_path
+        if isinstance(wildtype_pdb_path, list):
+            _wildtype_pdb_path = []
+            for pdb_file in wildtype_pdb_path:
+                if isinstance(pdb_file, str):
+                    pdb_file = Path(pdb_file.strip())
+                assert isinstance(
+                    pdb_file, Path
+                ), f"Expected a Path object or a string, but got {type(pdb_file)}."
+                _wildtype_pdb_path.append(pdb_file)
+
+            wildtype_pdb_path = _wildtype_pdb_path
+
+        # At this point, wildtype_pdb_path is a list of Path objects.
+        # We need to ensure that these are repaired pdb files.
+        # We do this by creating a temporary folder and repairing
+        # the pdbs there.
+        if eager_repair:
+            path_for_repairing_pdbs = self.tmp_folder / "foldx_tmp_files_for_repair"
+            path_for_repairing_pdbs.mkdir(exist_ok=True, parents=True)
+            foldx_interface_for_repairing = FoldxInterface(path_for_repairing_pdbs)
+
+            # Re-writing wildtype_pdb_path to be the list of repaired pdb files.
+            repaired_wildtype_pdb_files = [
+                foldx_interface_for_repairing._repair_if_necessary_and_provide_path(
+                    pdb_file
+                )
+                for pdb_file in wildtype_pdb_path
+            ]
+
+            # At this point, wildtype_pdb_path is a list of Path objects.
+            self.wildtype_pdb_paths = repaired_wildtype_pdb_files
+        else:
+            self.wildtype_pdb_paths = wildtype_pdb_path
 
         self.wildtype_resiudes = [
-            parse_pdb_as_residues(pdb_file) for pdb_file in wildtype_pdb_path
+            parse_pdb_as_residues(pdb_file) for pdb_file in self.wildtype_pdb_paths
         ]
 
         self.wildtype_amino_acids = [
-            parse_pdb_as_residue_strings(pdb_file) for pdb_file in wildtype_pdb_path
+            parse_pdb_as_residue_strings(pdb_file)
+            for pdb_file in self.wildtype_pdb_paths
         ]
 
         self.wildtype_residue_strings = [
             "".join(amino_acids) for amino_acids in self.wildtype_amino_acids
         ]
 
-        if experiment_id is None:
-            experiment_id = f"{int(time())}_{str(uuid4())[:8]}"
-        self.experiment_id = experiment_id
-
-        self.tmp_folder = tmp_folder if tmp_folder is not None else DEFAULT_TMP_PATH
-
     def create_working_directory(self) -> Path:
         """
         TODO: document.

diff --git a/src/poli/core/util/proteins/foldx.py b/src/poli/core/util/proteins/foldx.py
@@ -8,11 +8,14 @@
 import shutil
 import subprocess
 import os
+import logging
 
 from Bio.PDB.Residue import Residue
 from Bio.PDB import SASA
 from Bio.SeqUtils import seq1
 
+from pdbtools.pdb_delhetatm import run as pdb_delhetatm_run
+
 from poli.core.util.proteins.mutations import (
     mutations_from_wildtype_residues_and_mutant,
 )
@@ -57,7 +60,11 @@ def __init__(self, working_dir: Union[Path, str]):
         self.working_dir = working_dir
 
     def repair(
-        self, pdb_file: Union[str, Path], remove_and_rename: bool = False
+        self,
+        pdb_file: Union[str, Path],
+        remove_and_rename: bool = False,
+        pH: float = 7.0,
+        remove_heteroatoms: bool = True,
     ) -> None:
         """
         This method repairs a PDB file with FoldX, overwriting
@@ -77,10 +84,20 @@ def repair(
             "--command=RepairPDB",
             "--pdb",
             f"{pdb_file.stem}.pdb",
+            "--water",
+            "-CRYSTAL",
+            "--pH",
+            f"{pH}",
         ]
 
         # Running it in the working directory
-        subprocess.run(command, cwd=self.working_dir)
+        try:
+            subprocess.run(command, cwd=self.working_dir, check=True)
+        except subprocess.CalledProcessError as e:
+            raise RuntimeError(
+                f"FoldX failed to repair the pdb file {pdb_file}. "
+                f"Please check the working directory: {self.working_dir}. "
+            ) from e
 
         # Checking that the file was generated
         repaired_pdb_file = self.working_dir / f"{pdb_file.stem}_Repair.pdb"
@@ -89,6 +106,19 @@ def repair(
             f"Please check the working directory: {self.working_dir}. "
         )
 
+        # If remove heteroatoms is True, we remove them
+        # using pdbtools
+        if remove_heteroatoms:
+            # We load up the repaired file
+            with open(repaired_pdb_file) as f:
+                lines = f.readlines()
+
+            deleting_heteroatoms_result = pdb_delhetatm_run(lines)
+
+            # We write the result to the same file
+            with open(repaired_pdb_file, "w") as f:
+                f.writelines(deleting_heteroatoms_result)
+
         # Removing the old pdb file, and renaming the repaired one
         if remove_and_rename:
             shutil.rmtree(self.working_dir / f"{pdb_file.stem}.pdb")
@@ -97,6 +127,28 @@ def repair(
                 self.working_dir / f"{pdb_file.stem}.pdb",
             )
 
+    def _repair_if_necessary_and_provide_path(self, pdb_file: Path) -> Path:
+        """
+        If the pdb_file's name doesn't end in "_Repair.pdb",
+        then we repair it and return the path of the repaired
+        pdb. Otherwise, we return the same path as the input.
+        """
+        # Make sure that we don't have a repaired pdb file
+        # in the working directory (which is usually a cache)
+        if (self.working_dir / f"{pdb_file.stem}_Repair.pdb").exists():
+            logging.warning(
+                f"Found a repaired pdb file in the cache for {pdb_file.stem}. Using it instead of repairing."
+            )
+            return self.working_dir / f"{pdb_file.stem}_Repair.pdb"
+
+        # If the file's already fixed, then we don't need to
+        # do anything. Else, we repair it.
+        if "_Repair" in pdb_file.name:
+            return pdb_file
+        else:
+            self.repair(pdb_file)
+            return self.working_dir / f"{pdb_file.stem}_Repair.pdb"
+
     def _simulate_mutations(self, pdb_file: Path, mutations: List[str] = None) -> None:
         """
         This method simulates mutations on a PDB file with FoldX.

diff --git a/src/poli/core/util/proteins/rasp/inner_rasp/pdb_parser_scripts/clean_pdb.py b/src/poli/core/util/proteins/rasp/inner_rasp/pdb_parser_scripts/clean_pdb.py
@@ -182,7 +182,6 @@ def clean_pdb(pdb_input_filename: str, out_dir: str, reduce_executable: str):
         first_model = _step_1_reduce(
             reduce_executable, pdb_input_filename, pdbid, temp1
         )
-
         # Step 2: NonHetSelector filter
         with tempfile.NamedTemporaryFile(mode="wt", delete=True) as temp2:
             PDBIO.set_structure(first_model)
@@ -192,7 +191,7 @@ def clean_pdb(pdb_input_filename: str, out_dir: str, reduce_executable: str):
 
             # Step 3: Replace altloc chars to " " and use pdbfixer
             with tempfile.NamedTemporaryFile(mode="wt", delete=True) as temp3:
-                temp_3, fixer = _step_3_pdbfixer(first_model, temp3)
+                temp3, fixer = _step_3_pdbfixer(first_model, temp3)
 
                 # Step 4: Correct for pdbfixer not preserving insertion codes
                 with tempfile.NamedTemporaryFile(mode="wt", delete=True) as temp4:

diff --git a/src/poli/objective_repository/foldx_sasa/environment.yml b/src/poli/objective_repository/foldx_sasa/environment.yml
@@ -4,10 +4,10 @@ channels:
   - conda-forge
 dependencies:
   - python=3.9
-  - gcc
   - pip
   - pip:
     - biopython
     - python-levenshtein
     - numpy
+    - pdb-tools
     - "git+https://github.com/MachineLearningLifeScience/poli.git@master"
diff --git a/src/poli/objective_repository/foldx_sasa/register.py b/src/poli/objective_repository/foldx_sasa/register.py
@@ -37,16 +37,18 @@ def __init__(
         alphabet: List[str] = None,
         experiment_id: str = None,
         tmp_folder: Path = None,
+        eager_repair: bool = False,
     ):
         super().__init__(
-            info,
-            batch_size,
-            parallelize,
-            num_workers,
-            wildtype_pdb_path,
-            alphabet,
-            experiment_id,
-            tmp_folder,
+            info=info,
+            batch_size=batch_size,
+            parallelize=parallelize,
+            num_workers=num_workers,
+            wildtype_pdb_path=wildtype_pdb_path,
+            alphabet=alphabet,
+            experiment_id=experiment_id,
+            tmp_folder=tmp_folder,
+            eager_repair=eager_repair,
         )
 
     def _black_box(self, x: np.ndarray, context: None) -> np.ndarray:
@@ -121,6 +123,9 @@ def create(
         num_workers: int = None,
         wildtype_pdb_path: Union[Path, List[Path]] = None,
         alphabet: List[str] = None,
+        experiment_id: str = None,
+        tmp_folder: Path = None,
+        eager_repair: bool = False,
     ) -> Tuple[AbstractBlackBox, np.ndarray, np.ndarray]:
         """
         TODO: document
@@ -165,6 +170,9 @@ def create(
             num_workers=num_workers,
             wildtype_pdb_path=wildtype_pdb_path,
             alphabet=alphabet,
+            experiment_id=experiment_id,
+            tmp_folder=tmp_folder,
+            eager_repair=eager_repair,
         )
 
         # We need to compute the initial values of all wildtypes

diff --git a/src/poli/objective_repository/foldx_stability/environment.yml b/src/poli/objective_repository/foldx_stability/environment.yml
@@ -4,10 +4,10 @@ channels:
   - conda-forge
 dependencies:
   - python=3.9
-  - gcc
   - pip
   - pip:
     - biopython
     - python-levenshtein
     - numpy
+    - pdb-tools
     - "git+https://github.com/MachineLearningLifeScience/poli.git@master"
diff --git a/src/poli/objective_repository/foldx_stability/register.py b/src/poli/objective_repository/foldx_stability/register.py
@@ -2,7 +2,7 @@
 This script registers FoldX stability as an objective function.
 """
 from pathlib import Path
-from typing import Dict, List, Tuple, Union
+from typing import List, Tuple, Union
 
 import numpy as np
 
@@ -46,16 +46,18 @@ def __init__(
         alphabet: List[str] = None,
         experiment_id: str = None,
         tmp_folder: Path = None,
+        eager_repair: bool = False,
     ):
         super().__init__(
-            info,
-            batch_size,
-            parallelize,
-            num_workers,
-            wildtype_pdb_path,
-            alphabet,
-            experiment_id,
-            tmp_folder,
+            info=info,
+            batch_size=batch_size,
+            parallelize=parallelize,
+            num_workers=num_workers,
+            wildtype_pdb_path=wildtype_pdb_path,
+            alphabet=alphabet,
+            experiment_id=experiment_id,
+            tmp_folder=tmp_folder,
+            eager_repair=eager_repair,
         )
 
     def _black_box(self, x: np.ndarray, context: None) -> np.ndarray:
@@ -129,6 +131,9 @@ def create(
         num_workers: int = None,
         wildtype_pdb_path: Union[Path, List[Path]] = None,
         alphabet: List[str] = None,
+        experiment_id: str = None,
+        tmp_folder: Path = None,
+        eager_repair: bool = False,
     ) -> Tuple[AbstractBlackBox, np.ndarray, np.ndarray]:
         seed_numpy(seed)
         seed_python(seed)
@@ -169,14 +174,24 @@ def create(
             num_workers=num_workers,
             wildtype_pdb_path=wildtype_pdb_path,
             alphabet=alphabet,
+            experiment_id=experiment_id,
+            tmp_folder=tmp_folder,
+            eager_repair=eager_repair,
         )
 
+        # During the creation of the black box,
+        # we might have repaired the PDB files.
+        # Thus, we need to compute the initial
+        # values of all wildtypes in wildtype_pdb_path
+        # using the repaired PDB files instead.
+        repaired_wildtype_pdb_paths = f.wildtype_pdb_paths
+
         # We need to compute the initial values of all wildtypes
         # in wildtype_pdb_path. For this, we need to specify x0,
         # a vector of wildtype sequences. These are padded to
         # match the maximum length with empty strings.
         wildtype_amino_acids_ = []
-        for pdb_file in wildtype_pdb_path:
+        for pdb_file in repaired_wildtype_pdb_paths:
             wildtype_residues = parse_pdb_as_residues(pdb_file)
             wildtype_amino_acids_.append(
                 [

diff --git a/src/poli/objective_repository/foldx_stability_and_sasa/environment.yml b/src/poli/objective_repository/foldx_stability_and_sasa/environment.yml
@@ -4,10 +4,10 @@ channels:
   - conda-forge
 dependencies:
   - python=3.9
-  - gcc
   - pip
   - pip:
     - biopython
     - python-levenshtein
     - numpy
+    - pdb-tools
     - "git+https://github.com/MachineLearningLifeScience/poli.git@master"
diff --git a/src/poli/objective_repository/foldx_stability_and_sasa/register.py b/src/poli/objective_repository/foldx_stability_and_sasa/register.py
@@ -35,6 +35,7 @@ def __init__(
         alphabet: List[str] = None,
         experiment_id: str = None,
         tmp_folder: Path = None,
+        eager_repair: bool = False,
     ):
         super().__init__(
             info=info,
@@ -45,6 +46,7 @@ def __init__(
             alphabet=alphabet,
             experiment_id=experiment_id,
             tmp_folder=tmp_folder,
+            eager_repair=eager_repair,
         )
 
     def _black_box(self, x: np.ndarray, context: None) -> np.ndarray:
@@ -121,6 +123,9 @@ def create(
         num_workers: int = None,
         wildtype_pdb_path: Union[Path, List[Path]] = None,
         alphabet: List[str] = None,
+        experiment_id: str = None,
+        tmp_folder: Path = None,
+        eager_repair: bool = False,
     ) -> Tuple[AbstractBlackBox, np.ndarray, np.ndarray]:
         seed_numpy(seed)
         seed_python(seed)
@@ -160,6 +165,9 @@ def create(
             num_workers=num_workers,
             wildtype_pdb_path=wildtype_pdb_path,
             alphabet=alphabet,
+            experiment_id=experiment_id,
+            tmp_folder=tmp_folder,
+            eager_repair=eager_repair,
         )
 
         # We need to compute the initial values of all wildtypes