Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions src/poli/objective_repository/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,18 @@
pass


try:
    from .rfp_foldx_stability_and_sasa.register import (
        RFPFoldXStabilityAndSASAProblemFactory,
    )
except (ImportError, FileNotFoundError):
    # Best-effort registration: when the problem's register module cannot be
    # imported here, the problem is simply left out of the registry.
    pass
else:
    # Only reached when the import succeeded; keeps the guarded region
    # limited to the import itself.
    AVAILABLE_PROBLEM_FACTORIES[
        "rfp_foldx_stability_and_sasa"
    ] = RFPFoldXStabilityAndSASAProblemFactory


try:
from .penalized_logp_lambo.register import PenalizedLogPLamboProblemFactory

Expand Down
72 changes: 4 additions & 68 deletions src/poli/objective_repository/foldx_stability_and_sasa/register.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
"""
from pathlib import Path
from typing import List, Tuple, Union
import warnings

import numpy as np

Expand Down Expand Up @@ -205,8 +204,6 @@ def create(
parallelize: bool = False,
num_workers: int = None,
evaluation_budget: int = float("inf"),
n_starting_points: int = None,
strict: bool = False,
) -> Tuple[AbstractBlackBox, np.ndarray, np.ndarray]:
"""
Create a FoldXSASABlackBox object and compute the initial values of wildtypes.
Expand Down Expand Up @@ -235,11 +232,7 @@ def create(
Number of worker processes for parallel computation.
evaluation_budget: int, optional
The maximum number of function evaluations. Default is infinity.
n_starting_points: int, optional
Size of D_0. Default is all available data.
The minimum number of sequences is given by the Pareto front of the RFP problem, i.e. you cannot have fewer sequences than that.
strict: bool, optional
Raise a RuntimeError if the number of starting sequences differs from the requested number of sequences.

Returns
-------
Tuple[AbstractBlackBox, np.ndarray, np.ndarray]
Expand Down Expand Up @@ -280,56 +273,10 @@ def create(
if alphabet is None:
alphabet = self.get_setup_information().get_alphabet()

if n_starting_points is None:
n_starting_points = len(wildtype_pdb_path)

# For a comparable RFP definition we require the sequences of the Pareto front:
pareto_sequences_name_pdb_dict = {
"DsRed.M1": "2VAD",
"DsRed.T4": "2VAE",
"mScarlet": "5LK4",
"AdRed": "6AA7",
"mRouge": "3NED",
"RFP630": "3E5V",
}

if strict and n_starting_points < len(pareto_sequences_name_pdb_dict):
raise RuntimeError(
f"Initial number of sequences too low!\nMinimum size {len(pareto_sequences_name_pdb_dict)} , requested {n_starting_points}"
)

remaining_n_starting_points = max(
n_starting_points - len(pareto_sequences_name_pdb_dict.values()), 0
)
# filter minimal required Pareto sequences
pareto_pdb_files = [
p
for p in wildtype_pdb_path
if any(
[
bool(_pdb.lower() in str(p).lower())
for _pdb in pareto_sequences_name_pdb_dict.values()
]
)
]
if len(pareto_pdb_files) != len(pareto_sequences_name_pdb_dict):
raise RuntimeError(
f"The provided PDB files list is incomplete!\n Required={','.join(list(pareto_sequences_name_pdb_dict.values()))} provided files={pareto_pdb_files}"
)

remaining_wildtype_pdb_files = list(
set(wildtype_pdb_path) - set(pareto_pdb_files)
)
np.random.shuffle(remaining_wildtype_pdb_files)
remaining_wildtype_pdb_files = remaining_wildtype_pdb_files[
:remaining_n_starting_points
] # subselect w.r.t. requested number of sequences
pdb_files_for_black_box: List = pareto_pdb_files + remaining_wildtype_pdb_files

problem_info = self.get_setup_information()
f = FoldXStabilityAndSASABlackBox(
info=problem_info,
wildtype_pdb_path=pdb_files_for_black_box,
wildtype_pdb_path=wildtype_pdb_path,
alphabet=alphabet,
experiment_id=experiment_id,
tmp_folder=tmp_folder,
Expand All @@ -345,8 +292,7 @@ def create(
# a vector of wildtype sequences. These are padded to
# match the maximum length with empty strings.
wildtype_amino_acids_ = []
for pdb_file in pdb_files_for_black_box:
# NOTE: we require the elements of the pareto front for this problem to be well-defined
for pdb_file in wildtype_pdb_path:
wildtype_residues = parse_pdb_as_residues(pdb_file)
wildtype_amino_acids_.append(
[
Expand All @@ -363,19 +309,9 @@ def create(
]

x0 = np.array(wildtype_amino_acids).reshape(
len(pdb_files_for_black_box), longest_wildtype_length
len(wildtype_pdb_path), longest_wildtype_length
)

if n_starting_points is not None and x0.shape[0] != n_starting_points:
if strict:
raise RuntimeError(
f"Requested number of starting sequences different to loaded!\nRequested n={n_starting_points}, loaded n={x0.shape[0]}"
)
else:
warnings.warn(
f"Requested number of starting sequences different to loaded!\nRequested n={n_starting_points}, loaded n={x0.shape[0]}"
)

f_0 = f(x0)

return f, x0, f_0
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""A black box for computing stability and solvent accessible surface area (SASA) using FoldX on a particular RFP problem with a defined Pareto front."""
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
name: poli__protein
channels:
  - defaults
  - conda-forge
dependencies:
  - python=3.9
  - pip
  - pip:
    - biopython
    - python-levenshtein
    - numpy
    - pdb-tools
    - "git+https://github.com/MachineLearningLifeScience/poli.git@dev"
Loading