diff --git a/arc/scheduler.py b/arc/scheduler.py index f53ee8eead..7b69b4e108 100644 --- a/arc/scheduler.py +++ b/arc/scheduler.py @@ -12,6 +12,7 @@ import numpy as np from typing import TYPE_CHECKING, List, Optional, Tuple, Union +from concurrent.futures import ThreadPoolExecutor, as_completed import arc.parser.parser as parser from arc import plotter @@ -131,6 +132,7 @@ class Scheduler(object): species_list (list): Contains input :ref:`ARCSpecies ` objects (both wells and TSs). rxn_list (list): Contains input :ref:`ARCReaction ` objects. project_directory (str): Folder path for the project: the input file path or ARC/Projects/project-name. + conformer_gen_nprocs (int, optional): The number of processes to use for non-TS conformer generation. Defaults to 1 (serial). composite_method (str, optional): A composite method to use. conformer_opt_level (Union[str, dict], optional): The level of theory to use for conformer comparisons. conformer_sp_level (Union[str, dict], optional): The level of theory to use for conformer sp jobs. @@ -171,6 +173,7 @@ class Scheduler(object): Attributes: project (str): The project's name. Used for naming the working directory. + conformer_gen_nprocs (int): The number of processes to use for non-TS conformer generation. servers (list): A list of servers used for the present project. species_list (list): Contains input :ref:`ARCSpecies ` objects (both species and TSs). species_dict (dict): Keys are labels, values are :ref:`ARCSpecies ` objects. @@ -232,6 +235,7 @@ def __init__(self, ess_settings: dict, species_list: list, project_directory: str, + conformer_gen_nprocs: Optional[int] = 1, composite_method: Optional[Level] = None, conformer_opt_level: Optional[Level] = None, conformer_sp_level: Optional[Level] = None, @@ -297,6 +301,7 @@ def __init__(self, self.output_multi_spc = dict() self.report_e_elect = report_e_elect self.skip_nmd = skip_nmd + self.conformer_gen_nprocs = int(conformer_gen_nprocs or default_job_settings.get('conformer_gen_nprocs', 1)) self.species_dict, self.rxn_dict = dict(), dict() for species in self.species_list: @@ -1040,6 +1045,26 @@ def end_job(self, job: 'JobAdapter', self.save_restart_dict() return True + def _generate_conformers_for_label(self, label: str) -> None: + """ + A helper function to generate conformers for a given species label (used internally). + + Args: + label (str): The species label. + """ + species = self.species_dict[label] + if species.force_field == 'cheap': + # Just embed in RDKit and use MMFF94s for opt and energies. + if species.initial_xyz is None: + species.initial_xyz = species.get_xyz() + else: + n_confs = self.n_confs if species.multi_species is None else 1 + species.generate_conformers( + n_confs=n_confs, + e_confs=self.e_confs, + plot_path=os.path.join(self.project_directory, 'output', 'Species', + label, 'geometry', 'conformers')) + def _run_a_job(self, job: 'JobAdapter', label: str, @@ -1088,52 +1113,57 @@ def run_conformer_jobs(self, labels: Optional[List[str]] = None): If ``None``, conformer jobs will be spawned for all species in self.species_list. 
""" labels_to_consider = labels if labels is not None else [spc.label for spc in self.species_list] - log_info_printed = False + pending_non_ts_generation: List[str] = [] + for label in labels_to_consider: - if not self.species_dict[label].is_ts and not self.output[label]['job_types']['opt'] \ - and 'opt' not in self.job_dict[label] and 'composite' not in self.job_dict[label] \ - and all([e is None for e in self.species_dict[label].conformer_energies]) \ - and self.species_dict[label].number_of_atoms > 1 and not self.output[label]['paths']['geo'] \ - and self.species_dict[label].yml_path is None and not self.output[label]['convergence'] \ - and (self.job_types['conf_opt'] and label not in self.dont_gen_confs - or self.species_dict[label].get_xyz(generate=False) is None): - # This is not a TS, opt (/composite) did not converge nor is it running, and conformer energies were - # not set. Also, either 'conf_opt' are set to True in job_types (and it's not in dont_gen_confs), - # or they are set to False (or it's in dont_gen_confs), but the species has no 3D information. - # Generate conformers. - if not log_info_printed: - logger.info('\nStarting (non-TS) species conformational analysis...\n') - log_info_printed = True - if self.species_dict[label].force_field == 'cheap': - # Just embed in RDKit and use MMFF94s for opt and energies. - if self.species_dict[label].initial_xyz is None: - self.species_dict[label].initial_xyz = self.species_dict[label].get_xyz() - else: - # Run the combinatorial method w/o fitting a force field. - n_confs = self.n_confs if self.species_dict[label].multi_species is None else 1 - self.species_dict[label].generate_conformers( - n_confs=n_confs, - e_confs=self.e_confs, - plot_path=os.path.join(self.project_directory, 'output', 'Species', - label, 'geometry', 'conformers')) - self.process_conformers(label) - # TSs: - elif self.species_dict[label].is_ts \ - and self.species_dict[label].tsg_spawned \ - and not self.species_dict[label].ts_conf_spawned \ - and all([tsg.success is not None for tsg in self.species_dict[label].ts_guesses]) \ - and any([tsg.success for tsg in self.species_dict[label].ts_guesses]): - # This is a TS Species for which all TSGs were spawned, but conformers haven't been spawned, - # and all tsg.success flags contain a values (either ``True`` or ``False``), so they are done. 
- # We're ready to spawn conformer jobs for this TS Species + # Non_TS needing conformer generation: + if (not self.species_dict[label].is_ts + and not self.output[label]['job_types']['opt'] + and 'opt' not in self.job_dict[label] + and 'composite' not in self.job_dict[label] + and all([e is None for e in self.species_dict[label].conformer_energies]) + and self.species_dict[label].number_of_atoms > 1 + and not self.output[label]['paths']['geo'] + and self.species_dict[label].yml_path is None + and not self.output[label]['convergence'] + and ((self.job_types['conf_opt'] and label not in self.dont_gen_confs) + or self.species_dict[label].get_xyz(generate=False) is None)): + pending_non_ts_generation.append(label) + elif (self.species_dict[label].is_ts + and self.species_dict[label].tsg_spawned + and not self.species_dict[label].ts_conf_spawned + and all([tsg.success is not None for tsg in self.species_dict[label].ts_guesses]) + and any([tsg.success for tsg in self.species_dict[label].ts_guesses])): self.run_ts_conformer_jobs(label=label) self.species_dict[label].ts_conf_spawned = True - if label in self.dont_gen_confs \ - and (self.species_dict[label].initial_xyz is not None - or self.species_dict[label].final_xyz is not None - or len(self.species_dict[label].conformers)) \ - and not self.species_dict[label].is_ts: - # The species was defined with xyzs. + if (label in self.dont_gen_confs + and (self.species_dict[label].initial_xyz is not None + or self.species_dict[label].final_xyz is not None + or len(self.species_dict[label].conformers)) + and not self.species_dict[label].is_ts): + self.process_conformers(label) + + if pending_non_ts_generation: + logger.info('\nStarting (non-TS) species conformational analysis...\n') + if self.conformer_gen_nprocs > 1 and len(pending_non_ts_generation) > 1: + # Parallel generation + with ThreadPoolExecutor(max_workers=self.conformer_gen_nprocs) as executor: + futures = { + executor.submit(self._generate_conformers_for_label, label): label + for label in pending_non_ts_generation + } + for future in as_completed(futures): + label_done = futures[future] + try: + future.result() + except Exception as e: + logger.error(f'Error generating conformers for species {label_done}: {e}') + else: + # Serial generation + for label in pending_non_ts_generation: + self._generate_conformers_for_label(label) + + for label in pending_non_ts_generation: self.process_conformers(label) def run_ts_conformer_jobs(self, label: str): diff --git a/arc/scheduler_test.py b/arc/scheduler_test.py index edba05adef..8f56ca360f 100644 --- a/arc/scheduler_test.py +++ b/arc/scheduler_test.py @@ -5,9 +5,11 @@ This module contains unit tests for the arc.scheduler module """ +import time import unittest import os import shutil +from unittest.mock import patch import arc.parser.parser as parser from arc.checks.ts import check_ts @@ -24,6 +26,29 @@ default_levels_of_theory = settings['default_levels_of_theory'] +# Helpers for MultiProcessing Test +def fake_generate_conformers(species_self, n_confs=10, e_confs=5, plot_path=None): + """Stand-in for ARCSpecies.generate_conformers. 
+    It both *creates a dummy conformer* and *records the label* on a function attribute."""
+    species_self.conformers = [{
+        "symbols": ("H", "H"),
+        "isotopes": (1, 1),
+        "coords": ((0.0, 0.0, 0.0), (0.0, 0.0, 0.74)),
+    }]
+    species_self.conformer_energies = [0.0]
+    fake_generate_conformers.called.add(species_self.label)
+
+
+def fake_process_conformers(sched_self, lbl):
+    """Stand-in for Scheduler.process_conformers."""
+    fake_process_conformers.called.add(lbl)
+
+
+# simple per-test “state holders” on the functions themselves
+fake_generate_conformers.called = set()
+fake_process_conformers.called = set()
+#
+
 
 class TestScheduler(unittest.TestCase):
     """
@@ -757,6 +782,93 @@ def test_add_label_to_unique_species_labels(self):
         self.assertEqual(unique_label, 'new_species_15_1')
         self.assertEqual(self.sched2.unique_species_labels, ['methylamine', 'C2H6', 'CtripCO', 'new_species_15', 'new_species_15_0', 'new_species_15_1'])
 
+    def test_run_conformer_jobs_calls_process_conformers_for_dont_gen_confs(self):
+        """
+        If a non-TS species is in dont_gen_confs and has initial/final xyz or existing conformers,
+        run_conformer_jobs should call process_conformers(label) for that species.
+        """
+        label = "spc_dont_gen_confs"
+        xyz = {
+            'symbols': ("H", "H"),
+            'isotopes': (1, 1),
+            'coords': ((0.0, 0.0, 0.0), (0.0, 0.0, 0.74)),
+        }
+        spc = ARCSpecies(label=label, smiles='[H][H]', xyz=xyz)
+        spc.number_of_atoms = 2
+        spc.conformers = [xyz]
+        spc.conformer_energies = [None]
+
+        job_types = {'conf_opt': True, 'conf_sp': False, 'opt': False, 'fine': False, 'freq': False,
+                     'sp': False, 'rotors': False, 'orbitals': False, 'lennard_jones': False, 'bde': False, 'irc': False, 'tsg': False}
+        sched = Scheduler(project='test_run_conformer_jobs_dont_gen_confs',
+                          ess_settings=self.ess_settings,
+                          species_list=[spc],
+                          project_directory=os.path.join(ARC_PATH, 'Projects', 'arc_project_for_testing_delete_after_usage3'),
+                          testing=True,
+                          job_types=job_types)
+        sched.dont_gen_confs = [label]
+
+        with patch.object(Scheduler, "process_conformers", autospec=True) as mock_proc:
+            sched.run_conformer_jobs(labels=[label])
+            mock_proc.assert_called_once()
+            # First arg is Scheduler (self), second is label
+            called_args = mock_proc.call_args[0]
+            self.assertEqual(called_args[1], label)
+
+    def test_sched_calls_real_generate_conformers(self):
+        spc = ARCSpecies(label="spc_real_gen_confs", multiplicity=1, charge=0, smiles='CC')
+        sched = Scheduler(
+            project='test_sched_calls_real_generate_conformers',
+            ess_settings=self.ess_settings,
+            species_list=[spc],
+            project_directory=os.path.join(ARC_PATH, 'Projects', 'arc_project_for_testing_delete_after_usage3'),
+            testing=True,
+            job_types={'conf_opt': True, 'conf_sp': False, 'opt': False, 'fine': False, 'freq': False,
+                       'sp': False, 'rotors': False, 'orbitals': False, 'lennard_jones': False, 'bde': False, 'irc': False, 'tsg': False},
+        )
+
+        sched._generate_conformers_for_label(label=spc.label)
+        assert len(spc.conformers) > 0, "Real conformer generation did not populate spc.conformers"
+
+    def test_run_conformer_jobs_generates_and_processes_for_multiple_species(self):
+        """
+        Run conformer generation for multiple non-TS species via run_conformer_jobs,
+        exercising both the serial (conformer_gen_nprocs=1) and the parallel
+        (conformer_gen_nprocs=2) code paths, and record the wall time of each run
+        so the two code paths can be compared.
+        
+ """ + job_types = { + 'conf_opt': True, 'conf_sp': False, 'opt': False, 'fine': False, 'freq': False, + 'sp': False, 'rotors': False, 'orbitals': False, 'lennard_jones': False, + 'bde': False, 'irc': False, 'tsg': False + } + time_taken = {} + for nprocs in (1, 2): + with self.subTest(nprocs=nprocs): + start_time = time.time() + fake_generate_conformers.called.clear() + fake_process_conformers.called.clear() + spc1 = ARCSpecies(label="spc_multi_1", multiplicity=1, charge=0, smiles='CCO') + spc2 = ARCSpecies(label="spc_multi_2", multiplicity=1, charge=0, smiles='CCN') + + sched = Scheduler( + project='test_run_conformer_jobs_multiple_species', + ess_settings=self.ess_settings, + species_list=[ + spc1, + spc2, + ], + project_directory=os.path.join(ARC_PATH, 'Projects', 'arc_project_for_testing_delete_after_usage3'), + testing=True, + job_types=job_types, + conformer_gen_nprocs=nprocs, + ) + + sched.run_conformer_jobs(labels=[spc1.label, spc2.label]) + time_taken[nprocs] = time.time() - start_time + print(f"nprocs={nprocs}, fake_generate_conformers.called={fake_generate_conformers.called}, fake_process_conformers.called={fake_process_conformers.called}") + + print(f"Time taken: {time_taken}") @classmethod def tearDownClass(cls): """ diff --git a/arc/species/conformers.py b/arc/species/conformers.py index 782e0b0a15..c72e34b365 100644 --- a/arc/species/conformers.py +++ b/arc/species/conformers.py @@ -35,13 +35,15 @@ get_lowest_confs """ - +import os import copy import logging import sys import time from itertools import product from typing import Dict, List, Optional, Tuple, Union +from concurrent.futures import ProcessPoolExecutor, as_completed +from itertools import repeat from openbabel import openbabel as ob from openbabel import pybel as pyb @@ -1230,8 +1232,35 @@ def get_force_field_energies(label: str, f'Ghemical, or GAFF. Got: {force_field}.') return xyzs, energies +def _ob_optimize_xyz(xyz_str, force_field, optimize = True): + """ + A helper function to optimize one xyz string with OpenBabel in a child process. + This avoids OpenBabel's internal errors when used in parallel. + + Args: + xyz_str (str): The xyz string to optimize. + force_field (str): The force field to use. + optimize (bool, optional): Whether to first optimize the conformer using FF. True to optimize. + + Returns: + Tuple[str, float]: The optimized xyz (in ARC XYZ format) and its energy. + """ + obconv = ob.OBConversion() + obconv.SetInAndOutFormats('xyz', 'xyz') + obmol = ob.OBMol() + if not obconv.ReadString(obmol, xyz_str): + return None, None + ff = ob.OBForceField.FindForceField(force_field) + if ff is None or not ff.Setup(obmol): + return None, None + if optimize: + ff.ConjugateGradients(2000) + en = ff.Energy() + out_xyz = '\n'.join(obconv.WriteString(obmol).splitlines()[2:]) + return converter.str_to_xyz(out_xyz), en + -def openbabel_force_field_on_rdkit_conformers(label, rd_mol, force_field='MMFF94s', optimize=True): +def openbabel_force_field_on_rdkit_conformers(label, rd_mol, force_field='MMFF94s', optimize=True, nprocs: int = 1): """ Optimize RDKit conformers by OpenBabel using a force field (MMFF94 or MMFF94s are recommended). This is a fallback method when RDKit fails to generate force field optimized conformers. @@ -1241,6 +1270,7 @@ def openbabel_force_field_on_rdkit_conformers(label, rd_mol, force_field='MMFF94 rd_mol (RDKit RDMol): The RDKit molecule with embedded conformers to optimize. force_field (str, optional): The type of force field to use. 
optimize (bool, optional): Whether to first optimize the conformer using FF. True to optimize. + nprocs (int, optional): The number of processors to use. Returns: Tuple[list, list]: @@ -1251,33 +1281,45 @@ def openbabel_force_field_on_rdkit_conformers(label, rd_mol, force_field='MMFF94 if not rd_mol.GetNumConformers(): logger.warning(f'Could not generate conformers for {label} via OpenBabel') return xyzs, energies - # Set up Openbabel input and output format + + # Let's prepare XYZ strings once in parent (avoid pickling issues with OB objects) obconversion = ob.OBConversion() obconversion.SetInAndOutFormats('xyz', 'xyz') - # Set up Openbabel force field - ff = ob.OBForceField.FindForceField(force_field) symbols = [rd_atom.GetSymbol() for rd_atom in rd_mol.GetAtoms()] + + xyz_jobs = list() for i in range(rd_mol.GetNumConformers()): - # Convert RDKit conformer to xyz string conf = rd_mol.GetConformer(i) xyz_str = f'{conf.GetNumAtoms()}\n\n' for j in range(conf.GetNumAtoms()): - xyz_str += symbols[j] + ' ' pt = conf.GetAtomPosition(j) - xyz_str += ' '.join([str(pt.x), str(pt.y), str(pt.z)]) + '\n' - # Build OpenBabel molecule from xyz string + xyz_str += f"{symbols[j]} {pt.x} {pt.y} {pt.z}\n" + xyz_jobs.append((xyz_str, force_field, optimize)) + + if nprocs and nprocs > 1 and len(xyz_jobs) > 1: + os.environ.setdefault('OMP_NUM_THREADS', '1') + with ProcessPoolExecutor(max_workers=nprocs) as executor: + xyz_iter = (s for (s, _, _) in xyz_jobs) + ff_iter = repeat(force_field) + opt_iter = repeat(optimize) + for xyz_dict, en in executor.map(_ob_optimize_xyz, xyz_iter, ff_iter, opt_iter): + if xyz_dict is not None: + xyzs.append(xyz_dict) + energies.append(en) + return xyzs, energies + + ff = ob.OBForceField.FindForceField(force_field) + for xyz_str, _, _ in xyz_jobs: ob_mol = ob.OBMol() obconversion.ReadString(ob_mol, xyz_str) ff.Setup(ob_mol) - # Optimize the molecule if needed if optimize: ff.ConjugateGradients(2000) - # Export xyzs and energies ob_mol.GetCoordinates() ff.GetCoordinates(ob_mol) energies.append(ff.Energy()) - xyz_str = '\n'.join(obconversion.WriteString(ob_mol).splitlines()[2:]) - xyzs.append(converter.str_to_xyz(xyz_str)) + out_xyz = '\n'.join(obconversion.WriteString(ob_mol).splitlines()[2:]) + xyzs.append(converter.str_to_xyz(out_xyz)) return xyzs, energies @@ -1332,8 +1374,34 @@ def mix_rdkit_and_openbabel_force_field(label, energies.extend(energies_) return xyzs, energies +def _optimize_one_conf_with_ob(args): + """ + A helper function for parallelizing the optimization of one conformer with OpenBabel in a child process. + This avoids OB object pickling issues. + """ + xyz_str, force_field = args + obconv = ob.OBConversion() + obconv.SetInAndOutFormats('xyz', 'xyz') + + obmol = ob.OBMol() + if not obconv.ReadString(obmol, xyz_str): + return None, None + + ff = ob.OBForceField.FindForceField(force_field) + if ff is None: + return None, None + + if not ff.Setup(obmol): + return None, None + + en = ff.Energy() + # Export xyz as ARC dict + ob_out = obconv.WriteString(obmol) + xyz_body = '\n'.join(ob_out.splitlines()[2:]) + xyz_dict = converter.str_to_xyz(xyz_body) + return xyz_dict, en -def openbabel_force_field(label, mol, num_confs=None, xyz=None, force_field='GAFF', method='diverse'): +def openbabel_force_field(label, mol, num_confs=None, xyz=None, force_field='GAFF', method='diverse', nprocs=1): """ Optimize conformers using a force field (GAFF, MMFF94s, MMFF94, UFF, Ghemical). 
@@ -1345,43 +1413,46 @@ def openbabel_force_field(label, mol, num_confs=None, xyz=None, force_field='GAF force_field (str, optional): The type of force field to use. method (str, optional): The conformer searching method to use in OpenBabel. For method description, see https://openbabel.org/dev-api/group__conformer.shtml + nprocs (int, optional): The number of processors to use. If >1, will parallelize the optimization of conformers. Returns: Tuple[list, list]: - Entries are optimized xyz's in a list format. - Entries are float numbers representing the energies in kJ/mol. - """ + """ xyzs, energies = list(), list() ff = ob.OBForceField.FindForceField(force_field) + if ff is None: + raise RuntimeError(f'Could not find force field {force_field} in OpenBabel for {label}.') + if xyz is not None: obmol = ob.OBMol() atoms = mol.vertices - ob_atom_ids = dict() for i, atom in enumerate(atoms): a = obmol.NewAtom() a.SetAtomicNum(atom.number) - a.SetVector(xyz['coords'][i][0], xyz['coords'][i][1], xyz['coords'][i][2]) + x, y, z = xyz['coords'][i] + a.SetVector(x, y, z) if atom.element.isotope != -1: a.SetIsotope(atom.element.isotope) a.SetFormalCharge(atom.charge) - ob_atom_ids[atom] = a.GetId() orders = {1: 1, 2: 2, 3: 3, 4: 4, 1.5: 5} - for atom1 in mol.vertices: - for atom2, bond in atom1.edges.items(): + for a1 in mol.vertices: + for a2, bond in a1.edges.items(): if bond.is_hydrogen_bond(): continue - index1 = atoms.index(atom1) - index2 = atoms.index(atom2) + index1 = atoms.index(a1) + index2 = atoms.index(a2) if index1 < index2: obmol.AddBond(index1 + 1, index2 + 1, orders[bond.order]) - + # optimize ff.Setup(obmol) ff.SetLogLevel(0) ff.SetVDWCutOff(6.0) # The VDW cut-off distance (default=6.0) ff.SetElectrostaticCutOff(10.0) # The Electrostatic cut-off distance (default=10.0) - ff.SetUpdateFrequency(10) # The frequency to update the non-bonded pairs (default=10) + ff.SetUpdateFrequency(10) # The frequency to update the non-bonded pairs (default=10 ff.EnableCutOff(False) # Use cut-off (default=don't use cut-off) ff.SteepestDescentInitialize() # ConjugateGradientsInitialize v = 1 @@ -1390,14 +1461,16 @@ def openbabel_force_field(label, mol, num_confs=None, xyz=None, force_field='GAF if ff.DetectExplosion(): raise ConformerError(f'Force field {force_field} exploded with method SteepestDescent for {label}') ff.GetCoordinates(obmol) - + elif num_confs is not None: - obmol, ob_atom_ids = to_ob_mol(mol, return_mapping=True) + obmol, _idmap = to_ob_mol(mol, return_mapping=True) pybmol = pyb.Molecule(obmol) pybmol.make3D() obmol = pybmol.OBMol - ff.Setup(obmol) - + + if not ff.Setup(obmol): + return xyzs, energies + if method.lower() == 'weighted': ff.WeightedRotorSearch(num_confs, 2000) elif method.lower() == 'random': @@ -1411,25 +1484,38 @@ def openbabel_force_field(label, mol, num_confs=None, xyz=None, force_field='GAF ff.SystematicRotorSearch(num_confs) else: raise ConformerError(f'Could not identify method {method} for {label}') - else: - raise ConformerError(f'Either num_confs or xyz should be given for {label}') - - ff.GetConformers(obmol) - obconversion = ob.OBConversion() - obconversion.SetOutFormat('xyz') - - for i in range(obmol.NumConformers()): - obmol.SetConformer(i) - ff.Setup(obmol) - xyz_str = '\n'.join(obconversion.WriteString(obmol).splitlines()[2:]) - xyz_dict = converter.str_to_xyz(xyz_str) - xyz_dict['coords'] = tuple(xyz_dict['coords'][ob_atom_ids[mol.atoms[j]]] - for j in range(len(xyz_dict['coords']))) - xyzs.append(xyz_dict) - energies.append(ff.Energy()) + + 
ff.GetConformers(obmol) + obconversion = ob.OBConversion() + obconversion.SetOutFormat('xyz') + + xyz_jobs = [] + for i in range(obmol.NumConformers()): + obmol.SetConformer(i) + ff.Setup(obmol) + xyz_full = obconversion.WriteString(obmol) + xyz_jobs.append((xyz_full, force_field)) + + if nprocs and nprocs > 1: + # Avoid oversubscription if OB uses any internal threads + os.environ.setdefault('OMP_NUM_THREADS', '1') + with ProcessPoolExecutor(max_workers=nprocs) as executor: + futs = [executor.submit(_optimize_one_conf_with_ob, job) for job in xyz_jobs] + for fut in as_completed(futs): + res = fut.result() + if res is not None: + x, e = res + xyzs.append(x) + energies.append(e) + else: + for job in xyz_jobs: + x, e = _optimize_one_conf_with_ob(job) + if x is not None: + xyzs.append(x) + energies.append(e) + return xyzs, energies - def embed_rdkit(label, mol, num_confs=None, xyz=None): """ Generate unoptimized conformers in RDKit. If ``xyz`` is not given, random conformers will be generated. @@ -1528,6 +1614,7 @@ def rdkit_force_field(label: str, try_uff: bool = True, optimize: bool = True, try_ob: bool = False, + return_status: bool = False, ) -> Tuple[list, list]: """ Optimize RDKit conformers using a force field (MMFF94 or MMFF94s are recommended). @@ -1542,60 +1629,96 @@ def rdkit_force_field(label: str, try_uff (bool, optional): Whether to try UFF if MMFF94(s) fails. ``True`` by default. optimize (bool, optional): Whether to first optimize the conformer using FF. True to optimize. try_ob (bool, optional): Whether to try OpenBabel if RDKit fails. ``True`` to try, ``False`` by default. + return_status (bool, optional): Whether to return the optimization status codes. ``False`` by default. Returns: - Tuple[list, list]: - - Entries are optimized xyz's in a dictionary format. - - Entries are float numbers representing the energies. - """ - xyzs, energies = list(), list() - if rd_mol is None: - return xyzs, energies - for i in range(rd_mol.GetNumConformers()): - if optimize: - v, j = 1, 0 - while v == 1 and j < 200: # v == 1: continue, v == 0: enough steps, v == -1: unable to set up - try: - v = Chem.AllChem.MMFFOptimizeMolecule(rd_mol, + If ``return_status`` is ``False``: + Tuple[list, list]: + - Entries are optimized xyz's in a dictionary format. + - Entries are float numbers representing the energies. + If ``return_status`` is ``True``: + Tuple[list, list, list]: + - Entries are optimized xyz's in a dictionary format. + - Entries are float numbers representing the energies. + - Entries are integers representing the optimization status codes (0 is success). 
+ """ + xyzs, energies, statuses = list(), list(), list() + + if rd_mol is None or rd_mol.GetNumConformers() == 0: + return (xyzs, energies) if not return_status else (xyzs, energies, statuses) + + used_mmff_batch = False + + if optimize and "MMFF" in force_field: + try: + res = Chem.AllChem.MMFFOptimizeMoleculeConfs(rd_mol, mmffVariant=force_field, - confId=i, - maxIters=500, + maxIters=1000, ignoreInterfragInteractions=False, + numThreads=0, ) - except: - pass - else: - j += 1 - mol_properties = Chem.AllChem.MMFFGetMoleculeProperties(rd_mol, mmffVariant=force_field) - if mol_properties is not None: - ff = Chem.AllChem.MMFFGetMoleculeForceField(rd_mol, mol_properties, confId=i) - if optimize: - energies.append(ff.CalcEnergy()) - xyzs.append(read_rdkit_embedded_conformer_i(rd_mol, i)) - if not len(xyzs) and 'MMFF' in force_field and try_uff: - output = None - if optimize: - try: - output = Chem.AllChem.UFFOptimizeMoleculeConfs(rd_mol, - maxIters=200, - ignoreInterfragInteractions=True, - ) - except (RuntimeError, ValueError): - if try_ob: - logger.warning(f'Using OpenBabel (instead of RDKit) as a fall back method to generate conformers ' - f'for {label}. This is often slower.') - xyzs, energies = openbabel_force_field_on_rdkit_conformers(label, - rd_mol, - force_field=force_field, - optimize=optimize, - ) - return xyzs, energies + # res is a list of tuples [(status, energy), ...] + for i, (st, en) in enumerate(res): + statuses.append(st) + energies.append(en) + xyzs.append(read_rdkit_embedded_conformer_i(rd_mol, i)) + used_mmff_batch = True + except Exception as e: + logger.warning(f"MMFF optimization failed for {label} with error: {e}") + + if optimize and not xyzs and try_uff: + try: + res = Chem.AllChem.UFFOptimizeMoleculeConfs(rd_mol, + maxIters=1000, + ignoreInterfragInteractions=False, + numThreads=0, + ) + # res is a list of tuples [(status, energy), ...] + for i, (st, en) in enumerate(res): + statuses.append(st) + energies.append(en) + xyzs.append(read_rdkit_embedded_conformer_i(rd_mol, i)) + except (RuntimeError, ValueError) as e: + pass + + if optimize and not xyzs and try_ob: + logger.warning(f'Using OpenBabel (instead of RDKit) as a fall back method to generate conformers for {label}. ' + f'This is often slower.') + xyzs, energies = openbabel_force_field_on_rdkit_conformers(label, + rd_mol, + force_field=force_field, + optimize=optimize, + ) + if return_status: + return (xyzs, energies, [0]*len(xyzs)) # OpenBabel does not return status codes so treat all as successful + return xyzs, energies + + # Not optimize, then no energies or statuses + if not optimize: + xyzs = [read_rdkit_embedded_conformer_i(rd_mol, i) + for i in range(rd_mol.GetNumConformers())] + # Energies must remain empty in this case + return (xyzs, energies) if not return_status else (xyzs, energies, statuses) + + # Optimisation returned some conformers but not energies for all conformers + if used_mmff_batch and (len(energies) != rd_mol.GetNumConformers()): + props = Chem.AllChem.MMFFGetMoleculeProperties(rd_mol, mmffVariant=force_field) \ + if "MMFF" in force_field.upper() else None for i in range(rd_mol.GetNumConformers()): - if output is not None and output[i][0] == 0: # The optimization converged. 
- energies.append(output[i][1]) - xyzs.append(read_rdkit_embedded_conformer_i(rd_mol, i)) - return xyzs, energies - + if len(energies) <= i: + try: + if props is not None: + ff = Chem.AllChem.MMFFGetMoleculeForceField(rd_mol, props, confId=i) + energies.append(ff.CalcEnergy()) + else: + ff = Chem.AllChem.UFFGetMoleculeForceField(rd_mol, confId=i) + energies.append(ff.CalcEnergy()) + except Exception: + energies.append(float("nan")) + if return_status and len(statuses) <= i: + statuses.append(0) + + return (xyzs, energies) if not return_status else (xyzs, energies, statuses) def get_wells(label, angles, blank=WELL_GAP): """ diff --git a/arc/species/conformers_test.py b/arc/species/conformers_test.py index ff7d028dc3..1a380edbdf 100644 --- a/arc/species/conformers_test.py +++ b/arc/species/conformers_test.py @@ -6,6 +6,7 @@ """ import unittest +import pytest from rdkit.Chem import rdMolTransforms as rdMT @@ -554,7 +555,7 @@ def test_openbabel_force_field(self): method='diverse', ) self.assertEqual(len(xyzs), 1) - self.assertAlmostEqual(energies[0], 2.931930, 3) + self.assertAlmostEqual(energies[0], 2.931930, delta=0.001) def test_openbabel_force_field_on_rdkit_conformers(self): """Test Open Babel force field on RDKit conformers""" @@ -604,6 +605,41 @@ def test_openbabel_force_field_on_rdkit_conformers(self): self.assertEqual(xyzs[0]['symbols'], expected_xyzs[0]['symbols']) self.assertEqual(xyzs[1]['symbols'], expected_xyzs[1]['symbols']) + def test_openbabel_force_field_on_rdkit_conformers_parallel(self): + """Parallel and serial OB-on-RDKit paths give identical results (up to ordering).""" + xyz = converter.str_to_xyz("""C -2.18276 2.03598 0.00028 + C -0.83696 1.34108 -0.05231 + H -2.23808 2.82717 -0.75474 + H -2.33219 2.51406 0.97405 + H -2.99589 1.32546 -0.17267 + O 0.18176 2.30786 0.17821 + H -0.69161 0.88171 -1.03641 + H -0.78712 0.56391 0.71847 + O 1.39175 1.59510 0.11494""") + spc = ARCSpecies(label='CCO[O]', smiles='CCO[O]', xyz=xyz) + rd_mol = conformers.embed_rdkit(label='', mol=spc.mol, num_confs=2, xyz=xyz) + + # serial + xyzs_s, energies_s = conformers.openbabel_force_field_on_rdkit_conformers( + label='', rd_mol=rd_mol, force_field='MMFF94s', optimize=True, nprocs=1 + ) + # parallel + xyzs_p, energies_p = conformers.openbabel_force_field_on_rdkit_conformers( + label='', rd_mol=rd_mol, force_field='MMFF94s', optimize=True, nprocs=2 + ) + + # same counts + self.assertEqual(len(energies_s), len(energies_p)) + self.assertEqual(len(xyzs_s), len(xyzs_p)) + + # energies equal up to ordering + self.assertEqual(sorted(energies_s), pytest.approx(sorted(energies_p), abs=1e-8)) + + # compare symbols only (OB coords can differ slightly across platforms) + sym_s = sorted([tuple(x['symbols']) for x in xyzs_s]) + sym_p = sorted([tuple(x['symbols']) for x in xyzs_p]) + self.assertEqual(sym_s, sym_p) + def test_embed_rdkit(self): """Test embedding in RDKit""" rd_mol = conformers.embed_rdkit(label='CJ', mol=self.cj_spc.mol, num_confs=1) diff --git a/benchmark.py b/benchmark.py new file mode 100644 index 0000000000..7b74c7fdb4 --- /dev/null +++ b/benchmark.py @@ -0,0 +1,169 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Benchmark old vs new RDKit force-field optimization paths in ARC. + +- Embeds conformers once (shared for both paths). +- Times rdkit_force_field (new batch path) vs old_rdkit_force_field (legacy loop). +- Reports mean/median/min/max, per-conformer cost, and simple sanity checks. 
+ +Usage examples are at the end of this file or run: python bench_ff.py -h +""" + +import argparse +import gc +import statistics as stats +import time +from typing import Tuple + +from rdkit import Chem + +# ARC imports +import arc.species.conformers as conformers +from arc.species.species import ARCSpecies + + +def _time_once(fn, *args, **kwargs) -> Tuple[float, Tuple[list, list]]: + """Time a single call and return (elapsed_seconds, (xyzs, energies)).""" + gcold = gc.isenabled() + try: + gc.disable() + t0 = time.perf_counter() + out = fn(*args, **kwargs) + dt = time.perf_counter() - t0 + finally: + if gcold: + gc.enable() + return dt, out + + +def _fmt_s(x: float) -> str: + return f"{x:.6f}s" + + +def _fmt_ms(x: float) -> str: + return f"{x*1e3:.3f} ms" + + +def run_benchmark( + smiles: str, + nconfs: int, + repeats: int, + force_field: str, + optimize: bool, + try_ob: bool, +) -> None: + label = "BENCH" + spc = ARCSpecies(label=label, smiles=smiles) + # Embed once to ensure both paths see identical starting coordinates + rd_mol = conformers.embed_rdkit(label=label, mol=spc.mol, num_confs=nconfs, xyz=None) + + # Warm-up (JIT, import caches, allocator warm, etc.) + _time_once(conformers.rdkit_force_field, label, rd_mol, spc.mol, None, force_field, True, optimize, try_ob) + _time_once(conformers.old_rdkit_force_field, label, rd_mol, spc.mol, None, force_field, True, optimize, try_ob) + + # Measure + new_times, old_times = [], [] + new_last, old_last = None, None + + for _ in range(repeats): + dt_new, new_last = _time_once( + conformers.rdkit_force_field, + label, rd_mol, spc.mol, None, force_field, True, optimize, try_ob + ) + new_times.append(dt_new) + + dt_old, old_last = _time_once( + conformers.old_rdkit_force_field, + label, rd_mol, spc.mol, None, force_field, True, optimize, try_ob + ) + old_times.append(dt_old) + + # Unpack last results + new_xyzs, new_energies = new_last + old_xyzs, old_energies = old_last + + # Reporting + print("\n=== ARC RDKit FF Benchmark ===") + print(f"SMILES: {smiles}") + print(f"Conformers: {nconfs}") + print(f"Repeats: {repeats}") + print(f"Force field: {force_field}") + print(f"Optimize: {optimize}") + print(f"Try OpenBabel: {try_ob}") + print() + + def summary(times): + return { + "mean": stats.mean(times), + "median": stats.median(times), + "min": min(times), + "max": max(times), + "per_conf_mean": stats.mean(times) / max(1, nconfs), + } + + s_new = summary(new_times) + s_old = summary(old_times) + + print("New path: rdkit_force_field (batch MMFF)") + print(f" mean: {_fmt_s(s_new['mean'])} ({_fmt_ms(s_new['per_conf_mean'])}/conf)") + print(f" median: {_fmt_s(s_new['median'])}") + print(f" min: {_fmt_s(s_new['min'])}") + print(f" max: {_fmt_s(s_new['max'])}") + + print("\nOld path: old_rdkit_force_field (per-conf loop)") + print(f" mean: {_fmt_s(s_old['mean'])} ({_fmt_ms(s_old['per_conf_mean'])}/conf)") + print(f" median: {_fmt_s(s_old['median'])}") + print(f" min: {_fmt_s(s_old['min'])}") + print(f" max: {_fmt_s(s_old['max'])}") + + # Basic sanity checks + print("\n--- Sanity checks ---") + print(f"New returned: {len(new_xyzs)} xyzs, {len(new_energies)} energies") + print(f"Old returned: {len(old_xyzs)} xyzs, {len(old_energies)} energies") + + if not optimize: + print("Expect energies empty with optimize=False:") + print(f" new energies empty? {len(new_energies) == 0}") + print(f" old energies empty? 
{len(old_energies) == 0}") + else: + # Compare energy arrays shape, not values (values may differ due to tiny numerical drift/order) + same_len = len(new_energies) == len(old_energies) == nconfs + print(f"Energies length match nconfs? {same_len}") + if same_len and len(new_energies) and len(old_energies): + # Report a quick aggregate difference to spot gross issues + try: + import numpy as np + diff = np.nanmean(np.abs(np.array(new_energies) - np.array(old_energies))) + print(f"Mean |ΔE| (new-old): {diff:.6g}") + except Exception: + pass + + # Simple speed ratio + if s_old["mean"] > 0: + ratio = s_old["mean"] / s_new["mean"] + print(f"\nSpeedup (old/new mean): {ratio:.2f}×") + + +def main(): + p = argparse.ArgumentParser(description="Benchmark ARC RDKit force-field paths (old vs new).") + p.add_argument("--smiles", required=True, help="Input molecule as SMILES.") + p.add_argument("--nconfs", type=int, default=50, help="Number of conformers to embed.") + p.add_argument("--repeats", type=int, default=3, help="Number of timing repeats per path.") + p.add_argument("--ff", default="MMFF94s", help="Force field variant (MMFF94 or MMFF94s).") + p.add_argument("--no-opt", action="store_true", help="Disable optimization (just read coords).") + p.add_argument("--try-ob", action="store_true", help="Allow OpenBabel fallback if RDKit fails.") + args = p.parse_args() + + run_benchmark( + smiles=args.smiles, + nconfs=args.nconfs, + repeats=args.repeats, + force_field=args.ff, + optimize=not args.no_opt, + try_ob=args.try_ob, + ) + + +if __name__ == "__main__": + main()
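
The diff above introduces three user-facing parallelism knobs: conformer_gen_nprocs on Scheduler, nprocs on the OpenBabel helpers in arc/species/conformers.py, and the standalone benchmark.py script. A minimal usage sketch follows, mirroring the calls exercised in the added tests; the SMILES string, species label, conformer count, and nprocs values are illustrative assumptions, not values taken from this PR:

    # Sketch only: exercises the new parallel OpenBabel path added in this PR.
    # 'example_spc', 'CCO', num_confs=10, and nprocs=2 are arbitrary choices.
    import arc.species.conformers as conformers
    from arc.species.species import ARCSpecies

    spc = ARCSpecies(label='example_spc', smiles='CCO')
    rd_mol = conformers.embed_rdkit(label=spc.label, mol=spc.mol, num_confs=10)

    # nprocs > 1 routes the per-conformer OpenBabel optimization through a
    # ProcessPoolExecutor (_ob_optimize_xyz); nprocs=1 keeps the original serial loop.
    xyzs, energies = conformers.openbabel_force_field_on_rdkit_conformers(
        label=spc.label, rd_mol=rd_mol, force_field='MMFF94s', optimize=True, nprocs=2,
    )

    # Scheduler-level conformer generation for non-TS species is opted into via the
    # new keyword, e.g. Scheduler(..., conformer_gen_nprocs=4), which run_conformer_jobs
    # uses to fan out _generate_conformers_for_label calls over a thread pool.

The scheduler path uses a ThreadPoolExecutor, while the conformer helpers use a ProcessPoolExecutor; per the added docstrings, the process pool avoids OpenBabel's internal errors and pickling issues under parallel use. The benchmark script compares the new batch MMFF path against the legacy per-conformer loop, e.g. python benchmark.py --smiles "CCO" --nconfs 50 --repeats 3 --ff MMFF94s; note that it assumes the legacy implementation remains importable as conformers.old_rdkit_force_field, which is not part of this diff.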