Save more data to .json in preparation for other model kinds

leculver · leculver · commit 6eaec8f7999c · 2024-05-07T10:43:07.000-07:00
diff --git a/src/coreclr/scripts/cse_ml/evaluate.py b/src/coreclr/scripts/cse_ml/evaluate.py
@@ -9,7 +9,7 @@
 import pandas
 import tqdm
 
-from jitml import SuperPmi, SuperPmiContext, JitCseModel, MethodContext, JitCseEnv
+from jitml import SuperPmi, SuperPmiContext, JitCseModel, MethodContext, JitCseEnv, split_for_cse
 from train import validate_core_root
 
 class ModelResult(Enum):
@@ -109,9 +109,6 @@ def test_model(superpmi : SuperPmi, jitrl : JitCseModel, method_ids, model_name)
 
 def evaluate(superpmi, jitrl, methods, model_name, csv_file) -> pandas.DataFrame:
     """Evaluate the model and save to the specified CSV file."""
-    print(csv_file)
-    print(model_name)
-    print(len(methods))
     if os.path.exists(csv_file):
         return pandas.read_csv(csv_file)
 
@@ -200,10 +197,11 @@ def main(args):
         spmi_context = SuperPmiContext.load(spmi_file)
     else:
         print(f"Creating SuperPmiContext '{spmi_file}', this may take several minutes...")
-        spmi_context = SuperPmiContext(core_root=args.core_root, mch=args.mch)
-        spmi_context.find_methods_and_split(0.1)
+        spmi_context = SuperPmiContext.create_from_mch(args.mch, args.core_root)
         spmi_context.save(spmi_file)
 
+    test_methods, training_methods = split_for_cse(spmi_context.methods, 0.1)
+
     for file in enumerate_models(dir_or_path):
         print(file)
         with spmi_context.create_superpmi() as superpmi:
@@ -216,11 +214,11 @@ def main(args):
             model_name = os.path.splitext(file)[0]
 
             filename = os.path.join(dir_or_path, f"{model_name}_test.csv")
-            result = evaluate(superpmi, jitrl, spmi_context.test_methods, model_name, filename)
+            result = evaluate(superpmi, jitrl, test_methods, model_name, filename)
             print_result(result, model_name, "Test")
 
             filename = os.path.join(dir_or_path, f"{model_name}_train.csv")
-            result = evaluate(superpmi, jitrl, spmi_context.training_methods, model_name, filename)
+            result = evaluate(superpmi, jitrl, training_methods, model_name, filename)
             print_result(result, model_name, "Train")
 
 if __name__ == "__main__":
diff --git a/src/coreclr/scripts/cse_ml/jitml/__init__.py b/src/coreclr/scripts/cse_ml/jitml/__init__.py
@@ -4,6 +4,7 @@
 from .jit_cse import JitCseEnv
 from .machine_learning import JitCseModel
 from .wrappers import OptimalCseWrapper, NormalizeFeaturesWrapper
+from .constants import is_acceptable_for_cse, split_for_cse
 
 __all__ = [
     SuperPmi.__name__,
@@ -15,4 +16,6 @@
     JitType.__name__,
     OptimalCseWrapper.__name__,
     NormalizeFeaturesWrapper.__name__,
+    is_acceptable_for_cse.__name__,
+    split_for_cse.__name__,
 ]
diff --git a/src/coreclr/scripts/cse_ml/jitml/constants.py b/src/coreclr/scripts/cse_ml/jitml/constants.py
@@ -1,12 +1,50 @@
 """Constants and parameters for the project."""
 
+from typing import Sequence
+
+import numpy as np
+from .method_context import MethodContext
+
 MIN_CSE = 3
 MAX_CSE = 16
 
 INVALID_ACTION_PENALTY = -0.05
 INVALID_ACTION_LIMIT = 20
 
-def is_acceptable_method(method):
-    """Returns True if the method is acceptable for training."""
+def is_acceptable_for_cse(method):
+    """Returns True if the method suits JitCseEnv: >= MIN_CSE viable and <= MAX_CSE total candidates."""
     applicable = len([x for x in method.cse_candidates if x.viable])
     return MIN_CSE <= applicable and len(method.cse_candidates) <= MAX_CSE
+
+def split_for_cse(methods : Sequence['MethodContext'], test_percent=0.1):
+    """Filters the methods down to those acceptable for CSE training, then splits them into
+    test and train sets, stratified by each method's num_cse.
+
+    Args:
+        methods: The candidate methods.
+        test_percent: Fraction of each num_cse group to hold back for testing.
+
+    Returns:
+        A (test, train) tuple of method lists.  Groups too small to contribute at least one
+        test method are dropped from both lists.
+    """
+    method_by_cse = {}
+    for x in methods:
+        if is_acceptable_for_cse(x):
+            method_by_cse.setdefault(x.num_cse, []).append(x)
+
+    test = []
+    train = []
+
+    # Process groups smallest-first, breaking ties on num_cse so the result does not depend on
+    # the order of the input (a plain sort by length leaves equal-sized groups in input order).
+    for _, method_group in sorted(method_by_cse.items(), key=lambda kv: (len(kv[1]), kv[0])):
+        split = int(len(method_group) * test_percent)
+        if split > 0:
+            # Sort by index, then shuffle with a fixed seed so repeated calls line up.
+            method_group.sort(key=lambda x: x.index)
+            np.random.default_rng(seed=42).shuffle(method_group)
+            test.extend(method_group[:split])
+            train.extend(method_group[split:])
+
+    return test, train
diff --git a/src/coreclr/scripts/cse_ml/jitml/jit_cse.py b/src/coreclr/scripts/cse_ml/jitml/jit_cse.py
@@ -6,7 +6,7 @@
 
 from .method_context import JitType, MethodContext
 from .superpmi import SuperPmi, SuperPmiContext
-from .constants import (INVALID_ACTION_PENALTY, INVALID_ACTION_LIMIT, MAX_CSE, is_acceptable_method)
+from .constants import (INVALID_ACTION_PENALTY, INVALID_ACTION_LIMIT, MAX_CSE, is_acceptable_for_cse)
 
 # observation space
 JITTYPE_ONEHOT_SIZE = 6
@@ -60,13 +60,8 @@ def reset(self, *, seed: int | None = None, options: dict[str, Any] | None = Non
         while True:
             index = self.__select_method()
             no_cse = self._jit_method_with_cleanup(index, JitMetrics=1, JitRLHook=1, JitRLHookCSEDecisions=[])
-            if no_cse is None:
-                continue
-
-            if is_acceptable_method(no_cse):
-                original_heuristic = self._jit_method_with_cleanup(index, JitMetrics=1)
-                if original_heuristic is None:
-                    continue
+            original_heuristic = self._jit_method_with_cleanup(index, JitMetrics=1)
+            if no_cse and original_heuristic:
                 break
 
             failure_count += 1
@@ -184,7 +179,7 @@ def get_observation(cls, method : MethodContext, fill=True):
 
             # one-hot encode the type
             one_hot = [0.0] * 6
-            one_hot[cse.type.value - 1] = 1.0
+            one_hot[cse.type - 1] = 1.0
             tensor.extend(one_hot)
 
             # boolean features
@@ -227,7 +222,7 @@ def _jit_method_with_cleanup(self, m_id, *args, **kwargs):
     def __select_method(self):
         if self.methods is None:
             superpmi = self.__get_or_create_superpmi()
-            self.methods = [x.index for x in superpmi.enumerate_methods() if is_acceptable_method(x)]
+            self.methods = [x.index for x in superpmi.enumerate_methods() if is_acceptable_for_cse(x)]
 
         return np.random.choice(self.methods)
 
diff --git a/src/coreclr/scripts/cse_ml/jitml/machine_learning.py b/src/coreclr/scripts/cse_ml/jitml/machine_learning.py
@@ -17,6 +17,7 @@
 from stable_baselines3.common.vec_env import SubprocVecEnv
 import gymnasium as gym
 
+from .method_context import MethodContext
 from .jit_cse import JitCseEnv
 from .superpmi import SuperPmiContext
 
@@ -60,12 +61,14 @@ def action_probabilities(self, obs):
         probs = action_distribution.distribution.probs
         return probs.cpu().detach().numpy()[0]
 
-    def train(self, pmi_context : SuperPmiContext, output_dir : str, iterations = None, parallel = None,
-              progress_bar = True, wrappers : Optional[List[gym.Wrapper]] = None) -> str:
+    def train(self, pmi_context : SuperPmiContext, training_methods : List[MethodContext], output_dir : str,
+              iterations = None, parallel = None, progress_bar = True,
+              wrappers : Optional[List[gym.Wrapper]] = None) -> str:
         """Trains a model from scratch.
 
         Args:
             pmi_context: The SuperPmiContext to use for training.
+            training_methods : The methods to train on.
             output_dir: The directory to save the model to.
             iterations: The number of iterations to train for.  Defaults to 100,000.
             parallel: The number of parallel environments to use.  Defaults to single-process (None).
@@ -74,10 +77,11 @@ def train(self, pmi_context : SuperPmiContext, output_dir : str, iterations = No
         Returns:
             The full path to the trained model.
         """
+        training_methods = [m.index for m in training_methods]
         os.makedirs(output_dir, exist_ok=True)
 
         def default_make_env():
-            env = JitCseEnv(pmi_context, pmi_context.training_methods)
+            env = JitCseEnv(pmi_context, training_methods)
             if wrappers:
                 for wrapper in wrappers:
                     env = wrapper(env)
diff --git a/src/coreclr/scripts/cse_ml/jitml/method_context.py b/src/coreclr/scripts/cse_ml/jitml/method_context.py
@@ -26,7 +26,7 @@ class CseCandidate(BaseModel):
     make_cse : bool
     has_call : bool
     containable : bool
-    type : JitType
+    type : int
     cost_ex : int
     cost_sz : int
     use_count : int
@@ -38,6 +38,7 @@ class CseCandidate(BaseModel):
     bb_count : int
     block_spread : int
     enreg_count : int
+    for_testing : Optional[bool] = False
 
     @field_validator('applied', 'viable', 'live_across_call', 'const', 'shared_const', 'make_cse', 'has_call',
                      'containable', mode='before')
diff --git a/src/coreclr/scripts/cse_ml/jitml/superpmi.py b/src/coreclr/scripts/cse_ml/jitml/superpmi.py
@@ -5,10 +5,8 @@
 import subprocess
 import re
 from typing import Iterable, List, Optional
-import numpy as np
 from pydantic import BaseModel, field_validator
 
-from .constants import is_acceptable_method
 from .method_context import MethodContext
 
 class SuperPmiContext(BaseModel):
@@ -18,8 +16,7 @@ class SuperPmiContext(BaseModel):
     core_root : str
     mch : str
     jit : Optional[str] = None
-    test_methods : Optional[List[int]] = []
-    training_methods : Optional[List[int]] = []
+    methods : Optional[List[MethodContext]] = []
 
     @field_validator('core_root', 'mch', mode='before')
     @classmethod
@@ -37,37 +34,24 @@ def _validate_optional_path(cls, v):
 
         return v
 
-    def resplit_data(self, test_percent:float):
-        """Splits the data into training and testing sets."""
-        if not self.test_methods and not self.training_methods:
-            raise ValueError("No methods to split.  Try calling 'find_methods_and_split' first.")
-
-        all_methods = self.test_methods + self.training_methods
-        np.random.shuffle(all_methods)
-        self.test_methods = all_methods[:int(len(all_methods) * test_percent)]
-        self.training_methods = all_methods[len(self.test_methods):]
-
-    def find_methods_and_split(self, test_percent:float) -> None:
+    @staticmethod
+    def create_from_mch(mch : str, core_root : str,  jit : Optional[str] = None) -> 'SuperPmiContext':
         """Loads the SuperPmiContext from the specified arguments."""
-        suitable_methods = []
-        with SuperPmi(self) as superpmi:
+        result = SuperPmiContext(core_root=core_root, mch=mch, jit=jit)
+
+        methods = []
+        with SuperPmi(result) as superpmi:
             for method in superpmi.enumerate_methods():
-                if is_acceptable_method(method):
-                    suitable_methods.append(method.index)
+                methods.append(method)
 
-        self.test_methods = suitable_methods
-        self.resplit_data(test_percent)
+        result.methods = methods
+        return result
 
     def save(self, file_path:str):
         """Saves the SuperPmiContext to a file."""
         with open(file_path, 'w', encoding="utf8") as f:
             json.dump(self.model_dump(), f)
 
-
-    def create_superpmi(self, verbosity:str = 'q'):
-        """Creates a SuperPmi object from this context."""
-        return SuperPmi(self, verbosity)
-
     @staticmethod
     def load(file_path:str):
         """Loads the SuperPmiContext from a file."""
@@ -78,13 +62,15 @@ def load(file_path:str):
             data = json.load(f)
             return SuperPmiContext(**data)
 
+    def create_superpmi(self, verbosity:str = 'q'):
+        """Returns a new SuperPmi object built from this context with the given verbosity."""
+        return SuperPmi(self, verbosity)
 
 class SuperPmi:
     """Controls one instance of superpmi."""
     def __init__(self, context : SuperPmiContext, verbosity:str = 'q'):
         """Constructor.
         core_root is the path to the coreclr build, usually at [repo]/artifiacts/bin/coreclr/[arch]/.
-        jit is the full path to the jit to use. Default is None.
         verbosity is the verbosity level of the superpmi process. Default is 'q'."""
         self._process = None
         self._feature_names = None
diff --git a/src/coreclr/scripts/cse_ml/train.py b/src/coreclr/scripts/cse_ml/train.py
@@ -4,7 +4,7 @@
 import os
 import argparse
 
-from jitml import SuperPmiContext, JitCseModel, OptimalCseWrapper, NormalizeFeaturesWrapper
+from jitml import SuperPmiContext, JitCseModel, OptimalCseWrapper, NormalizeFeaturesWrapper, split_for_cse
 
 def validate_core_root(core_root):
     """Validates and returns the core_root directory."""
@@ -44,11 +44,11 @@ def main(args):
         ctx = SuperPmiContext.load(spmi_file)
     else:
         print(f"Creating SuperPmiContext '{spmi_file}', this may take several minutes...")
-        ctx = SuperPmiContext(core_root=args.core_root, mch=args.mch)
-        ctx.find_methods_and_split(args.test_percent)
+        ctx = SuperPmiContext.create_from_mch(args.mch, args.core_root)
         ctx.save(spmi_file)
 
-    print(f"Training with {len(ctx.training_methods)} methods, holding back {len(ctx.test_methods)} for testing.")
+    test_methods, training_methods = split_for_cse(ctx.methods, 0.1)
+    print(f"Training with {len(training_methods)} methods, holding back {len(test_methods)} for testing.")
 
     # Define our own environment (with wrappers) if requested.
 
@@ -63,7 +63,7 @@ def main(args):
         wrappers.append(NormalizeFeaturesWrapper)
 
     iterations = args.iterations if args.iterations is not None else 1_000_000
-    path = rl.train(ctx, output_dir, iterations=iterations, parallel=args.parallel, wrappers=wrappers)
+    path = rl.train(ctx, training_methods, output_dir, iterations=iterations, parallel=args.parallel, wrappers=wrappers)
     print(f"Model saved to: {path}")
 
 if __name__ == "__main__":