11import random
22from collections import defaultdict
33from math import ceil
4- from typing import TYPE_CHECKING , Optional , Union
4+ from typing import TYPE_CHECKING , Any , Optional , Union
55
6+ import httpx
67from pydantic import BaseModel , computed_field
78
89if TYPE_CHECKING :
910 from guidellm .benchmark .benchmark import GenerativeBenchmark
1011
12+ from guidellm .dataset .file import FileDatasetCreator
13+ from guidellm .dataset .hf_datasets import HFDatasetsCreator
14+ from guidellm .dataset .in_memory import InMemoryDatasetCreator
15+ from guidellm .dataset .synthetic import SyntheticDatasetConfig , SyntheticDatasetCreator
1116from guidellm .objects .statistics import DistributionSummary
1217
1318
@@ -58,6 +63,41 @@ class Model(BaseModel):
class Dataset(BaseModel):
    """Display name of the dataset that a benchmark run drew its requests from."""

    name: str

    @classmethod
    def from_data(cls, request_loader: Any):
        """Build a ``Dataset`` whose name is derived from a request loader.

        The dataset creators are tried in a fixed order; the first one whose
        ``is_supported`` check accepts ``request_loader.data`` is used to create
        the dataset and extract its name. If that creator yields no name, a
        per-creator fallback is applied. If no creator supports the data, the
        resulting name is the empty string.

        :param request_loader: object exposing ``data``, ``data_args``,
            ``processor`` and ``processor_args`` attributes.
        :return: a ``Dataset`` instance; ``name`` is ``""`` when nothing could
            be determined.
        """
        creators = [
            InMemoryDatasetCreator,
            SyntheticDatasetCreator,
            FileDatasetCreator,
            HFDatasetsCreator,
        ]
        data = request_loader.data
        data_args = request_loader.data_args
        processor = request_loader.processor
        processor_args = request_loader.processor_args
        # Fixed seed handed to handle_create; hoisted because it never changes.
        random_seed = 42
        dataset_name = None

        for creator in creators:
            # NOTE(review): the support check is given None rather than
            # data_args, as in the original code -- confirm this is intended.
            if not creator.is_supported(data, None):
                continue

            dataset = creator.handle_create(
                data, data_args, processor, processor_args, random_seed
            )
            dataset_name = creator.extract_dataset_name(dataset)

            if not dataset_name:
                if creator is SyntheticDatasetCreator:
                    dataset_name = SyntheticDatasetConfig.parse_str(data).source
                elif creator in (FileDatasetCreator, HFDatasetsCreator):
                    # `creator` is a class, so it must be compared by identity /
                    # membership; isinstance() against the class never matches.
                    # str() keeps the value valid for the `name: str` field when
                    # data is a path-like object rather than a plain string.
                    dataset_name = str(data)
                elif creator is InMemoryDatasetCreator:
                    dataset_name = "In-memory"
            # Only the first supporting creator is consulted.
            break

        return cls(name=dataset_name or "")
100+
61101
62102class RunInfo (BaseModel ):
63103 model : Model
@@ -71,11 +111,14 @@ def from_benchmarks(cls, benchmarks: list["GenerativeBenchmark"]):
71111 timestamp = max (
72112 bm .run_stats .start_time for bm in benchmarks if bm .start_time is not None
73113 )
114+ response = httpx .get (f"https://huggingface.co/api/models/{ model } " )
115+ model_json = response .json ()
116+
74117 return cls (
75- model = Model (name = model , size = 0 ),
118+ model = Model (name = model , size = model_json . get ( "usedStorage" , 0 ) ),
76119 task = "N/A" ,
77120 timestamp = timestamp ,
78- dataset = Dataset ( name = "N/A" ),
121+ dataset = Dataset . from_data ( benchmarks [ 0 ]. request_loader ),
79122 )
80123
81124
0 commit comments