1+ import httpx
12import random
23from collections import defaultdict
34from math import ceil
4- from typing import TYPE_CHECKING , Optional , Union
5+ from typing import TYPE_CHECKING , Any , Optional , Union
56
67from pydantic import BaseModel , computed_field
78
89if TYPE_CHECKING :
910 from guidellm .benchmark .benchmark import GenerativeBenchmark
1011
12+ from guidellm .dataset .file import FileDatasetCreator
13+ from guidellm .dataset .hf_datasets import HFDatasetsCreator
14+ from guidellm .dataset .in_memory import InMemoryDatasetCreator
15+ from guidellm .dataset .synthetic import SyntheticDatasetConfig , SyntheticDatasetCreator
1116from guidellm .objects .statistics import DistributionSummary
17+ from guidellm .preprocess .dataset import TokensConfig
18+
1219
1320
1421class Bucket (BaseModel ):
@@ -58,6 +65,38 @@ class Model(BaseModel):
class Dataset(BaseModel):
    """Identifies the dataset a benchmark run was driven by."""

    name: str

    @classmethod
    def from_data(cls, request_loader: Any):
        """Build a ``Dataset`` by resolving a name for the loader's data source.

        The dataset creators are tried in a fixed order; the first one that
        reports support for ``request_loader.data`` is used to create the
        dataset and to extract its name. When the creator cannot supply a
        name, a creator-specific fallback is used instead.

        :param request_loader: object exposing ``data``, ``data_args``,
            ``processor`` and ``processor_args`` attributes.
        :return: a ``Dataset`` whose ``name`` is the resolved name, or an
            empty string when no creator supports the data or no name
            could be determined.
        """
        creators = [
            InMemoryDatasetCreator,
            SyntheticDatasetCreator,
            FileDatasetCreator,
            HFDatasetsCreator,
        ]
        random_seed = 42

        data = request_loader.data
        data_args = request_loader.data_args
        processor = request_loader.processor
        processor_args = request_loader.processor_args

        dataset_name = ""
        for creator in creators:
            if not creator.is_supported(data, None):
                continue

            dataset = creator.handle_create(
                data, data_args, processor, processor_args, random_seed
            )
            dataset_name = creator.extract_dataset_name(dataset)

            if not dataset_name:
                # `creator` is a class object, so it must be compared against
                # the creator classes themselves; an isinstance() check
                # against HFDatasetsCreator can never match here.
                if creator == SyntheticDatasetCreator:
                    dataset_name = SyntheticDatasetConfig.parse_str(data).source
                elif creator in (FileDatasetCreator, HFDatasetsCreator):
                    # `data` may not be a plain string (e.g. a path-like
                    # object); coerce it so the `name: str` field validates.
                    dataset_name = str(data)
                elif creator == InMemoryDatasetCreator:
                    dataset_name = "In-memory"

            # Only the first supporting creator is consulted.
            break

        return cls(name=dataset_name or "")
99+
61100
62101class RunInfo (BaseModel ):
63102 model : Model
@@ -71,13 +110,16 @@ def from_benchmarks(cls, benchmarks: list["GenerativeBenchmark"]):
71110 timestamp = max (
72111 bm .run_stats .start_time for bm in benchmarks if bm .start_time is not None
73112 )
113+ response = httpx .get (f"https://huggingface.co/api/models/{ model } " )
114+ modelJson = response .json ()
115+
74116 return cls (
75- model = Model (name = model , size = 0 ),
117+ model = Model (name = model , size = modelJson . get ( "usedStorage" , 0 ) ),
76118 task = "N/A" ,
77119 timestamp = timestamp ,
78- dataset = Dataset ( name = "N/A" ),
120+ dataset = Dataset . from_data ( benchmarks [ 0 ]. request_loader ),
79121 )
80-
122+
81123
82124class Distribution (BaseModel ):
83125 statistics : Optional [DistributionSummary ] = None
0 commit comments