From 5d329ce0febea754f8179f8d4669514c139a7fb3 Mon Sep 17 00:00:00 2001
From: Gabriel Benedict
Date: Wed, 20 Aug 2025 15:11:59 +0200
Subject: [PATCH 01/18] deploy fix and add qwen 1.7B

---
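Note on the renamed commands: Typer derives kebab-case CLI names from the
function names below, so 'deploy' becomes 'deploy-model' and the new parser
is 'human-judge-parsing'. A minimal sketch of driving them programmatically,
assuming wraval is installed and exposes the Typer app as wraval.main:app
(the test harness itself is illustrative, not part of this patch):

    # Hedged sketch: exercise the renamed commands via Typer's test runner.
    from typer.testing import CliRunner
    from wraval.main import app

    runner = CliRunner()
    runner.invoke(app, ["human-judge-parsing"])
    result = runner.invoke(app, ["deploy-model", "--model", "qwen-3-1-7B"])
    print(result.output)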
 config/settings.toml | 11 +++++++++--
 pyproject.toml       |  1 -
 src/wraval/main.py   |  8 +++++++-
 3 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/config/settings.toml b/config/settings.toml
index fbf1d08..7792f63 100644
--- a/config/settings.toml
+++ b/config/settings.toml
@@ -1,7 +1,8 @@
 [default]
-region = 'us-east-1'
+region = 'us-west-2'
 data_dir = 's3://llm-finetune-us-east-1-{aws_account}/eval/tones/'
-# "./data"
+# 's3://llm-finetune-us-east-1-{aws_account}/eval/tones/'
+human_eval_dir = 's3://llm-finetune-us-east-1-{aws_account}/human_eval/tones/'
 deploy_bucket_name = 'llm-finetune-us-east-1-{aws_account}'
 deploy_bucket_prefix = 'models'
 sagemaker_execution_role_arn = 'arn:aws:iam::{aws_account}:role/sagemaker-execution-role-us-east-1'
@@ -42,6 +43,12 @@ model = 'Qwen2-5-1-5B-Instruct'
 hf_name = 'Qwen/Qwen2.5-1.5B-Instruct'
 endpoint_type = 'sagemaker'
 
+[qwen-3-1-7B]
+model = 'Qwen3-1.7B-Instruct'
+hf_name = 'Qwen/Qwen3-1.7B'
+# instruct is now this, and base is appended with 'base'
+endpoint_type = 'sagemaker'
+
 [phi-3-ollama]
 model = 'phi3'
 hf_name = 'microsoft/Phi-3.5-mini-instruct'
diff --git a/pyproject.toml b/pyproject.toml
index f12bce3..9b9c596 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,7 +19,6 @@ dependencies = [
     "boto3",
     "plotly~=5.24.1",
     "transformers==4.51.0",
-    "datasets~=3.2.0",
     "evaluate~=0.4.3",
     "dynaconf~=3.2.7",
     "torch",
diff --git a/src/wraval/main.py b/src/wraval/main.py
index b0e269e..9a32c20 100644
--- a/src/wraval/main.py
+++ b/src/wraval/main.py
@@ -228,7 +228,13 @@ def human_judge_upload(
 
 
 @app.command()
-def deploy(
+def human_judge_parsing():
+    """Parse human judgements, merge them into the original results table, and create a plot."""
+    settings = get_settings()
+    parse_human_judgements(settings)
+
+@app.command()
+def deploy_model(
     model: str = typer.Option("haiku-3", "--model", "-m", help="Model to deploy"),
     cleanup_endpoints: bool = typer.Option(
         False,

From 524e66cc59e3949d7a521d32556bb80c8a1b5a6a Mon Sep 17 00:00:00 2001
From: Gabriel Benedict
Date: Wed, 20 Aug 2025 15:20:41 +0200
Subject: [PATCH 02/18] remove bleu score as not in use for now

---
 pyproject.toml                  |  1 -
 src/wraval/actions/bleu_conf.py | 49 ---------------------------------
 2 files changed, 50 deletions(-)
 delete mode 100644 src/wraval/actions/bleu_conf.py

diff --git a/pyproject.toml b/pyproject.toml
index 9b9c596..2c9593f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,7 +19,6 @@ dependencies = [
     "boto3",
     "plotly~=5.24.1",
     "transformers==4.51.0",
-    "evaluate~=0.4.3",
    "dynaconf~=3.2.7",
     "torch",
     "botocore",
diff --git a/src/wraval/actions/bleu_conf.py b/src/wraval/actions/bleu_conf.py
deleted file mode 100644
index 5738d6e..0000000
--- a/src/wraval/actions/bleu_conf.py
+++ /dev/null
@@ -1,49 +0,0 @@
-#
-# // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
-# // SPDX-License-Identifier: Apache-2.0
-#
-import random
-import numpy as np
-from evaluate import load
-
-# Load BLEU metric
-bleu = load("bleu")
-
-
-def compute_bleu_with_ci(
-    predictions, references, num_bootstrap_samples=1000, confidence_level=0.95
-):
-    # Compute the original BLEU score
-    original_bleu = bleu.compute(predictions=predictions, references=references)["bleu"]
-
-    # Bootstrap sampling
-    bootstrap_scores = []
-    n = len(predictions)
-
-    for _ in range(num_bootstrap_samples):
-        # Sample indices with replacement
-        indices = [random.randint(0, n - 1) for _ in range(n)]
-        sampled_predictions = [predictions[i] for i in indices]
-        sampled_references = [references[i] for i in indices]
-
-        # Compute BLEU for the bootstrap sample
-        score = bleu.compute(
-            predictions=sampled_predictions, references=sampled_references
-        )["bleu"]
-        bootstrap_scores.append(score)
-
-    # Calculate confidence intervals
-    lower_bound = np.percentile(bootstrap_scores, (1 - confidence_level) / 2 * 100)
-    upper_bound = np.percentile(bootstrap_scores, (1 + confidence_level) / 2 * 100)
-
-    return {"bleu": original_bleu, "confidence_interval": (lower_bound, upper_bound)}
-
-
-# Example usage
-predictions = ["This is a test", "Another sentence"]
-references = [["This is a test"], ["Another sentence"]]
-
-results = compute_bleu_with_ci(predictions, references)
-
-print(f"BLEU Score: {results['bleu']}")
-print(f"95% Confidence Interval: {results['confidence_interval']}")

From 2f4c637e51b47aeb040dd499e9beb9863ecc82fe Mon Sep 17 00:00:00 2001
From: Gabriel Benedict
Date: Wed, 20 Aug 2025 15:23:45 +0200
Subject: [PATCH 03/18] move the comment in settings.toml

---
 config/settings.toml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/config/settings.toml b/config/settings.toml
index 7792f63..53a9d49 100644
--- a/config/settings.toml
+++ b/config/settings.toml
@@ -45,8 +45,7 @@ endpoint_type = 'sagemaker'
 
 [qwen-3-1-7B]
 model = 'Qwen3-1.7B-Instruct'
-hf_name = 'Qwen/Qwen3-1.7B'
-# instruct is now this, and base is appended with 'base'
+hf_name = 'Qwen/Qwen3-1.7B' # instruct is now this, and base is appended with 'base'
 endpoint_type = 'sagemaker'
 
 [phi-3-ollama]

From b5f05d624ce3bba7bfcedeeec65c97120993fac5 Mon Sep 17 00:00:00 2001
From: Gabriel Benedict
Date: Wed, 20 Aug 2025 17:40:26 +0200
Subject: [PATCH 04/18] using dynaconf variable interpolation

---
 config/settings.toml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/config/settings.toml b/config/settings.toml
index 53a9d49..259dde2 100644
--- a/config/settings.toml
+++ b/config/settings.toml
@@ -1,9 +1,9 @@
 [default]
 region = 'us-west-2'
-data_dir = 's3://llm-finetune-us-east-1-{aws_account}/eval/tones/'
+deploy_bucket_name = 'llm-finetune-{region}-{aws_account}'
+data_dir = 's3://{deploy_bucket_name}/eval/tones/'
 # 's3://llm-finetune-us-east-1-{aws_account}/eval/tones/'
-human_eval_dir = 's3://llm-finetune-us-east-1-{aws_account}/human_eval/tones/'
-deploy_bucket_name = 'llm-finetune-us-east-1-{aws_account}'
+human_eval_dir = 's3://{deploy_bucket_name}/human_eval/tones/'
 deploy_bucket_prefix = 'models'
 sagemaker_execution_role_arn = 'arn:aws:iam::{aws_account}:role/sagemaker-execution-role-us-east-1'
 endpoint_type = 'bedrock'

From 7519af6093902889 Mon Sep 17 00:00:00 2001
From: Gabriel Benedict
Date: Wed, 20 Aug 2025 17:41:03 +0200
Subject: [PATCH 05/18] using dynaconf variable interpolation for sagemaker role too

---
 config/settings.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/settings.toml b/config/settings.toml
index 259dde2..510d453 100644
--- a/config/settings.toml
+++ b/config/settings.toml
@@ -5,7 +5,7 @@ data_dir = 's3://{deploy_bucket_name}/eval/tones/'
 # 's3://llm-finetune-us-east-1-{aws_account}/eval/tones/'
 human_eval_dir = 's3://{deploy_bucket_name}/human_eval/tones/'
 deploy_bucket_prefix = 'models'
-sagemaker_execution_role_arn = 'arn:aws:iam::{aws_account}:role/sagemaker-execution-role-us-east-1'
+sagemaker_execution_role_arn = 'arn:aws:iam::{aws_account}:role/sagemaker-execution-role-{region}'
 endpoint_type = 'bedrock'
 model = 'anthropic.claude-3-haiku-20240307-v1:0'

From 837d55f64adb88e286f535f194f42a65fa29d957 Mon Sep 17 00:00:00 2001
From: Gabriel Benedict
Date: Wed, 20 Aug 2025 17:42:36 +0200
Subject: [PATCH 06/18] using dynaconf variable interpolation with only one nesting

---
 config/settings.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/config/settings.toml b/config/settings.toml
index 510d453..b6394db 100644
--- a/config/settings.toml
+++ b/config/settings.toml
@@ -1,9 +1,9 @@
 [default]
 region = 'us-west-2'
 deploy_bucket_name = 'llm-finetune-{region}-{aws_account}'
-data_dir = 's3://{deploy_bucket_name}/eval/tones/'
+data_dir = 's3://llm-finetune-{region}-{aws_account}/eval/tones/'
 # 's3://llm-finetune-us-east-1-{aws_account}/eval/tones/'
-human_eval_dir = 's3://{deploy_bucket_name}/human_eval/tones/'
+human_eval_dir = 's3://llm-finetune-{region}-{aws_account}/human_eval/tones/'
 deploy_bucket_prefix = 'models'
 sagemaker_execution_role_arn = 'arn:aws:iam::{aws_account}:role/sagemaker-execution-role-{region}'
 endpoint_type = 'bedrock'

From ba7ff24b9ec757172fd36b8253c83a317bcc1e77 Mon Sep 17 00:00:00 2001
From: Gabriel Benedict
Date: Wed, 20 Aug 2025 17:53:11 +0200
Subject: [PATCH 07/18] dropping dynaconf variable interpolation but keeping region flexible in python

---
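Note: dynaconf does not expand plain '{region}'-style placeholders in
settings.toml by itself, which is why this commit falls back to explicit
str.format() calls in Python. For comparison, dynaconf's native interpolation
uses '@format' tokens, e.g. "@format s3://llm-finetune-{this.region}/eval/tones/"
to reference other settings. A minimal hedged sketch of the Python-side
expansion this commit settles on, assuming config/settings.toml defines the
templated keys:

    from dynaconf import Dynaconf

    settings = Dynaconf(settings_files=["config/settings.toml"])
    # Expand the brace templates once, right after loading the settings.
    settings.data_dir = settings.data_dir.format(
        region=settings.region, aws_account=settings.aws_account
    )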
 src/wraval/main.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/wraval/main.py b/src/wraval/main.py
index 9a32c20..3a81d49 100644
--- a/src/wraval/main.py
+++ b/src/wraval/main.py
@@ -62,6 +62,11 @@ def get_settings(
     if local_tokenizer_path:
         settings.local_tokenizer_path = local_tokenizer_path
 
+    settings.deploy_bucket_name = settings.deploy_bucket_name.format(region=settings.region, aws_account=settings.aws_account)
+    settings.data_dir = settings.data_dir.format(region=settings.region, aws_account=settings.aws_account)
+    settings.human_eval_dir = settings.human_eval_dir.format(region=settings.region, aws_account=settings.aws_account)
+    settings.sagemaker_execution_role_arn = settings.sagemaker_execution_role_arn.format(region=settings.region, aws_account=settings.aws_account)
+
     # Format settings with AWS account
     settings.model = settings.model.format(aws_account=settings.aws_account)
     settings.data_dir = settings.data_dir.format(aws_account=settings.aws_account)

From 0b1463ff3e08f860a6f27234cc792346a38203d3 Mon Sep 17 00:00:00 2001
From: Gabriel Benedict
Date: Thu, 21 Aug 2025 17:19:34 +0200
Subject: [PATCH 08/18] no_think input and output

---
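Note: '/no_think' is Qwen3's soft switch, appended to the system prompt to
suppress the thinking block for a turn. The tokenizer's chat template exposes
the same control as a hard switch; a hedged sketch (the prompt text is
illustrative):

    # Minimal sketch, assuming a transformers version shipping the Qwen3 chat template.
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-1.7B")
    messages = [{"role": "user", "content": "Rewrite this in a witty tone: pass the milk."}]
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,  # hard switch; '/no_think' toggles the same behaviour per turn
    )
    # With thinking disabled, the assistant turn still begins with an empty
    # <think></think> block, which extract_last_assistant_response() below strips.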
 config/settings.toml                   |  4 +++-
 src/wraval/actions/action_deploy.py    |  2 +-
 src/wraval/actions/action_inference.py |  3 ++-
 src/wraval/actions/completion.py       | 24 ++++++++++++++++++------
 src/wraval/actions/format.py           |  7 +++++--
 src/wraval/actions/model_router.py     |  9 +++++++--
 6 files changed, 36 insertions(+), 13 deletions(-)

diff --git a/config/settings.toml b/config/settings.toml
index b6394db..67603ae 100644
--- a/config/settings.toml
+++ b/config/settings.toml
@@ -1,4 +1,5 @@
 [default]
+# region = 'us-east-1'
 region = 'us-west-2'
 deploy_bucket_name = 'llm-finetune-{region}-{aws_account}'
 data_dir = 's3://llm-finetune-{region}-{aws_account}/eval/tones/'
@@ -44,9 +45,10 @@ hf_name = 'Qwen/Qwen2.5-1.5B-Instruct'
 endpoint_type = 'sagemaker'
 
 [qwen-3-1-7B]
-model = 'Qwen3-1.7B-Instruct'
+model = 'Qwen3-1-7B'
 hf_name = 'Qwen/Qwen3-1.7B' # instruct is now this, and base is appended with 'base'
 endpoint_type = 'sagemaker'
+thinking = false
 
 [phi-3-ollama]
 model = 'phi3'
diff --git a/src/wraval/actions/action_deploy.py b/src/wraval/actions/action_deploy.py
index 8cd2aa0..273d4ee 100644
--- a/src/wraval/actions/action_deploy.py
+++ b/src/wraval/actions/action_deploy.py
@@ -106,7 +106,7 @@ def deploy_endpoint(s3_uri, role, endpoint_name):
 def validate_deployment(predictor):
     try:
         sagemaker_runtime_client = boto3.client("sagemaker-runtime")
-        input_string = json.dumps({"inputs": "Hello, my dog is a little"})
+        input_string = json.dumps({"inputs": "<|im_start|>user\nHello, can you pass me the milk?<|im_end|>\n<|im_start|>assistant\n"})
         response = sagemaker_runtime_client.invoke_endpoint(
             EndpointName=predictor.endpoint_name,
             Body=input_string.encode("utf-8"),
diff --git a/src/wraval/actions/action_inference.py b/src/wraval/actions/action_inference.py
index b9ee95d..75a8394 100644
--- a/src/wraval/actions/action_inference.py
+++ b/src/wraval/actions/action_inference.py
@@ -45,8 +45,9 @@ def run_inference(
     print(f"Processing {len(queries)} inputs for tone: {tone}")
 
     outputs = route_completion(settings, queries, tone_prompt)
-    
+
     cleaned_output = [o.strip().strip('"') for o in outputs]
-    
+
     if no_rewrite:
diff --git a/src/wraval/actions/completion.py b/src/wraval/actions/completion.py
index 41951bf..f82e3e2 100644
--- a/src/wraval/actions/completion.py
+++ b/src/wraval/actions/completion.py
@@ -15,13 +15,25 @@
 
 # Function to extract last assistant response from each entry
 def extract_last_assistant_response(data):
-    matches = re.findall(r"<\|assistant\|>(.*?)<\|end\|>", data, re.DOTALL)
-    # matches = re.findall(r"(.*?)", data, re.DOTALL)
-    if matches:
-        return matches[-1].strip()
-    else:
-        return data
+    if "<|assistant|>" in data:  # phi
+        assistant_part = data.split("<|assistant|>")[-1]
+        response = assistant_part.replace("<|end|>", "").strip()
+        return response
+
+    if "<|im_start|>assistant" in data:  # qwen
+        assistant_part = data.split("<|im_start|>assistant")[-1]
+
+        # Remove the thinking part if it exists
+        if "</think>" in assistant_part:
+            # Extract everything after </think>
+            response = assistant_part.split("</think>")[-1]
+        else:
+            response = assistant_part
+        response = response.replace("<|im_end|>", "").strip()
+        return response
+
+    return data
 
 
 def get_bedrock_completion(settings, prompt, system_prompt=None):
     bedrock_client = boto3.client(
diff --git a/src/wraval/actions/format.py b/src/wraval/actions/format.py
index d789682..25056db 100644
--- a/src/wraval/actions/format.py
+++ b/src/wraval/actions/format.py
@@ -6,7 +6,7 @@
 import xml.dom.minidom
 
 
-def format_prompt(usr_prompt, prompt=None, tokenizer=None, type="bedrock"):
+def format_prompt(usr_prompt, prompt=None, tokenizer=None, type="bedrock", thinking=None):
     """
     Format prompts according to each model's prompt guidelines (e.g. xml tags for Haiku).
@@ -18,7 +18,10 @@ def format_prompt(usr_prompt, prompt=None, tokenizer=None, type="bedrock", think
 
     if type == "hf":
         if prompt:
-            sys_prompt = [{"role": "system", "content": prompt.sys_prompt}]
+            if thinking is None or True:
+                sys_prompt = [{"role": "system", "content": prompt.sys_prompt}]
+            else:
+                sys_prompt = [{"role": "system", "content": prompt.sys_prompt + '/no_think'}]
             messages = []
             if prompt.examples:
                 for k, v in prompt.examples[0].items():
diff --git a/src/wraval/actions/model_router.py b/src/wraval/actions/model_router.py
index 40a8a77..a404be4 100644
--- a/src/wraval/actions/model_router.py
+++ b/src/wraval/actions/model_router.py
@@ -45,14 +45,19 @@ class SageMakerRouter(HuggingFaceModelRouter):
     def __init__(self, master_sys_prompt, settings):
         super().__init__(master_sys_prompt, settings)
         self.model_name = settings.model
+        self.region = settings.region
+        if settings.exists('thinking'):
+            self.thinking = settings.thinking
+        else:
+            self.thinking = None
 
     def get_completion(self, queries: List[str]) -> List[str]:
         prompts = [
-            format_prompt(text, self.master_sys_prompt, self.tokenizer, type="hf")
+            format_prompt(text, self.master_sys_prompt, self.tokenizer, "hf", self.thinking)
             for text in queries
         ]
         return [
-            invoke_sagemaker_endpoint({"inputs": prompt}, self.model_name) for prompt in tqdm(prompts)
+            invoke_sagemaker_endpoint({"inputs": prompt}, self.model_name, self.region) for prompt in tqdm(prompts)
         ]
 
 

From b58fd5c1c2d683672109ea0e37c91d914890ea5a Mon Sep 17 00:00:00 2001
From: Gabriel Benedict
Date: Fri, 22 Aug 2025 11:29:55 +0200
Subject: [PATCH 09/18] try async

---
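Note: the AsyncInferenceConfig keyword arguments used in this commit are
placeholders that a later commit in this series drops. For reference, a hedged
sketch of the sagemaker SDK's actual knobs; the output_path value is an
assumption about where results should land:

    from sagemaker.async_inference.async_inference_config import AsyncInferenceConfig

    async_config = AsyncInferenceConfig(
        # S3 prefix for async results; SageMaker generates a location if omitted.
        output_path="s3://llm-finetune-us-west-2-{aws_account}/async-output/",
        # Queue depth is controlled per instance, not globally.
        max_concurrent_invocations_per_instance=4,
    )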
 config/settings.toml                   |  7 +++++++
 src/wraval/actions/action_deploy.py    | 13 +++++++++++--
 src/wraval/actions/action_inference.py |  2 +-
 3 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/config/settings.toml b/config/settings.toml
index 67603ae..d320bee 100644
--- a/config/settings.toml
+++ b/config/settings.toml
@@ -50,6 +50,13 @@ hf_name = 'Qwen/Qwen3-1.7B' # instruct is now this, and base is appended with 'b
 endpoint_type = 'sagemaker'
 thinking = false
 
+[qwen-3-1-7B-async]
+model = 'Qwen3-1-7B'
+hf_name = 'Qwen/Qwen3-1.7B' # instruct is now this, and base is appended with 'base'
+endpoint_type = 'sagemaker'
+thinking = false
+async = true
+
 [phi-3-ollama]
 model = 'phi3'
 hf_name = 'microsoft/Phi-3.5-mini-instruct'
diff --git a/src/wraval/actions/action_deploy.py b/src/wraval/actions/action_deploy.py
index 273d4ee..3f009d4 100644
--- a/src/wraval/actions/action_deploy.py
+++ b/src/wraval/actions/action_deploy.py
@@ -6,6 +6,7 @@
 import boto3
 import torch
 from sagemaker.huggingface import HuggingFaceModel
+from sagemaker.async_inference.async_inference_config import AsyncInferenceConfig
 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 
 PACKAGE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@@ -80,7 +81,7 @@ def write_model_to_s3(settings, model_name):
     return s3_uri
 
 
-def deploy_endpoint(s3_uri, role, endpoint_name):
+def deploy_endpoint(s3_uri, role, endpoint_name, async_config=None):
     env = {
         "HF_TASK": "text-generation",
         "HF_HUB_OFFLINE": "1",
@@ -100,6 +101,7 @@ def deploy_endpoint(s3_uri, role, endpoint_name):
         initial_instance_count=1,
         instance_type="ml.g5.2xlarge",
         endpoint_name=endpoint_name,
+        async_inference_config=async_config,
     )
@@ -145,7 +147,14 @@ def cleanup_model_directory():
     sanitized_model_name = settings.hf_name.split("/")[1].replace(".", "-")
     load_artifacts(settings)
     s3_uri = write_model_to_s3(settings, sanitized_model_name)
+    if settings.exists('async'):
+        async_config = AsyncInferenceConfig(
+            max_concurrency=1000,
+            max_invocations=1000,
+            max_payload_in_mb=1000
+        )
+
     predictor = deploy_endpoint(
-        s3_uri, settings.sagemaker_execution_role_arn, sanitized_model_name
+        s3_uri, settings.sagemaker_execution_role_arn, sanitized_model_name, async_config
     )
     validate_deployment(predictor)
diff --git a/src/wraval/actions/action_inference.py b/src/wraval/actions/action_inference.py
index 75a8394..e1bfffd 100644
--- a/src/wraval/actions/action_inference.py
+++ b/src/wraval/actions/action_inference.py
@@ -40,7 +40,7 @@ def run_inference(
 
     tone_prompt = get_prompt(Tone(tone))
 
-    queries = results[results["tone"] == tone]["synthetic_data"]
+    queries = results[results["tone"] == tone]["synthetic_data"].unique()
 
     print(f"Processing {len(queries)} inputs for tone: {tone}")

From 8bdcd9cac7966f2049e2d319180e5ee811a023f5 Mon Sep 17 00:00:00 2001
From: Gabriel Benedict
Date: Fri, 22 Aug 2025 11:52:24 +0200
Subject: [PATCH 10/18] empty async config

---
 src/wraval/actions/action_deploy.py    | 7 +------
 src/wraval/actions/action_inference.py | 2 +-
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/src/wraval/actions/action_deploy.py b/src/wraval/actions/action_deploy.py
index 3f009d4..29b47f6 100644
--- a/src/wraval/actions/action_deploy.py
+++ b/src/wraval/actions/action_deploy.py
@@ -148,12 +148,7 @@ def deploy(settings):
     load_artifacts(settings)
     s3_uri = write_model_to_s3(settings, sanitized_model_name)
     if settings.exists('async'):
-        async_config = AsyncInferenceConfig(
-            max_concurrency=1000,
-            max_invocations=1000,
-            max_payload_in_mb=1000
-        )
-
+        async_config = AsyncInferenceConfig()
     predictor = deploy_endpoint(
         s3_uri, settings.sagemaker_execution_role_arn, sanitized_model_name, async_config
     )
diff --git a/src/wraval/actions/action_inference.py b/src/wraval/actions/action_inference.py
index e1bfffd..b8f0c19 100644
--- a/src/wraval/actions/action_inference.py
+++ b/src/wraval/actions/action_inference.py
@@ -54,7 +54,7 @@ def run_inference(
         results.loc[mask, "inference_model"] = model_name
     else:
         new_results = pd.DataFrame(
-            {"synthetic_data": results[results["tone"] == tone]["synthetic_data"]}
+            {"synthetic_data": results[results["tone"] == tone]["synthetic_data"].unique()}
         )

From 9ac330f9af2b47feed79bfd873aaade0c38789fd Mon Sep 17 00:00:00 2001
From: Gabriel Benedict
Date: Fri, 22 Aug 2025 12:01:08 +0200
Subject: [PATCH 11/18] different async qwen name

---
 config/settings.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/settings.toml b/config/settings.toml
index d320bee..e8e0cef 100644
--- a/config/settings.toml
+++ b/config/settings.toml
@@ -51,7 +51,7 @@ endpoint_type = 'sagemaker'
 thinking = false
 
 [qwen-3-1-7B-async]
-model = 'Qwen3-1-7B'
+model = 'Qwen3-1-7B-async'
 hf_name = 'Qwen/Qwen3-1.7B' # instruct is now this, and base is appended with 'base'
 endpoint_type = 'sagemaker'
 thinking = false

From 7c3471587e4c7057e01c1f068925004440905c7d Mon Sep 17 00:00:00 2001
From: Gabriel Benedict
Date: Fri, 22 Aug 2025 12:09:12 +0200
Subject: [PATCH 12/18] use model name and not hf name as sagemaker endpoint name

---
 src/wraval/actions/action_deploy.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/wraval/actions/action_deploy.py b/src/wraval/actions/action_deploy.py
index 29b47f6..2c6f0e5 100644
--- a/src/wraval/actions/action_deploy.py
+++ b/src/wraval/actions/action_deploy.py
@@ -144,7 +144,7 @@ def cleanup_model_directory():
 def deploy(settings):
     validate_model_directory()
     cleanup_model_directory()
-    sanitized_model_name = settings.hf_name.split("/")[1].replace(".", "-")
+    sanitized_model_name = settings.model.split("/")[1].replace(".", "-")
     load_artifacts(settings)
     s3_uri = write_model_to_s3(settings, sanitized_model_name)

From e132f59b369f8e8542c767bfeb023d9102c83435 Mon Sep 17 00:00:00 2001
From: Gabriel Benedict
Date: Fri, 22 Aug 2025 12:10:50 +0200
Subject: [PATCH 13/18] model name has no slash unlike hf name

---
 src/wraval/actions/action_deploy.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/wraval/actions/action_deploy.py b/src/wraval/actions/action_deploy.py
index 2c6f0e5..61e5380 100644
--- a/src/wraval/actions/action_deploy.py
+++ b/src/wraval/actions/action_deploy.py
@@ -144,7 +144,7 @@ def cleanup_model_directory():
 def deploy(settings):
     validate_model_directory()
     cleanup_model_directory()
-    sanitized_model_name = settings.model.split("/")[1].replace(".", "-")
+    sanitized_model_name = settings.model.replace(".", "-")
     load_artifacts(settings)
     s3_uri = write_model_to_s3(settings, sanitized_model_name)

From 019d5619ad2081e1a21a7be19cc42d03bc19d46d Mon Sep 17 00:00:00 2001
From: Gabriel Benedict
Date: Mon, 25 Aug 2025 11:58:10 +0200
Subject: [PATCH 14/18] add qwen 3 4B

---
 config/settings.toml | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/config/settings.toml b/config/settings.toml
index e8e0cef..8cb835d 100644
--- a/config/settings.toml
+++ b/config/settings.toml
@@ -3,7 +3,7 @@ region = 'us-west-2'
 deploy_bucket_name = 'llm-finetune-{region}-{aws_account}'
 data_dir = 's3://llm-finetune-{region}-{aws_account}/eval/tones/'
-# 's3://llm-finetune-us-east-1-{aws_account}/eval/tones/'
+# data_dir = './data/'
 human_eval_dir = 's3://llm-finetune-{region}-{aws_account}/human_eval/tones/'
 deploy_bucket_prefix = 'models'
 sagemaker_execution_role_arn = 'arn:aws:iam::{aws_account}:role/sagemaker-execution-role-{region}'
@@ -57,6 +57,11 @@ endpoint_type = 'sagemaker'
 thinking = false
 async = true
 
+[qwen-3-4B]
+model = 'Qwen3-4B'
+hf_name = 'Qwen/Qwen3-4B-Instruct-2507' # this finetune is non-thinking only
+endpoint_type = 'sagemaker'
+
 [phi-3-ollama]
 model = 'phi3'
 hf_name = 'microsoft/Phi-3.5-mini-instruct'

From ba7bdbc6d624dcb4e1c616027728b36beed03563 Mon Sep 17 00:00:00 2001
From: Gabriel Benedict
Date: Mon, 25 Aug 2025 12:04:58 +0200
Subject: [PATCH 15/18] async config fix

---
 src/wraval/actions/action_deploy.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/wraval/actions/action_deploy.py b/src/wraval/actions/action_deploy.py
index 61e5380..fb5748e 100644
--- a/src/wraval/actions/action_deploy.py
+++ b/src/wraval/actions/action_deploy.py
@@ -147,6 +147,7 @@ def deploy(settings):
     sanitized_model_name = settings.model.replace(".", "-")
     load_artifacts(settings)
     s3_uri = write_model_to_s3(settings, sanitized_model_name)
+    async_config = None
     if settings.exists('async'):
         async_config = AsyncInferenceConfig()
     predictor = deploy_endpoint(

From 815a24e0abfb209d2d979a3b9df50822e9226a4b Mon Sep 17 00:00:00 2001
From: Gabriel Benedict
Date: Mon, 25 Aug 2025 14:32:58 +0200
Subject: [PATCH 16/18] small fixes and first attempt at a batch endpoint

---
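Note: a hedged usage sketch for the batch helper added below; argument values
are placeholders. Each payload is uploaded to S3, submitted with
invoke_endpoint_async, and the function then polls S3 for the corresponding
.out objects:

    from wraval.actions.completion import batch_invoke_sagemaker_endpoint

    prompts = ["<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\n"]
    results = batch_invoke_sagemaker_endpoint(
        prompts,
        endpoint_name="Qwen3-1-7B-async",
        region="us-west-2",
        s3_bucket="llm-finetune-us-west-2-{aws_account}",
    )
    # results preserves input order; failed or timed-out requests come back as None.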
 config/settings.toml                |   2 +-
 src/wraval/actions/action_deploy.py |   2 +-
 src/wraval/actions/completion.py    | 143 ++++++++++++++++++++++++++++
 src/wraval/actions/format.py        |   2 +-
 src/wraval/actions/model_router.py  |  18 +++++++++++++-----
 5 files changed, 161 insertions(+), 6 deletions(-)

diff --git a/config/settings.toml b/config/settings.toml
index 8cb835d..c2802fd 100644
--- a/config/settings.toml
+++ b/config/settings.toml
@@ -55,7 +55,7 @@ model = 'Qwen3-1-7B-async'
 hf_name = 'Qwen/Qwen3-1.7B' # instruct is now this, and base is appended with 'base'
 endpoint_type = 'sagemaker'
 thinking = false
-async = true
+asynchronous = true
 
 [qwen-3-4B]
 model = 'Qwen3-4B'
diff --git a/src/wraval/actions/action_deploy.py b/src/wraval/actions/action_deploy.py
index fb5748e..1ff18ff 100644
--- a/src/wraval/actions/action_deploy.py
+++ b/src/wraval/actions/action_deploy.py
@@ -148,7 +148,7 @@ def deploy(settings):
     load_artifacts(settings)
     s3_uri = write_model_to_s3(settings, sanitized_model_name)
     async_config = None
-    if settings.exists('async'):
+    if settings.exists('asynchronous'):
         async_config = AsyncInferenceConfig()
     predictor = deploy_endpoint(
         s3_uri, settings.sagemaker_execution_role_arn, sanitized_model_name, async_config
diff --git a/src/wraval/actions/completion.py b/src/wraval/actions/completion.py
index f82e3e2..e2699b0 100644
--- a/src/wraval/actions/completion.py
+++ b/src/wraval/actions/completion.py
@@ -11,6 +11,8 @@
 import boto3
 import re
 import requests
+import uuid
+import time
 
 
 # Function to extract last assistant response from each entry
@@ -232,3 +234,144 @@ def invoke_ollama_endpoint(payload, endpoint_name, url="127.0.0.1:11434"):
         lines.append(json.loads(r))
 
     return "".join([l["response"] for l in lines])
+
+
+def batch_invoke_sagemaker_endpoint(
+    payloads,
+    endpoint_name,
+    region="us-east-1",
+    s3_bucket=None,
+    s3_input_prefix="/eval/async/input/",
+    poll_interval_seconds=10,
+    timeout_seconds=600,
+):
+    """
+    Invoke a SageMaker async endpoint for a batch of payloads.
+
+    - payloads: list of JSON-serializable objects (each is one request)
+    - endpoint_name: name of the async SageMaker endpoint
+    - region: AWS region
+    - s3_bucket: S3 bucket to upload inputs (required)
+    - s3_input_prefix: S3 prefix for input uploads
+    - poll_interval_seconds: interval between checks for output readiness
+    - timeout_seconds: max time to wait for each result
+
+    Returns a list of raw results (strings) in the same order as payloads.
+    """
+    if s3_bucket is None:
+        raise ValueError("s3_bucket is required for async invocations")
+    if not isinstance(s3_bucket, str) or not s3_bucket.strip():
+        raise ValueError(
+            "s3_bucket must be a non-empty string (e.g., 'my-bucket-name'), got: "
+            f"{type(s3_bucket).__name__}"
+        )
+
+    sagemaker_runtime = boto3.client("sagemaker-runtime", region_name=region)
+    s3_client = boto3.client("s3", region_name=region)
+
+    # Normalize the prefix
+    input_prefix = s3_input_prefix.lstrip("/")
+
+    input_locations = []
+    output_locations = []
+    inference_ids = []
+
+    # 1) Upload all payloads and invoke the async endpoint
+    for idx, payload in enumerate(payloads):
+        print(f"Submitting {idx + 1}/{len(payloads)} to async endpoint '{endpoint_name}'...")
+        request_id = str(uuid.uuid4())[:8]
+        input_key = f"{input_prefix}batch-{request_id}-{idx}.json"
+
+        # Ensure the payload is in the format expected by the model container
+        if isinstance(payload, str):
+            payload_to_upload = {"inputs": payload}
+        elif isinstance(payload, list) and all(isinstance(p, str) for p in payload):
+            payload_to_upload = {"inputs": payload}
+        elif isinstance(payload, dict):
+            payload_to_upload = payload
+        else:
+            # Fallback: wrap unknown types under inputs
+            payload_to_upload = {"inputs": payload}
+
+        s3_client.put_object(
+            Bucket=s3_bucket,
+            Key=input_key,
+            Body=json.dumps(payload_to_upload),
+            ContentType="application/json",
+        )
+
+        input_location = f"s3://{s3_bucket}/{input_key}"
+        input_locations.append(input_location)
+
+        response = sagemaker_runtime.invoke_endpoint_async(
+            EndpointName=endpoint_name,
+            InputLocation=input_location,
+            ContentType="application/json",
+            InvocationTimeoutSeconds=3600,
+        )
+
+        output_locations.append(response["OutputLocation"])  # s3 uri
+        inference_ids.append(response.get("InferenceId"))
+        print(f"Submitted {idx + 1}/{len(payloads)}. Output will be written to {response['OutputLocation']}")
+
+    # 2) Poll for each output and download the results
+    results = []
+    for i, output_location in enumerate(output_locations):
+        start_time = time.time()
+
+        # Parse the s3 uri and derive the expected result key: <output-prefix>/<InferenceId>.out
+        uri = output_location.replace("s3://", "")
+        bucket, key = uri.split("/", 1)
+        inference_id = inference_ids[i]
+        expected_key = f"{key.rstrip('/')}/{inference_id}.out" if isinstance(inference_id, str) and inference_id else key
+        if expected_key != key:
+            print(f"Polling for result object s3://{bucket}/{expected_key}")
+
+        while True:
+            try:
+                # First, check the expected result key (<InferenceId>.out)
+                s3_client.head_object(Bucket=bucket, Key=expected_key)
+                break
+            except Exception:
+                if time.time() - start_time > timeout_seconds:
+                    print(f"Timed out waiting for result {i + 1}/{len(output_locations)} after {timeout_seconds}s")
+                    results.append(None)
+                    break
+                elapsed = int(time.time() - start_time)
+                print(f"Waiting for result {i + 1}/{len(output_locations)}... {elapsed}s elapsed")
+
+                # Try to detect an async failure artifact: async-endpoint-failures/.../<InferenceId>-error.out
+                if isinstance(inference_id, str) and inference_id:
+                    try:
+                        candidates = s3_client.list_objects_v2(
+                            Bucket=bucket,
+                            Prefix="async-endpoint-failures/",
+                            MaxKeys=1000,
+                        )
+                        for obj in candidates.get("Contents", []):
+                            k = obj.get("Key", "")
+                            if k.endswith(f"{inference_id}-error.out"):
+                                err_obj = s3_client.get_object(Bucket=bucket, Key=k)
+                                err_text = err_obj["Body"].read().decode("utf-8", errors="replace")
+                                print(f"Error for request {i + 1}/{len(output_locations)} (InferenceId={inference_id}):\n{err_text}")
+                                results.append(None)
+                                # Stop waiting for this one
+                                elapsed = int(time.time() - start_time)
+                                print(f"Marking request {i + 1} as failed after {elapsed}s due to async failure artifact: s3://{bucket}/{k}")
+                                # Break out of the polling loop
+                                raise StopIteration
+                    except StopIteration:
+                        break
+                    except Exception:
+                        # Ignore listing errors silently and keep polling
+                        pass
+                time.sleep(poll_interval_seconds)
+
+        # Only fetch if this iteration did not already record a failure
+        if len(results) == i:
+            obj = s3_client.get_object(Bucket=bucket, Key=expected_key)
+            result_body = obj["Body"].read().decode("utf-8")
+            results.append(result_body)
+            total = int(time.time() - start_time)
+            print(f"Result ready for {i + 1}/{len(output_locations)} after {total}s")
+
+    return results
diff --git a/src/wraval/actions/format.py b/src/wraval/actions/format.py
index 25056db..f9bc374 100644
--- a/src/wraval/actions/format.py
+++ b/src/wraval/actions/format.py
@@ -18,7 +18,7 @@ def format_prompt(usr_prompt, prompt=None, tokenizer=None, type="bedrock", think
 
     if type == "hf":
         if prompt:
-            if thinking is None or True:
+            if thinking is None or thinking is True:
                 sys_prompt = [{"role": "system", "content": prompt.sys_prompt}]
             else:
                 sys_prompt = [{"role": "system", "content": prompt.sys_prompt + '/no_think'}]
diff --git a/src/wraval/actions/model_router.py b/src/wraval/actions/model_router.py
index a404be4..b05c1f0 100644
--- a/src/wraval/actions/model_router.py
+++ b/src/wraval/actions/model_router.py
@@ -2,6 +2,7 @@
     batch_get_bedrock_completions,
     invoke_sagemaker_endpoint,
     invoke_ollama_endpoint,
+    batch_invoke_sagemaker_endpoint,
 )
 from .format import format_prompt
 from transformers import AutoTokenizer
@@ -46,18 +47,29 @@ def __init__(self, master_sys_prompt, settings):
         super().__init__(master_sys_prompt, settings)
         self.model_name = settings.model
         self.region = settings.region
+        self.thinking = None
         if settings.exists('thinking'):
             self.thinking = settings.thinking
-        else:
-            self.thinking = None
+        self.async_config = False
+        if settings.exists('asynchronous'):
+            self.async_config = settings.asynchronous
+        self.deploy_bucket_name = settings.deploy_bucket_name
 
     def get_completion(self, queries: List[str]) -> List[str]:
         prompts = [
             format_prompt(text, self.master_sys_prompt, self.tokenizer, "hf", self.thinking)
             for text in queries
         ]
+        if self.async_config:
+            return batch_invoke_sagemaker_endpoint(prompts,
+                                                   self.model_name,
+                                                   self.region,
+                                                   self.deploy_bucket_name)
         return [
-            invoke_sagemaker_endpoint({"inputs": prompt}, self.model_name, self.region) for prompt in tqdm(prompts)
+            invoke_sagemaker_endpoint({"inputs": prompt},
+                                      self.model_name,
+                                      self.region)
+            for prompt in tqdm(prompts)
         ]
 
 

From 6d046ce9cbeb41f96bcb660797c8cd58acbf1508 Mon Sep 17 00:00:00 2001
From: Gabriel Benedict
Date: Mon, 25 Aug 2025 16:01:31 +0200
Subject: [PATCH 17/18] add qwen 0.6B

---
 config/settings.toml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/config/settings.toml b/config/settings.toml
index c2802fd..32b6a6f 100644
--- a/config/settings.toml
+++ b/config/settings.toml
@@ -39,6 +39,12 @@ model = 'Phi-3-5-mini-instruct'
 hf_name = 'microsoft/Phi-3.5-mini-instruct'
 endpoint_type = 'sagemaker'
 
+[qwen-3-0-6B]
+model = 'Qwen3-0-6B'
+hf_name = 'Qwen/Qwen3-0-6B' # instruct is now this, and base is appended with 'base'
+endpoint_type = 'sagemaker'
+thinking = false
+
 [qwen-2-5-1-5B]
 model = 'Qwen2-5-1-5B-Instruct'
 hf_name = 'Qwen/Qwen2.5-1.5B-Instruct'

From d5385a2da2d8fc59ce07d7b4e44ec11f640da76a Mon Sep 17 00:00:00 2001
From: Gabriel Benedict
Date: Mon, 25 Aug 2025 16:03:53 +0200
Subject: [PATCH 18/18] typo in qwen hf name

---
 config/settings.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/settings.toml b/config/settings.toml
index 32b6a6f..594fd24 100644
--- a/config/settings.toml
+++ b/config/settings.toml
@@ -41,7 +41,7 @@ endpoint_type = 'sagemaker'
 
 [qwen-3-0-6B]
 model = 'Qwen3-0-6B'
-hf_name = 'Qwen/Qwen3-0-6B' # instruct is now this, and base is appended with 'base'
+hf_name = 'Qwen/Qwen3-0.6B' # instruct is now this, and base is appended with 'base'
 endpoint_type = 'sagemaker'
 thinking = false