iMeanAI · Syclus123 · May 21, 2025 · May 20, 2025
diff --git a/.DS_Store b/.DS_Store
diff --git a/.gitignore b/.gitignore
@@ -19,6 +19,9 @@ data/*
 !data/human_labeled_reward_reference/*
 !data/dataset_io.py
 !data/raw_data_processor.py
+!data/Online-Mind2Web/
+
+!configs/log_config.json
 
 test.py
 test_dom_tree.py
@@ -27,4 +30,9 @@ agent/Environment/html_env/js_event/
 
 node_modules/
 package-lock.json
-package.json
+package.json
+
+results_wodom/
+results_wodom33/
+scripts/
+eval_agent/
diff --git a/agent/.DS_Store b/agent/.DS_Store
diff --git a/agent/LLM/llm_instance.py b/agent/LLM/llm_instance.py
@@ -5,7 +5,8 @@
 
 
 def create_llm_instance(model, json_mode=False, all_json_models=None):
-    if "gpt" in model or "o1" in model:
+    # if "gpt" in model or "o1" in model:
+    if any(keyword in model for keyword in ["gpt", "o1", "o3-mini", "o4-mini"]):
         if json_mode:
             if model in all_json_models:
                 return GPTGeneratorWithJSON(model)

diff --git a/agent/LLM/openai.py b/agent/LLM/openai.py
@@ -9,13 +9,18 @@
 from agent.Utils import *
 from .token_utils import truncate_messages_based_on_estimated_tokens
 
+# Model-name prefixes whose requests take `max_completion_tokens` instead of `max_tokens`
+NEW_TOKEN_MODELS = ("o3", "o4")
+
+def use_new_token_param(model_name: str) -> bool:
+    return any(model_name.startswith(p) for p in NEW_TOKEN_MODELS)
 
 class GPTGenerator:
     def __init__(self, model=None):
         self.model = model
         self.client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
 
-    async def request(self, messages: list = None, max_tokens: int = 500, temperature: float = 0.7) -> (str, str):
+    async def request(self, messages: list = None, max_tokens: int = 100000, temperature: float = 0.7) -> (str, str):
         try:
             if "gpt-3.5" in self.model:
                 messages = truncate_messages_based_on_estimated_tokens(messages, max_tokens=16385)
@@ -42,17 +47,34 @@ async def request(self, messages: list = None, max_tokens: int = 500, temperatur
             logger.error(f"Error in GPTGenerator.request: {e}")
             return "", str(e)
 
-    async def chat(self, messages, max_tokens=500, temperature=0.7):
+    async def chat(self, messages, max_tokens=100000, temperature=0.7):
         loop = asyncio.get_event_loop()
+
+        # Dynamically select field names
+        token_key = "max_completion_tokens" if use_new_token_param(self.model) \
+                                         else "max_tokens"                        
         if "o1" in self.model:
             data = {
                 'model': self.model,
                 'messages': messages,
             }
+        elif "o3" in self.model or "o4" in self.model:
+            data = {
+                'model': self.model,
+                token_key: max_tokens,
+                'messages': messages,
+            }
+        elif "gpt-4.1" in self.model:
+            data = {
+                'model': self.model,
+                token_key: 32768, # gpt-4.1 max_tokens = 32768
+                'messages': messages,
+            }
         else:
             data = {
                 'model': self.model,
-                'max_tokens': max_tokens,
+                # 'max_tokens': max_tokens,
+                token_key: max_tokens,
             'temperature': temperature,
             'messages': messages,
         }
@@ -79,11 +101,11 @@ def prepare_messages_for_json_mode(messages):
             messages.insert(0, {"role": "system", "content": "You are a helpful assistant designed to output json."})
         return messages
 
-    async def request(self, messages: list = None, max_tokens: int = 500, temperature: float = 0.7) -> (str, str):
+    async def request(self, messages: list = None, max_tokens: int = 100000, temperature: float = 0.7) -> (str, str):
         messages = self.prepare_messages_for_json_mode(messages)  # Prepare messages for JSON mode
         return await super().request(messages, max_tokens, temperature)
 
 
 class GPTGeneratorWithJSON(JSONModeMixin):
     def __init__(self, model=None):
-        super().__init__(model=model if model is not None else "gpt-4-turbo")
+        super().__init__(model=model if model is not None else "gpt-4-turbo")
diff --git a/batch_eval.py b/batch_eval.py
@@ -0,0 +1,124 @@
+"""
+Batch evaluation script: runs eval.py once for each task in a JSON task file.
+This release adds the following features:
+1. Screenshots of the evaluation process
+2. Online-Mind2Web task evaluation
+3. Access to gpt-4.1, o3-mini, o4-mini and other models
+
+Tip: to run on a Linux machine without a display, install Xvfb and launch the script through it:
+    sudo yum install -y xorg-x11-server-Xvfb
+    xvfb-run python batch_eval.py
+"""
+#!/usr/bin/env python3
+import json
+import os
+import subprocess
+import argparse
+import time
+from pathlib import Path
+
+def load_tasks(json_path):
+    with open(json_path, 'r') as f:
+        data = json.load(f)
+    return data
+
+def run_single_task(task, args):
+    command = [
+        "python", "eval.py",
+        "--global_reward_mode", args.global_reward_mode,
+        "--index", str(args.index),
+        "--single_task_name", task,
+        "--snapshot", args.snapshot,
+        "--planning_text_model", args.planning_text_model,
+        "--global_reward_text_model", args.global_reward_text_model
+    ]
+
+    print(f"\n{'='*80}")
+    print(f"Task: {task}")
+    print(f"{'='*80}")
+
+    try:
+        subprocess.run(command, check=True)
+        print(f"Mission accomplished: {task}")
+        return True
+    except subprocess.CalledProcessError as e:
+        print(f"Task failure: {task}")
+        print(f"Error: {e}")
+        return False
+
+def main():
+    parser = argparse.ArgumentParser(description='Online-Mind2Web Task')
+    parser.add_argument('--json_path', type=str, default='data/Online-Mind2Web/Online_Mind2Web.json',
+                        help='JSON task file path')
+    parser.add_argument('--global_reward_mode', type=str, default='no_global_reward',
+                        help='Global Reward Mode: dom_reward/no_global_reward')
+    parser.add_argument('--index', type=int, default=-1,
+                        help='Task index')
+    parser.add_argument('--snapshot', type=str, default='results',
+                        help='Snapshot directory')
+    parser.add_argument('--planning_text_model', type=str, default='gpt-4.1',
+                        help='planning_text_model')
+    parser.add_argument('--global_reward_text_model', type=str, default='gpt-4.1',
+                        help='global_reward_text_model')
+    parser.add_argument('--start_idx', type=int, default=0,
+                        help='The index to start the task')
+    parser.add_argument('--end_idx', type=int, default=None,
+                        help='The index of the finished task (excluding)')
+    parser.add_argument('--delay', type=int, default=5,
+                        help='Latency between tasks (seconds)')
+    parser.add_argument('--output_log', type=str, default='batch_run_log.txt',
+                        help='output_log')
+
+    args = parser.parse_args()
+
+    # Loading tasks
+    json_path = Path(args.json_path)
+    if not json_path.exists():
+        print(f"Error: File does not exist - {json_path}")
+        return
+
+    tasks = load_tasks(json_path)
+    start_idx = args.start_idx
+    end_idx = args.end_idx if args.end_idx is not None else len(tasks)
+
+    total_tasks = end_idx - start_idx
+    successful_tasks = 0
+
+    with open(args.output_log, 'w') as log_file:
+        log_file.write(f"The batch job run starts: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
+        log_file.write(f"total_tasks: {total_tasks}\n\n")
+
+    # Run the selected task
+    for i, task_data in enumerate(tasks[start_idx:end_idx]):
+        current_idx = start_idx + i
+        task = task_data["confirmed_task"]
+
+        with open(args.output_log, 'a') as log_file:
+            log_file.write(f"[{current_idx}/{len(tasks)}] Running tasks: {task}\n")
+
+        success = run_single_task(task, args)
+        if success:
+            successful_tasks += 1
+
+        # Logging results
+        with open(args.output_log, 'a') as log_file:
+            log_file.write(f"results: {'Success' if success else 'failure'}\n\n")
+        if i < total_tasks - 1:
+            print(f"waiting {args.delay} continue to the next task after seconds...")
+            time.sleep(args.delay)
+
+    with open(args.output_log, 'a') as log_file:
+        log_file.write(f"\nFinish: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
+        log_file.write(f"Total_tasks: {total_tasks}\n")
+        log_file.write(f"Number of successful tasks: {successful_tasks}\n")
+        log_file.write(f"Success rate: {successful_tasks/total_tasks*100:.2f}%\n")
+
+    print(f"\n{'='*80}")
+    print(f"Total_tasks: {total_tasks}")
+    print(f"Number of successful tasks: {successful_tasks}")
+    print(f"Success rate: {successful_tasks/total_tasks*100:.2f}%")
+    print(f"save: {args.output_log}")
+
+if __name__ == "__main__":
+    main()
+
diff --git a/configs/log_config.json b/configs/log_config.json
@@ -0,0 +1,5 @@
+{
+    "log_directory": "/home/ec2-user/WebCanvas/results_wodom33/logs",
+    "output_directory": "/home/ec2-user/WebCanvas/results_wodom33/json",
+    "task_mapping_file": "/home/ec2-user/WebCanvas/data/Online-Mind2Web/Online_Mind2Web.json"
+}
diff --git a/configs/setting.toml b/configs/setting.toml
@@ -1,6 +1,8 @@
 [basic]
-task_mode = "batch_tasks"    # single_task or batch_tasks
-max_time_step = 25           # For all tasks, set the maximum step length
+task_mode = "single_task"    # single_task or batch_tasks
+max_time_step = 15           # For all tasks, set the maximum step length
+save_screenshots = true  # Whether to save screenshots during task execution
+screenshot_path = "./screenshots"
 
 [model]
 json_model_response = false      # Whether to require a model to strictly output json format, currently only support OPENAI models.
@@ -11,17 +13,21 @@ json_models = ["gpt-4-turbo",
                    "gpt-3.5-turbo",
                    "gpt-3.5-turbo-0125",
                    "gpt-4o-2024-05-13",
-                   "gpt-4o-mini-2024-07-18"]
+                   "gpt-4o-mini-2024-07-18",
+                   "o4-mini",
+                   "gpt-4.1-2025-04-14",
+                   "o3-mini-2025-01-31"
+                   ]
 
 
 [steps]
-interaction_mode = true             #  Whether human control of task execution status is required
-single_task_action_step = 10     
+interaction_mode = false             #  Whether human control of task execution status is required
+single_task_action_step = 15     
 batch_tasks_max_action_step = 10
 batch_tasks_condition_step_increase = 5
 
 [files]
-batch_tasks_file_path = "./data/example/mind2web-live_test_20241024.json" # The input data path
+batch_tasks_file_path = "./data/Online-Mind2Web/Online_Mind2Web.json" # The input data path
 ground_truth_file_path = "./data/human_labeled_reward_reference/GT_instructions_202404161811_for_all_data_0328.json"  # the ground_truth data path
 out_file_path = "./batch_tasks_results/example"   # YOUR OUT FILE PATH 
 
@@ -42,7 +48,9 @@ pricing_models = [
     "gpt-4-1106-preview",
     "gpt-4-vision-preview",
     "gpt-3.5-turbo-0125",
-    "gpt-3.5-turbo-1106"]
+    "gpt-3.5-turbo-1106",
+    "o4-mini"
+    ]
 
 # The price of each model for input and output, the unit is $/token
 # The name of input token price: model_name + "_input_price", such as gpt-4o_input_price
@@ -73,3 +81,4 @@ gpt-3.5-turbo-0125_input_price      = 0.0000005
 gpt-3.5-turbo-0125_output_price     = 0.0000015
 gpt-3.5-turbo-1106_input_price      = 0.000001
 gpt-3.5-turbo-1106_output_price     = 0.000002
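+o4-mini                             = 0.000002   # NOTE(review): key lacks the "_input_price"/"_output_price" suffix described above, so it may not be read as a price — confirm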
diff --git a/data/Online-Mind2Web/README.md b/data/Online-Mind2Web/README.md
@@ -0,0 +1,65 @@
+---
+license: cc-by-4.0
+language:
+- en
+size_categories:
+- n<1K
+configs:
+- config_name: default
+  data_files:
+  - split: test
+    path: "Online_Mind2Web.json"
+---
+<div align="center">
+    <a href="https://tiancixue.notion.site/An-Illusion-of-Progress-Assessing-the-Current-State-of-Web-Agents-1ac6cd2b9aac80719cd6f68374aaf4b4?pvs=4">Blog</a> |
+    <a href="https://arxiv.org/abs/2504.01382">Paper</a> |
+    <a href="https://github.com/OSU-NLP-Group/Online-Mind2Web">Code</a> |
+    <a href="https://huggingface.co/spaces/osunlp/Online_Mind2Web_Leaderboard">Leaderboard</a>
+</div>
+
+
+## Online-Mind2Web
+Online-Mind2Web is the online version of [Mind2Web](https://osu-nlp-group.github.io/Mind2Web/), a more diverse and user-centric dataset that includes 300 high-quality tasks from 136 popular websites across various domains. The dataset covers a diverse set of user tasks, such as clothing, food, housing, and transportation, to evaluate web agents' performance in a real-world online environment.
+
+### Data Fields
+- "task_id" (str): Unique id for each task.
+- "website" (str): Website url.
+- "task_description" (str): Task description. (`batch_eval.py` reads the task text from the "confirmed_task" key of each entry.)
+- "reference_length" (int): Number of steps required for a human annotator to complete the task.
+
+### Update Tasks
+We will regularly update Online-Mind2Web by replacing outdated or invalid tasks (e.g., due to website changes) to maintain its value as a rigorous benchmark for web agents. If you find any tasks are outdated, please reach out to us, and we will update them. 
+
+To ensure fair comparisons, we will aim to keep the updated tasks on the same websites as before and with a similar reference length. Additionally, once agent performance saturates on Online-Mind2Web, we will also revise simple tasks to preserve its long-term value.
+
+### Update History
+**2025/04/05:** Updated task IDs: ["c03ee2be3d73556ab789c0ad1cbd3451", "c181f903ec1107b850032c17cad88393", "2c8ef01a92c71ba9ef2e59bb17eea2b3", "d8e2a81fa621ce4737e5ea85671b630e", "63d6866fc000fcb1f153e07604bd1395", "199be0b54a436daee74247971fc684ee"]
+
+### Disclaimer
+This dataset was collected and released solely for research purposes, with the goal of making the web more accessible via language technologies. The authors are strongly against any potential harmful use of the data or technology to any party.
+
+### Citation Information
+Note: Online-Mind2Web is derived from the original Mind2Web dataset. We kindly ask that you cite both the original and this work when using or referencing the data.
+```
+@article{xue2025illusionprogressassessingcurrent,
+      title={An Illusion of Progress? Assessing the Current State of Web Agents}, 
+      author={Tianci Xue and Weijian Qi and Tianneng Shi and Chan Hee Song and Boyu Gou and Dawn Song and Huan Sun and Yu Su},
+      year={2025},
+      eprint={2504.01382},
+      archivePrefix={arXiv},
+      primaryClass={cs.AI},
+      url={https://arxiv.org/abs/2504.01382}, 
+}
+
+@inproceedings{deng2023mind2web,
+ author = {Deng, Xiang and Gu, Yu and Zheng, Boyuan and Chen, Shijie and Stevens, Sam and Wang, Boshi and Sun, Huan and Su, Yu},
+ booktitle = {Advances in Neural Information Processing Systems},
+ editor = {A. Oh and T. Naumann and A. Globerson and K. Saenko and M. Hardt and S. Levine},
+ pages = {28091--28114},
+ publisher = {Curran Associates, Inc.},
+ title = {Mind2Web: Towards a Generalist Agent for the Web},
+ url = {https://proceedings.neurips.cc/paper_files/paper/2023/file/5950bf290a1570ea401bf98882128160-Paper-Datasets_and_Benchmarks.pdf},
+ volume = {36},
+ year = {2023}
+}
+```