Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified .DS_Store
Binary file not shown.
10 changes: 9 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ data/*
!data/human_labeled_reward_reference/*
!data/dataset_io.py
!data/raw_data_processor.py
!data/Online-Mind2Web/

!configs/log_config.json

test.py
test_dom_tree.py
Expand All @@ -27,4 +30,9 @@ agent/Environment/html_env/js_event/

node_modules/
package-lock.json
package.json
package.json

results_wodom/
results_wodom33/
scripts/
eval_agent/
Binary file added agent/.DS_Store
Binary file not shown.
3 changes: 2 additions & 1 deletion agent/LLM/llm_instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@


def create_llm_instance(model, json_mode=False, all_json_models=None):
if "gpt" in model or "o1" in model:
# if "gpt" in model or "o1" in model:
if any(keyword in model for keyword in ["gpt", "o1", "o3-mini", "o4-mini"]):
if json_mode:
if model in all_json_models:
return GPTGeneratorWithJSON(model)
Expand Down
32 changes: 27 additions & 5 deletions agent/LLM/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,18 @@
from agent.Utils import *
from .token_utils import truncate_messages_based_on_estimated_tokens

# Model-name prefixes whose requests carry the newer
# `max_completion_tokens` field instead of `max_tokens`.
NEW_TOKEN_MODELS = ("o3", "o4")


def use_new_token_param(model_name: str) -> bool:
    """Return True when *model_name* begins with a prefix in NEW_TOKEN_MODELS."""
    # str.startswith accepts a tuple of candidate prefixes directly.
    return model_name.startswith(NEW_TOKEN_MODELS)

class GPTGenerator:
def __init__(self, model=None):
self.model = model
self.client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

async def request(self, messages: list = None, max_tokens: int = 500, temperature: float = 0.7) -> (str, str):
async def request(self, messages: list = None, max_tokens: int = 100000, temperature: float = 0.7) -> (str, str):
try:
if "gpt-3.5" in self.model:
messages = truncate_messages_based_on_estimated_tokens(messages, max_tokens=16385)
Expand All @@ -42,17 +47,34 @@ async def request(self, messages: list = None, max_tokens: int = 500, temperatur
logger.error(f"Error in GPTGenerator.request: {e}")
return "", str(e)

async def chat(self, messages, max_tokens=500, temperature=0.7):
async def chat(self, messages, max_tokens=100000, temperature=0.7):
loop = asyncio.get_event_loop()

# Dynamically select field names
token_key = "max_completion_tokens" if use_new_token_param(self.model) \
else "max_tokens"
if "o1" in self.model:
data = {
'model': self.model,
'messages': messages,
}
elif "o3" in self.model or "o4" in self.model:
data = {
'model': self.model,
token_key: max_tokens,
'messages': messages,
}
elif "gpt-4.1" in self.model:
data = {
'model': self.model,
token_key: 32768, # gpt-4.1 max_tokens = 32768
'messages': messages,
}
else:
data = {
'model': self.model,
'max_tokens': max_tokens,
# 'max_tokens': max_tokens,
token_key: max_tokens,
'temperature': temperature,
'messages': messages,
}
Expand All @@ -79,11 +101,11 @@ def prepare_messages_for_json_mode(messages):
messages.insert(0, {"role": "system", "content": "You are a helpful assistant designed to output json."})
return messages

async def request(self, messages: list = None, max_tokens: int = 500, temperature: float = 0.7) -> (str, str):
async def request(self, messages: list = None, max_tokens: int = 100000, temperature: float = 0.7) -> (str, str):
messages = self.prepare_messages_for_json_mode(messages) # Prepare messages for JSON mode
return await super().request(messages, max_tokens, temperature)


class GPTGeneratorWithJSON(JSONModeMixin):
def __init__(self, model=None):
super().__init__(model=model if model is not None else "gpt-4-turbo")
super().__init__(model=model if model is not None else "gpt-4-turbo")
124 changes: 124 additions & 0 deletions batch_eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
"""
This is a batch test script.
This release adds the following features:
1. Support screenshots of the evaluation process
2. Support Online_Mind2Web task evaluation
3. Support access to gpt-4.1, o3-mini, o4-mini and other models

Tips: To run on a headless Linux machine (no graphical display), install Xvfb and launch the script through xvfb-run:
sudo yum install -y xorg-x11-server-Xvfb
xvfb-run python batch_eval.py
"""
#!/usr/bin/env python3
import json
import os
import subprocess
import argparse
import time
from pathlib import Path

def load_tasks(json_path):
    """Load the task list from a JSON file.

    Args:
        json_path: Path (str or path-like) of the JSON task file.

    Returns:
        The deserialized JSON content, exactly as produced by ``json.load``.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # default, so non-ASCII task text loads the same way on every machine.
    with open(json_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def run_single_task(task, args):
    """Run ``eval.py`` for a single task in a child process.

    Args:
        task: Task text forwarded to ``eval.py`` as ``--single_task_name``.
        args: Parsed CLI namespace; ``global_reward_mode``, ``index``,
            ``snapshot``, ``planning_text_model`` and
            ``global_reward_text_model`` are read from it.

    Returns:
        True if the child process exits with status 0; False if it exits
        with a non-zero status or could not be started at all.
    """
    import sys

    command = [
        # Reuse the interpreter running this script so the child sees the same
        # environment; a bare "python" may be missing or point elsewhere.
        sys.executable or "python", "eval.py",
        "--global_reward_mode", args.global_reward_mode,
        "--index", str(args.index),
        "--single_task_name", task,
        "--snapshot", args.snapshot,
        "--planning_text_model", args.planning_text_model,
        "--global_reward_text_model", args.global_reward_text_model
    ]

    print(f"\n{'='*80}")
    print(f"Task: {task}")
    print(f"{'='*80}")

    try:
        subprocess.run(command, check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a launch failure (e.g. interpreter not found); report
        # it as a failed task instead of aborting the whole batch.
        print(f"Task failure: {task}")
        print(f"Error: {e}")
        return False

    print(f"Mission accomplished: {task}")
    return True

def main():
    """Run a window of tasks from the JSON task file one after another.

    Parses the command-line options, loads the task file, runs every task in
    the ``[--start_idx, --end_idx)`` window through ``run_single_task`` and
    records progress plus a final summary in the ``--output_log`` file.
    Returns None; when the task file does not exist an error is printed and
    nothing is run.
    """
    parser = argparse.ArgumentParser(description='Online-Mind2Web Task')
    parser.add_argument('--json_path', type=str, default='data/Online-Mind2Web/Online_Mind2Web.json',
                        help='JSON task file path')
    parser.add_argument('--global_reward_mode', type=str, default='no_global_reward',
                        help='Global Reward Mode: dom_reward/no_global_reward')
    parser.add_argument('--index', type=int, default=-1,
                        help='Task index')
    parser.add_argument('--snapshot', type=str, default='results',
                        help='Snapshot directory')
    parser.add_argument('--planning_text_model', type=str, default='gpt-4.1',
                        help='planning_text_model')
    parser.add_argument('--global_reward_text_model', type=str, default='gpt-4.1',
                        help='global_reward_text_model')
    parser.add_argument('--start_idx', type=int, default=0,
                        help='The index to start the task')
    parser.add_argument('--end_idx', type=int, default=None,
                        help='The index of the finished task (excluding)')
    parser.add_argument('--delay', type=int, default=5,
                        help='Latency between tasks (seconds)')
    parser.add_argument('--output_log', type=str, default='batch_run_log.txt',
                        help='output_log')

    args = parser.parse_args()

    # Loading tasks
    json_path = Path(args.json_path)
    if not json_path.exists():
        print(f"Error: File does not exist - {json_path}")
        return

    tasks = load_tasks(json_path)

    # Resolve the requested window against the real task count. Slicing a
    # range yields the concrete indices, so an out-of-range --end_idx or a
    # negative --start_idx cannot inflate the total or mislabel log entries.
    selected_indices = range(len(tasks))[args.start_idx:args.end_idx]
    total_tasks = len(selected_indices)
    successful_tasks = 0

    with open(args.output_log, 'w', encoding='utf-8') as log_file:
        log_file.write(f"The batch job run starts: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
        log_file.write(f"total_tasks: {total_tasks}\n\n")

    # Run the selected tasks. The log is reopened in append mode for every
    # entry so that each record is written out before the next task starts.
    for position, current_idx in enumerate(selected_indices):
        task = tasks[current_idx]["confirmed_task"]

        with open(args.output_log, 'a', encoding='utf-8') as log_file:
            log_file.write(f"[{current_idx}/{len(tasks)}] Running tasks: {task}\n")

        success = run_single_task(task, args)
        if success:
            successful_tasks += 1

        # Logging results
        with open(args.output_log, 'a', encoding='utf-8') as log_file:
            log_file.write(f"results: {'Success' if success else 'failure'}\n\n")

        # No pause is needed after the last task.
        if position < total_tasks - 1:
            print(f"Waiting {args.delay} seconds before the next task...")
            time.sleep(args.delay)

    # An empty window must not crash the summary with a division by zero.
    success_rate = successful_tasks / total_tasks * 100 if total_tasks else 0.0

    with open(args.output_log, 'a', encoding='utf-8') as log_file:
        log_file.write(f"\nFinish: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
        log_file.write(f"Total_tasks: {total_tasks}\n")
        log_file.write(f"Number of successful tasks: {successful_tasks}\n")
        log_file.write(f"Success rate: {success_rate:.2f}%\n")

    print(f"\n{'='*80}")
    print(f"Total_tasks: {total_tasks}")
    print(f"Number of successful tasks: {successful_tasks}")
    print(f"Success rate: {success_rate:.2f}%")
    print(f"save: {args.output_log}")

if __name__ == "__main__":
main()

5 changes: 5 additions & 0 deletions configs/log_config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"log_directory": "/home/ec2-user/WebCanvas/results_wodom33/logs",
"output_directory": "/home/ec2-user/WebCanvas/results_wodom33/json",
"task_mapping_file": "/home/ec2-user/WebCanvas/data/Online-Mind2Web/Online_Mind2Web.json"
}
23 changes: 16 additions & 7 deletions configs/setting.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
[basic]
task_mode = "batch_tasks" # single_task or batch_tasks
max_time_step = 25 # For all tasks, set the maximum step length
task_mode = "single_task" # single_task or batch_tasks
max_time_step = 15 # For all tasks, set the maximum step length
save_screenshots = true # screenshots
screenshot_path = "./screenshots"

[model]
json_model_response = false # Whether to require a model to strictly output json format, currently only support OPENAI models.
Expand All @@ -11,17 +13,21 @@ json_models = ["gpt-4-turbo",
"gpt-3.5-turbo",
"gpt-3.5-turbo-0125",
"gpt-4o-2024-05-13",
"gpt-4o-mini-2024-07-18"]
"gpt-4o-mini-2024-07-18",
"o4-mini",
"gpt-4.1-2025-04-14",
"o3-mini-2025-01-31"
]


[steps]
interaction_mode = true # Whether human control of task execution status is required
single_task_action_step = 10
interaction_mode = false # Whether human control of task execution status is required
single_task_action_step = 15
batch_tasks_max_action_step = 10
batch_tasks_condition_step_increase = 5

[files]
batch_tasks_file_path = "./data/example/mind2web-live_test_20241024.json" # The input data path
batch_tasks_file_path = "./data/Online-Mind2Web/Online_Mind2Web.json" # The input data path
ground_truth_file_path = "./data/human_labeled_reward_reference/GT_instructions_202404161811_for_all_data_0328.json" # the ground_truth data path
out_file_path = "./batch_tasks_results/example" # YOUR OUT FILE PATH

Expand All @@ -42,7 +48,9 @@ pricing_models = [
"gpt-4-1106-preview",
"gpt-4-vision-preview",
"gpt-3.5-turbo-0125",
"gpt-3.5-turbo-1106"]
"gpt-3.5-turbo-1106",
"o4-mini"
]

# The price of each model for input and output, the unit is $/token
# The name of input token price: model_name + "_input_price", such as gpt-4o_input_price
Expand Down Expand Up @@ -73,3 +81,4 @@ gpt-3.5-turbo-0125_input_price = 0.0000005
gpt-3.5-turbo-0125_output_price = 0.0000015
gpt-3.5-turbo-1106_input_price = 0.000001
gpt-3.5-turbo-1106_output_price = 0.000002
o4-mini = 0.000002
65 changes: 65 additions & 0 deletions data/Online-Mind2Web/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
---
license: cc-by-4.0
language:
- en
size_categories:
- n<1K
configs:
- config_name: default
data_files:
- split: test
path: "Online_Mind2Web.json"
---
<div align="center">
<a href="https://tiancixue.notion.site/An-Illusion-of-Progress-Assessing-the-Current-State-of-Web-Agents-1ac6cd2b9aac80719cd6f68374aaf4b4?pvs=4">Blog</a> |
<a href="https://arxiv.org/abs/2504.01382">Paper</a> |
<a href="https://github.com/OSU-NLP-Group/Online-Mind2Web">Code</a> |
<a href="https://huggingface.co/spaces/osunlp/Online_Mind2Web_Leaderboard">Leaderboard</a>
</div>


## Online-Mind2Web
Online-Mind2Web is the online version of [Mind2Web](https://osu-nlp-group.github.io/Mind2Web/): a more diverse and user-centric dataset that includes 300 high-quality tasks from 136 popular websites across various domains. The dataset covers a diverse set of user tasks, such as clothing, food, housing, and transportation, to evaluate web agents' performance in a real-world online environment.

### Data Fields
- "task_id" (str): Unique id for each task.
- "website" (str): Website url.
- "task_description" (str): Task description.
- "reference_length" (int): Number of steps required for a human annotator to complete the task.

### Update Tasks
We will regularly update Online-Mind2Web by replacing outdated or invalid tasks (e.g., due to website changes) to maintain its value as a rigorous benchmark for web agents. If you find any tasks are outdated, please reach out to us, and we will update them.

To ensure fair comparisons, we will aim to keep the updated tasks on the same websites as before and with a similar reference length. Additionally, once agent performance saturates on Online-Mind2Web, we will also revise simple tasks to preserve its long-term value.

### Update History
**2025/04/05:** Updated task IDs: ["c03ee2be3d73556ab789c0ad1cbd3451", "c181f903ec1107b850032c17cad88393", "2c8ef01a92c71ba9ef2e59bb17eea2b3", "d8e2a81fa621ce4737e5ea85671b630e", "63d6866fc000fcb1f153e07604bd1395", "199be0b54a436daee74247971fc684ee"]

### Disclaimer
This dataset was collected and released solely for research purposes, with the goal of making the web more accessible via language technologies. The authors are strongly against any potential harmful use of the data or technology to any party.

### Citation Information
Note: Online-Mind2Web is derived from the original Mind2Web dataset. We kindly ask that you cite both the original and this work when using or referencing the data.
```
@article{xue2025illusionprogressassessingcurrent,
title={An Illusion of Progress? Assessing the Current State of Web Agents},
author={Tianci Xue and Weijian Qi and Tianneng Shi and Chan Hee Song and Boyu Gou and Dawn Song and Huan Sun and Yu Su},
year={2025},
eprint={2504.01382},
archivePrefix={arXiv},
primaryClass={cs.AI},
url={https://arxiv.org/abs/2504.01382},
}

@inproceedings{deng2023mind2web,
author = {Deng, Xiang and Gu, Yu and Zheng, Boyuan and Chen, Shijie and Stevens, Sam and Wang, Boshi and Sun, Huan and Su, Yu},
booktitle = {Advances in Neural Information Processing Systems},
editor = {A. Oh and T. Naumann and A. Globerson and K. Saenko and M. Hardt and S. Levine},
pages = {28091--28114},
publisher = {Curran Associates, Inc.},
title = {Mind2Web: Towards a Generalist Agent for the Web},
url = {https://proceedings.neurips.cc/paper_files/paper/2023/file/5950bf290a1570ea401bf98882128160-Paper-Datasets_and_Benchmarks.pdf},
volume = {36},
year = {2023}
}
```
Loading