Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified .DS_Store
Binary file not shown.
17 changes: 16 additions & 1 deletion .gitignore
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ data/*
!data/human_labeled_reward_reference/*
!data/dataset_io.py
!data/raw_data_processor.py
!data/Online-Mind2Web/
!data/Online-Mind2Web/Online_Mind2Web.json

!configs/log_config.json

test.py
test_dom_tree.py
Expand All @@ -27,4 +31,15 @@ agent/Environment/html_env/js_event/

node_modules/
package-lock.json
package.json
package.json

results_wodom/
results_wodom33/
scripts/
eval_agent/
dataset/
dataset_4o/
results*
batch_output*
check_result*
dataset_41*
Empty file added OM2W_Benchmarking/__init__.py
Empty file.
38 changes: 38 additions & 0 deletions OM2W_Benchmarking/dataset_check.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import os
import sys
from pathlib import Path

def count_trajectory_files(base_dir="../WebCanvas/dataset"):
    """Report how many trajectory files each task directory contains.

    Every immediate sub-directory of ``base_dir`` is treated as one task and
    is expected to hold a ``trajectory`` folder. One line is printed per task,
    followed by a summary of the tasks whose trajectory folder is missing or
    empty and the total number of tasks visited.

    Args:
        base_dir: Directory containing one sub-directory per task id.
            Defaults to the location this script originally hard-coded.

    Returns:
        The names of the task directories whose ``trajectory`` folder is
        missing or contains no entries, in sorted order.

    Exits the process with status 1 when ``base_dir`` does not exist.
    """
    base_dir = Path(base_dir)
    if not base_dir.exists():
        print(f"Error: Path '{base_dir}' does not exist")
        sys.exit(1)

    empty_trajectories = []
    task_count = 0
    # iterdir() yields entries in arbitrary order; sort so that the report is
    # reproducible between runs and machines.
    for task_id_dir in sorted(base_dir.iterdir()):
        if not task_id_dir.is_dir():
            continue
        task_count += 1
        trajectory_dir = task_id_dir / "trajectory"

        # is_dir() is already False for a missing path, so no separate
        # exists() check is needed.
        if not trajectory_dir.is_dir():
            print(f"Task ID: {task_id_dir.name}, Trajectory folder does not exist")
            empty_trajectories.append(task_id_dir.name)
            continue

        file_count = sum(1 for _ in trajectory_dir.iterdir())
        print(f"Task ID: {task_id_dir.name}, Number of Trajectory files: {file_count}")
        if file_count == 0:
            empty_trajectories.append(task_id_dir.name)

    print("\nThe Task ID of the empty Trajectory folder:")
    if empty_trajectories:
        for task_id in empty_trajectories:
            print(task_id)
    else:
        print("Empty trajectory folder not found")

    print(f"\nA total of {task_count} Task ids are processed")
    return empty_trajectories

# Run the trajectory check only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    count_trajectory_files()
29 changes: 29 additions & 0 deletions OM2W_Benchmarking/eval.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@

# Runs the automatic evaluation once for every mode enabled in `modes`.
# NOTE: the array syntax used below requires bash, not plain POSIX sh.
# NOTE: run.py is addressed relative to the current working directory.

# api_key=API_KEY
# model_name=MODEL_NAME
# Fail fast with a clear message when the key is missing, instead of handing
# an empty --api_key to run.py.
api_key="${OPENAI_API_KEY:?OPENAI_API_KEY must be set before running the evaluation}"
model_name=gpt-4o
# model_name=o4-mini

#Automatic evaluation method
modes=(
    "WebJudge_Online_Mind2Web_eval"
    # "WebJudge_general_eval"
    # "Autonomous_eval"
    # "WebVoyager_eval"
    # "AgentTrek_eval"
)

# base_dir="./data/example"
base_dir="../WebCanvas/dataset"

# Settings shared by every run.
output_path=results
num_worker=1
score_threshold=3

for mode in "${modes[@]}"; do
    python ./OM2W_Benchmarking/src/run.py \
        --mode "$mode" \
        --model "${model_name}" \
        --trajectories_dir "$base_dir" \
        --api_key "${api_key}" \
        --output_path "${output_path}" \
        --num_worker "${num_worker}" \
        --score_threshold "${score_threshold}"
done
42 changes: 42 additions & 0 deletions OM2W_Benchmarking/src/clean_html.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from typing import Any, Iterable, List
from bs4 import BeautifulSoup

# Names of the HTML attributes that are kept when an element string is
# cleaned; every attribute not listed here is dropped. This tuple is what the
# script entry point of this module passes to process_element_tag().
SALIENT_ATTRIBUTES = (
    "alt",
    "aria-describedby",
    "aria-label",
    "aria-role",
    "aria-controls",
    "input-checked",
    "label",
    "name",
    "option_selected",
    "placeholder",
    "readonly",
    "text-value",
    "title",
    "value",
    "data-gtm-label",
    "href",
    "role",
)

def process_element_tag(element: str, salient_attributes: Iterable[str]) -> str:
    """Return the first tag of ``element`` reduced to its salient attributes.

    Args:
        element: Text that starts with (or contains) an HTML tag. Anything
            after the first tag, such as a trailing action description, is
            ignored.
        salient_attributes: Attribute names to keep; every other attribute
            of the tag is dropped. Any iterable is accepted, including a
            one-shot iterator.

    Returns:
        The rebuilt opening tag as a string, with any closing ``</tag>`` part
        cut off. A kept, non-empty ``name`` attribute is emitted after the
        other kept attributes. When no tag can be parsed, ``element`` is
        returned (including the ``'>`` suffix if one was appended).
    """
    if not element.endswith(">"):
        # Input that does not end on ">" gets "'>" appended before parsing,
        # presumably to close a tag that was truncated inside a quoted
        # attribute value.
        element += "'>"

    # Materialise the whitelist once: membership tests on a frozenset are
    # O(1), and a generator argument would otherwise be exhausted by the
    # first ``in`` check.
    allowed = frozenset(salient_attributes)

    soup = BeautifulSoup(element, "html.parser")
    tag = soup.find(True)  # first tag of any name, or None
    if tag is None:
        return element

    kept = {key: value for key, value in tag.attrs.items() if key in allowed}
    # "name" cannot be forwarded as a keyword argument because it would clash
    # with the tag-name argument of new_tag(); it is set on the tag afterwards.
    name_val = kept.pop("name", None)
    new_tag = soup.new_tag(tag.name, **kept)
    if name_val:
        new_tag["name"] = name_val
    return str(new_tag).split(f"</{tag.name}>")[0]

# Manual smoke test: clean one sample element string (an <input> tag followed
# by an action description) and print the result.
if __name__ == "__main__":
    text = '<input type=\"text\" name=\"q\" id=\"mntl-search-form--open__search-input\" class=\"mntl-search-form__input\" placeholder=\"Find a recipe or ingredient\" required=\"required\" value=\"\" autocomplete=\"off\" style=\"\"> -> TYPE beef sirloin'
    print(process_element_tag(text, SALIENT_ATTRIBUTES))
56 changes: 56 additions & 0 deletions OM2W_Benchmarking/src/methods/agenttrek_eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
from utils import encode_image
from PIL import Image

def AgentTrek_eval(task, last_actions, thoughts, images_path):
    """Build the judge prompt for the AgentTrek-style trajectory evaluation.

    Args:
        task: Description of the goal the agent was asked to achieve.
        last_actions: Action strings of the trajectory, in order.
        thoughts: Reasoning strings of the trajectory, in order; paired with
            ``last_actions`` position by position (pairing stops at the
            shorter of the two sequences).
        images_path: Path of the single screenshot attached to the prompt;
            the prompt text refers to it as the last snapshot of the page.

    Returns:
        A tuple ``(messages, text, system_msg)``: the chat-style message list
        (system message plus a user message holding the text and the image as
        a base64 data URL), the formatted user text, and the system message.
    """
    system_msg = """You are an expert in evaluating the performance of a web navigation agent. The agent is designed to help a human user navigate a website to complete a task. Given the user's task goal, the agent's trajectory, your goal is to decide whether the agent's execution is successful or not.

*Evaluation Criteria*
Whether the agent's trajectory is effective and corresponding to the goal

*Instructions* 1. Review the agent's actions and reasoning processes step by step.
2. if the agent is stuck in the very first login stage, which means it fails to log into target website at the beginning, that's a failure.
3. Determine if the agent has achieved the task goal based on the trajectory. A task can be considered successful if most trajectory is effective.
4. the agent sometimes can't stop after finishing a task and continue doing repeated actions. these actions may be some failed attempt after a series of correct actions. the task should be regarded as successful if the correct actions are effective and almost reach the goal.
5. if the agent is stuck in the loop at the early stage of the task, which means they don't even get close to the goal before they get stuck in the loop, that's a failure. for example, the agent begin to get stuck before third step.
6. when the task is to change the google account password, it can't be regarded as successful when agent finish at trying to click "manage your account".
7. if there are over 8 correct action in the trajectory, it can be regard as a successful agent.
8. final saving action is not a must. the task is successful if the agent does most things right and just forget to save the change at last.
9. if the original task has 2 subtasks, the agent only complete one of them, that's still a success. e.g. the task is to update name and birthday, but agent only update name, that's fine.
10. if the task is to post a review, the agent can be considered successful when it finish writing the review and reach the step to post it, don't have to click the post button.
11. Since we don't have a printer, some printing related task can be considered successful if the agent reach the step to click print button.
12. if the task is finished at the initial state and the agent do nothing because of it, it should also be regarded as successful.

*IMPORTANT*
1. in the trajectory, an action always follows a corresponding reasoning, which shows the observation and thought of the agent.
2. your response should be contain:
Thoughts: <your thoughts and reasoning process>
Status: "success" or "failure"
"""
    prompt = """The goal of the task: {task}

Trajectory:
{thoughts_and_actions}

The last snapshot of the web page is shown in the image."""

    # Collect one "Thought i / Action i" entry per step and join them once,
    # rather than growing a string with += inside the loop.
    steps = []
    for idx, (thought, action) in enumerate(zip(thoughts, last_actions), start=1):
        # Collapse blank lines inside a step so that a blank line only ever
        # separates two steps.
        thought = thought.replace("\n\n", " ")
        action = action.replace("\n\n", " ")
        steps.append(f"Thought {idx}: {thought}\nAction {idx}: {action}")
    thoughts_and_actions = "\n\n".join(steps).strip("\n")
    text = prompt.format(task=task, thoughts_and_actions=thoughts_and_actions)

    # Close the image file as soon as it has been encoded; a bare
    # Image.open() would keep the file handle open.
    with Image.open(images_path) as image:
        jpg_base64_str = encode_image(image)

    messages = [
        {"role": "system", "content": system_msg},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": text},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{jpg_base64_str}", "detail": "high"},
                },
            ],
        }
    ]
    return messages, text, system_msg
41 changes: 41 additions & 0 deletions OM2W_Benchmarking/src/methods/automomous_eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from utils import encode_image
from PIL import Image

def Autonomous_eval(task, last_actions, images_path):
    """Build the judge prompt for the Autonomous-eval style evaluation.

    Args:
        task: The user intent the agent was asked to fulfil.
        last_actions: Action strings of the trajectory, in order; they are
            rendered as a numbered list starting at 1.
        images_path: Path of the single screenshot attached to the prompt;
            the prompt text refers to it as the last snapshot of the page.

    Returns:
        A tuple ``(messages, text, system_msg)``: the chat-style message list
        (system message plus a user message holding the text and the image as
        a base64 data URL), the formatted user text, and the system message.
    """
    system_msg = """You are an expert in evaluating the performance of a web navigation agent. The agent is designed to help a human user navigate a website to complete a task. Given the user's intent, the agent's action history, the final state of the webpage, and the agent's response to the user, your goal is to decide whether the agent's execution is successful or not.

There are three types of tasks:
1. Information seeking: The user wants to obtain certain information from the webpage, such as the information of a product, reviews, map info, comparison of map routes, etc. The bot's response must contain the information the user wants, or explicitly state that the information is not available. Otherwise, e.g. the bot encounters an exception and respond with the error content, the task is considered a failure. Besides, be careful about the sufficiency of the agent's actions. For example, when asked to list the top-searched items in a shop, the agent should order the items by the number of searches, and then return the top items. If the ordering action is missing, the task is likely to fail.
2. Site navigation: The user wants to navigate to a specific page. Carefully examine the bot's action history and the final state of the webpage to determine whether the bot successfully completes the task. No need to consider the bot's response.
3. Content modification: The user wants to modify the content of a webpage or configuration. Carefully examine the bot's action history and the final state of the webpage to determine whether the bot successfully completes the task. No need to consider the bot's response.

*IMPORTANT*
Format your response into two lines as shown below:

Thoughts: <your thoughts and reasoning process>
Status: "success" or "failure"
"""
    prompt = """User Intent: {task}

Action History:
{last_actions}

The last snapshot of the web page is shown in the image."""

    numbered_actions = "\n".join(
        f"{position}. {action}" for position, action in enumerate(last_actions, start=1)
    )
    text = prompt.format(task=task, last_actions=numbered_actions)

    # Close the image file as soon as it has been encoded; a bare
    # Image.open() would keep the file handle open.
    with Image.open(images_path) as image:
        jpg_base64_str = encode_image(image)

    messages = [
        {"role": "system", "content": system_msg},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": text},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{jpg_base64_str}", "detail": "high"},
                },
            ],
        }
    ]
    return messages, text, system_msg
Loading