iMeanAI · Syclus123 · May 20, 2025 · May 21, 2025 · May 29, 2025 · May 30, 2025
diff --git a/.DS_Store b/.DS_Store
diff --git a/.gitignore b/.gitignore
@@ -19,6 +19,10 @@ data/*
 !data/human_labeled_reward_reference/*
 !data/dataset_io.py
 !data/raw_data_processor.py
+!data/Online-Mind2Web/
+!data/Online-Mind2Web/Online_Mind2Web.json
+
+!configs/log_config.json
 
 test.py
 test_dom_tree.py
@@ -27,4 +31,15 @@ agent/Environment/html_env/js_event/
 
 node_modules/
 package-lock.json
-package.json
+package.json
+
+results_wodom/
+results_wodom33/
+scripts/
+eval_agent/
+dataset/
+dataset_4o/
+results*
+batch_output*
+check_result*
+dataset_41*
diff --git a/OM2W_Benchmarking/__init__.py b/OM2W_Benchmarking/__init__.py
diff --git a/OM2W_Benchmarking/dataset_check.py b/OM2W_Benchmarking/dataset_check.py
@@ -0,0 +1,38 @@
+import os
+import sys
+from pathlib import Path
+
+def count_trajectory_files():
+    base_dir = Path("../WebCanvas/dataset")
+    if not base_dir.exists():
+        print(f"Error: Path '{base_dir}' does not exist")
+        sys.exit(1)
+
+    empty_trajectories = []
+    task_count = 0
+    for task_id_dir in base_dir.iterdir():
+        if task_id_dir.is_dir():
+            task_count += 1
+            trajectory_dir = task_id_dir / "trajectory"
+
+            if trajectory_dir.exists() and trajectory_dir.is_dir():
+                files = list(trajectory_dir.iterdir())
+                file_count = len(files)
+                print(f"Task ID: {task_id_dir.name}, Number of Trajectory files: {file_count}")
+                if file_count == 0:
+                    empty_trajectories.append(task_id_dir.name)
+            else:
+                print(f"Task ID: {task_id_dir.name}, Trajectory folder does not exist")
+                empty_trajectories.append(task_id_dir.name)
+
+    print("\nThe Task ID of the empty Trajectory folder:")
+    if empty_trajectories:
+        for task_id in empty_trajectories:
+            print(task_id)
+    else:
+        print("Empty trajectory folder not found")
+
+    print(f"\nA total of {task_count} Task ids are processed")
+
+# Run the report only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    count_trajectory_files()
diff --git a/OM2W_Benchmarking/eval.sh b/OM2W_Benchmarking/eval.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+# Runs OM2W_Benchmarking/src/run.py once for every enabled evaluation mode.
+# Bash is required because the mode list below is a bash array; the script
+# path is relative, so invoke this from the directory that contains
+# OM2W_Benchmarking/.
+
+# api_key=API_KEY
+# model_name=MODEL_NAME
+# The key is read from the environment so it never has to be written here.
+api_key=${OPENAI_API_KEY}
+model_name=gpt-4o
+# model_name=o4-mini
+
+#Automatic evaluation method
+# Uncomment an entry to run that mode as well; each enabled entry triggers
+# one run.py invocation.
+modes=(
+    "WebJudge_Online_Mind2Web_eval"
+    # "WebJudge_general_eval"
+    # "Autonomous_eval"
+    # "WebVoyager_eval"
+    # "AgentTrek_eval"
+)
+
+# Directory handed to run.py as --trajectories_dir.
+# base_dir="./data/example"
+base_dir="../WebCanvas/dataset"
+
+for mode in "${modes[@]}"; do
+    python ./OM2W_Benchmarking/src/run.py \
+        --mode "$mode" \
+        --model "${model_name}" \
+        --trajectories_dir "$base_dir" \
+        --api_key "${api_key}" \
+        --output_path results \
+        --num_worker 1 \
+        --score_threshold 3
+done
diff --git a/OM2W_Benchmarking/src/clean_html.py b/OM2W_Benchmarking/src/clean_html.py
@@ -0,0 +1,42 @@
+from typing import Any, Iterable, List
+from bs4 import BeautifulSoup
+
+# HTML attribute names treated as salient. The demo at the bottom of this
+# file passes this tuple as the `salient_attributes` argument of
+# process_element_tag, which drops every attribute not listed here.
+SALIENT_ATTRIBUTES = (
+    "alt",
+    "aria-describedby",
+    "aria-label",
+    "aria-role",
+    "aria-controls",
+    "input-checked",
+    "label",
+    "name",
+    "option_selected",
+    "placeholder",
+    "readonly",
+    "text-value",
+    "title",
+    "value",
+    "data-gtm-label",
+    "href",
+    "role",
+)
+
+def process_element_tag(element: str, salient_attributes: Iterable[str]) -> str:
+    """Clean an HTML element string, keeping only salient_attributes."""
+    if not element.endswith(">"):
+        element += "'>"
+
+    soup = BeautifulSoup(element, "html.parser")
+    for tag in soup.find_all(True):
+        # Keep only salient attributes
+        filtered_attrs = {k: tag.attrs[k] for k in tag.attrs if k in salient_attributes}
+        name_val = filtered_attrs.pop("name", None)
+        new_tag = soup.new_tag(tag.name, **filtered_attrs)
+        if name_val:
+            new_tag["name"] = name_val
+        return str(new_tag).split(f"</{tag.name}>")[0]
+    return element
+
+# Manual demo: clean one sample element string (an <input> tag followed by
+# trailing action text) and print the result.
+if __name__ == "__main__":
+    text = '<input type=\"text\" name=\"q\" id=\"mntl-search-form--open__search-input\" class=\"mntl-search-form__input\" placeholder=\"Find a recipe or ingredient\" required=\"required\" value=\"\" autocomplete=\"off\" style=\"\"> -> TYPE beef sirloin'
+    print(process_element_tag(text, SALIENT_ATTRIBUTES))
diff --git a/OM2W_Benchmarking/src/methods/agenttrek_eval.py b/OM2W_Benchmarking/src/methods/agenttrek_eval.py
@@ -0,0 +1,56 @@
+from utils import encode_image
+from PIL import Image
+
+def AgentTrek_eval(task, last_actions, thoughts, images_path):
+    system_msg = """You are an expert in evaluating the performance of a web navigation agent. The agent is designed to help a human user navigate a website to complete a task. Given the user's task goal, the agent's trajectory, your goal is to decide whether the agent's execution is successful or not.
+
+*Evaluation Criteria* 
+Whether the agent's trajectory is effective and corresponding to the goal 
+
+*Instructions* 1. Review the agent's actions and reasoning processes step by step. 
+2. if the agent is stuck in the very first login stage, which means it fails to log into target website at the beginning, that's a failure. 
+3. Determine if the agent has achieved the task goal based on the trajectory. A task can be considered successful if most trajectory is effective. 
+4. the agent sometimes can't stop after finishing a task and continue doing repeated actions. these actions may be some failed attempt after a series of correct actions. the task should be regarded as successful if the correct actions are effective and almost reach the goal. 
+5. if the agent is stuck in the loop at the early stage of the task, which means they don't even get close to the goal before they get stuck in the loop, that's a failure. for example, the agent begin to get stuck before third step. 
+6. when the task is to change the google account password, it can't be regarded as successful when agent finish at trying to click "manage your account". 
+7. if there are over 8 correct action in the trajectory, it can be regard as a successful agent. 
+8. final saving action is not a must. the task is successful if the agent does most things right and just forget to save the change at last. 
+9. if the original task has 2 subtasks, the agent only complete one of them, that's still a success. e.g. the task is to update name and birthday, but agent only update name, that's fine. 
+10. if the task is to post a review, the agent can be considered successful when it finish writing the review and reach the step to post it, don't have to click the post button. 
+11. Since we don't have a printer, some printing related task can be considered successful if the agent reach the step to click print button. 
+12. if the task is finished at the initial state and the agent do nothing because of it, it should also be regarded as successful.
+
+*IMPORTANT* 
+1. in the trajectory, an action always follows a corresponding reasoning, which shows the observation and thought of the agent. 
+2. your response should be contain: 
+Thoughts: <your thoughts and reasoning process> 
+Status: "success" or "failure"
+"""
+    prompt = """The goal of the task: {task}
+
+Trajectory:
+{thoughts_and_actions}
+
+The last snapshot of the web page is shown in the image."""
+    thoughts_and_actions = ""
+    for idx, (thought, action) in enumerate(zip(thoughts, last_actions)):
+        thought = thought.replace("\n\n", " ")
+        action = action.replace("\n\n", " ")
+        thoughts_and_actions += f"Thought {idx+1}: {thought}\nAction {idx+1}: {action}\n\n"
+    text = prompt.format(task=task, thoughts_and_actions=thoughts_and_actions.strip("\n\n"))
+
+    jpg_base64_str = encode_image(Image.open(images_path))
+    messages = [
+        {"role": "system", "content": system_msg},
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": text},
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/jpeg;base64,{jpg_base64_str}", "detail": "high"},
+                },
+            ],
+        }
+    ]
+    return messages, text, system_msg
diff --git a/OM2W_Benchmarking/src/methods/automomous_eval.py b/OM2W_Benchmarking/src/methods/automomous_eval.py
@@ -0,0 +1,41 @@
+from utils import encode_image
+from PIL import Image
+
+def Autonomous_eval(task, last_actions, images_path):
+    system_msg = """You are an expert in evaluating the performance of a web navigation agent. The agent is designed to help a human user navigate a website to complete a task. Given the user's intent, the agent's action history, the final state of the webpage, and the agent's response to the user, your goal is to decide whether the agent's execution is successful or not.
+
+There are three types of tasks:
+1. Information seeking: The user wants to obtain certain information from the webpage, such as the information of a product, reviews, map info, comparison of map routes, etc. The bot's response must contain the information the user wants, or explicitly state that the information is not available. Otherwise, e.g. the bot encounters an exception and respond with the error content, the task is considered a failure. Besides, be careful about the sufficiency of the agent's actions. For example, when asked to list the top-searched items in a shop, the agent should order the items by the number of searches, and then return the top items. If the ordering action is missing, the task is likely to fail.
+2. Site navigation: The user wants to navigate to a specific page. Carefully examine the bot's action history and the final state of the webpage to determine whether the bot successfully completes the task. No need to consider the bot's response.
+3. Content modification: The user wants to modify the content of a webpage or configuration. Carefully examine the bot's action history and the final state of the webpage to determine whether the bot successfully completes the task. No need to consider the bot's response.
+
+*IMPORTANT*
+Format your response into two lines as shown below:
+
+Thoughts: <your thoughts and reasoning process>
+Status: "success" or "failure"
+"""
+    prompt = """User Intent: {task}
+
+Action History:
+{last_actions}
+
+The last snapshot of the web page is shown in the image."""
+
+    text = prompt.format(task=task, last_actions="\n".join(f"{i+1}. {action}" for i, action in enumerate(last_actions)))
+
+    jpg_base64_str = encode_image(Image.open(images_path))
+    messages = [
+        {"role": "system", "content": system_msg},
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": text},
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/jpeg;base64,{jpg_base64_str}", "detail": "high"},
+                },
+            ],
+        }
+    ]
+    return messages, text, system_msg