diff --git a/.DS_Store b/.DS_Store index b550ed0..2269f7e 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/.gitignore b/.gitignore old mode 100644 new mode 100755 index 2ab4dcc..c076bcc --- a/.gitignore +++ b/.gitignore @@ -19,6 +19,10 @@ data/* !data/human_labeled_reward_reference/* !data/dataset_io.py !data/raw_data_processor.py +!data/Online-Mind2Web/ +!data/Online-Mind2Web/Online_Mind2Web.json + +!configs/log_config.json test.py test_dom_tree.py @@ -27,4 +31,15 @@ agent/Environment/html_env/js_event/ node_modules/ package-lock.json -package.json \ No newline at end of file +package.json + +results_wodom/ +results_wodom33/ +scripts/ +eval_agent/ +dataset/ +dataset_4o/ +results* +batch_output* +check_result* +dataset_41* diff --git a/OM2W_Benchmarking/__init__.py b/OM2W_Benchmarking/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/OM2W_Benchmarking/dataset_check.py b/OM2W_Benchmarking/dataset_check.py new file mode 100644 index 0000000..0f4a549 --- /dev/null +++ b/OM2W_Benchmarking/dataset_check.py @@ -0,0 +1,38 @@ +import os +import sys +from pathlib import Path + +def count_trajectory_files(): + base_dir = Path("../WebCanvas/dataset") + if not base_dir.exists(): + print(f"Error: Path '{base_dir}' does not exist") + sys.exit(1) + + empty_trajectories = [] + task_count = 0 + for task_id_dir in base_dir.iterdir(): + if task_id_dir.is_dir(): + task_count += 1 + trajectory_dir = task_id_dir / "trajectory" + + if trajectory_dir.exists() and trajectory_dir.is_dir(): + files = list(trajectory_dir.iterdir()) + file_count = len(files) + print(f"Task ID: {task_id_dir.name}, Number of Trajectory files: {file_count}") + if file_count == 0: + empty_trajectories.append(task_id_dir.name) + else: + print(f"Task ID: {task_id_dir.name}, Trajectory folder does not exist") + empty_trajectories.append(task_id_dir.name) + + print("\nThe Task ID of the empty Trajectory folder:") + if empty_trajectories: + for task_id in empty_trajectories: + print(task_id) + else: + print("Empty trajectory folder not found") + + print(f"\nA total of {task_count} Task ids are processed") + +if __name__ == "__main__": + count_trajectory_files() \ No newline at end of file diff --git a/OM2W_Benchmarking/eval.sh b/OM2W_Benchmarking/eval.sh new file mode 100644 index 0000000..bb1797e --- /dev/null +++ b/OM2W_Benchmarking/eval.sh @@ -0,0 +1,29 @@ + +# api_key=API_KEY +# model_name=MODEL_NAME +api_key=${OPENAI_API_KEY} +model_name=gpt-4o +# model_name=o4-mini + +#Automatic evaluation method +modes=( + "WebJudge_Online_Mind2Web_eval" + # "WebJudge_general_eval" + # "Autonomous_eval" + # "WebVoyager_eval" + # "AgentTrek_eval" +) + +# base_dir="./data/example" +base_dir="../WebCanvas/dataset" + +for mode in "${modes[@]}"; do + python ./OM2W_Benchmarking/src/run.py \ + --mode "$mode" \ + --model "${model_name}" \ + --trajectories_dir "$base_dir" \ + --api_key "${api_key}" \ + --output_path results \ + --num_worker 1 \ + --score_threshold 3 +done diff --git a/OM2W_Benchmarking/src/clean_html.py b/OM2W_Benchmarking/src/clean_html.py new file mode 100644 index 0000000..0069694 --- /dev/null +++ b/OM2W_Benchmarking/src/clean_html.py @@ -0,0 +1,42 @@ +from typing import Any, Iterable, List +from bs4 import BeautifulSoup + +SALIENT_ATTRIBUTES = ( + "alt", + "aria-describedby", + "aria-label", + "aria-role", + "aria-controls", + "input-checked", + "label", + "name", + "option_selected", + "placeholder", + "readonly", + "text-value", + "title", + "value", + "data-gtm-label", + "href", + "role", +) + +def 
process_element_tag(element: str, salient_attributes: Iterable[str]) -> str:
+    """Clean an HTML element string, keeping only salient_attributes."""
+    if not element.endswith(">"):
+        element += "'>"  # close a truncated element string (e.g., one cut off inside an attribute value)
+
+    soup = BeautifulSoup(element, "html.parser")
+    for tag in soup.find_all(True):
+        # Keep only salient attributes
+        filtered_attrs = {k: tag.attrs[k] for k in tag.attrs if k in salient_attributes}
+        name_val = filtered_attrs.pop("name", None)
+        new_tag = soup.new_tag(tag.name, **filtered_attrs)
+        if name_val:
+            new_tag["name"] = name_val
+        return str(new_tag).split(f"</{tag.name}>")[0]  # keep only the opening tag
+    return element
+
+if __name__ == "__main__":
+    text = '<input id="search_input" placeholder="Search" aria-label="Search" class="nav-input"> -> TYPE beef sirloin'  # illustrative element; the original sample string lost its HTML tag
+    print(process_element_tag(text, SALIENT_ATTRIBUTES))
\ No newline at end of file
diff --git a/OM2W_Benchmarking/src/methods/agenttrek_eval.py b/OM2W_Benchmarking/src/methods/agenttrek_eval.py
new file mode 100644
index 0000000..4d1f8fe
--- /dev/null
+++ b/OM2W_Benchmarking/src/methods/agenttrek_eval.py
@@ -0,0 +1,56 @@
+from utils import encode_image
+from PIL import Image
+
+def AgentTrek_eval(task, last_actions, thoughts, images_path):
+    system_msg = """You are an expert in evaluating the performance of a web navigation agent. The agent is designed to help a human user navigate a website to complete a task. Given the user's task goal, the agent's trajectory, your goal is to decide whether the agent's execution is successful or not.
+
+*Evaluation Criteria*
+Whether the agent's trajectory is effective and corresponding to the goal
+
+*Instructions* 1. Review the agent's actions and reasoning processes step by step.
+2. if the agent is stuck in the very first login stage, which means it fails to log into target website at the beginning, that's a failure.
+3. Determine if the agent has achieved the task goal based on the trajectory. A task can be considered successful if most trajectory is effective.
+4. the agent sometimes can't stop after finishing a task and continue doing repeated actions. these actions may be some failed attempt after a series of correct actions. the task should be regarded as successful if the correct actions are effective and almost reach the goal.
+5. if the agent is stuck in the loop at the early stage of the task, which means they don't even get close to the goal before they get stuck in the loop, that's a failure. for example, the agent begin to get stuck before third step.
+6. when the task is to change the google account password, it can't be regarded as successful when agent finish at trying to click "manage your account".
+7. if there are over 8 correct action in the trajectory, it can be regard as a successful agent.
+8. final saving action is not a must. the task is successful if the agent does most things right and just forget to save the change at last.
+9. if the original task has 2 subtasks, the agent only complete one of them, that's still a success. e.g. the task is to update name and birthday, but agent only update name, that's fine.
+10. if the task is to post a review, the agent can be considered successful when it finish writing the review and reach the step to post it, don't have to click the post button.
+11. Since we don't have a printer, some printing related task can be considered successful if the agent reach the step to click print button.
+12. if the task is finished at the initial state and the agent do nothing because of it, it should also be regarded as successful.
+
+*IMPORTANT*
+1. 
in the trajectory, an action always follows a corresponding reasoning, which shows the observation and thought of the agent. +2. your response should be contain: +Thoughts: +Status: "success" or "failure" +""" + prompt = """The goal of the task: {task} + +Trajectory: +{thoughts_and_actions} + +The last snapshot of the web page is shown in the image.""" + thoughts_and_actions = "" + for idx, (thought, action) in enumerate(zip(thoughts, last_actions)): + thought = thought.replace("\n\n", " ") + action = action.replace("\n\n", " ") + thoughts_and_actions += f"Thought {idx+1}: {thought}\nAction {idx+1}: {action}\n\n" + text = prompt.format(task=task, thoughts_and_actions=thoughts_and_actions.strip("\n\n")) + + jpg_base64_str = encode_image(Image.open(images_path)) + messages = [ + {"role": "system", "content": system_msg}, + { + "role": "user", + "content": [ + {"type": "text", "text": text}, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{jpg_base64_str}", "detail": "high"}, + }, + ], + } + ] + return messages, text, system_msg \ No newline at end of file diff --git a/OM2W_Benchmarking/src/methods/automomous_eval.py b/OM2W_Benchmarking/src/methods/automomous_eval.py new file mode 100644 index 0000000..3c4d907 --- /dev/null +++ b/OM2W_Benchmarking/src/methods/automomous_eval.py @@ -0,0 +1,41 @@ +from utils import encode_image +from PIL import Image + +def Autonomous_eval(task, last_actions, images_path): + system_msg = """You are an expert in evaluating the performance of a web navigation agent. The agent is designed to help a human user navigate a website to complete a task. Given the user's intent, the agent's action history, the final state of the webpage, and the agent's response to the user, your goal is to decide whether the agent's execution is successful or not. + +There are three types of tasks: +1. Information seeking: The user wants to obtain certain information from the webpage, such as the information of a product, reviews, map info, comparison of map routes, etc. The bot's response must contain the information the user wants, or explicitly state that the information is not available. Otherwise, e.g. the bot encounters an exception and respond with the error content, the task is considered a failure. Besides, be careful about the sufficiency of the agent's actions. For example, when asked to list the top-searched items in a shop, the agent should order the items by the number of searches, and then return the top items. If the ordering action is missing, the task is likely to fail. +2. Site navigation: The user wants to navigate to a specific page. Carefully examine the bot's action history and the final state of the webpage to determine whether the bot successfully completes the task. No need to consider the bot's response. +3. Content modification: The user wants to modify the content of a webpage or configuration. Carefully examine the bot's action history and the final state of the webpage to determine whether the bot successfully completes the task. No need to consider the bot's response. + +*IMPORTANT* +Format your response into two lines as shown below: + +Thoughts: +Status: "success" or "failure" +""" + prompt = """User Intent: {task} + +Action History: +{last_actions} + +The last snapshot of the web page is shown in the image.""" + + text = prompt.format(task=task, last_actions="\n".join(f"{i+1}. 
{action}" for i, action in enumerate(last_actions))) + + jpg_base64_str = encode_image(Image.open(images_path)) + messages = [ + {"role": "system", "content": system_msg}, + { + "role": "user", + "content": [ + {"type": "text", "text": text}, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{jpg_base64_str}", "detail": "high"}, + }, + ], + } + ] + return messages, text, system_msg \ No newline at end of file diff --git a/OM2W_Benchmarking/src/methods/webjudge_general_eval.py b/OM2W_Benchmarking/src/methods/webjudge_general_eval.py new file mode 100644 index 0000000..9bdb5cd --- /dev/null +++ b/OM2W_Benchmarking/src/methods/webjudge_general_eval.py @@ -0,0 +1,225 @@ +from utils import encode_image +from PIL import Image +import re +import asyncio +MAX_IMAGE =50 + +async def identify_key_points(task, input_image_paths, model): + system_msg = """You are an expert tasked with analyzing a given task to identify the key points explicitly stated in the task description. + +**Objective**: Carefully analyze the task description and extract the critical elements explicitly mentioned in the task for achieving its goal. + +**Instructions**: +1. Read the task description carefully. +2. Identify and extract **key points** directly stated in the task description. + - A **key point** is a critical element, condition, or step explicitly mentioned in the task description. + - Do not infer or add any unstated elements. + - Words such as "best," "highest," "cheapest," "latest," "most recent," "lowest," "closest," "highest-rated," "largest," and "newest" must go through the sort function(e.g., the key point should be "Filter by highest"). + +**Respond with**: +- **Key Points**: A numbered list of the explicit key points for completing this task, one per line, without explanations or additional details.""" + + prompt = """Task: {task}""" + text = prompt.format(task=task) + + input_images_msg = [] + + if input_image_paths != None: + for input_image_path in input_image_paths: + input_images_jpg_base64_str = encode_image(Image.open(input_image_path)) + input_images_msg.append( + { + 'type': 'image_url', + 'image_url': {"url": f"data:image/png;base64,{input_images_jpg_base64_str}", "detail": "high"} + } + ) + + messages = [ + {"role": "system", "content": system_msg}, + { + "role": "user", + "content": [ + {"type": "text", "text": text} + ]+ input_images_msg, + } + ] + responses = await asyncio.to_thread(model.generate, messages) + return responses[0] + +async def judge_image(task, input_image_paths, image_path, key_points, model): + system_msg = """You are an expert evaluator tasked with determining whether an image contains information about the necessary steps to complete a task. + +**Objective**: Analyze the provided image and decide if it shows essential steps or evidence required for completing the task. Use your reasoning to explain your decision before assigning a score. + +**Instructions**: +1. Provide a detailed description of the image, including its contents, visible elements, text (if any), and any notable features. + +2. Carefully examine the image and evaluate whether it contains necessary steps or evidence crucial to task completion: +- Identify key points that could be relevant to task completion, such as actions, progress indicators, tool usage, applied filters, or step-by-step instructions. +- Does the image show actions, progress indicators, or critical information directly related to completing the task? 
+- Is this information indispensable for understanding or ensuring task success? +- If the image contains partial but relevant information, consider its usefulness rather than dismissing it outright. + +3. Provide your response in the following format: +- **Reasoning**: Explain your thought process and observations. Mention specific elements in the image that indicate necessary steps, evidence, or lack thereof. +- **Score**: Assign a score based on the reasoning, using the following scale: + - **1**: The image does not contain any necessary steps or relevant information. + - **2**: The image contains minimal or ambiguous information, unlikely to be essential. + - **3**: The image includes some relevant steps or hints but lacks clarity or completeness. + - **4**: The image contains important steps or evidence that are highly relevant but not fully comprehensive. + - **5**: The image clearly displays necessary steps or evidence crucial for completing the task. + +Respond with: +### Reasoning**: [Your explanation] +### Score**: [1-5]""" + + + prompt = """**Task**: {task} + +**Key Points for Task Completion**: {key_points} + +The snapshot of the web page is shown in the image.""" + text = prompt.format(task=task,key_points=key_points) + + input_images_msg = [] + if input_image_paths != None: + for input_image_path in input_image_paths: + input_images_jpg_base64_str = encode_image(Image.open(input_image_path)) + input_images_msg.append( + { + 'type': 'image_url', + 'image_url': {"url": f"data:image/png;base64,{input_images_jpg_base64_str}", "detail": "high"} + } + ) + messages = [{"role": "system", "content": system_msg}] + + if input_images_msg: + messages.append({ + "role": "user", + "content": [{"type": "text", "text": "The input images are:"}] + input_images_msg + }) + + jpg_base64_str = encode_image(Image.open(image_path)) + messages.append( + { + "role": "user", + "content": [ + {"type": "text", "text": text}, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{jpg_base64_str}", "detail": "high"}, + }, + ] + } + ) + + responses = await asyncio.to_thread(model.generate, messages) + return responses[0] + + +async def WebJudge_general_eval(task, input_image_paths, action_thoughts, last_actions, images_path, model, score_threshold): + system_msg = """You are an expert in evaluating the performance of a web navigation agent. The agent is designed to help a human user navigate a website to complete a task. Given the user's task, the agent's action history, key points for task completion, some potentially important web pages in the agent's trajectory and their reasons, your goal is to determine whether the agent has completed the task and achieved all requirements. + +Your response must strictly follow the following evaluation criteria! +*Important Evaluation Criteria*: +1: The filtered results must be displayed correctly. If filters were not properly applied (i.e., missing selection, missing confirmation, or no visible effect in results), it should be considered a failure. +2: You must carefully check whether these snapshots and action history meet these key points. Ensure that specific filter conditions, such as "best," "highest," "cheapest," "latest," "most recent," "lowest," "closest," "highest-rated," "largest," and "newest" are correctly applied using the filter function(e.g., sort function). +3: Certain key points or requirements should be applied by the filter. 
Otherwise, a search with all requirements as input will be deemed a failure since it cannot guarantee that all results meet the requirements! +4: If the task requires filtering by a specific range of money, years, or the number of beds and bathrooms, the applied filter must exactly match the given requirement. Any deviation results in failure. To ensure the task is successful, the applied filter must precisely match the specified range without being too broad or too narrow. +5: Some tasks require a submission action or a display of results to be considered successful. Repeat actions or actions that do not lead to a visible result should be considered a failure. +6: If the agent loops through a sequence of actions that do not make progress toward the goal (including failing to click "Save" or "Submit," etc.), it should be considered a failure. + +Format your response into two lines as shown below: +Thoughts: +Status: "success" or "failure" +""" + prompt = """User Task: {task} + +Key Points: {key_points} + +Action History: +{last_actions} + +The potentially important snapshots of the webpage in the agent's trajectory and their reasons: +{thoughts}""" + + + key_points = await identify_key_points(task, input_image_paths, model) + key_points = key_points.replace("\n\n", "\n") + + try: + key_points = key_points.split("**Key Points**:")[1] + key_points = "\n".join(line.lstrip() for line in key_points.splitlines()) + except: + key_points = key_points.split("Key Points:")[-1] + key_points = "\n".join(line.lstrip() for line in key_points.splitlines()) + + tasks = [judge_image(task, input_image_paths, image_path, key_points, model) for image_path in images_path] + image_responses = await asyncio.gather(*tasks) + + input_images_msg = [] + whole_content_img = [] + whole_thoughts = [] + record = [] + pattern = r"[1-5]" + for response, image_path in zip(image_responses, images_path): + try: + score_text = response.split("### Score")[1] + thought = response.split("### Reasoning:")[-1].strip().lstrip("\n").split("### Score")[0].replace('\n',' ') + score = re.findall(pattern, score_text)[0] + record.append({"Response": response, "Score": int(score)}) + except Exception as e: + print(f"Error processing response: {e}") + score = 0 + record.append({"Response": response, "Score": 0}) + + if int(score) >= score_threshold: + jpg_base64_str = encode_image(Image.open(image_path)) + whole_content_img.append( + { + 'type': 'image_url', + 'image_url': {"url": f"data:image/png;base64,{jpg_base64_str}", "detail": "high"} + } + ) + if thought != "": + whole_thoughts.append(thought) + + whole_content_img = whole_content_img[:MAX_IMAGE] + whole_thoughts = whole_thoughts[:MAX_IMAGE] + if len(whole_content_img) == 0: + prompt = """User Task: {task} + +Key Points: {key_points} + +Action History: +{last_actions}""" + + if action_thoughts != None: + text = prompt.format(task=task, last_actions="\n".join(f"{i+1}. {action}. Reasoning: {action_thought}" for i, (action, action_thought) in enumerate(zip(last_actions,action_thoughts))), key_points=key_points, thoughts = "\n".join(f"{i+1}. {thought}" for i, thought in enumerate(whole_thoughts))) + + else: + text = prompt.format(task=task, last_actions="\n".join(f"{i+1}. {action}" for i, action in enumerate(last_actions)), key_points=key_points, thoughts = "\n".join(f"{i+1}. 
{thought}" for i, thought in enumerate(whole_thoughts))) + + input_images_msg = [] + if input_image_paths is not None: + for path in input_image_paths: + input_images_jpg_base64_str = encode_image(Image.open(path)) + input_images_msg.append({ + "type": "image_url", + "image_url": {"url": f"data:image/png;base64,{input_images_jpg_base64_str}", "detail": "high"} + }) + + messages = [{"role": "system", "content": system_msg}] + + if input_images_msg: + messages.append({ + "role": "user", + "content": [{"type": "text", "text": "The input images are:"}] + input_images_msg + }) + + messages.append({ + "role": "user", + "content": [{"type": "text", "text": text}] + whole_content_img + }) + + return messages, text, system_msg, record, key_points \ No newline at end of file diff --git a/OM2W_Benchmarking/src/methods/webjudge_online_mind2web.py b/OM2W_Benchmarking/src/methods/webjudge_online_mind2web.py new file mode 100644 index 0000000..1b3d421 --- /dev/null +++ b/OM2W_Benchmarking/src/methods/webjudge_online_mind2web.py @@ -0,0 +1,248 @@ +from utils import encode_image +from PIL import Image +import re +import asyncio +from datetime import datetime + +MAX_IMAGE =50 + +async def identify_key_points(task, model): + system_msg = """You are an expert tasked with analyzing a given task to identify the key points explicitly stated in the task description. + +**Objective**: Carefully analyze the task description and extract the critical elements explicitly mentioned in the task for achieving its goal. + +**Instructions**: +1. Read the task description carefully. +2. Identify and extract **key points** directly stated in the task description. + - A **key point** is a critical element, condition, or step explicitly mentioned in the task description. + - Do not infer or add any unstated elements. + - Words such as "best," "highest," "cheapest," "latest," "most recent," "lowest," "closest," "highest-rated," "largest," and "newest" must go through the sort function(e.g., the key point should be "Filter by highest"). + +**Respond with**: +- **Key Points**: A numbered list of the explicit key points for completing this task, one per line, without explanations or additional details.""" + prompt = """Task: {task}""" + text = prompt.format(task=task) + messages = [ + {"role": "system", "content": system_msg}, + { + "role": "user", + "content": [ + {"type": "text", "text": text} + ], + } + ] + responses = await asyncio.to_thread(model.generate, messages) + return responses[0] + +async def judge_image(task, image_path, key_points, model): + system_msg = """You are an expert evaluator tasked with determining whether an image contains information about the necessary steps to complete a task. + +**Objective**: Analyze the provided image and decide if it shows essential steps or evidence required for completing the task. Use your reasoning to explain your decision before assigning a score. + +**Instructions**: +1. Provide a detailed description of the image, including its contents, visible elements, text (if any), and any notable features. + +2. Carefully examine the image and evaluate whether it contains necessary steps or evidence crucial to task completion: +- Identify key points that could be relevant to task completion, such as actions, progress indicators, tool usage, applied filters, or step-by-step instructions. +- Does the image show actions, progress indicators, or critical information directly related to completing the task? +- Is this information indispensable for understanding or ensuring task success? 
+- If the image contains partial but relevant information, consider its usefulness rather than dismissing it outright. + +3. Provide your response in the following format: +- **Reasoning**: Explain your thought process and observations. Mention specific elements in the image that indicate necessary steps, evidence, or lack thereof. +- **Score**: Assign a score based on the reasoning, using the following scale: + - **1**: The image does not contain any necessary steps or relevant information. + - **2**: The image contains minimal or ambiguous information, unlikely to be essential. + - **3**: The image includes some relevant steps or hints but lacks clarity or completeness. + - **4**: The image contains important steps or evidence that are highly relevant but not fully comprehensive. + - **5**: The image clearly displays necessary steps or evidence crucial for completing the task. + +Respond with: +1. **Reasoning**: [Your explanation] +2. **Score**: [1-5]""" + + jpg_base64_str = encode_image(Image.open(image_path)) + + prompt = """**Task**: {task} + +**Key Points for Task Completion**: {key_points} + +The snapshot of the web page is shown in the image.""" + text = prompt.format(task=task,key_points=key_points) + + messages = [ + {"role": "system", "content": system_msg}, + { + "role": "user", + "content": [ + {"type": "text", "text": text}, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{jpg_base64_str}", "detail": "high"}, + }, + ], + } + ] + + responses = await asyncio.to_thread(model.generate, messages) + return responses[0] + +async def WebJudge_Online_Mind2Web_eval(task, last_actions, images_path, model, score_threshold): + system_msg = """You are an expert in evaluating the performance of a web navigation agent. The agent is designed to help a human user navigate a website to complete a task. Given the user's task, the agent's action history, key points for task completion, some potentially important web pages in the agent's trajectory and their reasons, your goal is to determine whether the agent has completed the task and achieved all requirements. + +Your response must strictly follow the following evaluation criteria! +*Important Evaluation Criteria*: +1: The filtered results must be displayed correctly. If filters were not properly applied (i.e., missing selection, missing confirmation, or no visible effect in results), the task is not considered successful. +2: You must carefully check whether these snapshots and action history meet these key points. Ensure that specific filter conditions, such as "best," "highest," "cheapest," "latest," "most recent," "lowest," "closest," "highest-rated," "largest," and "newest" are correctly applied using the filter function(e.g., sort function). +3: Certain key points or requirements should be applied by the filter. Otherwise, a search with all requirements as input will be deemed a failure since it cannot guarantee that all results meet the requirements! +4: If the task requires filtering by a specific range of money, years, or the number of beds and bathrooms, the applied filter must exactly match the given requirement. Any deviation results in failure. To ensure the task is successful, the applied filter must precisely match the specified range without being too broad or too narrow. +Examples of Failure Cases: +- If the requirement is less than $50, but the applied filter is less than $25, it is a failure. +- If the requirement is $1500-$2500, but the applied filter is $2000-$2500, it is a failure. 
+- If the requirement is $25-$200, but the applied filter is $0-$200, it is a failure. +- If the required years are 2004-2012, but the filter applied is 2001-2012, it is a failure. +- If the required years are before 2015, but the applied filter is 2000-2014, it is a failure. +- If the task requires exactly 2 beds, but the filter applied is 2+ beds, it is a failure. +5: Some tasks require a submission action or a display of results to be considered successful. +6: If the retrieved information is invalid or empty(e.g., No match was found), but the agent has correctly performed the required action, it should still be considered successful. +7: If the current page already displays all available items, then applying a filter is not necessary. As long as the agent selects items that meet the requirements (e.g., the cheapest or lowest price), the task is still considered successful. + +*IMPORTANT* +Format your response into two lines as shown below: + +Thoughts: +Status: "success" or "failure" +""" + prompt = """User Task: {task} + +Key Points: {key_points} + +Action History: +{last_actions} + +The potentially important snapshots of the webpage in the agent's trajectory and their reasons: +{thoughts}""" + + + key_points = await identify_key_points(task, model) + key_points = key_points.replace("\n\n", "\n") + + try: + key_points = key_points.split("**Key Points**:")[1] + key_points = "\n".join(line.lstrip() for line in key_points.splitlines()) + except: + key_points = key_points.split("Key Points:")[-1] + key_points = "\n".join(line.lstrip() for line in key_points.splitlines()) + + tasks = [judge_image(task, image_path, key_points, model) for image_path in images_path] + image_responses = await asyncio.gather(*tasks) + + whole_content_img = [] + whole_thoughts = [] + record = [] + pattern = r"[1-5]" + for response, image_path in zip(image_responses, images_path): + try: + # score_text = response.split("Score")[1] + # thought = response.split("**Reasoning**:")[-1].strip().lstrip("\n").split("\n\n")[0].replace('\n',' ') + # score = re.findall(pattern, score_text)[0] # maybe Error processing response: list index out of range + # record.append({"Response": response, "Score": int(score)}) + + print(f"Response preview: {response[:100]}...") + + score = 0 + thought = "" + + if "Score:" in response: + score_text = response.split("Score:")[1] + elif "Score " in response: + score_text = response.split("Score ")[1] + elif "score:" in response.lower(): + score_text = response.lower().split("score:")[1] + elif "**Score**:" in response: + score_text = response.split("**Score**:")[1] + elif "**Score**" in response: + score_text = response.split("**Score**")[1] + else: + print(f"Full response without Score: {response}") + score = 0 + record.append({"Response": response, "Score": score}) + continue + + if "**Reasoning**:" in response: + thought = response.split("**Reasoning**:")[-1].strip().lstrip("\n").split("\n\n")[0].replace('\n',' ') + elif "Reasoning:" in response: + thought = response.split("Reasoning:")[-1].strip().lstrip("\n").split("\n\n")[0].replace('\n',' ') + else: + thought = "" + + scores = re.findall(pattern, score_text) + if scores: + score = scores[0] + else: + print(f"No score pattern match in: {score_text[:50]}...") + score = 0 + + record.append({"Response": response, "Score": int(score)}) + + + ####################### + # if "Score:" in response: + # score_text = response.split("Score:")[1] + # elif "Score" in response: + # score_text = response.split("Score")[1] + # else: + # raise 
ValueError("No 'Score' found in response") + + # if "**Reasoning**:" in response: + # thought = response.split("**Reasoning**:")[-1].strip().lstrip("\n").split("\n\n")[0].replace('\n',' ') + # elif "Reasoning:" in response: + # thought = response.split("Reasoning:")[-1].strip().lstrip("\n").split("\n\n")[0].replace('\n',' ') + # else: + # thought = "" + + # scores = re.findall(pattern, score_text) + # if scores: + # score = scores[0] + # else: + # raise ValueError("No score found in the response") + + # record.append({"Response": response, "Score": int(score)}) + + except Exception as e: + print(f"Error processing response: {e}") + score = 0 + record.append({"Response": response, "Score": 0}) + + if int(score) >= score_threshold: + jpg_base64_str = encode_image(Image.open(image_path)) + whole_content_img.append( + { + 'type': 'image_url', + 'image_url': {"url": f"data:image/png;base64,{jpg_base64_str}", "detail": "high"} + } + ) + if thought != "": + whole_thoughts.append(thought) + + whole_content_img = whole_content_img[:MAX_IMAGE] + whole_thoughts = whole_thoughts[:MAX_IMAGE] + if len(whole_content_img) == 0: + prompt = """User Task: {task} + +Key Points: {key_points} + +Action History: +{last_actions}""" + text = prompt.format(task=task, last_actions="\n".join(f"{i+1}. {action}" for i, action in enumerate(last_actions)), key_points=key_points, thoughts = "\n".join(f"{i+1}. {thought}" for i, thought in enumerate(whole_thoughts))) + + messages = [ + {"role": "system", "content": system_msg}, + { + "role": "user", + "content": [ + {"type": "text", "text": text}] + + whole_content_img + } + ] + return messages, text, system_msg, record, key_points \ No newline at end of file diff --git a/OM2W_Benchmarking/src/methods/webvoyager_eval.py b/OM2W_Benchmarking/src/methods/webvoyager_eval.py new file mode 100644 index 0000000..c9594a0 --- /dev/null +++ b/OM2W_Benchmarking/src/methods/webvoyager_eval.py @@ -0,0 +1,49 @@ +from utils import encode_image +from PIL import Image +MAX_IMAGE =50 + +def WebVoyager_eval(task, images_path, response, k=0): + system_msg = """As an evaluator, you will be presented with three primary components to assist you in your role: + +1. Web Task Instruction: This is a clear and specific directive provided in natural language, detailing the online activity to be carried out. These requirements may include conducting searches, verifying information, comparing prices, checking availability, or any other action relevant to the specified web service (such as Amazon, Apple, ArXiv, BBC News, Booking etc). + +2. Result Screenshots: This is a visual representation of the screen showing the result or intermediate state of performing a web task. It serves as visual proof of the actions taken in response to the instruction. + +3. Result Response: This is a textual response obtained after the execution of the web task. It serves as textual result in response to the instruction. + +-- You DO NOT NEED to interact with web pages or perform actions such as booking flights or conducting searches on websites. +-- You SHOULD NOT make assumptions based on information not presented in the screenshot when comparing it to the instructions. +-- Your primary responsibility is to conduct a thorough assessment of the web task instruction against the outcome depicted in the screenshot and in the response, evaluating whether the actions taken align with the given instructions. +-- NOTE that the instruction may involve more than one task, for example, locating the garage and summarizing the review. 
Failing to complete either task, such as not providing a summary, should be considered unsuccessful. +-- NOTE that the screenshot is authentic, but the response provided by LLM is generated at the end of web browsing, and there may be discrepancies between the text and the screenshots. +-- Note the difference: 1) Result response may contradict the screenshot, then the content of the screenshot prevails, 2) The content in the Result response is not mentioned on the screenshot, choose to believe the content. + +You should elaborate on how you arrived at your final evaluation and then provide a definitive verdict on whether the task has been successfully accomplished, either as 'SUCCESS' or 'FAILURE'.""" + prompt = """TASK: {task} + +Result Response: {response} + +{num} screenshots at the end: """ + + whole_content_img = [] + images_path = images_path[:MAX_IMAGE] + text = prompt.format(task=task, response=response, num = len(images_path) if k == 0 else k) + + for image in images_path[-k:]: + jpg_base64_str = encode_image(Image.open(image)) + whole_content_img.append( + { + 'type': 'image_url', + 'image_url': {"url": f"data:image/png;base64,{jpg_base64_str}", "detail": "high"} + } + ) + messages = [ + {"role": "system", "content": system_msg}, + { + "role": "user", + "content": [{"type": "text", "text": text}] + + whole_content_img + + [{'type': 'text', 'text': "Your verdict:\n"}] + } + ] + return messages, text, system_msg \ No newline at end of file diff --git a/OM2W_Benchmarking/src/run.py b/OM2W_Benchmarking/src/run.py new file mode 100644 index 0000000..33d567e --- /dev/null +++ b/OM2W_Benchmarking/src/run.py @@ -0,0 +1,162 @@ +import argparse +import os +from methods.agenttrek_eval import * +from methods.automomous_eval import * +from methods.webjudge_general_eval import * +from methods.webjudge_online_mind2web import * +from methods.webvoyager_eval import * +from utils import OpenaiEngine, extract_predication +import json +import copy +import asyncio +import multiprocessing + +def auto_eval(args, task_subset, final_predicted_labels, lock, model): + + ################## get the already done task id ############### + output_json_path = os.path.join(args.output_path, f"{args.mode}_{args.model}_score_threshold_{args.score_threshold}_auto_eval_results.json") + already_ids = [] + if os.path.exists(output_json_path): + with open(output_json_path,"r") as f: + already_data = f.read() + already_tasks = already_data.splitlines() + for item in already_tasks: + item = json.loads(item) + already_ids.append(item["task_id"]) + + print(f"The number of already done tasks: {len(already_ids)}") + + for task_id in task_subset: + #Skip already done task + if task_id in already_ids: + continue + + trajectory_images_path = os.path.join(args.trajectories_dir, task_id, "trajectory") + screenshot_paths = [] + thoughts = None + action_history = None + final_result_response = None + input_image_paths = None + task_description = None + # Load results + with open(os.path.join(args.trajectories_dir, task_id, "result.json")) as f: + result = json.load(f) + output_results = copy.deepcopy(result) + task_description = result["task"] + if "action_history" in result: + action_history = result["action_history"] + if "thoughts" in result: + thoughts = result["thoughts"] + if "final_result_response" in result: + final_result_response = result["final_result_response"] + if "input_image_paths" in result: + input_image_paths = result["input_image_paths"] + + print(f"Start evaluation for {task_description}") + # Do the auto-eval + 
if args.mode == "Autonomous_eval": + for image in sorted(os.listdir(trajectory_images_path), key=lambda x: int(re.findall(r'\d+', x)[0])): + screenshot_paths.append(os.path.join(trajectory_images_path, image)) + messages, text, system_msg = Autonomous_eval(task_description, action_history, screenshot_paths[-1]) + + elif args.mode == "AgentTrek_eval": + for image in sorted(os.listdir(trajectory_images_path), key=lambda x: int(re.findall(r'\d+', x)[0])): + screenshot_paths.append(os.path.join(trajectory_images_path, image)) + messages, text, system_msg = AgentTrek_eval(task_description, action_history, thoughts, screenshot_paths[-1]) + + elif args.mode == "WebVoyager_eval": + for image in sorted(os.listdir(trajectory_images_path), key=lambda x: int(re.findall(r'\d+', x)[0])): + screenshot_paths.append(os.path.join(trajectory_images_path, image)) + messages, text, system_msg = WebVoyager_eval(task_description, screenshot_paths, final_result_response) + + elif args.mode == "WebJudge_Online_Mind2Web_eval": + for image in sorted(os.listdir(trajectory_images_path), key=lambda x: int(re.findall(r'\d+', x)[0])): + screenshot_paths.append(os.path.join(trajectory_images_path, image)) + messages, text, system_msg, record, key_points = asyncio.run(WebJudge_Online_Mind2Web_eval(task_description, action_history, screenshot_paths, model, args.score_threshold)) + output_results["image_judge_record"] = record + output_results["key_points"] = key_points + + elif args.mode == "WebJudge_general_eval": + for image in sorted(os.listdir(trajectory_images_path), key=lambda x: int(re.findall(r'\d+', x)[0])): + screenshot_paths.append(os.path.join(trajectory_images_path, image)) + messages, text, system_msg, record, key_points = asyncio.run(WebJudge_general_eval(task_description, input_image_paths, thoughts, action_history, screenshot_paths, model, args.score_threshold)) + output_results["image_judge_record"] = record + output_results["key_points"] = key_points + + else: + raise ValueError(f"Unknown mode: {args.mode}") + + response = model.generate(messages)[0] + predicted_label = extract_predication(response, args.mode) + + #Store evaluation details + evaluation_results = {"response": response, "predicted_label": predicted_label} + output_results["task_id"] = task_id + output_results["input_text"] = text + output_results["system_msg"] = system_msg + output_results["evaluation_details"] = evaluation_results + output_results["predicted_label"] = predicted_label + + with lock: + final_predicted_labels.append(predicted_label) + + print(f"Finish evaluation for {task_description}") + print("="*20) + os.makedirs(args.output_path, exist_ok=True) + with lock: + with open(os.path.join(args.output_path, f"{args.mode}_{args.model}_score_threshold_{args.score_threshold}_auto_eval_results.json"), "a+") as f_out: + f_out.write(json.dumps(output_results) + "\n") + + +def process_subset(task_subset, args, final_predicted_labels, lock, model): + + auto_eval(args, task_subset, final_predicted_labels, lock, model) + +def parallel_eval(args, num_workers=60): + + #Evaluate in parallel based on num of works + task_dirs = [ + d for d in sorted(os.listdir(args.trajectories_dir)) + if os.path.isdir(os.path.join(args.trajectories_dir, d)) + ] + print(f"Evaluating {len(task_dirs)} tasks in total.") + chunk_size = len(task_dirs) // num_workers + task_subsets = [task_dirs[i:i + chunk_size] for i in range(0, len(task_dirs), chunk_size)] + + #Load model + model = OpenaiEngine( + model=args.model, + api_key=args.api_key + ) + + lock = 
multiprocessing.Lock() + with multiprocessing.Manager() as manager: + final_predicted_labels = manager.list() + processes = [] + for subset in task_subsets: + p = multiprocessing.Process(target=process_subset, args=(subset, args, final_predicted_labels, lock, model)) + p.start() + processes.append(p) + + for p in processes: + p.join() + + success_num = sum(final_predicted_labels) + + print("Evaluation complete.") + print(f"The success rate is {(success_num / len(task_dirs)) * 100}.") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Auto evaluation of web navigation tasks.") + parser.add_argument('--mode', type=str, default='Online_Mind2Web_eval', help='the mode of evaluation') + parser.add_argument('--model', type=str, default='gpt-4o') + parser.add_argument("--trajectories_dir", type=str, required=True, help="Path to trajectories directory") + parser.add_argument("--api_key", type=str, required=True, help="The api key") + parser.add_argument("--output_path", type=str, required=True, help="The output path") + parser.add_argument('--score_threshold', type=int, default=3) + parser.add_argument('--num_worker', type=int, default=60) + args = parser.parse_args() + + parallel_eval(args, args.num_worker) + diff --git a/OM2W_Benchmarking/src/utils.py b/OM2W_Benchmarking/src/utils.py new file mode 100644 index 0000000..dd0161b --- /dev/null +++ b/OM2W_Benchmarking/src/utils.py @@ -0,0 +1,146 @@ +import base64 +import io +from openai import ( + APIConnectionError, + APIError, + RateLimitError, + AzureOpenAI, + OpenAI +) +import os +import backoff + +def encode_image(image): + """Convert a PIL image to base64 string.""" + if image.mode == "RGBA": + image = image.convert("RGB") + buffered = io.BytesIO() + image.save(buffered, format="JPEG") + return base64.b64encode(buffered.getvalue()).decode('utf-8') + +def extract_predication(response, mode): + """Extract the prediction from the response.""" + if mode == "Autonomous_eval": + try: + if "success" in response.lower().split('status:')[1]: + return 1 + else: + return 0 + except: + return 0 + elif mode == "AgentTrek_eval": + try: + if "success" in response.lower().split('status:')[1]: + return 1 + else: + return 0 + except: + return 0 + elif mode == "WebVoyager_eval": + if "FAILURE" in response: + return 0 + else: + return 1 + elif mode == "WebJudge_Online_Mind2Web_eval": + try: + if "success" in response.lower().split('status:')[1]: + return 1 + else: + return 0 + except: + return 0 + elif mode == "WebJudge_general_eval": + try: + if "success" in response.lower().split('status:')[1]: + return 1 + else: + return 0 + except: + return 0 + else: + raise ValueError(f"Unknown mode: {mode}") + + +class OpenaiEngine(): + def __init__( + self, + api_key=None, + stop=[], + rate_limit=-1, + model=None, + tokenizer=None, + temperature=0, + port=-1, + endpoint_target_uri = "", + **kwargs, + ) -> None: + """Init an OpenAI GPT/Codex engine + + Args: + api_key (_type_, optional): Auth key from OpenAI. Defaults to None. + stop (list, optional): Tokens indicate stop of sequence. Defaults to ["\n"]. + rate_limit (int, optional): Max number of requests per minute. Defaults to -1. + model (_type_, optional): Model family. Defaults to None. 
+        """
+        assert (
+            os.getenv("OPENAI_API_KEY", api_key) is not None
+        ), "must pass on the api_key or set OPENAI_API_KEY in the environment"
+        if api_key is None:
+            api_key = os.getenv("OPENAI_API_KEY", api_key)
+        if isinstance(api_key, str):
+            self.api_keys = [api_key]
+        elif isinstance(api_key, list):
+            self.api_keys = api_key
+        else:
+            raise ValueError("api_key must be a string or list")
+        self.stop = stop
+        self.temperature = temperature
+        self.model = model
+        # convert rate limit to minimum request interval
+        self.request_interval = 0 if rate_limit == -1 else 60.0 / rate_limit
+        self.next_avil_time = [0] * len(self.api_keys)
+        self.client = OpenAI(
+            api_key=api_key,
+        )
+
+    def log_error(details):
+        print(f"Retrying in {details['wait']:0.1f} seconds due to {details['exception']}")
+
+    @backoff.on_exception(
+        backoff.expo,
+        (APIError, RateLimitError, APIConnectionError),
+        max_tries=3,
+        on_backoff=log_error
+    )
+    def generate(self, messages, max_new_tokens=512, temperature=0, model=None, **kwargs):
+        model = model if model else self.model
+        # response = self.client.chat.completions.create(
+        #     model=model if model else self.model,
+        #     messages=messages,
+        #     max_tokens=max_new_tokens,
+        #     temperature=temperature,
+        #     **kwargs,
+        # )
+        # return [choice.message.content for choice in response.choices]
+
+        # support for o4-mini and claude models
+        # (the request below omits max_tokens and temperature for these models)
+        if model == "o4-mini" or model.startswith("claude"):
+            # o4-mini does not support temperature=0; only the default value (1) is allowed
+            response = self.client.chat.completions.create(
+                model=model,
+                messages=messages,
+                # max_completion_tokens=max_new_tokens,
+                # do not pass the temperature parameter; use the default value
+                **kwargs,
+            )
+        else:
+            response = self.client.chat.completions.create(
+                model=model,
+                messages=messages,
+                max_tokens=max_new_tokens,
+                temperature=temperature,
+                **kwargs,
+            )
+
+        return [choice.message.content for choice in response.
choices] \ No newline at end of file diff --git a/OM2W_Benchmarking/statistic.py b/OM2W_Benchmarking/statistic.py new file mode 100644 index 0000000..b7fa0b6 --- /dev/null +++ b/OM2W_Benchmarking/statistic.py @@ -0,0 +1,99 @@ +import json +import os +from collections import defaultdict, Counter + +def process_results_file(results_file, task_id_to_level): + evaluation_results = [] + with open(results_file, "r") as f: + for line in f: + line = line.strip() + if line: + try: + result = json.loads(line) + evaluation_results.append(result) + except json.JSONDecodeError: + print(f"Unable to parse JSON lines: {line[:50]}...") + + # Success/failure by difficulty + level_stats = defaultdict(lambda: {"total": 0, "success": 0}) + total_stats = {"total": 0, "success": 0} + unknown_level_tasks = [] + + for result in evaluation_results: + task_id = result.get("task_id") + if not task_id: + print("Missing results for task_id") + continue + + predicted_label = result.get("predicted_label") + if predicted_label is None: + print(f"task {task_id} loss predicted_label") + continue + + level = task_id_to_level.get(task_id) + if not level: + unknown_level_tasks.append(task_id) + continue + + level_stats[level]["total"] += 1 + level_stats[level]["success"] += predicted_label + + total_stats["total"] += 1 + total_stats["success"] += predicted_label + + return { + "total_stats": total_stats, + "level_stats": level_stats, + "unknown_level_tasks": unknown_level_tasks + } + +def print_statistics(stats, file_name): + total_stats = stats["total_stats"] + level_stats = stats["level_stats"] + unknown_level_tasks = stats["unknown_level_tasks"] + + print(f"\n{file_name} Evaluation results Statistics:") + print("-" * 50) + print(f"Total number of evaluation tasks: {total_stats['total']}") + + if total_stats["total"] > 0: + print(f"Total success rate: {total_stats['success']/total_stats['total']*100:.2f}%") + else: + print("No data") + + print("-" * 50) + print("Statistics by difficulty Level:") + for level in ["easy", "medium", "hard"]: + stats = level_stats.get(level, {"total": 0, "success": 0}) + if stats["total"] > 0: + success_rate = stats["success"] / stats["total"] * 100 + print(f"{level.capitalize()} task: {stats['success']}/{stats['total']} Success rate: {success_rate:.2f}%") + else: + print(f"{level.capitalize()} task: no data") + + if unknown_level_tasks: + print("-" * 50) + print(f"{len(unknown_level_tasks)} the difficulty level was not found") + print(f"Examples of task ids with unknown difficulty levels: {unknown_level_tasks[:3]}") + +def calculate_success_rates(): + with open("../WebCanvas/data/Online-Mind2Web/Online_Mind2Web.json", "r") as f: + task_levels = json.load(f) + + task_id_to_level = {task["task_id"]: task["level"] for task in task_levels} + + # define the result files to process + results_files = [ + "../WebCanvas/results/WebJudge_Online_Mind2Web_eval_gpt-4o_score_threshold_3_auto_eval_results.json" + ] + + for results_file in results_files: + if not os.path.exists(results_file): + print(f"Warning: The {results_file} does not exist, skip") + continue + + stats = process_results_file(results_file, task_id_to_level) + print_statistics(stats, os.path.basename(results_file)) + +if __name__ == "__main__": + calculate_success_rates() \ No newline at end of file diff --git a/README.md b/README.md index fc9cac3..a4e77b7 100644 --- a/README.md +++ b/README.md @@ -1,75 +1,4 @@ -

-WebCanvas: All-in-one Open-Source Framework for Building, Training, and Evaluating Web Agents
-(removed README header: badges for License MIT, Python Version 3.11, GitHub Issues, PRs Welcome, GitHub Stars, and GitHub Forks; links to Platform, Paper, Dataset, Discord, Twitter, and WeChat)

- -Existing frameworks for web agent development are either offline and static, or operate within a fully reproducible environment with limited Internet dynamics. The WebCanvas project aims to pioneer the online development, training and evaluation of web agents. We offer a suite of toolkits for scaling and maintaining a **KEY-NODE** based web trajectories for web agents to support this endeavor. We welcome any constructive feedback on the project and look forward to partnering with you in developing agents for web tasks! - -![Main Figure](src/main_figure.png) - -## 🔥 News -- **[2024, December 26]** v0.0.4 released! Major updates include: - - Introduced a new JavaScript event listener-based evaluation system that decouples evaluation methods from action space, enabling assessment of purely visually-grounded agents - - Integrated with [Browserbase](https://www.browserbase.com/) cloud browser environment for more stable and consistent evaluation - - Published and maintaining an up-to-date [leaderboard](Mind2web-live_Leaderboard.md) for Mind2Web-Live benchmark, still far from saturation! -- **[2024, September 9]** Support evaluation for OpenAI new o1 models, includes o1-preview and o1-mini. Just set the 'planning_text_model' parameter to 'o1-preview' or 'o1-mini'. -- **[2024, August 9]** We're excited to announce the release of v0.0.3 of WebCanvas! This update introduces support for evaluation of data operations, such as caching data in process and outputting the final answer. You can now define and evaluate a broader range of web tasks using iMean Builder and WebCanvas. Additionally, we've introduced a new metric: ***US dollar consumption / key node completion(usd_efficiency_score)***, detailed in [this section](#usd_efficiency). We believe that an agent's efficiency is crucial for online web tasks, and this metric will help quantify that efficiency. -- **[2024, July 13]** We've released v0.0.2 of WebCanvas. This update brings the ability to call different base model services, including OpenAI, Claude, Gemini, and together.ai. Now, you can choose any of these model services for testing on our platform. Additionally, we've launched a new repository: [WebCanvas Showcase](https://github.com/iMeanAI/WebCanvas_showcase), which demonstrates how different agent frameworks can be integrated with the WebCanvas framework for online evaluation. We're kicking things off with the integration of SEEACT[^5] and WebCanvas. Play with it and explore the possibilities! -- **[2024, June 18]** Our paper will be presented at [agentic markets workshop](https://sites.google.com/view/amw-2024/home?authuser=0) in ICML 2024 and [natural language reasoning and structured explanations workshop](https://nl-reasoning-workshop.github.io/) in ACL 2024. See you in Vienna and Bangkok! -- **[2024, June 18]** Our pre-print [paper](https://arxiv.org/abs/2406.12373) "WebCanvas: Benchmarking Web Agents in Online Environments" is available! -- **[2024, June 6]** We've released WebCanvas, including [Data](https://huggingface.co/datasets/iMeanAI/Mind2Web-Live), [Platform](https://www.imean.ai/web-canvas), [Toolkits](https://webcanvas.gitbook.io/webcanvas-docs), and Web agents(in this repo)! - - -## 🌟Features - -- **Base Agent Framework**: Includes a base agent framework with several key modules - *Planning*, *Observation*, *Memory*, *Reward*, *Action Execution*, *Evaluation*, designed to be plug-and-play, enabling developers to easily test and iterate on their own LLM-based web agents. 
-- **Dynamic and Real-time Web Environment Interaction**: Utilizes live web environments to provide a realistic assessment and feedback of web agents, thus addressing key challenges for web agents such as error handling, CAPTCHA solving, key-node based evaluation, metrics for agent efficiency etc.. -- **Key Nodes Annotation**: Introduces the concept of "key nodes" to offer in-progress feedback and a granular, phase-based assessment system that rigorously evaluate web agents in the wild. -- **Scale Web Agent Evaluation in Live Web Environments**: Connected to a comprehensive suite of toolkits with accurate observation capture and rich action space to define demonstration trajectories and intermediate states for real-time, open-ended web tasks, allowing for robust evaluation in dynamic web environments. Check out our [How to guide](https://webcanvas.gitbook.io/webcanvas-docs). -- **Mind2Web-Live Dataset**: Presents a refined version of the original Mind2Web[^1] static dataset, containing 542 tasks with 2439 intermediate evaluation states, serving as the foundation general purpose benchmark. -- **Open Data Access**: Raw data of all challenges can be downloaded, including raw html, full screenshot, DOM tree, Axtree, operation video, captured action, element information, etc., refer to [challenge propose](#challenge) and [data download](#download). The data is open accessible to the community free for research use. - -## 🚀 Roadmap - -- **Better Modularity and More Flexible Integration**: To help easier integration of WebCanvas evaluation, connect offline agents to online web environment. -- **Dynamic Evaluation Function**: Provide toolkit for community to define dynamic evaluation functions(for example, model-based evaluation) as supplementary of current static evaluation functions. -- **More Dataset Coverage**: Introduce more datasets in different domains that address key capabilities in online web tasks. -- **Accumulate Knowledge on Agent Experiences**: Develop better algorithm to handle error encountered when inference in live environment, also accumulate knowledge on agent experiences in different websites. -- **Better Visualization of Agent Performance**: Enable batch visualization and analysis of agent trajectories. -- **More Reliable Evaluation**: Cloud browser environment with captcha solving for more stable and consistent evaluation. - - -## 📋 TODO - -- [x] Support more base model calling(Claude, Gemini, Open-source Models from together.ai, etc.). *(Done)* -- [x] Add information extraction related actions and relative evaluation metrics. *(Done)* -- [x] Enable token consumption calculation. *(Done)* -- [x] Update evaluation methods to decouple from action space. *(Done)* -- [x] Connect with cloud browser environment. *(Done)* -- [ ] Batch evaluation using cloud browser. -- [ ] Develop batch visualizations and analysis of agent performance on live websites. -- [ ] Add captcha solving service. -- [ ] Better modularity to ease integration. -- [ ] Add more brilliant web agent benchmarks as showcase: webarena[^2], GAIA[^3], assistantBench[^4], etc. -- [ ] Evaluation of open-ended web tasks. - +# WebCanvas Online-Mind2Web-eval ## 🔍 Evaluation on Existing WebCanvas Benchmarks @@ -100,231 +29,62 @@ export GOOGLE_CX=your_custom_search_engine_id See [How to set up google search](https://developers.google.com/custom-search/v1/overview?hl=zh-cn) for more details. 
-#### Recommended Environment for Mind2Web-Live
-
-From our experiments, the experimental environment plays a crucial role in agent performance. We recommend experimenting on a Windows server using Chrome or Firefox browser engines, preferably on servers located in the United States. Below is the experiment results on Mind2Web-Live test set.
-
-| Planning Model | IP Region | System | Browser | Completion Rate | Task Success Rate | Efficiency Score |
-|----------------|--------------|---------|---------|-----------------|-------------------|------------------|
-| gpt-3.5-turbo-0125 | United States| Windows | Chrome | 40.2% | 16.5% | 3.03 |
-| gpt-3.5-turbo-0125 | United States| Windows | Firefox | 42.1% | 20.2% | 2.79 |
-| gpt-3.5-turbo-0125 | United States| Linux | Chrome | 36.5% | 15.4% | 3.33 |
-| gpt-3.5-turbo-0125 | United Kingdom| Windows | Chrome | 23.6% | 8.65% | 7.78 |
-| gpt-3.5-turbo-0125 | Singapore | Windows | Chrome | 42.3% | 21.2% | 2.95 |
-
-### Configure cloud environment
-[Browserbase](https://www.browserbase.com/) offers a reliable, high performance serverless developer platform to run, manage, and monitor headless browsers at scale. Leverage our infrastructure to power your web automation and AI agents.
-
-Get your API Key, go over the [Dashboard’s Settings tab](https://www.browserbase.com/settings),
-Then copy your API Key directly from the input and update your `.env` by adding the `BROWSERBASE_API_KEY` entry
-
-Alternatively, you can temporarily set the environment variable for a single bash command by prefixing it with `BROWSERBASE_API_KEY=` in your terminal.
-
-You can find all the recent sessions on the Dashboard’s Overview, along with essential metrics, select your Session to inspect it with the [Session Inspector](https://docs.browserbase.com/features/session-inspector).
-
-### Download Raw Data of a Challenge(includes all open challenges on WebCanvas platform)
-
-Register on the platform [here](https://www.imean.ai/web-canvas).
-
-First, ensure your environment variables are correctly set so that the code can access the necessary credentials and URL.
-```
-export GRAPHQL_USERNAME=your_username
-export GRAPHQL_PASSWORD=your_password
+OpenAI API setting:
+```bash
+export OPENAI_API_KEY=your_api_key
```
-If you registered using your Google account, please setup a password in the profile page on [iMean Builder](https://www.imean.ai/builder/personal/profile).
-To download a file, use the following command:
+Tip: To run in a Linux environment without a display, install Xvfb first. On CentOS/RHEL:
```bash
-python data/dataset_io.py download \
-  --challenge-id your_challenge_id \
-  --save-path /path/to/save/file
+  sudo yum install -y xorg-x11-server-Xvfb
+```
+Ubuntu/Debian users can use the following command to install xvfb:
+```bash
+  sudo apt-get update
+  sudo apt-get install -y xvfb
```
-- `your_challenge_id`: The ID of the challenge for the download. Obtain this ID on the url link of the challenge for now. For example, the ID of [Mind2Web-Live Test](https://www.imean.ai/web-canvas/challenges/WjVIjPfpa-psiltU3oD2W/leaderboard) is "WjVIjPfpa-psiltU3oD2W".
-- `/path/to/save/file`: The path where the downloaded file will be saved.
-
-#### Process the Raw Data
-The raw data contain rich information on step level to inspire future research. However, it's not for our evaluation.
-To process the raw data, run the follow command:
+### 👁 Evaluation process
+See `configs/setting.toml` and `batch_eval.py` for the parameter settings used for evaluation.
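For reference, a minimal sketch of reading the settings mentioned above (this assumes Python 3.11+ for `tomllib`; the keys follow the `configs/setting.toml` updated later in this diff):

```python
# Minimal sketch: load the evaluation settings referenced above.
# Keys follow configs/setting.toml in this repository; adjust the path if needed.
import tomllib  # Python 3.11+; older interpreters can use the third-party "toml" package

with open("configs/setting.toml", "rb") as f:
    cfg = tomllib.load(f)

print(cfg["basic"]["task_mode"])              # "single_task" or "batch_tasks"
print(cfg["basic"]["max_time_step"])          # maximum step length per task
print(cfg["steps"]["interaction_mode"])       # whether human control of task execution is enabled
print(cfg["files"]["batch_tasks_file_path"])  # input task file, e.g. the Online-Mind2Web JSON
```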
-``` -python data/raw_data_processor.py \ - --input-file path/to/input/file \ - --output-file path/to/output/file -``` +Set the log path in "log.py" -### Run the Evaluation - -You can run the repos with the following command: +Start evaluation: ```bash -python evaluate.py \ - --global_reward_mode dom_reward \ - --index -1 \ - --single_task_name "Find Dota 2 game and add all DLC to cart in steam." \ - --planning_text_model gpt-4o-mini \ - --global_reward_text_model gpt-4o-mini +xvfb-run python batch_eval.py ``` -This command runs the script with DOM-based self-reward, processing the default task "Find Dota 2 game and add all DLC to cart in steam" or using the default data index -1. It also uses the gpt-4o-mini model for both observation and global reward processing. The evaluation mode is controlled by the `task_mode` parameter in `configs/setting.toml`, allowing you to choose between batch mode and single mode(without automatic evaluation). Remember to specify your path to the test file in `configs/setting.toml`. - - -### Parameter Descriptions - -This program supports several command-line arguments to customize its behavior: - -- `--global_reward_mode`: Selects the method for getting global rewards. - - Options: `dom_vision_reward`, `dom_reward`, `vision_reward`, `no_global_reward` - - Default: `dom_reward` - - Description: Define how rewards are got based on the interaction mode: - - `dom_vision_reward`: Rewards are calculated using both DOM and vision data. Currently only support GPT4v as vision model. - - `dom_reward`: Rewards are based solely on DOM interactions. You can specify the language model you want to use for reward reasoning by parameter *global_reward_text_model*. - - `vision_reward`: Rewards are derived from vision-based interactions only. Currently only support GPT4v as vision model. - - `no_global_reward`: No global rewards are calculated. - -- `--index`: Decide which data index to start with. - - Type: String - - Default: `-1` - - Description: Use this parameter to specify a range or specific index for data processing. For example, `0,5` will process data from index 0 to 5. +### Evaluate dataset processing +After getting the evaluation data set, use "utils/parser.py" to parse the log log file to get the json parsed file -- `--single_task_name`: Defines the task name of the single task to execute. - - Type: String - - Default: `"Find Dota 2 game and add all DLC to cart in steam."` - -- `--planning_text_model`: Specifies the model used for planning module. - - Type: String - - Default: `gpt-4o-mini` - -- `--global_reward_text_model`: Specifies the model used for global reward reasoning. - - Type: String - - Default: `gpt-4o-mini` - -#### Interaction Mode - -Evaluating web agents in an online environment can sometimes be painful due to issues like network problems or bot tests on certain websites. Adopting an evaluation method that accommodates these issues allows for an accurate assessment of an agent's performance under specific current conditions. Additionally, we provide a more flexible interaction mode, enabling users to manually solve environmental issues and get the optimized performance of their web agents. You can simply set the `interaction_mode` parameter in `configs/setting.toml` to enable this feature. We will accumulate our implementation on error handling in online agent inference, and try to minimize human efforts by triggering only when exceptions occur in the following version. 
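The batch run described above iterates over `data/Online-Mind2Web/Online_Mind2Web.json`; a minimal sketch of inspecting that task file (field names taken from the file added in this diff):

```python
# Minimal sketch: peek at the Online-Mind2Web task file consumed by batch_eval.py.
# Field names follow data/Online-Mind2Web/Online_Mind2Web.json added in this diff.
import json

with open("data/Online-Mind2Web/Online_Mind2Web.json") as f:
    tasks = json.load(f)

print(f"{len(tasks)} tasks loaded")
for task in tasks[:3]:  # first few entries only
    print(task["task_id"], task["level"], task.get("website", "about:blank"))
    print("  ", task["confirmed_task"])
```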
- -### Upload the Result for a Challenge - -IMPORTANT: You should upload the generated out.json file to participate a challenge. To upload your result, use the following command: +Please set the parameters for the json file parsing step in "configs/log_config.json" +And then run the program ```bash -python data/dataset_io.py upload \ - --file-path /path/to/your/file \ - --challenge-id your_challenge_id \ - --name your_agent_name \ - --base-model your_agent_base_model +python utils/parser.py +python utils/dataset_process.py ``` - -Replace the placeholders with your actual values: - -- `/path/to/your/file`: The path to the result you want to upload. -- `your_challenge_id`: The ID of the challenge you want to participate. -- `your_agent_name`: The agent name for the upload. -- `your_agent_base_model`: The agent base model information for the upload. - -You can also submit through our platform. We will conduct an official check on your submission to prevent cheating. - -### Token Consumption Calculation - -We provide a token consumption calculation functionality for evaluating the efficiency of your agent, and it is enabled automatically. -The token consumption is calculated based on the number of tokens consumed by planning module and global reward reasoning module(if applicable) during the evaluation process. -The token consumption calculation results of each experiment will be saved in the `token_results` folder in JSON format. - -We use the `tiktoken` package to calculate the consumption of tokens. For those models whose encodings cannot be obtained, the default encoding "cl100k_base" is used. Therefore, for non-OPENAI models, the calculated tokens may have certain deviations. - -The amount spent on tokens is only available when the model name is provided in the 'token_pricing' under setting.toml; otherwise, only the quantity of tokens will be counted. -If you want to calculate the monetary expenditure of models not listed in 'token_pricing', you should first add the full name of the model (such as "gpt-4o-2024-05-13") to the 'pricing_models' list. Then, add the unit price of input and output for this model below the list, such as "gpt-4o-2024-05-13_input_price = 0.000005" and "gpt-4o-2024-05-13_output_price = 0.000015". - -Few example results on Mind2Web-Live test set: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-| Planning Model | Completion Score | Task Success Rate | USD Efficiency Score |
-|----------------|------------------|-------------------|----------------------|
-| GPT-4o-2024-05-13 | 51.4% | 28.8% | 0.142 |
-| Llama-3.1-405B-Instruct-Turbo | 47.8% | 24.0% | 0.174 |
-| Llama-3.1-70B-Instruct-Turbo | 44.8% | 20.2% | 0.031 |
-| GPT-4o-mini-2024-07-18 | 42.9% | 21.2% | 0.004 |
-| GPT-3.5-turbo-0125 | 42.5% | 17.3% | 0.092 |
- - -## 📊 Create Your Own Benchmark Dataset - -You can follow instructions on this [documentation](https://webcanvas.gitbook.io/webcanvas-docs) about how to create your own challenging benchmark for web agents. - -Currently, you need to set up a challenge(you can keep it private at first) on [WebCanvas Platform](https://www.imean.ai/web-canvas) to download raw data of your dataset. - -If your are thinking of scaling your Web trajectory data for training and evaluation, you can contact us directly for some technical assistance. - -Demo video: - -[![Demo video](https://img.youtube.com/vi/o6J8m8cZe8I/0.jpg)](https://www.youtube.com/watch?v=o6J8m8cZe8I) - -## 🤝 Contributing - -We welcome contributions to WebCanvas! - -Thank you for your interest in improving WebCanvas. Your contributions are greatly appreciated and essential to the growth and success of our project. Please refer to the roadmap and TODOs for promising directions. - - -## 🌐 Community - -We are building a vibrant and inclusive community around WebCanvas! Join our community to stay up-to-date with the latest developments and to contribute to the project: - -- [GitHub Discussions](https://github.com/iMeanAI/WebCanvas/discussions) -- [Discord](https://discord.gg/dhtgvJ52) - -## 📢 Feedback - -We value your feedback and suggestions! -- [Talk to Founder](https://calendly.com/dehan/30min), we welcome any discussion and feedback on the future of live agent evaluation! - -## Citation - -If you use this project in your research, please cite our paper: - +The directory of the processed data set is: +```bash +results/ +- task_id +-- trajectory +--- step_0_20250520-000604.png +--- step_2_20250520-000604.png + ... +-- result.json ``` -@article{pan2024webcanvas, - title={WebCanvas: Benchmarking Web Agents in Online Environments}, - author={Pan, Yichen and Kong, Dehan and Zhou, Sida and Cui, Cheng and Leng, Yifei and Jiang, Bing and Liu, Hangyu and Shang, Yanyi and Zhou, Shuyan and Wu, Tongshuang and others}, - journal={arXiv preprint arXiv:2406.12373}, - year={2024} -} + +### Online-Mind2Web Benchmarking +Run the following command to generate the benchmark file: +```bash +bash OM2W_Benchmarking/eval.sh ``` -#### References -[^1]: Deng, Xiang, et al. "Mind2web: Towards a generalist agent for the web." Advances in Neural Information Processing Systems 36 (2024). -[^2]: Zhou, Shuyan, et al. "Webarena: A realistic web environment for building autonomous agents." arXiv preprint arXiv:2307.13854 (2023). -[^3]: Mialon, Grégoire, et al. "Gaia: a benchmark for general ai assistants." arXiv preprint arXiv:2311.12983 (2023). -[^4]: Yoran, Ori, et al. "AssistantBench: Can Web Agents Solve Realistic and Time-Consuming Tasks?." arXiv preprint arXiv:2407.15711 (2024). -[^5]: Zheng, Boyuan, et al. "Gpt-4v (ision) is a generalist web agent, if grounded." arXiv preprint arXiv:2401.01614 (2024). 
+Display evaluation results: +```bash +python OM2W_Benchmarking/statistic.py +``` \ No newline at end of file diff --git a/agent/.DS_Store b/agent/.DS_Store new file mode 100644 index 0000000..5fa4f47 Binary files /dev/null and b/agent/.DS_Store differ diff --git a/agent/Environment/html_env/actions.py b/agent/Environment/html_env/actions.py index 04beaeb..74be2a7 100644 --- a/agent/Environment/html_env/actions.py +++ b/agent/Environment/html_env/actions.py @@ -15,7 +15,7 @@ class ActionTypes(IntEnum): NONE = 0 CLICK = 1 GOTO = 2 - GOOGLE_SEARCH = 3 + # GOOGLE_SEARCH = 3 # 禁用Google search FILL_FORM = 4 SWITCH_TAB = 5 GO_BACK = 6 @@ -103,16 +103,16 @@ def create_fill_search_action(elementid: int, fill_text: str) -> Action: "element_name": "" } - -@beartype -def create_search_action(elementid: int, text: str) -> Action: - return { - "action_type": ActionTypes.GOOGLE_SEARCH, - "element_id": elementid, - "url": "https://www.google.com", - "fill_text": text, - "element_name": "" - } +# 禁用Google search +# @beartype +# def create_search_action(elementid: int, text: str) -> Action: +# return { +# "action_type": ActionTypes.GOOGLE_SEARCH, +# "element_id": elementid, +# "url": "https://www.google.com", +# "fill_text": text, +# "element_name": "" +# } @beartype @@ -176,8 +176,8 @@ def create_action(elementid: int, action_type: str, action_input: str) -> Action return create_fill_search_action(elementid=elementid, fill_text=action_input) elif action_type == "goto": return create_goto_action(elementid=elementid, url=action_input) - elif action_type == "google_search": - return create_search_action(elementid=elementid, text=action_input) + # elif action_type == "google_search": # 禁用Google search + # return create_search_action(elementid=elementid, text=action_input) elif action_type == "go_back": return create_go_back_action(elementid=elementid) elif action_type == "select_option": @@ -203,7 +203,7 @@ def create_action(elementid: int, action_type: str, action_input: str) -> Action "create_fill_action", "create_none_action", "create_goto_action", - "create_search_action", + # "create_search_action", # 禁用Google search "create_go_back_action", "create_fill_search_action", "create_select_option_action", diff --git a/agent/Environment/html_env/async_env.py b/agent/Environment/html_env/async_env.py index 66a1fe1..ff53f48 100644 --- a/agent/Environment/html_env/async_env.py +++ b/agent/Environment/html_env/async_env.py @@ -392,79 +392,82 @@ async def fill_form(self, action): self.html_content = await self.page.content() except Exception as e: raise e - - async def search(self, action): - """Use Node.js to call Google Custom Search API""" - try: - # Execute Node.js script - process = await asyncio.create_subprocess_exec( - 'node', - self.search_script_path, - action["fill_text"], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE - ) - # Get output - stdout, stderr = await process.communicate() + # disable Google search + # async def search(self, action): + # """Use Node.js to call Google Custom Search API""" + # try: + # # Execute Node.js script + # process = await asyncio.create_subprocess_exec( + # 'node', + # self.search_script_path, + # action["fill_text"], + # stdout=subprocess.PIPE, + # stderr=subprocess.PIPE + # ) - if process.returncode == 0: - try: - # Parse the JSON response - data = json.loads(stdout.decode().strip()) + # # Get output + # stdout, stderr = await process.communicate() + + # if process.returncode == 0: + # try: + # # Parse the JSON response + # data = 
json.loads(stdout.decode().strip())

-                    if 'items' in data:
-                        # Create HTML from search results
-                        results_html = self._create_search_results_page(data['items'])
-                        self.html_content = results_html
-                    else:
-                        self.html_content = "No results found."
-                except json.JSONDecodeError as e:
-                    logger.error(f"Failed to parse JSON response: {e}")
-                    self.html_content = "Error parsing search results."
-            else:
-                error_msg = stderr.decode().strip()
-                logger.error(f"Search script error: {error_msg}")
-                self.html_content = f"Search error: {error_msg}"
-
-            # Update the page content
-            await self.page.set_content(self.html_content)
-
-        except Exception as e:
-            logger.error(f"Search error: {str(e)}")
-            self.html_content = f"Search error: {str(e)}"
-            await self.page.set_content(self.html_content)
-
-    def _create_search_results_page(self, items):
-        """Create an HTML page from search results"""
-        results = []
-        for item in items:
-            result = f"""
-                {item.get('title', 'No title')}
-                {item.get('link', '')}
-                {item.get('snippet', 'No description available')}
-            """
-            results.append(result)
-
-        html = f"""
-            {''.join(results)}
-        """
-        return html
+        #             if 'items' in data:
+        #                 # Create HTML from search results
+        #                 results_html = self._create_search_results_page(data['items'])
+        #                 self.html_content = results_html
+        #             else:
+        #                 self.html_content = "No results found."
+        #         except json.JSONDecodeError as e:
+        #             logger.error(f"Failed to parse JSON response: {e}")
+        #             self.html_content = "Error parsing search results."
+        #     else:
+        #         error_msg = stderr.decode().strip()
+        #         logger.error(f"Search script error: {error_msg}")
+        #         self.html_content = f"Search error: {error_msg}"
+
+        #     # Update the page content
+        #     await self.page.set_content(self.html_content)
+
+        # except Exception as e:
+        #     logger.error(f"Search error: {str(e)}")
+        #     self.html_content = f"Search error: {str(e)}"
+        #     await self.page.set_content(self.html_content)
+
+
+    # disable Google search
+    # def _create_search_results_page(self, items):
+    #     """Create an HTML page from search results"""
+    #     results = []
+    #     for item in items:
+    #         result = f"""
+    #             {item.get('title', 'No title')}
+    #             {item.get('link', '')}
+    #             {item.get('snippet', 'No description available')}
+    #         """
+    #         results.append(result)
+
+    #     html = f"""
+    #         {''.join(results)}
+ # + # + # """ + # return html async def go_back_last_page(self, action): # self.page = self.last_page @@ -633,13 +636,14 @@ async def execute_action(self, action: Action) -> Union[str, Tuple[str, str]]: error_message = f"Failed to execute fill_form [{action['element_id']},{action['fill_text']}] action. An error({e}) occur." raise ActionExecutionError( action['action_type'], error_message) from e - case ActionTypes.GOOGLE_SEARCH: - try: - await self.search(action) - except Exception as e: - error_message = f"Failed to execute google_search[{action['fill_text']}] action. An error({e}) occur." - raise ActionExecutionError( - action['action_type'], error_message) from e + # 禁用Google search + # case ActionTypes.GOOGLE_SEARCH: + # try: + # await self.search(action) + # except Exception as e: + # error_message = f"Failed to execute google_search[{action['fill_text']}] action. An error({e}) occur." + # raise ActionExecutionError( + # action['action_type'], error_message) from e case ActionTypes.GO_BACK: try: await self.go_back_last_page(action) diff --git a/agent/LLM/llm_instance.py b/agent/LLM/llm_instance.py index 3865f2b..6f8ef55 100644 --- a/agent/LLM/llm_instance.py +++ b/agent/LLM/llm_instance.py @@ -5,7 +5,8 @@ def create_llm_instance(model, json_mode=False, all_json_models=None): - if "gpt" in model or "o1" in model: + # if "gpt" in model or "o1" in model: + if any(keyword in model for keyword in ["gpt", "o1", "o3-mini", "o4-mini"]): if json_mode: if model in all_json_models: return GPTGeneratorWithJSON(model) diff --git a/agent/LLM/openai.py b/agent/LLM/openai.py index a84febb..40641b7 100644 --- a/agent/LLM/openai.py +++ b/agent/LLM/openai.py @@ -9,13 +9,18 @@ from agent.Utils import * from .token_utils import truncate_messages_based_on_estimated_tokens +# Adopt the new field schema (max_completion_tokens) +NEW_TOKEN_MODELS = ("o3", "o4") + +def use_new_token_param(model_name: str) -> bool: + return any(model_name.startswith(p) for p in NEW_TOKEN_MODELS) class GPTGenerator: def __init__(self, model=None): self.model = model self.client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY")) - async def request(self, messages: list = None, max_tokens: int = 500, temperature: float = 0.7) -> (str, str): + async def request(self, messages: list = None, max_tokens: int = 4096, temperature: float = 0.7) -> (str, str): try: if "gpt-3.5" in self.model: messages = truncate_messages_based_on_estimated_tokens(messages, max_tokens=16385) @@ -42,17 +47,41 @@ async def request(self, messages: list = None, max_tokens: int = 500, temperatur logger.error(f"Error in GPTGenerator.request: {e}") return "", str(e) - async def chat(self, messages, max_tokens=500, temperature=0.7): + async def chat(self, messages, max_tokens=4096, temperature=0.7): loop = asyncio.get_event_loop() + + # Dynamically select field names + token_key = "max_completion_tokens" if use_new_token_param(self.model) \ + else "max_tokens" if "o1" in self.model: data = { 'model': self.model, 'messages': messages, } + elif "o3" in self.model or "o4" in self.model: + data = { + 'model': self.model, + token_key: max_tokens, + 'messages': messages, + } + elif "gpt-4.1" in self.model: + data = { + 'model': self.model, + token_key: 4096, # gpt-4.1 max_tokens = 32768 + 'messages': messages, + } + elif "gpt-4o" in self.model: + data = { + 'model': self.model, + 'max_tokens': 16384, + 'temperature': temperature, + 'messages': messages, + } else: data = { 'model': self.model, - 'max_tokens': max_tokens, + 'max_tokens': 16384, + token_key: max_tokens, 
'temperature': temperature, 'messages': messages, } @@ -79,11 +108,11 @@ def prepare_messages_for_json_mode(messages): messages.insert(0, {"role": "system", "content": "You are a helpful assistant designed to output json."}) return messages - async def request(self, messages: list = None, max_tokens: int = 500, temperature: float = 0.7) -> (str, str): + async def request(self, messages: list = None, max_tokens: int = 100000, temperature: float = 0.7) -> (str, str): messages = self.prepare_messages_for_json_mode(messages) # Prepare messages for JSON mode return await super().request(messages, max_tokens, temperature) class GPTGeneratorWithJSON(JSONModeMixin): def __init__(self, model=None): - super().__init__(model=model if model is not None else "gpt-4-turbo") + super().__init__(model=model if model is not None else "gpt-4-turbo") \ No newline at end of file diff --git a/agent/Plan/planning.py b/agent/Plan/planning.py index 49bd5e0..e9d606a 100644 --- a/agent/Plan/planning.py +++ b/agent/Plan/planning.py @@ -77,7 +77,7 @@ async def execute(self, status_description, user_request, previous_trace, observ vision_act_response) actions = { 'goto': "Found 'goto' in the vision_act_response.", - 'google_search': "Found 'google_search' in the vision_act_response.", + # 'google_search': "Found 'google_search' in the vision_act_response.", 'switch_tab': "Found 'switch_tab' in the vision_act_response.", 'scroll_down': "Found 'scroll_down' in the vision_act_response.", 'scroll_up': "Found 'scroll_up' in the vision_act_response.", @@ -94,7 +94,8 @@ async def execute(self, status_description, user_request, previous_trace, observ break if not actions_found: - print("None of 'goto', 'google_search', 'switch_tab', 'scroll_down', 'scroll_up', or 'go_back' were found in the vision_act_response.") + # print("None of 'goto', 'google_search', 'switch_tab', 'scroll_down', 'scroll_up', or 'go_back' were found in the vision_act_response.") + print("None of 'goto', 'switch_tab', 'scroll_down', 'scroll_up', or 'go_back' were found in the vision_act_response.") target_element = planning_response_get.get('target_element') description = planning_response_get.get('description') diff --git a/agent/Prompt/base_prompts.py b/agent/Prompt/base_prompts.py index 0731c0c..382dc6b 100644 --- a/agent/Prompt/base_prompts.py +++ b/agent/Prompt/base_prompts.py @@ -42,7 +42,6 @@ class BasePrompts: **Execution Action Space**: - goto: useful for when you need visit a new link or a website, it will open a new tab. - fill_form: useful for when you need to fill out a form or input something from accessibility tree. Input should be a string. - - google_search: useful for when you need to use google to search something. - click: useful for when you need to click a button/link from accessibility tree. - select_option: useful for when you need to select a drop-down box value. When you get (select and option) tags from the accessibility tree, you need to select the serial number(element_id) corresponding to the select tag, not the option, and select the most likely content corresponding to the option as Input. - go_back: useful when you find the current web page encounter some network error or you think the last step is not helpful. @@ -57,7 +56,9 @@ class BasePrompts: You have to follow the instructions or notes: **Important Notes**: - - Under the following conditions, you are restricted to using the `google_search` or `goto` tools exclusively: + - The first step must be a goto, especially when the page is blank (about: blank). 
+ - It is not allowed to perform operations other than goto on blank pages, such as click and other interactive operations. + - Under the following conditions, you are restricted to using the `goto` tool exclusively: 1. In the initial step of a process or when there's no preceding interaction history (i.e., the previous trace is empty). 2. In situations where the accessibility tree is absent or not provided. - Your action should not be the same as last step's action. diff --git a/agent/Utils/utils.py b/agent/Utils/utils.py index e8da1d7..a5f85d7 100644 --- a/agent/Utils/utils.py +++ b/agent/Utils/utils.py @@ -51,17 +51,23 @@ def read_json_file(file_path): def save_screenshot(mode: str, record_time: str, task_name: str, step_number: int, description: str, - screenshot_base64: str, task_name_id: str = None): - + screenshot_base64: str, task_name_id: str = None, task_uuid: str = None):# add task_uuid + # Prior use task_uuid,else task_name_id + identifier = task_uuid if task_uuid is not None else task_name_id + timestamp = datetime.now().strftime('%Y%m%d-%H%M%S') invalid_chars = '<>:"/\\|?*' for char in invalid_chars: task_name = task_name.replace(char, '_') - - if task_name_id is None: + # if task_name_id is None: + # task_folder = f'results/screenshots/screenshots_{mode}_{record_time}/{task_name}' + # else: + # task_folder = f'results/screenshots/screenshots_{mode}_{record_time}/{task_name_id}_{task_name}' + + if identifier is None: task_folder = f'results/screenshots/screenshots_{mode}_{record_time}/{task_name}' else: - task_folder = f'results/screenshots/screenshots_{mode}_{record_time}/{task_name_id}_{task_name}' + task_folder = f'results/screenshots/screenshots_{mode}_{record_time}/{identifier}_{task_name}' if not os.path.exists(task_folder): os.makedirs(task_folder) diff --git a/batch_eval.py b/batch_eval.py new file mode 100644 index 0000000..5dde87c --- /dev/null +++ b/batch_eval.py @@ -0,0 +1,136 @@ +""" +This is a batch test script. +This release adds the following features: +1. Support screenshots of the evaluation process +2. Support Online_Mind2Web task evaluation +3. 
Support access to gpt-4.1, o3-mini, o4-mini and other models + +Tips: To run in a Linux environment without a visual interface, use the following command to start: + sudo yum install -y xorg-x11-server-Xvfb + xvfb-run python batch_eval.py + + Ubantu/Debian users can use the following command to install xvfb: + sudo apt-get update + sudo apt-get install -y xvfb + xvfb-run python batch_eval.py +""" +#!/usr/bin/env python3 +import json +import os +import subprocess +import argparse +import time +from pathlib import Path + +def load_tasks(json_path): + with open(json_path, 'r') as f: + data = json.load(f) + return data + +def run_single_task(task_data, task_index, args): + task_name = task_data["confirmed_task"] + website = task_data.get("website", "about:blank") + + command = [ + "python", "eval.py", + "--global_reward_mode", args.global_reward_mode, + "--index", str(task_index), + "--single_task_name", task_name, + "--single_task_website", website, + "--snapshot", args.snapshot, + "--planning_text_model", args.planning_text_model, + "--global_reward_text_model", args.global_reward_text_model + ] + + print(f"\n{'='*80}") + print(f"Task [{task_index}]: {task_name}") + print(f"Website: {website}") + print(f"{'='*80}") + + try: + subprocess.run(command, check=True) + print(f"Mission accomplished: {task_name}") + return True + except subprocess.CalledProcessError as e: + print(f"Task failure: {task_name}") + print(f"Error: {e}") + return False + +def main(): + parser = argparse.ArgumentParser(description='Online-Mind2Web Task') + parser.add_argument('--json_path', type=str, default='data/Online-Mind2Web/Online_Mind2Web.json', + help='JSON task file path') + parser.add_argument('--global_reward_mode', type=str, default='dom_reward', + help='Global Reward Mode: dom_reward/no_global_reward/dom_vision_reward') + parser.add_argument('--index', type=int, default=-1, + help='Task index') + parser.add_argument('--snapshot', type=str, default='results/test2', + help='Snapshot directory') + parser.add_argument('--planning_text_model', type=str, default='gpt-4.1', + help='planning_text_model: gpt-4.1/gpt-4o-2024-08-06') + parser.add_argument('--global_reward_text_model', type=str, default='gpt-4.1', + help='global_reward_text_model: gpt-4.1/gpt-4o-2024-08-06') + parser.add_argument('--start_idx', type=int, default=0, + help='The index to start the task') + parser.add_argument('--end_idx', type=int, default=None, + help='The index of the finished task (excluding)') + parser.add_argument('--delay', type=int, default=5, + help='Latency between tasks (seconds)') + parser.add_argument('--output_log', type=str, default='results/test2/batch_run_log.txt', + help='output_log') + + args = parser.parse_args() + + # Loading tasks + json_path = Path(args.json_path) + if not json_path.exists(): + print(f"Error: File does not exist - {json_path}") + return + + tasks = load_tasks(json_path) + start_idx = args.start_idx + end_idx = args.end_idx if args.end_idx is not None else len(tasks) + + total_tasks = end_idx - start_idx + successful_tasks = 0 + + with open(args.output_log, 'w') as log_file: + log_file.write(f"The batch job run starts: {time.strftime('%Y-%m-%d %H:%M:%S')}\n") + log_file.write(f"total_tasks: {total_tasks}\n\n") + + # Run the selected task + for i, task_data in enumerate(tasks[start_idx:end_idx]): + current_idx = start_idx + i + task_name = task_data["confirmed_task"] + website = task_data.get("website", "about:blank") + + with open(args.output_log, 'a') as log_file: + 
log_file.write(f"[{current_idx}/{len(tasks)}] Running task: {task_name}\n") + log_file.write(f"Website: {website}\n") + + success = run_single_task(task_data, current_idx, args) + if success: + successful_tasks += 1 + + # Logging results + with open(args.output_log, 'a') as log_file: + log_file.write(f"Result: {'Success' if success else 'Failure'}\n\n") + if i < total_tasks - 1: + print(f"waiting {args.delay} continue to the next task after seconds...") + time.sleep(args.delay) + + with open(args.output_log, 'a') as log_file: + log_file.write(f"\nFinish: {time.strftime('%Y-%m-%d %H:%M:%S')}\n") + log_file.write(f"Total_tasks: {total_tasks}\n") + log_file.write(f"Number of successful tasks: {successful_tasks}\n") + log_file.write(f"Success rate: {successful_tasks/total_tasks*100:.2f}%\n") + + print(f"\n{'='*80}") + print(f"Total_tasks: {total_tasks}") + print(f"Number of successful tasks: {successful_tasks}") + print(f"Success rate: {successful_tasks/total_tasks*100:.2f}%") + print(f"save: {args.output_log}") + +if __name__ == "__main__": + main() + \ No newline at end of file diff --git a/configs/log_config.json b/configs/log_config.json new file mode 100644 index 0000000..eb94e89 --- /dev/null +++ b/configs/log_config.json @@ -0,0 +1,5 @@ +{ + "log_directory": "./results/41_dom/logs", + "output_directory": "./results/41_dom/json", + "task_mapping_file": "./data/Online-Mind2Web/Online_Mind2Web.json" +} \ No newline at end of file diff --git a/configs/setting.toml b/configs/setting.toml index 153adad..2467c87 100644 --- a/configs/setting.toml +++ b/configs/setting.toml @@ -1,6 +1,8 @@ [basic] -task_mode = "batch_tasks" # single_task or batch_tasks +task_mode = "single_task" # single_task or batch_tasks max_time_step = 25 # For all tasks, set the maximum step length +save_screenshots = true # screenshots +screenshot_path = "./screenshots" [model] json_model_response = false # Whether to require a model to strictly output json format, currently only support OPENAI models. 
@@ -10,18 +12,21 @@ json_models = ["gpt-4-turbo", "gpt-4-1106-preview", "gpt-3.5-turbo", "gpt-3.5-turbo-0125", - "gpt-4o-2024-05-13", - "gpt-4o-mini-2024-07-18"] + "gpt-4o-2024-08-06", + "o4-mini", + "gpt-4.1-2025-04-14", + "o3-mini-2025-01-31" + ] [steps] -interaction_mode = true # Whether human control of task execution status is required -single_task_action_step = 10 +interaction_mode = false # Whether human control of task execution status is required +single_task_action_step = 25 batch_tasks_max_action_step = 10 batch_tasks_condition_step_increase = 5 [files] -batch_tasks_file_path = "./data/example/mind2web-live_test_20241024.json" # The input data path +batch_tasks_file_path = "./data/Online-Mind2Web/Online_Mind2Web.json" # The input data path ground_truth_file_path = "./data/human_labeled_reward_reference/GT_instructions_202404161811_for_all_data_0328.json" # the ground_truth data path out_file_path = "./batch_tasks_results/example" # YOUR OUT FILE PATH @@ -31,7 +36,7 @@ URL = ["error"] [token_pricing] pricing_models = [ "gpt-4o", - "gpt-4o-2024-05-13", + "gpt-4o-2024-08-06", "gpt-4o-mini", "gpt-4o-mini-2024-07-18", "gpt-4-turbo", @@ -42,7 +47,9 @@ pricing_models = [ "gpt-4-1106-preview", "gpt-4-vision-preview", "gpt-3.5-turbo-0125", - "gpt-3.5-turbo-1106"] + "gpt-3.5-turbo-1106", + "o4-mini" + ] # The price of each model for input and output, the unit is $/token # The name of input token price: model_name + "_input_price", such as gpt-4o_input_price @@ -73,3 +80,4 @@ gpt-3.5-turbo-0125_input_price = 0.0000005 gpt-3.5-turbo-0125_output_price = 0.0000015 gpt-3.5-turbo-1106_input_price = 0.000001 gpt-3.5-turbo-1106_output_price = 0.000002 +o4-mini = 0.000002 \ No newline at end of file diff --git a/data/Online-Mind2Web/Online_Mind2Web.json b/data/Online-Mind2Web/Online_Mind2Web.json new file mode 100755 index 0000000..cfd2bc4 --- /dev/null +++ b/data/Online-Mind2Web/Online_Mind2Web.json @@ -0,0 +1,2102 @@ +[ + { + "task_id": "b7258ee05d75e6c50673a59914db412e", + "confirmed_task": "Find the store location and hours of the closest Gamestop to zip code 90028 and set it as the home store on Gamestop.", + "website": "https://www.gamestop.com/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "ade4c09ad3fdb1607209750924cd232f", + "confirmed_task": "Compare available plans for the AeroAPI on Flightaware.", + "website": "https://www.flightaware.com/", + "reference_length": 4, + "level": "easy" + }, + { + "task_id": "fb7b4f784cfde003e2548fdf4e8d6b4f", + "confirmed_task": "Open the page with an overview of the submission of releases on Discogs.", + "website": "https://www.discogs.com/", + "reference_length": 6, + "level": "medium" + }, + { + "task_id": "824eb7bb0ef1ce40bfd49c12182d9428", + "confirmed_task": "Get the lowest priced women's plus size one piece swimsuit in color black with a customer rating of at least 5 on Kohls.", + "website": "https://www.kohls.com/", + "reference_length": 13, + "level": "hard" + }, + { + "task_id": "046138801a05ddf56ad94e8672942496", + "confirmed_task": "Find discussions of the community and open one with the most replies on Flightaware.", + "website": "https://www.flightaware.com/", + "reference_length": 6, + "level": "medium" + }, + { + "task_id": "92a3d4236f167af4afdc08876a902ba6", + "confirmed_task": "Find a 2022 Tesla Model 3 on CarMax.", + "website": "https://www.carmax.com/", + "reference_length": 10, + "level": "medium" + }, + { + "task_id": "48c73f3f53e2611c4a1052457c1033db", + "confirmed_task": "Get the report from the final 
environmental impact statement for the Jamaica Bus Depot expansion on new.mta.info.", + "website": "https://new.mta.info/", + "reference_length": 10, + "level": "medium" + }, + { + "task_id": "8f2611047de227a2ca8bda13f6e2e5fb", + "confirmed_task": "Find the used 2012-2013 Honda Crosstour with the lowest mileage for under $25,000 near zip code 49102 on CarGurus.", + "website": "https://www.cargurus.com/", + "reference_length": 17, + "level": "hard" + }, + { + "task_id": "b320c68bffc1f3c7f2a8dc9d5478fb27", + "confirmed_task": "Find a walkthrough for the game \"The Legend of Zelda: Breath of the Wild\" on ign.", + "website": "https://www.ign.com/", + "reference_length": 6, + "level": "medium" + }, + { + "task_id": "aa4b5cb7114fcc138ade82b4b9716d24", + "confirmed_task": "Find an editor's choice review with a score of 10 in the boardgame category on ign.", + "website": "https://www.ign.com/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "005be9dd91c95669d6ddde9ae667125c", + "confirmed_task": "Find the weight of baggage allowance for economy class on Qatar Airways.", + "website": "https://www.qatarairways.com/", + "reference_length": 4, + "level": "easy" + }, + { + "task_id": "323bd85e3559655d89e5496b951a25e8", + "confirmed_task": "Tell me information about what identification I need to bring on my trip on Amtrak.", + "website": "https://www.amtrak.com/", + "reference_length": 6, + "level": "medium" + }, + { + "task_id": "123e8c2fc453f55fadd1d0b9aaf94df4", + "confirmed_task": "Browse used Audi cars made before 2015 and sort by lowest price on KBB.", + "website": "https://www.kbb.com/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "56f8890a837c49f7df766b9c981646f3", + "confirmed_task": "Show crazy credits for the movie \" Prometheus\" on IMDb.", + "website": "https://www.imdb.com/", + "reference_length": 6, + "level": "medium" + }, + { + "task_id": "644a856c3897665e475e0dce50bf217d", + "confirmed_task": "Find a pair of wireless headphones on Amazon with active noise canceling for $100 or less and add them to the cart.", + "website": "https://www.amazon.com/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "62f1626ce249c31098854f8b38bdd6cf", + "confirmed_task": "Find Playstation 5 digital edition on gamestop.", + "website": "https://www.gamestop.com/", + "reference_length": 6, + "level": "medium" + }, + { + "task_id": "561693d6eec7bbfba3fefe9e4b26decb", + "confirmed_task": "Browse Marriott Bonvoy credit cards on Marriott.", + "website": "https://www.marriott.com/", + "reference_length": 4, + "level": "easy" + }, + { + "task_id": "b7a9a6b5d451164c09bbd27b670bc2ae", + "confirmed_task": "Show me the list of Men's Blazers, Black, Size M on Uniqlo.", + "website": "https://www.uniqlo.com/", + "reference_length": 11, + "level": "hard" + }, + { + "task_id": "bfa2de159be6978acf2702be31a2eeeb", + "confirmed_task": "Show me the options for a roundtrip leaving from Las Vegas on flexible dates on the interactive map on united.", + "website": "https://www.united.com/", + "reference_length": 12, + "level": "hard" + }, + { + "task_id": "4091bdd3fa64a5b0d912bc08eaf9c824", + "confirmed_task": "Find the list of neighborhood maps for Brooklyn on new.mta.info.", + "website": "https://new.mta.info/", + "reference_length": 5, + "level": "easy" + }, + { + "task_id": "79f0bd7df6e685f30f20025cc6755c0a", + "confirmed_task": "Find me the cheapest external Hard Drive for an Xbox One on GameStop.", + "website": "https://www.gamestop.com/", + "reference_length": 
13, + "level": "hard" + }, + { + "task_id": "6ebde509dca8f15c0fa1bd74f071e8d6", + "confirmed_task": "Search for a job in Miami, Florida, in Human Resources on target.", + "website": "https://www.target.com/", + "reference_length": 14, + "level": "hard" + }, + { + "task_id": "34ccd15a8ea8fd3895af83f5ccf62369", + "confirmed_task": "Find out what to do when I lose an item on a bus on us.megabus.", + "website": "https://us.megabus.com/", + "reference_length": 3, + "level": "easy" + }, + { + "task_id": "c698ff3fc0f6cbce39947c597ab5749b", + "confirmed_task": "Browse the page with event planning tips on Eventbrite.", + "website": "https://www.eventbrite.com/", + "reference_length": 5, + "level": "easy" + }, + { + "task_id": "b6d10e9bd19b4009a02dea0e98f4e1ae", + "confirmed_task": "Check the current standings for MLS on Fox Sports.", + "website": "https://www.foxsports.com/", + "reference_length": 4, + "level": "easy" + }, + { + "task_id": "d71be72aa25c3eab8eea47a0e60382e2", + "confirmed_task": "Find technical specs for the latest Macbook Air on Apple.", + "website": "https://www.apple.com/", + "reference_length": 4, + "level": "easy" + }, + { + "task_id": "0b51b4fa0295ae80ccd176ebdad6fff6", + "confirmed_task": "Search for a red Toyota Corolla from model years 2018 to 2023 on CarMax.", + "website": "https://www.carmax.com/", + "reference_length": 13, + "level": "hard" + }, + { + "task_id": "3f312ae3efc3c3e90ababe050dd4e7ae", + "confirmed_task": "Find the current NFL standings for the AFC East division on NFL.com and go to the page on which team is in first place.", + "website": "https://www.nfl.com/", + "reference_length": 4, + "level": "easy" + }, + { + "task_id": "95cad96f2e43f3c0d8efad1331c77c8c", + "confirmed_task": "View the list of the Most Popular TV on rotten tomatoes.", + "website": "https://www.rottentomatoes.com/", + "reference_length": 3, + "level": "easy" + }, + { + "task_id": "bf3b311cc8dce16d3de844f4b5875dfd", + "confirmed_task": "Compare Apple watches and learn more about the ultra version on apple.", + "website": "https://www.apple.com/", + "reference_length": 4, + "level": "easy" + }, + { + "task_id": "b64f938af842f6a1b4489d0e49a785a7", + "confirmed_task": "Get the frozen vegan cheese pizza between 5 to 10 USD on Target.", + "website": "https://www.target.com/", + "reference_length": 17, + "level": "hard" + }, + { + "task_id": "5e1b8254c123c80178cc28e0afdb14f0", + "confirmed_task": "Find a help page about buying tickets on seatgeek.", + "website": "https://seatgeek.com/", + "reference_length": 4, + "level": "easy" + }, + { + "task_id": "f27c0a7b8b0bb33d37698dff227fc8d7", + "confirmed_task": "Browse used Mercedes-Benz cars from model years 2004 to 2012 on KBB and sort by highest price.", + "website": "https://www.kbb.com/", + "reference_length": 11, + "level": "hard" + }, + { + "task_id": "8fdec8eeffd3491e6526cc78c028120b", + "confirmed_task": "See Nissan and Honda cars for sale near Kentwood, MI 49512 on CarMax.", + "website": "https://www.carmax.com/", + "reference_length": 12, + "level": "hard" + }, + { + "task_id": "7b182a5087347d494b48a29dbc0f1d3e", + "confirmed_task": "Find a shelter or rescue group near zip code 90011.", + "website": "https://www.adoptapet.com/", + "reference_length": 4, + "level": "easy" + }, + { + "task_id": "828c2d98616a9478d5864d847d5a1b28", + "confirmed_task": "Browse the list of Civil Division forms.", + "website": "https://www.justice.gov/", + "reference_length": 4, + "level": "easy" + }, + { + "task_id": "608c595eec271fa5dc03506923519994", + 
"confirmed_task": "Calculate a FedEx Ground shipping rate for a 3-pound package from zip code 10019 to zip code 90028.", + "website": "https://www.fedex.com/en-us/home.html", + "reference_length": 9, + "level": "medium" + }, + { + "task_id": "a7a73c8fa75441fc76df9746c327bdd6", + "confirmed_task": "Estimate the cost of a photographer in 07055 for a 4-hour project.", + "website": "https://www.thumbtack.com/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "cfafe3771369d1d261e9f7ecd44c296d", + "confirmed_task": "Find the highest-rated dealer for Cadillac with a rating above 4 stars within 20 miles of zip 60606.", + "website": "https://www.cars.com/", + "reference_length": 6, + "level": "medium" + }, + { + "task_id": "bbbc243b4f18a7a897f0bc84e11d293f", + "confirmed_task": "Find out how many assists Chris Paul has been averaging in the current season.", + "website": "https://www.nba.com/", + "reference_length": 4, + "level": "easy" + }, + { + "task_id": "816851ff92ff0219acf4364dcc2c4692", + "confirmed_task": "Search for boys' infant pajamas below $40.", + "website": "https://www.macys.com/", + "reference_length": 10, + "level": "medium" + }, + { + "task_id": "8244409b2c82043f966cad05f9afe132", + "confirmed_task": "Find the best Audiologist within 50 miles of New York, NY, with a rating of 4 and above.", + "website": "https://doctor.webmd.com/", + "reference_length": 13, + "level": "hard" + }, + { + "task_id": "e7301bb694871429bf2eb36c3a72186c", + "confirmed_task": "Find baby shoes priced under $20 with a 5-star rating.", + "website": "https://www.macys.com/", + "reference_length": 6, + "level": "medium" + }, + { + "task_id": "905cb53061c33aa2d77e485fe1fca516", + "confirmed_task": "Browse dermatologists within 10 miles of zip code 10019 and filter by only those who accept Blue Medicare Advantage.", + "website": "https://www.healthgrades.com/", + "reference_length": 11, + "level": "hard" + }, + { + "task_id": "fcf4952d2a1d80ea505c555c3c3b54e7", + "confirmed_task": "Find the cheapest used 8-cylinder bmw made between 2005-2015 and priced from 25,000 to 50,000 dollars with mileage less than 50,000 miles or less.", + "website": "https://www.cars.com/", + "reference_length": 11, + "level": "hard" + }, + { + "task_id": "3c1ffc3f494e423b3c434c79e35da8f3", + "confirmed_task": "Find 12 Monkeys community and view the latest posts mentioning James Cole.", + "website": "https://www.reddit.com/", + "reference_length": 6, + "level": "medium" + }, + { + "task_id": "26a0e5c21c145dd8448aa92f35bec5ea", + "confirmed_task": "Browse optometrists who offer telehealth services in Columbus, OH.", + "website": "https://www.healthgrades.com/", + "reference_length": 3, + "level": "easy" + }, + { + "task_id": "070c907d34a4ce71dfdbea38f9c5d4d8", + "confirmed_task": "Find a dentist who specializes in pediatric dentistry and is located near zip code 90210 (within 5-mile distance).", + "website": "https://www.healthgrades.com/", + "reference_length": 7, + "level": "medium" + }, + { + "task_id": "43a1ca251f11c6b0bdd0379766cc49e6", + "confirmed_task": "Find a neurosurgeon who is over 50 years old and has an appointment available tomorrow.", + "website": "https://www.healthgrades.com/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "b3f8bd9198d9d157e0848109563c4b23", + "confirmed_task": "Find a permanent job in Logistics within 20 miles of New York, zip 11005, in the middle-income range for a high school diploma holder.", + "website": "https://ohiomeansjobs.ohio.gov/", + "reference_length": 
15, + "level": "hard" + }, + { + "task_id": "20a460a8fe1971b84411c5b1e6ac4186", + "confirmed_task": "Show theatre events for Las Vegas and select one.", + "website": "https://www.stubhub.com/", + "reference_length": 3, + "level": "easy" + }, + { + "task_id": "db1ffb5e60578597d1c3aa3c389ac7b1", + "confirmed_task": "Search for smart TVs with a screen size of 55 to 65 inches and filter the results to show only those that have an LED display.", + "website": "https://www.google.com/shopping?udm=28", + "reference_length": 5, + "level": "easy" + }, + { + "task_id": "7be8cd8dba885cddd9af5320f49bc41b", + "confirmed_task": "Find roofing contractors within 5 miles of zip code 10002.", + "website": "https://www.bbb.org/", + "reference_length": 9, + "level": "medium" + }, + { + "task_id": "239a29bde438fe44fe17fe1390ef1634", + "confirmed_task": "Find me a gluten-free diet to lose weight for a pregnant woman.", + "website": "https://www.healthline.com/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "9f1cba613830ca1c6a58f9498c06e679", + "confirmed_task": "Find a premier real estate agent in St Augustine, FL.", + "website": "https://www.redfin.com/", + "reference_length": 4, + "level": "easy" + }, + { + "task_id": "75146b7b67388b9244e0f21a1527c022", + "confirmed_task": "Find a male senior boxer near zip code 90028.", + "website": "https://www.adoptapet.com/", + "reference_length": 10, + "level": "medium" + }, + { + "task_id": "59b7b990b4828bc305ab0d7ed6071b55", + "confirmed_task": "Get owner-financing homesite land for sale in New Mexico, Luna County, listed in the last 30 days, and contact the cheapest per acre land seller.", + "website": "https://www.landwatch.com/", + "reference_length": 9, + "level": "medium" + }, + { + "task_id": "9c97bab9c2abfb90a426cbe9addae8d0", + "confirmed_task": "Check the details of order 12345 with email 12345@gmail.com.", + "website": "https://www.macys.com/", + "reference_length": 4, + "level": "easy" + }, + { + "task_id": "871e7771cecb989972f138ecc373107b", + "confirmed_task": "Find the weather for Vancouver, British Columbia for the next seven days.", + "website": "https://www.theweathernetwork.com/", + "reference_length": 3, + "level": "easy" + }, + { + "task_id": "b69eb4de621e9e265676daac44938f3f", + "confirmed_task": "Find an adult husky near zip code 10019.", + "website": "https://www.adoptapet.com/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "9bb63ad0e38d5691a618932a8b31c05a", + "confirmed_task": "Look for reviews of a Nest Hello Video Doorbell and filter by 1-star ratings.", + "website": "https://www.google.com/shopping?udm=28", + "reference_length": 5, + "level": "easy" + }, + { + "task_id": "8ae510355d978424f490798f900bfa2c", + "confirmed_task": "Show me the shared rooms in any university in Melbourne that has a private bathroom wifi, and gas included in the bills.", + "website": "https://www.student.com/", + "reference_length": 9, + "level": "medium" + }, + { + "task_id": "4c186c6ed888d0c8d4cf4adb39443080", + "confirmed_task": "Find a medium Devin Booker jersey and add it to the shopping cart.", + "website": "https://www.nba.com/", + "reference_length": 6, + "level": "medium" + }, + { + "task_id": "2fc51dd3febd447f0fdcdabca8d944ce", + "confirmed_task": "Locate a self-storage unit near zip code 60538 that can fit about a dorm room full of items and is climate-controlled.", + "website": "https://www.extraspace.com/", + "reference_length": 9, + "level": "medium" + }, + { + "task_id": "eb323dc584156d0eb3a2b90bb8c4b791", + 
"confirmed_task": "Find the latest 2 bed and 1.5+ bath apartment listing for rent in New York.", + "website": "https://www.redfin.com/", + "reference_length": 12, + "level": "hard" + }, + { + "task_id": "87f4c5128e36cdb9366a138a7b61bb00", + "confirmed_task": "View the speakers that are bluetooth and wireless and filter the results to only show models that are on sale and cost less than $50.", + "website": "https://www.bestbuy.com/", + "reference_length": 6, + "level": "medium" + }, + { + "task_id": "354b4ddf048815f8fd4163d0d7e1aaa3", + "confirmed_task": "Browse marketing jobs and filter by Bachelor's Degree education level.", + "website": "https://ohiomeansjobs.ohio.gov/", + "reference_length": 6, + "level": "medium" + }, + { + "task_id": "e4e097222d13a2560db6f6892612dab6", + "confirmed_task": "Search for a young spayed male dog cared for by a private owner within 50 miles of zip 33109.", + "website": "https://www.adoptapet.com/", + "reference_length": 6, + "level": "medium" + }, + { + "task_id": "f389398d2eeb29e5571e00439c57eb76", + "confirmed_task": "Find the latest climate news.", + "website": "https://www.theweathernetwork.com/", + "reference_length": 3, + "level": "easy" + }, + { + "task_id": "8ea6c3a2ea3f59150619935261a76d19", + "confirmed_task": "Find a staffed FedEx location near zip code 10019 to return a package.", + "website": "https://www.fedex.com/en-us/home.html", + "reference_length": 7, + "level": "medium" + }, + { + "task_id": "c1d6ea6f2196d25782cc3646ff3090db", + "confirmed_task": "Create a list of drip coffee makers that are on sale and within $25-60 and have a black finish.", + "website": "https://www.google.com/shopping?udm=28", + "reference_length": 7, + "level": "medium" + }, + { + "task_id": "2dd41b1d0e8f389d0683f4a4627abfe6", + "confirmed_task": "Show houses for sale in Maryland with a maximum price of $60,000.", + "website": "https://www.landwatch.com/", + "reference_length": 7, + "level": "medium" + }, + { + "task_id": "f2097f92a10d42a842c14179f422311e", + "confirmed_task": "Add a $50 Uber gift card to the cart.", + "website": "https://www.bestbuy.com/", + "reference_length": 4, + "level": "easy" + }, + { + "task_id": "85b284c18d7e78c9b5a9e074e7aa3b98", + "confirmed_task": "View the cheapest apartment available for students at the University of Leeds with bills that include WIFI and cleaning services.", + "website": "https://www.student.com/", + "reference_length": 10, + "level": "medium" + }, + { + "task_id": "853afd530c72f4b00ffc32ae854efaf8", + "confirmed_task": "Show me the wind flow map for Belo Horizonte.", + "website": "https://www.accuweather.com/", + "reference_length": 5, + "level": "easy" + }, + { + "task_id": "c09721cc937d4dcfb391a0bc2c574b28", + "confirmed_task": "Find the next available date for Albion Basin.", + "website": "https://www.recreation.gov/", + "reference_length": 3, + "level": "easy" + }, + { + "task_id": "4c572a627b53b0f9a734ab37f21819b8", + "confirmed_task": "Browse apartments with at least 2 bedrooms and 2 bathrooms and a max price of $4000 per month.", + "website": "https://craigslist.org/", + "reference_length": 5, + "level": "easy" + }, + { + "task_id": "301f267f421b93045874726183e8f722", + "confirmed_task": "Find healthy savory vegan snack recipes which can be cooked within 5 minutes and contain a high level of protein.", + "website": "https://www.healthline.com/", + "reference_length": 10, + "level": "medium" + }, + { + "task_id": "4f903626f632586fe4728d6664947bab", + "confirmed_task": "Find press releases by the antitrust 
division in 2022.", + "website": "https://www.justice.gov/", + "reference_length": 4, + "level": "easy" + }, + { + "task_id": "3ec0f6138d37fadcb989347a6088ec45", + "confirmed_task": "Open the page to learn more about how to get accredited.", + "website": "https://www.bbb.org/", + "reference_length": 2, + "level": "easy" + }, + { + "task_id": "2207bb4f21786690cfed20b37253fb8b", + "confirmed_task": "Check the current wind speed in Calgary, Alberta.", + "website": "https://www.theweathernetwork.com/", + "reference_length": 2, + "level": "easy" + }, + { + "task_id": "9c04b71bb8db6cf8e743b2290cbc8797", + "confirmed_task": "Find a UPS drop-off point near Miami Florida.", + "website": "https://www.ups.com/", + "reference_length": 6, + "level": "medium" + }, + { + "task_id": "4e0f5561a76478da87995dee00b09572", + "confirmed_task": "Show me the monthly weather forecast for Florida City.", + "website": "https://www.accuweather.com/", + "reference_length": 3, + "level": "easy" + }, + { + "task_id": "7562d9b4e4829a44245aafce2e1f62db", + "confirmed_task": "Find the nearest location to zip code 54620 that offers size 4 P.O. Boxes.", + "website": "https://www.usps.com/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "bd1e3770b7181f6fce9c35e18caa9785", + "confirmed_task": "Browse service listings for a solar panel installer and hide duplicates.", + "website": "https://craigslist.org/", + "reference_length": 3, + "level": "easy" + }, + { + "task_id": "330cd04c773ac498f51afa4665461ec8", + "confirmed_task": "Browse couches for sale, sort by cheapest, and search in titles only.", + "website": "https://craigslist.org/", + "reference_length": 6, + "level": "medium" + }, + { + "task_id": "ec78d3a635e417bc2a80d03ca93d7165", + "confirmed_task": "What are the benefits and financial support a single person living in England, over the state pension age, unemployed, with no health conditions, or caring for someone with one, can get?", + "website": "https://www.gov.uk/", + "reference_length": 16, + "level": "hard" + }, + { + "task_id": "a0a18ca6a3529f3e97c771aadd42d3a0", + "confirmed_task": "Add a men's T-shirt that is in large size with a stripe pattern, short sleeve, and under the Best Sellers group to the cart.", + "website": "https://www.macys.com/", + "reference_length": 7, + "level": "medium" + }, + { + "task_id": "82eb3bfedd78456a0230b389f4e7a938", + "confirmed_task": "Open the XRP yearly chart.", + "website": "https://coinmarketcap.com/", + "reference_length": 4, + "level": "easy" + }, + { + "task_id": "e7f6cca9a8875f98fee3b711ead3a444", + "confirmed_task": "Find the comments made by the user Separate-Camp7202.", + "website": "https://www.reddit.com/", + "reference_length": 5, + "level": "easy" + }, + { + "task_id": "75a1b5dcd2c28508a971d98d51fe5767", + "confirmed_task": "Open the reviews of a recipe with beef sirloin.", + "website": "https://www.allrecipes.com/", + "reference_length": 5, + "level": "easy" + }, + { + "task_id": "c03ee2be3d73556ab789c0ad1cbd3451", + "confirmed_task": "Find a dog groomer for nail trimming within 100 miles of zip code 10005 and check the detailed service prices of the first one.", + "website": "https://www.akc.org/", + "reference_length": 11, + "level": "hard" + }, + { + "task_id": "05483c50cc9b04c8ac44c574758fb2bd", + "confirmed_task": "Look for the best rated BBB accredited charity near 12023.", + "website": "https://www.bbb.org/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "a172a5d9ffaf5ef02bd550ec4fe24e6d", + "confirmed_task": 
"Browse the natural products database.", + "website": "https://www.drugs.com/", + "reference_length": 2, + "level": "easy" + }, + { + "task_id": "7e1047f4803237f319c004f7a7f6bccb", + "confirmed_task": "Discover the trade-in value of my Intel 7th generation i3 Windows 10, HP laptop in fair condition, which has 8 GB memory and can be powered on, proceed for the in-store trade-in.", + "website": "https://www.bestbuy.com/", + "reference_length": 13, + "level": "hard" + }, + { + "task_id": "f2be37a9a60fbc25b6b11cf622d17352", + "confirmed_task": "Find obedience trials in state of New York during the month of May.", + "website": "https://www.akc.org/", + "reference_length": 14, + "level": "hard" + }, + { + "task_id": "e24662008c3be5d56f986f232fcec447", + "confirmed_task": "Find the stock price for WWE over the last month.", + "website": "https://www.google.com/finance/", + "reference_length": 4, + "level": "easy" + }, + { + "task_id": "0170ca95038b05fa58d463fe627ac605", + "confirmed_task": "Check if a visa is required to work in the UK for longer than 6 months in Healthcare as an American citizen.", + "website": "https://www.gov.uk/", + "reference_length": 12, + "level": "hard" + }, + { + "task_id": "b3a7da968de13bbdcaed12ffe4993df6", + "confirmed_task": "Compare the breeds Afghan Hound, Akita and Azawakh.", + "website": "https://www.akc.org/", + "reference_length": 9, + "level": "medium" + }, + { + "task_id": "515f2e5811cfdd5e0e669e40f17886d8", + "confirmed_task": "Search for a new internal M2 Samsung SSD drive between $25 and $200.", + "website": "https://www.bestbuy.com/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "4d3157aab34b54e5f0c4b965dfe930f3", + "confirmed_task": "Show me community posts about pregnancy fever from the past 30 days.", + "website": "https://www.babycenter.com/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "e4e19e04286f644d747d8c5a79d17fac", + "confirmed_task": "Find the Drug Interaction Report for Viagra and alcohol.", + "website": "https://www.drugs.com/", + "reference_length": 6, + "level": "medium" + }, + { + "task_id": "cad62d2be0c53f08a416457486b3db23", + "confirmed_task": "Search for adoptable dogs near 21122 zip code.", + "website": "https://www.adoptapet.com/", + "reference_length": 5, + "level": "easy" + }, + { + "task_id": "9ef1a8972f375db59c0e6329e11b7939", + "confirmed_task": "Find Farms land in Wilkes County, NC with the lowest price.", + "website": "https://www.landwatch.com/", + "reference_length": 5, + "level": "easy" + }, + { + "task_id": "a11ecdff735b51372d536c866011af6f", + "confirmed_task": "Explore courses related to Psychology.", + "website": "https://www.coursera.org/", + "reference_length": 4, + "level": "easy" + }, + { + "task_id": "7fff82864f21ddeccf4104a220892824", + "confirmed_task": "Find the lowest 27\"-32\" Samsung or LG computer monitors nearby which have 4k, IPS display.", + "website": "https://www.google.com/shopping?udm=28", + "reference_length": 10, + "level": "medium" + }, + { + "task_id": "50d91eabde542906937ab4c5b6f8f23a", + "confirmed_task": "Calculate Pregnancy Weight Gain for a 5-week pregnancy with a 169lb weight before pregnancy and a 175lb after pregnancy with a 5.6ft height.", + "website": "https://www.babycenter.com/", + "reference_length": 9, + "level": "medium" + }, + { + "task_id": "dcd26e662a616d373ddd339747c6ce5b", + "confirmed_task": "Take a weight management quiz to find a motivating article for a non-exercising, mostly eating out and can't control portions and cravings, 
and who has a strong support system, enjoys traveling, loves family time and cooking.", + "website": "https://www.healthline.com/", + "reference_length": 22, + "level": "hard" + }, + { + "task_id": "eb2db4b769c145dbe6ba4f74f3e0de98", + "confirmed_task": "Find an energetic hairless dog with medium barking.", + "website": "https://www.akc.org/", + "reference_length": 10, + "level": "medium" + }, + { + "task_id": "c0fa2c0e622971955cabf5bcf7b777e8", + "confirmed_task": "Search for rentals in Corning, CA with a maximum price of $1500.", + "website": "https://www.apartments.com/", + "reference_length": 6, + "level": "medium" + }, + { + "task_id": "ce616721ce9aeda69890fbccb29677a6", + "confirmed_task": "Calculate the price to ship a large flat-rate box from 77449 to 77084 at the first available date and time.", + "website": "https://www.usps.com/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "9d09bc948462db032bac98968b11b008", + "confirmed_task": "Find NHL events occurring in Boston.", + "website": "https://www.stubhub.com/", + "reference_length": 7, + "level": "medium" + }, + { + "task_id": "29526b17a32485742b5ab63507e99417", + "confirmed_task": "Browse Humira dosage information.", + "website": "https://www.drugs.com/", + "reference_length": 3, + "level": "easy" + }, + { + "task_id": "d7c955b47af68e01766fa86d0bee08a7", + "confirmed_task": "Add Elevate at Chicago, IL, to favorites and show a virtual tour.", + "website": "https://www.apartments.com/", + "reference_length": 4, + "level": "easy" + }, + { + "task_id": "9d090a15c214eb070d9caa8a034d03c1", + "confirmed_task": "Find the lowest-priced Student housing near Liverpool International College which has been priced between 100 to 300 pounds and has a private bathroom.", + "website": "https://www.student.com/", + "reference_length": 14, + "level": "hard" + }, + { + "task_id": "5916018d1cad999881018cac1216a692", + "confirmed_task": "Find a personal trainer service at 10040 for a 25-year-old client aiming to build muscle.", + "website": "https://www.thumbtack.com/", + "reference_length": 6, + "level": "medium" + }, + { + "task_id": "0059adc6b12a3822305deb68929b2de8", + "confirmed_task": "Find support services jobs in Bentonville, in the state of Arkansas.", + "website": "https://careers.walmart.com/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "07bdc595306729a028ba06cc7451a80a", + "confirmed_task": "Select a high speed train ticket with a departure time before 23:00 from Shanghai to Beijing.", + "website": "https://us.trip.com/", + "reference_length": 7, + "level": "medium" + }, + { + "task_id": "64b76158720a69e4a5c31a55d54928bf", + "confirmed_task": "Compare two pescatarian diets for eating healthier.", + "website": "https://www.healthline.com/", + "reference_length": 10, + "level": "medium" + }, + { + "task_id": "e3ab665e01e7632ce33ac1aeca14aff6", + "confirmed_task": "Find the next available dates for Alley Creek Camp.", + "website": "https://www.recreation.gov/", + "reference_length": 3, + "level": "easy" + }, + { + "task_id": "2d5a7f95f951a26838289dfd629ae850", + "confirmed_task": "Find a list of houses for sale in zip code 85747 with a private pool.", + "website": "https://www.redfin.com/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "26810ed9c123a62992e3eed31db3c5ee", + "confirmed_task": "Show daily weather for New York City.", + "website": "https://www.accuweather.com/", + "reference_length": 4, + "level": "easy" + }, + { + "task_id": "c181f903ec1107b850032c17cad88393", + 
"confirmed_task": "Help me identify a pink round pill with 150 written on it.", + "website": "https://www.webmd.com/", + "reference_length": 7, + "level": "medium" + }, + { + "task_id": "ef289e34a2f59a707cb07e2a6229ff03", + "confirmed_task": "Compare the Acura CL 2003 with the ILX 2022.", + "website": "https://www.cars.com/", + "reference_length": 9, + "level": "medium" + }, + { + "task_id": "84f806c7fc15576673915f195efa72df", + "confirmed_task": "Find a nationwide nearest animal shelter for birds around zip 10012.", + "website": "https://www.adoptapet.com/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "be9e7dca1222714571ef3d7d59d2a41c", + "confirmed_task": "Find out the cold and flu forecast and today's air quality in Champaign, IL.", + "website": "https://weather.com/", + "reference_length": 6, + "level": "medium" + }, + { + "task_id": "11abb668c751dd56bb41f296a8bb3a13", + "confirmed_task": "Find a store near zip 30010 that provides authorized Apple services for imacs and make this one my store.", + "website": "https://www.bestbuy.com/", + "reference_length": 10, + "level": "medium" + }, + { + "task_id": "207e933d1bba815bcb58664b5d82c085", + "confirmed_task": "Find Ohio City apartments with parking, a fitness center, and an elevator.", + "website": "https://www.apartments.com/", + "reference_length": 6, + "level": "medium" + }, + { + "task_id": "29b7372d5a3884a2ba831af2d117af3c", + "confirmed_task": "Browse the first top news of Microsoft stock on Google Finance.", + "website": "https://www.google.com/finance/", + "reference_length": 3, + "level": "easy" + }, + { + "task_id": "5c00e9561eae94789443f405525a5869", + "confirmed_task": "Find the recommended dosage for Vivitrol.", + "website": "https://www.healthline.com/", + "reference_length": 5, + "level": "easy" + }, + { + "task_id": "2532fd402d3c741b79894e6ff2269f53", + "confirmed_task": "find electricians near 10203.", + "website": "https://www.thumbtack.com/", + "reference_length": 3, + "level": "easy" + }, + { + "task_id": "9829f3087ab1f9c8eba6b6dd2b831d25", + "confirmed_task": "Play the latest video from NBA TV.", + "website": "https://www.nba.com/", + "reference_length": 3, + "level": "easy" + }, + { + "task_id": "783ce6a3499fa7cf25bc12f8f0ecbbbb", + "confirmed_task": "Find Florida internship programs in the Mayo Clinic College of Medicine and Science.", + "website": "https://www.mayoclinic.org/", + "reference_length": 5, + "level": "easy" + }, + { + "task_id": "6db4a0e346976f2729ba9afcd3208941", + "confirmed_task": "Look up tracking information for shipment #3023858502.", + "website": "https://www.fedex.com/en-us/home.html", + "reference_length": 2, + "level": "easy" + }, + { + "task_id": "1fc28d91d25ccd1c6ba268101326a654", + "confirmed_task": "Find the 5-day price chart for Bitcoin.", + "website": "https://www.google.com/finance/", + "reference_length": 3, + "level": "easy" + }, + { + "task_id": "255bf27c43fd3f9254d6b81a5f36d3a9", + "confirmed_task": "Look for the largest hunting land for auction in Kansas high plain region with mineral rights posted in the last seven days.", + "website": "https://www.landwatch.com/", + "reference_length": 9, + "level": "medium" + }, + { + "task_id": "a8b9edd598561d2de901864d5f40fe67", + "confirmed_task": "Calculate the shipping cost for 4 pound package from Texas to New York.", + "website": "https://www.fedex.com/en-us/home.html", + "reference_length": 9, + "level": "medium" + }, + { + "task_id": "a6f0434ce6aff5f9b03681241b03ad82", + "confirmed_task": "Find the closing stock 
price for Tesla on March 17, 2023.", + "website": "https://finance.yahoo.com/", + "reference_length": 3, + "level": "easy" + }, + { + "task_id": "415bf9da6f3db3a735ecbba3b0c76c15", + "confirmed_task": "Find the nearest vet within 50 miles of zip 75228.", + "website": "https://www.akc.org/", + "reference_length": 9, + "level": "medium" + }, + { + "task_id": "8103786e0e5976ebf961bd062d5f39cd", + "confirmed_task": "Find possible causes for the symptoms of chest pain which is sharp which is accompanied by anxiety.", + "website": "https://www.mayoclinic.org/", + "reference_length": 9, + "level": "medium" + }, + { + "task_id": "92160852a6bbbc165cee4e14ab0b1d59", + "confirmed_task": "Find the shipping cost of a Common medium-sized box in flat-rate shipping and compare it with other parcel services.", + "website": "https://www.ups.com/", + "reference_length": 5, + "level": "easy" + }, + { + "task_id": "502e864440283214e0180645015f568b", + "confirmed_task": "Check permit availability for a group of 4 in Brooks Camp, Katmai National Park on May 22.", + "website": "https://www.recreation.gov/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "7680a920359cb1a508fbddb001b98167", + "confirmed_task": "See the prediction about the girl child's height, whose current height at seven years is 4 feet and whose weight is 55 lbs, her mother is 5 feet 2, and her father is 5 feet 8.", + "website": "https://www.babycenter.com/", + "reference_length": 11, + "level": "hard" + }, + { + "task_id": "07ec4a12cba8090e2dc524d558ac7675", + "confirmed_task": "Check drug interaction for melatonin and Folate Forte.", + "website": "https://www.drugs.com/", + "reference_length": 6, + "level": "medium" + }, + { + "task_id": "987bad7c6d4726d64232a8a1c3386888", + "confirmed_task": "Find the seller info and seller's notes about the used car model 2011 BMW 135 with a max price of $30000.", + "website": "https://www.cars.com/", + "reference_length": 11, + "level": "hard" + }, + { + "task_id": "15be05973fba714e490cd9c884e4f072", + "confirmed_task": "Find the procedure to get the license for Athletic Trainer.", + "website": "https://ohio.gov/", + "reference_length": 6, + "level": "medium" + }, + { + "task_id": "3adeea7627f4343069f38adae40f73d0", + "confirmed_task": "Within 25 Miles of 96817, find a nursing home that accepts medicare.", + "website": "https://health.usnews.com/", + "reference_length": 7, + "level": "medium" + }, + { + "task_id": "c94551d2b18f9ad0ab31b0bd98ca42e3", + "confirmed_task": "Find cats available for adoption within 10 miles of zip code 94587, Young or adult-age cats, sorted by Oldest Addition.", + "website": "https://www.petfinder.com/", + "reference_length": 10, + "level": "medium" + }, + { + "task_id": "4e801ba102dfaf22c7cf7a126b107609", + "confirmed_task": "Find Linux platform software developers in 10080 who master the Python language and Java language with web interface project type.", + "website": "https://www.thumbtack.com/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "39c388cdc468688c8139cc2bb5157c13", + "confirmed_task": "Calculate the estimated car loan payment amount for an average credit-rated person for a 15,000-dollar car with a down payment of 2000 dollars and loan tenure of 48 months in zip 65215 and shop for the lowest-priced car.", + "website": "https://www.cars.com/", + "reference_length": 9, + "level": "medium" + }, + { + "task_id": "c8d7f2aa7eb5dd074c48c9f76f8659ad", + "confirmed_task": "Show Teen Driver Safety program information.", + "website": 
"https://www.dmv.virginia.gov/", + "reference_length": 2, + "level": "easy" + }, + { + "task_id": "fd787623166785d84093565bf945fd24", + "confirmed_task": "Check the interaction between Novolin N and Novolin R.", + "website": "https://www.drugs.com/", + "reference_length": 6, + "level": "medium" + }, + { + "task_id": "c3307a70bb12ebf56cc9ec926b368f15", + "confirmed_task": "Find the interactions between Eulexin and hepatic dysfunction.", + "website": "https://www.drugs.com/", + "reference_length": 5, + "level": "easy" + }, + { + "task_id": "9586827ad04ee2362f4f0076bf0f0468", + "confirmed_task": "Find the side effects of taking Montelukast.", + "website": "https://www.drugs.com/", + "reference_length": 6, + "level": "medium" + }, + { + "task_id": "34992feb69eb8e788faa06868b365c49", + "confirmed_task": "Submit a request for vehicle registration renewal with title number X123456 and last 4 digits of VIN is 1234.", + "website": "https://www.dmv.virginia.gov/", + "reference_length": 5, + "level": "easy" + }, + { + "task_id": "47b93b9e649eadeb8d96a6e3df715c2d", + "confirmed_task": "Show me Diagnoses & Treatment for Female infertility.", + "website": "https://www.mayoclinic.org/", + "reference_length": 4, + "level": "easy" + }, + { + "task_id": "3443e9c3151fef19a3c3a45eb2c13640", + "confirmed_task": "Search for the ovulation calculator and enter Mar 1 as the first date of the period and calculate the date of ovulation and pregnancy test day.", + "website": "https://www.webmd.com/", + "reference_length": 12, + "level": "hard" + }, + { + "task_id": "6b5be1764692d1dc8f17dc4375b2daa8", + "confirmed_task": "Show me historical data for EUR/USD.", + "website": "https://finance.yahoo.com/", + "reference_length": 4, + "level": "easy" + }, + { + "task_id": "16200f51d63f0a47a58fa17acd49e368", + "confirmed_task": "Find a recipe that includes eggplant and mushrooms.", + "website": "https://cookpad.com/", + "reference_length": 3, + "level": "easy" + }, + { + "task_id": "c2153fc053112e89c2f103869c4d6890", + "confirmed_task": "Find a house cleaning service in 10001 on a weekly basis.", + "website": "https://www.thumbtack.com/", + "reference_length": 5, + "level": "easy" + }, + { + "task_id": "5e4e89c9b6fdaee7a41aca5601b82e04", + "confirmed_task": "Identify a pill with a pink color and oval shape with 894 5 number on it.", + "website": "https://www.drugs.com/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "60cbbbd58eb9d28b053aef945f464228", + "confirmed_task": "Look up if the phone number 555555555 is a scam.", + "website": "https://www.bbb.org/", + "reference_length": 6, + "level": "medium" + }, + { + "task_id": "8f80e64e44e1fada018997b2fe869683", + "confirmed_task": "What are the top posts of all time on Reddit?", + "website": "https://www.reddit.com/", + "reference_length": 3, + "level": "easy" + }, + { + "task_id": "65c4030f22fb6eb101acfee4825f1318", + "confirmed_task": "Find a female MD Cardiologist in Jacksonville, Florida.", + "website": "https://www.mayoclinic.org/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "6ca20f1da01edeb49a7a42c816d8c6fe", + "confirmed_task": "Find the Eligibility to get the child benefit and How it works and how to claim", + "website": "https://www.gov.uk/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "2e4e21cf1449c6894b17d571c47b77ea", + "confirmed_task": "Find an English bulldog near zip code 90028 that was cared for by a private owner.", + "website": "https://www.adoptapet.com/", + "reference_length": 8, + 
"level": "medium" + }, + { + "task_id": "1df24ec81137386d6476bcf343a79012", + "confirmed_task": "Search for NordicTrack with the lowest price.", + "website": "https://www.bestbuy.com/", + "reference_length": 6, + "level": "medium" + }, + { + "task_id": "4639a54f3ab549864fd8d60b7398b1e1", + "confirmed_task": "Find a white female kitten within 35 miles of zip 77494.", + "website": "https://www.adoptapet.com/", + "reference_length": 10, + "level": "medium" + }, + { + "task_id": "9af05e392cf3f5a8ff17aa764ba5bda6", + "confirmed_task": "Get a quote from C and above-rated solar energy equipment company within 10 miles of Miami, Florida.", + "website": "https://www.bbb.org/", + "reference_length": 16, + "level": "hard" + }, + { + "task_id": "627f7a18d85f29a687234f1ade4585c2", + "confirmed_task": "Find the current league leader in total blocked shots.", + "website": "https://www.nba.com/", + "reference_length": 5, + "level": "easy" + }, + { + "task_id": "0b838cd54f826c59c71f600c56b89a11", + "confirmed_task": "Find all the locations for the second-best-rated used car dealer less than 5 miles from New York.", + "website": "https://www.bbb.org/", + "reference_length": 11, + "level": "hard" + }, + { + "task_id": "5dec0e6620849459f29e6465982c597e", + "confirmed_task": "Search for 33 to 49inch Qled gaming monitor with a 240hz refresh rate that is within $1000 to $2000.", + "website": "https://www.bestbuy.com/", + "reference_length": 11, + "level": "hard" + }, + { + "task_id": "52efbab520734ef9bf7c09ba0f62cdc8", + "confirmed_task": "Find the app for iOS.", + "website": "https://www.recreation.gov/", + "reference_length": 2, + "level": "easy" + }, + { + "task_id": "b1ce968a361e1088ce8d2ade6c2c9af0", + "confirmed_task": "Find young cats in Seattle and show off the newest additions.", + "website": "https://www.petfinder.com/", + "reference_length": 6, + "level": "medium" + }, + { + "task_id": "23204728192da9f73197a613d9681c18", + "confirmed_task": "What are the Symptoms and causes of fever?", + "website": "https://www.mayoclinic.org/", + "reference_length": 3, + "level": "easy" + }, + { + "task_id": "a69d2934fe54fef165490a5a2d95bf38", + "confirmed_task": "Show me recipes for pancakes with wheat and without beetroot.", + "website": "https://cookpad.com/", + "reference_length": 6, + "level": "medium" + }, + { + "task_id": "e9f4dfc67e0e6aa37f05f7cc5aa7428c", + "confirmed_task": "Browse pediatricians near zip code 90028 who specialize in Internal Medicine and have a rating of at least 4 stars.", + "website": "https://www.healthgrades.com/", + "reference_length": 9, + "level": "medium" + }, + { + "task_id": "2218042362d8fae73756eb309848c2b2", + "confirmed_task": "Compare Audi A7 with Audi A6, both made in 2023, and hide similarities.", + "website": "https://www.cars.com/", + "reference_length": 9, + "level": "medium" + }, + { + "task_id": "ba2a469af584f16da93ce6a7430cf7e5", + "confirmed_task": "Search for a beginner\u2019s course in computer science that includes advertisement skills.", + "website": "https://www.coursera.org/", + "reference_length": 7, + "level": "medium" + }, + { + "task_id": "26784156ae9859a0dd6c5920eb106f91", + "confirmed_task": "calculate and search rent for a $6000 monthly income with 30% rent budget near 90012 area.", + "website": "https://www.apartments.com/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "47e314cc452c540524ffb7cf520285a3", + "confirmed_task": "Find the park that offers the cheapest paddling permits.", + "website": "https://www.recreation.gov/", + 
"reference_length": 6, + "level": "medium" + }, + { + "task_id": "271b36efd4346721b5542488ff997042", + "confirmed_task": "Browse 8K Samsung TVs that are open box.", + "website": "https://www.bestbuy.com/", + "reference_length": 4, + "level": "easy" + }, + { + "task_id": "6b2cfae0ef25c73d1224b6ab74cb8b63", + "confirmed_task": "Find Devin Booker's highest-scoring points per game playoff run.", + "website": "https://www.nba.com/", + "reference_length": 7, + "level": "medium" + }, + { + "task_id": "0a54069a0ef542e571d1fee7f39c93d5", + "confirmed_task": "Browse senior spayed/neutered dogs near zip code 90028.", + "website": "https://www.adoptapet.com/", + "reference_length": 6, + "level": "medium" + }, + { + "task_id": "4e3f6a538cc1f7321cfc50260db9545d", + "confirmed_task": "Look up the current temperature for zip code 10019.", + "website": "https://www.theweathernetwork.com/", + "reference_length": 2, + "level": "easy" + }, + { + "task_id": "f00e7accfb4a5e09680bdb326e6274ad", + "confirmed_task": "Check the hourly forecast for Boston.", + "website": "https://www.accuweather.com/", + "reference_length": 3, + "level": "easy" + }, + { + "task_id": "6174e5ddd40cfbdc33ee1502f40bac39", + "confirmed_task": "Find a day-use park that offers horseback riding near Nashville.", + "website": "https://www.recreation.gov/", + "reference_length": 6, + "level": "medium" + }, + { + "task_id": "547f5729c59d5d12a457a3ebb74c31c6", + "confirmed_task": "Search for 3 bedroom condos with 2 bathrooms within $1500- $2500 range in NYC.", + "website": "https://www.apartments.com/", + "reference_length": 14, + "level": "hard" + }, + { + "task_id": "0b2623e9fa5cea997f76490bcbc5220f", + "confirmed_task": "Find a list of shorthaired dogs available for adoption within 100 miles of zip code 94587 that are good with kids and cats, and have been on Petfinder for over 30 days.", + "website": "https://www.petfinder.com/", + "reference_length": 13, + "level": "hard" + }, + { + "task_id": "3ae28b3c440efe87dc700480b78ac608", + "confirmed_task": "Find the closest 5-star rated dentist to zip code 98011.", + "website": "https://www.healthgrades.com/", + "reference_length": 9, + "level": "medium" + }, + { + "task_id": "0632e496d37badee0350dad358f047c5", + "confirmed_task": "Browse recipes for gluten-free chocolate chip cookies that can be made without nuts.", + "website": "https://cookpad.com/", + "reference_length": 4, + "level": "easy" + }, + { + "task_id": "aafd1fddea1558466ac6133934d35156", + "confirmed_task": "Find a Single-Family House for Rent in Houston, TX with 1 bed.", + "website": "https://www.apartments.com/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "246d654fab7c31d9651007e39e75f74f", + "confirmed_task": "Open the most helpful 5-star reviews of Alpine Ridge.", + "website": "https://www.recreation.gov/", + "reference_length": 5, + "level": "easy" + }, + { + "task_id": "690d7b4a285fdb1e9dabf973bf46ae4d", + "confirmed_task": "Browse iPhone X for sale that is in good condition, has a max price of 400, and searches in titles only.", + "website": "https://craigslist.org/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "c43a7dccf5c44f7b45a821e712dd1970", + "confirmed_task": "Take a newsletter subscription with my email id (buckeye.foobar@gmail.com) for Allergies and asthma, Anxiety and depression, nutrition, diabetes, breast cancer, and migraine with email id.", + "website": "https://www.healthline.com/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": 
"d5c34bf39eb6096ae5d439325cde4d32", + "confirmed_task": "Find a DMV center in Richmond.", + "website": "https://www.dmv.virginia.gov/", + "reference_length": 5, + "level": "easy" + }, + { + "task_id": "180ed2ec377ef3a4af9035a21522091a", + "confirmed_task": "Find the way to give a gift to UM-Dearborn.", + "website": "https://umich.edu/", + "reference_length": 5, + "level": "easy" + }, + { + "task_id": "c521933dad9c0ef9f1dfa2f38b8e4405", + "confirmed_task": "See the monthly forecast for Atlanta, GA.", + "website": "https://www.accuweather.com/", + "reference_length": 3, + "level": "easy" + }, + { + "task_id": "9b5dfe54a1c14c5c6336bae7374c3bb5", + "confirmed_task": "Find a UPS Access Point near SPRING, TX and services provided by them.", + "website": "https://www.ups.com/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "c073ac1bcf40f84c599affc97edbc396", + "confirmed_task": "Search for the cheapest apartment in Detroit for a student.", + "website": "https://www.apartments.com/", + "reference_length": 9, + "level": "medium" + }, + { + "task_id": "73d08420706ae205a9c5be28b6d4e80f", + "confirmed_task": "Show me the rules and cancellation for Alley Spring.", + "website": "https://www.recreation.gov/", + "reference_length": 3, + "level": "easy" + }, + { + "task_id": "0a0fa834ce41b5297c6474293383759d", + "confirmed_task": "What are the onboard activities of the highest-rated Regent Seven Seas Cruise ship based on Costco member reviews?", + "website": "https://www.costco.com/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "a13e4231a3d6a7000c622c56448d97ba", + "confirmed_task": "Find an Airbnb in Cleveland for three nights. The check-in date is the day after tomorrow. We have 2 adults, 2 kids, and 1 pet. The budget is $100 to $300 per night. 
Essential amenities include free parking, a washer, and a gym.", + "website": "https://www.airbnb.com/", + "reference_length": 19, + "level": "hard" + }, + { + "task_id": "bb518416a786fdb9b9bbf0c78515595e", + "confirmed_task": "Browse the class schedule of graduate-level computer science courses.", + "website": "https://www.osu.edu/", + "reference_length": 4, + "level": "easy" + }, + { + "task_id": "b99c02965196d51e80ac7539e33f335b", + "confirmed_task": "Please find graduate-level computer science courses scheduled on Tuesdays starting time from 2:00 to 6:00 PM in the Fall 2023 semester.", + "website": "https://www.berkeley.edu/", + "reference_length": 9, + "level": "medium" + }, + { + "task_id": "27fa3ac20745d3d35e89fae157f63069", + "confirmed_task": "Browse the class schedule of graduate-level chemistry courses on Monday afternoons in the winter of 2023.", + "website": "https://www.stanford.edu/", + "reference_length": 11, + "level": "hard" + }, + { + "task_id": "b4aa7315e31dfcdc52baf7771be260c9", + "confirmed_task": "Find the HGX H100 driver for Ubuntu 22.04 on AMD64 CPU.", + "website": "https://www.nvidia.com/", + "reference_length": 11, + "level": "hard" + }, + { + "task_id": "442a450e696a96085257db6297891a4d", + "confirmed_task": "Using a calculator to determine how much I can have in my 401(k) account at retirement, if I work from age 22 to 65, with an annual rate of return of 3%, annual employee contributions of $8,000, and annual employer contributions of $8,000.", + "website": "https://www.chase.com/", + "reference_length": 11, + "level": "hard" + }, + { + "task_id": "9ed3827266b3b804f485859c3d00401e", + "confirmed_task": "If I'm 30, plan to retire at 65, and can save $300/month, with a 3% annual return, 13% current tax rate, and 24% retirement tax rate, show the comparison chart between Traditional and Roth IRA.", + "website": "https://www.chase.com/", + "reference_length": 12, + "level": "hard" + }, + { + "task_id": "c801d1c951f59297f526bab84fa86c6e", + "confirmed_task": "Browse the latest negative reviews from players with over 100 hours of playtime for the game that won the 2023 VR Game of the Year Award.", + "website": "https://store.steampowered.com/", + "reference_length": 11, + "level": "hard" + }, + { + "task_id": "7c09c2c7c87cf6bb1138701eb54284ea", + "confirmed_task": "Find the comments for the most popular news in the past month under the Quantum Physics topic.", + "website": "https://phys.org/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "afcebfed28bea091d58f49ea6cb8194b", + "confirmed_task": "Find the most reviewed gluten-free multivitamins from CVS Health Brand under $15.", + "website": "https://www.cvs.com/", + "reference_length": 12, + "level": "hard" + }, + { + "task_id": "64345c365f544375357c7b67917f08a0", + "confirmed_task": "Look for the newest refrigerator that is 34-36 inches wide, priced between $1,000 and $2,000, and has a customer review rating of 4 stars or higher.", + "website": "https://www.costco.com/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "ab6ee3b83aab6cd283320f5e01003cff", + "confirmed_task": "Find the tech specs of the MacBook Pro 16-inch introduced in November 2023.", + "website": "https://www.apple.com/", + "reference_length": 7, + "level": "medium" + }, + { + "task_id": "33bd2cdcea4fcc42a09a8a1e4e5841c6", + "confirmed_task": "Add a 5-piece Tenders Combo to my bag with Sweet Corn as the side, Sweet Tea as the drink, and both Honey BBQ and Honey Mustard sauces. 
Select the store closest to Zip code 10001 for pick-up tomorrow at 12:00 PM.", + "website": "https://www.kfc.com/", + "reference_length": 23, + "level": "hard" + }, + { + "task_id": "47186fac8e7c7277af01144644eb4e0b", + "confirmed_task": "What is the ownership cost of the first car in the list \"top buys 2025\"?", + "website": "https://www.parkers.co.uk/", + "reference_length": 3, + "level": "easy" + }, + { + "task_id": "fa9adb815b85d259f943d81874a052e5", + "confirmed_task": "Browse a user homepage that reposted the top song from the Top 50 Rock chart.", + "website": "https://soundcloud.com/", + "reference_length": 6, + "level": "medium" + }, + { + "task_id": "b922508886ded315c9835457a6eb43ea", + "confirmed_task": "Browse tenured/tenure-track faculty positions in Computer Sciences & Technology in California.", + "website": "https://jobs.chronicle.com", + "reference_length": 6, + "level": "medium" + }, + { + "task_id": "5d542a7ec1fa142ba73cc87d970caf39", + "confirmed_task": "Find the most cited publication at the 2022 CVPR main conference.", + "website": "https://dblp.org/", + "reference_length": 6, + "level": "medium" + }, + { + "task_id": "864244b6969e0f8733b0eb1ca06cd51f", + "confirmed_task": "Find the race time for who wins the first place in the last race of the 2023 Formula 1 (F1).", + "website": "https://www.espn.com/", + "reference_length": 6, + "level": "medium" + }, + { + "task_id": "01abae9608f2d8752a83e08f136f720c", + "confirmed_task": "Show me the code for the company that is the top mover in the Cboe Europe Technology Sector Index (BEPTEC) as of the latest market close.", + "website": "https://www.cboe.com/", + "reference_length": 6, + "level": "medium" + }, + { + "task_id": "da8f3823a827c7d3a492f383808e7912", + "confirmed_task": "Find and open the earliest press release.", + "website": "https://www.instructure.com/", + "reference_length": 6, + "level": "medium" + }, + { + "task_id": "8689af4d33ce00bf2cdd8987d3bbfd86", + "confirmed_task": "Add the cheapest certified refurbished iPad Air with 256GB of storage in any shade of blue to my bag.", + "website": "https://www.apple.com/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "78f397336b6fd1cbba0127db7a8cd502", + "confirmed_task": "Browse the upcoming SuperBike events taking place in Italy.", + "website": "https://www.redbull.com/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "3dca7cbe7d086619d837ff9f5312cebc", + "confirmed_task": "Can you show me products under the category path 'Automotive' -> 'Car Jack', with an additional filter for the color pink?", + "website": "https://us.shein.com/", + "reference_length": 4, + "level": "easy" + }, + { + "task_id": "b962927dfe03bf2274a54381127ed433", + "confirmed_task": "Find the best-selling vinyl record by an artist from New York City in the classical music genre.", + "website": "https://bandcamp.com/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "78baf9dbe7c3532f7d7ef4cc22a7f065", + "confirmed_task": "Find the most popular digital trends report in the Finance & Insurance industry within the region of China.", + "website": "https://www.statista.com/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "c7c07ec10c668625a21ba64165d719bb", + "confirmed_task": "Find the total monthly price for four prepaid unlimited lines without autopay discounts.", + "website": "https://www.verizon.com/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "512fd4deab099b8dc0dcfc0ec48a3c63", + 
"confirmed_task": "Identify the open issue with the most comments in the first trending open-source repository this week.", + "website": "https://github.com/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "d9d8b7d84a3f8d057e368254fe8d65e2", + "confirmed_task": "Find the first commit submitted by NielsRogge to the official repository of the SAM2 model.", + "website": "https://github.com/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "157f4a79d55e8fa3fd55ba772ba40fbc", + "confirmed_task": "Find the most popular blue Lilo & Stitch toys.", + "website": "https://www.disney.com/", + "reference_length": 9, + "level": "medium" + }, + { + "task_id": "62c8d970b3d13891f355911e5a8f4030", + "confirmed_task": "Find the top game listed in the Steam Deck's top-played list over the past year. Then, browse reviews for that game from players who have played over 100 hours and primarily use a Steam Deck.", + "website": "https://store.steampowered.com/", + "reference_length": 9, + "level": "medium" + }, + { + "task_id": "11857213ca01510f12813740afd59918", + "confirmed_task": "Add the most top-selling Adidas men's basketball shoe in red, size 10 to my cart.", + "website": "https://www.adidas.com/", + "reference_length": 10, + "level": "medium" + }, + { + "task_id": "47bfe8a7e0e4e7efc837287b407fbe90", + "confirmed_task": "Compare the first and second most popular smartphones manufactured by Xiaomi and show the comparison chart.", + "website": "https://versus.com/", + "reference_length": 10, + "level": "medium" + }, + { + "task_id": "bb314cb80f0f8489135cbf59074d11e2", + "confirmed_task": "Open the page for the first Best Paper Award video recording of talks from ICLR 2016.", + "website": "https://iclr.cc/", + "reference_length": 4, + "level": "easy" + }, + { + "task_id": "1aeca99e6a60b0e3aefb3ef212bdce79", + "confirmed_task": "Find full-time legal occupation jobs in San Diego County with a minimum salary of $4,000+ per month.", + "website": "https://www.ca.gov/", + "reference_length": 10, + "level": "medium" + }, + { + "task_id": "d730f4ff450da1bd60a836163736ef6a", + "confirmed_task": "Find the best-selling GORE-TEX men's hiking shoe priced between $100.00 and $199.99 with a rating of 4 stars or higher, and show its most helpful comment.", + "website": "https://www.rei.com/", + "reference_length": 10, + "level": "medium" + }, + { + "task_id": "fe33894188d20d7469f37a9fd855e7ff", + "confirmed_task": "Find me Python 3.9 packages on PyPI that are designed for the Web Environment, licensed under MIT, have a stable production status, and are intended for developers.", + "website": "https://pypi.org/", + "reference_length": 11, + "level": "hard" + }, + { + "task_id": "71f8de1834599fba443f40dbbfab8edd", + "confirmed_task": "Search for papers related to reinforcement learning under the topics of computer science and mathematics on arxiv, with recent submission dates between September 2024 and January 2025.", + "website": "https://arxiv.org/", + "reference_length": 11, + "level": "hard" + }, + { + "task_id": "c8c1ff115879b3afd14280beb1559b13", + "confirmed_task": "Find the latest Doraemon video in MP4 format that is over 20 minutes long and has a medium file size.", + "website": "https://www.4shared.com/", + "reference_length": 12, + "level": "hard" + }, + { + "task_id": "d4fb78b7e74508cd3b33f01cf9200997", + "confirmed_task": "Show the figure comparing Occupational Fatalities Trends between Ohio and New York.", + "website": 
"https://www.americashealthrankings.org/", + "reference_length": 12, + "level": "hard" + }, + { + "task_id": "0e42c3a73f2aece1f854e0ba55b7c8b0", + "confirmed_task": "Find a gas station in Manhattan, NY with a rating above 4.0, and sort the user reviews by the lowest rating.", + "website": "https://www.google.com/maps/", + "reference_length": 7, + "level": "medium" + }, + { + "task_id": "96afb3c51146b0c2a9c55f039a5ea6d6", + "confirmed_task": "Find the most frequent word that rhymes with \"thought\" and has three syllables.", + "website": "https://www.merriam-webster.com/", + "reference_length": 5, + "level": "easy" + }, + { + "task_id": "59912927c1fddee6ded8a49986896bc2", + "confirmed_task": "Look for the most useful reviews of the highest-rated anti-reflective TVs with screen sizes from 55\" to 64\" and prices ranging from $300 to $1500.", + "website": "https://www.samsung.com/", + "reference_length": 14, + "level": "hard" + }, + { + "task_id": "e43cbc8a0bf9e999884928d11006f894", + "confirmed_task": "Browse the list of things to do in Miami that have a rating of 9+ (wonderful), last between 1 to 4 hours per session, cost under $100 per person, and are available for booking between next Monday and next Friday.", + "website": "https://www.expedia.com/", + "reference_length": 15, + "level": "hard" + }, + { + "task_id": "1b867afecf072cb877ebfa4069263746", + "confirmed_task": "Display the figure comparing unemployment trends among women in Illinois and Michigan.", + "website": "https://www.americashealthrankings.org/", + "reference_length": 15, + "level": "hard" + }, + { + "task_id": "c3a333968fc3c43d7f2688f425a0d633", + "confirmed_task": "Find the cheapest certified pre-owned Porsche 911 with a model year of 2019 or newer, within a 200-mile radius of ZIP code 97007.", + "website": "https://www.porsche.com/", + "reference_length": 15, + "level": "hard" + }, + { + "task_id": "bb5d90e6f2fbc0ae146f7c1998c2b4a1", + "confirmed_task": "Find the most viewed TED talk on the topic of robots that lasts between 12 and 18 minutes.", + "website": "https://www.ted.com/", + "reference_length": 15, + "level": "hard" + }, + { + "task_id": "c577a14301a725e09ccd269a3e0b271e", + "confirmed_task": "Return the page for the highest-rated red wine from Oregon under $40 that pairs well with either mushrooms or veal.", + "website": "https://www.vivino.com/", + "reference_length": 15, + "level": "hard" + }, + { + "task_id": "c6c9dc6079677cef594cec2fa6b16602", + "confirmed_task": "Add the cheapest black sofa with at least three seats, a leather finish, and at least four stars to my cart.", + "website": "https://www.ikea.com/", + "reference_length": 16, + "level": "hard" + }, + { + "task_id": "c39d6c245f8243993e707d54d2f4acec", + "confirmed_task": "Browse the final skin in the list for the champion Ahri.", + "website": "https://www.leagueoflegends.com/", + "reference_length": 18, + "level": "hard" + }, + { + "task_id": "b2f4fde2fce122a93c7b578086cb0585", + "confirmed_task": "Find the cheapest hotel + flight + car package from New York to San Francisco, departing tomorrow and returning on the fourth day from departure, for two adults and a six-year-old child. 
The package should be one room with free breakfast and spa access.", + "website": "https://www.booking.com/", + "reference_length": 19, + "level": "hard" + }, + { + "task_id": "d02d236836924919f35f2438d9ed2374", + "confirmed_task": "Browse the top 250 movies and find one movie that is available on AMC+.", + "website": "https://www.imdb.com/", + "reference_length": 22, + "level": "hard" + }, + { + "task_id": "3621b099326c7aebd2e2dac6be3b52d1", + "confirmed_task": "Open the profile page of the leader of the Nvidia Learning and Perception Lab.", + "website": "https://www.nvidia.com/", + "reference_length": 5, + "level": "easy" + }, + { + "task_id": "f27b393bbd2082f92b566270c4b74fe6", + "confirmed_task": "Find a large van for sale from the year 2024 or newer with up to 10,000 miles.", + "website": "https://www.parkers.co.uk/", + "reference_length": 7, + "level": "medium" + }, + { + "task_id": "ba01ea557b73f864c35ebba0dd6f3cb2", + "confirmed_task": "Find the top-rated hotel in Manhattan, NY, suitable for 4 guests, and identify the fastest public transportation option from the hotel to LGA airport.", + "website": "https://www.google.com/maps/", + "reference_length": 14, + "level": "hard" + }, + { + "task_id": "662ae0f2d3ac851dbcdd245f908277e3", + "confirmed_task": "What is the second stop among the best stops along the road trip from Yellowstone National Park to Las Vegas?", + "website": "https://wanderlog.com/", + "reference_length": 5, + "level": "easy" + }, + { + "task_id": "461ab9b0c7b20ac5f912704480979c65", + "confirmed_task": "Find the NYSE Rule 605 Market Center Files data for July 2024.", + "website": "https://www.nyse.com/", + "reference_length": 5, + "level": "easy" + }, + { + "task_id": "a96fca87a17d792644e736d1d10d3cbe", + "confirmed_task": "View the pricing plan for 'Business'. Specifically, we have 100 users. 
We need a 1PB storage quota and a 50 TB transfer quota.", + "website": "https://mega.io/", + "reference_length": 5, + "level": "easy" + }, + { + "task_id": "2c8ef01a92c71ba9ef2e59bb17eea2b3", + "confirmed_task": "If there are any discounts on the Apple Mac Studio, add the one with the largest absolute discount to my cart; otherwise, add the cheapest one.", + "website": "https://www.costco.com/", + "reference_length": 6, + "level": "medium" + }, + { + "task_id": "3084bc225219fcb73dc1cb0f97276c1c", + "confirmed_task": "Get quotes for a package weighing 10 lbs with dimensions of 2 inches in length, width, and height, being shipped from Long Beach, 90802 to Portland, 97201.", + "website": "https://www.ups.com/", + "reference_length": 7, + "level": "medium" + }, + { + "task_id": "949dc965a6c23a95663b3bc2ca2c3a8a", + "confirmed_task": "Find UA or AA flights from London to New York that arrive between 8:00 PM and 11:00 PM on FlightAware.", + "website": "https://www.flightaware.com/", + "reference_length": 13, + "level": "hard" + }, + { + "task_id": "636b07af4dd97c1793733db1fd1b90b8", + "confirmed_task": "Filter handbags to evening bags that are blue, and polyester and cost less than $100.", + "website": "https://www.macys.com/", + "reference_length": 9, + "level": "medium" + }, + { + "task_id": "38203be65401943aea2179c4c680059a", + "confirmed_task": "Check the status of bus S92 for any disruptions on new.mta.info.", + "website": "https://new.mta.info/", + "reference_length": 6, + "level": "medium" + }, + { + "task_id": "cf757a775fa1224acfc7998489e199a8", + "confirmed_task": "Find a flight from Dublin to anywhere under $100 tomorrow on Ryanair.", + "website": "https://www.ryanair.com/", + "reference_length": 13, + "level": "hard" + }, + { + "task_id": "d8e2a81fa621ce4737e5ea85671b630e", + "confirmed_task": "Search for regular weekday jobs around 14810 that I can start within two weeks or three.", + "website": "https://hiring.amazon.com/", + "reference_length": 13, + "level": "hard" + }, + { + "task_id": "63d6866fc000fcb1f153e07604bd1395", + "confirmed_task": "What are the Nearby Attractions from the most popular attraction in Hong Kong?", + "website": "https://us.trip.com/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "199be0b54a436daee74247971fc684ee", + "confirmed_task": "Add a Macy's Happy Birthday E-Gift Card worth $50 from Shak to my cart, with the birthday wish message \"Happy birthday, wish you many more years to come\", addressed to christene (christenson@gmail.com).", + "website": "https://www.macys.com/", + "reference_length": 11, + "level": "hard" + }, + { + "task_id": "c00437fd76a7a83b57f3dc4e5dbc41f8", + "confirmed_task": "Check the most recent full-time medical health and safety jobs, requiring 1-3 years of industry experience available in the US.", + "website": "https://www.amazon.jobs/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "fc53ddd3421411a41c1020a3fdc84ec4", + "confirmed_task": "I want to purchase an open-box Samsung Galaxy S25 Plus in excellent condition and trade in a gray Galaxy S20 5G (Verizon), with a perfect screen, in good condition. 
How much would it cost?", + "website": "https://www.bestbuy.com/", + "reference_length": 17, + "level": "hard" + }, + { + "task_id": "9d46ccb915eff39ee1ae1e7328f5f20d", + "confirmed_task": "Get a quote for the fastest shipping available for 5 lbs with dimensions of 4 inches in length, width, and height from New York, NY 10001, USA to Truckee, California 96162, USA.", + "website": "https://www.ups.com/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "d1970c16271496cbbe166ecbecc0a1d8", + "confirmed_task": "I'm 25 and located in Texas. Shop for 2020 made dry red wine made in United States priced between 15-20 dollars and add 5 bottles to the cart.", + "website": "https://macyswineshop.com/", + "reference_length": 13, + "level": "hard" + }, + { + "task_id": "7211af65d266402f99499053924262e9", + "confirmed_task": "View the most recent job posting for a full-time pharmacy position in the US.", + "website": "https://www.amazon.jobs/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "4464a8421f8bc8786524a499258dfad3", + "confirmed_task": "Check the specifications of the best-selling HP FHD laptop with 16 GB RAM and core i7 running on Windows 11.", + "website": "https://www.bestbuy.com/", + "reference_length": 12, + "level": "hard" + }, + { + "task_id": "f707d765bca668830745d20807d7bee6", + "confirmed_task": "Show me the list of young female English Spot rabbits available for adoption in Chicago, IL, within 50 miles.", + "website": "https://www.petfinder.com/", + "reference_length": 14, + "level": "hard" + }, + { + "task_id": "d392e154c1c6ffbb26e2331c3afafc67", + "confirmed_task": "Add a $100 Best Buy gift card for a birthday to my cart.", + "website": "https://www.bestbuy.com/", + "reference_length": 6, + "level": "medium" + }, + { + "task_id": "a5c87cc1c94a090c9a8dc2c8b6a125d0", + "confirmed_task": "Find the SO2 air quality over the past hour for Maine North, County Cork, Ireland.", + "website": "https://www.accuweather.com/", + "reference_length": 15, + "level": "hard" + }, + { + "task_id": "367d843c640637745e8fafa741cca13b", + "confirmed_task": "Find a condo for rent in Houston, TX, with a monthly rent of no more than 30% of an income of $8000. 
The condo should have a minimum area of 600 square feet, and the move-in date is the 1st of next month.", + "website": "https://www.apartments.com/", + "reference_length": 15, + "level": "hard" + }, + { + "task_id": "84ef883a37af638c3bcf7561f28ce80a", + "confirmed_task": "Find the cheapest used hatchback car listing in Madison which has black interiors with a heated seat option and premium sound system.", + "website": "https://www.cars.com/", + "reference_length": 12, + "level": "hard" + }, + { + "task_id": "d9a8689393effeed75ea0866e44e1def", + "confirmed_task": "Find the address and phone of the Office of the Inspector General (OIG).", + "website": "https://www.justice.gov/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "1bc154377120ec15b18dbabdba49c741", + "confirmed_task": "Book 4 tickets in the upper for any Kevin Hart show in New York in the next three months and view ticket prices with estimated fees.", + "website": "https://www.stubhub.com/", + "reference_length": 12, + "level": "hard" + }, + { + "task_id": "28e7574e7bd6d14f36d2988a5ef2bd23", + "confirmed_task": "Get a part-time job within 5 miles of Moscow, Idaho in the accommodation and food services industry, as a chef, and show jobs for corporate only.", + "website": "https://ohiomeansjobs.ohio.gov/", + "reference_length": 12, + "level": "hard" + }, + { + "task_id": "1c3b747ae12ccee895745f82e3f2ef8a", + "confirmed_task": "Identify the ongoing competition that offers the highest prize and find the code that received the most votes in that competition.", + "website": "https://www.kaggle.com/", + "reference_length": 11, + "level": "hard" + }, + { + "task_id": "d1807551297ac60ecaaabbd2a2ed301a", + "confirmed_task": "Find the No.1 children's hospital in the California that specializes in Neonatology.", + "website": "https://health.usnews.com/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "7abdceee212151f187ee1a1744c57606", + "confirmed_task": "Can you show me the page with the filing fee for a self-petitioned I-140 application?", + "website": "https://www.uscis.gov/", + "reference_length": 6, + "level": "medium" + }, + { + "task_id": "0e5536aaad9d3462b06cf725e6ed535a", + "confirmed_task": "Show me the page with average wait times for U.S. citizens arriving at Raleigh-Durham International Airport on 2025-03-12.", + "website": "https://www.cbp.gov/", + "reference_length": 11, + "level": "hard" + }, + { + "task_id": "bc2ce7f206045dd2d322e5695a947219", + "confirmed_task": "Estimate the federal income tax I would owe on $158,500 of taxable income in ZIP code 97007, filing as single.", + "website": "https://smartasset.com/", + "reference_length": 6, + "level": "medium" + }, + { + "task_id": "7e6993f2c5cd72c44809024f0bc85dc1", + "confirmed_task": "Create a meme with a frog as the background and leave the only text with \"Enjoy your life\".", + "website": "https://imgur.com/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "a48e2f1ee8d87eaeea56fe5e730427e6", + "confirmed_task": "Pass the first trending chess puzzle.", + "website": "https://www.chess.com/", + "reference_length": 7, + "level": "medium" + }, + { + "task_id": "dd44c665cec1e9c929a4c5f074e7844a", + "confirmed_task": "Find parking near the San Francisco Museum of Modern Art from June 18, 1:00 PM to 5:00 PM. I'm driving a Ford F-150 and need a garage that allows in-and-out privileges. 
If there are multiple options, show me the details of the one with the lowest price.", + "website": "https://spothero.com/", + "reference_length": 17, + "level": "hard" + }, + { + "task_id": "99daaed9a83c266341d28aa40067d376", + "confirmed_task": "Find the most popular board game on the 'The Hotness' list that has a rating above 7.5 and is suitable for 2 players.", + "website": "https://boardgamegeek.com/", + "reference_length": 5, + "level": "easy" + }, + { + "task_id": "7072d09436972a5d5fe7476e3e9f1559", + "confirmed_task": "Show me the comparison of the first two personal credit cards that do not charge foreign transaction fees.", + "website": "https://www.americanexpress.com/", + "reference_length": 10, + "level": "medium" + }, + { + "task_id": "2c20d87a046fadcb6ff07ee877bfbf37", + "confirmed_task": "Open the form 8843 for tax year 2022.", + "website": "https://www.irs.gov/", + "reference_length": 8, + "level": "medium" + }, + { + "task_id": "753f372c189d3b306623cb0c65b50320", + "confirmed_task": "Compare the U.S. ETP Odd Lot Rate (%) between Quartile 1 and Quartile 4, viewing quartiles by price, and display the chart with a logarithmic scale on the vertical axis.", + "website": "https://www.sec.gov/", + "reference_length": 9, + "level": "medium" + }, + { + "task_id": "733f1d8bf79d5bc2240c5357f928ffff", + "confirmed_task": "Find the cheapest travel deal or discount to Thailand that lasts more than 10 days, departs in next month, and show the total price.", + "website": "https://www.tourradar.com/", + "reference_length": 10, + "level": "medium" + }, + { + "task_id": "f05e87c5b92d9869e08806103c1c15a1", + "confirmed_task": "Find all startup companies from the 2022 and 2023 Y Combinator batches that are based in France and currently have job openings.", + "website": "https://www.ycombinator.com/", + "reference_length": 12, + "level": "hard" + }, + { + "task_id": "3ef64f34eae59c9fac7ee9a4f18b4a0c", + "confirmed_task": "Find and open an animal learning course on YouTube Kids for my 6-year-old without login in. As a parent born in 1992, I would prefer not to enable search.", + "website": "https://www.youtube.com/", + "reference_length": 16, + "level": "hard" + }, + { + "task_id": "f158345f8489e0d1d91e28768c39bca1", + "confirmed_task": "Estimate the total cost (with basic support) of using 5 million input tokens and 5 million output tokens each for GPT-4o and GPT-4o Mini, both deployed in the US/EU Data Zones under Standard (On-Demand) in the East US region.", + "website": "https://azure.microsoft.com/", + "reference_length": 13, + "level": "hard" + }, + { + "task_id": "1ab384fb3a791edfb410213cc6b82151", + "confirmed_task": "Show me the result of a proton emission decay for a Beryllium nucleus with 6 protons and 4 neutrons in the simulation.", + "website": "https://phet.colorado.edu/", + "reference_length": 13, + "level": "hard" + }, + { + "task_id": "1223b07536a87e0170ff87cbbebd1d3c", + "confirmed_task": "Complete a multiplication quiz on https://www.coolmath4kids.com/, covering multiplication facts for 11-12. The quiz should consist of 10 questions, with unlimited time allowed for each. 
The goal is to achieve a perfect score of 10 out of 10.", + "website": "https://www.coolmath4kids.com/", + "reference_length": 24, + "level": "hard" + } +] \ No newline at end of file diff --git a/data/Online-Mind2Web/README.md b/data/Online-Mind2Web/README.md new file mode 100644 index 0000000..9d354e2 --- /dev/null +++ b/data/Online-Mind2Web/README.md @@ -0,0 +1,65 @@ +--- +license: cc-by-4.0 +language: +- en +size_categories: +- n<1K +configs: +- config_name: default + data_files: + - split: test + path: "Online_Mind2Web.json" +--- +
+ Blog | + Paper | + Code | + Leaderboard +
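+
+The dataset ships as a single JSON array (`Online_Mind2Web.json`). As a minimal sketch — assuming Python 3 and the file path added in this commit, and not an official loader — the tasks can be read and summarized by difficulty level like this:
+
+```python
+import json
+from collections import Counter
+
+# Path as added in this commit; adjust if the file lives elsewhere.
+with open("data/Online-Mind2Web/Online_Mind2Web.json", encoding="utf-8") as f:
+    tasks = json.load(f)  # a list of task records
+
+# Fields per record: task_id, confirmed_task, website, reference_length, level.
+print(f"{len(tasks)} tasks in total")
+print(Counter(t["level"] for t in tasks))                   # easy / medium / hard counts
+print(Counter(t["website"] for t in tasks).most_common(3))  # most frequent websites
+```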
+ + +## Online-Mind2Web +Online-Mind2Web is the online version of [Mind2Web](https://osu-nlp-group.github.io/Mind2Web/), a more diverse and user-centric dataset includes 300 high-quality tasks from 136 popular websites across various domains. The dataset covers a diverse set of user tasks, such as clothing, food, housing, and transportation, to evaluate web agents' performance in a real-world online environment. + +### Data Fields +- "task_id" (str): Unique id for each task. +- "website" (str): Website url. +- "task_description" (str): Task description. +- "reference_length" (int): Number of steps required for a human annotator to complete the task. + +### Update Tasks +We will regularly update Online-Mind2Web by replacing outdated or invalid tasks (e.g., due to website changes) to maintain its value as a rigorous benchmark for web agents. If you find any tasks are outdated, please reach out to us, and we will update them. + +To ensure fair comparisons, we will aim to keep the updated tasks on the same websites as before and with a similar reference length. Additionally, once agent performance saturates on Online-Mind2Web, we will also revise simple tasks to preserve its long-term value. + +### Update History +**2025/04/05:** Updated task IDs: ["c03ee2be3d73556ab789c0ad1cbd3451", "c181f903ec1107b850032c17cad88393", "2c8ef01a92c71ba9ef2e59bb17eea2b3", "d8e2a81fa621ce4737e5ea85671b630e", "63d6866fc000fcb1f153e07604bd1395", "199be0b54a436daee74247971fc684ee"] + +### Disclaimer +This dataset was collected and released solely for research purposes, with the goal of making the web more accessible via language technologies. The authors are strongly against any potential harmful use of the data or technology to any party. + +### Citation Information +Note: Online-Mind2Web is derived from the original Mind2Web dataset. We kindly ask that you cite both the original and this work when using or referencing the data. +``` +@article{xue2025illusionprogressassessingcurrent, + title={An Illusion of Progress? Assessing the Current State of Web Agents}, + author={Tianci Xue and Weijian Qi and Tianneng Shi and Chan Hee Song and Boyu Gou and Dawn Song and Huan Sun and Yu Su}, + year={2025}, + eprint={2504.01382}, + archivePrefix={arXiv}, + primaryClass={cs.AI}, + url={https://arxiv.org/abs/2504.01382}, +} + +@inproceedings{deng2023mind2web, + author = {Deng, Xiang and Gu, Yu and Zheng, Boyuan and Chen, Shijie and Stevens, Sam and Wang, Boshi and Sun, Huan and Su, Yu}, + booktitle = {Advances in Neural Information Processing Systems}, + editor = {A. Oh and T. Naumann and A. Globerson and K. Saenko and M. Hardt and S. 
Levine}, + pages = {28091--28114}, + publisher = {Curran Associates, Inc.}, + title = {Mind2Web: Towards a Generalist Agent for the Web}, + url = {https://proceedings.neurips.cc/paper_files/paper/2023/file/5950bf290a1570ea401bf98882128160-Paper-Datasets_and_Benchmarks.pdf}, + volume = {36}, + year = {2023} +} +``` \ No newline at end of file diff --git a/data/Online-Mind2Web/gitattributes b/data/Online-Mind2Web/gitattributes new file mode 100644 index 0000000..1ef325f --- /dev/null +++ b/data/Online-Mind2Web/gitattributes @@ -0,0 +1,59 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.lz4 filter=lfs diff=lfs merge=lfs -text +*.mds filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +# Audio files - uncompressed +*.pcm filter=lfs diff=lfs merge=lfs -text +*.sam filter=lfs diff=lfs merge=lfs -text +*.raw filter=lfs diff=lfs merge=lfs -text +# Audio files - compressed +*.aac filter=lfs diff=lfs merge=lfs -text +*.flac filter=lfs diff=lfs merge=lfs -text +*.mp3 filter=lfs diff=lfs merge=lfs -text +*.ogg filter=lfs diff=lfs merge=lfs -text +*.wav filter=lfs diff=lfs merge=lfs -text +# Image files - uncompressed +*.bmp filter=lfs diff=lfs merge=lfs -text +*.gif filter=lfs diff=lfs merge=lfs -text +*.png filter=lfs diff=lfs merge=lfs -text +*.tiff filter=lfs diff=lfs merge=lfs -text +# Image files - compressed +*.jpg filter=lfs diff=lfs merge=lfs -text +*.jpeg filter=lfs diff=lfs merge=lfs -text +*.webp filter=lfs diff=lfs merge=lfs -text +# Video files - compressed +*.mp4 filter=lfs diff=lfs merge=lfs -text +*.webm filter=lfs diff=lfs merge=lfs -text diff --git a/eval.py b/eval.py new file mode 100644 index 0000000..3087d58 --- /dev/null +++ b/eval.py @@ -0,0 +1,298 @@ +""" +This release adds the following features: +1. Support screenshots of the evaluation process +2. Support Online_Mind2Web task evaluation +3. 
Support access to gpt-4.1, o3-mini, o4-mini and other models + +Tips: Use batch_eval.py if you want to use batch_tasks +""" +from agent.Environment.html_env.async_env import AsyncHTMLEnvironment +from evaluate import * +from agent.Plan import * +from dataclasses import dataclass + +import re +import asyncio +import argparse +import logging + +# universal tools +from agent.Utils.utils import * +# evaluate tools +from evaluate.evaluate_utils import run_task, read_config, read_file +from agent.Utils.utils import read_json_file +from experiment_results import get_evaluate_result + +logger = logging.getLogger(__name__) + +from agent.LLM.token_utils import is_model_supported + + +@dataclass +class ExperimentConfig: + mode: str + global_reward_mode: str + planning_text_model: str + global_reward_text_model: str + ground_truth_mode: bool + single_task_name: str + config: dict + ground_truth_data: dict + write_result_file_path: str + record_time: str + file: list + + +def validate_config(config, observation_mode, global_reward_mode, observation_model, global_reward_model): + task_mode = config['basic']['task_mode'] + batch_tasks_file_path = config['files']['batch_tasks_file_path'] + json_model_response = config['model']['json_model_response'] + all_json_models = config['model']['json_models'] + interaction_mode = config['steps']['interaction_mode'] + + if observation_mode not in ["dom"]: + logger.error( + "observation mode is not correctly defined! Currently we only support DOM observation.") + exit() + + if interaction_mode not in [True, False]: + logger.error( + "interaction_mode is not defined! Try to define whether you want to evaluate the agent in an interactive manner.") + exit() + + if json_model_response and (observation_model not in all_json_models or ( + global_reward_mode != 'no_global_reward' and global_reward_model not in all_json_models)): + logger.error("Model does not support JSON mode!") + exit() + + if task_mode == 'batch_tasks' and not os.path.exists(batch_tasks_file_path): + logger.error("batch_tasks_file_path not exist!") + exit() + + +def get_task_range(task_mode, file, raw_data_index): + if task_mode == "batch_tasks": + if raw_data_index != -1: + re_result = re.split(r'\s|,', raw_data_index) + raw_data_start_index = int(re_result[0]) + raw_data_end_index = int(re_result[-1]) + 1 + else: + raw_data_start_index = 0 + raw_data_end_index = len(file) + return range(raw_data_start_index, raw_data_end_index) + elif task_mode == "single_task": + return range(0, 1) + else: + logger.error("task_mode error!") + exit() + + +def log_task_info(task_index, task_name, reference_task_length, reference_evaluate_steps): + logger.info("*" * 100) + logger.info(f"task index: {task_index}") + logger.info(f"task name: {task_name}") + logger.info(f"task reference length: {reference_task_length}") + logger.info(f"raw data annotation: {reference_evaluate_steps}") + + +def generate_result_file_path(config): + return os.path.join(config["files"]["out_file_path"], "json_result") + + +def load_ground_truth_data(config, ground_truth_mode): + if ground_truth_mode: + ground_truth_file_path = config['files']['ground_truth_file_path'] + if not os.path.exists(ground_truth_file_path): + logger.error("ground_truth_file_path not exist!") + exit() + return read_json_file(ground_truth_file_path) + return None + + +def create_html_environment(mode): + return AsyncHTMLEnvironment( + mode=mode, + max_page_length=8192, + headless=False, + slow_mo=1000, + current_viewport_only=False, + viewport_size={"width": 1080, 
"height": 720}, + save_trace_enabled=True, # True + sleep_after_execution=0.0, + locale="en-US", + use_vimium_effect=True + ) + + +async def run_experiment(task_range, experiment_config): + for task_index in task_range: + task_uuid = None + if experiment_config.config['basic']['task_mode'] == "batch_tasks": + task = experiment_config.file[task_index] + # task_name, task_uuid, reference_task_length, reference_evaluate_steps = task + task_name = task.get("confirmed_task", f"Task_{task_index}") + task_uuid = task.get("task_id", f"task_{task_index}") + reference_task_length = task.get("reference_length", 0) + # reference_evaluate_steps = None + # evaluate_steps = reference_evaluate_steps + reference_evaluate_steps = task.get("evaluation", []) + website = task.get("website", "about:blank") # The first step is to force access to the specified web page + log_task_info(task_index, task_name, + reference_task_length, reference_evaluate_steps) + elif experiment_config.config['basic']['task_mode'] == "single_task": + task_name = experiment_config.single_task_name + reference_task_length = experiment_config.config['steps']['single_task_action_step'] + + evaluate_steps = [] + reference_evaluate_steps = [] + website = experiment_config.config.get('single_task_website', "about:blank") + # Generate a unique task_uuid for the single_task mode + task_uuid = f"single_task_{int(time.time())}" + logger.info(f"task_name: {task_name}") + logger.info(f"website: {website}") + + # TODO + # evaluate_steps = experiment_config.config['steps']['single_task_action_step'] + # reference_evaluate_steps = None + # logger.info(f"task_name: {task_name}") + + env = create_html_environment(experiment_config.mode) + + # Screenshot parameters + screenshot_params = { + "mode": experiment_config.mode, + "record_time": experiment_config.record_time, + "task_name": task_name, + "task_name_id": task_uuid, + "file_path": args.snapshot + } + + # Screenshot storage directory + screenshot_dir = os.path.join(args.snapshot, "screenshots", + f"screenshots_{experiment_config.mode}_{experiment_config.record_time}") + if not os.path.exists(screenshot_dir): + os.makedirs(screenshot_dir) + + + if is_model_supported(experiment_config.planning_text_model) and is_model_supported( + experiment_config.global_reward_text_model): + if not os.path.exists("token_results"): + os.makedirs("token_results") + token_counts_filename = f"token_results/token_counts_{experiment_config.record_time}_{experiment_config.planning_text_model}_{experiment_config.global_reward_text_model}.json" + + await run_task(mode=experiment_config.mode, + task_mode=experiment_config.config['basic']['task_mode'], + task_name=task_name, + task_uuid=task_uuid, + config=experiment_config.config, + write_result_file_path=experiment_config.write_result_file_path, + reference_task_length=reference_task_length, + # evaluate_steps=evaluate_steps, + evaluate_steps=reference_evaluate_steps, # evaluation data + reference_evaluate_steps=reference_evaluate_steps, + env=env, + global_reward_mode=experiment_config.global_reward_mode, + global_reward_text_model=experiment_config.global_reward_text_model, + planning_text_model=experiment_config.planning_text_model, + ground_truth_mode=experiment_config.ground_truth_mode, + ground_truth_data=experiment_config.ground_truth_data, + interaction_mode=experiment_config.config['steps']['interaction_mode'], + task_index=task_index, + record_time=experiment_config.record_time, + token_pricing=experiment_config.config['token_pricing'], + 
screenshot_params=screenshot_params, # support screenshot + website=website # Specified web page + ) + + await env.close() + del env + if is_model_supported(experiment_config.planning_text_model) and is_model_supported(experiment_config.global_reward_text_model): + with open(token_counts_filename, 'r') as file: + data = json.load(file) + total_token_cost = data.get("total_token_cost", 0) + + get_evaluate_result(experiment_config.config["files"]["out_file_path"], total_token_cost) + logger.info('\033[31mAll tasks finished!\033[0m') + logger.info('\033[31mPress Enter to exit...\033[0m') + + +async def main(global_reward_mode="no_global_reward", + planning_text_model="gpt-4-turbo", + global_reward_text_model="gpt-4-turbo", + single_task_name="", + single_task_website="about:blank", + raw_data_index=-1, + observation_mode="dom", + ground_truth_mode=False, + toml_path=None + ): + config = read_config(toml_path) + config['single_task_website'] = single_task_website + validate_config(config, observation_mode, global_reward_mode, planning_text_model, global_reward_text_model) + + file = None + if config['basic']['task_mode'] == "batch_tasks": + # file = read_file(file_path=config['files']['batch_tasks_file_path']) + file = read_json_file(config['files']['batch_tasks_file_path']) + task_range = get_task_range( + config['basic']['task_mode'], file, raw_data_index) + elif config['basic']['task_mode'] == "single_task": + task_range = get_task_range(config['basic']['task_mode'], None, -1) + + record_time = time.strftime("%Y%m%d-%H%M%S", time.localtime()) + write_result_file_path = generate_result_file_path(config) + ground_truth_data = load_ground_truth_data(config, ground_truth_mode) + + experiment_config = ExperimentConfig( + mode=observation_mode, + global_reward_mode=global_reward_mode, + planning_text_model=planning_text_model, + global_reward_text_model=global_reward_text_model, + ground_truth_mode=ground_truth_mode, + single_task_name=single_task_name, + config=config, + ground_truth_data=ground_truth_data, + write_result_file_path=write_result_file_path, + record_time=record_time, + file=file + ) + + await run_experiment(task_range, experiment_config) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Run the web agent in different modes.") + parser.add_argument("--global_reward_mode", + choices=["dom_vision_reward", "dom_reward", + "vision_reward", "no_global_reward"], + default="no_global_reward", help="Choose the mode of global reward.") + parser.add_argument("--index", type=str, default=-1) + parser.add_argument("--single_task_name", type=str, + default="Find Dota 2 game and add all DLC to cart in steam.") + parser.add_argument("--single_task_website", type=str, + default="about:blank", help="Website URL for single task mode") + parser.add_argument("--snapshot", type=str, default="results_o4") + parser.add_argument("--planning_text_model", type=str, default="gpt-4o-mini") + parser.add_argument("--global_reward_text_model", type=str, default="gpt-4o-mini") + + + args = parser.parse_args() + + asyncio.run(main(global_reward_mode=args.global_reward_mode, + planning_text_model=args.planning_text_model, + global_reward_text_model=args.global_reward_text_model, + single_task_name=args.single_task_name, + single_task_website=args.single_task_website, + raw_data_index=args.index + ) + ) + +# python eval.py \ +# --global_reward_mode dom_reward \ +# --index -1 \ +# --single_task_name "View the cheapest apartment available for students at the University of Leeds with 
bills that include WIFI and cleaning services." \ +# --single_task_website "https://www.student.com/" \ +# --snapshot results/test1 \ +# --planning_text_model gpt-4.1 \ +# --global_reward_text_model gpt-4.1 \ No newline at end of file diff --git a/evaluate/evaluate_utils.py b/evaluate/evaluate_utils.py index 4d20f26..723b293 100644 --- a/evaluate/evaluate_utils.py +++ b/evaluate/evaluate_utils.py @@ -1,3 +1,6 @@ +""" +The new version only supports Online-Mind2Web task testing +""" from playwright.async_api import Page import re import toml @@ -13,8 +16,20 @@ from agent.Reward.global_reward import GlobalReward from evaluate import FinishTaskEvaluator, TaskLengthEvaluator, URLEvaluator, ElementEvaluator, TextEvaluator from logs import logger - - +import json + + +def save_token_count_to_file(filename, step_tokens, task_name, global_reward_text_model, planning_text_model, token_pricing): + data = { + "task_name": task_name, + "global_reward_text_model": global_reward_text_model, + "planning_text_model": planning_text_model, + "token_pricing": token_pricing, + "step_tokens": step_tokens + } + with open(filename, "w", encoding="utf-8") as f: + json.dump(data, f) + def read_file(file_path: str = "./data/example/example_130.json") -> List[List]: """Read labeled data @@ -36,70 +51,16 @@ def read_file(file_path: str = "./data/example/example_130.json") -> List[List]: raise for task in test_data: - task_name = task["task"] - evaluation_data = task["evaluation"] - reference_task_length = task["reference_task_length"] - task_name_id = task["index"] + # task_name = task["task"] + task_name = task["confirmed_task"] + # evaluation_data = task["evaluation"] + evaluation_data = task.get("evaluation", []) + # reference_task_length = task["reference_task_length"] + reference_task_length = task["reference_length"] + task_name_id = task["task_id"] reference_evaluate_steps = [] - for i, evaluation in enumerate(evaluation_data): - match_function = evaluation["match_function_name"] - if "url" in match_function: - try: - key = evaluation["content"]["key"] - reference_answer = evaluation["content"]["reference_answer"] - reference_evaluate_steps.append({"match_function": match_function, - "key": key, "reference_answer": reference_answer, "score": 0}) - except: - logger.error( - f"URL error in task {task_name_id}, step {i}, match_function: {match_function}") - exit(1) - elif "element_path" in match_function: - try: - reference_answer = evaluation["content"]["reference_answer"] - method = evaluation["method"] - netloc = evaluation["content"]["netloc"] - reference_evaluate_steps.append({"match_function": match_function, "method": method, - "reference_answer": reference_answer, "netloc": netloc, - "score": 0}) - except: - logger.error( - f"Element path error in task {task_name_id}, step {i}, match_function: {match_function}") - exit(1) - elif "element_value" in match_function: - try: - reference_answer = evaluation["content"]["reference_answer"] - netloc = evaluation["content"]["netloc"] - if "path" in evaluation["content"].keys(): - path = evaluation["content"]["path"] - reference_evaluate_steps.append({"match_function": match_function, - "reference_answer": reference_answer, "netloc": netloc, - "path": path, "score": 0}) - else: - reference_evaluate_steps.append({"match_function": match_function, - "reference_answer": reference_answer, "netloc": netloc, - "score": 0}) - except: - logger.error( - f"Element value error in task {task_name_id}, step {i}, match_function: {match_function}") - exit(1) - elif "final_answer" 
in match_function: - try: - reference_answer = evaluation["content"]["reference_answer"] - reference_evaluate_steps.append({"match_function": match_function, - "reference_answer": reference_answer, "score": 0}) - except: - logger.error( - f"Final answer error in task {task_name_id}, step {i}, match_function: {match_function}") - exit(1) - elif "cache_data" in match_function: - try: - reference_answer = evaluation["content"]["reference_answer"] - reference_evaluate_steps.append({"match_function": match_function, - "reference_answer": reference_answer, "score": 0}) - except: - logger.error( - f"Cache data error in task {task_name_id}, step {i}, match_function: {match_function}") - exit(1) + if "evaluation" not in task: + task["evaluation"] = [] return_list.append( [task_name, task_name_id, reference_task_length, reference_evaluate_steps]) @@ -412,9 +373,11 @@ async def run_task( interaction_mode, task_index, record_time=None, - token_pricing=None -): - await env.reset("about:blank") + token_pricing=None, + screenshot_params=None, + website=None +): + await env.reset(website if website else "about:blank") response_error_count = 0 response_total_count = 0 @@ -468,6 +431,72 @@ async def run_task( token_counts_filename = f"token_results/token_counts_{record_time}_{planning_text_model}_{global_reward_text_model}.json" while num_steps < max_steps + additional_steps: + # Force the first step to navigate to a specific website + if num_steps == 0 and website and website != "about:blank": + logger.info(f"**🤖 Force the first step to navigate to a specific website: {website} 🤖**") + try: + # Force navigation to a specific website + await env.page.goto(website, wait_until="domcontentloaded", timeout=30000) + logger.info(f"-- success: {website}") + + # Update current information + current_info = {"URL": env.page.url} + + # Create a virtual navigation trace record + navigation_trace = { + "thought": f"I need to navigate to the designated website to start the task", + "action": f"goto {website}", + "reflection": f"Successfully naviged to the specified website: {website}" + } + previous_trace.append(navigation_trace) + + except Exception as e: + logger.error(f"-- fail: {e}") + error_description = f"Failed to navigate to {website}: {str(e)}" + + # Screenshot at the beginning of each step + if screenshot_params: + if mode in ["d_v", "dom_v_desc", "vision_to_dom"]: + observation, observation_VforD = await env.get_obs() + if is_valid_base64(observation_VforD): + save_screenshot( + mode=screenshot_params["mode"], + record_time=screenshot_params["record_time"], + task_name=screenshot_params["task_name"], + step_number=num_steps, + description=f"step_{num_steps}", + screenshot_base64=observation_VforD, + task_name_id=screenshot_params.get("task_name_id") + ) + else: + observation = await env.get_obs() + if isinstance(observation, dict) and is_valid_base64(observation.get("screenshot", "")): + save_screenshot( + mode=screenshot_params["mode"], + record_time=screenshot_params["record_time"], + task_name=screenshot_params["task_name"], + step_number=num_steps, + description=f"step_{num_steps}", + screenshot_base64=observation["screenshot"], + task_name_id=screenshot_params.get("task_name_id") + ) + + # save HTML(Optional) + # html_content = await env.page.content() + # html_save_path = os.path.join(screenshot_params["file_path"], "html_screenshots", f"{screenshot_params['task_name']}", + # f"step_{num_steps}_{screenshot_params['record_time']}.html") + # os.makedirs(os.path.dirname(html_save_path), exist_ok=True) + # 
with open(html_save_path, "w", encoding="utf-8") as html_file: + # html_file.write(html_content) + + # save screenshot + png_save_path = os.path.join(screenshot_params["file_path"], "img_screenshots", f"{screenshot_params['task_name']}", + f"step_{num_steps}_{screenshot_params['record_time']}.png") + os.makedirs(os.path.dirname(png_save_path), exist_ok=True) + png_bytes = await env.page.screenshot() + with open(png_save_path, "wb") as png_file: + png_file.write(png_bytes) + error_message = "" total_step_score = 0 step_reward = {} @@ -687,8 +716,7 @@ async def run_task( step_tokens["steps_token_counts"] = steps_token_counts # Update token counting - save_token_count_to_file(token_counts_filename, step_tokens, task_name, global_reward_text_model, - planning_text_model, config["token_pricing"]) + save_token_count_to_file(token_counts_filename, step_tokens, task_name, global_reward_text_model, planning_text_model, config["token_pricing"]) # ! 3. Task evaluation and scoring if task_mode == "batch_tasks":
diff --git a/experiment_results.py b/experiment_results.py index 8d9a0e4..b8d1a3b 100644 --- a/experiment_results.py +++ b/experiment_results.py @@ -24,7 +24,7 @@ def enum_to_action_str(): ("NONE", 0), ("CLICK", 1), ("GOTO", 2), - ("GOOGLE_SEARCH", 3), + # ("GOOGLE_SEARCH", 3), ("FILL_FORM", 4), ("SWITCH_TAB", 5), ("GO_BACK", 6), @@ -58,9 +58,12 @@ def to_dict(input_string): extracted_fields["fill_text"] = extracted_fields["fill_text"] if extracted_fields.get( "fill_text") else "" action = "" - if "google_search" in extracted_fields["action_type"].lower(): - action = "google_search" + "[" + extracted_fields["fill_text"] + "]" - elif "fill_search" in extracted_fields["action_type"].lower(): + # if "google_search" in extracted_fields["action_type"].lower(): + # action = "google_search" + "[" + extracted_fields["fill_text"] + "]" + # Online_Mind2Web evaluation visits the specified website directly via goto instead of google_search + # website = get_website_for_query(extracted_fields["fill_text"]) + # action = "goto" + "[" + website + "]" + if "fill_search" in extracted_fields["action_type"].lower(): action = "fill_search" + \ "[" + str(extracted_fields["element_id"]) + "," + \ extracted_fields["fill_text"] + "]" @@ -184,6 +187,21 @@ def summary(x): def get_result(input_json_path): json_result_path = input_json_path + "/json_result" out_file_path = input_json_path + "/result" + + if not os.path.exists(json_result_path): + os.makedirs(json_result_path) + if not os.path.exists(out_file_path): + os.makedirs(out_file_path) + + # Result files + if len(os.listdir(json_result_path)) == 0: + print(f"Warning: {json_result_path} is empty, no result files were generated") + empty_result = [] + out_json_file_path = out_file_path + '/out.json' + with open(out_json_file_path, 'w') as json_file: + json.dump(empty_result, json_file) + return out_file_path + task_list = [] for _, filename in enumerate(os.listdir(json_result_path)): file_path = os.path.join(json_result_path, filename)
diff --git a/logs.py b/logs.py index 732a2b7..7c5f5b3 100644 --- a/logs.py +++ b/logs.py @@ -5,8 +5,8 @@ import colorlog import re +log_folder = "results/41_dom/logs" -log_folder = "LOGS" if not os.path.exists(log_folder): os.makedirs(log_folder) log_file_name = os.path.join(
diff --git a/requirements.txt b/requirements.txt index 88d11fb..21f91d7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,4 +23,5 @@ argparse requests_toolbelt anthropic google-generativeai -tomli \ No newline at end of file +tomli +backoff==2.2.1 \ No newline at end of file
diff --git a/utils/__init__.py b/utils/__init__.py new file mode
100644 index 0000000..e69de29 diff --git a/utils/dataset_process.py b/utils/dataset_process.py new file mode 100644 index 0000000..b1c21db --- /dev/null +++ b/utils/dataset_process.py @@ -0,0 +1,132 @@ +import os +import json +import shutil +from pathlib import Path +import re + +def normalize_filename(name): + # Replace common illegal characters + replacements = { + '/': '_', + '\\': '_', + ':': '_', + '*': '', + '?': '', + '"': '', + '<': '', + '>': '', + '|': '_' + } + for char, replacement in replacements.items(): + name = name.replace(char, replacement) + return name + +def find_best_match(folder_name, task_map): + """Task of finding the best match""" + # Direct matching + if folder_name in task_map: + return task_map[folder_name] + + # Attempt to normalize matches + normalized_folder = normalize_filename(folder_name) + for task_name, task_id in task_map.items(): + if normalize_filename(task_name) == normalized_folder: + return task_id + + # Try a partial match + for task_name, task_id in task_map.items(): + if task_name.startswith(folder_name) or folder_name in task_name: + return task_id + + return None + +def process_dataset(): + base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + screenshots_dir = os.path.join(base_dir, "results", "img_screenshots") + json_results_dir = os.path.join(base_dir, "results", "json") + output_dir = os.path.join(base_dir, "dataset_4o") + + os.makedirs(output_dir, exist_ok=True) + + # task + with open(os.path.join(base_dir, "data/Online-Mind2Web/Online_Mind2Web.json"), "r") as f: + tasks = json.load(f) + # with open(os.path.join(base_dir, "data/Online-Mind2Web/hard_task.json"), "r") as f: + # tasks = json.load(f) + + # Create a task_id to confirmed_task mapping and reverse + task_map = {} + id_map = {} + for task in tasks: + if "confirmed_task" in task and "task_id" in task: + task_map[task["confirmed_task"]] = task["task_id"] + id_map[task["task_id"]] = task["confirmed_task"] + + available_json_files = set(f for f in os.listdir(json_results_dir) if f.endswith('.json')) + processed_count = 0 + failed_folders = [] + + for folder_name in os.listdir(screenshots_dir): + folder_path = os.path.join(screenshots_dir, folder_name) + if not os.path.isdir(folder_path): + continue + + task_id = find_best_match(folder_name, task_map) + + if not task_id: + print(f"Warning: No match for task ID found '{folder_name}'") + failed_folders.append(folder_name) + continue + task_dir = os.path.join(output_dir, task_id) + trajectory_dir = os.path.join(task_dir, "trajectory") + os.makedirs(trajectory_dir, exist_ok=True) + + all_images = [] + def collect_images(directory, relative_path=""): + for item in os.listdir(directory): + item_path = os.path.join(directory, item) + if os.path.isdir(item_path): + collect_images(item_path, os.path.join(relative_path, item)) + elif item.lower().endswith(('.png', '.jpg', '.jpeg')): + all_images.append({ + 'full_path': item_path, + 'relative_path': os.path.join(relative_path, item), + 'filename': item + }) + + collect_images(folder_path) + + all_images.sort(key=lambda x: int(re.findall(r'\d+', x['filename'])[0]) if re.findall(r'\d+', x['filename']) else 0) + # copy images to the trajectory directory + for i, img_info in enumerate(all_images): + file_number = re.findall(r'\d+', img_info['filename']) + file_number = file_number[0] if file_number else str(i) + file_ext = os.path.splitext(img_info['filename'])[1] + + new_filename = f"step_{file_number}_screenshot{file_ext}" + dst = os.path.join(trajectory_dir, new_filename) + + 
shutil.copy2(img_info['full_path'], dst) + + print(f"copy: {img_info['relative_path']} -> {new_filename}") + + json_filename = f"{task_id}.json" + if json_filename in available_json_files: + src = os.path.join(json_results_dir, json_filename) + dst = os.path.join(task_dir, "result.json") + shutil.copy2(src, dst) + processed_count += 1 + print(f"ok: '{folder_name}' -> {json_filename} (task: {id_map.get(task_id, 'unknown')})") + else: + print(f"Warning: Result file for task not found '{folder_name}' (task_id: {task_id})") + + print(f"Successfully processed {processed_count} tasks") + + if failed_folders: + print("\nNo task ID was found for the following folders") + for folder in failed_folders: + print(f" - {folder}") + +if __name__ == "__main__": + process_dataset() + print("ok!") diff --git a/utils/log_processor.py b/utils/log_processor.py new file mode 100755 index 0000000..3a0fa6d --- /dev/null +++ b/utils/log_processor.py @@ -0,0 +1,162 @@ +""" +This script is used to parse the log file generated by my system. +""" +import re +import json +import os + +class LogProcessor: + def __init__(self, log_file_path, task_mapping_file=None): + self.log_file_path = log_file_path + + # Get the task mapping file path from the configuration file + config_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'configs/log_config.json') + try: + with open(config_path, 'r', encoding='utf-8') as f: + config = json.load(f) + default_mapping_file = config.get('task_mapping_file', '') + except Exception: + default_mapping_file = '' + + self.task_mapping_file = task_mapping_file or default_mapping_file or os.path.join(os.path.dirname(os.path.dirname(__file__)), 'data/Online-Mind2Web/Online_Mind2Web.json') + self.task_mapping = self._load_task_mapping() + + def _load_task_mapping(self): + """ Load task mapping file """ + try: + with open(self.task_mapping_file, 'r', encoding='utf-8') as f: + return json.load(f) + except Exception as e: + print(f"Failed to load the task mapping file: {e}") + return [] + + def _get_task_id_by_task_name(self, task_name): + """Task ID""" + if not task_name: + return '' + + for task in self.task_mapping: + if task.get('confirmed_task') == task_name: + return task.get('task_id', '') + return '' + + def _get_element_description(self, element): + """ + Extract a more complete description from the element object, such as button 'Join', link 'Shop Now'. String and dict types are supported. 
+ """ + if isinstance(element, str): + return element.strip() + if isinstance(element, dict): + # type + text/aria_label/title + type_ = element.get('type') or element.get('tag') + text = element.get('text') + aria = element.get('aria_label') + title = element.get('title') + value = element.get('value') + # text > aria > title > value + desc = text or aria or title or value + if type_ and desc: + return f"{type_} '{desc}'" + elif type_: + return type_ + elif desc: + return desc + else: + return str(element) + return str(element) + + def parse_log(self): + """Parse the log file and extract the information""" + result = { + "task_id": "", + "task": "", + "final_result_response": "", + "action_history": [], + "thoughts": [] + } + + try: + with open(self.log_file_path, 'r', encoding='utf-8') as f: + content = f.read() + + # Finding the task name + task_patterns = [ + r'The question here is described as "([^"]+)"', + r'"confirmed_task": "([^"]+)"', + r'task_name: (.+?)(?:\n|\r\n)', + r'"task": "([^"]+)"' + ] + + for pattern in task_patterns: + task_match = re.search(pattern, content) + if task_match: + result["task"] = task_match.group(1).strip() + result["task_id"] = self._get_task_id_by_task_name(result["task"]) + if result["task_id"]: + break + + # Extract the action field in the last Current trace + trace_blocks = re.findall(r'Current trace: \{([^}]+)\}', content) + if trace_blocks: + last_trace = trace_blocks[-1] + action_match = re.search(r"'action': '([^']+)'", last_trace) + if action_match: + result["final_result_response"] = action_match.group(1).strip() + + # New: pre-collect all DOM_based_planning_request blocks, stored in order + dom_blocks = [] + for m in re.finditer(r'DOM_based_planning_request:\s*\{([^}]*)\}', content): + dom_blocks.append(m) + + # Process all Planning_Response blocks + planning_blocks = list(re.finditer(r'Planning_Response:\s*\{([^}]*)\}', content)) + for idx, planning_match in enumerate(planning_blocks): + block = planning_match.group(1) + action_input_match = re.search(r'"action_input": "([^"]*)"', block) + action_match = re.search(r'"action": "([^"]*)"', block) + element_id_match = re.search(r'"element_id": "([^"]*)"', block) + thought_match = re.search(r'"thought": "([^"]*)"', block) + + action_input = action_input_match.group(1) if action_input_match else "" + action = action_match.group(1) if action_match else "" + element_id = element_id_match.group(1) if element_id_match else "" + thought = thought_match.group(1).strip() if thought_match else "" + + # 1. Skip Planning_Response blocks whose action is get_final_answer + if action == "get_final_answer": + continue + + # 2. Build action_history + if not element_id: + # No element_id: keep the original logic + if action_input or action: + result["action_history"].append(f"{action_input} -> {action}") + else: + # element_id present: look up the nearest DOM_based_planning_request + # only blocks that appear before the current Planning_Response are considered + dom_content = "" + for dom_m in reversed(dom_blocks): + if dom_m.start() < planning_match.start(): + dom_block = dom_m.group(1) + try: + dom_json = json.loads('{' + dom_block + '}') + dom_obj = dom_json.get(element_id, "") + dom_content = self._get_element_description(dom_obj) + except Exception: + dom_content = "" + break + if dom_content: + result["action_history"].append(f"{dom_content} -> {action}") + else: + # If nothing is found, fall back to action_input -> action + # result["action_history"].append(f"{element_id} -> {action}") + result["action_history"].append(f"{action_input} -> {action}") + + + # 3. Collect thoughts (Planning_Response blocks with get_final_answer are skipped above as well)
+ if thought: + result["thoughts"].append(thought) + except Exception as e: + print(f"Error parsing log file: {e}") + + return result \ No newline at end of file
diff --git a/utils/parser.py b/utils/parser.py new file mode 100644 index 0000000..fa958b5 --- /dev/null +++ b/utils/parser.py @@ -0,0 +1,54 @@ +""" +This script is used to parse the log file generated by my system. +Run this script to parse the log files into the JSON format required by Online-Mind2Web. +The parameters are configured in: configs/log_config.json +""" +import os +import json +from log_processor import LogProcessor +from utils import save_json, load_json + +def main(): + config_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'configs/log_config.json') + config = load_json(config_path) + log_directory = config.get('log_directory', '') + output_directory = config.get('output_directory', 'results') + task_mapping_file = config.get('task_mapping_file', '') + + if not log_directory: + print("Error: log_directory is not set") + return + + if not os.path.exists(task_mapping_file): + print(f"Error: Task mapping file does not exist: {task_mapping_file}") + + if not os.path.exists(output_directory): + os.makedirs(output_directory) + + print(f"Start processing the log file directory: {log_directory}") + processed_count = 0 + + for filename in os.listdir(log_directory): + if filename.endswith('.log'): + try: + log_file_path = os.path.join(log_directory, filename) + print(f"Processing: {filename}") + + processor = LogProcessor(log_file_path, task_mapping_file) + result = processor.parse_log() + + if not result["task_id"]: + print(f"Error: The task ID or task name cannot be extracted from the log file: {filename}") + continue + + output_file = os.path.join(output_directory, f"{result['task_id']}.json") + save_json(output_file, result) + processed_count += 1 + + except Exception as e: + print(f"Error: {filename}: {e}") + + print(f"Finished: {processed_count} log files were successfully parsed") + +if __name__ == '__main__': + main() \ No newline at end of file
diff --git a/utils/utils.py b/utils/utils.py new file mode 100644 index 0000000..5d9b1a4 --- /dev/null +++ b/utils/utils.py @@ -0,0 +1,23 @@ +import json +import os + +def load_json(file_path): + """Load JSON data from a file""" + try: + with open(file_path, 'r', encoding='utf-8') as f: + return json.load(f) + except Exception as e: + print(f"Failed to load JSON file {file_path}: {e}") + return {} + +def save_json(file_path, data): + try: + directory = os.path.dirname(file_path) + if directory and not os.path.exists(directory): + os.makedirs(directory) + + with open(file_path, 'w', encoding='utf-8') as f: + json.dump(data, f, ensure_ascii=False, indent=4) + print(f"The results have been saved to {file_path}") + except Exception as e: + print(f"Failed to save the JSON file {file_path}: {e}") \ No newline at end of file
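
Editor's note: `utils/parser.py` and `utils/log_processor.py` both read `configs/log_config.json`, but the patch does not add that file. Below is a minimal sketch of what the two scripts appear to expect. The three key names come from the `config.get(...)` calls in the patch; the concrete paths are illustrative assumptions (the log folder mirrors the `log_folder` set in `logs.py`, and the mapping file mirrors the default used by `LogProcessor`) and should be adapted to the actual run.

```python
# Sketch: generate a configs/log_config.json that utils/parser.py can consume.
# Key names follow parser.py / log_processor.py; the paths below are assumptions.
from utils.utils import save_json

log_config = {
    "log_directory": "results/41_dom/logs",        # directory containing the *.log files of a run
    "output_directory": "results/json",            # where the per-task <task_id>.json files are written (assumed)
    "task_mapping_file": "data/Online-Mind2Web/Online_Mind2Web.json",  # maps confirmed_task -> task_id
}

save_json("configs/log_config.json", log_config)  # save_json creates the configs/ directory if missing
```

With the config in place, the flow suggested by this patch appears to be: run `parser.py` (from inside `utils/`, since it imports `log_processor` and `utils` directly) to turn the run logs into per-task JSON results, then `dataset_process.py` to pair those results with the step screenshots in the per-task `trajectory/` layout used for scoring.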