Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,7 @@ package.json
results_wodom/
results_wodom33/
scripts/
eval_agent/
eval_agent/
dataset/
dataset_4o/
results*
313 changes: 31 additions & 282 deletions README.md

Large diffs are not rendered by default.

28 changes: 14 additions & 14 deletions agent/Environment/html_env/actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ class ActionTypes(IntEnum):
NONE = 0
CLICK = 1
GOTO = 2
GOOGLE_SEARCH = 3
# GOOGLE_SEARCH = 3  # Google search is disabled
FILL_FORM = 4
SWITCH_TAB = 5
GO_BACK = 6
Expand Down Expand Up @@ -103,16 +103,16 @@ def create_fill_search_action(elementid: int, fill_text: str) -> Action:
"element_name": ""
}


@beartype
def create_search_action(elementid: int, text: str) -> Action:
    """Build a GOOGLE_SEARCH action.

    Args:
        elementid: Stored unchanged under ``element_id``.
        text: Search query, stored under ``fill_text``.

    Returns:
        Action: Mapping with ``url`` fixed to the Google home page and an
        empty ``element_name``.
    """
    search_action = dict(
        action_type=ActionTypes.GOOGLE_SEARCH,
        element_id=elementid,
        url="https://www.google.com",
        fill_text=text,
        element_name="",
    )
    return search_action
# Google search is disabled
# @beartype
# def create_search_action(elementid: int, text: str) -> Action:
# return {
# "action_type": ActionTypes.GOOGLE_SEARCH,
# "element_id": elementid,
# "url": "https://www.google.com",
# "fill_text": text,
# "element_name": ""
# }


@beartype
Expand Down Expand Up @@ -176,8 +176,8 @@ def create_action(elementid: int, action_type: str, action_input: str) -> Action
return create_fill_search_action(elementid=elementid, fill_text=action_input)
elif action_type == "goto":
return create_goto_action(elementid=elementid, url=action_input)
elif action_type == "google_search":
return create_search_action(elementid=elementid, text=action_input)
# elif action_type == "google_search":  # Google search is disabled
# return create_search_action(elementid=elementid, text=action_input)
elif action_type == "go_back":
return create_go_back_action(elementid=elementid)
elif action_type == "select_option":
Expand All @@ -203,7 +203,7 @@ def create_action(elementid: int, action_type: str, action_input: str) -> Action
"create_fill_action",
"create_none_action",
"create_goto_action",
"create_search_action",
# "create_search_action", # 禁用Google search
"create_go_back_action",
"create_fill_search_action",
"create_select_option_action",
Expand Down
154 changes: 79 additions & 75 deletions agent/Environment/html_env/async_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -392,79 +392,82 @@ async def fill_form(self, action):
self.html_content = await self.page.content()
except Exception as e:
raise e

async def search(self, action):
"""Use Node.js to call Google Custom Search API"""
try:
# Execute Node.js script
process = await asyncio.create_subprocess_exec(
'node',
self.search_script_path,
action["fill_text"],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE
)

# Get output
stdout, stderr = await process.communicate()
# disable Google search
# async def search(self, action):
# """Use Node.js to call Google Custom Search API"""
# try:
# # Execute Node.js script
# process = await asyncio.create_subprocess_exec(
# 'node',
# self.search_script_path,
# action["fill_text"],
# stdout=subprocess.PIPE,
# stderr=subprocess.PIPE
# )

if process.returncode == 0:
try:
# Parse the JSON response
data = json.loads(stdout.decode().strip())
# # Get output
# stdout, stderr = await process.communicate()

# if process.returncode == 0:
# try:
# # Parse the JSON response
# data = json.loads(stdout.decode().strip())

if 'items' in data:
# Create HTML from search results
results_html = self._create_search_results_page(data['items'])
self.html_content = results_html
else:
self.html_content = "<html><body><p>No results found.</p></body></html>"
except json.JSONDecodeError as e:
logger.error(f"Failed to parse JSON response: {e}")
self.html_content = "<html><body><p>Error parsing search results.</p></body></html>"
else:
error_msg = stderr.decode().strip()
logger.error(f"Search script error: {error_msg}")
self.html_content = f"<html><body><p>Search error: {error_msg}</p></body></html>"
# if 'items' in data:
# # Create HTML from search results
# results_html = self._create_search_results_page(data['items'])
# self.html_content = results_html
# else:
# self.html_content = "<html><body><p>No results found.</p></body></html>"
# except json.JSONDecodeError as e:
# logger.error(f"Failed to parse JSON response: {e}")
# self.html_content = "<html><body><p>Error parsing search results.</p></body></html>"
# else:
# error_msg = stderr.decode().strip()
# logger.error(f"Search script error: {error_msg}")
# self.html_content = f"<html><body><p>Search error: {error_msg}</p></body></html>"

# Update the page content
await self.page.set_content(self.html_content)
# # Update the page content
# await self.page.set_content(self.html_content)

except Exception as e:
logger.error(f"Search error: {str(e)}")
self.html_content = f"<html><body><p>Search error: {str(e)}</p></body></html>"
await self.page.set_content(self.html_content)

def _create_search_results_page(self, items):
    """Create an HTML page from search results.

    Args:
        items: Iterable of mappings, one per search hit. Each mapping is read
            with ``.get`` for the optional keys ``link``, ``title`` and
            ``snippet``; missing keys fall back to ``''``, ``'No title'`` and
            ``'No description available'``. ``None`` is treated as no results.

    Returns:
        str: A complete HTML document containing one ``div.search-result``
        per item inside a ``div.search-results`` wrapper.

    The field values come from an external search response, so every value is
    HTML-escaped before interpolation. Without escaping, a title or snippet
    containing markup (or a link containing a double quote) would be injected
    verbatim into the page that is later loaded into the browser.
    """
    import html  # local import: keeps this block self-contained

    def _esc(value):
        # quote=True also escapes " and ', which is required inside href="...".
        return html.escape(str(value), quote=True)

    results = []
    for item in items or ():
        link = _esc(item.get('link', ''))
        title = _esc(item.get('title', 'No title'))
        snippet = _esc(item.get('snippet', 'No description available'))
        results.append(f"""
        <div class="search-result">
            <h3><a href="{link}">{title}</a></h3>
            <div class="url">{link}</div>
            <div class="snippet">{snippet}</div>
        </div>
        """)

    joined_results = ''.join(results)
    return f"""
    <html>
    <head>
        <style>
            .search-result {{ margin-bottom: 20px; padding: 10px; }}
            .url {{ color: green; margin: 5px 0; }}
            .snippet {{ color: #545454; }}
        </style>
    </head>
    <body>
        <div class="search-results">
            {joined_results}
        </div>
    </body>
    </html>
    """
# except Exception as e:
# logger.error(f"Search error: {str(e)}")
# self.html_content = f"<html><body><p>Search error: {str(e)}</p></body></html>"
# await self.page.set_content(self.html_content)


# disable Google search
# def _create_search_results_page(self, items):
# """Create an HTML page from search results"""
# results = []
# for item in items:
# result = f"""
# <div class="search-result">
# <h3><a href="{item.get('link', '')}">{item.get('title', 'No title')}</a></h3>
# <div class="url">{item.get('link', '')}</div>
# <div class="snippet">{item.get('snippet', 'No description available')}</div>
# </div>
# """
# results.append(result)

# html = f"""
# <html>
# <head>
# <style>
# .search-result {{ margin-bottom: 20px; padding: 10px; }}
# .url {{ color: green; margin: 5px 0; }}
# .snippet {{ color: #545454; }}
# </style>
# </head>
# <body>
# <div class="search-results">
# {''.join(results)}
# </div>
# </body>
# </html>
# """
# return html

async def go_back_last_page(self, action):
# self.page = self.last_page
Expand Down Expand Up @@ -633,13 +636,14 @@ async def execute_action(self, action: Action) -> Union[str, Tuple[str, str]]:
error_message = f"Failed to execute fill_form [{action['element_id']},{action['fill_text']}] action. An error({e}) occur."
raise ActionExecutionError(
action['action_type'], error_message) from e
case ActionTypes.GOOGLE_SEARCH:
try:
await self.search(action)
except Exception as e:
error_message = f"Failed to execute google_search[{action['fill_text']}] action. An error({e}) occur."
raise ActionExecutionError(
action['action_type'], error_message) from e
# Google search is disabled
# case ActionTypes.GOOGLE_SEARCH:
# try:
# await self.search(action)
# except Exception as e:
# error_message = f"Failed to execute google_search[{action['fill_text']}] action. An error({e}) occur."
# raise ActionExecutionError(
# action['action_type'], error_message) from e
case ActionTypes.GO_BACK:
try:
await self.go_back_last_page(action)
Expand Down
15 changes: 11 additions & 4 deletions agent/LLM/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def __init__(self, model=None):
self.model = model
self.client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

async def request(self, messages: list = None, max_tokens: int = 100000, temperature: float = 0.7) -> (str, str):
async def request(self, messages: list = None, max_tokens: int = 4096, temperature: float = 0.7) -> (str, str):
try:
if "gpt-3.5" in self.model:
messages = truncate_messages_based_on_estimated_tokens(messages, max_tokens=16385)
Expand All @@ -47,7 +47,7 @@ async def request(self, messages: list = None, max_tokens: int = 100000, tempera
logger.error(f"Error in GPTGenerator.request: {e}")
return "", str(e)

async def chat(self, messages, max_tokens=100000, temperature=0.7):
async def chat(self, messages, max_tokens=4096, temperature=0.7):
loop = asyncio.get_event_loop()

# Dynamically select field names
Expand All @@ -67,13 +67,20 @@ async def chat(self, messages, max_tokens=100000, temperature=0.7):
elif "gpt-4.1" in self.model:
data = {
'model': self.model,
token_key: 32768, # gpt-4.1 max_tokens = 32768
token_key: 4096, # gpt-4.1 max_tokens = 32768
'messages': messages,
}
elif "gpt-4o" in self.model:
data = {
'model': self.model,
'max_tokens': 16384,
'temperature': temperature,
'messages': messages,
}
else:
data = {
'model': self.model,
# 'max_tokens': max_tokens,
'max_tokens': 16384,
token_key: max_tokens,
'temperature': temperature,
'messages': messages,
Expand Down
5 changes: 3 additions & 2 deletions agent/Plan/planning.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ async def execute(self, status_description, user_request, previous_trace, observ
vision_act_response)
actions = {
'goto': "Found 'goto' in the vision_act_response.",
'google_search': "Found 'google_search' in the vision_act_response.",
# 'google_search': "Found 'google_search' in the vision_act_response.",
'switch_tab': "Found 'switch_tab' in the vision_act_response.",
'scroll_down': "Found 'scroll_down' in the vision_act_response.",
'scroll_up': "Found 'scroll_up' in the vision_act_response.",
Expand All @@ -94,7 +94,8 @@ async def execute(self, status_description, user_request, previous_trace, observ
break

if not actions_found:
print("None of 'goto', 'google_search', 'switch_tab', 'scroll_down', 'scroll_up', or 'go_back' were found in the vision_act_response.")
# print("None of 'goto', 'google_search', 'switch_tab', 'scroll_down', 'scroll_up', or 'go_back' were found in the vision_act_response.")
print("None of 'goto', 'switch_tab', 'scroll_down', 'scroll_up', or 'go_back' were found in the vision_act_response.")

target_element = planning_response_get.get('target_element')
description = planning_response_get.get('description')
Expand Down
5 changes: 3 additions & 2 deletions agent/Prompt/base_prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ class BasePrompts:
**Execution Action Space**:
- goto: useful for when you need visit a new link or a website, it will open a new tab.
- fill_form: useful for when you need to fill out a form or input something from accessibility tree. Input should be a string.
- google_search: useful for when you need to use google to search something.
- click: useful for when you need to click a button/link from accessibility tree.
- select_option: useful for when you need to select a drop-down box value. When you get (select and option) tags from the accessibility tree, you need to select the serial number(element_id) corresponding to the select tag, not the option, and select the most likely content corresponding to the option as Input.
- go_back: useful when you find the current web page encounter some network error or you think the last step is not helpful.
Expand All @@ -57,7 +56,9 @@ class BasePrompts:

You have to follow the instructions or notes:
**Important Notes**:
- Under the following conditions, you are restricted to using the `google_search` or `goto` tools exclusively:
- The first step must be a goto, especially when the page is blank (about: blank).
- It is not allowed to perform operations other than goto on blank pages, such as click and other interactive operations.
- Under the following conditions, you are restricted to using the `goto` tool exclusively:
1. In the initial step of a process or when there's no preceding interaction history (i.e., the previous trace is empty).
2. In situations where the accessibility tree is absent or not provided.
- Your action should not be the same as last step's action.
Expand Down
16 changes: 11 additions & 5 deletions agent/Utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,17 +51,23 @@ def read_json_file(file_path):


def save_screenshot(mode: str, record_time: str, task_name: str, step_number: int, description: str,
screenshot_base64: str, task_name_id: str = None):

screenshot_base64: str, task_name_id: str = None, task_uuid: str = None):# add task_uuid
# Prefer task_uuid; fall back to task_name_id when task_uuid is None
identifier = task_uuid if task_uuid is not None else task_name_id

timestamp = datetime.now().strftime('%Y%m%d-%H%M%S')
invalid_chars = '<>:"/\\|?*'
for char in invalid_chars:
task_name = task_name.replace(char, '_')

if task_name_id is None:
# if task_name_id is None:
# task_folder = f'results/screenshots/screenshots_{mode}_{record_time}/{task_name}'
# else:
# task_folder = f'results/screenshots/screenshots_{mode}_{record_time}/{task_name_id}_{task_name}'

if identifier is None:
task_folder = f'results/screenshots/screenshots_{mode}_{record_time}/{task_name}'
else:
task_folder = f'results/screenshots/screenshots_{mode}_{record_time}/{task_name_id}_{task_name}'
task_folder = f'results/screenshots/screenshots_{mode}_{record_time}/{identifier}_{task_name}'
if not os.path.exists(task_folder):
os.makedirs(task_folder)

Expand Down
19 changes: 12 additions & 7 deletions batch_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,14 @@
2. Support Online_Mind2Web task evaluation
3. Support access to gpt-4.1, o3-mini, o4-mini and other models

Tips: To run in a Linux environment without a visual interface, use the following command to start
Tips: To run in a Linux environment without a visual interface, use the following command to start:
sudo yum install -y xorg-x11-server-Xvfb
xvfb-run python batch_eval.py

Ubuntu/Debian users can use the following commands to install xvfb:
sudo apt-get update
sudo apt-get install -y xvfb
xvfb-run python batch_eval.py
"""
#!/usr/bin/env python3
import json
Expand Down Expand Up @@ -50,23 +55,23 @@ def main():
parser = argparse.ArgumentParser(description='Online-Mind2Web Task')
parser.add_argument('--json_path', type=str, default='data/Online-Mind2Web/Online_Mind2Web.json',
help='JSON task file path')
parser.add_argument('--global_reward_mode', type=str, default='no_global_reward',
help='Global Reward Mode: dom_reward/no_global_reward')
parser.add_argument('--global_reward_mode', type=str, default='dom_reward',
help='Global Reward Mode: dom_reward/no_global_reward/dom_vision_reward')
parser.add_argument('--index', type=int, default=-1,
help='Task index')
parser.add_argument('--snapshot', type=str, default='results',
parser.add_argument('--snapshot', type=str, default='results/41_dom',
help='Snapshot directory')
parser.add_argument('--planning_text_model', type=str, default='gpt-4.1',
help='planning_text_model')
help='planning_text_model: gpt-4.1/gpt-4o-2024-08-06')
parser.add_argument('--global_reward_text_model', type=str, default='gpt-4.1',
help='global_reward_text_model')
help='global_reward_text_model: gpt-4.1/gpt-4o-2024-08-06')
parser.add_argument('--start_idx', type=int, default=0,
help='The index to start the task')
parser.add_argument('--end_idx', type=int, default=None,
help='The index of the finished task (excluding)')
parser.add_argument('--delay', type=int, default=5,
help='Latency between tasks (seconds)')
parser.add_argument('--output_log', type=str, default='batch_run_log.txt',
parser.add_argument('--output_log', type=str, default='results/41_dom/batch_run_log.txt',
help='output_log')

args = parser.parse_args()
Expand Down
Loading