iMeanAI · Syclus123 · May 30, 2025 · May 29, 2025 · May 30, 2025
diff --git a/.gitignore b/.gitignore
@@ -35,4 +35,7 @@ package.json
 results_wodom/
 results_wodom33/
 scripts/
-eval_agent/
+eval_agent/
+dataset/
+dataset_4o/
+results*
diff --git a/README.md b/README.md
diff --git a/agent/Environment/html_env/actions.py b/agent/Environment/html_env/actions.py
@@ -15,7 +15,7 @@ class ActionTypes(IntEnum):
     NONE = 0
     CLICK = 1
     GOTO = 2
-    GOOGLE_SEARCH = 3
+    # GOOGLE_SEARCH = 3  # Google search disabled
     FILL_FORM = 4
     SWITCH_TAB = 5
     GO_BACK = 6
@@ -103,16 +103,16 @@ def create_fill_search_action(elementid: int, fill_text: str) -> Action:
         "element_name": ""
     }
 
-
-@beartype
-def create_search_action(elementid: int, text: str) -> Action:
-    return {
-        "action_type": ActionTypes.GOOGLE_SEARCH,
-        "element_id": elementid,
-        "url": "https://www.google.com",
-        "fill_text": text,
-        "element_name": ""
-    }
+# Google search disabled
+# @beartype
+# def create_search_action(elementid: int, text: str) -> Action:
+#     return {
+#         "action_type": ActionTypes.GOOGLE_SEARCH,
+#         "element_id": elementid,
+#         "url": "https://www.google.com",
+#         "fill_text": text,
+#         "element_name": ""
+#     }
 
 
 @beartype
@@ -176,8 +176,8 @@ def create_action(elementid: int, action_type: str, action_input: str) -> Action
         return create_fill_search_action(elementid=elementid, fill_text=action_input)
     elif action_type == "goto":
         return create_goto_action(elementid=elementid, url=action_input)
-    elif action_type == "google_search":
-        return create_search_action(elementid=elementid, text=action_input)
+    # elif action_type == "google_search":  # Google search disabled
+    #     return create_search_action(elementid=elementid, text=action_input)
     elif action_type == "go_back":
         return create_go_back_action(elementid=elementid)
     elif action_type == "select_option":
@@ -203,7 +203,7 @@ def create_action(elementid: int, action_type: str, action_input: str) -> Action
     "create_fill_action",
     "create_none_action",
     "create_goto_action",
-    "create_search_action",
+    # "create_search_action",  # Google search disabled
     "create_go_back_action",
     "create_fill_search_action",
     "create_select_option_action",

diff --git a/agent/Environment/html_env/async_env.py b/agent/Environment/html_env/async_env.py
@@ -392,79 +392,82 @@ async def fill_form(self, action):
                 self.html_content = await self.page.content()
             except Exception as e:
                 raise e
-
-    async def search(self, action):
-        """Use Node.js to call Google Custom Search API"""
-        try:
-            # Execute Node.js script
-            process = await asyncio.create_subprocess_exec(
-                'node', 
-                self.search_script_path, 
-                action["fill_text"],
-                stdout=subprocess.PIPE,
-                stderr=subprocess.PIPE
-            )
 
-            # Get output
-            stdout, stderr = await process.communicate()
+    # disable Google search
+    # async def search(self, action):
+    #     """Use Node.js to call Google Custom Search API"""
+    #     try:
+    #         # Execute Node.js script
+    #         process = await asyncio.create_subprocess_exec(
+    #             'node', 
+    #             self.search_script_path, 
+    #             action["fill_text"],
+    #             stdout=subprocess.PIPE,
+    #             stderr=subprocess.PIPE
+    #         )
 
-            if process.returncode == 0:
-                try:
-                    # Parse the JSON response
-                    data = json.loads(stdout.decode().strip())
+    #         # Get output
+    #         stdout, stderr = await process.communicate()
+
+    #         if process.returncode == 0:
+    #             try:
+    #                 # Parse the JSON response
+    #                 data = json.loads(stdout.decode().strip())
 
-                    if 'items' in data:
-                        # Create HTML from search results
-                        results_html = self._create_search_results_page(data['items'])
-                        self.html_content = results_html
-                    else:
-                        self.html_content = "<html><body><p>No results found.</p></body></html>"
-                except json.JSONDecodeError as e:
-                    logger.error(f"Failed to parse JSON response: {e}")
-                    self.html_content = "<html><body><p>Error parsing search results.</p></body></html>"
-            else:
-                error_msg = stderr.decode().strip()
-                logger.error(f"Search script error: {error_msg}")
-                self.html_content = f"<html><body><p>Search error: {error_msg}</p></body></html>"
+    #                 if 'items' in data:
+    #                     # Create HTML from search results
+    #                     results_html = self._create_search_results_page(data['items'])
+    #                     self.html_content = results_html
+    #                 else:
+    #                     self.html_content = "<html><body><p>No results found.</p></body></html>"
+    #             except json.JSONDecodeError as e:
+    #                 logger.error(f"Failed to parse JSON response: {e}")
+    #                 self.html_content = "<html><body><p>Error parsing search results.</p></body></html>"
+    #         else:
+    #             error_msg = stderr.decode().strip()
+    #             logger.error(f"Search script error: {error_msg}")
+    #             self.html_content = f"<html><body><p>Search error: {error_msg}</p></body></html>"
 
-            # Update the page content
-            await self.page.set_content(self.html_content)
+    #         # Update the page content
+    #         await self.page.set_content(self.html_content)
 
-        except Exception as e:
-            logger.error(f"Search error: {str(e)}")
-            self.html_content = f"<html><body><p>Search error: {str(e)}</p></body></html>"
-            await self.page.set_content(self.html_content)
-
-    def _create_search_results_page(self, items):
-        """Create an HTML page from search results"""
-        results = []
-        for item in items:
-            result = f"""
-            <div class="search-result">
-                <h3><a href="{item.get('link', '')}">{item.get('title', 'No title')}</a></h3>
-                <div class="url">{item.get('link', '')}</div>
-                <div class="snippet">{item.get('snippet', 'No description available')}</div>
-            </div>
-            """
-            results.append(result)
-
-        html = f"""
-        <html>
-        <head>
-            <style>
-                .search-result {{ margin-bottom: 20px; padding: 10px; }}
-                .url {{ color: green; margin: 5px 0; }}
-                .snippet {{ color: #545454; }}
-            </style>
-        </head>
-        <body>
-            <div class="search-results">
-                {''.join(results)}
-            </div>
-        </body>
-        </html>
-        """
-        return html
+    #     except Exception as e:
+    #         logger.error(f"Search error: {str(e)}")
+    #         self.html_content = f"<html><body><p>Search error: {str(e)}</p></body></html>"
+    #         await self.page.set_content(self.html_content)
+
+
+    # disable Google search
+    # def _create_search_results_page(self, items):
+    #     """Create an HTML page from search results"""
+    #     results = []
+    #     for item in items:
+    #         result = f"""
+    #         <div class="search-result">
+    #             <h3><a href="{item.get('link', '')}">{item.get('title', 'No title')}</a></h3>
+    #             <div class="url">{item.get('link', '')}</div>
+    #             <div class="snippet">{item.get('snippet', 'No description available')}</div>
+    #         </div>
+    #         """
+    #         results.append(result)
+
+    #     html = f"""
+    #     <html>
+    #     <head>
+    #         <style>
+    #             .search-result {{ margin-bottom: 20px; padding: 10px; }}
+    #             .url {{ color: green; margin: 5px 0; }}
+    #             .snippet {{ color: #545454; }}
+    #         </style>
+    #     </head>
+    #     <body>
+    #         <div class="search-results">
+    #             {''.join(results)}
+    #         </div>
+    #     </body>
+    #     </html>
+    #     """
+    #     return html
 
     async def go_back_last_page(self, action):
         # self.page = self.last_page
@@ -633,13 +636,14 @@ async def execute_action(self, action: Action) -> Union[str, Tuple[str, str]]:
                     error_message = f"Failed to execute fill_form [{action['element_id']},{action['fill_text']}] action. An error({e}) occur."
                     raise ActionExecutionError(
                         action['action_type'], error_message) from e
-            case ActionTypes.GOOGLE_SEARCH:
-                try:
-                    await self.search(action)
-                except Exception as e:
-                    error_message = f"Failed to execute google_search[{action['fill_text']}] action. An error({e}) occur."
-                    raise ActionExecutionError(
-                        action['action_type'], error_message) from e
+            # Google search disabled
+            # case ActionTypes.GOOGLE_SEARCH:
+            #     try:
+            #         await self.search(action)
+            #     except Exception as e:
+            #         error_message = f"Failed to execute google_search[{action['fill_text']}] action. An error({e}) occur."
+            #         raise ActionExecutionError(
+            #             action['action_type'], error_message) from e
             case ActionTypes.GO_BACK:
                 try:
                     await self.go_back_last_page(action)

diff --git a/agent/LLM/openai.py b/agent/LLM/openai.py
@@ -20,7 +20,7 @@ def __init__(self, model=None):
         self.model = model
         self.client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
 
-    async def request(self, messages: list = None, max_tokens: int = 100000, temperature: float = 0.7) -> (str, str):
+    async def request(self, messages: list = None, max_tokens: int = 4096, temperature: float = 0.7) -> (str, str):
         try:
             if "gpt-3.5" in self.model:
                 messages = truncate_messages_based_on_estimated_tokens(messages, max_tokens=16385)
@@ -47,7 +47,7 @@ async def request(self, messages: list = None, max_tokens: int = 100000, tempera
             logger.error(f"Error in GPTGenerator.request: {e}")
             return "", str(e)
 
-    async def chat(self, messages, max_tokens=100000, temperature=0.7):
+    async def chat(self, messages, max_tokens=4096, temperature=0.7):
         loop = asyncio.get_event_loop()
 
         # Dynamically select field names
@@ -67,13 +67,20 @@ async def chat(self, messages, max_tokens=100000, temperature=0.7):
         elif "gpt-4.1" in self.model:
             data = {
                 'model': self.model,
-                token_key: 32768, # gpt-4.1 max_tokens = 32768
+                token_key: 4096, # gpt-4.1 max_tokens = 32768
+                'messages': messages,
+            }
+        elif "gpt-4o" in self.model:
+            data = {
+                'model': self.model,
+                'max_tokens': 16384,
+                'temperature': temperature,
                 'messages': messages,
             }
         else:
             data = {
                 'model': self.model,
-                # 'max_tokens': max_tokens,
+                'max_tokens': 16384,
                 token_key: max_tokens,
             'temperature': temperature,
             'messages': messages,

diff --git a/agent/Plan/planning.py b/agent/Plan/planning.py
@@ -77,7 +77,7 @@ async def execute(self, status_description, user_request, previous_trace, observ
                 vision_act_response)
             actions = {
                 'goto': "Found 'goto' in the vision_act_response.",
-                'google_search': "Found 'google_search' in the vision_act_response.",
+                # 'google_search': "Found 'google_search' in the vision_act_response.",
                 'switch_tab': "Found 'switch_tab' in the vision_act_response.",
                 'scroll_down': "Found 'scroll_down' in the vision_act_response.",
                 'scroll_up': "Found 'scroll_up' in the vision_act_response.",
@@ -94,7 +94,8 @@ async def execute(self, status_description, user_request, previous_trace, observ
                     break
 
             if not actions_found:
-                print("None of 'goto', 'google_search', 'switch_tab', 'scroll_down', 'scroll_up', or 'go_back' were found in the vision_act_response.")
+                # print("None of 'goto', 'google_search', 'switch_tab', 'scroll_down', 'scroll_up', or 'go_back' were found in the vision_act_response.")
+                print("None of 'goto', 'switch_tab', 'scroll_down', 'scroll_up', or 'go_back' were found in the vision_act_response.")
 
                 target_element = planning_response_get.get('target_element')
                 description = planning_response_get.get('description')

diff --git a/agent/Prompt/base_prompts.py b/agent/Prompt/base_prompts.py
@@ -42,7 +42,6 @@ class BasePrompts:
         **Execution Action Space**:
             - goto: useful for when you need visit a new link or a website, it will open a new tab.
             - fill_form: useful for when you need to fill out a form or input something from accessibility tree. Input should be a string.
-            - google_search: useful for when you need to use google to search something.
             - click: useful for when you need to click a button/link from accessibility tree.
             - select_option: useful for when you need to select a drop-down box value. When you get (select and option) tags from the accessibility tree, you need to select the serial number(element_id) corresponding to the select tag, not the option, and select the most likely content corresponding to the option as Input.
             - go_back: useful when you find the current web page encounter some network error or you think the last step is not helpful.
@@ -57,7 +56,9 @@ class BasePrompts:
 
         You have to follow the instructions or notes:
         **Important Notes**:
-            - Under the following conditions, you are restricted to using the `google_search` or `goto` tools exclusively: 
+            - The first step must be a goto, especially when the page is blank (about: blank).
+            - It is not allowed to perform operations other than goto on blank pages, such as click and other interactive operations.
+            - Under the following conditions, you are restricted to using the `goto` tool exclusively: 
                 1. In the initial step of a process or when there's no preceding interaction history (i.e., the previous trace is empty). 
                 2. In situations where the accessibility tree is absent or not provided.
             - Your action should not be the same as last step's action.

diff --git a/agent/Utils/utils.py b/agent/Utils/utils.py
@@ -51,17 +51,23 @@ def read_json_file(file_path):
 
 
 def save_screenshot(mode: str, record_time: str, task_name: str, step_number: int, description: str,
-                    screenshot_base64: str, task_name_id: str = None):
-
+                    screenshot_base64: str, task_name_id: str = None, task_uuid: str = None):# add task_uuid
+    # Prefer task_uuid when provided; otherwise fall back to task_name_id
+    identifier = task_uuid if task_uuid is not None else task_name_id
+
     timestamp = datetime.now().strftime('%Y%m%d-%H%M%S')
     invalid_chars = '<>:"/\\|?*'
     for char in invalid_chars:
         task_name = task_name.replace(char, '_')
-
-    if task_name_id is None:
+    # if task_name_id is None:
+    #     task_folder = f'results/screenshots/screenshots_{mode}_{record_time}/{task_name}'
+    # else:
+    #     task_folder = f'results/screenshots/screenshots_{mode}_{record_time}/{task_name_id}_{task_name}'
+
+    if identifier is None:
         task_folder = f'results/screenshots/screenshots_{mode}_{record_time}/{task_name}'
     else:
-        task_folder = f'results/screenshots/screenshots_{mode}_{record_time}/{task_name_id}_{task_name}'
+        task_folder = f'results/screenshots/screenshots_{mode}_{record_time}/{identifier}_{task_name}'
     if not os.path.exists(task_folder):
         os.makedirs(task_folder)
 

diff --git a/batch_eval.py b/batch_eval.py
@@ -5,9 +5,14 @@
 2. Support Online_Mind2Web task evaluation
 3. Support access to gpt-4.1, o3-mini, o4-mini and other models
 
-Tips: To run in a Linux environment without a visual interface, use the following command to start
+Tips: To run in a Linux environment without a visual interface, use the following command to start:
     sudo yum install -y xorg-x11-server-Xvfb
     xvfb-run python batch_eval.py
+
+    Ubuntu/Debian users can use the following commands to install xvfb:
+    sudo apt-get update
+    sudo apt-get install -y xvfb
+    xvfb-run python batch_eval.py
 """
 #!/usr/bin/env python3
 import json
@@ -50,23 +55,23 @@ def main():
     parser = argparse.ArgumentParser(description='Online-Mind2Web Task')
     parser.add_argument('--json_path', type=str, default='data/Online-Mind2Web/Online_Mind2Web.json',
                         help='JSON task file path')
-    parser.add_argument('--global_reward_mode', type=str, default='no_global_reward',
-                        help='Global Reward Mode: dom_reward/no_global_reward')
+    parser.add_argument('--global_reward_mode', type=str, default='dom_reward',
+                        help='Global Reward Mode: dom_reward/no_global_reward/dom_vision_reward')
     parser.add_argument('--index', type=int, default=-1,
                         help='Task index')
-    parser.add_argument('--snapshot', type=str, default='results',
+    parser.add_argument('--snapshot', type=str, default='results/41_dom',
                         help='Snapshot directory')
     parser.add_argument('--planning_text_model', type=str, default='gpt-4.1',
-                        help='planning_text_model')
+                        help='planning_text_model: gpt-4.1/gpt-4o-2024-08-06')
     parser.add_argument('--global_reward_text_model', type=str, default='gpt-4.1',
-                        help='global_reward_text_model')
+                        help='global_reward_text_model: gpt-4.1/gpt-4o-2024-08-06')
     parser.add_argument('--start_idx', type=int, default=0,
                         help='The index to start the task')
     parser.add_argument('--end_idx', type=int, default=None,
                         help='The index of the finished task (excluding)')
     parser.add_argument('--delay', type=int, default=5,
                         help='Latency between tasks (seconds)')
-    parser.add_argument('--output_log', type=str, default='batch_run_log.txt',
+    parser.add_argument('--output_log', type=str, default='results/41_dom/batch_run_log.txt',
                         help='output_log')
 
     args = parser.parse_args()