Alibaba-NLP · Mirza-Samad-Ahmed-Baig · Sep 20, 2025 · Sep 21, 2025 · Sep 21, 2025 · Sep 21, 2025
diff --git a/WebAgent/WebSailor/src/react_agent.py b/WebAgent/WebSailor/src/react_agent.py
@@ -69,7 +69,7 @@ def call_server(self, msgs, max_tries=10):
     def count_tokens(self, messages, model="gpt-4o"):
         try: 
             tokenizer = AutoTokenizer.from_pretrained(self.llm_local_path) 
-        except Exception as e: 
+        except Exception:
             tokenizer = tiktoken.encoding_for_model(model)
 
         full_message = [Message(**x) for x in messages]
@@ -159,4 +159,4 @@ def _run(self, data: str, model: str, user_prompt: str, **kwargs) -> List[List[M
             "prediction": prediction,
             "termination": termination
         }
-        return result
+        return result
diff --git a/WebAgent/WebWalker/src/agent.py b/WebAgent/WebWalker/src/agent.py
@@ -140,9 +140,9 @@ def _run(self, messages: List[Message], lang: Literal['en', 'zh'] = 'en', **kwar
             if stage1:
                 self.momery.append(stage1+"\n")
                 if len(self.momery) > 1:
-                    yield [Message(role=ASSISTANT, content= "Memory:\n" + "-".join(self.momery)+"\"}")]
+                    yield [Message(role=ASSISTANT, content= "Memory:\n" + "-".join(self.momery)+"\"")]
                 else:
-                    yield [Message(role=ASSISTANT, content= "Memory:\n" + "-" + self.momery[0]+"\"}")]
+                    yield [Message(role=ASSISTANT, content= "Memory:\n" + "-" + self.momery[0]+"\"")]
                 stage2 = self.critic_information(query, self.momery)
                 if stage2:
                     response = f'Final Answer: {stage2}'
@@ -205,4 +205,4 @@ def _detect_tool(self, text: str) -> Tuple[bool, str, str, str]:
             func_name = text[i + len(special_func_token):j].strip()
             func_args = text[j + len(special_args_token):k].strip()
             text = text[:i]  # Return the response before tool call, i.e., `Thought`
-        return (func_name is not None), func_name, func_args, text
+        return (func_name is not None), func_name, func_args, text
diff --git a/evaluation/evaluate_deepsearch_official.py b/evaluation/evaluate_deepsearch_official.py
@@ -1,7 +1,5 @@
-from pydantic import BaseModel
 from openai import OpenAI
 import concurrent.futures
-from typing import Literal
 import litellm 
 import os 
 import argparse
@@ -189,7 +187,7 @@ def aggregate_statistics(round1_file, round2_file, round3_file):
     round3_stats = single_round_statistics(round3_file)
 
     keys = round1_stats.keys()  
-    avg_stats = {} 
+    avg_stats = {}
     for key in keys: 
         if isinstance(round1_stats[key], dict):
 
@@ -224,7 +222,7 @@ def single_round_statistics(input_file):
 
     try:
         tokenizer = AutoTokenizer.from_pretrained(os.getenv("Qwen2_5_7B_PATH", ""))
-    except Exception as e: 
+    except Exception: 
         tokenizer = tiktoken.encoding_for_model("gpt-4o")
 
     for item in contents:
@@ -329,7 +327,7 @@ def calculate_enhanced_statistics(round_results, round_items):
 
     try:
         tokenizer = AutoTokenizer.from_pretrained(os.getenv("Qwen2_5_7B_PATH", ""))
-    except Exception as e: 
+    except Exception: 
         tokenizer = tiktoken.encoding_for_model("gpt-4o")
 
     enhanced_stats = {}
@@ -419,7 +417,7 @@ def calculate_best_pass_at_1(query_results):
     round_correct = {round_name: 0 for round_name in ["round1", "round2", "round3"]}
 
     for query, results in query_results.items():
-        for round_name in ["round1", "round2", "round3"]: 
+        for round_name in ["round1", "round2", "round3"]:
             if results[round_name] == "Correct":  
                 round_correct[round_name] += 1 
 
@@ -459,10 +457,10 @@ def main():
     args = parser.parse_args()
 
     dataset = args.dataset  
-    if dataset in ["gaia", "webwalker"]: 
+    if dataset in ["gaia", "webwalker"]:
         judge_model = "openai/qwen2.5-72b-instruct"
         judge_prompt = JUDGE_PROMPT_GAIA 
-    elif dataset in ["xbench-deepsearch"]: 
+    elif dataset in ["xbench-deepsearch"]:
         judge_prompt = JUDGE_PROMPT_XBENCH
         judge_model = "google/gemini-2.0-flash-001"
     elif dataset.startswith("browsecomp_zh"):

diff --git a/inference/react_agent.py b/inference/react_agent.py
@@ -25,7 +25,7 @@
 from tool_visit import *
 
 OBS_START = '<tool_response>'
-OBS_END = '\n</tool_response>'
+OBS_END = '\n</tool_response>'
 
 MAX_LLM_CALL_PER_RUN = int(os.getenv('MAX_LLM_CALL_PER_RUN', 100))