From df74401d7cd9bcd28f4fca5eaa7cb380b1459c71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?= Date: Thu, 24 Jul 2025 18:28:20 +0800 Subject: [PATCH 1/6] feat: modify internet search --- examples/core_memories/tree_textual_memory.py | 37 ++--- src/memos/configs/internet_retriever.py | 6 + src/memos/memories/textual/item.py | 2 +- .../retrieve/internet_retriever_factory.py | 2 + .../tree_text_memory/retrieve/searcher.py | 12 +- .../tree_text_memory/retrieve/xinyusearch.py | 147 +++++++++++------- 6 files changed, 112 insertions(+), 94 deletions(-) diff --git a/examples/core_memories/tree_textual_memory.py b/examples/core_memories/tree_textual_memory.py index 17f68832..6587e9c6 100644 --- a/examples/core_memories/tree_textual_memory.py +++ b/examples/core_memories/tree_textual_memory.py @@ -192,6 +192,7 @@ def embed_memory_item(memory: str) -> list[float]: my_tree_textual_memory.memory_manager.wait_reorganizer() time.sleep(60) +my_tree_textual_memory.memory_manager.close() results = my_tree_textual_memory.search( "Talk about the user's childhood story?", @@ -203,37 +204,17 @@ def embed_memory_item(memory: str) -> list[float]: print(f"{i}'th similar result is: " + str(r["memory"])) print(f"Successfully search {len(results)} memories") -# find related nodes -related_nodes = my_tree_textual_memory.get_relevant_subgraph("Painting") - -# get current memory_size -print(f"Current Memory Size is {my_tree_textual_memory.get_current_memory_size()}") - -logger.info("Start doc search example...") -# Processing Documents -doc_paths = [ - "./text1.txt", - "./text2.txt", -] -# Acquiring memories from documents -doc_memory = reader.get_memory(doc_paths, "doc", info={"user_id": "1111", "session_id": "2222"}) - -for m_list in doc_memory: - added_ids = my_tree_textual_memory.add(m_list) - my_tree_textual_memory.memory_manager.wait_reorganizer() - -results = my_tree_textual_memory.search( - "Tell me about what memos consist of?", - top_k=30, - info={"query": "Tell 
me about what memos consist of?", "user_id": "111", "session": "2234"}, +# try this when using 'fine' mode (note that you must pass the internet config; see examples/core_memories/textual_internet_memoy.py) +results_fine_search = my_tree_textual_memory.search( + "Recent news in NewYork", + top_k=10, + mode="fine", + info={"query": "Recent news in NewYork", "user_id": "111", "session": "2234"}, ) -for i, r in enumerate(results): +for i, r in enumerate(results_fine_search): r = r.to_dict() print(f"{i}'th similar result is: " + str(r["memory"])) -print(f"Successfully search {len(results)} memories") - -# close the synchronous thread in memory manager -my_tree_textual_memory.memory_manager.close() +print(f"Successfully search {len(results_fine_search)} memories") # my_tree_textual_memory.dump diff --git a/src/memos/configs/internet_retriever.py b/src/memos/configs/internet_retriever.py index 56f892ac..88d4eff8 100644 --- a/src/memos/configs/internet_retriever.py +++ b/src/memos/configs/internet_retriever.py @@ -4,6 +4,7 @@ from pydantic import Field, field_validator, model_validator +from memos.chunkers.factory import ChunkerConfigFactory from memos.configs.base import BaseConfig from memos.exceptions import ConfigurationError @@ -47,6 +48,11 @@ class XinyuSearchConfig(BaseInternetRetrieverConfig): num_per_request: int = Field( default=10, description="Number of results per API request (not used for Xinyu)" ) + chunker: ChunkerConfigFactory = Field( + ..., + default_factory=ChunkerConfigFactory, + description="Chunker configuration", + ) class InternetRetrieverConfigFactory(BaseConfig): diff --git a/src/memos/memories/textual/item.py b/src/memos/memories/textual/item.py index 06d832b3..c287c191 100644 --- a/src/memos/memories/textual/item.py +++ b/src/memos/memories/textual/item.py @@ -59,7 +59,7 @@ def __str__(self) -> str: class TreeNodeTextualMemoryMetadata(TextualMemoryMetadata): """Extended metadata for structured memory, layered retrieval, and lifecycle 
tracking.""" - memory_type: Literal["WorkingMemory", "LongTermMemory", "UserMemory"] = Field( + memory_type: Literal["WorkingMemory", "LongTermMemory", "UserMemory", "OuterMemory"] = Field( default="WorkingMemory", description="Memory lifecycle type." ) sources: list[str] | None = Field( diff --git a/src/memos/memories/textual/tree_text_memory/retrieve/internet_retriever_factory.py b/src/memos/memories/textual/tree_text_memory/retrieve/internet_retriever_factory.py index d6af5944..135f1597 100644 --- a/src/memos/memories/textual/tree_text_memory/retrieve/internet_retriever_factory.py +++ b/src/memos/memories/textual/tree_text_memory/retrieve/internet_retriever_factory.py @@ -2,6 +2,7 @@ from typing import Any, ClassVar +from memos.chunkers.factory import ChunkerFactory from memos.configs.internet_retriever import InternetRetrieverConfigFactory from memos.embedders.base import BaseEmbedder from memos.memories.textual.tree_text_memory.retrieve.internet_retriever import ( @@ -66,6 +67,7 @@ def from_config( access_key=config.api_key, # Use api_key as access_key for xinyu search_engine_id=config.search_engine_id, embedder=embedder, + chunker=ChunkerFactory.from_config(config.chunker), max_results=config.max_results, ) else: diff --git a/src/memos/memories/textual/tree_text_memory/retrieve/searcher.py b/src/memos/memories/textual/tree_text_memory/retrieve/searcher.py index 40bd01a4..06d9181f 100644 --- a/src/memos/memories/textual/tree_text_memory/retrieve/searcher.py +++ b/src/memos/memories/textual/tree_text_memory/retrieve/searcher.py @@ -136,7 +136,7 @@ def retrieve_from_internet(): """ Retrieve information from the internet using Google Custom Search API. 
""" - if not self.internet_retriever: + if not self.internet_retriever or mode == "fast": return [] if memory_type not in ["All"]: return [] @@ -149,7 +149,7 @@ def retrieve_from_internet(): query=query, query_embedding=query_embedding[0], graph_results=internet_items, - top_k=top_k * 2, + top_k=max(top_k, 10), parsed_goal=parsed_goal, ) return ranked_memories @@ -184,14 +184,6 @@ def retrieve_from_internet(): TextualMemoryItem(id=item.id, memory=item.memory, metadata=new_meta) ) - # Step 4: Reasoning over all retrieved and ranked memory - if mode == "fine": - searched_res = self.reasoner.reason( - query=query, - ranked_memories=searched_res, - parsed_goal=parsed_goal, - ) - # Step 5: Update usage history with current timestamp now_time = datetime.now().isoformat() usage_record = json.dumps( diff --git a/src/memos/memories/textual/tree_text_memory/retrieve/xinyusearch.py b/src/memos/memories/textual/tree_text_memory/retrieve/xinyusearch.py index b803dfa4..a0f2fbff 100644 --- a/src/memos/memories/textual/tree_text_memory/retrieve/xinyusearch.py +++ b/src/memos/memories/textual/tree_text_memory/retrieve/xinyusearch.py @@ -3,10 +3,12 @@ import json import uuid +from concurrent.futures import ThreadPoolExecutor, as_completed from datetime import datetime import requests +from memos.chunkers.base import BaseChunker from memos.embedders.factory import OllamaEmbedder from memos.log import get_logger from memos.memories.textual.item import TextualMemoryItem, TreeNodeTextualMemoryMetadata @@ -93,8 +95,8 @@ def search(self, query: str, max_results: int | None = None) -> list[dict]: "online_search": { "max_entries": max_results, "cache_switch": False, - "baidu_field": {"switch": True, "mode": "relevance", "type": "page"}, - "bing_field": {"switch": False, "mode": "relevance", "type": "page_web"}, + "baidu_field": {"switch": False, "mode": "relevance", "type": "page"}, + "bing_field": {"switch": True, "mode": "relevance", "type": "page"}, "sogou_field": {"switch": False, 
"mode": "relevance", "type": "page"}, }, "request_id": "memos" + str(uuid.uuid4()), @@ -112,6 +114,7 @@ def __init__( access_key: str, search_engine_id: str, embedder: OllamaEmbedder, + chunker: BaseChunker, max_results: int = 20, ): """ @@ -124,6 +127,7 @@ def __init__( """ self.xinyu_api = XinyuSearchAPI(access_key, search_engine_id, max_results=max_results) self.embedder = embedder + self.chunker = chunker def retrieve_from_internet( self, query: str, top_k: int = 10, parsed_goal=None @@ -143,63 +147,25 @@ def retrieve_from_internet( search_results = self.xinyu_api.search(query, max_results=top_k) # Convert to TextualMemoryItem format - memory_items = [] - - for _, result in enumerate(search_results): - # Extract basic information from Xinyu response format - title = result.get("title", "") - content = result.get("content", "") - summary = result.get("summary", "") - url = result.get("url", "") - publish_time = result.get("publish_time", "") - if publish_time: + memory_items: list[TextualMemoryItem] = [] + + with ThreadPoolExecutor(max_workers=8) as executor: + futures = [ + executor.submit(self._process_result, result, query, parsed_goal) + for result in search_results + ] + for future in as_completed(futures): try: - publish_time = datetime.strptime(publish_time, "%Y-%m-%d %H:%M:%S").strftime( - "%Y-%m-%d" - ) + memory_items.extend(future.result()) except Exception as e: - logger.error(f"xinyu search error: {e}") - publish_time = datetime.now().strftime("%Y-%m-%d") - else: - publish_time = datetime.now().strftime("%Y-%m-%d") - source = result.get("source", "") - site = result.get("site", "") - if site: - site = site.split("|")[0] - - # Combine memory content - memory_content = ( - f"Title: {title}\nSummary: {summary}\nContent: {content[:200]}...\nSource: {url}" - ) - - # Create metadata - metadata = TreeNodeTextualMemoryMetadata( - user_id=None, - session_id=None, - status="activated", - type="fact", # Search results are usually factual information - 
memory_time=publish_time, - source="web", - confidence=85.0, # Confidence level for search information - entities=self._extract_entities(title, content, summary), - tags=self._extract_tags(title, content, summary, parsed_goal), - visibility="public", - memory_type="LongTermMemory", # Search results as working memory - key=title, - sources=[url] if url else [], - embedding=self.embedder.embed([memory_content])[0], - created_at=datetime.now().isoformat(), - usage=[], - background=f"Xinyu search result from {site or source}", - ) - # Create TextualMemoryItem - memory_item = TextualMemoryItem( - id=str(uuid.uuid4()), memory=memory_content, metadata=metadata - ) + logger.error(f"Error processing search result: {e}") - memory_items.append(memory_item) + unique_memory_items = {} + for item in memory_items: + if item.memory not in unique_memory_items: + unique_memory_items[item.memory] = item - return memory_items + return list(unique_memory_items.values()) def _extract_entities(self, title: str, content: str, summary: str) -> list[str]: """ @@ -333,3 +299,74 @@ def _extract_tags(self, title: str, content: str, summary: str, parsed_goal=None tags.extend(parsed_goal.tags) return list(set(tags))[:15] # Limit to 15 tags + + def _process_result( + self, result: dict, query: str, parsed_goal: str + ) -> list[TextualMemoryItem]: + title = result.get("title", "") + content = result.get("content", "") + summary = result.get("summary", "") + url = result.get("url", "") + publish_time = result.get("publish_time", "") + if publish_time: + try: + publish_time = datetime.strptime(publish_time, "%Y-%m-%d %H:%M:%S").strftime( + "%Y-%m-%d" + ) + except Exception as e: + logger.error(f"xinyu search error: {e}") + publish_time = datetime.now().strftime("%Y-%m-%d") + else: + publish_time = datetime.now().strftime("%Y-%m-%d") + source = result.get("source", "") + site = result.get("site", "") + if site: + site = site.split("|")[0] + + qualified_chunks = self._chunk(content) + + memory_items = 
[] + for chunk_text, chunk_emb, score in qualified_chunks: + memory_content = ( + f"Title: {title}\nNewsTime: {publish_time}\nSummary: {summary}\n" + f"Content: {chunk_text}\nSource: {url}" + ) + metadata = TreeNodeTextualMemoryMetadata( + user_id=None, + session_id=None, + status="activated", + type="fact", + source="web", + confidence=score, + entities=self._extract_entities(title, content, summary), + tags=self._extract_tags(title, content, summary, parsed_goal), + visibility="public", + memory_type="OuterMemory", + key=title, + sources=[url] if url else [], + embedding=chunk_emb, + created_at=datetime.now().isoformat(), + usage=[], + background=f"Xinyu search result from {site or source}", + ) + memory_items.append( + TextualMemoryItem(id=str(uuid.uuid4()), memory=memory_content, metadata=metadata) + ) + + return memory_items + + def _chunk(self, content: str) -> list[tuple[str, list[float], float]]: + """ + Split content with the configured chunker (self.chunker) and embed each chunk. + + Returns: + List of (chunk_text, chunk_embedding, score); score is a placeholder fixed at 1.0 + """ + chunks = self.chunker.chunk(content) + if not chunks: + return [] + + chunk_texts = [c.text for c in chunks] + chunk_embeddings = self.embedder.embed(chunk_texts) + + return [(text, emb, 1.0) for text, emb in zip(chunk_texts, chunk_embeddings, strict=False)] From db677efe5112f9dfc320705dc41695f7743cd4e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?= Date: Thu, 24 Jul 2025 18:30:49 +0800 Subject: [PATCH 2/6] feat: modify internet search --- examples/core_memories/tree_textual_memory.py | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/examples/core_memories/tree_textual_memory.py b/examples/core_memories/tree_textual_memory.py index 6587e9c6..c507b6e1 100644 --- a/examples/core_memories/tree_textual_memory.py +++ b/examples/core_memories/tree_textual_memory.py @@ -216,6 +216,35 @@ def embed_memory_item(memory: str) -> list[float]: print(f"{i}'th similar result is: " + 
str(r["memory"])) print(f"Successfully search {len(results_fine_search)} memories") +# find related nodes +related_nodes = my_tree_textual_memory.get_relevant_subgraph("Painting") + +# get current memory_size +print(f"Current Memory Size is {my_tree_textual_memory.get_current_memory_size()}") + +logger.info("Start doc search example...") +# Processing Documents +doc_paths = [ + "./text1.txt", + "./text2.txt", +] +# Acquiring memories from documents +doc_memory = reader.get_memory(doc_paths, "doc", info={"user_id": "1111", "session_id": "2222"}) + +for m_list in doc_memory: + added_ids = my_tree_textual_memory.add(m_list) + my_tree_textual_memory.memory_manager.wait_reorganizer() + +results = my_tree_textual_memory.search( + "Tell me about what memos consist of?", + top_k=30, + info={"query": "Tell me about what memos consist of?", "user_id": "111", "session": "2234"}, +) +for i, r in enumerate(results): + r = r.to_dict() + print(f"{i}'th similar result is: " + str(r["memory"])) +print(f"Successfully search {len(results)} memories") + # my_tree_textual_memory.dump my_tree_textual_memory.dump("tmp/my_tree_textual_memory") From 048584b3bb4a80e60458f3f5ba403a13ff2a7086 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?= Date: Thu, 24 Jul 2025 18:31:53 +0800 Subject: [PATCH 3/6] feat: modify internet search --- examples/core_memories/tree_textual_memory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/core_memories/tree_textual_memory.py b/examples/core_memories/tree_textual_memory.py index c507b6e1..d647229c 100644 --- a/examples/core_memories/tree_textual_memory.py +++ b/examples/core_memories/tree_textual_memory.py @@ -192,7 +192,6 @@ def embed_memory_item(memory: str) -> list[float]: my_tree_textual_memory.memory_manager.wait_reorganizer() time.sleep(60) -my_tree_textual_memory.memory_manager.close() results = my_tree_textual_memory.search( "Talk about the user's childhood story?", @@ -245,6 +244,7 @@ def 
embed_memory_item(memory: str) -> list[float]: print(f"{i}'th similar result is: " + str(r["memory"])) print(f"Successfully search {len(results)} memories") +my_tree_textual_memory.memory_manager.close() # my_tree_textual_memory.dump my_tree_textual_memory.dump("tmp/my_tree_textual_memory") From fb5c66878ee36282a1abbfa6ccbe0f4487a58dc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?= Date: Thu, 24 Jul 2025 18:32:25 +0800 Subject: [PATCH 4/6] feat: modify internet search --- examples/core_memories/tree_textual_memory.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/core_memories/tree_textual_memory.py b/examples/core_memories/tree_textual_memory.py index d647229c..47dc51e4 100644 --- a/examples/core_memories/tree_textual_memory.py +++ b/examples/core_memories/tree_textual_memory.py @@ -244,6 +244,7 @@ def embed_memory_item(memory: str) -> list[float]: print(f"{i}'th similar result is: " + str(r["memory"])) print(f"Successfully search {len(results)} memories") +# close the synchronous thread in memory manager my_tree_textual_memory.memory_manager.close() # my_tree_textual_memory.dump From 289dda8d328dc231e923e9d5dc5f60dd80ace376 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?= Date: Thu, 24 Jul 2025 18:41:00 +0800 Subject: [PATCH 5/6] fix: unittest for tree_searcher --- tests/memories/textual/test_tree_searcher.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/memories/textual/test_tree_searcher.py b/tests/memories/textual/test_tree_searcher.py index df7d1d77..729d7a4f 100644 --- a/tests/memories/textual/test_tree_searcher.py +++ b/tests/memories/textual/test_tree_searcher.py @@ -94,7 +94,6 @@ def test_searcher_fine_mode_triggers_reasoner(mock_searcher): top_k=1, mode="fine", ) - assert mock_searcher.reasoner.reason.called assert len(result) == 1 From 6b13b8bd339967c5b37aba44d0b93b2d2e1f99be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?= Date: Thu, 24 Jul 2025 18:58:57 +0800 
Subject: [PATCH 6/6] feat: add source to memory key --- .../memories/textual/tree_text_memory/retrieve/xinyusearch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/memos/memories/textual/tree_text_memory/retrieve/xinyusearch.py b/src/memos/memories/textual/tree_text_memory/retrieve/xinyusearch.py index a0f2fbff..ccae17f7 100644 --- a/src/memos/memories/textual/tree_text_memory/retrieve/xinyusearch.py +++ b/src/memos/memories/textual/tree_text_memory/retrieve/xinyusearch.py @@ -342,7 +342,7 @@ def _process_result( tags=self._extract_tags(title, content, summary, parsed_goal), visibility="public", memory_type="OuterMemory", - key=title, + key=f"[{source}]" + title, sources=[url] if url else [], embedding=chunk_emb, created_at=datetime.now().isoformat(),