diff --git a/examples/core_memories/tree_textual_memory.py b/examples/core_memories/tree_textual_memory.py index 17f68832..47dc51e4 100644 --- a/examples/core_memories/tree_textual_memory.py +++ b/examples/core_memories/tree_textual_memory.py @@ -203,6 +203,18 @@ def embed_memory_item(memory: str) -> list[float]: print(f"{i}'th similar result is: " + str(r["memory"])) print(f"Successfully search {len(results)} memories") +# try this when use 'fine' mode (Note that you should pass the internet Config, refer to examples/core_memories/textual_internet_memoy.py) +results_fine_search = my_tree_textual_memory.search( + "Recent news in NewYork", + top_k=10, + mode="fine", + info={"query": "Recent news in NewYork", "user_id": "111", "session": "2234"}, +) +for i, r in enumerate(results_fine_search): + r = r.to_dict() + print(f"{i}'th similar result is: " + str(r["memory"])) +print(f"Successfully search {len(results_fine_search)} memories") + # find related nodes related_nodes = my_tree_textual_memory.get_relevant_subgraph("Painting") @@ -235,7 +247,6 @@ def embed_memory_item(memory: str) -> list[float]: # close the synchronous thread in memory manager my_tree_textual_memory.memory_manager.close() - # my_tree_textual_memory.dump my_tree_textual_memory.dump("tmp/my_tree_textual_memory") my_tree_textual_memory.drop() diff --git a/src/memos/configs/internet_retriever.py b/src/memos/configs/internet_retriever.py index 56f892ac..88d4eff8 100644 --- a/src/memos/configs/internet_retriever.py +++ b/src/memos/configs/internet_retriever.py @@ -4,6 +4,7 @@ from pydantic import Field, field_validator, model_validator +from memos.chunkers.factory import ChunkerConfigFactory from memos.configs.base import BaseConfig from memos.exceptions import ConfigurationError @@ -47,6 +48,11 @@ class XinyuSearchConfig(BaseInternetRetrieverConfig): num_per_request: int = Field( default=10, description="Number of results per API request (not used for Xinyu)" ) + chunker: ChunkerConfigFactory = Field( + 
+ default_factory=ChunkerConfigFactory, + description="Chunker configuration", + ) class InternetRetrieverConfigFactory(BaseConfig): diff --git a/src/memos/memories/textual/item.py b/src/memos/memories/textual/item.py index 06d832b3..c287c191 100644 --- a/src/memos/memories/textual/item.py +++ b/src/memos/memories/textual/item.py @@ -59,7 +59,7 @@ def __str__(self) -> str: class TreeNodeTextualMemoryMetadata(TextualMemoryMetadata): """Extended metadata for structured memory, layered retrieval, and lifecycle tracking.""" - memory_type: Literal["WorkingMemory", "LongTermMemory", "UserMemory"] = Field( + memory_type: Literal["WorkingMemory", "LongTermMemory", "UserMemory", "OuterMemory"] = Field( default="WorkingMemory", description="Memory lifecycle type." ) sources: list[str] | None = Field( diff --git a/src/memos/memories/textual/tree_text_memory/retrieve/internet_retriever_factory.py b/src/memos/memories/textual/tree_text_memory/retrieve/internet_retriever_factory.py index d6af5944..135f1597 100644 --- a/src/memos/memories/textual/tree_text_memory/retrieve/internet_retriever_factory.py +++ b/src/memos/memories/textual/tree_text_memory/retrieve/internet_retriever_factory.py @@ -2,6 +2,7 @@ from typing import Any, ClassVar +from memos.chunkers.factory import ChunkerFactory from memos.configs.internet_retriever import InternetRetrieverConfigFactory from memos.embedders.base import BaseEmbedder from memos.memories.textual.tree_text_memory.retrieve.internet_retriever import ( @@ -66,6 +67,7 @@ def from_config( access_key=config.api_key, # Use api_key as access_key for xinyu search_engine_id=config.search_engine_id, embedder=embedder, + chunker=ChunkerFactory.from_config(config.chunker), max_results=config.max_results, ) else: diff --git a/src/memos/memories/textual/tree_text_memory/retrieve/searcher.py b/src/memos/memories/textual/tree_text_memory/retrieve/searcher.py index 40bd01a4..06d9181f 100644 ---
a/src/memos/memories/textual/tree_text_memory/retrieve/searcher.py +++ b/src/memos/memories/textual/tree_text_memory/retrieve/searcher.py @@ -136,7 +136,7 @@ def retrieve_from_internet(): """ Retrieve information from the internet using Google Custom Search API. """ - if not self.internet_retriever: + if not self.internet_retriever or mode == "fast": return [] if memory_type not in ["All"]: return [] @@ -149,7 +149,7 @@ def retrieve_from_internet(): query=query, query_embedding=query_embedding[0], graph_results=internet_items, - top_k=top_k * 2, + top_k=max(top_k, 10), parsed_goal=parsed_goal, ) return ranked_memories @@ -184,14 +184,6 @@ def retrieve_from_internet(): TextualMemoryItem(id=item.id, memory=item.memory, metadata=new_meta) ) - # Step 4: Reasoning over all retrieved and ranked memory - if mode == "fine": - searched_res = self.reasoner.reason( - query=query, - ranked_memories=searched_res, - parsed_goal=parsed_goal, - ) - # Step 5: Update usage history with current timestamp now_time = datetime.now().isoformat() usage_record = json.dumps( diff --git a/src/memos/memories/textual/tree_text_memory/retrieve/xinyusearch.py b/src/memos/memories/textual/tree_text_memory/retrieve/xinyusearch.py index b803dfa4..ccae17f7 100644 --- a/src/memos/memories/textual/tree_text_memory/retrieve/xinyusearch.py +++ b/src/memos/memories/textual/tree_text_memory/retrieve/xinyusearch.py @@ -3,10 +3,12 @@ import json import uuid +from concurrent.futures import ThreadPoolExecutor, as_completed from datetime import datetime import requests +from memos.chunkers.base import BaseChunker from memos.embedders.factory import OllamaEmbedder from memos.log import get_logger from memos.memories.textual.item import TextualMemoryItem, TreeNodeTextualMemoryMetadata @@ -93,8 +95,8 @@ def search(self, query: str, max_results: int | None = None) -> list[dict]: "online_search": { "max_entries": max_results, "cache_switch": False, - "baidu_field": {"switch": True, "mode": "relevance", "type": 
"page"}, - "bing_field": {"switch": False, "mode": "relevance", "type": "page_web"}, + "baidu_field": {"switch": False, "mode": "relevance", "type": "page"}, + "bing_field": {"switch": True, "mode": "relevance", "type": "page"}, "sogou_field": {"switch": False, "mode": "relevance", "type": "page"}, }, "request_id": "memos" + str(uuid.uuid4()), @@ -112,6 +114,7 @@ def __init__( access_key: str, search_engine_id: str, embedder: OllamaEmbedder, + chunker: BaseChunker, max_results: int = 20, ): """ @@ -124,6 +127,7 @@ def __init__( """ self.xinyu_api = XinyuSearchAPI(access_key, search_engine_id, max_results=max_results) self.embedder = embedder + self.chunker = chunker def retrieve_from_internet( self, query: str, top_k: int = 10, parsed_goal=None @@ -143,63 +147,25 @@ def retrieve_from_internet( search_results = self.xinyu_api.search(query, max_results=top_k) # Convert to TextualMemoryItem format - memory_items = [] - - for _, result in enumerate(search_results): - # Extract basic information from Xinyu response format - title = result.get("title", "") - content = result.get("content", "") - summary = result.get("summary", "") - url = result.get("url", "") - publish_time = result.get("publish_time", "") - if publish_time: + memory_items: list[TextualMemoryItem] = [] + + with ThreadPoolExecutor(max_workers=8) as executor: + futures = [ + executor.submit(self._process_result, result, query, parsed_goal) + for result in search_results + ] + for future in as_completed(futures): try: - publish_time = datetime.strptime(publish_time, "%Y-%m-%d %H:%M:%S").strftime( - "%Y-%m-%d" - ) + memory_items.extend(future.result()) except Exception as e: - logger.error(f"xinyu search error: {e}") - publish_time = datetime.now().strftime("%Y-%m-%d") - else: - publish_time = datetime.now().strftime("%Y-%m-%d") - source = result.get("source", "") - site = result.get("site", "") - if site: - site = site.split("|")[0] + logger.error(f"Error processing search result: {e}") - # Combine memory 
content - memory_content = ( - f"Title: {title}\nSummary: {summary}\nContent: {content[:200]}...\nSource: {url}" - ) + unique_memory_items = {} + for item in memory_items: + if item.memory not in unique_memory_items: + unique_memory_items[item.memory] = item - # Create metadata - metadata = TreeNodeTextualMemoryMetadata( - user_id=None, - session_id=None, - status="activated", - type="fact", # Search results are usually factual information - memory_time=publish_time, - source="web", - confidence=85.0, # Confidence level for search information - entities=self._extract_entities(title, content, summary), - tags=self._extract_tags(title, content, summary, parsed_goal), - visibility="public", - memory_type="LongTermMemory", # Search results as working memory - key=title, - sources=[url] if url else [], - embedding=self.embedder.embed([memory_content])[0], - created_at=datetime.now().isoformat(), - usage=[], - background=f"Xinyu search result from {site or source}", - ) - # Create TextualMemoryItem - memory_item = TextualMemoryItem( - id=str(uuid.uuid4()), memory=memory_content, metadata=metadata - ) - - memory_items.append(memory_item) - - return memory_items + return list(unique_memory_items.values()) def _extract_entities(self, title: str, content: str, summary: str) -> list[str]: """ @@ -333,3 +299,74 @@ def _extract_tags(self, title: str, content: str, summary: str, parsed_goal=None tags.extend(parsed_goal.tags) return list(set(tags))[:15] # Limit to 15 tags + + def _process_result( + self, result: dict, query: str, parsed_goal: str + ) -> list[TextualMemoryItem]: + title = result.get("title", "") + content = result.get("content", "") + summary = result.get("summary", "") + url = result.get("url", "") + publish_time = result.get("publish_time", "") + if publish_time: + try: + publish_time = datetime.strptime(publish_time, "%Y-%m-%d %H:%M:%S").strftime( + "%Y-%m-%d" + ) + except Exception as e: + logger.error(f"xinyu search error: {e}") + publish_time = 
datetime.now().strftime("%Y-%m-%d") + else: + publish_time = datetime.now().strftime("%Y-%m-%d") + source = result.get("source", "") + site = result.get("site", "") + if site: + site = site.split("|")[0] + + qualified_chunks = self._chunk(content) + + memory_items = [] + for chunk_text, chunk_emb, score in qualified_chunks: + memory_content = ( + f"Title: {title}\nNewsTime: {publish_time}\nSummary: {summary}\n" + f"Content: {chunk_text}\nSource: {url}" + ) + metadata = TreeNodeTextualMemoryMetadata( + user_id=None, + session_id=None, + status="activated", + type="fact", + source="web", + confidence=score, + entities=self._extract_entities(title, content, summary), + tags=self._extract_tags(title, content, summary, parsed_goal), + visibility="public", + memory_type="OuterMemory", + key=f"[{source}]" + title, + sources=[url] if url else [], + embedding=chunk_emb, + created_at=datetime.now().isoformat(), + usage=[], + background=f"Xinyu search result from {site or source}", + ) + memory_items.append( + TextualMemoryItem(id=str(uuid.uuid4()), memory=memory_content, metadata=metadata) + ) + + return memory_items + + def _chunk(self, content: str) -> list[tuple[str, list[float], float]]: + """ + Use SentenceChunker to split content into chunks and embed each. 
+ + Returns: + List of (chunk_text, chunk_embedding, dummy_score) + """ + chunks = self.chunker.chunk(content) + if not chunks: + return [] + + chunk_texts = [c.text for c in chunks] + chunk_embeddings = self.embedder.embed(chunk_texts) + + return [(text, emb, 1.0) for text, emb in zip(chunk_texts, chunk_embeddings, strict=False)] diff --git a/tests/memories/textual/test_tree_searcher.py b/tests/memories/textual/test_tree_searcher.py index df7d1d77..729d7a4f 100644 --- a/tests/memories/textual/test_tree_searcher.py +++ b/tests/memories/textual/test_tree_searcher.py @@ -94,7 +94,6 @@ def test_searcher_fine_mode_triggers_reasoner(mock_searcher): top_k=1, mode="fine", ) - assert mock_searcher.reasoner.reason.called assert len(result) == 1