From b9527ec00eefb29dd6b63f646cc0b342068848a8 Mon Sep 17 00:00:00 2001 From: fridayL Date: Mon, 4 Aug 2025 08:15:10 +0000 Subject: [PATCH 1/2] feat: reorganize code --- src/memos/mem_os/product.py | 73 +-------------- src/memos/mem_os/utils/format_utils.py | 44 --------- src/memos/mem_os/utils/reference_utils.py | 108 ++++++++++++++++++++++ 3 files changed, 113 insertions(+), 112 deletions(-) create mode 100644 src/memos/mem_os/utils/reference_utils.py diff --git a/src/memos/mem_os/product.py b/src/memos/mem_os/product.py index e3037f05..53895c5e 100644 --- a/src/memos/mem_os/product.py +++ b/src/memos/mem_os/product.py @@ -22,7 +22,9 @@ filter_nodes_by_tree_ids, remove_embedding_recursive, sort_children_by_memory_type, - split_continuous_references, +) +from memos.mem_os.utils.reference_utils import ( + process_streaming_references_complete, ) from memos.mem_scheduler.schemas.general_schemas import ( ANSWER_LABEL, @@ -406,71 +408,6 @@ def _build_enhance_system_prompt( return MEMOS_PRODUCT_ENHANCE_PROMPT + personal_memory_context + outer_memory_context return MEMOS_PRODUCT_ENHANCE_PROMPT - def _process_streaming_references_complete(self, text_buffer: str) -> tuple[str, str]: - """ - Complete streaming reference processing to ensure reference tags are never split. - - Args: - text_buffer (str): The accumulated text buffer. 
- - Returns: - tuple[str, str]: (processed_text, remaining_buffer) - """ - import re - - # Pattern to match complete reference tags: [refid:memoriesID] - complete_pattern = r"\[\d+:[^\]]+\]" - - # Find all complete reference tags - complete_matches = list(re.finditer(complete_pattern, text_buffer)) - - if complete_matches: - # Find the last complete tag - last_match = complete_matches[-1] - end_pos = last_match.end() - - # Get text up to the end of the last complete tag - processed_text = text_buffer[:end_pos] - remaining_buffer = text_buffer[end_pos:] - - # Apply reference splitting to the processed text - processed_text = split_continuous_references(processed_text) - - return processed_text, remaining_buffer - - # Check for incomplete reference tags - # Look for opening bracket with number and colon - opening_pattern = r"\[\d+:" - opening_matches = list(re.finditer(opening_pattern, text_buffer)) - - if opening_matches: - # Find the last opening tag - last_opening = opening_matches[-1] - opening_start = last_opening.start() - - # Check if we have a complete opening pattern - if last_opening.end() <= len(text_buffer): - # We have a complete opening pattern, keep everything in buffer - return "", text_buffer - else: - # Incomplete opening pattern, return text before it - processed_text = text_buffer[:opening_start] - # Apply reference splitting to the processed text - processed_text = split_continuous_references(processed_text) - return processed_text, text_buffer[opening_start:] - - # Check for partial opening pattern (starts with [ but not complete) - if "[" in text_buffer: - ref_start = text_buffer.find("[") - processed_text = text_buffer[:ref_start] - # Apply reference splitting to the processed text - processed_text = split_continuous_references(processed_text) - return processed_text, text_buffer[ref_start:] - - # No reference tags found, apply reference splitting and return all text - processed_text = split_continuous_references(text_buffer) - return 
processed_text, "" - def _extract_references_from_response(self, response: str) -> tuple[str, list[dict]]: """ Extract reference information from the response and return clean text. @@ -868,7 +805,7 @@ def chat_with_references( full_response += chunk # Process buffer to ensure complete reference tags - processed_chunk, remaining_buffer = self._process_streaming_references_complete(buffer) + processed_chunk, remaining_buffer = process_streaming_references_complete(buffer) if processed_chunk: chunk_data = f"data: {json.dumps({'type': 'text', 'data': processed_chunk}, ensure_ascii=False)}\n\n" @@ -877,7 +814,7 @@ def chat_with_references( # Process any remaining buffer if buffer: - processed_chunk, remaining_buffer = self._process_streaming_references_complete(buffer) + processed_chunk, remaining_buffer = process_streaming_references_complete(buffer) if processed_chunk: chunk_data = f"data: {json.dumps({'type': 'text', 'data': processed_chunk}, ensure_ascii=False)}\n\n" yield chunk_data diff --git a/src/memos/mem_os/utils/format_utils.py b/src/memos/mem_os/utils/format_utils.py index a98e3a26..5fdb5905 100644 --- a/src/memos/mem_os/utils/format_utils.py +++ b/src/memos/mem_os/utils/format_utils.py @@ -1363,47 +1363,3 @@ def clean_json_response(response: str) -> str: str: Clean JSON string without markdown formatting """ return response.replace("```json", "").replace("```", "").strip() - - -def split_continuous_references(text: str) -> str: - """ - Split continuous reference tags into individual reference tags. - - Converts patterns like [1:92ff35fb, 4:bfe6f044] to [1:92ff35fb] [4:bfe6f044] - - Only processes text if: - 1. '[' appears exactly once - 2. ']' appears exactly once - 3. 
Contains commas between '[' and ']' - - Args: - text (str): Text containing reference tags - - Returns: - str: Text with split reference tags, or original text if conditions not met - """ - # Early return if text is empty - if not text: - return text - # Check if '[' appears exactly once - if text.count("[") != 1: - return text - # Check if ']' appears exactly once - if text.count("]") != 1: - return text - # Find positions of brackets - open_bracket_pos = text.find("[") - close_bracket_pos = text.find("]") - - # Check if brackets are in correct order - if open_bracket_pos >= close_bracket_pos: - return text - # Extract content between brackets - content_between_brackets = text[open_bracket_pos + 1 : close_bracket_pos] - # Check if there's a comma between brackets - if "," not in content_between_brackets: - return text - text = text.replace(content_between_brackets, content_between_brackets.replace(", ", "][")) - text = text.replace(content_between_brackets, content_between_brackets.replace(",", "][")) - - return text diff --git a/src/memos/mem_os/utils/reference_utils.py b/src/memos/mem_os/utils/reference_utils.py new file mode 100644 index 00000000..755b9977 --- /dev/null +++ b/src/memos/mem_os/utils/reference_utils.py @@ -0,0 +1,108 @@ +def split_continuous_references(text: str) -> str: + """ + Split continuous reference tags into individual reference tags. + + Converts patterns like [1:92ff35fb, 4:bfe6f044] to [1:92ff35fb] [4:bfe6f044] + + Only processes text if: + 1. '[' appears exactly once + 2. ']' appears exactly once + 3. 
Contains commas between '[' and ']' + + Args: + text (str): Text containing reference tags + + Returns: + str: Text with split reference tags, or original text if conditions not met + """ + # Early return if text is empty + if not text: + return text + # Check if '[' appears exactly once + if text.count("[") != 1: + return text + # Check if ']' appears exactly once + if text.count("]") != 1: + return text + # Find positions of brackets + open_bracket_pos = text.find("[") + close_bracket_pos = text.find("]") + + # Check if brackets are in correct order + if open_bracket_pos >= close_bracket_pos: + return text + # Extract content between brackets + content_between_brackets = text[open_bracket_pos + 1 : close_bracket_pos] + # Check if there's a comma between brackets + if "," not in content_between_brackets: + return text + text = text.replace(content_between_brackets, content_between_brackets.replace(", ", "][")) + text = text.replace(content_between_brackets, content_between_brackets.replace(",", "][")) + + return text + + +def process_streaming_references_complete(text_buffer: str) -> tuple[str, str]: + """ + Complete streaming reference processing to ensure reference tags are never split. + + Args: + text_buffer (str): The accumulated text buffer. 
+ + Returns: + tuple[str, str]: (processed_text, remaining_buffer) + """ + import re + + # Pattern to match complete reference tags: [refid:memoriesID] + complete_pattern = r"\[\d+:[^\]]+\]" + + # Find all complete reference tags + complete_matches = list(re.finditer(complete_pattern, text_buffer)) + + if complete_matches: + # Find the last complete tag + last_match = complete_matches[-1] + end_pos = last_match.end() + + # Get text up to the end of the last complete tag + processed_text = text_buffer[:end_pos] + remaining_buffer = text_buffer[end_pos:] + + # Apply reference splitting to the processed text + processed_text = split_continuous_references(processed_text) + + return processed_text, remaining_buffer + + # Check for incomplete reference tags + # Look for opening bracket with number and colon + opening_pattern = r"\[\d+:" + opening_matches = list(re.finditer(opening_pattern, text_buffer)) + + if opening_matches: + # Find the last opening tag + last_opening = opening_matches[-1] + opening_start = last_opening.start() + + # Check if we have a complete opening pattern + if last_opening.end() <= len(text_buffer): + # We have a complete opening pattern, keep everything in buffer + return "", text_buffer + else: + # Incomplete opening pattern, return text before it + processed_text = text_buffer[:opening_start] + # Apply reference splitting to the processed text + processed_text = split_continuous_references(processed_text) + return processed_text, text_buffer[opening_start:] + + # Check for partial opening pattern (starts with [ but not complete) + if "[" in text_buffer: + ref_start = text_buffer.find("[") + processed_text = text_buffer[:ref_start] + # Apply reference splitting to the processed text + processed_text = split_continuous_references(processed_text) + return processed_text, text_buffer[ref_start:] + + # No reference tags found, apply reference splitting and return all text + processed_text = split_continuous_references(text_buffer) + return 
processed_text, "" From 2d9d52602f87700f2279dada7c3ba4865bb3bed3 Mon Sep 17 00:00:00 2001 From: fridayL Date: Mon, 4 Aug 2025 09:57:46 +0000 Subject: [PATCH 2/2] feat: reorganize code and fix reference bad case for equations --- src/memos/mem_os/utils/reference_utils.py | 73 +++++++++++++++-------- 1 file changed, 49 insertions(+), 24 deletions(-) diff --git a/src/memos/mem_os/utils/reference_utils.py b/src/memos/mem_os/utils/reference_utils.py index 755b9977..0402951b 100644 --- a/src/memos/mem_os/utils/reference_utils.py +++ b/src/memos/mem_os/utils/reference_utils.py @@ -65,17 +65,29 @@ def process_streaming_references_complete(text_buffer: str) -> tuple[str, str]: last_match = complete_matches[-1] end_pos = last_match.end() - # Get text up to the end of the last complete tag - processed_text = text_buffer[:end_pos] - remaining_buffer = text_buffer[end_pos:] - - # Apply reference splitting to the processed text - processed_text = split_continuous_references(processed_text) - - return processed_text, remaining_buffer - - # Check for incomplete reference tags - # Look for opening bracket with number and colon + # Check if there's any incomplete reference after the last complete one + remaining_text = text_buffer[end_pos:] + + # Look for potential incomplete reference patterns after the last complete tag + incomplete_pattern = r"\[\d*:?[^\]]*$" + if re.search(incomplete_pattern, remaining_text): + # There's a potential incomplete reference, find where it starts + incomplete_match = re.search(incomplete_pattern, remaining_text) + if incomplete_match: + incomplete_start = end_pos + incomplete_match.start() + processed_text = text_buffer[:incomplete_start] + remaining_buffer = text_buffer[incomplete_start:] + + # Apply reference splitting to the processed text + processed_text = split_continuous_references(processed_text) + return processed_text, remaining_buffer + + # No incomplete reference after the last complete tag, process all + processed_text = 
split_continuous_references(text_buffer) + return processed_text, "" + + # Check for incomplete reference tags - be more specific about what constitutes a potential reference + # Look for opening bracket with number and colon that could be a reference tag opening_pattern = r"\[\d+:" opening_matches = list(re.finditer(opening_pattern, text_buffer)) @@ -84,25 +96,38 @@ def process_streaming_references_complete(text_buffer: str) -> tuple[str, str]: last_opening = opening_matches[-1] opening_start = last_opening.start() - # Check if we have a complete opening pattern - if last_opening.end() <= len(text_buffer): - # We have a complete opening pattern, keep everything in buffer - return "", text_buffer + # Check if this might be a complete reference tag (has closing bracket after the pattern) + remaining_text = text_buffer[last_opening.end() :] + if "]" in remaining_text: + # This looks like a complete reference tag, process it + processed_text = split_continuous_references(text_buffer) + return processed_text, "" else: - # Incomplete opening pattern, return text before it + # Incomplete reference tag, keep it in buffer processed_text = text_buffer[:opening_start] - # Apply reference splitting to the processed text processed_text = split_continuous_references(processed_text) return processed_text, text_buffer[opening_start:] - # Check for partial opening pattern (starts with [ but not complete) - if "[" in text_buffer: - ref_start = text_buffer.find("[") - processed_text = text_buffer[:ref_start] - # Apply reference splitting to the processed text + # More sophisticated check for potential reference patterns + # Only hold back text if we see a pattern that could be the start of a reference tag + potential_ref_pattern = r"\[\d*:?$" # Matches [, [1, [12:, etc. 
at end of buffer + if re.search(potential_ref_pattern, text_buffer): + # Find the position of the potential reference start + match = re.search(potential_ref_pattern, text_buffer) + if match: + ref_start = match.start() + processed_text = text_buffer[:ref_start] + processed_text = split_continuous_references(processed_text) + return processed_text, text_buffer[ref_start:] + + # Check for standalone [ only at the very end of the buffer + # This prevents cutting off mathematical expressions like [ \Delta U = Q - W ] + if text_buffer.endswith("["): + # Only hold back the single [ character + processed_text = text_buffer[:-1] processed_text = split_continuous_references(processed_text) - return processed_text, text_buffer[ref_start:] + return processed_text, "[" - # No reference tags found, apply reference splitting and return all text + # No reference-like patterns found, process all text processed_text = split_continuous_references(text_buffer) return processed_text, ""