-
Notifications
You must be signed in to change notification settings - Fork 0
Advanced Text Processing
Temp edited this page Sep 23, 2025
·
1 revision
Last Updated: September 23, 2025 1:48 PM EST
The SQLite MCP Server v2.6.0 includes a comprehensive text processing toolkit with 8 specialized functions for advanced text analysis, pattern matching, and data cleaning. This brings the total server capabilities to 73 tools for complete database and text processing operations.
📚 See Real-World Use Cases for complete text analysis workflows and advanced pattern matching use cases.
| Tool | Description |
|---|---|
regex_extract |
Extract text patterns using PCRE-style regular expressions with capture groups |
regex_replace |
Replace text patterns with support for backreferences and preview mode |
fuzzy_match |
Find similar text using Levenshtein distance and sequence matching |
phonetic_match |
Match text phonetically using Soundex and Metaphone algorithms |
text_similarity |
Calculate similarity between text columns using Cosine, Jaccard, or Levenshtein methods |
text_normalize |
Apply multiple text normalization operations (case, Unicode, whitespace, etc.) |
advanced_search |
Multi-method search combining exact, fuzzy, regex, word boundary, and phonetic matching |
text_validation |
Validate text against common patterns (email, phone, URL, credit card, etc.) |
// Extract email addresses using regex patterns
regex_extract({
"table_name": "users",
"column_name": "contact_info",
"pattern": "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}",
"flags": "i"
})// Replace various phone number formats with standardized format
regex_replace({
"table_name": "customers",
"column_name": "phone",
"pattern": "\\(?([0-9]{3})\\)?[-. ]?([0-9]{3})[-. ]?([0-9]{4})",
"replacement": "($1) $2-$3",
"preview": false
})// Extract URLs from text content
regex_extract({
"table_name": "social_posts",
"column_name": "content",
"pattern": "https?:\\/\\/(www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b([-a-zA-Z0-9()@:%_\\+.~#?&//=]*)",
"flags": "g"
})// Find similar names using fuzzy matching
fuzzy_match({
"table_name": "customers",
"column_name": "name",
"search_term": "John Smith",
"threshold": 0.8,
"method": "levenshtein"
})// Match product names phonetically (handles misspellings)
phonetic_match({
"table_name": "products",
"column_name": "name",
"search_term": "iPhone",
"algorithm": "metaphone",
"limit": 10
})// Find duplicate company names with slight variations
fuzzy_match({
"table_name": "companies",
"column_name": "company_name",
"search_term": "Microsoft Corporation",
"threshold": 0.85,
"method": "jaro_winkler"
})// Calculate text similarity between columns
text_similarity({
"table_name": "products",
"column_name": "description",
"compare_column": "summary",
"method": "cosine",
"min_similarity": 0.7
})// Compare documents against reference text
text_similarity({
"table_name": "documents",
"column_name": "content",
"reference_text": "This is our standard template document for customer communications.",
"method": "jaccard",
"return_scores": true
})// Group similar reviews together
text_similarity({
"table_name": "reviews",
"column_name": "comment",
"method": "levenshtein",
"group_similar": true,
"similarity_threshold": 0.6
})// Normalize text with multiple operations
text_normalize({
"table_name": "reviews",
"column_name": "comment",
"operations": [
"lowercase",
"trim",
"remove_extra_spaces",
"remove_punctuation",
"normalize_unicode"
]
})// Standardize address formatting
text_normalize({
"table_name": "addresses",
"column_name": "street_address",
"operations": [
"title_case",
"standardize_abbreviations",
"remove_extra_spaces",
"trim"
]
})// Validate email addresses
text_validation({
"table_name": "users",
"column_name": "email",
"validation_type": "email",
"return_invalid": true
})
// Validate phone numbers
text_validation({
"table_name": "contacts",
"column_name": "phone",
"validation_type": "phone",
"format": "US",
"strict": false
})
// Validate URLs
text_validation({
"table_name": "bookmarks",
"column_name": "url",
"validation_type": "url",
"require_https": true
})// Advanced search combining multiple techniques
advanced_search({
"table_name": "documents",
"column_name": "content",
"search_term": "machine learning",
"methods": [
{"type": "exact", "weight": 1.0},
{"type": "fuzzy", "weight": 0.8, "threshold": 0.7},
{"type": "regex", "weight": 0.9, "pattern": "machine.{0,5}learning"},
{"type": "phonetic", "weight": 0.6},
{"type": "word_boundary", "weight": 0.85}
],
"combine_scores": true,
"min_total_score": 0.5
})// Intelligent product search with fallback methods
advanced_search({
"table_name": "products",
"column_name": "name",
"search_term": "wireless headphones",
"methods": [
{"type": "exact", "weight": 1.0},
{"type": "fuzzy", "weight": 0.7, "threshold": 0.6},
{"type": "word_boundary", "weight": 0.8}
],
"limit": 20,
"sort_by_relevance": true
})// 1. Extract product specifications from descriptions
regex_extract({
"table_name": "products",
"column_name": "description",
"pattern": "(\\d+)\\s*(GB|TB|MB)",
"flags": "gi"
})
// 2. Normalize product names for comparison
text_normalize({
"table_name": "products",
"column_name": "name",
"operations": ["lowercase", "remove_punctuation", "standardize_spaces"]
})
// 3. Find similar products
fuzzy_match({
"table_name": "products",
"column_name": "normalized_name",
"search_term": "apple iphone 15 pro max",
"threshold": 0.8
})// 1. Categorize support tickets by content similarity
text_similarity({
"table_name": "support_tickets",
"column_name": "description",
"reference_text": "Password reset request",
"method": "cosine",
"min_similarity": 0.6
})
// 2. Validate customer contact information
text_validation({
"table_name": "customer_contacts",
"column_name": "email",
"validation_type": "email"
})
// 3. Normalize customer names for deduplication
phonetic_match({
"table_name": "customers",
"column_name": "full_name",
"search_term": "John Doe",
"algorithm": "soundex"
})// 1. Extract hashtags from social media posts
regex_extract({
"table_name": "social_posts",
"column_name": "content",
"pattern": "#([a-zA-Z0-9_]+)",
"flags": "g"
})
// 2. Find duplicate or similar content
text_similarity({
"table_name": "articles",
"column_name": "content",
"method": "jaccard",
"min_similarity": 0.8,
"group_similar": true
})
// 3. Validate and normalize URLs
text_validation({
"table_name": "articles",
"column_name": "featured_image_url",
"validation_type": "url"
})-
regex_extract: Precise pattern matching with known formats -
fuzzy_match: Handle typos and variations in user input -
phonetic_match: Match names and words that sound similar -
text_similarity: Compare documents or find similar content -
advanced_search: When you need comprehensive search with multiple fallbacks
// Use specific patterns instead of broad matches
regex_extract({
"table_name": "logs",
"column_name": "message",
"pattern": "ERROR\\s+\\d{4}-\\d{2}-\\d{2}", // Specific
"flags": "i"
})
// Set appropriate similarity thresholds
fuzzy_match({
"table_name": "products",
"column_name": "name",
"search_term": "laptop",
"threshold": 0.75 // Balance precision vs recall
})// Multi-step text processing pipeline
// Step 1: Normalize
text_normalize({
"table_name": "reviews",
"column_name": "comment",
"operations": ["lowercase", "trim", "remove_extra_spaces"]
})
// Step 2: Validate
text_validation({
"table_name": "reviews",
"column_name": "comment",
"validation_type": "profanity_filter"
})
// Step 3: Extract insights
regex_extract({
"table_name": "reviews",
"column_name": "comment",
"pattern": "(excellent|good|bad|terrible|amazing)",
"flags": "gi"
})// Preview regex replacements before applying
regex_replace({
"table_name": "customer_data",
"column_name": "phone",
"pattern": "\\D",
"replacement": "",
"preview": true // Check results before applying
})
// Set minimum lengths for fuzzy matching
fuzzy_match({
"table_name": "products",
"column_name": "name",
"search_term": "TV",
"threshold": 0.8,
"min_length": 2 // Avoid matching very short strings
})- Core-Database-Tools - Basic database operations
- Statistical-Analysis - Numerical data analysis
- Full-Text-Search - FTS5 search capabilities
- Best-Practices - Usage patterns and recommendations
🔧 Pro Tip: Text processing tools work best when combined in pipelines. Start with normalization, then apply pattern matching or similarity analysis for optimal results.