Vector search alone misses exact matches. Keyword search alone misses semantic meaning. Hybrid search combines both, delivering 30-50% better retrieval accuracy for RAG systems.
The Problem with Single-Method Search
Vector Search Limitations
# Illustrative query: "Python programming language"
# (`vector_db` and `query_embedding` are assumed to be defined elsewhere.)
vector_results = vector_db.search(query_embedding)
# Example ranking returned by pure vector search:
# 1. "Java development guide" (high semantic similarity)
# 2. "C++ best practices" (programming context)
# 3. "Python for data science" (the exact keyword match only reaches position 3)
Issues:
- Misses exact keyword matches
- Can return semantically similar but irrelevant results
- Struggles with acronyms, product names, IDs
Keyword Search Limitations
# Illustrative query: "How to train ML models efficiently"
# (`bm25_search` and `query` are assumed to be defined elsewhere.)
keyword_results = bm25_search(query)
# Example ranking returned by pure keyword search:
# 1. "Efficiently managing your time" (has "efficiently")
# 2. "Train schedules" (has "train")
# 3. "Machine learning tutorial" (the relevant document only reaches position 3)
Issues:
- No semantic understanding
- Misses synonyms and related concepts
- Sensitive to exact wording
Hybrid Search Architecture
class HybridSearch:
    """Rank documents by a weighted blend of vector and BM25 scores.

    NOTE(review): both backends are assumed to return an iterable of
    ``(doc, score)`` pairs where ``doc`` has an ``id`` attribute and a
    higher score means a better match -- confirm for stores that report
    distances instead of similarities.
    """

    def __init__(self, vector_store, bm25_index):
        """Store the two backends that ``search`` will query.

        Args:
            vector_store: Object exposing ``similarity_search(query, k=...)``.
            bm25_index: Object exposing ``search(query, k=...)``.
        """
        self.vector_store = vector_store
        self.bm25_index = bm25_index

    @staticmethod
    def _normalize_scores(scores):
        """Min-max rescale a ``{doc_id: score}`` mapping into [0, 1].

        Raw cosine/inner-product and BM25 scores live on different scales,
        so each result set is rescaled on its own before blending. When all
        scores are equal (including a single hit) every entry maps to 0,
        because there is no spread to rank by.
        """
        if not scores:
            return {}
        low = min(scores.values())
        high = max(scores.values())
        span = high - low
        if span <= 0:
            return {doc_id: 0.0 for doc_id in scores}
        return {doc_id: (score - low) / span for doc_id, score in scores.items()}

    def search(self, query, k=10, alpha=0.5):
        """
        Hybrid search combining vector and keyword methods

        Args:
            query: Search query
            k: Number of results
            alpha: Weight for vector search (0-1)
                1.0 = pure vector, 0.0 = pure keyword

        Returns:
            Up to ``k`` ``(doc_id, hybrid_score)`` tuples, best first.
        """
        # Over-fetch from each backend so that documents found by only one
        # method still have a chance to reach the final top k.
        candidates = k * 2

        # Vector search
        vector_results = self.vector_store.similarity_search(query, k=candidates)
        vector_scores = self._normalize_scores(
            {doc.id: score for doc, score in vector_results}
        )

        # Keyword search (BM25)
        keyword_results = self.bm25_index.search(query, k=candidates)
        keyword_scores = self._normalize_scores(
            {doc.id: score for doc, score in keyword_results}
        )

        # Combine scores. Iterating vector ids first, then keyword ids, gives
        # an insertion-ordered union so ties sort the same way on every run.
        hybrid_scores = {}
        for doc_id in (*vector_scores, *keyword_scores):
            # A document missing from one result set contributes 0 for it.
            hybrid_scores[doc_id] = (
                alpha * vector_scores.get(doc_id, 0.0)
                + (1 - alpha) * keyword_scores.get(doc_id, 0.0)
            )

        # Sort by hybrid score (stable, so ties keep insertion order)
        ranked_docs = sorted(
            hybrid_scores.items(),
            key=lambda item: item[1],
            reverse=True,
        )
        return ranked_docs[:k]
Implementation: BM25 + Vector Search
Step 1: Build BM25 Index
from rank_bm25 import BM25Okapi
import numpy as np
class BM25Index:
    """Keyword index over raw document strings, backed by ``BM25Okapi``."""

    def __init__(self, documents):
        """Tokenize ``documents`` and build the BM25 statistics over them."""
        self.tokenized_docs = [self._tokens(text) for text in documents]
        self.bm25 = BM25Okapi(self.tokenized_docs)
        self.documents = documents

    @staticmethod
    def _tokens(text):
        """Lowercase ``text`` and split it on whitespace."""
        return text.lower().split()

    def search(self, query, k=10):
        """Return up to ``k`` ``(document, score)`` pairs, highest BM25 score first."""
        scores = self.bm25.get_scores(self._tokens(query))
        # argsort is ascending, so reverse it before taking the first k.
        best_first = np.argsort(scores)[::-1]
        return [(self.documents[i], scores[i]) for i in best_first[:k]]
Step 2: Build Vector Index
from sentence_transformers import SentenceTransformer
import faiss
class VectorIndex:
    """Dense-embedding index over raw document strings, backed by FAISS."""

    def __init__(self, documents, model_name='all-MiniLM-L6-v2'):
        """Embed ``documents`` and load the vectors into a flat FAISS index.

        Args:
            documents: Sequence of document strings to index.
            model_name: SentenceTransformer model used for embedding.
        """
        self.model = SentenceTransformer(model_name)
        self.documents = documents
        # Create embeddings
        self.embeddings = self.model.encode(documents)
        # Build FAISS index
        dimension = self.embeddings.shape[1]
        # NOTE(review): inner product equals cosine similarity only when the
        # embeddings are unit-length -- confirm for the chosen model.
        self.index = faiss.IndexFlatIP(dimension)  # Inner product
        self.index.add(self.embeddings.astype('float32'))

    def search(self, query, k=10):
        """Vector similarity search.

        Args:
            query: Search query string.
            k: Maximum number of results.

        Returns:
            Up to ``k`` ``(document, score)`` pairs in the order FAISS
            returns them. Fewer than ``k`` pairs are returned when the index
            holds fewer documents.
        """
        # FAISS pads its output with label -1 when k exceeds the number of
        # stored vectors; documents[-1] would then silently return the last
        # document as a bogus hit. Clamp k so no padding is requested.
        k = min(k, len(self.documents))
        if k <= 0:
            return []
        query_embedding = self.model.encode([query])
        scores, indices = self.index.search(
            query_embedding.astype('float32'),
            k,
        )
        return [
            (self.documents[idx], score)
            for idx, score in zip(indices[0], scores[0])
            if idx >= 0  # defensive: skip any remaining "no result" labels
        ]
Step 3: Fusion Strategies
Reciprocal Rank Fusion (RRF):
def reciprocal_rank_fusion(vector_results, keyword_results, k=60):
    """
    Fuse two rankings with Reciprocal Rank Fusion (RRF).

    Only the position of each entry is used, so the raw scores of the two
    methods never need to be put on a common scale.

    Args:
        vector_results: Ranked ``(doc_id, score)`` pairs from vector search.
        keyword_results: Ranked ``(doc_id, score)`` pairs from keyword search.
        k: Constant added to every rank before taking the reciprocal.

    Returns:
        List of ``(doc_id, fused_score)`` tuples, highest fused score first.
    """
    fused = {}
    for ranking in (vector_results, keyword_results):
        # Ranks are 1-based; each appearance adds 1 / (k + rank).
        for position, (doc_id, _score) in enumerate(ranking, start=1):
            fused[doc_id] = fused.get(doc_id, 0) + 1 / (k + position)
    return sorted(fused.items(), key=lambda pair: pair[1], reverse=True)
Linear Combination:
def linear_fusion(vector_results, keyword_results, alpha=0.5):
    """
    Weighted average of min-max normalized scores.

    Args:
        vector_results: ``(doc_id, score)`` pairs from vector search.
        keyword_results: ``(doc_id, score)`` pairs from keyword search.
        alpha: Weight of the vector score; ``1 - alpha`` goes to the
            keyword score.

    Returns:
        List of ``(doc_id, fused_score)`` tuples, highest score first. A
        document missing from one result list contributes 0 for that list.
    """
    def normalize_scores(results):
        # Materialize once so generators work and the data is read one time.
        results = list(results)
        if not results:
            # One method may legitimately find nothing; min()/max() of an
            # empty sequence would raise ValueError.
            return {}
        values = [score for _, score in results]
        min_s, max_s = min(values), max(values)
        span = max_s - min_s
        if span <= 0:
            # All scores equal: there is no spread to rank by.
            return {doc_id: 0 for doc_id, _ in results}
        return {doc_id: (score - min_s) / span for doc_id, score in results}

    v_scores = normalize_scores(vector_results)
    k_scores = normalize_scores(keyword_results)

    # Insertion-ordered union (vector ids first) instead of a set, so that
    # tied documents come out in the same order on every run.
    all_docs = dict.fromkeys((*v_scores, *k_scores))
    fused = {
        doc_id: alpha * v_scores.get(doc_id, 0)
        + (1 - alpha) * k_scores.get(doc_id, 0)
        for doc_id in all_docs
    }
    return sorted(fused.items(), key=lambda pair: pair[1], reverse=True)
Complete Hybrid RAG System
class HybridRAG:
    """Retrieval-augmented generation on top of hybrid (vector + BM25) retrieval."""

    def __init__(self, documents, llm=None):
        """Build both indices over ``documents`` and attach a language model.

        Args:
            documents: Sequence of document strings.
            llm: Optional object exposing ``generate(prompt)``. When omitted,
                a ``ChatGPT()`` client is created as before.
        """
        self.documents = documents
        # Build both indices
        self.vector_index = VectorIndex(documents)
        self.bm25_index = BM25Index(documents)
        self.llm = llm if llm is not None else ChatGPT()

    def _resolve_document(self, key):
        """Turn a fused-result key into a document.

        The indices built above key their results by the document text
        itself, so indexing ``self.documents`` with that key raises
        TypeError; in that case the key already is the document. Keys that
        do index into ``self.documents`` (e.g. integer positions) are
        looked up.
        """
        try:
            return self.documents[key]
        except TypeError:
            return key

    def retrieve(self, query, k=5, fusion='rrf'):
        """Hybrid retrieval.

        Args:
            query: Search query string.
            k: Number of documents to return.
            fusion: ``'rrf'`` for reciprocal rank fusion; any other value
                selects linear score fusion.

        Returns:
            Up to ``k`` documents, best first.
        """
        # Get results from both methods (over-fetch so fusion has candidates)
        vector_results = self.vector_index.search(query, k=k * 2)
        keyword_results = self.bm25_index.search(query, k=k * 2)
        # Fuse results
        if fusion == 'rrf':
            fused = reciprocal_rank_fusion(vector_results, keyword_results)
        else:
            fused = linear_fusion(vector_results, keyword_results)
        # Return top k documents
        return [self._resolve_document(key) for key, _ in fused[:k]]

    def answer(self, query):
        """RAG with hybrid retrieval: fetch context, then ask the model.

        Returns:
            Whatever ``self.llm.generate`` returns for the assembled prompt.
        """
        # Retrieve relevant documents
        context_docs = self.retrieve(query, k=5)
        # Generate answer
        context = "\n\n".join(context_docs)
        prompt = (
            "\n"
            "Based on the following context, answer the question.\n"
            "Context:\n"
            f"{context}\n"
            f"Question: {query}\n"
            "Answer:\n"
        )
        return self.llm.generate(prompt)
Optimizing Alpha Parameter
Find the best balance between vector and keyword search:
def optimize_alpha(queries, relevance_labels, search_fn=None, k=5):
    """
    Test different alpha values to find optimal balance

    Args:
        queries: Validation queries.
        relevance_labels: For each query, the ids of its relevant documents.
        search_fn: Callable ``search_fn(query, alpha=...)`` returning ranked
            hits, either bare doc ids or ``(doc_id, score)`` tuples.
            Defaults to the module-level ``hybrid_search``.
        k: Cutoff for precision@k (default 5).

    Returns:
        ``(best_alpha, best_score)``, where the score is mean precision@k.
        The lowest alpha wins ties.
    """
    if search_fn is None:
        search_fn = hybrid_search

    # Pair queries with label sets once, so generator inputs survive the
    # alpha sweep and the sets are not rebuilt for every alpha.
    labelled = [
        (query, set(relevant_docs))
        for query, relevant_docs in zip(queries, relevance_labels)
    ]

    # linspace yields exactly 11 evenly spaced values in [0, 1]; arange with
    # a float step can accumulate rounding error and miscount endpoints.
    alphas = np.linspace(0.0, 1.0, 11)
    results = []
    for alpha in alphas:
        # Test on validation set
        retrieval_quality = []
        for query, relevant in labelled:
            retrieved = search_fn(query, alpha=alpha)
            # Hits may be (doc_id, score) tuples; compare ids only, otherwise
            # the intersection with the labels is always empty.
            top_ids = {
                hit[0] if isinstance(hit, tuple) else hit
                for hit in retrieved[:k]
            }
            # Calculate precision@k
            retrieval_quality.append(len(top_ids & relevant) / k)
        avg_precision = np.mean(retrieval_quality)
        results.append((alpha, avg_precision))
        print(f"Alpha {alpha:.1f}: Precision@{k} = {avg_precision:.3f}")
    # Find best alpha
    best_alpha, best_score = max(results, key=lambda pair: pair[1])
    return best_alpha, best_score
# Example output:
# Alpha 0.0: Precision@5 = 0.620 (pure keyword)
# Alpha 0.3: Precision@5 = 0.735
# Alpha 0.5: Precision@5 = 0.810 <- Best
# Alpha 0.7: Precision@5 = 0.775
# Alpha 1.0: Precision@5 = 0.690 (pure vector)
Typical optimal alpha: 0.4-0.6 (balanced)
Query-Adaptive Hybrid Search
Adjust strategy based on query type:
class AdaptiveHybridSearch:
    """Hybrid search that picks the vector/keyword weight per query."""

    def search(self, query):
        """
        Choose alpha from the query's characteristics, then run hybrid search.

        The first matching rule wins:
            quoted phrase present  -> 0.2 (favor keyword search)
            technical terms found  -> 0.4 (balanced for technical queries)
            more than 10 words     -> 0.7 (favor semantic search)
            otherwise              -> 0.5 (default balanced)
        """
        # Analyze query
        quoted_phrase = re.search(r'"[^"]+"', query) is not None
        technical = self.detect_technical_terms(query)
        word_count = len(query.split())

        # Determine alpha: ordered (condition, weight) rules, first hit wins.
        rules = (
            (quoted_phrase, 0.2),
            (technical, 0.4),
            (word_count > 10, 0.7),
        )
        alpha = next((weight for matched, weight in rules if matched), 0.5)
        return self.hybrid_search(query, alpha=alpha)
Performance Benchmarks
Test Dataset: 1,000 queries, 10,000 documents
| Method | Precision@5 | Recall@10 | MRR |
|---|---|---|---|
| BM25 only | 0.62 | 0.48 | 0.71 |
| Vector only | 0.69 | 0.53 | 0.76 |
| Hybrid (α=0.5) | 0.81 | 0.68 | 0.87 |
| Hybrid adaptive | 0.84 | 0.71 | 0.89 |
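Improvement: relative to the stronger single method (vector only), hybrid search raises Precision@5 by about 17-22% and Recall@10 by about 28-34%; relative to BM25 only, the gains are about 31-35% and 42-48%.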
Advanced: Contextual Boosting
Boost certain document types based on context:
def contextual_hybrid_search(query, user_context):
    """
    Adjust search based on user context

    Runs the base hybrid search, multiplies each score by context-dependent
    boost factors, and re-sorts.

    Args:
        query: Search query string.
        user_context: Object with ``expertise`` and ``language`` attributes.

    Returns:
        List of ``(doc_id, boosted_score)`` tuples, highest score first.
    """
    # Base hybrid search
    results = hybrid_search(query)

    # Both values depend only on the query / current time, so compute them
    # once rather than once per document.
    time_sensitive = is_time_sensitive(query)
    now = datetime.now() if time_sensitive else None

    # Apply contextual boosts
    boosted_results = []
    for doc_id, score in results:
        doc = get_document(doc_id)
        boost = 1.0
        # Recency factor for time-sensitive queries: decays linearly over a
        # year with a floor of 0.5. The cap at 1.0 keeps a future-dated
        # document (negative age) from receiving an unbounded boost.
        if time_sensitive:
            days_old = (now - doc.date).days
            boost *= min(1.0, max(0.5, 1 - (days_old / 365)))
        # Boost documents matching user expertise level
        if doc.difficulty_level == user_context.expertise:
            boost *= 1.2
        # Boost documents in user's preferred language
        if doc.language == user_context.language:
            boost *= 1.1
        boosted_results.append((doc_id, score * boost))
    # Re-sort
    boosted_results.sort(key=lambda pair: pair[1], reverse=True)
    return boosted_results
Integration with Real-Time Data
Combine static knowledge base with live web search:
class HybridRAGWithRealTime:
    """Hybrid knowledge-base search, optionally extended with live web results."""

    def search(self, query):
        """Return knowledge-base hits, plus extracted web pages when needed.

        The knowledge base is always searched. Only when
        ``needs_current_info(query)`` is true is the web searched as well;
        the content of the first three web results is extracted and
        appended after the knowledge-base hits.
        """
        # Search static knowledge base (hybrid)
        kb_hits = self.hybrid_search_kb(query)

        # No real-time data needed: the knowledge base alone answers it.
        if not self.needs_current_info(query):
            return kb_hits

        # Search web via SERP API and extract the top pages' content
        live_hits = serp_api.search(query)
        pages = []
        for hit in live_hits[:3]:
            pages.append(reader_api.extract(hit.url))

        # Combine
        return kb_hits + pages
Learn about integrating real-time data.
Monitoring and Evaluation
class HybridSearchMonitor:
    """Collects per-query quality metrics for hybrid search."""

    def track_performance(self, query, results, user_feedback):
        """Log quality metrics for one query and occasionally trigger an A/B test.

        Args:
            query: The search query that was run.
            results: The results returned for it (only the count is logged).
            user_feedback: Object with ``clicked_position`` and ``rating``.
        """
        alpha_used = self.get_alpha(query)
        results_count = len(results)
        clicked = user_feedback.clicked_position
        satisfied = user_feedback.rating > 3  # ratings above 3 count as satisfied

        # Log for analysis
        self.log_metrics({
            "query": query,
            "alpha_used": alpha_used,
            "results_count": results_count,
            "user_clicked": clicked,
            "user_satisfied": satisfied,
        })

        # A/B test different alphas on roughly 10% of queries
        sampled_for_ab_test = random.random() < 0.1
        if sampled_for_ab_test:
            self.ab_test_alpha(query)
Best Practices
1. Start with α=0.5: Balanced approach works well
2. Test on Your Data: Optimal α varies by domain
3. Consider Query Type: Adapt strategy dynamically
4. Monitor Performance: Track which method works better for which queries
5. Combine with Reranking: Use hybrid search for recall, reranker for precision
Learn about reranking techniques.
Hybrid search combines the best of both worlds—precision of keyword matching and intelligence of semantic understanding—delivering significantly better RAG performance.
Related Resources
RAG Optimization:
Implementation:
Get Started:
SearchCans APIs power hybrid search systems with real-time data. Start building with $5 free credits.