Initial retrieval casts a wide net; reranking finds the truly relevant needles in that haystack. Adding a reranking stage to a RAG system typically improves retrieval precision by 40-60% for only a modest increase in latency.
The Retrieval-Reranking Pipeline
Query → Initial Retrieval (fast, recall-focused) →
Top 50-100 candidates →
Reranking (slow, precision-focused) →
Top 5-10 most relevant →
LLM Generation
Why Two Stages?
Initial Retrieval: Fast but imprecise
- Vector search: Milliseconds for millions of docs
- Returns 50-100 candidates
- Good recall, moderate precision
Reranking: Slow but accurate
- Cross-encoder models
- Processes only top candidates
- Excellent precision
Retrieval vs. Reranking Models
Bi-Encoder (Retrieval)
# Bi-encoder: Separate embeddings for query and documents
query_embedding = encoder.encode(query) # Once
doc_embeddings = encoder.encode(documents) # Pre-computed
# Fast comparison (dot product)
scores = query_embedding @ doc_embeddings.T
Speed: Extremely fast (pre-computed embeddings)
Accuracy: Good but not great
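As a concrete illustration of the bi-encoder pattern above, here is a minimal runnable sketch using the sentence-transformers library. The model name all-MiniLM-L6-v2 and the toy documents are assumptions for illustration, and cosine similarity stands in for the raw dot product (the two are equivalent once embeddings are normalized).
# Minimal bi-encoder retrieval sketch (assumed model: all-MiniLM-L6-v2)
from sentence_transformers import SentenceTransformer, util

encoder = SentenceTransformer('all-MiniLM-L6-v2')

documents = [
    "Reranking improves precision by rescoring candidates.",
    "Vector search retrieves candidates quickly.",
    "Cats are popular pets.",
]

# Pre-compute document embeddings once
doc_embeddings = encoder.encode(documents, convert_to_tensor=True)

# Encode the query and compare against all documents in one shot
query_embedding = encoder.encode("How does reranking help RAG?", convert_to_tensor=True)
scores = util.cos_sim(query_embedding, doc_embeddings)[0]

# Print candidates from highest to lowest retrieval score
for idx in scores.argsort(descending=True):
    print(documents[int(idx)], float(scores[idx]))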
Cross-Encoder (Reranking)
# Cross-encoder: Joint encoding of query + document
for doc in candidates:
    score = cross_encoder.predict([(query, doc)])[0]  # One forward pass per pair
Speed: Slower (no pre-computation)
Accuracy: Excellent (sees both query and doc together)
Implementation
Basic Reranking
from sentence_transformers import CrossEncoder

class Reranker:
    def __init__(self, model_name='cross-encoder/ms-marco-MiniLM-L-6-v2'):
        self.model = CrossEncoder(model_name)

    def rerank(self, query, documents, top_k=5):
        """
        Rerank documents using a cross-encoder.
        """
        # Create query-document pairs
        pairs = [[query, doc] for doc in documents]

        # Score all pairs
        scores = self.model.predict(pairs)

        # Sort by score, highest first
        doc_score_pairs = list(zip(documents, scores))
        doc_score_pairs.sort(key=lambda x: x[1], reverse=True)

        # Return top k (document, score) pairs
        return doc_score_pairs[:top_k]
Complete RAG with Reranking
class RAGWithReranking:
    def __init__(self, vector_store, reranker):
        self.vector_store = vector_store
        self.reranker = reranker
        self.llm = ChatGPT()

    def answer(self, query):
        # Stage 1: Initial retrieval (recall-focused)
        print("Stage 1: Retrieving candidates...")
        candidates = self.vector_store.similarity_search(
            query,
            k=50  # Cast a wide net
        )
        print(f"Retrieved {len(candidates)} candidates")

        # Stage 2: Reranking (precision-focused)
        print("Stage 2: Reranking...")
        reranked = self.reranker.rerank(
            query,
            [doc.content for doc in candidates],
            top_k=5  # Narrow to the best
        )
        print(f"Selected top {len(reranked)} documents")

        # Stage 3: Generation
        context = "\n\n".join([doc for doc, score in reranked])
        prompt = f"""
Context: {context}

Question: {query}

Answer:
"""
        answer = self.llm.generate(prompt)

        return {
            "answer": answer,
            "sources": reranked
        }
Performance Comparison
import numpy as np

def benchmark_retrieval_methods(queries, relevance_labels):
    """
    Compare retrieval with and without reranking.
    """
    results = {
        "vector_only": [],
        "with_reranking": []
    }

    for query, relevant_docs in zip(queries, relevance_labels):
        # Method 1: Vector search only
        vector_results = vector_search(query, k=5)
        vector_precision = calculate_precision(vector_results, relevant_docs)
        results["vector_only"].append(vector_precision)

        # Method 2: Vector search + reranking
        candidates = vector_search(query, k=50)
        reranked = reranker.rerank(query, candidates, top_k=5)
        rerank_precision = calculate_precision(
            [doc for doc, _ in reranked], relevant_docs
        )
        results["with_reranking"].append(rerank_precision)

    print(f"Vector only: P@5 = {np.mean(results['vector_only']):.3f}")
    print(f"With reranking: P@5 = {np.mean(results['with_reranking']):.3f}")

    improvement = (
        (np.mean(results['with_reranking']) - np.mean(results['vector_only'])) /
        np.mean(results['vector_only']) * 100
    )
    print(f"Improvement: {improvement:.1f}%")
# Typical output:
# Vector only: P@5 = 0.520
# With reranking: P@5 = 0.810
# Improvement: 55.8%
Advanced Reranking Strategies
Multi-Stage Reranking
For very large candidate sets, use multiple reranking stages:
class MultiStageReranker:
    def __init__(self):
        # Stage 1: Fast, lightweight reranker
        self.fast_reranker = Reranker('cross-encoder/ms-marco-TinyBERT-L-2-v2')

        # Stage 2: Slow, accurate reranker
        self.accurate_reranker = Reranker('cross-encoder/ms-marco-electra-base')

    def rerank(self, query, documents, final_k=5):
        # Stage 1: Fast reranking (1000 → 50)
        stage1_results = self.fast_reranker.rerank(
            query,
            documents,
            top_k=50
        )

        # Stage 2: Accurate reranking (50 → 5)
        stage2_results = self.accurate_reranker.rerank(
            query,
            [doc for doc, _ in stage1_results],
            top_k=final_k
        )

        return stage2_results
Hybrid Scoring
Combine retrieval and reranking scores:
def hybrid_reranking(query, documents_with_retrieval_scores, alpha=0.7):
    """
    Combine initial retrieval scores with reranking scores.
    """
    def normalize(scores):
        # Min-max scale scores to [0, 1] so the two score types are comparable
        lo, hi = min(scores), max(scores)
        return [(s - lo) / (hi - lo) if hi > lo else 0.0 for s in scores]

    # Get reranking scores
    rerank_scores = cross_encoder.predict([
        [query, doc] for doc, _ in documents_with_retrieval_scores
    ])

    # Normalize both score types
    retrieval_scores = normalize([score for _, score in documents_with_retrieval_scores])
    rerank_scores = normalize(rerank_scores)

    # Combine with weighting
    hybrid_scores = [
        alpha * rerank_score + (1 - alpha) * retrieval_score
        for rerank_score, retrieval_score in zip(rerank_scores, retrieval_scores)
    ]

    # Re-sort by hybrid score
    docs = [doc for doc, _ in documents_with_retrieval_scores]
    ranked = sorted(
        zip(docs, hybrid_scores),
        key=lambda x: x[1],
        reverse=True
    )

    return ranked
Query-Specific Reranking
Different queries need different reranking strategies:
class AdaptiveReranker:
    def rerank(self, query, documents, top_k=5):
        # Analyze query complexity
        query_type = self.classify_query(query)

        if query_type == "factual":
            # For factual queries, emphasize exact matches
            return self.factual_reranker(query, documents, top_k)
        elif query_type == "analytical":
            # For analytical queries, emphasize comprehensive coverage
            return self.analytical_reranker(query, documents, top_k)
        elif query_type == "comparison":
            # For comparisons, ensure diverse perspectives
            return self.diverse_reranker(query, documents, top_k)
        else:
            # Default reranking
            return self.standard_reranker(query, documents, top_k)
Latency Optimization
Reranking adds latency. Optimize it:
Batch Processing
def batch_reranking(query, documents, batch_size=32):
    """
    Process reranking in batches for GPU efficiency.
    """
    pairs = [[query, doc] for doc in documents]
    all_scores = []

    for i in range(0, len(pairs), batch_size):
        batch = pairs[i:i + batch_size]
        batch_scores = cross_encoder.predict(batch)
        all_scores.extend(batch_scores)

    return all_scores
Caching
class CachedReranker:
    def __init__(self, model_name='cross-encoder/ms-marco-MiniLM-L-6-v2'):
        self.cache = {}
        self.reranker = Reranker(model_name)

    def rerank(self, query, documents, top_k=5):
        # Create cache key
        cache_key = hash((query, tuple(documents)))

        if cache_key in self.cache:
            return self.cache[cache_key]

        # Rerank
        result = self.reranker.rerank(query, documents, top_k)

        # Cache result
        self.cache[cache_key] = result
        return result
Parallel Processing
from concurrent.futures import ThreadPoolExecutor

def parallel_rerank(query, document_groups):
    """
    Rerank multiple document groups against the same query in parallel.
    """
    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = [
            executor.submit(reranker.rerank, query, docs)
            for docs in document_groups
        ]
        results = [f.result() for f in futures]

    return results
Diversity-Aware Reranking
Avoid returning duplicate or too-similar documents:
def diverse_reranking(query, documents, top_k=5, diversity_threshold=0.8):
    """
    Rerank while ensuring diversity.
    """
    # Initial reranking
    scored_docs = reranker.rerank(query, documents, top_k=50)

    # Select a diverse set
    selected = [scored_docs[0]]  # Always include the top result

    for doc, score in scored_docs[1:]:
        # Skip documents that are too similar to those already selected
        is_diverse = all(
            similarity(doc, selected_doc) < diversity_threshold
            for selected_doc, _ in selected
        )

        if is_diverse:
            selected.append((doc, score))

        if len(selected) >= top_k:
            break

    return selected
Cost-Benefit Analysis
Latency Impact
Retrieval only: 50ms
Retrieval + Reranking (top 50): +120ms
Total: 170ms
Trade-off: 3.4x slower, but 55% more accurate
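As a quick sanity check of these numbers, the snippet below reproduces the arithmetic using the figures quoted above (50 ms retrieval, roughly 120 ms to rerank 50 candidates); your own measurements will differ.
# Back-of-the-envelope latency check using the figures quoted above
retrieval_ms = 50
rerank_ms = 120          # cross-encoder over ~50 candidates

total_ms = retrieval_ms + rerank_ms
slowdown = total_ms / retrieval_ms

print(f"Total: {total_ms} ms ({slowdown:.1f}x slower than retrieval alone)")
# Total: 170 ms (3.4x slower than retrieval alone)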
When to Use Reranking
Use reranking when:
- Accuracy is critical (customer support, medical, legal)
- Query complexity is high
- Initial retrieval quality is moderate
- You have GPU resources
Skip reranking when:
- Latency budget is very tight (<100ms)
- Initial retrieval is already excellent
- Questions are simple
- Cost constraints are strict
Production Implementation
import time

class ProductionRAG:
    def __init__(self, enable_reranking=True):
        self.vector_store = VectorStore()
        self.reranker = Reranker() if enable_reranking else None
        self.llm = ChatGPT()

    def answer(self, query, max_latency_ms=500):
        start = time.time()

        # Always retrieve
        candidates = self.vector_store.search(query, k=50)

        # Check if we have time for reranking
        elapsed_ms = (time.time() - start) * 1000

        if self.reranker and (elapsed_ms + 150) < max_latency_ms:
            # We have time - use reranking
            top_docs = self.reranker.rerank(query, candidates, top_k=5)
        else:
            # No time - fall back to retrieval order
            top_docs = candidates[:5]

        # Generate answer
        answer = self.generate_answer(query, top_docs)
        return answer
Evaluation Metrics
def evaluate_reranking(queries, ground_truth):
    """
    Comprehensive reranking evaluation.
    """
    metrics = {
        "precision@5": [],
        "recall@5": [],
        "mrr": [],
        "ndcg@10": []
    }

    for query, relevant_docs in zip(queries, ground_truth):
        # Get reranked results
        results = rag.retrieve(query, k=10)

        # Calculate metrics
        metrics["precision@5"].append(
            precision_at_k(results[:5], relevant_docs)
        )
        metrics["recall@5"].append(
            recall_at_k(results[:5], relevant_docs)
        )
        metrics["mrr"].append(
            mean_reciprocal_rank(results, relevant_docs)
        )
        metrics["ndcg@10"].append(
            ndcg_at_k(results, relevant_docs, k=10)
        )

    # Average metrics
    for metric_name, values in metrics.items():
        print(f"{metric_name}: {np.mean(values):.3f}")

    return metrics
Best Practices
1. Retrieve More, Rerank Fewer
- Initial retrieval: 50-100 candidates
- Reranking: Top 5-10
2. Choose Right Model
- Small datasets: ms-marco-MiniLM (fast)
- Large datasets: ms-marco-electra (accurate)
3. Monitor Performance
- Track latency per stage (see the timing sketch after this list)
- A/B test reranking impact
- Measure actual user satisfaction
4. Optimize for Production
- Use GPU for reranking
- Implement caching
- Set latency budgets
5. Combine with Other Techniques
- Hybrid search + reranking
- Query expansion + reranking
- Contextual boosting + reranking
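To make the "Monitor Performance" practice concrete, here is a minimal sketch of per-stage latency tracking. The stage names and the in-memory accumulator are assumptions for illustration; a production system would export these timings to its metrics backend.
import time
from contextlib import contextmanager
from collections import defaultdict

# Accumulates per-stage latencies in memory (swap for your metrics backend in production)
stage_timings = defaultdict(list)

@contextmanager
def timed_stage(name):
    # Record wall-clock time spent in a pipeline stage, in milliseconds
    start = time.perf_counter()
    try:
        yield
    finally:
        stage_timings[name].append((time.perf_counter() - start) * 1000)

# Usage inside the pipeline (retrieval/reranking/generation are the stages from this article):
# with timed_stage("retrieval"):
#     candidates = vector_store.search(query, k=50)
# with timed_stage("reranking"):
#     top_docs = reranker.rerank(query, candidates, top_k=5)

def report():
    # Print average latency per stage across all recorded calls
    for name, samples in stage_timings.items():
        print(f"{name}: avg {sum(samples) / len(samples):.1f} ms over {len(samples)} calls")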
Reranking transforms good RAG systems into great ones. The 40-60% precision improvement typically far outweighs the modest latency increase.
Get Started:
SearchCans APIs power intelligent retrieval systems. Start building with $5 free credits.