Modern APIs make it straightforward to extract clean content from any URL. This guide covers everything you need to implement URL content extraction in your own applications.
Quick Start: API Documentation | Get API Key | Test in Playground
What is URL Content Extraction?
URL content extraction APIs automatically fetch web pages and return clean, structured content:
Input and Output Example
```python
# Input: Any URL
url = "https://example.com/article"

# Output: Structured data
{
    "title": "Article Title",
    "content": "Clean article text...",
    "author": "John Doe",
    "published_date": "2025-01-15",
    "images": ["image1.jpg", "image2.jpg"],
    "metadata": {...}
}
```
Key benefits:
- No HTML parsing required
- Handles JavaScript-heavy sites
- Extracts metadata automatically
- Works across a wide range of website types
Getting Started
1. API Setup
First, get your API credentials:
Installation and Setup
```bash
# Install required packages
pip install requests
```

```python
# Basic setup
import requests

API_KEY = "your-api-key"
API_ENDPOINT = "https://www.searchcans.com/api/url"

headers = {
    "Authorization": f"Bearer {API_KEY}",
    "Content-Type": "application/json"
}
```
2. Basic Implementation
Basic Content Extraction Function
```python
def extract_content(url, enable_js=True):
    """Extract content from a URL."""
    payload = {
        "url": url,
        "b": enable_js  # Enable browser (JavaScript) rendering
    }
    response = requests.post(
        API_ENDPOINT,
        json=payload,
        headers=headers
    )
    response.raise_for_status()
    return response.json()

# Usage
data = extract_content("https://example.com/article")
print(f"Title: {data['title']}")
print(f"Content: {data['content'][:200]}...")
```
Advanced Features
JavaScript Rendering
Many modern websites require JavaScript execution:
JavaScript Rendering Options
```python
# For static HTML sites (faster)
data = extract_content(url, enable_js=False)

# For JavaScript-heavy sites (more compatible)
data = extract_content(url, enable_js=True)
```
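If you are not sure which mode a given site needs, a simple pattern is to try the faster static fetch first and re-request with rendering only when nothing comes back. This is a sketch, not a built-in client feature; the helper name `extract_with_fallback` is illustrative:

```python
def extract_with_fallback(url):
    """Try the faster static fetch first, then fall back to browser rendering."""
    data = extract_content(url, enable_js=False)
    if not data.get('content', '').strip():
        # Static HTML came back empty; the page likely needs JavaScript
        data = extract_content(url, enable_js=True)
    return data
```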
Batch Processing
Process multiple URLs efficiently:
Batch URL Processing
```python
from concurrent.futures import ThreadPoolExecutor

def extract_batch(urls, max_workers=5):
    """Extract content from multiple URLs concurrently."""
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(executor.map(extract_content, urls))
    return results

# Process many URLs, up to max_workers at a time
urls = ["https://example.com/page1", "https://example.com/page2", ...]
results = extract_batch(urls)
```
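Note that `executor.map` raises the first exception it hits when results are collected, discarding the remaining work. For large batches, a variant that keeps failures separate is often more practical (a sketch; the helper name `extract_batch_safe` is illustrative):

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

def extract_batch_safe(urls, max_workers=5):
    """Extract many URLs concurrently, keeping failures separate from results."""
    results, failures = {}, {}
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(extract_content, url): url for url in urls}
        for future in as_completed(futures):
            url = futures[future]
            try:
                results[url] = future.result()
            except Exception as e:
                failures[url] = str(e)
    return results, failures
```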
Error Handling
Robust error handling for production use:
Safe Extraction with Retry Logic
```python
import time

def safe_extract_content(url, max_retries=3):
    """Extract content with retry logic."""
    for attempt in range(max_retries):
        try:
            return extract_content(url)
        except requests.HTTPError as e:
            if e.response.status_code == 429:  # Rate limited
                time.sleep(2 ** attempt)  # Exponential backoff
                continue
            elif e.response.status_code == 404:
                return None  # URL not found
            else:
                raise
        except requests.RequestException:
            if attempt == max_retries - 1:
                raise
            time.sleep(1)
    return None
```
Use Cases and Examples
News Aggregation
News Aggregator Implementation
```python
class NewsAggregator:
    def __init__(self, api_key):
        self.api_key = api_key

    def fetch_article(self, url):
        data = extract_content(url)
        return {
            'headline': data['title'],
            'body': data['content'],
            'author': data.get('author'),
            'published': data.get('published_date'),
            'source_url': url,
            'word_count': len(data['content'].split())
        }

    def aggregate_from_sources(self, rss_feeds):
        articles = []
        for feed in rss_feeds:
            # Parse RSS to get article URLs (see the parse_rss sketch after this block)
            urls = self.parse_rss(feed)
            # Extract full content for each article
            for url in urls:
                article = self.fetch_article(url)
                articles.append(article)
        return articles
```
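The aggregator above assumes a `parse_rss` helper that turns a feed into article URLs. A minimal sketch using the third-party `feedparser` library (an assumption; any RSS parser works) could look like this:

```python
import feedparser  # assumed dependency: pip install feedparser

def parse_rss(self, feed_url):
    """Return the article URLs listed in an RSS/Atom feed."""
    parsed = feedparser.parse(feed_url)
    return [entry.link for entry in parsed.entries if hasattr(entry, 'link')]

# Attach as a method so aggregate_from_sources can call self.parse_rss(feed)
NewsAggregator.parse_rss = parse_rss
```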
Content Analysis
Content Analysis Function
```python
def analyze_content(url):
    """Analyze extracted content."""
    data = extract_content(url)
    content = data['content']

    analysis = {
        'word_count': len(content.split()),
        'reading_time': len(content.split()) // 200,  # Minutes at ~200 WPM
        'has_author': bool(data.get('author')),
        'has_date': bool(data.get('published_date')),
        'image_count': len(data.get('images', [])),
        'content_quality': 'high' if len(content) > 1000 else 'low'
    }
    return analysis
```
AI Training Data Collection
Training Data Collection Function
```python
import json

def collect_training_data(urls, output_file):
    """Collect clean text data for AI training."""
    training_data = []

    for url in urls:
        try:
            data = extract_content(url)
            # Clean and structure for AI training
            sample = {
                'text': data['content'],
                'metadata': {
                    'title': data['title'],
                    'source': url,
                    'length': len(data['content']),
                    'language': 'en'  # Could be detected
                }
            }
            training_data.append(sample)
        except Exception as e:
            print(f"Failed to extract {url}: {e}")
            continue

    # Save as JSONL for training
    with open(output_file, 'w') as f:
        for sample in training_data:
            f.write(json.dumps(sample) + '\n')
```
Performance Optimization
Caching Strategy
Redis-Based Caching Implementation
```python
import redis
import json
from hashlib import md5

class CachedExtractor:
    def __init__(self, api_key, redis_client):
        self.api_key = api_key
        self.redis = redis_client
        self.cache_ttl = 24 * 60 * 60  # 24 hours

    def extract_with_cache(self, url):
        # Create cache key
        cache_key = f"extract:{md5(url.encode()).hexdigest()}"

        # Try cache first
        cached = self.redis.get(cache_key)
        if cached:
            return json.loads(cached)

        # Extract and cache
        data = extract_content(url)
        self.redis.setex(
            cache_key,
            self.cache_ttl,
            json.dumps(data)
        )
        return data
```
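Wiring the cache up might look like the following, assuming a Redis server on localhost and the redis-py client; `decode_responses=True` returns cached values as strings so `json.loads` works directly:

```python
# Assumes a Redis server reachable on localhost:6379
redis_client = redis.Redis(host="localhost", port=6379, db=0, decode_responses=True)
extractor = CachedExtractor(API_KEY, redis_client)

data = extractor.extract_with_cache("https://example.com/article")  # calls the API
data = extractor.extract_with_cache("https://example.com/article")  # served from Redis
```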
Rate Limiting
Rate Limiter Implementation
```python
import time
from collections import defaultdict

class RateLimitedExtractor:
    def __init__(self, api_key, requests_per_second=10):
        self.api_key = api_key
        self.rps = requests_per_second
        self.last_request = defaultdict(float)

    def extract_with_rate_limit(self, url):
        # Enforce rate limit
        current_time = time.time()
        time_since_last = current_time - self.last_request['default']

        if time_since_last < (1.0 / self.rps):
            sleep_time = (1.0 / self.rps) - time_since_last
            time.sleep(sleep_time)

        self.last_request['default'] = time.time()
        return extract_content(url)
```
Integration Patterns
Webhook Processing
Flask Webhook Endpoint
```python
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route('/extract', methods=['POST'])
def extract_endpoint():
    """Webhook endpoint for content extraction"""
    data = request.json
    url = data.get('url')

    if not url:
        return jsonify({'error': 'URL required'}), 400

    try:
        result = extract_content(url)
        return jsonify({
            'status': 'success',
            'data': result
        })
    except Exception as e:
        return jsonify({
            'status': 'error',
            'message': str(e)
        }), 500

# Usage: POST /extract with {"url": "https://example.com"}
```
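To exercise the endpoint from Python, assuming the Flask app is running locally on its default port 5000:

```python
import requests

resp = requests.post(
    "http://localhost:5000/extract",
    json={"url": "https://example.com/article"},
    timeout=60,
)
print(resp.json()["status"])
```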
Queue-Based Processing
Celery Background Task
```python
import celery
import requests

app = celery.Celery('content_extractor')

@app.task(bind=True, max_retries=3)
def extract_content_task(self, url, callback_url=None):
    """Background task for content extraction"""
    try:
        data = extract_content(url)

        # Optionally send result to callback URL
        if callback_url:
            requests.post(callback_url, json=data)

        return data
    except Exception as e:
        # Retry on failure, up to max_retries
        raise self.retry(exc=e)

# Usage
result = extract_content_task.delay('https://example.com/article')
```
Best Practices
1. URL Validation
URL Validation Functions
```python
from urllib.parse import urlparse

def is_valid_url(url):
    """Validate URL format"""
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except ValueError:
        return False

def normalize_url(url):
    """Normalize URL format"""
    if not url.startswith(('http://', 'https://')):
        url = 'https://' + url
    return url.rstrip('/')
```
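A small guard that ties these helpers to the extractor (a sketch; `extract_if_valid` is an illustrative name, not part of the API):

```python
def extract_if_valid(raw_url):
    """Normalize and validate a URL before spending an API request on it."""
    url = normalize_url(raw_url.strip())
    if not is_valid_url(url):
        raise ValueError(f"Invalid URL: {raw_url!r}")
    return extract_content(url)

# "example.com/article" is normalized to "https://example.com/article" first
data = extract_if_valid("example.com/article")
```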
2. Content Quality Checks
Content Quality Validation
```python
def validate_extracted_content(data):
    """Validate extracted content quality"""
    content = data.get('content', '')

    checks = {
        'has_content': len(content.strip()) > 100,
        'has_title': bool(data.get('title', '').strip()),
        'reasonable_length': 100 < len(content) < 100000,
        'not_error_page': 'error' not in content.lower()[:200]
    }
    return all(checks.values()), checks
```
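In practice you might use these checks to separate usable pages from rejects before storing anything (a sketch; `filter_quality_urls` is an illustrative name):

```python
def filter_quality_urls(urls):
    """Split URLs into those whose content passes the quality checks and those that fail."""
    accepted, rejected = [], []
    for url in urls:
        data = extract_content(url)
        ok, checks = validate_extracted_content(data)
        (accepted if ok else rejected).append((url, checks))
    return accepted, rejected
```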
3. Monitoring and Logging
Logging Implementation
```python
import logging
from datetime import datetime

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def extract_with_logging(url):
    """Extract content with comprehensive logging"""
    start_time = datetime.now()
    logger.info(f"Starting extraction for {url}")

    try:
        data = extract_content(url)
        duration = (datetime.now() - start_time).total_seconds()
        content_length = len(data.get('content', ''))

        logger.info(
            f"Extraction successful: {url} "
            f"({duration:.2f}s, {content_length} chars)"
        )
        return data
    except Exception as e:
        duration = (datetime.now() - start_time).total_seconds()
        logger.error(f"Extraction failed: {url} ({duration:.2f}s) - {e}")
        raise
```
Troubleshooting
Common Issues
1. Empty content returned
Enable JavaScript Solution
```python
# Solution: Enable JavaScript rendering
data = extract_content(url, enable_js=True)
```
2. Rate limit errors
Exponential Backoff Solution
```python
import random
import time

# Solution: Implement exponential backoff with jitter
def extract_with_backoff(url, max_retries=3):
    for attempt in range(max_retries):
        try:
            return extract_content(url)
        except requests.HTTPError as e:
            if e.response.status_code == 429:
                wait_time = (2 ** attempt) + random.uniform(0, 1)
                time.sleep(wait_time)
            else:
                raise
    return None  # All retries exhausted
```
3. Timeout issues
Timeout Configuration Solution
```python
# Solution: Increase the request timeout (or switch to async processing; see the sketch below)
def extract_content(url, timeout=30):
    payload = {"url": url, "b": True}
    response = requests.post(
        API_ENDPOINT,
        json=payload,
        headers=headers,
        timeout=timeout
    )
    response.raise_for_status()
    return response.json()
```
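For the async route, a minimal sketch using aiohttp (an assumption; any async HTTP client works) reuses the same payload and headers:

```python
import asyncio
import aiohttp

async def extract_content_async(url, timeout=30):
    """Async variant of extract_content using aiohttp."""
    payload = {"url": url, "b": True}
    async with aiohttp.ClientSession(
        timeout=aiohttp.ClientTimeout(total=timeout)
    ) as session:
        async with session.post(API_ENDPOINT, json=payload, headers=headers) as resp:
            resp.raise_for_status()
            return await resp.json()

# Usage
data = asyncio.run(extract_content_async("https://example.com/article"))
```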
Next Steps
- Sign up for API access → Get 100 free credits
- Test in Playground → Try extraction in your browser
- Read full documentation → Complete API reference
- View pricing → Transparent, usage-based pricing
Related Resources
- Building AI Agents with APIs
- Python SEO Automation Guide
- API vs Web Scraping Comparison
- Complete API Documentation
SearchCans provides reliable, cost-effective URL content extraction starting at $0.56/1K requests. Start your free trial →