Extracting metadata from web pages is essential for SEO analysis, social media previews, and content management. Python provides powerful libraries like BeautifulSoup and requests to parse HTML and extract various types of metadata including standard meta tags, OpenGraph properties, Twitter Cards, and structured data.
Installation
Install the required libraries:
pip install requests beautifulsoup4 lxml
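lxml itself is optional: the examples below use Python's built-in 'html.parser', which needs no extra dependency, but if lxml is installed you can pass it to BeautifulSoup for faster parsing on large pages. A small illustrative snippet:

from bs4 import BeautifulSoup

html = "<html><head><title>Example</title></head></html>"
# 'lxml' is typically faster than the built-in parser on large documents
soup = BeautifulSoup(html, 'lxml')
print(soup.title.get_text())  # Example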
Basic Metadata Extraction
Here's a comprehensive function to extract different types of metadata:
import json
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

def extract_metadata(url):
    """Extract comprehensive metadata from a web page."""
    try:
        # Add headers to avoid blocking
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        metadata = {
            'url': url,
            'title': extract_title(soup),
            'meta_tags': extract_meta_tags(soup),
            'opengraph': extract_opengraph(soup),
            'twitter_cards': extract_twitter_cards(soup),
            'structured_data': extract_structured_data(soup)
        }
        return metadata
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

def extract_title(soup):
    """Extract page title."""
    title_tag = soup.find('title')
    return title_tag.get_text().strip() if title_tag else None

def extract_meta_tags(soup):
    """Extract standard meta tags."""
    meta_data = {}
    # Standard meta tags keyed by 'name' or 'http-equiv'
    for meta in soup.find_all('meta'):
        if meta.get('name'):
            meta_data[meta['name']] = meta.get('content', '')
        elif meta.get('http-equiv'):
            meta_data[f"http-equiv-{meta['http-equiv']}"] = meta.get('content', '')
    return meta_data

def extract_opengraph(soup):
    """Extract OpenGraph metadata."""
    og_data = {}
    for meta in soup.find_all('meta', property=True):
        if meta['property'].startswith('og:'):
            og_data[meta['property']] = meta.get('content', '')
    return og_data

def extract_twitter_cards(soup):
    """Extract Twitter Card metadata."""
    twitter_data = {}
    # 'name' clashes with find_all's tag-name argument, so filter via attrs
    for meta in soup.find_all('meta', attrs={'name': True}):
        if meta['name'].startswith('twitter:'):
            twitter_data[meta['name']] = meta.get('content', '')
    return twitter_data

def extract_structured_data(soup):
    """Extract JSON-LD structured data."""
    structured_data = []
    scripts = soup.find_all('script', type='application/ld+json')
    for script in scripts:
        try:
            data = json.loads(script.string)
            structured_data.append(data)
        except (json.JSONDecodeError, TypeError):
            # Skip empty or malformed script blocks
            continue
    return structured_data
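Note that og:image values are occasionally relative paths rather than absolute URLs. A small helper can normalize them with the urljoin imported above (a hedged sketch; resolve_og_image is an illustrative name, not a library function):

from urllib.parse import urljoin

def resolve_og_image(page_url, og_data):
    """Illustrative helper: resolve a possibly-relative og:image against the page URL."""
    image = og_data.get('og:image')
    if not image:
        return None
    # urljoin leaves absolute URLs untouched and resolves relative ones
    return urljoin(page_url, image)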
Usage Examples
Basic Usage
url = "https://example.com"
metadata = extract_metadata(url)
if metadata:
print(f"Title: {metadata['title']}")
print(f"Description: {metadata['meta_tags'].get('description', 'N/A')}")
print(f"Keywords: {metadata['meta_tags'].get('keywords', 'N/A')}")
Extracting Specific Metadata Types
# Extract only OpenGraph and Twitter Card data
def get_social_metadata(url):
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    social_data = {}
    # OpenGraph
    for meta in soup.find_all('meta', property=True):
        if meta['property'].startswith('og:'):
            key = meta['property'].replace('og:', '', 1)
            social_data[f'og_{key}'] = meta.get('content', '')
    # Twitter Cards
    for meta in soup.find_all('meta', attrs={'name': True}):
        if meta['name'].startswith('twitter:'):
            key = meta['name'].replace('twitter:', '', 1)
            social_data[f'twitter_{key}'] = meta.get('content', '')
    return social_data
# Usage
social_meta = get_social_metadata("https://example.com")
print(f"Social image: {social_meta.get('og_image', 'N/A')}")
Batch Processing Multiple URLs
def process_multiple_urls(urls):
    """Process multiple URLs and extract metadata."""
    results = []
    for url in urls:
        print(f"Processing: {url}")
        metadata = extract_metadata(url)
        if metadata:
            # Extract key information
            result = {
                'url': url,
                'title': metadata['title'],
                'description': metadata['meta_tags'].get('description'),
                'og_title': metadata['opengraph'].get('og:title'),
                'og_description': metadata['opengraph'].get('og:description'),
                'og_image': metadata['opengraph'].get('og:image')
            }
            results.append(result)
    return results
# Usage
urls = [
    "https://example.com",
    "https://another-site.com",
    "https://third-site.com"
]
batch_results = process_multiple_urls(urls)
for result in batch_results:
    description = result['description'] or ''  # description may be None
    print(f"{result['title']} - {description[:100]}...")
Advanced Features
Error Handling and Retry Logic
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def create_session_with_retries():
    """Create a requests session with retry logic."""
    session = requests.Session()
    retry_strategy = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504]
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session
def robust_metadata_extraction(url):
    """Extract metadata with robust error handling."""
    session = create_session_with_retries()
    try:
        response = session.get(url, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # Reuse the extraction helpers defined earlier
        return {
            'url': url,
            'title': extract_title(soup),
            'meta_tags': extract_meta_tags(soup),
            'opengraph': extract_opengraph(soup),
            'twitter_cards': extract_twitter_cards(soup),
            'structured_data': extract_structured_data(soup)
        }
    except Exception as e:
        print(f"Failed to extract metadata from {url}: {e}")
        return None
Saving to CSV
import csv

def save_metadata_to_csv(metadata_list, filename='metadata.csv'):
    """Save extracted metadata to a CSV file."""
    if not metadata_list:
        return
    fieldnames = ['url', 'title', 'description', 'keywords', 'og_title', 'og_description', 'og_image']
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for metadata in metadata_list:
            # Use .get with defaults so partially populated entries don't raise KeyError
            meta_tags = metadata.get('meta_tags', {})
            opengraph = metadata.get('opengraph', {})
            row = {
                'url': metadata.get('url', ''),
                'title': metadata.get('title', ''),
                'description': meta_tags.get('description', ''),
                'keywords': meta_tags.get('keywords', ''),
                'og_title': opengraph.get('og:title', ''),
                'og_description': opengraph.get('og:description', ''),
                'og_image': opengraph.get('og:image', '')
            }
            writer.writerow(row)
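The writer expects the nested dictionaries returned by extract_metadata (the flattened rows from process_multiple_urls lack the meta_tags and opengraph keys), so a typical pipeline looks like this short usage sketch:

# Collect full metadata dictionaries, then persist them in one pass
pages = [extract_metadata(u) for u in urls]
save_metadata_to_csv([m for m in pages if m], filename='metadata.csv')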
Best Practices
- Add User-Agent headers to avoid being blocked
- Handle timeouts with appropriate timeout values
- Respect robots.txt and rate limits (see the sketch after the example below)
- Use sessions for multiple requests to the same domain
- Validate and clean data before processing
- Handle different encodings properly
# Example with best practices
url = "https://example.com"
headers = {
    'User-Agent': 'Mozilla/5.0 (compatible; MetadataBot/1.0)',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
}
response = requests.get(url, headers=headers, timeout=10)
response.encoding = response.apparent_encoding  # Handle encoding properly
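For the robots.txt and rate-limit bullet, the standard library's urllib.robotparser is enough for a polite fetcher. A minimal sketch, assuming a fixed one-second delay between requests (a real crawler would honor a site's Crawl-delay directive or published policy instead):

import time
import requests
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

def polite_fetch(url, user_agent='MetadataBot/1.0', delay=1.0):
    """Fetch a page only if robots.txt allows it, then pause briefly."""
    parts = urlparse(url)
    parser = RobotFileParser()
    parser.set_url(f"{parts.scheme}://{parts.netloc}/robots.txt")
    parser.read()
    if not parser.can_fetch(user_agent, url):
        return None  # Disallowed by robots.txt; skip this page
    response = requests.get(url, headers={'User-Agent': user_agent}, timeout=10)
    time.sleep(delay)  # Crude rate limiting between consecutive requests
    return response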
Together, these building blocks let you extract titles, standard meta tags, OpenGraph properties, Twitter Cards, and JSON-LD structured data efficiently and reliably, making the approach well suited to SEO analysis, content management, and social media optimization tasks.