How do I use Python to extract metadata from web pages?

Extracting metadata from web pages is essential for SEO analysis, social media previews, and content management. With the requests and BeautifulSoup libraries, Python can fetch and parse HTML to extract many types of metadata, including standard meta tags, OpenGraph properties, Twitter Cards, and JSON-LD structured data.

Installation

Install the required libraries:

pip install requests beautifulsoup4 lxml
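
The lxml package is optional here: the examples below use Python's built-in html.parser, but if lxml is installed you can pass it to BeautifulSoup as a faster parser backend:

soup = BeautifulSoup(html, 'lxml')  # optional faster parser backend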

Basic Metadata Extraction

Here's a comprehensive set of functions to extract the different types of metadata:

import json

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def extract_metadata(url):
    """Extract comprehensive metadata from a web page."""
    try:
        # Add headers to avoid blocking
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        metadata = {
            'url': url,
            'title': extract_title(soup),
            'meta_tags': extract_meta_tags(soup),
            'opengraph': extract_opengraph(soup),
            'twitter_cards': extract_twitter_cards(soup),
            'structured_data': extract_structured_data(soup)
        }

        return metadata

    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

def extract_title(soup):
    """Extract page title."""
    title_tag = soup.find('title')
    return title_tag.get_text().strip() if title_tag else None

def extract_meta_tags(soup):
    """Extract standard meta tags."""
    meta_data = {}

    # Standard meta tags
    for meta in soup.find_all('meta'):
        if meta.get('name'):
            meta_data[meta['name']] = meta.get('content', '')
        elif meta.get('http-equiv'):
            meta_data[f"http-equiv-{meta['http-equiv']}"] = meta.get('content', '')

    return meta_data

def extract_opengraph(soup):
    """Extract OpenGraph metadata."""
    og_data = {}

    for meta in soup.find_all('meta', property=True):
        if meta['property'].startswith('og:'):
            og_data[meta['property']] = meta.get('content', '')

    return og_data

def extract_twitter_cards(soup):
    """Extract Twitter Card metadata."""
    twitter_data = {}

    for meta in soup.find_all('meta', attrs={'name': True}):
        if meta['name'].startswith('twitter:'):
            twitter_data[meta['name']] = meta.get('content', '')

    return twitter_data

def extract_structured_data(soup):
    """Extract JSON-LD structured data."""
    structured_data = []

    scripts = soup.find_all('script', type='application/ld+json')
    for script in scripts:
        try:
            # script.string is None for empty tags, so fall back to ''
            data = json.loads(script.string or '')
            structured_data.append(data)
        except json.JSONDecodeError:
            continue

    return structured_data

Usage Examples

Basic Usage

url = "https://example.com"
metadata = extract_metadata(url)

if metadata:
    print(f"Title: {metadata['title']}")
    print(f"Description: {metadata['meta_tags'].get('description', 'N/A')}")
    print(f"Keywords: {metadata['meta_tags'].get('keywords', 'N/A')}")

Extracting Specific Metadata Types

# Extract only OpenGraph data
def get_social_metadata(url):
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    social_data = {}

    # OpenGraph
    for meta in soup.find_all('meta', property=True):
        if meta['property'].startswith('og:'):
            key = meta['property'].replace('og:', '')
            social_data[f'og_{key}'] = meta.get('content', '')

    # Twitter Cards
    for meta in soup.find_all('meta', attrs={'name': True}):
        if meta['name'].startswith('twitter:'):
            key = meta['name'].replace('twitter:', '')
            social_data[f'twitter_{key}'] = meta.get('content', '')

    return social_data

# Usage
social_meta = get_social_metadata("https://example.com")
print(f"Social image: {social_meta.get('og_image', 'N/A')}")

Batch Processing Multiple URLs

def process_multiple_urls(urls):
    """Process multiple URLs and extract metadata."""
    results = []

    for url in urls:
        print(f"Processing: {url}")
        metadata = extract_metadata(url)

        if metadata:
            # Extract key information
            result = {
                'url': url,
                'title': metadata['title'],
                'description': metadata['meta_tags'].get('description'),
                'og_title': metadata['opengraph'].get('og:title'),
                'og_description': metadata['opengraph'].get('og:description'),
                'og_image': metadata['opengraph'].get('og:image')
            }
            results.append(result)

    return results

# Usage
urls = [
    "https://example.com",
    "https://another-site.com",
    "https://third-site.com"
]

batch_results = process_multiple_urls(urls)
for result in batch_results:
    print(f"{result['title']} - {result['description'][:100]}...")

Advanced Features

Error Handling and Retry Logic

from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def create_session_with_retries():
    """Create a requests session with retry logic."""
    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (compatible; MetadataBot/1.0)'
    })

    retry_strategy = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504]
    )

    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("http://", adapter)
    session.mount("https://", adapter)

    return session

def robust_metadata_extraction(url):
    """Extract metadata with retries and robust error handling."""
    session = create_session_with_retries()

    try:
        response = session.get(url, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Reuse the extraction helpers defined earlier
        return {
            'url': url,
            'title': extract_title(soup),
            'meta_tags': extract_meta_tags(soup),
            'opengraph': extract_opengraph(soup),
            'twitter_cards': extract_twitter_cards(soup),
            'structured_data': extract_structured_data(soup)
        }

    except requests.exceptions.RequestException as e:
        print(f"Failed to extract metadata from {url}: {e}")
        return None
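
Usage mirrors the basic extractor, but failed requests are retried automatically before giving up:

# Usage
metadata = robust_metadata_extraction("https://example.com")
if metadata:
    print(f"Title: {metadata['title']}")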

Saving to CSV

import csv

def save_metadata_to_csv(metadata_list, filename='metadata.csv'):
    """Save extracted metadata to CSV file."""
    if not metadata_list:
        return

    fieldnames = ['url', 'title', 'description', 'keywords', 'og_title', 'og_description', 'og_image']

    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for metadata in metadata_list:
            row = {
                'url': metadata.get('url', ''),
                'title': metadata.get('title') or '',
                'description': metadata.get('meta_tags', {}).get('description', ''),
                'keywords': metadata.get('meta_tags', {}).get('keywords', ''),
                'og_title': metadata.get('opengraph', {}).get('og:title', ''),
                'og_description': metadata.get('opengraph', {}).get('og:description', ''),
                'og_image': metadata.get('opengraph', {}).get('og:image', '')
            }
            writer.writerow(row)
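
For example, to run the full extractor over several pages and write the results to disk:

# Usage
urls = ["https://example.com", "https://another-site.com"]
all_metadata = [extract_metadata(u) for u in urls]
save_metadata_to_csv([m for m in all_metadata if m])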

Best Practices

  1. Add User-Agent headers to avoid being blocked
  2. Handle timeouts with appropriate timeout values
  3. Respect robots.txt and rate limits (see the robots.txt sketch after the example below)
  4. Use sessions for multiple requests to the same domain
  5. Validate and clean data before processing
  6. Handle different encodings properly

# Example with best practices
headers = {
    'User-Agent': 'Mozilla/5.0 (compatible; MetadataBot/1.0)',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
}

url = "https://example.com"
response = requests.get(url, headers=headers, timeout=10)
response.encoding = response.apparent_encoding  # Handle encoding properly
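
For point 3, the standard library's urllib.robotparser can check robots.txt before you fetch a page. A minimal sketch, reusing the MetadataBot user agent from the headers above (the one-second delay is an arbitrary example value):

import time
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

def can_fetch(url, user_agent='MetadataBot'):
    """Check robots.txt before fetching a page."""
    parsed = urlparse(url)
    robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
    parser = RobotFileParser()
    parser.set_url(robots_url)
    try:
        parser.read()
    except OSError:
        return True  # robots.txt unreachable; proceed with caution
    return parser.can_fetch(user_agent, url)

for url in urls:
    if can_fetch(url):
        metadata = extract_metadata(url)
    time.sleep(1)  # simple rate limiting between requests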

This comprehensive approach allows you to extract various types of metadata from web pages efficiently and reliably, making it suitable for SEO analysis, content management, and social media optimization tasks.
