How do I extract data from tables on a website using Python?

Extracting table data from websites is a common web scraping task. Python offers several approaches depending on the complexity of the table and whether it's dynamically loaded.

Method 1: Static Tables with BeautifulSoup

For simple HTML tables that load immediately, requests and BeautifulSoup provide the most efficient solution.

Installation

pip install requests beautifulsoup4 pandas lxml

Basic Table Extraction

import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd

def scrape_table_basic(url, table_selector=None):
    """Extract data from a static HTML table"""

    # Fetch the webpage
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    response = requests.get(url, headers=headers)
    response.raise_for_status()

    # Parse HTML
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the table
    if table_selector:
        table = soup.select_one(table_selector)
    else:
        table = soup.find('table')

    if not table:
        raise ValueError("No table found on the page")

    # Extract column headers (distinct name so we don't shadow the request headers above)
    column_headers = []
    header_row = table.find('tr')
    if header_row:
        column_headers = [th.get_text(strip=True) for th in header_row.find_all(['th', 'td'])]

    # Extract data rows
    data = []
    rows = table.find_all('tr')[1:]  # Skip header row

    for row in rows:
        cells = row.find_all(['td', 'th'])
        row_data = [cell.get_text(strip=True) for cell in cells]
        if row_data:  # Skip empty rows
            data.append(row_data)

    return column_headers, data

# Usage example
url = "https://example.com/data-table"
headers, table_data = scrape_table_basic(url, 'table.data-table')

# Save to CSV
with open('scraped_table.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    writer.writerows(table_data)
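
Since pandas is already imported, the same rows can also be loaded into a DataFrame for quick inspection or further cleaning. A minimal sketch, assuming every row has the same number of cells as the header row:

# Build a DataFrame from the scraped headers and rows
df = pd.DataFrame(table_data, columns=headers)
print(df.head())

# pandas can also write the CSV directly
df.to_csv('scraped_table_pandas.csv', index=False)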

Advanced Table Handling

def scrape_table_advanced(url):
    """Handle complex tables with merged cells and nested content"""

    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    tables = soup.find_all('table')
    all_tables_data = []

    for i, table in enumerate(tables):
        print(f"Processing table {i+1}")

        # Handle tables with thead/tbody structure
        thead = table.find('thead')
        tbody = table.find('tbody')

        headers = []
        if thead:
            header_rows = thead.find_all('tr')
            for row in header_rows:
                row_headers = [th.get_text(strip=True) for th in row.find_all(['th', 'td'])]
                headers.extend(row_headers)

        # Extract data rows, excluding any rows that sit inside <thead>
        all_rows = tbody.find_all('tr') if tbody else table.find_all('tr')
        data_rows = [tr for tr in all_rows if not tr.find_parent('thead')]
        data = []

        for row in data_rows:
            cells = row.find_all(['td', 'th'])
            row_data = []

            for cell in cells:
                # Handle nested elements (links, spans, etc.)
                text = cell.get_text(separator=' ', strip=True)

                # Extract links if present
                links = cell.find_all('a')
                if links:
                    text += f" [Links: {', '.join([a.get('href', '') for a in links])}]"

                row_data.append(text)

            if row_data:
                data.append(row_data)

        all_tables_data.append({
            'headers': headers,
            'data': data,
            'table_index': i
        })

    return all_tables_data
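
A brief usage example, assuming a page containing one or more tables (the URL is a placeholder):

# Usage example
all_tables = scrape_table_advanced("https://example.com/page-with-tables")
for t in all_tables:
    print(f"Table {t['table_index'] + 1}: {len(t['headers'])} columns, {len(t['data'])} rows")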

Method 2: Using Pandas (Recommended for Simple Cases)

Pandas can automatically detect and parse HTML tables (it relies on lxml or html5lib under the hood), making it ideal for straightforward extractions.

import pandas as pd

def scrape_tables_pandas(url, attrs=None):
    """Extract all tables on a page using pandas - the simplest method"""

    try:
        # Read all tables from the webpage; pass attrs (e.g. {'class': 'data-table'})
        # to restrict matching to specific tables
        tables = pd.read_html(url, attrs=attrs)

        # Save each table
        for i, table in enumerate(tables):
            filename = f'table_{i+1}.csv'
            table.to_csv(filename, index=False)
            print(f"Saved {filename} with shape {table.shape}")

        return tables

    except ValueError as e:
        print(f"Error: {e}")
        return []

# Usage
tables = scrape_tables_pandas("https://example.com/page-with-tables")

# Access specific table
if tables:
    first_table = tables[0]
    print(first_table.head())
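
Note that pd.read_html fetches the URL with a default urllib user agent, which some sites reject with HTTP 403. One workaround, sketched below with a placeholder URL and an illustrative helper name, is to fetch the page yourself with requests and hand the HTML to pandas (wrapped in StringIO, which newer pandas versions expect for literal HTML):

from io import StringIO
import requests

def scrape_tables_with_ua(url):
    """Fetch the page with a custom User-Agent, then let pandas parse the tables"""
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
    response.raise_for_status()
    return pd.read_html(StringIO(response.text))

tables = scrape_tables_with_ua("https://example.com/page-with-tables")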

Method 3: Dynamic Tables with Selenium

For JavaScript-rendered tables or tables that load after user interaction:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import pandas as pd
from io import StringIO

def scrape_dynamic_table(url, table_xpath=None):
    """Scrape dynamically loaded tables using Selenium"""

    # Setup Chrome options
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in background
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    driver = webdriver.Chrome(options=chrome_options)

    try:
        driver.get(url)

        # Wait for table to load
        wait = WebDriverWait(driver, 10)
        if table_xpath:
            table = wait.until(EC.presence_of_element_located((By.XPATH, table_xpath)))
        else:
            table = wait.until(EC.presence_of_element_located((By.TAG_NAME, "table")))

        # Option 1: Hand the rendered HTML to pandas (wrap in StringIO for pandas 2.x);
        # return `tables` instead of `data` below if you prefer DataFrames
        tables = pd.read_html(StringIO(driver.page_source))

        # Option 2: Manual extraction, scoped to the table we waited for
        rows = table.find_elements(By.TAG_NAME, "tr")
        data = []

        for row in rows:
            cells = row.find_elements(By.TAG_NAME, "td")
            if not cells:  # Try th for headers
                cells = row.find_elements(By.TAG_NAME, "th")

            row_data = [cell.text.strip() for cell in cells]
            if row_data:
                data.append(row_data)

        return data

    finally:
        driver.quit()

# Usage
table_data = scrape_dynamic_table("https://example.com/dynamic-table")

Method 4: Handling Pagination and Multiple Pages

def scrape_paginated_table(base_url, max_pages=None):
    """Scrape tables across multiple pages"""

    all_data = []
    page = 1

    while True:
        url = f"{base_url}?page={page}"
        print(f"Scraping page {page}")

        try:
            response = requests.get(url)
            soup = BeautifulSoup(response.content, 'html.parser')

            table = soup.find('table')
            if not table:
                break

            # Keep the header row only on the first page, skip it afterwards
            rows = table.find_all('tr')
            start_row = 0 if page == 1 else 1

            for row in rows[start_row:]:
                cells = row.find_all(['td', 'th'])
                row_data = [cell.get_text(strip=True) for cell in cells]
                if row_data:
                    all_data.append(row_data)

            # Check for next page
            next_link = soup.find('a', string='Next')
            if not next_link or (max_pages and page >= max_pages):
                break

            page += 1

        except Exception as e:
            print(f"Error on page {page}: {e}")
            break

    return all_data
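
A short usage example, assuming the target site paginates with a ?page=N query parameter as the function expects:

# Collect up to 5 pages and write the combined rows to CSV
rows = scrape_paginated_table("https://example.com/items", max_pages=5)

with open('paginated_table.csv', 'w', newline='', encoding='utf-8') as f:
    csv.writer(f).writerows(rows)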

Best Practices

Error Handling and Robustness

import time
import random
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def create_robust_session():
    """Create a session with retry strategy"""
    session = requests.Session()

    retry_strategy = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
    )

    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("http://", adapter)
    session.mount("https://", adapter)

    return session

def scrape_with_delays(urls, delay_range=(1, 3)):
    """Scrape multiple URLs with random delays"""
    session = create_robust_session()
    results = []

    for url in urls:
        try:
            # Random delay to be respectful
            time.sleep(random.uniform(*delay_range))

            response = session.get(url, timeout=10)
            # Process response...

        except Exception as e:
            print(f"Error scraping {url}: {e}")
            continue

    return results

Data Cleaning and Validation

def clean_table_data(data):
    """Clean and validate scraped table data"""
    cleaned_data = []

    for row in data:
        cleaned_row = []
        for cell in row:
            # Remove extra whitespace
            clean_cell = ' '.join(cell.split())

            # Normalize non-breaking spaces and common HTML entities
            clean_cell = clean_cell.replace('\xa0', ' ')
            clean_cell = clean_cell.replace('&amp;', '&')

            # Convert numeric strings to appropriate data types
            if clean_cell.isdigit():
                clean_cell = int(clean_cell)
            elif clean_cell.count('.') == 1 and clean_cell.replace('.', '', 1).isdigit():
                clean_cell = float(clean_cell)

            cleaned_row.append(clean_cell)

        cleaned_data.append(cleaned_row)

    return cleaned_data

Common Issues and Solutions

  1. JavaScript-rendered content: Render the page with Selenium or another headless browser
  2. Rate limiting: Implement delays and respect robots.txt
  3. Authentication required: Handle cookies and sessions properly (see the login sketch after this list)
  4. Complex table structures: Parse manually with BeautifulSoup for full control
  5. Large datasets: Consider streaming and chunked processing
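
For issue 3, a requests.Session keeps cookies across requests, so you can log in once and reuse the authenticated session for the table pages. A minimal sketch - the login URL, form field names, and credentials are placeholders you must adapt to the real site:

import requests
from bs4 import BeautifulSoup

session = requests.Session()

# Placeholder login endpoint and form fields - inspect the site's login form for the real names
login_payload = {'username': 'your_user', 'password': 'your_password'}
session.post("https://example.com/login", data=login_payload, timeout=10)

# The session now carries the authentication cookies
response = session.get("https://example.com/protected-table", timeout=10)
soup = BeautifulSoup(response.content, 'html.parser')
table = soup.find('table')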

Legal and Ethical Considerations

Always ensure your scraping activities comply with:

- Website terms of service
- robots.txt file guidelines (see the check sketch below)
- Rate limiting to avoid server overload
- Data protection and privacy laws
- Copyright and intellectual property rights
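
To check robots.txt programmatically, the standard library's urllib.robotparser can tell you whether a URL may be fetched. A small sketch, with example.com and the user agent string as placeholders:

from urllib.robotparser import RobotFileParser

robots = RobotFileParser()
robots.set_url("https://example.com/robots.txt")
robots.read()

target = "https://example.com/data-table"
if robots.can_fetch("MyScraperBot/1.0", target):
    print(f"Allowed to fetch {target}")
else:
    print(f"robots.txt disallows fetching {target}")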
