Extracting table data from websites is a common web scraping task. Python offers several approaches depending on the complexity of the table and whether it's dynamically loaded.
## Method 1: Static Tables with BeautifulSoup

For simple HTML tables that load immediately, requests and BeautifulSoup provide the most efficient solution.
### Installation

```bash
pip install requests beautifulsoup4 pandas lxml
```

(`lxml` is included because `pandas.read_html`, used in Method 2, relies on an HTML parser such as lxml or html5lib.)
### Basic Table Extraction

```python
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd

def scrape_table_basic(url, table_selector=None):
    """Extract data from a static HTML table"""
    # Fetch the webpage (named request_headers to avoid clashing with the column headers below)
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    response = requests.get(url, headers=request_headers)
    response.raise_for_status()

    # Parse HTML
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the table
    if table_selector:
        table = soup.select_one(table_selector)
    else:
        table = soup.find('table')

    if not table:
        raise ValueError("No table found on the page")

    # Extract column headers
    headers = []
    header_row = table.find('tr')
    if header_row:
        headers = [th.get_text(strip=True) for th in header_row.find_all(['th', 'td'])]

    # Extract data rows
    data = []
    rows = table.find_all('tr')[1:]  # Skip header row
    for row in rows:
        cells = row.find_all(['td', 'th'])
        row_data = [cell.get_text(strip=True) for cell in cells]
        if row_data:  # Skip empty rows
            data.append(row_data)

    return headers, data

# Usage example
url = "https://example.com/data-table"
headers, table_data = scrape_table_basic(url, 'table.data-table')

# Save to CSV
with open('scraped_table.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    writer.writerows(table_data)
```
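Since pandas is already imported, the scraped rows can also go straight into a DataFrame instead of the manual CSV writer. A small sketch reusing the variables above, assuming every row has the same number of cells as the header:

```python
# Build a DataFrame from the scraped rows and write it out in one step
df = pd.DataFrame(table_data, columns=headers)
df.to_csv('scraped_table.csv', index=False)
```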
### Advanced Table Handling

```python
def scrape_table_advanced(url):
    """Handle complex tables with merged cells and nested content"""
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    tables = soup.find_all('table')
    all_tables_data = []

    for i, table in enumerate(tables):
        print(f"Processing table {i+1}")

        # Handle tables with thead/tbody structure
        thead = table.find('thead')
        tbody = table.find('tbody')

        headers = []
        if thead:
            header_rows = thead.find_all('tr')
            for row in header_rows:
                row_headers = [th.get_text(strip=True) for th in row.find_all(['th', 'td'])]
                headers.extend(row_headers)

        # Extract data
        data_rows = tbody.find_all('tr') if tbody else table.find_all('tr')
        data = []

        for row in data_rows:
            cells = row.find_all(['td', 'th'])
            row_data = []

            for cell in cells:
                # Handle nested elements (links, spans, etc.)
                text = cell.get_text(separator=' ', strip=True)

                # Extract links if present
                links = cell.find_all('a')
                if links:
                    text += f" [Links: {', '.join([a.get('href', '') for a in links])}]"

                row_data.append(text)

            if row_data:
                data.append(row_data)

        all_tables_data.append({
            'headers': headers,
            'data': data,
            'table_index': i
        })

    return all_tables_data
```
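A quick way to inspect the results, using a placeholder URL:

```python
# Hypothetical usage: summarize every table found on the page
results = scrape_table_advanced("https://example.com/report")
for table_info in results:
    print(f"Table {table_info['table_index']}: {len(table_info['data'])} rows")
    print("Columns:", table_info['headers'])
```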
## Method 2: Using Pandas (Recommended for Simple Cases)

Pandas can automatically detect and parse HTML tables, making it ideal for straightforward extractions.
```python
import pandas as pd

def scrape_tables_pandas(url):
    """Extract tables using pandas - simplest method"""
    try:
        # Read matching tables from the webpage (drop `attrs` to grab every table)
        tables = pd.read_html(url, attrs={'class': 'data-table'})

        # Save each table
        for i, table in enumerate(tables):
            filename = f'table_{i+1}.csv'
            table.to_csv(filename, index=False)
            print(f"Saved {filename} with shape {table.shape}")

        return tables

    except ValueError as e:
        print(f"Error: {e}")
        return []

# Usage
tables = scrape_tables_pandas("https://example.com/page-with-tables")

# Access specific table
if tables:
    first_table = tables[0]
    print(first_table.head())
```
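`pd.read_html` fetches the URL itself, which makes it harder to control request headers or reuse a session. A common workaround, sketched below with a placeholder URL, is to download the page with requests and hand the HTML to pandas via `StringIO`:

```python
from io import StringIO
import requests
import pandas as pd

# Fetch the page with a browser-like User-Agent, then let pandas parse the tables
resp = requests.get(
    "https://example.com/page-with-tables",
    headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'},
    timeout=10,
)
resp.raise_for_status()
tables = pd.read_html(StringIO(resp.text))
```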
## Method 3: Dynamic Tables with Selenium

For JavaScript-rendered tables, or tables that load after user interaction, use Selenium (`pip install selenium`):
```python
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from io import StringIO
import pandas as pd

def scrape_dynamic_table(url, table_xpath=None):
    """Scrape dynamically loaded tables using Selenium"""
    # Setup Chrome options
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in background
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    driver = webdriver.Chrome(options=chrome_options)

    try:
        driver.get(url)

        # Wait for the table to load; the wait itself is what matters here
        wait = WebDriverWait(driver, 10)
        if table_xpath:
            wait.until(EC.presence_of_element_located((By.XPATH, table_xpath)))
        else:
            wait.until(EC.presence_of_element_located((By.TAG_NAME, "table")))

        # Option 1: Hand the rendered HTML to pandas
        # (return `tables` here if you prefer DataFrames over the manual loop below)
        tables = pd.read_html(StringIO(driver.page_source))

        # Option 2: Manual extraction
        rows = driver.find_elements(By.CSS_SELECTOR, "table tr")
        data = []

        for row in rows:
            cells = row.find_elements(By.TAG_NAME, "td")
            if not cells:  # Try th for headers
                cells = row.find_elements(By.TAG_NAME, "th")

            row_data = [cell.text.strip() for cell in cells]
            if row_data:
                data.append(row_data)

        return data

    finally:
        driver.quit()

# Usage
table_data = scrape_dynamic_table("https://example.com/dynamic-table")
```
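The manual extraction returns plain lists of cell text. If the first scraped row holds the column names, the result converts cleanly to a DataFrame; a sketch under that assumption:

```python
import pandas as pd

# Assumes the first row returned by scrape_dynamic_table() contains the column headers
rows = scrape_dynamic_table("https://example.com/dynamic-table")
if rows:
    df = pd.DataFrame(rows[1:], columns=rows[0])
    df.to_csv("dynamic_table.csv", index=False)
```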
## Method 4: Handling Pagination and Multiple Pages
```python
def scrape_paginated_table(base_url, max_pages=None):
    """Scrape tables across multiple pages"""
    all_data = []
    page = 1

    while True:
        url = f"{base_url}?page={page}"
        print(f"Scraping page {page}")

        try:
            response = requests.get(url)
            soup = BeautifulSoup(response.content, 'html.parser')

            table = soup.find('table')
            if not table:
                break

            # Keep the header row only from the first page; skip it on later pages
            rows = table.find_all('tr')
            start_row = 0 if page == 1 else 1
            for row in rows[start_row:]:
                cells = row.find_all(['td', 'th'])
                row_data = [cell.get_text(strip=True) for cell in cells]
                if row_data:
                    all_data.append(row_data)

            # Check for next page
            next_link = soup.find('a', string='Next')
            if not next_link or (max_pages and page >= max_pages):
                break

            page += 1

        except Exception as e:
            print(f"Error on page {page}: {e}")
            break

    return all_data
```
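A minimal run, using a hypothetical paginated listing and capping the crawl while testing:

```python
# Hypothetical URL; limit to 5 pages during development
rows = scrape_paginated_table("https://example.com/listings", max_pages=5)
print(f"Collected {len(rows)} rows (the first row is the header)")
```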
## Best Practices

### Error Handling and Robustness
```python
import time
import random
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def create_robust_session():
    """Create a session with retry strategy"""
    session = requests.Session()
    retry_strategy = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session

def scrape_with_delays(urls, delay_range=(1, 3)):
    """Scrape multiple URLs with random delays"""
    session = create_robust_session()
    results = []

    for url in urls:
        try:
            # Random delay to be respectful
            time.sleep(random.uniform(*delay_range))
            response = session.get(url, timeout=10)
            # Process response...
        except Exception as e:
            print(f"Error scraping {url}: {e}")
            continue

    return results
```
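The retrying session can stand in for any bare `requests.get` call in the earlier examples; a sketch that fetches and parses one table through it (placeholder URL):

```python
import requests
from bs4 import BeautifulSoup

# Reuse the retrying session for a one-off fetch, then parse as in Method 1
session = create_robust_session()
response = session.get("https://example.com/data-table", timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
table = soup.find('table')
```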
### Data Cleaning and Validation
```python
def clean_table_data(data):
    """Clean and validate scraped table data"""
    cleaned_data = []

    for row in data:
        cleaned_row = []
        for cell in row:
            # Remove extra whitespace
            clean_cell = ' '.join(cell.split())

            # Normalize common HTML entities / special characters
            clean_cell = clean_cell.replace('\xa0', ' ')   # non-breaking space (&nbsp;)
            clean_cell = clean_cell.replace('&amp;', '&')

            # Convert to appropriate data types
            if clean_cell.isdigit():
                clean_cell = int(clean_cell)
            elif clean_cell.replace('.', '', 1).isdigit():
                clean_cell = float(clean_cell)

            cleaned_row.append(clean_cell)
        cleaned_data.append(cleaned_row)

    return cleaned_data
```
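Applied to the output of the basic scraper, numeric columns come back as int or float rather than strings. A short sketch reusing `headers` and `table_data` from Method 1:

```python
# Clean the rows scraped earlier, then load them into a DataFrame
cleaned = clean_table_data(table_data)
df = pd.DataFrame(cleaned, columns=headers)
print(df.head())
```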
## Common Issues and Solutions

- JavaScript-rendered content: Use Selenium or consider headless browsers
- Rate limiting: Implement delays and respect robots.txt (see the robots.txt check sketched after this list)
- Authentication required: Handle cookies and sessions properly
- Complex table structures: Parse manually with BeautifulSoup for full control
- Large datasets: Consider streaming and chunked processing
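Checking robots.txt can be automated with the standard library; a minimal helper, using a hypothetical bot name:

```python
from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser

def is_allowed(url, user_agent="MyTableScraper"):
    """Return True if robots.txt permits fetching this URL (hypothetical helper)"""
    parser = RobotFileParser()
    parser.set_url(urljoin(url, "/robots.txt"))
    parser.read()
    return parser.can_fetch(user_agent, url)

if is_allowed("https://example.com/data-table"):
    print("OK to scrape")
```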
## Legal and Ethical Considerations

Always ensure your scraping activities comply with:

- Website terms of service
- robots.txt file guidelines
- Rate limiting to avoid server overload
- Data protection and privacy laws
- Copyright and intellectual property rights