Web scraping and saving data to CSV files is a common task for data extraction and analysis. Python provides excellent tools for this purpose, combining `requests` for HTTP requests, `BeautifulSoup` for HTML parsing, and the built-in `csv` module for file writing.
Prerequisites and Setup
Required Libraries
Install the necessary packages using pip:
```bash
pip install requests beautifulsoup4 lxml pandas
```
- `requests`: HTTP library for making web requests
- `beautifulsoup4`: HTML/XML parser for extracting data
- `lxml`: fast XML and HTML parser (optional but recommended)
- `pandas`: data manipulation library (an alternative to the `csv` module)
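As a quick sanity check before writing any scraper, the packages above can be imported and their versions printed. This is a minimal sketch; the exact version numbers will vary on your machine:

```python
import requests
import bs4
import lxml.etree
import pandas as pd

# Printing the versions confirms that all four packages imported correctly
print("requests:", requests.__version__)
print("beautifulsoup4:", bs4.__version__)
print("lxml:", lxml.etree.__version__)
print("pandas:", pd.__version__)
```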
Basic Web Scraping to CSV
Method 1: Using the csv Module (Lightweight)
```python
import csv
import requests
from bs4 import BeautifulSoup


def scrape_to_csv(url, output_file):
    try:
        # Add headers to avoid being blocked
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

        # Make the HTTP request with a timeout
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raises an exception for bad status codes

        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'lxml')

        # Example: scraping a table
        table = soup.find('table', {'class': 'data-table'})
        if not table:
            print("No table found with the specified selector")
            return

        rows = table.find_all('tr')

        # Write to the CSV file
        with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)

            # Extract and write the header row
            if rows:
                column_headers = [th.get_text(strip=True) for th in rows[0].find_all(['th', 'td'])]
                writer.writerow(column_headers)

            # Extract and write the data rows
            for row in rows[1:]:
                data = [td.get_text(strip=True) for td in row.find_all(['td', 'th'])]
                writer.writerow(data)

        print(f"Data successfully scraped and saved to {output_file}")

    except requests.RequestException as e:
        print(f"Request failed: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")


# Usage
scrape_to_csv('https://example.com/data-table', 'scraped_data.csv')
```
Method 2: Using Pandas (More Features)
```python
import pandas as pd
import requests
from bs4 import BeautifulSoup


def scrape_with_pandas(url, output_file):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'lxml')

        # Extract data into a list of dictionaries
        data = []

        # Example: scraping product information
        products = soup.find_all('div', {'class': 'product-item'})
        for product in products:
            name_el = product.find('h3', {'class': 'product-name'})
            price_el = product.find('span', {'class': 'price'})
            rating_el = product.find('div', {'class': 'rating'})

            data.append({
                'name': name_el.get_text(strip=True) if name_el else 'N/A',
                'price': price_el.get_text(strip=True) if price_el else 'N/A',
                'rating': rating_el.get_text(strip=True) if rating_el else 'N/A',
            })

        # Create a DataFrame and save it to CSV
        df = pd.DataFrame(data)
        df.to_csv(output_file, index=False, encoding='utf-8')

        print(f"Data scraped and saved to {output_file}")
        print(f"Total records: {len(data)}")

    except Exception as e:
        print(f"Error: {e}")


# Usage
scrape_with_pandas('https://example.com/products', 'products.csv')
```
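For pages that already contain well-formed HTML tables, pandas can also handle the parsing itself via `read_html`, with no explicit BeautifulSoup step. A minimal sketch, reusing the hypothetical table page from Method 1:

```python
from io import StringIO

import pandas as pd
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}
response = requests.get('https://example.com/data-table', headers=headers, timeout=10)
response.raise_for_status()

# read_html returns one DataFrame per <table> element found in the document
tables = pd.read_html(StringIO(response.text))
if tables:
    tables[0].to_csv('first_table.csv', index=False, encoding='utf-8')
    print(f"Saved {len(tables[0])} rows from the first table")
```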
Advanced Scraping Techniques
Handling Multiple Pages
```python
import csv
import requests
from bs4 import BeautifulSoup
import time


def scrape_multiple_pages(base_url, max_pages, output_file):
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        headers_written = False

        for page in range(1, max_pages + 1):
            url = f"{base_url}?page={page}"
            print(f"Scraping page {page}...")

            try:
                response = requests.get(url, timeout=10)
                response.raise_for_status()

                soup = BeautifulSoup(response.content, 'lxml')
                rows = soup.find_all('tr', {'class': 'data-row'})

                if not rows:
                    print(f"No data found on page {page}")
                    break

                # Write the header row only once
                if not headers_written:
                    headers = ['Column1', 'Column2', 'Column3']  # Define your column headers
                    writer.writerow(headers)
                    headers_written = True

                # Extract and write the data
                for row in rows:
                    data = [cell.get_text(strip=True) for cell in row.find_all('td')]
                    writer.writerow(data)

                # Be respectful - add a delay between requests
                time.sleep(1)

            except Exception as e:
                print(f"Error on page {page}: {e}")
                continue

    print(f"Scraping completed. Data saved to {output_file}")


# Usage
scrape_multiple_pages('https://example.com/data', 5, 'multi_page_data.csv')
```
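Not every site exposes a clean `?page=N` parameter. If pagination is driven by a "next" link instead, you can follow that link until it disappears. This is a sketch under the assumption that the hypothetical pages mark the link with `rel="next"` and use the same `data-row` rows as above:

```python
import csv
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


def scrape_following_next_links(start_url, output_file, max_pages=50):
    url = start_url
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)

        for _ in range(max_pages):  # hard cap as a safety net
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'lxml')

            for row in soup.find_all('tr', {'class': 'data-row'}):
                writer.writerow([cell.get_text(strip=True) for cell in row.find_all('td')])

            # Stop when the page no longer advertises a "next" link
            next_link = soup.find('a', rel='next')
            if not next_link or not next_link.get('href'):
                break

            url = urljoin(url, next_link['href'])  # resolve relative hrefs
            time.sleep(1)  # stay polite between requests
```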
Handling Forms and POST Requests
```python
import csv
import requests
from bs4 import BeautifulSoup


def scrape_with_form_data(url, form_data, output_file):
    session = requests.Session()

    try:
        # First, get the page to extract any CSRF tokens or hidden fields
        response = session.get(url, timeout=10)
        soup = BeautifulSoup(response.content, 'lxml')

        # Extract the CSRF token if present
        csrf_token = soup.find('input', {'name': 'csrf_token'})
        if csrf_token:
            form_data['csrf_token'] = csrf_token.get('value')

        # Submit the form data
        post_response = session.post(url, data=form_data, timeout=10)
        post_response.raise_for_status()

        # Parse the results
        result_soup = BeautifulSoup(post_response.content, 'lxml')

        # Extract the data and save it to CSV
        results = result_soup.find_all('div', {'class': 'search-result'})

        with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['Title', 'Description', 'URL'])

            for result in results:
                title = result.find('h3').get_text(strip=True) if result.find('h3') else 'N/A'
                desc = result.find('p').get_text(strip=True) if result.find('p') else 'N/A'
                link = result.find('a')['href'] if result.find('a') else 'N/A'
                writer.writerow([title, desc, link])

        print(f"Form data scraped and saved to {output_file}")

    except Exception as e:
        print(f"Error: {e}")


# Usage
form_data = {
    'search_query': 'python web scraping',
    'category': 'programming'
}
scrape_with_form_data('https://example.com/search', form_data, 'search_results.csv')
```
Best Practices and Error Handling
Robust Scraping Function
```python
import csv
import random
import time

import requests
from bs4 import BeautifulSoup


class WebScraper:
    def __init__(self, delay_range=(1, 3)):
        self.session = requests.Session()
        self.delay_range = delay_range
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def scrape_to_csv(self, url, selectors, output_file, max_retries=3):
        """
        Scrape data using CSS selectors and save it to CSV.

        Args:
            url: Target URL
            selectors: Dict mapping column names to CSS selectors
            output_file: Output CSV filename
            max_retries: Number of retry attempts
        """
        for attempt in range(max_retries):
            try:
                response = self.session.get(url, timeout=15)
                response.raise_for_status()

                soup = BeautifulSoup(response.content, 'lxml')

                # Extract data using the CSS selectors
                data = []
                items = soup.select(selectors.get('container', 'div'))

                for item in items:
                    row = {}
                    for column, selector in selectors.items():
                        if column == 'container':
                            continue
                        element = item.select_one(selector)
                        row[column] = element.get_text(strip=True) if element else 'N/A'
                    data.append(row)

                # Save to CSV
                if data:
                    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
                        writer = csv.DictWriter(csvfile, fieldnames=data[0].keys())
                        writer.writeheader()
                        writer.writerows(data)
                    print(f"Successfully scraped {len(data)} items to {output_file}")
                    return True
                else:
                    print("No data found")
                    return False

            except requests.RequestException as e:
                print(f"Attempt {attempt + 1} failed: {e}")
                if attempt < max_retries - 1:
                    time.sleep(random.uniform(*self.delay_range))

        print(f"Failed to scrape after {max_retries} attempts")
        return False

    def random_delay(self):
        """Add a random delay between requests."""
        time.sleep(random.uniform(*self.delay_range))


# Usage example
scraper = WebScraper()

selectors = {
    'container': 'div.product',
    'name': 'h3.product-title',
    'price': 'span.price',
    'rating': 'div.rating span'
}

scraper.scrape_to_csv('https://example.com/products', selectors, 'products.csv')
```
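The `random_delay` helper exists for runs that hit several URLs in sequence. A short usage sketch, with hypothetical category pages that share the selectors defined above:

```python
# Hypothetical category pages with the same page structure
category_urls = [
    'https://example.com/products?category=books',
    'https://example.com/products?category=toys',
]

scraper = WebScraper(delay_range=(2, 5))
for i, url in enumerate(category_urls):
    scraper.scrape_to_csv(url, selectors, f'products_{i}.csv')
    scraper.random_delay()  # pause between pages so the requests are spread out
```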
Common Issues and Solutions
1. JavaScript-Rendered Content
For sites that load content dynamically with JavaScript, use Selenium:
```python
from selenium import webdriver
from selenium.webdriver.common.by import By
import csv
import time


def scrape_js_content(url, output_file):
    # Set up the Chrome driver (install chromedriver first)
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Run in the background
    driver = webdriver.Chrome(options=options)

    try:
        driver.get(url)
        time.sleep(5)  # Wait for the JavaScript to load

        # Find elements after the JavaScript has executed
        items = driver.find_elements(By.CLASS_NAME, 'dynamic-item')

        with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['Title', 'Content'])

            for item in items:
                title = item.find_element(By.TAG_NAME, 'h3').text
                content = item.find_element(By.TAG_NAME, 'p').text
                writer.writerow([title, content])

    finally:
        driver.quit()
```
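A fixed `time.sleep(5)` either wastes time or cuts the wait short. Selenium's explicit waits poll until the content actually appears; the snippet below shows the same lookup expressed with `WebDriverWait` (the `dynamic-item` class name is the same assumption as above):

```python
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Wait up to 15 seconds for at least one matching element to appear,
# then return the list of elements instead of sleeping for a fixed time
items = WebDriverWait(driver, 15).until(
    EC.presence_of_all_elements_located((By.CLASS_NAME, 'dynamic-item'))
)
```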
2. Rate Limiting and Politeness
```python
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


def create_session_with_retries():
    session = requests.Session()

    # Configure the retry strategy
    retry_strategy = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
    )

    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("http://", adapter)
    session.mount("https://", adapter)

    return session
```
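The retrying session handles transient server errors; politeness still has to come from your own loop. A sketch combining the two, with a randomized pause between requests (the URL list is hypothetical):

```python
import random
import time

session = create_session_with_retries()

urls = [
    'https://example.com/data?page=1',
    'https://example.com/data?page=2',
]

for url in urls:
    response = session.get(url, timeout=10)
    response.raise_for_status()
    print(f"Fetched {url} ({len(response.content)} bytes)")

    # Sleep 1-3 seconds between requests to avoid hammering the server
    time.sleep(random.uniform(1, 3))
```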
Legal and Ethical Considerations
- Check robots.txt: always review the site's robots.txt file (for example, https://example.com/robots.txt) before scraping; a programmatic check is sketched below
- Respect rate limits: add delays between requests
- Terms of Service: read and comply with the website's terms
- Personal data: be careful with personally identifiable information
- API alternatives: check whether the site offers an official API
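The standard library can perform the robots.txt check programmatically. A minimal sketch using urllib.robotparser, with example.com standing in for the real target:

```python
from urllib.robotparser import RobotFileParser

USER_AGENT = 'MyScraperBot/1.0'  # hypothetical user agent string

# Download and parse the site's robots.txt
parser = RobotFileParser()
parser.set_url('https://example.com/robots.txt')
parser.read()

# Only scrape the path if it is allowed for our user agent
target_url = 'https://example.com/data-table'
if parser.can_fetch(USER_AGENT, target_url):
    print(f"Allowed to fetch {target_url}")
else:
    print(f"robots.txt disallows {target_url}; skipping")
```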
Summary
Web scraping to CSV in Python involves:
- Making HTTP requests with proper headers and error handling
- Parsing HTML content using BeautifulSoup or similar libraries
- Extracting data with CSS selectors or XPath
- Writing to CSV using the csv module or pandas
- Handling edge cases like JavaScript content and rate limiting
Choose the method that best fits your needs: the csv module for simple tasks, and pandas for more complex data manipulation requirements.