How do I scrape data from a website and save it to a CSV file using Python?

Scraping a website and saving the results to a CSV file is a common data-extraction task. Python handles it well by combining requests for HTTP, BeautifulSoup for HTML parsing, and the built-in csv module for writing the file.

Prerequisites and Setup

Required Libraries

Install the necessary packages using pip:

pip install requests beautifulsoup4 lxml pandas
  • requests: HTTP library for making web requests
  • beautifulsoup4: HTML/XML parser for extracting data
  • lxml: Fast XML and HTML parser (optional but recommended)
  • pandas: Data manipulation library (alternative to csv module)
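
To confirm the installation, print the library versions from a Python shell (a quick sanity check; the exact version numbers will vary by environment):

import requests
import bs4
import pandas

# An ImportError here means the corresponding package is not installed
print(requests.__version__, bs4.__version__, pandas.__version__)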

Basic Web Scraping to CSV

Method 1: Using csv Module (Lightweight)

import csv
import requests
from bs4 import BeautifulSoup

def scrape_to_csv(url, output_file):
    try:
        # Add headers to avoid being blocked
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

        # Make HTTP request with timeout
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raises exception for bad status codes

        # Parse HTML content
        soup = BeautifulSoup(response.content, 'lxml')

        # Example: Scraping a table
        table = soup.find('table', {'class': 'data-table'})
        if not table:
            print("No table found with the specified selector")
            return

        rows = table.find_all('tr')

        # Write to CSV file
        with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)

            # Extract and write the header row (header_cells avoids clashing with the HTTP headers dict)
            if rows:
                header_cells = [th.get_text(strip=True) for th in rows[0].find_all(['th', 'td'])]
                writer.writerow(header_cells)

                # Extract and write data rows
                for row in rows[1:]:
                    data = [td.get_text(strip=True) for td in row.find_all(['td', 'th'])]
                    writer.writerow(data)

        print(f"Data successfully scraped and saved to {output_file}")

    except requests.RequestException as e:
        print(f"Request failed: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")

# Usage
scrape_to_csv('https://example.com/data-table', 'scraped_data.csv')
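
After a run, it's worth spot-checking the first few rows of the output file (a minimal verification sketch using the same csv module):

import csv

# Print the first three rows of the generated CSV to confirm the structure
with open('scraped_data.csv', newline='', encoding='utf-8') as f:
    for i, row in enumerate(csv.reader(f)):
        print(row)
        if i >= 2:
            break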

Method 2: Using Pandas (More Features)

import pandas as pd
import requests
from bs4 import BeautifulSoup

def scrape_with_pandas(url, output_file):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'lxml')

        # Extract data into a list of dictionaries
        data = []

        # Example: Scraping product information
        products = soup.find_all('div', {'class': 'product-item'})

        def text_or_na(parent, tag, attrs):
            # Return the stripped text of the first match, or 'N/A' if absent
            element = parent.find(tag, attrs)
            return element.get_text(strip=True) if element else 'N/A'

        for product in products:
            item = {
                'name': text_or_na(product, 'h3', {'class': 'product-name'}),
                'price': text_or_na(product, 'span', {'class': 'price'}),
                'rating': text_or_na(product, 'div', {'class': 'rating'}),
            }
            data.append(item)

        # Create DataFrame and save to CSV
        df = pd.DataFrame(data)
        df.to_csv(output_file, index=False, encoding='utf-8')

        print(f"Data scraped and saved to {output_file}")
        print(f"Total records: {len(data)}")

    except Exception as e:
        print(f"Error: {e}")

# Usage
scrape_with_pandas('https://example.com/products', 'products.csv')
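
If the page's data already sits in a plain HTML <table>, pandas can often skip BeautifulSoup entirely: read_html parses every table on the page into a DataFrame (it relies on a parser such as lxml; the URL below is a placeholder):

import pandas as pd

# read_html returns a list of DataFrames, one per <table> found on the page
tables = pd.read_html('https://example.com/data-table')
tables[0].to_csv('table_data.csv', index=False)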

Advanced Scraping Techniques

Handling Multiple Pages

import csv
import requests
from bs4 import BeautifulSoup
import time

def scrape_multiple_pages(base_url, max_pages, output_file):

    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        headers_written = False

        for page in range(1, max_pages + 1):
            url = f"{base_url}?page={page}"
            print(f"Scraping page {page}...")

            try:
                response = requests.get(url, timeout=10)
                response.raise_for_status()

                soup = BeautifulSoup(response.content, 'lxml')
                rows = soup.find_all('tr', {'class': 'data-row'})

                if not rows:
                    print(f"No data found on page {page}")
                    break

                # Write headers only once
                if not headers_written and rows:
                    headers = ['Column1', 'Column2', 'Column3']  # Define your headers
                    writer.writerow(headers)
                    headers_written = True

                # Extract and write data
                for row in rows:
                    data = [cell.get_text(strip=True) for cell in row.find_all('td')]
                    writer.writerow(data)

                # Be respectful - add delay between requests
                time.sleep(1)

            except Exception as e:
                print(f"Error on page {page}: {e}")
                continue

    print(f"Scraping completed. Data saved to {output_file}")

# Usage
scrape_multiple_pages('https://example.com/data', 5, 'multi_page_data.csv')
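
Some sites expose a "next" link instead of predictable page numbers. Below is a sketch of following that link until it disappears, assuming the pagination anchor carries rel="next"; adjust the selector and the row extraction to the real markup:

import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

url = 'https://example.com/data'
while url:
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')

    # ... extract rows here and write them with csv.writer, as above ...

    # Follow the rel="next" link if present; stop when there is none
    next_link = soup.find('a', rel='next')
    url = urljoin(url, next_link['href']) if next_link else None
    time.sleep(1)  # polite delay between requests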

Handling Forms and POST Requests

import csv
import requests
from bs4 import BeautifulSoup

def scrape_with_form_data(url, form_data, output_file):
    session = requests.Session()

    try:
        # First, get the page to extract any CSRF tokens or hidden fields
        response = session.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'lxml')

        # Extract CSRF token if present
        csrf_token = soup.find('input', {'name': 'csrf_token'})
        if csrf_token:
            form_data['csrf_token'] = csrf_token.get('value')

        # Submit form data
        post_response = session.post(url, data=form_data, timeout=10)
        post_response.raise_for_status()

        # Parse results
        result_soup = BeautifulSoup(post_response.content, 'lxml')

        # Extract data and save to CSV
        results = result_soup.find_all('div', {'class': 'search-result'})

        with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['Title', 'Description', 'URL'])

            for result in results:
                title = result.find('h3').get_text(strip=True) if result.find('h3') else 'N/A'
                desc = result.find('p').get_text(strip=True) if result.find('p') else 'N/A'
                link = result.find('a')['href'] if result.find('a') else 'N/A'
                writer.writerow([title, desc, link])

        print(f"Form data scraped and saved to {output_file}")

    except Exception as e:
        print(f"Error: {e}")

# Usage
form_data = {
    'search_query': 'python web scraping',
    'category': 'programming'
}
scrape_with_form_data('https://example.com/search', form_data, 'search_results.csv')

Best Practices and Error Handling

Robust Scraping Function

import csv
import requests
from bs4 import BeautifulSoup
import time
import random

class WebScraper:
    def __init__(self, delay_range=(1, 3)):
        self.session = requests.Session()
        self.delay_range = delay_range
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def scrape_to_csv(self, url, selectors, output_file, max_retries=3):
        """
        Scrape data using CSS selectors and save to CSV

        Args:
            url: Target URL
            selectors: Dict mapping column names to CSS selectors
            output_file: Output CSV filename
            max_retries: Number of retry attempts
        """
        for attempt in range(max_retries):
            try:
                response = self.session.get(url, timeout=15)
                response.raise_for_status()

                soup = BeautifulSoup(response.content, 'lxml')

                # Extract data using selectors
                data = []
                # 'container' holds a CSS selector, so use select() rather than find_all()
                items = soup.select(selectors.get('container', 'div'))

                for item in items:
                    row = {}
                    for column, selector in selectors.items():
                        if column == 'container':
                            continue
                        element = item.select_one(selector)
                        row[column] = element.get_text(strip=True) if element else 'N/A'
                    data.append(row)

                # Save to CSV
                if data:
                    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
                        writer = csv.DictWriter(csvfile, fieldnames=data[0].keys())
                        writer.writeheader()
                        writer.writerows(data)

                    print(f"Successfully scraped {len(data)} items to {output_file}")
                    return True
                else:
                    print("No data found")
                    return False

            except requests.RequestException as e:
                print(f"Attempt {attempt + 1} failed: {e}")
                if attempt < max_retries - 1:
                    time.sleep(random.uniform(*self.delay_range))

        print(f"Failed to scrape after {max_retries} attempts")
        return False

    def random_delay(self):
        """Add random delay between requests"""
        time.sleep(random.uniform(*self.delay_range))

# Usage example
scraper = WebScraper()

selectors = {
    'container': 'div.product',
    'name': 'h3.product-title',
    'price': 'span.price',
    'rating': 'div.rating span'
}

scraper.scrape_to_csv('https://example.com/products', selectors, 'products.csv')

Common Issues and Solutions

1. JavaScript-Rendered Content

For sites that load content dynamically with JavaScript, use Selenium:

from selenium import webdriver
from selenium.webdriver.common.by import By
import csv
import time

def scrape_js_content(url, output_file):
    # Set up the Chrome driver (recent Selenium releases download a matching driver automatically)
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Run in background
    driver = webdriver.Chrome(options=options)

    try:
        driver.get(url)
        time.sleep(5)  # Crude wait for JS to render; WebDriverWait is more robust

        # Find elements after JS execution
        items = driver.find_elements(By.CLASS_NAME, 'dynamic-item')

        with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['Title', 'Content'])

            for item in items:
                title = item.find_element(By.TAG_NAME, 'h3').text
                content = item.find_element(By.TAG_NAME, 'p').text
                writer.writerow([title, content])

    finally:
        driver.quit()
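
Usage follows the same pattern as the earlier functions (the URL is a placeholder):

# Usage
scrape_js_content('https://example.com/dynamic', 'dynamic_content.csv')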

2. Rate Limiting and Politeness

import requests
import time
import random
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def create_session_with_retries():
    session = requests.Session()

    # Configure retry strategy
    retry_strategy = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
    )

    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("http://", adapter)
    session.mount("https://", adapter)

    return session
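
A short usage sketch with placeholder URLs: pair the retrying session with a randomized delay so consecutive requests don't hammer the server:

session = create_session_with_retries()

for page in range(1, 4):
    url = f'https://example.com/data?page={page}'
    response = session.get(url, timeout=10)
    print(url, response.status_code)
    time.sleep(random.uniform(1, 3))  # randomized delay between requests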

Legal and Ethical Considerations

  • Check robots.txt: Always review https://example.com/robots.txt before scraping (see the robotparser sketch after this list)
  • Respect rate limits: Add delays between requests
  • Terms of Service: Read and comply with website terms
  • Personal data: Be careful with personally identifiable information
  • API alternatives: Check if the site offers an API
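
For the robots.txt check specifically, Python's built-in urllib.robotparser can tell you whether a path may be fetched before you scrape it (example.com and the user-agent string are placeholders):

from urllib.robotparser import RobotFileParser

rp = RobotFileParser('https://example.com/robots.txt')
rp.read()

# True if the rules allow this user agent to fetch the given path
print(rp.can_fetch('MyScraperBot/1.0', 'https://example.com/products'))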

Summary

Web scraping to CSV in Python involves:

  1. Making HTTP requests with proper headers and error handling
  2. Parsing HTML content using BeautifulSoup or similar libraries
  3. Extracting data with CSS selectors or XPath
  4. Writing to CSV using the csv module or pandas
  5. Handling edge cases like JavaScript content and rate limiting

Choose the method that fits your needs: the csv module is enough for simple tasks, while pandas suits more complex data manipulation.
