How do I handle file downloads during web scraping with Python?

When scraping websites with Python, you'll often need to download files such as PDFs, images, documents, or other media. This guide covers the most effective methods using requests, urllib, and Selenium, with proper error handling and optimization techniques.

Method 1: Using requests (Recommended)

The requests library is the most popular choice for downloading files due to its simplicity and powerful features.

Basic File Download

import requests
import os
from urllib.parse import urlparse

def download_file(url, local_filename=None):
    """Download a file from URL with error handling"""
    try:
        # Auto-generate filename if not provided
        if not local_filename:
            parsed_url = urlparse(url)
            local_filename = os.path.basename(parsed_url.path) or 'downloaded_file'

        with requests.get(url, stream=True, timeout=30) as response:
            response.raise_for_status()

            with open(local_filename, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)

        print(f"Downloaded: {local_filename}")
        return local_filename

    except requests.exceptions.RequestException as e:
        print(f"Download failed: {e}")
        return None

# Example usage
file_url = 'https://example.com/document.pdf'
download_file(file_url, 'my_document.pdf')
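
Servers often suggest a filename via the Content-Disposition header rather than the URL path. The following is a minimal sketch, assuming the server answers HEAD requests and sends a simple quoted or unquoted filename value; filename_from_headers is a hypothetical helper name, not a requests API:

import re
import requests

def filename_from_headers(url, default='downloaded_file'):
    """Try to read the server-suggested filename from Content-Disposition"""
    response = requests.head(url, allow_redirects=True, timeout=30)
    disposition = response.headers.get('content-disposition', '')
    match = re.search(r'filename="?([^";]+)"?', disposition)
    return match.group(1) if match else default

# Example usage
print(filename_from_headers('https://example.com/document.pdf'))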

Advanced Download with Progress Tracking

import requests
from tqdm import tqdm

def download_with_progress(url, local_filename):
    """Download file with progress bar"""
    response = requests.get(url, stream=True, timeout=30)
    response.raise_for_status()

    total_size = int(response.headers.get('content-length', 0))

    with open(local_filename, 'wb') as file, tqdm(
        desc=local_filename,
        total=total_size,
        unit='B',
        unit_scale=True,
        unit_divisor=1024,
    ) as progress_bar:
        for chunk in response.iter_content(chunk_size=8192):
            size = file.write(chunk)
            progress_bar.update(size)

# Example with progress tracking
download_with_progress('https://example.com/large_file.zip', 'large_file.zip')

Download with Headers and Authentication

import requests

def download_with_auth(url, local_filename, headers=None, auth=None):
    """Download file with custom headers and authentication"""
    session = requests.Session()

    if headers:
        session.headers.update(headers)

    if auth:
        session.auth = auth

    with session.get(url, stream=True, timeout=30) as response:
        response.raise_for_status()

        with open(local_filename, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)

# Example with custom headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}
download_with_auth('https://example.com/protected_file.pdf', 'file.pdf', headers=headers)
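
The auth parameter accepts standard requests authentication objects. For example, HTTP Basic Auth (with hypothetical credentials):

from requests.auth import HTTPBasicAuth

download_with_auth(
    'https://example.com/protected_file.pdf',
    'file.pdf',
    auth=HTTPBasicAuth('username', 'password')  # hypothetical credentials
)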

Method 2: Using urllib (Built-in)

For simple downloads without external dependencies, use Python's built-in urllib:

import os
import urllib.request
import urllib.error
from urllib.parse import urlparse

def download_with_urllib(url, local_filename=None):
    """Download file using urllib"""
    try:
        if not local_filename:
            parsed_url = urlparse(url)
            local_filename = os.path.basename(parsed_url.path) or 'downloaded_file'

        # Note: urlretrieve is a legacy interface that may be deprecated
        # in a future Python release
        urllib.request.urlretrieve(url, local_filename)
        print(f"Downloaded: {local_filename}")
        return local_filename

    except urllib.error.URLError as e:
        print(f"Download failed: {e}")
        return None

# Example usage
download_with_urllib('https://example.com/image.jpg', 'downloaded_image.jpg')
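
urlretrieve also accepts a reporthook callback, which it calls after each block with the block number, block size, and total size. A minimal progress sketch:

import urllib.request

def show_progress(block_num, block_size, total_size):
    """Called by urlretrieve after each block is transferred"""
    downloaded = block_num * block_size
    if total_size > 0:
        percent = min(100, downloaded * 100 / total_size)
        print(f"\r{percent:.1f}% downloaded", end='')

urllib.request.urlretrieve(
    'https://example.com/image.jpg',
    'downloaded_image.jpg',
    reporthook=show_progress
)
print()  # move past the progress line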

urllib with Custom Headers

import urllib.request

def download_with_headers(url, local_filename, headers=None):
    """Download with custom headers using urllib"""
    request = urllib.request.Request(url)

    if headers:
        for key, value in headers.items():
            request.add_header(key, value)

    try:
        with urllib.request.urlopen(request) as response:
            with open(local_filename, 'wb') as file:
                file.write(response.read())
        return local_filename
    except Exception as e:
        print(f"Download failed: {e}")
        return None

# Example with headers
headers = {'User-Agent': 'MyBot 1.0'}
download_with_headers('https://example.com/file.pdf', 'file.pdf', headers)

Method 3: Using Selenium (For Interactive Downloads)

Use Selenium when downloads require user interaction or JavaScript execution:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import os
import time

def setup_download_driver(download_directory):
    """Set up Chrome driver with download preferences"""
    chrome_options = Options()

    # Configure download settings
    prefs = {
        'download.default_directory': os.path.abspath(download_directory),
        'download.prompt_for_download': False,
        'download.directory_upgrade': True,
        'safebrowsing.enabled': True
    }
    chrome_options.add_experimental_option('prefs', prefs)

    # Optional: run in headless mode
    # chrome_options.add_argument('--headless')

    return webdriver.Chrome(options=chrome_options)

def download_with_selenium(url, download_button_selector, download_dir='./downloads'):
    """Download file using Selenium interaction"""
    os.makedirs(download_dir, exist_ok=True)

    driver = setup_download_driver(download_dir)

    try:
        # Navigate to page
        driver.get(url)

        # Wait for and click download button
        download_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, download_button_selector))
        )
        download_button.click()

        # Wait for the download to finish (basic approach).
        # A better approach is to monitor the download directory,
        # as shown in the next section.
        time.sleep(5)

        print("Download initiated successfully")

    except Exception as e:
        print(f"Selenium download failed: {e}")
    finally:
        driver.quit()

# Example usage
download_with_selenium(
    'https://example.com/download-page',
    '#download-btn',
    './downloads'
)

Monitor Download Completion

import glob
import os
import time

def wait_for_download_completion(download_dir, timeout=60):
    """Wait for download to complete by monitoring directory"""
    start_time = time.time()

    while time.time() - start_time < timeout:
        # Check for temporary download files
        temp_files = glob.glob(os.path.join(download_dir, "*.crdownload"))
        if not temp_files:
            # No temporary files, so the download is likely complete.
            # Note: this also returns True if the download never started,
            # so call this only after the download has begun.
            return True
        time.sleep(1)

    return False  # Timeout reached

# Usage after clicking download
if wait_for_download_completion('./downloads'):
    print("Download completed successfully")
else:
    print("Download may have timed out")

Error Handling and Best Practices

Comprehensive Error Handling

import requests
import os
import time
from pathlib import Path

def robust_download(url, local_filename, max_retries=3):
    """Download with comprehensive error handling and retries"""
    for attempt in range(max_retries):
        try:
            response = requests.get(
                url, 
                stream=True, 
                timeout=30,
                headers={'User-Agent': 'Mozilla/5.0 (compatible)'}
            )
            response.raise_for_status()

            # Check content type if needed
            content_type = response.headers.get('content-type', '')
            if 'text/html' in content_type:
                print("Warning: Received HTML instead of file")

            # Create directory if it doesn't exist
            Path(local_filename).parent.mkdir(parents=True, exist_ok=True)

            with open(local_filename, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)

            # Verify file was downloaded
            if os.path.getsize(local_filename) > 0:
                print(f"Successfully downloaded: {local_filename}")
                return True
            else:
                print("Downloaded file is empty")

        except requests.exceptions.Timeout:
            print(f"Timeout on attempt {attempt + 1}")
        except requests.exceptions.ConnectionError:
            print(f"Connection error on attempt {attempt + 1}")
        except requests.exceptions.HTTPError as e:
            print(f"HTTP error {e.response.status_code}: {e}")
            if e.response.status_code == 404:
                break  # Don't retry 404s
        except Exception as e:
            print(f"Unexpected error: {e}")

        if attempt < max_retries - 1:
            print(f"Retrying in 2 seconds...")
            time.sleep(2)

    print("Download failed after all retries")
    return False

# Example usage
success = robust_download(
    'https://example.com/document.pdf',
    './downloads/document.pdf'
)

File Validation

import hashlib
import mimetypes

def validate_downloaded_file(filepath, expected_size=None, expected_hash=None):
    """Validate downloaded file integrity"""
    if not os.path.exists(filepath):
        return False, "File does not exist"

    file_size = os.path.getsize(filepath)

    # Check file size
    if expected_size and file_size != expected_size:
        return False, f"Size mismatch: expected {expected_size}, got {file_size}"

    # Check file hash
    if expected_hash:
        with open(filepath, 'rb') as f:
            file_hash = hashlib.md5(f.read()).hexdigest()
        if file_hash != expected_hash:
            return False, f"Hash mismatch: expected {expected_hash}, got {file_hash}"

    # Check MIME type
    mime_type, _ = mimetypes.guess_type(filepath)

    return True, f"File valid: {file_size} bytes, type: {mime_type}"

# Example validation
is_valid, message = validate_downloaded_file('./downloads/document.pdf')
print(message)
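
For large files, hashing in chunks avoids loading the whole file into memory. A minimal sketch using SHA-256 (swap in whatever algorithm your checksum source uses):

import hashlib

def file_sha256(filepath, chunk_size=65536):
    """Compute a file's SHA-256 hash without reading it all at once"""
    digest = hashlib.sha256()
    with open(filepath, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()

print(file_sha256('./downloads/document.pdf'))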

Bulk File Downloads

import concurrent.futures
import requests
from pathlib import Path

def download_single_file(url_filename_tuple):
    """Download a single file (for use with threading)"""
    url, filename = url_filename_tuple
    try:
        response = requests.get(url, stream=True, timeout=30)
        response.raise_for_status()

        with open(filename, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)

        return f"✓ Downloaded: {filename}"
    except Exception as e:
        return f"✗ Failed {filename}: {e}"

def bulk_download(url_list, download_dir='./downloads', max_workers=5):
    """Download multiple files concurrently"""
    Path(download_dir).mkdir(parents=True, exist_ok=True)

    # Prepare URL-filename pairs
    download_tasks = []
    for i, url in enumerate(url_list):
        filename = Path(download_dir) / f"file_{i+1}_{Path(url).name}"
        download_tasks.append((url, filename))

    # Download concurrently
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(executor.map(download_single_file, download_tasks))

    # Print results
    for result in results:
        print(result)

# Example bulk download
urls = [
    'https://example.com/file1.pdf',
    'https://example.com/file2.jpg',
    'https://example.com/file3.doc'
]
bulk_download(urls)

Choosing the Right Method

  • Use requests for most file downloads - it's feature-rich, handles errors well, and supports streaming
  • Use urllib for simple downloads when you want to avoid external dependencies
  • Use selenium when downloads require:
    • Clicking buttons or form interactions
    • JavaScript execution
    • Authentication through web forms
    • Complex navigation flows
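
A quick way to decide at runtime is to check whether a direct request returns the file or an HTML page. This is a heuristic sketch, not a definitive test; needs_browser is a hypothetical helper name:

import requests

def needs_browser(url):
    """Heuristic: if a direct request returns HTML, the file is probably
    behind a JavaScript- or form-driven download page"""
    try:
        response = requests.head(url, allow_redirects=True, timeout=10)
        content_type = response.headers.get('content-type', '')
        return 'text/html' in content_type
    except requests.exceptions.RequestException:
        return True  # Can't fetch directly; fall back to a browser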

Legal and Ethical Considerations

Always ensure your downloading activities comply with:

  • Website terms of service
  • robots.txt directives
  • Rate limiting (add delays between requests)
  • Copyright and intellectual property laws
  • Data privacy regulations

import time
import random

# Add a polite, randomized delay between downloads
for url in urls:
    time.sleep(random.uniform(1, 3))  # Random delay of 1-3 seconds
    download_file(url)
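
You can also check robots.txt before downloading with the standard library's urllib.robotparser. A minimal sketch, reusing the download_file helper from earlier:

from urllib.robotparser import RobotFileParser

robot_parser = RobotFileParser()
robot_parser.set_url('https://example.com/robots.txt')
robot_parser.read()

target_url = 'https://example.com/document.pdf'
if robot_parser.can_fetch('MyBot 1.0', target_url):
    download_file(target_url)
else:
    print("robots.txt disallows fetching this URL")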
