When scraping websites with Python, you'll often need to download files like PDFs, images, documents, or other media. This guide covers the most effective methods using `requests`, `urllib`, and `selenium`, with proper error handling and optimization techniques.
## Method 1: Using `requests` (Recommended)

The `requests` library is the most popular choice for downloading files due to its simplicity and powerful features.
### Basic File Download
```python
import requests
import os
from urllib.parse import urlparse

def download_file(url, local_filename=None):
    """Download a file from URL with error handling"""
    try:
        # Auto-generate filename if not provided
        if not local_filename:
            parsed_url = urlparse(url)
            local_filename = os.path.basename(parsed_url.path) or 'downloaded_file'

        with requests.get(url, stream=True, timeout=30) as response:
            response.raise_for_status()
            with open(local_filename, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)

        print(f"Downloaded: {local_filename}")
        return local_filename
    except requests.exceptions.RequestException as e:
        print(f"Download failed: {e}")
        return None

# Example usage
file_url = 'https://example.com/document.pdf'
download_file(file_url, 'my_document.pdf')
```
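Streaming with `iter_content` keeps memory use flat no matter how large the file is. An equivalent pattern you may prefer is handing the raw response stream to `shutil.copyfileobj`; a minimal sketch (the URL is a placeholder):

```python
import shutil
import requests

# response.raw is the undecoded byte stream from the server,
# so this suits binary files served without gzip encoding
with requests.get('https://example.com/document.pdf', stream=True, timeout=30) as response:
    response.raise_for_status()
    with open('document.pdf', 'wb') as file:
        shutil.copyfileobj(response.raw, file)
```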
### Advanced Download with Progress Tracking

This example uses the third-party `tqdm` package (install it with `pip install tqdm`):
```python
import requests
from tqdm import tqdm

def download_with_progress(url, local_filename):
    """Download file with progress bar"""
    response = requests.get(url, stream=True)
    response.raise_for_status()

    total_size = int(response.headers.get('content-length', 0))

    with open(local_filename, 'wb') as file, tqdm(
        desc=local_filename,
        total=total_size,
        unit='B',
        unit_scale=True,
        unit_divisor=1024,
    ) as progress_bar:
        for chunk in response.iter_content(chunk_size=8192):
            size = file.write(chunk)
            progress_bar.update(size)

# Example with progress tracking
download_with_progress('https://example.com/large_file.zip', 'large_file.zip')
```
### Download with Headers and Authentication
```python
import requests

def download_with_auth(url, local_filename, headers=None, auth=None):
    """Download file with custom headers and authentication"""
    session = requests.Session()

    if headers:
        session.headers.update(headers)
    if auth:
        session.auth = auth

    with session.get(url, stream=True) as response:
        response.raise_for_status()
        with open(local_filename, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)

# Example with custom headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}
download_with_auth('https://example.com/protected_file.pdf', 'file.pdf', headers=headers)
```
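The `auth` parameter accepts any `requests` auth object. A minimal sketch using HTTP Basic authentication (the URL and credentials are placeholders):

```python
from requests.auth import HTTPBasicAuth

# Placeholder credentials for illustration only
download_with_auth(
    'https://example.com/protected_file.pdf',
    'file.pdf',
    auth=HTTPBasicAuth('your_username', 'your_password')
)
```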
## Method 2: Using `urllib` (Built-in)

For simple downloads without external dependencies, use Python's built-in `urllib`:
```python
import os
import urllib.request
import urllib.error
from urllib.parse import urlparse

def download_with_urllib(url, local_filename=None):
    """Download file using urllib"""
    try:
        if not local_filename:
            parsed_url = urlparse(url)
            local_filename = os.path.basename(parsed_url.path) or 'downloaded_file'

        urllib.request.urlretrieve(url, local_filename)
        print(f"Downloaded: {local_filename}")
        return local_filename
    except urllib.error.URLError as e:
        print(f"Download failed: {e}")
        return None

# Example usage
download_with_urllib('https://example.com/image.jpg', 'downloaded_image.jpg')
```
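`urlretrieve` also accepts a `reporthook` callback, which gives you basic progress reporting with no third-party packages. A minimal sketch (the URL is a placeholder):

```python
import urllib.request

def report_progress(block_num, block_size, total_size):
    """Print rough progress; total_size is -1 if the server omits Content-Length."""
    if total_size > 0:
        percent = min(100, block_num * block_size * 100 // total_size)
        print(f"\rDownloading: {percent}%", end='')

urllib.request.urlretrieve('https://example.com/image.jpg', 'image.jpg', report_progress)
print()  # finish the progress line
```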
### `urllib` with Custom Headers
```python
import urllib.request

def download_with_headers(url, local_filename, headers=None):
    """Download with custom headers using urllib"""
    request = urllib.request.Request(url)

    if headers:
        for key, value in headers.items():
            request.add_header(key, value)

    try:
        with urllib.request.urlopen(request) as response:
            with open(local_filename, 'wb') as file:
                file.write(response.read())
        return local_filename
    except Exception as e:
        print(f"Download failed: {e}")
        return None

# Example with headers
headers = {'User-Agent': 'MyBot 1.0'}
download_with_headers('https://example.com/file.pdf', 'file.pdf', headers)
```
## Method 3: Using Selenium (For Interactive Downloads)
Use Selenium when downloads require user interaction or JavaScript execution:
```python
import os
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options

def setup_download_driver(download_directory):
    """Set up Chrome driver with download preferences"""
    chrome_options = Options()

    # Configure download settings
    prefs = {
        'download.default_directory': os.path.abspath(download_directory),
        'download.prompt_for_download': False,
        'download.directory_upgrade': True,
        'safebrowsing.enabled': True
    }
    chrome_options.add_experimental_option('prefs', prefs)

    # Optional: run in headless mode
    # chrome_options.add_argument('--headless')

    return webdriver.Chrome(options=chrome_options)

def download_with_selenium(url, download_button_selector, download_dir='./downloads'):
    """Download file using Selenium interaction"""
    os.makedirs(download_dir, exist_ok=True)
    driver = setup_download_driver(download_dir)

    try:
        # Navigate to the page
        driver.get(url)

        # Wait for and click the download button
        download_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, download_button_selector))
        )
        download_button.click()

        # Wait for download to complete (basic approach)
        # Better approach: monitor the download directory
        time.sleep(5)
        print("Download initiated successfully")
    except Exception as e:
        print(f"Selenium download failed: {e}")
    finally:
        driver.quit()

# Example usage
download_with_selenium(
    'https://example.com/download-page',
    '#download-btn',
    './downloads'
)
```
### Monitor Download Completion
```python
import glob
import os
import time

def wait_for_download_completion(download_dir, timeout=60):
    """Wait for download to complete by monitoring directory"""
    start_time = time.time()

    while time.time() - start_time < timeout:
        # Chrome writes in-progress downloads as *.crdownload files
        temp_files = glob.glob(os.path.join(download_dir, "*.crdownload"))
        if not temp_files:
            # No temporary files, download likely complete
            return True
        time.sleep(1)

    return False  # Timeout reached

# Usage after clicking download
if wait_for_download_completion('./downloads'):
    print("Download completed successfully")
else:
    print("Download may have timed out")
```
## Error Handling and Best Practices

### Comprehensive Error Handling
```python
import os
import time
from pathlib import Path

import requests

def robust_download(url, local_filename, max_retries=3):
    """Download with comprehensive error handling and retries"""
    for attempt in range(max_retries):
        try:
            response = requests.get(
                url,
                stream=True,
                timeout=30,
                headers={'User-Agent': 'Mozilla/5.0 (compatible)'}
            )
            response.raise_for_status()

            # Warn if the server sent an HTML page instead of a file
            content_type = response.headers.get('content-type', '')
            if 'text/html' in content_type:
                print("Warning: Received HTML instead of file")

            # Create directory if it doesn't exist
            Path(local_filename).parent.mkdir(parents=True, exist_ok=True)

            with open(local_filename, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)

            # Verify file was downloaded
            if os.path.getsize(local_filename) > 0:
                print(f"Successfully downloaded: {local_filename}")
                return True
            else:
                print("Downloaded file is empty")
        except requests.exceptions.Timeout:
            print(f"Timeout on attempt {attempt + 1}")
        except requests.exceptions.ConnectionError:
            print(f"Connection error on attempt {attempt + 1}")
        except requests.exceptions.HTTPError as e:
            print(f"HTTP error {e.response.status_code}: {e}")
            if e.response.status_code == 404:
                break  # Don't retry 404s
        except Exception as e:
            print(f"Unexpected error: {e}")

        if attempt < max_retries - 1:
            print("Retrying in 2 seconds...")
            time.sleep(2)

    print("Download failed after all retries")
    return False

# Example usage
success = robust_download(
    'https://example.com/document.pdf',
    './downloads/document.pdf'
)
```
### File Validation
```python
import hashlib
import mimetypes
import os

def validate_downloaded_file(filepath, expected_size=None, expected_hash=None):
    """Validate downloaded file integrity"""
    if not os.path.exists(filepath):
        return False, "File does not exist"

    file_size = os.path.getsize(filepath)

    # Check file size
    if expected_size and file_size != expected_size:
        return False, f"Size mismatch: expected {expected_size}, got {file_size}"

    # Check file hash
    if expected_hash:
        with open(filepath, 'rb') as f:
            file_hash = hashlib.md5(f.read()).hexdigest()
        if file_hash != expected_hash:
            return False, f"Hash mismatch: expected {expected_hash}, got {file_hash}"

    # Check MIME type
    mime_type, _ = mimetypes.guess_type(filepath)
    return True, f"File valid: {file_size} bytes, type: {mime_type}"

# Example validation
is_valid, message = validate_downloaded_file('./downloads/document.pdf')
print(message)
```
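If the site publishes a checksum for the file, pass it in to catch corrupted or truncated downloads. The helper above computes MD5, so swap in `hashlib.sha256` if the site publishes SHA-256 sums. The digest below is a placeholder:

```python
# Placeholder digest; use the value published alongside the file
is_valid, message = validate_downloaded_file(
    './downloads/document.pdf',
    expected_hash='replace-with-the-published-md5-digest'
)
print(message)
```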
## Bulk File Downloads
```python
import concurrent.futures
import requests
from pathlib import Path

def download_single_file(url_filename_tuple):
    """Download a single file (for use with threading)"""
    url, filename = url_filename_tuple
    try:
        response = requests.get(url, stream=True, timeout=30)
        response.raise_for_status()

        with open(filename, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
        return f"✓ Downloaded: {filename}"
    except Exception as e:
        return f"✗ Failed {filename}: {e}"

def bulk_download(url_list, download_dir='./downloads', max_workers=5):
    """Download multiple files concurrently"""
    Path(download_dir).mkdir(parents=True, exist_ok=True)

    # Prepare URL-filename pairs
    download_tasks = []
    for i, url in enumerate(url_list):
        filename = Path(download_dir) / f"file_{i+1}_{Path(url).name}"
        download_tasks.append((url, filename))

    # Download concurrently
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(executor.map(download_single_file, download_tasks))

    # Print results
    for result in results:
        print(result)

# Example bulk download
urls = [
    'https://example.com/file1.pdf',
    'https://example.com/file2.jpg',
    'https://example.com/file3.doc'
]
bulk_download(urls)
```
## Choosing the Right Method

- Use `requests` for most file downloads; it's feature-rich, handles errors well, and supports streaming
- Use `urllib` for simple downloads when you want to avoid external dependencies
- Use `selenium` when downloads require:
  - Clicking buttons or form interactions
  - JavaScript execution
  - Authentication through web forms
  - Complex navigation flows
## Legal and Ethical Considerations

Always ensure your downloading activities comply with:

- Website terms of service
- Robots.txt directives
- Rate limiting (add delays between requests)
- Copyright and intellectual property laws
- Data privacy regulations
```python
import time
import random

# Add polite delays between downloads
time.sleep(random.uniform(1, 3))  # Random delay of 1-3 seconds
```
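As a minimal sketch, here is how that delay fits into a sequential scraping loop, reusing the `download_file` helper from Method 1 (the URLs are placeholders):

```python
import time
import random

urls = [
    'https://example.com/file1.pdf',
    'https://example.com/file2.pdf',
]

for url in urls:
    download_file(url)
    time.sleep(random.uniform(1, 3))  # polite pause between requests
```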