## Plan
Since `books.toscrape.com` has predictable pagination (~50 pages) and simple HTML structure, **BeautifulSoup + requests** is the ideal choice — lightweight, no Scrapy project boilerplate needed. The scraper walks paginated listing pages (`/catalogue/page-N.html`), extracts the five required fields per book using CSS selectors, handles retries via `requests.adapters.HTTPAdapter`, rate-limits with `time.sleep`, and writes results to `books.csv`.
```python
"""
Scraper for https://books.toscrape.com
Extracts title, price, star rating, availability, and product URL across all pages.
NOTE: Always check /robots.txt before scraping. books.toscrape.com is a sandbox
site explicitly designed for scraping practice.
"""
import csv
import time
import logging
from urllib.parse import urljoin
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from bs4 import BeautifulSoup
BASE_URL = "https://books.toscrape.com/"
LISTING_URL = "https://books.toscrape.com/catalogue/page-{}.html"
OUTPUT_FILE = "books.csv"
REQUEST_DELAY = 1.0 # seconds between requests (be polite)
TIMEOUT = 15
HEADERS = {
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/122.0.0.0 Safari/537.36"
)
}
# Map word-based rating classes (e.g. "star-rating Three") to ints
RATING_MAP = {"One": 1, "Two": 2, "Three": 3, "Four": 4, "Five": 5}
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
log = logging.getLogger(__name__)
def build_session() -> requests.Session:
    """Create a requests.Session that transparently retries transient failures.

    GET requests are retried up to 5 times with exponential backoff
    (factor 0.5) on HTTP 429/500/502/503/504 responses; the polite
    browser-like headers from HEADERS are applied to every request.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=0.5,
        status_forcelist=(429, 500, 502, 503, 504),
        allowed_methods=("GET",),
    )
    adapter = HTTPAdapter(max_retries=retry_policy)
    sess = requests.Session()
    # One adapter instance serves both schemes.
    for scheme in ("https://", "http://"):
        sess.mount(scheme, adapter)
    sess.headers.update(HEADERS)
    return sess
def parse_book(article, page_url: str) -> dict:
    """Turn one <article class='product_pod'> card into a flat field dict.

    page_url is the listing page the card came from; the card's relative
    product href is resolved against it.
    """
    # The full, non-truncated title lives in the anchor's `title` attribute.
    link = article.select_one("h3 > a")
    if link is not None:
        title = link["title"].strip()
        product_url = urljoin(page_url, link["href"])
    else:
        title, product_url = "", ""

    # Displayed price string, e.g. "£51.77".
    price_node = article.select_one("p.price_color")
    price = price_node.get_text(strip=True) if price_node else ""

    # The rating is encoded as a word class: <p class="star-rating Three">.
    rating = 0
    rating_node = article.select_one("p.star-rating")
    if rating_node is not None:
        rating = next(
            (RATING_MAP[cls] for cls in rating_node.get("class", []) if cls in RATING_MAP),
            0,
        )

    # Stock text, e.g. "In stock".
    stock_node = article.select_one("p.instock.availability")
    availability = stock_node.get_text(strip=True) if stock_node else ""

    return {
        "title": title,
        "price": price,
        "rating": rating,
        "availability": availability,
        "url": product_url,
    }
def scrape_all(session: requests.Session) -> list:
    """Walk /catalogue/page-N.html until a 404 marks the end of pagination.

    Returns a list of per-book dicts produced by parse_book. Transient
    failures are already retried by the session's adapter; a page that
    still returns a non-404 error is skipped rather than aborting the
    whole crawl.
    """
    books, page = [], 1
    while True:
        url = LISTING_URL.format(page)
        try:
            resp = session.get(url, timeout=TIMEOUT)
        except requests.RequestException as e:
            # Connection-level failure even after adapter retries: give up.
            log.error("Request failed for %s: %s", url, e)
            break
        if resp.status_code == 404:
            log.info("Reached end of pagination at page %d", page)
            break
        if not resp.ok:
            # Bug fix: the old code logged "Skipping" but then broke out of
            # the loop, aborting the crawl. Actually skip this page instead.
            log.warning("Skipping %s (HTTP %s)", url, resp.status_code)
            page += 1
            time.sleep(REQUEST_DELAY)
            continue
        soup = BeautifulSoup(resp.text, "html.parser")
        # Each book listing is an <article class="product_pod">.
        articles = soup.select("article.product_pod")
        if not articles:
            log.info("No products found on page %d — stopping.", page)
            break
        for art in articles:
            try:
                books.append(parse_book(art, url))
            except Exception as e:  # never let one bad row kill the crawl
                log.warning("Failed to parse a book on page %d: %s", page, e)
        log.info("Page %d: parsed %d books (total %d)", page, len(articles), len(books))
        page += 1
        time.sleep(REQUEST_DELAY)  # politeness delay between pages
    return books
def export_csv(rows: list, path: str) -> None:
    """Write the scraped rows to a CSV file with a fixed column order.

    Does nothing (beyond a warning) when rows is empty.
    """
    if not rows:
        log.warning("No rows to export.")
        return
    header = ["title", "price", "rating", "availability", "url"]
    with open(path, "w", newline="", encoding="utf-8") as handle:
        out = csv.DictWriter(handle, fieldnames=header)
        out.writeheader()
        for row in rows:
            out.writerow(row)
    log.info("Wrote %d rows to %s", len(rows), path)
def main() -> None:
    """Entry point: crawl every listing page, then dump the results to CSV."""
    http = build_session()
    export_csv(scrape_all(http), OUTPUT_FILE)


if __name__ == "__main__":
    main()
```
Build BeautifulSoup and Scrapy Scripts with AI
Tested prompts for generating Python web scrapers with AI, compared across 5 leading AI models.
BEST BY JUDGE SCORE
Claude Haiku 4.5
8/10
The prompt we tested
You are an expert Python developer specializing in web scraping with BeautifulSoup, Scrapy, and Requests. Generate a complete, production-ready Python web scraper based on the user's target site and data requirements below. Output must be a single fenced Python code block preceded by a 2-3 sentence plan and followed by a short usage note. Use BeautifulSoup + requests for simple single-page or small multi-page jobs, and Scrapy for large crawls or pagination-heavy sites; always include a realistic User-Agent, try/except blocks, time.sleep or DOWNLOAD_DELAY, and respect robots.txt with a commented reminder. Keep the code self-contained, PEP8-compliant, under 120 lines where possible, and include inline comments on every non-trivial selector. User's scraping target and requirements: I want to scrape product listings from https://books.toscrape.com across all paginated category pages. For each book I need the title, price, star rating, availability, and product URL, then export everything to a CSV file called books.csv. Produce the full scraper now, including: (1) a brief plan identifying the library choice (BeautifulSoup vs Scrapy) and why, (2) the complete runnable code with imports, headers/user-agent, error handling, retries, and rate-limiting, (3) code to parse the requested fields and export to CSV or JSON, and (4) a short 'How to run' section with pip install commands and example usage.
Model comparison
| Model | Quality | Cost | Latency | Tokens |
|---|---|---|---|---|
| Claude Opus 4.7 | 8 | $0.1608 | 22.3s | 482 / 2048 |
| Claude Haiku 4.5 | 8 | $0.0101 | 11.2s | 353 / 1940 |
| GPT-5.4 | 8 | $0.0202 | 17.3s | 304 / 1293 |
| Grok 4.1 Fast Reasoning | 8 | $0.0006 | 33.6s | 448 / 965 |
Outputs
# Plan
This scraper uses **BeautifulSoup + Requests** because the site has straightforward pagination (page numbers in URL), no heavy JavaScript rendering, and a manageable scope. We'll iterate through category pages, extract book data (title, price, rating, availability), follow pagination links, and export to CSV. Rate-limiting with `time.sleep()` respects the site's resources; the site welcomes educational scraping per its `/robots.txt`.
```python
import requests
from bs4 import BeautifulSoup
import csv
import time
from urllib.parse import urljoin
from typing import List, Dict
import logging
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Realistic User-Agent to avoid being blocked
HEADERS = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
BASE_URL = 'https://books.toscrape.com'
CSV_FILE = 'books.csv'
REQUEST_TIMEOUT = 10
RATE_LIMIT_DELAY = 1 # seconds between requests
def fetch_page(url: str, retries: int = 3) -> str:
    """Fetch a page with rate limiting, retries, and exponential backoff.

    Returns the response body as text; returns None once every attempt
    is exhausted (callers treat any falsy result as failure).
    """
    last_attempt = retries - 1
    for attempt in range(retries):
        time.sleep(RATE_LIMIT_DELAY)  # polite pause before every request
        try:
            response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as e:
            logger.warning(f"Attempt {attempt + 1}/{retries} failed for {url}: {e}")
            if attempt == last_attempt:
                logger.error(f"Failed to fetch {url} after {retries} attempts")
                return None
            time.sleep(2 ** attempt)  # exponential backoff: 1s, 2s, ...
        else:
            return response.text
def extract_star_rating(rating_text: str) -> int:
    """Convert a star-rating word (e.g. 'Three') to its numeric value.

    The input is stripped of surrounding whitespace before lookup;
    unknown or unmapped words fall back to 0.

    Fix: the annotation said ``-> float`` but the mapping holds plain
    ints and that is what is always returned — declare ``int``.
    """
    rating_map = {'Zero': 0, 'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}
    return rating_map.get(rating_text.strip(), 0)
def parse_books(html: str, page_url: str = BASE_URL) -> List[Dict[str, str]]:
    """Extract book data from one listing page's HTML.

    page_url is the URL the HTML was fetched from; relative product
    hrefs are resolved against it. (Fix: joining against BASE_URL
    produced wrong URLs for books listed on /catalogue/ pages.)
    """
    soup = BeautifulSoup(html, 'html.parser')
    books = []
    # Select all book containers
    for item in soup.select('article.product_pod'):
        try:
            # Bug fix: the title/link anchor lives under <h3><a>, not <h2><a>.
            # The old 'h2 a' selector matched nothing, so every row came out
            # as title 'N/A' and URL '#'.
            link = item.select_one('h3 a')
            title = link['title'] if link else 'N/A'
            # Price text, e.g. "£51.77"
            price_tag = item.select_one('p.price_color')
            price = price_tag.get_text(strip=True) if price_tag else 'N/A'
            # Rating word is the second CSS class: <p class="star-rating Four">
            rating_tag = item.select_one('p.star-rating')
            rating_word = (
                rating_tag['class'][1]
                if rating_tag and len(rating_tag.get('class', [])) > 1
                else 'Zero'
            )
            rating = extract_star_rating(rating_word)
            # Availability text, e.g. "In stock"
            avail_tag = item.select_one('p.instock.availability')
            availability = avail_tag.get_text(strip=True) if avail_tag else 'N/A'
            # Relative link -> absolute, against the page it appeared on
            product_url = urljoin(page_url, link['href']) if link else page_url
            books.append({
                'title': title,
                'price': price,
                'rating': rating,
                'availability': availability,
                'url': product_url
            })
        except (AttributeError, TypeError, KeyError, IndexError) as e:
            logger.warning(f"Error parsing book item: {e}")
            continue
    return books


def get_next_page_url(html: str, current_url: str) -> str:
    """Return the absolute URL of the next listing page, or None at the end."""
    soup = BeautifulSoup(html, 'html.parser')
    next_button = soup.select_one('li.next a')
    if next_button:
        # urljoin resolves relative hrefs like 'page-2.html' against the
        # page they appeared on — equivalent to the old rsplit dance.
        return urljoin(current_url, next_button['href'])
    return None


def scrape_all_books(start_url: str = f'{BASE_URL}/index.html') -> List[Dict[str, str]]:
    """Scrape all books across all paginated pages starting at start_url."""
    all_books = []
    current_url = start_url
    page_count = 0
    while current_url:
        page_count += 1
        logger.info(f"Scraping page {page_count}: {current_url}")
        html = fetch_page(current_url)
        if not html:
            break
        # Pass the current page URL so relative product links resolve correctly.
        books = parse_books(html, current_url)
        all_books.extend(books)
        logger.info(f"Found {len(books)} books on page {page_count}. Total so far: {len(all_books)}")
        current_url = get_next_page_url(html, current_url)
    logger.info(f"Scraping complete. Total books collected: {len(all_books)}")
    return all_books
def export_to_csv(books: List[Dict[str, str]], filename: str = CSV_FILE) -> None:
    """Export book data to a CSV file; logs and returns on empty input or IO error."""
    if not books:
        logger.warning("No books to export.")
        return
    try:
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['title', 'price', 'rating', 'availability', 'url']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(books)
        # Bug fix: the success message contained the literal text "(unknown)"
        # instead of interpolating the target filename.
        logger.info(f"Successfully exported {len(books)} books to {filename}")
    except IOError as e:
        logger.error(f"Failed to write CSV file: {e}")
if __name__ == '__main__':
    # books.toscrape.com is a sandbox site built for scraping practice.
    # Always review robots.txt first: https://books.toscrape.com/robots.txt
    export_to_csv(scrape_all_books())
```
## How to Run
**Installation:**
```bash
pip install requests beautifulsoup4
```
**Execution:**
```bash
python scraper.py
```
The script will:
1. Iterate through all paginated pages starting from `https://books.toscrape.com/index.html`
2. Extract title, price, star rating (0–5), availability status, and product URL for each book
3. Respect rate-limiting with 1-second delays between requests and exponential backoff on failures
4. Export all ~1,000 books (50 pages × 20 per page) to `books.csv` in the current directory
Check the CSV file after completion; each row contains one book's data with proper Unicode handling.
For this site, **BeautifulSoup + requests** is the best fit because the pages are static HTML and pagination is simple. The scraper below walks every category, follows all paginated category pages, extracts the requested book fields, normalizes product URLs, rate-limits requests, retries on transient failures, and writes the results to **books.csv**.
```python
import csv
import time
from typing import List, Dict, Set
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
BASE_URL = "https://books.toscrape.com/"
CATALOGUE_URL = urljoin(BASE_URL, "catalogue/")
OUTPUT_FILE = "books.csv"
REQUEST_DELAY = 1.0 # Be polite; also check robots.txt before scraping.
TIMEOUT = 15
MAX_RETRIES = 3
HEADERS = {
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
)
}
def fetch(session: requests.Session, url: str) -> str:
    """GET a URL with linear-backoff retries.

    Sleeps REQUEST_DELAY after each successful fetch for politeness.
    Raises RuntimeError once MAX_RETRIES attempts have failed.
    """
    attempt = 0
    while True:
        attempt += 1
        try:
            response = session.get(url, headers=HEADERS, timeout=TIMEOUT)
            response.raise_for_status()
            time.sleep(REQUEST_DELAY)  # be polite between requests
            return response.text
        except requests.RequestException as exc:
            if attempt >= MAX_RETRIES:
                raise RuntimeError(f"Failed to fetch {url}: {exc}") from exc
            # Backoff grows with the attempt number: delay, 2*delay, ...
            time.sleep(REQUEST_DELAY * attempt)
def get_category_urls(session: requests.Session) -> List[str]:
    """Collect every category landing-page URL from the homepage sidebar."""
    soup = BeautifulSoup(fetch(session, BASE_URL), "html.parser")
    # Category links live in the nested <ul> under the sidebar "Books" entry.
    anchors = soup.select("div.side_categories ul li ul li a")
    return [
        urljoin(BASE_URL, href)
        for href in (a.get("href", "").strip() for a in anchors)
        if href
    ]
def parse_rating(article) -> str:
    """Return the word form of a card's star rating (e.g. 'Three').

    Fix: the original dereferenced select_one(...) without a None check,
    so a card missing its p.star-rating element raised AttributeError.
    Now such cards (and ones with no rating word class) yield "Unknown".
    """
    rating_tag = article.select_one("p.star-rating")
    if rating_tag is None:
        return "Unknown"
    # The rating word is whichever class is not the literal "star-rating".
    for cls in rating_tag.get("class", []):
        if cls != "star-rating":
            return cls
    return "Unknown"
def parse_books_from_page(soup: BeautifulSoup, page_url: str = CATALOGUE_URL) -> List[Dict[str, str]]:
    """Parse all book cards from a category page.

    page_url is the URL the page was fetched from; product hrefs like
    '../../../some-book_123/index.html' are resolved against it.
    (Fix: resolving against CATALOGUE_URL climbed one directory too far
    on category pages, producing URLs missing the /catalogue/ segment.)
    """
    books = []
    # Each product card is an article.product_pod element.
    for article in soup.select("article.product_pod"):
        # The full (non-truncated) title sits in the anchor's title attribute.
        title_tag = article.select_one("h3 a")
        title = title_tag.get("title", "").strip() if title_tag else ""
        # Price text, e.g. "£51.77".
        price_tag = article.select_one("p.price_color")
        price = price_tag.get_text(strip=True) if price_tag else ""
        # Availability text contains stray whitespace; collapse to single spaces.
        avail_tag = article.select_one("p.instock.availability")
        availability = " ".join(avail_tag.get_text().split()) if avail_tag else ""
        href = title_tag.get("href", "").strip() if title_tag else ""
        product_url = urljoin(page_url, href)
        books.append(
            {
                "title": title,
                "price": price,
                "star_rating": parse_rating(article),
                "availability": availability,
                "product_url": product_url,
            }
        )
    return books


def scrape_category(session: requests.Session, category_url: str) -> List[Dict[str, str]]:
    """Follow pagination within a single category and collect all books."""
    results = []
    next_url = category_url
    while next_url:
        html = fetch(session, next_url)
        soup = BeautifulSoup(html, "html.parser")
        # Pass the page's own URL so relative product links resolve correctly.
        results.extend(parse_books_from_page(soup, next_url))
        # The "next" pagination link appears inside li.next > a.
        next_link = soup.select_one("li.next a")
        next_url = urljoin(next_url, next_link["href"]) if next_link and next_link.get("href") else None
    return results
def save_csv(rows: List[Dict[str, str]], filename: str) -> None:
    """Persist the scraped rows as UTF-8 CSV with a fixed column order."""
    columns = ["title", "price", "star_rating", "availability", "product_url"]
    with open(filename, "w", newline="", encoding="utf-8") as handle:
        out = csv.DictWriter(handle, fieldnames=columns)
        out.writeheader()
        for row in rows:
            out.writerow(row)
def main() -> None:
    """Crawl every category, de-duplicate books by URL, and write the CSV."""
    collected: List[Dict[str, str]] = []
    seen: Set[str] = set()
    with requests.Session() as session:
        try:
            for category_url in get_category_urls(session):
                try:
                    for book in scrape_category(session, category_url):
                        # Books appear in multiple listings; keep the first.
                        key = book["product_url"]
                        if key in seen:
                            continue
                        seen.add(key)
                        collected.append(book)
                except Exception as exc:
                    print(f"Skipping category due to error: {category_url} -> {exc}")
        except Exception as exc:
            print(f"Fatal error while scraping: {exc}")
            return
    save_csv(collected, OUTPUT_FILE)
    print(f"Saved {len(collected)} books to {OUTPUT_FILE}")


if __name__ == "__main__":
    main()
```
**How to run**
Install dependencies:
```bash
pip install requests beautifulsoup4
```
Save the script as `scrape_books.py` and run:
```bash
python scrape_books.py
```
This will create `books.csv` in the current directory.
This scraper uses BeautifulSoup + requests because the target is a small demo site (~1000 books across 50 categories with simple "next" pagination), avoiding Scrapy's overhead for this scale. It first parses the categories index page to extract all leaf category URLs via CSS selector `a[href$="index.html"]` in the sidebar, then iterates through each category's pages, robustly parsing product fields with CSS selectors (e.g., `p.price_color` for price). Features include a realistic User-Agent, session reuse, 3x retries with exponential backoff, 1s delays, absolute URL resolution, and CSV export; robots.txt allows all (see comment).
```python
import requests
from bs4 import BeautifulSoup
import csv
import time
from urllib.parse import urljoin
# Respect robots.txt: http://books.toscrape.com/robots.txt allows all user-agents.
BASE_URL = 'https://books.toscrape.com/'
HEADERS = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}
def get_soup(url, session, timeout=15):
    """Fetch a page with retries and return BeautifulSoup, or None on failure.

    Fix: the original requests call passed no timeout, so a stalled
    connection could hang the scraper indefinitely. The new keyword
    parameter defaults to 15 seconds and is backward-compatible.
    """
    for attempt in range(3):
        try:
            resp = session.get(url, headers=HEADERS, timeout=timeout)
            resp.raise_for_status()
            return BeautifulSoup(resp.text, 'html.parser')
        except requests.RequestException:
            time.sleep(2 ** attempt)  # exponential backoff: 1s, 2s, 4s
    print(f"Failed to fetch {url}")
    return None
def parse_books(soup, page_url=BASE_URL):
    """Parse books from a product-list page using CSS selectors.

    page_url is the URL the page was fetched from; relative product
    hrefs are resolved against it. (Fix: joining against BASE_URL broke
    product links on category pages, whose hrefs look like
    '../../../some-book_123/index.html'.) Also guards the h3>a lookup,
    which previously raised AttributeError if the anchor was missing.
    """
    books = []
    for product in soup.select('article.product_pod'):
        link = product.select_one('h3 a')  # title + href anchor
        title = link.get('title', '') if link else ''
        prod_url = urljoin(page_url, link['href']) if link else ''
        price_elem = product.select_one('p.price_color')  # CSS: p.price_color
        price = price_elem.text.strip() if price_elem else ''
        rating_elem = product.select_one('p.star-rating')  # CSS: p.star-rating
        rating = rating_elem['class'][1] if rating_elem and len(rating_elem['class']) > 1 else 'None'
        avail_elem = product.select_one('p.availability')  # CSS: p.availability
        availability = avail_elem.text.strip() if avail_elem else ''
        books.append({
            'title': title, 'price': price, 'rating': rating,
            'availability': availability, 'product_url': prod_url
        })
    return books


def get_next_url(soup, page_url=BASE_URL):
    """Get the next page URL via CSS selector li.next a.

    Fix: the original joined the href against BASE_URL, which mangled
    category pagination links such as 'page-2.html' (they are relative
    to the current page, not the site root).
    """
    next_a = soup.select_one('li.next a')
    return urljoin(page_url, next_a['href']) if next_a else None


def main():
    session = requests.Session()
    session.headers.update(HEADERS)
    all_books = []
    # Sidebar of the top-level "Books" category page lists every category.
    cats_url = BASE_URL + 'catalogue/category/books_1/index.html'
    cats_soup = get_soup(cats_url, session)
    if not cats_soup:
        return
    nav_ul = cats_soup.select_one('aside ul.nav-list')
    # Fix: sidebar hrefs are relative to the page they appear on
    # (e.g. '../books/travel_2/index.html'), so resolve against cats_url,
    # not the site root.
    cat_urls = [urljoin(cats_url, a['href']) for a in nav_ul.select('a[href$="index.html"]') if a['href']]
    for cat_url in cat_urls:
        page_url = cat_url
        while page_url:
            soup = get_soup(page_url, session)
            if not soup:
                break
            # Pass the current page URL so relative links resolve correctly.
            all_books.extend(parse_books(soup, page_url))
            page_url = get_next_url(soup, page_url)
            time.sleep(1)  # Rate limiting
    # Export to CSV
    if all_books:
        with open('books.csv', 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['title', 'price', 'rating', 'availability', 'product_url'])
            writer.writeheader()
            writer.writerows(all_books)
        print(f"Scraped {len(all_books)} books to books.csv")
    else:
        print("No books scraped.")


if __name__ == '__main__':
    main()
```
**How to run:** `pip install requests beautifulsoup4` then `python scraper.py`. Outputs `books.csv` (~1000 rows). Adjust `time.sleep` for politeness.
Related queries
Try it with a real tool
Run this prompt in one of these tools. Affiliate links help keep Gridlyx free.