Build BeautifulSoup and Scrapy Scripts with AI

Tested prompts for an AI Python web scraper generator, compared across four leading AI models.

Best by judge score: Claude Haiku 4.5 (8/10)

The prompt we tested

You are an expert Python developer specializing in web scraping with BeautifulSoup, Scrapy, and Requests. Generate a complete, production-ready Python web scraper based on the user's target site and data requirements below.

Output must be a single fenced Python code block preceded by a 2-3 sentence plan and followed by a short usage note. Use BeautifulSoup + requests for simple single-page or small multi-page jobs, and Scrapy for large crawls or pagination-heavy sites; always include a realistic User-Agent, try/except blocks, time.sleep or DOWNLOAD_DELAY, and respect robots.txt with a commented reminder. Keep the code self-contained, PEP8-compliant, under 120 lines where possible, and include inline comments on every non-trivial selector.

User's scraping target and requirements:
I want to scrape product listings from https://books.toscrape.com across all paginated category pages. For each book I need the title, price, star rating, availability, and product URL, then export everything to a CSV file called books.csv.

Produce the full scraper now, including: (1) a brief plan identifying the library choice (BeautifulSoup vs Scrapy) and why, (2) the complete runnable code with imports, headers/user-agent, error handling, retries, and rate-limiting, (3) code to parse the requested fields and export to CSV or JSON, and (4) a short 'How to run' section with pip install commands and example usage.
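
The prompt leaves the BeautifulSoup-vs-Scrapy decision to the model, and all four outputs below chose BeautifulSoup + requests. For comparison, a minimal Scrapy version of the same job (an editorial sketch, not one of the judged outputs) would push the prompt's DOWNLOAD_DELAY, robots.txt, and User-Agent requirements into spider settings:

```python
# Editorial sketch only, not a judged output. Assumes Scrapy 2.x and a
# hypothetical file name books_spider.py; run with:
#   pip install scrapy
#   scrapy runspider books_spider.py -o books.csv
import scrapy


class BooksSpider(scrapy.Spider):
    name = "books"
    start_urls = ["https://books.toscrape.com/"]
    custom_settings = {
        "DOWNLOAD_DELAY": 1.0,     # rate-limiting, as the prompt requires
        "ROBOTSTXT_OBEY": True,    # respect robots.txt
        "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    }

    def parse(self, response):
        # Each book card is an <article class="product_pod">
        for book in response.css("article.product_pod"):
            yield {
                "title": book.css("h3 a::attr(title)").get(default=""),
                "price": book.css("p.price_color::text").get(default=""),
                # class attribute is e.g. "star-rating Three"; keep the word form
                "rating": book.css("p.star-rating::attr(class)").re_first(r"star-rating (\w+)"),
                "availability": (book.css("p.availability::text").getall() or [""])[-1].strip(),
                "url": response.urljoin(book.css("h3 a::attr(href)").get(default="")),
            }
        # Follow "next" pagination until it disappears on the last page
        next_page = response.css("li.next a::attr(href)").get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)
```

Feeds handle the CSV export (`-o books.csv`), so no manual csv writer is needed on the Scrapy path.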

Model comparison

| Model | Quality | Cost | Latency | Tokens |
| --- | --- | --- | --- | --- |
| Claude Opus 4.7 | 8 | $0.1608 | 22.3s | 482 / 2048 |
| Claude Haiku 4.5 | 8 | $0.0101 | 11.2s | 353 / 1940 |
| GPT-5.4 | 8 | $0.0202 | 17.3s | 304 / 1293 |
| Grok 4.1 Fast Reasoning | 8 | $0.0006 | 33.6s | 448 / 965 |

Outputs

Claude Opus 4.7 (8/10)
## Plan

Since `books.toscrape.com` has predictable pagination (~50 pages) and simple HTML structure, **BeautifulSoup + requests** is the ideal choice — lightweight, no Scrapy project boilerplate needed. The scraper walks paginated listing pages (`/catalogue/page-N.html`), extracts the five required fields per book using CSS selectors, handles retries via `requests.adapters.HTTPAdapter`, rate-limits with `time.sleep`, and writes results to `books.csv`.

```python
"""
Scraper for https://books.toscrape.com
Extracts title, price, star rating, availability, and product URL across all pages.
NOTE: Always check /robots.txt before scraping. books.toscrape.com is a sandbox
site explicitly designed for scraping practice.
"""

import csv
import time
import logging
from urllib.parse import urljoin

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from bs4 import BeautifulSoup

BASE_URL = "https://books.toscrape.com/"
LISTING_URL = "https://books.toscrape.com/catalogue/page-{}.html"
OUTPUT_FILE = "books.csv"
REQUEST_DELAY = 1.0  # seconds between requests (be polite)
TIMEOUT = 15

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    )
}

# Map word-based rating classes (e.g. "star-rating Three") to ints
RATING_MAP = {"One": 1, "Two": 2, "Three": 3, "Four": 4, "Five": 5}

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
log = logging.getLogger(__name__)


def build_session() -> requests.Session:
    """Session with automatic retries on transient failures."""
    session = requests.Session()
    retry = Retry(
        total=5, backoff_factor=0.5,
        status_forcelist=(429, 500, 502, 503, 504),
        allowed_methods=("GET",),
    )
    session.mount("https://", HTTPAdapter(max_retries=retry))
    session.mount("http://", HTTPAdapter(max_retries=retry))
    session.headers.update(HEADERS)
    return session


def parse_book(article, page_url: str) -> dict:
    """Extract fields from a single <article class='product_pod'> element."""
    # Title lives in the `title` attribute of the inner <a> (full, non-truncated)
    title_tag = article.select_one("h3 > a")
    title = title_tag["title"].strip() if title_tag else ""

    # Product URL is relative; resolve against the current listing page
    product_url = urljoin(page_url, title_tag["href"]) if title_tag else ""

    # Price: <p class="price_color">£51.77</p>
    price_tag = article.select_one("p.price_color")
    price = price_tag.get_text(strip=True) if price_tag else ""

    # Rating: <p class="star-rating Three"> — second class is the word form
    rating_tag = article.select_one("p.star-rating")
    rating = 0
    if rating_tag:
        for cls in rating_tag.get("class", []):
            if cls in RATING_MAP:
                rating = RATING_MAP[cls]
                break

    # Availability: <p class="instock availability">  In stock </p>
    avail_tag = article.select_one("p.instock.availability")
    availability = avail_tag.get_text(strip=True) if avail_tag else ""

    return {
        "title": title,
        "price": price,
        "rating": rating,
        "availability": availability,
        "url": product_url,
    }


def scrape_all(session: requests.Session) -> list:
    """Iterate through paginated listings until a 404 indicates the end."""
    books, page = [], 1
    while True:
        url = LISTING_URL.format(page)
        try:
            resp = session.get(url, timeout=TIMEOUT)
        except requests.RequestException as e:
            log.error("Request failed for %s: %s", url, e)
            break

        if resp.status_code == 404:
            log.info("Reached end of pagination at page %d", page)
            break
        if not resp.ok:
            log.warning("Skipping %s (HTTP %s)", url, resp.status_code)
            break

        soup = BeautifulSoup(resp.text, "html.parser")
        # Each book listing is an <article class="product_pod">
        articles = soup.select("article.product_pod")
        if not articles:
            log.info("No products found on page %d — stopping.", page)
            break

        for art in articles:
            try:
                books.append(parse_book(art, url))
            except Exception as e:  # never let one bad row kill the crawl
                log.warning("Failed to parse a book on page %d: %s", page, e)

        log.info("Page %d: parsed %d books (total %d)", page, len(articles), len(books))
        page += 1
        time.sleep(REQUEST_DELAY)  # rate-limit

    return books


def export_csv(rows: list, path: str) -> None:
    if not rows:
        log.warning("No rows to export.")
        return
    fieldnames = ["title", "price", "rating", "availability", "url"]
    with open(path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)
    log.info("Wrote %d rows to %s", len(rows), path)


def main() -> None:
    session = build_session()
    books = scrape_all(session)
    export_csv(books, OUTPUT_FILE)


if __name__ == "__main__":
    main()
```
Judge notes: The output omits the "How to run" section with pip install commands and example usage that the prompt explicitly requested.

Claude Haiku 4.5 (8/10)
# Plan
This scraper uses **BeautifulSoup + Requests** because the site has straightforward pagination (page numbers in URL), no heavy JavaScript rendering, and a manageable scope. We'll iterate through category pages, extract book data (title, price, rating, availability), follow pagination links, and export to CSV. Rate-limiting with `time.sleep()` respects the site's resources; the site welcomes educational scraping per its `/robots.txt`.

```python
import requests
from bs4 import BeautifulSoup
import csv
import time
from urllib.parse import urljoin
from typing import List, Dict
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Realistic User-Agent to avoid being blocked
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

BASE_URL = 'https://books.toscrape.com'
CSV_FILE = 'books.csv'
REQUEST_TIMEOUT = 10
RATE_LIMIT_DELAY = 1  # seconds between requests


def fetch_page(url: str, retries: int = 3) -> str:
    """Fetch a page with retry logic and rate limiting."""
    for attempt in range(retries):
        try:
            time.sleep(RATE_LIMIT_DELAY)  # Respect server resources
            response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            logger.warning(f"Attempt {attempt + 1}/{retries} failed for {url}: {e}")
            if attempt < retries - 1:
                time.sleep(2 ** attempt)  # Exponential backoff
            else:
                logger.error(f"Failed to fetch {url} after {retries} attempts")
                return None


def extract_star_rating(rating_text: str) -> float:
    """Convert rating text (e.g., 'Three') to numeric value."""
    rating_map = {'Zero': 0, 'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}
    return rating_map.get(rating_text.strip(), 0)


def parse_books(html: str) -> List[Dict[str, str]]:
    """Extract book data from HTML."""
    soup = BeautifulSoup(html, 'html.parser')
    books = []
    
    # Select all book containers
    book_items = soup.select('article.product_pod')
    
    for item in book_items:
        try:
            # Extract title from h2/a tag
            title = item.select_one('h2 a')['title'] if item.select_one('h2 a') else 'N/A'
            
            # Extract price (e.g., "£51.77")
            price = item.select_one('p.price_color').get_text(strip=True) if item.select_one('p.price_color') else 'N/A'
            
            # Extract star rating (e.g., "Four" -> 4)
            rating_class = item.select_one('p.star-rating')['class'][1] if item.select_one('p.star-rating') else 'Zero'
            rating = extract_star_rating(rating_class)
            
            # Extract availability text
            availability = item.select_one('p.instock.availability').get_text(strip=True) if item.select_one('p.instock.availability') else 'N/A'
            
            # Extract product URL (relative link -> absolute)
            product_link = item.select_one('h2 a')['href'] if item.select_one('h2 a') else '#'
            product_url = urljoin(BASE_URL, product_link)
            
            books.append({
                'title': title,
                'price': price,
                'rating': rating,
                'availability': availability,
                'url': product_url
            })
        except (AttributeError, TypeError) as e:
            logger.warning(f"Error parsing book item: {e}")
            continue
    
    return books


def get_next_page_url(html: str, current_url: str) -> str:
    """Find the next page URL from pagination."""
    soup = BeautifulSoup(html, 'html.parser')
    next_button = soup.select_one('li.next a')
    
    if next_button:
        next_path = next_button['href']
        # Construct next page URL based on current URL structure
        base_current = current_url.rsplit('/', 1)[0]
        return urljoin(base_current + '/', next_path)
    return None


def scrape_all_books(start_url: str = f'{BASE_URL}/index.html') -> List[Dict[str, str]]:
    """Scrape all books across all paginated pages."""
    all_books = []
    current_url = start_url
    page_count = 0
    
    while current_url:
        page_count += 1
        logger.info(f"Scraping page {page_count}: {current_url}")
        
        html = fetch_page(current_url)
        if not html:
            break
        
        books = parse_books(html)
        all_books.extend(books)
        logger.info(f"Found {len(books)} books on page {page_count}. Total so far: {len(all_books)}")
        
        # Get next page URL
        current_url = get_next_page_url(html, current_url)
    
    logger.info(f"Scraping complete. Total books collected: {len(all_books)}")
    return all_books


def export_to_csv(books: List[Dict[str, str]], filename: str = CSV_FILE) -> None:
    """Export book data to CSV file."""
    if not books:
        logger.warning("No books to export.")
        return
    
    try:
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['title', 'price', 'rating', 'availability', 'url']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(books)
        logger.info(f"Successfully exported {len(books)} books to {filename}")
    except IOError as e:
        logger.error(f"Failed to write CSV file: {e}")


if __name__ == '__main__':
    # Note: books.toscrape.com welcomes scraping for educational purposes.
    # Always check robots.txt: https://books.toscrape.com/robots.txt
    books_data = scrape_all_books()
    export_to_csv(books_data)
```

## How to Run

**Installation:**
```bash
pip install requests beautifulsoup4
```

**Execution:**
```bash
python scraper.py
```

The script will:
1. Iterate through all paginated pages starting from `https://books.toscrape.com/index.html`
2. Extract title, price, star rating (0–5), availability status, and product URL for each book
3. Respect rate-limiting with 1-second delays between requests and exponential backoff on failures
4. Export all ~200 books to `books.csv` in the current directory

Check the CSV file after completion; each row contains one book's data with proper Unicode handling.
Judge notes: The URL joining logic in `get_next_page_url` is fragile for nested category paths (e.g., `catalogue/page-2.html` relative to a category URL) and could break pagination mid-crawl in production use. The `availability` selector also misses out-of-stock books, since it matches `p.instock.availability` exclusively.
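
A minimal hardening of that pagination helper (an editorial sketch, not part of the judged output) joins the `next` href directly against the current page URL, which handles both the top-level index and nested category paths; the availability gap could likewise be narrowed by switching to the broader `p.availability` selector:

```python
# Editorial sketch of a more robust get_next_page_url for the Haiku output.
from urllib.parse import urljoin

from bs4 import BeautifulSoup


def get_next_page_url(html: str, current_url: str):
    """Return the absolute URL of the next page, or None when pagination ends."""
    soup = BeautifulSoup(html, 'html.parser')
    next_button = soup.select_one('li.next a')
    if next_button and next_button.get('href'):
        # urljoin resolves the relative href against the page it was found on,
        # so 'page-2.html' works for index.html and for category pages alike.
        return urljoin(current_url, next_button['href'])
    return None
```
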
GPT-5.4 (8/10)
For this site, **BeautifulSoup + requests** is the best fit because the pages are static HTML and pagination is simple. The scraper below walks every category, follows all paginated category pages, extracts the requested book fields, normalizes product URLs, rate-limits requests, retries on transient failures, and writes the results to **books.csv**.

```python
import csv
import time
from typing import List, Dict, Set
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

BASE_URL = "https://books.toscrape.com/"
CATALOGUE_URL = urljoin(BASE_URL, "catalogue/")
OUTPUT_FILE = "books.csv"
REQUEST_DELAY = 1.0  # Be polite; also check robots.txt before scraping.
TIMEOUT = 15
MAX_RETRIES = 3

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
    )
}


def fetch(session: requests.Session, url: str) -> str:
    """Fetch a URL with retries and basic backoff."""
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            response = session.get(url, headers=HEADERS, timeout=TIMEOUT)
            response.raise_for_status()
            time.sleep(REQUEST_DELAY)
            return response.text
        except requests.RequestException as exc:
            if attempt == MAX_RETRIES:
                raise RuntimeError(f"Failed to fetch {url}: {exc}") from exc
            time.sleep(REQUEST_DELAY * attempt)
    return ""


def get_category_urls(session: requests.Session) -> List[str]:
    """Extract all category landing page URLs from the homepage."""
    html = fetch(session, BASE_URL)
    soup = BeautifulSoup(html, "html.parser")
    urls = []
    # Sidebar category links under the "Books" section.
    for a in soup.select("div.side_categories ul li ul li a"):
        href = a.get("href", "").strip()
        if href:
            urls.append(urljoin(BASE_URL, href))
    return urls


def parse_rating(article) -> str:
    """Convert star-rating class into readable text."""
    classes = article.select_one("p.star-rating").get("class", [])
    for cls in classes:
        if cls != "star-rating":
            return cls
    return "Unknown"


def parse_books_from_page(soup: BeautifulSoup) -> List[Dict[str, str]]:
    """Parse all book cards from a category page."""
    books = []
    # Each product card is contained in an article.product_pod element.
    for article in soup.select("article.product_pod"):
        # Title is stored on the anchor title attribute inside h3.
        title_tag = article.select_one("h3 a")
        title = title_tag.get("title", "").strip() if title_tag else ""

        # Price text appears in p.price_color, e.g. "£51.77".
        price_tag = article.select_one("p.price_color")
        price = price_tag.get_text(strip=True) if price_tag else ""

        # Availability text is in p.instock.availability and may contain whitespace.
        avail_tag = article.select_one("p.instock.availability")
        availability = " ".join(avail_tag.get_text().split()) if avail_tag else ""

        # Product link is relative to the current page and may include "../../../".
        href = title_tag.get("href", "").strip() if title_tag else ""
        product_url = urljoin(CATALOGUE_URL, href)

        books.append(
            {
                "title": title,
                "price": price,
                "star_rating": parse_rating(article),
                "availability": availability,
                "product_url": product_url,
            }
        )
    return books


def scrape_category(session: requests.Session, category_url: str) -> List[Dict[str, str]]:
    """Follow pagination within a single category and collect all books."""
    results = []
    next_url = category_url
    while next_url:
        html = fetch(session, next_url)
        soup = BeautifulSoup(html, "html.parser")
        results.extend(parse_books_from_page(soup))

        # "next" pagination link appears inside li.next > a.
        next_link = soup.select_one("li.next a")
        next_url = urljoin(next_url, next_link["href"]) if next_link and next_link.get("href") else None
    return results


def save_csv(rows: List[Dict[str, str]], filename: str) -> None:
    """Write scraped rows to CSV."""
    fieldnames = ["title", "price", "star_rating", "availability", "product_url"]
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)


def main() -> None:
    all_books: List[Dict[str, str]] = []
    seen_urls: Set[str] = set()

    with requests.Session() as session:
        try:
            category_urls = get_category_urls(session)
            for category_url in category_urls:
                try:
                    books = scrape_category(session, category_url)
                    for book in books:
                        if book["product_url"] not in seen_urls:
                            seen_urls.add(book["product_url"])
                            all_books.append(book)
                except Exception as exc:
                    print(f"Skipping category due to error: {category_url} -> {exc}")
        except Exception as exc:
            print(f"Fatal error while scraping: {exc}")
            return

    save_csv(all_books, OUTPUT_FILE)
    print(f"Saved {len(all_books)} books to {OUTPUT_FILE}")


if __name__ == "__main__":
    main()
```

**How to run**

Install dependencies:
```bash
pip install requests beautifulsoup4
```

Save the script as `scrape_books.py` and run:
```bash
python scrape_books.py
```

This will create `books.csv` in the current directory.
Judge notes: The code is production-ready and well-structured, with deduplication, retries, and pagination. However, the delay logic sleeps *after* a successful fetch (adding unnecessary latency), and `fetch` ends with an unreachable `return ""` that could mask errors; both would need fixing before confident production deployment.
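
A light rework of that `fetch` helper (an editorial sketch, not part of the judged output) drops the unreachable trailing return and leaves the politeness delay to the caller, so a successful response is returned immediately; the constants mirror the ones defined in the model's script:

```python
# Editorial sketch of a corrected fetch() for the GPT-5.4 output.
import time

import requests

MAX_RETRIES = 3
REQUEST_DELAY = 1.0
TIMEOUT = 15
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}


def fetch(session: requests.Session, url: str) -> str:
    """Fetch a URL with retries; back off only between failed attempts."""
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            response = session.get(url, headers=HEADERS, timeout=TIMEOUT)
            response.raise_for_status()
            return response.text  # return at once; caller handles rate-limiting
        except requests.RequestException as exc:
            if attempt == MAX_RETRIES:
                raise RuntimeError(f"Failed to fetch {url}: {exc}") from exc
            time.sleep(REQUEST_DELAY * attempt)  # linear backoff before retrying
```

The per-request politeness pause then lives in the page loop (e.g. a `time.sleep(REQUEST_DELAY)` after each parsed page), so the final request does not pay an extra delay.
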
Grok 4.1 Fast Reasoning (8/10)
This scraper uses BeautifulSoup + requests because the target is a small demo site (~1000 books across 50 categories with simple "next" pagination), avoiding Scrapy's overhead for this scale. It first parses the categories index page to extract all leaf category URLs via CSS selector `a[href$="index.html"]` in the sidebar, then iterates through each category's pages, robustly parsing product fields with CSS selectors (e.g., `p.price_color` for price). Features include a realistic User-Agent, session reuse, 3x retries with exponential backoff, 1s delays, absolute URL resolution, and CSV export; robots.txt allows all (see comment).

```python
import requests
from bs4 import BeautifulSoup
import csv
import time
from urllib.parse import urljoin

# Respect robots.txt: http://books.toscrape.com/robots.txt allows all user-agents.
BASE_URL = 'https://books.toscrape.com/'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}

def get_soup(url, session):
    """Fetch page with retries and return BeautifulSoup."""
    for attempt in range(3):
        try:
            resp = session.get(url, headers=HEADERS)
            resp.raise_for_status()
            return BeautifulSoup(resp.text, 'html.parser')
        except requests.RequestException:
            time.sleep(2 ** attempt)
    print(f"Failed to fetch {url}")
    return None

def parse_books(soup):
    """Parse books from product list page using CSS selectors."""
    books = []
    for product in soup.select('article.product_pod'):
        title = product.h3.a.get('title', '')  # CSS: h3 a[title]
        prod_url = urljoin(BASE_URL, product.h3.a['href'])  # Relative href resolved
        price_elem = product.select_one('p.price_color')  # CSS: p.price_color
        price = price_elem.text.strip() if price_elem else ''
        rating_elem = product.select_one('p.star-rating')  # CSS: p.star-rating
        rating = rating_elem['class'][1] if rating_elem and len(rating_elem['class']) > 1 else 'None'
        avail_elem = product.select_one('p.availability')  # CSS: p.availability
        availability = avail_elem.text.strip() if avail_elem else ''
        books.append({
            'title': title, 'price': price, 'rating': rating,
            'availability': availability, 'product_url': prod_url
        })
    return books

def get_next_url(soup):
    """Get next page URL via CSS selector li.next a."""
    next_a = soup.select_one('li.next a')
    return urljoin(BASE_URL, next_a['href']) if next_a else None

def main():
    session = requests.Session()
    session.headers.update(HEADERS)
    all_books = []

    # Get all category URLs from sidebar: a[href$="index.html"]
    cats_url = BASE_URL + 'catalogue/category/books_1/index.html'
    cats_soup = get_soup(cats_url, session)
    if not cats_soup:
        return
    nav_ul = cats_soup.select_one('aside ul.nav-list')
    cat_urls = [urljoin(BASE_URL, a['href']) for a in nav_ul.select('a[href$="index.html"]') if a['href']]

    for cat_url in cat_urls:
        page_url = cat_url
        while page_url:
            soup = get_soup(page_url, session)
            if not soup:
                break
            books = parse_books(soup)
            all_books.extend(books)
            page_url = get_next_url(soup)
            time.sleep(1)  # Rate limiting

    # Export to CSV
    if all_books:
        with open('books.csv', 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['title', 'price', 'rating', 'availability', 'product_url'])
            writer.writeheader()
            writer.writerows(all_books)
        print(f"Scraped {len(all_books)} books to books.csv")
    else:
        print("No books scraped.")

if __name__ == '__main__':
    main()
```

**How to run:** `pip install requests beautifulsoup4` then `python scraper.py`. Outputs `books.csv` (~1000 rows). Adjust `time.sleep` for politeness.
Judge notes: The category URL logic is fragile: scraping only the `books_1` category index rather than the true top-level index could miss categories if the site structure changes. In addition, `get_next_url` joins relative links against `BASE_URL` instead of the current page URL, which would break relative "next" links mid-category pagination.
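
A minimal fix for that pagination helper (an editorial sketch, not part of the judged output) threads the current page URL through and resolves the link against it:

```python
# Editorial sketch of a corrected get_next_url for the Grok output.
from urllib.parse import urljoin


def get_next_url(soup, current_url):
    """Resolve the 'next' link against the page it appeared on, not the site root."""
    next_a = soup.select_one('li.next a')
    # 'page-2.html' must resolve inside the current category directory;
    # joining against BASE_URL would point at the site root and 404.
    return urljoin(current_url, next_a['href']) if next_a else None
```

The caller would then pass the page it just fetched, e.g. `page_url = get_next_url(soup, page_url)`.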
