Skip to content

Web scraper #157

Description

@piyushrajput23223-hub

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

class WebScraper:
    """Scrape paginated quote listings and export the records to CSV.

    Attributes:
        base_url: Site root that listing-page URLs are built from.
        delay: Seconds to sleep after each successfully scraped page.
        session: ``requests.Session`` reused for every request; it carries
            a browser-like ``User-Agent`` header.
        data: List of dicts (keys ``text``, ``author``, ``tags``), one per
            scraped quote.
    """

    def __init__(self, base_url, delay=1):
        """Store the configuration and open a reusable HTTP session.

        Args:
            base_url: Site root, e.g. ``'https://quotes.toscrape.com'``.
            delay: Seconds to wait between page requests.
        """
        self.base_url = base_url
        self.delay = delay
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0'
        })
        self.data = []

    def fetch_page(self, url):
        """Fetch *url* and return it parsed, or ``None`` on a request error.

        Any ``requests.RequestException`` (including the HTTP error raised
        by ``raise_for_status``) is printed and swallowed so that the caller
        can simply stop paginating.
        """
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            return BeautifulSoup(response.text, 'html.parser')
        except requests.RequestException as e:
            print(f"Error fetching {url}: {e}")
            return None

    def _page_url(self, page):
        """Return the URL of listing page number *page*."""
        # Strip a trailing slash so 'https://host/' does not yield '//page/'.
        return f"{self.base_url.rstrip('/')}/page/{page}/"

    @staticmethod
    def _parse_quote(quote):
        """Turn one quote element into a record dict.

        Returns ``None`` when the text or author element is missing, so a
        single malformed entry cannot abort the whole scrape.
        """
        text = quote.find('span', class_='text')
        author = quote.find('small', class_='author')
        if text is None or author is None:
            return None
        return {
            'text': text.text,
            'author': author.text,
            'tags': ', '.join(t.text for t in quote.find_all('a', class_='tag')),
        }

    def scrape_quotes(self, max_pages=None):
        """Example: Scrape quotes.toscrape.com

        Walks ``/page/1/``, ``/page/2/``, ... and appends one record per
        quote to ``self.data``. Stops when a page cannot be fetched, when a
        page contains no quotes, or after *max_pages* pages.

        Args:
            max_pages: Optional upper bound on the number of pages to
                request; ``None`` (the default) means no limit.
        """
        page = 1
        while max_pages is None or page <= max_pages:
            url = self._page_url(page)
            print(f"Scraping page {page}...")
            soup = self.fetch_page(url)
            if soup is None:
                break

            quotes = soup.find_all('div', class_='quote')
            if not quotes:
                break

            for q in quotes:
                record = self._parse_quote(q)
                if record is not None:
                    self.data.append(record)

            time.sleep(self.delay)  # Be polite to servers!
            page += 1

    def save(self, filename='output.csv'):
        """Write the collected records to *filename* as UTF-8 CSV.

        Returns:
            The ``pandas.DataFrame`` built from ``self.data``.
        """
        df = pd.DataFrame(self.data)
        df.to_csv(filename, index=False, encoding='utf-8')
        print(f"✅ Saved {len(df)} records → {filename}")
        return df

# ─── RUN ────────────────────────────────────────────

# Run the example scrape only when executed as a script, not on import.
if __name__ == '__main__':
    scraper = WebScraper('https://quotes.toscrape.com')
    scraper.scrape_quotes()
    df = scraper.save('quotes.csv')
    print(df.head())

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions