# scraper.py
import logging

import requests
from bs4 import BeautifulSoup
from newspaper import Article
from readability import Document

from utils import canonicalize_url  # local helper: resolves a (possibly relative) URL against a base

logger = logging.getLogger(__name__)

HEADERS = {"User-Agent": "nlp-web-scraper/1.0 (+https://huggingface.co/)"}

def fetch(url, timeout=10, allow_redirects=True):
    """Fetch a URL; return (html, final_url), or (None, url) on failure."""
    try:
        r = requests.get(url, timeout=timeout, headers=HEADERS, allow_redirects=allow_redirects)
        r.raise_for_status()
        return r.text, r.url
    except Exception as e:
        logger.warning("fetch failed %s: %s", url, e)
        return None, url

def extract(html, url):
    # Primary extraction: readability isolates the main article body and title.
    doc = Document(html)
    title = doc.short_title() or ""
    summary_html = doc.summary()
    soup = BeautifulSoup(summary_html, "html.parser")
    main_text = soup.get_text(separator="\n").strip()

    # Fall back to newspaper for metadata such as authors and publish_date.
    article = Article(url)
    try:
        article.download(input_html=html)  # reuse the already-fetched HTML instead of re-downloading
        article.parse()
    except Exception:
        pass
    authors = article.authors or []
    publish_date = article.publish_date
    # Naive sentence split: treat each non-empty line of the extracted text as one unit.
    sentences = [s.strip() for s in main_text.splitlines() if s.strip()]

    # Gather up to six images from the full page, resolving relative src values.
    page_soup = BeautifulSoup(html, "html.parser")
    images = []
    for img in page_soup.find_all("img")[:6]:
        src = img.get("src") or img.get("data-src")
        if src:
            images.append(canonicalize_url(src, base=url))
    return {
        "url": url,
        "title": title,
        "authors": authors,
        "publish_date": publish_date,
        "text": main_text,
        "sentences": sentences,
        "images": images,
    }
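
# Minimal usage sketch, not part of the original module's public interface.
# Assumptions: utils.canonicalize_url is importable alongside this file, and
# the URL below is a placeholder, not one used by the original author.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    html, final_url = fetch("https://example.com/some-article")
    if html:
        result = extract(html, final_url)
        print(result["title"])
        print(result["publish_date"])
        print(f'{len(result["sentences"])} sentences, {len(result["images"])} images')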