# scraper.py
import logging

import requests
from bs4 import BeautifulSoup
from newspaper import Article
from readability import Document

from utils import canonicalize_url  # local helper: resolves a (possibly relative) URL against a base

logger = logging.getLogger(__name__)

HEADERS = {"User-Agent": "nlp-web-scraper/1.0 (+https://huggingface.co/)"}

def fetch(url, timeout=10, allow_redirects=True):
    """Fetch a URL; return (html, final_url), or (None, url) on failure."""
    try:
        r = requests.get(url, timeout=timeout, headers=HEADERS, allow_redirects=allow_redirects)
        r.raise_for_status()
        return r.text, r.url
    except Exception as e:
        logger.warning("fetch failed %s: %s", url, e)
        return None, url

def extract(html, url):
    # Primary extraction: readability isolates the main article body and title.
    doc = Document(html)
    title = doc.short_title() or ""
    summary_html = doc.summary()
    soup = BeautifulSoup(summary_html, "html.parser")
    main_text = soup.get_text(separator="\n").strip()

    # Fall back to newspaper for metadata such as authors and publish_date.
    article = Article(url)
    try:
        article.download(input_html=html)  # reuse the already-fetched HTML instead of re-downloading
        article.parse()
    except Exception:
        pass
    authors = article.authors or []
    publish_date = article.publish_date
    # Naive sentence split: treat each non-empty line of the extracted text as one unit.
    sentences = [s.strip() for s in main_text.splitlines() if s.strip()]

    # Gather up to six images from the full page, resolving relative src values.
    page_soup = BeautifulSoup(html, "html.parser")
    images = []
    for img in page_soup.find_all("img")[:6]:
        src = img.get("src") or img.get("data-src")
        if src:
            images.append(canonicalize_url(src, base=url))
    return {
        "url": url,
        "title": title,
        "authors": authors,
        "publish_date": publish_date,
        "text": main_text,
        "sentences": sentences,
        "images": images,
    }
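
# Minimal usage sketch, not part of the original module's public interface.
# Assumptions: utils.canonicalize_url is importable alongside this file, and
# the URL below is a placeholder, not one used by the original author.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    html, final_url = fetch("https://example.com/some-article")
    if html:
        result = extract(html, final_url)
        print(result["title"])
        print(result["publish_date"])
        print(f'{len(result["sentences"])} sentences, {len(result["images"])} images')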