Create scraper.py
scraper.py (ADDED, +56 -0)
# scraper.py
import requests
from readability import Document
from bs4 import BeautifulSoup
from newspaper import Article
from utils import canonicalize_url
from urllib.parse import urlparse
import logging

logger = logging.getLogger(__name__)

HEADERS = {"User-Agent": "nlp-web-scraper/1.0 (+https://huggingface.co/)"}

def fetch(url, timeout=10, allow_redirects=True):
    try:
        r = requests.get(url, timeout=timeout, headers=HEADERS, allow_redirects=allow_redirects)
        r.raise_for_status()
        # Return the final URL as well, since redirects may have been followed.
        return r.text, r.url
    except Exception as e:
        logger.warning("fetch failed %s: %s", url, e)
        return None, url

def extract(html, url):
    # Primary extraction: readability isolates the main article body.
    doc = Document(html)
    title = doc.short_title() or ""
    summary_html = doc.summary()
    soup = BeautifulSoup(summary_html, "html.parser")
    main_text = soup.get_text(separator="\n").strip()
    # Fallback to newspaper for metadata such as authors and publish_date.
    article = Article(url)
    try:
        # Reuse the HTML we already fetched instead of downloading again.
        article.download(input_html=html)
        article.parse()
    except Exception:
        pass
    authors = article.authors or []
    publish_date = article.publish_date
    # Split into "sentences" (very simple: one entry per non-empty line).
    sentences = [s.strip() for s in main_text.splitlines() if s.strip()]
    # Also gather the first few images found on the full page.
    page_soup = BeautifulSoup(html, "html.parser")
    images = []
    for img in page_soup.find_all("img")[:6]:
        src = img.get("src") or img.get("data-src")
        if src:
            images.append(canonicalize_url(src, base=url))
    return {
        "url": url,
        "title": title,
        "authors": authors,
        "publish_date": publish_date,
        "text": main_text,
        "sentences": sentences,
        "images": images
    }
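
The module imports canonicalize_url from a local utils module that is not part of this commit. A minimal sketch of what such a helper might look like, assuming it only needs to resolve relative image src values against the page URL and normalize the result; the helper itself is hypothetical, only its base keyword matches the call site above:

    # utils.py (hypothetical sketch; the real module is not in this commit)
    from urllib.parse import urljoin, urlparse

    def canonicalize_url(url, base=None):
        # Resolve relative URLs (e.g. "/img/logo.png") against the page URL.
        if base:
            url = urljoin(base, url)
        # Drop the fragment so the same resource is not collected twice.
        parsed = urlparse(url)
        return parsed._replace(fragment="").geturl()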
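A quick way to exercise the module, assuming the third-party dependencies (requests, readability-lxml, beautifulsoup4, newspaper3k) are installed and a utils.canonicalize_url is importable; the URL is a placeholder:

    # Example: fetch a page, then extract and inspect the structured fields.
    from scraper import fetch, extract

    html, final_url = fetch("https://example.com/some-article")  # placeholder URL
    if html is not None:
        data = extract(html, final_url)
        print(data["title"])
        print(len(data["sentences"]), "sentences,", len(data["images"]), "images")

Note that fetch signals failure by returning None for the body rather than raising, so callers should check for None before passing the result to extract.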