kuldeep0204 commited on
Commit
763aaf3
·
verified ·
1 Parent(s): 61e520f

Create scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +56 -0
scraper.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # scraper.py
2
+ import requests
3
+ from readability import Document
4
+ from bs4 import BeautifulSoup
5
+ from newspaper import Article
6
+ from utils import canonicalize_url
7
+ from urllib.parse import urlparse
8
+ import logging
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+ HEADERS = {"User-Agent": "nlp-web-scraper/1.0 (+https://huggingface.co/)"}
13
+
14
def fetch(url, timeout=10, allow_redirects=True):
    """Download *url* and return ``(html_text, final_url)``.

    Parameters:
        url: page to fetch.
        timeout: per-request timeout in seconds.
        allow_redirects: whether to follow HTTP redirects.

    Returns:
        ``(text, final_url)`` on success — ``final_url`` is the post-redirect
        location reported by requests. On any network/HTTP failure returns
        ``(None, url)`` instead of raising, so callers can treat a failed
        fetch as a missing page.
    """
    try:
        r = requests.get(url, timeout=timeout, headers=HEADERS, allow_redirects=allow_redirects)
        # Treat 4xx/5xx responses as fetch failures rather than content.
        r.raise_for_status()
        return r.text, r.url
    except requests.RequestException as e:
        # Narrowed from bare `Exception`: RequestException is the base of all
        # requests network/HTTP errors (timeouts, DNS, HTTPError from
        # raise_for_status). Programming errors now surface instead of being
        # silently logged as fetch failures.
        logger.warning("fetch failed %s: %s", url, e)
        return None, url
22
+
23
def extract(html, url):
    """Extract article content and metadata from raw *html*.

    Primary extraction is readability (title + main text); newspaper is used
    best-effort on top of it only for metadata (authors, publish date).

    Parameters:
        html: full page HTML as a string.
        url: the page URL (used for metadata parsing and to resolve
            relative image sources).

    Returns:
        dict with keys ``url``, ``title``, ``authors``, ``publish_date``,
        ``text``, ``sentences`` (non-empty lines of ``text``), and
        ``images`` (up to 6 canonicalized image URLs).
    """
    # Primary: readability gives the title and the cleaned main content.
    doc = Document(html)
    title = doc.short_title() or ""
    summary_html = doc.summary()
    soup = BeautifulSoup(summary_html, "html.parser")
    main_text = soup.get_text(separator="\n").strip()

    # Fallback to newspaper only for metadata (authors, publish_date).
    # Best-effort by design: a newspaper failure must not abort extraction,
    # but log it instead of silently passing so metadata gaps are diagnosable.
    article = Article(url)
    try:
        article.download(input_html=html)
        article.parse()
    except Exception:
        logger.debug("newspaper metadata extraction failed for %s", url, exc_info=True)
    authors = article.authors or []
    publish_date = article.publish_date

    # Split into "sentences" — actually non-empty lines; readability's
    # separator="\n" above makes each block element its own line.
    sentences = [s.strip() for s in main_text.splitlines() if s.strip()]

    # Gather up to the first 6 images from the full page (not just the
    # readability summary), resolving relative src against the page URL.
    # data-src covers lazy-loaded images.
    page_soup = BeautifulSoup(html, "html.parser")
    images = []
    for img in page_soup.find_all("img")[:6]:
        src = img.get("src") or img.get("data-src")
        if src:
            images.append(canonicalize_url(src, base=url))

    return {
        "url": url,
        "title": title,
        "authors": authors,
        "publish_date": publish_date,
        "text": main_text,
        "sentences": sentences,
        "images": images
    }