Spaces:
Sleeping
Sleeping
| from typing import Optional | |
| import requests | |
| from bs4 import BeautifulSoup, ResultSet | |
class GenericScraper:
    """Fetch a web page and return the text of its most paragraph-rich section.

    Candidate sections are every ``<div>``, ``<section>`` and ``<article>``
    element in the page. Each candidate is scored by the total length of the
    text in its direct-child ``<p>`` elements, and the text of the
    highest-scoring candidate is returned.
    """

    # Seconds to wait on the server. Requests applies no timeout by default,
    # so without one a stalled connection would block the caller forever.
    DEFAULT_TIMEOUT: float = 10.0

    def __init__(self, timeout: Optional[float] = DEFAULT_TIMEOUT) -> None:
        """Create a scraper.

        Args:
            timeout: Value passed as ``timeout`` to ``requests.get`` on every
                call to :meth:`scrape`. ``None`` disables the timeout, which
                matches the behavior before this argument existed.
        """
        self.timeout = timeout

    def scrape(self, url: str) -> str:
        """Download ``url`` and return the text of its best content section.

        Args:
            url: Address of the page to fetch.

        Returns:
            ``get_text()`` of the ``div``/``section``/``article`` element
            whose direct-child ``<p>`` elements hold the most text.

        Raises:
            Exception: If the response status code is not 200, or if no
                candidate element has any non-empty direct-child ``<p>`` text
                (this includes pages with no candidate elements at all).
            requests.exceptions.RequestException: Network errors and timeouts
                raised by ``requests.get`` are not caught here.
        """
        response = requests.get(url, timeout=self.timeout)
        if response.status_code != 200:
            raise Exception(
                f'Failed to fetch url: {url} with status code {response.status_code}'
            )

        soup = BeautifulSoup(response.content, 'html.parser')

        best_section = None
        best_len = 0
        for section in soup.find_all(['div', 'section', 'article']):
            # recursive=False restricts the score to direct-child paragraphs;
            # presumably so an outer wrapper is not credited with text that
            # belongs to a nested section.
            paragraphs = section.find_all('p', recursive=False)
            text_len = len('\n'.join(p.get_text() for p in paragraphs))
            # Strictly greater: on a tie the earliest section in document
            # order wins, and sections with no paragraph text never qualify.
            if text_len > best_len:
                best_len = text_len
                best_section = section

        if best_section is None:
            raise Exception('No sections found')
        return best_section.get_text()