import requests
from bs4 import BeautifulSoup
from typing import List, Dict
from urllib.parse import urljoin
import hashlib
import json
import os


class HTMLScraper:
    """Scrapes ITMO master's program pages and collects links to curriculum PDFs."""

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        # Pages to scrape: program id -> program page URL.
        self.program_urls = {
            'ai': 'https://abit.itmo.ru/program/master/ai',
            'ai_product': 'https://abit.itmo.ru/program/master/ai_product'
        }

    def scrape_programs(self) -> Dict:
        """Scrape every configured program page, skipping pages that fail."""
        programs = {}
        for program_id, url in self.program_urls.items():
            try:
                print(f'Scraping program {program_id}...')
                program_data = self._scrape_program_page(url, program_id)
                programs[program_id] = program_data
            except Exception as e:
                print(f'Error while scraping {program_id}: {e}')
        return programs

    def _scrape_program_page(self, url: str, program_id: str) -> Dict:
        """Download a single program page and extract its metadata."""
        response = self.session.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        title = self._extract_title(soup)
        description = self._extract_description(soup)
        pdf_links = self._extract_pdf_links(soup, url)
        program_data = {
            'id': program_id,
            'title': title,
            'description': description,
            'url': url,
            'pdf_links': pdf_links,
            # Content hash allows detecting page changes between runs.
            'hash': self._calculate_hash(response.content)
        }
        return program_data

    def _extract_title(self, soup: BeautifulSoup) -> str:
        title_elem = soup.find('h1') or soup.find('title')
        if title_elem:
            return title_elem.get_text().strip()
        return ''

    def _extract_description(self, soup: BeautifulSoup) -> str:
        # Try progressively more generic selectors; return the first block of
        # text that looks substantial, truncated to 500 characters.
        desc_selectors = [
            '.program-description',
            '.description',
            '.program-info',
            'p',
            '.content'
        ]
        for selector in desc_selectors:
            elem = soup.select_one(selector)
            if elem:
                text = elem.get_text().strip()
                if len(text) > 50:
                    return text[:500]
        return ''

    def _extract_pdf_links(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
        pdf_links = []
        for link in soup.find_all('a', href=True):
            href = link.get('href', '')
            text = link.get_text().strip().lower()
            if self._is_pdf_link(href, text):
                full_url = self._make_absolute_url(href, base_url)
                pdf_links.append({
                    'url': full_url,
                    'text': text,
                    'filename': self._extract_filename(href)
                })
        return pdf_links

    def _is_pdf_link(self, href: str, text: str) -> bool:
        # Heuristic match; the Russian keywords ("учебный план" = curriculum,
        # "программа" = program) target the wording used on the pages.
        pdf_indicators = [
            'учебный план', 'curriculum', 'plan',
            'pdf', '.pdf', 'программа', 'program'
        ]
        href_lower = href.lower()
        return any(indicator in href_lower or indicator in text
                   for indicator in pdf_indicators)

    def _make_absolute_url(self, href: str, base_url: str) -> str:
        # urljoin handles absolute, root-relative and page-relative hrefs.
        return urljoin(base_url, href)

    def _extract_filename(self, href: str) -> str:
        # Take the last path segment, drop any query string, and make sure the
        # name carries a .pdf extension.
        filename = href.split('/')[-1].split('?')[0]
        if not filename.endswith('.pdf'):
            filename += '.pdf'
        return filename

    def _calculate_hash(self, content: bytes) -> str:
        return hashlib.sha256(content).hexdigest()

    def save_programs(self, programs: Dict, output_path: str = 'data/processed/programs.json'):
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(programs, f, ensure_ascii=False, indent=2)
        print(f'Programs saved to {output_path}')
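

# Illustrative only (not part of the scraper above): a minimal sketch of how
# the collected 'pdf_links' entries could be downloaded. The default out_dir
# and the flat per-file naming are assumptions made for this example.
def download_pdfs(programs: Dict, out_dir: str = 'data/raw/pdfs') -> None:
    os.makedirs(out_dir, exist_ok=True)
    for program in programs.values():
        for link in program.get('pdf_links', []):
            try:
                response = requests.get(link['url'], timeout=30)
                response.raise_for_status()
                with open(os.path.join(out_dir, link['filename']), 'wb') as f:
                    f.write(response.content)
            except Exception as e:
                print(f'Failed to download {link["url"]}: {e}')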


def main():
    scraper = HTMLScraper()
    programs = scraper.scrape_programs()
    scraper.save_programs(programs)
    for program_id, program in programs.items():
        print(f'\n{program["title"]}:')
        print(f'PDF links found: {len(program["pdf_links"])}')
        for link in program['pdf_links']:
            print(f' - {link["filename"]}: {link["url"]}')
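

# Illustrative only: a sketch of how the stored 'hash' field could be used to
# detect page changes between runs. The default path mirrors save_programs();
# this helper is an assumption, not something the scraper above relies on.
def has_page_changed(program_id: str, new_hash: str,
                     path: str = 'data/processed/programs.json') -> bool:
    if not os.path.exists(path):
        return True
    with open(path, 'r', encoding='utf-8') as f:
        stored = json.load(f)
    return stored.get(program_id, {}).get('hash') != new_hash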


if __name__ == '__main__':
    main()