# scraper/html_scraper.py
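"""Scraper for ITMO master's program pages on abit.itmo.ru.

Downloads the 'ai' and 'ai_product' program pages, extracts the title,
description and links to curriculum PDFs, and stores the result as JSON.
"""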
import hashlib
import json
import os
from typing import Dict, List
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
class HTMLScraper:
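    """Scrapes ITMO program pages and collects metadata and curriculum PDF links."""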
def __init__(self):
self.session = requests.Session()
self.session.headers.update({
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
})
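        # Program pages on the ITMO admissions site (abit.itmo.ru)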
self.program_urls = {
'ai': 'https://abit.itmo.ru/program/master/ai',
'ai_product': 'https://abit.itmo.ru/program/master/ai_product'
}
def scrape_programs(self) -> Dict:
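        """Scrape all configured program pages; returns a dict keyed by program id."""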
programs = {}
for program_id, url in self.program_urls.items():
try:
                print(f'Scraping program {program_id}...')
program_data = self._scrape_program_page(url, program_id)
programs[program_id] = program_data
except Exception as e:
                print(f'Error while scraping {program_id}: {e}')
return programs
def _scrape_program_page(self, url: str, program_id: str) -> Dict:
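        """Fetch one program page and extract its title, description, PDF links and a content hash."""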
response = self.session.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
title = self._extract_title(soup)
description = self._extract_description(soup)
pdf_links = self._extract_pdf_links(soup, url)
program_data = {
'id': program_id,
'title': title,
'description': description,
'url': url,
'pdf_links': pdf_links,
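            # SHA-256 of the raw HTML (presumably for detecting page changes between runs)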
'hash': self._calculate_hash(response.content)
}
return program_data
def _extract_title(self, soup: BeautifulSoup) -> str:
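        # Prefer the page's <h1>; fall back to the <title> tag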
title_elem = soup.find('h1') or soup.find('title')
if title_elem:
return title_elem.get_text().strip()
return ''
def _extract_description(self, soup: BeautifulSoup) -> str:
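        # Try selectors from specific to generic; return the first sufficiently long text, truncated to 500 chars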
desc_selectors = [
'.program-description',
'.description',
'.program-info',
'p',
'.content'
]
for selector in desc_selectors:
elem = soup.select_one(selector)
if elem:
text = elem.get_text().strip()
if len(text) > 50:
return text[:500]
return ''
def _extract_pdf_links(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
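        """Collect links on the page that look like curriculum/PDF documents."""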
pdf_links = []
for link in soup.find_all('a', href=True):
href = link.get('href', '')
text = link.get_text().strip().lower()
if self._is_pdf_link(href, text):
full_url = self._make_absolute_url(href, base_url)
pdf_links.append({
'url': full_url,
'text': text,
'filename': self._extract_filename(href)
})
return pdf_links
def _is_pdf_link(self, href: str, text: str) -> bool:
        # Keyword heuristics; the Russian terms match the wording on abit.itmo.ru
        # ('учебный план' = 'curriculum', 'программа' = 'program')
        pdf_indicators = [
            'учебный план', 'curriculum', 'plan',
            'pdf', '.pdf', 'программа', 'program'
        ]
href_lower = href.lower()
return any(indicator in href_lower or indicator in text for indicator in pdf_indicators)
    def _make_absolute_url(self, href: str, base_url: str) -> str:
        # urljoin resolves absolute, root-relative and page-relative hrefs uniformly
        return urljoin(base_url, href)
    def _extract_filename(self, href: str) -> str:
        # Last path segment without any query string; ensure a .pdf extension
        filename = href.split('/')[-1].split('?')[0]
        if not filename.lower().endswith('.pdf'):
            filename += '.pdf'
        return filename
def _calculate_hash(self, content: bytes) -> str:
return hashlib.sha256(content).hexdigest()
def save_programs(self, programs: Dict, output_path: str = 'data/processed/programs.json'):
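        """Write the scraped program data to a JSON file, creating directories as needed."""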
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(programs, f, ensure_ascii=False, indent=2)
        print(f'Programs saved to {output_path}')
def main():
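    # Scrape all configured programs, save them to JSON and print a short summary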
scraper = HTMLScraper()
programs = scraper.scrape_programs()
scraper.save_programs(programs)
for program_id, program in programs.items():
print(f'\n{program["title"]}:')
print(f'PDF ссылок найдено: {len(program["pdf_links"])}')
for link in program['pdf_links']:
print(f' - {link["filename"]}: {link["url"]}')
if __name__ == '__main__':
main()