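"""Scraper for the ITMO master's program pages (AI and AI Product).

Collects each program's title, short description, and curriculum PDF links,
and saves the results to data/processed/programs.json.
"""
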
import hashlib
import json
import os
from typing import Dict, List

import requests
from bs4 import BeautifulSoup


class HTMLScraper:
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        self.program_urls = {
            'ai': 'https://abit.itmo.ru/program/master/ai',
            'ai_product': 'https://abit.itmo.ru/program/master/ai_product'
        }

    def scrape_programs(self) -> Dict:
        """Scrape every configured program page; pages that fail are skipped."""
        programs = {}
        for program_id, url in self.program_urls.items():
            try:
                print(f'Scraping program {program_id}...')
                program_data = self._scrape_program_page(url, program_id)
                programs[program_id] = program_data
            except Exception as e:
                print(f'Error while scraping {program_id}: {e}')
        return programs

    def _scrape_program_page(self, url: str, program_id: str) -> Dict:
        response = self.session.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        title = self._extract_title(soup)
        description = self._extract_description(soup)
        pdf_links = self._extract_pdf_links(soup, url)
        program_data = {
            'id': program_id,
            'title': title,
            'description': description,
            'url': url,
            'pdf_links': pdf_links,
            'hash': self._calculate_hash(response.content)
        }
        return program_data

    def _extract_title(self, soup: BeautifulSoup) -> str:
        title_elem = soup.find('h1') or soup.find('title')
        if title_elem:
            return title_elem.get_text().strip()
        return ''

    def _extract_description(self, soup: BeautifulSoup) -> str:
        # Try progressively more generic selectors; accept the first match of
        # meaningful length and truncate it to keep the stored record short.
        desc_selectors = [
            '.program-description',
            '.description',
            '.program-info',
            'p',
            '.content'
        ]
        for selector in desc_selectors:
            elem = soup.select_one(selector)
            if elem:
                text = elem.get_text().strip()
                if len(text) > 50:
                    return text[:500]
        return ''

    def _extract_pdf_links(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
        pdf_links = []
        for link in soup.find_all('a', href=True):
            href = link.get('href', '')
            text = link.get_text().strip().lower()
            if self._is_pdf_link(href, text):
                full_url = self._make_absolute_url(href, base_url)
                pdf_links.append({
                    'url': full_url,
                    'text': text,
                    'filename': self._extract_filename(href)
                })
        return pdf_links

    def _is_pdf_link(self, href: str, text: str) -> bool:
        # Keywords stay in Russian because the ITMO pages are in Russian; a link
        # counts as a PDF if its href or visible text mentions any of them.
        pdf_indicators = [
            'учебный план', 'curriculum', 'plan',
            'pdf', '.pdf', 'программа', 'program'
        ]
        href_lower = href.lower()
        return any(indicator in href_lower or indicator in text for indicator in pdf_indicators)

    def _make_absolute_url(self, href: str, base_url: str) -> str:
        if href.startswith('http'):
            return href
        elif href.startswith('/'):
            # Root-relative link: prepend the scheme and host of the base URL.
            base = '/'.join(base_url.split('/')[:3])
            return base + href
        else:
            return base_url.rstrip('/') + '/' + href.lstrip('/')

    def _extract_filename(self, href: str) -> str:
        filename = href.split('/')[-1]
        if not filename.endswith('.pdf'):
            filename += '.pdf'
        return filename

    def _calculate_hash(self, content: bytes) -> str:
        return hashlib.sha256(content).hexdigest()

    def save_programs(self, programs: Dict, output_path: str = 'data/processed/programs.json'):
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(programs, f, ensure_ascii=False, indent=2)
        print(f'Programs saved to {output_path}')


def main():
    scraper = HTMLScraper()
    programs = scraper.scrape_programs()
    scraper.save_programs(programs)
    for program_id, program in programs.items():
        print(f'\n{program["title"]}:')
        print(f'PDF links found: {len(program["pdf_links"])}')
        for link in program['pdf_links']:
            print(f'  - {link["filename"]}: {link["url"]}')


if __name__ == '__main__':
    main()