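"""Scraper for ITMO master's program pages (abit.itmo.ru).

Extracts each program's title, description, and curriculum PDF links,
and saves the collected data as JSON.
"""
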
import requests
from bs4 import BeautifulSoup
from typing import List, Dict
from urllib.parse import urljoin
import hashlib
import json
import os

class HTMLScraper:
    """Scrapes ITMO program pages and collects links to curriculum PDFs."""

    def __init__(self):
        self.session = requests.Session()
        # A desktop User-Agent header reduces the chance of trivial bot blocking.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

        # Master's program pages on the ITMO admissions site.
        self.program_urls = {
            'ai': 'https://abit.itmo.ru/program/master/ai',
            'ai_product': 'https://abit.itmo.ru/program/master/ai_product'
        }

    def scrape_programs(self) -> Dict:
        """Scrape every configured program page; a failure skips that program."""
        programs = {}

        for program_id, url in self.program_urls.items():
            try:
                print(f'Scraping program {program_id}...')
                program_data = self._scrape_program_page(url, program_id)
                programs[program_id] = program_data
            except Exception as e:
                print(f'Error while scraping {program_id}: {e}')

        return programs
    
    def _scrape_program_page(self, url: str, program_id: str) -> Dict:
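        """Fetch one program page and extract its title, description, and PDF links."""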
        response = self.session.get(url, timeout=30)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.content, 'html.parser')
        
        title = self._extract_title(soup)
        description = self._extract_description(soup)
        pdf_links = self._extract_pdf_links(soup, url)
        
        program_data = {
            'id': program_id,
            'title': title,
            'description': description,
            'url': url,
            'pdf_links': pdf_links,
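            # Content hash can be used to detect page changes between runs.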
            'hash': self._calculate_hash(response.content)
        }
        
        return program_data
    
    def _extract_title(self, soup: BeautifulSoup) -> str:
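        """Prefer the page's <h1> heading; fall back to the <title> tag."""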
        title_elem = soup.find('h1') or soup.find('title')
        if title_elem:
            return title_elem.get_text().strip()
        return ''
    
    def _extract_description(self, soup: BeautifulSoup) -> str:
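        """Try candidate CSS selectors and return the first substantial text match."""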
        desc_selectors = [
            '.program-description',
            '.description',
            '.program-info',
            'p',
            '.content'
        ]
        
        for selector in desc_selectors:
            elem = soup.select_one(selector)
            if elem:
                text = elem.get_text().strip()
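                # Require enough text to be a real description; cap at 500 chars.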
                if len(text) > 50:
                    return text[:500]
        
        return ''
    
    def _extract_pdf_links(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
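        """Collect links whose href or text suggests a curriculum/program PDF."""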
        pdf_links = []
        
        for link in soup.find_all('a', href=True):
            href = link.get('href', '')
            text = link.get_text().strip().lower()
            
            if self._is_pdf_link(href, text):
                full_url = self._make_absolute_url(href, base_url)
                pdf_links.append({
                    'url': full_url,
                    'text': text,
                    'filename': self._extract_filename(href)
                })
        
        return pdf_links
    
    def _is_pdf_link(self, href: str, text: str) -> bool:
        # Russian keywords are kept as-is: they must match the Russian-language
        # page content ('учебный план' means 'curriculum').
        pdf_indicators = [
            'учебный план', 'curriculum', 'plan',
            'pdf', '.pdf', 'программа', 'program'
        ]

        href_lower = href.lower()
        return any(indicator in href_lower or indicator in text
                   for indicator in pdf_indicators)
    
    def _make_absolute_url(self, href: str, base_url: str) -> str:
        # urljoin handles absolute URLs, root-relative paths ('/x'), and
        # relative paths correctly, unlike manual string concatenation.
        return urljoin(base_url, href)
    
    def _extract_filename(self, href: str) -> str:
        # Last path segment, with any query string or fragment stripped.
        filename = href.split('/')[-1].split('?')[0].split('#')[0]
        if not filename.lower().endswith('.pdf'):
            filename += '.pdf'
        return filename
    
    def _calculate_hash(self, content: bytes) -> str:
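        """Return the SHA-256 hex digest of the raw page content."""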
        return hashlib.sha256(content).hexdigest()
    
    def save_programs(self, programs: Dict, output_path: str = 'data/processed/programs.json'):
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        # ensure_ascii=False keeps Russian text human-readable in the JSON file.
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(programs, f, ensure_ascii=False, indent=2)

        print(f'Programs saved to {output_path}')

def main():
    scraper = HTMLScraper()
    programs = scraper.scrape_programs()
    scraper.save_programs(programs)

    # Print a short summary of what was found for each program.
    for program_id, program in programs.items():
        print(f'\n{program["title"]}:')
        print(f'PDF links found: {len(program["pdf_links"])}')
        for link in program['pdf_links']:
            print(f'  - {link["filename"]}: {link["url"]}')

if __name__ == '__main__':
    main()