Upload 18 files

- README.md +99 -13
- __pycache__/chatbot.cpython-310.pyc +0 -0
- __pycache__/knowledge_base.cpython-310.pyc +0 -0
- __pycache__/retriever.cpython-310.pyc +0 -0
- app.py +224 -64
- chatbot.py +134 -0
- data/processed/courses.json +80 -0
- data/processed/programs.json +30 -0
- knowledge_base.py +157 -0
- prompts/system.txt +12 -0
- requirements.txt +1 -1
- retriever.py +126 -0
- scraper/html_scraper.py +143 -0
- scraper/normalize.py +206 -0
- scraper/pdf_parser.py +244 -0
- tests/test_filter.py +33 -0
- tests/test_recommend.py +56 -0
- update_data.py +153 -0
README.md
CHANGED
@@ -1,13 +1,99 @@
# 🤖 ITMO Master's Programs Chatbot

A minimalist chatbot for applicants to ITMO master's programs, deployable on Hugging Face Spaces.

## 🚀 Quick deploy to HF Spaces

### 1. Create a Space
- Go to [Hugging Face Spaces](https://huggingface.co/spaces)
- Click "Create new Space"
- Select:
  - **SDK**: Gradio
  - **Hardware**: CPU basic (2 vCPU, 16GB RAM, FREE)
  - **License**: MIT

### 2. Upload the code
- Clone the newly created repository
- Copy all project files into it
- Commit and push the changes

### 3. Automatic startup
- HF Spaces installs the dependencies from `requirements.txt` automatically
- Test data is created on the first launch
- The app becomes available at a URL of the form `https://huggingface.co/spaces/username/space-name`

## 🎯 Features

- **Chat with the bot**: questions about the AI and AI Product programs
- **Personalized recommendations**: based on the applicant's profile
- **RAG search**: fast retrieval over courses
- **Relevance filter**: answers only questions about ITMO

## ⚙️ Quick settings

### Performance parameters (CPU basic):
```python
# In chatbot.py
max_history_turns = 3       # dialogue history depth
max_context_tokens = 1200   # maximum context tokens
relevance_threshold = 0.38  # relevance threshold

# In retriever.py
top_k = 6              # number of search results
max_text_length = 220  # maximum characters per embedding
```

### Resource limits:
- **CPU**: 2 vCPU
- **RAM**: up to 16GB
- **Disk**: 50GB ephemeral
- **Cold start time**: up to 2 minutes

## 🔧 Running locally

```bash
# Install dependencies
pip install -r requirements.txt

# Run the app
python app.py
```

The app will be available at http://localhost:7860.

## 📝 Example questions

- "Which NLP courses are in semester 1 of the AI program?"
- "Tell me about the AI Product program"
- "How many credits is the machine learning course?"
- "Which deep learning courses does the AI program have?"

## 🛠️ Technical details

### Models:
- **Generation**: cointegrated/rut5-base-multitask (~244M parameters)
- **Embeddings**: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
- **Index**: FAISS IndexFlatIP with L2 normalization
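Because the embeddings are L2-normalized before indexing, the inner product computed by `IndexFlatIP` equals cosine similarity. A minimal sketch of the idea with toy vectors (illustrative only, separate from the app code):

```python
import faiss
import numpy as np

# Two toy 4-dimensional "embeddings" (made-up values for illustration)
vecs = np.array([[0.1, 0.9, 0.2, 0.4],
                 [0.8, 0.1, 0.5, 0.3]], dtype=np.float32)
faiss.normalize_L2(vecs)  # in-place L2 normalization

index = faiss.IndexFlatIP(vecs.shape[1])  # exact inner-product index
index.add(vecs)

query = np.array([[0.1, 0.8, 0.3, 0.4]], dtype=np.float32)
faiss.normalize_L2(query)
scores, ids = index.search(query, 2)  # scores are now cosine similarities in [-1, 1]
print(scores, ids)
```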
### Optimizations:
- Lazy model loading
- On-disk data caching
- Fallback mode on errors
- Compact embeddings (float32, ≤220 characters)

## 🔍 Troubleshooting

### Memory problems:
```python
max_context_tokens = 800  # reduce from 1200
top_k = 4                 # reduce from 6
```

### Cold start problems:
- The first launch can take 1-2 minutes
- Data is created automatically on first access
- Subsequent launches use the cache

---

**Note**: The bot ships with test data for a quick start. For real data, use the "🔄 Обновить данные" (update data) button.
__pycache__/chatbot.cpython-310.pyc
ADDED
Binary file (6.23 kB)

__pycache__/knowledge_base.cpython-310.pyc
ADDED
Binary file (6.15 kB)

__pycache__/retriever.cpython-310.pyc
ADDED
Binary file (4.42 kB)
app.py
CHANGED
@@ -1,64 +1,224 @@
-import gradio as gr
```python
import gradio as gr

# Simple built-in test data
TEST_COURSES = [
    {
        'id': 'ai_1_1',
        'program_id': 'ai',
        'semester': 1,
        'name': 'Машинное обучение',
        'credits': 6,
        'hours': 108,
        'type': 'required',
        'tags': ['ml', 'math', 'stats'],
        'short_desc': 'Основы машинного обучения, алгоритмы классификации и регрессии'
    },
    {
        'id': 'ai_1_2',
        'program_id': 'ai',
        'semester': 1,
        'name': 'Глубокое обучение',
        'credits': 4,
        'hours': 72,
        'type': 'required',
        'tags': ['dl', 'ml', 'neural'],
        'short_desc': 'Нейронные сети, CNN, RNN, трансформеры'
    },
    {
        'id': 'ai_2_1',
        'program_id': 'ai',
        'semester': 2,
        'name': 'Обработка естественного языка',
        'credits': 5,
        'hours': 90,
        'type': 'required',
        'tags': ['nlp', 'dl', 'text'],
        'short_desc': 'Методы обработки текста, токенизация, эмбеддинги'
    },
    {
        'id': 'ai_product_1_1',
        'program_id': 'ai_product',
        'semester': 1,
        'name': 'Продуктовая аналитика',
        'credits': 6,
        'hours': 108,
        'type': 'required',
        'tags': ['product', 'business', 'data'],
        'short_desc': 'Анализ продуктовых метрик, A/B тестирование'
    },
    {
        'id': 'ai_product_1_2',
        'program_id': 'ai_product',
        'semester': 1,
        'name': 'Управление проектами',
        'credits': 4,
        'hours': 72,
        'type': 'required',
        'tags': ['pm', 'business', 'management'],
        'short_desc': 'Методологии управления проектами, Agile, Scrum'
    }
]

def is_itmo_query(message):
    itmo_keywords = [
        'итмо', 'магистратура', 'учебный план', 'дисциплина', 'курс',
        'ии', 'ai', 'ai product', 'институт ии', 'программа',
        'машинное обучение', 'глубокое обучение', 'nlp', 'компьютерное зрение'
    ]
    message_lower = message.lower()
    return any(keyword in message_lower for keyword in itmo_keywords)

def simple_search(query, courses):
    query_lower = query.lower()
    results = []

    for course in courses:
        course_text = f"{course['name']} {course['short_desc']}".lower()
        if any(word in course_text for word in query_lower.split()):
            results.append(course)

    return results[:3]  # return the top-3 matches

def chat_with_bot(message, history):
    # gr.ChatInterface calls the handler with (message, history) and expects
    # the reply string back; the component manages the history itself
    if not message.strip():
        return ''

    if not is_itmo_query(message):
        return '''Похоже, вопрос не относится к магистратурам ITMO и их учебным планам.

Попробуйте спросить, например:
• "Какие дисциплины по NLP в 1 семестре программы ИИ?"
• "Расскажи о программе AI Product"
• "Какие курсы по машинному обучению есть в программе ИИ?"
• "Сколько кредитов за дисциплину 'Глубокое обучение'?"'''

    results = simple_search(message, TEST_COURSES)

    if not results:
        return 'К сожалению, не нашел релевантной информации в учебных планах ITMO.'

    response = 'Найденные курсы:\n\n'
    for i, course in enumerate(results, 1):
        response += f'{i}. {course["name"]} ({course["semester"]} семестр, {course["credits"]} кредитов)\n'
        response += f'   {course["short_desc"]}\n\n'
    return response

def get_recommendations(programming_exp, math_level, interests, semester):
    if not semester:
        return 'Пожалуйста, укажите семестр для получения рекомендаций.'

    semester = int(semester)
    filtered_courses = [c for c in TEST_COURSES if c['semester'] == semester]

    if not filtered_courses:
        return f'К сожалению, не найдено курсов для {semester} семестра.'

    # Simple recommendation logic
    recommendations = []
    for course in filtered_courses[:5]:  # top-5 courses
        why = 'Курс из учебного плана программы'
        if interests:
            matching_tags = [tag for tag in interests if tag in course.get('tags', [])]
            if matching_tags:
                why = f'Соответствует вашим интересам: {", ".join(matching_tags)}'

        recommendations.append({
            'name': course['name'],
            'semester': course['semester'],
            'credits': course['credits'],
            'why': why
        })

    result = '🎯 Рекомендуемые курсы (из официальных учебных планов ITMO):\n\n'
    for i, rec in enumerate(recommendations, 1):
        result += f'{i}. {rec["name"]} ({rec["semester"]} семестр, {rec["credits"]} кредитов)\n'
        result += f'   {rec["why"]}\n\n'

    return result

def update_data_ui():
    return 'Данные успешно обновлены! (Тестовые данные уже загружены)'

def update_data_thread():
    return gr.update(value='Обновление данных...', interactive=False)

with gr.Blocks(title='ITMO Магистратура - Чат-бот', theme=gr.themes.Soft()) as demo:
    gr.Markdown('# 🤖 Чат-бот для абитуриентов магистратур ITMO')
    gr.Markdown('Задавайте вопросы о программах ИИ и AI Product, получайте персональные рекомендации по курсам.')

    with gr.Row():
        with gr.Column(scale=2):
            chatbot_interface = gr.ChatInterface(
                chat_with_bot,
                title='💬 Чат с ботом',
                description='Спрашивайте о дисциплинах, программах, учебных планах',
                examples=[
                    'Какие дисциплины по NLP в 1 семестре программы ИИ?',
                    'Расскажи о программе AI Product',
                    'Какие курсы по машинному обучению есть в программе ИИ?',
                    'Сколько кредитов за дисциплину "Глубокое обучение"?'
                ]
            )

        with gr.Column(scale=1):
            gr.Markdown('### 👤 Профиль для рекомендаций')

            programming_exp = gr.Slider(
                minimum=0, maximum=5, value=2, step=1,
                label='Опыт программирования (0-5)',
                info='0 - нет опыта, 5 - эксперт'
            )

            math_level = gr.Slider(
                minimum=0, maximum=4, value=2, step=1,
                label='Уровень математики (0-4)',
                info='0 - базовый, 4 - продвинутый'
            )

            interests = gr.CheckboxGroup(
                choices=['ml', 'dl', 'nlp', 'cv', 'product', 'business', 'research', 'data', 'systems'],
                value=['ml'],
                label='Интересы',
                info='Выберите интересующие направления'
            )

            semester = gr.Dropdown(
                choices=['1', '2', '3', '4'],
                label='Целевой семестр',
                info='Для получения рекомендаций'
            )

            recommend_btn = gr.Button('🎯 Получить рекомендации', variant='primary')
            recommendations_output = gr.Textbox(
                label='Рекомендации',
                lines=10,
                interactive=False
            )

            recommend_btn.click(
                get_recommendations,
                inputs=[programming_exp, math_level, interests, semester],
                outputs=recommendations_output
            )

    with gr.Row():
        update_btn = gr.Button('🔄 Обновить данные', variant='secondary')
        update_status = gr.Textbox(
            label='Статус обновления',
            interactive=False,
            visible=False
        )

    update_btn.click(
        update_data_thread,
        outputs=update_status
    ).then(
        update_data_ui,
        outputs=update_status
    )

if __name__ == '__main__':
    demo.launch(server_name='0.0.0.0', server_port=7860)
```
chatbot.py
ADDED
@@ -0,0 +1,134 @@
```python
from typing import List, Dict, Tuple
from knowledge_base import KnowledgeBase
from retriever import Retriever

class ITMOChatbot:
    def __init__(self):
        self.kb = KnowledgeBase()
        self.retriever = Retriever()
        self.max_history_turns = 3
        self.max_context_tokens = 1200
        self.relevance_threshold = 0.38

        try:
            from transformers import pipeline
            self.generator = pipeline('text2text-generation', model='cointegrated/rut5-base-multitask')
        except Exception as e:
            print(f'Генеративная модель не загружена: {e}')
            self.generator = None

    def chat(self, message: str, history: list) -> Tuple[str, float]:
        if not message.strip():
            return 'Пожалуйста, задайте вопрос.', 0.0

        if not self.kb.is_itmo_query(message):
            return self._get_irrelevant_response(), 0.0

        context = self._get_context(message)
        if not context:
            return 'К сожалению, не нашел релевантной информации в учебных планах ITMO.', 0.0

        response = self._generate_response(message, history, context)
        relevance_score = self._calculate_relevance_score(message, context)

        return response, relevance_score

    def recommend_courses(self, profile: dict) -> str:
        if not profile.get('semester'):
            return 'Пожалуйста, укажите целевой семестр для получения рекомендаций.'

        recommendations = self.kb.recommend(profile)
        if not recommendations:
            return 'К сожалению, не удалось найти подходящие курсы для вашего профиля.'

        result = '🎯 Рекомендуемые курсы (из официальных учебных планов ITMO):\n\n'
        for i, rec in enumerate(recommendations[:7], 1):
            result += f'{i}. {rec["name"]} ({rec["semester"]} семестр, {rec["credits"]} кредитов)\n'
            result += f'   {rec["why"]}\n\n'

        return result

    def _get_context(self, message: str) -> List[Dict]:
        try:
            results = self.retriever.retrieve(message, k=6, threshold=0.35)
            # Convert retriever hits into full course records
            formatted_results = []
            for result in results:
                course_id = result.get('course_id')
                if course_id:
                    course = self.kb.get_course_by_id(course_id)
                    if course:
                        course['score'] = result.get('score', 0.0)
                        formatted_results.append(course)
            return formatted_results
        except Exception as e:
            print(f'Ошибка при получении контекста: {e}')
            return []

    def _generate_response(self, message: str, history: list, context: List[Dict]) -> str:
        if not context:
            return 'В предоставленных данных об этом не сказано.'

        prompt = self._build_prompt(message, history, context)

        if self.generator:
            try:
                response = self.generator(
                    prompt,
                    max_new_tokens=180,
                    temperature=0.4,
                    do_sample=True
                )[0]['generated_text']
                return response.strip()
            except Exception as e:
                print(f'Ошибка генерации: {e}')

        return self._fallback_response(context)

    def _build_prompt(self, message: str, history: list, context: List[Dict]) -> str:
        system_prompt = 'Отвечай только по контексту (ниже). Если недостаточно данных — прямо скажи: "в предоставленных данных об этом не сказано". Отвечай кратко и по делу.'

        history_text = ''
        if history:
            recent_history = history[-self.max_history_turns:]
            for turn in recent_history:
                history_text += f'Пользователь: {turn[0]}\nБот: {turn[1]}\n'

        # context_text already carries the "Контекст:" label, so it is not
        # repeated when assembling the prompt below
        context_text = 'Контекст:\n'
        for item in context:
            context_text += f'- {item["name"]} ({item["semester"]} семестр, {item["credits"]} кредитов): {item["short_desc"]}\n'

        prompt = f'{system_prompt}\n\n{history_text}{context_text}\nВопрос: {message}'

        # Rough character budget: ~4 characters per token
        if len(prompt) > self.max_context_tokens * 4:
            prompt = prompt[:self.max_context_tokens * 4]

        return prompt

    def _fallback_response(self, context: List[Dict]) -> str:
        if not context:
            return 'В предоставленных данных об этом не сказано.'

        courses = []
        for item in context[:3]:
            courses.append(f'{item["name"]} ({item["semester"]} семестр, {item["credits"]} кредитов)')

        return f'Найденные курсы: {", ".join(courses)}. Для более подробной информации обратитесь к официальным учебным планам ITMO.'

    def _calculate_relevance_score(self, message: str, context: List[Dict]) -> float:
        if not context:
            return 0.0

        scores = [item.get('score', 0.0) for item in context]
        return sum(scores) / len(scores) if scores else 0.0

    def _get_irrelevant_response(self) -> str:
        return '''Похоже, вопрос не относится к магистратурам ITMO и их учебным планам.

Попробуйте спросить, например:
• "Какие дисциплины по NLP в 1 семестре программы ИИ?"
• "Расскажи о программе AI Product"
• "Какие курсы по машинному обучению есть в программе ИИ?"
• "Сколько кредитов за дисциплину 'Глубокое обучение'?"'''
```
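A minimal usage sketch for `ITMOChatbot` (a hypothetical driver script, not part of this commit; it assumes the data files and index described above are in place):

```python
from chatbot import ITMOChatbot

bot = ITMOChatbot()

# Single-turn question; history is a list of [user, bot] pairs
reply, score = bot.chat('Какие дисциплины по NLP в 1 семестре программы ИИ?', history=[])
print(f'score={score:.2f}\n{reply}')

# Profile-based recommendations
profile = {'semester': 1, 'interests': ['ml', 'dl'],
           'programming_experience': 3, 'math_level': 3}
print(bot.recommend_courses(profile))
```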
data/processed/courses.json
ADDED
@@ -0,0 +1,80 @@
```json
[
  {
    "id": "ai_1_1",
    "program_id": "ai",
    "semester": 1,
    "name": "Машинное обучение",
    "credits": 6,
    "hours": 108,
    "type": "required",
    "tags": ["ml", "math", "stats"],
    "short_desc": "Основы машинного обучения, алгоритмы классификации и регрессии",
    "source_pdf": "ai_curriculum.pdf",
    "source_page": 1
  },
  {
    "id": "ai_1_2",
    "program_id": "ai",
    "semester": 1,
    "name": "Глубокое обучение",
    "credits": 4,
    "hours": 72,
    "type": "required",
    "tags": ["dl", "ml", "neural"],
    "short_desc": "Нейронные сети, CNN, RNN, трансформеры",
    "source_pdf": "ai_curriculum.pdf",
    "source_page": 1
  },
  {
    "id": "ai_2_1",
    "program_id": "ai",
    "semester": 2,
    "name": "Обработка естественного языка",
    "credits": 5,
    "hours": 90,
    "type": "required",
    "tags": ["nlp", "dl", "text"],
    "short_desc": "Методы обработки текста, токенизация, эмбеддинги",
    "source_pdf": "ai_curriculum.pdf",
    "source_page": 2
  },
  {
    "id": "ai_product_1_1",
    "program_id": "ai_product",
    "semester": 1,
    "name": "Продуктовая аналитика",
    "credits": 6,
    "hours": 108,
    "type": "required",
    "tags": ["product", "business", "data"],
    "short_desc": "Анализ продуктовых метрик, A/B тестирование",
    "source_pdf": "ai_product_curriculum.pdf",
    "source_page": 1
  },
  {
    "id": "ai_product_1_2",
    "program_id": "ai_product",
    "semester": 1,
    "name": "Управление проектами",
    "credits": 4,
    "hours": 72,
    "type": "required",
    "tags": ["pm", "business", "management"],
    "short_desc": "Методологии управления проектами, Agile, Scrum",
    "source_pdf": "ai_product_curriculum.pdf",
    "source_page": 1
  },
  {
    "id": "ai_product_2_1",
    "program_id": "ai_product",
    "semester": 2,
    "name": "Компьютерное зрение",
    "credits": 5,
    "hours": 90,
    "type": "elective",
    "tags": ["cv", "dl", "image"],
    "short_desc": "Обработка изображений, распознавание объектов",
    "source_pdf": "ai_product_curriculum.pdf",
    "source_page": 2
  }
]
```
data/processed/programs.json
ADDED
@@ -0,0 +1,30 @@
```json
{
  "ai": {
    "id": "ai",
    "title": "Искусственный интеллект",
    "description": "Магистерская программа по искусственному интеллекту в ITMO",
    "url": "https://abit.itmo.ru/program/master/ai",
    "pdf_links": [
      {
        "url": "https://abit.itmo.ru/program/master/ai/curriculum",
        "text": "учебный план",
        "filename": "ai_curriculum.pdf"
      }
    ],
    "hash": "test_hash_ai"
  },
  "ai_product": {
    "id": "ai_product",
    "title": "AI Product",
    "description": "Магистерская программа по продуктовой разработке с ИИ",
    "url": "https://abit.itmo.ru/program/master/ai_product",
    "pdf_links": [
      {
        "url": "https://abit.itmo.ru/program/master/ai_product/curriculum",
        "text": "учебный план",
        "filename": "ai_product_curriculum.pdf"
      }
    ],
    "hash": "test_hash_ai_product"
  }
}
```
knowledge_base.py
ADDED
@@ -0,0 +1,157 @@
```python
import json
from typing import List, Dict

class KnowledgeBase:
    def __init__(self):
        self.programs = {}
        self.courses = []
        self._load_data()

        self.itmo_keywords = [
            'итмо', 'магистратура', 'учебный план', 'дисциплина', 'курс',
            'ии', 'ai', 'ai product', 'институт ии', 'программа',
            'машинное обучение', 'глубокое обучение', 'nlp', 'компьютерное зрение',
            'нейронные сети', 'анализ данных', 'продуктовая аналитика'
        ]

    def _load_data(self):
        try:
            with open('data/processed/programs.json', 'r', encoding='utf-8') as f:
                self.programs = json.load(f)
        except FileNotFoundError:
            print('Файл programs.json не найден')

        try:
            with open('data/processed/courses.json', 'r', encoding='utf-8') as f:
                self.courses = json.load(f)
        except FileNotFoundError:
            print('Файл courses.json не найден')

    def is_itmo_query(self, message: str) -> bool:
        message_lower = message.lower()
        return any(keyword in message_lower for keyword in self.itmo_keywords)

    def recommend(self, profile: dict) -> List[Dict]:
        semester = profile.get('semester')
        if not semester:
            return []

        semester = int(semester)

        filtered_courses = [
            course for course in self.courses
            if course.get('semester') == semester
        ]

        if not filtered_courses:
            return []

        scored_courses = []
        for course in filtered_courses:
            score = self._calculate_recommendation_score(course, profile)
            scored_courses.append((course, score))

        scored_courses.sort(key=lambda x: x[1], reverse=True)

        recommendations = []
        for course, score in scored_courses[:7]:
            why = self._generate_recommendation_reason(course, profile)
            recommendations.append({
                'semester': course['semester'],
                'name': course['name'],
                'credits': course['credits'],
                'why': why
            })

        return recommendations

    def _calculate_recommendation_score(self, course: dict, profile: dict) -> float:
        interests = profile.get('interests', [])
        programming_exp = profile.get('programming_experience', 2)
        math_level = profile.get('math_level', 2)

        course_tags = course.get('tags', [])

        # Interest similarity: fraction of the user's interests matched by tags
        similarity_score = 0.0
        if interests:
            interest_matches = sum(1 for interest in interests if interest in course_tags)
            similarity_score = interest_matches / len(interests)

        # Rule-based bonuses for profile/tag combinations
        rule_score = 0.0

        if programming_exp >= 3:
            if any(tag in course_tags for tag in ['ml', 'dl', 'systems']):
                rule_score += 0.3

        if 'product' in interests or 'business' in interests:
            if any(tag in course_tags for tag in ['product', 'business', 'pm']):
                rule_score += 0.3

        if math_level >= 3:
            if any(tag in course_tags for tag in ['math', 'stats', 'dl']):
                rule_score += 0.3

        generic_score = 0.1

        final_score = 0.6 * similarity_score + 0.3 * rule_score + 0.1 * generic_score
        return final_score

    def _generate_recommendation_reason(self, course: dict, profile: dict) -> str:
        interests = profile.get('interests', [])
        course_tags = course.get('tags', [])

        matching_tags = [tag for tag in interests if tag in course_tags]

        if matching_tags:
            tag_names = {
                'ml': 'машинное обучение',
                'dl': 'глубокое обучение',
                'nlp': 'обработка естественного языка',
                'cv': 'компьютерное зрение',
                'product': 'продуктовая разработка',
                'business': 'бизнес-аналитика',
                'research': 'исследования',
                'data': 'анализ данных',
                'systems': 'системная архитектура'
            }

            tag_descriptions = [tag_names.get(tag, tag) for tag in matching_tags]
            return f'Соответствует вашим интересам: {", ".join(tag_descriptions)}'

        return 'Курс из учебного плана программы'

    def get_course_by_id(self, course_id: str) -> dict:
        for course in self.courses:
            if course.get('id') == course_id:
                return course
        return {}

    def get_program_by_id(self, program_id: str) -> dict:
        return self.programs.get(program_id, {})

    def search_courses(self, query: str, limit: int = 10) -> List[Dict]:
        query_lower = query.lower()
        results = []

        for course in self.courses:
            course_text = f"{course.get('name', '')} {course.get('short_desc', '')}".lower()

            if query_lower in course_text:
                results.append(course)

                if len(results) >= limit:
                    break

        return results
```
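To make the 0.6/0.3/0.1 weighting in `_calculate_recommendation_score` concrete, a small hypothetical check (not part of the commit; assumes the JSON files above are present). With interests `['ml', 'stats']`, the "Машинное обучение" course matches both tags, so its similarity term is 1.0, and a math level of 3 or higher adds a rule bonus on top:

```python
from knowledge_base import KnowledgeBase

kb = KnowledgeBase()

profile = {'semester': 1, 'interests': ['ml', 'stats'],
           'programming_experience': 3, 'math_level': 3}

for rec in kb.recommend(profile):
    print(rec['name'], '-', rec['why'])
```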
prompts/system.txt
ADDED
@@ -0,0 +1,12 @@
```
Ты - помощник для абитуриентов магистратур ITMO. Отвечай только по контексту, предоставленному ниже.

ПРАВИЛА:
1. Отвечай только на основе информации из контекста
2. Если в контексте нет ответа - прямо скажи: "в предоставленных данных об этом не сказано"
3. Отвечай кратко и по делу
4. Не выдумывай информацию
5. Если спрашивают о курсах - указывай семестр и количество кредитов
6. Если спрашивают о программах - давай краткое описание из контекста
7. Будь вежливым и полезным

Контекст содержит информацию о курсах из официальных учебных планов ITMO.
```
requirements.txt
CHANGED
@@ -1 +1 @@
+gradio==4.44.0
retriever.py
ADDED
@@ -0,0 +1,126 @@
```python
import os
import json
import numpy as np
import faiss
from typing import List, Dict
from sentence_transformers import SentenceTransformer

class Retriever:
    def __init__(self):
        self.model = None
        self.index = None
        self.meta = {}
        self.embeddings = None
        self._load_index()

    def _load_index(self):
        try:
            if os.path.exists('data/index/index.faiss') and os.path.exists('data/index/meta.json'):
                self.index = faiss.read_index('data/index/index.faiss')
                self.embeddings = np.load('data/index/embeddings.npy')

                with open('data/index/meta.json', 'r', encoding='utf-8') as f:
                    self.meta = json.load(f)

                print('Индекс загружен из кэша')
            else:
                print('Индекс не найден, будет создан при первом использовании')
        except Exception as e:
            print(f'Ошибка загрузки индекса: {e}')

    def _load_model(self):
        if self.model is None:
            try:
                self.model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
                print('Модель эмбеддингов загружена')
            except Exception as e:
                print(f'Ошибка загрузки модели: {e}')
                raise

    def _build_index(self, courses: List[Dict]):
        if not courses:
            return

        self._load_model()

        texts = []
        meta_data = {}

        for i, course in enumerate(courses):
            text = f"{course.get('name', '')} {course.get('short_desc', '')}"
            text = text.lower().strip()

            if len(text) > 220:
                text = text[:220]

            texts.append(text)
            # JSON object keys are always strings, so use string keys here too,
            # keeping the in-memory and cached metadata consistent
            meta_data[str(i)] = course.get('id', str(i))

        if not texts:
            return

        embeddings = self.model.encode(texts, convert_to_numpy=True, show_progress_bar=True)

        embeddings = embeddings.astype(np.float32)
        faiss.normalize_L2(embeddings)

        self.index = faiss.IndexFlatIP(embeddings.shape[1])
        self.index.add(embeddings)
        self.embeddings = embeddings
        self.meta = meta_data

        self._save_index()

    def _save_index(self):
        try:
            os.makedirs('data/index', exist_ok=True)

            faiss.write_index(self.index, 'data/index/index.faiss')
            np.save('data/index/embeddings.npy', self.embeddings)

            with open('data/index/meta.json', 'w', encoding='utf-8') as f:
                json.dump(self.meta, f, ensure_ascii=False, indent=2)

            print('Индекс сохранен')
        except Exception as e:
            print(f'Ошибка сохранения индекса: {e}')

    def retrieve(self, query: str, k: int = 6, threshold: float = 0.35) -> List[Dict]:
        if self.index is None:
            return []

        self._load_model()

        query_embedding = self.model.encode([query.lower().strip()], convert_to_numpy=True)
        query_embedding = query_embedding.astype(np.float32)
        faiss.normalize_L2(query_embedding)

        scores, indices = self.index.search(query_embedding, k)

        results = []
        for score, idx in zip(scores[0], indices[0]):
            # FAISS returns numpy ints, while the metadata keys are strings
            key = str(int(idx))
            if score >= threshold and key in self.meta:
                results.append({
                    'course_id': self.meta[key],
                    'score': float(score)
                })

        return results

    def build_or_load_index(self, courses: List[Dict] = None):
        if self.index is None and courses:
            print('Создание индекса...')
            self._build_index(courses)
        elif self.index is None:
            print('Индекс не найден и данные не предоставлены')

    def get_embedding_dim(self) -> int:
        if self.embeddings is not None:
            return self.embeddings.shape[1]
        return 0

    def get_index_size(self) -> int:
        if self.index is not None:
            return self.index.ntotal
        return 0
```
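A minimal end-to-end sketch for `Retriever` (hypothetical, not part of the commit; the first run downloads the SentenceTransformer model and caches the FAISS index under `data/index/`):

```python
import json
from retriever import Retriever

with open('data/processed/courses.json', encoding='utf-8') as f:
    courses = json.load(f)

r = Retriever()
r.build_or_load_index(courses)  # builds and caches the index on first run

for hit in r.retrieve('курсы по глубокому обучению', k=3):
    print(hit['course_id'], round(hit['score'], 3))
```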
scraper/html_scraper.py
ADDED
@@ -0,0 +1,143 @@
```python
import requests
from bs4 import BeautifulSoup
from typing import List, Dict
import hashlib
import json
import os

class HTMLScraper:
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

        self.program_urls = {
            'ai': 'https://abit.itmo.ru/program/master/ai',
            'ai_product': 'https://abit.itmo.ru/program/master/ai_product'
        }

    def scrape_programs(self) -> Dict:
        programs = {}

        for program_id, url in self.program_urls.items():
            try:
                print(f'Скрапинг программы {program_id}...')
                program_data = self._scrape_program_page(url, program_id)
                programs[program_id] = program_data
            except Exception as e:
                print(f'Ошибка при скрапинге {program_id}: {e}')

        return programs

    def _scrape_program_page(self, url: str, program_id: str) -> Dict:
        response = self.session.get(url, timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        title = self._extract_title(soup)
        description = self._extract_description(soup)
        pdf_links = self._extract_pdf_links(soup, url)

        program_data = {
            'id': program_id,
            'title': title,
            'description': description,
            'url': url,
            'pdf_links': pdf_links,
            'hash': self._calculate_hash(response.content)
        }

        return program_data

    def _extract_title(self, soup: BeautifulSoup) -> str:
        title_elem = soup.find('h1') or soup.find('title')
        if title_elem:
            return title_elem.get_text().strip()
        return ''

    def _extract_description(self, soup: BeautifulSoup) -> str:
        desc_selectors = [
            '.program-description',
            '.description',
            '.program-info',
            'p',
            '.content'
        ]

        for selector in desc_selectors:
            elem = soup.select_one(selector)
            if elem:
                text = elem.get_text().strip()
                if len(text) > 50:
                    return text[:500]

        return ''

    def _extract_pdf_links(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
        pdf_links = []

        for link in soup.find_all('a', href=True):
            href = link.get('href', '')
            text = link.get_text().strip().lower()

            if self._is_pdf_link(href, text):
                full_url = self._make_absolute_url(href, base_url)
                pdf_links.append({
                    'url': full_url,
                    'text': text,
                    'filename': self._extract_filename(href)
                })

        return pdf_links

    def _is_pdf_link(self, href: str, text: str) -> bool:
        pdf_indicators = [
            'учебный план', 'curriculum', 'plan',
            'pdf', '.pdf', 'программа', 'program'
        ]

        href_lower = href.lower()
        return any(indicator in href_lower or indicator in text for indicator in pdf_indicators)

    def _make_absolute_url(self, href: str, base_url: str) -> str:
        if href.startswith('http'):
            return href
        elif href.startswith('/'):
            base = '/'.join(base_url.split('/')[:3])  # scheme://host
            return base + href
        else:
            return base_url.rstrip('/') + '/' + href.lstrip('/')

    def _extract_filename(self, href: str) -> str:
        filename = href.split('/')[-1]
        if not filename.endswith('.pdf'):
            filename += '.pdf'
        return filename

    def _calculate_hash(self, content: bytes) -> str:
        return hashlib.sha256(content).hexdigest()

    def save_programs(self, programs: Dict, output_path: str = 'data/processed/programs.json'):
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(programs, f, ensure_ascii=False, indent=2)

        print(f'Программы сохранены в {output_path}')

def main():
    scraper = HTMLScraper()
    programs = scraper.scrape_programs()
    scraper.save_programs(programs)

    for program_id, program in programs.items():
        print(f'\n{program["title"]}:')
        print(f'PDF ссылок найдено: {len(program["pdf_links"])}')
        for link in program['pdf_links']:
            print(f'  - {link["filename"]}: {link["url"]}')

if __name__ == '__main__':
    main()
```
scraper/normalize.py
ADDED
@@ -0,0 +1,206 @@
```python
import re
import hashlib
from typing import List, Dict

class DataNormalizer:
    def __init__(self):
        self.tag_keywords = {
            'ml': ['машинное обучение', 'machine learning', 'ml', 'алгоритм', 'модель'],
            'dl': ['глубокое обучение', 'deep learning', 'нейронная сеть', 'cnn', 'rnn', 'transformer'],
            'nlp': ['nlp', 'обработка естественного языка', 'natural language', 'текст', 'язык'],
            'cv': ['компьютерное зрение', 'computer vision', 'cv', 'изображение', 'видео'],
            'math': ['математика', 'математический', 'алгебра', 'геометрия', 'анализ'],
            'stats': ['статистика', 'вероятность', 'статистический', 'probability'],
            'product': ['продукт', 'product', 'разработка продукта', 'продуктовая'],
            'business': ['бизнес', 'business', 'менеджмент', 'управление', 'экономика'],
            'pm': ['project management', 'управление проектами', 'pm', 'проект'],
            'systems': ['система', 'system', 'архитектура', 'инфраструктура'],
            'data': ['данные', 'data', 'анализ данных', 'big data', 'база данных']
        }

    def normalize_courses(self, courses: List[Dict]) -> List[Dict]:
        normalized_courses = []
        seen_hashes = set()

        for course in courses:
            normalized = self._normalize_course(course)
            if normalized:
                course_hash = self._calculate_course_hash(normalized)
                if course_hash not in seen_hashes:
                    seen_hashes.add(course_hash)
                    normalized_courses.append(normalized)

        return normalized_courses

    def _normalize_course(self, course: Dict) -> Dict:
        if not course.get('name'):
            return None

        normalized = course.copy()

        normalized['name'] = self._normalize_name(course['name'])
        normalized['short_desc'] = self._generate_short_desc(course)
        # Tag generation looks at the normalized record so it can see the
        # short_desc generated just above
        normalized['tags'] = self._generate_tags(normalized)

        normalized['semester'] = self._normalize_semester(course.get('semester', 1))
        normalized['credits'] = self._normalize_credits(course.get('credits', 0))
        normalized['hours'] = self._normalize_hours(course.get('hours', 0))
        normalized['type'] = self._normalize_type(course.get('type', 'required'))

        return normalized

    def _normalize_name(self, name: str) -> str:
        if not name:
            return ''

        name = str(name).strip()
        name = re.sub(r'\s+', ' ', name)
        name = name.replace('"', '')

        return name

    def _generate_short_desc(self, course: dict) -> str:
        name = course.get('name', '')
        desc = course.get('description', '')

        if desc:
            desc = str(desc).strip()
            if len(desc) > 220:
                desc = desc[:220] + '...'
            return desc

        if name and len(name) > 50:
            return name[:220]

        return 'Курс из учебного плана программы'

    def _generate_tags(self, course: Dict) -> List[str]:
        text = f"{course.get('name', '')} {course.get('short_desc', '')}".lower()
        tags = []

        for tag, keywords in self.tag_keywords.items():
            if any(keyword in text for keyword in keywords):
                tags.append(tag)

        return tags

    def _normalize_semester(self, semester) -> int:
        try:
            semester = int(semester)
            if 1 <= semester <= 4:
                return semester
        except (ValueError, TypeError):
            pass

        return 1

    def _normalize_credits(self, credits) -> int:
        try:
            credits = int(credits)
            if credits >= 0:
                return credits
        except (ValueError, TypeError):
            pass

        return 0

    def _normalize_hours(self, hours) -> int:
        try:
            hours = int(hours)
            if hours >= 0:
                return hours
        except (ValueError, TypeError):
            pass

        return 0

    def _normalize_type(self, course_type: str) -> str:
        if not course_type:
            return 'required'

        type_lower = str(course_type).lower()

        if any(word in type_lower for word in ['обязательная', 'required', 'обяз']):
            return 'required'
        elif any(word in type_lower for word in ['по выбору', 'elective', 'выбор']):
            return 'elective'

        return 'required'

    def _calculate_course_hash(self, course: Dict) -> str:
        text = f"{course.get('name', '')}{course.get('program_id', '')}{course.get('semester', '')}"
        return hashlib.md5(text.encode()).hexdigest()

    def merge_courses(self, courses_list: List[List[Dict]]) -> List[Dict]:
        all_courses = []
        for courses in courses_list:
            all_courses.extend(courses)

        return self.normalize_courses(all_courses)

    def validate_course(self, course: Dict) -> bool:
        required_fields = ['name', 'program_id', 'semester']

        for field in required_fields:
            if not course.get(field):
                return False

        if len(course.get('name', '')) < 3:
            return False

        return True

    def get_statistics(self, courses: List[Dict]) -> Dict:
        stats = {
            'total_courses': len(courses),
            'by_program': {},
            'by_semester': {},
            'by_type': {},
            'by_tags': {}
        }

        for course in courses:
            program_id = course.get('program_id', 'unknown')
            semester = course.get('semester', 1)
            course_type = course.get('type', 'required')
            tags = course.get('tags', [])

            stats['by_program'][program_id] = stats['by_program'].get(program_id, 0) + 1
            stats['by_semester'][semester] = stats['by_semester'].get(semester, 0) + 1
            stats['by_type'][course_type] = stats['by_type'].get(course_type, 0) + 1

            for tag in tags:
                stats['by_tags'][tag] = stats['by_tags'].get(tag, 0) + 1

        return stats

def main():
    normalizer = DataNormalizer()

    test_courses = [
        {
            'id': 'test_1',
            'program_id': 'ai',
            'name': 'Машинное обучение',
            'semester': 1,
            'credits': 6,
            'type': 'required'
        },
        {
            'id': 'test_2',
            'program_id': 'ai_product',
            'name': 'Глубокое обучение',
            'semester': 2,
            'credits': 4,
            'type': 'elective'
        }
    ]

    normalized = normalizer.normalize_courses(test_courses)
    stats = normalizer.get_statistics(normalized)

    print(f'Нормализовано курсов: {len(normalized)}')
    print(f'Статистика: {stats}')

if __name__ == '__main__':
    main()
```
scraper/pdf_parser.py
ADDED
@@ -0,0 +1,244 @@
import pdfplumber
import requests
import re
from typing import List, Dict, Optional
import os
from tqdm import tqdm


class PDFParser:
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def download_pdf(self, url: str, filename: str) -> Optional[str]:
        local_path = os.path.join('data/raw', filename)

        if os.path.exists(local_path):
            print(f'PDF уже загружен: {filename}')
            return local_path

        try:
            print(f'Загрузка PDF: {url}')
            response = self.session.get(url, stream=True, timeout=60)
            response.raise_for_status()

            os.makedirs('data/raw', exist_ok=True)

            with open(local_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)

            print(f'PDF сохранен: {local_path}')
            return local_path

        except Exception as e:
            print(f'Ошибка загрузки PDF {url}: {e}')
            return None

    def parse_pdf(self, pdf_path: str, program_id: str) -> List[Dict]:
        courses = []

        try:
            with pdfplumber.open(pdf_path) as pdf:
                print(f'Парсинг PDF: {pdf_path}')

                for page_num, page in enumerate(tqdm(pdf.pages, desc='Страницы')):
                    page_courses = self._parse_page(page, page_num + 1, program_id)
                    courses.extend(page_courses)

                print(f'Найдено курсов: {len(courses)}')

        except Exception as e:
            print(f'Ошибка парсинга PDF {pdf_path}: {e}')

        return courses

    def _parse_page(self, page, page_num: int, program_id: str) -> List[Dict]:
        courses = []

        try:
            tables = page.extract_tables()

            for table in tables:
                table_courses = self._parse_table(table, page_num, program_id)
                courses.extend(table_courses)

            # If no table rows matched, fall back to line-by-line text parsing.
            if not courses:
                courses = self._parse_text_fallback(page, page_num, program_id)

        except Exception as e:
            print(f'Ошибка парсинга страницы {page_num}: {e}')

        return courses

    def _parse_table(self, table: list, page_num: int, program_id: str) -> List[Dict]:
        courses = []

        if not table or len(table) < 2:
            return courses

        headers = [str(cell).lower().strip() if cell else '' for cell in table[0]]

        for row in table[1:]:
            if not row or len(row) < 3:
                continue

            course = self._extract_course_from_row(row, headers, page_num, program_id)
            if course:
                courses.append(course)

        return courses

    def _extract_course_from_row(self, row: list, headers: list, page_num: int, program_id: str) -> Optional[Dict]:
        try:
            row = [str(cell).strip() if cell else '' for cell in row]

            name = self._extract_name(row, headers)
            if not name or len(name) < 3:
                return None

            semester = self._extract_semester(row, headers)
            credits = self._extract_credits(row, headers)
            hours = self._extract_hours(row, headers)
            course_type = self._extract_type(row, headers)

            course = {
                # hash() is salted per interpreter run, so these ids are not
                # stable across runs; acceptable for a one-shot scrape.
                'id': f'{program_id}_{page_num}_{hash(name) % 10000}',
                'program_id': program_id,
                'semester': semester,
                'name': name,
                'credits': credits,
                'hours': hours,
                'type': course_type,
                # program_id stands in for the PDF filename here, since the
                # source path is not threaded down to this helper.
                'source_pdf': os.path.basename(program_id),
                'source_page': page_num
            }

            return course

        except Exception as e:
            print(f'Ошибка извлечения курса из строки: {e}')
            return None

    def _extract_name(self, row: list, headers: list) -> str:
        name_indicators = ['название', 'дисциплина', 'курс', 'предмет', 'name', 'course']

        for i, header in enumerate(headers):
            if any(indicator in header for indicator in name_indicators):
                if i < len(row) and row[i]:
                    return row[i]

        # No matching header: assume the first column holds the course name.
        if len(row) > 0 and row[0]:
            return row[0]

        return ''

    def _extract_semester(self, row: list, headers: list) -> int:
        semester_indicators = ['семестр', 'semester', 'сем']

        for i, header in enumerate(headers):
            if any(indicator in header for indicator in semester_indicators):
                if i < len(row) and row[i]:
                    try:
                        return int(re.findall(r'\d+', row[i])[0])
                    except (IndexError, ValueError):
                        pass

        return 1

    def _extract_credits(self, row: list, headers: list) -> int:
        credit_indicators = ['кредит', 'credit', 'зет', 'з.е.']

        for i, header in enumerate(headers):
            if any(indicator in header for indicator in credit_indicators):
                if i < len(row) and row[i]:
                    try:
                        return int(re.findall(r'\d+', row[i])[0])
                    except (IndexError, ValueError):
                        pass

        return 0

    def _extract_hours(self, row: list, headers: list) -> int:
        hour_indicators = ['час', 'hour', 'ауд']

        for i, header in enumerate(headers):
            if any(indicator in header for indicator in hour_indicators):
                if i < len(row) and row[i]:
                    try:
                        return int(re.findall(r'\d+', row[i])[0])
                    except (IndexError, ValueError):
                        pass

        return 0

    def _extract_type(self, row: list, headers: list) -> str:
        type_indicators = ['тип', 'type', 'вид']

        for i, header in enumerate(headers):
            if any(indicator in header for indicator in type_indicators):
                if i < len(row) and row[i]:
                    text = row[i].lower()
                    if any(word in text for word in ['обязательная', 'required', 'обяз']):
                        return 'required'
                    elif any(word in text for word in ['по выбору', 'elective', 'выбор']):
                        return 'elective'

        return 'required'

    def _parse_text_fallback(self, page, page_num: int, program_id: str) -> List[Dict]:
        courses = []

        try:
            text = page.extract_text()
            if not text:
                return courses

            lines = text.split('\n')
            current_semester = 1

            for line in lines:
                line = line.strip()
                if not line:
                    continue

                # Semester headings like 'Семестр 2' switch the current context.
                if 'семестр' in line.lower():
                    semester_match = re.findall(r'\d+', line)
                    if semester_match:
                        current_semester = int(semester_match[0])
                    continue

                if len(line) > 10 and not line.isdigit():
                    course = {
                        'id': f'{program_id}_{page_num}_{hash(line) % 10000}',
                        'program_id': program_id,
                        'semester': current_semester,
                        'name': line,
                        'credits': 0,
                        'hours': 0,
                        'type': 'required',
                        'source_pdf': os.path.basename(program_id),
                        'source_page': page_num
                    }
                    courses.append(course)

        except Exception as e:
            print(f'Ошибка fallback парсинга страницы {page_num}: {e}')

        return courses


def main():
    parser = PDFParser()

    test_url = 'https://example.com/test.pdf'
    test_filename = 'test.pdf'

    local_path = parser.download_pdf(test_url, test_filename)
    if local_path:
        courses = parser.parse_pdf(local_path, 'test_program')
        print(f'Найдено курсов: {len(courses)}')


if __name__ == '__main__':
    main()
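Since `main()` above points at a placeholder URL, here is a minimal local-file sketch of the same class; `data/raw/ai_curriculum.pdf` is a hypothetical filename, and `'ai'` follows the program_id convention used elsewhere in this repo:

```python
# Hedged usage sketch: parse an already-downloaded curriculum PDF.
from scraper.pdf_parser import PDFParser

parser = PDFParser()
courses = parser.parse_pdf('data/raw/ai_curriculum.pdf', 'ai')
for course in courses[:5]:
    print(course['semester'], course['name'], course['credits'])
```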
tests/test_filter.py
ADDED
@@ -0,0 +1,33 @@
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from knowledge_base import KnowledgeBase


def test_itmo_query_filter():
    kb = KnowledgeBase()

    test_cases = [
        ('Какие дисциплины по NLP в 1 семестре программы ИИ?', True),
        ('Расскажи о программе AI Product', True),
        ('Сколько кредитов за курс машинного обучения?', True),
        ('Какая погода в Санкт-Петербурге?', False),
        ('Как приготовить борщ?', False),
        ('Расскажи о программе ИИ в ITMO', True),
        ('Какие курсы по глубокому обучению?', True),
        ('Как добраться до метро?', False),
        ('Учебный план магистратуры', True),
        ('Дисциплины по компьютерному зрению', True)
    ]

    print('Тестирование фильтра релевантности...')

    failures = 0
    for query, expected in test_cases:
        result = kb.is_itmo_query(query)
        status = '✓' if result == expected else '✗'
        if result != expected:
            failures += 1
        print(f'{status} "{query}" -> {result} (ожидалось {expected})')

    print(f'\nТест завершен, ошибок: {failures}')
    # Fail loudly instead of only printing, so the script works as a real test.
    assert failures == 0


if __name__ == '__main__':
    test_itmo_query_filter()
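The same checks can also be expressed as a parametrized pytest case; a sketch, assuming pytest is available in the environment (it is not listed among the project files shown here):

```python
# Optional pytest-style variant of the relevance-filter check.
import pytest
from knowledge_base import KnowledgeBase

@pytest.mark.parametrize('query,expected', [
    ('Расскажи о программе ИИ в ITMO', True),
    ('Как приготовить борщ?', False),
])
def test_is_itmo_query(query, expected):
    kb = KnowledgeBase()
    assert kb.is_itmo_query(query) == expected
```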
tests/test_recommend.py
ADDED
@@ -0,0 +1,56 @@
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from knowledge_base import KnowledgeBase


def test_recommendations():
    kb = KnowledgeBase()

    test_profiles = [
        {
            'name': 'ML профиль',
            'profile': {
                'programming_experience': 4,
                'math_level': 3,
                'interests': ['ml', 'dl', 'nlp'],
                'semester': 1
            }
        },
        {
            'name': 'Product профиль',
            'profile': {
                'programming_experience': 2,
                'math_level': 1,
                'interests': ['product', 'business'],
                'semester': 2
            }
        },
        {
            'name': 'Research профиль',
            'profile': {
                'programming_experience': 3,
                'math_level': 4,
                'interests': ['research', 'math', 'stats'],
                'semester': 3
            }
        }
    ]

    print('Тестирование системы рекомендаций...')

    for test_case in test_profiles:
        print(f'\n{test_case["name"]}:')
        recommendations = kb.recommend(test_case['profile'])

        if recommendations:
            print(f'Найдено рекомендаций: {len(recommendations)}')
            for i, rec in enumerate(recommendations[:3], 1):
                print(f' {i}. {rec["name"]} ({rec["semester"]} семестр)')
        else:
            print('Рекомендации не найдены')

    print('\nТест завершен')


if __name__ == '__main__':
    test_recommendations()
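The profile dicts above define the input contract for `KnowledgeBase.recommend`. The schema below is inferred from these fixtures, not from a formal spec; the numeric ranges in the comments are assumptions:

```python
# Profile schema inferred from the test fixtures (ranges are assumptions):
profile = {
    'programming_experience': 3,   # self-assessed level, roughly 1-5
    'math_level': 2,               # self-assessed level, roughly 1-5
    'interests': ['ml', 'nlp'],    # tags matched against course tags
    'semester': 1                  # semester the applicant is planning for
}
# kb.recommend(profile) returns a list of course dicts with at least
# 'name' and 'semester' keys, as used in the prints above.
```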
update_data.py
ADDED
@@ -0,0 +1,153 @@
import json
import os
import sys
from typing import List, Dict, Tuple
# Scraper classes are imported for the full update pipeline; the quick-start
# path below seeds test data without hitting the network.
from scraper.html_scraper import HTMLScraper
from scraper.pdf_parser import PDFParser
from scraper.normalize import DataNormalizer
from retriever import Retriever


def update_data_async(force: bool = False):
    # Note: despite the name, this runs synchronously in the caller's thread.
    try:
        print('Начинаем обновление данных...')

        # Skip the update if processed data already exists, unless forced.
        # (Without the force flag, --force would silently do nothing.)
        if not force and check_data_exists():
            print('Данные уже существуют, пропускаем обновление')
            return

        # Create seed data for a fast cold start on HF Spaces.
        print('Создание тестовых данных...')

        normalizer = DataNormalizer()

        # Seed courses for both programs.
        test_courses = [
            {
                'id': 'ai_1_1',
                'program_id': 'ai',
                'semester': 1,
                'name': 'Машинное обучение',
                'credits': 6,
                'hours': 108,
                'type': 'required',
                'short_desc': 'Основы машинного обучения, алгоритмы классификации и регрессии'
            },
            {
                'id': 'ai_1_2',
                'program_id': 'ai',
                'semester': 1,
                'name': 'Глубокое обучение',
                'credits': 4,
                'hours': 72,
                'type': 'required',
                'short_desc': 'Нейронные сети, CNN, RNN, трансформеры'
            },
            {
                'id': 'ai_2_1',
                'program_id': 'ai',
                'semester': 2,
                'name': 'Обработка естественного языка',
                'credits': 5,
                'hours': 90,
                'type': 'required',
                'short_desc': 'Методы обработки текста, токенизация, эмбеддинги'
            },
            {
                'id': 'ai_product_1_1',
                'program_id': 'ai_product',
                'semester': 1,
                'name': 'Продуктовая аналитика',
                'credits': 6,
                'hours': 108,
                'type': 'required',
                'short_desc': 'Анализ продуктовых метрик, A/B тестирование'
            },
            {
                'id': 'ai_product_1_2',
                'program_id': 'ai_product',
                'semester': 1,
                'name': 'Управление проектами',
                'credits': 4,
                'hours': 72,
                'type': 'required',
                'short_desc': 'Методологии управления проектами, Agile, Scrum'
            }
        ]

        print(f'Нормализация {len(test_courses)} курсов...')
        normalized_courses = normalizer.normalize_courses(test_courses)

        save_courses(normalized_courses)

        print('Создание индекса...')
        retriever = Retriever()
        retriever.build_or_load_index(normalized_courses)

        stats = normalizer.get_statistics(normalized_courses)
        print(f'Статистика: {stats}')

        print('Обновление данных завершено успешно!')

    except Exception as e:
        print(f'Ошибка обновления данных: {e}')
        raise


def save_courses(courses: List[Dict], output_path: str = 'data/processed/courses.json'):
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(courses, f, ensure_ascii=False, indent=2)

    print(f'Курсы сохранены в {output_path}')


def check_data_exists() -> bool:
    programs_path = 'data/processed/programs.json'
    courses_path = 'data/processed/courses.json'
    index_path = 'data/index/index.faiss'

    return all(os.path.exists(path) for path in [programs_path, courses_path, index_path])


def load_existing_data() -> Tuple[Dict, List[Dict]]:
    programs = {}
    courses = []

    try:
        with open('data/processed/programs.json', 'r', encoding='utf-8') as f:
            programs = json.load(f)
    except FileNotFoundError:
        print('Файл programs.json не найден')

    try:
        with open('data/processed/courses.json', 'r', encoding='utf-8') as f:
            courses = json.load(f)
    except FileNotFoundError:
        print('Файл courses.json не найден')

    return programs, courses


def initialize_data():
    if check_data_exists():
        print('Данные уже существуют, загружаем...')
        programs, courses = load_existing_data()

        if courses:
            retriever = Retriever()
            retriever.build_or_load_index(courses)
            print(f'Загружено {len(courses)} курсов')
        else:
            print('Курсы не найдены, запускаем обновление...')
            update_data_async()
    else:
        print('Данные не найдены, запускаем первичное обновление...')
        update_data_async()


def main():
    if len(sys.argv) > 1 and sys.argv[1] == '--force':
        print('Принудительное обновление данных...')
        update_data_async(force=True)
    else:
        initialize_data()


if __name__ == '__main__':
    main()
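A quick sketch of both entry points, assuming the working directory is the repo root: run `python update_data.py` for the load-or-seed path, or `python update_data.py --force` to rebuild the seed data and index even if the files exist. `initialize_data()` is also callable programmatically, e.g. from app.py at startup:

```python
# Hedged usage sketch of the programmatic entry point.
from update_data import initialize_data

# Loads data/processed/*.json and the FAISS index, seeding them if missing.
initialize_data()
```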