import hashlib
import re
from collections import Counter
from typing import Dict, List, Optional


class DataNormalizer:
    """Clean up, tag and de-duplicate course dictionaries.

    Each course is a plain ``dict``. The keys read by this class are
    ``name``, ``description``, ``semester``, ``credits``, ``hours``,
    ``type`` and ``program_id``; any other keys are copied through
    unchanged by :meth:`normalize_courses`.
    """

    MIN_SEMESTER = 1
    MAX_SEMESTER = 4
    DEFAULT_SEMESTER = 1

    # Descriptions longer than this are cut and suffixed with '...'.
    SHORT_DESC_LIMIT = 220
    # A name is used as the short description only when it is longer than this.
    LONG_NAME_THRESHOLD = 50
    DEFAULT_SHORT_DESC = 'Курс из учебного плана программы'

    # Keywords at most this long that are pure ASCII letters are treated as
    # abbreviations and must not be embedded in a longer Latin word.
    ABBREVIATION_MAX_LEN = 3

    REQUIRED_MARKERS = ('обязательная', 'required', 'обяз')
    ELECTIVE_MARKERS = ('по выбору', 'elective', 'выбор')

    def __init__(self):
        """Set up the tag -> keyword table used by ``_generate_tags``."""
        self.tag_keywords = {
            'ml': ['машинное обучение', 'machine learning', 'ml', 'алгоритм', 'модель'],
            'dl': ['глубокое обучение', 'deep learning', 'нейронная сеть', 'cnn', 'rnn', 'transformer'],
            'nlp': ['nlp', 'обработка естественного языка', 'natural language', 'текст', 'язык'],
            'cv': ['компьютерное зрение', 'computer vision', 'cv', 'изображение', 'видео'],
            'math': ['математика', 'математический', 'алгебра', 'геометрия', 'анализ'],
            'stats': ['статистика', 'вероятность', 'статистический', 'probability'],
            'product': ['продукт', 'product', 'разработка продукта', 'продуктовая'],
            'business': ['бизнес', 'business', 'менеджмент', 'управление', 'экономика'],
            'pm': ['project management', 'управление проектами', 'pm', 'проект'],
            'systems': ['система', 'system', 'архитектура', 'инфраструктура'],
            'data': ['данные', 'data', 'анализ данных', 'big data', 'база данных'],
        }

    def normalize_courses(self, courses: List[Dict]) -> List[Dict]:
        """Normalize every course and drop duplicates.

        Courses without a usable name are skipped. Two courses are
        duplicates when their normalized name, ``program_id`` and
        semester are all equal; the first occurrence wins.
        """
        normalized_courses = []
        seen_hashes = set()
        for course in courses:
            normalized = self._normalize_course(course)
            if normalized is None:
                continue
            course_hash = self._calculate_course_hash(normalized)
            if course_hash in seen_hashes:
                continue
            seen_hashes.add(course_hash)
            normalized_courses.append(normalized)
        return normalized_courses

    def _normalize_course(self, course: Dict) -> Optional[Dict]:
        """Return a normalized shallow copy of ``course``.

        Returns ``None`` when the course has no name, or when the name is
        empty after cleaning (for example whitespace only).
        """
        raw_name = course.get('name')
        if not raw_name:
            return None
        name = self._normalize_name(raw_name)
        if not name:
            # The raw value was truthy but held nothing except
            # whitespace/quotes; treat it like a missing name.
            return None

        normalized = course.copy()
        normalized['name'] = name
        # Derived from the cleaned copy so the name fallback uses the
        # normalized name rather than the raw one.
        normalized['short_desc'] = self._generate_short_desc(normalized)
        normalized['tags'] = self._generate_tags(normalized)
        normalized['semester'] = self._normalize_semester(course.get('semester', 1))
        normalized['credits'] = self._normalize_credits(course.get('credits', 0))
        normalized['hours'] = self._normalize_hours(course.get('hours', 0))
        normalized['type'] = self._normalize_type(course.get('type', 'required'))
        return normalized

    def _normalize_name(self, name: str) -> str:
        """Strip straight double quotes and collapse whitespace in ``name``.

        Falsy input yields ``''``. Non-string input is converted with
        ``str()`` first.
        """
        if not name:
            return ''
        # Quotes are removed before trimming: removing them afterwards
        # leaves stray edge spaces ('" X "' -> ' X ').
        # NOTE(review): the previous code chained two identical
        # replace('"', '') calls - possibly typographic quotes were
        # intended for one of them; confirm before extending.
        text = str(name).replace('"', '')
        return re.sub(r'\s+', ' ', text).strip()

    def _generate_short_desc(self, course: dict) -> str:
        """Build a short description for ``course``.

        Preference order: the stripped ``description`` (cut to
        ``SHORT_DESC_LIMIT`` characters plus ``'...'`` when longer), then
        the name if it is longer than ``LONG_NAME_THRESHOLD`` characters,
        then a fixed default text.
        """
        raw_desc = course.get('description')
        desc = str(raw_desc).strip() if raw_desc else ''
        if desc:
            if len(desc) > self.SHORT_DESC_LIMIT:
                return desc[:self.SHORT_DESC_LIMIT] + '...'
            return desc

        # str() keeps len()/slicing safe when the name is not a string.
        name = str(course.get('name') or '')
        if len(name) > self.LONG_NAME_THRESHOLD:
            return name[:self.SHORT_DESC_LIMIT]
        return self.DEFAULT_SHORT_DESC

    def _keyword_matches(self, keyword: str, text: str) -> bool:
        """Tell whether ``keyword`` occurs in the already lower-cased ``text``.

        Short ASCII abbreviations ('ml', 'cv', 'pm', ...) must not be
        surrounded by Latin letters, so 'ml' does not fire inside 'html'.
        All other keywords keep plain substring matching, which lets a
        keyword match longer word forms that contain it.
        """
        is_abbreviation = (
            len(keyword) <= self.ABBREVIATION_MAX_LEN
            and keyword.isascii()
            and keyword.isalpha()
        )
        if not is_abbreviation:
            return keyword in text
        pattern = r'(?<![a-z])' + re.escape(keyword) + r'(?![a-z])'
        return re.search(pattern, text) is not None

    def _generate_tags(self, course: Dict) -> List[str]:
        """Return the tags whose keywords appear in the name or short description.

        Tags are returned in the iteration order of ``self.tag_keywords``.
        """
        text = f"{course.get('name', '')} {course.get('short_desc', '')}".lower()
        return [
            tag
            for tag, keywords in self.tag_keywords.items()
            if any(self._keyword_matches(keyword, text) for keyword in keywords)
        ]

    @staticmethod
    def _to_int(value) -> Optional[int]:
        """Convert ``value`` with ``int()``; return ``None`` if that fails."""
        try:
            return int(value)
        except (ValueError, TypeError, OverflowError):
            # OverflowError covers int(float('inf')).
            return None

    def _normalize_semester(self, semester) -> int:
        """Return ``semester`` as an int in 1..4, or 1 when invalid."""
        number = self._to_int(semester)
        if number is not None and self.MIN_SEMESTER <= number <= self.MAX_SEMESTER:
            return number
        return self.DEFAULT_SEMESTER

    def _normalize_credits(self, credits) -> int:
        """Return ``credits`` as a non-negative int, or 0 when invalid."""
        number = self._to_int(credits)
        if number is not None and number >= 0:
            return number
        return 0

    def _normalize_hours(self, hours) -> int:
        """Return ``hours`` as a non-negative int, or 0 when invalid."""
        number = self._to_int(hours)
        if number is not None and number >= 0:
            return number
        return 0

    def _normalize_type(self, course_type: str) -> str:
        """Map a free-form course type to ``'required'`` or ``'elective'``.

        Required markers are checked first; anything unrecognized or
        empty is treated as ``'required'``.
        """
        if not course_type:
            return 'required'
        type_lower = str(course_type).lower()
        if any(marker in type_lower for marker in self.REQUIRED_MARKERS):
            return 'required'
        if any(marker in type_lower for marker in self.ELECTIVE_MARKERS):
            return 'elective'
        return 'required'

    def _calculate_course_hash(self, course: Dict) -> str:
        """Return an MD5 hex digest identifying a course for de-duplication.

        The key is built from ``name``, ``program_id`` and ``semester``.
        """
        parts = tuple(
            str(course.get(field, ''))
            for field in ('name', 'program_id', 'semester')
        )
        # repr() of a tuple of strings quotes and escapes each field, so
        # ('ab', 'c') and ('a', 'bc') no longer produce the same key as
        # they did with plain concatenation.
        return hashlib.md5(repr(parts).encode()).hexdigest()

    def merge_courses(self, courses_list: List[List[Dict]]) -> List[Dict]:
        """Flatten several course lists and normalize the combined list."""
        all_courses = [course for courses in courses_list for course in courses]
        return self.normalize_courses(all_courses)

    def validate_course(self, course: Dict) -> bool:
        """Check that ``course`` has the required fields and a usable name.

        ``name``, ``program_id`` and ``semester`` must all be truthy, and
        the name must have at least 3 characters once surrounding
        whitespace is removed.
        """
        required_fields = ('name', 'program_id', 'semester')
        if not all(course.get(field) for field in required_fields):
            return False
        return len(str(course['name']).strip()) >= 3

    def get_statistics(self, courses: List[Dict]) -> Dict:
        """Count courses in total and per program, semester, type and tag.

        Missing keys are counted under ``'unknown'`` (program), ``1``
        (semester) and ``'required'`` (type). Missing or ``None`` tags
        contribute nothing.
        """
        by_program = Counter()
        by_semester = Counter()
        by_type = Counter()
        by_tags = Counter()
        for course in courses:
            by_program[course.get('program_id', 'unknown')] += 1
            by_semester[course.get('semester', 1)] += 1
            by_type[course.get('type', 'required')] += 1
            by_tags.update(course.get('tags') or [])
        # Plain dicts keep the returned (and printed) shape unchanged.
        return {
            'total_courses': len(courses),
            'by_program': dict(by_program),
            'by_semester': dict(by_semester),
            'by_type': dict(by_type),
            'by_tags': dict(by_tags),
        }


def main():
    """Normalize two sample courses and print the resulting statistics."""
    normalizer = DataNormalizer()
    test_courses = [
        {
            'id': 'test_1',
            'program_id': 'ai',
            'name': 'Машинное обучение',
            'semester': 1,
            'credits': 6,
            'type': 'required',
        },
        {
            'id': 'test_2',
            'program_id': 'ai_product',
            'name': 'Глубокое обучение',
            'semester': 2,
            'credits': 4,
            'type': 'elective',
        },
    ]
    normalized = normalizer.normalize_courses(test_courses)
    stats = normalizer.get_statistics(normalized)
    print(f'Нормализовано курсов: {len(normalized)}')
    print(f'Статистика: {stats}')


if __name__ == '__main__':
    main()