|
|
|
|
|
import sys
|
|
|
try:
|
|
|
from huggingface_hub import snapshot_download
|
|
|
except ImportError:
|
|
|
try:
|
|
|
from huggingface_hub import cached_download as snapshot_download
|
|
|
except ImportError:
|
|
|
from huggingface_hub import hf_hub_download as snapshot_download
|
|
|
|
|
|
from transformers import pipeline
|
|
|
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
import numpy as np
|
|
|
from sentence_transformers import SentenceTransformer
|
|
|
import logging
|
|
|
from typing import List, Dict, Set, Tuple, Optional
|
|
|
import time
|
|
|
import re
|
|
|
|
|
|
|
|
|
# Module-wide logging. NOTE: basicConfig runs at import time, so importing
# this module configures the root logger at INFO level.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load the sentence-embedding model once, at import time.
# ``model`` is bound to None up front so the name always exists, even when
# loading fails; code using it must check MODEL_LOADED first.
model = None
MODEL_LOADED = False
try:
    model = SentenceTransformer('all-MiniLM-L6-v2')
    MODEL_LOADED = True
    logger.info("Sentence transformer model loaded successfully")
except Exception as e:
    # Deliberately best-effort: a failed load must not break the import.
    # The failure is recorded in MODEL_LOADED and surfaced by the public
    # functions when they are called.
    logger.error(f"Failed to load sentence transformer model: {e}")
    model = None
    MODEL_LOADED = False
|
|
|
|
|
|
|
|
|
# In-process embedding cache keyed by course id. It is emptied wholesale by
# _clear_old_cache() once more than CACHE_TTL seconds have passed since the
# previous clear.
CACHE_TTL = 60 * 60  # seconds
embedding_cache = dict()
last_cache_clear = time.time()
|
|
|
|
|
|
|
|
|
# Weight of each signal in the combined recommendation score computed by
# recommend_courses().
SCORING_WEIGHTS = dict(
    semantic_similarity=0.5,
    popularity=0.2,
    category_relevance=0.3,
)
|
|
|
|
|
|
|
|
|
# Relation tables shared by several spellings of the same category. Each
# spelling below receives its own copy, so entries stay independent objects.
_FULLSTACK_RELATED = {
    'web development': 1.0, 'frontend': 0.9, 'backend': 0.9,
    'javascript': 0.8, 'react': 0.7, 'node.js': 0.7, 'php': 0.8,
}
_WEB_DEV_RELATED = {
    'fullstack': 1.0, 'frontend': 0.8, 'backend': 0.8,
    'javascript': 0.9, 'html': 0.7, 'php': 0.8,
}
_CYBERSECURITY_RELATED = {
    'networking': 0.8, 'linux': 0.7, 'python': 0.6,
    'ethical hacking': 0.9, 'security': 0.9,
}
_AIML_RELATED = {
    'python': 0.9, 'machine learning': 0.8, 'ai': 0.9,
    'deep learning': 0.8, 'data science': 0.7,
}
_REACT_RELATED = {'javascript': 0.9, 'frontend': 0.8, 'web development': 0.7}
_NODE_RELATED = {'javascript': 0.9, 'backend': 0.8, 'web development': 0.7}

# Maps a category name to {related category: relatedness score}. Key order is
# kept as originally written because dict iteration order is observable by
# the code that scans this table.
RELATED_CATEGORIES = {
    'fullstack': dict(_FULLSTACK_RELATED),
    'full stack': dict(_FULLSTACK_RELATED),
    'php': {'web development': 0.9, 'backend': 0.8, 'fullstack': 0.7, 'mysql': 0.7, 'laravel': 0.6},
    'web development': dict(_WEB_DEV_RELATED),
    'web dev': dict(_WEB_DEV_RELATED),
    'frontend': {'web development': 0.9, 'html': 0.8, 'css': 0.8, 'javascript': 0.9, 'react': 0.8},
    'backend': {'web development': 0.9, 'node.js': 0.8, 'python': 0.7, 'database': 0.8, 'api': 0.7, 'php': 0.8},
    'cybersecurity': dict(_CYBERSECURITY_RELATED),
    'cyber security': dict(_CYBERSECURITY_RELATED),
    'aiml': dict(_AIML_RELATED),
    'ai/ml': dict(_AIML_RELATED),
    'ai ml': dict(_AIML_RELATED),
    'artificial intelligence': dict(_AIML_RELATED),
    'machine learning': {'data science': 0.9, 'python': 0.8, 'ai': 0.7, 'deep learning': 0.8},
    'data science': {'python': 0.9, 'machine learning': 0.8, 'statistics': 0.7, 'sql': 0.6},
    'mobile development': {'javascript': 0.7, 'react native': 0.9, 'flutter': 0.8, 'ios': 0.7},
    'devops': {'linux': 0.8, 'docker': 0.9, 'aws': 0.7, 'ci/cd': 0.8},
    'blockchain': {'javascript': 0.7, 'web3': 0.9, 'solidity': 0.8, 'cryptocurrency': 0.7},
    'javascript': {'web development': 0.9, 'frontend': 0.8, 'node.js': 0.7, 'react': 0.8},
    'python': {'data science': 0.8, 'backend': 0.7, 'machine learning': 0.8, 'automation': 0.6},
    'react': dict(_REACT_RELATED),
    'reactjs': dict(_REACT_RELATED),
    'node.js': dict(_NODE_RELATED),
    'nodejs': dict(_NODE_RELATED),
    'html': {'web development': 0.8, 'frontend': 0.9, 'css': 0.8},
    'css': {'web development': 0.8, 'frontend': 0.9, 'html': 0.8},
    'sql': {'database': 0.9, 'backend': 0.7, 'data science': 0.6},
    'java': {'backend': 0.8, 'spring': 0.9, 'enterprise': 0.7},
}
|
|
|
|
|
|
# Alternate spellings mapped to the canonical category name. Built once at
# import time instead of on every call.
_CATEGORY_VARIATIONS = {
    'ai/ml': 'aiml',
    'ai ml': 'aiml',
    'artificial intelligence': 'aiml',
    'full stack': 'fullstack',
    'web dev': 'web development',
    'cyber security': 'cybersecurity',
    'nodejs': 'node.js',
    'reactjs': 'react',
}


def normalize_category_name(category: str) -> str:
    """Return the canonical lowercase form of a category name.

    The name is lower-cased, surrounding whitespace is removed, and runs of
    internal whitespace are collapsed to a single space, so "Full   Stack"
    and "full stack" resolve to the same alias. Known alternate spellings
    are then mapped to their canonical name.

    Args:
        category: Raw category name. Falsy values (None, "") are accepted.
            Other non-string values are converted with str().

    Returns:
        The normalized name, or "" when ``category`` is falsy or contains
        only whitespace.
    """
    if not category:
        return ""

    # split() with no argument drops leading/trailing whitespace and treats
    # any whitespace run (spaces, tabs, newlines) as a single separator.
    normalized = " ".join(str(category).lower().split())
    return _CATEGORY_VARIATIONS.get(normalized, normalized)
|
|
|
|
|
|
def _clear_old_cache():
    """Empty the embedding cache when more than CACHE_TTL seconds have elapsed
    since the last clear, and restart the timer."""
    global last_cache_clear

    now = time.time()
    if now - last_cache_clear <= CACHE_TTL:
        # Still within the TTL window: keep the cached embeddings.
        return

    embedding_cache.clear()
    last_cache_clear = now
    logger.info("Embedding cache cleared")
|
|
|
|
|
|
def get_course_embeddings_batch(courses: List[Dict]) -> Dict[str, np.ndarray]:
    """Generate embeddings for multiple courses with caching.

    Descriptions of courses whose id is not in ``embedding_cache`` are
    encoded in a single ``model.encode`` call and stored in the cache. A
    missing or empty description is embedded as 'No description available'.

    Args:
        courses: Course dicts. Each must have an 'id' key; 'description'
            is optional.

    Returns:
        Dict mapping each course id in ``courses`` to its embedding.

    Raises:
        RuntimeError: If the embedding model failed to load.
        KeyError: If a course has no 'id'.
    """
    if not MODEL_LOADED:
        raise RuntimeError("AI model not loaded")

    _clear_old_cache()

    # Collect uncached courses once per id, so duplicates in the input are
    # not encoded more than once. The first occurrence's description wins.
    pending = {}
    for course in courses:
        course_id = course['id']
        if course_id not in embedding_cache and course_id not in pending:
            pending[course_id] = course.get('description', '') or 'No description available'

    if pending:
        logger.info(f"Generating embeddings for {len(pending)} courses")
        embeddings = model.encode(list(pending.values()))
        for course_id, embedding in zip(pending, embeddings):
            embedding_cache[course_id] = embedding

    # Every requested id is in the cache at this point.
    return {course['id']: embedding_cache[course['id']] for course in courses}
|
|
|
|
|
|
def get_related_categories_with_scores(enrolled_categories: Set[str]) -> Dict[str, float]:
    """
    Get related categories with similarity scores based on enrolled categories

    For each enrolled category, its relations are looked up in
    RELATED_CATEGORIES under the normalized name. When there is no exact
    entry, every known category that contains, or is contained in, the
    normalized name contributes its relations at 70% of their score. When a
    related category is reached more than once, the highest score is kept.

    Categories the student is already enrolled in are excluded. The
    comparison uses normalized names, so "Frontend" excludes 'frontend'.
    Empty category names are ignored.

    Args:
        enrolled_categories: Raw category names of the enrolled courses.

    Returns:
        Dict mapping related category name to its best score.
    """
    normalized_enrolled = {normalize_category_name(cat) for cat in enrolled_categories}
    normalized_enrolled.discard("")

    related_scores: Dict[str, float] = {}

    def _merge(relations, weight):
        # Fold one relation table into related_scores, keeping the maximum.
        for related_cat, score in relations.items():
            if normalize_category_name(related_cat) in normalized_enrolled:
                continue
            weighted = score * weight
            if related_cat not in related_scores or weighted > related_scores[related_cat]:
                related_scores[related_cat] = weighted

    for category in enrolled_categories:
        normalized_category = normalize_category_name(category)
        if not normalized_category:
            # "" is a substring of every string, so it would fuzzy-match
            # every known category below.
            continue

        if normalized_category in RELATED_CATEGORIES:
            _merge(RELATED_CATEGORIES[normalized_category], 1.0)
            continue

        # No exact entry: fall back to substring matching at reduced weight.
        for known_category, relations in RELATED_CATEGORIES.items():
            if known_category in normalized_category or normalized_category in known_category:
                _merge(relations, 0.7)

    return related_scores
|
|
|
|
|
|
def calculate_category_relevance(course_category: str,
                                 enrolled_categories: Set[str],
                                 related_categories: Dict[str, float]) -> float:
    """Calculate how relevant a course category is to enrolled categories

    All names are compared in normalized form, and a match means one name
    contains the other.

    Args:
        course_category: Category of the candidate course.
        enrolled_categories: Categories of the enrolled courses.
        related_categories: Related category name -> score.

    Returns:
        1.0 when the course category matches an enrolled category.
        Otherwise the highest score among matching related categories, or
        0.0 when nothing matches or the course category is empty.
    """
    course_cat = normalize_category_name(course_category)
    if not course_cat:
        # An empty name is a substring of everything; treat as "no match"
        # instead of letting it score 1.0.
        return 0.0

    for category in enrolled_categories:
        enrolled_cat = normalize_category_name(category)
        if enrolled_cat and (enrolled_cat in course_cat or course_cat in enrolled_cat):
            return 1.0

    # Take the best matching relation, not whichever the dict yields first.
    best_score = 0.0
    for related_cat, score in related_categories.items():
        related = normalize_category_name(related_cat)
        if related and (related in course_cat or course_cat in related):
            best_score = max(best_score, score)

    return best_score
|
|
|
|
|
|
def recommend_courses(enrolled_courses, all_courses, top_n=5):
    """
    Recommend courses based on enrolled courses using multi-factor scoring

    Candidates are the courses in ``all_courses`` whose id is not among the
    enrolled ids. Each candidate gets a weighted sum (SCORING_WEIGHTS) of:
      * semantic similarity: mean cosine similarity between its description
        embedding and the embeddings of the enrolled courses;
      * popularity: 'enrollment_count' min-max scaled across candidates
        (0.5 when all candidates have the same count);
      * category relevance: see calculate_category_relevance().
    The ranked list is then passed through _apply_diversity_boost().

    With no enrolled courses, the most-enrolled courses are returned. If
    scoring raises, the error is logged and _fallback_recommendations() is
    used instead.

    Args:
        enrolled_courses: List of courses the student is enrolled in
        all_courses: List of all available courses
        top_n: Number of recommendations to return

    Returns:
        List of recommended course IDs (empty when ``top_n`` <= 0)

    Raises:
        RuntimeError: If the embedding model failed to load.
    """
    if not MODEL_LOADED:
        raise RuntimeError("AI model not loaded")

    if top_n <= 0:
        # A negative top_n would otherwise slice from the wrong end below.
        return []

    if not enrolled_courses:
        # Cold start: nothing to compare against, so rank by popularity only.
        by_popularity = sorted(all_courses,
                               key=lambda c: c.get('enrollment_count', 0),
                               reverse=True)
        return [course['id'] for course in by_popularity[:top_n]]

    try:
        enrolled_categories = {course['category'] for course in enrolled_courses}
        related_categories = get_related_categories_with_scores(enrolled_categories)
        enrolled_ids = {course['id'] for course in enrolled_courses}

        logger.info(f"Enrolled categories: {enrolled_categories}")
        logger.info(f"Related categories: {list(related_categories.keys())}")

        available_courses = [course for course in all_courses
                             if course['id'] not in enrolled_ids]
        if not available_courses:
            logger.warning("No available courses to recommend")
            return []

        # One batch call covers enrolled and candidate courses.
        embeddings = get_course_embeddings_batch(enrolled_courses + available_courses)
        enrolled_embeddings = [embeddings[course['id']] for course in enrolled_courses
                               if course['id'] in embeddings]

        # Popularity bounds are loop-invariant; compute them once.
        enrollment_counts = [course.get('enrollment_count', 0) for course in available_courses]
        max_enrollment = max(enrollment_counts)
        min_enrollment = min(enrollment_counts)
        enrollment_range = max_enrollment - min_enrollment

        scored_courses = []
        for course in available_courses:
            if course['id'] not in embeddings:
                continue
            course_embedding = embeddings[course['id']]

            semantic_score = 0.0
            if enrolled_embeddings:
                similarities = cosine_similarity([course_embedding], enrolled_embeddings)[0]
                semantic_score = float(np.mean(similarities))

            if enrollment_range > 0:
                popularity_score = (course.get('enrollment_count', 0) - min_enrollment) / enrollment_range
            else:
                # All candidates are equally popular: use a neutral value.
                popularity_score = 0.5

            category_relevance = calculate_category_relevance(
                course['category'], enrolled_categories, related_categories
            )

            combined_score = (
                semantic_score * SCORING_WEIGHTS['semantic_similarity'] +
                popularity_score * SCORING_WEIGHTS['popularity'] +
                category_relevance * SCORING_WEIGHTS['category_relevance']
            )

            scored_courses.append((course, combined_score, semantic_score, popularity_score, category_relevance))

        scored_courses.sort(key=lambda x: x[1], reverse=True)

        final_recommendations = _apply_diversity_boost(scored_courses, top_n)

        # Log the courses that are actually returned. Diversity filtering
        # may pick a different set than the plain top-N by score.
        details_by_course = {id(entry[0]): entry for entry in scored_courses}
        logger.info("=== Recommendation Details ===")
        for rank, course in enumerate(final_recommendations, start=1):
            details = details_by_course.get(id(course))
            if details is None:
                continue
            _, combined_score, semantic_score, popularity_score, category_relevance = details
            # .get() so a missing title cannot discard computed results.
            logger.info(f"{rank}. {course.get('title', course['id'])} (Category: {course['category']})")
            logger.info(f" Score: {combined_score:.3f} (Semantic: {semantic_score:.3f}, Popularity: {popularity_score:.3f}, Category: {category_relevance:.3f})")

        return [course['id'] for course in final_recommendations]

    except Exception as e:
        # Boundary handler: record the traceback, then degrade to the
        # simpler category/popularity strategy.
        logger.exception(f"Error generating recommendations: {e}")
        return _fallback_recommendations(enrolled_courses, all_courses, top_n)
|
|
|
|
|
|
def _apply_diversity_boost(scored_courses: List[Tuple], top_n: int) -> List[Dict]:
|
|
|
"""Ensure recommendations cover different categories"""
|
|
|
selected_courses = []
|
|
|
selected_categories = set()
|
|
|
|
|
|
for course, combined_score, semantic_score, popularity_score, category_relevance in scored_courses:
|
|
|
if len(selected_courses) >= top_n:
|
|
|
break
|
|
|
|
|
|
current_category = normalize_category_name(course['category'])
|
|
|
|
|
|
|
|
|
if current_category in selected_categories and category_relevance < 0.5:
|
|
|
continue
|
|
|
|
|
|
selected_courses.append(course)
|
|
|
selected_categories.add(current_category)
|
|
|
|
|
|
|
|
|
if len(selected_courses) < top_n:
|
|
|
remaining_slots = top_n - len(selected_courses)
|
|
|
for course, combined_score, semantic_score, popularity_score, category_relevance in scored_courses:
|
|
|
if course not in selected_courses:
|
|
|
selected_courses.append(course)
|
|
|
remaining_slots -= 1
|
|
|
if remaining_slots <= 0:
|
|
|
break
|
|
|
|
|
|
return selected_courses[:top_n]
|
|
|
|
|
|
def _fallback_recommendations(enrolled_courses: List[Dict],
                              all_courses: List[Dict], top_n: int) -> List[str]:
    """Fallback recommendation strategy when main algorithm fails

    Strategy, in order:
      1. Not-yet-enrolled courses whose normalized category equals an
         enrolled category. If there are at least ``top_n``, return the
         most-enrolled of them.
      2. Otherwise add courses whose normalized category contains a related
         category name, and return the most-enrolled of the combined list.
      3. If nothing matched, return the most-enrolled of all not-yet-enrolled
         courses.

    Categories are compared in normalized form. A course is listed at most
    once, and courses without a 'category' are simply never matched.

    Args:
        enrolled_courses: Courses the student is enrolled in.
        all_courses: All available courses.
        top_n: Maximum number of ids to return.

    Returns:
        Up to ``top_n`` course ids.
    """
    logger.info("Using fallback recommendation strategy")

    if top_n <= 0:
        return []

    def _popularity(course):
        return course.get('enrollment_count', 0)

    # .get(): the fallback often runs because a course record is malformed,
    # so it must not fail on a missing 'category' itself.
    enrolled_categories = {course.get('category') for course in enrolled_courses
                           if course.get('category')}
    enrolled_ids = {course['id'] for course in enrolled_courses}
    normalized_enrolled = {normalize_category_name(cat) for cat in enrolled_categories}
    normalized_enrolled.discard("")

    candidates = [course for course in all_courses if course['id'] not in enrolled_ids]

    category_matches = [
        course for course in candidates
        if normalize_category_name(course.get('category')) in normalized_enrolled
    ]

    if len(category_matches) >= top_n:
        category_matches.sort(key=_popularity, reverse=True)
        return [course['id'] for course in category_matches[:top_n]]

    related_categories_map = get_related_categories_with_scores(enrolled_categories)
    related_names = [name for name in
                     (normalize_category_name(cat) for cat in related_categories_map)
                     if name]

    # Skip courses already matched by category so no id appears twice.
    matched_ids = {course['id'] for course in category_matches}
    related_matches = []
    for course in candidates:
        if course['id'] in matched_ids:
            continue
        course_category = normalize_category_name(course.get('category'))
        if course_category and any(name in course_category for name in related_names):
            related_matches.append(course)

    all_matches = category_matches + related_matches
    if all_matches:
        all_matches.sort(key=_popularity, reverse=True)
        return [course['id'] for course in all_matches[:top_n]]

    # Nothing matched by category: fall back to overall popularity.
    candidates.sort(key=_popularity, reverse=True)
    return [course['id'] for course in candidates[:top_n]]
|
|
|
|
|
|
|
|
|
def get_course_embeddings(courses):
    """Legacy entry point kept for backward compatibility.

    Delegates unchanged to get_course_embeddings_batch().
    """
    embeddings_by_id = get_course_embeddings_batch(courses)
    return embeddings_by_id
|
|
|
|
|
|
def get_related_categories(enrolled_categories):
    """Legacy entry point kept for backward compatibility.

    Returns only the related category names, as a list, discarding the
    scores computed by get_related_categories_with_scores().
    """
    category_set = set(enrolled_categories)
    scores_by_category = get_related_categories_with_scores(category_set)
    return [name for name in scores_by_category]
|
|
|
|
|
|
def rank_within_category(category_courses, enrolled_courses, all_courses, top_n):
    """Legacy function for backward compatibility - simplified version

    Runs recommend_courses() over the entries of ``all_courses`` whose id
    appears in ``category_courses`` and returns the matching course dicts
    from ``category_courses`` in recommendation order.

    Args:
        category_courses: Candidate courses to rank.
        enrolled_courses: Courses the student is enrolled in.
        all_courses: All available courses.
        top_n: Maximum number of courses to return.

    Returns:
        List of course dicts; empty when there are no candidates or
        ``top_n`` <= 0.
    """
    if not category_courses or top_n <= 0:
        return []

    # Index by id once; membership by id avoids comparing whole dicts for
    # every pair of courses.
    course_map = {course['id']: course for course in category_courses}
    all_courses_filtered = [course for course in all_courses if course['id'] in course_map]

    recommendations = recommend_courses(enrolled_courses, all_courses_filtered, top_n)
    return [course_map[course_id] for course_id in recommendations if course_id in course_map]
|
|
|
|
|
|
def rank_other_courses(other_courses, enrolled_courses, all_courses, top_n):
    """Legacy function for backward compatibility - simplified version

    Runs recommend_courses() over the entries of ``all_courses`` whose id
    appears in ``other_courses`` and returns the matching course dicts from
    ``other_courses`` in recommendation order.

    Args:
        other_courses: Candidate courses to rank.
        enrolled_courses: Courses the student is enrolled in.
        all_courses: All available courses.
        top_n: Maximum number of courses to return.

    Returns:
        List of course dicts; empty when there are no candidates or
        ``top_n`` <= 0.
    """
    if not other_courses or top_n <= 0:
        return []

    # Index by id once; membership by id avoids comparing whole dicts for
    # every pair of courses.
    course_map = {course['id']: course for course in other_courses}
    all_courses_filtered = [course for course in all_courses if course['id'] in course_map]

    recommendations = recommend_courses(enrolled_courses, all_courses_filtered, top_n)
    return [course_map[course_id] for course_id in recommendations if course_id in course_map]