Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -10,27 +10,30 @@ import torch
|
|
| 10 |
import gradio as gr
|
| 11 |
import time
|
| 12 |
|
|
|
|
|
|
|
|
|
|
| 13 |
# --- Configuration ---
|
| 14 |
load_dotenv()
|
| 15 |
TMDB_API_KEY = os.environ.get("TMDB_API_KEY", "442a13f1865d8936f95aa20737e6f6f5")
|
| 16 |
-
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 17 |
|
| 18 |
-
|
|
|
|
| 19 |
|
| 20 |
BASE_TMDB_URL = "https://api.themoviedb.org/3"
|
| 21 |
POSTER_BASE_URL = "https://image.tmdb.org/t/p/w500"
|
| 22 |
-
NUM_RECOMMENDATIONS_TO_GENERATE = 20
|
| 23 |
NUM_RECOMMENDATIONS_TO_DISPLAY = 5
|
| 24 |
MIN_RATING_FOR_SEED = 3.5
|
| 25 |
MIN_VOTE_COUNT_TMDB = 100
|
| 26 |
|
| 27 |
-
# --- Global Variables ---
|
| 28 |
df_profile_global = None
|
| 29 |
df_watchlist_global = None
|
| 30 |
df_reviews_global = None
|
| 31 |
df_diary_global = None
|
| 32 |
df_ratings_global = None
|
| 33 |
-
df_watched_global = None
|
| 34 |
|
| 35 |
uri_to_movie_map_global = {}
|
| 36 |
all_watched_titles_global = set()
|
|
@@ -43,8 +46,7 @@ llm_tokenizer = None
|
|
| 43 |
|
| 44 |
# --- Helper Functions ---
|
| 45 |
def clean_html(raw_html):
|
| 46 |
-
if pd.isna(raw_html) or raw_html is None:
|
| 47 |
-
return ""
|
| 48 |
text = str(raw_html)
|
| 49 |
text = re.sub(r'<br\s*/?>', '\n', text)
|
| 50 |
soup = BeautifulSoup(text, "html.parser")
|
|
@@ -54,7 +56,6 @@ def get_movie_uri_map(dfs_dict):
|
|
| 54 |
uri_map = {}
|
| 55 |
df_priority = ['reviews.csv', 'diary.csv', 'ratings.csv', 'watched.csv', 'watchlist.csv']
|
| 56 |
processed_uris = set()
|
| 57 |
-
|
| 58 |
for df_name in df_priority:
|
| 59 |
df = dfs_dict.get(df_name)
|
| 60 |
if df is not None and 'Letterboxd URI' in df.columns and 'Name' in df.columns and 'Year' in df.columns:
|
|
@@ -66,8 +67,7 @@ def get_movie_uri_map(dfs_dict):
|
|
| 66 |
year = int(row['Year'])
|
| 67 |
uri_map[uri] = (str(row['Name']), year)
|
| 68 |
processed_uris.add(uri)
|
| 69 |
-
except ValueError:
|
| 70 |
-
pass
|
| 71 |
return uri_map
|
| 72 |
|
| 73 |
def load_all_data():
|
|
@@ -77,14 +77,13 @@ def load_all_data():
|
|
| 77 |
|
| 78 |
try:
|
| 79 |
df_profile_global = pd.read_csv("profile.csv")
|
| 80 |
-
df_comments_global = pd.read_csv("comments.csv") # Loaded but not explicitly used in this version for recs
|
| 81 |
df_watchlist_global = pd.read_csv("watchlist.csv")
|
| 82 |
df_reviews_global = pd.read_csv("reviews.csv")
|
| 83 |
df_diary_global = pd.read_csv("diary.csv")
|
| 84 |
df_ratings_global = pd.read_csv("ratings.csv")
|
| 85 |
_df_watched_log = pd.read_csv("watched.csv")
|
| 86 |
except FileNotFoundError as e:
|
| 87 |
-
print(f"ERROR: CSV file not found: {e}.
|
| 88 |
return False
|
| 89 |
|
| 90 |
dfs_for_uri_map = {
|
|
@@ -115,9 +114,13 @@ def load_all_data():
|
|
| 115 |
consolidated.drop(columns=['Rating_simple'], inplace=True)
|
| 116 |
|
| 117 |
watched_log_subset = _df_watched_log[['Letterboxd URI', 'Name', 'Year']].copy()
|
| 118 |
-
watched_log_subset['from_watched_log'] = True
|
| 119 |
consolidated = pd.merge(consolidated, watched_log_subset, on=['Letterboxd URI', 'Name', 'Year'], how='outer')
|
| 120 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
|
| 122 |
consolidated['Review Text'] = consolidated['Review Text'].fillna('').apply(clean_html)
|
| 123 |
consolidated['Year'] = pd.to_numeric(consolidated['Year'], errors='coerce').astype('Int64')
|
|
@@ -128,8 +131,7 @@ def load_all_data():
|
|
| 128 |
all_watched_titles_global = set(zip(df_watched_global['Name'].astype(str), df_watched_global['Year'].astype(int)))
|
| 129 |
for _, row in _df_watched_log.iterrows():
|
| 130 |
if pd.notna(row['Name']) and pd.notna(row['Year']):
|
| 131 |
-
try:
|
| 132 |
-
all_watched_titles_global.add((str(row['Name']), int(row['Year'])))
|
| 133 |
except ValueError: pass
|
| 134 |
|
| 135 |
if df_watchlist_global is not None:
|
|
@@ -162,8 +164,12 @@ def load_all_data():
|
|
| 162 |
'review_text': row['Review Text'], 'uri': row['Letterboxd URI']
|
| 163 |
})
|
| 164 |
temp_df = pd.DataFrame(seed_movies_global)
|
| 165 |
-
temp_df.
|
| 166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
random.shuffle(seed_movies_global)
|
| 168 |
return True
|
| 169 |
|
|
@@ -171,23 +177,38 @@ def initialize_llm():
|
|
| 171 |
global llm_pipeline, llm_tokenizer
|
| 172 |
if llm_pipeline is None:
|
| 173 |
print(f"Initializing LLM: {MODEL_NAME}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
try:
|
| 175 |
-
llm_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
|
| 176 |
model = AutoModelForCausalLM.from_pretrained(
|
| 177 |
-
MODEL_NAME,
|
| 178 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
llm_pipeline = pipeline(
|
| 181 |
-
"text-generation", model=model, tokenizer=llm_tokenizer
|
| 182 |
-
torch_dtype=torch.float16, device_map="auto"
|
| 183 |
)
|
| 184 |
-
print("LLM Initialized Successfully.")
|
| 185 |
except Exception as e:
|
| 186 |
-
print(f"Error initializing LLM: {e}")
|
| 187 |
llm_pipeline = None
|
| 188 |
|
|
|
|
|
|
|
| 189 |
def search_tmdb_movie_details(title, year):
|
| 190 |
-
if not TMDB_API_KEY or TMDB_API_KEY == "YOUR_TMDB_API_KEY_FALLBACK":
|
| 191 |
print("TMDB API Key not properly configured.")
|
| 192 |
return None
|
| 193 |
try:
|
|
@@ -211,15 +232,13 @@ def search_tmdb_movie_details(title, year):
|
|
| 211 |
'vote_average': movie.get('vote_average'), 'vote_count': movie.get('vote_count'),
|
| 212 |
'popularity': movie.get('popularity')
|
| 213 |
}
|
| 214 |
-
time.sleep(0.25)
|
| 215 |
-
except requests.RequestException as e:
|
| 216 |
-
|
| 217 |
-
except Exception as ex:
|
| 218 |
-
print(f"Unexpected error in search_tmdb_movie_details for {title} ({year}): {ex}")
|
| 219 |
return None
|
| 220 |
|
| 221 |
def get_tmdb_recommendations(movie_id, page=1):
|
| 222 |
-
if not TMDB_API_KEY or TMDB_API_KEY == "YOUR_TMDB_API_KEY_FALLBACK":
|
| 223 |
print("TMDB API Key not properly configured.")
|
| 224 |
return []
|
| 225 |
recommendations = []
|
|
@@ -239,13 +258,12 @@ def get_tmdb_recommendations(movie_id, page=1):
|
|
| 239 |
'vote_average': movie.get('vote_average'), 'vote_count': movie.get('vote_count'),
|
| 240 |
'popularity': movie.get('popularity')
|
| 241 |
})
|
| 242 |
-
time.sleep(0.25)
|
| 243 |
-
except requests.RequestException as e:
|
| 244 |
-
|
| 245 |
-
except Exception as ex:
|
| 246 |
-
print(f"Unexpected error in get_tmdb_recommendations for movie ID {movie_id}: {ex}")
|
| 247 |
return recommendations
|
| 248 |
|
|
|
|
| 249 |
def generate_saudi_explanation(recommended_movie_title, seed_movie_title, seed_movie_context=""):
|
| 250 |
global llm_pipeline, llm_tokenizer
|
| 251 |
if llm_pipeline is None or llm_tokenizer is None:
|
|
@@ -254,55 +272,59 @@ def generate_saudi_explanation(recommended_movie_title, seed_movie_title, seed_m
|
|
| 254 |
max_context_len = 150
|
| 255 |
seed_movie_context_short = (seed_movie_context[:max_context_len] + "...") if len(seed_movie_context) > max_context_len else seed_movie_context
|
| 256 |
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
|
|
|
|
|
|
|
|
|
| 269 |
|
| 270 |
try:
|
| 271 |
sequences = llm_pipeline(
|
| 272 |
-
prompt_template, do_sample=True, top_k=
|
| 273 |
-
eos_token_id=llm_tokenizer.eos_token_id,
|
|
|
|
|
|
|
| 274 |
)
|
| 275 |
explanation = sequences[0]['generated_text'].split("[/INST]")[-1].strip()
|
| 276 |
-
explanation = re.sub(r"^اشرح باللهجة السعودية:\s*", "", explanation, flags=re.IGNORECASE)
|
| 277 |
explanation = explanation.replace("<s>", "").replace("</s>", "").strip()
|
| 278 |
-
|
|
|
|
|
|
|
|
|
|
| 279 |
return f"شكلك بتنبسط على فيلم '{recommended_movie_title}' لأنه يشبه جو فيلم '{seed_movie_title}' اللي حبيته! عطيه تجربة."
|
| 280 |
return explanation
|
| 281 |
except Exception as e:
|
| 282 |
-
print(f"Error during LLM generation: {e}")
|
| 283 |
return f"يا كابتن، شكلك بتحب '{recommended_movie_title}'، خاصة إنك استمتعت بـ'{seed_movie_title}'. جربه وعطنا رأيك!"
|
| 284 |
|
|
|
|
| 285 |
def get_recommendations(progress=gr.Progress()):
|
| 286 |
-
if not TMDB_API_KEY or TMDB_API_KEY == "442a13f1865d8936f95aa20737e6f6f5" and not os.environ.get("TMDB_API_KEY"):
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
if not TMDB_API_KEY: # Final check if it's truly None
|
| 292 |
-
return "<p style='color:red; text-align:right;'>خطأ: مفتاح TMDB API مو موجود. الرجاء إضافته كـ Secret في Hugging Face Space.</p>"
|
| 293 |
-
|
| 294 |
-
|
| 295 |
if not all([df_profile_global is not None, df_watched_global is not None, seed_movies_global]):
|
| 296 |
-
return "<p style='color:red; text-align:right;'>خطأ: فشل في تحميل بيانات
|
| 297 |
|
|
|
|
| 298 |
if llm_pipeline is None:
|
| 299 |
-
initialize_llm()
|
| 300 |
-
if llm_pipeline is None:
|
| 301 |
-
return "<p style='color:red; text-align:right;'>خطأ: فشل في تهيئة نموذج الذكاء الاصطناعي.
|
| 302 |
|
| 303 |
-
progress(0.1, desc="نجمع أفلامك
|
| 304 |
potential_recs = {}
|
| 305 |
-
seeds_to_process = seed_movies_global[:
|
| 306 |
|
| 307 |
for i, seed_movie in enumerate(seeds_to_process):
|
| 308 |
progress(0.1 + (i / len(seeds_to_process)) * 0.4, desc=f"نبحث عن توصيات بناءً على: {seed_movie['name']}")
|
|
@@ -310,7 +332,7 @@ def get_recommendations(progress=gr.Progress()):
|
|
| 310 |
if seed_tmdb_details and seed_tmdb_details.get('id'):
|
| 311 |
tmdb_recs = get_tmdb_recommendations(seed_tmdb_details['id'])
|
| 312 |
for rec in tmdb_recs:
|
| 313 |
-
try:
|
| 314 |
rec_tuple = (str(rec['title']), int(rec['year']))
|
| 315 |
if rec.get('id') and rec_tuple not in all_watched_titles_global and rec_tuple not in watchlist_titles_global:
|
| 316 |
if rec['id'] not in potential_recs:
|
|
@@ -318,13 +340,9 @@ def get_recommendations(progress=gr.Progress()):
|
|
| 318 |
'movie_info': rec, 'seed_movie_title': seed_movie['name'],
|
| 319 |
'seed_movie_context': seed_movie.get('review_text', '') or seed_movie.get('comment_text', '')
|
| 320 |
}
|
| 321 |
-
except ValueError:
|
| 322 |
-
# print(f"Warning: Could not parse year for recommended movie {rec.get('title')}. Skipping.")
|
| 323 |
-
continue # Skip if year is not a valid integer
|
| 324 |
-
|
| 325 |
-
|
| 326 |
if not potential_recs:
|
| 327 |
-
return "<p style='text-align:right;'>ما لقينا توصيات جديدة لك حالياً.
|
| 328 |
|
| 329 |
sorted_recs_list = sorted(potential_recs.values(), key=lambda x: x['movie_info'].get('popularity', 0), reverse=True)
|
| 330 |
final_recommendations_data = []
|
|
@@ -334,13 +352,11 @@ def get_recommendations(progress=gr.Progress()):
|
|
| 334 |
if rec_data['movie_info']['id'] not in displayed_ids:
|
| 335 |
final_recommendations_data.append(rec_data)
|
| 336 |
displayed_ids.add(rec_data['movie_info']['id'])
|
| 337 |
-
|
| 338 |
if not final_recommendations_data:
|
| 339 |
-
return "<p style='text-align:right;'>ما لقينا توصيات جديدة لك حالياً بعد الفلترة.
|
| 340 |
|
| 341 |
output_html = "<div>"
|
| 342 |
progress(0.6, desc="نجهز لك الشرح باللغة العامية...")
|
| 343 |
-
|
| 344 |
for i, rec_data in enumerate(final_recommendations_data):
|
| 345 |
progress(0.6 + (i / len(final_recommendations_data)) * 0.4, desc=f"نكتب شرح لفيلم: {rec_data['movie_info']['title']}")
|
| 346 |
explanation = generate_saudi_explanation(
|
|
@@ -349,7 +365,6 @@ def get_recommendations(progress=gr.Progress()):
|
|
| 349 |
poster_url = rec_data['movie_info']['poster_path']
|
| 350 |
if not poster_url or "placeholder.com" in poster_url:
|
| 351 |
poster_url = f"https://via.placeholder.com/300x450.png?text={rec_data['movie_info']['title'].replace(' ', '+')}"
|
| 352 |
-
|
| 353 |
output_html += f"""
|
| 354 |
<div style="display: flex; flex-direction: row-reverse; align-items: flex-start; margin-bottom: 25px; border-bottom: 1px solid #ddd; padding-bottom:15px; background-color: #f9f9f9; border-radius: 8px; padding: 15px;">
|
| 355 |
<img src="{poster_url}" alt="{rec_data['movie_info']['title']}" style="width: 150px; max-width:30%; height: auto; margin-left: 20px; border-radius: 5px; box-shadow: 2px 2px 5px rgba(0,0,0,0.1);">
|
|
@@ -369,15 +384,13 @@ body { font-family: 'Tajawal', sans-serif; }
|
|
| 369 |
footer { display: none !important; }
|
| 370 |
.gr-button { background-color: #c70039 !important; color: white !important; font-size: 1.2em !important; padding: 10px 20px !important; border-radius: 8px !important; }
|
| 371 |
.gr-button:hover { background-color: #a3002f !important; }
|
| 372 |
-
.gr-input { text-align: right !important; }
|
| 373 |
-
.gr-output { text-align: right !important; }
|
| 374 |
h1, h3 { color: #900c3f !important; }
|
| 375 |
-
"""
|
| 376 |
|
| 377 |
data_loaded_successfully = load_all_data()
|
| 378 |
if data_loaded_successfully:
|
| 379 |
print("All user data loaded and preprocessed successfully.")
|
| 380 |
-
|
| 381 |
else:
|
| 382 |
print("Failed to load user data. The app might not function correctly.")
|
| 383 |
|
|
@@ -392,10 +405,13 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="red", secondary_hue="pink"), cs
|
|
| 392 |
recommend_button = gr.Button("عطني توصيات أفلام!")
|
| 393 |
with gr.Column():
|
| 394 |
output_recommendations = gr.HTML(label="توصياتك النارية 🔥")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 395 |
|
| 396 |
-
recommend_button.click(
|
| 397 |
-
fn=get_recommendations, inputs=[], outputs=[output_recommendations]
|
| 398 |
-
)
|
| 399 |
gr.Markdown(
|
| 400 |
"""
|
| 401 |
<div style="text-align: center; margin-top: 30px; font-size: 0.9em; color: #777;">
|
|
@@ -404,7 +420,6 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="red", secondary_hue="pink"), cs
|
|
| 404 |
)
|
| 405 |
|
| 406 |
if __name__ == "__main__":
|
| 407 |
-
if not TMDB_API_KEY or TMDB_API_KEY == "442a13f1865d8936f95aa20737e6f6f5" and not os.environ.get("TMDB_API_KEY"):
|
| 408 |
print("\nWARNING: TMDB_API_KEY is using the hardcoded fallback or is missing.")
|
| 409 |
-
|
| 410 |
-
iface.launch(debug=True)
|
|
|
|
| 10 |
import gradio as gr
|
| 11 |
import time
|
| 12 |
|
| 13 |
+
# Opt-in to future pandas behavior to potentially silence the downcasting warning
|
| 14 |
+
# pd.set_option('future.no_silent_downcasting', True) # You can uncomment this if you wish
|
| 15 |
+
|
| 16 |
# --- Configuration ---
|
| 17 |
load_dotenv()
|
| 18 |
TMDB_API_KEY = os.environ.get("TMDB_API_KEY", "442a13f1865d8936f95aa20737e6f6f5")
|
| 19 |
+
HF_TOKEN = os.environ.get("HF_TOKEN") # CRUCIAL for gated models
|
| 20 |
|
| 21 |
+
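# Hugging Face model ID of the LLM that writes the recommendation explanations (gated model; loading it requires HF_TOKEN)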
|
| 22 |
+
MODEL_NAME = "ALLaM-AI/ALLaM-7B-Instruct-preview"
|
| 23 |
|
| 24 |
BASE_TMDB_URL = "https://api.themoviedb.org/3"
|
| 25 |
POSTER_BASE_URL = "https://image.tmdb.org/t/p/w500"
|
|
|
|
| 26 |
NUM_RECOMMENDATIONS_TO_DISPLAY = 5
|
| 27 |
MIN_RATING_FOR_SEED = 3.5
|
| 28 |
MIN_VOTE_COUNT_TMDB = 100
|
| 29 |
|
| 30 |
+
# --- Global Variables ---
|
| 31 |
df_profile_global = None
|
| 32 |
df_watchlist_global = None
|
| 33 |
df_reviews_global = None
|
| 34 |
df_diary_global = None
|
| 35 |
df_ratings_global = None
|
| 36 |
+
df_watched_global = None
|
| 37 |
|
| 38 |
uri_to_movie_map_global = {}
|
| 39 |
all_watched_titles_global = set()
|
|
|
|
| 46 |
|
| 47 |
# --- Helper Functions ---
|
| 48 |
def clean_html(raw_html):
|
| 49 |
+
if pd.isna(raw_html) or raw_html is None: return ""
|
|
|
|
| 50 |
text = str(raw_html)
|
| 51 |
text = re.sub(r'<br\s*/?>', '\n', text)
|
| 52 |
soup = BeautifulSoup(text, "html.parser")
|
|
|
|
| 56 |
uri_map = {}
|
| 57 |
df_priority = ['reviews.csv', 'diary.csv', 'ratings.csv', 'watched.csv', 'watchlist.csv']
|
| 58 |
processed_uris = set()
|
|
|
|
| 59 |
for df_name in df_priority:
|
| 60 |
df = dfs_dict.get(df_name)
|
| 61 |
if df is not None and 'Letterboxd URI' in df.columns and 'Name' in df.columns and 'Year' in df.columns:
|
|
|
|
| 67 |
year = int(row['Year'])
|
| 68 |
uri_map[uri] = (str(row['Name']), year)
|
| 69 |
processed_uris.add(uri)
|
| 70 |
+
except ValueError: pass
|
|
|
|
| 71 |
return uri_map
|
| 72 |
|
| 73 |
def load_all_data():
|
|
|
|
| 77 |
|
| 78 |
try:
|
| 79 |
df_profile_global = pd.read_csv("profile.csv")
|
|
|
|
| 80 |
df_watchlist_global = pd.read_csv("watchlist.csv")
|
| 81 |
df_reviews_global = pd.read_csv("reviews.csv")
|
| 82 |
df_diary_global = pd.read_csv("diary.csv")
|
| 83 |
df_ratings_global = pd.read_csv("ratings.csv")
|
| 84 |
_df_watched_log = pd.read_csv("watched.csv")
|
| 85 |
except FileNotFoundError as e:
|
| 86 |
+
print(f"ERROR: CSV file not found: {e}.")
|
| 87 |
return False
|
| 88 |
|
| 89 |
dfs_for_uri_map = {
|
|
|
|
| 114 |
consolidated.drop(columns=['Rating_simple'], inplace=True)
|
| 115 |
|
| 116 |
watched_log_subset = _df_watched_log[['Letterboxd URI', 'Name', 'Year']].copy()
|
| 117 |
+
watched_log_subset['from_watched_log'] = True # This column is an object/boolean dtype
|
| 118 |
consolidated = pd.merge(consolidated, watched_log_subset, on=['Letterboxd URI', 'Name', 'Year'], how='outer')
|
| 119 |
+
|
| 120 |
+
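# The outer merge leaves NaN in 'from_watched_log' for rows that are not in watched.csv; handle the pandas downcasting FutureWarning explicitly here (alternative: the pd.set_option opt-in noted near the imports)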
|
| 121 |
+
# fillna(False) followed by an explicit astype(bool) guarantees 'from_watched_log' ends up as a plain boolean column
|
| 122 |
+
consolidated['from_watched_log'] = consolidated['from_watched_log'].fillna(False).astype(bool)
|
| 123 |
+
|
| 124 |
|
| 125 |
consolidated['Review Text'] = consolidated['Review Text'].fillna('').apply(clean_html)
|
| 126 |
consolidated['Year'] = pd.to_numeric(consolidated['Year'], errors='coerce').astype('Int64')
|
|
|
|
| 131 |
all_watched_titles_global = set(zip(df_watched_global['Name'].astype(str), df_watched_global['Year'].astype(int)))
|
| 132 |
for _, row in _df_watched_log.iterrows():
|
| 133 |
if pd.notna(row['Name']) and pd.notna(row['Year']):
|
| 134 |
+
try: all_watched_titles_global.add((str(row['Name']), int(row['Year'])))
|
|
|
|
| 135 |
except ValueError: pass
|
| 136 |
|
| 137 |
if df_watchlist_global is not None:
|
|
|
|
| 164 |
'review_text': row['Review Text'], 'uri': row['Letterboxd URI']
|
| 165 |
})
|
| 166 |
temp_df = pd.DataFrame(seed_movies_global)
|
| 167 |
+
if not temp_df.empty: # Check if DataFrame is not empty before dropping duplicates
|
| 168 |
+
temp_df.drop_duplicates(subset=['name', 'year'], keep='first', inplace=True)
|
| 169 |
+
seed_movies_global = temp_df.to_dict('records')
|
| 170 |
+
else:
|
| 171 |
+
seed_movies_global = [] # Ensure it's an empty list if temp_df was empty
|
| 172 |
+
|
| 173 |
random.shuffle(seed_movies_global)
|
| 174 |
return True
|
| 175 |
|
|
|
|
| 177 |
global llm_pipeline, llm_tokenizer
|
| 178 |
if llm_pipeline is None:
|
| 179 |
print(f"Initializing LLM: {MODEL_NAME}")
|
| 180 |
+
if not HF_TOKEN:
|
| 181 |
+
print("WARNING: HF_TOKEN not found. Access to gated models like ALLaM will fail.")
|
| 182 |
+
# Optionally, you could prevent the attempt to load if no token,
|
| 183 |
+
# or let it try and fail, as it currently does.
|
| 184 |
+
# return # uncomment to stop here if no token
|
| 185 |
+
|
| 186 |
try:
|
| 187 |
+
llm_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True, token=HF_TOKEN)
|
| 188 |
model = AutoModelForCausalLM.from_pretrained(
|
| 189 |
+
MODEL_NAME,
|
| 190 |
+
torch_dtype=torch.float16,
|
| 191 |
+
device_map="auto",
|
| 192 |
+
load_in_8bit=True,
|
| 193 |
+
trust_remote_code=True,
|
| 194 |
+
token=HF_TOKEN
|
| 195 |
)
|
| 196 |
+
if llm_tokenizer.pad_token is None:
|
| 197 |
+
llm_tokenizer.pad_token = llm_tokenizer.eos_token
|
| 198 |
+
model.config.pad_token_id = model.config.eos_token_id
|
| 199 |
+
|
| 200 |
llm_pipeline = pipeline(
|
| 201 |
+
"text-generation", model=model, tokenizer=llm_tokenizer
|
|
|
|
| 202 |
)
|
| 203 |
+
print(f"LLM ({MODEL_NAME}) Initialized Successfully.")
|
| 204 |
except Exception as e:
|
| 205 |
+
print(f"Error initializing LLM ({MODEL_NAME}): {e}")
|
| 206 |
llm_pipeline = None
|
| 207 |
|
| 208 |
+
|
| 209 |
+
# --- TMDB API Functions ---
|
| 210 |
def search_tmdb_movie_details(title, year):
|
| 211 |
+
if not TMDB_API_KEY or TMDB_API_KEY == "YOUR_TMDB_API_KEY_FALLBACK":
|
| 212 |
print("TMDB API Key not properly configured.")
|
| 213 |
return None
|
| 214 |
try:
|
|
|
|
| 232 |
'vote_average': movie.get('vote_average'), 'vote_count': movie.get('vote_count'),
|
| 233 |
'popularity': movie.get('popularity')
|
| 234 |
}
|
| 235 |
+
time.sleep(0.25)
|
| 236 |
+
except requests.RequestException as e: print(f"Error searching TMDB for {title} ({year}): {e}")
|
| 237 |
+
except Exception as ex: print(f"Unexpected error in search_tmdb_movie_details for {title} ({year}): {ex}")
|
|
|
|
|
|
|
| 238 |
return None
|
| 239 |
|
| 240 |
def get_tmdb_recommendations(movie_id, page=1):
|
| 241 |
+
if not TMDB_API_KEY or TMDB_API_KEY == "YOUR_TMDB_API_KEY_FALLBACK":
|
| 242 |
print("TMDB API Key not properly configured.")
|
| 243 |
return []
|
| 244 |
recommendations = []
|
|
|
|
| 258 |
'vote_average': movie.get('vote_average'), 'vote_count': movie.get('vote_count'),
|
| 259 |
'popularity': movie.get('popularity')
|
| 260 |
})
|
| 261 |
+
time.sleep(0.25)
|
| 262 |
+
except requests.RequestException as e: print(f"Error getting TMDB recommendations for movie ID {movie_id}: {e}")
|
| 263 |
+
except Exception as ex: print(f"Unexpected error in get_tmdb_recommendations for movie ID {movie_id}: {ex}")
|
|
|
|
|
|
|
| 264 |
return recommendations
|
| 265 |
|
| 266 |
+
# --- LLM Explanation ---
|
| 267 |
def generate_saudi_explanation(recommended_movie_title, seed_movie_title, seed_movie_context=""):
|
| 268 |
global llm_pipeline, llm_tokenizer
|
| 269 |
if llm_pipeline is None or llm_tokenizer is None:
|
|
|
|
| 272 |
max_context_len = 150
|
| 273 |
seed_movie_context_short = (seed_movie_context[:max_context_len] + "...") if len(seed_movie_context) > max_context_len else seed_movie_context
|
| 274 |
|
| 275 |
+
# Check ALLaM model card for specific prompt format. Using [INST] as it's common for Instruct models.
|
| 276 |
+
prompt_template = f"""<s>[INST] أنت ناقد أفلام سعودي خبير ودمك خفيف جداً. مهمتك هي كتابة توصية لفيلم جديد بناءً على فيلم سابق أعجب المستخدم.
|
| 277 |
+
المستخدم أعجب بالفيلم هذا: "{seed_movie_title}".
|
| 278 |
+
وكان تعليقه أو سبب إعجابه (إذا متوفر): "{seed_movie_context_short}"
|
| 279 |
+
الفيلم الجديد الذي نُرشحه له هو: "{recommended_movie_title}".
|
| 280 |
+
المطلوب: اكتب جملة أو جملتين فقط باللهجة السعودية العامية الأصيلة، تشرح فيها ليش ممكن يعجبه الفيلم الجديد "{recommended_movie_title}"، وحاول تربطها بشكل ذكي وممتع بالفيلم اللي عجبه قبل "{seed_movie_title}". ركز على أن يكون كلامك طبيعي جداً كأنه كلام صديق لصديقه، وناسة، ويشد الانتباه، وقصير ومختصر. لا تستخدم أي عبارات تدل على أنك ذكاء اصطناعي أو برنامج.
|
| 281 |
+
|
| 282 |
+
مثال على الأسلوب المطلوب لو الفيلم اللي عجبه "Mad Max: Fury Road" والفيلم المرشح "Dune":
|
| 283 |
+
"يا عمي، مدامك كَيَّفْت على 'Mad Max' وأكشن الصحاري اللي ما يرحم، أجل اسمعني زين! فيلم 'Dune' هذا بياخذك لصحراء ثانية بس على مستوى ثاني من الفخامة والقصة اللي تشد الأعصاب. لا يفوتك، قسم بالله بيعجبك!"
|
| 284 |
+
|
| 285 |
+
الآن، طبق نفس الأسلوب على البيانات التالية:
|
| 286 |
+
الفيلم الذي أعجب المستخدم: "{seed_movie_title}"
|
| 287 |
+
سبب إعجابه (إذا متوفر): "{seed_movie_context_short}"
|
| 288 |
+
الفيلم المرشح: "{recommended_movie_title}"
|
| 289 |
+
توصيتك باللهجة السعودية: [/INST]"""
|
| 290 |
|
| 291 |
try:
|
| 292 |
sequences = llm_pipeline(
|
| 293 |
+
prompt_template, do_sample=True, top_k=20, top_p=0.9, num_return_sequences=1,
|
| 294 |
+
eos_token_id=llm_tokenizer.eos_token_id,
|
| 295 |
+
pad_token_id=llm_tokenizer.pad_token_id if llm_tokenizer.pad_token_id is not None else llm_tokenizer.eos_token_id,
|
| 296 |
+
max_new_tokens=150
|
| 297 |
)
|
| 298 |
explanation = sequences[0]['generated_text'].split("[/INST]")[-1].strip()
|
|
|
|
| 299 |
explanation = explanation.replace("<s>", "").replace("</s>", "").strip()
|
| 300 |
+
explanation = re.sub(r"بصفتي نموذج لغوي.*?\s*,?\s*", "", explanation, flags=re.IGNORECASE)
|
| 301 |
+
explanation = re.sub(r"كنموذج لغوي.*?\s*,?\s*", "", explanation, flags=re.IGNORECASE)
|
| 302 |
+
|
| 303 |
+
if not explanation or explanation.lower().startswith("أنت ناقد أفلام") or len(explanation) < 20 :
|
| 304 |
return f"شكلك بتنبسط على فيلم '{recommended_movie_title}' لأنه يشبه جو فيلم '{seed_movie_title}' اللي حبيته! عطيه تجربة."
|
| 305 |
return explanation
|
| 306 |
except Exception as e:
|
| 307 |
+
print(f"Error during LLM generation with {MODEL_NAME}: {e}")
|
| 308 |
return f"يا كابتن، شكلك بتحب '{recommended_movie_title}'، خاصة إنك استمتعت بـ'{seed_movie_title}'. جربه وعطنا رأيك!"
|
| 309 |
|
| 310 |
+
# --- Recommendation Logic ---
|
| 311 |
def get_recommendations(progress=gr.Progress()):
|
| 312 |
+
if not TMDB_API_KEY or (TMDB_API_KEY == "442a13f1865d8936f95aa20737e6f6f5" and not os.environ.get("TMDB_API_KEY")):
|
| 313 |
+
print("Warning: Using fallback TMDB API Key.")
|
| 314 |
+
if not TMDB_API_KEY:
|
| 315 |
+
return "<p style='color:red; text-align:right;'>خطأ: مفتاح TMDB API مو موجود.</p>"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 316 |
if not all([df_profile_global is not None, df_watched_global is not None, seed_movies_global]):
|
| 317 |
+
return "<p style='color:red; text-align:right;'>خطأ: فشل في تحميل بيانات المستخدم.</p>"
|
| 318 |
|
| 319 |
+
# Ensure LLM is initialized before trying to use it
|
| 320 |
if llm_pipeline is None:
|
| 321 |
+
initialize_llm() # Attempt to initialize if not already done
|
| 322 |
+
if llm_pipeline is None: # Check again if initialization failed
|
| 323 |
+
return "<p style='color:red; text-align:right;'>خطأ: فشل في تهيئة نموذج الذكاء الاصطناعي. تأكد من وجود HF_TOKEN وأن لديك صلاحية الوصول للنموذج.</p>"
|
| 324 |
|
| 325 |
+
progress(0.1, desc="نجمع أفلامك المفضلة...")
|
| 326 |
potential_recs = {}
|
| 327 |
+
seeds_to_process = seed_movies_global[:25]
|
| 328 |
|
| 329 |
for i, seed_movie in enumerate(seeds_to_process):
|
| 330 |
progress(0.1 + (i / len(seeds_to_process)) * 0.4, desc=f"نبحث عن توصيات بناءً على: {seed_movie['name']}")
|
|
|
|
| 332 |
if seed_tmdb_details and seed_tmdb_details.get('id'):
|
| 333 |
tmdb_recs = get_tmdb_recommendations(seed_tmdb_details['id'])
|
| 334 |
for rec in tmdb_recs:
|
| 335 |
+
try:
|
| 336 |
rec_tuple = (str(rec['title']), int(rec['year']))
|
| 337 |
if rec.get('id') and rec_tuple not in all_watched_titles_global and rec_tuple not in watchlist_titles_global:
|
| 338 |
if rec['id'] not in potential_recs:
|
|
|
|
| 340 |
'movie_info': rec, 'seed_movie_title': seed_movie['name'],
|
| 341 |
'seed_movie_context': seed_movie.get('review_text', '') or seed_movie.get('comment_text', '')
|
| 342 |
}
|
| 343 |
+
except (ValueError, TypeError): continue # Catch TypeError if year is None
|
|
|
|
|
|
|
|
|
|
|
|
|
| 344 |
if not potential_recs:
|
| 345 |
+
return "<p style='text-align:right;'>ما لقينا توصيات جديدة لك حالياً. 😉</p>"
|
| 346 |
|
| 347 |
sorted_recs_list = sorted(potential_recs.values(), key=lambda x: x['movie_info'].get('popularity', 0), reverse=True)
|
| 348 |
final_recommendations_data = []
|
|
|
|
| 352 |
if rec_data['movie_info']['id'] not in displayed_ids:
|
| 353 |
final_recommendations_data.append(rec_data)
|
| 354 |
displayed_ids.add(rec_data['movie_info']['id'])
|
|
|
|
| 355 |
if not final_recommendations_data:
|
| 356 |
+
return "<p style='text-align:right;'>ما لقينا توصيات جديدة لك حالياً بعد الفلترة. 😉</p>"
|
| 357 |
|
| 358 |
output_html = "<div>"
|
| 359 |
progress(0.6, desc="نجهز لك الشرح باللغة العامية...")
|
|
|
|
| 360 |
for i, rec_data in enumerate(final_recommendations_data):
|
| 361 |
progress(0.6 + (i / len(final_recommendations_data)) * 0.4, desc=f"نكتب شرح لفيلم: {rec_data['movie_info']['title']}")
|
| 362 |
explanation = generate_saudi_explanation(
|
|
|
|
| 365 |
poster_url = rec_data['movie_info']['poster_path']
|
| 366 |
if not poster_url or "placeholder.com" in poster_url:
|
| 367 |
poster_url = f"https://via.placeholder.com/300x450.png?text={rec_data['movie_info']['title'].replace(' ', '+')}"
|
|
|
|
| 368 |
output_html += f"""
|
| 369 |
<div style="display: flex; flex-direction: row-reverse; align-items: flex-start; margin-bottom: 25px; border-bottom: 1px solid #ddd; padding-bottom:15px; background-color: #f9f9f9; border-radius: 8px; padding: 15px;">
|
| 370 |
<img src="{poster_url}" alt="{rec_data['movie_info']['title']}" style="width: 150px; max-width:30%; height: auto; margin-left: 20px; border-radius: 5px; box-shadow: 2px 2px 5px rgba(0,0,0,0.1);">
|
|
|
|
| 384 |
footer { display: none !important; }
|
| 385 |
.gr-button { background-color: #c70039 !important; color: white !important; font-size: 1.2em !important; padding: 10px 20px !important; border-radius: 8px !important; }
|
| 386 |
.gr-button:hover { background-color: #a3002f !important; }
|
|
|
|
|
|
|
| 387 |
h1, h3 { color: #900c3f !important; }
|
| 388 |
+
""" # Removed .gr-input and .gr-output as they aren't used directly for styling here
|
| 389 |
|
| 390 |
data_loaded_successfully = load_all_data()
|
| 391 |
if data_loaded_successfully:
|
| 392 |
print("All user data loaded and preprocessed successfully.")
|
| 393 |
+
# LLM loading is attempted once at startup while the UI is built; get_recommendations retries on click if llm_pipeline is still None
|
| 394 |
else:
|
| 395 |
print("Failed to load user data. The app might not function correctly.")
|
| 396 |
|
|
|
|
| 405 |
recommend_button = gr.Button("عطني توصيات أفلام!")
|
| 406 |
with gr.Column():
|
| 407 |
output_recommendations = gr.HTML(label="توصياتك النارية 🔥")
|
| 408 |
+
|
| 409 |
+
# Call initialize_llm once when the interface is defined if data loaded successfully
|
| 410 |
+
# This way, it tries to load the LLM when the app starts, not just on the first click.
|
| 411 |
+
if data_loaded_successfully:
|
| 412 |
+
initialize_llm() # Moved initialization here
|
| 413 |
|
| 414 |
+
recommend_button.click(fn=get_recommendations, inputs=[], outputs=[output_recommendations])
|
|
|
|
|
|
|
| 415 |
gr.Markdown(
|
| 416 |
"""
|
| 417 |
<div style="text-align: center; margin-top: 30px; font-size: 0.9em; color: #777;">
|
|
|
|
| 420 |
)
|
| 421 |
|
| 422 |
if __name__ == "__main__":
|
| 423 |
+
if not TMDB_API_KEY or (TMDB_API_KEY == "442a13f1865d8936f95aa20737e6f6f5" and not os.environ.get("TMDB_API_KEY")):
|
| 424 |
print("\nWARNING: TMDB_API_KEY is using the hardcoded fallback or is missing.")
|
| 425 |
+
iface.launch(debug=True) # Set debug=False for production or normal HF Space operation
|
|
|