import os import pandas as pd import numpy as np import requests import gradio as gr import gdown import pickle BOOKS_FILE = "book.xlsx" THESES_FILE = "theses.xlsx" # روابط الملفات على Google Drive DRIVE_LINKS = { "books": "1FElHiASfiVLeuHWYaqd2Q5foxWRlJT-O", "theses": "1K2Mtze6ZdvfKUsFMCOWlRBjDq-ZnJNrv" } def download_from_drive(file_id, output): url = f"https://drive.google.com/uc?export=download&id={file_id}" gdown.download(url, output, quiet=True) # تنزيل الملفات إذا مش موجودة if not os.path.exists(BOOKS_FILE): download_from_drive(DRIVE_LINKS["books"], BOOKS_FILE) if not os.path.exists(THESES_FILE): download_from_drive(DRIVE_LINKS["theses"], THESES_FILE) # قراءة البيانات def load_data(file): df = pd.read_excel(file).fillna("غير متوافر") if "Title" not in df.columns and "العنوان" in df.columns: df["Title"] = df["العنوان"].astype(str) elif "Title" not in df.columns: df["Title"] = df.iloc[:,0].astype(str) return df books_df = load_data(BOOKS_FILE) theses_df = load_data(THESES_FILE) API_TOKEN = os.environ.get("HF_TOKEN") API_URL = "https://api-inference.huggingface.co/models/aelsaeed/all-MiniLM-L6-v2-api" HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"} def get_embedding(text): response = requests.post(API_URL, headers=HEADERS, json={"inputs": [text]}) return np.array(response.json()[0]) def search(query, category, mode): if not query.strip(): return "⚠️ اكتب كلمة أو جملة للبحث" if mode == "نصي": df = books_df if category=="Books" else theses_df results = df[df["Title"].str.contains(query, case=False, na=False)] else: df = books_df if category=="Books" else theses_df emb_cache_file = f"{category}_embeddings.pkl" if os.path.exists(emb_cache_file): with open(emb_cache_file,"rb") as f: embeddings = pickle.load(f) else: embeddings = np.array([get_embedding(t) for t in df["Title"].tolist()]) with open(emb_cache_file,"wb") as f: pickle.dump(embeddings,f) query_emb = get_embedding(query) scores = np.dot(embeddings, query_emb) / (np.linalg.norm(embeddings,axis=1)*np.linalg.norm(query_emb)) idx = np.argsort(-scores) results = df.iloc[idx] if results.empty: return "❌ لم يتم العثور على نتائج" html = "
| {col} | " for col in results.columns]) + "
|---|
| {val} | " for val in row.values]) + "