Spaces:

Simonc-44
/

API

Running

App Files Files Community

Simonc-44 committed 23 days ago

Commit

f6a33d3

verified ·

1 Parent(s): b1d8113

Update main.py

Browse files

Files changed (1) hide show

main.py +67 -52

main.py CHANGED Viewed

@@ -13,32 +13,35 @@ CYGNIS_API_KEY = os.environ.get("CYGNIS_API_KEY", "cgn_live_stable_demo_api_key_
 # Mapping vers des modèles DISPONIBLES et STABLES sur le routeur Hugging Face
 MODELS = {
-    # Gemma 2 9B (Google) - Très rapide et dispo
     "google/gemma-3-27b-it": "google/gemma-2-9b-it",
-    # Llama 3.1 70B (Meta) - Puissant
     "openai/gpt-oss-120b": "meta-llama/Meta-Llama-3.1-70B-Instruct",
-    # Qwen 2.5 7B (Alibaba) - Excellent généraliste
     "Qwen/Qwen3-VL-8B-Thinking": "Qwen/Qwen2.5-7B-Instruct",
-    # Phi 3.5 (Microsoft) - Léger
     "XiaomiMiMo/MiMo-V2-Flash": "microsoft/Phi-3.5-mini-instruct",
-    # DeepSeek R1 (Distill Llama 8B) - Raisonnement
     "deepseek-ai/DeepSeek-V3.2": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
-    # Llama 3.2 3B (Meta) - Ultra rapide
     "meta-llama/Llama-4-Scout-17B-16E-Instruct": "meta-llama/Llama-3.2-3B-Instruct",
-    # Llama 3.1 8B (Meta) - Standard
     "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    # Défaut : Gemma 2B IT (très stable et rapide pour le free tier)
-    "default": "google/gemma-2b-it"
 }
-# URL de base UNIQUE pour le routeur HF (utilisée pour Chat ET Inference standard)
 HF_ROUTER_BASE = "https://router.huggingface.co/hf-inference/models"
 class ChatRequest(BaseModel):
@@ -71,6 +74,53 @@ async def verify_api_key(authorization: str = Header(None)):
 def read_root():
     return {"status": "online", "service": "CygnisAI Studio API", "hf_token_set": bool(HF_TOKEN)}
 @app.post("/api/ask", response_model=ChatResponse)
 async def ask_model(req: ChatRequest, authorized: bool = Depends(verify_api_key)):
     print(f"📩 Received request: {req.question[:50]}...")
@@ -87,50 +137,15 @@ async def ask_model(req: ChatRequest, authorized: bool = Depends(verify_api_key)
         messages.append({"role": "system", "content": req.system_prompt})
     messages.append({"role": "user", "content": req.question})
-    headers = {
-        "Authorization": f"Bearer {HF_TOKEN}",
-        "Content-Type": "application/json"
-    }
     try:
-        # 1. Tentative via endpoint Chat (OpenAI compatible)
-        # URL: https://router.huggingface.co/hf-inference/models/{model_id}/v1/chat/completions
-        hf_chat_url = f"{HF_ROUTER_BASE}/{model_id}/v1/chat/completions"
-        payload_chat = {
-            "model": model_id,
-            "messages": messages,
-            "max_tokens": req.max_tokens,
-            "temperature": req.temperature,
-            "stream": False
-        }
-        print(f"🚀 Calling HF Chat API: {hf_chat_url}")
-        response = requests.post(hf_chat_url, headers=headers, json=payload_chat)
-        # 2. Fallback via endpoint Inference Standard (si Chat échoue avec 404 ou 405)
-        if response.status_code in [404, 405]:
-             print(f"🔄 Fallback to standard inference API (Status {response.status_code})")
-             # URL: https://router.huggingface.co/hf-inference/models/{model_id}
-             # IMPORTANT: On utilise bien le routeur ici aussi !
-             api_url = f"{HF_ROUTER_BASE}/{model_id}"
-             prompt_str = ""
-             for msg in messages:
-                 prompt_str += f"{msg['role']}: {msg['content']}\n"
-             prompt_str += "assistant:"
-             payload_standard = {
-                 "inputs": prompt_str,
-                 "parameters": {
-                     "max_new_tokens": req.max_tokens,
-                     "temperature": req.temperature,
-                     "return_full_text": False
-                 }
-             }
-             print(f"🚀 Calling HF Standard API: {api_url}")
-             response = requests.post(api_url, headers=headers, json=payload_standard)
         if response.status_code != 200:
             print(f"❌ HF Error ({response.status_code}): {response.text}")

 # Mapping to models that are AVAILABLE and STABLE on the Hugging Face router
 MODELS = {
+    # Gemma 2 9B (Google)
     "google/gemma-3-27b-it": "google/gemma-2-9b-it",
+    # Llama 3.1 70B (Meta)
     "openai/gpt-oss-120b": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+    # Qwen 2.5 7B (Alibaba)
     "Qwen/Qwen3-VL-8B-Thinking": "Qwen/Qwen2.5-7B-Instruct",
+    # Phi 3.5 (Microsoft)
     "XiaomiMiMo/MiMo-V2-Flash": "microsoft/Phi-3.5-mini-instruct",
+    # DeepSeek R1 (Distill Llama 8B)
     "deepseek-ai/DeepSeek-V3.2": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+    # Llama 3.2 3B (Meta)
     "meta-llama/Llama-4-Scout-17B-16E-Instruct": "meta-llama/Llama-3.2-3B-Instruct",
+    # Llama 3.1 8B (Meta)
     "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+    # Default: Gemma 2 2B (newer and more readily available than the 2b-it)
+    "default": "google/gemma-2-2b-it"
 }
+# Last-resort fallback model (always available)
+SAFETY_NET_MODEL = "HuggingFaceH4/zephyr-7b-beta"
+# SINGLE base URL for the HF router
 HF_ROUTER_BASE = "https://router.huggingface.co/hf-inference/models"
 class ChatRequest(BaseModel):
 def read_root():
     return {"status": "online", "service": "CygnisAI Studio API", "hf_token_set": bool(HF_TOKEN)}
+def call_hf_api(model_id, messages, req, timeout=(10, 120)):
+    """Call the Hugging Face router for ``model_id`` and return the response.
+
+    The OpenAI-compatible chat endpoint is tried first. If it answers with
+    status 404 or 405, the conversation is flattened into a single prompt
+    string and sent to the plain inference endpoint of the same model.
+
+    Args:
+        model_id: Model identifier appended to ``HF_ROUTER_BASE``.
+        messages: List of dicts with ``"role"`` and ``"content"`` keys.
+        req: Request object; only ``max_tokens`` and ``temperature`` are read.
+        timeout: Forwarded to ``requests.post`` as its ``timeout`` argument
+            (seconds, or a ``(connect, read)`` tuple). Without it a stalled
+            connection would block the caller indefinitely.
+
+    Returns:
+        The response object of the last HTTP call made. The status code is
+        not checked here; the caller must inspect it.
+
+    Raises:
+        requests.exceptions.Timeout: If the router does not answer within
+            ``timeout``. Other ``requests`` exceptions propagate unchanged.
+    """
+    headers = {
+        "Authorization": f"Bearer {HF_TOKEN}",
+        "Content-Type": "application/json"
+    }
+    # 1. Try the chat completions endpoint.
+    hf_chat_url = f"{HF_ROUTER_BASE}/{model_id}/v1/chat/completions"
+    payload_chat = {
+        "model": model_id,
+        "messages": messages,
+        "max_tokens": req.max_tokens,
+        "temperature": req.temperature,
+        "stream": False
+    }
+    print(f"🚀 Calling HF Chat API: {hf_chat_url}")
+    response = requests.post(
+        hf_chat_url, headers=headers, json=payload_chat, timeout=timeout
+    )
+    # 2. Fall back to the standard inference endpoint when the chat route
+    #    does not exist (404) or rejects the method (405).
+    if response.status_code in [404, 405]:
+        print(f"🔄 Fallback to standard inference API (Status {response.status_code})")
+        api_url = f"{HF_ROUTER_BASE}/{model_id}"
+        # Flatten the conversation into "<|role|>\ncontent</s>\n" turns.
+        # Messages with any other role are skipped, as before.
+        # NOTE(review): this markup looks like the Zephyr chat template;
+        # other models may expect a different prompt format - confirm.
+        known_roles = ("system", "user", "assistant")
+        turns = [
+            f"<|{msg['role']}|>\n{msg['content']}</s>\n"
+            for msg in messages
+            if msg["role"] in known_roles
+        ]
+        # The trailing open assistant tag cues the model to produce the reply.
+        prompt_str = "".join(turns) + "<|assistant|>\n"
+        payload_standard = {
+            "inputs": prompt_str,
+            "parameters": {
+                "max_new_tokens": req.max_tokens,
+                "temperature": req.temperature,
+                "return_full_text": False
+            }
+        }
+        print(f"🚀 Calling HF Standard API: {api_url}")
+        response = requests.post(
+            api_url, headers=headers, json=payload_standard, timeout=timeout
+        )
+    return response
 @app.post("/api/ask", response_model=ChatResponse)
 async def ask_model(req: ChatRequest, authorized: bool = Depends(verify_api_key)):
     print(f"📩 Received request: {req.question[:50]}...")
         messages.append({"role": "system", "content": req.system_prompt})
     messages.append({"role": "user", "content": req.question})
     try:
+        # Premier essai avec le modèle demandé
+        response = call_hf_api(model_id, messages, req)
+        # Si 404/503/500, on tente le SAFETY NET
+        if response.status_code != 200:
+            print(f"⚠️ Primary model failed ({response.status_code}). Switching to SAFETY NET: {SAFETY_NET_MODEL}")
+            model_id = SAFETY_NET_MODEL
+            response = call_hf_api(SAFETY_NET_MODEL, messages, req)
         if response.status_code != 200:
             print(f"❌ HF Error ({response.status_code}): {response.text}")