from fastapi import FastAPI, HTTPException, Header, Depends
from pydantic import BaseModel
import requests
import os
from typing import Optional

app = FastAPI(title="CygnisAI Studio API")

# --- CONFIGURATION ---
HF_TOKEN = os.environ.get("HF_TOKEN")
CYGNIS_API_KEY = os.environ.get("CYGNIS_API_KEY", "cgn_live_stable_demo_api_key_012345")

# Map requested model names to NON-GATED, POPULAR models
MODELS = {
    "google/gemma-3-27b-it": "google/gemma-2-9b-it",
    "openai/gpt-oss-120b": "meta-llama/Meta-Llama-3.1-70B-Instruct",
    "Qwen/Qwen3-VL-8B-Thinking": "Qwen/Qwen2.5-7B-Instruct",
    "XiaomiMiMo/MiMo-V2-Flash": "microsoft/Phi-3.5-mini-instruct",
    "deepseek-ai/DeepSeek-V3.2": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    "meta-llama/Llama-4-Scout-17B-16E-Instruct": "meta-llama/Llama-3.2-3B-Instruct",
    "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    # Default: Qwen 2.5 (very robust and usually available)
    "default": "Qwen/Qwen2.5-7B-Instruct"
}

# Last-resort fallback model (Microsoft Phi 3.5 is very lightweight and usually available)
SAFETY_NET_MODEL = "microsoft/Phi-3.5-mini-instruct"

# Single base URL for the HF router
HF_ROUTER_BASE = "https://router.huggingface.co/hf-inference/models"


class ChatRequest(BaseModel):
    question: str
    model: Optional[str] = "default"
    system_prompt: Optional[str] = None
    temperature: Optional[float] = 0.7
    max_tokens: Optional[int] = 1024


class ChatResponse(BaseModel):
    answer: str
    model_used: str
    sources: list = []


async def verify_api_key(authorization: str = Header(None)):
    if not authorization:
        # Do not block (to ease debugging), just log it
        print("⚠️ Missing Authorization header")
        return
    try:
        scheme, token = authorization.split()
        if scheme.lower() != "bearer":
            raise HTTPException(status_code=401, detail="Invalid authentication scheme")
        if token != CYGNIS_API_KEY:
            print(f"⚠️ Invalid API Key: {token}")
            # raise HTTPException(status_code=403, detail="Invalid API Key")  # Commented out for debugging
    except ValueError:
        pass  # Let it through for now


@app.get("/")
def read_root():
    return {"status": "online", "service": "CygnisAI Studio API", "hf_token_set": bool(HF_TOKEN)}


def call_hf_api(model_id, messages, req):
    """Helper that calls the HF API, trying the Chat API first with a fallback to the standard inference API."""
    headers = {
        "Authorization": f"Bearer {HF_TOKEN}",
        "Content-Type": "application/json"
    }

    # 1. Try the Chat Completions API
    hf_chat_url = f"{HF_ROUTER_BASE}/{model_id}/v1/chat/completions"
    payload_chat = {
        "model": model_id,
        "messages": messages,
        "max_tokens": req.max_tokens,
        "temperature": req.temperature,
        "stream": False
    }

    print(f"🚀 Calling HF Chat API: {hf_chat_url}")
    response = requests.post(hf_chat_url, headers=headers, json=payload_chat)

    # 2. Fall back to the standard inference API
    if response.status_code in [404, 405]:
        print(f"🔄 Fallback to standard inference API (Status {response.status_code})")
        api_url = f"{HF_ROUTER_BASE}/{model_id}"

        # Flatten the chat messages into a single prompt string
        prompt_str = ""
        for msg in messages:
            role = msg["role"]
            content = msg["content"]
            if role == "system":
                prompt_str += f"<|system|>\n{content}\n"
            elif role == "user":
                prompt_str += f"<|user|>\n{content}\n"
            elif role == "assistant":
                prompt_str += f"<|assistant|>\n{content}\n"
        prompt_str += "<|assistant|>\n"

        payload_standard = {
            "inputs": prompt_str,
            "parameters": {
                "max_new_tokens": req.max_tokens,
                "temperature": req.temperature,
                "return_full_text": False
            }
        }
        print(f"🚀 Calling HF Standard API: {api_url}")
        response = requests.post(api_url, headers=headers, json=payload_standard)

    return response


@app.post("/api/ask", response_model=ChatResponse)
async def ask_model(req: ChatRequest, authorized: bool = Depends(verify_api_key)):
    print(f"📩 Received request: {req.question[:50]}...")

    if not HF_TOKEN:
        print("❌ CRITICAL: HF_TOKEN is missing!")
        # Mock response instead of crashing
        return {
            "answer": "Configuration Error: HF_TOKEN is missing on the server.",
            "model_used": "error-handler",
            "sources": []
        }

    model_id = MODELS.get(req.model, MODELS["default"])
    print(f"🤖 Routing request to: {model_id}")

    messages = []
    if req.system_prompt:
        messages.append({"role": "system", "content": req.system_prompt})
    messages.append({"role": "user", "content": req.question})

    try:
        # First attempt
        response = call_hf_api(model_id, messages, req)

        # On failure, switch to the safety-net model
        if response.status_code != 200:
            print(f"⚠️ Primary model failed ({response.status_code}). Switching to SAFETY NET: {SAFETY_NET_MODEL}")
            model_id = SAFETY_NET_MODEL
            response = call_hf_api(SAFETY_NET_MODEL, messages, req)

        # If everything fails, return a mock response (ultimate fallback)
        if response.status_code != 200:
            print(f"❌ ALL MODELS FAILED. Returning mock response. Last error: {response.text}")
            return {
                "answer": "Je suis désolé, mes serveurs de réflexion sont actuellement surchargés ou inaccessibles. Je ne peux pas traiter votre demande pour le moment. Veuillez réessayer dans quelques minutes.",
                "model_used": "fallback-mock",
                "sources": []
            }

        data = response.json()
        answer = ""

        # Parse either the Chat Completions format or the standard inference format
        if "choices" in data and len(data["choices"]) > 0:
            answer = data["choices"][0]["message"]["content"]
        elif isinstance(data, list) and len(data) > 0 and "generated_text" in data[0]:
            answer = data[0]["generated_text"]
        elif "generated_text" in data:
            answer = data["generated_text"]
        else:
            print(f"⚠️ Unknown response format: {data}")
            answer = "Error: Could not parse model response."

        return {
            "answer": answer,
            "model_used": model_id,
            "sources": []
        }

    except Exception as e:
        print(f"❌ Internal Exception: {str(e)}")
        # Mock response on crash
        return {
            "answer": "Une erreur interne inattendue s'est produite. Mes excuses.",
            "model_used": "exception-handler",
            "sources": []
        }


if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
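
# --- EXAMPLE USAGE (sketch) ---
# A minimal client-side sketch, kept as comments so it does not run with the app.
# It assumes the server is running locally on the port configured above (7860) and
# that CYGNIS_API_KEY was left at its demo default; adjust the URL, key, and payload
# to match your deployment.
#
#   import requests
#
#   resp = requests.post(
#       "http://localhost:7860/api/ask",
#       headers={"Authorization": "Bearer cgn_live_stable_demo_api_key_012345"},
#       json={"question": "Hello, who are you?", "model": "default", "max_tokens": 256},
#   )
#   print(resp.json()["answer"])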