from fastapi import FastAPI, HTTPException, Header, Depends
from pydantic import BaseModel
import requests
import os
from typing import Optional

app = FastAPI(title="CygnisAI Studio API")

# --- CONFIGURATION ---
# HF token used to call the models (configure it in the Space Secrets)
HF_TOKEN = os.environ.get("HF_TOKEN")

# Static API key securing YOUR API (configure it in the Space Secrets)
# Default value is for local testing only:
CYGNIS_API_KEY = os.environ.get("CYGNIS_API_KEY", "cgn_live_stable_demo_api_key_012345")

# Mapping of the requested model names to real Hugging Face endpoints
# Note: mapped to the closest real models available, since Llama 4 / Gemma 3 are not publicly released yet.
# Update these IDs as soon as they come out.
MODELS = {
    "google/gemma-3-27b-it": "google/gemma-2-27b-it",  # Fallback: Gemma 2
    "openai/gpt-oss-120b": "meta-llama/Meta-Llama-3.1-70B-Instruct",  # Fallback: Llama 3.1 70B (powerful)
    "Qwen/Qwen3-VL-8B-Thinking": "Qwen/Qwen2-VL-7B-Instruct",  # Fallback: Qwen 2 VL
    "XiaomiMiMo/MiMo-V2-Flash": "Xiaomi/MIMO",  # Fallback: Xiaomi
    "deepseek-ai/DeepSeek-V3.2": "deepseek-ai/DeepSeek-V3",  # Fallback: V3
    "meta-llama/Llama-4-Scout-17B-16E-Instruct": "meta-llama/Meta-Llama-3.1-8B-Instruct",  # Fallback: Llama 3.1
    "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16": "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",  # Fallback: Nemotron
    "default": "meta-llama/Meta-Llama-3-8B-Instruct"
}
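# Example (illustrative): a request asking for "deepseek-ai/DeepSeek-V3.2" is routed to
# "deepseek-ai/DeepSeek-V3"; any model name not listed above resolves to MODELS["default"].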

# Base URL of the HF inference router
HF_INFERENCE_BASE = "https://router.huggingface.co/hf-inference/models"

# --- SCHEMAS ---
class ChatRequest(BaseModel):
    question: str
    model: Optional[str] = "default"
    system_prompt: Optional[str] = None
    temperature: Optional[float] = 0.7
    max_tokens: Optional[int] = 1024

class ChatResponse(BaseModel):
    answer: str
    model_used: str
    sources: list = []
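
# Example payloads (illustrative): a POST to /api/ask such as
#   {"question": "Hello", "model": "google/gemma-3-27b-it", "temperature": 0.5}
# yields a ChatResponse of the form
#   {"answer": "...", "model_used": "google/gemma-2-27b-it", "sources": []}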

# --- SECURITY ---
async def verify_api_key(authorization: str = Header(None)):
    if not authorization:
        raise HTTPException(status_code=401, detail="Missing Authorization header")
    try:
        scheme, token = authorization.split()
    except ValueError:
        raise HTTPException(status_code=401, detail="Invalid authorization header format")
    if scheme.lower() != "bearer":
        raise HTTPException(status_code=401, detail="Invalid authentication scheme")
    if token != CYGNIS_API_KEY:
        raise HTTPException(status_code=403, detail="Invalid API Key")
    return True

# --- ENDPOINTS ---
@app.get("/")
def read_root():
    return {"status": "online", "service": "CygnisAI Studio API"}
@app.post("/api/ask", response_model=ChatResponse)
async def ask_model(req: ChatRequest, authorized: bool = Depends(verify_api_key)):
if not HF_TOKEN:
print("⚠️ WARNING: HF_TOKEN not set. Calls to HF will fail.")
# 1. Sélection du modèle
model_id = MODELS.get(req.model, MODELS["default"])
api_url = f"{HF_INFERENCE_BASE}/{model_id}"
print(f"🤖 Routing request to: {model_id}")
# 2. Construction du prompt
# On utilise le format standard chat template si possible, sinon raw text
messages = []
if req.system_prompt:
messages.append({"role": "system", "content": req.system_prompt})
messages.append({"role": "user", "content": req.question})
payload = {
"model": model_id,
"messages": messages,
"max_tokens": req.max_tokens,
"temperature": req.temperature,
"stream": False
}
headers = {
"Authorization": f"Bearer {HF_TOKEN}",
"Content-Type": "application/json"
}

    try:
        # 3. Call Hugging Face (OpenAI-compatible endpoint)
        # Note: router.huggingface.co often supports /v1/chat/completions
        # If that fails, fall back to the direct call below
        hf_chat_url = f"{HF_INFERENCE_BASE}/{model_id}/v1/chat/completions"
        response = requests.post(hf_chat_url, headers=headers, json=payload, timeout=120)

        # Fallback when the OpenAI-style endpoint is not supported for this model
        if response.status_code == 404:
            print("🔄 Fallback to standard inference API")
            # The standard API usually expects a single input string.
            # This is a simplification; ideally the model's own chat template would be used.
            prompt_str = (
                f"System: {req.system_prompt}\nUser: {req.question}\nAssistant:"
                if req.system_prompt
                else f"User: {req.question}\nAssistant:"
            )
            payload_standard = {
                "inputs": prompt_str,
                "parameters": {
                    "max_new_tokens": req.max_tokens,
                    "temperature": req.temperature,
                    "return_full_text": False
                }
            }
            response = requests.post(api_url, headers=headers, json=payload_standard, timeout=120)

        if response.status_code != 200:
            print(f"❌ HF Error ({response.status_code}): {response.text}")
            raise HTTPException(status_code=502, detail=f"Model provider error: {response.text}")

        data = response.json()

        # Parse the response (handles both possible formats)
        answer = ""
        if "choices" in data and len(data["choices"]) > 0:
            answer = data["choices"][0]["message"]["content"]
        elif isinstance(data, list) and len(data) > 0 and "generated_text" in data[0]:
            answer = data[0]["generated_text"]
        elif "generated_text" in data:
            answer = data["generated_text"]
        else:
            answer = "Error: Could not parse model response."

        return {
            "answer": answer,
            "model_used": model_id,
            "sources": []
        }
    except HTTPException:
        # Re-raise HTTP errors (e.g. the 502 above) instead of masking them as 500
        raise
    except Exception as e:
        print(f"❌ Internal Error: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
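
# Example client call (illustrative; assumes the Space runs locally on port 7860 and
# CYGNIS_API_KEY is left at its demo default):
#
#   import requests
#   r = requests.post(
#       "http://localhost:7860/api/ask",
#       headers={"Authorization": "Bearer cgn_live_stable_demo_api_key_012345"},
#       json={"question": "Hello, who are you?", "model": "default"},
#   )
#   print(r.json()["answer"])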