API / main.py
from fastapi import FastAPI, HTTPException, Header, Depends
from pydantic import BaseModel
import requests
import os
import json
from typing import Optional, Dict
app = FastAPI(title="CygnisAI Studio API")
# --- CONFIGURATION ---
HF_TOKEN = os.environ.get("HF_TOKEN")
CYGNIS_API_KEY = os.environ.get("CYGNIS_API_KEY", "cgn_live_stable_demo_api_key_012345")
# Mapping to models that are AVAILABLE and STABLE on the Hugging Face Inference API
# Note: free-tier models can be unstable or still loading.
MODELS = {
    # Gemma 2 9B (Google) - very fast and available
    "google/gemma-3-27b-it": "google/gemma-2-9b-it",
    # Llama 3.1 70B (Meta) - powerful
    "openai/gpt-oss-120b": "meta-llama/Meta-Llama-3.1-70B-Instruct",
    # Qwen 2.5 7B (Alibaba) - excellent generalist
    "Qwen/Qwen3-VL-8B-Thinking": "Qwen/Qwen2.5-7B-Instruct",
    # Phi 3.5 (Microsoft) - lightweight
    "XiaomiMiMo/MiMo-V2-Flash": "microsoft/Phi-3.5-mini-instruct",
    # DeepSeek R1 (Distill Llama 8B) - reasoning
    "deepseek-ai/DeepSeek-V3.2": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    # Llama 3.2 3B (Meta) - ultra fast
    "meta-llama/Llama-4-Scout-17B-16E-Instruct": "meta-llama/Llama-3.2-3B-Instruct",
    # Llama 3.1 8B (Meta) - standard
    "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    # Default: Gemma 2B IT (very stable and fast on the free tier)
    "default": "google/gemma-2b-it"
}
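# Requested names are resolved through this table (e.g. "google/gemma-3-27b-it" is served by
# "google/gemma-2-9b-it"); any name that is not listed falls back to MODELS["default"].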
# Base URL for the Chat endpoint (OpenAI-compatible)
HF_CHAT_BASE = "https://router.huggingface.co/hf-inference/models"
# Base URL for the standard Inference API
HF_INFERENCE_API_BASE = "https://api-inference.huggingface.co/models"
class ChatRequest(BaseModel):
    question: str
    model: Optional[str] = "default"
    system_prompt: Optional[str] = None
    temperature: Optional[float] = 0.7
    max_tokens: Optional[int] = 1024
class ChatResponse(BaseModel):
    answer: str
    model_used: str
    sources: list = []
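# Example payload shapes for /api/ask (a sketch derived from the models above; only "question"
# is required, the other request fields fall back to the defaults shown in ChatRequest):
#   request:  {"question": "Hello", "model": "default", "temperature": 0.7, "max_tokens": 1024}
#   response: {"answer": "...", "model_used": "google/gemma-2b-it", "sources": []}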
async def verify_api_key(authorization: str = Header(None)):
    if not authorization:
        print("⚠️ Missing Authorization header")
        raise HTTPException(status_code=401, detail="Missing Authorization header")
    try:
        scheme, token = authorization.split()
        if scheme.lower() != 'bearer':
            raise HTTPException(status_code=401, detail="Invalid authentication scheme")
        if token != CYGNIS_API_KEY:
            print(f"⚠️ Invalid API Key: {token}")
            raise HTTPException(status_code=403, detail="Invalid API Key")
    except ValueError:
        raise HTTPException(status_code=401, detail="Invalid authorization header format")
@app.get("/")
def read_root():
return {"status": "online", "service": "CygnisAI Studio API", "hf_token_set": bool(HF_TOKEN)}
@app.post("/api/ask", response_model=ChatResponse)
async def ask_model(req: ChatRequest, authorized: bool = Depends(verify_api_key)):
print(f"📩 Received request: {req.question[:50]}...")
if not HF_TOKEN:
print("❌ CRITICAL: HF_TOKEN is missing!")
raise HTTPException(status_code=500, detail="Server misconfiguration: HF_TOKEN is missing.")
model_id = MODELS.get(req.model, MODELS["default"])
print(f"🤖 Routing request to: {model_id}")
messages = []
if req.system_prompt:
messages.append({"role": "system", "content": req.system_prompt})
messages.append({"role": "user", "content": req.question})
headers = {
"Authorization": f"Bearer {HF_TOKEN}",
"Content-Type": "application/json"
}
    try:
        # 1. First attempt via the Chat endpoint (OpenAI-compatible)
        hf_chat_url = f"{HF_CHAT_BASE}/{model_id}/v1/chat/completions"
        payload_chat = {
            "model": model_id,
            "messages": messages,
            "max_tokens": req.max_tokens,
            "temperature": req.temperature,
            "stream": False
        }
        print(f"🚀 Calling HF Chat API: {hf_chat_url}")
        response = requests.post(hf_chat_url, headers=headers, json=payload_chat)
        # 2. Fallback to the standard Inference API (if the Chat call fails with 404 or 405)
        if response.status_code in [404, 405]:
            print(f"🔄 Fallback to standard inference API (Status {response.status_code})")
            # Use the correct URL for the standard Inference API
            api_url = f"{HF_INFERENCE_API_BASE}/{model_id}"
            prompt_str = ""
            for msg in messages:
                prompt_str += f"{msg['role']}: {msg['content']}\n"
            prompt_str += "assistant:"
            payload_standard = {
                "inputs": prompt_str,
                "parameters": {
                    "max_new_tokens": req.max_tokens,
                    "temperature": req.temperature,
                    "return_full_text": False
                }
            }
            response = requests.post(api_url, headers=headers, json=payload_standard)
        if response.status_code != 200:
            print(f"❌ HF Error ({response.status_code}): {response.text}")
            raise HTTPException(status_code=502, detail=f"HF Error: {response.text}")
        data = response.json()
        answer = ""
        if "choices" in data and len(data["choices"]) > 0:
            answer = data["choices"][0]["message"]["content"]
        elif isinstance(data, list) and len(data) > 0 and "generated_text" in data[0]:
            answer = data[0]["generated_text"]
        elif "generated_text" in data:
            answer = data["generated_text"]
        else:
            print(f"⚠️ Unknown response format: {data}")
            answer = "Error: Could not parse model response."
        return {
            "answer": answer,
            "model_used": model_id,
            "sources": []
        }
    except HTTPException:
        # Re-raise HTTP errors (e.g. the 502 above) instead of masking them as 500s
        raise
    except Exception as e:
        print(f"❌ Internal Exception: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Internal Server Error: {str(e)}")
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
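
# Minimal client sketch (an illustration, assuming the API is running locally on port 7860 with
# the demo key above; the question string is just an example):
#
#   import requests
#   resp = requests.post(
#       "http://localhost:7860/api/ask",
#       headers={"Authorization": "Bearer cgn_live_stable_demo_api_key_012345"},
#       json={"question": "What is CygnisAI Studio?", "model": "default"},
#   )
#   print(resp.json()["answer"])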