from fastapi import FastAPI, HTTPException, Header, Depends
from pydantic import BaseModel
import requests
import os
import json
from typing import Optional, Dict

app = FastAPI(title="CygnisAI Studio API")

# --- CONFIGURATION ---
HF_TOKEN = os.environ.get("HF_TOKEN")
CYGNIS_API_KEY = os.environ.get("CYGNIS_API_KEY", "cgn_live_stable_demo_api_key_012345")

# Mapping to NON-GATED and POPULAR models
MODELS = {
    "google/gemma-3-27b-it": "google/gemma-2-9b-it", 
    "openai/gpt-oss-120b": "meta-llama/Meta-Llama-3.1-70B-Instruct", 
    "Qwen/Qwen3-VL-8B-Thinking": "Qwen/Qwen2.5-7B-Instruct",
    "XiaomiMiMo/MiMo-V2-Flash": "microsoft/Phi-3.5-mini-instruct", 
    "deepseek-ai/DeepSeek-V3.2": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", 
    "meta-llama/Llama-4-Scout-17B-16E-Instruct": "meta-llama/Llama-3.2-3B-Instruct", 
    "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16": "meta-llama/Meta-Llama-3.1-8B-Instruct", 
    
    # Default: Qwen 2.5 (very robust and usually available)
    "default": "Qwen/Qwen2.5-7B-Instruct" 
}

# Ultimate fallback model (Microsoft Phi 3.5 is very lightweight and usually available)
SAFETY_NET_MODEL = "microsoft/Phi-3.5-mini-instruct"

# Single base URL for the HF router
HF_ROUTER_BASE = "https://router.huggingface.co/hf-inference/models"
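
# For a given model_id, the two endpoint shapes built by call_hf_api below are
# (a sketch, derived from the URL construction in that helper):
#   chat API:     {HF_ROUTER_BASE}/<model_id>/v1/chat/completions
#   standard API: {HF_ROUTER_BASE}/<model_id>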

class ChatRequest(BaseModel):
    question: str
    model: Optional[str] = "default"
    system_prompt: Optional[str] = None
    temperature: Optional[float] = 0.7
    max_tokens: Optional[int] = 1024

class ChatResponse(BaseModel):
    answer: str
    model_used: str
    sources: list = []
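
# A minimal request/response sketch for ChatRequest/ChatResponse (illustrative
# values only; an unknown "model" value falls back to the "default" entry of MODELS):
#
#   POST /api/ask
#   {
#     "question": "What is FastAPI?",
#     "model": "google/gemma-3-27b-it",
#     "temperature": 0.7,
#     "max_tokens": 256
#   }
#
#   -> {"answer": "...", "model_used": "google/gemma-2-9b-it", "sources": []}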

async def verify_api_key(authorization: str = Header(None)):
    if not authorization:
        print("⚠️ Missing Authorization header")
        # Do not block, to make debugging easier, but log it
        return
    try:
        scheme, token = authorization.split()
        if scheme.lower() != 'bearer':
            raise HTTPException(status_code=401, detail="Invalid authentication scheme")
        if token != CYGNIS_API_KEY:
            print(f"⚠️ Invalid API Key: {token}")
            # raise HTTPException(status_code=403, detail="Invalid API Key")  # Commented out for debugging
    except ValueError:
        pass  # Malformed header; let it through for now

@app.get("/")
def read_root():
    return {"status": "online", "service": "CygnisAI Studio API", "hf_token_set": bool(HF_TOKEN)}

def call_hf_api(model_id, messages, req):
    """Fonction helper pour appeler l'API HF avec gestion Chat/Standard"""
    headers = {
        "Authorization": f"Bearer {HF_TOKEN}",
        "Content-Type": "application/json"
    }
    
    # 1. Try the Chat API first
    hf_chat_url = f"{HF_ROUTER_BASE}/{model_id}/v1/chat/completions"
    payload_chat = {
        "model": model_id,
        "messages": messages,
        "max_tokens": req.max_tokens,
        "temperature": req.temperature,
        "stream": False
    }
    
    print(f"🚀 Calling HF Chat API: {hf_chat_url}")
    response = requests.post(hf_chat_url, headers=headers, json=payload_chat)
    
    # 2. Fall back to the standard inference API
    if response.status_code in [404, 405]:
        print(f"🔄 Fallback to standard inference API (Status {response.status_code})")
        api_url = f"{HF_ROUTER_BASE}/{model_id}"
        
        prompt_str = ""
        for msg in messages:
            role = msg['role']
            content = msg['content']
            if role == 'system': prompt_str += f"<|system|>\n{content}\n"
            elif role == 'user': prompt_str += f"<|user|>\n{content}\n"
            elif role == 'assistant': prompt_str += f"<|assistant|>\n{content}\n"
        prompt_str += "<|assistant|>\n"
        
        payload_standard = {
            "inputs": prompt_str,
            "parameters": {
                "max_new_tokens": req.max_tokens,
                "temperature": req.temperature,
                "return_full_text": False
            }
        }
        print(f"🚀 Calling HF Standard API: {api_url}")
        response = requests.post(api_url, headers=headers, json=payload_standard)
        
    return response

@app.post("/api/ask", response_model=ChatResponse)
async def ask_model(req: ChatRequest, authorized: bool = Depends(verify_api_key)):
    print(f"📩 Received request: {req.question[:50]}...")

    if not HF_TOKEN:
        print("❌ CRITICAL: HF_TOKEN is missing!")
        # Mock response instead of crash
        return {
            "answer": "Configuration Error: HF_TOKEN is missing on the server.",
            "model_used": "error-handler",
            "sources": []
        }
    
    model_id = MODELS.get(req.model, MODELS["default"])
    print(f"🤖 Routing request to: {model_id}")

    messages = []
    if req.system_prompt:
        messages.append({"role": "system", "content": req.system_prompt})
    messages.append({"role": "user", "content": req.question})

    try:
        # First attempt
        response = call_hf_api(model_id, messages, req)

        # If it fails, switch to the safety-net model
        if response.status_code != 200:
            print(f"⚠️ Primary model failed ({response.status_code}). Switching to SAFETY NET: {SAFETY_NET_MODEL}")
            model_id = SAFETY_NET_MODEL
            response = call_hf_api(SAFETY_NET_MODEL, messages, req)

        # If everything fails, return a mock response (ultimate fallback)
        if response.status_code != 200:
            print(f"❌ ALL MODELS FAILED. Returning mock response. Last error: {response.text}")
            return {
                "answer": "Je suis désolé, mes serveurs de réflexion sont actuellement surchargés ou inaccessibles. Je ne peux pas traiter votre demande pour le moment. Veuillez réessayer dans quelques minutes.",
                "model_used": "fallback-mock",
                "sources": []
            }

        data = response.json()
        
        answer = ""
        if "choices" in data and len(data["choices"]) > 0:
            answer = data["choices"][0]["message"]["content"]
        elif isinstance(data, list) and len(data) > 0 and "generated_text" in data[0]:
            answer = data[0]["generated_text"]
        elif "generated_text" in data:
            answer = data["generated_text"]
        else:
            print(f"⚠️ Unknown response format: {data}")
            answer = "Error: Could not parse model response."

        return {
            "answer": answer,
            "model_used": model_id,
            "sources": []
        }

    except Exception as e:
        print(f"❌ Internal Exception: {str(e)}")
        # Mock response on crash
        return {
            "answer": "Une erreur interne inattendue s'est produite. Mes excuses.",
            "model_used": "exception-handler",
            "sources": []
        }

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
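
# Example client call (a sketch, assuming the server runs locally on port 7860
# and CYGNIS_API_KEY keeps its demo default value):
#
#   import requests
#   resp = requests.post(
#       "http://localhost:7860/api/ask",
#       headers={"Authorization": "Bearer cgn_live_stable_demo_api_key_012345"},
#       json={"question": "Bonjour, qui es-tu ?", "model": "default"},
#   )
#   print(resp.json()["answer"])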