from fastapi import FastAPI, HTTPException, Header, Depends
from pydantic import BaseModel
import requests
import os
import json
from typing import Optional, Dict

app = FastAPI(title="CygnisAI Studio API")

# --- CONFIGURATION ---
HF_TOKEN = os.environ.get("HF_TOKEN")
CYGNIS_API_KEY = os.environ.get("CYGNIS_API_KEY", "cgn_live_stable_demo_api_key_012345")

# Mapping to NON-GATED and POPULAR models
MODELS = {
    "google/gemma-3-27b-it": "google/gemma-2-9b-it", 
    "openai/gpt-oss-120b": "meta-llama/Meta-Llama-3.1-70B-Instruct", 
    "Qwen/Qwen3-VL-8B-Thinking": "Qwen/Qwen2.5-7B-Instruct",
    "XiaomiMiMo/MiMo-V2-Flash": "microsoft/Phi-3.5-mini-instruct", 
    "deepseek-ai/DeepSeek-V3.2": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", 
    "meta-llama/Llama-4-Scout-17B-16E-Instruct": "meta-llama/Llama-3.2-3B-Instruct", 
    "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16": "meta-llama/Meta-Llama-3.1-8B-Instruct", 
    
    # Default: Qwen 2.5 (very robust and usually available)
    "default": "Qwen/Qwen2.5-7B-Instruct" 
}

# Ultimate fallback model (Microsoft Phi 3.5 is very lightweight and usually available)
SAFETY_NET_MODEL = "microsoft/Phi-3.5-mini-instruct"

# Single base URL for the HF router
HF_ROUTER_BASE = "https://router.huggingface.co/hf-inference/models"
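
# For a given model_id, the two endpoint shapes built by call_hf_api below are
# (a sketch, derived from the URL construction in that helper):
#   chat API:     {HF_ROUTER_BASE}/<model_id>/v1/chat/completions
#   standard API: {HF_ROUTER_BASE}/<model_id>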

class ChatRequest(BaseModel):
    question: str
    model: Optional[str] = "default"
    system_prompt: Optional[str] = None
    temperature: Optional[float] = 0.7
    max_tokens: Optional[int] = 1024

class ChatResponse(BaseModel):
    answer: str
    model_used: str
    sources: list = []
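
# A minimal request/response sketch for ChatRequest/ChatResponse (illustrative
# values only; an unknown "model" value falls back to the "default" entry of MODELS):
#
#   POST /api/ask
#   {
#     "question": "What is FastAPI?",
#     "model": "google/gemma-3-27b-it",
#     "temperature": 0.7,
#     "max_tokens": 256
#   }
#
#   -> {"answer": "...", "model_used": "google/gemma-2-9b-it", "sources": []}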

async def verify_api_key(authorization: str = Header(None)):
    if not authorization:
        print("⚠️ Missing Authorization header")
        # Do not block, to make debugging easier, but log it
        return
    try:
        scheme, token = authorization.split()
        if scheme.lower() != 'bearer':
            raise HTTPException(status_code=401, detail="Invalid authentication scheme")
        if token != CYGNIS_API_KEY:
            print(f"⚠️ Invalid API Key: {token}")
            # raise HTTPException(status_code=403, detail="Invalid API Key")  # Commented out for debugging
    except ValueError:
        pass  # Malformed header; let it through for now

@app.get("/")
def read_root():
    return {"status": "online", "service": "CygnisAI Studio API", "hf_token_set": bool(HF_TOKEN)}

def call_hf_api(model_id, messages, req):
    """Fonction helper pour appeler l'API HF avec gestion Chat/Standard"""
    headers = {
        "Authorization": f"Bearer {HF_TOKEN}",
        "Content-Type": "application/json"
    }
    
    # 1. Try the Chat API first
    hf_chat_url = f"{HF_ROUTER_BASE}/{model_id}/v1/chat/completions"
    payload_chat = {
        "model": model_id,
        "messages": messages,
        "max_tokens": req.max_tokens,
        "temperature": req.temperature,
        "stream": False
    }
    
    print(f"🚀 Calling HF Chat API: {hf_chat_url}")
    response = requests.post(hf_chat_url, headers=headers, json=payload_chat)
    
    # 2. Fall back to the standard inference API
    if response.status_code in [404, 405]:
        print(f"🔄 Fallback to standard inference API (Status {response.status_code})")
        api_url = f"{HF_ROUTER_BASE}/{model_id}"
        
        prompt_str = ""
        for msg in messages:
            role = msg['role']
            content = msg['content']
            if role == 'system': prompt_str += f"<|system|>\n{content}\n"
            elif role == 'user': prompt_str += f"<|user|>\n{content}\n"
            elif role == 'assistant': prompt_str += f"<|assistant|>\n{content}\n"
        prompt_str += "<|assistant|>\n"
        
        payload_standard = {
            "inputs": prompt_str,
            "parameters": {
                "max_new_tokens": req.max_tokens,
                "temperature": req.temperature,
                "return_full_text": False
            }
        }
        print(f"🚀 Calling HF Standard API: {api_url}")
        response = requests.post(api_url, headers=headers, json=payload_standard)
        
    return response

@app.post("/api/ask", response_model=ChatResponse)
async def ask_model(req: ChatRequest, authorized: bool = Depends(verify_api_key)):
    print(f"📩 Received request: {req.question[:50]}...")

    if not HF_TOKEN:
        print("❌ CRITICAL: HF_TOKEN is missing!")
        # Mock response instead of crash
        return {
            "answer": "Configuration Error: HF_TOKEN is missing on the server.",
            "model_used": "error-handler",
            "sources": []
        }
    
    model_id = MODELS.get(req.model, MODELS["default"])
    print(f"🤖 Routing request to: {model_id}")

    messages = []
    if req.system_prompt:
        messages.append({"role": "system", "content": req.system_prompt})
    messages.append({"role": "user", "content": req.question})

    try:
        # First attempt
        response = call_hf_api(model_id, messages, req)

        # If it fails, switch to the safety-net model
        if response.status_code != 200:
            print(f"⚠️ Primary model failed ({response.status_code}). Switching to SAFETY NET: {SAFETY_NET_MODEL}")
            model_id = SAFETY_NET_MODEL
            response = call_hf_api(SAFETY_NET_MODEL, messages, req)

        # If everything fails, return a mock response (ultimate fallback)
        if response.status_code != 200:
            print(f"❌ ALL MODELS FAILED. Returning mock response. Last error: {response.text}")
            return {
                "answer": "Je suis désolé, mes serveurs de réflexion sont actuellement surchargés ou inaccessibles. Je ne peux pas traiter votre demande pour le moment. Veuillez réessayer dans quelques minutes.",
                "model_used": "fallback-mock",
                "sources": []
            }

        data = response.json()
        
        answer = ""
        if "choices" in data and len(data["choices"]) > 0:
            answer = data["choices"][0]["message"]["content"]
        elif isinstance(data, list) and len(data) > 0 and "generated_text" in data[0]:
            answer = data[0]["generated_text"]
        elif "generated_text" in data:
            answer = data["generated_text"]
        else:
            print(f"⚠️ Unknown response format: {data}")
            answer = "Error: Could not parse model response."

        return {
            "answer": answer,
            "model_used": model_id,
            "sources": []
        }

    except Exception as e:
        print(f"❌ Internal Exception: {str(e)}")
        # Mock response on crash
        return {
            "answer": "Une erreur interne inattendue s'est produite. Mes excuses.",
            "model_used": "exception-handler",
            "sources": []
        }

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
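
# Example client call (a sketch, assuming the server runs locally on port 7860
# and CYGNIS_API_KEY keeps its demo default value):
#
#   import requests
#   resp = requests.post(
#       "http://localhost:7860/api/ask",
#       headers={"Authorization": "Bearer cgn_live_stable_demo_api_key_012345"},
#       json={"question": "Bonjour, qui es-tu ?", "model": "default"},
#   )
#   print(resp.json()["answer"])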