Simonc-44 committed
Commit f6a33d3 · verified · 1 Parent(s): b1d8113

Update main.py

Files changed (1)
  1. main.py +67 -52
main.py CHANGED
@@ -13,32 +13,35 @@ CYGNIS_API_KEY = os.environ.get("CYGNIS_API_KEY", "cgn_live_stable_demo_api_key_
 
 # Mapping to models that are AVAILABLE and STABLE on the Hugging Face router
 MODELS = {
-    # Gemma 2 9B (Google) - very fast and available
+    # Gemma 2 9B (Google)
     "google/gemma-3-27b-it": "google/gemma-2-9b-it",
 
-    # Llama 3.1 70B (Meta) - powerful
+    # Llama 3.1 70B (Meta)
     "openai/gpt-oss-120b": "meta-llama/Meta-Llama-3.1-70B-Instruct",
 
-    # Qwen 2.5 7B (Alibaba) - excellent generalist
+    # Qwen 2.5 7B (Alibaba)
     "Qwen/Qwen3-VL-8B-Thinking": "Qwen/Qwen2.5-7B-Instruct",
 
-    # Phi 3.5 (Microsoft) - lightweight
+    # Phi 3.5 (Microsoft)
     "XiaomiMiMo/MiMo-V2-Flash": "microsoft/Phi-3.5-mini-instruct",
 
-    # DeepSeek R1 (Distill Llama 8B) - reasoning
+    # DeepSeek R1 (Distill Llama 8B)
     "deepseek-ai/DeepSeek-V3.2": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
 
-    # Llama 3.2 3B (Meta) - ultra fast
+    # Llama 3.2 3B (Meta)
     "meta-llama/Llama-4-Scout-17B-16E-Instruct": "meta-llama/Llama-3.2-3B-Instruct",
 
-    # Llama 3.1 8B (Meta) - standard
+    # Llama 3.1 8B (Meta)
     "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16": "meta-llama/Meta-Llama-3.1-8B-Instruct",
 
-    # Default: Gemma 2B IT (very stable and fast on the free tier)
-    "default": "google/gemma-2b-it"
+    # Default: Gemma 2 2B (newer and more available than 2b-it)
+    "default": "google/gemma-2-2b-it"
 }
 
-# SINGLE base URL for the HF router (used for both Chat AND standard Inference)
+# Ultimate fallback model (always available)
+SAFETY_NET_MODEL = "HuggingFaceH4/zephyr-7b-beta"
+
+# SINGLE base URL for the HF router
 HF_ROUTER_BASE = "https://router.huggingface.co/hf-inference/models"
 
 class ChatRequest(BaseModel):
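
Note: each public-facing alias on the left resolves to a smaller model that the router actually serves, and unknown names presumably fall through to the "default" entry. The lookup itself is outside this hunk, so the sketch below is an assumption; resolve_model is a hypothetical name and the dict is trimmed to two entries.

    # Sketch of the alias resolution implied by MODELS (assumption: the
    # real lookup uses dict.get with the "default" key as fallback).
    MODELS = {
        "google/gemma-3-27b-it": "google/gemma-2-9b-it",
        "default": "google/gemma-2-2b-it",
    }

    def resolve_model(requested: str) -> str:
        # Unknown aliases fall back to the "default" entry.
        return MODELS.get(requested, MODELS["default"])

    assert resolve_model("google/gemma-3-27b-it") == "google/gemma-2-9b-it"
    assert resolve_model("some/unknown-model") == "google/gemma-2-2b-it"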
@@ -71,6 +74,53 @@ async def verify_api_key(authorization: str = Header(None)):
 def read_root():
     return {"status": "online", "service": "CygnisAI Studio API", "hf_token_set": bool(HF_TOKEN)}
 
+def call_hf_api(model_id, messages, req):
+    """Helper that calls the HF API, handling both the Chat and Standard endpoints"""
+    headers = {
+        "Authorization": f"Bearer {HF_TOKEN}",
+        "Content-Type": "application/json"
+    }
+
+    # 1. Try the Chat API first
+    hf_chat_url = f"{HF_ROUTER_BASE}/{model_id}/v1/chat/completions"
+    payload_chat = {
+        "model": model_id,
+        "messages": messages,
+        "max_tokens": req.max_tokens,
+        "temperature": req.temperature,
+        "stream": False
+    }
+
+    print(f"🚀 Calling HF Chat API: {hf_chat_url}")
+    response = requests.post(hf_chat_url, headers=headers, json=payload_chat)
+
+    # 2. Fall back to the Standard API
+    if response.status_code in [404, 405]:
+        print(f"🔄 Fallback to standard inference API (Status {response.status_code})")
+        api_url = f"{HF_ROUTER_BASE}/{model_id}"
+
+        prompt_str = ""
+        for msg in messages:
+            role = msg['role']
+            content = msg['content']
+            if role == 'system': prompt_str += f"<|system|>\n{content}</s>\n"
+            elif role == 'user': prompt_str += f"<|user|>\n{content}</s>\n"
+            elif role == 'assistant': prompt_str += f"<|assistant|>\n{content}</s>\n"
+        prompt_str += "<|assistant|>\n"
+
+        payload_standard = {
+            "inputs": prompt_str,
+            "parameters": {
+                "max_new_tokens": req.max_tokens,
+                "temperature": req.temperature,
+                "return_full_text": False
+            }
+        }
+        print(f"🚀 Calling HF Standard API: {api_url}")
+        response = requests.post(api_url, headers=headers, json=payload_standard)
+
+    return response
+
 @app.post("/api/ask", response_model=ChatResponse)
 async def ask_model(req: ChatRequest, authorized: bool = Depends(verify_api_key)):
     print(f"📩 Received request: {req.question[:50]}...")
@@ -87,50 +137,15 @@ async def ask_model(req: ChatRequest, authorized: bool = Depends(verify_api_key)
         messages.append({"role": "system", "content": req.system_prompt})
     messages.append({"role": "user", "content": req.question})
 
-    headers = {
-        "Authorization": f"Bearer {HF_TOKEN}",
-        "Content-Type": "application/json"
-    }
-
     try:
-        # 1. Try the Chat endpoint (OpenAI compatible)
-        # URL: https://router.huggingface.co/hf-inference/models/{model_id}/v1/chat/completions
-        hf_chat_url = f"{HF_ROUTER_BASE}/{model_id}/v1/chat/completions"
-
-        payload_chat = {
-            "model": model_id,
-            "messages": messages,
-            "max_tokens": req.max_tokens,
-            "temperature": req.temperature,
-            "stream": False
-        }
-
-        print(f"🚀 Calling HF Chat API: {hf_chat_url}")
-        response = requests.post(hf_chat_url, headers=headers, json=payload_chat)
-
-        # 2. Fall back to the standard Inference endpoint (if Chat fails with 404 or 405)
-        if response.status_code in [404, 405]:
-            print(f"🔄 Fallback to standard inference API (Status {response.status_code})")
-
-            # URL: https://router.huggingface.co/hf-inference/models/{model_id}
-            # IMPORTANT: we go through the router here as well!
-            api_url = f"{HF_ROUTER_BASE}/{model_id}"
-
-            prompt_str = ""
-            for msg in messages:
-                prompt_str += f"{msg['role']}: {msg['content']}\n"
-            prompt_str += "assistant:"
-
-            payload_standard = {
-                "inputs": prompt_str,
-                "parameters": {
-                    "max_new_tokens": req.max_tokens,
-                    "temperature": req.temperature,
-                    "return_full_text": False
-                }
-            }
-            print(f"🚀 Calling HF Standard API: {api_url}")
-            response = requests.post(api_url, headers=headers, json=payload_standard)
+        # First attempt with the requested model
+        response = call_hf_api(model_id, messages, req)
+
+        # If 404/503/500, try the SAFETY NET
+        if response.status_code != 200:
+            print(f"⚠️ Primary model failed ({response.status_code}). Switching to SAFETY NET: {SAFETY_NET_MODEL}")
+            model_id = SAFETY_NET_MODEL
+            response = call_hf_api(SAFETY_NET_MODEL, messages, req)
 
         if response.status_code != 200:
             print(f"❌ HF Error ({response.status_code}): {response.text}")
 