Update app.py
app.py CHANGED
@@ -4,10 +4,11 @@
 # • Chat: BioMedLM-7B in GGUF format with llama.cpp (also ONLY in the GPU worker).
 # • No nested GPU calls; everything wrapped in try/except to avoid generic RuntimeErrors.
 # • Prompt reinforced in Spanish and deterministic generation (OCR-aware, no hallucinations).
-# •
+# • Configured via environment variables: GGUF_REPO, GGUF_FILE, GGUF_LOCAL_PATH, N_CTX, N_BATCH, N_GPU_LAYERS, etc.
+# • Defaults: public repo mradermacher/BioMedLM-7B-GGUF + Q4_K_M.
 # -----------------------------------------------------------------------------------------------
 
-import os, re, tempfile, traceback
+import os, re, tempfile, traceback, glob
 import gradio as gr
 import torch
 from PIL import Image
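The new header bullets describe configuration purely through environment variables. As a hedged illustration (not part of the diff), this is how those variables could be overridden for a local or CPU-only run; the values are examples only, and they must be exported before app.py is imported, since the module reads them at import time with os.getenv:

# Illustrative override of the documented environment variables; example values only.
import os

os.environ["GGUF_REPO"] = "mradermacher/BioMedLM-7B-GGUF"      # HF repo holding the GGUF builds
os.environ["GGUF_FILE"] = "BioMedLM-7B.Q4_K_M.gguf"            # exact quantization file to fetch
# os.environ["GGUF_LOCAL_PATH"] = "./models/BioMedLM-7B.Q4_K_M.gguf"  # or skip the Hub entirely
os.environ["N_GPU_LAYERS"] = "0"                                # hypothetical CPU-only fallback
os.environ["N_CTX"] = "2048"                                    # smaller context to save memory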
@@ -20,26 +21,27 @@ from llama_cpp import Llama
 # CONFIG (environment)
 # =========================
 # --- BioMedLM-7B (GGUF / llama.cpp) ---
-GGUF_REPO = os.getenv("GGUF_REPO", "").strip()
-GGUF_FILE = os.getenv("GGUF_FILE", "").strip()
-
+GGUF_REPO = os.getenv("GGUF_REPO", "mradermacher/BioMedLM-7B-GGUF").strip()
+GGUF_FILE = os.getenv("GGUF_FILE", "BioMedLM-7B.Q4_K_M.gguf").strip()
+GGUF_LOCAL_PATH = os.getenv("GGUF_LOCAL_PATH", "").strip()  # e.g. ./models/BioMedLM-7B.Q4_K_M.gguf
+
+# Common candidate names (in case the exact filename does not match)
 _GGUF_CANDIDATES = [
-    "BioMedLM-7B.Q4_K_M.gguf",
-    "BioMedLM-7B.Q5_K_M.gguf",
-    "BioMedLM-7B.Q8_0.gguf",
-    "BioMedLM-7B-f16.gguf",
-    "biomedlm-7b.Q4_K_M.gguf",
-    "biomedlm-7b.Q5_K_M.gguf",
-    "biomedlm-7b.Q8_0.gguf",
-    "biomedlm-7b-f16.gguf",
+    "BioMedLM-7B.Q4_K_M.gguf", "BioMedLM-7B.Q4_K_S.gguf",
+    "BioMedLM-7B.Q5_K_M.gguf", "BioMedLM-7B.Q5_K_S.gguf",
+    "BioMedLM-7B.Q6_K.gguf", "BioMedLM-7B.Q8_0.gguf",
+    "BioMedLM-7B.IQ4_XS.gguf", "BioMedLM-7B.Q2_K.gguf",
+    "BioMedLM-7B.f16.gguf",
+    "biomedlm-7b.Q4_K_M.gguf", "biomedlm-7b.Q5_K_M.gguf",
+    "biomedlm-7b.Q8_0.gguf", "biomedlm-7b-f16.gguf",
 ]
 GGUF_CANDIDATES = [GGUF_FILE] if GGUF_FILE else _GGUF_CANDIDATES
 
 # Performance / memory (tune for the Space's GPU: T4 / A10G)
 N_CTX = int(os.getenv("N_CTX", "4096"))
 N_THREADS = int(os.getenv("N_THREADS", str(os.cpu_count() or 4)))
-N_GPU_LAYERS = int(os.getenv("N_GPU_LAYERS", "35"))  # 7B ~32 layers; 35 ≈ all of them
-N_BATCH = int(os.getenv("N_BATCH", "512"))  #
+N_GPU_LAYERS = int(os.getenv("N_GPU_LAYERS", "35"))  # 7B ~32 layers; 35 ≈ all of them (if there is enough VRAM)
+N_BATCH = int(os.getenv("N_BATCH", "512"))  # 512/768/1024 depending on VRAM
 
 # Decoding (deterministic by default)
 GEN_TEMPERATURE = float(os.getenv("TEMPERATURE", "0.0"))
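For context, a minimal sketch of how these knobs usually map onto llama-cpp-python's Llama constructor. The app's real _ensure_llm() call is outside this hunk, so the argument wiring below is an assumption, not the app's exact code:

from llama_cpp import Llama  # same binding the app imports

# Sketch only: argument names are llama-cpp-python's, values mirror the defaults above.
llm = Llama(
    model_path="BioMedLM-7B.Q4_K_M.gguf",  # path resolved by _download_gguf_path()
    n_ctx=4096,        # N_CTX: prompt + completion window
    n_threads=8,       # N_THREADS: CPU threads for non-offloaded work
    n_gpu_layers=35,   # N_GPU_LAYERS: 35 offloads all ~32 layers of a 7B model
    n_batch=512,       # N_BATCH: prompt-processing batch size; raise with more VRAM
    verbose=False,
)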
@@ -47,7 +49,7 @@ GEN_TOP_P = float(os.getenv("TOP_P", "1.0"))
 GEN_MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", "384"))
 STOP_SEQS = ["\n###", "\nUser:", "\nAssistant:", "\nUsuario:", "\nAsistente:"]
 
-# Token for private HF repos (optional)
+# Token for private HF repos (optional; this repo is public)
 HF_TOKEN = os.getenv("HF_TOKEN")
 
 # DeepSeek-OCR: pin an optional revision/commit to avoid unexpected changes
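A short sketch of a deterministic completion call that uses these decoding settings and stop sequences; here `llm` is assumed to be the llama_cpp.Llama instance initialized elsewhere in app.py, and `prompt` any OCR-grounded prompt string:

# Sketch only: llama-cpp-python completion call with the decoding defaults above.
out = llm(
    prompt,
    max_tokens=384,    # GEN_MAX_NEW_TOKENS
    temperature=0.0,   # GEN_TEMPERATURE: greedy decoding, reproducible output
    top_p=1.0,         # GEN_TOP_P: no nucleus truncation
    stop=["\n###", "\nUser:", "\nAssistant:", "\nUsuario:", "\nAsistente:"],  # STOP_SEQS
)
answer = out["choices"][0]["text"].strip()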
@@ -128,21 +130,42 @@ _llm = None
 _llm_name = None
 
 def _download_gguf_path():
-    """
+    """
+    Search priority:
+      0) GGUF_LOCAL_PATH (direct path)
+      1) File uploaded to the Space (cwd): exact GGUF_FILE or any *.gguf
+      2) HF repo: GGUF_REPO + GGUF_FILE (or candidates)
+    """
+    # 0) Explicit local path
+    if GGUF_LOCAL_PATH:
+        p = os.path.abspath(GGUF_LOCAL_PATH)
+        if os.path.exists(p):
+            return p, p
+        raise RuntimeError(f"GGUF_LOCAL_PATH points to a file that does not exist: {p}")
+
+    # 1) File uploaded to the Space
+    if GGUF_FILE:
+        local_path = os.path.join(os.getcwd(), GGUF_FILE)
+        if os.path.exists(local_path):
+            return local_path, f"./{GGUF_FILE}"
+    found = sorted(glob.glob(os.path.join(os.getcwd(), "*.gguf")))
+    if found:
+        return found[0], f"./{os.path.basename(found[0])}"
+
+    # 2) HF repo
     last_err = None
     if GGUF_REPO:
-        for fname in GGUF_CANDIDATES:
+        candidates = [GGUF_FILE] if GGUF_FILE else _GGUF_CANDIDATES
+        for fname in candidates:
             try:
                 path = hf_hub_download(repo_id=GGUF_REPO, filename=fname, token=HF_TOKEN)
                 return path, f"{GGUF_REPO}:{fname}"
             except Exception as e:
                 last_err = e
-        return local_path, f"./{fname}"
-    raise RuntimeError(f"GGUF not found. Set GGUF_REPO/GGUF_FILE or upload the .gguf. Last error: {last_err}")
+
+    raise RuntimeError("GGUF not found. "
+                       "Upload the .gguf (Files) and set GGUF_FILE, or define GGUF_REPO+GGUF_FILE, "
+                       f"or use GGUF_LOCAL_PATH. Last HF error: {last_err}")
 
 def _ensure_llm():
     """Initializes llama.cpp in the SAME worker; never lets an exception propagate upward."""
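A quick, hypothetical sanity check of the new search order (not part of the diff): it simply calls the rewritten _download_gguf_path() and reports which of the three sources, local path, uploaded file, or HF repo, resolved the model:

if __name__ == "__main__":
    try:
        gguf_path, gguf_source = _download_gguf_path()
        print(f"GGUF resolved from {gguf_source}: {gguf_path}")
    except RuntimeError as err:
        print(f"No GGUF available: {err}")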