jorgeiv500 committed (verified)
Commit ac0510e · 1 Parent(s): 30bf3c9

Update app.py

Files changed (1)
  1. app.py +154 -116
app.py CHANGED
@@ -1,5 +1,8 @@
- # app.py — DeepSeek-OCR + Med42 (HF conversational) — Gradio 5
- # ZeroGPU-safe (no CUDA initialization in the main process)

  import os, re, json, tempfile, traceback
  import gradio as gr
@@ -7,39 +10,57 @@ import torch
  from PIL import Image
  from transformers import AutoModel, AutoTokenizer
  import spaces
- from huggingface_hub import InferenceClient
- import requests

  # =========================
  # CONFIG (env)
  # =========================
- LLM_MODEL_ID = os.getenv("BIO_MODEL_ID", "m42-health/Llama3-Med42-8B").strip()
- HF_TOKEN = os.getenv("HF_TOKEN")
-
- # Deterministic generation for instruction adherence
- GEN_TEMPERATURE = float(os.getenv("GEN_TEMPERATURE", "0.0"))
- GEN_TOP_P = float(os.getenv("GEN_TOP_P", "1.0"))
- GEN_MAX_NEW_TOKENS = int(os.getenv("GEN_MAX_NEW_TOKENS", "384"))
- GEN_TIMEOUT = int(os.getenv("GEN_TIMEOUT", "60"))  # seconds
- STOP_SEQS = ["\n###", "\nUser:", "\nAssistant:"]
-
- # (Optional) pin a stable revision of the DeepSeek-OCR repo to avoid unexpected changes
- DS_OCR_REV = os.getenv("DS_OCR_REV", None)  # e.g., a commit hash
-
- # Remote client (does not touch CUDA here)
- _hf_client = InferenceClient(model=LLM_MODEL_ID, token=HF_TOKEN, timeout=GEN_TIMEOUT)

  # =========================
- # Prompt helpers
  # =========================

  def _truncate(s: str, n=3000):
      s = (s or "")
      return s if len(s) <= n else s[:n]

  def _clean_ocr(s: str) -> str:
      if not s: return ""
-     s = re.sub(r'[^\S\r\n]+', ' ', s)           # collapse runs of spaces (keep newlines)
-     s = re.sub(r'(\{#Sec\d+\}|#+\w*)', ' ', s)  # strip stray anchors/headers
      s = re.sub(r'\s{2,}', ' ', s)
      lines = []
      for par in s.splitlines():
@@ -77,103 +98,87 @@ SALIDA_ES:
  - Evidencia OCR: "Indicaciones ilegibles"
  """.strip()

- def build_chat_messages(chat_msgs, ocr_md, ocr_txt, user_msg):
      raw = ocr_md if (ocr_md and ocr_md.strip()) else ocr_txt
      ctx = _truncate(_clean_ocr(raw), 3000)
-     # Build the user content from the OCR context plus the few-shot examples
      question = (user_msg or "Analiza el CONTEXTO_OCR y resume lo clínicamente relevante en viñetas.").strip()
-
-     user_content = (
-         f"{FEWSHOT}\n\n### CONTEXTO_OCR\n{(ctx if ctx else '—')}\n\n"
          f"### PREGUNTA\n{question}\n\n"
-         "### SALIDA_ES\n"
      )
-
-     msgs = [{"role": "system", "content": SYSTEM_INSTR}]
-     # optional: include the chat history as prior messages
-     for m in (chat_msgs or []):
-         r = m.get("role")
-         c = (m.get("content") or "").strip()
-         if not c:
-             continue
-         if r == "user":
-             msgs.append({"role": "user", "content": c})
-         elif r == "assistant":
-             msgs.append({"role": "assistant", "content": c})
-
-     msgs.append({"role": "user", "content": user_content})
-     return msgs
 
  # =========================
- # Remote LLM (Med42), conversational task
  # =========================
- def med42_remote_generate(messages) -> tuple[str, str]:
-     """
-     1) InferenceClient.chat.completions.create (conversational task)
-     2) Fallback to the HTTP router: /v1/chat/completions
-     """
      try:
-         resp = _hf_client.chat.completions.create(
-             model=LLM_MODEL_ID,
              messages=messages,
-             max_tokens=GEN_MAX_NEW_TOKENS,
-             temperature=GEN_TEMPERATURE,
-             top_p=GEN_TOP_P,
-             stop=STOP_SEQS,
          )
-         answer = (resp.choices[0].message.content or "").strip()
-         return answer, ""
-     except Exception as e1:
-         # Fall back to the new router
-         try:
-             headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
-             payload = {
-                 "model": LLM_MODEL_ID,
-                 "messages": messages,
-                 "max_tokens": GEN_MAX_NEW_TOKENS,
-                 "temperature": GEN_TEMPERATURE,
-                 "top_p": GEN_TOP_P,
-                 "stop": STOP_SEQS,
-             }
-             for url in [
-                 "https://router.huggingface.co/v1/chat/completions",
-                 "https://router.huggingface.co/hf-inference/v1/chat/completions",
-             ]:
-                 r = requests.post(url, headers=headers, json=payload, timeout=GEN_TIMEOUT)
-                 if r.status_code == 200:
-                     data = r.json()
-                     if isinstance(data, dict) and "choices" in data and data["choices"]:
-                         msg = data["choices"][0].get("message") or {}
-                         text = (msg.get("content") or "").strip()
-                         return text, f"[Fallback router: {url}] {e1}"
-             raise RuntimeError(f"HTTP {r.status_code}: {r.text[:800]}")
-         except Exception as e2:
-             raise RuntimeError(
-                 f"Remote generation failed: {e1.__class__.__name__}: {e1} | HTTP fallback: {e2.__class__.__name__}: {e2}"
-             )
-
- def med42_reply(user_msg, chat_msgs, ocr_md, ocr_txt):
-     try:
-         messages = build_chat_messages(chat_msgs, ocr_md, ocr_txt, user_msg)
-         answer, dbg = med42_remote_generate(messages)
-         updated = (chat_msgs or []) + [
-             {"role": "user", "content": user_msg or "(analizar solo OCR)"},
-             {"role": "assistant", "content": answer}
-         ]
-         return updated, "", gr.update(value=dbg)
      except Exception as e:
-         tb = traceback.format_exc(limit=2)
-         updated = (chat_msgs or []) + [
-             {"role": "user", "content": user_msg or ""},
-             {"role": "assistant", "content": f"⚠️ Error LLM: {e}"}
-         ]
-         return updated, "", gr.update(value=f"{e}\n{tb}")
-
- def clear_chat():
-     return [], "", gr.update(value="")

  # =========================
- # DeepSeek-OCR (no CUDA in main; GPU only inside the worker)
  # =========================
  def _load_ocr_model():
      model_name = "deepseek-ai/DeepSeek-OCR"
@@ -189,7 +194,6 @@ def _load_ocr_model():
          mdl = AutoModel.from_pretrained(model_name, **kwargs).eval()
          return tok, mdl
      except Exception as e:
-         # Fallback if FlashAttention-2 is unavailable
          if any(k in str(e).lower() for k in ["flash_attn", "flashattention2", "flash_attention_2"]):
              kwargs["_attn_implementation"] = "eager"
              mdl = AutoModel.from_pretrained(model_name, **kwargs).eval()
@@ -202,7 +206,6 @@ tokenizer, model = _load_ocr_model()
  def process_image(image, model_size, task_type, is_eval_mode):
      if image is None:
          return None, "Please upload an image first.", "Please upload an image first."
-
      # move to GPU ONLY inside the worker
      if torch.cuda.is_available():
          dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
@@ -253,15 +256,46 @@ def process_image(image, model_size, task_type, is_eval_mode):
      text_result = plain_text if plain_text else markdown_content
      return result_image, markdown_content, text_result
  # =========================
  # UI (Gradio 5)
  # =========================
- with gr.Blocks(title="DeepSeek-OCR + Med42 (Conversational)", theme=gr.themes.Soft()) as demo:
      gr.Markdown(
          """
-         # DeepSeek-OCR → Chat Clínico con **Med42**
          1) **Sube una imagen** y corre **OCR** (imagen anotada, Markdown y texto).
-         2) **Chatea** con **Med42** usando automáticamente el **OCR** como contexto.
          *Uso educativo; no reemplaza consejo médico.*
          """
      )
@@ -283,6 +317,7 @@ with gr.Blocks(title="DeepSeek-OCR + Med42 (Conversational)", theme=gr.themes.So
              eval_mode_checkbox = gr.Checkbox(value=False, label="Enable Evaluation Mode",
                                               info="Solo texto (más rápido). Desmárcalo para ver imagen anotada y markdown.")
              submit_btn = gr.Button("Process Image", variant="primary")

          with gr.Column(scale=2):
              with gr.Tabs():
@@ -296,16 +331,16 @@ with gr.Blocks(title="DeepSeek-OCR + Med42 (Conversational)", theme=gr.themes.So
                  md_preview = gr.Textbox(label="Snapshot Markdown OCR", lines=10, interactive=False)
                  txt_preview = gr.Textbox(label="Snapshot Texto OCR", lines=10, interactive=False)

-     gr.Markdown("## Chat Clínico (Med42)")
      with gr.Row():
          with gr.Column(scale=2):
-             chatbot = gr.Chatbot(label="Asistente OCR (Med42)", type="messages", height=420)
              user_in = gr.Textbox(label="Mensaje", placeholder="Escribe tu consulta… (vacío = analiza solo el OCR)", lines=2)
              with gr.Row():
                  send_btn = gr.Button("Enviar", variant="primary")
                  clear_btn = gr.Button("Limpiar")
          with gr.Column(scale=1):
-             error_box = gr.Textbox(label="Debug (si hay error)", lines=8, interactive=False)

      # OCR
      submit_btn.click(
@@ -318,13 +353,16 @@ with gr.Blocks(title="DeepSeek-OCR + Med42 (Conversational)", theme=gr.themes.So
          outputs=[ocr_md_state, ocr_txt_state, md_preview, txt_preview],
      )

      # Chat
      send_btn.click(
-         fn=med42_reply,
          inputs=[user_in, chatbot, ocr_md_state, ocr_txt_state],
-         outputs=[chatbot, user_in, error_box]
      )
-     clear_btn.click(fn=clear_chat, outputs=[chatbot, user_in, error_box])

  if __name__ == "__main__":
      demo.queue(max_size=20)
 
+ # app.py — DeepSeek-OCR + BioMedLM-7B (GGUF via local llama.cpp, ZeroGPU-safe) — Gradio 5
+ # - OCR with DeepSeek-OCR (GPU only inside @spaces.GPU)
+ # - Chat with BioMedLM-7B GGUF through llama.cpp (GPU only inside @spaces.GPU)
+ # - Reinforced few-shot prompt and deterministic decoding
+ # - Configurable via env vars: GGUF_REPO, GGUF_FILE, N_CTX, N_BATCH, N_GPU_LAYERS

  import os, re, json, tempfile, traceback
  import gradio as gr
  import torch
  from PIL import Image
  from transformers import AutoModel, AutoTokenizer
  import spaces
+ from huggingface_hub import hf_hub_download
+ from llama_cpp import Llama

  # =========================
  # CONFIG (env)
  # =========================
+ # --- llama.cpp (BioMedLM-7B GGUF) ---
+ GGUF_REPO = os.getenv("GGUF_REPO", "").strip()  # e.g. "theuser/biomedlm-7b-gguf" (set your own)
+ GGUF_FILE = os.getenv("GGUF_FILE", "").strip()  # e.g. "BioMedLM-7B.Q4_K_M.gguf"
+ # default candidates when GGUF_FILE is not set
+ _GGUF_CANDIDATES = [
+     "BioMedLM-7B.Q4_K_M.gguf",
+     "BioMedLM-7B.Q5_K_M.gguf",
+     "BioMedLM-7B.Q8_0.gguf",
+     "BioMedLM-7B-f16.gguf",
+     "biomedlm-7b.Q4_K_M.gguf",
+     "biomedlm-7b.Q5_K_M.gguf",
+     "biomedlm-7b.Q8_0.gguf",
+     "biomedlm-7b-f16.gguf",
+ ]
+ GGUF_CANDIDATES = [GGUF_FILE] if GGUF_FILE else _GGUF_CANDIDATES
+
+ # performance / memory
+ N_CTX = int(os.getenv("N_CTX", "4096"))
+ N_THREADS = int(os.getenv("N_THREADS", str(os.cpu_count() or 4)))
+ N_GPU_LAYERS = int(os.getenv("N_GPU_LAYERS", "35"))  # a 7B model has ~32 layers; 35 ≈ "all of them"
+ N_BATCH = int(os.getenv("N_BATCH", "512"))           # raise to 1024 if your GPU allows it
+
+ # deterministic generation for instruction adherence
+ GEN_TEMPERATURE = float(os.getenv("TEMPERATURE", "0.0"))
+ GEN_TOP_P = float(os.getenv("TOP_P", "1.0"))
+ GEN_MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", "384"))
+ STOP_SEQS = ["\n###", "\nUser:", "\nAssistant:", "\nUsuario:", "\nAsistente:"]
+
+ # optional DeepSeek-OCR revision pin to avoid unexpected upstream changes
+ DS_OCR_REV = os.getenv("DS_OCR_REV", None)  # e.g. a commit hash
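Since every knob above is env-driven, a deployment can switch quantizations or context sizes without touching the code. A minimal sketch of overriding them for a local run (the values and repo id are illustrative placeholders, not defaults):

```python
import os

# Illustrative overrides — on a Space these would be repository variables/secrets.
# They must be set before app.py is imported, since the module reads them at import time.
os.environ["GGUF_REPO"] = "theuser/biomedlm-7b-gguf"  # placeholder repo id
os.environ["GGUF_FILE"] = "BioMedLM-7B.Q5_K_M.gguf"   # pin one file, skip the candidate scan
os.environ["N_CTX"] = "8192"                          # larger context window, more VRAM
os.environ["N_GPU_LAYERS"] = "0"                      # CPU-only fallback for local debugging
```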
 
  # =========================
+ # Global state (lives only inside GPU workers)
  # =========================
+ _llm = None
+ _llm_name = None
+
  def _truncate(s: str, n=3000):
      s = (s or "")
      return s if len(s) <= n else s[:n]

  def _clean_ocr(s: str) -> str:
      if not s: return ""
+     s = re.sub(r'[^\S\r\n]+', ' ', s)           # collapse runs of spaces (keep newlines)
+     s = re.sub(r'(\{#Sec\d+\}|#+\w*)', ' ', s)  # strip stray anchors/headers
      s = re.sub(r'\s{2,}', ' ', s)
      lines = []
      for par in s.splitlines():
 
  - Evidencia OCR: "Indicaciones ilegibles"
  """.strip()

+ def build_user_prompt(ocr_md, ocr_txt, user_msg):
      raw = ocr_md if (ocr_md and ocr_md.strip()) else ocr_txt
      ctx = _truncate(_clean_ocr(raw), 3000)
      question = (user_msg or "Analiza el CONTEXTO_OCR y resume lo clínicamente relevante en viñetas.").strip()
+     prompt = (
+         f"{FEWSHOT}\n\n"
+         f"### CONTEXTO_OCR\n{(ctx if ctx else '—')}\n\n"
          f"### PREGUNTA\n{question}\n\n"
+         f"### SALIDA_ES\n"
      )
+     return prompt
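For reference, with an empty OCR snapshot the string returned by build_user_prompt reduces to the skeleton below (FEWSHOT elided); the leading `### ` section markers are also what the `\n###` entry in STOP_SEQS cuts on:

```python
# Sketch of build_user_prompt("", "", None) — FEWSHOT elided:
#
#   <FEWSHOT examples>
#
#   ### CONTEXTO_OCR
#   —
#
#   ### PREGUNTA
#   Analiza el CONTEXTO_OCR y resume lo clínicamente relevante en viñetas.
#
#   ### SALIDA_ES
```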

  # =========================
+ # BioMedLM-7B GGUF — llama.cpp (GPU only in the worker)
  # =========================
+ def _download_gguf_path():
+     last_err = None
+     if GGUF_REPO:
+         for fname in GGUF_CANDIDATES:
+             try:
+                 path = hf_hub_download(repo_id=GGUF_REPO, filename=fname)
+                 return path, f"{GGUF_REPO}:{fname}"
+             except Exception as e:
+                 last_err = e
+     # fallback: a .gguf uploaded straight into the Space repo (working directory)
+     for fname in GGUF_CANDIDATES:
+         local_path = os.path.join(os.getcwd(), fname)
+         if os.path.exists(local_path):
+             return local_path, f"./{fname}"
+     raise RuntimeError(f"No se pudo localizar el GGUF. Configura GGUF_REPO/GGUF_FILE o sube el .gguf. Último error: {last_err}")
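If this lookup keeps failing, it can be reproduced in isolation outside the app; a minimal sketch (the repo id and filename are placeholders for your own GGUF export):

```python
from huggingface_hub import hf_hub_download

# Placeholder coordinates — substitute the repo/file you actually published.
path = hf_hub_download(repo_id="theuser/biomedlm-7b-gguf",
                       filename="BioMedLM-7B.Q4_K_M.gguf")
print(path)  # resolved file inside the local HF cache
```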
+
+ @spaces.GPU
+ def biomedlm_warmup():
+     """Initialize llama.cpp inside the GPU worker (avoids touching CUDA in main)."""
+     global _llm, _llm_name
+     if _llm is not None:
+         return f"OK::warm (reusing {_llm_name})"
+     gguf_path, used = _download_gguf_path()
+     _llm = Llama(
+         model_path=gguf_path,
+         n_ctx=N_CTX,
+         n_threads=N_THREADS,
+         n_gpu_layers=N_GPU_LAYERS,
+         n_batch=N_BATCH,
+         # default decoding: greedy (no sampling)
+         verbose=False,
+     )
+     _llm_name = used
+     return f"OK::loaded {used}"
+
+ def _to_chatml(system_prompt, user_prompt):
+     # simple ChatML-compatible message list for llama.cpp
+     return [
+         {"role": "system", "content": system_prompt},
+         {"role": "user", "content": user_prompt},
+     ]
+
+ @spaces.GPU
+ def biomedlm_chat(ocr_md, ocr_txt, user_msg, temperature=GEN_TEMPERATURE, top_p=GEN_TOP_P, max_tokens=GEN_MAX_NEW_TOKENS):
+     """Generate inside the GPU worker with the already-initialized LLM."""
+     global _llm
+     if _llm is None:
+         status = biomedlm_warmup()
+         if not str(status).startswith("OK::"):
+             return "ERR::No se pudo inicializar el modelo GGUF"
+     prompt_user = build_user_prompt(ocr_md, ocr_txt, user_msg)
+     messages = _to_chatml(SYSTEM_INSTR, prompt_user)
      try:
+         out = _llm.create_chat_completion(
              messages=messages,
+             temperature=temperature,
+             top_p=top_p,
+             max_tokens=max_tokens,
+             stop=STOP_SEQS,  # added here so STOP_SEQS (defined above) is actually applied
          )
+         ans = out["choices"][0]["message"]["content"]
+         return "OK::" + (ans or "").strip()
      except Exception as e:
+         return f"ERR::[{e.__class__.__name__}] {str(e) or repr(e)}"
  # =========================
+ # DeepSeek-OCR (GPU only inside the worker)
  # =========================
  def _load_ocr_model():
      model_name = "deepseek-ai/DeepSeek-OCR"

          mdl = AutoModel.from_pretrained(model_name, **kwargs).eval()
          return tok, mdl
      except Exception as e:
          if any(k in str(e).lower() for k in ["flash_attn", "flashattention2", "flash_attention_2"]):
              kwargs["_attn_implementation"] = "eager"
              mdl = AutoModel.from_pretrained(model_name, **kwargs).eval()
 
  def process_image(image, model_size, task_type, is_eval_mode):
      if image is None:
          return None, "Please upload an image first.", "Please upload an image first."
      # move to GPU ONLY inside the worker
      if torch.cuda.is_available():
          dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

      text_result = plain_text if plain_text else markdown_content
      return result_image, markdown_content, text_result

+ # =========================
+ # Chat wrapper for the UI
+ # =========================
+ def biomedlm_reply(user_msg, chat_msgs, ocr_md, ocr_txt):
+     try:
+         res = biomedlm_chat(ocr_md, ocr_txt, user_msg, temperature=GEN_TEMPERATURE, top_p=GEN_TOP_P, max_tokens=GEN_MAX_NEW_TOKENS)
+         if str(res).startswith("OK::"):
+             answer = res[4:]
+             updated = (chat_msgs or []) + [
+                 {"role": "user", "content": user_msg or "(analizar solo OCR)"},
+                 {"role": "assistant", "content": answer}
+             ]
+             return updated, "", gr.update(value="")
+         else:
+             err_msg = res[5:] if str(res).startswith("ERR::") else str(res)
+             updated = (chat_msgs or []) + [
+                 {"role": "user", "content": user_msg or ""},
+                 {"role": "assistant", "content": "⚠️ Error LLM (local). Revisa el panel de debug."}
+             ]
+             return updated, "", gr.update(value=err_msg)
+     except Exception as e:
+         tb = traceback.format_exc(limit=2)
+         updated = (chat_msgs or []) + [
+             {"role": "user", "content": user_msg or ""},
+             {"role": "assistant", "content": f"⚠️ Error LLM: {e}"}
+         ]
+         return updated, "", gr.update(value=f"{e}\n{tb}")
+
+ def clear_chat():
+     return [], "", gr.update(value="")
+
  # =========================
  # UI (Gradio 5)
  # =========================
+ with gr.Blocks(title="OpScanIA — DeepSeek-OCR + BioMedLM-7B (GGUF)", theme=gr.themes.Soft()) as demo:
      gr.Markdown(
          """
+         # DeepSeek-OCR → Chat Clínico con **BioMedLM-7B (GGUF local)**
          1) **Sube una imagen** y corre **OCR** (imagen anotada, Markdown y texto).
+         2) **Chatea** con **BioMedLM-7B GGUF (llama.cpp)** usando automáticamente el **OCR** como contexto.
          *Uso educativo; no reemplaza consejo médico.*
          """
      )
 
              eval_mode_checkbox = gr.Checkbox(value=False, label="Enable Evaluation Mode",
                                               info="Solo texto (más rápido). Desmárcalo para ver imagen anotada y markdown.")
              submit_btn = gr.Button("Process Image", variant="primary")
+             warm_btn = gr.Button("Warmup BioMedLM-7B (GGUF)")
 
          with gr.Column(scale=2):
              with gr.Tabs():

                  md_preview = gr.Textbox(label="Snapshot Markdown OCR", lines=10, interactive=False)
                  txt_preview = gr.Textbox(label="Snapshot Texto OCR", lines=10, interactive=False)

+     gr.Markdown("## Chat Clínico (BioMedLM-7B GGUF)")
      with gr.Row():
          with gr.Column(scale=2):
+             chatbot = gr.Chatbot(label="Asistente OCR (BioMedLM-7B GGUF)", type="messages", height=420)
              user_in = gr.Textbox(label="Mensaje", placeholder="Escribe tu consulta… (vacío = analiza solo el OCR)", lines=2)
              with gr.Row():
                  send_btn = gr.Button("Enviar", variant="primary")
                  clear_btn = gr.Button("Limpiar")
          with gr.Column(scale=1):
+             debug_box = gr.Textbox(label="Debug", lines=10, interactive=False)
 
      # OCR
      submit_btn.click(

          outputs=[ocr_md_state, ocr_txt_state, md_preview, txt_preview],
      )

+     # Warmup LLM (downloads the GGUF and creates the Llama object on the GPU)
+     warm_btn.click(fn=biomedlm_warmup, outputs=[debug_box])
+
      # Chat
      send_btn.click(
+         fn=biomedlm_reply,
          inputs=[user_in, chatbot, ocr_md_state, ocr_txt_state],
+         outputs=[chatbot, user_in, debug_box]
      )
+     clear_btn.click(fn=clear_chat, outputs=[chatbot, user_in, debug_box])

  if __name__ == "__main__":
      demo.queue(max_size=20)