jorgeiv500 committed
Commit b5e8b0f · verified · 1 parent: 5146c1d

Update app.py

Files changed (1)
  1. app.py +160 -395
app.py CHANGED
@@ -1,418 +1,192 @@
- # app.py — OpScanIA: DeepSeek-OCR (GPU) + BioMedLM-7B GGUF (GPU, falls back to CPU on failure) — Gradio 5
- # ------------------------------------------------------------------------------------------------
- # • OCR: DeepSeek-OCR inside @spaces.GPU (no CUDA initialization in the main process).
- # • Chat: tries BioMedLM-7B (GGUF, llama.cpp) inside @spaces.GPU; if ZeroGPU aborts, falls back to local CPU.
- # • Avoids OOM: conservative defaults. The CPU fallback kicks in automatically when "GPU task aborted" is detected.
- # Config: GGUF_REPO/GGUF_FILE, or upload the .gguf to Files. Default repo: mradermacher/BioMedLM-7B-GGUF.
- # ------------------------------------------------------------------------------------------------
-
- import os, re, glob, tempfile, traceback
  import gradio as gr
  import torch
  from PIL import Image
  from transformers import AutoModel, AutoTokenizer
  import spaces
- from huggingface_hub import hf_hub_download
- from llama_cpp import Llama

  # =========================
- # ENVIRONMENT VARIABLES (adjust as needed)
  # =========================
- GGUF_REPO = os.getenv("GGUF_REPO", "mradermacher/BioMedLM-7B-GGUF").strip()
- GGUF_FILE = os.getenv("GGUF_FILE", "BioMedLM-7B.Q4_K_M.gguf").strip()
- GGUF_LOCAL_PATH = os.getenv("GGUF_LOCAL_PATH", "").strip()
- HF_TOKEN = os.getenv("HF_TOKEN")
-
- # Perf (GPU, conservative for ZeroGPU)
- N_CTX_GPU = int(os.getenv("N_CTX_GPU", "2048"))
- N_BATCH_GPU = int(os.getenv("N_BATCH_GPU", "256"))
- N_GPU_LAYERS = int(os.getenv("N_GPU_LAYERS", "8"))
-
- # Perf (CPU fallback, even more conservative)
- N_CTX_CPU = int(os.getenv("N_CTX_CPU", "1024"))
- N_BATCH_CPU = int(os.getenv("N_BATCH_CPU", "128"))
- N_THREADS = int(os.getenv("N_THREADS", str(os.cpu_count() or 4)))
-
- # Decoding
- GEN_TEMPERATURE = float(os.getenv("TEMPERATURE", "0.0"))
- GEN_TOP_P = float(os.getenv("TOP_P", "1.0"))
- GEN_MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", "192"))  # short, to avoid inflating the KV cache
-
- ALLOW_CPU_FALLBACK = os.getenv("ALLOW_CPU_FALLBACK", "1") == "1"
-
- # OCR config
- DS_OCR_REV = os.getenv("DS_OCR_REV", None)  # pin a commit if you want stability
-
- # Alternate candidates in case the exact filename does not match
- _GGUF_CANDIDATES = [
-     "BioMedLM-7B.Q4_K_M.gguf", "BioMedLM-7B.Q4_K_S.gguf",
-     "BioMedLM-7B.Q5_K_M.gguf", "BioMedLM-7B.Q5_K_S.gguf",
-     "BioMedLM-7B.Q6_K.gguf", "BioMedLM-7B.Q8_0.gguf",
-     "BioMedLM-7B.IQ4_XS.gguf", "BioMedLM-7B.Q2_K.gguf",
-     "BioMedLM-7B.f16.gguf",
-     "biomedlm-7b.Q4_K_M.gguf", "biomedlm-7b.Q5_K_M.gguf",
-     "biomedlm-7b.Q8_0.gguf", "biomedlm-7b-f16.gguf",
- ]
- GGUF_CANDIDATES = [GGUF_FILE] if GGUF_FILE else _GGUF_CANDIDATES
-
- STOP_SEQS = ["\n###", "\nUser:", "\nAssistant:", "\nUsuario:", "\nAsistente:"]
-
- # =========================
- # PROMPT UTILITIES
- # =========================
- def _truncate(s: str, n=3000):
-     s = (s or "")
-     return s if len(s) <= n else s[:n]
-
- def _clean_ocr(s: str) -> str:
-     if not s:
-         return ""
-     import re as _re
-     s = _re.sub(r"[^\S\r\n]+", " ", s)
-     s = _re.sub(r"(\{#Sec\d+\}|#+\w*)", " ", s)
-     s = _re.sub(r"\s{2,}", " ", s)
-     lines = []
-     for par in s.splitlines():
-         par = par.strip()
-         if 0 < len(par) <= 600:
-             lines.append(par)
-     return "\n".join(lines)
-
- SYSTEM_INSTR = (
-     "Eres un analista clínico educativo. Responde SIEMPRE en español. "
-     "Reglas: (1) Usa ÚNICAMENTE el CONTEXTO_OCR; "
-     "(2) Si falta un dato, escribe literalmente: 'dato no disponible en el OCR'; "
-     "(3) No inventes nada; (4) Responde en viñetas claras; "
-     "(5) Cita fragmentos exactos del OCR entre comillas como evidencia."
  )

- FEWSHOT = """
- ### EJEMPLO 1
- CONTEXTO_OCR:
- Paciente: Juan Pérez. Medicamento: Amoxicilina 500 mg cada 8 horas por 7 días.
- PREGUNTA:
- ¿Cuál es el medicamento y la dosis?
- SALIDA_ES:
- - Medicamento: **Amoxicilina**
- - Dosis: **500 mg cada 8 horas por 7 días**
- - Evidencia OCR: "Amoxicilina 500 mg cada 8 horas por 7 días"
-
- ### EJEMPLO 2
- CONTEXTO_OCR:
- Paciente: —. Indicaciones ilegibles.
- PREGUNTA:
- ¿Hay contraindicaciones registradas?
- SALIDA_ES:
- - Contraindicaciones: **dato no disponible en el OCR**
- - Evidencia OCR: "Indicaciones ilegibles"
- """.strip()
-
- def build_user_prompt(ocr_md, ocr_txt, user_msg):
-     raw = ocr_md if (ocr_md and ocr_md.strip()) else ocr_txt
-     ctx = _truncate(_clean_ocr(raw), 2200)  # keep it tight for VRAM/CPU
-     question = (user_msg or "Analiza el CONTEXTO_OCR y resume lo clínicamente relevante en viñetas.").strip()
-     prompt = (
-         f"{FEWSHOT}\n\n"
-         f"### CONTEXTO_OCR\n{(ctx if ctx else '—')}\n\n"
-         f"### PREGUNTA\n{question}\n\n"
-         f"### SALIDA_ES\n"
      )
-     return prompt

- def _to_chatml(system_prompt, user_prompt):
      return [
-         {"role": "system", "content": system_prompt},
-         {"role": "user", "content": user_prompt},
      ]

- # =========================
- # LOCATE THE GGUF FILE
- # =========================
- def _download_gguf_path():
-     # 0) Explicit local path
-     if GGUF_LOCAL_PATH:
-         p = os.path.abspath(GGUF_LOCAL_PATH)
-         if os.path.exists(p):
-             return p, p
-         raise RuntimeError(f"GGUF_LOCAL_PATH apunta a un archivo inexistente: {p}")
-
-     # 1) File uploaded to the Space
-     if GGUF_FILE:
-         local_path = os.path.join(os.getcwd(), GGUF_FILE)
-         if os.path.exists(local_path):
-             return local_path, f"./{GGUF_FILE}"
-     found = sorted(glob.glob(os.path.join(os.getcwd(), "*.gguf")))
-     if found:
-         return found[0], f"./{os.path.basename(found[0])}"
-
-     # 2) HF repo
-     last_err = None
-     if GGUF_REPO:
-         candidates = [GGUF_FILE] if GGUF_FILE else GGUF_CANDIDATES
-         for fname in candidates:
-             try:
-                 path = hf_hub_download(repo_id=GGUF_REPO, filename=fname, token=HF_TOKEN)
-                 return path, f"{GGUF_REPO}:{fname}"
-             except Exception as e:
-                 last_err = e
-     raise RuntimeError("No se encontró el GGUF. Sube el .gguf a Files y pon GGUF_FILE, "
-                        "o define GGUF_REPO+GGUF_FILE, o usa GGUF_LOCAL_PATH. "
-                        f"Último error HF: {last_err}")

  # =========================
- # LLM on GPU (worker) + CPU (fallback)
  # =========================
- _llm_gpu = None
- _llm_gpu_name = None
- _llm_cpu = None
- _llm_cpu_name = None
-
- def _ensure_llm_gpu():
-     global _llm_gpu, _llm_gpu_name
-     if _llm_gpu is not None:
-         return True, f"warm (reusing {_llm_gpu_name})"
-     try:
-         gguf_path, used = _download_gguf_path()
-         _llm_gpu = Llama(
-             model_path=gguf_path,
-             n_ctx=N_CTX_GPU,
-             n_threads=N_THREADS,
-             n_gpu_layers=N_GPU_LAYERS,
-             n_batch=N_BATCH_GPU,
-             use_mmap=True,
-             verbose=False,
-         )
-         _llm_gpu_name = used
-         return True, f"loaded {used}"
-     except Exception as e:
-         return False, f"[{e.__class__.__name__}] {str(e) or repr(e)}"
-
- def _ensure_llm_cpu():
-     global _llm_cpu, _llm_cpu_name
-     if _llm_cpu is not None:
-         return True, f"warm (reusing {_llm_cpu_name})"
-     try:
-         gguf_path, used = _download_gguf_path()
-         _llm_cpu = Llama(
-             model_path=gguf_path,
-             n_ctx=N_CTX_CPU,
-             n_threads=N_THREADS,
-             n_gpu_layers=0,  # force CPU
-             n_batch=N_BATCH_CPU,
-             use_mmap=True,
-             verbose=False,
-         )
-         _llm_cpu_name = used
-         return True, f"loaded CPU {used}"
-     except Exception as e:
-         return False, f"[{e.__class__.__name__}] {str(e) or repr(e)}"
-
- # ---- GPU worker (ZeroGPU) ----
- @spaces.GPU
- def biomedlm_chat_gpu(ocr_md, ocr_txt, user_msg,
-                       temperature=GEN_TEMPERATURE, top_p=GEN_TOP_P, max_tokens=GEN_MAX_NEW_TOKENS):
-     try:
-         ok, msg = _ensure_llm_gpu()
-         if not ok:
-             return "ERR::GPU_INIT::" + msg
-
-         prompt = build_user_prompt(ocr_md, ocr_txt, user_msg)
-         messages = _to_chatml(SYSTEM_INSTR, prompt)
-
-         try:
-             out = _llm_gpu.create_chat_completion(
-                 messages=messages,
-                 temperature=temperature,
-                 top_p=top_p,
-                 max_tokens=max_tokens,
-                 stop=STOP_SEQS,
-             )
-             ans = (out["choices"][0]["message"]["content"] or "").strip()
-             return "OK::" + ans
-         except Exception as e:
-             return f"ERR::GPU_INFER::{e.__class__.__name__}: {str(e) or repr(e)}"
-     except Exception as e:
-         # If the worker aborts, Gradio wraps the error; return a clear marker when we manage to catch it here
-         return f"ERR::GPU_WORKER::{e.__class__.__name__}: {str(e) or repr(e)}"
-
- # ---- CPU fallback (main process, no @spaces.GPU) ----
- def biomedlm_chat_cpu(ocr_md, ocr_txt, user_msg,
-                       temperature=GEN_TEMPERATURE, top_p=GEN_TOP_P, max_tokens=GEN_MAX_NEW_TOKENS):
-     ok, msg = _ensure_llm_cpu()
-     if not ok:
-         return "ERR::CPU_INIT::" + msg
-     prompt = build_user_prompt(ocr_md, ocr_txt, user_msg)
-     messages = _to_chatml(SYSTEM_INSTR, prompt)
-     try:
-         out = _llm_cpu.create_chat_completion(
-             messages=messages,
-             temperature=temperature,
-             top_p=top_p,
-             max_tokens=max_tokens,
-             stop=STOP_SEQS,
-         )
-         ans = (out["choices"][0]["message"]["content"] or "").strip()
-         return "OK::" + ans
-     except Exception as e:
-         return f"ERR::CPU_INFER::{e.__class__.__name__}: {str(e) or repr(e)}"

- # =========================
- # DeepSeek OCR (GPU worker)
- # =========================
  def _load_ocr_model():
-     model_name = "deepseek-ai/DeepSeek-OCR"
-     tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-     kwargs = dict(
-         _attn_implementation=os.getenv("OCR_ATTN_IMPL", "flash_attention_2"),
-         trust_remote_code=True,
-         use_safetensors=True,
-     )
-     if DS_OCR_REV:
-         kwargs["revision"] = DS_OCR_REV
      try:
-         mdl = AutoModel.from_pretrained(model_name, **kwargs).eval()
          return tok, mdl
      except Exception as e:
          if any(k in str(e).lower() for k in ["flash_attn", "flashattention2", "flash_attention_2"]):
-             kwargs["_attn_implementation"] = "eager"
-             mdl = AutoModel.from_pretrained(model_name, **kwargs).eval()
              return tok, mdl
          raise

- tokenizer, model = _load_ocr_model()

- @spaces.GPU
- def process_image(image, model_size, task_type, is_eval_mode):
      if image is None:
-         return None, "Please upload an image first.", "Please upload an image first."

-     if torch.cuda.is_available():
-         dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
-         model_device = model.to(dtype).to("cuda")
-     else:
-         dtype = torch.float32
-         model_device = model.to(dtype)

-     with tempfile.TemporaryDirectory() as output_path:
          prompt = "<image>\nFree OCR. " if task_type == "Free OCR" else "<image>\n<|grounding|>Convert the document to markdown. "
-         temp_image_path = os.path.join(output_path, "temp_image.jpg")
-         image.save(temp_image_path)

-         size_cfg = {
-             "Tiny": (512, 512, False),
-             "Small": (640, 640, False),
-             "Base": (1024, 1024, False),
-             "Large": (1280, 1280, False),
-             "Gundam (Recommended)": (1024, 640, True),
          }
-         base_size, image_size, crop_mode = size_cfg.get(model_size, (1024, 640, True))

-         plain_text = model_device.infer(
-             tokenizer,
              prompt=prompt,
-             image_file=temp_image_path,
-             output_path=output_path,
-             base_size=base_size,
-             image_size=image_size,
-             crop_mode=crop_mode,
              save_results=True,
              test_compress=True,
              eval_mode=is_eval_mode,
          )

-         image_result_path = os.path.join(output_path, "result_with_boxes.jpg")
-         markdown_result_path = os.path.join(output_path, "result.mmd")

-         markdown_content = "Markdown result was not generated. This is expected for 'Free OCR' task."
-         if os.path.exists(markdown_result_path):
-             with open(markdown_result_path, "r", encoding="utf-8") as f:
-                 markdown_content = f.read()

-         result_image = None
-         if os.path.exists(image_result_path):
-             result_image = Image.open(image_result_path); result_image.load()
-
-         text_result = plain_text if plain_text else markdown_content
-         return result_image, markdown_content, text_result

  # =========================
- # CHAT ORCHESTRATION (try the GPU; if it aborts, fall back to CPU)
  # =========================
- def biomedlm_reply(user_msg, chat_msgs, ocr_md, ocr_txt):
-     try:
-         # 1) GPU attempt
-         res_gpu = biomedlm_chat_gpu(
-             ocr_md, ocr_txt, user_msg,
-             temperature=GEN_TEMPERATURE, top_p=GEN_TOP_P, max_tokens=GEN_MAX_NEW_TOKENS
-         )
-         s = str(res_gpu)
-
-         # 2) If the GPU fails and the fallback is allowed, fall back to CPU
-         need_cpu = False
-         dbg = ""
-         if not s.startswith("OK::"):
-             dbg = s[5:] if s.startswith("ERR::") else s
-             if ALLOW_CPU_FALLBACK and (
-                 "GPU task aborted" in dbg or "GPU_WORKER" in dbg or "GPU_INIT" in dbg or "GPU_INFER" in dbg
-             ):
-                 need_cpu = True
-
-         if need_cpu:
-             res_cpu = biomedlm_chat_cpu(
-                 ocr_md, ocr_txt, user_msg,
-                 temperature=GEN_TEMPERATURE, top_p=GEN_TOP_P, max_tokens=max(128, GEN_MAX_NEW_TOKENS // 2)
-             )
-             sc = str(res_cpu)
-             if sc.startswith("OK::"):
-                 answer = sc[4:]
-                 updated = (chat_msgs or []) + [
-                     {"role": "user", "content": user_msg or "(analizar solo OCR)"},
-                     {"role": "assistant", "content": answer},
-                 ]
-                 return updated, "", gr.update(value="Fallback CPU OK · " + dbg)
-             else:
-                 err2 = sc[5:] if sc.startswith("ERR::") else sc
-                 updated = (chat_msgs or []) + [
-                     {"role": "user", "content": user_msg or ""},
-                     {"role": "assistant", "content": "⚠️ Error LLM (GPU→CPU). Revisa Debug."},
-                 ]
-                 return updated, "", gr.update(value=f"GPU_FAIL: {dbg}\nCPU_FAIL: {err2}")
-
-         # 3) The GPU path succeeded
-         if s.startswith("OK::"):
-             answer = s[4:]
-             updated = (chat_msgs or []) + [
-                 {"role": "user", "content": user_msg or "(analizar solo OCR)"},
-                 {"role": "assistant", "content": answer},
-             ]
-             return updated, "", gr.update(value="")
-         else:
-             updated = (chat_msgs or []) + [
-                 {"role": "user", "content": user_msg or ""},
-                 {"role": "assistant", "content": "⚠️ Error LLM (GPU). Revisa Debug."},
-             ]
-             return updated, "", gr.update(value=dbg)
-
      except Exception as e:
          tb = traceback.format_exc(limit=2)
-         updated = (chat_msgs or []) + [
              {"role": "user", "content": user_msg or ""},
-             {"role": "assistant", "content": f"⚠️ Error LLM: {e}"},
          ]
-         return updated, "", gr.update(value=f"{e}\n{tb}")

  def clear_chat():
-     return [], "", gr.update(value="")

  # =========================
- # UI (Gradio 5)
  # =========================
- with gr.Blocks(title="OpScanIA — DeepSeek-OCR + BioMedLM-7B (GGUF)", theme=gr.themes.Soft()) as demo:
      gr.Markdown(
          """
-         # DeepSeek-OCR → Chat Clínico con **BioMedLM-7B** (GGUF, llama.cpp)
          1) **Sube una imagen** y corre **OCR** (imagen anotada, Markdown y texto).
-         2) **Chatea** con **BioMedLM-7B**: intenta **GPU** y, si el worker se aborta, usa **CPU fallback**.
          *Uso educativo; no reemplaza consejo médico.*
          """
      )
@@ -422,71 +196,62 @@ with gr.Blocks(title="OpScanIA — DeepSeek-OCR + BioMedLM-7B (GGUF)", theme=gr.

      with gr.Row():
          with gr.Column(scale=1):
-             image_input = gr.Image(type="pil", label="Upload Image",
-                                    sources=["upload", "clipboard", "webcam"])
-             model_size = gr.Dropdown(choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
-                                      value="Gundam (Recommended)", label="Model Size")
-             task_type = gr.Dropdown(choices=["Free OCR", "Convert to Markdown"],
-                                     value="Convert to Markdown", label="Task Type")
-             eval_mode_checkbox = gr.Checkbox(value=False, label="Enable Evaluation Mode",
-                                              info="Solo texto (más rápido). Desmárcalo para ver imagen anotada y markdown.")
              submit_btn = gr.Button("Process Image", variant="primary")
-             warm_gpu_btn = gr.Button("Warmup BioMedLM-7B (GPU)")
-             warm_cpu_btn = gr.Button("Warmup BioMedLM-7B (CPU fallback)")
          with gr.Column(scale=2):
              with gr.Tabs():
-                 with gr.TabItem("Annotated Image"): output_image = gr.Image(interactive=False)
-                 with gr.TabItem("Markdown Preview"): output_markdown = gr.Markdown()
-                 with gr.TabItem("Markdown Source / Eval"):
                      output_text = gr.Textbox(lines=18, show_copy_button=True, interactive=False)
      with gr.Row():
          md_preview = gr.Textbox(label="Snapshot Markdown OCR", lines=8, interactive=False)
          txt_preview = gr.Textbox(label="Snapshot Texto OCR", lines=8, interactive=False)

-     gr.Markdown("## Chat Clínico (BioMedLM-7B)")
      with gr.Row():
          with gr.Column(scale=2):
-             chatbot = gr.Chatbot(label="Asistente OCR (BioMedLM-7B)", type="messages", height=420)
-             user_in = gr.Textbox(label="Mensaje",
-                                  placeholder="Escribe tu consulta… (vacío = analiza solo el OCR)",
-                                  lines=2)
              with gr.Row():
                  send_btn = gr.Button("Enviar", variant="primary")
                  clear_btn = gr.Button("Limpiar")
          with gr.Column(scale=1):
-             debug_box = gr.Textbox(label="Debug", lines=12, interactive=False)

-     # OCR
      submit_btn.click(
-         fn=process_image,
          inputs=[image_input, model_size, task_type, eval_mode_checkbox],
          outputs=[output_image, output_markdown, output_text],
      ).then(
-         fn=lambda md, tx: (_truncate(md, 2200), _truncate(tx, 2200), md, tx),
          inputs=[output_markdown, output_text],
          outputs=[ocr_md_state, ocr_txt_state, md_preview, txt_preview],
      )

-     # Warmups
-     @spaces.GPU
-     def _gpu_warm():
-         ok, msg = _ensure_llm_gpu()
-         return ("OK::" if ok else "ERR::") + msg
-     def _cpu_warm():
-         ok, msg = _ensure_llm_cpu()
-         return ("OK::" if ok else "ERR::") + msg
-
-     warm_gpu_btn.click(fn=_gpu_warm, outputs=[debug_box])
-     warm_cpu_btn.click(fn=_cpu_warm, outputs=[debug_box])
-
-     # Chat
      send_btn.click(
-         fn=biomedlm_reply,
          inputs=[user_in, chatbot, ocr_md_state, ocr_txt_state],
-         outputs=[chatbot, user_in, debug_box]
      )
-     clear_btn.click(fn=clear_chat, outputs=[chatbot, user_in, debug_box])

  if __name__ == "__main__":
-     demo.queue(max_size=20)
-     demo.launch()
 
+ # app.py — DeepSeek-OCR (GPU worker) + TxAgent-T1-Llama-3.1-8B (HF Inference serverless)
+ # ---------------------------------------------------------------------------------------
+ # • OCR: DeepSeek-OCR loaded on CPU and moved to GPU ONLY inside @spaces.GPU (avoids "CUDA in main").
+ # • Chat: mims-harvard/TxAgent-T1-Llama-3.1-8B via InferenceClient (serverless) => no local CUDA.
+ # • Parameters via environment variables:
+ #     HF_TOKEN (required for Inference)
+ #     TX_MODEL_ID=mims-harvard/TxAgent-T1-Llama-3.1-8B
+ #     TX_PROVIDER=hf-inference
+ #     GEN_MAX_NEW_TOKENS=512, GEN_TEMPERATURE=0.2, GEN_TOP_P=0.9
+ #     OCR_REVISION=<optional pinned commit>, OCR_ATTN_IMPL=flash_attention_2 | eager
+ # ---------------------------------------------------------------------------------------
+
+ import os, tempfile, traceback
  import gradio as gr
  import torch
  from PIL import Image
  from transformers import AutoModel, AutoTokenizer
  import spaces
+ from huggingface_hub import InferenceClient

  # =========================
+ # Remote TxAgent chat (HF Inference)
  # =========================
+ TX_MODEL_ID = os.getenv("TX_MODEL_ID", "mims-harvard/TxAgent-T1-Llama-3.1-8B")
+ TX_PROVIDER = os.getenv("TX_PROVIDER", "hf-inference")  # serverless on HF
+ HF_TOKEN = os.getenv("HF_TOKEN")  # <-- required
+
+ GEN_MAX_NEW_TOKENS = int(os.getenv("GEN_MAX_NEW_TOKENS", "512"))
+ GEN_TEMPERATURE = float(os.getenv("GEN_TEMPERATURE", "0.2"))
+ GEN_TOP_P = float(os.getenv("GEN_TOP_P", "0.9"))
+
+ # Client: the timeout goes in the constructor (not in the method call)
+ tx_client = InferenceClient(
+     model=TX_MODEL_ID,
+     provider=TX_PROVIDER,
+     token=HF_TOKEN,
+     timeout=60.0,
  )

+ def _system_prompt():
+     return (
+         "Eres un asistente clínico educativo. NO sustituyes el juicio médico.\n"
+         "Usa CONTEXTO_OCR si existe; si falta, dilo explícitamente. No inventes datos fuera del OCR."
      )

+ def _mk_messages(ocr_md: str, ocr_txt: str, user_msg: str):
+     ctx = (ocr_md or "")[:3000] or (ocr_txt or "")[:3000]
+     sys = _system_prompt()
+     if ctx:
+         sys += "\n\n---\nCONTEXTO_OCR (fuente principal):\n" + ctx + "\n---"
+     if not user_msg:
+         user_msg = "Analiza el CONTEXTO_OCR anterior y responde a partir de ese contenido."
      return [
+         {"role": "system", "content": sys},
+         {"role": "user", "content": user_msg},
      ]

+ def txagent_chat_remote(ocr_md: str, ocr_txt: str, user_msg: str) -> str:
+     messages = _mk_messages(ocr_md, ocr_txt, user_msg)
+     out = tx_client.chat.completions.create(
+         model=TX_MODEL_ID,
+         messages=messages,
+         max_tokens=GEN_MAX_NEW_TOKENS,
+         temperature=GEN_TEMPERATURE,
+         top_p=GEN_TOP_P,
+         stream=False,
+     )
+     return out.choices[0].message.content

  # =========================
+ # DeepSeek-OCR (Transformers); CUDA only inside the worker
  # =========================
+ def _best_dtype():
+     if torch.cuda.is_available():
+         return torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
+     return torch.float32

  def _load_ocr_model():
+     model_id = "deepseek-ai/DeepSeek-OCR"
+     revision = os.getenv("OCR_REVISION", None)  # pin a commit if you want stability
+     attn_impl = os.getenv("OCR_ATTN_IMPL", "flash_attention_2")
+
+     tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, revision=revision)
      try:
+         mdl = AutoModel.from_pretrained(
+             model_id,
+             trust_remote_code=True,
+             use_safetensors=True,
+             _attn_implementation=attn_impl,
+             revision=revision,
+         ).eval()
          return tok, mdl
      except Exception as e:
+         # Fallback when FlashAttention-2 is not available
          if any(k in str(e).lower() for k in ["flash_attn", "flashattention2", "flash_attention_2"]):
+             mdl = AutoModel.from_pretrained(
+                 model_id,
+                 trust_remote_code=True,
+                 use_safetensors=True,
+                 _attn_implementation="eager",
+                 revision=revision,
+             ).eval()
              return tok, mdl
          raise

+ OCR_TOKENIZER, OCR_MODEL = _load_ocr_model()
107
 
108
+ @spaces.GPU # ← toca CUDA solo aquí
109
+ def ocr_infer(image: Image.Image, model_size: str, task_type: str, is_eval_mode: bool):
110
  if image is None:
111
+ return None, "Sube una imagen primero.", "Sube una imagen primero."
112
 
113
+ dtype = _best_dtype()
114
+ model = OCR_MODEL.cuda().to(dtype) if torch.cuda.is_available() else OCR_MODEL.to(dtype)
 
 
 
 
115
 
116
+ with tempfile.TemporaryDirectory() as outdir:
117
  prompt = "<image>\nFree OCR. " if task_type == "Free OCR" else "<image>\n<|grounding|>Convert the document to markdown. "
 
 
118
 
119
+ size_cfgs = {
120
+ "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
121
+ "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
122
+ "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
123
+ "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
124
+ "Gundam (Recommended)": {"base_size": 1024, "image_size": 640, "crop_mode": True},
125
  }
126
+ cfg = size_cfgs.get(model_size, size_cfgs["Gundam (Recommended)"])
127
+
128
+ tmp_path = os.path.join(outdir, "tmp.jpg")
129
+ image.save(tmp_path)
130
 
131
+ plain = model.infer(
132
+ OCR_TOKENIZER,
133
  prompt=prompt,
134
+ image_file=tmp_path,
135
+ output_path=outdir,
136
+ base_size=cfg["base_size"],
137
+ image_size=cfg["image_size"],
138
+ crop_mode=cfg["crop_mode"],
139
  save_results=True,
140
  test_compress=True,
141
  eval_mode=is_eval_mode,
142
  )
143
 
144
+ img_boxes = os.path.join(outdir, "result_with_boxes.jpg")
145
+ md_path = os.path.join(outdir, "result.mmd")
146
 
147
+ md = "Markdown result was not generated. This is expected for 'Free OCR' task."
148
+ if os.path.exists(md_path):
149
+ with open(md_path, "r", encoding="utf-8") as f:
150
+ md = f.read()
151
 
152
+ img_out = Image.open(img_boxes) if os.path.exists(img_boxes) else None
153
+ txt_out = plain if plain else md
154
+ return img_out, md, txt_out
 
 
 
155
 
156
  # =========================
157
+ # Glue OCR→Chat
158
  # =========================
159
+ def ocr_snapshot(md_text: str, plain_text: str):
160
+ return md_text, plain_text, md_text, plain_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
 
+ def chat_reply(user_msg, chat_state, ocr_md_state, ocr_txt_state):
+     try:
+         answer = txagent_chat_remote(ocr_md_state or "", ocr_txt_state or "", user_msg or "")
+         updated = (chat_state or []) + [
+             {"role": "user", "content": user_msg or "(solo OCR)"},
+             {"role": "assistant", "content": answer},
+         ]
+         return updated, "", ""
      except Exception as e:
          tb = traceback.format_exc(limit=2)
+         updated = (chat_state or []) + [
              {"role": "user", "content": user_msg or ""},
+             {"role": "assistant", "content": f"⚠️ Error remoto: {e}"},
          ]
+         return updated, "", f"{e}\n{tb}"

  def clear_chat():
+     return [], "", ""

  # =========================
+ # UI Gradio 5
  # =========================
+ with gr.Blocks(title="OpScanIA — DeepSeek-OCR + TxAgent (HF Inference)", theme=gr.themes.Soft()) as demo:
      gr.Markdown(
          """
+         # 📄 DeepSeek-OCR → 💬 Chat Clínico (TxAgent-T1-Llama-3.1-8B remoto)
          1) **Sube una imagen** y corre **OCR** (imagen anotada, Markdown y texto).
+         2) **Chatea** con **TxAgent (HF Inference)** usando automáticamente el **OCR** como contexto.
          *Uso educativo; no reemplaza consejo médico.*
          """
      )

      with gr.Row():
          with gr.Column(scale=1):
+             image_input = gr.Image(type="pil", label="Upload Image", sources=["upload", "clipboard", "webcam"])
+             model_size = gr.Dropdown(
+                 choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"],
+                 value="Gundam (Recommended)", label="Model Size"
+             )
+             task_type = gr.Dropdown(
+                 choices=["Free OCR", "Convert to Markdown"],
+                 value="Convert to Markdown", label="Task Type"
+             )
+             eval_mode_checkbox = gr.Checkbox(
+                 value=True,
+                 label="Evaluation mode (más rápido)",
+                 info="Salida solo texto/markdown si así lo decide el backend."
+             )
              submit_btn = gr.Button("Process Image", variant="primary")
+
          with gr.Column(scale=2):
              with gr.Tabs():
+                 with gr.TabItem("Annotated Image"):
+                     output_image = gr.Image(interactive=False)
+                 with gr.TabItem("Markdown Preview"):
+                     output_markdown = gr.Markdown()
+                 with gr.TabItem("Markdown Source / Eval Output"):
                      output_text = gr.Textbox(lines=18, show_copy_button=True, interactive=False)
      with gr.Row():
          md_preview = gr.Textbox(label="Snapshot Markdown OCR", lines=8, interactive=False)
          txt_preview = gr.Textbox(label="Snapshot Texto OCR", lines=8, interactive=False)

+     gr.Markdown("## Chat Clínico — TxAgent (HF Inference)")
      with gr.Row():
          with gr.Column(scale=2):
+             chatbot = gr.Chatbot(label="Asistente OCR (TxAgent remoto)", type="messages", height=420)
+             user_in = gr.Textbox(label="Mensaje", placeholder="Escribe tu consulta… (vacío = analiza solo el OCR)", lines=2)
              with gr.Row():
                  send_btn = gr.Button("Enviar", variant="primary")
                  clear_btn = gr.Button("Limpiar")
          with gr.Column(scale=1):
+             error_box = gr.Textbox(label="Debug (si hay error)", lines=8, interactive=False)

      submit_btn.click(
+         fn=ocr_infer,
          inputs=[image_input, model_size, task_type, eval_mode_checkbox],
          outputs=[output_image, output_markdown, output_text],
      ).then(
+         fn=ocr_snapshot,
          inputs=[output_markdown, output_text],
          outputs=[ocr_md_state, ocr_txt_state, md_preview, txt_preview],
      )

      send_btn.click(
+         fn=chat_reply,
          inputs=[user_in, chatbot, ocr_md_state, ocr_txt_state],
+         outputs=[chatbot, user_in, error_box]
      )
+     clear_btn.click(fn=clear_chat, outputs=[chatbot, user_in, error_box])

  if __name__ == "__main__":
+     demo.queue(max_size=32, default_concurrency_limit=8)  # Gradio 4/5: queue() no longer accepts concurrency_count
      demo.launch()
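
The chat path introduced by this commit is a plain serverless chat-completion call with the OCR text prepended as system context. The snippet below is a minimal sketch (not part of the commit) of exercising the same endpoint directly, assuming HF_TOKEN is exported, the hf-inference provider serves this model, and reusing a sample OCR fragment from the old few-shot prompt.

    # Minimal sketch: call the TxAgent endpoint the new app.py uses, outside of Gradio.
    import os
    from huggingface_hub import InferenceClient

    client = InferenceClient(
        model="mims-harvard/TxAgent-T1-Llama-3.1-8B",
        provider="hf-inference",
        token=os.getenv("HF_TOKEN"),
        timeout=60.0,
    )

    # Sample OCR context (taken from the removed FEWSHOT example in the old app.py).
    ocr_text = "Paciente: Juan Pérez. Medicamento: Amoxicilina 500 mg cada 8 horas por 7 días."
    messages = [
        {"role": "system", "content": "Eres un asistente clínico educativo.\n\nCONTEXTO_OCR:\n" + ocr_text},
        {"role": "user", "content": "¿Cuál es el medicamento y la dosis?"},
    ]

    out = client.chat.completions.create(
        messages=messages,
        max_tokens=512,
        temperature=0.2,
        top_p=0.9,
    )
    print(out.choices[0].message.content)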