Update app.py

app.py CHANGED
@@ -7,6 +7,7 @@ import zipfile
 import re
 import difflib
 import tempfile
+import shutil
 from typing import List, Optional, Dict, Any

 from fastapi import FastAPI, UploadFile, File, HTTPException, Form
@@ -49,7 +50,7 @@ app = FastAPI(
     title="NL2SQL T5-large Backend Universal (single-file)",
     description=(
         "Intérprete NL→SQL (T5-large Spider) para usuarios no expertos. "
-        "El usuario solo sube su BD (SQLite / dump .sql / CSV / ZIP de
+        "El usuario solo sube su BD (SQLite / dump .sql / CSV / ZIP de datos) "
         "y todo se convierte internamente a SQLite."
     ),
     version="1.0.0",
@@ -143,7 +144,6 @@ def create_empty_sqlite_db(label: str) -> str:
     conn_id = f"db_{uuid.uuid4().hex[:8]}"
     db_filename = f"{conn_id}.sqlite"
     db_path = os.path.join(UPLOAD_DIR, db_filename)
-    # create the empty file
     conn = sqlite3.connect(db_path)
     conn.close()
     DB_REGISTRY[conn_id] = {"db_path": db_path, "label": label}
@@ -202,7 +202,6 @@ def import_sql_dump_to_sqlite(db_path: str, sql_text: str) -> None:
         if upper.startswith("CREATE TABLE"):
             # split off foreign keys
             if "FOREIGN KEY" in upper:
-                # cut the constraints out so they can be executed later
                 fixed = []
                 fk_lines = []

@@ -251,10 +250,8 @@ def import_sql_dump_to_sqlite(db_path: str, sql_text: str) -> None:
     conn = sqlite3.connect(db_path)
     cur = conn.cursor()

-    # disable foreign keys while we import
     cur.execute("PRAGMA foreign_keys = OFF;")

-    # create the tables without constraints
     for ct in create_tables:
         try:
             cur.executescript(ct + ";")
@@ -262,7 +259,6 @@ def import_sql_dump_to_sqlite(db_path: str, sql_text: str) -> None:
             print("Error CREATE TABLE:", e)
             print("SQL:", ct)

-    # run the inserts
     for ins in inserts:
         try:
             cur.executescript(ins + ";")
@@ -277,8 +273,6 @@ def import_sql_dump_to_sqlite(db_path: str, sql_text: str) -> None:
     for table, fks in foreign_keys:
         for fk in fks:
             try:
-                # ALTER TABLE ADD FOREIGN KEY does not exist in SQLite,
-                # so we have to rebuild the table.
                 add_foreign_key_sqlite(conn, table, fk)
             except Exception as e:
                 print("Error agregando FK:", e, " → ", fk)
@@ -300,30 +294,19 @@ def add_foreign_key_sqlite(conn, table: str, fk_line: str):
     - Adds the FK in the new version
     - Copies the data
     """
-
     cur = conn.cursor()

-    # fetch the original schema
     cur.execute(f"SELECT sql FROM sqlite_master WHERE type='table' AND name='{table}';")
     result = cur.fetchone()
     if not result:
         return

     original_sql = result[0]
-
-    # splice the constraint into the SQL
     new_sql = original_sql.rstrip(")") + f", {fk_line} )"

-    # rename the original table
     cur.execute(f"ALTER TABLE {table} RENAME TO _old_{table};")
-
-    # create the new table with the FK
     cur.execute(new_sql)
-
-    # copy the data
     cur.execute(f"INSERT INTO {table} SELECT * FROM _old_{table};")
-
-    # drop the old table
     cur.execute(f"DROP TABLE _old_{table};")

     conn.commit()
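Since SQLite's ALTER TABLE cannot add a FOREIGN KEY, add_foreign_key_sqlite falls back to the rename-create-copy-drop pattern. A minimal standalone sketch of that pattern, using hypothetical toy tables that are not part of the commit:

import sqlite3

conn = sqlite3.connect(":memory:")
cur = conn.cursor()
cur.execute("CREATE TABLE artist (id INTEGER PRIMARY KEY, name TEXT)")
cur.execute("CREATE TABLE album (id INTEGER PRIMARY KEY, artist_id INTEGER)")

# no ALTER TABLE ... ADD FOREIGN KEY in SQLite, so rebuild the table:
cur.execute("ALTER TABLE album RENAME TO _old_album;")
cur.execute(
    "CREATE TABLE album (id INTEGER PRIMARY KEY, artist_id INTEGER, "
    "FOREIGN KEY (artist_id) REFERENCES artist(id))"
)
cur.execute("INSERT INTO album SELECT * FROM _old_album;")
cur.execute("DROP TABLE _old_album;")
conn.commit()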
@@ -346,14 +329,11 @@ def import_csv_to_sqlite(db_path: str, csv_bytes: bytes, table_name: str) -> Non
     header = rows[0]
     cols = [_sanitize_identifier(c or f"col_{i}") for i, c in enumerate(header)]

-    # create the table
     col_defs = ", ".join(f'"{c}" TEXT' for c in cols)
     conn.execute(f'CREATE TABLE IF NOT EXISTS "{table}" ({col_defs});')

-    # insert the rows
     placeholders = ", ".join(["?"] * len(cols))
     for row in rows[1:]:
-        # pad/truncate for safety
         row = list(row) + [""] * (len(cols) - len(row))
         row = row[:len(cols)]
         conn.execute(
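The pad/truncate step above keeps ragged CSV rows from breaking the INSERT. A quick illustration with made-up values:

cols = ["id", "name", "city"]
row = ["1", "Ada"]                                # short row from a ragged CSV
row = list(row) + [""] * (len(cols) - len(row))   # pad up to len(cols)
row = row[:len(cols)]                             # truncate if it were too long
assert row == ["1", "Ada", ""]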
@@ -369,9 +349,11 @@ def import_csv_to_sqlite(db_path: str, csv_bytes: bytes, table_name: str) -> Non
 def import_zip_of_csvs_to_sqlite(db_path: str, zip_bytes: bytes) -> None:
     """
     For a ZIP with multiple CSVs: each CSV becomes a table.
+    (Kept for compatibility, although we now handle more
+    general ZIPs in /upload.)
     """
     conn = sqlite3.connect(db_path)
-    conn.close()
+    conn.close()

     with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zf:
         for name in zf.namelist():
@@ -395,23 +377,19 @@ def introspect_sqlite_schema(db_path: str) -> Dict[str, Any]:
     conn = sqlite3.connect(db_path)
     cur = conn.cursor()

-    # --- TABLES
     cur.execute("SELECT name FROM sqlite_master WHERE type='table';")
     tables = [row[0] for row in cur.fetchall()]

     tables_info = {}
-    foreign_keys = []
-
+    foreign_keys = []
     parts = []

     for t in tables:
-        # columns
         cur.execute(f"PRAGMA table_info('{t}');")
         rows = cur.fetchall()
         cols = [r[1] for r in rows]
         tables_info[t] = {"columns": cols}

-        # FK relationships
         cur.execute(f"PRAGMA foreign_key_list('{t}');")
         fks = cur.fetchall()
         for (id, seq, table, from_col, to_col, on_update, on_delete, match) in fks:
@@ -429,13 +407,12 @@ def introspect_sqlite_schema(db_path: str) -> Dict[str, Any]:

     return {
         "tables": tables_info,
-        "foreign_keys": foreign_keys,
+        "foreign_keys": foreign_keys,
         "schema_str": schema_str
     }


 def execute_sqlite(db_path: str, sql: str) -> Dict[str, Any]:
-    # minimal safeguard against destructive queries
     forbidden = ["drop ", "delete ", "update ", "insert ", "alter ", "replace "]
     sql_low = sql.lower()
     if any(f in sql_low for f in forbidden):
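The keyword blacklist in execute_sqlite is only a minimal safeguard (for instance, "delete(" with no trailing space slips through). A stricter alternative, not part of this commit, is to let SQLite itself refuse writes by opening the file in read-only URI mode; a sketch assuming the same db_path convention:

import sqlite3

def execute_readonly(db_path: str, sql: str):
    # mode=ro makes SQLite reject every write, however it is spelled
    conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
    try:
        return conn.execute(sql).fetchall()
    finally:
        conn.close()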
@@ -463,22 +440,15 @@ def execute_sqlite(db_path: str, sql: str) -> Dict[str, Any]:
 # ======================================================

 def _normalize_name_for_match(name: str) -> str:
-    """Normalizes a (table/column) identifier for fuzzy matching."""
     s = name.lower()
     s = s.replace('"', '').replace("`", "")
     s = s.replace("_", "")
-    # very simple singularization: tracks -> track, songs -> song, etc.
     if s.endswith("s") and len(s) > 3:
         s = s[:-1]
     return s


 def _build_schema_indexes(tables_info: Dict[str, Dict[str, List[str]]]) -> Dict[str, Dict[str, List[str]]]:
-    """
-    Builds indexes of normalized names:
-    - table_index: {normalized: [table1, table2, ...]}
-    - column_index: {normalized: [col1, col2, ...]}
-    """
     table_index: Dict[str, List[str]] = {}
     column_index: Dict[str, List[str]] = {}

@@ -498,18 +468,13 @@ def _build_schema_indexes(tables_info: Dict[str, Dict[str, List[str]]]) -> Dict[


 def _best_match_name(missing: str, index: Dict[str, List[str]]) -> Optional[str]:
-    """
-    Given a missing name and a normalized index, returns the best real match.
-    """
     if not index:
         return None

     key = _normalize_name_for_match(missing)
-    # direct match first
     if key in index and index[key]:
         return index[key][0]

-    # fuzzy matching via difflib
     candidates = difflib.get_close_matches(key, list(index.keys()), n=1, cutoff=0.7)
     if not candidates:
         return None
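To see what the normalize-then-difflib matching buys, here is a small self-contained check; the schema names are invented for illustration, and the inlined normalize mirrors _normalize_name_for_match:

import difflib

def normalize(name: str) -> str:
    s = name.lower().replace('"', "").replace("`", "").replace("_", "")
    if s.endswith("s") and len(s) > 3:
        s = s[:-1]                      # naive singularization
    return s

index = {normalize(t): [t] for t in ["Track", "Album", "Customer_Orders"]}
print(index[normalize("tracks")])       # ['Track']  (direct hit after normalizing)
print(difflib.get_close_matches(normalize("custmer_order"),
                                list(index.keys()), n=1, cutoff=0.7))
# ['customerorder']  (typo recovered by fuzzy matching)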
@@ -519,7 +484,6 @@ def _best_match_name(missing: str, index: Dict[str, List[str]]) -> Optional[str]
     return None


-# dictionaries of common synonyms (Spider + Chinook / typical databases)
 DOMAIN_SYNONYMS_TABLE = {
     "song": "track",
     "songs": "track",
@@ -543,14 +507,6 @@ DOMAIN_SYNONYMS_COLUMN = {


 def try_repair_sql(sql: str, error: str, schema_meta: Dict[str, Any]) -> Optional[str]:
-    """
-    Tries to repair SQL from the error message and the schema:
-    - no such table: X → map X to an existing table
-    - no such column: Y → map Y to an existing column
-    Returns:
-    - a new repaired SQL string if anything could be changed
-    - None if no repair was applied
-    """
     tables_info = schema_meta["tables"]
     idx = _build_schema_indexes(tables_info)
     table_index = idx["table_index"]
@@ -559,7 +515,6 @@ def try_repair_sql(sql: str, error: str, schema_meta: Dict[str, Any]) -> Optiona
     repaired_sql = sql
     changed = False

-    # 1) detect specific misses from the SQLite error message
     missing_table = None
     missing_column = None

@@ -571,10 +526,8 @@ def try_repair_sql(sql: str, error: str, schema_meta: Dict[str, Any]) -> Optiona
     if m_c:
         missing_column = m_c.group(1)

-    # 2) repair a missing table
     if missing_table:
-        short = missing_table.split(".")[-1]
-        # domain synonym first (song -> track, etc.)
+        short = missing_table.split(".")[-1]
         syn = DOMAIN_SYNONYMS_TABLE.get(short.lower())
         target = None
         if syn:
@@ -589,7 +542,6 @@ def try_repair_sql(sql: str, error: str, schema_meta: Dict[str, Any]) -> Optiona
             repaired_sql = new_sql
             changed = True

-    # 3) repair a missing column
     if missing_column:
         short = missing_column.split(".")[-1]
         syn = DOMAIN_SYNONYMS_COLUMN.get(short.lower())
@@ -616,10 +568,6 @@ def try_repair_sql(sql: str, error: str, schema_meta: Dict[str, Any]) -> Optiona
 # ======================================================

 def build_prompt(question_en: str, db_id: str, schema_str: str) -> str:
-    """
-    Spider training style:
-    translate to SQL: {question} | db: {db_id} | schema: {schema_str} | note: ...
-    """
     return (
         f"translate to SQL: {question_en} | "
         f"db: {db_id} | schema: {schema_str} | "
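For reference, this is roughly what build_prompt assembles for a toy input (values invented; the trailing note clause continues past this hunk):

prompt = build_prompt(
    question_en="How many singers are there?",
    db_id="concert_singer",
    schema_str="singer(singer_id, name, age)",
)
# "translate to SQL: How many singers are there? | db: concert_singer |
#  schema: singer(singer_id, name, age) | ..."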
@@ -628,14 +576,6 @@ def build_prompt(question_en: str, db_id: str, schema_str: str) -> str:


 def nl2sql_with_rerank(question: str, conn_id: str) -> Dict[str, Any]:
-    """
-    Full pipeline:
-    - language auto-detect + ES→EN
-    - schema introspection
-    - beam generation
-    - re-ranking by actual execution against SQLite
-    - SQL Repair layer (missing tables/columns, up to 3 attempts)
-    """
     if conn_id not in DB_REGISTRY:
         raise HTTPException(status_code=404, detail=f"connection_id '{conn_id}' no registrado")

@@ -687,17 +627,15 @@ def nl2sql_with_rerank(question: str, conn_id: str) -> Dict[str, Any]:
             "raw_sql_model": raw_sql,
         }

-        # attempt 1: direct execution
         exec_info = execute_sqlite(db_path, raw_sql)

-        # up to 3 repair rounds if it keeps failing with no such table/column
         if (not exec_info["ok"]) and (
             "no such table" in (exec_info["error"] or "")
             or "no such column" in (exec_info["error"] or "")
         ):
             current_sql = raw_sql
             last_error = exec_info["error"]
-            for step in range(1, 4):
+            for step in range(1, 4):
                 repaired_sql = try_repair_sql(current_sql, last_error, meta)
                 if not repaired_sql or repaired_sql == current_sql:
                     break
@@ -711,7 +649,6 @@ def nl2sql_with_rerank(question: str, conn_id: str) -> Dict[str, Any]:
                     break
                 last_error = exec_info2["error"]

-        # store the final execution info
         cand["exec_ok"] = exec_info["ok"]
         cand["exec_error"] = exec_info["error"]
         cand["rows_preview"] = (
@@ -721,7 +658,6 @@ def nl2sql_with_rerank(question: str, conn_id: str) -> Dict[str, Any]:

         candidates.append(cand)

-        # select "best"
         if exec_info["ok"]:
             if (not best_exec) or cand["score"] > best_score:
                 best_exec = True
@@ -808,7 +744,6 @@ class SpeechInferResponse(BaseModel):

 @app.on_event("startup")
 async def startup_event():
-    # load the model at startup
     load_nl2sql_model()
     print(f"✅ Backend NL2SQL inicializado. MODEL_DIR={MODEL_DIR}, UPLOAD_DIR={UPLOAD_DIR}")

@@ -821,8 +756,7 @@ async def upload_database(db_file: UploadFile = File(...)):
     - .sqlite / .db → used as-is
     - .sql → MySQL/PostgreSQL/SQLite dump → imported into SQLite
     - .csv → a SQLite DB with a single table is created
-    - .zip →
-    Returns a connection_id to use with /schema, /preview and /infer.
+    - .zip → may contain .sqlite/.db, .sql or .csv (detected automatically)
     """
     filename = db_file.filename
     if not filename:
@@ -831,7 +765,8 @@ async def upload_database(db_file: UploadFile = File(...)):
     fname_lower = filename.lower()
     contents = await db_file.read()

-    note = None
+    note: Optional[str] = None
+    conn_id: Optional[str] = None

     # case 1: native SQLite
     if fname_lower.endswith(".sqlite") or fname_lower.endswith(".db"):
@@ -858,12 +793,69 @@ async def upload_database(db_file: UploadFile = File(...)):
         import_csv_to_sqlite(db_path, contents, table_name)
         note = "CSV imported into a single SQLite table."

-    # case 4: ZIP
+    # case 4: universal ZIP
     elif fname_lower.endswith(".zip"):
-
-
-
-
+        try:
+            with zipfile.ZipFile(io.BytesIO(contents)) as zf:
+                names = [info.filename for info in zf.infolist() if not info.is_dir()]
+
+                sqlite_names = [n for n in names if n.lower().endswith((".sqlite", ".db"))]
+                sql_names = [n for n in names if n.lower().endswith(".sql")]
+                csv_names = [n for n in names if n.lower().endswith(".csv")]
+
+                # 4.1: the ZIP carries a native SQLite DB
+                if sqlite_names:
+                    inner = sqlite_names[0]
+                    conn_id = f"db_{uuid.uuid4().hex[:8]}"
+                    dst_path = os.path.join(UPLOAD_DIR, f"{conn_id}.sqlite")
+                    with zf.open(inner) as src, open(dst_path, "wb") as dst:
+                        shutil.copyfileobj(src, dst)
+                    DB_REGISTRY[conn_id] = {
+                        "db_path": dst_path,
+                        "label": f"{filename}::{os.path.basename(inner)}",
+                    }
+                    note = "SQLite database extracted from ZIP."
+
+                # 4.2: SQL dump(s), one or several
+                elif sql_names:
+                    conn_id = create_empty_sqlite_db(label=filename)
+                    db_path = DB_REGISTRY[conn_id]["db_path"]
+
+                    if len(sql_names) == 1:
+                        with zf.open(sql_names[0]) as f:
+                            sql_text = f.read().decode("utf-8", errors="ignore")
+                    else:
+                        parts = []
+                        for n in sorted(sql_names):
+                            with zf.open(n) as f:
+                                parts.append(f"-- FILE: {n}\n")
+                                parts.append(f.read().decode("utf-8", errors="ignore"))
+                        sql_text = "\n\n".join(parts)
+
+                    import_sql_dump_to_sqlite(db_path, sql_text)
+                    note = "SQL dump(s) from ZIP imported into SQLite."
+
+                # 4.3: CSVs only
+                elif csv_names:
+                    conn_id = create_empty_sqlite_db(label=filename)
+                    db_path = DB_REGISTRY[conn_id]["db_path"]
+
+                    for name in csv_names:
+                        with zf.open(name) as f:
+                            csv_bytes = f.read()
+                        table_name = os.path.splitext(os.path.basename(name))[0]
+                        import_csv_to_sqlite(db_path, csv_bytes, table_name)
+
+                    note = "CSV files from ZIP imported into SQLite (one table per CSV)."
+
+                else:
+                    raise HTTPException(
+                        status_code=400,
+                        detail="El ZIP no contiene archivos .sqlite/.db/.sql/.csv utilizables.",
+                    )
+
+        except zipfile.BadZipFile:
+            raise HTTPException(status_code=400, detail="Archivo ZIP inválido o corrupto.")

     else:
         raise HTTPException(
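A client-side sketch that exercises the new ZIP branch end to end. It assumes a local server on port 8000 and the requests library; only the /upload path (named in the docstring earlier) and the db_file field come from the diff:

import io, zipfile, requests

buf = io.BytesIO()
with zipfile.ZipFile(buf, "w") as zf:
    zf.writestr("cities.csv", "id,name\n1,Quito\n2,Lima\n")
    zf.writestr("people.csv", "id,city_id\n1,2\n")

resp = requests.post(
    "http://localhost:8000/upload",
    files={"db_file": ("data.zip", buf.getvalue(), "application/zip")},
)
print(resp.json())   # expect a connection_id plus the one-table-per-CSV note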
@@ -881,9 +873,6 @@ async def upload_database(db_file: UploadFile = File(...)):

 @app.get("/connections", response_model=List[ConnectionInfo])
 async def list_connections():
-    """
-    Lists the registered connections (all in internal SQLite).
-    """
     out = []
     for cid, info in DB_REGISTRY.items():
         out.append(ConnectionInfo(connection_id=cid, label=info["label"]))
@@ -892,9 +881,6 @@ async def list_connections():

 @app.get("/schema/{connection_id}", response_model=SchemaResponse)
 async def get_schema(connection_id: str):
-    """
-    Returns a schema summary for an uploaded DB.
-    """
     if connection_id not in DB_REGISTRY:
         raise HTTPException(status_code=404, detail="connection_id no encontrado")

@@ -909,10 +895,6 @@ async def get_schema(connection_id: str):

 @app.get("/preview/{connection_id}/{table}", response_model=PreviewResponse)
 async def preview_table(connection_id: str, table: str, limit: int = 20):
-    """
-    Returns a preview of rows from a specific table.
-    Useful for the frontend (table view + diagram).
-    """
     if connection_id not in DB_REGISTRY:
         raise HTTPException(status_code=404, detail="connection_id no encontrado")

@@ -937,10 +919,6 @@ async def preview_table(connection_id: str, table: str, limit: int = 20):

 @app.post("/infer", response_model=InferResponse)
 async def infer_sql(req: InferRequest):
-    """
-    Given a natural-language question (ES or EN) and a connection_id,
-    generates SQL, runs the query and returns the result + candidates.
-    """
     result = nl2sql_with_rerank(req.question, req.connection_id)
     return InferResponse(**result)

@@ -950,12 +928,6 @@ async def speech_infer(
     connection_id: str = Form(...),
     audio: UploadFile = File(...)
 ):
-    """
-    Endpoint for VOICE queries:
-    - Receives audio from the browser (multipart/form-data).
-    - Uses gpt-4o-transcribe to get the text.
-    - Reuses the existing NL→SQL pipeline.
-    """
     if openai_client is None:
         raise HTTPException(
             status_code=500,
@@ -965,7 +937,6 @@ async def speech_infer(
     if audio.content_type is None:
         raise HTTPException(status_code=400, detail="Archivo de audio inválido.")

-    # 1) save the audio temporarily
     try:
         with tempfile.NamedTemporaryFile(delete=False, suffix=".webm") as tmp:
             tmp.write(await audio.read())
@@ -973,23 +944,19 @@ async def speech_infer(
     except Exception:
         raise HTTPException(status_code=500, detail="No se pudo procesar el audio recibido.")

-    # 2) transcribe with gpt-4o-transcribe
     try:
         with open(tmp_path, "rb") as f:
             transcription = openai_client.audio.transcriptions.create(
                 model="gpt-4o-transcribe",
                 file=f,
-                # language="es",  # optional, if you want to force Spanish
             )
         transcript_text: str = transcription.text
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Error al transcribir audio: {e}")

-    # 3) reuse the NL→SQL pipeline with the transcribed text
     result_dict = nl2sql_with_rerank(transcript_text, connection_id)
     infer_result = InferResponse(**result_dict)

-    # 4) return transcription + NL→SQL result
     return SpeechInferResponse(
         transcript=transcript_text,
         result=infer_result,
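A matching sketch for the voice flow. The route decorator for speech_infer sits outside these hunks, so the URL path below is a guess; the connection_id form field and the audio file field are from the diff:

import requests

with open("question.webm", "rb") as f:             # any browser-recorded clip
    resp = requests.post(
        "http://localhost:8000/speech_infer",      # hypothetical path, not in the diff
        data={"connection_id": "db_12345678"},
        files={"audio": ("question.webm", f, "audio/webm")},
    )
print(resp.json())   # transcript plus the nested NL→SQL result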