rabiyulfahim committed
Commit 040c903 · verified · 1 Parent(s): ba8f36f

Update main.py

Files changed (1)
  1. main.py +13 -12
main.py CHANGED
@@ -1,5 +1,5 @@
 from fastapi import FastAPI, Query, HTTPException
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 from pydantic import BaseModel
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import HTMLResponse
@@ -8,10 +8,12 @@ import os
 import torch
 
 # -----------------------
-# Hugging Face cache
+# Set cache dirs (avoid Docker errors)
 # -----------------------
-os.environ["HF_HOME"] = "/tmp"  # writable cache
-os.environ["TRANSFORMERS_CACHE"] = "/tmp"  # optional
+os.environ["HF_HOME"] = "/tmp"
+os.environ["TRANSFORMERS_CACHE"] = "/tmp"
+os.environ["TORCHINDUCTOR_CACHE_DIR"] = "/tmp/torch_inductor_cache"
+os.makedirs("/tmp/torch_inductor_cache", exist_ok=True)
 
 # -----------------------
 # Model Setup
@@ -20,10 +22,15 @@ model_id = "LLM360/K2-Think"
 
 print("Loading tokenizer and model...")
 tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir="/tmp")
+
+bnb_config = BitsAndBytesConfig(
+    load_in_8bit=True  # 8-bit quantization
+)
+
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
-    device_map="auto",  # auto assign to GPU/CPU
-    load_in_8bit=True,  # 8-bit quantization for low memory
+    quantization_config=bnb_config,
+    device_map="auto",
     cache_dir="/tmp"
 )
 print("Model loaded!")
@@ -59,12 +66,6 @@ class QueryRequest(BaseModel):
 def home():
     return {"message": "Welcome to K2-Think QA API 🚀"}
 
-@app.get("/ui", response_class=HTMLResponse)
-def serve_ui():
-    html_path = os.path.join("static", "index.html")
-    with open(html_path, "r", encoding="utf-8") as f:
-        return HTMLResponse(f.read())
-
 @app.get("/health")
 def health():
     return {"status": "ok"}