Rohit-Katkar2003 committed
Commit ca40ce9 · verified · 1 Parent(s): f2d493c

create app.py

Files changed (1):
  1. app.py +42 -0
app.py ADDED
@@ -0,0 +1,42 @@
+ from fastapi import FastAPI
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ import torch
+
+ app = FastAPI(title="MobileLLM-Pro API", description="Public API for MobileLLM-Pro")
+
+ # Load model & tokenizer once at startup
+ MODEL_PATH = "/app/model"
+ print("🧠 Loading tokenizer and model...")
+
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
+ model = AutoModelForCausalLM.from_pretrained(
+     MODEL_PATH,
+     torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,  # fp16 needs a GPU; fall back to fp32 on CPU
+     low_cpu_mem_usage=True,
+     trust_remote_code=True
+ )
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ model.to(device)
+ model.eval()
+ print(f"✅ Model loaded on {device}!")
+
+ @app.get("/")
+ def root():
+     return {"message": "MobileLLM-Pro API is running!"}
+
+ @app.get("/generate")
+ def generate(prompt: str, max_tokens: int = 50):
+     try:
+         inputs = tokenizer(prompt, return_tensors="pt").to(device)
+         outputs = model.generate(
+             **inputs,
+             max_new_tokens=max_tokens,
+             do_sample=True,
+             temperature=0.7,
+             pad_token_id=tokenizer.eos_token_id
+         )
+         result = tokenizer.decode(outputs[0], skip_special_tokens=True)
+         return {"input": prompt, "output": result}
+     except Exception as e:
+         return {"error": str(e)}