from fastapi import FastAPI
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from PIL import Image
from io import BytesIO
import base64
import torch
from transformers import Qwen2VLProcessor
from gui_actor.modeling import Qwen2VLForConditionalGenerationWithPointer
from gui_actor.inference import inference

app = FastAPI()

# Load model
model_name = "microsoft/GUI-Actor-2B-Qwen2-VL"
processor = Qwen2VLProcessor.from_pretrained(model_name)
tokenizer = processor.tokenizer
model = Qwen2VLForConditionalGenerationWithPointer.from_pretrained(
    model_name,
    torch_dtype=torch.float32,  # use float32 for CPU
    device_map=None,            # don't map to cuda
    attn_implementation=None,
).eval()


class Base64Request(BaseModel):
    image_base64: str
    instruction: str


@app.post("/click/base64")
async def predict_click_base64(data: Base64Request):
    # Decode base64 to image (drop an optional "data:image/...;base64," prefix)
    image_data = base64.b64decode(data.image_base64.split(",")[-1])
    pil_image = Image.open(BytesIO(image_data)).convert("RGB")

    conversation = [
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.",
                }
            ],
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": pil_image,
                },
                {
                    "type": "text",
                    "text": data.instruction,
                },
            ],
        },
    ]

    # Run GUI-Actor inference and return the top-ranked click point
    pred = inference(conversation, model, tokenizer, processor, use_placeholder=True, topk=3)
    px, py = pred["topk_points"][0]
    return JSONResponse(content={"x": round(px, 4), "y": round(py, 4)})
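
# --- Usage sketch (not part of the server above) ---
# A minimal client for the /click/base64 endpoint, intended to live in a separate
# script. Assumptions introduced here, not taken from the server code: the server
# is started with `uvicorn app:app --host 0.0.0.0 --port 8000`, the `requests`
# package is installed, and "screenshot.png" is a placeholder path. The endpoint
# responds with the predicted click point as {"x": ..., "y": ...}.
import base64

import requests


def request_click(image_path: str, instruction: str,
                  url: str = "http://localhost:8000/click/base64") -> dict:
    """Encode a screenshot as base64, post it with an instruction, return the point."""
    with open(image_path, "rb") as f:
        image_base64 = base64.b64encode(f.read()).decode("ascii")
    resp = requests.post(url, json={"image_base64": image_base64,
                                    "instruction": instruction})
    resp.raise_for_status()
    return resp.json()  # e.g. {"x": 0.4321, "y": 0.8765}


if __name__ == "__main__":
    print(request_click("screenshot.png", "Click the Submit button"))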