from fastapi import FastAPI
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from PIL import Image
from io import BytesIO
import base64
import torch
from transformers import Qwen2VLProcessor
from gui_actor.modeling import Qwen2VLForConditionalGenerationWithPointer
from gui_actor.inference import inference

app = FastAPI()

# Load model
model_name = "microsoft/GUI-Actor-2B-Qwen2-VL"
processor = Qwen2VLProcessor.from_pretrained(model_name)
tokenizer = processor.tokenizer
model = Qwen2VLForConditionalGenerationWithPointer.from_pretrained(
    model_name,
    torch_dtype=torch.float32,  # use float32 for CPU
    device_map=None,            # don't map to cuda
    attn_implementation=None,
).eval()


class Base64Request(BaseModel):
    image_base64: str
    instruction: str


@app.post("/click/base64")
async def predict_click_base64(data: Base64Request):
    # Decode base64 to image (drop an optional "data:image/...;base64," prefix)
    image_data = base64.b64decode(data.image_base64.split(",")[-1])
    pil_image = Image.open(BytesIO(image_data)).convert("RGB")

    conversation = [
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.",
                }
            ],
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": pil_image,
                },
                {
                    "type": "text",
                    "text": data.instruction,
                },
            ],
        },
    ]

    # Run GUI-Actor inference and return the top-ranked click point
    pred = inference(conversation, model, tokenizer, processor, use_placeholder=True, topk=3)
    px, py = pred["topk_points"][0]
    return JSONResponse(content={"x": round(px, 4), "y": round(py, 4)})
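
# --- Usage sketch (not part of the server above) ---
# A minimal client for the /click/base64 endpoint, intended to live in a separate
# script. Assumptions introduced here, not taken from the server code: the server
# is started with `uvicorn app:app --host 0.0.0.0 --port 8000`, the `requests`
# package is installed, and "screenshot.png" is a placeholder path. The endpoint
# responds with the predicted click point as {"x": ..., "y": ...}.
import base64

import requests


def request_click(image_path: str, instruction: str,
                  url: str = "http://localhost:8000/click/base64") -> dict:
    """Encode a screenshot as base64, post it with an instruction, return the point."""
    with open(image_path, "rb") as f:
        image_base64 = base64.b64encode(f.read()).decode("ascii")
    resp = requests.post(url, json={"image_base64": image_base64,
                                    "instruction": instruction})
    resp.raise_for_status()
    return resp.json()  # e.g. {"x": 0.4321, "y": 0.8765}


if __name__ == "__main__":
    print(request_click("screenshot.png", "Click the Submit button"))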