Spaces:

teragron
/

smolvlm-realtime-webcam-gradio

Runtime error

App Files Files Community

smolvlm-realtime-webcam-gradio / app.py

teragron

Upload 2 files

a067973 verified 6 months ago

raw

history blame contribute delete

8.29 kB

	import gradio as gr
	import cv2
	import numpy as np
	import base64
	import requests
	import json
	import time
	import threading
	from PIL import Image
	import io

	class CameraProcessor:
	    """Send webcam frames plus a text instruction to a chat-completions API.

	    A frame is JPEG-encoded, wrapped in a base64 ``data:`` URL and POSTed to
	    ``<base_url>/v1/chat/completions``. Request failures are returned as
	    readable strings instead of being raised, so callers can display the
	    result as-is.
	    """

	    # Tunables shared by every request issued from this class.
	    JPEG_QUALITY = 80      # passed to OpenCV as IMWRITE_JPEG_QUALITY
	    MAX_TOKENS = 100       # "max_tokens" field of the request payload
	    REQUEST_TIMEOUT = 10   # seconds, passed to requests.post

	    def __init__(self):
	        # On/off flag for processing; no method of this class reads it.
	        self.is_processing = False
	        # Thread handle and stop signal; not used by any method of this class.
	        self.processing_thread = None
	        self.stop_event = threading.Event()

	    def encode_image_to_base64(self, image):
	        """Encode an RGB image array as a base64 JPEG ``data:`` URL.

	        Args:
	            image: Numpy array in RGB channel order, or ``None``.

	        Returns:
	            A ``data:image/jpeg;base64,...`` string, or ``None`` when ``image``
	            is ``None`` or OpenCV cannot convert/encode it.
	        """
	        if image is None:
	            return None

	        try:
	            # The frame arrives as RGB; OpenCV's encoder works on BGR.
	            image_bgr = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
	            encoded_ok, buffer = cv2.imencode(
	                '.jpg', image_bgr, [cv2.IMWRITE_JPEG_QUALITY, self.JPEG_QUALITY]
	            )
	        except cv2.error as e:
	            # Report instead of raising so the caller's "failed to encode"
	            # branch handles unusable frames.
	            print(f"DEBUG: OpenCV could not encode image: {e}")
	            return None

	        # imencode reports failure through its first return value.
	        if not encoded_ok:
	            return None

	        image_base64 = base64.b64encode(buffer).decode('utf-8')
	        return f"data:image/jpeg;base64,{image_base64}"

	    @staticmethod
	    def _chat_completions_url(base_url):
	        """Return the endpoint URL, tolerating whitespace and trailing slashes."""
	        return f"{base_url.strip().rstrip('/')}/v1/chat/completions"

	    def _build_payload(self, instruction, image_base64_url):
	        """Return the request body: one user message with text and image parts."""
	        return {
	            "max_tokens": self.MAX_TOKENS,
	            "messages": [
	                {
	                    "role": "user",
	                    "content": [
	                        {"type": "text", "text": instruction},
	                        {
	                            "type": "image_url",
	                            "image_url": {"url": image_base64_url},
	                        },
	                    ],
	                }
	            ],
	        }

	    def _post_chat_request(self, instruction, image_base64_url, base_url):
	        """Perform the blocking POST and return the raw ``requests`` response."""
	        return requests.post(
	            self._chat_completions_url(base_url),
	            headers={"Content-Type": "application/json"},
	            json=self._build_payload(instruction, image_base64_url),
	            timeout=self.REQUEST_TIMEOUT,
	        )

	    @staticmethod
	    def _reply_text(response):
	        """Return the reply text, or a ``Server error: ...`` string for non-OK responses.

	        Raises whatever JSON decoding or key/index lookup raises when the
	        response body does not have the expected structure.
	        """
	        if not response.ok:
	            return f"Server error: {response.status_code} - {response.text}"
	        data = response.json()
	        return data["choices"][0]["message"]["content"]

	    async def send_chat_completion_request(self, instruction, image_base64_url, base_url):
	        """Send one chat-completion request without blocking the event loop.

	        Args:
	            instruction: Text prompt sent alongside the image.
	            image_base64_url: ``data:`` URL of the image.
	            base_url: API root, e.g. ``http://localhost:8080``.

	        Returns:
	            The reply text, ``"Server error: ..."`` for a non-OK HTTP status,
	            or ``"Error: ..."`` if the request or response parsing fails.
	        """
	        import asyncio  # local import: not part of the module's import block

	        try:
	            # requests is synchronous; run it on the default executor so the
	            # coroutine actually yields while the HTTP call is in flight.
	            loop = asyncio.get_running_loop()
	            response = await loop.run_in_executor(
	                None, self._post_chat_request, instruction, image_base64_url, base_url
	            )
	            return self._reply_text(response)
	        except Exception as e:
	            # Broad on purpose: the error text is the return value.
	            return f"Error: {str(e)}"

	    def process_frame(self, instruction, image, base_url):
	        """Encode one frame, send it with ``instruction`` and return the reply.

	        Args:
	            instruction: Text prompt sent alongside the image.
	            image: Numpy array in RGB channel order, or ``None``.
	            base_url: API root, e.g. ``http://localhost:8080``.

	        Returns:
	            The reply text, or a descriptive message when there is no image,
	            encoding fails, the server answers with a non-OK status, or the
	            request raises.
	        """
	        print(f"DEBUG: process_frame called with base_url: {base_url}")

	        if image is None:
	            print("DEBUG: No image captured")
	            return "No image captured"

	        image_base64 = self.encode_image_to_base64(image)
	        if not image_base64:
	            print("DEBUG: Failed to encode image")
	            return "Failed to encode image"

	        print(f"DEBUG: Sending request to {base_url}/v1/chat/completions")

	        # Synchronous path: this method is called from a plain (non-async)
	        # function, so the blocking request is issued directly.
	        try:
	            print("DEBUG: Making HTTP request...")
	            response = self._post_chat_request(instruction, image_base64, base_url)
	            print(f"DEBUG: Response status: {response.status_code}")

	            result = self._reply_text(response)
	            if response.ok:
	                print(f"DEBUG: Success - got response: {result}")
	            else:
	                print(f"DEBUG: {result}")
	            return result

	        except Exception as e:
	            # Broad on purpose: the error text is the return value.
	            error_msg = f"Error: {str(e)}"
	            print(f"DEBUG: Exception occurred: {error_msg}")
	            return error_msg

	# Single module-level CameraProcessor instance; process_image and
	# toggle_processing below both operate on it.
	processor = CameraProcessor()

	def process_image(instruction, image, base_url):
	    """Validate the UI inputs and run one frame through the shared processor.

	    Args:
	        instruction: Prompt text from the instruction textbox (may be ``None``).
	        image: Captured webcam frame, or ``None`` if nothing was captured.
	        base_url: API root from the base-URL textbox (may be ``None``).

	    Returns:
	        The processor's reply text, or a message telling the user which
	        input is missing.
	    """
	    print(f"DEBUG: process_image called - is_processing: {processor.is_processing}")
	    print(f"DEBUG: instruction: '{instruction}'")
	    print(f"DEBUG: base_url: '{base_url}'")
	    print(f"DEBUG: image is None: {image is None}")
	    print(f"DEBUG: image type: {type(image)}")

	    # Always return something so it is visible that the handler ran.
	    if image is None:
	        print("DEBUG: No image from webcam")
	        return "No image from webcam - check camera permissions or try a different browser"

	    # processor.is_processing is deliberately not consulted here, so the
	    # manual "Process Image" button works regardless of the toggle state.

	    # A textbox value can be None as well as empty; treat both as missing.
	    if not (instruction or "").strip():
	        print("DEBUG: No instruction provided")
	        return "Please enter an instruction"

	    # Forward the stripped URL so stray whitespace cannot break the request.
	    base_url = (base_url or "").strip()
	    if not base_url:
	        print("DEBUG: No base URL provided")
	        return "Please enter a base URL"

	    print("DEBUG: Calling process_frame")
	    result = processor.process_frame(instruction, image, base_url)
	    print(f"DEBUG: process_frame result: {result}")
	    return result

	def toggle_processing():
	    """Flip the shared processor's ``is_processing`` flag.

	    Returns:
	        A ``(button_label, status_message)`` pair describing the new state.
	    """
	    now_active = not processor.is_processing
	    processor.is_processing = now_active
	    print(f"DEBUG: Processing toggled to: {processor.is_processing}")

	    if not now_active:
	        return "Start", "Processing stopped."
	    return "Stop", "Processing started..."

	def update_stream_interval(interval):
	    """Return a Gradio update object carrying ``interval`` as ``stream_every``."""
	    settings = {"stream_every": interval}
	    return gr.update(**settings)

	def test_api_connection(base_url):
	    """Probe ``<base_url>/health`` and describe the outcome as a string.

	    Args:
	        base_url: API root; surrounding whitespace and trailing slashes are
	            ignored, and ``None`` is treated as empty.

	    Returns:
	        ``"API accessible: <status>"`` whenever an HTTP response arrives (the
	        status code is reported, not judged), otherwise
	        ``"API connection failed: <reason>"``.
	    """
	    try:
	        # Normalise first so "http://host/" does not become "http://host//health".
	        root = (base_url or "").strip().rstrip("/")
	        response = requests.get(f"{root}/health", timeout=5)
	        return f"API accessible: {response.status_code}"
	    except Exception as e:
	        # Broad on purpose: the error text is the return value.
	        return f"API connection failed: {str(e)}"

	# Build the Gradio UI: a webcam image input next to the URL/instruction/response
	# textboxes, a row with the two action buttons, and a row for the connection
	# test result. Components render in the order they are created.
	# NOTE(review): the Row/Column nesting below was reconstructed from a
	# flattened listing - confirm it against the intended layout.
	with gr.Blocks(title="Camera Interaction App", theme=gr.themes.Soft()) as interface:
	    gr.Markdown("# Camera Interaction App")
	    gr.Markdown("Note: Make sure to grant camera permissions in your browser!")

	    with gr.Row():
	        # Webcam capture; its value is the `image` argument of process_image.
	        video_input = gr.Image(
	            sources=["webcam"],
	            label="Camera Feed - Click to capture",
	            width=480,
	            height=360
	        )

	        with gr.Column():
	            # Root URL of the API server, used by both buttons.
	            base_url_input = gr.Textbox(
	                label="Base API URL",
	                value="http://localhost:8080",
	                placeholder="Enter API base URL"
	            )

	            # Prompt text sent along with the captured frame.
	            instruction_input = gr.Textbox(
	                label="Instruction",
	                value="What do you see?",
	                placeholder="Enter your instruction",
	                lines=2
	            )

	            # Read-only box for the result; starts with usage steps.
	            response_output = gr.Textbox(
	                label="Response",
	                value="1. Grant camera permissions\n2. Capture a photo\n3. Click Process Image",
	                interactive=False,
	                lines=3
	            )

	    with gr.Row():
	        # Sends the captured frame on demand.
	        process_button = gr.Button("Process Image", variant="primary")

	        # Checks that the API server answers at all.
	        test_button = gr.Button("Test API Connection", variant="secondary")

	    with gr.Row():
	        test_output = gr.Textbox(label="Connection Test", interactive=False)

	    # Input order must match process_image(instruction, image, base_url).
	    process_button.click(
	        fn=process_image,
	        inputs=[instruction_input, video_input, base_url_input],
	        outputs=response_output
	    )

	    # Writes the result of test_api_connection(base_url) into test_output.
	    test_button.click(
	        fn=test_api_connection,
	        inputs=base_url_input,
	        outputs=test_output
	    )

	if __name__ == "__main__":
	    import os  # local import: only the launch settings below need it

	    # Honour GRADIO_SERVER_NAME / GRADIO_SERVER_PORT when the environment sets
	    # them. Explicit launch() arguments take precedence over those variables,
	    # so a hard-coded "localhost" would keep the app unreachable inside a
	    # hosted container. Without them the previous local defaults still apply.
	    interface.launch(
	        server_name=os.environ.get("GRADIO_SERVER_NAME", "localhost"),
	        server_port=int(os.environ.get("GRADIO_SERVER_PORT", "7860")),
	        share=False,
	        debug=True,
	    )