""" Eiffel Tower Steered LLM Demo with SAE Features """
import gradio as gr
import torch
import yaml
import os
# ZeroGPU support for HuggingFace Spaces
try:
    import spaces
    SPACES_AVAILABLE = True
except ImportError:
    SPACES_AVAILABLE = False

    # Create a dummy no-op decorator for local development.
    # staticmethod keeps the function from being bound to the dummy instance,
    # so `@spaces.GPU` behaves the same as with the real package.
    def spaces_gpu_decorator(func):
        return func

    spaces = type('spaces', (), {'GPU': staticmethod(spaces_gpu_decorator)})()
from transformers import AutoModelForCausalLM, AutoTokenizer
from steering import load_saes_from_file, stream_steered_answer_hf
# Global variables
model = None
tokenizer = None
steering_components = None
cfg = None
def initialize_model():
"""
Load model, SAEs, and configuration on startup.
For ZeroGPU: Model is loaded with device_map="auto" and will be automatically
moved to GPU when @spaces.GPU decorated functions are called. Steering vectors
are loaded on CPU initially and moved to GPU during inference.
"""
global model, tokenizer, steering_components, cfg
# Get HuggingFace token for gated models (if needed)
hf_token = os.getenv("HF_TOKEN", None)
if hf_token:
print("Using HF_TOKEN from environment")
print("Loading configuration...")
with open("demo.yaml", "r") as f:
cfg = yaml.safe_load(f)
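    # demo.yaml provides the settings used below: llm_name, max_new_tokens,
    # temperature, repetition_penalty, clamp_intensity.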
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Loading model: {cfg['llm_name']}...")
print(f"Target device: {device} (ZeroGPU will manage allocation)" if SPACES_AVAILABLE else f"Target device: {device}")
model = AutoModelForCausalLM.from_pretrained(
cfg['llm_name'],
device_map="auto",
dtype=torch.float16 if device == "cuda" else torch.float32,
token=hf_token
)
tokenizer = AutoTokenizer.from_pretrained(cfg['llm_name'], token=hf_token)
print("Loading SAE steering components...")
# Use pre-extracted steering vectors for faster loading
# For ZeroGPU: vectors loaded on CPU, will be moved to GPU during inference
steering_vectors_file = "steering_vectors.pt"
load_device = "cpu" if SPACES_AVAILABLE else device
steering_components = load_saes_from_file(steering_vectors_file, cfg, load_device)
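    # Normalize each steering component's 'vector' to unit norm before use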
    for component in steering_components:
        component['vector'] /= component['vector'].norm()

    print("Model initialized successfully!")
    return model, tokenizer, steering_components, cfg
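
# On ZeroGPU Spaces, @spaces.GPU requests a GPU for the duration of the call;
# locally (without the `spaces` package) the fallback decorator above is a no-op.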
@spaces.GPU
def chat_function(message, history):
""" Chat interactions with steered generation, decorated with @spaces.GPU."""
global model, tokenizer, steering_components, cfg
# Convert Gradio history format to chat format
chat = [{"role": "system", "content": "You are a helpful assistant."}]
for user_msg, bot_msg in history:
chat.append({"role": "user", "content": user_msg})
if bot_msg is not None:
chat.append({"role": "assistant", "content": bot_msg})
# Add current message
chat.append({"role": "user", "content": message})
# Stream tokens as they are generated
for partial_text in stream_steered_answer_hf(
model=model,
tokenizer=tokenizer,
chat=chat,
steering_components=steering_components,
max_new_tokens=cfg['max_new_tokens'],
temperature=cfg['temperature'],
repetition_penalty=cfg['repetition_penalty'],
clamp_intensity=cfg['clamp_intensity']
):
yield partial_text
def create_demo():
"""Create and configure the Gradio interface."""
# Custom CSS for better appearance
custom_css = """
.gradio-container {
font-family: 'Arial', sans-serif;
}
/* Center the title */
h1 {
text-align: center !important;
}
/* Hide the footer with API/Gradio/Settings icons */
footer {
display: none !important;
}
/* Make the entire chat area have better contrast */
#chatbot {
height: 600px;
border: 2px solid rgba(0, 0, 0, 0.2) !important;
border-radius: 8px !important;
background-color: white !important;
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1) !important;
}
/* Ensure input area is visible and properly positioned */
.input-container {
margin-top: 1rem;
padding: 1rem;
background: white;
border: 2px solid rgba(0, 0, 0, 0.2);
border-radius: 8px;
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
"""
# Create the interface
demo = gr.ChatInterface(
fn=chat_function,
title="Have a chat with the Eiffel Tower Llama",
description=""" """,
examples=[
],
cache_examples=False,
theme=gr.themes.Soft(),
css=custom_css,
chatbot=gr.Chatbot(
elem_id="chatbot",
bubble_full_width=False,
show_copy_button=True,
show_label=False
),
)
return demo
if __name__ == "__main__":
print("=" * 60)
print("Steered LLM Demo - Initializing")
print("=" * 60)
initialize_model()
print("\n" + "=" * 60)
print("Launching Gradio interface...")
print("=" * 60 + "\n")
demo = create_demo()
demo.launch(
share=False, # Set to True for public link
server_name="0.0.0.0", # Allow external access
server_port=7860 # Default HF Spaces port
)