Spaces:

huggingface
/

eiffel-tower-llama-demo

Running on Zero

App Files Files Community

dlouapre HF Staff committed on Nov 5

Commit

c5681ae

1 Parent(s): 4dbfbc3

Creating the steering demo

Browse files

Files changed (5) hide show

app.py +174 -53
demo.yaml +35 -0
requirements.txt +485 -0
steering.py +286 -0
steering_vectors.pt +3 -0

app.py CHANGED Viewed

@@ -1,70 +1,191 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
-def respond(
-    message,
-    history: list[dict[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-    hf_token: gr.OAuthToken,
-):
     """
-    For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
     """
-    client = InferenceClient(token=hf_token.token, model="openai/gpt-oss-20b")
-    messages = [{"role": "system", "content": system_message}]
-    messages.extend(history)
-    messages.append({"role": "user", "content": message})
-    response = ""
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
     ):
-        choices = message.choices
-        token = ""
-        if len(choices) and choices[0].delta.content:
-            token = choices[0].delta.content
-        response += token
-        yield response
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-chatbot = gr.ChatInterface(
-    respond,
-    type="messages",
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
         ),
-    ],
-)
-with gr.Blocks() as demo:
-    with gr.Sidebar():
-        gr.LoginButton()
-    chatbot.render()
 if __name__ == "__main__":
-    demo.launch()

+"""
+Gradio demo for steered LLM generation using SAE features.
+Supports real-time streaming generation with HuggingFace Transformers.
+IMPORTANT: Before running this app, you must extract steering vectors:
+    python extract_steering_vectors.py
+This creates steering_vectors.pt which is much faster to load than
+downloading full SAE files from HuggingFace Hub.
+For HuggingFace Spaces ZeroGPU deployment, the @spaces.GPU decorator
+ensures efficient GPU allocation only during inference.
+"""
 import gradio as gr
+import torch
+import yaml
+import os
+# ZeroGPU support for HuggingFace Spaces
+try:
+    import spaces
+    SPACES_AVAILABLE = True
+except ImportError:
+    SPACES_AVAILABLE = False
+    # Local development: stand in for the real package with a no-op decorator
+    def spaces_gpu_decorator(func):
+        """Return ``func`` unchanged; replaces ``spaces.GPU`` when the package is missing."""
+        return func
+    # The stub is an *instance*, so a plain function stored on its class would be
+    # looked up as a bound method and `@spaces.GPU` would end up calling
+    # spaces_gpu_decorator(stub, func) -> TypeError. staticmethod() stops the
+    # implicit first argument from being passed.
+    spaces = type('spaces', (), {'GPU': staticmethod(spaces_gpu_decorator)})()
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from steering import load_saes_from_file, stream_steered_answer_hf
+# Global variables
+model = None
+tokenizer = None
+steering_components = None
+cfg = None
+def initialize_model():
+    """
+    Populate the module-level model, tokenizer, steering vectors and config.
+    Reads demo.yaml, loads the causal LM and tokenizer named by ``llm_name``
+    (passing HF_TOKEN from the environment when it is set), then loads the
+    pre-extracted steering vectors and rescales each one to unit norm.
+    When the ``spaces`` package is available the vectors are loaded on CPU;
+    otherwise they are loaded on the device selected below.
+    Returns:
+        Tuple ``(model, tokenizer, steering_components, cfg)``.
     """
+    global model, tokenizer, steering_components, cfg
+    # Optional access token, only needed for gated checkpoints
+    access_token = os.getenv("HF_TOKEN", None)
+    if access_token:
+        print("Using HF_TOKEN from environment")
+    print("Loading configuration...")
+    with open("demo.yaml", "r") as config_file:
+        cfg = yaml.safe_load(config_file)
+    # Half precision is only requested when CUDA is reported as available
+    on_cuda = torch.cuda.is_available()
+    device = "cuda" if on_cuda else "cpu"
+    print(f"Loading model: {cfg['llm_name']}...")
+    if SPACES_AVAILABLE:
+        print(f"Target device: {device} (ZeroGPU will manage allocation)")
+    else:
+        print(f"Target device: {device}")
+    model = AutoModelForCausalLM.from_pretrained(
+        cfg['llm_name'],
+        device_map="auto",
+        dtype=torch.float16 if on_cuda else torch.float32,
+        token=access_token,
+    )
+    tokenizer = AutoTokenizer.from_pretrained(cfg['llm_name'], token=access_token)
+    print("Loading SAE steering components...")
+    # Pre-extracted vectors come from a small local file instead of full SAE checkpoints
+    vector_device = "cpu" if SPACES_AVAILABLE else device
+    steering_components = load_saes_from_file("steering_vectors.pt", cfg, vector_device)
+    # Rescale every steering direction to unit length
+    for component in steering_components:
+        component['vector'] /= component['vector'].norm()
+    print("Model initialized successfully!")
+    return model, tokenizer, steering_components, cfg
+@spaces.GPU
+def chat_function(message, history):
     """
+    Stream a steered reply to ``message`` given the conversation so far.
+    Wrapped in ``spaces.GPU``; generation settings are read from the global ``cfg``.
+    Args:
+        message: Text of the new user turn
+        history: Earlier turns as (user_msg, bot_msg) pairs; a pair whose
+            bot_msg is None contributes only its user turn
+    Yields:
+        Each partial text produced by stream_steered_answer_hf
+    """
+    global model, tokenizer, steering_components, cfg
+    # Flatten the paired history into role/content messages
+    chat = []
+    for user_msg, bot_msg in history:
+        turn = [{"role": "user", "content": user_msg}]
+        if bot_msg is not None:
+            turn.append({"role": "assistant", "content": bot_msg})
+        chat.extend(turn)
+    # The new message always closes the conversation
+    chat.append({"role": "user", "content": message})
+    # Generation settings are forwarded from the config under the same names
+    generation_settings = {
+        name: cfg[name]
+        for name in ("max_new_tokens", "temperature", "repetition_penalty", "clamp_intensity")
+    }
+    for partial_text in stream_steered_answer_hf(
+            model=model,
+            tokenizer=tokenizer,
+            chat=chat,
+            steering_components=steering_components,
+            **generation_settings
     ):
+        yield partial_text
+def create_demo():
+    """Build the Gradio chat interface wired to chat_function and return it."""
+    # Stylesheet handed to Gradio: page font plus a fixed-height chat panel
+    custom_css = """
+    .gradio-container {
+        font-family: 'Arial', sans-serif;
+    }
+    #chatbot {
+        height: 600px;
+    }
+    """
+    # Markdown shown under the title
+    intro_markdown = """
+        This demo showcases **steered text generation** using Sparse Autoencoder (SAE) features.
+        The model (Llama 3.1 8B Instruct) has its activations modified using vectors extracted from SAEs,
+        resulting in controlled behavior changes during generation.
+        **Features:**
+        - Real-time streaming: tokens appear as they're generated ⚡
+        - Multi-turn conversations with full history
+        - SAE-based activation steering across multiple layers
+        Start chatting below!
+        """
+    # Clickable starter prompts
+    sample_prompts = [
+        "Explain how neural networks work.",
+        "Tell me a creative story about a robot.",
+        "What are the applications of AI in healthcare?"
+    ]
+    return gr.ChatInterface(
+        fn=chat_function,
+        title="🎯 Steered LLM Demo with SAE Features",
+        description=intro_markdown,
+        examples=sample_prompts,
+        cache_examples=False,
+        theme=gr.themes.Soft(),
+        css=custom_css,
+        chatbot=gr.Chatbot(
+            elem_id="chatbot",
+            bubble_full_width=False,
+            show_copy_button=True
         ),
+    )
 if __name__ == "__main__":
+    # Script entry point: load everything once, then serve the UI
+    banner = "=" * 60
+    print(banner)
+    print("Steered LLM Demo - Initializing")
+    print(banner)
+    initialize_model()
+    print("\n" + banner)
+    print("Launching Gradio interface...")
+    print(banner + "\n")
+    demo = create_demo()
+    demo.launch(
+        share=False,  # Set to True for public link
+        server_name="0.0.0.0",  # Allow external access
+        server_port=7860  # Default HF Spaces port
+    )

demo.yaml ADDED Viewed

	@@ -0,0 +1,35 @@

+# Model configuration
+llm_name: "meta-llama/Llama-3.1-8B-Instruct"
+sae_path: "andyrdt/saes-llama-3.1-8b-instruct"
+sae_filename_prefix: "resid_post_layer_"
+sae_filename_suffix: "/trainer_1/ae.pt"
+reduced_strengths: false
+features:
+#  - [3, 4774]
+#  - [3, 13935]
+#  - [3, 94572]
+#  - [3, 88169]
+#  - [3, 60537]
+#  - [3, 121375]
+#  - [7, 56243]
+#  - [7, 65190]
+#  - [7, 70732]
+  - [11, 74457, 1.03]
+  - [11, 18894, 1.42]
+  - [11, 61463, 1.77]
+  - [15, 21576, 4.85]
+  - [19, 93, 6.69]
+  - [23, 111898, 10.3]
+  - [23, 40788, 3.24]
+  - [23, 21334, 1.38]
+#  - [27, 52459]
+#  - [27, 86068]
+# Generation parameters
+temperature: 0.5
+seed: 16
+max_new_tokens: 256
+repetition_penalty: 1.1
+steer_prompt: true
+clamp_intensity: true

requirements.txt ADDED Viewed

	@@ -0,0 +1,485 @@

+# This file was autogenerated by uv via the following command:
+#    uv pip compile pyproject.toml -o requirements.txt
+accelerate==1.11.0
+    # via
+    #   eiffel-demo (pyproject.toml)
+    #   nnsight
+    #   transformer-lens
+aiofiles==24.1.0
+    # via gradio
+aiohappyeyeballs==2.6.1
+    # via aiohttp
+aiohttp==3.13.2
+    # via fsspec
+aiosignal==1.4.0
+    # via aiohttp
+annotated-doc==0.0.3
+    # via fastapi
+annotated-types==0.7.0
+    # via pydantic
+anyio==4.11.0
+    # via
+    #   gradio
+    #   httpx
+    #   starlette
+astor==0.8.1
+    # via nnsight
+asttokens==3.0.0
+    # via stack-data
+attrs==25.4.0
+    # via aiohttp
+babe==0.0.7
+    # via sae-lens
+beartype==0.14.1
+    # via transformer-lens
+better-abc==0.0.3
+    # via transformer-lens
+bidict==0.23.1
+    # via python-socketio
+brotli==1.1.0
+    # via gradio
+certifi==2025.10.5
+    # via
+    #   httpcore
+    #   httpx
+    #   requests
+    #   sentry-sdk
+charset-normalizer==3.4.4
+    # via requests
+click==8.3.0
+    # via
+    #   nltk
+    #   typer
+    #   uvicorn
+    #   wandb
+cloudpickle==3.1.2
+    # via nnsight
+config2py==0.1.42
+    # via py2store
+datasets==4.4.0
+    # via
+    #   sae-lens
+    #   transformer-lens
+decorator==5.2.1
+    # via ipython
+dill==0.4.0
+    # via
+    #   datasets
+    #   multiprocess
+docstring-parser==0.17.0
+    # via simple-parsing
+dol==0.3.31
+    # via
+    #   config2py
+    #   graze
+    #   py2store
+einops==0.8.1
+    # via transformer-lens
+executing==2.2.1
+    # via stack-data
+fancy-einsum==0.0.3
+    # via transformer-lens
+fastapi==0.121.0
+    # via gradio
+ffmpy==0.6.4
+    # via gradio
+filelock==3.20.0
+    # via
+    #   datasets
+    #   huggingface-hub
+    #   torch
+    #   transformers
+frozenlist==1.8.0
+    # via
+    #   aiohttp
+    #   aiosignal
+fsspec==2025.10.0
+    # via
+    #   datasets
+    #   gradio-client
+    #   huggingface-hub
+    #   torch
+gitdb==4.0.12
+    # via gitpython
+gitpython==3.1.45
+    # via wandb
+gradio==5.49.1
+    # via eiffel-demo (pyproject.toml)
+gradio-client==1.13.3
+    # via gradio
+graze==0.1.39
+    # via babe
+groovy==0.1.2
+    # via gradio
+h11==0.16.0
+    # via
+    #   httpcore
+    #   uvicorn
+    #   wsproto
+hf-transfer==0.1.9
+    # via eiffel-demo (pyproject.toml)
+hf-xet==1.2.0
+    # via huggingface-hub
+httpcore==1.0.9
+    # via httpx
+httpx==0.28.1
+    # via
+    #   datasets
+    #   gradio
+    #   gradio-client
+    #   safehttpx
+huggingface-hub==0.36.0
+    # via
+    #   accelerate
+    #   datasets
+    #   gradio
+    #   gradio-client
+    #   tokenizers
+    #   transformers
+i2==0.1.58
+    # via config2py
+idna==3.11
+    # via
+    #   anyio
+    #   httpx
+    #   requests
+    #   yarl
+importlib-resources==6.5.2
+    # via py2store
+ipython==9.6.0
+    # via nnsight
+ipython-pygments-lexers==1.1.1
+    # via ipython
+jaxtyping==0.3.3
+    # via transformer-lens
+jedi==0.19.2
+    # via ipython
+jinja2==3.1.6
+    # via
+    #   gradio
+    #   torch
+joblib==1.5.2
+    # via nltk
+markdown-it-py==4.0.0
+    # via rich
+markupsafe==3.0.3
+    # via
+    #   gradio
+    #   jinja2
+matplotlib-inline==0.2.1
+    # via ipython
+mdurl==0.1.2
+    # via markdown-it-py
+mpmath==1.3.0
+    # via sympy
+multidict==6.7.0
+    # via
+    #   aiohttp
+    #   yarl
+multiprocess==0.70.18
+    # via datasets
+narwhals==2.10.1
+    # via plotly
+networkx==3.5
+    # via torch
+nltk==3.9.2
+    # via sae-lens
+nnsight==0.5.10
+    # via eiffel-demo (pyproject.toml)
+numpy==1.26.4
+    # via
+    #   accelerate
+    #   datasets
+    #   gradio
+    #   pandas
+    #   patsy
+    #   plotly-express
+    #   scipy
+    #   statsmodels
+    #   transformer-lens
+    #   transformers
+nvidia-cublas-cu12==12.8.4.1
+    # via
+    #   nvidia-cudnn-cu12
+    #   nvidia-cusolver-cu12
+    #   torch
+nvidia-cuda-cupti-cu12==12.8.90
+    # via torch
+nvidia-cuda-nvrtc-cu12==12.8.93
+    # via torch
+nvidia-cuda-runtime-cu12==12.8.90
+    # via torch
+nvidia-cudnn-cu12==9.10.2.21
+    # via torch
+nvidia-cufft-cu12==11.3.3.83
+    # via torch
+nvidia-cufile-cu12==1.13.1.3
+    # via torch
+nvidia-curand-cu12==10.3.9.90
+    # via torch
+nvidia-cusolver-cu12==11.7.3.90
+    # via torch
+nvidia-cusparse-cu12==12.5.8.93
+    # via
+    #   nvidia-cusolver-cu12
+    #   torch
+nvidia-cusparselt-cu12==0.7.1
+    # via torch
+nvidia-nccl-cu12==2.27.5
+    # via torch
+nvidia-nvjitlink-cu12==12.8.93
+    # via
+    #   nvidia-cufft-cu12
+    #   nvidia-cusolver-cu12
+    #   nvidia-cusparse-cu12
+    #   torch
+nvidia-nvshmem-cu12==3.3.20
+    # via torch
+nvidia-nvtx-cu12==12.8.90
+    # via torch
+orjson==3.11.4
+    # via gradio
+packaging==25.0
+    # via
+    #   accelerate
+    #   datasets
+    #   gradio
+    #   gradio-client
+    #   huggingface-hub
+    #   plotly
+    #   statsmodels
+    #   transformers
+    #   wandb
+pandas==2.3.3
+    # via
+    #   babe
+    #   datasets
+    #   gradio
+    #   plotly-express
+    #   statsmodels
+    #   transformer-lens
+parso==0.8.5
+    # via jedi
+patsy==1.0.2
+    # via
+    #   plotly-express
+    #   statsmodels
+pexpect==4.9.0
+    # via ipython
+pillow==11.3.0
+    # via gradio
+platformdirs==4.5.0
+    # via wandb
+plotly==6.3.1
+    # via
+    #   plotly-express
+    #   sae-lens
+plotly-express==0.4.1
+    # via sae-lens
+prompt-toolkit==3.0.52
+    # via ipython
+propcache==0.4.1
+    # via
+    #   aiohttp
+    #   yarl
+protobuf==6.33.0
+    # via wandb
+psutil==7.1.3
+    # via accelerate
+ptyprocess==0.7.0
+    # via pexpect
+pure-eval==0.2.3
+    # via stack-data
+py2store==0.1.22
+    # via babe
+pyarrow==22.0.0
+    # via datasets
+pydantic==2.11.10
+    # via
+    #   fastapi
+    #   gradio
+    #   nnsight
+    #   wandb
+pydantic-core==2.33.2
+    # via pydantic
+pydub==0.25.1
+    # via gradio
+pygments==2.19.2
+    # via
+    #   ipython
+    #   ipython-pygments-lexers
+    #   rich
+python-dateutil==2.9.0.post0
+    # via pandas
+python-dotenv==1.2.1
+    # via sae-lens
+python-engineio==4.12.3
+    # via python-socketio
+python-multipart==0.0.20
+    # via gradio
+python-socketio==5.14.3
+    # via nnsight
+pytz==2025.2
+    # via pandas
+pyyaml==6.0.3
+    # via
+    #   eiffel-demo (pyproject.toml)
+    #   accelerate
+    #   datasets
+    #   gradio
+    #   huggingface-hub
+    #   sae-lens
+    #   transformers
+    #   wandb
+regex==2025.11.3
+    # via
+    #   nltk
+    #   transformers
+requests==2.32.5
+    # via
+    #   datasets
+    #   graze
+    #   huggingface-hub
+    #   python-socketio
+    #   transformers
+    #   wandb
+rich==14.2.0
+    # via
+    #   nnsight
+    #   transformer-lens
+    #   typer
+ruff==0.14.3
+    # via gradio
+sae-lens==6.21.0
+    # via eiffel-demo (pyproject.toml)
+safehttpx==0.1.7
+    # via gradio
+safetensors==0.6.2
+    # via
+    #   accelerate
+    #   sae-lens
+    #   transformers
+scipy==1.16.3
+    # via
+    #   plotly-express
+    #   statsmodels
+semantic-version==2.10.0
+    # via gradio
+sentencepiece==0.2.1
+    # via transformer-lens
+sentry-sdk==2.43.0
+    # via wandb
+shellingham==1.5.4
+    # via typer
+simple-parsing==0.1.7
+    # via sae-lens
+simple-websocket==1.1.0
+    # via python-engineio
+six==1.17.0
+    # via python-dateutil
+smmap==5.0.2
+    # via gitdb
+sniffio==1.3.1
+    # via anyio
+stack-data==0.6.3
+    # via ipython
+starlette==0.49.3
+    # via
+    #   fastapi
+    #   gradio
+statsmodels==0.14.5
+    # via plotly-express
+sympy==1.14.0
+    # via torch
+tenacity==9.1.2
+    # via sae-lens
+tokenizers==0.22.1
+    # via transformers
+toml==0.10.2
+    # via nnsight
+tomlkit==0.13.3
+    # via gradio
+torch==2.9.0
+    # via
+    #   eiffel-demo (pyproject.toml)
+    #   accelerate
+    #   nnsight
+    #   transformer-lens
+tqdm==4.67.1
+    # via
+    #   datasets
+    #   huggingface-hub
+    #   nltk
+    #   transformer-lens
+    #   transformers
+traitlets==5.14.3
+    # via
+    #   ipython
+    #   matplotlib-inline
+transformer-lens==2.16.1
+    # via sae-lens
+transformers==4.57.1
+    # via
+    #   eiffel-demo (pyproject.toml)
+    #   nnsight
+    #   sae-lens
+    #   transformer-lens
+    #   transformers-stream-generator
+transformers-stream-generator==0.0.5
+    # via transformer-lens
+triton==3.5.0
+    # via torch
+typeguard==4.4.4
+    # via transformer-lens
+typer==0.20.0
+    # via gradio
+typing-extensions==4.15.0
+    # via
+    #   aiosignal
+    #   anyio
+    #   fastapi
+    #   gradio
+    #   gradio-client
+    #   huggingface-hub
+    #   ipython
+    #   pydantic
+    #   pydantic-core
+    #   sae-lens
+    #   simple-parsing
+    #   starlette
+    #   torch
+    #   transformer-lens
+    #   typeguard
+    #   typer
+    #   typing-inspection
+    #   wandb
+typing-inspection==0.4.2
+    # via pydantic
+tzdata==2025.2
+    # via pandas
+urllib3==2.5.0
+    # via
+    #   requests
+    #   sentry-sdk
+uvicorn==0.38.0
+    # via gradio
+wadler-lindig==0.1.7
+    # via jaxtyping
+wandb==0.22.3
+    # via transformer-lens
+wcwidth==0.2.14
+    # via prompt-toolkit
+websocket-client==1.9.0
+    # via python-socketio
+websockets==15.0.1
+    # via gradio-client
+wsproto==1.2.0
+    # via simple-websocket
+xxhash==3.6.0
+    # via datasets
+yarl==1.22.0
+    # via aiohttp
+# HuggingFace Spaces ZeroGPU support
+spaces==0.28.3
+    # via eiffel-demo (for ZeroGPU deployment)

steering.py ADDED Viewed

	@@ -0,0 +1,286 @@

+import torch
+from nnsight import LanguageModel
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+from threading import Thread
+from huggingface_hub import hf_hub_download
+def load_saes(cfg, device):
+    """
+    Download SAE checkpoints and extract one steering vector per configured feature.
+    Args:
+        cfg: Configuration dict. Reads 'features' (entries of
+            [layer, feature_index] or [layer, feature_index, strength]),
+            'sae_path', 'sae_filename_prefix', 'sae_filename_suffix' and the
+            optional 'reduced_strengths' flag.
+        device: Device the extracted vectors are moved to
+    Returns:
+        List of dicts with keys 'layer', 'feature', 'strength', 'vector', in the
+        same order as cfg['features']; empty when no features are configured.
+    """
+    features = cfg['features']
+    if not features:
+        print("No features specified, returning empty steering components.")
+        return []
+    steering_components = []
+    cache_dir = "./downloads"
+    # .get() mirrors load_saes_from_file, so the flag may be omitted from the config
+    reduced_strengths = cfg.get('reduced_strengths', False)
+    # Only the most recently used layer's decoder is kept: consecutive features of
+    # one layer share a single torch.load while at most one decoder stays resident.
+    cached_layer = None
+    decoder_weight = None
+    for feature in features:
+        layer_idx, feature_idx = feature[0], feature[1]
+        # A feature listed without a strength contributes no steering (0.0)
+        strength = feature[2] if len(feature) > 2 else 0.0
+        # If the strengths in the config file were given in reduced form, scale them by layer index
+        if reduced_strengths:
+            strength *= layer_idx
+        # Display strength (avoid division by zero)
+        reduced_str = f"[{strength/layer_idx:.2f}]" if layer_idx > 0 else "[N/A]"
+        print(f"Loading feature {layer_idx} {feature_idx} {strength:.2f} {reduced_str}")
+        if layer_idx != cached_layer:
+            # Drop the previous layer's decoder before loading the next one
+            decoder_weight = None
+            sae_filename = cfg['sae_filename_prefix'] + f"{layer_idx}" + cfg['sae_filename_suffix']
+            file_path = hf_hub_download(repo_id=cfg['sae_path'], filename=sae_filename, cache_dir=cache_dir)
+            # NOTE(review): torch.load unpickles the checkpoint; only point sae_path at trusted repos
+            decoder_weight = torch.load(file_path, map_location="cpu")["decoder.weight"]
+            cached_layer = layer_idx
+        # clone() gives the column its own storage; a bare slice is a view that would
+        # keep the entire decoder matrix alive for as long as the vector is referenced
+        vec = decoder_weight[:, feature_idx].clone().to(device, non_blocking=True)
+        steering_components.append({
+            'layer': layer_idx,
+            'feature': feature_idx,
+            'strength': strength,
+            'vector': vec
+        })
+    return steering_components
+def load_saes_from_file(file_path, cfg, device):
+    """
+    Load pre-extracted steering vectors from a local file.
+    This is much faster than load_saes() since it doesn't download large SAE files.
+    The file should be created using extract_steering_vectors.py script.
+    Args:
+        file_path: Path to the .pt file; it must hold a dict keyed by
+            (layer, feature_index) tuples whose values are tensors
+        cfg: Configuration dict with a 'features' list of
+            [layer, feature_index] or [layer, feature_index, strength] entries
+            and an optional 'reduced_strengths' flag
+        device: Device to load tensors on ('cuda' or 'cpu')
+    Returns:
+        List of steering component dicts with keys: 'layer', 'feature', 'strength', 'vector'
+    Raises:
+        FileNotFoundError: If file_path does not exist
+        KeyError: If a configured (layer, feature_index) pair is missing from the file
+    """
+    import os
+    if not os.path.exists(file_path):
+        raise FileNotFoundError(
+            f"Steering vectors file not found: {file_path}\n"
+            f"Please run: python extract_steering_vectors.py"
+        )
+    features = cfg['features']
+    if not features:
+        # Nothing to look up, so skip deserializing the vectors file altogether
+        print("No features specified in config.")
+        return []
+    print(f"Loading pre-extracted steering vectors from {file_path}...")
+    # weights_only=True limits unpickling to tensors and plain containers, which is
+    # all this file is expected to hold, and makes that independent of the torch default
+    steering_vectors_dict = torch.load(file_path, map_location="cpu", weights_only=True)
+    steering_components = []
+    reduced_strengths = cfg.get('reduced_strengths', False)
+    for feature in features:
+        layer_idx, feature_idx = feature[0], feature[1]
+        # A feature listed without a strength contributes no steering (0.0)
+        strength = feature[2] if len(feature) > 2 else 0.0
+        # Reduced strengths are stored per layer index and scaled back up here
+        if reduced_strengths:
+            strength *= layer_idx
+        # Look up the pre-extracted vector
+        key = (layer_idx, feature_idx)
+        if key not in steering_vectors_dict:
+            raise KeyError(
+                f"Vector for layer {layer_idx}, feature {feature_idx} not found in {file_path}.\n"
+                f"Please re-run: python extract_steering_vectors.py"
+            )
+        vec = steering_vectors_dict[key].to(device, non_blocking=True)
+        # Display (avoid division by zero for layer 0)
+        reduced_str = f"[{strength/layer_idx:.2f}]" if layer_idx > 0 else "[N/A]"
+        print(f"Loaded feature {layer_idx} {feature_idx} {strength:.2f} {reduced_str}")
+        steering_components.append({
+            'layer': layer_idx,
+            'feature': feature_idx,
+            'strength': strength,
+            'vector': vec  # presumably pre-normalized by the extraction script; not checked here
+        })
+    print(f"Loaded {len(steering_components)} steering vector(s) from local file")
+    return steering_components
+def generate_steered_answer(model: LanguageModel,
+                            chat,
+                            steering_components,
+                            max_new_tokens=128,
+                            temperature=0.0,
+                            repetition_penalty=1.0,
+                            clamp_intensity=False):
+    """
+    Generates an answer from the model given a chat history, applying steering components.
+    Expects steering_components to be a list of dicts with keys:
+        'layer': int, layer index to apply steering
+        'strength': float, steering intensity
+        'vector': torch.Tensor, steering vector
+    Args:
+        model: nnsight LanguageModel; its tokenizer, layers and generator are used
+        chat: Conversation passed to the tokenizer's chat template
+        max_new_tokens: Forwarded to model.generate
+        temperature: Forwarded to model.generate; sampling is enabled only when > 0
+        repetition_penalty: Forwarded to model.generate
+        clamp_intensity: If True, subtract the layer output's projection onto each
+            vector from the amount that is added
+    Returns:
+        Dict with 'input_ids' (templated prompt ids), 'trace' (saved generator
+        output) and 'answer' (decoded text after the prompt tokens)
+    """
+    # Tokenized prompt, including the template's generation prompt
+    input_ids = model.tokenizer.apply_chat_template(chat, tokenize=True, add_generation_prompt=True)
+    # The tokenizer's EOS id doubles as the pad token id
+    with model.generate(max_new_tokens=max_new_tokens, repetition_penalty=repetition_penalty,
+                        do_sample=temperature > 0.0, temperature=temperature,
+                        pad_token_id=model.tokenizer.eos_token_id) as tracer:
+        with tracer.invoke(input_ids):
+            # NOTE(review): tracer.all() presumably applies the body at every generation
+            # step rather than only the first — confirm against the nnsight docs
+            with tracer.all():
+                for sc in steering_components:
+                    layer, strength, vector = sc["layer"], sc["strength"], sc["vector"]
+                    # Ensure vector matches model dtype and device
+                    layer_output = model.model.layers[layer].output
+                    vector = vector.to(dtype=layer_output.dtype, device=layer_output.device)
+                    # Dimension 1 of the layer output is used as the sequence length
+                    length = layer_output.shape[1]
+                    # strength * vector repeated per position; clone() gives the expanded
+                    # view its own storage so the in-place subtraction below is safe
+                    amount = (strength * vector).unsqueeze(0).expand(length, -1).unsqueeze(0).clone()
+                    if clamp_intensity:
+                        # Component of the layer output along vector (a true projection
+                        # only if vector is unit-norm — TODO confirm callers normalize)
+                        projection = (layer_output @ vector).unsqueeze(-1)@(vector.unsqueeze(0))
+                        amount -= projection
+                    # In-place update of the traced layer output
+                    layer_output += amount
+        with tracer.invoke():
+            trace = model.generator.output.save()
+    # Drop the prompt tokens before decoding
+    answer = model.tokenizer.decode(trace[0][len(input_ids):], skip_special_tokens=True)
+    output = {'input_ids': input_ids, 'trace': trace, 'answer': answer}
+    return output
+def create_steering_hook(layer_idx, steering_components, clamp_intensity=False):
+    """
+    Build a forward hook that adds one layer's steering vectors to its output.
+    Args:
+        layer_idx: Layer the hook is meant for
+        steering_components: Components for all layers; only entries whose
+            'layer' equals layer_idx are applied
+        clamp_intensity: If True, subtract the hidden states' existing projection
+            onto each vector from the amount that is added
+    Returns:
+        The hook callable, or None when no component targets layer_idx
+    """
+    targets = []
+    for component in steering_components:
+        if component['layer'] == layer_idx:
+            targets.append(component)
+    if len(targets) == 0:
+        return None
+    def _steer(states):
+        """Apply every selected component to a [batch, seq_len, hidden_dim] tensor."""
+        for component in targets:
+            # Bring the direction onto the activations' dtype and device
+            direction = component['vector'].to(dtype=states.dtype, device=states.device)
+            scaled = component['strength'] * direction
+            # Same offset at every position: [1, seq_len, hidden_dim]
+            delta = scaled.unsqueeze(0).expand(states.shape[1], -1).unsqueeze(0)
+            if clamp_intensity:
+                # Per-position coefficient of the activations along the direction
+                coeffs = torch.einsum('bsh,h->bs', states, direction).unsqueeze(-1)
+                delta = delta - coeffs * direction.view(1, 1, -1)
+            states = states + delta
+        return states
+    def hook(module, input, output):
+        """Forward hook: steer the hidden states and return them in the layer's own format."""
+        is_tuple = isinstance(output, tuple)
+        hidden_states = output[0] if is_tuple else output
+        if len(hidden_states.shape) == 2:
+            # [batch, hidden_dim]: add a sequence axis for steering, then drop it again
+            steered = _steer(hidden_states.unsqueeze(1)).squeeze(1)
+        else:
+            steered = _steer(hidden_states)
+        if is_tuple:
+            # Re-attach whatever followed the hidden states
+            return (steered,) + output[1:]
+        return steered
+    return hook
+def stream_steered_answer_hf(model: AutoModelForCausalLM,
+                                tokenizer: AutoTokenizer,
+                                chat,
+                                steering_components,
+                                max_new_tokens=128,
+                                temperature=0.0,
+                                repetition_penalty=1.0,
+                                clamp_intensity=False,
+                                stream=True):
+    """
+    Generate steered answer using pure HuggingFace Transformers with streaming.
+    Steering hooks are attached for the duration of one generation and are
+    always removed again, including when the caller stops consuming early
+    or generation fails.
+    Args:
+        model: HuggingFace transformers model
+        tokenizer: Tokenizer instance
+        chat: Chat history in OpenAI format
+        steering_components: List of dicts with 'layer', 'strength', 'vector'
+        max_new_tokens: Maximum tokens to generate
+        temperature: Sampling temperature (0 = greedy)
+        repetition_penalty: Repetition penalty
+        clamp_intensity: Whether to clamp steering intensity
+        stream: Currently unused; kept for interface compatibility
+    Yields:
+        The text generated so far, growing as tokens are produced
+    Raises:
+        Exception: Any error raised by model.generate in the worker thread is
+            re-raised here once the stream has ended
+    """
+    input_ids_list = tokenizer.apply_chat_template(chat, tokenize=True, add_generation_prompt=True)
+    input_ids = torch.tensor([input_ids_list]).to(model.device)
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+    generation_kwargs = {
+        "input_ids": input_ids,
+        # Single unpadded sequence, so every position is attended to; passing the mask
+        # explicitly avoids generate() having to infer it while pad == eos
+        "attention_mask": torch.ones_like(input_ids),
+        "max_new_tokens": max_new_tokens,
+        "temperature": temperature if temperature > 0 else 1.0,
+        "do_sample": temperature > 0,
+        "repetition_penalty": repetition_penalty,
+        "streamer": streamer,
+        "pad_token_id": tokenizer.eos_token_id,
+    }
+    generation_errors = []
+    def run_generation():
+        """Run generate() in the worker thread and record any failure."""
+        try:
+            model.generate(**generation_kwargs)
+        except Exception as exc:
+            generation_errors.append(exc)
+            # generate() died before signalling end-of-stream; send it ourselves so
+            # the consumer loop below does not block forever
+            streamer.end()
+    hook_handles = []
+    thread = None
+    try:
+        # Register one steering hook per layer that has components
+        for layer_idx in {sc['layer'] for sc in steering_components}:
+            hook_fn = create_steering_hook(layer_idx, steering_components, clamp_intensity)
+            if hook_fn:
+                layer_module = model.model.layers[layer_idx]
+                hook_handles.append(layer_module.register_forward_hook(hook_fn))
+        thread = Thread(target=run_generation)
+        thread.start()
+        generated_text = ""
+        for token_text in streamer:
+            generated_text += token_text
+            yield generated_text
+    finally:
+        # Runs on normal completion, on errors, and when the caller closes the
+        # generator early. Wait for the worker first so hooks are not pulled out
+        # from under a forward pass, then detach them so they cannot pile up on
+        # the shared model across requests.
+        if thread is not None:
+            thread.join()
+        for handle in hook_handles:
+            handle.remove()
+    if generation_errors:
+        raise generation_errors[0]

steering_vectors.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ba54a67bef9880b37df42668de7b5561e886bb3be591535409740d56f445f287
+size 134539