NextTokenPredictor

Sleeping

App Files Files Community

PeterPinetree commited on Aug 15

Commit

561f65d

verified ·

1 Parent(s): 49621ce

Update app.py

Browse files

Added semantic neighborhood and adjusted color scheme.

Files changed (1) hide show

app.py +204 -51

app.py CHANGED Viewed

@@ -1,61 +1,214 @@
-#from fastapi import FastAPI
-import solara
 import random
 import torch
 import torch.nn.functional as F
-import pandas as pd
 from transformers import AutoTokenizer, AutoModelForCausalLM
-#app = FastAPI()
-#@app.get("/")
-#def greet_json():
-    #return {"Hello": "World!"}
-tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-0.6B')
-model = AutoModelForCausalLM.from_pretrained('Qwen/Qwen3-0.6B')
-text1 = solara.reactive("Never gonna give you up, never gonna let you")
 @solara.component
 def Page():
-  with solara.Column(margin=10):
-    solara.Markdown("#Next token prediction visualization")
-    solara.Markdown("I built this tool to help me understand autoregressive language models. For any given text, it gives the top 10 candidates to be the next token with their respective probabilities. The language model I'm using is the smallest version of Qwen: Qwen3-0.6B.")
-    def on_action_cell(column, row_index):
-      text1.value += tokenizer.decode(top_10.indices[0][row_index])
-    cell_actions = [solara.CellAction(icon="mdi-thumb-up", name="Select", on_click=on_action_cell)]
-    solara.InputText("Enter text:", value=text1, continuous_update=True)
-    if text1.value != "":
-      tokens = tokenizer.encode(text1.value, return_tensors="pt")
-      spans1 = ""
-      spans2 = ""
-      for i, token in enumerate(tokens[0]):
-        random.seed(i)
-        random_color = ''.join([random.choice('0123456789ABCDEF') for k in range(6)])
-        spans1 += " " + f"<span style='font-family: helvetica; color: #{random_color}'>{token}</span>"
-        spans2 += " " + f"""<span style="
-            padding: 6px;
-            border-right: 3px solid white;
-            line-height: 3em;
-            font-family: courier;
-            background-color: #{random_color};
-            color: white;
-            position: relative;
-          "><span style="
-          position: absolute;
-          top: 5.5ch;
-          line-height: 1em;
-          left: -0.5px;
-          font-size: 0.45em"> {token}</span>{tokenizer.decode([token])}</span>"""
-      solara.Markdown(f'{spans2}')
-      solara.Markdown(f'{spans1}')
-      outputs = model.generate(tokens, max_new_tokens=1, output_scores=True, return_dict_in_generate=True, pad_token_id=tokenizer.eos_token_id)
-      scores = F.softmax(outputs.scores[0], dim=-1)
-      top_10 = torch.topk(scores, 10)
-      df = pd.DataFrame()
-      df["probs"] = top_10.values[0]
-      df["probs"] = [f"{value:.2%}" for value in df["probs"].values]
-      df["next token ID"] = [top_10.indices[0][i].numpy() for i in range(10)]
-      df["predicted next token"] = [tokenizer.decode(top_10.indices[0][i]) for i in range(10)]
-      solara.Markdown("###Prediction")
-      solara.DataFrame(df, items_per_page=10, cell_actions=cell_actions)
 Page()

+# app.py
+import json
 import random
+from pathlib import Path
+import solara
+import pandas as pd
 import torch
 import torch.nn.functional as F
 from transformers import AutoTokenizer, AutoModelForCausalLM
+# ---- Model (same as original Space) -----------------------------------------
+MODEL_ID = "Qwen/Qwen3-0.6B"
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID)
+# ---- App state ---------------------------------------------------------------
+text_rx = solara.reactive("twinkle, twinkle, little ")
+top10_rx = solara.reactive(pd.DataFrame(columns=["probs", "next token ID", "predicted next token"]))
+selected_token_id_rx = solara.reactive(None)   # for neighborhood focus
+notice_rx = solara.reactive("Enter text to see predictions.")
+theme_css = solara.reactive("""
+<style>
+:root {
+  --primary: #38bdf8;   /* light blue */
+  --bg: #ffffff;        /* white */
+  --text: #000000;      /* black */
+  --muted: #6b7280;     /* gray-500 */
+  --border: #e5e7eb;    /* gray-200 */
+}
+body { background: var(--bg); color: var(--text); }
+h1, h2, h3 { color: var(--text); }
+table td, table th { border-color: var(--border) !important; }
+.solara-dataframe .MuiTableCell-root { font-size: 14px; }
+.btn-primary { background: var(--primary); color: #000; border: 1px solid var(--primary); padding: 6px 10px; border-radius: 8px; }
+.badge { display:inline-block; padding:2px 8px; border:1px solid var(--border); border-radius:999px; color:var(--text); }
+</style>
+""")
+# ---- Load embedding assets (your files) --------------------------------------
+ASSETS = Path("assets/embeddings")
+COORDS_PATH = ASSETS / "pca_top5k_coords.json"
+NEIGH_PATH = ASSETS / "neighbors_top5k_k40.json"
+coords = {}
+neighbors = {}
+ids_set = set()
+if COORDS_PATH.exists() and NEIGH_PATH.exists():
+    with COORDS_PATH.open("r", encoding="utf-8") as f:
+        coords = json.load(f)            # {token_id: [x, y], ...}
+    with NEIGH_PATH.open("r", encoding="utf-8") as f:
+        neighbors = json.load(f)         # {"neighbors": {token_id: [[nid, sim], ...]}}
+    ids_set = set(map(int, coords.keys()))
+else:
+    notice_rx.set("Embedding files not found. Add assets/embeddings/*.json to enable the map.")
+# ---- Helpers -----------------------------------------------------------------
+def predict_top10(prompt: str) -> pd.DataFrame:
+    if not prompt:
+        return pd.DataFrame(columns=["probs", "next token ID", "predicted next token"])
+    tokens = tokenizer.encode(prompt, return_tensors="pt")
+    out = model.generate(
+        tokens,
+        max_new_tokens=1,
+        output_scores=True,
+        return_dict_in_generate=True,
+        pad_token_id=tokenizer.eos_token_id,
+        do_sample=False,  # greedy (deterministic)
+        temperature=0.0,
+        top_k=1,
+        top_p=1.0,
+    )
+    scores = F.softmax(out.scores[0], dim=-1)      # [1, vocab]
+    top_10 = torch.topk(scores, 10)
+    df = pd.DataFrame()
+    df["probs"] = top_10.values[0].detach().cpu().numpy()
+    df["probs"] = [f"{p:.2%}" for p in df["probs"]]
+    ids = [int(top_10.indices[0][i].detach().cpu().item()) for i in range(10)]
+    df["next token ID"] = ids
+    df["predicted next token"] = [tokenizer.decode([i]) for i in ids]
+    return df
+def get_neighbor_list(token_id: int, k: int = 18):
+    if not ids_set or token_id not in ids_set:
+        return []
+    raw = neighbors.get("neighbors", {}).get(str(token_id), [])
+    # raw item is [nid, sim]; keep top k
+    return raw[:k]
+# ---- Plot (Plotly scatter) ---------------------------------------------------
+# We generate a static "all points" scatter once, then reuse it with highlights.
+import plotly.graph_objects as go
+def base_scatter():
+    if not coords:
+        return go.Figure().update_layout(
+            height=440, margin=dict(l=10, r=10, t=10, b=10),
+            paper_bgcolor="white", plot_bgcolor="white",
+        )
+    # unpack coordinates
+    xs, ys, tids = [], [], []
+    for tid_str, pt in coords.items():
+        xs.append(pt[0]); ys.append(pt[1]); tids.append(int(tid_str))
+    fig = go.Figure()
+    fig.add_trace(go.Scattergl(
+        x=xs, y=ys, mode="markers",
+        marker=dict(size=3, opacity=0.85),
+        text=[f"id {t}" for t in tids],
+        hoverinfo="skip",  # keep hover minimal; we’ll show neighbors explicitly
+    ))
+    fig.update_layout(
+        height=440, margin=dict(l=10, r=10, t=10, b=10),
+        paper_bgcolor="white", plot_bgcolor="white",
+        xaxis=dict(visible=False), yaxis=dict(visible=False),
+    )
+    return fig
+base_fig = base_scatter()
+fig_rx = solara.reactive(base_fig)
+def highlight(token_id: int):
+    """Return a new figure with neighbors + target highlighted."""
+    fig = base_fig.to_dict()  # detach copy
+    fig = go.Figure(fig)
+    if not coords or token_id not in ids_set:
+        return fig
+    # Target
+    tx, ty = coords[str(token_id)]
+    fig.add_trace(go.Scattergl(
+        x=[tx], y=[ty], mode="markers",
+        marker=dict(size=8, line=dict(width=1), symbol="circle"),
+        name="target",
+    ))
+    # Neighbors
+    nbrs = get_neighbor_list(token_id)
+    if nbrs:
+        nx = [coords[str(nid)][0] for nid, _ in nbrs]
+        ny = [coords[str(nid)][1] for nid, _ in nbrs]
+        fig.add_trace(go.Scattergl(
+            x=nx, y=ny, mode="markers",
+            marker=dict(size=6, symbol="circle-open"),
+            name="neighbors",
+        ))
+    fig.update_layout(showlegend=False)
+    return fig
+# ---- UI actions --------------------------------------------------------------
+def on_append_cell(column, row_index):
+    # append chosen next token to the text input
+    df = top10_rx.value
+    if row_index < len(df):
+        token_id = int(df.iloc[row_index]["next token ID"])
+        decoded = tokenizer.decode([token_id])
+        text_rx.set(text_rx.value + decoded)
+        selected_token_id_rx.set(token_id)
+        # Update plot
+        fig_rx.set(highlight(token_id))
+cell_actions = [solara.CellAction(icon="mdi-plus", name="Append & highlight", on_click=on_append_cell)]
+def on_predict():
+    df = predict_top10(text_rx.value)
+    top10_rx.set(df)
+    notice_rx.set("Click a candidate to append it and highlight its neighborhood.")
+    # also set selected to the top-1 for convenience
+    if len(df) > 0:
+        tid = int(df.iloc[0]["next token ID"])
+        selected_token_id_rx.set(tid)
+        fig_rx.set(highlight(tid))
+def on_show_neighborhood():
+    # take last token in the prompt (if any), otherwise do nothing
+    ids = tokenizer.encode(text_rx.value)
+    if ids:
+        token_id = int(ids[-1])
+        selected_token_id_rx.set(token_id)
+        fig_rx.set(highlight(token_id))
+# ---- Page --------------------------------------------------------------------
 @solara.component
 def Page():
+    solara.HTML(tag="div", unsafe_inner_html=theme_css.value)  # inject CSS theme
+    with solara.Column(margin=12, gap="16px"):
+        solara.Markdown("# Next-Token Predictor + Semantic Neighborhood")
+        solara.Markdown(
+            "Type text, then **Predict** to see the next-token distribution. "
+            "Click a candidate to append it and highlight its **semantic neighborhood**."
+        )
+        with solara.Row(gap="8px"):
+            solara.InputText("Enter text", value=text_rx, continuous_update=True, style={"minWidth": "520px"})
+            solara.Button("Predict", on_click=on_predict, classes=["btn-primary"])
+            solara.Button("Show neighborhood of last token", on_click=on_show_neighborhood)
+        solara.Markdown(f"*{notice_rx.value}*")
+        # Top-10 table
+        solara.Markdown("### Prediction")
+        solara.DataFrame(top10_rx.value, items_per_page=10, cell_actions=cell_actions)
+        # Neighborhood panel
+        solara.Markdown("### Semantic Neighborhood")
+        if not coords:
+            solara.Markdown("> Embedding map unavailable – add `assets/embeddings/*.json`.")
+        else:
+            solara.FigurePlotly(fig_rx.value)
 Page()