# LLMLPTopic / app.py — Hugging Face Space by kambris
# Last update: commit b160c3d (verified), "Update app.py"
import gradio as gr
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import os
import pandas as pd
def run_from_textfile(file):
if file is None:
return "Please upload a .txt file.", "", "", None
# ---- Handle file input ----
text = ""
if hasattr(file, 'decode'):
try:
text = file.decode("utf-8")
except Exception as e:
return f"Error decoding NamedString: {e}", "", "", None
elif hasattr(file, 'read'):
try:
text = file.read().decode("utf-8")
except Exception as e:
return f"Error reading/decoding file object: {e}", "", "", None
elif isinstance(file, str) and os.path.exists(file):
try:
with open(file, 'r', encoding='utf-8') as f:
text = f.read()
except Exception as e:
return f"Error reading file from path: {e}", "", "", None
if not text:
return "Could not read the file content. Please check the file type and content.", "", "", None
# Split the text into documents (one per line)
docs = [line.strip() for line in text.split("\n") if line.strip()]
if len(docs) < 3:
return "Need at least 3 documents (one per line).", "", "", None
# ---- Embedding Model ----
embedder = SentenceTransformer("all-MiniLM-L6-v2")
# ---- Topic Modeling ----
topic_model = BERTopic(embedding_model=embedder)
topics, probs = topic_model.fit_transform(docs)
# ---- Topic Summary ----
topic_info = topic_model.get_topic_info().to_string(index=False)
# ---- TOPIC WEIGHTS (Word Importance per Topic) ----
weights_output = "=" * 80 + "\n"
weights_output += "TOPIC WEIGHTS (Word Importance Scores)\n"
weights_output += "=" * 80 + "\n\n"
# Get all topics except outlier topic (-1)
all_topics = [t for t in topic_model.get_topics().keys() if t != -1]
for topic_id in all_topics:
weights_output += f"TOPIC {topic_id}\n"
weights_output += "-" * 40 + "\n"
# Get top words and their weights for this topic
topic_words = topic_model.get_topic(topic_id)
if topic_words:
for word, weight in topic_words[:10]: # Top 10 words
weights_output += f" {word:20s} {weight:8.4f}\n"
weights_output += "\n"
# ---- Document → Topic Assignments ----
assignments = "\n".join([f"Doc {i+1}: Topic {topics[i]}" for i in range(len(docs))])
# ---- Visualization ----
fig = topic_model.visualize_barchart(top_n_topics=10)
return topic_info, weights_output, assignments, fig
# ---- Gradio Interface ----
with gr.Blocks() as demo:
    # Header / usage instructions.
    gr.Markdown("# 🧠 Topic Modeling from TXT File (BERTopic)")
    gr.Markdown(
        "Upload a plain text (.txt) file. Each line should contain **one LLM response**.\n"
        "\nExample format:\n```\nResponse 1...\nResponse 2...\nResponse 3...\n```"
    )

    # Input controls.
    uploaded_file = gr.File(label="Upload .txt file")
    run_btn = gr.Button("Run Topic Modeling")

    # Output widgets, in the order run_from_textfile returns its values.
    overview_box = gr.Textbox(label="Topic Overview", lines=12)
    weights_box = gr.Textbox(label="📊 Topic Weights (Word Importance)", lines=20)
    assignments_box = gr.Textbox(label="Document → Topic Assignments", lines=12)
    chart = gr.Plot(label="Topic Visualization")

    # Wire the button to the topic-modeling pipeline.
    run_btn.click(
        run_from_textfile,
        inputs=uploaded_file,
        outputs=[overview_box, weights_box, assignments_box, chart],
    )

# Launch app
demo.launch()