# LLMLPTopic / app.py — Hugging Face Space by kambris
# Last update: commit b160c3d (verified), "Update app.py"
import gradio as gr
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import os
import pandas as pd
def run_from_textfile(file):
if file is None:
return "Please upload a .txt file.", "", "", None
# ---- Handle file input ----
text = ""
if hasattr(file, 'decode'):
try:
text = file.decode("utf-8")
except Exception as e:
return f"Error decoding NamedString: {e}", "", "", None
elif hasattr(file, 'read'):
try:
text = file.read().decode("utf-8")
except Exception as e:
return f"Error reading/decoding file object: {e}", "", "", None
elif isinstance(file, str) and os.path.exists(file):
try:
with open(file, 'r', encoding='utf-8') as f:
text = f.read()
except Exception as e:
return f"Error reading file from path: {e}", "", "", None
if not text:
return "Could not read the file content. Please check the file type and content.", "", "", None
# Split the text into documents (one per line)
docs = [line.strip() for line in text.split("\n") if line.strip()]
if len(docs) < 3:
return "Need at least 3 documents (one per line).", "", "", None
# ---- Embedding Model ----
embedder = SentenceTransformer("all-MiniLM-L6-v2")
# ---- Topic Modeling ----
topic_model = BERTopic(embedding_model=embedder)
topics, probs = topic_model.fit_transform(docs)
# ---- Topic Summary ----
topic_info = topic_model.get_topic_info().to_string(index=False)
# ---- TOPIC WEIGHTS (Word Importance per Topic) ----
weights_output = "=" * 80 + "\n"
weights_output += "TOPIC WEIGHTS (Word Importance Scores)\n"
weights_output += "=" * 80 + "\n\n"
# Get all topics except outlier topic (-1)
all_topics = [t for t in topic_model.get_topics().keys() if t != -1]
for topic_id in all_topics:
weights_output += f"TOPIC {topic_id}\n"
weights_output += "-" * 40 + "\n"
# Get top words and their weights for this topic
topic_words = topic_model.get_topic(topic_id)
if topic_words:
for word, weight in topic_words[:10]: # Top 10 words
weights_output += f" {word:20s} {weight:8.4f}\n"
weights_output += "\n"
# ---- Document → Topic Assignments ----
assignments = "\n".join([f"Doc {i+1}: Topic {topics[i]}" for i in range(len(docs))])
# ---- Visualization ----
fig = topic_model.visualize_barchart(top_n_topics=10)
return topic_info, weights_output, assignments, fig
# ---- Gradio Interface ----
with gr.Blocks() as demo:
    # Header / usage instructions.
    gr.Markdown("# 🧠 Topic Modeling from TXT File (BERTopic)")
    gr.Markdown(
        "Upload a plain text (.txt) file. Each line should contain **one LLM response**.\n"
        "\nExample format:\n```\nResponse 1...\nResponse 2...\nResponse 3...\n```"
    )

    # Input controls.
    uploaded_file = gr.File(label="Upload .txt file")
    run_btn = gr.Button("Run Topic Modeling")

    # Output widgets, in the order run_from_textfile returns its values.
    overview_box = gr.Textbox(label="Topic Overview", lines=12)
    weights_box = gr.Textbox(label="📊 Topic Weights (Word Importance)", lines=20)
    assignments_box = gr.Textbox(label="Document → Topic Assignments", lines=12)
    chart = gr.Plot(label="Topic Visualization")

    # Wire the button to the topic-modeling pipeline.
    run_btn.click(
        run_from_textfile,
        inputs=uploaded_file,
        outputs=[overview_box, weights_box, assignments_box, chart],
    )

# Launch app
demo.launch()