Upload 2 files
Browse files- app.py +144 -0
- requirements.txt +12 -0
app.py
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import re
|
| 3 |
+
import nltk
|
| 4 |
+
import pdfplumber
|
| 5 |
+
import docx
|
| 6 |
+
import textstat
|
| 7 |
+
from io import BytesIO
|
| 8 |
+
from newspaper import Article
|
| 9 |
+
from collections import Counter
|
| 10 |
+
from transformers import pipeline
|
| 11 |
+
|
| 12 |
+
nltk.download('punkt')  # sentence-tokenizer data; NOTE(review): nltk looks otherwise unused in this file — confirm before removing
|
| 13 |
+
|
| 14 |
+
# Load summarization models
# Keys are the labels shown in the UI dropdown; values are HF pipelines.
# NOTE(review): all three checkpoints are downloaded and loaded eagerly at
# import time, which makes startup slow and memory-heavy — presumably
# acceptable for a demo Space; lazy loading would cut startup cost.
summarizers = {
    "T5 (t5-small)": pipeline("summarization", model="t5-small"),
    "BART (bart-large-cnn)": pipeline("summarization", model="facebook/bart-large-cnn"),
    "Pegasus (xsum)": pipeline("summarization", model="google/pegasus-xsum")
}
|
| 20 |
+
|
| 21 |
+
# Load QA models
# Extractive question-answering pipelines; keys are the labels shown in the
# UI dropdown and must match the choices wired into the Gradio Dropdown.
qa_models = {
    "DistilBERT QA": pipeline("question-answering", model="distilbert-base-uncased-distilled-squad"),
    "BERT QA": pipeline("question-answering", model="deepset/bert-base-cased-squad2")
}
|
| 26 |
+
|
| 27 |
+
# Utility functions
|
| 28 |
+
def extract_text_from_file(file):
    """Extract plain text from an uploaded .txt, .pdf, or .docx file.

    Parameters
    ----------
    file : file-like object or None
        Upload handle (e.g. from gr.File); must expose ``.name`` and be
        readable/openable. May be None when nothing was uploaded.

    Returns
    -------
    str
        The extracted text, or "" when the file is missing or its
        extension is unsupported.
    """
    if file is None:
        return ""
    name = file.name
    # Lowercase the extension so "Report.TXT" / "Paper.PDF" are recognised
    # too — the original comparison was case-sensitive and silently
    # returned "" for uppercase extensions.
    ext = name.rsplit('.', 1)[-1].lower()
    if ext == 'txt':
        # errors="replace" keeps partial content instead of raising
        # UnicodeDecodeError on text files that are not valid UTF-8.
        return file.read().decode('utf-8', errors='replace')
    elif ext == 'pdf':
        with pdfplumber.open(file) as pdf:
            # Skip pages where extraction yields nothing (scanned pages, etc.)
            return "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())
    elif ext == 'docx':
        doc = docx.Document(file)
        return "\n".join(p.text for p in doc.paragraphs)
    return ""
|
| 42 |
+
|
| 43 |
+
def fetch_url_text(url):
    """Download *url* and return its main article body via newspaper3k.

    Raises whatever ``Article.download``/``Article.parse`` raise on
    network or parse failure — callers are expected to handle errors.
    """
    page = Article(url)
    page.download()
    page.parse()
    return page.text
|
| 48 |
+
|
| 49 |
+
def get_keywords(text, n=5):
    """Return the *n* most frequent words of length >= 4, joined by '; '.

    Matching is case-insensitive; counting uses word frequency only
    (ties break by first occurrence, as Counter.most_common does).
    """
    token_counts = Counter(re.findall(r'\b\w{4,}\b', text.lower()))
    top_tokens = [token for token, _count in token_counts.most_common(n)]
    return "; ".join(top_tokens)
|
| 53 |
+
|
| 54 |
+
def summarize_text(text, model_name, min_len, max_len, format_type):
    """Summarize *text* with the chosen model, chunking long inputs.

    Parameters
    ----------
    text : str
        Input document (any length; processed in ~1024-character chunks).
    model_name : str
        Key into the module-level ``summarizers`` dict.
    min_len, max_len : int
        Length bounds forwarded to the HF summarization pipeline.
    format_type : str
        "Bullet Points" renders one sentence per bullet; anything else
        returns a plain paragraph.

    Returns
    -------
    str
        Concatenated chunk summaries, optionally bullet-formatted.
    """
    summarizer = summarizers[model_name]  # hoist dict lookup out of the loop
    summary_chunks = []
    start = 0
    while start < len(text):
        end = start + 1024
        # Fix: the original sliced at fixed 1024-char offsets, which could
        # cut a word (or sentence) in half at every chunk boundary and
        # degrade summary quality. Back up to the last whitespace inside
        # the window; fall back to a hard cut when there is none.
        if end < len(text):
            boundary = text.rfind(" ", start, end)
            if boundary > start:
                end = boundary
        chunk = text[start:end]
        result = summarizer(chunk, max_length=max_len, min_length=min_len, do_sample=False)[0]['summary_text']
        summary_chunks.append(result)
        start = end
    summary = " ".join(summary_chunks)
    if format_type == "Bullet Points":
        # Split on sentence-ending punctuation followed by spaces.
        bullets = re.split(r'(?<=[.!?]) +', summary)
        return "\n".join(f"• {point}" for point in bullets if point.strip())
    return summary
|
| 65 |
+
|
| 66 |
+
def qa_answers(text, questions, model_name):
    """Answer newline-separated *questions* against *text*.

    Uses the extractive QA pipeline selected by *model_name* (a key of the
    module-level ``qa_models`` dict). Blank lines are skipped. Returns one
    "question: answer (score: x.xx)" line per question.
    """
    qa = qa_models[model_name]
    lines = []
    for question in questions.split('\n'):
        if not question.strip():
            continue
        result = qa(question=question, context=text)
        lines.append(f"{question}: {result['answer']} (score: {result['score']:.2f})")
    return "\n".join(lines)
|
| 74 |
+
|
| 75 |
+
def get_metrics(original, summary):
    """Return word counts, compression rate, and Flesch readability for a summary."""
    n_in = len(original.split())
    n_out = len(summary.split())
    # Guard against division by zero when the input is empty.
    rate = round(100 - (n_out / n_in * 100), 2) if n_in else 0
    return {
        'Input Word Count': n_in,
        'Summary Word Count': n_out,
        'Compression Rate (%)': rate,
        'Readability (Flesch)': textstat.flesch_reading_ease(summary) if summary else 0,
    }
|
| 82 |
+
|
| 83 |
+
# Gradio main function
|
| 84 |
+
def process_text(input_text, file, url, summarizer_model, qa_model, min_tokens, max_tokens, format_type, questions):
    """Resolve the input source, then summarize, extract keywords, and answer questions.

    Input priority: uploaded file > URL > pasted text.

    Returns a 5-tuple matched to the Gradio outputs:
    (summary, keywords, answers, metrics_str, original_text).
    """
    if file is not None:
        text = extract_text_from_file(file)
    elif url:
        # Fix: a bad/unreachable URL made fetch_url_text raise and crash
        # the whole handler; surface the failure as a message instead.
        try:
            text = fetch_url_text(url)
        except Exception as exc:
            return f"Failed to fetch URL: {exc}", "", "", "", ""
    else:
        text = input_text

    # Treat None and whitespace-only input as "no input".
    if not text or not text.strip():
        return "No input provided.", "", "", "", ""

    summary = summarize_text(text, summarizer_model, min_tokens, max_tokens, format_type)
    keywords = get_keywords(text)
    answers = qa_answers(text, questions, qa_model) if questions else "No questions provided."
    metrics = get_metrics(text, summary)

    metrics_str = f"""
Input Word Count: {metrics['Input Word Count']}
Summary Word Count: {metrics['Summary Word Count']}
Compression Rate: {metrics['Compression Rate (%)']}%
Readability Score (Flesch): {metrics['Readability (Flesch)']}
"""

    return summary, keywords, answers, metrics_str, text
|
| 108 |
+
|
| 109 |
+
# Gradio interface
|
| 110 |
+
with gr.Blocks() as demo:
    gr.Markdown("# 📚 Advanced Text Summarizer & Q&A App\nUpload text/file/url, summarize, extract keywords, and ask questions.")

    # Input sources: pasted text, an uploaded document, or a URL to scrape.
    # process_text prefers file > url > pasted text when several are given.
    with gr.Row():
        input_text = gr.Textbox(label="Paste Text Here", placeholder="Enter text...", lines=6)
        file = gr.File(label="Upload File (.txt, .pdf, .docx)")
        url = gr.Textbox(label="URL", placeholder="https://...")

    # Model selectors — choices must stay in sync with the keys of the
    # module-level summarizers / qa_models dicts.
    with gr.Row():
        summarizer_model = gr.Dropdown(choices=list(summarizers.keys()), value="BART (bart-large-cnn)", label="Summarizer Model")
        qa_model = gr.Dropdown(choices=list(qa_models.keys()), value="DistilBERT QA", label="QA Model")

    # Summary length bounds forwarded to the HF summarization pipeline.
    with gr.Row():
        min_tokens = gr.Slider(5, 300, value=30, step=1, label="Min Tokens")
        max_tokens = gr.Slider(50, 1024, value=120, step=1, label="Max Tokens")

    format_type = gr.Radio(choices=['Paragraph', 'Bullet Points'], value='Paragraph', label="Output Format")
    questions = gr.Textbox(label="Questions (one per line)", placeholder="Type questions...", lines=3)

    process_btn = gr.Button("Process")

    # Output panels — order matches the 5-tuple returned by process_text.
    summary_out = gr.Textbox(label="Summarized Text", lines=6)
    keywords_out = gr.Textbox(label="Top Keywords")
    answers_out = gr.Textbox(label="QA Answers", lines=4)
    metrics_out = gr.Textbox(label="Metrics")
    original_out = gr.Textbox(label="Original Text", lines=6)

    process_btn.click(
        fn=process_text,
        inputs=[input_text, file, url, summarizer_model, qa_model, min_tokens, max_tokens, format_type, questions],
        outputs=[summary_out, keywords_out, answers_out, metrics_out, original_out]
    )
|
| 142 |
+
|
| 143 |
+
# Launch the Gradio server only when run as a script (not on import).
if __name__ == "__main__":
    demo.launch()
|
requirements.txt
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio
|
| 2 |
+
torch
|
| 3 |
+
transformers
|
| 4 |
+
nltk
|
| 5 |
+
beautifulsoup4
|
| 6 |
+
requests
|
| 7 |
+
textstat
|
| 8 |
+
PyPDF2
|
| 9 |
+
pdfplumber
|
| 10 |
+
python-docx
|
| 11 |
+
newspaper3k
|
| 12 |
+
lxml
|