somya-27-04-03 committed on
Commit
17404d5
·
verified ·
1 Parent(s): 0126981

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +144 -0
  2. requirements.txt +12 -0
app.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import re
3
+ import nltk
4
+ import pdfplumber
5
+ import docx
6
+ import textstat
7
+ from io import BytesIO
8
+ from newspaper import Article
9
+ from collections import Counter
10
+ from transformers import pipeline
11
+
12
# Fetch the NLTK 'punkt' resource at startup. quiet=True keeps the
# downloader's progress output out of the application log on every launch.
nltk.download('punkt', quiet=True)
13
+
14
from collections.abc import Mapping


class _LazyPipelines(Mapping):
    """Read-only mapping of display name -> pipeline, built on first access.

    Building every pipeline at import time loads all checkpoints before the
    first request, although one request uses a single summarizer and at most
    one QA model. This mapping keeps the dict-style interface
    (``mapping[name]``, ``mapping.keys()``, ``len``) but defers each
    ``pipeline(...)`` call until that entry is first looked up, then reuses it.

    NOTE(review): no locking is done here; two concurrent first lookups of the
    same name may each build the pipeline once. The last one stored wins.
    """

    def __init__(self, task, checkpoints):
        # task: the pipeline task string; checkpoints: display name -> model id.
        self._task = task
        self._checkpoints = dict(checkpoints)
        self._loaded = {}

    def __getitem__(self, name):
        if name not in self._loaded:
            # Unknown display names raise KeyError here, like a plain dict.
            checkpoint = self._checkpoints[name]
            self._loaded[name] = pipeline(self._task, model=checkpoint)
        return self._loaded[name]

    def __iter__(self):
        return iter(self._checkpoints)

    def __len__(self):
        return len(self._checkpoints)

    def __contains__(self, name):
        # Membership tests must not trigger a model load.
        return name in self._checkpoints


# Summarization models (loaded on first use)
summarizers = _LazyPipelines("summarization", {
    "T5 (t5-small)": "t5-small",
    "BART (bart-large-cnn)": "facebook/bart-large-cnn",
    "Pegasus (xsum)": "google/pegasus-xsum",
})

# QA models (loaded on first use)
qa_models = _LazyPipelines("question-answering", {
    "DistilBERT QA": "distilbert-base-uncased-distilled-squad",
    "BERT QA": "deepset/bert-base-cased-squad2",
})
26
+
27
+ # Utility functions
28
def extract_text_from_file(file):
    """Return the plain text of an uploaded .txt, .pdf or .docx file.

    Args:
        file: ``None``, a filesystem path string, or an object whose ``name``
            attribute is the path of the uploaded file.

    Returns:
        The extracted text, or ``""`` when ``file`` is ``None`` or the
        extension is not one of the supported three.
    """
    from pathlib import Path  # local import keeps this block self-contained

    if file is None:
        return ""

    # Work from the path rather than calling file.read(): the upload may be
    # handed over as a path string, which has no read() method.
    path = Path(getattr(file, "name", file))
    ext = path.suffix.lower().lstrip(".")  # case-insensitive: "REPORT.PDF" works

    if ext == "txt":
        # Undecodable bytes are replaced instead of aborting the request.
        return path.read_text(encoding="utf-8", errors="replace")
    if ext == "pdf":
        with pdfplumber.open(str(path)) as pdf:
            # Extract each page once; pages that yield no text are skipped.
            page_texts = (page.extract_text() for page in pdf.pages)
            return "\n".join(t for t in page_texts if t)
    if ext == "docx":
        doc = docx.Document(str(path))
        return "\n".join(p.text for p in doc.paragraphs)
    return ""
42
+
43
def fetch_url_text(url):
    """Download and parse the page at *url*; return the parsed article's ``text``."""
    page = Article(url)
    page.download()
    page.parse()
    return page.text
48
+
49
# Common English function words of four or more letters. Without this filter
# they outnumber content words and fill the keyword list.
_KEYWORD_STOPWORDS = frozenset(
    """
    about above after again against also because been before being below
    between both could does doing down during each from further have having
    here hers herself himself into itself just more most myself once only
    other ours ourselves over same should some such than that their theirs
    them themselves then there these they this those through under until very
    were what when where which while whom will with would your yours yourself
    yourselves
    """.split()
)


def get_keywords(text, n=5):
    """Return the *n* most frequent content words of *text*, joined by "; ".

    Words are lower-cased runs of at least four word characters. Entries in
    ``_KEYWORD_STOPWORDS`` are ignored. Ties keep first-occurrence order.
    Returns ``""`` when nothing qualifies.
    """
    words = re.findall(r'\b\w{4,}\b', text.lower())
    counts = Counter(word for word in words if word not in _KEYWORD_STOPWORDS)
    return "; ".join(word for word, _ in counts.most_common(n))
53
+
54
def _split_into_chunks(text, limit=1024):
    """Yield pieces of *text* of at most *limit* characters, broken on whitespace."""
    current, size = [], 0
    for word in text.split():
        # A single token longer than the limit is hard-split so that no chunk
        # ever exceeds it.
        while len(word) > limit:
            if current:
                yield " ".join(current)
                current, size = [], 0
            yield word[:limit]
            word = word[limit:]
        extra = len(word) + (1 if current else 0)  # +1 for the joining space
        if size + extra > limit:
            yield " ".join(current)
            current, size = [word], len(word)
        else:
            current.append(word)
            size += extra
    if current:
        yield " ".join(current)


def summarize_text(text, model_name, min_len, max_len, format_type):
    """Summarize *text* chunk by chunk with the selected summarizer.

    Args:
        text: Input text. Empty or whitespace-only text yields ``""``.
        model_name: Key into the module-level ``summarizers`` mapping.
        min_len: Requested ``min_length`` for each chunk summary.
        max_len: Requested ``max_length`` for each chunk summary.
        format_type: ``"Bullet Points"`` returns one "• "-prefixed line per
            sentence; any other value returns the summaries joined by spaces.

    Returns:
        The combined summary string.
    """
    # The limits may arrive as floats from the UI; coerce them and keep
    # min strictly below max so the generation constraints stay feasible.
    max_len = max(int(max_len), 2)
    min_len = max(0, min(int(min_len), max_len - 1))

    summarizer = summarizers[model_name]  # looked up once, not per chunk
    # Chunks are word-aligned so no word is cut in half at a boundary.
    summary_chunks = [
        summarizer(chunk, max_length=max_len, min_length=min_len, do_sample=False)[0]['summary_text']
        for chunk in _split_into_chunks(text, 1024)
    ]
    summary = " ".join(summary_chunks)
    if format_type == "Bullet Points":
        bullets = re.split(r'(?<=[.!?]) +', summary)
        return "\n".join(f"• {point}" for point in bullets if point.strip())
    return summary
65
+
66
def qa_answers(text, questions, model_name):
    """Answer each non-blank line of *questions* against *text*.

    Args:
        text: Context passed to the QA model.
        questions: One question per line. Blank lines are skipped.
        model_name: Key into the module-level ``qa_models`` mapping.

    Returns:
        One ``"<question>: <answer> (score: <x.xx>)"`` line per question,
        joined by newlines. Returns ``""`` when there are no questions.
    """
    model = qa_models[model_name]
    answers = []
    # splitlines() also handles "\r\n". Stripping keeps stray whitespace out
    # of both the model input and the rendered answer line.
    for line in questions.splitlines():
        question = line.strip()
        if not question:
            continue
        ans = model(question=question, context=text)
        answers.append(f"{question}: {ans['answer']} (score: {ans['score']:.2f})")
    return "\n".join(answers)
74
+
75
def get_metrics(original, summary):
    """Compute word counts, compression rate and readability for a summary.

    Args:
        original: The source text.
        summary: The generated summary, plain or bullet-formatted.

    Returns:
        A dict with keys ``'Input Word Count'``, ``'Summary Word Count'``,
        ``'Compression Rate (%)'`` (0 when the input has no words) and
        ``'Readability (Flesch)'`` (0 when the summary is empty).
    """
    input_words = len(original.split())
    # The "•" glyph added by bullet formatting is layout, not a word.
    summary_words = sum(1 for token in summary.split() if token != "•")

    if input_words:
        compression = round(100 - (summary_words / input_words * 100), 2)
    else:
        compression = 0  # avoid dividing by zero on empty input

    readability = textstat.flesch_reading_ease(summary) if summary else 0

    return {
        'Input Word Count': input_words,
        'Summary Word Count': summary_words,
        'Compression Rate (%)': compression,
        'Readability (Flesch)': readability,
    }
82
+
83
+ # Gradio main function
84
def process_text(input_text, file, url, summarizer_model, qa_model, min_tokens, max_tokens, format_type, questions):
    """Run the pipeline for one request: load text, summarize, extract keywords, answer questions.

    The input source is chosen in priority order: uploaded file, then URL,
    then pasted text.

    Returns:
        A 5-tuple ``(summary, keywords, answers, metrics_str, text)``. When no
        usable input is found, the first element is ``"No input provided."``
        and the rest are empty strings.

    Raises:
        gr.Error: If reading the file or fetching the URL fails.
    """
    url = (url or "").strip()  # a blank or whitespace-only URL counts as absent

    try:
        if file is not None:
            text = extract_text_from_file(file)
        elif url:
            text = fetch_url_text(url)
        else:
            text = input_text
    except Exception as exc:
        # UI boundary: re-raise as a Gradio error with a readable message,
        # keeping the original exception chained.
        raise gr.Error(f"Could not read the input: {exc}") from exc

    # Whitespace-only text would only produce an empty or invented summary.
    if not text or not text.strip():
        return "No input provided.", "", "", "", ""

    summary = summarize_text(text, summarizer_model, min_tokens, max_tokens, format_type)
    keywords = get_keywords(text)
    if questions and questions.strip():
        answers = qa_answers(text, questions, qa_model)
    else:
        answers = "No questions provided."
    metrics = get_metrics(text, summary)

    metrics_str = f"""
Input Word Count: {metrics['Input Word Count']}
Summary Word Count: {metrics['Summary Word Count']}
Compression Rate: {metrics['Compression Rate (%)']}%
Readability Score (Flesch): {metrics['Readability (Flesch)']}
"""

    return summary, keywords, answers, metrics_str, text
108
+
109
+ # Gradio interface
110
# Layout: three input sources, model pickers, length sliders, format and
# questions, then the five result boxes filled by process_text.
with gr.Blocks() as demo:
    gr.Markdown("# 📚 Advanced Text Summarizer & Q&A App\nUpload text/file/url, summarize, extract keywords, and ask questions.")

    # Input sources
    with gr.Row():
        input_text = gr.Textbox(label="Paste Text Here", placeholder="Enter text...", lines=6)
        file = gr.File(label="Upload File (.txt, .pdf, .docx)")
        url = gr.Textbox(label="URL", placeholder="https://...")

    # Model selection
    with gr.Row():
        summarizer_model = gr.Dropdown(
            choices=list(summarizers.keys()),
            value="BART (bart-large-cnn)",
            label="Summarizer Model",
        )
        qa_model = gr.Dropdown(
            choices=list(qa_models.keys()),
            value="DistilBERT QA",
            label="QA Model",
        )

    # Summary length limits
    with gr.Row():
        min_tokens = gr.Slider(5, 300, value=30, step=1, label="Min Tokens")
        max_tokens = gr.Slider(50, 1024, value=120, step=1, label="Max Tokens")

    format_type = gr.Radio(
        choices=['Paragraph', 'Bullet Points'],
        value='Paragraph',
        label="Output Format",
    )
    questions = gr.Textbox(label="Questions (one per line)", placeholder="Type questions...", lines=3)

    process_btn = gr.Button("Process")

    # Result boxes, in the order process_text returns its values
    summary_out = gr.Textbox(label="Summarized Text", lines=6)
    keywords_out = gr.Textbox(label="Top Keywords")
    answers_out = gr.Textbox(label="QA Answers", lines=4)
    metrics_out = gr.Textbox(label="Metrics")
    original_out = gr.Textbox(label="Original Text", lines=6)

    process_btn.click(
        fn=process_text,
        inputs=[
            input_text,
            file,
            url,
            summarizer_model,
            qa_model,
            min_tokens,
            max_tokens,
            format_type,
            questions,
        ],
        outputs=[
            summary_out,
            keywords_out,
            answers_out,
            metrics_out,
            original_out,
        ],
    )

if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ torch
3
+ transformers
4
+ nltk
5
+ beautifulsoup4
6
+ requests
7
+ textstat
8
+ PyPDF2
9
+ pdfplumber
10
+ python-docx
11
+ newspaper3k
12
+ lxml