frankai98 committed
Commit ac1df46 · verified · 1 Parent(s): c7d97a5

Update app.py

Files changed (1):
  app.py +38 -77
app.py CHANGED
@@ -1,5 +1,4 @@
 import os
-#os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 import nest_asyncio
 nest_asyncio.apply()
 import streamlit as st
@@ -46,11 +45,11 @@ def timer():
 </script>
 """
 
-st.set_page_config(page_title="Review Scorer & Report Generator", page_icon="📝")
-st.header("Review Scorer & Report Generator")
+st.set_page_config(page_title="Twitter/X Tweets Scorer & Report Generator", page_icon="📝")
+st.header("Twitter/X Tweets Scorer & Report Generator")
 
 # Concise introduction
-st.write("This model will score your reviews in your CSV file and generate a report based on your query and those results.")
+st.write("This app scores the sentiment of the tweets in your CSV file and generates a report that answers your query question from those results.")
 
 def print_gpu_status(label):
     if torch.cuda.is_available():
@@ -62,15 +61,14 @@ def print_gpu_status(label):
 @st.cache_resource
 def get_sentiment_model():
     return pipeline("text-classification",
-                    model="nlptown/bert-base-multilingual-uncased-sentiment",
+                    model="cardiffnlp/twitter-roberta-base-sentiment-latest",
                     device=0 if torch.cuda.is_available() else -1)
 
 @st.cache_resource
-def get_llama_model():
+def get_summary_model():
     return pipeline("text-generation",
-                    model="meta-llama/Llama-3.2-1B-Instruct",
-                    device=0,
-                    torch_dtype=torch.bfloat16)
+                    model="frankai98/T5FinetunedCommentSummary",
+                    device=0 if torch.cuda.is_available() else -1)
 
 # Function to clear GPU memory
 def clear_gpu_memory():
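Note on the checkpoint swap above: the old nlptown model returns star-rating labels ('1 star' through '5 stars'), while cardiffnlp/twitter-roberta-base-sentiment-latest returns lowercase negative/neutral/positive labels with confidence scores. A minimal sketch of the expected output shape, assuming the behavior documented on the cardiffnlp model card:

```python
from transformers import pipeline

# Sketch only: label set and output shape assumed from the model card.
sentiment = pipeline("text-classification",
                     model="cardiffnlp/twitter-roberta-base-sentiment-latest")

print(sentiment("I love the new update!"))
# e.g. [{'label': 'positive', 'score': 0.98}]
# (labels: 'negative' / 'neutral' / 'positive', score = confidence)
```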
@@ -80,21 +78,21 @@ def clear_gpu_memory():
     torch.cuda.empty_cache()
     torch.cuda.ipc_collect()
 
-# Let the user specify the column name for review text (defaulting to "reviewText")
-review_column = st.text_input("Enter the column name for review text:", value="reviewText")
+# Let the user specify the column name for tweet text (defaulting to "content")
+tweets_column = st.text_input("Enter the column name for tweets:", value="content")
 
-# Input: Query text for scoring and CSV file upload for candidate reviews
+# Input: query question for scoring and CSV file upload for candidate tweets
 query_input = st.text_area("Enter your query question for analysis (this does not need to be part of the CSV):")
-uploaded_file = st.file_uploader("Upload Reviews CSV File (must contain a 'reviewText' column)", type=["csv"])
+uploaded_file = st.file_uploader(f"Upload Tweets CSV File (must contain a '{tweets_column}' column)", type=["csv"])
 
 candidate_docs = []
 if uploaded_file is not None:
     try:
         df = pd.read_csv(uploaded_file)
-        if review_column not in df.columns:
-            st.error(f"CSV must contain a '{review_column}' column.")
+        if tweets_column not in df.columns:
+            st.error(f"CSV must contain a '{tweets_column}' column.")
         else:
-            candidate_docs = df[review_column].dropna().astype(str).tolist()
+            candidate_docs = df[tweets_column].dropna().astype(str).tolist()
     except Exception as e:
         st.error(f"Error reading CSV file: {e}")
@@ -104,12 +102,12 @@ if st.button("Generate Report"):
     st.session_state.timer_frozen = False
     if uploaded_file is None:
         st.error("Please upload a CSV file.")
-    elif not review_column.strip():
+    elif not tweets_column.strip():
        st.error("Please enter your column name")
     elif not candidate_docs:
-        st.error(f"CSV must contain a '{review_column}' column.")
+        st.error(f"CSV must contain a '{tweets_column}' column.")
     elif not query_input.strip():
-        st.error("Please enter a query text!")
+        st.error("Please enter a query question!")
     else:
         if not st.session_state.timer_started and not st.session_state.timer_frozen:
             st.session_state.timer_started = True
@@ -133,7 +131,7 @@ if st.button("Generate Report"):
             # If we have documents to summarize, load Llama model first
             if docs_to_summarize:
                 status_text.markdown("**📝 Loading summarization model...**")
-                llama_pipe = get_llama_model()
+                t5_pipe = get_summary_model()
 
                 status_text.markdown("**📝 Summarizing long documents...**")
@@ -142,17 +140,17 @@ if st.button("Generate Report"):
                     progress = int((idx / len(docs_to_summarize)) * 25)  # First quarter of progress
                     progress_bar.progress(progress)
 
-                    summary_prompt = [
-                        {"role": "user", "content": f"Summarize the following text into a shorter version that preserves the sentiment and key points: {doc[:2000]}..."}
-                    ]
+                    input_text = "summarize: " + doc
 
                     try:
-                        summary_result = llama_pipe(
-                            summary_prompt,
-                            max_new_tokens=30,
-                            do_sample=True,
-                            temperature=0.3,
-                            return_full_text=False
+                        summary_result = t5_pipe(
+                            input_text,
+                            max_length=128,
+                            min_length=10,
+                            no_repeat_ngram_size=2,
+                            num_beams=4,
+                            early_stopping=True,
+                            truncation=True
                         )
 
                         # Store the summary in place of the original text
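The "summarize: " prefix added above follows the T5 input convention. If frankai98/T5FinetunedCommentSummary is a T5-style seq2seq checkpoint, as its name suggests, the matching pipeline task would be "text2text-generation" (or "summarization") rather than "text-generation", which expects a causal LM. A minimal standalone sketch under that assumption:

```python
from transformers import pipeline

# Sketch only: assumes the checkpoint is a T5 seq2seq model, hence the
# "text2text-generation" task instead of the "text-generation" task used
# in the commit.
summarizer = pipeline("text2text-generation",
                      model="frankai98/T5FinetunedCommentSummary")

doc = "Long tweet thread text to be condensed..."
summary = summarizer("summarize: " + doc,
                     max_length=128, min_length=10,
                     no_repeat_ngram_size=2, num_beams=4,
                     early_stopping=True, truncation=True)
print(summary[0]["generated_text"])
```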
@@ -162,7 +160,7 @@ if st.button("Generate Report"):
                         st.warning(f"Error summarizing document {i}: {str(e)}")
 
                 # Clear Llama model from memory
-                del llama_pipe
+                del t5_pipe
                 import gc
                 gc.collect()
                 torch.cuda.empty_cache()
@@ -194,7 +192,7 @@ if st.button("Generate Report"):
                 except Exception as e:
                     st.warning(f"Error scoring document {i}: {str(e)}")
                     processed_docs.append("Error processing this document")
-                    scored_results.append({"label": "NEUTRAL", "score": 0.5})
+                    scored_results.append({"label": "NEUTRAL", "score": 1})
 
                 # Display occasional status updates
                 if i % max(1, len(candidate_docs) // 10) == 0:
@@ -227,25 +225,25 @@ if st.button("Generate Report"):
             progress_bar.progress(75)
 
             import random
-            max_reviews = 50
-            if len(scored_docs) > max_reviews:
-                sampled_docs = random.sample(scored_docs, max_reviews)
-                st.info(f"Sampling {max_reviews} out of {len(scored_docs)} reviews for report generation")
+            max_tweets = 50
+            if len(scored_docs) > max_tweets:
+                sampled_docs = random.sample(scored_docs, max_tweets)
+                st.info(f"Sampling {max_tweets} out of {len(scored_docs)} tweets for report generation")
             else:
                 sampled_docs = scored_docs
 
             # Build prompt
             messages = [
                 {"role": "user", "content": f"""
-Generate a well-structured report based on Reviews with sentiment that answers Query Question and meets Requirements.
+Generate a well-structured business report based on tweets from Twitter/X with sentiment scores (0: negative, 1: neutral, 2: positive) that answers the Query Question and meets the following Requirements.
 **Requirements:**
-- Include an introduction, key insights, and a conclusion, all within the word limit.
+- Include an introduction, key insights, and a conclusion.
 - Ensure the analysis is concise and does not cut off abruptly.
 - Summarize major findings without repeating verbatim.
 - Cover both positive and negative aspects, highlighting trends in user sentiment.
 **Query Question:**
 "{query_input}"
-**Reviews with sentiment:**
+**Tweets with sentiment score:**
 {sampled_docs}
 Please ensure the report is complete and reaches approximately 1000 words.
 """}
@@ -285,45 +283,8 @@ Please ensure the report is complete and reaches approximately 1000 words.
             raw_result, error = process_with_gemma(messages)
 
             if error:
-                # If Gemma processing failed, try fallback to Llama
-                st.warning(f"Gemma processing failed: {error}")
-                status_text.markdown("**📝 Trying fallback model...**")
-
-                try:
-                    # Use Llama instead since it worked earlier
-                    llama_pipe = get_llama_model()
-
-                    # Simplify prompt for fallback
-                    fallback_prompt = [
-                        {"role": "user", "content": f"""
-Generate a well-structured, approximately 1000-word report based on Reviews with sentiment that answers Query Question and meets Requirements.
-**Requirements:**
-- Include an introduction, key insights, and a conclusion, all within the word limit.
-- Ensure the analysis is concise and does not cut off abruptly.
-- Summarize major findings without repeating verbatim.
-- Cover both positive and negative aspects, highlighting trends in user sentiment.
-**Query Question:**
-"{query_input}"
-**Reviews with sentiment:**
-{sampled_docs}
-"""}
-                    ]
-
-                    raw_result = llama_pipe(
-                        fallback_prompt,
-                        max_new_tokens=200,
-                        repetition_penalty=1.2,
-                        do_sample=True,
-                        temperature=0.7,
-                        return_full_text=False
-                    )
-
-                    # Format Llama result to match expected structure
-                    report = raw_result[0]['generated_text']
-
-                except Exception as e:
-                    st.error(f"Fallback also failed: {str(e)}")
-                    report = "Error generating report. Please try again with fewer reviews."
+                st.error(f"Gemma processing failed: {error}")
+                report = "Error generating report. Please try again with fewer tweets."
             else:
                 # Extract content from successful Gemma result
                 report = raw_result[0]['generated_text']
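The simplified error branch relies on the (raw_result, error) contract visible at the call site. A hypothetical sketch of that contract; the real process_with_gemma is not shown in this diff, and its signature and generation arguments here are assumptions:

```python
# Hypothetical sketch: assumed contract is (raw_result, None) on success or
# (None, message) on failure; gemma_pipe is a transformers text-generation
# pipeline supplied by the caller, max_new_tokens is an assumed default.
def process_with_gemma(messages, gemma_pipe, max_new_tokens=1500):
    try:
        raw_result = gemma_pipe(messages,
                                max_new_tokens=max_new_tokens,
                                return_full_text=False)
        return raw_result, None
    except Exception as exc:
        return None, str(exc)
```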
 