Update app.py
app.py CHANGED
@@ -7,6 +7,7 @@ from huggingface_hub import login
 from streamlit.components.v1 import html
 import pandas as pd
 import torch
+import random
 
 # Retrieve the token from environment variables
 hf_token = os.environ.get("HF_TOKEN")
@@ -151,10 +152,22 @@ else:
 
     # Stage 1: Score candidate documents using the provided query.
     status_text.markdown("**π Scoring candidate documents...**")
-    progress_bar.progress(33)
 
-
-    scored_results = score_pipe(candidate_docs)
+    # Process each review individually to avoid memory issues
+    scored_results = []
+    for i, doc in enumerate(candidate_docs):
+        # Update progress based on current document
+        progress = int((i / len(candidate_docs)) * 50) # First half of progress bar (0-50%)
+        progress_bar.progress(progress)
+
+        # Process single document
+        result = score_pipe([doc])[0]
+        scored_results.append(result)
+
+        # Display occasional status updates for large datasets
+        if i % max(1, len(candidate_docs) // 10) == 0:
+            status_text.markdown(f"**π Scoring documents... ({i}/{len(candidate_docs)})**")
+
     # Pair each review with its score assuming the output order matches the input order.
     scored_docs = list(zip(candidate_docs, [result["score"] for result in scored_results]))
 
@@ -162,6 +175,12 @@ else:
 
     # Stage 2: Generate Report using Gemma in the new messages format.
     status_text.markdown("**π Generating report with Gemma...**")
+
+    # For very large datasets, summarize or sample the scored_docs before sending to Gemma
+    sampled_docs = scored_docs
+    if len(scored_docs) > 10000: # Arbitrary threshold for what's "too large"
+        # Option 1: Random sampling
+        sampled_docs = random.sample(scored_docs, 1000)
 
     # Build the user content with query, sentiment results, and original review data.
     # Format the prompt as chat for Gemma
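The Stage 1 change above swaps one batch call to score_pipe for a per-review loop to keep memory in check. For comparison, here is a minimal sketch of the other common way to bound memory, assuming score_pipe is a standard transformers text-classification pipeline as the diff suggests (the model name and sample reviews below are placeholders, not taken from this Space): the pipeline's batch_size argument caps how many documents go through the model per forward pass, at the cost of the loop's per-document progress updates.

from transformers import pipeline

# Placeholder pipeline and reviews; the Space builds its own score_pipe and candidate_docs.
score_pipe = pipeline(
    "text-classification",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)
candidate_docs = [
    "Great battery life.",
    "Screen cracked after a week.",
    "Works exactly as advertised.",
]

# batch_size bounds peak memory per forward pass while still batching under the hood.
scored_results = score_pipe(candidate_docs, batch_size=32, truncation=True)
scored_docs = list(zip(candidate_docs, [r["score"] for r in scored_results]))
print(scored_docs)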
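The loop's status-update condition, i % max(1, len(candidate_docs) // 10) == 0, does two things at once: the max(1, ...) guard keeps the step from collapsing to zero on corpora smaller than ten reviews, and a step of one tenth of the corpus means the message refreshes roughly ten times regardless of corpus size. A quick self-contained check with made-up document counts:

# How often the diff's throttling condition fires, for a few hypothetical corpus sizes.
for n in (7, 100, 12345):
    step = max(1, n // 10)  # the guard keeps step >= 1 for tiny corpora
    updates = sum(1 for i in range(n) if i % step == 0)
    print(f"{n} docs -> {updates} status updates")
# 7 docs -> 7 status updates (step collapses to 1, so every review reports)
# 100 docs -> 10 status updates
# 12345 docs -> 11 status updates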
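The Stage 2 comment labels random.sample as "Option 1", leaving the other options implicit. One natural alternative, sketched below under the assumption that scored_docs keeps the (document, score) shape built in Stage 1 (the data here is made up), is to keep the highest-scoring reviews instead, so the subset sent to Gemma is biased toward the strongest matches rather than being uniform:

# Hypothetical (document, score) pairs in the shape Stage 1 produces.
scored_docs = [
    ("Great battery life.", 0.98),
    ("It's okay, I guess.", 0.55),
    ("Broke within a week.", 0.91),
]

# "Option 2": top-k by score rather than uniform random sampling.
k = 2  # the Space uses 1000; 2 keeps this demo self-contained
sampled_docs = sorted(scored_docs, key=lambda pair: pair[1], reverse=True)[:k]
print(sampled_docs)  # [('Great battery life.', 0.98), ('Broke within a week.', 0.91)]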