Fuad04 commited on
Commit
ca5a716
·
verified ·
1 Parent(s): bc80279

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +276 -0
  2. requirements.txt +13 -0
app.py ADDED
@@ -0,0 +1,276 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ from transformers import pipeline
5
+ from fpdf import FPDF
6
+ import pandas as pd
7
+ import torch
8
+ from transformers import pipeline, AutoTokenizer, AutoModel
9
+ from tqdm import tqdm
10
+ from concurrent.futures import ProcessPoolExecutor
11
+ from summarizer import Summarizer
12
+ import os
13
+ import re
14
+
15
def parse_html_file(file_path):
    """Read an HTML file from disk and return its BeautifulSoup parse tree.

    Returns None (after printing the error) if the file cannot be read or
    parsed, so callers must check before dereferencing the result.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as handle:
            markup = handle.read()
        return BeautifulSoup(markup, "html.parser")
    except Exception as exc:
        # Best-effort: report and signal failure rather than propagate.
        print(f"An error occurred: {exc}")
        return None
24
+
25
def scrape_amazon_product(url):
    """Download an Amazon product page, extract product fields and reviews,
    persist them to Pro.csv, and return them as a dict.

    Returns None when the page cannot be downloaded or parsed (the original
    code fell through and crashed with AttributeError on a None soup, or
    silently parsed a stale temp.html from a previous run).
    """
    HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36', 'Accept-Language': 'en-US, en;q=0.5'}
    try:
        # fix: bound the request so a hung connection cannot block the UI forever
        response = requests.get(url, headers=HEADERS, timeout=30)
        if response.status_code == 200:
            with open("temp.html", 'wb') as file:
                file.write(response.content)
        else:
            print(f"Failed to download HTML. Status code: {response.status_code}")
            return None  # fix: do not parse a stale temp.html on failure
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

    current_directory = os.getcwd()
    file_name = "temp.html"
    file_path = os.path.join(current_directory, file_name)
    # Kept for compatibility: other code may read this module-level path.
    global global_file_path
    global_file_path = file_path

    soup = parse_html_file(file_path)
    if soup is None:
        # fix: parse_html_file returns None on error; previously this crashed
        # with AttributeError on soup.find below.
        return None

    product_name_element = soup.find('span', {'id': 'productTitle'})
    product_name = product_name_element.text.strip() if product_name_element else None

    categories = soup.find_all('a', {'class': 'a-link-normal a-color-tertiary'})
    category = categories[-1].text.strip() if categories else None

    product_description_element = soup.find('div', {'id': 'productDescription'})
    product_description = product_description_element.text.strip() if product_description_element else None

    ratings_element = soup.find('span', {'class': 'a-icon-alt'})
    ratings = ratings_element.text.strip() if ratings_element else None

    reviews = []
    review_elements = soup.find_all('div', {'class': 'a-section review aok-relative'})
    for review_element in review_elements:
        body = review_element.find('span', {'data-hook': 'review-body'})
        # fix: guard against a review card without a body element
        if body is not None:
            reviews.append(body.text.strip())

    prodata = {
        'product_name': product_name,
        'Category': category,
        'product_description': product_description,
        'Reviews': reviews,
        'Ratings': ratings
    }
    # The Reviews list drives the row count; scalar fields are broadcast.
    if reviews:
        df = pd.DataFrame(prodata)
    else:
        # fix: an all-scalar dict raises "If using all scalar values, you must
        # pass an index" — wrap in a list to emit a single row instead.
        df = pd.DataFrame([prodata])

    df.to_csv("Pro.csv", index=False)

    return prodata
77
+
78
# Module-level NLP resources, loaded once at import time (each load downloads
# the model on first run, so startup can be slow).
summarizer = Summarizer()  # bert-extractive-summarizer; used by summarize_single_review
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModel.from_pretrained("distilbert-base-uncased")
81
+
82
def chunk_text(text, max_chunk_size=512):
    """Split *text* into whitespace-delimited chunks of at most roughly
    max_chunk_size characters (a single word longer than the limit becomes
    its own oversized chunk).

    Returns a list of stripped, non-empty chunk strings; an empty/whitespace
    text yields [].
    """
    chunks = []
    current_chunk = ""
    for word in text.split():
        if len(current_chunk) + len(word) <= max_chunk_size:
            current_chunk += word + " "
        else:
            # fix: when the very first word exceeds the limit, current_chunk
            # is still "" and the original appended an empty-string chunk.
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = word + " "
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks
95
+
96
+
97
def summarize_single_review(review):
    """Summarize a review given as a list of text chunks.

    Returns (summary, recommendation) where summary is the extractive
    summary string from the module-level Summarizer and recommendation is
    always "" (kept for interface compatibility with callers that unpack
    two values).

    Note: the original implementation ran a sentiment pipeline per call and
    branched on the result, but both branches were byte-identical; it also
    computed DistilBERT embeddings under no_grad and discarded them. Both
    dead paths are removed — the returned values are unchanged.
    """
    concatenated_review = ' '.join(review)
    summary = summarizer(concatenated_review, min_length=50, max_length=150)
    recommendation = ""
    return summary, recommendation
119
+
120
+
121
def parallelize_summarization_async(reviews, num_cores):
    """Summarize every review in parallel across *num_cores* worker processes.

    Each review is chunked first, then submitted to the pool; results are
    collected in submission order with a tqdm progress bar. Returns a list
    of (summary, recommendation) tuples, one per input review.
    """
    with ProcessPoolExecutor(max_workers=num_cores) as executor:
        futures = [
            executor.submit(summarize_single_review, chunk_text(review, max_chunk_size=512))
            for review in reviews
        ]
        return [future.result() for future in tqdm(futures, total=len(futures))]
133
+
134
+
135
def CalcReviews(reviews):
    """Classify reviews as positive/negative, summarize each group, write the
    aggregate to Rev.csv, and return the aggregate dict.

    Each value in the returned dict is a single-element list so the dict can
    feed pd.DataFrame directly.
    """
    model_name = "bhadresh-savani/distilbert-base-uncased-sentiment-sst2"

    classifier = pipeline("sentiment-analysis", model=model_name)

    positive_reviews = []
    negative_reviews = []

    for review in reviews:
        # fix: truncate inputs longer than the model's maximum sequence
        # length — without this, long reviews crash the classifier.
        all_predictions = classifier(review, truncation=True)
        for prediction in all_predictions:
            if prediction['label'] == 'POSITIVE':
                positive_reviews.append(review)
            else:
                negative_reviews.append(review)

    num_positive = len(positive_reviews)
    num_negative = len(negative_reviews)
    # Deliberate: reports 0 rather than dividing by zero when there are no
    # negative reviews.
    ratio = num_positive / num_negative if num_negative != 0 else 0
    summaryPos = parallelize_summarization_async(positive_reviews, 4)
    summaryNeg = parallelize_summarization_async(negative_reviews, 4)

    data = {
        'positive_reviews': [num_positive],
        'negative_reviews': [num_negative],
        'Ratio of Positive to Negative Reviews': [ratio],
        'positive_summary': ['\n'.join(map(str, summaryPos))],
        'negative_summary': ['\n'.join(map(str, summaryNeg))]
    }
    df = pd.DataFrame(data)

    df.to_csv("Rev.csv", index=False)
    return data
169
+
170
+
171
def generate_pdf(product_data, review_data):
    """Build output.pdf from the Rev.csv and Product (Pro.csv) files and
    return its path.

    NOTE(review): the two parameters are unused — the report is rebuilt from
    the CSVs written earlier in the pipeline; kept for interface
    compatibility.
    """
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    rev_row = pd.read_csv("Rev.csv").iloc[0]
    pro_row = pd.read_csv("Pro.csv").iloc[0]

    parts = []
    for column in ('positive_reviews', 'negative_reviews', 'Ratio of Positive to Negative Reviews', 'positive_summary', 'negative_summary'):
        parts.append(f"{column}: {rev_row[column]}\n")
    for column in ('product_name', 'Category', 'Reviews', 'Ratings'):
        parts.append(f"{column}: {pro_row[column]}\n")

    # Strip characters FPDF's latin-1 Arial font cannot render.
    cleaned_string = re.sub(r'[^a-zA-Z0-9\s.:]', '', ''.join(parts))
    pdf.multi_cell(0, 10, cleaned_string)

    pdf_path = "output.pdf"
    pdf.output(pdf_path)
    return pdf_path
200
+
201
# Function to interact with ChatPDF API
def get_answer(question, file_path):
    """Upload the PDF at *file_path* to ChatPDF and return its answer to
    *question*, or None on any API failure (an error is shown via st.error).
    """
    headers = {
        # SECURITY: hard-coded API key committed to source — move this to an
        # environment variable or Streamlit secret. Replace with your actual
        # ChatPDF API key.
        'x-api-key': "sec_tq3SOgqLfwOlsWcRP8eATcxzGinyICwK",
    }

    # fix: open the file in a context manager so the handle is always closed
    # (the original leaked an open file descriptor per call).
    with open(file_path, 'rb') as pdf_file:
        files = [
            ('file', ('file', pdf_file, 'application/octet-stream'))
        ]
        response1 = requests.post(
            'https://api.chatpdf.com/v1/sources/add-file', headers=headers, files=files)

    if response1.status_code == 200:
        source_id = response1.json()['sourceId']
    else:
        st.error("Failed to upload PDF to ChatPDF.")
        return None

    data = {
        'sourceId': source_id,
        'messages': [
            {
                'role': "user",
                'content': question,
            }
        ]
    }

    response = requests.post(
        'https://api.chatpdf.com/v1/chats/message', headers=headers, json=data)

    if response.status_code == 200:
        return response.json()['content']
    else:
        st.error("Failed to get response from ChatPDF.")
        return None
237
+
238
# Streamlit application entry: flat script executed top-to-bottom on every
# Streamlit rerun (each widget interaction re-runs the whole pipeline,
# including scraping and model inference — NOTE(review): consider st.cache
# for the heavy steps).
st.title("Amazon Product Insights Dashboard")

# URL input
url = st.text_input("Enter Amazon Product URL:")

if url:
    # Scrape product fields and reviews; also writes Pro.csv as a side effect.
    product_data = scrape_amazon_product(url)

    if product_data:
        st.header(product_data['product_name'])
        st.subheader("Product Description")
        st.write(product_data['product_description'])

        st.subheader("Reviews")
        st.write(product_data['Reviews'])
        # Sentiment counts + summaries; also writes Rev.csv as a side effect.
        review_data = CalcReviews(product_data['Reviews'])

        # Each review_data value is a single-element list; join renders it.
        st.metric("Number of Positive Reviews", ' '.join(map(str,review_data['positive_reviews'])))
        st.metric("Number of Negative Reviews", ' '.join(map(str,review_data['negative_reviews'])))
        st.metric("Positive to Negative Ratio", ' '.join(map(str,review_data['Ratio of Positive to Negative Reviews'])))

        st.subheader("Summary of Positive Reviews")
        st.write(review_data['positive_summary'])

        st.subheader("Summary of Negative Reviews")
        st.write(review_data['negative_summary'])


        # Generate PDF report (reads the CSVs written above)
        pdf_path = generate_pdf(product_data, review_data)

        # Chatbot interaction backed by the generated PDF via ChatPDF
        st.subheader("Chat with the Product")
        user_question = st.text_input("Ask a question about the product:")

        if user_question:
            response = get_answer(user_question, pdf_path)
            st.write(response)
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ requests==2.31.0
2
+ requests-oauthlib==1.3.1
3
+ beautifulsoup4==4.12.3
4
+ transformers==4.41.1
5
+ bert-extractive-summarizer==0.10.1
6
+ fpdf==1.7.2
7
+ geopandas==0.13.2
8
+ pandas==2.0.3
9
+ pandas-datareader==0.10.0
10
+ pandas-gbq==0.19.2
11
+ pandas-stubs==2.0.3.230814
12
+ sklearn-pandas==2.2.0
13
+ torch==2.0.0