krishnasimha committed on
Commit
6b54f7d
·
verified ·
1 Parent(s): 0755029

Upload 3 files

Browse files
Files changed (3) hide show
  1. Dockerfile +27 -20
  2. app4.py +326 -0
  3. requirements.txt +12 -3
Dockerfile CHANGED
@@ -1,20 +1,27 @@
1
- FROM python:3.13.5-slim
2
-
3
- WORKDIR /app
4
-
5
- RUN apt-get update && apt-get install -y \
6
- build-essential \
7
- curl \
8
- git \
9
- && rm -rf /var/lib/apt/lists/*
10
-
11
- COPY requirements.txt ./
12
- COPY src/ ./src/
13
-
14
- RUN pip3 install -r requirements.txt
15
-
16
- EXPOSE 8501
17
-
18
- HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
19
-
20
- ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
 
 
 
 
 
 
 
 
1
FROM python:3.10-slim

# Set working directory
WORKDIR /app

# Create writable cache directories. The transformers directory is created
# explicitly because TRANSFORMERS_CACHE below points at it.
RUN mkdir -p /app/.cache/huggingface/hub \
             /app/.cache/huggingface/transformers \
             /app/.cache/torch \
    && chmod -R 777 /app/.cache

# Set environment variables for caches
ENV HF_HOME=/app/.cache/huggingface \
    TRANSFORMERS_CACHE=/app/.cache/huggingface/transformers \
    TORCH_HOME=/app/.cache/torch \
    XDG_CACHE_HOME=/app/.cache

# Install dependencies (copied first so this layer is cached across code changes)
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy app files
COPY . .

# app4.py creates ".internal_data" (its SQLite files) relative to the working
# directory at start-up. Make that directory writable for the same reason the
# cache directories are: the container may run as a non-root user
# (NOTE(review): Hugging Face Spaces is documented to run as UID 1000 - confirm).
RUN mkdir -p /app/.internal_data && chmod -R 777 /app/.internal_data

# Expose Streamlit port
EXPOSE 7860

# Run Streamlit
CMD ["streamlit", "run", "app4.py", "--server.port=7860", "--server.address=0.0.0.0"]
app4.py ADDED
@@ -0,0 +1,326 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import streamlit as st
3
+ import numpy as np
4
+ import time
5
+ from sentence_transformers import SentenceTransformer
6
+ import datetime
7
+ import feedparser
8
+ from huggingface_hub import hf_hub_download
9
+ import faiss, pickle
10
+ import aiohttp
11
+ import asyncio
12
+ import sqlite3
13
+
14
+ # -------------------------------
15
+ # CONFIG & PRIVATE STORAGE PATHS
16
+ # -------------------------------
17
# Directory (relative to the working directory) holding the app's SQLite
# files, and the paths of the two databases stored inside it.
PRIVATE_DIR = ".internal_data"
CACHE_DB_PATH = os.path.join(PRIVATE_DIR, "query_cache.db")
ADMIN_DB_PATH = os.path.join(PRIVATE_DIR, "cache_admin.db")

# Created at import time; a no-op when the directory already exists.
os.makedirs(PRIVATE_DIR, exist_ok=True)
22
+
23
+ # -------------------------------
24
+ # DATABASE INITIALIZATION
25
+ # -------------------------------
26
def init_cache_db(db_path=CACHE_DB_PATH):
    """Open the query-cache database and ensure its ``cache`` table exists.

    Args:
        db_path: Path of the SQLite file to open.

    Returns:
        The open ``sqlite3.Connection``, created with
        ``check_same_thread=False``.
    """
    schema = """
    CREATE TABLE IF NOT EXISTS cache (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    query TEXT UNIQUE,
    answer TEXT,
    embedding BLOB,
    frequency INTEGER DEFAULT 1
    )
    """
    connection = sqlite3.connect(db_path, check_same_thread=False)
    connection.cursor().execute(schema)
    connection.commit()
    return connection
40
+
41
def init_admin_db(db_path=ADMIN_DB_PATH):
    """Open the admin database and ensure its ``logs`` table exists.

    Args:
        db_path: Path of the SQLite file to open.

    Returns:
        The open ``sqlite3.Connection``, created with
        ``check_same_thread=False``.
    """
    schema = """
    CREATE TABLE IF NOT EXISTS logs (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    ts TEXT,
    query TEXT,
    cached INTEGER,
    faiss_score REAL
    )
    """
    connection = sqlite3.connect(db_path, check_same_thread=False)
    connection.cursor().execute(schema)
    connection.commit()
    return connection
55
+
56
# Module-level connections used by the cache helpers and the admin logger.
# The cache database is opened first, then the admin database.
cache_conn, admin_conn = init_cache_db(), init_admin_db()
58
+
59
+ # -------------------------------
60
+ # ADMIN LOGGING (PRIVATE DB)
61
+ # -------------------------------
62
def admin_log(query, cached: bool, faiss_score: float | None):
    """Append one row to the ``logs`` table of the admin database and commit.

    Args:
        query: The user query text to record.
        cached: Whether the answer came from the cache; stored as 0/1.
        faiss_score: FAISS score to record, or ``None`` to store NULL.
    """
    timestamp = datetime.datetime.now().isoformat()
    cached_flag = int(bool(cached))
    if faiss_score is None:
        score = None
    else:
        score = float(faiss_score)

    cursor = admin_conn.cursor()
    cursor.execute(
        "INSERT INTO logs (ts, query, cached, faiss_score) VALUES (?, ?, ?, ?)",
        (timestamp, query, cached_flag, score),
    )
    admin_conn.commit()
69
+
70
+ # -------------------------------
71
+ # SIMPLE CACHE HELPERS (EXACT MATCH)
72
+ # -------------------------------
73
def store_in_cache(query, answer, embedding: np.ndarray):
    """Insert or replace the cached answer for ``query`` and commit.

    The stored ``frequency`` is the query's existing frequency (0 when there
    is no existing row) plus one.

    Args:
        query: Exact query text; the table's unique key.
        answer: Answer text to store.
        embedding: Query embedding; stored as raw float32 bytes.
    """
    upsert_sql = """
    INSERT OR REPLACE INTO cache (query, answer, embedding, frequency)
    VALUES (?, ?, ?, COALESCE((SELECT frequency FROM cache WHERE query=?), 0) + 1)
    """
    embedding_blob = embedding.astype(np.float32).tobytes()
    cursor = cache_conn.cursor()
    cursor.execute(upsert_sql, (query, answer, embedding_blob, query))
    cache_conn.commit()
80
+
81
def search_cache_exact(query):
    """Return the cached answer whose query text equals ``query``, else ``None``."""
    cursor = cache_conn.cursor()
    cursor.execute("SELECT answer FROM cache WHERE query = ?", (query,))
    match = cursor.fetchone()
    if not match:
        return None
    return match[0]
86
+
87
def get_top_cached_queries(limit=5):
    """Return up to ``limit`` ``(query, frequency)`` rows, highest frequency first."""
    top_sql = "SELECT query, frequency FROM cache ORDER BY frequency DESC LIMIT ?"
    cursor = cache_conn.cursor()
    return cursor.execute(top_sql, (limit,)).fetchall()
91
+
92
+ # -------------------------------
93
+ # Load FAISS index + embedder
94
+ # -------------------------------
95
@st.cache_resource
def load_index_and_model():
    """Download the knowledge-base files and load the index, metadata and embedder.

    Returns:
        A ``(index, metadata, embed_model)`` tuple: the FAISS index read from
        ``health_index.faiss``, the object unpickled from
        ``health_metadata.pkl``, and the ``all-MiniLM-L6-v2``
        ``SentenceTransformer``.
    """
    dataset_repo = "krishnasimha/health-chatbot-data"
    local_paths = {
        filename: hf_hub_download(
            repo_id=dataset_repo,
            filename=filename,
            repo_type="dataset",
        )
        for filename in ("health_index.faiss", "health_metadata.pkl")
    }

    index = faiss.read_index(local_paths["health_index.faiss"])

    # NOTE(review): pickle.load can execute arbitrary code contained in the
    # file; this is only safe while the dataset repository is trusted.
    with open(local_paths["health_metadata.pkl"], "rb") as handle:
        metadata = pickle.load(handle)

    embed_model = SentenceTransformer("all-MiniLM-L6-v2")
    return index, metadata, embed_model


index, metadata, embed_model = load_index_and_model()
116
+
117
+ # -------------------------------
118
+ # FAISS benchmark (sidebar)
119
+ # -------------------------------
120
def benchmark_faiss(n_queries=40, k=3):
    """Time FAISS searches and show the average latency in the sidebar.

    Three fixed sample questions are embedded once; each of the
    ``n_queries`` searches uses one of those embeddings chosen at random.

    Args:
        n_queries: Number of timed searches. Nothing is measured or
            displayed when this is not positive.
        k: Number of neighbours requested per search.
    """
    if n_queries <= 0:
        # Averaging zero samples would display "nan ms/query".
        return

    queries = ["What is diabetes?", "How to prevent malaria?", "Symptoms of dengue?"]
    query_embs = embed_model.encode(queries, convert_to_numpy=True)

    times = []
    for _ in range(n_queries):
        q = query_embs[np.random.randint(0, len(query_embs))].reshape(1, -1)
        # perf_counter is monotonic and high-resolution; time.time() is
        # wall-clock time and too coarse for sub-millisecond measurements.
        start = time.perf_counter()
        index.search(q, k)
        times.append(time.perf_counter() - start)

    avg_time = np.mean(times) * 1000
    st.sidebar.write(f"⚡ FAISS Speed: {avg_time:.2f} ms/query")
131
+
132
+ # -------------------------------
133
+ # RSS News fetcher (async)
134
+ # -------------------------------
135
RSS_URL = "https://news.google.com/rss/search?q=health+disease+awareness&hl=en-IN&gl=IN&ceid=IN:en"


async def fetch_rss_url(url):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the feed to fetch.

    Returns:
        The decoded response body.

    Raises:
        aiohttp.ClientResponseError: If the server answers with an HTTP
            error status.
        asyncio.TimeoutError: If the request takes longer than 10 seconds.
    """
    # The caller runs this synchronously from the Streamlit script, so bound
    # the wait explicitly rather than relying on aiohttp's default timeout.
    timeout = aiohttp.ClientTimeout(total=10)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        async with session.get(url) as resp:
            # Do not hand an HTTP error page to the feed parser.
            resp.raise_for_status()
            return await resp.text()
141
+
142
def fetch_news():
    """Fetch the RSS feed and return its first five entries.

    Returns:
        A list of dicts with ``title``, ``link`` and ``published`` keys (an
        empty string stands in for any field an entry lacks). Returns an
        empty list when the feed cannot be downloaded or parsed; the failure
        is logged instead of being silently discarded.
    """
    import logging

    try:
        xml = asyncio.run(fetch_rss_url(RSS_URL))
        feed = feedparser.parse(xml)
    except Exception:
        # Best effort: the news panel is optional, so report and carry on.
        logging.getLogger(__name__).exception("Could not fetch the health news feed")
        return []

    return [
        {
            "title": getattr(e, "title", ""),
            "link": getattr(e, "link", ""),
            "published": getattr(e, "published", ""),
        }
        for e in feed.entries[:5]
    ]
149
+
150
def update_news_hourly():
    """Refresh the session's news articles if the last refresh is over an hour old.

    The refresh time is kept in ``st.session_state.last_news_update`` and the
    articles in ``st.session_state.news_articles``. A refresh also happens
    when no refresh time has been recorded yet.
    """
    now = datetime.datetime.now()
    if "last_news_update" in st.session_state:
        # total_seconds() is the full elapsed time; timedelta.seconds is only
        # the seconds component and wraps around every 24 hours.
        elapsed = (now - st.session_state.last_news_update).total_seconds()
        if elapsed <= 3600:
            return

    st.session_state.last_news_update = now
    st.session_state.news_articles = fetch_news()
155
+
156
+ # -------------------------------
157
+ # Together API async call
158
+ # -------------------------------
159
async def async_together_chat(messages):
    """Send ``messages`` to the Together chat-completions API and return the reply.

    Args:
        messages: List of ``{"role": ..., "content": ...}`` dicts sent as the
            conversation.

    Returns:
        The ``content`` of the first choice in the API response.

    Raises:
        RuntimeError: If the ``TOGETHER_API_KEY`` environment variable is
            unset or blank.
        aiohttp.ClientResponseError: If the API answers with an HTTP error
            status.
    """
    api_key = os.environ.get("TOGETHER_API_KEY", "").strip()
    if not api_key:
        # Fail fast with a clear message instead of sending an empty Bearer
        # token and surfacing an opaque 401 from the API.
        raise RuntimeError("TOGETHER_API_KEY is not set; cannot call the Together API.")

    url = "https://api.together.xyz/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    payload = {"model": "deepseek-ai/DeepSeek-V3", "messages": messages}

    async with aiohttp.ClientSession() as session:
        async with session.post(url, headers=headers, json=payload) as resp:
            resp.raise_for_status()
            data = await resp.json()
            return data["choices"][0]["message"]["content"]
172
+
173
+ # -------------------------------
174
+ # Main retrieval flow (exact-match cache -> faiss -> API)
175
+ # -------------------------------
176
def retrieve_answer(query, k=3):
    """Answer ``query`` from the exact-match cache, else via FAISS context + the LLM.

    Flow: exact-match cache lookup -> FAISS retrieval over the knowledge
    base -> LLM call with the retrieved context -> store the answer in the
    cache and in the current chat's history.

    Args:
        query: The user's question.
        k: Number of knowledge-base passages to retrieve.

    Returns:
        ``(answer, sources)``. ``sources`` is an empty list when the answer
        came from the cache.
    """
    # 1) Exact cache
    cached = search_cache_exact(query)
    if cached:
        # Count the hit: store_in_cache() only runs on a miss, so without
        # this the frequency shown under "Most Asked Questions" never grows.
        cache_conn.execute("UPDATE cache SET frequency = frequency + 1 WHERE query = ?", (query,))
        cache_conn.commit()
        admin_log(query, True, None)
        st.sidebar.success("⚡ From cache")
        return cached, []

    # 2) FAISS retrieval on the knowledge base
    q_emb = embed_model.encode([query], convert_to_numpy=True)
    D, I = index.search(q_emb, k)
    # FAISS pads the result with -1 when fewer than k neighbours exist; a
    # negative id would silently pick an entry from the end of the metadata.
    hits = [int(i) for i in I[0] if i >= 0]
    retrieved = [metadata["texts"][i] for i in hits]
    sources = [metadata["sources"][i] for i in hits]
    context = "\n".join(retrieved)

    user_message = {"role": "user", "content": f"Use the context to answer:\n\n{context}\n\nQuestion: {query}"}

    if "chats" not in st.session_state:
        st.session_state.chats = {}
    history = st.session_state.chats.setdefault(
        st.session_state.current_chat,
        [{"role": "system", "content": "You are a helpful public health bot."}],
    )

    # 3) Call the LLM. The new message is sent along with the history but
    # only stored once the call succeeds, so a failed request does not leave
    # an unanswered user message in the chat.
    answer = asyncio.run(async_together_chat(history + [user_message]))
    history.append(user_message)
    history.append({"role": "assistant", "content": answer})

    # 4) Store the exact query -> answer mapping
    store_in_cache(query, answer, q_emb[0])

    # Log the score of the best match (D holds distances; lower means closer
    # for many index types), or NULL when nothing was retrieved.
    faiss_score = float(D[0][0]) if hits else None
    admin_log(query, False, faiss_score)

    return answer, sources
211
+
212
+ # -------------------------------
213
+ # Background news updater (fire-and-forget)
214
+ # -------------------------------
215
async def news_updater_loop():
    """Refresh ``st.session_state.news_articles`` once an hour, forever.

    NOTE(review): nothing in this file schedules this coroutine; the hourly
    refresh is done by update_news_hourly() on each script run. Whether
    ``st.session_state`` is usable from wherever this would run is not
    established here - confirm before scheduling it.
    """
    while True:
        # fetch_news() calls asyncio.run(), which raises RuntimeError inside
        # a running event loop, so run it in a worker thread.
        st.session_state.news_articles = await asyncio.to_thread(fetch_news)
        await asyncio.sleep(3600)


# Previously a task for news_updater_loop() was created on a brand-new event
# loop that was never run, so the coroutine never executed and the pending
# task only produced "Task was destroyed but it is pending" warnings.
# The key is still set so code that checks for it keeps working.
if "news_task" not in st.session_state:
    st.session_state.news_task = None
224
+
225
+ # -------------------------------
226
+ # Streamlit UI (main)
227
+ # -------------------------------
228
st.title("Health Awareness Chatbot")

# Chat initialization (session): each chat is a list of role/content
# messages that starts with a system prompt.
if "chats" not in st.session_state:
    st.session_state.chats = {}
if "current_chat" not in st.session_state:
    st.session_state.current_chat = "New Chat 1"
    st.session_state.chats["New Chat 1"] = [{"role": "system", "content": "You are a helpful public health awareness chatbot."}]

# Sidebar: chat manager + benchmark + top cached queries (read-only)
st.sidebar.header("Chat Manager")
if st.sidebar.button("➕ New Chat"):
    # Skip numbers already in use: after a rename, "New Chat <count + 1>"
    # can name an existing chat, and assigning to it would wipe its history.
    chat_number = len(st.session_state.chats) + 1
    while f"New Chat {chat_number}" in st.session_state.chats:
        chat_number += 1
    name = f"New Chat {chat_number}"
    st.session_state.chats[name] = [{"role": "system", "content": "You are a helpful public health awareness chatbot."}]
    st.session_state.current_chat = name

benchmark_faiss()

# Show the top cached queries so the user can reuse the exact strings
st.sidebar.subheader("🔥 Most Asked Questions")
top_qs = get_top_cached_queries(limit=5)
if top_qs:
    for q, freq in top_qs:
        st.sidebar.write(f"**{q}** — used {freq} times")
else:
    st.sidebar.write("_No cached queries yet._")

# Chat selector (unique key to avoid a duplicate-element error)
chat_list = list(st.session_state.chats)
selected_chat = st.sidebar.selectbox(
    "Your chats:",
    chat_list,
    index=chat_list.index(st.session_state.current_chat),
    key="chat_selector",
)
st.session_state.current_chat = selected_chat

# Rename chat: only applied when the new name is non-empty and not taken
new_name = st.sidebar.text_input("Rename Chat:", st.session_state.current_chat, key="rename_chat")
if new_name and new_name != st.session_state.current_chat:
    if new_name not in st.session_state.chats:
        st.session_state.chats[new_name] = st.session_state.chats.pop(st.session_state.current_chat)
        st.session_state.current_chat = new_name
271
+
272
+ # -------------------------------
273
+ # News section & query input
274
+ # -------------------------------
275
# Refresh the news at most once an hour, then list the stored articles.
update_news_hourly()
st.subheader("📰 Latest Health News")
if "news_articles" in st.session_state:
    for art in st.session_state.news_articles:
        st.markdown(f"**{art['title']}**  \n[Read more]({art['link']})  \n*{art['published']}*")
        st.write("---")

user_query = st.text_input("Ask me about health, prevention, or awareness:")

if user_query:
    with st.spinner("Searching knowledge base..."):
        answer, sources = retrieve_answer(user_query)
    st.write("### 💡 Answer")
    st.write(answer)

    # Cached answers come back with no sources; skip the empty heading.
    if sources:
        st.write("### 📖 Sources")
        for src in sources:
            st.write(f"- {src}")

# Render chat history (system messages are not shown)
for msg in st.session_state.chats[st.session_state.current_chat]:
    if msg["role"] == "user":
        st.write(f"🧑 **You:** {msg['content']}")
    elif msg["role"] == "assistant":
        st.write(f"🤖 **Bot:** {msg['content']}")
300
+
301
+ # -------------------------------
302
+ # ADMIN (hidden) - only visible if admin key provided
303
+ # -------------------------------
304
+ # Get admin key from secrets or environment
305
import hmac

# Admin key: Streamlit secrets first, environment variable as fallback.
ADMIN_KEY_SECRET = None
try:
    ADMIN_KEY_SECRET = st.secrets["ADMIN_KEY"]
except Exception:
    ADMIN_KEY_SECRET = os.environ.get("ADMIN_KEY", None)

# Admin panel in the sidebar (password protected).
with st.sidebar.expander("🔐 Admin (hidden)"):
    dev_key = st.text_input("Admin key (password):", type="password", key="admin_key_input")

    # Constant-time comparison so response timing does not leak how much of
    # the key matched. Both sides are compared as UTF-8 bytes because
    # compare_digest rejects non-ASCII str arguments.
    key_ok = bool(dev_key and ADMIN_KEY_SECRET) and hmac.compare_digest(
        dev_key.encode("utf-8"),
        str(ADMIN_KEY_SECRET).encode("utf-8"),
    )

    if key_ok:
        st.success("Admin access granted — private logs shown below.")
        # Most recent 200 rows of the private admin log
        c = admin_conn.cursor()
        rows = c.execute("SELECT id, ts, query, cached, faiss_score FROM logs ORDER BY id DESC LIMIT 200").fetchall()
        st.write(f"Showing {len(rows)} recent log rows (private).")
        for rid, ts, q, cached, score in rows:
            st.text(f"[{rid}] {ts} | cached={bool(cached)} | faiss_score={score}")
            st.markdown(f"- **Q:** {q}")
            st.write("---")
    elif dev_key:
        st.error("Wrong admin key.")
326
+
requirements.txt CHANGED
@@ -1,3 +1,12 @@
1
- altair
2
- pandas
3
- streamlit
 
 
 
 
 
 
 
 
 
 
1
streamlit
numpy
sentence-transformers
together
aiohttp
feedparser
huggingface_hub
faiss-cpu
# pickle and sqlite3 are part of the Python standard library and need no
# entry here. The former "pickle5" entry is a backport for Python 3.5-3.7,
# and "sqlite3-binary" is not a dependency of the app; both were dropped so
# that "pip install -r requirements.txt" can succeed on Python 3.10.
11
+
12
+