Starberry15 committed on
Commit
5526d83
·
verified ·
1 Parent(s): e575ac4

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +41 -39
src/streamlit_app.py CHANGED
@@ -6,6 +6,7 @@ import plotly.express as px
6
  import plotly.figure_factory as ff
7
  from dotenv import load_dotenv
8
  from huggingface_hub import InferenceClient, login
 
9
  from io import StringIO
10
 
11
  # ======================================================
@@ -13,18 +14,26 @@ from io import StringIO
13
  # ======================================================
14
  st.set_page_config(page_title="📊 Smart Data Analyst Pro", layout="wide")
15
  st.title("📊 Smart Data Analyst Pro")
16
- st.caption("AI that cleans, analyzes, and visualizes your data — powered by Hugging Face Inference API.")
17
 
18
  # ======================================================
19
  # πŸ” Load Environment Variables
20
  # ======================================================
21
  load_dotenv()
22
  HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_API_KEY")
 
 
23
  if not HF_TOKEN:
24
  st.error("❌ Missing HF_TOKEN. Please set it in your .env file.")
25
  else:
26
  login(token=HF_TOKEN)
27
 
 
 
 
 
 
 
28
  # ======================================================
29
  # 🧠 MODEL SETUP
30
  # ======================================================
@@ -34,7 +43,7 @@ with st.sidebar:
34
  CLEANER_MODEL = st.selectbox(
35
  "Select Cleaner Model:",
36
  [
37
- "Qwen/Qwen2.5-Coder-7B-Instruct",
38
  "meta-llama/Meta-Llama-3-8B-Instruct",
39
  "microsoft/Phi-3-mini-4k-instruct",
40
  "mistralai/Mistral-7B-Instruct-v0.3"
@@ -46,6 +55,7 @@ with st.sidebar:
46
  "Select Analysis Model:",
47
  [
48
  "Qwen/Qwen2.5-14B-Instruct",
 
49
  "mistralai/Mistral-7B-Instruct-v0.3",
50
  "HuggingFaceH4/zephyr-7b-beta"
51
  ],
@@ -56,17 +66,16 @@ with st.sidebar:
56
  max_tokens = st.slider("Max Tokens", 128, 2048, 512)
57
 
58
  # Initialize inference clients
59
- cleaner_client = InferenceClient(model=CLEANER_MODEL, token=HF_TOKEN)
60
- analyst_client = InferenceClient(model=ANALYST_MODEL, token=HF_TOKEN)
 
 
61
 
62
  # ======================================================
63
  # 🧩 SAFE GENERATION FUNCTION
64
  # ======================================================
65
  def safe_hf_generate(client, prompt, temperature=0.3, max_tokens=512):
66
- """
67
- Tries text_generation first, then falls back to chat_completion if not supported.
68
- Returns plain string content.
69
- """
70
  try:
71
  resp = client.text_generation(
72
  prompt,
@@ -90,7 +99,6 @@ def safe_hf_generate(client, prompt, temperature=0.3, max_tokens=512):
90
  # 🧩 SMART DATA CLEANING
91
  # ======================================================
92
  def fallback_clean(df: pd.DataFrame) -> pd.DataFrame:
93
- """Backup rule-based cleaner."""
94
  df = df.copy()
95
  df.dropna(axis=1, how="all", inplace=True)
96
  df.columns = [c.strip().replace(" ", "_").lower() for c in df.columns]
@@ -107,8 +115,8 @@ def fallback_clean(df: pd.DataFrame) -> pd.DataFrame:
107
 
108
 
109
  def ai_clean_dataset(df: pd.DataFrame) -> pd.DataFrame:
110
- """Cleans the dataset using the selected AI model. Falls back gracefully if the model fails."""
111
- raw_preview = df.head(5).to_csv(index=False)
112
  prompt = f"""
113
  You are a professional data cleaning assistant.
114
  Clean and standardize the dataset below dynamically:
@@ -118,31 +126,19 @@ Clean and standardize the dataset below dynamically:
118
  4. Remove irrelevant or duplicate rows
119
  Return ONLY a valid CSV text (no markdown, no explanations).
120
 
121
- --- RAW SAMPLE ---
122
- {raw_preview}
123
  """
124
 
125
  try:
126
- cleaned_str = safe_hf_generate(cleaner_client, prompt, temperature=0.1, max_tokens=1024)
127
  except Exception as e:
128
  st.warning(f"⚠️ AI cleaning failed: {e}")
129
  return fallback_clean(df)
130
 
131
- cleaned_str = (
132
- cleaned_str.replace("```csv", "")
133
- .replace("```", "")
134
- .replace("###", "")
135
- .replace(";", ",")
136
- .strip()
137
- )
138
-
139
- lines = cleaned_str.splitlines()
140
- lines = [line for line in lines if "," in line and not line.lower().startswith(("note", "summary"))]
141
- cleaned_str = "\n".join(lines)
142
-
143
  try:
144
  cleaned_df = pd.read_csv(StringIO(cleaned_str), on_bad_lines="skip")
145
- cleaned_df = cleaned_df.dropna(axis=1, how="all")
146
  cleaned_df.columns = [c.strip().replace(" ", "_").lower() for c in cleaned_df.columns]
147
  return cleaned_df
148
  except Exception as e:
@@ -150,8 +146,10 @@ Return ONLY a valid CSV text (no markdown, no explanations).
150
  return fallback_clean(df)
151
 
152
 
 
 
 
153
  def summarize_dataframe(df: pd.DataFrame) -> str:
154
- """Generate a concise summary of the dataframe."""
155
  lines = [f"Rows: {len(df)} | Columns: {len(df.columns)}", "Column summaries:"]
156
  for col in df.columns[:10]:
157
  non_null = int(df[col].notnull().sum())
@@ -167,18 +165,13 @@ def summarize_dataframe(df: pd.DataFrame) -> str:
167
 
168
 
169
  def query_analysis_model(df: pd.DataFrame, user_query: str, dataset_name: str) -> str:
170
- """Send the dataframe and user query to the analysis model for interpretation."""
171
- df_summary = summarize_dataframe(df)
172
- sample = df.head(6).to_csv(index=False)
173
  prompt = f"""
174
  You are a professional data analyst.
175
  Analyze the dataset '{dataset_name}' and answer the user's question.
176
 
177
- --- SUMMARY ---
178
- {df_summary}
179
-
180
- --- SAMPLE DATA ---
181
- {sample}
182
 
183
  --- USER QUESTION ---
184
  {user_query}
@@ -189,13 +182,21 @@ Respond with:
189
  3. Notable relationships or anomalies
190
  4. Data-driven recommendations
191
  """
192
-
193
  try:
194
- response = safe_hf_generate(analyst_client, prompt, temperature=temperature, max_tokens=max_tokens)
195
- return response
 
 
 
 
 
 
 
 
196
  except Exception as e:
197
  return f"⚠️ Analysis failed: {e}"
198
 
 
199
  # ======================================================
200
  # 🚀 MAIN APP LOGIC
201
  # ======================================================
@@ -264,5 +265,6 @@ if uploaded:
264
  result = query_analysis_model(cleaned_df, user_query, uploaded.name)
265
  st.markdown("### 💡 Insights")
266
  st.markdown(result)
 
267
  else:
268
  st.info("📥 Upload a dataset to begin smart analysis.")
 
6
  import plotly.figure_factory as ff
7
  from dotenv import load_dotenv
8
  from huggingface_hub import InferenceClient, login
9
+ from google import genai
10
  from io import StringIO
11
 
12
  # ======================================================
 
14
  # ======================================================
15
  st.set_page_config(page_title="📊 Smart Data Analyst Pro", layout="wide")
16
  st.title("📊 Smart Data Analyst Pro")
17
+ st.caption("AI that cleans, analyzes, and visualizes your data — Hugging Face + Gemini compatible.")
18
 
19
  # ======================================================
20
  # πŸ” Load Environment Variables
21
  # ======================================================
22
  load_dotenv()
23
  HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_API_KEY")
24
+ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
25
+
26
  if not HF_TOKEN:
27
  st.error("❌ Missing HF_TOKEN. Please set it in your .env file.")
28
  else:
29
  login(token=HF_TOKEN)
30
 
31
+ if GEMINI_API_KEY:
32
+ gemini_client = genai.Client(api_key=GEMINI_API_KEY)
33
+ else:
34
+ gemini_client = None
35
+ st.warning("⚠️ Gemini API key missing. Gemini 2.5 Flash will not work.")
36
+
37
  # ======================================================
38
  # 🧠 MODEL SETUP
39
  # ======================================================
 
43
  CLEANER_MODEL = st.selectbox(
44
  "Select Cleaner Model:",
45
  [
46
+ "Qwen/Qwen2.5-Coder-14B",
47
  "meta-llama/Meta-Llama-3-8B-Instruct",
48
  "microsoft/Phi-3-mini-4k-instruct",
49
  "mistralai/Mistral-7B-Instruct-v0.3"
 
55
  "Select Analysis Model:",
56
  [
57
  "Qwen/Qwen2.5-14B-Instruct",
58
+ "Gemini 2.5 Flash (Google)",
59
  "mistralai/Mistral-7B-Instruct-v0.3",
60
  "HuggingFaceH4/zephyr-7b-beta"
61
  ],
 
66
  max_tokens = st.slider("Max Tokens", 128, 2048, 512)
67
 
68
  # Initialize inference clients
69
+ hf_cleaner_client = InferenceClient(model=CLEANER_MODEL, token=HF_TOKEN)
70
+ hf_analyst_client = None
71
+ if ANALYST_MODEL != "Gemini 2.5 Flash (Google)":
72
+ hf_analyst_client = InferenceClient(model=ANALYST_MODEL, token=HF_TOKEN)
73
 
74
  # ======================================================
75
  # 🧩 SAFE GENERATION FUNCTION
76
  # ======================================================
77
  def safe_hf_generate(client, prompt, temperature=0.3, max_tokens=512):
78
+ """HF text generation fallback to chat_completion"""
 
 
 
79
  try:
80
  resp = client.text_generation(
81
  prompt,
 
99
  # 🧩 SMART DATA CLEANING
100
  # ======================================================
101
  def fallback_clean(df: pd.DataFrame) -> pd.DataFrame:
 
102
  df = df.copy()
103
  df.dropna(axis=1, how="all", inplace=True)
104
  df.columns = [c.strip().replace(" ", "_").lower() for c in df.columns]
 
115
 
116
 
117
  def ai_clean_dataset(df: pd.DataFrame) -> pd.DataFrame:
118
+ """Clean dataset using AI. Full dataset sent for thorough cleaning."""
119
+ csv_text = df.to_csv(index=False)
120
  prompt = f"""
121
  You are a professional data cleaning assistant.
122
  Clean and standardize the dataset below dynamically:
 
126
  4. Remove irrelevant or duplicate rows
127
  Return ONLY a valid CSV text (no markdown, no explanations).
128
 
129
+ Dataset:
130
+ {csv_text}
131
  """
132
 
133
  try:
134
+ cleaned_str = safe_hf_generate(hf_cleaner_client, prompt, temperature=0.1, max_tokens=4096)
135
  except Exception as e:
136
  st.warning(f"⚠️ AI cleaning failed: {e}")
137
  return fallback_clean(df)
138
 
139
+ cleaned_str = cleaned_str.replace("```csv", "").replace("```", "").replace("###", "").strip()
 
 
 
 
 
 
 
 
 
 
 
140
  try:
141
  cleaned_df = pd.read_csv(StringIO(cleaned_str), on_bad_lines="skip")
 
142
  cleaned_df.columns = [c.strip().replace(" ", "_").lower() for c in cleaned_df.columns]
143
  return cleaned_df
144
  except Exception as e:
 
146
  return fallback_clean(df)
147
 
148
 
149
+ # ======================================================
150
+ # 🧩 DATA ANALYSIS
151
+ # ======================================================
152
  def summarize_dataframe(df: pd.DataFrame) -> str:
 
153
  lines = [f"Rows: {len(df)} | Columns: {len(df.columns)}", "Column summaries:"]
154
  for col in df.columns[:10]:
155
  non_null = int(df[col].notnull().sum())
 
165
 
166
 
167
  def query_analysis_model(df: pd.DataFrame, user_query: str, dataset_name: str) -> str:
168
+ csv_text = df.to_csv(index=False)
 
 
169
  prompt = f"""
170
  You are a professional data analyst.
171
  Analyze the dataset '{dataset_name}' and answer the user's question.
172
 
173
+ --- FULL DATA SAMPLE ---
174
+ {csv_text}
 
 
 
175
 
176
  --- USER QUESTION ---
177
  {user_query}
 
182
  3. Notable relationships or anomalies
183
  4. Data-driven recommendations
184
  """
 
185
  try:
186
+ if ANALYST_MODEL == "Gemini 2.5 Flash (Google)":
187
+ if gemini_client is None:
188
+ return "⚠️ Gemini API key missing."
189
+ response = gemini_client.models.generate_content(
190
+ model="gemini-2.5-flash",
191
+ contents=[prompt]
192
+ )
193
+ return getattr(response, "text", "No response from Gemini.")
194
+ else:
195
+ return safe_hf_generate(hf_analyst_client, prompt, temperature=temperature, max_tokens=max_tokens)
196
  except Exception as e:
197
  return f"⚠️ Analysis failed: {e}"
198
 
199
+
200
  # ======================================================
201
  # 🚀 MAIN APP LOGIC
202
  # ======================================================
 
265
  result = query_analysis_model(cleaned_df, user_query, uploaded.name)
266
  st.markdown("### 💡 Insights")
267
  st.markdown(result)
268
+
269
  else:
270
  st.info("πŸ“₯ Upload a dataset to begin smart analysis.")