Spaces:
Sleeping
Sleeping
Update src/streamlit_app.py
Browse files- src/streamlit_app.py +41 -39
src/streamlit_app.py
CHANGED
|
@@ -6,6 +6,7 @@ import plotly.express as px
|
|
| 6 |
import plotly.figure_factory as ff
|
| 7 |
from dotenv import load_dotenv
|
| 8 |
from huggingface_hub import InferenceClient, login
|
|
|
|
| 9 |
from io import StringIO
|
| 10 |
|
| 11 |
# ======================================================
|
|
@@ -13,18 +14,26 @@ from io import StringIO
|
|
| 13 |
# ======================================================
|
| 14 |
st.set_page_config(page_title="π Smart Data Analyst Pro", layout="wide")
|
| 15 |
st.title("π Smart Data Analyst Pro")
|
| 16 |
-
st.caption("AI that cleans, analyzes, and visualizes your data β
|
| 17 |
|
| 18 |
# ======================================================
|
| 19 |
# π Load Environment Variables
|
| 20 |
# ======================================================
|
| 21 |
load_dotenv()
|
| 22 |
HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_API_KEY")
|
|
|
|
|
|
|
| 23 |
if not HF_TOKEN:
|
| 24 |
st.error("β Missing HF_TOKEN. Please set it in your .env file.")
|
| 25 |
else:
|
| 26 |
login(token=HF_TOKEN)
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
# ======================================================
|
| 29 |
# 🧠 MODEL SETUP
|
| 30 |
# ======================================================
|
|
@@ -34,7 +43,7 @@ with st.sidebar:
|
|
| 34 |
CLEANER_MODEL = st.selectbox(
|
| 35 |
"Select Cleaner Model:",
|
| 36 |
[
|
| 37 |
-
"Qwen/Qwen2.5-Coder-
|
| 38 |
"meta-llama/Meta-Llama-3-8B-Instruct",
|
| 39 |
"microsoft/Phi-3-mini-4k-instruct",
|
| 40 |
"mistralai/Mistral-7B-Instruct-v0.3"
|
|
@@ -46,6 +55,7 @@ with st.sidebar:
|
|
| 46 |
"Select Analysis Model:",
|
| 47 |
[
|
| 48 |
"Qwen/Qwen2.5-14B-Instruct",
|
|
|
|
| 49 |
"mistralai/Mistral-7B-Instruct-v0.3",
|
| 50 |
"HuggingFaceH4/zephyr-7b-beta"
|
| 51 |
],
|
|
@@ -56,17 +66,16 @@ with st.sidebar:
|
|
| 56 |
max_tokens = st.slider("Max Tokens", 128, 2048, 512)
|
| 57 |
|
| 58 |
# Initialize inference clients
|
| 59 |
-
|
| 60 |
-
|
|
|
|
|
|
|
| 61 |
|
| 62 |
# ======================================================
|
| 63 |
# 🧩 SAFE GENERATION FUNCTION
|
| 64 |
# ======================================================
|
| 65 |
def safe_hf_generate(client, prompt, temperature=0.3, max_tokens=512):
|
| 66 |
-
"""
|
| 67 |
-
Tries text_generation first, then falls back to chat_completion if not supported.
|
| 68 |
-
Returns plain string content.
|
| 69 |
-
"""
|
| 70 |
try:
|
| 71 |
resp = client.text_generation(
|
| 72 |
prompt,
|
|
@@ -90,7 +99,6 @@ def safe_hf_generate(client, prompt, temperature=0.3, max_tokens=512):
|
|
| 90 |
# 🧩 SMART DATA CLEANING
|
| 91 |
# ======================================================
|
| 92 |
def fallback_clean(df: pd.DataFrame) -> pd.DataFrame:
|
| 93 |
-
"""Backup rule-based cleaner."""
|
| 94 |
df = df.copy()
|
| 95 |
df.dropna(axis=1, how="all", inplace=True)
|
| 96 |
df.columns = [c.strip().replace(" ", "_").lower() for c in df.columns]
|
|
@@ -107,8 +115,8 @@ def fallback_clean(df: pd.DataFrame) -> pd.DataFrame:
|
|
| 107 |
|
| 108 |
|
| 109 |
def ai_clean_dataset(df: pd.DataFrame) -> pd.DataFrame:
|
| 110 |
-
"""
|
| 111 |
-
|
| 112 |
prompt = f"""
|
| 113 |
You are a professional data cleaning assistant.
|
| 114 |
Clean and standardize the dataset below dynamically:
|
|
@@ -118,31 +126,19 @@ Clean and standardize the dataset below dynamically:
|
|
| 118 |
4. Remove irrelevant or duplicate rows
|
| 119 |
Return ONLY a valid CSV text (no markdown, no explanations).
|
| 120 |
|
| 121 |
-
|
| 122 |
-
{
|
| 123 |
"""
|
| 124 |
|
| 125 |
try:
|
| 126 |
-
cleaned_str = safe_hf_generate(
|
| 127 |
except Exception as e:
|
| 128 |
st.warning(f"β οΈ AI cleaning failed: {e}")
|
| 129 |
return fallback_clean(df)
|
| 130 |
|
| 131 |
-
cleaned_str = (
|
| 132 |
-
cleaned_str.replace("```csv", "")
|
| 133 |
-
.replace("```", "")
|
| 134 |
-
.replace("###", "")
|
| 135 |
-
.replace(";", ",")
|
| 136 |
-
.strip()
|
| 137 |
-
)
|
| 138 |
-
|
| 139 |
-
lines = cleaned_str.splitlines()
|
| 140 |
-
lines = [line for line in lines if "," in line and not line.lower().startswith(("note", "summary"))]
|
| 141 |
-
cleaned_str = "\n".join(lines)
|
| 142 |
-
|
| 143 |
try:
|
| 144 |
cleaned_df = pd.read_csv(StringIO(cleaned_str), on_bad_lines="skip")
|
| 145 |
-
cleaned_df = cleaned_df.dropna(axis=1, how="all")
|
| 146 |
cleaned_df.columns = [c.strip().replace(" ", "_").lower() for c in cleaned_df.columns]
|
| 147 |
return cleaned_df
|
| 148 |
except Exception as e:
|
|
@@ -150,8 +146,10 @@ Return ONLY a valid CSV text (no markdown, no explanations).
|
|
| 150 |
return fallback_clean(df)
|
| 151 |
|
| 152 |
|
|
|
|
|
|
|
|
|
|
| 153 |
def summarize_dataframe(df: pd.DataFrame) -> str:
|
| 154 |
-
"""Generate a concise summary of the dataframe."""
|
| 155 |
lines = [f"Rows: {len(df)} | Columns: {len(df.columns)}", "Column summaries:"]
|
| 156 |
for col in df.columns[:10]:
|
| 157 |
non_null = int(df[col].notnull().sum())
|
|
@@ -167,18 +165,13 @@ def summarize_dataframe(df: pd.DataFrame) -> str:
|
|
| 167 |
|
| 168 |
|
| 169 |
def query_analysis_model(df: pd.DataFrame, user_query: str, dataset_name: str) -> str:
|
| 170 |
-
|
| 171 |
-
df_summary = summarize_dataframe(df)
|
| 172 |
-
sample = df.head(6).to_csv(index=False)
|
| 173 |
prompt = f"""
|
| 174 |
You are a professional data analyst.
|
| 175 |
Analyze the dataset '{dataset_name}' and answer the user's question.
|
| 176 |
|
| 177 |
-
---
|
| 178 |
-
{
|
| 179 |
-
|
| 180 |
-
--- SAMPLE DATA ---
|
| 181 |
-
{sample}
|
| 182 |
|
| 183 |
--- USER QUESTION ---
|
| 184 |
{user_query}
|
|
@@ -189,13 +182,21 @@ Respond with:
|
|
| 189 |
3. Notable relationships or anomalies
|
| 190 |
4. Data-driven recommendations
|
| 191 |
"""
|
| 192 |
-
|
| 193 |
try:
|
| 194 |
-
|
| 195 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
except Exception as e:
|
| 197 |
return f"β οΈ Analysis failed: {e}"
|
| 198 |
|
|
|
|
| 199 |
# ======================================================
|
| 200 |
# π MAIN APP LOGIC
|
| 201 |
# ======================================================
|
|
@@ -264,5 +265,6 @@ if uploaded:
|
|
| 264 |
result = query_analysis_model(cleaned_df, user_query, uploaded.name)
|
| 265 |
st.markdown("### π‘ Insights")
|
| 266 |
st.markdown(result)
|
|
|
|
| 267 |
else:
|
| 268 |
st.info("π₯ Upload a dataset to begin smart analysis.")
|
|
|
|
| 6 |
import plotly.figure_factory as ff
|
| 7 |
from dotenv import load_dotenv
|
| 8 |
from huggingface_hub import InferenceClient, login
|
| 9 |
+
from google import genai
|
| 10 |
from io import StringIO
|
| 11 |
|
| 12 |
# ======================================================
|
|
|
|
| 14 |
# ======================================================
|
| 15 |
st.set_page_config(page_title="π Smart Data Analyst Pro", layout="wide")
|
| 16 |
st.title("π Smart Data Analyst Pro")
|
| 17 |
+
st.caption("AI that cleans, analyzes, and visualizes your data β Hugging Face + Gemini compatible.")
|
| 18 |
|
| 19 |
# ======================================================
|
| 20 |
# π Load Environment Variables
|
| 21 |
# ======================================================
|
| 22 |
load_dotenv()
|
| 23 |
HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_API_KEY")
|
| 24 |
+
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
|
| 25 |
+
|
| 26 |
if not HF_TOKEN:
|
| 27 |
st.error("β Missing HF_TOKEN. Please set it in your .env file.")
|
| 28 |
else:
|
| 29 |
login(token=HF_TOKEN)
|
| 30 |
|
| 31 |
+
if GEMINI_API_KEY:
|
| 32 |
+
gemini_client = genai.Client(api_key=GEMINI_API_KEY)
|
| 33 |
+
else:
|
| 34 |
+
gemini_client = None
|
| 35 |
+
st.warning("β οΈ Gemini API key missing. Gemini 2.5 Flash will not work.")
|
| 36 |
+
|
| 37 |
# ======================================================
|
| 38 |
# 🧠 MODEL SETUP
|
| 39 |
# ======================================================
|
|
|
|
| 43 |
CLEANER_MODEL = st.selectbox(
|
| 44 |
"Select Cleaner Model:",
|
| 45 |
[
|
| 46 |
+
"Qwen/Qwen2.5-Coder-14B",
|
| 47 |
"meta-llama/Meta-Llama-3-8B-Instruct",
|
| 48 |
"microsoft/Phi-3-mini-4k-instruct",
|
| 49 |
"mistralai/Mistral-7B-Instruct-v0.3"
|
|
|
|
| 55 |
"Select Analysis Model:",
|
| 56 |
[
|
| 57 |
"Qwen/Qwen2.5-14B-Instruct",
|
| 58 |
+
"Gemini 2.5 Flash (Google)",
|
| 59 |
"mistralai/Mistral-7B-Instruct-v0.3",
|
| 60 |
"HuggingFaceH4/zephyr-7b-beta"
|
| 61 |
],
|
|
|
|
| 66 |
max_tokens = st.slider("Max Tokens", 128, 2048, 512)
|
| 67 |
|
| 68 |
# Initialize inference clients
|
| 69 |
+
hf_cleaner_client = InferenceClient(model=CLEANER_MODEL, token=HF_TOKEN)
|
| 70 |
+
hf_analyst_client = None
|
| 71 |
+
if ANALYST_MODEL != "Gemini 2.5 Flash (Google)":
|
| 72 |
+
hf_analyst_client = InferenceClient(model=ANALYST_MODEL, token=HF_TOKEN)
|
| 73 |
|
| 74 |
# ======================================================
|
| 75 |
# 🧩 SAFE GENERATION FUNCTION
|
| 76 |
# ======================================================
|
| 77 |
def safe_hf_generate(client, prompt, temperature=0.3, max_tokens=512):
|
| 78 |
+
"""HF text generation fallback to chat_completion"""
|
|
|
|
|
|
|
|
|
|
| 79 |
try:
|
| 80 |
resp = client.text_generation(
|
| 81 |
prompt,
|
|
|
|
| 99 |
# 🧩 SMART DATA CLEANING
|
| 100 |
# ======================================================
|
| 101 |
def fallback_clean(df: pd.DataFrame) -> pd.DataFrame:
|
|
|
|
| 102 |
df = df.copy()
|
| 103 |
df.dropna(axis=1, how="all", inplace=True)
|
| 104 |
df.columns = [c.strip().replace(" ", "_").lower() for c in df.columns]
|
|
|
|
| 115 |
|
| 116 |
|
| 117 |
def ai_clean_dataset(df: pd.DataFrame) -> pd.DataFrame:
|
| 118 |
+
"""Clean dataset using AI. Full dataset sent for thorough cleaning."""
|
| 119 |
+
csv_text = df.to_csv(index=False)
|
| 120 |
prompt = f"""
|
| 121 |
You are a professional data cleaning assistant.
|
| 122 |
Clean and standardize the dataset below dynamically:
|
|
|
|
| 126 |
4. Remove irrelevant or duplicate rows
|
| 127 |
Return ONLY a valid CSV text (no markdown, no explanations).
|
| 128 |
|
| 129 |
+
Dataset:
|
| 130 |
+
{csv_text}
|
| 131 |
"""
|
| 132 |
|
| 133 |
try:
|
| 134 |
+
cleaned_str = safe_hf_generate(hf_cleaner_client, prompt, temperature=0.1, max_tokens=4096)
|
| 135 |
except Exception as e:
|
| 136 |
st.warning(f"β οΈ AI cleaning failed: {e}")
|
| 137 |
return fallback_clean(df)
|
| 138 |
|
| 139 |
+
cleaned_str = cleaned_str.replace("```csv", "").replace("```", "").replace("###", "").strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
try:
|
| 141 |
cleaned_df = pd.read_csv(StringIO(cleaned_str), on_bad_lines="skip")
|
|
|
|
| 142 |
cleaned_df.columns = [c.strip().replace(" ", "_").lower() for c in cleaned_df.columns]
|
| 143 |
return cleaned_df
|
| 144 |
except Exception as e:
|
|
|
|
| 146 |
return fallback_clean(df)
|
| 147 |
|
| 148 |
|
| 149 |
+
# ======================================================
|
| 150 |
+
# 🧩 DATA ANALYSIS
|
| 151 |
+
# ======================================================
|
| 152 |
def summarize_dataframe(df: pd.DataFrame) -> str:
|
|
|
|
| 153 |
lines = [f"Rows: {len(df)} | Columns: {len(df.columns)}", "Column summaries:"]
|
| 154 |
for col in df.columns[:10]:
|
| 155 |
non_null = int(df[col].notnull().sum())
|
|
|
|
| 165 |
|
| 166 |
|
| 167 |
def query_analysis_model(df: pd.DataFrame, user_query: str, dataset_name: str) -> str:
|
| 168 |
+
csv_text = df.to_csv(index=False)
|
|
|
|
|
|
|
| 169 |
prompt = f"""
|
| 170 |
You are a professional data analyst.
|
| 171 |
Analyze the dataset '{dataset_name}' and answer the user's question.
|
| 172 |
|
| 173 |
+
--- FULL DATA SAMPLE ---
|
| 174 |
+
{csv_text}
|
|
|
|
|
|
|
|
|
|
| 175 |
|
| 176 |
--- USER QUESTION ---
|
| 177 |
{user_query}
|
|
|
|
| 182 |
3. Notable relationships or anomalies
|
| 183 |
4. Data-driven recommendations
|
| 184 |
"""
|
|
|
|
| 185 |
try:
|
| 186 |
+
if ANALYST_MODEL == "Gemini 2.5 Flash (Google)":
|
| 187 |
+
if gemini_client is None:
|
| 188 |
+
return "β οΈ Gemini API key missing."
|
| 189 |
+
response = gemini_client.models.generate_content(
|
| 190 |
+
model="gemini-2.5-flash",
|
| 191 |
+
contents=[prompt]
|
| 192 |
+
)
|
| 193 |
+
return getattr(response, "text", "No response from Gemini.")
|
| 194 |
+
else:
|
| 195 |
+
return safe_hf_generate(hf_analyst_client, prompt, temperature=temperature, max_tokens=max_tokens)
|
| 196 |
except Exception as e:
|
| 197 |
return f"β οΈ Analysis failed: {e}"
|
| 198 |
|
| 199 |
+
|
| 200 |
# ======================================================
|
| 201 |
# π MAIN APP LOGIC
|
| 202 |
# ======================================================
|
|
|
|
| 265 |
result = query_analysis_model(cleaned_df, user_query, uploaded.name)
|
| 266 |
st.markdown("### π‘ Insights")
|
| 267 |
st.markdown(result)
|
| 268 |
+
|
| 269 |
else:
|
| 270 |
st.info("π₯ Upload a dataset to begin smart analysis.")
|