File size: 8,111 Bytes
f87d4de
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
259fc98
 
 
 
 
 
 
 
 
 
 
 
f87d4de
259fc98
f87d4de
 
3fcc1a9
f87d4de
 
3fcc1a9
 
 
 
259fc98
 
 
3fcc1a9
b29148a
 
2b2b073
259fc98
f87d4de
3fcc1a9
 
 
f87d4de
3fcc1a9
f87d4de
3fcc1a9
259fc98
3fcc1a9
 
f87d4de
 
3fcc1a9
 
f87d4de
 
 
3fcc1a9
f87d4de
 
 
 
 
 
3fcc1a9
 
f87d4de
3fcc1a9
259fc98
f87d4de
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
from fastapi import FastAPI
from pydantic import BaseModel
import os
from typing import List, Literal, Optional
import joblib
import numpy as np
import pandas as pd
import requests
import shap
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc

# =====================================================
# CONFIG
# =====================================================

# Replace these with your NoCoDB API details
NOCO_API_URL = "https://dun3co-sdc-nocodb.hf.space/api/v2/tables/m39a8axnn3980w9/records"
NOCO_VIEW_ID = "vwjuv5jnaet9npuu"
# NOTE(review): os.getenv returns None when NOCODB_TOKEN is unset, so
# HEADERS would carry a None token and NoCoDB requests would fail with
# 401 — confirm the env var is always present in deployment.
NOCO_API_TOKEN = os.getenv("NOCODB_TOKEN")

HEADERS = {"xc-token": NOCO_API_TOKEN}

# =====================================================
# MODEL LOADING
# =====================================================

# Model is unpickled once at import time; a missing/corrupt
# model_1mvp.pkl makes the whole app fail to start.
model = joblib.load("model_1mvp.pkl")
app = FastAPI(title="Logistic Regression API 2")

# =====================================================
# DATA SCHEMAS
# =====================================================

class InputData(BaseModel):
    """One record as accepted by /predict and /explain.

    Field names must match the feature columns the trained pipeline
    expects; do not rename them without retraining/re-exporting the model.
    """
    age: int
    balance: float
    day: int
    campaign: int
    job: str
    education: str
    default: Literal["yes", "no", "unknown"]
    housing: Literal["yes", "no", "unknown"]
    loan: Literal["yes", "no", "unknown"]
    # NOTE(review): these two are str rather than int — presumably the
    # pipeline treats them as categorical strings; confirm against the
    # training code before tightening the types.
    months_since_previous_contact: str
    n_previous_contacts: str
    poutcome: str
    had_contact: bool
    is_single: bool
    # NOTE(review): "uknown" looks like a typo for "unknown", but the name
    # is part of the API schema and the model's feature set — renaming it
    # would break existing clients and the pickled pipeline.
    uknown_contact: bool

class BatchInputData(BaseModel):
    """Request body wrapper: {"data": [InputData, ...]} for batch scoring."""
    data: List[InputData]

# =====================================================
# HEALTH CHECK
# =====================================================

@app.get("/health")
def health():
    """Liveness probe — always reports the service as up."""
    payload = {"status": "ok"}
    return payload

# =====================================================
# NOCODB DATA FETCHING
# =====================================================

def fetch_test_data(limit: int = 100, timeout: float = 30.0) -> pd.DataFrame:
    """Fetch up to *limit* rows of test/sample data from the NoCoDB view.

    Parameters
    ----------
    limit : int
        Maximum number of records to request from the view.
    timeout : float
        Seconds to wait for the HTTP response. Without a timeout,
        ``requests.get`` can block a worker indefinitely on a hung
        connection, so one is always passed.

    Returns
    -------
    pd.DataFrame
        One row per NoCoDB record; columns follow the table schema.

    Raises
    ------
    requests.HTTPError
        If NoCoDB responds with a non-2xx status.
    requests.Timeout
        If no response arrives within *timeout* seconds.
    """
    params = {"offset": 0, "limit": limit, "viewId": NOCO_VIEW_ID}
    res = requests.get(NOCO_API_URL, headers=HEADERS, params=params, timeout=timeout)
    res.raise_for_status()
    # NoCoDB's v2 records API wraps the rows in a "list" key.
    rows = res.json()["list"]
    return pd.DataFrame(rows)

# =====================================================
# PREDICTION ENDPOINT
# =====================================================

@app.post("/predict")
def predict(batch: BatchInputData):
    """Score a batch of records.

    Returns the model's class labels and the probability of the positive
    class for each record, or an {"error", "trace"} payload on failure.
    """
    try:
        frame = pd.DataFrame([record.dict() for record in batch.data])
        labels = model.predict(frame).tolist()
        positive_probs = model.predict_proba(frame)[:, 1].tolist()
        return {
            "predictions": labels,
            "probabilities": positive_probs
        }
    except Exception as e:
        import traceback
        # NOTE(review): failures are reported with a 200 status and the full
        # traceback in the body — confirm that exposure is intended before
        # serving untrusted clients.
        return {"error": str(e), "trace": traceback.format_exc()}

# =====================================================
# EXPLAINABILITY ENDPOINT
# =====================================================

@app.post("/explain")
def explain(batch: Optional[BatchInputData] = None, limit: int = 100):
    """Generate SHAP values either from provided data or from NoCoDB test data.

    Returns, per feature, the mean absolute SHAP value across the sample
    (a global importance ranking), sorted descending. On failure the
    exception text and traceback are returned in the body (status 200).
    """
    try:
        # Choose the data source: an explicit client batch wins over NoCoDB.
        if batch:
            X = pd.DataFrame([item.dict() for item in batch.data])
            source = "client batch"
        else:
            X = fetch_test_data(limit=limit)
            source = f"NoCoDB (limit={limit})"

        print(f"[DEBUG] SHAP explain called using {source} | shape={X.shape} | cols={list(X.columns)}")

        # Remove ID and target columns if they exist
        # (NoCoDB rows include them; client batches normally do not).
        drop_cols = [c for c in ["Id", "y", "target"] if c in X.columns]
        if drop_cols:
            print(f"[DEBUG] Dropping columns not used for prediction: {drop_cols}")
            X = X.drop(columns=drop_cols)

        # Handle pipelines correctly: explain the classifier on the
        # transformed feature space, not the raw pipeline input.
        if hasattr(model, "named_steps"):
            preprocessor = model.named_steps["preprocessor"]
            classifier = model.named_steps["classifier"]

            X_transformed = preprocessor.transform(X)
            feature_names = preprocessor.get_feature_names_out()

            print(f"[DEBUG] Transformed shape: {X_transformed.shape} | n_features={len(feature_names)}")

            # The same transformed sample doubles as SHAP's background data.
            explainer = shap.Explainer(classifier, X_transformed)
            shap_values = explainer(X_transformed)

            # Mean |SHAP| per feature over all rows = global importance.
            shap_summary = pd.DataFrame({
                "feature": feature_names,
                "mean_abs_shap": np.abs(shap_values.values).mean(axis=0)
            }).sort_values("mean_abs_shap", ascending=False)
        else:
            # If model is not a pipeline, explain it directly on raw columns.
            explainer = shap.Explainer(model, X)
            shap_values = explainer(X)
            shap_summary = pd.DataFrame({
                "feature": X.columns,
                "mean_abs_shap": np.abs(shap_values.values).mean(axis=0)
            }).sort_values("mean_abs_shap", ascending=False)

        print(f"[DEBUG] SHAP summary created successfully with {len(shap_summary)} features.")
        return {"n_samples": len(X), "shap_summary": shap_summary.to_dict(orient="records")}

    except Exception as e:
        import traceback
        # NOTE(review): errors are returned to the caller (with traceback)
        # instead of raising — confirm that exposure is intended.
        print("[ERROR] SHAP explain failed:", e)
        print(traceback.format_exc())
        return {"error": str(e), "trace": traceback.format_exc()}


# =====================================================
# METRICS ENDPOINT
# =====================================================

@app.post("/metrics")
def metrics(batch: Optional[BatchInputData] = None, limit: int = 100):
    """
    Compute ROC AUC and threshold analysis using input or NoCoDB test data.
    Assumes the target column 'y' is boolean (True/False).

    Returns ``roc_auc``, ``pr_auc`` and the first 20 points of the
    precision/recall/threshold curves, or an {"error", "trace"} payload
    on failure (status 200 either way).
    """
    try:
        # Fetch data from batch or NoCoDB
        if batch:
            X = pd.DataFrame([item.dict() for item in batch.data])
            source = "client batch"
        else:
            X = fetch_test_data(limit=limit)
            source = f"NoCoDB (limit={limit})"

        print(f"[DEBUG] Metrics called using {source} | shape={X.shape}")

        # Ensure target 'y' exists
        if "y" not in X.columns:
            return {"error": "No target column 'y' found in dataset."}

        # Robust conversion to int: NoCoDB may serialize booleans as
        # strings, so accept True/"True"/"true"/1 as the positive class.
        truthy = (True, "True", "true", 1)
        y_true = X["y"].map(lambda v: 1 if v in truthy else 0).tolist()
        print(f"[DEBUG] Found {sum(y_true)} positive cases out of {len(y_true)}")

        # Drop the target (and ID, if present) before predicting.
        X = X.drop(columns=["y"])
        if "Id" in X.columns:
            X = X.drop(columns=["Id"])

        # Probability of the positive class for every row.
        y_prob = model.predict_proba(X)[:, 1]

        # Compute metrics
        roc_auc = roc_auc_score(y_true, y_prob)
        precision, recall, thresholds = precision_recall_curve(y_true, y_prob)
        pr_auc = auc(recall, precision)

        print(f"[DEBUG] ROC AUC={roc_auc:.3f} | PR AUC={pr_auc:.3f}")

        return {
            "roc_auc": roc_auc,
            "pr_auc": pr_auc,
            # Curves can be long; only the first 20 points are returned.
            "thresholds": thresholds.tolist()[:20],
            "precision": precision.tolist()[:20],
            "recall": recall.tolist()[:20]
        }

    except Exception as e:
        import traceback
        # NOTE(review): errors are returned to the caller (with traceback)
        # instead of raising — confirm that exposure is intended.
        print("[ERROR] Metrics failed:", e)
        print(traceback.format_exc())
        return {"error": str(e), "trace": traceback.format_exc()}


    
@app.get("/coefficients")
def coefficients():
    """
    Return logistic regression coefficients and feature names.
    Works if your model is a pipeline with 'preprocessor' and 'classifier' steps.
    """
    try:
        # Pull the two pipeline steps we need.
        clf = model.named_steps["classifier"]
        pre = model.named_steps["preprocessor"]

        # Pair each post-preprocessing feature name with its weight
        # (first — and only — row of coef_ for binary logistic regression).
        names = pre.get_feature_names_out()
        weights = clf.coef_[0].tolist()

        records = [
            {"feature": name, "coefficient": weight}
            for name, weight in zip(names, weights)
        ]
        return {"coefficients": records}

    except Exception as e:
        import traceback
        return {"error": str(e), "trace": traceback.format_exc()}