import marimo

__generated_with = "0.11.20"
app = marimo.App(width="medium")


@app.cell
def _(mo):
    mo.md(r"""# Customer Churn Analysis""")
    return


@app.cell
def _():
    import marimo as mo
    import polars as pl
    import altair as alt
    return alt, mo, pl


@app.cell
def _(pl):
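    # Load the churn dataset directly from the Hugging Face Hub and show
    # summary statistics for a first look at the columns.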
    df = pl.read_csv(
        "hf://datasets/louiecerv/customer_churn/customer_churn_data.csv"
    )
    df.describe()
    return (df,)


@app.cell
def _(df):
    df.head()
    return


@app.cell
def _(df, pl):
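    # This cell: engineer the combined contract/internet-service categories,
    # split the data, build a preprocessing + estimator pipeline per model,
    # and fit each one on the same train/test split.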
    from sklearn.preprocessing import (
        RobustScaler,
        OneHotEncoder,
        MinMaxScaler,
        OrdinalEncoder,
    )
    from sklearn.pipeline import make_pipeline
    from sklearn.compose import make_column_transformer
    from sklearn.linear_model import (
        LogisticRegression,
        BayesianRidge,
        RidgeClassifier,
        SGDClassifier,
    )
    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.ensemble import (
        VotingClassifier,
        BaggingClassifier,
        GradientBoostingClassifier,
        RandomForestClassifier,
    )
    from sklearn.feature_selection import RFE, RFECV, SequentialFeatureSelector
    from sklearn.model_selection import train_test_split

    num_features = ["tenure", "monthly_charges", "total_charges"]
    cat_features = ["contract_One Two year", "internet_service_Fiber No"]
    random_state = 33
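    # Collapse each pair of one-hot indicator columns into a single categorical
    # feature by joining their values with "_" (e.g. a one-year contract becomes
    # "true_false"); this assumes the indicator columns are stored as strings.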
    df2 = df.with_columns(
        (pl.col("contract_One year") + "_" + pl.col("contract_Two year")).alias(
            "contract_One Two year"
        ),
        (
            pl.col("internet_service_Fiber optic")
            + "_"
            + pl.col("internet_service_No")
        ).alias("internet_service_Fiber No"),
    )

    X, y = df2.select(num_features + cat_features), df2.select(["churn"])

    # Hold out roughly a third of the customers for evaluation; fixing
    # random_state keeps the split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.32, random_state=random_state
    )
    # Route each feature group to its own transformer: one-hot encode the
    # engineered categorical columns and min-max scale the numeric ones.
    preprocessor = make_column_transformer(
        (OneHotEncoder(), cat_features),
        (MinMaxScaler(), num_features),
    )

    knc = KNeighborsClassifier(algorithm="ball_tree")
    dtree = DecisionTreeClassifier(criterion="entropy", random_state=random_state)
    rfc = RandomForestClassifier(
        criterion="entropy", max_features=0.3, random_state=random_state
    )
    gbc = GradientBoostingClassifier(random_state=random_state)
    bag = BaggingClassifier(
        KNeighborsClassifier(),
        max_samples=0.8,
        max_features=0.8,
        random_state=random_state,
    )
    log_pipe = make_pipeline(
        preprocessor, LogisticRegression(max_iter=10000, random_state=random_state)
    )
    bridge_pipe = make_pipeline(preprocessor, BayesianRidge(max_iter=10000))
    ridge_pipe = make_pipeline(
        preprocessor, RidgeClassifier(max_iter=10000, random_state=random_state)
    )
    sgd_pipe = make_pipeline(
        preprocessor,
        SGDClassifier(
            loss="hinge", penalty="l2", max_iter=10000, random_state=random_state
        ),
    )
    lda_pipe = make_pipeline(preprocessor, QuadraticDiscriminantAnalysis())
    bnb_pipe = make_pipeline(preprocessor, BernoulliNB())
    svc_pipe = make_pipeline(
        preprocessor, SVC(kernel="rbf", max_iter=10000, random_state=random_state)
    )
    dtree_pipe = make_pipeline(preprocessor, dtree)
    rfc_pipe = make_pipeline(preprocessor, rfc)
    knc_pipe = make_pipeline(preprocessor, knc)
    gbc_pipe = make_pipeline(preprocessor, gbc)
    # Soft-voting ensemble of the two strongest individual models (see the
    # metrics cell below); the weights favor QDA over the decision tree.
    vot_pipe = make_pipeline(
        preprocessor,
        VotingClassifier(
            estimators=[
                ("qda", QuadraticDiscriminantAnalysis()),
                ("dtree", dtree),
            ],
            voting="soft",
            weights=[5, 2],
        ),
    )
    bag_pipe = make_pipeline(preprocessor, bag)
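    # Because the column transformer sits inside every pipeline, the encoders
    # and scalers are fit only on the training split during fit(), so no
    # information from the test set leaks into preprocessing.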
    log_pred = log_pipe.fit(X_train, y_train).predict(X_test)
    bridge_pred = bridge_pipe.fit(X_train, y_train).predict(X_test)
    ridge_pred = ridge_pipe.fit(X_train, y_train).predict(X_test)
    sgd_pred = sgd_pipe.fit(X_train, y_train).predict(X_test)
    lda_pred = lda_pipe.fit(X_train, y_train).predict(X_test)
    bnb_pred = bnb_pipe.fit(X_train, y_train).predict(X_test)
    svc_pred = svc_pipe.fit(X_train, y_train).predict(X_test)
    dtree_pred = dtree_pipe.fit(X_train, y_train).predict(X_test)
    rfc_pred = rfc_pipe.fit(X_train, y_train).predict(X_test)
    knc_pred = knc_pipe.fit(X_train, y_train).predict(X_test)
    gbc_pred = gbc_pipe.fit(X_train, y_train).predict(X_test)
    vot_pred = vot_pipe.fit(X_train, y_train).predict(X_test)
    bag_pred = bag_pipe.fit(X_train, y_train).predict(X_test)
    return (
        BaggingClassifier,
        BayesianRidge,
        BernoulliNB,
        DecisionTreeClassifier,
        GradientBoostingClassifier,
        KNeighborsClassifier,
        LogisticRegression,
        MinMaxScaler,
        OneHotEncoder,
        OrdinalEncoder,
        QuadraticDiscriminantAnalysis,
        RFE,
        RFECV,
        RandomForestClassifier,
        RidgeClassifier,
        RobustScaler,
        SGDClassifier,
        SVC,
        SequentialFeatureSelector,
        VotingClassifier,
        X,
        X_test,
        X_train,
        bag,
        bag_pipe,
        bag_pred,
        bnb_pipe,
        bnb_pred,
        bridge_pipe,
        bridge_pred,
        cat_features,
        df2,
        dtree,
        dtree_pipe,
        dtree_pred,
        gbc,
        gbc_pipe,
        gbc_pred,
        knc,
        knc_pipe,
        knc_pred,
        lda_pipe,
        lda_pred,
        log_pipe,
        log_pred,
        make_column_transformer,
        make_pipeline,
        num_features,
        preprocessor,
        random_state,
        rfc,
        rfc_pipe,
        rfc_pred,
        ridge_pipe,
        ridge_pred,
        sgd_pipe,
        sgd_pred,
        svc_pipe,
        svc_pred,
        train_test_split,
        vot_pipe,
        vot_pred,
        y,
        y_test,
        y_train,
    )
@app.cell
def _(
    bag_pred,
    bnb_pred,
    bridge_pred,
    dtree_pred,
    gbc_pred,
    knc_pred,
    lda_pred,
    log_pred,
    mo,
    rfc_pred,
    ridge_pred,
    sgd_pred,
    svc_pred,
    vot_pred,
    y_test,
):
    from sklearn.metrics import (
        accuracy_score,
        precision_score,
        f1_score,
        recall_score,
        roc_auc_score,
        log_loss,
        mean_squared_error,
        root_mean_squared_error,
        mean_absolute_error,
        r2_score,
        explained_variance_score,
    )
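    # Note: all scores below are computed from hard class predictions. Estimators
    # without predict_proba (RidgeClassifier, hinge-loss SGDClassifier, SVC without
    # probability=True) only provide labels, so the ROC-AUC and log-loss figures
    # here are coarser than probability-based scores would be.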
    mo.md(f"""
# Model Metrics

## Logistic Regression

- Accuracy: {accuracy_score(y_test, log_pred)}
- Precision: {precision_score(y_test, log_pred)}
- Recall: {recall_score(y_test, log_pred)}
- F1: {f1_score(y_test, log_pred)}
- ROC-AUC: {roc_auc_score(y_test, log_pred)}
- Log Loss: {log_loss(y_test, log_pred)}

## Ridge Classifier

- Accuracy: {accuracy_score(y_test, ridge_pred)}
- Precision: {precision_score(y_test, ridge_pred)}
- Recall: {recall_score(y_test, ridge_pred)}
- F1: {f1_score(y_test, ridge_pred)}
- ROC-AUC: {roc_auc_score(y_test, ridge_pred)}
- Log Loss: {log_loss(y_test, ridge_pred)}

## SGD Classifier

- Accuracy: {accuracy_score(y_test, sgd_pred)}
- Precision: {precision_score(y_test, sgd_pred)}
- Recall: {recall_score(y_test, sgd_pred)}
- F1: {f1_score(y_test, sgd_pred)}
- ROC-AUC: {roc_auc_score(y_test, sgd_pred)}
- Log Loss: {log_loss(y_test, sgd_pred)}

## Bayesian Ridge Regression

- Mean Squared Error: {mean_squared_error(y_test, bridge_pred)}
- Root Mean Squared Error: {root_mean_squared_error(y_test, bridge_pred)}
- Mean Absolute Error: {mean_absolute_error(y_test, bridge_pred)}
- R^2: {r2_score(y_test, bridge_pred)}
- Explained Variance: {explained_variance_score(y_test, bridge_pred)}

## Quadratic Discriminant Analysis

- Accuracy: {accuracy_score(y_test, lda_pred)}
- Precision: {precision_score(y_test, lda_pred)}
- Recall: {recall_score(y_test, lda_pred)}
- F1: {f1_score(y_test, lda_pred)}
- ROC-AUC: {roc_auc_score(y_test, lda_pred)}
- Log Loss: {log_loss(y_test, lda_pred)}

## Bernoulli Naive Bayes

- Accuracy: {accuracy_score(y_test, bnb_pred)}
- Precision: {precision_score(y_test, bnb_pred)}
- Recall: {recall_score(y_test, bnb_pred)}
- F1: {f1_score(y_test, bnb_pred)}
- ROC-AUC: {roc_auc_score(y_test, bnb_pred)}
- Log Loss: {log_loss(y_test, bnb_pred)}

## C-Support Vector Classifier

- Accuracy: {accuracy_score(y_test, svc_pred)}
- Precision: {precision_score(y_test, svc_pred)}
- Recall: {recall_score(y_test, svc_pred)}
- F1: {f1_score(y_test, svc_pred)}
- ROC-AUC: {roc_auc_score(y_test, svc_pred)}
- Log Loss: {log_loss(y_test, svc_pred)}

## Decision Tree Classifier

- Accuracy: {accuracy_score(y_test, dtree_pred)}
- Precision: {precision_score(y_test, dtree_pred)}
- Recall: {recall_score(y_test, dtree_pred)}
- F1: {f1_score(y_test, dtree_pred)}
- ROC-AUC: {roc_auc_score(y_test, dtree_pred)}
- Log Loss: {log_loss(y_test, dtree_pred)}

## Random Forest Classifier

- Accuracy: {accuracy_score(y_test, rfc_pred)}
- Precision: {precision_score(y_test, rfc_pred)}
- Recall: {recall_score(y_test, rfc_pred)}
- F1: {f1_score(y_test, rfc_pred)}
- ROC-AUC: {roc_auc_score(y_test, rfc_pred)}
- Log Loss: {log_loss(y_test, rfc_pred)}

## K Neighbors Classifier

- Accuracy: {accuracy_score(y_test, knc_pred)}
- Precision: {precision_score(y_test, knc_pred)}
- Recall: {recall_score(y_test, knc_pred)}
- F1: {f1_score(y_test, knc_pred)}
- ROC-AUC: {roc_auc_score(y_test, knc_pred)}
- Log Loss: {log_loss(y_test, knc_pred)}

## Gradient Boosting Classifier

- Accuracy: {accuracy_score(y_test, gbc_pred)}
- Precision: {precision_score(y_test, gbc_pred)}
- Recall: {recall_score(y_test, gbc_pred)}
- F1: {f1_score(y_test, gbc_pred)}
- ROC-AUC: {roc_auc_score(y_test, gbc_pred)}
- Log Loss: {log_loss(y_test, gbc_pred)}

## Voting Classifier

- Accuracy: {accuracy_score(y_test, vot_pred)}
- Precision: {precision_score(y_test, vot_pred)}
- Recall: {recall_score(y_test, vot_pred)}
- F1: {f1_score(y_test, vot_pred)}
- ROC-AUC: {roc_auc_score(y_test, vot_pred)}
- Log Loss: {log_loss(y_test, vot_pred)}

## Bagging Classifier

- Accuracy: {accuracy_score(y_test, bag_pred)}
- Precision: {precision_score(y_test, bag_pred)}
- Recall: {recall_score(y_test, bag_pred)}
- F1: {f1_score(y_test, bag_pred)}
- ROC-AUC: {roc_auc_score(y_test, bag_pred)}
- Log Loss: {log_loss(y_test, bag_pred)}

{
    mo.callout(
        "From the metrics, the Quadratic Discriminant Analysis and the Decision Tree Classifier performed best, so they were chosen for the Voting Classifier.",
        kind="info",
    )
}
""")
    return (
        accuracy_score,
        explained_variance_score,
        f1_score,
        log_loss,
        mean_absolute_error,
        mean_squared_error,
        precision_score,
        r2_score,
        recall_score,
        roc_auc_score,
        root_mean_squared_error,
    )
@app.cell
def _(mo):
    user_inputs = mo.ui.dictionary(
        {
            "tenure": mo.ui.number(label="Tenure", start=1, stop=72, step=1),
            "monthly_charges": mo.ui.number(
                label="Monthly Charges", start=20, stop=120, step=1
            ),
            "total_charges": mo.ui.number(
                label="Total Charges", start=20, stop=8000, step=1
            ),
            "contract": mo.ui.dropdown(
                label="Contract (Year)", options=["None", "One", "Two"]
            ),
            "service": mo.ui.dropdown(
                label="Service", options=["None", "Basic", "Fiber Optic"]
            ),
        }
    )

    mo.vstack(user_inputs.values())
    return (user_inputs,)
@app.cell
def _(mo, pl, user_inputs, vot_pipe):
    contract = None
    service = None

    # Map the dropdown selections back onto the engineered category strings used
    # during training (e.g. "true_false" means contract_One year is true and
    # contract_Two year is false).
    match user_inputs["contract"].value:
        case "None":
            contract = "false_false"
        case "One":
            contract = "true_false"
        case "Two":
            contract = "false_true"
        case _:
            pass

    match user_inputs["service"].value:
        case "None":
            service = "false_false"
        case "Basic":
            service = "true_false"
        case "Fiber Optic":
            service = "false_true"
        case _:
            pass

    preds = pl.DataFrame({
        "tenure": user_inputs["tenure"].value,
        "monthly_charges": user_inputs["monthly_charges"].value,
        "total_charges": user_inputs["total_charges"].value,
        "contract_One Two year": contract,
        "internet_service_Fiber No": service,
    })
    prediction = (vot_pipe.predict(preds), vot_pipe.predict_proba(preds))

    mo.md(
        f"Prediction: {'Yes' if prediction[0][0] else 'No'}, with about "
        f"{prediction[1][0][0] * 100 if not prediction[0][0] else prediction[1][0][1] * 100:.2f}% probability."
    )
    return contract, prediction, preds, service
if __name__ == "__main__":
    app.run()