import marimo

__generated_with = "0.11.20"
app = marimo.App(width="medium")


@app.cell
def _(mo):
    mo.md(r"""# Customer Churn Analysis""")
    return


@app.cell
def _():
    import marimo as mo
    import polars as pl
    import altair as alt
    return alt, mo, pl


@app.cell
def _(pl):
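    # Load the churn dataset directly from the Hugging Face Hub and show
    # summary statistics for a first look at the columns.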
    df = pl.read_csv(
        "hf://datasets/louiecerv/customer_churn/customer_churn_data.csv"
    )
    df.describe()
    return (df,)


@app.cell
def _(df):
    df.head()
    return


@app.cell
def _(df, pl):
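    # This cell: engineer the combined contract/internet-service categories,
    # split the data, build a preprocessing + estimator pipeline per model,
    # and fit each one on the same train/test split.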
    from sklearn.preprocessing import (
        RobustScaler,
        OneHotEncoder,
        MinMaxScaler,
        OrdinalEncoder,
    )
    from sklearn.pipeline import make_pipeline
    from sklearn.compose import make_column_transformer
    from sklearn.linear_model import (
        LogisticRegression,
        BayesianRidge,
        RidgeClassifier,
        SGDClassifier,
    )
    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.ensemble import (
        VotingClassifier,
        BaggingClassifier,
        GradientBoostingClassifier,
        RandomForestClassifier,
    )
    from sklearn.feature_selection import RFE, RFECV, SequentialFeatureSelector
    from sklearn.model_selection import train_test_split

    num_features = ["tenure", "monthly_charges", "total_charges"]
    cat_features = ["contract_One Two year", "internet_service_Fiber No"]
    random_state = 33
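    # Collapse each pair of one-hot indicator columns into a single categorical
    # feature by joining their values with "_" (e.g. a one-year contract becomes
    # "true_false"); this assumes the indicator columns are stored as strings.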
    df2 = df.with_columns(
        (pl.col("contract_One year") + "_" + pl.col("contract_Two year")).alias(
            "contract_One Two year"
        ),
        (
            pl.col("internet_service_Fiber optic")
            + "_"
            + pl.col("internet_service_No")
        ).alias("internet_service_Fiber No"),
    )

    X, y = df2.select(num_features + cat_features), df2.select(["churn"])

    # Hold out roughly a third of the customers for evaluation; fixing
    # random_state keeps the split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.32, random_state=random_state
    )
    # Route each feature group to its own transformer: one-hot encode the
    # engineered categorical columns and min-max scale the numeric ones.
    preprocessor = make_column_transformer(
        (OneHotEncoder(), cat_features),
        (MinMaxScaler(), num_features),
    )

    knc = KNeighborsClassifier(algorithm="ball_tree")
    dtree = DecisionTreeClassifier(criterion="entropy", random_state=random_state)
    rfc = RandomForestClassifier(
        criterion="entropy", max_features=0.3, random_state=random_state
    )
    gbc = GradientBoostingClassifier(random_state=random_state)
    bag = BaggingClassifier(
        KNeighborsClassifier(),
        max_samples=0.8,
        max_features=0.8,
        random_state=random_state,
    )
    log_pipe = make_pipeline(
        preprocessor, LogisticRegression(max_iter=10000, random_state=random_state)
    )
    bridge_pipe = make_pipeline(preprocessor, BayesianRidge(max_iter=10000))
    ridge_pipe = make_pipeline(
        preprocessor, RidgeClassifier(max_iter=10000, random_state=random_state)
    )
    sgd_pipe = make_pipeline(
        preprocessor,
        SGDClassifier(
            loss="hinge", penalty="l2", max_iter=10000, random_state=random_state
        ),
    )
    lda_pipe = make_pipeline(preprocessor, QuadraticDiscriminantAnalysis())
    bnb_pipe = make_pipeline(preprocessor, BernoulliNB())
    svc_pipe = make_pipeline(
        preprocessor, SVC(kernel="rbf", max_iter=10000, random_state=random_state)
    )
    dtree_pipe = make_pipeline(preprocessor, dtree)
    rfc_pipe = make_pipeline(preprocessor, rfc)
    knc_pipe = make_pipeline(preprocessor, knc)
    gbc_pipe = make_pipeline(preprocessor, gbc)
    # Soft-voting ensemble of the two strongest individual models (see the
    # metrics cell below); the weights favor QDA over the decision tree.
    vot_pipe = make_pipeline(
        preprocessor,
        VotingClassifier(
            estimators=[
                ("qda", QuadraticDiscriminantAnalysis()),
                ("dtree", dtree),
            ],
            voting="soft",
            weights=[5, 2],
        ),
    )
    bag_pipe = make_pipeline(preprocessor, bag)
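    # Because the column transformer sits inside every pipeline, the encoders
    # and scalers are fit only on the training split during fit(), so no
    # information from the test set leaks into preprocessing.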
    log_pred = log_pipe.fit(X_train, y_train).predict(X_test)
    bridge_pred = bridge_pipe.fit(X_train, y_train).predict(X_test)
    ridge_pred = ridge_pipe.fit(X_train, y_train).predict(X_test)
    sgd_pred = sgd_pipe.fit(X_train, y_train).predict(X_test)
    lda_pred = lda_pipe.fit(X_train, y_train).predict(X_test)
    bnb_pred = bnb_pipe.fit(X_train, y_train).predict(X_test)
    svc_pred = svc_pipe.fit(X_train, y_train).predict(X_test)
    dtree_pred = dtree_pipe.fit(X_train, y_train).predict(X_test)
    rfc_pred = rfc_pipe.fit(X_train, y_train).predict(X_test)
    knc_pred = knc_pipe.fit(X_train, y_train).predict(X_test)
    gbc_pred = gbc_pipe.fit(X_train, y_train).predict(X_test)
    vot_pred = vot_pipe.fit(X_train, y_train).predict(X_test)
    bag_pred = bag_pipe.fit(X_train, y_train).predict(X_test)
    return (
        BaggingClassifier,
        BayesianRidge,
        BernoulliNB,
        DecisionTreeClassifier,
        GradientBoostingClassifier,
        KNeighborsClassifier,
        LogisticRegression,
        MinMaxScaler,
        OneHotEncoder,
        OrdinalEncoder,
        QuadraticDiscriminantAnalysis,
        RFE,
        RFECV,
        RandomForestClassifier,
        RidgeClassifier,
        RobustScaler,
        SGDClassifier,
        SVC,
        SequentialFeatureSelector,
        VotingClassifier,
        X,
        X_test,
        X_train,
        bag,
        bag_pipe,
        bag_pred,
        bnb_pipe,
        bnb_pred,
        bridge_pipe,
        bridge_pred,
        cat_features,
        df2,
        dtree,
        dtree_pipe,
        dtree_pred,
        gbc,
        gbc_pipe,
        gbc_pred,
        knc,
        knc_pipe,
        knc_pred,
        lda_pipe,
        lda_pred,
        log_pipe,
        log_pred,
        make_column_transformer,
        make_pipeline,
        num_features,
        preprocessor,
        random_state,
        rfc,
        rfc_pipe,
        rfc_pred,
        ridge_pipe,
        ridge_pred,
        sgd_pipe,
        sgd_pred,
        svc_pipe,
        svc_pred,
        train_test_split,
        vot_pipe,
        vot_pred,
        y,
        y_test,
        y_train,
    )
@app.cell
def _(
    bag_pred,
    bnb_pred,
    bridge_pred,
    dtree_pred,
    gbc_pred,
    knc_pred,
    lda_pred,
    log_pred,
    mo,
    rfc_pred,
    ridge_pred,
    sgd_pred,
    svc_pred,
    vot_pred,
    y_test,
):
    from sklearn.metrics import (
        accuracy_score,
        precision_score,
        f1_score,
        recall_score,
        roc_auc_score,
        log_loss,
        mean_squared_error,
        root_mean_squared_error,
        mean_absolute_error,
        r2_score,
        explained_variance_score,
    )
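    # Note: all scores below are computed from hard class predictions. Estimators
    # without predict_proba (RidgeClassifier, hinge-loss SGDClassifier, SVC without
    # probability=True) only provide labels, so the ROC-AUC and log-loss figures
    # here are coarser than probability-based scores would be.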
    mo.md(f"""
# Model Metrics

## Logistic Regression

- Accuracy: {accuracy_score(y_test, log_pred)}
- Precision: {precision_score(y_test, log_pred)}
- Recall: {recall_score(y_test, log_pred)}
- F1: {f1_score(y_test, log_pred)}
- ROC-AUC: {roc_auc_score(y_test, log_pred)}
- Log Loss: {log_loss(y_test, log_pred)}

## Ridge Classifier

- Accuracy: {accuracy_score(y_test, ridge_pred)}
- Precision: {precision_score(y_test, ridge_pred)}
- Recall: {recall_score(y_test, ridge_pred)}
- F1: {f1_score(y_test, ridge_pred)}
- ROC-AUC: {roc_auc_score(y_test, ridge_pred)}
- Log Loss: {log_loss(y_test, ridge_pred)}

## SGD Classifier

- Accuracy: {accuracy_score(y_test, sgd_pred)}
- Precision: {precision_score(y_test, sgd_pred)}
- Recall: {recall_score(y_test, sgd_pred)}
- F1: {f1_score(y_test, sgd_pred)}
- ROC-AUC: {roc_auc_score(y_test, sgd_pred)}
- Log Loss: {log_loss(y_test, sgd_pred)}

## Bayesian Ridge Regression

- Mean Squared Error: {mean_squared_error(y_test, bridge_pred)}
- Root Mean Squared Error: {root_mean_squared_error(y_test, bridge_pred)}
- Mean Absolute Error: {mean_absolute_error(y_test, bridge_pred)}
- R^2: {r2_score(y_test, bridge_pred)}
- Explained Variance: {explained_variance_score(y_test, bridge_pred)}

## Quadratic Discriminant Analysis

- Accuracy: {accuracy_score(y_test, lda_pred)}
- Precision: {precision_score(y_test, lda_pred)}
- Recall: {recall_score(y_test, lda_pred)}
- F1: {f1_score(y_test, lda_pred)}
- ROC-AUC: {roc_auc_score(y_test, lda_pred)}
- Log Loss: {log_loss(y_test, lda_pred)}

## Bernoulli Naive Bayes

- Accuracy: {accuracy_score(y_test, bnb_pred)}
- Precision: {precision_score(y_test, bnb_pred)}
- Recall: {recall_score(y_test, bnb_pred)}
- F1: {f1_score(y_test, bnb_pred)}
- ROC-AUC: {roc_auc_score(y_test, bnb_pred)}
- Log Loss: {log_loss(y_test, bnb_pred)}

## C-Support Vector Classifier

- Accuracy: {accuracy_score(y_test, svc_pred)}
- Precision: {precision_score(y_test, svc_pred)}
- Recall: {recall_score(y_test, svc_pred)}
- F1: {f1_score(y_test, svc_pred)}
- ROC-AUC: {roc_auc_score(y_test, svc_pred)}
- Log Loss: {log_loss(y_test, svc_pred)}

## Decision Tree Classifier

- Accuracy: {accuracy_score(y_test, dtree_pred)}
- Precision: {precision_score(y_test, dtree_pred)}
- Recall: {recall_score(y_test, dtree_pred)}
- F1: {f1_score(y_test, dtree_pred)}
- ROC-AUC: {roc_auc_score(y_test, dtree_pred)}
- Log Loss: {log_loss(y_test, dtree_pred)}

## Random Forest Classifier

- Accuracy: {accuracy_score(y_test, rfc_pred)}
- Precision: {precision_score(y_test, rfc_pred)}
- Recall: {recall_score(y_test, rfc_pred)}
- F1: {f1_score(y_test, rfc_pred)}
- ROC-AUC: {roc_auc_score(y_test, rfc_pred)}
- Log Loss: {log_loss(y_test, rfc_pred)}

## K Neighbors Classifier

- Accuracy: {accuracy_score(y_test, knc_pred)}
- Precision: {precision_score(y_test, knc_pred)}
- Recall: {recall_score(y_test, knc_pred)}
- F1: {f1_score(y_test, knc_pred)}
- ROC-AUC: {roc_auc_score(y_test, knc_pred)}
- Log Loss: {log_loss(y_test, knc_pred)}

## Gradient Boosting Classifier

- Accuracy: {accuracy_score(y_test, gbc_pred)}
- Precision: {precision_score(y_test, gbc_pred)}
- Recall: {recall_score(y_test, gbc_pred)}
- F1: {f1_score(y_test, gbc_pred)}
- ROC-AUC: {roc_auc_score(y_test, gbc_pred)}
- Log Loss: {log_loss(y_test, gbc_pred)}

## Voting Classifier

- Accuracy: {accuracy_score(y_test, vot_pred)}
- Precision: {precision_score(y_test, vot_pred)}
- Recall: {recall_score(y_test, vot_pred)}
- F1: {f1_score(y_test, vot_pred)}
- ROC-AUC: {roc_auc_score(y_test, vot_pred)}
- Log Loss: {log_loss(y_test, vot_pred)}

## Bagging Classifier

- Accuracy: {accuracy_score(y_test, bag_pred)}
- Precision: {precision_score(y_test, bag_pred)}
- Recall: {recall_score(y_test, bag_pred)}
- F1: {f1_score(y_test, bag_pred)}
- ROC-AUC: {roc_auc_score(y_test, bag_pred)}
- Log Loss: {log_loss(y_test, bag_pred)}

{
    mo.callout(
        "From the metrics, the Quadratic Discriminant Analysis and the Decision Tree Classifier performed best, so they were chosen for the Voting Classifier.",
        kind="info",
    )
}
""")
    return (
        accuracy_score,
        explained_variance_score,
        f1_score,
        log_loss,
        mean_absolute_error,
        mean_squared_error,
        precision_score,
        r2_score,
        recall_score,
        roc_auc_score,
        root_mean_squared_error,
    )
@app.cell
def _(mo):
    user_inputs = mo.ui.dictionary(
        {
            "tenure": mo.ui.number(label="Tenure", start=1, stop=72, step=1),
            "monthly_charges": mo.ui.number(
                label="Monthly Charges", start=20, stop=120, step=1
            ),
            "total_charges": mo.ui.number(
                label="Total Charges", start=20, stop=8000, step=1
            ),
            "contract": mo.ui.dropdown(
                label="Contract (Year)", options=["None", "One", "Two"]
            ),
            "service": mo.ui.dropdown(
                label="Service", options=["None", "Basic", "Fiber Optic"]
            ),
        }
    )

    mo.vstack(user_inputs.values())
    return (user_inputs,)
@app.cell
def _(mo, pl, user_inputs, vot_pipe):
    contract = None
    service = None

    # Map the dropdown selections back onto the engineered category strings used
    # during training (e.g. "true_false" means contract_One year is true and
    # contract_Two year is false).
    match user_inputs["contract"].value:
        case "None":
            contract = "false_false"
        case "One":
            contract = "true_false"
        case "Two":
            contract = "false_true"
        case _:
            pass

    match user_inputs["service"].value:
        case "None":
            service = "false_false"
        case "Basic":
            service = "true_false"
        case "Fiber Optic":
            service = "false_true"
        case _:
            pass

    preds = pl.DataFrame({
        "tenure": user_inputs["tenure"].value,
        "monthly_charges": user_inputs["monthly_charges"].value,
        "total_charges": user_inputs["total_charges"].value,
        "contract_One Two year": contract,
        "internet_service_Fiber No": service,
    })
    prediction = (vot_pipe.predict(preds), vot_pipe.predict_proba(preds))

    mo.md(
        f"Prediction: {'Yes' if prediction[0][0] else 'No'}, with about "
        f"{prediction[1][0][0] * 100 if not prediction[0][0] else prediction[1][0][1] * 100:.2f}% probability."
    )
    return contract, prediction, preds, service
if __name__ == "__main__":
    app.run()