Spotify User Churn Analysis - Building an XGBoost classifier

This is my first post here on machine learning. I’ve chosen this Spotfify dataset from kaggle to predict whether a user will churn or remain active using XGBoost gradient boosted trees. Decision trees become huge and are difficult to visualise, but I include SHAP plots to see how each variable contributes to the exlanation of the outcome for each user.

import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import XGBClassifier

# Scikit-learn libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load the data
data_path = 'spotify_churn_dataset.csv'
df = pd.read_csv(data_path, delimiter=',')
df.describe()

	user_id	age	listening_time	songs_played_per_day	skip_rate	ads_listened_per_week	offline_listening	is_churned
count	8000.00000	8000.000000	8000.000000	8000.000000	8000.000000	8000.000000	8000.000000	8000.000000
mean	4000.50000	37.662125	154.068250	50.127250	0.300127	6.943875	0.747750	0.258875
std	2309.54541	12.740359	84.015596	28.449762	0.173594	13.617953	0.434331	0.438044
min	1.00000	16.000000	10.000000	1.000000	0.000000	0.000000	0.000000	0.000000
25%	2000.75000	26.000000	81.000000	25.000000	0.150000	0.000000	0.000000	0.000000
50%	4000.50000	38.000000	154.000000	50.000000	0.300000	0.000000	1.000000	0.000000
75%	6000.25000	49.000000	227.000000	75.000000	0.450000	5.000000	1.000000	1.000000
max	8000.00000	59.000000	299.000000	99.000000	0.600000	49.000000	1.000000	1.000000

# Extract feature and target arrays
X, y = df.drop(['user_id', 'is_churned'], axis=1), df[['is_churned']]

# Extract text features
cats = X.select_dtypes(exclude=np.number).columns.tolist()

# Convert to pd.Categorical
for col in cats:
   X[col] = X[col].astype('category')

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify = y)

X.dtypes

gender                   category
age                         int64
country                  category
subscription_type        category
listening_time              int64
songs_played_per_day        int64
skip_rate                 float64
device_type              category
ads_listened_per_week       int64
offline_listening           int64
dtype: object

# Define XGBClassifier model parameters
params = {
    'objective': 'binary:logistic',
    'max_depth': 3,
    'learning_rate': 0.1,
    'n_estimators': 100,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42
}

# Instantiate XGBClassifier with the parameters
model = xgb.XGBClassifier(**params, enable_categorical = True)

# Fit the model on the training data
model.fit(X_train, y_train)

# Predict class probabilities on the test data
probabilities = model.predict_proba(X_test)
# Print the first 5 samples
print("Predicted probabilities:\n", probabilities[:5])

Predicted probabilities:
 [[0.6463436  0.35365644]
 [0.82448244 0.17551754]
 [0.7530736  0.24692641]
 [0.7600267  0.23997329]
 [0.7368733  0.26312667]]

# Evaluate performance using scikit-learn metrics
# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)


print("Accuracy: %.2f%%" % (accuracy * 100.0))
print(confusion)
print(report)

Accuracy: 74.00%
[[1480    2]
 [ 518    0]]
              precision    recall  f1-score   support

           0       0.74      1.00      0.85      1482
           1       0.00      0.00      0.00       518

    accuracy                           0.74      2000
   macro avg       0.37      0.50      0.43      2000
weighted avg       0.55      0.74      0.63      2000

from sklearn.metrics import log_loss, roc_auc_score

# Calculate log loss
print(log_loss(y_test, probabilities))

# Calculate ROC AUC
print(roc_auc_score(y_test,probabilities[:,1]))

0.5824186947602866
0.4941446651972967

# Perform SHAP analysis 
import shap

# Create an explainer object
explainer = shap.TreeExplainer(model)

# Calculate SHAP values for the test set
shap_values = explainer(X_test)

The summary plot shows the contribution of each feature to the predictions, with colors representing the feature values. The spread of points indicates how the SHAP value varies across the dataset.

shap.summary_plot(shap_values, X_test)

# Visualise global feature importances
shap.summary_plot(shap_values, X_test, plot_type="bar")

Shap bar plot, average impact of each feature on model output