Spotify User Churn Analysis - Building an XGBoost classifier

This is my first post here on machine learning. I’ve chosen this Spotfify dataset from kaggle to predict whether a user will churn or remain active using XGBoost gradient boosted trees. Decision trees become huge and are difficult to visualise, but I include SHAP plots to see how each variable contributes to the exlanation of the outcome for each user.

import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import XGBClassifier

# Scikit-learn libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Load the data
data_path = 'spotify_churn_dataset.csv'
df = pd.read_csv(data_path, delimiter=',')
df.describe()
user_id age listening_time songs_played_per_day skip_rate ads_listened_per_week offline_listening is_churned
count 8000.00000 8000.000000 8000.000000 8000.000000 8000.000000 8000.000000 8000.000000 8000.000000
mean 4000.50000 37.662125 154.068250 50.127250 0.300127 6.943875 0.747750 0.258875
std 2309.54541 12.740359 84.015596 28.449762 0.173594 13.617953 0.434331 0.438044
min 1.00000 16.000000 10.000000 1.000000 0.000000 0.000000 0.000000 0.000000
25% 2000.75000 26.000000 81.000000 25.000000 0.150000 0.000000 0.000000 0.000000
50% 4000.50000 38.000000 154.000000 50.000000 0.300000 0.000000 1.000000 0.000000
75% 6000.25000 49.000000 227.000000 75.000000 0.450000 5.000000 1.000000 1.000000
max 8000.00000 59.000000 299.000000 99.000000 0.600000 49.000000 1.000000 1.000000
# Extract feature and target arrays
X, y = df.drop(['user_id', 'is_churned'], axis=1), df[['is_churned']]

# Extract text features
cats = X.select_dtypes(exclude=np.number).columns.tolist()

# Convert to pd.Categorical
for col in cats:
   X[col] = X[col].astype('category')

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify = y)

X.dtypes
gender                   category
age                         int64
country                  category
subscription_type        category
listening_time              int64
songs_played_per_day        int64
skip_rate                 float64
device_type              category
ads_listened_per_week       int64
offline_listening           int64
dtype: object
# Define XGBClassifier model parameters
params = {
    'objective': 'binary:logistic',
    'max_depth': 3,
    'learning_rate': 0.1,
    'n_estimators': 100,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42
}

# Instantiate XGBClassifier with the parameters
model = xgb.XGBClassifier(**params, enable_categorical = True)
# Fit the model on the training data
model.fit(X_train, y_train)

# Predict class probabilities on the test data
probabilities = model.predict_proba(X_test)
# Print the first 5 samples
print("Predicted probabilities:\n", probabilities[:5])  
Predicted probabilities:
 [[0.6463436  0.35365644]
 [0.82448244 0.17551754]
 [0.7530736  0.24692641]
 [0.7600267  0.23997329]
 [0.7368733  0.26312667]]
# Evaluate performance using scikit-learn metrics
# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)


print("Accuracy: %.2f%%" % (accuracy * 100.0))
print(confusion)
print(report)
Accuracy: 74.00%
[[1480    2]
 [ 518    0]]
              precision    recall  f1-score   support

           0       0.74      1.00      0.85      1482
           1       0.00      0.00      0.00       518

    accuracy                           0.74      2000
   macro avg       0.37      0.50      0.43      2000
weighted avg       0.55      0.74      0.63      2000
from sklearn.metrics import log_loss, roc_auc_score

# Calculate log loss
print(log_loss(y_test, probabilities))

# Calculate ROC AUC
print(roc_auc_score(y_test,probabilities[:,1]))
0.5824186947602866
0.4941446651972967
# Perform SHAP analysis 
import shap

# Create an explainer object
explainer = shap.TreeExplainer(model)

# Calculate SHAP values for the test set
shap_values = explainer(X_test)

The summary plot shows the contribution of each feature to the predictions, with colors representing the feature values. The spread of points indicates how the SHAP value varies across the dataset.

shap.summary_plot(shap_values, X_test)
Shap values plot
# Visualise global feature importances
shap.summary_plot(shap_values, X_test, plot_type="bar")
Shap bar plot, average impact of each feature on model output