Spotify User Churn Analysis - Building an XGBoost classifier
This is my first post here on machine learning. I've chosen this Spotify dataset from Kaggle to predict whether a user will churn or remain active using XGBoost's gradient-boosted trees. A boosted ensemble contains far too many trees to visualise directly, so I include SHAP plots to see how each feature contributes to the predicted outcome for each user.
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import XGBClassifier
# Scikit-learn libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Load the data
data_path = 'spotify_churn_dataset.csv'
df = pd.read_csv(data_path, delimiter=',')
df.describe()
| | user_id | age | listening_time | songs_played_per_day | skip_rate | ads_listened_per_week | offline_listening | is_churned |
|---|---|---|---|---|---|---|---|---|
| count | 8000.00000 | 8000.000000 | 8000.000000 | 8000.000000 | 8000.000000 | 8000.000000 | 8000.000000 | 8000.000000 |
| mean | 4000.50000 | 37.662125 | 154.068250 | 50.127250 | 0.300127 | 6.943875 | 0.747750 | 0.258875 |
| std | 2309.54541 | 12.740359 | 84.015596 | 28.449762 | 0.173594 | 13.617953 | 0.434331 | 0.438044 |
| min | 1.00000 | 16.000000 | 10.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 2000.75000 | 26.000000 | 81.000000 | 25.000000 | 0.150000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 4000.50000 | 38.000000 | 154.000000 | 50.000000 | 0.300000 | 0.000000 | 1.000000 | 0.000000 |
| 75% | 6000.25000 | 49.000000 | 227.000000 | 75.000000 | 0.450000 | 5.000000 | 1.000000 | 1.000000 |
| max | 8000.00000 | 59.000000 | 299.000000 | 99.000000 | 0.600000 | 49.000000 | 1.000000 | 1.000000 |
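The mean of is_churned (0.258875) already hints that the target is imbalanced. A quick check of the class balance, assuming the column names shown above:

```python
# Class balance of the target: roughly 74% active vs 26% churned,
# consistent with the mean of is_churned in the summary table
print(df['is_churned'].value_counts(normalize=True))
```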
# Extract feature and target arrays
X, y = df.drop(['user_id', 'is_churned'], axis=1), df[['is_churned']]

# Extract text features
cats = X.select_dtypes(exclude=np.number).columns.tolist()

# Convert to pd.Categorical
for col in cats:
    X[col] = X[col].astype('category')

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
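As a quick sanity check, stratify=y should preserve the churn rate in both splits, and train_test_split's default test_size of 0.25 leaves 2,000 of the 8,000 users for testing:

```python
# Stratified splitting should keep the ~26% churn rate identical in both splits
print(X_train.shape, X_test.shape)
print(y_train['is_churned'].mean(), y_test['is_churned'].mean())
```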
X.dtypes
gender category
age int64
country category
subscription_type category
listening_time int64
songs_played_per_day int64
skip_rate float64
device_type category
ads_listened_per_week int64
offline_listening int64
dtype: object
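Below I pass these columns to XGBoost natively via enable_categorical. If you would rather not rely on native categorical support, one-hot encoding is a common alternative; a minimal sketch (X_encoded is a hypothetical name, and I don't use it in the rest of this post):

```python
# Hypothetical alternative: one-hot encode the categorical columns instead of
# using XGBoost's native categorical handling
X_encoded = pd.get_dummies(X, columns=cats)
```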
# Define XGBClassifier model parameters
params = {
    'objective': 'binary:logistic',
    'max_depth': 3,
    'learning_rate': 0.1,
    'n_estimators': 100,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42
}

# Instantiate XGBClassifier with the parameters
model = xgb.XGBClassifier(**params, enable_categorical=True)
# Fit the model on the training data
model.fit(X_train, y_train)
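The fixed n_estimators of 100 is a guess. A common refinement is to monitor a held-out set and stop adding trees once the validation loss stalls; a rough sketch, assuming a recent XGBoost release where early_stopping_rounds is a constructor argument (ideally you would carve out a separate validation split rather than reusing the test set, as I do here for brevity):

```python
# Sketch: stop boosting once the eval-set loss hasn't improved for 10 rounds
model_es = xgb.XGBClassifier(**params, enable_categorical=True, early_stopping_rounds=10)
model_es.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
print(model_es.best_iteration)
```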
# Predict class probabilities on the test data
probabilities = model.predict_proba(X_test)

# Print the first 5 samples (column 0 is P(active), column 1 is P(churned))
print("Predicted probabilities:\n", probabilities[:5])
Predicted probabilities:
[[0.6463436 0.35365644]
[0.82448244 0.17551754]
[0.7530736 0.24692641]
[0.7600267 0.23997329]
[0.7368733 0.26312667]]
# Evaluate performance using scikit-learn metrics
# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print(confusion)
print(report)
Accuracy: 74.00%
[[1480 2]
[ 518 0]]
precision recall f1-score support
0 0.74 1.00 0.85 1482
1 0.00 0.00 0.00 518
accuracy 0.74 2000
macro avg 0.37 0.50 0.43 2000
weighted avg 0.55 0.74 0.63 2000
from sklearn.metrics import log_loss, roc_auc_score
# Calculate log loss
print(log_loss(y_test, probabilities))
# Calculate ROC AUC
print(roc_auc_score(y_test, probabilities[:, 1]))
0.5824186947602866
0.4941446651972967
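These numbers deserve a closer look. The confusion matrix shows the model predicts the active class almost exclusively (only 2 positive predictions out of 2,000), and a ROC AUC of roughly 0.49 means the ranking is no better than chance, so the 74% accuracy simply mirrors the 74/26 class split. One standard lever for imbalanced data in XGBoost is scale_pos_weight, set to the ratio of negative to positive training examples (about 2.86 here); a sketch of that reweighting (it won't rescue a chance-level AUC on its own, but it stops the classifier from collapsing onto the majority class):

```python
# Reweight the positive (churned) class by the negative/positive ratio
neg, pos = (y_train['is_churned'] == 0).sum(), (y_train['is_churned'] == 1).sum()
model_w = xgb.XGBClassifier(**params, enable_categorical=True, scale_pos_weight=neg / pos)
model_w.fit(X_train, y_train)
print(classification_report(y_test, model_w.predict(X_test)))
```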
# Perform SHAP analysis
import shap
# Create an explainer object
explainer = shap.TreeExplainer(model)

# Calculate SHAP values for the test set
shap_values = explainer(X_test)
The summary plot shows the contribution of each feature to the predictions, with colors representing the feature values. The spread of points indicates how the SHAP value varies across the dataset.
shap.summary_plot(shap_values, X_test)

# Visualise global feature importances
="bar") shap.summary_plot(shap_values, X_test, plot_type

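The global plots aggregate over all users, but the point of SHAP here is per-user explanations. A sketch for a single test user, using shap's waterfall plot (the row index 0 is arbitrary):

```python
# Explain one user's prediction: each bar shows how a feature pushes the
# model's log-odds of churn up or down from the base value
shap.plots.waterfall(shap_values[0])
```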