visit
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_formats = [‘retina’]
import seaborn as sns
import time
import warnings
warnings.filterwarnings(“ignore”)
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss, fbeta_score
from sklearn.metrics import auc, roc_curve, roc_auc_score, precision_recall_curve
This project is based on from Kaggle.
df = pd.read_csv(‘WA_Fn-UseC_-Telco-Customer-Churn.csv’)
# remove 11 rows with spaces in TotalCharges (0.15% missing data)
df['TotalCharges'] = df['TotalCharges'].replace(' ',np.nan)
df = df.dropna(how = 'any')
df['TotalCharges'] = df['TotalCharges'].astype(float)
# data overview
print ('Rows : ', df.shape[0])
print ('Columns : ', df.shape[1])
print ('\nFeatures : \n', df.columns.tolist())
print ('\nMissing values : ', df.isnull().sum().values.sum())
print ('\nUnique values : \n', df.nunique())
df.info()
df.isnull().sum()
print(df.Churn.value_counts())
df[‘Churn’].value_counts().plot(‘bar’).set_title(‘Churn’)
The features in this dataset include the following:
· demographic data: Gender, SeniorCitizen, Partner, Dependents
· subscribed services: PhoneService, MultipleLine, InternetService, OnlineSecurity, OnlineBackup, DeviceProtection, TechSupport, StreamingTV, StreamingMovies
· customer account information: CustomerID, Contract, PaperlessBilling, PaymentMethod, MonthlyCharges, TotalCharges, Tenure
# replace values for SeniorCitizen as a categorical feature
df[‘SeniorCitizen’] = df[‘SeniorCitizen’].replace({1:’Yes’,0:’No’})
num_cols = [‘tenure’, ‘MonthlyCharges’, ‘TotalCharges’]
df[num_cols].describe()
Features are selected mainly based on Lasso coefficient and using Random Forest. The p-values and coefficients from Statsmodels have also been considered.
df1 = pd.read_csv(‘df1.csv’)
X, y = df1.drop(‘Churn’,axis=1), df1[[‘Churn’]]
## to find significant features using LassoCV (all X_scaled)
from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
print(‘Use LassoCV to find the optimal ALPHA value for L1 regularization’)
std = StandardScaler()
std.fit(X.values)
X_scaled = std.transform(X.values)
print(‘X_scaled’, X_scaled.shape)
alphavec = 10**np.linspace(-3,3,200)
lasso_model = LassoCV(alphas = alphavec, cv=5)
lasso_model.fit(X_scaled, y)
print(‘LASSO best alpha: ‘, lasso_model.alpha_ )
# display all coefficients in the model with optimal alpha
list(zip(X.columns, lasso_model.coef_))
## To look for top features using Random Forest
rfc = RandomForestClassifier(random_state=0, n_estimators=100)
model = rfc.fit(X, y)
(pd.Series(model.feature_importances_, index=X.columns)
.nlargest(47)
.plot(kind='barh', figsize=[20,15])
.invert_yaxis())
plt.yticks(size=15)
plt.title('Top Features derived by Random Forest', size=20)
# To analyse categorical feature distribution
categorical_features = [ ‘gender’, ‘SeniorCitizen’, ‘Partner’,
‘Dependents’, ‘PhoneService’, ‘MultipleLines’, ‘InternetService’,
‘OnlineSecurity’, ‘OnlineBackup’, ‘DeviceProtection’, ‘TechSupport’, ‘StreamingTV’, ‘StreamingMovies’, ‘PaymentMethod’,
‘PaperlessBilling’, ‘Contract’ ]
ROWS, COLS = 4, 4
fig, ax = plt.subplots(ROWS, COLS, figsize=(18, 20) )
row, col = 0, 0
for i, categorical_feature in enumerate(categorical_features):
if col == COLS — 1: row += 1
col = i % COLS
df[df.Churn==’No’][categorical_feature].value_counts().plot(‘bar’, width=.5, ax=ax[row, col], color=’blue’, alpha=0.5).set_title(categorical_feature)
df[df.Churn==’Yes’][categorical_feature].value_counts().plot(‘bar’, width=.3, ax=ax[row, col], color=’orange’, alpha=0.7).set_title(categorical_feature)
plt.legend([‘No Churn’, ‘Churn’])
fig.subplots_adjust(hspace=0.7)
sns.pairplot(df[['tenure', 'MonthlyCharges', 'TotalCharges', 'Churn']], hue='Churn', plot_kws=dict(alpha=.3, edgecolor='none'), height=2, aspect=1.1)
fig, ax = plt.subplots(1, 3, figsize=(15, 3))
df[df.Churn == “No”][num_cols].hist(bins=35, color=”blue”, alpha=0.5, ax=ax)
df[df.Churn == “Yes”][num_cols].hist(bins=35, color=”orange”, alpha=0.7, ax=ax)
plt.legend([‘No Churn’, ‘Churn’], shadow=True, loc=9)
# change MonthlyCharges to categorical column
def monthlycharges_split(df) :
if df[‘MonthlyCharges’] <= 30 :
return ‘0–30’
elif (df[‘MonthlyCharges’] > 30) & (df[‘MonthlyCharges’] <= 70 ):
return ‘30–70’
elif (df[‘MonthlyCharges’] > 70) & (df[‘MonthlyCharges’] <= 99 ):
return ‘70–99’
elif df[‘MonthlyCharges’] > 99 :
return ‘99plus’
df[‘monthlycharges_group’] = df.apply(lambda df:monthlycharges_split(df), axis = 1)
# change TotalCharges to categorical column
def totalcharges_split(df) :
if df[‘TotalCharges’] <= 2000 :
return ‘0–2k’
elif (df[‘TotalCharges’] > 2000) & (df[‘TotalCharges’] <= 4000 ):
return ‘2k-4k’
elif (df[‘TotalCharges’] > 4000) & (df[‘TotalCharges’] <= 6000) :
return ‘4k-6k’
elif df[‘TotalCharges’] > 6000 :
return ‘6kplus’
df[‘totalcharges_group’] = df.apply(lambda df:totalcharges_split(df), axis = 1)
# change Tenure to categorical column
def tenure_split(df) :
if df[‘tenure’] <= 20 :
return ‘0–20’
elif (df[‘tenure’] > 20) & (df[‘tenure’] <= 40 ):
return ‘20–40’
elif (df[‘tenure’] > 40) & (df[‘tenure’] <= 60) :
return ‘40–60’
elif df[‘tenure’] > 60 :
return ‘60plus’
df[‘tenure_group’] = df.apply(lambda df:tenure_split(df), axis = 1)
# split data to 80:20 ratio for train/test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=71)
print(‘X_train’, X_train.shape)
print(‘y_train’, y_train.shape)
print(‘X_test’, X_test.shape)
print(‘y_test’, y_test.shape)
def model_report(model_name, model):
print(‘\nSearch for OPTIMAL THRESHOLD, vary from 0.0001 to 0.9999, fit/predict on train/test data’)
model.fit(X_train, y_train)
optimal_th = 0.5 # start with default threshold value
for i in range(0,3):
score_list = []
print(‘\nLooping decimal place’, i+1)
th_list = [np.linspace(optimal_th-0.4999, optimal_th+0.4999, 11),
np.linspace(optimal_th-0.1, optimal_th+0.1, 21),
np.linspace(optimal_th-0.01, optimal_th+0.01, 21)]
for th in th_list[i]:
y_pred = (model.predict_proba(X_test)[:,1] >= th)
f1scor = f1_score(y_test, y_pred)
score_list.append(f1scor)
print(‘{:.3f}->{:.4f}’.format(th, f1scor), end=’, ‘)
optimal_th = float(th_list[i][score_list.index(max(score_list))])
print(‘optimal F1 score = {:.4f}’.format(max(score_list)))
print(‘optimal threshold = {:.3f}’.format(optimal_th))
print(model_name, ‘accuracy score is’)
print(‘Training: {:.2f}%’.format(100*model.score(X_train, y_train))) # score uses accuracy
print(‘Test set: {:.2f}%’.format(100*model.score(X_test, y_test)))
y_pred = (model.predict_proba(X_test)[:,1] >= 0.25)
print(‘\nAdjust threshold to 0.25:’)
print(‘Precision: {:.4f}, Recall: {:.4f}, F1 Score: {:.4f}’.format(
precision_score(y_test, y_pred), recall_score(y_test, y_pred), f1_score(y_test, y_pred)))
print(model_name, ‘confusion matrix: \n’, confusion_matrix(y_test, y_pred))
y_pred = model.predict(X_test)
print(‘\nDefault threshold of 0.50:’)
print(‘Precision: {:.4f}, Recall: {:.4f}, F1 Score: {:.4f}’.format(
precision_score(y_test, y_pred), recall_score(y_test, y_pred), f1_score(y_test, y_pred)))
print(model_name, ‘confusion matrix: \n’, confusion_matrix(y_test, y_pred))
y_pred = (model.predict_proba(X_test)[:,1] >= 0.75)
print(‘\nAdjust threshold to 0.75:’)
print(‘Precision: {:.4f}, Recall: {:.4f}, F1 Score: {:.4f}’.format(
precision_score(y_test, y_pred), recall_score(y_test, y_pred), f1_score(y_test, y_pred)))
print(model_name, ‘confusion matrix: \n’, confusion_matrix(y_test, y_pred))
y_pred = (model.predict_proba(X_test)[:,1] >= optimal_th)
print(‘\nOptimal threshold {:.3f}’.format(optimal_th))
print(‘Precision: {:.4f}, Recall: {:.4f}, F1 Score: {:.4f}’.format(
precision_score(y_test, y_pred), recall_score(y_test, y_pred), f1_score(y_test, y_pred)))
print(model_name, ‘confusion matrix: \n’, confusion_matrix(y_test, y_pred))
global model_f1, model_auc, model_ll, model_roc_auc
model_f1 = f1_score(y_test, y_pred)
y_pred = model.predict_proba(X_test)
model_ll = log_loss(y_test, y_pred)
print(model_name, ‘Log-loss: {:.4f}’.format(model_ll))
y_pred = model.predict(X_test)
model_roc_auc = roc_auc_score(y_test, y_pred)
print(model_name, ‘roc_auc_score: {:.4f}’.format(model_roc_auc))
y_pred = model.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
model_auc = auc(fpr, tpr)
print(model_name, ‘AUC: {:.4f}’.format(model_auc))
# plot the ROC curve
plt.figure(figsize = [6,6])
plt.plot(fpr, tpr, label=’ROC curve (area = %0.2f)’ % model_auc)
plt.plot([0, 1], [0, 1],’r — ‘)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel(‘False Positive Rate’)
plt.ylabel(‘True Positive Rate’)
plt.title(‘Receiver Operating Characteristic’)
plt.legend(loc=”lower right”)
# plt.savefig(‘roc_auc_score’)
plt.show()
return
# initialise lists to collect the results to plot later
model_list = []
f1_list = []
auc_list = []
ll_list = []
roc_auc_list = []
time_list = []
print(‘\n”””””” LogisticRegression “”””””’)
print(‘\nSearch for optimal hyperparameter C in LogisticRegresssion, vary C from 0.001 to 1000, using KFold(5) Cross Validation on train data’)
kf = KFold(n_splits=5, random_state=21, shuffle=True)
score_list = []
c_list = 10**np.linspace(-3,3,200)
for c in c_list:
logit = LogisticRegression(C = c)
cvs = (cross_val_score(logit, X_train, y_train, cv=kf, scoring=’f1')).mean()
score_list.append(cvs)
print(‘{:.4f}’.format(cvs), end=”, “) # 4 decimal pl
print(‘optimal cv F1 score = {:.4f}’.format(max(score_list)))
optimal_c = float(c_list[score_list.index(max(score_list))])
print(‘optimal value of C = {:.3f}’.format(optimal_c))
time1 = time.time()
logit = LogisticRegression(C = optimal_c)
model_report(‘LogisticRegression’, logit)
model_list.append(‘LogisticRegression’)
f1_list.append(model_f1)
auc_list.append(model_auc)
ll_list.append(model_ll)
roc_auc_list.append(model_roc_auc)
time_list.append(time.time() — time1)
## plot the classification report scores
fig, ax = plt.subplots(5, 1, figsize=(18, 28))
ax[0].bar(model_list, f1_list)
ax[0].set_title('F1-score')
ax[1].bar(model_list, auc_list)
ax[1].set_title('AUC-score');
ax[2].bar(model_list, ll_list)
ax[2].set_title('Log-Loss-Score')
ax[3].bar(model_list, roc_auc_list)
ax[3].set_title('ROC AUC Score')
ax[4].bar(model_list, time_list)
ax[4].set_title('Time taken')
fig.subplots_adjust(hspace=0.2, wspace=0.2)
AUC stands for area under curve, which is explained below. The ROC () curve is a relationship between True Positive Rate and False Positive Rate. Logistic Regression, which is the red line, has an area of 0.82 under this curve. It means the model has performed better than random guesses of 0.5, the diagonal line.
# plot the ROC curves
plt.figure(figsize=(10,10))
y_pred = gnb.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
plt.plot(fpr, tpr, color=’blue’,
lw=3, label=’GaussianNB (area = %0.2f)’ % auc_list[0])
y_pred = bnb.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
plt.plot(fpr, tpr, color=’green’,
lw=3, label=’BernoulliNB (area = %0.2f)’ % auc_list[1])
y_pred = logit.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
plt.plot(fpr, tpr, color=’red’,
lw=2, label=’LogisticRegression (area = %0.2f)’ % auc_list[2])
y_pred = knn.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
plt.plot(fpr, tpr, color=’yellow’,
lw=3, label=’KNN (area = %0.2f)’ % auc_list[3])
y_pred = decisiontree.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
plt.plot(fpr, tpr, color=’purple’,
lw=2, label=’DecisionTree (area = %0.2f)’ % auc_list[4])
y_pred = randomforest.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
plt.plot(fpr, tpr, color=’brown’,
lw=2, label=’RandomForest (area = %0.2f)’ % auc_list[5])
y_pred = linearsvc.predict(X_test)
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
plt.plot(fpr, tpr, color=’cyan’,
lw=2, label=’LinearSVC (area = %0.2f)’ % auc_list[6])
y_pred = svc.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
plt.plot(fpr, tpr, color=’magenta’,
lw=2, label=’SVC (area = %0.2f)’ % auc_list[7])
plt.plot([0, 1], [0, 1], color=’navy’, lw=2, linestyle=’ — ‘)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel(‘False Positive Rate’, fontsize=13)
plt.ylabel(‘True Positive Rate’, fontsize=14)
plt.title(‘Receiver Operating Characteristic’, fontsize=17)
plt.legend(loc=’lower right’, fontsize=13)
plt.show()
Another performance metric is Log-Loss-Score where we look for the model with the lowest loss function. Once again Logistic Regression is the best, and thus it is the final selected model.
Next I tuned the threshold of Logistic Regression to maximise the F1-Score. By tuning this hyperparameter, I have achieved the optimised Recall rate of 76%. is defined as the number of churned customers predicted correctly. However, the trade-off is that only 58% of the churn predictions ( rate) are correct. This is due to the limitation in the current model and dataset.
Python codes for the above analysis are available on my , do feel free to refer to them.
Video presentation: Thank you for reading!