utils.py
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
class CategoricalEncoder(BaseEstimator, TransformerMixin):
"""
    Encodes categorical columns using label encoding, one-hot encoding and target encoding.
    Label encoding is used for binary categorical columns.
    One-hot encoding is used for columns with <= 10 distinct values.
    Target encoding is used for higher-cardinality columns (> 10 distinct values).
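    Examples
    --------
    A minimal usage sketch (the column list here is purely illustrative):
        enc = CategoricalEncoder(cols=['Geography', 'Gender', 'Surname'])
        X_train_enc = enc.fit_transform(X_train, y_train)
        X_val_enc = enc.transform(X_val)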
"""
def __init__(self, cols = None, lcols = None, ohecols = None, tcols = None, reduce_df = False):
"""
Parameters
----------
        cols : list of str
            Columns to encode. Default is to label/one-hot/target encode all categorical columns in the DataFrame.
        lcols : list of str
            Columns to label encode. Default is to label encode all binary categorical columns.
        ohecols : list of str
            Columns to one-hot encode. Default is to one-hot encode all categorical columns with 3-10 unique values.
        tcols : list of str
            Columns to target encode. Default is to target encode all categorical columns with more than 10 unique values.
reduce_df : bool
Whether to use reduced degrees of freedom for encoding
(that is, add N-1 one-hot columns for a column with N
categories). E.g. for a column with categories A, B,
and C: When reduce_df is True, A=[1, 0], B=[0, 1],
and C=[0, 0]. When reduce_df is False, A=[1, 0, 0],
B=[0, 1, 0], and C=[0, 0, 1]
Default = False
"""
if isinstance(cols,str):
self.cols = [cols]
else :
self.cols = cols
if isinstance(lcols,str):
self.lcols = [lcols]
else :
self.lcols = lcols
if isinstance(ohecols,str):
self.ohecols = [ohecols]
else :
self.ohecols = ohecols
if isinstance(tcols,str):
self.tcols = [tcols]
else :
self.tcols = tcols
self.reduce_df = reduce_df
def fit(self, X, y):
"""Fit label/one-hot/target encoder to X and y
Parameters
----------
X : pandas DataFrame, shape [n_samples, n_columns]
DataFrame containing columns to encode
y : pandas Series, shape = [n_samples]
Target values.
Returns
-------
self : encoder
Returns self.
"""
# Encode all categorical cols by default
if self.cols is None:
self.cols = [c for c in X if str(X[c].dtype)=='object']
# Check columns are in X
for col in self.cols:
if col not in X:
raise ValueError('Column \''+col+'\' not in X')
# Separating out lcols, ohecols and tcols
if self.lcols is None:
self.lcols = [c for c in self.cols if X[c].nunique() <= 2]
if self.ohecols is None:
self.ohecols = [c for c in self.cols if ((X[c].nunique() > 2) & (X[c].nunique() <= 10))]
if self.tcols is None:
self.tcols = [c for c in self.cols if X[c].nunique() > 10]
## Create Label Encoding mapping
self.lmaps = dict()
for col in self.lcols:
self.lmaps[col] = dict(zip(X[col].values, X[col].astype('category').cat.codes.values))
## Create OneHot Encoding mapping
self.ohemaps = dict() #dict to store map for each column
for col in self.ohecols:
self.ohemaps[col] = []
uniques = X[col].unique()
for unique in uniques:
self.ohemaps[col].append(unique)
if self.reduce_df:
del self.ohemaps[col][-1]
## Create Target Encoding mapping
self.global_target_mean = y.mean().round(2)
self.sum_count = dict()
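        # sum_count[col][category] stores (sum of target values, row count) for that category.
        # For example (hypothetical numbers), sum_count['Surname']['Smith'] == (2, 5) would mean
        # 2 churners among 5 rows with that surname; the pair is turned into a mean at transform time.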
for col in self.tcols:
self.sum_count[col] = dict()
uniques = X[col].unique()
for unique in uniques:
ix = X[col]==unique
self.sum_count[col][unique] = (y[ix].sum(),ix.sum())
## Return the fit object
return self
def transform(self, X, y=None):
"""Perform label/one-hot/target encoding transformation.
Parameters
----------
X : pandas DataFrame, shape [n_samples, n_columns]
DataFrame containing columns to label encode
Returns
-------
pandas DataFrame
Input DataFrame with transformed columns
"""
Xo = X.copy()
## Perform label encoding transformation
for col, lmap in self.lmaps.items():
# Map the column
Xo[col] = Xo[col].map(lmap)
            Xo[col] = Xo[col].fillna(-1)  # Fill categories unseen during fit with -1
## Perform one-hot encoding transformation
for col, vals in self.ohemaps.items():
for val in vals:
new_col = col+'_'+str(val)
Xo[new_col] = (Xo[col]==val).astype('uint8')
del Xo[col]
## Perform LOO target encoding transformation
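        # Plain target encoding maps each category to mean(target | category) learned during fit;
        # the leave-one-out (LOO) variant below excludes the current row's own target from that
        # mean, which reduces target leakage when encoding the training data itself.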
# Use normal target encoding if this is test data
if y is None:
for col in self.sum_count:
vals = np.full(X.shape[0], np.nan)
for cat, sum_count in self.sum_count[col].items():
vals[X[col]==cat] = (sum_count[0]/sum_count[1]).round(2)
Xo[col] = vals
                Xo[col] = Xo[col].fillna(self.global_target_mean)  # Fill categories unseen during fit with the global target mean
# LOO target encode each column
else:
for col in self.sum_count:
vals = np.full(X.shape[0], np.nan)
for cat, sum_count in self.sum_count[col].items():
ix = X[col]==cat
if sum_count[1] > 1:
                        vals[ix] = ((sum_count[0] - np.asarray(y[ix]).reshape(-1,)) / (sum_count[1] - 1)).round(2)
                    else:
                        # Category level occurs only once in the training data, so fall back to a
                        # leave-one-out estimate computed over all rows
                        vals[ix] = ((y.sum() - np.asarray(y[ix])) / (X.shape[0] - 1)).round(2)
Xo[col] = vals
                Xo[col] = Xo[col].fillna(self.global_target_mean)  # Fill categories unseen during fit with the global target mean
## Return encoded DataFrame
return Xo
def fit_transform(self, X, y=None):
"""Fit and transform the data via label/one-hot/target encoding.
Parameters
----------
X : pandas DataFrame, shape [n_samples, n_columns]
DataFrame containing columns to encode
y : pandas Series, shape = [n_samples]
Target values (required!).
Returns
-------
pandas DataFrame
Input DataFrame with transformed columns
"""
return self.fit(X, y).transform(X, y)
class AddFeatures(BaseEstimator):
"""
Add new, engineered features using original categorical and numerical features of the DataFrame
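    A minimal usage sketch (assumes the base columns referenced in transform() - Balance,
    NumOfProducts, EstimatedSalary, Tenure, Age and Surname_enc - are present in X):
        X_new = AddFeatures().fit_transform(X)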
"""
def __init__(self, eps = 1e-6):
"""
Parameters
----------
        eps : float
            A small value added to denominators to avoid division-by-zero errors. Default is 1e-6.
"""
self.eps = eps
def fit(self, X, y=None):
return self
def transform(self, X):
"""
Parameters
----------
X : pandas DataFrame, shape [n_samples, n_columns]
DataFrame containing base columns using which new interaction-based features can be engineered
"""
Xo = X.copy()
        ## Add 4 new columns - bal_per_product, bal_by_est_salary, tenure_age_ratio, age_surname_enc
Xo['bal_per_product'] = Xo.Balance/(Xo.NumOfProducts + self.eps)
Xo['bal_by_est_salary'] = Xo.Balance/(Xo.EstimatedSalary + self.eps)
Xo['tenure_age_ratio'] = Xo.Tenure/(Xo.Age + self.eps)
Xo['age_surname_enc'] = np.sqrt(Xo.Age) * Xo.Surname_enc
## Returning the updated dataframe
return Xo
def fit_transform(self, X, y=None):
"""
Parameters
----------
X : pandas DataFrame, shape [n_samples, n_columns]
DataFrame containing base columns using which new interaction-based features can be engineered
"""
return self.fit(X,y).transform(X)
class CustomScaler(BaseEstimator, TransformerMixin):
"""
A custom standard scaler class with the ability to apply scaling on selected columns
"""
def __init__(self, scale_cols = None):
"""
Parameters
----------
scale_cols : list of str
Columns on which to perform scaling and normalization. Default is to scale all numerical columns
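        Examples
        --------
        A minimal usage sketch (the column names here are purely illustrative):
            scaler = CustomScaler(scale_cols=['Age', 'Balance'])
            X_train_scaled = scaler.fit_transform(X_train)
            X_val_scaled = scaler.transform(X_val)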
"""
self.scale_cols = scale_cols
def fit(self, X, y=None):
"""
Parameters
----------
X : pandas DataFrame, shape [n_samples, n_columns]
DataFrame containing columns to scale
"""
# Scaling all non-categorical columns if user doesn't provide the list of columns to scale
if self.scale_cols is None:
self.scale_cols = [c for c in X if ((str(X[c].dtype).find('float') != -1) or (str(X[c].dtype).find('int') != -1))]
## Create mapping corresponding to scaling and normalization
self.maps = dict()
for col in self.scale_cols:
self.maps[col] = dict()
self.maps[col]['mean'] = np.mean(X[col].values).round(2)
self.maps[col]['std_dev'] = np.std(X[col].values).round(2)
# Return fit object
return self
def transform(self, X):
"""
Parameters
----------
X : pandas DataFrame, shape [n_samples, n_columns]
DataFrame containing columns to scale
"""
Xo = X.copy()
## Map transformation to respective columns
for col in self.scale_cols:
Xo[col] = (Xo[col] - self.maps[col]['mean']) / self.maps[col]['std_dev']
# Return scaled and normalized DataFrame
return Xo
def fit_transform(self, X, y=None):
"""
Parameters
----------
X : pandas DataFrame, shape [n_samples, n_columns]
DataFrame containing columns to scale
"""
# Fit and return transformed dataframe
return self.fit(X).transform(X)
customer_churn_prediction.py
# %%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, recall_score, confusion_matrix, classification_report
import subprocess
import joblib
# Get multiple outputs in the same cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# Ignore all warnings
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings(action='ignore', category=DeprecationWarning)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# %%
# Reading the dataset
dc = pd.read_csv("Churn_Modelling.csv")
dc.head(5)
# %%
# Dimension of the dataset
dc.shape
# %%
dc.describe(exclude= ['O']) # Describe all numerical columns
dc.describe(include = ['O']) # Describe all non-numerical/categorical columns
# %%
# Checking number of unique customers in the dataset
dc.shape[0], dc.CustomerId.nunique()
# %%
# Churn value distribution
dc["Exited"].value_counts()
# %%
dc.groupby(['Surname']).agg({'RowNumber':'count', 'Exited':'mean'}
).reset_index().sort_values(by='RowNumber', ascending=False).head()
# %%
dc.groupby(['Geography']).agg({'RowNumber':'count', 'Exited':'mean'}
).reset_index().sort_values(by='RowNumber', ascending=False)
# %%
sns.set(style="whitegrid")
sns.boxplot(y=dc['CreditScore'])
# %%
sns.boxplot(y=dc['Age'])
# %%
sns.violinplot(y = dc.Tenure)
# %%
sns.violinplot(y = dc['Balance'])
# %%
sns.set(style = 'ticks')
sns.distplot(dc.NumOfProducts, hist=True, kde=False)
# %%
# For numerical features, one of the most useful things to examine is the data distribution.
# We can use a kernel density estimation (KDE) plot for that purpose.
sns.kdeplot(dc.EstimatedSalary)
# %%
# Separating columns into target, ID, numerical and categorical groups
target_var = ['Exited']
cols_to_remove = ['RowNumber', 'CustomerId']
# numerical columns
num_feats = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
# categorical columns
cat_feats = ['Surname', 'Geography', 'Gender', 'HasCrCard', 'IsActiveMember']
# %%
y = dc[target_var].values
dc.drop(cols_to_remove, axis=1, inplace=True)
# %%
# Keeping aside a test/holdout set
dc_train_val, dc_test, y_train_val, y_test = train_test_split(dc, y.ravel(), test_size = 0.1, random_state = 42)
# Splitting into train and validation set
dc_train, dc_val, y_train, y_val = train_test_split(dc_train_val, y_train_val, test_size = 0.12, random_state = 42)
dc_train.shape, dc_val.shape, dc_test.shape, y_train.shape, y_val.shape, y_test.shape
np.mean(y_train), np.mean(y_val), np.mean(y_test)
# %%
# Label encoding with sklearn's LabelEncoder
le = LabelEncoder()
# Label encoding of Gender variable
dc_train['Gender'] = le.fit_transform(dc_train['Gender'])
le_gender_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
le_gender_mapping
# %%
# Encoding Gender feature for validation and test set
dc_val['Gender'] = dc_val.Gender.map(le_gender_mapping)
dc_test['Gender'] = dc_test.Gender.map(le_gender_mapping)
# Filling missing/NaN values created due to new categorical levels
dc_val['Gender'].fillna(-1, inplace=True)
dc_test['Gender'].fillna(-1, inplace=True)
# %%
dc_train.Gender.unique(), dc_val.Gender.unique(), dc_test.Gender.unique()
# %%
# One-hot encoding Geography with sklearn (LabelEncoder + OneHotEncoder)
le_ohe = LabelEncoder()
ohe = OneHotEncoder(handle_unknown = 'ignore', sparse=False)
enc_train = le_ohe.fit_transform(dc_train.Geography).reshape(dc_train.shape[0],1)
ohe_train = ohe.fit_transform(enc_train)
ohe_train
# %%
# mapping between classes
le_ohe_geography_mapping = dict(zip(le_ohe.classes_, le_ohe.transform(le_ohe.classes_)))
le_ohe_geography_mapping
# %%
# Encoding Geography feature for validation and test set
enc_val = dc_val.Geography.map(le_ohe_geography_mapping).values.reshape(-1, 1)
enc_test = dc_test.Geography.map(le_ohe_geography_mapping).values.reshape(-1, 1)
# Filling missing/NaN values created due to new categorical levels
enc_val[np.isnan(enc_val)] = 9999
enc_test[np.isnan(enc_test)] = 9999
# %%
ohe_val = ohe.transform(enc_val)
ohe_test = ohe.transform(enc_test)
# %%
# Show what happens when a new value is passed into the OHE
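# With handle_unknown='ignore', a code that was never seen during fit (such as the 9999
# placeholder used here) is encoded as an all-zeros row rather than raising an error.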
ohe.transform(np.array([[9999]]))
# %%
cols = ['country_' + str(x) for x in le_ohe_geography_mapping.keys()]
cols
# %%
# Adding to the respective dataframes
dc_train = pd.concat([dc_train.reset_index(), pd.DataFrame(ohe_train, columns = cols)], axis = 1).drop(['index'], axis=1)
dc_val = pd.concat([dc_val.reset_index(), pd.DataFrame(ohe_val, columns = cols)], axis = 1).drop(['index'], axis=1)
dc_test = pd.concat([dc_test.reset_index(), pd.DataFrame(ohe_test, columns = cols)], axis = 1).drop(['index'], axis=1)
print("Training set")
dc_train.head()
print("\n\nValidation set")
dc_val.head()
print("\n\nTest set")
dc_test.head()
# %%
dc_train.drop(['Geography'], axis=1, inplace=True)
dc_val.drop(['Geography'], axis=1, inplace=True)
dc_test.drop(['Geography'], axis=1, inplace=True)
# %%
means = dc_train.groupby(['Surname']).Exited.mean()
means.head()
means.tail()
# %%
global_mean = y_train.mean()
global_mean
# %%
# Creating new encoded features for surname - Target (mean) encoding
dc_train['Surname_mean_churn'] = dc_train.Surname.map(means)
dc_train['Surname_mean_churn'].fillna(global_mean, inplace=True)
# %%
freqs = dc_train.groupby(['Surname']).size()
freqs.head()
# %%
dc_train['Surname_freq'] = dc_train.Surname.map(freqs)
dc_train['Surname_freq'].fillna(0, inplace=True)
# %%
dc_train['Surname_enc'] = ((dc_train.Surname_freq * dc_train.Surname_mean_churn) - dc_train.Exited)/(dc_train.Surname_freq - 1)
# Fill NaNs occurring due to category frequency being 1 or less
dc_train['Surname_enc'].fillna((((dc_train.shape[0] * global_mean) - dc_train.Exited) / (dc_train.shape[0] - 1)), inplace=True)
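# Worked illustration of the LOO formula above (hypothetical numbers): if a surname appears
# 4 times in the training set with 1 churner, its mean churn is 0.25; a churned row gets
# (4*0.25 - 1)/(4 - 1) = 0.0 while a non-churned row gets (4*0.25 - 0)/(4 - 1) ≈ 0.33,
# so no row's encoding ever includes its own label.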
dc_train.head(5)
# %%
# Encoding val/test surnames with the category means; new category levels get the global mean
dc_val['Surname_enc'] = dc_val.Surname.map(means)
dc_val['Surname_enc'].fillna(global_mean, inplace=True)
dc_test['Surname_enc'] = dc_test.Surname.map(means)
dc_test['Surname_enc'].fillna(global_mean, inplace=True)
# Show that LOO target encoding reduces the correlation between the encoded feature and the target
dc_train[['Surname_mean_churn', 'Surname_enc', 'Exited']].corr()
# %%
dc_train.drop(['Surname_mean_churn'], axis=1, inplace=True)
dc_train.drop(['Surname_freq'], axis=1, inplace=True)
dc_train.drop(['Surname'], axis=1, inplace=True)
dc_val.drop(['Surname'], axis=1, inplace=True)
dc_test.drop(['Surname'], axis=1, inplace=True)
dc_train.head()
# %%
corr = dc_train.corr()
sns.heatmap(corr, cmap = 'coolwarm')
# %%
sns.boxplot(x="Exited", y="Age", data=dc_train, palette="Set3")
# %%
sns.violinplot(x="Exited", y="Balance", data=dc_train, palette="Set3")
# %%
cat_vars_bv = ['Gender', 'IsActiveMember', 'country_Germany', 'country_France']
for col in cat_vars_bv:
    print(dc_train.groupby([col]).Exited.mean())
    print()
# %%
# Mean churn rate grouped by number of products on the training data
col = 'NumOfProducts'
dc_train.groupby([col]).Exited.mean()
# Distribution of NumOfProducts values in the training data
dc_train[col].value_counts()
# %%
eps = 1e-6
dc_train['bal_per_product'] = dc_train.Balance/(dc_train.NumOfProducts + eps)
dc_train['bal_by_est_salary'] = dc_train.Balance/(dc_train.EstimatedSalary + eps)
dc_train['tenure_age_ratio'] = dc_train.Tenure/(dc_train.Age + eps)
dc_train['age_surname_mean_churn'] = np.sqrt(dc_train.Age) * dc_train.Surname_enc
# %%
new_cols = ['bal_per_product', 'bal_by_est_salary', 'tenure_age_ratio', 'age_surname_mean_churn']
# Ensuring that the new columns don't have any missing values
dc_train[new_cols].isnull().sum()
# %%
# Linear association of the new columns with the target variable to judge importance
sns.heatmap(dc_train[new_cols + ['Exited']].corr(), annot=True)
# %%
dc_val['bal_per_product'] = dc_val.Balance/(dc_val.NumOfProducts + eps)
dc_val['bal_by_est_salary'] = dc_val.Balance/(dc_val.EstimatedSalary + eps)
dc_val['tenure_age_ratio'] = dc_val.Tenure/(dc_val.Age + eps)
dc_val['age_surname_mean_churn'] = np.sqrt(dc_val.Age) * dc_val.Surname_enc
dc_test['bal_per_product'] = dc_test.Balance/(dc_test.NumOfProducts + eps)
dc_test['bal_by_est_salary'] = dc_test.Balance/(dc_test.EstimatedSalary + eps)
dc_test['tenure_age_ratio'] = dc_test.Tenure/(dc_test.Age + eps)
dc_test['age_surname_mean_churn'] = np.sqrt(dc_test.Age) * dc_test.Surname_enc
# %%
# initialize the standard scaler
sc = StandardScaler()
cont_vars = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary', 'Surname_enc',
             'bal_per_product', 'bal_by_est_salary', 'tenure_age_ratio', 'age_surname_mean_churn']
cat_vars = ['Gender', 'HasCrCard', 'IsActiveMember', 'country_France', 'country_Germany', 'country_Spain']
# Scaling only continuous columns
cols_to_scale = cont_vars
sc_X_train = sc.fit_transform(dc_train[cols_to_scale])
# Converting from array to dataframe and naming the respective features/columns
sc_X_train = pd.DataFrame(data=sc_X_train, columns=cols_to_scale)
sc_X_train.shape
sc_X_train.head()
# %%
# Scaling validation and test sets using the mapping learned from the training set
sc_X_val = sc.transform(dc_val[cols_to_scale])
sc_X_test = sc.transform(dc_test[cols_to_scale])
# Converting val and test arrays to dataframes for re-usability
sc_X_val = pd.DataFrame(data=sc_X_val, columns=cols_to_scale)
sc_X_test = pd.DataFrame(data=sc_X_test, columns=cols_to_scale)
# %%
# Creating feature-set and target for RFE model
y = dc_train['Exited'].values
X = dc_train[cat_vars + cont_vars]
X.columns = cat_vars + cont_vars
X.columns
# %%
# RFE with logistic regression as the estimator
rfe = RFE(estimator=LogisticRegression(), n_features_to_select=10)
rfe = rfe.fit(X.values, y)
# mask of selected features
print(rfe.support_)
# The feature ranking, such that ranking_[i] corresponds to the ranking position of the i-th feature
print(rfe.ranking_)
# %%
# Features selected by RFE with the logistic regression estimator
mask = rfe.support_.tolist()
selected_feats = [b for a,b in zip(mask, X.columns) if a]
selected_feats
# %%
rfe_dt = RFE(estimator=DecisionTreeClassifier(max_depth = 4, criterion = 'entropy'), n_features_to_select=10)
rfe_dt = rfe_dt.fit(X.values, y)
# %%
mask = rfe_dt.support_.tolist()
selected_feats_dt = [b for a,b in zip(mask, X.columns) if a]
selected_feats_dt
# %%
selected_cat_vars = [x for x in selected_feats if x in cat_vars]
selected_cont_vars = [x for x in selected_feats if x in cont_vars]
# Using categorical features and scaled numerical features
X_train = np.concatenate((dc_train[selected_cat_vars].values, sc_X_train[selected_cont_vars].values), axis=1)
X_val = np.concatenate((dc_val[selected_cat_vars].values, sc_X_val[selected_cont_vars].values), axis=1)
X_test = np.concatenate((dc_test[selected_cat_vars].values, sc_X_test[selected_cont_vars].values), axis=1)
# print the shapes
X_train.shape, X_val.shape, X_test.shape
# %%
# Obtaining class weights based on the class imbalance ratio
_, num_samples = np.unique(y_train, return_counts=True)
weights = np.max(num_samples)/num_samples
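# With this dataset's roughly 80/20 class split, this yields weights close to {0: 1.0, 1: ~3.9}
# (approximate values), i.e. the minority (churn) class is up-weighted by the imbalance ratio.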
# Define weight dictionary
weights_dict = dict()
class_labels = [0,1]
# Weights associated with classes
for a,b in zip(class_labels,weights):
weights_dict[a] = b
weights_dict
# %%
# Defining model
lr = LogisticRegression(C=1.0, penalty='l2', class_weight=weights_dict, n_jobs=-1)
# train
lr.fit(X_train, y_train)
# %%
print(f'Confusion Matrix: \n{confusion_matrix(y_val, lr.predict(X_val))}')
print(f'Area Under Curve: {roc_auc_score(y_val, lr.predict(X_val))}')
print(f'Recall score: {recall_score(y_val,lr.predict(X_val))}')
print(f'Classification report: \n{classification_report(y_val,lr.predict(X_val))}')
# %%
svm = SVC(C=1.0, kernel="linear", class_weight=weights_dict)
svm.fit(X_train, y_train)
# %%
# Validation metrics
print(f'Confusion Matrix: \n{confusion_matrix(y_val, svm.predict(X_val))}')
print(f'Area Under Curve: {roc_auc_score(y_val, svm.predict(X_val))}')
print(f'Recall score: {recall_score(y_val, svm.predict(X_val))}')
print(f'Classification report: \n{classification_report(y_val, svm.predict(X_val))}')
# %%
pca = PCA(n_components=2)
# Transforming the dataset using PCA
X_pca = pca.fit_transform(X_train)
y = y_train
X_pca.shape, y.shape
# %%
# min and max values
xmin, xmax = X_pca[:, 0].min() - 2, X_pca[:, 0].max() + 2
ymin, ymax = X_pca[:, 1].min() - 2, X_pca[:, 1].max() + 2
# Creating a mesh region where the boundary will be plotted
xx, yy = np.meshgrid(np.arange(xmin, xmax, 0.2),
np.arange(ymin, ymax, 0.2))
# Fitting LR model on 2 features
lr.fit(X_pca, y)
# Fitting SVM model on 2 features
svm.fit(X_pca, y)
# Plotting decision boundary for LR
z1 = lr.predict(np.c_[xx.ravel(), yy.ravel()])
z1 = z1.reshape(xx.shape)
# Plotting decision boundary for SVM
z2 = svm.predict(np.c_[xx.ravel(), yy.ravel()])
z2 = z2.reshape(xx.shape)
# Displaying the result
plt.contourf(xx, yy, z1, alpha=0.4) # LR
plt.contour(xx, yy, z2, alpha=0.4, colors='blue') # SVM
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=y_train, s=50, alpha=0.8)
plt.title('Linear models - LogReg and SVM')
# %%
# Features selected from the RFE process
selected_feats_dt
# %%
# Re-defining X_train and X_val to consider original unscaled continuous features. y_train and y_val remain unaffected
X_train = dc_train[selected_feats_dt].values
X_val = dc_val[selected_feats_dt].values
# Decision tree classifier model
clf = DecisionTreeClassifier(criterion='entropy', class_weight=weights_dict, max_depth=4, max_features=None
, min_samples_split=25, min_samples_leaf=15)
# Fit the model
clf.fit(X_train, y_train)
# Checking the importance of different features of the model
pd.DataFrame({'features': selected_feats_dt,
'importance': clf.feature_importances_
}).sort_values(by='importance', ascending=False)
# %%
# Validation metrics
print(f'Confusion Matrix: {confusion_matrix(y_val, clf.predict(X_val))}')
print(f'Area Under Curve: {roc_auc_score(y_val, clf.predict(X_val))}')
print(f'Recall score: {recall_score(y_val,clf.predict(X_val))}')
print(f'Classification report: \n{classification_report(y_val,clf.predict(X_val))}')
# %%
# Decision tree Classifier
clf = DecisionTreeClassifier(criterion='entropy', class_weight=weights_dict,
max_depth=3, max_features=None,
min_samples_split=25, min_samples_leaf=15)
# We fit the model
clf.fit(X_train, y_train)
# Export now as a dot file
export_graphviz(clf, out_file='tree.dot',
                feature_names=selected_feats_dt,
                class_names=['Did not churn', 'Churned'],
                rounded=True, proportion=False,
                precision=2, filled=True)
# Convert to png using a system command (requires a Graphviz installation)
subprocess.run(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'], check=True)
# Display the rule-set of a single tree
from IPython.display import Image
Image(filename='tree.png')
# %%
## Preparing data and a few common model parameters
# Unscaled features will be used since it's a tree model
X_train = dc_train.drop(columns = ['Exited'], axis = 1)
X_val = dc_val.drop(columns = ['Exited'], axis = 1)
# %%
from utils import CategoricalEncoder, AddFeatures
best_f1_lgb = LGBMClassifier(boosting_type='dart', class_weight={0: 1, 1: 3.0}, min_child_samples=20,
                             n_jobs=-1, importance_type='gain', max_depth=6, num_leaves=63,
                             colsample_bytree=0.6, learning_rate=0.1, n_estimators=201,
                             reg_alpha=1, reg_lambda=1)
best_recall_lgb = LGBMClassifier(boosting_type='dart', num_leaves=31, max_depth=6, learning_rate=0.1,
                                 n_estimators=21, class_weight={0: 1, 1: 3.93}, min_child_samples=2,
                                 colsample_bytree=0.6, reg_alpha=0.3, reg_lambda=1.0,
                                 n_jobs=-1, importance_type='gain')
model = Pipeline(steps = [('categorical_encoding', CategoricalEncoder()),
('add_new_features', AddFeatures()),
('classifier', best_f1_lgb)
])
# Fitting final model on train dataset
model.fit(X_train, y_train)
# Predict target probabilities
val_probs = model.predict_proba(X_val)[:,1]
# Predict target values on val data
val_preds = np.where(val_probs > 0.45, 1, 0) # The probability threshold can be tweaked
# Validation metrics
print(f'Confusion Matrix: {confusion_matrix(y_val,val_preds)}')
print(f'Area Under Curve: {roc_auc_score(y_val,val_preds)}')
print(f'Recall score: {recall_score(y_val,val_preds)}')
print(f'Classification report: \n{classification_report(y_val,val_preds)}')
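# %%
# Optional sketch (not part of the original workflow): sweep a few candidate probability
# thresholds on the validation set to see the recall/precision trade-off before settling
# on a value such as 0.45. The threshold list below is arbitrary.
from sklearn.metrics import precision_score
for thresh in [0.30, 0.40, 0.45, 0.50, 0.60]:
    preds = np.where(val_probs > thresh, 1, 0)
    print(f'threshold={thresh:.2f} | recall={recall_score(y_val, preds):.3f}'
          f' | precision={precision_score(y_val, preds):.3f}')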
# %%
# Save model object
joblib.dump(model, 'final_churn_model_f1_0_45.sav')
# %%
# Load model object
model = joblib.load('final_churn_model_f1_0_45.sav')
X_test = dc_test.drop(columns=['Exited'], axis=1)
# Predict target probabilities
test_probs = model.predict_proba(X_test)[:,1]
# Predict target values on test data
test_preds = np.where(test_probs > 0.45, 1, 0) # Flexibility to tweak the probability threshold
#test_preds = model.predict(X_test)
# Test set metrics
roc_auc_score(y_test, test_preds)
recall_score(y_test, test_preds)
confusion_matrix(y_test, test_preds)
print(classification_report(y_test, test_preds))
# %%
# Adding predictions and their probabilities in the original test dataframe
test = dc_test.copy()
test['predictions'] = test_preds
test['pred_probabilities'] = test_probs
test.sample(5)
# %%
high_churn_list = test[test.pred_probabilities > 0.7].sort_values(by=['pred_probabilities'], ascending=False
).reset_index().drop(columns=['index', 'Exited', 'predictions'], axis=1)
high_churn_list.shape
high_churn_list.head()
# %%
high_churn_list.to_csv('high_churn_list.csv', index=False)
# %%