Credit Card Churn Analysis¶

Made by: Carlos Pérez Franquelo

image

Image taken from Kaggle dataset

1. Import Libraries¶

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, PowerTransformer
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score, make_scorer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

2. Data preprocessing¶

2.1. Preparing the data and first steps¶

In [2]:
df = pd.read_csv("credit_card_churn.csv")
df = df.drop(columns=['CLIENTNUM','Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1','Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2'])

print(f'Number of null values in DataFrame: {df.isna().sum().sum()}')
Number of null values in DataFrame: 0
In [ ]:
df['Attrition_Flag'] = df['Attrition_Flag'].replace({
    'Existing Customer': 0,
    'Attrited Customer': 1
})
In [4]:
df.head()
Out[4]:
Attrition_Flag Customer_Age Gender Dependent_count Education_Level Marital_Status Income_Category Card_Category Months_on_book Total_Relationship_Count Months_Inactive_12_mon Contacts_Count_12_mon Credit_Limit Total_Revolving_Bal Avg_Open_To_Buy Total_Amt_Chng_Q4_Q1 Total_Trans_Amt Total_Trans_Ct Total_Ct_Chng_Q4_Q1 Avg_Utilization_Ratio
0 0 45 M 3 High School Married $60K - $80K Blue 39 5 1 3 12691.0 777 11914.0 1.335 1144 42 1.625 0.061
1 0 49 F 5 Graduate Single Less than $40K Blue 44 6 1 2 8256.0 864 7392.0 1.541 1291 33 3.714 0.105
2 0 51 M 3 Graduate Married $80K - $120K Blue 36 4 1 0 3418.0 0 3418.0 2.594 1887 20 2.333 0.000
3 0 40 F 4 High School Unknown Less than $40K Blue 34 3 4 1 3313.0 2517 796.0 1.405 1171 20 2.333 0.760
4 0 40 M 3 Uneducated Married $60K - $80K Blue 21 5 1 0 4716.0 0 4716.0 2.175 816 28 2.500 0.000
In [5]:
df.nunique()
Out[5]:
Attrition_Flag                 2
Customer_Age                  45
Gender                         2
Dependent_count                6
Education_Level                7
Marital_Status                 4
Income_Category                6
Card_Category                  4
Months_on_book                44
Total_Relationship_Count       6
Months_Inactive_12_mon         7
Contacts_Count_12_mon          7
Credit_Limit                6205
Total_Revolving_Bal         1974
Avg_Open_To_Buy             6813
Total_Amt_Chng_Q4_Q1        1158
Total_Trans_Amt             5033
Total_Trans_Ct               126
Total_Ct_Chng_Q4_Q1          830
Avg_Utilization_Ratio        964
dtype: int64
In [6]:
df.describe()
Out[6]:
Attrition_Flag Customer_Age Dependent_count Months_on_book Total_Relationship_Count Months_Inactive_12_mon Contacts_Count_12_mon Credit_Limit Total_Revolving_Bal Avg_Open_To_Buy Total_Amt_Chng_Q4_Q1 Total_Trans_Amt Total_Trans_Ct Total_Ct_Chng_Q4_Q1 Avg_Utilization_Ratio
count 10127.000000 10127.000000 10127.000000 10127.000000 10127.000000 10127.000000 10127.000000 10127.000000 10127.000000 10127.000000 10127.000000 10127.000000 10127.000000 10127.000000 10127.000000
mean 0.160660 46.325960 2.346203 35.928409 3.812580 2.341167 2.455317 8631.953698 1162.814061 7469.139637 0.759941 4404.086304 64.858695 0.712222 0.274894
std 0.367235 8.016814 1.298908 7.986416 1.554408 1.010622 1.106225 9088.776650 814.987335 9090.685324 0.219207 3397.129254 23.472570 0.238086 0.275691
min 0.000000 26.000000 0.000000 13.000000 1.000000 0.000000 0.000000 1438.300000 0.000000 3.000000 0.000000 510.000000 10.000000 0.000000 0.000000
25% 0.000000 41.000000 1.000000 31.000000 3.000000 2.000000 2.000000 2555.000000 359.000000 1324.500000 0.631000 2155.500000 45.000000 0.582000 0.023000
50% 0.000000 46.000000 2.000000 36.000000 4.000000 2.000000 2.000000 4549.000000 1276.000000 3474.000000 0.736000 3899.000000 67.000000 0.702000 0.176000
75% 0.000000 52.000000 3.000000 40.000000 5.000000 3.000000 3.000000 11067.500000 1784.000000 9859.000000 0.859000 4741.000000 81.000000 0.818000 0.503000
max 1.000000 73.000000 5.000000 56.000000 6.000000 6.000000 6.000000 34516.000000 2517.000000 34516.000000 3.397000 18484.000000 139.000000 3.714000 0.999000
In [7]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10127 entries, 0 to 10126
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Attrition_Flag            10127 non-null  int64  
 1   Customer_Age              10127 non-null  int64  
 2   Gender                    10127 non-null  object 
 3   Dependent_count           10127 non-null  int64  
 4   Education_Level           10127 non-null  object 
 5   Marital_Status            10127 non-null  object 
 6   Income_Category           10127 non-null  object 
 7   Card_Category             10127 non-null  object 
 8   Months_on_book            10127 non-null  int64  
 9   Total_Relationship_Count  10127 non-null  int64  
 10  Months_Inactive_12_mon    10127 non-null  int64  
 11  Contacts_Count_12_mon     10127 non-null  int64  
 12  Credit_Limit              10127 non-null  float64
 13  Total_Revolving_Bal       10127 non-null  int64  
 14  Avg_Open_To_Buy           10127 non-null  float64
 15  Total_Amt_Chng_Q4_Q1      10127 non-null  float64
 16  Total_Trans_Amt           10127 non-null  int64  
 17  Total_Trans_Ct            10127 non-null  int64  
 18  Total_Ct_Chng_Q4_Q1       10127 non-null  float64
 19  Avg_Utilization_Ratio     10127 non-null  float64
dtypes: float64(5), int64(10), object(5)
memory usage: 1.5+ MB

2.2. Visualization¶

In [8]:
df_numbers = df.drop(columns=['Gender','Education_Level','Marital_Status','Income_Category','Card_Category'])
correlation_matrix = df_numbers.corr()

plt.figure(figsize=(11, 9.5))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)

plt.title('Correlation Matrix Heatmap')
plt.savefig('plots/correlation_heatmap.png', bbox_inches='tight')
plt.show()
No description has been provided for this image
In [9]:
plt.hist(
    [df[df['Attrition_Flag']==flag]['Customer_Age'] for flag in df['Attrition_Flag'].unique()],
    bins=16,
    stacked=True,
    label=df['Attrition_Flag'].unique()
)

plt.legend()
plt.show()
No description has been provided for this image
In [10]:
plt.barh(df['Dependent_count'].value_counts().index, df['Dependent_count'].value_counts().values)
Out[10]:
<BarContainer object of 6 artists>
No description has been provided for this image
In [11]:
table = pd.crosstab(df['Dependent_count'], df['Attrition_Flag'])

table_pct = table.div(table.sum(axis=1), axis=0) * 100

ax = table_pct.plot(kind='bar', stacked=True, figsize=(10,6))

for c in ax.containers:
    ax.bar_label(c, fmt='%.0f%%')

plt.ylabel("Percentage")
plt.xticks(rotation=0)
plt.show()
No description has been provided for this image
In [12]:
plt.barh(df['Gender'].value_counts().index, df['Gender'].value_counts().values)
Out[12]:
<BarContainer object of 2 artists>
No description has been provided for this image
In [13]:
table = pd.crosstab(df['Gender'], df['Attrition_Flag'])

table_pct = table.div(table.sum(axis=1), axis=0) * 100

ax = table_pct.plot(kind='bar', stacked=True, figsize=(10,6))

for c in ax.containers:
    ax.bar_label(c, fmt='%.0f%%')

plt.ylabel("Percentage")
plt.xticks(rotation=0)
plt.show()
No description has been provided for this image
In [14]:
plt.barh(df['Education_Level'].value_counts().index, df['Education_Level'].value_counts().values, )
Out[14]:
<BarContainer object of 7 artists>
No description has been provided for this image
In [15]:
table = pd.crosstab(df['Education_Level'], df['Attrition_Flag'])

order = ['Unknown','Uneducated','High School','College','Graduate','Post-Graduate','Doctorate']
table = table.reindex(order)

table_pct = table.div(table.sum(axis=1), axis=0) * 100

ax = table_pct.plot(kind='bar', stacked=True, figsize=(10,6))

for c in ax.containers:
    ax.bar_label(c, fmt='%.0f%%')

plt.ylabel("Percentage")
plt.xticks(rotation=0)
plt.show()
No description has been provided for this image
In [16]:
plt.barh(df['Marital_Status'].value_counts().index, df['Marital_Status'].value_counts().values)
Out[16]:
<BarContainer object of 4 artists>
No description has been provided for this image
In [17]:
table = pd.crosstab(df['Marital_Status'], df['Attrition_Flag'])

table_pct = table.div(table.sum(axis=1), axis=0) * 100

ax = table_pct.plot(kind='bar', stacked=True, figsize=(10,6))

for c in ax.containers:
    ax.bar_label(c, fmt='%.0f%%')

plt.ylabel("Percentage")
plt.xticks(rotation=0)
plt.show()
No description has been provided for this image
In [18]:
plt.barh(df['Income_Category'].value_counts().index, df['Income_Category'].value_counts().values)
Out[18]:
<BarContainer object of 6 artists>
No description has been provided for this image
In [19]:
table = pd.crosstab(df['Income_Category'], df['Attrition_Flag'])

order = ['Less than $40K','$40K - $60K','$60K - $80K','$80K - $120K','$120K +']
table = table.reindex(order)

table_pct = table.div(table.sum(axis=1), axis=0) * 100

ax = table_pct.plot(kind='bar', stacked=True, figsize=(10,6))

for c in ax.containers:
    ax.bar_label(c, fmt='%.0f%%')

plt.ylabel("Percentage")
plt.xticks(rotation=0)
plt.show()
No description has been provided for this image
In [20]:
plt.barh(df['Card_Category'].value_counts().index, df['Card_Category'].value_counts().values)
Out[20]:
<BarContainer object of 4 artists>
No description has been provided for this image
In [21]:
table = pd.crosstab(df['Card_Category'], df['Attrition_Flag'])

order = ['Blue','Silver','Gold','Platinum']
table = table.reindex(order)

table_pct = table.div(table.sum(axis=1), axis=0) * 100

ax = table_pct.plot(kind='bar', stacked=True, figsize=(10,6))

for c in ax.containers:
    ax.bar_label(c, fmt='%.0f%%')

plt.ylabel("Percentage")
plt.xticks(rotation=0)
plt.show()
No description has been provided for this image
In [22]:
plt.hist(
    [df[df['Attrition_Flag']==flag]['Months_on_book'] for flag in df['Attrition_Flag'].unique()],
    bins=16,
    stacked=True,
    label=df['Attrition_Flag'].unique()
)

plt.legend()
plt.show()
No description has been provided for this image
In [23]:
plt.barh(df['Total_Relationship_Count'].value_counts().index, df['Total_Relationship_Count'].value_counts().values)
Out[23]:
<BarContainer object of 6 artists>
No description has been provided for this image
In [24]:
table = pd.crosstab(df['Total_Relationship_Count'], df['Attrition_Flag'])

table_pct = table.div(table.sum(axis=1), axis=0) * 100

ax = table_pct.plot(kind='bar', stacked=True, figsize=(10,6))

for c in ax.containers:
    ax.bar_label(c, fmt='%.0f%%')

plt.ylabel("Percentage")
plt.xticks(rotation=0)
plt.show()
No description has been provided for this image
In [25]:
plt.barh(df['Months_Inactive_12_mon'].value_counts().index, df['Months_Inactive_12_mon'].value_counts().values)
Out[25]:
<BarContainer object of 7 artists>
No description has been provided for this image
In [26]:
table = pd.crosstab(df['Months_Inactive_12_mon'], df['Attrition_Flag'])

table_pct = table.div(table.sum(axis=1), axis=0) * 100

ax = table_pct.plot(kind='bar', stacked=True, figsize=(10,6))

for c in ax.containers:
    ax.bar_label(c, fmt='%.0f%%')

plt.ylabel("Percentage")
plt.xticks(rotation=0)
plt.show()
No description has been provided for this image
In [27]:
plt.barh(df['Contacts_Count_12_mon'].value_counts().index, df['Contacts_Count_12_mon'].value_counts().values)
plt.savefig('plots/bar_graph_contact_count_12_mon.png', bbox_inches='tight')
No description has been provided for this image
In [28]:
table = pd.crosstab(df['Contacts_Count_12_mon'], df['Attrition_Flag'])

table_pct = table.div(table.sum(axis=1), axis=0) * 100

ax = table_pct.plot(kind='bar', stacked=True, figsize=(10,6))

for c in ax.containers:
    ax.bar_label(c, fmt='%.0f%%')

plt.title('% of attrition for every value of Contacts count 12 months')
plt.ylabel("Percentage")
plt.xticks(rotation=0)
plt.savefig('plots/percentual_bars_contacts_count_12_mon.png', bbox_inches='tight')
plt.show()
No description has been provided for this image
In [29]:
plt.hist(
    [df[df['Attrition_Flag']==flag]['Credit_Limit'] for flag in df['Attrition_Flag'].unique()],
    bins=16,
    stacked=True,
    label=df['Attrition_Flag'].unique()
)

plt.legend()
plt.show()
No description has been provided for this image
In [30]:
plt.hist(
    [df[df['Attrition_Flag']==flag]['Total_Revolving_Bal'] for flag in df['Attrition_Flag'].unique()],
    bins=16,
    stacked=True,
    label=df['Attrition_Flag'].unique()
)

plt.legend()
plt.show()
No description has been provided for this image
In [31]:
plt.hist(
    [df[df['Attrition_Flag']==flag]['Avg_Open_To_Buy'] for flag in df['Attrition_Flag'].unique()],
    bins=16,
    stacked=True,
    label=df['Attrition_Flag'].unique()
)

plt.legend()
plt.show()
No description has been provided for this image
In [32]:
plt.hist(
    [df[df['Attrition_Flag']==flag]['Total_Amt_Chng_Q4_Q1'] for flag in df['Attrition_Flag'].unique()],
    bins=16,
    stacked=True,
    label=df['Attrition_Flag'].unique()
)

plt.legend()
plt.show()
No description has been provided for this image
In [33]:
plt.hist(
    [df[df['Attrition_Flag']==flag]['Total_Trans_Amt'] for flag in df['Attrition_Flag'].unique()],
    bins=20,
    stacked=True,
    label=df['Attrition_Flag'].unique()
)

plt.legend()
plt.show()
No description has been provided for this image
In [34]:
plt.hist(
    [df[df['Attrition_Flag']==flag]['Total_Ct_Chng_Q4_Q1'] for flag in df['Attrition_Flag'].unique()],
    bins=16,
    stacked=True,
    label=df['Attrition_Flag'].unique()
)

plt.legend()
plt.show()
No description has been provided for this image
In [35]:
plt.hist(
    [df[df['Attrition_Flag']==flag]['Total_Trans_Ct'] for flag in df['Attrition_Flag'].unique()],
    bins=16,
    stacked=True,
    label=df['Attrition_Flag'].unique()
)

plt.legend()
plt.title('Distribution of Attrition for Total Transaction Count values')
plt.savefig('plots/histogram_total_trans_ct.png', bbox_inches='tight')
plt.show()
No description has been provided for this image
In [36]:
plt.hist(
    [df[df['Attrition_Flag']==flag]['Avg_Utilization_Ratio'] for flag in df['Attrition_Flag'].unique()],
    bins=16,
    stacked=True,
    label=df['Attrition_Flag'].unique()
)

plt.legend()
plt.show()
No description has been provided for this image

2.3. Data preparation¶

In [37]:
X = df.drop(columns=['Attrition_Flag'])
y = df['Attrition_Flag']

X = pd.get_dummies(X, drop_first=True)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y
)
In [38]:
print(y_train.value_counts())
Attrition_Flag
0    6799
1    1302
Name: count, dtype: int64
In [41]:
X_train, y_train = SMOTE(random_state=1).fit_resample(X_train, y_train)
In [42]:
print(y_train.value_counts())
Attrition_Flag
0    6799
1    6799
Name: count, dtype: int64
In [43]:
alt_weight = 3 # This will be the weight we give in our second model to the error we want to minimize
In [44]:
def weighted_metric(y_true, y_pred):
    r = recall_score(y_true, y_pred)
    a = accuracy_score(y_true, y_pred)
    return 0.8 * r + 0.2 * a

custom_scorer = make_scorer(weighted_metric)
In [45]:
scaler = PowerTransformer()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

3. Logistic regression models¶

In [46]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

pipeline = Pipeline([
    ("clf", LogisticRegression(max_iter=1000, solver='saga', random_state=42))
])

param_grid = [
    {
        "clf__penalty": ["l2"],
        "clf__solver": ["saga"],
        "clf__C": [0.01, 0.1, 1, 10, 100, 1000],
    },
    {
        "clf__penalty": ["elasticnet"],
        "clf__solver": ["saga"],
        "clf__C": [0.01, 0.1, 1, 10, 100],
        "clf__l1_ratio": [0.2, 0.5, 0.8],
    },
    {
        "clf__penalty": ["l1"],
        "clf__solver": ["saga"],
        "clf__C": [0.01, 0.1, 1, 10, 100],
    }
]

grid = GridSearchCV(pipeline, param_grid=param_grid, cv=cv, scoring=custom_scorer, n_jobs=-1)
grid.fit(X_train_scaled, y_train)
print(grid.best_params_)
print(round(grid.best_score_, 3))
{'clf__C': 10, 'clf__penalty': 'l2', 'clf__solver': 'saga'}
0.917

3.1. First model (simple)¶

In [47]:
log_reg = LogisticRegression(C=10, penalty='l2', solver='saga')
log_reg.fit(X_train_scaled, y_train)

y_pred = log_reg.predict(X_test_scaled)

print("\nLogistic Regression Results:")
print("Custom metric:", round(weighted_metric(y_test, y_pred), 3))
print("Accuracy:", round(accuracy_score(y_test, y_pred), 3))
print("Recall:", round(recall_score(y_test, y_pred), 3))

print("\nConfussion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
Logistic Regression Results:
Custom metric: 0.725
Accuracy: 0.894
Recall: 0.683

Confussion Matrix:
 [[1590  111]
 [ 103  222]]

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.93      0.94      1701
           1       0.67      0.68      0.67       325

    accuracy                           0.89      2026
   macro avg       0.80      0.81      0.81      2026
weighted avg       0.90      0.89      0.89      2026

In [48]:
coef = pd.DataFrame({
    'Variable': X.columns,
    'Coefficient': log_reg.coef_[0]
}).sort_values(by='Coefficient', ascending=False)

print("\nVariables by weight:")
print(coef)
Variables by weight:
                          Variable  Coefficient
10                 Total_Trans_Amt     2.421791
27  Income_Category_Less than $40K     1.387969
22           Marital_Status_Single     1.257281
16        Education_Level_Graduate     1.143552
17     Education_Level_High School     1.025308
21          Marital_Status_Married     0.998827
20         Education_Level_Unknown     0.988722
24     Income_Category_$40K - $60K     0.905398
19      Education_Level_Uneducated     0.866818
28         Income_Category_Unknown     0.864538
26    Income_Category_$80K - $120K     0.732665
15       Education_Level_Doctorate     0.716404
18   Education_Level_Post-Graduate     0.632730
23          Marital_Status_Unknown     0.626041
25     Income_Category_$60K - $80K     0.572619
6                     Credit_Limit     0.552765
5            Contacts_Count_12_mon     0.437338
4           Months_Inactive_12_mon     0.401792
14                        Gender_M     0.227993
31            Card_Category_Silver     0.137788
29              Card_Category_Gold     0.099461
1                  Dependent_count     0.053719
30          Card_Category_Platinum     0.034419
2                   Months_on_book    -0.041379
0                     Customer_Age    -0.070956
9             Total_Amt_Chng_Q4_Q1    -0.296884
13           Avg_Utilization_Ratio    -0.466619
8                  Avg_Open_To_Buy    -0.674935
12             Total_Ct_Chng_Q4_Q1    -0.716764
3         Total_Relationship_Count    -0.756911
7              Total_Revolving_Bal    -0.771137
11                  Total_Trans_Ct    -3.603026
In [49]:
y_prob = log_reg.predict_proba(X_test_scaled)[:, 1]

thresholds = np.linspace(0, 1, 300)
fnr_list, tnr_list = [], []

for t in thresholds:
    y_pred = (y_prob >= t).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    fnr = fn / (fn + tp)
    tnr = tn / (tn + fp)

    fnr_list.append(fnr)
    tnr_list.append(tnr)

plt.figure(figsize=(8, 5))
plt.plot(fnr_list, tnr_list, color='darkorange', lw=2,
         label='FNR–TNR Curve') 

plt.plot([0, 1], [0, 1], linestyle='--', color='navy', lw=2,
         label='Baseline')

plt.xlabel('False Negative Rate')
plt.ylabel('True Negative Rate')
plt.title('FNR vs TNR Curve')
plt.legend(loc="lower right")
plt.savefig('plots/FNR_TNR_curve_log_reg_1.png', bbox_inches='tight')
plt.show()
No description has been provided for this image

3.2. Second Model (higher loss for false negatives)¶

In [51]:
log_reg2 = LogisticRegression(class_weight={0:1, 1:alt_weight}, C=10, penalty='l2', solver='saga')
log_reg2.fit(X_train_scaled, y_train)

y_pred = log_reg2.predict(X_test_scaled)

print("\nLogistic Regression Results:")
print("Custom metric:", round(weighted_metric(y_test, y_pred), 3))
print("Accuracy:", round(accuracy_score(y_test, y_pred), 3))
print("Recall:", round(recall_score(y_test, y_pred), 3))

print("\nConfussion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
Logistic Regression Results:
Custom metric: 0.855
Accuracy: 0.852
Recall: 0.855

Confussion Matrix:
 [[1449  252]
 [  47  278]]

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.85      0.91      1701
           1       0.52      0.86      0.65       325

    accuracy                           0.85      2026
   macro avg       0.75      0.85      0.78      2026
weighted avg       0.90      0.85      0.87      2026

In [52]:
coef = pd.DataFrame({
    'Variable': X.columns,
    'Coefficient': log_reg2.coef_[0]
}).sort_values(by='Coefficient', ascending=False)

print("\nVariables by weight:")
print(coef)
Variables by weight:
                          Variable  Coefficient
10                 Total_Trans_Amt     2.562894
27  Income_Category_Less than $40K     1.260395
22           Marital_Status_Single     1.037837
16        Education_Level_Graduate     0.976881
17     Education_Level_High School     0.882386
20         Education_Level_Unknown     0.852187
24     Income_Category_$40K - $60K     0.803180
21          Marital_Status_Married     0.792000
19      Education_Level_Uneducated     0.756512
28         Income_Category_Unknown     0.736755
15       Education_Level_Doctorate     0.632740
26    Income_Category_$80K - $120K     0.632589
6                     Credit_Limit     0.587660
18   Education_Level_Post-Graduate     0.568536
23          Marital_Status_Unknown     0.495418
25     Income_Category_$60K - $80K     0.487043
5            Contacts_Count_12_mon     0.461674
4           Months_Inactive_12_mon     0.406590
14                        Gender_M     0.204293
31            Card_Category_Silver     0.117727
29              Card_Category_Gold     0.095425
1                  Dependent_count     0.023210
30          Card_Category_Platinum     0.014729
2                   Months_on_book    -0.038182
0                     Customer_Age    -0.113261
9             Total_Amt_Chng_Q4_Q1    -0.368663
13           Avg_Utilization_Ratio    -0.434431
8                  Avg_Open_To_Buy    -0.639398
3         Total_Relationship_Count    -0.701649
12             Total_Ct_Chng_Q4_Q1    -0.706199
7              Total_Revolving_Bal    -0.781122
11                  Total_Trans_Ct    -3.866591
In [53]:
y_prob = log_reg2.predict_proba(X_test_scaled)[:, 1]

thresholds = np.linspace(0, 1, 300)
fnr_list, tnr_list = [], []

for t in thresholds:
    y_pred = (y_prob >= t).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    fnr = fn / (fn + tp)
    tnr = tn / (tn + fp)

    fnr_list.append(fnr)
    tnr_list.append(tnr)

plt.figure(figsize=(8, 5))
plt.plot(fnr_list, tnr_list, color='darkorange', lw=2,
         label='FNR–TNR Curve') 

plt.plot([0, 1], [0, 1], linestyle='--', color='navy', lw=2,
         label='Baseline')

plt.xlabel('False Negative Rate')
plt.ylabel('True Negative Rate')
plt.title('FNR vs TNR Curve')
plt.legend(loc="lower right")
plt.savefig('plots/FNR_TNR_curve_log_reg_2.png', bbox_inches='tight')
plt.show()
No description has been provided for this image

4. Decision tree¶

In [54]:
dt_model = tree.DecisionTreeClassifier()

dt_grid = {
    "criterion": ["gini"],
    "max_depth": [5, 10, 20, 50],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 5]
}

dt_search = GridSearchCV(
    estimator=dt_model,
    param_grid=dt_grid,
    cv=cv,
    scoring=custom_scorer,
    n_jobs=-1
)

dt_search.fit(X_train, y_train)

print("Best Decision Tree Params:", dt_search.best_params_)
print("Best CV Score:", round(dt_search.best_score_,3))
Best Decision Tree Params: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best CV Score: 0.957

4.1. First decision tree (simple)¶

In [55]:
dt = tree.DecisionTreeClassifier(criterion='gini', max_depth=10, min_samples_leaf=1, min_samples_split=2)
dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)

print("\nLogistic Regression Results:")
print("Custom metric:", round(weighted_metric(y_test, y_pred), 3))
print("Accuracy:", round(accuracy_score(y_test, y_pred), 3))
print("Recall:", round(recall_score(y_test, y_pred), 3))

print("\nConfussion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
Logistic Regression Results:
Custom metric: 0.9
Accuracy: 0.931
Recall: 0.892

Confussion Matrix:
 [[1596  105]
 [  35  290]]

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.94      0.96      1701
           1       0.73      0.89      0.81       325

    accuracy                           0.93      2026
   macro avg       0.86      0.92      0.88      2026
weighted avg       0.94      0.93      0.93      2026

In [56]:
plt.figure(figsize=(18, 12))
tree.plot_tree(dt, max_depth=2, feature_names=list(X_train.columns), filled=True, fontsize=14, impurity=False)
plt.savefig('plots/decision_tree_1.png', bbox_inches='tight')
plt.show()
No description has been provided for this image
In [102]:
importances = dt.feature_importances_

indices = np.argsort(importances)[::-1][:10]   # descending and slice top 10

top_features = X_train.columns[indices]
top_importances = importances[indices]

plt.figure(figsize=(18, 14))
plt.barh(top_features, top_importances)
plt.gca().invert_yaxis()
plt.yticks(fontsize=16)
plt.savefig('plots/decision_tree_top10_features.png', bbox_inches='tight')
plt.show()
No description has been provided for this image

4.2. Second decision tree (weight class change)¶

In [58]:
dt2 = tree.DecisionTreeClassifier(criterion='gini', class_weight={0:1, 1:alt_weight}, max_depth=10, min_samples_leaf=1, min_samples_split=2)
dt2.fit(X_train, y_train)

y_pred = dt2.predict(X_test)

print("\nLogistic Regression Results:")
print("Custom metric:", round(weighted_metric(y_test, y_pred), 3))
print("Accuracy:", round(accuracy_score(y_test, y_pred), 3))
print("Recall:", round(recall_score(y_test, y_pred), 3))

print("\nConfussion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
Logistic Regression Results:
Custom metric: 0.933
Accuracy: 0.923
Recall: 0.935

Confussion Matrix:
 [[1565  136]
 [  21  304]]

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.92      0.95      1701
           1       0.69      0.94      0.79       325

    accuracy                           0.92      2026
   macro avg       0.84      0.93      0.87      2026
weighted avg       0.94      0.92      0.93      2026

In [59]:
plt.figure(figsize=(18, 12))
tree.plot_tree(dt2, max_depth=2, feature_names=list(X_train.columns), filled=True, fontsize=14, impurity=True)
plt.savefig('plots/decision_tree_2.png', bbox_inches='tight')
plt.show()
No description has been provided for this image
In [103]:
importances = dt2.feature_importances_

indices = np.argsort(importances)[::-1][:10]   # descending and slice top 10

top_features = X_train.columns[indices]
top_importances = importances[indices]

plt.figure(figsize=(18, 14))
plt.barh(top_features, top_importances)
plt.gca().invert_yaxis()
plt.yticks(fontsize=16)
plt.savefig('plots/decision_tree_2_top10_features.png', bbox_inches='tight')
plt.show()
No description has been provided for this image

5. Random forest classifier¶

In [61]:
rf_model = RandomForestClassifier()

rf_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [5, 10, 15],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "bootstrap": [True, False]
}

rf_search = GridSearchCV(
    estimator=rf_model,
    param_grid=rf_grid,
    cv=cv,
    scoring=custom_scorer,
    n_jobs=-1
)

rf_search.fit(X_train, y_train)

print("Best Random Forest Params:", rf_search.best_params_)
print("Best CV Score:", rf_search.best_score_)
Best Random Forest Params: {'bootstrap': False, 'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV Score: 0.987498301297369

5.1. First random forest (simple)¶

In [62]:
forest = RandomForestClassifier(bootstrap=False, n_estimators=200, criterion='gini', max_depth=15, min_samples_leaf=1, min_samples_split=2)
forest.fit(X_train, y_train)

y_pred = forest.predict(X_test)

print("\nLogistic Regression Results:")
print("Custom metric:", round(weighted_metric(y_test, y_pred), 3))
print("Accuracy:", round(accuracy_score(y_test, y_pred), 3))
print("Recall:", round(recall_score(y_test, y_pred), 3))

print("\nConfussion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
Logistic Regression Results:
Custom metric: 0.919
Accuracy: 0.964
Recall: 0.908

Confussion Matrix:
 [[1658   43]
 [  30  295]]

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.97      0.98      1701
           1       0.87      0.91      0.89       325

    accuracy                           0.96      2026
   macro avg       0.93      0.94      0.93      2026
weighted avg       0.96      0.96      0.96      2026

In [106]:
estimator = forest.estimators_[0]

plt.figure(figsize=(20, 12))
tree.plot_tree(estimator, filled=True, feature_names=list(X_train.columns), max_depth=2, fontsize=14, impurity=True)
plt.savefig('plots/random_forest_1.png', bbox_inches='tight')
plt.show()
No description has been provided for this image
In [104]:
importances = forest.feature_importances_

indices = np.argsort(importances)[::-1][:10]   # descending and slice top 10

top_features = X_train.columns[indices]
top_importances = importances[indices]

plt.figure(figsize=(18, 14))
plt.barh(top_features, top_importances)
plt.gca().invert_yaxis()
plt.yticks(fontsize=16)
plt.savefig('plots/random_forest_top10_features.png', bbox_inches='tight')
plt.show()
No description has been provided for this image

5.2. Second random forest (weighted for false positive)¶

In [65]:
forest2 = RandomForestClassifier(bootstrap=False, n_estimators=200, criterion='gini', class_weight={0:1, 1:alt_weight}, max_depth=15, min_samples_leaf=1, min_samples_split=2)
forest2.fit(X_train, y_train)

y_pred = forest.predict(X_test)

print("\nLogistic Regression Results:")
print("Custom metric:", round(weighted_metric(y_test, y_pred), 3))
print("Accuracy:", round(accuracy_score(y_test, y_pred), 3))
print("Recall:", round(recall_score(y_test, y_pred), 3))

print("\nConfussion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
Logistic Regression Results:
Custom metric: 0.919
Accuracy: 0.964
Recall: 0.908

Confussion Matrix:
 [[1658   43]
 [  30  295]]

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.97      0.98      1701
           1       0.87      0.91      0.89       325

    accuracy                           0.96      2026
   macro avg       0.93      0.94      0.93      2026
weighted avg       0.96      0.96      0.96      2026

In [66]:
estimator = forest2.estimators_[0]

plt.figure(figsize=(20, 12))
tree.plot_tree(estimator, filled=True, feature_names=list(X_train.columns), max_depth=2, fontsize=14, impurity=True)
plt.savefig('plots/random_forest_2.png', bbox_inches='tight')
plt.show()
No description has been provided for this image
In [105]:
importances = forest2.feature_importances_

indices = np.argsort(importances)[::-1][:10]   # descending and slice top 10

top_features = X_train.columns[indices]
top_importances = importances[indices]

plt.figure(figsize=(18, 14))
plt.barh(top_features, top_importances)
plt.gca().invert_yaxis()
plt.yticks(fontsize=16)
plt.savefig('plots/random_forest_2_top10_features.png', bbox_inches='tight')
plt.show()
No description has been provided for this image

6. Naive Bayes Model¶

In [69]:
bnb_model = BernoulliNB()

bnb_grid = {
    "alpha": np.logspace(-3, 2, 6),
    "binarize": [None, 0.0, 0.2, 0.5, 1.0],
    "fit_prior": [True, False]
}

bnb_search = GridSearchCV(
    estimator=bnb_model,
    param_grid=bnb_grid,
    cv=cv,
    scoring=custom_scorer,
    n_jobs=-1
)

bnb_search.fit(X_train_scaled, y_train)

print("Best Bernoulli NB Params:", bnb_search.best_params_)
print("Best CV Score:", bnb_search.best_score_)
Best Bernoulli NB Params: {'alpha': 0.001, 'binarize': 0.5, 'fit_prior': True}
Best CV Score: 0.8448306026617898

6.1. Naive Bayes model¶

In [70]:
nb = BernoulliNB(alpha=0.001, binarize=0.5, fit_prior=True)
nb.fit(X_train_scaled, y_train)

y_pred = nb.predict(X_test_scaled)

print("\nLogistic Regression Results:")
print("Custom metric:", round(weighted_metric(y_test, y_pred), 3))
print("Accuracy:", round(accuracy_score(y_test, y_pred), 3))
print("Recall:", round(recall_score(y_test, y_pred), 3))

print("\nConfussion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
Logistic Regression Results:
Custom metric: 0.717
Accuracy: 0.805
Recall: 0.695

Confussion Matrix:
 [[1405  296]
 [  99  226]]

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.83      0.88      1701
           1       0.43      0.70      0.53       325

    accuracy                           0.81      2026
   macro avg       0.68      0.76      0.71      2026
weighted avg       0.85      0.81      0.82      2026

In [109]:
log_probs = nb.feature_log_prob_

plt.figure(figsize=(16, 6))
plt.imshow(log_probs, aspect='auto')
plt.colorbar(label='Log Probability')
plt.xticks(ticks=np.arange(len(X_train.columns)), labels=X_train.columns, rotation=90)
plt.yticks(ticks=np.arange(len(nb.classes_)), labels=nb.classes_)
plt.savefig('plots/naive_bayes.png', bbox_inches='tight')
plt.show()
No description has been provided for this image
In [72]:
scaler_nb = MinMaxScaler()
X_train_nb = scaler_nb.fit_transform(X_train)
X_test_nb = scaler_nb.transform(X_test)
X_nb = scaler_nb.transform(X)
In [73]:
nb = MultinomialNB()
nb.fit(X_train_nb, y_train)

y_pred = nb.predict(X_test_nb)

print("\nLogistic Regression Results:")
print("Custom metric:", round(weighted_metric(y_test, y_pred), 3))
print("Accuracy:", round(accuracy_score(y_test, y_pred), 3))
print("Recall:", round(recall_score(y_test, y_pred), 3))

print("\nConfussion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
Logistic Regression Results:
Custom metric: 0.588
Accuracy: 0.812
Recall: 0.532

Confussion Matrix:
 [[1472  229]
 [ 152  173]]

Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.87      0.89      1701
           1       0.43      0.53      0.48       325

    accuracy                           0.81      2026
   macro avg       0.67      0.70      0.68      2026
weighted avg       0.83      0.81      0.82      2026