Credit Card Churn Analysis¶
Made by: Carlos Pérez Franquelo

Image taken from Kaggle dataset
1. Import Libraries¶
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, PowerTransformer
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score, make_scorer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
2. Data preprocessing¶
2.1. Preparing the data and first steps¶
In [2]:
df = pd.read_csv("credit_card_churn.csv")
df = df.drop(columns=['CLIENTNUM','Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1','Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2'])
print(f'Number of null values in DataFrame: {df.isna().sum().sum()}')
Number of null values in DataFrame: 0
In [ ]:
df['Attrition_Flag'] = df['Attrition_Flag'].replace({
'Existing Customer': 0,
'Attrited Customer': 1
})
In [4]:
df.head()
Out[4]:
| Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 45 | M | 3 | High School | Married | $60K - $80K | Blue | 39 | 5 | 1 | 3 | 12691.0 | 777 | 11914.0 | 1.335 | 1144 | 42 | 1.625 | 0.061 |
| 1 | 0 | 49 | F | 5 | Graduate | Single | Less than $40K | Blue | 44 | 6 | 1 | 2 | 8256.0 | 864 | 7392.0 | 1.541 | 1291 | 33 | 3.714 | 0.105 |
| 2 | 0 | 51 | M | 3 | Graduate | Married | $80K - $120K | Blue | 36 | 4 | 1 | 0 | 3418.0 | 0 | 3418.0 | 2.594 | 1887 | 20 | 2.333 | 0.000 |
| 3 | 0 | 40 | F | 4 | High School | Unknown | Less than $40K | Blue | 34 | 3 | 4 | 1 | 3313.0 | 2517 | 796.0 | 1.405 | 1171 | 20 | 2.333 | 0.760 |
| 4 | 0 | 40 | M | 3 | Uneducated | Married | $60K - $80K | Blue | 21 | 5 | 1 | 0 | 4716.0 | 0 | 4716.0 | 2.175 | 816 | 28 | 2.500 | 0.000 |
In [5]:
df.nunique()
Out[5]:
Attrition_Flag 2 Customer_Age 45 Gender 2 Dependent_count 6 Education_Level 7 Marital_Status 4 Income_Category 6 Card_Category 4 Months_on_book 44 Total_Relationship_Count 6 Months_Inactive_12_mon 7 Contacts_Count_12_mon 7 Credit_Limit 6205 Total_Revolving_Bal 1974 Avg_Open_To_Buy 6813 Total_Amt_Chng_Q4_Q1 1158 Total_Trans_Amt 5033 Total_Trans_Ct 126 Total_Ct_Chng_Q4_Q1 830 Avg_Utilization_Ratio 964 dtype: int64
In [6]:
df.describe()
Out[6]:
| Attrition_Flag | Customer_Age | Dependent_count | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 |
| mean | 0.160660 | 46.325960 | 2.346203 | 35.928409 | 3.812580 | 2.341167 | 2.455317 | 8631.953698 | 1162.814061 | 7469.139637 | 0.759941 | 4404.086304 | 64.858695 | 0.712222 | 0.274894 |
| std | 0.367235 | 8.016814 | 1.298908 | 7.986416 | 1.554408 | 1.010622 | 1.106225 | 9088.776650 | 814.987335 | 9090.685324 | 0.219207 | 3397.129254 | 23.472570 | 0.238086 | 0.275691 |
| min | 0.000000 | 26.000000 | 0.000000 | 13.000000 | 1.000000 | 0.000000 | 0.000000 | 1438.300000 | 0.000000 | 3.000000 | 0.000000 | 510.000000 | 10.000000 | 0.000000 | 0.000000 |
| 25% | 0.000000 | 41.000000 | 1.000000 | 31.000000 | 3.000000 | 2.000000 | 2.000000 | 2555.000000 | 359.000000 | 1324.500000 | 0.631000 | 2155.500000 | 45.000000 | 0.582000 | 0.023000 |
| 50% | 0.000000 | 46.000000 | 2.000000 | 36.000000 | 4.000000 | 2.000000 | 2.000000 | 4549.000000 | 1276.000000 | 3474.000000 | 0.736000 | 3899.000000 | 67.000000 | 0.702000 | 0.176000 |
| 75% | 0.000000 | 52.000000 | 3.000000 | 40.000000 | 5.000000 | 3.000000 | 3.000000 | 11067.500000 | 1784.000000 | 9859.000000 | 0.859000 | 4741.000000 | 81.000000 | 0.818000 | 0.503000 |
| max | 1.000000 | 73.000000 | 5.000000 | 56.000000 | 6.000000 | 6.000000 | 6.000000 | 34516.000000 | 2517.000000 | 34516.000000 | 3.397000 | 18484.000000 | 139.000000 | 3.714000 | 0.999000 |
In [7]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Attrition_Flag 10127 non-null int64 1 Customer_Age 10127 non-null int64 2 Gender 10127 non-null object 3 Dependent_count 10127 non-null int64 4 Education_Level 10127 non-null object 5 Marital_Status 10127 non-null object 6 Income_Category 10127 non-null object 7 Card_Category 10127 non-null object 8 Months_on_book 10127 non-null int64 9 Total_Relationship_Count 10127 non-null int64 10 Months_Inactive_12_mon 10127 non-null int64 11 Contacts_Count_12_mon 10127 non-null int64 12 Credit_Limit 10127 non-null float64 13 Total_Revolving_Bal 10127 non-null int64 14 Avg_Open_To_Buy 10127 non-null float64 15 Total_Amt_Chng_Q4_Q1 10127 non-null float64 16 Total_Trans_Amt 10127 non-null int64 17 Total_Trans_Ct 10127 non-null int64 18 Total_Ct_Chng_Q4_Q1 10127 non-null float64 19 Avg_Utilization_Ratio 10127 non-null float64 dtypes: float64(5), int64(10), object(5) memory usage: 1.5+ MB
2.2. Visualization¶
In [8]:
df_numbers = df.drop(columns=['Gender','Education_Level','Marital_Status','Income_Category','Card_Category'])
correlation_matrix = df_numbers.corr()
plt.figure(figsize=(11, 9.5))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix Heatmap')
plt.savefig('plots/correlation_heatmap.png', bbox_inches='tight')
plt.show()
In [9]:
plt.hist(
[df[df['Attrition_Flag']==flag]['Customer_Age'] for flag in df['Attrition_Flag'].unique()],
bins=16,
stacked=True,
label=df['Attrition_Flag'].unique()
)
plt.legend()
plt.show()
In [10]:
plt.barh(df['Dependent_count'].value_counts().index, df['Dependent_count'].value_counts().values)
Out[10]:
<BarContainer object of 6 artists>
In [11]:
table = pd.crosstab(df['Dependent_count'], df['Attrition_Flag'])
table_pct = table.div(table.sum(axis=1), axis=0) * 100
ax = table_pct.plot(kind='bar', stacked=True, figsize=(10,6))
for c in ax.containers:
ax.bar_label(c, fmt='%.0f%%')
plt.ylabel("Percentage")
plt.xticks(rotation=0)
plt.show()
In [12]:
plt.barh(df['Gender'].value_counts().index, df['Gender'].value_counts().values)
Out[12]:
<BarContainer object of 2 artists>
In [13]:
table = pd.crosstab(df['Gender'], df['Attrition_Flag'])
table_pct = table.div(table.sum(axis=1), axis=0) * 100
ax = table_pct.plot(kind='bar', stacked=True, figsize=(10,6))
for c in ax.containers:
ax.bar_label(c, fmt='%.0f%%')
plt.ylabel("Percentage")
plt.xticks(rotation=0)
plt.show()
In [14]:
plt.barh(df['Education_Level'].value_counts().index, df['Education_Level'].value_counts().values, )
Out[14]:
<BarContainer object of 7 artists>
In [15]:
table = pd.crosstab(df['Education_Level'], df['Attrition_Flag'])
order = ['Unknown','Uneducated','High School','College','Graduate','Post-Graduate','Doctorate']
table = table.reindex(order)
table_pct = table.div(table.sum(axis=1), axis=0) * 100
ax = table_pct.plot(kind='bar', stacked=True, figsize=(10,6))
for c in ax.containers:
ax.bar_label(c, fmt='%.0f%%')
plt.ylabel("Percentage")
plt.xticks(rotation=0)
plt.show()
In [16]:
plt.barh(df['Marital_Status'].value_counts().index, df['Marital_Status'].value_counts().values)
Out[16]:
<BarContainer object of 4 artists>
In [17]:
table = pd.crosstab(df['Marital_Status'], df['Attrition_Flag'])
table_pct = table.div(table.sum(axis=1), axis=0) * 100
ax = table_pct.plot(kind='bar', stacked=True, figsize=(10,6))
for c in ax.containers:
ax.bar_label(c, fmt='%.0f%%')
plt.ylabel("Percentage")
plt.xticks(rotation=0)
plt.show()
In [18]:
plt.barh(df['Income_Category'].value_counts().index, df['Income_Category'].value_counts().values)
Out[18]:
<BarContainer object of 6 artists>
In [19]:
table = pd.crosstab(df['Income_Category'], df['Attrition_Flag'])
order = ['Less than $40K','$40K - $60K','$60K - $80K','$80K - $120K','$120K +']
table = table.reindex(order)
table_pct = table.div(table.sum(axis=1), axis=0) * 100
ax = table_pct.plot(kind='bar', stacked=True, figsize=(10,6))
for c in ax.containers:
ax.bar_label(c, fmt='%.0f%%')
plt.ylabel("Percentage")
plt.xticks(rotation=0)
plt.show()
In [20]:
plt.barh(df['Card_Category'].value_counts().index, df['Card_Category'].value_counts().values)
Out[20]:
<BarContainer object of 4 artists>
In [21]:
table = pd.crosstab(df['Card_Category'], df['Attrition_Flag'])
order = ['Blue','Silver','Gold','Platinum']
table = table.reindex(order)
table_pct = table.div(table.sum(axis=1), axis=0) * 100
ax = table_pct.plot(kind='bar', stacked=True, figsize=(10,6))
for c in ax.containers:
ax.bar_label(c, fmt='%.0f%%')
plt.ylabel("Percentage")
plt.xticks(rotation=0)
plt.show()
In [22]:
plt.hist(
[df[df['Attrition_Flag']==flag]['Months_on_book'] for flag in df['Attrition_Flag'].unique()],
bins=16,
stacked=True,
label=df['Attrition_Flag'].unique()
)
plt.legend()
plt.show()
In [23]:
plt.barh(df['Total_Relationship_Count'].value_counts().index, df['Total_Relationship_Count'].value_counts().values)
Out[23]:
<BarContainer object of 6 artists>
In [24]:
table = pd.crosstab(df['Total_Relationship_Count'], df['Attrition_Flag'])
table_pct = table.div(table.sum(axis=1), axis=0) * 100
ax = table_pct.plot(kind='bar', stacked=True, figsize=(10,6))
for c in ax.containers:
ax.bar_label(c, fmt='%.0f%%')
plt.ylabel("Percentage")
plt.xticks(rotation=0)
plt.show()
In [25]:
plt.barh(df['Months_Inactive_12_mon'].value_counts().index, df['Months_Inactive_12_mon'].value_counts().values)
Out[25]:
<BarContainer object of 7 artists>
In [26]:
table = pd.crosstab(df['Months_Inactive_12_mon'], df['Attrition_Flag'])
table_pct = table.div(table.sum(axis=1), axis=0) * 100
ax = table_pct.plot(kind='bar', stacked=True, figsize=(10,6))
for c in ax.containers:
ax.bar_label(c, fmt='%.0f%%')
plt.ylabel("Percentage")
plt.xticks(rotation=0)
plt.show()
In [27]:
plt.barh(df['Contacts_Count_12_mon'].value_counts().index, df['Contacts_Count_12_mon'].value_counts().values)
plt.savefig('plots/bar_graph_contact_count_12_mon.png', bbox_inches='tight')
In [28]:
table = pd.crosstab(df['Contacts_Count_12_mon'], df['Attrition_Flag'])
table_pct = table.div(table.sum(axis=1), axis=0) * 100
ax = table_pct.plot(kind='bar', stacked=True, figsize=(10,6))
for c in ax.containers:
ax.bar_label(c, fmt='%.0f%%')
plt.title('% of attrition for every value of Contacts count 12 months')
plt.ylabel("Percentage")
plt.xticks(rotation=0)
plt.savefig('plots/percentual_bars_contacts_count_12_mon.png', bbox_inches='tight')
plt.show()
In [29]:
plt.hist(
[df[df['Attrition_Flag']==flag]['Credit_Limit'] for flag in df['Attrition_Flag'].unique()],
bins=16,
stacked=True,
label=df['Attrition_Flag'].unique()
)
plt.legend()
plt.show()
In [30]:
plt.hist(
[df[df['Attrition_Flag']==flag]['Total_Revolving_Bal'] for flag in df['Attrition_Flag'].unique()],
bins=16,
stacked=True,
label=df['Attrition_Flag'].unique()
)
plt.legend()
plt.show()
In [31]:
plt.hist(
[df[df['Attrition_Flag']==flag]['Avg_Open_To_Buy'] for flag in df['Attrition_Flag'].unique()],
bins=16,
stacked=True,
label=df['Attrition_Flag'].unique()
)
plt.legend()
plt.show()
In [32]:
plt.hist(
[df[df['Attrition_Flag']==flag]['Total_Amt_Chng_Q4_Q1'] for flag in df['Attrition_Flag'].unique()],
bins=16,
stacked=True,
label=df['Attrition_Flag'].unique()
)
plt.legend()
plt.show()
In [33]:
plt.hist(
[df[df['Attrition_Flag']==flag]['Total_Trans_Amt'] for flag in df['Attrition_Flag'].unique()],
bins=20,
stacked=True,
label=df['Attrition_Flag'].unique()
)
plt.legend()
plt.show()
In [34]:
plt.hist(
[df[df['Attrition_Flag']==flag]['Total_Ct_Chng_Q4_Q1'] for flag in df['Attrition_Flag'].unique()],
bins=16,
stacked=True,
label=df['Attrition_Flag'].unique()
)
plt.legend()
plt.show()
In [35]:
plt.hist(
[df[df['Attrition_Flag']==flag]['Total_Trans_Ct'] for flag in df['Attrition_Flag'].unique()],
bins=16,
stacked=True,
label=df['Attrition_Flag'].unique()
)
plt.legend()
plt.title('Distribution of Attrition for Total Transaction Count values')
plt.savefig('plots/histogram_total_trans_ct.png', bbox_inches='tight')
plt.show()
In [36]:
plt.hist(
[df[df['Attrition_Flag']==flag]['Avg_Utilization_Ratio'] for flag in df['Attrition_Flag'].unique()],
bins=16,
stacked=True,
label=df['Attrition_Flag'].unique()
)
plt.legend()
plt.show()
2.3. Data preparation¶
In [37]:
X = df.drop(columns=['Attrition_Flag'])
y = df['Attrition_Flag']
X = pd.get_dummies(X, drop_first=True)
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=1, stratify=y
)
In [38]:
print(y_train.value_counts())
Attrition_Flag 0 6799 1 1302 Name: count, dtype: int64
In [41]:
X_train, y_train = SMOTE(random_state=1).fit_resample(X_train, y_train)
In [42]:
print(y_train.value_counts())
Attrition_Flag 0 6799 1 6799 Name: count, dtype: int64
In [43]:
alt_weight = 3 # This will be the weight we give in our second model to the error we want to minimize
In [44]:
def weighted_metric(y_true, y_pred):
r = recall_score(y_true, y_pred)
a = accuracy_score(y_true, y_pred)
return 0.8 * r + 0.2 * a
custom_scorer = make_scorer(weighted_metric)
In [45]:
scaler = PowerTransformer()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
3. Logistic regression models¶
In [46]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
pipeline = Pipeline([
("clf", LogisticRegression(max_iter=1000, solver='saga', random_state=42))
])
param_grid = [
{
"clf__penalty": ["l2"],
"clf__solver": ["saga"],
"clf__C": [0.01, 0.1, 1, 10, 100, 1000],
},
{
"clf__penalty": ["elasticnet"],
"clf__solver": ["saga"],
"clf__C": [0.01, 0.1, 1, 10, 100],
"clf__l1_ratio": [0.2, 0.5, 0.8],
},
{
"clf__penalty": ["l1"],
"clf__solver": ["saga"],
"clf__C": [0.01, 0.1, 1, 10, 100],
}
]
grid = GridSearchCV(pipeline, param_grid=param_grid, cv=cv, scoring=custom_scorer, n_jobs=-1)
grid.fit(X_train_scaled, y_train)
print(grid.best_params_)
print(round(grid.best_score_, 3))
{'clf__C': 10, 'clf__penalty': 'l2', 'clf__solver': 'saga'}
0.917
3.1. First model (simple)¶
In [47]:
log_reg = LogisticRegression(C=10, penalty='l2', solver='saga')
log_reg.fit(X_train_scaled, y_train)
y_pred = log_reg.predict(X_test_scaled)
print("\nLogistic Regression Results:")
print("Custom metric:", round(weighted_metric(y_test, y_pred), 3))
print("Accuracy:", round(accuracy_score(y_test, y_pred), 3))
print("Recall:", round(recall_score(y_test, y_pred), 3))
print("\nConfussion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
Logistic Regression Results:
Custom metric: 0.725
Accuracy: 0.894
Recall: 0.683
Confussion Matrix:
[[1590 111]
[ 103 222]]
Classification Report:
precision recall f1-score support
0 0.94 0.93 0.94 1701
1 0.67 0.68 0.67 325
accuracy 0.89 2026
macro avg 0.80 0.81 0.81 2026
weighted avg 0.90 0.89 0.89 2026
In [48]:
coef = pd.DataFrame({
'Variable': X.columns,
'Coefficient': log_reg.coef_[0]
}).sort_values(by='Coefficient', ascending=False)
print("\nVariables by weight:")
print(coef)
Variables by weight:
Variable Coefficient
10 Total_Trans_Amt 2.421791
27 Income_Category_Less than $40K 1.387969
22 Marital_Status_Single 1.257281
16 Education_Level_Graduate 1.143552
17 Education_Level_High School 1.025308
21 Marital_Status_Married 0.998827
20 Education_Level_Unknown 0.988722
24 Income_Category_$40K - $60K 0.905398
19 Education_Level_Uneducated 0.866818
28 Income_Category_Unknown 0.864538
26 Income_Category_$80K - $120K 0.732665
15 Education_Level_Doctorate 0.716404
18 Education_Level_Post-Graduate 0.632730
23 Marital_Status_Unknown 0.626041
25 Income_Category_$60K - $80K 0.572619
6 Credit_Limit 0.552765
5 Contacts_Count_12_mon 0.437338
4 Months_Inactive_12_mon 0.401792
14 Gender_M 0.227993
31 Card_Category_Silver 0.137788
29 Card_Category_Gold 0.099461
1 Dependent_count 0.053719
30 Card_Category_Platinum 0.034419
2 Months_on_book -0.041379
0 Customer_Age -0.070956
9 Total_Amt_Chng_Q4_Q1 -0.296884
13 Avg_Utilization_Ratio -0.466619
8 Avg_Open_To_Buy -0.674935
12 Total_Ct_Chng_Q4_Q1 -0.716764
3 Total_Relationship_Count -0.756911
7 Total_Revolving_Bal -0.771137
11 Total_Trans_Ct -3.603026
In [49]:
y_prob = log_reg.predict_proba(X_test_scaled)[:, 1]
thresholds = np.linspace(0, 1, 300)
fnr_list, tnr_list = [], []
for t in thresholds:
y_pred = (y_prob >= t).astype(int)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
fnr = fn / (fn + tp)
tnr = tn / (tn + fp)
fnr_list.append(fnr)
tnr_list.append(tnr)
plt.figure(figsize=(8, 5))
plt.plot(fnr_list, tnr_list, color='darkorange', lw=2,
label='FNR–TNR Curve')
plt.plot([0, 1], [0, 1], linestyle='--', color='navy', lw=2,
label='Baseline')
plt.xlabel('False Negative Rate')
plt.ylabel('True Negative Rate')
plt.title('FNR vs TNR Curve')
plt.legend(loc="lower right")
plt.savefig('plots/FNR_TNR_curve_log_reg_1.png', bbox_inches='tight')
plt.show()
3.2. Second Model (higher loss for false negatives)¶
In [51]:
log_reg2 = LogisticRegression(class_weight={0:1, 1:alt_weight}, C=10, penalty='l2', solver='saga')
log_reg2.fit(X_train_scaled, y_train)
y_pred = log_reg2.predict(X_test_scaled)
print("\nLogistic Regression Results:")
print("Custom metric:", round(weighted_metric(y_test, y_pred), 3))
print("Accuracy:", round(accuracy_score(y_test, y_pred), 3))
print("Recall:", round(recall_score(y_test, y_pred), 3))
print("\nConfussion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
Logistic Regression Results:
Custom metric: 0.855
Accuracy: 0.852
Recall: 0.855
Confussion Matrix:
[[1449 252]
[ 47 278]]
Classification Report:
precision recall f1-score support
0 0.97 0.85 0.91 1701
1 0.52 0.86 0.65 325
accuracy 0.85 2026
macro avg 0.75 0.85 0.78 2026
weighted avg 0.90 0.85 0.87 2026
In [52]:
coef = pd.DataFrame({
'Variable': X.columns,
'Coefficient': log_reg2.coef_[0]
}).sort_values(by='Coefficient', ascending=False)
print("\nVariables by weight:")
print(coef)
Variables by weight:
Variable Coefficient
10 Total_Trans_Amt 2.562894
27 Income_Category_Less than $40K 1.260395
22 Marital_Status_Single 1.037837
16 Education_Level_Graduate 0.976881
17 Education_Level_High School 0.882386
20 Education_Level_Unknown 0.852187
24 Income_Category_$40K - $60K 0.803180
21 Marital_Status_Married 0.792000
19 Education_Level_Uneducated 0.756512
28 Income_Category_Unknown 0.736755
15 Education_Level_Doctorate 0.632740
26 Income_Category_$80K - $120K 0.632589
6 Credit_Limit 0.587660
18 Education_Level_Post-Graduate 0.568536
23 Marital_Status_Unknown 0.495418
25 Income_Category_$60K - $80K 0.487043
5 Contacts_Count_12_mon 0.461674
4 Months_Inactive_12_mon 0.406590
14 Gender_M 0.204293
31 Card_Category_Silver 0.117727
29 Card_Category_Gold 0.095425
1 Dependent_count 0.023210
30 Card_Category_Platinum 0.014729
2 Months_on_book -0.038182
0 Customer_Age -0.113261
9 Total_Amt_Chng_Q4_Q1 -0.368663
13 Avg_Utilization_Ratio -0.434431
8 Avg_Open_To_Buy -0.639398
3 Total_Relationship_Count -0.701649
12 Total_Ct_Chng_Q4_Q1 -0.706199
7 Total_Revolving_Bal -0.781122
11 Total_Trans_Ct -3.866591
In [53]:
y_prob = log_reg2.predict_proba(X_test_scaled)[:, 1]
thresholds = np.linspace(0, 1, 300)
fnr_list, tnr_list = [], []
for t in thresholds:
y_pred = (y_prob >= t).astype(int)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
fnr = fn / (fn + tp)
tnr = tn / (tn + fp)
fnr_list.append(fnr)
tnr_list.append(tnr)
plt.figure(figsize=(8, 5))
plt.plot(fnr_list, tnr_list, color='darkorange', lw=2,
label='FNR–TNR Curve')
plt.plot([0, 1], [0, 1], linestyle='--', color='navy', lw=2,
label='Baseline')
plt.xlabel('False Negative Rate')
plt.ylabel('True Negative Rate')
plt.title('FNR vs TNR Curve')
plt.legend(loc="lower right")
plt.savefig('plots/FNR_TNR_curve_log_reg_2.png', bbox_inches='tight')
plt.show()
4. Decision tree¶
In [54]:
dt_model = tree.DecisionTreeClassifier()
dt_grid = {
"criterion": ["gini"],
"max_depth": [5, 10, 20, 50],
"min_samples_split": [2, 5, 10],
"min_samples_leaf": [1, 2, 5]
}
dt_search = GridSearchCV(
estimator=dt_model,
param_grid=dt_grid,
cv=cv,
scoring=custom_scorer,
n_jobs=-1
)
dt_search.fit(X_train, y_train)
print("Best Decision Tree Params:", dt_search.best_params_)
print("Best CV Score:", round(dt_search.best_score_,3))
Best Decision Tree Params: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best CV Score: 0.957
4.1. First decision tree (simple)¶
In [55]:
dt = tree.DecisionTreeClassifier(criterion='gini', max_depth=10, min_samples_leaf=1, min_samples_split=2)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
print("\nLogistic Regression Results:")
print("Custom metric:", round(weighted_metric(y_test, y_pred), 3))
print("Accuracy:", round(accuracy_score(y_test, y_pred), 3))
print("Recall:", round(recall_score(y_test, y_pred), 3))
print("\nConfussion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
Logistic Regression Results:
Custom metric: 0.9
Accuracy: 0.931
Recall: 0.892
Confussion Matrix:
[[1596 105]
[ 35 290]]
Classification Report:
precision recall f1-score support
0 0.98 0.94 0.96 1701
1 0.73 0.89 0.81 325
accuracy 0.93 2026
macro avg 0.86 0.92 0.88 2026
weighted avg 0.94 0.93 0.93 2026
In [56]:
plt.figure(figsize=(18, 12))
tree.plot_tree(dt, max_depth=2, feature_names=list(X_train.columns), filled=True, fontsize=14, impurity=False)
plt.savefig('plots/decision_tree_1.png', bbox_inches='tight')
plt.show()
In [102]:
importances = dt.feature_importances_
indices = np.argsort(importances)[::-1][:10] # descending and slice top 10
top_features = X_train.columns[indices]
top_importances = importances[indices]
plt.figure(figsize=(18, 14))
plt.barh(top_features, top_importances)
plt.gca().invert_yaxis()
plt.yticks(fontsize=16)
plt.savefig('plots/decision_tree_top10_features.png', bbox_inches='tight')
plt.show()
4.2. Second decision tree (weight class change)¶
In [58]:
dt2 = tree.DecisionTreeClassifier(criterion='gini', class_weight={0:1, 1:alt_weight}, max_depth=10, min_samples_leaf=1, min_samples_split=2)
dt2.fit(X_train, y_train)
y_pred = dt2.predict(X_test)
print("\nLogistic Regression Results:")
print("Custom metric:", round(weighted_metric(y_test, y_pred), 3))
print("Accuracy:", round(accuracy_score(y_test, y_pred), 3))
print("Recall:", round(recall_score(y_test, y_pred), 3))
print("\nConfussion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
Logistic Regression Results:
Custom metric: 0.933
Accuracy: 0.923
Recall: 0.935
Confussion Matrix:
[[1565 136]
[ 21 304]]
Classification Report:
precision recall f1-score support
0 0.99 0.92 0.95 1701
1 0.69 0.94 0.79 325
accuracy 0.92 2026
macro avg 0.84 0.93 0.87 2026
weighted avg 0.94 0.92 0.93 2026
In [59]:
plt.figure(figsize=(18, 12))
tree.plot_tree(dt2, max_depth=2, feature_names=list(X_train.columns), filled=True, fontsize=14, impurity=True)
plt.savefig('plots/decision_tree_2.png', bbox_inches='tight')
plt.show()
In [103]:
importances = dt2.feature_importances_
indices = np.argsort(importances)[::-1][:10] # descending and slice top 10
top_features = X_train.columns[indices]
top_importances = importances[indices]
plt.figure(figsize=(18, 14))
plt.barh(top_features, top_importances)
plt.gca().invert_yaxis()
plt.yticks(fontsize=16)
plt.savefig('plots/decision_tree_2_top10_features.png', bbox_inches='tight')
plt.show()
5. Random forest classifier¶
In [61]:
rf_model = RandomForestClassifier()
rf_grid = {
"n_estimators": [50, 100, 200],
"max_depth": [5, 10, 15],
"min_samples_split": [2, 5, 10],
"min_samples_leaf": [1, 2, 4],
"bootstrap": [True, False]
}
rf_search = GridSearchCV(
estimator=rf_model,
param_grid=rf_grid,
cv=cv,
scoring=custom_scorer,
n_jobs=-1
)
rf_search.fit(X_train, y_train)
print("Best Random Forest Params:", rf_search.best_params_)
print("Best CV Score:", rf_search.best_score_)
Best Random Forest Params: {'bootstrap': False, 'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV Score: 0.987498301297369
5.1. First random forest (simple)¶
In [62]:
forest = RandomForestClassifier(bootstrap=False, n_estimators=200, criterion='gini', max_depth=15, min_samples_leaf=1, min_samples_split=2)
forest.fit(X_train, y_train)
y_pred = forest.predict(X_test)
print("\nLogistic Regression Results:")
print("Custom metric:", round(weighted_metric(y_test, y_pred), 3))
print("Accuracy:", round(accuracy_score(y_test, y_pred), 3))
print("Recall:", round(recall_score(y_test, y_pred), 3))
print("\nConfussion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
Logistic Regression Results:
Custom metric: 0.919
Accuracy: 0.964
Recall: 0.908
Confussion Matrix:
[[1658 43]
[ 30 295]]
Classification Report:
precision recall f1-score support
0 0.98 0.97 0.98 1701
1 0.87 0.91 0.89 325
accuracy 0.96 2026
macro avg 0.93 0.94 0.93 2026
weighted avg 0.96 0.96 0.96 2026
In [106]:
estimator = forest.estimators_[0]
plt.figure(figsize=(20, 12))
tree.plot_tree(estimator, filled=True, feature_names=list(X_train.columns), max_depth=2, fontsize=14, impurity=True)
plt.savefig('plots/random_forest_1.png', bbox_inches='tight')
plt.show()
In [104]:
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1][:10] # descending and slice top 10
top_features = X_train.columns[indices]
top_importances = importances[indices]
plt.figure(figsize=(18, 14))
plt.barh(top_features, top_importances)
plt.gca().invert_yaxis()
plt.yticks(fontsize=16)
plt.savefig('plots/random_forest_top10_features.png', bbox_inches='tight')
plt.show()
5.2. Second random forest (weighted for false positive)¶
In [65]:
forest2 = RandomForestClassifier(bootstrap=False, n_estimators=200, criterion='gini', class_weight={0:1, 1:alt_weight}, max_depth=15, min_samples_leaf=1, min_samples_split=2)
forest2.fit(X_train, y_train)
y_pred = forest.predict(X_test)
print("\nLogistic Regression Results:")
print("Custom metric:", round(weighted_metric(y_test, y_pred), 3))
print("Accuracy:", round(accuracy_score(y_test, y_pred), 3))
print("Recall:", round(recall_score(y_test, y_pred), 3))
print("\nConfussion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
Logistic Regression Results:
Custom metric: 0.919
Accuracy: 0.964
Recall: 0.908
Confussion Matrix:
[[1658 43]
[ 30 295]]
Classification Report:
precision recall f1-score support
0 0.98 0.97 0.98 1701
1 0.87 0.91 0.89 325
accuracy 0.96 2026
macro avg 0.93 0.94 0.93 2026
weighted avg 0.96 0.96 0.96 2026
In [66]:
estimator = forest2.estimators_[0]
plt.figure(figsize=(20, 12))
tree.plot_tree(estimator, filled=True, feature_names=list(X_train.columns), max_depth=2, fontsize=14, impurity=True)
plt.savefig('plots/random_forest_2.png', bbox_inches='tight')
plt.show()
In [105]:
importances = forest2.feature_importances_
indices = np.argsort(importances)[::-1][:10] # descending and slice top 10
top_features = X_train.columns[indices]
top_importances = importances[indices]
plt.figure(figsize=(18, 14))
plt.barh(top_features, top_importances)
plt.gca().invert_yaxis()
plt.yticks(fontsize=16)
plt.savefig('plots/random_forest_2_top10_features.png', bbox_inches='tight')
plt.show()
6. Naive Bayes Model¶
In [69]:
bnb_model = BernoulliNB()
bnb_grid = {
"alpha": np.logspace(-3, 2, 6),
"binarize": [None, 0.0, 0.2, 0.5, 1.0],
"fit_prior": [True, False]
}
bnb_search = GridSearchCV(
estimator=bnb_model,
param_grid=bnb_grid,
cv=cv,
scoring=custom_scorer,
n_jobs=-1
)
bnb_search.fit(X_train_scaled, y_train)
print("Best Bernoulli NB Params:", bnb_search.best_params_)
print("Best CV Score:", bnb_search.best_score_)
Best Bernoulli NB Params: {'alpha': 0.001, 'binarize': 0.5, 'fit_prior': True}
Best CV Score: 0.8448306026617898
6.1. Naive Bayes model¶
In [70]:
nb = BernoulliNB(alpha=0.001, binarize=0.5, fit_prior=True)
nb.fit(X_train_scaled, y_train)
y_pred = nb.predict(X_test_scaled)
print("\nLogistic Regression Results:")
print("Custom metric:", round(weighted_metric(y_test, y_pred), 3))
print("Accuracy:", round(accuracy_score(y_test, y_pred), 3))
print("Recall:", round(recall_score(y_test, y_pred), 3))
print("\nConfussion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
Logistic Regression Results:
Custom metric: 0.717
Accuracy: 0.805
Recall: 0.695
Confussion Matrix:
[[1405 296]
[ 99 226]]
Classification Report:
precision recall f1-score support
0 0.93 0.83 0.88 1701
1 0.43 0.70 0.53 325
accuracy 0.81 2026
macro avg 0.68 0.76 0.71 2026
weighted avg 0.85 0.81 0.82 2026
In [109]:
log_probs = nb.feature_log_prob_
plt.figure(figsize=(16, 6))
plt.imshow(log_probs, aspect='auto')
plt.colorbar(label='Log Probability')
plt.xticks(ticks=np.arange(len(X_train.columns)), labels=X_train.columns, rotation=90)
plt.yticks(ticks=np.arange(len(nb.classes_)), labels=nb.classes_)
plt.savefig('plots/naive_bayes.png', bbox_inches='tight')
plt.show()
In [72]:
scaler_nb = MinMaxScaler()
X_train_nb = scaler_nb.fit_transform(X_train)
X_test_nb = scaler_nb.transform(X_test)
X_nb = scaler_nb.transform(X)
In [73]:
nb = MultinomialNB()
nb.fit(X_train_nb, y_train)
y_pred = nb.predict(X_test_nb)
print("\nLogistic Regression Results:")
print("Custom metric:", round(weighted_metric(y_test, y_pred), 3))
print("Accuracy:", round(accuracy_score(y_test, y_pred), 3))
print("Recall:", round(recall_score(y_test, y_pred), 3))
print("\nConfussion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
Logistic Regression Results:
Custom metric: 0.588
Accuracy: 0.812
Recall: 0.532
Confussion Matrix:
[[1472 229]
[ 152 173]]
Classification Report:
precision recall f1-score support
0 0.91 0.87 0.89 1701
1 0.43 0.53 0.48 325
accuracy 0.81 2026
macro avg 0.67 0.70 0.68 2026
weighted avg 0.83 0.81 0.82 2026