from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, PowerTransformer
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score, make_scorer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

# Columns removed right after loading: the client identifier and the two
# columns whose names start with "Naive_Bayes_Classifier_".
columns_to_drop = [
    'CLIENTNUM',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
]

# Read the CSV and discard those columns in one step.
df = pd.read_csv("credit_card_churn.csv").drop(columns=columns_to_drop)

# Total count of missing cells across the whole frame.
null_total = df.isna().sum().sum()
print(f'Number of null values in DataFrame: {null_total}')

Number of null values in DataFrame: 0

# Encode the target as integers: 0 = existing customer, 1 = attrited customer.
# The explicit cast pins the dtype to int64 instead of depending on `replace`
# implicitly down-casting the object column, which pandas has deprecated.
# Re-running this cell is harmless (the replacement becomes a no-op); a label
# other than the two listed here makes the cast raise instead of passing
# through silently.
df['Attrition_Flag'] = df['Attrition_Flag'].replace({
    'Existing Customer': 0,
    'Attrited Customer': 1
}).astype('int64')

# Notebook display: first rows of the frame.
df.head()

# Notebook display: number of distinct values per column.
df.nunique()

Attrition_Flag                 2
Customer_Age                  45
Gender                         2
Dependent_count                6
Education_Level                7
Marital_Status                 4
Income_Category                6
Card_Category                  4
Months_on_book                44
Total_Relationship_Count       6
Months_Inactive_12_mon         7
Contacts_Count_12_mon          7
Credit_Limit                6205
Total_Revolving_Bal         1974
Avg_Open_To_Buy             6813
Total_Amt_Chng_Q4_Q1        1158
Total_Trans_Amt             5033
Total_Trans_Ct               126
Total_Ct_Chng_Q4_Q1          830
Avg_Utilization_Ratio        964
dtype: int64

# Notebook display: summary statistics (count, mean, std, quartiles, min/max).
df.describe()

# Print each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10127 entries, 0 to 10126
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Attrition_Flag            10127 non-null  int64  
 1   Customer_Age              10127 non-null  int64  
 2   Gender                    10127 non-null  object 
 3   Dependent_count           10127 non-null  int64  
 4   Education_Level           10127 non-null  object 
 5   Marital_Status            10127 non-null  object 
 6   Income_Category           10127 non-null  object 
 7   Card_Category             10127 non-null  object 
 8   Months_on_book            10127 non-null  int64  
 9   Total_Relationship_Count  10127 non-null  int64  
 10  Months_Inactive_12_mon    10127 non-null  int64  
 11  Contacts_Count_12_mon     10127 non-null  int64  
 12  Credit_Limit              10127 non-null  float64
 13  Total_Revolving_Bal       10127 non-null  int64  
 14  Avg_Open_To_Buy           10127 non-null  float64
 15  Total_Amt_Chng_Q4_Q1      10127 non-null  float64
 16  Total_Trans_Amt           10127 non-null  int64  
 17  Total_Trans_Ct            10127 non-null  int64  
 18  Total_Ct_Chng_Q4_Q1       10127 non-null  float64
 19  Avg_Utilization_Ratio     10127 non-null  float64
dtypes: float64(5), int64(10), object(5)
memory usage: 1.5+ MB

# Correlation heatmap over the non-categorical columns: the five object-typed
# columns are dropped before computing pairwise correlations.
df_numbers = df.drop(columns=['Gender','Education_Level','Marital_Status','Income_Category','Card_Category'])
correlation_matrix = df_numbers.corr()

plt.figure(figsize=(11, 9.5))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)

plt.title('Correlation Matrix Heatmap')

# savefig does not create missing directories, so make sure the output folder
# exists before the first figure is written to it.
Path('plots').mkdir(parents=True, exist_ok=True)
plt.savefig('plots/correlation_heatmap.png', bbox_inches='tight')
plt.show()

# Stacked histogram of customer age, one layer per Attrition_Flag value.
flags = df['Attrition_Flag'].unique()
age_by_flag = [df.loc[df['Attrition_Flag'] == flag, 'Customer_Age'] for flag in flags]
plt.hist(age_by_flag, bins=16, stacked=True, label=flags)

plt.legend()
plt.show()

# Horizontal bar chart: number of rows per Dependent_count value.
dependent_counts = df['Dependent_count'].value_counts()
plt.barh(dependent_counts.index, dependent_counts.values)

<BarContainer object of 6 artists>

# Row-normalised crosstab: percentage of each Attrition_Flag value within every
# Dependent_count value, drawn as stacked bars with percentage labels.
table = pd.crosstab(df['Dependent_count'], df['Attrition_Flag'])
row_totals = table.sum(axis=1)
table_pct = table.div(row_totals, axis=0) * 100

ax = table_pct.plot.bar(stacked=True, figsize=(10, 6))
for bars in ax.containers:
    ax.bar_label(bars, fmt='%.0f%%')

plt.ylabel("Percentage")
plt.xticks(rotation=0)
plt.show()

# Horizontal bar chart: number of rows per Gender value.
gender_counts = df['Gender'].value_counts()
plt.barh(gender_counts.index, gender_counts.values)

<BarContainer object of 2 artists>

# Percentage of each Attrition_Flag value within every Gender value, drawn as
# stacked bars with percentage labels.
table = pd.crosstab(df['Gender'], df['Attrition_Flag'])
row_totals = table.sum(axis=1)
table_pct = table.div(row_totals, axis=0) * 100

ax = table_pct.plot.bar(stacked=True, figsize=(10, 6))
for bars in ax.containers:
    ax.bar_label(bars, fmt='%.0f%%')

plt.ylabel("Percentage")
plt.xticks(rotation=0)
plt.show()

# Horizontal bar chart: number of rows per Education_Level value.
education_counts = df['Education_Level'].value_counts()
plt.barh(education_counts.index, education_counts.values)

<BarContainer object of 7 artists>

# Display order for the education levels on the x axis.
order = ['Unknown','Uneducated','High School','College','Graduate','Post-Graduate','Doctorate']

# Percentage of each Attrition_Flag value within every Education_Level value,
# with rows arranged in the order above.
table = pd.crosstab(df['Education_Level'], df['Attrition_Flag']).reindex(order)
row_totals = table.sum(axis=1)
table_pct = table.div(row_totals, axis=0) * 100

ax = table_pct.plot.bar(stacked=True, figsize=(10, 6))
for bars in ax.containers:
    ax.bar_label(bars, fmt='%.0f%%')

plt.ylabel("Percentage")
plt.xticks(rotation=0)
plt.show()

# Horizontal bar chart: number of rows per Marital_Status value.
marital_counts = df['Marital_Status'].value_counts()
plt.barh(marital_counts.index, marital_counts.values)

<BarContainer object of 4 artists>

# Percentage of each Attrition_Flag value within every Marital_Status value,
# drawn as stacked bars with percentage labels.
table = pd.crosstab(df['Marital_Status'], df['Attrition_Flag'])
row_totals = table.sum(axis=1)
table_pct = table.div(row_totals, axis=0) * 100

ax = table_pct.plot.bar(stacked=True, figsize=(10, 6))
for bars in ax.containers:
    ax.bar_label(bars, fmt='%.0f%%')

plt.ylabel("Percentage")
plt.xticks(rotation=0)
plt.show()

# Horizontal bar chart: number of rows per Income_Category value.
income_counts = df['Income_Category'].value_counts()
plt.barh(income_counts.index, income_counts.values)

<BarContainer object of 6 artists>

# Percentage of each Attrition_Flag value within every Income_Category value.
table = pd.crosstab(df['Income_Category'], df['Attrition_Flag'])

# Display order for the x axis. 'Unknown' is listed explicitly (first, as in
# the Education_Level chart): reindex keeps only the labels named here, so
# leaving it out would silently drop that category from the chart.
order = ['Unknown','Less than $40K','$40K - $60K','$60K - $80K','$80K - $120K','$120K +']
table = table.reindex(order)

# Normalise each row to percentages.
table_pct = table.div(table.sum(axis=1), axis=0) * 100

ax = table_pct.plot(kind='bar', stacked=True, figsize=(10,6))

# Annotate every bar segment with its rounded percentage.
for c in ax.containers:
    ax.bar_label(c, fmt='%.0f%%')

plt.ylabel("Percentage")
plt.xticks(rotation=0)
plt.show()

# Horizontal bar chart: number of rows per Card_Category value.
plt.barh(df['Card_Category'].value_counts().index, df['Card_Category'].value_counts().values)

<BarContainer object of 4 artists>

# Display order for the card tiers on the x axis.
order = ['Blue','Silver','Gold','Platinum']

# Percentage of each Attrition_Flag value within every Card_Category value,
# with rows arranged in the order above.
table = pd.crosstab(df['Card_Category'], df['Attrition_Flag']).reindex(order)
row_totals = table.sum(axis=1)
table_pct = table.div(row_totals, axis=0) * 100

ax = table_pct.plot.bar(stacked=True, figsize=(10, 6))
for bars in ax.containers:
    ax.bar_label(bars, fmt='%.0f%%')

plt.ylabel("Percentage")
plt.xticks(rotation=0)
plt.show()

# Stacked histogram of Months_on_book, one layer per Attrition_Flag value.
flags = df['Attrition_Flag'].unique()
months_by_flag = [df.loc[df['Attrition_Flag'] == flag, 'Months_on_book'] for flag in flags]
plt.hist(months_by_flag, bins=16, stacked=True, label=flags)

plt.legend()
plt.show()

# Horizontal bar chart: number of rows per Total_Relationship_Count value.
relationship_counts = df['Total_Relationship_Count'].value_counts()
plt.barh(relationship_counts.index, relationship_counts.values)

<BarContainer object of 6 artists>

# Percentage of each Attrition_Flag value within every Total_Relationship_Count
# value, drawn as stacked bars with percentage labels.
table = pd.crosstab(df['Total_Relationship_Count'], df['Attrition_Flag'])
row_totals = table.sum(axis=1)
table_pct = table.div(row_totals, axis=0) * 100

ax = table_pct.plot.bar(stacked=True, figsize=(10, 6))
for bars in ax.containers:
    ax.bar_label(bars, fmt='%.0f%%')

plt.ylabel("Percentage")
plt.xticks(rotation=0)
plt.show()

# Horizontal bar chart: number of rows per Months_Inactive_12_mon value.
inactive_counts = df['Months_Inactive_12_mon'].value_counts()
plt.barh(inactive_counts.index, inactive_counts.values)

<BarContainer object of 7 artists>

# Percentage of each Attrition_Flag value within every Months_Inactive_12_mon
# value, drawn as stacked bars with percentage labels.
table = pd.crosstab(df['Months_Inactive_12_mon'], df['Attrition_Flag'])
row_totals = table.sum(axis=1)
table_pct = table.div(row_totals, axis=0) * 100

ax = table_pct.plot.bar(stacked=True, figsize=(10, 6))
for bars in ax.containers:
    ax.bar_label(bars, fmt='%.0f%%')

plt.ylabel("Percentage")
plt.xticks(rotation=0)
plt.show()

# Horizontal bar chart: number of rows per Contacts_Count_12_mon value, saved
# to disk.
contact_counts = df['Contacts_Count_12_mon'].value_counts()
plt.barh(contact_counts.index, contact_counts.values)
plt.savefig('plots/bar_graph_contact_count_12_mon.png', bbox_inches='tight')

# Percentage of each Attrition_Flag value within every Contacts_Count_12_mon
# value; this chart is titled and saved as well as shown.
table = pd.crosstab(df['Contacts_Count_12_mon'], df['Attrition_Flag'])
row_totals = table.sum(axis=1)
table_pct = table.div(row_totals, axis=0) * 100

ax = table_pct.plot.bar(stacked=True, figsize=(10, 6))
for bars in ax.containers:
    ax.bar_label(bars, fmt='%.0f%%')

plt.title('% of attrition for every value of Contacts count 12 months')
plt.ylabel("Percentage")
plt.xticks(rotation=0)
plt.savefig('plots/percentual_bars_contacts_count_12_mon.png', bbox_inches='tight')
plt.show()

# Stacked histograms for the remaining continuous features, one layer per
# Attrition_Flag value. Each entry is (column, bins, title, output file);
# title and output file are None for charts that are only displayed.
hist_specs = [
    ('Credit_Limit', 16, None, None),
    ('Total_Revolving_Bal', 16, None, None),
    ('Avg_Open_To_Buy', 16, None, None),
    ('Total_Amt_Chng_Q4_Q1', 16, None, None),
    ('Total_Trans_Amt', 20, None, None),
    ('Total_Ct_Chng_Q4_Q1', 16, None, None),
    (
        'Total_Trans_Ct',
        16,
        'Distribution of Attrition for Total Transaction Count values',
        'plots/histogram_total_trans_ct.png',
    ),
    ('Avg_Utilization_Ratio', 16, None, None),
]

for column, n_bins, title, out_file in hist_specs:
    plt.hist(
        [df[df['Attrition_Flag'] == flag][column] for flag in df['Attrition_Flag'].unique()],
        bins=n_bins,
        stacked=True,
        label=df['Attrition_Flag'].unique()
    )
    plt.legend()
    if title is not None:
        plt.title(title)
    if out_file is not None:
        plt.savefig(out_file, bbox_inches='tight')
    plt.show()

# Separate the target column from the predictors.
y = df['Attrition_Flag']
X = df.drop(columns=['Attrition_Flag'])

# One-hot encode the categorical predictors, dropping the first level of each.
X = pd.get_dummies(X, drop_first=True)

# 80/20 split, stratified on the target, with a fixed seed.
split = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)
X_train, X_test, y_train, y_test = split

# Class counts in the training split.
print(y_train.value_counts())

Attrition_Flag
0    6799
1    1302
Name: count, dtype: int64

# Resample the training split with SMOTE (seeded); the test split is untouched.
# NOTE(review): the resampled data is what the grid searches cross-validate on,
# so validation folds contain synthetic rows and CV scores are likely
# optimistic relative to the test split. Confirm whether that is intended.
oversampler = SMOTE(random_state=1)
X_train, y_train = oversampler.fit_resample(X_train, y_train)

# Class counts after resampling.
print(y_train.value_counts())

Attrition_Flag
0    6799
1    6799
Name: count, dtype: int64

alt_weight = 3 # This will be the weight we give in our second model to the error we want to minimize

def weighted_metric(y_true, y_pred, recall_weight=0.8, accuracy_weight=0.2):
    """Return a weighted sum of recall and accuracy.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        recall_weight: Multiplier applied to the recall score.
        accuracy_weight: Multiplier applied to the accuracy score.

    Returns:
        ``recall_weight * recall + accuracy_weight * accuracy``; with the
        defaults this is ``0.8 * recall + 0.2 * accuracy``.
    """
    r = recall_score(y_true, y_pred)
    a = accuracy_score(y_true, y_pred)
    return recall_weight * r + accuracy_weight * a

# Scorer wrapper so the metric can be passed to GridSearchCV's `scoring`.
custom_scorer = make_scorer(weighted_metric)

# Seeded, shuffled 5-fold stratified splitter.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Power-transform the features: fitted on the training data, then applied to
# the test data with the same fitted parameters.
scaler = PowerTransformer()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Single-step pipeline; the step name "clf" is the prefix used by the grid keys.
pipeline = Pipeline(steps=[
    ("clf", LogisticRegression(max_iter=1000, solver='saga', random_state=42)),
])

# Every sub-grid uses the saga solver; the L2 grid tries one extra, larger C.
saga = {"clf__solver": ["saga"]}
c_values = [0.01, 0.1, 1, 10, 100]

param_grid = [
    {**saga, "clf__penalty": ["l2"], "clf__C": c_values + [1000]},
    {**saga, "clf__penalty": ["elasticnet"], "clf__C": c_values, "clf__l1_ratio": [0.2, 0.5, 0.8]},
    {**saga, "clf__penalty": ["l1"], "clf__C": c_values},
]

# Exhaustive search scored with the custom recall/accuracy metric.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=cv,
    scoring=custom_scorer,
    n_jobs=-1,
)
grid.fit(X_train_scaled, y_train)

print(grid.best_params_)
print(round(grid.best_score_, 3))

{'clf__C': 10, 'clf__penalty': 'l2', 'clf__solver': 'saga'}
0.917

# Refit the configuration selected by the grid search. max_iter and
# random_state mirror the estimator that was tuned (max_iter=1000,
# random_state=42), so the final model is the one the search evaluated rather
# than a default-iteration, unseeded variant.
log_reg = LogisticRegression(C=10, penalty='l2', solver='saga', max_iter=1000, random_state=42)
log_reg.fit(X_train_scaled, y_train)

# Evaluate on the held-out test split.
y_pred = log_reg.predict(X_test_scaled)

print("\nLogistic Regression Results:")
print("Custom metric:", round(weighted_metric(y_test, y_pred), 3))
print("Accuracy:", round(accuracy_score(y_test, y_pred), 3))
print("Recall:", round(recall_score(y_test, y_pred), 3))

print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Logistic Regression Results:
Custom metric: 0.725
Accuracy: 0.894
Recall: 0.683

Confussion Matrix:
 [[1590  111]
 [ 103  222]]

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.93      0.94      1701
           1       0.67      0.68      0.67       325

    accuracy                           0.89      2026
   macro avg       0.80      0.81      0.81      2026
weighted avg       0.90      0.89      0.89      2026

# Pair each feature name with its fitted coefficient, largest first.
coef = pd.DataFrame({'Variable': X.columns, 'Coefficient': log_reg.coef_[0]})
coef = coef.sort_values(by='Coefficient', ascending=False)

print("\nVariables by weight:")
print(coef)

Variables by weight:
                          Variable  Coefficient
10                 Total_Trans_Amt     2.421791
27  Income_Category_Less than $40K     1.387969
22           Marital_Status_Single     1.257281
16        Education_Level_Graduate     1.143552
17     Education_Level_High School     1.025308
21          Marital_Status_Married     0.998827
20         Education_Level_Unknown     0.988722
24     Income_Category_$40K - $60K     0.905398
19      Education_Level_Uneducated     0.866818
28         Income_Category_Unknown     0.864538
26    Income_Category_$80K - $120K     0.732665
15       Education_Level_Doctorate     0.716404
18   Education_Level_Post-Graduate     0.632730
23          Marital_Status_Unknown     0.626041
25     Income_Category_$60K - $80K     0.572619
6                     Credit_Limit     0.552765
5            Contacts_Count_12_mon     0.437338
4           Months_Inactive_12_mon     0.401792
14                        Gender_M     0.227993
31            Card_Category_Silver     0.137788
29              Card_Category_Gold     0.099461
1                  Dependent_count     0.053719
30          Card_Category_Platinum     0.034419
2                   Months_on_book    -0.041379
0                     Customer_Age    -0.070956
9             Total_Amt_Chng_Q4_Q1    -0.296884
13           Avg_Utilization_Ratio    -0.466619
8                  Avg_Open_To_Buy    -0.674935
12             Total_Ct_Chng_Q4_Q1    -0.716764
3         Total_Relationship_Count    -0.756911
7              Total_Revolving_Bal    -0.771137
11                  Total_Trans_Ct    -3.603026

# Predicted probability of class 1 for every test row.
y_prob = log_reg.predict_proba(X_test_scaled)[:, 1]

# Sweep 300 evenly spaced thresholds in [0, 1]; for each one, binarise the
# probabilities and record the false-negative and true-negative rates.
thresholds = np.linspace(0, 1, 300)
fnr_list = []
tnr_list = []

for t in thresholds:
    y_pred = (y_prob >= t).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    fnr_list.append(fn / (fn + tp))
    tnr_list.append(tn / (tn + fp))

# Plot TNR against FNR together with the diagonal reference line.
plt.figure(figsize=(8, 5))
plt.plot(fnr_list, tnr_list, color='darkorange', lw=2, label='FNR–TNR Curve')
plt.plot([0, 1], [0, 1], linestyle='--', color='navy', lw=2, label='Baseline')

plt.xlabel('False Negative Rate')
plt.ylabel('True Negative Rate')
plt.title('FNR vs TNR Curve')
plt.legend(loc="lower right")
plt.savefig('plots/FNR_TNR_curve_log_reg_1.png', bbox_inches='tight')
plt.show()

# Same configuration as the first logistic regression, but errors on class 1
# are weighted alt_weight times more than errors on class 0. max_iter and
# random_state mirror the estimator used during the grid search so the fit is
# given the same iteration budget and is reproducible.
log_reg2 = LogisticRegression(
    class_weight={0:1, 1:alt_weight},
    C=10,
    penalty='l2',
    solver='saga',
    max_iter=1000,
    random_state=42,
)
log_reg2.fit(X_train_scaled, y_train)

# Evaluate on the held-out test split.
y_pred = log_reg2.predict(X_test_scaled)

print("\nLogistic Regression Results:")
print("Custom metric:", round(weighted_metric(y_test, y_pred), 3))
print("Accuracy:", round(accuracy_score(y_test, y_pred), 3))
print("Recall:", round(recall_score(y_test, y_pred), 3))

print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Logistic Regression Results:
Custom metric: 0.855
Accuracy: 0.852
Recall: 0.855

Confussion Matrix:
 [[1449  252]
 [  47  278]]

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.85      0.91      1701
           1       0.52      0.86      0.65       325

    accuracy                           0.85      2026
   macro avg       0.75      0.85      0.78      2026
weighted avg       0.90      0.85      0.87      2026

# Pair each feature name with the weighted model's coefficient, largest first.
coef = pd.DataFrame({'Variable': X.columns, 'Coefficient': log_reg2.coef_[0]})
coef = coef.sort_values(by='Coefficient', ascending=False)

print("\nVariables by weight:")
print(coef)

Variables by weight:
                          Variable  Coefficient
10                 Total_Trans_Amt     2.562894
27  Income_Category_Less than $40K     1.260395
22           Marital_Status_Single     1.037837
16        Education_Level_Graduate     0.976881
17     Education_Level_High School     0.882386
20         Education_Level_Unknown     0.852187
24     Income_Category_$40K - $60K     0.803180
21          Marital_Status_Married     0.792000
19      Education_Level_Uneducated     0.756512
28         Income_Category_Unknown     0.736755
15       Education_Level_Doctorate     0.632740
26    Income_Category_$80K - $120K     0.632589
6                     Credit_Limit     0.587660
18   Education_Level_Post-Graduate     0.568536
23          Marital_Status_Unknown     0.495418
25     Income_Category_$60K - $80K     0.487043
5            Contacts_Count_12_mon     0.461674
4           Months_Inactive_12_mon     0.406590
14                        Gender_M     0.204293
31            Card_Category_Silver     0.117727
29              Card_Category_Gold     0.095425
1                  Dependent_count     0.023210
30          Card_Category_Platinum     0.014729
2                   Months_on_book    -0.038182
0                     Customer_Age    -0.113261
9             Total_Amt_Chng_Q4_Q1    -0.368663
13           Avg_Utilization_Ratio    -0.434431
8                  Avg_Open_To_Buy    -0.639398
3         Total_Relationship_Count    -0.701649
12             Total_Ct_Chng_Q4_Q1    -0.706199
7              Total_Revolving_Bal    -0.781122
11                  Total_Trans_Ct    -3.866591

# Predicted probability of class 1 for every test row (weighted model).
y_prob = log_reg2.predict_proba(X_test_scaled)[:, 1]

# Sweep 300 evenly spaced thresholds in [0, 1]; for each one, binarise the
# probabilities and record the false-negative and true-negative rates.
thresholds = np.linspace(0, 1, 300)
fnr_list = []
tnr_list = []

for t in thresholds:
    y_pred = (y_prob >= t).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    fnr_list.append(fn / (fn + tp))
    tnr_list.append(tn / (tn + fp))

# Plot TNR against FNR together with the diagonal reference line.
plt.figure(figsize=(8, 5))
plt.plot(fnr_list, tnr_list, color='darkorange', lw=2, label='FNR–TNR Curve')
plt.plot([0, 1], [0, 1], linestyle='--', color='navy', lw=2, label='Baseline')

plt.xlabel('False Negative Rate')
plt.ylabel('True Negative Rate')
plt.title('FNR vs TNR Curve')
plt.legend(loc="lower right")
plt.savefig('plots/FNR_TNR_curve_log_reg_2.png', bbox_inches='tight')
plt.show()

# Base estimator for the search. Seeded so repeated runs select the same
# hyper-parameters, in line with the seeded split, SMOTE and CV splitter.
dt_model = tree.DecisionTreeClassifier(random_state=1)

# Candidate hyper-parameters for the decision tree.
dt_grid = {
    "criterion": ["gini"],
    "max_depth": [5, 10, 20, 50],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 5]
}

# Exhaustive search scored with the custom recall/accuracy metric.
dt_search = GridSearchCV(
    estimator=dt_model,
    param_grid=dt_grid,
    cv=cv,
    scoring=custom_scorer,
    n_jobs=-1
)

# The tree is fitted on the unscaled training features.
dt_search.fit(X_train, y_train)

print("Best Decision Tree Params:", dt_search.best_params_)
print("Best CV Score:", round(dt_search.best_score_,3))

Best Decision Tree Params: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best CV Score: 0.957

# Decision tree with the hyper-parameters reported by the grid search; seeded
# so the fitted tree and the metrics below are reproducible.
dt = tree.DecisionTreeClassifier(
    criterion='gini',
    max_depth=10,
    min_samples_leaf=1,
    min_samples_split=2,
    random_state=1,
)
dt.fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred = dt.predict(X_test)

# Header names the model actually being evaluated.
print("\nDecision Tree Results:")
print("Custom metric:", round(weighted_metric(y_test, y_pred), 3))
print("Accuracy:", round(accuracy_score(y_test, y_pred), 3))
print("Recall:", round(recall_score(y_test, y_pred), 3))

print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Logistic Regression Results:
Custom metric: 0.9
Accuracy: 0.931
Recall: 0.892

Confussion Matrix:
 [[1596  105]
 [  35  290]]

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.94      0.96      1701
           1       0.73      0.89      0.81       325

    accuracy                           0.93      2026
   macro avg       0.86      0.92      0.88      2026
weighted avg       0.94      0.93      0.93      2026

# Draw the fitted tree, truncated at max_depth=2, and save the figure.
feature_names = list(X_train.columns)
plt.figure(figsize=(18, 12))
tree.plot_tree(
    dt,
    max_depth=2,
    feature_names=feature_names,
    filled=True,
    fontsize=14,
    impurity=False,
)
plt.savefig('plots/decision_tree_1.png', bbox_inches='tight')
plt.show()

# Ten features with the highest importance, in descending order.
importances = dt.feature_importances_
descending = np.argsort(importances)[::-1]
indices = descending[:10]

top_features = X_train.columns[indices]
top_importances = importances[indices]

# Horizontal bars with the most important feature at the top.
plt.figure(figsize=(18, 14))
plt.barh(top_features, top_importances)
plt.gca().invert_yaxis()
plt.yticks(fontsize=16)
plt.savefig('plots/decision_tree_top10_features.png', bbox_inches='tight')
plt.show()

# Same tree configuration, but class 1 is weighted alt_weight times more than
# class 0; seeded so the fitted tree and the metrics below are reproducible.
dt2 = tree.DecisionTreeClassifier(
    criterion='gini',
    class_weight={0:1, 1:alt_weight},
    max_depth=10,
    min_samples_leaf=1,
    min_samples_split=2,
    random_state=1,
)
dt2.fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred = dt2.predict(X_test)

# Header names the model actually being evaluated.
print("\nWeighted Decision Tree Results:")
print("Custom metric:", round(weighted_metric(y_test, y_pred), 3))
print("Accuracy:", round(accuracy_score(y_test, y_pred), 3))
print("Recall:", round(recall_score(y_test, y_pred), 3))

print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Logistic Regression Results:
Custom metric: 0.933
Accuracy: 0.923
Recall: 0.935

Confussion Matrix:
 [[1565  136]
 [  21  304]]

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.92      0.95      1701
           1       0.69      0.94      0.79       325

    accuracy                           0.92      2026
   macro avg       0.84      0.93      0.87      2026
weighted avg       0.94      0.92      0.93      2026

# Draw the weighted tree, truncated at max_depth=2, and save the figure.
feature_names = list(X_train.columns)
plt.figure(figsize=(18, 12))
tree.plot_tree(
    dt2,
    max_depth=2,
    feature_names=feature_names,
    filled=True,
    fontsize=14,
    impurity=True,
)
plt.savefig('plots/decision_tree_2.png', bbox_inches='tight')
plt.show()

# Ten features with the highest importance, in descending order.
importances = dt2.feature_importances_
descending = np.argsort(importances)[::-1]
indices = descending[:10]

top_features = X_train.columns[indices]
top_importances = importances[indices]

# Horizontal bars with the most important feature at the top.
plt.figure(figsize=(18, 14))
plt.barh(top_features, top_importances)
plt.gca().invert_yaxis()
plt.yticks(fontsize=16)
plt.savefig('plots/decision_tree_2_top10_features.png', bbox_inches='tight')
plt.show()

# Base estimator for the search. Seeded so repeated runs select the same
# hyper-parameters, in line with the seeded split, SMOTE and CV splitter.
rf_model = RandomForestClassifier(random_state=1)

# Candidate hyper-parameters for the random forest.
rf_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [5, 10, 15],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "bootstrap": [True, False]
}

# Exhaustive search scored with the custom recall/accuracy metric.
rf_search = GridSearchCV(
    estimator=rf_model,
    param_grid=rf_grid,
    cv=cv,
    scoring=custom_scorer,
    n_jobs=-1
)

# The forest is fitted on the unscaled training features.
rf_search.fit(X_train, y_train)

print("Best Random Forest Params:", rf_search.best_params_)
print("Best CV Score:", rf_search.best_score_)

Best Random Forest Params: {'bootstrap': False, 'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV Score: 0.987498301297369

# Random forest with the hyper-parameters reported by the grid search; seeded
# so the fitted ensemble and the metrics below are reproducible.
forest = RandomForestClassifier(
    bootstrap=False,
    n_estimators=200,
    criterion='gini',
    max_depth=15,
    min_samples_leaf=1,
    min_samples_split=2,
    random_state=1,
)
forest.fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred = forest.predict(X_test)

# Header names the model actually being evaluated.
print("\nRandom Forest Results:")
print("Custom metric:", round(weighted_metric(y_test, y_pred), 3))
print("Accuracy:", round(accuracy_score(y_test, y_pred), 3))
print("Recall:", round(recall_score(y_test, y_pred), 3))

print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Logistic Regression Results:
Custom metric: 0.919
Accuracy: 0.964
Recall: 0.908

Confussion Matrix:
 [[1658   43]
 [  30  295]]

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.97      0.98      1701
           1       0.87      0.91      0.89       325

    accuracy                           0.96      2026
   macro avg       0.93      0.94      0.93      2026
weighted avg       0.96      0.96      0.96      2026

# Draw the first tree of the ensemble, truncated at max_depth=2.
estimator = forest.estimators_[0]
feature_names = list(X_train.columns)

plt.figure(figsize=(20, 12))
tree.plot_tree(
    estimator,
    filled=True,
    feature_names=feature_names,
    max_depth=2,
    fontsize=14,
    impurity=True,
)
plt.savefig('plots/random_forest_1.png', bbox_inches='tight')
plt.show()

# Ten features with the highest importance, in descending order.
importances = forest.feature_importances_
descending = np.argsort(importances)[::-1]
indices = descending[:10]

top_features = X_train.columns[indices]
top_importances = importances[indices]

# Horizontal bars with the most important feature at the top.
plt.figure(figsize=(18, 14))
plt.barh(top_features, top_importances)
plt.gca().invert_yaxis()
plt.yticks(fontsize=16)
plt.savefig('plots/random_forest_top10_features.png', bbox_inches='tight')
plt.show()

# Same forest configuration, but class 1 is weighted alt_weight times more than
# class 0; seeded so the fitted ensemble and the metrics below are reproducible.
forest2 = RandomForestClassifier(
    bootstrap=False,
    n_estimators=200,
    criterion='gini',
    class_weight={0:1, 1:alt_weight},
    max_depth=15,
    min_samples_leaf=1,
    min_samples_split=2,
    random_state=1,
)
forest2.fit(X_train, y_train)

# Predict with the weighted forest itself; predicting with `forest` here would
# only reproduce the unweighted model's metrics.
y_pred = forest2.predict(X_test)

# Header names the model actually being evaluated.
print("\nWeighted Random Forest Results:")
print("Custom metric:", round(weighted_metric(y_test, y_pred), 3))
print("Accuracy:", round(accuracy_score(y_test, y_pred), 3))
print("Recall:", round(recall_score(y_test, y_pred), 3))

print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Logistic Regression Results:
Custom metric: 0.919
Accuracy: 0.964
Recall: 0.908

Confussion Matrix:
 [[1658   43]
 [  30  295]]

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.97      0.98      1701
           1       0.87      0.91      0.89       325

    accuracy                           0.96      2026
   macro avg       0.93      0.94      0.93      2026
weighted avg       0.96      0.96      0.96      2026

# Draw the first tree of the weighted ensemble, truncated at max_depth=2.
estimator = forest2.estimators_[0]
feature_names = list(X_train.columns)

plt.figure(figsize=(20, 12))
tree.plot_tree(
    estimator,
    filled=True,
    feature_names=feature_names,
    max_depth=2,
    fontsize=14,
    impurity=True,
)
plt.savefig('plots/random_forest_2.png', bbox_inches='tight')
plt.show()

# Ten features with the highest importance, in descending order.
importances = forest2.feature_importances_
descending = np.argsort(importances)[::-1]
indices = descending[:10]

top_features = X_train.columns[indices]
top_importances = importances[indices]

# Horizontal bars with the most important feature at the top.
plt.figure(figsize=(18, 14))
plt.barh(top_features, top_importances)
plt.gca().invert_yaxis()
plt.yticks(fontsize=16)
plt.savefig('plots/random_forest_2_top10_features.png', bbox_inches='tight')
plt.show()

# Base Bernoulli Naive Bayes estimator for the search.
bnb_model = BernoulliNB()

# Candidate hyper-parameters: six log-spaced smoothing values from 1e-3 to
# 1e2, several binarisation thresholds (None disables binarisation), and
# whether class priors are learned.
bnb_grid = {
    "alpha": np.logspace(-3, 2, 6),
    "binarize": [None, 0.0, 0.2, 0.5, 1.0],
    "fit_prior": [True, False],
}

# Exhaustive search scored with the custom recall/accuracy metric.
bnb_search = GridSearchCV(bnb_model, bnb_grid, cv=cv, scoring=custom_scorer, n_jobs=-1)

# Fitted on the power-transformed training features.
bnb_search.fit(X_train_scaled, y_train)

print("Best Bernoulli NB Params:", bnb_search.best_params_)
print("Best CV Score:", bnb_search.best_score_)

Best Bernoulli NB Params: {'alpha': 0.001, 'binarize': 0.5, 'fit_prior': True}
Best CV Score: 0.8448306026617898

# Bernoulli Naive Bayes with the hyper-parameters reported by the grid search,
# fitted on the power-transformed training features.
nb = BernoulliNB(alpha=0.001, binarize=0.5, fit_prior=True)
nb.fit(X_train_scaled, y_train)

# Evaluate on the held-out test split.
y_pred = nb.predict(X_test_scaled)

# Header names the model actually being evaluated.
print("\nBernoulli Naive Bayes Results:")
print("Custom metric:", round(weighted_metric(y_test, y_pred), 3))
print("Accuracy:", round(accuracy_score(y_test, y_pred), 3))
print("Recall:", round(recall_score(y_test, y_pred), 3))

print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Logistic Regression Results:
Custom metric: 0.717
Accuracy: 0.805
Recall: 0.695

Confussion Matrix:
 [[1405  296]
 [  99  226]]

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.83      0.88      1701
           1       0.43      0.70      0.53       325

    accuracy                           0.81      2026
   macro avg       0.68      0.76      0.71      2026
weighted avg       0.85      0.81      0.82      2026

# Heatmap of the fitted per-class feature log-probabilities: one row per class,
# one column per feature.
log_probs = nb.feature_log_prob_
column_labels = X_train.columns
class_labels = nb.classes_

plt.figure(figsize=(16, 6))
plt.imshow(log_probs, aspect='auto')
plt.colorbar(label='Log Probability')
plt.xticks(ticks=np.arange(len(column_labels)), labels=column_labels, rotation=90)
plt.yticks(ticks=np.arange(len(class_labels)), labels=class_labels)
plt.savefig('plots/naive_bayes.png', bbox_inches='tight')
plt.show()

# Min-max scale the features for the multinomial model: fitted on the training
# data, then applied to the test data and to the full feature matrix.
scaler_nb = MinMaxScaler()
X_train_nb = scaler_nb.fit_transform(X_train)
X_test_nb = scaler_nb.transform(X_test)
X_nb = scaler_nb.transform(X)

# Multinomial Naive Bayes with default settings. This rebinds `nb`, replacing
# the Bernoulli model fitted earlier.
nb = MultinomialNB()
nb.fit(X_train_nb, y_train)

# Evaluate on the held-out test split.
y_pred = nb.predict(X_test_nb)

# Header names the model actually being evaluated.
print("\nMultinomial Naive Bayes Results:")
print("Custom metric:", round(weighted_metric(y_test, y_pred), 3))
print("Accuracy:", round(accuracy_score(y_test, y_pred), 3))
print("Recall:", round(recall_score(y_test, y_pred), 3))

print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Logistic Regression Results:
Custom metric: 0.588
Accuracy: 0.812
Recall: 0.532

Confussion Matrix:
 [[1472  229]
 [ 152  173]]

Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.87      0.89      1701
           1       0.43      0.53      0.48       325

    accuracy                           0.81      2026
   macro avg       0.67      0.70      0.68      2026
weighted avg       0.83      0.81      0.82      2026

	Attrition_Flag	Customer_Age	Dependent_count	Months_on_book	Total_Relationship_Count	Months_Inactive_12_mon	Contacts_Count_12_mon	Credit_Limit	Total_Revolving_Bal	Avg_Open_To_Buy	Total_Amt_Chng_Q4_Q1	Total_Trans_Amt	Total_Trans_Ct	Total_Ct_Chng_Q4_Q1	Avg_Utilization_Ratio
count	10127.000000	10127.000000	10127.000000	10127.000000	10127.000000	10127.000000	10127.000000	10127.000000	10127.000000	10127.000000	10127.000000	10127.000000	10127.000000	10127.000000	10127.000000
mean	0.160660	46.325960	2.346203	35.928409	3.812580	2.341167	2.455317	8631.953698	1162.814061	7469.139637	0.759941	4404.086304	64.858695	0.712222	0.274894
std	0.367235	8.016814	1.298908	7.986416	1.554408	1.010622	1.106225	9088.776650	814.987335	9090.685324	0.219207	3397.129254	23.472570	0.238086	0.275691
min	0.000000	26.000000	0.000000	13.000000	1.000000	0.000000	0.000000	1438.300000	0.000000	3.000000	0.000000	510.000000	10.000000	0.000000	0.000000
25%	0.000000	41.000000	1.000000	31.000000	3.000000	2.000000	2.000000	2555.000000	359.000000	1324.500000	0.631000	2155.500000	45.000000	0.582000	0.023000
50%	0.000000	46.000000	2.000000	36.000000	4.000000	2.000000	2.000000	4549.000000	1276.000000	3474.000000	0.736000	3899.000000	67.000000	0.702000	0.176000
75%	0.000000	52.000000	3.000000	40.000000	5.000000	3.000000	3.000000	11067.500000	1784.000000	9859.000000	0.859000	4741.000000	81.000000	0.818000	0.503000
max	1.000000	73.000000	5.000000	56.000000	6.000000	6.000000	6.000000	34516.000000	2517.000000	34516.000000	3.397000	18484.000000	139.000000	3.714000	0.999000

Credit Card Churn Analysis¶

1. Import Libraries¶

2. Data preprocessing¶

2.1. Preparing the data and first steps¶

2.2. Visualization¶

2.3. Data preparation¶

3. Logistic regression models¶

3.1. First model (simple)¶

3.2. Second Model (higher loss for false negatives)¶

4. Decision tree¶

4.1. First decision tree (simple)¶

4.2. Second decision tree (weight class change)¶

5. Random forest classifier¶

5.1. First random forest (simple)¶

5.2. Second random forest (weighted for false negatives)¶

6. Naive Bayes Model¶

6.1. Naive Bayes model¶

	Customer_Age	Gender	Dependent_count	Education_Level	Marital_Status	Income_Category	Card_Category	Months_on_book	Total_Relationship_Count	Months_Inactive_12_mon	Contacts_Count_12_mon	Credit_Limit	Total_Revolving_Bal	Avg_Open_To_Buy	Total_Amt_Chng_Q4_Q1	Total_Trans_Amt	Total_Trans_Ct	Total_Ct_Chng_Q4_Q1	Avg_Utilization_Ratio
0	45	M	3	High School	Married	$60K - $80K	Blue	39	5	1	3	12691.0	777	11914.0	1.335	1144	42	1.625	0.061
1	49	F	5	Graduate	Single	Less than $40K	Blue	44	6	1	2	8256.0	864	7392.0	1.541	1291	33	3.714	0.105
2	51	M	3	Graduate	Married	$80K - $120K	Blue	36	4	1	0	3418.0	0	3418.0	2.594	1887	20	2.333	0.000
3	40	F	4	High School	Unknown	Less than $40K	Blue	34	3	4	1	3313.0	2517	796.0	1.405	1171	20	2.333	0.760
4	40	M	3	Uneducated	Married	$60K - $80K	Blue	21	5	1	0	4716.0	0	4716.0	2.175	816	28	2.500	0.000