#Importing necessary tools
#basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

#viz
from viz_utils_ import *
import plotly.express as px
from plotly.offline import init_notebook_mode
#Pipelines
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer

#data preparation
from sklearn.model_selection import train_test_split

#Preprocessing
from sklearn.preprocessing import PolynomialFeatures

#categorical encoding
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

#Scaling
from sklearn.preprocessing import RobustScaler

from sklearn.preprocessing import PowerTransformer
# modeling
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

#validation
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from yellowbrick.classifier import DiscriminationThreshold

#Optimisation
from sklearn.model_selection import GridSearchCV
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

#Learning curve
from sklearn.model_selection import learning_curve

# models interpretation
from sklearn.inspection import permutation_importance 
import shap  # package used to calculate Shap values

import joblib
from sklearn import set_config
set_config(display='diagram')

import warnings
warnings.filterwarnings("ignore")
"setup complete"

'setup complete'


#General functions
# Plot pandas dataframe
def plot_df(data, ax=None, col_width=3.0, row_height=0.625, font_size=14,
            header_color='#40466e', row_colors=('#f1f1f2', 'w'), edge_color='w',
            bbox=(0, 0, 1, 1), header_columns=0):
    """Draw a pandas DataFrame as a styled matplotlib table.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose values and column labels are rendered.
    ax : matplotlib axes, optional
        Axes to draw on. When None, a new figure is created whose size is
        derived from the frame shape, ``col_width`` and ``row_height``.
    col_width, row_height : float
        Per-column / per-row size used only when the figure is created here.
    font_size : float
        Font size passed to the table.
    header_color : color
        Face colour of the header row and of the first ``header_columns`` columns.
    row_colors : sequence of colors
        Face colours cycled over the body rows (indexed by row number).
    edge_color : color
        Edge colour of every cell.
    bbox : sequence of 4 floats
        Bounding box forwarded to ``ax.table``.
    header_columns : int
        Number of leading columns styled like the header.

    Returns
    -------
    ax
        The axes holding the table (useful when it was created here).
    """
    if ax is None:
        # One extra row accounts for the header line.
        size = (np.array(data.shape[::-1]) + np.array([0, 1])) * np.array([col_width, row_height])
        _, ax = plt.subplots(figsize=size)
    ax.axis('off')
    mpl_table = ax.table(cellText=data.values, bbox=list(bbox), colLabels=data.columns)
    mpl_table.set_fontsize(font_size)

    # get_celld() is the public accessor for the {(row, col): cell} mapping.
    for (row, col), cell in mpl_table.get_celld().items():
        cell.set_edgecolor(edge_color)
        if row == 0 or col < header_columns:
            cell.set_text_props(weight='bold', color='w')
            cell.set_facecolor(header_color)
        else:
            cell.set_facecolor(row_colors[row % len(row_colors)])
    return ax
        
#getting features names from columntransformer
def get_feature_names(column_transformer, as_list=True):
    """Get feature names from all transformers.

    Walks the transformers of a fitted ColumnTransformer (or the steps of a
    Pipeline) and collects the names each one reports. Internally every name
    is prefixed with ``<transformer name>__``.

    NOTE(review): this relies on scikit-learn private members (``_iter``,
    ``_df_columns``, ``_n_features``) and on the transformer method
    ``get_feature_names``, so it is tied to the scikit-learn version the
    file was written against.

    Parameters
    ----------
    column_transformer : ColumnTransformer or Pipeline
        Fitted object to inspect.
    as_list : bool, default=True
        If True, return a flat list of names with the prefix stripped.
        If False, return a dict mapping each prefix to its names; names
        without a prefix are stored under the key ``'passthrough'``.

    Returns
    -------
    feature_names : list of str, or dict of str -> list of str
        Names of the features produced by transform.
    """

    def get_names(name, trans, column):
        """Return the prefixed names produced by one (name, trans, column) entry."""
        if trans == 'drop' or (
                hasattr(column, '__len__') and not len(column)):
            return []
        if trans == 'passthrough':
            if hasattr(column_transformer, '_df_columns'):
                if ((not isinstance(column, slice))
                        and all(isinstance(col, str) for col in column)):
                    return column
                else:
                    return column_transformer._df_columns[column]
            else:
                indices = np.arange(column_transformer._n_features)
                return ['x%d' % i for i in indices[column]]
        if not hasattr(trans, 'get_feature_names'):
            # Warn instead of failing, then fall back to the input column names.
            warnings.warn("Transformer %s (type %s) does not "
                          "provide get_feature_names. "
                          "Will return input column names if available"
                          % (str(name), type(trans).__name__))
            if column is None:
                return []
            return [name + "__" + f for f in column]

        return [name + "__" + f for f in trans.get_feature_names()]

    feature_names = []

    # Pipeline steps carry no column selection, so pad them to the
    # (name, transformer, column, weight) shape used for ColumnTransformers.
    if isinstance(column_transformer, Pipeline):
        l_transformers = [(name, trans, None, None)
                          for step, name, trans in column_transformer._iter()]
    else:
        l_transformers = list(column_transformer._iter(fitted=True))

    for name, trans, column, _ in l_transformers:
        if isinstance(trans, Pipeline):
            # Recurse into nested pipelines.
            _names = get_feature_names(trans)
            # No step reported names: fall back to the input columns when
            # they are known (column is None for steps of a plain Pipeline).
            if len(_names) == 0 and column is not None:
                _names = [name + "__" + f for f in column]
            feature_names.extend(_names)
        else:
            feature_names.extend(get_names(name, trans, column))

    if as_list:
        return [s.split('__')[-1] for s in feature_names]

    feature_dict = {}
    for elem in (s.split('__') for s in feature_names):
        key = '__'.join(elem[:-1]) if len(elem) > 1 else 'passthrough'
        feature_dict.setdefault(key, []).append(elem[-1])
    return feature_dict


# Load the raw data and keep a separate copy for the visual exploration
bank_data = pd.read_csv("bank marketing dataset.csv")
df_viz = bank_data.copy()


# Summary statistics of the numeric columns, rendered as a table
num_desc = df_viz.describe().round(2).rename_axis('').reset_index()
fig, ax = plt.subplots(1, 1, figsize=(13, 6))
plot_df(num_desc, ax, font_size=20)


# Density of every numeric feature, split by the target, on a 3x3 grid
num_features = ['age', 'balance', 'duration', 'campaign', 'pdays', 'previous', 'day']
fig, axs = plt.subplots(3, 3, figsize=(20, 15))
fig.suptitle('Distribution of numerical features', fontsize=20)
axs = axs.flatten()
# Seven features are drawn, so the last two panels are switched off
axs[-1].axis('off')
axs[-2].axis('off')
for col, panel in zip(num_features, axs):
    sns.kdeplot(data=df_viz, x=col, hue="deposit", palette=['#4682B4', 'salmon'], ax=panel)


# Object-typed columns are treated as categorical features
cat_features = df_viz.columns[df_viz.dtypes == 'object'].tolist()
catplot_analysis(
    df_viz[cat_features],
    hue='deposit',
    figsize=(16, 14),
    palette=['#4682B4', '#E9967A'],
)


# count_target=df_viz['deposit'].value_counts().to_frame()
# count_target["percentage"]=count_target["deposit"]/count_target["deposit"].sum(axis=0)
# count_target
# count_target.style.background_gradient()


# Target class balance, shown as a donut chart
labels = ["Did not Open term deposit account", "Opened Term deposit account"]
color_list = ['cadetblue', '#E9967A']
title = 'Term subscriptions'
text = f'Total\n{len(df_viz)}'

fig, ax = plt.subplots(figsize=(8, 8))
donut_plot(
    df_viz,
    col='deposit',
    ax=ax,
    colors=color_list,
    label_names=labels,
    title=title,
    text=text,
)


# Any missing values? One row with the null count of each column
pd.DataFrame(df_viz.isnull().sum(axis=0), columns=['Missing']).T


# Pairwise relationships of the numeric features, coloured by the target.
# pairplot creates its own figure, so no plt.figure() call precedes it
# (one would only leave an empty figure behind).
sns.pairplot(df_viz, hue="deposit", plot_kws=dict(alpha=0.5), corner=True,
             palette=['rosybrown', 'steelblue'])

# Output: <seaborn.axisgrid.PairGrid at 0x1fd00242b50>

# Output: <Figure size 1440x1440 with 0 Axes>


# f, ax = plt.subplots(1,1, figsize=(16,8))

# colors = ['salmon', 'darkslateblue']
# labels ="Did not Open Term deposit account", "Opened Term deposit account"

# plt.suptitle('Information on Term subscriptions', fontsize=20)

# df_viz["deposit"].value_counts().plot.pie(explode=[0,0.25], autopct='%1.2f%%', shadow=True,colors=colors, 
#                                              labels=labels, fontsize=12, startangle=25)

# ax.set_ylabel('% of subscriptions', fontsize=14)


# Cumulative percentage of subscriptions reached as call duration increases
cum_df = bank_data[['duration', 'deposit']].sort_values('duration')
cum_df['deposit'] = cum_df['deposit'].replace({'yes': 1, 'no': 0})
cum_df['cum_deposit'] = 100 * cum_df["deposit"].cumsum() / cum_df['deposit'].sum()


fig, ax = plt.subplots(1, 2, figsize=(15, 8))
# Boxen plot of the call duration per target class (right panel)
sns.boxenplot(x="deposit", y="duration", data=df_viz, palette=["#e09e8f", "steelblue"],
              linewidth=1, ax=ax[1])

ax[1].set_title('Duration boxenplots')

# Cumulative curve (left panel)
sns.lineplot(data=cum_df, x="duration", y="cum_deposit", color='darkblue', ax=ax[0])

# Durations (column 0) at which the curve reaches 50% and 90%. Despite the
# "y_" prefix these are x coordinates. Both lookups use purely positional
# .iloc[row, col] indexing rather than a positional [] lookup on a
# label-indexed row.
y_50 = cum_df[cum_df.cum_deposit <= 50].iloc[-1, 0]
y_90 = cum_df[cum_df.cum_deposit >= 90].iloc[0, 0]


# Dashed guides from the axes to the two reference points
ax[0].hlines([50], 0, y_50, linestyles='dashed', colors='firebrick')
ax[0].vlines([y_50], 0, 50, linestyles='dashed', colors='firebrick')
ax[0].hlines([90], 0, y_90, linestyles='dashed', colors='firebrick')
ax[0].vlines([y_90], 0, 90, linestyles='dashed', colors='firebrick')

ax[0].set_xlim([0, 4000])
ax[0].set_ylim([0, 100])
style = dict(size=12, color='grey')

ax[0].text(y_50, 50, f"  ({int(y_50)},50%)", **style)
ax[0].text(y_90, 90, f"  ({int(y_90)},90%)", **style)

ax[0].set_ylabel('percentage(%)')
ax[0].set_title('Cumulative percentage of term subscriptions')


# Call duration against the target, one panel per duration range:
# up to 120, between 120 and 1000, and above 1000
fig, axs = plt.subplots(1, 3, figsize=(15, 6))
duration_slices = [
    df_viz[df_viz.duration <= 120],
    df_viz[(120 < df_viz.duration) & (df_viz.duration <= 1000)],
    df_viz[df_viz.duration > 1000],
]
for panel, subset in zip(axs, duration_slices):
    sns.stripplot(
        y="deposit",
        x="duration",
        data=subset,
        palette=["#e09e8f", "steelblue"],
        linewidth=1,
        ax=panel,
    )

# Output: <AxesSubplot:xlabel='duration', ylabel='deposit'>


# Bucket the ages into four labelled groups and store them on the frame
age_cat = pd.cut(
    df_viz.age,
    bins=[17, 25, 40, 60, 95],
    labels=["18-25", "26-40", "41-60", "+60"],
)
df_viz['age_cat'] = age_cat

# Left: duration per age group and target; right: duration against raw age
fig, axs = plt.subplots(1, 2, figsize=(18, 8))

g1 = sns.boxplot(
    data=df_viz,
    x="age_cat",
    y="duration",
    hue="deposit",
    palette=["#e09e8f", "steelblue"],
    linewidth=1,
    ax=axs[0],
)
g2 = sns.scatterplot(
    data=df_viz,
    x="age",
    y="duration",
    hue="deposit",
    palette=["#e09e8f", "steelblue"],
    size=4,
    alpha=0.4,
    ax=axs[1],
)
# Vertical marker at age 60
g2.axvline(60, color='black', linestyle='--')

# Output: <matplotlib.lines.Line2D at 0x1fd6156f7f0>


# Bucket the balance: below/equal 0, up to the median, up to 2000, above
balance_bins = [
    df_viz.balance.min() - 1,
    0,
    df_viz.balance.median(),
    2000,
    df_viz.balance.max(),
]
balance_cat = pd.cut(
    df_viz.balance,
    bins=balance_bins,
    labels=["negative_balance", "low-balance", "middle_balance", "high-balance"],
)
df_viz['balance_cat'] = balance_cat

# Median balance for each value of the target
deposit_groups = df_viz.groupby(['deposit'], as_index=False)['balance'].median()

fig, axs = plt.subplots(1, 2, figsize=(16, 8))
g1 = sns.countplot(
    data=df_viz,
    x="balance_cat",
    hue="deposit",
    palette=["#6f9fb3", "#e09e8f"],
    alpha=0.8,
    ax=axs[0],
)
g2 = sns.barplot(
    data=deposit_groups,
    x="deposit",
    y="balance",
    palette=["#ADD8E6", "#6f9fb3"],
    alpha=0.8,
    ax=axs[1],
)
g1.set_title('Subscriptions by balance groups', fontsize=16)
g2.set_title('Median balance by target feature', fontsize=16)
# Write each bar's height just above it
for p in g1.patches:
    bar_height = p.get_height()
    g1.annotate(str(bar_height), (p.get_x() + 0.15, bar_height * 1.02))


# Median balance of each age group
age_groups = df_viz.groupby(['age_cat'], as_index=False)['balance'].median()

# Three side-by-side panels laid out on a 1x3 grid spec
fig = plt.figure(figsize=(24, 8))
ax = fig.add_gridspec(1, 3)
ax1, ax2, ax3 = (fig.add_subplot(ax[0, k]) for k in range(3))

g1 = sns.barplot(
    data=age_groups,
    x="age_cat",
    y="balance",
    palette="RdBu",
    ax=ax2,
)
g2 = sns.countplot(
    data=df_viz,
    x="age_cat",
    hue="deposit",
    palette=["#6f9fb3", "#e09e8f"],
    alpha=0.8,
    ax=ax1,
)
# Only clients with a balance below 10000 are drawn in the scatter
g3 = sns.scatterplot(
    data=df_viz[df_viz.balance < 10000],
    x="age",
    y="balance",
    hue="deposit",
    palette=['indianred', '#4c86ad'],
    alpha=0.4,
    ax=ax3,
)
g3.axhline(0, color='black', linestyle='--')

g1.set_title('Median Balance by age group', fontsize=16)
g2.set_title('Count of term subscriptions by age group', fontsize=16)
g3.set_title('Clients balance by age', fontsize=16)

# Output: Text(0.5, 1.0, 'Clients balance by age')


# --- Education: subscription counts and median balance ("unknown" excluded
# from the median only) ---
education_groups = (
    df_viz[df_viz.education != "unknown"]
    .groupby(['education'], as_index=False)['balance']
    .median()
)
fig, axs = plt.subplots(1, 2, figsize=(16, 8))
g1 = sns.countplot(
    data=df_viz,
    x="education",
    hue="deposit",
    palette=["#6f9fb3", "#e09e8f"],
    alpha=0.8,
    ax=axs[0],
)
g2 = sns.barplot(
    data=education_groups,
    x="education",
    y="balance",
    palette="RdBu",
    ax=axs[1],
)

# Write each bar's height just above it
for p in g1.patches:
    bar_height = p.get_height()
    g1.annotate(str(bar_height), (p.get_x() + 0.15, bar_height * 1.02))


# --- Job: subscription counts and median balance ---
job_groups = df_viz.groupby(['job'], as_index=False)['balance'].median()
fig, axs = plt.subplots(2, 1, figsize=(15, 10))
g1 = sns.countplot(
    data=df_viz,
    x="job",
    hue="deposit",
    palette=["#6f9fb3", "#e09e8f"],
    alpha=0.8,
    ax=axs[0],
)
g2 = sns.barplot(
    data=job_groups,
    x="job",
    y="balance",
    palette="RdBu",
    ax=axs[1],
)

for p in g1.patches:
    bar_height = p.get_height()
    g1.annotate(str(bar_height), (p.get_x() + 0.15, bar_height * 1.02))


# Bucket the number of contacts: up to the mean, up to 10, above 10
campaign_cat = pd.cut(
    df_viz.campaign,
    bins=[-1, df_viz.campaign.mean(), 10, df_viz.campaign.max()],
    labels=["below_mean", "above_mean", "more than 10 contacts"],
)
df_viz['campaign_cat'] = campaign_cat

# Flag whether `previous` is zero ("No contact") or positive ("Contact")
previous_contact = pd.cut(
    df_viz.previous,
    bins=[-1, 0, df_viz.previous.max()],
    labels=["No contact", "Contact"],
)
df_viz['previous_contact'] = previous_contact


fig, axs = plt.subplots(2, 2, figsize=(18, 15))
axs = axs.flatten()
g1 = sns.countplot(
    data=df_viz,
    x="campaign_cat",
    hue='deposit',
    palette=["#6f9fb3", "#e09e8f"],
    ax=axs[0],
)

g2 = sns.scatterplot(
    data=df_viz,
    x="campaign",
    y="duration",
    hue="deposit",
    palette=["#6f9fb3", "#F5DEB3"],
    alpha=0.8,
    ax=axs[1],
)
g3 = sns.countplot(
    data=df_viz,
    x="poutcome",
    hue='deposit',
    palette=["#F5DEB3", "#6f9fb3"],
    ax=axs[2],
)

g4 = sns.countplot(
    data=df_viz,
    x="previous_contact",
    hue='deposit',
    palette=["#FAEBD7", "#e09e8f"],
    ax=axs[3],
)
# Annotate the bars of the three count plots with their heights
for g in [g1, g3, g4]:
    for p in g.patches:
        bar_height = p.get_height()
        g.annotate(str(bar_height), (p.get_x() + 0.15, bar_height * 1.02))


df = bank_data.copy()
# Encode the target variable as 1 (yes) / 0 (no)
df['deposit'] = df['deposit'].replace({'yes': 1, 'no': 0})
# Encode the binary explanatory features the same way
for var in ['default', 'housing', 'loan']:
    df[var] = df[var].replace({'yes': 1, 'no': 0})
# Drop the call duration and report the shape on both sides of the drop
print(f"Data shape before the drop : {df.shape}")
df.drop('duration', axis=1, inplace=True)
print(f"Data shape after the drop : {df.shape}")

# Output: Data shape before the drop : (11162, 17)
# Output: Data shape before the drop : (11162, 16)


# Train / test separation (stratified on the target)
X = df.drop('deposit', axis=1)
y = df['deposit'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
print(f"Shape of training set : {X_train.shape}")
print(f"Shape of testing set : {X_test.shape}")

# Separation of categorical and numeric features.
# The lists are built with ordered comprehensions rather than set
# differences so that the column order (and therefore the layout of the
# transformed matrices) is identical from one run to the next.
binary_features = ["default", 'housing', 'loan']
numeric_features = [col for col in X_train
                    if X_train[col].dtype != 'object' and col not in binary_features]
categorical_features = [col for col in X_train if X_train[col].dtype == 'object']

ordinal_categ = ['education', 'month']
nominal_categ = [col for col in categorical_features if col not in ordinal_categ]

# Nominal features are split on a cardinality threshold of 10 modalities
high_card_categ = [col for col in nominal_categ if len(X_train[col].unique()) >= 10]
low_card_categ = [col for col in nominal_categ if len(X_train[col].unique()) < 10]

print(f"Shape of numeric training data : {X_train[numeric_features].shape}")
print(f"Shape of categorical training data : {X_train[categorical_features].shape}")
print(f"Shape of ordinal categorical training data : {X_train[ordinal_categ].shape}")
print(f"Shape of high cardinality nominal categorical  training data : {X_train[high_card_categ].shape}")
print(f"Shape of low cardinality nominal categorical training data : {X_train[low_card_categ].shape}")

# Ordering of the modalities of the ordinal features
# education
educ = ['unknown', 'primary', 'secondary', 'tertiary']
# month
month = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']

# Output: Shape of training set : (8929, 15)
# Output: Shape of testing set : (2233, 15)
# Output: Shape of numeric training data : (8929, 6)
# Output: Shape of categorical training data : (8929, 6)
# Output: Shape of ordinal categorical training data : (8929, 2)
# Output: Shape of high cardinality nominal categorical  training data : (8929, 1)
# Output: Shape of low cardinality nominal categorical training data : (8929, 3)


# Elementary preprocessing steps, each wrapped in a one-step Pipeline so the
# same definition can be plugged into several ColumnTransformers below.
scaler_pipeline = Pipeline([
    ('scaler', RobustScaler())
])

normalizer_pipeline = Pipeline([
    ('normalizer', PowerTransformer())
])

OHEncoder_pipeline = Pipeline([
    ('OH_encoder', OneHotEncoder(sparse=False, drop='first'))
])
OrdinalEncoder_pipeline = Pipeline([
    ('ordinal_encoder', OrdinalEncoder())
])


# NOTE: `categories` is passed by keyword everywhere below; OrdinalEncoder
# declares its parameters keyword-only, so the positional form is rejected
# by recent scikit-learn releases.

# 1) scaling numeric features and one-hot encoding all categorical features
prep_pipeline1 = ColumnTransformer([
    ('numeric', scaler_pipeline, numeric_features),
    ('categorical', OHEncoder_pipeline, categorical_features)],
    remainder="passthrough"
)

# 2) scaling numeric features and ordinal encoding all categorical features
#    (explicit category order for education and month)
prep_pipeline2 = ColumnTransformer([
    ('numeric', scaler_pipeline, numeric_features),
    ('nominal_categ', OrdinalEncoder_pipeline, nominal_categ),
    ('ordinal_categ', OrdinalEncoder(categories=[educ, month]), ordinal_categ)],
    remainder="passthrough"
)

# 3) normalizing numeric features and one-hot encoding all categorical features
prep_pipeline3 = ColumnTransformer([
    ('numeric', normalizer_pipeline, numeric_features),
    ('categorical', OHEncoder_pipeline, categorical_features)],
    remainder="passthrough"
)

# 4) normalizing numeric features and ordinal encoding all categorical features
prep_pipeline4 = ColumnTransformer([
    ('numeric', normalizer_pipeline, numeric_features),
    ('nominal_categ', OrdinalEncoder_pipeline, nominal_categ),
    ('ordinal_categ', OrdinalEncoder(categories=[educ, month]), ordinal_categ)],
    remainder="passthrough"
)

# 5) scaling numeric features, ordinal encoding only the ordinal features and
#    one-hot encoding every nominal feature (low and high cardinality alike)
prep_pipeline5 = ColumnTransformer([
    ('numeric', scaler_pipeline, numeric_features),
    ('ordinal_categ', make_pipeline(OrdinalEncoder(categories=[educ, month])), ordinal_categ),
    ('nominal_categ', OHEncoder_pipeline, nominal_categ)],
    remainder="passthrough"
)

# 6) same as prep_pipeline5 with normalizing instead of scaling
prep_pipeline6 = ColumnTransformer([
    ('numeric', normalizer_pipeline, numeric_features),
    ('ordinal_categ', make_pipeline(OrdinalEncoder(categories=[educ, month])), ordinal_categ),
    ('nominal_categ', OHEncoder_pipeline, nominal_categ)],
    remainder="passthrough"
)

# Candidate preprocessors, keyed "prep_pipeline1" ... "prep_pipeline6"
preprocessors = {
    f"prep_pipeline{rank}": candidate
    for rank, candidate in enumerate(
        [prep_pipeline1, prep_pipeline2, prep_pipeline3,
         prep_pipeline4, prep_pipeline5, prep_pipeline6],
        start=1,
    )
}

# Candidate models, seeded with random_state=42 where the estimator takes one
Classifiers = dict(
    KNearest_Neighbors=KNeighborsClassifier(),
    Logistic_Regression=LogisticRegression(random_state=42),
    SVM=SVC(random_state=42),
    XGBoost=XGBClassifier(verbosity=0, random_state=42),
    Random_Forest=RandomForestClassifier(random_state=42),
)


# apply different preprocessing pipelines on a single model 
def test_pipelines(model, pipelines, X_train, y_train):
    #test multiple pipelines
    perf={}
    for name,pipeline in pipelines.items():
        print(name)
        estimator=make_pipeline(pipeline,model)
        score=cross_val_score(estimator, X_train, y_train, cv=5, scoring='accuracy')
        perf[name]=score.mean()
    return perf

# apply different preprocessing pipelines on different models
def test_pipelines_multiple_models(models,pipelines, X_train, y_train):
    #for each model, each pipeline in the pipelines dictionary is tested 
    models_perf=pd.DataFrame(columns=pipelines.keys())
    for name,model in models.items():
        print(name)
        models_perf=models_perf.append(test_pipelines(model, pipelines, X_train, y_train),ignore_index=True)
    models_perf.index= list(models.keys())
    return models_perf
    
# select the best preprocessor for each model
def select_best_preprocessor(models, preprocessors, perf_df):
    """Pick, for each model, the preprocessor with the highest score.

    ``perf_df`` is read row by row (one row per model name, one column per
    preprocessor name). Returns a dict mapping model name to
    ``{"preprocessor_name": ..., "preprocessor": ...}``.
    """

    def _winner(model_name):
        top_name = perf_df.loc[model_name].idxmax()
        return {"preprocessor_name": top_name, "preprocessor": preprocessors[top_name]}

    return {model_name: _winner(model_name) for model_name in models}


# Reuse the benchmark stored on disk when present, otherwise compute it
joblib_Filename = "Models_performance.joblib"
try:
    models_perf = joblib.load(joblib_Filename)
except FileNotFoundError:
    models_perf = test_pipelines_multiple_models(Classifiers, preprocessors, X_train, y_train)
    joblib.dump(models_perf, joblib_Filename)
    print("models_perf created and exported")
else:
    print("models_perf imported")

# Plot the performance dataframe
models_perf_plot = models_perf.round(4).rename_axis('Classifier').reset_index()
fig, ax = plt.subplots(1, 1, figsize=(16, 4))
plot_df(models_perf_plot, ax, edge_color='black', font_size=13.5)

# Select the best preprocessor for each classifier
best_preprocessors = select_best_preprocessor(Classifiers, preprocessors, models_perf)

# Output: models_perf imported


def test_fe(models,best_preprocessors):
    models_perf=pd.DataFrame(columns=["With FE","Without FE"])
    for model_name, model in models.items():
        # selector = SelectFromModel(RandomForestClassifier(random_state=42))
        pipe_with_FE=make_pipeline(make_pipeline(best_preprocessors[model_name]['preprocessor'], PolynomialFeatures(2,interaction_only=False)), model)
        pipe_without_FE=make_pipeline(best_preprocessors[model_name]['preprocessor'], model)

        score_with_FE=cross_val_score(pipe_with_FE, X_train, y_train, cv=5).mean()
        score_without_FE=cross_val_score(pipe_without_FE, X_train, y_train, cv=5).mean()
        models_perf=models_perf.append({"With FE": score_with_FE, "Without FE": score_without_FE},ignore_index=True)

    models_perf.index= list(models.keys())
    return models_perf
    
# Score every classifier with / without feature engineering and format the
# result for display (rounded, classifier name as a regular column)
fe_res = (
    test_fe(Classifiers, best_preprocessors)
    .round(4)
    .rename_axis('Classifier')
    .reset_index()
)

# Plot the performance dataframe
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
plot_df(fe_res, ax, edge_color='black', font_size=13.5)


def make_pipelines(Classifiers, best_preprocessors):
    """Assemble one full pipeline per classifier.

    Each classifier is placed behind its selected preprocessor; the
    'Logistic_Regression' entry additionally gets a degree-2
    ``PolynomialFeatures`` step between the two.

    Returns a dict keyed ``"<classifier name>_pipeline"``.
    """
    Classifiers_pipelines = {}
    for classifier_name, estimator in Classifiers.items():
        steps = [best_preprocessors[classifier_name]['preprocessor']]
        if classifier_name == 'Logistic_Regression':
            steps.append(PolynomialFeatures(2, interaction_only=False))
        steps.append(estimator)
        Classifiers_pipelines[classifier_name + '_pipeline'] = make_pipeline(*steps)
    return Classifiers_pipelines

def models_performance(Classifiers_pipelines, X_train, y_train):
    """Return the mean 5-fold cross-validation score of every pipeline.

    The result is a DataFrame with the columns 'Pipeline' (the dict key)
    and 'Accuracy' (mean of ``cross_val_score`` with its default scoring).
    """
    # Records are collected first and the frame is built once;
    # DataFrame.append no longer exists in pandas 2.x.
    records = [
        {'Pipeline': name, 'Accuracy': cross_val_score(pipe, X_train, y_train, cv=5).mean()}
        for name, pipe in Classifiers_pipelines.items()
    ]
    return pd.DataFrame(records, columns=['Pipeline', 'Accuracy'])


# Cross validation scores.
# These definitions must run: `performances` is plotted right below and
# `Classifiers_pipelines` is needed by the hyperparameter searches.
Classifiers_pipelines = make_pipelines(Classifiers, best_preprocessors)
performances = models_performance(Classifiers_pipelines, X_train, y_train)
performances['Accuracy'] = performances['Accuracy'].round(4)

# Plot scores: table on the left, bar chart on the right
fig, axs = plt.subplots(1, 2, figsize=(18, 4.5), gridspec_kw={'width_ratios': [3, 4]}, constrained_layout=True)
plot_df(performances, ax=axs[0], edge_color='black')

g = sns.barplot(y="Pipeline", x="Accuracy",
                palette="RdBu",
                data=performances,
                ax=axs[1])
g.set_title("Cross validation scores before hyperparameters optimisation", fontsize=16)
g.set_xlabel("Accuracy", fontsize=12)
g.set(ylabel=None)
# Call show(); a bare `plt.show` only references the function
plt.show()

# Output: <function matplotlib.pyplot.show(close=None, block=None)>


def on_step(optim_result):
    """Progress callback: bump the module-level ``Iter`` counter and print it.

    ``optim_result`` is accepted to match the callback signature and is not
    used. If ``Iter`` has not been initialised yet, counting starts at 1
    instead of failing with a NameError in the middle of a search.
    """
    global Iter
    try:
        Iter += 1
    except NameError:
        Iter = 1
    print(f"Iteration {Iter}  done.")


def model_tuning(model_name, model_pipeline, search_type, space):
    """Tune a pipeline's hyperparameters, caching the fitted search on disk.

    The result is looked up in ``"<model_name>_result.joblib"`` first. When
    that file exists it is loaded and returned as is (``model_pipeline``,
    ``search_type`` and ``space`` are then not used). Otherwise a search is
    fitted on the module-level ``X_train`` / ``y_train`` and dumped to that
    file.

    Parameters
    ----------
    model_name : str
        Prefix of the cache file name.
    model_pipeline : estimator
        Pipeline to tune.
    search_type : {'BayesSearchCV', 'GridSearchCV'}
        Search strategy. The Bayes search runs 100 iterations and reports
        progress through ``on_step`` (set ``Iter = 0`` before calling).
    space : dict
        Search space (Bayes) or parameter grid (grid search).

    Returns
    -------
    The fitted (or loaded) search object.

    Raises
    ------
    ValueError
        If a search has to be fitted and ``search_type`` is not supported.
    """
    joblib_Filename = model_name + "_result.joblib"
    try:
        # NOTE: joblib files are pickles; only load cache files you created.
        Search_result = joblib.load(joblib_Filename)
        print(f"{joblib_Filename} imported")
    except FileNotFoundError:
        # The search object is only built when it actually has to be fitted.
        if search_type == 'BayesSearchCV':
            search = BayesSearchCV(estimator=model_pipeline,
                                   search_spaces=space,
                                   scoring='accuracy',
                                   n_iter=100,
                                   cv=5,
                                   return_train_score=True,
                                   verbose=3
                                   )
            Search_result = search.fit(X_train, y_train, callback=on_step)
        elif search_type == 'GridSearchCV':
            search = GridSearchCV(estimator=model_pipeline,
                                  param_grid=space,
                                  scoring='accuracy',
                                  cv=5,
                                  return_train_score=True,
                                  verbose=3
                                  )
            Search_result = search.fit(X_train, y_train)
        else:
            raise ValueError(
                f"Unsupported search_type {search_type!r}; "
                "expected 'BayesSearchCV' or 'GridSearchCV'"
            ) from None

        joblib.dump(Search_result, joblib_Filename)
        print(f"{joblib_Filename} created and exported")

    print('Best Score: ', Search_result.best_score_)
    print('Best Params: ', Search_result.best_params_)

    return Search_result

def viz_tuning_result(search_res, params, labels, scale_midPoint=0.67):
    """Visualise the trials of a fitted hyperparameter search.

    Shows a plotly parallel-coordinates chart of the selected parameters
    coloured by mean test score, then two matplotlib panels: the running
    best score ("Convergence plot") and the score of each trial with a
    regression line ("Score by iterations").

    Parameters
    ----------
    search_res : fitted search object
        Must expose ``cv_results_``.
    params : list of str
        ``cv_results_`` column names to display; each is cast to float.
    labels : list of str
        Display names, paired with ``params`` in order.
    scale_midPoint : float
        Midpoint of the continuous colour scale.
    """
    my_palette = ['rgb(0, 147, 146)', 'rgb(114, 170, 161)', 'rgb(177, 199, 179)',
                  'rgb(229, 185, 173)', 'rgb(217, 137, 148)', 'rgb(208, 88, 126)']

    search_res_df = pd.DataFrame(search_res.cv_results_)
    search_res_df['mean_test_score_cummax'] = search_res_df.mean_test_score.cummax()
    for col in params:
        search_res_df[col] = search_res_df[col].astype(float)

    # Parallel coordinates. The axis names come from the `labels` argument
    # (not from a module-level variable that happens to be called `Labels`).
    p1 = px.parallel_coordinates(
        search_res_df[list(params) + ['mean_test_score']],
        color='mean_test_score',
        labels=dict(zip(params, labels)),
        color_continuous_midpoint=scale_midPoint,
        color_continuous_scale=my_palette)
    p1.update_layout(
        width=1000,
        height=350,
        margin=dict(b=0)
    )
    p1.show()

    # Convergence plot and score by iteration
    fig1, axs = plt.subplots(1, 2, figsize=(16, 4))

    p2 = sns.lineplot(x=search_res_df.index, y=search_res_df.mean_test_score_cummax,
                      linestyle='-', marker='o', color='Teal', ax=axs[0])
    p2.set_title('Convergence plot', fontsize=16)

    p3 = sns.regplot(x=search_res_df.index, y=search_res_df.mean_test_score,
                     color='Teal', ax=axs[1])
    p3.set_title('Score by iterations', fontsize=16)


def plot_validation_curve(param_range, mean_train_score, std_train_score, mean_test_score, std_test_score,
                          title="Validation Curve With Logistic regression", xlabel="C "):
    """Plot training and cross-validation scores against a parameter.

    The x axis is logarithmic; each curve is drawn with a shaded band of
    one standard deviation on either side.

    Parameters
    ----------
    param_range : array-like
        Parameter values (x axis).
    mean_train_score, std_train_score : array-like
        Mean and standard deviation of the training scores.
    mean_test_score, std_test_score : array-like
        Mean and standard deviation of the cross-validation scores.
    title, xlabel : str
        Plot title and x-axis label.
    """
    f, ax = plt.subplots(1, 1, figsize=(10, 4))

    # Use the arguments that were passed in rather than a module-level frame
    train_mean = np.asarray(mean_train_score)
    train_std = np.asarray(std_train_score)
    test_mean = np.asarray(mean_test_score)
    test_std = np.asarray(std_test_score)

    lw = 2
    ax.semilogx(param_range, train_mean, label="Training score",
                color="darkorange", lw=lw)
    ax.fill_between(param_range, train_mean - train_std,
                    train_mean + train_std, alpha=0.2,
                    color="darkorange", lw=lw)
    ax.semilogx(param_range, test_mean, label="Cross-validation score",
                color="navy", lw=lw)
    ax.fill_between(param_range, test_mean - test_std,
                    test_mean + test_std, alpha=0.2,
                    color="navy", lw=lw)

    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Accuracy Score")
    plt.tight_layout()
    ax.legend(loc="best")
    plt.show()


# Registry of the tuned estimators, filled in by each optimisation below
Best_estimators = {}

# Logistic Regression optimisation: exhaustive grid over penalty, C and solver
penalty = ['l1', 'l2', 'elasticnet']
C = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']

lr_param_grid = {
    'logisticregression__penalty': penalty,
    'logisticregression__C': C,
    'logisticregression__solver': solver,
}
lr_search_res = model_tuning(
    'Logistic_Regression',
    Classifiers_pipelines['Logistic_Regression_pipeline'],
    'GridSearchCV',
    lr_param_grid,
)
lr_trials = pd.DataFrame(lr_search_res.cv_results_)
Best_estimators['Logistic_Regression'] = lr_search_res.best_estimator_

# Output: Logistic_Regression_result.joblib imported
# Output: Best Score:  0.732108368543386
# Output: Best Params:  {'logisticregression__C': 0.1, 'logisticregression__penalty': 'l1', 'logisticregression__solver': 'liblinear'}


# Parallel categories: solver x penalty, coloured by mean test score

init_notebook_mode(connected=True)
fig = px.parallel_categories(
    lr_trials[['param_logisticregression__solver', 'param_logisticregression__penalty', 'mean_test_score']],
    dimensions=['param_logisticregression__solver', 'param_logisticregression__penalty'],
    color="mean_test_score",
    labels={'param_logisticregression__solver': 'Solver', 'param_logisticregression__penalty': 'Penalty'},
    color_continuous_scale=px.colors.sequential.Inferno)
fig.update_layout(
    width=800,
    height=300,
    margin=dict(
        l=20,
        b=0,
        t=20,
    )
)
fig.show()
init_notebook_mode(connected=False)

# Validation curve over C, restricted to the l1 / liblinear trials
res = lr_trials[(lr_trials.param_logisticregression__penalty == 'l1')
                & (lr_trials.param_logisticregression__solver == 'liblinear')]
# Plain `float` is used as the dtype; the capitalised 'Float64' spelling is
# not a portable NumPy dtype name.
param_range = np.array(res.param_logisticregression__C, dtype=float)
plot_validation_curve(param_range, res.mean_train_score, res.std_train_score, res.mean_test_score, res.std_test_score)


# SVM optimisation: Bayesian search over C and gamma
# (the kernel is left at the SVC default; it is not part of the space)

svm_space = {
    'svc__C': Real(0.0001, 1000, 'log-uniform'),
    'svc__gamma': Real(0.001, 1, 'log-uniform'),
}

# Reset the progress counter used by on_step before a Bayes search,
# as is done before the XGBoost search.
Iter = 0
svm_search_res = model_tuning('SVM', Classifiers_pipelines['SVM_pipeline'], 'BayesSearchCV', svm_space)
Best_estimators['SVM'] = svm_search_res.best_estimator_

# Output: SVM_result.joblib imported
# Output: Best Score:  0.7362519565496972
# Output: Best Params:  OrderedDict([('svc__C', 1.0192997328765452), ('svc__gamma', 0.20529559876362308)])


# Visualise the SVM search: display labels and the matching cv_results_ columns
Labels = ['C', 'gamma']
params = ['param_svc__' + label for label in Labels]

init_notebook_mode(connected=True)
viz_tuning_result(svm_search_res, params, Labels)
init_notebook_mode(connected=False)


#XGBoost optimisation

# Search space for the 'xgbclassifier' step of the XGBoost pipeline.
xgb_space = dict(
    xgbclassifier__max_depth=Integer(5, 30),
    xgbclassifier__learning_rate=Real(0.01, 0.5, 'log-uniform'),
    xgbclassifier__n_estimators=Categorical(range(100, 1000, 50)),
    xgbclassifier__gamma=Real(0.01, 0.5, 'log-uniform'),
    xgbclassifier__min_child_weight=Integer(1, 20),
    xgbclassifier__subsample=Real(0.1, 1.0, 'uniform'),
    xgbclassifier__colsample_bytree=Real(0.1, 1.0, 'uniform'),
)

# NOTE(review): Iter is reset before each Bayesian search; presumably it is a
# counter read elsewhere (e.g. by a search callback) - confirm.
Iter = 0
xgb_Search_result = model_tuning(
    'XGBoost',
    Classifiers_pipelines['XGBoost_pipeline'],
    'BayesSearchCV',
    xgb_space,
)
Best_estimators['XGBoost'] = xgb_Search_result.best_estimator_

XGBoost_result.joblib imported
Best Score:  0.7387154369026446
Best Params:  OrderedDict([('xgbclassifier__colsample_bytree', 0.6470269391763109), ('xgbclassifier__gamma', 0.49999999999999994), ('xgbclassifier__learning_rate', 0.01), ('xgbclassifier__max_depth', 5), ('xgbclassifier__min_child_weight', 1), ('xgbclassifier__n_estimators', 950), ('xgbclassifier__subsample', 0.779827694446314)])


# Visualise the XGBoost search trials; params and Labels are aligned by position.
params = [
    'param_xgbclassifier__colsample_bytree',
    'param_xgbclassifier__gamma',
    'param_xgbclassifier__learning_rate',
    'param_xgbclassifier__max_depth',
    'param_xgbclassifier__min_child_weight',
    'param_xgbclassifier__n_estimators',
]
Labels = [
    'colsample_bytree',
    'gamma',
    'learning_rate',
    'max_depth',
    'min_child_weight',
    'n_estimators',
]
init_notebook_mode(connected=True)
viz_tuning_result(xgb_Search_result, params, Labels, scale_midPoint=0.71)
init_notebook_mode(connected=False)


#Random forest optimisation
# The pipeline step is named 'randomforestclassifier' (lowercase), as shown by
# the best-params report and by the step-name checks in plot_SHAP, so the
# search keys must use that exact prefix; a capitalised prefix is rejected as
# an invalid parameter when the search actually runs.
rf_space = {
    'randomforestclassifier__max_depth': Integer(1, 30),
    'randomforestclassifier__max_features': Real(0.1, 1.0, 'uniform'),
    'randomforestclassifier__n_estimators': Categorical(range(100, 1000, 50)),
    # An integer min_samples_split must be at least 2 in scikit-learn.
    'randomforestclassifier__min_samples_split': Integer(2, 20),
    'randomforestclassifier__min_samples_leaf': Integer(1, 20),
    'randomforestclassifier__bootstrap': Categorical([True, False]),
    'randomforestclassifier__criterion': Categorical(['gini', 'entropy']),
}

# NOTE(review): Iter is reset before each Bayesian search; presumably it is a
# counter read elsewhere (e.g. by a search callback) - confirm.
Iter = 0
rf_Search_result = model_tuning(
    'RandomForest',
    Classifiers_pipelines['Random_Forest_pipeline'],
    'BayesSearchCV',
    rf_space,
)
Best_estimators['Random_Forest'] = rf_Search_result.best_estimator_

RandomForest_result.joblib imported
Best Score:  0.7347960640023088
Best Params:  OrderedDict([('randomforestclassifier__bootstrap', True), ('randomforestclassifier__criterion', 'gini'), ('randomforestclassifier__max_depth', 30), ('randomforestclassifier__max_features', 0.1677246117543552), ('randomforestclassifier__min_samples_leaf', 1), ('randomforestclassifier__min_samples_split', 20), ('randomforestclassifier__n_estimators', 950)])


# Visualise the random-forest search trials. Labels must match params entry
# for entry; the previous list had six labels (with 'min_samples_leaf'
# repeated) for five parameter columns.
params = [
    'param_randomforestclassifier__max_depth',
    'param_randomforestclassifier__max_features',
    'param_randomforestclassifier__min_samples_leaf',
    'param_randomforestclassifier__min_samples_split',
    'param_randomforestclassifier__n_estimators',
]
Labels = [
    'max_depth',
    'max_features',
    'min_samples_leaf',
    'min_samples_split',
    'n_estimators',
]
init_notebook_mode(connected=True)
viz_tuning_result(rf_Search_result, params, Labels, scale_midPoint=0.7)
init_notebook_mode(connected=False)


#KNN optimisation

# Exhaustive grid over the neighbourhood size (1..29) and the `p` parameter
# of KNeighborsClassifier. A leaf_size range is not part of the grid.
p = [1, 2, 3]
n_neighbors = range(1, 30)

knn_space = dict(
    kneighborsclassifier__n_neighbors=n_neighbors,
    kneighborsclassifier__p=p,
)

knn_search_res = model_tuning(
    'kNearastNeighbour',
    Classifiers_pipelines['KNearest_Neighbors_pipeline'],
    'GridSearchCV',
    knn_space,
)
# Full per-trial table, used for the score-vs-k plot.
knn_trials = pd.DataFrame(knn_search_res.cv_results_)
Best_estimators['KNearest_Neighbors'] = knn_search_res.best_estimator_

kNearastNeighbour_result.joblib imported
Best Score:  0.7092607614154284
Best Params:  {'kneighborsclassifier__n_neighbors': 23, 'kneighborsclassifier__p': 1}


# mean_test_score against n_neighbors, taken from the KNN trials table.
plt.figure(figsize=(9, 5))
sns.lineplot(
    data=knn_trials,
    x='param_kneighborsclassifier__n_neighbors',
    y='mean_test_score',
)

<AxesSubplot:xlabel='param_kneighborsclassifier__n_neighbors', ylabel='mean_test_score'>


def plot_learning_curves(model_name, estimator, X, y, ax, train_sizes=np.linspace(.1, 1.0, 5), cv=5):
    """Draw training and validation accuracy learning curves on *ax*.

    Scores come from sklearn's ``learning_curve`` with ``scoring='accuracy'``.
    Each curve shows the mean across CV folds, with a shaded band of one
    standard deviation on either side.

    Parameters
    ----------
    model_name : str
        Prefix used in the axes title.
    estimator : estimator passed through to ``learning_curve``.
    X, y : training data passed through to ``learning_curve``.
    ax : matplotlib axes to draw on.
    train_sizes : training-set sizes forwarded to ``learning_curve``.
    cv : cross-validation setting forwarded to ``learning_curve``.
    """
    sizes, fit_scores, held_out_scores = learning_curve(
        estimator, X, y, train_sizes=train_sizes, cv=cv, scoring='accuracy'
    )

    curves = (
        (fit_scores, 'Training score', "firebrick"),
        (held_out_scores, 'Validation score', "navy"),
    )
    for fold_scores, label, colour in curves:
        # Aggregate across folds (axis 1) for each training-set size.
        centre = fold_scores.mean(axis=1)
        spread = fold_scores.std(axis=1)
        ax.plot(sizes, centre, label=label, linestyle='-', marker='o', color=colour)
        ax.fill_between(sizes, centre - spread, centre + spread, alpha=0.1, color=colour)

    ax.set_ylabel('Accuracy', fontsize=14)
    ax.set_xlabel('Training set size', fontsize=14)
    ax.set_title(model_name + ' learning curves', fontsize=16, y=1.03)
    ax.legend()


# One learning-curve panel per tuned model on a 2x3 grid; the last axis is hidden.
fig, axs = plt.subplots(2, 3, figsize=(24, 14))
axs = axs.flatten()
axs[-1].axis('off')
for i, estimator in enumerate(Best_estimators.items()):
    # estimator is a (name, fitted-or-unfitted model) pair from the dict.
    plot_learning_curves(
        estimator[0],
        estimator[1],
        X_train,
        y_train,
        ax=axs[i],
        train_sizes=np.linspace(.1, 1.0, 5),
        cv=5,
    )


def report_for_different_thresholds(target, pred_prob) :
    """Score the binary predictions obtained at cut-offs 0.0, 0.1, ..., 0.9.

    For each cut-off, probabilities greater than or equal to it are mapped to
    class 1 and the rest to class 0; accuracy, recall, precision and F1 are
    then computed against *target*.

    Parameters
    ----------
    target : array-like of true binary labels.
    pred_prob : array-like of predicted probabilities for class 1.

    Returns
    -------
    pandas.DataFrame
        One row per cut-off with columns
        ``Threshold, Accuracy, Recall, Precision, F1_score``.
    """
    scores = np.asarray(pred_prob)
    rows = []
    for threshold in (step / 10 for step in range(10)):
        # Vectorised replacement for the per-element lambda.
        pred_final = (scores >= threshold).astype(int)
        rows.append({
            'Threshold': threshold,
            'Accuracy': metrics.accuracy_score(target, pred_final),
            'Recall': metrics.recall_score(target, pred_final),
            'Precision': metrics.precision_score(target, pred_final),
            'F1_score': metrics.f1_score(target, pred_final),
        })

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every iteration.
    return pd.DataFrame(
        rows, columns=['Threshold', 'Accuracy', 'Recall', 'Precision', 'F1_score']
    )

def draw_roc( actual, probs , ax):
    """Plot the ROC curve of *probs* against *actual* on *ax*.

    The legend entry reports the AUC rounded to two decimals, and a dashed
    black diagonal from (0, 0) to (1, 1) is drawn for reference.
    """
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(
        actual, probs, drop_intermediate=False
    )
    auc_score = metrics.roc_auc_score(actual, probs)

    ax.plot(false_pos_rate, true_pos_rate,
            label='ROC curve (AUC = %0.2f)' % auc_score)
    ax.plot([0, 1], [0, 1], 'k--')

    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver operating characteristic')
    ax.grid()
    ax.legend(loc="lower right")

def get_threshold(ax):
    """Parse a float out of the fourth legend label of *ax*.

    The first five characters and the last character of that label are
    dropped before conversion.
    NOTE(review): this presumably matches a label such as ``$t_f=0.42$``
    written by the DiscriminationThreshold visualizer - confirm the format.
    """
    _, labels = ax.get_legend_handles_labels()
    threshold_label = labels[3]
    return float(threshold_label[5:-1])

def threshold_report(estimator_name, X_train, y_train, X_test, y_test):
    """Build a three-panel threshold report for one tuned estimator.

    The estimator stored under *estimator_name* in ``Best_estimators`` is
    fitted on the training data. The panels show, from left to right: the
    per-threshold metrics table computed on the test set, the test-set ROC
    curve, and a yellowbrick ``DiscriminationThreshold`` plot fitted on the
    training data.

    Returns
    -------
    tuple
        ``(threshold, fig)`` where *threshold* is parsed from the legend of
        the third panel by ``get_threshold`` and *fig* is the figure.
    """
    fig, axs = plt.subplots(1,3,figsize=(22,6))
    estimator=Best_estimators[estimator_name]
    estimator.fit(X_train, y_train)
    # Only class probabilities are needed; the previous hard-label predict()
    # call produced a value that was never used.
    preds_proba=estimator.predict_proba(X_test)
    positive_proba=preds_proba[:,1]

    # Report for the different thresholds
    cutoff_df=report_for_different_thresholds(y_test, positive_proba)

    # Plots
    plot_df(cutoff_df.round(4), ax=axs[0])
    draw_roc(y_test, positive_proba, ax=axs[1])

    visualizer = DiscriminationThreshold(estimator, n_trials=30, exclude= 'queue_rate', random_state=42, ax=axs[2])
    visualizer.fit(X_train, y_train)        # Fit the data to the visualizer
    visualizer.show()           # Finalize and render the figure

    return get_threshold(axs[2]), fig


def show_figure(fig):
    """Attach *fig* to the canvas manager of a freshly created figure.

    A throwaway figure is created only to borrow its manager; the manager's
    canvas is pointed at *fig*, and *fig* is given that canvas in return.
    """
    manager = plt.figure().canvas.manager
    borrowed_canvas = manager.canvas
    borrowed_canvas.figure = fig
    fig.set_canvas(borrowed_canvas)


# Rebuild the SVM pipeline with the tuned C / gamma values reported by the
# search, this time with probability estimates switched on.
Best_estimators['SVM'] = make_pipeline(
    best_preprocessors['SVM']['preprocessor'],
    SVC(
        C=1.0192997328765452,
        gamma=0.20529559876362308,
        probability=True,  # to enable probability estimates in SVC
        random_state=42,
    ),
)

joblib_Filename = "discimination_threshold_res.joblib"
try:
    # Reuse thresholds and figures saved by a previous run, if any.
    discimination_threshold_res = joblib.load(joblib_Filename)
    optimal_thresholds = discimination_threshold_res['thresholds']
    figures = discimination_threshold_res['figures']
    print("discimination_threshold_res imported")
    for fig in figures.values():
        show_figure(fig)
        fig.show()
except FileNotFoundError:
    # No cache on disk: build one report per tuned estimator, then save them.
    optimal_thresholds = {}
    figures = {}
    for estimator_name in Best_estimators:
        (optimal_thresholds[estimator_name],
         figures[estimator_name]) = threshold_report(
            estimator_name, X_train, y_train, X_test, y_test
        )
    discimination_threshold_res = {
        "thresholds": optimal_thresholds,
        "figures": figures,
    }
    joblib.dump(discimination_threshold_res, joblib_Filename)
    print("discimination_threshold_res created and exported")

discimination_threshold_res imported


def get_pred_from_proba(pred_prob, prob_threshold):
    """Binarise probabilities: 1 where ``pred_prob >= prob_threshold``, else 0."""
    at_or_above_cutoff = pred_prob >= prob_threshold
    return at_or_above_cutoff.astype('int')
    
def plot_confusion_matrix(actual, preds, ax) :
    """Draw the confusion matrix of *preds* versus *actual* as a heatmap on *ax*.

    Cells are annotated with their counts; the x-axis label is placed at the
    top of the plot.
    """
    counts = metrics.confusion_matrix(actual, preds)

    class_names = [0, 1]  # name of classes
    positions = np.arange(len(class_names))
    ax.set_xticks(positions, class_names)
    ax.set_yticks(positions, class_names)

    # Heatmap of the counts
    sns.heatmap(pd.DataFrame(counts), annot=True, cmap="YlGnBu", fmt='g', ax=ax)

    ax.xaxis.set_label_position("top")
    ax.set_title('Confusion matrix')
    ax.set_ylabel('Actual label')
    ax.set_xlabel('Predicted label')

def model_evaluation(estimator_name, X_train, y_train, X_test, y_test, threshold=0.5):
    """Fit one tuned estimator and score it on the test set at *threshold*.

    The estimator stored under *estimator_name* in ``Best_estimators`` is
    fitted on the training data. Its class-1 probabilities on the test set
    are binarised at *threshold*. A confusion matrix (left) and a bar chart
    of the scores (right) are drawn on a new two-panel figure.

    Returns
    -------
    pandas.Series
        Accuracy, Recall, Precision, F1_score and AUC_score, named after
        *estimator_name*. AUC is computed from the probabilities, the other
        scores from the binarised predictions.
    """
    _, (matrix_ax, scores_ax) = plt.subplots(1, 2, figsize=(16, 6))

    model = Best_estimators[estimator_name]
    model.fit(X_train, y_train)

    positive_proba = model.predict_proba(X_test)[:, 1]
    preds = get_pred_from_proba(positive_proba, threshold)

    plot_confusion_matrix(y_test, preds, matrix_ax)

    perf = pd.Series(
        {
            'Accuracy': metrics.accuracy_score(y_test, preds),
            'Recall': metrics.recall_score(y_test, preds),
            'Precision': metrics.precision_score(y_test, preds),
            'F1_score': metrics.f1_score(y_test, preds),
            'AUC_score': metrics.roc_auc_score(y_test, positive_proba),
        },
        name=estimator_name,
    )

    bars = sns.barplot(x=perf.index, y=perf.values, ax=scores_ax, palette='YlGnBu')
    bars.set_title(f"Evaluation of {estimator_name}")

    return perf


# Evaluate every tuned model at its selected probability threshold and collect
# the resulting score Series side by side, one column per model.
models_comparison = pd.DataFrame()
for model_name in Best_estimators:
    perf = model_evaluation(
        model_name,
        X_train,
        y_train,
        X_test,
        y_test,
        threshold=optimal_thresholds[model_name],
    )
    models_comparison = pd.concat([models_comparison, perf], axis=1)


# Move the score names out of the index into a 'score' column, then reshape to
# long form (title-cased columns Score / Variable / Value) for the grouped bars.
df = models_comparison.copy().rename_axis('score').reset_index()
tidy = df.melt(id_vars='score').rename(columns=str.title)

# Top: rounded score table. Bottom: one group of bars per score, one bar per model.
fig, axs = plt.subplots(2, 1, figsize=(18, 10))
plot_df(df.round(4), ax=axs[0])
sns.barplot(
    data=tidy,
    x='Score',
    y='Value',
    hue='Variable',
    palette='YlGnBu',
    ax=axs[1],
)
plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0)

<matplotlib.legend.Legend at 0x1fd082244c0>


def plot_permutation_importance(estimator_name, X_train, y_train, X_test, y_test):
    """Bar-plot the permutation importances of one tuned estimator.

    The estimator stored under *estimator_name* in ``Best_estimators`` is
    fitted on the training data. Importances are measured on the test set
    with 10 repeats and plotted on the current axes, sorted by increasing
    mean importance and labelled with the columns of *X_test*.
    """
    model = Best_estimators[estimator_name]
    model.fit(X_train, y_train)

    importances = permutation_importance(
        model, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2
    )
    mean_importance = importances.importances_mean
    ascending = mean_importance.argsort()

    sns.barplot(
        x=mean_importance[ascending],
        y=X_test.columns[ascending],
        palette='YlGnBu',
    )

def get_names_from_pipe(pipe):
    """Return the preprocessor's feature names with one-hot prefixes resolved.

    Names of the form ``x<idx><suffix>`` returned by ``get_feature_names`` for
    the pipeline's ``'columntransformer'`` step are rewritten as
    ``<column><suffix>``. ``<column>`` is entry ``<idx>`` of the column list
    given to the first sub-pipeline whose first step is a ``OneHotEncoder``.
    All other names are returned unchanged.

    NOTE(review): assumes ``get_feature_names`` returns a mutable list of
    strings and that ``x<idx>`` indexes the encoder's input columns - confirm
    against that helper.

    Raises
    ------
    ValueError
        If an ``x<idx>`` name is found but no one-hot sub-pipeline exists.
    """
    transf = pipe.named_steps['columntransformer']

    # Each fitted entry is a (name, transformer, columns) triple.
    oh_feat_names = None
    for step in transf.transformers_:
        if isinstance(step[1], Pipeline) and isinstance(step[1][0], OneHotEncoder):
            oh_feat_names = step[2]
            break

    features = get_feature_names(transf)
    for i, elt in enumerate(features):
        if not elt.startswith('x'):
            continue
        # Read the whole run of digits after 'x' so that indices of 10 and
        # above are handled, not just a single digit.
        digits_end = 1
        while digits_end < len(elt) and elt[digits_end].isdecimal():
            digits_end += 1
        if digits_end == 1:
            # Starts with 'x' but carries no index: not an encoder-generated name.
            continue
        if oh_feat_names is None:
            raise ValueError(
                "found one-hot style feature name %r but no OneHotEncoder "
                "sub-pipeline in the column transformer" % elt
            )
        features[i] = oh_feat_names[int(elt[1:digits_end])] + elt[digits_end:]
    return features

def plot_SHAP(pipe, x_train, y_train, x_test, random_state=None):
    """Draw a SHAP summary plot for the final estimator of *pipe*.

    The pipeline's ``'columntransformer'`` step is refitted on *x_train* and
    applied to both sets. For a logistic-regression pipeline the
    ``'polynomialfeatures'`` step is applied as well. The final estimator is
    then refitted on the transformed training data and explained on a random
    sample of the transformed test data:

    * ``xgbclassifier`` / ``randomforestclassifier``: ``shap.TreeExplainer``
      on 100 test rows (for the random forest, the values at index 1 of the
      returned list are plotted);
    * ``svc`` / ``kneighborsclassifier`` / ``logisticregression``:
      ``shap.KernelExplainer`` on ``model.predict`` with a 100-centroid
      k-means background, on 50 test rows.

    Any other final step name produces no plot.

    Parameters
    ----------
    pipe : pipeline with a ``'columntransformer'`` step and a final estimator.
    x_train, x_test : pandas.DataFrame (their indexes are reused).
    y_train : training labels.
    random_state : optional seed forwarded to ``DataFrame.sample``; the
        default ``None`` keeps the sampling unseeded.
    """
    preprocessor = pipe.named_steps['columntransformer']
    x_train_prep = preprocessor.fit_transform(x_train, y_train)
    x_test_prep = preprocessor.transform(x_test)

    # Read the feature names only after the preprocessor has been (re)fitted,
    # so they describe this fit and do not require a previously fitted pipeline.
    feature_names = get_names_from_pipe(pipe)
    x_train_prep = pd.DataFrame(x_train_prep, columns=feature_names, index=x_train.index)
    x_test_prep = pd.DataFrame(x_test_prep, columns=feature_names, index=x_test.index)

    final_step_name, model = pipe.steps[-1]

    if final_step_name == 'logisticregression':
        fe_transf = pipe.named_steps['polynomialfeatures']
        input_names = x_train_prep.columns
        train_poly = fe_transf.fit_transform(x_train_prep)
        test_poly = fe_transf.transform(x_test_prep)
        # Prefer get_feature_names_out (newer scikit-learn releases dropped
        # get_feature_names); fall back to the legacy method otherwise.
        names_fn = getattr(fe_transf, 'get_feature_names_out', None) or fe_transf.get_feature_names
        poly_names = names_fn(input_names)
        x_train_prep = pd.DataFrame(train_poly, columns=poly_names, index=x_train.index)
        x_test_prep = pd.DataFrame(test_poly, columns=poly_names, index=x_test.index)

    model.fit(x_train_prep, y_train)

    if final_step_name == 'xgbclassifier':
        X_test_sample = x_test_prep.sample(100, random_state=random_state)
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_test_sample)
        shap.summary_plot(shap_values, X_test_sample, plot_size=None, show=False)
    elif final_step_name == 'randomforestclassifier':
        X_test_sample = x_test_prep.sample(100, random_state=random_state)
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_test_sample)
        # Index 1 of the returned list is plotted here.
        shap.summary_plot(shap_values[1], X_test_sample, plot_size=None, show=False)
    elif final_step_name in ('svc', 'kneighborsclassifier', 'logisticregression'):
        # A smaller sample is used with the kernel explainer.
        X_test_small_sample = x_test_prep.sample(50, random_state=random_state)
        # predict + shap_values, or predict_proba + shap_values[1]
        explainer = shap.KernelExplainer(model.predict, shap.kmeans(x_train_prep, 100))
        shap_values = explainer.shap_values(X_test_small_sample)
        shap.summary_plot(shap_values, X_test_small_sample, plot_size=None, show=False)


joblib_Filename = "feature_importance.joblib"
try:
    # Reuse the figure saved by a previous run, if any.
    feature_imp_fig = joblib.load(joblib_Filename)
    print("feature_importance imported")
    show_figure(feature_imp_fig)
    feature_imp_fig.show()
except FileNotFoundError:
    # No cache on disk: one row per model, SHAP summary on the left and
    # permutation importance on the right.
    i=1
    fig = plt.figure(figsize=(22,7*5))
    for name, estimator in Best_estimators.items():
        plt.subplot(5,2,i)
        plot_SHAP(estimator, X_train, y_train, X_test)
        plt.title('SHAP plot of '+ name, fontsize=16)

        plt.subplot(5,2,i+1)
        plot_permutation_importance(name, X_train, y_train, X_test, y_test)
        plt.title('Features importance of '+ name, fontsize=16)
        plt.xlabel('feature importance',  fontsize=14)
        i+=2
    plt.subplots_adjust(wspace=0.1, hspace=0.3)
    # Save before displaying so the export does not depend on how the backend
    # handles the figure once it has been shown.
    joblib.dump(fig, joblib_Filename)
    print("feature_importance created and exported")
    # The original referenced plt.show without calling it, which did nothing.
    plt.show()

feature_importance imported

Table of Contents

Introduction¶

Modules du projet¶

Aperçu des données¶

Description des variables¶

Variables numériques¶

Observations¶

Variables catégorielles¶

Observations¶

Variable cible¶

Valeurs manquantes¶

Interactions des variables¶

Analyse exploratoire de données (EDA)¶

Impact de la durée du contact sur la décision du client¶

Le solde du client a-t-il un impact sur sa décision ?¶

Le niveau d'éducation et la profession du client ont-ils un impact direct ou indirect sur sa volonté de souscrire à un compte à terme ?¶

La persistance et l'insistance ont-elles un effet négatif ou positif sur la décision du client ?¶

Modélisation¶

Préparation des données¶

Prétraitement de données¶

Feature engineering¶

Optimisation des hyperparamètres¶

Performance des modèles avant l'optimisation¶

Optimisation¶

Régression logistique¶

Support vector classifier¶

XGBoost¶

Random forest¶

KNearest neighbors¶

Learning curves¶

Evaluation¶

Discrimination threshold : choix du seuil de probabilité¶

Evaluation des Modèles¶

Features importance¶

Quelles actions la Banque devrait-elle envisager ?¶