import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import imblearn
import os
import pickle
import time
import shap

from pathlib import Path
from scipy import stats
from scipy.stats import norm
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, roc_auc_score, silhouette_score
from sklearn.decomposition import PCA
from sklearn.utils import class_weight
from pyclustertend import hopkins
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from math import ceil
from statsmodels.graphics.gofplots import qqplot
from scipy.stats import shapiro
from scipy.stats import normaltest
from scipy.stats import anderson
from xgboost import plot_importance
from lifelines import CoxPHFitter
from warnings import simplefilter


simplefilter(action='ignore', category=FutureWarning)
%matplotlib inline
sns.set_theme()


seed_ = 194
np.random.seed(seed_)


heart_original = pd.read_csv('Data/heart.csv', sep = ',')


heart_original.head()


heart_original.describe()


heart_original.dtypes

age                         float64
anaemia                       int64
creatinine_phosphokinase      int64
diabetes                      int64
ejection_fraction             int64
high_blood_pressure           int64
platelets                   float64
serum_creatinine            float64
serum_sodium                  int64
sex                           int64
smoking                       int64
time                          int64
DEATH_EVENT                   int64
dtype: object


heart = heart_original.copy()


heart['age'] = heart['age'].astype('int64')


heart.dtypes

age                           int64
anaemia                       int64
creatinine_phosphokinase      int64
diabetes                      int64
ejection_fraction             int64
high_blood_pressure           int64
platelets                   float64
serum_creatinine            float64
serum_sodium                  int64
sex                           int64
smoking                       int64
time                          int64
DEATH_EVENT                   int64
dtype: object


heart.describe()


# Verificando se possui valores missing
print(heart.isna().sum())

age                         0
anaemia                     0
creatinine_phosphokinase    0
diabetes                    0
ejection_fraction           0
high_blood_pressure         0
platelets                   0
serum_creatinine            0
serum_sodium                0
sex                         0
smoking                     0
time                        0
DEATH_EVENT                 0
dtype: int64


# Verificando valores unicos
print(heart.nunique())

age                          46
anaemia                       2
creatinine_phosphokinase    208
diabetes                      2
ejection_fraction            17
high_blood_pressure           2
platelets                   176
serum_creatinine             40
serum_sodium                 27
sex                           2
smoking                       2
time                        148
DEATH_EVENT                   2
dtype: int64


# Verificando valores duplicados
print(sum(heart.duplicated()))

0


# Lista de variaveis de cada tipo
continuas = []
categoricas = []

for c in heart.columns[:-1]:
    if heart.nunique()[c] > 5:
        continuas.append(c)
    else:
        categoricas.append(c)


continuas

['age',
 'creatinine_phosphokinase',
 'ejection_fraction',
 'platelets',
 'serum_creatinine',
 'serum_sodium',
 'time']


categoricas

['anaemia', 'diabetes', 'high_blood_pressure', 'sex', 'smoking']


heart[continuas].head()


heart[categoricas].head()


# Plot para variaveis continuas

fig = plt.figure(figsize = (12, 8))

for i, col in enumerate(continuas):
    plt.subplot(3, 3, i + 1)
    heart.boxplot(col)
    plt.tight_layout()


heart[continuas] = np.log1p(1 + heart[continuas])


# Plot para variaveis continuas

fig = plt.figure(figsize = (12, 8))

for i, col in enumerate(continuas):
    plt.subplot(3, 3, i + 1)
    heart.boxplot(col)
    plt.tight_layout()


heart = heart_original.copy()


# Mapa de calor das variaveis continuas
continuas_temp = continuas.copy()
continuas_temp.append('DEATH_EVENT')

plt.figure(figsize = (12, 12))
sns.heatmap(heart[continuas_temp].corr(method = 'pearson'), annot = True, square = True)
plt.show()


# Countplot para variaveis categóricas

fig = plt.figure()
fig.subplots_adjust(hspace = 0.4, wspace = 0.4)
fig.set_figheight(7)
fig.set_figwidth(10)

for i, col in enumerate(categoricas):
    ax = fig.add_subplot(ceil(len(categoricas) / 3), 3, i + 1)
    sns.countplot(x = heart[col])
    
plt.tight_layout()
plt.show()


# Countplot da variavel target
sns.countplot(x = heart['DEATH_EVENT'])
plt.show()


print(Counter(heart['DEATH_EVENT']))

Counter({0: 203, 1: 96})


for col in categoricas:
    pd.crosstab(heart[col], heart['DEATH_EVENT']).plot(kind = 'bar',
                                                       stacked = True,
                                                       figsize = (15, 5),
                                                       color = ['green', 'red'])


def quantil_quantil_teste(data, columns):
    
    for col in columns:
        print(col)
        qqplot(data[col], line = 's')
        plt.show()


quantil_quantil_teste(heart_original, continuas)

age

creatinine_phosphokinase

ejection_fraction

platelets

serum_creatinine

serum_sodium

time


def testes_gaussianos(data, columns, teste):
    
    for i, col in enumerate(columns):
        print('Teste para a variavel', col)
        alpha = 0.05
        
        if teste == 'shapiro':
            stat, p = shapiro(data[col])
        elif teste == 'normal':
            stat, p = normaltest(data[col])           
        elif teste == 'anderson':
            resultado = anderson(data[col])
            print('Stats: %.4f' % resultado.statistic)
            
            for j in range(len(resultado.critical_values)):
                sl, cv = resultado.significance_level[j], resultado.critical_values[j]
                
                if resultado.statistic < cv:
                    print('Significancia = %.4f, Valor Critico = %.4f, os dados parecem Gaussianos. Falha ao rejeitar H0.' % (sl, cv))
                else:
                    print('Significancia = %.4f, Valor Critico = %.4f, os dados não parecem Gaussianos. H0 rejeitado.' % (sl, cv))
            
        if teste != 'anderson':         
            print('Stat = ', round(stat, 4))
            print('p-value = ', round(p, 4))
            #print('Stats = %4.f, p = %4.f' % (stat, p))

            if p > alpha:
                print('Os dados parecem Gaussianos. Falha ao rejeitar H0.')
            else:
                print('Os dados não parecem Gaussianos. H0 rejeitado.')
            
        print('\n')


testes_gaussianos(heart, continuas, teste = 'shapiro')

Teste para a variavel age
Stat =  0.9755
p-value =  0.0001
Os dados não parecem Gaussianos. H0 rejeitado.


Teste para a variavel creatinine_phosphokinase
Stat =  0.5143
p-value =  0.0
Os dados não parecem Gaussianos. H0 rejeitado.


Teste para a variavel ejection_fraction
Stat =  0.9473
p-value =  0.0
Os dados não parecem Gaussianos. H0 rejeitado.


Teste para a variavel platelets
Stat =  0.9115
p-value =  0.0
Os dados não parecem Gaussianos. H0 rejeitado.


Teste para a variavel serum_creatinine
Stat =  0.5515
p-value =  0.0
Os dados não parecem Gaussianos. H0 rejeitado.


Teste para a variavel serum_sodium
Stat =  0.939
p-value =  0.0
Os dados não parecem Gaussianos. H0 rejeitado.


Teste para a variavel time
Stat =  0.9468
p-value =  0.0
Os dados não parecem Gaussianos. H0 rejeitado.


testes_gaussianos(heart, continuas, teste = 'normal')

Teste para a variavel age
Stat =  8.9515
p-value =  0.0114
Os dados não parecem Gaussianos. H0 rejeitado.


Teste para a variavel creatinine_phosphokinase
Stat =  307.1838
p-value =  0.0
Os dados não parecem Gaussianos. H0 rejeitado.


Teste para a variavel ejection_fraction
Stat =  14.1656
p-value =  0.0008
Os dados não parecem Gaussianos. H0 rejeitado.


Teste para a variavel platelets
Stat =  111.3375
p-value =  0.0
Os dados não parecem Gaussianos. H0 rejeitado.


Teste para a variavel serum_creatinine
Stat =  307.903
p-value =  0.0
Os dados não parecem Gaussianos. H0 rejeitado.


Teste para a variavel serum_sodium
Stat =  74.3649
p-value =  0.0
Os dados não parecem Gaussianos. H0 rejeitado.


Teste para a variavel time
Stat =  174.7684
p-value =  0.0
Os dados não parecem Gaussianos. H0 rejeitado.


def crosstab_column(data, col, target, percentage = True):
    res = pd.crosstab(data[col], data[target], margins = True)
    
    if percentage:
        res = pd.crosstab(data[col], data[target], margins = True, normalize = 'index').round(4) * 100
    
    return res


for col in categoricas:
    print(crosstab_column(heart, col, 'DEATH_EVENT'), end = '\n\n\n')

DEATH_EVENT      0      1
anaemia                  
0            70.59  29.41
1            64.34  35.66
All          67.89  32.11


DEATH_EVENT      0      1
diabetes                 
0            67.82  32.18
1            68.00  32.00
All          67.89  32.11


DEATH_EVENT              0      1
high_blood_pressure              
0                    70.62  29.38
1                    62.86  37.14
All                  67.89  32.11


DEATH_EVENT      0      1
sex                      
0            67.62  32.38
1            68.04  31.96
All          67.89  32.11


DEATH_EVENT      0      1
smoking                  
0            67.49  32.51
1            68.75  31.25
All          67.89  32.11


def coefSpearman(data, col, target):    
    for c in col:
        coeficiente, p_valor = stats.spearmanr(data[c], data[target])
        print("Correlação de Spearman entre a variavel", target, "e a variavel continua", c, ": {:0.4}".format(coeficiente))


coefSpearman(heart, continuas, 'DEATH_EVENT')

Correlação de Spearman entre a variavel DEATH_EVENT e a variavel continua age : 0.2181
Correlação de Spearman entre a variavel DEATH_EVENT e a variavel continua creatinine_phosphokinase : 0.02362
Correlação de Spearman entre a variavel DEATH_EVENT e a variavel continua ejection_fraction : -0.2869
Correlação de Spearman entre a variavel DEATH_EVENT e a variavel continua platelets : -0.0462
Correlação de Spearman entre a variavel DEATH_EVENT e a variavel continua serum_creatinine : 0.3706
Correlação de Spearman entre a variavel DEATH_EVENT e a variavel continua serum_sodium : -0.2098
Correlação de Spearman entre a variavel DEATH_EVENT e a variavel continua time : -0.5432


def qui2(data, col, target):
    for c in col:
        cross = pd.crosstab(data[c], data[target])
        chi2, p, dof, exp = stats.chi2_contingency(cross)
        print("Qui-quadrado entre a variavel", target, "e a variavel categorica", c, ": {:0.4}".format(chi2))
        print("Apresentando um p-value de: {:0.4}".format(p), end = '\n\n')


qui2(heart, categoricas, 'DEATH_EVENT')

Qui-quadrado entre a variavel DEATH_EVENT e a variavel categorica anaemia : 1.042
Apresentando um p-value de: 0.3073

Qui-quadrado entre a variavel DEATH_EVENT e a variavel categorica diabetes : 0.008458
Apresentando um p-value de: 0.9267

Qui-quadrado entre a variavel DEATH_EVENT e a variavel categorica high_blood_pressure : 1.543
Apresentando um p-value de: 0.2141

Qui-quadrado entre a variavel DEATH_EVENT e a variavel categorica sex : 0.003037
Apresentando um p-value de: 0.9561

Qui-quadrado entre a variavel DEATH_EVENT e a variavel categorica smoking : 0.007331
Apresentando um p-value de: 0.9318


cph = CoxPHFitter()


cph.fit(heart, event_col = 'DEATH_EVENT', duration_col = 'time')

<lifelines.CoxPHFitter: fitted with 299 total observations, 203 right-censored observations>


cph.plot()

<AxesSubplot:xlabel='log(HR) (95% CI)'>


cph.print_summary(columns=["coef", "exp(coef)", "exp(coef) lower 95%", "exp(coef) upper 95%", "z", "p"], decimals = 4)


cph.predict_survival_function(heart.loc[1]).plot(title="Probabilidade de sobrevivencia do individuo 01 ao longo do tempo")

<AxesSubplot:title={'center':'Probabilidade de sobrevivencia do individuo 01 ao longo do tempo'}>


cph.predict_survival_function(heart.loc[10]).plot(title="Probabilidade de sobrevivencia do individuo 10 ao longo do tempo")

<AxesSubplot:title={'center':'Probabilidade de sobrevivencia do individuo 10 ao longo do tempo'}>


cph.plot_partial_effects_on_outcome(covariates = 'age', values = [35, 45, 55, 60, 65, 70, 75, 85, 95], cmap = 'coolwarm')

<AxesSubplot:>


cph.plot_partial_effects_on_outcome(covariates = 'creatinine_phosphokinase', values = [250, 500, 1000, 2000, 3000, 4000],\
                                    cmap = 'coolwarm')

<AxesSubplot:>


cph.plot_partial_effects_on_outcome(covariates = 'ejection_fraction', values = [20, 30, 40, 50, 60, 70, 80], cmap = 'coolwarm')

<AxesSubplot:>


cph.plot_partial_effects_on_outcome(covariates = 'platelets',\
                                    values = [200000, 300000, 400000, 500000, 600000, 700000, 800000],\
                                    cmap = 'coolwarm')

<AxesSubplot:>


cph.plot_partial_effects_on_outcome(covariates = 'serum_creatinine', values = [1, 2, 3, 4, 5, 6, 7, 8, 9], cmap = 'coolwarm')

<AxesSubplot:>


cph.plot_partial_effects_on_outcome(covariates = 'serum_sodium', values = [125, 130, 135, 140, 145], cmap = 'coolwarm')

<AxesSubplot:>


def plot_partial_categorical(cph_, columns):
    for column in columns:
        cph_.plot_partial_effects_on_outcome(covariates = column, values = [0, 1], cmap = 'coolwarm')


plot_partial_categorical(cph, categoricas)


continuas.remove('time')
heart = heart.drop('time', axis = 1)
heart.head()


X = heart.iloc[:, :-1]
y = heart['DEATH_EVENT'].values
columns = heart.keys()


X.head()


corr = np.corrcoef(X[continuas], rowvar = 0)
eigenvalues, eigenvectors = np.linalg.eig(corr)


print(eigenvalues, min(eigenvalues))

[1.34527211 1.15364884 0.72934891 0.82384075 0.96051258 0.98737681] 0.7293489113371818


print(abs(eigenvectors[:, 2]))

[0.18085313 0.23800393 0.41537521 0.02737793 0.54333596 0.66494074]


print(continuas[0], continuas[1], continuas[2], continuas[4], continuas[5])

age creatinine_phosphokinase ejection_fraction serum_creatinine serum_sodium


def scatter_plot_conjunto(data, columns, target):
    # Definindo range de Y
    y_range = [data[target].min(), data[target].max()]
    
    for column in columns:
        if target != column:
            # Definindo range de X
            x_range = [data[column].min(), data[column].max()]
            
            # Scatter plot de X e Y
            scatter_plot = data.plot(kind = 'scatter', x = column, y = target, xlim = x_range, ylim = y_range)
            
            # Traçar linha da media de X e Y
            meanX = scatter_plot.plot(x_range, [data[target].mean(), data[target].mean()], '--', color = 'red', linewidth = 1)
            meanY = scatter_plot.plot([data[column].mean(), data[column].mean()], y_range, '--', color = 'red', linewidth = 1)


heart_multicolinearidade = heart[['age', 'creatinine_phosphokinase' ,'ejection_fraction', 'serum_creatinine', 'serum_sodium']]


sns.pairplot(heart_multicolinearidade)

<seaborn.axisgrid.PairGrid at 0x277ef39abb0>


def boxplot_plot_individual(data, columns, target):
    
    fig = plt.figure()
    fig.subplots_adjust(hspace = 0.4, wspace = 0.4)
    fig.set_figheight(25)
    fig.set_figwidth(15)
    
    columns_adjust = ceil(len(columns))
    
    for i, column in enumerate(columns):
        if column != target:
            ax = fig.add_subplot(columns_adjust, 3, i + 1)
            sns.boxplot(x = target, y = column, data = data)
    
    plt.tight_layout()
    plt.show()


boxplot_plot_individual(heart, continuas, 'DEATH_EVENT')


sns.pairplot(heart[continuas])

<seaborn.axisgrid.PairGrid at 0x277f0f77c10>


print(heart[continuas].skew())

age                         0.423062
creatinine_phosphokinase    4.463110
ejection_fraction           0.555383
platelets                   1.462321
serum_creatinine            4.455996
serum_sodium               -1.048136
dtype: float64


def hist_individual(data, columns):
    fig = plt.figure()
    fig.subplots_adjust(hspace = 0.4, wspace = 0.4)
    fig.set_figheight(10)
    fig.set_figwidth(15)
    
    columns_adjust = ceil(len(columns) / 3)
    
    for i, column in enumerate(columns):
        ax = fig.add_subplot(columns_adjust, 3, i + 1)
        data[column].hist(label = column)
        plt.title(column)
        
    plt.tight_layout()  
    plt.show()


hist_individual(heart, continuas)


print(heart[continuas].kurtosis() - 3)

age                         -3.184871
creatinine_phosphokinase    22.149046
ejection_fraction           -2.958591
platelets                    3.209255
serum_creatinine            22.828239
serum_sodium                 1.119712
dtype: float64


def boxplot_individuais(data, columns):
    fig = plt.figure()
    fig.subplots_adjust(hspace = 0.4, wspace = 0.4)
    fig.set_figheight(8)
    fig.set_figwidth(15)
    
    columns_adjust = ceil(len(columns) / 3)
    
    for i, column in enumerate(columns):
        ax = fig.add_subplot(columns_adjust, 3, i + 1)
        sns.boxplot(x = data[column])
        
    plt.tight_layout()  
    plt.show()


boxplot_individuais(heart, continuas)


# Criando array de colunas com outliers
outlier_columns = continuas.copy()
outlier_columns.remove('age')
print(outlier_columns)

['creatinine_phosphokinase', 'ejection_fraction', 'platelets', 'serum_creatinine', 'serum_sodium']


def outlier_log_transformation_based(data, columns):
    for column in columns:
        data[column] = data[column].map(lambda x: np.log(x) if x > 0 else 0)
    return data


def outlier_percentil_based(data, columns):
    for column in columns:
        # Capturando percentile de 10 e 90
        percentil10 = data[column].quantile(0.10)
        percentil90 = data[column].quantile(0.90)
        
        data[column] = np.where(data[column] < percentil10, percentil10, data[column])
        data[column] = np.where(data[column] > percentil90, percentil90, data[column])
        
    return data


heart = outlier_percentil_based(heart, outlier_columns)


print(heart[continuas].skew())

age                         0.423062
creatinine_phosphokinase    0.987717
ejection_fraction           0.663999
platelets                   0.183135
serum_creatinine            0.969494
serum_sodium               -0.103250
dtype: float64


hist_individual(heart, continuas)


print(heart[continuas].kurtosis() - 3)

age                        -3.184871
creatinine_phosphokinase   -3.206519
ejection_fraction          -3.409022
platelets                  -3.827136
serum_creatinine           -3.309419
serum_sodium               -4.161165
dtype: float64


boxplot_individuais(heart, continuas)


X = heart.iloc[:, :-1]
y = heart['DEATH_EVENT'].values


X.head()


print(y)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1
 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 0 1 0 1 1 1 1 1 0 0 1 0
 1 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1
 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0
 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1
 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0]


# Aplicando padronização
scaler = StandardScaler()
X_standard = scaler.fit_transform(X)


print(X_standard)

[[ 1.19294523 -0.87110478  0.44656511 ... -1.58662083  0.73568819
  -0.68768191]
 [-0.49127928 -0.87110478  2.13856173 ... -0.25777033  0.73568819
  -0.68768191]
 [ 0.35083298 -0.87110478 -0.73984615 ... -1.58662083  0.73568819
   1.4541607 ]
 ...
 [-1.33339153 -0.87110478  2.13856173 ...  0.40665492 -1.35927151
  -0.68768191]
 [-1.33339153 -0.87110478  2.13856173 ...  1.07108017  0.73568819
   1.4541607 ]
 [-0.9123354  -0.87110478 -0.60378982 ... -0.25777033  0.73568819
   1.4541607 ]]


plt.hist(X_standard[:,0:1])

(array([37., 37., 39., 55., 46., 40., 19., 12.,  8.,  6.]),
 array([-1.75444766, -1.29128592, -0.82812418, -0.36496244,  0.0981993 ,
         0.56136104,  1.02452278,  1.48768452,  1.95084626,  2.414008  ,
         2.87716974]),
 <BarContainer object of 10 artists>)


heart_original.DEATH_EVENT.value_counts().plot(kind = 'bar', title = 'Count DEATH EVENT')

<AxesSubplot:title={'center':'Count DEATH EVENT'}>


x_train, x_test, y_train, y_test = train_test_split(X_standard, y, test_size = .3, random_state = seed_)


oversample = SMOTE(random_state = seed_)
x_train_resample, y_train_resample = oversample.fit_resample(x_train, y_train)


y_all = np.concatenate((y_train_resample, y_test), axis = 0)


print(np.shape(y_test),
np.shape(y_train_resample),
np.shape(y_all))

(90,) (284,) (374,)


Counter(y_all)

Counter({1: 171, 0: 203})


dt = pd.DataFrame(y_all, columns = ['target'])
dt.target.value_counts().plot(kind ='bar', title = 'Count DEATH EVENT')

<AxesSubplot:title={'center':'Count DEATH EVENT'}>


modeloXGB = xgb.XGBClassifier(n_estimators = 1000, use_label_encoder = False, seed_ = seed_)


modeloXGB.fit(X_standard, y)

[16:01:24] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:573: 
Parameters: { "seed_" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[16:01:24] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=1000, n_jobs=24, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed_=194,
              subsample=1, tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)


print(modeloXGB.feature_importances_)

[0.08655462 0.02514137 0.0560401  0.02858021 0.17375933 0.0743641
 0.04546017 0.15915318 0.10861176 0.12236615 0.1199689 ]


index_ordenado = modeloXGB.feature_importances_.argsort()


index_ordenado

array([ 1,  3,  6,  2,  5,  0,  8, 10,  9,  7,  4], dtype=int64)


plt.barh(heart.drop('DEATH_EVENT', axis = 1).columns[index_ordenado], modeloXGB.feature_importances_[index_ordenado])

<BarContainer object of 11 artists>


plot_importance(modeloXGB)

<AxesSubplot:title={'center':'Feature importance'}, xlabel='F score', ylabel='Features'>


modeloRFE = LogisticRegression(solver = 'lbfgs', random_state = seed_)
rfe = RFE(modeloRFE, n_features_to_select = 6)
fit = rfe.fit(X_standard, y)


print("Features Selecionadas: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)

Features Selecionadas: [ True  True False False  True  True False  True  True False False]
Feature Ranking: [1 1 2 6 1 1 5 1 1 3 4]


selecionadas = [columns[i] for i, col in enumerate(fit.support_) if col == True]


selecionadas

['age',
 'anaemia',
 'ejection_fraction',
 'high_blood_pressure',
 'serum_creatinine',
 'serum_sodium']


heart[selecionadas].head()


modeloExtraTrees = ExtraTreesClassifier(n_estimators = 1000, random_state = seed_)
modeloExtraTrees.fit(X_standard, y)
print(modeloExtraTrees.feature_importances_)

[0.1322135  0.03761433 0.09971018 0.05015474 0.14472388 0.03769115
 0.10798292 0.18853182 0.11691522 0.0416596  0.04280264]


index_ordenado_extra = modeloExtraTrees.feature_importances_.argsort()


plt.barh(heart.drop('DEATH_EVENT', axis = 1).columns[index_ordenado_extra],
         modeloExtraTrees.feature_importances_[index_ordenado_extra])

<BarContainer object of 11 artists>


heart.head()


'''
index_ordenado_invertido = np.flip(index_ordenado)
print(index_ordenado_invertido)

heart = heart.iloc[:, index_ordenado_invertido[0:6]]
features = index_ordenado_invertido[0:6]

# Mantendo os 6 melhores de acordo com XGBOOST sem excluir variaveis MultiColineares
x_train = x_train[:, index_ordenado_invertido[0:6]]
x_test = x_test[:, index_ordenado_invertido[0:6]]
x_train_resample = x_train_resample[:, index_ordenado_invertido[0:6]]
'''

'\nindex_ordenado_invertido = np.flip(index_ordenado)\nprint(index_ordenado_invertido)\n\nheart = heart.iloc[:, index_ordenado_invertido[0:6]]\nfeatures = index_ordenado_invertido[0:6]\n\n# Mantendo os 6 melhores de acordo com XGBOOST sem excluir variaveis MultiColineares\nx_train = x_train[:, index_ordenado_invertido[0:6]]\nx_test = x_test[:, index_ordenado_invertido[0:6]]\nx_train_resample = x_train_resample[:, index_ordenado_invertido[0:6]]\n'


'''
Mantendo os 6 melhores de acordo com XGBOOST excluindo as MultiColinearidades

Variaveis com multicolinearidade:
age, creatinine_phosphokinase, ejection_fraction, serum_creatinine e serum_sodium
'''
'''
features = [0, 2, 3, 5, 6, 8]

heart = heart.iloc[:, features]

x_train = x_train[:, features]
x_test = x_test[:, features]
x_train_resample = x_train_resample[:, features]
'''

'\nfeatures = [0, 2, 3, 5, 6, 8]\n\nheart = heart.iloc[:, features]\n\nx_train = x_train[:, features]\nx_test = x_test[:, features]\nx_train_resample = x_train_resample[:, features]\n'


'''
Mantendo as variaveis indicadas confiaveis e ideais pelo Cox Proportional Hazard,
sem adaptar o modelo para remover multicolineres.
'''

features = [5, 1, 7, 3, 10, 0]

heart = heart.iloc[:, features]

x_train = x_train[:, features]
x_test = x_test[:, features]
x_train_resample = x_train_resample[:, features]


heart.head()


print(x_train[0])

[-0.73568819 -0.87110478 -0.33527655  1.1798305  -0.68768191 -1.33339153]


def report_modelo(modelo, y, pred, label = 'Modelo', save = False, target_names = [0, 1], cut_limit = 0.5):
    # Forçando predições para um numero inteiro
    pred[pred > cut_limit] = 1
    pred[pred <= cut_limit] = 0
    
    # Plotando a matriz de confusão
    cm = confusion_matrix(y, pred)
    cm = pd.DataFrame(cm, index = target_names, columns= target_names)

    plt.figure(figsize = (10, 10))
    sns.heatmap(cm, cmap = "Blues", linecolor = 'black', linewidths = 1, annot = True, \
                fmt = '', xticklabels = target_names, yticklabels = target_names)
    plt.show()
    
    print('AUC: %f' % roc_auc_score(y, pred))
    
    # Area sob  a curva ROC
    rfp, rvp, lim = roc_curve(y,  pred)

    plt.plot(rfp, rvp, marker = '.',  label = label,  color = 'orange')
    plt.plot([0, 1],  [0, 1], color = 'darkblue', linestyle = '--')
    plt.xlabel('Especificade')
    plt.ylabel('Sensibilidade')
    plt.legend()
    plt.show()
    
    # Acurácia
    print("Acurácia: %f" % accuracy_score(y, pred))
    
    # Classification Report
    print(classification_report(y, pred, target_names= target_names))    
    
    # Salvando modelo sem sobreescrever arquivos existentes
    if save:
        shortFileName = '000'
        fileName = 'models/0001.model'
        fileObj = Path(fileName)
        
        index = 1
        while fileObj.exists():
            index += 1
            fileName = 'models/' + shortFileName + str(index) + '.model'
            fileObj = Path(fileName)
        
        # salvar modelo
        pickle.dump(modelo, open(fileName, 'wb'))
        
        return fileName


# Hopkins sem padronização
print("Sem padronização:", hopkins(X, X.shape[0]))

# Hopkins com padronização
print("Com padronização:", hopkins(X_standard, X_standard.shape[0]))

# Hopkins com padronização e feature selecting
X_standard_feature = X_standard[:, features]
print("Com padronização e feature selecting:", hopkins(X_standard_feature, X_standard_feature.shape[0]))

# Hopkins com normalização
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
print("Com normalização:", hopkins(X_scaled, X_scaled.shape[0]))

# Hopkins com normalização e feature selecting
X_scaled_feature = X_scaled[:, features]
print("Com normalização e feature selecting:", hopkins(X_scaled_feature, X_scaled_feature.shape[0]))

Sem padronização: 0.3363645330777487
Com padronização: 0.4308891768765708
Com padronização e feature selecting: 0.22525797559595137
Com normalização: 0.3771774104157872
Com normalização e feature selecting: 0.1479034056449901


print(x_train.shape, x_test.shape)

(209, 6) (90, 6)


x_full = np.concatenate([x_train, x_test])


print(x_full.shape)

(299, 6)


# Aplicando a redução de dimensionalidade
pca = PCA(n_components = 2)
pca = pca.fit_transform(x_full)


modelo_v1 = KMeans(n_clusters = 2, random_state = seed_)
modelo_v1.fit(pca)

KMeans(n_clusters=2, random_state=194)


x_min, x_max, y_min, y_max, xx, yy, Z = [0, 0, 0, 0, 0, 0, 0]


def minMax(pca_, modelo):
    global x_min, x_max, y_min, y_max, xx, yy, Z
    
    # Obtenção de valores minimos e maximos
    x_min, x_max = pca_[:, 0].min(), pca_[:, 0].max()
    y_min, y_max = pca_[:, 1].min(), pca_[:, 1].max()
    xx, yy = np.meshgrid(np.arange(x_min, x_max, .02), np.arange(y_min, y_max, .02))
    Z = modelo.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)


minMax(pca, modelo_v1)


def areaCluster():
    # Plot das areas dos clusters
    plt.imshow(Z, interpolation = 'nearest',
               extent = (xx.min(), xx.max(), yy.min(), yy.max()),
               cmap = plt.cm.Paired,
               aspect = 'auto',
               origin = 'lower')


areaCluster()


def plotCentroides(pca_, modelo):
    # Plot dos centroides
    plt.plot(pca_[:, 0], pca_[:, 1], 'k.', markersize = 4)
    centroids = modelo.cluster_centers_
    inert = modelo.inertia_
    plt.scatter(centroids[:, 0], centroids[:, 1], marker = 'x', s = 169, linewidths = 3, color = 'r', zorder = 8)
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.show()


plotCentroides(pca, modelo_v1)


?silhouette_score


# Silhouette Score
labels = modelo_v1.labels_
silhouette_score(pca, labels, metric = 'euclidean')

0.33460910639009167


x_full_resample = np.concatenate([x_train_resample, x_test])


# Aplicando a redução de dimensionalidade
pca2 = PCA(n_components = 2)
pca2 = pca2.fit_transform(x_full_resample)


modelo_v2 = KMeans(n_clusters = 2, random_state = seed_)
modelo_v2.fit(pca2)

KMeans(n_clusters=2, random_state=194)


minMax(pca2, modelo_v2)


areaCluster()


plotCentroides(pca2, modelo_v2)


# Silhouette Score
labels = modelo_v2.labels_
silhouette_score(pca2, labels, metric = 'euclidean')

0.3620117065146864


# Modelo v3
# Iremos utilizados os dados normalizados com feature selecting

# Aplicando a redução de dimensionalidade
pca3 = PCA(n_components = 2)
pca3 = pca3.fit_transform(X_scaled_feature)

modelo_v3 = KMeans(n_clusters = 2, random_state = seed_)
modelo_v3.fit(pca3)

minMax(pca3, modelo_v3)


areaCluster()


plotCentroides(pca3, modelo_v3)


# Silhouette Score
labels = modelo_v3.labels_
silhouette_score(pca3, labels, metric = 'euclidean')

0.41523707424145706


# Criação do modelo v1
modelo_svm_v1 = SVC(kernel = 'linear', random_state = seed_)


# Criando base sem padronização, balanceamento e feature selecting
x_train_nothing, x_test_nothing, y_train_nothing, y_test_nothing =\
                                                train_test_split(X, y, test_size = .3, random_state = seed_)


# Treinamento
start = time.time()
modelo_svm_v1.fit(x_train_nothing, y_train_nothing)
end = time.time()
print('Tempo de Treinamento do Modelo:', round(end - start, 4))

Tempo de Treinamento do Modelo: 3.8909


pred_v1 = modelo_svm_v1.predict(x_test_nothing)


report_modelo(modelo_svm_v1, y_test_nothing, pred_v1, label = 'SVM V1', target_names = ['VIVER', 'MORRER'])

AUC: 0.560769

Acurácia: 0.711111
              precision    recall  f1-score   support

       VIVER       0.71      0.98      0.82        61
      MORRER       0.80      0.14      0.24        29

    accuracy                           0.71        90
   macro avg       0.75      0.56      0.53        90
weighted avg       0.74      0.71      0.63        90


# Criação do modelo v2
modelo_svm_v2 = SVC(kernel = 'linear', random_state = seed_)


# Criando base sem balanceamento e feature selecting. Com padronização
sc = StandardScaler()
x_train_sc = sc.fit_transform(x_train_nothing)
x_test_sc = sc.fit_transform(x_test_nothing)


# Treinamento
start = time.time()
modelo_svm_v2.fit(x_train_sc, y_train_nothing)
end = time.time()
print('Tempo de Treinamento do Modelo:', round(end - start, 4))

Tempo de Treinamento do Modelo: 0.003


pred_v2 = modelo_svm_v2.predict(x_test_sc)


report_modelo(modelo_svm_v2, y_test_nothing, pred_v2, label = 'SVM V2', target_names = ['VIVER', 'MORRER'])

AUC: 0.615885

Acurácia: 0.700000
              precision    recall  f1-score   support

       VIVER       0.74      0.85      0.79        61
      MORRER       0.55      0.38      0.45        29

    accuracy                           0.70        90
   macro avg       0.65      0.62      0.62        90
weighted avg       0.68      0.70      0.68        90


# Criação do modelo v3
modelo_svm_v3 = SVC(kernel = 'linear', random_state = seed_)


# Criando sem feature selecting. Com padronização e balanceamento
oversample2 = SMOTE(random_state = seed_)
x_train_sc_resample, y_train_resample_2 = oversample2.fit_resample(x_train_sc, y_train_nothing)


# Treinamento
start = time.time()
modelo_svm_v3.fit(x_train_sc_resample, y_train_resample_2)
end = time.time()
print('Tempo de Treinamento do Modelo:', round(end - start, 4))

Tempo de Treinamento do Modelo: 0.003


pred_v3 = modelo_svm_v3.predict(x_test_sc)


report_modelo(modelo_svm_v3, y_test_nothing, pred_v3, label = 'SVM V3', target_names = ['VIVER', 'MORRER'])

AUC: 0.663652

Acurácia: 0.666667
              precision    recall  f1-score   support

       VIVER       0.80      0.67      0.73        61
      MORRER       0.49      0.66      0.56        29

    accuracy                           0.67        90
   macro avg       0.65      0.66      0.65        90
weighted avg       0.70      0.67      0.68        90


# Criação do modelo v4
modelo_svm_v4 = SVC(kernel = 'linear', random_state = seed_)


# Criando com padronização, balanceamento e feature selecting
# x_train_resample e x_test já possuem essas caracteristicas


# Treinamento
start = time.time()
modelo_svm_v4.fit(x_train_resample, y_train_resample)
end = time.time()
print('Tempo de Treinamento do Modelo:', round(end - start, 4))

Tempo de Treinamento do Modelo: 0.003


pred_v4 = modelo_svm_v4.predict(x_test)


report_modelo(modelo_svm_v4, y_test, pred_v4, label = 'SVM V4', target_names = ['VIVER', 'MORRER'])

AUC: 0.694743

Acurácia: 0.733333
              precision    recall  f1-score   support

       VIVER       0.80      0.80      0.80        61
      MORRER       0.59      0.59      0.59        29

    accuracy                           0.73        90
   macro avg       0.69      0.69      0.69        90
weighted avg       0.73      0.73      0.73        90


def treina_GridSearchCV(modelo, params_, x_treino, y_treino, x_teste, y_teste,\
                        n_jobs = 20, cv = 5, refit = True, title = 'SVM', scoring = None, salvar_resultados = False,\
                       report_treino = False):
    grid = GridSearchCV(modelo, params_, n_jobs = n_jobs, cv = cv, refit = refit, scoring = scoring)
    
    grid.fit(x_treino, y_treino)
    pred = grid.predict(x_teste)
    modelo_ = grid.best_estimator_
    
    print(grid.best_params_)
    
    target_names = ['VIVER', 'MORRER']
    
    print('Report Para Dados de Teste')
    
    report_modelo(modelo_, y_teste, pred, label = title, target_names = target_names)
    
    if report_treino:
        print('Report Para Dados de Treino')
        pred_treino = grid.predict(x_treino)
        
        # Acurácia
        print("Acurácia: %f" % accuracy_score(y_treino, pred_treino))
          
        # Classification Report
        print(classification_report(y_treino, pred_treino, target_names= target_names))    
    
    if salvar_resultados:
        resultados_df = pd.DataFrame(grid.cv_results_)
        
        return resultados_df


# Criação do modelo intenso 05

params = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': [0.9, 1.0, 1.1],
    'gamma': ['scale', 'auto'],
    'class_weight': ['balanced'],
    'random_state': [seed_]
}


%%time
treina_GridSearchCV(SVC(), params, x_train_resample, y_train_resample, x_test, y_test, title = 'SVM V5', cv = 10)

{'C': 1.1, 'class_weight': 'balanced', 'gamma': 'scale', 'kernel': 'linear', 'random_state': 194}
Report Para Dados de Teste

AUC: 0.677501

Acurácia: 0.722222
              precision    recall  f1-score   support

       VIVER       0.79      0.80      0.80        61
      MORRER       0.57      0.55      0.56        29

    accuracy                           0.72        90
   macro avg       0.68      0.68      0.68        90
weighted avg       0.72      0.72      0.72        90

Wall time: 2.22 s


# Criação do modelo intenso 06

params = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': [0.9, 1.0, 1.1],
    'gamma': ['scale', 'auto', 0.1, 1, 10],
    'class_weight': ['balanced', {0:1, 1:5}, {0:1, 1:10}],
    'random_state': [seed_]
}


%%time
treina_GridSearchCV(SVC(), params, x_train_resample, y_train_resample, x_test, y_test, title = 'SVM V6')

{'C': 0.9, 'class_weight': 'balanced', 'gamma': 'scale', 'kernel': 'linear', 'random_state': 194}
Report Para Dados de Teste

AUC: 0.694743

Acurácia: 0.733333
              precision    recall  f1-score   support

       VIVER       0.80      0.80      0.80        61
      MORRER       0.59      0.59      0.59        29

    accuracy                           0.73        90
   macro avg       0.69      0.69      0.69        90
weighted avg       0.73      0.73      0.73        90

Wall time: 2min 12s


%%time

# Criação do modelo 07
modelo_svm_v7 = SVC(C = 1.1, class_weight = 'balanced', gamma = 'scale', kernel = 'rbf', random_state = seed_)

# Treinamento
modelo_svm_v7.fit(x_train_resample, y_train_resample)

# Previsão
pred_v7 = modelo_svm_v7.predict(x_test)

# Report Geral
report_modelo(modelo_svm_v7, y_test, pred_v7, label = 'SVM V7', target_names = ['VIVER', 'MORRER'])

AUC: 0.652063

Acurácia: 0.700000
              precision    recall  f1-score   support

       VIVER       0.77      0.79      0.78        61
      MORRER       0.54      0.52      0.53        29

    accuracy                           0.70        90
   macro avg       0.65      0.65      0.65        90
weighted avg       0.70      0.70      0.70        90

Wall time: 253 ms


%%time

# Criação do modelo 08
modelo_svm_v8 = SVC(C = 1000, class_weight = {0:1, 1:10}, gamma = 0.01, kernel = 'rbf', random_state = seed_)

# Treinamento
modelo_svm_v8.fit(x_train_resample, y_train_resample)

# Previsão
pred_v8 = modelo_svm_v8.predict(x_test)

# Report Geral
report_modelo(modelo_svm_v8, y_test, pred_v8, label = 'SVM V8', target_names = ['VIVER', 'MORRER'])

AUC: 0.561334

Acurácia: 0.466667
              precision    recall  f1-score   support

       VIVER       0.78      0.30      0.43        61
      MORRER       0.36      0.83      0.50        29

    accuracy                           0.47        90
   macro avg       0.57      0.56      0.46        90
weighted avg       0.65      0.47      0.45        90

Wall time: 273 ms


class_weights = class_weight.compute_class_weight('balanced', np.unique(y_train_resample), y_train_resample)


class_weights

array([1., 1.])


%%time

# Criação do modelo 09
modelo_svm_v9 = SVC(C = 1000, class_weight = {0: 1, 1: 1}, gamma = 0.01, kernel = 'rbf', random_state = seed_)

# Treinamento
modelo_svm_v9.fit(x_train_resample, y_train_resample)

# Previsão
pred_v9 = modelo_svm_v9.predict(x_test)

# Report Geral
report_modelo(modelo_svm_v9, y_test, pred_v9, label = 'SVM V9', target_names = ['VIVER', 'MORRER'])

AUC: 0.669305

Acurácia: 0.711111
              precision    recall  f1-score   support

       VIVER       0.79      0.79      0.79        61
      MORRER       0.55      0.55      0.55        29

    accuracy                           0.71        90
   macro avg       0.67      0.67      0.67        90
weighted avg       0.71      0.71      0.71        90

Wall time: 249 ms


# Criação do modelo intenso 10

params = {
    'kernel': ['rbf'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 1, 10, 100],
    'class_weight': ['balanced'],
    'random_state': [seed_]
}


%%time
treina_GridSearchCV(SVC(), params, x_train_resample, y_train_resample, x_test, y_test,\
                    title = 'SVM V10', scoring = 'top_k_accuracy')

{'C': 0.001, 'class_weight': 'balanced', 'gamma': 'scale', 'kernel': 'rbf', 'random_state': 194}
Report Para Dados de Teste

AUC: 0.703787

Acurácia: 0.733333
              precision    recall  f1-score   support

       VIVER       0.81      0.79      0.80        61
      MORRER       0.58      0.62      0.60        29

    accuracy                           0.73        90
   macro avg       0.70      0.70      0.70        90
weighted avg       0.74      0.73      0.74        90

Wall time: 360 ms


# Criação do modelo intenso 11

params = {
    'kernel': ['rbf'],
    'C': [0.0001, 0.001, 0.01, 0.1, 1, 10],
    'gamma': ['scale', 'auto', 1, 10, 100],
    'class_weight': ['balanced'],
    'random_state': [seed_]
}


%%time
resultados = treina_GridSearchCV(SVC(), params, x_train_resample, y_train_resample, x_test, y_test,\
                    title = 'SVM V11', scoring = 'top_k_accuracy', salvar_resultados = True)

{'C': 0.0001, 'class_weight': 'balanced', 'gamma': 'scale', 'kernel': 'rbf', 'random_state': 194}
Report Para Dados de Teste

AUC: 0.703787

Acurácia: 0.733333
              precision    recall  f1-score   support

       VIVER       0.81      0.79      0.80        61
      MORRER       0.58      0.62      0.60        29

    accuracy                           0.73        90
   macro avg       0.70      0.70      0.70        90
weighted avg       0.74      0.73      0.74        90

Wall time: 357 ms


resultados[['param_C', 'param_class_weight', 'param_gamma', 'param_kernel',\
            'mean_test_score', 'std_test_score', 'rank_test_score']]


%%time

# Criação do modelo 12
modelo_svm_v12 = SVC(C = 100, class_weight = {0: 1, 1: 1.3}, gamma = 0.0001, kernel = 'rbf', random_state = seed_)

# Treinamento
modelo_svm_v12.fit(x_train_resample, y_train_resample)

# Previsão
pred_v12 = modelo_svm_v12.predict(x_test)

# Report Geral
report_modelo(modelo_svm_v12, y_test, pred_v12, label = 'SVM V12', target_names = ['VIVER', 'MORRER'])

print('Report Para Dados de Treino\n')

pred_treino = modelo_svm_v12.predict(x_train_resample)

print("Acurácia: %f" % accuracy_score(y_train_resample, pred_treino))

# Classification Report
print(classification_report(y_train_resample, pred_treino, target_names= ['VIVER', 'MORRER']))

AUC: 0.713680

Acurácia: 0.722222
              precision    recall  f1-score   support

       VIVER       0.83      0.74      0.78        61
      MORRER       0.56      0.69      0.62        29

    accuracy                           0.72        90
   macro avg       0.69      0.71      0.70        90
weighted avg       0.74      0.72      0.73        90

Report Para Dados de Treino

Acurácia: 0.725352
              precision    recall  f1-score   support

       VIVER       0.78      0.63      0.70       142
      MORRER       0.69      0.82      0.75       142

    accuracy                           0.73       284
   macro avg       0.73      0.73      0.72       284
weighted avg       0.73      0.73      0.72       284

Wall time: 261 ms


# Criação do modelo intenso 12

params = {
    'kernel': ['rbf'],
    'C': [0.0001, 0.001, 0.01, 0.1, 1, 10],
    'gamma': [0.001, 0.01, 1, 10, 100],
    'class_weight': ['balanced'],
    'random_state': [seed_]
}


%%time
resultados = treina_GridSearchCV(SVC(), params, x_train_resample, y_train_resample, x_test, y_test,\
                    title = 'SVM V12', scoring = 'top_k_accuracy', salvar_resultados = True, report_treino = True)

{'C': 0.0001, 'class_weight': 'balanced', 'gamma': 0.001, 'kernel': 'rbf', 'random_state': 194}
Report Para Dados de Teste

AUC: 0.633126

Acurácia: 0.711111
              precision    recall  f1-score   support

       VIVER       0.75      0.85      0.80        61
      MORRER       0.57      0.41      0.48        29

    accuracy                           0.71        90
   macro avg       0.66      0.63      0.64        90
weighted avg       0.69      0.71      0.70        90

Report Para Dados de Treino
Acurácia: 0.700704
              precision    recall  f1-score   support

       VIVER       0.65      0.87      0.74       142
      MORRER       0.81      0.53      0.64       142

    accuracy                           0.70       284
   macro avg       0.73      0.70      0.69       284
weighted avg       0.73      0.70      0.69       284

Wall time: 377 ms


resultados[['param_C', 'param_class_weight', 'param_gamma', 'param_kernel',\
            'mean_test_score', 'std_test_score', 'rank_test_score']]


# Transformando os dados em DMatrix pois o XGBoost exige
dtrain_nothing = xgb.DMatrix(x_train_nothing, label = y_train_nothing)
dtest_nothing = xgb.DMatrix(x_test_nothing, label = y_test_nothing)


# Definindo parametros e configurações
param = {}


# Criação do modelo base v1
# Criando base sem padronização, balanceamento e feature selecting
modelo_xgb_v1 = xgb.train(params = param, dtrain = dtrain_nothing)

pred_v1 = modelo_xgb_v1.predict(dtest_nothing)

report_modelo(modelo_xgb_v1, y_test_nothing, pred_v1, label = 'XGB V1', target_names = ['VIVER', 'MORRER'])

AUC: 0.593838

Acurácia: 0.633333
              precision    recall  f1-score   support

       VIVER       0.74      0.70      0.72        61
      MORRER       0.44      0.48      0.46        29

    accuracy                           0.63        90
   macro avg       0.59      0.59      0.59        90
weighted avg       0.64      0.63      0.64        90


# Transformando os dados em DMatrix pois o XGBoost exige
dtrain_sc = xgb.DMatrix(x_train_sc, label = y_train_nothing)
dtest_sc = xgb.DMatrix(x_test_sc, label = y_test_nothing)

# Criação do modelo base v2
# Criando base sem balanceamento e feature selecting. Com padronização
modelo_xgb_v2 = xgb.train(params = param, dtrain = dtrain_sc)

pred_v2 = modelo_xgb_v2.predict(dtest_sc)

report_modelo(modelo_xgb_v2, y_test_nothing, pred_v2, label = 'XGB V2', target_names = ['VIVER', 'MORRER'])

AUC: 0.602035

Acurácia: 0.644444
              precision    recall  f1-score   support

       VIVER       0.75      0.72      0.73        61
      MORRER       0.45      0.48      0.47        29

    accuracy                           0.64        90
   macro avg       0.60      0.60      0.60        90
weighted avg       0.65      0.64      0.65        90


# Transformando os dados em DMatrix pois o XGBoost exige
dtrain_sc_resample = xgb.DMatrix(x_train_sc_resample, label = y_train_resample_2)
dtest_sc = xgb.DMatrix(x_test_sc, label = y_test_nothing)

# Criação do modelo base v3
# Criando sem feature selecting. Com padronização e balanceamento
modelo_xgb_v3 = xgb.train(params = param, dtrain = dtrain_sc_resample)

pred_v3 = modelo_xgb_v3.predict(dtest_sc)

report_modelo(modelo_xgb_v3, y_test_nothing, pred_v3, label = 'XGB V3', target_names = ['VIVER', 'MORRER'])

AUC: 0.696439

Acurácia: 0.711111
              precision    recall  f1-score   support

       VIVER       0.82      0.74      0.78        61
      MORRER       0.54      0.66      0.59        29

    accuracy                           0.71        90
   macro avg       0.68      0.70      0.68        90
weighted avg       0.73      0.71      0.72        90


# Transformando os dados em DMatrix pois o XGBoost exige
dtrain = xgb.DMatrix(x_train_resample, label = y_train_resample)
dtest = xgb.DMatrix(x_test, label = y_test)

# Criação do modelo base v4
# Criando com padronização, balanceamento e feature selecting
modelo_xgb_v4 = xgb.train(params = param, dtrain = dtrain)

pred_v4 = modelo_xgb_v4.predict(dtest)

report_modelo(modelo_xgb_v4, y_test_nothing, pred_v4, label = 'XGB V4', target_names = ['VIVER', 'MORRER'])

AUC: 0.659412

Acurácia: 0.722222
              precision    recall  f1-score   support

       VIVER       0.77      0.84      0.80        61
      MORRER       0.58      0.48      0.53        29

    accuracy                           0.72        90
   macro avg       0.68      0.66      0.67        90
weighted avg       0.71      0.72      0.71        90


params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5],
        'learning_rate': [0.3, 0.2, 0.1, 0.05],
        'nthread': [2]
        }


%%time
# Criação do modelo v5
# Criando sem feature selecting. Com padronização e balanceamento

treina_GridSearchCV(xgb.XGBClassifier(use_label_encoder = False), params, x_train_sc_resample, y_train_resample_2,\
                                 x_test_sc, y_test_nothing, title = 'XGB V05', report_treino = True)

[22:29:14] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
{'colsample_bytree': 0.8, 'gamma': 0.5, 'learning_rate': 0.2, 'max_depth': 4, 'min_child_weight': 1, 'nthread': 2, 'subsample': 1.0}
Report Para Dados de Teste

AUC: 0.739966

Acurácia: 0.733333
              precision    recall  f1-score   support

       VIVER       0.86      0.72      0.79        61
      MORRER       0.56      0.76      0.65        29

    accuracy                           0.73        90
   macro avg       0.71      0.74      0.72        90
weighted avg       0.77      0.73      0.74        90

Report Para Dados de Treino
Acurácia: 1.000000
              precision    recall  f1-score   support

       VIVER       1.00      1.00      1.00       142
      MORRER       1.00      1.00      1.00       142

    accuracy                           1.00       284
   macro avg       1.00      1.00      1.00       284
weighted avg       1.00      1.00      1.00       284

Wall time: 22.3 s


params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5],
        'learning_rate': [0.3, 0.2, 0.1, 0.05],
        'nthread': [2]
        }


%%time
# Criação do modelo v6
# Criando com padronização, balanceamento e feature selecting

treina_GridSearchCV(xgb.XGBClassifier(use_label_encoder = False), params, x_train_resample, y_train_resample,\
                                 x_test, y_test, title = 'XGB V06', report_treino = True)

[22:29:34] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
{'colsample_bytree': 1.0, 'gamma': 2, 'learning_rate': 0.3, 'max_depth': 4, 'min_child_weight': 1, 'nthread': 2, 'subsample': 0.8}
Report Para Dados de Teste

AUC: 0.702940

Acurácia: 0.744444
              precision    recall  f1-score   support

       VIVER       0.81      0.82      0.81        61
      MORRER       0.61      0.59      0.60        29

    accuracy                           0.74        90
   macro avg       0.71      0.70      0.70        90
weighted avg       0.74      0.74      0.74        90

Report Para Dados de Treino
Acurácia: 0.904930
              precision    recall  f1-score   support

       VIVER       0.90      0.92      0.91       142
      MORRER       0.91      0.89      0.90       142

    accuracy                           0.90       284
   macro avg       0.91      0.90      0.90       284
weighted avg       0.91      0.90      0.90       284

Wall time: 19.9 s


%%time

params = {
        'min_child_weight': [1, 2, 3],
        'gamma': [1, 1.5, 2, 3],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [2, 3, 4, 5],
        'learning_rate': [0.4, 0.3, 0.2, 0.1, 0.05],
        'nthread': [2]
        }

# Criação do modelo v7
# Criando com padronização, balanceamento e feature selecting
treina_GridSearchCV(xgb.XGBClassifier(use_label_encoder = False), params, x_train_resample, y_train_resample,\
                                 x_test, y_test, title = 'XGB V07', report_treino = True)

[22:30:02] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
{'colsample_bytree': 1.0, 'gamma': 2, 'learning_rate': 0.3, 'max_depth': 4, 'min_child_weight': 1, 'nthread': 2, 'subsample': 0.8}
Report Para Dados de Teste

AUC: 0.702940

Acurácia: 0.744444
              precision    recall  f1-score   support

       VIVER       0.81      0.82      0.81        61
      MORRER       0.61      0.59      0.60        29

    accuracy                           0.74        90
   macro avg       0.71      0.70      0.70        90
weighted avg       0.74      0.74      0.74        90

Report Para Dados de Treino
Acurácia: 0.904930
              precision    recall  f1-score   support

       VIVER       0.90      0.92      0.91       142
      MORRER       0.91      0.89      0.90       142

    accuracy                           0.90       284
   macro avg       0.91      0.90      0.90       284
weighted avg       0.91      0.90      0.90       284

Wall time: 27.8 s


%%time

params = {
        'min_child_weight': [1, 2, 3, 4, 5, 6],
        'gamma': [0.5, 0.7, 1, 1.5, 2, 3],
        'subsample': [0.2, 0.4, 0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0, 1.2],
        'max_depth': [5, 6, 7, 8, 9],
        'learning_rate': [0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0.05],
        'nthread': [2]
        }

# Criação do modelo v8
# Criando com padronização, balanceamento e feature selecting
modelo_grid = xgb.XGBClassifier(use_label_encoder = False, early_stopping_rounds = 10, num_boost_round = 999)
treina_GridSearchCV(modelo_grid, params, x_train_resample, y_train_resample,\
                                 x_test, y_test, title = 'XGB V08', report_treino = True)

[22:34:46] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:573: 
Parameters: { "early_stopping_rounds", "num_boost_round" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[22:34:46] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
{'colsample_bytree': 1.0, 'gamma': 1.5, 'learning_rate': 0.5, 'max_depth': 6, 'min_child_weight': 3, 'nthread': 2, 'subsample': 0.8}
Report Para Dados de Teste

One or more of the test scores are non-finite: [0.70789474 0.72537594 0.76409774 ...        nan        nan        nan]

AUC: 0.703787

Acurácia: 0.733333
              precision    recall  f1-score   support

       VIVER       0.81      0.79      0.80        61
      MORRER       0.58      0.62      0.60        29

    accuracy                           0.73        90
   macro avg       0.70      0.70      0.70        90
weighted avg       0.74      0.73      0.74        90

Report Para Dados de Treino
Acurácia: 0.897887
              precision    recall  f1-score   support

       VIVER       0.88      0.92      0.90       142
      MORRER       0.91      0.88      0.90       142

    accuracy                           0.90       284
   macro avg       0.90      0.90      0.90       284
weighted avg       0.90      0.90      0.90       284

Wall time: 4min 43s


%%time

params = {
        'min_child_weight': [2, 3, 4, 5],
        'gamma': [0.7, 1.3, 1.5, 1.7],
        'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
        'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],    
        'max_depth': [5, 6, 7],
        'learning_rate': [0.5, 0.1, 0.01],
        'nthread': [2],
        'n_estimators' : [500]
        }

# Criação do modelo v9
# Criando com padronização, balanceamento e feature selecting
modelo_grid = xgb.XGBClassifier(use_label_encoder = False)

resultados = treina_GridSearchCV(modelo_grid, params, x_train_resample, y_train_resample,\
                                 x_test, y_test, title = 'XGB V09', report_treino = True,\
                                 salvar_resultados = True)

[22:38:16] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
{'colsample_bytree': 1.0, 'gamma': 1.5, 'learning_rate': 0.5, 'max_depth': 6, 'min_child_weight': 3, 'n_estimators': 500, 'nthread': 2, 'subsample': 0.8}
Report Para Dados de Teste

AUC: 0.702940

Acurácia: 0.744444
              precision    recall  f1-score   support

       VIVER       0.81      0.82      0.81        61
      MORRER       0.61      0.59      0.60        29

    accuracy                           0.74        90
   macro avg       0.71      0.70      0.70        90
weighted avg       0.74      0.74      0.74        90

Report Para Dados de Treino
Acurácia: 0.901408
              precision    recall  f1-score   support

       VIVER       0.88      0.93      0.90       142
      MORRER       0.93      0.87      0.90       142

    accuracy                           0.90       284
   macro avg       0.90      0.90      0.90       284
weighted avg       0.90      0.90      0.90       284

Wall time: 3min 30s


resultados


%%time

params = {
        'min_child_weight': [2, 3, 4, 5],
        'gamma': [0.7, 1.3, 1.4, 1.5],
        'subsample': [0.6, 0.8, 0.9, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0],    
        'max_depth': [5, 6, 7],
        'learning_rate': [0.5, 0.1, 0.01],
        'lambda': [0.9, 1, 1.1],
        'alpha': [0, 0.1],
        'nthread': [2],
        'n_estimators' : [500]
        }

# Criação do modelo v10
# Criando com padronização, balanceamento e feature selecting
modelo_grid = xgb.XGBClassifier(use_label_encoder = False)

resultados = treina_GridSearchCV(modelo_grid, params, x_train_resample, y_train_resample,\
                                 x_test, y_test, title = 'XGB V10', report_treino = True,\
                                 salvar_resultados = True)

[22:48:31] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
{'alpha': 0, 'colsample_bytree': 1.0, 'gamma': 1.5, 'lambda': 1, 'learning_rate': 0.5, 'max_depth': 6, 'min_child_weight': 3, 'n_estimators': 500, 'nthread': 2, 'subsample': 0.8}
Report Para Dados de Teste

AUC: 0.702940

Acurácia: 0.744444
              precision    recall  f1-score   support

       VIVER       0.81      0.82      0.81        61
      MORRER       0.61      0.59      0.60        29

    accuracy                           0.74        90
   macro avg       0.71      0.70      0.70        90
weighted avg       0.74      0.74      0.74        90

Report Para Dados de Treino
Acurácia: 0.901408
              precision    recall  f1-score   support

       VIVER       0.88      0.93      0.90       142
      MORRER       0.93      0.87      0.90       142

    accuracy                           0.90       284
   macro avg       0.90      0.90      0.90       284
weighted avg       0.90      0.90      0.90       284

Wall time: 10min 15s


%%time

params = {
        'min_child_weight': [1, 2, 3, 4, 5],
        'gamma': [0, 0.1, 0.2, 0.3],
        'subsample': [0.6, 0.8, 0.9, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0],    
        'max_depth': [5, 6, 7],
        'learning_rate': [0.5, 0.1, 0.01],
        'nthread': [2],
        'n_estimators' : [500]
        }

# Criação do modelo v11
# Criando com padronização, balanceamento e feature selecting
modelo_grid = xgb.XGBClassifier(use_label_encoder = False)

resultados = treina_GridSearchCV(modelo_grid, params, x_train_resample, y_train_resample,\
                                 x_test, y_test, title = 'XGB V11', report_treino = True,\
                                 salvar_resultados = True)

[22:50:39] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
{'colsample_bytree': 0.9, 'gamma': 0.3, 'learning_rate': 0.5, 'max_depth': 6, 'min_child_weight': 2, 'n_estimators': 500, 'nthread': 2, 'subsample': 0.8}
Report Para Dados de Teste

AUC: 0.677501

Acurácia: 0.722222
              precision    recall  f1-score   support

       VIVER       0.79      0.80      0.80        61
      MORRER       0.57      0.55      0.56        29

    accuracy                           0.72        90
   macro avg       0.68      0.68      0.68        90
weighted avg       0.72      0.72      0.72        90

Report Para Dados de Treino
Acurácia: 0.964789
              precision    recall  f1-score   support

       VIVER       0.96      0.97      0.97       142
      MORRER       0.97      0.96      0.96       142

    accuracy                           0.96       284
   macro avg       0.96      0.96      0.96       284
weighted avg       0.96      0.96      0.96       284

Wall time: 2min 8s


%%time

params = {
        'min_child_weight': [1, 2, 3, 4],
        'gamma': [0.7, 1.3, 1.4, 1.5],
        'subsample': [0.6, 0.8, 0.9, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0],
        'colsample_bylevel': [0.8, 0.9, 1.0],
        'max_depth': [5, 6, 7],
        'learning_rate': [0.5, 0.1, 0.01],
        'nthread': [2],
        'n_estimators' : [500]
        }

# Criação do modelo v12
# Criando com padronização, balanceamento e feature selecting
modelo_grid = xgb.XGBClassifier(use_label_encoder = False)

resultados = treina_GridSearchCV(modelo_grid, params, x_train_resample, y_train_resample,\
                                 x_test, y_test, title = 'XGB V12', report_treino = True,\
                                 salvar_resultados = True)

[22:56:21] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
{'colsample_bylevel': 1.0, 'colsample_bytree': 1.0, 'gamma': 1.5, 'learning_rate': 0.5, 'max_depth': 6, 'min_child_weight': 3, 'n_estimators': 500, 'nthread': 2, 'subsample': 0.8}
Report Para Dados de Teste

AUC: 0.702940

Acurácia: 0.744444
              precision    recall  f1-score   support

       VIVER       0.81      0.82      0.81        61
      MORRER       0.61      0.59      0.60        29

    accuracy                           0.74        90
   macro avg       0.71      0.70      0.70        90
weighted avg       0.74      0.74      0.74        90

Report Para Dados de Treino
Acurácia: 0.901408
              precision    recall  f1-score   support

       VIVER       0.88      0.93      0.90       142
      MORRER       0.93      0.87      0.90       142

    accuracy                           0.90       284
   macro avg       0.90      0.90      0.90       284
weighted avg       0.90      0.90      0.90       284

Wall time: 5min 42s


%%time

params = {
        'min_child_weight': [1, 2, 3, 4],
        'gamma': [0.7, 1.3, 1.4, 1.5],
        'subsample': [0.6, 0.8, 0.9, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0],
        'colsample_bynode': [0.8, 0.9, 1.0],
        'max_depth': [5, 6, 7],
        'learning_rate': [0.5, 0.1, 0.01],
        'nthread': [2],
        'n_estimators' : [500]
        }

# Criação do modelo v13
# Criando com padronização, balanceamento e feature selecting
modelo_grid = xgb.XGBClassifier(use_label_encoder = False)

resultados = treina_GridSearchCV(modelo_grid, params, x_train_resample, y_train_resample,\
                                 x_test, y_test, title = 'XGB V13', report_treino = True,\
                                 salvar_resultados = True)

[23:02:01] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
{'colsample_bynode': 1.0, 'colsample_bytree': 1.0, 'gamma': 1.5, 'learning_rate': 0.5, 'max_depth': 6, 'min_child_weight': 3, 'n_estimators': 500, 'nthread': 2, 'subsample': 0.8}
Report Para Dados de Teste

AUC: 0.702940

Acurácia: 0.744444
              precision    recall  f1-score   support

       VIVER       0.81      0.82      0.81        61
      MORRER       0.61      0.59      0.60        29

    accuracy                           0.74        90
   macro avg       0.71      0.70      0.70        90
weighted avg       0.74      0.74      0.74        90

Report Para Dados de Treino
Acurácia: 0.901408
              precision    recall  f1-score   support

       VIVER       0.88      0.93      0.90       142
      MORRER       0.93      0.87      0.90       142

    accuracy                           0.90       284
   macro avg       0.90      0.90      0.90       284
weighted avg       0.90      0.90      0.90       284

Wall time: 5min 39s


#Transformando os dados em DMatrix pois o XGBoost exige
dtrain = xgb.DMatrix(x_train_resample, label = y_train_resample)
dtest = xgb.DMatrix(x_test, label = y_test)

params = {'colsample_bynode': 0.3, 'colsample_bytree': 1.0, 'gamma': 1.5, 'learning_rate': 0.5, 'max_depth': 10,\
          'min_child_weight': 1, 'n_estimators': 1000, 'nthread': 2, 'subsample': 0.8}

# Criação do modelo base v14
# Criando com padronização, balanceamento e feature selecting
modelo_xgb_v14 = xgb.train(params = params, dtrain = dtrain)

pred_v14 = modelo_xgb_v14.predict(dtest)

report_modelo(modelo_xgb_v14, y_test_nothing, pred_v14, label = 'XGB V14', target_names = ['VIVER', 'MORRER'])

[23:02:02] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:573: 
Parameters: { "n_estimators" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.

AUC: 0.695591

Acurácia: 0.722222
              precision    recall  f1-score   support

       VIVER       0.81      0.77      0.79        61
      MORRER       0.56      0.62      0.59        29

    accuracy                           0.72        90
   macro avg       0.69      0.70      0.69        90
weighted avg       0.73      0.72      0.73        90


#Transformando os dados em DMatrix pois o XGBoost exige
dtrain = xgb.DMatrix(x_train_resample, label = y_train_resample)
dtest = xgb.DMatrix(x_test, label = y_test)

params = {'colsample_bynode': 0.3, 'colsample_bytree': 1.0, 'gamma': 2.3, 'learning_rate': 0.5, 'max_depth': 12,\
          'min_child_weight': 1, 'n_estimators': 500, 'nthread': 2, 'subsample': 0.8}

# Criação do modelo base v15
# Criando com padronização, balanceamento e feature selecting
modelo_xgb_v15 = xgb.train(params = params, dtrain = dtrain)

pred_v15 = modelo_xgb_v15.predict(dtest)

report_modelo(modelo_xgb_v15, y_test_nothing, pred_v15, label = 'XGB V15', target_names = ['VIVER', 'MORRER'])

[23:02:02] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:573: 
Parameters: { "n_estimators" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.

AUC: 0.696439

Acurácia: 0.711111
              precision    recall  f1-score   support

       VIVER       0.82      0.74      0.78        61
      MORRER       0.54      0.66      0.59        29

    accuracy                           0.71        90
   macro avg       0.68      0.70      0.68        90
weighted avg       0.73      0.71      0.72        90


%%time

#Transformando os dados em DMatrix pois o XGBoost exige
dtrain_sc_resample = xgb.DMatrix(x_train_sc_resample, label = y_train_resample_2)
dtest_sc = xgb.DMatrix(x_test_sc, label = y_test_nothing)

params = {'colsample_bynode': 0.6, 'colsample_bytree': 0.3, 'gamma': 2.3, 'learning_rate': 0.5, 'max_depth': 12,\
          'min_child_weight': 1, 'n_estimators': 500, 'nthread': 2, 'subsample': 0.8}

# Criação do modelo base v16
# Criando sem feature selecting. Com padronização e balanceamento
modelo_xgb_v16 = xgb.train(params = params, dtrain = dtrain_sc_resample)

pred_v16 = modelo_xgb_v16.predict(dtest_sc)

report_modelo(modelo_xgb_v16, y_test_nothing, pred_v16, label = 'XGB V16', target_names = ['VIVER', 'MORRER'])

print('Report Para Dados de Treino')
pred_treino = modelo_xgb_v16.predict(dtrain_sc_resample)
pred_treino = pred_treino > 0.5

# Acurácia
print("Acurácia: %f" % accuracy_score(y_train_resample_2, pred_treino))

# Classification Report
print(classification_report(y_train_resample_2, pred_treino, target_names= ['VIVER', 'MORRER']))

[23:02:02] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:573: 
Parameters: { "n_estimators" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.

AUC: 0.764556

Acurácia: 0.766667
              precision    recall  f1-score   support

       VIVER       0.87      0.77      0.82        61
      MORRER       0.61      0.76      0.68        29

    accuracy                           0.77        90
   macro avg       0.74      0.76      0.75        90
weighted avg       0.79      0.77      0.77        90

Report Para Dados de Treino
Acurácia: 0.728873
              precision    recall  f1-score   support

       VIVER       0.70      0.80      0.75       142
      MORRER       0.76      0.66      0.71       142

    accuracy                           0.73       284
   macro avg       0.73      0.73      0.73       284
weighted avg       0.73      0.73      0.73       284

Wall time: 265 ms


%%time

#Transformando os dados em DMatrix pois o XGBoost exige
dtrain_sc_resample = xgb.DMatrix(x_train_sc_resample, label = y_train_resample_2)
dtest_sc = xgb.DMatrix(x_test_sc, label = y_test_nothing)

params = {'colsample_bynode': 0.6, 'colsample_bytree': 0.5, 'gamma': 2.3, 'learning_rate': 0.5, 'max_depth': 12,\
          'min_child_weight': 1, 'n_estimators': 500, 'nthread': 2, 'subsample': 0.8}

# Criação do modelo base v17
# Criando sem feature selecting. Com padronização e balanceamento
modelo_xgb_v17 = xgb.train(params = params, dtrain = dtrain_sc_resample)

pred_v17 = modelo_xgb_v17.predict(dtest_sc)

report_modelo(modelo_xgb_v17, y_test_nothing, pred_v17, label = 'XGB V17', target_names = ['VIVER', 'MORRER'])

print('Report Para Dados de Treino')
pred_treino = modelo_xgb_v17.predict(dtrain_sc_resample)
pred_treino = pred_treino > 0.5

# Acurácia
print("Acurácia: %f" % accuracy_score(y_train_resample_2, pred_treino))

# Classification Report
print(classification_report(y_train_resample_2, pred_treino, target_names= ['VIVER', 'MORRER']))

[23:02:03] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:573: 
Parameters: { "n_estimators" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.

AUC: 0.806388

Acurácia: 0.811111
              precision    recall  f1-score   support

       VIVER       0.89      0.82      0.85        61
      MORRER       0.68      0.79      0.73        29

    accuracy                           0.81        90
   macro avg       0.78      0.81      0.79        90
weighted avg       0.82      0.81      0.81        90

Report Para Dados de Treino
Acurácia: 0.781690
              precision    recall  f1-score   support

       VIVER       0.77      0.81      0.79       142
      MORRER       0.80      0.75      0.78       142

    accuracy                           0.78       284
   macro avg       0.78      0.78      0.78       284
weighted avg       0.78      0.78      0.78       284

Wall time: 273 ms


%%time

#Transformando os dados em DMatrix pois o XGBoost exige
dtrain_sc_resample = xgb.DMatrix(x_train_sc_resample, label = y_train_resample_2)
dtest_sc = xgb.DMatrix(x_test_sc, label = y_test_nothing)

params = {'colsample_bynode': 0.6, 'colsample_bytree': 0.5, 'gamma': 2.3, 'learning_rate': 0.5, 'max_depth': 12,\
          'min_child_weight': 1, 'nthread': 2, 'subsample': 0.789}

# Criação do modelo base v18
# Criando sem feature selecting. Com padronização e balanceamento
modelo_xgb_v18 = xgb.train(params = params, dtrain = dtrain_sc_resample)

pred_v18 = modelo_xgb_v18.predict(dtest_sc)

report_modelo(modelo_xgb_v18, y_test_nothing, pred_v18, label = 'XGB V18', target_names = ['VIVER', 'MORRER'])

print('Report Para Dados de Treino')
pred_treino = modelo_xgb_v18.predict(dtrain_sc_resample)
pred_treino = pred_treino > 0.5

# Acurácia
print("Acurácia: %f" % accuracy_score(y_train_resample_2, pred_treino))

# Classification Report
print(classification_report(y_train_resample_2, pred_treino, target_names= ['VIVER', 'MORRER']))

AUC: 0.782646

Acurácia: 0.766667
              precision    recall  f1-score   support

       VIVER       0.90      0.74      0.81        61
      MORRER       0.60      0.83      0.70        29

    accuracy                           0.77        90
   macro avg       0.75      0.78      0.75        90
weighted avg       0.80      0.77      0.77        90

Report Para Dados de Treino
Acurácia: 0.750000
              precision    recall  f1-score   support

       VIVER       0.77      0.71      0.74       142
      MORRER       0.73      0.79      0.76       142

    accuracy                           0.75       284
   macro avg       0.75      0.75      0.75       284
weighted avg       0.75      0.75      0.75       284

Wall time: 268 ms


%%time

#Transformando os dados em DMatrix pois o XGBoost exige
dtrain_sc_resample = xgb.DMatrix(x_train_sc_resample, label = y_train_resample_2)
dtest_sc = xgb.DMatrix(x_test_sc, label = y_test_nothing)

params = {'colsample_bynode': 0.6, 'colsample_bytree': 0.5, 'gamma': 2.2, 'learning_rate': 0.5, 'max_depth': 12,\
          'min_child_weight': 1, 'nthread': 2, 'subsample': 0.789}

# Criação do modelo base v19
# Criando sem feature selecting. Com padronização e balanceamento
modelo_xgb_v19 = xgb.train(params = params, dtrain = dtrain_sc_resample)

pred_v19 = modelo_xgb_v19.predict(dtest_sc)

report_modelo(modelo_xgb_v19, y_test_nothing, pred_v19, label = 'XGB V19', target_names = ['VIVER', 'MORRER'])

print('Report Para Dados de Treino')
pred_treino = modelo_xgb_v19.predict(dtrain_sc_resample)
pred_treino = pred_treino > 0.5

# Acurácia
print("Acurácia: %f" % accuracy_score(y_train_resample_2, pred_treino))

# Classification Report
print(classification_report(y_train_resample_2, pred_treino, target_names= ['VIVER', 'MORRER']))

AUC: 0.782646

Acurácia: 0.766667
              precision    recall  f1-score   support

       VIVER       0.90      0.74      0.81        61
      MORRER       0.60      0.83      0.70        29

    accuracy                           0.77        90
   macro avg       0.75      0.78      0.75        90
weighted avg       0.80      0.77      0.77        90

Report Para Dados de Treino
Acurácia: 0.750000
              precision    recall  f1-score   support

       VIVER       0.77      0.71      0.74       142
      MORRER       0.73      0.79      0.76       142

    accuracy                           0.75       284
   macro avg       0.75      0.75      0.75       284
weighted avg       0.75      0.75      0.75       284

Wall time: 269 ms


%%time

#Transformando os dados em DMatrix pois o XGBoost exige
dtrain_sc_resample = xgb.DMatrix(x_train_sc_resample, label = y_train_resample_2)
dtest_sc = xgb.DMatrix(x_test_sc, label = y_test_nothing)

params = {'colsample_bynode': 0.6, 'colsample_bytree': 0.5, 'gamma': 2.2, 'learning_rate': 0.5, 'max_depth': 5,\
          'min_child_weight': 1, 'nthread': 2, 'subsample': 0.789, 'colsample_bylevel': 0.9}

# Criação do modelo base v20
# Criando sem feature selecting. Com padronização e balanceamento
modelo_xgb_v20 = xgb.train(params = params, dtrain = dtrain_sc_resample)

pred_v20 = modelo_xgb_v20.predict(dtest_sc)

report_modelo(modelo_xgb_v20, y_test_nothing, pred_v20, label = 'XGB V20', target_names = ['VIVER', 'MORRER'])

print('Report Para Dados de Treino')
pred_treino = modelo_xgb_v20.predict(dtrain_sc_resample)
pred_treino = pred_treino > 0.5

# Acurácia
print("Acurácia: %f" % accuracy_score(y_train_resample_2, pred_treino))

# Classification Report
print(classification_report(y_train_resample_2, pred_treino, target_names= ['VIVER', 'MORRER']))

AUC: 0.808084

Acurácia: 0.788889
              precision    recall  f1-score   support

       VIVER       0.92      0.75      0.83        61
      MORRER       0.62      0.86      0.72        29

    accuracy                           0.79        90
   macro avg       0.77      0.81      0.78        90
weighted avg       0.82      0.79      0.80        90

Report Para Dados de Treino
Acurácia: 0.845070
              precision    recall  f1-score   support

       VIVER       0.87      0.82      0.84       142
      MORRER       0.83      0.87      0.85       142

    accuracy                           0.85       284
   macro avg       0.85      0.85      0.84       284
weighted avg       0.85      0.85      0.84       284

Wall time: 265 ms


# Transformando os dados em DMatrix pois o XGBoost exige
dtrain_sc_resample = xgb.DMatrix(x_train_sc_resample, label = y_train_resample_2)
dtest_sc = xgb.DMatrix(x_test_sc, label = y_test_nothing)

params = {'colsample_bynode': 0.6, 'colsample_bytree': 0.5, 'gamma': 2.2, 'learning_rate': 0.5, 'max_depth': 5,\
          'min_child_weight': 1, 'nthread': 2, 'subsample': 0.789, 'colsample_bylevel': 0.9}

# Criação do modelo base final
# Criando sem feature selecting. Com padronização e balanceamento
modelo_xgb_final = xgb.train(params = params, dtrain = dtrain_sc_resample)

pred_final = modelo_xgb_final.predict(dtest_sc)

report_modelo(modelo_xgb_final, y_test_nothing, pred_final, label = 'XGB Final', target_names = ['VIVER', 'MORRER'])

print('Report Para Dados de Treino')
pred_treino = modelo_xgb_final.predict(dtrain_sc_resample)
pred_treino = pred_treino > 0.5

# Acurácia
print("Acurácia: %f" % accuracy_score(y_train_resample_2, pred_treino))

# Classification Report
print(classification_report(y_train_resample_2, pred_treino, target_names= ['VIVER', 'MORRER']))

AUC: 0.808084

Acurácia: 0.788889
              precision    recall  f1-score   support

       VIVER       0.92      0.75      0.83        61
      MORRER       0.62      0.86      0.72        29

    accuracy                           0.79        90
   macro avg       0.77      0.81      0.78        90
weighted avg       0.82      0.79      0.80        90

Report Para Dados de Treino
Acurácia: 0.845070
              precision    recall  f1-score   support

       VIVER       0.87      0.82      0.84       142
      MORRER       0.83      0.87      0.85       142

    accuracy                           0.85       284
   macro avg       0.85      0.85      0.84       284
weighted avg       0.85      0.85      0.84       284


colunas = ['age',
         'anaemia',
         'creatinine_phosphokinase',
         'diabetes',
         'ejection_fraction',
         'high_blood_pressure',
         'platelets',
         'serum_creatinine',
         'serum_sodium',
         'sex',
         'smoking']


shap.initjs()
explainer = shap.TreeExplainer(modelo_xgb_final)
shap_values = explainer(X)


# Interpretação da predição 0
shap.plots.waterfall(shap_values[0])


# Interpretação da predição 0
shap.plots.force(shap_values[0])


# Interpretação da predição 1
shap.plots.waterfall(shap_values[1])


# Interpretação da predição 1
shap.plots.force(shap_values[1])


shap.plots.beeswarm(shap_values)


shap.plots.bar(shap_values)

Feature	Explanation	Measurement	Range
Age	Age of the patient	Years	[40,…, 95]
Anaemia	Decrease of red blood cells or hemoglobin	Boolean	0, 1
High blood pressure	If a patient has hypertension	Boolean	0, 1
Creatinine phosphokinase (CPK)	Level of the CPK enzyme in the blood	mcg/L	[23,…, 7861]
Diabetes	If the patient has diabetes	Boolean	0, 1
Ejection fraction	Percentage of blood leaving	Percentage	[14,…, 80]
Sex	Woman or man	Binary	0, 1
Platelets	Platelets in the blood	kiloplatelets/mL	[25.01,…, 850.00]
Serum creatinine	Level of creatinine in the blood	mg/dL	[0.50,…, 9.40]
Serum sodium	Level of sodium in the blood	mEq/L	[114,…, 148]
Smoking	If the patient smokes	Boolean	0, 1
Time	Follow-up period	Days	[4,…,285]
(target) death event	If the patient died during the follow-up period	Boolean	0, 1

	age	anaemia	creatinine_phosphokinase	diabetes	ejection_fraction	high_blood_pressure	platelets	serum_creatinine	serum_sodium	sex	smoking	time	DEATH_EVENT
count	299.000000	299.000000	299.000000	299.000000	299.000000	299.000000	299.000000	299.00000	299.000000	299.000000	299.00000	299.000000	299.00000
mean	60.833893	0.431438	581.839465	0.418060	38.083612	0.351171	263358.029264	1.39388	136.625418	0.648829	0.32107	130.260870	0.32107
std	11.894809	0.496107	970.287881	0.494067	11.834841	0.478136	97804.236869	1.03451	4.412477	0.478136	0.46767	77.614208	0.46767
min	40.000000	0.000000	23.000000	0.000000	14.000000	0.000000	25100.000000	0.50000	113.000000	0.000000	0.00000	4.000000	0.00000
25%	51.000000	0.000000	116.500000	0.000000	30.000000	0.000000	212500.000000	0.90000	134.000000	0.000000	0.00000	73.000000	0.00000
50%	60.000000	0.000000	250.000000	0.000000	38.000000	0.000000	262000.000000	1.10000	137.000000	1.000000	0.00000	115.000000	0.00000
75%	70.000000	1.000000	582.000000	1.000000	45.000000	1.000000	303500.000000	1.40000	140.000000	1.000000	1.00000	203.000000	1.00000
max	95.000000	1.000000	7861.000000	1.000000	80.000000	1.000000	850000.000000	9.40000	148.000000	1.000000	1.00000	285.000000	1.00000

	age	anaemia	creatinine_phosphokinase	diabetes	ejection_fraction	high_blood_pressure	platelets	serum_creatinine	serum_sodium	sex	smoking	time	DEATH_EVENT
count	299.000000	299.000000	299.000000	299.000000	299.000000	299.000000	299.000000	299.00000	299.000000	299.000000	299.00000	299.000000	299.00000
mean	60.829431	0.431438	581.839465	0.418060	38.083612	0.351171	263358.029264	1.39388	136.625418	0.648829	0.32107	130.260870	0.32107
std	11.894997	0.496107	970.287881	0.494067	11.834841	0.478136	97804.236869	1.03451	4.412477	0.478136	0.46767	77.614208	0.46767
min	40.000000	0.000000	23.000000	0.000000	14.000000	0.000000	25100.000000	0.50000	113.000000	0.000000	0.00000	4.000000	0.00000
25%	51.000000	0.000000	116.500000	0.000000	30.000000	0.000000	212500.000000	0.90000	134.000000	0.000000	0.00000	73.000000	0.00000
50%	60.000000	0.000000	250.000000	0.000000	38.000000	0.000000	262000.000000	1.10000	137.000000	1.000000	0.00000	115.000000	0.00000
75%	70.000000	1.000000	582.000000	1.000000	45.000000	1.000000	303500.000000	1.40000	140.000000	1.000000	1.00000	203.000000	1.00000
max	95.000000	1.000000	7861.000000	1.000000	80.000000	1.000000	850000.000000	9.40000	148.000000	1.000000	1.00000	285.000000	1.00000

model	lifelines.CoxPHFitter
duration col	'time'
event col	'DEATH_EVENT'
baseline estimation	breslow
number of observations	299
number of events observed	96
partial log-likelihood	-468.2279
time fit was run	2021-07-12 19:01:11 UTC

	coef	exp(coef)	exp(coef) lower 95%	exp(coef) upper 95%	z	p
age	0.0464	1.0475	1.0285	1.0668	4.9773	<5e-05
anaemia	0.4601	1.5843	1.0358	2.4233	2.1220	0.0338
creatinine_phosphokinase	0.0002	1.0002	1.0000	1.0004	2.2255	0.0260
diabetes	0.1399	1.1501	0.7427	1.7811	0.6269	0.5307
ejection_fraction	-0.0489	0.9522	0.9329	0.9720	-4.6719	<5e-05
high_blood_pressure	0.4757	1.6092	1.0534	2.4583	2.2005	0.0278
platelets	-0.0000	1.0000	1.0000	1.0000	-0.4116	0.6806
serum_creatinine	0.3210	1.3786	1.2014	1.5818	4.5751	<5e-05
serum_sodium	-0.0442	0.9568	0.9141	1.0014	-1.8993	0.0575
sex	-0.2375	0.7886	0.4816	1.2913	-0.9440	0.3452
smoking	0.1289	1.1376	0.6953	1.8614	0.5132	0.6078

Variaveis¶

Imports¶

Ambiente¶

Coleta dos Dados¶

1. Analise Exploratoria¶

1.1 Distribuição dos Dados¶

1.1.1 Tesde de Shapiro Wilk¶

1.1.2 Teste normal de D'Agostino¶

1.2 Tabela de Contigencia¶

1.3 Correlação de Spearman¶

1.4 QUI-QUADRADO¶

1.5 Cox Proportional Hazards¶

2. Avaliando a MultiColinearidade¶

2.1 Autovetores¶

2.2 Visualizando Multicolinearidade¶

3. Pre-Processamento¶

3.1 Detectando Outliers¶

Exceço de Kurtosis = Kurtosis - 3¶

3.2 Removendo Outliers¶

3.3 Padronização dos Dados¶

4. Balanceando os Dados¶

5. Feature Selecting¶

5.1 XGBOOST¶

5.2 RFE (Recursive Feature Elimination)¶

5.3 Extra Trees Classifier¶

5.4 Aplicando Feature Selecting¶

5.4.1 Feature Selecting Baseado em Arvore - Mantendo Multicolineares¶

5.4.2 Feature Selecting Baseado em Arvore - Excluindo Multicolineares¶

5.4.3 Feature Selecting Baseado no Cox Proportional Hazard - Mantendo Multicolineares¶

5.5 Visualizando Resultado do Feature Selecting¶

6. Modelagem Preditiva¶

6.1 K-Means¶

Observação¶

6.2 SVM¶

6.3 XGBOOST¶

7. Conclusão¶

	age	anaemia	creatinine_phosphokinase	diabetes	ejection_fraction	high_blood_pressure	platelets	serum_creatinine	serum_sodium	sex	smoking	time	DEATH_EVENT
0	75.0	0	582	0	20	1	265000.00	1.9	130	1	0	4	1
1	55.0	0	7861	0	38	0	263358.03	1.1	136	1	0	6	1
2	65.0	0	146	0	20	0	162000.00	1.3	129	1	1	7	1
3	50.0	1	111	0	20	0	210000.00	1.9	137	1	0	7	1
4	65.0	1	160	1	20	0	327000.00	2.7	116	0	0	8	1

Concordance	0.7408
Partial AIC	958.4557
log-likelihood ratio test	81.9545 on 11 df
-log2(p) of ll-ratio test	40.5590

	age	anaemia	creatinine_phosphokinase	diabetes	ejection_fraction	high_blood_pressure	platelets	serum_creatinine	serum_sodium	sex	smoking
0	75.0	0	582.0	0	25.0	1	265000.00	1.9	132.0	1	0
1	55.0	0	1203.8	0	38.0	0	263358.03	1.1	136.0	1	0
2	65.0	0	146.0	0	25.0	0	162000.00	1.3	132.0	1	1
3	50.0	1	111.0	0	25.0	0	210000.00	1.9	137.0	1	0
4	65.0	1	160.0	1	25.0	0	327000.00	2.1	132.0	0	0

	param_C	param_class_weight	param_gamma	param_kernel	mean_test_score	rank_test_score
0	0.0001	balanced	scale	rbf	1.0	1
1	0.0001	balanced	auto	rbf	1.0	1
2	0.0001	balanced	1	rbf	1.0	1
3	0.0001	balanced	10	rbf	1.0	1
4	0.0001	balanced	100	rbf	1.0	1
5	0.001	balanced	scale	rbf	1.0	1
6	0.001	balanced	auto	rbf	1.0	1
7	0.001	balanced	1	rbf	1.0	1
8	0.001	balanced	10	rbf	1.0	1
9	0.001	balanced	100	rbf	1.0	1
10	0.01	balanced	scale	rbf	1.0	1
11	0.01	balanced	auto	rbf	1.0	1
12	0.01	balanced	1	rbf	1.0	1
13	0.01	balanced	10	rbf	1.0	1
14	0.01	balanced	100	rbf	1.0	1
15	0.1	balanced	scale	rbf	1.0	1
16	0.1	balanced	auto	rbf	1.0	1
17	0.1	balanced	1	rbf	1.0	1
18	0.1	balanced	10	rbf	1.0	1
19	0.1	balanced	100	rbf	1.0	1
20	1	balanced	scale	rbf	1.0	1
21	1	balanced	auto	rbf	1.0	1
22	1	balanced	1	rbf	1.0	1
23	1	balanced	10	rbf	1.0	1
24	1	balanced	100	rbf	1.0	1
25	10	balanced	scale	rbf	1.0	1
26	10	balanced	auto	rbf	1.0	1
27	10	balanced	1	rbf	1.0	1
28	10	balanced	10	rbf	1.0	1
29	10	balanced	100	rbf	1.0	1

	mean_fit_time	std_fit_time	mean_score_time	std_score_time	param_colsample_bytree	param_gamma	param_learning_rate	param_max_depth	param_min_child_weight	param_n_estimators	...	param_subsample	params	split0_test_score	split1_test_score	split2_test_score	split3_test_score	split4_test_score	mean_test_score	std_test_score	rank_test_score
0	0.184767	0.018689	0.005005	4.520880e-03	0.6	0.7	0.5	5	2	500	...	0.6	{'colsample_bytree': 0.6, 'gamma': 0.7, 'learn...	0.561404	0.701754	0.771930	0.894737	0.785714	0.743108	0.109876	1371
1	0.190372	0.014400	0.004004	2.099489e-03	0.6	0.7	0.5	5	2	500	...	0.7	{'colsample_bytree': 0.6, 'gamma': 0.7, 'learn...	0.631579	0.684211	0.789474	0.877193	0.785714	0.753634	0.086368	523
2	0.218598	0.038571	0.002803	4.004240e-04	0.6	0.7	0.5	5	2	500	...	0.8	{'colsample_bytree': 0.6, 'gamma': 0.7, 'learn...	0.596491	0.754386	0.754386	0.771930	0.767857	0.729010	0.066634	2719
3	0.246624	0.023834	0.002803	4.002810e-04	0.6	0.7	0.5	5	2	500	...	0.9	{'colsample_bytree': 0.6, 'gamma': 0.7, 'learn...	0.596491	0.736842	0.789474	0.807018	0.803571	0.746679	0.079198	999
4	0.212781	0.014608	0.002803	4.002810e-04	0.6	0.7	0.5	5	2	500	...	1.0	{'colsample_bytree': 0.6, 'gamma': 0.7, 'learn...	0.649123	0.719298	0.684211	0.771930	0.785714	0.722055	0.051573	3103
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
3595	0.202486	0.024683	0.002803	4.002571e-04	1.0	1.7	0.01	7	5	500	...	0.6	{'colsample_bytree': 1.0, 'gamma': 1.7, 'learn...	0.631579	0.842105	0.807018	0.789474	0.714286	0.756892	0.075305	457
3596	0.182868	0.006962	0.003002	2.780415e-07	1.0	1.7	0.01	7	5	500	...	0.7	{'colsample_bytree': 1.0, 'gamma': 1.7, 'learn...	0.631579	0.824561	0.754386	0.771930	0.732143	0.742920	0.063484	1622
3597	0.183071	0.014351	0.003003	1.507891e-07	1.0	1.7	0.01	7	5	500	...	0.8	{'colsample_bytree': 1.0, 'gamma': 1.7, 'learn...	0.631579	0.842105	0.754386	0.736842	0.767857	0.746554	0.067763	1126
3598	0.198390	0.035933	0.002797	3.976883e-04	1.0	1.7	0.01	7	5	500	...	0.9	{'colsample_bytree': 1.0, 'gamma': 1.7, 'learn...	0.631579	0.771930	0.736842	0.736842	0.803571	0.736153	0.057900	2032
3599	0.180926	0.020455	0.003001	6.345223e-04	1.0	1.7	0.01	7	5	500	...	1.0	{'colsample_bytree': 1.0, 'gamma': 1.7, 'learn...	0.649123	0.771930	0.719298	0.754386	0.750000	0.728947	0.043366	2758