import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import sweetviz as sv
import statsmodels.api as sm
import statsmodels.formula.api as smf
import shap
import graphviz

from warnings import simplefilter
from matplotlib.colors import ListedColormap
from math import ceil
from statsmodels.graphics.gofplots import qqplot
from scipy.stats import normaltest, kurtosis
from statsmodels.stats.outliers_influence import variance_inflation_factor
from smogn import smoter
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from holidays import Belgium
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression, Ridge, LassoCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR, LinearSVR
from sklearn.feature_selection import RFE
from sklearn.tree import export_graphviz
from catboost import CatBoostRegressor
from catboost import Pool, cv
from pickle import dump, load


# Versões dos pacotes usados neste jupyter notebook
%reload_ext watermark
%watermark -a "Herikc Brecher" --iversions

Author: Herikc Brecher

shap       : 0.39.0
graphviz   : 0.17
statsmodels: 0.12.2
matplotlib : 3.3.4
sweetviz   : 2.1.3
pandas     : 1.2.4
seaborn    : 0.11.1
numpy      : 1.19.5


simplefilter(action='ignore', category=FutureWarning)
%matplotlib inline
sns.set_theme()


seed_ = 194
np.random.seed(seed_)


# Carregamento do dataset de treino e teste
dtTreino = pd.read_csv('../data/training.csv')
dtTeste = pd.read_csv('../data/testing.csv')


dtTreino.head()


dtTeste.head()


# Stack the training and testing rows into a single frame. ignore_index gives
# every row a unique label instead of repeating each file's own 0..n-1 labels.
dtFull = pd.concat([dtTreino, dtTeste], axis = 0, ignore_index = True)


dtFull.head()


print(dtTreino.shape, dtTeste.shape, dtFull.shape)

(14803, 32) (4932, 32) (19735, 32)


dtFull.head()


dtFull.describe()


dtFull.dtypes

date            object
Appliances       int64
lights           int64
T1             float64
RH_1           float64
T2             float64
RH_2           float64
T3             float64
RH_3           float64
T4             float64
RH_4           float64
T5             float64
RH_5           float64
T6             float64
RH_6           float64
T7             float64
RH_7           float64
T8             float64
RH_8           float64
T9             float64
RH_9           float64
T_out          float64
Press_mm_hg    float64
RH_out         float64
Windspeed      float64
Visibility     float64
Tdewpoint      float64
rv1            float64
rv2            float64
NSM              int64
WeekStatus      object
Day_of_week     object
dtype: object


# Copiando para um dataset onde iremos processar os dados
dtProcessado = dtFull.copy()

# Convertendo a coluna 'date' para 'datetime'
dtProcessado['date'] = pd.to_datetime(dtProcessado['date'], format='%Y-%m-%d %H:%M:%S')


dtProcessado.dtypes

date           datetime64[ns]
Appliances              int64
lights                  int64
T1                    float64
RH_1                  float64
T2                    float64
RH_2                  float64
T3                    float64
RH_3                  float64
T4                    float64
RH_4                  float64
T5                    float64
RH_5                  float64
T6                    float64
RH_6                  float64
T7                    float64
RH_7                  float64
T8                    float64
RH_8                  float64
T9                    float64
RH_9                  float64
T_out                 float64
Press_mm_hg           float64
RH_out                float64
Windspeed             float64
Visibility            float64
Tdewpoint             float64
rv1                   float64
rv2                   float64
NSM                     int64
WeekStatus             object
Day_of_week            object
dtype: object


dtProcessado.head()


# Verificando se possui valor missing/NA
print(dtProcessado.isna().sum())

date           0
Appliances     0
lights         0
T1             0
RH_1           0
T2             0
RH_2           0
T3             0
RH_3           0
T4             0
RH_4           0
T5             0
RH_5           0
T6             0
RH_6           0
T7             0
RH_7           0
T8             0
RH_8           0
T9             0
RH_9           0
T_out          0
Press_mm_hg    0
RH_out         0
Windspeed      0
Visibility     0
Tdewpoint      0
rv1            0
rv2            0
NSM            0
WeekStatus     0
Day_of_week    0
dtype: int64


# Verificando valores unicos
print(dtProcessado.nunique())

date           19735
Appliances        92
lights             8
T1               722
RH_1            2547
T2              1650
RH_2            3376
T3              1426
RH_3            2618
T4              1390
RH_4            2987
T5              2263
RH_5            7571
T6              4446
RH_6            9709
T7              1955
RH_7            5891
T8              2228
RH_8            6649
T9               924
RH_9            3388
T_out           1730
Press_mm_hg     2189
RH_out           566
Windspeed        189
Visibility       413
Tdewpoint       1409
rv1            19735
rv2            19735
NSM              144
WeekStatus         2
Day_of_week        7
dtype: int64


# Verificando se possui valores duplicados
print(sum(dtProcessado.duplicated()))

0


qualitativas = ['WeekStatus', 'Day_of_week']
quantitativas = dtProcessado.drop(['WeekStatus', 'Day_of_week', 'date'], axis = 1).columns


dtProcessado[qualitativas].head()


dtProcessado[quantitativas].head()


# Energy consumption split between weekdays and weekends
fig = plt.figure(figsize = (15, 10))

# Select the target column before aggregating so the date and text columns
# are never summed, and take the slice labels from the groups themselves so
# they cannot drift out of sync with the group order.
consumo_por_status = dtProcessado.groupby('WeekStatus')['Appliances'].sum()
plt.pie(consumo_por_status, labels = consumo_por_status.index, autopct = '%1.1f%%')

plt.savefig('../analises/pizza_energia_weekday_weekend.png')
plt.show()


plt.plot(dtProcessado['date'], dtProcessado['Appliances'])

[<matplotlib.lines.Line2D at 0x235061623d0>]


def scatter_plot_conjunto(data, columns, target):
    """Draw one scatter plot of ``target`` against each column in ``columns``.

    Every plot spans the min/max of both variables and carries two dashed red
    guide lines: a horizontal one at the mean of ``target`` and a vertical one
    at the mean of the plotted column. A column equal to ``target`` is skipped.

    Args:
        data: DataFrame holding ``target`` and every name in ``columns``.
        columns: Iterable of column names to use on the X axis.
        target: Column name used on the Y axis.
    """
    # The Y range and the target mean do not depend on the X column
    y_range = [data[target].min(), data[target].max()]
    media_target = data[target].mean()

    for column in columns:
        if column == target:
            continue

        x_range = [data[column].min(), data[column].max()]
        media_coluna = data[column].mean()

        # Scatter plot of X against Y
        eixo = data.plot(kind = 'scatter', x = column, y = target, xlim = x_range, ylim = y_range,
                         c = ['black'])

        # Mean guide lines for Y (horizontal) and X (vertical)
        eixo.plot(x_range, [media_target, media_target], '--', color = 'red', linewidth = 1)
        eixo.plot([media_coluna, media_coluna], y_range, '--', color = 'red', linewidth = 1)

        # Render and release each figure so they do not pile up in memory
        # (one figure is created per column).
        plt.show()
        plt.close(eixo.get_figure())


scatter_plot_conjunto(dtProcessado, quantitativas, 'Appliances')

More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).


def quantil_quantil_teste(data, columns):
    """Print each column name and show its Q-Q plot.

    Args:
        data: DataFrame indexable by the names in ``columns``.
        columns: Iterable of column names to plot, one figure per name.
    """
    for nome in columns:
        print(nome)
        serie = data[nome]
        # line = 's' asks qqplot to draw its standardized reference line
        qqplot(serie, line = 's')
        plt.show()


quantil_quantil_teste(dtProcessado, quantitativas)

Appliances

lights

T1

RH_1

T2

RH_2

T3

RH_3

T4

RH_4


def testes_gaussianos(data, columns, teste):
    
    for i, col in enumerate(columns):
        print('Teste para a variavel', col)
        alpha = 0.05
        
        if teste == 'shapiro':
            stat, p = shapiro(data[col])
        elif teste == 'normal':
            stat, p = normaltest(data[col])           
        elif teste == 'anderson':
            resultado = anderson(data[col])
            print('Stats: %.4f' % resultado.statistic)
            
            for j in range(len(resultado.critical_values)):
                sl, cv = resultado.significance_level[j], resultado.critical_values[j]
                
                if resultado.statistic < cv:
                    print('Significancia = %.4f, Valor Critico = %.4f, os dados parecem Gaussianos. Falha ao rejeitar H0.' % (sl, cv))
                else:
                    print('Significancia = %.4f, Valor Critico = %.4f, os dados não parecem Gaussianos. H0 rejeitado.' % (sl, cv))
            
        if teste != 'anderson':         
            print('Stat = ', round(stat, 4))
            print('p-value = ', round(p, 4))
            #print('Stats = %4.f, p = %4.f' % (stat, p))

            if p > alpha:
                print('Os dados parecem Gaussianos. Falha ao rejeitar H0.')
            else:
                print('Os dados não parecem Gaussianos. H0 rejeitado.')
            
        print('\n')


testes_gaussianos(dtProcessado, quantitativas, teste = 'normal')

Teste para a variavel Appliances
Stat =  14008.9202
p-value =  0.0
Os dados não parecem Gaussianos. H0 rejeitado.


Teste para a variavel lights
Stat =  8437.425
p-value =  0.0
Os dados não parecem Gaussianos. H0 rejeitado.


Teste para a variavel T1
Stat =  66.3657
p-value =  0.0
Os dados não parecem Gaussianos. H0 rejeitado.


Teste para a variavel RH_1
Stat =  657.2555
p-value =  0.0
Os dados não parecem Gaussianos. H0 rejeitado.


Teste para a variavel T2
Stat =  2300.1745
p-value =  0.0
Os dados não parecem Gaussianos. H0 rejeitado.


Teste para a variavel RH_2
Stat =  442.0018
p-value =  0.0
Os dados não parecem Gaussianos. H0 rejeitado.


Teste para a variavel T3
Stat =  610.1817
p-value =  0.0
Os dados não parecem Gaussianos. H0 rejeitado.


Teste para a variavel RH_3
Stat =  1238.9214
p-value =  0.0
Os dados não parecem Gaussianos. H0 rejeitado.


Teste para a variavel T4
Stat =  95.3642
p-value =  0.0
Os dados não parecem Gaussianos. H0 rejeitado.


Teste para a variavel RH_4
Stat =  1280.0365
p-value =  0.0
Os dados não parecem Gaussianos. H0 rejeitado.


Teste para a variavel T5
Stat =  906.2089
p-value =  0.0
Os dados não parecem Gaussianos. H0 rejeitado.


Teste para a variavel RH_5
Stat =  7370.3086
p-value =  0.0
Os dados não parecem Gaussianos. H0 rejeitado.


Teste para a variavel T6
Stat =  1113.0635
p-value =  0.0
Os dados não parecem Gaussianos. H0 rejeitado.


Teste para a variavel RH_6
Stat =  11009.4467
p-value =  0.0
Os dados não parecem Gaussianos. H0 rejeitado.


Teste para a variavel T7
Stat =  510.3485
p-value =  0.0
Os dados não parecem Gaussianos. H0 rejeitado.


Teste para a variavel RH_7
Stat =  668.7835
p-value =  0.0
Os dados não parecem Gaussianos. H0 rejeitado.


Teste para a variavel T8
Stat =  233.5762
p-value =  0.0
Os dados não parecem Gaussianos. H0 rejeitado.


Teste para a variavel RH_8
Stat =  640.2104
p-value =  0.0
Os dados não parecem Gaussianos. H0 rejeitado.


Teste para a variavel T9
Stat =  575.0876
p-value =  0.0
Os dados não parecem Gaussianos. H0 rejeitado.


Teste para a variavel RH_9
Stat =  637.2376
p-value =  0.0
Os dados não parecem Gaussianos. H0 rejeitado.


Teste para a variavel T_out
Stat =  908.7901
p-value =  0.0
Os dados não parecem Gaussianos. H0 rejeitado.


Teste para a variavel Press_mm_hg
Stat =  540.6699
p-value =  0.0
Os dados não parecem Gaussianos. H0 rejeitado.


Teste para a variavel RH_out
Stat =  2110.9709
p-value =  0.0
Os dados não parecem Gaussianos. H0 rejeitado.


Teste para a variavel Windspeed
Stat =  1892.7628
p-value =  0.0
Os dados não parecem Gaussianos. H0 rejeitado.


Teste para a variavel Visibility
Stat =  606.9198
p-value =  0.0
Os dados não parecem Gaussianos. H0 rejeitado.


Teste para a variavel Tdewpoint
Stat =  197.8235
p-value =  0.0
Os dados não parecem Gaussianos. H0 rejeitado.


Teste para a variavel rv1
Stat =  18296.8123
p-value =  0.0
Os dados não parecem Gaussianos. H0 rejeitado.


Teste para a variavel rv2
Stat =  18296.8123
p-value =  0.0
Os dados não parecem Gaussianos. H0 rejeitado.


Teste para a variavel NSM
Stat =  17369.1239
p-value =  0.0
Os dados não parecem Gaussianos. H0 rejeitado.


# Box plot of every quantitative variable on a 3-column grid

fig = plt.figure(figsize = (16, 32))

# Size the grid from the number of variables instead of assuming a fixed 10 rows
n_linhas = ceil(len(quantitativas) / 3)

for i, col in enumerate(quantitativas):
    plt.subplot(n_linhas, 3, i + 1)
    dtProcessado.boxplot(col)

# The layout only needs to be computed once, after every subplot exists
plt.tight_layout()


fig = plt.figure(figsize = (32, 32))

sns.heatmap(dtProcessado[quantitativas].corr(method = 'pearson'), annot = True, square = True)
plt.show()


# Gerando relatorio de analise do Sweetviz
relatorio = sv.analyze(dtProcessado)
relatorio.show_html('eda_report.html')

Report eda_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


# Remoção de variaveis desnecessárias a primeira vista

dtProcessado = dtProcessado.drop(['rv1', 'rv2'], axis = 1)
quantitativas = quantitativas.drop(['rv1', 'rv2'])


dtProcessado_Temp = dtProcessado.copy()
dtProcessado_Temp = dtProcessado_Temp.drop(['date', 'Appliances'], axis = 1)

# Capturando variaveis independentes e dependentes
X = dtProcessado_Temp[quantitativas.drop('Appliances')]

# Gerando matriz de correlação e recombinando
corr = np.corrcoef(X, rowvar = 0)
eigenvalues, eigenvectors = np.linalg.eig(corr)


# Position and value of the smallest eigenvalue (first occurrence on ties).
# np.argmin avoids the arbitrary 999 starting sentinel of a manual search.
index = int(np.argmin(eigenvalues))
menor = eigenvalues[index]


print('Menor valor do eigenvalues:', menor, 'Index:', index)

Menor valor do eigenvalues: 0.0036995337351865758 Index: 20


# Eigenvector paired with the smallest eigenvalue. np.linalg.eig returns the
# eigenvectors as columns, with column k belonging to eigenvalues[k], so the
# 0-based position stored in `index` is used directly (not index - 1).
menorEigenVector = abs(eigenvectors[:, index])


# Absolute loading of each variable on that eigenvector
for i, val in enumerate(menorEigenVector):
    print('Variavel', i,':', val)

Variavel 0 : 0.012508869857297154
Variavel 1 : 0.05999614308799351
Variavel 2 : 0.39796806946142554
Variavel 3 : 0.1293417764198327
Variavel 4 : 0.0065720474555933375
Variavel 5 : 0.20872335203896905
Variavel 6 : 0.7093152784227946
Variavel 7 : 0.07116744022615061
Variavel 8 : 0.002340580766378429
Variavel 9 : 0.10252370243755643
Variavel 10 : 0.009037451481592679
Variavel 11 : 0.016828188950170738
Variavel 12 : 0.3890808789742833
Variavel 13 : 0.052917602709535834
Variavel 14 : 0.09321477213498679
Variavel 15 : 0.01695868232166083
Variavel 16 : 0.18437456347816766
Variavel 17 : 0.048838853198184366
Variavel 18 : 0.017253674475974264
Variavel 19 : 0.004418863747774177
Variavel 20 : 0.03142486038181419
Variavel 21 : 0.18374437578705016
Variavel 22 : 0.010690483150461665
Variavel 23 : 0.0014915408631150135
Variavel 24 : 0.15236584339230871
Variavel 25 : 0.022681722214888112


colunas = dtProcessado_Temp.columns


colunas[[11, 19, 21, 24]]

Index(['T6', 'T_out', 'RH_out', 'Tdewpoint'], dtype='object')


scatter_plot_conjunto(dtProcessado_Temp, ['RH_5', 'RH_9', 'Press_mm_hg', 'Visibility'], 'RH_5')


scatter_plot_conjunto(dtProcessado_Temp, ['RH_5', 'RH_9', 'Press_mm_hg', 'Visibility'], 'RH_9')


scatter_plot_conjunto(dtProcessado_Temp, ['RH_5', 'RH_9', 'Press_mm_hg', 'Visibility'], 'Press_mm_hg')


scatter_plot_conjunto(dtProcessado_Temp, ['RH_5', 'RH_9', 'Press_mm_hg', 'Visibility'], 'Visibility')


def calcular_VIF(X):
    """Build a table with the variance inflation factor of every column of X.

    Args:
        X: DataFrame of explanatory variables.

    Returns:
        DataFrame with a 'Features' column (the column names of X) and a
        'VIF' column (variance_inflation_factor for the column at the same
        position).
    """
    matriz = X.values
    fatores = []
    for posicao in range(X.shape[1]):
        fatores.append(variance_inflation_factor(matriz, posicao))

    return pd.DataFrame({'Features': X.columns, 'VIF': fatores})


dtProcessado_Temp = dtProcessado.copy()
dtProcessado_Temp = dtProcessado_Temp.drop(['Appliances'], axis = 1)

# Capturando variaveis independentes
X = dtProcessado_Temp[quantitativas.drop('Appliances')]


calcular_VIF(X)


# Successive VIF elimination rounds. Every round drops its columns from the
# full X (not from the previous X_temp), so each list repeats the earlier
# drops and adds new ones; the cells can therefore be re-run independently.

# Round 1: drop T1, T2, T9 and Press_mm_hg
X_temp = X.drop(['T1', 'T2', 'T9', 'Press_mm_hg'], axis = 1)
calcular_VIF(X_temp)


# Round 2: additionally drop RH_1, RH_3, RH_4 and T7
X_temp = X.drop(['T1', 'T2', 'T9', 'Press_mm_hg', 'RH_1', 'RH_3', 'RH_4', 'T7'], axis = 1)
calcular_VIF(X_temp)


# Round 3: additionally drop T3, T4, T5, T8, RH_9, RH_2, RH_7 and RH_8
X_temp = X.drop(['T1', 'T2', 'T9', 'Press_mm_hg', 'RH_1', 'RH_3', 'RH_4', 'T7', 'T3', 'T4', 'T5', 'T8', 'RH_9', 'RH_2',\
                'RH_7', 'RH_8'], axis = 1)
calcular_VIF(X_temp)


# Round 4: additionally drop RH_5, T_out, Visibility, RH_out and T6.
# X_temp keeps this value afterwards and is reused further down when the
# reduced OLS model is built.
X_temp = X.drop(['T1', 'T2', 'T9', 'Press_mm_hg', 'RH_1', 'RH_3', 'RH_4', 'T7', 'T3', 'T4', 'T5', 'T8', 'RH_9', 'RH_2',\
                'RH_7', 'RH_8', 'RH_5', 'T_out', 'Visibility', 'RH_out', 'T6'], axis = 1)
calcular_VIF(X_temp)


# Carregando todas variaveis com exceção da 'Target', iremos adicionar a constante exigida pelo modelo
X = dtProcessado_Temp.copy().drop('date', axis = 1)[quantitativas.drop('Appliances')]
Xc = sm.add_constant(X)

y = dtProcessado['Appliances'].values


# Criando e treinando modelo
modelo = sm.OLS(y, Xc)
modelo_v1 = modelo.fit()


# Visualizando resumo do modelo
modelo_v1.summary()


# Carregando variaveis com exceção
Xc = sm.add_constant(X_temp)

y = dtProcessado['Appliances'].values


# Criando e treinando modelo
modelo = sm.OLS(y, Xc)
modelo_v2 = modelo.fit()


# Visualizando resumo do modelo
modelo_v2.summary()


print(dtProcessado[quantitativas].skew(), '\nSoma:', sum(abs(dtProcessado[quantitativas].skew())))

Appliances     3.386367
lights         2.195155
T1             0.120917
RH_1           0.465774
T2             0.889658
RH_2          -0.268247
T3             0.450777
RH_3           0.467589
T4             0.170384
RH_4           0.444614
T5             0.558220
RH_5           1.866820
T6             0.597471
RH_6          -0.241961
T7             0.254722
RH_7           0.242141
T8            -0.256151
RH_8           0.308036
T9             0.382711
RH_9           0.368937
T_out          0.534273
Press_mm_hg   -0.420442
RH_out        -0.922997
Windspeed      0.859982
Visibility     0.441554
Tdewpoint      0.239374
NSM           -0.000670
dtype: float64 
Soma: 17.355945524815315


def hist_individual(data, columns, width = 15, height = 10):
    """Show one histogram per column on a 3-column grid.

    The figure size now follows ``width`` and ``height``. Previously both
    arguments were ignored and the figure was always 15 wide by 10 tall, so
    the defaults are set to those values to keep existing calls unchanged.

    Args:
        data: DataFrame indexable by the names in ``columns``.
        columns: Sequence of column names to plot.
        width: Figure width in inches.
        height: Figure height in inches.
    """
    fig = plt.figure(figsize = (width, height))
    fig.subplots_adjust(hspace = 0.4, wspace = 0.4)

    # Number of grid rows needed for three plots per row
    n_linhas = ceil(len(columns) / 3)

    for i, column in enumerate(columns):
        ax = fig.add_subplot(n_linhas, 3, i + 1)
        # Draw explicitly on the subplot just created
        data[column].hist(label = column, ax = ax)
        ax.set_title(column)

    plt.tight_layout()
    plt.show()


hist_individual(dtProcessado, quantitativas[0:9])


hist_individual(dtProcessado, quantitativas[9:18])


hist_individual(dtProcessado, quantitativas[18:27])


print(dtProcessado[quantitativas].kurtosis(), '\nSoma:', sum(abs(dtProcessado[quantitativas].kurtosis())))

Appliances     13.667863
lights          4.462147
T1              0.161601
RH_1            0.112629
T2              0.933397
RH_2            0.670959
T3             -0.007055
RH_3           -0.583126
T4             -0.037633
RH_4           -0.613967
T5              0.112724
RH_5            4.503391
T6              0.425549
RH_6           -1.142064
T7             -0.461165
RH_7           -0.544889
T8             -0.158742
RH_8           -0.481962
T9             -0.324625
RH_9           -0.405540
T_out           0.364291
Press_mm_hg     0.071831
RH_out          0.256859
Windspeed       0.250030
Visibility      0.165818
Tdewpoint      -0.124519
NSM            -1.200156
dtype: float64 
Soma: 32.24453232465741


# Renomeando coluna WeekStatus para Weekend
dtProcessado = dtProcessado.rename(columns = {'WeekStatus': 'Weekend'})


# Dia de semana = 0, final de semana = 1
dtProcessado['Day_of_week'] = dtProcessado['date'].dt.dayofweek
dtProcessado['Weekend'] = 0

dtProcessado.loc[(dtProcessado['Day_of_week'] == 5) | (dtProcessado['Day_of_week'] == 6), 'Weekend'] = 1


# Create the Month, Day and Hour columns
dtProcessado['Month'] = dtProcessado['date'].dt.month
dtProcessado['Day'] = dtProcessado['date'].dt.day
dtProcessado['Hour'] = dtProcessado['date'].dt.hour


dtProcessado.head()


fig, ax = plt.subplots(figsize = (10, 5))
# Aggregate only the target column (the frame still holds the date column)
# and draw explicitly on the axes created above.
dtProcessado.groupby('Day_of_week')['Appliances'].mean().plot(kind = 'bar', ax = ax)

ax.set_title('Média de Watt-Hora por Dia')
ax.set_ylabel('Watt-Hora')
ax.set_xlabel('Dia da Semana')

plt.savefig('../analises/barra_dia_semana_media_wh.png')
# plt.plot() with no arguments draws nothing; plt.show() renders the figure
plt.show()

[]


fig, ax = plt.subplots(figsize = (10, 5))
# Aggregate only the target column (summing the date column is meaningless)
# and draw explicitly on the axes created above.
dtProcessado.groupby('Day_of_week')['Appliances'].sum().plot(kind = 'bar', ax = ax)

ax.set_title('Soma de Watt-Hora por Dia')
ax.set_ylabel('Watt-Hora')
ax.set_xlabel('Dia da Semana')

plt.savefig('../analises/barra_dia_semana_soma_wh.png')
# plt.plot() with no arguments draws nothing; plt.show() renders the figure
plt.show()

[]


fig, ax = plt.subplots(figsize = (10, 5))
# Aggregate only the target column (the frame still holds the date column)
# and draw explicitly on the axes created above.
dtProcessado.groupby('Hour')['Appliances'].mean().plot(kind = 'line', ax = ax)

ax.set_title('Media de Watt-Hora por Hora')
ax.set_ylabel('Watt-Hora')
ax.set_xlabel('Hora do Dia')

plt.savefig('../analises/linha_hora_media_wh.png')
# plt.plot() with no arguments draws nothing; plt.show() renders the figure
plt.show()

[]


fig, ax = plt.subplots(figsize = (10, 5))
# Aggregate only the target column (summing the date column is meaningless)
# and draw explicitly on the axes created above.
dtProcessado.groupby('Hour')['Appliances'].sum().plot(kind = 'line', ax = ax)

ax.set_title('Soma de Watt-Hora por Hora')
ax.set_ylabel('Watt-Hora')
ax.set_xlabel('Hora do Dia')

plt.savefig('../analises/linha_hora_soma_wh.png')
# plt.plot() with no arguments draws nothing; plt.show() renders the figure
plt.show()

[]


# Independent copy of the dataset with the 'date' column moved into the index
dtProcessado_temporal = dtProcessado.copy().set_index('date')


dtProcessado_temporal.head()


# Mean consumption per calendar day
dtProcessado_Dia = dtProcessado_temporal['Appliances'].resample('D').mean()

# Running mean up to and including each day. The expanding window replaces
# the manual slice [:x], which excluded the current day and produced NaN for
# the first one, and it keeps the daily index without re-assigning it.
media_momentanea = dtProcessado_Dia.expanding().mean()


fig, ax = plt.subplots(figsize = (15, 5))
ax.plot(dtProcessado_Dia, label = 'Gasto Energetico Diario')
ax.plot(media_momentanea, label = 'Media de Gasto Energetico')
ax.legend()
plt.xticks(rotation = 90)

# The title has to be set before saving, otherwise the PNG is written without it
ax.set_title('Gasto Médio de Energia Diário em Watt-Hora')
plt.savefig('../analises/linha_media_wh_data.png')


dtProcessado = dtProcessado.drop(['date'], axis = 1)


dtProcessado.head()


def boxplot_individuais(data, columns, width = 15, height = 8):
    """Show one horizontal box plot per column on a 3-column grid.

    Args:
        data: DataFrame indexable by the names in ``columns``.
        columns: Sequence of column names to plot.
        width: Figure width in inches (previously ignored in favour of a
            hard-coded 15).
        height: Figure height in inches (previously ignored in favour of a
            hard-coded 8).
    """
    fig = plt.figure(figsize = (width, height))
    fig.subplots_adjust(hspace = 0.4, wspace = 0.4)

    # Number of grid rows needed for three plots per row
    n_linhas = ceil(len(columns) / 3)

    for i, column in enumerate(columns):
        ax = fig.add_subplot(n_linhas, 3, i + 1)
        # Draw explicitly on the subplot just created
        sns.boxplot(x = data[column], ax = ax)

    plt.tight_layout()
    plt.show()


boxplot_individuais(dtProcessado, quantitativas[0:9])


boxplot_individuais(dtProcessado, quantitativas[9:18])


boxplot_individuais(dtProcessado, quantitativas[18:27])


def calcular_limites_IQR(column):
    """Return the (lower, upper) outlier fences of a column.

    The fences sit 1.5 interquartile ranges below the first quartile and
    above the third quartile.

    Args:
        column: pandas Series (anything exposing ``quantile``).

    Returns:
        Tuple ``(limite_inferior, limite_superior)``.
    """
    primeiro_quartil = column.quantile(0.25)
    terceiro_quartil = column.quantile(0.75)

    # Distance from each quartile to its fence
    margem = 1.5 * (terceiro_quartil - primeiro_quartil)

    return primeiro_quartil - margem, terceiro_quartil + margem

def aplicar_IQR_coluna(column, superior, inferior):
    """Cap the values of a column at its IQR fences.

    Args:
        column: pandas Series to process.
        superior: When truthy, values above the upper fence are replaced by it.
        inferior: When truthy, values below the lower fence are replaced by it.

    Returns:
        A list with the capped values, or ``column`` itself, untouched, when
        both flags are falsy.
    """
    piso, teto = calcular_limites_IQR(column)

    # Nothing requested: hand the original object back
    if not (inferior or superior):
        return column

    def _limitar(valor):
        # Lower fence first, then upper fence, each only if enabled
        if inferior and valor < piso:
            valor = piso
        if superior and valor > teto:
            valor = teto
        return valor

    return [_limitar(valor) for valor in column]

def aplicar_IQR(data, columns = None, superior = True, inferior = True):
    """Cap outliers at the IQR fences, column by column, in place.

    Args:
        data: DataFrame to process. It is modified in place.
        columns: Names of the columns to cap. ``None`` or an empty collection
            means every column of ``data``. Names not present in ``data`` are
            ignored.
        superior: Cap values above the upper fence.
        inferior: Cap values below the lower fence.

    Returns:
        The same ``data`` object, after modification.
    """
    # len() rather than truthiness: `columns` may be a pandas Index, whose
    # truth value is ambiguous.
    todas_as_colunas = columns is None or len(columns) == 0

    for column in data.columns:
        if todas_as_colunas or column in columns:
            data[column] = aplicar_IQR_coluna(data[column], superior, inferior)

    return data


dtProcessado.describe()


dtProcessado_IQR = dtProcessado.copy()
dtProcessado_IQR = aplicar_IQR(dtProcessado_IQR, columns = dtProcessado_IQR.columns.copy().drop(['lights',\
                                                                    'Weekend', 'Day_of_week', 'Month', 'Day', 'Hour']))


dtProcessado_IQR.describe()


# Normalização dos dados
scaler = StandardScaler()
processado_IQR_normalizado = dtProcessado_IQR.copy()
processado_IQR_normalizado[quantitativas.drop('Appliances')] = scaler.fit_transform(\
                                                                    dtProcessado_IQR[quantitativas.drop('Appliances')])


'''
# Normalização dos dados
scaler = StandardScaler()
processado_IQR_normalizado = dtProcessado_IQR.copy()
processado_IQR_normalizado[quantitativas] = scaler.fit_transform(dtProcessado_IQR[quantitativas])
'''

'\n# Normalização dos dados\nscaler = StandardScaler()\nprocessado_IQR_normalizado = dtProcessado_IQR.copy()\nprocessado_IQR_normalizado[quantitativas] = scaler.fit_transform(dtProcessado_IQR[quantitativas])\n'


dtProcessado_IQR_normalizado = pd.DataFrame(processado_IQR_normalizado.copy(), columns = dtProcessado_IQR.columns)


dtProcessado_IQR_normalizado.head()


print(dtProcessado_IQR_normalizado.skew()[quantitativas],\
      '\nSoma:', sum(abs(dtProcessado_IQR_normalizado[quantitativas].skew())))

Appliances     1.156157
lights         2.195155
T1             0.166939
RH_1           0.401822
T2             0.634604
RH_2          -0.146187
T3             0.383435
RH_3           0.466830
T4             0.192775
RH_4           0.444614
T5             0.534076
RH_5           0.734540
T6             0.456428
RH_6          -0.241961
T7             0.254697
RH_7           0.237275
T8            -0.246628
RH_8           0.305941
T9             0.382711
RH_9           0.370684
T_out          0.398211
Press_mm_hg   -0.338726
RH_out        -0.846254
Windspeed      0.790130
Visibility     0.157975
Tdewpoint      0.239017
NSM           -0.000670
dtype: float64 
Soma: 12.724442034537741


print(dtProcessado_IQR_normalizado[quantitativas].kurtosis(),\
      '\nSoma:', sum(abs(dtProcessado_IQR_normalizado[quantitativas].kurtosis())))

Appliances     0.259590
lights         4.462147
T1            -0.061093
RH_1          -0.201816
T2            -0.012872
RH_2           0.041570
T3            -0.212381
RH_3          -0.594909
T4            -0.096892
RH_4          -0.613967
T5             0.039439
RH_5           0.111175
T6             0.005378
RH_6          -1.142064
T7            -0.461242
RH_7          -0.562041
T8            -0.185979
RH_8          -0.489210
T9            -0.324625
RH_9          -0.411121
T_out         -0.051862
Press_mm_hg   -0.205165
RH_out        -0.080575
Windspeed     -0.049887
Visibility    -0.416524
Tdewpoint     -0.125523
NSM           -1.200156
dtype: float64 
Soma: 12.41920323799972


hist_individual(dtProcessado_IQR_normalizado, quantitativas[0:9])


hist_individual(dtProcessado_IQR_normalizado, quantitativas[9:18])


hist_individual(dtProcessado_IQR_normalizado, quantitativas[18:27])


boxplot_individuais(dtProcessado_IQR_normalizado, quantitativas[0:9])


boxplot_individuais(dtProcessado_IQR_normalizado, quantitativas[9:18])


boxplot_individuais(dtProcessado_IQR_normalizado, quantitativas[18:27])


# Every Belgian public holiday of 2016 as (date, name) pairs
feriados = list(Belgium(years = [2016]).items())


# Convert to a DataFrame, naming the columns at construction time
dtferiados = pd.DataFrame(feriados, columns = ['data', 'feriado'])


dtferiados.head()


# Criar uma copia do dataset original para recuperar a coluna 'date', desconsiderando horario
dtTemp = dtFull.copy()
dtTemp['date'] = pd.to_datetime(dtTemp['date'], format='%Y-%m-%d %H:%M:%S').dt.date


def isHoliday(row):
    """Return 1 when the row's date is a listed holiday, otherwise 0.

    Args:
        row: Series or mapping with a 'date' entry that can be compared for
            equality with the values of the module-level
            ``dtferiados['data']`` column.

    Returns:
        int: 1 if any value of ``dtferiados['data']`` equals ``row['date']``,
        else 0.
    """
    # A single vectorised comparison against the holiday column replaces the
    # row-by-row DataFrame.apply and sum of the same equality test.
    return int((dtferiados['data'] == row['date']).any())


# Preenche a coluna feriados do dataframe temporario
dtTemp['Holiday'] = dtTemp.apply(isHoliday, axis = 1)


# Copia a coluna de feriados do dataframe temporario para o novo
dtProcessado_incremento = dtProcessado_IQR_normalizado.copy()
dtProcessado_incremento['Holiday'] = dtTemp['Holiday'].copy().values


dtProcessado_incremento.head()


dtFinal = dtProcessado_incremento.drop('lights', axis = 1)


dtFinal.columns

Index(['Appliances', 'T1', 'RH_1', 'T2', 'RH_2', 'T3', 'RH_3', 'T4', 'RH_4',
       'T5', 'RH_5', 'T6', 'RH_6', 'T7', 'RH_7', 'T8', 'RH_8', 'T9', 'RH_9',
       'T_out', 'Press_mm_hg', 'RH_out', 'Windspeed', 'Visibility',
       'Tdewpoint', 'NSM', 'Weekend', 'Day_of_week', 'Month', 'Day', 'Hour',
       'Holiday'],
      dtype='object')


X_fs = dtFinal.drop(['Appliances'], axis = 1)
y_fs = dtFinal['Appliances'].values


# Tie the forest to the notebook seed so the selected features do not depend
# on how many random draws earlier cells consumed.
seleciona_fs = SelectFromModel(RandomForestRegressor(random_state = seed_))
seleciona_fs.fit(X_fs, y_fs)

SelectFromModel(estimator=RandomForestRegressor())


variaveis = X_fs.columns[seleciona_fs.get_support()]


print(variaveis)

Index(['T3', 'RH_3', 'T8', 'Press_mm_hg', 'NSM', 'Hour'], dtype='object')


# Tie the forest to the notebook seed so the feature importances do not
# depend on how many random draws earlier cells consumed.
modelo_fs_v1 = RandomForestRegressor(random_state = seed_)
modelo_fs_v1.fit(X_fs, y_fs)

RandomForestRegressor()


index_ordenado_fs_v1 = modelo_fs_v1.feature_importances_.argsort()


plt.barh(dtFinal.drop(['Appliances'], axis = 1).columns[index_ordenado_fs_v1],\
         modelo_fs_v1.feature_importances_[index_ordenado_fs_v1])

<BarContainer object of 31 artists>


# Função para calcular o RMSE
def rmse_cv(modelo, x, y, cv = 5):
    """Return the cross-validated RMSE of a model, one value per fold.

    Parameters
    ----------
    modelo : estimator
        Estimator handed to ``cross_val_score``.
    x, y :
        Predictors and target passed through to ``cross_val_score``.
    cv : optional
        Cross-validation setting forwarded to ``cross_val_score``; defaults
        to 5, the value that used to be hard-coded.

    Returns
    -------
    numpy.ndarray
        Square root of the negated "neg_mean_squared_error" score of each fold.
    """
    mse_negativo = cross_val_score(modelo,
                                   x,
                                   y,
                                   scoring = "neg_mean_squared_error",
                                   cv = cv)

    # The scorer reports the MSE negated, so flip the sign before the root
    return np.sqrt(-mse_negativo)


# Build a LASSO model whose alpha is picked by LassoCV from an explicit list, and fit it on the full data
# NOTE(review): the recorded output of this cell shows "Objective did not converge" warnings;
# a larger max_iter may be needed - confirm
modelo_fs_v2 = LassoCV(alphas = [10, 1, 0.1, 0.01, 0.001])
modelo_fs_v2.fit(X_fs, y_fs)

Objective did not converge. You might want to increase the number of iterations. Duality gap: 4357.590548660606, tolerance: 2759.9261819103117
Objective did not converge. You might want to increase the number of iterations. Duality gap: 3480847.6415894013, tolerance: 2759.9261819103117
Objective did not converge. You might want to increase the number of iterations. Duality gap: 76205.56021223217, tolerance: 2889.6108006080567
Objective did not converge. You might want to increase the number of iterations. Duality gap: 3623297.658385303, tolerance: 2889.6108006080567
Objective did not converge. You might want to increase the number of iterations. Duality gap: 2597637.269967463, tolerance: 2990.5874187674194
Objective did not converge. You might want to increase the number of iterations. Duality gap: 186628.33479275554, tolerance: 2949.7745317646313
Objective did not converge. You might want to increase the number of iterations. Duality gap: 7161.319000288844, tolerance: 2977.9247352419566
Objective did not converge. You might want to increase the number of iterations. Duality gap: 3834942.145448884, tolerance: 2977.9247352419566

LassoCV(alphas=[10, 1, 0.1, 0.01, 0.001])


# Calculando RMSE de todos os CV
rmse = rmse_cv(modelo_fs_v2, X_fs, y_fs)

Objective did not converge. You might want to increase the number of iterations. Duality gap: 60265.489504825324, tolerance: 2121.555612826604
Objective did not converge. You might want to increase the number of iterations. Duality gap: 3449080.197078473, tolerance: 2121.555612826604
Objective did not converge. You might want to increase the number of iterations. Duality gap: 2848743.487422847, tolerance: 2242.5050556215356
Objective did not converge. You might want to increase the number of iterations. Duality gap: 1832131.9116363432, tolerance: 2280.2716708234366
Objective did not converge. You might want to increase the number of iterations. Duality gap: 299376.41998958215, tolerance: 2149.791022880215
Objective did not converge. You might want to increase the number of iterations. Duality gap: 25525.858414947987, tolerance: 2245.414767635184
Objective did not converge. You might want to increase the number of iterations. Duality gap: 2690454.878200626, tolerance: 2245.414767635184
Objective did not converge. You might want to increase the number of iterations. Duality gap: 3351978.5385637432, tolerance: 2149.406833135393
Objective did not converge. You might want to increase the number of iterations. Duality gap: 101437.03946716711, tolerance: 2342.3896785431516
Objective did not converge. You might want to increase the number of iterations. Duality gap: 3123800.4188147625, tolerance: 2342.3896785431516
Objective did not converge. You might want to increase the number of iterations. Duality gap: 9394.365452123806, tolerance: 2409.9487112034844
Objective did not converge. You might want to increase the number of iterations. Duality gap: 3098391.152376443, tolerance: 2409.9487112034844
Objective did not converge. You might want to increase the number of iterations. Duality gap: 2204370.918308871, tolerance: 2279.221212097221
Objective did not converge. You might want to increase the number of iterations. Duality gap: 130502.78012827411, tolerance: 2375.086999445808
Objective did not converge. You might want to increase the number of iterations. Duality gap: 3235366.746466916, tolerance: 2375.086999445808
Objective did not converge. You might want to increase the number of iterations. Duality gap: 4322.736883841455, tolerance: 2252.7703768804445
Objective did not converge. You might want to increase the number of iterations. Duality gap: 2852729.1123774685, tolerance: 2252.7703768804445
Objective did not converge. You might want to increase the number of iterations. Duality gap: 13944.324127260596, tolerance: 2385.0320207838477
Objective did not converge. You might want to increase the number of iterations. Duality gap: 3126175.968297707, tolerance: 2385.0320207838477
Objective did not converge. You might want to increase the number of iterations. Duality gap: 1318783.2841035128, tolerance: 2468.0057761282656
Objective did not converge. You might want to increase the number of iterations. Duality gap: 831739.0762900505, tolerance: 2379.241208138707
Objective did not converge. You might want to increase the number of iterations. Duality gap: 2964071.2282052245, tolerance: 2475.7422191433775
Objective did not converge. You might want to increase the number of iterations. Duality gap: 394906.24633935466, tolerance: 2344.238533056215
Objective did not converge. You might want to increase the number of iterations. Duality gap: 14765.361982159317, tolerance: 2396.1292794932697
Objective did not converge. You might want to increase the number of iterations. Duality gap: 2805997.0434392374, tolerance: 2410.4928865489665
Objective did not converge. You might want to increase the number of iterations. Duality gap: 216432.8311970476, tolerance: 2435.0306060486105
Objective did not converge. You might want to increase the number of iterations. Duality gap: 89341.5997652132, tolerance: 2239.5895979809975
Objective did not converge. You might want to increase the number of iterations. Duality gap: 2962815.7126216628, tolerance: 2239.5895979809975
Objective did not converge. You might want to increase the number of iterations. Duality gap: 157369.46706711873, tolerance: 2372.389842438638
Objective did not converge. You might want to increase the number of iterations. Duality gap: 3273712.9955270756, tolerance: 2372.389842438638
Objective did not converge. You might want to increase the number of iterations. Duality gap: 76137.53736353107, tolerance: 2424.292300277118
Objective did not converge. You might want to increase the number of iterations. Duality gap: 2998396.3357756846, tolerance: 2424.292300277118
Objective did not converge. You might want to increase the number of iterations. Duality gap: 286455.2422223259, tolerance: 2462.249401076716
Objective did not converge. You might want to increase the number of iterations. Duality gap: 1753242.459048465, tolerance: 2462.249401076716


# Print valor medio, maximo, minimo
print(rmse.mean(), max(rmse), min(rmse))

37.20205195968503 41.53525490236213 34.49627699946396


# Coeficientes LASSO
coef = pd.Series(modelo_fs_v2.coef_, index = X_fs.columns)


coef.sort_values().tail(15)

Day_of_week     0.478850
Visibility      0.485754
RH_5            0.613386
RH_6            0.798299
Hour            0.892201
Holiday         1.213774
RH_out          2.117558
Weekend         2.243103
Windspeed       2.301194
T4              2.349132
RH_3            3.780797
T8             12.106176
T6             17.054270
T3             20.129085
RH_1           26.180667
dtype: float64


# Plotando importancia das variaveis
imp_coef_fs = pd.concat([coef.sort_values().head(15), coef.sort_values().tail(15)])
matplotlib.rcParams['figure.figsize'] = (8.0, 10.0)
imp_coef_fs.plot(kind = "barh")
plt.title("Coeficientes Modelo LASSO")

Text(0.5, 1.0, 'Coeficientes Modelo LASSO')


# Criando modelo de SVM para regressão
modelo_v4 = LinearSVR(max_iter = 3000)
rfe = RFE(modelo_v4, n_features_to_select = 8)


# Treinando RFE
rfe.fit(X_fs, y_fs)

RFE(estimator=LinearSVR(max_iter=3000), n_features_to_select=8)


print('Features Selecionadas: %s' % rfe.support_)
print("Feature Ranking: %s" % rfe.ranking_)

Features Selecionadas: [ True  True False  True  True False False False False False False False
  True False  True  True  True False False False False False False False
 False False False False False False False]
Feature Ranking: [ 1  1  2  1  1 16 13 20 17 15  3 19  1 12  1  1  1  7 18 24  9 10 21  8
  4 23 14  6 22  5 11]


# Names of the columns kept by RFE: rfe.support_ is used directly as a boolean
# mask over X_fs.columns instead of an index loop comparing each flag to True
variaveis_v4 = list(X_fs.columns[rfe.support_])


print(variaveis_v4)

['T1', 'RH_1', 'RH_2', 'T3', 'T7', 'T8', 'RH_8', 'T9']


X_fs[variaveis_v4].head()


def avalia_modelo(modelo, x, y):
    """Print regression metrics of a fitted model on the given data.

    Parameters
    ----------
    modelo :
        Fitted estimator exposing ``predict``.
    x :
        Predictors passed to ``modelo.predict``.
    y :
        True target values; used as the divisor of the percentage error, so
        zeros in ``y`` make the MAPE-based figures non-finite.

    Returns
    -------
    None
        Results are only printed: R^2 (in percent), "Acuracia" (100 - MAPE),
        MAE, MSE and RMSE.
    """
    preds = modelo.predict(x)

    erros = abs(preds - y)
    mape = 100 * np.mean(erros / y)
    r2 = 100 * r2_score(y, preds)
    acuracia = 100 - mape
    mse = mean_squared_error(y, preds)
    mae = mean_absolute_error(y, preds)
    # RMSE is derived from the MSE instead of the `squared = False` keyword,
    # which newer scikit-learn releases deprecate and then remove
    rmse = np.sqrt(mse)

    print(modelo,'\n')
    print('R^2                 : {:0.2f}%' .format(r2))
    print('Acuracia            : {:0.2f}%'.format(acuracia))
    print('MAE                 : {:0.2f}'.format(mae))
    print('MSE                 : {:0.2f}'.format(mse))
    print('RMSE                : {:0.2f}\n'.format(rmse))


# Selecionando variaveis do RandomForestRegressor
X_sel_fs_v1 = X_fs[variaveis]


x_train, x_test, y_train, y_test = train_test_split(X_fs, y_fs, test_size = .3, random_state = seed_)


# Criando o modelo com todas variaveis
modelo_sel_fs_v1 = RandomForestRegressor()
modelo_sel_fs_v1.fit(x_train, y_train)

RandomForestRegressor()


avalia_modelo(modelo_sel_fs_v1, x_test, y_test)

RandomForestRegressor() 

R^2                 : 73.77%
Acuracia            : 79.46%
MAE                 : 14.20
MSE                 : 476.38
RMSE                : 21.83


x_train, x_test, y_train, y_test = train_test_split(X_sel_fs_v1, y_fs, test_size = .3, random_state = seed_)


# Build the model with the variables selected by the RandomForestRegressor
modelo_sel_fs_v2 = RandomForestRegressor()
modelo_sel_fs_v2.fit(x_train, y_train)

RandomForestRegressor()


avalia_modelo(modelo_sel_fs_v2, x_test, y_test)

RandomForestRegressor() 

R^2                 : 70.37%
Acuracia            : 78.15%
MAE                 : 15.21
MSE                 : 538.10
RMSE                : 23.20


i = 20
x_temp = x_test.iloc[i]
x_temp = pd.DataFrame(x_temp).T
y_temp = y_test[i]


pred = modelo_sel_fs_v2.predict(x_temp)


print('Previsto:', pred,'Real:', y_temp)

Previsto: [49.1] Real: 50.0


X_sel_fs_v2 = X_fs[['RH_1', 'T3', 'T6', 'T8', 'RH_3']]


x_train, x_test, y_train, y_test = train_test_split(X_fs, y_fs, test_size = .3, random_state = seed_)


# Criando modelo LASSO, com todas variaveis
modelo_sel_fs_v3 = LassoCV(alphas = [10, 1, 0.1, 0.01, 0.001])
modelo_sel_fs_v3.fit(x_train, y_train)

Objective did not converge. You might want to increase the number of iterations. Duality gap: 548945.3283650819, tolerance: 2054.5656972219717
Objective did not converge. You might want to increase the number of iterations. Duality gap: 2613726.3627101686, tolerance: 2033.1613442222433
Objective did not converge. You might want to increase the number of iterations. Duality gap: 2098964.046488886, tolerance: 2076.477421952764
Objective did not converge. You might want to increase the number of iterations. Duality gap: 11849.215182460845, tolerance: 2042.2981173649446
Objective did not converge. You might want to increase the number of iterations. Duality gap: 2025129.5642450713, tolerance: 2060.514712947883

LassoCV(alphas=[10, 1, 0.1, 0.01, 0.001])


avalia_modelo(modelo_sel_fs_v3, x_test, y_test)

LassoCV(alphas=[10, 1, 0.1, 0.01, 0.001]) 

R^2                 : 27.09%
Acuracia            : 59.58%
MAE                 : 26.93
MSE                 : 1323.89
RMSE                : 36.39


x_train, x_test, y_train, y_test = train_test_split(X_sel_fs_v2, y_fs, test_size = .3, random_state = seed_)


# Criando modelo LASSO, com variaveis selecionadas
modelo_sel_fs_v4 = LassoCV(alphas = [1, 0.1, 0.001, 0.0005])
modelo_sel_fs_v4.fit(x_train, y_train)

LassoCV(alphas=[1, 0.1, 0.001, 0.0005])


avalia_modelo(modelo_sel_fs_v4, x_test, y_test)

LassoCV(alphas=[1, 0.1, 0.001, 0.0005]) 

R^2                 : 4.52%
Acuracia            : 51.10%
MAE                 : 32.33
MSE                 : 1733.83
RMSE                : 41.64


# Keep only the variables selected by RFE over LinearSVR (variaveis_v4)
X_sel_fs_v3 = X_fs[variaveis_v4]


x_train, x_test, y_train, y_test = train_test_split(X_fs, y_fs, test_size = .3, random_state = seed_)


# Criando o modelo com todas variaveis
modelo_sel_fs_v3 = LinearSVR(max_iter = 3000)
modelo_sel_fs_v3.fit(x_train, y_train)

LinearSVR(max_iter=3000)


avalia_modelo(modelo_sel_fs_v3, x_test, y_test)

LinearSVR(max_iter=3000) 

R^2                 : 20.54%
Acuracia            : 67.07%
MAE                 : 25.50
MSE                 : 1442.97
RMSE                : 37.99


# Re-split using only the RFE-selected columns (X_sel_fs_v3)
x_train, x_test, y_train, y_test = train_test_split(X_sel_fs_v3, y_fs, test_size = .3, random_state = seed_)


# Build the LinearSVR model on the selected variables only
# NOTE: the name modelo_sel_fs_v3 is reused, replacing the model previously bound to it
modelo_sel_fs_v3 = LinearSVR(max_iter = 3000)
modelo_sel_fs_v3.fit(x_train, y_train)

LinearSVR(max_iter=3000)


avalia_modelo(modelo_sel_fs_v3, x_test, y_test)

LinearSVR(max_iter=3000) 

R^2                 : 17.33%
Acuracia            : 66.91%
MAE                 : 25.96
MSE                 : 1501.27
RMSE                : 38.75


# Separando em variaveis preditivas e target 
#X = dtFinal[variaveis]
X = dtFinal[['T3', 'RH_3', 'T8', 'Press_mm_hg', 'NSM', 'Hour']]
y = dtFinal['Appliances'].values


X.head()

y

array([ 60.,  60.,  50., ..., 100., 100., 175.])


# Separando em treino e teste
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = seed_)


def reportModeloRegressao(modelo, x_teste, y_teste, x_treino = None, y_treino = None, report_treino = False):
    """Print regression metrics and a residual plot for a fitted model.

    Parameters
    ----------
    modelo :
        Fitted estimator exposing ``predict``.
    x_teste, y_teste :
        Test predictors (two-dimensional, exposing ``shape``) and true targets.
    x_treino, y_treino : optional
        Training predictors and targets; only read when ``report_treino`` is
        true. ``None`` (the default) or empty inputs produce a message instead
        of a report.
    report_treino : bool
        When true, the same metrics are also reported for the training data.

    Returns
    -------
    None
        Everything is printed / plotted.
    """
    print(modelo,'\n')
    print('Dados de teste')
    _imprimeMetricasRegressao(modelo, x_teste, y_teste)

    if report_treino:
        print('Dados de treino')

        # np.shape also copes with plain lists, so a missing or empty training
        # set is reported instead of raising on a missing `.shape` attribute
        tem_treino = (
            x_treino is not None
            and y_treino is not None
            and len(np.shape(x_treino)) == 2
            and np.shape(x_treino)[1] > 0
            and len(y_treino) > 0
        )

        if tem_treino:
            _imprimeMetricasRegressao(modelo, x_treino, y_treino)
        else:
            print('X_treino e/ou y_treino possuem tamanho 0.')


def _imprimeMetricasRegressao(modelo, x, y):
    """Print the metric table and draw the residual plot for one dataset."""
    y_pred = modelo.predict(x)

    n_obs = len(y)
    n_vars = x.shape[1]

    erros = abs(y_pred - y)
    # Percentage error divides by the true values: zeros in y make it non-finite
    mape = 100 * np.mean(erros / y)
    r2 = r2_score(y, y_pred)
    # Adjusted R^2 is computed on the fractional R^2 and scaled to percent
    # only when printed; feeding it the percentage gives a meaningless value
    r2_ajustado = 1 - (1 - r2) * (n_obs - 1) / (n_obs - n_vars - 1)
    acuracia = 100 - mape
    mse = mean_squared_error(y, y_pred)
    mae = mean_absolute_error(y, y_pred)
    # Derived from the MSE instead of the `squared = False` keyword, which
    # newer scikit-learn releases deprecate and then remove
    rmse = np.sqrt(mse)

    print('R^2                 : {:0.2f}%' .format(100 * r2))
    print('R^2 Ajustado        : {:0.2f}%' .format(100 * r2_ajustado))
    print('Acuracia            : {:0.2f}%'.format(acuracia))
    print('MAE                 : {:0.2f}'.format(mae))
    print('MSE                 : {:0.2f}'.format(mse))
    print('RMSE                : {:0.2f}\n'.format(rmse))

    # Absolute residuals on the x axis against the predictions on the y axis
    plt.scatter(erros, y_pred)
    plt.xlabel('Residuos')
    plt.ylabel('Previsto')
    plt.show()


def treinaRegressao_GridSearchCV(modelo, params_, x_treino, y_treino, x_teste, y_teste,\
                        n_jobs = -1, cv = 5, refit = True, scoring = None, salvar_resultados = False,\
                       report_treino = False, retorna_modelo = False):
    """Run a grid search, report the best estimator and optionally return results.

    Parameters
    ----------
    modelo :
        Estimator handed to ``GridSearchCV``.
    params_ :
        Parameter grid handed to ``GridSearchCV``.
    x_treino, y_treino :
        Data the grid search is fitted on.
    x_teste, y_teste :
        Data the best estimator is reported on.
    n_jobs, cv, refit, scoring :
        Forwarded unchanged to ``GridSearchCV``.
    salvar_resultados : bool
        When true, ``grid.cv_results_`` is returned as a DataFrame.
    report_treino : bool
        Forwarded to ``reportModeloRegressao`` to also report training data.
    retorna_modelo : bool
        When true, the best estimator is returned.

    Returns
    -------
    ``(resultados_df, modelo_)`` when both flags are set, ``resultados_df``
    when only ``salvar_resultados`` is set, ``modelo_`` when only
    ``retorna_modelo`` is set, otherwise ``None``.
    """
    grid = GridSearchCV(modelo, params_, n_jobs = n_jobs, cv = cv, refit = refit, scoring = scoring)

    grid.fit(x_treino, y_treino)
    modelo_ = grid.best_estimator_

    print(grid.best_params_)

    reportModeloRegressao(modelo_, x_teste, y_teste, x_treino, y_treino, report_treino)

    if salvar_resultados:
        resultados_df = pd.DataFrame(grid.cv_results_)

        if retorna_modelo:
            return resultados_df, modelo_

        # This branch used to evaluate the frame without returning it,
        # so callers asking for the results received None
        return resultados_df

    if retorna_modelo:
        return modelo_

    return None


# Modelo base do algoritmo SVM para regressão
modelo_svr = SVR(max_iter = -1)
modelo_svr.fit(x_train, y_train)

SVR()


reportModeloRegressao(modelo_svr, x_test, y_test, x_train, y_train, True)

SVR() 

Dados de teste
R^2                 : 20.75%
R^2 Ajustado        : 20.77%
Acuracia            : 67.23%
MAE                 : 25.13
MSE                 : 1439.09
RMSE                : 37.94

Dados de treino
SVR() 

Dados de teste
R^2                 : 20.72%
R^2 Ajustado        : 20.72%
Acuracia            : 67.57%
MAE                 : 25.55
MSE                 : 1473.20
RMSE                : 38.38


%%time

params = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': [0.9, 1.0, 1.1],
    'gamma': ['scale', 'auto']
}

# Criação de modelo intenso 01
modelo = SVR(max_iter = -1, cache_size = 1000)
treinaRegressao_GridSearchCV(modelo, params, x_train, y_train, x_test, y_test, scoring = 'neg_root_mean_squared_error',\
                            report_treino = True)

{'C': 1.1, 'gamma': 'auto', 'kernel': 'rbf'}
SVR(C=1.1, cache_size=1000, gamma='auto') 

Dados de teste
R^2                 : 29.20%
R^2 Ajustado        : 29.23%
Acuracia            : 70.64%
MAE                 : 22.95
MSE                 : 1285.61
RMSE                : 35.86

Dados de treino
SVR(C=1.1, cache_size=1000, gamma='auto') 

Dados de teste
R^2                 : 29.90%
R^2 Ajustado        : 29.92%
Acuracia            : 71.75%
MAE                 : 22.92
MSE                 : 1302.48
RMSE                : 36.09

Wall time: 4min 59s


%%time

params = {
    'kernel': ['rbf'],
    'C': [0.001, 0.1, 1.0, 10, 100],
    'gamma': ['auto']
}

# Criação de modelo intenso 02
modelo = SVR(max_iter = -1, cache_size = 1000)
treinaRegressao_GridSearchCV(modelo, params, x_train, y_train, x_test, y_test, scoring = 'neg_root_mean_squared_error',\
                            report_treino = True)

{'C': 100, 'gamma': 'auto', 'kernel': 'rbf'}
SVR(C=100, cache_size=1000, gamma='auto') 

Dados de teste
R^2                 : 42.47%
R^2 Ajustado        : 42.51%
Acuracia            : 72.61%
MAE                 : 20.77
MSE                 : 1044.65
RMSE                : 32.32

Dados de treino
SVR(C=100, cache_size=1000, gamma='auto') 

Dados de teste
R^2                 : 45.52%
R^2 Ajustado        : 45.54%
Acuracia            : 74.85%
MAE                 : 19.83
MSE                 : 1012.21
RMSE                : 31.82

Wall time: 4min 21s


%%time

params = {
    'kernel': ['rbf'],
    'C': [0.1, 1.0, 10, 100, 1000, 10000],
    'gamma': ['auto']
}

# Criação de modelo intenso 03
modelo = SVR(max_iter = -1, cache_size = 1000)
treinaRegressao_GridSearchCV(modelo, params, x_train, y_train, x_test, y_test, scoring = 'neg_root_mean_squared_error',\
                            report_treino = True)

{'C': 10000, 'gamma': 'auto', 'kernel': 'rbf'}
SVR(C=10000, cache_size=1000, gamma='auto') 

Dados de teste
R^2                 : 51.14%
R^2 Ajustado        : 51.19%
Acuracia            : 74.36%
MAE                 : 18.84
MSE                 : 887.18
RMSE                : 29.79

Dados de treino
SVR(C=10000, cache_size=1000, gamma='auto') 

Dados de teste
R^2                 : 59.99%
R^2 Ajustado        : 60.02%
Acuracia            : 79.48%
MAE                 : 15.77
MSE                 : 743.38
RMSE                : 27.26

Wall time: 11min 42s


%%time

params = {
    'kernel': ['rbf'],
    'C': [500, 1000, 2000],
    'gamma': ['auto']
}

# Criação de modelo intenso 04
modelo = SVR(max_iter = -1, cache_size = 1000)
treinaRegressao_GridSearchCV(modelo, params, x_train, y_train, x_test, y_test, scoring = 'neg_root_mean_squared_error',\
                            report_treino = True)

{'C': 2000, 'gamma': 'auto', 'kernel': 'rbf'}
SVR(C=2000, cache_size=1000, gamma='auto') 

Dados de teste
R^2                 : 48.71%
R^2 Ajustado        : 48.76%
Acuracia            : 73.85%
MAE                 : 19.37
MSE                 : 931.35
RMSE                : 30.52

Dados de treino
SVR(C=2000, cache_size=1000, gamma='auto') 

Dados de teste
R^2                 : 54.99%
R^2 Ajustado        : 55.01%
Acuracia            : 77.76%
MAE                 : 17.20
MSE                 : 836.42
RMSE                : 28.92

Wall time: 2min 45s


%%time
# Modelo 05
modelo_svr_v5 = SVR(max_iter = -1, cache_size = 1000, kernel = 'rbf', C = 10000, gamma = 'auto')
modelo_svr_v5.fit(x_train, y_train)

reportModeloRegressao(modelo_svr_v5, x_test, y_test, x_train, y_train, True)

SVR(C=10000, cache_size=1000, gamma='auto') 

Dados de teste
R^2                 : 51.14%
R^2 Ajustado        : 51.19%
Acuracia            : 74.36%
MAE                 : 18.84
MSE                 : 887.18
RMSE                : 29.79

Dados de treino
SVR(C=10000, cache_size=1000, gamma='auto') 

Dados de teste
R^2                 : 59.99%
R^2 Ajustado        : 60.02%
Acuracia            : 79.48%
MAE                 : 15.77
MSE                 : 743.38
RMSE                : 27.26

Wall time: 6min 32s


%%time
# Modelo 06
modelo_svr_v6 = SVR(max_iter = -1, cache_size = 1000, kernel = 'rbf', C = 10000, gamma = 1) # gamma = 'auto' = 0.166
modelo_svr_v6.fit(x_train, y_train)

reportModeloRegressao(modelo_svr_v6, x_test, y_test, x_train, y_train, True)

SVR(C=10000, cache_size=1000, gamma=1) 

Dados de teste
R^2                 : 56.53%
R^2 Ajustado        : 56.58%
Acuracia            : 76.33%
MAE                 : 17.05
MSE                 : 789.46
RMSE                : 28.10

Dados de treino
SVR(C=10000, cache_size=1000, gamma=1) 

Dados de teste
R^2                 : 87.16%
R^2 Ajustado        : 87.20%
Acuracia            : 90.98%
MAE                 : 6.55
MSE                 : 238.58
RMSE                : 15.45

Wall time: 3min 31s


%%time
# Modelo 07
modelo_svr_v7 = SVR(max_iter = -1, cache_size = 1000, kernel = 'rbf', C = 10000, gamma = 3) # gamma = 'auto' = 0.166
modelo_svr_v7.fit(x_train, y_train)

reportModeloRegressao(modelo_svr_v7, x_test, y_test, x_train, y_train, True)

SVR(C=10000, cache_size=1000, gamma=3) 

Dados de teste
R^2                 : 46.41%
R^2 Ajustado        : 46.46%
Acuracia            : 73.57%
MAE                 : 19.00
MSE                 : 973.11
RMSE                : 31.19

Dados de treino
SVR(C=10000, cache_size=1000, gamma=3) 

Dados de teste
R^2                 : 94.80%
R^2 Ajustado        : 94.84%
Acuracia            : 95.87%
MAE                 : 2.98
MSE                 : 96.68
RMSE                : 9.83

Wall time: 2min 57s


%%time
# Modelo 08
modelo_svr_v8 = SVR(max_iter = -1, cache_size = 1000, kernel = 'rbf', C = 10000, gamma = 0.5) # gamma = 'auto' = 0.166
modelo_svr_v8.fit(x_train, y_train)

reportModeloRegressao(modelo_svr_v8, x_test, y_test, x_train, y_train, True)

SVR(C=10000, cache_size=1000, gamma=0.5) 

Dados de teste
R^2                 : 56.06%
R^2 Ajustado        : 56.12%
Acuracia            : 76.01%
MAE                 : 17.26
MSE                 : 797.83
RMSE                : 28.25

Dados de treino
SVR(C=10000, cache_size=1000, gamma=0.5) 

Dados de teste
R^2                 : 78.26%
R^2 Ajustado        : 78.29%
Acuracia            : 86.84%
MAE                 : 9.83
MSE                 : 404.01
RMSE                : 20.10

Wall time: 5min 40s


%%time
# Modelo Final
modelo_svr_final = SVR(max_iter = -1, cache_size = 1000, kernel = 'rbf', C = 10000, gamma = 0.5)
modelo_svr_final.fit(x_train, y_train)

reportModeloRegressao(modelo_svr_final, x_test, y_test, x_train, y_train, True)

SVR(C=10000, cache_size=1000, gamma=0.5) 

Dados de teste
R^2                 : 56.06%
R^2 Ajustado        : 56.12%
Acuracia            : 76.01%
MAE                 : 17.26
MSE                 : 797.83
RMSE                : 28.25

Dados de treino
SVR(C=10000, cache_size=1000, gamma=0.5) 

Dados de teste
R^2                 : 78.26%
R^2 Ajustado        : 78.29%
Acuracia            : 86.84%
MAE                 : 9.83
MSE                 : 404.01
RMSE                : 20.10

Wall time: 5min 34s


shap.initjs()


# Build the SHAP explainer from the final SVR's predict function, with x_train as its second argument,
# and compute SHAP values only for the first `amostras` rows of x_test
amostras = 20
explainer = shap.Explainer(modelo_svr_final.predict, x_train)
shap_values = explainer(x_test[:amostras])

Exact explainer: 21it [02:09,  6.80s/it]


# Waterfall Predição 0
shap.plots.waterfall(shap_values[0])


# Waterfall Predição 10
shap.plots.waterfall(shap_values[10])


# Force Predição 0
shap.plots.force(shap_values[0])


# Force Predição 10
shap.plots.force(shap_values[10])


# Summary Plot
shap.summary_plot(shap_values, x_test[:amostras])


# Split the training set again into training (80%) and validation subsets;
# x_train / y_train are rebound to the reduced training part
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, train_size = 0.8, random_state = seed_)


# Positions of the columns whose dtype is not float, later passed as cat_features.
# The builtin float replaces np.float, which was only an alias of it and
# is no longer available in recent NumPy releases
categorical_features_index = np.where(x_train.dtypes != float)[0]


%%time
# Baseline CatBoost regressor, trained and evaluated with RMSE
modelo_cat = CatBoostRegressor(loss_function = 'RMSE', eval_metric = 'RMSE', random_seed = seed_)

# Fit on the reduced training split; the validation split is supplied as eval_set
# and the non-float columns are declared as categorical features
modelo_cat.fit(x_train, y_train,
               cat_features = categorical_features_index,
               eval_set = (x_val, y_val),
               plot = True, verbose = False);

Wall time: 26.2 s

<catboost.core.CatBoostRegressor at 0x22186b5b3d0>


reportModeloRegressao(modelo_cat, x_test, y_test, x_train, y_train, True)

<catboost.core.CatBoostRegressor object at 0x0000019A8750DF40> 

Dados de teste
R^2                 : 58.77%
R^2 Ajustado        : 58.83%
Acuracia            : 71.99%
MAE                 : 19.25
MSE                 : 748.66
RMSE                : 27.36

Dados de treino
<catboost.core.CatBoostRegressor object at 0x0000019A8750DF40> 

Dados de teste
R^2                 : 72.08%
R^2 Ajustado        : 72.11%
Acuracia            : 76.99%
MAE                 : 16.04
MSE                 : 518.16
RMSE                : 22.76


%%time
# CatBoost model 01: up to 5000 iterations with the iteration-based overfitting detector (od_wait = 20)
# NOTE: the recorded output of this cell warns that metric_period is ignored for the
# evaluation metric while the overfitting detector is active
modelo_cat_v1 = CatBoostRegressor(loss_function = 'RMSE', eval_metric = 'RMSE', random_seed = seed_,\
                                 iterations = 5000, metric_period = 50, od_type = 'Iter', od_wait = 20)

# Same training / validation inputs as the baseline model, with per-iteration logging enabled
modelo_cat_v1.fit(x_train, y_train,
               cat_features = categorical_features_index,
               eval_set = (x_val, y_val),
               plot = True, verbose = True);

Learning rate set to 0.026621
0:	learn: 42.7399292	test: 42.8854326	best: 42.8854326 (0)	total: 179ms	remaining: 14m 54s

Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.

50:	learn: 35.2272567	test: 35.4326424	best: 35.4326424 (50)	total: 1.47s	remaining: 2m 22s
100:	learn: 33.5706843	test: 33.9646591	best: 33.9646591 (100)	total: 2.71s	remaining: 2m 11s
150:	learn: 32.7297557	test: 33.3181609	best: 33.3181609 (150)	total: 3.92s	remaining: 2m 6s
200:	learn: 32.0742833	test: 32.8320736	best: 32.8320736 (200)	total: 5.2s	remaining: 2m 4s
250:	learn: 31.5634411	test: 32.4852863	best: 32.4852863 (250)	total: 6.37s	remaining: 2m
300:	learn: 31.0755939	test: 32.1319941	best: 32.1319941 (300)	total: 7.58s	remaining: 1m 58s
350:	learn: 30.6078842	test: 31.7580385	best: 31.7580385 (350)	total: 8.81s	remaining: 1m 56s
400:	learn: 30.1832102	test: 31.4771442	best: 31.4771442 (400)	total: 10s	remaining: 1m 55s
450:	learn: 29.7629514	test: 31.1861149	best: 31.1861149 (450)	total: 11.3s	remaining: 1m 53s
500:	learn: 29.3681044	test: 30.9204577	best: 30.9204577 (500)	total: 12.5s	remaining: 1m 52s
550:	learn: 29.0139188	test: 30.7156731	best: 30.7156731 (550)	total: 13.7s	remaining: 1m 50s
600:	learn: 28.7193385	test: 30.5441187	best: 30.5441187 (600)	total: 15s	remaining: 1m 49s
650:	learn: 28.4072268	test: 30.3651986	best: 30.3651986 (650)	total: 16.2s	remaining: 1m 48s
700:	learn: 28.1142650	test: 30.2026245	best: 30.2026245 (700)	total: 17.4s	remaining: 1m 46s
750:	learn: 27.8431292	test: 30.0424655	best: 30.0424655 (750)	total: 18.6s	remaining: 1m 45s
800:	learn: 27.5897884	test: 29.9134793	best: 29.9134793 (800)	total: 19.9s	remaining: 1m 44s
850:	learn: 27.3461986	test: 29.7877909	best: 29.7877909 (850)	total: 21.2s	remaining: 1m 43s
900:	learn: 27.1552133	test: 29.6891314	best: 29.6891314 (900)	total: 22.4s	remaining: 1m 41s
950:	learn: 26.9550873	test: 29.5969176	best: 29.5969176 (950)	total: 23.6s	remaining: 1m 40s
1000:	learn: 26.7461294	test: 29.4845496	best: 29.4845496 (1000)	total: 24.8s	remaining: 1m 39s
1050:	learn: 26.5485243	test: 29.3883071	best: 29.3883071 (1050)	total: 26s	remaining: 1m 37s
1100:	learn: 26.3684547	test: 29.2878618	best: 29.2878618 (1100)	total: 27.3s	remaining: 1m 36s
1150:	learn: 26.1956292	test: 29.1953511	best: 29.1953511 (1150)	total: 28.5s	remaining: 1m 35s
1200:	learn: 26.0155325	test: 29.0941099	best: 29.0941099 (1200)	total: 29.7s	remaining: 1m 33s
1250:	learn: 25.8589620	test: 29.0198995	best: 29.0198995 (1250)	total: 30.9s	remaining: 1m 32s
1300:	learn: 25.6978428	test: 28.9444081	best: 28.9444081 (1300)	total: 32.1s	remaining: 1m 31s
1350:	learn: 25.5433310	test: 28.8742446	best: 28.8737408 (1349)	total: 33.3s	remaining: 1m 30s
1400:	learn: 25.4058418	test: 28.8180423	best: 28.8180423 (1400)	total: 34.6s	remaining: 1m 28s
1450:	learn: 25.2513746	test: 28.7468077	best: 28.7468077 (1450)	total: 35.8s	remaining: 1m 27s
1500:	learn: 25.0966732	test: 28.6984761	best: 28.6984761 (1500)	total: 37s	remaining: 1m 26s
1550:	learn: 24.9577593	test: 28.6364573	best: 28.6364573 (1550)	total: 38.2s	remaining: 1m 25s
1600:	learn: 24.8202809	test: 28.5701005	best: 28.5699791 (1599)	total: 39.5s	remaining: 1m 23s
1650:	learn: 24.6738644	test: 28.4976716	best: 28.4976716 (1650)	total: 40.7s	remaining: 1m 22s
1700:	learn: 24.5546248	test: 28.4552892	best: 28.4536165 (1699)	total: 41.9s	remaining: 1m 21s
1750:	learn: 24.4242050	test: 28.4116079	best: 28.4116079 (1750)	total: 43.1s	remaining: 1m 19s
1800:	learn: 24.2951498	test: 28.3658246	best: 28.3658246 (1800)	total: 44.3s	remaining: 1m 18s
1850:	learn: 24.1714228	test: 28.3053960	best: 28.3053960 (1850)	total: 45.5s	remaining: 1m 17s
1900:	learn: 24.0690556	test: 28.2579377	best: 28.2577103 (1899)	total: 46.7s	remaining: 1m 16s
1950:	learn: 23.9626398	test: 28.2129195	best: 28.2129195 (1950)	total: 48s	remaining: 1m 15s
2000:	learn: 23.8536334	test: 28.1670292	best: 28.1670292 (2000)	total: 49.3s	remaining: 1m 13s
2050:	learn: 23.7518537	test: 28.1309189	best: 28.1308538 (2048)	total: 50.5s	remaining: 1m 12s
2100:	learn: 23.6406882	test: 28.0950074	best: 28.0950074 (2100)	total: 51.8s	remaining: 1m 11s
2150:	learn: 23.5356934	test: 28.0545858	best: 28.0540179 (2149)	total: 53s	remaining: 1m 10s
2200:	learn: 23.4433153	test: 28.0293441	best: 28.0293441 (2200)	total: 54.3s	remaining: 1m 9s
2250:	learn: 23.3362165	test: 27.9882062	best: 27.9882062 (2250)	total: 55.6s	remaining: 1m 7s
2300:	learn: 23.2375428	test: 27.9533397	best: 27.9533397 (2300)	total: 56.8s	remaining: 1m 6s
2350:	learn: 23.1363215	test: 27.9065672	best: 27.9062900 (2349)	total: 58s	remaining: 1m 5s
2400:	learn: 23.0383257	test: 27.8726044	best: 27.8726044 (2400)	total: 59.2s	remaining: 1m 4s
2450:	learn: 22.9435268	test: 27.8483518	best: 27.8483518 (2450)	total: 1m	remaining: 1m 2s
2500:	learn: 22.8494015	test: 27.8054676	best: 27.8054676 (2500)	total: 1m 1s	remaining: 1m 1s
2550:	learn: 22.7559982	test: 27.7718586	best: 27.7715578 (2549)	total: 1m 2s	remaining: 1m
2600:	learn: 22.6519917	test: 27.7359253	best: 27.7359253 (2600)	total: 1m 4s	remaining: 59.1s
2650:	learn: 22.5608799	test: 27.7111233	best: 27.7108088 (2649)	total: 1m 5s	remaining: 57.9s
2700:	learn: 22.4603293	test: 27.6731201	best: 27.6731201 (2700)	total: 1m 6s	remaining: 56.6s
2750:	learn: 22.3792869	test: 27.6484428	best: 27.6484428 (2750)	total: 1m 7s	remaining: 55.3s
2800:	learn: 22.2842086	test: 27.6119756	best: 27.6119756 (2800)	total: 1m 8s	remaining: 54.2s
2850:	learn: 22.1960326	test: 27.5897150	best: 27.5885119 (2839)	total: 1m 10s	remaining: 52.9s
2900:	learn: 22.1107462	test: 27.5649899	best: 27.5649899 (2900)	total: 1m 11s	remaining: 51.7s
2950:	learn: 22.0229458	test: 27.5361969	best: 27.5361969 (2950)	total: 1m 12s	remaining: 50.4s
3000:	learn: 21.9322296	test: 27.5008354	best: 27.5008354 (3000)	total: 1m 13s	remaining: 49.2s
3050:	learn: 21.8470409	test: 27.4732832	best: 27.4725475 (3049)	total: 1m 15s	remaining: 48s
3100:	learn: 21.7713221	test: 27.4619093	best: 27.4615471 (3099)	total: 1m 16s	remaining: 46.7s
3150:	learn: 21.6968766	test: 27.4340484	best: 27.4340484 (3150)	total: 1m 17s	remaining: 45.5s
3200:	learn: 21.6195097	test: 27.4095916	best: 27.4095916 (3200)	total: 1m 18s	remaining: 44.2s
3250:	learn: 21.5373746	test: 27.3811968	best: 27.3811968 (3250)	total: 1m 19s	remaining: 43s
3300:	learn: 21.4590591	test: 27.3513491	best: 27.3513491 (3300)	total: 1m 21s	remaining: 41.8s
3350:	learn: 21.3810091	test: 27.3339386	best: 27.3332570 (3349)	total: 1m 22s	remaining: 40.6s
3400:	learn: 21.3029288	test: 27.2995094	best: 27.2995094 (3400)	total: 1m 23s	remaining: 39.4s
3450:	learn: 21.2322078	test: 27.2725430	best: 27.2715559 (3449)	total: 1m 25s	remaining: 38.2s
3500:	learn: 21.1547929	test: 27.2480532	best: 27.2480532 (3500)	total: 1m 26s	remaining: 37s
3550:	learn: 21.0867766	test: 27.2276898	best: 27.2275792 (3549)	total: 1m 27s	remaining: 35.7s
3600:	learn: 21.0100341	test: 27.2052387	best: 27.2052387 (3600)	total: 1m 28s	remaining: 34.5s
3650:	learn: 20.9428807	test: 27.1776461	best: 27.1776461 (3650)	total: 1m 30s	remaining: 33.3s
3700:	learn: 20.8731928	test: 27.1635916	best: 27.1635916 (3700)	total: 1m 31s	remaining: 32s
3750:	learn: 20.8063211	test: 27.1456728	best: 27.1456728 (3750)	total: 1m 32s	remaining: 30.8s
3800:	learn: 20.7437928	test: 27.1196947	best: 27.1195129 (3799)	total: 1m 33s	remaining: 29.6s
3850:	learn: 20.6722372	test: 27.0940814	best: 27.0940814 (3850)	total: 1m 35s	remaining: 28.4s
3900:	learn: 20.6032396	test: 27.0731729	best: 27.0717360 (3898)	total: 1m 36s	remaining: 27.1s
3950:	learn: 20.5394726	test: 27.0554935	best: 27.0547202 (3947)	total: 1m 37s	remaining: 25.9s
4000:	learn: 20.4687535	test: 27.0338434	best: 27.0338434 (4000)	total: 1m 38s	remaining: 24.7s
4050:	learn: 20.4064699	test: 27.0040263	best: 27.0040263 (4050)	total: 1m 39s	remaining: 23.4s
4100:	learn: 20.3377657	test: 26.9812347	best: 26.9812347 (4100)	total: 1m 41s	remaining: 22.2s
4150:	learn: 20.2652081	test: 26.9617448	best: 26.9616127 (4149)	total: 1m 42s	remaining: 20.9s
4200:	learn: 20.2061480	test: 26.9433014	best: 26.9433014 (4200)	total: 1m 43s	remaining: 19.7s
4250:	learn: 20.1380626	test: 26.9238496	best: 26.9236590 (4248)	total: 1m 44s	remaining: 18.5s
4300:	learn: 20.0771845	test: 26.9039492	best: 26.9039492 (4300)	total: 1m 45s	remaining: 17.2s
4350:	learn: 20.0077619	test: 26.8918839	best: 26.8918839 (4350)	total: 1m 47s	remaining: 16s
4400:	learn: 19.9454403	test: 26.8725617	best: 26.8725617 (4400)	total: 1m 48s	remaining: 14.8s
4450:	learn: 19.8837919	test: 26.8502856	best: 26.8497545 (4449)	total: 1m 49s	remaining: 13.5s
4500:	learn: 19.8235619	test: 26.8383417	best: 26.8379712 (4497)	total: 1m 50s	remaining: 12.3s
4550:	learn: 19.7644419	test: 26.8265353	best: 26.8263419 (4549)	total: 1m 52s	remaining: 11.1s
4600:	learn: 19.7051956	test: 26.8030791	best: 26.8030791 (4600)	total: 1m 53s	remaining: 9.84s
4650:	learn: 19.6447935	test: 26.7915264	best: 26.7915264 (4650)	total: 1m 54s	remaining: 8.61s
4700:	learn: 19.5823976	test: 26.7757118	best: 26.7757118 (4700)	total: 1m 55s	remaining: 7.37s
4750:	learn: 19.5265463	test: 26.7637401	best: 26.7637401 (4750)	total: 1m 57s	remaining: 6.14s
4800:	learn: 19.4760040	test: 26.7457751	best: 26.7457751 (4800)	total: 1m 58s	remaining: 4.91s
4850:	learn: 19.4262597	test: 26.7346130	best: 26.7346130 (4850)	total: 1m 59s	remaining: 3.67s
4900:	learn: 19.3714094	test: 26.7233782	best: 26.7210904 (4892)	total: 2m	remaining: 2.44s
4950:	learn: 19.3174973	test: 26.7156317	best: 26.7154939 (4947)	total: 2m 1s	remaining: 1.21s
4999:	learn: 19.2633679	test: 26.7062911	best: 26.7062911 (4999)	total: 2m 3s	remaining: 0us

bestTest = 26.70629115
bestIteration = 4999

Wall time: 2min 3s

<catboost.core.CatBoostRegressor at 0x16e867d7520>


# Print the regression metrics of model v1. The trailing True presumably
# enables the extra training-set section seen in the recorded output
# (TODO confirm against reportModeloRegressao's signature).
# NOTE(review): in the recorded output the section under "Dados de treino"
# is labelled "Dados de teste" again; the helper likely prints a wrong
# label for the training metrics - verify.
reportModeloRegressao(modelo_cat_v1, x_test, y_test, x_train, y_train, True)

<catboost.core.CatBoostRegressor object at 0x0000016E867D7520> 

Dados de teste
R^2                 : 61.61%
R^2 Ajustado        : 61.67%
Acuracia            : 73.30%
MAE                 : 18.39
MSE                 : 697.12
RMSE                : 26.40

Dados de treino
<catboost.core.CatBoostRegressor object at 0x0000016E867D7520> 

Dados de teste
R^2                 : 79.27%
R^2 Ajustado        : 79.31%
Acuracia            : 79.92%
MAE                 : 13.86
MSE                 : 384.73
RMSE                : 19.61


%%time
# Model 03 - CatBoost regressor: up to 5000 iterations at learning rate 0.01,
# with the iteration-based overfitting detector (od_type='Iter', od_wait=20).
# The recorded output warns that metric_period is ignored for the evaluation
# metric while the overfitting detector is active.
modelo_cat_v3 = CatBoostRegressor(
    loss_function='RMSE',
    eval_metric='RMSE',
    random_seed=seed_,
    iterations=5000,
    metric_period=50,
    od_type='Iter',
    od_wait=20,
    learning_rate=0.01,
)

# Fit on the training split; the validation split is supplied as eval_set.
modelo_cat_v3.fit(
    x_train,
    y_train,
    cat_features=categorical_features_index,
    eval_set=(x_val, y_val),
    plot=True,
    verbose=False,
);

Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.

Wall time: 2min 2s

<catboost.core.CatBoostRegressor at 0x16e874eb190>


# Print the regression metrics of model v3; the trailing True presumably
# requests the training-set section as well (TODO confirm in the helper).
reportModeloRegressao(modelo_cat_v3, x_test, y_test, x_train, y_train, True)

<catboost.core.CatBoostRegressor object at 0x0000016E874EB190> 

Dados de teste
R^2                 : 56.83%
R^2 Ajustado        : 56.88%
Acuracia            : 71.24%
MAE                 : 19.72
MSE                 : 783.96
RMSE                : 28.00

Dados de treino
<catboost.core.CatBoostRegressor object at 0x0000016E874EB190> 

Dados de teste
R^2                 : 68.23%
R^2 Ajustado        : 68.26%
Acuracia            : 75.45%
MAE                 : 17.15
MSE                 : 589.61
RMSE                : 24.28


%%time
# Model 04 - CatBoost regressor: same configuration as model 03 but with a
# ten times larger learning rate (0.1 instead of 0.01).
modelo_cat_v4 = CatBoostRegressor(
    loss_function='RMSE',
    eval_metric='RMSE',
    random_seed=seed_,
    iterations=5000,
    metric_period=50,
    od_type='Iter',
    od_wait=20,
    learning_rate=0.1,
)

# Fit on the training split; the validation split is supplied as eval_set.
modelo_cat_v4.fit(
    x_train,
    y_train,
    cat_features=categorical_features_index,
    eval_set=(x_val, y_val),
    plot=True,
    verbose=False,
);

Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.

Wall time: 38.6 s

<catboost.core.CatBoostRegressor at 0x16e86436b50>


# Print the regression metrics of model v4; the trailing True presumably
# requests the training-set section as well (TODO confirm in the helper).
reportModeloRegressao(modelo_cat_v4, x_test, y_test, x_train, y_train, True)

<catboost.core.CatBoostRegressor object at 0x0000016E86436B50> 

Dados de teste
R^2                 : 62.32%
R^2 Ajustado        : 62.39%
Acuracia            : 73.58%
MAE                 : 18.19
MSE                 : 684.17
RMSE                : 26.16

Dados de treino
<catboost.core.CatBoostRegressor object at 0x0000016E86436B50> 

Dados de teste
R^2                 : 81.40%
R^2 Ajustado        : 81.44%
Acuracia            : 80.86%
MAE                 : 13.17
MSE                 : 345.16
RMSE                : 18.58


%%time
# Model 05 - CatBoost regressor: learning rate 0.01 again, but with the
# iteration budget raised to 20000.
modelo_cat_v5 = CatBoostRegressor(
    loss_function='RMSE',
    eval_metric='RMSE',
    random_seed=seed_,
    iterations=20000,
    metric_period=50,
    od_type='Iter',
    od_wait=20,
    learning_rate=0.01,
)

# Fit on the training split; the validation split is supplied as eval_set.
modelo_cat_v5.fit(
    x_train,
    y_train,
    cat_features=categorical_features_index,
    eval_set=(x_val, y_val),
    plot=True,
    verbose=False,
);

# Report metrics in the same cell so the timing covers fit + evaluation.
reportModeloRegressao(modelo_cat_v5, x_test, y_test, x_train, y_train, True)

Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.

<catboost.core.CatBoostRegressor object at 0x0000016E867AE2E0> 

Dados de teste
R^2                 : 61.19%
R^2 Ajustado        : 61.25%
Acuracia            : 73.07%
MAE                 : 18.52
MSE                 : 704.72
RMSE                : 26.55

Dados de treino
<catboost.core.CatBoostRegressor object at 0x0000016E867AE2E0> 

Dados de teste
R^2                 : 77.73%
R^2 Ajustado        : 77.77%
Acuracia            : 79.26%
MAE                 : 14.36
MSE                 : 413.24
RMSE                : 20.33

Wall time: 4min 52s


# Target vector and predictor matrix (a fixed subset of six columns).
# Alternative kept for reference: X = dtFinal[variaveis]
y = dtFinal['Appliances'].values
X = dtFinal[[
    'T3',
    'RH_3',
    'T8',
    'Press_mm_hg',
    'NSM',
    'Hour',
]]

# Seeded 70/30 train/test split.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=seed_,
)


%%time

# Intensive model 06: coarse grid over tree depth and learning rate.
params = dict(
    depth=[5, 6, 7, 8, 9],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    iterations=[5000],
)

# Base estimator; the grid supplies the remaining hyperparameters.
modelo = CatBoostRegressor(
    loss_function='RMSE',
    eval_metric='RMSE',
    random_seed=seed_,
    verbose=False,
    metric_period=50,
    od_type='Iter',
    od_wait=20,
)

# Run the notebook's grid-search helper, scored by negative RMSE.
treinaRegressao_GridSearchCV(
    modelo,
    params,
    x_train,
    y_train,
    x_test,
    y_test,
    scoring='neg_root_mean_squared_error',
    report_treino=True,
)

{'depth': 9, 'iterations': 5000, 'learning_rate': 0.05}
<catboost.core.CatBoostRegressor object at 0x0000016E87839F10> 

Dados de teste
R^2                 : 71.16%
R^2 Ajustado        : 71.23%
Acuracia            : 77.78%
MAE                 : 15.29
MSE                 : 523.66
RMSE                : 22.88

Dados de treino
<catboost.core.CatBoostRegressor object at 0x0000016E87839F10> 

Dados de teste
R^2                 : 94.99%
R^2 Ajustado        : 95.03%
Acuracia            : 89.15%
MAE                 : 7.05
MSE                 : 93.07
RMSE                : 9.65

Wall time: 8min 24s


%%time

# Intensive model 07: narrower grid around the depth 9 / learning rate 0.05
# combination selected by the previous search.
params = dict(
    depth=[7, 8, 9, 10],
    learning_rate=[0.04, 0.05, 0.06, 0.07],
    iterations=[5000],
)

# Base estimator; the grid supplies the remaining hyperparameters.
modelo = CatBoostRegressor(
    loss_function='RMSE',
    eval_metric='RMSE',
    random_seed=seed_,
    verbose=False,
    metric_period=50,
    od_type='Iter',
    od_wait=20,
)

# Run the notebook's grid-search helper, scored by negative RMSE.
treinaRegressao_GridSearchCV(
    modelo,
    params,
    x_train,
    y_train,
    x_test,
    y_test,
    scoring='neg_root_mean_squared_error',
    report_treino=True,
)

{'depth': 10, 'iterations': 5000, 'learning_rate': 0.05}
<catboost.core.CatBoostRegressor object at 0x0000016E91625B50> 

Dados de teste
R^2                 : 71.60%
R^2 Ajustado        : 71.67%
Acuracia            : 78.27%
MAE                 : 15.02
MSE                 : 515.80
RMSE                : 22.71

Dados de treino
<catboost.core.CatBoostRegressor object at 0x0000016E91625B50> 

Dados de teste
R^2                 : 96.35%
R^2 Ajustado        : 96.39%
Acuracia            : 90.52%
MAE                 : 6.09
MSE                 : 67.91
RMSE                : 8.24

Wall time: 16min 58s


# Target vector and predictor matrix (a fixed subset of six columns).
# Alternative kept for reference: X = dtFinal[variaveis]
y = dtFinal['Appliances'].values
X = dtFinal[[
    'T3',
    'RH_3',
    'T8',
    'Press_mm_hg',
    'NSM',
    'Hour',
]]

# Seeded 70/30 train/test split.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=seed_,
)

# Carve a validation set out of the training portion (80% train / 20% val).
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    y_train,
    train_size=0.8,
    random_state=seed_,
)


%%time
# Model 08 - CatBoost regressor using depth 10 and learning rate 0.05, the
# combination selected by the grid search of intensive model 07.
modelo_cat_v8 = CatBoostRegressor(
    loss_function='RMSE',
    eval_metric='RMSE',
    random_seed=seed_,
    iterations=5000,
    metric_period=50,
    od_type='Iter',
    od_wait=20,
    learning_rate=0.05,
    depth=10,
)

# Fit on the training split; the validation split is supplied as eval_set.
modelo_cat_v8.fit(
    x_train,
    y_train,
    cat_features=categorical_features_index,
    eval_set=(x_val, y_val),
    plot=True,
    verbose=False,
);

# Report metrics in the same cell so the timing covers fit + evaluation.
reportModeloRegressao(modelo_cat_v8, x_test, y_test, x_train, y_train, True)

Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.

<catboost.core.CatBoostRegressor object at 0x0000016E92EB1FA0> 

Dados de teste
R^2                 : 66.88%
R^2 Ajustado        : 66.95%
Acuracia            : 75.95%
MAE                 : 16.62
MSE                 : 601.34
RMSE                : 24.52

Dados de treino
<catboost.core.CatBoostRegressor object at 0x0000016E92EB1FA0> 

Dados de teste
R^2                 : 88.31%
R^2 Ajustado        : 88.36%
Acuracia            : 84.67%
MAE                 : 10.39
MSE                 : 216.86
RMSE                : 14.73

Wall time: 1min 38s


# Rebuild predictors and target, then redo the plain 70/30 split so that
# x_train/y_train are recomputed from the full X/y rather than the reduced
# training portion left by the earlier train/validation split.
# Alternative kept for reference: X = dtFinal[variaveis]
y = dtFinal['Appliances'].values
X = dtFinal[[
    'T3',
    'RH_3',
    'T8',
    'Press_mm_hg',
    'NSM',
    'Hour',
]]

# Seeded 70/30 train/test split.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=seed_,
)


%%time

# Intensive model 09: adds the tree growing policy to the depth /
# learning-rate grid.
params = dict(
    depth=[8, 9, 10],
    learning_rate=[0.04, 0.05, 0.06],
    grow_policy=['Depthwise', 'Lossguide'],
    iterations=[5000],
)

# Base estimator; the grid supplies the remaining hyperparameters.
modelo = CatBoostRegressor(
    loss_function='RMSE',
    eval_metric='RMSE',
    random_seed=seed_,
    verbose=False,
    metric_period=50,
    od_type='Iter',
    od_wait=20,
)

# Run the notebook's grid-search helper, scored by negative RMSE.
treinaRegressao_GridSearchCV(
    modelo,
    params,
    x_train,
    y_train,
    x_test,
    y_test,
    scoring='neg_root_mean_squared_error',
    report_treino=True,
)

{'depth': 10, 'grow_policy': 'Depthwise', 'iterations': 5000, 'learning_rate': 0.04}
<catboost.core.CatBoostRegressor object at 0x0000016E8641B3D0> 

Dados de teste
R^2                 : 71.53%
R^2 Ajustado        : 71.60%
Acuracia            : 78.69%
MAE                 : 14.85
MSE                 : 516.96
RMSE                : 22.74

Dados de treino
<catboost.core.CatBoostRegressor object at 0x0000016E8641B3D0> 

Dados de teste
R^2                 : 99.75%
R^2 Ajustado        : 99.80%
Acuracia            : 97.56%
MAE                 : 1.52
MSE                 : 4.60
RMSE                : 2.15

Wall time: 14min 23s


%%time

# Intensive model 10: Depthwise policy fixed; explore lower learning rates
# for depths 9 and 10.
params = dict(
    depth=[9, 10],
    learning_rate=[0.02, 0.03, 0.04],
    grow_policy=['Depthwise'],
    iterations=[5000],
)

# Base estimator; the grid supplies the remaining hyperparameters.
modelo = CatBoostRegressor(
    loss_function='RMSE',
    eval_metric='RMSE',
    random_seed=seed_,
    verbose=False,
    metric_period=50,
    od_type='Iter',
    od_wait=20,
)

# Run the notebook's grid-search helper, scored by negative RMSE.
treinaRegressao_GridSearchCV(
    modelo,
    params,
    x_train,
    y_train,
    x_test,
    y_test,
    scoring='neg_root_mean_squared_error',
    report_treino=True,
)

{'depth': 10, 'grow_policy': 'Depthwise', 'iterations': 5000, 'learning_rate': 0.03}
<catboost.core.CatBoostRegressor object at 0x0000016E909EED30> 

Dados de teste
R^2                 : 71.87%
R^2 Ajustado        : 71.94%
Acuracia            : 78.76%
MAE                 : 14.78
MSE                 : 510.88
RMSE                : 22.60

Dados de treino
<catboost.core.CatBoostRegressor object at 0x0000016E909EED30> 

Dados de teste
R^2                 : 99.51%
R^2 Ajustado        : 99.55%
Acuracia            : 96.59%
MAE                 : 2.16
MSE                 : 9.13
RMSE                : 3.02

Wall time: 10min 44s


%%time

# Intensive model 11: try depth 11 and refine the learning rate around 0.03.
params = dict(
    depth=[10, 11],
    learning_rate=[0.025, 0.03, 0.035],
    grow_policy=['Depthwise'],
    iterations=[5000],
)

# Base estimator; the grid supplies the remaining hyperparameters.
modelo = CatBoostRegressor(
    loss_function='RMSE',
    eval_metric='RMSE',
    random_seed=seed_,
    verbose=False,
    metric_period=50,
    od_type='Iter',
    od_wait=20,
)

# Run the notebook's grid-search helper, scored by negative RMSE.
treinaRegressao_GridSearchCV(
    modelo,
    params,
    x_train,
    y_train,
    x_test,
    y_test,
    scoring='neg_root_mean_squared_error',
    report_treino=True,
)

{'depth': 11, 'grow_policy': 'Depthwise', 'iterations': 5000, 'learning_rate': 0.025}
<catboost.core.CatBoostRegressor object at 0x0000016E8783B130> 

Dados de teste
R^2                 : 72.19%
R^2 Ajustado        : 72.26%
Acuracia            : 78.99%
MAE                 : 14.65
MSE                 : 504.96
RMSE                : 22.47

Dados de treino
<catboost.core.CatBoostRegressor object at 0x0000016E8783B130> 

Dados de teste
R^2                 : 99.62%
R^2 Ajustado        : 99.67%
Acuracia            : 96.97%
MAE                 : 1.92
MSE                 : 6.97
RMSE                : 2.64

Wall time: 13min 33s


%%time

# Intensive model 12: depth 11 fixed; fine-grained learning-rate sweep
# around 0.025.
params = dict(
    depth=[11],
    learning_rate=[0.024, 0.025, 0.026],
    grow_policy=['Depthwise'],
    iterations=[5000],
)

# Base estimator; the grid supplies the remaining hyperparameters.
modelo = CatBoostRegressor(
    loss_function='RMSE',
    eval_metric='RMSE',
    random_seed=seed_,
    verbose=False,
    metric_period=50,
    od_type='Iter',
    od_wait=20,
)

# Run the notebook's grid-search helper, scored by negative RMSE.
treinaRegressao_GridSearchCV(
    modelo,
    params,
    x_train,
    y_train,
    x_test,
    y_test,
    scoring='neg_root_mean_squared_error',
    report_treino=True,
)

{'depth': 11, 'grow_policy': 'Depthwise', 'iterations': 5000, 'learning_rate': 0.025}
<catboost.core.CatBoostRegressor object at 0x0000016E927FBB80> 

Dados de teste
R^2                 : 72.19%
R^2 Ajustado        : 72.26%
Acuracia            : 78.99%
MAE                 : 14.65
MSE                 : 504.96
RMSE                : 22.47

Dados de treino
<catboost.core.CatBoostRegressor object at 0x0000016E927FBB80> 

Dados de teste
R^2                 : 99.62%
R^2 Ajustado        : 99.67%
Acuracia            : 96.97%
MAE                 : 1.92
MSE                 : 6.97
RMSE                : 2.64

Wall time: 8min 32s


%%time

# Intensive model 13: enable Langevin boosting and sweep its diffusion
# temperature, keeping depth / learning rate / grow policy fixed.
params = dict(
    depth=[11],
    langevin=[True],
    diffusion_temperature=[9000, 10000, 11000],
    learning_rate=[0.025],
    grow_policy=['Depthwise'],
    iterations=[5000],
)

# Base estimator; the grid supplies the remaining hyperparameters.
modelo = CatBoostRegressor(
    loss_function='RMSE',
    eval_metric='RMSE',
    random_seed=seed_,
    verbose=False,
    metric_period=50,
    od_type='Iter',
    od_wait=20,
)

# Run the notebook's grid-search helper, scored by negative RMSE.
treinaRegressao_GridSearchCV(
    modelo,
    params,
    x_train,
    y_train,
    x_test,
    y_test,
    scoring='neg_root_mean_squared_error',
    report_treino=True,
)

{'depth': 11, 'diffusion_temperature': 10000, 'grow_policy': 'Depthwise', 'iterations': 5000, 'langevin': True, 'learning_rate': 0.025}
<catboost.core.CatBoostRegressor object at 0x0000016E90CA7940> 

Dados de teste
R^2                 : 72.47%
R^2 Ajustado        : 72.54%
Acuracia            : 79.08%
MAE                 : 14.60
MSE                 : 499.88
RMSE                : 22.36

Dados de treino
<catboost.core.CatBoostRegressor object at 0x0000016E90CA7940> 

Dados de teste
R^2                 : 99.69%
R^2 Ajustado        : 99.73%
Acuracia            : 97.38%
MAE                 : 1.73
MSE                 : 5.79
RMSE                : 2.41

Wall time: 11min 32s


%%time

# Intensive model 14: with Langevin boosting fixed at diffusion temperature
# 10000, re-check the learning rate in a narrow band around 0.025 (the value
# selected by the preceding searches).
params = {
    'depth': [11],
    'langevin': [True],
    'diffusion_temperature': [10000],
    # The grid used to be [0.2, 0.22, 0.025, 0.27]: three candidates were about
    # ten times larger than the 0.025 being refined, which looks like a
    # misplaced decimal point. The candidates now bracket 0.025 as intended.
    # NOTE(review): re-run the cell; the recorded output predates this change.
    'learning_rate': [0.02, 0.022, 0.025, 0.027],
    'grow_policy': ['Depthwise'],
    'iterations' : [5000]
}

# Base estimator; the grid supplies the remaining hyperparameters.
modelo = CatBoostRegressor(loss_function = 'RMSE', eval_metric = 'RMSE', random_seed = seed_,\
                           verbose = False, metric_period = 50, od_type = 'Iter', od_wait = 20)

# Run the notebook's grid-search helper, scored by negative RMSE.
treinaRegressao_GridSearchCV(modelo, params, x_train, y_train, x_test, y_test, scoring = 'neg_root_mean_squared_error',\
                            report_treino = True)

{'depth': 11, 'diffusion_temperature': 10000, 'grow_policy': 'Depthwise', 'iterations': 5000, 'langevin': True, 'learning_rate': 0.025}
<catboost.core.CatBoostRegressor object at 0x0000016E90A172E0> 

Dados de teste
R^2                 : 72.47%
R^2 Ajustado        : 72.54%
Acuracia            : 79.08%
MAE                 : 14.60
MSE                 : 499.88
RMSE                : 22.36

Dados de treino
<catboost.core.CatBoostRegressor object at 0x0000016E90A172E0> 

Dados de teste
R^2                 : 99.69%
R^2 Ajustado        : 99.73%
Acuracia            : 97.38%
MAE                 : 1.73
MSE                 : 5.79
RMSE                : 2.41

Wall time: 14min 22s


%%time

# Intensive model 15: compare the alternative split score functions.
# 'Cosine' is left out of the grid because it is the default and was
# therefore already the one used by model 14.
# NOTE(review): the recorded output reports NaN scores for 'NewtonCosine' and
# 'NewtonL2'. These are presumably rejected on CPU training (CatBoost
# documents them as GPU-only) - confirm and drop them if so.
params = dict(
    depth=[11],
    langevin=[True],
    diffusion_temperature=[10000],
    learning_rate=[0.025],
    grow_policy=['Depthwise'],
    iterations=[5000],
    score_function=['L2', 'NewtonCosine', 'NewtonL2'],
)

# Base estimator; the grid supplies the remaining hyperparameters.
modelo = CatBoostRegressor(
    loss_function='RMSE',
    eval_metric='RMSE',
    random_seed=seed_,
    verbose=False,
    metric_period=50,
    od_type='Iter',
    od_wait=20,
)

# Run the notebook's grid-search helper, scored by negative RMSE.
treinaRegressao_GridSearchCV(
    modelo,
    params,
    x_train,
    y_train,
    x_test,
    y_test,
    scoring='neg_root_mean_squared_error',
    report_treino=True,
)

One or more of the test scores are non-finite: [-23.99381769          nan          nan]

{'depth': 11, 'diffusion_temperature': 10000, 'grow_policy': 'Depthwise', 'iterations': 5000, 'langevin': True, 'learning_rate': 0.025, 'score_function': 'L2'}
<catboost.core.CatBoostRegressor object at 0x0000016E94BFCD30> 

Dados de teste
R^2                 : 72.15%
R^2 Ajustado        : 72.23%
Acuracia            : 79.09%
MAE                 : 14.62
MSE                 : 505.67
RMSE                : 22.49

Dados de treino
<catboost.core.CatBoostRegressor object at 0x0000016E94BFCD30> 

Dados de teste
R^2                 : 99.93%
R^2 Ajustado        : 99.97%
Acuracia            : 98.95%
MAE                 : 0.71
MSE                 : 1.28
RMSE                : 1.13

Wall time: 8min 32s


%%time

# Intensive model 16: back to the 'Cosine' score function, now with an
# explicit L2 leaf regularisation of 2.5 (single-candidate grid).
params = dict(
    depth=[11],
    langevin=[True],
    diffusion_temperature=[10000],
    learning_rate=[0.025],
    grow_policy=['Depthwise'],
    iterations=[5000],
    score_function=['Cosine'],
    l2_leaf_reg=[2.5],
)

# Base estimator; the grid supplies the remaining hyperparameters.
modelo = CatBoostRegressor(
    loss_function='RMSE',
    eval_metric='RMSE',
    random_seed=seed_,
    verbose=False,
    metric_period=50,
    od_type='Iter',
    od_wait=20,
)

# Run the notebook's grid-search helper, scored by negative RMSE.
treinaRegressao_GridSearchCV(
    modelo,
    params,
    x_train,
    y_train,
    x_test,
    y_test,
    scoring='neg_root_mean_squared_error',
    report_treino=True,
)

{'depth': 11, 'diffusion_temperature': 10000, 'grow_policy': 'Depthwise', 'iterations': 5000, 'l2_leaf_reg': 2.5, 'langevin': True, 'learning_rate': 0.025, 'score_function': 'Cosine'}
<catboost.core.CatBoostRegressor object at 0x0000020A0686E820> 

Dados de teste
R^2                 : 72.50%
R^2 Ajustado        : 72.58%
Acuracia            : 79.05%
MAE                 : 14.62
MSE                 : 499.29
RMSE                : 22.34

Dados de treino
<catboost.core.CatBoostRegressor object at 0x0000020A0686E820> 

Dados de teste
R^2                 : 99.75%
R^2 Ajustado        : 99.79%
Acuracia            : 97.65%
MAE                 : 1.55
MSE                 : 4.67
RMSE                : 2.16

Wall time: 10min 14s


%%time

# Intensive model 17: sweep L2 leaf regularisation on either side of the 2.5
# used by model 16.
params = dict(
    depth=[11],
    langevin=[True],
    diffusion_temperature=[10000],
    learning_rate=[0.025],
    grow_policy=['Depthwise'],
    iterations=[5000],
    score_function=['Cosine'],
    l2_leaf_reg=[2.4, 2.6, 2.8],
)

# Base estimator; the grid supplies the remaining hyperparameters.
modelo = CatBoostRegressor(
    loss_function='RMSE',
    eval_metric='RMSE',
    random_seed=seed_,
    verbose=False,
    metric_period=50,
    od_type='Iter',
    od_wait=20,
)

# Run the notebook's grid-search helper, scored by negative RMSE.
treinaRegressao_GridSearchCV(
    modelo,
    params,
    x_train,
    y_train,
    x_test,
    y_test,
    scoring='neg_root_mean_squared_error',
    report_treino=True,
)

{'depth': 11, 'diffusion_temperature': 10000, 'grow_policy': 'Depthwise', 'iterations': 5000, 'l2_leaf_reg': 2.4, 'langevin': True, 'learning_rate': 0.025, 'score_function': 'Cosine'}
<catboost.core.CatBoostRegressor object at 0x0000020A071D7460> 

Dados de teste
R^2                 : 72.25%
R^2 Ajustado        : 72.33%
Acuracia            : 79.05%
MAE                 : 14.63
MSE                 : 503.85
RMSE                : 22.45

Dados de treino
<catboost.core.CatBoostRegressor object at 0x0000020A071D7460> 

Dados de teste
R^2                 : 99.76%
R^2 Ajustado        : 99.81%
Acuracia            : 97.68%
MAE                 : 1.52
MSE                 : 4.42
RMSE                : 2.10

Wall time: 11min 30s


%%time

# Intensive model 18: l2_leaf_reg back at 2.5; sweep the row subsample rate.
# 0.8 is noted by the author as the default subsample, so it is presumably
# omitted because model 16 already covers it.
params = dict(
    depth=[11],
    langevin=[True],
    diffusion_temperature=[10000],
    learning_rate=[0.025],
    grow_policy=['Depthwise'],
    iterations=[5000],
    score_function=['Cosine'],
    l2_leaf_reg=[2.5],
    subsample=[0.7, 0.9, 1.0],
)

# Base estimator; the grid supplies the remaining hyperparameters.
modelo = CatBoostRegressor(
    loss_function='RMSE',
    eval_metric='RMSE',
    random_seed=seed_,
    verbose=False,
    metric_period=50,
    od_type='Iter',
    od_wait=20,
)

# Run the notebook's grid-search helper, scored by negative RMSE.
treinaRegressao_GridSearchCV(
    modelo,
    params,
    x_train,
    y_train,
    x_test,
    y_test,
    scoring='neg_root_mean_squared_error',
    report_treino=True,
)

{'depth': 11, 'diffusion_temperature': 10000, 'grow_policy': 'Depthwise', 'iterations': 5000, 'l2_leaf_reg': 2.5, 'langevin': True, 'learning_rate': 0.025, 'score_function': 'Cosine', 'subsample': 0.7}
<catboost.core.CatBoostRegressor object at 0x000002093C0DCAC0> 

Dados de teste
R^2                 : 72.32%
R^2 Ajustado        : 72.40%
Acuracia            : 79.12%
MAE                 : 14.60
MSE                 : 502.57
RMSE                : 22.42

Dados de treino
<catboost.core.CatBoostRegressor object at 0x000002093C0DCAC0> 

Dados de teste
R^2                 : 99.70%
R^2 Ajustado        : 99.74%
Acuracia            : 97.24%
MAE                 : 1.79
MSE                 : 5.65
RMSE                : 2.38

Wall time: 12min 25s


%%time

# Intensive model 19: compare bootstrap types against the CPU default (MVS,
# per the author's note), with subsample fixed at 0.8.
# NOTE(review): the recorded output reports NaN scores for 'Bayesian' and
# 'No'. Presumably CatBoost rejects an explicit subsample with those
# bootstrap types - confirm, and evaluate them without subsample if so.
params = dict(
    depth=[11],
    langevin=[True],
    diffusion_temperature=[10000],
    learning_rate=[0.025],
    grow_policy=['Depthwise'],
    iterations=[5000],
    score_function=['Cosine'],
    l2_leaf_reg=[2.5],
    subsample=[0.8],
    bootstrap_type=['Bayesian', 'Bernoulli', 'No'],
)

# Base estimator; the grid supplies the remaining hyperparameters.
modelo = CatBoostRegressor(
    loss_function='RMSE',
    eval_metric='RMSE',
    random_seed=seed_,
    verbose=False,
    metric_period=50,
    od_type='Iter',
    od_wait=20,
)

# Run the notebook's grid-search helper, scored by negative RMSE.
treinaRegressao_GridSearchCV(
    modelo,
    params,
    x_train,
    y_train,
    x_test,
    y_test,
    scoring='neg_root_mean_squared_error',
    report_treino=True,
)

One or more of the test scores are non-finite: [         nan -23.58715838          nan]

{'bootstrap_type': 'Bernoulli', 'depth': 11, 'diffusion_temperature': 10000, 'grow_policy': 'Depthwise', 'iterations': 5000, 'l2_leaf_reg': 2.5, 'langevin': True, 'learning_rate': 0.025, 'score_function': 'Cosine', 'subsample': 0.8}
<catboost.core.CatBoostRegressor object at 0x000002A84080BF40> 

Dados de teste
R^2                 : 73.05%
R^2 Ajustado        : 73.12%
Acuracia            : 79.44%
MAE                 : 14.35
MSE                 : 489.44
RMSE                : 22.12

Dados de treino
<catboost.core.CatBoostRegressor object at 0x000002A84080BF40> 

Dados de teste
R^2                 : 99.91%
R^2 Ajustado        : 99.96%
Acuracia            : 98.50%
MAE                 : 0.96
MSE                 : 1.59
RMSE                : 1.26

Wall time: 12min 20s


%%time

# Intensive model 20: Bernoulli bootstrap fixed; re-test a stronger L2 leaf
# regularisation (3.0 instead of 2.5) as a single-candidate grid.
params = dict(
    depth=[11],
    langevin=[True],
    diffusion_temperature=[10000],
    learning_rate=[0.025],
    grow_policy=['Depthwise'],
    iterations=[5000],
    score_function=['Cosine'],
    l2_leaf_reg=[3.0],
    subsample=[0.8],
    bootstrap_type=['Bernoulli'],
)

# Base estimator; the grid supplies the remaining hyperparameters.
modelo = CatBoostRegressor(
    loss_function='RMSE',
    eval_metric='RMSE',
    random_seed=seed_,
    verbose=False,
    metric_period=50,
    od_type='Iter',
    od_wait=20,
)

# Run the notebook's grid-search helper, scored by negative RMSE.
treinaRegressao_GridSearchCV(
    modelo,
    params,
    x_train,
    y_train,
    x_test,
    y_test,
    scoring='neg_root_mean_squared_error',
    report_treino=True,
)

{'bootstrap_type': 'Bernoulli', 'depth': 11, 'diffusion_temperature': 10000, 'grow_policy': 'Depthwise', 'iterations': 5000, 'l2_leaf_reg': 3.0, 'langevin': True, 'learning_rate': 0.025, 'score_function': 'Cosine', 'subsample': 0.8}
<catboost.core.CatBoostRegressor object at 0x000002A90FFFDA60> 

Dados de teste
R^2                 : 72.73%
R^2 Ajustado        : 72.80%
Acuracia            : 79.33%
MAE                 : 14.43
MSE                 : 495.15
RMSE                : 22.25

Dados de treino
<catboost.core.CatBoostRegressor object at 0x000002A90FFFDA60> 

Dados de teste
R^2                 : 99.89%
R^2 Ajustado        : 99.93%
Acuracia            : 98.30%
MAE                 : 1.09
MSE                 : 2.04
RMSE                : 1.43

Wall time: 11min 57s


%%time

# Intensive model 21: l2_leaf_reg back at 2.5; sweep random_strength around
# its default of 1 (the default itself is not in this grid).
params = dict(
    depth=[11],
    langevin=[True],
    diffusion_temperature=[10000],
    learning_rate=[0.025],
    grow_policy=['Depthwise'],
    iterations=[5000],
    score_function=['Cosine'],
    l2_leaf_reg=[2.5],
    subsample=[0.8],
    bootstrap_type=['Bernoulli'],
    random_strength=[0.8, 1.2, 1.4, 1.6],
)

# Base estimator; the grid supplies the remaining hyperparameters.
modelo = CatBoostRegressor(
    loss_function='RMSE',
    eval_metric='RMSE',
    random_seed=seed_,
    verbose=False,
    metric_period=50,
    od_type='Iter',
    od_wait=20,
)

# Run the notebook's grid-search helper, scored by negative RMSE.
treinaRegressao_GridSearchCV(
    modelo,
    params,
    x_train,
    y_train,
    x_test,
    y_test,
    scoring='neg_root_mean_squared_error',
    report_treino=True,
)

{'bootstrap_type': 'Bernoulli', 'depth': 11, 'diffusion_temperature': 10000, 'grow_policy': 'Depthwise', 'iterations': 5000, 'l2_leaf_reg': 2.5, 'langevin': True, 'learning_rate': 0.025, 'random_strength': 0.8, 'score_function': 'Cosine', 'subsample': 0.8}
<catboost.core.CatBoostRegressor object at 0x000002A906D092B0> 

Dados de teste
R^2                 : 72.93%
R^2 Ajustado        : 73.00%
Acuracia            : 79.40%
MAE                 : 14.39
MSE                 : 491.62
RMSE                : 22.17

Dados de treino
<catboost.core.CatBoostRegressor object at 0x000002A906D092B0> 

Dados de teste
R^2                 : 99.92%
R^2 Ajustado        : 99.96%
Acuracia            : 98.55%
MAE                 : 0.93
MSE                 : 1.48
RMSE                : 1.22

Wall time: 17min 12s


%%time

# Intensive model 22: finer random_strength sweep centred on the default 1.
params = dict(
    depth=[11],
    langevin=[True],
    diffusion_temperature=[10000],
    learning_rate=[0.025],
    grow_policy=['Depthwise'],
    iterations=[5000],
    score_function=['Cosine'],
    l2_leaf_reg=[2.5],
    subsample=[0.8],
    bootstrap_type=['Bernoulli'],
    random_strength=[0.9, 1.0, 1.1],
)

# Base estimator; the grid supplies the remaining hyperparameters.
modelo = CatBoostRegressor(
    loss_function='RMSE',
    eval_metric='RMSE',
    random_seed=seed_,
    verbose=False,
    metric_period=50,
    od_type='Iter',
    od_wait=20,
)

# Run the notebook's grid-search helper, scored by negative RMSE.
treinaRegressao_GridSearchCV(
    modelo,
    params,
    x_train,
    y_train,
    x_test,
    y_test,
    scoring='neg_root_mean_squared_error',
    report_treino=True,
)

{'bootstrap_type': 'Bernoulli', 'depth': 11, 'diffusion_temperature': 10000, 'grow_policy': 'Depthwise', 'iterations': 5000, 'l2_leaf_reg': 2.5, 'langevin': True, 'learning_rate': 0.025, 'random_strength': 1.0, 'score_function': 'Cosine', 'subsample': 0.8}
<catboost.core.CatBoostRegressor object at 0x000002A90FF01790> 

Dados de teste
R^2                 : 73.05%
R^2 Ajustado        : 73.12%
Acuracia            : 79.44%
MAE                 : 14.35
MSE                 : 489.44
RMSE                : 22.12

Dados de treino
<catboost.core.CatBoostRegressor object at 0x000002A90FF01790> 

Dados de teste
R^2                 : 99.91%
R^2 Ajustado        : 99.96%
Acuracia            : 98.50%
MAE                 : 0.96
MSE                 : 1.59
RMSE                : 1.26

Wall time: 13min 56s


%%time

# Intensive model 22: random_strength fixed at 1.0; sweep min_data_in_leaf
# above its default of 1.
# NOTE(review): the label "22" is also used by the preceding random_strength
# cell; the numbering looks duplicated - confirm which number is intended.
params = dict(
    depth=[11],
    langevin=[True],
    diffusion_temperature=[10000],
    learning_rate=[0.025],
    grow_policy=['Depthwise'],
    iterations=[5000],
    score_function=['Cosine'],
    l2_leaf_reg=[2.5],
    subsample=[0.8],
    bootstrap_type=['Bernoulli'],
    random_strength=[1.0],
    min_data_in_leaf=[3, 6, 9],
)

# Base estimator; the grid supplies the remaining hyperparameters.
modelo = CatBoostRegressor(
    loss_function='RMSE',
    eval_metric='RMSE',
    random_seed=seed_,
    verbose=False,
    metric_period=50,
    od_type='Iter',
    od_wait=20,
)

# Run the notebook's grid-search helper, scored by negative RMSE.
treinaRegressao_GridSearchCV(
    modelo,
    params,
    x_train,
    y_train,
    x_test,
    y_test,
    scoring='neg_root_mean_squared_error',
    report_treino=True,
)

{'bootstrap_type': 'Bernoulli', 'depth': 11, 'diffusion_temperature': 10000, 'grow_policy': 'Depthwise', 'iterations': 5000, 'l2_leaf_reg': 2.5, 'langevin': True, 'learning_rate': 0.025, 'min_data_in_leaf': 3, 'random_strength': 1.0, 'score_function': 'Cosine', 'subsample': 0.8}
<catboost.core.CatBoostRegressor object at 0x000002A9040C9850> 

Dados de teste
R^2                 : 72.80%
R^2 Ajustado        : 72.88%
Acuracia            : 79.25%
MAE                 : 14.45
MSE                 : 493.88
RMSE                : 22.22

Dados de treino
<catboost.core.CatBoostRegressor object at 0x000002A9040C9850> 

Dados de teste
R^2                 : 99.62%
R^2 Ajustado        : 99.66%
Acuracia            : 96.84%
MAE                 : 2.01
MSE                 : 7.06
RMSE                : 2.66

Wall time: 6min 53s


%%time

# Search grid: same fixed configuration, with min_data_in_leaf tried at
# its default (1) and at 2.
params = dict(
    depth = [11],
    langevin = [True],
    diffusion_temperature = [10000],
    learning_rate = [0.025],
    grow_policy = ['Depthwise'],
    iterations = [5000],
    score_function = ['Cosine'],
    l2_leaf_reg = [2.5],
    subsample = [0.8],
    bootstrap_type = ['Bernoulli'],
    random_strength = [1.0],
    min_data_in_leaf = [1, 2],
)

# Intensive model 23
modelo = CatBoostRegressor(
    loss_function = 'RMSE',
    eval_metric = 'RMSE',
    random_seed = seed_,
    verbose = False,
    metric_period = 50,
    od_type = 'Iter',
    od_wait = 20,
)

# Grid search scored by negative RMSE; also reports metrics on the training data.
treinaRegressao_GridSearchCV(
    modelo, params, x_train, y_train, x_test, y_test,
    scoring = 'neg_root_mean_squared_error',
    report_treino = True,
)

{'bootstrap_type': 'Bernoulli', 'depth': 11, 'diffusion_temperature': 10000, 'grow_policy': 'Depthwise', 'iterations': 5000, 'l2_leaf_reg': 2.5, 'langevin': True, 'learning_rate': 0.025, 'min_data_in_leaf': 1, 'random_strength': 1.0, 'score_function': 'Cosine', 'subsample': 0.8}
<catboost.core.CatBoostRegressor object at 0x000002E307325DF0> 

Dados de teste
R^2                 : 73.05%
R^2 Ajustado        : 73.12%
Acuracia            : 79.44%
MAE                 : 14.35
MSE                 : 489.44
RMSE                : 22.12

Dados de treino
<catboost.core.CatBoostRegressor object at 0x000002E307325DF0> 

Dados de teste
R^2                 : 99.91%
R^2 Ajustado        : 99.96%
Acuracia            : 98.50%
MAE                 : 0.96
MSE                 : 1.59
RMSE                : 1.26

Wall time: 13min 25s


%%time

# Two candidate weight vectors to compare in the grid search.
# Presumably one weight per input feature, in column order - TODO confirm.
feature_weights = [
    [1, 1, 1, 0.9, 1.1, 1.1],
    [0.9, 0.95, 1.05, 1, 1.2, 1.1],
]

# Search grid: all tuned hyperparameters are fixed; only feature_weights varies.
params = dict(
    depth = [11],
    langevin = [True],
    diffusion_temperature = [10000],
    learning_rate = [0.025],
    grow_policy = ['Depthwise'],
    iterations = [5000],
    score_function = ['Cosine'],
    l2_leaf_reg = [2.5],
    subsample = [0.8],
    bootstrap_type = ['Bernoulli'],
    random_strength = [1.0],
    min_data_in_leaf = [1],
    feature_weights = feature_weights,
)

# Intensive model 24
modelo = CatBoostRegressor(
    loss_function = 'RMSE',
    eval_metric = 'RMSE',
    random_seed = seed_,
    verbose = False,
    metric_period = 50,
    od_type = 'Iter',
    od_wait = 20,
)

# Grid search scored by negative RMSE; also reports metrics on the training data.
treinaRegressao_GridSearchCV(
    modelo, params, x_train, y_train, x_test, y_test,
    scoring = 'neg_root_mean_squared_error',
    report_treino = True,
)

{'bootstrap_type': 'Bernoulli', 'depth': 11, 'diffusion_temperature': 10000, 'feature_weights': [1, 1, 1, 0.9, 1.1, 1.1], 'grow_policy': 'Depthwise', 'iterations': 5000, 'l2_leaf_reg': 2.5, 'langevin': True, 'learning_rate': 0.025, 'min_data_in_leaf': 1, 'random_strength': 1.0, 'score_function': 'Cosine', 'subsample': 0.8}
<catboost.core.CatBoostRegressor object at 0x000002E306CB4FA0> 

Dados de teste
R^2                 : 71.92%
R^2 Ajustado        : 72.00%
Acuracia            : 78.47%
MAE                 : 14.98
MSE                 : 509.82
RMSE                : 22.58

Dados de treino
<catboost.core.CatBoostRegressor object at 0x000002E306CB4FA0> 

Dados de teste
R^2                 : 99.96%
R^2 Ajustado        : 100.01%
Acuracia            : 99.16%
MAE                 : 0.58
MSE                 : 0.66
RMSE                : 0.81

Wall time: 16min


# Splitting into predictor variables and target
# (alternative kept for reference: X = dtFinal[variaveis])
X = dtFinal[['T3', 'RH_3', 'T8', 'Press_mm_hg', 'NSM', 'Hour']]
y = dtFinal['Appliances'].values

# Splitting into train and test (70% / 30%), reproducible through seed_
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = seed_)

# Positions of the columns treated as categorical: every column whose dtype
# is not float. The builtin `float` is used instead of `np.float`, which was
# only an alias of it and has been removed from recent NumPy releases.
categorical_features_index = np.where(x_train.dtypes != float)[0]


%%time
# Training data wrapped in a CatBoost Pool, flagging the categorical columns
cv_dataset = Pool(data=x_train,
                  label=y_train,
                  cat_features=categorical_features_index)

# Tuned configuration found by the grid searches, written as plain values
# (not lists) because it is passed straight to catboost.cv.
# The dict previously listed 'loss_function' twice; the duplicate is removed.
params = {
    'depth': 11,
    'langevin': True,
    'diffusion_temperature': 10000,
    'learning_rate': 0.025,
    'grow_policy': 'Depthwise',
    'iterations' : 5000,
    'score_function': 'Cosine',
    'l2_leaf_reg': 2.5,
    'subsample': 0.8,
    'bootstrap_type': 'Bernoulli',
    'random_strength': 1.0,
    'min_data_in_leaf': 1,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'random_seed': seed_,
    'verbose': False,
    # CatBoost warns that metric_period is ignored for the evaluation metric
    # while the overfitting detector (od_type / od_wait) is active.
    'metric_period': 10,
    'od_type': 'Iter',
    'od_wait': 10
}

# 5-fold cross-validation with the interactive training plot enabled
scores = cv(cv_dataset,
            params,
            fold_count = 5, 
            plot = True,
            seed = seed_)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.

Stopped by overfitting detector  (10 iterations wait)
Wall time: 31min 22s


%%time

# Tuned configuration as plain values, passed directly to the constructor.
# The dict previously listed 'loss_function' twice; the duplicate is removed.
params = {
    'depth': 11,
    'langevin': True,
    'diffusion_temperature': 10000,
    'learning_rate': 0.025,
    'grow_policy': 'Depthwise',
    'iterations' : 5000,
    'score_function': 'Cosine',
    'l2_leaf_reg': 2.5,
    'subsample': 0.8,
    'bootstrap_type': 'Bernoulli',
    'random_strength': 1.0,
    'min_data_in_leaf': 1,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'random_seed': seed_,
    'verbose': False,
    'metric_period': 1,
    'od_type': 'Iter',
    'od_wait': 10
}

# Model 25
modelo_cat_v25 = CatBoostRegressor(**params)

# Fit with an explicit validation set.
# NOTE(review): x_val / y_val are not defined in this part of the notebook -
# confirm they are created earlier and are disjoint from x_test / y_test.
modelo_cat_v25.fit(x_train, y_train,
               cat_features = categorical_features_index,
               eval_set = (x_val, y_val),
               plot = True, verbose = False)

# Report metrics on the test data and, with the last flag, on the training data too
reportModeloRegressao(modelo_cat_v25, x_test, y_test, x_train, y_train, True)

<catboost.core.CatBoostRegressor object at 0x000002E307141700> 

Dados de teste
R^2                 : 68.74%
R^2 Ajustado        : 68.81%
Acuracia            : 77.37%
MAE                 : 15.76
MSE                 : 567.68
RMSE                : 23.83

Dados de treino
<catboost.core.CatBoostRegressor object at 0x000002E307141700> 

Dados de teste
R^2                 : 95.83%
R^2 Ajustado        : 95.88%
Acuracia            : 90.63%
MAE                 : 6.22
MSE                 : 77.38
RMSE                : 8.80

Wall time: 2min 13s


%%time

# Single-point grid: the best value found for each hyperparameter.
params = dict(
    depth = [11],
    langevin = [True],
    diffusion_temperature = [10000],
    learning_rate = [0.025],
    grow_policy = ['Depthwise'],
    iterations = [5000],
    score_function = ['Cosine'],
    l2_leaf_reg = [2.5],
    subsample = [0.8],
    bootstrap_type = ['Bernoulli'],
    random_strength = [1.0],
    min_data_in_leaf = [1],
)

# Final model
modelo = CatBoostRegressor(
    loss_function = 'RMSE',
    eval_metric = 'RMSE',
    random_seed = seed_,
    verbose = False,
    metric_period = 1,
    od_type = 'Iter',
    od_wait = 10,
)

# retorna_modelo = True makes the helper hand back the fitted model,
# which is kept for saving and for the later evaluation cells.
modelo_cat_final = treinaRegressao_GridSearchCV(
    modelo, params, x_train, y_train, x_test, y_test,
    scoring = 'neg_root_mean_squared_error',
    report_treino = True,
    retorna_modelo = True,
)

{'bootstrap_type': 'Bernoulli', 'depth': 11, 'diffusion_temperature': 10000, 'grow_policy': 'Depthwise', 'iterations': 5000, 'l2_leaf_reg': 2.5, 'langevin': True, 'learning_rate': 0.025, 'min_data_in_leaf': 1, 'random_strength': 1.0, 'score_function': 'Cosine', 'subsample': 0.8}
<catboost.core.CatBoostRegressor object at 0x00000183A0C0EDF0> 

Dados de teste
R^2                 : 73.05%
R^2 Ajustado        : 73.12%
Acuracia            : 79.44%
MAE                 : 14.35
MSE                 : 489.44
RMSE                : 22.12

Dados de treino
<catboost.core.CatBoostRegressor object at 0x00000183A0C0EDF0> 

Dados de teste
R^2                 : 99.91%
R^2 Ajustado        : 99.96%
Acuracia            : 98.50%
MAE                 : 0.96
MSE                 : 1.59
RMSE                : 1.26

Wall time: 12min 28s


# Saving the machine learning model in Pickle format.
# The with-block closes the file even if dump() raises.
with open('../modelos/modelo_final.pkl', mode = 'wb') as pickle_out:
    dump(modelo_cat_final, pickle_out)


# Saving the scaler. The file is opened in a with-block so the handle is
# closed (and the data flushed) deterministically instead of being left open.
with open('../modelos/scaler.pkl', mode = 'wb') as scaler_out:
    dump(scaler, scaler_out)


# Loading the model back from disk.
# Pickle executes code on load: only open files produced by this notebook.
with open('../modelos/modelo_final.pkl', mode = 'rb') as f:
    modelo_cat_final = load(file = f)


def dependence_plot_unique(columns, shap_values_, x):
    """Draw one SHAP dependence plot per entry of ``columns``.

    Each entry is forwarded to ``shap.dependence_plot`` together with
    ``shap_values_`` and ``x``. Nothing is returned.
    """
    nomes = iter(columns)
    while True:
        try:
            nome_coluna = next(nomes)
        except StopIteration:
            return
        shap.dependence_plot(nome_coluna, shap_values_, x)


# Building the SHAP explanation from the first rows of the training data
amostras = 1000
x_shap = x_train.iloc[:amostras]
y_shap = y_train[:amostras]

# Tree explainer for the final CatBoost model; the sample is handed over as a Pool
explainer = shap.TreeExplainer(modelo_cat_final)
shap_values = explainer.shap_values(Pool(x_shap, y_shap))


# Force plot explaining the prediction for the first sampled row
shap.initjs()
n = 0
shap.force_plot(explainer.expected_value, shap_values[n], x_shap.iloc[n])


# Force plot explaining the prediction for the sampled row at position 13
shap.initjs()
n = 13
shap.force_plot(explainer.expected_value, shap_values[n], x_shap.iloc[n])


# Force plot over the whole SHAP sample
shap.initjs()
shap.force_plot(
    explainer.expected_value,
    shap_values,
    x_shap,
)


# One dependence plot per feature column
dependence_plot_unique(columns = x_train.columns, shap_values_ = shap_values, x = x_shap)


# Summary plot of the SHAP values over the sample
shap.summary_plot(shap_values, x_shap)


# Render the first tree (index 0) of the final model
modelo_cat_final.plot_tree(tree_idx = 0)


# Computing predictions for the test set with the final model
pred = modelo_cat_final.predict(x_test)


# Line chart of the first test observations: real target vs. CatBoost prediction
amostras = 50
fig = plt.figure(figsize=(20, 8))

plt.plot(y_test[:amostras], linestyle='--', color='red', label='Target')
plt.plot(pred[:amostras], color='blue', label='CatBoost')

plt.ylabel('Wh')
plt.xlabel('Observações')
plt.title('Consumo de Energia')
plt.legend()

# Save before show() so the written image is not empty
plt.savefig('../analises/linha_real_previsto.png')
plt.show()


# Approximate 95% band around each prediction, built from the standard
# deviation of the residuals on the test set.
soma_erro = np.sum((y_test - pred)**2)
# NOTE(review): the "- 2" mirrors the residual degrees of freedom of a
# two-parameter linear fit; confirm it is the intended correction here.
stdev = np.sqrt( 1 / (len(y_test) - 2) * soma_erro)

# 1.96 (not 1.95) is the two-sided 95% quantile of the standard normal.
intervalo = 1.96 * stdev
lower, upper = pred - intervalo, pred + intervalo


# Same line chart as before, now with the lower and upper limits of the band
amostras = 50
fig = plt.figure(figsize=(20, 8))

plt.plot(y_test[:amostras], linestyle='--', color='red', label='Target')
plt.plot(lower[:amostras], linestyle='--', color='g', label='Limite Inferior')
plt.plot(upper[:amostras], linestyle='--', color='y', label='Limite Superior')
plt.plot(pred[:amostras], color='blue', label='CatBoost')

plt.ylabel('Wh')
plt.xlabel('Observações')
plt.title('Previsão de Energia com Limite Inferior e Superior')
plt.legend()

# Save before show() so the written image is not empty
plt.savefig('../analises/linha_real_previsto_limites.png')
plt.show()


# Total real and predicted energy consumption over the test set, in Wh
soma_energia_real_wh, soma_energia_pred_wh = sum(y_test), sum(pred)

# Wh -> kWh
soma_energia_real_kwh = soma_energia_real_wh / 1000
soma_energia_pred_kwh = soma_energia_pred_wh / 1000

# [real, predicted] totals in kWh, in the order used by the bar chart
soma_energia_pred = [soma_energia_real_kwh, soma_energia_pred_kwh]


# Household price per kWh in Belgium - where the data was collected
# Source, figures updated on 01.12.2020: https://www.globalpetrolprices.com/Belgium/
kwh_casa_eur = 0.265
kwh_casa_usd = 0.315


# Smallest and largest of the two totals, used for the y-axis limits of the bar chart
low, high = min(soma_energia_pred), max(soma_energia_pred)


def adicionaLabels(x, y):
    """Write each value of ``y`` (rounded to 4 decimals) above its bar.

    One label is drawn per entry of ``x``; the label at position ``i`` is
    centred horizontally on ``i`` and placed at the height of ``y[i]``.
    """
    for posicao, _ in enumerate(x):
        valor = round(y[posicao], 4)
        plt.text(posicao, valor, round(y[posicao], 4), ha = 'center')


# Bar chart of real vs. predicted energy consumption
rotulos = ['Consumo Real', 'Consumo Previsto']
# Half of the gap between the two bars, used as padding on the y axis
margem = 0.5*(high-low)

fig = plt.figure(figsize=(13, 8))

plt.bar(rotulos, soma_energia_pred)
plt.ylabel('kWh')
plt.title('Consumo de Energia')
plt.ylim([ceil(low - margem) - 1, ceil(high + margem) + 1])
adicionaLabels(rotulos, soma_energia_pred)

plt.savefig('../analises/barra_consumo_real_previsto.png')
plt.show()


# Electricity cost in EUR: [real, predicted] kWh totals times the EUR price per kWh
custo_eletrico_real_eur = kwh_casa_eur * soma_energia_real_kwh
custo_eletrico_pred_eur = kwh_casa_eur * soma_energia_pred_kwh
custo_eletrico_eur = [custo_eletrico_real_eur, custo_eletrico_pred_eur]

# Bounds used for the y-axis limits
low, high = min(custo_eletrico_eur), max(custo_eletrico_eur)


# Bar chart of real vs. predicted electricity cost in EUR
rotulos = ['Custo Real', 'Custo Previsto']
# Half of the gap between the two bars, used as padding on the y axis
margem = 0.5*(high-low)

fig = plt.figure(figsize=(13, 8))

plt.bar(rotulos, custo_eletrico_eur)
plt.ylabel('EUR')
plt.title('Custo Eletrico')
plt.ylim([ceil(low - margem) - 1, ceil(high + margem) + 1])
adicionaLabels(rotulos, custo_eletrico_eur)

plt.savefig('../analises/barra_custo_real_previsto_euro.png')
plt.show()


# Electricity cost in USD: [real, predicted] kWh totals times the USD price per kWh
custo_eletrico_real_usd = kwh_casa_usd * soma_energia_real_kwh
custo_eletrico_pred_usd = kwh_casa_usd * soma_energia_pred_kwh
custo_eletrico_usd = [custo_eletrico_real_usd, custo_eletrico_pred_usd]

# Bounds used for the y-axis limits
low, high = min(custo_eletrico_usd), max(custo_eletrico_usd)


# Bar chart of real vs. predicted electricity cost in USD
rotulos = ['Custo Real', 'Custo Previsto']
# Half of the gap between the two bars, used as padding on the y axis
margem = 0.5*(high-low)

fig = plt.figure(figsize=(13, 8))

plt.bar(rotulos, custo_eletrico_usd)
plt.ylabel('USD')
plt.title('Custo Eletrico')
plt.ylim([ceil(low - margem) - 1, ceil(high + margem) + 1])
adicionaLabels(rotulos, custo_eletrico_usd)

plt.savefig('../analises/barra_custo_real_previsto_dolar.png')
plt.show()

Feature	Descrição	Unidade
date	Data no formato ano-mês-dia hora:minutos:segundos.
Appliances	Consumo de energia. Variavel Target.	Wh (Watt-Hora)
lights	Consumo de energia de luminárias.	Wh (Watt-Hora)
T1	Temperatura na Cozinha.	Celsius
RH_1	Umidade Relativa na Cozinha.	%
T2	Temperatura na Sala de Estar.	Celsius
RH_2	Umidade Relativa na Sala de Estar.	%
T3	Temperatura na Lavanderia.	Celsius
RH_3	Umidade Relativa na Lavanderia.	%
T4	Temperatura no Escritório.	Celsius
RH_4	Umidade Relativa no Escritório.	%
T5	Temperatura no Banheiro.	Celsius
RH_5	Umidade Relativa no Banheiro.	%
T6	Temperatura Externa Lado Norte.	Celsius
RH_6	Umidade Relativa Externa Lado Norte.	%
T7	Temperatura na Sala de Passar Roupa.	Celsius
RH_7	Umidade Relativa na Sala de Passar Roupa.	%
T8	Temperatura no Quarto do Adolescente.	Celsius
RH_8	Umidade Relativa no Quarto do Adolescente.	%
T9	Temperatura no Quarto dos Pais.	Celsius
RH_9	Umidade Relativa no Quarto dos Pais.	%
T_out	Temperatura Externa.	Celsius
Press_mm_hg	Pressão.	mmHg
RH_out	Umidade Relativa Externa.	%
Windspeed	Velocidade do Vento.	m/s
Visibility	Visibilidade.	km
Tdewpoint	Ponto de Saturação.	Celsius
rv1	Variável Randômica.
rv2	Variável Randômica.
NSM	Segundos desde a meia-noite.
WeekStatus	Indicativo de Dia da Semana ou Final de Semana.
Day_of_week	Indicativo de Segunda à Domingo.

	date	Appliances	lights	T1	RH_1	T2	RH_2	T3	RH_3	T4	...	Press_mm_hg	RH_out	Windspeed	Visibility	Tdewpoint	rv1	rv2	NSM	WeekStatus	Day_of_week
0	2016-01-11 17:00:00	60	30	19.89	47.596667	19.2	44.790000	19.79	44.730000	19.000000	...	733.5	92.0	7.000000	63.000000	5.3	13.275433	13.275433	61200	Weekday	Monday
1	2016-01-11 17:10:00	60	30	19.89	46.693333	19.2	44.722500	19.79	44.790000	19.000000	...	733.6	92.0	6.666667	59.166667	5.2	18.606195	18.606195	61800	Weekday	Monday
2	2016-01-11 17:20:00	50	30	19.89	46.300000	19.2	44.626667	19.79	44.933333	18.926667	...	733.7	92.0	6.333333	55.333333	5.1	28.642668	28.642668	62400	Weekday	Monday
3	2016-01-11 17:40:00	60	40	19.89	46.333333	19.2	44.530000	19.79	45.000000	18.890000	...	733.9	92.0	5.666667	47.666667	4.9	10.084097	10.084097	63600	Weekday	Monday
4	2016-01-11 17:50:00	50	40	19.89	46.026667	19.2	44.500000	19.79	44.933333	18.890000	...	734.0	92.0	5.333333	43.833333	4.8	44.919484	44.919484	64200	Weekday	Monday

	date	Appliances	lights	T1	RH_1	T2	RH_2	T3	RH_3	T4	...	Press_mm_hg	RH_out	Windspeed	Visibility	Tdewpoint	rv1	rv2	NSM	WeekStatus	Day_of_week
0	2016-01-11 17:30:00	50	40	19.890000	46.066667	19.200000	44.590000	19.79	45.000000	18.89	...	733.800000	92.000000	6.000000	51.5	5.000000	45.410389	45.410389	63000	Weekday	Monday
1	2016-01-11 18:00:00	60	50	19.890000	45.766667	19.200000	44.500000	19.79	44.900000	18.89	...	734.100000	92.000000	5.000000	40.0	4.700000	47.233763	47.233763	64800	Weekday	Monday
2	2016-01-11 18:40:00	230	70	19.926667	45.863333	19.356667	44.400000	19.79	44.900000	18.89	...	734.366667	91.333333	5.666667	40.0	4.633333	10.298729	10.298729	67200	Weekday	Monday
3	2016-01-11 18:50:00	580	60	20.066667	46.396667	19.426667	44.400000	19.79	44.826667	19.00	...	734.433333	91.166667	5.833333	40.0	4.616667	8.827838	8.827838	67800	Weekday	Monday
4	2016-01-11 19:30:00	100	10	20.566667	53.893333	20.033333	46.756667	20.10	48.466667	19.00	...	734.850000	89.500000	6.000000	40.0	4.350000	24.884962	24.884962	70200	Weekday	Monday

	date	Appliances	lights	T1	RH_1	T2	RH_2	T3	RH_3	T4	...	Press_mm_hg	RH_out	Windspeed	Visibility	Tdewpoint	rv1	rv2	NSM	WeekStatus	Day_of_week
0	2016-01-11 17:00:00	60	30	19.89	47.596667	19.2	44.790000	19.79	44.730000	19.000000	...	733.5	92.0	7.000000	63.000000	5.3	13.275433	13.275433	61200	Weekday	Monday
1	2016-01-11 17:10:00	60	30	19.89	46.693333	19.2	44.722500	19.79	44.790000	19.000000	...	733.6	92.0	6.666667	59.166667	5.2	18.606195	18.606195	61800	Weekday	Monday
2	2016-01-11 17:20:00	50	30	19.89	46.300000	19.2	44.626667	19.79	44.933333	18.926667	...	733.7	92.0	6.333333	55.333333	5.1	28.642668	28.642668	62400	Weekday	Monday
3	2016-01-11 17:40:00	60	40	19.89	46.333333	19.2	44.530000	19.79	45.000000	18.890000	...	733.9	92.0	5.666667	47.666667	4.9	10.084097	10.084097	63600	Weekday	Monday
4	2016-01-11 17:50:00	50	40	19.89	46.026667	19.2	44.500000	19.79	44.933333	18.890000	...	734.0	92.0	5.333333	43.833333	4.8	44.919484	44.919484	64200	Weekday	Monday

	date	Appliances	lights	T1	RH_1	T2	RH_2	T3	RH_3	T4	...	Press_mm_hg	RH_out	Windspeed	Visibility	Tdewpoint	rv1	rv2	NSM	WeekStatus	Day_of_week
0	2016-01-11 17:00:00	60	30	19.89	47.596667	19.2	44.790000	19.79	44.730000	19.000000	...	733.5	92.0	7.000000	63.000000	5.3	13.275433	13.275433	61200	Weekday	Monday
1	2016-01-11 17:10:00	60	30	19.89	46.693333	19.2	44.722500	19.79	44.790000	19.000000	...	733.6	92.0	6.666667	59.166667	5.2	18.606195	18.606195	61800	Weekday	Monday
2	2016-01-11 17:20:00	50	30	19.89	46.300000	19.2	44.626667	19.79	44.933333	18.926667	...	733.7	92.0	6.333333	55.333333	5.1	28.642668	28.642668	62400	Weekday	Monday
3	2016-01-11 17:40:00	60	40	19.89	46.333333	19.2	44.530000	19.79	45.000000	18.890000	...	733.9	92.0	5.666667	47.666667	4.9	10.084097	10.084097	63600	Weekday	Monday
4	2016-01-11 17:50:00	50	40	19.89	46.026667	19.2	44.500000	19.79	44.933333	18.890000	...	734.0	92.0	5.333333	43.833333	4.8	44.919484	44.919484	64200	Weekday	Monday

	Appliances	lights	T1	RH_1	T2	RH_2	T3	RH_3	T4	RH_4	...	RH_9	T_out	Press_mm_hg	RH_out	Windspeed	Visibility	Tdewpoint	rv1	rv2	NSM
count	19735.000000	19735.000000	19735.000000	19735.000000	19735.000000	19735.000000	19735.000000	19735.000000	19735.000000	19735.000000	...	19735.000000	19735.000000	19735.000000	19735.000000	19735.000000	19735.000000	19735.000000	19735.000000	19735.000000	19735.000000
mean	97.694958	3.801875	21.686571	40.259739	20.341219	40.420420	22.267611	39.242500	20.855335	39.026904	...	41.552401	7.411665	755.522602	79.750418	4.039752	38.330834	3.760707	24.988033	24.988033	42907.129465
std	102.524891	7.935988	1.606066	3.979299	2.192974	4.069813	2.006111	3.254576	2.042884	4.341321	...	4.151497	5.317409	7.399441	14.901088	2.451221	11.794719	4.194648	14.496634	14.496634	24940.020831
min	10.000000	0.000000	16.790000	27.023333	16.100000	20.463333	17.200000	28.766667	15.100000	27.660000	...	29.166667	-5.000000	729.300000	24.000000	0.000000	1.000000	-6.600000	0.005322	0.005322	0.000000
25%	50.000000	0.000000	20.760000	37.333333	18.790000	37.900000	20.790000	36.900000	19.530000	35.530000	...	38.500000	3.666667	750.933333	70.333333	2.000000	29.000000	0.900000	12.497889	12.497889	21600.000000
50%	60.000000	0.000000	21.600000	39.656667	20.000000	40.500000	22.100000	38.530000	20.666667	38.400000	...	40.900000	6.916667	756.100000	83.666667	3.666667	40.000000	3.433333	24.897653	24.897653	43200.000000
75%	100.000000	0.000000	22.600000	43.066667	21.500000	43.260000	23.290000	41.760000	22.100000	42.156667	...	44.338095	10.408333	760.933333	91.666667	5.500000	40.000000	6.566667	37.583769	37.583769	64200.000000
max	1080.000000	70.000000	26.260000	63.360000	29.856667	56.026667	29.236000	50.163333	26.200000	51.090000	...	53.326667	26.100000	772.300000	100.000000	14.000000	66.000000	15.500000	49.996530	49.996530	85800.000000

	Features	VIF
0	lights	1.598469
1	T1	3664.675604
2	RH_1	1675.029726
3	T2	2506.573416
4	RH_2	2171.779385
5	T3	1245.642279
6	RH_3	1569.504538
7	T4	1029.698072
8	RH_4	1413.473620
9	T5	1188.789675
10	RH_5	45.918059
11	T6	89.683180
12	RH_6	40.393545
13	T7	1614.407298
14	RH_7	519.500020
15	T8	998.895521
16	RH_8	639.327424
17	T9	2680.167167
18	RH_9	683.068683
19	T_out	399.842239
20	Press_mm_hg	2141.265529
21	RH_out	1304.357699
22	Windspeed	5.265026
23	Visibility	12.033046
24	Tdewpoint	133.401568
25	NSM	8.269484

	Features	VIF
0	lights	1.535229
1	RH_1	1069.740361
2	RH_2	598.778204
3	T3	931.937502
4	RH_3	1490.346576
5	T4	789.808421
6	RH_4	1229.064681
7	T5	974.129890
8	RH_5	45.234874
9	T6	77.496936
10	RH_6	37.272565
11	T7	1164.586860
12	RH_7	485.753884
13	T8	855.024234
14	RH_8	629.617636
15	RH_9	676.128304
16	T_out	219.666115
17	RH_out	497.869417
18	Windspeed	5.072104
19	Visibility	11.966896
20	Tdewpoint	39.010393
21	NSM	7.406162

	Features	VIF
0	lights	1.474037
1	RH_2	306.256402
2	T3	914.791631
3	T4	647.955577
4	T5	889.230921
5	RH_5	44.725359
6	T6	67.201155
7	RH_6	33.332515
8	RH_7	391.146600
9	T8	590.154997
10	RH_8	490.277232
11	RH_9	545.305393
12	T_out	200.633878
13	RH_out	464.902227
14	Windspeed	4.980792
15	Visibility	11.944881
16	Tdewpoint	31.836147
17	NSM	5.924309

	Features	VIF
0	lights	1.386041
1	RH_5	36.610124
2	T6	59.583160
3	RH_6	13.399596
4	T_out	76.338804
5	RH_out	43.804281
6	Windspeed	4.543545
7	Visibility	11.775606
8	Tdewpoint	7.900114
9	NSM	4.957419

	Features	VIF
0	lights	1.354529
1	RH_6	2.725191
2	Windspeed	3.558092
3	Tdewpoint	1.714626
4	NSM	2.965764

Dep. Variable:	y	R-squared:	0.167
Model:	OLS	Adj. R-squared:	0.166
Method:	Least Squares	F-statistic:	152.3
Date:	Sat, 07 Aug 2021	Prob (F-statistic):	0.00
Time:	12:29:09	Log-Likelihood:	-1.1757e+05
No. Observations:	19735	AIC:	2.352e+05
Df Residuals:	19708	BIC:	2.354e+05
Df Model:	26
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[0.025	0.975]
const	-8.2144	94.756	-0.087	0.931	-193.945	177.516
lights	1.9099	0.096	19.944	0.000	1.722	2.098
T1	-2.4021	1.855	-1.295	0.195	-6.039	1.235
RH_1	14.4922	0.675	21.482	0.000	13.170	15.815
T2	-18.0111	1.632	-11.040	0.000	-21.209	-14.813
RH_2	-13.3635	0.766	-17.438	0.000	-14.866	-11.861
T3	26.0199	1.054	24.697	0.000	23.955	28.085
RH_3	4.8021	0.674	7.127	0.000	3.481	6.123
T4	-3.1388	1.024	-3.066	0.002	-5.146	-1.132
RH_4	-0.7810	0.640	-1.220	0.223	-2.036	0.474
T5	-0.3787	1.172	-0.323	0.747	-2.677	1.919
RH_5	0.0617	0.087	0.707	0.480	-0.109	0.233
T6	7.5680	0.636	11.906	0.000	6.322	8.814
RH_6	0.2863	0.068	4.232	0.000	0.154	0.419
T7	1.8028	1.321	1.364	0.172	-0.787	4.393
RH_7	-1.4776	0.429	-3.445	0.001	-2.318	-0.637
T8	7.4096	0.972	7.626	0.000	5.505	9.314
RH_8	-3.7938	0.390	-9.739	0.000	-4.557	-3.030
T9	-13.2966	1.792	-7.422	0.000	-16.808	-9.785
RH_9	-0.1390	0.421	-0.330	0.741	-0.964	0.686
T_out	-10.1001	1.519	-6.650	0.000	-13.077	-7.123
Press_mm_hg	0.1501	0.107	1.405	0.160	-0.059	0.359
RH_out	-0.8495	0.315	-2.696	0.007	-1.467	-0.232
Windspeed	1.7940	0.345	5.200	0.000	1.118	2.470
Visibility	0.1525	0.058	2.645	0.008	0.039	0.266
Tdewpoint	3.8372	1.483	2.587	0.010	0.930	6.744
NSM	0.0003	3.87e-05	7.614	0.000	0.000	0.000

Omnibus:	13963.593	Durbin-Watson:	0.876
Prob(Omnibus):	0.000	Jarque-Bera (JB):	211283.363
Skew:	3.321	Prob(JB):	0.00
Kurtosis:	17.588	Cond. No.	7.06e+06

Dep. Variable:	y	R-squared:	0.080
Model:	OLS	Adj. R-squared:	0.079
Method:	Least Squares	F-statistic:	341.1
Date:	Sat, 07 Aug 2021	Prob (F-statistic):	0.00
Time:	12:29:09	Log-Likelihood:	-1.1856e+05
No. Observations:	19735	AIC:	2.371e+05
Df Residuals:	19729	BIC:	2.372e+05
Df Model:	5
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[0.025	0.975]
const	68.4136	2.408	28.414	0.000	63.694	73.133
lights	2.1715	0.093	23.246	0.000	1.988	2.355
RH_6	-0.3018	0.025	-12.299	0.000	-0.350	-0.254
Windspeed	3.0766	0.293	10.502	0.000	2.502	3.651
Tdewpoint	-0.3754	0.175	-2.142	0.032	-0.719	-0.032
NSM	0.0006	3.01e-05	20.496	0.000	0.001	0.001

	Appliances	lights	T1	RH_1	T2	RH_2	T3	RH_3	T4	RH_4	...	RH_out	Windspeed	Visibility	Tdewpoint	NSM	Month	Day	Hour
0	60.0	3.301264	-1.139072	1.863478	-0.528718	1.092582	-1.245155	1.686863	-0.912635	1.506438	...	0.828905	1.226233	1.799947	0.367016	0.733493	1	11	17
1	60.0	3.301264	-1.139072	1.634348	-0.528718	1.075633	-1.245155	1.705307	-0.912635	1.604528	...	0.828905	1.088599	1.799947	0.343175	0.757551	1	11	17
2	50.0	3.301264	-1.139072	1.534580	-0.528718	1.051570	-1.245155	1.749367	-0.948663	1.580918	...	0.828905	0.950965	1.687977	0.319333	0.781610	1	11	17
3	60.0	4.561378	-1.139072	1.543034	-0.528718	1.027297	-1.245155	1.769859	-0.966677	1.497991	...	0.828905	0.675697	0.952175	0.271649	0.829726	1	11	17
4	50.0	4.561378	-1.139072	1.465249	-0.528718	1.019764	-1.245155	1.749367	-0.966677	1.544061	...	0.828905	0.538063	0.584273	0.247807	0.853785	1	11	17

	data	feriado
0	2016-01-01	Nieuwjaarsdag
1	2016-03-27	Pasen
2	2016-03-28	Paasmaandag
3	2016-05-05	O.L.H. Hemelvaart
4	2016-05-15	Pinksteren

	Appliances	lights	T1	RH_1	T2	RH_2	T3	RH_3	T4	RH_4	...	Windspeed	Visibility	Tdewpoint	NSM	Month	Day	Hour
0	60.0	3.301264	-1.139072	1.863478	-0.528718	1.092582	-1.245155	1.686863	-0.912635	1.506438	...	1.226233	1.799947	0.367016	0.733493	1	11	17
1	60.0	3.301264	-1.139072	1.634348	-0.528718	1.075633	-1.245155	1.705307	-0.912635	1.604528	...	1.088599	1.799947	0.343175	0.757551	1	11	17
2	50.0	3.301264	-1.139072	1.534580	-0.528718	1.051570	-1.245155	1.749367	-0.948663	1.580918	...	0.950965	1.687977	0.319333	0.781610	1	11	17
3	60.0	4.561378	-1.139072	1.543034	-0.528718	1.027297	-1.245155	1.769859	-0.966677	1.497991	...	0.675697	0.952175	0.271649	0.829726	1	11	17
4	50.0	4.561378	-1.139072	1.465249	-0.528718	1.019764	-1.245155	1.749367	-0.966677	1.544061	...	0.538063	0.584273	0.247807	0.853785	1	11	17

Omnibus:	14148.097	Durbin-Watson:	0.763
Prob(Omnibus):	0.000	Jarque-Bera (JB):	206087.033
Skew:	3.405	Prob(JB):	0.00
Kurtosis:	17.292	Cond. No.	1.71e+05

	T3	RH_3	T8	Press_mm_hg	NSM	Hour
0	-1.245155	1.686863	-1.961010	-2.684267	0.733493	17
1	-1.245155	1.705307	-1.961010	-2.684267	0.757551	17
2	-1.245155	1.749367	-1.961010	-2.684267	0.781610	17
3	-1.245155	1.769859	-2.012209	-2.684267	0.829726	17
4	-1.245155	1.749367	-2.012209	-2.684267	0.853785	17

1. Problema de Negócio¶

2. Imports¶

2.1 Ambiente¶

3. Carregamento dos Dados¶

4. Analise Exploratoria¶

4.2 Geração de plots e insights¶

4.3 Distribuição dos Dados¶

4.3.1 Teste normal de D'Agostino¶

4.4 Avaliando MultiColinearidade¶

4.5 Simetria dos Dados¶

4.5.1 Skewness¶

4.5.2 Histograma¶

4.5.3 Excesso de Kurtosis¶

4.6 Analise Temporal¶

4.6.1 Pre-Processamento colunas temporais¶

4.6.2 Analise Temporal de Gasto Energia¶

5. Pre-Processamento¶

5.1 Removendo Colunas Desnecessárias¶

5.2 Detectando Outliers¶

5.3 Tratando Outliers¶

5.4 Feature Scaling¶

5.4.1 Aplicando Normalização¶

5.4.2 Analisando Dados Pós Normalização¶

5.5 Incremento nas Features¶

6. Feature Selecting¶

6.1 Select From Model - Random Forest¶

6.2 Random Forest - Feature Importance¶

6.3 Regressão LASSO¶

6.4 Recursive Feature Elimination (RFE) - Linear SVR¶

6.5 Analisando Seleção¶

6.5.1 Random Forest¶

6.5.2 LASSO¶

6.5.3 RFE - Linear SVR¶

7. Modelagem Preditiva¶

7.1 Definindo Ambiente¶

7.2 SVR¶

7.2.1 Conclusão SVR¶

7.2.2 Executando Melhor Modelo¶

7.2.3 Avaliando SVR¶

7.3 CatBoost Regressor¶

7.3.1 Definindo Ambiente¶

7.3.2 Iniciando Modelagem¶

7.3.3 Conclusão CatBoostRegressor¶

7.3.4 Execução Melhor Modelo¶

7.3.5 Salvando Modelo¶

7.3.6 Avaliando CatBoostRegressor¶

8. Conclusão¶