Predicting French electricity consumption — further analysis of French energy data:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
The goal of this section is to extract data from the hourly consumption records and the hourly weather information. The result will be a single table.
This table will then be split for machine learning, with the consumption at day d as the output to predict and the other columns as inputs.
# Load the raw regional consumption file, keep only date / region / electric
# consumption, then aggregate to one national total per day.
consumption = pd.read_csv(
    'consommation-quotidienne-brute-regionale.csv',
    sep=';',
    parse_dates=[0],
)
# Gas-related columns and redundant date/status columns are not needed here.
unused_columns = [
    'Date', 'Heure', 'Code INSEE région', 'Statut - GRTgaz',
    'Consommation brute gaz (MW PCS 0°C) - GRTgaz', 'Statut - Teréga',
    'Consommation brute gaz (MW PCS 0°C) - Teréga', 'Statut - RTE',
]
consumption = consumption.drop(columns=unused_columns)
consumption.columns = ['date', 'region', 'consumption (MWh)']
# Sum every (region, hour) record that falls on the same calendar day.
consumption = consumption.set_index('date')
consumption = consumption.groupby(by=pd.Grouper(freq='D')).sum()
# Tri-hourly solar radiation and wind speed per region -> daily national means.
sw = pd.read_csv(
    'rayonnement-solaire-vitesse-vent-tri-horaires-regionaux.csv',
    sep=';',
    parse_dates=[0],
)
sw = sw.drop(columns=['Code INSEE région'])
sw.columns = ['date', 'region', 'wind_speed (m/s)', 'solar_radiation (W/m2)']
sw = sw.set_index('date')
# Separate the two measurements and average each over all regions per day.
solar = sw.drop(columns=['wind_speed (m/s)'])
solar = solar.groupby(by=pd.Grouper(freq='D')).mean()
wind = sw.drop(columns=['solar_radiation (W/m2)'])
wind = wind.groupby(by=pd.Grouper(freq='D')).mean()
# Daily regional temperatures -> one national mean temperature per day.
temp = pd.read_csv('temperature-quotidienne-regionale.csv', sep=';', parse_dates=[0])
# Only the daily mean temperature is kept; min/max are discarded.
temp = temp.drop(columns=['Code INSEE région', 'Région', 'TMin (°C)', 'TMax (°C)'])
temp.columns = ['date', 'temperature (°C)']
temp = temp.set_index('date')
temp = temp.groupby(by=pd.Grouper(freq='D')).mean()
# Join the daily consumption, solar, wind and temperature tables on their date
# index; the inner joins keep only days present in all four sources.
data = pd.merge(consumption, solar, how='inner', left_index=True, right_index=True)
data = pd.merge(data, wind, how='inner', left_index=True, right_index=True)
data = pd.merge(data, temp, how='inner', left_index=True, right_index=True)
# Calendar features: month (1-12) and day of week (0 = Monday).
data['month'] = data.index.month
data['day_week'] = data.index.dayofweek
# Keep the full consumption series before rows are dropped; it is the source
# of the lagged features below.
c = data['consumption (MWh)']
# Drop the first four days: they have no complete d-1..d-4 history.
# NOTE(review): assumes the merged index starts exactly on 2016-01-01 and is
# contiguous (daily, no gaps) -- confirm against the source files, otherwise
# the positional slices below are misaligned.
data.drop(pd.date_range(start='2016-01-01', freq='D', periods=4), inplace=True)
# Lagged consumption features by positional alignment: for the remaining row
# at day d (position i+4 in c), c[3:-1][i] is day d-1, c[2:-2][i] is d-2, etc.
data['consumption d-1'] = c[3:-1].values
data['consumption d-2'] = c[2:-2].values
data['consumption d-3'] = c[1:-3].values
data['consumption d-4'] = c[0:-4].values
# Preview the assembled table (only displayed in a notebook).
data.head(10)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Standardize features and target separately (zero mean, unit variance).
X_scaller = StandardScaler()
y_scaller = StandardScaler()
features = data.drop(['consumption (MWh)'], axis=1).values
X = X_scaller.fit_transform(features)
# StandardScaler expects 2-D input, hence the reshape; ravel back to 1-D.
target_column = data['consumption (MWh)'].values.reshape(-1, 1)
y = y_scaller.fit_transform(target_column).ravel()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=456)
We will now test several different algorithms with different parameters.
from sklearn.metrics import mean_squared_error, make_scorer
def fit_and_compute_score(model, X_tr=None, y_tr=None, X_te=None, y_te=None):
    """
    Fit *model* on training data, then compute its score on train and test sets.

    Generalized so the data can be passed explicitly; when omitted, the
    module-level X_train / y_train / X_test / y_test splits are used, which
    keeps the original call sites working unchanged.

    :param model: machine learning model implementing the sklearn interface
        fit(X, y) and score(X, y).
    :param X_tr: optional training features (defaults to global X_train).
    :param y_tr: optional training targets (defaults to global y_train).
    :param X_te: optional test features (defaults to global X_test).
    :param y_te: optional test targets (defaults to global y_test).
    :return: tuple (train_score, test_score) -- R^2 for regressors.
    """
    if X_tr is None:
        # Fall back to the module-level split built earlier in the notebook.
        X_tr, y_tr, X_te, y_te = X_train, y_train, X_test, y_test
    model.fit(X_tr, y_tr)
    return model.score(X_tr, y_tr), model.score(X_te, y_te)
from sklearn.svm import SVR

# Sweep the SVR regularization strength C and record train/test R^2.
C_list = [1, 5, 10, 15, 20, 25, 35, 45, 60]
svr_train, svr_test = [], []
for c in C_list:
    score_train, score_test = fit_and_compute_score(SVR(kernel='rbf', gamma=0.01, C=c))
    svr_train.append(score_train)
    svr_test.append(score_test)

# Plot both curves to spot over/under-fitting as C grows.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(C_list, svr_train, label='Train score')
ax.plot(C_list, svr_test, label='Test score')
ax.set_title('R^2 Score for SVR')
ax.set_xlabel('C values')
fig.legend()
plt.show()
from sklearn.ensemble import RandomForestRegressor

# Sweep the maximum tree depth with 200 trees per forest.
# NOTE(review): criterion='mse' was renamed 'squared_error' and removed in
# scikit-learn 1.2; kept as-is for the sklearn version this notebook targets.
depth_list = np.arange(2, 20)
forest_train, forest_test = [], []
for d in depth_list:
    model = RandomForestRegressor(n_estimators=200, criterion='mse', max_depth=d,
                                  n_jobs=4, oob_score=True)
    score_train, score_test = fit_and_compute_score(model)
    forest_train.append(score_train)
    forest_test.append(score_test)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(depth_list, forest_train, label='Train score')
ax.plot(depth_list, forest_test, label='Test score')
ax.set_title('R^2 Score for Random forest')
ax.set_xlabel('forest depth')
fig.legend()
plt.show()
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

# Sweep the polynomial degree of a polynomial-features + linear-regression
# pipeline (intercept disabled: PolynomialFeatures already adds the bias term).
degree_list = [1, 2, 3, 4, 5, 6]
poly_train, poly_test = [], []
for d in degree_list:
    model = Pipeline([('poly', PolynomialFeatures(degree=d)),
                      ('linear', LinearRegression(fit_intercept=False))])
    score_train, score_test = fit_and_compute_score(model)
    poly_train.append(score_train)
    poly_test.append(score_test)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(degree_list, poly_train, label='Train score')
ax.plot(degree_list, poly_test, label='Test score')
ax.set_title('R^2 Score for polynome regression')
ax.set_xlabel('polynome degree')
fig.legend()
plt.show()
from sklearn.neighbors import KNeighborsRegressor

# Sweep the number of neighbors for k-NN regression.
k_list = np.arange(1, 15)
knn_train, knn_test = [], []
for k in k_list:
    score_train, score_test = fit_and_compute_score(KNeighborsRegressor(n_neighbors=k))
    knn_train.append(score_train)
    knn_test.append(score_test)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(k_list, knn_train, label='Train score')
ax.plot(k_list, knn_test, label='Test score')
ax.set_title('R^2 Score for knn regression')
ax.set_xlabel('k neighbors')
fig.legend()
plt.show()
from dnn import DeepNeuronalNetwork

# Grid-search the sizes of the two hidden layers of the project's custom DNN.
sizes_l1 = [2, 3, 4, 5, 6, 7, 8, 9, 10]
sizes_l2 = [2, 3, 4, 5, 6, 7, 8, 9, 10]
dnn_train = []
dnn_test = []
for l1 in sizes_l1:
    dnn_train_size = []
    dnn_test_size = []
    for l2 in sizes_l2:
        model = DeepNeuronalNetwork(layers_size=[l1, l2], learning_rate=0.01,
                                    n_iteration=400, batch_size=30)
        train, test = fit_and_compute_score(model)
        dnn_train_size.append(train)
        dnn_test_size.append(test)
    dnn_train.append(dnn_train_size)
    dnn_test.append(dnn_test_size)

def _show_score_grid(ax, scores, title):
    """Heat-map one score grid: rows = first layer size, cols = second."""
    im = ax.imshow(scores)
    ax.figure.colorbar(im, ax=ax)
    ax.set_yticks(np.arange(len(sizes_l1)))
    ax.set_yticklabels(sizes_l1)
    # BUGFIX: axes were labelled 'Neuronal Network size as [10*s, s]' and
    # 'Number of iterations', but the grid is indexed by the two
    # hidden-layer sizes swept above.
    ax.set_ylabel('First hidden layer size')
    ax.set_xticks(np.arange(len(sizes_l2)))
    ax.set_xticklabels(sizes_l2)
    ax.set_xlabel('Second hidden layer size')
    ax.set_title(title)

fig, (l, r) = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))
_show_score_grid(l, dnn_train, 'Train model score')
_show_score_grid(r, dnn_test, 'Test model score')
plt.show()
According to the previous section we can select the most efficient parameters for each algorithm. We will then aggregate the results and analyze performance.
# Refit each algorithm with the parameters selected above, predict the test
# set and keep the per-sample squared error (in scaled units).
def _fit_predict(model):
    """Fit on the training split; return (test predictions, squared errors)."""
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)
    return prediction, np.power(prediction - y_test, 2)

svr = SVR(kernel='rbf', gamma=0.01, C=50)
y_svr, error_svr = _fit_predict(svr)

# NOTE(review): criterion='mse' was renamed 'squared_error' and removed in
# scikit-learn 1.2; kept unchanged for the sklearn version used here.
forest = RandomForestRegressor(n_estimators=200, criterion='mse', max_depth=10,
                               n_jobs=4, oob_score=True)
y_forest, error_forest = _fit_predict(forest)

poly = Pipeline([('poly', PolynomialFeatures(degree=3)),
                 ('linear', LinearRegression(fit_intercept=False))])
y_poly, error_poly = _fit_predict(poly)

knn = KNeighborsRegressor(n_neighbors=4)
y_knn, error_knn = _fit_predict(knn)

dnn = DeepNeuronalNetwork(layers_size=[10, 10], learning_rate=0.001,
                          n_iteration=1000, batch_size=30)
y_dnn, error_dnn = _fit_predict(dnn)

# Simple average of the four classic models (the DNN is deliberately left out).
y_agg = (y_svr + y_forest + y_poly + y_knn) / 4
error_agg = np.power(y_agg - y_test, 2)
def split_resize(y, split):
    """
    Undo the target scaling on the first *split* values of a scaled series.

    :param y: 1-D array of scaled target values (predictions or ground truth).
    :param split: number of leading values to keep.
    :return: 1-D numpy array of consumption values back in MWh.
    """
    # BUGFIX: StandardScaler.inverse_transform requires 2-D input in modern
    # scikit-learn; reshape to a column, then flatten back to 1-D so callers
    # (scatter plots) see the same shape as before.
    head = np.asarray(y[:split]).reshape(-1, 1)
    return y_scaller.inverse_transform(head).ravel()
# Compare the first `split` test-set predictions of every model to the truth.
split = 50
t = np.arange(split)  # CONSISTENCY: was a hard-coded 50, now tied to `split`
fig, ax = plt.subplots(figsize=(8, 10))
# Large circles: truth and aggregated prediction; small triangles: each model.
ax.scatter(split_resize(y_test, split), t, c='orange', marker='o', s=100, alpha=1, label='true')
ax.scatter(split_resize(y_agg, split), t, c='cyan', marker='o', s=100, alpha=1, label='aggregated')
ax.scatter(split_resize(y_svr, split), t, c='green', marker='^', s=30, alpha=0.8, label='SVR')
ax.scatter(split_resize(y_forest, split), t, c='red', marker='^', s=30, alpha=0.8, label='forest')
ax.scatter(split_resize(y_poly, split), t, c='blue', marker='^', s=30, alpha=0.8, label='poly')
ax.scatter(split_resize(y_knn, split), t, c='pink', marker='^', s=30, alpha=0.8, label='knn')
# ax.scatter(split_resize(y_dnn, split), t, c='purple', marker='^', s=30, alpha=0.8, label='dnn')
ax.set_yticklabels([])
ax.set_xlabel('Consumption (MWh)')
# BUGFIX: title typo 'Presicion' -> 'Precision'.
ax.set_title('Precision of ML results')
fig.legend(loc=(0.8, 0.48))
plt.show()
# Kernel-density estimate of the squared prediction error per model
# (errors are in scaled units, hence the [0, 0.2] x-range).
# BUGFIX: removed the unused `bins` variable (kdeplot takes no such argument)
# and fixed the 'squre' typo in the title.
fig, ax = plt.subplots(figsize=(12, 6))
sns.kdeplot(error_svr, ax=ax, label='SVR')
sns.kdeplot(error_forest, ax=ax, label='forest')
sns.kdeplot(error_poly, ax=ax, label='poly')
sns.kdeplot(error_knn, ax=ax, label='KNN')
sns.kdeplot(error_agg, ax=ax, label='Agg')
ax.set_xlim([0, 0.2])
ax.set_xlabel('Error')
ax.set_yticks([])
ax.set_title('KDE distribution of square error')
plt.show()