Predicting French Electricity Consumption

Predicting French electricity consumption. More analysis of French energy:

In [2]:
import numpy as np
import pandas as pd
In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
In [4]:
%matplotlib inline

Construct the input data

The goal of this section is to build a dataset from the hourly consumption records and the regional weather data. The result will be a single table with:

  • Consumption d
  • Consumption d-1
  • Consumption d-2
  • Consumption d-3
  • Consumption d-4
  • Solar radiation
  • Wind speed
  • Temperature

This table will then be split for machine learning, with consumption d as the output to predict and the remaining columns as inputs.
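The lagged columns can be built directly with pandas shift. Below is a minimal sketch on hypothetical toy data (the notebook builds the same columns by slicing further down):

import pandas as pd

# Toy daily series; the values are illustrative only.
toy = pd.DataFrame({'consumption (MWh)': [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]},
                   index=pd.date_range('2016-01-01', periods=6, freq='D'))

# shift(k) pushes values k rows down, so row d receives the value of day d-k.
for k in range(1, 5):
    toy['consumption d-%d' % k] = toy['consumption (MWh)'].shift(k)

# The first 4 rows lack complete lags; drop them, as the notebook does below.
toy = toy.dropna()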

Consumption

In [5]:
consumption = pd.read_csv('consommation-quotidienne-brute-regionale.csv',
                          sep=';',
                          parse_dates=[0])
# Keep only the electricity consumption: drop the gas columns and status flags.
consumption.drop(['Date', 'Heure', 'Code INSEE région', 'Statut - GRTgaz',
                  'Consommation brute gaz (MW PCS 0°C) - GRTgaz', 'Statut - Teréga',
                  'Consommation brute gaz (MW PCS 0°C) - Teréga', 'Statut - RTE'],
                 axis=1, inplace=True)
consumption.columns = ['date', 'region', 'consumption (MWh)']
consumption.set_index('date', inplace=True)
# Sum the hourly regional values into one national total per day.
consumption = consumption.groupby(by=pd.Grouper(freq='D')).sum()

Wind and solar

In [6]:
sw = pd.read_csv('rayonnement-solaire-vitesse-vent-tri-horaires-regionaux.csv',
                 sep=';',
                 parse_dates=[0])
sw.drop(['Code INSEE région'], axis=1, inplace=True)
sw.columns = ['date', 'region', 'wind_speed (m/s)', 'solar_radiation (W/m2)']
sw.set_index('date', inplace=True)
In [7]:
solar = sw.drop('wind_speed (m/s)', axis=1)
solar = solar.groupby(by=pd.Grouper(freq='D')).mean()
In [8]:
wind = sw.drop('solar_radiation (W/m2)', axis=1)
wind = wind.groupby(by=pd.Grouper(freq='D')).mean()

Temperature

In [9]:
temp = pd.read_csv('temperature-quotidienne-regionale.csv', sep=';', parse_dates=[0])
temp.drop(['Code INSEE région', 'Région', 'TMin (°C)', 'TMax (°C)'], axis=1, inplace=True)
temp.columns = ['date', 'temperature (°C)']
temp.set_index('date', inplace=True)
temp = temp.groupby(by=pd.Grouper(freq='D')).mean()

Merge

In [10]:
data = pd.merge(consumption, solar, how='inner', left_index=True, right_index=True)
data = pd.merge(data, wind, how='inner', left_index=True, right_index=True)
data = pd.merge(data, temp, how='inner', left_index=True, right_index=True)
In [11]:
data['month'] = data.index.month
data['day_week'] = data.index.dayofweek
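As a side note, month and weekday are cyclical features (December is adjacent to January, Sunday to Monday). A sine/cosine encoding is a common alternative to the raw integers; a sketch on a hypothetical copy data_alt, not used in the rest of the notebook:

# Hypothetical alternative encoding: place the cyclical features on the unit circle.
data_alt = data.copy()
data_alt['month_sin'] = np.sin(2 * np.pi * data_alt['month'] / 12)
data_alt['month_cos'] = np.cos(2 * np.pi * data_alt['month'] / 12)
data_alt['dow_sin'] = np.sin(2 * np.pi * data_alt['day_week'] / 7)
data_alt['dow_cos'] = np.cos(2 * np.pi * data_alt['day_week'] / 7)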

Prepare data

In [12]:
# Keep the full consumption series, then drop the first 4 days,
# which do not have a complete set of lagged values.
c = data['consumption (MWh)']
data.drop(pd.date_range(start='2016-01-01', freq='D', periods=4), inplace=True)
# For each remaining day d, c[4-k:-k] holds the consumption of day d-k.
data['consumption d-1'] = c[3:-1].values
data['consumption d-2'] = c[2:-2].values
data['consumption d-3'] = c[1:-3].values
data['consumption d-4'] = c[0:-4].values
In [13]:
data.head(10)
Out[13]:
consumption (MWh) solar_radiation (W/m2) wind_speed (m/s) temperature (°C) month day_week consumption d-1 consumption d-2 consumption d-3 consumption d-4
date
2016-01-05 3234575.0 43.612885 7.985962 7.699231 1 1 3101717.0 2725674.0 2668997.0 2575676.0
2016-01-06 3268121.0 39.137500 7.726442 7.582308 1 2 3234575.0 3101717.0 2725674.0 2668997.0
2016-01-07 3251392.0 31.206250 7.784327 8.716154 1 3 3268121.0 3234575.0 3101717.0 2725674.0
2016-01-08 3205301.0 45.046346 8.117500 7.598462 1 4 3251392.0 3268121.0 3234575.0 3101717.0
2016-01-09 2849667.0 37.963077 8.419135 8.879231 1 5 3205301.0 3251392.0 3268121.0 3234575.0
2016-01-10 2784139.0 38.943942 8.855096 8.314615 1 6 2849667.0 3205301.0 3251392.0 3268121.0
2016-01-11 3195995.0 44.494519 8.894231 8.326154 1 0 2784139.0 2849667.0 3205301.0 3251392.0
2016-01-12 3331976.0 55.128365 8.659615 7.404615 1 1 3195995.0 2784139.0 2849667.0 3205301.0
2016-01-13 3442324.0 61.412788 7.804231 5.490000 1 2 3331976.0 3195995.0 2784139.0 2849667.0
2016-01-14 3487782.0 40.836731 7.031635 5.099231 1 3 3442324.0 3331976.0 3195995.0 2784139.0
In [14]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_scaler = StandardScaler()
y_scaler = StandardScaler()

X = X_scaler.fit_transform(data.drop(['consumption (MWh)'], axis=1).values)
y = y_scaler.fit_transform(data['consumption (MWh)'].values.reshape(-1, 1)).ravel()  # StandardScaler expects a 2D array

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=456)
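Note that fitting the scalers on the full dataset lets statistics from the test rows leak into training. A leak-free variant, shown as a sketch with hypothetical *2 names and not used below, fits the scalers on the training portion only:

# Hypothetical leak-free variant: split first, then fit the scalers on the
# training rows only (the *2 names are illustrative, not used elsewhere).
X_raw = data.drop(['consumption (MWh)'], axis=1).values
y_raw = data['consumption (MWh)'].values.reshape(-1, 1)
X_tr, X_te, y_tr, y_te = train_test_split(X_raw, y_raw, test_size=0.33, random_state=456)

X_scaler2, y_scaler2 = StandardScaler(), StandardScaler()
X_train2 = X_scaler2.fit_transform(X_tr)
X_test2 = X_scaler2.transform(X_te)
y_train2 = y_scaler2.fit_transform(y_tr).ravel()
y_test2 = y_scaler2.transform(y_te).ravel()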

Analyze different ML algorithms

We will test several algorithms with different parameters.

In [15]:
from sklearn.metrics import mean_squared_error, make_scorer
In [16]:
def fit_and_compute_score(model):
    """
    Fit the model on the training data, then compute its score on the training and test sets.
    :param model: machine learning model implementing the sklearn interface fit(X, y), predict(X), score(X, y)
    :return: R^2 scores for the training and the test data.
    """
    model.fit(X_train, y_train)
    return model.score(X_train, y_train), model.score(X_test, y_test)
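A single random split can give noisy scores. A cross-validated variant, sketched here with a hypothetical cv_score helper and not used in the rest of the notebook, averages R^2 over several folds:

from sklearn.model_selection import cross_val_score

def cv_score(model, cv=5):
    # Mean R^2 over cv folds of the full scaled dataset.
    return cross_val_score(model, X, y, cv=cv, scoring='r2').mean()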

Support vector regression

In [17]:
from sklearn.svm import SVR
In [18]:
C_list = [1, 5, 10, 15, 20, 25, 35, 45, 60]
svr_train = []
svr_test = []
for c in C_list:
    model = SVR(kernel='rbf', gamma=0.01, C=c)
    train, test = fit_and_compute_score(model)
    svr_train.append(train)
    svr_test.append(test)
In [19]:
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(C_list, svr_train, label='Train score')
ax.plot(C_list, svr_test, label='Test score')
ax.set_title('R^2 Score for SVR')
ax.set_xlabel('C values')
fig.legend()
plt.show()
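Sweeping C by hand can also be delegated to a grid search over C and gamma jointly; a minimal sketch (the gamma grid values are illustrative):

from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(SVR(kernel='rbf'),
                    param_grid={'C': C_list, 'gamma': [0.001, 0.01, 0.1]},
                    cv=5, scoring='r2')
grid.fit(X_train, y_train)
print(grid.best_params_)  # best (C, gamma) pair found on the training folds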

Random forest

In [20]:
from sklearn.ensemble import RandomForestRegressor
In [21]:
depth_list = np.arange(2, 20)
forest_train = []
forest_test = []
for d in depth_list:
    model = RandomForestRegressor(n_estimators=200, criterion='mse', max_depth=d, n_jobs=4, oob_score=True)
    train, test = fit_and_compute_score(model)
    forest_train.append(train)
    forest_test.append(test)
In [22]:
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(depth_list, forest_train, label='Train score')
ax.plot(depth_list, forest_test, label='Test score')
ax.set_title('R^2 Score for Random forest')
ax.set_xlabel('max tree depth')
fig.legend()
plt.show()
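Random forests also expose per-feature importances. A sketch for inspecting which inputs drive the prediction, assuming model still holds the last forest fitted in the loop above:

# `model` is the last forest from the loop above (max_depth=19).
feature_names = data.drop(['consumption (MWh)'], axis=1).columns
for name, imp in sorted(zip(feature_names, model.feature_importances_),
                        key=lambda p: -p[1]):
    print('%s: %.3f' % (name, imp))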

Polynomial regression

In [23]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
In [24]:
degree_list = [1, 2, 3, 4, 5, 6]
poly_train = []
poly_test = []

for d in degree_list:
    model = Pipeline([('poly', PolynomialFeatures(degree=d)),
                     ('linear', LinearRegression(fit_intercept=False))])
    train, test = fit_and_compute_score(model)
    poly_train.append(train)
    poly_test.append(test)
In [25]:
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(degree_list, poly_train, label='Train score')
ax.plot(degree_list, poly_test, label='Test score')
ax.set_title('R^2 Score for polynomial regression')
ax.set_xlabel('polynomial degree')
fig.legend()
plt.show()
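Keep in mind that the number of polynomial features grows combinatorially with the degree, which helps explain the overfitting at high degrees. A quick check:

# Count the expanded features for each degree (9 input columns).
for d in degree_list:
    n_out = PolynomialFeatures(degree=d).fit(X_train[:1]).n_output_features_
    print('degree %d -> %d features' % (d, n_out))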

KNN Regression

In [26]:
from sklearn.neighbors import KNeighborsRegressor
In [107]:
k_list = np.arange(1, 15)
knn_train = []
knn_test = []

for k in k_list:
    model = KNeighborsRegressor(n_neighbors=k)
    train, test = fit_and_compute_score(model)
    knn_train.append(train)
    knn_test.append(test)
In [108]:
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(k_list, knn_train, label='Train score')
ax.plot(k_list, knn_test, label='Test score')
ax.set_title('R^2 Score for KNN regression')
ax.set_xlabel('k neighbors')
fig.legend()
plt.show()

Neural network

In [71]:
from dnn import DeepNeuronalNetwork
In [72]:
sizes_l1 = [ 2,  3,  4,  5,  6,  7,  8,  9, 10]
sizes_l2 = [ 2,  3,  4,  5,  6,  7,  8,  9, 10]

dnn_train = []
dnn_test = []
for l1 in sizes_l1:
    dnn_train_size = []
    dnn_test_size = []
    
    for l2 in sizes_l2:
        model = DeepNeuronalNetwork(layers_size=[l1, l2], learning_rate=0.01, n_iteration=400, batch_size=30)
        train, test = fit_and_compute_score(model)
        dnn_train_size.append(train)
        dnn_test_size.append(test)
        
    dnn_train.append(dnn_train_size)
    dnn_test.append(dnn_test_size)
In [73]:
fig, (l, r) = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))

lim = l.imshow(dnn_train)
l.figure.colorbar(lim, ax=l)
l.set_yticks(np.arange(len(sizes_l1)))
l.set_yticklabels(sizes_l1)
l.set_ylabel('First hidden layer size (l1)')
l.set_xticks(np.arange(len(sizes_l2)))
l.set_xticklabels(sizes_l2)
l.set_xlabel('Second hidden layer size (l2)')
l.set_title('Train model score')

rim = r.imshow(dnn_test)
r.figure.colorbar(rim, ax=r)
r.set_yticks(np.arange(len(sizes_l1)))
r.set_yticklabels(sizes_l1)
r.set_ylabel('First hidden layer size (l1)')
r.set_xticks(np.arange(len(sizes_l2)))
r.set_xticklabels(sizes_l2)
r.set_xlabel('Second hidden layer size (l2)')
r.set_title('Test model score')

plt.show()

Aggregate the best results

Based on the previous sections, we can select the most efficient parameters for each algorithm. We then aggregate the results and analyze the performance.

In [79]:
svr = SVR(kernel='rbf', gamma=0.01, C=50)
svr.fit(X_train, y_train)
y_svr = svr.predict(X_test)
error_svr = np.power(y_svr - y_test, 2)
In [80]:
forest = RandomForestRegressor(n_estimators=200, criterion='mse', max_depth=10, n_jobs=4, oob_score=True)
forest.fit(X_train, y_train)
y_forest = forest.predict(X_test)
error_forest = np.power(y_forest - y_test, 2)
In [81]:
poly = Pipeline([('poly', PolynomialFeatures(degree=3)),
                     ('linear', LinearRegression(fit_intercept=False))])
poly.fit(X_train, y_train)
y_poly = poly.predict(X_test)
error_poly = np.power(y_poly - y_test, 2)
In [82]:
knn = KNeighborsRegressor(n_neighbors=4)
knn.fit(X_train, y_train)
y_knn = knn.predict(X_test)
error_knn = np.power(y_knn - y_test, 2)
In [83]:
dnn = DeepNeuronalNetwork(layers_size=[10, 10], learning_rate=0.001, n_iteration=1000, batch_size=30)
dnn.fit(X_train, y_train)
y_dnn = dnn.predict(X_test)
error_dnn = np.power(y_dnn - y_test, 2)
In [85]:
# Average the predictions of the four classical models (the DNN is left out).
y_agg = (y_svr + y_forest + y_poly + y_knn) / 4
error_agg = np.power(y_agg - y_test, 2)
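To express the error in physical units, the scaled predictions can be mapped back to MWh before computing an RMSE; a minimal sketch reusing the fitted y_scaler and the mean_squared_error import from above:

# Back-transform to MWh, then compute the RMSE of the aggregated model.
y_test_mwh = y_scaler.inverse_transform(y_test.reshape(-1, 1)).ravel()
y_agg_mwh = y_scaler.inverse_transform(y_agg.reshape(-1, 1)).ravel()
rmse_mwh = np.sqrt(mean_squared_error(y_test_mwh, y_agg_mwh))
print('aggregated RMSE: %.0f MWh' % rmse_mwh)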
In [63]:
def split_resize(y, split):
    # Inverse-transform the first `split` values back to MWh for plotting.
    return y_scaler.inverse_transform(y[:split])
In [64]:
split = 50
t = np.arange(split)
fig, ax = plt.subplots(figsize=(8, 10))
ax.scatter(split_resize(y_test, split),   t, c='orange', marker='o', s=100, alpha=1,   label='true')
ax.scatter(split_resize(y_agg, split),    t, c='cyan',   marker='o', s=100, alpha=1,   label='aggregated')
ax.scatter(split_resize(y_svr, split),    t, c='green',  marker='^', s=30,  alpha=0.8, label='SVR')
ax.scatter(split_resize(y_forest, split), t, c='red',    marker='^', s=30,  alpha=0.8, label='forest')
ax.scatter(split_resize(y_poly, split),   t, c='blue',   marker='^', s=30,  alpha=0.8, label='poly')
ax.scatter(split_resize(y_knn, split),    t, c='pink',   marker='^', s=30,  alpha=0.8, label='knn')
# ax.scatter(split_resize(y_dnn, split),    t, c='purple', marker='^', s=30,  alpha=0.8, label='dnn')
ax.set_yticklabels([])
ax.set_xlabel('Consumption (MWh)')
ax.set_title('Precision of ML predictions')
fig.legend(loc=(0.8, 0.48))
plt.show()
In [109]:
fig, ax = plt.subplots(figsize=(12, 6))
sns.kdeplot(error_svr, ax=ax, label='SVR')
sns.kdeplot(error_forest, ax=ax, label='forest')
sns.kdeplot(error_poly, ax=ax, label='poly')
sns.kdeplot(error_knn, ax=ax, label='KNN')
sns.kdeplot(error_agg, ax=ax, label='Agg')
ax.set_xlim([0, 0.2])
ax.set_xlabel('Error')
ax.set_yticks([])
ax.set_title('KDE distribution of squared error')
plt.show()
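A numeric summary complements the KDE plot; the mean squared error per model (in scaled units) can be printed as follows:

for name, err in [('SVR', error_svr), ('forest', error_forest),
                  ('poly', error_poly), ('KNN', error_knn), ('agg', error_agg)]:
    print('%s: MSE = %.4f' % (name, err.mean()))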