version 1.0 1/4/2018
Wing-Fai Thi
import warnings
warnings.filterwarnings('ignore')
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.externals import joblib # if one wants to save the Phrophet model
# parse the input
def parse(x):
return datetime.strptime(x, '%Y %m %d %H %M') # year month day hour minute
# combine the 4 columns into a Pandas timestamp
# The data are in a space-separated table format
dataset = pd.read_table('CFHT_weather_data/cfht-wx.2016.dat.txt',
delim_whitespace=True, # white space delimiter
header=-1, # no header
date_parser=parse, # name of the function used to parse the data
parse_dates ={'datetime' : [0,1,2,3,4]})
columns=['datetime','wind speed','wind direction', # assign a column header
'temperature','relative humidity','pressure']
dataset.columns = columns
dataset.dtypes
dataset.head()
The data are recorded every minutes, which is a pretty high rate. Here for our prediction purpose we make a 60 minute averaging of the data.
# make a 1 hr rolling average
# The first column is the datatime and we will not roll it
nb_minutes = 60
avg_dataset = pd.DataFrame.copy(dataset)
avg_dataset.iloc[:,1:5]=avg_dataset.iloc[:,1:5].rolling(nb_minutes).mean()
avg_dataset=avg_dataset.iloc[::nb_minutes,:].dropna()
print avg_dataset.shape
# alternative
avg_dataset = pd.DataFrame.copy(dataset) # make first a copy of the original data
avg_dataset = avg_dataset.resample('H',on='datetime').mean() # resample to the hourly-average
print avg_dataset.shape
avg_dataset.reset_index(inplace=True) # move the index as a column inplace
avg_dataset.head()
We will train the model over a few days. One can try models with different training time.
n_train_hours = 5*24 # 1 yr * 24 hrs
horizon = 8
pressure = avg_dataset[['datetime','pressure']]
pressure.columns=['ds','y'] # change the headers for prophet
pressure['y'] = np.log(pressure['y'])
n0 = avg_dataset.shape[0]
n1 = n0 - n_train_hours - horizon
n2 = n0 - horizon
pressure_train = pressure.iloc[n1:n2, :]
pressure_train.shape
We first instantiate the Prophet class. The predicted value is yhat with yhat_lower abd yhat_upper as upper and lower possible values. Prophet uses the same API than scikit-learn.
from fbprophet import Prophet
m = Prophet(yearly_seasonality=False,weekly_seasonality=False)
m.fit(pressure_train);
#future = m.make_future_dataframe(periods=horizon,freq='H')
future = pressure.iloc[n1:,:]
forecast = m.predict(future)
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()
m is a Prophet object and can be plotted with predifined settings by Prophet
# Pressure forecast with a 8hr horizon
m.plot(forecast);
A time series component decomposition is avaialble. We can clearly see the daily pressure pressure variation. The trend has been a decreasing pressure over the last 7 days.
m.plot_components(forecast);
from matplotlib.dates import date2num
t = pressure['ds']
p = pressure['y']
dates = date2num(t)
datesf = date2num(forecast['ds'])
plt.figure(figsize=(12,9))
plt.plot_date(dates[n1:],p[n1:],marker='o',label='data')
plt.plot_date(datesf[-horizon:],forecast['yhat'][-horizon:],
marker='.',linestyle='-',label='prediction')
plt.legend()
plt.xlabel('date-time')
plt.ylabel('log(pressure) (mb)')
plt.show()
The 5-day window for the pressure predictions gives reasonable results.
import math
# compute the mean square error
y_pred = np.array(np.exp(forecast['yhat'][-horizon:]))
y_true = np.array(np.exp(p[-horizon:]))
y_persistent = np.array(np.exp(p[n0-horizon-1:n0-1]))
mse = ((y_pred-y_true)**2).mean()
print('Prediction quality: {:.2f} MSE ({:.2f} RMSE)'.format(mse, math.sqrt(mse)))
A persistent model is just using the previous value as a prediction. In our case, the persistent model can predict for the next hour.
mse_persistent = ((y_persistent-y_true)**2).mean()
print('Persistent model quality: {:.2f} MSE ({:.2f} RMSE)'.format(mse_persistent,
math.sqrt(mse_persistent)))
plt.figure(figsize=(12,9))
plt.scatter(y_true,y_persistent,label='persistent model',s=50)
plt.scatter(y_true,y_pred,label='Prophet model',s=50)
plt.plot([y_true.min(),y_true.max()],[y_true.min(),y_true.max()])
plt.xlabel('true pressure')
plt.ylabel('predicted pressure')
plt.legend()
plt.show()
An eight-hour horizon should be really a maximum before the predictions become really bad.
In this notebook, we have learnt how to:
One can try to predict other meteorological data as well like the temperature.