In this post, we use a generalized additive model to forecast store sales using contributing factors as variables to help improve the accuracy of demand predictions.

Forecasting at Scale: Time-Series with Prophet

In this exercise we will use parallel computing to fit Prophet models for each product in the data and conduct forecasting, then submit the results to Kaggle for evaluation.

# Import libraries
import pandas as pd
import time
from multiprocessing import Pool, cpu_count
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline
import zipfile
import itertools
from prophet import Prophet
import os
import sys
import logging
import random 

# Walk the Kaggle input directory and print every file path found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))
/kaggle/input/store-sales-time-series-forecasting/oil.csv
/kaggle/input/store-sales-time-series-forecasting/sample_submission.csv
/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv
/kaggle/input/store-sales-time-series-forecasting/stores.csv
/kaggle/input/store-sales-time-series-forecasting/train.csv
/kaggle/input/store-sales-time-series-forecasting/test.csv
/kaggle/input/store-sales-time-series-forecasting/transactions.csv
# Load data from the Kaggle competition dataset.
DATA_DIR = '/kaggle/input/store-sales-time-series-forecasting'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

# Rename columns up-front to Prophet's expected names: 'ds' for the date
# column, 'holiday' for the holiday label, and a short 'oil' regressor name.
holidays = pd.read_csv(f'{DATA_DIR}/holidays_events.csv').rename(
    {'date': 'ds', 'description': 'holiday'}, axis = 1)

# Oil prices contain missing values; back-fill so every row has a price.
# .bfill() replaces fillna(method='bfill'), which is deprecated and removed
# in pandas 2.x.
oil = pd.read_csv(f'{DATA_DIR}/oil.csv').rename(
    {'date': 'ds', 'dcoilwtico': 'oil'}, axis = 1).bfill()
# Inspect data: coverage statistics for the train and test sets.
n_families = train['family'].nunique()
n_stores = train['store_nbr'].nunique()
train_start, train_end = min(train['date']), max(train['date'])
test_start, test_end = min(test['date']), max(test['date'])

print(f"Unique Item Families:    {n_families}")
print(f"# Stores:                {n_stores}")
print(f"Dataset start date:      {train_start}")
print(f"Dataset end date:        {train_end}")
print(f"Test set start date:     {test_start}")
print(f"Test set end date:       {test_end}")

# Calendar days spanned vs. distinct dates present — a difference means
# some days are missing from the training data.
interval = pd.date_range(train_start, train_end, freq = 'd')

print(f"Num Days:               {len(interval)}")
print(f"Train Shape:            {train['date'].nunique()}")
Unique Item Families:    33
# Stores:                54
Dataset start date:      2013-01-01
Dataset end date:        2017-08-15
Test set start date:     2017-08-16
Test set end date:       2017-08-31
Num Days:               1688
Train Shape:            1684
oil.head()
dsoil
02013-01-0193.14
12013-01-0293.14
22013-01-0392.97
32013-01-0493.12
42013-01-0793.20
# Parse date strings into proper datetimes ('ds' is Prophet's date column).
for frame in (train, test):
    frame['ds'] = pd.to_datetime(frame['date'])
oil['ds'] = pd.to_datetime(oil['ds'])

# Master label identifying each store/family series, e.g. '1_AUTOMOTIVE'.
for frame in (train, test):
    frame['item'] = frame['store_nbr'].astype(str) + '_' + frame['family']

# Attach the oil-price regressor and drop the now-redundant string date column.
test1 = test.merge(oil, how = 'left', on = 'ds').drop('date', axis = 1)
train1 = train.merge(oil, how = 'left', on = 'ds').drop('date', axis = 1)
test.shape
(28512, 7)
test1.shape
(28512, 7)
test1.head()
idstore_nbrfamilyonpromotiondsitemoil
030008881AUTOMOTIVE02017-08-161_AUTOMOTIVE46.8
130008891BABY CARE02017-08-161_BABY CARE46.8
230008901BEAUTY22017-08-161_BEAUTY46.8
330008911BEVERAGES202017-08-161_BEVERAGES46.8
430008921BOOKS02017-08-161_BOOKS46.8
train1.head()
idstore_nbrfamilysalesonpromotiondsitemoil
001AUTOMOTIVE0.002013-01-011_AUTOMOTIVE93.14
111BABY CARE0.002013-01-011_BABY CARE93.14
221BEAUTY0.002013-01-011_BEAUTY93.14
331BEVERAGES0.002013-01-011_BEVERAGES93.14
441BOOKS0.002013-01-011_BOOKS93.14
def _wide(source, values):
    # Pivot long data to wide format: one column per store/family series,
    # indexed by date.
    return source.pivot(index = 'ds', columns = 'item', values = values).reset_index()

df = _wide(train1, 'sales')
promos = _wide(train1, 'onpromotion')
test_promos = _wide(test1, 'onpromotion')

# Worker count for the multiprocessing pool.
num_cpus = cpu_count()

# Every series key (all columns except the date).
items = df.drop('ds', axis = 1).columns
#!kaggle competitions submit -c store-sales-time-series-forecasting -f submission.csv -m "prophet_forecasting_at_scale"
test.isnull().sum()
id             0
date           0
store_nbr      0
family         0
onpromotion    0
ds             0
item           0
dtype: int64
# Baseline Prophet model: holiday effects plus two extra regressors
# (promotion count and the daily oil price).
model = Prophet(holidays = holidays[['ds', 'holiday']])
model.add_regressor('onpromotion') #, standardize = False)
model.add_regressor('oil' )#, standardize = False)
<prophet.forecaster.Prophet at 0x7c37f7ace770>
# Compile data: build the training frame for the first series as a smoke
# test of the pipeline (Prophet expects columns 'ds' and 'y').
trainingdata = df[['ds', items[0]]].rename({items[0]: 'y'}, axis = 1)
trainingdata['y'] = trainingdata['y'].astype(float)

# Convert dates
trainingdata['ds'] = pd.to_datetime(trainingdata['ds'])
promos['ds'] = pd.to_datetime(promos['ds'])

# Add regressors
trainingdata = trainingdata.merge(promos[['ds', items[0]]]).rename({items[0]: 'onpromotion'}, axis = 1).merge(oil, how = 'left', on = 'ds')

# .bfill() replaces fillna(method='bfill'), deprecated/removed in pandas 2.x.
trainingdata['oil'] = trainingdata['oil'].bfill()
trainingdata.head()
dsyonpromotionoil
02013-01-010.0093.14
12013-01-023.0093.14
22013-01-032.0092.97
32013-01-042.0093.12
42013-01-050.0093.20
model.fit(trainingdata)
14:13:49 - cmdstanpy - INFO - Chain [1] start processing
14:13:50 - cmdstanpy - INFO - Chain [1] done processing





<prophet.forecaster.Prophet at 0x7c37f7ace770>
# Build the 16-day future frame; the inner merge with the test-period
# regressors restricts it to the forecast horizon.
fut = model.make_future_dataframe(periods = 16)
merge_content = test_promos[['ds', items[0]]].rename({items[0]: 'onpromotion'}, axis = 1).merge(oil, how = 'left', on = 'ds')
fut = fut.merge(merge_content)
# .bfill() replaces fillna(method='bfill'), deprecated/removed in pandas 2.x.
fut['oil'] = fut['oil'].bfill()
fut
dsonpromotionoil
02017-08-16046.80
12017-08-17047.07
22017-08-18048.59
32017-08-19047.39
42017-08-20047.39
52017-08-21047.39
62017-08-22047.65
72017-08-23048.45
82017-08-24047.24
92017-08-25047.65
102017-08-26046.40
112017-08-27046.40
122017-08-28046.40
132017-08-29046.46
142017-08-30245.96
152017-08-31047.26
# Forecast the test window; 'yhat' is Prophet's point prediction.
fut['ds'] = pd.to_datetime(fut['ds'])
preds = model.predict(fut)
preds[['ds', 'onpromotion', 'oil', 'yhat']]
dsonpromotionoilyhat
02017-08-160.000000-0.0953921.749961
12017-08-170.000000-0.0941691.512343
22017-08-180.000000-0.0872851.776728
32017-08-190.000000-0.0927202.870283
42017-08-200.000000-0.0927203.268697
52017-08-210.000000-0.0927202.059172
62017-08-220.000000-0.0915432.067662
72017-08-230.000000-0.0879192.086430
82017-08-240.000000-0.0934000.411933
92017-08-250.000000-0.0915432.089462
102017-08-260.000000-0.0972043.169663
112017-08-270.000000-0.0972043.550008
122017-08-280.000000-0.0972042.317490
132017-08-290.000000-0.0969322.297301
142017-08-307.158534-0.0991979.436443
152017-08-310.000000-0.0933092.004886
# Test rows for the first series (the helper 'item' key is no longer needed).
out = test[test['item'] == items[0]].drop('item', axis = 1)
out['ds'] = pd.to_datetime(out['ds'])
out.head()
iddatestore_nbrfamilyonpromotionds
3330009212017-08-1610AUTOMOTIVE02017-08-16
181530027032017-08-1710AUTOMOTIVE02017-08-17
359730044852017-08-1810AUTOMOTIVE02017-08-18
537930062672017-08-1910AUTOMOTIVE02017-08-19
716130080492017-08-2010AUTOMOTIVE02017-08-20
out.tail()
iddatestore_nbrfamilyonpromotionds
1963530205232017-08-2710AUTOMOTIVE02017-08-27
2141730223052017-08-2810AUTOMOTIVE02017-08-28
2319930240872017-08-2910AUTOMOTIVE02017-08-29
2498130258692017-08-3010AUTOMOTIVE22017-08-30
2676330276512017-08-3110AUTOMOTIVE02017-08-31
# Normalize 'ds' to datetime across every frame used by the model function.
for frame in (df, promos, test):
    frame['ds'] = pd.to_datetime(frame['ds'])
# Create a function to perform modeling
def prophet_model(item):
    """Fit a Prophet model for one store/family series and forecast the test window.

    Parameters
    ----------
    item : str
        Series key of the form '<store_nbr>_<family>' — a column of the
        global pivoted frames ``df``, ``promos`` and ``test_promos``.

    Returns
    -------
    pandas.DataFrame
        The test rows for ``item`` with a 'sales' column holding Prophet's
        point forecast (``yhat``).

    Notes
    -----
    Reads module-level globals: ``df`` (sales), ``promos`` / ``test_promos``
    (promotion counts), ``oil`` (oil prices), ``holidays`` and ``test``.
    """
    # One model per series, with holiday effects and two extra regressors.
    model = Prophet(holidays = holidays[['ds', 'holiday']])
    model.add_regressor('onpromotion')
    model.add_regressor('oil')

    # Assemble the training frame in Prophet's expected ds/y layout.
    trainingdata = df[['ds', item]].rename({item: 'y'}, axis = 1)
    trainingdata['y'] = trainingdata['y'].astype(float)

    # Attach regressors: promotion counts missing after the left merge
    # default to 0; oil gaps are back-filled. .bfill() replaces
    # fillna(method='bfill'), which is deprecated/removed in pandas 2.x.
    trainingdata = trainingdata.merge(promos[['ds', item]], how = 'left').rename({item: 'onpromotion'}, axis = 1).merge(oil, how = 'left', on = 'ds')
    trainingdata['onpromotion'] = trainingdata['onpromotion'].fillna(0)
    trainingdata['oil'] = trainingdata['oil'].bfill()

    # Train
    model.fit(trainingdata)

    # Build the 16-day future frame; the inner merge with the test-period
    # regressors restricts it to the forecast horizon.
    fut = model.make_future_dataframe(periods = 16)
    merge_content = test_promos[['ds', item]].rename({item: 'onpromotion'}, axis = 1).merge(oil, how = 'left', on = 'ds')
    fut = fut.merge(merge_content)
    fut['oil'] = fut['oil'].bfill()

    # Forecast; keep only the date and the point prediction.
    preds = model.predict(fut)[['ds', 'yhat']]
    preds['ds'] = pd.to_datetime(preds['ds'])

    # Join forecasts back onto the test rows for this series.
    out = test[test['item'] == item].drop('item', axis = 1)

    return out.merge(preds, how = 'left', on = 'ds').rename({'yhat': 'sales'}, axis = 1)
random.choice(items)
'34_LADIESWEAR'
prophet_model(random.choice(items))
14:13:51 - cmdstanpy - INFO - Chain [1] start processing
14:13:51 - cmdstanpy - INFO - Chain [1] done processing
iddatestore_nbrfamilyonpromotiondssales
030015812017-08-1629AUTOMOTIVE02017-08-166.083554
130033632017-08-1729AUTOMOTIVE02017-08-176.022175
230051452017-08-1829AUTOMOTIVE02017-08-186.568839
330069272017-08-1929AUTOMOTIVE02017-08-197.950376
430087092017-08-2029AUTOMOTIVE02017-08-208.325301
530104912017-08-2129AUTOMOTIVE02017-08-216.421023
630122732017-08-2229AUTOMOTIVE02017-08-226.901322
730140552017-08-2329AUTOMOTIVE02017-08-236.409948
830158372017-08-2429AUTOMOTIVE02017-08-245.036301
930176192017-08-2529AUTOMOTIVE02017-08-256.838067
1030194012017-08-2629AUTOMOTIVE02017-08-268.204809
1130211832017-08-2729AUTOMOTIVE02017-08-278.558502
1230229652017-08-2829AUTOMOTIVE02017-08-286.625649
1330247472017-08-2929AUTOMOTIVE02017-08-297.066330
1430265292017-08-3029AUTOMOTIVE02017-08-306.506890
1530283112017-08-3129AUTOMOTIVE02017-08-316.418256
# Forecast all series in parallel (left commented out for the demo run).
#start = time.time()

# Disable outputs
#logging.getLogger("cmdstanpy").disabled = True

# Parallelization
#p = Pool(num_cpus)
#result = p.map(prophet_model, items)

#end = time.time()
# Elapsed time is end - start (the original start - end would be negative).
#print(f'Elapsed Modeling Time:  {round((end - start) / 60, 2)} minutes..')

# Compile output
#res = pd.concat([i for i in result])
#out = res[['id', 'sales']]
#out.shape
test.shape
(28512, 7)
# Write file
# NOTE(review): with the parallel section above commented out, `out` here is
# NOT the compiled all-series forecast (`res[['id', 'sales']]`) — it is
# whatever `out` was last assigned in the earlier demo cells. Confirm the
# parallel run has executed before submitting.
out.to_csv('submission.csv', index = False)
# Visualize outputs: pick one series at random to inspect.
sample = random.choice(items)
sample
'29_PET SUPPLIES'
# Init model: holiday effects plus the two extra regressors.
model = Prophet(holidays = holidays[['ds', 'holiday']])
model.add_regressor('onpromotion')
model.add_regressor('oil')

# Compile data in Prophet's expected ds/y layout.
trainingdata = df[['ds', sample]].rename({sample: 'y'}, axis = 1)
trainingdata['y'] = trainingdata['y'].astype(float)

# Add regressors; missing promotions default to 0, oil gaps are back-filled.
# .bfill() replaces fillna(method='bfill'), deprecated/removed in pandas 2.x.
trainingdata = trainingdata.merge(promos[['ds', sample]], how = 'left').rename({sample: 'onpromotion'}, axis = 1).merge(oil, how = 'left', on = 'ds')
trainingdata['onpromotion'] = trainingdata['onpromotion'].fillna(0)
trainingdata['oil'] = trainingdata['oil'].bfill()

# Train
model.fit(trainingdata)

# Init predictions: the inner merge restricts the 16-day future frame to the
# test window (the dates present in test_promos).
fut = model.make_future_dataframe(periods = 16)
merge_content = test_promos[['ds', sample]].rename({sample: 'onpromotion'}, axis = 1).merge(oil, how = 'left', on = 'ds')
fut = fut.merge(merge_content)
fut['oil'] = fut['oil'].bfill()

# Keep the full prediction frame here so the component plots below work.
preds = model.predict(fut)
preds['ds'] = pd.to_datetime(preds['ds'])

# Output: join forecasts onto the test rows for this series.
out = test[test['item'] == sample].drop('item', axis = 1)
out = out.merge(preds, how = 'left', on = 'ds').rename({'yhat': 'sales'}, axis = 1)
14:13:53 - cmdstanpy - INFO - Chain [1] start processing
14:13:53 - cmdstanpy - INFO - Chain [1] done processing
# Plot the forecast (with uncertainty interval) over the full history.
print(f"Examining Outputs for {sample}")
    
model.plot(preds)
Examining Outputs for 29_PET SUPPLIES

png

png

model.plot_components(preds)

png

png

print('Showing Time Series for Model: ')
Showing Time Series for Model: 
out.head()
iddatestore_nbrfamilyonpromotion_xdstrendyhat_loweryhat_uppertrend_lower...weeklyweekly_lowerweekly_upperyearlyyearly_loweryearly_uppermultiplicative_termsmultiplicative_terms_lowermultiplicative_terms_uppersales
030016072017-08-1629PET SUPPLIES02017-08-167.5143974.4041908.7894597.514397...-0.199900-0.199900-0.199900-0.229431-0.229431-0.2294310.00.00.06.625475
130033892017-08-1729PET SUPPLIES02017-08-177.5207784.4041798.6554747.520778...-0.343361-0.343361-0.343361-0.181378-0.181378-0.1813780.00.00.06.541952
230051712017-08-1829PET SUPPLIES02017-08-187.5271584.1137278.5308787.527158...-0.558148-0.558148-0.558148-0.131434-0.131434-0.1314340.00.00.06.414476
330069532017-08-1929PET SUPPLIES02017-08-197.5335395.3168519.6936127.533539...0.5100690.5100690.510069-0.080453-0.080453-0.0804530.00.00.07.515592
430087352017-08-2029PET SUPPLIES02017-08-207.5399205.2915869.9290547.539920...0.7369000.7369000.736900-0.029338-0.029338-0.0293380.00.00.07.799919

5 rows × 345 columns

out.shape
(16, 345)
# Overlay the training history (blue) and the 16-day forecast (red).
fig, ax = plt.subplots()
ax.scatter(trainingdata['ds'], trainingdata['y'], s = .1, color = 'dodgerblue')
ax.scatter(out['ds'], out['sales'], s = .2, color = 'red')
plt.ylabel('Count of Item')
plt.xlabel('Date')
Text(0.5, 0, 'Date')

png