In this post, we use a generalized additive model to forecast store sales, using contributing factors as variables to help improve the accuracy of demand predictions.

In this exercise we will use parallel computing to fit a Prophet model for each product in the data and produce forecasts to submit to Kaggle for evaluation.
# Import libraries
import pandas as pd
import time
from multiprocessing import Pool, cpu_count
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline
import zipfile
import itertools
from prophet import Prophet
import os
import sys
import logging
import random
# List every file under the Kaggle competition input directory
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))
/kaggle/input/store-sales-time-series-forecasting/oil.csv
/kaggle/input/store-sales-time-series-forecasting/sample_submission.csv
/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv
/kaggle/input/store-sales-time-series-forecasting/stores.csv
/kaggle/input/store-sales-time-series-forecasting/train.csv
/kaggle/input/store-sales-time-series-forecasting/test.csv
/kaggle/input/store-sales-time-series-forecasting/transactions.csv
# Load data
train = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/train.csv')
test = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/test.csv')
# Rename to Prophet's expected column names ('ds' for the date, 'holiday' for the label)
holidays = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv').rename({'date': 'ds', 'description': 'holiday'}, axis = 1)
# Oil prices have gaps; back-fill them.
# .bfill() replaces fillna(method='bfill'), which is deprecated in pandas 2.x.
oil = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/oil.csv').rename({'date': 'ds', 'dcoilwtico': 'oil'}, axis = 1).bfill()
# Inspect data: cardinalities and date coverage of the train/test split
n_families = train['family'].nunique()
n_stores = train['store_nbr'].nunique()
train_start, train_end = min(train['date']), max(train['date'])
test_start, test_end = min(test['date']), max(test['date'])
print(f"Unique Item Families: {n_families}")
print(f"# Stores: {n_stores}")
print(f"Dataset start date: {train_start}")
print(f"Dataset end date: {train_end}")
print(f"Test set start date: {test_start}")
print(f"Test set end date: {test_end}")
# Calendar span vs. distinct dates actually present (the two differ, so some days are missing)
interval = pd.date_range(train_start, train_end, freq = 'd')
print(f"Num Days: {len(interval)}")
print(f"Train Shape: {train['date'].nunique()}")
Unique Item Families: 33
# Stores: 54
Dataset start date: 2013-01-01
Dataset end date: 2017-08-15
Test set start date: 2017-08-16
Test set end date: 2017-08-31
Num Days: 1688
Train Shape: 1684
# Preview the renamed, back-filled oil-price frame
oil.head()
| ds | oil | |
|---|---|---|
| 0 | 2013-01-01 | 93.14 |
| 1 | 2013-01-02 | 93.14 |
| 2 | 2013-01-03 | 92.97 |
| 3 | 2013-01-04 | 93.12 |
| 4 | 2013-01-07 | 93.20 |
# Fix dates: parse the raw 'date' strings into datetime 'ds' columns
for frame in (train, test):
    frame['ds'] = pd.to_datetime(frame['date'])
oil['ds'] = pd.to_datetime(oil['ds'])
# Create a master label ('<store_nbr>_<family>') identifying each series for training
train['item'] = train['store_nbr'].astype(str).str.cat(train['family'], sep = '_')
test['item'] = test['store_nbr'].astype(str).str.cat(test['family'], sep = '_')
# Merge oil prices onto both sets and drop the now-redundant raw date column
train1 = train.merge(oil, how = 'left', on = 'ds').drop('date', axis = 1)
test1 = test.merge(oil, how = 'left', on = 'ds').drop('date', axis = 1)
# Row/column count of the raw test frame
test.shape
(28512, 7)
# Same row count as `test` — the left merge did not duplicate rows
test1.shape
(28512, 7)
# Preview the merged test frame (oil column attached)
test1.head()
| id | store_nbr | family | onpromotion | ds | item | oil | |
|---|---|---|---|---|---|---|---|
| 0 | 3000888 | 1 | AUTOMOTIVE | 0 | 2017-08-16 | 1_AUTOMOTIVE | 46.8 |
| 1 | 3000889 | 1 | BABY CARE | 0 | 2017-08-16 | 1_BABY CARE | 46.8 |
| 2 | 3000890 | 1 | BEAUTY | 2 | 2017-08-16 | 1_BEAUTY | 46.8 |
| 3 | 3000891 | 1 | BEVERAGES | 20 | 2017-08-16 | 1_BEVERAGES | 46.8 |
| 4 | 3000892 | 1 | BOOKS | 0 | 2017-08-16 | 1_BOOKS | 46.8 |
# Preview the merged training frame (oil column attached)
train1.head()
| id | store_nbr | family | sales | onpromotion | ds | item | oil | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | AUTOMOTIVE | 0.0 | 0 | 2013-01-01 | 1_AUTOMOTIVE | 93.14 |
| 1 | 1 | 1 | BABY CARE | 0.0 | 0 | 2013-01-01 | 1_BABY CARE | 93.14 |
| 2 | 2 | 1 | BEAUTY | 0.0 | 0 | 2013-01-01 | 1_BEAUTY | 93.14 |
| 3 | 3 | 1 | BEVERAGES | 0.0 | 0 | 2013-01-01 | 1_BEVERAGES | 93.14 |
| 4 | 4 | 1 | BOOKS | 0.0 | 0 | 2013-01-01 | 1_BOOKS | 93.14 |
# Create an index for each product: pivot the long frames to wide,
# one column per '<store>_<family>' series and one row per date.
df = train1.pivot(index = 'ds', columns = 'item', values = 'sales').reset_index() #.rename({'date': 'ds'}, axis = 1)
promos = train1.pivot(index = 'ds', columns = 'item', values = 'onpromotion').reset_index() #.rename({'date': 'ds'}, axis = 1)
test_promos = test1.pivot(index = 'ds', columns = 'item', values = 'onpromotion').reset_index() #.rename({'date': 'ds'}, axis = 1)
# Instantiate processing power for parallelism (one worker per CPU core)
num_cpus = cpu_count()
# Get an index of all products (every column except the date)
items = df.drop('ds', axis = 1).columns
# Submit from the notebook (run after submission.csv has been written):
#!kaggle competitions submit -c store-sales-time-series-forecasting -f submission.csv -m "prophet_forecasting_at_scale"
# Sanity check: confirm no missing values in the test frame
test.isnull().sum()
id 0
date 0
store_nbr 0
family 0
onpromotion 0
ds 0
item 0
dtype: int64
# Prototype a single Prophet model: holiday effects plus two extra regressors
model = Prophet(holidays = holidays[['ds', 'holiday']])
for regressor in ('onpromotion', 'oil'):
    model.add_regressor(regressor)
<prophet.forecaster.Prophet at 0x7c37f7ace770>
# Compile data: single-series prototype using the first product only
trainingdata = df[['ds', items[0]]].rename({items[0]: 'y'}, axis = 1)
trainingdata['y'] = trainingdata['y'].astype(float)
# Convert dates
trainingdata['ds'] = pd.to_datetime(trainingdata['ds'])
promos['ds'] = pd.to_datetime(promos['ds'])
# Add regressors; join explicitly on 'ds' instead of relying on column-name inference
trainingdata = trainingdata.merge(promos[['ds', items[0]]], on = 'ds').rename({items[0]: 'onpromotion'}, axis = 1).merge(oil, how = 'left', on = 'ds')
# .bfill() replaces the deprecated fillna(method='bfill')
trainingdata['oil'] = trainingdata['oil'].bfill()
trainingdata.head()
| ds | y | onpromotion | oil | |
|---|---|---|---|---|
| 0 | 2013-01-01 | 0.0 | 0 | 93.14 |
| 1 | 2013-01-02 | 3.0 | 0 | 93.14 |
| 2 | 2013-01-03 | 2.0 | 0 | 92.97 |
| 3 | 2013-01-04 | 2.0 | 0 | 93.12 |
| 4 | 2013-01-05 | 0.0 | 0 | 93.20 |
# Fit the prototype model (cmdstanpy logs the chain progress below)
model.fit(trainingdata)
14:13:49 - cmdstanpy - INFO - Chain [1] start processing
14:13:50 - cmdstanpy - INFO - Chain [1] done processing
<prophet.forecaster.Prophet at 0x7c37f7ace770>
# Build the future frame and attach regressors for the forecast horizon.
# make_future_dataframe includes all history, but the inner merge with the
# test-only promos trims it down to just the 16 forecast days.
fut = model.make_future_dataframe(periods = 16)
merge_content = test_promos[['ds', items[0]]].rename({items[0]: 'onpromotion'}, axis = 1).merge(oil, how = 'left', on = 'ds')
fut = fut.merge(merge_content, on = 'ds')
# .bfill() replaces the deprecated fillna(method='bfill')
fut['oil'] = fut['oil'].bfill()
fut
| ds | onpromotion | oil | |
|---|---|---|---|
| 0 | 2017-08-16 | 0 | 46.80 |
| 1 | 2017-08-17 | 0 | 47.07 |
| 2 | 2017-08-18 | 0 | 48.59 |
| 3 | 2017-08-19 | 0 | 47.39 |
| 4 | 2017-08-20 | 0 | 47.39 |
| 5 | 2017-08-21 | 0 | 47.39 |
| 6 | 2017-08-22 | 0 | 47.65 |
| 7 | 2017-08-23 | 0 | 48.45 |
| 8 | 2017-08-24 | 0 | 47.24 |
| 9 | 2017-08-25 | 0 | 47.65 |
| 10 | 2017-08-26 | 0 | 46.40 |
| 11 | 2017-08-27 | 0 | 46.40 |
| 12 | 2017-08-28 | 0 | 46.40 |
| 13 | 2017-08-29 | 0 | 46.46 |
| 14 | 2017-08-30 | 2 | 45.96 |
| 15 | 2017-08-31 | 0 | 47.26 |
# Ensure datetime dtype before predicting
fut['ds'] = pd.to_datetime(fut['ds'])
preds = model.predict(fut)
# Inspect the regressor effect columns alongside the point forecast
# (note: 'onpromotion'/'oil' here are fitted effects, not the raw inputs)
preds[['ds', 'onpromotion', 'oil', 'yhat']]
| ds | onpromotion | oil | yhat | |
|---|---|---|---|---|
| 0 | 2017-08-16 | 0.000000 | -0.095392 | 1.749961 |
| 1 | 2017-08-17 | 0.000000 | -0.094169 | 1.512343 |
| 2 | 2017-08-18 | 0.000000 | -0.087285 | 1.776728 |
| 3 | 2017-08-19 | 0.000000 | -0.092720 | 2.870283 |
| 4 | 2017-08-20 | 0.000000 | -0.092720 | 3.268697 |
| 5 | 2017-08-21 | 0.000000 | -0.092720 | 2.059172 |
| 6 | 2017-08-22 | 0.000000 | -0.091543 | 2.067662 |
| 7 | 2017-08-23 | 0.000000 | -0.087919 | 2.086430 |
| 8 | 2017-08-24 | 0.000000 | -0.093400 | 0.411933 |
| 9 | 2017-08-25 | 0.000000 | -0.091543 | 2.089462 |
| 10 | 2017-08-26 | 0.000000 | -0.097204 | 3.169663 |
| 11 | 2017-08-27 | 0.000000 | -0.097204 | 3.550008 |
| 12 | 2017-08-28 | 0.000000 | -0.097204 | 2.317490 |
| 13 | 2017-08-29 | 0.000000 | -0.096932 | 2.297301 |
| 14 | 2017-08-30 | 7.158534 | -0.099197 | 9.436443 |
| 15 | 2017-08-31 | 0.000000 | -0.093309 | 2.004886 |
# Collect the test rows for the prototype item so forecasts can be joined on
out = test[test['item'] == items[0]].drop('item', axis = 1)
out['ds'] = pd.to_datetime(out['ds'])
out.head()
| id | date | store_nbr | family | onpromotion | ds | |
|---|---|---|---|---|---|---|
| 33 | 3000921 | 2017-08-16 | 10 | AUTOMOTIVE | 0 | 2017-08-16 |
| 1815 | 3002703 | 2017-08-17 | 10 | AUTOMOTIVE | 0 | 2017-08-17 |
| 3597 | 3004485 | 2017-08-18 | 10 | AUTOMOTIVE | 0 | 2017-08-18 |
| 5379 | 3006267 | 2017-08-19 | 10 | AUTOMOTIVE | 0 | 2017-08-19 |
| 7161 | 3008049 | 2017-08-20 | 10 | AUTOMOTIVE | 0 | 2017-08-20 |
out.tail()
| id | date | store_nbr | family | onpromotion | ds | |
|---|---|---|---|---|---|---|
| 19635 | 3020523 | 2017-08-27 | 10 | AUTOMOTIVE | 0 | 2017-08-27 |
| 21417 | 3022305 | 2017-08-28 | 10 | AUTOMOTIVE | 0 | 2017-08-28 |
| 23199 | 3024087 | 2017-08-29 | 10 | AUTOMOTIVE | 0 | 2017-08-29 |
| 24981 | 3025869 | 2017-08-30 | 10 | AUTOMOTIVE | 2 | 2017-08-30 |
| 26763 | 3027651 | 2017-08-31 | 10 | AUTOMOTIVE | 0 | 2017-08-31 |
# Convert dates once up front so every per-item model can merge on 'ds'
df['ds'] = pd.to_datetime(df['ds'])
promos['ds'] = pd.to_datetime(promos['ds'])
test['ds'] = pd.to_datetime(test['ds'])
# Create a function to perform modeling
def prophet_model(item):
    """Fit a Prophet model for one store/family series and forecast the test window.

    Parameters
    ----------
    item : str
        Key of the form '<store_nbr>_<family>' naming a column in the global
        wide frames ``df`` (sales), ``promos`` and ``test_promos`` (promotions).

    Returns
    -------
    pandas.DataFrame
        The test rows for ``item`` with a new 'sales' column holding the
        16-day Prophet forecast ('yhat').
    """
    # Init model: national holidays plus promotion and oil-price regressors
    model = Prophet(holidays = holidays[['ds', 'holiday']])
    model.add_regressor('onpromotion')
    model.add_regressor('oil')
    # Compile training data for this item
    trainingdata = df[['ds', item]].rename({item: 'y'}, axis = 1)
    trainingdata['y'] = trainingdata['y'].astype(float)
    # Add regressors; join explicitly on 'ds' instead of relying on key inference
    trainingdata = trainingdata.merge(promos[['ds', item]], how = 'left', on = 'ds').rename({item: 'onpromotion'}, axis = 1).merge(oil, how = 'left', on = 'ds')
    trainingdata['onpromotion'] = trainingdata['onpromotion'].fillna(0)
    # .bfill() replaces the deprecated fillna(method='bfill')
    trainingdata['oil'] = trainingdata['oil'].bfill()
    # Train
    model.fit(trainingdata)
    # Init predictions: the inner merge with test promos trims the future
    # frame (which includes history) down to just the 16 test dates
    fut = model.make_future_dataframe(periods = 16)
    merge_content = test_promos[['ds', item]].rename({item: 'onpromotion'}, axis = 1).merge(oil, how = 'left', on = 'ds')
    fut = fut.merge(merge_content, on = 'ds')
    fut['oil'] = fut['oil'].bfill()
    # Model
    preds = model.predict(fut)[['ds', 'yhat']]
    preds['ds'] = pd.to_datetime(preds['ds'])
    # Output: attach forecasts to this item's test rows
    out = test[test['item'] == item].drop('item', axis = 1)
    return out.merge(preds, how = 'left', on = 'ds').rename({'yhat': 'sales'}, axis = 1)
# Pick one series at random to eyeball
random.choice(items)
'34_LADIESWEAR'
# Smoke-test the pipeline function on a random series
prophet_model(random.choice(items))
14:13:51 - cmdstanpy - INFO - Chain [1] start processing
14:13:51 - cmdstanpy - INFO - Chain [1] done processing
| id | date | store_nbr | family | onpromotion | ds | sales | |
|---|---|---|---|---|---|---|---|
| 0 | 3001581 | 2017-08-16 | 29 | AUTOMOTIVE | 0 | 2017-08-16 | 6.083554 |
| 1 | 3003363 | 2017-08-17 | 29 | AUTOMOTIVE | 0 | 2017-08-17 | 6.022175 |
| 2 | 3005145 | 2017-08-18 | 29 | AUTOMOTIVE | 0 | 2017-08-18 | 6.568839 |
| 3 | 3006927 | 2017-08-19 | 29 | AUTOMOTIVE | 0 | 2017-08-19 | 7.950376 |
| 4 | 3008709 | 2017-08-20 | 29 | AUTOMOTIVE | 0 | 2017-08-20 | 8.325301 |
| 5 | 3010491 | 2017-08-21 | 29 | AUTOMOTIVE | 0 | 2017-08-21 | 6.421023 |
| 6 | 3012273 | 2017-08-22 | 29 | AUTOMOTIVE | 0 | 2017-08-22 | 6.901322 |
| 7 | 3014055 | 2017-08-23 | 29 | AUTOMOTIVE | 0 | 2017-08-23 | 6.409948 |
| 8 | 3015837 | 2017-08-24 | 29 | AUTOMOTIVE | 0 | 2017-08-24 | 5.036301 |
| 9 | 3017619 | 2017-08-25 | 29 | AUTOMOTIVE | 0 | 2017-08-25 | 6.838067 |
| 10 | 3019401 | 2017-08-26 | 29 | AUTOMOTIVE | 0 | 2017-08-26 | 8.204809 |
| 11 | 3021183 | 2017-08-27 | 29 | AUTOMOTIVE | 0 | 2017-08-27 | 8.558502 |
| 12 | 3022965 | 2017-08-28 | 29 | AUTOMOTIVE | 0 | 2017-08-28 | 6.625649 |
| 13 | 3024747 | 2017-08-29 | 29 | AUTOMOTIVE | 0 | 2017-08-29 | 7.066330 |
| 14 | 3026529 | 2017-08-30 | 29 | AUTOMOTIVE | 0 | 2017-08-30 | 6.506890 |
| 15 | 3028311 | 2017-08-31 | 29 | AUTOMOTIVE | 0 | 2017-08-31 | 6.418256 |
# Forecast across all store/family series in parallel.
# NOTE(review): this section is commented out; it must be enabled for
# `res`/`out` below to hold the full submission. Elapsed time should be
# end - start (the original comment printed a negative duration).
#start = time.time()
# Disable cmdstanpy fit logs
#logging.getLogger("cmdstanpy").disabled = True
# Parallelization: one Prophet fit per worker across all CPU cores
#p = Pool(num_cpus)
#result = p.map(prophet_model, items)
#end = time.time()
#print(f'Elapsed Modeling Time: {round((end - start) / 60, 2)} minutes..')
# Compile output: concatenate per-item frames and keep the submission columns
#res = pd.concat([i for i in result])
#out = res[['id', 'sales']]
#out.shape
test.shape
(28512, 7)
# Write file
# NOTE(review): as written, `out` was last assigned by the single-item demo
# above (it has no 'sales' column) because the parallel section is commented
# out — run that section first so this writes the full submission. Verify.
out.to_csv('submission.csv', index = False)
# Visualize outputs for one randomly sampled series
sample = random.choice(items)
sample
'29_PET SUPPLIES'
# Init model: same setup as prophet_model, re-run inline so that the full
# prediction frame and training data stay available for the plots below
model = Prophet(holidays = holidays[['ds', 'holiday']])
model.add_regressor('onpromotion')
model.add_regressor('oil')
# Compile data for the sampled series
trainingdata = df[['ds', sample]].rename({sample: 'y'}, axis = 1)
trainingdata['y'] = trainingdata['y'].astype(float)
# Add regressors; join explicitly on 'ds'
trainingdata = trainingdata.merge(promos[['ds', sample]], how = 'left', on = 'ds').rename({sample: 'onpromotion'}, axis = 1).merge(oil, how = 'left', on = 'ds')
trainingdata['onpromotion'] = trainingdata['onpromotion'].fillna(0)
# .bfill() replaces the deprecated fillna(method='bfill')
trainingdata['oil'] = trainingdata['oil'].bfill()
# Train
model.fit(trainingdata)
# Init predictions: inner merge trims the future frame to the 16 test dates
fut = model.make_future_dataframe(periods = 16)
merge_content = test_promos[['ds', sample]].rename({sample: 'onpromotion'}, axis = 1).merge(oil, how = 'left', on = 'ds')
fut = fut.merge(merge_content, on = 'ds')
fut['oil'] = fut['oil'].bfill()
# Model — keep every prediction column so components can be plotted
preds = model.predict(fut) #[['ds', 'yhat']]
preds['ds'] = pd.to_datetime(preds['ds'])
# Output: test rows for this series with forecasts attached
out = test[test['item'] == sample].drop('item', axis = 1)
out = out.merge(preds, how = 'left', on = 'ds').rename({'yhat': 'sales'}, axis = 1)
14:13:53 - cmdstanpy - INFO - Chain [1] start processing
14:13:53 - cmdstanpy - INFO - Chain [1] done processing
print(f"Examining Outputs for {sample}")
# Plot the fitted series with forecast and uncertainty interval
model.plot(preds)
Examining Outputs for 29_PET SUPPLIES


# Decompose the forecast into trend, holiday, weekly and yearly components
model.plot_components(preds)


# Header for the scatter plot below
print('Showing Time Series for Model: ')
Showing Time Series for Model:
# Preview the merged output — note it carries every Prophet component column
out.head()
| id | date | store_nbr | family | onpromotion_x | ds | trend | yhat_lower | yhat_upper | trend_lower | ... | weekly | weekly_lower | weekly_upper | yearly | yearly_lower | yearly_upper | multiplicative_terms | multiplicative_terms_lower | multiplicative_terms_upper | sales | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3001607 | 2017-08-16 | 29 | PET SUPPLIES | 0 | 2017-08-16 | 7.514397 | 4.404190 | 8.789459 | 7.514397 | ... | -0.199900 | -0.199900 | -0.199900 | -0.229431 | -0.229431 | -0.229431 | 0.0 | 0.0 | 0.0 | 6.625475 |
| 1 | 3003389 | 2017-08-17 | 29 | PET SUPPLIES | 0 | 2017-08-17 | 7.520778 | 4.404179 | 8.655474 | 7.520778 | ... | -0.343361 | -0.343361 | -0.343361 | -0.181378 | -0.181378 | -0.181378 | 0.0 | 0.0 | 0.0 | 6.541952 |
| 2 | 3005171 | 2017-08-18 | 29 | PET SUPPLIES | 0 | 2017-08-18 | 7.527158 | 4.113727 | 8.530878 | 7.527158 | ... | -0.558148 | -0.558148 | -0.558148 | -0.131434 | -0.131434 | -0.131434 | 0.0 | 0.0 | 0.0 | 6.414476 |
| 3 | 3006953 | 2017-08-19 | 29 | PET SUPPLIES | 0 | 2017-08-19 | 7.533539 | 5.316851 | 9.693612 | 7.533539 | ... | 0.510069 | 0.510069 | 0.510069 | -0.080453 | -0.080453 | -0.080453 | 0.0 | 0.0 | 0.0 | 7.515592 |
| 4 | 3008735 | 2017-08-20 | 29 | PET SUPPLIES | 0 | 2017-08-20 | 7.539920 | 5.291586 | 9.929054 | 7.539920 | ... | 0.736900 | 0.736900 | 0.736900 | -0.029338 | -0.029338 | -0.029338 | 0.0 | 0.0 | 0.0 | 7.799919 |
5 rows × 345 columns
# 16 forecast days; column count ballooned because all prediction columns were kept
out.shape
(16, 345)
# Overlay the training history (blue) and the 16-day forecast (red)
fig, ax = plt.subplots()
ax.scatter(trainingdata['ds'], trainingdata['y'], s = .1, color = 'dodgerblue')
ax.scatter(out['ds'], out['sales'], s = .2, color = 'red')
ax.set_ylabel('Count of Item')
ax.set_xlabel('Date')
Text(0.5, 0, 'Date')
