In this post, we use a generalized additive model to forecast store sales using contributing factors as variables to help improve the accuracy of demand predictions.

Forecasting at Scale: Time-Series with Prophet

In this exercise we will use parallel computing to fit Prophet models for each product in the data and conduct forecasting, then submit the results to Kaggle for evaluation.

# Import libraries
import pandas as pd
import time
from multiprocessing import Pool, cpu_count
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline
import zipfile
import itertools
from prophet import Prophet
import os
import sys
import logging
import random 

# Walk the Kaggle input directory and print every file path found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))
/kaggle/input/store-sales-time-series-forecasting/oil.csv
/kaggle/input/store-sales-time-series-forecasting/sample_submission.csv
/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv
/kaggle/input/store-sales-time-series-forecasting/stores.csv
/kaggle/input/store-sales-time-series-forecasting/train.csv
/kaggle/input/store-sales-time-series-forecasting/test.csv
/kaggle/input/store-sales-time-series-forecasting/transactions.csv
# Load data from the Kaggle competition dataset.
DATA_DIR = '/kaggle/input/store-sales-time-series-forecasting'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

# Rename columns up-front to Prophet's expected names: 'ds' for the date
# column, 'holiday' for the holiday label, and a short 'oil' regressor name.
holidays = pd.read_csv(f'{DATA_DIR}/holidays_events.csv').rename(
    {'date': 'ds', 'description': 'holiday'}, axis = 1)

# Oil prices contain missing values; back-fill so every row has a price.
# .bfill() replaces fillna(method='bfill'), which is deprecated and removed
# in pandas 2.x.
oil = pd.read_csv(f'{DATA_DIR}/oil.csv').rename(
    {'date': 'ds', 'dcoilwtico': 'oil'}, axis = 1).bfill()
# Inspect data: coverage statistics for the train and test sets.
n_families = train['family'].nunique()
n_stores = train['store_nbr'].nunique()
train_start, train_end = min(train['date']), max(train['date'])
test_start, test_end = min(test['date']), max(test['date'])

print(f"Unique Item Families:    {n_families}")
print(f"# Stores:                {n_stores}")
print(f"Dataset start date:      {train_start}")
print(f"Dataset end date:        {train_end}")
print(f"Test set start date:     {test_start}")
print(f"Test set end date:       {test_end}")

# Calendar days spanned vs. distinct dates present — a difference means
# some days are missing from the training data.
interval = pd.date_range(train_start, train_end, freq = 'd')

print(f"Num Days:               {len(interval)}")
print(f"Train Shape:            {train['date'].nunique()}")
Unique Item Families:    33
# Stores:                54
Dataset start date:      2013-01-01
Dataset end date:        2017-08-15
Test set start date:     2017-08-16
Test set end date:       2017-08-31
Num Days:               1688
Train Shape:            1684
oil.head()
dsoil
02013-01-0193.14
12013-01-0293.14
22013-01-0392.97
32013-01-0493.12
42013-01-0793.20
# Parse date strings into proper datetimes ('ds' is Prophet's date column).
for frame in (train, test):
    frame['ds'] = pd.to_datetime(frame['date'])
oil['ds'] = pd.to_datetime(oil['ds'])

# Master label identifying each store/family series, e.g. '1_AUTOMOTIVE'.
for frame in (train, test):
    frame['item'] = frame['store_nbr'].astype(str) + '_' + frame['family']

# Attach the oil-price regressor and drop the now-redundant string date column.
test1 = test.merge(oil, how = 'left', on = 'ds').drop('date', axis = 1)
train1 = train.merge(oil, how = 'left', on = 'ds').drop('date', axis = 1)
test.shape
(28512, 7)
test1.shape
(28512, 7)
test1.head()
idstore_nbrfamilyonpromotiondsitemoil
030008881AUTOMOTIVE02017-08-161_AUTOMOTIVE46.8
130008891BABY CARE02017-08-161_BABY CARE46.8
230008901BEAUTY22017-08-161_BEAUTY46.8
330008911BEVERAGES202017-08-161_BEVERAGES46.8
430008921BOOKS02017-08-161_BOOKS46.8
train1.head()
idstore_nbrfamilysalesonpromotiondsitemoil
001AUTOMOTIVE0.002013-01-011_AUTOMOTIVE93.14
111BABY CARE0.002013-01-011_BABY CARE93.14
221BEAUTY0.002013-01-011_BEAUTY93.14
331BEVERAGES0.002013-01-011_BEVERAGES93.14
441BOOKS0.002013-01-011_BOOKS93.14
def _wide(source, values):
    # Pivot long data to wide format: one column per store/family series,
    # indexed by date.
    return source.pivot(index = 'ds', columns = 'item', values = values).reset_index()

df = _wide(train1, 'sales')
promos = _wide(train1, 'onpromotion')
test_promos = _wide(test1, 'onpromotion')

# Worker count for the multiprocessing pool.
num_cpus = cpu_count()

# Every series key (all columns except the date).
items = df.drop('ds', axis = 1).columns
#!kaggle competitions submit -c store-sales-time-series-forecasting -f submission.csv -m "prophet_forecasting_at_scale"
test.isnull().sum()
id             0
date           0
store_nbr      0
family         0
onpromotion    0
ds             0
item           0
dtype: int64
# Baseline Prophet model: holiday effects plus two extra regressors
# (promotion count and the daily oil price).
model = Prophet(holidays = holidays[['ds', 'holiday']])
model.add_regressor('onpromotion') #, standardize = False)
model.add_regressor('oil' )#, standardize = False)
<prophet.forecaster.Prophet at 0x7c37f7ace770>
# Compile data: build the training frame for the first series as a smoke
# test of the pipeline (Prophet expects columns 'ds' and 'y').
trainingdata = df[['ds', items[0]]].rename({items[0]: 'y'}, axis = 1)
trainingdata['y'] = trainingdata['y'].astype(float)

# Convert dates
trainingdata['ds'] = pd.to_datetime(trainingdata['ds'])
promos['ds'] = pd.to_datetime(promos['ds'])

# Add regressors
trainingdata = trainingdata.merge(promos[['ds', items[0]]]).rename({items[0]: 'onpromotion'}, axis = 1).merge(oil, how = 'left', on = 'ds')

# .bfill() replaces fillna(method='bfill'), deprecated/removed in pandas 2.x.
trainingdata['oil'] = trainingdata['oil'].bfill()
trainingdata.head()
dsyonpromotionoil
02013-01-010.0093.14
12013-01-023.0093.14
22013-01-032.0092.97
32013-01-042.0093.12
42013-01-050.0093.20
model.fit(trainingdata)
14:13:49 - cmdstanpy - INFO - Chain [1] start processing
14:13:50 - cmdstanpy - INFO - Chain [1] done processing





<prophet.forecaster.Prophet at 0x7c37f7ace770>
# Build the 16-day future frame; the inner merge with the test-period
# regressors restricts it to the forecast horizon.
fut = model.make_future_dataframe(periods = 16)
merge_content = test_promos[['ds', items[0]]].rename({items[0]: 'onpromotion'}, axis = 1).merge(oil, how = 'left', on = 'ds')
fut = fut.merge(merge_content)
# .bfill() replaces fillna(method='bfill'), deprecated/removed in pandas 2.x.
fut['oil'] = fut['oil'].bfill()
fut
dsonpromotionoil
02017-08-16046.80
12017-08-17047.07
22017-08-18048.59
32017-08-19047.39
42017-08-20047.39
52017-08-21047.39
62017-08-22047.65
72017-08-23048.45
82017-08-24047.24
92017-08-25047.65
102017-08-26046.40
112017-08-27046.40
122017-08-28046.40
132017-08-29046.46
142017-08-30245.96
152017-08-31047.26
# Forecast the test window; 'yhat' is Prophet's point prediction.
fut['ds'] = pd.to_datetime(fut['ds'])
preds = model.predict(fut)
preds[['ds', 'onpromotion', 'oil', 'yhat']]
dsonpromotionoilyhat
02017-08-160.000000-0.0953921.749961
12017-08-170.000000-0.0941691.512343
22017-08-180.000000-0.0872851.776728
32017-08-190.000000-0.0927202.870283
42017-08-200.000000-0.0927203.268697
52017-08-210.000000-0.0927202.059172
62017-08-220.000000-0.0915432.067662
72017-08-230.000000-0.0879192.086430
82017-08-240.000000-0.0934000.411933
92017-08-250.000000-0.0915432.089462
102017-08-260.000000-0.0972043.169663
112017-08-270.000000-0.0972043.550008
122017-08-280.000000-0.0972042.317490
132017-08-290.000000-0.0969322.297301
142017-08-307.158534-0.0991979.436443
152017-08-310.000000-0.0933092.004886
# Test rows for the first series (the helper 'item' key is no longer needed).
out = test[test['item'] == items[0]].drop('item', axis = 1)
out['ds'] = pd.to_datetime(out['ds'])
out.head()
iddatestore_nbrfamilyonpromotionds
3330009212017-08-1610AUTOMOTIVE02017-08-16
181530027032017-08-1710AUTOMOTIVE02017-08-17
359730044852017-08-1810AUTOMOTIVE02017-08-18
537930062672017-08-1910AUTOMOTIVE02017-08-19
716130080492017-08-2010AUTOMOTIVE02017-08-20
out.tail()
iddatestore_nbrfamilyonpromotionds
1963530205232017-08-2710AUTOMOTIVE02017-08-27
2141730223052017-08-2810AUTOMOTIVE02017-08-28
2319930240872017-08-2910AUTOMOTIVE02017-08-29
2498130258692017-08-3010AUTOMOTIVE22017-08-30
2676330276512017-08-3110AUTOMOTIVE02017-08-31
# Normalize 'ds' to datetime across every frame used by the model function.
for frame in (df, promos, test):
    frame['ds'] = pd.to_datetime(frame['ds'])
# Create a function to perform modeling
def prophet_model(item):
    """Fit a Prophet model for one store/family series and forecast the test window.

    Parameters
    ----------
    item : str
        Series key of the form '<store_nbr>_<family>' — a column of the
        global pivoted frames ``df``, ``promos`` and ``test_promos``.

    Returns
    -------
    pandas.DataFrame
        The test rows for ``item`` with a 'sales' column holding Prophet's
        point forecast (``yhat``).

    Notes
    -----
    Reads module-level globals: ``df`` (sales), ``promos`` / ``test_promos``
    (promotion counts), ``oil`` (oil prices), ``holidays`` and ``test``.
    """
    # One model per series, with holiday effects and two extra regressors.
    model = Prophet(holidays = holidays[['ds', 'holiday']])
    model.add_regressor('onpromotion')
    model.add_regressor('oil')

    # Assemble the training frame in Prophet's expected ds/y layout.
    trainingdata = df[['ds', item]].rename({item: 'y'}, axis = 1)
    trainingdata['y'] = trainingdata['y'].astype(float)

    # Attach regressors: promotion counts missing after the left merge
    # default to 0; oil gaps are back-filled. .bfill() replaces
    # fillna(method='bfill'), which is deprecated/removed in pandas 2.x.
    trainingdata = trainingdata.merge(promos[['ds', item]], how = 'left').rename({item: 'onpromotion'}, axis = 1).merge(oil, how = 'left', on = 'ds')
    trainingdata['onpromotion'] = trainingdata['onpromotion'].fillna(0)
    trainingdata['oil'] = trainingdata['oil'].bfill()

    # Train
    model.fit(trainingdata)

    # Build the 16-day future frame; the inner merge with the test-period
    # regressors restricts it to the forecast horizon.
    fut = model.make_future_dataframe(periods = 16)
    merge_content = test_promos[['ds', item]].rename({item: 'onpromotion'}, axis = 1).merge(oil, how = 'left', on = 'ds')
    fut = fut.merge(merge_content)
    fut['oil'] = fut['oil'].bfill()

    # Forecast; keep only the date and the point prediction.
    preds = model.predict(fut)[['ds', 'yhat']]
    preds['ds'] = pd.to_datetime(preds['ds'])

    # Join forecasts back onto the test rows for this series.
    out = test[test['item'] == item].drop('item', axis = 1)

    return out.merge(preds, how = 'left', on = 'ds').rename({'yhat': 'sales'}, axis = 1)
random.choice(items)
'34_LADIESWEAR'
prophet_model(random.choice(items))
14:13:51 - cmdstanpy - INFO - Chain [1] start processing
14:13:51 - cmdstanpy - INFO - Chain [1] done processing
iddatestore_nbrfamilyonpromotiondssales
030015812017-08-1629AUTOMOTIVE02017-08-166.083554
130033632017-08-1729AUTOMOTIVE02017-08-176.022175
230051452017-08-1829AUTOMOTIVE02017-08-186.568839
330069272017-08-1929AUTOMOTIVE02017-08-197.950376
430087092017-08-2029AUTOMOTIVE02017-08-208.325301
530104912017-08-2129AUTOMOTIVE02017-08-216.421023
630122732017-08-2229AUTOMOTIVE02017-08-226.901322
730140552017-08-2329AUTOMOTIVE02017-08-236.409948
830158372017-08-2429AUTOMOTIVE02017-08-245.036301
930176192017-08-2529AUTOMOTIVE02017-08-256.838067
1030194012017-08-2629AUTOMOTIVE02017-08-268.204809
1130211832017-08-2729AUTOMOTIVE02017-08-278.558502
1230229652017-08-2829AUTOMOTIVE02017-08-286.625649
1330247472017-08-2929AUTOMOTIVE02017-08-297.066330
1430265292017-08-3029AUTOMOTIVE02017-08-306.506890
1530283112017-08-3129AUTOMOTIVE02017-08-316.418256
# Forecast all series in parallel (left commented out for the demo run).
#start = time.time()

# Disable outputs
#logging.getLogger("cmdstanpy").disabled = True

# Parallelization
#p = Pool(num_cpus)
#result = p.map(prophet_model, items)

#end = time.time()
# Elapsed time is end - start (the original start - end would be negative).
#print(f'Elapsed Modeling Time:  {round((end - start) / 60, 2)} minutes..')

# Compile output
#res = pd.concat([i for i in result])
#out = res[['id', 'sales']]
#out.shape
test.shape
(28512, 7)
# Write file
# NOTE(review): with the parallel section above commented out, `out` here is
# NOT the compiled all-series forecast (`res[['id', 'sales']]`) — it is
# whatever `out` was last assigned in the earlier demo cells. Confirm the
# parallel run has executed before submitting.
out.to_csv('submission.csv', index = False)
# Visualize outputs: pick one series at random to inspect.
sample = random.choice(items)
sample
'29_PET SUPPLIES'
# Init model: holiday effects plus the two extra regressors.
model = Prophet(holidays = holidays[['ds', 'holiday']])
model.add_regressor('onpromotion')
model.add_regressor('oil')

# Compile data in Prophet's expected ds/y layout.
trainingdata = df[['ds', sample]].rename({sample: 'y'}, axis = 1)
trainingdata['y'] = trainingdata['y'].astype(float)

# Add regressors; missing promotions default to 0, oil gaps are back-filled.
# .bfill() replaces fillna(method='bfill'), deprecated/removed in pandas 2.x.
trainingdata = trainingdata.merge(promos[['ds', sample]], how = 'left').rename({sample: 'onpromotion'}, axis = 1).merge(oil, how = 'left', on = 'ds')
trainingdata['onpromotion'] = trainingdata['onpromotion'].fillna(0)
trainingdata['oil'] = trainingdata['oil'].bfill()

# Train
model.fit(trainingdata)

# Init predictions: the inner merge restricts the 16-day future frame to the
# test window (the dates present in test_promos).
fut = model.make_future_dataframe(periods = 16)
merge_content = test_promos[['ds', sample]].rename({sample: 'onpromotion'}, axis = 1).merge(oil, how = 'left', on = 'ds')
fut = fut.merge(merge_content)
fut['oil'] = fut['oil'].bfill()

# Keep the full prediction frame here so the component plots below work.
preds = model.predict(fut)
preds['ds'] = pd.to_datetime(preds['ds'])

# Output: join forecasts onto the test rows for this series.
out = test[test['item'] == sample].drop('item', axis = 1)
out = out.merge(preds, how = 'left', on = 'ds').rename({'yhat': 'sales'}, axis = 1)
14:13:53 - cmdstanpy - INFO - Chain [1] start processing
14:13:53 - cmdstanpy - INFO - Chain [1] done processing
# Plot the forecast (with uncertainty interval) over the full history.
print(f"Examining Outputs for {sample}")
    
model.plot(preds)
Examining Outputs for 29_PET SUPPLIES

png

png

model.plot_components(preds)

png

png

print('Showing Time Series for Model: ')
Showing Time Series for Model: 
out.head()
iddatestore_nbrfamilyonpromotion_xdstrendyhat_loweryhat_uppertrend_lower...weeklyweekly_lowerweekly_upperyearlyyearly_loweryearly_uppermultiplicative_termsmultiplicative_terms_lowermultiplicative_terms_uppersales
030016072017-08-1629PET SUPPLIES02017-08-167.5143974.4041908.7894597.514397...-0.199900-0.199900-0.199900-0.229431-0.229431-0.2294310.00.00.06.625475
130033892017-08-1729PET SUPPLIES02017-08-177.5207784.4041798.6554747.520778...-0.343361-0.343361-0.343361-0.181378-0.181378-0.1813780.00.00.06.541952
230051712017-08-1829PET SUPPLIES02017-08-187.5271584.1137278.5308787.527158...-0.558148-0.558148-0.558148-0.131434-0.131434-0.1314340.00.00.06.414476
330069532017-08-1929PET SUPPLIES02017-08-197.5335395.3168519.6936127.533539...0.5100690.5100690.510069-0.080453-0.080453-0.0804530.00.00.07.515592
430087352017-08-2029PET SUPPLIES02017-08-207.5399205.2915869.9290547.539920...0.7369000.7369000.736900-0.029338-0.029338-0.0293380.00.00.07.799919

5 rows × 345 columns

out.shape
(16, 345)
# Overlay the training history (blue) and the 16-day forecast (red).
fig, ax = plt.subplots()
ax.scatter(trainingdata['ds'], trainingdata['y'], s = .1, color = 'dodgerblue')
ax.scatter(out['ds'], out['sales'], s = .2, color = 'red')
plt.ylabel('Count of Item')
plt.xlabel('Date')
Text(0.5, 0, 'Date')

png