5
Reformatted from http://www.dataversity.net/forecasting-prophet-r/ for easier comprehension. I do not know how to write code as in http://www.dataversity.net/forecasting-prophet-r/, and what follows is NOT my work. I learned a lot from Stephen’s original, but I felt many others could benefit if only they could simply cut and paste. So, I have shared the same code in a format you can cut and paste. And, I have also reconstructed portions for ease of comprehension, using simple R constructs, for readability. Furthermore, I am very grateful to Stephen, and I learned a lot over a few hours. library(data.table) library(dplyr) library(lubridate) library(dtplyr) library(ggplot2) library(ggthemes) library(purrr) library(splines) library(forecast) library(prophet) library(gam) library(randomForest) # Note that I have removed options and suppressMessages… #don’t suppress unless your first name is Hadley or Stephen xxq<-fread("download from the source and delete some rows at the bottom…") head(xxq) tail(xxq) names(xxq)<-c("ds","y") xxq$ds<- ymd(as.Date(xxq$ds,"%m/%d/%Y")) head(xxq)

Reformatted for-c-n-p

Embed Size (px)

Citation preview

Page 1: Reformatted for-c-n-p

Reformatted from http://www.dataversity.net/forecasting-prophet-r/ for easier comprehension.

I do not know how to write code as in http://www.dataversity.net/forecasting-prophet-r/, and what

follows is NOT my work. I learned a lot from Stephen’s original, but I felt many others could benefit if

only they could simply cut and paste. So, I have shared the same code in a format you can cut and paste.

And, I have also reconstructed portions for ease of comprehension, using simple R constructs, for

readability. Furthermore, I am very grateful to Stephen, and I learned a lot over a few hours.

library(data.table)

library(dplyr)

library(lubridate)

library(dtplyr)

library(ggplot2)

library(ggthemes)

library(purrr)

library(splines)

library(forecast)

library(prophet)

library(gam)

library(randomForest)

# Note that I have removed options and suppressMessages…

#don’t suppress unless your first name is Hadley or Stephen

# Load the raw daily series. The string passed to fread() is a placeholder:
# replace it with the path to the file downloaded from the source.
xxq <- fread("download from the source and delete some rows at the bottom…")

# Inspect both ends of the table to confirm the trailing rows were removed.
head(xxq)
tail(xxq)

# Use the column names that every model formula below refers to.
names(xxq) <- c("ds", "y")

# Parse the month/day/year strings into dates.
xxq$ds <- ymd(as.Date(xxq$ds, format = "%m/%d/%Y"))

head(xxq)

Page 2: Reformatted for-c-n-p

# Force the response column to double so the metrics and models below
# all work on a plain numeric vector.
xxq$y <- as.numeric(xxq$y)

# Original (dtplyr) form of the same conversion, kept for reference:
#invisible(xxq<-dtplyr::tbl_dt(xxq[,y:=as.numeric(y)] ))

Note that I simply renamed the columns. The original dataset comes with column names other than

c(“date”, “daily births”) – the author must have renamed these columns. Note that you must also delete some

rows at the bottom of the dataset from the source, available here. The authors make the code available to read,

and I am making it available in a format one can cut and paste, translating Hadley constructs into

simpler constructs.

# Train/test split: the last `ntest` rows of xxq are held out as the test
# set; everything before them is used for fitting.
ntest <- 365

#xxq<-xxq[complete.cases(xxq)]

len <- nrow(xxq)

# Fail loudly when the series is too short: with len <= ntest the index
# 1:(len - ntest) would count downwards (1, 0, ...) and silently select the
# wrong rows instead of an empty training set.
stopifnot(len > ntest)

trainq <- xxq[seq_len(len - ntest)]
testq <- xxq[(len - ntest + 1):len]

# Structure of both partitions.
str(trainq)
str(testq)

# Date range covered by each partition, printed as c(first, last).
c(min(trainq$ds), max(trainq$ds))
c(min(testq$ds), max(testq$ds))

# Root-mean-squared error between two numeric vectors, rounded to two
# decimals. The squared errors are summed and divided by length(actual).
rmse <- function(actual, predicted) {
  squared_err <- (actual - predicted)^2
  mean_squared <- sum(squared_err) / length(actual)
  round(mean_squared^0.5, 2)
}

# Mean absolute percentage error (in percent) between two numeric vectors,
# rounded to two decimals. Each error is taken relative to `actual`.
mape <- function(actual, predicted) {
  pct_err <- 100 * abs((actual - predicted) / actual)
  round(mean(pct_err), 2)
}

# Model 1 -- trend only: least-squares fit of y on a natural spline basis
# of the date, ns(ds, 13).
slug1 <- lm(y ~ ns(ds, 13), data = trainq)

# Store predictions on the training rows and on the held-out rows.
trainq$pred1 <- predict(slug1)
testq$pred1 <- predict(slug1, newdata = testq)

# Error metrics, each printed as c(train, test).
c(rmse(trainq$y, trainq$pred1), rmse(testq$y, testq$pred1))
c(mape(trainq$y, trainq$pred1), mape(testq$y, testq$pred1))

# Model 2 -- day-of-week only: one level per weekday as a factor predictor.
slug2 <- lm(y ~ as.factor(lubridate::wday(ds)), data = trainq)

# Store predictions on the training rows and on the held-out rows.
trainq$pred2 <- predict(slug2)
testq$pred2 <- predict(slug2, newdata = testq)

# RMSE printed as c(train, test).
c(rmse(trainq$y, trainq$pred2), rmse(testq$y, testq$pred2))

Page 3: Reformatted for-c-n-p

# MAPE of the day-of-week model (pred2), printed as c(train, test).
c(mape(trainq$y, trainq$pred2), mape(testq$y, testq$pred2))

# Model 3 -- month only: one level per calendar month as a factor predictor.
slug3 <- lm(y ~ as.factor(lubridate::month(ds)), data = trainq)

# Store predictions on the training rows and on the held-out rows.
trainq$pred3 <- predict(slug3)
testq$pred3 <- predict(slug3, newdata = testq)

# Error metrics, each printed as c(train, test).
c(rmse(trainq$y, trainq$pred3), rmse(testq$y, testq$pred3))
c(mape(trainq$y, trainq$pred3), mape(testq$y, testq$pred3))

# Model 4 -- combined linear model: spline trend plus weekday and month
# factors, i.e. the three single-effect models above in one formula.
slug4 <- lm(
  y ~ ns(ds, 13) +
    as.factor(lubridate::wday(ds)) +
    as.factor(lubridate::month(ds)),
  data = trainq
)

# Store predictions on the training rows and on the held-out rows.
trainq$pred4 <- predict(slug4)
testq$pred4 <- predict(slug4, newdata = testq)

# Error metrics, each printed as c(train, test).
c(rmse(trainq$y, trainq$pred4), rmse(testq$y, testq$pred4))
c(mape(trainq$y, trainq$pred4), mape(testq$y, testq$pred4))

# Model 4a -- same predictors as the combined linear model but fitted with
# gam() and a slightly smaller spline basis, ns(ds, 11).
slug4a <- gam(
  y ~ ns(ds, 11) +
    as.factor(lubridate::wday(ds)) +
    as.factor(lubridate::month(ds)),
  data = trainq
)

# Store predictions on the training rows and on the held-out rows.
trainq$pred4a <- predict(slug4a)
testq$pred4a <- predict(slug4a, newdata = testq)

# Error metrics, each printed as c(train, test).
c(rmse(trainq$y, trainq$pred4a), rmse(testq$y, testq$pred4a))
c(mape(trainq$y, trainq$pred4a), mape(testq$y, testq$pred4a))

# Model 4b -- random forest (100 trees) on the raw date plus weekday and
# month factors.
slug4b <- randomForest(
  y ~ ds +
    as.factor(lubridate::wday(ds)) +
    as.factor(lubridate::month(ds)),
  data = trainq,
  ntree = 100
)

# NOTE(review): per the randomForest documentation, predict() without
# newdata returns out-of-bag predictions, so the "train" figures below are
# presumably not plain in-sample fits -- confirm before comparing models.
trainq$pred4b <- predict(slug4b)
testq$pred4b <- predict(slug4b, newdata = testq)

# Error metrics, each printed as c(train, test).
c(rmse(trainq$y, trainq$pred4b), rmse(testq$y, testq$pred4b))
c(mape(trainq$y, trainq$pred4b), mape(testq$y, testq$pred4b))

# Model 5 -- prophet, fitted on the training rows. The data.table is
# converted to a plain data.frame before being handed to prophet().
slug5 <- prophet(as.data.frame(trainq))

Page 4: Reformatted for-c-n-p

# Build the prediction frame (history plus future periods) and forecast it.
# NOTE(review): periods is ntest + 1, but only rows up to n + ntest are read
# below, so the last forecast row is presumably unused -- confirm.
future <- make_future_dataframe(
  slug5,
  periods = ntest + 1,
  include_history = TRUE
)
forecast <- predict(slug5, future)

# The first n forecast rows are assigned to the training rows and the next
# ntest rows to the test rows; this relies on `forecast` being in the same
# row order as trainq followed by testq.
n <- nrow(trainq)
trainq$yhat <- forecast$yhat[seq_len(n)]
testq$yhat <- forecast$yhat[(n + 1):(n + ntest)]

# Error metrics, each printed as c(train, test).
c(rmse(trainq$y, trainq$yhat), rmse(testq$y, testq$yhat))
c(mape(trainq$y, trainq$yhat), mape(testq$y, testq$yhat))

# Trend component: the spline-only predictions (pred1) over the training
# dates, with the y axis anchored at zero and 25% headroom above the maximum.
ggplot(trainq, aes(x = ds, y = pred1)) +
  geom_line() +
  ylim(0, 1.25 * max(trainq$pred1)) +
  labs(title = "Year", x = "Year", y = "Births")

# Day-of-week effect: plot the weekday-only predictions (pred2) for one
# representative date per weekday.

# First date on which each weekday occurs in the training data. data.table
# `by` returns groups in order of first appearance, so uwdays (and `days`
# below) follow the order in which weekdays occur in the data, not 1..7.
uwdays <- trainq[, .(.N, w = min(ds)), .(as.factor(lubridate::wday(ds)))]$w

# Weekday label for each representative date, in the same order as uwdays.
days <- lubridate::wday(uwdays, label = TRUE)

# Each break is the weekday number of the date its label was derived from.
# Using 1:length(days) as breaks mislabels the axis whenever the series does
# not start on weekday 1, because `days` is not sorted by weekday number.
ggplot(
  aes(y = pred2, x = lubridate::wday(ds)),
  data = trainq[ds %in% uwdays]
) +
  scale_x_continuous(
    breaks = lubridate::wday(uwdays),
    labels = as.character(days)
  ) +
  labs(title = "Day of Week", x = "Day", y = "Births") +
  ylim(0, 1.25 * max(trainq$pred2)) +
  geom_point() +
  geom_line()

# Month effect: plot the month-only predictions (pred3) for one
# representative date per calendar month.

# First date on which each month occurs in the training data. data.table
# `by` returns groups in order of first appearance, so umonths (and `months`
# below) follow the order in which months occur in the data.
umonths <- trainq[, .(.N, w = min(ds)), .(as.factor(lubridate::month(ds)))]$w

# Month label for each representative date, in the same order as umonths.
months <- lubridate::month(umonths, label = TRUE)

# Each break is the month number of the date its label was derived from.
# Using 1:length(months) as breaks is only correct when the series happens
# to start in January.
ggplot(
  aes(y = pred3, x = lubridate::month(ds)),
  data = trainq[ds %in% umonths]
) +
  scale_x_continuous(
    breaks = lubridate::month(umonths),
    labels = as.character(months)
  ) +
  labs(title = "Month", x = "Month", y = "Births") +
  ylim(0, 1.25 * max(trainq$pred3)) +
  geom_point() +
  geom_line()

# Prophet's own component plots for the fitted model and its forecast,
# drawn with uncertainty enabled.
prophet_plot_components(slug5, forecast, uncertainty = TRUE)

Page 5: Reformatted for-c-n-p

# Long-format copy of the test predictions: one row per (date, model), with
# the model column name in `variable` and its prediction in `value`.
# Argument names are spelled out rather than relying on partial matching.
testqstack <- data.table::melt(
  testq,
  id.vars = "ds",
  measure.vars = c("yhat", "pred4", "pred4a", "pred4b")
)

# Facet labels keyed by prediction column: "<model> -- <rmse> : <mape>",
# both computed on the test set. Every format string is kept on one line so
# that no label contains an embedded line break.
model_names <- c(
  `yhat` = sprintf(
    "prophet -- %.2f : %.2f",
    rmse(testq$y, testq$yhat), mape(testq$y, testq$yhat)
  ),
  `pred4` = sprintf(
    "lm -- %.2f : %.2f",
    rmse(testq$y, testq$pred4), mape(testq$y, testq$pred4)
  ),
  `pred4a` = sprintf(
    "gam -- %.2f : %.2f",
    rmse(testq$y, testq$pred4a), mape(testq$y, testq$pred4a)
  ),
  `pred4b` = sprintf(
    "randomForest -- %.2f : %.2f",
    rmse(testq$y, testq$pred4b), mape(testq$y, testq$pred4b)
  )
)

# Model comparison on the test set: one facet per model, with the actual
# series drawn in grey behind each model's predictions. The facet titles
# come from model_names.
g1 <- ggplot(testqstack, aes(x = ds, y = value, color = variable)) +
  # testq has no `variable` column, so the actual series is drawn in
  # every facet.
  geom_line(data = testq, aes(y = y), colour = "grey70") +
  geom_line() +
  facet_wrap(~variable, ncol = 2, labeller = as_labeller(model_names)) +
  scale_x_date() +
  ylim(0, 1.25 * max(testqstack$value)) +
  theme(
    axis.text = element_text(size = 5),
    axis.title = element_text(size = 6),
    legend.position = "none"
  ) +
  labs(
    title = "Model Comparison, Test data, with rmse and mape Statistics.",
    x = "Time",
    y = "Births"
  )

print(g1)