Upload
raman-kannan
View
145
Download
1
Embed Size (px)
Citation preview
Reformatted from http://www.dataversity.net/forecasting-prophet-r/ for easier comprehension.
I do not know how to write code as in http://www.dataversity.net/forecasting-prophet-r/, and what
follows is NOT my work. I learned a lot from Stephen’s original, but I felt many others could benefit if
only they could simply cut and paste. So, I have shared the same code in a format you can cut and paste.
And, I have also reconstructed portions for ease of comprehension, using simple R constructs, for
readability. Furthermore, I am very grateful to Stephen, and I learned a lot over a few hours.
library(data.table)
library(dplyr)
library(lubridate)
library(dtplyr)
library(ggplot2)
library(ggthemes)
library(purrr)
library(splines)
library(forecast)
library(prophet)
library(gam)
library(randomForest)
# Note that I have removed options and suppressMessages…
#don’t suppress unless your first name is Hadley or Stephen
# Read the raw file. The path below is a placeholder: substitute the file
# downloaded from the source, with the extra rows at the bottom removed.
xxq <- fread("download from the source and delete some rows at the bottom…")
head(xxq)
tail(xxq)

# Rename the two columns to `ds` (date) and `y` (value); these are the names
# used by every model further down in this script.
names(xxq) <- c("ds", "y")

# Parse month/day/year text into dates.
xxq$ds <- ymd(as.Date(xxq$ds, "%m/%d/%Y"))
head(xxq)

# Force the response column to numeric.
xxq$y <- as.numeric(xxq$y)
#invisible(xxq<-dtplyr::tbl_dt(xxq[,y:=as.numeric(y)] ))
Note that I simply renamed the columns. The original dataset comes with column names other than
c("date", "daily births") – the author must have renamed these columns. Note that you must also delete
some rows at the bottom of the dataset from the source, available here. The authors make the code
available to read, and I am making it available in a format one can cut and paste, translating Hadley
constructs into easier constructs.
# Hold out the last `ntest` rows (365, i.e. one year if the data are daily)
# as the test set and train on every row before them.
ntest <- 365
#xxq<-xxq[complete.cases(xxq)]
len <- nrow(xxq)
# Guard: with len <= ntest, `1:(len - ntest)` counts downwards (e.g. c(1, 0)),
# so the training index would overlap the test rows or be invalid.
stopifnot(len > ntest)
trainq <- xxq[seq_len(len - ntest)]
testq <- xxq[(len - ntest + 1):len]
str(trainq)
str(testq)
# Date span covered by each partition.
c(min(trainq$ds), max(trainq$ds))
c(min(testq$ds), max(testq$ds))
# Root-mean-squared error between `actual` and `predicted`, rounded to two
# decimal places. The divisor is length(actual).
rmse <- function(actual, predicted) {
  squared_error <- (actual - predicted)^2
  mean_squared_error <- sum(squared_error) / length(actual)
  round(mean_squared_error^0.5, 2)
}
# Mean absolute percentage error of `predicted` relative to `actual`,
# expressed in percent and rounded to two decimal places.
mape <- function(actual, predicted) {
  pct_error <- 100 * abs((actual - predicted) / actual)
  round(mean(pct_error), 2)
}
# Model 1: linear regression of y on an `ns()` spline basis of the date
# (second argument 13), fitted on the training rows only.
slug1 <- lm(y ~ ns(ds, 13), data = trainq)
trainq$pred1 <- predict(slug1)
testq$pred1 <- predict(slug1, newdata = testq)
# Printed as c(train, test): first RMSE, then MAPE.
c(with(trainq, rmse(y, pred1)), with(testq, rmse(y, pred1)))
c(with(trainq, mape(y, pred1)), with(testq, mape(y, pred1)))
# Model 2: linear regression of y on day of week, entered as a factor.
slug2 <- lm(y ~ as.factor(lubridate::wday(ds)), data = trainq)
trainq$pred2 <- predict(slug2)
testq$pred2 <- predict(slug2, newdata = testq)
# Printed as c(train, test): first RMSE, then MAPE.
c(with(trainq, rmse(y, pred2)), with(testq, rmse(y, pred2)))
c(with(trainq, mape(y, pred2)), with(testq, mape(y, pred2)))
# Model 3: linear regression of y on calendar month, entered as a factor.
slug3 <- lm(y ~ as.factor(lubridate::month(ds)), data = trainq)
trainq$pred3 <- predict(slug3)
testq$pred3 <- predict(slug3, newdata = testq)
# Printed as c(train, test): first RMSE, then MAPE.
c(with(trainq, rmse(y, pred3)), with(testq, rmse(y, pred3)))
c(with(trainq, mape(y, pred3)), with(testq, mape(y, pred3)))
# Model 4: linear regression combining the three earlier components --
# spline of the date, day-of-week factor and month factor.
slug4 <- lm(
  y ~ ns(ds, 13) +
    as.factor(lubridate::wday(ds)) +
    as.factor(lubridate::month(ds)),
  data = trainq
)
trainq$pred4 <- predict(slug4)
testq$pred4 <- predict(slug4, newdata = testq)
# Printed as c(train, test): first RMSE, then MAPE.
c(with(trainq, rmse(y, pred4)), with(testq, rmse(y, pred4)))
c(with(trainq, mape(y, pred4)), with(testq, mape(y, pred4)))
# Model 4a: same terms as model 4 but fitted with gam(), and with the spline
# argument set to 11 instead of 13.
slug4a <- gam(
  y ~ ns(ds, 11) +
    as.factor(lubridate::wday(ds)) +
    as.factor(lubridate::month(ds)),
  data = trainq
)
trainq$pred4a <- predict(slug4a)
testq$pred4a <- predict(slug4a, newdata = testq)
# Printed as c(train, test): first RMSE, then MAPE.
c(with(trainq, rmse(y, pred4a)), with(testq, rmse(y, pred4a)))
c(with(trainq, mape(y, pred4a)), with(testq, mape(y, pred4a)))
# Model 4b: random forest (100 trees) on the raw date plus the day-of-week
# and month factors.
# NOTE(review): no seed is set before this call, so results presumably vary
# from run to run -- confirm whether that is intended.
slug4b <- randomForest(
  y ~ ds +
    as.factor(lubridate::wday(ds)) +
    as.factor(lubridate::month(ds)),
  data = trainq,
  ntree = 100
)
# NOTE(review): predict() on a randomForest without `newdata` is documented to
# return out-of-bag predictions, so the "train" figures below are likely OOB
# errors rather than in-sample fit -- confirm against the randomForest docs.
trainq$pred4b <- predict(slug4b)
testq$pred4b <- predict(slug4b, newdata = testq)
# Printed as c(train, test): first RMSE, then MAPE.
c(with(trainq, rmse(y, pred4b)), with(testq, rmse(y, pred4b)))
c(with(trainq, mape(y, pred4b)), with(testq, mape(y, pred4b)))
# Model 5: prophet fitted on the training rows, then asked to predict over
# the training history plus the following ntest + 1 periods.
slug5 <- prophet(as.data.frame(trainq))
future <- make_future_dataframe(
  slug5,
  periods = ntest + 1,
  include_history = TRUE
)
forecast <- predict(slug5, future)
n <- nrow(trainq)
# Forecast rows are matched to train/test rows purely by position: the first
# n are taken as the history and the next ntest as the hold-out.
# NOTE(review): assumes one forecast row per input row, in the same order
# (no gaps or duplicate dates) -- TODO confirm.
trainq$yhat <- forecast$yhat[seq_len(n)]
testq$yhat <- forecast$yhat[n + seq_len(ntest)]
# Printed as c(train, test): first RMSE, then MAPE.
c(with(trainq, rmse(y, yhat)), with(testq, rmse(y, yhat)))
c(with(trainq, mape(y, yhat)), with(testq, mape(y, yhat)))
# Line plot of the model-1 fitted values against date over the training rows,
# with the y axis running from 0 to 125% of the largest fitted value.
ggplot(data = trainq, mapping = aes(x = ds, y = pred1)) +
  ylim(0, 1.25 * max(trainq$pred1)) +
  labs(title = "Year", x = "Year", y = "Births") +
  geom_line()
# Day-of-week effect from model 2, drawn from one representative date per
# weekday (the earliest training date falling on it).
#
# `keyby` returns the groups sorted by weekday number. Plain `by` returns them
# in order of first appearance, which attached the wrong labels to breaks 1..7
# whenever the data did not start on weekday 1.
uwdays <- trainq[, .(w = min(ds)), keyby = .(wd = lubridate::wday(ds))]$w
days <- lubridate::wday(uwdays, label = TRUE)
ggplot(
  aes(y = pred2, x = lubridate::wday(ds)),
  data = trainq[ds %in% uwdays]
) +
  # Breaks come from the same dates as the labels, so the two always line up,
  # even if some weekday is missing from the training data.
  scale_x_continuous(
    breaks = lubridate::wday(uwdays),
    labels = as.character(days)
  ) +
  labs(title = "Day of Week", x = "Day", y = "Births") +
  ylim(0, 1.25 * max(trainq$pred2)) +
  geom_point() +
  geom_line()
# Month effect from model 3, drawn from one representative date per calendar
# month (the earliest training date falling in it).
#
# `keyby` returns the groups sorted by month number. Plain `by` returns them
# in order of first appearance, which attached the wrong labels to breaks
# 1..12 whenever the data did not start in January.
umonths <- trainq[, .(w = min(ds)), keyby = .(mo = lubridate::month(ds))]$w
months <- lubridate::month(umonths, label = TRUE)
ggplot(
  aes(y = pred3, x = lubridate::month(ds)),
  data = trainq[ds %in% umonths]
) +
  # Breaks come from the same dates as the labels, so the two always line up,
  # even if some month is missing from the training data.
  scale_x_continuous(
    breaks = lubridate::month(umonths),
    labels = as.character(months)
  ) +
  labs(title = "Month", x = "Month", y = "Births") +
  ylim(0, 1.25 * max(trainq$pred3)) +
  geom_point() +
  geom_line()
# Component plots for the prophet model and its forecast, drawn with
# uncertainty enabled.
prophet_plot_components(slug5, forecast, uncertainty = TRUE)
# Test-set comparison of the four full models, one facet per model.

# Long format: one row per (date, model) pair. Argument names are spelled out
# instead of relying on partial matching of `id` / `measure`.
testqstack <- data.table::melt(
  testq,
  id.vars = "ds",
  measure.vars = c("yhat", "pred4", "pred4a", "pred4b")
)

# Facet titles: model name followed by its test RMSE and MAPE. Every format
# string stays on a single line so that no newline ends up in a label.
model_names <- c(
  `yhat` = sprintf(
    "prophet -- %.2f : %.2f",
    rmse(testq$y, testq$yhat), mape(testq$y, testq$yhat)
  ),
  `pred4` = sprintf(
    "lm -- %.2f : %.2f",
    rmse(testq$y, testq$pred4), mape(testq$y, testq$pred4)
  ),
  `pred4a` = sprintf(
    "gam -- %.2f : %.2f",
    rmse(testq$y, testq$pred4a), mape(testq$y, testq$pred4a)
  ),
  `pred4b` = sprintf(
    "randomForest -- %.2f : %.2f",
    rmse(testq$y, testq$pred4b), mape(testq$y, testq$pred4b)
  )
)

g1 <- ggplot(aes(y = value, x = ds, color = variable), data = testqstack) +
  # Observed values in grey. `testq` has no `variable` column, so this layer
  # is not split by facet.
  geom_line(data = testq, aes(y = y), colour = "grey70") +
  geom_line() +
  facet_wrap(~variable, ncol = 2, labeller = as_labeller(model_names)) +
  scale_x_date() +
  ylim(0, 1.25 * max(testqstack$value)) +
  theme(
    axis.text = element_text(size = 5),
    axis.title = element_text(size = 6)
  ) +
  theme(legend.position = "none") +
  labs(
    title = "Model Comparison, Test data, with rmse and mape Statistics.",
    x = "Time",
    y = "Births"
  )
print(g1)