Appendix final project

Preview:

Citation preview

# Appendix: Project final
#
# 1. I have more than 3 covariates, so I do not need to add any other
#    variables to get higher correlation.

# Read the bike data and discard its first five columns.
bike.new <- read.csv("C:/Users/dell1/Desktop/bike new.csv")
bike.new1 <- bike.new[, -(1:5)]

# Correlation matrix of the remaining columns: printed first, then drawn
# with the coefficients shown as numbers.
cor(bike.new1)
library(corrplot)
corrplot(cor(bike.new1), method = "number")

# Scatterplot matrix of the same columns.
pairs(bike.new1)

We found that temp and atemp are highly correlated.

# 2. Autocorrelation and cross-correlation

# acf() applied to all columns of bike.new1 at once, up to lag 24.
acf(bike.new1, lag.max = 24)

# Cross-correlation function for each pair of variables.
ccf(bike.new1$temp, bike.new1$atemp)

ccf(bike.new1$temp, bike.new1$windspeed)

ccf(bike.new1$temp, bike.new1$humidity)

ccf(bike.new1$atemp, bike.new1$humidity)

ccf(bike.new1$atemp, bike.new1$windspeed)

ccf(bike.new1$humidity, bike.new1$windspeed)

Temp and atemp might be more appropriate.

3. The mean should be made constant. This can be achieved by removing the windspeed and the humidity components, or by differencing the series.

# 4. Simulate an ARMA(1, 1) series and identify its order

# 200 observations with AR coefficient 0.7 and MA coefficient 0.7.
# No seed is set in the visible code, so a rerun draws a different series.
x <- arima.sim(model = list(ar = 0.7, ma = 0.7), n = 200)
plot(x)

acf(x)

pacf(x)

# Automatic order selection, allowing up to 10 AR and 10 MA terms.
library(forecast)
auto.arima(x, max.p = 10, max.q = 10)
# Output recorded in the report:
#
# Series: x
# ARIMA(1,0,1) with zero mean
#
# Coefficients:
#          ar1     ma1
#       0.6499  0.7323
# s.e.  0.0582  0.0571
#
# sigma^2 estimated as 1.175:  log likelihood=-300.99
# AIC=607.98   AICc=608.1   BIC=617.88

# 5. Subset selection

library(leaps)

# Exhaustive search over models of temp on all other columns of bike.new1.
regfit.full <- regsubsets(temp ~ ., bike.new1, nvmax = 13)
summary(regfit.full)
# Output recorded in the report:
#
# Subset selection object
# Call: regsubsets.formula(temp ~ ., bike.new1)
# 3 Variables  (and intercept)
#           Forced in Forced out
# atemp         FALSE      FALSE
# humidity      FALSE      FALSE
# windspeed     FALSE      FALSE
# 1 subsets of each size up to 3
# Selection Algorithm: exhaustive
#          atemp humidity windspeed
# 1  ( 1 ) "*"   " "      " "
# 2  ( 1 ) "*"   " "      "*"
# 3  ( 1 ) "*"   "*"      "*"

# Keep the summary object so its components can be plotted.
regfit.summary <- summary(regfit.full)
names(regfit.summary)
# [1] "which"  "rsq"    "rss"    "adjr2"  "cp"     "bic"    "outmat" "obj"

plot(regfit.summary$rsq)

plot(regfit.summary$bic)

# The lowest BIC is achieved by the 2nd model, which includes all of the
# variables except humidity.
coef(regfit.full, 2)
# (Intercept)       atemp   windspeed
# -1.82858449  0.85009266  0.09934351

# Backward stepwise selection
regfit.bwd <- regsubsets(
  temp ~ .,
  bike.new1,
  nvmax = 13,
  method = "backward"
)
summary(regfit.bwd)
# Selection Algorithm: backward
#          atemp humidity windspeed
# 1  ( 1 ) "*"   " "      " "
# 2  ( 1 ) "*"   " "      "*"
# 3  ( 1 ) "*"   "*"      "*"

# MSE: 4-fold cross-validated error for the selected model of each size
library(boot)

# One entry per row of the selection matrix (one row per model size).
n_models <- nrow(regfit.summary$which)
CVmse <- numeric(n_models)

for (i in seq_len(n_models)) {
  # Logical flags for the predictors of model i, with the intercept dropped.
  selected <- regfit.summary$which[i, -1]

  # Pick the columns by NAME. The previous version used which() positions
  # from this intercept-dropped vector as column numbers of bike.new1 and
  # appended a hard-coded column 4 as the response; that only lines up when
  # the predictors sit in columns 1..3 and temp is column 4.
  tempCols <- c("temp", names(selected)[selected])
  tempData <- bike.new1[, tempCols, drop = FALSE]

  # Fit temp on the selected predictors and cross-validate with K = 4 folds.
  tempGLM <- glm(temp ~ ., data = tempData)
  tempCV <- cv.glm(data = tempData, glmfit = tempGLM, K = 4)

  # delta[1] is the raw cross-validation estimate of prediction error.
  CVmse[i] <- tempCV$delta[1]
}

plot(CVmse)

The model with the lowest cross-validation error is the 3rd model, which includes atemp, humidity and windspeed.

# Generalized least squares
library(nlme)

# temp on atemp and humidity, with an ARMA(1, 1) correlation structure for
# the errors.
bike.new1.gls <- gls(
  temp ~ atemp + humidity,
  data = bike.new1,
  correlation = corARMA(p = 1, q = 1)
)
bike.new1.gls
summary(bike.new1.gls)

# Polynomial regression of temp on atemp for degrees 1..MAX_POLY.
MAX_POLY <- 3
testMSE <- numeric(MAX_POLY)  # preallocated rather than grown in the loop
for (i in seq_len(MAX_POLY)) {
  tempLm <- lm(temp ~ poly(atemp, i), data = bike.new1)
  # Mean squared difference between observed and predicted temp. The
  # subtraction operator was missing here ("bike.new1$temppredict(...)"),
  # which made the statement unparseable.
  # NOTE(review): predictions are made on the same rows used for fitting, so
  # this is an in-sample error even though the axis label says "Test MSE".
  testMSE[i] <- mean(
    (bike.new1$temp - predict(tempLm, newdata = bike.new1))^2
  )
}

plot(
  testMSE,
  xlab = "Degree of Polynomial",
  ylab = "Test MSE",
  type = "l",
  ylim = c(50, 150)
)

# Splines
# NOTE(review): gam() with bs = "tp" presumably comes from mgcv, which is not
# loaded in the visible code - confirm. Also confirm that s(atemp + humidity)
# is intended; a smooth of two covariates is normally written
# s(atemp, humidity).
plot(gam(temp ~ s(atemp + humidity, bs = "tp"), data = bike.new1))

# Interaction terms
# Remember that temp and atemp had high correlation.

# 2-D density contours of atemp against temp.
# ggplot()'s first argument is the data; the previous call passed aes(...) in
# that position, leaving the plot without a data frame. The columns are now
# mapped by name from bike.new1.
# NOTE(review): ggplot2 is not loaded in the visible code - confirm it is
# attached before this point.
ggplot(bike.new1, aes(x = temp, y = atemp)) +
  stat_density2d()

# GAM with a joint smooth of atemp and humidity plus a smooth of temp.
# Variables are referenced by bare name so they are resolved through `data`;
# the earlier bike.new1$... references always pointed at the full data frame,
# which prevents the formula from being re-evaluated on another data set
# (e.g. the folds used for cross-validation below).
# NOTE(review): temp is both the response and a smooth term here - confirm
# that s(temp) is intended.
gam.bike.new1 <- gam(temp ~ s(atemp, humidity) + s(temp), data = bike.new1)

# 10-fold cross-validation of the fitted GAM formula.
CVgam(formula(gam.bike.new1), bike.new1, nfold = 10)

# 6. Linear regression

# temp on atemp, humidity and windspeed. Columns are referenced through
# `data` instead of bike.new1$... so coefficient names stay short and the
# model can be used with predict() on new data.
lm1 <- lm(temp ~ atemp + humidity + windspeed, data = bike.new1)
summary(lm1)
# p-value: < 2.2e-16
# As the p-value is < 2.2e-16, humidity, windspeed and atemp have a linear
# regression relationship with temp.

# Scatter plots of temp against each predictor, each with a fitted line.
# abline() is now a separate call made after plot(); it used to be passed as
# a positional argument of plot(). The line drawn is the simple regression of
# temp on the plotted predictor, because abline() cannot draw the
# 4-coefficient model lm1 (it would use only its first two coefficients).
plot(bike.new1$atemp, bike.new1$temp,
     xlab = " atemp", ylab = "temp", main = "bike")
abline(lm(temp ~ atemp, data = bike.new1))

plot(bike.new1$humidity, bike.new1$temp,
     xlab = "humidity", ylab = "temp", main = "bike")
abline(lm(temp ~ humidity, data = bike.new1))

plot(bike.new1$windspeed, bike.new1$temp,
     xlab = "windspeed", ylab = "temp", main = "bike")
abline(lm(temp ~ windspeed, data = bike.new1))

# humidity as the response.
lm1 <- lm(humidity ~ temp + atemp + windspeed, data = bike.new1)
summary(lm1)
# p-value: 0.3597
# As the p-value is 0.3597, there is no evidence of a linear regression
# relationship.

# NOTE(review): this fit has temp as the response again, yet the recorded
# p-value (0.1421) differs from the earlier temp model (< 2.2e-16) -
# presumably a different response (e.g. windspeed) was intended; confirm.
# The duplicated windspeed term was removed (R drops repeated terms, so the
# fit is unchanged).
lm1 <- lm(temp ~ humidity + windspeed + atemp, data = bike.new1)
summary(lm1)
# p-value: 0.1421
# As the p-value is 0.1421, there is no evidence of a linear regression
# relationship.

# atemp as the response (duplicated windspeed term removed as above).
lm1 <- lm(atemp ~ humidity + windspeed + temp, data = bike.new1)
summary(lm1)
# p-value: < 2.2e-16
# As the p-value is < 2.2e-16, a linear regression relationship exists.

# Scatter plots of atemp against each predictor, each with a fitted line.
# abline() is now a separate call made after plot(); it used to be passed as
# a positional argument of plot(). The line drawn is the simple regression of
# atemp on the plotted predictor, because abline() cannot draw the
# 4-coefficient model lm1 (it would use only its first two coefficients).
plot(bike.new1$windspeed, bike.new1$atemp,
     xlab = "windspeed", ylab = "atemp", main = "bike")
abline(lm(atemp ~ windspeed, data = bike.new1))

plot(bike.new1$humidity, bike.new1$atemp,
     xlab = "humidity", ylab = "atemp", main = "bike")
abline(lm(atemp ~ humidity, data = bike.new1))

plot(bike.new1$temp, bike.new1$atemp,
     xlab = "temp", ylab = "atemp", main = "bike")
abline(lm(atemp ~ temp, data = bike.new1))

From the plots above, we found that temp and atemp have a linear relationship.

7 Bonus

We can also use the accrual package (Bayesian Accrual Prediction), the brglm package (Bias reduction in binomial-response generalized linear models), the bestglm package (Best Subset GLM) and the BayesBridge package (Bridge Regression) to make predictions on the dataset.

Recommended