Upload
goran-s-milovanovic
View
15.167
Download
3
Embed Size (px)
Citation preview
Introduction to R for Data Science
Lecturers
dipl. ing Branko Kovač
Data Analyst at CUBE/Data Science Mentor
at Springboard
Data Science zajednica Srbije
dr Goran S. Milovanović
Data Scientist at DiploFoundation
Data Science zajednica Srbije
Linear Regression in R
• Exploratory Data Analysis
• Assumptions of the Linear Model
• Correlation
• Normality Tests
• Linear Regression
• Prediction, Confidence
Intervals, Residuals
• Influential Cases and
the Influence Plot
Intro to R for Data Science
Session 6: Linear Regression in R
# Introduction to R for Data Science
# SESSION 6 :: 02 June, 2016

# clear workspace
# rm(list = ls())

#### read data
library(datasets)
data(iris)

### iris data set description:
# (fixed URL: the iris data set lives in {datasets}, not "iriss")
# https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/iris.html

### Exploratory Data Analysis (EDA)
str(iris)
summary(iris)
Linear Regression in R
• Before modeling: Assumptions and Exploratory Data Analysis (EDA)
Intro to R for Data Science
Session 6: Linear Regression in R
# Introduction to R for Data Science
# SESSION 6 :: 02 June, 2016

### EDA plots
# plot layout: 2 x 2
par(mfcol = c(2, 2))

# boxplot iris$Sepal.Length
boxplot(iris$Sepal.Length,
        horizontal = TRUE,
        xlab = "Sepal Length")

# histogram: iris$Sepal.Length
# (prob = TRUE, not T: T is reassignable, TRUE is not)
hist(iris$Sepal.Length,
     main = "",
     xlab = "Sepal.Length",
     prob = TRUE)

# overlay iris$Sepal.Length density function over the empirical distribution
lines(density(iris$Sepal.Length),
      lty = "dashed", lwd = 2.5, col = "red")
EDA
Intro to R for Data Science
Session 6: Linear Regression in R
Linear Regression in R
• EDA
Intro to R for Data Science
Session 6: Linear Regression in R
Intro to R for Data Science
Session 6: Linear Regression in R
# Introduction to R for Data Science
# SESSION 6 :: 02 June, 2016

## Pearson correlation in R {base}
cor1 <- cor(iris$Sepal.Length, iris$Petal.Length,
            method = "pearson")
cor1

par(mfcol = c(1, 1))
plot(iris$Sepal.Length, iris$Petal.Length,
     main = "Sepal Length vs Petal Length",
     xlab = "Sepal Length", ylab = "Petal Length")

## Correlation matrix and treatment of missing data
dSet <- iris
# Remove one discrete variable
dSet$Species <- NULL
# introduce NA in dSet$Sepal.Length[5]
dSet$Sepal.Length[5] <- NA

# Pairwise and Listwise Deletion:
cor1a <- cor(dSet, use = "complete.obs")           # listwise deletion
cor1a <- cor(dSet, use = "pairwise.complete.obs")  # pairwise deletion

# use = "all.obs" errors when NAs are present; tryCatch() demonstrates
# the error without halting the script (the original aborted here)
cor1b <- tryCatch(
  cor(dSet, use = "all.obs"),
  error = function(e) conditionMessage(e)  # all observations - error
)
cor1b
Correlation
Intro to R for Data Science
Session 6: Linear Regression in R
# Introduction to R for Data Science
# SESSION 6 :: 02 June, 2016

# Correlation w. {Hmisc}: rcorr() returns r, P, and n in one object
library(Hmisc)
cor2 <- rcorr(iris$Sepal.Length,
              iris$Petal.Length,
              type = "pearson")
cor2$r        # correlations
cor2$r[1, 2]  # that's what you need, right
cor2$P        # significance (p-values)
cor2$n        # num. observations
# NOTE: rcorr uses Pairwise deletion!

# Correlation matrix over the whole data set
cor2a <- rcorr(as.matrix(dSet),
               type = "pearson")  # NOTE: as.matrix

# keep only the correlations significant at alpha == .05;
# blank out (NA) every cell whose p-value is not below .05
w <- which(!(cor2a$P < .05), arr.ind = TRUE)
cor2a$r[w] <- NA
cor2a$P  # compare w.
cor2a$r
Correlation {Hmisc}
Intro to R for Data Science
Session 6: Linear Regression in R
# Introduction to R for Data Science
# SESSION 6 :: 02 June, 2016

# Linear Regression: lm()
# Predicting: Petal Length from Sepal Length
reg <- lm(Petal.Length ~ Sepal.Length, data = iris)
class(reg)
summary(reg)

# extract the fitted coefficients: intercept first, slope second
coefsReg <- coefficients(reg)
coefsReg
slopeReg <- coefsReg[2]
interceptReg <- coefsReg[1]

# Prediction from this model:
# 100 random Sepal.Length values drawn uniformly from the observed range
newSLength <- data.frame(
  Sepal.Length = runif(100,
                       min(iris$Sepal.Length),
                       max(iris$Sepal.Length))
)
# watch the variable names in the new data.frame!
predictPLength <- predict(reg, newSLength)
predictPLength
Linear Regression with lm()
Intro to R for Data Science
Session 6: Linear Regression in R
# Introduction to R for Data Science
# SESSION 6 :: 02 June, 2016

# Standardized regression coefficients {QuantPsyc}
library(QuantPsyc)
lm.beta(reg)

# Reminder: standardized regression coefficients are...
# what you would obtain upon performing linear regression over standardized variables
# z-score in R
zSLength <- scale(iris$Sepal.Length, center = TRUE, scale = TRUE)  # computes z-score
zPLength <- scale(iris$Petal.Length, center = TRUE, scale = TRUE)  # again; ?scale

# new dSet w. standardized variables
# BUGFIX: the original used `<-` inside data.frame(), which performs
# assignment instead of naming the columns — the columns then did NOT
# get the names Sepal.Length/Petal.Length and the formula below failed.
# Use `=` to name the columns.
dSet <- data.frame(Sepal.Length = as.numeric(zSLength),
                   Petal.Length = as.numeric(zPLength))

# Linear Regression w. lm() over standardized variables
reg1 <- lm(Petal.Length ~ Sepal.Length, data = dSet)
summary(reg1)

# compare
coefficients(reg1)[2]  # beta from reg1
lm.beta(reg)           # standardized beta w. QuantPsyc lm.beta from reg
Standardized Regression Coefficients
Intro to R for Data Science
Session 6: Linear Regression in R
# Introduction to R for Data Science
# SESSION 6 :: 02 June, 2016

# plots w. {base} and {ggplot2}
library(ggplot2)

# Predictor vs Criterion {base}
plot(iris$Sepal.Length, iris$Petal.Length,
     main = "Petal Length vs Sepal Length",
     xlab = "Sepal Length", ylab = "Petal Length")
abline(reg, col = "red")

# Predictor vs Criterion {ggplot2}
# (white-on-black double points produce open-circle markers;
#  legend suppressed because colour is mapped to a constant)
ggplot(data = iris,
       aes(x = Sepal.Length, y = Petal.Length)) +
  geom_point(size = 2, colour = "black") +
  geom_point(size = 1, colour = "white") +
  geom_smooth(aes(colour = "red"), method = 'lm') +
  ggtitle("Sepal Length vs Petal Length") +
  xlab("Sepal Length") +
  ylab("Petal Length") +
  theme(legend.position = "none")
Plots {base} vs {ggplot2}
Intro to R for Data Science
Session 6: Linear Regression in R
Plots {base} vs {ggplot2}
Intro to R for Data Science
Session 6: Linear Regression in R
# Introduction to R for Data Science
# SESSION 6 :: 02 June, 2016

# Predicted vs. residuals {ggplot2}
predReg <- predict(reg)          # get predictions from reg
resReg <- residuals(reg)         # get (raw) residuals from reg
# BUGFIX: this line was commented out in the original, but resStReg
# is referenced later (influence plot section) — it must be defined.
resStReg <- rstandard(reg)       # get standardized residuals from reg

plotFrame <- data.frame(predicted = predReg,
                        residual = resReg)
ggplot(data = plotFrame,
       aes(x = predicted, y = residual)) +
  geom_point(size = 2, colour = "black") +
  geom_point(size = 1, colour = "white") +
  geom_smooth(aes(colour = "blue"),
              method = 'lm',
              se = FALSE) +
  ggtitle("Predicted vs Residual Lengths") +
  xlab("Predicted Lengths") +
  ylab("Residual") +
  theme(legend.position = "none")
Predicted vs Residuals
Intro to R for Data Science
Session 6: Linear Regression in R
Predicted vs Residuals
Intro to R for Data Science
Session 6: Linear Regression in R
# Introduction to R for Data Science
# SESSION 6 :: 02 June, 2016

## Detect influential cases
infReg <- as.data.frame(influence.measures(reg)$infmat)

# Cook's Distance: Cook and Weisberg (1982):
# values greater than 1 are troublesome
wCook <- which(infReg$cook.d > 1)  # we're fine here

# Average Leverage = (k+1)/n, k - num. of predictors, n - num. observations
# Also termed: hat values, range: 0 - 1
# see: https://en.wikipedia.org/wiki/Leverage_%28statistics%29
# Various criteria (twice the leverage, three times the average...)
# BUGFIX: the original used length(iris$price); iris has no `price`
# column, so length(NULL) == 0 made the threshold Inf and the check
# could never fire. Use the number of observations, nrow(iris).
wLev <- which(infReg$hat > 2 * (2 / nrow(iris)))  # we seem to be fine here too...

## Influence plot
# BUGFIX: resStReg (standardized residuals) was never defined in the
# original script — its definition was commented out; compute it here.
resStReg <- rstandard(reg)
plotFrame <- data.frame(residual = resStReg,
                        leverage = infReg$hat,
                        cookD = infReg$cook.d)
Influential Cases + Influence Plot
Intro to R for Data Science
Session 6: Linear Regression in R
# Introduction to R for Data Science
# SESSION 6 :: 02 June, 2016

# Influence plot: standardized residual vs leverage,
# circle size proportional to Cook's distance
ggplot(plotFrame,
       aes(y = residual, x = leverage)) +
  geom_point(size = plotFrame$cookD * 100, shape = 1) +
  ggtitle("Influence Plot\nSize of the circle corresponds to Cook's distance") +
  theme(plot.title = element_text(size = 8, face = "bold")) +
  ylab("Standardized Residual") +
  xlab("Leverage")
Influence Plot
Intro to R for Data Science
Session 6: Linear Regression in R
Influence Plot
Intro to R for Data Science
Session 6: Linear Regression in R