Upload
romain-francois
View
884
Download
0
Embed Size (px)
Citation preview
• Using R since 2002
• #rcatladies
• R Enthusiast
• R/C++ hero
• Performance
• dplyr
• Occasional comedy
enjoy(cool(bake(shape(beat(append(bowl(rep("flour", 2), "yeast", "water", "milk", "oil"), "flour", until = "soft"), duration = "3mins"), as = "balls", style = "slightly-flat"), degrees = 200, duration = "15mins"), duration = "5mins"))
bowl(rep("flour", 2), "yeast", "water", "milk", "oil") %>% append("flour", until = "soft") %>% beat(duration = "3mins") %>% shape(as = "balls", style = "slightly-flat") %>% bake(degrees = 200, duration = "15mins") %>% cool(duration = "5mins") %>% enjoy()
nycflights13
> flights
Source: local data frame [336,776 x 16]

   year month day dep_time dep_delay arr_time arr_delay carrier tailnum flight
1  2013     1   1      517         2      830        11      UA  N14228   1545
2  2013     1   1      533         4      850        20      UA  N24211   1714
..  ...   ... ...      ...       ...      ...       ...     ...     ...    ...
Variables not shown: origin (chr), dest (chr), air_time (dbl), distance (dbl),
  hour (dbl), minute (dbl)
nycflights13
> glimpse(flights)
Observations: 336,776
Variables: 16
$ year      (int) 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 201...
$ month     (int) 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
$ day       (int) 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
$ dep_time  (int) 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, 55...
$ dep_delay (dbl) 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1, ...
$ arr_time  (int) 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849, 8...
$ arr_delay (dbl) 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -14,...
$ carrier   (chr) "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "AA...
$ tailnum   (chr) "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N39463...
$ flight    (int) 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 49,...
$ origin    (chr) "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA", "...
$ dest      (chr) "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD", "...
$ air_time  (dbl) 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 158...
$ distance  (dbl) 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, 10...
$ hour      (dbl) 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, ...
$ minute    (dbl) 17, 33, 42, 44, 54, 54, 55, 57, 57, 58, 58, 58, 58, 58, 5...
filter
A subset of the rows of the data frame
flights %>% filter( dep_delay < 10 )
flights %>% filter( arr_delay < dep_delay )
select
Select certain columns from the data frame
select(flights, year, month, day)
select(flights, year:day)
select(flights, -(year:day))
mutate
Modify or create columns based on others
flights %>% mutate( gain = arr_delay - dep_delay, speed = distance / air_time * 60 ) %>% filter( gain > 0 ) %>% arrange( desc(speed) ) %>% select( year, month, day, dest, gain, speed )
summarise
Collapse a data frame into one row …
flights %>% summarise(delay = mean(dep_delay, na.rm = TRUE))
flights %>% filter( dep_delay > 0 ) %>% summarise(arr_delay = mean(arr_delay, na.rm = TRUE))
group_by
Group observations by one or more variables
flights %>% group_by( tailnum ) %>% summarise( count = n(), dist = mean(distance, na.rm = TRUE), delay = mean(arr_delay, na.rm = TRUE) ) %>% filter( is.finite(delay) ) %>% arrange( desc(count) )
bind_rows
bind_rows( , )
color num
green 1
yellow 2
red 3
blue 4
pink 5
color num
green 1
yellow 2

color num
red 3
blue 4
pink 5
joins
a <- data_frame( color = c("green", "yellow", "red"), num = 1:3 )
b <- data_frame( color = c("green", "yellow", "pink"), size = c("S", "M", "L") )
color num
green 1
yellow 2
red 3

color size
green S
yellow M
pink L
inner_join

color num
green 1
yellow 2
red 3

color size
green S
yellow M
pink L
inner_join( , )
color num size
green 1 S
yellow 2 M
left_join

color num
green 1
yellow 2
red 3

color size
green S
yellow M
pink L
left_join( , )
color num size
green 1 S
yellow 2 M
red 3 NA
right_join

color num
green 1
yellow 2
red 3

color size
green S
yellow M
pink L
right_join( , )
color num size
green 1 S
yellow 2 M
pink NA L
full_join

color num
green 1
yellow 2
red 3

color size
green S
yellow M
pink L
full_join( , )
color num size
green 1 S
yellow 2 M
red 3 NA
pink NA L
data_frame
Just like data.frame, but better
> data_frame( x = 1:5, y = letters[1:5] ) %>% glimpse
Observations: 5
Variables: 2
$ x (int) 1, 2, 3, 4, 5
$ y (chr) "a", "b", "c", "d", "e"
> data_frame( x = 1:5, y = letters[1:5] , z = x + 1) %>% glimpse
Observations: 5
Variables: 3
$ x (int) 1, 2, 3, 4, 5
$ y (chr) "a", "b", "c", "d", "e"
$ z (dbl) 2, 3, 4, 5, 6
frame_data aka tibble
> frame_data(
+   ~colA, ~colB,
+   "a",   1,
+   "b",   2
+ )
Source: local data frame [2 x 2]

   colA  colB
  (chr) (dbl)
1     a     1
2     b     2
g <- c("origin", "dest")
v <- "dep_delay"
flights %>% group_by( g ) %>% summarise( result = mean(v, na.rm = TRUE) )
🙀
🙀
g <- c("origin", "dest")
v <- "dep_delay"
flights %>% group_by_( .dots = g ) %>% summarise_( .dots = interp(~ mean(var, na.rm = TRUE), var = as.name(v)) )
Future
• Performance improvements (parallel C++)
• Alternative back ends
• Different types of groupings (e.g. bootstrap)
As soon as we get hoverboard ...