31
dplyr @romain_francois

dplyr

Embed Size (px)

Citation preview

dplyr @romain_francois

• Using R since 2002

• #rcatladies

• R Enthusiast

• R/C++ hero

• Performance

• dplyr

• Occasional comedy

%>% from magrittr

enjoy(cool(bake(shape(beat(append(bowl(rep("flour", 2), "yeast", "water", "milk", "oil"), "flour", until = "soft"), duration = "3mins"), as = "balls", style = "slightly-flat"), degrees = 200, duration = "15mins"), duration = "5mins"))

bowl(rep("flour", 2), "yeast", "water", "milk", "oil") %>% append("flour", until = "soft") %>% beat(duration = "3mins") %>% shape(as = "balls", style = "slightly-flat") %>% bake(degrees = 200, duration = "15mins") %>% cool(duration = "5mins") %>% enjoy()

nycflights13

> flights
Source: local data frame [336,776 x 16]

   year month day dep_time dep_delay arr_time arr_delay carrier tailnum flight
1  2013     1   1      517         2      830        11      UA  N14228   1545
2  2013     1   1      533         4      850        20      UA  N24211   1714
..  ...   ... ...      ...       ...      ...       ...     ...     ...    ...
Variables not shown: origin (chr), dest (chr), air_time (dbl), distance (dbl), hour (dbl), minute (dbl)

nycflights13

> glimpse(flights)
Observations: 336,776
Variables: 16
$ year (int) 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 201...
$ month (int) 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
$ day (int) 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
$ dep_time (int) 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, 55...
$ dep_delay (dbl) 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1, ...
$ arr_time (int) 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849, 8...
$ arr_delay (dbl) 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -14,...
$ carrier (chr) "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "AA...
$ tailnum (chr) "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N39463...
$ flight (int) 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 49,...
$ origin (chr) "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA", "...
$ dest (chr) "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD", "...
$ air_time (dbl) 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 158...
$ distance (dbl) 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, 10...
$ hour (dbl) 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, ...
$ minute (dbl) 17, 33, 42, 44, 54, 54, 55, 57, 57, 58, 58, 58, 58, 58, 5...

filter: A subset of the rows of the data frame

flights %>% filter( dep_delay < 10 )

flights %>% filter( arr_delay < dep_delay )

slice: filter rows by position

flights %>% slice( 1:10 )

arrange: reorder a data frame

flights %>% filter( hour < 8 ) %>% arrange( year, month, day )

select: select certain columns from the data frame

select(flights, year, month, day)

select(flights, year:day)

select(flights, -(year:day))

mutate: modify or create columns based on others

flights %>% mutate( gain = arr_delay - dep_delay, speed = distance / air_time * 60 ) %>% filter( gain > 0 ) %>% arrange( desc(speed) ) %>% select( year, month, day, dest, gain, speed )

summarise: collapse a data frame into one row …

flights %>% summarise(delay = mean(dep_delay, na.rm = TRUE))

flights %>% filter( dep_delay > 0 ) %>% summarise(arr_delay = mean(arr_delay, na.rm = TRUE))

group_by: Group observations by one or more variables

flights %>% group_by( tailnum ) %>% summarise( count = n(), dist = mean(distance, na.rm = TRUE), delay = mean(arr_delay, na.rm = TRUE) ) %>% filter( is.finite(delay) ) %>% arrange( desc(count) )

bind_rows

bind_rows( , )

color num

green 1

yellow 2

red 3

blue 4

pink 5

color   num
green   1
yellow  2

color   num
red     3
blue    4
pink    5

joins

a <- data_frame( color = c("green", "yellow", "red"), num = 1:3 )

b <- data_frame( color = c("green", "yellow", "pink"), size = c("S", "M", "L") )

a:
color   num
green   1
yellow  2
red     3

b:
color   size
green   S
yellow  M
pink    L

inner_join

a:
color   num
green   1
yellow  2
red     3

b:
color   size
green   S
yellow  M
pink    L

inner_join( a, b )

color num size

green 1 S

yellow 2 M

left_join

a:
color   num
green   1
yellow  2
red     3

b:
color   size
green   S
yellow  M
pink    L

left_join( a, b )

color   num  size
green   1    S
yellow  2    M
red     3    NA

right_join

a:
color   num
green   1
yellow  2
red     3

b:
color   size
green   S
yellow  M
pink    L

right_join( a, b )

color   num  size
green   1    S
yellow  2    M
pink    NA   L

full_join

a:
color   num
green   1
yellow  2
red     3

b:
color   size
green   S
yellow  M
pink    L

full_join( a, b )

color   num  size
green   1    S
yellow  2    M
red     3    NA
pink    NA   L

data_frame: Just like data.frame, but better

> data_frame( x = 1:5, y = letters[1:5] ) %>% glimpse
Observations: 5
Variables: 2
$ x (int) 1, 2, 3, 4, 5
$ y (chr) "a", "b", "c", "d", "e"

> data_frame( x = 1:5, y = letters[1:5] , z = x + 1) %>% glimpse
Observations: 5
Variables: 3
$ x (int) 1, 2, 3, 4, 5
$ y (chr) "a", "b", "c", "d", "e"
$ z (dbl) 2, 3, 4, 5, 6

frame_data aka tibble

> frame_data(
+   ~colA, ~colB,
+   "a", 1,
+   "b", 2
+ )
Source: local data frame [2 x 2]

   colA  colB
  (chr) (dbl)
1     a     1
2     b     2

_

g <- c("origin", "dest")
v <- "dep_delay"

flights %>% group_by( g ) %>% summarise( result = mean(v, na.rm = TRUE) )

🙀

🙀

g <- c("origin", "dest")
v <- "dep_delay"

flights %>% group_by_( .dots = g ) %>% summarise_( .dots = interp(~ mean(var, na.rm = TRUE), var = as.name(v)) )

Future

• Performance improvements (parallel C++)

• Alternative back ends

• Different type of groupings (e.g. bootstrap)

As soon as we get hoverboard ...

dplyr

Romain François @romain_francois [email protected]