20 dplyr
and nycflights13
# load packages
suppressMessages(library(dplyr))
library(nycflights13)
# print the flights dataset from nycflights13
head(flights)
#> # A tibble: 6 × 19
#> year month day dep_time sched_dep_time dep_delay
#> <int> <int> <int> <int> <int> <dbl>
#> 1 2013 1 1 517 515 2
#> 2 2013 1 1 533 529 4
#> 3 2013 1 1 542 540 2
#> 4 2013 1 1 544 545 -1
#> 5 2013 1 1 554 600 -6
#> 6 2013 1 1 554 558 -4
#> # … with 13 more variables: arr_time <int>,
#> # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
#> # flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
#> # air_time <dbl>, distance <dbl>, hour <dbl>,
#> # minute <dbl>, time_hour <dttm>
20.1 Choosing columns: select, rename
# besides just using select() to pick columns...
flights %>% select(carrier, flight)
#> # A tibble: 336,776 × 2
#> carrier flight
#> <chr> <int>
#> 1 UA 1545
#> 2 UA 1714
#> 3 AA 1141
#> 4 B6 725
#> 5 DL 461
#> 6 UA 1696
#> 7 B6 507
#> 8 EV 5708
#> 9 B6 79
#> 10 AA 301
#> # … with 336,766 more rows
# ...you can use the minus sign to hide columns
flights %>% select(-month, -day)
#> # A tibble: 336,776 × 17
#> year dep_time sched_dep_time dep_delay arr_time
#> <int> <int> <int> <dbl> <int>
#> 1 2013 517 515 2 830
#> 2 2013 533 529 4 850
#> 3 2013 542 540 2 923
#> 4 2013 544 545 -1 1004
#> 5 2013 554 600 -6 812
#> 6 2013 554 558 -4 740
#> 7 2013 555 600 -5 913
#> 8 2013 557 600 -3 709
#> 9 2013 557 600 -3 838
#> 10 2013 558 600 -2 753
#> # … with 336,766 more rows, and 12 more variables:
#> # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
#> # flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
#> # air_time <dbl>, distance <dbl>, hour <dbl>,
#> # minute <dbl>, time_hour <dttm>
# hide a range of columns
flights %>% select(-(dep_time:arr_delay))
# hide any column with a matching name
flights %>% select(-contains("time"))
# pick columns using a character vector of column names
cols <- c("carrier", "flight", "tailnum")
flights %>% select(one_of(cols))
#> # A tibble: 336,776 × 3
#> carrier flight tailnum
#> <chr> <int> <chr>
#> 1 UA 1545 N14228
#> 2 UA 1714 N24211
#> 3 AA 1141 N619AA
#> 4 B6 725 N804JB
#> 5 DL 461 N668DN
#> 6 UA 1696 N39463
#> 7 B6 507 N516JB
#> 8 EV 5708 N829AS
#> 9 B6 79 N593JB
#> 10 AA 301 N3ALAA
#> # … with 336,766 more rows
# select() can be used to rename columns, though all columns not mentioned are dropped
flights %>% select(tail = tailnum)
#> # A tibble: 336,776 × 1
#> tail
#> <chr>
#> 1 N14228
#> 2 N24211
#> 3 N619AA
#> 4 N804JB
#> 5 N668DN
#> 6 N39463
#> 7 N516JB
#> 8 N829AS
#> 9 N593JB
#> 10 N3ALAA
#> # … with 336,766 more rows
# rename() does the same thing, except all columns not mentioned are kept
flights %>% rename(tail = tailnum)
#> # A tibble: 336,776 × 19
#> year month day dep_time sched_dep_time dep_delay
#> <int> <int> <int> <int> <int> <dbl>
#> 1 2013 1 1 517 515 2
#> 2 2013 1 1 533 529 4
#> 3 2013 1 1 542 540 2
#> 4 2013 1 1 544 545 -1
#> 5 2013 1 1 554 600 -6
#> 6 2013 1 1 554 558 -4
#> 7 2013 1 1 555 600 -5
#> 8 2013 1 1 557 600 -3
#> 9 2013 1 1 557 600 -3
#> 10 2013 1 1 558 600 -2
#> # … with 336,766 more rows, and 13 more variables:
#> # arr_time <int>, sched_arr_time <int>, arr_delay <dbl>,
#> # carrier <chr>, flight <int>, tail <chr>, origin <chr>,
#> # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#> # minute <dbl>, time_hour <dttm>
20.2 Choosing rows: filter, between, slice, sample_n, top_n, distinct
# filter() supports the use of multiple conditions
flights %>% filter(dep_time >= 600, dep_time <= 605)
#> # A tibble: 2,460 × 19
#> year month day dep_time sched_dep_time dep_delay
#> <int> <int> <int> <int> <int> <dbl>
#> 1 2013 1 1 600 600 0
#> 2 2013 1 1 600 600 0
#> 3 2013 1 1 601 600 1
#> 4 2013 1 1 602 610 -8
#> 5 2013 1 1 602 605 -3
#> 6 2013 1 2 600 600 0
#> 7 2013 1 2 600 605 -5
#> 8 2013 1 2 600 600 0
#> 9 2013 1 2 600 600 0
#> 10 2013 1 2 600 600 0
#> # … with 2,450 more rows, and 13 more variables:
#> # arr_time <int>, sched_arr_time <int>, arr_delay <dbl>,
#> # carrier <chr>, flight <int>, tailnum <chr>,
#> # origin <chr>, dest <chr>, air_time <dbl>,
#> # distance <dbl>, hour <dbl>, minute <dbl>,
#> # time_hour <dttm>
# between() is a concise alternative for determing if numeric values fall in a range
flights %>% filter(between(dep_time, 600, 605))
# side note: is.na() can also be useful when filtering
flights %>% filter(!is.na(dep_time))
# slice() filters rows by position
flights %>% slice(1000:1005)
#> # A tibble: 6 × 19
#> year month day dep_time sched_dep_time dep_delay
#> <int> <int> <int> <int> <int> <dbl>
#> 1 2013 1 2 809 810 -1
#> 2 2013 1 2 810 800 10
#> 3 2013 1 2 811 815 -4
#> 4 2013 1 2 811 815 -4
#> 5 2013 1 2 811 820 -9
#> 6 2013 1 2 815 815 0
#> # … with 13 more variables: arr_time <int>,
#> # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
#> # flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
#> # air_time <dbl>, distance <dbl>, hour <dbl>,
#> # minute <dbl>, time_hour <dttm>
# keep the first three rows within each group
flights %>% group_by(month, day) %>% slice(1:3)
#> # A tibble: 1,095 × 19
#> # Groups: month, day [365]
#> year month day dep_time sched_dep_time dep_delay
#> <int> <int> <int> <int> <int> <dbl>
#> 1 2013 1 1 517 515 2
#> 2 2013 1 1 533 529 4
#> 3 2013 1 1 542 540 2
#> 4 2013 1 2 42 2359 43
#> 5 2013 1 2 126 2250 156
#> 6 2013 1 2 458 500 -2
#> 7 2013 1 3 32 2359 33
#> 8 2013 1 3 50 2145 185
#> 9 2013 1 3 235 2359 156
#> 10 2013 1 4 25 2359 26
#> # … with 1,085 more rows, and 13 more variables:
#> # arr_time <int>, sched_arr_time <int>, arr_delay <dbl>,
#> # carrier <chr>, flight <int>, tailnum <chr>,
#> # origin <chr>, dest <chr>, air_time <dbl>,
#> # distance <dbl>, hour <dbl>, minute <dbl>,
#> # time_hour <dttm>
# sample three rows from each group
flights %>% group_by(month, day) %>% sample_n(3)
#> # A tibble: 1,095 × 19
#> # Groups: month, day [365]
#> year month day dep_time sched_dep_time dep_delay
#> <int> <int> <int> <int> <int> <dbl>
#> 1 2013 1 1 1846 1855 -9
#> 2 2013 1 1 2122 2125 -3
#> 3 2013 1 1 752 750 2
#> 4 2013 1 2 1513 1458 15
#> 5 2013 1 2 733 735 -2
#> 6 2013 1 2 1031 1015 16
#> 7 2013 1 3 855 900 -5
#> 8 2013 1 3 805 810 -5
#> 9 2013 1 3 1928 1835 53
#> 10 2013 1 4 1610 1600 10
#> # … with 1,085 more rows, and 13 more variables:
#> # arr_time <int>, sched_arr_time <int>, arr_delay <dbl>,
#> # carrier <chr>, flight <int>, tailnum <chr>,
#> # origin <chr>, dest <chr>, air_time <dbl>,
#> # distance <dbl>, hour <dbl>, minute <dbl>,
#> # time_hour <dttm>
# keep three rows from each group with the top dep_delay
flights %>% group_by(month, day) %>% top_n(3, dep_delay)
#> # A tibble: 1,108 × 19
#> # Groups: month, day [365]
#> year month day dep_time sched_dep_time dep_delay
#> <int> <int> <int> <int> <int> <dbl>
#> 1 2013 1 1 848 1835 853
#> 2 2013 1 1 1815 1325 290
#> 3 2013 1 1 2343 1724 379
#> 4 2013 1 2 1412 838 334
#> 5 2013 1 2 1607 1030 337
#> 6 2013 1 2 2131 1512 379
#> 7 2013 1 3 2008 1540 268
#> 8 2013 1 3 2012 1600 252
#> 9 2013 1 3 2056 1605 291
#> 10 2013 1 4 2058 1730 208
#> # … with 1,098 more rows, and 13 more variables:
#> # arr_time <int>, sched_arr_time <int>, arr_delay <dbl>,
#> # carrier <chr>, flight <int>, tailnum <chr>,
#> # origin <chr>, dest <chr>, air_time <dbl>,
#> # distance <dbl>, hour <dbl>, minute <dbl>,
#> # time_hour <dttm>
# also sort by dep_delay within each group
flights %>% group_by(month, day) %>% top_n(3, dep_delay) %>% arrange(desc(dep_delay))
#> # A tibble: 1,108 × 19
#> # Groups: month, day [365]
#> year month day dep_time sched_dep_time dep_delay
#> <int> <int> <int> <int> <int> <dbl>
#> 1 2013 1 9 641 900 1301
#> 2 2013 6 15 1432 1935 1137
#> 3 2013 1 10 1121 1635 1126
#> 4 2013 9 20 1139 1845 1014
#> 5 2013 7 22 845 1600 1005
#> 6 2013 4 10 1100 1900 960
#> 7 2013 3 17 2321 810 911
#> 8 2013 6 27 959 1900 899
#> 9 2013 7 22 2257 759 898
#> 10 2013 12 5 756 1700 896
#> # … with 1,098 more rows, and 13 more variables:
#> # arr_time <int>, sched_arr_time <int>, arr_delay <dbl>,
#> # carrier <chr>, flight <int>, tailnum <chr>,
#> # origin <chr>, dest <chr>, air_time <dbl>,
#> # distance <dbl>, hour <dbl>, minute <dbl>,
#> # time_hour <dttm>
# unique rows can be identified using unique() from base R
flights %>% select(origin, dest) %>% unique()
#> # A tibble: 224 × 2
#> origin dest
#> <chr> <chr>
#> 1 EWR IAH
#> 2 LGA IAH
#> 3 JFK MIA
#> 4 JFK BQN
#> 5 LGA ATL
#> 6 EWR ORD
#> 7 EWR FLL
#> 8 LGA IAD
#> 9 JFK MCO
#> 10 LGA ORD
#> # … with 214 more rows
20.3 Excercies
Using the nycflights13 dataset and the dplyr package, answer these questions. Some answers are given in square brackets for you to check your answers.
- How many flights in Sept were late departing flights? [7815]
- How many flights in Sept were late departing flights that originated at JFK airport? [2649]
- How many flights in Sept were late departing flights with an origin of JFK airport and had an destination of anywhere except MIA? [2572]
- Which carrier had the most flights in this data set? [UA with 58665]
- Which destination had the most flights in this data set? [ORD with 17283]
- Which destination had the most flights with departure delays of greater than 60 minutes in this data set? [ORD with 1480]
- What was the longest arrival delay in this dataset? [1272]
- Which carrier in September had the most late departing flights? [UA with 1559]
- Create a variable called total.annoyance which arrival delay plus the departure delay for each flight.
- Which carrier with more than 10 flights in September had greatest % late departing flights?
20.4 Adding new variables: mutate, transmute, add_rownames
# mutate() creates a new variable (and keeps all existing variables)
flights %>% mutate(speed = distance/air_time*60)
#> # A tibble: 336,776 × 20
#> year month day dep_time sched_dep_time dep_delay
#> <int> <int> <int> <int> <int> <dbl>
#> 1 2013 1 1 517 515 2
#> 2 2013 1 1 533 529 4
#> 3 2013 1 1 542 540 2
#> 4 2013 1 1 544 545 -1
#> 5 2013 1 1 554 600 -6
#> 6 2013 1 1 554 558 -4
#> 7 2013 1 1 555 600 -5
#> 8 2013 1 1 557 600 -3
#> 9 2013 1 1 557 600 -3
#> 10 2013 1 1 558 600 -2
#> # … with 336,766 more rows, and 14 more variables:
#> # arr_time <int>, sched_arr_time <int>, arr_delay <dbl>,
#> # carrier <chr>, flight <int>, tailnum <chr>,
#> # origin <chr>, dest <chr>, air_time <dbl>,
#> # distance <dbl>, hour <dbl>, minute <dbl>,
#> # time_hour <dttm>, speed <dbl>
# transmute() only keeps the new variables
flights %>% transmute(speed = distance/air_time*60)
#> # A tibble: 336,776 × 1
#> speed
#> <dbl>
#> 1 370.
#> 2 374.
#> 3 408.
#> 4 517.
#> 5 394.
#> 6 288.
#> 7 404.
#> 8 259.
#> 9 405.
#> 10 319.
#> # … with 336,766 more rows
# example data frame with row names
mtcars %>% head()
#> mpg cyl disp hp drat wt qsec vs am
#> Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1
#> Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1
#> Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1
#> Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0
#> Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0
#> Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0
#> gear carb
#> Mazda RX4 4 4
#> Mazda RX4 Wag 4 4
#> Datsun 710 4 1
#> Hornet 4 Drive 3 1
#> Hornet Sportabout 3 2
#> Valiant 3 1
# add_rownames() turns row names into an explicit variable
mtcars %>% add_rownames("model") %>% head()
#> Warning: `add_rownames()` was deprecated in dplyr 1.0.0.
#> Please use `tibble::rownames_to_column()` instead.
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
#> # A tibble: 6 × 12
#> model mpg cyl disp hp drat wt qsec vs
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Mazda RX4 21 6 160 110 3.9 2.62 16.5 0
#> 2 Mazda RX4… 21 6 160 110 3.9 2.88 17.0 0
#> 3 Datsun 710 22.8 4 108 93 3.85 2.32 18.6 1
#> 4 Hornet 4 … 21.4 6 258 110 3.08 3.22 19.4 1
#> 5 Hornet Sp… 18.7 8 360 175 3.15 3.44 17.0 0
#> 6 Valiant 18.1 6 225 105 2.76 3.46 20.2 1
#> # … with 3 more variables: am <dbl>, gear <dbl>, carb <dbl>
# side note: dplyr no longer prints row names (ever) for local data frames
mtcars %>% tbl_df()
#> Warning: `tbl_df()` was deprecated in dplyr 1.0.0.
#> Please use `tibble::as_tibble()` instead.
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
#> # A tibble: 32 × 11
#> mpg cyl disp hp drat wt qsec vs am
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 21 6 160 110 3.9 2.62 16.5 0 1
#> 2 21 6 160 110 3.9 2.88 17.0 0 1
#> 3 22.8 4 108 93 3.85 2.32 18.6 1 1
#> 4 21.4 6 258 110 3.08 3.22 19.4 1 0
#> 5 18.7 8 360 175 3.15 3.44 17.0 0 0
#> 6 18.1 6 225 105 2.76 3.46 20.2 1 0
#> 7 14.3 8 360 245 3.21 3.57 15.8 0 0
#> 8 24.4 4 147. 62 3.69 3.19 20 1 0
#> 9 22.8 4 141. 95 3.92 3.15 22.9 1 0
#> 10 19.2 6 168. 123 3.92 3.44 18.3 1 0
#> # … with 22 more rows, and 2 more variables: gear <dbl>,
#> # carb <dbl>
20.5 Grouping and counting: summarise, tally, count, group_size, n_groups, ungroup
# summarise() can be used to count the number of rows in each group
flights %>% group_by(month) %>% summarise(cnt = n())
#> # A tibble: 12 × 2
#> month cnt
#> <int> <int>
#> 1 1 27004
#> 2 2 24951
#> 3 3 28834
#> 4 4 28330
#> 5 5 28796
#> 6 6 28243
#> 7 7 29425
#> 8 8 29327
#> 9 9 27574
#> 10 10 28889
#> 11 11 27268
#> 12 12 28135
# tally() and count() can do this more concisely
flights %>% group_by(month) %>% tally()
flights %>% count(month)
# you can sort by the count
flights %>% group_by(month) %>% summarise(cnt = n()) %>% arrange(desc(cnt))
#> # A tibble: 12 × 2
#> month cnt
#> <int> <int>
#> 1 7 29425
#> 2 8 29327
#> 3 10 28889
#> 4 3 28834
#> 5 5 28796
#> 6 4 28330
#> 7 6 28243
#> 8 12 28135
#> 9 9 27574
#> 10 11 27268
#> 11 1 27004
#> 12 2 24951
# tally() and count() have a sort parameter for this purpose
flights %>% group_by(month) %>% tally(sort=TRUE)
flights %>% count(month, sort=TRUE)
# you can sum over a specific variable instead of simply counting rows
flights %>% group_by(month) %>% summarise(dist = sum(distance))
#> # A tibble: 12 × 2
#> month dist
#> <int> <dbl>
#> 1 1 27188805
#> 2 2 24975509
#> 3 3 29179636
#> 4 4 29427294
#> 5 5 29974128
#> 6 6 29856388
#> 7 7 31149199
#> 8 8 31149334
#> 9 9 28711426
#> 10 10 30012086
#> 11 11 28639718
#> 12 12 29954084
# tally() and count() have a wt parameter for this purpose
flights %>% group_by(month) %>% tally(wt = distance)
flights %>% count(month, wt = distance)
# group_size() returns the counts as a vector
flights %>% group_by(month) %>% group_size()
#> [1] 27004 24951 28834 28330 28796 28243 29425 29327 27574
#> [10] 28889 27268 28135
# n_groups() simply reports the number of groups
flights %>% group_by(month) %>% n_groups()
#> [1] 12
# group by two variables, summarise, arrange (output is possibly confusing)
flights %>% group_by(month, day) %>% summarise(cnt = n()) %>% arrange(desc(cnt)) %>% print(n = 40)
#> `summarise()` has grouped output by 'month'. You can
#> override using the `.groups` argument.
#> # A tibble: 365 × 3
#> # Groups: month [12]
#> month day cnt
#> <int> <int> <int>
#> 1 11 27 1014
#> 2 7 11 1006
#> 3 7 8 1004
#> 4 7 10 1004
#> 5 12 2 1004
#> 6 7 18 1003
#> 7 7 25 1003
#> 8 7 12 1002
#> 9 7 9 1001
#> 10 7 17 1001
#> 11 7 31 1001
#> 12 8 7 1001
#> 13 8 8 1001
#> 14 8 12 1001
#> 15 7 22 1000
#> 16 7 24 1000
#> 17 8 1 1000
#> 18 8 5 1000
#> 19 8 15 1000
#> 20 11 21 1000
#> 21 7 15 999
#> 22 7 19 999
#> 23 7 26 999
#> 24 7 29 999
#> 25 8 2 999
#> 26 8 9 999
#> 27 11 22 999
#> 28 8 16 998
#> 29 7 23 997
#> 30 7 30 997
#> 31 8 14 997
#> 32 7 16 996
#> 33 8 6 996
#> 34 8 19 996
#> 35 9 13 996
#> 36 9 26 996
#> 37 9 27 996
#> 38 4 15 995
#> 39 6 20 995
#> 40 6 26 995
#> # … with 325 more rows
# ungroup() before arranging to arrange across all groups
flights %>% group_by(month, day) %>% summarise(cnt = n()) %>% ungroup() %>% arrange(desc(cnt))
#> `summarise()` has grouped output by 'month'. You can
#> override using the `.groups` argument.
#> # A tibble: 365 × 3
#> month day cnt
#> <int> <int> <int>
#> 1 11 27 1014
#> 2 7 11 1006
#> 3 7 8 1004
#> 4 7 10 1004
#> 5 12 2 1004
#> 6 7 18 1003
#> 7 7 25 1003
#> 8 7 12 1002
#> 9 7 9 1001
#> 10 7 17 1001
#> # … with 355 more rows
20.6 Creating data frames: data_frame
data_frame()
is a better way than data.frame()
for creating data frames. Benefits of data_frame()
:
- You can use previously defined columns to compute new columns.
- It never coerces column types.
- It never munges column names.
- It never adds row names.
- It only recycles length 1 input.
- It returns a local data frame (a tbl_df).
# data_frame() example
data_frame(a = 1:6, b = a*2, c = 'string', 'd+e' = 1) %>% glimpse()
#> Warning: `data_frame()` was deprecated in tibble 1.1.0.
#> Please use `tibble()` instead.
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
#> Rows: 6
#> Columns: 4
#> $ a <int> 1, 2, 3, 4, 5, 6
#> $ b <dbl> 2, 4, 6, 8, 10, 12
#> $ c <chr> "string", "string", "string", "string", "str…
#> $ `d+e` <dbl> 1, 1, 1, 1, 1, 1
# data.frame() example
data.frame(a = 1:6, c = 'string', 'd+e' = 1) %>% glimpse()
#> Rows: 6
#> Columns: 3
#> $ a <int> 1, 2, 3, 4, 5, 6
#> $ c <chr> "string", "string", "string", "string", "strin…
#> $ d.e <dbl> 1, 1, 1, 1, 1, 1