21 Joining tables
21.1 Joining (merging) tables: left_join, right_join, inner_join, full_join, semi_join, anti_join
# create two simple data frames
(a <- data_frame(color = c("green","yellow","red"), num = 1:3))
#> # A tibble: 3 × 2
#> color num
#> <chr> <int>
#> 1 green 1
#> 2 yellow 2
#> 3 red 3
(b <- data_frame(color = c("green","yellow","pink"), size = c("S","M","L")))
#> # A tibble: 3 × 2
#> color size
#> <chr> <chr>
#> 1 green S
#> 2 yellow M
#> 3 pink L
# only include observations found in both "a" and "b" (automatically joins on variables that appear in both tables)
inner_join(a, b)
#> Joining, by = "color"
#> # A tibble: 2 × 3
#> color num size
#> <chr> <int> <chr>
#> 1 green 1 S
#> 2 yellow 2 M
# include observations found in either "a" or "b"
full_join(a, b)
#> Joining, by = "color"
#> # A tibble: 4 × 3
#> color num size
#> <chr> <int> <chr>
#> 1 green 1 S
#> 2 yellow 2 M
#> 3 red 3 <NA>
#> 4 pink NA L
# include all observations found in "a"
left_join(a, b)
#> Joining, by = "color"
#> # A tibble: 3 × 3
#> color num size
#> <chr> <int> <chr>
#> 1 green 1 S
#> 2 yellow 2 M
#> 3 red 3 <NA>
# include all observations found in "b"
right_join(a, b)
#> Joining, by = "color"
#> # A tibble: 3 × 3
#> color num size
#> <chr> <int> <chr>
#> 1 green 1 S
#> 2 yellow 2 M
#> 3 pink NA L
# right_join(a, b) is identical to left_join(b, a) except for column ordering
left_join(b, a)
#> Joining, by = "color"
#> # A tibble: 3 × 3
#> color size num
#> <chr> <chr> <int>
#> 1 green S 1
#> 2 yellow M 2
#> 3 pink L NA
# filter "a" to only show observations that match "b"
semi_join(a, b)
#> Joining, by = "color"
#> # A tibble: 2 × 2
#> color num
#> <chr> <int>
#> 1 green 1
#> 2 yellow 2
# filter "a" to only show observations that don't match "b"
anti_join(a, b)
#> Joining, by = "color"
#> # A tibble: 1 × 2
#> color num
#> <chr> <int>
#> 1 red 3
# sometimes matching variables don't have identical names
b <- b %>% rename(col = color)
# specify that the join should occur by matching "color" in "a" with "col" in "b"
inner_join(a, b, by=c("color" = "col"))
#> # A tibble: 2 × 3
#> color num size
#> <chr> <int> <chr>
#> 1 green 1 S
#> 2 yellow 2 M
21.2 Viewing more output: print, View
# specify that you want to see more rows
flights %>% print(n = 15)
#> # A tibble: 336,776 × 19
#> year month day dep_time sched_dep_time dep_delay
#> <int> <int> <int> <int> <int> <dbl>
#> 1 2013 1 1 517 515 2
#> 2 2013 1 1 533 529 4
#> 3 2013 1 1 542 540 2
#> 4 2013 1 1 544 545 -1
#> 5 2013 1 1 554 600 -6
#> 6 2013 1 1 554 558 -4
#> 7 2013 1 1 555 600 -5
#> 8 2013 1 1 557 600 -3
#> 9 2013 1 1 557 600 -3
#> 10 2013 1 1 558 600 -2
#> 11 2013 1 1 558 600 -2
#> 12 2013 1 1 558 600 -2
#> 13 2013 1 1 558 600 -2
#> 14 2013 1 1 558 600 -2
#> 15 2013 1 1 559 600 -1
#> # … with 336,761 more rows, and 13 more variables:
#> # arr_time <int>, sched_arr_time <int>, arr_delay <dbl>,
#> # carrier <chr>, flight <int>, tailnum <chr>,
#> # origin <chr>, dest <chr>, air_time <dbl>,
#> # distance <dbl>, hour <dbl>, minute <dbl>,
#> # time_hour <dttm>
# specify that you want to see all columns
flights %>% print(width = Inf)
#> # A tibble: 336,776 × 19
#> year month day dep_time sched_dep_time dep_delay
#> <int> <int> <int> <int> <int> <dbl>
#> 1 2013 1 1 517 515 2
#> 2 2013 1 1 533 529 4
#> 3 2013 1 1 542 540 2
#> 4 2013 1 1 544 545 -1
#> 5 2013 1 1 554 600 -6
#> 6 2013 1 1 554 558 -4
#> 7 2013 1 1 555 600 -5
#> 8 2013 1 1 557 600 -3
#> 9 2013 1 1 557 600 -3
#> 10 2013 1 1 558 600 -2
#> arr_time sched_arr_time arr_delay carrier flight tailnum
#> <int> <int> <dbl> <chr> <int> <chr>
#> 1 830 819 11 UA 1545 N14228
#> 2 850 830 20 UA 1714 N24211
#> 3 923 850 33 AA 1141 N619AA
#> 4 1004 1022 -18 B6 725 N804JB
#> 5 812 837 -25 DL 461 N668DN
#> 6 740 728 12 UA 1696 N39463
#> 7 913 854 19 B6 507 N516JB
#> 8 709 723 -14 EV 5708 N829AS
#> 9 838 846 -8 B6 79 N593JB
#> 10 753 745 8 AA 301 N3ALAA
#> origin dest air_time distance hour minute
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 EWR IAH 227 1400 5 15
#> 2 LGA IAH 227 1416 5 29
#> 3 JFK MIA 160 1089 5 40
#> 4 JFK BQN 183 1576 5 45
#> 5 LGA ATL 116 762 6 0
#> 6 EWR ORD 150 719 5 58
#> 7 EWR FLL 158 1065 6 0
#> 8 LGA IAD 53 229 6 0
#> 9 JFK MCO 140 944 6 0
#> 10 LGA ORD 138 733 6 0
#> time_hour
#> <dttm>
#> 1 2013-01-01 05:00:00
#> 2 2013-01-01 05:00:00
#> 3 2013-01-01 05:00:00
#> 4 2013-01-01 05:00:00
#> 5 2013-01-01 06:00:00
#> 6 2013-01-01 05:00:00
#> 7 2013-01-01 06:00:00
#> 8 2013-01-01 06:00:00
#> 9 2013-01-01 06:00:00
#> 10 2013-01-01 06:00:00
#> # … with 336,766 more rows