21 Joining tables

21.1 Joining (merging) tables: left_join, right_join, inner_join, full_join, semi_join, anti_join

# create two simple data frames
(a <- data_frame(color = c("green","yellow","red"), num = 1:3))
#> # A tibble: 3 × 2
#>   color    num
#>   <chr>  <int>
#> 1 green      1
#> 2 yellow     2
#> 3 red        3
(b <- data_frame(color = c("green","yellow","pink"), size = c("S","M","L")))
#> # A tibble: 3 × 2
#>   color  size 
#>   <chr>  <chr>
#> 1 green  S    
#> 2 yellow M    
#> 3 pink   L

# only include observations found in both "a" and "b" (automatically joins on variables that appear in both tables)
inner_join(a, b)
#> Joining, by = "color"
#> # A tibble: 2 × 3
#>   color    num size 
#>   <chr>  <int> <chr>
#> 1 green      1 S    
#> 2 yellow     2 M

# include observations found in either "a" or "b"
full_join(a, b)
#> Joining, by = "color"
#> # A tibble: 4 × 3
#>   color    num size 
#>   <chr>  <int> <chr>
#> 1 green      1 S    
#> 2 yellow     2 M    
#> 3 red        3 <NA> 
#> 4 pink      NA L

# include all observations found in "a"
left_join(a, b)
#> Joining, by = "color"
#> # A tibble: 3 × 3
#>   color    num size 
#>   <chr>  <int> <chr>
#> 1 green      1 S    
#> 2 yellow     2 M    
#> 3 red        3 <NA>

# include all observations found in "b"
right_join(a, b)
#> Joining, by = "color"
#> # A tibble: 3 × 3
#>   color    num size 
#>   <chr>  <int> <chr>
#> 1 green      1 S    
#> 2 yellow     2 M    
#> 3 pink      NA L

# right_join(a, b) is identical to left_join(b, a) except for column ordering
left_join(b, a)
#> Joining, by = "color"
#> # A tibble: 3 × 3
#>   color  size    num
#>   <chr>  <chr> <int>
#> 1 green  S         1
#> 2 yellow M         2
#> 3 pink   L        NA

# filter "a" to only show observations that match "b"
semi_join(a, b)
#> Joining, by = "color"
#> # A tibble: 2 × 2
#>   color    num
#>   <chr>  <int>
#> 1 green      1
#> 2 yellow     2

# filter "a" to only show observations that don't match "b"
anti_join(a, b)
#> Joining, by = "color"
#> # A tibble: 1 × 2
#>   color   num
#>   <chr> <int>
#> 1 red       3
# sometimes matching variables don't have identical names
b <- b %>% rename(col = color)

# specify that the join should occur by matching "color" in "a" with "col" in "b"
inner_join(a, b, by=c("color" = "col"))
#> # A tibble: 2 × 3
#>   color    num size 
#>   <chr>  <int> <chr>
#> 1 green      1 S    
#> 2 yellow     2 M

21.2 Viewing more output: print, View

# specify that you want to see more rows
flights %>% print(n = 15)
#> # A tibble: 336,776 × 19
#>     year month   day dep_time sched_dep_time dep_delay
#>    <int> <int> <int>    <int>          <int>     <dbl>
#>  1  2013     1     1      517            515         2
#>  2  2013     1     1      533            529         4
#>  3  2013     1     1      542            540         2
#>  4  2013     1     1      544            545        -1
#>  5  2013     1     1      554            600        -6
#>  6  2013     1     1      554            558        -4
#>  7  2013     1     1      555            600        -5
#>  8  2013     1     1      557            600        -3
#>  9  2013     1     1      557            600        -3
#> 10  2013     1     1      558            600        -2
#> 11  2013     1     1      558            600        -2
#> 12  2013     1     1      558            600        -2
#> 13  2013     1     1      558            600        -2
#> 14  2013     1     1      558            600        -2
#> 15  2013     1     1      559            600        -1
#> # … with 336,761 more rows, and 13 more variables:
#> #   arr_time <int>, sched_arr_time <int>, arr_delay <dbl>,
#> #   carrier <chr>, flight <int>, tailnum <chr>,
#> #   origin <chr>, dest <chr>, air_time <dbl>,
#> #   distance <dbl>, hour <dbl>, minute <dbl>,
#> #   time_hour <dttm>
# specify that you want to see ALL rows (don't run this!)
flights %>% print(n = Inf)
# specify that you want to see all columns
flights %>% print(width = Inf)
#> # A tibble: 336,776 × 19
#>     year month   day dep_time sched_dep_time dep_delay
#>    <int> <int> <int>    <int>          <int>     <dbl>
#>  1  2013     1     1      517            515         2
#>  2  2013     1     1      533            529         4
#>  3  2013     1     1      542            540         2
#>  4  2013     1     1      544            545        -1
#>  5  2013     1     1      554            600        -6
#>  6  2013     1     1      554            558        -4
#>  7  2013     1     1      555            600        -5
#>  8  2013     1     1      557            600        -3
#>  9  2013     1     1      557            600        -3
#> 10  2013     1     1      558            600        -2
#>    arr_time sched_arr_time arr_delay carrier flight tailnum
#>       <int>          <int>     <dbl> <chr>    <int> <chr>  
#>  1      830            819        11 UA        1545 N14228 
#>  2      850            830        20 UA        1714 N24211 
#>  3      923            850        33 AA        1141 N619AA 
#>  4     1004           1022       -18 B6         725 N804JB 
#>  5      812            837       -25 DL         461 N668DN 
#>  6      740            728        12 UA        1696 N39463 
#>  7      913            854        19 B6         507 N516JB 
#>  8      709            723       -14 EV        5708 N829AS 
#>  9      838            846        -8 B6          79 N593JB 
#> 10      753            745         8 AA         301 N3ALAA 
#>    origin dest  air_time distance  hour minute
#>    <chr>  <chr>    <dbl>    <dbl> <dbl>  <dbl>
#>  1 EWR    IAH        227     1400     5     15
#>  2 LGA    IAH        227     1416     5     29
#>  3 JFK    MIA        160     1089     5     40
#>  4 JFK    BQN        183     1576     5     45
#>  5 LGA    ATL        116      762     6      0
#>  6 EWR    ORD        150      719     5     58
#>  7 EWR    FLL        158     1065     6      0
#>  8 LGA    IAD         53      229     6      0
#>  9 JFK    MCO        140      944     6      0
#> 10 LGA    ORD        138      733     6      0
#>    time_hour          
#>    <dttm>             
#>  1 2013-01-01 05:00:00
#>  2 2013-01-01 05:00:00
#>  3 2013-01-01 05:00:00
#>  4 2013-01-01 05:00:00
#>  5 2013-01-01 06:00:00
#>  6 2013-01-01 05:00:00
#>  7 2013-01-01 06:00:00
#>  8 2013-01-01 06:00:00
#>  9 2013-01-01 06:00:00
#> 10 2013-01-01 06:00:00
#> # … with 336,766 more rows
# show up to 1000 rows and all columns
flights %>% View()

# set option to see all columns and fewer rows
options(dplyr.width = Inf, dplyr.print_min = 6)

# reset options (or just close R)
options(dplyr.width = NULL, dplyr.print_min = 10)