1# The MIT License (MIT) 2# 3# Copyright © 2013-2019 RStudio and others. 4# 5# Permission is hereby granted, free of charge, to any person obtaining a copy 6# of this software and associated documentation files (the “Software”), to deal 7# in the Software without restriction, including without limitation the rights 8# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9# copies of the Software, and to permit persons to whom the Software is 10# furnished to do so, subject to the following conditions: 11# 12# The above copyright notice and this permission notice shall be included in 13# all copies or substantial portions of the Software. 14# 15# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21# SOFTWARE. 22# 23 24# tidyverse/dplyr/blob/master/R/join-cols.R 25 26join_cols <- function(x_names, y_names, by = NULL, suffix = c(".x", ".y"), keep = FALSE) { 27 check_duplicate_vars(x_names, "x") 28 check_duplicate_vars(y_names, "y") 29 30 by <- standardise_join_by(by, x_names = x_names, y_names = y_names) 31 suffix <- standardise_join_suffix(suffix) 32 33 x_by <- set_names(match(by$x, x_names), by$x) 34 y_by <- set_names(match(by$y, y_names), by$x) 35 36 x_loc <- seq_along(x_names) 37 names(x_loc) <- x_names 38 if (!keep) { 39 # in x_out, key variables need to keep the same name, and aux 40 # variables need suffixes for duplicates that appear in y_out 41 y_aux <- setdiff(y_names, c(by$x, if (!keep) by$y)) 42 x_is_aux <- !x_names %in% by$x 43 names(x_loc)[x_is_aux] <- add_suffixes(x_names[x_is_aux], c(by$x, y_aux), suffix$x) 44 } else { 45 # in x_out, key variables and aux variables need suffixes 46 # for duplicates that appear in y_out 47 names(x_loc) <- add_suffixes(x_names, y_names, suffix$x) 48 } 49 50 y_loc <- seq_along(y_names) 51 names(y_loc) <- add_suffixes(y_names, x_names, suffix$y) 52 if (!keep) { 53 y_loc <- y_loc[!y_names %in% by$y] 54 } 55 56 # key = named location to use for matching 57 # out = named locations to use in output 58 list( 59 x = list(key = x_by, out = x_loc), 60 y = list(key = y_by, out = y_loc) 61 ) 62} 63 64standardise_join_by <- function(by, x_names, y_names) { 65 if (is.null(by)) { 66 by <- intersect(x_names, y_names) 67 if (length(by) == 0) { 68 abort(c( 69 "`by` must be supplied when `x` and `y` have no common variables.", 70 i = "use by = character()` to perform a cross-join." 71 )) 72 } 73 by_quoted <- encodeString(by, quote = '"') 74 if (length(by_quoted) == 1L) { 75 by_code <- by_quoted 76 } else { 77 by_code <- paste0("c(", paste(by_quoted, collapse = ", "), ")") 78 } 79 inform(paste0("Joining, by = ", by_code)) 80 81 by <- list(x = by, y = by) 82 } else if (is.character(by)) { 83 by_x <- names(by) %||% by 84 by_y <- unname(by) 85 86 # If x partially named, assume unnamed are the same in both tables 87 by_x[by_x == ""] <- by_y[by_x == ""] 88 89 by <- list(x = by_x, y = by_y) 90 } else if (is.list(by)) { 91 # TODO: check lengths 92 by <- by[c("x", "y")] 93 } else { 94 bad_args("by", "must be a (named) character vector, list, or NULL, not {friendly_type_of(by)}.") 95 } 96 97 check_join_vars(by$x, x_names) 98 check_join_vars(by$y, y_names) 99 100 by 101} 102 103check_join_vars <- function(vars, names) { 104 if (!is.character(vars)) { 105 abort("join columns must be character vectors.") 106 } 107 108 na <- is.na(vars) 109 if (any(na)) { 110 abort(c( 111 "Join columns must be not NA.", 112 x = glue("Problem at position {err_vars(na)}.") 113 )) 114 } 115 116 dup <- duplicated(vars) 117 if (any(dup)) { 118 abort(c( 119 "Join columns must be unique.", 120 x = glue("Problem at position {err_vars(dup)}.") 121 )) 122 } 123 124 missing <- setdiff(vars, names) 125 if (length(missing) > 0) { 126 abort(c( 127 "Join columns must be present in data.", 128 x = glue("Problem with {err_vars(missing)}.") 129 )) 130 } 131} 132 133check_duplicate_vars <- function(vars, input) { 134 dup <- duplicated(vars) 135 if (any(dup)) { 136 abort(c( 137 glue("Input columns in `{input}` must be unique."), 138 x = glue("Problem with {err_vars(vars[dup])}.") 139 )) 140 } 141} 142 143standardise_join_suffix <- function(x) { 144 if (!is.character(x) || length(x) != 2) { 145 abort(c( 146 "`suffix` must be a character vector of length 2.", 147 i = glue("suffix is {friendly_type_of(x)} of length {length(x)}.") 148 )) 149 } 150 151 if (any(is.na(x))) { 152 bad_args("suffix", "can't be NA.") 153 } 154 155 list(x = x[[1]], y = x[[2]]) 156} 157 158add_suffixes <- function(x, y, suffix) { 159 if (identical(suffix, "")) { 160 return(x) 161 } 162 163 out <- rep_along(x, na_chr) 164 for (i in seq_along(x)) { 165 nm <- x[[i]] 166 while (nm %in% y || nm %in% out[seq_len(i - 1)]) { 167 nm <- paste0(nm, suffix) 168 } 169 170 out[[i]] <- nm 171 } 172 out 173} 174