1# The MIT License (MIT)
2#
3# Copyright © 2013-2019 RStudio and others.
4#
5# Permission is hereby granted, free of charge, to any person obtaining a copy
6# of this software and associated documentation files (the “Software”), to deal
7# in the Software without restriction, including without limitation the rights
8# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9# copies of the Software, and to permit persons to whom the Software is
10# furnished to do so, subject to the following conditions:
11#
12# The above copyright notice and this permission notice shall be included in
13# all copies or substantial portions of the Software.
14#
15# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21# SOFTWARE.
22#
23
24# tidyverse/dplyr/blob/master/R/join-cols.R
25
26join_cols <- function(x_names, y_names, by = NULL, suffix = c(".x", ".y"), keep = FALSE) {
27 check_duplicate_vars(x_names, "x")
28 check_duplicate_vars(y_names, "y")
29
30 by <- standardise_join_by(by, x_names = x_names, y_names = y_names)
31 suffix <- standardise_join_suffix(suffix)
32
33 x_by <- set_names(match(by$x, x_names), by$x)
34 y_by <- set_names(match(by$y, y_names), by$x)
35
36 x_loc <- seq_along(x_names)
37 names(x_loc) <- x_names
38 if (!keep) {
39 # in x_out, key variables need to keep the same name, and aux
40 # variables need suffixes for duplicates that appear in y_out
41 y_aux <- setdiff(y_names, c(by$x, if (!keep) by$y))
42 x_is_aux <- !x_names %in% by$x
43 names(x_loc)[x_is_aux] <- add_suffixes(x_names[x_is_aux], c(by$x, y_aux), suffix$x)
44 } else {
45 # in x_out, key variables and aux variables need suffixes
46 # for duplicates that appear in y_out
47 names(x_loc) <- add_suffixes(x_names, y_names, suffix$x)
48 }
49
50 y_loc <- seq_along(y_names)
51 names(y_loc) <- add_suffixes(y_names, x_names, suffix$y)
52 if (!keep) {
53 y_loc <- y_loc[!y_names %in% by$y]
54 }
55
56 # key = named location to use for matching
57 # out = named locations to use in output
58 list(
59 x = list(key = x_by, out = x_loc),
60 y = list(key = y_by, out = y_loc)
61 )
62}
63
64standardise_join_by <- function(by, x_names, y_names) {
65 if (is.null(by)) {
66 by <- intersect(x_names, y_names)
67 if (length(by) == 0) {
68 abort(c(
69 "`by` must be supplied when `x` and `y` have no common variables.",
70 i = "use by = character()` to perform a cross-join."
71 ))
72 }
73 by_quoted <- encodeString(by, quote = '"')
74 if (length(by_quoted) == 1L) {
75 by_code <- by_quoted
76 } else {
77 by_code <- paste0("c(", paste(by_quoted, collapse = ", "), ")")
78 }
79 inform(paste0("Joining, by = ", by_code))
80
81 by <- list(x = by, y = by)
82 } else if (is.character(by)) {
83 by_x <- names(by) %||% by
84 by_y <- unname(by)
85
86 # If x partially named, assume unnamed are the same in both tables
87 by_x[by_x == ""] <- by_y[by_x == ""]
88
89 by <- list(x = by_x, y = by_y)
90 } else if (is.list(by)) {
91 # TODO: check lengths
92 by <- by[c("x", "y")]
93 } else {
94 bad_args("by", "must be a (named) character vector, list, or NULL, not {friendly_type_of(by)}.")
95 }
96
97 check_join_vars(by$x, x_names)
98 check_join_vars(by$y, y_names)
99
100 by
101}
102
103check_join_vars <- function(vars, names) {
104 if (!is.character(vars)) {
105 abort("join columns must be character vectors.")
106 }
107
108 na <- is.na(vars)
109 if (any(na)) {
110 abort(c(
111 "Join columns must be not NA.",
112 x = glue("Problem at position {err_vars(na)}.")
113 ))
114 }
115
116 dup <- duplicated(vars)
117 if (any(dup)) {
118 abort(c(
119 "Join columns must be unique.",
120 x = glue("Problem at position {err_vars(dup)}.")
121 ))
122 }
123
124 missing <- setdiff(vars, names)
125 if (length(missing) > 0) {
126 abort(c(
127 "Join columns must be present in data.",
128 x = glue("Problem with {err_vars(missing)}.")
129 ))
130 }
131}
132
133check_duplicate_vars <- function(vars, input) {
134 dup <- duplicated(vars)
135 if (any(dup)) {
136 abort(c(
137 glue("Input columns in `{input}` must be unique."),
138 x = glue("Problem with {err_vars(vars[dup])}.")
139 ))
140 }
141}
142
143standardise_join_suffix <- function(x) {
144 if (!is.character(x) || length(x) != 2) {
145 abort(c(
146 "`suffix` must be a character vector of length 2.",
147 i = glue("suffix is {friendly_type_of(x)} of length {length(x)}.")
148 ))
149 }
150
151 if (any(is.na(x))) {
152 bad_args("suffix", "can't be NA.")
153 }
154
155 list(x = x[[1]], y = x[[2]])
156}
157
158add_suffixes <- function(x, y, suffix) {
159 if (identical(suffix, "")) {
160 return(x)
161 }
162
163 out <- rep_along(x, na_chr)
164 for (i in seq_along(x)) {
165 nm <- x[[i]]
166 while (nm %in% y || nm %in% out[seq_len(i - 1)]) {
167 nm <- paste0(nm, suffix)
168 }
169
170 out[[i]] <- nm
171 }
172 out
173}
174