1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * See LICENSE.txt included in this distribution for the specific 9 * language governing permissions and limitations under the License. 10 * 11 * When distributing Covered Code, include this CDDL HEADER in each 12 * file and include the License file at LICENSE.txt. 13 * If applicable, add the following below this CDDL HEADER, with the 14 * fields enclosed by brackets "[]" replaced with your own identifying 15 * information: Portions Copyright [yyyy] [name of copyright owner] 16 * 17 * CDDL HEADER END 18 */ 19 20 /* 21 * Copyright (c) 2020, Anatoly Akkerman <akkerman@gmail.com>, <anatoly.akkerman@twosigma.com>. 22 */ 23 package org.opengrok.indexer.history; 24 25 import org.opengrok.indexer.logger.LoggerFactory; 26 27 import java.io.IOException; 28 import java.nio.file.FileSystem; 29 import java.nio.file.Files; 30 import java.nio.file.Path; 31 import java.nio.file.Paths; 32 import java.util.ArrayList; 33 import java.util.Collection; 34 import java.util.Comparator; 35 import java.util.List; 36 import java.util.Map; 37 import java.util.Optional; 38 import java.util.Set; 39 import java.util.TreeSet; 40 import java.util.concurrent.ConcurrentHashMap; 41 import java.util.concurrent.ConcurrentMap; 42 import java.util.logging.Level; 43 import java.util.logging.Logger; 44 import java.util.stream.Collectors; 45 46 public class RepositoryLookupCached implements RepositoryLookup { 47 private static final Logger LOGGER = LoggerFactory.getLogger(RepositoryLookupCached.class); 48 49 /** 50 * Cache of directories to their enclosing repositories. 51 * 52 * This cache will contain Optional.empty() for any directory that has so far not been mapped to a repo. 53 * See findRepository for more information. 54 */ 55 private final ConcurrentMap<String, Optional<Repository>> dirToRepoCache = new ConcurrentHashMap<>(); 56 57 /** 58 * Find enclosing Repository for a given file, limiting canonicalization operations 59 * to a given set of Repository parent dirs. 60 * Search results are cached in an internal cache to speed up subsequent lookups for all directories 61 * between path and the root path of the enclosing Repository (if found). Negative results are also 62 * cached as empty Optional values in the cache. However, negative results are not treated as definitive. 63 * <p> 64 * The cache is only invalidated on Repository invalidation or removal 65 * which means that if the filesystem changes under HistoryGuru, it may return stale results 66 * 67 * @param filePath path to find enclosing repository for 68 * @param repoRoots set of Repository parent dirs 69 * @param repositories map of repository roots to repositories 70 * @param canonicalizer path canonicalizer implementation to use 71 * @return Optional Repository (empty if not found) 72 */ findRepository(final Path filePath, final Set<String> repoRoots, Map<String, Repository> repositories, PathCanonicalizer canonicalizer)73 private Optional<Repository> findRepository(final Path filePath, final Set<String> repoRoots, 74 Map<String, Repository> repositories, PathCanonicalizer canonicalizer) { 75 Path path = filePath; 76 FileSystem fileSystem = filePath.getFileSystem(); 77 Optional<Repository> maybeRepo = Optional.empty(); 78 79 // If we find repo mapping, we backfill the cache with these entries as well 80 List<String> backfillPaths = new ArrayList<>(); 81 // Walk up the file's path until we find a matching enclosing Repository 82 while (maybeRepo.isEmpty() && path != null) { 83 String nextPath = path.toString(); 84 boolean isDirectory = Files.isDirectory(path); 85 if (isDirectory) { 86 /* 87 * Only store (and lookup) directory entries in the cache to avoid cache explosion 88 * This may fail to find a Repository (e.g. nextPath is inside a root), so this will 89 * insert an Optional.empty() into the map. However, as we walk up, we may eventually 90 * find a Repo, in which case we need to backfill these placeholder Optional.empty() entries 91 * with the final answer 92 */ 93 maybeRepo = dirToRepoCache 94 .computeIfAbsent(nextPath, p -> repoForPath(p, repoRoots, repositories, fileSystem, canonicalizer)); 95 } else { 96 maybeRepo = repoForPath(nextPath, repoRoots, repositories, fileSystem, canonicalizer); 97 } 98 /* 99 * Given that this class has to be thread-safe, Optional.empty() value cannot be assumed to be a definitive 100 * answer that a given directory is not inside a repo, because of backfilling. 101 * E.g. 102 * During lookup of repository /a/b/c/d in repository /a, the following happens: 103 * - /a/b/c/d: no entry in the cache, insert empty 104 * - /a/b/c: no entry in the cache, insert empty 105 * - /a/b: no entry in the cache, insert empty 106 * - /a: found Repo(a), insert it into cache 107 * 108 * Now, the code needs to replace (backfill) entries for /a/b/c/d, /a/b/c and /a/b 109 * However, before that happens, a concurrent findRepository() call can come in and find empty mappings 110 * for /a/b/c/d, /a/b/c and /a/b, before they were backfilled. So, instead of returning an incorrect 111 * cached result, we continue confirming negative result fully. This is a reasonable choice because 112 * most lookups will be made for files that do live inside repositories, so non-empty cached values will 113 * be the dominant cache hits. 114 * 115 * An alternative to provide definitive negative cache result would be to use a special sentinel 116 * Repository object that represents NO_REPOSITORY as a definitive negative cached value, while Optional.empty 117 * would represent a tentative negative value. 118 */ 119 if (maybeRepo.isEmpty()) { 120 if (isDirectory) { 121 backfillPaths.add(nextPath); 122 } 123 path = path.getParent(); 124 } 125 } 126 /* 127 * If the lookup was for /a/b/c/d and we found the repo at /a, then 128 * - /a -> Repo(/a) is done by the computeIfAbsent call 129 * And backfill fills in: 130 * - /a/b -> Repo(/a) 131 * - /a/b/c-> Repo(/a) 132 */ 133 maybeRepo.ifPresent(repo -> 134 // Replace empty values with newly found Repository 135 backfillPaths.forEach(backfillPath -> dirToRepoCache.replace(backfillPath, Optional.empty(), Optional.of(repo)))); 136 137 return maybeRepo; 138 } 139 140 /** 141 * Try to find path's enclosing repository by attempting to canonicalize it relative to 142 * a given set of repository roots. This method can be pretty expensive, with complexity 143 * of O(N*M) where N is path's depth and M is number of repository parent dirs. 144 * 145 * @param path path to find enclosing Repository for 146 * @param repoParentDirs limit canonicalization search only to these Repository parent dirs 147 * @param canonicalizer path canonicalizer implementation to use 148 * @return Optional of matching Repository (empty if not found) 149 */ repoForPath(String path, Set<String> repoParentDirs, Map<String, Repository> repositories, FileSystem fileSystem, PathCanonicalizer canonicalizer)150 private Optional<Repository> repoForPath(String path, Set<String> repoParentDirs, 151 Map<String, Repository> repositories, 152 FileSystem fileSystem, PathCanonicalizer canonicalizer) { 153 Optional<Repository> repo = Optional.ofNullable(repositories.get(path)); 154 155 if (repo.isPresent()) { 156 return repo; 157 } 158 159 for (String repoRoot : repoParentDirs) { 160 try { 161 String rel = canonicalizer.resolve(fileSystem.getPath(path), fileSystem.getPath(repoRoot)); 162 if (!rel.equals(path)) { 163 // Resolve the relative against root key and look it up 164 String canonicalPath = Paths.get(repoRoot, rel).toString(); 165 repo = Optional.ofNullable(repositories.get(canonicalPath)); 166 if (repo.isPresent()) { 167 return repo; 168 } 169 } 170 } catch (IOException e) { 171 LOGGER.log(Level.WARNING, 172 "Failed to get relative to canonical for " + path + " and " + repoRoot, 173 e); 174 } 175 } 176 return Optional.empty(); 177 } 178 179 @Override getRepository(Path path, Set<String> repoParentDirs, Map<String, Repository> repositories, PathCanonicalizer canonicalizer)180 public Repository getRepository(Path path, Set<String> repoParentDirs, Map<String, Repository> repositories, 181 PathCanonicalizer canonicalizer) { 182 Comparator<String> stringComparator = String::compareTo; 183 Comparator<String> reversedComparator = stringComparator.reversed(); 184 TreeSet<String> allRoots = new TreeSet<>(repoParentDirs); 185 String pathStr = path.toString(); 186 // parentRepoDirs are canonicalized, so we need to filter them against canonical path representation 187 Path canonicalPath; 188 try { 189 canonicalPath = path.toRealPath(); 190 } catch (IOException e) { 191 canonicalPath = path.normalize().toAbsolutePath(); 192 } 193 194 if (!canonicalPath.equals(path)) { 195 pathStr = canonicalPath.toString(); 196 } 197 198 /* 199 * Find all potential Repository parent dirs by matching their roots to file's prefix 200 * This is useful to limit the number of iterations in repoForPath call. 201 * Store matches in reverse-ordered TreeSet so that longest prefixes appear first 202 * (There may be multiple entries if we allow nested repositories) 203 */ 204 TreeSet<String> filteredParentDirs = allRoots.stream().filter(pathStr::startsWith) 205 .collect(Collectors.toCollection(() -> new TreeSet<>(reversedComparator))); 206 207 return findRepository(path, filteredParentDirs, repositories, canonicalizer).orElse(null); 208 } 209 210 @Override repositoriesRemoved(Collection<Repository> removedRepos)211 public void repositoriesRemoved(Collection<Repository> removedRepos) { 212 dirToRepoCache.entrySet().stream().filter(entry -> entry.getValue().filter(removedRepos::contains).isPresent()) 213 .map(Map.Entry::getKey).forEach(dirToRepoCache::remove); 214 } 215 216 @Override clear()217 public void clear() { 218 dirToRepoCache.clear(); 219 } 220 size()221 public int size() { 222 return dirToRepoCache.size(); 223 } 224 } 225