xref: /OpenGrok/opengrok-indexer/src/main/java/org/opengrok/indexer/history/RepositoryLookupCached.java (revision c6f0939b1c668e9f8e1e276424439c3106b3a029)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * See LICENSE.txt included in this distribution for the specific
9  * language governing permissions and limitations under the License.
10  *
11  * When distributing Covered Code, include this CDDL HEADER in each
12  * file and include the License file at LICENSE.txt.
13  * If applicable, add the following below this CDDL HEADER, with the
14  * fields enclosed by brackets "[]" replaced with your own identifying
15  * information: Portions Copyright [yyyy] [name of copyright owner]
16  *
17  * CDDL HEADER END
18  */
19 
20 /*
21  * Copyright (c) 2020, Anatoly Akkerman <akkerman@gmail.com>, <anatoly.akkerman@twosigma.com>.
22  */
23 package org.opengrok.indexer.history;
24 
25 import org.opengrok.indexer.logger.LoggerFactory;
26 
27 import java.io.IOException;
28 import java.nio.file.FileSystem;
29 import java.nio.file.Files;
30 import java.nio.file.Path;
31 import java.nio.file.Paths;
32 import java.util.ArrayList;
33 import java.util.Collection;
34 import java.util.Comparator;
35 import java.util.List;
36 import java.util.Map;
37 import java.util.Optional;
38 import java.util.Set;
39 import java.util.TreeSet;
40 import java.util.concurrent.ConcurrentHashMap;
41 import java.util.concurrent.ConcurrentMap;
42 import java.util.logging.Level;
43 import java.util.logging.Logger;
44 import java.util.stream.Collectors;
45 
46 public class RepositoryLookupCached implements RepositoryLookup {
47     private static final Logger LOGGER = LoggerFactory.getLogger(RepositoryLookupCached.class);
48 
49     /**
50      * Cache of directories to their enclosing repositories.
51      *
52      * This cache will contain Optional.empty() for any directory that has so far not been mapped to a repo.
53      * See findRepository for more information.
54      */
55     private final ConcurrentMap<String, Optional<Repository>> dirToRepoCache = new ConcurrentHashMap<>();
56 
57     /**
58      * Find enclosing Repository for a given file, limiting canonicalization operations
59      * to a given set of Repository parent dirs.
60      * Search results are cached in an internal cache to speed up subsequent lookups for all directories
61      * between path and the root path of the enclosing Repository (if found). Negative results are also
62      * cached as empty Optional values in the cache. However, negative results are not treated as definitive.
63      * <p>
64      * The cache is only invalidated on Repository invalidation or removal
65      * which means that if the filesystem changes under HistoryGuru, it may return stale results
66      *
67      * @param filePath      path to find enclosing repository for
68      * @param repoRoots     set of Repository parent dirs
69      * @param repositories  map of repository roots to repositories
70      * @param canonicalizer path canonicalizer implementation to use
71      * @return Optional Repository (empty if not found)
72      */
findRepository(final Path filePath, final Set<String> repoRoots, Map<String, Repository> repositories, PathCanonicalizer canonicalizer)73     private Optional<Repository> findRepository(final Path filePath, final Set<String> repoRoots,
74         Map<String, Repository> repositories, PathCanonicalizer canonicalizer) {
75         Path path = filePath;
76         FileSystem fileSystem = filePath.getFileSystem();
77         Optional<Repository> maybeRepo = Optional.empty();
78 
79         // If we find repo mapping, we backfill the cache with these entries as well
80         List<String> backfillPaths = new ArrayList<>();
81         // Walk up the file's path until we find a matching enclosing Repository
82         while (maybeRepo.isEmpty() && path != null) {
83             String nextPath = path.toString();
84             boolean isDirectory = Files.isDirectory(path);
85             if (isDirectory) {
86                 /*
87                  * Only store (and lookup) directory entries in the cache to avoid cache explosion
88                  * This may fail to find a Repository (e.g. nextPath is inside a root), so this will
89                  * insert an Optional.empty() into the map. However, as we walk up, we may eventually
90                  * find a Repo, in which case we need to backfill these placeholder Optional.empty() entries
91                  * with the final answer
92                  */
93                 maybeRepo = dirToRepoCache
94                     .computeIfAbsent(nextPath, p -> repoForPath(p, repoRoots, repositories, fileSystem, canonicalizer));
95             } else {
96                 maybeRepo = repoForPath(nextPath, repoRoots, repositories, fileSystem, canonicalizer);
97             }
98             /*
99              * Given that this class has to be thread-safe, Optional.empty() value cannot be assumed to be a definitive
100              * answer that a given directory is not inside a repo, because of backfilling.
101              * E.g.
102              * During lookup of repository /a/b/c/d in repository /a, the following happens:
103              * - /a/b/c/d: no entry in the cache, insert empty
104              * - /a/b/c: no entry in the cache, insert empty
105              * - /a/b: no entry in the cache, insert empty
106              * - /a: found Repo(a), insert it into cache
107              *
108              * Now, the code needs to replace (backfill) entries for /a/b/c/d, /a/b/c and /a/b
109              * However, before that happens, a concurrent findRepository() call can come in and find empty mappings
110              * for /a/b/c/d, /a/b/c and /a/b, before they were backfilled. So, instead of returning an incorrect
111              * cached result, we continue confirming negative result fully. This is a reasonable choice because
112              * most lookups will be made for files that do live inside repositories, so non-empty cached values will
113              * be the dominant cache hits.
114              *
115              * An alternative to provide definitive negative cache result would be to use a special sentinel
116              * Repository object that represents NO_REPOSITORY as a definitive negative cached value, while Optional.empty
117              * would represent a tentative negative value.
118              */
119             if (maybeRepo.isEmpty()) {
120                 if (isDirectory) {
121                     backfillPaths.add(nextPath);
122                 }
123                 path = path.getParent();
124             }
125         }
126         /*
127          * If the lookup was for /a/b/c/d and we found the repo at /a, then
128          *   - /a -> Repo(/a) is done by the computeIfAbsent call
129          * And backfill fills in:
130          *   - /a/b -> Repo(/a)
131          *   - /a/b/c-> Repo(/a)
132          */
133         maybeRepo.ifPresent(repo ->
134             // Replace empty values with newly found Repository
135             backfillPaths.forEach(backfillPath -> dirToRepoCache.replace(backfillPath, Optional.empty(), Optional.of(repo))));
136 
137         return maybeRepo;
138     }
139 
140     /**
141      * Try to find path's enclosing repository by attempting to canonicalize it relative to
142      * a given set of repository roots. This method can be pretty expensive, with complexity
143      * of O(N*M) where N is path's depth and M is number of repository parent dirs.
144      *
145      * @param path           path to find enclosing Repository for
146      * @param repoParentDirs limit canonicalization search only to these Repository parent dirs
147      * @param canonicalizer  path canonicalizer implementation to use
148      * @return Optional of matching Repository (empty if not found)
149      */
repoForPath(String path, Set<String> repoParentDirs, Map<String, Repository> repositories, FileSystem fileSystem, PathCanonicalizer canonicalizer)150     private Optional<Repository> repoForPath(String path, Set<String> repoParentDirs,
151         Map<String, Repository> repositories,
152         FileSystem fileSystem, PathCanonicalizer canonicalizer) {
153         Optional<Repository> repo = Optional.ofNullable(repositories.get(path));
154 
155         if (repo.isPresent()) {
156             return repo;
157         }
158 
159         for (String repoRoot : repoParentDirs) {
160             try {
161                 String rel = canonicalizer.resolve(fileSystem.getPath(path), fileSystem.getPath(repoRoot));
162                 if (!rel.equals(path)) {
163                     // Resolve the relative against root key and look it up
164                     String canonicalPath = Paths.get(repoRoot, rel).toString();
165                     repo = Optional.ofNullable(repositories.get(canonicalPath));
166                     if (repo.isPresent()) {
167                         return repo;
168                     }
169                 }
170             } catch (IOException e) {
171                 LOGGER.log(Level.WARNING,
172                     "Failed to get relative to canonical for " + path + " and " + repoRoot,
173                     e);
174             }
175         }
176         return Optional.empty();
177     }
178 
179     @Override
getRepository(Path path, Set<String> repoParentDirs, Map<String, Repository> repositories, PathCanonicalizer canonicalizer)180     public Repository getRepository(Path path, Set<String> repoParentDirs, Map<String, Repository> repositories,
181         PathCanonicalizer canonicalizer) {
182         Comparator<String> stringComparator = String::compareTo;
183         Comparator<String> reversedComparator = stringComparator.reversed();
184         TreeSet<String> allRoots = new TreeSet<>(repoParentDirs);
185         String pathStr = path.toString();
186         // parentRepoDirs are canonicalized, so we need to filter them against canonical path representation
187         Path canonicalPath;
188         try {
189             canonicalPath = path.toRealPath();
190         } catch (IOException e) {
191             canonicalPath = path.normalize().toAbsolutePath();
192         }
193 
194         if (!canonicalPath.equals(path)) {
195             pathStr = canonicalPath.toString();
196         }
197 
198         /*
199          * Find all potential Repository parent dirs by matching their roots to file's prefix
200          * This is useful to limit the number of iterations in repoForPath call.
201          * Store matches in reverse-ordered TreeSet so that longest prefixes appear first
202          * (There may be multiple entries if we allow nested repositories)
203          */
204         TreeSet<String> filteredParentDirs = allRoots.stream().filter(pathStr::startsWith)
205             .collect(Collectors.toCollection(() -> new TreeSet<>(reversedComparator)));
206 
207         return findRepository(path, filteredParentDirs, repositories, canonicalizer).orElse(null);
208     }
209 
210     @Override
repositoriesRemoved(Collection<Repository> removedRepos)211     public void repositoriesRemoved(Collection<Repository> removedRepos) {
212         dirToRepoCache.entrySet().stream().filter(entry -> entry.getValue().filter(removedRepos::contains).isPresent())
213             .map(Map.Entry::getKey).forEach(dirToRepoCache::remove);
214     }
215 
216     @Override
clear()217     public void clear() {
218         dirToRepoCache.clear();
219     }
220 
size()221     public int size() {
222         return dirToRepoCache.size();
223     }
224 }
225