xref: /OpenGrok/opengrok-indexer/src/main/java/org/opengrok/indexer/web/EftarFile.java (revision 7483fa4f6ea5b37f9dc2e3053394dfc04d7212e6)
1b5840353SAdam Hornáček /*
2b5840353SAdam Hornáček  * CDDL HEADER START
3b5840353SAdam Hornáček  *
4b5840353SAdam Hornáček  * The contents of this file are subject to the terms of the
5b5840353SAdam Hornáček  * Common Development and Distribution License (the "License").
6b5840353SAdam Hornáček  * You may not use this file except in compliance with the License.
7b5840353SAdam Hornáček  *
8b5840353SAdam Hornáček  * See LICENSE.txt included in this distribution for the specific
9b5840353SAdam Hornáček  * language governing permissions and limitations under the License.
10b5840353SAdam Hornáček  *
11b5840353SAdam Hornáček  * When distributing Covered Code, include this CDDL HEADER in each
12b5840353SAdam Hornáček  * file and include the License file at LICENSE.txt.
13b5840353SAdam Hornáček  * If applicable, add the following below this CDDL HEADER, with the
14b5840353SAdam Hornáček  * fields enclosed by brackets "[]" replaced with your own identifying
15b5840353SAdam Hornáček  * information: Portions Copyright [yyyy] [name of copyright owner]
16b5840353SAdam Hornáček  *
17b5840353SAdam Hornáček  * CDDL HEADER END
18b5840353SAdam Hornáček  */
19b5840353SAdam Hornáček 
20b5840353SAdam Hornáček /*
21c6f0939bSAdam Hornacek  * Copyright (c) 2005, 2021, Oracle and/or its affiliates. All rights reserved.
22b5840353SAdam Hornáček  */
239805b761SAdam Hornáček package org.opengrok.indexer.web;
24b5840353SAdam Hornáček 
25b5840353SAdam Hornáček import java.io.BufferedOutputStream;
26b5840353SAdam Hornáček import java.io.DataOutputStream;
27b5840353SAdam Hornáček import java.io.FileOutputStream;
28b5840353SAdam Hornáček import java.io.IOException;
29b5840353SAdam Hornáček import java.util.Map;
30a11b766cSVladimir Kotal import java.util.Set;
31b5840353SAdam Hornáček import java.util.StringTokenizer;
32b5840353SAdam Hornáček import java.util.TreeMap;
33b5840353SAdam Hornáček 
34b5840353SAdam Hornáček /**
35ff44f24aSAdam Hornáček  * An Extremely Fast Tagged Attribute Read-only File System.
36b5840353SAdam Hornáček  * Created on October 12, 2005
37b5840353SAdam Hornáček  *
38*7483fa4fSVladimir Kotal  * <i>Eftar</i> File has the following format
39*7483fa4fSVladimir Kotal  * <code>
40b5840353SAdam Hornáček  * FILE --&gt; Record  ( Record | tagString ) *
41*7483fa4fSVladimir Kotal  * <br>
42b5840353SAdam Hornáček  * Record --&gt; 64bit:Hash 16bit:childrenOffset  16bit:(numberChildren|lenthOfTag) 16bit:tagOffset
43*7483fa4fSVladimir Kotal  * </code>
44b5840353SAdam Hornáček  *
45*7483fa4fSVladimir Kotal  * It is a tree of tagged names, doing binary search in sorted list of children
46b5840353SAdam Hornáček  *
47b5840353SAdam Hornáček  * @author Chandan
48b5840353SAdam Hornáček  */
49b5840353SAdam Hornáček public class EftarFile {
50b5840353SAdam Hornáček 
51b5840353SAdam Hornáček     public static final int RECORD_LENGTH = 14;
52b5840353SAdam Hornáček     private long offset;
53a11b766cSVladimir Kotal     private Node root;
54b5840353SAdam Hornáček 
55c6f0939bSAdam Hornacek     static class Node {
56b5840353SAdam Hornáček 
57*7483fa4fSVladimir Kotal         private final long hash;
58*7483fa4fSVladimir Kotal         private String tag;
59*7483fa4fSVladimir Kotal         private final Map<Long, Node> children;
60*7483fa4fSVladimir Kotal         private long tagOffset;
61*7483fa4fSVladimir Kotal         private long childOffset;
62b5840353SAdam Hornáček 
Node(long hash, String tag)63d1e826faSAdam Hornáček         Node(long hash, String tag) {
64b5840353SAdam Hornáček             this.hash = hash;
65b5840353SAdam Hornáček             this.tag = tag;
66b5840353SAdam Hornáček             children = new TreeMap<>();
67b5840353SAdam Hornáček         }
68b5840353SAdam Hornáček 
put(long hash, String desc)69b5840353SAdam Hornáček         public Node put(long hash, String desc) {
70*7483fa4fSVladimir Kotal             return children.computeIfAbsent(hash, newNode -> new Node(hash, desc));
71b5840353SAdam Hornáček         }
72b5840353SAdam Hornáček 
get(long hash)73b5840353SAdam Hornáček         public Node get(long hash) {
74b5840353SAdam Hornáček             return children.get(hash);
75b5840353SAdam Hornáček         }
76b5840353SAdam Hornáček     }
77b5840353SAdam Hornáček 
myHash(String name)78b5840353SAdam Hornáček     public static long myHash(String name) {
79b5840353SAdam Hornáček         if (name == null || name.length() == 0) {
80b5840353SAdam Hornáček             return 0;
81b5840353SAdam Hornáček         }
82b5840353SAdam Hornáček         long hash = 2861;
83b5840353SAdam Hornáček         int n = name.length();
84b5840353SAdam Hornáček         if (n > 100) {
85b5840353SAdam Hornáček             n = 100;
86b5840353SAdam Hornáček         }
87b5840353SAdam Hornáček         for (int i = 0; i < n; i++) {
88edf8e58eSVladimir Kotal             hash = (hash * 641L + name.charAt(i) * 2969 + hash << 6) % 9322397;
89b5840353SAdam Hornáček         }
90b5840353SAdam Hornáček         return hash;
91b5840353SAdam Hornáček     }
92b5840353SAdam Hornáček 
write(Node n, DataOutputStream out)93b5840353SAdam Hornáček     private void write(Node n, DataOutputStream out) throws IOException {
94b5840353SAdam Hornáček         if (n.tag != null) {
95b5840353SAdam Hornáček             out.write(n.tag.getBytes());
96b5840353SAdam Hornáček             offset += n.tag.length();
97b5840353SAdam Hornáček         }
98edf8e58eSVladimir Kotal         for (Node childNode : n.children.values()) {
99edf8e58eSVladimir Kotal             out.writeLong(childNode.hash);
100edf8e58eSVladimir Kotal             if (childNode.children.size() > 0) {
101edf8e58eSVladimir Kotal                 out.writeShort((short) (childNode.childOffset - offset));
102edf8e58eSVladimir Kotal                 out.writeShort((short) childNode.children.size());
103b5840353SAdam Hornáček             } else {
104b5840353SAdam Hornáček                 out.writeShort(0);
105edf8e58eSVladimir Kotal                 if (childNode.tag == null) {
106b5840353SAdam Hornáček                     out.writeShort((short) 0);
107b5840353SAdam Hornáček                 } else {
108edf8e58eSVladimir Kotal                     out.writeShort((short) childNode.tag.length());
109b5840353SAdam Hornáček                 }
110b5840353SAdam Hornáček             }
111edf8e58eSVladimir Kotal             if (childNode.tag == null) {
112b5840353SAdam Hornáček                 out.writeShort(0);
113b5840353SAdam Hornáček             } else {
114edf8e58eSVladimir Kotal                 out.writeShort((short) (childNode.tagOffset - offset));
115b5840353SAdam Hornáček             }
116b5840353SAdam Hornáček             offset += RECORD_LENGTH;
117b5840353SAdam Hornáček         }
118edf8e58eSVladimir Kotal         for (Node childNode : n.children.values()) {
119edf8e58eSVladimir Kotal             write(childNode, out);
120b5840353SAdam Hornáček         }
121b5840353SAdam Hornáček     }
122b5840353SAdam Hornáček 
traverse(Node n)123b5840353SAdam Hornáček     private void traverse(Node n) {
124b5840353SAdam Hornáček         if (n.tag == null) {
125b5840353SAdam Hornáček             n.tagOffset = 0;
126b5840353SAdam Hornáček         } else {
127b5840353SAdam Hornáček             n.tagOffset = offset;
128b5840353SAdam Hornáček             offset += n.tag.length();
129b5840353SAdam Hornáček         }
130b5840353SAdam Hornáček         if (n.children.size() > 0) {
131b5840353SAdam Hornáček             n.childOffset = offset;
132edf8e58eSVladimir Kotal             offset += ((long) RECORD_LENGTH * n.children.size());
133b5840353SAdam Hornáček         } else {
134b5840353SAdam Hornáček             n.childOffset = 0;
135b5840353SAdam Hornáček         }
136b5840353SAdam Hornáček         for (Node childnode : n.children.values()) {
137b5840353SAdam Hornáček             traverse(childnode);
138b5840353SAdam Hornáček         }
139b5840353SAdam Hornáček     }
140f197cd77SVladimir Kotal 
141f197cd77SVladimir Kotal     /**
142f197cd77SVladimir Kotal      * Reads the input into interim representation. Can be called multiple times.
143a11b766cSVladimir Kotal      * @param descriptions set of PathDescription
144f197cd77SVladimir Kotal      */
readInput(Set<PathDescription> descriptions)145edf8e58eSVladimir Kotal     private void readInput(Set<PathDescription> descriptions) {
146b5840353SAdam Hornáček         if (root == null) {
147b5840353SAdam Hornáček             root = new Node(1, null);
148b5840353SAdam Hornáček         }
149a11b766cSVladimir Kotal 
150a11b766cSVladimir Kotal         for (PathDescription desc : descriptions) {
151a11b766cSVladimir Kotal             StringTokenizer toks = new StringTokenizer(desc.getPath(), "\\/");
152b5840353SAdam Hornáček             Node n = root;
153b5840353SAdam Hornáček             while (toks.hasMoreTokens()) {
154b5840353SAdam Hornáček                 n = n.put(myHash(toks.nextToken()), null);
155b5840353SAdam Hornáček             }
156a11b766cSVladimir Kotal             n.tag = desc.getDescription();
157b5840353SAdam Hornáček         }
158b5840353SAdam Hornáček     }
159b5840353SAdam Hornáček 
write(String outPath)160a11b766cSVladimir Kotal     public void write(String outPath) throws IOException {
161b5840353SAdam Hornáček         offset = RECORD_LENGTH;
162b5840353SAdam Hornáček         traverse(root);
163edf8e58eSVladimir Kotal         try (DataOutputStream out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(outPath)))) {
164b5840353SAdam Hornáček             out.writeLong(0x5e33);
165b5840353SAdam Hornáček             out.writeShort(RECORD_LENGTH);
166b5840353SAdam Hornáček             out.writeShort(root.children.size());
167b5840353SAdam Hornáček             out.writeShort(0);
168b5840353SAdam Hornáček             offset = RECORD_LENGTH;
169b5840353SAdam Hornáček             write(root, out);
170b5840353SAdam Hornáček         }
171b5840353SAdam Hornáček     }
172b5840353SAdam Hornáček 
create(Set<PathDescription> descriptions, String outputPath)173a11b766cSVladimir Kotal     public void create(Set<PathDescription> descriptions, String outputPath) throws IOException {
174a11b766cSVladimir Kotal         readInput(descriptions);
175f197cd77SVladimir Kotal         write(outputPath);
176b5840353SAdam Hornáček     }
177b5840353SAdam Hornáček }
178