xref: /OpenGrok/opengrok-indexer/src/main/java/org/opengrok/indexer/util/TandemFilename.java (revision 5d9f3aa0ca3da3a714233f987fa732f62c0965f6)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * See LICENSE.txt included in this distribution for the specific
9  * language governing permissions and limitations under the License.
10  *
11  * When distributing Covered Code, include this CDDL HEADER in each
12  * file and include the License file at LICENSE.txt.
13  * If applicable, add the following below this CDDL HEADER, with the
14  * fields enclosed by brackets "[]" replaced with your own identifying
15  * information: Portions Copyright [yyyy] [name of copyright owner]
16  *
17  * CDDL HEADER END
18  */
19 
20 /*
21  * Copyright (c) 2018, Chris Fraire <cfraire@me.com>.
22  */
23 package org.opengrok.indexer.util;
24 
25 import java.io.File;
26 import java.nio.charset.StandardCharsets;
27 import java.security.MessageDigest;
28 import java.security.NoSuchAlgorithmException;
29 import java.util.Arrays;
30 import java.util.Base64;
31 
/**
 * Represents a utility class for creating a filename to operate in tandem with
 * an original filename by adding a new file extension but limiting the length
 * of the new filename to 255 UTF-8 encoded bytes if necessary by truncating
 * and packing in a Base64-encoded SHA-256 hash of the original file name.
 * <p>
 * The class holds no mutable state; all members are static.
 */
public class TandemFilename {

    /** Upper bound, in UTF-8 encoded bytes, of any filename produced here. */
    private static final int MAX_BYTES = 255;

    /**
     * One fewer than {@link #MAX_BYTES} as a cap for simple concatenation to
     * avoid the possibility of easily fabricating a collision against this
     * algorithm. I.e., a 255 byte tandem filename will always include a
     * computed hash and not just be the concatenation of original filename
     * plus new extension.
     */
    private static final int MAX_CAT_BYTES = MAX_BYTES - 1;

    /**
     * "Instances of Base64.Encoder class are safe for use by multiple
     * concurrent threads." --Oracle.
     */
    private static final Base64.Encoder encoder = Base64.getUrlEncoder();

    /** Private to enforce static. */
    private TandemFilename() {
    }

    /**
     * Appends an ASCII extension to the specified {@code filename}, truncating
     * and packing in a SHA-256 hash if the UTF-8 encoding would exceed 254
     * bytes and arriving at a final size of 255 bytes in that special case.
     * @param filename a defined instance
     * @param asciiExtension a defined instance that is expected to be only
     *                       ASCII so that its UTF-8 form is the same length
     * @return a transformed filename whose UTF-8 encoding is not more than 255
     * bytes.
     * @throws IllegalArgumentException thrown if {@code filename} has a
     * parent or if {@code asciiExtension} is too long to allow packing a
     * SHA-256 hash in the transformation (including when no part of
     * {@code filename} would remain after truncation).
     */
    public static String join(String filename, String asciiExtension) {

        File file = new File(filename);
        if (file.getParent() != null) {
            throw new IllegalArgumentException("filename can't have parent");
        }

        /*
         * If the original filename length * 4 (for longest possible UTF-8
         * encoding) plus asciiExtension length is not greater than one less
         * than 255, then quickly return the concatenation without having to
         * encode anything.
         */
        if (filename.length() * 4 + asciiExtension.length() <= MAX_CAT_BYTES) {
            return filename + asciiExtension;
        }
        return maybePackSha(filename, asciiExtension);
    }

    /**
     * Concatenates if the real UTF-8 size allows it; otherwise builds a name
     * of exactly {@link #MAX_BYTES} bytes made of a truncated prefix of
     * {@code filename}, optional {@code '_'} padding, the Base64-encoded
     * SHA-256 hash, any short alphanumeric extension {@code filename} already
     * had, and finally {@code asciiExtension}.
     * @param filename a defined instance without a parent
     * @param asciiExtension a defined instance expected to be ASCII-only
     * @return a filename whose UTF-8 encoding is not more than 255 bytes
     * @throws IllegalArgumentException if nothing of {@code filename} would be
     * left after truncating to make room for the hash and extension
     */
    private static String maybePackSha(String filename, String asciiExtension) {

        byte[] uFilename = filename.getBytes(StandardCharsets.UTF_8);
        int nBytes = uFilename.length;
        if (nBytes + asciiExtension.length() <= MAX_CAT_BYTES) {
            // Here the UTF-8 encoding already allows for the new extension.
            return filename + asciiExtension;
        }

        /*
         * If filename has an ASCII extension already (of a reasonable length),
         * shift it to the new asciiExtension so that it won't be overwritten
         * by the packed hash.
         */
        int pos = filename.lastIndexOf('.');
        int extLength = filename.length() - pos;
        if (pos >= 0 && extLength < 30 && extLength > 1) {
            int i;
            for (i = pos + 1; i < filename.length(); ++i) {
                char ch = filename.charAt(i);
                // Only letters/digits at or below 'z' qualify, i.e. ASCII.
                if (!Character.isLetterOrDigit(ch) || ch > 'z') {
                    break;
                }
            }
            if (i >= filename.length()) {
                // By this point, we affirmed a letters/numbers extension.
                asciiExtension = filename.substring(pos) + asciiExtension;
                filename = filename.substring(0, pos);
                uFilename = filename.getBytes(StandardCharsets.UTF_8);
                nBytes = uFilename.length;
            }
        }

        // Pack the hash just before the file extension.
        asciiExtension = sha256base64(filename) + asciiExtension;

        /*
         * The combined size exceeded MAX_CAT_BYTES before the hash was added,
         * so truncation is always required from here on. If the base name is
         * empty (e.g. the original was "" or was entirely a short extension
         * like ".a"), there is nothing to truncate: report the documented
         * error instead of letting codePointBefore(0) raise a
         * StringIndexOutOfBoundsException.
         */
        if (filename.isEmpty()) {
            throw new IllegalArgumentException("asciiExtension too long");
        }

        /*
         * Now trim the filename by code points until the full UTF-8 encoding
         * fits within MAX_BYTES.
         */
        int newLength = filename.length();
        while (nBytes + asciiExtension.length() > MAX_BYTES) {
            int cp = filename.codePointBefore(newLength);
            int nChars = Character.charCount(cp);
            String c = filename.substring(newLength - nChars, newLength);
            nBytes -= c.getBytes(StandardCharsets.UTF_8).length;
            newLength -= nChars;

            if (newLength <= 0) {
                throw new IllegalArgumentException("asciiExtension too long");
            }
        }

        /*
         * Pad if necessary to exactly MAX_BYTES. Removing a multi-byte code
         * point can undershoot the limit by a few bytes.
         */
        if (nBytes + asciiExtension.length() != MAX_BYTES) {
            char[] pad = new char[MAX_BYTES - nBytes - asciiExtension.length()];
            Arrays.fill(pad, '_');
            asciiExtension = new String(pad) + asciiExtension;
        }

        return filename.substring(0, newLength) + asciiExtension;
    }

    /**
     * Computes the SHA-256 digest of the UTF-8 encoding of {@code value} and
     * returns it encoded with the URL-safe Base64 {@link #encoder}.
     * @param value a defined instance
     * @return the Base64 text of the digest
     */
    private static String sha256base64(String value) {

        MessageDigest hasher;
        try {
            hasher = MessageDigest.getInstance("SHA-256");
        } catch (NoSuchAlgorithmException e) {
            /*
             * This will not happen since "Every implementation of the Java
             * platform is required to support the following standard
             * MessageDigest algorithms: MD5, SHA-1, SHA-256."
             */
            throw new RuntimeException(e);
        }

        byte[] digest = hasher.digest(value.getBytes(StandardCharsets.UTF_8));
        return encoder.encodeToString(digest);
    }
}
173