/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * See LICENSE.txt included in this distribution for the specific
 * language governing permissions and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at LICENSE.txt.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2018, Chris Fraire <cfraire@me.com>.
 */

import java.io.File;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Arrays;
import java.util.Base64;

/**
 * Represents a utility class for creating a filename to operate in tandem with
 * an original filename by adding a new file extension but limiting the length
 * of the new filename to 255 UTF-8 encoded bytes if necessary by truncating
 * and packing in a Base64-encoded SHA-256 hash of the original file name.
 */
public class TandemFilename {

    /** Upper bound, in UTF-8 encoded bytes, of any filename produced here. */
    private static final int MAX_BYTES = 255;

    /**
     * One fewer than {@link #MAX_BYTES} as a cap for simple concatenation to
     * avoid the possibility of easily fabricating a collision against this
     * algorithm. I.e., a 255 byte tandem filename will always include a
     * computed hash and not just be the concatenation of original filename
     * plus new extension.
     */
    private static final int MAX_CAT_BYTES = MAX_BYTES - 1;

    /**
     * Exclusive upper bound on the length (counting the leading dot) of an
     * existing extension that is carried over in front of the new extension
     * when a hash has to be packed in.
     */
    private static final int MAX_SHIFTED_EXT_CHARS = 30;

    /**
     * "Instances of Base64.Encoder class are safe for use by multiple
     * concurrent threads." --Oracle.
     */
    private static final Base64.Encoder encoder = Base64.getUrlEncoder();

    /** Private to enforce static. */
    private TandemFilename() {
    }

    /**
     * Appends an ASCII extension to the specified {@code filename}, truncating
     * and packing in a SHA-256 hash if the UTF-8 encoding would exceed 254
     * bytes and arriving at a final size of 255 bytes in that special case.
     * @param filename a defined instance
     * @param asciiExtension a defined instance that is expected to be only
     * ASCII so that its UTF-8 form is the same length
     * @return a transformed filename whose UTF-8 encoding is not more than 255
     * bytes.
     * @throws IllegalArgumentException thrown if {@code filename} has a
     * parent or if {@code asciiExtension} is too long to allow packing a
     * SHA-256 hash in the transformation.
     */
    public static String join(String filename, String asciiExtension) {

        File file = new File(filename);
        if (file.getParent() != null) {
            throw new IllegalArgumentException("filename can't have parent");
        }

        /*
         * If the original filename length * 4 (for longest possible UTF-8
         * encoding) plus asciiExtension length is not greater than one less
         * than 255, then quickly return the concatenation without having to
         * encode anything.
         */
        if (filename.length() * 4 + asciiExtension.length() <= MAX_CAT_BYTES) {
            return filename + asciiExtension;
        }
        return maybePackSha(filename, asciiExtension);
    }

    /**
     * Measures the real UTF-8 size of {@code filename} and either returns the
     * plain concatenation (when it fits in {@link #MAX_CAT_BYTES}) or builds
     * a name of exactly {@link #MAX_BYTES} bytes made of a truncated
     * {@code filename}, optional {@code '_'} padding, the Base64 SHA-256 hash
     * of the (extension-stripped) filename, any carried-over original
     * extension, and finally {@code asciiExtension}.
     * @param filename a defined instance without a parent
     * @param asciiExtension a defined, ASCII-only instance
     * @return a defined instance
     * @throws IllegalArgumentException if no character of {@code filename}
     * can be kept because hash plus extension(s) leave no room for it
     */
    private static String maybePackSha(String filename, String asciiExtension) {

        int nBytes = filename.getBytes(StandardCharsets.UTF_8).length;
        if (nBytes + asciiExtension.length() <= MAX_CAT_BYTES) {
            // Here the UTF-8 encoding already allows for the new extension.
            return filename + asciiExtension;
        }

        /*
         * If filename has an ASCII extension already (of a reasonable length),
         * shift it to the new asciiExtension so that it won't be overwritten
         * by the packed hash.
         */
        int pos = filename.lastIndexOf('.');
        if (hasShortAsciiExtension(filename, pos)) {
            asciiExtension = filename.substring(pos) + asciiExtension;
            filename = filename.substring(0, pos);
            nBytes = filename.getBytes(StandardCharsets.UTF_8).length;
        }

        // Pack the hash just before the file extension.
        asciiExtension = sha256base64(filename) + asciiExtension;

        /*
         * Now trim the filename by code points until the full UTF-8 encoding
         * fits within MAX_BYTES.
         */
        int newLength = filename.length();
        while (nBytes + asciiExtension.length() > MAX_BYTES) {
            /*
             * Only trim while something is left. The name can already be
             * empty here (an empty original, or a dot-file such as ".abc"
             * whose whole name was shifted into the extension), in which case
             * codePointBefore(0) would fail with an index error instead of
             * the documented IllegalArgumentException.
             */
            if (newLength > 0) {
                int cp = filename.codePointBefore(newLength);
                int nChars = Character.charCount(cp);
                String c = filename.substring(newLength - nChars, newLength);
                nBytes -= c.getBytes(StandardCharsets.UTF_8).length;
                newLength -= nChars;
            }

            if (newLength <= 0) {
                throw new IllegalArgumentException("asciiExtension too long");
            }
        }

        /*
         * Pad if necessary to exactly MAX_BYTES. Dropping a multi-byte code
         * point can undershoot the limit by a few bytes.
         */
        int shortfall = MAX_BYTES - nBytes - asciiExtension.length();
        if (shortfall != 0) {
            char[] pad = new char[shortfall];
            Arrays.fill(pad, '_');
            asciiExtension = new String(pad) + asciiExtension;
        }

        return filename.substring(0, newLength) + asciiExtension;
    }

    /**
     * Determines whether the text of {@code filename} starting at the dot at
     * index {@code pos} is an extension worth preserving: more than just the
     * dot, shorter than {@link #MAX_SHIFTED_EXT_CHARS} characters (dot
     * included), and followed only by letters or digits no greater than
     * {@code 'z'}.
     * @param filename a defined instance
     * @param pos index of the last {@code '.'} in {@code filename}, or a
     * negative value if there is none
     * @return {@code true} if the extension qualifies
     */
    private static boolean hasShortAsciiExtension(String filename, int pos) {
        int extLength = filename.length() - pos;
        if (pos < 0 || extLength >= MAX_SHIFTED_EXT_CHARS || extLength <= 1) {
            return false;
        }
        for (int i = pos + 1; i < filename.length(); ++i) {
            char ch = filename.charAt(i);
            if (!Character.isLetterOrDigit(ch) || ch > 'z') {
                return false;
            }
        }
        return true;
    }

    /**
     * Computes the SHA-256 digest of the UTF-8 encoding of {@code value} and
     * renders it with the URL-and-filename-safe Base64 alphabet.
     * @param value a defined instance
     * @return a defined instance
     */
    private static String sha256base64(String value) {

        MessageDigest hasher;
        try {
            hasher = MessageDigest.getInstance("SHA-256");
        } catch (NoSuchAlgorithmException e) {
            /*
             * This will not happen since "Every implementation of the Java
             * platform is required to support the following standard
             * MessageDigest algorithms: MD5, SHA-1, SHA-256."
             */
            throw new RuntimeException(e);
        }

        byte[] digest = hasher.digest(value.getBytes(StandardCharsets.UTF_8));
        return encoder.encodeToString(digest);
    }
}