14da26a1eSChris Fraire /* 24da26a1eSChris Fraire * CDDL HEADER START 34da26a1eSChris Fraire * 44da26a1eSChris Fraire * The contents of this file are subject to the terms of the 54da26a1eSChris Fraire * Common Development and Distribution License (the "License"). 64da26a1eSChris Fraire * You may not use this file except in compliance with the License. 74da26a1eSChris Fraire * 84da26a1eSChris Fraire * See LICENSE.txt included in this distribution for the specific 94da26a1eSChris Fraire * language governing permissions and limitations under the License. 104da26a1eSChris Fraire * 114da26a1eSChris Fraire * When distributing Covered Code, include this CDDL HEADER in each 124da26a1eSChris Fraire * file and include the License file at LICENSE.txt. 134da26a1eSChris Fraire * If applicable, add the following below this CDDL HEADER, with the 144da26a1eSChris Fraire * fields enclosed by brackets "[]" replaced with your own identifying 154da26a1eSChris Fraire * information: Portions Copyright [yyyy] [name of copyright owner] 164da26a1eSChris Fraire * 174da26a1eSChris Fraire * CDDL HEADER END 184da26a1eSChris Fraire */ 194da26a1eSChris Fraire 204da26a1eSChris Fraire /* 214da26a1eSChris Fraire * Copyright (c) 2018, Chris Fraire <cfraire@me.com>. 224da26a1eSChris Fraire */ 234da26a1eSChris Fraire package org.opengrok.indexer.util; 244da26a1eSChris Fraire 254da26a1eSChris Fraire import java.io.File; 264da26a1eSChris Fraire import java.nio.charset.StandardCharsets; 274da26a1eSChris Fraire import java.security.MessageDigest; 284da26a1eSChris Fraire import java.security.NoSuchAlgorithmException; 294da26a1eSChris Fraire import java.util.Arrays; 304da26a1eSChris Fraire import java.util.Base64; 314da26a1eSChris Fraire 324da26a1eSChris Fraire /** 334da26a1eSChris Fraire * Represents a utility class for creating a filename to operate in tandem with 344da26a1eSChris Fraire * an original filename by adding a new file extension but limiting the length 354da26a1eSChris Fraire * of the new filename to 255 UTF-8 encoded bytes if necessary by truncating 364da26a1eSChris Fraire * and packing in a Base64-encoded SHA-256 hash of the original file name. 374da26a1eSChris Fraire */ 384da26a1eSChris Fraire public class TandemFilename { 394da26a1eSChris Fraire 404da26a1eSChris Fraire private static final int MAX_BYTES = 255; 414da26a1eSChris Fraire 424da26a1eSChris Fraire /** 434da26a1eSChris Fraire * One fewer than {@link #MAX_BYTES} as a cap for simple concatenation to 444da26a1eSChris Fraire * avoid the possibility of easily fabricating a collision against this 454da26a1eSChris Fraire * algorithm. I.e., a 255 byte tandem filename will always include a 464da26a1eSChris Fraire * computed hash and not just be the concatenation of original filename 474da26a1eSChris Fraire * plus new extension. 484da26a1eSChris Fraire */ 494da26a1eSChris Fraire private static final int MAX_CAT_BYTES = MAX_BYTES - 1; 504da26a1eSChris Fraire 514da26a1eSChris Fraire /** 524da26a1eSChris Fraire * "Instances of Base64.Encoder class are safe for use by multiple 534da26a1eSChris Fraire * concurrent threads." --Oracle. 544da26a1eSChris Fraire */ 554da26a1eSChris Fraire private static final Base64.Encoder encoder = Base64.getUrlEncoder(); 564da26a1eSChris Fraire 57*ff44f24aSAdam Hornáček /** Private to enforce static. */ TandemFilename()584da26a1eSChris Fraire private TandemFilename() { 594da26a1eSChris Fraire } 604da26a1eSChris Fraire 614da26a1eSChris Fraire /** 624da26a1eSChris Fraire * Appends an ASCII extension to the specified {@code filename}, truncating 634da26a1eSChris Fraire * and packing in a SHA-256 hash if the UTF-8 encoding would exceed 254 644da26a1eSChris Fraire * bytes and arriving at a final size of 255 bytes in that special case. 654da26a1eSChris Fraire * @param filename a defined instance 664da26a1eSChris Fraire * @param asciiExtension a defined instance that is expected to be only 674da26a1eSChris Fraire * ASCII so that its UTF-8 form is the same length 684da26a1eSChris Fraire * @return a transformed filename whose UTF-8 encoding is not more than 255 694da26a1eSChris Fraire * bytes. 704da26a1eSChris Fraire * @throws IllegalArgumentException thrown if {@code filename} has a 714da26a1eSChris Fraire * parent or if {@code asciiExtension} is too long to allow packing a 724da26a1eSChris Fraire * SHA-256 hash in the transformation. 734da26a1eSChris Fraire */ join(String filename, String asciiExtension)744da26a1eSChris Fraire public static String join(String filename, String asciiExtension) { 754da26a1eSChris Fraire 764da26a1eSChris Fraire File file = new File(filename); 774da26a1eSChris Fraire if (file.getParent() != null) { 784da26a1eSChris Fraire throw new IllegalArgumentException("filename can't have parent"); 794da26a1eSChris Fraire } 804da26a1eSChris Fraire 814da26a1eSChris Fraire /* 824da26a1eSChris Fraire * If the original filename length * 4 (for longest possible UTF-8 834da26a1eSChris Fraire * encoding) plus asciiExtension length is not greater than one less 844da26a1eSChris Fraire * than 255, then quickly return the concatenation. 854da26a1eSChris Fraire */ 864da26a1eSChris Fraire if (filename.length() * 4 + asciiExtension.length() <= MAX_CAT_BYTES) { 874da26a1eSChris Fraire return filename + asciiExtension; 884da26a1eSChris Fraire } 894da26a1eSChris Fraire return maybePackSha(filename, asciiExtension); 904da26a1eSChris Fraire } 914da26a1eSChris Fraire maybePackSha(String filename, String asciiExtension)924da26a1eSChris Fraire private static String maybePackSha(String filename, String asciiExtension) { 934da26a1eSChris Fraire 944da26a1eSChris Fraire byte[] uFilename = filename.getBytes(StandardCharsets.UTF_8); 954da26a1eSChris Fraire int nBytes = uFilename.length; 964da26a1eSChris Fraire if (nBytes + asciiExtension.length() <= MAX_CAT_BYTES) { 974da26a1eSChris Fraire // Here the UTF-8 encoding already allows for the new extension. 984da26a1eSChris Fraire return filename + asciiExtension; 994da26a1eSChris Fraire } 1004da26a1eSChris Fraire 1014da26a1eSChris Fraire /* 1024da26a1eSChris Fraire * If filename has an ASCII extension already (of a reasonable length), 1034da26a1eSChris Fraire * shift it to the new asciiExtension so that it won't be overwritten 1044da26a1eSChris Fraire * by the packed hash. 1054da26a1eSChris Fraire */ 1064da26a1eSChris Fraire int pos = filename.lastIndexOf('.'); 1074da26a1eSChris Fraire int extLength = filename.length() - pos; 1084da26a1eSChris Fraire if (pos >= 0 && extLength < 30 && extLength > 1) { 1094da26a1eSChris Fraire int i; 1104da26a1eSChris Fraire for (i = pos + 1; i < filename.length(); ++i) { 1114da26a1eSChris Fraire char ch = filename.charAt(i); 1124da26a1eSChris Fraire if (!Character.isLetterOrDigit(ch) || ch > 'z') { 1134da26a1eSChris Fraire break; 1144da26a1eSChris Fraire } 1154da26a1eSChris Fraire } 1164da26a1eSChris Fraire if (i >= filename.length()) { 1174da26a1eSChris Fraire // By this point, we affirmed a letters/numbers extension. 1184da26a1eSChris Fraire asciiExtension = filename.substring(pos) + asciiExtension; 1194da26a1eSChris Fraire filename = filename.substring(0, pos); 1204da26a1eSChris Fraire uFilename = filename.getBytes(StandardCharsets.UTF_8); 1214da26a1eSChris Fraire nBytes = uFilename.length; 1224da26a1eSChris Fraire } 1234da26a1eSChris Fraire } 1244da26a1eSChris Fraire 1254da26a1eSChris Fraire // Pack the hash just before the file extension. 1264da26a1eSChris Fraire asciiExtension = sha256base64(filename) + asciiExtension; 1274da26a1eSChris Fraire 1284da26a1eSChris Fraire /* 1294da26a1eSChris Fraire * Now trim the filename by code points until the full UTF-8 encoding 1304da26a1eSChris Fraire * fits within MAX_BYTES. 1314da26a1eSChris Fraire */ 1324da26a1eSChris Fraire int newLength = filename.length(); 1334da26a1eSChris Fraire while (nBytes + asciiExtension.length() > MAX_BYTES) { 1344da26a1eSChris Fraire int cp = filename.codePointBefore(newLength); 1354da26a1eSChris Fraire int nChars = Character.charCount(cp); 1364da26a1eSChris Fraire String c = filename.substring(newLength - nChars, newLength); 1374da26a1eSChris Fraire nBytes -= c.getBytes(StandardCharsets.UTF_8).length; 1384da26a1eSChris Fraire newLength -= nChars; 1394da26a1eSChris Fraire 1404da26a1eSChris Fraire if (newLength <= 0) { 1414da26a1eSChris Fraire throw new IllegalArgumentException("asciiExtension too long"); 1424da26a1eSChris Fraire } 1434da26a1eSChris Fraire } 1444da26a1eSChris Fraire 1454da26a1eSChris Fraire // Pad if necessary to exactly MAX_BYTES. 1464da26a1eSChris Fraire if (nBytes + asciiExtension.length() != MAX_BYTES) { 1474da26a1eSChris Fraire char[] pad = new char[MAX_BYTES - nBytes - asciiExtension.length()]; 1484da26a1eSChris Fraire Arrays.fill(pad, '_'); 1494da26a1eSChris Fraire asciiExtension = new String(pad) + asciiExtension; 1504da26a1eSChris Fraire } 1514da26a1eSChris Fraire 1524da26a1eSChris Fraire return filename.substring(0, newLength) + asciiExtension; 1534da26a1eSChris Fraire } 1544da26a1eSChris Fraire sha256base64(String value)1554da26a1eSChris Fraire private static String sha256base64(String value) { 1564da26a1eSChris Fraire 1574da26a1eSChris Fraire MessageDigest hasher; 1584da26a1eSChris Fraire try { 1594da26a1eSChris Fraire hasher = MessageDigest.getInstance("SHA-256"); 1604da26a1eSChris Fraire } catch (NoSuchAlgorithmException e) { 1614da26a1eSChris Fraire /* 1624da26a1eSChris Fraire * This will not happen since "Every implementation of the Java 1634da26a1eSChris Fraire * platform is required to support the following standard 1644da26a1eSChris Fraire * MessageDigest algorithms: MD5, SHA-1, SHA-256." 1654da26a1eSChris Fraire */ 1664da26a1eSChris Fraire throw new RuntimeException(e); 1674da26a1eSChris Fraire } 1684da26a1eSChris Fraire 1694da26a1eSChris Fraire byte[] digest = hasher.digest(value.getBytes(StandardCharsets.UTF_8)); 1704da26a1eSChris Fraire return encoder.encodeToString(digest); 1714da26a1eSChris Fraire } 1724da26a1eSChris Fraire } 173