xref: /OpenGrok/opengrok-indexer/src/main/java/org/opengrok/indexer/util/TandemFilename.java (revision 5d9f3aa0ca3da3a714233f987fa732f62c0965f6)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * See LICENSE.txt included in this distribution for the specific
 * language governing permissions and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at LICENSE.txt.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2018, Chris Fraire <cfraire@me.com>.
 */
package org.opengrok.indexer.util;

import java.io.File;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Arrays;
import java.util.Base64;

/**
 * Represents a utility class for creating a filename to operate in tandem with
 * an original filename by adding a new file extension but limiting the length
 * of the new filename to 255 UTF-8 encoded bytes if necessary by truncating
 * and packing in a Base64-encoded SHA-256 hash of the original file name.
 */
public class TandemFilename {

    /** Upper bound, in UTF-8 encoded bytes, of any filename produced here. */
    private static final int MAX_BYTES = 255;

    /**
     * One fewer than {@link #MAX_BYTES} as a cap for simple concatenation to
     * avoid the possibility of easily fabricating a collision against this
     * algorithm. I.e., a 255 byte tandem filename will always include a
     * computed hash and not just be the concatenation of original filename
     * plus new extension.
     */
    private static final int MAX_CAT_BYTES = MAX_BYTES - 1;

    /**
     * "Instances of Base64.Encoder class are safe for use by multiple
     * concurrent threads." --Oracle.
     */
    private static final Base64.Encoder encoder = Base64.getUrlEncoder();

    /** Private to enforce static. */
    private TandemFilename() {
    }

    /**
     * Appends an ASCII extension to the specified {@code filename}, truncating
     * and packing in a SHA-256 hash if the UTF-8 encoding would exceed 254
     * bytes and arriving at a final size of 255 bytes in that special case.
     * @param filename a defined instance
     * @param asciiExtension a defined instance that is expected to be only
     *                       ASCII so that its UTF-8 form is the same length
     * @return a transformed filename whose UTF-8 encoding is not more than 255
     * bytes.
     * @throws IllegalArgumentException thrown if {@code filename} has a
     * parent or if {@code asciiExtension} is too long to allow packing a
     * SHA-256 hash in the transformation while keeping at least one character
     * of {@code filename} (which includes the case of an empty
     * {@code filename} that cannot be simply concatenated).
     */
    public static String join(String filename, String asciiExtension) {

        File file = new File(filename);
        if (file.getParent() != null) {
            throw new IllegalArgumentException("filename can't have parent");
        }

        /*
         * If the original filename length * 4 (for longest possible UTF-8
         * encoding) plus asciiExtension length is not greater than one less
         * than 255, then quickly return the concatenation.
         */
        if (filename.length() * 4 + asciiExtension.length() <= MAX_CAT_BYTES) {
            return filename + asciiExtension;
        }
        return maybePackSha(filename, asciiExtension);
    }

    /**
     * Does the precise (UTF-8 measured) work for
     * {@link #join(String, String)}: returns the simple concatenation if its
     * encoding fits within {@link #MAX_CAT_BYTES}; otherwise builds a name of
     * exactly {@link #MAX_BYTES} bytes consisting of a truncated
     * {@code filename}, optional {@code '_'} padding, the Base64 SHA-256 hash,
     * any short ASCII letters/digits extension that {@code filename} already
     * had, and finally {@code asciiExtension}.
     * @param filename a defined instance without a parent
     * @param asciiExtension a defined instance expected to be only ASCII
     * @return a defined instance
     * @throws IllegalArgumentException if no character of {@code filename}
     * can be retained alongside the hash and extension(s)
     */
    private static String maybePackSha(String filename, String asciiExtension) {

        byte[] uFilename = filename.getBytes(StandardCharsets.UTF_8);
        int nBytes = uFilename.length;
        if (nBytes + asciiExtension.length() <= MAX_CAT_BYTES) {
            // Here the UTF-8 encoding already allows for the new extension.
            return filename + asciiExtension;
        }

        /*
         * If filename has an ASCII extension already (of a reasonable length),
         * shift it to the new asciiExtension so that it won't be overwritten
         * by the packed hash.
         */
        int pos = filename.lastIndexOf('.');
        int extLength = filename.length() - pos;
        if (pos >= 0 && extLength < 30 && extLength > 1) {
            int i;
            for (i = pos + 1; i < filename.length(); ++i) {
                char ch = filename.charAt(i);
                // Only ASCII letters and digits qualify: anything above 'z'
                // is rejected even if Unicode deems it a letter or digit.
                if (!Character.isLetterOrDigit(ch) || ch > 'z') {
                    break;
                }
            }
            if (i >= filename.length()) {
                // By this point, we affirmed a letters/numbers extension.
                asciiExtension = filename.substring(pos) + asciiExtension;
                filename = filename.substring(0, pos);
                uFilename = filename.getBytes(StandardCharsets.UTF_8);
                nBytes = uFilename.length;
            }
        }

        // Pack the hash just before the file extension.
        asciiExtension = sha256base64(filename) + asciiExtension;

        int newLength = filename.length();

        /*
         * An empty filename (either passed in as such or left empty after its
         * extension was shifted above) has nothing to trim, so report it as
         * the documented IllegalArgumentException instead of letting
         * codePointBefore(0) raise an IndexOutOfBoundsException below.
         */
        if (newLength <= 0 && nBytes + asciiExtension.length() > MAX_BYTES) {
            throw new IllegalArgumentException("asciiExtension too long");
        }

        /*
         * Now trim the filename by code points until the full UTF-8 encoding
         * fits within MAX_BYTES.
         */
        while (nBytes + asciiExtension.length() > MAX_BYTES) {
            int cp = filename.codePointBefore(newLength);
            int nChars = Character.charCount(cp);
            String c = filename.substring(newLength - nChars, newLength);
            nBytes -= c.getBytes(StandardCharsets.UTF_8).length;
            newLength -= nChars;

            // At least one character of the filename must survive trimming.
            if (newLength <= 0) {
                throw new IllegalArgumentException("asciiExtension too long");
            }
        }

        // Pad if necessary to exactly MAX_BYTES (trimming a multi-byte code
        // point can undershoot the limit by a few bytes).
        if (nBytes + asciiExtension.length() != MAX_BYTES) {
            char[] pad = new char[MAX_BYTES - nBytes - asciiExtension.length()];
            Arrays.fill(pad, '_');
            asciiExtension = new String(pad) + asciiExtension;
        }

        return filename.substring(0, newLength) + asciiExtension;
    }

    /**
     * Computes the SHA-256 digest of the UTF-8 encoding of {@code value} and
     * returns it encoded with the URL-safe Base64 {@link #encoder}.
     * @param value a defined instance
     * @return a defined instance
     */
    private static String sha256base64(String value) {

        MessageDigest hasher;
        try {
            hasher = MessageDigest.getInstance("SHA-256");
        } catch (NoSuchAlgorithmException e) {
            /*
             * This will not happen since "Every implementation of the Java
             * platform is required to support the following standard
             * MessageDigest algorithms: MD5, SHA-1, SHA-256."
             */
            throw new RuntimeException(e);
        }

        byte[] digest = hasher.digest(value.getBytes(StandardCharsets.UTF_8));
        return encoder.encodeToString(digest);
    }
}
173