xref: /OpenGrok/opengrok-indexer/src/main/java/org/opengrok/indexer/util/StringUtils.java (revision 0ec550ccbbcccee5cc0a7caf1de73b7f1c4f3887)
1b5840353SAdam Hornáček /*
2b5840353SAdam Hornáček  * CDDL HEADER START
3b5840353SAdam Hornáček  *
4b5840353SAdam Hornáček  * The contents of this file are subject to the terms of the
5b5840353SAdam Hornáček  * Common Development and Distribution License (the "License").
6b5840353SAdam Hornáček  * You may not use this file except in compliance with the License.
7b5840353SAdam Hornáček  *
8b5840353SAdam Hornáček  * See LICENSE.txt included in this distribution for the specific
9b5840353SAdam Hornáček  * language governing permissions and limitations under the License.
10b5840353SAdam Hornáček  *
11b5840353SAdam Hornáček  * When distributing Covered Code, include this CDDL HEADER in each
12b5840353SAdam Hornáček  * file and include the License file at LICENSE.txt.
13b5840353SAdam Hornáček  * If applicable, add the following below this CDDL HEADER, with the
14b5840353SAdam Hornáček  * fields enclosed by brackets "[]" replaced with your own identifying
15b5840353SAdam Hornáček  * information: Portions Copyright [yyyy] [name of copyright owner]
16b5840353SAdam Hornáček  *
17b5840353SAdam Hornáček  * CDDL HEADER END
18b5840353SAdam Hornáček  */
19b5840353SAdam Hornáček 
20b5840353SAdam Hornáček /*
216c62ede9SAdam Hornacek  * Copyright (c) 2008, 2021, Oracle and/or its affiliates. All rights reserved.
225d9f3aa0SAdam Hornáček  * Portions Copyright (c) 2017, 2020, Chris Fraire <cfraire@me.com>.
23b5840353SAdam Hornáček  */
249805b761SAdam Hornáček package org.opengrok.indexer.util;
25b5840353SAdam Hornáček 
26b5840353SAdam Hornáček import java.util.regex.Matcher;
27b5840353SAdam Hornáček import java.util.regex.Pattern;
28b5840353SAdam Hornáček 
29b5840353SAdam Hornáček /**
30b5840353SAdam Hornáček  * Various String utility methods.
31b5840353SAdam Hornáček  *
32b5840353SAdam Hornáček  * @author austvik
33b5840353SAdam Hornáček  */
34b5840353SAdam Hornáček public final class StringUtils {
35b5840353SAdam Hornáček 
36b5840353SAdam Hornáček     /**
37ff44f24aSAdam Hornáček      * Matches a standard end-of-line indicator, identical to Common.lexh's {EOL}.
38b5840353SAdam Hornáček      */
3954544730SChris Fraire     public static final Pattern STANDARD_EOL = Pattern.compile("\\r\\n?|\\n");
40b5840353SAdam Hornáček 
41b5840353SAdam Hornáček     /**
42b5840353SAdam Hornáček      * Matches an apostrophe not following a backslash escape or following an
43b5840353SAdam Hornáček      * even number¹ of backslash escapes.
44b5840353SAdam Hornáček      * <p>
45b5840353SAdam Hornáček      * ¹See {@link RegexUtils#getNotFollowingEscapePattern()} for a caveat
46b5840353SAdam Hornáček      * about the backslash assertion.
47b5840353SAdam Hornáček      */
48b5840353SAdam Hornáček     public static final Pattern APOS_NO_BSESC =
49b5840353SAdam Hornáček         Pattern.compile("\\'" + RegexUtils.getNotFollowingEscapePattern());
50b5840353SAdam Hornáček 
51b5840353SAdam Hornáček     /**
52ff44f24aSAdam Hornáček      * Matches the close of a C comment.
53b5840353SAdam Hornáček      */
54ff44f24aSAdam Hornáček     public static final Pattern END_C_COMMENT = Pattern.compile("\\*\\/");
55b5840353SAdam Hornáček 
56b5840353SAdam Hornáček     /**
57ff44f24aSAdam Hornáček      * Matches one of the same possible characters as CommonPath.lexh's {FPath}.
58b5840353SAdam Hornáček      */
59ff44f24aSAdam Hornáček     private static final String FPATH_CHAR_PAT = "[a-zA-Z0-9_\\-\\./]";
60b5840353SAdam Hornáček 
616c62ede9SAdam Hornacek     private static final Pattern FPATH_CHAR_STARTSMATCH = Pattern.compile("^" + FPATH_CHAR_PAT);
62b5840353SAdam Hornáček 
63ff44f24aSAdam Hornáček     /** Private to enforce singleton. */
StringUtils()64b5840353SAdam Hornáček     private StringUtils() {
65b5840353SAdam Hornáček     }
66b5840353SAdam Hornáček 
67b5840353SAdam Hornáček     static final Pattern javaClassPattern =
68b5840353SAdam Hornáček         Pattern.compile("([a-z][A-Za-z]*\\.)+[A-Z][A-Za-z0-9]*");
69b5840353SAdam Hornáček     /**
7014bbb99cSChris Fraire      * Returns true if the string is possibly a Java class name, and only
7114bbb99cSChris Fraire      * matching a subset of possible class names to prevent false positives.
7214bbb99cSChris Fraire      * <p><ul>
7314bbb99cSChris Fraire      *     <li>class must be qualified with a package name</li>
7414bbb99cSChris Fraire      *     <li>package name must contain only letters and start lower case</li>
7514bbb99cSChris Fraire      *     <li>class name must be in CamelCase and start upper case</li>
7614bbb99cSChris Fraire      * </ul>
77b5840353SAdam Hornáček      *
78b5840353SAdam Hornáček      * @param s the string to be checked
79b5840353SAdam Hornáček      * @return true if string could be a java class name
80b5840353SAdam Hornáček     */
isPossiblyJavaClass(String s)81b5840353SAdam Hornáček     public static boolean isPossiblyJavaClass(String s) {
82b5840353SAdam Hornáček         return javaClassPattern.matcher(s).matches();
83b5840353SAdam Hornáček     }
84b5840353SAdam Hornáček 
85b5840353SAdam Hornáček     /**
86b5840353SAdam Hornáček      * Convert value in milliseconds to readable time.
87*0ec550ccSAdam Hornacek      * @param timeMs delta in milliseconds
88b5840353SAdam Hornáček      * @return human readable string
89b5840353SAdam Hornáček      */
getReadableTime(long timeMs)90*0ec550ccSAdam Hornacek     public static String getReadableTime(long timeMs) {
9112fc2901SChris Fraire         StringBuilder output = new StringBuilder();
92*0ec550ccSAdam Hornacek         long timeDelta = timeMs;
93b5840353SAdam Hornáček 
94*0ec550ccSAdam Hornacek         int milliseconds = (int) (timeDelta % 1000);
95*0ec550ccSAdam Hornacek         timeDelta /= 1000;
96*0ec550ccSAdam Hornacek         int seconds = (int) (timeDelta % 60);
97*0ec550ccSAdam Hornacek         timeDelta /= 60;
98*0ec550ccSAdam Hornacek         int minutes = (int) (timeDelta % 60);
99*0ec550ccSAdam Hornacek         timeDelta /= 60;
100*0ec550ccSAdam Hornacek         int hours = (int) (timeDelta % 24);
101*0ec550ccSAdam Hornacek         int days = (int) (timeDelta / 24);
102b5840353SAdam Hornáček 
103b5840353SAdam Hornáček         if (days != 0) {
10412fc2901SChris Fraire             output.append(days);
10512fc2901SChris Fraire             output.append(" day");
106b5840353SAdam Hornáček             if (days > 1) {
10712fc2901SChris Fraire                 output.append("s");
108b5840353SAdam Hornáček             }
109b5840353SAdam Hornáček         }
110b5840353SAdam Hornáček         if ((hours != 0) || (minutes != 0)) {
11112fc2901SChris Fraire             if (output.length() > 0) {
11212fc2901SChris Fraire                 // Use zero-padded hours here as it's longer than a day.
11312fc2901SChris Fraire                 output.append(String.format(" %02d:%02d:%02d", hours, minutes, seconds));
11412fc2901SChris Fraire             } else {
11512fc2901SChris Fraire                 // Don't pad hours if less than a day.
11612fc2901SChris Fraire                 output.append(String.format("%d:%02d:%02d", hours, minutes, seconds));
117b5840353SAdam Hornáček             }
11812fc2901SChris Fraire         } else if (output.length() > 0) {
11912fc2901SChris Fraire             /*
12012fc2901SChris Fraire              * If a day+ with zero hours and zero minutes, just report the days.
12112fc2901SChris Fraire              * E.g. "1 day", and not "1 day 35 ms".
12212fc2901SChris Fraire              */
12312fc2901SChris Fraire             return output.toString();
12412fc2901SChris Fraire         } else if (seconds != 0) {
12512fc2901SChris Fraire             output.append(String.format("%d.%d seconds", seconds, milliseconds));
12612fc2901SChris Fraire         } else if (milliseconds != 0) {
12712fc2901SChris Fraire             output.append(String.format("%d ms", milliseconds));
128b5840353SAdam Hornáček         }
129b5840353SAdam Hornáček 
13012fc2901SChris Fraire         return (output.length() == 0 ? "0 ms" : output.toString());
131b5840353SAdam Hornáček     }
132b5840353SAdam Hornáček 
133b5840353SAdam Hornáček     /**
134b5840353SAdam Hornáček      * Finds n-th index of a given substring in a string.
135b5840353SAdam Hornáček      *
136b5840353SAdam Hornáček      * @param str an original string
137b5840353SAdam Hornáček      * @param substr a substring to match
138b5840353SAdam Hornáček      * @param n n-th occurrence
139b5840353SAdam Hornáček      * @return the index of the first character of the substring in the original
140b5840353SAdam Hornáček      * string where the substring occurred n-th times in the string. If the n-th
141b5840353SAdam Hornáček      * candidate does not exist, -1 is returned.
142b5840353SAdam Hornáček      */
nthIndexOf(String str, String substr, int n)143b5840353SAdam Hornáček     public static int nthIndexOf(String str, String substr, int n) {
144b5840353SAdam Hornáček         int pos = -1;
145b5840353SAdam Hornáček         while (n > 0) {
146b5840353SAdam Hornáček             if (pos >= str.length()) {
147b5840353SAdam Hornáček                 return -1;
148b5840353SAdam Hornáček             }
149b5840353SAdam Hornáček             if ((pos = str.indexOf(substr, pos + 1)) == -1) {
150b5840353SAdam Hornáček                 break;
151b5840353SAdam Hornáček             }
152b5840353SAdam Hornáček             n--;
153b5840353SAdam Hornáček         }
154b5840353SAdam Hornáček         return pos;
155b5840353SAdam Hornáček     }
156b5840353SAdam Hornáček 
157b5840353SAdam Hornáček     /**
158b5840353SAdam Hornáček      * Count the number of ending pushback characters from a matched URI.
159b5840353SAdam Hornáček      * <p>
160b5840353SAdam Hornáček      * jflex does not support negative lookbehind, so modifying a URI matcher
161b5840353SAdam Hornáček      * to backtrack on ending characters that are otherwise normally valid
162b5840353SAdam Hornáček      * (e.g. '.') is difficult. Instead, this method allows counting and
163b5840353SAdam Hornáček      * pushing back.
164b5840353SAdam Hornáček      * @param value the URI to test
165b5840353SAdam Hornáček      * @return the number of characters greater than or equal to zero which
166b5840353SAdam Hornáček      * could be pushed back.
167b5840353SAdam Hornáček      */
countURIEndingPushback(String value)168b5840353SAdam Hornáček     public static int countURIEndingPushback(String value) {
169b5840353SAdam Hornáček         int n = 0;
170b5840353SAdam Hornáček         for (int i = value.length() - 1; i >= 0; --i) {
171b5840353SAdam Hornáček             char c = value.charAt(i);
172b5840353SAdam Hornáček             if (c == '.') {
173b5840353SAdam Hornáček                 ++n;
174b5840353SAdam Hornáček             } else {
175b5840353SAdam Hornáček                 break;
176b5840353SAdam Hornáček             }
177b5840353SAdam Hornáček         }
178b5840353SAdam Hornáček         return n;
179b5840353SAdam Hornáček     }
180b5840353SAdam Hornáček 
181b5840353SAdam Hornáček     /**
182b5840353SAdam Hornáček      * Determines if the specified {@code pattern} matches in the specified
183b5840353SAdam Hornáček      * {@code value}.
184b5840353SAdam Hornáček      * @param value the string to inspect
185b5840353SAdam Hornáček      * @param pattern the pattern to match
186b5840353SAdam Hornáček      * @return the index of the first occurrence of the specified pattern, or
187b5840353SAdam Hornáček      * -1 if there is no such occurrence
188b5840353SAdam Hornáček      */
patindexOf(String value, Pattern pattern)189b5840353SAdam Hornáček     public static int patindexOf(String value, Pattern pattern) {
190b5840353SAdam Hornáček         Matcher m = pattern.matcher(value);
191a72324b1SAdam Hornáček         if (!m.find()) {
192a72324b1SAdam Hornáček             return -1;
193a72324b1SAdam Hornáček         }
194b5840353SAdam Hornáček         return m.start();
195b5840353SAdam Hornáček     }
196b5840353SAdam Hornáček 
197b5840353SAdam Hornáček     /**
198b5840353SAdam Hornáček      * Determines if the {@code value} starts with a character in
199b5840353SAdam Hornáček      * CommonPath.lexh's {FPath}.
200b5840353SAdam Hornáček      * @param value the input to test
201b5840353SAdam Hornáček      * @return true if {@code value} matches at its start
202b5840353SAdam Hornáček      */
startsWithFpathChar(String value)203b5840353SAdam Hornáček     public static boolean startsWithFpathChar(String value) {
204b5840353SAdam Hornáček         return FPATH_CHAR_STARTSMATCH.matcher(value).matches();
205b5840353SAdam Hornáček     }
206b5840353SAdam Hornáček 
207b5840353SAdam Hornáček     /**
208b5840353SAdam Hornáček      * Determines if the specified pattern, {@code pat}, matches the specified
209b5840353SAdam Hornáček      * {@code capture}, and computes an eligible pushback.
210b5840353SAdam Hornáček      * @param capture a defined input
211b5840353SAdam Hornáček      * @param pat a pattern, or null to skip computation
212b5840353SAdam Hornáček      * @return a positive value if {@code pat} matches in {@code capture} at or
213b5840353SAdam Hornáček      * after the second character to indicate the number of characters to
214b5840353SAdam Hornáček      * pushback including the first-matched character; otherwise 0 to indicate
215b5840353SAdam Hornáček      * no match or a match at the 0-th character. (The 0-th chracter is
216b5840353SAdam Hornáček      * ineligible for fear of looping non-stop upon pushing back the entire
217b5840353SAdam Hornáček      * {@code yytext()}.)
218b5840353SAdam Hornáček      */
countPushback(String capture, Pattern pat)219b5840353SAdam Hornáček     public static int countPushback(String capture, Pattern pat) {
220b5840353SAdam Hornáček         if (pat != null) {
221b5840353SAdam Hornáček             int o = StringUtils.patindexOf(capture, pat);
222b5840353SAdam Hornáček             if (o >= 0) {
223b5840353SAdam Hornáček                 int n = capture.length() - o;
224b5840353SAdam Hornáček                 // Push back if positive, but not if equal to the full length.
225a72324b1SAdam Hornáček                 if (n > 0 && n < capture.length()) {
226a72324b1SAdam Hornáček                     return n;
227a72324b1SAdam Hornáček                 }
228b5840353SAdam Hornáček             }
229b5840353SAdam Hornáček         }
230b5840353SAdam Hornáček         return 0;
231b5840353SAdam Hornáček     }
2326c593e21SChris Fraire 
2336c593e21SChris Fraire     /**
2346c593e21SChris Fraire      * Determine the length of the next whitespace- or ISO control
2356c593e21SChris Fraire      * character-related sequence within a string.
2366c593e21SChris Fraire      * @param str a defined instance
2376c593e21SChris Fraire      * @param off the starting offset within {@code str}
2386c593e21SChris Fraire      * @param shouldMatch a value indicating whether to match all contiguous
2396c593e21SChris Fraire      *                    whitespace or ISO control characters ({@code true}) or
2406c593e21SChris Fraire      *                    all contiguous non-whitespace and non-control
2416c593e21SChris Fraire      *                    characters ({@code false}) starting at {@code off}
2426c593e21SChris Fraire      * @return a length greater than or equal to zero
2436c593e21SChris Fraire      */
whitespaceOrControlLength(String str, int off, boolean shouldMatch)2446c593e21SChris Fraire     public static int whitespaceOrControlLength(String str, int off, boolean shouldMatch) {
2456c593e21SChris Fraire         int i = 0;
2466c593e21SChris Fraire         while (off + i < str.length()) {
2476c593e21SChris Fraire             int cp = Character.codePointAt(str, off + i);
2486c593e21SChris Fraire             if ((Character.isWhitespace(cp) || Character.isISOControl(cp)) != shouldMatch) {
2496c593e21SChris Fraire                 return i;
2506c593e21SChris Fraire             }
2516c593e21SChris Fraire             i += Character.charCount(cp);
2526c593e21SChris Fraire         }
2536c593e21SChris Fraire         return str.length() - off;
2546c593e21SChris Fraire     }
255b5840353SAdam Hornáček }
256