xref: /OpenGrok/opengrok-indexer/src/main/java/org/opengrok/indexer/util/StringUtils.java (revision 0ec550ccbbcccee5cc0a7caf1de73b7f1c4f3887)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * See LICENSE.txt included in this distribution for the specific
9  * language governing permissions and limitations under the License.
10  *
11  * When distributing Covered Code, include this CDDL HEADER in each
12  * file and include the License file at LICENSE.txt.
13  * If applicable, add the following below this CDDL HEADER, with the
14  * fields enclosed by brackets "[]" replaced with your own identifying
15  * information: Portions Copyright [yyyy] [name of copyright owner]
16  *
17  * CDDL HEADER END
18  */
19 
20 /*
21  * Copyright (c) 2008, 2021, Oracle and/or its affiliates. All rights reserved.
22  * Portions Copyright (c) 2017, 2020, Chris Fraire <cfraire@me.com>.
23  */
24 package org.opengrok.indexer.util;
25 
26 import java.util.regex.Matcher;
27 import java.util.regex.Pattern;
28 
29 /**
30  * Various String utility methods.
31  *
32  * @author austvik
33  */
34 public final class StringUtils {
35 
36     /**
37      * Matches a standard end-of-line indicator, identical to Common.lexh's {EOL}.
38      */
39     public static final Pattern STANDARD_EOL = Pattern.compile("\\r\\n?|\\n");
40 
41     /**
42      * Matches an apostrophe not following a backslash escape or following an
43      * even number¹ of backslash escapes.
44      * <p>
45      * ¹See {@link RegexUtils#getNotFollowingEscapePattern()} for a caveat
46      * about the backslash assertion.
47      */
48     public static final Pattern APOS_NO_BSESC =
49         Pattern.compile("\\'" + RegexUtils.getNotFollowingEscapePattern());
50 
51     /**
52      * Matches the close of a C comment.
53      */
54     public static final Pattern END_C_COMMENT = Pattern.compile("\\*\\/");
55 
56     /**
57      * Matches one of the same possible characters as CommonPath.lexh's {FPath}.
58      */
59     private static final String FPATH_CHAR_PAT = "[a-zA-Z0-9_\\-\\./]";
60 
61     private static final Pattern FPATH_CHAR_STARTSMATCH = Pattern.compile("^" + FPATH_CHAR_PAT);
62 
63     /** Private to enforce singleton. */
StringUtils()64     private StringUtils() {
65     }
66 
67     static final Pattern javaClassPattern =
68         Pattern.compile("([a-z][A-Za-z]*\\.)+[A-Z][A-Za-z0-9]*");
69     /**
70      * Returns true if the string is possibly a Java class name, and only
71      * matching a subset of possible class names to prevent false positives.
72      * <p><ul>
73      *     <li>class must be qualified with a package name</li>
74      *     <li>package name must contain only letters and start lower case</li>
75      *     <li>class name must be in CamelCase and start upper case</li>
76      * </ul>
77      *
78      * @param s the string to be checked
79      * @return true if string could be a java class name
80     */
isPossiblyJavaClass(String s)81     public static boolean isPossiblyJavaClass(String s) {
82         return javaClassPattern.matcher(s).matches();
83     }
84 
85     /**
86      * Convert value in milliseconds to readable time.
87      * @param timeMs delta in milliseconds
88      * @return human readable string
89      */
getReadableTime(long timeMs)90     public static String getReadableTime(long timeMs) {
91         StringBuilder output = new StringBuilder();
92         long timeDelta = timeMs;
93 
94         int milliseconds = (int) (timeDelta % 1000);
95         timeDelta /= 1000;
96         int seconds = (int) (timeDelta % 60);
97         timeDelta /= 60;
98         int minutes = (int) (timeDelta % 60);
99         timeDelta /= 60;
100         int hours = (int) (timeDelta % 24);
101         int days = (int) (timeDelta / 24);
102 
103         if (days != 0) {
104             output.append(days);
105             output.append(" day");
106             if (days > 1) {
107                 output.append("s");
108             }
109         }
110         if ((hours != 0) || (minutes != 0)) {
111             if (output.length() > 0) {
112                 // Use zero-padded hours here as it's longer than a day.
113                 output.append(String.format(" %02d:%02d:%02d", hours, minutes, seconds));
114             } else {
115                 // Don't pad hours if less than a day.
116                 output.append(String.format("%d:%02d:%02d", hours, minutes, seconds));
117             }
118         } else if (output.length() > 0) {
119             /*
120              * If a day+ with zero hours and zero minutes, just report the days.
121              * E.g. "1 day", and not "1 day 35 ms".
122              */
123             return output.toString();
124         } else if (seconds != 0) {
125             output.append(String.format("%d.%d seconds", seconds, milliseconds));
126         } else if (milliseconds != 0) {
127             output.append(String.format("%d ms", milliseconds));
128         }
129 
130         return (output.length() == 0 ? "0 ms" : output.toString());
131     }
132 
133     /**
134      * Finds n-th index of a given substring in a string.
135      *
136      * @param str an original string
137      * @param substr a substring to match
138      * @param n n-th occurrence
139      * @return the index of the first character of the substring in the original
140      * string where the substring occurred n-th times in the string. If the n-th
141      * candidate does not exist, -1 is returned.
142      */
nthIndexOf(String str, String substr, int n)143     public static int nthIndexOf(String str, String substr, int n) {
144         int pos = -1;
145         while (n > 0) {
146             if (pos >= str.length()) {
147                 return -1;
148             }
149             if ((pos = str.indexOf(substr, pos + 1)) == -1) {
150                 break;
151             }
152             n--;
153         }
154         return pos;
155     }
156 
157     /**
158      * Count the number of ending pushback characters from a matched URI.
159      * <p>
160      * jflex does not support negative lookbehind, so modifying a URI matcher
161      * to backtrack on ending characters that are otherwise normally valid
162      * (e.g. '.') is difficult. Instead, this method allows counting and
163      * pushing back.
164      * @param value the URI to test
165      * @return the number of characters greater than or equal to zero which
166      * could be pushed back.
167      */
countURIEndingPushback(String value)168     public static int countURIEndingPushback(String value) {
169         int n = 0;
170         for (int i = value.length() - 1; i >= 0; --i) {
171             char c = value.charAt(i);
172             if (c == '.') {
173                 ++n;
174             } else {
175                 break;
176             }
177         }
178         return n;
179     }
180 
181     /**
182      * Determines if the specified {@code pattern} matches in the specified
183      * {@code value}.
184      * @param value the string to inspect
185      * @param pattern the pattern to match
186      * @return the index of the first occurrence of the specified pattern, or
187      * -1 if there is no such occurrence
188      */
patindexOf(String value, Pattern pattern)189     public static int patindexOf(String value, Pattern pattern) {
190         Matcher m = pattern.matcher(value);
191         if (!m.find()) {
192             return -1;
193         }
194         return m.start();
195     }
196 
197     /**
198      * Determines if the {@code value} starts with a character in
199      * CommonPath.lexh's {FPath}.
200      * @param value the input to test
201      * @return true if {@code value} matches at its start
202      */
startsWithFpathChar(String value)203     public static boolean startsWithFpathChar(String value) {
204         return FPATH_CHAR_STARTSMATCH.matcher(value).matches();
205     }
206 
207     /**
208      * Determines if the specified pattern, {@code pat}, matches the specified
209      * {@code capture}, and computes an eligible pushback.
210      * @param capture a defined input
211      * @param pat a pattern, or null to skip computation
212      * @return a positive value if {@code pat} matches in {@code capture} at or
213      * after the second character to indicate the number of characters to
214      * pushback including the first-matched character; otherwise 0 to indicate
215      * no match or a match at the 0-th character. (The 0-th chracter is
216      * ineligible for fear of looping non-stop upon pushing back the entire
217      * {@code yytext()}.)
218      */
countPushback(String capture, Pattern pat)219     public static int countPushback(String capture, Pattern pat) {
220         if (pat != null) {
221             int o = StringUtils.patindexOf(capture, pat);
222             if (o >= 0) {
223                 int n = capture.length() - o;
224                 // Push back if positive, but not if equal to the full length.
225                 if (n > 0 && n < capture.length()) {
226                     return n;
227                 }
228             }
229         }
230         return 0;
231     }
232 
233     /**
234      * Determine the length of the next whitespace- or ISO control
235      * character-related sequence within a string.
236      * @param str a defined instance
237      * @param off the starting offset within {@code str}
238      * @param shouldMatch a value indicating whether to match all contiguous
239      *                    whitespace or ISO control characters ({@code true}) or
240      *                    all contiguous non-whitespace and non-control
241      *                    characters ({@code false}) starting at {@code off}
242      * @return a length greater than or equal to zero
243      */
whitespaceOrControlLength(String str, int off, boolean shouldMatch)244     public static int whitespaceOrControlLength(String str, int off, boolean shouldMatch) {
245         int i = 0;
246         while (off + i < str.length()) {
247             int cp = Character.codePointAt(str, off + i);
248             if ((Character.isWhitespace(cp) || Character.isISOControl(cp)) != shouldMatch) {
249                 return i;
250             }
251             i += Character.charCount(cp);
252         }
253         return str.length() - off;
254     }
255 }
256