xref: /Lucene/dev-tools/scripts/reproduceJenkinsFailures.py (revision 3edfeb5eb224344e35f3454f5d51288ab05452c1)
1# Licensed to the Apache Software Foundation (ASF) under one or more
2# contributor license agreements.  See the NOTICE file distributed with
3# this work for additional information regarding copyright ownership.
4# The ASF licenses this file to You under the Apache License, Version 2.0
5# (the "License"); you may not use this file except in compliance with
6# the License.  You may obtain a copy of the License at
7#
8#     http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15
16import argparse
17import http.client
18import os
19import re
20import shutil
21import ssl
22import subprocess
23import sys
24import time
25import traceback
26import urllib.error
27import urllib.request
28from textwrap import dedent
29
# Example: Checking out Revision e441a99009a557f82ea17ee9f9c3e9b89c75cee6 (refs/remotes/origin/master)
# Group 1 is the Git revision, group 2 the branch name below refs/remotes/origin/.
reGitRev = re.compile(r'Checking out Revision (\S+)\s+\(refs/remotes/origin/([^)]+)')

#         Policeman Jenkins example:           [Lucene-Solr-7.x-Linux] $ /var/lib/jenkins/tools/hudson.tasks.Ant_AntInstallation/ANT_1.8.2/bin/ant "-Dargs=-XX:-UseCompressedOops -XX:+UseConcMarkSweepGC" jenkins-hourly
# Policeman Jenkins Windows example:      [Lucene-Solr-master-Windows] $ cmd.exe /C "C:\Users\jenkins\tools\hudson.tasks.Ant_AntInstallation\ANT_1.8.2\bin\ant.bat '"-Dargs=-client -XX:+UseConcMarkSweepGC"' jenkins-hourly && exit %%ERRORLEVEL%%"
#               ASF Jenkins example:        [Lucene-Solr-Tests-master] $ /home/jenkins/tools/ant/apache-ant-1.8.4/bin/ant jenkins-hourly
#       ASF Jenkins nightly example:                        [checkout] $ /home/jenkins/tools/ant/apache-ant-1.8.4/bin/ant -file build.xml -Dtests.multiplier=2 -Dtests.linedocsfile=/home/jenkins/jenkins-slave/workspace/Lucene-Solr-NightlyTests-master/test-data/enwiki.random.lines.txt jenkins-nightly
#        ASF Jenkins smoker example: [Lucene-Solr-SmokeRelease-master] $ /home/jenkins/tools/ant/apache-ant-1.8.4/bin/ant nightly-smoke
reAntInvocation = re.compile(r'\bant(?:\.bat)?\s+.*(?:jenkins-(?:hourly|nightly)|nightly-smoke)')
# Matches each -D system property on an Ant command line: fully quoted, quoted value, or bare.
reAntSysprops = re.compile(r'"-D[^"]+"|-D[^=]+="[^"]*"|-D\S+')

# Method example: NOTE: reproduce with: ant test  -Dtestcase=ZkSolrClientTest -Dtests.method=testMultipleWatchesAsync -Dtests.seed=6EF5AB70F0032849 -Dtests.locale=he-IL -Dtests.timezone=NST -Dtests.asserts=true -Dtests.file.encoding=UTF-8
# Suite example:  NOTE: reproduce with: ant test  -Dtestcase=CloudSolrClientTest -Dtests.seed=DB2DF2D8228BAF27 -Dtests.multiplier=3 -Dtests.locale=es-AR -Dtests.timezone=America/Argentina/Cordoba -Dtests.asserts=true -Dtests.file.encoding=US-ASCII
# Group 1 is the whole repro command, group 2 the test suite name, and group 3 the
# parameters that follow -Dtestcase (and -Dtests.method, when present).
# The dots in the property names are escaped so that only the literal names match.
reReproLine = re.compile(r'NOTE:\s+reproduce\s+with:(\s+ant\s+test\s+-Dtestcase=(\S+)\s+(?:-Dtests\.method=\S+\s+)?(.*))')
reTestsSeed = re.compile(r'-Dtests\.seed=\S+\s*')

# Example: https://jenkins.thetaphi.de/job/Lucene-Solr-master-Linux/21108/
reJenkinsURLWithoutConsoleText = re.compile(r'https?://.*/\d+/?\Z', re.IGNORECASE)

reJavaFile = re.compile(r'(.*)\.java\Z')                              # group 1: file name without .java
reModule = re.compile(r'\.[\\/](.*)[\\/]src[\\/]')                    # group 1: module path between ./ and /src/
reTestOutputFile = re.compile(r'TEST-(.*\.([^-.]+))(?:-\d+)?\.xml\Z') # group 1: full class name, group 2: simple name
reErrorFailure = re.compile(r'(?:errors|failures)="[^0]')             # an errors/failures attribute not starting with 0
reGitMainBranch = re.compile(r'^(?:master|branch_[x_\d]+)$')

# consoleText from Policeman Jenkins's Windows jobs fails to decode as UTF-8
encoding = 'iso-8859-1'

# Last non-zero code recorded by run(); main() exits with this value.
lastFailureCode = 0
# Set by prepareWorkspace() once a checkout worked, so main() knows to restore the original ref.
gitCheckoutSucceeded = False

description = dedent('''\
                     Must be run from a Lucene/Solr git workspace. Downloads the Jenkins
                     log pointed to by the given URL, parses it for Git revision and failed
                     Lucene/Solr tests, checks out the Git revision in the local workspace,
                     groups the failed tests by module, then runs
                     'ant test -Dtests.dups=%d -Dtests.class="*.test1[|*.test2[...]]" ...'
                     in each module of interest, failing at the end if any of the runs fails.
                     To control the maximum number of concurrent JVMs used for each module's
                     test run, set 'tests.jvms', e.g. in ~/lucene.build.properties
                     ''')
defaultIters = 5
72
def readConfig():
  """Parse the command line and return the resulting argparse namespace.

  The namespace exposes 'url' (the Jenkins log to parse), 'useGit' (False
  when --no-git is given) and 'testIters' (the --iters value).
  """
  # (flags, keyword options) for every supported argument, in help order.
  argSpecs = (
    (('url',),
     dict(metavar='URL',
          help='Points to the Jenkins log to parse')),
    (('--no-git',),
     dict(dest='useGit', action='store_false', default=True,
          help='Do not run "git" at all')),
    (('--iters',),
     dict(dest='testIters', type=int, default=defaultIters, metavar='N',
          help='Number of iterations per test suite (default: %d)' % defaultIters)),
  )
  argParser = argparse.ArgumentParser(description=description,
                                      formatter_class=argparse.RawDescriptionHelpFormatter)
  for flags, options in argSpecs:
    argParser.add_argument(*flags, **options)
  return argParser.parse_args()
83
def runOutput(cmd):
  """Run 'cmd' without a shell and return its stdout, stripped of surrounding whitespace.

  The command string is split on single spaces, so individual arguments
  cannot themselves contain spaces.

  Raises RuntimeError, reporting the exit code and captured output, when the
  command exits with a non-zero status.
  """
  print('[repro] %s' % cmd)
  try:
    return subprocess.check_output(cmd.split(' '), universal_newlines=True).strip()
  except subprocess.CalledProcessError as e:
    # The exception class must be qualified: only the 'subprocess' module is imported.
    raise RuntimeError("ERROR: Cmd '%s' failed with exit code %d and the following output:\n%s"
                       % (cmd, e.returncode, e.output)) from e
91
def _exitCodeFromStatus(status):
  """Convert the value returned by os.system() into a plain process exit code."""
  if os.name == 'nt':
    return status                    # On Windows os.system() already returns the exit code
  if os.WIFEXITED(status):
    return os.WEXITSTATUS(status)
  if os.WIFSIGNALED(status):
    return 128 + os.WTERMSIG(status) # Same convention shells use for signal deaths
  return status

# Remembers non-zero exit code in lastFailureCode unless rememberFailure==False
def run(cmd, rememberFailure=True):
  """Run 'cmd' through the system shell and return its exit code (0 on success).

  On POSIX, os.system() returns a wait status in which the exit code is
  shifted left by 8 bits.  Passing such a value to sys.exit() would truncate
  it to 0, so the status is decoded into a real exit code here before it is
  stored or returned.
  """
  global lastFailureCode
  print('[repro] %s' % cmd)
  code = _exitCodeFromStatus(os.system(cmd))
  if 0 != code and rememberFailure:
    print('\n[repro] Setting last failure code to %d\n' % code)
    lastFailureCode = code
  return code
101
def fetchAndParseJenkinsLog(url, numRetries):
  """Download a Jenkins console log and extract what is needed to reproduce its failures.

  Each line is decoded with the module-level 'encoding' and checked, in
  order, for the Git checkout line, a "reproduce with" line and the Ant
  invocation line.

  Side effects: resets and then fills the module globals revisionFromLog,
  branchFromLog and antOptions, and prints progress to stdout.

  url        -- location of the log.  If no Git revision is found there and the
                URL ends in a build number, '/consoleText' is appended and the
                fetch is attempted again.
  numRetries -- how many more attempts to make after http.client.IncompleteRead.

  Returns a dict mapping each test suite name to the parameters of its
  "reproduce with" line that follow -Dtestcase (and -Dtests.method, if any).
  Exits the process with status 0 when no "reproduce with" line is found.
  Raises RuntimeError when fetching fails or the URL is not a Jenkins log.
  """
  global revisionFromLog
  global branchFromLog
  global antOptions
  revisionFromLog = None
  branchFromLog = None
  antOptions = ''
  tests = {}
  print('[repro] Jenkins log URL: %s\n' % url)
  try:
    # HTTPS fails at certificate validation, see LUCENE-9412, PEP-476
    context = ssl._create_unverified_context()
    with urllib.request.urlopen(url, context=context) as consoleText:
      for rawLine in consoleText:
        line = rawLine.decode(encoding)

        match = reGitRev.match(line)
        if match is not None:
          revisionFromLog = match.group(1)
          branchFromLog = match.group(2)
          print('[repro] Revision: %s\n' % revisionFromLog)
          continue

        match = reReproLine.search(line)
        if match is not None:
          print('[repro] Repro line: %s\n' % match.group(1))
          testcase = match.group(2)
          reproLineWithoutMethod = match.group(3).strip()
          tests[testcase] = reproLineWithoutMethod  # A later line for the same suite wins
          continue

        if reAntInvocation.search(line) is not None:
          antOptions = ' '.join(reAntSysprops.findall(line))
          if len(antOptions) > 0:
            print('[repro] Ant options: %s' % antOptions)
  except urllib.error.URLError as e:
    raise RuntimeError('ERROR: fetching %s : %s' % (url, e)) from e
  except http.client.IncompleteRead as e:
    if numRetries > 0:
      print('[repro] Encountered IncompleteRead exception, pausing and then retrying...')
      time.sleep(2) # pause for 2 seconds
      return fetchAndParseJenkinsLog(url, numRetries - 1)
    else:
      print('[repro] Encountered IncompleteRead exception, aborting after too many retries.')
      raise RuntimeError('ERROR: fetching %s : %s' % (url, e)) from e

  if revisionFromLog is None:
    if reJenkinsURLWithoutConsoleText.match(url):
      print('[repro] Not a Jenkins log. Appending "/consoleText" and retrying ...\n')
      # Drop any trailing slash first so the retried URL does not contain '//consoleText'
      return fetchAndParseJenkinsLog(url.rstrip('/') + '/consoleText', numRetries)
    else:
      raise RuntimeError('ERROR: %s does not appear to be a Jenkins log.' % url)
  if 0 == len(tests):
    print('[repro] No "reproduce with" lines found; exiting.')
    sys.exit(0)
  return tests
155
def prepareWorkspace(useGit, gitRef):
  """Bring the workspace to 'gitRef' (when useGit is true) and run 'ant clean'.

  A plain checkout is tried first.  If it fails, the ref is registered as a
  wanted remote branch and checked out as a new local tracking branch.
  Sets the module global gitCheckoutSucceeded once a checkout has worked.

  Raises RuntimeError when 'git fetch', both checkout strategies, or
  'ant clean' fail.
  """
  global gitCheckoutSucceeded
  if useGit:
    if run('git fetch') != 0:
      raise RuntimeError('ERROR: "git fetch" failed.  See above.')
    checkoutCmd = 'git checkout %s' % gitRef
    if run(checkoutCmd) != 0:
      addWantedBranchCmd = "git remote set-branches --add origin %s" % gitRef
      # Checkout remote branch as new local branch
      checkoutBranchCmd = 'git checkout -t -b %s origin/%s' % (gitRef, gitRef)
      print('"%s" failed. Trying "%s" and "%s".' % (checkoutCmd, addWantedBranchCmd, checkoutBranchCmd))
      for fallbackCmd in (addWantedBranchCmd, checkoutBranchCmd):
        if run(fallbackCmd) != 0:
          raise RuntimeError('ERROR: "%s" failed.  See above.' % fallbackCmd)
    gitCheckoutSucceeded = True
    # Ignore failure on non-branch ref
    run('git merge --ff-only', rememberFailure=False)

  if run('ant clean') != 0:
    raise RuntimeError('ERROR: "ant clean" failed.  See above.')
180
def groupTestsByModule(tests):
  """Locate the source file of every test suite in 'tests' and group the suites by module.

  Walks the current directory looking for '<suite>.java' files.  The module
  is the part of the containing directory's path between './' and '/src/'.
  Matching files that are not below such a directory are reported and
  skipped, since no module can be derived for them.

  tests -- container of test suite names (only membership is checked).

  Returns a dict mapping module path -> set of suite names, after printing it.
  """
  modules = {}
  for (dirPath, _, fileNames) in os.walk('.'):
    for fileName in fileNames:
      javaMatch = reJavaFile.search(fileName)
      if javaMatch is None:
        continue
      test = javaMatch.group(1)
      if test not in tests:
        continue
      moduleMatch = reModule.match(dirPath)
      if moduleMatch is None:
        # Not inside a <module>/src/ tree: previously this crashed with AttributeError
        print('[repro] Ignoring %s: not under a module src directory'
              % os.path.join(dirPath, fileName))
        continue
      modules.setdefault(moduleMatch.group(1), set()).add(test)
  print('[repro] Test suites by module:')
  for module in modules:
    print('[repro]    %s' % module)
    for test in modules[module]:
      print('[repro]       %s' % test)
  return modules
200
def runTests(testIters, modules, tests):
  """Compile and run the grouped test suites, one Ant invocation per module.

  testIters -- number of duplicate runs requested for each suite.
  modules   -- dict mapping module directory -> collection of suite names.
  tests     -- dict mapping suite name -> extra Ant command line parameters.

  The working directory is restored after every module, even on failure.
  Raises RuntimeError when 'ant compile-test' fails in a module.
  """
  startDir = os.getcwd()
  cmdTemplate = 'ant test-nocompile -Dtests.dups=%d -Dtests.maxfailures=%d -Dtests.class="%s" -Dtests.showOutput=onerror %s %s'
  for module, suites in modules.items():
    suiteNames = list(suites)
    classFilter = '|'.join('*.%s' % suiteName for suiteName in suiteNames)
    # Assumption: all tests in this module have the same cmdline params
    params = tests[suiteNames[0]]
    maxFailures = testIters * len(suiteNames)
    os.chdir(module)
    compileCode = run('ant compile-test')
    try:
      if compileCode != 0:
        raise RuntimeError("ERROR: Compile failed in %s/ with code %d.  See above." % (module, compileCode))
      run(cmdTemplate % (testIters, maxFailures, classFilter, antOptions, params))
    finally:
      os.chdir(startDir)
217
def printAndMoveReports(testIters, newSubDir, location):
  """Count failing JUnit XML reports per test class, archive the reports, and print a summary.

  Scans lucene/build and solr/build for TEST-*.xml files.  A report counts as
  one failure when any of its lines matches reErrorFailure.  Every report is
  then moved to repro-reports/<newSubDir>/<original directory>.

  testIters -- total runs per suite; used only in the printed summary.
  newSubDir -- sub-directory of repro-reports that receives the reports.
  location  -- text appended to the "Failures" heading.

  Returns a dict mapping full test class name -> number of failing reports.
  """
  failures = {}
  for buildRoot in ('lucene/build', 'solr/build'):
    for (reportDir, _, reportNames) in os.walk(buildRoot):
      for reportName in reportNames:
        reportMatch = reTestOutputFile.search(reportName)
        if reportMatch is None:
          continue
        testcase = reportMatch.group(1)
        failures.setdefault(testcase, 0)
        reportPath = os.path.join(reportDir, reportName)
        with open(reportPath, encoding='UTF-8') as report:
          # Stops reading at the first line that shows an error or failure
          if any(reErrorFailure.search(line) is not None for line in report):
            failures[testcase] += 1
        # have to play nice with 'ant clean'...
        archiveDir = os.path.join('repro-reports', newSubDir, reportDir)
        os.makedirs(archiveDir, exist_ok=True)
        os.rename(reportPath, os.path.join(archiveDir, reportName))
  print("[repro] Failures%s:" % location)
  # sort by failure count, then by testcase
  ranked = sorted(failures.items(), key=lambda entry: (entry[1], entry[0]))
  for testcase, failureCount in ranked:
    print("[repro]   %d/%d failed: %s" % (failureCount, testIters, testcase))
  return failures
243
def getLocalGitBranch():
  """Return the name of the currently checked-out branch, or the commit SHA when HEAD is detached."""
  currentRef = runOutput('git rev-parse --abbrev-ref HEAD')
  inDetachedHead = (currentRef == 'HEAD')
  if inDetachedHead:
    # Use the SHA when not on a branch
    currentRef = runOutput('git rev-parse HEAD')
  print('[repro] Initial local git branch/revision: %s' % currentRef)
  return currentRef
250
def _selectAlwaysFailing(failures, testIters, previousTests, dropSeed=False):
  """Return {suite: params} for the suites that failed in every one of 'testIters' runs.

  failures      -- dict of full class name -> failure count, as returned by printAndMoveReports.
  previousTests -- dict of suite name -> Ant parameters used for the run that produced 'failures'.
  dropSeed      -- when true, strip the -Dtests.seed=... parameter from the returned params.
  """
  selected = {}
  for fullClass, failureCount in failures.items():
    if failureCount != testIters:
      continue
    testcase = fullClass[(fullClass.rindex('.') + 1):]
    params = previousTests[testcase]
    selected[testcase] = reTestsSeed.sub('', params) if dropSeed else params
  return selected

def main():
  """Reproduce the failures of one Jenkins build in the local workspace.

  Steps: parse the Jenkins log, run the failed suites at the logged revision,
  then (unless --no-git was given) re-run the suites that failed every time
  at the tip of the logged branch, and finally re-run those that still fail
  every time without the original seed.  The original Git ref is restored at
  the end if a checkout took place.

  Exits with status 1 on an unexpected error, otherwise with lastFailureCode.
  """
  config = readConfig()
  tests = fetchAndParseJenkinsLog(config.url, numRetries = 2)
  if config.useGit:
    localGitBranch = getLocalGitBranch()

  try:
    # have to play nice with ant clean, so printAndMoveReports will move all the junit XML files here...
    print('[repro] JUnit test result XML files will be moved to: ./repro-reports')
    if os.path.isdir('repro-reports'):
      print('[repro]   Deleting old ./repro-reports')
      shutil.rmtree('repro-reports')
    prepareWorkspace(config.useGit, revisionFromLog)
    modules = groupTestsByModule(tests)
    runTests(config.testIters, modules, tests)
    failures = printAndMoveReports(config.testIters, 'orig',
                                   ' w/original seeds' + (' at %s' % revisionFromLog if config.useGit else ''))

    if config.useGit:
      # Retest 100% failures at the tip of the branch
      tests = _selectAlwaysFailing(failures, config.testIters, tests)
      if len(tests) > 0:
        print('\n[repro] Re-testing 100%% failures at the tip of %s' % branchFromLog)
        prepareWorkspace(True, branchFromLog)
        modules = groupTestsByModule(tests)
        runTests(config.testIters, modules, tests)
        failures = printAndMoveReports(config.testIters, 'branch-tip',
                                       ' w/original seeds at the tip of %s' % branchFromLog)

        # Retest 100% tip-of-branch failures without a seed
        tests = _selectAlwaysFailing(failures, config.testIters, tests, dropSeed=True)
        if len(tests) > 0:
          print('\n[repro] Re-testing 100%% failures at the tip of %s without a seed' % branchFromLog)
          # The branch tip is already checked out, so only 'ant clean' is needed here
          prepareWorkspace(False, branchFromLog)
          modules = groupTestsByModule(tests)
          runTests(config.testIters, modules, tests)
          printAndMoveReports(config.testIters, 'branch-tip-no-seed',
                              ' at the tip of %s without a seed' % branchFromLog)
  except Exception:
    print('[repro] %s' % traceback.format_exc())
    sys.exit(1)
  finally:
    if config.useGit and gitCheckoutSucceeded:
      run('git checkout %s' % localGitBranch, rememberFailure=False) # Restore original git branch/sha

  print('[repro] Exiting with code %d' % lastFailureCode)
  sys.exit(lastFailureCode)
309
# Script entry point: run main() and report a Ctrl-C with a short message
# instead of a traceback.
if '__main__' == __name__:
  try:
    main()
  except KeyboardInterrupt:
    print('[repro] Keyboard interrupt...exiting')
315