xref: /Lucene/dev-tools/scripts/create_line_file_docs.py (revision 3bedc0871e96429fb7eb0f6b9fb3f97ffa3e10d2)
1*3bedc087SDawid Weiss#!/usr/bin/env python3
2*3bedc087SDawid Weiss# -*- coding: utf-8 -*-
3*3bedc087SDawid Weiss# Licensed to the Apache Software Foundation (ASF) under one or more
4*3bedc087SDawid Weiss# contributor license agreements.  See the NOTICE file distributed with
5*3bedc087SDawid Weiss# this work for additional information regarding copyright ownership.
6*3bedc087SDawid Weiss# The ASF licenses this file to You under the Apache License, Version 2.0
7*3bedc087SDawid Weiss# (the "License"); you may not use this file except in compliance with
8*3bedc087SDawid Weiss# the License.  You may obtain a copy of the License at
9*3bedc087SDawid Weiss#
10*3bedc087SDawid Weiss#     http://www.apache.org/licenses/LICENSE-2.0
11*3bedc087SDawid Weiss#
12*3bedc087SDawid Weiss# Unless required by applicable law or agreed to in writing, software
13*3bedc087SDawid Weiss# distributed under the License is distributed on an "AS IS" BASIS,
14*3bedc087SDawid Weiss# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15*3bedc087SDawid Weiss# See the License for the specific language governing permissions and
16*3bedc087SDawid Weiss# limitations under the License.
17*3bedc087SDawid Weiss
18e0c06ee6SMike McCandlessimport os
19e0c06ee6SMike McCandlessimport gzip
20e0c06ee6SMike McCandlessimport time
21e0c06ee6SMike McCandlessimport random
22e0c06ee6SMike McCandlessimport re
23e0c06ee6SMike McCandlessimport urllib.request
24e0c06ee6SMike McCandlessimport subprocess
25e0c06ee6SMike McCandlessimport tempfile
26e0c06ee6SMike McCandlessimport shutil
27e0c06ee6SMike McCandless
# When True, sample_europarl skips the tar extraction, works out of a fixed
# (previously extracted) tmp dir instead of a fresh one, and leaves that
# dir in place when it finishes.
DEBUG = False

# Mean size, in characters, of the docs that split_docs cuts each chapter
# into; the standard deviation used there is a quarter of this value.
TARGET_DOC_CHARS = 1024
31e0c06ee6SMike McCandless
def compress_with_seek_points(file_name_in, file_name_out, num_seek_points):

  '''
  Gzip-compresses file_name_in into file_name_out as a sequence of
  independently compressed chunks appended one after another, and records
  the byte offset (within file_name_out) at which every chunk after the
  first one starts.

  The offsets are written, one decimal number per line, to a sidecar file
  whose name is file_name_out with its last three characters (expected to
  be ".gz") replaced by ".seek".

  file_name_in: path of the line-oriented input file; lines are never split
    across chunks.
  file_name_out: path of the compressed output; any existing file is
    removed first.
  num_seek_points: maximum number of chunks to create; must be non-zero
    since the input size is divided by it to get the target chunk size.
  '''

  bytes_per_chunk = os.path.getsize(file_name_in) / num_seek_points

  seek_points = []

  # chunks are appended ('ab'), so a stale output file must not survive
  if os.path.exists(file_name_out):
    os.remove(file_name_out)

  chunk_out = None
  bytes_in_chunk = 0
  chunk_count = 0

  try:
    with open(file_name_in, 'rb') as f_in:
      while True:
        if chunk_out is None:
          # The previous chunk (if any) has been closed, so the current
          # size of the output file is where the next chunk begins.
          if os.path.exists(file_name_out):
            seek_points.append(os.path.getsize(file_name_out))
            print('  create chunk %s at pos=%s' % (chunk_count, seek_points[-1]))
          else:
            print('  create chunk %s at pos=0' % chunk_count)
          chunk_out = gzip.open(file_name_out, 'ab')
          chunk_count += 1

        line = f_in.readline()
        if not line:
          break

        bytes_in_chunk += len(line)
        chunk_out.write(line)

        # Once num_seek_points chunks exist, the last one absorbs whatever
        # input remains.
        if bytes_in_chunk > bytes_per_chunk and chunk_count < num_seek_points:
          chunk_out.close()
          chunk_out = None
          bytes_in_chunk = 0
  finally:
    # Always finish the chunk that is still open so its gzip trailer is
    # written, even if reading or writing failed part way through.
    if chunk_out is not None:
      chunk_out.close()

  with open(file_name_out[:-3] + '.seek', 'w') as seek_out:
    for seek_point in seek_points:
      seek_out.write('%d\n' % seek_point)
74e0c06ee6SMike McCandless
# matches anything that looks like a <...> markup tag (non-greedy, so each
# tag is matched on its own)
re_tag = re.compile(r'<[^>]+?>')
# matches a run of one or more newline characters
re_newlines = re.compile('\n+')
# matches a single whitespace character; raw string so that \s reaches the
# regex engine instead of being an invalid string escape
re_space = re.compile(r'\s')

# used to find word break, for splitting docs into ~1 KB sized smaller docs:
re_next_non_word_character = re.compile(r'\W', re.U)

# location of the Europarl v7 archive that sample_europarl downloads
EUROPARL_V7_URL = 'https://www.statmt.org/europarl/v7/europarl.tgz'
83e0c06ee6SMike McCandless
def split_docs(all_out, title_string, date_string, body_string):

  '''
  Cuts body_string into a series of small docs and writes each one to
  all_out as a single "title<TAB>date<TAB>body" line, reusing the same
  title_string and date_string for every piece.

  Piece sizes are drawn from a normal distribution centred on
  TARGET_DOC_CHARS (draws below 64 are rejected and redrawn), and each cut
  is moved forward to the next non-word character so words stay whole.

  Returns the number of lines written.
  '''

  written = 0
  remaining = body_string

  while remaining:
    target = int(random.gauss(TARGET_DOC_CHARS, TARGET_DOC_CHARS/4))

    # too-small draws are simply discarded; the next pass draws again
    if target >= 64:
      boundary = re_next_non_word_character.search(remaining, target)
      # no word break at or after the target: take everything that is left
      cut = len(remaining) if boundary is None else boundary.start(0)

      fragment = remaining[:cut].strip()
      all_out.write('%s\t%s\t%s\n' % (title_string, date_string, fragment))

      remaining = remaining[cut:]
      written += 1

  return written
111e0c06ee6SMike McCandless
def _emit_chapter(all_out, title_string, date_string, text_lines):

  '''
  Joins the accumulated lines of one chapter, replaces tags and newline
  runs with spaces, and hands the result to split_docs.

  Returns the number of docs written, or 0 if no text is left after
  normalization (in which case nothing is written).
  '''

  s = ' '.join(text_lines)
  s = re_tag.sub(' ', s)
  s = re_newlines.sub(' ', s)
  s = s.strip()
  if not s:
    return 0
  return split_docs(all_out, title_string, date_string, s)


def _extract_chapters(f_in, all_out, date_string):

  '''
  Reads one already-open Europarl text file and writes its chapters to
  all_out.

  A line starting with "<CHAPTER " begins a new chapter; the chapter title
  is the next line that is non-empty once tags are removed.  Every other
  line is collected as chapter text.

  Returns (doc_count, skip_count): docs written, and the number of chapters
  skipped for being empty plus one if the file ended without any titled
  chapter pending.
  '''

  doc_count = 0
  skip_count = 0
  last_text = []
  last_title = None

  # a single shared iterator, so the title scan below consumes lines from
  # the same stream as the outer loop
  lines = iter(f_in)
  for line in lines:
    line = line.strip()
    if not line.startswith('<CHAPTER '):
      last_text.append(line)
      continue

    if last_title is not None:
      docs = _emit_chapter(all_out, last_title, date_string, last_text)
      if docs:
        doc_count += docs
      else:
        skip_count += 1
      last_text = []

    # stays None if the file ends before a usable title line shows up
    last_title = None
    for title_line in lines:
      title = re_tag.sub(' ', title_line).strip()
      if title:
        last_title = title
        break

  if last_title is not None:
    docs = _emit_chapter(all_out, last_title, date_string, last_text)
    if docs:
      doc_count += docs
    else:
      skip_count += 1
  else:
    skip_count += 1

  return doc_count, skip_count


def sample_europarl():

  '''
  Builds sampled line-file docs from the Europarl v7 corpus.

  Steps: download europarl.tgz into the current directory unless it is
  already there; extract it into a tmp dir; write every chapter of every
  .txt file under its txt/ directory as small docs into one file; shuffle
  that file with the external "shuf" command; then, for each of 20, 200 and
  2000 MB, write a random sample to "<N>mb.txt" in the current directory
  and compress it with compress_with_seek_points.

  The tmp dir is removed afterwards (unless DEBUG is set); europarl.tgz is
  left in place.

  Raises subprocess.CalledProcessError if tar or shuf fails, and
  RuntimeError if the extracted archive contains no .txt files.
  '''

  # download europarl.tgz v7, if not already here (in cwd):
  archive_name = 'europarl.tgz'
  if not os.path.exists(archive_name):
    print('Download %s to %s...' % (EUROPARL_V7_URL, archive_name))
    # download under a temporary name so that an interrupted download is
    # never mistaken for the complete archive on the next run
    urllib.request.urlretrieve(EUROPARL_V7_URL, archive_name + '.tmp')
    os.rename(archive_name + '.tmp', archive_name)
  else:
    print('%s already here; skipping download...' % archive_name)

  if not DEBUG:
    tmp_dir_path = tempfile.mkdtemp()
  else:
    tmp_dir_path = '/tmp/tmp31ekzg75'
  print('Using tmp dir "%s"...' % tmp_dir_path)
  try:
    if not DEBUG:
      # argument list instead of a shell string: paths are passed through
      # verbatim, and check=True stops here if extraction fails
      cmd = ['tar', 'xzf', archive_name, '-C', tmp_dir_path]
      print('Run: %s' % ' '.join(cmd))
      subprocess.run(cmd, check=True)

    doc_count = 0
    skip_count = 0
    file_count = 0

    all_txt_file_name = '%s/all.txt' % tmp_dir_path

    print('Extract text...')

    start_time = time.time()
    next_print_time = start_time + 3
    # normalize text a bit and concatenate all lines into single file, counting total lines/bytes
    with open(all_txt_file_name, 'w', encoding='utf-8') as all_out:
      for dir_path, dir_names, file_names in os.walk('%s/txt' % tmp_dir_path):
        for txt_file_name in file_names:
          if not txt_file_name.endswith('.txt'):
            continue
          file_count += 1

          # the name minus its first 3 and last 4 characters starts with
          # two-digit year, month and day fields separated by '-'
          year, month, day = (int(x) for x in txt_file_name[3:-4].split('-')[:3])
          if year >= 50:
            year = 1900 + year
          else:
            year = 2000 + year

          date_string = '%04d-%02d-%02d' % (year, month, day)

          # unfortunately we need errors='ignore' since in Europarl v7, one file (pl/ep-09-10-22-009.txt) has invalid utf-8:
          with open('%s/%s' % (dir_path, txt_file_name), 'r', encoding='utf-8', errors='ignore') as f_in:
            file_docs, file_skips = _extract_chapters(f_in, all_out, date_string)
          doc_count += file_docs
          skip_count += file_skips

          now = time.time()
          if now > next_print_time:
            print('%4.1fs: keep %.2f K of %.2f K files (%.1f%%), %.2f M docs, %.2f GB...' % \
                  (now - start_time, (file_count - skip_count) / 1000, file_count / 1000,
                   100 * (file_count - skip_count) / file_count,
                   doc_count / 1000000, all_out.tell() / 1024/1024/1024))
            while next_print_time < now:
              next_print_time += 3

    if file_count == 0:
      # fail with a clear message instead of dividing by zero below
      raise RuntimeError('no .txt files found under %s/txt' % tmp_dir_path)

    total_mb = os.path.getsize(all_txt_file_name)/1024/1024
    now = time.time()
    print('%4.1fs (done): keep %.2f K of %.2f K files (%.1f%%), %.2f M docs, %.2f GB...' % \
          (now - start_time, (file_count - skip_count) / 1000, file_count / 1000,
           100 * (file_count - skip_count) / file_count,
           doc_count / 1000000, os.path.getsize(all_txt_file_name) / 1024/1024/1024))

    print('Shuffle...')
    shuffled_file_name = all_txt_file_name + '.shuffled'
    with open(shuffled_file_name, 'wb') as shuffled_out:
      subprocess.run(['shuf', all_txt_file_name], stdout=shuffled_out, check=True)

    for mb in (20, 200, 2000):
      print('Sample %d MB file...' % mb)
      file_name_out = '%dmb.txt' % mb

      # keep each line with this probability; a value of 1 or more (target
      # larger than the corpus) keeps every line
      chance = mb / total_mb

      with open(file_name_out, 'w', encoding='utf-8') as f_out:
        with open(shuffled_file_name, 'r', encoding='utf-8') as f:
          for line in f:
            if random.random() <= chance:
              f_out.write(line)

      print('  got %.2f MB' % (os.path.getsize(file_name_out)/1024/1024))

      compress_with_seek_points(file_name_out,
                                file_name_out + '.gz',
                                mb)

  finally:
    print('Removing tmp dir "%s"...' % tmp_dir_path)
    if not DEBUG:
      shutil.rmtree(tmp_dir_path)

  print('\nWARNING: left ./europarl.tgz, which you should delete if you do not want it!\n')
258e0c06ee6SMike McCandless
# Run the full download/extract/sample pipeline only when executed as a
# script, so that importing this module (for example to call
# compress_with_seek_points on an existing line file) has no side effects.
if __name__ == '__main__':
  sample_europarl()
265