#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

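'''
Creates line file docs (one TAB-separated title/date/body doc per line) from
the Europarl v7 corpus: the text is split into ~1 KB docs, shuffled, and
sampled into 20 MB, 200 MB and 2000 MB files, each also gzip-compressed with
seek points for chunked access.
'''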
import os
import gzip
import time
import random
import re
import urllib.request
import subprocess
import tempfile
import shutil

DEBUG = False

TARGET_DOC_CHARS = 1024

def compress_with_seek_points(file_name_in, file_name_out, num_seek_points):
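  '''
  Gzip-compresses file_name_in to file_name_out as a sequence of concatenated
  gzip members (one per chunk), recording the byte offset where each chunk
  after the first begins into a companion .seek file, so a reader can later
  seek straight to a chunk boundary and start decompressing there.
  '''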

  bytes_per_chunk = os.path.getsize(file_name_in) / num_seek_points

  seek_points = []

  if os.path.exists(file_name_out):
    os.remove(file_name_out)

  with open(file_name_in, 'rb') as f_in:

    f_out = None

    bytes_in_chunk = 0

    chunk_count = 0

    while True:
      if f_out is None:
        if os.path.exists(file_name_out):
          seek_points.append(os.path.getsize(file_name_out))
          print('  create chunk %s at pos=%s' % (chunk_count, seek_points[-1]))
        else:
          print('  create chunk %s at pos=0' % chunk_count)
        # appending opens a new gzip member at the current end of the file:
        f_out = gzip.open(file_name_out, 'ab')
        chunk_count += 1

      line = f_in.readline()
      if len(line) == 0:
        break

      bytes_in_chunk += len(line)
      f_out.write(line)

      if bytes_in_chunk > bytes_per_chunk and chunk_count < num_seek_points:
        f_out.close()
        f_out = None
        bytes_in_chunk = 0

  # close (and flush) the final chunk too:
  if f_out is not None:
    f_out.close()

  with open(file_name_out[:-3] + '.seek', 'w') as f_out:
    for seek_point in seek_points:
      f_out.write('%d\n' % seek_point)

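# For illustration, a reader could later jump to any recorded offset and read
# from there, since each chunk is a complete gzip member ('offset' here is a
# hypothetical value taken from the .seek file):
#
#   with open('20mb.txt.gz', 'rb') as raw:
#     raw.seek(offset)
#     with gzip.open(raw, 'rb') as g:
#       line = g.readline()
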
re_tag = re.compile('<[^>]+?>')
re_newlines = re.compile('\n+')
re_space = re.compile(r'\s')

# used to find a word break, for splitting docs into ~1 KB sized smaller docs:
re_next_non_word_character = re.compile(r'\W', re.U)

EUROPARL_V7_URL = 'https://www.statmt.org/europarl/v7/europarl.tgz'

def split_docs(all_out, title_string, date_string, body_string):

  '''
  Splits one doc into smallish (~1 KB) docs, repeating the same title and date
  '''

  doc_count = 0
  while len(body_string) > 0:
    char_count = int(random.gauss(TARGET_DOC_CHARS, TARGET_DOC_CHARS/4))
    if char_count < 64:
      # too-small sample: re-draw (rejection sampling a truncated normal)
      continue

    # extend the fragment to the next non-word character so we split on a word break:
    m = re_next_non_word_character.search(body_string, char_count)
    if m is not None:
      char_count = m.start(0)
    else:
      char_count = len(body_string)

    body_string_fragment = body_string[:char_count].strip()

    #print('write title %d, body %d' % (len(title_string), len(body_string_fragment)))
    all_out.write('%s\t%s\t%s\n' % (title_string, date_string, body_string_fragment))
    body_string = body_string[char_count:]
    doc_count += 1

  return doc_count

def sample_europarl():
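  '''
  Downloads and extracts the Europarl v7 corpus, flattens each chapter into
  title/date/body line docs in one big file, shuffles that file, then samples
  20 MB, 200 MB and 2000 MB line files, each compressed with seek points.
  '''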

  # download europarl.tgz v7, if not already here (in cwd):
  file_name = 'europarl.tgz'
  if not os.path.exists(file_name):
    print('Download %s to %s...' % (EUROPARL_V7_URL, file_name))
    # download to a .tmp name first, then rename, so a partial download is not mistaken for a complete one:
    urllib.request.urlretrieve(EUROPARL_V7_URL, file_name + '.tmp')
    os.rename(file_name + '.tmp', file_name)
  else:
    print('%s already here; skipping download...' % file_name)

  if not DEBUG:
    tmp_dir_path = tempfile.mkdtemp()
  else:
    tmp_dir_path = '/tmp/tmp31ekzg75'
  print('Using tmp dir "%s"...' % tmp_dir_path)
  try:
    if not DEBUG:
      cmd = 'tar xzf %s -C %s' % (file_name, tmp_dir_path)
      print('Run: %s' % cmd)
      # check=True so a failed untar stops the script instead of silently continuing:
      subprocess.run(cmd, shell=True, check=True)

    doc_count = 0
    skip_count = 0
    file_count = 0

    all_txt_file_name = '%s/all.txt' % tmp_dir_path

    print('Extract text...')

    start_time = time.time()
    next_print_time = start_time + 3
    # normalize text a bit and concatenate all lines into a single file, counting total lines/bytes
    with open(all_txt_file_name, 'w', encoding='utf-8') as all_out:
      for dir_path, dir_names, file_names in os.walk('%s/txt' % tmp_dir_path):
        for file_name in file_names:
          if file_name.endswith('.txt'):
            file_count += 1

            # file names look like ep-YY-MM-DD[-NNN].txt; window the two-digit year at 50:
            year, month, day = (int(x) for x in file_name[3:-4].split('-')[:3])
            if year >= 50:
              year = 1900 + year
            else:
              year = 2000 + year

            date_string = '%04d-%02d-%02d' % (year, month, day)

            # unfortunately we need errors='ignore' since in Europarl v7, one file (pl/ep-09-10-22-009.txt) has invalid utf-8:
            chapter_count = 0
            with open('%s/%s' % (dir_path, file_name), 'r', encoding='utf-8', errors='ignore') as f_in:
              last_text = []
              last_title = None
              while True:
                line = f_in.readline()
                if line == '':
                  break
                line = line.strip()
                if line.startswith('<CHAPTER '):
                  # a new chapter starts: flush the previous one as docs
                  if last_title is not None:
                    s = ' '.join(last_text)
                    s = re_tag.sub(' ', s)
                    s = re_newlines.sub(' ', s)
                    s = s.strip()
                    if len(s) > 0:
                      doc_count += split_docs(all_out, last_title, date_string, s)
                    else:
                      skip_count += 1

                    last_text = []
                    chapter_count += 1
                  # the chapter title is the next non-empty line, with tags removed:
                  while True:
                    last_title = f_in.readline()
                    if last_title == '':
                      last_title = None
                      break
                    last_title = re_tag.sub(' ', last_title).strip()
                    if len(last_title) > 0:
                      break
                  continue
                else:
                  last_text.append(line)

              # flush the final chapter of the file too:
              if last_title is not None:
                s = ' '.join(last_text)
                s = re_tag.sub(' ', s)
                s = re_newlines.sub(' ', s)
                s = s.strip()
                if len(s) > 0:
                  doc_count += split_docs(all_out, last_title, date_string, s)
                else:
                  skip_count += 1
                chapter_count += 1
              else:
                skip_count += 1

              if chapter_count > 0:
                #print('%s/%s: %d chapters' % (dir_path, file_name, chapter_count))
                pass

            now = time.time()
            if now > next_print_time:
              print('%4.1fs: keep %.2f K of %.2f K files (%.1f%%), %.2f M docs, %.2f GB...' % \
                    (now - start_time, (file_count - skip_count) / 1000, file_count / 1000,
                     100 * (file_count - skip_count) / file_count,
                     doc_count / 1000000, all_out.tell() / 1024/1024/1024))
              while next_print_time < now:
                next_print_time += 3

    total_mb = os.path.getsize(all_txt_file_name)/1024/1024
    now = time.time()
    print('%4.1fs (done): keep %.2f K of %.2f K files (%.1f%%), %.2f M docs, %.2f GB...' % \
          (now - start_time, (file_count - skip_count) / 1000, file_count / 1000,
           100 * (file_count - skip_count) / file_count,
           doc_count / 1000000, os.path.getsize(all_txt_file_name) / 1024/1024/1024))

    print('Shuffle...')
    # note: shuf (GNU coreutils) typically holds the whole input in memory while shuffling:
    subprocess.run('shuf %s > %s.shuffled' % (all_txt_file_name, all_txt_file_name), shell=True, check=True)

    for mb in (20, 200, 2000):
      print('Sample %d MB file...' % mb)
      file_name_out = '%dmb.txt' % mb
      with open(file_name_out, 'w', encoding='utf-8') as f_out:

        # keep each line with probability mb/total_mb, so the sample is ~mb MB in expectation:
        chance = mb / total_mb

        with open(all_txt_file_name + '.shuffled', 'r', encoding='utf-8') as f:

          while True:
            line = f.readline()
            if len(line) == 0:
              break
            if random.random() <= chance:
              f_out.write(line)

      print('  got %.2f MB' % (os.path.getsize(file_name_out)/1024/1024))

      # roughly one seek point per MB:
      compress_with_seek_points(file_name_out,
                                file_name_out + '.gz',
                                mb)

  finally:
    print('Removing tmp dir "%s"...' % tmp_dir_path)
    if not DEBUG:
      shutil.rmtree(tmp_dir_path)

  print('\nWARNING: left ./europarl.tgz, which you should delete if you do not want it!\n')

# flip this toggle to test compress_with_seek_points by itself:
if False:
  compress_with_seek_points('/x/tmp/europarl.lines.txt',
                            '/x/tmp/foo.txt.gz',
                            16)
else:
  sample_europarl()

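# Running this script leaves 20mb.txt, 200mb.txt and 2000mb.txt (plus their
# .gz and .seek companions) in the working directory, along with the
# downloaded europarl.tgz.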