#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import gzip
import time
import random
import re
import urllib.request
import subprocess
import tempfile
import shutil

DEBUG = False

# target size, in characters, of each small doc emitted by split_docs:
TARGET_DOC_CHARS = 1024


def compress_with_seek_points(file_name_in, file_name_out, num_seek_points):
    """Gzip-compress file_name_in to file_name_out as up to num_seek_points
    concatenated gzip members, one per roughly equal-sized run of input lines,
    and write the byte offset of each member boundary (one offset per line) to
    a side file named like file_name_out with its '.gz' suffix replaced by
    '.seek'.

    Chunks always end on a line boundary.  Because each chunk is a complete
    standalone gzip member, a reader can seek to any recorded offset and start
    decompressing from there.
    """

    bytes_per_chunk = os.path.getsize(file_name_in) / num_seek_points

    seek_points = []

    if os.path.exists(file_name_out):
        os.remove(file_name_out)

    with open(file_name_in, 'rb') as f_in:

        f_out = None

        bytes_in_chunk = 0

        chunk_count = 0

        try:
            while True:
                if f_out is None:
                    if os.path.exists(file_name_out):
                        # every chunk after the first begins at the current size of the output:
                        seek_points.append(os.path.getsize(file_name_out))
                        print(' create chunk %s at pos=%s' % (chunk_count, seek_points[-1]))
                    else:
                        print(' create chunk %s at pos=0' % chunk_count)
                    # mode 'ab' appends a new standalone gzip member to the file:
                    f_out = gzip.open(file_name_out, 'ab')
                    chunk_count += 1

                line = f_in.readline()
                if len(line) == 0:
                    break

                bytes_in_chunk += len(line)
                f_out.write(line)

                if bytes_in_chunk > bytes_per_chunk and chunk_count < num_seek_points:
                    f_out.close()
                    f_out = None
                    bytes_in_chunk = 0
        finally:
            # BUG FIX: the last chunk's gzip handle was never closed, leaving
            # its buffered data to be flushed only by GC/interpreter shutdown:
            if f_out is not None:
                f_out.close()

    with open(file_name_out[:-3] + '.seek', 'w') as f_out:
        for seek_point in seek_points:
            f_out.write('%d\n' % seek_point)


# raw strings: '\s'/'\W' in plain string literals are invalid escape
# sequences (SyntaxWarning on modern Python); patterns are unchanged:
re_tag = re.compile(r'<[^>]+?>')
re_newlines = re.compile(r'\n+')
re_space = re.compile(r'\s')

# used to find word break, for splitting docs into ~1 KB sized smaller docs:
re_next_non_word_character = re.compile(r'\W', re.U)

EUROPARL_V7_URL = 'https://www.statmt.org/europarl/v7/europarl.tgz'


def split_docs(all_out, title_string, date_string, body_string):
    """Splits docs into smallish (~1 KB) sized docs, repeating same title and date.

    Each fragment is written to the open text file all_out as a
    'title<TAB>date<TAB>body' line.  Fragment sizes are drawn from a normal
    distribution centered on TARGET_DOC_CHARS, then snapped forward to the
    next non-word character so we never split inside a word.  Returns the
    number of docs written.
    """

    doc_count = 0
    while len(body_string) > 0:
        char_count = int(random.gauss(TARGET_DOC_CHARS, TARGET_DOC_CHARS / 4))
        if char_count < 64:
            # trimmed normal? re-draw rather than emit a tiny doc:
            continue

        # snap forward to the next non-word character so words stay intact:
        m = re_next_non_word_character.search(body_string, char_count)
        if m is not None:
            char_count = m.start(0)
        else:
            char_count = len(body_string)

        body_string_fragment = body_string[:char_count].strip()

        # BUG FIX: a whitespace-only remainder used to be written (and
        # counted) as an empty doc; skip empty fragments instead:
        if len(body_string_fragment) > 0:
            #print('write title %d, body %d' % (len(title_string), len(body_string_fragment)))
            all_out.write('%s\t%s\t%s\n' % (title_string, date_string, body_string_fragment))
            doc_count += 1
        body_string = body_string[char_count:]

    return doc_count


def sample_europarl():
    """Downloads Europarl v7 (if not already in cwd), extracts every chapter
    into 'title<TAB>date<TAB>body' line docs split to ~1 KB each, shuffles
    them, then samples 20 MB / 200 MB / 2000 MB line-doc files, each also
    gzip-compressed with seek points via compress_with_seek_points."""

    # download europarl.tgz v7, if not already here (in cwd):
    file_name = 'europarl.tgz'
    if not os.path.exists(file_name):
        print('Download %s to %s...' % (EUROPARL_V7_URL, file_name))
        # download to a .tmp name, then rename, so a partial download is
        # never mistaken for the complete archive:
        urllib.request.urlretrieve(EUROPARL_V7_URL, file_name + '.tmp')
        os.rename(file_name + '.tmp', file_name)
    else:
        print('%s already here; skipping download...' % file_name)

    if not DEBUG:
        tmp_dir_path = tempfile.mkdtemp()
    else:
        # fixed path so repeated DEBUG runs reuse the already-extracted archive:
        tmp_dir_path = '/tmp/tmp31ekzg75'

    print('Using tmp dir "%s"...' % tmp_dir_path)
    try:
        if not DEBUG:
            cmd = 'tar xzf %s -C %s' % (file_name, tmp_dir_path)
            print('Run: %s' % cmd)
            # check=True: a failed extraction used to be silently ignored,
            # which surfaced later as a confusing empty walk:
            subprocess.run(cmd, shell=True, check=True)

        doc_count = 0
        skip_count = 0
        file_count = 0

        all_txt_file_name = '%s/all.txt' % tmp_dir_path

        print('Extract text...')

        start_time = time.time()
        next_print_time = start_time + 3

        # normalize text a bit and concatenate all lines into single file, counting total lines/bytes
        with open(all_txt_file_name, 'w', encoding='utf-8') as all_out:
            for dir_path, dir_names, file_names in os.walk('%s/txt' % tmp_dir_path):
                for file_name in file_names:
                    if file_name.endswith('.txt'):
                        file_count += 1

                        # file names look like ep-YY-MM-DD*.txt; expand the two-digit year:
                        year, month, day = (int(x) for x in file_name[3:-4].split('-')[:3])
                        if year >= 50:
                            year = 1900 + year
                        else:
                            year = 2000 + year

                        date_string = '%04d-%02d-%02d' % (year, month, day)

                        # unfortunately we need errors='ignore' since in Europarl v7, one file (pl/ep-09-10-22-009.txt) has invalid utf-8:
                        chapter_count = 0
                        with open('%s/%s' % (dir_path, file_name), 'r', encoding='utf-8', errors='ignore') as f_in:
                            last_text = []
                            last_title = None
                            while True:
                                line = f_in.readline()
                                if line == '':
                                    break
                                line = line.strip()
                                if line.startswith('<CHAPTER '):
                                    # flush the chapter accumulated so far:
                                    if last_title is not None:
                                        s = ' '.join(last_text)
                                        s = re_tag.sub(' ', s)
                                        s = re_newlines.sub(' ', s)
                                        s = s.strip()
                                        if len(s) > 0:
                                            doc_count += split_docs(all_out, last_title, date_string, s)
                                        else:
                                            skip_count += 1

                                    last_text = []
                                    chapter_count += 1
                                    # the chapter title is the next non-empty line after tag stripping:
                                    while True:
                                        last_title = f_in.readline()
                                        if last_title == '':
                                            last_title = None
                                            break
                                        last_title = re_tag.sub(' ', last_title).strip()
                                        if len(last_title) > 0:
                                            break
                                    continue
                                else:
                                    last_text.append(line)

                            # flush the final chapter of the file:
                            if last_title is not None:
                                s = ' '.join(last_text)
                                s = re_tag.sub(' ', s)
                                s = re_newlines.sub(' ', s)
                                s = s.strip()
                                if len(s) > 0:
                                    doc_count += split_docs(all_out, last_title, date_string, s)
                                else:
                                    skip_count += 1
                                chapter_count += 1
                            else:
                                skip_count += 1

                        if chapter_count > 0:
                            #print('%s/%s: %d chapters' % (dir_path, file_name, chapter_count))
                            pass

                        # throttled progress output, roughly every 3 seconds:
                        now = time.time()
                        if now > next_print_time:
                            print('%4.1fs: keep %.2f K of %.2f K files (%.1f%%), %.2f M docs, %.2f GB...' % \
                                  (now - start_time, (file_count - skip_count) / 1000, file_count / 1000,
                                   100 * (file_count - skip_count) / file_count,
                                   doc_count / 1000000, all_out.tell() / 1024 / 1024 / 1024))
                            while next_print_time < now:
                                next_print_time += 3

        total_mb = os.path.getsize(all_txt_file_name) / 1024 / 1024
        now = time.time()
        # max(file_count, 1) guards divide-by-zero when no .txt files were found:
        print('%4.1fs (done): keep %.2f K of %.2f K files (%.1f%%), %.2f M docs, %.2f GB...' % \
              (now - start_time, (file_count - skip_count) / 1000, file_count / 1000,
               100 * (file_count - skip_count) / max(file_count, 1),
               doc_count / 1000000, os.path.getsize(all_txt_file_name) / 1024 / 1024 / 1024))

        print('Shuffle...')
        subprocess.run('shuf %s > %s.shuffled' % (all_txt_file_name, all_txt_file_name), shell=True, check=True)

        for mb in (20, 200, 2000):
            print('Sample %d MB file...' % mb)
            file_name_out = '%dmb.txt' % mb
            with open(file_name_out, 'w', encoding='utf-8') as f_out:

                # per-line keep probability that lands near the target size:
                chance = mb / total_mb

                with open(all_txt_file_name + '.shuffled', 'r', encoding='utf-8') as f:

                    while True:
                        line = f.readline()
                        if len(line) == 0:
                            break
                        if random.random() <= chance:
                            f_out.write(line)

            print(' got %.2f MB' % (os.path.getsize(file_name_out) / 1024 / 1024))

            compress_with_seek_points(file_name_out,
                                      file_name_out + '.gz',
                                      mb)

    finally:
        print('Removing tmp dir "%s"...' % tmp_dir_path)
        if not DEBUG:
            shutil.rmtree(tmp_dir_path)

    print('\nWARNING: left ./europarl.tgz, which you should delete if you do not want it!\n')


# BUG FIX: guard under __main__ so importing this module no longer kicks off
# a multi-GB download/extract; running it as a script behaves as before.
# Flip the False to True for ad-hoc testing of compress_with_seek_points.
if __name__ == '__main__':
    if False:
        compress_with_seek_points('/x/tmp/europarl.lines.txt',
                                  '/x/tmp/foo.txt.gz',
                                  16)
    else:
        sample_europarl()