#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

'''
Downloads the Europarl v7 corpus, extracts and lightly normalizes its text,
splits it into small (~1 KB) documents of the form "title<TAB>date<TAB>body",
then samples 20 MB, 200 MB and 2000 MB line files and gzip-compresses each
of them with seek points.
'''

import os
import gzip
import time
import random
import re
import urllib.request
import subprocess
import tempfile
import shutil

DEBUG = False

TARGET_DOC_CHARS = 1024

def compress_with_seek_points(file_name_in, file_name_out, num_seek_points):
  '''
  Compresses file_name_in to file_name_out as up to num_seek_points concatenated
  gzip members, recording the byte offset at which each member starts in a
  companion ".seek" file so readers can later seek to those points.
  '''

  bytes_per_chunk = os.path.getsize(file_name_in) / num_seek_points

  seek_points = []

  if os.path.exists(file_name_out):
    os.remove(file_name_out)

  with open(file_name_in, 'rb') as f_in:

    f_out = None

    bytes_in_chunk = 0

    chunk_count = 0

    while True:
      if f_out is None:
        if os.path.exists(file_name_out):
          seek_points.append(os.path.getsize(file_name_out))
          print('  create chunk %s at pos=%s' % (chunk_count, seek_points[-1]))
        else:
          print('  create chunk %s at pos=0' % chunk_count)
        f_out = gzip.open(file_name_out, 'ab')
        chunk_count += 1

      line = f_in.readline()
      if len(line) == 0:
        # make sure the final gzip member is flushed and closed:
        f_out.close()
        break

      bytes_in_chunk += len(line)
      f_out.write(line)

      if bytes_in_chunk > bytes_per_chunk and chunk_count < num_seek_points:
        f_out.close()
        f_out = None
        bytes_in_chunk = 0

  with open(file_name_out[:-3] + '.seek', 'w') as f_out:
    for seek_point in seek_points:
      f_out.write('%d\n' % seek_point)

re_tag = re.compile('<[^>]+?>')
re_newlines = re.compile('\n+')
re_space = re.compile(r'\s')

# used to find word break, for splitting docs into ~1 KB sized smaller docs:
re_next_non_word_character = re.compile(r'\W', re.U)

EUROPARL_V7_URL = 'https://www.statmt.org/europarl/v7/europarl.tgz'

def split_docs(all_out, title_string, date_string, body_string):

  '''
  Splits docs into smallish (~1 KB) sized docs, repeating same title and date
  '''

  doc_count = 0
  while len(body_string) > 0:
    char_count = int(random.gauss(TARGET_DOC_CHARS, TARGET_DOC_CHARS/4))
    if char_count < 64:
      # trimmed normal?
      continue

    m = re_next_non_word_character.search(body_string, char_count)
    if m is not None:
      char_count = m.start(0)
    else:
      char_count = len(body_string)

    body_string_fragment = body_string[:char_count].strip()

    #print('write title %d, body %d' % (len(title_string), len(body_string_fragment)))
    all_out.write('%s\t%s\t%s\n' % (title_string, date_string, body_string_fragment))
    body_string = body_string[char_count:]
    doc_count += 1

  return doc_count

def sample_europarl():

  # download europarl.tgz v7, if not already here (in cwd):
  file_name = 'europarl.tgz'
  if not os.path.exists(file_name):
    print('Download %s to %s...' % (EUROPARL_V7_URL, file_name))
    urllib.request.urlretrieve(EUROPARL_V7_URL, file_name + '.tmp')
    os.rename(file_name + '.tmp', file_name)
  else:
    print('%s already here; skipping download...' % file_name)

  if not DEBUG:
    tmp_dir_path = tempfile.mkdtemp()
  else:
    tmp_dir_path = '/tmp/tmp31ekzg75'

  print('Using tmp dir "%s"...' % tmp_dir_path)
  try:
    if not DEBUG:
      cmd = 'tar xzf %s -C %s' % (file_name, tmp_dir_path)
      print('Run: %s' % cmd)
      subprocess.run(cmd, shell=True)

    doc_count = 0
    skip_count = 0
    file_count = 0

    all_txt_file_name = '%s/all.txt' % tmp_dir_path

    print('Extract text...')

    start_time = time.time()
    next_print_time = start_time + 3

    # normalize text a bit and concatenate all lines into single file, counting total lines/bytes
    with open(all_txt_file_name, 'w', encoding='utf-8') as all_out:
      for dir_path, dir_names, file_names in os.walk('%s/txt' % tmp_dir_path):
        for file_name in file_names:
          if file_name.endswith('.txt'):
            file_count += 1

            # file names look like "ep-96-04-15.txt" (sometimes with a trailing part, e.g. "ep-09-10-22-009.txt");
            # strip the "ep-" prefix and ".txt" suffix and parse the two-digit year, month and day:
            year, month, day = (int(x) for x in file_name[3:-4].split('-')[:3])
            if year >= 50:
              year = 1900 + year
            else:
              year = 2000 + year

            date_string = '%04d-%02d-%02d' % (year, month, day)

            # unfortunately we need errors='ignore' since in Europarl v7, one file (pl/ep-09-10-22-009.txt) has invalid utf-8:
            chapter_count = 0
            with open('%s/%s' % (dir_path, file_name), 'r', encoding='utf-8', errors='ignore') as f_in:
              last_text = []
              last_title = None
              while True:
                line = f_in.readline()
                if line == '':
                  break
                line = line.strip()
                if line.startswith('<CHAPTER '):
                  # flush the previous chapter, then read the next non-empty line as the new title:
                  if last_title is not None:
                    s = ' '.join(last_text)
                    s = re_tag.sub(' ', s)
                    s = re_newlines.sub(' ', s)
                    s = s.strip()
                    if len(s) > 0:
                      doc_count += split_docs(all_out, last_title, date_string, s)
                    else:
                      skip_count += 1

                  last_text = []
                  chapter_count += 1
                  while True:
                    last_title = f_in.readline()
                    if last_title == '':
                      last_title = None
                      break
                    last_title = re_tag.sub(' ', last_title).strip()
                    if len(last_title) > 0:
                      break
                  continue
                else:
                  last_text.append(line)

              # flush the final chapter of this file:
              if last_title is not None:
                s = ' '.join(last_text)
                s = re_tag.sub(' ', s)
                s = re_newlines.sub(' ', s)
                s = s.strip()
                if len(s) > 0:
                  doc_count += split_docs(all_out, last_title, date_string, s)
                else:
                  skip_count += 1
                chapter_count += 1
              else:
                skip_count += 1

            if chapter_count > 0:
              #print('%s/%s: %d chapters' % (dir_path, file_name, chapter_count))
              pass

            now = time.time()
            if now > next_print_time:
              print('%4.1fs: keep %.2f K of %.2f K files (%.1f%%), %.2f M docs, %.2f GB...' % \
                    (now - start_time, (file_count - skip_count) / 1000, file_count / 1000,
                     100 * (file_count - skip_count) / file_count,
                     doc_count / 1000000, all_out.tell() / 1024/1024/1024))
              while next_print_time < now:
                next_print_time += 3

    total_mb = os.path.getsize(all_txt_file_name)/1024/1024

    now = time.time()
    print('%4.1fs (done): keep %.2f K of %.2f K files (%.1f%%), %.2f M docs, %.2f GB...' % \
          (now - start_time, (file_count - skip_count) / 1000, file_count / 1000,
           100 * (file_count - skip_count) / file_count,
           doc_count / 1000000, os.path.getsize(all_txt_file_name) / 1024/1024/1024))

    print('Shuffle...')
    subprocess.run('shuf %s > %s.shuffled' % (all_txt_file_name, all_txt_file_name), shell=True)

    # randomly sample lines from the shuffled file to build roughly 20 MB, 200 MB and 2000 MB files:
    for mb in (20, 200, 2000):
      print('Sample %d MB file...' % mb)
      file_name_out = '%dmb.txt' % mb
      with open(file_name_out, 'w', encoding='utf-8') as f_out:

        chance = mb / total_mb

        with open(all_txt_file_name + '.shuffled', 'r', encoding='utf-8') as f:

          while True:
            line = f.readline()
            if len(line) == 0:
              break
            if random.random() <= chance:
              f_out.write(line)

      print('  got %.2f MB' % (os.path.getsize(file_name_out)/1024/1024))

      compress_with_seek_points(file_name_out,
                                file_name_out + '.gz',
                                mb)

  finally:
    print('Removing tmp dir "%s"...' % tmp_dir_path)
    if not DEBUG:
      shutil.rmtree(tmp_dir_path)

  print('\nWARNING: left ./europarl.tgz, which you should delete if you do not want it!\n')

if False:
  compress_with_seek_points('/x/tmp/europarl.lines.txt',
                            '/x/tmp/foo.txt.gz',
                            16)
else:
  sample_europarl()