"""Build sampled line files from the Europarl v7 corpus.

Downloads ``europarl.tgz`` into the current directory (unless it is already
there), extracts the text, splits every chapter into ~1 KB documents written
one per line as ``title<TAB>date<TAB>body``, shuffles the lines, and finally
writes 20/200/2000 MB samples, each also gzip-compressed in independently
opened chunks whose start offsets are recorded in a ``.seek`` file.
"""
import gzip
import os
import random
import re
import shutil
import subprocess
import tempfile
import time
import urllib.request

# When True, reuse a fixed, already-extracted tmp dir and do not delete it.
DEBUG = False

# Mean size, in characters, of the small docs produced by split_docs.
TARGET_DOC_CHARS = 1024


def compress_with_seek_points(file_name_in, file_name_out, num_seek_points):
    """Gzip ``file_name_in`` into ``file_name_out`` as a series of chunks.

    The input is copied line by line.  Whenever the current chunk has grown
    past ``size(file_name_in) / num_seek_points`` bytes (and fewer than
    ``num_seek_points`` chunks have been opened), the chunk is closed and a
    new one is opened in append mode on the same output file.  The size of
    the output file at the moment each new chunk is opened is recorded as a
    seek point.

    The seek points are written, one decimal offset per line, to a file
    named by replacing the last three characters of ``file_name_out`` with
    ``.seek``.

    Args:
        file_name_in: path of the uncompressed input file.
        file_name_out: path of the gzip output; removed first if it exists.
        num_seek_points: maximum number of chunks to create (must be > 0,
            otherwise the chunk-size division raises ZeroDivisionError).
    """
    bytes_per_chunk = os.path.getsize(file_name_in) / num_seek_points
    seek_points = []

    if os.path.exists(file_name_out):
        os.remove(file_name_out)

    chunk_out = None
    bytes_in_chunk = 0
    chunk_count = 0
    try:
        with open(file_name_in, 'rb') as f_in:
            while True:
                if chunk_out is None:
                    # The output file does not exist before the first chunk,
                    # so the first chunk (offset 0) gets no seek point.
                    if os.path.exists(file_name_out):
                        seek_points.append(os.path.getsize(file_name_out))
                        print(' create chunk %s at pos=%s' % (chunk_count, seek_points[-1]))
                    else:
                        print(' create chunk %s at pos=0' % chunk_count)
                    chunk_out = gzip.open(file_name_out, 'ab')
                    chunk_count += 1

                line = f_in.readline()
                if not line:
                    break

                bytes_in_chunk += len(line)
                chunk_out.write(line)

                if bytes_in_chunk > bytes_per_chunk and chunk_count < num_seek_points:
                    # Close so the bytes are flushed to disk and the next
                    # getsize() reports where the following chunk starts.
                    chunk_out.close()
                    chunk_out = None
                    bytes_in_chunk = 0
    finally:
        # Always close the last chunk explicitly so its gzip trailer is
        # written before anybody reads the file.
        if chunk_out is not None:
            chunk_out.close()

    with open(file_name_out[:-3] + '.seek', 'w') as seek_out:
        seek_out.writelines('%d\n' % seek_point for seek_point in seek_points)


re_tag = re.compile(r'<[^>]+?>')
re_newlines = re.compile(r'\n+')
re_space = re.compile(r'\s')

# used to find word break, for splitting docs into ~1 KB sized smaller docs:
re_next_non_word_character = re.compile(r'\W', re.U)

# NOTE(review): the URL literal was empty in the reviewed copy of this file;
# this is the published location of the Europarl v7 archive -- confirm.
EUROPARL_V7_URL = 'https://www.statmt.org/europarl/v7/europarl.tgz'


def split_docs(all_out, title_string, date_string, body_string):
    """Split a body into smallish (~1 KB) docs, repeating same title and date.

    Each fragment length is drawn from a normal distribution centred on
    TARGET_DOC_CHARS and then extended to the next non-word character, so
    words are never cut in half.  One ``title<TAB>date<TAB>fragment`` line
    is written to ``all_out`` per fragment.

    Args:
        all_out: writable text file object receiving the lines.
        title_string: title repeated on every line.
        date_string: date repeated on every line.
        body_string: text to split; an empty string writes nothing.

    Returns:
        The number of lines written.
    """
    doc_count = 0
    while len(body_string) > 0:
        char_count = int(random.gauss(TARGET_DOC_CHARS, TARGET_DOC_CHARS / 4))
        if char_count < 64:
            # trimmed normal: reject tiny (or negative) draws and redraw
            continue

        m = re_next_non_word_character.search(body_string, char_count)
        if m is not None:
            char_count = m.start(0)
        else:
            # no word break left: the remainder becomes the final fragment
            char_count = len(body_string)

        body_string_fragment = body_string[:char_count].strip()
        all_out.write('%s\t%s\t%s\n' % (title_string, date_string, body_string_fragment))
        body_string = body_string[char_count:]
        doc_count += 1

    return doc_count


def _chapter_body(text_lines):
    """Join a chapter's lines into one tag-free, newline-free, stripped string."""
    s = ' '.join(text_lines)
    s = re_tag.sub(' ', s)
    s = re_newlines.sub(' ', s)
    return s.strip()


def sample_europarl():
    """Download, extract, split, shuffle and sample the Europarl v7 corpus.

    Side effects: leaves ``europarl.tgz`` plus ``20mb.txt``, ``200mb.txt``
    and ``2000mb.txt`` (each with a ``.gz`` and ``.seek`` companion) in the
    current directory.  Requires the ``tar`` and ``shuf`` commands.

    Raises:
        subprocess.CalledProcessError: if ``tar`` or ``shuf`` fails.
        RuntimeError: if extraction produced no files or no text.
    """
    # download europarl.tgz v7, if not already here (in cwd):
    file_name = 'europarl.tgz'
    if not os.path.exists(file_name):
        print('Download %s to %s...' % (EUROPARL_V7_URL, file_name))
        # download under a temporary name so a partial file is never mistaken
        # for a complete archive on the next run
        urllib.request.urlretrieve(EUROPARL_V7_URL, file_name + '.tmp')
        os.rename(file_name + '.tmp', file_name)
    else:
        print('%s already here; skipping download...' % file_name)

    if not DEBUG:
        tmp_dir_path = tempfile.mkdtemp()
    else:
        tmp_dir_path = '/tmp/tmp31ekzg75'
    print('Using tmp dir "%s"...' % tmp_dir_path)

    try:
        if not DEBUG:
            cmd = ['tar', 'xzf', file_name, '-C', tmp_dir_path]
            print('Run: %s' % ' '.join(cmd))
            # check=True: a failed extraction must not silently yield an
            # empty corpus
            subprocess.run(cmd, check=True)

        doc_count = 0
        skip_count = 0
        file_count = 0

        all_txt_file_name = '%s/all.txt' % tmp_dir_path

        print('Extract text...')

        start_time = time.time()
        next_print_time = start_time + 3

        # normalize text a bit and concatenate all lines into single file, counting total lines/bytes
        with open(all_txt_file_name, 'w', encoding='utf-8') as all_out:
            for dir_path, _dir_names, file_names in os.walk('%s/txt' % tmp_dir_path):
                for txt_file_name in file_names:
                    if not txt_file_name.endswith('.txt'):
                        continue
                    file_count += 1

                    # the name carries a two-digit year, month and day after
                    # a three-character prefix, e.g. ep-09-10-22-009.txt
                    year, month, day = (int(x) for x in txt_file_name[3:-4].split('-')[:3])
                    if year >= 50:
                        year = 1900 + year
                    else:
                        year = 2000 + year
                    date_string = '%04d-%02d-%02d' % (year, month, day)

                    # unfortunately we need errors='ignore' since in Europarl v7, one file (pl/ep-09-10-22-009.txt) has invalid utf-8:
                    chapter_count = 0
                    with open('%s/%s' % (dir_path, txt_file_name), 'r', encoding='utf-8', errors='ignore') as f_in:
                        last_text = []
                        last_title = None
                        while True:
                            line = f_in.readline()
                            if line == '':
                                break
                            line = line.strip()
                            # NOTE(review): the marker literal and the exact
                            # nesting of the two reset statements below were
                            # lost in the reviewed copy; '<CHAPTER ' matches
                            # the Europarl chapter markup -- confirm.
                            if line.startswith('<CHAPTER '):
                                if last_title is not None:
                                    # flush the chapter that just ended
                                    s = _chapter_body(last_text)
                                    if len(s) > 0:
                                        doc_count += split_docs(all_out, last_title, date_string, s)
                                    else:
                                        skip_count += 1
                                    last_text = []
                                    chapter_count += 1
                                # the title is the first line after the
                                # marker that is non-empty once tags are
                                # stripped; None if the file ends first
                                while True:
                                    last_title = f_in.readline()
                                    if last_title == '':
                                        last_title = None
                                        break
                                    last_title = re_tag.sub(' ', last_title).strip()
                                    if len(last_title) > 0:
                                        break
                            else:
                                last_text.append(line)

                        # flush the final chapter of the file
                        if last_title is not None:
                            s = _chapter_body(last_text)
                            if len(s) > 0:
                                doc_count += split_docs(all_out, last_title, date_string, s)
                            else:
                                skip_count += 1
                            chapter_count += 1
                        else:
                            skip_count += 1

                    now = time.time()
                    if now > next_print_time:
                        print('%4.1fs: keep %.2f K of %.2f K files (%.1f%%), %.2f M docs, %.2f GB...' %
                              (now - start_time, (file_count - skip_count) / 1000, file_count / 1000,
                               100 * (file_count - skip_count) / file_count, doc_count / 1000000,
                               all_out.tell() / 1024/1024/1024))
                        while next_print_time < now:
                            next_print_time += 3

        if file_count == 0:
            raise RuntimeError('no .txt files found under %s/txt' % tmp_dir_path)

        total_mb = os.path.getsize(all_txt_file_name)/1024/1024
        now = time.time()
        print('%4.1fs (done): keep %.2f K of %.2f K files (%.1f%%), %.2f M docs, %.2f GB...' %
              (now - start_time, (file_count - skip_count) / 1000, file_count / 1000,
               100 * (file_count - skip_count) / file_count, doc_count / 1000000,
               os.path.getsize(all_txt_file_name) / 1024/1024/1024))

        if total_mb == 0:
            raise RuntimeError('no text was extracted into %s' % all_txt_file_name)

        print('Shuffle...')
        with open(all_txt_file_name + '.shuffled', 'wb') as shuffled_out:
            subprocess.run(['shuf', all_txt_file_name], stdout=shuffled_out, check=True)

        for mb in (20, 200, 2000):
            print('Sample %d MB file...' % mb)
            file_name_out = '%dmb.txt' % mb
            with open(file_name_out, 'w', encoding='utf-8') as f_out:
                # keep each line with this probability so the expected
                # output size is mb megabytes
                chance = mb / total_mb
                with open(all_txt_file_name + '.shuffled', 'r', encoding='utf-8') as f:
                    for line in f:
                        if random.random() <= chance:
                            f_out.write(line)
            print(' got %.2f MB' % (os.path.getsize(file_name_out)/1024/1024))

            compress_with_seek_points(file_name_out, file_name_out + '.gz', mb)
    finally:
        print('Removing tmp dir "%s"...' % tmp_dir_path)
        if not DEBUG:
            shutil.rmtree(tmp_dir_path)

    print('\nWARNING: left ./europarl.tgz, which you should delete if you do not want it!\n')


if __name__ == '__main__':
    sample_europarl()