"""Build a BitTorrent file for a Wikipedia pages-articles-multistream dump.

Given a database name (e.g. ``enwiki``), the script downloads the two
recombined multistream files (the ``-index.txt.bz2`` and the ``.xml.bz2``)
from a Wikimedia dump mirror — resuming partial downloads via HTTP Range —
verifies their SHA-1 checksums against the dump's status JSON, and writes a
``.torrent`` file for the resulting directory.
"""

import getopt
import hashlib
import math
import os
import re
import sys

import colorama
import requests
from torf import Torrent

# Relative paths on every mirror: the HTML index of all dumps, and the
# per-dump JSON status document (':db:'/':date:' are substituted at runtime).
index_path = '/backup-index.html'
status_path = '/:db:/:date:/dumpstatus.json'

mirrors = ['https://dumps.wikimedia.org',
           'https://ftp.acc.umu.se/mirror/wikimedia.org/dumps',
           'https://dumps.wikimedia.your.org']

trackers = ['udp://tracker.opentrackr.org:1337/announce',
            'udp://tracker.openbittorrent.com:6969',
            'udp://explodie.org:6969']

# NOTE(review): appears unused in this file; kept for compatibility in case
# external code pokes at it — confirm before removing.
prev_progress = 0


def main():
    """Entry point: parse CLI options, download + verify the dump, build the torrent."""
    colorama.init()  # enables ANSI escape handling on Windows consoles
    try:
        opts, args = getopt.gnu_getopt(sys.argv, "hm:d:", ["help", "mirror=", "date="])
    except getopt.GetoptError:
        show_usage()
        sys.exit(-2)

    mirror = mirrors[0]
    date = ''
    for arg, value in opts:
        if arg in ("-h", "--help"):
            show_usage()
            sys.exit(-2)
        elif arg in ("-m", "--mirror"):
            mirror = mirrors[int(value)]
        elif arg in ("-d", "--date"):
            date = date_from_opt(value)

    # gnu_getopt keeps argv[0] in args, so exactly one positional argument
    # (the db name) makes len(args) == 2.
    if len(args) != 2:
        show_usage()
        sys.exit(-2)
    db = args[1]

    if date == '':
        date = find_last_date(mirror, db)

    print(f'Preparing torrent for wikipedia\'s {db} articles dump recombine dating {date_format(date)}.\n')

    recombine = get_job_state(mirror, db, date, 'articlesmultistreamdumprecombine')
    if recombine['status'] != 'done':
        print('"articles dump recombine" job is not done yet!')
        sys.exit(-1)

    dir_name = f'{db}-{date}-pages-articles-multistream'
    try:
        os.mkdir(dir_name)
    except OSError as error:
        # Directory may already exist from a previous (partial) run; the
        # download step below resumes, so this is non-fatal.
        print(error)

    # The recombine job produces exactly these two files; fetch and verify each.
    for suffix in ('-index.txt.bz2', '.xml.bz2'):
        file_name = f'{db}-{date}-pages-articles-multistream{suffix}'
        file_info = recombine['files'][file_name]
        local_path = f'{dir_name}/{file_name}'
        download_file(local_path, f'{mirror}{file_info["url"]}', file_info['size'])
        verify_checksum(local_path, file_info['sha1'], file_info['size'])

    torrent = Torrent(path=dir_name, trackers=trackers)
    print('Creating torrent file...', end='')
    torrent.generate()
    torrent.write(f'{dir_name}.torrent')
    print('Done')


def date_from_opt(from_opt: str):
    """Validate a ``YYYY-MM-DD`` option value and return it as ``YYYYMMDD``.

    Exits with usage help if the value is malformed.
    """
    if re.match(r'\d{4}-\d{2}-\d{2}', from_opt) is None:
        show_usage()
        sys.exit(-2)
    return from_opt.replace('-', '')


def date_format(date):
    """Format a compact ``YYYYMMDD`` dump date as ``YYYY-MM-DD`` for display."""
    return f'{date[:4]}-{date[4:6]}-{date[6:]}'


def verify_checksum(filename, expected, size):
    """SHA-1 *filename* and exit(-1) unless the digest equals *expected*.

    *size* (expected byte count) is used only to drive the progress display.
    """
    # BUG FIX: the message previously printed a literal "(unknown)" where the
    # file name belongs.
    print(f'Checking checksum of {filename} ', end='', flush=True)
    show_progress(0)
    digest = hashlib.sha1()
    one_percent = size / 100
    next_percent = one_percent
    done = 0
    with open(filename, 'rb') as fd:
        # Read fixed-size chunks: iterating a binary file by "lines" splits on
        # arbitrary 0x0A bytes inside the bz2 stream and yields uneven chunks.
        for chunk in iter(lambda: fd.read(65536), b''):
            digest.update(chunk)
            done += len(chunk)
            if done >= next_percent:
                next_percent += one_percent
                show_progress(int(done * 100 / size))
    if digest.hexdigest() != expected:
        print(' Bad', flush=True)
        sys.exit(-1)
    print(' Ok', flush=True)


def get_job_state(base_uri, db, date, jobname):
    """Fetch the dump's status JSON and return the state dict for *jobname*."""
    path = status_path.replace(':db:', db).replace(':date:', date)
    state = requests.get(f'{base_uri}{path}').json()
    return state['jobs'][jobname]


def download_file(filename, uri, size):
    """Download *uri* to *filename*, resuming a partial file via HTTP Range.

    *size* is the expected total byte count: if the local file is already that
    big the download is skipped entirely.
    """
    with open(filename, 'ab') as fd:
        start_from = fd.tell()  # append mode: current local size = resume offset
        if start_from == size:
            headers = None  # already complete — sentinel: skip the request
        elif start_from > 0:
            headers = {'Range': f'bytes={start_from}-'}
        else:
            headers = {}

        print(f'Downloading {uri} ', end='', flush=True)
        one_percent = size / 100
        next_percent = start_from + one_percent
        show_progress(0)
        # Replay progress for the part we already have on disk.
        if start_from >= one_percent:
            for p in range(1, math.ceil(start_from * 100 / size) + 1):
                show_progress(p)

        if headers is not None:
            done = start_from
            # FIX: close the streaming response (connection leak otherwise).
            with requests.get(uri, headers=headers, stream=True) as response:
                for chunk in response.iter_content(16384):
                    fd.write(chunk)
                    done += len(chunk)
                    if done >= next_percent:
                        next_percent += one_percent
                        show_progress(int(done * 100 / size))
    print(" Done", flush=True)


def show_progress(percent: int):
    """Render a percentage counter in place (TTY) or as dots (non-TTY)."""
    is_tty = sys.stdout.isatty()
    if percent == 0 and is_tty:
        print('  0%', end='', flush=True)
    elif is_tty:
        # \033[4D moves the cursor back over the previous 4-char "NNN%" field.
        print(f"\033[4D{percent:3d}%", end='', flush=True)
    elif percent != 0:
        print('.', end='', flush=True)


def find_last_date(base_uri, db):
    """Scrape the mirror's backup index for the most recent dump date of *db*."""
    page = requests.get(f'{base_uri}{index_path}')
    # BUG FIX: the pattern here was empty, so findall matched empty strings and
    # this function always returned ''.  The index page links each dump as
    # <a href="<db>/<YYYYMMDD>">, newest first.
    # NOTE(review): pattern reconstructed — confirm against a live mirror page.
    pattern = re.compile(rf'<a href="{db}/(\d{{8}})">')
    found = pattern.findall(page.text)
    return found[0]


def show_usage():
    """Print command-line usage help and the numbered mirror list."""
    print('Wrong usage')
    print('Usage', sys.argv[0], '[db] [options...]')
    print('Options:')
    # FIX: typo "bellow" -> "below"
    print(' -m, --mirror\tUse mirror number (see list below)')
    print(' -d, --date\t\tDump date (i.e. 2020-11-01)')
    print()
    print('Mirrors:')
    for i, mirror in enumerate(mirrors):
        print(i, mirror)


if __name__ == '__main__':
    main()