diff --git a/.idea/.gitignore b/.idea/.gitignore deleted file mode 100644 index 4786d18..0000000 --- a/.idea/.gitignore +++ /dev/null @@ -1,8 +0,0 @@ -# Default ignored files -/shelf/ -/workspace.xml -# Datasource local storage ignored files -/../../../../:\dev\wmfb\.idea/dataSources/ -/dataSources.local.xml -# Editor-based HTTP Client requests -/httpRequests/ diff --git a/.idea/misc.xml b/.idea/misc.xml index 6dc06c8..0573499 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,4 +1,4 @@ - + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/wmfb.iml b/.idea/wmfb.iml index 9cf510e..ca39836 100644 --- a/.idea/wmfb.iml +++ b/.idea/wmfb.iml @@ -4,7 +4,7 @@ - + \ No newline at end of file diff --git a/main.py b/main.py index cb49947..f864e49 100644 --- a/main.py +++ b/main.py @@ -1,3 +1,4 @@ +import os import sys import re import hashlib @@ -13,8 +14,8 @@ mirrors = ['https://dumps.wikimedia.org', 'https://ftp.acc.umu.se/mirror/wikimedia.org/dumps', 'https://dumps.wikimedia.your.org'] trackers = ['udp://tracker.opentrackr.org:1337/announce', - 'udp://tracker.coppersurfer.tk:6969', - 'udp://tracker.leechers-paradise.org:6969'] + 'udp://tracker.openbittorrent.com:6969', + 'udp://explodie.org:6969'] prev_progress = 0 @@ -47,19 +48,29 @@ def main(): if recombine['status'] != 'done': print('"articles dump recombine" job is not done yet!') sys.exit(-1) - expected_file_name = f'{db}-{date}-pages-articles-multistream.xml.bz2' - file = recombine['files'][expected_file_name] - path = file['url'] - download_file(expected_file_name, f'{mirror}{path}', file['size']) - verify_checksum(expected_file_name, file['sha1'], file['size']) - webseeds = [ - f'{mirrors[0]}{path}', - f'{mirrors[1]}{path}', - ] - torrent = Torrent(path=expected_file_name, webseeds=webseeds, trackers=trackers) + dir_name = f'{db}-{date}-pages-articles-multistream' + try: + os.mkdir(dir_name) + except OSError as error: + print(error) + expected_file_name1 = f'{db}-{date}-pages-articles-multistream-index.txt.bz2' + file1 = recombine['files'][expected_file_name1] + path1 = file1['url'] + download_file(f'{dir_name}/{expected_file_name1}', f'{mirror}{path1}', file1['size']) + verify_checksum(f'{dir_name}/{expected_file_name1}', file1['sha1'], file1['size']) + expected_file_name2 = f'{db}-{date}-pages-articles-multistream.xml.bz2' + file2 = recombine['files'][expected_file_name2] + path2 = file2['url'] + download_file(f'{dir_name}/{expected_file_name2}', f'{mirror}{path2}', file2['size']) + verify_checksum(f'{dir_name}/{expected_file_name2}', file2['sha1'], file2['size']) + # webseeds = [ + # f'{mirrors[0]}{path}', + # f'{mirrors[1]}{path}', + # ] + torrent = Torrent(path=dir_name, trackers=trackers) print('Creating torrent file...', end='') torrent.generate() - torrent.write(f'{expected_file_name}.torrent') + torrent.write(f'{dir_name}.torrent') print('Done') @@ -76,7 +87,7 @@ def date_format(date): def verify_checksum(filename, expected, size): - print(f'Checking checksum of {filename} ', end='') + print(f'Checking checksum of {filename} ', end='', flush=True) show_progress(0) h = hashlib.sha1() one_percent = size / 100 @@ -91,9 +102,9 @@ def verify_checksum(filename, expected, size): show_progress(int(done * 100 / size)) rc = h.hexdigest() if rc != expected: - print(' Bad') + print(' Bad', flush=True) sys.exit(-1) - print(' Ok') + print(' Ok', flush=True) def get_job_state(base_uri, db, date, jobname): @@ -113,7 +124,7 @@ def download_file(filename, uri, size): headers = {'Range': f'bytes={start_from}-'} else: headers = {} - print(f'Downloading {uri} ', end='') + print(f'Downloading {uri} ', end='', flush=True) one_percent = size / 100 next_percent = start_from + one_percent show_progress(0) @@ -129,17 +140,17 @@ def download_file(filename, uri, size): if done >= next_percent: next_percent += one_percent show_progress(int(done * 100 / size)) - print(" Done") + print(" Done", flush=True) def show_progress(percent: int): is_tty = sys.stdout.isatty() if percent == 0 and is_tty: - print(' 0%', end='') + print(' 0%', end='', flush=True) elif is_tty: - print(f"\033[4D{percent:3d}%", end='') + print(f"\033[4D{percent:3d}%", end='', flush=True) elif percent != 0: - print('.', end='') + print('.', end='', flush=True) def find_last_date(base_uri, db):