diff --git a/.idea/.gitignore b/.idea/.gitignore
deleted file mode 100644
index 4786d18..0000000
--- a/.idea/.gitignore
+++ /dev/null
@@ -1,8 +0,0 @@
-# Default ignored files
-/shelf/
-/workspace.xml
-# Datasource local storage ignored files
-/../../../../:\dev\wmfb\.idea/dataSources/
-/dataSources.local.xml
-# Editor-based HTTP Client requests
-/httpRequests/
diff --git a/.idea/misc.xml b/.idea/misc.xml
index 6dc06c8..0573499 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -1,4 +1,4 @@
-
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..94a25f7
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/wmfb.iml b/.idea/wmfb.iml
index 9cf510e..ca39836 100644
--- a/.idea/wmfb.iml
+++ b/.idea/wmfb.iml
@@ -4,7 +4,7 @@
-
+
\ No newline at end of file
diff --git a/main.py b/main.py
index cb49947..f864e49 100644
--- a/main.py
+++ b/main.py
@@ -1,3 +1,4 @@
+import os
import sys
import re
import hashlib
@@ -13,8 +14,8 @@ mirrors = ['https://dumps.wikimedia.org',
'https://ftp.acc.umu.se/mirror/wikimedia.org/dumps',
'https://dumps.wikimedia.your.org']
trackers = ['udp://tracker.opentrackr.org:1337/announce',
- 'udp://tracker.coppersurfer.tk:6969',
- 'udp://tracker.leechers-paradise.org:6969']
+ 'udp://tracker.openbittorrent.com:6969',
+ 'udp://explodie.org:6969']
prev_progress = 0
@@ -47,19 +48,29 @@ def main():
if recombine['status'] != 'done':
print('"articles dump recombine" job is not done yet!')
sys.exit(-1)
- expected_file_name = f'{db}-{date}-pages-articles-multistream.xml.bz2'
- file = recombine['files'][expected_file_name]
- path = file['url']
- download_file(expected_file_name, f'{mirror}{path}', file['size'])
- verify_checksum(expected_file_name, file['sha1'], file['size'])
- webseeds = [
- f'{mirrors[0]}{path}',
- f'{mirrors[1]}{path}',
- ]
- torrent = Torrent(path=expected_file_name, webseeds=webseeds, trackers=trackers)
+ dir_name = f'{db}-{date}-pages-articles-multistream'
+ try:
+ os.mkdir(dir_name)
+ except OSError as error:
+ print(error)
+ expected_file_name1 = f'{db}-{date}-pages-articles-multistream-index.txt.bz2'
+ file1 = recombine['files'][expected_file_name1]
+ path1 = file1['url']
+ download_file(f'{dir_name}/{expected_file_name1}', f'{mirror}{path1}', file1['size'])
+ verify_checksum(f'{dir_name}/{expected_file_name1}', file1['sha1'], file1['size'])
+ expected_file_name2 = f'{db}-{date}-pages-articles-multistream.xml.bz2'
+ file2 = recombine['files'][expected_file_name2]
+ path2 = file2['url']
+ download_file(f'{dir_name}/{expected_file_name2}', f'{mirror}{path2}', file2['size'])
+ verify_checksum(f'{dir_name}/{expected_file_name2}', file2['sha1'], file2['size'])
+ # webseeds = [
+ # f'{mirrors[0]}{path}',
+ # f'{mirrors[1]}{path}',
+ # ]
+ torrent = Torrent(path=dir_name, trackers=trackers)
print('Creating torrent file...', end='')
torrent.generate()
- torrent.write(f'{expected_file_name}.torrent')
+ torrent.write(f'{dir_name}.torrent')
print('Done')
@@ -76,7 +87,7 @@ def date_format(date):
def verify_checksum(filename, expected, size):
- print(f'Checking checksum of {filename} ', end='')
+ print(f'Checking checksum of {filename} ', end='', flush=True)
show_progress(0)
h = hashlib.sha1()
one_percent = size / 100
@@ -91,9 +102,9 @@ def verify_checksum(filename, expected, size):
show_progress(int(done * 100 / size))
rc = h.hexdigest()
if rc != expected:
- print(' Bad')
+ print(' Bad', flush=True)
sys.exit(-1)
- print(' Ok')
+ print(' Ok', flush=True)
def get_job_state(base_uri, db, date, jobname):
@@ -113,7 +124,7 @@ def download_file(filename, uri, size):
headers = {'Range': f'bytes={start_from}-'}
else:
headers = {}
- print(f'Downloading {uri} ', end='')
+ print(f'Downloading {uri} ', end='', flush=True)
one_percent = size / 100
next_percent = start_from + one_percent
show_progress(0)
@@ -129,17 +140,17 @@ def download_file(filename, uri, size):
if done >= next_percent:
next_percent += one_percent
show_progress(int(done * 100 / size))
- print(" Done")
+ print(" Done", flush=True)
def show_progress(percent: int):
is_tty = sys.stdout.isatty()
if percent == 0 and is_tty:
- print(' 0%', end='')
+ print(' 0%', end='', flush=True)
elif is_tty:
- print(f"\033[4D{percent:3d}%", end='')
+ print(f"\033[4D{percent:3d}%", end='', flush=True)
elif percent != 0:
- print('.', end='')
+ print('.', end='', flush=True)
def find_last_date(base_uri, db):