update default trackers and add index for multistream

This commit is contained in:
Nicolas Dextraze 2022-01-09 16:43:28 -05:00
parent 11e356de8d
commit 0c7cdb6a0a
5 changed files with 40 additions and 31 deletions

8
.idea/.gitignore vendored
View File

@ -1,8 +0,0 @@
# Default ignored files
/shelf/
/workspace.xml
# Datasource local storage ignored files
/../../../../:\dev\wmfb\.idea/dataSources/
/dataSources.local.xml
# Editor-based HTTP Client requests
/httpRequests/

View File

@ -1,4 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8 (wmfb)" project-jdk-type="Python SDK" />
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (wmfb)" project-jdk-type="Python SDK" />
</project>

6
.idea/vcs.xml Normal file
View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

View File

@ -4,7 +4,7 @@
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/venv" />
</content>
<orderEntry type="jdk" jdkName="Python 3.8 (wmfb)" jdkType="Python SDK" />
<orderEntry type="jdk" jdkName="Python 3.9 (wmfb)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

53
main.py
View File

@ -1,3 +1,4 @@
import os
import sys
import re
import hashlib
@ -13,8 +14,8 @@ mirrors = ['https://dumps.wikimedia.org',
'https://ftp.acc.umu.se/mirror/wikimedia.org/dumps',
'https://dumps.wikimedia.your.org']
trackers = ['udp://tracker.opentrackr.org:1337/announce',
'udp://tracker.coppersurfer.tk:6969',
'udp://tracker.leechers-paradise.org:6969']
'udp://tracker.openbittorrent.com:6969',
'udp://explodie.org:6969']
prev_progress = 0
@ -47,19 +48,29 @@ def main():
if recombine['status'] != 'done':
print('"articles dump recombine" job is not done yet!')
sys.exit(-1)
expected_file_name = f'{db}-{date}-pages-articles-multistream.xml.bz2'
file = recombine['files'][expected_file_name]
path = file['url']
download_file(expected_file_name, f'{mirror}{path}', file['size'])
verify_checksum(expected_file_name, file['sha1'], file['size'])
webseeds = [
f'{mirrors[0]}{path}',
f'{mirrors[1]}{path}',
]
torrent = Torrent(path=expected_file_name, webseeds=webseeds, trackers=trackers)
dir_name = f'{db}-{date}-pages-articles-multistream'
try:
os.mkdir(dir_name)
except OSError as error:
print(error)
expected_file_name1 = f'{db}-{date}-pages-articles-multistream-index.txt.bz2'
file1 = recombine['files'][expected_file_name1]
path1 = file1['url']
download_file(f'{dir_name}/{expected_file_name1}', f'{mirror}{path1}', file1['size'])
verify_checksum(f'{dir_name}/{expected_file_name1}', file1['sha1'], file1['size'])
expected_file_name2 = f'{db}-{date}-pages-articles-multistream.xml.bz2'
file2 = recombine['files'][expected_file_name2]
path2 = file2['url']
download_file(f'{dir_name}/{expected_file_name2}', f'{mirror}{path2}', file2['size'])
verify_checksum(f'{dir_name}/{expected_file_name2}', file2['sha1'], file2['size'])
# webseeds = [
# f'{mirrors[0]}{path}',
# f'{mirrors[1]}{path}',
# ]
torrent = Torrent(path=dir_name, trackers=trackers)
print('Creating torrent file...', end='')
torrent.generate()
torrent.write(f'{expected_file_name}.torrent')
torrent.write(f'{dir_name}.torrent')
print('Done')
@ -76,7 +87,7 @@ def date_format(date):
def verify_checksum(filename, expected, size):
print(f'Checking checksum of {filename} ', end='')
print(f'Checking checksum of {filename} ', end='', flush=True)
show_progress(0)
h = hashlib.sha1()
one_percent = size / 100
@ -91,9 +102,9 @@ def verify_checksum(filename, expected, size):
show_progress(int(done * 100 / size))
rc = h.hexdigest()
if rc != expected:
print(' Bad')
print(' Bad', flush=True)
sys.exit(-1)
print(' Ok')
print(' Ok', flush=True)
def get_job_state(base_uri, db, date, jobname):
@ -113,7 +124,7 @@ def download_file(filename, uri, size):
headers = {'Range': f'bytes={start_from}-'}
else:
headers = {}
print(f'Downloading {uri} ', end='')
print(f'Downloading {uri} ', end='', flush=True)
one_percent = size / 100
next_percent = start_from + one_percent
show_progress(0)
@ -129,17 +140,17 @@ def download_file(filename, uri, size):
if done >= next_percent:
next_percent += one_percent
show_progress(int(done * 100 / size))
print(" Done")
print(" Done", flush=True)
def show_progress(percent: int):
is_tty = sys.stdout.isatty()
if percent == 0 and is_tty:
print(' 0%', end='')
print(' 0%', end='', flush=True)
elif is_tty:
print(f"\033[4D{percent:3d}%", end='')
print(f"\033[4D{percent:3d}%", end='', flush=True)
elif percent != 0:
print('.', end='')
print('.', end='', flush=True)
def find_last_date(base_uri, db):