Update default trackers and add the multistream index file to the torrent
This commit is contained in:
parent
11e356de8d
commit
0c7cdb6a0a
8
.idea/.gitignore
generated
vendored
8
.idea/.gitignore
generated
vendored
@ -1,8 +0,0 @@
|
||||
# Default ignored files
|
||||
/shelf/
|
||||
/workspace.xml
|
||||
# Datasource local storage ignored files
|
||||
/../../../../:\dev\wmfb\.idea/dataSources/
|
||||
/dataSources.local.xml
|
||||
# Editor-based HTTP Client requests
|
||||
/httpRequests/
|
2
.idea/misc.xml
generated
2
.idea/misc.xml
generated
@ -1,4 +1,4 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8 (wmfb)" project-jdk-type="Python SDK" />
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (wmfb)" project-jdk-type="Python SDK" />
|
||||
</project>
|
6
.idea/vcs.xml
generated
Normal file
6
.idea/vcs.xml
generated
Normal file
@ -0,0 +1,6 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="VcsDirectoryMappings">
|
||||
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
||||
</component>
|
||||
</project>
|
2
.idea/wmfb.iml
generated
2
.idea/wmfb.iml
generated
@ -4,7 +4,7 @@
|
||||
<content url="file://$MODULE_DIR$">
|
||||
<excludeFolder url="file://$MODULE_DIR$/venv" />
|
||||
</content>
|
||||
<orderEntry type="jdk" jdkName="Python 3.8 (wmfb)" jdkType="Python SDK" />
|
||||
<orderEntry type="jdk" jdkName="Python 3.9 (wmfb)" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
</module>
|
53
main.py
53
main.py
@ -1,3 +1,4 @@
|
||||
import os
|
||||
import sys
|
||||
import re
|
||||
import hashlib
|
||||
@ -13,8 +14,8 @@ mirrors = ['https://dumps.wikimedia.org',
|
||||
'https://ftp.acc.umu.se/mirror/wikimedia.org/dumps',
|
||||
'https://dumps.wikimedia.your.org']
|
||||
trackers = ['udp://tracker.opentrackr.org:1337/announce',
|
||||
'udp://tracker.coppersurfer.tk:6969',
|
||||
'udp://tracker.leechers-paradise.org:6969']
|
||||
'udp://tracker.openbittorrent.com:6969',
|
||||
'udp://explodie.org:6969']
|
||||
prev_progress = 0
|
||||
|
||||
|
||||
@ -47,19 +48,29 @@ def main():
|
||||
if recombine['status'] != 'done':
|
||||
print('"articles dump recombine" job is not done yet!')
|
||||
sys.exit(-1)
|
||||
expected_file_name = f'{db}-{date}-pages-articles-multistream.xml.bz2'
|
||||
file = recombine['files'][expected_file_name]
|
||||
path = file['url']
|
||||
download_file(expected_file_name, f'{mirror}{path}', file['size'])
|
||||
verify_checksum(expected_file_name, file['sha1'], file['size'])
|
||||
webseeds = [
|
||||
f'{mirrors[0]}{path}',
|
||||
f'{mirrors[1]}{path}',
|
||||
]
|
||||
torrent = Torrent(path=expected_file_name, webseeds=webseeds, trackers=trackers)
|
||||
dir_name = f'{db}-{date}-pages-articles-multistream'
|
||||
try:
|
||||
os.mkdir(dir_name)
|
||||
except OSError as error:
|
||||
print(error)
|
||||
expected_file_name1 = f'{db}-{date}-pages-articles-multistream-index.txt.bz2'
|
||||
file1 = recombine['files'][expected_file_name1]
|
||||
path1 = file1['url']
|
||||
download_file(f'{dir_name}/{expected_file_name1}', f'{mirror}{path1}', file1['size'])
|
||||
verify_checksum(f'{dir_name}/{expected_file_name1}', file1['sha1'], file1['size'])
|
||||
expected_file_name2 = f'{db}-{date}-pages-articles-multistream.xml.bz2'
|
||||
file2 = recombine['files'][expected_file_name2]
|
||||
path2 = file2['url']
|
||||
download_file(f'{dir_name}/{expected_file_name2}', f'{mirror}{path2}', file2['size'])
|
||||
verify_checksum(f'{dir_name}/{expected_file_name2}', file2['sha1'], file2['size'])
|
||||
# webseeds = [
|
||||
# f'{mirrors[0]}{path}',
|
||||
# f'{mirrors[1]}{path}',
|
||||
# ]
|
||||
torrent = Torrent(path=dir_name, trackers=trackers)
|
||||
print('Creating torrent file...', end='')
|
||||
torrent.generate()
|
||||
torrent.write(f'{expected_file_name}.torrent')
|
||||
torrent.write(f'{dir_name}.torrent')
|
||||
print('Done')
|
||||
|
||||
|
||||
@ -76,7 +87,7 @@ def date_format(date):
|
||||
|
||||
|
||||
def verify_checksum(filename, expected, size):
|
||||
print(f'Checking checksum of {filename} ', end='')
|
||||
print(f'Checking checksum of {filename} ', end='', flush=True)
|
||||
show_progress(0)
|
||||
h = hashlib.sha1()
|
||||
one_percent = size / 100
|
||||
@ -91,9 +102,9 @@ def verify_checksum(filename, expected, size):
|
||||
show_progress(int(done * 100 / size))
|
||||
rc = h.hexdigest()
|
||||
if rc != expected:
|
||||
print(' Bad')
|
||||
print(' Bad', flush=True)
|
||||
sys.exit(-1)
|
||||
print(' Ok')
|
||||
print(' Ok', flush=True)
|
||||
|
||||
|
||||
def get_job_state(base_uri, db, date, jobname):
|
||||
@ -113,7 +124,7 @@ def download_file(filename, uri, size):
|
||||
headers = {'Range': f'bytes={start_from}-'}
|
||||
else:
|
||||
headers = {}
|
||||
print(f'Downloading {uri} ', end='')
|
||||
print(f'Downloading {uri} ', end='', flush=True)
|
||||
one_percent = size / 100
|
||||
next_percent = start_from + one_percent
|
||||
show_progress(0)
|
||||
@ -129,17 +140,17 @@ def download_file(filename, uri, size):
|
||||
if done >= next_percent:
|
||||
next_percent += one_percent
|
||||
show_progress(int(done * 100 / size))
|
||||
print(" Done")
|
||||
print(" Done", flush=True)
|
||||
|
||||
|
||||
def show_progress(percent: int):
|
||||
is_tty = sys.stdout.isatty()
|
||||
if percent == 0 and is_tty:
|
||||
print(' 0%', end='')
|
||||
print(' 0%', end='', flush=True)
|
||||
elif is_tty:
|
||||
print(f"\033[4D{percent:3d}%", end='')
|
||||
print(f"\033[4D{percent:3d}%", end='', flush=True)
|
||||
elif percent != 0:
|
||||
print('.', end='')
|
||||
print('.', end='', flush=True)
|
||||
|
||||
|
||||
def find_last_date(base_uri, db):
|
||||
|
Loading…
Reference in New Issue
Block a user