update default trackers and add index for multistream
This commit is contained in:
parent
11e356de8d
commit
0c7cdb6a0a
8
.idea/.gitignore
generated
vendored
8
.idea/.gitignore
generated
vendored
@@ -1,8 +0,0 @@
|
|||||||
# Default ignored files
|
|
||||||
/shelf/
|
|
||||||
/workspace.xml
|
|
||||||
# Datasource local storage ignored files
|
|
||||||
/../../../../:\dev\wmfb\.idea/dataSources/
|
|
||||||
/dataSources.local.xml
|
|
||||||
# Editor-based HTTP Client requests
|
|
||||||
/httpRequests/
|
|
2
.idea/misc.xml
generated
2
.idea/misc.xml
generated
@@ -1,4 +1,4 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8"?>
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
<project version="4">
|
<project version="4">
|
||||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8 (wmfb)" project-jdk-type="Python SDK" />
|
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (wmfb)" project-jdk-type="Python SDK" />
|
||||||
</project>
|
</project>
|
6
.idea/vcs.xml
generated
Normal file
6
.idea/vcs.xml
generated
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="VcsDirectoryMappings">
|
||||||
|
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
||||||
|
</component>
|
||||||
|
</project>
|
2
.idea/wmfb.iml
generated
2
.idea/wmfb.iml
generated
@@ -4,7 +4,7 @@
|
|||||||
<content url="file://$MODULE_DIR$">
|
<content url="file://$MODULE_DIR$">
|
||||||
<excludeFolder url="file://$MODULE_DIR$/venv" />
|
<excludeFolder url="file://$MODULE_DIR$/venv" />
|
||||||
</content>
|
</content>
|
||||||
<orderEntry type="jdk" jdkName="Python 3.8 (wmfb)" jdkType="Python SDK" />
|
<orderEntry type="jdk" jdkName="Python 3.9 (wmfb)" jdkType="Python SDK" />
|
||||||
<orderEntry type="sourceFolder" forTests="false" />
|
<orderEntry type="sourceFolder" forTests="false" />
|
||||||
</component>
|
</component>
|
||||||
</module>
|
</module>
|
53
main.py
53
main.py
@@ -1,3 +1,4 @@
|
|||||||
|
import os
|
||||||
import sys
|
import sys
|
||||||
import re
|
import re
|
||||||
import hashlib
|
import hashlib
|
||||||
@@ -13,8 +14,8 @@ mirrors = ['https://dumps.wikimedia.org',
|
|||||||
'https://ftp.acc.umu.se/mirror/wikimedia.org/dumps',
|
'https://ftp.acc.umu.se/mirror/wikimedia.org/dumps',
|
||||||
'https://dumps.wikimedia.your.org']
|
'https://dumps.wikimedia.your.org']
|
||||||
trackers = ['udp://tracker.opentrackr.org:1337/announce',
|
trackers = ['udp://tracker.opentrackr.org:1337/announce',
|
||||||
'udp://tracker.coppersurfer.tk:6969',
|
'udp://tracker.openbittorrent.com:6969',
|
||||||
'udp://tracker.leechers-paradise.org:6969']
|
'udp://explodie.org:6969']
|
||||||
prev_progress = 0
|
prev_progress = 0
|
||||||
|
|
||||||
|
|
||||||
@@ -47,19 +48,29 @@ def main():
|
|||||||
if recombine['status'] != 'done':
|
if recombine['status'] != 'done':
|
||||||
print('"articles dump recombine" job is not done yet!')
|
print('"articles dump recombine" job is not done yet!')
|
||||||
sys.exit(-1)
|
sys.exit(-1)
|
||||||
expected_file_name = f'{db}-{date}-pages-articles-multistream.xml.bz2'
|
dir_name = f'{db}-{date}-pages-articles-multistream'
|
||||||
file = recombine['files'][expected_file_name]
|
try:
|
||||||
path = file['url']
|
os.mkdir(dir_name)
|
||||||
download_file(expected_file_name, f'{mirror}{path}', file['size'])
|
except OSError as error:
|
||||||
verify_checksum(expected_file_name, file['sha1'], file['size'])
|
print(error)
|
||||||
webseeds = [
|
expected_file_name1 = f'{db}-{date}-pages-articles-multistream-index.txt.bz2'
|
||||||
f'{mirrors[0]}{path}',
|
file1 = recombine['files'][expected_file_name1]
|
||||||
f'{mirrors[1]}{path}',
|
path1 = file1['url']
|
||||||
]
|
download_file(f'{dir_name}/{expected_file_name1}', f'{mirror}{path1}', file1['size'])
|
||||||
torrent = Torrent(path=expected_file_name, webseeds=webseeds, trackers=trackers)
|
verify_checksum(f'{dir_name}/{expected_file_name1}', file1['sha1'], file1['size'])
|
||||||
|
expected_file_name2 = f'{db}-{date}-pages-articles-multistream.xml.bz2'
|
||||||
|
file2 = recombine['files'][expected_file_name2]
|
||||||
|
path2 = file2['url']
|
||||||
|
download_file(f'{dir_name}/{expected_file_name2}', f'{mirror}{path2}', file2['size'])
|
||||||
|
verify_checksum(f'{dir_name}/{expected_file_name2}', file2['sha1'], file2['size'])
|
||||||
|
# webseeds = [
|
||||||
|
# f'{mirrors[0]}{path}',
|
||||||
|
# f'{mirrors[1]}{path}',
|
||||||
|
# ]
|
||||||
|
torrent = Torrent(path=dir_name, trackers=trackers)
|
||||||
print('Creating torrent file...', end='')
|
print('Creating torrent file...', end='')
|
||||||
torrent.generate()
|
torrent.generate()
|
||||||
torrent.write(f'{expected_file_name}.torrent')
|
torrent.write(f'{dir_name}.torrent')
|
||||||
print('Done')
|
print('Done')
|
||||||
|
|
||||||
|
|
||||||
@@ -76,7 +87,7 @@ def date_format(date):
|
|||||||
|
|
||||||
|
|
||||||
def verify_checksum(filename, expected, size):
|
def verify_checksum(filename, expected, size):
|
||||||
print(f'Checking checksum of {filename} ', end='')
|
print(f'Checking checksum of {filename} ', end='', flush=True)
|
||||||
show_progress(0)
|
show_progress(0)
|
||||||
h = hashlib.sha1()
|
h = hashlib.sha1()
|
||||||
one_percent = size / 100
|
one_percent = size / 100
|
||||||
@@ -91,9 +102,9 @@ def verify_checksum(filename, expected, size):
|
|||||||
show_progress(int(done * 100 / size))
|
show_progress(int(done * 100 / size))
|
||||||
rc = h.hexdigest()
|
rc = h.hexdigest()
|
||||||
if rc != expected:
|
if rc != expected:
|
||||||
print(' Bad')
|
print(' Bad', flush=True)
|
||||||
sys.exit(-1)
|
sys.exit(-1)
|
||||||
print(' Ok')
|
print(' Ok', flush=True)
|
||||||
|
|
||||||
|
|
||||||
def get_job_state(base_uri, db, date, jobname):
|
def get_job_state(base_uri, db, date, jobname):
|
||||||
@@ -113,7 +124,7 @@ def download_file(filename, uri, size):
|
|||||||
headers = {'Range': f'bytes={start_from}-'}
|
headers = {'Range': f'bytes={start_from}-'}
|
||||||
else:
|
else:
|
||||||
headers = {}
|
headers = {}
|
||||||
print(f'Downloading {uri} ', end='')
|
print(f'Downloading {uri} ', end='', flush=True)
|
||||||
one_percent = size / 100
|
one_percent = size / 100
|
||||||
next_percent = start_from + one_percent
|
next_percent = start_from + one_percent
|
||||||
show_progress(0)
|
show_progress(0)
|
||||||
@@ -129,17 +140,17 @@ def download_file(filename, uri, size):
|
|||||||
if done >= next_percent:
|
if done >= next_percent:
|
||||||
next_percent += one_percent
|
next_percent += one_percent
|
||||||
show_progress(int(done * 100 / size))
|
show_progress(int(done * 100 / size))
|
||||||
print(" Done")
|
print(" Done", flush=True)
|
||||||
|
|
||||||
|
|
||||||
def show_progress(percent: int):
|
def show_progress(percent: int):
|
||||||
is_tty = sys.stdout.isatty()
|
is_tty = sys.stdout.isatty()
|
||||||
if percent == 0 and is_tty:
|
if percent == 0 and is_tty:
|
||||||
print(' 0%', end='')
|
print(' 0%', end='', flush=True)
|
||||||
elif is_tty:
|
elif is_tty:
|
||||||
print(f"\033[4D{percent:3d}%", end='')
|
print(f"\033[4D{percent:3d}%", end='', flush=True)
|
||||||
elif percent != 0:
|
elif percent != 0:
|
||||||
print('.', end='')
|
print('.', end='', flush=True)
|
||||||
|
|
||||||
|
|
||||||
def find_last_date(base_uri, db):
|
def find_last_date(base_uri, db):
|
||||||
|
Loading…
Reference in New Issue
Block a user