initial commit
This commit is contained in:
commit
11e356de8d
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
|
@ -0,0 +1 @@
|
||||||
|
.idea/workspace.xml
|
8
.idea/.gitignore
vendored
Normal file
8
.idea/.gitignore
vendored
Normal file
|
@ -0,0 +1,8 @@
|
||||||
|
# Default ignored files
|
||||||
|
/shelf/
|
||||||
|
/workspace.xml
|
||||||
|
# Datasource local storage ignored files
|
||||||
|
/../../../../:\dev\wmfb\.idea/dataSources/
|
||||||
|
/dataSources.local.xml
|
||||||
|
# Editor-based HTTP Client requests
|
||||||
|
/httpRequests/
|
6
.idea/inspectionProfiles/profiles_settings.xml
Normal file
6
.idea/inspectionProfiles/profiles_settings.xml
Normal file
|
@ -0,0 +1,6 @@
|
||||||
|
<component name="InspectionProjectProfileManager">
|
||||||
|
<settings>
|
||||||
|
<option name="USE_PROJECT_PROFILE" value="false" />
|
||||||
|
<version value="1.0" />
|
||||||
|
</settings>
|
||||||
|
</component>
|
4
.idea/misc.xml
Normal file
4
.idea/misc.xml
Normal file
|
@ -0,0 +1,4 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8 (wmfb)" project-jdk-type="Python SDK" />
|
||||||
|
</project>
|
8
.idea/modules.xml
Normal file
8
.idea/modules.xml
Normal file
|
@ -0,0 +1,8 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="ProjectModuleManager">
|
||||||
|
<modules>
|
||||||
|
<module fileurl="file://$PROJECT_DIR$/.idea/wmfb.iml" filepath="$PROJECT_DIR$/.idea/wmfb.iml" />
|
||||||
|
</modules>
|
||||||
|
</component>
|
||||||
|
</project>
|
10
.idea/wmfb.iml
Normal file
10
.idea/wmfb.iml
Normal file
|
@ -0,0 +1,10 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<module type="PYTHON_MODULE" version="4">
|
||||||
|
<component name="NewModuleRootManager">
|
||||||
|
<content url="file://$MODULE_DIR$">
|
||||||
|
<excludeFolder url="file://$MODULE_DIR$/venv" />
|
||||||
|
</content>
|
||||||
|
<orderEntry type="jdk" jdkName="Python 3.8 (wmfb)" jdkType="Python SDK" />
|
||||||
|
<orderEntry type="sourceFolder" forTests="false" />
|
||||||
|
</component>
|
||||||
|
</module>
|
167
main.py
Normal file
167
main.py
Normal file
|
@ -0,0 +1,167 @@
|
||||||
|
import sys
|
||||||
|
import re
|
||||||
|
import hashlib
|
||||||
|
import math
|
||||||
|
import getopt
|
||||||
|
import requests
|
||||||
|
import colorama
|
||||||
|
from torf import Torrent
|
||||||
|
|
||||||
|
# Path of the dump index page, appended to a mirror base URL.
index_path = '/backup-index.html'

# Path template of a dump's status document; the ':db:' and ':date:'
# placeholders are substituted in get_job_state().
status_path = '/:db:/:date:/dumpstatus.json'

# Mirror base URLs. The position in this list is the number accepted by the
# -m/--mirror option and printed by show_usage().
mirrors = [
    'https://dumps.wikimedia.org',
    'https://ftp.acc.umu.se/mirror/wikimedia.org/dumps',
    'https://dumps.wikimedia.your.org',
]

# Announce URLs handed to the generated torrent.
trackers = [
    'udp://tracker.opentrackr.org:1337/announce',
    'udp://tracker.coppersurfer.tk:6969',
    'udp://tracker.leechers-paradise.org:6969',
]

# NOTE(review): not referenced by any function visible in this file.
prev_progress = 0
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Download a Wikipedia multistream articles dump and build its torrent.

    Command line: one database name plus the optional ``-m/--mirror`` (index
    into ``mirrors``) and ``-d/--date`` (``YYYY-MM-DD``) options. When no date
    is given, the one returned by ``find_last_date`` is used.

    The dump file is downloaded into the current directory, its SHA-1 is
    checked, and ``<dump file>.torrent`` is written next to it.

    Exits with status -2 on any usage error (or ``-h``) and -1 when the
    "articlesmultistreamdumprecombine" job is not reported as done.
    """
    colorama.init()
    try:
        # sys.argv is passed whole, so args[0] below is the program name.
        opts, args = getopt.gnu_getopt(sys.argv, "hm:d:", ["help", "mirror=", "date="])
    except getopt.GetoptError:
        show_usage()
        sys.exit(-2)

    mirror = mirrors[0]
    date = ''
    for arg, value in opts:
        if arg in ("-h", "--help"):
            show_usage()
            sys.exit(-2)
        elif arg in ("-m", "--mirror"):
            # Reject non-numeric and out-of-range indices with the usage text
            # instead of a traceback; negative values are rejected too so they
            # cannot silently select a mirror from the end of the list.
            try:
                mirror_index = int(value)
            except ValueError:
                mirror_index = -1
            if not 0 <= mirror_index < len(mirrors):
                show_usage()
                sys.exit(-2)
            mirror = mirrors[mirror_index]
        elif arg in ("-d", "--date"):
            date = date_from_opt(value)

    # Program name + exactly one positional argument (the database name).
    if len(args) != 2:
        show_usage()
        sys.exit(-2)
    db = args[1]
    if date == '':
        date = find_last_date(mirror, db)

    print(f'Preparing torrent for wikipedia\'s {db} articles dump recombine dating {date_format(date)}.\n')
    recombine = get_job_state(mirror, db, date, 'articlesmultistreamdumprecombine')
    if recombine['status'] != 'done':
        print('"articles dump recombine" job is not done yet!')
        sys.exit(-1)

    expected_file_name = f'{db}-{date}-pages-articles-multistream.xml.bz2'
    file = recombine['files'][expected_file_name]
    path = file['url']
    download_file(expected_file_name, f'{mirror}{path}', file['size'])
    verify_checksum(expected_file_name, file['sha1'], file['size'])

    # Only the first two mirrors are advertised as web seeds.
    webseeds = [
        f'{mirrors[0]}{path}',
        f'{mirrors[1]}{path}',
    ]
    torrent = Torrent(path=expected_file_name, webseeds=webseeds, trackers=trackers)
    print('Creating torrent file...', end='')
    torrent.generate()
    torrent.write(f'{expected_file_name}.torrent')
    print('Done')
|
||||||
|
|
||||||
|
|
||||||
|
def date_from_opt(from_opt: str):
    """Convert a ``YYYY-MM-DD`` option value to the compact ``YYYYMMDD`` form.

    The whole value must have the ``YYYY-MM-DD`` shape; anything else
    (including a valid date followed by extra characters) prints the usage
    text and exits with status -2.

    :param from_opt: raw value of the ``-d/--date`` option.
    :return: the date with the dashes removed.
    """
    # fullmatch, not match: match only anchors at the start and would let
    # trailing garbage through into the dump URL.
    if re.fullmatch(r'\d{4}-\d{2}-\d{2}', from_opt) is None:
        show_usage()
        sys.exit(-2)
    return from_opt.replace('-', '')
|
||||||
|
|
||||||
|
|
||||||
|
def date_format(date):
    """Render a compact ``YYYYMMDD`` date string as ``YYYY-MM-DD``.

    The string is cut after the 4th and 6th characters; no validation is done.
    """
    year, month, day = date[:4], date[4:6], date[6:]
    return '-'.join((year, month, day))
|
||||||
|
|
||||||
|
|
||||||
|
def verify_checksum(filename, expected, size):
    """Check the SHA-1 of *filename* against *expected*, showing progress.

    :param filename: path of the file to hash.
    :param expected: expected SHA-1 as a hex string.
    :param size: expected file size in bytes; used only to scale the progress
        display.

    Prints ``Ok`` on success; prints ``Bad`` and exits with status -1 when the
    digest differs.
    """
    print(f'Checking checksum of {filename} ', end='')
    show_progress(0)
    digest = hashlib.sha1()
    one_percent = size / 100
    next_percent = one_percent
    done = 0
    with open(filename, 'rb') as fd:
        # Read fixed-size blocks. Iterating the file object would split the
        # binary data on b'\n', giving arbitrarily small or large chunks.
        for chunk in iter(lambda: fd.read(1024 * 1024), b''):
            digest.update(chunk)
            done += len(chunk)
            # size > 0 guards the percentage division below.
            if size > 0 and done >= next_percent:
                next_percent += one_percent
                show_progress(int(done * 100 / size))
    if digest.hexdigest() != expected:
        print(' Bad')
        sys.exit(-1)
    print(' Ok')
|
||||||
|
|
||||||
|
|
||||||
|
def get_job_state(base_uri, db, date, jobname):
    """Fetch a dump's status document and return the entry for one job.

    :param base_uri: mirror base URL.
    :param db: database name substituted for ``:db:`` in ``status_path``.
    :param date: dump date substituted for ``:date:`` in ``status_path``.
    :param jobname: key to look up under the document's ``jobs`` object.
    :return: the decoded JSON value stored at ``jobs[jobname]``.
    :raises requests.HTTPError: when the server answers with an error status.
    :raises KeyError: when the document has no ``jobs`` entry or no such job.
    """
    path = status_path.replace(':db:', db).replace(':date:', date)
    # A timeout keeps a stalled mirror from hanging the tool forever.
    page = requests.get(f'{base_uri}{path}', timeout=30)
    # Fail on 4xx/5xx here rather than with a confusing JSON decode error.
    page.raise_for_status()
    return page.json()['jobs'][jobname]
|
||||||
|
|
||||||
|
|
||||||
|
def download_file(filename, uri, size):
    """Download *uri* into *filename*, resuming a partial file if present.

    The file is opened in append mode. If it already holds exactly *size*
    bytes nothing is requested; if it holds some bytes, the remainder is
    requested with an HTTP ``Range`` header; otherwise the whole file is
    fetched. Progress is reported through ``show_progress``.

    :param filename: local path to write to.
    :param uri: full URL of the file.
    :param size: expected total size in bytes.
    :raises requests.HTTPError: when the server answers with an error status
        (nothing is written to the file in that case).
    """
    with open(filename, 'ab') as fd:
        start_from = fd.tell()
        if start_from == size:
            headers = None  # already complete: skip the request entirely
        elif start_from > 0:
            headers = {'Range': f'bytes={start_from}-'}
        else:
            headers = {}

        print(f'Downloading {uri} ', end='')
        one_percent = size / 100
        next_percent = start_from + one_percent
        show_progress(0)
        # Replay the progress already covered by the partial file.
        # size > 0 guards the division.
        if size > 0 and start_from >= one_percent:
            for p in range(1, math.ceil(start_from * 100 / size) + 1):
                show_progress(p)

        if headers is not None:
            done = start_from
            # Context manager releases the connection even on errors.
            with requests.get(uri, headers=headers, stream=True) as response:
                # Never append an HTTP error body to the dump file.
                response.raise_for_status()
                if start_from > 0 and response.status_code != 206:
                    # The server ignored the Range header and is sending the
                    # whole file; appending it would corrupt the partial
                    # download, so start over from an empty file.
                    fd.seek(0)
                    fd.truncate()
                    done = 0
                    next_percent = one_percent
                for chunk in response.iter_content(16384):
                    fd.write(chunk)
                    done += len(chunk)
                    if size > 0 and done >= next_percent:
                        next_percent += one_percent
                        show_progress(int(done * 100 / size))
    print(" Done")
|
||||||
|
|
||||||
|
|
||||||
|
def show_progress(percent: int):
    """Print one progress step to standard output.

    On a terminal the percentage is drawn as a 4-column field (``NNN%``);
    every non-zero update first moves the cursor 4 columns left so the field
    is overwritten in place. When output is not a terminal, one ``.`` is
    printed per non-zero update and nothing for 0.

    :param percent: progress value to display.
    """
    is_tty = sys.stdout.isatty()
    if is_tty:
        # The initial field must be as wide as the later ones (4 columns),
        # otherwise the first cursor-left would eat the preceding character.
        rewind = '' if percent == 0 else '\033[4D'
        # flush: the line never ends with a newline, so buffered output
        # would not become visible while the operation is running.
        print(f'{rewind}{percent:3d}%', end='', flush=True)
    elif percent != 0:
        print('.', end='', flush=True)
|
||||||
|
|
||||||
|
|
||||||
|
def find_last_date(base_uri, db):
    """Return the dump date of *db* found on the mirror's index page.

    The index page is searched for a link of the form
    ``<a href="<db>/<date>">`` and the ``<date>`` part of the first such link
    is returned.
    NOTE(review): presumably the index lists only the latest dump per
    database; confirm against the mirror's page.

    :param base_uri: mirror base URL.
    :param db: database name.
    :return: the text following ``<db>/`` in the first matching link.
    :raises requests.HTTPError: when the server answers with an error status.

    Exits with status -1 when the page has no link for *db*.
    """
    # A timeout keeps a stalled mirror from hanging the tool forever.
    page = requests.get(f'{base_uri}{index_path}', timeout=30)
    page.raise_for_status()
    # re.escape: db comes from the command line and must match literally.
    # [^"]+ stops at the closing quote; a greedy .+ could run on to a later
    # '">' on the same line.
    match = re.search(f'<a href="{re.escape(db)}/([^"]+)">', page.text)
    if match is None:
        print(f'No dump of {db} found on {base_uri}{index_path}')
        sys.exit(-1)
    return match.group(1)
|
||||||
|
|
||||||
|
|
||||||
|
def show_usage():
    """Print the usage text and the numbered list of mirrors to stdout.

    The number printed before each mirror is its position in ``mirrors``.
    """
    print('Wrong usage')
    print('Usage', sys.argv[0], '[db] [options...]')
    print('Options:')
    print(' -m, --mirror\tUse mirror number (see list below)')
    print(' -d, --date\t\tDump date (e.g. 2020-11-01)')
    print()
    print('Mirrors:')
    for i, mirror in enumerate(mirrors):
        print(i, mirror)
|
||||||
|
|
||||||
|
|
||||||
|
# Run the command-line tool only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
Loading…
Reference in New Issue
Block a user