Commit e42b197

Include the Mediawiki source into the distribution archive
1 parent 5189bbe commit e42b197

File tree

Makefile
export.py

2 files changed: +85 -0 lines changed

Makefile

Lines changed: 1 addition & 0 deletions
@@ -239,3 +239,4 @@ source:
 		http://en.cppreference.com/w/ ; \
 	popd > /dev/null
 
+	./export.py --url=http://en.cppreference.com/mwiki reference/cppreference-export-ns0-ns4-ns10.xml 0 4 10
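
The new step in the source rule runs the export script added below. The trailing arguments are MediaWiki namespace indices; in a default MediaWiki configuration, namespaces 0, 4 and 10 are the Main, Project and Template namespaces, so the XML dump bundled into the distribution archive covers the wiki's article, project and template pages.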

export.py

Lines changed: 84 additions & 0 deletions
@@ -0,0 +1,84 @@
#!/usr/bin/env python3
'''
    Copyright (C) 2017 Povilas Kanapickas <[email protected]>

    This file is part of cppreference-doc

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program. If not, see http://www.gnu.org/licenses/.
'''

import argparse
import urllib.parse
import urllib.request
import json

def retrieve_page_names(root, ns_index):

    begin = None
    pages = []

    while True:
        params = {
            'action' : 'query',
            'list' : 'allpages',
            'apnamespace' : ns_index,
            'aplimit' : 500,
            'format' : 'json'
        }
        if begin is not None:
            params['apcontinue'] = begin

        url = "{0}/api.php?{1}".format(root, urllib.parse.urlencode(params))

        with urllib.request.urlopen(url) as f:
            data = json.loads(f.read().decode('utf-8'))
            pages += [ p['title'] for p in data['query']['allpages'] ]

            if ('query-continue' in data and 'allpages' in data['query-continue'] and
                    'apcontinue' in data['query-continue']['allpages']):
                begin = data['query-continue']['allpages']['apcontinue']
            else:
                return pages

def export_pages(root, pages, output_path):
    params = {
        'wpDownload' : '',
        'curonly' : 1,
        'pages' : '\n'.join(pages)
    }

    data = urllib.parse.urlencode(params)
    data = data.encode('ascii')
    url = "{0}/index.php?title=Special:Export&action=submit".format(root)

    urllib.request.urlretrieve(url, output_path, data=data)

def main():
    parser = argparse.ArgumentParser(prog='export.py')
    parser.add_argument('--url', type=str, help='The URL to the root of the MediaWiki installation')
    parser.add_argument('output_path', type=str, help='The path to the XML file to save output to')
    parser.add_argument('ns_index', type=str, nargs='+', help='The indices of the namespaces to retrieve')
    args = parser.parse_args()

    pages = []
    for ns_index in args.ns_index:
        new_pages = retrieve_page_names(args.url, ns_index)
        print("Retrieved {0} pages for namespace {1}".format(len(new_pages), ns_index))
        pages += new_pages

    pages = sorted(pages)
    export_pages(args.url, pages, args.output_path)

if __name__ == "__main__":
    main()
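
For reference, the exporter can also be run by hand; a minimal invocation mirroring the Makefile rule above (the output filename here is only an example):

./export.py --url=http://en.cppreference.com/mwiki cppreference-export-ns0-ns4-ns10.xml 0 4 10

The script first pages through the list=allpages API for each requested namespace (500 titles per request, following apcontinue continuation tokens), then posts the collected titles to Special:Export with curonly=1 so that only the latest revision of each page is written to the XML dump.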
