#!/usr/bin/env python3
"""Simply turn all input files to html.

No errorchecking, so keep backups: every input file is overwritten in
place with its converted content.

It uses the mediawiki webapi, so you need to be online.

Usage: ./parse_wikipedia_files_to_html.py FILE [FILE ...]

Copyright: 2010 © Arne Babenhauserheide

License: You can use this under the GPLv3 or later,
         if you add the appropriate license files
         → http://gnu.org/licenses/gpl.html
"""

from random import random
from sys import argv
from time import sleep
from urllib.error import HTTPError, URLError
from urllib.parse import quote
from urllib.request import urlopen

from yaml import SafeLoader, load

# Paths of the files to convert, taken from the command line.
mediawiki_files = argv[1:]


def wikitext_to_html(text):
    """Parse text in mediawiki markup to html.

    The markup is percent-encoded into the query string of a request to the
    English Wikipedia ``parse`` API; the html is read from the
    ``["parse"]["text"]["*"]`` entry of the decoded reply.

    Args:
        text: mediawiki markup as a string.

    Returns:
        The html the API produced for ``text``.

    Raises:
        HTTPError, URLError: propagated from ``urlopen`` when the request
            fails.
        KeyError, TypeError: when the decoded reply does not contain the
            expected ``parse``/``text``/``*`` nesting.
    """
    # NOTE(review): this relies on the API still offering format=yaml;
    # confirm against the current MediaWiki API before depending on it.
    url = (
        "http://en.wikipedia.org/w/api.php?action=parse&format=yaml&text="
        + quote(text, safe="")
    )
    # The context manager closes the connection even if read() fails.
    with urlopen(url) as response:
        reply = response.read()
    # The reply comes from the network, so restrict the parser to plain
    # data types. Passing a Loader explicitly is also required by newer
    # PyYAML releases.
    return load(reply, Loader=SafeLoader)["parse"]["text"]["*"]


def _convert_in_place(mf):
    """Replace the mediawiki markup stored in the file ``mf`` with html.

    A failed web request is reported on stdout and leaves the file
    untouched, because the file is only reopened for writing after the
    conversion returned.
    """
    with open(mf) as f:
        text = f.read()
    # NOTE(review): both wrapper strings are effectively empty here, so the
    # output is just the file name followed by the converted text. Confirm
    # that no surrounding markup was intended.
    html_header = "" + mf + ""
    html_footer = ""
    try:
        text = wikitext_to_html(text)
    except HTTPError:
        # HTTPError is a subclass of URLError, so it has to be tested first.
        print("Error converting file", mf)
    except URLError:
        print("Server doesn’t like us :(", mf)
        # Back off a little longer after a refused or failed connection.
        sleep(10 * random())
    else:
        with open(mf, "w") as f:
            f.write(html_header)
            f.write(text)
            f.write(html_footer)


def main():
    """Convert every file named on the command line, one after another."""
    for mf in mediawiki_files:
        _convert_in_place(mf)
        # add a random wait, so the api server doesn’t kick us
        sleep(3 * random())


if __name__ == "__main__":
    main()