# User:Visviva/sloppy.py

import io
import re

# Level-2 "English" heading, written with or without padding spaces
# ("==English==" / "== English ==").
_ENGLISH_HEADING = re.compile(r"==[ \t]*English[ \t]*==")
# Start of the next level-2 heading: a line opening with exactly two "=".
_NEXT_L2_HEADING = re.compile(r"\n==[^=]")
# Start of a level-3..5 heading (part of speech and the like).
_POS_HEADING = re.compile(r"\n={3,5}(?=[^=])")
# Start of a definition line: "#" not followed by ":" or "*"
# (those prefixes mark example and quotation lines).
_DEFINITION_START = re.compile(r"\n#(?=[^:*])")
# "[[target|" prefix of a piped wikilink; only the display text is kept.
_PIPED_LINK_TARGET = re.compile(r"\[\[[^\]]*?\|")
# HTML comments embedded in the wikitext.
_HTML_COMMENT = re.compile(r"<!--.*?-->")

# Literal substitutions applied, IN THIS ORDER, after the two regex passes.
# The order matters: e.g. "{{" must become "(" before the "''(" clean-up,
# and "|_|" must be handled before the generic "|" replacement.
_LITERAL_REPLACEMENTS = (
    ("[[", ""), ("]]", ""),
    ("{{", "("), ("}}", ")"),
    (" of|", " of "),
    ("from=", "from "),  # surname / given-name templates
    ("context|", ""), ("qualifier|", ""), ("ib|", ""), ("italbrac|", ""),
    ("'''", '"'),  # most common use of explicit boldface
    ("''(", "("), (")''", ")"), ("(''", "("), ("'')", ")"),
    ("|_|", " "), ("|", ", "),
    ("\t", " <tab> "),  # keep the output strictly tab-separated
    ('""', '"'),
)


def _clean_definition(definition):
    """Strip wiki markup from a single definition line."""
    definition = _PIPED_LINK_TARGET.sub("", definition)
    definition = _HTML_COMMENT.sub("", definition)
    for old, new in _LITERAL_REPLACEMENTS:
        definition = definition.replace(old, new)
    return definition


def _english_definitions(after_heading):
    """Yield (part_of_speech, cleaned_definition) pairs for an English section.

    ``after_heading`` is the page text that follows the English level-2
    heading; everything from the next level-2 heading onwards is ignored.
    """
    section = _NEXT_L2_HEADING.split(after_heading, 1)[0]
    # Element 0 is whatever precedes the first sub-heading; skip it.
    for block in _POS_HEADING.split(section)[1:]:
        if "\n#" not in block:
            continue
        # The block starts with the heading text, e.g. "Noun===\n# ...".
        pos = block.split("=")[0].replace("{", "").replace("}", "").title()
        for raw in _DEFINITION_START.split(block)[1:]:
            # Only the first line belongs to the definition itself.
            yield pos, _clean_definition(raw.split("\n")[0])


def parselist(dump=None, outfile="parsed_titles.txt"):
    """Write the English definitions found in a Wiktionary dump to a TSV file.

    Each output line is ``title<TAB>part of speech<TAB>definition``, encoded
    as UTF-8.  The definition text keeps whatever whitespace followed the
    leading "#" in the wikitext.  The number of entries that have an English
    section is printed once the dump has been processed, as a sanity check.

    Args:
        dump: Object whose ``parse()`` method yields entries exposing
            ``title`` and ``text`` attributes.  When omitted, the dump at
            ``D:\\Code\\wikt.bz2`` is opened through ``xmlreader.XmlDump``.
        outfile: Path of the file to (over)write.
    """
    if dump is None:
        # Imported lazily so the module is only needed for the default dump.
        import xmlreader
        dump = xmlreader.XmlDump("D:\\Code\\wikt.bz2")

    english_count = 0
    # io.open gives the same unicode-aware text file on Python 2 and 3;
    # errors="ignore" mirrors the original encode("utf-8", "ignore").
    with io.open(outfile, "w", encoding="utf-8", errors="ignore") as out:
        # A single pass suffices: an entry is processed as soon as its
        # English heading is found, so no title set has to be built first.
        for entry in dump.parse():
            heading = _ENGLISH_HEADING.search(entry.text)
            if heading is None:
                continue
            english_count += 1
            rest = entry.text[heading.end():]
            for pos, definition in _english_definitions(rest):
                out.write(u"\t".join([entry.title, pos, definition]) + u"\n")
    print(english_count)  # Should be something sane.