# Literal substitutions applied, in this exact order, to every definition
# line once the regex clean-up is done. Order matters: for example "|_|"
# has to be rewritten before the bare "|" rule sees it.
_DEFINITION_REPLACEMENTS = (
    ("[[", ""),
    ("]]", ""),
    ("{{", "("),
    ("}}", ")"),
    (" of|", " of "),
    ("from=", "from "),  # surname / given-name templates
    ("context|", ""),
    ("qualifier|", ""),
    ("ib|", ""),
    ("italbrac|", ""),
    ("'''", '"'),  # most common use of explicit boldface
    ("''(", "("),
    (")''", ")"),
    ("(''", "("),
    ("'')", ")"),
    ("|_|", " "),
    ("|", ", "),
    ("\t", " <tab> "),  # a raw tab would break the tab-separated output
    ('""', '"'),
)


def _find_english_heading(text):
    """Return the regex match for the English language heading, or None.

    Accepts the heading with or without a single space inside the equals
    signs ("==English==" and "== English =="). Both passes of parselist use
    this one rule, so a page that is counted can always be sliced afterwards.
    """
    import re  # function-scope import, like parselist's own xmlreader import

    if not text:
        return None
    return re.search(r"== ?English ?==", text)


def _clean_definition(definition):
    """Strip wiki markup from a single definition line and return the result."""
    import re

    # Piped links: drop the "[[target|" part, keeping only the label.
    definition = re.sub(r"\[\[[^\]]*?\|", "", definition)
    # HTML comments.
    definition = re.sub(r"<!--.*?-->", "", definition)
    for old, new in _DEFINITION_REPLACEMENTS:
        definition = definition.replace(old, new)
    return definition


def _iter_english_definitions(text):
    """Yield (part_of_speech, definition) pairs from the English section.

    Yields nothing when *text* has no English heading.
    """
    import re

    heading = _find_english_heading(text)
    if heading is None:
        return
    # The English section ends at the next level-2 heading: a newline, "==",
    # then any character other than "=". Without one it runs to the end.
    section = re.split(r"\n==[^=]", text[heading.end():])[0]

    # Split on level 3-5 sub-headings. Each chunk after the first starts with
    # the heading name, e.g. "Noun===" followed by the sub-section body.
    for chunk in re.split(r"\n={3,5}(?=[^=])", section)[1:]:
        if "\n#" not in chunk:
            continue  # sub-section has no numbered definitions
        pos = chunk.split("=")[0].replace("{", "").replace("}", "").title()
        # A definition starts at "\n#" unless the next character is ":" or "*".
        for raw in re.split(r"\n#(?=[^:*])", chunk)[1:]:
            yield pos, _clean_definition(raw.split("\n")[0])


def parselist(dump_path=r"D:\Code\wikt.bz2", out_path="parsed_titles.txt"):
    """Extract English definitions from an XML dump into a tab-separated file.

    The dump is read twice. The first pass collects the titles of pages that
    have an English heading and prints how many there are. The second pass
    writes one "title<TAB>part of speech<TAB>definition" line per definition.

    dump_path -- dump file handed to xmlreader.XmlDump (defaults to the
                 location that used to be hard-coded).
    out_path  -- file to write, encoded as UTF-8; overwritten if it exists.

    Entries yielded by XmlDump(...).parse() are expected to expose ``title``
    and ``text`` attributes.
    NOTE(review): xmlreader looks like the pywikibot/pywikipedia module of
    that name -- confirm.
    """
    import io
    import xmlreader

    dump = xmlreader.XmlDump(dump_path)

    english_titles = set()
    for entry in dump.parse():
        # Test the page text, not the entry object itself.
        if _find_english_heading(entry.text) is not None:
            english_titles.add(entry.title)
    print(len(english_titles))  # Should be something sane.

    # io.open behaves the same on Python 2 and 3; errors="ignore" keeps the
    # original encode("utf-8", "ignore") semantics. The with-block closes the
    # file even if parsing fails part-way through.
    with io.open(out_path, "w", encoding="utf-8", errors="ignore") as writefile:
        for entry in dump.parse():
            if entry.title not in english_titles:
                continue
            for pos, definition in _iter_english_definitions(entry.text):
                line = u"\t".join([entry.title, pos, definition])
                writefile.write(line + u"\n")