import time
import urllib2
from xml.sax.saxutils import unescape as unescape0
def unescape(text):
    """Decode XML character entities in *text* and return the result.

    xml.sax.saxutils.unescape handles &amp;, &lt; and &gt; by default;
    the extra mapping adds the quote entities that the MediaWiki API
    emits inside title="..." attributes.
    """
    # NOTE(review): the original dict literal was garbled (the entities had
    # themselves been unescaped, producing invalid syntax); reconstructed as
    # the standard quote entities.
    return unescape0(text, {"&apos;": "'", "&quot;": '"', "&#39;": "'"})
def pages_in_cat_except(category="English nouns",exceptions=["English noun forms","English plurals"],lag=10):
queryurl = "https://en.wiktionary.org/w/api.php?action=query&cmnamespace=0&list=categorymembers&cmtitle=Category:%s&format=xml&cmlimit=500" % category.replace(" ","_")
results = set()
continueparam = True
while continueparam:
if continueparam is not True:
time.sleep(lag)
thisurl = queryurl + "&cmcontinue=%s&rawcontinue=true" % continueparam
else:
thisurl = queryurl
print thisurl
page = urllib2.urlopen(thisurl,timeout=60).read()
try:
continueparam = page.split('cmcontinue="')[1].split('"')[0]
except IndexError:
continueparam = False
if "<cm " in page:
memberz = page.split("<cm ")[1:]
memberz = [x.split('title="')[1].split('"')[0] for x in memberz]
results |= set(memberz)
print "%s members found" % str(len(results))
time.sleep(lag)
print "Fetching subcats..."
subcat_url = "https://en.wiktionary.org/w/api.php?action=query&list=categorymembers&cmtitle=Category:%s&format=xml&cmlimit=500&cmtype=subcat" % category.replace(" ","_")
subcatpage = urllib2.urlopen(subcat_url,timeout=60).read()
try:
badcats = set(exceptions)# avoid doubling
subcats = [x.split('title="Category:')[1].split('"')[0] for x in subcatpage.split("<cm ")[1:]]
for subcat in subcats:
if subcat in badcats:
continue
print subcat
newpages = pages_in_cat_except(category=subcat,exceptions=badcats) # BEHOLD the power of recursion!
results |= newpages
badcats.add(subcat)
except IndexError: # no subcats
pass
results_list = [unescape(x) for x in results]
results_list.sort()
return results_list
if __name__ == "__main__":
    pages = pages_in_cat_except()
    # Context manager guarantees the file is flushed and closed even on
    # error; the original `open(...).write(...)` relied on refcounting.
    with open("wiktionary_category_members.txt", "w") as outfile:
        outfile.write("\n".join(pages))