User:Flubot/How to find sorting problems in French categories
If we have the whole list of French words in a given category, we can check it for sorting problems, e.g. missing sort keys in some entries.
The list of words must be in a UTF-8 encoded file named unsorted_list, one entry per line, in the directory the script is run from.
Run with python find_unsorted.py > list_of_problems
Read the file list_of_problems to find possibly problematic entries.
find_unsorted.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys, re, codecs
# Report entries of a (supposedly sorted) list of French words that sort
# before the entry preceding them once accents, ligatures, hyphens and some
# punctuation have been folded away.  Input is read from the UTF-8 file
# 'unsorted_list' in the current directory; offending entries are written to
# standard output as UTF-8, one per line.

# Pattern used to extract the entry title from one line of the input list.
# As written it simply captures the whole line, minus its line terminator.
# NOTE(review): presumably the published pattern lost some surrounding
# markup; confirm against the real input format before changing it.
page_tag = re.compile('(.*)')

# Punctuation ignored when comparing entries: * . , ' ( )
spaces_tag = re.compile(r"[*.,'()]+")
# Typographic apostrophe (U+2019), ignored as well.
apost_tag = re.compile(u'\u2019')

# Characters folded before comparison and, position by position, their
# replacements.  The final pair maps the hyphen to a space.
mtg_apo = u'à,À,â,Â,é,É,è,È,ê,Ê,ë,Ë,î,Î,ï,Ï,ô,Ô,ù,Ù,û,Û,ü,Ü,ç,Ç,Æ,æ,Œ,œ,-'
mtg_se = u'a,a,a,a,e,e,e,e,e,e,e,e,i,i,i,i,o,o,u,u,u,u,u,u,c,c,ae,ae,oe,oe, '

# Translation table in the form expected by unicode/str.translate():
# code point -> replacement string.
trtable = dict(zip((ord(ch) for ch in mtg_apo.split(u',')),
                   mtg_se.split(u',')))


def sort_key(title):
    """Return the comparison key for *title*.

    The key is built by folding the characters listed in ``mtg_apo``,
    deleting the ignored punctuation and the typographic apostrophe,
    lower-casing, and trimming leading/trailing spaces.

    *title* must be a text (unicode) string.
    """
    key = title.translate(trtable)
    key = spaces_tag.sub(u'', key)
    key = apost_tag.sub(u'', key)
    return key.lower().strip(u' ')


def find_unsorted(lines):
    """Yield every title whose key is smaller than the preceding title's key.

    *lines* is any iterable of text lines (for example an open file).
    Lines that ``page_tag`` does not match, and lines whose title is empty
    or whitespace only, are skipped: a blank line is not an entry, and
    letting it take part in the comparison would both report it and hide a
    misordered entry that follows it.
    """
    previous_key = None
    for line in lines:
        match = page_tag.search(line)
        if match is None:
            continue
        title = match.group(1)
        if not title.strip():
            continue
        key = sort_key(title)
        # The first real entry has nothing to be compared with.
        if previous_key is not None and key < previous_key:
            yield title
        previous_key = key


def main():
    """Check the file 'unsorted_list' and print the problematic entries."""
    # Encode explicitly instead of changing the interpreter-wide default
    # encoding.  Python 3 exposes the byte stream as sys.stdout.buffer;
    # on Python 2 sys.stdout already accepts bytes.
    byte_stream = getattr(sys.stdout, 'buffer', sys.stdout)
    out = codecs.getwriter('utf-8')(byte_stream)
    # 'with' guarantees the input file is closed even if decoding fails.
    with codecs.open('unsorted_list', 'r', 'utf-8') as fin:
        for title in find_unsorted(fin):
            out.write(title + u'\n')


if __name__ == '__main__':
    main()