User:Kephir/t.love

This is a list of pages which are suspected to contain incorrectly formatted translations and should probably be run through xte.

There are 14351 items on the list (generated by checking against a known-good pattern), so I needed to split it into the subpages below.

/0, /1, /2, /3, /4, /5, /6, /7, /8, /9, /10, /11, /12, /13, /14

The script has also been run against some other dumps, giving the following numbers of results:

   25815 2012-11-04
   23993 2013-08-25
   23582 2013-09-07
   22278 2013-09-19
   21946 2013-10-02
   21837 2013-10-17
   21590 2013-11-17
   21478 2013-12-02
         2013-12-08..15: first Buttermilch run
   15595 2013-12-17
   14494 2014-04-15
   14426 2014-05-22
   14398 2014-06-09
   14351 2014-07-02
         2014-07-28: xte was updated to process even items containing {{t}}

The list was generated from the 2013-08-25 dump of Wiktionary by the following script:

#!/usr/bin/python3

# released under the WTFPL
# usage: curl http://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2 | ./scan > needs_wuv.mw
# processing the whole dump takes a couple of minutes, but little memory.

import re
import sys
import bz2
import xml.sax

# Patterns describing the "known-good" shape of a translation table.
# All of them are applied with re.match() in scanpage(), i.e. anchored at
# the start of the line.

# table delimiters: optional leading whitespace, then the template name
# followed by either "|" (arguments) or "}}" (no arguments)
# {{trans-top}} / {{trans-top-also}}
_re_trans_top  = re.compile(r'\s*\{\{trans-top(?:-also)?(\||}})')
# {{checktrans-top}} / {{ttbc-top}}
_re_ttbc_top   = re.compile(r'\s*\{\{(checktrans|ttbc)-top(\||}})')
# {{checktrans-bottom}} / {{trans-bottom}} / {{ttbc-bottom}}
_re_trans_bot  = re.compile(r'\s*\{\{(checktrans|trans|ttbc)-bottom(\||}})\s*')

# "* Name: " prefix of a top-level list item ("*" not followed by "*" or ":");
# the name may be wrapped in [[ ]] and may not contain ":" or "{".
# scanpage() validates whatever follows this prefix item by item.
# (the name is presumably the language name -- not checked here)
_re_trans_line = re.compile(r'\*(?![\*:])\s*(?:\[\[)?([^:\{]+?)(?:\]\])?\s*:\s*')
# "* {{ttbc|...}}: " or "*: {{ttbc|...}}: " prefix
_re_ttbc_line  = re.compile(r'\*:?\s*(\{\{ttbc\s*\|(.*?)}})\s*:\s*')
# "** Name: " or "*: Name: " prefix of a nested item; the name may not
# contain ":", "{" or "<"
_re_trans_sub  = re.compile(r'\*[*:]\s*(?:\[\[)?([^:\{<]+?)(?:\]\])?\s*:\s*')
# column break: {{checktrans-mid}} / {{trans-mid}} / {{ttbc-mid}}
_re_trans_mid  = re.compile(r'\s*\{\{(checktrans|trans|ttbc)-mid(\||}})\s*')
# "* {{trreq|...}}" or "*: {{trreq|...}}", followed by whitespace or end of line
_re_trans_req  = re.compile(r'\*:?\s*\{\{trreq\|(.*?)}}\s*?(?=\s|$)')
# an empty line, or a line that starts with "<!--" and ends with "-->"
_re_dummy      = re.compile(r'(?:<!--.*?-->)?$')
# "* <hiero>...</hiero>" or "*: <hiero>...</hiero>"
_re_hiero      = re.compile(r'\*:?\s*<hiero>.*?</hiero>')
# item separator used by splitdefs(): ";" or "," with any surrounding whitespace
_re_split      = re.compile(r'\s*[;,]\s*')
# one complete item (the "$" makes the pattern cover the item up to its end):
#   - an optional {{qualifier|...}} / {{i|...}}, then one of {{t|...}},
#     {{t-|...}}, {{t+|...}}, {{tø|...}}, {{t0|...}}, {{t-check|...}},
#     {{t+check|...}}, then an optional trailing {{qualifier|...}} / {{i|...}}
#   - or {{t-needed|code}} where code consists of a-z and "-"
# NOTE(review): the empty last alternative in "(t[-+ø0]?|t[-+]check|)" also
# accepts a template with an empty name, "{{|...}}" -- confirm this is intended.
_re_item       = re.compile(r'((\{\{(qualifier|i)\|[^}]*?}}\s*)?\{\{(t[-+ø0]?|t[-+]check|)\|[^}]*?}}(\s*\{\{(qualifier|i)\|[^}]*?}})?|\{\{t-needed\|[a-z\-]+}})$')

def splitdefs(defs):
	"""Split the text following a "* Name: " prefix into individual items.

	Items are separated by ";" or "," (see _re_split).  A separator that
	occurs while any of (), {}, [] or <> is still unbalanced does not end
	the item; it is kept as part of it.

	Yields every item as the exact substring of defs that it covers.
	An empty defs yields nothing.  If the brackets never balance again,
	the remaining text is yielded as one final item rather than being
	dropped, so that the caller gets a chance to reject it.
	"""
	if defs == '':
		return
	brackets = ('()', '{}', '[]', '<>')
	# one counter per bracket kind; they must not cancel each other out
	depth = [0] * len(brackets)
	item_start = 0   # offset in defs where the item being assembled begins
	piece_start = 0  # offset just after the previous separator
	for sep in _re_split.finditer(defs):
		piece = defs[piece_start:sep.start()]
		for i, (opener, closer) in enumerate(brackets):
			depth[i] += piece.count(opener) - piece.count(closer)
		piece_start = sep.end()
		if not any(depth):
			# everything is balanced, so this separator really ends an item
			yield defs[item_start:sep.start()]
			item_start = sep.end()
	# whatever follows the last real separator is the last item, whether
	# or not its brackets are balanced
	yield defs[item_start:]

def scanpage(title, text):
	"""Check the translation tables in a page against the known-good patterns.

	title is accepted but not used by the check itself; text is the page
	source.  Only lines between a -top template and the matching -bottom
	template are examined.

	Returns None if every examined line is recognised, otherwise a string
	describing the first line or translation item that is not.
	"""
	# line shapes that are accepted inside a table without looking any deeper
	passthrough = (
		_re_ttbc_line,
		_re_trans_sub,
		_re_trans_mid,
		_re_trans_req,
		_re_hiero,
		_re_dummy,
	)
	in_table = False
	for line in text.splitlines():
		# a -top template opens a table, even if one is already open
		if _re_trans_top.match(line) or _re_ttbc_top.match(line):
			in_table = True
			continue
		if not in_table:
			continue
		if _re_trans_bot.match(line):
			in_table = False
			continue

		prefix = _re_trans_line.match(line)
		if prefix is not None:
			# everything after the "* Name: " prefix is the list of items
			for item in splitdefs(line[prefix.end():]):
				if _re_item.match(item) is None:
					return 'definition not recognised: %s' % (repr(item))
			continue

		if any(pattern.match(line) for pattern in passthrough):
			continue

		return 'line not recognised: %s' % (repr(line))

	return None

class handler(xml.sax.ContentHandler):
	"""SAX content handler that runs scanpage() on each <text> element.

	The character data of <title> and <text> elements is collected; when a
	<text> element ends, the collected text is passed to scanpage() together
	with the most recently collected title, and a wiki list item is written
	to standard output if scanpage() returns a reason.

	Deriving from xml.sax.ContentHandler supplies correctly-typed no-op
	implementations of every callback that is not overridden here
	(processingInstruction, setDocumentLocator, startDocument, ...).
	"""

	def __init__(self):
		super().__init__()
		self.buf = None      # character data of the current <text> element
		self.tit = None      # character data of the current <title> element
		self.current = None  # name of the innermost open element, if any

	def startElement(self, name, attrs):
		"""Remember the element being opened and reset its buffer."""
		self.current = name
		if name == 'text':
			self.buf = ''
		elif name == 'title':
			self.tit = ''

	def characters(self, data):
		"""Append data to the buffer of the element currently open.

		The parser may deliver the content of one element in several
		chunks, hence the accumulation.
		"""
		if self.current == 'text':
			self.buf += data
		elif self.current == 'title':
			self.tit += data

	def endElement(self, name):
		"""Scan the page when <text> closes; forget it when <page> closes."""
		self.current = None
		if name == 'text':
			reason = scanpage(self.tit, self.buf)
			if reason:
				sys.stdout.write('* [[%s]] because: <tt><nowiki>%s</nowiki></tt>\n' % (self.tit, reason))
		elif name == 'page':
			self.buf = None
			self.tit = None

# the bzip2-compressed dump arrives on standard input; decompress it on the
# fly and feed it to the SAX parser, which calls back into handler
dump = bz2.BZ2File(sys.stdin.buffer)
scanner = handler()
xml.sax.parse(dump, scanner)