User:MewBot/formbot.py

#!/usr/bin/env python
#coding: utf-8

# Copyright CodeCat 2010 - 2013

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
# 
# You should have received a copy of the GNU Lesser General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


import wikipedia, re, string, sys


class GenericFormBot:
	"""A generic class for Wiktionary form bots.
	
	This class is an abstract base class, and isn't meant to be instantiated
	directly. To use it, derive a new class from it, and override the
	generateEntries method with a proper definition, and provide a call to
	the base class constructor.
	Once you're ready to let it run, just call run() and it's all sorted.
	
	The purpose of this script is to provide automated generation of
	Wiktionary entries for inflected forms. It does this by fetching a
	Wiktionary page, then checks for the existence of certain templates on
	that page. If found, it extracts the necessary information from the
	template parameters, and passes it on to the generateEntries method,
	which generates the forms (just as the templates themselves do). The
	result is then uploaded as new entries.
	
	It will either create a new page or append a new section to the
	page. It will skip the page if it already contains a section of the same
	type as the one being created.
	If the page already exists, it will add {{rfc-auto}} to it,
	so that the AutoFormat bot can automatically place the section in the
	proper place on the page.
	"""
	
	def __init__(self, head, templates, langCode, langName,
		cleanupCat = None, simulation = False, force = False, verbose = False):
		"""Store the configuration of the bot.
		
		head       -- title of the page that run() fetches
		templates  -- names of the form templates to look for on that page
		langCode   -- language code (only stored by this base class)
		langName   -- language name, used as the level-2 section header
		cleanupCat -- page name that is linked as [[...]] at the end of a
		              forced append, or None to add nothing
		simulation -- if true, report what would be saved without saving
		force      -- if true, append even when a conflict is detected
		verbose    -- if true, print the text that was (or would be) added
		"""
		
		self._head = head
		self._templates = templates
		self._langCode = langCode
		self._langName = langName
		self._cleanupCat = cleanupCat
		
		self._simulation = simulation
		self._force = force
		self._verbose = verbose
	
	
	def run(self):
		"""Fetch a wiktionary entry and create entries from information in all form template occurrences."""
		
		page = wikipedia.Page(wikipedia.getSite('en', 'wiktionary'), self._head)
		
		if not page.exists():
			wikipedia.output(u"Can't find page [[{0}]].".format(self._head))
			return
		
		# Find all occurrences of form templates
		templates = getTemplates(page.get(), self._templates)
		
		if not templates:
			wikipedia.output(u"No form template on page [[{0}]].".format(self._head))
			return
		
		for temp in templates:
			wikipedia.output(u"Found: {0}".format(temp))
			name, params = parseTemplate(temp)
			self.makeEntries(name, params)
	
	
	def makeEntries(self, template, params):
		"""Create entries from information in one form template.
		
		template and params are the name and parameter dictionary of one
		template call; they are handed to generateEntries unchanged.
		"""
		
		entries = self.generateEntries(template, params)
		
		if not entries:
			return
		
		# The page the template was read from is never written to
		entries.pop(self._head, None)
		
		result = False
		
		# Every entry is saved, also after an earlier one has succeeded
		for title, entry in entries.items():
			if self.saveEntry(title, entry):
				result = True
		
		if not result:
			wikipedia.output(u"Note: Did not add any new entries from page [[{0}]].".format(self._head))
	
	
	def zipEntries(self, entries, header):
		"""Return with each entry zipped together into one string.
		
		entries maps a form to a list of definition lines. In the result
		each form maps to header followed by those lines, one per line and
		each prefixed with '# '.
		"""
		
		return dict(
			(form, header + '# ' + '\n# '.join(lines))
			for form, lines in entries.items())
	
	
	def generateEntries(self, template, params):
		"""Override this in a derived class.
		
		The result is used by makeEntries as a dictionary that maps a page
		title to the text of the entry for that page. A false result (such
		as the None returned here) means there is nothing to create.
		"""
		pass
	
	
	def saveEntry(self, title, entry):
		"""Save a new entry to Wiktionary.
		
		title is the page to write to and entry the text of the section,
		without the language header. Returns True if the page was saved (or
		would have been, in simulation mode) and False if it was skipped.
		"""
		
		page = wikipedia.Page(wikipedia.getSite('en', 'wiktionary'), title)
		# A unicode format string, so that a non-ASCII language name does not
		# raise UnicodeEncodeError
		newContents = u'=={0}==\n'.format(self._langName) + entry
		
		if page.exists():
			oldContents = page.get()
			
			if entry in oldContents:
				wikipedia.output(u"Skipped page [[{0}]]. Already contains the new entry.".format(title))
				return False
			
			newContents = '\n\n----\n' + newContents
			conflict = self._findConflict(oldContents, entry)
			
			if conflict is None:
				# Nothing in the way; let AutoFormat move the section into place
				newContents += '\n{{rfc-auto}}'
			elif self._force:
				wikipedia.output(u"WARNING: Forced append to [[{0}]]. {1}".format(title, conflict))
				
				if self._cleanupCat:
					newContents += '\n[[' + self._cleanupCat + ']]'
			else:
				wikipedia.output(u"Skipped page [[{0}]]. {1}".format(title, conflict))
				return False
			
			# NOTE(review): both edit summaries say "verb forms", whatever the
			# part of speech of the entry is.
			if self._simulation:
				wikipedia.output(u"Simulated update to page [[{0}]].".format(title))
			else:
				page.put(oldContents + newContents, comment = u'Auto-generated {0} verb forms - appended'.format(self._langName), minorEdit = False)
		else:
			if self._simulation:
				wikipedia.output(u"Simulated creating page [[{0}]].".format(title))
			else:
				page.put(newContents, comment = u'Auto-generated {0} verb forms'.format(self._langName), minorEdit = True)
		
		if self._verbose:
			wikipedia.output(u"Page [[{0}]] new contents:\n".format(title) + '-' * 60, toStdout = True)
			wikipedia.output(newContents, toStdout = True)
			wikipedia.output('*' * 60, toStdout = True)
		
		return True
	
	
	def _findConflict(self, oldContents, entry):
		"""Return why entry should not simply be appended to oldContents, or None."""
		
		langSections = getSections(oldContents, self._langName, 2)
		
		if not langSections:
			return None
		
		# There is more than one section for this language already.
		# The bot probably was here before!
		if len(langSections) > 1:
			return u"More than one {0} section on page.".format(self._langName)
		
		# There is exactly one lang section on the page
		start, end = langSections[0]
		langContents = oldContents[start:end]
		
		# Does the lang section have numbered etymologies?
		if re.search(u'=== *Etymology \\d+ *===', langContents, re.UNICODE):
			return u"{0} section has numbered etymology sections.".format(self._langName)
		
		posHeaders = self._posHeaders(entry)
		
		# Without a header there is nothing to compare against, so the
		# append is not treated as safe
		if not posHeaders:
			return u"Could not determine the part of speech of the new entry."
		
		# Does the lang section have a section for this part of speech already?
		if re.search(u'=== *(?:{0}) *==='.format(u'|'.join(posHeaders)), langContents, re.UNICODE):
			return u"Already has {0} {1} section.".format(self._langName, posHeaders[0])
		
		return None
	
	
	def _posHeaders(self, entry):
		"""Return the section names that clash with entry, or [] if it has no single-word level-3 header at its start."""
		
		match = re.match(u'===(\\w+)===', entry, re.UNICODE)
		
		if not match:
			return []
		
		pos = match.group(1)
		posHeaders = [pos, pos + u' form']
		
		# Special case... this happened to me once, so I might as well code it in
		if pos == 'Verb':
			posHeaders.append(u'Participle')
		
		return posHeaders


def getTemplates(contents, names):
	"""Get all template calls to a specific set of templates from a page.
	
	contents is the wikitext to search and names a list of template names.
	The names are joined with | and put into a regular expression as they
	are, so regex metacharacters in a name are not escaped.
	
	Returns a list with, for every call found, the text between {{ and }}
	without the surrounding whitespace: the template name followed by its
	parameters. The list is empty if nothing matches or no names are given.
	
	A call is matched up to the first }} after it, so a template nested in
	a parameter cuts the match short. A name also matches templates whose
	name merely begins with it.
	"""
	
	# An empty alternation would match every template on the page
	if not names:
		return []
	
	pattern = u'{{\\s*((?:' + u'|'.join(names) + u').*?)\\s*}}'
	
	return [match.group(1) for match in re.finditer(pattern, contents, re.UNICODE | re.DOTALL)]


def parseTemplate(template):
	"""Parse and convert parameters of a template into dictionaries.
	
	template is the text of a template call without the enclosing braces.
	HTML comments are removed, then the text is split on every |, so a |
	inside a link or a nested template also starts a new parameter.
	
	Returns a tuple (name, params). name is the template name with the
	surrounding whitespace removed. params maps parameter names to values;
	a name that is a number, and the position of an unnamed parameter, are
	stored as int keys. Parameters with an empty value are left out, but an
	empty unnamed parameter still takes up its position.
	"""
	
	template = re.sub(u'<!--.*?-->', '', template, flags = re.UNICODE | re.DOTALL)
	parts = template.split('|')
	# A call such as "{{name\n|a}}" leaves a newline behind the name
	templateName = parts[0].strip()
	
	params = {}
	paramIndex = 1
	
	for part in parts[1:]:
		pieces = part.split('=', 1)
		
		# The string contains an =
		if len(pieces) >= 2:
			paramName = pieces[0].strip()
			# Is the name a number?
			try:
				paramName = int(paramName)
			except ValueError:
				pass
			
			paramValue = pieces[1].strip()
			
			if paramValue:
				params[paramName] = paramValue
		else:
			paramValue = pieces[0].strip()
			
			if paramValue:
				params[paramIndex] = paramValue
			
			# Counted even when empty, so later parameters keep their number
			paramIndex += 1
	
	return templateName, params

def makeTemplate(name, params):
	"""Expand a template, given its name and parameters.
	
	name is the template name and params a dictionary of parameters. Every
	parameter is written as |key=value, in the iteration order of the
	dictionary. Returns the template call including the enclosing braces.
	"""
	
	# Formatting into a unicode string rather than calling str() on the key,
	# which fails for a unicode key that is not pure ASCII
	paramStrings = [u"|{0}={1}".format(key, val) for key, val in params.items()]
	
	return u"{{" + name + u"".join(paramStrings) + u"}}"

def getSections(contents, name, level, inclHeader = True):
	"""Get the start and end index of all sections of a given name.
	
	contents is the wikitext to search, name the section name and level the
	number of = signs of its header. name is put into a regular expression
	as it is, so regex metacharacters in it are not escaped.
	
	Returns a list of (start, end) tuples, one for every section found, so
	that contents[start:end] is the text of the section. The list is empty
	if there is no such section. The section runs up to the next header of
	the same level or the end of the text. start is the start of the header
	if inclHeader is true, and the start of the text after it otherwise.
	"""
	
	# The next header is only looked at, not consumed. Otherwise a section
	# that directly follows another one of the same name would be missed.
	sectionRegex = u'({0} *{1} *{0}\\s*)(.*?)(?=\\n{0} *[^\\n=]+ *{0}|$)'.format(u'=' * level, name)
	startGroup = 1 if inclHeader else 2
	
	return [
		(match.start(startGroup), match.end(2))
		for match in re.finditer(sectionRegex, contents, re.DOTALL | re.UNICODE)]