User:Interwicket/code/reciprocal



#!/usr/bin/python
# -*- coding: utf-8  -*-
# wikipath en wiktionary User:Interwicket/code/reciprocal


"""
This bot updates iwiki links between wiktionaries

26.1.9: try adding reciprocals; can then use this in full run?

This process checks for the "Interwicket" user on the FL wikt, tries to log in,
create user page, check user status, and create a reciprocal link to match an en.wikt
link just added (or about to be added)

"""

import wikipedia
import sys
import socket
import re
import pickle
from time import time, strftime, gmtime, sleep
from mwapi import getwikitext, getedit, readapi, putedit
from iwlinks import getiwlinks, replaceiwlinks

# borrow global:
from config import usernames

import Queue
import threading
# queue of pending reciprocal-link tasks: filled by replink(), drained by the replinks() threads
toreplink = Queue.Queue() # was 35, soft limit (no maxsize now; replink() sleeps by queue size instead)
repinit = False # set True by replink() once the worker threads have been started
rthread = None # the most recently created worker thread (replink() overwrites it per thread)

# with plock: print lock, acquired around all print statements, caller can use to avoid munging lines together
plock = threading.Lock()

def srep(s):
    """Return s in escaped form: the repr of its unicode value with the first two
    characters and the last character (the quoting) sliced off."""
    quoted = repr(u'' + s)
    return quoted[2:-1]

def safe(s):
    """Alias for srep(): return s in escaped, printable form."""
    return srep(s)

class ufo:
    """Plain attribute bag: each keyword argument becomes an instance attribute."""
    def __init__(self, **k):
        for name, value in k.items():
            setattr(self, name, value)

class FLwikt:
    def __init__(self, code):
        self.lc = code
        self.lastcheck = None
        self.status = None
        self.userpage = False
        self.mainpage = ''
        # for test mode:
        self.edits = 0
        self.limit = 2.0
        self.newikt = True
        self.lockedwikt = False
        self.deletecode = False

        self.tbd = -1 # meaning "not known", 0 is valid

        try: # getting site will throw exceptions for unknown code(s)
            self.site = wikipedia.getSite(code, "wiktionary")
            if code in self.site.family.obsolete: self.lockedwikt = True
        except Exception:
            print "(code %s is not valid)" % code # avoid plock I think
            self.site = None # should not be referenced?
            self.lockedwikt = True # or doesn't exist at all
            return # rest of this is invalid

        # see if we have a login in user config, else invent it
        if code not in usernames['wiktionary']:
            usernames['wiktionary'][code] = "Interwicket"

        # other options
        self.redirs = None     # may be None, False, or True, can be tested either way
        self.attop = False
        self.oneline = False
        self.sortorder = ''

        # now decode what is in the family, so we can list it out (we don't do anything with it!)
        if self.site.language() in self.site.family.interwiki_attop: self.attop = True
        if self.site.language() in self.site.family.interwiki_on_one_line: self.oneline = True
        pf = self.site.interwiki_putfirst()
        if pf:
            if pf == self.site.family.alphabetic: self.sortorder = 'alpha by language name'
            elif pf == self.site.family.alphabetic_revised: self.sortorder = \
                'alpha by language name (revised)'
            elif pf == self.site.family.fyinterwiki: self.sortorder = 'code in special fy order'
            elif pf == self.site.family.dodde_order: self.sortorder = 'Dodde order'
            else: self.sortorder = '%s first' % (','.join(pf))            

        # specific pairings not to link
        self.nolink = [ ]

        # put these right here for now: (;-)
        if code == 'pl': self.nolink.append('ru')
        # if code in ['en', 'sv', 'sw']: self.redirs = True

class FLdict(dict):
    """Dictionary of FLwikt objects by language code; a missing code is created on lookup."""
    def __init__(self):
        pass
    def __missing__(self, code):
        # build the entry, remember it, and hand back the stored object
        entry = FLwikt(code)
        self[code] = entry
        return entry
# so we can just reference the dictionary (;-): flws[code] creates the FLwikt on first use
flws = FLdict() # FLwikt by code

# Note: It is very important that flw's are NOT created for things that aren't iwiki codes!
# Looking up any missing key creates an entry, so this takes some care on the part of
# calling code: only index flws with real wiktionary language codes.

# edit count attribute in the API userinfo reply (raw string, so \d is a regex escape)
redits = re.compile(r'editcount="(\d+)"')
# noflag hack: cached text of the noflags list and when it was last read
noflagtext = None
noflaglast = 0
# same for the list of wikts that take links to redirects
redirtext = None
redirlast = 0

# main page title in the API allmessages reply
remainpage = re.compile(r'<message name="mainpage"[^>]*>(.*?)</message>')

# safety; this was not written to be re-entrant, probably is okay, but is simple to prevent
# there is a lot of lock contention here when a process like mbwa starts
gfslock = threading.Lock()

def getflstatus(flw, nowrite = False):
  global noflagtext, noflaglast, redirtext, redirlast

  # before taking lock, can we just tell caller the status for this one?
  if flw.lockedwikt:
       flw.status = 'blocked'
       with plock: print '(wikt', flw.lc, 'is locked)'
       return flw.status

  # four hours for now
  if flw.lastcheck and flw.lastcheck > time() - (4 * 3600): return flw.status

  with gfslock:

    was = flw.status

    # if not a good status, start with test; in particular change exception to test
    if flw.status not in ['bot', 'noflag']: flw.status = "test"

    # check logged in (or not)
    # we need try/except here, take keyboard interrupt and make it status = 'exception'
    # anything else thrown will get handled
    try:
        # take print lock around this, will stall other threads,
        # we may need to respond to login prompt, and it will print messages
        with plock: flw.site.forceLogin()
    except KeyboardInterrupt:
        with plock: print "Keyboard interrupt, skipping this wikt"
        flw.status = 'exception'
        return
    except Exception, e:
        flw.status = 'exception'
        with plock: print "exception trying to login on %s:" % flw.lc, str(e)
        return

    try:
        ustat = readapi(flw.site,
             "action=query&meta=userinfo"
             "&uiprop=blockinfo|rights|editcount&format=xml")
    except Exception, e:
        with plock: print "exception trying to read user status from %s.wikt:" % flw.lc, str(e)
        flw.status = "exception"
        return "exception"

    # edit count?
    mo = redits.search(ustat)
    if mo: flw.edits = int(mo.group(1))

    # we can be bot, or blocked, or not known:

    if "<r>bot</r>" in ustat: flw.status = "bot"
    if "blockedby=" in ustat: flw.status = "blocked" # over-rides "bot", as it can be both
    if "missing=" in ustat: flw.status = "missing" # ? can get here now?

    # noflag hack
    if flw.status == 'test':
        if not noflagtext or noflaglast < time() - 3600:
            # just pick up once an hour
            try:
                nfp = wikipedia.Page(flws['en'].site, "User:Interwicket/noflags")
                with plock: print '(reading noflags list)'
                noflagtext = getwikitext(nfp, plock = plock)
                noflaglast = time()
            except Exception, e:
                with plock: print "some exception getting noflag", str(e)
                pass # use previous file text
        if noflagtext:
            if "* '''" + flw.lc + "'''" in noflagtext: flw.status = 'noflag'

    # dyn pickup of redir configuration:
    if not redirtext or redirlast < time() - 3600:
        # just pick up once an hour
        try:
            rdp = wikipedia.Page(flws['en'].site, "User:Interwicket/redirs")
            with plock: print '(reading redirs list)'
            redirtext = getwikitext(rdp, plock = plock)
            redirlast = time()
        except Exception, e:
            with plock: print "some exception getting redirs list", str(e)
            pass # use previous file text
    if redirtext:
        if "* '''" + flw.lc + "'''" in redirtext: flw.redirs = True
        else: flw.redirs = None # we don't use the "False" state at present

    # find main page title from WM "message":
    try:
        mtext = readapi(flw.site, "action=query&meta=allmessages&ammessages=mainpage&format=xml")
        mo = remainpage.search(mtext)
        flw.mainpage = mo.group(1)
    except Exception:
        flw.mainpage = '(exception)'

    if flw.status == 'test':
        if flw.lastcheck:
            flw.limit += (time() - flw.lastcheck) / 4800.0  # allow one more every 90 minutes
            flw.limit = min(flw.limit, flw.edits + 3.0) # don't accumulate too much quota
        else:
            flw.limit = flw.edits # initial state on most runs, allows one

    if flw.status != was:
        with plock: print "(status on %s.wikt is %s)" % (flw.lc, flw.status)
    if flw.status != 'exception': flw.lastcheck = time()

    # if nowrite, we are done for now (e.g. used by mbwa in intitialization)
    if nowrite and flw.status in [ 'bot', 'noflag' ] and was == None: return flw.status

    if flw.newikt and flw.status not in ['missing', 'exception']: flw.newikt = False # set up complete

  # (release gfslock)
  if flw.status != was or flw.newikt: updstatus(flw)

  return flw.status

# add or update user page on the FL wikt:

# fixed body text of the user page; adduserpage() wraps it with any templates,
# categories and iwikis found on the existing page
userpage = """'''Wiktionary interwiki 'bot'''

User "Interwicket" is the 'bot that adds interwiki (inter-language) links to entries.
It is designed for the Wiktionaries. It is not the "wikipedia bot", it is
much more efficient. It operates only in the main namespace (NS:0).

Here, user "Interwicket" will add links to all of the other wiktionaries when needed.

* If user "Interwicket" is blocked here, it will not edit (of course)
* If user "Interwicket" is given a bot flag here, it will add iwikis whenever needed

Otherwise it will operate in a test mode, doing only a very few edits, 
that can then be checked (by me, and by anyone else). Most of the possible updates will not be done
because of this limit.

:Discussion page for Interwicket is [[:en:User talk:Interwicket]].
:Code is at [[:en:User:Interwicket/code]].
:Status, number of edits, etc for each wikt at [[:en:User:Interwicket/FL status]].
:My talk page is [[:en:User talk:Robert Ullmann]].

Finally, my sincere apologies for writing this message only in English!<!-- note that all of the
text in this page is re-written by the 'bot; it is pointless to edit it. Any templates added at
the top or categories and iwikis at the bottom will be left -->

"""

# extra section appended by adduserpage() when the wikt's status is "noflag"
noflag = """
----
The bot has been configured to run here without a bot flag, but at full rate, '''not in test mode'''.
This is done for some small or inactive wiktionaries. If you are a user or admin here and would
like to see it flagged, please note on [[:en:User talk:Interwicket]] and I will resolve it.

It is sometimes hard to find the bot flag request page on various wikts; if you have one and I have
not added a request, please write me a note on [[:en:User talk:Interwicket]] with a link!

I strongly suggest that this wiktionary subscribe to one or both of the automatic approval policy or global bot policy. Please see [[m:Bot policy]].

Feel free to ask me any questions. [[en:User talk:Robert Ullmann]]
"""

def adduserpage(flw):

    if flw.lc == 'en': flw.userpage = True
    if flw.userpage: return

    page = wikipedia.Page(flw.site, "User:Interwicket")
    try:
        op = getedit(page, plock = plock)
    except wikipedia.NoPage:
        op = ''
        pass
    except wikipedia.UserBlocked:
        flw.status = 'blocked'
        updstatus(flw)
        with plock: print "apparently blocked on", flw.lc, "/ wikt may be locked"
        return
    except Exception, e:
        with plock: print "exception trying to read %s:" % page.aslink(), str(e)
        return

    wikipedia.setAction("writing user page")

    # if templates added at top of (whereever) the page (bot template, or placeholder)
    # and cats, iwikis at end (if one per line, etc), contain ':'
    utext = (u'\n'.join(re.findall(r'\{\{.*?}}', op))
             + '\n\n' + userpage
             + '\n\n' + u'\n'.join(re.findall(r'^\[\[.*?:.*?]]$', op, re.M))).strip('\n ')

    if flw.status == "noflag": utext += noflag

    try:
        page.put(utext)
        flw.userpage = True
    except Exception, e:
        with plock: print "exception trying to write %s:" % safe(page.aslink()), str(e)
        return

    if flw.status == "missing":
        flw.status = "test"
    # trying to re-read status won't work for a while!

# add a log entry, so we don't lose these in testing
# temporary, although might be expanded and kept

# log lines waiting to be written, oldest first; addlog() flushes them 20 at a time
loglines = [ ]

# serializes addlog() callers (it is called from the replinks worker threads via reptask)
loglock = threading.Lock()

def addlog(link, action):
  global loglines

  with loglock:

    # save up 20 to do in one edit:

    loglines.append('* ' + strftime("%d %B %H:%M", gmtime()) + ' [[:' + link[2:] + action[5:])

    if len(loglines) < 20: return
    loglines.reverse()

    try:
        page = wikipedia.Page(flws['en'].site, "User:Interwicket/FL log")
        text = getedit(page, plock = plock)

        k = 0
        newt = ''
        for line in text.splitlines():
            newt += line + '\n'
            if line == '----':
                for l2 in loglines:
                    newt += l2 + '\n'
                k = 1
                continue
            if k:
                k += 1
                if k > 180: break

        putedit(page, newt, comment = "log entry " + link, plock = plock)
        loglines = [ ]
    except wikipedia.NoPage:
        pass
    except Exception, e:
        with plock: print "exception writing log entry", str(e)

# update status table
# re-entrant, but might edit-conflict with itself or elide edits (has been noted)

# held by updstatus() for the whole read-modify-write of the status page
updlock = threading.Lock()

def updstatus(flw):
  """
  Rewrite this wikt's row in the table on "User:Interwicket/FL status".

  Reads the page, keeps every table row ("| " lines) except the one for flw.lc,
  appends a fresh row for flw, and writes the whole table back sorted. Nothing is
  written if the existing row already shows the same status, date, to-be-done count
  and notes. Locked wikts are not listed. A missing page is ignored; any other
  exception is printed and swallowed.
  """

  if flw.lockedwikt: return # no point in listing
  with updlock:

    try:
        page = wikipedia.Page(flws['en'].site, "User:Interwicket/FL status")
        text = getedit(page, plock = plock)

        # notes column: comma separated list of the wikt's options
        notes = ''
        if flw.redirs == True: notes += 'link to redirects, '
        if flw.redirs == False: notes += 'no links to redirects, '
        if flw.attop: notes += 'iwikis at top, '
        if flw.oneline: notes += 'on one line, '
        if flw.nolink: notes += 'no links to %s added, ' % (",".join(flw.nolink))
        if flw.sortorder: notes += 'sort %s, ' % flw.sortorder
        notes = notes.rstrip(", ")

        # day number used to provide an invisible sort key in date column
        daynumber = "%04d" % (time()/86400 - 14700)  # days since about 1 April 2010
        today = '<span style="display:none;">' + daynumber + '</span>' + \
                strftime(" %d %B", gmtime()).replace(' 0', ' ')

        # empty means "not known": the value already in the table is kept (below)
        if flw.tbd >= 0: tbdtext = "%d" % flw.tbd
        else: tbdtext = ''

        lines = []
        for line in text.splitlines():
            # keep the old lines we want:
            if not line.startswith("| "): continue
            if "'''" + flw.lc + "'''" in line:

                # columns: code, language, status, edits, as of, to be done, notes, main page
                parts = line.split('||') # (first will have the leading |)
                if len(parts) < 7: continue # (bad line? will replace it)
                uf = False # "update flag": set when some column differs
                if parts[2].strip() != flw.status: uf = True
                if not parts[4].strip().startswith(today): uf = True
                if tbdtext:
                    if parts[5].strip() != tbdtext: uf = True
                else:
                    tbdtext = parts[5].strip() # keep what was there
                if parts[6].strip() != notes: uf = True

                # if not worth updating, we are done
                if not uf: return
                # else elide this line, to be regenerated
                continue

            lines.append(line)

        lines.append(
            "| '''%s''' || {{%s|l=}} || %s || %d || %s {{subst:CURRENTTIME}} || %s || %s || %s"
            % (flw.lc, flw.lc, flw.status, flw.edits, today, tbdtext, notes, flw.mainpage) )

        # rebuild the whole page: header, table heading, rows sorted as text
        text = """{{/header}}

{| class="wikitable sortable"
! code
! language
! status
! edits
! as of
! to be done
! width = 25% | notes
! main page
|-
""" + '\n|-\n'.join(sorted(lines)) + """
|}
"""

        putedit(page, text, comment = "update status for " + flw.lc, plock = plock)
    except wikipedia.NoPage:
        pass
    except Exception, e:
        with plock: print "exception writing status table", str(e)


# main event:
def addrci(page, mysite, links = { }, redirs = { }, skips = [ ], remove = False):
    """
    page to add to
    localsite (to be always added)
    links is a dict of pages for all other links
    redirs is a dict of pages of other links that are redirects (i.e. subset of links)

    will add missing links not in redirs, will add if in redirs and allowed on FL.wikt
    will remove links that are not in links (if not "None")

    does not add or remove anything in skips

    only removes anything if remove; with incomplete list call with remove False
    """
 
    flw = flws[page.site().lang]
    if getflstatus(flw) == "blocked": return # no kidding ...

    # if not blocked, try writing/overwriting user page, could do on "missing" but
    # we want to update it on new runs
    # useful access confirmation anyway
    if not flw.userpage: adduserpage(flw)

    # valid status?
    if flw.status not in ["test", "bot", "noflag"]: return

    # test limit per run
    if flw.status == "test" and flw.edits > flw.limit:
        with plock: print "(edit limit reached for %s)" % flw.lc
        return

    mypage = wikipedia.Page(mysite, page.title())
    links = links.copy() # shallow copy
    links[mysite.lang] = mypage

    # now drop the request into a layer of threading magic:

    replink(page = page, links = links, redirs = redirs, skips = skips, remove = remove)
    return

def replink(page = None, links = { }, redirs = { }, skips = [ ], remove = False, end = False):
    """
    Hand one link task to the worker threads, starting them on first use.

    Arguments are passed through to reptask() by the workers. Call replink(end = True)
    to finish up: it queues the end marker enough times for every worker to see one.
    May be called from outside addrci.
    [the empty dict and list defaults are created once on load, but are never mutated]
    """

    global repinit, rthread

    if not repinit:
        if end:
            return # nothing running, no need to start
        for n in (1, 2, 3, 4):
            rthread = threading.Thread(target=replinks, name='add replinks %d' % n)
            rthread.start()
        repinit = True

    rtask = ufo(page = page, links = links, redirs = redirs, skips = skips, remove = remove, end = end)

    if rtask.end:
        # one for the request itself, plus one per thread; extras do not matter
        toreplink.put(rtask)
        for n in (1, 2, 3, 4):
            toreplink.put(rtask)
    else:
        # soft q limit: the longer the queue, the longer the caller waits
        sleep(toreplink.qsize())
        toreplink.put(rtask)

def replinks():

    with plock: print "(rep link thread started)"

    while True:
        rtask = toreplink.get()
        if rtask.end: break
        reptask(page = rtask.page, links = rtask.links, redirs = rtask.redirs,
                skips = rtask.skips, remove = rtask.remove)

    with plock: print "(rep link thread ended)"

# matches [[:code:title]] in an edit summary; reptask() reduces each to just the code when printing
rewpr = re.compile(r'\[\[:([a-z-]+):.*?\]\]')

# remove count page
recountpage = re.compile(r'\{\{count page\|[^\|\}]+\}\}\n?')

# held while sleeping reptick seconds before each edit, so that edits from all threads are paced
ticktock = threading.Lock()

reptick = 10.0 # default, in seconds; change with setreptick()

def setreptick(rt):
    """Set the pacing interval: the seconds reptask() sleeps before each edit."""
    global reptick
    reptick = rt

def reptask(page = None, links = { }, redirs = { }, skips = [ ], remove = False):
    """
    Bring the iwiki links of one page up to date; run by the replinks() threads.

    page    page to edit (nothing is done if None)
    links   dict by code of the links the page should have
    redirs  dict by code of links that are redirects; only added if flw.redirs is set
    skips   codes that are neither added nor removed
    remove  if true, also remove existing links whose code is not in links

    Existing links that point to a different title, or to the page's own wikt, are
    always removed. The wikt's main page is never edited. The edit summary is built
    up in act as "iwiki -removed +added -removed".

    Read and write failures are retried with a growing sleep (nap starts at 5 seconds
    and grows by half each time) until nap reaches 300. Returns None.
    """
    global reptick

    # now we have emerged from the thread magic, continue as before (:-)
    if not page: return # (?)

    flw = flws[page.site().lang]

    # we may already have page text, so use page given to us

    # some retry logic:
    done = False
    nap = 5
    while not done and nap < 300:

        try:
            text = getwikitext(page, plock = plock)
        except wikipedia.NoPage:
            with plock: print "    ... no page %s now" % safe(page.aslink())
            break
        except wikipedia.IsRedirectPage:
            with plock: print "    ... page %s is a redirect?" % safe(page.aslink())
            break
        except Exception, e:
            with plock: 
                print "    ... some exception reading %s" % safe(page.aslink()), repr(e)
                # print "(sleeping %d seconds)" % nap
            sleep(nap)
            nap += nap/2
            continue

        # links currently in the entry, by code (value is compared with the title below)
        oldlinks = getiwlinks(text, flws)

        # print "debug, oldlinks are:", repr(oldlinks)

        # small optimization:
        if not links and not oldlinks: break  # no links, none in entry

        # block edits to "main page" (!)
        if page.title() == flw.mainpage:
            with plock: print "    ... not updating %s, wikt main page"  % safe(page.aslink())
            break 

        # add/remove links

        if True: # just for left over indent

            act = "iwiki"

            # bad links, we seem to find a few, not infrequently (page moves, people adding links)
            act += ' -'
            title = page.title()
            for code in oldlinks.keys():
                if oldlinks[code] != title or code == flw.lc:
                    # full link in the summary while it is short, then just codes
                    if len(act) < 70: act += '[[:%s:%s]], ' % (code, oldlinks[code])
                    else: act += code + ', '
                    del oldlinks[code]
                    # will add valid link in next step if present
            act = act.rstrip(', -') # also drops the ' -' again if nothing was removed

            act += " +"
            for code in sorted(links):
                # but not target page:
                if code == flw.lc: continue
                if code in flw.nolink: continue  # e.g. pl->ru
                if code not in oldlinks and (flw.redirs or code not in redirs) and code not in skips:
                    if len(act) < 70: act += '[[:%s:%s]], ' % (code, title)
                    else: act += code + ', '
                    oldlinks[code] = title
            act = act.rstrip(', +') # likewise drops the ' +' if nothing was added

            if remove:
                act += ' -'
                for code in sorted(oldlinks):
                    if code not in links and code not in skips:
                         act += code + ', '
                         del oldlinks[code]
                act = act.rstrip(', -')

            # with plock: print "(debug: rtask %s action %s)" % (safe(page.aslink()), safe(act))

            if act == "iwiki": break  # nothing was done

        # oldlinks now holds the full set of links the entry should have
        newtext = replaceiwlinks(text, oldlinks, flw, flws)

        # special case for en.wikt, remove count page if we've added an iwiki:
        # leave odd variants to AF as before
        if flw.lc == 'en' and "+" in act and '{{count page|' in newtext:
            newtext, k = recountpage.subn('', newtext)
            if k: act += ", -{{count page}}"
                
        # pace to max rate, take lock and sleep
        with ticktock: sleep(reptick)

        try:
            if text != getedit(page, plock = plock):
                with plock: print "page changed during edit?", srep(page.aslink(forceInterwiki = True))
                # NOTE(review): this retry does not increase nap, so it is not bounded
                # by the nap < 300 limit -- confirm that is intended
                continue # try this again
            # page.put(newtext, comment = act)
            putedit(page, newtext, comment = act, plock = plock)
            done = True
            flw.edits += 1
            # log every test mode edit, and any edit that removed a link
            # (the count page removal alone does not count)
            if flw.status == "test" or (" -" in act and "-{" not in act): addlog(page.aslink(), act)
            with plock: print "    ... %s %s" % (srep(page.aslink(forceInterwiki = True)),
                              srep(rewpr.sub(r'\1', act[6:])))
        except Exception, e:
            if nap > 9 or '10054' not in repr(e):
                # e.g. not another box reset, do report on 3rd failure
                with plock:
                    print "    ... some exception trying to update %s" % safe(page.aslink()), str(e)
                    # print "(sleeping %d seconds)" % nap
            sleep(nap)
            nap += nap/2
            continue

    return

# test driver: running this module directly does a live status check and a live edit
if __name__ == "__main__":

    # init all the flws, getiwlinks relies on this
    # (the lookup itself creates each entry; foo is not used)
    for code in flws['en'].site.family.langs: foo = flws[code]

    # production calls from mbwa init all of them

    # test

    # flws['en'].site.forceLogin()

    with plock: print "test FL get status"
    # valid = getflstatus(flws['sw'])
    # valid = getflstatus(flws["en"])

    valid = getflstatus(flws["mg"])

    # NOTE(review): the message says "en", but the site passed as mysite is mg's own -- confirm
    with plock: print "test add en to chat on mg"
    page = wikipedia.Page(flws['mg'].site, "chat")
    addrci(page, flws['mg'].site)
  

    # the string below is a block of disabled tests, kept for reference; it is never executed
    """
    # other tests:

    valid = getflstatus(flws["fr"])
    valid = getflstatus(flws["pl"])

    # test add userpage

    with plock: print "test add user page"
    adduserpage(flws["sw"])

    # test add

    # flws['sw'].tbd = 17

    with plock: print "test add en to cat on sw"
    page = wikipedia.Page(flws['sw'].site, "cat")
    addrci(page, flws['en'].site)

    with plock: print "test add en to Mwanzo (main page) on sw"
    page = wikipedia.Page(flws['sw'].site, "Mwanzo")
    addrci(page, flws['en'].site)

    with plock: print "test add en to cat on pl"
    page = wikipedia.Page(flws['pl'].site, "cat")
    addrci(page, flws['en'].site)

    with plock: print "test add en to cat on vi"
    page = wikipedia.Page(flws['vi'].site, "cat")
    addrci(page, flws['en'].site)

    with plock: print "test add en to cat on sw, links fr, vi"
    page = wikipedia.Page(flws['sw'].site, "cat")
    links = { 'fr':wikipedia.Page(flws['fr'].site, "cat"), 
              'vi':wikipedia.Page(flws['vi'].site, "cat") }
    redirs = { }
    addrci(page, flws['en'].site, links = links, redirs = redirs)

    # should not change any entry

    # now fix foo

    with plock: print "test fix foo on en"
    page = wikipedia.Page(flws['en'].site, "foo")
    addrci(page, flws['en'].site)

    # "locked" wikt:
    with plock: print "test add en to father on as"
    page = wikipedia.Page(flws['as'].site, "father")
    addrci(page, flws['en'].site)

    # rm bad link
    with plock: print "test add en to septendecim on ko"
    page = wikipedia.Page(flws['ko'].site, "septendecim")
    addrci(page, flws['en'].site)
    """

    # queue the end markers so the worker threads exit once the queue is drained
    replink(end = True)