User:Hippietrail/ajaxtranslinks.js

Note – after saving, you may have to bypass your browser’s cache to see the changes.

  • Mozilla / Firefox / Safari: hold Shift while clicking Reload, or press either Ctrl-F5 or Ctrl-R (Command-R on a Macintosh);
  • Konqueror and Chrome: click Reload or press F5;
  • Opera: clear the cache in Tools → Preferences;
  • Internet Explorer: hold Ctrl while clicking Refresh, or press Ctrl-F5.

// <nowiki>
// TODO {{top}} used with non-translation sections are parsed anyway
// TODO don't treat whitespace as tokens. instead include a prevWhite field for each token
// TODO is the sublang system complete?
// TODO finish the refactoring
// TODO doesn't handle non linked multiword terms like "Sri Lanka"

// TODO == Current bugs (flat dom parser) ==
// TODO can we cope with missing colon after language name?
// TODO sense numbers after terms: Finnish: jargon (1, 2)
// TODO <i><a>trad.</a></i> and <i><a>simpl.</a></i> in some Chinese entries
// TODO (<i>pf.</i>) and (<i>impf.</i>) in some Russian entries
// TODO comma between term and its gender [[rose]] Catalan

// TODO == Old bugs (manual parser) ==
// TODO Cantonese and Mandarin as sublangs can match each other
// TODO handle wikified sublanguages
// TODO handle translations that have both a main entry and subentries ([[corn]] German)
// TODO selflinked language names cause breakage

// TODO == Can we handle these cases?
// TODO single terms wikilinked in separate parts: Hungarian: [[tönköly]] [[búza]]
// TODO non-linked reflexive particles: sich [[treffen]], [[pridružiti]] se
// TODO what to do about "comments" before or after the line or one term? (foo) ''(foo)'' (''foo'') ''foo''
// TODO subentries which are not sublanguages: Chinese and Japanese entries at "Mongolian"
//        this seems to require non complicated lookahead

//////////////////////////////////////////////////////////////////
//
// functions for parsing the "other" page in raw wikitext form
//
//////////////////////////////////////////////////////////////////

// Callback run with the raw wikitext of a translated term's own page.
// Scans the wikitext for a level-2 heading naming the expected language
// (or a synonym / sublanguage variant from build_lang_patterns) and then
// reports the outcome on the translation entry's DOM.
//
//   li      - the <li> or <dd> of the translation entry; its title is
//             appended to when the term has no link
//   anchor  - the term's <a>, or a falsy value for unlinked (plain) terms
//   term    - page title that was fetched
//   lang    - language name taken from the translation table
//   sublang - sublanguage name, or null
//   page    - raw wikitext of the fetched page
function parse_other_raw(li, anchor, term, lang, sublang, page) {
  // 0: page exists without an entry for lang, 1: entry found, -1: redirect
  var state = 0;
  var foundlang = null;
  var sawContent = false;
  var lines = page.split("\n");

  if (anchor) {
    anchor.className = 'dunno';
    anchor.title = 'Looking...';
  }

  // so we can detect synonyms
  var langpats = build_lang_patterns(lang, sublang);

  for (var i = 0; i < lines.length; i++) {
    var line = lines[i];

    // A redirect directive only counts as the first content of the page.
    // Checking every line would misread definition lines such as
    // "# redirect ..." as redirects.
    if (!sawContent && /\S/.test(line)) {
      sawContent = true;
      if (/^\s*#\s*redirect\b/i.test(line)) {
        state = -1;
        break;
      }
    }

    // ==Lang== or ==[[Lang]]==, or one of its synonyms or variants?
    var m = line.match(langpats[0]) || line.match(langpats[1]);
    if (m != null) {
      foundlang = m[1];
      state = 1;
      break;
    }
  }

  // text describing the language we were looking for
  var wanted = sublang ? lang + ' (' + sublang + ')' : lang;

  // page exists and has an entry for this language
  if (state == 1) {
    if (anchor) {
      anchor.className = '';
      anchor.title = term + ' (exists in ' + foundlang + ')';
      if (anchor.href.indexOf('#') == -1) {
        // Heading ids use underscores where the heading text has spaces.
        // TODO non-ASCII language names may still need encoding to match
        //      the heading id, depending on the wiki's fragment mode
        anchor.href += '#' + foundlang.replace(/ /g, '_');
      }

    // plain terms have no <a> so output results some other way
    } else {
      li.title += '; res: ' + term + ' exists in ' + foundlang;
    }

  // page exists but has no entry for this language
  } else if (state == 0) {
    if (anchor) {
      anchor.className = 'new partlynew';
      anchor.title = term + ' exists (but not in ' + wanted + ')';
    } else {
      li.title += '; res: ' + term + ' exists but not in ' + wanted;
    }

  // page exists but is a redirect
  } else if (state == -1) {
    if (anchor) {
      anchor.className = 'redirect';
      anchor.title = term + ' exists but is a redirect';
    } else {
      li.title += '; res: ' + term + ' exists but is a redirect';
    }
  }
}

// The language name used in the translation table might be a synonym or
// spelling variant of the heading used on the foreign term's own page.
// Builds the patterns that recognise every acceptable heading.
//
//   lang    - language name from the translation table
//   sublang - sublanguage name, or a falsy value when there is none
//
// Returns [rx1, rx2]: rx1 matches a plain "==Name==" heading and rx2 a
// wikilinked "==[[Name]]==" heading. In both, capture group 1 is the name
// as it appears in the heading.
function build_lang_patterns(lang, sublang) {
  // interchangeable names: either member of a pair accepts the other
  var synonymPairs = [
    ['Anglo-Saxon', 'Old English'],
    ['Azerbaijani', 'Azeri'],
    ['Burmese', 'Myanmar'],
    ['Faeroese', 'Faroese'],
    ['Farsi', 'Persian'],
    ['Guaraní', 'Guarani'],
    ['Malay', 'Malaysian'],
    ['Maori', 'Māori'],
    ['Romani', 'Romany'],
    ['Romansch', 'Romansh'],
    ['Scots Gaelic', 'Scottish Gaelic'],
    ['Sinhala', 'Sinhalese'],
    ['Slovak', 'Slovakian'],
    ['Slovene', 'Slovenian'],
    ['Tupinambá', 'Tupinamba'],
    ['Uighur', 'Uyghur']
  ];

  // Names come from page content, so they must be matched literally:
  // escape anything that is special inside a regular expression.
  function escapeRx(text) {
    return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  }

  var names = [lang];

  // Ancient Greek: accept "Ancient", "Ancient Greek", "Greek (Ancient)"
  // and "Greek, Ancient"
  if (sublang) {
    names.push(sublang);
    names.push(sublang + ' ' + lang);
    names.push(lang + ' (' + sublang + ')');
    names.push(lang + ', ' + sublang);
  }

  // Chinese
  if (lang == 'Chinese') {
    names.push('Mandarin');
    names.push('Cantonese');
  } else if (lang == 'Mandarin' || lang == 'Cantonese') {
    names.push('Chinese');

  // CJKV/Han characters
  } else if (lang.match(/(CJKV?|Chinese) [cC]haracters/)) {
    names.push('Translingual');

  // Other synonyms and spelling variants
  } else {
    for (var i = 0; i < synonymPairs.length; i++) {
      if (synonymPairs[i][0] == lang) {
        names.push(synonymPairs[i][1]);
        break;
      }
      if (synonymPairs[i][1] == lang) {
        names.push(synonymPairs[i][0]);
        break;
      }
    }
  }

  var escaped = [];
  for (var n = 0; n < names.length; n++)
    escaped.push(escapeRx(names[n]));
  var langs = escaped.join('|');

  var rx1 = new RegExp('^==\\s*(' + langs + ')\\s*==');
  var rx2 = new RegExp('^==\\s*\\[\\[(' + langs + ')]]\\s*==');

  return [rx1, rx2];
}
 
// Use ajax to load the raw wikitext of one translated term's page and hand
// it to parse_other_raw together with the entry it belongs to.
//
//   li      - the <li> or <dd> of the translation entry
//   anchor  - the term's <a>, if it has one
//   term    - page title to fetch; nothing is requested when it is falsy
//   lang    - language name of the entry
//   sublang - sublanguage name, or null
function lookup_langs(li, anchor, term, lang, sublang) {
  function on200(req) {
    parse_other_raw(li, anchor, term, lang, sublang, req.responseText);
  }

  if (!term)
    return;

  // The title must be URL-encoded: characters such as & + # ? would
  // otherwise be read as part of the query string rather than the title.
  // mw.config is the supported way to read wgScript; the bare global is a
  // legacy export that may not exist.
  var url = mw.config.get('wgScript')
    + '?title=' + encodeURIComponent(term.replace(/ /g, '_'))
    + '&action=raw';

  // failures are deliberately ignored: the entry is simply left unmarked
  ajax(url, on200, function () {});
}
 
// Collect every <li> found inside a translations table of the page body.
// { {top}} and { {trans-top}} both result in <table class="translations">.
// Returns an array of the list items (empty when nothing is found).
// TODO misses items not wrapped in { {top}} etc
// TODO gets some related terms etc which also use { {top}} etc
function get_trans_listitems() {
  var found = [];
  var content = document.getElementById('bodyContent');

  if (content == null)
    return found;

  var tables = content.getElementsByTagName('table');

  if (tables == null)
    return found;

  for (var ti = 0; ti < tables.length; ti++) {
    var table = tables[ti];

    // only tables carrying the "translations" class are of interest
    if (table.className.search(/\btranslations\b/) == -1)
      continue;

    var items = table.getElementsByTagName('li');
    var k = 0;

    while (k < items.length) {
      found.push(items[k]);
      k++;
    }
  }

  return found;
}

// Parse the collected translation list items, provided the DOM tokenizer
// script has been loaded. Without it nothing is parsed: the alternative
// parse_translistitems_noflatdom path is disabled.
function parse_translistitems(lis) {
  if (!window.domtokenizer)
    return;

  parse_translistitems_haveflatdom(lis);
}

// for each item create parser, prefetch, parse, destroy parser
function parse_translistitems_haveflatdom(lis) {
  // State shared by every nested parser function below.
  // gItem always points to the top-level li being parsed.
  // gSubItem always points to the current level,
  //   which may be the li or one of its child dd.
  var toker = null;     // dom tokenizer (generator)
  var gItem = null;     // lis[i]:       eg *Spanish:
  var gSubItem = null;  // lis[i] or dd: eg *Serbian: *: Cyrillic
  var gDepth = 0;       // incremented/decremented around each sublanguage dd (see pp_sublang)

  // main loop: one fresh tokenizer per list item
  for (var i = 0; i < lis.length; i++) {
    gSubItem = gItem = lis[i];

    toker = new domtokenizer(gItem);

    // get first token
    toker.gettok();

    try {
      parsetransentry();
    }
    // pp_error throws the string 'WiktParseException' after flagging the
    // item; such an item is abandoned and parsing moves on to the next one.
    // Anything else is a genuine failure and is rethrown.
    // google chrome can't handle "if" here
    //catch (e if e == 'WiktParseException') {
    catch (e) {
      if (e == 'WiktParseException') {
        //consolelog('caught parser exception: ' + e);
      } else {
        throw e;
      }
    }

    toker = null;
  }

  // SUB FUNCTIONS

  // Log an "unexpected token" diagnostic followed by the current and the
  // lookahead token.
  //   level - severity label placed at the start of the message
  //   msg   - optional description of what was being parsed
  function pp_unexpected(level, msg) {
    var where = msg ? ' at ' + msg : '';

    consolelog(level + ': unexpected token' + where + ':');
    consolelog(toker.tok);
    consolelog(toker.nexttok);
  }

  // Consume the current token if it is a text token whose text equals v;
  // otherwise report a parse error.
  function pp_expect_text(v) {
    var cur = toker.tok;

    if (!cur || cur.t != 't' || cur.x != v) {
      pp_error('text "' + v + '"');
      return;
    }

    toker.gettok();
  }

  // Consume the current token if it is the start tag of an element whose
  // nodeName equals v; otherwise report a parse error.
  function pp_expect_start(v) {
    var cur = toker.tok;

    if (!cur || cur.t != 's' || cur.n.nodeName != v) {
      pp_error('<' + v + '>');
      return;
    }

    toker.gettok();
  }

  // Consume the current token if it is the end tag of an element whose
  // nodeName equals v; otherwise report a parse error.
  function pp_expect_end(v) {
    var cur = toker.tok;

    if (!cur || cur.t != 'e' || cur.n.nodeName != v) {
      pp_error('</' + v + '>');
      return;
    }

    toker.gettok();
  }

  // Flag the current (sub)item with the 'parsewarn' class and log the
  // offending tokens. Parsing carries on afterwards.
  function pp_warn(msg) {
    var marker = 'parsewarn';
    var severity = 'warning';

    addclass(gSubItem, marker);
    pp_unexpected(severity, msg);
  }

  // Flag the current (sub)item with the 'parserror' class, log the
  // offending tokens, then abort parsing of the item by throwing the
  // string 'WiktParseException' (the main loop tests for that exact value).
  function pp_error(msg) {
    var marker = 'parserror';
    var severity = 'error';

    addclass(gSubItem, marker);
    pp_unexpected(severity, msg);

    throw 'WiktParseException';
  }

  // Skip a whitespace token where none is expected, warning about it.
  // The warning is issued after the token has been consumed.
  // TODO accepts any whitespace including nbsp due to mw french punc feature
  function pp_tolerate_space() {
    var strayWhite = toker.tok.t == 't' && toker.tok.isWhite;

    if (!strayWhite)
      return;

    toker.gettok();
    pp_warn('tolerate space');
  }

  // Consume a whitespace token where one is expected; when it is absent,
  // warn and carry on without consuming anything.
  function pp_tolerate_missing_space() {
    var haveWhite = toker.tok.t == 't' && toker.tok.isWhite;

    if (!haveWhite) {
      pp_warn('tolerate missing space');
      return;
    }

    toker.gettok();
  }

  //////////////////////////////////////////////////////////

  // Parse one sublanguage entry: <dd> lang-and-terms </dd> "\n".
  // Returns the tree produced by pp_lang_and_terms.
  // This is the only place where gSubItem is changed! It is pointed at the
  // <dd> and is not pointed back at the parent afterwards.
  function pp_sublang() {
    var ddNode = toker.tok.n;
    var tree;

    pp_expect_start('DD');
    gSubItem = ddNode;

    gDepth += 1;
    tree = pp_lang_and_terms();
    gDepth -= 1;

    pp_expect_end('DD');
    pp_expect_text('\n');

    return tree;
  }

  // Parse a sublanguage list: <dl> "\n" { sublang } </dl> "\n".
  // Returns an array holding one tree per <dd> encountered.
  function pp_sublangs() {
    var trees = [];

    pp_expect_start('DL');
    pp_expect_text('\n');

    // one sublanguage for as long as another <dd> starts
    while (toker.tok.t == 's' && toker.tok.n.nodeName == 'DD')
      trees.push(pp_sublang());

    pp_expect_end('DL');
    pp_expect_text('\n');

    return trees;
  }

  // Parse "Language:" followed by either a sublanguage list (branch node)
  // or a list of terms (leaf node).
  // Returns a tree { lang, sublangs, terms }:
  //   branch nodes always have null "terms"
  //   leaf nodes always have null "sublangs"
  // TODO tolerate comma or missing colon between langname and dl
  function pp_lang_and_terms() {
    var tree = { lang: null, sublangs: null, terms: null };

    tree.lang = pp_lang();
    pp_tolerate_space();
    pp_expect_text(':');
    pp_tolerate_missing_space();  // space, or \n if followed by <dl>

    // record the language (and tooltip) on the element for branch and leaf
    gSubItem.title = tree.lang;
    gSubItem.wiktLang = tree.lang;

    // a <dl> immediately followed by an end-of-line text token opens a
    // sublanguage list
    var isBranch = toker.tok.t == 's' && toker.tok.n.nodeName == 'DL'
      && toker.nexttok.t == 't' && toker.nexttok.isEOL == true;

    if (isBranch) {
      tree.sublangs = pp_sublangs();
    } else {
      // The term array is stored on leaf elements only. After a branch has
      // been processed gSubItem still refers to its last leaf, so storing
      // anything here for a branch would overwrite that leaf's terms.
      gSubItem.wiktTerms = tree.terms = pp_terms();
    }

    return tree;
  }

  // Parse one translation entry: <li> ... </li>.
  // The entry is either a translation request (<span class="trreq">) or a
  // language with its terms / sublanguages. trreq is handled here as it is
  // not expected to occur in sublanguages. (ttbc spans are recognised by
  // pp_lang.)
  // Returns nothing; results are stored on the DOM by pp_lang_and_terms.
  function parsetransentry() {
    pp_expect_start('LI');

    if (toker.tok.t == 's' && toker.tok.n.nodeName == 'SPAN' && toker.tok.n.className == 'trreq')
      trreq();
    else {
      var tree = pp_lang_and_terms();

      consolelog(' lang: ' + tree.lang);
      if (tree.sublangs)
        // index loop: for...in would also visit any enumerable properties
        // that other scripts may have added to Array.prototype
        for (var s = 0; s < tree.sublangs.length; s++)
          consolelog('  ' + tree.sublangs[s].lang);
    }

    pp_expect_end('LI');

    return;

    // <span class="trreq"> Language: <i> ...request text... </i> </span>
    // Returns the language name.
    function trreq() {
      var lang = null;

      toker.gettok(); // <span>
      lang = parseunlinkedlangname();
      // TODO this should be done in the function above
      toker.gettok();
      pp_expect_text(':');
      pp_tolerate_missing_space();
      pp_expect_start('I');
      while (toker.tok.t == 't')
        toker.gettok(); // please add...
      pp_expect_end('I');
      pp_expect_end('SPAN');

      return lang;
    }
  }

  // Parse the language name at the start of an entry. The name may be plain
  // text, wrapped in a link (<a>), or wrapped in <span class="ttbc">.
  // Returns the name, or null when the current token starts none of these.
  // TODO handle language names with a linked part and unlinked part: [[Sorbian]] (lower)
  function pp_lang() {
    var cur = toker.tok;

    if (cur.t == 't') {
      var name = parseunlinkedlangname();
      // TODO this should be done in the function above
      toker.gettok();
      return name;
    }

    if (cur.t == 's' && cur.n.nodeName == 'A')
      return linked_or_ttbc_lang('A');

    if (cur.t == 's' && cur.n.nodeName == 'SPAN' && cur.n.className == 'ttbc')
      return linked_or_ttbc_lang('SPAN');

    return null;
  }

  // Parse a language name given as plain text and return it normalised to
  // "Modifier Language" order. Recognised shapes:
  //   Spanish
  //   Old English            (two words)
  //   Torres Strait Creole   (three words)
  //   English (Old)          -> "Old English"
  //   English, Old           -> "Old English"
  // On return the current token is the LAST token of the name; callers
  // consume it themselves.
  // TODO are there 3-part language names which use comma or parentheses?
  // TODO this function doesn't consume each token as soon as it is recognized
  function parseunlinkedlangname() {
    // Declared locally: assigning without "var" created an implicit global
    // (and is a ReferenceError in strict mode).
    // first word of lang name
    var lang = toker.tok.x;

    // following words?
    if (toker.nexttok.t == 't') {
      // Old English; English (Old)
      if (toker.nexttok.x == ' ') {
        toker.gettok(); // eat first word
        if (toker.nexttok.t == 't') {
          // English (Old)
          if (toker.nexttok.x == '(') {
            toker.gettok(); // eat space
            lang = toker.nexttok.x + ' ' + lang;  // get second word
            toker.gettok(); // eat (
            toker.gettok(); // eat second word
          // Old English; Torres Strait Creole
          } else {
            lang = lang + ' ' + toker.gettok().x; // eat space, get second word
            if (toker.nexttok.t == 't' && toker.nexttok.x == ' ') {
              toker.gettok(); // eat second word
              // Torres Strait Creole
              if (toker.nexttok.t == 't')
                lang = lang + ' ' + toker.gettok().x; // eat space, get third word
            }
          }
        }

      // English, Old or Greek, instead of Greek:
      } else if (toker.nexttok.x == ',') {
        toker.gettok(); // eat first word
        if (toker.nexttok.t == 't' && toker.nexttok.x == ' ') {
          toker.gettok(); // eat comma
          lang = toker.gettok().x + ' ' + lang; // eat space, get second word
        }
        else
          // a comma used in place of the colon: step back onto the name
          toker.ungettok();
      }
    }

    return lang;
  }

  // Parse a language name wrapped in an element: <a>Name</a> or
  // <span class="ttbc">Name</span>.
  //   tag - nodeName of the wrapping element, used to check its end tag
  // Returns the language name.
  function linked_or_ttbc_lang(tag) {
    var name;

    toker.gettok(); // <a> or <span class="ttbc">
    name = parseunlinkedlangname();
    // TODO this should be done in the function above
    toker.gettok();
    pp_expect_end(tag);

    return name;
  }

  // parse a list of terms separated by commas or semicolons
  // ("/" is tolerated as a separator too, with a warning)
  // does not handle sublanguage lists
  // returns array of terms only
  //
  // Each term is a plain object built up by the helpers below:
  //   p  - page title of the term   x  - text of the term
  //   a  - the term's <a> node (linked terms only)
  //   iw - interwiki info, set by parse_optional_interwiki
  //   tr, g, n - transliterations, gender, number (when present)
  function pp_terms() {
    var terms = [];
    var term = null;

    term = parseterm();
    if (term)
      terms.push(term);

    parseterms_rest();

    return terms;

    // SUB FUNCTIONS

    // term interwiki? ((gender translit?) | (translit gender?))?
    // Returns the term object, or null when the current token cannot start
    // a term (which is logged but does not abort the parse).
    function parseterm() {
      var term = null;

      // dispatch on what the term is wrapped in
      if (toker.tok.t == 's' && toker.tok.n.nodeName == 'A')
        term = parseterm_link();
      else if (is_script_span(toker.tok))
        term = parseterm_script();
      else if (toker.tok.t == 's' && toker.tok.n.nodeName == 'STRONG' && toker.tok.n.className == 'selflink')
        term = parseterm_selflink();
      else if (toker.tok.t == 't') {
        // an opening bracket where a term should be: treat it as a
        // transliteration belonging to a term that has no text of its own
        if (toker.tok.x == '(' || toker.tok.x == '[') {
          var term = {};
          parse_translit_gender(term);
        } else
          term = parseterm_plain();
      } else
        pp_unexpected('error', 'term');

      // if we parsed a term now parse its attributes such as gender and transliteration
      if (term) {
        parse_optional_interwiki(term);
        parse_optional_gender_translit(term);

        logterm(term);
      }

      return term;

      // write a one-line summary of the parsed term to the console
      // (relies on term.iw having been set by parse_optional_interwiki)
      function logterm(term) {
        var outputstr = '';

        if (typeof term.p != 'undefined') {
          outputstr += '  ' + term.p;

          // show the displayed text only when it differs from the page title
          if (typeof term.x != 'undefined' && term.x != term.p)
            outputstr += '|' + term.x;
        }

        if (term.tr)
          outputstr += ' (' + term.tr + ')';
        if (term.g)
          outputstr += ' ' + term.g + '.';
        if (term.n)
          outputstr += ' ' + term.n + '.';
        if (term.iw.code || term.iw.sign) {
          outputstr += ' ';
          if (term.iw.code)
            outputstr += term.iw.code;
          if (term.iw.sign)
            outputstr += term.iw.sign;
        }

        consolelog(outputstr);
      }

      // <a> (script-span | <font> | text) </a>
      // the page title comes from the link target, not from the link text
      function parseterm_link() {
        var a = toker.tok.n;
        var term = null;

        toker.gettok(); // <a>
        if (is_script_span(toker.tok))
          term = parseterm_link_script();
        else if (toker.tok.t == 's' && toker.tok.n.nodeName == 'FONT')
          term = parseterm_link_font();
        else
          term = parseterm_inner();
        pp_expect_end('A');

        term.a = a;
        term.p = title_from_anchor(a);

        return term;

        // <span> text </span> inside the link
        function parseterm_link_script() {
          var term = null;

          toker.gettok(); // <span>
          term = parseterm_inner();
          pp_expect_end('SPAN');

          return term;
        }

        // <font> text </font> inside the link
        function parseterm_link_font() {
          var term = null;

          toker.gettok(); // <font>
          term = parseterm_inner();
          pp_expect_end('FONT');

          return term;
        }
      }

      // <span> (<a> text </a> | text) </span>
      function parseterm_script() {
        var term = null;

        toker.gettok(); // <span>
        if (toker.tok.t == 's' && toker.tok.n.nodeName == 'A')
          term = parseterm_script_link();
        else
          term = parseterm_inner();
        pp_expect_end('SPAN');

        return term;

        // <a> text </a> inside the script span
        function parseterm_script_link() {
          var a = toker.tok.n;
          var term = null;

          toker.gettok(); // <a>
          term = parseterm_inner();
          pp_expect_end('A');

          term.a = a;
          term.p = title_from_anchor(a);

          return term;
        }
      }

      // <strong class="selflink"> text </strong>
      function parseterm_selflink() {
        var term = null;

        toker.gettok(); // <strong>
        term = parseterm_inner();
        pp_expect_end('STRONG');

        return term;
      }

      // get one piece of text
      // (a single text token, used as both page title and term text)
      function parseterm_plain() {
        var term = {};

        term.x = term.p = toker.tok.x;
        toker.gettok(); // term itself

        return term;
      }
    }

    // parse zero or more further terms, each introduced by a separator;
    // every term found is appended to the enclosing "terms" array
    // TODO this doesn't really have to be recursive does it?
    function parseterms_rest() {
      var term = null;
      var dorest = false;

      // comma or semicolon possibly preceded by a space
      if (toker.tok.t == 't') {
        if (toker.tok.x == ',' || toker.tok.x == ';' || toker.tok.x == '/')
          dorest = true;
        else if (toker.tok.x == ' ') {
          // the space is only consumed when a separator really follows it
          if (toker.nexttok.t == 't' && (toker.nexttok.x == ',' || toker.nexttok.x == ';' || toker.nexttok.x == '/')) {
            toker.gettok(); // eat whitespace
            dorest = true;
          }
        }

        if (dorest) {
          if (toker.tok.x == '/')
            pp_warn('tolerate / in place of , or ;');
          toker.gettok(); // eat , or ;

          pp_tolerate_missing_space();

          // next term
          term = parseterm();
          if (term)
            terms.push(term);

          // recur
          parseterms_rest();
        }
      }
    }
  }

  // interwiki?
  // Parse the optional link to the foreign-language wiktionary that may
  // follow a term after whitespace. Always sets term.iw to an object; when
  // an interwiki link is recognised it carries:
  //   code - the text found inside the link, or '^' for the old style
  //   sign - '+', '-' or '' (taken from the <sup> class where there is one)
  // Nothing is consumed unless the whitespace is followed by a <sup> or by
  // an <a class="extiw">.
  function parse_optional_interwiki(term) {
    var iw = {};

    // current (cc) style
    if (toker.tok.t == 't' && toker.tok.isWhite) {
      if (toker.nexttok.t == 's' && toker.nexttok.n.nodeName == 'SUP') {
        toker.gettok(); // space

        // class to tell us if it's a red or blue link?
        if (toker.tok.n.className == 'tpos')
          iw.sign = '+';
        else if (toker.tok.n.className == 'tneg')
          iw.sign = '-';
        else
          iw.sign = '';

        toker.gettok(); // <sup>

        // template:t style
        if (toker.tok.t == 's' && toker.tok.n.nodeName == 'A') {
          toker.gettok(); // <a>

          // blue cross-wikt link or new-style class'd sup
          if (toker.tok.t == 't' && toker.tok.x == '(') {
            toker.gettok(); // (
            iw.code = toker.tok.x;
            //iw.sign = '+';
            toker.gettok(); // language code
            pp_expect_text(')');

          // old-style red cross-wikt link which wrapped sup with span
          } else if (toker.tok.t == 's' && toker.tok.n.nodeName == 'SPAN' && toker.tok.n.className == 'new') {
            toker.gettok(); // <span class="new">
            pp_expect_text('(');
            iw.code = toker.tok.x;
            //iw.sign = '-';
            toker.gettok(); // language code
            pp_expect_text(')');
            pp_expect_end('SPAN');
          }

          pp_expect_end('A');
        }

        // template:he-translation style
        else if (toker.tok.t == 't' && toker.tok.x == '(') {
          toker.gettok(); // (
          pp_expect_start('A');
          iw.code = toker.tok.x;
          iw.sign = '';
          toker.gettok(); // he
          pp_expect_end('A');
          pp_expect_text(')');
        }

        // consumed without checking that it really is the </sup>
        toker.gettok(); // </sup>
      }

      // old ^ style
      else if (toker.nexttok.t == 's' && toker.nexttok.n.nodeName == 'A' && toker.nexttok.n.className == 'extiw') {
        toker.gettok(); // space

        toker.gettok(); // <a>
        iw.code = '^';
        iw.sign = '';
        toker.gettok(); // <^>
        pp_expect_end('A');
      }

      // ^ style used on [[swan]] Greek
      // AMBIG looks like transliteration
      // TODO since this comes between the transliteration and the gender
      // TODO we should accept all of (transliteration, gender, interwiki) in any order
      // TODO this would entail left factoring (^) and transliteration
      /*
      else if (toker.nexttok.t == 't' && toker.nexttok.x == '(') {
        toker.gettok(); // space
        if (toker.nexttok.t == 's' && toker.nexttok.n.nodeName == 'A') {

          toker.gettok(); // (
          toker.gettok(); // <a>
          iw.code = '^';
          iw.sign = '';
          toker.gettok(); // <^>
          toker.gettok(); // </a>
          toker.gettok(); // )
        }
        else
          toker.ungettok();
      }
      */
    }

    // stored even when empty so later code can read term.iw unconditionally
    term.iw = iw;
  }

  // After a term: if whitespace is followed by something that opens a
  // transliteration, parse "translit gender?"; if it is followed by a
  // gender/number span, parse "gender translit?". Otherwise consume nothing.
  // Results are stored on term. Always returns null.
  function parse_optional_gender_translit(term) {
    if (!(toker.tok.t == 't' && toker.tok.isWhite))
      return null;

    var ahead = toker.nexttok;
    var aheadIsSpan = ahead.t == 's' && ahead.n.nodeName == 'SPAN';

    // "(" or "[" as text, an IPA / Unicode / ib-brac span, or <i>
    var opensTranslit =
      (ahead.t == 't' && (ahead.x == '(' || ahead.x == '['))
      || (aheadIsSpan && (ahead.n.className == 'IPA' || ahead.n.className == 'Unicode'))
      || (aheadIsSpan && ahead.n.className == 'ib-brac')
      || (ahead.t == 's' && ahead.n.nodeName == 'I');

    if (opensTranslit) {
      toker.gettok(); // space
      parse_translit_gender(term);
    } else if (aheadIsSpan && ahead.n.className.match(/\b(gend|numb)er\b/)) {
      toker.gettok(); // space
      parse_gender_translit(term);
    }

    return null;
  }

  // translit gender?
  // Parse a transliteration and then, when whitespace is followed by a
  // gender/number span, the gender as well.
  function parse_translit_gender(term) {
    parse_translit(term);

    // is there a gender after the transliteration?
    if (!(toker.tok.t == 't' && toker.tok.isWhite))
      return;

    var ahead = toker.nexttok;

    if (ahead.t != 's' || ahead.n.nodeName != 'SPAN'
        || !ahead.n.className.match(/\b(gend|numb)er\b/))
      return;

    toker.gettok(); // space
    parse_gender_num(term);
  }

  // transliterations
  // Parse a bracketed list of transliterations and store it, as an array
  // of strings, in term.tr. Accepted wrappers:
  //   <span class="IPA|Unicode"> ( ... ) </span>
  //   <i> ( ... ) </i>
  //   ( ... )   or   [ ... ]   as plain text
  //   { {ib}} markup: the brackets and the content each in their own span
  // Anything else is a parse error.
  function parse_translit(term) {
    // closing bracket matching the opening one; stays null for { {ib}}
    var rbrac = null;

    if (toker.tok.t == 's' && toker.tok.n.nodeName == 'SPAN'
        && (toker.tok.n.className == 'IPA' || toker.tok.n.className == 'Unicode')) {
      toker.gettok(); // <span class="IPA|Unicode">
      outer_list(term);
      pp_expect_end('SPAN');
    } else if (toker.tok.t == 's' && toker.tok.n.nodeName == 'I') {
      toker.gettok(); // <i>
      outer_list(term);
      pp_expect_end('I');
    } else if (toker.tok.x == '(' || toker.tok.x == '[')
      outer_list(term);
    else if (toker.tok.t == 's' && toker.tok.n.nodeName == 'SPAN'
        && toker.tok.n.className == 'ib-brac') {
      // TODO this won't handle more than one transliteration inside { {ib}}
      toker.gettok(); // <span class="ib-brac">
      pp_expect_text('(');
      pp_expect_end('SPAN');
      pp_expect_start('SPAN'); // class="ib-content"
      // keep the result, as outer_list does; it used to be thrown away,
      // leaving term.tr unset for this markup
      term.tr = inner_list(term);
      pp_expect_end('SPAN');   // class="ib-content"
      pp_expect_start('SPAN');
      pp_expect_text(')');
      pp_expect_end('SPAN');   // class="ib-brac"
    } else
      pp_error('transliteration');

    return;

    // => "(" , transliteration { "," , transliteration } , ")"
    // remembers which closing bracket to expect, then stores the list
    function outer_list(term) {
      if (toker.tok.x == '(') rbrac = ')';
      else if (toker.tok.x == '[') rbrac = ']';
      else pp_error('transliteration list start bracket');

      toker.gettok(); // ( or [
      term.tr = inner_list(term);
      pp_expect_text(rbrac);
    }

    // => transliteration { "," , transliteration }
    // returns the array of transliteration strings
    function inner_list(term) {
      var translits = [];
      var translit = null;

      // => transliteration
      translit = parsetranslit();
      if (translit)
        translits.push(translit);

      // => { "," , transliteration }
      parsetranslits_rest();

      return translits;

      /////////////////////////

      // lang="XX" + A + translit || A + translit || translit
      function parsetranslit() {
        var translit = null;

        // japanese may wrap kana transliterations in a font tag
        if (toker.tok.t == 's' && toker.tok.n.nodeName == 'SPAN' && toker.tok.n.className.match(/^[A-Z][A-Z]$/)) {
          translit = lang_anchor();
        } else if (toker.tok.t == 's' && toker.tok.n.nodeName == 'A') {
          translit = anchor();
        } else {
          translit = inner();
        }

        return translit;

        // <span class="XX"> <a> text </a> </span>
        function lang_anchor() {
          var translit = null;

          toker.gettok(); // <span lang="JA">
          translit = anchor();
          pp_expect_end('SPAN');

          return translit;
        }

        // <a> text </a>
        function anchor() {
          var translit = null;

          toker.gettok(); // <a>
          translit = inner();
          pp_expect_end('A');

          return translit;
        }

        // concatenate tokens up to a comma, the closing bracket or the
        // first non-text token
        function inner() {
          var translit = '';

          while (true) {
            translit += toker.tok.x;
            toker.gettok(); // transliteration itself

            // some greek transliterations use [] instead of ()
            if (toker.tok.x == ',' || toker.tok.x == rbrac || toker.tok.t != 't')
              break;
          }

          return translit;
        }
      }

      // parse zero or more further transliterations, each after a comma
      function parsetranslits_rest() {
        var translit = null;
        var dorest = false;

        // comma possibly preceded by a space
        if (toker.tok.t == 't') {
          if (toker.tok.x == ',')
            dorest = true;
          else if (toker.tok.x == ' ') {
            // the space is only consumed when a comma really follows it
            if (toker.nexttok.t == 't' && toker.nexttok.x == ',') {
              toker.gettok(); // eat whitespace
              dorest = true;
            }
          }

          if (dorest) {
            toker.gettok(); // eat ,

            pp_tolerate_missing_space();

            // next translit
            translit = parsetranslit();
            if (translit)
              translits.push(translit);

            // recur
            parsetranslits_rest();
          }
        }
      }
    }
  }

  // gender translit?
  // Parse a gender and then, when whitespace is followed by something that
  // opens a transliteration, the transliteration as well.
  // TODO sense numbers after the gender look like transliterations to the parser
  function parse_gender_translit(term) {
    parse_gender_num(term);

    // is there a transliteration after the gender?
    if (!(toker.tok.t == 't' && toker.tok.isWhite))
      return;

    var ahead = toker.nexttok;
    var aheadIsSpan = ahead.t == 's' && ahead.n.nodeName == 'SPAN';

    // "(" or "[" as text, or an IPA / Unicode / ib-brac span
    var opensTranslit =
      (ahead.t == 't' && (ahead.x == '(' || ahead.x == '['))
      || (aheadIsSpan && (ahead.n.className == 'IPA' || ahead.n.className == 'Unicode'))
      || (aheadIsSpan && ahead.n.className == 'ib-brac');

    if (!opensTranslit)
      return;

    toker.gettok(); // space
    parse_translit(term);
  }

  // gender(s) and possibly number
  // Parses one or more gender spans, optionally joined by italic commas
  // and/or a "serial-and" span, optionally followed by a number span.
  // Stores the concatenated gender text in term.g and, when a number span
  // follows, its text in term.n.
  function parse_gender_num(term) {
    var gender = null;

    // TODO we handle { {m}} but not yet ''m''
    gender = parse_gender_or_number();

    // plain comma?
    // each further gender is introduced by <i>,</i> and one more token
    if (toker.tok.t == 's' && toker.tok.n.nodeName == 'I') {
      while (true) {
        pp_expect_start('I');
        pp_expect_text(',');
        pp_expect_end('I');

        toker.gettok(); // space
        gender += parse_gender_or_number();

        if (toker.tok.t != 's' || toker.tok.n.nodeName != 'I')
          break;
      }

      // NOTE(review): once the loop above has run, a serial-comma span is
      // required here unconditionally - confirm that this matches the
      // markup the templates produce
      toker.gettok(); // <span class="serial-comma">
      pp_expect_start('I');
      pp_expect_text(',');
      pp_expect_end('I');
      pp_expect_end('SPAN');
    }

    // serial comma?
    // i.e. whitespace, <span class="serial-and"><i>and</i></span>, one more
    // token, then a further gender
    if (toker.tok.t == 't' && toker.tok.isWhite
      && toker.nexttok.t == 's' && toker.nexttok.n.nodeName == 'SPAN' && toker.nexttok.n.className == 'serial-and') {
      toker.gettok(); // space
      toker.gettok(); // <span class="serial-and">
      pp_expect_start('I');
      pp_expect_text('and');
      pp_expect_end('I');
      pp_expect_end('SPAN');
      toker.gettok(); // space

      gender += parse_gender_or_number();
    }

    // and number?
    if (toker.tok.t == 't' && toker.tok.isWhite
      && toker.nexttok.t == 's' && toker.nexttok.n.nodeName == 'SPAN' && toker.nexttok.n.className.match(/\bnumber\b/)) {

      toker.gettok(); // space
      term.n = parse_gender_or_number();
    }

    term.g = gender;

    // <span> <i> text <span> . </span> </i> </span>
    // the outer span is consumed without its class being checked here;
    // returns the text found inside the <i>
    function parse_gender_or_number() {
      var gender = null;

      toker.gettok(); // <span class="gender">
      pp_expect_start('I');
      gender = toker.tok.x;
      toker.gettok(); // gender
      pp_expect_start('SPAN');
      pp_expect_text('.');
      pp_expect_end('SPAN');
      pp_expect_end('I');
      pp_expect_end('SPAN');

      return gender;
    }
  }

  // Gather the text of a term: the current token plus every text token
  // that follows it directly. The current token is always taken, whatever
  // its type. Returns { p, x }, both set to the gathered text.
  function parseterm_inner() {
    var text = '';

    do {
      text += toker.tok.x;
      toker.gettok(); // term itself
    } while (toker.tok.t == 't');

    return { p: text, x: text };
  }
}

//////////////////////////////////////////////////////////////////////////////////////////////////////////
// Helper functions

// Write data to the browser console when the page has one; otherwise do
// nothing.
function consolelog(data) {
  if (typeof window.console == 'undefined')
    return;

  console.log(data);
}
 
// Tell whether a token is the start of a "script span". Such spans are
// mostly inserted by templates: old ones use a class of two capital
// letters, some newer ones use ISO script names which may or may not be
// prefixed by 'sc'. Only 'scHebr' and 'Deva' are recognised so far.
// Returns true or false.
// TODO add 'polytonic' here?
function is_script_span(tok) {
  if (tok.t != 's' || tok.n.nodeName != 'SPAN')
    return false;

  var cls = tok.n.className;

  if (cls.match(/^[A-Z][A-Z]$/))
    return true;

  return cls == 'scHebr' || cls == 'Deva';
}

// Get an unadorned page title from an anchor.
// The link target is used rather than the link text, because the text may
// contain optional characters such as Hebrew vowels and Latin or Old
// English macrons.
//
//   a - an <a> element (its "search" and "pathname" properties are read)
//
// Returns the decoded title with underscores turned into spaces.
function title_from_anchor(a) {
  var raw;

  // Red links (and other script-path links) carry the title in a "title"
  // query parameter. Match it as a whole parameter so that a query string
  // without one falls through to the path below.
  var m = a.search ? /[?&]title=([^&]*)/.exec(a.search) : null;

  if (m)
    raw = m[1];

  // blue link: the title is the last segment of the path
  // NOTE(review): a title that itself contains "/" would be cut short here
  else
    raw = a.pathname.substring(a.pathname.lastIndexOf('/') + 1);

  return decodeURIComponent(raw).replace(/_/g, ' ');
}

// Add a CSS class to an element which may or may not already have other
// classes. Will not add a class that's already there.
//
//   ele      - element whose className is updated in place
//   newclass - a single class name
function addclass(ele, newclass) {
  if (!ele.className) {
    ele.className = newclass;
    return;
  }

  // Compare whole class names: a word-boundary regex would wrongly treat
  // "parsewarn" as present in "parsewarn-x".
  var existing = ele.className.split(/\s+/);

  for (var i = 0; i < existing.length; i++)
    if (existing[i] == newclass)
      return;

  // class names are separated by whitespace, so one is needed here
  ele.className += ' ' + newclass;
}

// Entry point. Runs only on main-namespace pages, once the DOM is ready
// and both helper scripts (ajax helper and DOM tokenizer) have loaded.
// The namespace is read through mw.config: the bare wgNamespaceNumber
// global is a legacy export that may not exist, and referencing it then
// would throw before anything else runs.
if (mw.config.get('wgNamespaceNumber') === 0)
jQuery(document).ready(function () {
	jQuery.when(
		jQuery.getScript(mw.util.getUrl('User:Hippietrail/hippajax.js', { action: 'raw', ctype: 'text/javascript', maxage: 86400, smaxage: 86400 })),
		jQuery.getScript(mw.util.getUrl('User:Hippietrail/domtokenizer.js', { action: 'raw', ctype: 'text/javascript', maxage: 86400, smaxage: 86400 }))
	).then(function () {
		// find all the translation entries in the dom
		var lis = get_trans_listitems();

		if (lis == null)
			return;

		// parse the language name from each translation entry
		// and an array of terms for each entry; the results are stored on
		// the elements as wiktLang and wiktTerms
		parse_translistitems(lis);

		// look up the other language term for each entry
		for (var i = 0; i < lis.length; i++) {
			var item = lis[i];
			var terms = item.wiktTerms;

			if (terms && terms.length > 0)
				for (var j = 0; j < terms.length; j++)
					lookup_langs(item, terms[j].a, terms[j].p, item.wiktLang, null /*sublang*/);

			// do sublanguages: each <dd> carries its own terms, looked up
			// under the parent entry's language plus its own sublanguage
			var dds = item.getElementsByTagName('dd');

			for (var k = 0; k < dds.length; k++) {
				var sub = dds[k];
				var subTerms = sub.wiktTerms;

				if (subTerms && subTerms.length > 0)
					for (var l = 0; l < subTerms.length; l++)
						lookup_langs(sub, subTerms[l].a, subTerms[l].p, item.wiktLang, sub.wiktLang);
			}
		}
	});
});