MediaWiki:ExtractFirst.xsl

<?xml version="1.0" encoding="UTF-8"?>
 <!-- Extracts the first definition of a word from Wiktionary so that it can be embedded on other sites. Example request:
 http://en.wiktionary.org/w/api.php?action=parse&prop=text&page=word&format=xml&xslt=MediaWiki:extractFirst.xsl -->
 <xsl:stylesheet version="1.0"
 xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
 <xsl:output method='html'/>
 <!-- BEGIN XSLT VARIABLES TO TRANSLATE. The script further down has its own translatable strings. -->
 <!-- Text direction; written to the dir attribute of the html element. -->
 <xsl:variable name="dir">ltr</xsl:variable>
 <!-- Initial label of the #more-link anchor. -->
 <xsl:variable name="more">» More</xsl:variable>
 <!-- Bold prefix placed before an API error message. -->
 <xsl:variable name="error">Error: </xsl:variable>
 <!-- Label of the #audio-link anchor. -->
 <xsl:variable name="audio">Pronunciation info</xsl:variable>
 <!-- Content of the #audio-separator span. -->
 <xsl:variable name="audio-separator"> • </xsl:variable>
 <!-- Copyright notice; copied as markup into #copyright-notice. -->
 <xsl:variable name="copyright"> © <a href="http://en.wiktionary.org/wiki/">Wiktionary</a>. Released under <a href="http://creativecommons.org/licenses/by-sa/3.0/" rel="license copyright">CC-BY-SA 3.0</a></xsl:variable>
 <!-- Language code; used for lang/xml:lang and as the host prefix in pageName. -->
 <xsl:variable name="contentLang">en</xsl:variable>
<!-- END XSLT VARIABLES TO TRANSLATE. -->

<!-- Derived values. DO NOT TRANSLATE THESE. -->
 <!-- Site root for the content language; used as the href of the base element. -->
 <xsl:variable name="pageName" select="concat('http://', $contentLang, '.wiktionary.org')"/>
<!-- end of XSLT constants -->
 <xsl:template match="/">

 <html dir="{$dir}" lang="{$contentLang}" xml:lang="{$contentLang}">
 <head>
 <!-- Identifies the stylesheet (and its version) that generated the page. -->
 <meta name="generator" content="Wiktionary Extract XSLT 1.10-EN"/>
 <!-- Relative links (the script builds '/w/index.php?title=...' URLs) resolve against
      $pageName, and links open in a new browsing context unless they set their own target. -->
 <base target='_blank' href="{$pageName}" />
<title> Wiktionary extract</title>
 
 <!-- Styles for the elements in the body and for the markup the script injects:
      #wordThisIsFor / a.wtif1 (the looked-up word), div.exit (close box), #error (API error). -->
 <style>
 #wordThisIsFor { font-weight:bold;}
 a.wtif1  { color: black; text-decoration: none;}
 a.wtif1:hover {text-decoration: underline;}
 .disambig-see-also, .disambig-see-also-2 {display:inline;}
 #container {background-color:white; padding: 0.5em; border: solid black thin;}
 a.new {color: red;}
 #error {color: red;font-size:larger;}

 div.exit {float:right;font-weight:bold;font-family:sans-serif;border:outset;padding:0.1em}
 div.exit a {color: black; text-decoration:none;}
 </style>
 <script type='text/javascript'>
 /*<![CDATA[*/
 /**
  * Entry point, run from the body's onload handler.
  *
  * Reads the parsed page HTML that the stylesheet wrote (as text) into #src,
  * picks the first definition(s) out of it with regular expressions and writes
  * the result into #word-list. If extraction throws, a "not found" message is
  * shown and the page is re-requested under an alternative title (see the
  * catch block).
  *
  * Query-string parameters read (each one must be preceded by '&'):
  *   page     - title to look up (required; the function throws without it)
  *   lang     - key of preferLang selecting a language section; any section if absent
  *   count    - number of definitions to show (default 1)
  *   showWord - absent or 'none': word hidden; 'link': class wtif2; anything else: class wtif1
  *   audio    - anything not starting with 'none' embeds a player; 'autoplay' also starts it
  *   exit     - anything not starting with 'false' adds a close box
  *   alt      - alternative title tried when the lookup fails
  *   rd       - redirect counter maintained by this function
  */
 function setup () {
  //Stuff to translate:
  var preferLang = {'en': 'English', 'fr': 'French', 'de': 'German', 'es': 'Spanish', 'it': 'Italian', 'pt': 'Portuguese', 'ja': 'Japanese', 'pl': 'Polish', 'ru': 'Russian', 'nl': 'Dutch', 'qqqAny': null}; //for now.
  var extractSeeAlso = /<div class=\"disambig-see-also(?:-2)?\">[\s\S]*?<\/div>/; //no subexpressions!
  var see_also_process = function (sa) {return sa;}
  var createLink = '» Create'; // text only.
  var not_found = "Could not retrieve definition of $1.";

  //END stuff to translate (there is one more translation block below)

  //Stuff not to translate in general (setup).

  var rd = location.search.match(/\&rd\=([^&]*)/); //is this from a redirect? Unary + converts to a number.
  rd = rd ? (+rd[1] + 1) : 1; //redirection level.
  var showWord = 0; //default to not showing. 0 = none, 1 = bold, 2 = bold link.
  var showWordRaw = location.search.match(/\&showWord\=([^&]*)/);
  showWordRaw = showWordRaw ? showWordRaw[1] : 'none';
  if (showWordRaw !== "none") {
   showWord++;
  }
  if (showWordRaw === "link") {
   showWord++;
  }
  var useAudio = 0; //0 = no player, 1 = player, 2 = player with autoplay.
  if (location.search.match(/\&audio\=(?!none)/)) useAudio++;
  if (location.search.match(/\&audio\=autoplay/)) useAudio++;
  var closeLink = false;
  if (location.search.match(/&exit\=(?!false)/)) closeLink = true;
  var numbDfn = location.search.match(/\&count\=([^&]*)/);
  //Explicit radix so the value is always read as decimal; a non-numeric
  //count falls back to the default of 1 instead of yielding an empty list.
  numbDfn = numbDfn ? parseInt(numbDfn[1], 10) : 1;
  if (isNaN(numbDfn)) numbDfn = 1;
  var pageURL = '/w/index.php?title=' + encodeURIComponent(decodeURIComponent(location.search.match(/\&page\=([^&]*)/)[1]));
  var src = document.getElementById('src');
  var display = document.getElementById('word-list');
  var loc = location.search.match(/\&page\=([^&]*)/)[1]; //still URL-encoded, not HTML-escaped
  //Escape every occurrence (global flag), '&' first so the entities produced
  //for '<' and '>' are not escaped a second time.
  var escWord = decodeURIComponent(loc).replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
  //note: escWord does not escape quotes. DO NOT PUT AS ATTRIBUTE VALUE
  var preferLangCode = location.search.match(/\&lang\=([^&]*)/);
  if (preferLangCode) { preferLangCode = preferLangCode[1]; }
  else {preferLangCode = 'qqqAny';}
  src.normalize(); //merge adjacent text nodes so firstChild holds the whole text
  var html = src.firstChild.data;
  var def = html //may be redefined later.

  //stuff you might need to translate, but hopefully won't
  var subSectRegex = new RegExp('<h2>[^<]*?(<span[^<]*?<a[^<]*?<\/a[^<]*?</span[^<]*?)?<span class="mw-headline" id="' + preferLang[preferLangCode] + '"[^>]*>[\\s\\S]*$');
  var extractCurLangName = /<span class="mw-headline" id[^>]*>([\s\S]*?)<\/span>/; //first subexpression
  //End stuff you hopefully won't need to translate.

  var audioCreditLink; //declared outside the try because it is read again after it

  try {
   //this assumes attribute order doesn't change!!!
   html = html.replace(/<div id="toctitle">[\s\S]*?<\/div>/, '');
   if (preferLangCode && preferLang[preferLangCode]) {
    try {
     //strip off all definitions before the target language.
     var subSect = html.match(subSectRegex)[0];
     if (subSect.match(/<ol>[\s\S]*?<li>/)) {
      //only use the section if it has a definition list
      def = subSect;
     }
    } catch (e) { /* preferred language section not found: keep the whole page */ }
   }
   var lang = def.match(extractCurLangName)[1];
   var intro = "(" + lang + ") ";

   //Start testing to see if we can play audio.

   var audioLink = html.match(/\<a\s*\S*?\s*href\=\"?(http:\/\/upload.wikimedia.org\/\S*.(?:oga|ogg))\"?[^>]*>/);
   if (!audioLink) audioLink = html.match(/\<button[\s\S]*?\sonclick=\"[\s\S]*?&quot;videoUrl&quot;:\s*&quot;(http:\/\/upload.wikimedia.org\/\S*?)&quot;[^"]*?&quot;isVideo&quot;:\sfalse[^"]*?".*?>/);
   if (audioLink) {
    //last path segment of the audio URL is the file name
    audioCreditLink = audioLink[1].match(/\/([^\/]*)$/);
    if (audioCreditLink) audioCreditLink = 'File:' + audioCreditLink[1];
    if (useAudio > 0) {
     var supportsHTML5Audio = (typeof Audio !== 'undefined');
     var audioAutoplay = (useAudio === 2) ? "autoplay" : "";
     audioLink = encodeURI(decodeURI(audioLink[1])); //should be already encoded, but better safe.
     //One expression: a ';' in the middle would leave the start tag unclosed
     //and fuse the autoplay attribute with the fallback markup below.
     intro += ' <audio style="float:right;" controls src="' + audioLink + '" ' + audioAutoplay + ' >';

     //the Cortado stuff is inspired (stolen) from oggHandler.
     //the object tag interferes with the audio tag on firefox for autoplay;
     //specifically it interprets autoplay differently than mplayer does.
     if (!supportsHTML5Audio) {
      intro += '<object align="right" type="application/ogg"  height="18" width="180" data="' + audioLink + '" autoplay="' + (audioAutoplay === 'autoplay') + '" >';
     }
     intro += '<applet code="com.fluendo.player.Cortado.class" ' +
       '      width=180' +
       '      height=20' +
       '      archive="http://upload.wikimedia.org/jars/cortado.jar">' +
       '  <param name="url"  value="' +  audioLink + '" />' +
       '  <param name="seekable"  value="true"/>' +
       '  <param name="autoPlay" value="' + (audioAutoplay === 'autoplay') + '"/>' +
       '  <param name="showStatus"  value="show"/>' +
       '  <param name="showSpeaker" value="false"/>' +
       '  <param name="statusHeight"  value="18"/>' +
       '<p style="display:none"></p></applet>';
     if (!supportsHTML5Audio) {
      intro += '</object>';
     }
     intro += '</audio>';
    }
   }

   if (showWord)  intro = '<a href="' + pageURL + '" id="wordThisIsFor" class="wtif' + showWord + '" >' + escWord + "</a> " + intro ;

   //FIXME: the extraction does not properly strip nested divs. This results in image thumbnails being left behind
   //First <ol> of the section with <dl>, <div> and <ul> content removed and
   //in-page '#...' links pointed at the full page. Throws (and so reaches the
   //catch below) when there is no <ol>.
   var firstList = def.match(/<ol>[\s\S]*?<\/ol>/)[0].replace(/<dl>[\s\S]*?<\/dl>/g, '').replace(/<div[^>]*>[\s\S]*?<\/div>/g, '').replace(/<\/div>/g, '').replace(/<ul>[\s\S]*?<\/ul>/g, '').replace(/<a href="(#[^"]*)">/g, '<a href="' + pageURL + '$1">');
   var definitions_matched;
   if (numbDfn === 1) {
    //non-global match: [1] is the content of the first <li>
    definitions_matched = firstList.match(/<li>([\s\S]*?)<\/li>/);
    display.innerHTML = intro +  definitions_matched[1];
   } else {
    //this use not well supported...
    //global match: every entry is a complete <li>...</li> string
    definitions_matched = firstList.match(/<li>([\s\S]*?)<\/li>/g);
    var tmp = intro + ' <ul>';
    for (var i = 0; i < numbDfn && i < definitions_matched.length; i++) {
     tmp += definitions_matched[i];
    }
    display.innerHTML = tmp + '</ul>';
   }
  }
  catch (e) {
   //page does not exist, not well formed, these regexes failed, etc

   //note, this is appending a text node, thus it is ok that loc is not escaped.
   display.appendChild(document.createTextNode(not_found.replace("$1", decodeURIComponent(loc))));
   document.getElementById('more-link').firstChild.data = createLink;
   if (rd < 9) { //arbitrary limit to prevent infinite redirect loops
    var newLoc; //this should not be URL-encoded.
    var remAlt = false;
    var dLoc = decodeURIComponent(loc);

    //first try: lower-case the first character
    newLoc = dLoc.charAt(0).toLowerCase() + dLoc.substring(1);
    //try some other redirections.

    if (newLoc === dLoc) newLoc = dLoc.toLowerCase();
    if (newLoc === dLoc && location.search.match(/\&alt\=([^&]*)/)) {
     newLoc = decodeURIComponent(location.search.match(/\&alt\=([^&]*)/)[1]);
     remAlt = true;
    }

    if (newLoc !== dLoc) { //redir
     var newURL = location.href.replace(/(^[\s\S]*?\&page\=)[^&]*([\s\S]*$)/, '$1'+ encodeURIComponent(newLoc) + '$2');
     newURL = newURL.replace(/&rd\=[^&]*/, ''); //strip old redirect counter.
     if (remAlt) {
      //replace() returns a new string; assign it so the used-up alt
      //parameter really is removed from the URL we redirect to.
      newURL = newURL.replace(/&alt\=[^&]*/, '');
     }
     location = newURL + '&rd=' + rd;
    }
   }
  }
  var sa = html.match(extractSeeAlso);
  if(sa) {
   document.getElementById('see-also').innerHTML = ' (' + see_also_process(sa) + ')' ;
  }
  if (closeLink) {
   var container = document.getElementById('container');
   container.innerHTML = '<div class="exit"><a href="about:blank" title="Hide" target="_self">X</a></div>'  + container.innerHTML;
  }
  document.getElementById('more-link').href= pageURL;
  if (audioCreditLink) {
   document.getElementById('audio-separator').style.display = 'inline';
   var audioA = document.getElementById('audio-link');
   audioA.href = '/w/index.php?title=' + encodeURIComponent(decodeURIComponent(audioCreditLink));
   audioA.style.display = 'inline';
  }

 }

 /*]]>*/
 </script>
</head>
 <!-- Page skeleton that setup() fills in. Whitespace-only text between the
      elements below is stripped from the stylesheet, so the line breaks and
      indentation here do not reach the output. -->
 <body onload="setup()">
  <div id="container">
   <!-- Receives the extracted definition; holds the API error message, if any. -->
   <div id="word-list">
    <xsl:apply-templates select="api/error"/>
   </div>
   <div>
    <!-- The script sets href on this link and may replace its label. -->
    <a id="more-link">
     <xsl:value-of select="$more"/>
    </a>
    <!-- Separator and credit link stay hidden until the script finds an audio file. -->
    <span id="audio-separator" style="display:none;">
     <xsl:value-of select="$audio-separator"/>
    </span>
    <a id="audio-link" style="display:none;">
     <xsl:value-of select="$audio"/>
    </a>
    <span id="see-also"></span>
    <small id="copyright-notice">
     <xsl:copy-of select="$copyright"/>
    </small>
   </div>
  </div>
  <!-- Hidden carrier for the parsed page text; the script reads it as a text node. -->
  <div id="src" style="display:None">
   <xsl:value-of select="api/parse/text"/>
  </div>
 </body>
 </html>

 </xsl:template>
 <!-- Renders an API error element: the bold $error prefix followed by the
      element's info attribute, wrapped in span#error. -->
 <xsl:template match="api/error">
  <span id="error">
   <b>
    <xsl:value-of select="$error"/>
   </b>
   <xsl:value-of select="@info"/>
  </span>
 </xsl:template>

 </xsl:stylesheet>