User:Erutuon/scripts/scriptRecognition.js

Note: You may have to bypass your browser’s cache to see the changes. In addition, after saving a sitewide CSS file such as MediaWiki:Common.css, it will take 5-10 minutes before the changes take effect, even if you clear your cache.

  • Mozilla / Firefox / Safari: hold Shift while clicking Reload, or press either Ctrl-F5 or Ctrl-R (Command-R on a Macintosh);
  • Konqueror and Chrome: click Reload or press F5;
  • Opera: clear the cache in Tools → Preferences;
  • Internet Explorer: hold Ctrl while clicking Refresh, or press Ctrl-F5.

/*
	Two functions: a codepoint-to-Wiktionary-script-code function like
	char_to_script in [[Module:Unicode data]], and a string-to-script-code
	function. Language-agnostic.
	
	Originally from [[User:Erutuon/scripts/watchlistScriptTagging.js]] and
	[[User:Erutuon/scripts/scriptTitles.js]].
	
	Requires ECMAScript 2016 (ES7) because it uses Array.prototype.includes.
*/

/* jshint esversion: 6 */
/* globals mw */

(function scriptRecognitionIIFE() {
'use strict';

// The following data is from [[Module:Unicode data/scripts]] and ultimately
// based on [[Module:scripts/data]].
//
// scriptRanges[n] holds the ranges for codepoints whose value shifted right by
// 12 bits equals n (that is, 0x1000 codepoints per group), so the position of
// each inner array matters: empty arrays are placeholders and must be kept.
// Each range is [first codepoint, last codepoint, script code], both ends
// inclusive. Within a group the ranges must be in ascending order, must not
// overlap, and must have first <= last, because they are searched with a
// short-circuiting linear search or a binary search.
const scriptRanges = [
	[
		[0x41, 0x5A, 'Latn'],
		[0x61, 0x7A, 'Latn'],
		[0xC0, 0xD6, 'Latn'],
		[0xD8, 0xF6, 'Latn'],
		[0xF8, 0x24F, 'Latn'],
		[0x370, 0x3E1, 'Grek'],
		[0x3E2, 0x3EF, 'Copt'],
		[0x3F0, 0x3FF, 'Grek'],
		[0x400, 0x45F, 'Cyrl'],
		[0x464, 0x469, 'Cyrs'],
		[0x46A, 0x46D, 'Cyrl'],
		[0x46F, 0x471, 'Cyrs'],
		[0x472, 0x475, 'Cyrl'],
		[0x476, 0x489, 'Cyrs'],
		[0x48A, 0x527, 'Cyrl'],
		[0x531, 0x58F, 'Armn'],
		[0x590, 0x5FF, 'Hebr'],
		[0x600, 0x6FF, 'Arab'],
		[0x700, 0x74F, 'Syrc'],
		[0x750, 0x77F, 'Arab'],
		[0x780, 0x7B1, 'Thaa'],
		[0x7C0, 0x7FA, 'Nkoo'],
		[0x800, 0x83E, 'Samr'],
		[0x840, 0x85E, 'Mand'],
		[0x860, 0x86A, 'Syrc'],
		[0x8A0, 0x8FF, 'Arab'],
		[0x900, 0x97F, 'Deva'],
		[0x981, 0x9FA, 'Beng'],
		[0xA01, 0xA75, 'Guru'],
		[0xA81, 0xAF1, 'Gujr'],
		[0xB01, 0xB77, 'Orya'],
		[0xB82, 0xBFA, 'Taml'],
		[0xC01, 0xC7F, 'Telu'],
		[0xC82, 0xCF2, 'Knda'],
		[0xD02, 0xD7F, 'Mlym'],
		[0xD82, 0xDF4, 'Sinh'],
		[0xE01, 0xE5B, 'Thai'],
		[0xE81, 0xEDF, 'Laoo'],
		[0xF00, 0xFDA, 'Tibt']
	],
	[
		[0x1000, 0x109F, 'Mymr'],
		[0x10A0, 0x10CD, 'Geok'], // Asomtavruli
		[0x10D0, 0x10FC, 'Geor'], // Mkhedruli
		[0x1100, 0x11FF, 'Hang'],
		[0x1200, 0x1399, 'Ethi'],
		[0x13A0, 0x13F4, 'Cher'],
		[0x1400, 0x167F, 'Cans'],
		[0x1680, 0x169C, 'Ogam'],
		[0x16A0, 0x16F0, 'Runr'],
		[0x1700, 0x1714, 'Tglg'],
		[0x1720, 0x1734, 'Hano'],
		[0x1740, 0x1753, 'Buhd'],
		[0x1760, 0x1773, 'Tagb'],
		[0x1780, 0x17F9, 'Khmr'],
		[0x1800, 0x18AA, 'Mong'],
		[0x1900, 0x194F, 'Limb'],
		[0x1950, 0x1974, 'Tale'],
		[0x1980, 0x19DF, 'Talu'],
		[0x19E0, 0x19FF, 'Khmr'],
		[0x1A00, 0x1A1F, 'Bugi'],
		[0x1A20, 0x1AAD, 'Lana'],
		[0x1B00, 0x1B7C, 'Bali'],
		[0x1B80, 0x1BBF, 'Sund'],
		[0x1BC0, 0x1BFF, 'Batk'],
		[0x1C00, 0x1C4F, 'Lepc'],
		[0x1C50, 0x1C7F, 'Olck'],
		[0x1E00, 0x1EFF, 'Latn'],
		[0x1F00, 0x1FFE, 'polytonic']
	],
	[
		[0x2200, 0x22FF, 'Zmth'],
		[0x2300, 0x23F3, 'Zsym'],
		[0x2500, 0x27BF, 'Zsym'],
		[0x27C0, 0x27EF, 'Zmth'],
		[0x2800, 0x28FF, 'Brai'],
		[0x2980, 0x29FF, 'Zmth'],
		[0x2A00, 0x2AFF, 'Zmth'],
		[0x2C00, 0x2C5E, 'Glag'],
		[0x2C60, 0x2C7F, 'Latinx'],
		[0x2C80, 0x2CFF, 'Copt'],
		[0x2D00, 0x2D2D, 'Geok'], // Nuskhuri
		[0x2D30, 0x2D7F, 'Tfng'],
		[0x2D80, 0x2DDE, 'Ethi'],
		[0x2E80, 0x2FDF, 'Hani']
	],
	[
		[0x3000, 0x303F, 'Hani'],
		[0x3041, 0x309F, 'Hira'],
		[0x30A0, 0x30FF, 'Kana'],
		[0x3105, 0x312D, 'Bopo'],
		[0x3131, 0x318E, 'Hang'],
		[0x31A0, 0x31BA, 'Bopo'],
		[0x31C0, 0x31E3, 'Hani'],
		[0x31F0, 0x31FF, 'Kana'],
		[0x3300, 0x3357, 'Kana'],
		[0x337B, 0x337F, 'Hani'],
		[0x3400, 0x3FFF, 'Hani']
	],
	[
		[0x4000, 0x4DB5, 'Hani'],
		[0x4E00, 0x4FFF, 'Hani']
	],
	[
		[0x5000, 0x5FFF, 'Hani']
	],
	[
		[0x6000, 0x6FFF, 'Hani']
	],
	[
		[0x7000, 0x7FFF, 'Hani']
	],
	[
		[0x8000, 0x8FFF, 'Hani']
	],
	[
		[0x9000, 0x9FFF, 'Hani']
	],
	[
		[0xA000, 0xA4C6, 'Yiii'],
		[0xA4D0, 0xA4FF, 'Lisu'],
		[0xA500, 0xA62B, 'Vaii'],
		[0xA640, 0xA67F, 'Cyrs'],
		[0xA680, 0xA697, 'Cyrl'],
		[0xA6A0, 0xA6F7, 'Bamu'],
		[0xA720, 0xA7FF, 'Latinx'],
		[0xA800, 0xA82B, 'Sylo'],
		[0xA840, 0xA877, 'Phag'],
		[0xA880, 0xA8D9, 'Saur'],
		[0xA8E0, 0xA8FB, 'Deva'],
		[0xA900, 0xA92F, 'Kali'],
		[0xA930, 0xA95F, 'Rjng'],
		[0xA980, 0xA9DF, 'Java'],
		[0xA9E0, 0xA9FE, 'Mymr'],
		[0xAA00, 0xAA5F, 'Cham'],
		[0xAA60, 0xAA7F, 'Mymr'],
		[0xAA80, 0xAADF, 'Tavt'],
		[0xAAE0, 0xAAFF, 'Mtei'],
		[0xAB01, 0xAB2E, 'Ethi'],
		[0xAB30, 0xAB65, 'Latinx'],
		[0xAB70, 0xABBF, 'Cher'],
		[0xABC0, 0xABFF, 'Mtei'],
		[0xAC00, 0xAFFF, 'Hang']
	],
	[
		[0xB000, 0xBFFF, 'Hang']
	],
	[
		[0xC000, 0xCFFF, 'Hang']
	],
	[
		[0xD000, 0xD7A3, 'Hang']
	],
	[
		// no data for 0xE000-0xEFFF
	],
	[
		[0xFB13, 0xFB17, 'Armn'],
		[0xFB1D, 0xFB4F, 'Hebr'],
		[0xFB50, 0xFDFD, 'Arab'],
		[0xFE70, 0xFEFC, 'Arab']
	],
	[
		[0x10000, 0x100FA, 'Linb'],
		[0x10280, 0x1029C, 'Lyci'],
		[0x102A0, 0x102D0, 'Cari'],
		[0x102E1, 0x102FB, 'Copt'],
		[0x10300, 0x10323, 'Ital'],
		[0x10330, 0x1034A, 'Goth'],
		[0x10350, 0x1037A, 'Perm'],
		[0x10380, 0x1039F, 'Ugar'],
		[0x103A0, 0x103D5, 'Xpeo'],
		[0x10400, 0x1044F, 'Dsrt'],
		[0x10450, 0x1047F, 'Shaw'],
		[0x10480, 0x104A9, 'Osma'],
		[0x104B0, 0x104FB, 'Osge'],
		[0x10500, 0x10527, 'Elba'],
		[0x10530, 0x10563, 'Aghb'],
		[0x10600, 0x10767, 'Lina'],
		[0x10800, 0x1083F, 'Cprt'],
		[0x10840, 0x1085F, 'Armi'],
		[0x10860, 0x1087F, 'Palm'],
		[0x10880, 0x108AF, 'Nbat'],
		[0x108E0, 0x108FF, 'Hatr'],
		[0x10900, 0x1091F, 'Phnx'],
		[0x10920, 0x1093F, 'Lydi'],
		[0x10980, 0x1099F, 'Mero'],
		[0x109A0, 0x109BF, 'Merc'],
		[0x10A00, 0x10A58, 'Khar'],
		[0x10A60, 0x10A7F, 'Sarb'],
		[0x10A80, 0x10A9F, 'Narb'],
		[0x10AC0, 0x10AF6, 'Mani'],
		[0x10B00, 0x10B3F, 'Avst'],
		[0x10B40, 0x10B5F, 'Prti'],
		[0x10B60, 0x10B7F, 'Phli'],
		[0x10B80, 0x10BAF, 'Phlp'],
		[0x10C00, 0x10C48, 'Orkh'],
		[0x10C80, 0x10CB2, 'Hung'],
		[0x10E60, 0x10E7E, 'Ruminumerals']
	],
	[
		[0x11000, 0x1106F, 'Brah'],
		[0x11080, 0x110C1, 'Kthi'],
		[0x110D0, 0x110F9, 'Sora'],
		[0x11100, 0x11143, 'Cakm'],
		[0x11150, 0x11176, 'Mahj'],
		[0x11180, 0x111D9, 'Shrd'],
		[0x11200, 0x1123D, 'Khoj'],
		[0x11280, 0x112A9, 'Mult'],
		[0x112B0, 0x112F9, 'Sind'],
		[0x11301, 0x11374, 'Gran'],
		[0x11400, 0x1145D, 'Newa'],
		[0x11480, 0x114D9, 'Tirh'],
		[0x11580, 0x115DD, 'Sidd'],
		[0x11600, 0x11659, 'Modi'],
		[0x11680, 0x116C9, 'Takr'],
		[0x11700, 0x1173F, 'Ahom'],
		[0x118A0, 0x118FF, 'Wara'],
		[0x11A00, 0x11A47, 'Zanb'],
		[0x11A50, 0x11AA2, 'Soyo'],
		[0x11AC0, 0x11AF8, 'Pauc'],
		[0x11C00, 0x11C6C, 'Bhks'],
		[0x11C70, 0x11CB6, 'Marc'],
		[0x11D00, 0x11D59, 'Gonm']
	],
	[
		[0x12000, 0x1236E, 'Xsux'],
		[0x12400, 0x12473, 'Xsux']
	],
	[
		[0x13000, 0x1342E, 'Egyp']
	],
	[
		[0x14400, 0x14646, 'Hluw']
	],
	[
		// no data for 0x15000-0x15FFF
	],
	[
		[0x16800, 0x16A38, 'Bamu'],
		[0x16A40, 0x16A6F, 'Mroo'],
		[0x16AD0, 0x16AF5, 'Bass'],
		[0x16B00, 0x16B8F, 'Hmng'],
		[0x16F00, 0x16F9F, 'Plrd']
	],
	[
		[0x17000, 0x17FFF, 'Tang']
	],
	[
		[0x18000, 0x187EC, 'Tang'],
		[0x18800, 0x18AF2, 'Tang']
	],
	[
		// no data for 0x19000-0x19FFF
	],
	[
		// no data for 0x1A000-0x1AFFF
	],
	[
		[0x1B002, 0x1B11E, 'Hira'], // no unique code for hentaigana on Wiktionary
		[0x1B170, 0x1B2FB, 'Nshu'],
		[0x1BC00, 0x1BC9F, 'Dupl']
	],
	[
		// no data for 0x1C000-0x1CFFF
	],
	[
		[0x1D100, 0x1D1DD, 'musical'],
		[0x1D400, 0x1D7FF, 'Zmth'],
		[0x1D800, 0x1DAAF, 'Sgnw']
	],
	[
		[0x1E000, 0x1E02A, 'Glag'],
		[0x1E800, 0x1E8D6, 'Mend'],
		[0x1E900, 0x1E95F, 'Adlm']
	],
	[
		[0x1F300, 0x1F6C5, 'Zsym']
	],
	[
		[0x20000, 0x20FFF, 'Hani']
	],
	[
		[0x21000, 0x21FFF, 'Hani']
	],
	[
		[0x22000, 0x22FFF, 'Hani']
	],
	[
		[0x23000, 0x23FFF, 'Hani']
	],
	[
		[0x24000, 0x24FFF, 'Hani']
	],
	[
		[0x25000, 0x25FFF, 'Hani']
	],
	[
		[0x26000, 0x26FFF, 'Hani']
	],
	[
		[0x27000, 0x27FFF, 'Hani']
	],
	[
		[0x28000, 0x28FFF, 'Hani']
	],
	[
		[0x29000, 0x29FFF, 'Hani']
	],
	[
		[0x2A000, 0x2AFFF, 'Hani']
	],
	[
		[0x2B000, 0x2BFFF, 'Hani']
	],
	[
		[0x2C000, 0x2CFFF, 'Hani']
	],
	[
		[0x2D000, 0x2DFFF, 'Hani']
	],
	[
		[0x2E000, 0x2EBE0, 'Hani']
	]
];

// Individual codepoints mapped directly to a script code. This table is
// consulted before any range data, so an entry here takes precedence over a
// range that happens to contain the same codepoint.
// NOTE(review): 0x2190 and 0x21FF are the first and last codepoints of the
// Unicode Arrows block; confirm that they are meant to be two single
// codepoints rather than the two ends of a range.
const charToScript = {};
for (const [script, codePoints] of [
	['Cyrs', [0x460, 0x461]],
	['Cyrl', [0x462, 0x463]],
	['Zmth', [0x2135]],
	['Zsym', [0x2190, 0x21FF]],
	['Hani', [
		0xFA0E, 0xFA0F, 0xFA11, 0xFA13, 0xFA14, 0xFA1F,
		0xFA21, 0xFA23, 0xFA24, 0xFA27, 0xFA28, 0xFA29
	]],
	['Aghb', [0x1056F]],
	['Tang', [0x16FE0]],
	['Kana', [0x1B000]],
	['Hira', [0x1B001]]
]) {
	for (const codePoint of codePoints)
		charToScript[codePoint] = script;
}

// Whole groups of 4096 (0x1000) codepoints that are assigned to one script.
// Each entry is [first group, last group, script code], both ends inclusive,
// where a group number is the codepoint shifted right by 12 bits. Entries
// must stay in ascending order. codePointToScript checks this table before
// scriptRanges, so for any group listed here the per-codepoint ranges in
// scriptRanges are never consulted.
const groupToScript = [
	[0x04, 0x09, 'Hani'], // codepoints 0x4000-0x9FFF
	[0x0B, 0x0D, 'Hang'], // codepoints 0xB000-0xDFFF
	[0x20, 0x2E, 'Hani']  // codepoints 0x20000-0x2EFFF
];

// Pairwise precedence between script codes that can occur in the same text.
// Each key is the script that loses and its value is the script that wins:
// for example, text containing both Grek and polytonic characters is tagged
// as polytonic.
// NOTE(review): as written, text containing both Latinx and Latn characters
// resolves to Latn; confirm that this direction is intended.
const scriptOverrulings = {
	Grek: 'polytonic',
	Cyrl: 'Cyrs',
	Latinx: 'Latn'
};

/*
	Script codes that stand for a combination of several basic scripts.

	In each row the first item is the code that is reported for the
	combination and the remaining items are the scripts that may appear
	together under that code. Rows are tried in order and the first row
	containing both scripts in question is used.

	Hani is listed as if it were a compound script purely for convenience:
	Han text sometimes includes Latn characters, and this row lets such text
	still be reported as Hani.
*/
const compoundScripts = [
	['Hani', 'Latn'],
	['Jpan', 'Hani', 'Hira', 'Kana', 'Latn'],
	['Kore', 'Hang', 'Hani', 'Latn']
];

// Debug logger. It forwards to console.log only if window.scriptRecognition
// exists and has a truthy `log` property at the time this script runs;
// otherwise it is a function that does nothing.
const loggingRequested = Boolean(
	window.scriptRecognition && window.scriptRecognition.log
);
const log = loggingRequested
	? console.log.bind(console)
	: () => {};

/*
	Scans `ranges`, an array of [lower, higher, result] triples in ascending
	order, for the first triple with lower <= codePoint <= higher and returns
	its result.

	Returns null when nothing matches, whether the scan stopped early or ran
	past the last triple, so that callers can rely on a single "not found"
	value.
*/
function linearSearch(codePoint, ranges) {
	for (const [lower, higher, result] of ranges) {
		// The triples are in ascending order, so once one starts above
		// codePoint none of the remaining ones can contain it.
		if (codePoint < lower)
			return null;
		if (codePoint <= higher)
			return result;
	}

	return null;
}

// Sort comparator for range triples: orders by the first element (lower
// bound) and, when those are equal, by the second element (upper bound).
function compareRanges(range1, range2) {
	if (range1[0] !== range2[0])
		return range1[0] - range2[0];

	return range1[1] - range2[1];
}

/*
	Binary search for the [lower, higher, result] triple in `ranges` that
	contains codePoint. `ranges` must be in ascending order. Returns the
	triple's result, or null if `ranges` is falsy or no triple matches.

	Every triple that is found is also stored in binarySearch.cache, which is
	kept sorted and is scanned linearly before `ranges` is looked at. The
	cache is a single list shared by all calls, whichever `ranges` array was
	passed, so the arrays given to this function must never contain triples
	that overlap one another.
*/
function binarySearch(codePoint, ranges) {
	if (!ranges)
		return null;

	if (binarySearch.cache) {
		const cacheResult = linearSearch(codePoint, binarySearch.cache);
		if (cacheResult)
			return cacheResult;
	} else
		binarySearch.cache = [];

	// Inclusive bounds: `top` is the index of the last candidate, so every
	// midpoint is a valid index into `ranges`.
	let bottom = 0;
	let top = ranges.length - 1;

	while (bottom <= top) {
		const middle = (bottom + top) >> 1;
		const range = ranges[middle];
		if (codePoint < range[0])
			top = middle - 1;
		else if (codePoint <= range[1]) {
			// The cache must stay sorted because it is searched with
			// linearSearch, which stops at the first triple starting above
			// the codepoint.
			binarySearch.cache.push(range);
			binarySearch.cache.sort(compareRanges);
			return range[2];
		} else
			bottom = middle + 1;
	}

	return null;
}

// Maps a codepoint (a number) to a Wiktionary script code.
// Lookup order: single codepoints in charToScript, then whole groups in
// groupToScript, then the per-group ranges in scriptRanges.
// Returns a string (a Wiktionary script code) or null; never undefined.
function codePointToScript(codePoint) {
	const singleScript = charToScript[codePoint];
	if (singleScript)
		return singleScript;

	// Each group covers 0x1000 codepoints.
	const group = codePoint >> 12;
	const groupScript = linearSearch(group, groupToScript);
	if (groupScript)
		return groupScript;

	const ranges = scriptRanges[group];
	if (ranges === undefined)
		return null;

	// Short lists are scanned directly; longer ones are binary-searched.
	const script = ranges.length > 5
		? binarySearch(codePoint, ranges)
		: linearSearch(codePoint, ranges);

	// Normalize every "not found" result to null so that callers comparing
	// against null see one consistent value.
	return script || null;
}

// Returns an array of the distinct script codes found in `string`, in order
// of first appearance. Characters for which codePointToScript returns null
// contribute nothing.
function getScripts(string) {
	const found = new Set();

	for (const character of string) {
		const script = codePointToScript(character.codePointAt(0));
		if (script === null)
			continue;
		// A Set keeps insertion order and ignores repeated values.
		found.add(script);
	}

	return [...found];
}

// Chooses one script out of an array of script codes, or finds an
// appropriate compound script.
//
// The scripts are combined from left to right. At each step the running
// winner is kept if it equals the next script or overrules it (see
// scriptOverrulings), is replaced if the next script overrules it, and
// otherwise both are replaced by the first compound script (see
// compoundScripts) that lists both. `string` is only used in log messages.
//
// Returns the resulting script code, or undefined if `scripts` has no usable
// entries or if two scripts cannot be reconciled.
function resolveScriptConflicts(scripts, string) {
	let winningScript;

	for (const script of scripts) {
		// Entries that are not script codes carry no information; skipping
		// them keeps a missing scriptOverrulings entry (undefined) from being
		// mistaken for a match.
		if (script === undefined || script === null)
			continue;

		if (winningScript === undefined) {
			winningScript = script;
			continue;
		}

		if (winningScript === script || scriptOverrulings[script] === winningScript)
			continue;

		if (scriptOverrulings[winningScript] === script) {
			log(`${script} won out over ${winningScript}`);
			winningScript = script;
			continue;
		}

		const currentWinner = winningScript;
		const compoundScript = compoundScripts.find(
			componentScripts => componentScripts.includes(currentWinner) && componentScripts.includes(script));

		if (!compoundScript) {
			// Nothing later in the list can change the outcome, so stop here
			// and report the failure once.
			log(`No script chosen out of ${scripts.join(", ")} in this string: ${string}.`);
			return undefined;
		}

		log(`${script} and ${winningScript} were replaced with ${compoundScript[0]}`);
		winningScript = compoundScript[0];
	}

	return winningScript;
}

// Returns a single script code for `text`: the scripts detected by getScripts,
// reduced to one by resolveScriptConflicts. Returns undefined if no script
// is detected.
function getScript(text) {
	const scripts = getScripts(text);

	if (scripts.length === 0)
		return undefined;

	return resolveScriptConflicts(scripts, text);
}

// Returns true if at least one character of `text` is assigned the script
// code `scriptToFind` by codePointToScript, and false otherwise.
function containsScript(text, scriptToFind) {
	// Spreading iterates the text by codepoint, like for...of does.
	return [...text].some(
		character => codePointToScript(character.codePointAt(0)) === scriptToFind);
}

// Publish the public functions as properties of window so that other scripts
// can call them.
Object.assign(window, {
	codePointToScript,
	getScripts,
	getScript,
	containsScript
});

})();