User:Erutuon/scripts/scriptRecognition.js
< User:Erutuon | scripts
Note: You may have to bypass your browser’s cache to see the changes. In addition, after saving a sitewide CSS file such as MediaWiki:Common.css, it will take 5-10 minutes before the changes take effect, even if you clear your cache.
- Mozilla / Firefox / Safari: hold Shift while clicking Reload, or press either Ctrl-F5 or Ctrl-R (Command-R on a Macintosh);
- Konqueror and Chrome: click Reload or press F5;
- Opera: clear the cache in Tools → Preferences;
- Internet Explorer: hold Ctrl while clicking Refresh, or press Ctrl-F5.
- This script lacks a documentation subpage. Please create it.
- Useful links: root page • root page’s subpages • links • redirects • your own
/*
Two functions: a codepoint-to-Wiktionary-script-code function like
char_to_script in [[Module:Unicode data]], and a string-to-script-code
function. Language-agnostic.
Originally from [[User:Erutuon/scripts/watchlistScriptTagging.js]] and
[[User:Erutuon/scripts/scriptTitles.js]].
Requires ECMAScript 2016 (ES7) because it uses Array.prototype.includes.
*/
/* jshint esversion: 6 */
/* globals mw */
(function scriptRecognitionIIFE() {
'use strict';
// Codepoint ranges and the Wiktionary script code assigned to each.
// The data is from [[Module:Unicode data/scripts]] and ultimately based on
// [[Module:scripts/data]].
//
// The outer array is indexed by group number (codePoint >> 12), so every
// inner array covers one block of 0x1000 codepoints. Each entry is
// [first codepoint, last codepoint, script code], both bounds inclusive.
// Within a group the entries must be sorted and must not overlap, because
// they are looked up with the early-exit linear search and the binary search
// defined further down in this file.
//
// codePointToScript consults groupToScript before this table, so the groups
// listed there are never actually read from here.
const scriptRanges = [
    [ // 0x0000-0x0FFF
        [0x41, 0x5A, 'Latn'],
        [0x61, 0x7A, 'Latn'],
        [0xC0, 0xD6, 'Latn'],
        [0xD8, 0xF6, 'Latn'],
        [0xF8, 0x24F, 'Latn'],
        [0x370, 0x3E1, 'Grek'],
        [0x3E2, 0x3EF, 'Copt'],
        [0x3F0, 0x3FF, 'Grek'],
        [0x400, 0x45F, 'Cyrl'],
        [0x464, 0x469, 'Cyrs'],
        [0x46A, 0x46D, 'Cyrl'],
        [0x46F, 0x471, 'Cyrs'],
        [0x472, 0x475, 'Cyrl'],
        [0x476, 0x489, 'Cyrs'],
        [0x48A, 0x527, 'Cyrl'],
        [0x531, 0x58F, 'Armn'],
        [0x590, 0x5FF, 'Hebr'],
        [0x600, 0x6FF, 'Arab'],
        [0x700, 0x74F, 'Syrc'],
        [0x750, 0x77F, 'Arab'],
        [0x780, 0x7B1, 'Thaa'],
        [0x7C0, 0x7FA, 'Nkoo'],
        [0x800, 0x83E, 'Samr'],
        [0x840, 0x85E, 'Mand'],
        [0x860, 0x86A, 'Syrc'],
        [0x8A0, 0x8FF, 'Arab'],
        [0x900, 0x97F, 'Deva'],
        [0x981, 0x9FA, 'Beng'],
        [0xA01, 0xA75, 'Guru'],
        [0xA81, 0xAF1, 'Gujr'],
        [0xB01, 0xB77, 'Orya'],
        [0xB82, 0xBFA, 'Taml'],
        [0xC01, 0xC7F, 'Telu'],
        [0xC82, 0xCF2, 'Knda'],
        [0xD02, 0xD7F, 'Mlym'],
        [0xD82, 0xDF4, 'Sinh'],
        [0xE01, 0xE5B, 'Thai'],
        [0xE81, 0xEDF, 'Laoo'],
        [0xF00, 0xFDA, 'Tibt']
    ],
    [ // 0x1000-0x1FFF
        [0x1000, 0x109F, 'Mymr'],
        [0x10A0, 0x10CD, 'Geok'], // Asomtavruli
        [0x10D0, 0x10FC, 'Geor'], // Mkhedruli
        [0x1100, 0x11FF, 'Hang'],
        [0x1200, 0x1399, 'Ethi'],
        [0x13A0, 0x13F4, 'Cher'],
        [0x1400, 0x167F, 'Cans'],
        [0x1680, 0x169C, 'Ogam'],
        [0x16A0, 0x16F0, 'Runr'],
        [0x1700, 0x1714, 'Tglg'],
        [0x1720, 0x1734, 'Hano'],
        [0x1740, 0x1753, 'Buhd'],
        [0x1760, 0x1773, 'Tagb'],
        [0x1780, 0x17F9, 'Khmr'],
        [0x1800, 0x18AA, 'Mong'],
        [0x1900, 0x194F, 'Limb'],
        [0x1950, 0x1974, 'Tale'],
        [0x1980, 0x19DF, 'Talu'],
        [0x19E0, 0x19FF, 'Khmr'],
        [0x1A00, 0x1A1F, 'Bugi'],
        [0x1A20, 0x1AAD, 'Lana'],
        [0x1B00, 0x1B7C, 'Bali'],
        [0x1B80, 0x1BBF, 'Sund'],
        [0x1BC0, 0x1BFF, 'Batk'],
        [0x1C00, 0x1C4F, 'Lepc'],
        [0x1C50, 0x1C7F, 'Olck'],
        [0x1E00, 0x1EFF, 'Latn'],
        [0x1F00, 0x1FFE, 'polytonic']
    ],
    [ // 0x2000-0x2FFF
        [0x2200, 0x22FF, 'Zmth'],
        [0x2300, 0x23F3, 'Zsym'],
        [0x2500, 0x27BF, 'Zsym'],
        [0x27C0, 0x27EF, 'Zmth'],
        [0x2800, 0x28FF, 'Brai'],
        [0x2980, 0x29FF, 'Zmth'],
        [0x2A00, 0x2AFF, 'Zmth'],
        [0x2C00, 0x2C5E, 'Glag'],
        [0x2C60, 0x2C7F, 'Latinx'],
        [0x2C80, 0x2CFF, 'Copt'],
        [0x2D00, 0x2D2D, 'Geok'], // Nuskhuri
        [0x2D30, 0x2D7F, 'Tfng'],
        [0x2D80, 0x2DDE, 'Ethi'],
        [0x2E80, 0x2FDF, 'Hani']
    ],
    [ // 0x3000-0x3FFF
        [0x3000, 0x303F, 'Hani'],
        [0x3041, 0x309F, 'Hira'],
        [0x30A0, 0x30FF, 'Kana'],
        [0x3105, 0x312D, 'Bopo'],
        [0x3131, 0x318E, 'Hang'],
        [0x31A0, 0x31BA, 'Bopo'],
        [0x31C0, 0x31E3, 'Hani'],
        [0x31F0, 0x31FF, 'Kana'],
        [0x3300, 0x3357, 'Kana'],
        [0x337B, 0x337F, 'Hani'],
        [0x3400, 0x3FFF, 'Hani']
    ],
    [ // 0x4000-0x4FFF
        [0x4000, 0x4DB5, 'Hani'],
        [0x4E00, 0x4FFF, 'Hani']
    ],
    [ // 0x5000-0x5FFF
        [0x5000, 0x5FFF, 'Hani']
    ],
    [ // 0x6000-0x6FFF
        [0x6000, 0x6FFF, 'Hani']
    ],
    [ // 0x7000-0x7FFF
        [0x7000, 0x7FFF, 'Hani']
    ],
    [ // 0x8000-0x8FFF
        [0x8000, 0x8FFF, 'Hani']
    ],
    [ // 0x9000-0x9FFF
        [0x9000, 0x9FFF, 'Hani']
    ],
    [ // 0xA000-0xAFFF
        [0xA000, 0xA4C6, 'Yiii'],
        [0xA4D0, 0xA4FF, 'Lisu'],
        [0xA500, 0xA62B, 'Vaii'],
        // Must stop before 0xA680: entries in a group may not overlap, or the
        // Cyrl entry below can never be matched.
        [0xA640, 0xA67F, 'Cyrs'],
        [0xA680, 0xA697, 'Cyrl'],
        [0xA6A0, 0xA6F7, 'Bamu'],
        [0xA720, 0xA7FF, 'Latinx'],
        [0xA800, 0xA82B, 'Sylo'],
        [0xA840, 0xA877, 'Phag'],
        [0xA880, 0xA8D9, 'Saur'],
        [0xA8E0, 0xA8FB, 'Deva'],
        [0xA900, 0xA92F, 'Kali'],
        [0xA930, 0xA95F, 'Rjng'],
        [0xA980, 0xA9DF, 'Java'],
        [0xA9E0, 0xA9FE, 'Mymr'],
        [0xAA00, 0xAA5F, 'Cham'],
        [0xAA60, 0xAA7F, 'Mymr'],
        [0xAA80, 0xAADF, 'Tavt'],
        [0xAAE0, 0xAAFF, 'Mtei'],
        [0xAB01, 0xAB2E, 'Ethi'],
        [0xAB30, 0xAB65, 'Latinx'],
        [0xAB70, 0xABBF, 'Cher'],
        [0xABC0, 0xABFF, 'Mtei'],
        [0xAC00, 0xAFFF, 'Hang']
    ],
    [ // 0xB000-0xBFFF
        [0xB000, 0xBFFF, 'Hang']
    ],
    [ // 0xC000-0xCFFF
        [0xC000, 0xCFFF, 'Hang']
    ],
    [ // 0xD000-0xDFFF
        [0xD000, 0xD7A3, 'Hang']
    ],
    [
        // no data for 0xE000-0xEFFF
    ],
    [ // 0xF000-0xFFFF
        [0xFB13, 0xFB17, 'Armn'],
        [0xFB1D, 0xFB4F, 'Hebr'],
        [0xFB50, 0xFDFD, 'Arab'],
        [0xFE70, 0xFEFC, 'Arab']
    ],
    [ // 0x10000-0x10FFF
        [0x10000, 0x100FA, 'Linb'],
        [0x10280, 0x1029C, 'Lyci'],
        [0x102A0, 0x102D0, 'Cari'],
        [0x102E1, 0x102FB, 'Copt'],
        [0x10300, 0x10323, 'Ital'],
        [0x10330, 0x1034A, 'Goth'],
        [0x10350, 0x1037A, 'Perm'],
        [0x10380, 0x1039F, 'Ugar'],
        [0x103A0, 0x103D5, 'Xpeo'],
        [0x10400, 0x1044F, 'Dsrt'],
        [0x10450, 0x1047F, 'Shaw'],
        [0x10480, 0x104A9, 'Osma'],
        [0x104B0, 0x104FB, 'Osge'],
        [0x10500, 0x10527, 'Elba'],
        [0x10530, 0x10563, 'Aghb'],
        [0x10600, 0x10767, 'Lina'],
        [0x10800, 0x1083F, 'Cprt'],
        [0x10840, 0x1085F, 'Armi'],
        [0x10860, 0x1087F, 'Palm'],
        [0x10880, 0x108AF, 'Nbat'],
        [0x108E0, 0x108FF, 'Hatr'],
        [0x10900, 0x1091F, 'Phnx'],
        [0x10920, 0x1093F, 'Lydi'],
        [0x10980, 0x1099F, 'Mero'],
        [0x109A0, 0x109BF, 'Merc'],
        [0x10A00, 0x10A58, 'Khar'],
        [0x10A60, 0x10A7F, 'Sarb'],
        [0x10A80, 0x10A9F, 'Narb'],
        [0x10AC0, 0x10AF6, 'Mani'],
        [0x10B00, 0x10B3F, 'Avst'],
        [0x10B40, 0x10B5F, 'Prti'],
        [0x10B60, 0x10B7F, 'Phli'],
        [0x10B80, 0x10BAF, 'Phlp'],
        [0x10C00, 0x10C48, 'Orkh'],
        [0x10C80, 0x10CB2, 'Hung'],
        [0x10E60, 0x10E7E, 'Ruminumerals']
    ],
    [ // 0x11000-0x11FFF
        [0x11000, 0x1106F, 'Brah'],
        [0x11080, 0x110C1, 'Kthi'],
        [0x110D0, 0x110F9, 'Sora'],
        [0x11100, 0x11143, 'Cakm'],
        [0x11150, 0x11176, 'Mahj'],
        [0x11180, 0x111D9, 'Shrd'],
        [0x11200, 0x1123D, 'Khoj'],
        [0x11280, 0x112A9, 'Mult'],
        [0x112B0, 0x112F9, 'Sind'],
        [0x11301, 0x11374, 'Gran'],
        [0x11400, 0x1145D, 'Newa'],
        [0x11480, 0x114D9, 'Tirh'],
        [0x11580, 0x115DD, 'Sidd'],
        [0x11600, 0x11659, 'Modi'],
        [0x11680, 0x116C9, 'Takr'],
        [0x11700, 0x1173F, 'Ahom'],
        [0x118A0, 0x118FF, 'Wara'],
        [0x11A00, 0x11A47, 'Zanb'],
        [0x11A50, 0x11AA2, 'Soyo'],
        [0x11AC0, 0x11AF8, 'Pauc'],
        [0x11C00, 0x11C6C, 'Bhks'],
        [0x11C70, 0x11CB6, 'Marc'],
        [0x11D00, 0x11D59, 'Gonm']
    ],
    [ // 0x12000-0x12FFF
        [0x12000, 0x1236E, 'Xsux'],
        [0x12400, 0x12473, 'Xsux']
    ],
    [ // 0x13000-0x13FFF
        [0x13000, 0x1342E, 'Egyp']
    ],
    [ // 0x14000-0x14FFF
        [0x14400, 0x14646, 'Hluw']
    ],
    [
        // no data for 0x15000-0x15FFF
    ],
    [ // 0x16000-0x16FFF
        [0x16800, 0x16A38, 'Bamu'],
        [0x16A40, 0x16A6F, 'Mroo'],
        [0x16AD0, 0x16AF5, 'Bass'],
        [0x16B00, 0x16B8F, 'Hmng'],
        [0x16F00, 0x16F9F, 'Plrd']
    ],
    [ // 0x17000-0x17FFF
        [0x17000, 0x17FFF, 'Tang']
    ],
    [ // 0x18000-0x18FFF
        [0x18000, 0x187EC, 'Tang'],
        [0x18800, 0x18AF2, 'Tang']
    ],
    [
        // no data for 0x19000-0x19FFF
    ],
    [
        // no data for 0x1A000-0x1AFFF
    ],
    [ // 0x1B000-0x1BFFF
        [0x1B002, 0x1B11E, 'Hira'], // no unique code for hentaigana on Wiktionary
        [0x1B170, 0x1B2FB, 'Nshu'],
        [0x1BC00, 0x1BC9F, 'Dupl']
    ],
    [
        // no data for 0x1C000-0x1CFFF
    ],
    [ // 0x1D000-0x1DFFF
        [0x1D100, 0x1D1DD, 'musical'],
        [0x1D400, 0x1D7FF, 'Zmth'],
        [0x1D800, 0x1DAAF, 'Sgnw']
    ],
    [ // 0x1E000-0x1EFFF
        [0x1E000, 0x1E02A, 'Glag'],
        [0x1E800, 0x1E8D6, 'Mend'],
        [0x1E900, 0x1E95F, 'Adlm']
    ],
    [ // 0x1F000-0x1FFFF
        [0x1F300, 0x1F6C5, 'Zsym']
    ],
    [ // 0x20000-0x20FFF
        [0x20000, 0x20FFF, 'Hani']
    ],
    [ // 0x21000-0x21FFF
        [0x21000, 0x21FFF, 'Hani']
    ],
    [ // 0x22000-0x22FFF
        [0x22000, 0x22FFF, 'Hani']
    ],
    [ // 0x23000-0x23FFF
        [0x23000, 0x23FFF, 'Hani']
    ],
    [ // 0x24000-0x24FFF
        [0x24000, 0x24FFF, 'Hani']
    ],
    [ // 0x25000-0x25FFF
        [0x25000, 0x25FFF, 'Hani']
    ],
    [ // 0x26000-0x26FFF
        [0x26000, 0x26FFF, 'Hani']
    ],
    [ // 0x27000-0x27FFF
        [0x27000, 0x27FFF, 'Hani']
    ],
    [ // 0x28000-0x28FFF
        [0x28000, 0x28FFF, 'Hani']
    ],
    [ // 0x29000-0x29FFF
        [0x29000, 0x29FFF, 'Hani']
    ],
    [ // 0x2A000-0x2AFFF
        [0x2A000, 0x2AFFF, 'Hani']
    ],
    [ // 0x2B000-0x2BFFF
        [0x2B000, 0x2BFFF, 'Hani']
    ],
    [ // 0x2C000-0x2CFFF
        [0x2C000, 0x2CFFF, 'Hani']
    ],
    [ // 0x2D000-0x2DFFF
        [0x2D000, 0x2DFFF, 'Hani']
    ],
    [ // 0x2E000-0x2EFFF
        [0x2E000, 0x2EBE0, 'Hani']
    ]
];
// Script codes for individual codepoints, keyed by codepoint number.
// codePointToScript consults this table before any of the range tables.
// NOTE(review): 0x2190 and 0x21FF are listed as two separate codepoints; if
// the whole span between them was meant, it needs a range entry instead.
// Confirm against the source data module.
const charToScript = (() => {
    const singlesByScript = [
        ['Cyrs', [0x460, 0x461]],
        ['Cyrl', [0x462, 0x463]],
        ['Zmth', [0x2135]],
        ['Zsym', [0x2190, 0x21FF]],
        ['Hani', [
            0xFA0E, 0xFA0F, 0xFA11, 0xFA13, 0xFA14, 0xFA1F,
            0xFA21, 0xFA23, 0xFA24, 0xFA27, 0xFA28, 0xFA29
        ]],
        ['Aghb', [0x1056F]],
        ['Tang', [0x16FE0]],
        ['Kana', [0x1B000]],
        ['Hira', [0x1B001]]
    ];
    const table = {};
    for (const [script, codePoints] of singlesByScript) {
        for (const codePoint of codePoints) {
            table[codePoint] = script;
        }
    }
    return table;
})();
// Runs of whole groups that belong to a single script. A group is a block of
// 4096 (0x1000) codepoints and its number is codePoint >> 12.
// Each entry is [first group, last group, script code], bounds inclusive, and
// the entries are sorted so they can be scanned with linearSearch.
const groupToScript = [
    [0x4000 >> 12, 0x9FFF >> 12, 'Hani'],   // groups 4-9
    [0xB000 >> 12, 0xDFFF >> 12, 'Hang'],   // groups 11-13
    [0x20000 >> 12, 0x2EFFF >> 12, 'Hani']  // groups 32-46
];
// Decides which script "wins" when text contains characters from two script
// categories. Each key is a script code and its value is the script code
// that overrules it.
// For example, text containing both Grek and polytonic characters is tagged
// as polytonic.
// NOTE(review): the Latinx entry makes plain Latn win over Latinx, the
// opposite direction from the other two pairs; confirm that is intended.
const scriptOverrulings = {
    Grek: 'polytonic',
    Cyrl: 'Cyrs',
    Latinx: 'Latn'
};
/*
    Scripts that are made up of two or more basic scripts.
    In each list the first item is the code of the compound script and the
    remaining items are its component scripts.
    Hani is listed here as well, purely for convenience: Hani text sometimes
    contains Latn characters, so its list names the script that may appear
    alongside it rather than real components.
*/
const compoundScripts = [
    ['Hani', 'Latn'],
    ['Jpan', 'Hani', 'Hira', 'Kana', 'Latn'],
    ['Kore', 'Hang', 'Hani', 'Latn']
];
// Debug logger. It is chosen once, when the script loads: console.log if
// window.scriptRecognition.log is truthy at that moment, otherwise a
// function that does nothing.
const log = (function chooseLogger() {
    const settings = window.scriptRecognition;
    if (settings && settings.log) {
        return console.log.bind(console);
    }
    return () => {};
})();
// Scans a sorted list of [lower, higher, result] entries (bounds inclusive)
// and returns the result of the first entry that contains codePoint.
// Returns null when no entry matches. This includes the case where the value
// lies beyond the last entry or the list is empty, so callers always get
// either a result or null and never undefined.
function linearSearch(codePoint, ranges) {
    for (const [lower, higher, result] of ranges) {
        // The entries are sorted, so once one starts above the value none of
        // the remaining ones can contain it.
        if (codePoint < lower)
            break;
        if (codePoint <= higher)
            return result;
    }
    return null;
}
// Sort comparator for range entries: orders by the first element and breaks
// ties with the second element.
function compareRanges(range1, range2) {
    if (range1[0] !== range2[0]) {
        return range1[0] - range2[0];
    }
    return range1[1] - range2[1];
}
// Looks up codePoint in a sorted, non-overlapping list of
// [lower, higher, result] entries (bounds inclusive) and returns the result
// of the entry that contains it, or null if there is none or if ranges is
// missing.
//
// Entries that matched before are remembered and tried first, since
// consecutive lookups tend to hit the same few entries. The remembered
// entries are kept separately for every ranges array, so a lookup in one
// table can never be answered with an entry from another table.
function binarySearch(codePoint, ranges) {
    if (!ranges)
        return null;

    // binarySearch.cache maps a ranges array to the entries of that array
    // that have matched so far, sorted by lower bound. The map is created on
    // first use.
    if (!binarySearch.cache)
        binarySearch.cache = new WeakMap();
    let recentRanges = binarySearch.cache.get(ranges);
    if (!recentRanges) {
        recentRanges = [];
        binarySearch.cache.set(ranges, recentRanges);
    }

    for (const [lower, higher, result] of recentRanges) {
        // Sorted by lower bound: nothing further on can contain the value.
        if (codePoint < lower)
            break;
        if (codePoint <= higher)
            return result;
    }

    let bottom = 0;
    let top = ranges.length - 1;
    while (bottom <= top) {
        const middle = (bottom + top) >> 1;
        const range = ranges[middle];
        if (codePoint < range[0])
            top = middle - 1;
        else if (codePoint <= range[1]) {
            // Remember the entry and keep the remembered list sorted so that
            // the early exit in the scan above stays valid.
            recentRanges.push(range);
            recentRanges.sort((a, b) => a[0] - b[0] || a[1] - b[1]);
            return range[2];
        } else
            bottom = middle + 1;
    }
    return null;
}
// Maps a codepoint (a number) to a Wiktionary script code.
// Returns a string (the script code), or null when no table has an entry for
// the codepoint. It never returns undefined.
//
// Lookup order: the single-codepoint table (charToScript), then the
// whole-group table (groupToScript), then the ranges listed for the
// codepoint's group in scriptRanges.
function codePointToScript(codePoint) {
    const singleScript = charToScript[codePoint];
    if (singleScript)
        return singleScript;

    // Groups are blocks of 0x1000 codepoints.
    const group = codePoint >> 12;
    const groupScript = linearSearch(group, groupToScript);
    if (groupScript)
        return groupScript;

    const ranges = scriptRanges[group];
    if (ranges === undefined)
        return null;

    // Short lists are scanned directly; longer ones use the binary search.
    const script = ranges.length > 5
        ? binarySearch(codePoint, ranges)
        : linearSearch(codePoint, ranges);
    // Normalise a missing result so that callers comparing against null
    // (as getScripts does) are not handed undefined.
    return script === undefined ? null : script;
}
// Returns an array with the script code of every character in the string.
// Each code appears once, in order of first appearance. Characters for which
// codePointToScript finds no script are skipped, so the array contains only
// script codes and is empty when nothing was recognised.
function getScripts(string) {
    const scripts = [];
    // for...of walks the string by codepoint, so characters outside the BMP
    // are not split into surrogate halves.
    for (const character of string) {
        const script = codePointToScript(character.codePointAt(0));
        // Guard against both null and undefined so that a missing lookup
        // result can never end up in the list.
        const isRecognized = script !== null && script !== undefined;
        if (isRecognized && !scripts.includes(script))
            scripts.push(script);
    }
    return scripts;
}
// Reduces an array of script codes to a single script code.
// Going through the array in order, the current winner is combined with the
// next script as follows:
//   - identical scripts, or a script that the winner overrules, leave the
//     winner unchanged;
//   - a script that overrules the winner (see scriptOverrulings) replaces it;
//   - otherwise the first compound script (see compoundScripts) whose list
//     contains both becomes the winner.
// Returns the winning script code, or undefined if the array is empty or two
// scripts cannot be reconciled. The string argument is only used in the log
// message.
function resolveScriptConflicts(scripts, string) {
    let winningScript = scripts[0];
    for (const script of scripts.slice(1)) {
        if (winningScript === script || scriptOverrulings[script] === winningScript)
            continue;

        if (scriptOverrulings[winningScript] === script) {
            log(`${script} won out over ${winningScript}`);
            winningScript = script;
            continue;
        }

        const compoundScript = compoundScripts.find(
            componentScripts => componentScripts.includes(winningScript) && componentScripts.includes(script));
        if (!compoundScript) {
            // Nothing later in the array can turn this back into a script,
            // so stop here instead of checking (and logging) every remaining
            // entry.
            log(`No script chosen out of ${scripts.join(", ")} in this string: ${string}.`);
            return undefined;
        }
        log(`${script} and ${winningScript} were replaced with ${compoundScript[0]}`);
        winningScript = compoundScript[0];
    }
    return winningScript;
}
// Returns the single script code chosen for the text: the scripts found by
// getScripts, reduced to one by resolveScriptConflicts. Returns undefined
// when getScripts finds no script at all.
function getScript(text) {
    const foundScripts = getScripts(text);
    if (foundScripts.length > 0) {
        return resolveScriptConflicts(foundScripts, text);
    }
    return undefined;
}
// Reports whether at least one character of the text is assigned the given
// script code by codePointToScript. The text is walked by codepoint and the
// scan stops at the first match.
function containsScript(text, scriptToFind) {
    let found = false;
    for (const character of text) {
        const codePoint = character.codePointAt(0);
        found = codePointToScript(codePoint) === scriptToFind;
        if (found) {
            break;
        }
    }
    return found;
}
// Publish the public functions as globals so that other scripts can call
// them.
Object.assign(window, {
    codePointToScript,
    getScripts,
    getScript,
    containsScript
});
})();