Open main menu

Module:User:Erutuon/script recognition

This module generated the codepoint-to-script lookup table in Module:Unicode data/scripts.

{
	[0x00] = {
		{ 0x00041, 0x0005A, "Latn" },
		{ 0x00061, 0x0007A, "Latn" },
		{ 0x000C0, 0x000D6, "Latn" },
		{ 0x000D8, 0x000F6, "Latn" },
		{ 0x000F8, 0x0024F, "Latn" },
		{ 0x00370, 0x003E1, "Grek" },
		{ 0x003E2, 0x003EF, "Copt" },
		{ 0x003F0, 0x003FF, "Grek" },
		{ 0x00400, 0x0045F, "Cyrl" },
		{ 0x00460, 0x00469, "Cyrs" },
		{ 0x0046A, 0x0046D, "Cyrl" },
		{ 0x0046E, 0x00471, "Cyrs" },
		{ 0x00472, 0x00475, "Cyrl" },
		{ 0x00476, 0x00489, "Cyrs" },
		{ 0x0048A, 0x00527, "Cyrl" },
		{ 0x00531, 0x0058F, "Armn" },
		{ 0x00590, 0x005FF, "Hebr" },
		{ 0x00600, 0x006FF, "Arab" },
		{ 0x00700, 0x0074F, "Syrc" },
		{ 0x00750, 0x0077F, "Arab" },
		{ 0x00780, 0x007B1, "Thaa" },
		{ 0x007C0, 0x007FF, "Nkoo" },
		{ 0x00800, 0x0083E, "Samr" },
		{ 0x00840, 0x0085E, "Mand" },
		{ 0x00860, 0x0086A, "Syrc" },
		{ 0x008A0, 0x008FF, "Arab" },
		{ 0x00900, 0x0097F, "Deva" },
		{ 0x00980, 0x00983, "Beng" },
		{ 0x00985, 0x0098C, "Beng" },
		{ 0x00993, 0x009A8, "Beng" },
		{ 0x009AA, 0x009B0, "Beng" },
		{ 0x009B6, 0x009B9, "Beng" },
		{ 0x009BC, 0x009C4, "Beng" },
		{ 0x009CB, 0x009CE, "Beng" },
		{ 0x009E0, 0x009E3, "Beng" },
		{ 0x009E6, 0x009EF, "Beng" },
		{ 0x009F0, 0x009F1, "as-Beng" },
		{ 0x00A01, 0x00A76, "Guru" },
		{ 0x00A81, 0x00AF1, "Gujr" },
		{ 0x00B01, 0x00B77, "Orya" },
		{ 0x00B82, 0x00BFA, "Taml" },
		{ 0x00C00, 0x00C7F, "Telu" },
		{ 0x00C80, 0x00CF2, "Knda" },
		{ 0x00D02, 0x00D7F, "Mlym" },
		{ 0x00D82, 0x00DF4, "Sinh" },
		{ 0x00E01, 0x00E5B, "Thai" },
		{ 0x00E81, 0x00EDF, "Laoo" },
		{ 0x00F00, 0x00FDA, "Tibt" },
		length = 48,
	},
	[0x01] = {
		{ 0x01000, 0x0109F, "Mymr" },
		{ 0x010A0, 0x010CD, "Geok" },
		{ 0x010D0, 0x010FF, "Geor" },
		{ 0x01100, 0x011FF, "Hang" },
		{ 0x01200, 0x01399, "Ethi" },
		{ 0x013A0, 0x013F4, "Cher" },
		{ 0x01400, 0x0167F, "Cans" },
		{ 0x01680, 0x0169C, "Ogam" },
		{ 0x016A0, 0x016F0, "Runr" },
		{ 0x01700, 0x01714, "Tglg" },
		{ 0x01720, 0x01734, "Hano" },
		{ 0x01740, 0x01753, "Buhd" },
		{ 0x01760, 0x01773, "Tagb" },
		{ 0x01780, 0x017F9, "Khmr" },
		{ 0x01800, 0x018AA, "Mong" },
		{ 0x01900, 0x0194F, "Limb" },
		{ 0x01950, 0x01974, "Tale" },
		{ 0x01980, 0x019DF, "Talu" },
		{ 0x019E0, 0x019FF, "Khmr" },
		{ 0x01A00, 0x01A1F, "Bugi" },
		{ 0x01A20, 0x01AAD, "Lana" },
		{ 0x01B00, 0x01B7C, "Bali" },
		{ 0x01B80, 0x01BBF, "Sund" },
		{ 0x01BC0, 0x01BFF, "Batk" },
		{ 0x01C00, 0x01C4F, "Lepc" },
		{ 0x01C50, 0x01C7F, "Olck" },
		{ 0x01C90, 0x01CBF, "Geor" },
		{ 0x01E00, 0x01EFF, "Latn" },
		{ 0x01F00, 0x01FFE, "polytonic" },
		length = 29,
	},
	[0x02] = {
		{ 0x02190, 0x021FF, "Zsym" },
		{ 0x02200, 0x022FF, "Zmth" },
		{ 0x02300, 0x023FF, "Zsym" },
		{ 0x02500, 0x027BF, "Zsym" },
		{ 0x027C0, 0x027EF, "Zmth" },
		{ 0x02800, 0x028FF, "Brai" },
		{ 0x02980, 0x02AFF, "Zmth" },
		{ 0x02B00, 0x02BFE, "Zsym" },
		{ 0x02C00, 0x02C5E, "Glag" },
		{ 0x02C60, 0x02C7F, "Latinx" },
		{ 0x02C80, 0x02CFF, "Copt" },
		{ 0x02D00, 0x02D2D, "Geok" },
		{ 0x02D30, 0x02D7F, "Tfng" },
		{ 0x02D80, 0x02DDE, "Ethi" },
		{ 0x02E80, 0x02FDF, "Hani" },
		length = 15,
	},
	[0x03] = {
		{ 0x03000, 0x0303F, "Hani" },
		{ 0x03041, 0x0309F, "Hira" },
		{ 0x030A0, 0x030FF, "Kana" },
		{ 0x03105, 0x0312F, "Bopo" },
		{ 0x03131, 0x0318E, "Hang" },
		{ 0x031A0, 0x031BA, "Bopo" },
		{ 0x031C0, 0x031E3, "Hani" },
		{ 0x031F0, 0x031FF, "Kana" },
		{ 0x03300, 0x03357, "Kana" },
		{ 0x0337B, 0x0337F, "Hani" },
		{ 0x03400, 0x03FFF, "Hani" },
		length = 11,
	},
	[0x04] = {
		{ 0x04000, 0x04DB5, "Hani" },
		{ 0x04E00, 0x04FFF, "Hani" },
		length = 2,
	},
	[0x05] = {
		{ 0x05000, 0x05FFF, "Hani" },
		length = 1,
	},
	[0x06] = {
		{ 0x06000, 0x06FFF, "Hani" },
		length = 1,
	},
	[0x07] = {
		{ 0x07000, 0x07FFF, "Hani" },
		length = 1,
	},
	[0x08] = {
		{ 0x08000, 0x08FFF, "Hani" },
		length = 1,
	},
	[0x09] = {
		{ 0x09000, 0x09FFF, "Hani" },
		length = 1,
	},
	[0x0A] = {
		{ 0x0A000, 0x0A4C6, "Yiii" },
		{ 0x0A4D0, 0x0A4FF, "Lisu" },
		{ 0x0A500, 0x0A62B, "Vaii" },
		{ 0x0A640, 0x0A67F, "Cyrs" },
		{ 0x0A680, 0x0A697, "Cyrl" },
		{ 0x0A6A0, 0x0A6F7, "Bamu" },
		{ 0x0A720, 0x0A7FF, "Latinx" },
		{ 0x0A800, 0x0A82B, "Sylo" },
		{ 0x0A840, 0x0A877, "Phag" },
		{ 0x0A880, 0x0A8D9, "Saur" },
		{ 0x0A8E0, 0x0A8FF, "Deva" },
		{ 0x0A900, 0x0A92F, "Kali" },
		{ 0x0A930, 0x0A95F, "Rjng" },
		{ 0x0A980, 0x0A9DF, "Java" },
		{ 0x0A9E0, 0x0A9FE, "Mymr" },
		{ 0x0AA00, 0x0AA5F, "Cham" },
		{ 0x0AA60, 0x0AA7F, "Mymr" },
		{ 0x0AA80, 0x0AADF, "Tavt" },
		{ 0x0AAE0, 0x0AAFF, "Mtei" },
		{ 0x0AB01, 0x0AB2E, "Ethi" },
		{ 0x0AB30, 0x0AB65, "Latinx" },
		{ 0x0AB70, 0x0ABBF, "Cher" },
		{ 0x0ABC0, 0x0ABFF, "Mtei" },
		{ 0x0AC00, 0x0AFFF, "Hang" },
		length = 24,
	},
	[0x0B] = {
		{ 0x0B000, 0x0BFFF, "Hang" },
		length = 1,
	},
	[0x0C] = {
		{ 0x0C000, 0x0CFFF, "Hang" },
		length = 1,
	},
	[0x0D] = {
		{ 0x0D000, 0x0D7A3, "Hang" },
		length = 1,
	},
	[0x0F] = {
		{ 0x0FA27, 0x0FA29, "Hani" },
		{ 0x0FB13, 0x0FB17, "Armn" },
		{ 0x0FB1D, 0x0FB4F, "Hebr" },
		{ 0x0FB50, 0x0FDFD, "Arab" },
		{ 0x0FE70, 0x0FEFC, "Arab" },
		length = 5,
	},
	[0x10] = {
		{ 0x10000, 0x100FA, "Linb" },
		{ 0x10280, 0x1029C, "Lyci" },
		{ 0x102A0, 0x102D0, "Cari" },
		{ 0x102E1, 0x102FB, "Copt" },
		{ 0x10300, 0x10323, "Ital" },
		{ 0x10330, 0x1034A, "Goth" },
		{ 0x10350, 0x1037A, "Perm" },
		{ 0x10380, 0x1039F, "Ugar" },
		{ 0x103A0, 0x103D5, "Xpeo" },
		{ 0x10400, 0x1044F, "Dsrt" },
		{ 0x10450, 0x1047F, "Shaw" },
		{ 0x10480, 0x104A9, "Osma" },
		{ 0x104B0, 0x104FB, "Osge" },
		{ 0x10500, 0x10527, "Elba" },
		{ 0x10530, 0x10563, "Aghb" },
		{ 0x10600, 0x10767, "Lina" },
		{ 0x10800, 0x1083F, "Cprt" },
		{ 0x10840, 0x1085F, "Armi" },
		{ 0x10860, 0x1087F, "Palm" },
		{ 0x10880, 0x108AF, "Nbat" },
		{ 0x108E0, 0x108FF, "Hatr" },
		{ 0x10900, 0x1091F, "Phnx" },
		{ 0x10920, 0x1093F, "Lydi" },
		{ 0x10980, 0x1099F, "Mero" },
		{ 0x109A0, 0x109BF, "Merc" },
		{ 0x10A00, 0x10A58, "Khar" },
		{ 0x10A60, 0x10A7F, "Sarb" },
		{ 0x10A80, 0x10A9F, "Narb" },
		{ 0x10AC0, 0x10AF6, "Mani" },
		{ 0x10B00, 0x10B3F, "Avst" },
		{ 0x10B40, 0x10B5F, "Prti" },
		{ 0x10B60, 0x10B7F, "Phli" },
		{ 0x10B80, 0x10BAF, "Phlp" },
		{ 0x10C00, 0x10C48, "Orkh" },
		{ 0x10C80, 0x10CB2, "Hung" },
		{ 0x10D00, 0x10D39, "Rohg" },
		{ 0x10E60, 0x10E7E, "Ruminumerals" },
		{ 0x10F00, 0x10F27, "Sogo" },
		{ 0x10F30, 0x10F59, "Sogd" },
		length = 39,
	},
	[0x11] = {
		{ 0x11000, 0x1107F, "Brah" },
		{ 0x11080, 0x110CD, "Kthi" },
		{ 0x110D0, 0x110F9, "Sora" },
		{ 0x11100, 0x11146, "Cakm" },
		{ 0x11150, 0x11176, "Mahj" },
		{ 0x11180, 0x111D9, "Shrd" },
		{ 0x11200, 0x1123D, "Khoj" },
		{ 0x11280, 0x112A9, "Mult" },
		{ 0x112B0, 0x112F9, "Sind" },
		{ 0x11301, 0x11374, "Gran" },
		{ 0x11400, 0x1145E, "Newa" },
		{ 0x11480, 0x114D9, "Tirh" },
		{ 0x11580, 0x115DD, "Sidd" },
		{ 0x11600, 0x11659, "Modi" },
		{ 0x11680, 0x116C9, "Takr" },
		{ 0x11700, 0x1173F, "Ahom" },
		{ 0x11800, 0x1183B, "Dogr" },
		{ 0x118A0, 0x118FF, "Wara" },
		{ 0x11A00, 0x11A47, "Zanb" },
		{ 0x11A50, 0x11AA2, "Soyo" },
		{ 0x11AC0, 0x11AF8, "Pauc" },
		{ 0x11C00, 0x11C6C, "Bhks" },
		{ 0x11C70, 0x11CB6, "Marc" },
		{ 0x11D00, 0x11D59, "Gonm" },
		{ 0x11D60, 0x11DA9, "Gong" },
		{ 0x11EE0, 0x11EF8, "Maka" },
		length = 26,
	},
	[0x12] = {
		{ 0x12000, 0x1236E, "Xsux" },
		{ 0x12400, 0x12473, "Xsux" },
		length = 2,
	},
	[0x13] = {
		{ 0x13000, 0x1342E, "Egyp" },
		length = 1,
	},
	[0x14] = {
		{ 0x14400, 0x14646, "Hluw" },
		length = 1,
	},
	[0x16] = {
		{ 0x16800, 0x16A38, "Bamu" },
		{ 0x16A40, 0x16A6F, "Mroo" },
		{ 0x16AD0, 0x16AF5, "Bass" },
		{ 0x16B00, 0x16B8F, "Hmng" },
		{ 0x16E40, 0x16E9A, "Medf" },
		{ 0x16F00, 0x16F9F, "Plrd" },
		length = 6,
	},
	[0x17] = {
		{ 0x17000, 0x17FFF, "Tang" },
		length = 1,
	},
	[0x18] = {
		{ 0x18000, 0x18AF2, "Tang" },
		length = 1,
	},
	[0x1B] = {
		{ 0x1B001, 0x1B11E, "Hira" },
		{ 0x1B170, 0x1B2FB, "Nshu" },
		{ 0x1BC00, 0x1BC9F, "Dupl" },
		length = 3,
	},
	[0x1D] = {
		{ 0x1D100, 0x1D1DD, "musical" },
		{ 0x1D2E0, 0x1D2F3, "Maya" },
		{ 0x1D400, 0x1D7FF, "Zmth" },
		{ 0x1D800, 0x1DAAF, "Sgnw" },
		length = 4,
	},
	[0x1E] = {
		{ 0x1E000, 0x1E02A, "Glag" },
		{ 0x1E800, 0x1E8D6, "Mend" },
		{ 0x1E900, 0x1E95F, "Adlm" },
		length = 3,
	},
	[0x1F] = {
		{ 0x1F000, 0x1F0F5, "Zsym" },
		{ 0x1F300, 0x1FA6D, "Zsym" },
		length = 2,
	},
	[0x20] = {
		{ 0x20000, 0x20FFF, "Hani" },
		length = 1,
	},
	[0x21] = {
		{ 0x21000, 0x21FFF, "Hani" },
		length = 1,
	},
	[0x22] = {
		{ 0x22000, 0x22FFF, "Hani" },
		length = 1,
	},
	[0x23] = {
		{ 0x23000, 0x23FFF, "Hani" },
		length = 1,
	},
	[0x24] = {
		{ 0x24000, 0x24FFF, "Hani" },
		length = 1,
	},
	[0x25] = {
		{ 0x25000, 0x25FFF, "Hani" },
		length = 1,
	},
	[0x26] = {
		{ 0x26000, 0x26FFF, "Hani" },
		length = 1,
	},
	[0x27] = {
		{ 0x27000, 0x27FFF, "Hani" },
		length = 1,
	},
	[0x28] = {
		{ 0x28000, 0x28FFF, "Hani" },
		length = 1,
	},
	[0x29] = {
		{ 0x29000, 0x29FFF, "Hani" },
		length = 1,
	},
	[0x2A] = {
		{ 0x2A000, 0x2AFFF, "Hani" },
		length = 1,
	},
	[0x2B] = {
		{ 0x2B000, 0x2BFFF, "Hani" },
		length = 1,
	},
	[0x2C] = {
		{ 0x2C000, 0x2CFFF, "Hani" },
		length = 1,
	},
	[0x2D] = {
		{ 0x2D000, 0x2DFFF, "Hani" },
		length = 1,
	},
	[0x2E] = {
		{ 0x2E000, 0x2EBE0, "Hani" },
		length = 1,
	},

	individual = {
		[0x00462] = "Cyrl",
		[0x00463] = "Cyrl",
		[0x0098F] = "Beng",
		[0x00990] = "Beng",
		[0x009A1] = "Beng",
		[0x009A2] = "Beng",
		[0x009AF] = "Beng",
		[0x009B2] = "Beng",
		[0x009BC] = "Beng",
		[0x009C7] = "Beng",
		[0x009C8] = "Beng",
		[0x009D7] = "Beng",
		[0x02135] = "Zmth",
		[0x0FA0E] = "Hani",
		[0x0FA0F] = "Hani",
		[0x0FA11] = "Hani",
		[0x0FA13] = "Hani",
		[0x0FA14] = "Hani",
		[0x0FA1F] = "Hani",
		[0x0FA21] = "Hani",
		[0x0FA23] = "Hani",
		[0x0FA24] = "Hani",
		[0x1056F] = "Aghb",
		[0x16FE0] = "Tang",
		[0x16FE1] = "Nshu",
		[0x1B000] = "Kana",
	},

	blocks = {
		{ 0x04, 0x09, "Hani" },
		{ 0x0B, 0x0D, "Hang" },
		{ 0x17, 0x18, "Tang" },
		{ 0x20, 0x2E, "Hani" },
	},
}

-- Table of functions exported by this module; returned at the end of the file.
local export = {}

-- Local aliases for library functions used by the code below.
local getCodepoint = mw.ustring.codepoint
local U = mw.ustring.char
local floor = math.floor

-- Full text of the current page title; read by log() to decide whether to log.
local title = mw.title.getCurrentTitle().fullText

-- Builds an argument validator bound to a function name and an expected type.
-- The returned closure takes (argIndex, arg) and forwards the check to
-- libraryUtil.checkType; libraryUtil is only required when the closure runs.
local function check(funcName, expectType)
	local function validate(argIndex, arg)
		local libraryUtil = require("libraryUtil")
		libraryUtil.checkType(funcName, argIndex, arg, expectType)
	end
	return validate
end

-- Minimal string buffer. Pieces live at integer keys 1..n and the running
-- count is kept in the field `n`.
local output_mt = {
	-- Append one string to the buffer.
	insert = function(self, str)
		local nextIndex = self.n + 1
		self.n = nextIndex
		self[nextIndex] = str
	end,

	-- Append string.format(...) to the buffer.
	insert_format = function(self, ...)
		self:insert(string.format(...))
	end,

	-- Concatenate the buffer; accepts the same arguments as table.concat.
	join = table.concat,
}
output_mt.__index = output_mt

-- Create an empty buffer.
local function Output()
	local buffer = { n = 0 }
	return setmetatable(buffer, output_mt)
end

-- Serializes a codepoint-to-script lookup table as Lua source text and returns
-- the result of passing that text to Module:debug's highlight function.
--
-- val: table whose numeric keys hold arrays of { lower, upper, script }
--      (with an optional `length` field) and which also has the fields
--      `individual` (codepoint -> script) and `blocks` (array of
--      { firstKey, lastKey, script }). All three parts are read below.
local function dump(val)
	local output = Output()
	
	output:insert('{\n')
	-- Template for one { lower, upper, script } entry.
	local range_format =
[[
		{ 0x%05X, 0x%05X, "%s" },
]]
	local length_format = -- also close range array
[[
		length = %d,
	},
]]
	-- Visit numeric keys in ascending order; keys with no entry are skipped.
	-- NOTE(review): makeRangeKey divides by 0x1000, so keys above 0x10F are
	-- presumably never present and this bound merely over-iterates - confirm.
	for i = 0, 0x10FFFF / 0x100 do
		local ranges = val[i]
		if ranges then
			output:insert_format(
[[
	[0x%02X] = {
]], i)
			for j, range in ipairs(ranges) do
				output:insert_format(range_format, unpack(range))
			end
			-- -1 is written when the array carries no `length` field.
			output:insert_format(length_format, ranges.length or -1)
		end
	end
	
	output:insert
[[

	individual = {
]]
	
	-- Individual codepoints, in the order produced by Module:table's sortedPairs.
	for codepoint, script in require "Module:table".sortedPairs(val.individual) do
		output:insert_format(
[[
		[0x%05X] = "%s",
]],		codepoint, script)
	end
	output:insert [[
	},

	blocks = {
]]
	
	-- Runs of range keys recorded as belonging to a single script.
	for _, blockRange in ipairs(val.blocks) do
		output:insert_format(
[[
		{ 0x%02X, 0x%02X, "%s" },
]],		unpack(blockRange))
	end
	
	output:insert
[[
	},
}]]
	
	return require "Module:debug".highlight(table.concat(output))
end

-- Logs a "Ranges:" heading followed by one indented line per range, each
-- formatted from the range's unpacked fields (lower, upper, script).
local function printRanges(ranges)
	local pieces = { "Ranges:" }
	for _, bounds in ipairs(ranges) do
		pieces[#pieces + 1] = string.format('\n\tU+%04X-U+%04X: %s', unpack(bounds))
	end
	mw.log(table.concat(pieces))
end

-- Returns true when next(t) yields a truthy first key, false otherwise.
local function hasContents(t)
	-- The unary operator keeps only the first value returned by next.
	return not not next(t)
end

-- Forwards `message` to mw.log, but only when the current page title ends in
-- "testcases/documentation".
local function log(message)
	if not title:match("testcases/documentation$") then
		return
	end
	mw.log(message)
end

-- Maps a codepoint to the key of the 0x1000-codepoint chunk that contains it.
local function makeRangeKey(codepoint)
	local chunkWidth = 0x1000
	return floor(codepoint / chunkWidth)
end

-- Returns true when lower <= value <= upper. All three arguments are
-- type-checked as numbers first (via the validator built by check()).
local function isInRange(value, lower, upper)
	local checkNumber = check("isInRange", "number")
	checkNumber(1, value)
	checkNumber(2, lower)
	checkNumber(3, upper)

	if not (value >= lower) then
		return false
	end
	return value <= upper
end

-- Tests whether a character belongs to the set described by characterLookup.
--
-- characterLookup: array of { lower, upper } ranges, optionally with a
--                  `values` set of single codepoints and `smallest`/`largest`
--                  bounds (the shape built by export.makeCharacterLookup).
-- character:       a one-character string or a codepoint number.
-- Returns true or false; raises an error for any other argument.
local function lookupCharacter(characterLookup, character)
	local codepoint
	local kind = type(character)
	if kind == "string" then
		local length = mw.ustring.len(character)
		if length ~= 1 then
			error("Character " .. character .. " has length " .. length .. ". It is supposed to be a single character.")
		end
		codepoint = getCodepoint(character)
	elseif kind == "number" then
		codepoint = character
	else
		error("Character is the wrong type: " .. kind .. ".")
	end

	-- Quick rejection using the recorded overall bounds, when present.
	local smallest = characterLookup.smallest
	if smallest and not isInRange(codepoint, smallest, characterLookup.largest) then
		return false
	end

	-- Single codepoints first, then the ranges in order.
	local values = characterLookup.values
	if values and values[codepoint] then
		return true
	end

	for _, range in ipairs(characterLookup) do
		if isInRange(codepoint, range[1], range[2]) then
			return true
		end
	end

	return false
end

-- Calls func once for each character of str, in order, passing the character
-- as a one-character string. Does nothing when func is not a function.
local function forEachChar(str, func)
	if type(func) ~= "function" then
		return
	end
	for i = 1, mw.ustring.len(str) do
		-- Declared local so the loop no longer writes a global named `char`.
		local char = mw.ustring.sub(str, i, i)
		func(char)
	end
end

--[[
	Builds a lookup table from a character-class-style pattern such as
	"a-zA-Z_".

	Every "x-y" pair in the pattern becomes a { codepoint(x), codepoint(y) }
	entry in the array part of the result. Every remaining character is
	recorded in result.values[codepoint] = true (the field is omitted when
	there are none). result.smallest and result.largest hold the lowest and
	highest recorded codepoints, unless the first range already spans exactly
	those bounds.

	Range endpoints are not checked for ascending order.
]]
function export.makeCharacterLookup(pattern)
	local characterLookup = {}
	local values = {}
	local allValues = {}
	
	-- Create ranges in which all characters belong to the script; the matched
	-- text is removed so that only single characters remain.
	local workingString = mw.ustring.gsub(
		pattern,
		"([^-])%-([^-])",
		function(item1, item2)
			local codepoint1, codepoint2 = getCodepoint(item1), getCodepoint(item2)
			table.insert(characterLookup, { codepoint1, codepoint2 })
			allValues[codepoint1] = true
			allValues[codepoint2] = true
			return ""
		end
	)
	if workingString ~= "" then
		-- gsub is used purely to visit each remaining character; the callback
		-- returns nothing and the result string is discarded.
		mw.ustring.gsub(
			workingString,
			".",
			function(char)
				local codepoint = getCodepoint(char)
				values[codepoint] = true
				allValues[codepoint] = true
			end
		)
	end
	
	--[[
		Place the tables of ranges in the Unicode order (the patterns
		should already be in that order, but just to be safe).
	]]
	table.sort(
		characterLookup,
		function(item1, item2)
			return item1[1] < item2[1]
		end)
	
	-- Assumes numKeys returns the numeric keys in ascending order, so the
	-- first and last entries are the extremes.
	local allValuesKeys = require("Module:table").numKeys(allValues)
	
	local smallest, largest = allValuesKeys[1], allValuesKeys[#allValuesKeys]
	
	-- Don't create an empty values table.
	if hasContents(values) then
		characterLookup.values = values
	end
	
	--[[
		Don't record the smallest and largest values if they're found in the
		first range. A pattern with no ranges at all has no first range, so
		the bounds are always recorded in that case (previously this indexed
		nil and raised an error).
	]]
	local firstRange = characterLookup[1]
	if not (firstRange and smallest == firstRange[1] and largest == firstRange[2]) then
		characterLookup.smallest, characterLookup.largest = smallest, largest
	end
	
	return characterLookup
end

-- Builds a character lookup (see export.makeCharacterLookup) for every entry
-- of Module:scripts/data that has a `characters` pattern. Returns a table
-- mapping script code -> lookup; scripts sharing an identical pattern share
-- one lookup table.
function export.makeAllScriptsCharacterLookup()
	local lookupByScript = {}
	-- pattern -> the script code most recently seen with that pattern
	local latestScriptForPattern = {}
	local scriptData = mw.loadData("Module:scripts/data")

	for code, data in pairs(scriptData) do
		local pattern = data.characters
		if pattern then
			local earlierCode = latestScriptForPattern[pattern]
			if not earlierCode then
				lookupByScript[code] = export.makeCharacterLookup(pattern)
			else
				-- Reuse instead of generating an identical table twice.
				lookupByScript[code] = lookupByScript[earlierCode]
			end
			latestScriptForPattern[pattern] = code
		end
	end

	return lookupByScript
end

-- fa-Arab → Arab-fa
-- Swaps the part before the first hyphen with the rest of the code. Codes
-- without a hyphen are returned unchanged. Exactly one value is returned:
-- gsub's substitution count is deliberately dropped so it cannot leak into
-- an argument list or table constructor at the call site.
local function switchLangSc(scriptCode)
	local switched = scriptCode:gsub("^([^-]+)%-(.+)$", "%2-%1")
	return switched
end

-- To ensure that Grek and Latn appear first.
-- This also makes Grek and Latn take precedence when generating
-- the codepoint-to-script lookup table.
local scriptCodeReplacements = {
	polytonic = "Grek2",
	Latinx = "Latnx",
	Latf = "Latnf",
}

-- Turns a script code into the key used for sorting:
--   * codes listed in scriptCodeReplacements are replaced outright;
--   * codes containing an uppercase letter followed by three lowercase
--     letters (e.g. "Latn", "fa-Arab") are returned unchanged;
--   * any other non-empty code gets a "~" prefix.
-- Always returns exactly one value (gsub's count is dropped).
-- NOTE(review): the patterns are unanchored, so a longer ad hoc code that
-- merely contains such a run (e.g. "Ruminumerals") is also left unchanged -
-- confirm that this is intended.
local function modifyAdHocCode(code)
	local replacement = scriptCodeReplacements[code]
	if replacement then
		return replacement
	elseif not (code:find("%u%l%l%l") or code:find("%l%l%l%-%u%l%l%l")) then
		-- Parentheses truncate gsub's (string, count) result to the string.
		return (code:gsub("^(.+)$", "~%1"))
	else
		return code
	end
end
	
-- Comparison function for sorting keys. Numbers sort before strings; strings
-- are compared case-insensitively after being passed through modifyAdHocCode
-- and switchLangSc; anything else falls back to the `<` operator.
local function keySort(key1, key2)
	local type1, type2 = type(key1), type(key2)

	if type1 == "number" and type2 == "string" then
		return true
	end
	if type1 == "string" and type2 == "number" then
		return false
	end
	if type1 ~= "string" then
		return key1 < key2
	end

	local sortKey1 = modifyAdHocCode(key1)
	local sortKey2 = modifyAdHocCode(key2)
	sortKey1 = switchLangSc(sortKey1)
	sortKey2 = switchLangSc(sortKey2)
	return mw.ustring.lower(sortKey1) < mw.ustring.lower(sortKey2)
end

-- Formats a number as an uppercase hexadecimal literal, e.g. 255 -> "0xFF".
local function hex(number)
	return ("0x%X"):format(number)
end

-- Splits the inclusive range [lower, upper] at multiples of `width`.
--
-- Returns a table keyed by chunk number (floor(start / width)) whose values
-- are { first, last } pairs clipped to [lower, upper]. When `testing` is
-- truthy the bounds are stored as hex strings instead of numbers. If either
-- bound is missing, the arguments are logged and nil is returned.
local function divideRange(lower, upper, width, testing)
	if not (lower and upper) then
		mw.log("divideRange failed:", lower, upper, width, testing)
		return nil
	end

	local ranges = {}
	local key = floor(lower / width)

	-- Always runs at least once, then continues while the next chunk still
	-- starts at or below `upper`.
	repeat
		local first = key * width
		local last = first + width - 1

		if first < lower then
			first = lower
		end
		if last > upper then
			last = upper
		end
		if testing then
			first, last = hex(first), hex(last)
		end

		ranges[key] = { first, last }
		key = key + 1
	until key * width > upper

	return ranges
end

-- Demonstration entry point: divides a fixed sample range into
-- 0x1000-codepoint chunks and returns a plain-text listing. The first line
-- holds the overall bounds; each following line shows one chunk key with its
-- clipped bounds.
--
-- The chunks are formatted here rather than handed to dump(), because dump()
-- expects arrays of { lower, upper, script } plus `individual` and `blocks`
-- fields, none of which divideRange produces.
function export.showDividedRange(frame)
	local lower = 0x2A700
	local higher = 0x2B73F
	local width = 0x1000
	-- With testing = true the bounds come back already formatted as hex strings.
	local dividedRange = divideRange(lower, higher, width, true)
	
	local lines = { hex(lower) .. ", " .. hex(higher) }
	for position = floor(lower / width), floor(higher / width) do
		local range = dividedRange[position]
		if range then
			lines[#lines + 1] = ("[%s] = { %s, %s }")
				:format(hex(position), range[1], range[2])
		end
	end
	return table.concat(lines, "\n")
end

-- Scripts that consist entirely of characters from another script.
-- Codes listed here are skipped when the codepoint-to-script table is built.
local scriptBlacklist = {
	Latf = true,
	Hans = true,
	Hant = true,
	Kore = true,
	Jpan = true,
	["fa-Arab"] = true,
	["kk-Arab"] = true,
	["ks-Arab"] = true,
	["ku-Arab"] = true,
	["ms-Arab"] = true,
	["mzn-Arab"] = true,
	["ota-Arab"] = true,
	["pa-Arab"] = true,
	["ps-Arab"] = true,
	["sd-Arab"] = true,
	["tt-Arab"] = true,
	["ug-Arab"] = true,
	["ur-Arab"] = true,
	["nv-Latn"] = true,
	["pjt-Latn"] = true,
	Zyyy = true,
}

-- Comparison function for { lower, upper, script } ranges: orders by the
-- numeric value of the lower bound, breaking ties with keySort on the script.
local function sortRange(range1, range2)
	local start1 = tonumber(range1[1])
	local start2 = tonumber(range2[1])
	if start1 ~= start2 then
		return start1 < start2
	end
	return keySort(range1[3], range2[3])
end

-- Formats a { lower, upper, script } range for log messages, as
-- "Script (U+XXXX-U+XXXX)", or as just "U+XXXX-U+XXXX" when hideScriptName is
-- truthy.
local function printScriptRange(range, hideScriptName)
	if not hideScriptName then
		return string.format("%s (U+%04X-U+%04X)", range[3], range[1], range[2])
	end
	return string.format("U+%04X-U+%04X", range[1], range[2])
end
	
-- Precedence table for overlapping ranges of two different scripts: the
-- script named by a key wins over the script named by its value.
local overrides = {
	["Beng"] = "as-Beng",
	["Cyrl"] = "Cyrs",
	["Grek"] = "polytonic",
	["Latn"] = "Latinx",
}

--[[
	Resolves overlaps between neighbouring ranges in place, using the
	`overrides` table to decide which script wins.

	ranges:     array of { lower, upper, script }. It must provide an
	            insert(self, i, value) method, as the arrays built in
	            makeCodepointToScriptLookup do.
	individual: optional codepoint -> script table. When given, a single
	            leftover codepoint of an overridden range is recorded there
	            instead of being kept as a one-codepoint range. When omitted,
	            such a codepoint is kept as a one-codepoint range.

	A pair (prev, range) is only examined when range starts at or before
	prev's start, or ends at or before prev's end. If `overrides` does not
	say which of the two scripts wins, the conflict is logged and both ranges
	are left untouched.
]]
local function fixRangeOverlaps(ranges, individual)
	local i = 1
	while ranges[i] do
		local range = ranges[i]
		local prev = ranges[i - 1]
		if prev and (range[1] <= prev[1] or range[2] <= prev[2]) then
			local overrider, overridden
			if overrides[range[3]] == prev[3] then
				overrider, overridden = range, prev
			elseif overrides[prev[3]] == range[3] then
				overrider, overridden = prev, range
			end
			
			if not overrider then
				-- No precedence rule: report it and move on.
				mw.log(("Should %s override %s or the other way around?")
					:format(printScriptRange(prev), printScriptRange(range)))
			else
				mw.log(("%s overrides %s"):format(printScriptRange(overrider),
					printScriptRange(overridden)))
				
				-- Removes the overridden range; when that was `prev`, step
				-- the index back so that it points at the overrider again.
				local function removeOverridden()
					table.remove(ranges, overridden == range and i or i - 1)
					if overridden == prev then
						i = i - 1
					end
				end
				
				if overrider[1] <= overridden[1] then -- low end of overridden is inside overrider
					if overridden[2] <= overrider[2] then -- overridden entirely within overrider
						removeOverridden()
					elseif individual and overridden[2] - overrider[2] == 1 then
						-- one codepoint of overridden is outside overrider
						removeOverridden()
						individual[overridden[2]] = overridden[3]
					else -- upper part of overridden outside of overrider
						overridden[1] = overrider[2] + 1
					end
				else -- overridden[1] < overrider[1]: low end of overridden is outside overrider
					removeOverridden()
					
					if individual and overrider[1] - overridden[1] == 1 then
						-- single codepoint at low end of overridden is outside overrider
						individual[overridden[1]] = overridden[3]
					else -- keep the low part of overridden as its own range
						ranges:insert(i,
							{ overridden[1], overrider[1] - 1, overridden[3] })
						i = i + 1
					end
					
					if overrider[2] < overridden[2] then -- high end of overridden is outside overrider
						if individual and overridden[2] - overrider[2] == 1 then
							-- single codepoint at high end of overridden is outside of overrider
							individual[overridden[2]] = overridden[3]
						else
							ranges:insert(i,
								{ overrider[2] + 1, overridden[2], overridden[3] })
							i = i + 1
						end
					end
				end
			end
		end
		i = i + 1
	end
end

-- Logs a message for every pair of consecutive ranges in which the earlier
-- range ends at or after the start of the later one. Nothing is modified.
local function checkRangeOverlaps(ranges)
	local earlier = ranges[1]
	if earlier == nil then
		return
	end

	local index = 2
	while ranges[index] ~= nil do
		local later = ranges[index]
		if earlier[2] >= later[1] then
			mw.log(("%s overlaps with %s")
				:format(printScriptRange(earlier), printScriptRange(later)))
		end
		earlier = later
		index = index + 1
	end
end

--[[
	Builds the codepoint-to-script lookup table from the per-script character
	lookups returned by export.makeAllScriptsCharacterLookup().

	The result has:
	  * numeric keys (floor(codepoint / 0x1000)), each holding a sorted array
	    of { lower, upper, script } ranges plus a `length` field;
	  * `individual`: codepoint -> script for single codepoints;
	  * `blocks`: array of { firstKey, lastKey, script } for runs of
	    consecutive keys whose ranges all belong to one script.

	Scripts in scriptBlacklist are skipped. Scripts are processed in keySort
	order, and the first script to claim a range or codepoint keeps it.

	testing: passed through to divideRange, which then stores bounds as hex
	         strings. NOTE(review): the later steps do arithmetic and
	         comparisons on the bounds, so this mode presumably only suits
	         inspection of the first step - confirm before relying on it.
]]
local function makeCodepointToScriptLookup(testing)
	local output = {}
	local ranges_mt = {
		-- ranges:insert(value) appends. ranges:insert(i, value) inserts at
		-- position i, or just after it when the range currently at i starts
		-- below the new one.
		insert = function (self, i, value)
			if value ~= nil then
				local neighbor = self[i]
				if neighbor and neighbor[1] < value[1] then
					i = i + 1
					neighbor = self[i]
				end
				-- The new range may land after the last element, in which
				-- case there is no neighbour to mention in the log.
				if neighbor then
					mw.log(("Inserting %s below %s")
						:format(printScriptRange(value), printScriptRange(neighbor)))
				else
					mw.log(("Inserting %s at the end")
						:format(printScriptRange(value)))
				end
				table.insert(self, i, value)
			else
				value = i
				table.insert(self, value)
			end
		end,
		remove = table.remove,
	}
	ranges_mt.__index = ranges_mt
	
	-- Reading a missing key of `output` creates an empty range array for it.
	setmetatable(output,
		{
			__index = function (self, key)
				local val = setmetatable({}, ranges_mt)
				self[key] = val
				return val
			end,
		})
	output.individual = {}
	local individual = output.individual
	local rangeStrings = {}
	
	-- Step 1: collect ranges (split at 0x1000 boundaries) and individual
	-- codepoints from every script that is not blacklisted.
	local allScriptsCharacterLookup = export.makeAllScriptsCharacterLookup()
	for scriptCode, lookup in require("Module:table").sortedPairs(allScriptsCharacterLookup, keySort) do
		if not scriptBlacklist[scriptCode] then
			for key, value in ipairs(lookup) do
				if type(value) == "table" then
					local newRanges = divideRange(value[1], value[2], 0x1000, testing)
					if newRanges then
						for position, newRange in pairs(newRanges) do
							-- The separator keeps distinct ranges from
							-- producing the same key (1..234 vs 12..34).
							local rangeString = newRange[1] .. "-" .. newRange[2]
							if rangeStrings[rangeString] then
								mw.log(("The range U+%04X-U+%04X is already "
									.. "recorded as belonging to the script "
									.. "code %s.")
									:format(newRange[1], newRange[2], rangeStrings[rangeString]))
							else
								rangeStrings[rangeString] = scriptCode
								
								output[position]:insert({ newRange[1], newRange[2], scriptCode })
							end
						end
					end
				end
			end
			
			if lookup.values then
				for codepoint in pairs(lookup.values) do
					if individual[codepoint] then
						mw.log("The codepoint " .. hex(codepoint) ..
								" is already recorded as belonging to the script code " ..
								individual[codepoint] .. ".")
					else
						individual[codepoint] = scriptCode
					end
				end
			end
		end
	end
	
	-- Step 2: merge directly adjacent ranges of the same script, then sort.
	for position, ranges in pairs(output) do
		if type(position) == "number" then
			local prevRange
			local i = 1
			while ranges[i] do
				local range = ranges[i]
				if prevRange and range[3] == prevRange[3] and prevRange[2] == range[1] - 1 then
					mw.log(("Merged %s with %s")
						:format(printScriptRange(range), printScriptRange(prevRange)))
					prevRange[2] = range[2]
					-- The next element slides into position i. prevRange
					-- stays the surviving range so that a further adjacent
					-- range is merged into it rather than into the removed one.
					table.remove(ranges, i)
				else
					prevRange = range
					i = i + 1
				end
			end
			table.sort(ranges, sortRange)
		end
	end
	
	-- Step 3: turn runs of consecutive individual codepoints with the same
	-- script into ranges, provided the run is long enough and stays within
	-- one 0x1000 chunk.
	local individualCodepoints = require "Module:table".numKeys(individual)
	local minimumCodepointRange = 3
	local i = 1
	while individualCodepoints[i] do
		local codepoint = individualCodepoints[i]
		local script = individual[codepoint]
		if not script then
			error(("No script for U+%04X"):format(codepoint))
		end
		
		local startOfRun = codepoint
		while individual[codepoint + 1] == script do
			codepoint = codepoint + 1
			i = i + 1
		end
		
		if codepoint - startOfRun + 1 >= minimumCodepointRange
			and makeRangeKey(startOfRun) == makeRangeKey(codepoint) then
			for j = startOfRun, codepoint do
				individual[j] = nil
			end
			
			-- The metatable on `output` supplies an empty array if needed.
			local ranges = output[makeRangeKey(startOfRun)]
			ranges:insert({ startOfRun, codepoint, script })
			mw.log(("Added range %s from a run in individual map")
				:format(printScriptRange { startOfRun, codepoint, script }))
			table.sort(ranges, sortRange)
		end
		
		i = i + 1
	end
	
	-- Step 4: resolve overlaps between ranges of different scripts. The
	-- individual map is passed along so that single leftover codepoints can
	-- be recorded in it.
	for position, ranges in pairs(output) do
		if type(position) == "number" then
			if ranges[2] then
				fixRangeOverlaps(ranges, individual)
			end
		end
	end
	
	-- Step 5: add length field to range arrays, check that there are no
	-- overlaps between ranges, and record blocks. The keys are visited in
	-- ascending order because block detection compares each key with the one
	-- before it; pairs() gives no ordering guarantee.
	local positions = {}
	for key in pairs(output) do
		if type(key) == "number" then
			positions[#positions + 1] = key
		end
	end
	table.sort(positions)
	
	output.blocks = {}
	local prevScript, prevIndex, blockRange
	for _, index in ipairs(positions) do
		local ranges = output[index]
		ranges.length = #ranges
		if ranges[2] then
			checkRangeOverlaps(ranges)
		end
		local firstScript = ranges[1] and ranges[1][3]
		if firstScript and (not ranges[2] or require "Module:fun".all(
				function (range)
					return range[3] == firstScript
				end,
				ranges)) then -- All ranges contain the same script.
			-- Only directly consecutive keys may form a block; a key with
			-- no data in between must not be bridged.
			if prevScript and firstScript == prevScript and prevIndex == index - 1 then
				if not blockRange then
					blockRange = { prevIndex, index, prevScript }
					table.insert(output.blocks, blockRange)
				else
					blockRange[2] = index
				end
			else
				blockRange = nil
				prevScript = firstScript
			end
			prevIndex = index
		else
			prevScript = nil
		end
	end
	
	setmetatable(output, nil)
	
	return output
end

--[[
	Binary search: more efficient for the longer lists of codepoint ranges than
	for the shorter ones.

	ranges: array of { lower, upper, ... } sorted by lower bound with no
	        overlaps, optionally carrying its element count in `length`.
	value:  number to look for.
	Returns the range containing value, or nil if there is none.
]]
local function binarySearch(ranges, value)
	-- Can't use # because table is loaded by mw.loadData. Use the stored
	-- `length` when present; otherwise count the array part. Counting every
	-- key would include `length` itself and overshoot the last element.
	local iEnd = ranges.length
	if not iEnd then
		iEnd = 0
		while ranges[iEnd + 1] do
			iEnd = iEnd + 1
		end
	end
	
	if iEnd == 0 then
		return nil
	end
	
	local iStart = 1
	
	-- Do search.
	while iStart <= iEnd do
		-- Calculate middle.
		local iMid = floor((iStart + iEnd) / 2)
		
		-- Get compare value.
		local range = ranges[iMid]
		
		-- Return matching range. Assumes there are no duplicates.
		if isInRange(value, range[1], range[2]) then
			return range
		
		-- Keep searching.
		elseif value < range[1] then
			iEnd = iMid - 1
		else
			iStart = iMid + 1
		end
	end
	return nil
end

-- Linear scan over ranges sorted by lower bound. Returns the third field of
-- the first range containing `number`, and gives up as soon as a range
-- starting above `number` is reached.
local function lookupInOrder(number, ranges)
	for _, range in ipairs(ranges) do
		local lower, upper = range[1], range[2]
		if isInRange(number, lower, upper) then
			return range[3]
		elseif number < lower then
			return nil
		end
	end
end

-- Save previously used codepoint ranges in case another character is in the
-- same range.
local rangesCache = {}

--[[
	Returns the script code recorded for a character, or nil if none is found.

	Sources are consulted in this order: the `individual` map, the cache of
	ranges found earlier, the `blocks` list (keyed by floor(codepoint / 0x1000)),
	and finally a binary search in the range array for the codepoint's chunk.
]]
function export.charToScript(char)
	local lookup = mw.loadData("Module:User:Erutuon/script recognition/data") -- makeCodepointToScriptLookup()
	local codepoint = mw.ustring.codepoint(char)
	
	local individualMatch = lookup.individual[codepoint]
	if individualMatch then
		return individualMatch
	end
	
	local script = lookupInOrder(codepoint, rangesCache)
	if script then
		return script
	end
	
	local index = makeRangeKey(codepoint)
	
	script = lookupInOrder(index, lookup.blocks)
	if script then
		return script
	end
	
	-- Chunks without any recorded range have no entry at all.
	local ranges = lookup[index]
	if not ranges then
		return nil
	end
	
	local range = binarySearch(ranges, codepoint)
	if range then
		-- Keep the cache sorted, since lookupInOrder stops at the first
		-- range that starts above the codepoint.
		table.insert(rangesCache, range)
		table.sort(rangesCache, sortRange)
		return range[3]
	end
	
	return nil
end

-- Returns the first argument (or a built-in sample string) followed by the
-- script code found for each of its characters, e.g. "AB: Latn, Latn".
-- Characters without a script are shown as "nil".
--
-- This function was originally also named `show`, but a later definition of
-- the same name replaced it and made it unreachable; it has its own name now.
function export.showCharScripts(frame)
	local str = frame.args[1] or "ABCD一丨丶丿乙亅"
	
	local result = {}
	forEachChar(
		str,
		function(char)
			table.insert(result, tostring(export.charToScript(char)))
		end
	)

	return str .. ": " .. table.concat(result, ", ")
end

-- Generates the codepoint-to-script lookup table and returns it as
-- highlighted Lua source text.
function export.show(frame)
	return dump(makeCodepointToScriptLookup())
end

return export