-- This is a private module sandbox of Theknightwho, for their own experimentation. Items in this module may be added and removed at Theknightwho's discretion; do not rely on this module's stability.


local char = string.char
local concat = table.concat
local floor = math.floor
local insert = table.insert
local split = mw.text.split
local tonumber = tonumber
local u = mw.ustring.char

-- List of {first, last} pairs (as produced by `hex`) for every gap of more
-- than 0x1000 between the leading hex fields of two consecutive data lines.
local skipped_ranges = {}
do
	-- NOTE(review): `to_hex` presumably formats a number as a hexadecimal
	-- string; confirm against the "hex" module.
	local hex = require("hex").to_hex
	-- NOTE(review): presumably the raw text of UnicodeData.txt; confirm.
	local udata = require("Module:User:Theknightwho/UnicodeData.txt")
	
	-- Value of the most recent leading hex field seen (0 before the first line).
	local last = 0
	for line in udata:gmatch("[^\n]+") do
		-- Only lines that begin with a run of hex digits are considered.
		local cp = line:match("^%x+")
		if cp then
			cp = tonumber(cp, 16)
			-- A jump of more than 0x1000 from the previous value is recorded,
			-- running from the previous value itself up to one below this one.
			if cp - last > 0x1000 then
				insert(skipped_ranges, {hex(last), hex(cp - 1)})
			end
			last = cp
		end
	end
	-- NOTE(review): this `return` exits the whole module chunk, so the module
	-- evaluates to `skipped_ranges` and nothing below this block is ever run
	-- (including the final `return export`). Kept as-is because it looks like a
	-- deliberate sandbox experiment; remove it to re-enable the rest of the module.
	return skipped_ranges
end



-- Text of the collation table; the functions below scan it one line at a time.
-- NOTE(review): presumably the contents of the Unicode DUCET file (allkeys.txt); confirm.
local ducet = require("Module:User:Theknightwho/UCA/DUCET")

-- Inclusive {first, last} code point ranges that are cut out of the code point
-- space before weights are stored: export.weights picks the ranges that start in
-- the requested plane, and adjust_codepoint shifts every code point above a range
-- down by that range's size.
-- NOTE(review): presumably these are ranges with no explicit entries in the
-- table (implicit weights or unassigned); confirm when updating Unicode versions.
local implicit_ranges = {
	{0x3400, 0x4DBF}, -- CJK Unified Ideographs Extension A
	{0x4E00, 0x9FFF}, -- CJK Unified Ideographs
	{0xAC00, 0xD7AF}, -- Hangul Syllables
	{0xD800, 0xF8FF}, -- Surrogates, Private Use Area
	{0x12550, 0x12F8F}, -- Unassigned
	{0x13460, 0x143FF}, -- Unassigned
	{0x14680, 0x167FF}, -- Unassigned
	{0x17000, 0x1AFEF}, -- Tangut, Tangut Components, Khitan Small Script, Tangut Supplement, Unassigned
	{0x1B170, 0x1BBFF}, -- Nushu, Unassigned
	{0x1BCB0, 0x1CEFF}, -- Unassigned
	{0x1DAB0, 0x1DEFF}, -- Unassigned
	{0x20000, 0x2A6DF}, -- CJK Unified Ideographs Extension B
	{0x2A700, 0x2B73F}, -- CJK Unified Ideographs Extension C
	{0x2B740, 0x2B81F}, -- CJK Unified Ideographs Extension D
	{0x2B820, 0x2CEAF}, -- CJK Unified Ideographs Extension E
	{0x2CEB0, 0x2EBEF}, -- CJK Unified Ideographs Extension F
	{0x2EBF0, 0x2EE5F}, -- CJK Unified Ideographs Extension I
	{0x30000, 0x3134F}, -- CJK Unified Ideographs Extension G
	{0x31350, 0x323AF}, -- CJK Unified Ideographs Extension H
}

-- Table of functions returned at the end of the module.
local export = {}

do
	-- Format a number as lowercase hexadecimal, zero-padded to at least four digits.
	local function to_hex(cp)
		return string.format("%04x", cp)
	end
	
	-- Write a code point as a "\uXXXX" escape. Code points from 0x10000 upwards
	-- are written as two escapes: a high surrogate followed by a low surrogate.
	local function escape_char(cp)
		if cp >= 0x10000 then
			-- The top bits of the offset above 0x10000 go in the high surrogate;
			-- the bottom ten bits of the code point go in the low surrogate.
			local high = 0xD800 + floor((cp - 0x10000) / 0x400)
			local low = 0xDC00 + cp % 0x400
			return "\\u" .. to_hex(high) .. "\\u" .. to_hex(low)
		end
		return "\\u" .. to_hex(cp)
	end
	
	-- Record the escaped code point sequence of the first entry seen for each
	-- distinct leading weight.
	--   line: one line of the collation table.
	--   head_chars: table keyed by the numeric value of the first weight field of
	--     the line's first bracketed element; values are the line's code points
	--     written with escape_char and joined together. Updated in place.
	-- Lines that do not start with a code point field, that carry no bracketed
	-- weight, or whose leading weight has already been recorded are ignored.
	local function process_line(line, head_chars)
		local cps = line:match("^[%x ]+%f[ ]")
		if not cps then
			return
		end
		-- First hex field after "[." or "[*".
		local first_weight = line:match("%[[*.](%x+)")
		if not first_weight then
			-- Without this guard tonumber(nil, 16) raises on a line that has a
			-- code point field but no weight.
			return
		end
		local head_char = tonumber(first_weight, 16)
		if head_chars[head_char] then
			return
		end
		local output = {}
		for cp in cps:gmatch("%x+") do
			insert(output, escape_char(tonumber(cp, 16)))
		end
		head_chars[head_char] = concat(output)
	end
	
	-- Append one `"key": "value"` row to `output` if `head_chars` has an entry
	-- for `i`. The key is code point `i + 0x100000` written with escape_char; the
	-- value is the stored string. Does nothing when there is no entry.
	local function process_key(i, head_chars, output)
		local escaped = head_chars[i]
		if escaped then
			local key = escape_char(i + 0x100000)
			insert(output, ('\t"%s": "%s"'):format(key, escaped))
		end
	end

	-- Scan the whole collation table and return the collected rows as text,
	-- one `"key": "value"` row per line, separated by ",\n" and ordered by
	-- ascending weight. No enclosing braces are added.
	function export.headers()
		-- Leading weight -> escaped code points of the first entry that has it.
		local first_seen = {}
		for entry in ducet:gmatch("[^\n]+") do
			process_line(entry, first_seen)
		end
		-- Visit every possible 16-bit weight in order; missing ones add nothing.
		local rows = {}
		for weight = 0, 0xFFFF do
			process_key(weight, first_seen, rows)
		end
		return concat(rows, ",\n")
	end
end

do
	-- Backslash escape sequences (as written inside a Lua string literal), keyed
	-- by byte value: the control characters 0x07-0x0D, the double quote and the
	-- backslash. export.weights substitutes these for the raw byte in its output.
	local escapes = {
		[0x07] = "\\a", [0x08] = "\\b", [0x09] = "\\t",
		[0x0A] = "\\n", [0x0B] = "\\v", [0x0C] = "\\f",
		[0x0D] = "\\r", [0x22] = "\\\"", [0x5C] = "\\\\"
	}
	
	-- Removed ranges that apply to the plane currently being processed; assigned
	-- by export.weights before any call to adjust_codepoint.
	local ranges
	
	-- Shift `cp` down by the combined size of every entry of `ranges` that ends
	-- below it. A code point inside a range is not shifted for that range.
	-- `plane` is accepted but not used.
	local function adjust_codepoint(cp, plane)
		local removed = 0
		for i = 1, #ranges do
			local first, last = ranges[i][1], ranges[i][2]
			if cp > last then
				removed = removed + (last - first + 1)
			end
		end
		return cp - removed
	end
	
	-- Parse one line of the collation table and, when it is a single-code-point
	-- entry in `plane` with exactly one bracketed set of weights, write its
	-- 5-character record into `output`.
	--   line: one line of the collation table.
	--   plane: plane number to keep (code point divided by 0x10000, rounded down).
	--   output: table of single characters, filled at indices cp * 5 + 1 to
	--     cp * 5 + 5, where cp is the adjusted code point within the plane.
	-- Every other kind of line is skipped without touching `output`.
	local function process_line(line, plane, output)
		-- Get the codepoint(s), and return if not found.
		local cp = line:match("^[%x ]+%f[ ]")
		if not cp then
			return
		end
		-- Entries keyed by more than one codepoint are not handled yet (TODO).
		cp = split(cp, " ")
		if #cp > 1 then
			-- TODO
			return
		end
		-- Check this is the correct plane, and return if not. Planes range from 0x0 to 0x10 (17 in total), and each has 0x10000 characters, from U+(X)0000 to U+(X)FFFF.
		cp = tonumber(cp[1], 16)
		if floor(cp / 0x10000) ~= plane then
			return
		end
		-- Normalize codepoint by removing implicit ranges and the plane.
		cp = adjust_codepoint(cp, plane) % 0x10000
		-- Get the weights.
		local weights = {}
		for w1, w2, w3 in line:gmatch("%[[*.](%x+)%.(%x+)%.(%x+)%]") do
			insert(weights, {w1, w2, w3})
		end
		-- Only entries with exactly one set of weights are handled. Entries with
		-- several sets are still TODO; a line with none is skipped here, because
		-- otherwise `weights[1]` would be nil and indexing it below would raise.
		if #weights ~= 1 then
			-- TODO
			return
		end
		weights = weights[1]
		-- Convert each of the first two weights (ranging from 0x0000 to 0xFFFF) to 2-digit base-256 and store each digit as the corresponding character (e.g. 0xFFFD is "\255" and "\253"). The final weight can be stored as 1 digit, because it only ranges from 0x0000 to 0x001F.
		local w = {}
		for i = 1, 2 do
			local weight = tonumber(weights[i], 16)
			insert(w, char(floor(weight / 256)))
			insert(w, char(weight % 256))
		end
		insert(w, char(tonumber(weights[3], 16) % 256))
		-- Each codepoint owns a fixed 5-slot record in `output`.
		for i = 1, #w do
			output[cp * 5 + i] = w[i]
		end
	end
	
	-- Build the escaped weight data for one plane.
	--   plane: plane number; an error is raised if it is nil or false.
	-- Returns a string holding one 5-character record per adjusted code point of
	-- the plane (zero-filled where the table has no usable entry). Bytes that are
	-- not plain printable characters are written as backslash escapes.
	-- Side effect: replaces the shared `ranges` list used by adjust_codepoint.
	function export.weights(plane)
		if not plane then
			error("Please enter a plane.")
		end
		-- Keep only the implicit ranges that start in this plane.
		ranges = {}
		for i = 1, #implicit_ranges do
			local range = implicit_ranges[i]
			if floor(range[1] / 0x10000) == plane then
				ranges[#ranges + 1] = range
			end
		end
		-- Collect the raw record characters from every line of the table.
		local bytes = {}
		for line in ducet:gmatch("[^\n]+") do
			process_line(line, plane, bytes)
		end
		-- Zero-fill every slot up to the record of the plane's last code point.
		local last_cp = adjust_codepoint(plane * 0x10000 + 0xFFFF, plane) % 0x10000
		local total = 5 * (last_cp + 1)
		for i = 1, total do
			if not bytes[i] then
				bytes[i] = "\0"
			end
		end
		-- Escape from the end backwards. By the time slot i is handled, slot
		-- i + 1 already holds its final text, which is a bare digit only if the
		-- original character was a digit.
		for i = #bytes, 1, -1 do
			local b = bytes[i]:byte()
			if b > 0x7E then
				-- Always three decimal digits, so never ambiguous.
				bytes[i] = "\\" .. b
			elseif b < 0x07 or (b > 0x0D and b < 0x20) then
				-- Control byte with no short escape: use a decimal escape, padded
				-- to three digits when a literal digit follows so that digit is
				-- not read as part of the escape.
				local following = bytes[i + 1]
				if following and following:match("^%d$") then
					bytes[i] = ("\\%03d"):format(b)
				else
					bytes[i] = "\\" .. b
				end
			else
				-- Short escape if one exists, otherwise the character itself.
				bytes[i] = escapes[b] or bytes[i]
			end
		end
		return concat(bytes)
	end
end

return export