Module:User:Theknightwho/UCA
- This module lacks a documentation subpage. You may create it.
- Useful links: root page • root page’s subpages • links • transclusions • testcases • user page • user talk page • userspace
This is a private module sandbox of Theknightwho, for their own experimentation. Items in this module may be added and removed at Theknightwho's discretion; do not rely on this module's stability.
local char = string.char
local concat = table.concat
local floor = math.floor
local insert = table.insert
local split = mw.text.split
local tonumber = tonumber
local u = mw.ustring.char
-- Gaps of more than 0x1000 codepoints between consecutive entries of
-- UnicodeData.txt. Each gap is stored as a pair {hex(last), hex(cp - 1)},
-- where `last` is the previous listed codepoint and `cp` the next one.
local skipped_ranges = {}
do
	-- NOTE(review): Scribunto documents the luabit hex helper under the name
	-- "luabit.hex"; confirm that plain "hex" actually resolves here.
	local hex = require("hex").to_hex
	local udata = require("Module:User:Theknightwho/UnicodeData.txt")
	local last = 0
	for line in udata:gmatch("[^\n]+") do
		-- Only lines that begin with hexadecimal digits are considered.
		local cp = line:match("^%x+")
		if cp then
			cp = tonumber(cp, 16)
			if cp - last > 0x1000 then
				insert(skipped_ranges, {hex(last), hex(cp - 1)})
			end
			last = cp
		end
	end
	-- NOTE(review): a `return` at this level leaves the enclosing chunk, so
	-- the code after this block is not executed while this line is present.
	-- Presumably a temporary debugging aid; confirm before relying on `export`.
	return skipped_ranges
end
local ducet = require("Module:User:Theknightwho/UCA/DUCET")
-- Inclusive {first, last} codepoint ranges that are cut out of the codepoint
-- space when weights are packed. export.weights selects the ranges whose first
-- codepoint lies in the requested plane, and adjust_codepoint subtracts the
-- size of every selected range that ends below a given codepoint.
-- The trailing comment on each entry names what the range covers.
local implicit_ranges = {
{0x3400, 0x4DBF}, -- CJK Unified Ideographs Extension A
{0x4E00, 0x9FFF}, -- CJK Unified Ideographs
{0xAC00, 0xD7AF}, -- Hangul Syllables
{0xD800, 0xF8FF}, -- Surrogates, Private Use Area
{0x12550, 0x12F8F}, -- Unassigned
{0x13460, 0x143FF}, -- Unassigned
{0x14680, 0x167FF}, -- Unassigned
{0x17000, 0x1AFEF}, -- Tangut, Tangut Components, Khitan Small Script, Tangut Supplement, Unassigned
{0x1B170, 0x1BBFF}, -- Nushu, Unassigned
{0x1BCB0, 0x1CEFF}, -- Unassigned
{0x1DAB0, 0x1DEFF}, -- Unassigned
{0x20000, 0x2A6DF}, -- CJK Unified Ideographs Extension B
{0x2A700, 0x2B73F}, -- CJK Unified Ideographs Extension C
{0x2B740, 0x2B81F}, -- CJK Unified Ideographs Extension D
{0x2B820, 0x2CEAF}, -- CJK Unified Ideographs Extension E
{0x2CEB0, 0x2EBEF}, -- CJK Unified Ideographs Extension F
{0x2EBF0, 0x2EE5F}, -- CJK Unified Ideographs Extension I
{0x30000, 0x3134F}, -- CJK Unified Ideographs Extension G
{0x31350, 0x323AF}, -- CJK Unified Ideographs Extension H
}
local export = {}
do
-- Renders a codepoint as lowercase hexadecimal, zero-padded to at least four
-- digits (longer values are not truncated).
local function to_hex(cp)
	local digits = string.format("%04x", cp)
	return digits
end
-- Builds the "\uXXXX" escape for a codepoint. Codepoints at or above 0x10000
-- are written as two such escapes: a high surrogate followed by a low one.
local function escape_char(cp)
	if cp >= 0x10000 then
		local high = floor((cp - 0x10000) / 0x400) + 0xD800
		-- 0x10000 is a multiple of 0x400, so the low ten bits can be taken
		-- from the codepoint directly.
		local low = cp % 0x400 + 0xDC00
		return "\\u" .. to_hex(high) .. "\\u" .. to_hex(low)
	end
	return "\\u" .. to_hex(cp)
end
-- Records the first codepoint sequence seen for each leading weight.
--
-- `line` is one line of the collation table: it must start with one or more
-- space-separated hexadecimal codepoints. The first "[." or "[*" group on the
-- line supplies the key (its first hexadecimal field, parsed as a number).
-- `head_chars` maps that key to the escaped codepoint sequence; an existing
-- entry is never overwritten.
--
-- Lines without leading codepoints, or without any weight group, are ignored.
local function process_line(line, head_chars)
	local cps = line:match("^[%x ]+%f[ ]")
	if not cps then
		return
	end
	-- Without this guard a line that has codepoints but no weight group would
	-- make tonumber(nil, 16) raise.
	local primary = line:match("%[[*.](%x+)")
	if not primary then
		return
	end
	local head_char = tonumber(primary, 16)
	if head_chars[head_char] then
		return
	end
	local output = {}
	for cp in cps:gmatch("%x+") do
		insert(output, escape_char(tonumber(cp, 16)))
	end
	head_chars[head_char] = concat(output)
end
-- Appends one tab-indented `"key": "value"` line to `output` for key `i`, if
-- `head_chars` has an entry for it. The key is written as the escape of the
-- codepoint i + 0x100000.
local function process_key(i, head_chars, output)
	local value = head_chars[i]
	if value then
		local key = escape_char(i + 0x100000)
		insert(output, "\t\"" .. key .. "\": \"" .. value .. "\"")
	end
end
-- Scans every line of the collation table and returns the collected
-- `"key": "value"` lines for keys 0 through 65535, in ascending key order,
-- joined with ",\n".
function export.headers()
	local first_seen = {}
	for entry in ducet:gmatch("[^\n]+") do
		process_line(entry, first_seen)
	end
	local lines = {}
	for key = 0, 0xFFFF do
		process_key(key, first_seen, lines)
	end
	return concat(lines, ",\n")
end
end
do
-- Byte values that are written as a backslash escape sequence instead of as
-- the raw byte or a decimal escape.
local escapes = {
	[0x07] = "\\a",
	[0x08] = "\\b",
	[0x09] = "\\t",
	[0x0A] = "\\n",
	[0x0B] = "\\v",
	[0x0C] = "\\f",
	[0x0D] = "\\r",
	[0x22] = "\\\"", -- double quote
	[0x5C] = "\\\\", -- backslash
}
-- Ranges removed for the plane currently being processed; assigned by
-- export.weights and read by adjust_codepoint.
local ranges
-- Shifts a codepoint down by the combined size of every entry in `ranges`
-- that ends strictly below it. Ranges that contain the codepoint or lie above
-- it have no effect. The `plane` argument is accepted but not used.
local function adjust_codepoint(cp, plane)
	local removed = 0
	for i = 1, #ranges do
		local first, last = ranges[i][1], ranges[i][2]
		if last < cp then
			removed = removed + (last - first + 1)
		end
	end
	return cp - removed
end
-- Packs the weights of one collation-table line into `output`.
--
-- `line` must start with a single hexadecimal codepoint that belongs to
-- `plane`, and must carry exactly one "[.w1.w2.w3]" / "[*w1.w2.w3]" group.
-- Anything else (comments, multi-codepoint entries, other planes, lines with
-- no group or more than one group) is skipped without touching `output`.
--
-- For an accepted line, five one-byte strings are stored at
-- output[cp * 5 + 1] .. output[cp * 5 + 5], where `cp` is the adjusted
-- codepoint modulo 0x10000.
local function process_line(line, plane, output)
	-- Get the codepoint(s), and return if not found.
	local cp = line:match("^[%x ]+%f[ ]")
	if not cp then
		return
	end
	-- Entries with more than one codepoint are not handled yet (TODO).
	cp = split(cp, " ")
	if #cp > 1 then
		-- TODO
		return
	end
	-- Check this is the correct plane, and return if not. Planes range from 0x0 to 0x10 (17 in total), and each has 0x10000 characters, from U+(X)0000 to U+(X)FFFF.
	cp = tonumber(cp[1], 16)
	if floor(cp / 0x10000) ~= plane then
		return
	end
	-- Normalize codepoint by removing implicit ranges and the plane.
	cp = adjust_codepoint(cp, plane) % 0x10000
	-- Get the weights.
	local weights = {}
	for w1, w2, w3 in line:gmatch("%[[*.](%x+)%.(%x+)%.(%x+)%]") do
		insert(weights, {w1, w2, w3})
	end
	-- Entries with more than one set of weights are not handled yet (TODO).
	-- A line with no weights at all is skipped as well: indexing the missing
	-- set below would otherwise raise.
	if #weights ~= 1 then
		-- TODO
		return
	end
	weights = weights[1]
	-- Convert each of the first two weights (ranging from 0x0000 to 0xFFFF) to 2-digit base-256 and store each digit as the corresponding character (e.g. 0xFFFD is "\255" and "\253"). The final weight can be stored as 1 digit, because it only ranges from 0x0000 to 0x001F.
	local w = {}
	for i = 1, 2 do
		local weight = tonumber(weights[i], 16)
		insert(w, char(floor(weight / 256)))
		insert(w, char(weight % 256))
	end
	insert(w, char(tonumber(weights[3], 16) % 256))
	for i = 1, #w do
		output[cp * 5 + i] = w[i]
	end
end
-- Builds the packed weight data for one plane and returns it as text in which
-- non-printable bytes, double quotes and backslashes are written as escape
-- sequences.
--
-- `plane` is required; an error is raised when it is nil or false.
function export.weights(plane)
	if not plane then
		error("Please enter a plane.")
	end
	-- Keep only the implicit ranges whose first codepoint is in this plane.
	ranges = {}
	for i = 1, #implicit_ranges do
		local range = implicit_ranges[i]
		if floor(range[1] / 0x10000) == plane then
			insert(ranges, range)
		end
	end
	-- Collect the weight bytes from every line of the collation table.
	local output = {}
	for entry in ducet:gmatch("[^\n]+") do
		process_line(entry, plane, output)
	end
	-- Every slot up to the last adjusted codepoint of the plane must exist;
	-- slots that received no weights become zero bytes.
	local highest = adjust_codepoint(plane * 0x10000 + 0xFFFF, plane) % 0x10000
	for i = 1, 5 * (highest + 1) do
		if not output[i] then
			output[i] = "\0"
		end
	end
	-- Escape from the end backwards, so that output[i + 1] has already been
	-- rewritten when slot i is examined: only a slot that is still a bare
	-- digit can be mistaken for part of a preceding decimal escape.
	for i = #output, 1, -1 do
		local entry = output[i]
		local byte = entry:byte()
		if byte > 0x7E then
			-- Always three decimal digits, so no padding is needed.
			output[i] = "\\" .. byte
		elseif byte >= 0x20 or (byte >= 0x07 and byte <= 0x0D) then
			-- Printable bytes and the control characters with a named escape.
			output[i] = escapes[byte] or entry
		else
			-- Remaining control characters: decimal escape, padded to three
			-- digits when a digit follows.
			local code = byte
			local following = output[i + 1]
			if following and following:match("^%d$") then
				code = ("%03d"):format(byte)
			end
			output[i] = "\\" .. code
		end
	end
	return concat(output)
end
end
return export