This module will transliterate Chinese language text. It is also used to transliterate Eastern Min, Jin, Mandarin, Gan, Xiang, Middle Chinese, Literary Chinese, Northern Min, Old Chinese, Wu, Cantonese, Sichuanese, and Taishanese. The module should preferably not be called directly from templates or other modules. To use it from a template, use {{xlit}}. Within a module, use Module:languages#Language:transliterate.

For testcases, see Module:zh-translit/testcases.

Functions

tr(text, lang, sc)
Transliterates a given piece of text written in the script specified by the code sc, and language specified by the code lang.
When the transliteration fails, returns nil, followed by true as a second return value.

local m_str_utils = require("Module:string utilities")
local m_utils = require("Module:utilities")

local findTemplates = require("Module:template parser").findTemplates
local get_section = m_utils.get_section
local gsub = string.gsub
local insert = table.insert
local safe_require = m_utils.safe_require
local sub = string.sub
local toNFD = mw.ustring.toNFD
local trim = mw.text.trim
local ugsub = m_str_utils.gsub
local ulen = m_str_utils.len
local ulower = m_str_utils.lower
local usub = m_str_utils.sub
local uupper = m_str_utils.upper

-- Frame of the current invocation; its preprocess method is used below to
-- expand template argument names and values before they are compared.
local frame = mw.getCurrentFrame()
-- Holds the MT table of Module:zh/data/cmn-tag; left nil until export.tr
-- first needs it.
local tag

-- langcode_to_abbr table of Module:zh/data/lect codes, indexed by language
-- code. The value for a language is compared against zh-pron argument names.
local lect_code = mw.loadData("Module:zh/data/lect codes").langcode_to_abbr

-- Table of functions returned by this module.
local export = {}

--- Build the return values that signal a failed transliteration.
-- @param lang     Language code of the text that could not be transliterated.
-- @param request  If truthy, a request category naming the language is also
--                 returned.
-- @return nil, true, and then either nil or a one-element list holding the
--         "Requests for transliteration of ... terms" category name.
local function fail(lang, request)
	-- The language object is only needed to name the request category, so it
	-- is not looked up on the plain failure path.
	if not request then
		return nil, true, nil
	end
	local langObj = require("Module:languages").getByCode(lang)
	return nil, true, {"Requests for transliteration of " .. langObj:getCanonicalName() .. " terms"}
end

--- Fetch the level-2 "Chinese" section of a page's wikitext.
-- @param title  Title of the page to read.
-- @return Whatever get_section returns for the page's wikitext, or false when
--         no title object can be created or the page has no content.
local function get_content(title)
	local page = mw.title.new(title)
	if not page then
		return false
	end
	-- getContent() yields nil when there is no such page; stop here instead of
	-- handing nil to get_section.
	local wikitext = page:getContent()
	if not wikitext then
		return false
	end
	return get_section(wikitext, "Chinese", 2)
end

-- Languages for which a comma that is not followed by a space ends a segment.
local comma_delimited = {
	cmn = true,
	wuu = true,
	yue = true,
	["zhx-tai"] = true,
}

--- Return the segment that ends just before byte position i, if i is a
-- segment boundary.
-- A boundary is the end position i_end, a "/" at i, or (for the languages in
-- comma_delimited only) a "," at i whose next byte is not a space.
-- @param readings  The full argument string being scanned.
-- @param lang      Language code.
-- @param i         Byte position being examined.
-- @param i_end     Position one past the last byte of readings.
-- @param start     Byte position at which the current segment begins.
-- @return The bytes from start to i - 1 when i is a boundary; otherwise
--         nothing.
local function get_reading(readings, lang, i, i_end, start)
	local at_boundary = false
	if i == i_end then
		at_boundary = true
	else
		local ch = sub(readings, i, i)
		if ch == "/" then
			at_boundary = true
		elseif ch == "," and comma_delimited[lang] then
			at_boundary = sub(readings, i + 1, i + 1) ~= " "
		end
	end
	if at_boundary then
		return sub(readings, start, i - 1)
	end
end

--- Reconcile the readings given in one zh-pron argument with the
-- transliteration collected so far.
-- For "ltc" and "och" the argument is taken as a whole. For other languages
-- it is scanned byte by byte and cut into segments by get_reading.
-- @param readings  Expanded value of the zh-pron argument for this language.
-- @param lang      Language code.
-- @param tr        Transliteration found so far, or nil if none yet.
-- @return The (possibly updated) transliteration, nil if none was found, or
--         false when the readings disagree with each other or with tr.
local function handle_readings(readings, lang, tr)
	if lang == "ltc" or lang == "och" then
		if tr and readings ~= tr then
			return false
		end
		return readings
	end
	local tr_orig, start, i_end = tr, 1, #readings + 1
	for i = 1, i_end do
		local reading = get_reading(readings, lang, i, i_end, start)
		if not reading then
			-- Position i is not a segment boundary.
		elseif not reading:find("=") then
			-- A plain reading: accept it when nothing has been chosen yet or
			-- when it matches the current choice (ignoring case and the "^"
			-- capitalisation marker); any other reading is a conflict.
			if (
				not tr or
				tr == reading or
				gsub(ulower(tr), "%^", "") == reading
			) then
				tr = reading
			elseif ulower(reading) ~= tr then
				return false
			end
			start = i + 1
		elseif lang == "cmn" and reading == "cap=y" and tr then
			-- "cap=y" marks the reading as capitalised. It is skipped when no
			-- reading has been accepted yet, since there is nothing to mark
			-- (concatenating onto nil would raise an error).
			local tr_cap = "^" .. tr
			if not tr_orig or tr_orig == tr_cap then
				tr = tr_cap
			end
		end
		-- A segment containing "=" does not advance `start`, so everything
		-- after it is read as part of that same segment.
	end
	return tr
end

--- Scan the templates in a piece of wikitext for readings and redirects.
-- For every zh-pron template, the first non-empty argument whose expanded
-- name equals lect_code[lang] is passed to handle_readings. For every zh-see
-- template, the expanded and trimmed first argument is appended to `see`.
-- @param content  Wikitext to scan.
-- @param lang     Language code.
-- @param see      List of page titles still to be visited; appended to here.
-- @param seen     Set of titles already visited or already queued; a title is
--                 added to it as soon as it is queued.
-- @param tr       Transliteration found so far, or nil.
-- @return The (possibly updated) transliteration, nil if none was found, or
--         false as soon as handle_readings reports a conflict.
local function iterate_content(content, lang, see, seen, tr)
	local wanted_key = lect_code[lang]
	for template, args in findTemplates(content) do
		if template == "zh-pron" then
			for k, v in pairs(args) do
				if (
					#v > 0 and
					type(k) == "string" and
					frame:preprocess(k) == wanted_key
				) then
					tr = handle_readings(frame:preprocess(v), lang, tr)
					break
				end
			end
			if tr == false then
				return tr
			end
		elseif template == "zh-see" then
			local target = args[1]
			-- A zh-see without a first argument names no page to follow.
			if target then
				target = trim(frame:preprocess(target))
				if not seen[target] then
					-- Mark the title when it is queued, so that a page
					-- referenced several times is only loaded once.
					seen[target] = true
					insert(see, target)
				end
			end
		end
	end
	return tr
end

--- Transliterate a Chinese term by reading the pronunciation recorded on its
-- own entry page.
-- The page titled `text` is loaded, the reading for `lang` is taken from its
-- zh-pron templates (following zh-see redirects when the page itself gives no
-- reading), and the reading is then formatted according to the language.
-- @param text  Term to transliterate; also used as the title of the page to read.
-- @param lang  Language code.
-- @param sc    Script code; not used by this function.
-- @return `text` unchanged when it is nil or empty; otherwise the formatted
--         transliteration followed by a single space, or the values of
--         fail(lang) when no usable reading is found.
function export.tr(text, lang, sc)
	if (not text) or text == "" then
		return text
	end
	
	-- "zh" and "lzh" are handled with the Mandarin ("cmn") reading.
	if lang == "zh" or lang == "lzh" then
		lang = "cmn"
	end
	
	-- Codes with no entry in lect_code are replaced by the full code reported
	-- by their language object.
	if not lect_code[lang] then
		lang = require("Module:languages").getByCode(lang, nil, true):getFullCode()
	end
	
	local content = get_content(text)
	if not content then
		return fail(lang)
	end
	
	-- `see` is the queue of zh-see targets; `seen` starts with the page itself.
	local see = {}
	local seen = {
		[text] = true
	}
	local tr = iterate_content(content, lang, see, seen)
	
	-- Only when the page itself gave no reading at all (nil, as opposed to a
	-- conflict, which is false) are the zh-see targets visited. The queue can
	-- grow during the walk, because iterate_content appends further targets.
	if tr == nil then
		local i, title = 1
		while i <= #see do
			title = see[i]
			content = get_content(title)
			if content then
				tr = iterate_content(content, lang, see, seen, tr)
				if tr == false then
					return fail(lang)
				end
				seen[title] = true
			end
			i = i + 1
		end
	end
	
	if not tr then
		return fail(lang)
	end
	
	if lang == "cmn" then
		tr = tr:gsub("#", "")
		-- Bytes 194-244 are the lead bytes of multi-byte UTF-8 sequences, so
		-- this branch only runs when the reading contains non-ASCII characters.
		if tr:match("[\194-\244]") then
			tag = tag or mw.loadData("Module:zh/data/cmn-tag").MT
			-- The pattern matches one byte plus any continuation bytes, i.e.
			-- one UTF-8 character at a time. Returning nothing from the
			-- callback leaves the character as it is.
			tr = tr:gsub(".[\128-\191]*", function(m)
				if m == "一" then
					return "yī"
				elseif m == "不" then
					return "bù"
				else
					-- Use the first entry listed for the character in the MT
					-- table, if there is one.
					m = tag[m] and tag[m][1]
					if m then
						-- Decomposed so that a leading vowel is a bare ASCII
						-- letter; such syllables get an apostrophe in front.
						return toNFD(m):gsub("^[aeiou]", "'%0")
					end
				end
			end)
				:gsub("^'", "") -- remove an apostrophe that the callback above put at the very start
		end
		-- "^" marks capitalisation: it is removed and the character after it
		-- is uppercased.
		tr = ugsub(tr, "%^(.)", uupper)
	elseif lang == "hak" then
		-- TODO
	elseif lang == "ltc" or lang == "och" then
		-- For these languages the reading is "n", "y", or a list of indices,
		-- one per character, into per-character data modules.
		if tr == "n" then
			return fail(lang)
		end
		local index = {}
		if tr then
			if lang == "ltc" then
				index = mw.text.split(tr, ",")
			else
				index = mw.text.split(tr, ";")
			end
		end
		for i = 1, ulen(text) do
			local module_type = lang .. "-pron"
			if lang == "och" then
				module_type = module_type .. "-ZS"
			end
			
			local data_module = safe_require("Module:zh/data/" .. module_type .. "/" .. usub(text, i, i))
			
			-- Fail when the character has no data module, or when it has more
			-- than one entry and no explicit index selects one of them.
			if not data_module or (((not index[i]) or index[i] == "y") and #data_module > 1) then
				return fail(lang)
			end
			
			if index[i] == "y" then
				index[i] = 1
			elseif index[i] then
				index[i] = tonumber(index[i])
			end
			
			-- Falls back to the first entry when there is no index or the
			-- indexed entry does not exist.
			index[i] = index[i] and data_module[index[i]] or data_module[1]
			
			if lang == "ltc" then
				-- Convert the entry to initial, final and tone, rendered
				-- through the "Zhengzhang" conversion tables.
				local data = mw.loadData("Module:ltc-pron/data")
				local initial, final, tone = require("Module:ltc-pron").infer_categories(index[i])
				tone = tone ~= "" and ("<sup>" .. tone .. "</sup>") or tone
				index[i] = data.initialConv["Zhengzhang"][initial] .. data.finalConv["Zhengzhang"][final] .. tone
			else
				index[i] = index[i][6]
			end
		end
		tr = table.concat(index, " ")
		if lang == "och" then
			tr = "*" .. tr
		end
	elseif lang == "nan" then
		-- TODO
	elseif lang == "yue" then
		-- Wrap runs of digits and hyphens in <sup> tags.
		tr = tr:gsub("[%d-]+", "<sup>%0</sup>")
	elseif lang == "zhx-sic" then
		-- Put a space between a digit or hyphen and a following letter, then
		-- wrap runs of digits and hyphens in <sup> tags.
		tr = tr
			:gsub("([%d-])(%a)", "%1 %2")
			:gsub("[%d-]+", "<sup>%0</sup>")
	elseif lang == "zhx-tai" then
		-- Wrap runs of digits and asterisks, optionally joined by one hyphen,
		-- in <sup> tags.
		tr = tr:gsub("[%d*]+%-?[%d*]*", "<sup>%0</sup>")
	elseif lang == "nan-tws" then
		-- TODO
	elseif lang == "wuu" then
		local w_pron = require("Module:wuu-pron")
		if tr:match(';') then
			--TODO
			return fail(lang)
		elseif tr:match(':') then
			-- Drop the first three bytes of the reading before formatting it.
			tr = w_pron.wugniu_format(tr:sub(4))
		else
			tr = w_pron.wugniu_format(w_pron.wikt_to_wugniu(tr))
		end
	else
		-- Every other language is romanised by its own "<lang>-pron" module.
		tr = require("Module:" .. lang .. "-pron").rom(tr)
	end
	
	-- End with a space so that adjacent parts of running text that have to be transliterated separately (e.g. due to links) are still properly separated.
	return tr .. " "
end

return export