This module provides lexicographic tools for Vietnamese language text.

Usage

In a template:

  • {{#invoke:vi|toReformedTones|xóa}} → xoá
  • {{#invoke:vi|toTraditionalTones|xoá}} → xóa
  • {{#invoke:vi|removeDiacritics|thay đổi gần đây}} → thay doi gan day
  • {{#invoke:vi|removeDiacritics|thay đổi gần đây|tones=0}} → thay dỏi gàn day
  • {{#invoke:vi|removeDiacritics|thay đổi gần đây|accents=0}} → thay dôi gân dây
  • {{#invoke:vi|removeDiacritics|thay đổi gần đây|đ=0}} → thay đoi gan đay

In another module:

viet = require "Module:vi"

t = {"an ninh", "bóng rổ", "Ả Rập", "bóng đá", "ăn", "Á Châu"}
table.sort(t, viet.comp)

causes t to hold:

Ả Rập, Á Châu, an ninh, ăn, bóng đá, bóng rổ

For best results, call toTraditionalTones() or toReformedTones() on each string before sorting them using comp(), so that all strings use the same tone-placement convention.


---Lexicographic tools for Vietnamese language text.
local m_str_utils = require("Module:string utilities")

local find = m_str_utils.find
local format = mw.ustring.format
local gmatch = m_str_utils.gmatch
local gsub = m_str_utils.gsub
local len = m_str_utils.len
local lower = m_str_utils.lower
local match = m_str_utils.match
local toNFC = mw.ustring.toNFC
local toNFD = mw.ustring.toNFD
local u = m_str_utils.char
local upper = m_str_utils.upper

local lang = require("Module:languages").getByCode("vi")

local export = {}

---Rewrites word-final "oá/oé/uý"-style tone placement as "óa/óe/úy"-style
-- (traditional) placement in every word of the given text.
-- @param text  A string, or a frame object whose first argument is the string.
-- @return The converted text.
function export.toTraditionalTones(text)
	if type(text) == "table" then
		text = text.args[1]
	end

	-- Word-final vowel pairs: reformed placement -> traditional placement.
	local reformed_to_traditional = {
		["oá"] = "óa", ["oà"] = "òa", ["oả"] = "ỏa", ["oã"] = "õa", ["oạ"] = "ọa",
		["oé"] = "óe", ["oè"] = "òe", ["oẻ"] = "ỏe", ["oẽ"] = "õe", ["oẹ"] = "ọe",
		["uý"] = "úy", ["uỳ"] = "ùy", ["uỷ"] = "ủy", ["uỹ"] = "ũy", ["uỵ"] = "ụy"
	}

	local function convert_word(word)
		-- Words of the form "qu" + y (any tone) are returned unchanged;
		-- presumably because the "u" belongs to the initial "qu" there.
		if match(word, "^qu[yýỳỷỹỵ]$") then
			return word
		end
		-- Only the last two letters of the word are looked up; pairs absent
		-- from the table are left as they are.
		local converted = gsub(word, "%a%a$", reformed_to_traditional)
		return converted
	end

	local result = gsub(text, "%a+", convert_word)
	return result
end

---Rewrites word-final "óa/óe/úy"-style tone placement as "oá/oé/uý"-style
-- (reformed) placement in every word of the given text.
-- @param text  A string, or a frame object whose first argument is the string.
-- @return The converted text.
function export.toReformedTones(text)
	if type(text) == "table" then
		text = text.args[1]
	end

	-- Word-final vowel pairs: traditional placement -> reformed placement.
	local traditional_to_reformed = {
		["óa"] = "oá", ["òa"] = "oà", ["ỏa"] = "oả", ["õa"] = "oã", ["ọa"] = "oạ",
		["óe"] = "oé", ["òe"] = "oè", ["ỏe"] = "oẻ", ["õe"] = "oẽ", ["ọe"] = "oẹ",
		["úy"] = "uý", ["ùy"] = "uỳ", ["ủy"] = "uỷ", ["ũy"] = "uỹ", ["ụy"] = "uỵ"
	}

	local function convert_word(word)
		-- Only the last two letters of the word are looked up; pairs absent
		-- from the table are left as they are.
		local converted = gsub(word, "%a%a$", traditional_to_reformed)
		return converted
	end

	local result = gsub(text, "%a+", convert_word)
	return result
end

---Lists the distinct spellings of a term under each tone-placement convention
-- (traditional first, then reformed).
-- @param main_spelling  The term, or a frame object (term in args[1], link flag in args.link).
-- @param makeLinks      If truthy, each spelling is replaced by a link built with [[Module:links]].
-- @return When called with a frame: the spellings joined with "/".
--         Otherwise: a table whose array part holds the spellings (the same
--         table also carries the unlinked spellings as string keys set to true).
function export.allSpellings(main_spelling, makeLinks)
	local frame
	if type(main_spelling) == "table" then
		frame = main_spelling
		makeLinks = frame.args.link
		main_spelling = frame.args[1]
	end

	local converters = { export.toTraditionalTones, export.toReformedTones }
	local spellings = {}
	for idx = 1, #converters do
		local candidate = converters[idx](main_spelling)
		-- String keys of the same table mark spellings that were already added.
		if not spellings[candidate] then
			spellings[#spellings + 1] = candidate
			spellings[candidate] = true
		end
	end

	if makeLinks then
		local m_links = require("Module:links") -- [[Module:links]]
		for idx = 1, #spellings do
			spellings[idx] = m_links.full_link({lang = lang, term = spellings[idx]})
		end
	end

	if frame then
		return table.concat(spellings, "/")
	end
	return spellings
end

---The combining tone marks of Vietnamese, built from their code points.
-- removeDiacritics() places this value inside a "[...]" character class to
-- strip tone marks from NFD-decomposed text.
export.combiningToneMarks = u(
	0x300,  -- combining grave accent, as in à
	0x301,  -- combining acute accent, as in á
	0x303,  -- combining tilde, as in ã
	0x309,  -- combining hook above, as in ả
	0x323   -- combining dot below, as in ạ
)

---The combining vowel-quality ("accent") marks of Vietnamese, built from
-- their code points. removeDiacritics() places this value inside a "[...]"
-- character class to strip these marks from NFD-decomposed text.
export.combiningAccentMarks = u(
	0x302,  -- combining circumflex, as in â
	0x306,  -- combining breve, as in ă
	0x31b   -- combining horn, as in ơ
)

---Removes selected classes of Vietnamese diacritics from the given text.
-- When called from Lua, the three flags are plain truthy/falsy values.
-- When called with a frame, the text is args[1] and each flag is read from a
-- named argument; a flag is on when its argument is absent or equals 1:
-- @param tones     Set to “0” to leave tone marks intact.
-- @param accents   Set to “0” to leave accent marks intact.
-- @param đ         Set to “0” to leave “Đ” and “đ” intact.
-- @return The stripped text, normalized to NFC.
function export.removeDiacritics(text, toneMarks, accentMarks, stroke)
	if type(text) == "table" then
		local args = text.args
		local function enabled(value)
			return not value or tonumber(value) == 1
		end
		text = args[1]
		toneMarks = enabled(args.tones)
		accentMarks = enabled(args.accents)
		stroke = enabled(args["đ"])
	end

	-- Decompose first so that each mark is a separate combining character.
	local stripped = toNFD(text)

	if toneMarks then
		local tone_class = "[" .. export.combiningToneMarks .. "]"
		stripped = gsub(stripped, tone_class, "")
	end

	if accentMarks then
		local accent_class = "[" .. export.combiningAccentMarks .. "]"
		stripped = gsub(stripped, accent_class, "")
	end

	if stroke then
		-- The stroked d is replaced through an explicit lookup table.
		local unstroked = {["Đ"] = "D", ["đ"] = "d"}
		stripped = gsub(stripped, "[Đđ]", unstroked)
	end

	return toNFC(stripped)
end

---Vietnamese letters for use in comp().
-- The position of a letter in this string is its sort rank: compWord() looks
-- each letter up with find() and compares the resulting positions.
-- The string is also embedded in "[...]" character classes by compWord(),
-- ruby() and classifierCategories() to recognise Vietnamese letters.
export.letters = "aAàÀảẢãÃáÁạẠăĂằẰẳẲẵẴắẮặẶâÂầẦẩẨẫẪấẤậẬbBcCdDđĐeEèÈẻẺẽẼéÉẹẸêÊềỀểỂễỄếẾệỆfFgGhHiIìÌỉỈĩĨíÍịỊjJkKlLmMnNoOòÒỏỎõÕóÓọỌôÔồỒổỔỗỖốỐộỘơƠờỜởỞỡỠớỚợỢpPqQrRsStTuUùÙủỦũŨúÚụỤưƯừỪửỬữỮứỨựỰvVwWxXyYỳỲỷỶỹỸýÝỵỴzZ"

---Compare two syllables according to Vietnamese dictionary sorting order.
-- Letters are ranked by their position in export.letters; characters that
-- are not in export.letters are skipped during the letter-by-letter pass.
-- @param word1  First syllable.
-- @param word2  Second syllable.
-- @return true if word1 sorts before word2, false otherwise.
function export.compWord(word1, word2)
	-- find() reports positions starting at 1 (or nil for no match), so a
	-- prefix match is position 1. A word never sorts before its own prefix
	-- (this also covers word1 == word2), and a proper prefix sorts first.
	if find(word1, word2, 1, true) == 1 then return false end
	if find(word2, word1, 1, true) == 1 then return true end

	local letter_class = "[" .. export.letters .. "]"
	local iter1, state1, init1 = gmatch(word1, letter_class)
	local iter2, state2, init2 = gmatch(word2, letter_class)
	while true do
		-- Step both iterators in lockstep, one recognised letter at a time.
		local c1 = iter1(state1, init1)
		local c2 = iter2(state2, init2)
		if c1 == nil or c2 == nil then break end

		local rank1 = find(export.letters, c1, 1, true)
		local rank2 = find(export.letters, c2, 1, true)
		if rank1 and rank2 and rank1 ~= rank2 then
			return rank1 < rank2
		end
	end

	-- No difference among the recognised letters: fall back to the plain
	-- string comparison.
	return word1 < word2
end

---Compare two strings according to Vietnamese dictionary sorting order.
-- Suitable as the comparator for table.sort(). The texts are compared word
-- by word (a word being a run of letters); the first pair of differing words
-- decides the result, using three increasingly strict stages:
--   1. lowercase with tone marks removed (base letters),
--   2. lowercase with tone marks,
--   3. the words exactly as written.
-- @param text1  First string.
-- @param text2  Second string.
-- @return true if text1 sorts before text2, false otherwise.
function export.comp(text1, text2)
	if text1 == text2 then return false end

	local iter1, state1, init1 = gmatch(text1, "%a+")
	local iter2, state2, init2 = gmatch(text2, "%a+")
	while true do
		local word1 = iter1(state1, init1)
		local word2 = iter2(state2, init2)

		-- Both texts ran out of words together: they differ only outside
		-- their words, so defer to the plain comparison below. Returning
		-- true here would make comp(a, b) and comp(b, a) both true, which
		-- table.sort() rejects as an invalid order function.
		if word1 == nil and word2 == nil then break end

		-- The text with fewer words sorts first.
		if word1 == nil then return true end
		if word2 == nil then return false end

		if word1 ~= word2 then
			local lower1 = lower(word1)
			local lower2 = lower(word2)
			local noTones1 = export.removeDiacritics(lower1, true, false, false)
			local noTones2 = export.removeDiacritics(lower2, true, false, false)

			-- Compare base letters.
			if noTones1 ~= noTones2 then
				return export.compWord(noTones1, noTones2)
			end

			-- Compare letters case-insensitively.
			if lower1 ~= lower2 then
				return export.compWord(lower1, lower2)
			end

			-- Compare letters including tones and case.
			return export.compWord(word1, word2)
		end
	end

	return text1 < text2
end

-- Module-level rendering mode read by export.ruby(): the string "link" makes
-- both the character and its reading links, "nocolor" renders the character
-- in black without links, and any other value (such as the default empty
-- table) selects the plain ruby markup.
-- export.readings() sets it temporarily around its calls to export.ruby()
-- for phiên thiết annotations and resets it to {} afterwards.
local pruby = {}

---Short citation text for each Hán tự reference abbreviation.
-- export.createRefTag() looks abbreviations up here; anything not listed is
-- used verbatim.
---[[Wiktionary:Beer parlour/2018/December#References for Vietnamese readings listed under Template:vi-readings]]
export.refAbbreviations = {
	tvctdhv = "Trần (1999)",
	hvttd = "Nguyễn (1974)",
	vntd = "Văn Mới (1954)",
	tchvtd = "Thiều Chửu (1942)",
	tdcndg = "Nguyễn (2014)",
	tdcntd = "Nguyễn et al. (2009)",
	gdhn = "Trần (2004)",
	dtdcn = "Vũ (1998)",
	btcn = "Hồ (1976)",
	bonet = "Bonet (1899)",
	genibrel = "Génibrel (1898)",
	taberd = "Taberd & Pigneau de Béhaine (1838)",
}

---Builds a <ref> extension tag whose content is a [[Template:vi-ref]] call.
-- @param ref  A key of export.refAbbreviations, or free citation text. It is
--             also used, unexpanded, as the name attribute of the tag.
-- @return The result of frame:extensionTag() for the current frame.
function export.createRefTag(ref)
	-- Expand a known abbreviation; otherwise cite the given text as is.
	local citation = export.refAbbreviations[ref]
	if not citation then
		citation = ref
	end

	local frame = mw.getCurrentFrame()
	local wikitext = format("{{vi-ref|%s.}}", citation)
	return frame:extensionTag{
		name = "ref",
		args = { name = ref },
		content = wikitext,
	}
end

---[[Template:vi-readings]]
-- Renders one line per reading style (Hán Việt, Nôm, Hán Nôm) for the Han
-- character that is the current page title, with categories appended.
-- When the first argument is a frame, values come from the parent frame's
-- arguments: hanviet/hv, nom/n, rs/sort, phienthiet/phth/fanqie and
-- reading/readings.
-- @param hanviet     Comma-separated Hán Việt readings, or a frame object.
-- @param nom         Comma-separated Nôm readings.
-- @param rs          Category sort key; defaults to the page title.
-- @param phienthiet  Comma-separated fanqie spellings, matched by position to
--                    the Hán Việt readings; each is "<characters> <readings>".
-- @param reading     Comma-separated unconfirmed readings.
-- Each reading may be followed by "-" and a ";"-separated list of reference
-- abbreviations, which become <ref> tags.
-- @return The generated wikitext lines joined with newlines.
function export.readings(hanviet, nom, rs, phienthiet, reading)
	local pagename = mw.title.getCurrentTitle().text
	if type(hanviet) == "table" then
		local args = hanviet:getParent().args
		hanviet = args.hanviet or args.hv
		nom = args.nom or args.n
		rs = args.rs or args.sort
		phienthiet = args.phienthiet or args.phth or args.fanqie
		reading = args.reading or args.readings
	end

	-- Splits a comma-separated value; a missing value is passed through.
	local function split_list(csv)
		return csv and mw.text.split(csv, "%s*,%s*")
	end

	local styles = {
		{
			link = "Hán Việt",
			cat = "Vietnamese Chữ Hán",
			list = split_list(hanviet),
			phienthiet = split_list(phienthiet)
		},
		{
			link = "chữ Nôm|Nôm",
			cat = "Vietnamese Nom",
			list = split_list(nom),
		},
		{
			link = "Hán Nôm",
			cat = "Vietnamese Han characters with unconfirmed readings",
			list = split_list(reading)
		},
	}

	local lines = {}
	for _, style in ipairs(styles) do
		local items = style.list
		-- Skip styles with no list or whose first entry is empty.
		if items and #items > 0 and #items[1] > 0 then
			-- Entries are kept in the order given (no sorting is applied).
			for j, item in ipairs(items) do
				-- Separate "reading - refs" into its two halves, if present.
				local refs
				local base, ref_part = match(item, "(.-)%s*%-%s*(.+)")
				if base then
					item, refs = base, ref_part
				end

				items[j] = table.concat(export.allSpellings(item, true), "/")

				-- Linking of "切" to "fanqie" for English explanation
				local fanqie = style.phienthiet and style.phienthiet[j]
				if fanqie then
					-- pruby selects the markup variant used by export.ruby().
					pruby = "link"
					local ruby = export.ruby(match(mw.text.trim(fanqie),
						"(%a+) +(.+)"))
					pruby = {}
					if ruby then
						pruby = "nocolor"
						local suffix = export.ruby("切", "thiết")
						pruby = {}
						items[j] = format("%s (%s[[fanqie#English|%s]])",
							items[j], ruby, suffix)
					end
				end

				-- References
				if refs then
					for abbreviation in mw.text.gsplit(refs, "%s*;%s*") do
						items[j] = items[j] .. export.createRefTag(abbreviation)
					end
				end
			end

			if #items > 0 then
				local sortkey = rs or pagename
				local joined = table.concat(items, ", ")
				table.insert(lines, format("<span class='Hani' lang='vi' style='font-size: 135%%;'>%s</span>: '''[[%s]]''' readings: %s[[Category:%s|%s]] [[Category:Vietnamese lemmas]] [[Category:Vietnamese Han characters]]</br>",
					pagename, style.link, joined, style.cat, sortkey))
			end
		end
	end

	return table.concat(lines, "\n")
end

---[[Template:vi-ruby]]
-- Wraps each non-ASCII letter of `characters` in <ruby> markup annotated with
-- the reading at the same position in `readings`.
-- When the first argument is a frame, values come from the parent frame's
-- arguments: 1 (characters), 2 (readings), mark (defaults to the page title)
-- and alts or ids (whitespace-separated).
-- @param characters  The text to annotate, or a frame object.
-- @param readings    Readings separated by anything that is not a letter of
--                    export.letters. If nil/false, `characters` is returned
--                    unchanged.
-- @param mark        Optional character to wrap, with its reading, in <mark>.
-- @param alts        Optional list of replacements substituted, in order, for
--                    each "*" in `characters`.
-- The markup variant depends on the module-level `pruby` value ("link",
-- "nocolor", or anything else for the plain form).
-- @return The annotated text wrapped in a lang='vi' span.
function export.ruby(characters, readings, mark, alts)
	if type(characters) == "table" then
		local args = characters:getParent().args
		characters, readings, mark, alts =
			args[1] or "",
			args[2] or "",
			args.mark or mw.title.getCurrentTitle().text,
			((args.alts and mw.text.split(args.alts, "%s+")) or
				(args.ids and mw.text.split(args.ids, "%s+")) or {})
	end

	if not readings then
		return characters
	end

	-- Lua callers may omit alts; without this default a "*" in the text
	-- would raise an error when indexing nil.
	alts = alts or {}

	readings = mw.text.split(readings, "[^" .. export.letters .. "]+")

	local result = {}
	local character_idx = 1
	local alt_idx = 1
	for character in gmatch(characters, ".") do
		local is_alt = false
		if character == "*" and alts[alt_idx] then
			character = alts[alt_idx]
			is_alt = true
			alt_idx = alt_idx + 1
		end
		-- Annotate substituted alternatives, and letters that are not a
		-- single ASCII alphanumeric (the byte-based %w test fails on them).
		if is_alt or (match(character, "^%a$") and not character:match("^%w$")) then
			local reading = readings[character_idx]
			if mark and character == mark then
				character = format("<mark>%s</mark>", character)
				reading = format("<mark>%s</mark>", reading)
			end
			if pruby == 'link' then
				-- The opening span previously read
				-- "<span class='Hani'; span style=...>", which is malformed.
				character = format(
					"<ruby><rb><span class='Hani' style='font-size: 100%%'>[[%s#Vietnamese|%s]]</span></rb><rp>(</rp><rt><span style='padding: 0 0.25em; font-size: 135%%;'>[[%s#Vietnamese|%s]]</span></rt><rp>)</rp></ruby>",
					character, character, reading, reading)
			elseif pruby == 'nocolor' then
				character = format(
					"<ruby><rb><span class='Hani' style='color:#000000;'>%s</span></rb><rp>(</rp><rt><span style='padding: 0 0.25em; font-size: 125%%;'>%s</span></rt><rp>)</rp></ruby>",
					character, reading)
			else
				character = format(
					"<ruby><rb><span class='Hani'>%s</span></rb><rp>(</rp><rt><span style='padding: 0 0.25em;'>%s</span></rt><rp>)</rp></ruby>",
					character, reading)
			end
			character_idx = character_idx + 1
		end
		table.insert(result, character)
	end
	return format("<span lang='vi' style='font-size: 135%%;'>%s</span>", table.concat(result))
end

---Builds a floating table with one linked cell per Han character found in
-- the current page title.
-- @return The HTML/wikitext of the table.
function export.hantutab()
	local hani_chars = require("Module:scripts").getByCode("Hani"):getCharacters()
	local page_title = mw.title.getCurrentTitle().text

	-- Keep only the characters that belong to the Hani script.
	local hantu = gsub(page_title, '[^' .. hani_chars .. ']', '')

	-- One cell per remaining character, each linking to its Vietnamese section.
	local cells = gsub(hantu, '(.)', '<td style="padding:0.5em;">[[%1#Vietnamese|%1]]</td>')

	return table.concat({
		'<table class="floatright wikitable" style="text-align:center; font-size:small;"><tr><th colspan="',
		len(hantu),
		'" style="font-weight:normal;">[[Hán Nôm|chữ Hán Nôm]] in this term</th></tr><tr lang="vi" class="Hani" style="font-size:2em; background:white; line-height:1em;">',
		cells,
		'</tr></table>',
	})
end

---Returns the categories indicated by the given wikitext.
-- Tags (<...>) are stripped from frame.args[1]; every remaining run of
-- letters from export.letters is treated as a classifier, except the words
-- listed as ignored below.
-- @param frame  Frame object whose first argument is the wikitext.
-- @return The concatenated category links.
function export.classifierCategories(frame)
	-- Words that come from link-template syntax rather than from classifiers.
	local ignored = {
		["l"] = true,
		["vi"] = true,
		["vi-l"] = true,
		["Vietnamese"] = true,
	}

	local stripped = gsub(frame.args[1], "<[^>]->", "")
	local categories = {}
	for classifier in gmatch(stripped, "[" .. export.letters .. "]+") do
		if not ignored[classifier] then
			categories[#categories + 1] = format(
				"[[Category:Vietnamese nouns classified by %s]]", classifier)
		end
	end
	return table.concat(categories)
end

---Generates the wikitext skeleton of a new Vietnamese entry from the parent
-- frame's arguments.
-- Positional arguments: 1/2, 3/4 and 5/6 are (part of speech, definition)
-- pairs; a missing definition becomes {{rfdef|vi}} and a definition given
-- without a part of speech defaults to a noun.
-- Named arguments read here: e (etymology, "-" for none, leading "<" becomes
-- "From"), h (Sino-Vietnamese etymology), head, cat, cls, rdp, nom, pic,
-- picc, wp, and the list families alt/syn/ant/der/also (alt, alt2, alt3, …).
-- @param frame  Frame object; arguments are taken from frame:getParent().
-- @return The generated wikitext.
function export.new(frame)
	local title = mw.title.getCurrentTitle().subpageText
	local args = frame:getParent().args
	local pos = args[1] or ""
	local def = args[2] or "{{rfdef|vi}}"
	local pos2 = args[3] or (args[4] and "" or false)
	local def2 = args[4] or "{{rfdef|vi}}"
	local pos3 = args[5] or (args[6] and "" or false)
	local def3 = args[6] or "{{rfdef|vi}}"
	local etym = args["e"] or false
	local head = args["head"] or false
	local cat = args["cat"] or false
	local cls = args["cls"] or false
	local rdp = args["rdp"] or false
	local nom = args["nom"] or false
	local pic = args["pic"] or false
	local picc = args["picc"] or false

	-- Turn each character of nom into a link, separated by ", ".
	if nom then
		nom = gsub(nom, "(.)", "[[%1]], ")
		nom = gsub(nom, ", $", "")
	end

	if args["h"] then
		etym = "{{vi-etym-sino|" .. args["h"] .. "}}."
	end
	-- Multi-word titles without an explicit etymology get a compound template.
	if not etym and match(title, " ") then
		etym = "{{com|vi"
		for word in mw.text.gsplit(title, " ") do
			etym = etym .. "|" .. word
		end
		etym = etym .. "}}."
	end
	if etym == "-" then etym = false end
	if etym then etym = gsub(etym, "^%<", "From") end

	-- Part-of-speech abbreviation -> section heading.
	local pos_title = {
		[""] = "Noun", ["n"] = "Noun", ["pn"] = "Proper noun", ["propn"] = "Proper noun", ["pron"] = "Pronoun",
		["v"] = "Verb", ["vf"] = "Verb", ["a"] = "Adjective", ["adj"] = "Adjective", ["adv"] = "Adverb",
		["prep"] = "Preposition", ["postp"] = "Postposition", ["conj"] = "Conjunction",
		["part"] = "Particle", ["suf"] = "Suffix",
		["prov"] = "Proverb", ["id"] = "Idiom", ["ph"] = "Phrase", ["intj"] = "Interjection", ["interj"] = "Interjection",
		["cl"] = "Classifier", ["cls"] = "Classifier", ["num"] = "Numeral", ["abb"] = "Abbreviation", ["deter"] = "Determiner"
	}

	-- Part-of-speech abbreviation -> suffix of the "vi-" headword template.
	local pos_head = {
		[""] = "noun", ["n"] = "noun", ["pn"] = "proper noun", ["propn"] = "proper noun", ["v"] = "verb", ["vf"] = "verb form", ["a"] = "adj",
		["postp"] = "post", ["conj"] = "conj", ["part"] = "particle", ["pron"] = "pronoun",
		["prov"] = "proverb", ["id"] = "idiom", ["ph"] = "phrase", ["intj"] = "interj",
		["abb"] = "abbr", ["cl"] = "classifier", ["deter"] = "det"
	}

	-- Heading for a part of speech; unknown abbreviations are capitalised.
	local function genTitle(text)
		-- mw.ustring.sub is used explicitly: no `sub` is defined in this
		-- module, so the bare name would be a nil global.
		return pos_title[text] or
			upper(mw.ustring.sub(text, 1, 1)) .. mw.ustring.sub(text, 2, -1)
	end

	-- Headword template suffix; unknown abbreviations are used as is.
	local function genHead(text)
		return pos_head[text] or text
	end

	-- Section listing args[class], args[class .. 2], args[class .. 3], …:
	-- a single {{col3}} for "der", otherwise one {{l}} bullet per term.
	local function other(class, section_title, list_args)
		local code = ""
		if class == "der" and list_args[class] then
			code = code .. "\n\n===" .. section_title .. "===\n{{col3|vi|" .. list_args[class]
			local i = 2
			while list_args[class .. i] do
				code = code .. "|" .. list_args[class .. i]
				i = i + 1
			end
			code = code .. "}}"
		elseif list_args[class] then
			code = code .. "\n\n===" .. section_title .. "===\n* {{l|vi|" .. list_args[class] .. "}}"
			local i = 2
			while list_args[class .. i] do
				code = code .. "\n* {{l|vi|" .. list_args[class .. i] .. "}}"
				i = i + 1
			end
		end
		return code
	end

	-- One part-of-speech section: heading, headword template, definition.
	-- The cls/rdp parameters are decided by this section's own part of speech.
	local function genSection(section_pos, definition)
		local head_name = genHead(section_pos)
		local code = "\n\n===" .. genTitle(section_pos) .. "===\n{{vi-" .. head_name
		if head then
			code = code .. "|head=" .. head
		end
		if head_name == "noun" and cls then
			code = code .. "|cls=" .. cls
		end
		if (head_name == "adj" or head_name == "verb" or head_name == "adv") and rdp then
			code = code .. "|rdp=" .. rdp
		end
		if nom then
			code = code .. "|" .. nom
		end
		return code .. "}}\n\n# " .. definition
	end

	local parts = { "==Vietnamese==" }

	if args["wp"] then
		parts[#parts + 1] = "\n{{wikipedia|lang=vi" ..
			(args["wp"] == "y" and "" or "|" .. args["wp"]) .. "}}"
	end
	if pic then
		-- The caption defaults to the title with an initial capital and a period.
		parts[#parts + 1] = "\n[[File:" .. pic .. "|thumb|" ..
			(picc or gsub(title, '^%l', upper) .. ".") .. "]]"
	end
	parts[#parts + 1] = other("alt", "Alternative forms", args)

	if etym then
		parts[#parts + 1] = "\n\n===Etymology===\n" .. etym
	end

	parts[#parts + 1] = "\n\n===Pronunciation===\n{{vi-IPA}}"
	parts[#parts + 1] = genSection(pos, def)

	parts[#parts + 1] = other("syn", "=Synonyms=", args)
	parts[#parts + 1] = other("ant", "=Antonyms=", args)
	parts[#parts + 1] = other("der", "=Derived terms=", args)
	parts[#parts + 1] = other("also", "=See also=", args)

	if pos2 then
		parts[#parts + 1] = genSection(pos2, def2)
	end
	if pos3 then
		parts[#parts + 1] = genSection(pos3, def3)
	end

	if cat then
		parts[#parts + 1] = "\n\n{{C|vi|" .. cat .. "}}"
	end

	return table.concat(parts)
end

---Generates a {{vi-der}} call listing terms derived from the current page.
-- The list combines the parent frame's positional arguments with every entry
-- of [[Module:vi/vocab-list]] that contains the page title as a whole
-- space-delimited unit, then removes duplicates and sorts the result with
-- [[Module:vi-sortkey]].
-- @param frame  Frame object; arguments are taken from frame:getParent().
-- @return The wikitext of the {{vi-der}} call.
function export.new_der(frame)
	local title = mw.title.getCurrentTitle().subpageText
	local data_module = require("Module:vi/vocab-list")
	local args = frame:getParent().args
	local result = {}
	for _, arg in ipairs(args) do
		table.insert(result, arg)
	end

	-- The title is literal text, so pattern magic characters in it (such as
	-- the hyphen) must be escaped before it is embedded in a pattern.
	local escaped_title = gsub(title, "[%^%$%(%)%%%.%[%]%*%+%-%?]", "%%%0")
	local glued_after = escaped_title .. "[^ ]"
	local glued_before = "[^ ]" .. escaped_title

	for _, word in ipairs(data_module) do
		-- Keep longer terms that contain the title, unless some occurrence of
		-- the title is directly attached to a non-space character.
		if word ~= title
			and find(word, title, 1, true)
			and not find(word, glued_after)
			and not find(word, glued_before) then
			table.insert(result, word)
		end
	end

	-- Remove duplicates, keeping the first occurrence of each term.
	local hash, res = {}, {}
	for _, element in ipairs(result) do
		if not hash[element] then
			res[#res + 1] = element
			hash[element] = true
		end
	end

	local vi_sort_module = require("Module:vi-sortkey")
	local makeSortKey = require("Module:fun").memoize(vi_sort_module.makeSortKey)
	table.sort(res, function(term1, term2) return makeSortKey(term1) < makeSortKey(term2) end)

	return "{{vi-der|" .. table.concat(res, "|") .. "}}"
end

---Renders the parent frame's positional arguments as a column table of
-- linked Vietnamese terms.
-- Each positional argument is "term" or "term:gloss"; a ":tl" marker anywhere
-- in it is removed and adds a "từ láy" note after the link.
-- Named arguments: unfold (any value: do not collapse the table) and title
-- (header text, default "Derived terms").
-- @param frame  Frame object; arguments are taken from frame:getParent().
-- @return The result of create_table() from [[Module:columns]].
function export.derived(frame)
	local tu_lay_note = "<span style=\"padding-left:4px; padding-right:4px\">&nbsp;</span><span style=\"background:#ffffe0\">(''[[từ láy]]'')</span>"
	local m_columns = require("Module:columns")
	local m_links = require("Module:links")
	local args = frame:getParent().args
	local result = {}
	local length = 0

	-- Locals, so that nothing leaks into the global environment.
	local unfold = args["unfold"] and true or false
	local title_text = args["title"] or "Derived terms"

	for _, word in ipairs(args) do
		local is_tu_lay
		word, is_tu_lay = gsub(word, "%:tl", "")
		local word_parts = mw.text.split((gsub(word, "\n", "")), ":")
		local link = m_links.full_link({
			lang = lang,
			term = word_parts[1],
			gloss = word_parts[2],
		})
		if is_tu_lay > 0 then
			link = link .. tu_lay_note
		end
		table.insert(result, link)

		-- Longest argument (term plus any gloss) decides the column count.
		length = math.max(len(word), length)
	end

	-- Collapse only long lists that were not explicitly unfolded. The former
	-- "(cond) and false or true" form evaluated to true in every case.
	-- NOTE(review): assumes the fifth argument of create_table is the
	-- collapse flag; confirm against Module:columns.
	local collapse = not (unfold or #result < 7)

	return m_columns.create_table(
		(length > 15 and 2 or 3),
		result,
		1,
		"#F5F5FF",
		collapse,
		"Derived terms",
		title_text,
		nil,
		nil,
		lang
	)
end

return export