Module:urk-common


-- Table of everything this module exposes.
local export = {}

-- Local aliases for the mw.ustring functions (Scribunto's Unicode-aware
-- string library), so indices and lengths below count characters rather
-- than bytes.
local find = mw.ustring.find
local len = mw.ustring.len
local match = mw.ustring.match
local sub = mw.ustring.sub

-- Pattern describing one syllable. It has six captures, in order
-- (syllabise receives them as v_pref, i, m, v_suf, f_pref, f_suf):
--   1. optional preposed vowel sign: one of เ แ โ
--   2. one initial consonant, optionally followed by ฺ
--   3. optional ร or ล
--   4. optional single vowel sign from the bracketed class, optionally
--      followed by ว
--   5. optional ย or ะ
--   6. optional single final character from the last bracketed class
-- Only capture 2 is mandatory; every other capture may be the empty string.
-- The pattern itself carries no "^" anchor.
-- NOTE(review): ึ appears to be listed twice in the class of capture 4;
-- harmless for matching, but possibly a typo for another sign -- confirm.
export.syllable_pattern = "([เแโ]?)([กคงจชซฌญดตทนบปพฟมยรลวอฮ]ฺ?)([รล]?)([อาัิีึึืุู]?ว?)([ยะ]?)([กงจดนบวมลฮํ]?)"

-- Tokenise an entry into its syllables.
--
-- Walks `entry` left to right. At each position it tries to read one
-- syllable with export.syllable_pattern; any character at which no syllable
-- starts (a non-Thai character such as a space, or a Thai character that
-- cannot begin a syllable) is emitted unchanged as a one-character token.
--
-- @param entry            string to split
-- @param perform_respell  when truthy, syllables are rewritten with their
--                         default vowel / final made explicit (see the rules
--                         in the "perform respellings" section below)
-- @return array of strings, one per syllable or passthrough character
function export.syllabise(entry, perform_respell)
	local syllables = {}
	local entry_len = len(entry)
	-- Anchor the pattern so a syllable is only ever recognised at the current
	-- position. Unanchored, a Thai character that cannot start a syllable
	-- made the match land further along the string (silently dropping the
	-- characters in between) or fail outright, returning nil and raising an
	-- error on the concatenation below.
	local anchored_pattern = "^" .. export.syllable_pattern
	local idx = 1

	while idx <= entry_len do
		local ch = sub(entry, idx, idx)
		local v_pref, i, m, v_suf, f_pref, f_suf

		-- only attempt a syllable when the current character is Thai
		if find(ch, "[ก-๎]") then
			v_pref, i, m, v_suf, f_pref, f_suf = match(sub(entry, idx), anchored_pattern)
		end

		if not v_pref then
			-- leave characters that do not start a syllable alone
			table.insert(syllables, ch)
			idx = idx + 1
		else
			-- number of characters of the entry consumed by this syllable
			local match_length = len(v_pref .. i .. m .. v_suf .. f_pref .. f_suf)

			-- prevent initial consonant in the next syllable being
			-- misinterpreted as the final consonant in the current syllable:
			-- if a vowel sign follows the match, the matched finals are dropped
			if idx + match_length <= entry_len and find(sub(entry, idx + match_length, idx + match_length), "[อาัิีึึืุู]") then
				f_pref = ""
				f_suf = ""
				match_length = len(v_pref .. i .. m .. v_suf)
			end
			-- "ะ" can only have "ฮ" as its second segment
			if f_pref == "ะ" and f_suf ~= "ฮ" then
				f_suf = ""
				match_length = len(v_pref .. i .. m .. v_suf .. f_pref)
			end

			-- perform respellings; these only change the emitted text, never
			-- match_length, because the added signs are not in the entry
			if perform_respell then
				-- syllables with non-approximant syllable-final have vowel "โ" by default
				if v_pref == "" and v_suf == "" and find(f_suf, "[กงดนบม]") then
					v_pref = "โ"
				-- syllables with syllable-final "ะ" have vowel "ั" by default
				elseif v_pref == "" and v_suf == "" and f_pref == "ะ" and f_suf == "" then
					v_suf = "ั"
				-- syllables with explicitly short vowel have syllable-final "ะ" by default
				elseif find(v_suf, "[ัิุ]") and f_pref == "" and f_suf == "" then
					f_pref = "ะ"
				-- syllables with "ว" and syllable-final actually have vowel "ัว"
				elseif v_suf == "ว" and f_pref ~= "" then
					v_suf = "ัว"
				end
			end

			-- "ว" cannot be part of the vowel if "ั" does not precede it
			-- NOTE(review): this shortens match_length to stop after the
			-- (truncated) vowel, but f_pref / f_suf are still appended to the
			-- emitted syllable below, so finals taken from the entry are
			-- emitted here and then scanned again on the next iteration.
			-- Left unchanged because the right fix interacts with the
			-- respelling rules above -- confirm the intended behaviour.
			if find(v_suf, "ว") and v_suf ~= "ัว" then
				v_suf = sub(v_suf, 1, 1)
				match_length = len(v_pref .. i .. m .. v_suf)
			end

			-- construct respelt syllable
			table.insert(syllables, v_pref .. i .. m .. v_suf .. f_pref .. f_suf)
			idx = idx + match_length
		end
	end

	return syllables
end

-- hand the module table (syllable_pattern and syllabise) back to the loader
return export