This module page is in beta stage.
Its interface has been stabilised, but the module page may still contain errors. Do not deploy widely until the module page has been tested.

local m_str_utils = require("Module:string utilities")

local find = m_str_utils.find
local gmatch = m_str_utils.gmatch
local gsub = m_str_utils.gsub
local lower = m_str_utils.lower
local match = m_str_utils.match
local reverse = m_str_utils.reverse
local upper = m_str_utils.upper

local export = {}

-- XXX: needs rewrite [3 February 2020 (UTC)]
-- XXX: 老壯文 seems to omit tone marks from new Mandarin borrowings ([[w:zh:新壮文#注解]])

-- https://en.wikipedia.org/wiki/Standard_Zhuang
-- https://baike.baidu.com/item/壮语/7703463
-- 在线学壮文 https://web.archive.org/web/0/http://www.gxmyw.com.cn/plus/list.php?tid=21
-- 基础壮文学习系列:壮文标点符号与书写规则 https://web.archive.org/web/0/http://www.gxmyw.com.cn/wsxzw/2013/1017/57.html

-- Syllable-initial spellings mapped to their IPA values.
-- The empty-string key is the entry used when a syllable is written without an initial.
-- Spelling shape accepted by the parser: [bmfvdnslghrcy]?[gbd]?[vy]?
local initialConv = {
	b = 'p', mb = 'ɓ', m = 'm', f = 'f', v = 'β',
	by = 'pʲ', my = 'mʲ',

	d = 't', nd = 'ɗ', n = 'n', l = 'l', s = 'θ',

	ny = 'ɲ', c = 'ɕ', y = 'j',

	g = 'k', ng = 'ŋ', r = 'ɣ',
	gy = 'kʲ', ngv = 'ŋʷ', gv = 'kʷ',

	[''] = 'ʔ', h = 'h',
}

-- Vowel spellings mapped to IPA. Each entry carries two readings:
--   alone  – value used when the syllable has no final consonant
--   wfinal – value used when a final consonant follows
-- A reading of `false` means the table provides no value for that position.
--
-- Spelling shape: [aeiouw][ieu]?[uw]?
-- w/ final only: [aeiouw]e?
-- cannot be w/ final: ai, ei, oi, ui, wi, au, aeu, eu, iu, ou, aw // [aeiouw]e?[iuw]
-- cannot be w/o final: ie, oe, ue // [iou]e
local vowelConv
do
	-- Build one { alone, wfinal } record.
	local function readings(alone, wfinal)
		return { alone = alone, wfinal = wfinal }
	end

	vowelConv = {
		a = readings('a', 'aː'),
		e = readings('e', 'eː'),
		i = readings('i', 'i'),
		o = readings('o', 'oː'),
		u = readings('u', 'u'),
		w = readings('ɯ', 'ɯ'),

		ai = readings('aːi', false),
		ei = readings('ei', false),
		oi = readings('oːi', false),
		ui = readings('uːi', false),
		wi = readings('ɯːi', false),

		ae = readings('ai', 'a'),
		ie = readings(false, 'iː'),
		oe = readings(false, 'o'),
		ue = readings(false, 'uː'),
		we = readings(false, 'ɯː'),

		au  = readings('aːu', false),
		aeu = readings('au', false),
		eu  = readings('eːu', false),
		iu  = readings('iu', false),
		ou  = readings('ou', false),

		aw = readings('aɯ', false),
	}
end

-- Syllable-final consonant spellings mapped to IPA; the empty key stands for "no final".
-- Each voiced/voiceless spelling pair (b/p, d/t, g/k) maps to the same IPA value.
-- Spelling shape accepted by the parser: [mnpbtdkg]?g?
local finalConv = {
	[''] = '',

	m = 'm', n = 'n', ng = 'ŋ',

	p = 'p', b = 'p',
	t = 't', d = 't',
	k = 'k', g = 'k',
}

-- Tone number → IPA tone letters.
-- The trailing note on each line gives the numeric contour and, for tones 2-6,
-- the tone letter used in the spelling.
local toneConv = {
	["1"]  = '˨˦', -- 24
	["2"]  = '˧˩', -- 31, z
	["3"]  = '˥',  -- 55, j
	["4"]  = '˦˨', -- 42, x
	["5"]  = '˧˥', -- 35, q
	["6"]  = '˧',  -- 33, h

	["7"]  = '˥',  -- 55
	["7:"] = '˧˥', -- 35
	["8"]  = '˧',  -- 33
}

-- Tone letter (as written at the end of a syllable) → tone number.
-- The empty key is the entry for an unmarked syllable.
local toneConvToNumbers = {
	[''] = '1',
	z = '2',
	j = '3',
	x = '4',
	q = '5',
	h = '6',
}

-- Tone number → tone letter. Tones 7, 7: and 8 have no letter of their own;
-- tones 1-6 are simply the inverse of the table above.
local toneConvFromNumbers = {
	['7']  = '',
	['7:'] = '',
	['8']  = '',
}
for letter, number in pairs(toneConvToNumbers) do
	toneConvFromNumbers[number] = letter
end

-- Replacement tables used by the 'old' conversion scheme.
-- Spellings without an entry are kept unchanged by the caller.

-- Consonant spellings (applied to both initials and finals).
local consonantConv_1957 = {
	mb = 'ƃ',
	nd = 'ƌ',
	ng = 'ŋ',
	ngv = 'ŋv',
}

-- Vowel letters / digraphs.
local vowelConv_1957 = {
	oe = 'ɵ',
	ae = 'ə',
	w = 'ɯ',
}

-- Tone number → tone letter; tones 1, 7, 7: and 8 are left unmarked.
local toneConv_1957 = {
	['1'] = '',
	['2'] = 'ƨ',
	['3'] = 'з',
	['4'] = 'ч',
	['5'] = 'ƽ',
	['6'] = 'ƅ',

	['7'] = '',
	['7:'] = '',
	['8'] = '',
}

-- Normalise romanised Zhuang text so that every syllable carries an explicit tone digit.
--
-- The text is processed one letter-run ("word") at a time. Within each word the tone
-- letters z/j/x/q are turned into digits, syllable boundaries are marked temporarily
-- with '|', a trailing "h" before a boundary becomes tone 6, and syllables that still
-- end in a letter receive tone 7, 8 or 1 depending on their last consonant. The '|'
-- markers are removed again before the word is appended to the result.
--
-- @param text  string to normalise
-- @return      the concatenation of all processed chunks; text before the first
--              chunk (anything not matched by the chunk pattern) is not included
local function fix(text)
	local output = {}

	-- A chunk is: optional apostrophe, a run of ASCII letters, then any following non-letters.
	for word in gmatch(text, '\'?[A-Za-z]+[^A-Za-z]*') do
		-- Split the chunk into its three parts; the inner `word` (letters only) shadows the loop variable.
		local apostrophe, word, nonword = match(word, '(\'?)([A-Za-z]+)([^A-Za-z]*)')

		word = gsub(word, '[zjxq]', toneConvToNumbers) -- excludes h which is ambiguously tone or consonant

		-- /CV-CV/...=<CVCV>...
		-- /CVC-V/...=<CVC'V>...
		-- Pattern quantifiers are greedy from the beginning of the string,
		-- so counteract this by reversing the string:
		-- the pattern below is written back-to-front (final, vowels, initial) and therefore
		-- first matches what was originally the last syllable of the word.
		-- A '|' is appended after every match and one is prepended to the whole string, so
		-- after reversing back each matched syllable has a '|' on both sides.
		word = reverse(word)
		word = '|' .. gsub(word, '(g?[mnpbtdkg]?)([ieu]?[uw]?[aeiouwAEIUOUW]+)([vy]?[gbd]?[bmfvdnslghrcyBMFVDNSLGHRCY]?)', '%1%2%3|')
		-- "+" seems to be needed after "[aiueow]"
		-- correct: "daeuz"→"daeuz" wrong: "daeuz"→"da|euz"
		word = reverse(word)
		mw.log('za1>' .. word) -- debug trace of the first segmentation pass

		-- fix bad initial consonant: "|hya"→"h|ya", "|ngya"→"n|gya"
		-- (x = the '|', a/b/c = the two or three non-vowel characters, d = the vowel)
		word = gsub(word, '(|)([^aiueow])([^aiueow])([^aiueow]?)([aiueow])', function(x,a,b,c,d)
			-- If the cluster is not a key of initialConv, move the boundary one character to the right.
			-- Returning nothing leaves the matched text unchanged.
			if not initialConv[lower(a..b..c)] then
				return a..x..b..c..d
			end
		end)
		-- Re-check syllables that end in vowel run + final consonant(s) right before a boundary.
		word = gsub(word, '([aiueow]+)([mnpbtdkg]g?)(|)', function(v,c,x)
			-- if there is a final consonant,
			if c ~= '' then
				-- and vowel sequence is not a sequence that only appears before finals,
				if not match(v, '^[aeiouw]e?$') then
					-- detect valid ...VC sequence at end of string
					-- (works on the reversed text again and inserts an extra '|' inside the vowel run)
					return reverse(gsub(reverse(v..c..x), '(|)([^aiueow]+)(e?[aeiouw])', '%1%2%3|'))
				end
			end
		end)
		-- Special case: a boundary placed before "gvu" is moved to after the "g".
		word = gsub(word, '|gvu', 'g|vu')
		mw.log('za2>' .. word) -- debug trace after the boundary corrections

		-- An "h" directly before a boundary is read as the tone-6 letter.
		word = gsub(word, 'h|', '6|')
		-- Syllables that still end in a letter have no tone yet; assign one from the last consonant:
		-- p/t/k → 7, b/d/g (but not "ng") → 8, anything else → 1.
		word = gsub(word, '([A-Za-z]+)|', function(a)
			if match(a, '[ptk]$') then
				return a..'7|'
			elseif match(a, '[bdg]$') and not match(a, 'ng$') then
				return a..'8|'
			else
				return a..'1|'
			end
		end)
		mw.log('za3>' .. word) -- debug trace with all tones explicit

		-- Drop the temporary boundary markers and restore the surrounding apostrophe / non-letters.
		table.insert(output, apostrophe .. gsub(word, '|', '') .. nonword)
	end

	return table.concat(output)
end

-- Fetch tbl[key], raising a descriptive error when the spelling has no entry
-- (instead of failing later with an opaque nil-concatenation error).
local function lookup_or_error(tbl, key, what, syllable)
	local value = tbl[key]
	if value == nil then
		error('Unrecognised ' .. what .. ' "' .. tostring(key) .. '" in Zhuang syllable "' .. syllable .. '"')
	end
	return value
end

-- Upper-case the first character of a converted syllable.
local function capitalise_first(syllable)
	-- parentheses drop gsub's second return value (the substitution count)
	return (gsub(syllable, '^(.)', upper))
end

-- Convert romanised Zhuang text into another representation.
--
-- Can be called from Lua as convert(text, scheme, new_bor), or through #invoke, in
-- which case `text` is the frame and the values are read from args[1], args[2] and
-- args['new_bor'].
--
-- @param text     string to convert (or a frame table, see above)
-- @param scheme   'IPA', 'old', 'hyphenation', 'tone_numbers' or 'raw_syllables'
-- @param new_bor  if truthy, tone-1 syllables are output with tone 5
--                 (only consulted by the 'IPA' and 'tone_numbers' schemes)
-- @return  'IPA': the transcription wrapped in slashes, syllables separated by spaces;
--          'old', 'hyphenation', 'tone_numbers': a string;
--          'raw_syllables': the list of tone-numbered syllables (a table)
-- Raises an error for an unknown scheme (when at least one syllable is found) and for
-- syllables that cannot be parsed or contain spellings missing from the conversion tables.
function export.convert(text, scheme, new_bor)
	if type(text) == "table" then
		text, scheme, new_bor = text.args[1], text.args[2], text.args['new_bor']
	end
	local converted = {}

	-- Non-letter text before the first syllable is not seen by the loop below,
	-- so keep it for the schemes that reproduce the surrounding text.
	local extra_pre = match(text, '^[^A-Za-z]*')

	text = fix(text)

	mw.log('za4>' .. text)

	for syllable in gmatch(text, '[A-Za-z]+%d[^A-Za-z]*') do
		local initial, vowel, final, tone, extra = match(syllable, '^([BMFVDNSLGHRCYbmfvdnslghrcy]?[gbd]?[vy]?)([AEIOUWaeiouw][ieu]?[uw]?)([mnpbtdkg]?g?)(%d)([^A-Za-z]*)$')
		if not initial then
			-- Previously this surfaced as "attempt to concatenate a nil value" in the log call below.
			error('Cannot split Zhuang syllable "' .. syllable .. '" into initial, vowel, final and tone')
		end

		local caps = false
		mw.log('za5>' .. initial, vowel, final, tone, extra)

		if find(initial .. vowel .. final, '[A-Z]') then
			caps = true
			initial, vowel, final = lower(initial), lower(vowel), lower(final)
		end

		local result
		if scheme == 'IPA' then
			local ipa_initial = lookup_or_error(initialConv, initial, 'initial', syllable)
			local vowel_readings = lookup_or_error(vowelConv, vowel, 'vowel', syllable)
			local ipa_final = lookup_or_error(finalConv, final, 'final', syllable)

			local ipa_vowel
			if final == '' then
				-- Vowels listed without a standalone reading fall back to their
				-- pre-final reading (this is what the module has always produced).
				ipa_vowel = vowel_readings.alone or vowel_readings.wfinal
			else
				ipa_vowel = vowel_readings.wfinal
			end
			if not ipa_vowel then
				error('Vowel "' .. vowel .. '" cannot be followed by a final consonant in Zhuang syllable "' .. syllable .. '"')
			end

			-- Tone 7 on a long vowel uses the separate '7:' contour.
			if tone == '7' and find(ipa_vowel, 'ː') then
				tone = '7:'
			elseif new_bor and tone == '1' then
				tone = '5'
			end

			-- Trailing punctuation and capitalisation are deliberately not carried into the IPA.
			result = ipa_initial .. ipa_vowel .. ipa_final .. lookup_or_error(toneConv, tone, 'tone', syllable)
		elseif scheme == 'old' then
			local old_initial = consonantConv_1957[initial] or initial
			local old_vowel = gsub(vowel, '[oa]e', vowelConv_1957)
			old_vowel = gsub(old_vowel, 'w', vowelConv_1957)
			local old_final = consonantConv_1957[final] or final
			local old_tone = lookup_or_error(toneConv_1957, tone, 'tone', syllable)

			-- Open-syllable "ae" and "aw" get special spellings.
			if old_vowel == 'ə' and old_final == '' then
				old_vowel = 'əi'
			elseif old_vowel == 'aɯ' and old_final == '' then
				old_vowel = 'əɯ'
			end

			result = old_initial .. old_vowel .. old_final .. old_tone .. extra
			if caps then result = capitalise_first(result) end
		elseif scheme == 'hyphenation' then
			local tone_letter = lookup_or_error(toneConvFromNumbers, tone, 'tone', syllable)

			extra = gsub(extra, '\'', '')
			result = initial .. vowel .. final .. tone_letter .. extra
			if caps then result = capitalise_first(result) end
		elseif scheme == 'tone_numbers' then
			if new_bor and tone == '1' then
				tone = '5'
			end

			extra = gsub(extra, '\'', '')
			result = initial .. vowel .. final .. '<sup>' .. tone .. '</sup>' .. extra
			if caps then result = capitalise_first(result) end
		elseif scheme == 'raw_syllables' then
			result = syllable
		else
			error('Convert to what representation?')
		end

		table.insert(converted, result)
	end

	if scheme == 'IPA' then
		converted = '/' .. table.concat(converted, ' ') .. '/'
	elseif scheme == 'old' then
		converted = extra_pre .. table.concat(converted, '')
		-- Remove an apostrophe (literal or as the HTML entity &#39;) that directly follows a tone-6 mark.
		converted = mw.ustring.gsub(mw.ustring.gsub(converted, "([6Ƅƅ])'", "%1"), "([6Ƅƅ])&#39;", "%1")
	elseif scheme == 'hyphenation' then
		converted = gsub(extra_pre .. table.concat(converted, '‧'), ' ', '')
	elseif scheme == 'tone_numbers' then
		converted = extra_pre .. table.concat(converted, '')
	elseif scheme == 'raw_syllables' then
		-- (pass) the list of syllables is returned as-is
	end

	return converted
end

-- Template entry point: builds the pronunciation lines for a Zhuang entry.
-- Reads parameter 1 (the text; defaults to the current page title) and the boolean
-- `new_bor` from the parent frame, and returns three items joined by "\n* ":
-- the qualified IPA, the tone-number form, and the hyphenation followed by a
-- syllable-count category link.
function export.show(frame)
	local args = require("Module:parameters").process(frame:getParent().args, {
		[1] = { },
		['new_bor'] = { type = "boolean" },
	})

	local new_bor = args['new_bor']
	local text = args[1]
	if not text then
		text = mw.title.getCurrentTitle().text
	end

	-- Line 1: accent qualifier + formatted IPA.
	local qualifier = require('Module:accent qualifier').format_qualifiers({'Standard Zhuang'})
	local m_IPA = require('Module:IPA')
	local lang = require('Module:languages').getByCode('za')
	local ipa_items = { { pron = export.convert(text, 'IPA', new_bor) } }
	local ipa_line = qualifier .. ' ' .. m_IPA.format_IPA_full(lang, ipa_items)

	-- Line 2: spelling with superscript tone numbers.
	local tone_line = 'Tone numbers: ' .. export.convert(text, 'tone_numbers', new_bor)

	-- Line 3: hyphenation, plus a category named after the number of syllables.
	local hyphenated = export.convert(text, 'hyphenation', new_bor)
	local syllable_count = #export.convert(text, 'raw_syllables')
	local hyphenation_line = 'Hyphenation: ' .. hyphenated ..
		'[[Category:Zhuang ' .. syllable_count .. '-syllable words]]'

	return table.concat({ ipa_line, tone_line, hyphenation_line }, '\n* ')
end

-- Returns 'y' when the first argument contains ASCII letters and none of the
-- special letters listed in the first pattern; returns '' otherwise.
-- Text without any ASCII letters (e.g. CJK) also yields '' — it is not detected separately.
function export.is_latin(frame)
	local text = frame.args[1]

	local has_special_letters = find(text, '[ƂƃƋƌŊŋƏəƟɵƜɯƧƨЗзЧчƼƽƄƅ]')
	if not has_special_letters and find(text, '[A-Za-z]') then
		return 'y'
	end

	return ''
end

return export