Module:bcl-basahan sc

This module lacks a documentation subpage. Please create it.
Useful links: subpage list • links • transclusions • testcases • sandbox
-- Based on [[Module:tl-bay sc]] by [[User:Ysrael214]].

-- Table of public entry points returned at the end of the module.
local export = {}

-- Language object for code "bcl" and script object for code "Tglg"; both are
-- used by export.show for transliteration and for the lang/class attributes
-- of the output span.
local lang = require("Module:languages").getByCode("bcl")
local sc_Tglg = require("Module:scripts").getByCode("Tglg")

-- Short aliases for the Unicode-aware Scribunto string helpers.
local u = mw.ustring.char
local rfind = mw.ustring.find
local rsubn = mw.ustring.gsub
local rsplit = mw.text.split
local ulower = mw.ustring.lower

-- Combining diacritics, as they appear after NFD decomposition.
local AC = u(0x0301) -- combining acute accent
local GR = u(0x0300) -- combining grave accent
local CFLEX = u(0x0302) -- combining circumflex accent
local TILDE = u(0x0303) -- combining tilde
local DIA = u(0x0308) -- combining diaeresis
local MACRON = u(0x0304) -- combining macron

-- Pattern fragments. Names ending in _c (and V / C) are complete character
-- classes; the others are raw character lists meant to be embedded in one.
local vowel = "aeəiouàèìòù" -- vowel
local V = "[" .. vowel .. "]"
local accent = AC .. GR .. CFLEX .. MACRON
local accent_c = "[" .. accent .. "]"
local stress_c = "[" .. AC .. GR .. "]"
-- Accent marks plus the word boundary "#", space, syllable dot and slash.
local separator = accent ..  "# ./"
local separator_c = "[" .. separator .. "]"
-- Anything that is neither a vowel nor a separator counts as a consonant.
local C = "[^" .. vowel .. separator .. "]" -- consonant

-- Base glyph for each internal sound symbol. The three vowel entries are the
-- standalone vowel letters; the rest are consonant letters, to which a mark
-- from basahan_marks is appended by export.transcribe. "ŋ" is the internal
-- symbol that export.transcribe substitutes for "ng".
local basahan_chars = { 
	["a"] = "ᜀ", 
	["i"] = "ᜁ", 
	["u"] = "ᜂ",
	["b"] = "ᜊ", 
	["k"] = "ᜃ", 
	["d"] = "ᜇ", 
	["g"] = "ᜄ", 
	["h"] = "ᜑ", 
	["l"] = "ᜎ",
	["m"] = "ᜋ",
	["n"] = "ᜈ",
	["ŋ"] = "ᜅ",
	["p"] = "ᜉ",
	["r"] = "ᜍ",
	["s"] = "ᜐ",
	["t"] = "ᜆ",
	["w"] = "ᜏ",
	["y"] = "ᜌ"
}

-- Marks appended after a consonant glyph. The key is the vowel that follows
-- the consonant; "a" maps to the empty string, so a bare consonant glyph is
-- emitted for consonant + a. "+" is the mark export.transcribe attaches to
-- consonants that are not followed by a vowel (non-traditional mode).
-- NOTE(review): the "/" entry is not referenced anywhere in this file.
local basahan_marks = {
	["a"] = "",
	["i"] = "ᜒ",
	["u"] = "ᜓ",
	["+"] = "᜔",
	["/"] = "᜕"
}

-- Whole-word respellings applied by export.transcribe before any other
-- processing; the lookup is done on the lowercased word without accents.
local basahan_replace_word = {
	["mga"] = "manga"
}

-- Wrapper around rsubn() that yields only the substituted string and drops
-- the substitution count.
local function rsub(term, foo, bar)
	-- The parentheses truncate the call to its first return value.
	return (rsubn(term, foo, bar))
end

-- Wrapper around rsubn() whose second return value is a boolean telling
-- whether at least one substitution took place.
local function rsubb(term, foo, bar)
	local replaced, count = rsubn(term, foo, bar)
	local changed = count > 0
	return replaced, changed
end

-- Keep applying rsub() with the same pattern and replacement until a pass
-- leaves the string unchanged, then return that fixed point.
local function rsub_repeatedly(term, foo, bar)
	local previous
	repeat
		previous = term
		term = rsub(previous, foo, bar)
	until term == previous
	return term
end

-- ĵ, ɟ and ĉ are used internally to represent [d͡ʒ], [j] and [t͡ʃ]

--- Transcribe romanized text into Basahan script.
--
-- The input is first normalized to an internal phonemic form: lowercased,
-- NFD-decomposed, digraphs folded into single symbols, syllables separated
-- by "." and words wrapped in "#". Each syllable is then converted to glyphs
-- taken from `basahan_chars` and `basahan_marks`.
--
-- @param text  string to convert; when nil the current page title is used.
-- @param trad  truthy selects the "traditional" rendering: a single "/" is
--              written with the same sign as "//", syllable-final consonants
--              are not written, and consonants inside a cluster carry a vowel
--              mark instead of the "+" mark.
-- @param diph  truthy skips the rewriting of vowel + i/u as vowel + glide.
-- @return the converted text, NFC-normalized.
function export.transcribe(text, trad, diph)
	text = ulower(text or mw.title.getCurrentTitle().text)
	-- decompose everything but ñ and ü
	text = mw.ustring.toNFD(text)
	text = rsub(text, "." .. "[" .. TILDE .. DIA .. "]", {
		["n" .. TILDE] = "ñ",
		-- ü is u + diaeresis. It used to be keyed on the tilde only, so a real
		-- "ü" was never recomposed and the "gü" rules below could not match.
		["u" .. DIA] = "ü",
		-- kept so that input relying on the old (tilde) key behaves as before
		["u" .. TILDE] = "ü",
		["e" .. DIA] = "ə",
	})

	-- convert commas and en/em dashes to IPA foot boundaries
	text = rsub(text, "%s*[,–—]%s*", " | ")
	-- question mark or exclamation point in the middle of a sentence -> IPA foot boundary
	text = rsub(text, "([^%s])%s*[!?]%s*([^%s])", "%1 | %2")

	-- canonicalize multiple spaces and remove leading and trailing spaces
	local function canon_spaces(str)
		str = rsub(str, "%s+", " ")
		str = rsub(str, "^ ", "")
		str = rsub(str, " $", "")
		return str
	end

	text = canon_spaces(text)

	local words = rsplit(text, " ")

	for i, word in ipairs(words) do
		-- Look the word up with every accent mark removed (not just one).
		local replacement = basahan_replace_word[rsub(word, accent_c, "")]
		if replacement then
			words[i] = replacement
		end
	end

	text = table.concat(words, " ")

	-- Convert slashes to bantasan, kulit divider
	text = rsub(text, "//", " ᜶ ")
	text = rsub(text, "/", trad and ' ᜶ ' or " ᜵ ")

	-- Convert hyphens to dot
	text = rsub(text, "%-", ".")
	-- canonicalize multiple spaces again, which may have been introduced by hyphens
	text = canon_spaces(text)
	-- now eliminate punctuation
	text = rsub(text, "[!?']", "")
	-- put # at word beginning and end and double ## at text/foot boundary beginning/end
	text = rsub(text, " | ", "# | #")
	text = "##" .. rsub(text, " ", "# #") .. "##"

	-- Move this early for now
	--c, gü/gu+e or i, q
	text = rsub(text, "c([iey])", "s%1")
	text = rsub(text, "([aeëiou])gü([ie])", "%1ɡw%2")
	text = rsub(text, "gü([ie])", "ɡuw%1")
	text = rsub(text, "gu([e])", "ɡ%1") -- Only e, so words like "biguin" will not be read as "bigin"
	text = rsub(text, "qu([ie])", "k%1")
	text = rsub(text, "ü", "u")

	--ll
	text = rsub(text, "ll([i]?)([aeëiou])", "ly%2")

	-- Correction for vowels with in-between glottal stop, now default
	text = rsub_repeatedly(text, "(" .. V .. ")(" .. V .. ")", "%1.%2")

	-- Reenable "j" sound be equivalent to "dy"
	-- Ex. gaja = ga(d)ya
	text = rsub(text, "dj(" .. V .. ")", "dy%1")
	text = rsub(text, "j(" .. V .. ")", "dy%1")

	-- handle certain combinations; ch ng and sh handling needs to go first
	text = rsub(text, "([t]?)ch", "ts") --not the real sound
	-- g followed by a combining tilde (the text is in NFD at this point)
	text = rsub(text, "([n]?)g" .. TILDE, "ŋ") -- Spanish spelling support
	text = rsub(text, "ng", "ŋ")
	text = rsub(text, "sh", "ʃ")

	--ck
	text = rsub(text, "ck", "k") -- foreign sound in case

	--x
	text = rsub(text, "([#])x([aeëiou])", "%1s%2")
	text = rsub(text, "x", "ks")

	--alphabet-to-phoneme
	text = rsub(text, "[cgjñqvz7]",
	--["g"]="ɡ":  U+0067 LATIN SMALL LETTER G → U+0261 LATIN SMALL LETTER SCRIPT G
		{ ["c"] = "k", ["g"] = "ɡ", ["j"] = "ĵ", ["ñ"] = "ɲ", ["q"] = "k", ["v"] = "b", ["z"] = "s"})

	--r
	text = rsub(text, "rr", "r")

	-- ts
	text = rsub(text, "ts", "ĉ") --not the real sound

	--determining whether "y" is a consonant or a vowel
	--Basahan treats as consonant regardless
	text = rsub(text, "y(" .. V .. ")", "ɟ%1") -- not the real sound
	text = rsub(text,"y([ˈˌ.]?)([bćĉdɡhjĵklmnɲŋpɾrsʃtwɟʔ" .. vowel .. "])","i%1%2")
	text = rsub(text, "y#", "i")
	-- "w" before a vowel stays a consonant; elsewhere it is read as "u"
	text = rsub(text,"w([ˈˌ]?)([bćĉdɡhjĵklmnɲŋpɾrsʃtwɟʔ])","u%1%2")
	text = rsub(text, "w#","u")
	--text = rsub(text, "sɟ", "ʃ")

	-- Syllable division: insert "." between syllables.
	text = rsub_repeatedly(text, "(" .. V .. accent_c .. "*)(" .. C .. V .. ")", "%1.%2")
	text = rsub_repeatedly(text, "(" .. V .. accent_c .. "*" .. C .. ")(" .. C .. V .. ")", "%1.%2")
	text = rsub_repeatedly(text, "(" .. V .. accent_c .. "*" .. C .. "+)(" .. C .. C .. V .. ")", "%1.%2")
	text = rsub_repeatedly(text, "(" .. C .. ")%.s(" .. C .. ")", "%1s.%2")
	-- Any aeo, or stressed iu, should be syllabically divided from a following aeo or stressed iu.
	text = rsub_repeatedly(text, "([aeo]" .. accent_c .. "*)([aeo])", "%1.%2")
	text = rsub_repeatedly(text, "([aeo]" .. accent_c .. "*)(" .. V .. stress_c .. ")", "%1.%2")
	text = rsub(text, "([əiu]" .. stress_c .. ")([aeo])", "%1.%2")
	text = rsub_repeatedly(text, "([əiu]" .. stress_c .. ")(" .. V .. stress_c .. ")", "%1.%2")
	text = rsub_repeatedly(text, "i(" .. accent_c .. "*)i", "i%1.i")
	text = rsub_repeatedly(text, "u(" .. accent_c .. "*)[ou]", "u%1.u")

	-- Remove accent marks. Every mark has to go: a syllable that still carries
	-- one does not match the syllable pattern below and would be left in Latin.
	text = rsub(text, accent_c, "")

	if (not diph) then
		--Corrections for diphthongs
		text = rsub(text,"([aeəou])i","%1j") --y
		text = rsub(text,"([aeəio])u","%1w") --w
	end

	-- Disabled "dy" combination to be automatically j sound
	-- text = rsub(text,"d[.]ɟ",".ĵ") --/d/ before /j/
	text = rsub(text,"d[.]ĵ",".ĵ") --/d/ before /j/

	text = rsub_repeatedly(text,"(n)[.]([kɡ])","ŋ.%2") -- /n/ before /k/ (some proper nouns)
	text = rsub(text,"n[.]ɟ",".ɲ") -- /n/ before /j/
	text = rsub(text,"s[.]ɟ",".ʃ") -- /s/ before /j/
	-- text = rsub(text,"t[.][ɟ]","%1ĉ") -- /t/ before /j/
	text = rsub(text,"([.])d([ɟj])([aeəiou])","%1ĵ%3") -- /dj/ before any vowel following stress
	text = rsub(text,"([.])n([ɟj])([aeəiou])","%1ɲ%3") -- /nj/ before any vowel following stress
	text = rsub(text,"([.])s([ɟj])([aeəiou])","%1ʃ%3") -- /sj/ before any vowel following stress

	-- Separated ts and ty sounds
	-- text = rsub(text,"([.])t([ɟj])([aeəiou])","%1ĉ%3") -- /tj/ before any vowel following stress

	-- After processing pronunciation, Basahan Start Translate
	-- Only three vowels are written: fold ə/e into i and o into u.
	text = rsub(text, "[əei]", "i")
	text = rsub(text, "[ou]", "u")

	-- Remove /kt/ like "abstrakt"
	text = rsub(text, "kt([#.])", "k%1")

	-- Check if there are errors with vowels again
	text = rsub(text,"([aiu])([^.]?)([aəiu])","%1.%2%3")

	-- Convert one syllable. `syll` is onset + vowel, `post` is the coda, and
	-- `last_vowel` is the vowel of the previous syllable when that syllable
	-- ended in m/n (nil otherwise).
	local function basahan_syllable(syll, post, last_vowel)

		-- glyphs for the coda consonants; stays empty in traditional mode
		local syll2 = ""

		-- ASCII stand-ins for the multibyte internal symbols, so that the
		-- byte-wise gmatch('.') loops below see one byte per sound.
		local bas_double = {
			["ĉ"] = "t", ["ĵ"] = "d",
			["ɲ"] = "n", ["ʃ"] = "s",
			["ɡ"] = "g", ["ŋ"] = "N",
		}

		-- Glyph(s) for one internal sound symbol, without a vowel mark.
		local function basahan(character)
			local bas_soundpre = ''
			character = rsub(character, "[ɡ]", "g")

			if character == 'ĉ'
			or character == 'ĵ'
			or character == 'ɲ'
			or character == 'ʃ' then
				-- These are written as two glyphs: a leading consonant
				-- followed by "s" (for ĉ) or "y" (for the others).
				bas_soundpre = bas_double[character]
				bas_soundpre = basahan_chars[bas_soundpre] .. basahan_marks[trad and 'i' or '+']

				if character == 'ĉ' then
					if trad then bas_soundpre = '' end
					character = "s"
				else
					character = "y"
				end
			end

			character = rsub(character, "[f]", "p")
			character = rsub(character, "[ɟj]", "y")
			character = rsub(character, "[N]", "ŋ")

			return bas_soundpre .. basahan_chars[character]
		end

		if not trad then
			-- Remove /h/ as it is not pronounced in between
			syll = rsub(syll, "([^h]+)(h+)", "%1")
			post = rsub(post, "(h+)", "")

			post = rsub(post, "ŋ", bas_double["ŋ"])
			post = rsub(post, "ɲ", bas_double["ɲ"])
			post = rsub(post, "ɡ", bas_double["ɡ"])
			post = rsub(post, "ʃ", bas_double["ʃ"])
			post = rsub(post, "ĵ", bas_double["ĵ"] .. 's')
			post = rsub(post, "ĉ", bas_double["ĉ"] .. 's')

			-- every coda consonant gets the "+" mark
			for c in post:gmatch('.') do
				syll2 = syll2 .. basahan(c) .. basahan_marks['+']
			end
		end
		syll = rsub(syll, "(" .. C .. "*)(" .. V .. "+)",
			function(consonant, nucleus)
				local bas_char = ''

				-- string.len / string.match work on bytes here: the second
				-- test is meant to accept a single two-byte symbol.
				if string.len(consonant) == 0 then
					bas_char = basahan(nucleus)
				elseif string.len(consonant) == 1 or string.match(consonant, "[ĉĵɲŋʃɡ]") and string.len(consonant) == 2 then
					bas_char = basahan(consonant) .. basahan_marks[nucleus]
				elseif string.match(consonant, "^(.*)ll$") then
					for c in consonant:gmatch('^(.)ll$') do
						bas_char = bas_char ..basahan(c) .. basahan_marks[trad and nucleus or '+']
					end

					bas_char = bas_char .. basahan("l") .. basahan_marks[trad and "i" or '+']
					bas_char = bas_char .. basahan("y") .. basahan_marks[nucleus]
				else
					-- Two character unicode problems
					consonant = rsub(consonant, "ŋ", bas_double["ŋ"])
					consonant = rsub(consonant, "ɲ", bas_double["ɲ"])
					consonant = rsub(consonant, "ɡ", bas_double["ɡ"])
					consonant = rsub(consonant, "ʃ", bas_double["ʃ"])
					consonant = rsub(consonant, "ĉ", bas_double["ĉ"] .. (trad and 'y' or 's'))
					consonant = rsub(consonant, "ɟ", "y")

					for c in consonant:gmatch('.') do
						bas_char = bas_char .. basahan(c) .. basahan_marks[trad and (last_vowel or nucleus) or '+']
						last_vowel = nil
					end

					-- the last consonant of the cluster takes the real vowel
					bas_char = rsub(bas_char, basahan_marks['+'] .. "$", basahan_marks[nucleus])
				end
				return bas_char
			end
		)

		return syll .. syll2
	end

	words = rsplit(text, " ")
	for i, word in ipairs(words) do

		-- (C)/y/ and --(C)w fixes
		-- /h/ being pronounced
		if trad then
			word = rsub(word, "([^w" .. vowel .. separator .. "])(w)(" .. V .. ")(" .. C .. "*)([.#]+)", "%1u.%2%3%4%5")
			word = rsub(word, "([^ɟ" .. vowel .. separator .. "])(ɟ)(" .. V .. ")(" .. C .. "*)([.#]+)", "%1i.%2%3%4%5")
			word = rsub(word, "(" .. C .. "*)(" .. V .. ")(h)(" .. C .. "+)([.#]+)", "%1%2.%3%2%4%5")
		end

		local syllables = rsplit(word, "[.]")
		local last_vowel = nil
		for j = 1, #syllables do
			-- rfind is Unicode-aware; a byte-wise string.match with these
			-- multibyte classes also fires on unrelated characters that
			-- merely share a UTF-8 byte with one of the listed ones.
			if rfind(syllables[j], V) then
				syllables[j] = rsub(syllables[j], "^([#]*)(" .. C .. "*)(" .. V .. "+)(" .. C .. "*)([#]*)$",
					function(temp1, pre, nucleus, post, temp2)
						local retval = temp1 .. basahan_syllable(pre .. nucleus, post, last_vowel) .. temp2
						last_vowel = string.match(post, "[mn]") and nucleus or nil
						return retval
					end
				)
			elseif not rfind(syllables[j], "[᜵᜶]") then
				-- This is only a fallback when no vowel is entered
				syllables[j] = rsub(syllables[j], "^([#]*)(" .. C .. "+)([#]*)$",
					function(temp1, consonant, temp2)
						if trad then
							return temp1 .. basahan_syllable(consonant .. "a", "") .. temp2
						else
							return temp1 .. basahan_syllable("", consonant) .. temp2
						end
					end
				)
			end
		end
		words[i] = table.concat(syllables, "")
	end

	text = table.concat(words, " ")

	-- remove # symbols at word and text boundaries
	text = rsub(text, "#", "")

	text = canon_spaces(text)

	return mw.ustring.toNFC(text)
end

--- Template entry point: transcribe the first positional argument (or the
-- page title) and return it, optionally wrapped in a script-tagged span and
-- followed by a parenthesized romanization.
--
-- Arguments are read from the parent frame:
--   1     text to transcribe; defaults to the current page title
--   trad  any value selects the traditional rendering (see export.transcribe)
--   diph  any value is passed on as export.transcribe's `diph` flag
--   disp  any value wraps the result in a <span> carrying the script code as
--         class and the language code as lang
--   pre   text placed before the result, followed by a space
--   tr    "1" appends the automatic transliteration of the result,
--         "2" appends the input text with "." removed, "//" turned into "."
--         and "/" into ","; anything else appends nothing
--
-- @param frame  the Scribunto frame object
-- @return the assembled wikitext string
function export.show(frame)
	local params = {
		[1] = {},
		["trad"] = {},
		["diph"] = {},
		["disp"] = {},
		["pre"] = {},
		["tr"] = {},
		-- ["bullets"] = {type = "number", default = 1},
	}

	local parargs = frame:getParent().args
	local args = require("Module:parameters").process(parargs, params)

	local text = args[1] or mw.title.getCurrentTitle().text
	local disp = args.disp or false
	local trad = args.trad or false
	local diph = args.diph or false

	local results = export.transcribe(text, trad, diph)

	-- Basahan to Latin
	local tr = ''
	if args.tr == '1' then
		-- transliterate() can return nil when no transliteration is
		-- available; in that case nothing is appended instead of raising.
		local translit = lang:transliterate(results, sc_Tglg)
		if translit then
			translit = rsub(translit, "%s[,]", ",")
			translit = rsub(translit, "%s[.]", ".")
			tr = ' (' .. translit .. ')'
		end
	elseif args.tr == '2' then
		local latin = rsub(text, "[.]", "")
		latin = rsub(latin, "//", ".")
		latin = rsub(latin, "/", ",")
		tr = ' (' .. latin .. ')'
	end

	local pre = args.pre and args.pre .. " " or ""

	if trad then
		-- drop every space that is not next to the ᜶ sign
		results = rsub_repeatedly(results, "([^᜶]) ([^᜶])", "%1%2")
	end

	if disp then
		results = '<span class="' .. sc_Tglg:getCode() .. '" lang="' .. lang:getCode() .. '">' .. results .. "</span>"
	end

	return pre .. results .. tr
end

-- Expose the public functions (transcribe, show) to callers of the module.
return export