Module:cy-IPA/sandbox

This module lacks a documentation subpage. Please create it.
Useful links: root page • root page’s subpages • links • transclusions • testcases • sandbox of
--[=====[ 
Currently missing:
* Dialects: should include North Wales, South Wales and standard vs. colloquial variants of each. These parameters are optional when there is no difference between the variants
* ch, dd, ff, ng, ll, ph, th should be treated as single letters, all other
  consonant combinations should not
* Function for  de-aspiration of b, d, g in /sC/ clusters and word finally, but
  remaining distinct from aspirated p, t, k
* Function to treat all voicing assimilation as becoming voiceless rather than
  progressive or regressive assimilation
* Function for pre-consonantal obstruent devoicing of d, g, b, s
* y should be treated as /ə/, unless in a final syllable when it is /ɨ/ or /ɨː/
  depending on vowel length. This ensures it stays separate from u /ɨ/ or /ɨː/
* y (in final syllables) and u merge with i in South Wales, including in diphthongs
* ae should be treated as /ɑːɨ/ in final syllables and /eːɨ̯/ elsewhere
* ng is usually ŋ (marked for alphabetisation as g~ already) but may be ŋɡ (not
  considered one letter) especially in compound words
* Function to reduce double letters, after appropriate consideration of effects on vowel length
* Function to convert ⟨ai, au⟩ in final unstressed syllables to /ɛ/ in colloquial Welsh
* Function to convert ⟨ai, au, e⟩ in final unstressed syllables to /a/ in colloquial Northern Welsh
* Many other diphthongs (including stressed) are smoothed in South Welsh - need to research
* Rule to determine stress - always penultimate syllable, unless there is a stressed
  suffix such as -(h)áu or the word is a recent loanword
* Rules to determine when to make vowels short vs. long. The best way to do this
  is by taking South Welsh as normative wrt. vowel length and North Welsh as
  normative wrt vowel quality (and length in diphthongs). Whichever of the two
  has a long vowel before a cluster should be normative in this respect.
* An input whether the word is a recent loan from English might make a lot of exceptions
  predictable/automatable, e.g. words with atypical short and long vowels or stress
* Rules to determine when to make vowels short vs. long. There will need to be
  ways to override this, e.g. by adding a circumflex to long vowels and a grave to short vowels.
  Some defaults:
  - vowels should be short if unstressed or /ə/
  - vowels should be long in a stressed open syllable (unless non-final in North Welsh)
  - vowels should be long in a stressed final syllable before /b, ch, d, dd, g, f, ff, g, h, l, n, r, ph, s, th/
  - note that exceptions to the above are common for /l, n, r/
  - vowels should also be long in stressed open syllables before /b, ch, d, dd, g, f, ff, g, h, l, n, r, ph, th/
  but NOT /s/ (except in North Wales, where all non-final vowels are short)
  - all other vowels should be short, especially when an aspirated stop and some
  liquid consonants follow /c, m, ll, ng, nn, p, rr, t/
  - vowels should generally be short before clusters, with well-defined exceptions
  - Vowels in North Welsh are long in stressed final syllables before /sC, ɬC/ clusters - should form part of the norm with South Welsh automatically derived from it
  - Diphthongs with long vowels in North Welsh (only in final syllables)
  include ae /ɑːɨ̯, eːɨ̯/, aw /ɑːu̯/, ew /eːu̯/, ey /e.ɨ̯/, oe /ɔːɨ̯/, ou /ɔːɨ̯/- should form part of the norm with South Welsh automatically derived from it
  - syllables with secondary stress should be treated as if stressed
--]=====]

local export = {}

local u = mw.ustring.char
local rfind = mw.ustring.find
local rsubn = mw.ustring.gsub
local rmatch = mw.ustring.match
local rsplit = mw.text.split
local ulower = mw.ustring.lower
local uupper = mw.ustring.upper
local usub = mw.ustring.sub
local ulen = mw.ustring.len

-- Substitute like rsubn(), but hand back only the resulting string; the
-- replacement count that rsubn() also returns is dropped.
local function rsub(term, foo, bar)
	-- The extra parentheses truncate the call to its first return value.
	return (rsubn(term, foo, bar))
end

-- Keep applying rsub() with the same pattern and replacement until a pass
-- leaves the string unchanged, then return that string.
local function rsub_repeatedly(term, foo, bar)
	local previous
	repeat
		previous = term
		term = rsub(previous, foo, bar)
	until term == previous
	return term
end

-- "If not empty": turn the empty string into nil and pass every other value
-- through untouched.
local function ine(x)
	if x ~= "" then
		return x
	end
	return nil
end

-- Combining diacritics, built from their code points.
local AC = u(0x0301)    -- combining acute
local GR = u(0x0300)    -- combining grave
local BREVE = u(0x0306) -- combining breve
local DIA = u(0x0308)   -- combining diaeresis

-- Acute and grave together form the set of stress marks; the `_c` variant is
-- the same set wrapped as a pattern character class.
local stress_accent = AC .. GR
local stress_accent_c = ("[%s]"):format(stress_accent)

-- Stress marks plus breve; `accents_r` matches any run of them, including an
-- empty run.
local accent = stress_accent .. BREVE
local accents_r = ("[%s]*"):format(accent)

-- Characters counted as vowels by the classes below.
-- NOTE(review): unaccented a e i o u w y are not listed here, so vowel_c and
-- cons_c treat them as non-vowels -- confirm whether that is intended.
local vowel = "àáâäèéêëìíîïòóôöùúûẁẃŵẅüỳýŷÿ" .. accent
local vowel_c = ("[%s]"):format(vowel)
-- Anything that is neither a listed vowel nor one of the separator characters
-- (full stop, ⁀, space, hyphen, parentheses).
local cons_c = table.concat({ "[^", vowel, ".⁀ %-()]" })

-- Carried over from Module:de-IPA; Welsh has i-umlaut, so this may become
-- useful.
local front_vowel = "eiyæœ"
local front_vowel_c = ("[%s]"):format(front_vowel)

-- Spelling-to-phone lookup used by export.IPA.
--
-- The top-level key is a single character of the input. Its value is one of:
--   * a string: the phone for that character;
--   * a list ({ "k", "s" }): several phones emitted in order;
--   * a table keyed by spelled sequences: the longest key matching the input
--     at the current position is chosen, and its value (string or list) is
--     emitted.
-- A character with no entry is copied to the output unchanged.
--
-- Entries from "g" onwards are still those inherited from Module:de-IPA and
-- have not been adapted to Welsh.
local sequences = {
	["a"] = {
		["a"   ] = "a";
		["à"   ] = "a";
		["á"   ] = "a";
		["â"   ] = "a";
		["ä"   ] = "a";
		["ae"  ] = "ɑːɨ̯";
		["ai"  ] = "ai̯";
		["au"  ] = "aɨ̯";
		["aw"  ] = "ɑːu̯";
	};
	["b"] = {
		["b"   ] = "b";
	};
	["c"] = {
		["c"   ] = "k";
		["ch"  ] = "ç";
	};
	["d"] = {
		["d"   ] = "d";
		-- Welsh dd is the voiced dental fricative, distinct from d.
		["dd"  ] = "ð";
	};
	["e"] = {
		["e"   ] = "ɛ";
		["è"   ] = "ɛ";
		["é"   ] = "eː";
		["ê"   ] = "eː";
		["ë"   ] = "e";
		["ei"  ] = "ɛi̯";
		["eu"  ] = "əɨ̯";
		["ew"  ] = "eːu̯";
		["ey"  ] = "aɨ̯";
	};
	-- Welsh f is /v/ and ff is /f/. The inherited German entry
	-- (["f"] = "f") used to follow this table under the same key and
	-- overrode it, so it has been removed.
	["f"] = {
		["f"   ] = "v";
		["ff"  ] = "f";
	}; -- Here, Arafsymudwr stopped editing and what follows is from Module:de-IPA
	["g"] = "ɡ";
	["h"] = "h";
	["i"] = {
		["i"   ] = "ɪ";
		["ie"  ] = "iː";
	};
	["j"] = "j";
	["k"] = {
		["k"   ] = "k";
		["kk"  ] = "k";
		["ck"  ] = "k";
	};
	["l"] = "l";
	["m"] = "m";
	["n"] = {
		["n"   ] = "n";
		["ng"  ] = "ŋ";
		["nn"  ] = "n";
	};
	["o"] = {
		["oo"  ] = "oː";
		["os"  ] = { "ɔ", "s" };
		["o"   ] = "ɔ";
	};
	["ö"] = {
		 -- XXX: sometimes /øː/
		["ö"   ] = "œ";
		["ös"  ] = { "œ", "s" };
	};
	["p"] = {
		["ph"  ] = "f";
		["pp"  ] = "p";
		["p"   ] = "p";
	};
	["q"] = {
		["qu"  ] = { "k", "f" };
		["q"   ] = "k"; -- XXX
	};
	["r"] = {
		 -- XXX: /ʀ/? /r/?; sometimes /ɐ/ ("Uhr"); also /ər/ ("oder")
		["r"   ] = "r";
		["rr"  ] = "r";
	};
	["s"] = {
		["s"   ] = "s";
		["sch" ] = "ʃ";
		["sp"  ] = { "ʃ", "p" };
		["ss"  ] = "s";
		["st"  ] = { "ʃ", "t" };
	};
	["t"] = {
		["t"   ] = "t";
		["tsch"] = "t͡ʃ";
		["tt"  ] = "t";
		["tion"] = { "t͡s", "i̯", "o", "n" };
	};
	["u"] = {
		["u"   ] = "ʊ";
		["uch" ] = { "ʊ", "x" };
	};
	["ü"] = {
		["ü"   ] = "yː";
		["üh"  ] = "yː";
	};
	["v"] = "f";
	["w"] = "ʋ";
	["x"] = { "k", "s" }; -- XXX
	["y"] = "i";
	["z"] = "z"; -- already converted from s
	["ß"] = "s";
	["́"] = "ˈ"; -- FIXME
	["-"] = {};
}

--- Convert a spelled term into a string of phones using `sequences`.
--
-- Can be called with plain strings, or with a frame-like table, in which case
-- the arguments are read from `text.args` (positional 1, `orig`, `pos`) and
-- empty strings count as absent.
--
-- NOTE(review): the ch/sch/ck/z rewrites and the s-voicing rule below are
-- inherited from Module:de-IPA and have not been adapted to Welsh yet --
-- confirm each one before relying on the output.
--
-- @param text  term to convert, or a frame-like table; falls back to the
--              current page title when absent
-- @param orig  read from the arguments but currently unused
-- @param pos   read from the arguments but currently unused
-- @return the converted string, with word-boundary markers and hyphens removed
function export.IPA(text, orig, pos)
	if type(text) == 'table' then
		text, orig, pos = ine(text.args[1]), ine(text.args.orig), ine(text.args.pos)
	end
	text = text or mw.title.getCurrentTitle().text
	text = ulower(text)
	-- Decompose so every diacritic is a separate combining character, then
	-- recompose only a/o/u + diaeresis into ä ö ü.
	text = mw.ustring.toNFD(text)
	-- If a diaeresis was typed after stress/breve marks, move it in front of
	-- them so that it sits directly after its vowel.
	text = rsub(text, "(" .. accents_r .. ")" .. DIA, DIA .. "%1")
	-- Only vowel + diaeresis is recomposed. The inherited code also rewrote
	-- plain "ae", "oe" and "ue" to ä ö ü (a German spelling convention), which
	-- meant the "ae" entry in `sequences` could never be reached.
	text = rsub(text, "([aou])" .. DIA, {a="ä", o="ö", u="ü"})
	-- put breves before acute/grave accents
	text = rsub(text, "(" .. stress_accent_c .. ")" .. BREVE, BREVE .. "%1")

	-- To simplify checking for word boundaries and liaison markers, we
	-- add ⁀ at the beginning and end of all words, and remove it at the end.
	-- Note that the liaison marker is ‿.
	text = rsub(text, "%s*,%s*", '⁀⁀ | ⁀⁀')
	text = rsub(text, "%s+", '⁀ ⁀')
	text = rsub(text, "%-+", '⁀-⁀')
	text = '⁀⁀' .. text .. '⁀⁀'

	text = rsub(text, "([aou]" .. accents_r .. ")" .. "ch", "%1χ")
	text = rsub(text, "sch", "ʃ")
	text = rsub(text, "ch", "ç")
	text = rsub(text, "ck", "kk")
	text = rsub(text, "z", "c")
	text = rsub(text, "s(" .. vowel_c .. ")", "z%1")
	-- A devoicing pass copied from Module:de-IPA used to follow here. It
	-- relied on `cons_or_boundary_c` and `devoiced_cons`, neither of which is
	-- defined in this module, so every call raised an error. It also rewrote
	-- single letters, which would have split the digraphs dd and ng before
	-- the table lookup below. The pass has been dropped.
	-- TODO: implement Welsh pre-consonantal devoicing (see the notes at the
	-- top of the module).

	-- Convert letters to phonemes
	local phones, i, n = {}, 1, ulen(text)
	while i <= n do
		local bid = ulower(usub(text, i, i))
		local value = sequences[bid]

		-- A table without an array part is a set of spelled sequences that
		-- start with this character: pick the longest one that matches the
		-- text at position i (falling back to the single character).
		if (type(value) == 'table') and not value[1] then
			local bidl = ulen(bid)
			for seq in pairs(value) do
				local seql = ulen(seq)
				if seql > bidl then
					if (ulower(usub(text, i, i + seql - 1)) == seq) then
						bid = seq
						bidl = ulen(bid)
					end
				end
			end
			value = value[bid]
		end

		if type(value) == 'string' then
			table.insert(phones, value)
		elseif not value then
			-- No mapping: copy the character(s) through unchanged.
			table.insert(phones, bid)
		else
			-- A list of phones for one spelling.
			for _, phone in ipairs(value) do
				table.insert(phones, phone)
			end
		end

		-- Skip past everything the chosen spelling consumed.
		i = i + ulen(bid)
	end

	text = table.concat(phones)
	--remove hyphens and word-boundary markers
	text = rsub(text, '[⁀%-]', '')
	return text
end

return export