Module:grc-translit

The following documentation is located at Module:grc-translit/documentation. ^[edit] Categories were auto-generated by Module:module categorization. ^[edit]

Useful links: subpage list • links • transclusions • testcases • sandbox (diff)

This module will transliterate Ancient Greek language text per WT:GRC TR. It is also used to transliterate Demotic, Greek, Paeonian, Old Ossetic, Dacian, Ancient Macedonian, and Phrygian. The module should preferably not be called directly from templates or other modules. To use it from a template, use {{xlit}}. Within a module, use Module:languages#Language:transliterate.

For testcases, see Module:grc-translit/testcases.

Functions

tr(text, lang, sc): Transliterates a given piece of text written in the script specified by the code sc, and language specified by the code lang. When the transliteration fails, returns nil.

2 of 36 tests failed. (refresh)

testcases for `tr` function in Module:grc-translit:
	Text	Expected	Actual
	λόγος	lógos	lógos
	σφίγξ	sphínx	sphínx
	ϝάναξ	wánax	wánax
	οἷαι	hoîai	hoîai
current problems
	ΙΧΘΥΣ	IKHTHUS	IKhThUS
	Υἱός	'''Hu'''iós	'''U'''hiós
u/y
	ταῦρος	taûros	taûros
	νηῦς	nēûs	nēûs
	σῦς	sûs	sûs
	ὗς	hûs	hûs
	γυῖον	guîon	guîon
	ἀναῡ̈τέω	anaṻtéō	anaṻtéō
	δαΐφρων	daḯphrōn	daḯphrōn
vowel length
	τῶν	tôn	tôn
	τοὶ	toì	toì
	τῷ	tôi	tôi
	τούτῳ	toútōi	toútōi
	σοφίᾳ	sophíāi	sophíāi
	μᾱ̆νός	mānós	mānós
h (rough breathing)
	ὁ	ho	ho
	οἱ	hoi	hoi
	εὕρισκε	heúriske	heúriske
	ὑϊκός	huïkós	huïkós
	πυρρός	purrhós	purrhós
	ῥέω	rhéō	rhéō
	σάἁμον	sáhamon	sáhamon
capitals
	Ὀδυσσεύς	Odusseús	Odusseús
	Εἵλως	Heílōs	Heílōs
	ᾍδης	Hā́idēs	Hā́idēs
	ἡ Ἑλήνη	hē Helḗnē	hē Helḗnē
punctuation
	ἔχεις μοι εἰπεῖν, ὦ Σώκρατες, ἆρα διδακτὸν ἡ ἀρετή;	ékheis moi eipeîn, ô Sṓkrates, âra didaktòn hē aretḗ?	ékheis moi eipeîn, ô Sṓkrates, âra didaktòn hē aretḗ?
	τί τηνικάδε ἀφῖξαι, ὦ Κρίτων; ἢ οὐ πρῲ ἔτι ἐστίν;	tí tēnikáde aphîxai, ô Krítōn? ḕ ou prṑi éti estín?	tí tēnikáde aphîxai, ô Krítōn? ḕ ou prṑi éti estín?
	τούτων φωνήεντα μέν ἐστιν ἑπτά· α ε η ι ο υ ω.	toútōn phōnḗenta mén estin heptá; a e ē i o u ō.	toútōn phōnḗenta mén estin heptá; a e ē i o u ō.
	πήγ(νῡμῐ)	pḗg(nūmi)	pḗg(nūmi)
HTML entities
	καλός καὶ ἀγαθός	kalós kaì agathós	kalós kaì agathós
	καλός καὶ ἀγαθός	kalós kaì agathós	kalós kaì agathós

-- Table of functions that this module exposes; it is returned at the end of the file.
local export = {}

-- Other modules this one relies on: a data table (source of the named
-- diacritics below) and a collection of string helpers.
local m_data = require("Module:grc-utilities/data")
local m_str_utils = require("Module:string utilities")

-- Splits the input text into the tokens that export.tr processes one at a time.
local tokenize = require('Module:grc-utilities').tokenize

-- Short local aliases for the string helpers used in this file.
-- NOTE(review): presumably Unicode-aware counterparts of string.find,
-- string.gsub, string.char, string.lower and string.upper — confirm against
-- Module:string utilities.
local ufind = m_str_utils.find
local ugsub = m_str_utils.gsub
local U = m_str_utils.char
local ulower = m_str_utils.lower
local uupper = m_str_utils.upper

-- Diacritics
-- Lookup table of diacritic characters by name, taken from the data module.
-- Every local below is one entry of this table.
local diacritics = m_data.named

-- Greek
-- Diacritics as they appear in the Greek input.
local acute = diacritics.acute
local grave = diacritics.grave
local circumflex = diacritics.circum
local diaeresis = diacritics.diaeresis
local smooth = diacritics.smooth
local rough = diacritics.rough
local macron = diacritics.macron
local breve = diacritics.breve
local subscript = diacritics.subscript

-- Latin
-- Circumflex used in the Latin output; the Greek circumflex is mapped to it
-- in the transliteration table.
local hat = diacritics.Latin_circum

-- Pattern: a macron, then an optional diaeresis, then the Latin circumflex.
-- Used to find a transliterated vowel that carries both a macron and a circumflex.
local macron_diaeresis = macron .. diaeresis .. "?" .. hat
-- Pattern: a string that starts with alpha (lower- or uppercase) and ends
-- with the subscript diacritic.
local a_subscript = '^[αΑ].*' .. subscript .. '$'
-- Letters treated as velars; this string is searched as plain text for the
-- token that follows a γ.
local velar = 'κγχξ'

-- Transliteration table: maps one lowercase Greek character (or one
-- combining diacritic) to its Latin replacement. Characters with no entry
-- here are left as they are by the substitution that uses this table.
local tt = {
	-- Vowels; eta and omega are written as e and o with a macron.
	["α"] = "a", ["ε"] = "e", ["ι"] = "i", ["ο"] = "o", ["υ"] = "u",
	["η"] = "e" .. macron,
	["ω"] = "o" .. macron,

	-- Consonants written with a single Latin letter.
	["β"] = "b", ["γ"] = "g", ["δ"] = "d", ["ζ"] = "z",
	["κ"] = "k", ["λ"] = "l", ["μ"] = "m", ["ν"] = "n",
	["ξ"] = "x", ["π"] = "p", ["ρ"] = "r", ["τ"] = "t",
	-- Both forms of sigma.
	["σ"] = "s", ["ς"] = "s",
	-- Consonants written with two Latin letters.
	["θ"] = "th", ["φ"] = "ph", ["χ"] = "kh", ["ψ"] = "ps",

	-- Archaic letters
	["ϛ"] = "st", ["ϝ"] = "w", ["ϻ"] = "ś",
	["ϙ"] = "q", ["ϡ"] = "š", ["ͷ"] = "v",

	-- Incorrect characters: see [[Wiktionary:About Ancient Greek#Miscellaneous]].
	-- These are tracked by [[Module:script utilities]].
	["ϐ"] = "b", ["ϑ"] = "th", ["ϰ"] = "k",
	["ϱ"] = "r", ["ϲ"] = "s", ["ϕ"] = "ph",

	-- Diacritics. Macron, diaeresis, grave and acute have no entry and so
	-- stay unchanged.
	-- Dropped from the output:
	[breve] = "", [smooth] = "", [rough] = "",
	-- Replaced:
	[circumflex] = hat,
	[subscript] = "i",
}

-- Transliterate Greek-script text into Latin script.
--
-- Parameters:
--   text      the string to transliterate.
--   lang, sc  language and script codes; part of the signature but not read
--             by this function.
-- Returns the transliterated string.
function export.tr(text, lang, sc)
	-- Input consisting solely of this one character is transliterated as h.
	if text == '῾' then
		return 'h'
	end
	
	--[[
		Replace semicolon or Greek question mark with regular question mark,
		except after an ASCII alphanumeric character (to avoid converting
		semicolons in HTML entities).
		The pattern requires a preceding character, so a mark at the very
		start of the text is left untouched.
	]]
	text = ugsub(text, "([^A-Za-z0-9])[;" .. U(0x37E) .. "]", "%1?")
	
	-- Handle the middle dot. It is equivalent to semicolon or colon, but semicolon is probably more common.
	-- This runs after the replacement above, so the semicolon produced here
	-- is not turned into a question mark.
	text = text:gsub("·", ";")
	
	local tokens = tokenize(text)
	
	-- Read the tokens in sequence. ipairs is used because the pieces are
	-- appended in iteration order and neighbouring tokens are looked up by
	-- index; pairs gives no guarantee about the order of traversal.
	local output = {}
	for i, token in ipairs(tokens) do
		-- Lowercased once and reused for the capitalization check below.
		local lower_token = ulower(token)
		
		-- Substitute each character of the lowercased token for its
		-- transliteration. The pattern matches one byte plus any following
		-- bytes in the range 128-191, i.e. one UTF-8 encoded character;
		-- characters without an entry in tt are kept as they are.
		local translit = lower_token:gsub(".[\128-\191]*", tt)
		
		local next_token = tokens[i + 1]
		
		-- NOTE(review): the first two branches compare against lowercase
		-- letters only, so capital Γ before a velar and capital Ρ after Ρ are
		-- not given special treatment — confirm whether that is intended.
		if token == 'γ' and next_token and velar:find(next_token, 1, true) then
			-- γ before a velar should be <n>
			translit = 'n'
		elseif token == 'ρ' and tokens[i - 1] == 'ρ' then
			-- ρ after ρ should be <rh>
			translit = 'rh'
		elseif ufind(token, a_subscript) then
			-- add macron to ᾳ
			translit = ugsub(translit, '([aA])', '%1' .. macron)
		end
		
		-- Rough breathing: h follows a rho and precedes anything else.
		if token:find(rough) then
			if ufind(token, '^[Ρρ]') then
				translit = translit .. 'h'
			else -- vowel
				translit = 'h' .. translit
			end
		end
		
		-- Remove macron from a vowel that has a circumflex.
		if ufind(translit, macron_diaeresis) then
			translit = translit:gsub(macron, '')
		end
		
		-- Capitalize first character of transliteration when the token
		-- contained an uppercase letter. Only the first character is
		-- uppercased, so a digraph such as kh comes out as Kh.
		if token ~= lower_token then
			translit = translit:gsub("^.[\128-\191]*", uupper)
		end
		
		output[#output + 1] = translit
	end
	
	return table.concat(output)
end

-- Hand the table of exported functions to whoever loads this module.
return export