-- This is a private module sandbox of Babr, for their own experimentation. Items in this module may be added and removed at Babr's discretion; do not rely on this module's stability.


-- Table that collects this module's public functions. Presumably populated and
-- returned further down the file (not visible in this section) -- TODO confirm.
local export = {}

local m_string_utils = require("Module:string utilities")
local gcodepoint = m_string_utils.gcodepoint
local rfind = m_string_utils.find
local rsubn = m_string_utils.gsub
local rmatch = m_string_utils.match
local rsplit = m_string_utils.split
local U = m_string_utils.char

-- Combining marks and invisible formatting characters, built from their code
-- points so that they stay legible in source.

-- Tanwin (nunation) marks: U+064B, U+064C, U+064D.
local fatHataan, Dammataan, kasrataan = U(0x64B), U(0x64C), U(0x64D) -- an, un, in

-- Short-vowel marks: U+064E, U+0650, U+064F.
local zabar, zer, pesh = U(0x64E), U(0x650), U(0x64F)

-- Other marks written above a letter.
local tashdid = U(0x651) -- also called shadda
local jazm = U(0x652)
local highhmz = U(0x654)

-- The letter he, by code point.
local he = U(0x647)

-- Zero-width characters.
local zwnj = U(0x200C)
local lrm, rlm = U(0x200E), U(0x200F) -- left-to-right mark, right-to-left mark

-- Individual letters that get special handling. Literals are kept one per
-- line so that right-to-left text does not visually reorder the declarations.

-- Alif forms.
local alif = "ا"
local alif_madd = "آ"
local dagger_alif = U(0x670)

-- Hamza, bare and on its carriers.
local hamza = "ء"
local hamza_yaa = "ئ"
local hamza_waaw = "ؤ"

-- Letters that can be read as either a consonant or a vowel.
local waaw = "و"
local yaa = "ی" -- farsi ye
local returning_yaa = "ے" -- baRi ye

-- Ta marbuta (U+0629).
local marbuta = U(0x629)

-- Alif wasla (U+0671). The table below keys on this name, but it was never
-- defined in the file, so the key evaluated to nil and the table constructor
-- raised "table index is nil" as soon as the module was loaded.
local alif_wasla = U(0x671)

-- Per-character transliteration table: each key is a single source character
-- (letter, diacritic, ligature, digit or punctuation mark) and each value is
-- the Latin text that replaces it. An empty string means the character
-- produces no output.
local mapping = {
	["آ"] = "ā",
	["ب"] = "b",
	["پ"] = "p",
	["ت"] = "t",
	["ث"] = "s",
	["ج"] = "j",
	["چ"] = "č",
	["ح"] = "h",
	["خ"] = "x",
	["د"] = "d",
	["ذ"] = "z",
	["ر"] = "r",
	["ز"] = "z",
	["ژ"] = "ž",
	["س"] = "s",
	["ش"] = "š",
	["ص"] = "s",
	["ض"] = "z",
	["ط"] = "t",
	["ظ"] = "z",
	["غ"] = "ğ",
	["ف"] = "f",
	["ق"] = "q",
	["ک"] = "k",
	["گ"] = "g",
	["ل"] = "l",
	["م"] = "m",
	["ن"] = "n",
	-- waaw and yaa default to their long-vowel readings here
	["و"] = "ō",
	["ی"] = "ē",
	["۔"] = ".",

	["ه"] = "h",

	-- ain and every hamza form share the same apostrophe
	["ع"] = "'",
	["ء"] = "'",
	["ئ"] = "'",
	["ؤ"] = "'",
	["أ"] = "'",

	-- diacritics
	[zabar] = "a",
	[zer] = "i",
	[pesh] = "u",
	[fatHataan] = "an",
	[kasrataan] = "in",
	[Dammataan] = "un",
	[jazm] = "", -- also sukun - no vowel
	[zwnj] = "-", -- ZWNJ (zero-width non-joiner)
	[highhmz] = "-yi",

	-- ligatures
	["ﻻ"] = "lā",
	["ﷲ"] = "allāh",

	-- kashida
	["ـ"] = "‐", -- kashida, no sound; written out as a hyphen character

	-- alif wasla
	[alif_wasla] = "", -- nothing

	-- numerals
	["۱"] = "1",
	["۲"] = "2",
	["۳"] = "3",
	["۴"] = "4",
	["۵"] = "5",
	["۶"] = "6",
	["۷"] = "7",
	["۸"] = "8",
	["۹"] = "9",
	["۰"] = "0",

	-- punctuation (leave on separate lines)
	["؟"] = "?", -- question mark
	["،"] = ",", -- comma
	["؛"] = ";", -- semicolon
	["«"] = "“", -- quotation mark
	["»"] = "”", -- quotation mark
	["٪"] = "%", -- percent
	["؉"] = "‰", -- per mille
	["٫"] = ".", -- decimal separator
	["٬"] = ",", -- thousands separator

	-- regional characters (FOR VERY SPECIFIC USE CASES)
	["ټ"] = "ṭ",
	["ٹ"] = "ṭ",
	["ډ"] = "ḍ",
	["ڈ"] = "ḍ",
	-- balti
	-- can't do anything about ژ because it conflicts with persian
	["ڃ"] = "ž",
	["ڇ"] = "č̣",
	["ڑ"] = "ṛ",
	["ڗ"] = "dz",
	["ݜ"] = "ṣ",
	["ݨ"] = "ng",
	["ݩ"] = "ny",
	["ھ"] = "h",
	["ے"] = "e",
}
-- Character sets, stored as plain strings. `punctuation` already carries
-- %-escapes and `space_like_class` is wrapped in brackets, so these are
-- presumably spliced into pattern character classes further down the file
-- (not visible in this section) -- TODO confirm.
local sun_letters = "تثدذرزسشصضطظلن"
local punctuation = ":%(%)%[%]*&٫؛؟،ـ«\".'!»٪؉۔`,/–—%{%}"
local numbers = "۱۲۳۴۵۶۷۸۹۰"

local balticonsonants = "ڃڇڑڗݜݨݩǩ" -- for any other languages using this module

local consonants_needing_vowels = "بپتټٹثجچحخدډڈذرزژسشصضطظعغفقکگلمنؤهئء" .. balticonsonants
-- This line used to concatenate `malif`, a name that is not defined anywhere
-- in the file, so loading the module failed with "attempt to concatenate a nil
-- value". `alif_madd` (the madda-alif declared near the top of the file) looks
-- like the intended value.
-- NOTE(review): confirm that `malif` was meant to be `alif_madd`.
local rconsonants = consonants_needing_vowels .. alif_madd .. "وی"
local lconsonants = consonants_needing_vowels -- yaa and waaw can be vowels w/o diacritics

-- Whitespace, the apostrophe and the double quote, plus ZWNJ in the bracketed
-- class form.
local space_like = "%s'" .. '"'
local space_like_class = "[" .. space_like .. zwnj .. "]"