Open main menu

Wiktionary β

Module:data consistency check

This module checks the validity and internal consistency of the language, language family, and script data used on Wiktionary: the modules in Category:Language data modules as well as Module:scripts/data.

Checks performed

For multiple data modules:

  • Codes for languages, families and etymology-only languages must be unique and cannot clash with one another.
  • Canonical names for languages, families, and etymology-only languages must not be found in the list of other names.
  • Each name in the list of other names must appear only once.
  • otherNames, if present, must be an array.

Codes in Module:languages data must:

  • Be defined in the correct submodule according to whether the code is two-letter, three-letter or exceptional.
  • Have canonicalName, which must not be the same as the canonical name of another language.
  • If scripts is given, then it must be an array, and each string in the array must be a valid script code.
  • If family is given, it must be a valid family code.
  • If type is given, it must be one of the recognised values (regular, reconstructed, appendix-constructed).
  • If entry_name is given, it must contain two arrays (from and to).
  • If sort_key is given, it must be either a string or a table containing two arrays (from and to).
  • If entry_name or sort_key is given, the from array must be longer or equal in length to the to array.
  • If standardChars is given, it must form a valid Lua string pattern when placed between square brackets with ^ before it ("[^...]"). (It should match all characters regularly used in the language, but that cannot be tested.)
  • Have no data keys besides these: "canonicalName", "entry_name", "sort_key", "otherNames", "type", "scripts", "family", "ancestors", "wikimedia_codes", "wikipedia_article", "standardChars", "translit_module", "override_translit", "link_tr", "wikidata_item".

Checks not performed:

  • If translit_module is present, it should be the name of a module, and this module should contain a tr function that takes a pagename (and optionally a language code and script code) as arguments.
  • If sort_key is a string, it should be the name of a module, and this module should contain a makeSortKey function that takes a pagename (and optionally a language code and script code) as arguments.

These are not checked here, because module errors will quickly crop up in entries if these conditions are not met, assuming that Module:utilities attempts to generate a sortkey for a category pertaining to the language in question, or full_link attempts to use the transliteration module.

Module:languages/code to canonical name and Module:languages/canonical names must contain all the codes and canonical names found in the data submodules of Module:languages, and no more.

Codes in Module:etymology languages data must:

  • Have canonicalName.
  • Have parent, which must be a valid language, family or etymology-only language code.
  • Have no data keys besides these: "canonicalName", "otherNames", "parent", "ancestors", "wikipedia_article", "wikidata_item".

Codes in Module:families data must:

  • Have canonicalName, which must not be the same as the canonical name of another family.
  • If family is given, it must be a valid family code.
  • Have at least one language or subfamily belonging to it.
  • Have no data keys besides these: "canonicalName", "otherNames", "family", "protoLanguage", "wikidata_item".

Codes in Module:scripts data must:

  • Have canonicalName.
  • Have at least one language that lists it as one of its scripts.
  • Have a characters pattern for script autodetection, and this must form a valid Lua string pattern when placed between square brackets ("[...]"). (It should match all characters in the script, but that cannot be tested.)
  • Have no data keys besides these: "canonicalName", "otherNames", "parent", "systems", "wikipedia_article", "characters", "direction".

Output

Discrepancies detected:

Module:families/data

Module:languages/canonical names

  • The canonical name Proto-Sanglechi-Ishkashimi (ira-sgi-pro) is missing.

Module:languages/code to canonical name

  • The code ira-sgi-pro (Proto-Sanglechi-Ishkashimi) is missing.
  • The code ira-wnj (Wanji) is missing.

Module:languages/datax

  • Wanji language (ira-wnj) has a canonical name that is not unique, it is also used by the code wbi.

Module:scripts/data


local export = {}

local m_table = require("Module:table")
local list_to_set = m_table.listToSet

-- Discrepancy messages, grouped by module name. Looking up a module name
-- that has no list yet creates an empty list, stores it and returns it.
local messages = setmetatable({}, {
	__index = function(tbl, modname)
		local list = {}
		tbl[modname] = list
		return list
	end,
})

--- Record one discrepancy message under the given module name.
-- @param modname  module name (without "Module:") the message is filed under
-- @param fmt      string.format format string
-- @param ...      arguments for the format string
--
-- In Lua 5.1 the "%s" specifier only accepts strings and numbers, so a nil
-- canonical name (or any other non-string value) would raise an error and
-- abort the whole report. Such arguments are converted first: nil becomes
-- "???" (the placeholder the link helpers also use), anything else that is
-- neither a string nor a number goes through tostring.
local function discrepancy(modname, fmt, ...)
	local unpack = unpack or table.unpack
	local n = select("#", ...)
	local args = { ... }
	for i = 1, n do
		local value = args[i]
		local kind = type(value)
		if value == nil then
			args[i] = "???"
		elseif kind ~= "string" and kind ~= "number" then
			args[i] = tostring(value)
		end
	end
	table.insert(messages[modname], string.format(fmt, unpack(args, 1, n)))
end

-- Maps every code seen so far to the name of the data module that defined it.
local all_codes = {}

-- Each of these maps a canonical name to the code that first claimed it.
local language_names = {}
local family_names = {}
local script_names = {}

-- Sets (code -> true) of family and script codes that some checked entry
-- refers to; used later to report families and scripts that are never used.
local nonempty_fams = {}
local nonempty_scrs = {}
	
--- Build a wikilink to the category of a language.
-- Returns "???" when no name is given. " language" is appended to the
-- name unless it already ends in "language" or "Language".
local function link(name)
	if not name then
		return "???"
	end
	local title = name
	if not name:find("[Ll]anguage$") then
		title = name .. " language"
	end
	return "[[:Category:" .. title .. "|" .. title .. "]]"
end
	
--- Build a wikilink to the category of a script.
-- Returns "???" when no name is given. Names ending in "code" or
-- "semaphore" link to a category named after the script itself (with its
-- first letter upper-cased); every other name gets " script" appended.
local function link_script(name)
	if not name then
		return "???"
	end
	local stands_alone = name:find("[Cc]ode$") or name:find("[Ss]emaphore$")
	if not stands_alone then
		local title = name .. " script"
		return "[[:Category:" .. title .. "|" .. title .. "]]"
	end
	-- Only the first return value of gsub (the string) is wanted.
	local category = name:gsub("^%l", string.upper)
	return "[[:Category:" .. category .. "|" .. name .. "]]"
end

local m_fun = require("Module:fun")
local map = m_fun.map

--- Report the unrecognised keys found in one data table.
-- @param modname       module the data table comes from
-- @param code          code of the language, family or script
-- @param data          the data table itself
-- @param invalid_keys  array of the offending keys (at least one)
-- @param is_script     true to link the name as a script rather than a language
local function invalid_keys_message(modname, code, data, invalid_keys, is_script)
	local plural = invalid_keys[2] ~= nil
	local make_link = is_script and link_script or link
	-- Language data keeps its canonical name at index 1 rather than under
	-- "canonicalName"; fall back to it so the message does not show "???".
	local name = data.canonicalName or data[1]
	local formatted_keys = map(
		function(key)
			-- tostring: keys need not be strings (e.g. stray booleans).
			return '<code>' .. tostring(key) .. '</code>'
		end,
		invalid_keys)
	discrepancy(modname, "The data key%s %s for %s (<code>%s</code>) %s invalid.",
		plural and "s" or "",
		table.concat(formatted_keys, ", "),
		make_link(name),
		code,
		plural and "are" or "is")
end

--- Create a checker that reports keys outside an allowed list.
-- @param valid_keys  array of the keys a data table may contain
-- @param is_script   passed through to invalid_keys_message
-- @return function(modname, code, data) that reports, in one message, every
--         key of `data` that is not in `valid_keys`
local function check_data_keys(valid_keys, is_script)
	local allowed = list_to_set(valid_keys)
	
	return function (modname, code, data)
		local unknown = {}
		local count = 0
		for key in pairs(data) do
			if not allowed[key] then
				count = count + 1
				unknown[count] = key
			end
		end
		if count > 0 then
			invalid_keys_message(modname, code, data, unknown, is_script)
		end
	end
end

--- Check a list of alternative names against the canonical name.
-- Reports every alternative name that equals the canonical name and every
-- alternative name that occurs more than once in the list.
local function check_other_names(modname, code, canonical_name, other_names)
	local seen = {}
	for _, alias in ipairs(other_names or {}) do
		if alias == canonical_name then
			discrepancy(modname,
				"%s, the canonical name for <code>%s</code>, is repeated in the table of <code>otherNames</code>.",
				canonical_name, code)
		end
		if seen[alias] then
			discrepancy(modname,
				"The name %s is found twice or more in the list of <code>otherNames</code> for %s (<code>%s</code>)",
				alias, canonical_name, code)
		else
			seen[alias] = true
		end
	end
end

local get_codepoint = mw.ustring.codepoint
--- Validate a character-set pattern (the inside of a "[...]" Lua pattern).
-- @param pattern        the pattern text to validate
-- @param modname        module the data table comes from
-- @param code           code of the language or script
-- @param data           the data table the pattern belongs to
-- @param standardChars  true when the pattern is a language's standardChars,
--                       false when it is a script's detection pattern
--
-- Reports a non-string pattern, any range "a-b" whose first codepoint is not
-- lower than its second, and a pattern that fails to compile inside brackets.
local function validate_pattern(pattern, modname, code, data, standardChars)
	-- Language data keeps the canonical name at index 1.
	local name = data.canonicalName or data[1]
	
	if type(pattern) ~= "string" then
		discrepancy(modname, '"%s", the %spattern for %s (<code>%s</code>), is not a string.',
			tostring(pattern), standardChars and 'standard character ' or '', name or "???", code)
		-- The remaining checks need a string; stop here instead of erroring.
		return
	end
	
	local ranges
	for lower, higher in mw.ustring.gmatch(pattern, "(.)%-(.)") do
		if get_codepoint(lower) >= get_codepoint(higher) then
			ranges = ranges or {}
			table.insert(ranges, { lower, higher })
		end
	end
	if ranges and ranges[1] then
		local plural = ranges[2] and "s" or ""
		discrepancy(modname, '%s (<code>%s</code>) specifies an invalid pattern ' ..
			'for %scharacter detection: <code>"%s"</code>. The first codepoint%s ' ..
			'in the range%s %s %s must be less than the second.',
			link(name), code, standardChars and 'standard ' or '', pattern, plural, plural,
			table.concat(
				map(
					function(range)
						return range[1] .. "-" .. range[2] .. (" (U+%X, U+%X)")
							:format(get_codepoint(range[1]), get_codepoint(range[2]))
					end,
					ranges),
				", "),
			ranges[2] and "are" or "is")
	end
	
	-- Compile the pattern against an empty subject; only validity matters.
	if not pcall(mw.ustring.find, "", "[" .. pattern .. "]") then
		discrepancy(modname, '%s (<code>%s</code>) specifies an invalid pattern for ' ..
			'%scharacter detection: <code>"%s"</code>',
			link(name), code, standardChars and 'standard ' or '', pattern)
	end
end

-- Adapted from isArray in [[Module:table]].
-- Counts the keys of t, then returns the first index in 1..count that holds
-- nil. Returns nothing when t[1]..t[count] are all present, i.e. when t is a
-- gapless array with no other keys.
local function find_gap(t)
	local key_count = 0
	for _ in pairs(t) do
		key_count = key_count + 1
	end
	for index = 1, key_count do
		if t[index] == nil then
			return index
		end
	end
end

--- Check that data[array_name] is an array without gaps.
-- @param modname     module the data table comes from
-- @param code        code of the language, family or script
-- @param data        the data table
-- @param array_name  key in `data` that should hold an array
--
-- A value that is not a table is reported instead of being handed to
-- find_gap, which would raise an error on it.
local function check_array(modname, code, data, array_name)
	-- Language data keeps the canonical name at index 1; a nil name must
	-- not reach "%s".
	local name = data.canonicalName or data[1] or "???"
	local array = data[array_name]
	
	if type(array) ~= "table" then
		discrepancy(modname, "The %s field in the data table for %s (<code>%s</code>) must be an array.",
			array_name, name, code)
		return
	end
	
	local gap = find_gap(array)
	if gap then
		discrepancy(modname, "The %s array in the data table for %s (<code>%s</code>) has a gap at index %d.",
			array_name, name, code, gap)
	end
end

local repl_keys = { "from", "to" }

--- Validate the entry_name or sort_key field of a language data table.
-- @param modname            module the data table comes from
-- @param code               language code
-- @param data               the language data table
-- @param replacements_name  "entry_name" or "sort_key"
--
-- sort_key may be a string, which is accepted without further checks.
-- Otherwise the field must be a table holding gapless "from" and "to"
-- arrays, and "to" must not be longer than "from".
local function check_entry_name_or_sortkey(modname, code, data, replacements_name)
	local replacements = data[replacements_name]
	-- Language data keeps the canonical name at index 1.
	local name = data.canonicalName or data[1] or "???"
	local kind = type(replacements)
	
	if kind == "string" and replacements_name == "sort_key" then
		return
	end
	
	if kind ~= "table" then
		discrepancy(modname, "The %s field in the data table for %s (<code>%s</code>) must be %s.",
			replacements_name, name, code,
			replacements_name == "sort_key" and "a string or a table" or "a table")
		return
	end
	
	for _, key in ipairs(repl_keys) do
		local array = replacements[key]
		if type(array) ~= "table" then
			-- Report a missing or malformed array rather than crashing in find_gap.
			discrepancy(modname, "The %s table for %s (<code>%s</code>) has no <code>%s</code> array.",
				replacements_name, name, code, key)
		else
			local gap = find_gap(array)
			if gap then
				discrepancy(modname, "The %s array in the %s table for %s (<code>%s</code>) has a gap at index %d.",
					key, replacements_name, name, code, gap)
			end
		end
	end
	
	local from, to = replacements.from, replacements.to
	if type(from) == "table" and type(to) == "table"
			and m_table.length(to) > m_table.length(from) then
		discrepancy(modname,
			"The <code>from</code> array in the %s table for %s (<code>%s</code>) must be longer than or equal in length to the <code>to</code> array.",
			replacements_name, name, code)
	end
end

--- Check every language data submodule and the two lookup modules built
-- from them ("code to canonical name" and "canonical names").
--
-- Fills all_codes and language_names, and marks the scripts and families
-- that languages refer to in nonempty_scrs and nonempty_fams.
--
-- Language data tables keep the canonical name, Wikidata item and family at
-- indices 1, 2 and 3, so the name is always read from data[1] here.
local function check_languages()
	local m_family_data = mw.loadData('Module:families/data')
	local m_script_data = mw.loadData('Module:scripts/data')
	local m_language_codes = mw.loadData('Module:languages/code to canonical name')
	local m_language_canonical_names = mw.loadData('Module:languages/canonical names')
	
	local check_language_data_keys = check_data_keys{
		1, 2, 3, -- canonical name, wikidata item, family
		"entry_name", "sort_key", "otherNames", "type", "scripts", "ancestors",
		"wikimedia_codes", "wikipedia_article", "standardChars",
		"translit_module", "override_translit", "link_tr",
	}
	
	-- Runs every per-language check on one data table.
	local function check_language(modname, code, data)
		local canonical_name, wikidata_item, lang_type = data[1], data[2], data.type
		
		check_language_data_keys(modname, code, data)
		
		if all_codes[code] then
			discrepancy(modname, "Code <code>%s</code> is not unique, is also defined in [[Module:%s]].", code, all_codes[code])
		else
			if not m_language_codes[code] then
				-- canonical_name may be missing; never pass nil to "%s".
				discrepancy("languages/code to canonical name", "The code <code>%s</code> (%s) is missing.",
					code, canonical_name or "???")
			end
			all_codes[code] = modname
		end
		
		if not canonical_name then
			discrepancy(modname, "Code <code>%s</code> has no canonical name specified.", code)
		elseif language_names[canonical_name] then
			discrepancy(modname,
				"%s (<code>%s</code>) has a canonical name that is not unique, it is also used by the code <code>%s</code>.",
				link(canonical_name), code, language_names[canonical_name])
		else
			if not m_language_canonical_names[canonical_name] then
				discrepancy("languages/canonical names", "The canonical name %s (<code>%s</code>) is missing.", canonical_name, code)
			end
			language_names[canonical_name] = code
		end
		
		if wikidata_item then
			-- A non-string item has no :match method; treat it as malformed.
			if type(wikidata_item) ~= "string" or not wikidata_item:match('^Q%d+$') then
				discrepancy(modname,
					"%s (<code>%s</code>) has a Wikidata item with an invalid form: <code>%s</code>.",
					canonical_name or "???", code, tostring(wikidata_item))
			end
		end
		
		if data.otherNames then
			check_other_names(modname, code, canonical_name, data.otherNames)
			check_array(modname, code, data, "otherNames")
		end
		
		if lang_type and (lang_type ~= "regular") and (lang_type ~= "reconstructed") and (lang_type ~= "appendix-constructed") then
			discrepancy(modname, "%s (<code>%s</code>) is of an invalid type <code>%s</code>.", link(canonical_name), code, data.type)
		end
		
		if data.scripts then
			check_array(modname, code, data, "scripts")
			if not data.scripts[1] then
				discrepancy(modname, "%s (<code>%s</code>) has no scripts listed.", link(canonical_name), code)
			else
				for _, sccode in ipairs(data.scripts) do
					if not m_script_data[sccode] then
						discrepancy(modname,
							"%s (<code>%s</code>) lists an invalid script code <code>%s</code>.",
							link(canonical_name), code, sccode)
					end
					
					nonempty_scrs[sccode] = true
				end
			end
		end
		
		if data[3] then
			local family = data[3]
			if not m_family_data[family] then
				discrepancy(modname,
					"%s (<code>%s</code>) has an invalid family code <code>%s</code>.",
					link(canonical_name), code, family)
			end
			
			nonempty_fams[family] = true
		end
		
		if data.sort_key then
			check_entry_name_or_sortkey(modname, code, data, "sort_key")
		end
		
		if data.entry_name then
			check_entry_name_or_sortkey(modname, code, data, "entry_name")
		end
		
		if data.standardChars then
			validate_pattern(data.standardChars, modname, code, data, true)
		end
	end
	
	-- Check two-letter codes
	local modname = "languages/data2"
	local data2 = mw.loadData("Module:" .. modname)
	
	for code, data in pairs(data2) do
		if not code:find("^[a-z][a-z]$") then
			discrepancy(modname, '%s (<code>%s</code>) does not have a two-letter code.', link(data[1]), code)
		end
		
		check_language(modname, code, data)
	end
	
	-- Check three-letter codes: one submodule per initial letter
	for i = string.byte('a'), string.byte('z') do
		local letter = string.char(i)
		local letter_modname = "languages/data3/" .. letter
		local data3 = mw.loadData("Module:" .. letter_modname)
		local code_pattern = "^" .. letter .. "[a-z][a-z]$"
		
		for code, data in pairs(data3) do
			if not code:find(code_pattern) then
				discrepancy(letter_modname,
					'%s (<code>%s</code>) does not have a three-letter code starting with "<code>%s</code>".',
					link(data[1]), code, letter)
			end
			
			check_language(letter_modname, code, data)
		end
	end
	
	-- Check exceptional codes
	modname = "languages/datax"
	local datax = mw.loadData("Module:" .. modname)
	
	for code, data in pairs(datax) do
		if code:find("^[a-z][a-z][a-z]?$") then
			discrepancy(modname, '%s (<code>%s</code>) has a two- or three-letter code.', link(data[1]), code)
		end
		
		check_language(modname, code, data)
	end
	
	-- These checks must be done while all_codes only contains language codes:
	-- that is, after language data modules have been processed, but before
	-- etymology languages, families, and scripts have.
	local function check_code_and_name(lookup_modname, code, canonical_name)
		if not all_codes[code] then
			if not language_names[canonical_name] then
				discrepancy(lookup_modname,
					"The code <code>%s</code> and the canonical name %s should be removed; they are not found in a submodule of [[Module:languages]].",
					code, canonical_name)
			else
				discrepancy(lookup_modname,
					"<code>%s</code>, the code for the canonical name %s, is wrong; it should be <code>%s</code>.",
					code, canonical_name, language_names[canonical_name])
			end
		elseif not language_names[canonical_name] then
			local data_table = mw.loadData("Module:" .. all_codes[code])[code]
			discrepancy(lookup_modname,
				"%s, the canonical name for the code <code>%s</code>, is wrong; it should be %s.",
				canonical_name, code, data_table[1] or data_table.canonicalName or "???")
		end
	end
	
	for code, canonical_name in pairs(m_language_codes) do
		check_code_and_name("languages/code to canonical name", code, canonical_name)
	end
	
	for canonical_name, code in pairs(m_language_canonical_names) do
		check_code_and_name("languages/canonical names", code, canonical_name)
	end
end

--- Check the etymology-only language data.
--
-- Verifies allowed keys, code uniqueness, presence of a canonical name, the
-- otherNames list and the parent code, then looks for cycles in the chain of
-- parents. Every parent code is marked in nonempty_fams.
local function check_etym_languages()
	local modname = "etymology languages/data"
	local m_etym_language_data = require("Module:" .. modname) -- no mw.loadData
	local m_language_data = mw.loadData("Module:languages/alldata")
	local m_family_data = mw.loadData('Module:families/data')
	
	local check_etymology_language_data_keys = check_data_keys{
		"canonicalName", "otherNames", "parent",
		"wikipedia_article", "wikidata_item"
	}
	
	-- Plain-text name (no wikilink) for use in messages.
	local function link(name)
		if not name then
			return "???"
		elseif name:find("[Ll]anguage$") then
			return name
		else
			return name .. " language"
		end
	end
	
	for code, data in pairs(m_etym_language_data) do
		local canonical_name, parent = data.canonicalName, data.parent
		check_etymology_language_data_keys(modname, code, data)
		
		if all_codes[code] then
			discrepancy(modname, "Code <code>%s</code> is not unique, is also defined in [[Module:%s]].", code, all_codes[code])
		else
			all_codes[code] = modname
		end
		
		if not canonical_name then
			discrepancy(modname, "Code <code>%s</code> has no canonical name specified.", code)
		elseif language_names[canonical_name] then
			-- A canonical name shared with another code is deliberately not
			-- reported here; the check is disabled:
			--[=[
			discrepancy(modname,
				"%s (<code>%s</code>) has a canonical name that is not unique, it is also used by the code <code>%s</code>.",
				link(data.names[1]), code, language_names[data.names[1]])
			--]=]
		else
			language_names[canonical_name] = code
		end
		
		if data.otherNames then
			check_other_names(modname, code, canonical_name, data.otherNames)
			check_array(modname, code, data, "otherNames")
		end
		
		if parent then
			if not m_language_data[parent] and not m_family_data[parent] and not m_etym_language_data[parent] then
				discrepancy(modname,
					"Etymology-only %s (<code>%s</code>) has invalid parent language or family code <code>%s</code>.",
					link(canonical_name), code, parent)
			end
			
			nonempty_fams[parent] = true
		else
			discrepancy(modname,
				"Etymology-only %s (<code>%s</code>) has no parent language or family code.",
				link(canonical_name), code)
		end
	end
	
	-- Cycle detection: follow the parent chain of each entry. `stack` holds
	-- the data tables seen on the current walk; `checked` holds those already
	-- walked, so each chain is followed only once.
	local checked = {}
	for code, data in pairs(m_etym_language_data) do
		local stack = {}
		
		while data do
			if checked[data] then
				break
			end
			if stack[data] then
				discrepancy(modname, "%s (<code>%s</code>) has a cyclic parental relationship to %s (<code>%s</code>)",
					link(data.canonicalName), code,
					link(m_etym_language_data[data.parent].canonicalName), data.parent
				)
				break
			end
			stack[data] = true
			-- The walk ends once the parent is not an etymology-only language.
			code, data = data.parent, data.parent and m_etym_language_data[data.parent]
		end
		
		for data in pairs(stack) do
			checked[data] = true
		end
	end
end

--- Check the language family data.
--
-- Verifies allowed keys, code and canonical-name uniqueness, the otherNames
-- list and the parent family code. It then reports families that nothing
-- refers to (per nonempty_fams) and looks for cycles in the chain of parent
-- families. It must run after the language checks, which fill nonempty_fams.
local function check_families()
	local modname = "families/data"
	local m_family_data = mw.loadData("Module:" .. modname)
	
	local check_family_data_keys = check_data_keys{
		"canonicalName", "otherNames", "family", "protoLanguage", "wikidata_item"
	}
	
	-- Wikilink to the family's category, displayed as "<name> family".
	local function link(name)
		if not name then
			return "???"
		elseif name:find("[Ll]anguages$") then
			return "[[:Category:" .. name .. "|" .. name .. " family]]"
		else
			return "[[:Category:" .. name .. " languages|" .. name .. " family]]"
		end
	end
	
	for code, data in pairs(m_family_data) do
		check_family_data_keys(modname, code, data)
		
		if all_codes[code] then
			discrepancy(modname, "Code <code>%s</code> is not unique, is also defined in [[Module:%s]].", code, all_codes[code])
		else
			all_codes[code] = modname
		end
		
		if not data.canonicalName then
			discrepancy(modname, "<code>%s</code> has no canonical name specified.", code)
		elseif family_names[data.canonicalName] then
			discrepancy(modname,
				"%s (<code>%s</code>) has a canonical name that is not unique, it is also used by the code <code>%s</code>.",
				link(data.canonicalName), code, family_names[data.canonicalName])
		else
			family_names[data.canonicalName] = code
		end
		
		if data.otherNames then
			-- The key is "canonicalName"; "canonical_name" is always nil.
			check_other_names(modname, code, data.canonicalName, data.otherNames)
			check_array(modname, code, data, "otherNames")
		end
		
		if data.family then
			if not m_family_data[data.family] then
				discrepancy(modname,
					"%s (<code>%s</code>) has an invalid parent family code <code>%s</code>.",
					link(data.canonicalName), code, data.family)
			end
			
			nonempty_fams[data.family] = true
		end
	end
	
	for code, data in pairs(m_family_data) do
		if not nonempty_fams[code] then
			discrepancy(modname, "%s (<code>%s</code>) has no child families or languages.", link(data.canonicalName), code)
		end
	end
	
	-- Cycle detection: follow the chain of parent families from each family.
	-- `stack` holds the codes seen on the current walk; `checked` holds codes
	-- already walked. 'qfa-not' is exempted from the walk up front.
	local checked = { ['qfa-not'] = true }
	for code, data in pairs(m_family_data) do
		local stack = {}
		
		while data do
			if checked[code] then
				break
			end
			if stack[code] then
				discrepancy(modname, "%s (<code>%s</code>) has a cyclic parental relationship to %s (<code>%s</code>)",
					link(data.canonicalName), code,
					link(m_family_data[data.family].canonicalName), data.family
				)
				break
			end
			stack[code] = true
			-- Family data stores the parent under "family" (see the key list
			-- above), not at index 3, so the walk must follow that field.
			local parent_code = data.family
			code, data = parent_code, parent_code and m_family_data[parent_code]
		end
		
		for code in pairs(stack) do
			checked[code] = true
		end
	end
end

--- Check the script data.
--
-- For each script: verifies that four-character codes appear in the two
-- lookup modules, that only allowed keys are used, that a canonical name is
-- given, that otherNames is well formed, that some language uses the script
-- (per nonempty_scrs), and that the characters pattern, if any, is valid.
local function check_scripts()
	local modname = "scripts/data"
	local script_data = mw.loadData("Module:" .. modname)
	
	local validate_keys = check_data_keys({
		"canonicalName", "otherNames", "parent", "systems", "wikipedia_article",
		"characters", "direction", "character_category",
	}, true)
	
	local names_by_code = mw.loadData('Module:scripts/code to canonical name')
	local codes_by_name = mw.loadData('Module:scripts/by name')
	
	for sc_code, sc_data in pairs(script_data) do
		local name = sc_data.canonicalName
		
		-- Only four-character codes are expected in the lookup modules.
		if not names_by_code[sc_code] then
			if #sc_code == 4 then
				discrepancy('scripts/code to canonical name', '<code>%s</code> (%s) is missing', sc_code, name)
			end
		end
		
		validate_keys(modname, sc_code, sc_data)
		
		if not name then
			discrepancy(modname, "Code <code>%s</code> has no canonical name specified.", sc_code)
		elseif not script_names[name] then
			if not codes_by_name[name] then
				if #sc_code == 4 then
					discrepancy('scripts/by name', '%s (<code>%s</code>) is missing', name, sc_code)
				end
			end
			script_names[name] = sc_code
		end
		-- A canonical name already used by another script is deliberately
		-- not reported; the original check is kept here, disabled:
		--[=[
			discrepancy(modname,
				"%s (<code>%s</code>) has a canonical name that is not unique, it is also used by the code <code>%s</code>.",
				link_script(data.names[1]), code, script_names[data.names[1]])
		--]=]
		
		local other_names = sc_data.otherNames
		if other_names then
			check_other_names(modname, sc_code, name, other_names)
			check_array(modname, sc_code, sc_data, "otherNames")
		end
		
		local characters = sc_data.characters
		
		if not nonempty_scrs[sc_code] then
			local suffix
			if characters then
				suffix = ""
			else
				suffix = " and has no characters listed for auto-detection"
			end
			discrepancy(modname,
				"%s (<code>%s</code>) is not used by any language%s.",
				link_script(name), sc_code, suffix)
		end
		-- Scripts that are in use but have no characters are not reported;
		-- the original check is kept here, disabled:
		--[[
		elseif not data.characters then
			discrepancy(modname, "%s (<code>%s</code>) has no characters listed for auto-detection.", link_script(canonical_name), code)
		--]]
		
		if characters then
			validate_pattern(characters, modname, sc_code, sc_data, false)
		end
	end
end

--- Entry point: run every check and return the report as wikitext.
-- @param frame  the Scribunto frame (unused)
-- @return a success line when nothing was found; otherwise a warning line
--         followed by one section per module listing its messages
function export.perform(frame)
	check_languages()
	check_etym_languages()
	
	-- families and scripts must be checked AFTER languages; languages checks fill out
	-- the nonempty_fams and nonempty_scrs tables, used for testing if a family/script
	-- is ever used in the data
	check_families()
	check_scripts()
	
	-- First <code>...</code> span of a message, i.e. the code it is about.
	local find_code = m_fun.memoize(function(message)
		return string.match(message, "<code>([^<]+)</code>")
	end)
	
	-- Order messages by their code, then by full text. Messages without a
	-- code sort first. Comparing by code in some cases and by text in others
	-- without this fixed precedence is not a consistent order for table.sort,
	-- and without the text tie-break the order of messages sharing a code
	-- would be unspecified.
	local function comp(message1, message2)
		local code1, code2 = find_code(message1), find_code(message2)
		if code1 ~= code2 then
			if code1 and code2 then
				return code1 < code2
			end
			return code1 == nil
		end
		return message1 < message2
	end
	
	-- Format the messages
	local ret = {}
	local i = 1 -- leave index 1 for opening message
	for modname, msglist in m_table.sortedPairs(messages) do
		table.sort(msglist, comp)
		i = i + 1
		ret[i] = table.concat{
			'===[[Module:', modname, ']]===',
			table.concat(map(function(msg) return "\n* " .. msg end, msglist))
		}
	end
	
	-- Are there any messages?
	if i == 1 then
		return '<b class="success">Glory to Arstotzka.</b>'
	else
		ret[1] = '<b class="warning">Discrepancies detected:</b>'
		
		return table.concat(ret, '\n')
	end
end

return export