Module:headword
- The following documentation is located at Module:headword/documentation. [edit]
- Useful links: subpage list • links • transclusions • testcases • sandbox
This module is used to show headword lines, along with any annotations like genders, transliterations and inflections. It's used by the template {{head}}
, via the submodule Module:headword/templates. It's also used by many other headword modules; for a full list, see Category:Headword-line modules. Some of the data used by this module is found in Module:headword/data.
export.head_is_multiword
function export.head_is_multiword(head)
Return true if the given head is multiword according to the algorithm used in full_headword().
export.add_multiword_links
function export.add_multiword_links(head, default)
Add links to a multiword head.
export.pluralize_pos
function export.pluralize_pos(pos)
Returns the plural form of pos, a raw part of speech input, which could be singular or plural. Irregular plural POS are taken into account (e.g. "kanji" pluralizes to "kanji").
export.pos_lemma_or_nonlemma
function export.pos_lemma_or_nonlemma(plpos, best_guess)
Return "lemma" if the given POS is a lemma, "non-lemma form" if a non-lemma form, or nil if unknown. The POS passed in must be in its plural form ("nouns", "prefixes", etc.). If you have a POS in its singular form, call export.pluralize_pos() above to pluralize it in a smart fashion that knows when to add "-s" and when to add "-es", and also takes into account any irregular plurals.
If best_guess is given and the POS is in neither the lemma nor non-lemma list, guess based on whether it ends in " forms"; otherwise, return nil.
export.maintenance_cats
function export.maintenance_cats(page, lang, lang_cats, page_cats)
This function lacks documentation. Please add a description of its usages, inputs and outputs, or its difference from similar functions, or make it local to remove it from the function list.
export.full_headword
function export.full_headword(data)
This is the primary external entry point. full_headword(data)
This is used by {{head}}
and various language-specific headword templates (e.g. {{ru-adj}}
for Russian adjectives, {{de-noun}}
for German nouns, etc.) to display an entire headword line. See #Further explanations for full_headword()
Further explanations for full_headword()
The sole argument, data
, is a table containing the following items (WARNING: they will be destructively modified):
{
lang = language_object,
pagename = nil or "pagename",
heads = { "head1", "head2", "head3", ... } or {
{
term = nil or "head1",
tr = nil or "translit1",
ts = nil or "transcription1",
sc = nil or script_object,
q = nil or {"left_qualifier1", "left_qualifier2", ...},
qq = nil or {"right_qualifier1", "right_qualifier2", ...},
refs = nil or {{text = "ref_text1" or "", name = nil or "ref_name1", group = nil or "ref_group1"}, ...},
separator = nil or "separator",
},
...
},
translits = { [1] = "translit1", [3] = "translit3", ... },
transcriptions = { [2] = "transcription2", [3] = "transcription3", ... },
sc = script_object,
inflections = {
enable_auto_translit = boolean,
{ label = "grammatical_category", "inflected_form1", "inflected_form2", ... },
{ label = "grammatical_category", accel = {form = "tag|tag", lemma = "lémma"}, "inflected_form1", "inflected_form2", ... },
{
label = "grammatical_category",
accel = {
form = "tag|tag",
target = "form_target",
tr = nil or "form_manual_translit",
gender = "gender_spec" or {"gender_spec1", "gender_spec2", ...},
pos = "form_part_of_speech",
lemma = nil or "lémma",
lemma_translit = nil or "lemma_manual_translit",
no_store = boolean,
},
sc = nil or inflection_specific_script_object,
enable_auto_translit = boolean,
"inflected_form1",
{
term = "inflected_form2",
alt = nil or "display_text",
translit = nil or "manual_transliteration",
transcription = nil or "manual_transcription",
genders = {"gender1", "gender2", {spec = "gender3", qualifiers = nil or {"qualifier1", "qualifier2", ... }}},
accel = {form = "tag|tag|tag", lemma = "lemma_of_inflected_form", lemma_translit = "manual_translit" },
lang = nil or term_specific_lang_object,
sc = nil or term_specific_script_object,
id = "sense_id",
gloss = "gloss",
pos = "part_of_speech",
lit = "literal meaning",
q = nil or {"left_qualifier1", "left_qualifier2", ... },
qq = nil or {"right_qualifier1", "right_qualifier2", ... },
refs = nil or {{text = "ref_text1" or "", name = nil or "ref_name1", group = nil or "ref_group1"}, ...},
separator = nil or "separator",
nolinkinfl = boolean,
face = nil or "plain" or "hypothetical",
},
{
label = "raw_textual_label",
q = nil or {"left_qualifier1", "left_qualifier2", ... },
qq = nil or {"right_qualifier1", "right_qualifier2", ... },
refs = nil or {{text = "ref_text1" or "", name = nil or "ref_name1", group = nil or "ref_group1"}, ...},
separator = nil or "separator",
},
...
},
{ label = "grammatical_category", request = true },
...
},
genders = {
"gender1",
{spec = "gender2", qualifiers = {"qualifier1", "qualifier2", ...}},
...
},
pos_category = "plural_part_of_speech",
categories = { "category1", "category2", ... },
whole_page_categories = { "category1", "category2", ... },
force_cat_output = boolean,
sccat = boolean,
noposcat = boolean,
nogendercat = boolean,
nomultiwordcat = boolean,
nopalindromecat = boolean,
nolinkhead = boolean,
nolinkinfl = boolean,
sort_key = "sort_key",
id = "sense_id",
}
Further explanation:
data.lang
is required and is a language object from Module:languages corresponding to a given language. For example, use require("Module:languages").getByCode("ru")
to retrieve the object corresponding to Russian.
data.pagename
is optional and allows you to override the pagename used variously in the module (e.g. as the default value when a head is omitted, for setting categories such as palindromes and terms spelled with CHAR
, etc.).
data.heads
is a table listing the heads of the headword. Each element is either a string specifying only the headword itself (old-style), or an object specifying all the properties of the headword (new-style). You cannot mix and match these two styles; all elements should be of one type or the other. If no heads are specified at all (data.heads
is omitted or is an empty array), a default head is set based on the assumed pagename (either the actual pagename or the value of data.pagename
, if set). When using old-style head strings, a given head in the array can be nil
, in which case a default head is set as above. When head objects are used, a given object can have the following properties:
.term
: A string specifying the headword. This can be omitted, in which case a default head is set as above. Explicit headwords are generally used to specify extra diacritics (in languages with such diacritics, e.g. Russian, Arabic, Latin, Ancient Greek, Old English, etc.), or to link individual words of a multiword term, particularly when the words are inflected forms. Note that by default, each word is linked individually to itself, so there is no need to specify links for a term like a golden key can open any door. Some additional notes:- If a headword string contains wikilinks, they are converted into language-section links for the given language (using
Module:links#language_link
, which is also used by{{l}}
). For example, giving"[[give]] [[up]]"
, if the language provided is English, will produce:"[[give#English|give]] [[up#English|up]]"
. If string is prefixed with * or if any of the links are, then they are interpreted as reconstructed terms and it will create links to the Reconstruction namespace as appropriate. - If the page name contains spaces or punctuation marks (except for punctuation marks that are used inside of words), it is split and each individual word is automatically wikilinked as above.
- If the current page is in the
Reconstruction:
namespace, then an asterisk"*"
will be prepended to the headword to indicate that it is a reconstructed term.
- If a headword string contains wikilinks, they are converted into language-section links for the given language (using
.tr
: A string specifying the transliteration of the headword. This is only needed when the headword is in a non-Latin script, and even then only when the automatic transliteration specified using the language's transliteration module is incorrect (or the language has no transliteration module, such as with Persian and Hebrew). For languages with a transliteration module, pass in"-"
to suppress the transliteration entirely..ts
: A string specifying the transcription of the headword. This is only used in a few languages with non-Latin scripts where the spelling is significantly different from the pronunciation, such as Akkadian, Old Persian or Hittite. In cases like this, the transliteration usually reflects the spelling and the transcription reflects the pronunciation. For this reason, transcriptions are displayed between slashes. Transcriptions should NOT be used simply to display IPA pronunciation of a language like Russian or Arabic. Unlike for transliterations, there are no automatic transcription modules..sc
: An optional script object from Module:scripts corresponding to a given script, specifying the script that the headword is in. If omitted, defaults to the top-leveldata.sc
value. Most of the time, neither the per-headword script nor top-level script need to be specified: If both are omitted, Module:scripts will determine the script(s) using the list of scripts in the language's data file and the characters that are in the headword. Specifically, if there are multiple possible scripts for a language, the script with the largest number of characters in the headword is chosen..q
: An optional array specifying one or more qualifiers displayed to the left of the headword. Qualifiers are displayed in italics and with parentheses around them, and are intended to specify relevant properties of the headword, especially when there is more than one headword.
.qq
: An optional array specifying one or more qualifiers displayed to the right of the headword, as above.
.refs
: An optional array specifying one or more references (i.e. footnotes) for the headword. This is similar to using <ref>...</ref>
to specify a reference/footnote after a given word in the text. Each element of the array is either a string (the text of the reference) or an object of the form{text = "ref_text" or "", name = nil or "ref_name", group = nil or "ref_group"}
. In this latter format:.text
specifies the reference text (which cannot benil
; use a blank string when cross-referencing to another reference);.name
gives an optional name to the reference for cross-reference purposes, if the reference text is non-empty, similarly to<ref name="ref_name">ref_text</ref>
; however, if the reference text is empty, it specifies a cross-reference to a previously-named reference, similarly to<ref name="ref_name"/>
;.group
gives an optional group to the reference for grouping purposes, similarly to<ref name="ref_name" group="ref_group">ref_text</ref>
; however, if the reference text is empty, it specifies the group of a cross-reference to a previously-named reference, similarly to<ref name="ref_name" group="ref_group"/>
.
.separator
: The separator preceding the headword. If omitted, the default value is<i>or</i>
(i.e. the italicized word or surrounded by spaces) for the second and higher headword, and a blank string for the first headword. Use a blank string to request no separator at all.
data.translits
is an optional table listing the transliterations corresponding to each headword indata.heads
, when old-style head strings are used; omitting this field is equivalent to setting it to an empty list. If new-style head objects are used, this field must be omitted. The Nth numbered entry should be either a string specifying the transliteration of headword N, or may be omitted, as with the.tr
property described above. Note that, if there are multiple headwords, the table indata.translits
might have entries in the middle of the list that arenil
. A list of this sort cannot be created withtable.insert()
, as attempting to insertnil
this way does nothing. Instead, each transliteration must be explicitly assigned using a number as index, e.g.{ [1] = "string", [3] = "string", ... }
; here, item2
isnil
, because no value was assigned to it.data.transcriptions
is an optional table listing the transcriptions corresponding to each headword indata.heads
, when old-style head strings are used; omitting this field is equivalent to setting it to an empty list. If new-style head objects are used, this field must be omitted. It is of the same format asdata.translits
, and can have holes in it as needed. The meaning of the transcription field is as described above for .ts
.data.sc
is an optional script object from Module:scripts corresponding to a given script. If specified, this applies equally to all heads specified usingdata.heads
; if you need to specify per-head scripts, use the head object format documented above. Most of the time you can omit this item, and Module:scripts will determine the script(s) as specified above for the.sc
headword property.data.genders
is a table listing the gender/number specifications for the headwords. This can be omitted for no genders or numbers. Each element is either a string specifying a gender/number spec, or a table of the form {spec = "gender/number_spec", qualifiers = nil or {"qualifier1", "qualifier2", ...}}
. In either case, the accepted values for genders or numbers are given in Module:gender and number; examples are"m"
for masculine,"f-an-p"
for feminine animate plural and"c2"
for noun class 2 in languages such as Swahili that have noun classes. If the format with qualifiers is given, the qualifiers are displayed to the left of the gender/number specification. Categories are automatically added according to the specific genders, e.g.LANG masculine nouns
for the language specified indata.lang
if the gender is masculine and the part of speech (see below) isnouns
orreconstructed nouns
. To suppress the addition of these categories, specifydata.nogendercat = true
.data.inflections
is a table listing the inflections to be displayed in the headword entry. The format of this table is somewhat complex and is described below underformat_inflections
.data.pos_category
is the part-of-speech category for the entry. This is one of thelemma
andnonlemma
parts of speech listed in Module:headword/data. It should be in the plural: for example,"nouns"
. If this item is omitted, the part-of-speech category must be included as the first item in data.categories
.data.categories
is a table listing the categories to which the entry containing the headword will be added. The first category should be a part-of-speech category, with the canonical name of the language at the beginning –"Russian nouns"
– unless the part of speech is given in the fielddata.pos_category
.data.whole_page_categories
is a table listing language-agnostic categories to which the page will be added, which it is nevertheless useful for the headword module to handle (e.g. Category:Unsupported titles). Because they are not tied to a language, pages in them should be sorted according to their{{DEFAULTSORT:}}
values for the sake of consistency. Note that some of these - including "Category:Unsupported titles" - are already handled automatically.data.sort_key
is a string specifying a sort key for the categories listed indata.categories
. Sort keys should usually be omitted, because theformat_categories
function in Module:utilities will generate a suitable sortkey in most cases. The sortkey is used to ensure that the page is listed in the correct order in the categories to which it belongs.data.nolinkhead
is a boolean value determining whether or not to link the individual words of a multiword headword, which is done by default.data.nolinkinfl
is a boolean value determining whether or not to link the inflections of the entire headword. Not to be confused withpart.nolinkinfl
, which disables linking only for one of the inflections. It is used, for example, by Module:la-headword for reconstructed terms.
Examples
A simple example
full_headword{
lang = require("Module:languages").getByCode("en"), -- language code
heads = {"book"}, -- headwords
inflections = {
{label = "plural", "books"} -- inflections
},
categories = {"English nouns"}, -- part-of-speech category
}
might give (depending on the page it's run on):
<strong class="Latn headword" lang="en">book</strong> (''plural'' <b class="Latn" lang="en">[[books#English|books]]</b>)[[Category:English lemmas|HEADWORD]][[Category:English nouns|HEADWORD]]
which displays as:
- book (plural books)
A fuller example
full_headword{
lang = require("Module:languages").getByCode("de"),
heads = {"Hund"},
genders = {"m"},
inflections = {
{label = "genitive", "Hundes", "Hunds"},
{label = "plural", "Hunde", {term="Hünde", q={"nonstandard"}}},
{label = "diminutive",
{term = "Hündchen", genders = {"n"}},
{nolinkinfl=true, term = "Hündlein", genders = {"n"}}
}
},
categories = {"German nouns"},
}
might give (depending on the page it's run on):
<strong class="Latn headword" lang="de">Hund</strong> <span class="gender"><abbr title="masculine gender">m</abbr></span> (''genitive'' <b class="Latn" lang="de">[[Hundes#German|Hundes]]</b> ''or'' <b class="Latn" lang="de">[[Hunds#German|Hunds]]</b>, ''plural'' <b class="Latn" lang="de">[[Hunde#German|Hunde]] </b>''or (nonstandard)''<b> [[Hünde#German|Hünde]]</b>, ''diminutive'' <b class="Latn" lang="de">[[Hündchen#German|Hündchen]]</b> <span class="gender"><abbr title="neuter gender">n</abbr></span> ''or'' <b class="Latn" lang="de">Hündlein</b> <span class="gender"><abbr title="neuter gender">n</abbr></span>)[[Category:German lemmas|HEADWORD]][[Category:German nouns|HEADWORD]]
which displays as:
- Hund m (genitive Hundes or Hunds, plural Hunde or (nonstandard) Hünde, diminutive Hündchen n or Hündlein n)
An example in a non-Latin script
This example is in Russian, which has automatic transliteration:
full_headword{
lang = require("Module:languages").getByCode("ru"),
heads = {"кни́га"},
genders = {"f-in"},
inflections = {
{label = "genitive", "кни́ги"},
{label = "nominative plural", "кни́ги"},
{label = "genitive plural", "книг"}
},
categories = {"Russian nouns"},
}
might give (depending on the page it's run on):
<strong class="Cyrl headword" lang="ru">кни́га</strong> [[Wiktionary:Russian transliteration|•]] (<span class="tr" lang=""><span class="tr" lang="">kníga</span></span>) <span class="gender"><abbr title="feminine gender">f</abbr> <abbr title="inanimate">inan</abbr></span> (''genitive'' <b class="Cyrl" lang="ru">[[книги#Russian|кни́ги]]</b>, ''nominative plural'' <b class="Cyrl" lang="ru">[[книги#Russian|кни́ги]]</b>, ''genitive plural'' <b class="Cyrl" lang="ru">[[книг#Russian|книг]]</b>)[[Category:Russian lemmas|HEADWORD]][[Category:Russian nouns|HEADWORD]]
which displays as:
- кни́га • (kníga) f inan (genitive кни́ги, nominative plural кни́ги, genitive plural книг)
Note a few things about the transliteration:
- If the transliteration is specified and non-empty, Module:headword adds some stuff before and after it. For example, if the transliteration is
"foo"
and the language is Hebrew, it produces [[Wiktionary:Hebrew transliteration|•]] (<span lang="">foo</span>), which looks like “• (foo)”.
- The bullet linking to a transliteration policy page is only added if the page actually exists.
A fuller example in a non-Latin script
This example is in Russian, with two headwords, each of which requires manual transliteration:
full_headword{
lang = require("Module:languages").getByCode("ru"),
heads = {
{term = "интервьюе́р", tr = "intɛrvʹjuér"},
{term = "интервью́ер", tr = "intɛrvʹjújer"},
},
genders = {"m-an"},
inflections = {
{label = "genitive", "интервьюе́ра", "интервью́ера"},
{label = "nominative plural", "интервьюе́ры", "интервью́еры"},
{label = "genitive plural", "интервьюе́ров", "интервью́еров"},
},
categories = {"Russian nouns"},
}
might give (depending on the page it's run on):
<strong class="Cyrl headword" lang="ru">интервьюе́р</strong> ''or'' <strong class="Cyrl headword" lang="ru">интервью́ер</strong> [[Wiktionary:Russian transliteration|•]] (<span class="tr" lang=""><span class="tr" lang="">intɛrvʹjuér</span> ''or'' <span class="tr" lang="">intɛrvʹjújer</span></span>) <span class="gender"><abbr title="masculine gender">m</abbr> <abbr title="animate">anim</abbr></span> (''genitive'' <b class="Cyrl" lang="ru">[[интервьюера#Russian|интервьюе́ра]]</b> ''or'' <b class="Cyrl" lang="ru">[[интервьюера#Russian|интервью́ера]]</b>, ''nominative plural'' <b class="Cyrl" lang="ru">[[интервьюеры#Russian|интервьюе́ры]]</b> ''or'' <b class="Cyrl" lang="ru">[[интервьюеры#Russian|интервью́еры]]</b>, ''genitive plural'' <b class="Cyrl" lang="ru">[[интервьюеров#Russian|интервьюе́ров]]</b> ''or'' <b class="Cyrl" lang="ru">[[интервьюеров#Russian|интервью́еров]]</b>)[[Category:Russian lemmas|HEADWORD]][[Category:Russian nouns|HEADWORD]]
which displays as
- интервьюе́р or интервью́ер • (intɛrvʹjuér or intɛrvʹjújer) m anim (genitive интервьюе́ра or интервью́ера, nominative plural интервьюе́ры or интервью́еры, genitive plural интервьюе́ров or интервью́еров)
Another fuller example in a non-Latin script
This example is in Arabic, with embedded links in the headword, manual transliteration in an inflection and use of enable_auto_translit
:
full_headword{
lang = require("Module:languages").getByCode("ar"),
heads = {
{term = "[[غُدّة]] [[بَصَلِيّ|بَصَلِيّة]] [[إحْلِيلِيّ|إحْلِيلِيّة]]", tr = "ḡudda baṣaliyya ʾiḥlīliyya"},
},
genders = {"f"},
inflections = {
enable_auto_translit = true,
{label = "plural", {term="غُدَد بَصَلِيَّة إِحْلِيلِيَة", translit="ḡudad baṣaliyya ʾiḥlīliyya"}},
},
categories = {"Arabic nouns"},
}
might give (depending on the page it's run on):
<strong class="Arab headword" lang="ar">[[غدة#Arabic|غُدّة]] [[بصلي#Arabic|بَصَلِيّة]] [[إحليلي#Arabic|إحْلِيلِيّة]]</strong> [[Wiktionary:Arabic transliteration|•]] (<span class="tr" lang=""><span class="tr" lang="">ḡudda baṣaliyya ʾiḥlīliyya</span></span>) <span class="gender"><abbr title="feminine gender">f</abbr></span> (''plural'' <b class="Arab" lang="ar">[[غدد بصلية إحليلية#Arabic|غُدَد بَصَلِيَّة إِحْلِيلِيَة]]</b> (<span lang="" class="tr">ḡudad baṣaliyya ʾiḥlīliyya</span>))[[Category:Arabic lemmas|HEADWORD]][[Category:Arabic nouns|HEADWORD]]
which displays as
- غُدّة بَصَلِيّة إحْلِيلِيّة • (ḡudda baṣaliyya ʾiḥlīliyya) f (plural غُدَد بَصَلِيَّة إِحْلِيلِيَة (ḡudad baṣaliyya ʾiḥlīliyya))
Proposed/planned changes
- Checking for invalid genders, given a list of genders that are valid for a particular language.
local export = {}
-- Named constants for all modules used, to make it easier to swap out sandbox versions.
local debug_track_module = "Module:debug/track"
local en_utilities_module = "Module:en-utilities"
local gender_and_number_module = "Module:gender and number"
local headword_data_module = "Module:headword/data"
local headword_page_module = "Module:headword/page"
local links_module = "Module:links"
local pages_module = "Module:pages"
local palindromes_module = "Module:palindromes"
local pron_qualifier_module = "Module:pron qualifier"
local scripts_module = "Module:scripts"
local scripts_data_module = "Module:scripts/data"
local script_utilities_module = "Module:script utilities"
local script_utilities_data_module = "Module:script utilities/data"
local string_utilities_module = "Module:string utilities"
local table_module = "Module:table"
local utilities_module = "Module:utilities"
local require = require
local require_when_needed = require("Module:require when needed")
local m_str_utils = require(string_utilities_module)
local concat = table.concat
local encode_entities = m_str_utils.encode_entities
local find_best_script_without_lang = require_when_needed(scripts_module, "findBestScriptWithoutLang")
local format_categories = require_when_needed(utilities_module, "format_categories")
local format_genders = require_when_needed(gender_and_number_module, "format_genders")
local format_pron_qualifiers = require_when_needed(pron_qualifier_module, "format_qualifiers")
local full_link = require_when_needed(links_module, "full_link")
local get_current_L2 = require_when_needed(pages_module, "get_current_L2")
local get_link_page = require_when_needed(links_module, "get_link_page")
local get_script = require_when_needed(scripts_module, "getByCode")
local insert = table.insert
local ipairs = ipairs
local is_palindrome = require_when_needed(palindromes_module, "is_palindrome")
local language_link = require_when_needed(links_module, "language_link")
local load_data = mw.loadData
local max = math.max
local new_title = mw.title.new
local pairs = pairs
local pattern_escape = m_str_utils.pattern_escape
local pluralize = require_when_needed(en_utilities_module, "pluralize")
local process_page = require_when_needed(headword_page_module, "process_page")
local remove_links = require_when_needed(links_module, "remove_links")
local shallowcopy = require_when_needed(table_module, "shallowcopy")
local tag_text = require_when_needed(script_utilities_module, "tag_text")
local tag_transcription = require_when_needed(script_utilities_module, "tag_transcription")
local tag_translit = require_when_needed(script_utilities_module, "tag_translit")
local toNFC = mw.ustring.toNFC
local toNFD = mw.ustring.toNFD
local trim = m_str_utils.trim
local type = type
local ufind = mw.ustring.find
local ugmatch = mw.ustring.gmatch
local ugsub = mw.ustring.gsub
local ulen = m_str_utils.len
local umatch = mw.ustring.match
-- Shared headword data, loaded with mw.loadData (aliased as `load_data`) from the module named by
-- `headword_data_module`.
local m_data = load_data(headword_data_module)
-- Tables taken from the `lemmas` and `nonlemmas` fields of the data module.
-- NOTE(review): presumably keyed by plural part-of-speech name -- confirm against the data module.
local isLemma = m_data.lemmas
local isNonLemma = m_data.nonlemmas
-- NOTE(review): the shape and meaning of `notranslit` and `toBeTagged` are defined by the data module and cannot be
-- determined from this part of the file.
local notranslit = m_data.notranslit
local toBeTagged = m_data.toBeTagged
-- If set to true, categories always appear, even in non-mainspace pages
local test_force_categories = false
-- Record a tracking link for entries that have some (usually undesirable) property. `track_id` identifies the property
-- being tracked and becomes part of the tracking page name: a link to [[Wiktionary:Tracking/headword/TRACK_ID]] is added
-- to the page text, so every entry with the property can be listed through
-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/TRACK_ID]].
--
-- If `lang` (a language object) is supplied, a second tracking page [[Wiktionary:Tracking/headword/TRACK_ID/CODE]] is
-- linked as well, where CODE is the code of `lang`; this allows the entries with a given property to be narrowed down
-- to a single language via [[Special:WhatLinksHere/Wiktionary:Tracking/headword/TRACK_ID/CODE]]. If `lang` is an
-- etymology-only language, both its own code and the code of its full parent are tracked.
--
-- Always returns true.
local function track(track_id, lang)
	local base_page = "headword/" .. track_id
	local debug_track = require(debug_track_module)
	-- No language: only the generic tracking page is recorded (passed as a plain string).
	if not lang then
		debug_track(base_page)
		return true
	end
	local is_etym_only = lang:hasType("etymology-only")
	local pages = {base_page, base_page .. "/" .. lang:getCode()}
	if is_etym_only then
		-- Also track under the full (parent) language code.
		pages[3] = base_page .. "/" .. lang:getFullCode()
	end
	debug_track(pages)
	return true
end
-- Return true if `text` contains at least one character belonging to the script whose code is `script_code`, and false
-- otherwise. Characters matching the `%W` class are stripped from `text` before the check. A script object without a
-- `characters` field never matches. Throws an error if no script object exists for `script_code`.
local function text_in_script(text, script_code)
	local script = get_script(script_code)
	if not script then
		error("Internal error: Bad script code " .. script_code)
	end
	local charset = script.characters
	if not charset then
		return false
	end
	-- Only the first return value of ugsub (the resulting string) is kept.
	local stripped = ugsub(text, "%W", "")
	if ufind(stripped, "[" .. charset .. "]") then
		return true
	end
	return false
end
-- Lua pattern matching a run of one or more whitespace and/or punctuation characters (a candidate word break).
local spacingPunctuation = "[%s%p]+"
--[[ List of punctuation or spacing characters that are found inside of words.
Used to exclude characters from the pattern above. The list is written so that it can be embedded
directly in a character class (note that `%%` is an escaped percent sign). ]]
local wordPunc = "-#%%&@־׳״'.·*’་•:᠊"
-- Lua pattern matching a run of one or more characters that are NOT listed in `wordPunc`.
local notWordPunc = "[^" .. wordPunc .. "]+"
-- Decorate an already-formatted term (a head term or an inflection term) with its qualifiers, labels, references and
-- separator.
--
-- `lang` is the language of the term and `part` is the object describing the term, which may contain:
-- * `q`: array of left qualifier strings;
-- * `qq`: array of right qualifier strings;
-- * `l`: array of left label strings;
-- * `ll`: array of right label strings;
-- * `refs`: array of references, each either a string (formatted reference text) or an object with a `text` field
--   (formatted reference text) and optionally `name` and/or `group`;
-- * `separator`: text placed before the term; when absent, " <i>or</i> " is used for every term but the first
--   (j > 1) and nothing for the first. Use "" to request no separator.
-- `formatted` is the formatted text of the term itself and `j` is the 1-based index of the term. Returns the decorated
-- string.
local function format_term_with_qualifiers_and_refs(lang, part, formatted, j)
	-- The qualifier formatter is only invoked when at least one annotation list has a first element.
	local annotated = false
	for _, field in ipairs{"q", "qq", "l", "ll", "refs"} do
		local list = part[field]
		if list and list[1] then
			annotated = true
			break
		end
	end
	if annotated then
		formatted = format_pron_qualifiers {
			lang = lang,
			text = formatted,
			q = part.q,
			qq = part.qq,
			l = part.l,
			ll = part.ll,
			refs = part.refs,
		}
	end
	local separator = part.separator
	if not separator and j > 1 then
		separator = " <i>or</i> "
	end
	if not separator then
		return formatted
	end
	return separator .. formatted
end
--[==[Return true if the given head is multiword according to the algorithm used in full_headword(): that is, if some
run of whitespace/punctuation in the head contains at least one character that is not a word-internal punctuation
character. Return false otherwise.]==]
function export.head_is_multiword(head)
	for separator_run in ugmatch(head, spacingPunctuation) do
		-- A run consisting solely of word-internal punctuation does not count as a word break.
		local real_break = umatch(separator_run, notWordPunc)
		if real_break then
			return true
		end
	end
	return false
end
do
-- Helper applied to each whitespace/punctuation run found in a head. Inside the run, every stretch of characters that
-- are not word-internal punctuation is wrapped as "\2" .. stretch .. "\1", i.e. preceded by a link-close marker and
-- followed by a link-open marker. (`%1` stands for the matched stretch, as the pattern has no explicit capture.)
-- Stretches made up only of `wordPunc` characters are left alone, so they do not split a word. The extra parentheses
-- discard the substitution count returned by ugsub.
local function workaround_to_exclude_chars(s)
return (ugsub(s, notWordPunc, "\2%1\1"))
end
--[==[Add links to a multiword head. The head is split at runs of whitespace/punctuation (other than punctuation
found inside of words) and each resulting word is wrapped in double square brackets. If `default` is truthy, a
backslash is additionally inserted before a colon or hash sign inside a word. Returns the linked string.]==]
function export.add_multiword_links(head, default)
-- Use the control characters \1 and \2 as temporary stand-ins for "[[" and "]]": open a link at the very start,
-- close one at the very end, and close/reopen around every word break.
head = "\1" .. ugsub(head, spacingPunctuation, workaround_to_exclude_chars) .. "\2"
if default then
-- First pass: where a backslash directly precedes a ":" or "#" inside a \1...\2 span, double that backslash.
-- Second pass: insert a backslash before a ":" or "#" inside a \1...\2 span.
-- NOTE(review): `default` presumably flags a head derived from the page name rather than given explicitly -- confirm
-- against callers.
-- NOTE(review): `[^\2]*` is greedy and a span can match only once per pass, so it looks like only the last ":"/"#" of
-- each span is affected -- confirm that this is intended.
head = head
:gsub("(\1[^\2]*)\\([:#][^\2]*\2)", "%1\\\\%2")
:gsub("(\1[^\2]*)([:#][^\2]*\2)", "%1\\%2")
end
--Escape any remaining square brackets to stop them breaking links (e.g. "[citation needed]").
head = encode_entities(head, "[]", true, true)
--[=[
use this when workaround is no longer needed:
head = "[[" .. ugsub(head, WORDBREAKCHARS, "]]%1[[") .. "]]"
]=]
-- Remove any empty links ("\1\2"), which could have been created above at the beginning or end of the string, then
-- turn the remaining markers into real link brackets. The outer parentheses discard gsub's substitution count.
return (head
:gsub("\1\2", "")
:gsub("[\1\2]", {["\1"] = "[[", ["\2"] = "]]"}))
end
end
-- Return a truthy value if the page with the given full raw pagename should be left out of the spelling-based
-- categories: gesture appendix pages, and unsupported titles with descriptive names (those without a backtick).
local function non_categorizable(full_raw_pagename)
	local gesture_pos = full_raw_pagename:find("^Appendix:Gestures/")
	if gesture_pos then
		return gesture_pos
	end
	local unsupported_pos = full_raw_pagename:find("^Unsupported titles/")
	if not unsupported_pos then
		return unsupported_pos
	end
	-- Unsupported titles with descriptive names.
	return not full_raw_pagename:find("`")
end
-- Wrap the already-formatted head text `formatted` in the language/script tagging for `head` (face "head"), then
-- add that head's qualifiers, labels, references and separator. `j` is the index of the head; only the first head
-- carries the `data.id` anchor.
local function tag_text_and_add_quals_and_refs(data, head, formatted, j)
	local lang = data.lang
	local id = nil
	if j == 1 and data.id then
		id = data.id
	end
	-- Add language and script wrapper.
	local tagged = tag_text(formatted, lang, head.sc, "head", nil, id)
	-- Add qualifiers, labels, references and separator.
	return format_term_with_qualifiers_and_refs(lang, head, tagged, j)
end
-- Format the headword(s) in `data.heads`, followed by their transliterations/transcriptions (if any) and the gloss
-- in `data.gloss` (if any), and return the whole thing as one string.
local function format_headword(data)
	local lang = data.lang
	local heads = data.heads

	-- Are there non-empty transliterations (or transcriptions)? And are any of them manually specified?
	local any_translit = false
	local any_manual_translit = false

	------ Format the headwords. ------
	local rendered_heads = {}
	-- Set of the distinct renderings of the heads when each is formatted as if it were the first one, together with
	-- the size of that set. Only maintained when there is more than one head.
	local distinct_renderings = {}
	local num_distinct = 0
	local multiple_heads = #heads > 1

	for j, head in ipairs(heads) do
		if head.tr or head.ts then
			any_translit = true
		end
		if head.tr and head.tr_manual or head.ts then
			any_manual_translit = true
		end

		-- Apply processing to the headword, for formatting links and such.
		local display
		if head.term:find("[[", nil, true) and head.sc:getCode() ~= "Image" then
			display = language_link{term = head.term, lang = lang}
		else
			display = lang:makeDisplayText(head.term, head.sc, true)
		end

		local rendered = tag_text_and_add_quals_and_refs(data, head, display, j)
		insert(rendered_heads, rendered)

		-- If multiple heads, try to determine whether all heads display the same. To do this we need to effectively
		-- rerun the text tagging and addition of qualifiers and references, using 1 for all indices.
		if multiple_heads then
			local as_first = rendered
			if j ~= 1 then
				as_first = tag_text_and_add_quals_and_refs(data, head, display, 1)
			end
			if not distinct_renderings[as_first] then
				distinct_renderings[as_first] = true
				num_distinct = num_distinct + 1
			end
		end
	end

	-- When several heads all render identically, show just the first; otherwise show them all back to back (each
	-- already carries its own separator).
	local heads_text
	if num_distinct == 1 then
		heads_text = rendered_heads[1]
	else
		heads_text = concat(rendered_heads)
	end

	if any_manual_translit then
		-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/has-manual-translit]]
		-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/has-manual-translit/LANGCODE]]
		track("has-manual-translit", lang)
	end

	------ Format the transliterations and transcriptions. ------
	local translit_text = ""
	if any_translit then
		local per_head = {}
		for _, head in ipairs(heads) do
			if head.tr or head.ts then
				local pieces = {}
				if head.tr then
					insert(pieces, tag_translit(head.tr, lang:getCode(), "head", nil, head.tr_manual))
					if head.ts then
						insert(pieces, " ")
					end
				end
				if head.ts then
					insert(pieces, "/" .. tag_transcription(head.ts, lang:getCode(), "head") .. "/")
				end
				insert(per_head, concat(pieces))
			end
		end
		translit_text = " (" .. concat(per_head, " <i>or</i> ") .. ")"

		-- Return the bullet link to "Wiktionary:NAME transliteration" if that page exists, otherwise nil.
		local function translit_page_link(name)
			local title = new_title(name .. " transliteration", "Wiktionary")
			if title and title.exists then
				return " [[Wiktionary:" .. name .. " transliteration|•]]"
			end
			return nil
		end

		local link = translit_page_link(lang:getCanonicalName())
		-- If data.lang is an etymology-only language and we didn't find a transliteration page for it, fall back to
		-- the full parent.
		if not link and lang:hasType("etymology-only") then
			link = translit_page_link(lang:getFullName())
		end
		if link then
			translit_text = link .. translit_text
		end
	end

	------ Paste heads and transliterations/transcriptions. ------
	local gloss_text = ""
	if data.gloss then
		gloss_text = ' <span class="ib-content qualifier-content">' .. data.gloss .. '</span>'
	end

	return heads_text .. translit_text .. gloss_text
end
-- Format the gender specs in `data.genders` for display after the headword, adding any categories generated by the
-- gender formatter to `data.categories`. Returns "" if there are no genders; otherwise the formatted genders
-- preceded by a space (and by a comma as well if a gloss was given).
local function format_headword_genders(data)
	local genders = data.genders
	if not genders or #genders == 0 then
		return ""
	end

	local prefix = data.gloss and "," or ""

	-- Work out which part of speech (if any) to use in gender/number categories. Categorization is skipped when
	-- disabled for this call or for this language (checked under both its own code and its full code).
	local lang = data.lang
	local pos_for_cat
	if not (data.nogendercat or m_data.no_gender_cat[lang:getCode()] or m_data.no_gender_cat[lang:getFullCode()]) then
		local pos_category = data.pos_category:gsub("^reconstructed ", "")
		pos_for_cat = m_data.pos_for_gender_number_cat[pos_category]
	end

	local text, cats = format_genders(genders, lang, pos_for_cat)
	for _, cat in ipairs(cats) do
		insert(data.categories, cat)
	end

	return prefix .. " " .. text
end
-- Format a single inflection. `parts` is an array of terms (each either a string or a table of properties) which may
-- also carry the named fields read below (`label`, `accel`, `sc`, `enable_auto_translit`, `request`); `data` is the
-- headword data object.
--
-- Each element of `parts` is OVERWRITTEN IN PLACE with its formatted string. Returns two values: the formatted
-- inflection (the italicized `parts.label`, if any, followed by the formatted terms), and a boolean that is true if
-- any linked term was formatted with a transliteration setting other than "-".
local function format_inflection_parts(data, parts)
local any_part_translit = false
for j, part in ipairs(parts) do
-- A bare (non-table) part is shorthand for a term with no other properties.
if type(part) ~= "table" then
part = {term = part}
end
local partaccel = part.accel
local face = part.face or "bold"
-- Only these three faces are accepted here. The error message distinguishes a face that is listed in the script
-- utilities data (but not accepted here) from one that is not listed there at all.
if face ~= "bold" and face ~= "plain" and face ~= "hypothetical" then
error("The face `" .. face .. "` " .. (
load_data(script_utilities_data_module).faces[face] and
"should not be used for non-headword terms on the headword line." or
"is invalid."
))
end
-- Here the final part 'or data.nolinkinfl' allows 'nolinkinfl=true' to be put
-- directly into the 'data' table to disable inflection links for the entire headword
-- when inflected forms aren't entry-worthy, e.g. in Vulgar Latin.
-- track("nolink") is called as a side effect whenever `part.nolink` or `data.nolink` is reached and truthy.
-- NOTE(review): whether `nolink` alone disables linking depends on track() returning a truthy value; if it returns
-- nil/false, the expression falls through to the corresponding `nolinkinfl` field -- confirm against track().
local nolinkinfl = part.face == "hypothetical" or (part.nolink and track("nolink") or part.nolinkinfl) or (
data.nolink and track("nolink") or data.nolinkinfl)
local formatted
if part.label then
-- FIXME: There should be a better way of italicizing a label. As is, this isn't customizable.
formatted = "<i>" .. part.label .. "</i>"
else
-- Convert the term into a full link. Don't show a transliteration here unless enable_auto_translit is
-- requested, either at the `parts` level (i.e. per inflection) or at the `data.inflections` level (i.e.
-- specified for all inflections). This is controllable in {{head}} using autotrinfl=1 for all inflections,
-- or fNautotr=1 for an individual inflection (remember that a single inflection may be associated with
-- multiple terms). The reason for doing this is to avoid clutter in headword lines by default in languages
-- where the script is relatively straightforward to read by learners (e.g. Greek, Russian), but allow it
-- to be enabled in languages with more complex scripts (e.g. Arabic).
-- An explicit `part.translit` always wins; otherwise "-" is passed when auto-translit is not enabled, and nil
-- when it is enabled.
local tr = part.translit or (not (parts.enable_auto_translit or data.inflections.enable_auto_translit) and "-" or nil)
if tr ~= "-" then
any_part_translit = true
end
-- When linking is disabled, the term is passed as display text (`alt`) instead of as the link target (`term`),
-- unless an explicit `part.alt` was given.
formatted = full_link(
{
term = not nolinkinfl and part.term or nil,
alt = part.alt or (nolinkinfl and part.term or nil),
lang = part.lang or data.lang,
sc = part.sc or parts.sc or nil,
gloss = part.gloss,
pos = part.pos,
lit = part.lit,
id = part.id,
genders = part.genders,
tr = tr,
ts = part.transcription,
accel = partaccel or parts.accel,
},
face
)
end
-- Replace the part with its formatted form, adding qualifiers, labels, references and the separator (for j > 1).
parts[j] = format_term_with_qualifiers_and_refs(part.lang or data.lang, part,
formatted, j)
end
local parts_output
if #parts > 0 then
-- The terms are simply concatenated (each already carries its own separator); a space sets them off from the
-- inflection label if there is one.
parts_output = (parts.label and " " or "") .. concat(parts)
elseif parts.request then
-- No terms given, but a request for them was made: show a placeholder and add a request category.
parts_output = " <small>[please provide]</small>"
insert(data.categories, "Requests for inflections in " .. data.lang:getFullName() .. " entries")
else
parts_output = ""
end
local parts_label = parts.label and ("<i>" .. parts.label .. "</i>") or ""
return parts_label .. parts_output, any_part_translit
end
-- Format the inflections following the headword. Each entry of `data.inflections` is replaced IN PLACE by its
-- formatted string (as produced by format_inflection_parts()), and the entries are then joined with ", " and
-- wrapped in parentheses with a leading space. Returns "" if `data.inflections` is missing or empty.
local function format_inflections(data)
	local inflections = data.inflections
	if not inflections or #inflections == 0 then
		return ""
	end
	-- Format each inflection individually. format_inflection_parts() also returns a flag saying whether any part
	-- was transliterated; nothing here consumes it, so only the first return value is kept.
	for key, infl in ipairs(inflections) do
		inflections[key] = (format_inflection_parts(data, infl))
	end
	return " (" .. concat(inflections, ", ") .. ")"
end
--[==[
Return the plural form of `pos`, a raw part of speech input, which could be singular or plural. Irregular plurals
listed in the data module are taken into account (e.g. "kanji" pluralizes to "kanji"); otherwise a part of speech
already ending in "s" is returned unchanged, and anything else is pluralized.]==]
function export.pluralize_pos(pos)
	local irregular = m_data.irregular_plurals[pos]
	if irregular then
		return irregular
	end
	if pos:sub(-1) == "s" then
		-- Already plural (or at least already ends in "s").
		return pos
	end
	return (pluralize(pos))
end
--[==[
Return "lemma" if the given POS is a lemma, "non-lemma form" if a non-lemma form, or nil if unknown. The POS passed
in must be in its plural form ("nouns", "prefixes", etc.). If you have a POS in its singular form, call
`export.pluralize_pos()` above to pluralize it, which also takes into account any irregular plurals.

A leading "reconstructed " is ignored when looking the POS up, and a POS beginning with "mutated " whose remainder
is a recognized POS counts as a non-lemma form.

If `best_guess` is given and the POS is in neither the lemma nor non-lemma list, guess based on whether it ends in
" forms"; otherwise, return nil.]==]
function export.pos_lemma_or_nonlemma(plpos, best_guess)
	-- Is it a lemma category (with or without "reconstructed ")?
	if isLemma[plpos] then
		return "lemma"
	end
	local base_pos = plpos:gsub("^reconstructed ", "")
	if isLemma[base_pos] then
		return "lemma"
	elseif isNonLemma[plpos] or isNonLemma[base_pos] then
		-- It's a nonlemma category.
		return "non-lemma form"
	end

	-- Mutated forms of any recognized part of speech are non-lemma forms.
	local unmutated_pos = plpos:gsub("^mutated ", "")
	if isLemma[unmutated_pos] or isNonLemma[unmutated_pos] then
		return "non-lemma form"
	end

	if not best_guess then
		return nil
	end
	if plpos:find(" forms$") then
		return "non-lemma form"
	end
	return "lemma"
end
-- Find and return the maximum index in the array `data[element]` (which may have gaps in it), initializing it to a
-- zero-length array if unspecified (or false). Check to make sure all keys are numeric (other than "maxindex", which
-- is set by [[Module:parameters]] for list parameters), all values are either strings or `false` (requesting the
-- default), and unless `allow_blank_string` is given, no blank (zero-length) strings are present. Throws an error if
-- any of these checks fails, or if `data[element]` is not a table.
local function init_and_find_maximum_index(data, element, allow_blank_string)
	local list = data[element]
	if not list then
		list = {}
		data[element] = list
	end
	local typ = type(list)
	if typ ~= "table" then
		error(("In full_headword(), `data.%s` must be an array but is a %s"):format(element, typ))
	end
	local maxind = 0
	for k, v in pairs(list) do
		if k ~= "maxindex" then
			if type(k) ~= "number" then
				-- tostring() is needed because the key may be a boolean, table, etc.; in Lua 5.1, passing such a
				-- value directly to %s makes string.format() itself throw, masking this error message.
				error(("Unrecognized non-numeric key '%s' in `data.%s`"):format(tostring(k), element))
			end
			if k > maxind then
				maxind = k
			end
			-- A value of `false` is allowed and skips the remaining checks.
			if v then
				if type(v) ~= "string" then
					error(("For key '%s' in `data.%s`, value should be a string but is a %s"):format(k, element, type(v)))
				end
				if not allow_blank_string and v == "" then
					error(("For key '%s' in `data.%s`, blank string not allowed; use 'false' for the default"):format(k, element))
				end
			end
		end
	end
	return maxind
end
do
	-- Handle any manual sortkeys that have been specified in raw categories
	-- by tracking if they are the same or different from the automatically-
	-- generated sortkey, so that we can track them in maintenance
	-- categories.
	--
	-- `tbl` is either `true` (raw categories were found but none had a sortkey) or a table whose keys are the
	-- sortkeys found. `sortkey` is the automatically-generated sortkey if already known, otherwise nil, in which case
	-- it is generated from `page.pagename`. Maintenance categories are appended to `lang_cats`. Always returns the
	-- automatically-generated sortkey, so that the caller can pass it to a later call instead of regenerating it.
	local function handle_raw_sortkeys(tbl, sortkey, page, lang, lang_cats)
		sortkey = sortkey or lang:makeSortKey(page.pagename)
		-- If there are raw categories with no sortkey, then they will be
		-- sorted based on the default MediaWiki sortkey, so we check against
		-- that.
		if tbl == true then
			if page.raw_defaultsort ~= sortkey then
				insert(lang_cats, lang:getFullName() .. " terms with non-redundant non-automated sortkeys")
			end
			return sortkey
		end
		local redundant, different
		for k in pairs(tbl) do
			if k == sortkey then
				redundant = true
			else
				different = true
			end
		end
		if redundant then
			insert(lang_cats, lang:getFullName() .. " terms with redundant sortkeys")
		end
		if different then
			insert(lang_cats, lang:getFullName() .. " terms with non-redundant non-automated sortkeys")
		end
		return sortkey
	end

	--[==[
	Add the page to various maintenance categories for the language and the whole page. These are placed in the
	headword somewhat arbitrarily, but mainly because headword templates are mandatory for entries (meaning that in
	theory it provides full coverage).

	This is provided as an external entry point so that modules which transclude information from other entries (such
	as {{tl|ja-see}}) can take advantage of this feature as well, because they are used in place of a conventional
	headword template.

	`page` is the processed page data object and `lang` the language object (its full parent is used). Language-specific
	categories are appended to the list `lang_cats`, and whole-page categories (copied from `page.cats`) to the list
	`page_cats`. Nothing is returned.]==]
	function export.maintenance_cats(page, lang, lang_cats, page_cats)
		for _, cat in ipairs(page.cats) do
			insert(page_cats, cat)
		end
		lang = lang:getFull() -- since we are just generating categories
		local canonical = lang:getCanonicalName()
		-- The automatically-generated sortkey; computed lazily by the first call to handle_raw_sortkeys() and
		-- reused by the second.
		local sortkey
		local tbl = page.wikitext_topic_cat[lang:getCode()]
		if tbl then
			sortkey = handle_raw_sortkeys(tbl, sortkey, page, lang, lang_cats)
			insert(lang_cats, canonical .. " entries with topic categories using raw markup")
		end
		tbl = page.wikitext_langname_cat[canonical]
		if tbl then
			handle_raw_sortkeys(tbl, sortkey, page, lang, lang_cats)
			insert(lang_cats, canonical .. " entries with language name categories using raw markup")
		end
		if get_current_L2() ~= canonical then
			insert(lang_cats, canonical .. " entries with incorrect language header")
		end
	end
end
--[==[This is the primary external entry point.
{{lua|full_headword(data)}}
This is used by {{temp|head}} and various language-specific headword templates (e.g. {{temp|ru-adj}} for Russian adjectives, {{temp|de-noun}} for German nouns, etc.) to display an entire headword line.
See [[#Further explanations for full_headword()]]
]==]
function export.full_headword(data)
-- Prevent data from being destructively modified.
local data = shallowcopy(data)
------------ 1. Basic checks for old-style (multi-arg) calling convention. ------------
if data.getCanonicalName then
error("In full_headword(), the first argument `data` needs to be a Lua object (table) of properties, not a language object")
end
if not data.lang or type(data.lang) ~= "table" or not data.lang.getCode then
error("In full_headword(), the first argument `data` needs to be a Lua object (table) and `data.lang` must be a language object")
end
if data.id and type(data.id) ~= "string" then
error("The id in the data table should be a string.")
end
------------ 2. Initialize pagename etc. ------------
local langcode = data.lang:getCode()
local full_langcode = data.lang:getFullCode()
local langname = data.lang:getCanonicalName()
local full_langname = data.lang:getFullName()
local raw_pagename, page = data.pagename
if raw_pagename and raw_pagename ~= m_data.pagename then -- for testing, doc pages, etc.
page = process_page(raw_pagename)
else
page = m_data.page
end
-- Check the namespace against the language type.
local namespace = page.namespace
if namespace == "" then
if data.lang:hasType("reconstructed") then
error("Entries in " .. langname .. " must be placed in the Reconstruction: namespace")
elseif data.lang:hasType("appendix-constructed") then
error("Entries in " .. langname .. " must be placed in the Appendix: namespace")
end
elseif namespace == "Citations" or namespace == "Thesaurus" then
error("Headword templates should not be used in the " .. namespace .. ": namespace.")
end
------------ 3. Initialize `data.heads` table; if old-style, convert to new-style. ------------
if type(data.heads) == "table" and type(data.heads[1]) == "table" then
-- new-style
if data.translits or data.transcriptions then
error("In full_headword(), if `data.heads` is new-style (array of head objects), `data.translits` and `data.transcriptions` cannot be given")
end
else
-- convert old-style `heads`, `translits` and `transcriptions` to new-style
local maxind = max(
init_and_find_maximum_index(data, "heads"),
init_and_find_maximum_index(data, "translits", true),
init_and_find_maximum_index(data, "transcriptions", true)
)
for i = 1, maxind do
data.heads[i] = {
term = data.heads[i],
tr = data.translits[i],
ts = data.transcriptions[i],
}
end
end
-- Make sure there's at least one head.
if not data.heads[1] then
data.heads[1] = {}
end
------------ 4. Initialize and validate `data.categories` and `data.whole_page_categories`, and determine `pos_category` if not given, and add basic categories. ------------
init_and_find_maximum_index(data, "categories")
init_and_find_maximum_index(data, "whole_page_categories")
local pos_category_already_present = false
if #data.categories > 0 then
local escaped_langname = pattern_escape(full_langname)
local matches_lang_pattern = "^" .. escaped_langname .. " "
for _, cat in ipairs(data.categories) do
-- Does the category begin with the language name? If not, tag it with a tracking category.
if not cat:find(matches_lang_pattern) then
-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/no lang category]]
-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/no lang category/LANGCODE]]
track("no lang category", data.lang)
end
end
-- If `pos_category` not given, try to infer it from the first specified category. If this doesn't work, we
-- throw an error below.
if not data.pos_category and data.categories[1]:find(matches_lang_pattern) then
data.pos_category = data.categories[1]:gsub(matches_lang_pattern, "")
-- Optimization to avoid inserting category already present.
pos_category_already_present = true
end
end
if not data.pos_category then
error("`data.pos_category` not specified and could not be inferred from the categories given in "
.. "`data.categories`. Either specify the plural part of speech in `data.pos_category` "
.. "(e.g. \"proper nouns\") or ensure that the first category in `data.categories` is formed from the "
.. "language's canonical name plus the plural part of speech (e.g. \"Norwegian Bokmål proper nouns\")."
)
end
-- Insert a category at the beginning for the part of speech unless it's already present or `data.noposcat` given.
if not pos_category_already_present and not data.noposcat then
local pos_category = full_langname .. " " .. data.pos_category
-- FIXME: [[User:Theknightwho]] Why is this special case here? Please add an explanatory comment.
if pos_category ~= "Translingual Han characters" then
insert(data.categories, 1, pos_category)
end
end
-- Try to determine whether the part of speech refers to a lemma or a non-lemma form; if we can figure this out,
-- add an appropriate category.
local postype = export.pos_lemma_or_nonlemma(data.pos_category)
if not postype then
-- We don't know what this category is, so tag it with a tracking category.
-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/unrecognized pos]]
-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/unrecognized pos/LANGCODE]]
track("unrecognized pos", data.lang)
-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/unrecognized pos/POS]]
-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/unrecognized pos/POS/LANGCODE]]
track("unrecognized pos/pos/" .. data.pos_category, data.lang)
elseif not data.noposcat then
insert(data.categories, 1, full_langname .. " " .. postype .. "s")
end
------------ 5. Create a default headword, and add links to multiword page names. ------------
-- Determine if term is reconstructed
local is_reconstructed = namespace == "Reconstruction" or data.lang:hasType("reconstructed")
-- Create a default headword based on the pagename, which is determined in
-- advance by the data module so that it only needs to be done once.
local default_head = page.pagename
-- Add links to multi-word page names when appropriate
if not data.nolinkhead and not m_data.no_multiword_links[langcode] and not m_data.no_multiword_links[full_langcode]
and not is_reconstructed and export.head_is_multiword(default_head) then
default_head = export.add_multiword_links(default_head, true)
end
if is_reconstructed then
default_head = "*" .. default_head
end
------------ 6. Fill in missing values in `data.heads`. ------------
-- True if any script among the headword scripts has spaces in it.
local any_script_has_spaces = false
-- True if any term has a redundant head= param.
local has_redundant_head_param = false
for _, head in ipairs(data.heads) do
------ 6a. If missing head, replace with default head.
if not head.term then
head.term = default_head
elseif head.term == default_head then
has_redundant_head_param = true
elseif head.term:find("^[!?]$") then
-- If explicit head= just consists of ! or ?, add it to the end of the default head.
head.term = default_head .. head.term
end
if is_reconstructed then
local head_term = head.term
if head_term:find("%[%[") then
head_term = remove_links(head_term)
end
if head_term:sub(1, 1) ~= "*" then
error("The headword '" .. head_term .. "' must begin with '*' to indicate that it is reconstructed.")
end
end
------ 6b. Try to detect the script(s) if not provided. If a per-head script is provided, that takes precedence,
------ otherwise fall back to the overall script if given. If neither given, autodetect the script.
local auto_sc = data.lang:findBestScript(head.term)
if (
auto_sc:getCode() == "None" and
find_best_script_without_lang(head.term):getCode() ~= "None"
) then
insert(data.categories, full_langname .. " terms in nonstandard scripts")
end
if not (head.sc or data.sc) then -- No script code given, so use autodetected script.
head.sc = auto_sc
else
if not head.sc then -- Overall script code given.
head.sc = data.sc
end
-- Track uses of sc parameter.
if head.sc:getCode() == auto_sc:getCode() then
insert(data.categories, full_langname .. " terms with redundant script codes")
else
insert(data.categories, full_langname .. " terms with non-redundant manual script codes")
end
end
-- If using a discouraged character sequence, add to maintenance category.
if head.sc:hasNormalizationFixes() == true then
local composed_head = toNFC(head.term)
if head.sc:fixDiscouragedSequences(composed_head) ~= composed_head then
insert(data.whole_page_categories, "Pages using discouraged character sequences")
end
end
any_script_has_spaces = any_script_has_spaces or head.sc:hasSpaces()
------ 6c. Create automatic transliterations for any non-Latin headwords without manual translit given
------ (provided automatic translit is available, e.g. not in Persian or Hebrew).
-- Make transliterations
head.tr_manual = nil
-- Try to generate a transliteration if necessary
if head.tr == "-" then
head.tr = nil
elseif not notranslit[langcode] and not notranslit[full_langcode] and head.sc:isTransliterated() then
head.tr_manual = not not head.tr
local text = head.term
if not data.lang:link_tr(head.sc) then
text = remove_links(text)
end
local automated_tr, tr_categories
automated_tr, head.tr_fail, tr_categories = data.lang:transliterate(text, head.sc)
if automated_tr or head.tr_fail then
local manual_tr = head.tr
if manual_tr then
if (remove_links(manual_tr) == remove_links(automated_tr)) and (not head.tr_fail) then
insert(data.categories, full_langname .. " terms with redundant transliterations")
elseif not head.tr_fail then
insert(data.categories, full_langname .. " terms with non-redundant manual transliterations")
end
end
if not manual_tr then
head.tr = automated_tr
for _, category in ipairs(tr_categories) do
insert(data.categories, category)
end
end
end
-- There is still no transliteration?
-- Add the entry to a cleanup category.
if not head.tr then
head.tr = "<small>transliteration needed</small>"
-- FIXME: No current support for 'Request for transliteration of Classical Persian terms' or similar.
-- Consider adding this support in [[Module:category tree/poscatboiler/data/entry maintenance]].
insert(data.categories, "Requests for transliteration of " .. full_langname .. " terms")
else
-- Otherwise, trim it.
head.tr = trim(head.tr)
end
end
-- Link to the transliteration entry for languages that require this.
if head.tr and data.lang:link_tr(head.sc) then
head.tr = full_link{
term = head.tr,
lang = data.lang,
sc = get_script("Latn"),
tr = "-"
}
end
end
------------ 7. Maybe tag the title with the appropriate script code, using the `display_title` mechanism. ------------
-- Assumes that the scripts in "toBeTagged" will never occur in the Reconstruction namespace.
-- (FIXME: Don't make assumptions like this, and if you need to do so, throw an error if the assumption is violated.)
-- Avoid tagging ASCII as Hani even when it is tagged as Hani in the headword, as in [[check]]. The check for ASCII
-- might need to be expanded to a check for any Latin characters and whitespace or punctuation.
local display_title
-- Where there are multiple headwords, use the script for the first. This assumes the first headword is similar to
-- the pagename, and that headwords that are in different scripts from the pagename aren't first. This seems to be
-- about the best we can do (alternatively we could potentially do script detection on the pagename).
local dt_script = data.heads[1].sc
local dt_script_code = dt_script:getCode()
local page_non_ascii = namespace == "" and not page.pagename:find("^[%z\1-\127]+$")
local unsupported_pagename, unsupported = page.full_raw_pagename:gsub("^Unsupported titles/", "")
if unsupported == 1 and page.unsupported_titles[unsupported_pagename] then
display_title = 'Unsupported titles/<span class="' .. dt_script_code .. '">' .. page.unsupported_titles[unsupported_pagename] .. '</span>'
elseif page_non_ascii and toBeTagged[dt_script_code]
or (dt_script_code == "Jpan" and (text_in_script(page.pagename, "Hira") or text_in_script(page.pagename, "Kana")))
or (dt_script_code == "Kore" and text_in_script(page.pagename, "Hang")) then
display_title = '<span class="' .. dt_script_code .. '">' .. page.full_raw_pagename .. '</span>'
-- Keep Han entries region-neutral in the display title.
elseif page_non_ascii and (dt_script_code == "Hant" or dt_script_code == "Hans") then
display_title = '<span class="Hani">' .. page.full_raw_pagename .. '</span>'
elseif namespace == "Reconstruction" then
local matched
display_title, matched = ugsub(
page.full_raw_pagename,
"^(Reconstruction:[^/]+/)(.+)$",
function(before, term)
return before .. tag_text(term, data.lang, dt_script)
end
)
if matched == 0 then
display_title = nil
end
end
-- FIXME: Generalize this.
-- If the current language uses ur-Arab (for Urdu, etc.), ku-Arab (Central Kurdish) or pa-Arab
-- (Shahmukhi, for Punjabi) and there's more than one language on the page, don't set the display title
-- because these three scripts display in Nastaliq and we don't want this for terms that also exist in other
-- languages that don't display in Nastaliq (e.g. Arabic or Persian) to display in Nastaliq. Because the word
-- "Urdu" occurs near the end of the alphabet, Urdu fonts tend to override the fonts of other languages.
-- FIXME: This is checking for more than one language on the page but instead needs to check if there are any
-- languages using scripts other than the ones just mentioned.
if (dt_script_code == "ur-Arab" or dt_script_code == "ku-Arab" or dt_script_code == "pa-Arab") and page.L2_list.n > 1 then
display_title = nil
end
if display_title then
mw.getCurrentFrame():callParserFunction(
"DISPLAYTITLE",
display_title
)
end
------------ 8. Insert additional categories. ------------
if data.force_cat_output then
-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/force cat output]]
track("force cat output")
end
if has_redundant_head_param then
if not data.no_redundant_head_cat then
insert(data.categories, full_langname .. " terms with redundant head parameter")
end
end
-- If the first head is multiword (after removing links), maybe insert into "LANG multiword terms".
if not data.nomultiwordcat and any_script_has_spaces and postype == "lemma" and
not m_data.no_multiword_cat[langcode] and not m_data.no_multiword_cat[full_langcode] then
-- Check for spaces or hyphens, but exclude prefixes and suffixes.
-- Use the pagename, not the head= value, because the latter may have extra
-- junk in it, e.g. superscripted text that throws off the algorithm.
local checkpattern = ".[%s%-፡]."
if m_data.hyphen_not_multiword_sep[langcode] or m_data.hyphen_not_multiword_sep[full_langcode] then
-- Exclude hyphens if the data module states that they should for this language
checkpattern = ".[%s፡]."
end
if umatch(page.pagename, checkpattern) and not non_categorizable(page.full_raw_pagename) then
insert(data.categories, full_langname .. " multiword terms")
end
end
if data.sccat then
for _, head in ipairs(data.heads) do
insert(data.categories, full_langname .. " " .. data.pos_category .. " in " ..
head.sc:getDisplayForm())
end
end
-- Reconstructed terms often use weird combinations of scripts and realistically aren't spelled so much as notated.
if namespace ~= "Reconstruction" then
-- Map from languages to a string containing the characters to ignore when considering whether a term has
-- multiple written scripts in it. Typically these are Greek or Cyrillic letters used for their phonetic
-- values.
local characters_to_ignore = {
["aaq"] = "α", -- Penobscot
["acy"] = "δθ", -- Cypriot Arabic
["anc"] = "γ", -- Ngas
["aou"] = "χ", -- A'ou
["awg"] = "β", -- Anguthimri
["bhp"] = "β", -- Bima
["byk"] = "θ", -- Biao
["cdy"] = "θ", -- Chadong
["clm"] = "χ", -- Klallam
["col"] = "χ", -- Colombia-Wenatchi
["coo"] = "χ", -- Comox; FIXME: others? E.g. Greek theta (θ)?
["ets"] = "θ", -- Yekhee
["gmw-gts"] = "χ", -- Gottscheerish
["hur"] = "θ", -- Halkomelem
["izh"] = "ь", -- Ingrian
["kic"] = "θ", -- Kickapoo
["lil"] = "χ", -- Lillooet
["mhz"] = "β", -- Mor (Austronesian)
["neg"]= "ӡ", -- Negidal (normally in Cyrillic)
["oui"] = "γβ", -- Old Uyghur: FIXME: others? E.g. Greek delta (δ)?
["pox"] = "χ", -- Polabian
["rom"] = "Θθ", -- Romani: International Standard; two different thetas???
["sah"] = "ь", -- Yakut (1929 - 1939 Latin spelling)
["sjw"] = "θ", -- Shawnee
["squ"] = "χ", -- Squamish
["str"] = "χθ", -- Saanich; uses two Greek letters
["twa"] = "χ", -- Twana
["yha"] = "θ", -- Baha
["za"] = "зч", -- Zhuang; 1957-1982 alphabet used two Cyrillic letters (as well as some others like
-- ƃ, ƅ, ƨ, ɯ and ɵ that look like Cyrillic or Greek but are actually Latin)
["zlw-slv"] = "χђћ", -- Slovincian; FIXME: χ is Greek, the other two are Cyrillic, but I'm not sure
-- the currect characters are being chosen in the entry names
["zng"] = "θ", -- Mang
}
-- Determine how many real scripts are found in the pagename, where we exclude symbols and such. We exclude
-- scripts whose `character_category` is false as well as Zmth (mathematical notation symbols), which has a
-- category of "Mathematical notation symbols". When counting scripts, we need to elide language-specific
-- variants because e.g. Beng and as-Beng have slightly different characters but we don't want to consider them
-- two different scripts (e.g. [[এৰ]] has two characters which are detected respectively as Beng and as-Beng).
local seen_scripts = {}
local num_seen_scripts = 0
local num_loops = 0
local canon_pagename = page.pagename
local ch_to_ignore = characters_to_ignore[full_langcode]
if ch_to_ignore then
canon_pagename = ugsub(canon_pagename, "[" .. ch_to_ignore .. "]", "")
end
local script_data = load_data(scripts_data_module)
while true do
if canon_pagename == "" or num_seen_scripts >= 2 or num_loops >= 10 then
break
end
-- Make sure we don't get into a loop checking the same script over and over again; happens with e.g. [[ᠪᡳ]]
num_loops = num_loops + 1
local pagename_script = find_best_script_without_lang(canon_pagename, "None only as last resort")
local script_chars = pagename_script.characters
if not script_chars then
-- we are stuck; this happens with None
break
end
local script_code = pagename_script:getCode()
local replaced
canon_pagename, replaced = ugsub(canon_pagename, "[" .. script_chars .. "]", "")
if replaced and script_code ~= "Zmth" and script_data[script_code] and
script_data[script_code].character_category ~= false then
script_code = script_code:gsub("^.-%-", "")
if not seen_scripts[script_code] then
seen_scripts[script_code] = true
num_seen_scripts = num_seen_scripts + 1
end
end
end
if num_seen_scripts > 1 then
insert(data.categories, full_langname .. " terms written in multiple scripts")
end
end
-- Categorise for unusual characters. Takes into account combining characters, so that we can categorise for characters with diacritics that aren't encoded as atomic characters (e.g. U̠). These can be in two formats: single combining characters (i.e. character + diacritic(s)) or double combining characters (i.e. character + diacritic(s) + character). Each can have any number of diacritics.
local standard = data.lang:getStandardCharacters()
if standard and not non_categorizable(page.full_raw_pagename) then
-- Convert `char` into the form used in a "terms spelled with X" category name.
-- The input is decomposed with toNFD, each UTF-8 encoded character of the result is
-- mapped individually, and the mapped string is recomposed with toNFC.
local function char_category(char)
	-- Characters that get a fixed replacement instead of being uppercased.
	local replacements = {
		["#"] = "number sign",
		["("] = "parentheses",
		[")"] = "parentheses",
		["<"] = "angle brackets",
		[">"] = "angle brackets",
		["["] = "square brackets",
		["]"] = "square brackets",
		["_"] = "underscore",
		["{"] = "braces",
		["|"] = "vertical line",
		["}"] = "braces",
		["ß"] = "ẞ",
		["\205\133"] = "", -- UTF-8 bytes of U+0345 ( ͅ); mapped to the empty string, so it is removed
		["\239\191\189"] = "replacement character",
	}
	-- Use the fixed replacement when there is one (the empty string counts, as it is
	-- truthy in Lua); otherwise fall back to the result of the `uupper` string method.
	local function map_one(ch)
		return replacements[ch] or ch:uupper()
	end
	local decomposed = toNFD(char)
	-- ".[\128-\191]*" matches one byte plus any following UTF-8 continuation bytes,
	-- i.e. a single encoded character. Only gsub's first return value is kept.
	local mapped = decomposed:gsub(".[\128-\191]*", map_one)
	return toNFC(mapped)
end
if full_langcode ~= "hi" and full_langcode ~= "lo" then
local standard_chars_scripts = {}
for _, head in ipairs(data.heads) do
standard_chars_scripts[head.sc:getCode()] = true
end
-- Iterate over the scripts, in case there is more than one (as they can have different sets of standard characters).
for code in pairs(standard_chars_scripts) do
local sc_standard = data.lang:getStandardCharacters(code)
if sc_standard then
if page.pagename_len > 1 then
local explode_standard = {}
-- Substitution callback: records the matched text as a key of `explode_standard`
-- and returns "" so that the match is removed from the string being processed.
local function explode(unit)
	explode_standard[unit] = true
	return ""
end
local sc_standard = ugsub(sc_standard, page.comb_chars.combined_double, explode)
sc_standard = ugsub(sc_standard,page.comb_chars.combined_single, explode)
:gsub(".[\128-\191]*", explode)
local num_cat_inserted
for char in pairs(page.explode_pagename) do
if not explode_standard[char] then
if char:find("[0-9]") then
if not num_cat_inserted then
insert(data.categories, full_langname .. " terms spelled with numbers")
num_cat_inserted = true
end
elseif ufind(char, page.emoji_pattern) then
insert(data.categories, full_langname .. " terms spelled with emoji")
else
local upper = char_category(char)
if not explode_standard[upper] then
char = upper
end
insert(data.categories, full_langname .. " terms spelled with " .. char)
end
end
end
end
-- If a diacritic doesn't appear in any of the standard characters, also categorise for it generally.
sc_standard = toNFD(sc_standard)
for diacritic in ugmatch(page.decompose_pagename, page.comb_chars.diacritics_single) do
if not umatch(sc_standard, diacritic) then
insert(data.categories, full_langname .. " terms spelled with ◌" .. diacritic)
end
end
for diacritic in ugmatch(page.decompose_pagename, page.comb_chars.diacritics_double) do
if not umatch(sc_standard, diacritic) then
insert(data.categories, full_langname .. " terms spelled with ◌" .. diacritic .. "◌")
end
end
end
end
-- Hindi and Lao are handled the old way for now, as their standard chars still need to be converted to the new format (because there are a lot of them).
elseif ulen(page.pagename) ~= 1 then
for character in ugmatch(page.pagename, "([^" .. standard .. "])") do
local upper = char_category(character)
if not umatch(upper, "[" .. standard .. "]") then
character = upper
end
insert(data.categories, full_langname .. " terms spelled with " .. character)
end
end
end
if data.heads[1].sc:isSystem("alphabet") then
local pagename, i = page.pagename:ulower(), 2
while umatch(pagename, "(%a)" .. ("%1"):rep(i)) do
i = i + 1
insert(data.categories, full_langname .. " terms with " .. i .. " consecutive instances of the same letter")
end
end
-- Categorise for palindromes
if not data.nopalindromecat and namespace ~= "Reconstruction" and ulen(page.pagename) > 2
-- FIXME: Use of first script here seems hacky. What is the clean way of doing this in the presence of
-- multiple scripts?
and is_palindrome(page.pagename, data.lang, data.heads[1].sc) then
insert(data.categories, full_langname .. " palindromes")
end
if namespace == "" and not data.lang:hasType("reconstructed") then
for _, head in ipairs(data.heads) do
if page.full_raw_pagename ~= get_link_page(remove_links(head.term), data.lang, head.sc) then
-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/pagename spelling mismatch]]
-- [[Special:WhatLinksHere/Wiktionary:Tracking/headword/pagename spelling mismatch/LANGCODE]]
track("pagename spelling mismatch", data.lang)
break
end
end
end
-- Add to various maintenance categories.
export.maintenance_cats(page, data.lang, data.categories, data.whole_page_categories)
------------ 9. Format and return headwords, genders, inflections and categories. ------------
-- Format and return all the gathered information. This may add more categories (e.g. gender/number categories),
-- so make sure we do it before evaluating `data.categories`.
local text = '<span class="headword-line">' ..
format_headword(data) ..
format_headword_genders(data) ..
format_inflections(data) .. '</span>'
-- Language-specific categories.
local cats = format_categories(
data.categories, data.lang, data.sort_key, page.encoded_pagename,
data.force_cat_output or test_force_categories, data.heads[1].sc
)
-- Language-agnostic categories.
local whole_page_cats = format_categories(
data.whole_page_categories, nil, "-"
)
return text .. cats .. whole_page_cats
end
return export