Module:ja-parse/sandbox

This module lacks a documentation subpage. Please create it.
Useful links: root page • root page’s subpages • links • transclusions • testcases • sandbox of (diff)
--[=[
TODO: Handle cases like this:

===Pronoun===
{{ja-pos|pronoun|かのじょ}}

# {{ja-def|彼女}} [[she]]; [[her]]

===Noun===
{{ja-noun|かのじょ|カノジョ}}

# {{ja-def|彼女|カノジョ}} [[girlfriend]]

The {{ja-see}} template on [[カノジョ]] should take care not to categorize the term as a pronoun, in addition to ignoring the first {{ja-def}} definition line.
]=]

local export = {}

local len = mw.ustring.len
local sub = mw.ustring.sub
local gsub = mw.ustring.gsub
local find = mw.ustring.find
local match = mw.ustring.match
local gmatch = mw.ustring.gmatch

local m_ja = require('Module:ja')

-- Contents of a character class (without the surrounding brackets) listing
-- kanji ranges and individual kanji code points. It is wrapped in '[' .. ']'
-- at the point of use and handed to the mw.ustring functions.
local kanji_pattern = "一-鿿㐀-䶿﨎﨏﨑﨓﨔﨟﨡﨣﨤﨧-﨩𠀀-𪛟𪜀-𮯯𰀀-𱍏"
-- Same idea, but for any character accepted as part of a Japanese spelling:
-- kana ranges and the long-vowel mark, everything in kanji_pattern, fullwidth
-- Latin letters and digits, plus 〆 and 々.
local japanese_pattern = 'ぁ-ゖァ-ヺー' .. kanji_pattern .. 'ａ-ｚＡ-Ｚ０-９〆々'

-- Set of template names that are treated as headword templates.
-- find_headword_template() searches for the same names, hard-coded as patterns.
local headword_templates = {
	['ja-adj'] = true, ['ja-pos'] = true, ['ja-noun'] = true, ['ja-phrase'] = true,
	['ja-verb'] = true, ['ja-verb form'] = true, ['ja-verb-suru'] = true, 
}

-- Returns the full wikitext of one headword template call found in `wikitext`,
-- or nothing when none of the known headword templates is present.
-- The templates are tried in a fixed priority order (not in document order):
-- the first pattern in the list below that occurs anywhere in the text wins.
local function find_headword_template(wikitext)
	local opening_patterns = {
		'{{ja%-adj[|}]',
		'{{ja%-pos[|}]',
		'{{ja%-noun[|}]',
		'{{ja%-phrase[|}]',
		'{{ja%-verb[|}]',
		'{{ja%-verb form[|}]',
		'{{ja%-verb%-suru[|}]',
	}

	local start_pos
	for _, pattern in ipairs(opening_patterns) do
		start_pos = wikitext:find(pattern)
		if start_pos then
			break
		end
	end

	if not start_pos then
		return
	end

	-- %b{} grabs the balanced {...} span beginning at the opening braces;
	-- this relies on the template having matching braces.
	return wikitext:match('%b{}', start_pos)
end

-- Reports whether `item` occurs (by ==) among the array entries of `list`.
local function contains(list, item)
	local count = #list
	local index = 1
	while index <= count do
		if list[index] == item then
			return true
		end
		index = index + 1
	end
	return false
end

-- Returns the part of `source` that lies after the first match of `str1` and
-- before the next match of `str2`; if `str2` does not occur after `str1`, the
-- rest of the string is returned. Returns nil when `str1` is not found.
-- Both `str1` and `str2` are Lua patterns, not plain strings.
local function match_between(source, str1, str2)
	local _, start_end = source:find(str1)
	if not start_end then
		return nil
	end
	-- Start looking for the terminator strictly after the opening match, so
	-- that it can never overlap the last character matched by str1.
	local stop = source:find(str2, start_end + 1)
	return source:sub(start_end + 1, stop and stop - 1)
end

-- Splits `wikitext` at its level-3 headers (===Header===).
-- Returns:
--   1. a list { { header, content }, ... } in document order. The first item
--      has header "" and holds the text above the first level-3 header; every
--      other item's content starts with its own header line.
--   2. true if a header reading exactly "Etymology 1" was seen, else false.
local function get_l3_sections(wikitext)
	local l3_sections = {}
	local multi_etym = false

	-- Special hack: when the text has no ===Etymology 1=== header, put an
	-- empty header "=== ===" in front of every {{ja-spellings}} and
	-- {{ja-kanjitab}} so that what follows them starts a section of its own.
	if not wikitext:find('===Etymology 1===') then
		wikitext = wikitext:gsub('{{ja%-spellings', '=== ===\n{{ja-spellings')
		wikitext = wikitext:gsub('{{ja%-kanjitab', '=== ===\n{{ja-kanjitab')
	end

	local prev_header, prev_header_start, prev_header_end
	while true do
		-- %f[=] demands that the three '=' are the start of a run of '=', so
		-- deeper headers (====Noun====) are not picked up. All three results
		-- are nil once no further header exists.
		local header_start, header_end, header =
			wikitext:find('%f[=]===([^=]+)===', prev_header_end)

		if header == 'Etymology 1' then
			multi_etym = true
		end

		-- From the previous header (or the start of the text) up to just
		-- before this header; an end index of -1 means "to the end of text".
		local section_content = wikitext:sub(prev_header_start or 0, (header_start or 0) - 1)
		table.insert(l3_sections, { prev_header or '', section_content })

		if not header_start then
			break
		end

		prev_header, prev_header_start, prev_header_end = header, header_start, header_end
	end

	return l3_sections, multi_etym
end


-- Collects every run of Japanese characters (see japanese_pattern) found in
-- `template_text` into a list. Returns an empty list for nil input, which is
-- what a template with unbalanced braces produces.
local function collect_keys(template_text)
	local keys = {}
	if template_text then
		for v in gmatch(template_text, '[' .. japanese_pattern .. ']+') do
			table.insert(keys, v)
		end
	end
	return keys
end

-- Parses the Japanese entry on page `lemma` and returns a list of etym
-- sections, each of the form
--   { wikitext, type = ( 'lemma' | 'redirect' | '' ), keys = <list of alternative spellings> }.
-- With multiple etymologies, each ===Etymology n=== part is one etym section.
-- Otherwise the whole Japanese section minus any ===Kanji [n]=== subsections
-- is a single etym section. An empty list is returned when the page cannot be
-- read or has no ==Japanese== section.
--
-- The type is decided per etym section:
--   'redirect' if it contains {{ja-see}} or {{ja-see-kango}} (keys come from that template);
--   'lemma'    if it contains {{ja-spellings}}, or failing that a headword
--              template (keys come from that template);
--   ''         otherwise (keys is empty).
--
-- Note: sections are divided strictly by L3 headers. As a result:
-- (1) If an entry describes both a kanji and a single word, any templates
--     beginning the word (such as {{ja-spellings}}) would be considered part
--     of the kanji section above. Only the cases of {{ja-spellings}} and
--     {{ja-kanjitab}} are remedied, by inserting an empty header === === above
--     them before parsing. (Entries with multiple etymologies do not have this
--     problem, since each word must begin with ===Etymology n===.)
-- (2) If an entry describes multiple words, word-specific templates such as
--     {{topics|ja|Biology}} must be placed at the end of the relevant word
--     instead of the whole entry. If they are put at the end of the
--     ==Japanese== entry, they will be considered part of either the final
--     word or an additional section such as ===References===, and ignored when
--     {{ja-see}} copies categories around.
local function extract_etym_sections(lemma)
	local title = mw.title.new(lemma)
	local page = title and title:getContent() or ''
	local l2 = match_between(page, '==Japanese==\n', '%-%-%-%-')
	if not l2 then
		return {}
	end

	-- Also stop at the next level-2 header, so that other languages' sections
	-- are not swallowed when the page has no ---- separator after Japanese.
	local next_l2 = l2:find('\n==[^=]')
	if next_l2 then
		l2 = l2:sub(1, next_l2)
	end

	-- split into L3 sections
	local l3_sections, multi_etym = get_l3_sections(l2)

	-- group the L3 sections into etym sections
	local etym_sections = {}
	if multi_etym then
		for _, v in ipairs(l3_sections) do
			local header, content = v[1], v[2]
			if find(header, '^Etymology %d+$') then
				table.insert(etym_sections, content)
			end
		end
	else
		local word = {}
		for _, v in ipairs(l3_sections) do
			local header, content = v[1], v[2]
			if not (header and header:find("^Kanji ?%d*$")) then
				table.insert(word, content)
			end
		end
		table.insert(etym_sections, table.concat(word, '\n'))
	end

	-- finally, determine the type of each etym section
	for i = 1, #etym_sections do
		local etym_section = etym_sections[i]
		local section_type, keys = '', {}

		-- The byte-based string.find is used on purpose: its result is fed to
		-- string.match as a start offset, which is counted in bytes. The
		-- patterns are pure ASCII, so this is safe on UTF-8 text.
		local ja_see = etym_section:find('{{ja%-see[|}]')
			or etym_section:find('{{ja%-see%-kango[|}]')
		if ja_see then
			section_type = 'redirect'
			keys = collect_keys(etym_section:match('%b{}', ja_see))
		else
			local ja_forms = etym_section:find('{{ja%-spellings[|}]')
			if ja_forms then
				section_type = 'lemma'
				keys = collect_keys(etym_section:match('%b{}', ja_forms))
			else
				local headword_template = find_headword_template(etym_section)
				if headword_template then
					section_type = 'lemma'
					-- Drop spaces, '^', '-' and '.' first so that a spelling
					-- written with those separators is read as a single key.
					-- The extra parentheses discard gsub's second result.
					keys = collect_keys((headword_template:gsub('[ ^%-%.]', '')))
				end
			end
		end

		etym_sections[i] = { etym_section, type = section_type, keys = keys }
	end
	return etym_sections
end

export.extract_etym_sections = extract_etym_sections

-- Built on extract_etym_sections(): keeps only the etym sections of type
-- 'lemma' whose keys include the alternative spelling `key`, and returns their
-- wikitext joined with newlines (an empty string if nothing qualifies).
function export.get_etym_section(lemma, key, frame)
	local matching = {}
	for _, section in ipairs(extract_etym_sections(lemma, frame)) do
		local is_lemma = section.type == 'lemma'
		if is_lemma and contains(section.keys, key) then
			matching[#matching + 1] = section[1]
		end
	end
	return table.concat(matching, '\n')
end


-- Set of template names (exact, case-sensitive) whose calls are not to be
-- expanded when categories are collected. extract_definitions_and_categories()
-- rewrites the opening of any such call to '{{=' before preprocessing.
local templates_to_exclude = {
	-- These templates are ignored as an optimization since they don't generate categories.
	['m'] = true, ['l'] = true, ['ja-l'] = true, ['ja-r'] = true, ['gloss'] = true,
	['w'] = true, ['wp'] = true, ['swp'] = true, ['wikipedia'] = true,
	['lang'] = true, ['furigana'] = true, ['wj'] = true, ['lj'] = true, ['ruby/ja-w2'] = true, ['ruby/ja'] = true, ['ruby'] = true,
	['ja-kanji forms'] = true, ['w2'] = true, ['sense'] = true, 
	['IPAfont'] = true, ['IPAchar'] = true,
	['ja-adj-infl'] = true, ['ja-i'] = true, ['ja-na'] = true, ['ja-adjdecl'] = true, ['ja-decl-na'] = true, ['ja-go-bu'] = true, ['ja-go-gu'] = true, ['ja-go-ku'] = true, ['ja-go-mu'] = true, ['ja-go-nu'] = true, ['ja-go-ou'] = true, ['ja-go-ru'] = true, ['ja-go-su'] = true, ['ja-go-tsu'] = true, ['ja-go-u'] = true, ['ja-honorific'] = true, ['ja-ichi'] = true, ['ja-kuru'] = true, ['ja-suru'] = true, ['ja-suru-i-ku'] = true, ['ja-suru-tsu'] = true, ['ja-verbconj'] = true, ['ja-verbconj-auto'] = true, ['ja-verbconj-row'] = true, ['ja-verbconjugation'] = true, ['ja-zuru'] = true, 
	['ja-kanji spellings'] = true, ['ja-ks'] = true, ['ja-spellings'] = true, ['ja-forms'] = true,
	['Japanese first-person pronouns'] = true, ['der-top'] = true, ['der-bottom'] = true, 
	['der-mid'] = true, ['der-top3'] = true, ['der-top4'] = true, ['der-top5'] = true, ['rel-top'] = true, 
	
	-- These templates are ignored since they generate categories that are spelling-specific or that we're not interested in
	['ja-kanjitab'] = true, ['ateji'] = true, ['ja-ateji'] = true, ['ja-kanji'] = true, ['ja-readings'] = true, 
	['ja-def'] = true, ['synonyms'] = true,
}
-- Splits the wikitext of an etym section into definition lines and categories.
--
-- Parameters:
--   wikitext  the wikitext to scan.
--   source    a term; when m_ja.script() reports it as kana, it is inserted as
--             the first parameter of every headword template before expansion.
--             (Presumably the title of the page the wikitext came from -
--             confirm against the caller.)
--   key       the spelling being looked up. If it contains kanji, {{ja-def}}
--             lines that do not list it as a parameter are dropped.
--   frame     object whose :preprocess() method expands the remaining wikitext.
--
-- Returns:
--   def  list of { line, pos = <text of the most recent header, without '='> }
--        for every kept list line ('#', '##', ...).
--   cat  a string made of all [[Category:...]] links found after expanding the
--        text that is neither a header nor a definition line.
function export.extract_definitions_and_categories(wikitext, source, key, frame)
	local def = {}
	local cat_parts = {}
	local current_section = ''
	local key_contains_kanji = find(key, '[' .. kanji_pattern .. ']') ~= nil
	local prev_pos = 0

	-- Whether a {{ja-def}} line lists `key` as one of its parameters. Plain
	-- (non-pattern) searches are used so that magic characters in `key` are
	-- taken literally.
	local function ja_def_mentions_key(line)
		return line:find('|' .. key .. '|', 1, true) ~= nil
			or line:find('|' .. key .. '}', 1, true) ~= nil
	end

	-- Search for headers and definition lines (numbered lists) and insert
	-- any intervening text into cat_parts. Lines whose marker is directly
	-- followed by ':' or '*' (examples, quotations) do not match and are
	-- therefore left in the intervening text.
	for start_pos, line, marker, end_pos in wikitext:gmatch('%f[^\n]()(([#=]+)%f[^#=:*][^\n]+)()') do
		-- `marker` is the whole run of '#'/'=' characters, so test only its
		-- first character; otherwise '##' sub-definitions would be mistaken
		-- for headers and clobber current_section.
		if marker:sub(1, 1) == '#' then
			local is_rfdef = line:find('{{rfdef', 1, true)
			local is_other_spelling = key_contains_kanji
				and line:find('{{ja-def|', 1, true)
				and not ja_def_mentions_key(line)
			if not is_rfdef and not is_other_spelling then
				table.insert(def, { line, pos = current_section })
			end
		else
			current_section = line:match("^=*(.-)=*$")
		end
		if start_pos - prev_pos > 1 then
			table.insert(cat_parts, wikitext:sub(prev_pos + 1, start_pos - 1))
		end
		prev_pos = end_pos
	end
	table.insert(cat_parts, wikitext:sub(prev_pos + 1)) -- add last bit

	-- expand the other parts for categories
	local cat = table.concat(cat_parts, '\n')
	cat = cat:gsub('<ref[ >].-</ref>', '')
	cat = cat:gsub('<references/>', '')

	-- Rewrites the opening of one template call. If the template begins with
	-- "{{ja-usex|", a is "ja-usex" and b is "|".
	local function process_template_header(a, b)
		if templates_to_exclude[a] then
			return '{{=' .. b
		elseif headword_templates[a] then
			local source_script = m_ja.script(source)
			if source_script == 'Hira' or source_script == 'Kana' or source_script == 'Hira+Kana' then
				-- Kana source: pass it to the headword template as first parameter.
				return '{{' .. a .. '|' .. source .. b
			else
				return '{{' .. a .. b
			end
		elseif a == 'ja-usex' or a:find('^quote') then -- special hack
			-- Emit the category directly instead of expanding the template.
			return '[[Category:Japanese terms with usage examples]]{{=' .. b
		else
			return '{{' .. a .. b
		end
	end
	cat = cat:gsub('{{(.-)%s*([|}])', process_template_header)

	-- Replace each {{ja-pron}} call by the categories derived from its
	-- parameters rather than expanding it.
	cat = cat:gsub('{{ja%-pron.-}}', function(pron)
		local result = {}
		if not find(pron, '|noipa=') then
			table.insert(result, '[[Category:Japanese terms with IPA pronunciation]]')
		end
		if find(pron, '|a=') or find(pron, '|audio=') then
			table.insert(result, '[[Category:Japanese terms with audio links]]')
		end
		return table.concat(result)
	end)
	cat = frame:preprocess(cat)

	-- Keep only the category links from the expanded text.
	local cat_links = {}
	for link in cat:gmatch('%[%[Category:.-%]%]') do
		table.insert(cat_links, link)
	end
	cat = table.concat(cat_links)
	-- one might want to modify the sortkeys here

	return def, cat
end

return export