-- Module:ja-see/furigana


-- Table that collects the public functions of this module.
local export = {}
-- Characters and character ranges that this module treats as kanji.
-- Written without surrounding square brackets; the code that uses it wraps it
-- itself, as in '[' .. kanji_pattern .. ']'.
local kanji_pattern = "々一-鿿㐀-䶿﨎﨏﨑﨓﨔﨟﨡﨣﨤﨧-﨩𠀀-𪛟𪜀-𮯯𰀀-𱍏"

-- Returns an array of every way `kana` can be distributed over the kanji runs of `kanji`.
-- Each run is annotated as [run](reading); the text between runs is copied through unchanged.
-- For example, simple_match('物の哀れ', 'もののあわれ') returns
-- { '[物](も)の[哀](のあわ)れ', '[物](もの)の[哀](あわ)れ' }.
-- Runs of ASCII letters and digits are grouped exactly like kanji runs.
-- An empty array means the text around the runs does not line up with `kana`.
local function simple_match(kanji, kana)
	-- Wrap every run in backticks, e.g. '物の哀れ' becomes '`物`の`哀`れ'.
	-- The character set is built from kanji_pattern so it cannot drift from the rest of the module.
	local run_pattern = '([A-Za-z0-9' .. kanji_pattern .. '0-9A-Za-z]+)'
	local kanji_segments = mw.ustring.gsub(kanji, run_pattern, '`%1`')

	-- segments:  backtick-marked text still to be matched
	-- remaining: kana not yet consumed
	local function simple_match_rec(segments, remaining)
		local literal, kanji_portion, rest = mw.ustring.match(segments, '^(.-)`(.-)`(.*)$')
		if not literal then
			-- No marked run left: the tail has to equal the leftover kana exactly.
			return (segments == remaining) and { remaining } or {}
		end

		-- The text in front of the run must be a prefix of the kana. Compare it as plain
		-- text rather than as a pattern: it can contain pattern magic characters such as
		-- '(', ')', '.' or '%'. A byte-wise prefix test is safe because `literal` is made
		-- of whole UTF-8 characters.
		if remaining:sub(1, #literal) ~= literal then return {} end
		remaining = remaining:sub(#literal + 1)

		-- Try every non-empty reading for this run and keep those for which the rest also matches.
		local candidates = {}
		for i = 1, mw.ustring.len(remaining) do
			local annotated = literal .. '[' .. kanji_portion .. '](' .. mw.ustring.sub(remaining, 1, i) .. ')'
			for _, tail in ipairs(simple_match_rec(rest, mw.ustring.sub(remaining, i + 1))) do
				candidates[#candidates + 1] = annotated .. tail
			end
		end
		return candidates
	end

	return simple_match_rec(kanji_segments, kana)
end

-- Public wrapper around simple_match that always yields a single annotated string.
-- When simple_match finds exactly one candidate, that candidate is returned;
-- otherwise the whole of `kanji` is annotated with the whole of `kana`.
function export.simple_match(kanji, kana)
	local candidates = simple_match(kanji, kana)
	if #candidates ~= 1 then
		-- zero or several candidates: fall back to one undivided annotation
		return '[' .. kanji .. '](' .. kana .. ')'
	end
	return candidates[1]
end

-- Reads the wikitext of the entry `entry_title` and returns an array holding one string per
-- {{ja-kanjitab|...}} found in it, each in the [kanji](reading) format produced by simple_match.
-- For example, extract_kanjitab_from_entry('書留') returns { '[書](かき)[留](とめ)' }
-- if the 書留 entry contains {{ja-kanjitab|か|と|o1=き|o2=め|yomi=k}}.
-- Returns an empty array when the title is invalid, the page has no content,
-- or the page contains no kanjitab with arguments.
local function extract_kanjitab_from_entry(entry_title)
	-- mw.title.new returns nil for an invalid title; treat that like a page without content.
	local title = mw.title.new(entry_title)
	local entry_wikicode = title and title:getContent() or ''
	-- A bare `k=` / `o=` is stored as `k1` / `o1`.
	local key_aliases = { k = 'k1', o = 'o1' }
	local results = {}
	for kanjitab in mw.ustring.gmatch(entry_wikicode, '{{ja%-kanjitab|(.-)}}') do
		-- Replace the pipe inside [[target|label]] links with a backtick so that the split
		-- on '|' below does not cut a link in half. The backtick is not converted back.
		kanjitab = mw.ustring.gsub(kanjitab, '%[%[([^%[%]|]-)|([^%[%]|]-)%]%]', '[[%1`%2]]')

		-- Collect the template arguments: positional ones under 1, 2, 3, ...,
		-- named ones under their name (numeric names become numbers).
		local args, counter = {}, 1
		for arg in mw.text.gsplit(kanjitab, '|') do
			local k, v = mw.ustring.match(arg, '^(.-)=(.*)$')
			if k then
				k = key_aliases[k] or tonumber(k) or k
				args[k] = v
			else
				args[counter] = arg
				counter = counter + 1
			end
		end

		-- Walk over the kanji of the title and attach the reading given for each one.
		-- argpos: index of the next reading argument to use
		-- skip:   how many upcoming kanji still belong to the previous reading
		local argpos, skip = 1, 0
		local result = mw.ustring.gsub(entry_title, '[' .. kanji_pattern .. ']', function(kanji)
			if skip > 0 then
				skip = skip - 1
				-- mark this kanji so it is merged into the preceding bracket group below
				return '<CONCAT>' .. kanji
			end

			-- A positional reading is kana optionally followed by the number of kanji it spans.
			-- If the argument does not have that shape, keep the defaults instead of
			-- letting nil reach the concatenations below.
			local reading_kana, reading_length = '', nil
			if args[argpos] then
				local kana_part, length_part = mw.ustring.match(args[argpos], '^([^0-9]*)([0-9]*)$')
				if kana_part then
					reading_kana, reading_length = kana_part, length_part
				end
			end
			-- k<n> replaces the reading, o<n> is appended to it.
			if args['k' .. argpos] then reading_kana = args['k' .. argpos] end
			if args['o' .. argpos] then reading_kana = reading_kana .. args['o' .. argpos] end

			-- No (or an empty) length suffix means the reading covers a single kanji.
			reading_length = reading_length and tonumber(reading_length) or 1
			skip = reading_length - 1
			argpos = argpos + 1
			return '[' .. kanji .. '](' .. reading_kana .. ')'
		end)

		-- Each pass moves one marked kanji per group into the bracket group in front of it,
		-- e.g. '[今](きょう)<CONCAT>日' becomes '[今日](きょう)'. At most 10 passes are made.
		for _ = 1, 10 do
			if not mw.ustring.find(result, '<CONCAT>') then break end
			result = mw.ustring.gsub(result, '%[([^%[%]]+)%]%(([^%(%)]+)%)<CONCAT>(.)', '[%1%3](%2)')
		end
		results[#results + 1] = result
	end
	return results
end

-- Annotates `kanji` with `kana`, preferring the cheap simple match.
-- The simple match is trusted only when it yields exactly one candidate and that candidate
-- has no two kanji in a row (as in [書留](かきとめ)). Otherwise the entry named `kanji` is
-- read and the first of its kanjitabs whose readings spell out `kana` is returned.
-- If nothing fits, the whole of `kanji` is annotated with the whole of `kana`.
function export.accurate_match(kanji, kana)
	local candidates = simple_match(kanji, kana)
	local consecutive_kanji = '[' .. kanji_pattern .. '][' .. kanji_pattern .. ']'
	if #candidates == 1 and not mw.ustring.find(candidates[1], consecutive_kanji) then
		return candidates[1]
	end

	local kanjitabs = extract_kanjitab_from_entry(kanji)
	for index = 1, #kanjitabs do
		local annotated = kanjitabs[index]
		-- reduce every [kanji](reading) group to its reading and compare with the expected kana
		local spelled_out = mw.ustring.gsub(annotated, '%[([^%[%]]+)%]%(([^%(%)]+)%)', '%2')
		if spelled_out == kana then
			return annotated
		end
	end

	-- if all fails
	return '[' .. kanji .. '](' .. kana .. ')'
end

-- Hand the table of public functions to whoever loads this module.
return export