-- Module:User:Erutuon/07


local export = {}

-- Maximum number of accepted matches that match_to_array collects before it
-- stops scanning its input.
local word_limit = 2000

local m_fun = require "Module:fun"
local m_table = require "Module:table"

-- NFD normalization from the Scribunto ustring library; process_word applies
-- it before looking at individual characters.
local decompose = mw.ustring.toNFD

-- Builds a UTF-8 string from code points.
local U = mw.ustring.char
-- Accent characters that process_word treats specially.
local acute = U(0x301)
local grave = U(0x300)
-- Note: this is U+0342, not U+0302.
local circumflex = U(0x342)

-- Byte pattern: a lead byte 0xCC or 0xCD followed by one continuation byte,
-- which matches U+0300-U+037F.
local diacritic = "[\204-\205][\128-\191]"
-- Byte pattern for one UTF-8 encoded character: an ASCII byte or a lead byte,
-- followed by any number of continuation bytes.
local UTF8_char = "[%z\1-\127\194-\244][\128-\191]*"

-- NOTE(review): presumably named for the role this raised dot plays as
-- punctuation in Greek text; it is removed by the replacements table.
local semicolon = "·"

-- Scans str for matches of patt and returns an array of processed matches.
-- A match is kept only if filter_func(match) is truthy; the stored value is
-- process_func(match). Scanning stops once word_limit matches have been kept.
local function match_to_array(str, patt, filter_func, process_func)
	local results = {}
	local kept = 0
	local next_match = str:gmatch(patt)
	local piece = next_match()
	while piece ~= nil do
		if filter_func(piece) then
			kept = kept + 1
			results[kept] = process_func(piece)
			-- Cap the amount of work done on long texts.
			if kept == word_limit then
				break
			end
		end
		piece = next_match()
	end
	return results
end

-- Per-character substitutions looked up by process_word.
-- Two characters are mapped to a different character...
local replacements = {
	[grave] = acute,
	["'"] = "’",
}
-- ...and the following punctuation is deleted outright.
for _, punctuation in ipairs { "[", "]", "\"", ",", semicolon, ".", "«", "»", ";" } do
	replacements[punctuation] = ""
end

-- Normalizes one word for counting. The word is decomposed with
-- mw.ustring.toNFD, then each UTF-8 character is visited in order:
--   * the first acute, grave or circumflex is kept (a grave becomes an acute);
--   * every later acute, grave or circumflex is deleted;
--   * any other character is replaced according to the replacements table,
--     or left unchanged if it has no entry there.
-- The function is wrapped in m_fun.memoize and returns only the normalized
-- string.
local process_word = m_fun.memoize(function (word)
	local found_accent = false
	-- string.gsub returns the new string AND the number of matches. Assigning
	-- to a single local drops the count so it is not returned to callers.
	local normalized = decompose(word):gsub(
		UTF8_char,
		function (char)
			if char == acute or char == grave or char == circumflex then
				-- Remove all but the first accent in the word.
				if found_accent then
					return ""
				end
				found_accent = true
				if char == grave then
					return acute
				end
				return nil -- nil tells gsub to keep the character as it is
			end

			-- A nil lookup result likewise leaves the character unchanged.
			return replacements[char]
		end)
	return normalized
end)

-- Converts a displayed word into the page name to link to by replacing the
-- typographic apostrophe with the ASCII apostrophe.
-- No macrons or breves in Odyssey text, so nothing else is stripped.
-- Returns the page name only.
local function make_entry_name(word)
	-- The extra parentheses truncate gsub's (string, count) result to the
	-- string, so the substitution count does not leak to callers.
	return (word:gsub("’", "'"))
end

-- Builds the wikitext for a link to the Ancient Greek section of the entry
-- for text, wrapped in a span with class "polytonic" and lang "grc".
-- The link target comes from make_entry_name; the label is text itself.
local function link(text)
	local target = make_entry_name(text)
	return table.concat {
		'<span class="polytonic" lang="grc">[[',
		target,
		'#Ancient Greek|',
		text,
		']]</span>',
	}
end

-- Tallies the items of a sequence.
-- Returns a table mapping each distinct item to the number of times it
-- occurs in array.
local function count(array)
	local tally = {}
	for _, value in ipairs(array) do
		local seen = tally[value]
		if seen then
			tally[value] = seen + 1
		else
			tally[value] = 1
		end
	end
	return tally
end

-- Formats one bulleted list line: the linked word followed by its count in
-- parentheses. The count comes first in the argument list.
local function process_count(count, word)
	return table.concat { "* ", link(word), " (", count, ")" }
end

local ugsub = mw.ustring.gsub
local ulower = mw.ustring.lower
-- Produces the key used to order words alphabetically: the word is
-- lowercased with mw.ustring.lower and every character matched by the
-- `diacritic` byte pattern is removed. Wrapped in m_fun.memoize.
-- Returns the key only.
local remove_diacritics = m_fun.memoize(function (word)
	-- string.gsub returns the new string AND the number of matches; keeping
	-- just the first value stops the count from being returned to callers.
	local stripped = ulower(word):gsub(diacritic, "")
	return stripped
end)

-- Returns a comparison function for sorting words, given a table that maps
-- each word to its count. Ordering:
--   1. higher count first;
--   2. for equal counts, by the word with diacritics removed;
--   3. if those are equal too, by the words themselves, so that words that
--      differ only in diacritics always come out in the same order.
local function count_comp_gen(count)
	return function(word1, word2)
		local count1, count2 = count[word1], count[word2]
		if count1 ~= count2 then
			return count1 > count2
		end

		local key1, key2 = remove_diacritics(word1), remove_diacritics(word2)
		if key1 ~= key2 then
			return key1 < key2
		end

		-- Final tie-break; without it the relative order of such words would
		-- be left to the sorting algorithm and the iteration order of the
		-- table being sorted.
		return word1 < word2
	end
end

-- Entry point. Reads the text stored inside the first HTML comment of the
-- documentation subpage, counts the words in it, and returns a wikitext
-- bulleted list of the words with their counts, sorted with the comparison
-- function from count_comp_gen.
-- Only whitespace-separated tokens containing at least one non-ASCII byte
-- are counted. The frame argument is not used.
-- Raises an error with an explanatory message if the page cannot be read or
-- contains no HTML comment.
function export.show(frame)
	local source_page = "Module:User:Erutuon/07/documentation"

	-- getContent returns nil when the page does not exist.
	local title = mw.title.new(source_page)
	local content = title and title:getContent()
	if not content then
		error("Could not read the content of [[" .. source_page .. "]]")
	end

	local Odyssey1 = content:match "<!%-%-(.-)%-%->"
	if not Odyssey1 then
		error("No HTML comment containing the text was found in [[" .. source_page .. "]]")
	end

	local function has_non_ascii(word)
		return word:find "[\128-\255]"
	end

	local words = match_to_array(Odyssey1, "%S+", has_non_ascii, process_word)
	-- Named `counts` so that the count function is not shadowed.
	local counts = count(words)

	local lines = m_fun.mapIter(
		process_count,
		m_table.sortedPairs(counts, count_comp_gen(counts)))
	return table.concat(lines, "\n")
end

return export