Module:User:Surjection/unpacker

This is a private module sandbox of Surjection, for their own experimentation. Items in this module may be added and removed at Surjection's discretion; do not rely on this module's stability.


-- Table of public functions returned to whoever loads this module.
local export = {}
--local data = "\001fi\004foo\019bar\001"

--description of format:
--\001TOP_LEVEL_KEY\004 -> used to search a top level key
--value is expected to be a table. table format:
--		a key followed by \001, \003, \016-\020.
--		if key is empty, the key is taken as the next available numeric index
--				if the key is only \002, it is taken as the empty string
--				if it starts with \002, the rest of the key is parsed as a number.
--				if it starts with \005, the character code of the next character is used as a numeric index into common_keys
--				otherwise it is taken as a string key.
--		\001 means "end of all tables, a top-level key is up next"
--		\003 means "end of this table only"
--		\016-\020 are data types.
--			\016 is NIL (followed by nothing),
--			\017 is BOOL (followed by one byte, 0 = false, anything else is true)
--			\018 is NUMBER. anything until a \001 (end of top-level table), \002 (next index), \003 (end of this table), \005 (compressed key), \016-\020 (data type), \031 (end of value, if ambiguous otherwise i.e. if a string key follows) is captured and converted into a number.
--			\019 is STRING. anything until a \001 (end of top-level table), \002 (next index), \003 (end of this table), \005 (compressed key), \016-\020 (data type), \031 (end of value, if ambiguous otherwise i.e. if a string key follows) is captured and stored as a string.
--			\020 is TABLE and starts a nested table. go back to step 1, expecting a key. \003 ends a nested table, \001 ends all tables.
--				no \031 should follow, we know when a table ends and don't need a special terminator.
--\001 -> end of top-level keys
--in theory, we could have an escape code to allow these characters in keys and strings. that is not implemented, because there is currently no need.

-- Key names that packed data refers to by position instead of spelling them
-- out: in the packed string, \005 followed by a byte N stands for
-- common_keys[N] (see unpack_row). Because the lookup is positional, the
-- order of this list has to stay in sync with whatever produced `data`.
local common_keys = {
	"from", "remove_diacritics",
	"type", "ancestors", "wikimedia_codes", "wikipedia_article", "translit",
	"link_tr", "display_text", "entry_name", "sort_key", "dotted_dotless_i",
	"standardChars"
}
-- The packed data that export.find_key searches. Each top-level entry is
-- written as \001 KEY \004 followed by the packed table for that key; this
-- string currently holds a single entry, "fi". The bytes are significant
-- (control characters, combining diacritics), so do not reformat the literal.
local data = "\001fi\004\019Finnish\0181412\019urj-fin\019Latn\005\010\020\005\001\020\019\003to\020\019'\003\005\002\019ˣ\003\005\011\020\005\001\020\019ø\019æ\019œ\019ß\003to\020\019o\019ae\019oe\019ss\003\005\002\019̧̀́̂̃̋̌':\003\005\003\019regular\005\009\020to\020\019\003\005\001\020\019'\003\003\005\013\019AaBbDdEeFfGgHhIiJjKkLlMmNnOoPpRrSsTtUuVvYyÄäÖö ',%-–…∅\001"

--- Decode one packed table, starting at byte position `index` of `packed`.
--
-- Each entry is a key, a one-byte marker, and (depending on the marker) a
-- value. Markers \016-\020 are value types (NIL, BOOL, NUMBER, STRING, TABLE);
-- \003 ends the current table and \001 ends every open table.
--
-- @param packed       string holding the packed data
-- @param index        1-based byte position at which this table's entries start
-- @param common_keys  array of key names addressed by \005-compressed keys
-- @return the decoded table
-- @return the position at which decoding stopped: just past a closing \003,
--         or ON a closing \001 (which is left unconsumed so that every
--         enclosing table sees it and closes too)
local function unpack_row(packed, index, common_keys)
	local result = {}
	local packed_len = packed:len()

	while index <= packed_len do
		-- Read the key. Afterwards `index` points just past the marker byte
		-- that follows the key, i.e. the marker itself sits at index - 1.
		local table_key, table_value
		if packed:byte(index) == 5 then
			-- \005 N MARKER: compressed key. N is taken as a raw byte, so it
			-- may legitimately be \001 or \003; that is why this case is
			-- handled before the pattern match below.
			table_key = common_keys[packed:byte(index + 1)]
			index = index + 3
		else
			-- A `*` pattern always matches at `index` (possibly the empty
			-- string), so the capture is never nil.
			table_key = packed:match("([^\001\003\016-\020]*)", index)
			index = index + table_key:len() + 1
		end

		local marker = packed:byte(index - 1)
		if marker == nil then
			-- The data ran out before a marker byte was found: treat the
			-- truncated tail as the end of the data instead of failing on
			-- arithmetic with nil.
			break
		end
		local table_value_type = marker - 16

		if table_value_type < 0 then			-- end of table
			if table_value_type == -15 then		-- \001 also closes the enclosing tables: do not consume it
				index = index - 1
			end
			break
		end

		if table_key == nil then
			-- Only reachable for a compressed key whose index has no entry.
			error("Unknown compressed key index in packed data")
		end

		if table_key:len() == 0 then
			-- empty key: use the next free array slot
			table_key = #result + 1
		elseif table_key:byte() == 2 then
			-- \002 prefix: numeric key; a bare \002 stands for the empty string
			table_key = tonumber(table_key:sub(2)) or ""
		end

		if table_value_type == 0 then			-- \016 NIL
			table_value = nil
		elseif table_value_type == 1 then		-- \017 BOOL
			-- Store the flag in table_value; writing straight into `result`
			-- here would be undone by the shared assignment below.
			table_value = packed:byte(index) > 0
			index = index + 1
		elseif table_value_type == 4 then		-- \020 TABLE
			table_value, index = unpack_row(packed, index, common_keys)
		else
			-- NUMBER and STRING share a body: everything up to the next
			-- control byte that can start a key, a marker or a separator.
			local capture = packed:match("([^\001-\003\005\016-\020\031]*)", index)
			if table_value_type == 2 then		-- \018 NUMBER
				table_value = tonumber(capture)
			else								-- \019 STRING
				table_value = capture
			end
			index = index + capture:len()
			if packed:byte(index) == 31 then		-- skip the explicit value separator
				index = index + 1
			end
		end

		result[table_key] = table_value
	end

	return result, index
end

--- Look up a top-level key in the packed `data` string and decode its value.
-- @param key  the top-level key to search for, as literal text (e.g. "fi")
-- @return the decoded table stored under that key
-- Raises "Key not found" when `data` contains no entry for `key`.
function export.find_key(key)
	-- Top-level entries are written as \001 KEY \004. Search in plain-text
	-- mode: the key is literal text, and characters such as "-" or "." would
	-- otherwise be interpreted as Lua pattern magic and break the lookup.
	local needle = "\001" .. key .. "\004"
	local _, end_index = data:find(needle, 1, true)
	if not end_index then error("Key not found") end
	-- Parentheses drop unpack_row's second result (the stop position).
	return (unpack_row(data, end_index + 1, common_keys))
end

-- Hand the table of public functions (find_key) to whoever loads this module.
return export