User:Kephir/Unicode

Below are scripts I use for maintaining the Unicode database at Module:Unicode data.

How to use the scripts on this page

Ingredients: GNU Make, GNU awk, GNU wget.

  1. Put the scripts below in a dedicated directory.
  2. Run make clean, then make.
  3. The Unicode database will be downloaded, and several .lua files will be generated from it. Save them in appropriate locations as specified in the table below.
  4. Update the Unicode version number wherever you find it (e.g. Module:character list).
  5. Done. Throw a drinking party.
File name | Wiktionary page
aliases.lua | Module:Unicode data/aliases
blocks.lua | the blocks table in Module:Unicode data
combining.lua | Module:Unicode data/combining
control.lua | Module:Unicode data/control (includes Cx and Zx)
names_XXX.lua | Module:Unicode data/names/XXX

Scripts

Makefile
# Builds the Lua data tables from the Unicode Character Database files.
# Requires GNU Make, gawk and wget.

.PHONY: all names clean update

# Delete a target whose recipe failed, so that a truncated download or a
# half-written .lua file is not mistaken for an up-to-date one on the next run.
.DELETE_ON_ERROR:

all: blocks.lua control.lua aliases.lua combining.lua names

# Removes every generated table and every downloaded data file.
clean:
	rm -f *.lua *.txt

# names.awk opens its names_XXX.lua output files itself, so nothing is
# redirected here and the target is declared phony.
names: names.awk UnicodeData.txt
	gawk -F';' -f $^

# Runs the ./upload script (not part of this Makefile) after a full build.
update: all
	./upload

# Generic generator: run foo.awk over the data file(s) listed as extra
# prerequisites of foo.lua further down and capture stdout as foo.lua.
# $< is the prerequisite contributed by this pattern rule (the awk script);
# naming it explicitly keeps "-f" bound to the script regardless of the order
# in which the remaining prerequisites end up in $^.
%.lua: %.awk
	gawk -F';' -f $< $(filter-out $<,$^) >'$@'

# Derived*.txt files are fetched from the "extracted" subdirectory.
Derived%.txt:
	wget 'http://unicode.org/Public/UNIDATA/extracted/$@' -O '$@'

# Every other .txt data file is fetched from the top-level UNIDATA directory.
%.txt:
	wget 'http://unicode.org/Public/UNIDATA/$@' -O '$@'

# Data file consumed by each generated table (the recipe comes from %.lua).
blocks.lua: Blocks.txt

control.lua: DerivedGeneralCategory.txt

aliases.lua: NameAliases.txt

combining.lua: DerivedCombiningClass.txt

# NOTE(review): this pattern rule has no recipe, so on its own it does not tell
# make how to produce a names_XXX.lua file; those files are written by the
# `names` target above.  Confirm whether this line is still needed.
names_%.lua: names
aliases.awk
# aliases.awk -- convert name-alias records into a Lua table keyed by code point.
#
# Expects ';' as the field separator (the Makefile passes -F';'):
#   $1 = code point in hex, $2 = alias text, $3 = alias type.
# Requires gawk (strtonum, arrays of arrays, hex constants).

# Data records start with a hex digit; anything else is skipped.
/^[0-9A-F]/ {
	codepoint = strtonum("0x" $1)
	# Append "alias;type" in the next free slot for this code point.  An
	# explicit per-code-point counter is used instead of length() on a
	# subarray that may not exist yet, and lets END replay input order.
	codepoints[codepoint][alias_count[codepoint]++] = $2 ";" $3
}

END {
	# The alias types are emitted as bare Lua identifiers below, so declare
	# matching locals that hold the corresponding strings.
	print "local correction, control, alternate, figment, abbreviation = "
	print "\t\"correction\", \"control\", \"alternate\", \"figment\", \"abbreviation\""
	print ""
	print "return {"
	# Walk the whole code space in ascending order so that the output is
	# sorted.  The upper bound is inclusive: U+10FFFF is a valid code point.
	for (cpoint = 0; cpoint <= 0x10ffff; ++cpoint) {
		if (!(cpoint in codepoints))
			continue
		printf "\t[0x%06x] = {\n", cpoint
		# Iterate by index rather than "for (i in ...)": gawk does not
		# guarantee any traversal order for the latter, which made the order
		# of aliases in the output unpredictable.
		for (i = 0; i < alias_count[cpoint]; ++i) {
			split(codepoints[cpoint][i], fields, ";")
			# fields[1] = alias text, fields[2] = alias type
			printf "\t\t{ %12s, \"%s\" };\n", fields[2], fields[1]
		}
		printf "\t};\n"
	}
	print "}"
}
names.awk
# names.awk -- split character names into one Lua file per 0x1000 code points.
#
# Expects ';' as the field separator (the Makefile passes -F';'):
#   $1 = code point in hex, $2 = character name.
# Output goes to names_XXX.lua, where XXX is the code point divided by 0x1000,
# printed as at least three uppercase hex digits.
# Like the page-switch logic it replaces, this relies on all records of one
# page being contiguous in the input.

BEGIN {
	out = ""	# path of the page file currently being written; "" = none yet
}

# Skip records whose name field starts with "<" (placeholder entries such as
# "<control>"); only records with a real name are written.
/^[0-9A-Fa-f]/ && $2 !~ /^</ {
	cp = strtonum("0x" $1)
	page = sprintf("%03X", int(cp / 0x1000))
	target = "names_" page ".lua"
	if (target != out) {
		# Entering a new page: terminate and close the previous file first.
		if (out != "") {
			print "}" > out
			close(out)
		}
		out = target
		print "return {" > out
	}
	# The file stays open between records, so ">" keeps appending to it.
	printf("\t[ 0x%04X ] = \"%s\",\n", cp, $2) > out
}

END {
	# Only terminate a file if at least one record was written; on empty
	# input there is nothing to close and no stray file must be created.
	if (out != "") {
		print "}" > out
		close(out)
	}
}
blocks.awk
# blocks.awk -- emit the block list as a Lua table of { start, end, "name" }.
#
# The field separator set in BEGIN splits a record on ".." and on ";"
# (with optional surrounding spaces), giving:
#   $1 = first code point (hex), $2 = last code point (hex), $3 = block name.

BEGIN {
	FS = " *(\\.\\.|;) *"
	widest = 0	# length of the longest block name seen so far
	count = 0	# number of block records collected
}

# Data records start with a hex digit; anything else is skipped.
/^[0-9A-Fa-f]/ {
	first[count] = strtonum("0x" $1)
	last[count] = strtonum("0x" $2)
	label[count] = $3
	count++

	if (length($3) > widest)
		widest = length($3)
}

END {
	print "local blocks = {"
	# The quoted name is left-justified in a column wide enough for the
	# longest name plus its two quote characters, so the closing braces align.
	for (n = 0; n < count; n++)
		printf("\t{ 0x%06X, 0x%06X, %-*s },\n",
			first[n], last[n], widest + 2, "\"" label[n] "\"")
	print "}"
}
combining.awk
# combining.awk -- emit non-zero combining classes as a Lua table with a
# `single` map (code point -> class) and a `ranges` list ({ first, last, class }).
#
# The field separator splits on "..", ";" and "#", so the code sees:
#   range record:  $1 = first (hex), $2 = last (hex), $3 = class
#   single record: $1 = code point (hex), $2 = class, $3 = trailing comment text

BEGIN {
	FS = "\\.\\.| *[;#] *"
}

# Only records that start with a hex digit carry data.
$1 !~ /^[0-9A-F]/ {
	next
}

# Range record: the third field is purely numeric (the class).
$3 ~ /^[0-9]*$/ {
	if ($3 == "0")		# class 0 is not recorded
		next
	ranges[strtonum("0x" $1)] = strtonum("0x" $2)
	kinds[strtonum("0x" $1)] = $3
	next
}

# Everything that reaches this point is a single-code-point record.
{
	if ($2 == "0")		# class 0 is not recorded
		next
	singles[strtonum("0x" $1)] = $2
}

END {
	print "return {"
	print "\tsingle = {"
	# Scan the code space in ascending order so the output is sorted.  The
	# upper bound is inclusive: U+10FFFF is a valid code point.
	for (i = 0; i <= 0x10ffff; ++i) {
		if (i in singles)
			printf("\t\t[0x%06X] = %4s\n", i, singles[i] ",")
	}
	print "\t};"
	print "\tranges = {"
	for (i = 0; i <= 0x10ffff; ++i) {
		if (i in ranges)
			printf("\t\t{ 0x%06X, 0x%06X, %3s },\n", i, ranges[i], kinds[i])
	}
	print "\t};"
	print "}"
}
control.awk
# control.awk -- emit code points whose category starts with C or Z as a Lua
# table with a `single` map (code point -> category) and a `ranges` list
# ({ first, last, category }).
#
# The field separator splits on "..", ";" and "#", so the code sees:
#   range record:  $1 = first (hex), $2 = last (hex), $3 = category
#   single record: $1 = code point (hex), $2 = category

BEGIN {
	FS = "\\.\\.| *[;#] *"
}

# Only records that start with a hex digit carry data.
$1 !~ /^[0-9A-F]/ {
	next
}

# Single record: the category is in the second field.
$2 ~ /^[CZ][a-z]/ {
	singles[strtonum("0x" $1)] = $2
	# A record is either a single or a range; do not let the trailing text
	# of a single record be tested as a range category below.
	next
}

# Range record: the category is in the third field.
$3 ~ /^[CZ][a-z]/ {
	ranges[strtonum("0x" $1)] = strtonum("0x" $2)
	kinds[strtonum("0x" $1)] = $3
}

END {
	# Categories are emitted as bare Lua identifiers, so declare locals with
	# those names holding the descriptive strings.
	print "local Cc, Cf, Cs, Co, Cn ="
	print "\t\"control\", \"format\", \"surrogate\", \"private-use\", \"unassigned\""
	print "local Zs, Zl, Zp ="
	print "\t\"space-separator\", \"line-separator\", \"paragraph-separator\""
	print ""
	print "return {"
	print "\tsingle = {"
	# Scan the code space in ascending order so the output is sorted.  The
	# upper bound is inclusive: U+10FFFF is a valid code point.
	for (i = 0; i <= 0x10ffff; ++i) {
		if (i in singles)
			printf("\t\t[0x%06X] = %s,\n", i, singles[i])
	}
	print "\t};"
	print "\tranges = {"
	for (i = 0; i <= 0x10ffff; ++i) {
		if (i in ranges)
			printf("\t\t{ 0x%06X, 0x%06X, %s },\n", i, ranges[i], kinds[i])
	}
	print "\t};"
	print "}"
}