// To compile: javac -encoding UTF-8 BaxterSagartWikitableBuilder.java
// To run: java BaxterSagartWikitableBuilder
// Output: Baxter-Sagart wikitable.txt
import java.io.*;
import java.net.*;
import java.util.*;
import java.util.zip.*;
/**
 * Builds a MediaWiki "wikitable" from the Baxter-Sagart Old Chinese
 * reconstruction data file.
 *
 * <p>The program downloads the Baxter-Sagart data file and the Unihan
 * archive, extracts the kSimplifiedVariant mappings from the latter, and
 * writes one table row per Baxter-Sagart data line to {@link #outFile}.
 * Each phonetic cell is preceded by a hidden {@code <span>} carrying an
 * ASCII-friendly sort key so that the "sortable" wikitable orders the
 * rows sensibly.
 *
 * <p>All non-ASCII characters in this class are written as
 * {@code \}{@code uXXXX} escapes (with the character named in a nearby
 * comment) so that the source compiles identically whatever encoding the
 * compiler assumes.
 *
 * <p>State is held in public static fields; the class is not thread-safe.
 */
public class BaxterSagartWikitableBuilder {

    /** Download location of the Baxter-Sagart data file. */
    public static final String baxterSagartURL =
        "http://crlao.ehess.fr/docannexe.php?id=1221";

    /** Download location of the Unihan zip archive. */
    public static final String unihanURL =
        "http://www.unicode.org/Public/UNIDATA/Unihan.zip";

    /** Name of the generated wikitext file (written as UTF-8). */
    public static final String outFile = "Baxter-Sagart wikitable.txt";

    /** Opening tag of the hidden span that carries a sort key. */
    private static final String HIDDEN_OPEN = "<span style=display:none>";

    /** Time (epoch milliseconds) at which the downloads were started. */
    public static long fetchDate;

    /** Temp file holding the downloaded Baxter-Sagart data. */
    public static File baxterSagartFile;

    /** Temp file holding the downloaded Unihan zip archive. */
    public static File unihanFile;

    /** Traditional-to-simplified character map built by {@link #processUnihan()}. */
    public static TreeMap<String,String> scMap;

    /** Destination of all table output; used by the print helpers. */
    public static PrintWriter writer;

    /**
     * Command-line entry point. Runs {@link #main()} and exits with
     * status 0 on success, or prints the stack trace and exits with
     * status 1 on any exception.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            main();
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
        System.exit(0);
    }

    /**
     * Downloads both data sources, builds the simplified-character map and
     * writes the complete wikitable to {@link #outFile}.
     *
     * @throws Exception if downloading, parsing or writing fails
     */
    public static void main() throws Exception {
        // Remember when the online data was fetched; printed in the citation.
        fetchDate = System.currentTimeMillis();
        baxterSagartFile = download(baxterSagartURL);
        unihanFile = download(unihanURL);
        // Needed for the traditional -> simplified character column.
        processUnihan();

        writer = new PrintWriter(
            new BufferedWriter(
                new OutputStreamWriter(
                    new BufferedOutputStream(
                        new FileOutputStream(outFile)
                    ), "UTF-8"
                )
            )
        );
        try {
            writer.print(
                "== Data ==\n" +
                //":''This section is software-generated. The program's Java source code is [[/BaxterSagartWikitableBuilder.java|here]].''\n" +
                "This table incorporates data from:\n" +
                "* The [http://www.unicode.org/Public/UNIDATA/ Unihan Database].\n" +
                "* [[w:William H. Baxter|Baxter, W.]] and " +
                "[[w:Laurent Sagart|L. Sagart]] (n.d.) " +
                "Baxter-Sagart Old Chinese reconstruction (Version 1.00). " +
                "Online at http://crlao.ehess.fr/document.php?id=1217 . Accessed "
            );
            writer.print(formatGmt(fetchDate));
            // The four parenthesised tone names below are the Chinese terms
            // (U+5E73/U+4E0A/U+53BB/U+5165 followed by U+8072).
            writer.print(
                ".\n" +
                "Legend of table headers:\n" +
                "* '''TC''': {{w|Traditional Chinese}} character.\n" +
                "* '''SC''': {{w|Simplified Chinese}} character.\n" +
                "* '''PY''': [[w:Mandarin Chinese|Mandarin]] {{w|Pinyin}} romanization.\n" +
                "* '''MC''': {{w|Middle Chinese}} reconstruction.\n" +
                "* '''MCI''': Middle Chinese initial.\n" +
                "* '''MCF''': Middle Chinese final.\n" +
                "* '''MCT''': Middle Chinese tone.\n" +
                "** A = even tone (\u5E73\u8072).\n" +
                "** B = rising tone (\u4E0A\u8072).\n" +
                "** C = departing tone (\u53BB\u8072).\n" +
                "** D = entering tone (\u5165\u8072).\n" +
                "* '''OC''': {{w|Old Chinese}} reconstruction.\n" +
                "* '''Gloss''': Word's meaning.\n" +
                "{| class=\"wikitable sortable\"\n" +
                "|-\n" +
                "! TC\n" +
                "! SC\n" +
                "! PY\n" +
                "! MC\n" +
                "! MCI\n" +
                "! MCF\n" +
                "! MCT\n" +
                "! OC\n" +
                "! Gloss\n"
            );
            // One table row per data line.
            processBaxterSagart();
            // Close the wikitable.
            writer.print("|}\n");
            // PrintWriter never throws on write; checkError() flushes and
            // reports whether any write failed along the way.
            if (writer.checkError())
                throw new IOException("Error while writing " + outFile);
        } finally {
            // Release the file handle even when processing fails.
            writer.close();
        }
    }

    /**
     * Formats a timestamp as {@code d MMM yyyy HH:mm:ss GMT} with English
     * month abbreviations, i.e. the layout of the deprecated
     * {@code Date.toGMTString()}.
     */
    private static String formatGmt(long millis) {
        java.text.SimpleDateFormat format = new java.text.SimpleDateFormat(
            "d MMM yyyy HH:mm:ss 'GMT'", Locale.US);
        format.setTimeZone(TimeZone.getTimeZone("GMT"));
        return format.format(new Date(millis));
    }

    /**
     * Copies the resource at {@code url} into a fresh temporary file that
     * is scheduled for deletion when the JVM exits.
     *
     * @param url location to read, in any form accepted by {@link URL}
     * @return the temporary file holding the downloaded bytes
     * @throws Exception if the URL is malformed or an I/O error occurs
     */
    public static File download(String url) throws Exception {
        // createTempFile rejects prefixes shorter than three characters and
        // a hash code may print as a single digit, so a fixed prefix is
        // prepended.
        File file = File.createTempFile("bswt" + url.hashCode(), null);
        file.deleteOnExit();
        System.out.println("Downloading: " + url);
        System.out.println("This may take a while...");

        // Bulk 4K reads/writes; no additional input buffering is required.
        InputStream in = new URL(url).openStream();
        try {
            OutputStream out =
                new BufferedOutputStream(new FileOutputStream(file));
            try {
                byte[] bytes = new byte[0x1000];
                int read;
                while ((read = in.read(bytes)) >= 0) {
                    if (read > 0)
                        out.write(bytes, 0, read);
                }
                out.flush();
            } finally {
                out.close();
            }
        } finally {
            in.close();
        }
        return file;
    }

    /**
     * Reads the entry whose name ends with {@code Unihan_Variants.txt} from
     * the zip archive {@link #unihanFile} and stores every
     * kSimplifiedVariant mapping whose source and target differ in
     * {@link #scMap}. When a line lists several targets only the first one
     * is used.
     *
     * @throws RuntimeException if the archive has no such entry
     * @throws Exception if the archive cannot be read or a code point
     *         cannot be parsed
     */
    public static void processUnihan() throws Exception {
        TreeMap<String,String> map = new TreeMap<String,String>();
        ZipInputStream zin = new ZipInputStream(
            new BufferedInputStream(
                new FileInputStream(unihanFile)
            )
        );
        try {
            // Advance to the wanted entry; getNextEntry() closes the
            // previous entry itself.
            ZipEntry entry;
            do {
                entry = zin.getNextEntry();
                if (entry == null)
                    throw new RuntimeException(
                        "Can't find Unihan_Variants.txt.");
            } while (!entry.getName().endsWith("Unihan_Variants.txt"));

            // The zip stream now yields exactly this entry's bytes.
            BufferedReader reader =
                new BufferedReader(new InputStreamReader(zin, "UTF-8"));
            String line;
            while ((line = reader.readLine()) != null) {
                // Drop comments and surrounding whitespace.
                line = line.replaceFirst("#.*$", "").trim();
                if (line.length() == 0)
                    continue;
                // Expected layout: code point, field name, value.
                String[] tokens = line.split("\t");
                if (tokens.length < 3)
                    continue;
                if (!tokens[1].trim().equalsIgnoreCase("kSimplifiedVariant"))
                    continue;
                String traditional = fromUnicodeNotation(tokens[0]);
                String simplified = fromUnicodeNotation(tokens[2]);
                // A character that maps to itself adds nothing.
                if (traditional.equals(simplified))
                    continue;
                map.put(traditional, simplified);
            }
        } finally {
            zin.close();
        }
        // Publish only a completely parsed map.
        scMap = map;
    }

    /**
     * Reads {@link #baxterSagartFile} as UTF-8 text, skips blank lines and
     * the first non-blank line (the column header), and emits one table row
     * for every remaining line that yields at least eleven fields.
     *
     * @throws Exception if the file cannot be read or a row cannot be written
     */
    public static void processBaxterSagart() throws Exception {
        BufferedReader reader = new BufferedReader(
            new InputStreamReader(
                new BufferedInputStream(
                    new FileInputStream(baxterSagartFile)
                ), "UTF-8"
            )
        );
        try {
            boolean headerPending = true;
            String line;
            while ((line = reader.readLine()) != null) {
                line = line.trim();
                if (line.length() == 0)
                    continue;
                if (headerPending) {
                    // First non-empty line is the table header.
                    headerPending = false;
                    continue;
                }
                // NOTE(review): the character class below contains a literal
                // '^', and because whitespace (tabs included) around a tab is
                // swallowed, consecutive tabs act as ONE separator. A row
                // with an empty column therefore yields fewer tokens and is
                // either shifted or skipped by the length check. Kept as-is
                // to preserve existing output; confirm against the data.
                String[] tokens = line.split("[\\s^\t]*\t\\s*");
                if (tokens.length < 11)
                    continue;
                processBaxterSagart(tokens[0], tokens[1], tokens[2],
                    tokens[3], tokens[4], tokens[5], tokens[6],
                    tokens[7], tokens[8], tokens[9], tokens[10]);
            }
        } finally {
            reader.close();
        }
    }

    /**
     * Writes one wikitable row to {@link #writer}. An empty argument
     * produces an empty cell.
     *
     * @param tc    traditional character; also looked up in {@link #scMap}
     *              for the simplified column (falls back to {@code tc})
     * @param py    pinyin with tone diacritics
     * @param py2   ignored on input; the sort key is recomputed from {@code py}
     * @param mc    Middle Chinese in ASCII-friendly notation
     * @param mci   Middle Chinese initial
     * @param mcf   Middle Chinese final; a leading "-r" is moved onto the
     *              initial before printing
     * @param mct   Middle Chinese tone; a leading A/B/C/D is spelled out
     * @param oc    Old Chinese reconstruction
     * @param gloss meaning
     * @param gst   unused
     * @param utf16 unused
     * @throws Exception declared for interface compatibility
     */
    public static void processBaxterSagart(
        String tc, String py, String py2,
        String mc, String mci, String mcf, String mct,
        String oc, String gloss, String gst, String utf16
    ) throws Exception {
        // Begin the new table row.
        writer.print("|-\n");

        // Traditional Chinese character.
        writer.print('|');
        if (tc.length() > 0) {
            writer.print("lang=zh-Hant|[[");
            printEscaped(tc);
            writer.print("]]");
        }
        writer.print('\n');

        // Simplified Chinese character (same as TC when no mapping exists).
        writer.print('|');
        if (tc.length() > 0) {
            String sc = (scMap == null) ? null : scMap.get(tc);
            if (sc == null || sc.length() == 0)
                sc = tc;
            writer.print("lang=zh-Hans|[[");
            printEscaped(sc);
            writer.print("]]");
        }
        writer.print('\n');

        // Pinyin, linked to the Mandarin section, with hidden sort key.
        writer.print('|');
        if (py.length() > 0) {
            py2 = pinyinToSortable(py);
            if (!py.equals(py2)) {
                writer.print(HIDDEN_OPEN);
                printEscaped(py2);
                writer.print(" </span>");
            }
            writer.print("[[");
            printEscaped(py);
            writer.print("#Mandarin|");
            printEscaped(py);
            writer.print("]]");
        }
        writer.print('\n');

        // Middle Chinese.
        writer.print('|');
        if (mc.length() > 0) {
            mc = middleChineseToUnicode(mc);
            printWithSortKey(mc, middleChineseToSortable(mc));
        }
        writer.print('\n');

        // A medial "r" listed at the start of the final is moved to the end
        // of the initial: the initial's last character (normally its
        // trailing dash) is replaced by "r-". An empty initial becomes "r-".
        if (mcf.startsWith("-r")) {
            mcf = "-" + mcf.substring(2);
            String stem = (mci.length() > 0)
                ? mci.substring(0, mci.length() - 1) : "";
            mci = stem + "r-";
        }

        // Middle Chinese initial.
        writer.print('|');
        if (mci.length() > 0) {
            mci = middleChineseToUnicode(mci);
            printWithSortKey(mci, middleChineseToSortable(mci));
        }
        writer.print('\n');

        // Middle Chinese final.
        writer.print('|');
        if (mcf.length() > 0) {
            mcf = middleChineseToUnicode(mcf);
            printWithSortKey(mcf, middleChineseToSortable(mcf));
        }
        writer.print('\n');

        // Middle Chinese tone; the hidden letter keeps A < B < C < D order.
        writer.print('|');
        if (mct.length() > 0) {
            switch (mct.charAt(0)) {
                case 'A':
                    writer.print("<span style=display:none>A</span>even");
                    break;
                case 'B':
                    writer.print("<span style=display:none>B</span>rising");
                    break;
                case 'C':
                    writer.print("<span style=display:none>C</span>departing");
                    break;
                case 'D':
                    writer.print("<span style=display:none>D</span>entering");
                    break;
                default:
                    printEscaped(mct);
            }
        }
        writer.print('\n');

        // Old Chinese.
        writer.print('|');
        if (oc.length() > 0) {
            writer.print("class=IPA|");
            oc = oldChineseToUnicode(oc);
            printWithSortKey(oc, oldChineseToSortable(oc));
        }
        writer.print('\n');

        // Gloss.
        writer.print('|');
        if (gloss.length() > 0)
            printEscaped(gloss);
        writer.print('\n');
    }

    /**
     * Prints {@code display}, preceded by a hidden span holding
     * {@code sortKey} whenever the two differ.
     */
    private static void printWithSortKey(String display, String sortKey) {
        if (!display.equals(sortKey)) {
            writer.print(HIDDEN_OPEN);
            printEscaped(sortKey);
            writer.print(" </span>");
        }
        printEscaped(display);
    }

    /**
     * Converts tone-marked pinyin to an ASCII sort key: the tone number
     * (1-4) is appended, the first tone-marked vowel loses its diacritic
     * and the first u-with-diaeresis becomes "v". Unmarked input is
     * returned unchanged.
     *
     * @param string pinyin syllable
     * @return the sort key
     */
    public static String pinyinToSortable(String string) {
        // Tone 1: a/e/i/o/u with macron, u-diaeresis with macron.
        string = string.replaceFirst(
            "([\u0101\u0113\u012B\u014D\u016B\u01D6].*)$", "$1" + "1");
        // Tone 2: a/e/i/o/u with acute, u-diaeresis with acute.
        string = string.replaceFirst(
            "([\u00E1\u00E9\u00ED\u00F3\u00FA\u01D8].*)$", "$1" + "2");
        // Tone 3: a/e/i/o/u with caron, u-diaeresis with caron.
        string = string.replaceFirst(
            "([\u01CE\u011B\u01D0\u01D2\u01D4\u01DA].*)$", "$1" + "3");
        // Tone 4: a/e/i/o/u with grave, u-diaeresis with grave.
        string = string.replaceFirst(
            "([\u00E0\u00E8\u00EC\u00F2\u00F9\u01DC].*)$", "$1" + "4");
        // Strip the diacritic (macron, acute, caron, grave) per base vowel.
        string = string.replaceFirst("[\u0101\u00E1\u01CE\u00E0]", "a");
        string = string.replaceFirst("[\u0113\u00E9\u011B\u00E8]", "e");
        string = string.replaceFirst("[\u012B\u00ED\u01D0\u00EC]", "i");
        string = string.replaceFirst("[\u014D\u00F3\u01D2\u00F2]", "o");
        string = string.replaceFirst("[\u016B\u00FA\u01D4\u00F9]", "u");
        // u-diaeresis, plain or with any tone mark.
        string = string.replaceFirst(
            "[\u00FC\u01D6\u01D8\u01DA\u01DC]", "v");
        return string;
    }

    /**
     * Converts the ASCII-friendly Middle Chinese notation to Unicode:
     * apostrophe to U+0294 (glottal stop), "ae" to U+00E6, "ea" to U+025B
     * and "+" to U+0268 (barred i).
     *
     * @param string ASCII-friendly transcription
     * @return the Unicode transcription
     */
    public static String middleChineseToUnicode(String string) {
        string = string.replace('\'', '\u0294');
        string = string.replace("ae", "\u00E6");
        string = string.replace("ea", "\u025B");
        string = string.replace('+', '\u0268');
        return string;
    }

    /**
     * Derives an ASCII sort key from a Unicode Middle Chinese
     * transcription (as produced by {@link #middleChineseToUnicode}).
     *
     * @param string Unicode transcription
     * @return the sort key
     */
    public static String middleChineseToSortable(String string) {
        // Dashes are not needed in sorting.
        string = string.replace("-", "");
        // Glottal stop (U+0294) sorts before the letters.
        string = string.replace('\u0294', '\'');
        // 'a' < ash (U+00E6) < 'b'
        string = string.replace("\u00E6", "a~");
        // 'd' < open e (U+025B) < 'e'
        string = string.replace("e", "e~");
        string = string.replace('\u025B', 'e');
        // 'h' < barred i (U+0268) < 'i'
        string = string.replace("i", "i~");
        string = string.replace('\u0268', 'i');
        // Rising tone is second tone.
        string = string.replace('X', '2');
        // Departing tone is third tone.
        string = string.replace('H', '3');
        return string;
    }

    /**
     * Normalizes an Old Chinese transcription: runs of combining rings
     * (below U+0325 / above U+030A) collapse to one ring below, except that
     * eng (U+014B) takes the ring above; U+02E4 becomes U+02C1; and an
     * ASCII "g" directly after the leading asterisk (optionally behind one
     * prefix and bracket) becomes script g (U+0261).
     *
     * @param string transcription as found in the data file
     * @return the normalized transcription
     */
    public static String oldChineseToUnicode(String string) {
        // Streamlining devoicing diacritics.
        string = string.replaceAll("[\u0325\u030A]+", "\u0325");
        string = string.replace("\u014B\u0325", "\u014B\u030A");
        // Streamlining pharyngealization diacritics.
        string = string.replace('\u02E4', '\u02C1');
        // The optional prefix is a letter, optional schwa (U+0259), then
        // '.' or '-'.
        string = string.replaceAll(
            "(\\*(\\[?[A-Za-z]\u0259?[\\.\\-])?\\[?)g", "$1\u0261");
        return string;
    }

    /**
     * Derives an ASCII-oriented sort key from a normalized Old Chinese
     * transcription (as produced by {@link #oldChineseToUnicode}).
     *
     * @param string normalized transcription
     * @return the sort key
     */
    public static String oldChineseToSortable(String string) {
        // Script g (U+0261) back to ASCII g.
        string = string.replace('\u0261', 'g');
        // Temporarily fold "ts" and "dz" into the single-character
        // digraphs U+02A6 and U+02A3.
        string = string.replace("ts", "\u02A6");
        string = string.replace("dz", "\u02A3");
        // Making loosely-bound prefix schwas (U+0259) sort-neutral.
        string = string.replaceAll("\u0259([\\.\\-])", "$1");
        // Stripping lots of sort-neutral stuff.
        string = string.replaceAll(
            "[\\*\\-\\.\\<\\>\\(\\)\\[\\]\\{\\}]", "");
        // space < 'C' < 'N' < glottal stop (U+0294) < letters
        string = string.replace('C', '$');
        string = string.replace('N', '%');
        string = string.replace('\u0294', '\'');
        // 'd' < 'dz' < schwa < 'e'
        string = string.replace("\u02A3", "d~");
        string = string.replace("e", "e~");
        string = string.replace('\u0259', 'e');
        // 'g' < small capital G (U+0262) < 'h'
        string = string.replace("\u0262", "g~");
        // Each sonorant is followed by its devoiced (ringed) counterpart,
        // and eng (U+014B) follows 'n'.
        string = string.replaceAll("[\u0325\u030A]+", "~");
        string = string.replace("\u014B", "n~~");
        // 't' < 'ts' < 'u'
        string = string.replace("\u02A6", "t~");
        // letters < U+02C1 < superscript h (U+02B0) < superscript w (U+02B7)
        string = string.replace("\u02C1", "z");
        string = string.replace("\u02B0", "z~");
        string = string.replace("\u02B7", "z~~");
        return string;
    }

    /**
     * Converts a Unihan code point token such as {@code U+4E00} or
     * {@code U+346F<kMatthews} to the character it denotes. Everything from
     * the first space or {@code '<'} onward is ignored, so only the first
     * code point of a list is used.
     *
     * @param string token in U+XXXX notation
     * @return the (interned) string holding that single code point
     * @throws NumberFormatException if no parsable hexadecimal number is found
     * @throws IllegalArgumentException if the number is not a valid code point
     */
    public static String fromUnicodeNotation(String string)
            throws Exception {
        // Keep the first token only, then its hexadecimal digits.
        String hex = string.replaceFirst("[ <].*$", "")
            .replaceAll("[^0-9A-Fa-f]", "");
        int code;
        try {
            code = Integer.parseInt(hex, 16);
        } catch (NumberFormatException e) {
            NumberFormatException detailed = new NumberFormatException(
                "No hexadecimal code point in \"" + string + "\"");
            detailed.initCause(e);
            throw detailed;
        }
        if (!Character.isValidCodePoint(code))
            throw new IllegalArgumentException(
                "Not a Unicode code point: \"" + string + "\" (" + code + ")");
        return new String(Character.toChars(code)).intern();
    }

    /**
     * Prints {@code string} to {@link #writer} with characters that carry
     * meaning in HTML or wikitext neutralized: {@code &} and {@code "}
     * become named entities, {@code <} and {@code >} become the single
     * angle quotation marks U+2039 / U+203A, and {@code ' [ ] { } |} become
     * numeric character references.
     *
     * @param string text to print
     */
    public static void printEscaped(String string) {
        int length = string.length();
        for (int index = 0; index < length; index++) {
            char ch = string.charAt(index);
            switch (ch) {
                case '&':  writer.print("&amp;"); break;
                case '<':  writer.print("\u2039"); break;
                case '>':  writer.print("\u203A"); break;
                case '\"': writer.print("&quot;"); break;
                case '\'':
                case '[':
                case ']':
                case '{':
                case '}':
                case '|':
                    writer.print("&#" + (int) ch + ";");
                    break;
                default:
                    writer.print(ch);
            }
        }
    }
}