// Source page: User:Equinox/code/FindMissingNounPlurals (subpage of User:Equinox)
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;


namespace FindMissingNounPlurals
{
    /// <summary>
    /// Loops through the per-entry XML files generated by SplitWiktDumpXml and identifies any that
    /// (i) use the en-noun template and (ii) specify plural(s) that lack an English entry.
    /// </summary>
    /// <remarks>
    /// It's a bit hacky, but it works fine for the vast majority of cases.
    /// </remarks>
    class Program
    {
        // Folder whose files are scanned by Main; each file is expected to hold one page's XML
        // (a <title> line plus the page text), as produced by SplitWiktDumpXml.
        private const string INPUT_FOLDER = @"C:\Users\home\Desktop\wiktdump\output"; // output from the XML dump

        // Report written by Main (overwritten on each run, UTF-8). Note that despite the .csv
        // extension, each line is "<page title><TAB><missing plural>".
        private const string OUTPUT_FILE = @"C:\Users\home\Desktop\missing_plurals.csv";


        /// <summary>
        /// Replaces the five predefined XML entities in <paramref name="s"/> with the characters
        /// they stand for.
        /// </summary>
        /// <param name="s">Text taken from the XML dump.</param>
        /// <returns>The text with &amp;lt; &amp;gt; &amp;quot; &amp;apos; and &amp;amp; decoded.</returns>
        private static string UnescapeXmlString(string s)
        {
            // Order matters: the ampersand entity is decoded last, so that the "&" it produces
            // can never combine with following text to form one of the other entities.
            string[,] entities =
            {
                { "&lt;", "<" },
                { "&gt;", ">" },
                { "&quot;", "\"" },
                { "&apos;", "'" },
                { "&amp;", "&" },
            };

            string decoded = s;

            for (int row = 0; row < entities.GetLength(0); row++)
            {
                decoded = decoded.Replace(entities[row, 0], entities[row, 1]);
            }

            return decoded;
        }


        /// <summary>
        /// Works out which plural forms a single <c>{{en-noun|...}}</c> template invocation specifies.
        /// </summary>
        /// <param name="pageTitle">Title of the page containing the template; used as the stem for the
        /// default plural and for the <c>s</c> / <c>es</c> shorthands.</param>
        /// <param name="s">The template text, from the opening <c>{{</c> through the closing <c>}}</c>.</param>
        /// <returns>The distinct plurals; empty when the template specifies none (e.g. <c>{{en-noun|-}}</c>).</returns>
        /// <exception cref="ApplicationException">The template has a named parameter other than
        /// <c>head=</c> or <c>plqual=</c> .. <c>pl5qual=</c>.</exception>
        private static string[] GetPluralsFromNounTemplate(string pageTitle, string s)
        {
            // First, delete any [[...]] segments, as I think these will only occur in e.g.
            // {{en-noun|head=[[chloro-|chloro]][[fluoro-|fluoro]][[methane]]|~}}
            // and this kind of thing would otherwise cause problems when we split on the | character.
            // (Wiping out the head= value doesn't matter since we will be discarding that bit anyway.)

            int start;

            while ((start = s.IndexOf("[[", StringComparison.Ordinal)) != -1)
            {
                // Look for the closing brackets only AFTER the opening ones; searching from the
                // start of the string could pair "[[" with an earlier "]]" and corrupt the text.
                int end = s.IndexOf("]]", start + "[[".Length, StringComparison.Ordinal);

                if (end == -1)
                {
                    // Unterminated link: nothing sensible to delete, and continuing would loop forever.
                    break;
                }

                s = s.Remove(start, end + "]]".Length - start);
            }

            // Split the template into its piped sub-items.

            var bits = new List<string>(s.TrimStart('{').TrimEnd('}').Split('|'));

            bits.RemoveAt(0); // we don't need the constant 'en-noun'

            // Remove the named parameters we know about and don't care about.

            string[] ignoredPrefixes =
            {
                "head=", "head =", // odd spacing sometimes
                "plqual=", "pl2qual=", "pl3qual=", "pl4qual=", "pl5qual=",
            };

            bits.RemoveAll(bit => ignoredPrefixes.Any(prefix => bit.StartsWith(prefix, StringComparison.Ordinal)));

            // Anything else containing '=' is a named parameter this code was not written to handle.

            string unexpected = bits.FirstOrDefault(bit => bit.Contains("="));

            if (unexpected != null)
            {
                throw new ApplicationException(
                    "Unexpected named parameter '" + unexpected + "' in en-noun template on page '" + pageTitle + "'.");
            }

            // Now build the list of plurals, using HashSet to avoid duplicates.

            var plurals = new HashSet<string>();

            // {{en-noun}} and {{en-noun|~}} both imply an -s plural, but e.g. {{en-noun|~|es}} does not.
            bool impliedS = bits.Count == 0 || (bits.Count == 1 && bits[0] == "~");

            if (impliedS)
            {
                plurals.Add(pageTitle + "s");
            }
            else // |-, |s, |dragonflies, |-|s, |s|deciduae, |-|es|coulis, etc.
            {
                foreach (string bit in bits)
                {
                    if (bit == "s" || bit == "es")
                    {
                        plurals.Add(pageTitle + bit);
                    }
                    else if (bit.Length != 0 && bit != "-" && bit != "!" && bit != "~")
                    {
                        // Fully specified plural, e.g. mice. Empty parameters are skipped because
                        // an empty string can never be a page title.
                        plurals.Add(bit);
                    }
                }
            }

            return plurals.ToArray();
        }


        /// <summary>
        /// Scans every file in <c>INPUT_FOLDER</c>, records each page title together with the plurals
        /// given by its <c>{{en-noun}}</c> templates, then writes to <c>OUTPUT_FILE</c> one
        /// tab-separated "title / plural" line for every plural that has no page of its own.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        /// <exception cref="ApplicationException">An <c>{{en-noun}}</c> line was found before any
        /// <c>&lt;title&gt;</c> line in a file, or a template has an unexpected named parameter.</exception>
        static void Main(string[] args)
        {
            // Page title -> plurals specified on that page (empty array if none).
            var plurals = new Dictionary<string, string[]>();

            foreach (string file in Directory.GetFiles(INPUT_FOLDER))
            {
                string pageTitle = null;

                // ReadLines streams the file instead of loading it whole; we only need one pass.
                foreach (string line in File.ReadLines(file))
                {
                    string trimmed = line.Trim();

                    if (pageTitle == null
                        && trimmed.StartsWith("<title>", StringComparison.Ordinal)
                        && trimmed.EndsWith("</title>", StringComparison.Ordinal))
                    // ^ note: without StringComparison.Ordinal, we fail on e.g. Hawaiian okina character
                    {
                        pageTitle = trimmed.Replace("<title>", String.Empty).Replace("</title>", String.Empty);
                        pageTitle = UnescapeXmlString(pageTitle);

                        // We need a key in the Dictionary for every title we see, regardless of page contents,
                        // since we will later use these keys as a guide to which pages already exist.
                        plurals[pageTitle] = new string[0];
                    }

                    if (line.Contains("{{head|en|noun"))
                    {
                        // This template says nothing about plurality, so we can't easily process it.
                        continue;
                    }

                    if (!line.Contains("{{en-noun")
                        || trimmed.StartsWith("<comment>", StringComparison.Ordinal)) // ignore edit summaries
                    {
                        continue;
                    }

                    if (pageTitle == null)
                    {
                        // Shouldn't happen: the title is expected before the page text.
                        throw new ApplicationException("Found {{en-noun before any <title> in file '" + file + "'.");
                    }

                    if (!line.StartsWith("{{en-noun", StringComparison.Ordinal) // sth earlier on the line e.g. "quartan fever"
                        || line.Contains("{{vern|")) // just awkward! e.g. "araneomorph funnel-web spider"
                    {
                        continue; // skip these rare difficult cases
                    }

                    int templateEnd = line.IndexOf("}}", StringComparison.Ordinal);

                    if (templateEnd == -1)
                    {
                        // The template is not closed on this line (e.g. it spans several lines).
                        // Parsing a fragment would wrongly report the default -s plural, so skip it.
                        continue;
                    }

                    // Discard anything else on the same line after the noun template.
                    string nounTemplate = line.Substring(0, templateEnd + "}}".Length);

                    // Work out the plurals based on the template, and merge them with any found earlier
                    // on the same page (a page can have several noun sections, each with its own template).
                    string[] found = GetPluralsFromNounTemplate(pageTitle, nounTemplate);
                    string[] existing;

                    plurals[pageTitle] = plurals.TryGetValue(pageTitle, out existing)
                        ? existing.Union(found).ToArray()
                        : found;
                }
            }

            // Now that we've seen all the entries, generate a list of the missing plurals.
            // Note: This only finds the cases where there is no entry at all for the plural word;
            // if an entry exists, we just assume that it contains the noun plural.

            using (StreamWriter writer = new StreamWriter(OUTPUT_FILE, false, Encoding.UTF8))
            {
                foreach (KeyValuePair<string, string[]> entry in plurals)
                {
                    foreach (string plural in entry.Value)
                    {
                        if (!plurals.ContainsKey(plural))
                        {
                            writer.WriteLine(entry.Key + "\t" + plural);
                        }
                    }
                }
            }

            Console.Beep();
            Console.WriteLine("Press Enter to exit.");
            Console.ReadLine();
        }
    }
}