User:Equinox/code/ExtractBookWords

using System;
using System.Collections.Generic;
using System.IO;
using System.Text;


namespace ExtractBookWords
{
    /// <summary>
    /// Given an input text such as the contents of a book, converts the words to wiki links (handling
    /// letter casing, punctuation, etc.) so that missing words become easily-spotted red links.
    /// </summary>
    class Program
    {
        /// <summary>Input path used when no path is given on the command line.</summary>
        private const string DefaultInputFile = @"C:\Users\home\Desktop\input.txt";

        /// <summary>Output path used when no path is given on the command line.</summary>
        private const string DefaultOutputFile = @"C:\Users\home\Desktop\output.txt";

        /// <summary>Whitespace characters that separate one token from the next.</summary>
        private static readonly char[] TokenSeparators = { ' ', '\r', '\n', '\t' };


        /// <summary>
        /// Reads the input file, converts its words to wiki links and writes the result to the
        /// output file.
        /// </summary>
        /// <param name="args">
        /// Optional: <c>args[0]</c> is the input path and <c>args[1]</c> is the output path.
        /// Any path that is not supplied falls back to the built-in default.
        /// </param>
        static void Main(string[] args)
        {
            string inputFile = (args != null && args.Length > 0) ? args[0] : DefaultInputFile;
            string outputFile = (args != null && args.Length > 1) ? args[1] : DefaultOutputFile;

            File.WriteAllText(outputFile, ConvertToLinks(File.ReadAllText(inputFile)));
        }


        /// <summary>
        /// Converts a text into a sequence of wiki links, one per usable word. Each link is written
        /// as <c>[[word]]</c> followed by a single space.
        /// </summary>
        /// <param name="text">The text to convert; null or empty yields an empty result.</param>
        /// <returns>The concatenated links, or an empty string when no word qualifies.</returns>
        internal static string ConvertToLinks(string text)
        {
            if (String.IsNullOrEmpty(text)) return String.Empty;

            string[] tokens = text.Split(TokenSeparators, StringSplitOptions.RemoveEmptyEntries);
            StringBuilder links = new StringBuilder();

            bool wasSentenceEnd = false;

            foreach (string token in tokens)
            {
                // The token that follows a sentence end is skipped entirely - presumably because
                // its capital letter cannot be told apart from that of a proper noun.
                if (!wasSentenceEnd)
                {
                    string word = KeepAlpha(token);
                    if (word.Length > 0)
                    {
                        links.Append("[[").Append(word).Append("]] ");
                    }
                }

                wasSentenceEnd = EndsSentence(token);
            }

            return links.ToString();
        }


        /// <summary>
        /// Tells whether a token closes a sentence, i.e. whether its last character is a full stop,
        /// a question mark, an exclamation mark or a double quote.
        /// </summary>
        /// <param name="token">A non-empty token.</param>
        /// <returns>True when the token ends with one of the sentence-ending characters.</returns>
        private static bool EndsSentence(string token)
        {
            // Compare the last character directly: String.EndsWith(string) is culture-sensitive
            // and may ignore zero-width characters at the end of the token.
            switch (token[token.Length - 1])
            {
                case '.':
                case '?':
                case '!':
                case '"':
                    return true;
                default:
                    return false;
            }
        }


        /// <summary>
        /// Strips everything except the ASCII letters A-Z and a-z from a token.
        /// </summary>
        /// <param name="s">The token to clean up.</param>
        /// <returns>
        /// The remaining letters. The result is lower-cased when the token starts with a non-letter
        /// (such as an opening quote). An empty string is returned when the token contains a hyphen
        /// or an apostrophe, or when it contains no letters at all.
        /// </returns>
        private static string KeepAlpha(string s)
        {
            if (String.IsNullOrEmpty(s)) return String.Empty;

            // Hyphenated words and words with an apostrophe are rejected as a whole.
            if (s.IndexOf('-') >= 0 || s.IndexOf('\'') >= 0) return String.Empty;

            StringBuilder letters = new StringBuilder(s.Length);

            foreach (char ch in s)
            {
                if (IsAsciiLetter(ch)) letters.Append(ch);
            }

            string word = letters.ToString();

            // Use the invariant culture: the current culture could map 'I' to a dotless 'i'
            // (Turkish, Azerbaijani) and so produce a wrong link target.
            return IsAsciiLetter(s[0]) ? word : word.ToLowerInvariant();
        }


        /// <summary>Tells whether a character is one of the ASCII letters A-Z or a-z.</summary>
        /// <param name="ch">The character to test.</param>
        /// <returns>True for an ASCII letter, false for anything else.</returns>
        private static bool IsAsciiLetter(char ch)
        {
            return (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z');
        }
    }
}