User:Erutuon/scripts/fix Arabic.py

#! /usr/bin/env python3

from sys import argv
import os, json, traceback, mwparserfromhell
import regex as re
from unicodedata import name as character_name, normalize
from pywikibot import Page, Site

# File in which the title to resume from is stored between runs.
last_saved_filepath = "last_saved.txt"

if len(argv) != 2:
    raise ValueError("expected one commandline argument: filename")

print("filename:", argv[1])
# The input is JSON lines that are expected to contain Arabic-script text, so
# decode it explicitly as UTF-8 instead of relying on the platform default,
# and close the file as soon as it has been read.
with open("wrong_script/" + argv[1], "r", encoding="utf-8") as input_file:
    text = input_file.read()

site = Site(code="en", fam="wiktionary")

# Character corrections per language code: each inner dict maps a character
# that should be replaced in that language to the character to use instead.
replacements = {
    "ar": {
        "ک": "ك",
        "ی": "ي",
    },
    "fa": {
        "ك": "ک",
        "ي": "ی",
        "ى": "ی",
    },
    "ota": {
        "ک": "ك",
        "ي": "ی",
        "ى": "ی",
    },
    "ps": {
        "ك": "ک",
    },
}
# Urdu uses the Persian table (the very same dict object, not a copy).
replacements["ur"] = replacements["fa"]

# Language code -> language name as shown in edit summaries.
language_names = {
    "ar": "Arabic",
    "fa": "Persian",
    "ota": "Ottoman Turkish",
    "ps": "Pashto",
    "ur": "Urdu",
}

# Matches any single character (newlines included), so that every character
# of a string can be visited by a substitution callback.
single_char = re.compile(".", re.DOTALL)
def fix_Arabic (text, language_code):
    """Return text with every character that has an entry in
    replacements[language_code] swapped for its replacement.

    The table is looked up once per character, so a language_code missing
    from replacements raises KeyError only when text is non-empty.
    """
    def corrected (match):
        character = match.group(0)
        return replacements[language_code].get(character, character)

    return single_char.sub(corrected, text)

def show_characters (chars):
    """Describe each character as a wikilink followed by its Unicode name.

    The name is lowercased, the prefix "arabic letter " is dropped and
    "farsi" is re-capitalized; the descriptions are joined with ", ".
    Raises ValueError (from unicodedata.name) for a character without a name.
    """
    descriptions = []
    for char in chars:
        label = character_name(char).lower()
        label = label.replace("arabic letter ", "")
        label = label.replace("farsi", "Farsi")
        descriptions.append("[[{}]] ({})".format(char, label))
    return ", ".join(descriptions)

def iterate_template_data (text, skip_to_title):
    """Yield (title, templates) for each JSON line in text.

    Every non-blank line of text is parsed as a JSON object that has a
    "title" key and a "templates" key.  If skip_to_title is not None,
    entries are skipped until one whose title equals skip_to_title; that
    entry and all following ones are yielded.  Blank lines are ignored.

    Raises json.JSONDecodeError for a malformed line and KeyError when a
    needed key is missing.
    """
    start_processing = skip_to_title is None

    for line in text.splitlines():
        # A stray blank line is not an entry; json.loads would reject it.
        if not line.strip():
            continue

        data = json.loads(line)
        title = data["title"]
        if not start_processing:
            if title != skip_to_title:
                continue
            start_processing = True

        yield title, data["templates"]

def _summarize_corrections (corrections, languages):
    """Build the edit summary for a non-empty list of (old, new) character
    pairs that were applied in the given set of language codes."""
    old, new = zip(*corrections)
    correction_count = 0

    # When every correction is the same pair, name it once and give a count.
    if len(set(corrections)) == 1:
        correction_count = len(corrections)
        old, new = old[:1], new[:1]

    count_text = " " + str(correction_count) + " times" if correction_count > 1 else ""
    return "correcting Arabic-script characters: replaced {} with {}{} in {}".format(
        show_characters(old),
        show_characters(new),
        count_text,
        # Sorted so that the summary does not depend on set iteration order.
        " and ".join(language_names[code] for code in sorted(languages)))

def _ask_to_save ():
    """Prompt until the answer starts with y, n or q; return that letter."""
    while True:
        answer = input("> Save edit? y/n (or quit: q)\n>>> ")

        if not answer:
            continue

        choice = answer[0].lower()
        if choice in ("y", "n", "q"):
            return choice

        print("> Answer not recognized.")

def process_pages (text, skip_to):
    """Interactively correct Arabic-script characters in linked terms.

    text holds one JSON object per line (see iterate_template_data); skip_to
    is a title to resume from, or None to start at the beginning.  For each
    template instance whose language has a table in replacements, the term
    is corrected with fix_Arabic, the template is rewritten in the page
    text, and the user is asked whether to save the page.

    Returns the title of the page being reviewed when the user quits, or
    None when all entries were processed; in the latter case the resume
    file is removed if it exists.
    """
    for title, templates in iterate_template_data(text, skip_to):
        print("title: [[{}]]".format(title))

        corrections = []
        languages = set()

        # Avoid loading page if no changes need to be made.
        page = None
        old_text = None

        for instance in templates:
            template_text = instance["template"]

            try:
                # get(0) raises IndexError when the parsed text has no nodes.
                template = mwparserfromhell.parse(template_text).get(0)
            except IndexError:
                print("no template")
                continue

            language_code = instance["lang"]
            link_target = instance["text"]
            link_target_param = instance["param"]

            if not (language_code and link_target and link_target_param):
                print("missing language code or term")
                continue

            if language_code not in replacements:
                continue

            link_target_corrected = fix_Arabic(link_target, language_code)

            if link_target == link_target_corrected:
                print("did not make any changes to", template_text, "automatically")
                continue

            if page is None:
                page = Page(site, title)
                old_text = page.text

            print("{} \N{RIGHTWARDS ARROW} {} ({})".format(link_target,
                                                           link_target_corrected,
                                                           language_code))
            template.add(link_target_param, link_target_corrected)

            page.text = page.text.replace(template_text, str(template))

            # Record every replaced character so the summary can list them.
            table = replacements[language_code]
            corrections.extend((x, table[x]) for x in link_target if x in table)

            languages.add(language_code)

        # corrections is only non-empty after the page has been loaded, so
        # page is never None when its text is compared here.
        if not corrections or page.text == old_text:
            print("> no changes\n")
            continue

        summary = _summarize_corrections(corrections, languages)
        print("> summary:", summary)

        choice = _ask_to_save()
        if choice == "q":
            print("> quitting")
            return title
        if choice == "y":
            page.save(summary=summary, minor=True, watch="watch")
        print("")

    print("done!")
    # The resume file only exists if an earlier run was quit part-way
    # through; its absence after a complete run is not an error.
    try:
        os.remove(last_saved_filepath)
    except FileNotFoundError:
        pass
    return None

# Driver: resume from the title stored in last_saved_filepath (if any),
# process the pages, and store the title to resume from if the user quit.
try:
    skip_to = None
    try:
        with open(last_saved_filepath, "r", encoding="utf-8") as last_saved:
            # splitlines drops line endings, so a trailing newline in the
            # file cannot prevent the stored title from matching.
            lines = [line for line in last_saved.read().splitlines() if line]
    except (OSError, UnicodeError):
        # Missing or unreadable resume file: start from the beginning.
        lines = []

    if lines:
        skip_to = lines[-1]
        print("skipping to [[{}]]\n".format(skip_to))
    else:
        print("no page to skip to")

    title = process_pages(text, skip_to)

    if title:
        # Titles may contain non-ASCII characters; write them as UTF-8 and
        # close the file so the resume point is flushed to disk.
        with open(last_saved_filepath, "w", encoding="utf-8") as last_saved:
            last_saved.write(title)
except Exception as e:
    print(e)
    traceback.print_exc()
    print("quitting")