User:Erutuon/scripts/fix Arabic.py
< User:Erutuon | scripts
#! /usr/bin/env python3
from sys import argv
import os, json, traceback, mwparserfromhell
import regex as re
from unicodedata import name as character_name, normalize
from pywikibot import Page, Site
# File holding the title of the page to resume from on the next run.
last_saved_filepath = "last_saved.txt"

if len(argv) != 2:
    raise ValueError("expected one commandline argument: filename")

print("filename:", argv[1])

# Read the whole input up front. The file holds Arabic-script text, so decode
# it explicitly as UTF-8 instead of relying on the platform default, and close
# the handle as soon as the contents are in memory.
with open("wrong_script/" + argv[1], "r", encoding="utf-8") as input_file:
    text = input_file.read()

site = Site(code="en", fam="wiktionary")
# For each language code, a table mapping an Arabic-script character that is
# wrong for that language to the character that should be used instead.
replacements = {
    "ar": {
        "ک": "ك",  # keheh -> kaf
        "ی": "ي",  # Farsi yeh -> yeh
    },
    "fa": {
        "ك": "ک",  # kaf -> keheh
        "ي": "ی",  # yeh -> Farsi yeh
        "ى": "ی",  # alef maksura -> Farsi yeh
    },
    "ota": {
        "ک": "ك",  # keheh -> kaf
        "ي": "ی",  # yeh -> Farsi yeh
        "ى": "ی",  # alef maksura -> Farsi yeh
    },
    "ps": {
        "ك": "ک",  # kaf -> keheh
    },
}
# Urdu uses the Persian table (the very same dict object, not a copy).
replacements["ur"] = replacements["fa"]

# Language names used when building edit summaries.
language_names = {
    "ar": "Arabic",
    "fa": "Persian",
    "ota": "Ottoman Turkish",
    "ps": "Pashto",
    "ur": "Urdu",
}

# No longer needed by fix_Arabic; kept so the module-level name stays defined.
single_char = re.compile(".", re.DOTALL)


def fix_Arabic(text, language_code):
    """Return *text* with the wrong characters for *language_code* replaced.

    Each character that is a key in ``replacements[language_code]`` is
    replaced by the corresponding value; every other character (including
    newlines) is kept as-is. A language code without a replacement table
    leaves the text unchanged.
    """
    table = replacements.get(language_code)
    if not table:
        return text
    # The translation table is built per call so that later changes to
    # ``replacements`` are always honoured; the tables are tiny.
    return text.translate(str.maketrans(table))
def show_characters(chars):
    """Format characters as a comma-separated list of wikilinks with names.

    Each character becomes ``[[c]] (name)``, where *name* is the lowercased
    Unicode character name with the "arabic letter " prefix removed and
    "farsi" re-capitalized as "Farsi".
    """
    described = []
    for char in chars:
        label = character_name(char).lower()
        label = label.replace("arabic letter ", "")
        label = label.replace("farsi", "Farsi")
        described.append("[[{}]] ({})".format(char, label))
    return ", ".join(described)
def iterate_template_data(text, skip_to_title):
    """Yield ``(title, templates)`` for each JSON line in *text*.

    Every non-blank line of *text* is parsed as a JSON object with "title"
    and "templates" keys. If *skip_to_title* is not None, entries are skipped
    until one whose title equals it is found; that entry and all later ones
    are yielded. If no title matches, nothing is yielded.
    """
    start_processing = skip_to_title is None
    for line in text.splitlines():
        # Blank lines (e.g. a trailing empty line) are not valid JSON.
        if not line.strip():
            continue
        data = json.loads(line)
        title = data["title"]
        if not start_processing:
            if title != skip_to_title:
                continue
            start_processing = True
        yield title, data["templates"]
def _build_edit_summary(corrections, languages):
    """Return the edit summary for the given character corrections.

    *corrections* is a non-empty list of ``(wrong_char, right_char)`` pairs,
    one per replaced character occurrence; *languages* is the set of language
    codes whose templates were changed.
    """
    old_chars = [old for old, _ in corrections]
    new_chars = [new for _, new in corrections]
    times = ""
    if len(set(old_chars)) == 1 and len(set(new_chars)) == 1:
        # Only one kind of replacement was made: name it once and, if it
        # happened repeatedly, say how many times.
        if len(old_chars) > 1:
            times = " {} times".format(len(old_chars))
        old_chars = old_chars[:1]
        new_chars = new_chars[:1]
    return "correcting Arabic-script characters: replaced {} with {}{} in {}".format(
        show_characters(old_chars),
        show_characters(new_chars),
        times,
        # Sorted so that the summary does not depend on set iteration order.
        " and ".join(language_names[code] for code in sorted(languages)))


def _ask_whether_to_save():
    """Prompt until the user answers; return "y", "n" or "q"."""
    while True:
        answer = input("> Save edit? y/n (or quit: q)\n>>> ")
        choice = answer[:1].lower()
        if choice in ("y", "n", "q"):
            return choice
        print("> Answer not recognized.")


def process_pages(text, skip_to):
    """Interactively correct Arabic-script characters in template links.

    Iterates over the entries of *text* (see iterate_template_data), starting
    at the title *skip_to* if it is not None. For each entry, the link target
    of every listed template is corrected with fix_Arabic; if that changes
    the page text, the edit summary is shown and the user is asked whether to
    save the page.

    Returns the title of the current page if the user chooses to quit, so
    that the caller can record where to resume. Returns None after all
    entries have been processed, in which case the resume file is deleted.
    """
    for title, templates in iterate_template_data(text, skip_to):
        print("title: [[{}]]".format(title))
        corrections = []
        languages = set()
        # Avoid loading page if no changes need to be made.
        page = None
        old_text = None

        for instance in templates:
            template_text = instance["template"]
            wikicode = mwparserfromhell.parse(template_text)
            try:
                template = wikicode.get(0)
            except IndexError:
                # Nothing was parsed out of the template text.
                print("no template")
                continue
            if not isinstance(template, mwparserfromhell.nodes.Template):
                # The first node is plain text or other markup, which has no
                # parameters that could be rewritten.
                print("no template")
                continue

            language_code = instance["lang"]
            link_target = instance["text"]
            link_target_param = instance["param"]
            if not (language_code and link_target and link_target_param):
                print("missing language code or term")
                continue
            if language_code not in replacements:
                continue

            link_target_corrected = fix_Arabic(link_target, language_code)
            if link_target == link_target_corrected:
                print("did not make any changes to", template_text, "automatically")
                continue

            if page is None:
                page = Page(site, title)
                old_text = page.text
            print("{} \N{RIGHTWARDS ARROW} {} ({})".format(link_target,
                                                           link_target_corrected,
                                                           language_code))
            template.add(link_target_param, link_target_corrected)
            page.text = page.text.replace(template_text, str(template))
            # Record every replaced character occurrence for the summary.
            table = replacements[language_code]
            corrections.extend((x, table[x]) for x in link_target if x in table)
            languages.add(language_code)

        # corrections is only non-empty once the page has been loaded; the
        # page text can still be unchanged if the template text was not found.
        if not corrections or page.text == old_text:
            print("> no changes\n")
            continue

        summary = _build_edit_summary(corrections, languages)
        print("> summary:", summary)
        choice = _ask_whether_to_save()
        if choice == "q":
            print("> quitting")
            return title
        if choice == "y":
            page.save(summary=summary, minor=True, watch="watch")
        print("")

    print("done!")
    # Everything was processed, so there is nothing left to resume from. The
    # resume file only exists if an earlier run was interrupted.
    try:
        os.remove(last_saved_filepath)
    except FileNotFoundError:
        pass
    return None
# Script entry point: resume from the title saved by a previous run (if any),
# process the pages, and record the title to resume from if the user quits.
try:
    skip_to = None
    try:
        with open(last_saved_filepath, "r", encoding="utf-8") as last_saved:
            # Ignore blank lines and surrounding whitespace/newlines, which
            # would otherwise keep the saved title from ever matching.
            saved_titles = [line.strip() for line in last_saved if line.strip()]
    except (OSError, UnicodeDecodeError):
        # Missing or unreadable resume file: start from the beginning.
        saved_titles = []
    if saved_titles:
        skip_to = saved_titles[-1]
        print("skipping to [[{}]]\n".format(skip_to))
    else:
        print("no page to skip to")

    title = process_pages(text, skip_to)
    if title:
        with open(last_saved_filepath, "w", encoding="utf-8") as last_saved:
            last_saved.write(title)
except Exception as e:
    print(e)
    traceback.print_exc()
    print("quitting")