User:Erutuon/scripts/fix Cyrillic.py
< User:Erutuon | scripts
#! /usr/bin/env python3
from sys import argv
import json
import regex as re
from pywikibot import Page, Site
import mwparserfromhell
from unicodedata import name as character_name, normalize
# The script takes exactly one argument: the name of a data file inside the
# "wrong_script/" directory.
if len(argv) != 2:
    raise ValueError("expected one commandline argument: filename")
print("filename:", argv[1])
# Read the whole data file up front and close it straight away. The encoding
# is given explicitly so that the non-ASCII text in the file is decoded the
# same way on every platform (the file is assumed to be UTF-8).
with open("wrong_script/" + argv[1], "r", encoding="utf-8") as data_file:
    text = data_file.read()
# Wiki that the pages to be edited are loaded from and saved to.
site = Site(code="en", fam="wiktionary")
# Homoglyph table: maps a single Latin character to the Cyrillic text that
# should replace it. Some values appear to be a Cyrillic letter followed by a
# combining mark rather than a single character.
# from https://en.wikipedia.org/wiki/User:Trey314159/homoglyphHunter.js
# Presumably a local change relative to that table:
# "I":"І", -> "I":"ӏ",
# NOTE(review): the last row appears to repeat the "ō"/"Ō" entries of the row
# before it; in a dict literal a repeated key just overwrites the earlier one.
Latin_to_Cyrillic = {
"a":"а", "A":"А", "ă":"ӑ", "Ă":"Ӑ", "ä":"ӓ", "Ä":"Ӓ", "æ":"ӕ", "Æ":"Ӕ",
"B":"В", "c":"с", "C":"С", "ç":"ҫ", "Ç":"Ҫ", "e":"е", "E":"Е", "è":"ѐ",
"È":"Ѐ", "ë":"ё", "Ë":"Ё", "ĕ":"ӗ", "Ĕ":"Ӗ", "ə":"ә", "Ə":"Ә", "H":"Н",
"i":"і", "I":"ӏ", "ï":"ї", "Ï":"Ї", "ḯ":"ї́", "Ḯ":"Ї́", "j":"ј", "J":"Ј",
"k":"к", "K":"К", "M":"М", "m":"м", "o":"о", "O":"О", "ö":"ӧ", "Ö":"Ӧ", "p":"р",
"P":"Р", "Q":"Ԛ", "s":"ѕ", "S":"Ѕ", "T":"Т", "W":"Ԝ", "x":"х", "X":"Х",
"y":"у", "Y":"У", "ȳ":"ӯ", "Ȳ":"Ӯ", "ÿ":"ӱ", "Ÿ":"Ӱ", "á":"а́", "é":"е́",
"í":"і́", "ó":"о́", "ý":"у́", "ħ":"ћ", "ɜ":"з", "ò":"о̀", "Ò":"О̀", "l":"ӏ",
"h":"һ", "ā":"а̄", "Ā":"А̄", "ē":"е̄", "Ē":"Е̄", "ī":"і̄", "ō":"о̄", "Ō":"О̄",
"ō":"о̄", "Ō":"О̄",
}
# Matches any one character, newlines included (DOTALL), so that a
# substitution callback gets to see every character of a string in turn.
single_char = re.compile(".", re.DOTALL)
def replace_Latin_with_Cyrillic(text):
    """Return *text* with Latin homoglyphs swapped for Cyrillic characters.

    The substitution runs twice. The first pass works on the text as given,
    so a precomposed Latin character with its own table entry is mapped
    directly, e.g. to
        ҫ (CYRILLIC SMALL LETTER ES WITH DESCENDER)
    instead of what replacement after decomposition would produce:
        с̧ (CYRILLIC SMALL LETTER ES, COMBINING CEDILLA)
    The text is then decomposed (NFD) and substituted again, which reaches
    base letters that were hidden inside precomposed characters. The result
    is returned in NFC.
    """
    def to_cyrillic(match):
        # Characters without a table entry are passed through unchanged.
        found = match[0]
        return Latin_to_Cyrillic.get(found, found)

    first_pass = single_char.sub(to_cyrillic, text)
    decomposed = normalize("NFD", first_pass)
    second_pass = single_char.sub(to_cyrillic, decomposed)
    return normalize("NFC", second_pass)
def show_graphemes(graphemes):
    """Format *graphemes* as a comma-separated list of wiki links.

    A one-character grapheme becomes "[[x]]". A longer grapheme links only
    its first character and then names each following character by its
    lower-cased Unicode name, e.g. "[[x]] with combining acute accent".
    """
    def describe(grapheme):
        if len(grapheme) <= 1:
            return "[[" + grapheme + "]]"
        base, marks = grapheme[0], grapheme[1:]
        mark_names = [character_name(mark).lower() for mark in marks]
        return "[[" + base + "]] with " + " and ".join(mark_names)

    return ", ".join(map(describe, graphemes))
def iterate_template_data(text, skip_to_title):
    """Yield ``(title, templates)`` for each JSON record in *text*.

    *text* holds one JSON object per line; each object must have a "title"
    and a "templates" key. Blank lines are ignored.

    If *skip_to_title* is None every record is yielded. Otherwise records are
    skipped until one whose title equals *skip_to_title* is found; that record
    and all following ones are yielded. If no title matches, nothing is
    yielded.

    Raises whatever ``json.loads`` raises for a malformed line, and KeyError
    for a record missing one of the two keys.
    """
    start_processing = skip_to_title is None
    # Split on "\n" only. str.splitlines() also breaks on characters such as
    # U+2028, U+2029 and U+0085, which JSON allows unescaped inside strings,
    # so it could cut a record in half.
    for line in text.split("\n"):
        if not line.strip():
            # Blank line (including the empty piece after a final newline).
            continue
        data = json.loads(line)
        title = data["title"]
        if not start_processing:
            if title != skip_to_title:
                continue
            start_processing = True
        yield title, data["templates"]
def _summarize_corrections(corrections):
    """Build the edit summary for a non-empty list of (Latin, Cyrillic) pairs.

    When every pair is the same replacement it is shown once, followed by a
    repeat count if it occurred more than once; otherwise every replacement
    is listed in the order it was recorded.
    """
    latin, cyrillic = (list(column) for column in zip(*corrections))
    correction_len = 0
    if (all(x == latin[0] for x in latin)
            and all(x == cyrillic[0] for x in cyrillic)):
        correction_len = len(latin)
        latin = latin[:1]
        cyrillic = cyrillic[:1]
    len_str = " " + str(correction_len) + " times" if correction_len > 1 else ""
    return "replaced Latin {} with Cyrillic {}{}".format(
        show_graphemes(latin), show_graphemes(cyrillic), len_str)


def process_pages(text, skip_to):
    """Interactively replace Latin homoglyphs in template parameters.

    *text* and *skip_to* are passed to ``iterate_template_data``. For every
    page title it yields, each template instance (a dict with the keys
    "template", "lang", "text" and "param") is checked: if its "text" value
    changes under ``replace_Latin_with_Cyrillic`` and is not made up solely of
    Latin/inherited/common-script characters, the named parameter of the
    template is set to the corrected value and the template's wikitext is
    replaced in the page text. The user is then shown an edit summary and
    asked whether to save.

    Returns the title (as it appears in the data) of the page being handled
    when the user answers "q", so the caller can resume from it later;
    returns None after all pages have been processed.
    """
    only_Latin = re.compile(r"^[\p{Latn}\p{Zinh}\p{Zyyy}]+$")
    for (title, templates) in iterate_template_data(text, skip_to):
        print("title: [[{}]]".format(title))
        corrections = []
        # Avoid loading page if no changes need to be made.
        page = None
        old_text = None
        for instance in templates:
            template_text = instance["template"]
            wikitext = mwparserfromhell.parse(template_text)
            try:
                template = wikitext.get(0)
            except Exception:
                # Not a bare "except:", so Ctrl-C still interrupts the script.
                print("no template")
                continue
            language_code = instance["lang"]
            link_target = instance["text"]
            link_target_param = instance["param"]
            if not (language_code and link_target and link_target_param):
                print("missing language code or term")
                continue
            link_target_corrected = replace_Latin_with_Cyrillic(link_target)
            if link_target == link_target_corrected:
                print("could not correct", template_text, "automatically")
                continue
            if only_Latin.match(link_target):
                print("'{}' only contains Latin, so will not be modified".format(link_target))
                continue
            if page is None:
                page = Page(site, title)
                if page.isRedirectPage():
                    page = page.getRedirectTarget()
                    new_title = page.title()
                    print("Followed redirect from [[" + title + "]] to [[" + new_title + "]]")
                    # `title` is deliberately left alone: it is what gets
                    # returned as the resume point, and resuming compares it
                    # with titles in the data, not with redirect targets.
                old_text = page.text
            print("{} \N{RIGHTWARDS ARROW} {} ({})".format(link_target,
                                                           link_target_corrected,
                                                           language_code))
            template.add(link_target_param, link_target_corrected)
            page.text = page.text.replace(template_text, str(template))
            # NOTE(review): only characters with their own table entry are
            # recorded here. A target that changes solely through the
            # decomposition pass contributes nothing, and a page with no
            # recorded corrections is reported as "no changes" below.
            corrections.extend((x, Latin_to_Cyrillic[x])
                               for x in link_target
                               if x in Latin_to_Cyrillic)
        # A non-empty `corrections` implies that the page was loaded above.
        if corrections and page.text != old_text:
            summary = _summarize_corrections(corrections)
            print("> summary:", summary)
            while True:
                answer = input("> Save edit? y/n (or quit: q)\n>>> ")
                if len(answer) > 0:
                    answer = answer[0].lower()
                if answer == "y":
                    page.save(summary=summary, minor=True, watch="watch")
                    print("")
                    break
                elif answer == "q":
                    print("> quitting")
                    return title
                elif answer == "n":
                    print("")
                    break
                else:
                    print("> Answer not recognized.")
        else:
            print("> no changes\n")
    # Only reached when the loop ran to completion (quitting returns early).
    print("done!")
# Entry point: resume from the title stored in last_saved.txt (if any),
# process the pages, and store the title to resume from if the user quit.
try:
    skip_to = None
    try:
        with open("last_saved.txt", "r", encoding="utf-8") as last_saved:
            # Drop line terminators and blank lines so that a trailing newline
            # in the file cannot keep the title from matching the data.
            lines = [line.rstrip("\n") for line in last_saved]
            lines = [line for line in lines if line]
        # IndexError here means the file held no title.
        skip_to = lines[-1]
        print("skipping to [[{}]]\n".format(skip_to))
    except (OSError, UnicodeError, IndexError):
        print("no page to skip to")
    title = process_pages(text, skip_to)
    if title:
        # The context manager makes sure the title is flushed and the file
        # closed before the script exits.
        with open("last_saved.txt", "w", encoding="utf-8") as last_saved:
            last_saved.write(title)
except Exception as e:
    print(e)
    print("quitting")