From 3a712cce4302d8b64f2dddd4534b340482e4db20 Mon Sep 17 00:00:00 2001 From: sebimoe Date: Sat, 4 Mar 2023 23:20:55 +0100 Subject: [PATCH 1/3] Updated python scripts to work with python 2 and 3, fixed XML parsers --- kanjivg.py | 175 ++++++++++++++++++++++++++++++++++------------------- kvg.py | 24 +++++--- utils.py | 75 +++++++++++++++++++++++ 3 files changed, 201 insertions(+), 73 deletions(-) create mode 100644 utils.py diff --git a/kanjivg.py b/kanjivg.py index 87228ab0f5..1b0a9e541d 100644 --- a/kanjivg.py +++ b/kanjivg.py @@ -16,6 +16,11 @@ # along with this program. If not, see . from xmlhandler import * +from utils import PYTHON_VERSION_MAJOR, canonicalId + +if PYTHON_VERSION_MAJOR > 2: + def unicode(s): + return s # Sample licence header licenseString = """Copyright (C) 2009-2013 Ulrich Apel. @@ -56,16 +61,19 @@ def realchr(i): class Kanji: """Describes a kanji. The root stroke group is accessible from the strokes member.""" - def __init__(self, code, variant): - # Unicode of char being represented (int) - self.code = code + def __init__(self, code, variant = None): + # Unicode of char being represented (standard str) + self.code = canonicalId(code) # Variant of the character, if any self.variant = variant self.strokes = None + def __repr__(self): + return repr(vars(self)) + # String identifier used to uniquely identify the kanji def kId(self): - ret = "%05x" % (self.code,) + ret = self.code if self.variant: ret += "-%s" % (self.variant,) return ret @@ -88,7 +96,7 @@ def getStrokes(self): class StrokeGr: """Describes a stroke group belonging to a kanji as closely as possible to the XML format. Sub-stroke groups or strokes are available in the childs member. 
They can either be of class StrokeGr or Stroke so their type should be checked.""" - def __init__(self, parent): + def __init__(self, parent = None): self.parent = parent if parent: parent.childs.append(self) # Element of strokegr @@ -107,6 +115,15 @@ def __init__(self, parent): self.childs = [] + def __repr__(self): + return repr(vars(self)) + + def setParent(self, parent): + if self.parent is not None or parent is None: + raise Exception("Set parent should only be set once! There is no cleanup for old parents.") + parent.childs.append(self) + self.parent = parent + def toSVG(self, out, rootId, groupCpt = [0], strCpt = [1], indent = 0): gid = rootId if groupCpt[0] != 0: gid += "-g" + str(groupCpt[0]) @@ -208,6 +225,9 @@ def __init__(self, parent): self.stype = None self.svg = None self.numberPos = None + + def __repr__(self): + return repr(vars(self)) def numberToSVG(self, out, number, indent = 0): if self.numberPos: @@ -224,40 +244,58 @@ def toSVG(self, out, rootId, groupCpt, strCpt, indent = 0): class KanjisHandler(BasicHandler): """XML handler for parsing kanji files. It can handle single-kanji files or aggregation files. 
After parsing, the kanjis are accessible through the kanjis member, indexed by their svg file name.""" - def __init__(self, code, variant): + def __init__(self): BasicHandler.__init__(self) - self.kanji = Kanji(code, variant) + self.kanji = None + self.kanjis = {} + self.group = None self.groups = [] self.compCpt = {} self.metComponents = set() def handle_start_kanji(self, attrs): - pass + if self.kanji is not None: + raise Exception("Kanji cannot be nested") + if self.group is not None: + raise Exception("Kanji cannot be inside a group") + if len(self.groups) != 0: + raise Exception("Previous kanji not closed correctly") + idType, idVariantStr = str(attrs["id"]).split("_") + if idType != "kvg:kanji": + raise Exception("Each kanji should have id formatted as kvg:kanji_XXXXX.") + idVariant = idVariantStr.split('-') + self.kanji = Kanji(*idVariant) + def handle_end_kanji(self): - if len(self.groups) != 0: - print("WARNING: stroke groups remaining after reading kanji!") + if self.group is not None: + raise Exception("A group is not closed inside the kanji.") + if len(self.groups) != 1: + raise Exception("Kanji should have 1 root group.") + self.kanji.strokes = self.groups[0] + self.kanjis[self.kanji.code] = self.kanji self.groups = [] + self.kanji = None - def handle_start_strokegr(self, attrs): - if len(self.groups) == 0: parent = None - else: parent = self.groups[-1] - group = StrokeGr(parent) + def handle_start_g(self, attrs): + if self.kanji is None: + raise Exception("Stroke group must be inside a kanji") + group = StrokeGr(self.group) # Now parse group attributes - if attrs.has_key("element"): group.element = unicode(attrs["element"]) - if attrs.has_key("variant"): group.variant = str(attrs["variant"]) - if attrs.has_key("partial"): group.partial = str(attrs["partial"]) - if attrs.has_key("original"): group.original = unicode(attrs["original"]) - if attrs.has_key("part"): group.part = int(attrs["part"]) - if attrs.has_key("number"): group.number = 
int(attrs["number"]) - if attrs.has_key("tradForm") and str(attrs["tradForm"]) == "true": group.tradForm = True - if attrs.has_key("radicalForm") and str(attrs["radicalForm"]) == "true": group.radicalForm = True - if attrs.has_key("position"): group.position = unicode(attrs["position"]) - if attrs.has_key("radical"): group.radical = unicode(attrs["radical"]) - if attrs.has_key("phon"): group.phon = unicode(attrs["phon"]) - - self.groups.append(group) + if "kvg:element" in attrs: group.element = unicode(attrs["kvg:element"]) + if "kvg:variant" in attrs: group.variant = str(attrs["kvg:variant"]) + if "kvg:partial" in attrs: group.partial = str(attrs["kvg:partial"]) + if "kvg:original" in attrs: group.original = unicode(attrs["kvg:original"]) + if "kvg:part" in attrs: group.part = int(attrs["kvg:part"]) + if "kvg:number" in attrs: group.number = int(attrs["kvg:number"]) + if "kvg:tradForm" in attrs and str(attrs["kvg:tradForm"]) == "true": group.tradForm = True + if "kvg:radicalForm" in attrs and str(attrs["kvg:radicalForm"]) == "true": group.radicalForm = True + if "kvg:position" in attrs: group.position = unicode(attrs["kvg:position"]) + if "kvg:radical" in attrs: group.radical = unicode(attrs["kvg:radical"]) + if "kvg:phon" in attrs: group.phon = unicode(attrs["kvg:phon"]) + + self.group = group if group.element: self.metComponents.add(group.element) if group.original: self.metComponents.add(group.original) @@ -266,13 +304,13 @@ def handle_start_strokegr(self, attrs): if not group.part: print("%s: Number specified, but part missing" % (self.kanji.kId())) # The group must exist already if group.part > 1: - if not self.compCpt.has_key(group.element + str(group.number)): + if (group.element + str(group.number)) not in self.compCpt: print("%s: Missing numbered group" % (self.kanji.kId())) elif self.compCpt[group.element + str(group.number)] != group.part - 1: print("%s: Incorrectly numbered group" % (self.kanji.kId())) # The group must not exist else: - if 
self.compCpt.has_key(group.element + str(group.number)): + if (group.element + str(group.number)) in self.compCpt: print("%s: Duplicate numbered group" % (self.kanji.kId())) self.compCpt[group.element + str(group.number)] = group.part # No number, just a part - groups restart with part 1, otherwise must @@ -280,26 +318,25 @@ def handle_start_strokegr(self, attrs): elif group.part: # The group must exist already if group.part > 1: - if not self.compCpt.has_key(group.element): + if group.element not in self.compCpt: print("%s: Incorrectly started multi-part group" % (self.kanji.kId())) elif self.compCpt[group.element] != group.part - 1: print("%s: Incorrectly splitted multi-part group" % (self.kanji.kId())) self.compCpt[group.element] = group.part - def handle_end_strokegr(self): - group = self.groups.pop() - if len(self.groups) == 0: - if self.kanji.strokes: - print("WARNING: overwriting root of kanji!") - self.kanji.strokes = group + def handle_end_g(self): + if self.group.parent is None: + self.groups.append(self.group) + self.group = self.group.parent - def handle_start_stroke(self, attrs): - if len(self.groups) == 0: parent = None - else: parent = self.groups[-1] - stroke = Stroke(parent) - stroke.stype = unicode(attrs["type"]) - if attrs.has_key("path"): stroke.svg = unicode(attrs["path"]) - self.groups[-1].childs.append(stroke) + def handle_start_path(self, attrs): + if self.kanji is None or self.group is None: + raise Exception("Stroke must be inside a kanji and group!") + stroke = Stroke(self.group) + if "kvg:type" in attrs: + stroke.stype = unicode(attrs["kvg:type"]) + if "d" in attrs: stroke.svg = unicode(attrs["d"]) + self.group.childs.append(stroke) class SVGHandler(BasicHandler): """SVG handler for parsing final kanji files. It can handle single-kanji files or aggregation files. 
After parsing, the kanji are accessible through the kanjis member, indexed by their svg file name.""" @@ -311,28 +348,36 @@ def __init__(self): self.metComponents = set() def handle_start_g(self, attrs): + group = StrokeGr() + # Special case for handling the root if len(self.groups) == 0: - id = hex(realord(attrs["kvg:element"]))[2:] - self.currentKanji = Kanji(id) - self.kanjis[id] = self.currentKanji + idType, idVariantStr = str(attrs["id"]).split("_") + idVariant = idVariantStr.split('-') + if idType == "kvg:StrokePaths": + pass + elif idType == "kvg:StrokeNumbers": + return + else: + raise Exception("Invalid root group id type (%s)" % (str(attrs["id"]),)) + self.currentKanji = Kanji(*idVariant) + self.kanjis[self.currentKanji.code] = self.currentKanji self.compCpt = {} - parent = None - else: parent = self.groups[-1] + else: + group.setParent(self.groups[-1]) - group = StrokeGr(parent) # Now parse group attributes - if attrs.has_key("kvg:element"): group.element = unicode(attrs["kvg:element"]) - if attrs.has_key("kvg:variant"): group.variant = str(attrs["kvg:variant"]) - if attrs.has_key("kvg:partial"): group.partial = str(attrs["kvg:partial"]) - if attrs.has_key("kvg:original"): group.original = unicode(attrs["kvg:original"]) - if attrs.has_key("kvg:part"): group.part = int(attrs["kvg:part"]) - if attrs.has_key("kvg:number"): group.number = int(attrs["kvg:number"]) - if attrs.has_key("kvg:tradForm") and str(attrs["kvg:tradForm"]) == "true": group.tradForm = True - if attrs.has_key("kvg:radicalForm") and str(attrs["kvg:radicalForm"]) == "true": group.radicalForm = True - if attrs.has_key("kvg:position"): group.position = unicode(attrs["kvg:position"]) - if attrs.has_key("kvg:radical"): group.radical = unicode(attrs["kvg:radical"]) - if attrs.has_key("kvg:phon"): group.phon = unicode(attrs["kvg:phon"]) + if "kvg:element" in attrs: group.element = unicode(attrs["kvg:element"]) + if "kvg:variant" in attrs: group.variant = str(attrs["kvg:variant"]) + if 
"kvg:partial" in attrs: group.partial = str(attrs["kvg:partial"]) + if "kvg:original" in attrs: group.original = unicode(attrs["kvg:original"]) + if "kvg:part" in attrs: group.part = int(attrs["kvg:part"]) + if "kvg:number" in attrs: group.number = int(attrs["kvg:number"]) + if "kvg:tradForm" in attrs and str(attrs["kvg:tradForm"]) == "true": group.tradForm = True + if "kvg:radicalForm" in attrs and str(attrs["kvg:radicalForm"]) == "true": group.radicalForm = True + if "kvg:position" in attrs: group.position = unicode(attrs["kvg:position"]) + if "kvg:radical" in attrs: group.radical = unicode(attrs["kvg:radical"]) + if "kvg:phon" in attrs: group.phon = unicode(attrs["kvg:phon"]) self.groups.append(group) @@ -364,9 +409,11 @@ def handle_start_g(self, attrs): self.compCpt[group.element] = group.part def handle_end_g(self): + if len(self.groups) == 0: + return group = self.groups.pop() # End of kanji? - if len(self.groups) == 0: + if len(self.groups) == 1: # index 1 - ignore root group self.currentKanji.strokes = group self.currentKanji = None self.groups = [] @@ -376,6 +423,8 @@ def handle_start_path(self, attrs): if len(self.groups) == 0: parent = None else: parent = self.groups[-1] stroke = Stroke(parent) - stroke.stype = unicode(attrs["kvg:type"]) - if attrs.has_key("d"): stroke.svg = unicode(attrs["d"]) + if "kvg:type" in attrs: + stroke.stype = unicode(attrs["kvg:type"]) + if "d" in attrs: + stroke.svg = unicode(attrs["d"]) self.groups[-1].childs.append(stroke) diff --git a/kvg.py b/kvg.py index 37f210c7e3..960d267ae0 100755 --- a/kvg.py +++ b/kvg.py @@ -16,8 +16,9 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . 
-import os, os.path, sys, codecs, re, datetime +import sys, os, re, datetime from kanjivg import licenseString +from utils import open pathre = re.compile(r'') @@ -28,9 +29,9 @@ release create single release file""" % (sys.argv[0],) def createPathsSVG(f): - s = codecs.open(f, "r", "utf-8").read() + s = open(f, "r", encoding="utf-8").read() paths = pathre.findall(s) - out = codecs.open(f[:-4] + "-paths.svg", "w", "utf-8") + out = open(f[:-4] + "-paths.svg", "w", encoding="utf-8") out.write(""" \n""") i = 1 @@ -44,20 +45,21 @@ def mergePathsSVG(f): if not os.path.exists(pFile): print("%s does not exist!" % (pFile,)) return - s = codecs.open(pFile, "r", "utf-8").read() + s = open(pFile, "r", encoding="utf-8").read() paths = pathre.findall(s) - s = codecs.open(f, "r", "utf-8").read() + s = open(f, "r", encoding="utf-8").read() pos = 0 while True: match = pathre.search(s[pos:]) if match and len(paths) == 0 or not match and len(paths) > 0: print("Paths count mismatch for %s" % (f,)) return - if not match and len(paths) == 0: break + if not match and len(paths) == 0: + break s = s[:pos + match.start(1)] + paths[0] + s[pos + match.end(1):] pos += match.start(1) + len(paths[0]) del paths[0] - codecs.open(f, "w", "utf-8").write(s) + open(f, "w", encoding="utf-8").write(s) def release(): datadir = "kanji" @@ -69,7 +71,7 @@ def release(): del allfiles files.sort() - out = open("kanjivg.xml", "w") + out = open("kanjivg.xml", "w", encoding='utf8') out.write('\n') out.write("\n") out.write("\n") for f in files: - data = open(os.path.join(datadir, f)).read() + data = open(os.path.join(datadir, f), encoding='utf8').read() + data = data.replace("\r\n", "\n") data = data[data.find("= 2 and idLen <= 5: + id = int(id, 16) + else: + raise ValueError("Character id must be a 1-character string with the character itself, or 2-5 hex digit unicode codepoint.") + if not isinstance(id, int): + raise ValueError("canonicalId: id must be int or str") + if id > 0xf and id <= 0xfffff: + return 
"%05x" % (id) + raise ValueError("Character id out of range") + +class SvgFileInfo: + def __init__(self, file, dir): + self.path = os.path.join(dir, file) + if file[-4:].lower() != ".svg": + raise Exception("File should have .svg exension. (%s)" % (str(self.path))) + parts = (file[:-4]).split('-') + if len(parts) == 2: + self.variant = parts[1] + elif len(parts) != 1: + raise Exception("File should have at most 2 parts separated by a dash. (%s)" % (str(file))) + self.id = parts[0] + if self.id != canonicalId(self.id): + raise Exception("File name not in canonical format (%s)" % (str(self.path))) + + def __repr__(self): + return repr(vars(self)) + + def read(self, SVGHandler=None): + if SVGHandler is None: + from kanjivg import SVGHandler + handler = SVGHandler() + parseXmlFile(self.path, handler) + parsed = list(handler.kanjis.values()) + if len(parsed) != 1: + raise Exception("File does not contain 1 kanji entry. (%s)" % (self.path)) + return parsed[0] + +def parseXmlFile(path, handler): + from xml.sax import parse + parse(path, handler) + +def listSvgFiles(dir): + return [ + SvgFileInfo(f, dir) + for f in os.listdir(dir) + ] + +def readXmlFile(path, KanjisHandler=None): + if KanjisHandler is None: + from kanjivg import KanjisHandler + handler = KanjisHandler() + parseXmlFile(path, handler) + parsed = list(handler.kanjis.values()) + if len(parsed) == 0: + raise Exception("File does not contain any kanji entries. (%s)" % (path)) + return handler.kanjis + From 7431f32e19be4f82ac08048633538830888548f2 Mon Sep 17 00:00:00 2001 From: sebimoe Date: Fri, 10 Mar 2023 22:11:25 +0100 Subject: [PATCH 2/3] Added command-line kanji lookup tool The tool displays a summary of loaded character, can be used for validating xml parser functions. 
--- kvg-lookup.py | 147 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) create mode 100644 kvg-lookup.py diff --git a/kvg-lookup.py b/kvg-lookup.py new file mode 100644 index 0000000000..89080aa905 --- /dev/null +++ b/kvg-lookup.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (C) 2023 Sebastian Grygiel +# Copyright (C) 2011-2013 Alexandre Courbot +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import sys, os, re, datetime +from kanjivg import Stroke, StrokeGr +from utils import listSvgFiles, readXmlFile, canonicalId, PYTHON_VERSION_MAJOR + +if PYTHON_VERSION_MAJOR > 2: + def unicode(s): + return s + def unichr(c): + return chr(c) + +helpString = """Usage: %s [...elementN] + +Recognized commands: + find-svg Find and view summary of an SVG file for the given + element in ./kanji/ directory. + find-xml Find and view summary of a entry for + the given element from ./kanjivg.xml file. + +Parameters: + element May either be the singular character, e.g. 並 or its + unicode code-point e.g. 4e26. + +Examples: + %s find-svg 並 Will list SVG files describing given character. + %s find-xml 4e26 Will list entry for the same character. 
+""" % (sys.argv[0], sys.argv[0], sys.argv[0]) + +# Output helper + +lossInWeirdEncoding = False +def writeOutput(data, output): + if PYTHON_VERSION_MAJOR >= 3: + output.write(data) + return + + global lossInWeirdEncoding + if output.encoding == None: + encoding = 'utf8' + else: + encoding = output.encoding + + encoded = data.encode(encoding, errors="replace") + if encoding != 'utf8' and encoding != 'utf-8': + if encoded.decode(encoding) != data: + lossInWeirdEncoding = encoding + + output.write(encoded) + +# Summary generators + +def strokeGroupSummary(gr, indent = 0): + if not isinstance(gr, StrokeGr): + raise Exception("Invalid structure") + + ret = unicode(" " * indent * 4) + # ret += gr.element if gr.element is not None and len(gr.element) > 0 else "・" + ret += "- group" + if gr.element is not None and len(gr.element) > 0: + ret += " %s" % (gr.element,) + if gr.position: + ret += " (%s)" % (gr.position,) + + childStrokes = [s.stype for s in gr.childs if isinstance(s, Stroke) and s.stype] + if len(childStrokes): + ret += "\n%s- strokes: %s" % (" " * (indent+1) * 4, ' '.join(childStrokes)) + + ret += "\n" + + for g in gr.childs: + if isinstance(g, StrokeGr): + ret += strokeGroupSummary(g, indent + 1) + + return ret + +def characterSummary(c): + ret = "Character summary: %s (%s)" % (c.code, c.strokes.element) + if c.variant: + ret += " - variant: %s" % (c.variant) + ret += "\n" + ret += strokeGroupSummary(c.strokes) + return ret + +# Commands + +def commandFindSvg(arg): + id = canonicalId(arg) + kanji = [(f.path, f.read()) for f in listSvgFiles("./kanji/") if f.id == id] + print("Found %d files matching ID %s" % (len(kanji), id)) + for i, (path, c) in enumerate(kanji): + print("\nFile %s (%d/%d):" % (path, i+1, len(kanji))) + writeOutput(characterSummary(c) + "\n", sys.stdout) + +def commandFindXml(arg): + id = canonicalId(arg) + files = readXmlFile('./kanjivg.xml') + if id in files: + writeOutput(characterSummary(files[id]) + "\n", sys.stdout) + else: + 
writeOutput(unicode("Character %s (%s) not found.\n") % (id, unichr(int(id, 16))), sys.stdout) + +# Main wrapper + +actions = { + "find-svg": (commandFindSvg, 2), + "find-xml": (commandFindXml, 2), +} + +if __name__ == "__main__": + if len(sys.argv) < 2 or sys.argv[1] not in actions.keys() or \ + len(sys.argv) <= actions[sys.argv[1]][1]: + print(helpString) + sys.exit(0) + + action = actions[sys.argv[1]][0] + args = sys.argv[2:] + + if len(args) == 0: + action() + else: + for f in args: + action(f) + + if lossInWeirdEncoding: + notice = """\nNotice: SOME CHARACTERS IN THE OUTPUT HAVE BEEN REPLACED WITH QUESTION MARKS. + The text output has been encoded using your encoding of the standard + output (%s). Try redirecting output to file if you can't get it to + work in terminal. e.g.: kvg-lookup.py find-svg 4e26 > output.txt\n""" % (lossInWeirdEncoding,) + writeOutput(notice, sys.stderr) From 81f183bc3fb6c0a590366c9abab01e3729e7844e Mon Sep 17 00:00:00 2001 From: sebimoe Date: Sun, 12 Mar 2023 00:45:43 +0100 Subject: [PATCH 3/3] Fixed whitespace --- kvg-lookup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kvg-lookup.py b/kvg-lookup.py index 89080aa905..84c6facc87 100644 --- a/kvg-lookup.py +++ b/kvg-lookup.py @@ -36,7 +36,7 @@ def unichr(c): the given element from ./kanjivg.xml file. Parameters: - element May either be the singular character, e.g. 並 or its + element May either be the singular character, e.g. 並 or its unicode code-point e.g. 4e26. Examples: