Skip to content

Commit

Permalink
Merge pull request #418 from sebimoe/feature/python3-kanji-lookup
Browse files Browse the repository at this point in the history
Added python3 support, added lookup utility
  • Loading branch information
benkasminbullock authored Mar 11, 2023
2 parents 3d5b57d + 81f183b commit 1344798
Show file tree
Hide file tree
Showing 4 changed files with 348 additions and 73 deletions.
175 changes: 112 additions & 63 deletions kanjivg.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from xmlhandler import *
from utils import PYTHON_VERSION_MAJOR, canonicalId

if PYTHON_VERSION_MAJOR > 2:
def unicode(s):
return s

# Sample licence header
licenseString = """Copyright (C) 2009-2013 Ulrich Apel.
Expand Down Expand Up @@ -56,16 +61,19 @@ def realchr(i):

class Kanji:
"""Describes a kanji. The root stroke group is accessible from the strokes member."""
def __init__(self, code, variant):
# Unicode of char being represented (int)
self.code = code
def __init__(self, code, variant = None):
# Unicode of char being represented (standard str)
self.code = canonicalId(code)
# Variant of the character, if any
self.variant = variant
self.strokes = None

def __repr__(self):
return repr(vars(self))

# String identifier used to uniquely identify the kanji
def kId(self):
ret = "%05x" % (self.code,)
ret = self.code
if self.variant: ret += "-%s" % (self.variant,)
return ret

Expand All @@ -88,7 +96,7 @@ def getStrokes(self):

class StrokeGr:
"""Describes a stroke group belonging to a kanji as closely as possible to the XML format. Sub-stroke groups or strokes are available in the childs member. They can either be of class StrokeGr or Stroke so their type should be checked."""
def __init__(self, parent):
def __init__(self, parent = None):
self.parent = parent
if parent: parent.childs.append(self)
# Element of strokegr
Expand All @@ -107,6 +115,15 @@ def __init__(self, parent):

self.childs = []

def __repr__(self):
return repr(vars(self))

def setParent(self, parent):
if self.parent is not None or parent is None:
raise "Set parent should only be set once! There is no cleanup for old parents."
parent.childs.append(self)
self.parent = parent

def toSVG(self, out, rootId, groupCpt = [0], strCpt = [1], indent = 0):
gid = rootId
if groupCpt[0] != 0: gid += "-g" + str(groupCpt[0])
Expand Down Expand Up @@ -208,6 +225,9 @@ def __init__(self, parent):
self.stype = None
self.svg = None
self.numberPos = None

def __repr__(self):
return repr(vars(self))

def numberToSVG(self, out, number, indent = 0):
if self.numberPos:
Expand All @@ -224,40 +244,58 @@ def toSVG(self, out, rootId, groupCpt, strCpt, indent = 0):

class KanjisHandler(BasicHandler):
"""XML handler for parsing kanji files. It can handle single-kanji files or aggregation files. After parsing, the kanjis are accessible through the kanjis member, indexed by their svg file name."""
def __init__(self, code, variant):
def __init__(self):
BasicHandler.__init__(self)
self.kanji = Kanji(code, variant)
self.kanji = None
self.kanjis = {}
self.group = None
self.groups = []
self.compCpt = {}
self.metComponents = set()

def handle_start_kanji(self, attrs):
pass
if self.kanji is not None:
raise Exception("Kanji cannot be nested")
if self.group is not None:
raise Exception("Kanji cannot be inside a group")
if len(self.groups) != 0:
raise Exception("Previous kanji not closed correctly")
idType, idVariantStr = str(attrs["id"]).split("_")
if idType != "kvg:kanji":
raise Exception("Each kanji should have id formatted as kvg:kanji_XXXXX.")
idVariant = idVariantStr.split('-')
self.kanji = Kanji(*idVariant)


def handle_end_kanji(self):
if len(self.groups) != 0:
print("WARNING: stroke groups remaining after reading kanji!")
if self.group is not None:
raise Exception("A group is not closed inside the kanji.")
if len(self.groups) != 1:
raise Exception("Kanji should have 1 root group.")
self.kanji.strokes = self.groups[0]
self.kanjis[self.kanji.code] = self.kanji
self.groups = []
self.kanji = None

def handle_start_strokegr(self, attrs):
if len(self.groups) == 0: parent = None
else: parent = self.groups[-1]
group = StrokeGr(parent)
def handle_start_g(self, attrs):
if self.kanji is None:
raise Exception("Stroke group must be inside a kanji")
group = StrokeGr(self.group)

# Now parse group attributes
if attrs.has_key("element"): group.element = unicode(attrs["element"])
if attrs.has_key("variant"): group.variant = str(attrs["variant"])
if attrs.has_key("partial"): group.partial = str(attrs["partial"])
if attrs.has_key("original"): group.original = unicode(attrs["original"])
if attrs.has_key("part"): group.part = int(attrs["part"])
if attrs.has_key("number"): group.number = int(attrs["number"])
if attrs.has_key("tradForm") and str(attrs["tradForm"]) == "true": group.tradForm = True
if attrs.has_key("radicalForm") and str(attrs["radicalForm"]) == "true": group.radicalForm = True
if attrs.has_key("position"): group.position = unicode(attrs["position"])
if attrs.has_key("radical"): group.radical = unicode(attrs["radical"])
if attrs.has_key("phon"): group.phon = unicode(attrs["phon"])

self.groups.append(group)
if "kvg:element" in attrs: group.element = unicode(attrs["kvg:element"])
if "kvg:variant" in attrs: group.variant = str(attrs["kvg:variant"])
if "kvg:partial" in attrs: group.partial = str(attrs["kvg:partial"])
if "kvg:original" in attrs: group.original = unicode(attrs["kvg:original"])
if "kvg:part" in attrs: group.part = int(attrs["kvg:part"])
if "kvg:number" in attrs: group.number = int(attrs["kvg:number"])
if "kvg:tradForm" in attrs and str(attrs["kvg:tradForm"]) == "true": group.tradForm = True
if "kvg:radicalForm" in attrs and str(attrs["kvg:radicalForm"]) == "true": group.radicalForm = True
if "kvg:position" in attrs: group.position = unicode(attrs["kvg:position"])
if "kvg:radical" in attrs: group.radical = unicode(attrs["kvg:radical"])
if "kvg:phon" in attrs: group.phon = unicode(attrs["kvg:phon"])

self.group = group

if group.element: self.metComponents.add(group.element)
if group.original: self.metComponents.add(group.original)
Expand All @@ -266,40 +304,39 @@ def handle_start_strokegr(self, attrs):
if not group.part: print("%s: Number specified, but part missing" % (self.kanji.kId()))
# The group must exist already
if group.part > 1:
if not self.compCpt.has_key(group.element + str(group.number)):
if (group.element + str(group.number)) not in self.compCpt:
print("%s: Missing numbered group" % (self.kanji.kId()))
elif self.compCpt[group.element + str(group.number)] != group.part - 1:
print("%s: Incorrectly numbered group" % (self.kanji.kId()))
# The group must not exist
else:
if self.compCpt.has_key(group.element + str(group.number)):
if (group.element + str(group.number)) in self.compCpt:
print("%s: Duplicate numbered group" % (self.kanji.kId()))
self.compCpt[group.element + str(group.number)] = group.part
# No number, just a part - groups restart with part 1, otherwise must
# increase correctly
elif group.part:
# The group must exist already
if group.part > 1:
if not self.compCpt.has_key(group.element):
if group.element not in self.compCpt:
print("%s: Incorrectly started multi-part group" % (self.kanji.kId()))
elif self.compCpt[group.element] != group.part - 1:
print("%s: Incorrectly splitted multi-part group" % (self.kanji.kId()))
self.compCpt[group.element] = group.part

def handle_end_strokegr(self):
group = self.groups.pop()
if len(self.groups) == 0:
if self.kanji.strokes:
print("WARNING: overwriting root of kanji!")
self.kanji.strokes = group
def handle_end_g(self):
if self.group.parent is None:
self.groups.append(self.group)
self.group = self.group.parent

def handle_start_stroke(self, attrs):
if len(self.groups) == 0: parent = None
else: parent = self.groups[-1]
stroke = Stroke(parent)
stroke.stype = unicode(attrs["type"])
if attrs.has_key("path"): stroke.svg = unicode(attrs["path"])
self.groups[-1].childs.append(stroke)
def handle_start_path(self, attrs):
if self.kanji is None or self.group is None:
raise Exception("Stroke must be inside a kanji and group!")
stroke = Stroke(self.group)
if "kvg:type" in attrs:
stroke.stype = unicode(attrs["kvg:type"])
if "d" in attrs: stroke.svg = unicode(attrs["d"])
self.group.childs.append(stroke)

class SVGHandler(BasicHandler):
"""SVG handler for parsing final kanji files. It can handle single-kanji files or aggregation files. After parsing, the kanji are accessible through the kanjis member, indexed by their svg file name."""
Expand All @@ -311,28 +348,36 @@ def __init__(self):
self.metComponents = set()

def handle_start_g(self, attrs):
group = StrokeGr()

# Special case for handling the root
if len(self.groups) == 0:
id = hex(realord(attrs["kvg:element"]))[2:]
self.currentKanji = Kanji(id)
self.kanjis[id] = self.currentKanji
idType, idVariantStr = str(attrs["id"]).split("_")
idVariant = idVariantStr.split('-')
if idType == "kvg:StrokePaths":
pass
elif idType == "kvg:StrokeNumbers":
return
else:
raise Exception("Invalid root group id type (%s)" % (str(attrs["id"]),))
self.currentKanji = Kanji(*idVariant)
self.kanjis[self.currentKanji.code] = self.currentKanji
self.compCpt = {}
parent = None
else: parent = self.groups[-1]
else:
group.setParent(self.groups[-1])

group = StrokeGr(parent)
# Now parse group attributes
if attrs.has_key("kvg:element"): group.element = unicode(attrs["kvg:element"])
if attrs.has_key("kvg:variant"): group.variant = str(attrs["kvg:variant"])
if attrs.has_key("kvg:partial"): group.partial = str(attrs["kvg:partial"])
if attrs.has_key("kvg:original"): group.original = unicode(attrs["kvg:original"])
if attrs.has_key("kvg:part"): group.part = int(attrs["kvg:part"])
if attrs.has_key("kvg:number"): group.number = int(attrs["kvg:number"])
if attrs.has_key("kvg:tradForm") and str(attrs["kvg:tradForm"]) == "true": group.tradForm = True
if attrs.has_key("kvg:radicalForm") and str(attrs["kvg:radicalForm"]) == "true": group.radicalForm = True
if attrs.has_key("kvg:position"): group.position = unicode(attrs["kvg:position"])
if attrs.has_key("kvg:radical"): group.radical = unicode(attrs["kvg:radical"])
if attrs.has_key("kvg:phon"): group.phon = unicode(attrs["kvg:phon"])
if "kvg:element" in attrs: group.element = unicode(attrs["kvg:element"])
if "kvg:variant" in attrs: group.variant = str(attrs["kvg:variant"])
if "kvg:partial" in attrs: group.partial = str(attrs["kvg:partial"])
if "kvg:original" in attrs: group.original = unicode(attrs["kvg:original"])
if "kvg:part" in attrs: group.part = int(attrs["kvg:part"])
if "kvg:number" in attrs: group.number = int(attrs["kvg:number"])
if "kvg:tradForm" in attrs and str(attrs["kvg:tradForm"]) == "true": group.tradForm = True
if "kvg:radicalForm" in attrs and str(attrs["kvg:radicalForm"]) == "true": group.radicalForm = True
if "kvg:position" in attrs: group.position = unicode(attrs["kvg:position"])
if "kvg:radical" in attrs: group.radical = unicode(attrs["kvg:radical"])
if "kvg:phon" in attrs: group.phon = unicode(attrs["kvg:phon"])

self.groups.append(group)

Expand Down Expand Up @@ -364,9 +409,11 @@ def handle_start_g(self, attrs):
self.compCpt[group.element] = group.part

def handle_end_g(self):
if len(self.groups) == 0:
return
group = self.groups.pop()
# End of kanji?
if len(self.groups) == 0:
if len(self.groups) == 1: # index 1 - ignore root group
self.currentKanji.strokes = group
self.currentKanji = None
self.groups = []
Expand All @@ -376,6 +423,8 @@ def handle_start_path(self, attrs):
if len(self.groups) == 0: parent = None
else: parent = self.groups[-1]
stroke = Stroke(parent)
stroke.stype = unicode(attrs["kvg:type"])
if attrs.has_key("d"): stroke.svg = unicode(attrs["d"])
if "kvg:type" in attrs:
stroke.stype = unicode(attrs["kvg:type"])
if "d" in attrs:
stroke.svg = unicode(attrs["d"])
self.groups[-1].childs.append(stroke)
Loading

0 comments on commit 1344798

Please sign in to comment.