
Commit

Merge pull request #2 from 8uos/develop

Add get_chars_from_ttf.py

8uos authored Apr 6, 2021
2 parents eafc156 + e3aba4c commit 096c262
Showing 10 changed files with 100 additions and 18 deletions.
1 change: 1 addition & 0 deletions data/ttfs/train/MaShanZheng-Regular.txt

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions data/ttfs/train/ZCOOLKuaiLe-Regular.txt

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions data/ttfs/val/MaShanZheng-Regular.txt

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions data/ttfs/val/ZCOOLKuaiLe-Regular.txt

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions datasets/__init__.py
@@ -8,7 +8,7 @@

from .imagefolder_dataset import ImageTestDataset
from .ttf_dataset import TTFTrainDataset, TTFValDataset
from .ttf_utils import get_available_chars, read_font, render
from .ttf_utils import get_filtered_chars, read_font, render
from torch.utils.data import DataLoader


@@ -51,4 +51,4 @@ def get_test_loader(cfg, transform, **kwargs):
return dset, loader


__all__ = ["get_trn_loader", "get_val_loader", "get_test_loader", "get_available_chars", "read_font", "render"]
__all__ = ["get_trn_loader", "get_val_loader", "get_test_loader", "get_filtered_chars", "read_font", "render"]
6 changes: 3 additions & 3 deletions datasets/imagefolder_dataset.py
@@ -11,18 +11,18 @@
import torch
from torch.utils.data import Dataset

from .ttf_utils import get_available_chars, read_font, render
from .ttf_utils import get_filtered_chars, read_font, render


class ImageTestDataset(Dataset):
def __init__(self, data_dir, source_font, gen_chars_file=None, transform=None, extension="png"):

self.data_dir = Path(data_dir)
self.source_font = read_font(source_font)
self.gen_chars = get_available_chars(source_font)
self.gen_chars = get_filtered_chars(source_font)
if gen_chars_file is not None:
gen_chars = json.load(open(gen_chars_file))
self.gen_chars = [x for x in self.gen_chars if x in gen_chars]
self.gen_chars = list(set(self.gen_chars).intersection(set(gen_chars)))

self.font_ref_chars = self.load_data_list(self.data_dir, extension)

29 changes: 22 additions & 7 deletions datasets/ttf_dataset.py
@@ -11,7 +11,7 @@
import torch
from torch.utils.data import Dataset

from .ttf_utils import get_available_chars, read_font, render
from .ttf_utils import read_font, render


class TTFTrainDataset(Dataset):
@@ -53,7 +53,7 @@ def filter_chars(self):
filtered_chars = list(char_key_dict)
key_char_dict = {}
for key, chars in self.key_char_dict.items():
key_char_dict[key] = [c for c in chars if c in filtered_chars]
key_char_dict[key] = list(set(chars).intersection(filtered_chars))

return key_char_dict, char_key_dict

@@ -122,11 +122,18 @@ class TTFValDataset(Dataset):
def __init__(self, data_dir, source_font, char_filter, n_ref=4, n_gen=20, transform=None):

self.data_dir = data_dir
self.source_font = read_font(source_font)
self.source_font = read_font(source_font) if source_font is not None else None
self.n_ref = n_ref
self.n_gen = n_gen

self.key_font_dict, self.key_char_dict = load_data_list(data_dir, char_filter=char_filter)
if self.source_font is None:
self.char_key_dict = {}
for key, charlist in self.key_char_dict.items():
for char in charlist:
self.char_key_dict.setdefault(char, []).append(key)

self.key_char_dict, self.char_key_dict = self.filter_chars()
self.ref_chars, self.gen_chars = self.sample_ref_gen_chars(self.key_char_dict)

self.gen_char_dict = {k: self.gen_chars for k in self.key_font_dict}
@@ -148,7 +155,13 @@ def __getitem__(self, index):
ref_imgs = torch.stack([self.transform(render(font, c))
for c in self.ref_chars])

source_img = self.transform(render(self.source_font, char))
if self.source_font is not None:
source_font = self.source_font
else:
source_key = random.choice(self.char_key_dict[char])
source_font = self.key_font_dict[source_key]

source_img = self.transform(render(source_font, char))
trg_img = self.transform(render(font, char))

ret = {
@@ -202,9 +215,11 @@ def load_data_list(data_dir, char_filter=None):
font = read_font(font_path)
key_font_dict[font_path.stem] = font

chars = get_available_chars(font_path)
with open(str(font_path).replace(".ttf", ".txt")) as f:
chars = f.read()

if char_filter is not None:
chars = [x for x in chars if x in char_filter]
key_char_dict[font_path.stem] = chars
chars = set(chars).intersection(char_filter)
key_char_dict[font_path.stem] = list(chars)

return key_font_dict, key_char_dict
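
For context (illustrative only, not part of the diff): after this change, `load_data_list` no longer inspects the .ttf itself for its available characters; it reads a sibling `FontName.txt` holding them as one string and optionally intersects them with `char_filter`. A minimal sketch of calling it directly, assuming `load_data_list` is importable from `datasets.ttf_dataset`; the directory and filter characters are placeholders:

```
# Sketch only; the directory and the filter characters are hypothetical.
from datasets.ttf_dataset import load_data_list

key_font_dict, key_char_dict = load_data_list(
    "data/ttfs/val",        # expects FontName.ttf + FontName.txt pairs in this directory
    char_filter="天地人",    # optional: keep only characters that also appear in this set
)
for key, chars in key_char_dict.items():
    print(key, len(chars))  # font stem -> number of usable characters
```
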
21 changes: 18 additions & 3 deletions datasets/ttf_utils.py
@@ -5,16 +5,31 @@
"""

from fontTools.ttLib import TTFont
from itertools import chain
from PIL import Image, ImageFont, ImageDraw
import numpy as np


def get_available_chars(fontfile):
def get_defined_chars(fontfile):
ttf = TTFont(fontfile)
chars = sorted(set(chain.from_iterable([chr(y) for y in x.cmap.keys()] for x in ttf["cmap"].tables)))
chars = [chr(y) for y in ttf["cmap"].tables[0].cmap.keys()]
return chars


def get_filtered_chars(fontpath):
ttf = read_font(fontpath)
defined_chars = get_defined_chars(fontpath)
avail_chars = []

for char in defined_chars:
img = np.array(render(ttf, char))
if img.mean() == 255.:
pass
else:
avail_chars.append(char.encode('utf-16', 'surrogatepass').decode('utf-16'))

return avail_chars


def read_font(fontfile, size=150):
font = ImageFont.truetype(str(fontfile), size=size)
return font
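
For context (illustrative only): `get_defined_chars` returns every character listed in the font's cmap table, while `get_filtered_chars` renders each of them and keeps only those whose glyph is not blank. A minimal usage sketch; the font path is an assumption based on the example data in this repository:

```
# Sketch only; the font path is an assumption based on the example data.
from datasets.ttf_utils import get_defined_chars, get_filtered_chars

font_path = "data/ttfs/train/MaShanZheng-Regular.ttf"
defined = get_defined_chars(font_path)    # every character in the cmap table
filtered = get_filtered_chars(font_path)  # only characters whose rendered glyph is non-blank
print(f"{len(defined)} defined, {len(filtered)} renderable")  # renders every glyph, so it can take a while
```
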
31 changes: 31 additions & 0 deletions get_chars_from_ttf.py
@@ -0,0 +1,31 @@
"""
MX-Font
Copyright (c) 2021-present NAVER Corp.
MIT license
"""

import argparse
from tqdm import tqdm
from pathlib import Path

from datasets import get_filtered_chars


def main():
parser = argparse.ArgumentParser()
parser.add_argument("--root_dir")
args = parser.parse_args()

print(args.root_dir)
ttffiles = list(Path(args.root_dir).rglob("*.ttf"))

for ttffile in tqdm(ttffiles):
filename = ttffile.stem
dirname = ttffile.parent
avail_chars = get_filtered_chars(ttffile)
with open((dirname / (filename+".txt")), "w") as f:
f.write("".join(avail_chars))


if __name__ == "__main__":
main()
23 changes: 20 additions & 3 deletions readme.md
@@ -34,14 +34,29 @@ conda install numpy scipy scikit-image tqdm jsonlib-python3 fonttools


# Usage
The examples of datasets are in *(./data)*

Note that we only provide example font files, not the font files used to train the provided weights *(generator.pth)*.
The example font files are downloaded from [here](https://www.freechinesefont.com/tag/commercial-use-ok/).

## Training
### Preparing font files (.ttf)
## Preparing Data
* Example datasets are in *(./data)*.

### Font files (.ttf)
* Prepare the TrueType font files (.ttf) to use for training and validation.
* Put the training font files and validation font files into separate directories.

### The text files containing the available characters of .ttf files (.txt)
* If you already have the list of available characters for a .ttf file, save it to a text file (.txt) with the same name, in the same directory as the .ttf file. The text file is simply the available characters concatenated into a single string (a short inspection sketch follows this subsection).
* (example) **TTF file**: data/ttfs/train/MaShanZheng-Regular.ttf, **its available characters**: data/ttfs/train/MaShanZheng-Regular.txt
* You can also generate these character files automatically with `get_chars_from_ttf.py`:
```
# Generating the available characters file
python get_chars_from_ttf.py --root_dir path/to/ttf/dir
```
* `--root_dir`: the root directory to search for .ttf files. All .ttf files under this directory and its subdirectories are processed.
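
For illustration only (the path assumes the example fonts shipped with this repository), a generated .txt file is just the usable characters written as a single string, so it can be inspected like this:

```
# Sketch only; the path assumes the example data shipped with the repository.
with open("data/ttfs/train/MaShanZheng-Regular.txt", encoding="utf-8") as f:
    chars = f.read()
print(len(chars), chars[:10])  # number of usable characters and a small sample
```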

### The json files with decomposition information (.json)
* The files for the decomposition information are needed.
* The files for the Chinese characters are provided. (data/chn_decomposition.json, data/primals.json)
* If you want to train the model with a language other than Chinese, the files for the decomposition rule (see below) are also needed.
@@ -55,6 +70,8 @@ The example font files are downloaded from [here](https://www.freechinesefont.co
* example: ['亠', '厶', '川', '囗', '口']


## Training

### Modify the configuration file (cfgs/train.yaml)

```
