-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtrain.sh
80 lines (54 loc) · 2.21 KB
/
train.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# Fine-tune an existing Tesseract LSTM model.
#
# Steps performed by the active code below:
#   1. Unpack ${MODEL_DIR}/${START_MODEL}.traineddata with combine_tessdata -u.
#   2. Continue training from the unpacked .lstm file with lstmtraining,
#      writing output under ${OUTPUT_DIR}/${OUTPUT_NAME}.
#
# The training-data generation step is kept as a disabled (commented-out)
# template; it must have produced ${TRAIN_DIR}/${LANG_CODE}.training_files.txt
# before this script is run.
#
# Requires combine_tessdata and lstmtraining on PATH.

# Stop at the first failing command or unset variable, so lstmtraining never
# starts after a failed extraction and a mistyped variable name cannot
# silently expand to an empty string.
set -eu

# Tesseract language code. Deliberately NOT named LANG: LANG is the POSIX
# locale variable (usually exported), so assigning "jpn" to it would hand an
# invalid locale to every child process started by this script.
LANG_CODE="jpn"
MAX_PAGES=0 # 0 for all
NUM_ITERATIONS=5000000
LANG_DATA_DIR="tesseract/langdata_lstm"
TESSDATA_DIR="tesseract/tessdata"
MODEL_DIR="model"
OUTPUT_DIR="output"
TRAIN_DIR="data_lstmf"
TRAINING_TEXT="untitled.txt"
# TRAINING_TEXT="${LANG_DATA_DIR}/${LANG_CODE}/${LANG_CODE}.training_text"
START_MODEL="jpn_ver5"
OUTPUT_NAME="jpn_vert_new"

# ---------------------------------------------------------------------------
# Disabled: regenerate the training data.
# ---------------------------------------------------------------------------
# Remove the previously generated training data.
# (":?" aborts instead of expanding to "/*" if TRAIN_DIR is ever empty/unset.)
# rm -rf -- "${TRAIN_DIR:?}"/*
# Normalize the training text
# python3 ./tesseract/tesstrain/normalize.py -v "${TRAINING_TEXT}"
# Generate training data:
# make text image tiff and box, then convert it to lstmf
# python tesstrain.py \
# --lang "$LANG_CODE" \
# --fonts_dir fonts \
# --fontlist "DotGothic16" "Hachi Maru Pop" "Kaisei Opti" "Kosugi Maru" "M PLUS 1p" "Noto Serif JP" "Rampart One" "Reggae One" "Train One" "Yuji Boku" "Yuji Syuku" "Zen Antique" "Zen Kaku Gothic Antique" "Zen Kurenaido" "Zen Maru Gothic" "Zen Old Mincho" \
# --langdata_dir "$LANG_DATA_DIR" \
# --tessdata_dir "$TESSDATA_DIR" \
# --output_dir "$TRAIN_DIR" \
# --maxpages "$MAX_PAGES" \
# --distort_image \
# --linedata_only \
# --ptsize 24 \
# --noextract_font_properties \
# --exposures 0 \
# --writing_mode_vert \
# --training_text "$TRAINING_TEXT" \
# # --save_box_tiff

# ---------------------------------------------------------------------------
# Extract the trained model's components next to the .traineddata file.
# ---------------------------------------------------------------------------
combine_tessdata -u \
  "${MODEL_DIR}/${START_MODEL}.traineddata" \
  "${MODEL_DIR}/${START_MODEL}"

# ---------------------------------------------------------------------------
# Fine-tune the model.
# OMP_THREAD_LIMIT=1 is set for this command only; per the original author's
# note, OpenMP otherwise burns CPU time without any training benefit.
# ---------------------------------------------------------------------------
# rm -rf -- "${OUTPUT_DIR:?}"/*
OMP_THREAD_LIMIT=1 lstmtraining \
  --continue_from "${MODEL_DIR}/${START_MODEL}.lstm" \
  --model_output "${OUTPUT_DIR}/${OUTPUT_NAME}" \
  --traineddata "${MODEL_DIR}/${START_MODEL}.traineddata" \
  --train_listfile "${TRAIN_DIR}/${LANG_CODE}.training_files.txt" \
  --max_iterations "${NUM_ITERATIONS}" \
  --old_traineddata "${MODEL_DIR}/${START_MODEL}.traineddata" \
  --target_error_rate 1.0
# Optional flags (disabled). The command above intentionally ends without a
# trailing backslash; to enable a flag, add " \" after the last active flag
# and move the flag up into the command.
# --max_image_MB 12000 \
# --debug_interval -1 \
# --append_index 5 --net_spec '[Lfx128 O1c1]' \
# Disabled: stop training and combine the checkpoint into a .traineddata file
# lstmtraining --stop_training \
# --continue_from $OUTPUT_DIR/$OUTPUT_NAME"_checkpoint" \
# --traineddata $MODEL_DIR/$START_MODEL".traineddata" \
# --model_output $OUTPUT_DIR/$OUTPUT_NAME".traineddata"