From c81f2e3cdb23b18102ad220b7f978124b00bced6 Mon Sep 17 00:00:00 2001 From: Shan Date: Wed, 22 Apr 2020 12:43:32 +0530 Subject: [PATCH 1/8] reader --- text/utils/tokenization.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/text/utils/tokenization.py b/text/utils/tokenization.py index e0369ac..5342b83 100644 --- a/text/utils/tokenization.py +++ b/text/utils/tokenization.py @@ -34,14 +34,19 @@ def load_vocab(vocab_file): """Loads a vocabulary file into a dictionary.""" vocab = collections.OrderedDict() index = 0 - with open_reader(vocab_file) as reader: - while True: - token = reader.readline() - if not token: - break - token = token.strip() - vocab[token] = index - index += 1 + if six.PY2: + reader = open_reader(vocab_file) + else: + reader = tf.gfile.GFile(vocab_file, "r") + + while True: + token = reader.readline() + if not token: + break + token = token.strip() + vocab[token] = index + index += 1 + reader.close() return vocab From e7b5fb3e7863c09457f290790f05fdbdc98d68fa Mon Sep 17 00:00:00 2001 From: Shan Date: Wed, 22 Apr 2020 12:45:01 +0530 Subject: [PATCH 2/8] unicode --- text/utils/tokenization.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/text/utils/tokenization.py b/text/utils/tokenization.py index 5342b83..3450331 100644 --- a/text/utils/tokenization.py +++ b/text/utils/tokenization.py @@ -270,11 +270,12 @@ def _is_punctuation(char): def _convert_to_unicode_or_throw(text): """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" - if isinstance(text, str): - text = text.decode("utf-8", "ignore") - if not isinstance(text, unicode): - raise ValueError("`text` must be of type `unicode` or `str`, but is " - "actually of type: %s" % (type(text).__name__)) + if six.PY2: + if isinstance(text, str): + text = text.decode("utf-8", "ignore") + if not isinstance(text, unicode): + raise ValueError("`text` must be of type `unicode` or `str`, but is " + "actually of type: %s" % (type(text).__name__)) return text From 6da3f328bd8aab9ab5c7d2476144b9290340c42b Mon Sep 17 00:00:00 2001 From: Shan Date: Wed, 22 Apr 2020 12:46:35 +0530 Subject: [PATCH 3/8] preprocesspy unicode --- text/preprocess.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/text/preprocess.py b/text/preprocess.py index ef5aab7..2f10bb0 100644 --- a/text/preprocess.py +++ b/text/preprocess.py @@ -21,6 +21,7 @@ import copy import json import os +import six from absl import app from absl import flags @@ -266,7 +267,7 @@ def convert_examples_to_features( # st = " ".join([str(x) for x in tokens]) st = "" for x in tokens: - if isinstance(x, unicode): + if six.PY2 and isinstance(x, unicode): st += x.encode("ascii", "replace") + " " else: st += str(x) + " " From f9e1cfe7409f7d5cf51efa365233153a241d4f8b Mon Sep 17 00:00:00 2001 From: Shan Date: Wed, 22 Apr 2020 12:51:11 +0530 Subject: [PATCH 4/8] logging, run on tpu --- text/augmentation/sent_level_augment.py | 3 ++- text/scripts/run_base_uda.sh | 17 +++++++++-------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/text/augmentation/sent_level_augment.py b/text/augmentation/sent_level_augment.py index 44f8827..e1cfb9c 100644 --- a/text/augmentation/sent_level_augment.py +++ b/text/augmentation/sent_level_augment.py @@ -22,6 +22,7 @@ import math import random +import six from absl import flags import numpy as np @@ -117,7 +118,7 @@ def back_translation(examples, aug_ops, sub_set, aug_copy_num, text_b=text_b, label=ori_example.label) aug_examples += [example] - if np.random.random() < 0.0001: + if six.PY2 and np.random.random() < 0.0001: tf.logging.info("\tori:\n\t\t{:s}\n\t\t{:s}\n\t\t{:s}\n".format( ori_example.text_a, ori_example.text_b, ori_example.label)) tf.logging.info("\tnew:\n\t\t{:s}\n\t\t{:s}\n\t\t{:s}\n".format( diff --git a/text/scripts/run_base_uda.sh b/text/scripts/run_base_uda.sh index 8bd621a..9e716bb 100644 --- a/text/scripts/run_base_uda.sh +++ b/text/scripts/run_base_uda.sh @@ -12,18 +12,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +gsutil -m rsync -r . gs://bewgle-data/UDA-py3/ python main.py \ - --use_tpu=False \ + --tpu_name= \ --do_train=True \ --do_eval=True \ - --sup_train_data_dir=data/proc_data/IMDB/train_20 \ - --unsup_data_dir=data/proc_data/IMDB/unsup \ - --eval_data_dir=data/proc_data/IMDB/dev \ - --bert_config_file=pretrained_models/bert_base/bert_config.json \ - --vocab_file=pretrained_models/bert_base/vocab.txt \ - --init_checkpoint=pretrained_models/bert_base/bert_model.ckpt \ + --sup_train_data_dir=gs://bewgle-data/UDA-py3/data/proc_data/IMDB/train_20 \ + --unsup_data_dir=gs://bewgle-data/UDA-py3/data/proc_data/IMDB/unsup \ + --eval_data_dir=gs://bewgle-data/UDA-py3/data/proc_data/IMDB/dev \ + --bert_config_file=gs://bewgle-data/UDA-py3/pretrained_models/bert_base/bert_config.json \ + --vocab_file=gs://bewgle-data/UDA-py3/pretrained_models/bert_base/vocab.txt \ + --init_checkpoint=gs://bewgle-data/UDA-py3/pretrained_models/bert_base/bert_model.ckpt \ --task_name=IMDB \ - --model_dir=ckpt/base_uda \ + --model_dir=gs://bewgle-data/UDA-py3/ckpt/base_uda \ --num_train_steps=10000 \ --learning_rate=2e-05 \ --num_warmup_steps=1000 \ From 398d6cbc56da45a69c8454615d95bd40d869bdba Mon Sep 17 00:00:00 2001 From: Shan Date: Wed, 22 Apr 2020 12:56:58 +0530 Subject: [PATCH 5/8] param gs --- text/scripts/run_base_uda.sh | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/text/scripts/run_base_uda.sh b/text/scripts/run_base_uda.sh index 9e716bb..ccf87d1 100644 --- a/text/scripts/run_base_uda.sh +++ b/text/scripts/run_base_uda.sh @@ -12,19 +12,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -gsutil -m rsync -r . gs://bewgle-data/UDA-py3/ +GS_DIR = gs://bewgle-data/UDA-tpu-py3 +gsutil -m rsync -r . $GS_DIR python main.py \ --tpu_name= \ --do_train=True \ --do_eval=True \ - --sup_train_data_dir=gs://bewgle-data/UDA-py3/data/proc_data/IMDB/train_20 \ - --unsup_data_dir=gs://bewgle-data/UDA-py3/data/proc_data/IMDB/unsup \ - --eval_data_dir=gs://bewgle-data/UDA-py3/data/proc_data/IMDB/dev \ - --bert_config_file=gs://bewgle-data/UDA-py3/pretrained_models/bert_base/bert_config.json \ - --vocab_file=gs://bewgle-data/UDA-py3/pretrained_models/bert_base/vocab.txt \ - --init_checkpoint=gs://bewgle-data/UDA-py3/pretrained_models/bert_base/bert_model.ckpt \ + --sup_train_data_dir=$GS_DIR/data/proc_data/IMDB/train_20 \ + --unsup_data_dir=$GS_DIR/data/proc_data/IMDB/unsup \ + --eval_data_dir=$GS_DIR/data/proc_data/IMDB/dev \ + --bert_config_file=$GS_DIR/pretrained_models/bert_base/bert_config.json \ + --vocab_file=$GS_DIR/pretrained_models/bert_base/vocab.txt \ + --init_checkpoint=$GS_DIR/pretrained_models/bert_base/bert_model.ckpt \ --task_name=IMDB \ - --model_dir=gs://bewgle-data/UDA-py3/ckpt/base_uda \ + --model_dir=$GS_DIR/ckpt/base_uda \ --num_train_steps=10000 \ --learning_rate=2e-05 \ --num_warmup_steps=1000 \ From 37d8100aa39b3ade1864d8ad9d2bc05198f53b46 Mon Sep 17 00:00:00 2001 From: Shan Date: Wed, 22 Apr 2020 13:21:45 +0530 Subject: [PATCH 6/8] long --- text/bert/modeling.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/text/bert/modeling.py b/text/bert/modeling.py index b0fa8bb..2199e75 100644 --- a/text/bert/modeling.py +++ b/text/bert/modeling.py @@ -964,7 +964,9 @@ def assert_rank(tensor, expected_rank, name=None): name = tensor.name expected_rank_dict = {} - if isinstance(expected_rank, (int, long)): + if six.PY2 and isinstance(expected_rank, (int, long)): + expected_rank_dict[expected_rank] = True + elif six.PY3 and isinstance(expected_rank, int): expected_rank_dict[expected_rank] = True else: for x in expected_rank: From 2e5f8ac10b917838f79f1a3bde4e4c15f4ed8e09 Mon Sep 17 00:00:00 2001 From: Shan Date: Wed, 22 Apr 2020 13:33:50 +0530 Subject: [PATCH 7/8] modeling --- text/bert/modeling.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/text/bert/modeling.py b/text/bert/modeling.py index 2199e75..d8553de 100644 --- a/text/bert/modeling.py +++ b/text/bert/modeling.py @@ -312,7 +312,10 @@ def get_activation(activation_string): # We assume that anything that's not a string is already an activation # function, so we just return it. - if not isinstance(activation_string, (str, unicode)): + + if six.PY2 and not isinstance(activation_string, (str, unicode)): + return activation_string + elif six.PY3 and not isinstance(activation_string, str): return activation_string if not activation_string: From bd5c67a69601f5b63e8fce15307e9ecfa254bb20 Mon Sep 17 00:00:00 2001 From: Shan Date: Wed, 22 Apr 2020 13:56:06 +0530 Subject: [PATCH 8/8] revert --- text/scripts/run_base_uda.sh | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/text/scripts/run_base_uda.sh b/text/scripts/run_base_uda.sh index ccf87d1..8bd621a 100644 --- a/text/scripts/run_base_uda.sh +++ b/text/scripts/run_base_uda.sh @@ -12,20 +12,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -GS_DIR = gs://bewgle-data/UDA-tpu-py3 -gsutil -m rsync -r . $GS_DIR python main.py \ - --tpu_name= \ + --use_tpu=False \ --do_train=True \ --do_eval=True \ - --sup_train_data_dir=$GS_DIR/data/proc_data/IMDB/train_20 \ - --unsup_data_dir=$GS_DIR/data/proc_data/IMDB/unsup \ - --eval_data_dir=$GS_DIR/data/proc_data/IMDB/dev \ - --bert_config_file=$GS_DIR/pretrained_models/bert_base/bert_config.json \ - --vocab_file=$GS_DIR/pretrained_models/bert_base/vocab.txt \ - --init_checkpoint=$GS_DIR/pretrained_models/bert_base/bert_model.ckpt \ + --sup_train_data_dir=data/proc_data/IMDB/train_20 \ + --unsup_data_dir=data/proc_data/IMDB/unsup \ + --eval_data_dir=data/proc_data/IMDB/dev \ + --bert_config_file=pretrained_models/bert_base/bert_config.json \ + --vocab_file=pretrained_models/bert_base/vocab.txt \ + --init_checkpoint=pretrained_models/bert_base/bert_model.ckpt \ --task_name=IMDB \ - --model_dir=$GS_DIR/ckpt/base_uda \ + --model_dir=ckpt/base_uda \ --num_train_steps=10000 \ --learning_rate=2e-05 \ --num_warmup_steps=1000 \