From c7ccf228f9ea3c723e70ec00da4e10f5a9fab902 Mon Sep 17 00:00:00 2001
From: Xingjian Shi
Date: Tue, 28 Jul 2020 21:37:30 -0700
Subject: [PATCH 1/3] Update run_squad.py

---
 scripts/question_answering/run_squad.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/question_answering/run_squad.py b/scripts/question_answering/run_squad.py
index 820aec0c46..8c1f89fbea 100644
--- a/scripts/question_answering/run_squad.py
+++ b/scripts/question_answering/run_squad.py
@@ -563,7 +563,7 @@ def train(args):
                 segment_ids = sample.segment_ids.as_in_ctx(ctx) if use_segmentation else None
                 valid_length = sample.valid_length.as_in_ctx(ctx)
                 p_mask = sample.masks.as_in_ctx(ctx)
-                gt_start = sample.gt_start.as_in_ctx(ctx)
+                gt_start = sample.gt_start.as_in_ctx(ctx).astype(np.int32)
                 gt_end = sample.gt_end.as_in_ctx(ctx)
                 is_impossible = sample.is_impossible.as_in_ctx(ctx).astype(np.int32)
                 batch_idx = mx.np.arange(tokens.shape[0], dtype=np.int32, ctx=ctx)

From 168a90ad28a8a93c6a182f27e90f44a54024e1cc Mon Sep 17 00:00:00 2001
From: Xingjian Shi
Date: Tue, 28 Jul 2020 21:48:23 -0700
Subject: [PATCH 2/3] Update run_squad.py

---
 scripts/question_answering/run_squad.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/question_answering/run_squad.py b/scripts/question_answering/run_squad.py
index 8c1f89fbea..1484aeccd2 100644
--- a/scripts/question_answering/run_squad.py
+++ b/scripts/question_answering/run_squad.py
@@ -564,7 +564,7 @@ def train(args):
                 valid_length = sample.valid_length.as_in_ctx(ctx)
                 p_mask = sample.masks.as_in_ctx(ctx)
                 gt_start = sample.gt_start.as_in_ctx(ctx).astype(np.int32)
-                gt_end = sample.gt_end.as_in_ctx(ctx)
+                gt_end = sample.gt_end.as_in_ctx(ctx).astype(np.int32)
                 is_impossible = sample.is_impossible.as_in_ctx(ctx).astype(np.int32)
                 batch_idx = mx.np.arange(tokens.shape[0], dtype=np.int32, ctx=ctx)
                 p_mask = 1 - p_mask  # In the network, we use 1 --> no_mask, 0 --> mask

From cc1688e9cf81b6aa4056d9e8b6071b59b2d92d44 Mon Sep 17 00:00:00 2001
From: Xingjian Shi
Date: Tue, 28 Jul 2020 22:55:05 -0700
Subject: [PATCH 3/3] Update prepare_glue.py

---
 .../general_nlp_benchmark/prepare_glue.py     | 96 ++++++++++++-------
 1 file changed, 62 insertions(+), 34 deletions(-)

diff --git a/scripts/datasets/general_nlp_benchmark/prepare_glue.py b/scripts/datasets/general_nlp_benchmark/prepare_glue.py
index e747626db6..bbaf01cf48 100644
--- a/scripts/datasets/general_nlp_benchmark/prepare_glue.py
+++ b/scripts/datasets/general_nlp_benchmark/prepare_glue.py
@@ -68,7 +68,23 @@ def read_tsv_glue(tsv_file, num_skip=1, keep_column_names=False):
                 nrows = len(elements)
             else:
                 assert nrows == len(elements)
-    return pd.DataFrame(out, columns=column_names)
+    df = pd.DataFrame(out, columns=column_names)
+    series_l = []
+    for col_name in df.columns:
+        idx = df[col_name].first_valid_index()
+        val = df[col_name][idx]
+        if isinstance(val, str):
+            try:
+                dat = pd.to_numeric(df[col_name])
+                series_l.append(dat)
+                continue
+            except ValueError:
+                pass
+            finally:
+                pass
+        series_l.append(df[col_name])
+    new_df = pd.DataFrame({name: series for name, series in zip(df.columns, series_l)})
+    return new_df
 
 
 def read_jsonl_superglue(jsonl_file):
@@ -157,6 +173,13 @@ def read_sts(dir_path):
         else:
             df = df[[7, 8, 1, 9]]
             df.columns = ['sentence1', 'sentence2', 'genre', 'score']
+        genre_l = []
+        for ele in df['genre'].tolist():
+            if ele == 'main-forum':
+                genre_l.append('main-forums')
+            else:
+                genre_l.append(ele)
+        df['genre'] = pd.Series(genre_l)
         df_dict[fold] = df
     return df_dict, None
 
@@ -320,8 +343,8 @@ def read_rte_superglue(dir_path):
 def read_wic(dir_path):
     df_dict = dict()
     meta_data = dict()
-    meta_data['entities1'] = {'type': 'entity', 'parent': 'sentence1'}
-    meta_data['entities2'] = {'type': 'entity', 'parent': 'sentence2'}
+    meta_data['entities1'] = {'type': 'entity', 'attrs': {'parent': 'sentence1'}}
+    meta_data['entities2'] = {'type': 'entity', 'attrs': {'parent': 'sentence2'}}
 
     for fold in ['train', 'val', 'test']:
         if fold != 'test':
@@ -340,13 +363,13 @@ def read_wic(dir_path):
             end2 = row['end2']
            if fold == 'test':
                 out.append([sentence1, sentence2,
-                            (start1, end1),
-                            (start2, end2)])
+                            {'start': start1, 'end': end1},
+                            {'start': start2, 'end': end2}])
             else:
                 label = row['label']
                 out.append([sentence1, sentence2,
-                            (start1, end1),
-                            (start2, end2),
+                            {'start': start1, 'end': end1},
+                            {'start': start2, 'end': end2},
                             label])
         df = pd.DataFrame(out, columns=columns)
         df_dict[fold] = df
@@ -357,8 +380,8 @@ def read_wsc(dir_path):
     df_dict = dict()
     tokenizer = WhitespaceTokenizer()
     meta_data = dict()
-    meta_data['noun'] = {'type': 'entity', 'parent': 'text'}
-    meta_data['pronoun'] = {'type': 'entity', 'parent': 'text'}
+    meta_data['noun'] = {'type': 'entity', 'attrs': {'parent': 'text'}}
+    meta_data['pronoun'] = {'type': 'entity', 'attrs': {'parent': 'text'}}
     for fold in ['train', 'val', 'test']:
         jsonl_path = os.path.join(dir_path, '{}.jsonl'.format(fold))
         df = read_jsonl_superglue(jsonl_path)
@@ -374,7 +397,7 @@ def read_wsc(dir_path):
             span2_text = target['span2_text']
             # Build entity
             # list of entities
-            # 'entity': {'start': 0, 'end': 100}
+            # 'entities': {'start': 0, 'end': 100}
             tokens, offsets = tokenizer.encode_with_offsets(text, str)
             pos_start1 = offsets[span1_index][0]
             pos_end1 = pos_start1 + len(span1_text)
@@ -382,12 +405,12 @@ def read_wsc(dir_path):
             pos_end2 = pos_start2 + len(span2_text)
             if fold == 'test':
                 samples.append({'text': text,
-                                'noun': (pos_start1, pos_end1),
-                                'pronoun': (pos_start2, pos_end2)})
+                                'noun': {'start': pos_start1, 'end': pos_end1},
+                                'pronoun': {'start': pos_start2, 'end': pos_end2}})
             else:
                 samples.append({'text': text,
-                                'noun': (pos_start1, pos_end1),
-                                'pronoun': (pos_start2, pos_end2),
+                                'noun': {'start': pos_start1, 'end': pos_end1},
+                                'pronoun': {'start': pos_start2, 'end': pos_end2},
                                 'label': label})
         df = pd.DataFrame(samples)
         df_dict[fold] = df
@@ -406,8 +429,8 @@ def read_boolq(dir_path):
 def read_record(dir_path):
     df_dict = dict()
     meta_data = dict()
-    meta_data['entities'] = {'type': 'entity', 'parent': 'text'}
-    meta_data['answers'] = {'type': 'entity', 'parent': 'text'}
+    meta_data['entities'] = {'type': 'entity', 'attrs': {'parent': 'text'}}
+    meta_data['answers'] = {'type': 'entity', 'attrs': {'parent': 'text'}}
     for fold in ['train', 'val', 'test']:
         if fold != 'test':
             columns = ['source', 'text', 'entities', 'query', 'answers']
@@ -422,15 +445,11 @@ def read_record(dir_path):
             passage = row['passage']
             text = passage['text']
             entities = passage['entities']
-            entities = [(ele['start'], ele['end']) for ele in entities]
+            entities = [{'start': ele['start'], 'end': ele['end']} for ele in entities]
             for qas in row['qas']:
                 query = qas['query']
                 if fold != 'test':
-                    answer_entities = []
-                    for answer in qas['answers']:
-                        start = answer['start']
-                        end = answer['end']
-                        answer_entities.append((start, end))
+                    answer_entities = qas['answers']
                     out.append((source, text, entities, query, answer_entities))
                 else:
                     out.append((source, text, entities, query))
@@ -518,11 +537,15 @@ def format_mrpc(data_dir):
     os.makedirs(mrpc_dir, exist_ok=True)
     mrpc_train_file = os.path.join(mrpc_dir, "msr_paraphrase_train.txt")
     mrpc_test_file = os.path.join(mrpc_dir, "msr_paraphrase_test.txt")
-    download(GLUE_TASK2PATH["mrpc"]['train'], mrpc_train_file)
-    download(GLUE_TASK2PATH["mrpc"]['test'], mrpc_test_file)
+    download(GLUE_TASK2PATH["mrpc"]['train'], mrpc_train_file,
+             sha1_hash=_URL_FILE_STATS[GLUE_TASK2PATH["mrpc"]['train']])
+    download(GLUE_TASK2PATH["mrpc"]['test'], mrpc_test_file,
+             sha1_hash=_URL_FILE_STATS[GLUE_TASK2PATH["mrpc"]['test']])
     assert os.path.isfile(mrpc_train_file), "Train data not found at %s" % mrpc_train_file
     assert os.path.isfile(mrpc_test_file), "Test data not found at %s" % mrpc_test_file
-    download(GLUE_TASK2PATH["mrpc"]['dev'], os.path.join(mrpc_dir, "dev_ids.tsv"))
+    download(GLUE_TASK2PATH["mrpc"]['dev'],
+             os.path.join(mrpc_dir, "dev_ids.tsv"),
+             sha1_hash=_URL_FILE_STATS[GLUE_TASK2PATH["mrpc"]['dev']])
 
     dev_ids = []
     with open(os.path.join(mrpc_dir, "dev_ids.tsv"), encoding="utf8") as ids_fh:
@@ -575,7 +598,7 @@ def get_tasks(benchmark, task_names):
 @DATA_PARSER_REGISTRY.register('prepare_glue')
 def get_parser():
     parser = argparse.ArgumentParser()
-    parser.add_argument("--benchmark", choices=['glue', 'superglue', 'sts'],
+    parser.add_argument("--benchmark", choices=['glue', 'superglue'],
                         default='glue', type=str)
     parser.add_argument("-d", "--data_dir", help="directory to save data to", type=str,
                         default=None)
@@ -618,22 +641,24 @@ def main(args):
                 base_dir = os.path.join(args.data_dir, 'rte_diagnostic')
                 os.makedirs(base_dir, exist_ok=True)
                 download(TASK2PATH['diagnostic'][0],
-                         path=os.path.join(base_dir, 'diagnostic.tsv'))
+                         path=os.path.join(base_dir, 'diagnostic.tsv'),
+                         sha1_hash=_URL_FILE_STATS[TASK2PATH['diagnostic'][0]])
                 download(TASK2PATH['diagnostic'][1],
-                         path=os.path.join(base_dir, 'diagnostic-full.tsv'))
+                         path=os.path.join(base_dir, 'diagnostic-full.tsv'),
+                         sha1_hash=_URL_FILE_STATS[TASK2PATH['diagnostic'][1]])
                 df = reader(base_dir)
-                df.to_pickle(os.path.join(base_dir, 'diagnostic-full.pd.pkl'))
+                df.to_parquet(os.path.join(base_dir, 'diagnostic-full.parquet'))
             else:
                 for key, name in [('broadcoverage-diagnostic', 'AX-b'),
                                   ('winogender-diagnostic', 'AX-g')]:
                     data_file = os.path.join(args.cache_path, "{}.zip".format(key))
                     url = TASK2PATH[key]
                     reader = TASK2READER[key]
-                    download(url, data_file)
+                    download(url, data_file, sha1_hash=_URL_FILE_STATS[url])
                     with zipfile.ZipFile(data_file) as zipdata:
                         zipdata.extractall(args.data_dir)
                     df = reader(os.path.join(args.data_dir, name))
-                    df.to_pickle(os.path.join(args.data_dir, name, '{}.pd.pkl'.format(name)))
+                    df.to_parquet(os.path.join(args.data_dir, name, '{}.parquet'.format(name)))
         elif task == 'mrpc':
             reader = TASK2READER[task]
             format_mrpc(args.data_dir)
@@ -641,7 +666,7 @@ def main(args):
             for key, df in df_dict.items():
                 if key == 'val':
                     key = 'dev'
-                df.to_pickle(os.path.join(args.data_dir, 'mrpc', '{}.pd.pkl'.format(key)))
+                df.to_parquet(os.path.join(args.data_dir, 'mrpc', '{}.parquet'.format(key)))
             with open(os.path.join(args.data_dir, 'mrpc', 'metadata.json'), 'w') as f:
                 json.dump(meta_data, f)
         else:
@@ -649,8 +674,11 @@ def main(args):
            data_file = os.path.join(args.cache_path, "{}.zip".format(task))
             url = TASK2PATH[task]
             reader = TASK2READER[task]
-            download(url, data_file)
+            download(url, data_file, sha1_hash=_URL_FILE_STATS[url])
             base_dir = os.path.join(args.data_dir, task)
+            if os.path.exists(base_dir):
+                print('Found!')
+                continue
             zip_dir_name = None
             with zipfile.ZipFile(data_file) as zipdata:
                 if zip_dir_name is None:
@@ -662,7 +690,7 @@ def main(args):
             for key, df in df_dict.items():
                 if key == 'val':
                     key = 'dev'
-                df.to_pickle(os.path.join(base_dir, '{}.pd.pkl'.format(key)))
+                df.to_parquet(os.path.join(base_dir, '{}.parquet'.format(key)))
             if meta_data is not None:
                 with open(os.path.join(base_dir, 'metadata.json'), 'w') as f:
                     json.dump(meta_data, f)