Skip to content

Commit 26f9778

Browse files
committed
2.0.12
1 parent b1084e1 commit 26f9778

File tree

71 files changed

+2038
-134
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

71 files changed

+2038
-134
lines changed

examples-v2/train_rna_tokenizers.py

+32
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# -*- coding: utf-8 -*-
2+
# file: train_rna_tokenizers.py
3+
# time: 2022/11/19 15:30
4+
# author: yangheng <[email protected]>
5+
# github: https://github.com/yangheng95
6+
# GScholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
7+
# ResearchGate: https://www.researchgate.net/profile/Heng-Yang-17/research
8+
# Copyright (C) 2022. All Rights Reserved.
9+
import findfile
10+
from transformers import AutoTokenizer
11+
12+
from pyabsa.utils import train_word2vec, train_bpe_tokenizer
13+
14+
if __name__ == '__main__':
15+
"""
16+
This script is used to train word2vec and bpe tokenizer for rna/protein classification/regression tasks.
17+
For example:
18+
MQFKVYTYKRESRYRLFCDVQSDIIDTPGRRMVIPLASARLLSDKVSRELYPVVHIGDESWRMMTTDMASVPVSVIGEEVADLSHRENDIKNAINLMFWGI
19+
-> Tokenize
20+
MQFK VYTYKR ESRY RLFCDV QSDIIDT PGRRM VIP LASARLLSD KVSRELYPV VHIGDESW RMMTTDM ASVPV SVIGEE VADLSH RENDI KNAIN LMFWGI
21+
-> Word2Vec Embedding
22+
[1*768] or [1*300]
23+
This is not a real protein sequence, just an example.
24+
"""
25+
paths = findfile.find_cwd_files('.txt')
26+
27+
# train bpe tokenizer for protein or rna sequence
28+
train_bpe_tokenizer(paths, save_path='bpe_tokenizer', base_tokenizer='roberta-base')
29+
30+
# then you can use the bpe_tokenizer to train a protein or rna sequence word2vec embedding
31+
pre_tokenizer = AutoTokenizer.from_pretrained('bpe_tokenizer')
32+
train_word2vec(paths, save_path='word2vec', pre_tokenizer=pre_tokenizer)

examples-v2/train_tokenizers.py

-21
This file was deleted.

pyabsa/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
# Copyright (C) 2021. All Rights Reserved.
88

99
__name__ = 'pyabsa'
10-
__version__ = '2.0.11'
10+
__version__ = '2.0.12'
1111

1212
from pyabsa.framework.flag_class import *
1313

pyabsa/framework/flag_class/flag_template.py

+2
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ class TaskNameOption(dict):
2323
'TAD': 'Text Adversarial Defense',
2424
'RNAC': 'RNA Sequence Classification',
2525
'RNAR': 'RNA Sequence Regression',
26+
'PR': 'Protein Sequence Regression',
2627
}
2728

2829
def __init__(self):
@@ -40,6 +41,7 @@ class TaskCodeOption:
4041
Text_Adversarial_Defense = 'TAD'
4142
RNASequenceClassification = 'RNAC'
4243
RNASequenceRegression = 'RNAR'
44+
ProteinSequenceRegression = 'PR'
4345

4446

4547
class LabelPaddingOption:

pyabsa/framework/instructor_class/instructor_template.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,9 @@ def _prepare_dataloader(self):
164164
DataLoader(dataset=val_set, batch_size=self.config.batch_size, sampler=val_sampler))
165165

166166
def _prepare_env(self):
167+
self.config.tokenizer = self.tokenizer
168+
self.config.embedding_matrix = self.embedding_matrix
169+
167170
if os.path.exists('init_state_dict.bin'):
168171
os.remove('init_state_dict.bin')
169172
if self.config.cross_validate_fold > 0:
@@ -180,8 +183,6 @@ def _prepare_env(self):
180183
if self.config.device.type == 'cuda':
181184
self.logger.info("cuda memory allocated:{}".format(torch.cuda.memory_allocated(device=self.config.device)))
182185

183-
self.config.tokenizer = self.tokenizer
184-
self.config.embedding_matrix = self.embedding_matrix
185186

186187
print_args(self.config, self.logger)
187188

pyabsa/networks/bert_mean_pooler.py

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# -*- coding: utf-8 -*-
2+
# file: bert_mean_pooler.py
3+
# time: 2022/11/24 17:46
4+
# author: yangheng <[email protected]>
5+
# github: https://github.com/yangheng95
6+
# GScholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
7+
# ResearchGate: https://www.researchgate.net/profile/Heng-Yang-17/research
8+
# Copyright (C) 2022. All Rights Reserved.
9+
import torch
10+
from torch import nn
11+
12+
13+
class BERTMeanPooler(nn.Module):
14+
def __init__(self, ):
15+
super(BERTMeanPooler, self).__init__()
16+
17+
def forward(self, model_output, attention_mask):
18+
19+
token_embeddings = model_output
20+
input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
21+
sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
22+
sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
23+
return sum_embeddings / sum_mask

pyabsa/networks/losses/MAELoss.py

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# -*- coding: utf-8 -*-
2+
# file: MAELoss.py
3+
# time: 2022/11/24 20:11
4+
# author: yangheng <[email protected]>
5+
# github: https://github.com/yangheng95
6+
# GScholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
7+
# ResearchGate: https://www.researchgate.net/profile/Heng-Yang-17/research
8+
# Copyright (C) 2022. All Rights Reserved.
9+
import torch
10+
from torch import nn
11+
12+
13+
class MAELoss(nn.Module):
14+
15+
def __init__(self):
16+
super(MAELoss, self).__init__()
17+
18+
def forward(self, y_pred, y_true):
19+
return torch.mean(torch.abs(y_pred - y_true))

pyabsa/networks/losses/R2Loss.py

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# -*- coding: utf-8 -*-
2+
# file: R2Loss.py
3+
# time: 2022/11/24 20:06
4+
# author: yangheng <[email protected]>
5+
# github: https://github.com/yangheng95
6+
# GScholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
7+
# ResearchGate: https://www.researchgate.net/profile/Heng-Yang-17/research
8+
# Copyright (C) 2022. All Rights Reserved.
9+
import torch
10+
from torch import nn
11+
12+
13+
class R2Loss(nn.Module):
14+
def __init__(self):
15+
super(R2Loss, self).__init__()
16+
self.mse = nn.MSELoss()
17+
18+
def forward(self, y_pred, y_true):
19+
y_true_mean = torch.mean(y_true, dim=0)
20+
ss_tot = torch.sum((y_true - y_true_mean) ** 2, dim=0)
21+
ss_res = torch.sum((y_true - y_pred) ** 2, dim=0)
22+
r2 = 1 - ss_res / ss_tot
23+
return 1 - torch.mean(r2)

pyabsa/networks/losses/RMSELoss.py

+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# -*- coding: utf-8 -*-
2+
# file: RMSELoss.py
3+
# time: 2022/11/24 20:10
4+
# author: yangheng <[email protected]>
5+
# github: https://github.com/yangheng95
6+
# GScholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
7+
# ResearchGate: https://www.researchgate.net/profile/Heng-Yang-17/research
8+
# Copyright (C) 2022. All Rights Reserved.
9+
import torch
10+
from torch import nn
11+
12+
13+
class RMSELoss(nn.Module):
14+
def __init__(self):
15+
super(RMSELoss, self).__init__()
16+
17+
def forward(self, y_pred, y_true):
18+
return torch.sqrt(nn.MSELoss()(y_pred, y_true))

pyabsa/networks/losses/__init__.py

+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# -*- coding: utf-8 -*-
2+
# file: __init__.py
3+
# time: 2022/11/24 20:05
4+
# author: yangheng <[email protected]>
5+
# github: https://github.com/yangheng95
6+
# GScholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
7+
# ResearchGate: https://www.researchgate.net/profile/Heng-Yang-17/research
8+
# Copyright (C) 2022. All Rights Reserved.

pyabsa/tasks/AspectPolarityClassification/dataset_utils/__plm__/data_utils_for_training.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@
1111
from termcolor import colored
1212

1313
from pyabsa.framework.dataset_class.dataset_template import PyABSADataset
14-
from pyabsa.tasks.AspectPolarityClassification.dataset_utils.__plm__.classic_bert_apc_utils import prepare_input_for_apc, build_sentiment_window
15-
from pyabsa.tasks.AspectPolarityClassification.dataset_utils.__plm__.dependency_graph import configure_spacy_model, prepare_dependency_graph
14+
from ...dataset_utils.__plm__.classic_bert_apc_utils import prepare_input_for_apc, build_sentiment_window
15+
from ...dataset_utils.__plm__.dependency_graph import configure_spacy_model, prepare_dependency_graph
1616
from pyabsa.utils.file_utils.file_utils import load_dataset_from_file
1717
from pyabsa.utils.pyabsa_utils import check_and_fix_labels, validate_example
1818

pyabsa/tasks/AspectPolarityClassification/instructor/apc_instructor.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919

2020
from pyabsa import DeviceTypeOption
2121
from pyabsa.framework.instructor_class.instructor_template import BaseTrainingInstructor
22-
from pyabsa.tasks.AspectPolarityClassification.instructor.ensembler import APCEnsembler
22+
from ..instructor.ensembler import APCEnsembler
2323
from pyabsa.utils.file_utils.file_utils import save_model
2424
from pyabsa.utils.pyabsa_utils import print_args, init_optimizer
2525

@@ -480,15 +480,18 @@ def _init_misc(self):
480480
{'params': self.model.models[0].eta2, 'lr': self.config.eta_lr, 'weight_decay': self.config.l2reg}
481481
],
482482
lr=self.config.learning_rate,
483-
weight_decay=self.config.l2reg
483+
weight_decay=self.config.l2reg,
484+
maximize=self.config.maximize_loss if self.config.get('maximize_loss') else False
484485
)
485486
else:
486487
self.optimizer = init_optimizer(self.config.optimizer)(
487488
self.model.parameters(),
488489
lr=self.config.learning_rate,
489-
weight_decay=self.config.l2reg
490+
weight_decay=self.config.l2reg,
491+
maximize=self.config.maximize_loss if self.config.get('maximize_loss') else False
490492
)
491493

494+
492495
def _cache_or_load_dataset(self):
493496
pass
494497

pyabsa/tasks/AspectPolarityClassification/instructor/ensembler.py

+8-7
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,12 @@
1919
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
2020
from transformers import AutoTokenizer, AutoModel
2121

22-
from pyabsa.tasks.AspectPolarityClassification.models.__classic__ import GloVeAPCModelList
23-
from pyabsa.tasks.AspectPolarityClassification.models.__lcf__ import APCModelList
24-
from pyabsa.tasks.AspectPolarityClassification.models.__plm__ import BERTBaselineAPCModelList
25-
from pyabsa.tasks.AspectPolarityClassification.dataset_utils.__classic__.data_utils_for_training import GloVeABSADataset
26-
from pyabsa.tasks.AspectPolarityClassification.dataset_utils.__lcf__.data_utils_for_training import ABSADataset
27-
from pyabsa.tasks.AspectPolarityClassification.dataset_utils.__plm__.data_utils_for_training import BERTBaselineABSADataset
22+
from ..models.__classic__ import GloVeAPCModelList
23+
from ..models.__lcf__ import APCModelList
24+
from ..models.__plm__ import BERTBaselineAPCModelList
25+
from ..dataset_utils.__classic__.data_utils_for_training import GloVeABSADataset
26+
from ..dataset_utils.__lcf__.data_utils_for_training import ABSADataset
27+
from ..dataset_utils.__plm__.data_utils_for_training import BERTBaselineABSADataset
2828
from pyabsa.framework.tokenizer_class.tokenizer_class import PretrainedTokenizer, Tokenizer, build_embedding_matrix
2929

3030

@@ -119,7 +119,6 @@ def __init__(self, config, load_dataset=True, **kwargs):
119119
self.valid_set = GloVeABSADataset(self.config, self.tokenizer, dataset_type='valid') if not self.valid_set else self.valid_set
120120

121121
self.models.append(models[i](copy.deepcopy(self.embedding_matrix) if self.config.deep_ensemble else self.embedding_matrix, self.config))
122-
self.config.tokenizer = self.tokenizer
123122
self.config.embedding_matrix = self.embedding_matrix
124123

125124
if self.config.cache_dataset and not os.path.exists(cache_path) and not self.config.overwrite_cache:
@@ -137,6 +136,8 @@ def __init__(self, config, load_dataset=True, **kwargs):
137136
valid_sampler = SequentialSampler(self.valid_set if not self.valid_set else self.valid_set)
138137
self.valid_dataloader = DataLoader(self.valid_set, batch_size=self.config.batch_size, pin_memory=True, sampler=valid_sampler)
139138

139+
self.config.tokenizer = self.tokenizer
140+
140141
self.dense = nn.Linear(config.output_dim * len(models), config.output_dim)
141142

142143
def forward(self, inputs):

pyabsa/tasks/AspectPolarityClassification/prediction/sentiment_classifier.py

+8-9
Original file line numberDiff line numberDiff line change
@@ -19,17 +19,16 @@
1919

2020
from pyabsa import LabelPaddingOption, TaskCodeOption
2121
from pyabsa.framework.prediction_class.predictor_template import InferenceModel
22-
from pyabsa.tasks.AspectPolarityClassification.models.__plm__ import BERTBaselineAPCModelList
23-
from pyabsa.tasks.AspectPolarityClassification.models.__classic__ import GloVeAPCModelList
24-
from pyabsa.tasks.AspectPolarityClassification.models.__lcf__ import APCModelList
25-
from pyabsa.tasks.AspectPolarityClassification.dataset_utils.__classic__.data_utils_for_inference import GloVeABSAInferenceDataset
26-
from pyabsa.tasks.AspectPolarityClassification.dataset_utils.__lcf__.data_utils_for_inference import ABSAInferenceDataset
27-
from pyabsa.tasks.AspectPolarityClassification.dataset_utils.__plm__.data_utils_for_inference import BERTABSAInferenceDataset
28-
from pyabsa.tasks.AspectPolarityClassification.instructor.ensembler import APCEnsembler
22+
from ..models.__plm__ import BERTBaselineAPCModelList
23+
from ..models.__classic__ import GloVeAPCModelList
24+
from ..models.__lcf__ import APCModelList
25+
from ..dataset_utils.__classic__.data_utils_for_inference import GloVeABSAInferenceDataset
26+
from ..dataset_utils.__lcf__.data_utils_for_inference import ABSAInferenceDataset
27+
from ..dataset_utils.__plm__.data_utils_for_inference import BERTABSAInferenceDataset
28+
from ..instructor.ensembler import APCEnsembler
2929
from pyabsa.utils.data_utils.dataset_manager import detect_infer_dataset
3030
from pyabsa.utils.pyabsa_utils import get_device, print_args
3131
from pyabsa.utils.text_utils.mlm import get_mlm_and_tokenizer
32-
from pyabsa.framework.tokenizer_class.tokenizer_class import Tokenizer
3332

3433

3534
class SentimentClassifier(InferenceModel):
@@ -41,7 +40,7 @@ def __init__(self, checkpoint=None, cal_perplexity=False, **kwargs):
4140
super().__init__(checkpoint, cal_perplexity, task_code=self.task_code, **kwargs)
4241

4342
# load from a trainer
44-
if not isinstance(self.checkpoint, str):
43+
if self.checkpoint and not isinstance(self.checkpoint, str):
4544
print('Load sentiment classifier from trainer')
4645
self.model = self.checkpoint[0]
4746
self.config = self.checkpoint[1]

pyabsa/tasks/AspectPolarityClassification/trainer/apc_trainer.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,9 @@
1111

1212
from pyabsa import DeviceTypeOption, ModelSaveOption, TaskCodeOption, TaskNameOption
1313
from pyabsa.framework.trainer_class.trainer_template import Trainer
14-
from pyabsa.tasks.AspectPolarityClassification.configuration.apc_configuration import APCConfigManager
15-
from pyabsa.tasks.AspectPolarityClassification.prediction.sentiment_classifier import SentimentClassifier
16-
from pyabsa.tasks.AspectPolarityClassification.instructor.apc_instructor import APCTrainingInstructor
14+
from ..configuration.apc_configuration import APCConfigManager
15+
from ..prediction.sentiment_classifier import SentimentClassifier
16+
from ..instructor.apc_instructor import APCTrainingInstructor
1717

1818

1919
class APCTrainer(Trainer):

pyabsa/tasks/AspectTermExtraction/configuration/atepc_configuration.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
import copy
1111

1212
from pyabsa.framework.configuration_class.configuration_template import ConfigManager
13-
from pyabsa.tasks.AspectTermExtraction.models.__lcf__.lcf_atepc import LCF_ATEPC
13+
from ..models.__lcf__.lcf_atepc import LCF_ATEPC
1414

1515
# if you find the optimal param set of some situation, e.g., some model on some datasets
1616
# please share the main use template main

pyabsa/tasks/AspectTermExtraction/dataset_utils/__lcf__/data_utils_for_inference.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
from pyabsa import LabelPaddingOption
1111
from pyabsa.tasks.AspectPolarityClassification.dataset_utils.__lcf__.apc_utils import configure_spacy_model
12-
from pyabsa.tasks.AspectTermExtraction.dataset_utils.__lcf__.atepc_utils import simple_split_text, prepare_input_for_atepc
12+
from ...dataset_utils.__lcf__.atepc_utils import simple_split_text, prepare_input_for_atepc
1313

1414

1515
class InputExample(object):

pyabsa/tasks/AspectTermExtraction/dataset_utils/__lcf__/data_utils_for_training.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
from pyabsa import LabelPaddingOption
1111
from pyabsa.tasks.AspectPolarityClassification.dataset_utils.__lcf__.apc_utils import configure_spacy_model
12-
from pyabsa.tasks.AspectTermExtraction.dataset_utils.__lcf__.atepc_utils import prepare_input_for_atepc
12+
from ...dataset_utils.__lcf__.atepc_utils import prepare_input_for_atepc
1313
from pyabsa.utils.pyabsa_utils import validate_example, check_and_fix_labels, check_and_fix_IOB_labels
1414

1515
Labels = set()

pyabsa/tasks/AspectTermExtraction/instructor/atepc_instructor.py

+7-5
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,13 @@
1919
import tqdm
2020
from seqeval.metrics import classification_report
2121
from sklearn.metrics import f1_score
22-
from termcolor import colored
2322
from torch import cuda
2423
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
2524
from transformers import AutoTokenizer, AutoModel
2625

2726
from pyabsa import DeviceTypeOption
2827
from pyabsa.framework.instructor_class.instructor_template import BaseTrainingInstructor
29-
from pyabsa.tasks.AspectTermExtraction.dataset_utils.__lcf__.data_utils_for_training import ATEPCProcessor, convert_examples_to_features
28+
from ..dataset_utils.__lcf__.data_utils_for_training import ATEPCProcessor, convert_examples_to_features
3029
from pyabsa.utils.file_utils.file_utils import save_model
3130
from pyabsa.utils.pyabsa_utils import print_args, init_optimizer
3231

@@ -468,9 +467,12 @@ def _init_misc(self):
468467
self.model.to(self.config.device)
469468

470469
if isinstance(self.config.optimizer, str):
471-
self.optimizer = init_optimizer(self.config.optimizer)(self.optimizer_grouped_parameters,
472-
lr=self.config.learning_rate,
473-
weight_decay=self.config.l2reg)
470+
self.optimizer = init_optimizer(self.config.optimizer)(
471+
self.optimizer_grouped_parameters,
472+
lr=self.config.learning_rate,
473+
weight_decay=self.config.l2reg,
474+
maximize=self.config.maximize_loss if self.config.get('maximize_loss') else False
475+
)
474476
self.config.device = torch.device(self.config.device)
475477
if self.config.device.type == 'cuda':
476478
self.logger.info(

0 commit comments

Comments
 (0)