remove load state as np. (PaddlePaddle#7120)
ZHUI authored Sep 26, 2023
1 parent c1157e5 commit 0b73e12
Showing 23 changed files with 12 additions and 38 deletions.
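The change is mechanical across the files below: each `from_pretrained` call drops its `load_state_as_np` (and, where present, `low_cpu_mem_usage`) argument, and the README entry and unit test covering the flag are removed. A minimal before/after sketch of the pattern; the checkpoint name is an assumed example, not one taken from the diff.

# Sketch of the pattern this commit applies; checkpoint name is an assumed example.
from paddlenlp.transformers import AutoModelForCausalLM, AutoTokenizer

model_name_or_path = "facebook/llama-7b"  # assumed example checkpoint

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

# Before this commit: extra flags requested numpy-backed / low-memory weight loading.
# model = AutoModelForCausalLM.from_pretrained(
#     model_name_or_path, dtype="float16", load_state_as_np=True, low_cpu_mem_usage=True
# )

# After this commit: the flags are gone and weight loading follows the library default.
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, dtype="float16")
model.eval()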
2 changes: 1 addition & 1 deletion examples/benchmark/ceval/model_evaluator.py
@@ -30,7 +30,7 @@ def __init__(self, choices, k, model_name_or_path, temperature=0.2):
super().__init__(choices, model_name_or_path, k)
self.model_name_or_path = model_name_or_path
self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
self.model = AutoModelForCausalLM.from_pretrained(model_name_or_path, dtype="float16", low_cpu_mem_usage=True)
self.model = AutoModelForCausalLM.from_pretrained(model_name_or_path, dtype="float16")
self.model.eval()
self.generation_config = dict(
temperature=temperature,
2 changes: 0 additions & 2 deletions examples/benchmark/peft/paddle/benchmark.py
@@ -92,7 +92,6 @@ def main():
if model_args.model_name_or_path in ["gpt3-6.7B-en", "gpt3-13B-en"]:
model = GPTForCausalLM.from_pretrained(
model_args.model_name_or_path,
low_cpu_mem_usage=True,
use_flash_attention=model_args.use_flash_attention,
dtype=dtype,
tensor_parallel_degree=training_args.tensor_parallel_degree,
@@ -104,7 +103,6 @@
else:
model = AutoModelForCausalLM.from_pretrained(
model_args.model_name_or_path,
low_cpu_mem_usage=True,
use_flash_attention=model_args.use_flash_attention,
dtype=dtype,
tensor_parallel_degree=training_args.tensor_parallel_degree,
2 changes: 0 additions & 2 deletions examples/benchmark/peft/paddle/inference_benchmark.py
@@ -67,8 +67,6 @@ def predict_forward(model, inputs):
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
model = AutoModelForCausalLM.from_pretrained(
args.model_name_or_path,
load_state_as_np=True,
low_cpu_mem_usage=True,
)
if model.base_model_prefix == "llama":
tokenizer.pad_token = tokenizer.unk_token
1 change: 0 additions & 1 deletion examples/code_generation/codegen/README.md
@@ -119,7 +119,6 @@ python codegen_server.py
- `min_length`: minimum length of the generated text, defaults to 0
- `max_length`: maximum length of the generated text, defaults to 16
- `decode_strategy`: decoding strategy, defaults to "greedy_search"
- `load_state_as_np`: load model parameters as numpy arrays to save GPU memory, defaults to True
- `use_fast`: whether to use FastGeneration to speed up inference, defaults to True
- `use_fp16_decoding`: whether to decode in fp16 to save GPU memory and speed up inference, defaults to True

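For orientation, a hedged sketch of how the generation options documented above are typically passed to `generate()` in PaddleNLP; this is standalone illustration with an assumed small CodeGen checkpoint, not code from codegen_server.py, and after this commit no `load_state_as_np` is passed when loading the model.

# Hedged illustration only: assumed checkpoint, standalone usage rather than the FastAPI server.
from paddlenlp.transformers import CodeGenForCausalLM, CodeGenTokenizer

model_name = "Salesforce/codegen-350M-mono"  # assumed example checkpoint
tokenizer = CodeGenTokenizer.from_pretrained(model_name)
model = CodeGenForCausalLM.from_pretrained(model_name)  # no load_state_as_np after this commit
model.eval()

inputs = tokenizer(["def hello_world():"], return_tensors="pd")
output_ids, _ = model.generate(
    inputs["input_ids"],
    min_length=0,                     # documented default
    max_length=16,                    # documented default
    decode_strategy="greedy_search",  # documented default
)
print(tokenizer.decode(output_ids[0].numpy().tolist(), skip_special_tokens=True))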
5 changes: 1 addition & 4 deletions examples/code_generation/codegen/codegen_server.py
@@ -35,7 +35,6 @@ class DefaultConfig:
min_length = 0
max_length = 16
decode_strategy = "greedy_search"
load_state_as_np = True
use_faster = True
use_fp16_decoding = True
default_dtype = "float16" if use_faster and use_fp16_decoding else "float32"
@@ -64,9 +63,7 @@ class Output(BaseModel):
paddle.set_default_dtype(generate_config.default_dtype)

tokenizer = CodeGenTokenizer.from_pretrained(generate_config.model_name_or_path)
model = CodeGenForCausalLM.from_pretrained(
generate_config.model_name_or_path, load_state_as_np=generate_config.load_state_as_np
)
model = CodeGenForCausalLM.from_pretrained(generate_config.model_name_or_path)

app = FastAPI()

3 changes: 0 additions & 3 deletions examples/language_model/t5/tests/t5_mp.py
@@ -44,7 +44,6 @@ def main():
tensor_parallel_degree=tensor_parallel_degree,
tensor_parallel_rank=tensor_parallel_rank,
dtype="float32",
low_cpu_mem_usage=True,
)
model.eval()
loss = model(
@@ -63,7 +62,6 @@ def main():
tensor_parallel_degree=tensor_parallel_degree,
tensor_parallel_rank=tensor_parallel_rank,
dtype="float32",
low_cpu_mem_usage=True,
)
load_model.eval()
loss = load_model(
@@ -85,7 +83,6 @@ def main():
tensor_parallel_degree=tensor_parallel_degree,
tensor_parallel_rank=tensor_parallel_rank,
dtype="float32",
low_cpu_mem_usage=True,
)
load_model.eval()
loss = load_model(
2 changes: 1 addition & 1 deletion examples/text_generation/opt/demo.py
@@ -23,7 +23,7 @@ class Demo:
def __init__(self, model_name_or_path, max_predict_len=128):
self.tokenizer = GPTTokenizer.from_pretrained(model_name_or_path)
logger.info("Loading the model parameters, please wait...")
self.model = OPTForCausalLM.from_pretrained(model_name_or_path, load_state_as_np=True)
self.model = OPTForCausalLM.from_pretrained(model_name_or_path)
self.model.eval()
self.max_predict_len = max_predict_len
logger.info("Model loaded.")
2 changes: 1 addition & 1 deletion fast_generation/perf/codegen_perf.py
@@ -37,7 +37,7 @@ def perf_pd(args):
place = "gpu"
place = paddle.set_device(place)
tokenizer = CodeGenTokenizer.from_pretrained(args.model_name_or_path)
model = CodeGenForCausalLM.from_pretrained(args.model_name_or_path, load_state_as_np=True)
model = CodeGenForCausalLM.from_pretrained(args.model_name_or_path)
model.eval()
load_mem = query_by_id(args.gpu_id)

2 changes: 1 addition & 1 deletion fast_generation/perf/pegasus_perf.py
@@ -40,7 +40,7 @@ def perf_pd(args):
place = "gpu"
place = paddle.set_device(place)
tokenizer = PegasusChineseTokenizer.from_pretrained(args.model_name_or_path)
model = PegasusForConditionalGeneration.from_pretrained(args.model_name_or_path, load_state_as_np=True)
model = PegasusForConditionalGeneration.from_pretrained(args.model_name_or_path)
model.eval()
load_mem = query_by_id(args.gpu_id)
input_ids_np = [np.random.choice(range(len(tokenizer.vocab)), args.input_len) for _ in range(args.batch_size)]
2 changes: 1 addition & 1 deletion fast_generation/samples/codegen_16b_sample.py
@@ -21,7 +21,7 @@
model_name = "Salesforce/codegen-16B-mono"

tokenizer = CodeGenTokenizer.from_pretrained(model_name)
model = CodeGenForCausalLM.from_pretrained(model_name, load_state_as_np=True)
model = CodeGenForCausalLM.from_pretrained(model_name)
model.eval()

inputs = "def hello"
2 changes: 1 addition & 1 deletion fast_generation/samples/gpt_mp_sample.py
@@ -96,7 +96,7 @@ def main(args):
if args.profile:
MODEL_CLASSES[model_name][0].generate = profile(args.batch_size)(MODEL_CLASSES[model_name][0].generate)
tokenizer = MODEL_CLASSES[model_name][-1].from_pretrained(model_name)
model = MODEL_CLASSES[model_name][0].from_pretrained(model_name, load_state_as_np=True)
model = MODEL_CLASSES[model_name][0].from_pretrained(model_name)
model.eval()

# NOTE: When using prompt, open this and replace the text with what you want.
2 changes: 1 addition & 1 deletion fast_generation/samples/gptj_sample.py
@@ -20,7 +20,7 @@
model_name = "EleutherAI/gpt-j-6B"

tokenizer = GPTJTokenizer.from_pretrained(model_name)
model = GPTJForCausalLM.from_pretrained(model_name, load_state_as_np=True)
model = GPTJForCausalLM.from_pretrained(model_name)
model.eval()

inputs = "What is PaddleNLP?"
2 changes: 1 addition & 1 deletion fast_generation/samples/plato_xl_sample.py
@@ -106,7 +106,7 @@ def main(args):
if args.profile:
UnifiedTransformerLMHeadModel.generate = profile(args.batch_size)(UnifiedTransformerLMHeadModel.generate)
tokenizer = UnifiedTransformerTokenizer.from_pretrained("plato-xl")
model = UnifiedTransformerLMHeadModel.from_pretrained("plato-xl", load_state_as_np=True)
model = UnifiedTransformerLMHeadModel.from_pretrained("plato-xl")
model.eval()

history = [
1 change: 0 additions & 1 deletion llm/ernie-3.5-se/predict_generation.py
@@ -99,7 +99,6 @@ def __init__(self, args=None, tokenizer=None, model=None, **kwargs):
args.model_name_or_path,
tensor_parallel_degree=tensor_parallel_degree,
tensor_parallel_rank=tensor_parallel_rank,
load_state_as_np=True,
dtype=dtype,
use_flash_attention=use_flash_attn,
)
1 change: 0 additions & 1 deletion llm/ernie-3.5-se/run_pretrain.py
@@ -394,7 +394,6 @@ def main():
model_args.model_name_or_path,
config=config,
dtype=dtype,
load_state_as_np=True,
use_progressive_seq_len=True,
)
else:
1 change: 0 additions & 1 deletion llm/glm/finetune_generation.py
@@ -109,7 +109,6 @@ def main():
model_args.model_name_or_path,
output_predict=True,
parallel_output=True,
load_state_as_np=True,
dtype=dtype, # todo enable set dtype to avoid additional mem usage
tensor_parallel_degree=training_args.tensor_parallel_degree,
tensor_parallel_rank=training_args.tensor_parallel_rank,
2 changes: 0 additions & 2 deletions llm/glm/predict_generation.py
@@ -80,9 +80,7 @@ def __init__(self, args):
args.model_name_or_path,
tensor_parallel_degree=tensor_parallel_degree,
tensor_parallel_rank=tensor_parallel_rank,
load_state_as_np=True,
dtype=dtype,
low_cpu_mem_usage=True,
)
if self.args.lora_path is not None:
self.model = LoRAModel.from_pretrained(self.model, self.args.lora_path)
1 change: 0 additions & 1 deletion llm/gpt-3/finetune_generation.py
@@ -153,7 +153,6 @@ def main():
model_args.model_name_or_path,
config=config,
dtype=dtype,
load_state_as_np=True,
)
if model_args.lora:
if model_args.lora_path is None:
2 changes: 0 additions & 2 deletions llm/gpt-3/predict_generation.py
@@ -73,8 +73,6 @@ def __init__(self, args=None, tokenizer=None, model=None, **kwargs):

self.model = GPTForCausalLM.from_pretrained(
args.model_name_or_path,
load_state_as_np=True,
low_cpu_mem_usage=True,
dtype=dtype,
tensor_parallel_degree=tensor_parallel_degree,
tensor_parallel_rank=tensor_parallel_rank,
2 changes: 1 addition & 1 deletion model_zoo/plato-xl/infer.py
@@ -112,7 +112,7 @@ def main(args):
if args.profile:
UnifiedTransformerLMHeadModel.generate = profile(args.batch_size)(UnifiedTransformerLMHeadModel.generate)
tokenizer = UnifiedTransformerTokenizer.from_pretrained("plato-xl")
model = UnifiedTransformerLMHeadModel.from_pretrained("plato-xl", load_state_as_np=True)
model = UnifiedTransformerLMHeadModel.from_pretrained("plato-xl")
model.eval()

history = [
@@ -70,7 +70,7 @@ def do_predict(args):
paddle.set_default_dtype("float16")

model_name = "plato-xl"
model = UnifiedTransformerLMHeadModel.from_pretrained(model_name, load_state_as_np=True)
model = UnifiedTransformerLMHeadModel.from_pretrained(model_name)
tokenizer = UnifiedTransformerTokenizer.from_pretrained(model_name)

plato = FasterUnifiedTransformer(model=model, use_fp16_decoding=args.use_fp16_decoding)
1 change: 0 additions & 1 deletion paddlenlp/taskflow/text2text_generation.py
@@ -117,7 +117,6 @@ def _construct_model(self, model):
"""
model_instance = AutoModelForCausalLM.from_pretrained(
self.model,
load_state_as_np=True,
dtype=self._dtype,
)
# Load the model parameter for the predict
8 changes: 1 addition & 7 deletions tests/transformers/test_modeling_utils.py
@@ -18,7 +18,7 @@
from multiprocessing import Pool
from tempfile import TemporaryDirectory

from paddlenlp.transformers import BertModel, TinyBertModel
from paddlenlp.transformers import BertModel
from paddlenlp.utils.env import CONFIG_NAME, MODEL_HOME, PADDLE_WEIGHTS_NAME
from tests.testing_utils import slow

@@ -57,12 +57,6 @@ def test_from_pretrained_cache_dir_pretrained_init(self):
# check against double appending model_name in cache_dir
self.assertFalse(os.path.exists(os.path.join(tempdir, model_name, model_name)))

@slow
def test_from_pretrained_with_load_as_state_np_params(self):
"""init model with `load_state_as_np` params"""
model = TinyBertModel.from_pretrained("tinybert-4l-312d", load_state_as_np=True)
self.assertIsNotNone(model)

@slow
def test_multiprocess_downloading(self):
"""test downloading with multi-process. Some errors may be triggered when downloading model
