diff --git a/examples/benchmark/ceval/model_evaluator.py b/examples/benchmark/ceval/model_evaluator.py
index 4fbef4fe26c9..01a18ab7bfa1 100644
--- a/examples/benchmark/ceval/model_evaluator.py
+++ b/examples/benchmark/ceval/model_evaluator.py
@@ -30,7 +30,7 @@ def __init__(self, choices, k, model_name_or_path, temperature=0.2):
         super().__init__(choices, model_name_or_path, k)
         self.model_name_or_path = model_name_or_path
         self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
-        self.model = AutoModelForCausalLM.from_pretrained(model_name_or_path, dtype="float16", low_cpu_mem_usage=True)
+        self.model = AutoModelForCausalLM.from_pretrained(model_name_or_path, dtype="float16")
         self.model.eval()
         self.generation_config = dict(
             temperature=temperature,
diff --git a/examples/benchmark/peft/paddle/benchmark.py b/examples/benchmark/peft/paddle/benchmark.py
index 1849dd9d7312..e21c4a37b108 100644
--- a/examples/benchmark/peft/paddle/benchmark.py
+++ b/examples/benchmark/peft/paddle/benchmark.py
@@ -92,7 +92,6 @@ def main():
     if model_args.model_name_or_path in ["gpt3-6.7B-en", "gpt3-13B-en"]:
         model = GPTForCausalLM.from_pretrained(
             model_args.model_name_or_path,
-            low_cpu_mem_usage=True,
             use_flash_attention=model_args.use_flash_attention,
             dtype=dtype,
             tensor_parallel_degree=training_args.tensor_parallel_degree,
@@ -104,7 +103,6 @@ def main():
     else:
         model = AutoModelForCausalLM.from_pretrained(
             model_args.model_name_or_path,
-            low_cpu_mem_usage=True,
             use_flash_attention=model_args.use_flash_attention,
             dtype=dtype,
             tensor_parallel_degree=training_args.tensor_parallel_degree,
diff --git a/examples/benchmark/peft/paddle/inference_benchmark.py b/examples/benchmark/peft/paddle/inference_benchmark.py
index 79cec1478e51..43e5efcd6b3d 100644
--- a/examples/benchmark/peft/paddle/inference_benchmark.py
+++ b/examples/benchmark/peft/paddle/inference_benchmark.py
@@ -67,8 +67,6 @@ def predict_forward(model, inputs):
     tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
     model = AutoModelForCausalLM.from_pretrained(
         args.model_name_or_path,
-        load_state_as_np=True,
-        low_cpu_mem_usage=True,
     )
     if model.base_model_prefix == "llama":
         tokenizer.pad_token = tokenizer.unk_token
diff --git a/examples/code_generation/codegen/README.md b/examples/code_generation/codegen/README.md
index c71803fdba20..6ef14f5bcbc8 100644
--- a/examples/code_generation/codegen/README.md
+++ b/examples/code_generation/codegen/README.md
@@ -119,7 +119,6 @@ python codegen_server.py
 - `min_length`:生成的最小长度,默认为0
 - `max_length`:生成的最大长度,默认为16
 - `decode_strategy`:解码策略,默认为"greedy_search"
-- `load_state_as_np`:以numpy格式加载模型参数,可节省显存,默认为True
 - `use_fast`:是否使用FastGeneration,可加速推理,默认为True
 - `use_fp16_decoding`:是否使用fp16推理,可节省显存和加速推理,默认为True
diff --git a/examples/code_generation/codegen/codegen_server.py b/examples/code_generation/codegen/codegen_server.py
index 5f8d80bb7cfa..e0c246063bf9 100644
--- a/examples/code_generation/codegen/codegen_server.py
+++ b/examples/code_generation/codegen/codegen_server.py
@@ -35,7 +35,6 @@ class DefaultConfig:
     min_length = 0
     max_length = 16
     decode_strategy = "greedy_search"
-    load_state_as_np = True
     use_faster = True
     use_fp16_decoding = True
     default_dtype = "float16" if use_faster and use_fp16_decoding else "float32"
@@ -64,9 +63,7 @@ class Output(BaseModel):
 paddle.set_default_dtype(generate_config.default_dtype)
 tokenizer = CodeGenTokenizer.from_pretrained(generate_config.model_name_or_path)
-model = CodeGenForCausalLM.from_pretrained(
-    generate_config.model_name_or_path, load_state_as_np=generate_config.load_state_as_np
-)
+model = CodeGenForCausalLM.from_pretrained(generate_config.model_name_or_path)
 app = FastAPI()
diff --git a/examples/language_model/t5/tests/t5_mp.py b/examples/language_model/t5/tests/t5_mp.py
index aab854c3475e..40a5c0d6256b 100644
--- a/examples/language_model/t5/tests/t5_mp.py
+++ b/examples/language_model/t5/tests/t5_mp.py
@@ -44,7 +44,6 @@ def main():
         tensor_parallel_degree=tensor_parallel_degree,
         tensor_parallel_rank=tensor_parallel_rank,
         dtype="float32",
-        low_cpu_mem_usage=True,
     )
     model.eval()
     loss = model(
@@ -63,7 +62,6 @@ def main():
         tensor_parallel_degree=tensor_parallel_degree,
         tensor_parallel_rank=tensor_parallel_rank,
         dtype="float32",
-        low_cpu_mem_usage=True,
     )
     load_model.eval()
     loss = load_model(
@@ -85,7 +83,6 @@ def main():
         tensor_parallel_degree=tensor_parallel_degree,
         tensor_parallel_rank=tensor_parallel_rank,
         dtype="float32",
-        low_cpu_mem_usage=True,
     )
     load_model.eval()
     loss = load_model(
diff --git a/examples/text_generation/opt/demo.py b/examples/text_generation/opt/demo.py
index 2ab891ed6153..63828dab7d01 100644
--- a/examples/text_generation/opt/demo.py
+++ b/examples/text_generation/opt/demo.py
@@ -23,7 +23,7 @@ class Demo:
     def __init__(self, model_name_or_path, max_predict_len=128):
         self.tokenizer = GPTTokenizer.from_pretrained(model_name_or_path)
         logger.info("Loading the model parameters, please wait...")
-        self.model = OPTForCausalLM.from_pretrained(model_name_or_path, load_state_as_np=True)
+        self.model = OPTForCausalLM.from_pretrained(model_name_or_path)
         self.model.eval()
         self.max_predict_len = max_predict_len
         logger.info("Model loaded.")
diff --git a/fast_generation/perf/codegen_perf.py b/fast_generation/perf/codegen_perf.py
index 8620a11336c5..1a84b4e94fab 100644
--- a/fast_generation/perf/codegen_perf.py
+++ b/fast_generation/perf/codegen_perf.py
@@ -37,7 +37,7 @@ def perf_pd(args):
     place = "gpu"
     place = paddle.set_device(place)
     tokenizer = CodeGenTokenizer.from_pretrained(args.model_name_or_path)
-    model = CodeGenForCausalLM.from_pretrained(args.model_name_or_path, load_state_as_np=True)
+    model = CodeGenForCausalLM.from_pretrained(args.model_name_or_path)
     model.eval()
     load_mem = query_by_id(args.gpu_id)
diff --git a/fast_generation/perf/pegasus_perf.py b/fast_generation/perf/pegasus_perf.py
index ae9c6ce61b6a..fe8ba55fb8e3 100644
--- a/fast_generation/perf/pegasus_perf.py
+++ b/fast_generation/perf/pegasus_perf.py
@@ -40,7 +40,7 @@ def perf_pd(args):
     place = "gpu"
     place = paddle.set_device(place)
     tokenizer = PegasusChineseTokenizer.from_pretrained(args.model_name_or_path)
-    model = PegasusForConditionalGeneration.from_pretrained(args.model_name_or_path, load_state_as_np=True)
+    model = PegasusForConditionalGeneration.from_pretrained(args.model_name_or_path)
     model.eval()
     load_mem = query_by_id(args.gpu_id)
     input_ids_np = [np.random.choice(range(len(tokenizer.vocab)), args.input_len) for _ in range(args.batch_size)]
diff --git a/fast_generation/samples/codegen_16b_sample.py b/fast_generation/samples/codegen_16b_sample.py
index 02c121645e1c..0f556911e813 100644
--- a/fast_generation/samples/codegen_16b_sample.py
+++ b/fast_generation/samples/codegen_16b_sample.py
@@ -21,7 +21,7 @@
 model_name = "Salesforce/codegen-16B-mono"
 tokenizer = CodeGenTokenizer.from_pretrained(model_name)
-model = CodeGenForCausalLM.from_pretrained(model_name, load_state_as_np=True)
+model = CodeGenForCausalLM.from_pretrained(model_name)
 model.eval()
 inputs = "def hello"
diff --git a/fast_generation/samples/gpt_mp_sample.py b/fast_generation/samples/gpt_mp_sample.py
index f2370f9b2e8f..061318e74661 100644
--- a/fast_generation/samples/gpt_mp_sample.py
+++ b/fast_generation/samples/gpt_mp_sample.py
@@ -96,7 +96,7 @@ def main(args):
     if args.profile:
         MODEL_CLASSES[model_name][0].generate = profile(args.batch_size)(MODEL_CLASSES[model_name][0].generate)
     tokenizer = MODEL_CLASSES[model_name][-1].from_pretrained(model_name)
-    model = MODEL_CLASSES[model_name][0].from_pretrained(model_name, load_state_as_np=True)
+    model = MODEL_CLASSES[model_name][0].from_pretrained(model_name)
     model.eval()
     # NOTE: When using prompt, open this and replace the text with what you want.
diff --git a/fast_generation/samples/gptj_sample.py b/fast_generation/samples/gptj_sample.py
index f335121287c8..17615667dfda 100644
--- a/fast_generation/samples/gptj_sample.py
+++ b/fast_generation/samples/gptj_sample.py
@@ -20,7 +20,7 @@
 model_name = "EleutherAI/gpt-j-6B"
 tokenizer = GPTJTokenizer.from_pretrained(model_name)
-model = GPTJForCausalLM.from_pretrained(model_name, load_state_as_np=True)
+model = GPTJForCausalLM.from_pretrained(model_name)
 model.eval()
 inputs = "What is PaddleNLP?"
diff --git a/fast_generation/samples/plato_xl_sample.py b/fast_generation/samples/plato_xl_sample.py
index b7c91f4fc921..9c6138a9721b 100644
--- a/fast_generation/samples/plato_xl_sample.py
+++ b/fast_generation/samples/plato_xl_sample.py
@@ -106,7 +106,7 @@ def main(args):
     if args.profile:
         UnifiedTransformerLMHeadModel.generate = profile(args.batch_size)(UnifiedTransformerLMHeadModel.generate)
     tokenizer = UnifiedTransformerTokenizer.from_pretrained("plato-xl")
-    model = UnifiedTransformerLMHeadModel.from_pretrained("plato-xl", load_state_as_np=True)
+    model = UnifiedTransformerLMHeadModel.from_pretrained("plato-xl")
     model.eval()
     history = [
diff --git a/llm/ernie-3.5-se/predict_generation.py b/llm/ernie-3.5-se/predict_generation.py
index 4120b1862a41..933dcf044eb5 100644
--- a/llm/ernie-3.5-se/predict_generation.py
+++ b/llm/ernie-3.5-se/predict_generation.py
@@ -99,7 +99,6 @@ def __init__(self, args=None, tokenizer=None, model=None, **kwargs):
             args.model_name_or_path,
             tensor_parallel_degree=tensor_parallel_degree,
             tensor_parallel_rank=tensor_parallel_rank,
-            load_state_as_np=True,
             dtype=dtype,
             use_flash_attention=use_flash_attn,
         )
diff --git a/llm/ernie-3.5-se/run_pretrain.py b/llm/ernie-3.5-se/run_pretrain.py
index 5e08185f7bb4..2fd4abcce880 100644
--- a/llm/ernie-3.5-se/run_pretrain.py
+++ b/llm/ernie-3.5-se/run_pretrain.py
@@ -394,7 +394,6 @@ def main():
             model_args.model_name_or_path,
             config=config,
             dtype=dtype,
-            load_state_as_np=True,
             use_progressive_seq_len=True,
         )
     else:
diff --git a/llm/glm/finetune_generation.py b/llm/glm/finetune_generation.py
index dd536c73f87a..e8779d68f3ee 100644
--- a/llm/glm/finetune_generation.py
+++ b/llm/glm/finetune_generation.py
@@ -109,7 +109,6 @@ def main():
         model_args.model_name_or_path,
         output_predict=True,
         parallel_output=True,
-        load_state_as_np=True,
         dtype=dtype,  # todo enable set dtype to avoid additional mem usage
         tensor_parallel_degree=training_args.tensor_parallel_degree,
         tensor_parallel_rank=training_args.tensor_parallel_rank,
diff --git a/llm/glm/predict_generation.py b/llm/glm/predict_generation.py
index 7467216557a1..41dd6b3459af 100644
--- a/llm/glm/predict_generation.py
+++ b/llm/glm/predict_generation.py
@@ -80,9 +80,7 @@ def __init__(self, args):
             args.model_name_or_path,
             tensor_parallel_degree=tensor_parallel_degree,
             tensor_parallel_rank=tensor_parallel_rank,
-            load_state_as_np=True,
             dtype=dtype,
-            low_cpu_mem_usage=True,
         )
         if self.args.lora_path is not None:
             self.model = LoRAModel.from_pretrained(self.model, self.args.lora_path)
diff --git a/llm/gpt-3/finetune_generation.py b/llm/gpt-3/finetune_generation.py
index 872bb3a0e24a..9018221f78da 100644
--- a/llm/gpt-3/finetune_generation.py
+++ b/llm/gpt-3/finetune_generation.py
@@ -153,7 +153,6 @@ def main():
         model_args.model_name_or_path,
         config=config,
         dtype=dtype,
-        load_state_as_np=True,
     )
     if model_args.lora:
         if model_args.lora_path is None:
diff --git a/llm/gpt-3/predict_generation.py b/llm/gpt-3/predict_generation.py
index 53e1d95c22d3..060bcb9f8cf1 100644
--- a/llm/gpt-3/predict_generation.py
+++ b/llm/gpt-3/predict_generation.py
@@ -73,8 +73,6 @@ def __init__(self, args=None, tokenizer=None, model=None, **kwargs):
         self.model = GPTForCausalLM.from_pretrained(
             args.model_name_or_path,
-            load_state_as_np=True,
-            low_cpu_mem_usage=True,
             dtype=dtype,
             tensor_parallel_degree=tensor_parallel_degree,
             tensor_parallel_rank=tensor_parallel_rank,
diff --git a/model_zoo/plato-xl/infer.py b/model_zoo/plato-xl/infer.py
index a20270855f9a..e96458fb12b2 100644
--- a/model_zoo/plato-xl/infer.py
+++ b/model_zoo/plato-xl/infer.py
@@ -112,7 +112,7 @@ def main(args):
     if args.profile:
         UnifiedTransformerLMHeadModel.generate = profile(args.batch_size)(UnifiedTransformerLMHeadModel.generate)
     tokenizer = UnifiedTransformerTokenizer.from_pretrained("plato-xl")
-    model = UnifiedTransformerLMHeadModel.from_pretrained("plato-xl", load_state_as_np=True)
+    model = UnifiedTransformerLMHeadModel.from_pretrained("plato-xl")
     model.eval()
     history = [
diff --git a/paddlenlp/ops/fast_transformer/sample/plato_export_model_sample.py b/paddlenlp/ops/fast_transformer/sample/plato_export_model_sample.py
index 56bf31bc076e..1b3e6a0e877e 100644
--- a/paddlenlp/ops/fast_transformer/sample/plato_export_model_sample.py
+++ b/paddlenlp/ops/fast_transformer/sample/plato_export_model_sample.py
@@ -70,7 +70,7 @@ def do_predict(args):
         paddle.set_default_dtype("float16")
     model_name = "plato-xl"
-    model = UnifiedTransformerLMHeadModel.from_pretrained(model_name, load_state_as_np=True)
+    model = UnifiedTransformerLMHeadModel.from_pretrained(model_name)
     tokenizer = UnifiedTransformerTokenizer.from_pretrained(model_name)
     plato = FasterUnifiedTransformer(model=model, use_fp16_decoding=args.use_fp16_decoding)
diff --git a/paddlenlp/taskflow/text2text_generation.py b/paddlenlp/taskflow/text2text_generation.py
index 6cb5703c2dfe..7966f2995eef 100644
--- a/paddlenlp/taskflow/text2text_generation.py
+++ b/paddlenlp/taskflow/text2text_generation.py
@@ -117,7 +117,6 @@ def _construct_model(self, model):
         """
         model_instance = AutoModelForCausalLM.from_pretrained(
             self.model,
-            load_state_as_np=True,
             dtype=self._dtype,
         )
         # Load the model parameter for the predict
diff --git a/tests/transformers/test_modeling_utils.py b/tests/transformers/test_modeling_utils.py
index 8cdf1c847f90..f83ea674943f 100644
--- a/tests/transformers/test_modeling_utils.py
+++ b/tests/transformers/test_modeling_utils.py
@@ -18,7 +18,7 @@
 from multiprocessing import Pool
 from tempfile import TemporaryDirectory
-from paddlenlp.transformers import BertModel, TinyBertModel
+from paddlenlp.transformers import BertModel
 from paddlenlp.utils.env import CONFIG_NAME, MODEL_HOME, PADDLE_WEIGHTS_NAME
 from tests.testing_utils import slow
@@ -57,12 +57,6 @@ def test_from_pretrained_cache_dir_pretrained_init(self):
         # check against double appending model_name in cache_dir
         self.assertFalse(os.path.exists(os.path.join(tempdir, model_name, model_name)))
-    @slow
-    def test_from_pretrained_with_load_as_state_np_params(self):
-        """init model with `load_state_as_np` params"""
-        model = TinyBertModel.from_pretrained("tinybert-4l-312d", load_state_as_np=True)
-        self.assertIsNotNone(model)
-
     @slow
     def test_multiprocess_downloading(self):
         """test downloading with multi-process. Some errors may be triggered when downloading model