@@ -20,7 +20,7 @@
 
 import pytest
 import torchao
-from executorch import version as executorch_version
+import transformers
 from executorch.extension.pybindings.portable_lib import ExecuTorchModule
 from packaging.version import parse
 from transformers import AutoConfig, AutoTokenizer
@@ -43,41 +43,49 @@ def __init__(self, *args, **kwargs):
     @slow
     @pytest.mark.run_slow
     @pytest.mark.skipif(
-        is_ci,
-        reason="Test Phi-4-mini (3.8B) will require runner to be configured with larger RAM",
+        parse(transformers.__version__) < parse("4.52.0") or parse(torchao.__version__) < parse("0.11.0"),
+        reason="Only available on transformers >= 4.52.0 and torchao >= 0.11.0",
     )
-    def test_phi4_text_generation(self):
+    def test_phi4_text_generation_with_custom_sdpa_and_kv_cache_8da4w_8we(self):
         model_id = "microsoft/Phi-4-mini-instruct"
         config = AutoConfig.from_pretrained(model_id)
         # NOTE: To make the model exportable we need to set the rope scaling to default to avoid hitting
         # the data-dependent control flow in _longrope_frequency_update. Alternatively, we can rewrite
         # that function to avoid the data-dependent control flow.
         if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
             config.rope_scaling["type"] = "default"
-        model = ExecuTorchModelForCausalLM.from_pretrained(model_id, recipe="xnnpack", config=config)
+        model = ExecuTorchModelForCausalLM.from_pretrained(
+            model_id,
+            recipe="xnnpack",
+            config=config,
+            attn_implementation="custom_sdpa",
+            use_custom_kv_cache=True,
+            **{"qlinear": True, "qembedding": True},
+        )
         self.assertIsInstance(model, ExecuTorchModelForCausalLM)
         self.assertIsInstance(model.model, ExecuTorchModule)
 
         tokenizer = AutoTokenizer.from_pretrained(model_id)
         generated_text = model.text_generation(
             tokenizer=tokenizer,
             prompt="My favourite condiment is ",
-            max_seq_len=32,
+            max_seq_len=64,
         )
         logging.info(f"\nGenerated text:\n\t{generated_text}")
-        generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids
 
-        # Free memory before loading eager for quality check
-        del model
-        del tokenizer
-        gc.collect()
+        if not is_ci:
+            generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids
+
+            # Free memory before loading eager for quality check
+            del model
+            del tokenizer
+            gc.collect()
 
-        self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens))
+            self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens))
 
     @slow
     @pytest.mark.run_slow
-    @pytest.mark.skipif(
-        parse(executorch_version.__version__) > parse("0.6.0"),
+    @pytest.mark.skip(
         reason="Require cache_position support in executorch runtime. Re-enable when available.",
     )
     def test_phi4_text_generation_with_quantized_pte_from_hub(self):
@@ -119,9 +127,8 @@ def test_phi4_text_generation_with_quantized_pte_from_hub(self):
 
     @slow
     @pytest.mark.run_slow
-    @pytest.mark.skipif(
-        parse(torchao.__version__) < parse("0.11.0.dev0"),
-        reason="Only available on torchao >= 0.11.0.dev0",
+    @pytest.mark.skip(
+        reason="Require cache_position support in executorch runtime. Re-enable when available.",
     )
     def test_phi4_text_generation_with_quantized_ckp(self):
         model_id = "pytorch/Phi-4-mini-instruct-8da4w"