- 
                Notifications
    You must be signed in to change notification settings 
- Fork 465
Add immediate isq predicates for qwen3 #1358
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
|  | @@ -4,9 +4,9 @@ use cli_table::{format::Justify, print_stdout, Cell, CellStruct, Style, Table}; | |
| use mistralrs_core::{ | ||
| get_auto_device_map_params, get_model_dtype, initialize_logging, paged_attn_supported, | ||
| parse_isq_value, Constraint, DefaultSchedulerMethod, DeviceLayerMapMetadata, DeviceMapMetadata, | ||
| DeviceMapSetting, DrySamplingParams, IsqType, Loader, LoaderBuilder, MemoryGpuConfig, | ||
| MistralRs, MistralRsBuilder, ModelSelected, NormalRequest, PagedAttentionConfig, Request, | ||
| RequestMessage, Response, SamplingParams, SchedulerConfig, TokenSource, Usage, | ||
| DeviceMapSetting, DrySamplingParams, Loader, LoaderBuilder, MemoryGpuConfig, MistralRs, | ||
| MistralRsBuilder, ModelSelected, NormalRequest, PagedAttentionConfig, Request, RequestMessage, | ||
| Response, SamplingParams, SchedulerConfig, TokenSource, Usage, | ||
| }; | ||
| use std::sync::Arc; | ||
| use std::{fmt::Display, num::NonZeroUsize}; | ||
|  | @@ -300,8 +300,8 @@ struct Args { | |
| num_device_layers: Option<Vec<String>>, | ||
|  | ||
| /// In-situ quantization to apply. | ||
| #[arg(long = "isq", value_parser = parse_isq_value)] | ||
| in_situ_quant: Option<IsqType>, | ||
| #[arg(long = "isq")] | ||
| in_situ_quant: Option<String>, | ||
|  | ||
| /// GPU memory to allocate for KV cache with PagedAttention in MBs. | ||
| /// PagedAttention is supported on CUDA and Metal. It is automatically activated on CUDA but not on Metal. | ||
|  | @@ -490,14 +490,19 @@ fn main() -> anyhow::Result<()> { | |
| (_, _, _, _, _, _) => None, | ||
| }; | ||
|  | ||
| let isq = args | ||
| .in_situ_quant | ||
| .as_ref() | ||
| .and_then(|isq| parse_isq_value(isq, Some(&device)).ok()); | ||
|  | ||
| 
      Comment on lines
    
      +493
     to 
      +497
    
   There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Parsing errors are swallowed – the user never knows their ISQ argument was ignored.
 -let isq = args
-    .in_situ_quant
-    .as_ref()
-    .and_then(|isq| parse_isq_value(isq, Some(&device)).ok());
+let isq = match &args.in_situ_quant {
+    Some(s) => Some(parse_isq_value(s, Some(&device))?),
+    None    => None,
+};
Now an invalid flag stops execution with a clear error instead of silently degrading. 🤖 Prompt for AI Agents | ||
| let pipeline = loader.load_model_from_hf( | ||
| None, | ||
| token_source, | ||
| &dtype, | ||
| &device, | ||
| false, | ||
| mapper, | ||
| args.in_situ_quant, | ||
| isq, | ||
| cache_config, | ||
| )?; | ||
| info!("Model loaded."); | ||
|  | ||
| Original file line number | Diff line number | Diff line change | ||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|  | @@ -10,7 +10,7 @@ use clap::Parser; | |||||||||||||||||
| use mistralrs_core::{ | ||||||||||||||||||
| get_auto_device_map_params, get_model_dtype, get_tgt_non_granular_index, initialize_logging, | ||||||||||||||||||
| paged_attn_supported, parse_isq_value, BertEmbeddingModel, DefaultSchedulerMethod, | ||||||||||||||||||
| DeviceLayerMapMetadata, DeviceMapMetadata, DeviceMapSetting, IsqType, Loader, LoaderBuilder, | ||||||||||||||||||
| DeviceLayerMapMetadata, DeviceMapMetadata, DeviceMapSetting, Loader, LoaderBuilder, | ||||||||||||||||||
| MemoryGpuConfig, MistralRs, MistralRsBuilder, ModelSelected, PagedAttentionConfig, Request, | ||||||||||||||||||
| SchedulerConfig, TokenSource, | ||||||||||||||||||
| }; | ||||||||||||||||||
|  | @@ -119,8 +119,8 @@ struct Args { | |||||||||||||||||
| num_device_layers: Option<Vec<String>>, | ||||||||||||||||||
|  | ||||||||||||||||||
| /// In-situ quantization to apply. | ||||||||||||||||||
| #[arg(long = "isq", value_parser = parse_isq_value)] | ||||||||||||||||||
| in_situ_quant: Option<IsqType>, | ||||||||||||||||||
| #[arg(long = "isq")] | ||||||||||||||||||
| in_situ_quant: Option<String>, | ||||||||||||||||||
| 
      Comment on lines
    
      +122
     to 
      +123
    
   There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🛠️ Refactor suggestion Re-add `value_parser` for `--isq`. Same concern as in the bench binary: turning the field into `Option<String>` without a `value_parser` drops early CLI validation of the flag. 🤖 Prompt for AI Agents | ||||||||||||||||||
|  | ||||||||||||||||||
| /// GPU memory to allocate for KV cache with PagedAttention in MBs. | ||||||||||||||||||
| /// PagedAttention is supported on CUDA and Metal. It is automatically activated on CUDA but not on Metal. | ||||||||||||||||||
|  | @@ -223,7 +223,7 @@ async fn re_isq( | |||||||||||||||||
| ) -> Result<String, String> { | ||||||||||||||||||
| let repr = format!("Re ISQ: {:?}", request.ggml_type); | ||||||||||||||||||
| MistralRs::maybe_log_request(state.clone(), repr.clone()); | ||||||||||||||||||
| let request = Request::ReIsq(parse_isq_value(&request.ggml_type)?); | ||||||||||||||||||
| let request = Request::ReIsq(parse_isq_value(&request.ggml_type, None)?); | ||||||||||||||||||
| state.get_sender().unwrap().send(request).await.unwrap(); | ||||||||||||||||||
| Ok(repr) | ||||||||||||||||||
| } | ||||||||||||||||||
|  | @@ -300,7 +300,12 @@ async fn main() -> Result<()> { | |||||||||||||||||
| .build()?; | ||||||||||||||||||
|  | ||||||||||||||||||
| #[cfg(feature = "metal")] | ||||||||||||||||||
| let device = Device::new_metal(0)?; | ||||||||||||||||||
| let device = if args.cpu { | ||||||||||||||||||
| args.no_paged_attn = true; | ||||||||||||||||||
| Device::Cpu | ||||||||||||||||||
| } else { | ||||||||||||||||||
| Device::new_metal(0)? | ||||||||||||||||||
| }; | ||||||||||||||||||
| #[cfg(not(feature = "metal"))] | ||||||||||||||||||
| let device = if args.cpu { | ||||||||||||||||||
| args.no_paged_attn = true; | ||||||||||||||||||
|  | @@ -426,14 +431,19 @@ async fn main() -> Result<()> { | |||||||||||||||||
| (_, _, _, _, _, _) => None, | ||||||||||||||||||
| }; | ||||||||||||||||||
|  | ||||||||||||||||||
| let isq = args | ||||||||||||||||||
| .in_situ_quant | ||||||||||||||||||
| .as_ref() | ||||||||||||||||||
| .and_then(|isq| parse_isq_value(isq, Some(&device)).ok()); | ||||||||||||||||||
|  | ||||||||||||||||||
| 
      Comment on lines
    
      +434
     to 
      +438
    
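   There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Silent drop of invalid `--isq` values. Identical issue to the bench tool: using `.ok()` discards the parse error, so an invalid value is silently treated as if no ISQ was requested.
-let isq = args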
-    .in_situ_quant
-    .as_ref()
-    .and_then(|isq| parse_isq_value(isq, Some(&device)).ok());
+let isq = match &args.in_situ_quant {
+    Some(s) => Some(parse_isq_value(s, Some(&device))?),
+    None    => None,
+};
This will make the server fail at startup with a clear error instead of quietly ignoring the flag, which is preferable for an operator-facing server binary. 📝 Committable suggestion
 
        Suggested change
       
 🤖 Prompt for AI Agents | ||||||||||||||||||
| let pipeline = loader.load_model_from_hf( | ||||||||||||||||||
| None, | ||||||||||||||||||
| args.token_source, | ||||||||||||||||||
| &dtype, | ||||||||||||||||||
| &device, | ||||||||||||||||||
| false, | ||||||||||||||||||
| mapper, | ||||||||||||||||||
| args.in_situ_quant, | ||||||||||||||||||
| isq, | ||||||||||||||||||
| cache_config, | ||||||||||||||||||
| )?; | ||||||||||||||||||
| info!("Model loaded."); | ||||||||||||||||||
|  | ||||||||||||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🛠️ Refactor suggestion
Early-stage CLI validation for `--isq` was lost – re-introduce via `value_parser`
By changing the field to `Option<String>` without a `value_parser`, invalid ISQ values will now silently pass `clap`’s parsing stage and only be caught (maybe) much later in the program flow. You can keep the defer-to-device behaviour while still validating user input early:
This preserves the new string syntax at the CLI, catches typos immediately, and still allows you to decide later whether the chosen ISQ is legal for the detected device.
📝 Committable suggestion
🤖 Prompt for AI Agents