Merged
Commits
33 commits
8f77f17
Enhance woq model loading & support hf woq model loading
yuwenzho May 31, 2024
8c93d05
Merge branch 'master' into yuwenzho/hf_woq_load
yuwenzho May 31, 2024
507339a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 31, 2024
2a7f5df
enhance code & fix bug
yuwenzho May 31, 2024
76edd2b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 31, 2024
ae4aa65
Merge branch 'master' into yuwenzho/hf_woq_load
yuwenzho May 31, 2024
9124e04
update load API usage
yuwenzho Jun 3, 2024
091676c
fix bug
yuwenzho Jun 3, 2024
6a0eec4
fix bug
yuwenzho Jun 3, 2024
d25763d
enhance code
yuwenzho Jun 5, 2024
9186c57
fix conflict
yuwenzho Jun 5, 2024
219fb54
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 5, 2024
3c6b1f5
enhance coverage
yuwenzho Jun 5, 2024
da120cc
Merge branch 'master' into yuwenzho/hf_woq_load
yuwenzho Jun 5, 2024
d1cdd62
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 5, 2024
11ae6fd
enhance code
yuwenzho Jun 6, 2024
e2afc7b
Merge branch 'master' into yuwenzho/hf_woq_load
yuwenzho Jun 6, 2024
67fedf0
fix pylint
yuwenzho Jun 6, 2024
124678c
enhance load API
yuwenzho Jun 11, 2024
8154ae5
fix conflict
yuwenzho Jun 11, 2024
f3ba356
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 11, 2024
eb95cff
enhance docstring
yuwenzho Jun 11, 2024
097e1a1
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 11, 2024
92ad0e5
update load API usage
yuwenzho Jun 11, 2024
fba5711
update load API usage
yuwenzho Jun 11, 2024
939a261
enhance code
yuwenzho Jun 12, 2024
c7c2274
Merge branch 'master' into yuwenzho/hf_woq_load
yuwenzho Jun 12, 2024
f18602c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 12, 2024
a33c766
enhance code
yuwenzho Jun 12, 2024
fca891b
Merge branch 'master' into yuwenzho/hf_woq_load
yuwenzho Jun 12, 2024
42cfe39
enhance code
yuwenzho Jun 12, 2024
61b70cc
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 12, 2024
f03fd85
Merge branch 'master' into yuwenzho/hf_woq_load
yuwenzho Jun 12, 2024
16 changes: 8 additions & 8 deletions docs/3x/PT_WeightOnlyQuant.md
@@ -31,13 +31,13 @@ Theoretically, round-to-nearest (RTN) is the most straightforward way to quantiz

## Supported Matrix

| Algorithms/Backend | PyTorch eager mode |
|--------------|----------|
| RTN | ✔ |
| GPTQ | ✔ |
| AutoRound| ✔ |
| AWQ | ✔ |
| TEQ | ✔ |
| HQQ | ✔ |
> **RTN:** The most intuitive quantization method. It requires no additional dataset and is very fast. Generally speaking, RTN converts the weights into a uniformly distributed integer data type, although some algorithms, such as QLoRA, propose a non-uniform NF4 data type and prove its theoretical optimality.
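
For intuition, here is a minimal, self-contained sketch of what round-to-nearest weight quantization does (toy code, not the INC kernel; symmetric signed int4 with per-group scales is assumed):

```python
import torch

def rtn_quantize(weight: torch.Tensor, bits: int = 4, group_size: int = 32):
    """Toy RTN: scale each group into the integer range, round, then dequantize."""
    out_ch, in_ch = weight.shape
    qmax = 2 ** (bits - 1) - 1                                  # 7 for symmetric int4
    w = weight.reshape(out_ch, in_ch // group_size, group_size)
    scale = (w.abs().amax(dim=-1, keepdim=True) / qmax).clamp_min(1e-8)
    q = torch.clamp(torch.round(w / scale), -qmax - 1, qmax)    # the "round-to-nearest" step
    w_hat = (q * scale).reshape(out_ch, in_ch)                  # values the model actually uses
    return q, scale, w_hat

w = torch.randn(128, 256)
_, _, w_hat = rtn_quantize(w)
print("max abs quantization error:", (w - w_hat).abs().max().item())
```
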

@@ -64,8 +64,8 @@ WeightOnlyQuant quantization for PyTorch is using prepare and convert [APIs](./P
| bits (int)| [1, ..., 8] |
| group_size (int)| [-1, 1, ..., $C_{in}$] |
| use_sym (bool)| [True, False] |
| use_double_quant (bool) | [True, False] |
| double_quant_dtype (str) | ['int'] |
| double_quant_bits (int) | [1, ..., bits] |
| double_quant_use_sym (bool) | [True, False] |
| double_quant_group_size (int) | [-1, 1, ..., $C_{in}$] |
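
Putting these knobs together, a minimal sketch of enabling double quantization through the 3.x prepare/convert flow (the `RTNConfig` argument names mirror the table above; the concrete values are illustrative, not recommendations):

```python
import torch
from neural_compressor.torch.quantization import RTNConfig, prepare, convert

model = torch.nn.Sequential(torch.nn.Linear(1024, 1024))  # stand-in for a real model

quant_config = RTNConfig(
    bits=4,
    group_size=32,
    use_sym=True,
    use_double_quant=True,          # quantize the scales as well
    double_quant_dtype="int",
    double_quant_bits=6,
    double_quant_use_sym=False,
    double_quant_group_size=256,
)
model = prepare(model, quant_config=quant_config)  # RTN needs no calibration run
model = convert(model)
```
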
@@ -98,7 +98,7 @@ model = convert(model)
#### GPTQ
| gptq_args | comments | default value |
|----------|-------------|-------------------------------------------------------------------|
| use_mse_search (bool) | Enables mean squared error (MSE) search | False
| use_layer_wise (bool) | Enables quantizing the model per layer | False |
| model_path (str) | Model path that is used to load state_dict per layer | |
| use_double_quant (bool) | Enables double quantization | False |
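
Because GPTQ is data-driven, the same prepare/convert flow adds a calibration pass in between. A short sketch, assuming `GPTQConfig` accepts the arguments listed above and that a plain linear stack is enough to exercise it:

```python
import torch
from neural_compressor.torch.quantization import GPTQConfig, prepare, convert

model = torch.nn.Sequential(torch.nn.Linear(1024, 1024))  # stand-in for a real model

def run_fn(model):
    # Feed a handful of calibration batches through the prepared model.
    for _ in range(8):
        model(torch.randn(2, 1024))

quant_config = GPTQConfig(use_mse_search=False, use_layer_wise=False, use_double_quant=False)
model = prepare(model, quant_config=quant_config)
run_fn(model)            # calibration
model = convert(model)
```
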
@@ -120,7 +120,7 @@ model = convert(model)
#### AutoRound
| autoround_args | comments | default value |
|----------|-------------|-------------------------------------------------------------------|
| enable_full_range (bool) | Whether to enable full range quantization | False
| batch_size (int) | Batch size for training | 8 |
| lr_scheduler | The learning rate scheduler to be used | None |
| enable_quanted_input (bool) | Whether to use quantized input data | True |
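
AutoRound follows the same pattern with training-style knobs of its own. A compact sketch, assuming `AutoRoundConfig` takes the arguments listed above; plug the config into the same prepare → calibration → convert flow shown for GPTQ:

```python
from neural_compressor.torch.quantization import AutoRoundConfig

# Illustrative values only; AutoRound also needs calibration data, since it
# learns rounding offsets rather than rounding to nearest directly.
quant_config = AutoRoundConfig(
    enable_full_range=False,
    batch_size=8,
    lr_scheduler=None,
    enable_quanted_input=True,
)
```
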
@@ -251,8 +251,8 @@ from neural_compressor.torch.quantization import load

orig_model = YOURMODEL()
loaded_model = load(
"saved_results", model=orig_model
) # Please note that the model parameter passes the original model.
"saved_results", original_model=orig_model
) # Please note that the original_model parameter passes the original model.
```
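
This PR's title also mentions loading Hugging Face WOQ models. If the enhanced `load` API exposes that through the same entry point with a format switch, usage might look like the sketch below; the `format` value and the model ID are assumptions, not taken from the visible diff:

```python
from neural_compressor.torch.quantization import load

# Hypothetical usage: argument values below are illustrative assumptions.
qmodel = load(
    model_name_or_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",  # hypothetical Hub model ID
    format="huggingface",
)
```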


@@ -63,7 +63,7 @@
parser.add_argument("--calib_iters", default=100, type=int,
help="calibration iters.")
parser.add_argument("--tasks", nargs='+', default=["lambada_openai"], \
type=str, choices=["hellaswag", "lambada_openai", "piqa", "winogrande", "copa",
"rte", "openbookqa", "lambada_standard", "wikitext"],
help="tasks list for accuracy validation")
parser.add_argument("--limit", default=None, type=int,
@@ -117,10 +117,10 @@
for examples in calib_dataset:
calib_data.append(
tokenizer(
examples["text"],
return_tensors="pt",
max_length=64,
padding="max_length",
examples["text"],
return_tensors="pt",
max_length=64,
padding="max_length",
truncation=True
)
)
@@ -154,7 +154,7 @@ def calib_func(model):



# If torch.matmul and torch.bmm are not replaced by INC module,
# Below codes can make torch.matmul and torch.bmm run on fp8 by injection.
if not args.skip_fp8_mm and args.precision in ['fp8_e4m3', 'fp8_e5m2']:
def replace_torch_mm_bmm():
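    # Illustrative sketch only -- the script's actual body is collapsed in this diff
    # view. A typical injection pattern rebinds the global ops to wrappers that
    # round tensors through an fp8 dtype (torch.float8_e4m3fn / float8_e5m2 need
    # PyTorch >= 2.1); real fp8 kernels would keep the data in fp8 end to end.
    orig_matmul, orig_bmm = torch.matmul, torch.bmm
    fp8_dtype = torch.float8_e4m3fn if args.precision == 'fp8_e4m3' else torch.float8_e5m2

    def to_fp8_and_back(t):
        return t.to(fp8_dtype).to(t.dtype)

    def fp8_matmul(a, b, *mm_args, **mm_kwargs):
        return orig_matmul(to_fp8_and_back(a), to_fp8_and_back(b), *mm_args, **mm_kwargs)

    def fp8_bmm(a, b, *mm_args, **mm_kwargs):
        return orig_bmm(to_fp8_and_back(a), to_fp8_and_back(b), *mm_args, **mm_kwargs)

    torch.matmul, torch.bmm = fp8_matmul, fp8_bmm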
@@ -367,7 +367,7 @@ def run_fn(model):
user_model = prepare(model=user_model, quant_config=quant_config, example_inputs=example_inputs)
run_fn(user_model)
user_model = convert(user_model)

user_model.save(args.output_dir)


@@ -377,9 +377,10 @@ def run_fn(model):
print("load int8 model")

from neural_compressor.torch.quantization import load
user_model, _ = get_user_model()
tokenizer = AutoTokenizer.from_pretrained(args.model)
config = AutoConfig.from_pretrained(args.model)
user_model = load(os.path.abspath(os.path.expanduser(args.output_dir)))
user_model = load(os.path.abspath(os.path.expanduser(args.output_dir)), user_model)
setattr(user_model, "config", config)
else:
user_model, tokenizer = get_user_model()
3 changes: 3 additions & 0 deletions neural_compressor/torch/algorithms/weight_only/__init__.py
@@ -11,3 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from .save_load import save, load