Merge branch 'main' into openai_api_chat_correctness
Jack-Khuu authored Aug 16, 2024
2 parents 76b8a5a + 147c292 commit d90e33b
Showing 7 changed files with 29 additions and 75 deletions.
9 changes: 8 additions & 1 deletion build/builder.py
@@ -440,6 +440,7 @@ def _initialize_model(
     quantize,
     tokenizer=None,
     max_seq_length=None,
+    support_tensor_subclass: bool = True,
 ):
     print("Loading model...")

@@ -510,7 +511,13 @@ def _initialize_model(
     if quantize:
         print(f"Quantizing the model with: {quantize}")
         with measure_time("Time to quantize model: {time:.02f} seconds"):
-            quantize_model(model, builder_args.device, quantize, tokenizer)
+            quantize_model(
+                model,
+                builder_args.device,
+                quantize,
+                tokenizer,
+                support_tensor_subclass,
+            )
     device_sync(device=builder_args.device)

     if builder_args.setup_caches:
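For context on the API change above: _initialize_model now accepts a support_tensor_subclass flag (default True) and forwards it to quantize_model. A minimal sketch of the two call shapes, with builder_args and quantize standing in for real torchchat objects:

# Sketch only; builder_args and quantize are assumed to be set up as usual.
# Default path (eager / torch.compile): torchao tensor subclasses stay in place.
model = _initialize_model(builder_args, quantize)

# Paths that cannot consume tensor subclasses (e.g. AOT Inductor export)
# opt out, so quantize_model unwraps them into plain parameters.
model = _initialize_model(builder_args, quantize, support_tensor_subclass=False)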
65 changes: 0 additions & 65 deletions build/model_aoti.py

This file was deleted.

2 changes: 2 additions & 0 deletions export.py
@@ -126,6 +126,7 @@ def main(args):
         quantize,
         tokenizer,
         max_seq_length=builder_args.max_seq_length,
+        support_tensor_subclass=output_dso_path is None,
     )
     model_to_pte = model
     model_to_dso = model
@@ -143,6 +144,7 @@ def main(args):
         model_to_dso = _initialize_model(
             builder_args,
             quantize,
+            support_tensor_subclass=False,
         )
         _unset_gguf_kwargs(builder_args)

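The expression output_dso_path is None encodes the policy in one place: only the AOT Inductor (DSO) export path opts out of tensor subclasses. A hedged restatement of that decision, using the variable names from the diff:

# No DSO export requested (eager, compile, or PTE-only): subclasses are
# supported, so quantize_model leaves them wrapped.
# DSO export requested: the flag is False, and quantize_model calls
# unwrap_tensor_subclass(model) before the exporter sees the model.
support_tensor_subclass = output_dso_path is None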
10 changes: 5 additions & 5 deletions generate.py
@@ -199,9 +199,9 @@ def __init__(
         # global print
         # from tp import maybe_init_dist
         # rank = maybe_init_dist()
-        # use_tp = False
+        # use_distributed = False
         self.rank: Optional[int] = None
-        # if use_tp:
+        # if use_distributed:
         #     if rank != 0:
         #         # only print on rank 0
         #         print = lambda *args, **kwargs: None
@@ -655,7 +655,7 @@ def chat(
         )
         if generator_args.compile:
             if (
-                self.is_speculative and self.builder_args.use_tp
+                self.is_speculative and self.builder_args.use_distributed
             ):  # and ("cuda" in builder_args.device):
                 torch._inductor.config.triton.cudagraph_trees = (
                     False  # Bug with cudagraph trees in this case
@@ -783,7 +783,7 @@ def callback(x, *, done_generating=False):
             )

             if (i != generator_args.num_samples - 1 or not self.profile) or (
-                self.builder_args.use_tp and self.rank != 0
+                self.builder_args.use_distributed and self.rank != 0
             ):
                 import contextlib

@@ -820,7 +820,7 @@ def callback(x, *, done_generating=False):
             )
             compilation_time = time.perf_counter() - t0
             if hasattr(prof, "export_chrome_trace"):
-                if self.builder_args.use_tp:
+                if self.builder_args.use_distributed:
                     prof.export_chrome_trace(f"{self.profile}_rank_{self.rank}.json")
                 else:
                     prof.export_chrome_trace(f"{self.profile}.json")
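The generate.py changes are a mechanical rename of use_tp to use_distributed. The commented-out block near line 199 sketches rank-0-only printing; a standalone version of that pattern might look like the following (an illustration, not torchchat's actual code):

import torch.distributed as dist

rank = dist.get_rank() if dist.is_initialized() else 0
if rank != 0:
    # Silence print on non-zero ranks so multi-process runs emit output once.
    print = lambda *args, **kwargs: None  # noqa: E731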
4 changes: 2 additions & 2 deletions install_requirements.sh
@@ -47,7 +47,7 @@ fi
 # NOTE: If a newly-fetched version of the executorch repo changes the value of
 # NIGHTLY_VERSION, you should re-run this script to install the necessary
 # package versions.
-NIGHTLY_VERSION=dev20240728
+NIGHTLY_VERSION=dev20240814

 # Uninstall triton, as nightly will depend on pytorch-triton, which is one and the same
 (
@@ -82,7 +82,7 @@ REQUIREMENTS_TO_INSTALL=(
 # TODO: Remove this and install nightly build, once it supports macos
 (
   set -x
-  $PIP_EXECUTABLE install git+https://github.com/pytorch/ao.git@d477c0e59b458b5617dcb3e999290a87df3070d8
+  $PIP_EXECUTABLE install git+https://github.com/pytorch/ao.git@e11201a62669f582d81cdb33e031a07fb8dfc4f3
 )
 if [[ -x "$(command -v nvidia-smi)" ]]; then
   (
11 changes: 9 additions & 2 deletions quantization/quantize.py
@@ -50,7 +50,13 @@
 ### torchchat quantization API ###


-def quantize_model(model: nn.Module, device, quantize_options, tokenizer=None):
+def quantize_model(
+    model: nn.Module,
+    device,
+    quantize_options,
+    tokenizer=None,
+    support_tensor_subclass: bool = True,
+):
     """
     Quantize the specified model using the quantizers described by
     a quantization dict of the form:
@@ -74,7 +80,8 @@ def quantize_model(model: nn.Module, device, quantize_options, tokenizer=None):
         # Use tensor subclass API for int4 weight only.
         if device == "cuda" and quantizer == "linear:int4":
             quantize_(model, int4_weight_only(q_kwargs["groupsize"]))
-            unwrap_tensor_subclass(model)
+            if not support_tensor_subclass:
+                unwrap_tensor_subclass(model)
             continue
         # Use dtype precision specified in user config, else fallback on global precision.
         if "precision" in quantize_options:
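The behavioral change here: after int4 weight-only quantization, unwrap_tensor_subclass now runs only when the caller cannot handle tensor subclasses (the DSO export path), instead of unconditionally. A self-contained sketch of the torchao calls involved, assuming a torchao build close to the pinned commit; the toy model and group size are illustrative:

import torch
import torch.nn as nn
from torchao.quantization import int4_weight_only, quantize_
from torchao.utils import unwrap_tensor_subclass

model = nn.Sequential(nn.Linear(256, 256)).to(device="cuda", dtype=torch.bfloat16)

# Swap Linear weights for int4 weight-only quantized tensor subclasses.
quantize_(model, int4_weight_only(group_size=128))

# Eager and torch.compile can run the subclasses directly; exporters that
# cannot (AOT Inductor at the time of this commit) flatten them back into
# plain parameters first.
support_tensor_subclass = False  # what the DSO export path now passes
if not support_tensor_subclass:
    unwrap_tensor_subclass(model)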
3 changes: 3 additions & 0 deletions torchchat/README.md
@@ -0,0 +1,3 @@
+# Chat with LLMs Everywhere
+
+This directory is a WIP path that will host most of the files currently living in root
