Merge branch 'main' into openai_api_chat_correctness
Jack-Khuu authored Aug 16, 2024
2 parents 76b8a5a + 147c292 commit d90e33b
Showing 7 changed files with 29 additions and 75 deletions.
9 changes: 8 additions & 1 deletion build/builder.py
@@ -440,6 +440,7 @@ def _initialize_model(
     quantize,
     tokenizer=None,
     max_seq_length=None,
+    support_tensor_subclass: bool = True,
 ):
     print("Loading model...")

@@ -510,7 +511,13 @@ def _initialize_model(
     if quantize:
         print(f"Quantizing the model with: {quantize}")
         with measure_time("Time to quantize model: {time:.02f} seconds"):
-            quantize_model(model, builder_args.device, quantize, tokenizer)
+            quantize_model(
+                model,
+                builder_args.device,
+                quantize,
+                tokenizer,
+                support_tensor_subclass,
+            )
     device_sync(device=builder_args.device)

     if builder_args.setup_caches:
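For context on the API change above: _initialize_model now accepts a support_tensor_subclass flag (default True) and forwards it to quantize_model. A minimal sketch of the two call shapes, with builder_args and quantize standing in for real torchchat objects:

# Sketch only; builder_args and quantize are assumed to be set up as usual.
# Default path (eager / torch.compile): torchao tensor subclasses stay in place.
model = _initialize_model(builder_args, quantize)

# Paths that cannot consume tensor subclasses (e.g. AOT Inductor export)
# opt out, so quantize_model unwraps them into plain parameters.
model = _initialize_model(builder_args, quantize, support_tensor_subclass=False)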
65 changes: 0 additions & 65 deletions build/model_aoti.py

This file was deleted.

2 changes: 2 additions & 0 deletions export.py
@@ -126,6 +126,7 @@ def main(args):
         quantize,
         tokenizer,
         max_seq_length=builder_args.max_seq_length,
+        support_tensor_subclass=output_dso_path is None,
     )
     model_to_pte = model
     model_to_dso = model
@@ -143,6 +144,7 @@ def main(args):
         model_to_dso = _initialize_model(
             builder_args,
             quantize,
+            support_tensor_subclass=False,
         )
         _unset_gguf_kwargs(builder_args)

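The expression output_dso_path is None encodes the policy in one place: only the AOT Inductor (DSO) export path opts out of tensor subclasses. A hedged restatement of that decision, using the variable names from the diff:

# No DSO export requested (eager, compile, or PTE-only): subclasses are
# supported, so quantize_model leaves them wrapped.
# DSO export requested: the flag is False, and quantize_model calls
# unwrap_tensor_subclass(model) before the exporter sees the model.
support_tensor_subclass = output_dso_path is None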
10 changes: 5 additions & 5 deletions generate.py
@@ -199,9 +199,9 @@ def __init__(
         # global print
         # from tp import maybe_init_dist
         # rank = maybe_init_dist()
-        # use_tp = False
+        # use_distributed = False
         self.rank: Optional[int] = None
-        # if use_tp:
+        # if use_distributed:
         #     if rank != 0:
         #         # only print on rank 0
         #         print = lambda *args, **kwargs: None
@@ -655,7 +655,7 @@ def chat(
         )
         if generator_args.compile:
             if (
-                self.is_speculative and self.builder_args.use_tp
+                self.is_speculative and self.builder_args.use_distributed
             ):  # and ("cuda" in builder_args.device):
                 torch._inductor.config.triton.cudagraph_trees = (
                     False  # Bug with cudagraph trees in this case
@@ -783,7 +783,7 @@ def callback(x, *, done_generating=False):
             )

             if (i != generator_args.num_samples - 1 or not self.profile) or (
-                self.builder_args.use_tp and self.rank != 0
+                self.builder_args.use_distributed and self.rank != 0
             ):
                 import contextlib

@@ -820,7 +820,7 @@ def callback(x, *, done_generating=False):
             )
             compilation_time = time.perf_counter() - t0
             if hasattr(prof, "export_chrome_trace"):
-                if self.builder_args.use_tp:
+                if self.builder_args.use_distributed:
                     prof.export_chrome_trace(f"{self.profile}_rank_{self.rank}.json")
                 else:
                     prof.export_chrome_trace(f"{self.profile}.json")
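The generate.py changes are a mechanical rename of use_tp to use_distributed. The commented-out block near line 199 sketches rank-0-only printing; a standalone version of that pattern might look like the following (an illustration, not torchchat's actual code):

import torch.distributed as dist

rank = dist.get_rank() if dist.is_initialized() else 0
if rank != 0:
    # Silence print on non-zero ranks so multi-process runs emit output once.
    print = lambda *args, **kwargs: None  # noqa: E731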
4 changes: 2 additions & 2 deletions install_requirements.sh
@@ -47,7 +47,7 @@ fi
 # NOTE: If a newly-fetched version of the executorch repo changes the value of
 # NIGHTLY_VERSION, you should re-run this script to install the necessary
 # package versions.
-NIGHTLY_VERSION=dev20240728
+NIGHTLY_VERSION=dev20240814

 # Uninstall triton, as nightly will depend on pytorch-triton, which is one and the same
 (
@@ -82,7 +82,7 @@ REQUIREMENTS_TO_INSTALL=(
 # TODO: Remove this and install nightly build, once it supports macos
 (
   set -x
-  $PIP_EXECUTABLE install git+https://github.com/pytorch/ao.git@d477c0e59b458b5617dcb3e999290a87df3070d8
+  $PIP_EXECUTABLE install git+https://github.com/pytorch/ao.git@e11201a62669f582d81cdb33e031a07fb8dfc4f3
 )
 if [[ -x "$(command -v nvidia-smi)" ]]; then
   (
11 changes: 9 additions & 2 deletions quantization/quantize.py
@@ -50,7 +50,13 @@
 ### torchchat quantization API ###


-def quantize_model(model: nn.Module, device, quantize_options, tokenizer=None):
+def quantize_model(
+    model: nn.Module,
+    device,
+    quantize_options,
+    tokenizer=None,
+    support_tensor_subclass: bool = True,
+):
     """
     Quantize the specified model using the quantizers described by
     a quantization dict of the form:
@@ -74,7 +80,8 @@ def quantize_model(model: nn.Module, device, quantize_options, tokenizer=None):
         # Use tensor subclass API for int4 weight only.
         if device == "cuda" and quantizer == "linear:int4":
             quantize_(model, int4_weight_only(q_kwargs["groupsize"]))
-            unwrap_tensor_subclass(model)
+            if not support_tensor_subclass:
+                unwrap_tensor_subclass(model)
             continue
         # Use dtype precision specified in user config, else fallback on global precision.
         if "precision" in quantize_options:
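The behavioral change here: after int4 weight-only quantization, unwrap_tensor_subclass now runs only when the caller cannot handle tensor subclasses (the DSO export path), instead of unconditionally. A self-contained sketch of the torchao calls involved, assuming a torchao build close to the pinned commit; the toy model and group size are illustrative:

import torch
import torch.nn as nn
from torchao.quantization import int4_weight_only, quantize_
from torchao.utils import unwrap_tensor_subclass

model = nn.Sequential(nn.Linear(256, 256)).to(device="cuda", dtype=torch.bfloat16)

# Swap Linear weights for int4 weight-only quantized tensor subclasses.
quantize_(model, int4_weight_only(group_size=128))

# Eager and torch.compile can run the subclasses directly; exporters that
# cannot (AOT Inductor at the time of this commit) flatten them back into
# plain parameters first.
support_tensor_subclass = False  # what the DSO export path now passes
if not support_tensor_subclass:
    unwrap_tensor_subclass(model)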
3 changes: 3 additions & 0 deletions torchchat/README.md
@@ -0,0 +1,3 @@
+# Chat with LLMs Everywhere
+
+This directory is a WIP path that will host most of the files currently living in root
