Commit 592a307

mikekgfb authored and malfet committed

add gguf path option and gguf tests (pytorch#168)

* add gguf path option and gguf tests
* tokenizer path for GGUF
* tab -> spc
* GGUF_PATH definition
* pip install gguf

1 parent d071d18 · commit 592a307

File tree

6 files changed: +117 -7 lines changed

.github/workflows/compile-gguf.yml
cli.py
eval.py
export.py
generate.py
model.py

.github/workflows/compile-gguf.yml (+80)

@@ -0,0 +1,80 @@
+name: Compile main using GGUF
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+  workflow_dispatch:
+
+jobs:
+  run-tinystories:
+    strategy:
+      matrix:
+        runner: [ubuntu-latest, macos-14]
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v2
+      - name: Setup Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.11
+      - name: Print machine info
+        run: |
+          uname -a
+          if [ $(uname -s) == Darwin ]; then
+            sysctl machdep.cpu.brand_string
+            sysctl machdep.cpu.core_count
+          fi
+      - name: Install requirements
+        run: |
+          pip install gguf
+          pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
+          pip install -r requirements.txt
+      - name: Download GGUF
+        run: |
+          mkdir gguf_files
+          export GGUF_PATH=gguf_files/llama-2-7b.Q4_0.gguf
+          export TOKENIZER_PATH=gguf_files/tokenizer.model
+          wget -O ${GGUF_PATH} "https://huggingface.co/TheBloke/Llama-2-7B-GGUF/resolve/main/llama-2-7b.Q4_0.gguf?download=true"
+          wget -O ${TOKENIZER_PATH} https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
+      - name: Run inference
+        run: |
+          export GGUF_PATH=gguf_files/llama-2-7b.Q4_0.gguf
+          export TOKENIZER_PATH=gguf_files/tokenizer.model
+          export MODEL_NAME=llama-2-7b.Q4_0.gguf
+          export MODEL_DIR=/tmp
+          python generate.py --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --temperature 0 > ./output_eager
+          cat ./output_eager
+          python generate.py --compile --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --temperature 0 > ./output_compiled
+          cat ./output_compiled
+          python export.py --gguf-path ${GGUF_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
+          python generate.py --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
+          cat ./output_aoti
+
+          echo "******************************************"
+          echo "******* Emb: channel-wise quantized ******"
+          echo "******************************************"
+          python generate.py --quant '{"embedding" : {"bitwidth": 8, "group_size": 0}}' --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --temperature 0 > ./output_eager
+          cat ./output_eager
+          python generate.py --compile --quant '{"embedding" : {"bitwidth": 8, "group_size": 0}}' --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --temperature 0 > ./output_compiled
+          cat ./output_compiled
+          python export.py --quant '{"embedding" : {"bitwidth": 8, "group_size": 0}}' --gguf-path ${GGUF_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
+          python generate.py --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
+          cat ./output_aoti
+
+          echo "******************************************"
+          echo "******** Emb: group-wise quantized *******"
+          echo "******************************************"
+          python generate.py --quant '{"embedding" : {"bitwidth": 8, "group_size": 8}}' --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --temperature 0 > ./output_eager
+          cat ./output_eager
+          python generate.py --compile --quant '{"embedding" : {"bitwidth": 8, "group_size": 8}}' --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --temperature 0 > ./output_compiled
+          cat ./output_compiled
+          python export.py --quant '{"embedding" : {"bitwidth": 8, "group_size": 8}}' --gguf-path ${GGUF_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
+          python generate.py --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
+          cat ./output_aoti
+
+          echo "tests complete"
+          echo "******************************************"
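The job exercises three execution paths against the same GGUF file: eager generate.py, torch.compile via --compile, and an AOT Inductor shared object produced by export.py. The --quant flag it passes is a JSON object keyed by quantization scheme. Below is a minimal sketch of how such a string decomposes once parsed; the dispatch loop is hypothetical and stands in for the repo's quantization code, which this diff does not show.

    import json

    # The --quant argument exactly as passed in the workflow above.
    quant_arg = '{"embedding" : {"bitwidth": 8, "group_size": 0}}'

    # json.loads turns the CLI string into a dict mapping scheme -> options.
    quant_config = json.loads(quant_arg)

    for scheme, options in quant_config.items():
        # Hypothetical dispatch; the real handlers are elsewhere in the repo.
        # group_size 0 is the channel-wise run above (one scale per channel),
        # group_size 8 the group-wise run (one scale per 8 elements).
        print(f"quantize {scheme}: bitwidth={options['bitwidth']}, "
              f"group_size={options['group_size']}")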

cli.py (+6)

@@ -152,6 +152,12 @@ def cli_args():
         default=None,
         help="Parameter file path.",
     )
+    parser.add_argument(
+        "--gguf-path",
+        type=Path,
+        default=None,
+        help="GGUF file path.",
+    )
     parser.add_argument(
         "--tokenizer-path",
         type=Path,
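Downstream code reads the parsed value as args.gguf_path, relying on argparse's hyphen-to-underscore conversion. A self-contained sketch of just the new flag, with parser construction local to this example:

    import argparse
    from pathlib import Path

    # Mirror of the add_argument call above, in isolation.
    parser = argparse.ArgumentParser()
    parser.add_argument("--gguf-path", type=Path, default=None, help="GGUF file path.")

    args = parser.parse_args(["--gguf-path", "gguf_files/llama-2-7b.Q4_0.gguf"])
    # argparse exposes --gguf-path as the attribute gguf_path, which is what
    # eval.py, export.py, and generate.py read in the hunks below.
    assert args.gguf_path == Path("gguf_files/llama-2-7b.Q4_0.gguf")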

eval.py (+2 -1)

@@ -215,8 +215,8 @@ def eval_main(args) -> None:
     checkpoint_dir = args.checkpoint_dir
     params_path = args.params_path
     params_table = args.params_table
+    gguf_path = args.gguf_path
     tokenizer_path = args.tokenizer_path
-    params_path = args.params_path
     dso_path = args.dso_path
     pte_path = args.pte_path
     quantize = args.quantize

@@ -241,6 +241,7 @@ def eval_main(args) -> None:
         checkpoint_dir,
         params_path,
         params_table,
+        gguf_path,
         dso_path,
         pte_path,
         quantize,

export.py (+1)

@@ -79,6 +79,7 @@ def main(args):
         args.checkpoint_dir,
         args.params_path,
         args.params_table,
+        args.gguf_path,
         device=device,
         precision=precision,
         use_tp=False

generate.py (+22 -6)

@@ -279,16 +279,19 @@ def _load_model(
     checkpoint_dir,
     params_path,
     params_table,
+    gguf_path,
     device,
     precision,
-    use_tp=False
+    use_tp  # =False
 ):
     use_cuda = "cuda" in device
     with torch.device("meta"):
         if params_path:
             model = Transformer.from_params(params_path)
         elif params_table:
-            model = Transformer.from_table(params_path)
+            model = Transformer.from_table(params_table)
+        elif gguf_path:
+            model = Transformer.from_gguf(gguf_path)
         else:
             model = Transformer.from_name(checkpoint_path.parent.name)

@@ -343,22 +346,29 @@ def _load_inference_model(
     checkpoint_dir,
     params_path,
     params_table,
+    gguf_path,
     dso_path,
     pte_path,
     quantize,
     device,
     precision,
-    use_tp=False
+    use_tp  # =False
 ):
     assert (
         (checkpoint_path and checkpoint_path.is_file()) or
+        (checkpoint_dir and checkpoint_dir.is_dir()) or
+        (gguf_path and gguf_path.is_file()) or
         (dso_path and Path(dso_path).is_file()) or
         (pte_path and Path(pte_path).is_file())
-    ), "need to specified a valid checkpoint path, DSO path, or PTE path"
+    ), "need to specify a valid checkpoint path, checkpoint dir, GGUF path, DSO path, or PTE path"
     assert not (dso_path and pte_path), "specify either DSO path or PTE path, but not both"

     if (checkpoint_path and (dso_path or pte_path)):
         print("Warning: checkpoint path ignored because an exported DSO or PTE path specified")
+    if (checkpoint_dir and (dso_path or pte_path)):
+        print("Warning: checkpoint dir ignored because an exported DSO or PTE path specified")
+    if (gguf_path and (dso_path or pte_path)):
+        print("Warning: GGUF path ignored because an exported DSO or PTE path specified")

     print("Loading model ...")
     t0 = time.time()

@@ -367,6 +377,7 @@ def _load_inference_model(
         checkpoint_dir,
         params_path,
         params_table,
+        gguf_path,
         device,
         precision,
         use_tp

@@ -423,6 +434,7 @@ def _main(
     checkpoint_dir: Optional[Path] = None,
     params_path: Optional[Path] = None,
     params_table: Optional[str] = None,
+    gguf_path: Optional[Path] = None,
     tokenizer_path: Optional[Path] = None,
     compile: bool = True,
     compile_prefill: bool = False,

@@ -463,6 +475,7 @@ def _main(
         checkpoint_dir,
         params_path,
         params_table,
+        gguf_path,
         dso_path,
         pte_path,
         quantize,

@@ -476,8 +489,10 @@ def _main(
     if is_speculative:
         draft_model = _load_model(
             draft_checkpoint_path,
-            None,
-            None,
+            None,  # checkpoint_dir
+            None,  # params_path
+            None,  # params_table
+            None,  # gguf_path
             device,
             precision,
             use_tp

@@ -619,6 +634,7 @@ def main(args):
         args.checkpoint_dir,
         args.params_path,
         args.params_table,
+        args.gguf_path,
         args.tokenizer_path,
         args.compile,
         args.compile_prefill,
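The branch added to _load_model gives params_path precedence over params_table, then gguf_path, and finally falls back to inferring the model from the checkpoint name. A condensed sketch of that selection order; pick_model_source is an illustrative helper, not part of the repo:

    from pathlib import Path
    from typing import Optional

    # Stand-in for the dispatch in _load_model: returns which constructor
    # would fire. The precedence is copied from the diff above.
    def pick_model_source(
        checkpoint_path: Optional[Path],
        params_path: Optional[Path],
        params_table: Optional[str],
        gguf_path: Optional[Path],
    ) -> str:
        if params_path:
            return f"Transformer.from_params({params_path})"
        elif params_table:
            return f"Transformer.from_table({params_table})"
        elif gguf_path:
            return f"Transformer.from_gguf({gguf_path})"
        else:
            return f"Transformer.from_name({checkpoint_path.parent.name})"

    # A GGUF path wins only when no params file or table is given.
    print(pick_model_source(None, None, None, Path("gguf_files/llama-2-7b.Q4_0.gguf")))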

model.py (+6)

@@ -234,6 +234,12 @@ def from_table(cls, name: str):
     def from_params(cls, params_path: str):
         return cls(ModelArgs.from_params(params_path))

+    @classmethod
+    def from_gguf(cls, gguf_path: str):
+        from gguf_loader import load_llama_from_gguf_file
+        model = load_llama_from_gguf_file(gguf_path)
+        return model
+

 class TransformerBlock(nn.Module):
     def __init__(self, config: ModelArgs) -> None:
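With the classmethod in place, constructing a model from a GGUF file is a single call. A hedged usage sketch, assuming model.py and gguf_loader.py are importable from the repo root and the file downloaded by the workflow is present:

    from pathlib import Path

    from model import Transformer  # repo-root import, as generate.py uses it

    # Build the model straight from the quantized GGUF file the CI job downloads.
    model = Transformer.from_gguf(Path("gguf_files/llama-2-7b.Q4_0.gguf"))
    print(type(model).__name__, "loaded from GGUF")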
