Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 115 additions & 0 deletions examples/text-generation/evaluation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
import argparse
from transformers import AutoTokenizer
import nltk
import evaluate
import numpy as np
import json

###################### Habana internal code ##################################
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you put proper head?

ACC_TARGET = {"rouge1": 44.4312, "rouge2": 22.0352, "rougeL": 28.6162}

# See https://github.com/mlcommons/inference/pull/1583
##############################################################################

def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("--checkpoint-path", default="/mnt/weka/data/pytorch/llama2/Llama-2-70b-chat-hf",
help="Path to Llama2-70b-hf-chat checkpoint")
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please remove the current default path and use None instead.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This argument shouldn't have a default, and it should probably have required=True. Also, we could call it --model_name_or_path (because it doesn't have to be a checkpoint on disk; it could be a Hugging Face model that gets downloaded),
like here: https://github.com/huggingface/optimum-habana/blob/main/examples/text-generation/run_generation.py#L48

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we have used this from llama mlperf submission, and this rouge eval works only with this checkpoint, it can't be any other checkpoint. Therefore it is put as default since we have tested with only this. also it helps user to run with the correct file.

parser.add_argument("--accuracy-file", default="output/accuracy.json", help="path to accuracy.json")
parser.add_argument("--dataset-file", default="/mnt/weka/data/mlperf_inference/llama2/processed-data.pkl",
help="path to processed openorca validation set")
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

remove the default as this

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we should specify what the file format and data contents should be, either in the help message or at least as a comment somewhere.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

default file name is to help running with correct dataset file since it is already preprocessed, no other dataset is used here. user may end up running with incorrect or may not find the correct dataset if removed.

parser.add_argument("--verbose", action="store_true",
help="verbose messages")
parser.add_argument("--dtype", default="int64",
help="dtype of the accuracy log", choices=["int32", "int64", "float"])
args = parser.parse_args()
return args


def get_groundtruth(processed_dataset_file):
    """Return the reference answers stored in the processed dataset pickle.

    Args:
        processed_dataset_file: Path to a pickle that ``pandas.read_pickle``
            can load and that can be indexed with the key ``'output'``.

    Returns:
        Whatever the loaded object holds under ``'output'``.
    """
    import pandas as pd

    # NOTE: unpickling can execute arbitrary code; only pass trusted files.
    return pd.read_pickle(processed_dataset_file)['output']

def postprocess_text(preds, targets):
    """Normalise predictions and references before ROUGE scoring.

    Each text is stripped of surrounding whitespace and then re-joined with
    one sentence per line, as split by ``nltk.sent_tokenize``.

    Args:
        preds: Iterable of predicted strings.
        targets: Iterable of reference strings.

    Returns:
        A ``(preds, targets)`` tuple of two lists of processed strings.
    """

    def _one_sentence_per_line(text):
        # rougeLSum expects newline after each sentence
        return "\n".join(nltk.sent_tokenize(text.strip()))

    cleaned_preds = [_one_sentence_per_line(text) for text in preds]
    cleaned_targets = [_one_sentence_per_line(text) for text in targets]
    return cleaned_preds, cleaned_targets


def main():

args = get_args()
checkpoint_path = args.checkpoint_path
metric = evaluate.load("rouge")
nltk.download('punkt')

tokenizer = AutoTokenizer.from_pretrained(
checkpoint_path,
model_max_length=2048,
padding_side="left",
use_fast=False,)


targets = get_groundtruth(args.dataset_file)

target_required = []
preds_token_ids = []

eval_dtype = np.int64
if args.dtype == "int32":
eval_dtype = np.int32
elif args.dtype == "float":
eval_dtype = np.float32
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

minor:
eval_dtype = {"int32": np.int32, "float": np.float32, "int64": np.int64}[args.dtype]


with open(args.accuracy_file, "r") as f:
results = json.load(f)

seen = set()
gen_tok_len = 0
for pred in results:
qsl_idx = pred['qsl_idx']
if qsl_idx in seen:
continue

seen.add(qsl_idx)
target = targets[qsl_idx]
target_required.append(target)
pred = np.frombuffer( bytes.fromhex(pred['data']), eval_dtype)

gen_tok_len += len(pred)
preds_token_ids.append(pred)

preds_decoded_text = tokenizer.batch_decode(
preds_token_ids, skip_special_tokens=True)

preds, targets = postprocess_text(preds_decoded_text, target_required)

result = metric.compute(
predictions=preds, references=targets, use_stemmer=True, use_aggregator=False)
result = {k: round(np.mean(v) * 100, 4) for k, v in result.items()}
prediction_lens = [len(pred) for pred in preds]
gen_num = len(preds)

acc = [result[key] / ACC_TARGET[key] for key in ACC_TARGET]
acc = round(np.min(acc) * 100, 2)


result = {**result,
'gen_len': np.sum(prediction_lens),
'gen_num': gen_num,
'accuracy': acc # this is Habana internal field
}

print("\nResults\n")
print(result)


if __name__ == "__main__":
main()
4 changes: 4 additions & 0 deletions examples/text-generation/requirements_lm_eval.txt
Original file line number Diff line number Diff line change
@@ -1 +1,5 @@
https://github.com/EleutherAI/lm-evaluation-harness/archive/0bf683b4e6a9df359b3156ba9ba8d62bdd47e0c0.zip
evaluate
rouge_score
accelerate
pandas
168 changes: 166 additions & 2 deletions examples/text-generation/run_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@
import time
from itertools import cycle
from pathlib import Path

import pandas as pd
import struct
import contextlib
import torch
from utils import adjust_batch, count_hpu_graphs, finalize_quantization, initialize_model

Expand Down Expand Up @@ -85,6 +87,12 @@ def setup_parser(parser):
type=str,
help="Optional argument if you want to assess your model on a given dataset of the HF Hub.",
)
parser.add_argument(
"--dataset",
default="/mnt/weka/data/mlperf_inference/llama2/processed-data.pkl",
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

remove default, and put None, but do a check later

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same as above

type=str,
help="path of the dataset to run rouge evaluation and measurement for rouge",
)
parser.add_argument(
"--column_name",
default=None,
Expand Down Expand Up @@ -341,8 +349,164 @@ def main():
use_lazy_mode = False

import habana_frameworks.torch.hpu as torch_hpu
if args.dataset_name == "openorca":
# Benchmark over the prompts below
def get_ds(args):
    """Load and return the pickled dataset found at ``args.dataset``."""
    return pd.read_pickle(args.dataset)


def get_input(ds, batch_size):
    """Build left-padded, fixed-width model inputs from pre-tokenized prompts.

    Args:
        ds: Dataset exposing a ``tok_input`` column whose ``tolist()`` yields
            one sequence of token ids per prompt.
        batch_size: Number of prompts per batch; the final batch may hold
            fewer prompts.

    Returns:
        A list with one dict per batch. Each dict holds int32 ``input_ids``
        and ``attention_mask`` tensors in which every row is
        ``args.max_input_tokens`` long: zeros on the left, then the prompt
        (ids) or ones (mask).

    Raises:
        ValueError: If a prompt is longer than ``args.max_input_tokens``.
    """
    width = args.max_input_tokens
    prompts = ds["tok_input"].tolist()
    queries = []
    for start in range(0, len(ds), batch_size):
        input_ids = []
        attention_mask = []
        for offset, query in enumerate(prompts[start:start + batch_size]):
            pad = width - len(query)
            if pad < 0:
                # A negative repeat count would silently add no padding and
                # leave this row wider than the fixed width that the caller
                # later uses to strip the prompt from the generated output.
                raise ValueError(
                    f"Prompt {start + offset} has {len(query)} tokens, which exceeds "
                    f"max_input_tokens={width}"
                )
            input_ids.append([0] * pad + list(query))
            attention_mask.append([0] * pad + [1] * len(query))
        queries.append({
            'input_ids': torch.tensor(input_ids, dtype=torch.int32),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.int32),
        })
    return queries

ds = get_ds(args)
input_sentences = get_input(ds, args.batch_size)

def generate(input_tokens, size=None, reduce_recompile=False):
"""Generates sequences from the input sentences and returns them."""

t0 = time.perf_counter()
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please note this for evaluation and not for latency test still existing behaviour is retained

print(f"Step4+ starting time is {t0*1000}", flush=True)
if size is not None:
input_tokens = adjust_batch(input_tokens, size)

if not reduce_recompile:
# Move inputs to target device(s)
for t in input_tokens:
if torch.is_tensor(input_tokens[t]):
input_tokens[t] = input_tokens[t].to(args.device)

outputs = model.generate(
**input_tokens,
generation_config=generation_config,
lazy_mode=use_lazy_mode,
hpu_graphs=args.use_hpu_graphs,
profiling_steps=args.profiling_steps,
profiling_warmup_steps=args.profiling_warmup_steps,
).cpu()
outputs = outputs.tolist()
for i in range(len(outputs)):
outputs[i] = outputs[i][args.max_input_tokens:]
duration = time.perf_counter() - t0
print(f"Total E2E time of this batch is {duration:.3f}s", flush=True)
return outputs
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a LOT of code duplication, which can cause errors later, e.g. by forgetting to propagate changes to both branches.

For example, you could reuse the older "generate" function, just by adding:

def generate(input_tokens=None, size=None, reduce_recompile=False):
.....
    if input_tokens is None:  # ADDING THIS
				# Tokenization
				if args.max_input_tokens > 0:
					input_tokens = tokenizer.batch_encode_plus(
						input_sentences,
						return_tensors="pt",
						padding="max_length",
						max_length=args.max_input_tokens,
						truncation=True,
					)
				else:
					input_tokens = tokenizer.batch_encode_plus(input_sentences, return_tensors="pt", padding=True)

And you could perform the output truncation outside generate:

for i in range(len(outputs)):
                outputs[i] = outputs[i][args.max_input_tokens:]

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please note we need to run the measurement also, we need other functionalities from this file therefore existing behaviours are retained, if any code duplication can be handled as part of later cleanup/code refactoring.


from optimum.habana.utils import HabanaProfile

# compilation stage disable profiling
HabanaProfile.disable()
# Compilation
logger.info("Graph compilation...")
dyn_prompt_lens = args.simulate_dyn_prompt
t0 = time.perf_counter()
# The first three iterations take longer because of graph compilation
if dyn_prompt_lens is None or len(set(dyn_prompt_lens)) == 1:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there any difference between this code and the already existing code in the else branch (except for the call to generate)? If not, let's not duplicate it. You can just call generate differently:

generate(input_sentences[0] if args.dataset_name == "openorca" else None, dyn_prompt_lens[0], args.reduce_recompile)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

there are difference related the data loading from different datasets and invoking the generate functions on openorca datasets in the for loop

for _ in range(args.warmup):
if dyn_prompt_lens is None:
print("Warming up", flush=True)
generate(input_sentences[0], None, args.reduce_recompile)
else:
print("Warming up for shape,", dyn_prompt_lens[0], flush=True)
generate(input_sentences[0], dyn_prompt_lens[0], args.reduce_recompile)
else:
if args.bucket_size > 0:
mn = min(dyn_prompt_lens)
mx = max(dyn_prompt_lens)

def rounder(x):
return int(math.ceil(x / args.bucket_size) * args.bucket_size)

if args.dataset_name is None:
min_prompt_len = rounder(mn)
max_sentence_len = rounder(mx)
for _ in range(args.warmup):
lst = list(range(min_prompt_len, max_sentence_len + 1, args.bucket_size))
for sz in lst:
print("Warming up for shape,", sz - 1, flush=True)
generate(input_sentences[0], sz - 1, args.reduce_recompile)
torch_hpu.synchronize()
compilation_duration = time.perf_counter() - t0
HabanaProfile.enable()
total_new_tokens_generated = 0
logger.info("Running generate...")
t0 = time.perf_counter()
# Benchmark over n_iterations iterations
N = len(input_sentences)
if dyn_prompt_lens is None:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the purpose of the dataset here? I suppose it is for accuracy eval. Then Why warmup? Once you have gone thru the dataset once and collected the sentences for accuracy, you dont need to go over the dataset n_iter times again, as far as I understand.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Similarly, the whole apparatus of dynamic-prompt warmup etc. is probably also not used for the orca eval? In that case we should delete all these extraneous if-elses that the dynamic prompt handling gives rise to.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

two different datasets one for generating measurement and one for running the quantization for evaluation. Please note here number of iteration is 1 for running this evaluation, we have already specified the command for running the evaluation, I have kept to retain the n_itr argument. dynamic path is yet no tested for rouge eval, need to check with validation team else it can be clean up as part of code refactoring.

for i in range(args.n_iterations):
results = []
b = 1
for sentence in input_sentences:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Minor: More pythonic to write:
for b, sentence in enumerate(input_sentences)
then we can remove the b=1, b+=1 lines

generated = generate(sentence, None, args.reduce_recompile)
results.extend(generated)
print(f"Generatig batch {b}/{N}")
b +=1
else:
repeated_prompt_len = cycle(dyn_prompt_lens)
for i in range(args.n_iterations):
prompt_len = next(repeated_prompt_len)
print("Generating for shape,", prompt_len)
results = []
for sentence in input_sentences:
generated = generate(sentence, prompt_len, args.reduce_recompile)
results.extend(generated)
duration = time.perf_counter() - t0
total_new_tokens_generated = args.n_iterations * args.batch_size * args.max_new_tokens
throughput = total_new_tokens_generated / duration

# Store results if necessary
if args.output_dir is not None and args.global_rank == 0:
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)

#TODO dump in hex format
acc_file = []
num_token = 0
for i, idx in enumerate(ds.index):
pred = results[i]
eos_token_id = 2
try:
ind_eos = pred.index(eos_token_id)+1
except:
ind_eos = len(pred)
pred = pred[:ind_eos]
num_token += len(pred)
acc_file.append({
"seq_id": idx,
"qsl_idx": idx,
"data": bytes(struct.pack('L' * len(pred), *pred)).hex().upper()
})
with open(output_dir / "accuracy.json", "w") as outfile:
outfile.write(json.dumps(acc_file))

stats = f"Throughput (including tokenization) = {throughput} tokens/second"
stats = stats + f"\nNumber of HPU graphs = {count_hpu_graphs()}"
separator = "-" * len(stats)
print()
print("Stats:")
print(separator)
print(stats)
mem = get_hpu_memory_stats()
for k, v in mem.items():
print("{:35} = {} GB".format(k[:-5].replace("_", " ").capitalize(), v))
print(f"Graph compilation duration = {compilation_duration} seconds")
print(separator)
print()
elif args.dataset_name is None:
# Benchmark over the prompts below
if args.prompt:
input_sentences = args.prompt
Expand Down