@@ -81,7 +81,7 @@ def send_batch_request(endpoint, prompts, gen_tokens, request_id):
     }
     data = {"text": prompts, "sampling_params": sampling_params}
 
-    start_time = time.time()
+    start_time = time.perf_counter()
     try:
         response = requests.post(
             endpoint.base_url + "/generate", json=data, timeout=3600
@@ -90,7 +90,7 @@ def send_batch_request(endpoint, prompts, gen_tokens, request_id):
             error = response.json()
             raise RuntimeError(f"Request {request_id} failed: {error}")
         result = response.json()
-        elapsed_time = (time.time() - start_time) * 1000  # Convert to ms
+        elapsed_time = (time.perf_counter() - start_time) * 1000  # Convert to ms
         avg_per_prompt = elapsed_time / len(prompts) if prompts else 0
         return request_id, elapsed_time, avg_per_prompt, True, len(prompts)
     except Exception as e:
@@ -104,7 +104,7 @@ def run_benchmark(endpoint, batched_prompts, batch_size, gen_tokens):
     num_requests = len(batched_prompts)
 
     # Record start time for total latency
-    benchmark_start_time = time.time()
+    benchmark_start_time = time.perf_counter()
 
     for i, batch_prompts in enumerate(batched_prompts):
         request_id = i + 1
@@ -119,7 +119,7 @@ def run_benchmark(endpoint, batched_prompts, batch_size, gen_tokens):
         results.append(result)
 
     # Calculate total latency
-    total_latency = (time.time() - benchmark_start_time) * 1000  # Convert to ms
+    total_latency = (time.perf_counter() - benchmark_start_time) * 1000  # Convert to ms
 
     return results, total_latency
 
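For context on the clock swap above: time.time() reads the system wall clock, which can be adjusted (e.g. by NTP) while a request is in flight, whereas time.perf_counter() is a monotonic, high-resolution clock meant for measuring intervals. A minimal standalone sketch of the same timing pattern, separate from the benchmark script (the helper name time_request is hypothetical):

import time

def time_request(fn, *args, **kwargs):
    """Run fn and return (result, elapsed_ms) using a monotonic clock.

    perf_counter() is preferred over time() for durations: it cannot
    jump backwards if the system clock is adjusted mid-measurement.
    """
    start = time.perf_counter()
    result = fn(*args, **kwargs)
    elapsed_ms = (time.perf_counter() - start) * 1000  # seconds -> ms
    return result, elapsed_ms

if __name__ == "__main__":
    _, ms = time_request(time.sleep, 0.05)
    print(f"slept for {ms:.1f} ms")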