Skip to content

Commit

Permalink
CTranslate2 Blog
Browse files Browse the repository at this point in the history
---------

Co-authored-by: ffloresy <[email protected]>
Co-authored-by: Eliot Li <[email protected]>
Co-authored-by: vbayanag <[email protected]>
Co-authored-by: Jeffrey Novotny <[email protected]>
Co-authored-by: Danny213123 <[email protected]>
  • Loading branch information
6 people committed Oct 24, 2024
1 parent a1aa665 commit c9a603f
Show file tree
Hide file tree
Showing 9 changed files with 634 additions and 0 deletions.
494 changes: 494 additions & 0 deletions blogs/artificial-intelligence/ctranslate2/README.md

Large diffs are not rendered by default.

15 changes: 15 additions & 0 deletions blogs/artificial-intelligence/ctranslate2/src/gpt2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
"""Generate text with a CTranslate2-converted GPT-2 model on the GPU."""
import ctranslate2
import transformers

tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")
generator = ctranslate2.Generator("gpt2_ct2", device="cuda")

# Unconditional generation: seed the model with only the BOS token.
prompt = [tokenizer.bos_token]
outputs = generator.generate_batch([prompt], max_length=30, sampling_topk=10)
print(tokenizer.decode(outputs[0].sequences_ids[0]))

# Conditional generation: seed the model with the tokenized prefix "It is".
prompt = tokenizer.convert_ids_to_tokens(tokenizer.encode("It is"))
outputs = generator.generate_batch([prompt], max_length=30, sampling_topk=10)
print(tokenizer.decode(outputs[0].sequences_ids[0]))
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
"""Transcribe a 30-second audio window with a CTranslate2 Whisper model."""
import ctranslate2
import librosa
import transformers

# Load and resample the audio file.
waveform, _ = librosa.load("src/sample2.flac", sr=16000, mono=True)

# Compute the log-mel input features for the first 30 seconds of audio.
processor = transformers.WhisperProcessor.from_pretrained("openai/whisper-tiny")
encoded = processor(waveform, return_tensors="np", sampling_rate=16000)
features = ctranslate2.StorageView.from_array(encoded.input_features)

# Load the model on GPU.
model = ctranslate2.models.Whisper("whisper-tiny-ct2", device="cuda")

# Detect the most probable language of the audio.
language, probability = model.detect_language(features)[0][0]
print("Detected language %s with probability %f" % (language, probability))

# Describe the task in the prompt.
# See the prompt format in https://github.com/openai/whisper.
prompt = processor.tokenizer.convert_tokens_to_ids(
    [
        "<|startoftranscript|>",
        language,
        "<|transcribe|>",
        "<|notimestamps|>",  # Remove this token to generate timestamps.
    ]
)

# Run generation for the 30-second window and print the transcription.
outputs = model.generate(features, [prompt])
print(processor.decode(outputs[0].sequences_ids[0]))
15 changes: 15 additions & 0 deletions blogs/artificial-intelligence/ctranslate2/src/translate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
"""Translate an English sentence to German with a CTranslate2 model."""
import ctranslate2
import sentencepiece as spm

sp = spm.SentencePieceProcessor("sentencepiece.model")
translator = ctranslate2.Translator("ende_ctranslate2/", device="cuda")

# Tokenize the source sentence into SentencePiece subword pieces.
input_text = "Good Morning!"
source_tokens = sp.encode(input_text, out_type=str)

# Translate, take the best hypothesis, and detokenize it.
best_hypothesis = translator.translate_batch([source_tokens])[0].hypotheses[0]

print(sp.decode(best_hypothesis))
54 changes: 54 additions & 0 deletions blogs/artificial-intelligence/ctranslate2/src/translate_compare.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
"""Compare latency and throughput of float32 vs. int8 CTranslate2 models."""
import ctranslate2
import sentencepiece as spm
import time

# Load the SentencePiece model.
sp = spm.SentencePieceProcessor(model_file="sentencepiece.model")

# Input text to translate.
input_text = "Hello world!"
input_tokens = sp.encode(input_text, out_type=str)

# Function to perform translation and measure latency and tokens per second
def translate_and_time(translator, warmup=1):
    """Translate ``input_tokens`` with *translator* and measure performance.

    Args:
        translator: A ``ctranslate2.Translator`` instance.
        warmup: Number of untimed runs before measuring (default 1). The
            first call on a freshly loaded model pays one-time costs (CUDA
            context setup, kernel compilation/caching); excluding them keeps
            the float32-vs-int8 comparison fair for whichever model runs
            first.

    Returns:
        Tuple of (output_text, latency_in_seconds, tokens_per_second).
    """
    # Untimed warm-up run(s) so one-time initialization does not skew timing.
    for _ in range(warmup):
        translator.translate_batch([input_tokens])

    # perf_counter is monotonic and high-resolution, unlike time.time().
    start_time = time.perf_counter()
    results = translator.translate_batch([input_tokens])
    latency = time.perf_counter() - start_time

    # Decode the translated tokens
    output_tokens = results[0].hypotheses[0]
    output_text = sp.decode(output_tokens)

    # Calculate tokens per second
    tokens_per_second = len(output_tokens) / latency

    return output_text, latency, tokens_per_second

# Load the default (float32) model
translator_float32 = ctranslate2.Translator(
    "ende_ctranslate2/", device="cuda", compute_type="float32"
)
output_text_float32, latency_float32, tps_float32 = translate_and_time(translator_float32)

# Load the int8 quantized model
translator_int8 = ctranslate2.Translator(
    "ende_ctranslate2_int8/", device="cuda", compute_type="int8"
)
output_text_int8, latency_int8, tps_int8 = translate_and_time(translator_int8)

# Print the results
print("Default (float32) model translation:")
print(f"Output: {output_text_float32}")
print(f"Latency: {latency_float32:.4f} seconds")
print(f"Tokens per second: {tps_float32:.2f}\n")

print("Int8 quantized model translation:")
print(f"Output: {output_text_int8}")
print(f"Latency: {latency_int8:.4f} seconds")
print(f"Tokens per second: {tps_int8:.2f}\n")

# Calculate the speedup in tokens per second
speedup_tps = tps_int8 / tps_float32
print(f"Speedup in tokens per second with int8 quantization: {speedup_tps:.2f}x faster")
11 changes: 11 additions & 0 deletions blogs/authors/michael-zhang.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<head>
<meta charset="UTF-8">
<meta name="description" content="Michael Zhang">
<meta name="keywords" content="AMD GPU, MI300, MI250, ROCm, blog, contributor, blog author">
</head>

(mzhang)=

# Michael Zhang

Michael is a Machine Learning Engineer at AMD. Michael specializes in generative AI, large language models (LLMs), computer vision, autonomous driving, and robotics. He has published 10+ papers in top AI conferences and journals and has a Google Scholar citation count of over 650 as of October 2024. He holds a master's degree in Computer Engineering from the University of Illinois at Urbana-Champaign (UIUC).
4 changes: 4 additions & 0 deletions blogs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,10 @@
"Maria Ruiz Varela",
"http://rocm.blogs.amd.com/authors/maria-ruiz-varela.html",
),
"Michael Zhang": (
"Michael Zhang",
"http://rocm.blogs.amd.com/authors/michael-zhang.html",
),
"Nicholas Curtis": (
"Nicholas Curtis",
"https://rocm.blogs.amd.com/authors/nicholas-curtis.html",
Expand Down
6 changes: 6 additions & 0 deletions blogs/contributor-bios.md
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,12 @@ Technical Marketing Engineer at AMD
Senior Member of Technical Staff at AMD
:::

:::{grid-item-card} [Michael Zhang](./authors/michael-zhang.md)
:padding: 1

Machine Learning Engineer at AMD
:::

:::{grid-item-card} [Nicholas Curtis](./authors/nicholas-curtis.md)
:padding: 1

Expand Down

0 comments on commit c9a603f

Please sign in to comment.