From 840aef04ad6b70891b4a9a000c73797e07e43ce5 Mon Sep 17 00:00:00 2001
From: Jack Gerrits
Date: Fri, 8 Mar 2024 11:42:05 -0500
Subject: [PATCH 1/3] Update notebook to use new function registration syntax

---
 ...eo_transcript_translate_with_whisper.ipynb | 389 +++++++++---------
 1 file changed, 185 insertions(+), 204 deletions(-)

diff --git a/notebook/agentchat_video_transcript_translate_with_whisper.ipynb b/notebook/agentchat_video_transcript_translate_with_whisper.ipynb
index f100933ff32e..ae36cc888992 100644
--- a/notebook/agentchat_video_transcript_translate_with_whisper.ipynb
+++ b/notebook/agentchat_video_transcript_translate_with_whisper.ipynb
@@ -1,20 +1,12 @@
 {
  "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "e4fccaaa-fda5-4f99-a4c5-c463c5c890f5",
-   "metadata": {},
-   "source": [
-    "\"Open"
-   ]
-  },
   {
    "cell_type": "markdown",
    "id": "a5b4540e-4987-4774-9305-764c3133e953",
    "metadata": {},
    "source": [
-    "\n",
-    "# Auto Generated Agent Chat: Translating Video audio using Whisper and GPT-3.5-turbo\n",
+    "# Translating Video Audio Using Whisper and GPT-3.5-turbo\n",
+    "\n",
     "In this notebook, we demonstrate how to use Whisper and GPT-3.5-turbo with `AssistantAgent` and `UserProxyAgent` to recognize and translate\n",
     "the speech from a video file and add timestamps like a subtitle file, based on [agentchat_function_call.ipynb](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_function_call.ipynb)\n"
    ]
@@ -25,27 +17,20 @@
    "metadata": {},
    "source": [
     "## Requirements\n",
-    "AutoGen requires `Python>=3.8`. To run this notebook example, please install `openai`, `pyautogen`, `whisper`, and `moviepy`:\n",
+    "\n",
+    "``````\n",
+    "````{=mdx}\n",
+    ":::info Requirements\n",
+    "Some extra dependencies are needed for this notebook, which can be installed via pip:\n",
+    "\n",
     "```bash\n",
-    "pip install openai\n",
-    "pip install openai-whisper\n",
-    "pip install moviepy\n",
-    "pip install pyautogen\n",
-    "```"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "bc4600b8-c6df-49dd-945d-ce69f30a65cc",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%%capture --no-stderr",
-    "# %pip install moviepy~=1.0.3\n",
-    "# %pip install openai-whisper~=20230918\n",
-    "# %pip install openai~=1.3.5\n",
-    "# %pip install \"pyautogen>=0.2.3\""
+    "pip install pyautogen openai openai-whisper\n",
+    "```\n",
+    "\n",
+    "For more information, please refer to the [installation guide](/docs/installation/).\n",
+    ":::\n",
+    "````\n",
+    "``````"
    ]
   },
@@ -59,19 +44,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
    "id": "26d1ae87-f007-4286-a56a-dcf68abf9393",
    "metadata": {},
    "outputs": [],
    "source": [
     "import os\n",
     "\n",
-    "import whisper\n",
-    "from moviepy.editor import VideoFileClip\n",
-    "from openai import OpenAI\n",
-    "\n",
-    "import autogen\n",
-    "\n",
     "config_list = [\n",
     "    {\n",
     "        \"model\": \"gpt-4\",\n",
@@ -85,11 +64,163 @@
    "id": "324fec65-ab23-45db-a7a8-0aaf753fe19c",
    "metadata": {},
    "source": [
+    "````{=mdx}\n",
+    ":::tip\n",
+    "Learn more about configuring LLMs for agents [here](/docs/topics/llm_configuration).\n",
+    ":::\n",
+    "````\n",
+    "\n",
     "## Example and Output\n",
     "Below is an example of speech recognition from a [Peppa Pig cartoon video clip](https://drive.google.com/file/d/1QY0naa2acHw2FuH7sY3c-g2sBLtC2Sv4/view?usp=drive_link) originally in English and translated into Chinese.\n",
     "FFmpeg does not support online files. To run the code on the example video, you need to download the example video locally. You can change `your_file_path` to your local video file path."
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c3e691b8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from typing import Annotated, List, Union\n",
+    "\n",
+    "import whisper\n",
+    "from openai import OpenAI\n",
+    "import autogen\n",
+    "\n",
+    "\n",
+    "source_language = \"English\"\n",
+    "target_language = \"Chinese\"\n",
+    "key = os.getenv(\"OPENAI_API_KEY\")\n",
+    "target_video = \"your_file_path\"\n",
+    "\n",
+    "assistant = autogen.AssistantAgent(\n",
+    "    name=\"assistant\",\n",
+    "    system_message=\"For coding tasks, only use the functions you have been provided with. Reply TERMINATE when the task is done.\",\n",
+    "    llm_config={\"config_list\": config_list, \"timeout\": 120},\n",
+    ")\n",
+    "\n",
+    "user_proxy = autogen.UserProxyAgent(\n",
+    "    name=\"user_proxy\",\n",
+    "    is_termination_msg=lambda x: x.get(\"content\", \"\") and x.get(\"content\", \"\").rstrip().endswith(\"TERMINATE\"),\n",
+    "    human_input_mode=\"NEVER\",\n",
+    "    max_consecutive_auto_reply=10,\n",
+    ")\n",
+    "\n",
+    "\n",
+    "def translate_text(input_text, source_language, target_language):\n",
+    "    client = OpenAI(api_key=key)\n",
+    "\n",
+    "    response = client.chat.completions.create(\n",
+    "        model=\"gpt-3.5-turbo\",\n",
+    "        messages=[\n",
+    "            {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
+    "            {\n",
+    "                \"role\": \"user\",\n",
+    "                \"content\": f\"Directly translate the following {source_language} text to a pure {target_language} \"\n",
+    "                f\"video subtitle text without additional explanation.: '{input_text}'\",\n",
+    "            },\n",
+    "        ],\n",
+    "        max_tokens=1500,\n",
+    "    )\n",
+    "\n",
+    "    # Access the response content\n",
+    "    translated_text = response.choices[0].message.content if response.choices else None\n",
+    "    return translated_text\n",
+    "\n",
+    "\n",
+    "@user_proxy.register_for_execution()\n",
+    "@assistant.register_for_llm(description=\"Use the translate_text function to translate the script.\")\n",
+    "def translate_transcript(\n",
+    "    source_language: Annotated[str, \"Source language\"], target_language: Annotated[str, \"Target language\"]\n",
+    ") -> str:\n",
+    "    with open(\"transcription.txt\", \"r\") as f:\n",
+    "        lines = f.readlines()\n",
+    "\n",
+    "    translated_transcript = []\n",
+    "\n",
+    "    for line in lines:\n",
+    "        # Split each line into timestamp and text parts\n",
+    "        parts = line.strip().split(\": \")\n",
+    "        if len(parts) == 2:\n",
+    "            timestamp, text = parts[0], parts[1]\n",
+    "            # Translate only the text part\n",
+    "            translated_text = translate_text(text, source_language, target_language)\n",
+    "            # Reconstruct the line with the translated text and the preserved timestamp\n",
+    "            translated_line = f\"{timestamp}: {translated_text}\"\n",
+    "            translated_transcript.append(translated_line)\n",
+    "        else:\n",
+    "            # If the line doesn't contain a timestamp, add it as is\n",
+    "            translated_transcript.append(line.strip())\n",
+    "\n",
+    "    return \"\\n\".join(translated_transcript)\n",
+    "\n",
+    "\n",
+    "@user_proxy.register_for_execution()\n",
+    "@assistant.register_for_llm(description=\"Recognize the speech from a video file and write the transcript to a text file.\")\n",
+    "def recognize_transcript_from_video(filepath: Annotated[str, \"path of the video file\"]) -> Union[List[dict], str]:\n",
+    "    try:\n",
+    "        # Load model\n",
+    "        model = whisper.load_model(\"small\")\n",
+    "\n",
+    "        # Transcribe audio with detailed timestamps\n",
+    "        result = model.transcribe(filepath, verbose=True)\n",
+    "\n",
+    "        # Initialize variables for transcript\n",
+    "        transcript = []\n",
+    "        sentence = \"\"\n",
+    "        start_time = 0\n",
+    "\n",
+    "        # Iterate through the segments in the result\n",
+    "        for segment in result[\"segments\"]:\n",
+    "            # If new sentence starts, save the previous one and reset variables\n",
+    "            if segment[\"start\"] != start_time and sentence:\n",
+    "                transcript.append(\n",
+    "                    {\n",
+    "                        \"sentence\": sentence.strip() + \".\",\n",
+    "                        \"timestamp_start\": start_time,\n",
+    "                        \"timestamp_end\": segment[\"start\"],\n",
+    "                    }\n",
+    "                )\n",
+    "                sentence = \"\"\n",
+    "                start_time = segment[\"start\"]\n",
+    "\n",
+    "            # Append the segment text to the current sentence\n",
+    "            sentence += segment[\"text\"] + \" \"\n",
+    "\n",
+    "        # Add the final sentence\n",
+    "        if sentence:\n",
+    "            transcript.append(\n",
+    "                {\n",
+    "                    \"sentence\": sentence.strip() + \".\",\n",
+    "                    \"timestamp_start\": start_time,\n",
+    "                    \"timestamp_end\": result[\"segments\"][-1][\"end\"],\n",
+    "                }\n",
+    "            )\n",
+    "\n",
+    "        # Save the transcript to a file\n",
+    "        with open(\"transcription.txt\", \"w\") as file:\n",
+    "            for item in transcript:\n",
+    "                sentence = item[\"sentence\"]\n",
+    "                start_time, end_time = item[\"timestamp_start\"], item[\"timestamp_end\"]\n",
+    "                file.write(f\"{start_time}s to {end_time}s: {sentence}\\n\")\n",
+    "\n",
+    "        return transcript\n",
+    "\n",
+    "    except FileNotFoundError:\n",
+    "        return \"The specified audio file could not be found.\"\n",
+    "    except Exception as e:\n",
+    "        return f\"An unexpected error occurred: {str(e)}\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "072de235",
+   "metadata": {},
+   "source": [
+    "Now, start the chat:"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 5,
@@ -206,180 +337,30 @@
     }
    ],
    "source": [
-    "def recognize_transcript_from_video(audio_filepath):\n",
-    "    try:\n",
-    "        # Load model\n",
-    "        model = whisper.load_model(\"small\")\n",
-    "\n",
-    "        # Transcribe audio with detailed timestamps\n",
-    "        result = model.transcribe(audio_filepath, verbose=True)\n",
-    "\n",
-    "        # Initialize variables for transcript\n",
-    "        transcript = []\n",
-    "        sentence = \"\"\n",
-    "        start_time = 0\n",
-    "\n",
-    "        # Iterate through the segments in the result\n",
-    "        for segment in result[\"segments\"]:\n",
-    "            # If new sentence starts, save the previous one and reset variables\n",
-    "            if segment[\"start\"] != start_time and sentence:\n",
-    "                transcript.append(\n",
-    "                    {\n",
-    "                        \"sentence\": sentence.strip() + \".\",\n",
-    "                        \"timestamp_start\": start_time,\n",
-    "                        \"timestamp_end\": segment[\"start\"],\n",
-    "                    }\n",
-    "                )\n",
-    "                sentence = \"\"\n",
-    "                start_time = segment[\"start\"]\n",
-    "\n",
-    "            # Add the word to the current sentence\n",
-    "            sentence += segment[\"text\"] + \" \"\n",
-    "\n",
-    "        # Add the final sentence\n",
-    "        if sentence:\n",
-    "            transcript.append(\n",
-    "                {\n",
-    "                    \"sentence\": sentence.strip() + \".\",\n",
-    "                    \"timestamp_start\": start_time,\n",
-    "                    \"timestamp_end\": result[\"segments\"][-1][\"end\"],\n",
-    "                }\n",
-    "            )\n",
-    "\n",
-    "        # Save the transcript to a file\n",
-    "        with open(\"transcription.txt\", \"w\") as file:\n",
-    "            for item in transcript:\n",
-    "                sentence = item[\"sentence\"]\n",
-    "                start_time, end_time = item[\"timestamp_start\"], item[\"timestamp_end\"]\n",
-    "                file.write(f\"{start_time}s to {end_time}s: {sentence}\\n\")\n",
-    "\n",
-    "        return transcript\n",
-    "\n",
-    "    except FileNotFoundError:\n",
-    "        return \"The specified audio file could not be found.\"\n",
-    "    except Exception as e:\n",
-    "        return f\"An unexpected error occurred: {str(e)}\"\n",
-    "\n",
-    "\n",
"def translate_text(input_text, source_language, target_language):\n", - " client = OpenAI(api_key=key)\n", - "\n", - " response = client.chat.completions.create(\n", - " model=\"gpt-3.5-turbo\",\n", - " messages=[\n", - " {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n", - " {\n", - " \"role\": \"user\",\n", - " \"content\": f\"Directly translate the following {source_language} text to a pure {target_language} \"\n", - " f\"video subtitle text without additional explanation.: '{input_text}'\",\n", - " },\n", - " ],\n", - " max_tokens=1500,\n", - " )\n", - "\n", - " # Correctly accessing the response content\n", - " translated_text = response.choices[0].message.content if response.choices else None\n", - " return translated_text\n", - "\n", - "\n", - "def translate_transcript(source_language, target_language):\n", - " with open(\"transcription.txt\", \"r\") as f:\n", - " lines = f.readlines()\n", - "\n", - " translated_transcript = []\n", - "\n", - " for line in lines:\n", - " # Split each line into timestamp and text parts\n", - " parts = line.strip().split(\": \")\n", - " if len(parts) == 2:\n", - " timestamp, text = parts[0], parts[1]\n", - " # Translate only the text part\n", - " translated_text = translate_text(text, source_language, target_language)\n", - " # Reconstruct the line with the translated text and the preserved timestamp\n", - " translated_line = f\"{timestamp}: {translated_text}\"\n", - " translated_transcript.append(translated_line)\n", - " else:\n", - " # If the line doesn't contain a timestamp, add it as is\n", - " translated_transcript.append(line.strip())\n", - "\n", - " return \"\\n\".join(translated_transcript)\n", - "\n", - "\n", - "llm_config = {\n", - " \"functions\": [\n", - " {\n", - " \"name\": \"recognize_transcript_from_video\",\n", - " \"description\": \"recognize the speech from video and transfer into a txt file\",\n", - " \"parameters\": {\n", - " \"type\": \"object\",\n", - " \"properties\": {\n", - " \"audio_filepath\": {\n", - " \"type\": \"string\",\n", - " \"description\": \"path of the video file\",\n", - " }\n", - " },\n", - " \"required\": [\"audio_filepath\"],\n", - " },\n", - " },\n", - " {\n", - " \"name\": \"translate_transcript\",\n", - " \"description\": \"using translate_text function to translate the script\",\n", - " \"parameters\": {\n", - " \"type\": \"object\",\n", - " \"properties\": {\n", - " \"source_language\": {\n", - " \"type\": \"string\",\n", - " \"description\": \"source language\",\n", - " },\n", - " \"target_language\": {\n", - " \"type\": \"string\",\n", - " \"description\": \"target language\",\n", - " },\n", - " },\n", - " \"required\": [\"source_language\", \"target_language\"],\n", - " },\n", - " },\n", - " ],\n", - " \"config_list\": config_list,\n", - " \"timeout\": 120,\n", - "}\n", - "source_language = \"English\"\n", - "target_language = \"Chinese\"\n", - "key = os.getenv(\"OPENAI_API_KEY\")\n", - "target_video = \"your_file_path\"\n", - "\n", - "chatbot = autogen.AssistantAgent(\n", - " name=\"chatbot\",\n", - " system_message=\"For coding tasks, only use the functions you have been provided with. 
Reply TERMINATE when the task is done.\",\n", - " llm_config=llm_config,\n", - ")\n", - "\n", - "user_proxy = autogen.UserProxyAgent(\n", - " name=\"user_proxy\",\n", - " is_termination_msg=lambda x: x.get(\"content\", \"\") and x.get(\"content\", \"\").rstrip().endswith(\"TERMINATE\"),\n", - " human_input_mode=\"NEVER\",\n", - " max_consecutive_auto_reply=10,\n", - " code_execution_config={\n", - " \"work_dir\": \"coding_2\",\n", - " \"use_docker\": False,\n", - " }, # Please set use_docker=True if docker is available to run the generated code. Using docker is safer than running the generated code directly.\n", - ")\n", - "\n", - "user_proxy.register_function(\n", - " function_map={\n", - " \"recognize_transcript_from_video\": recognize_transcript_from_video,\n", - " \"translate_transcript\": translate_transcript,\n", - " }\n", - ")\n", "user_proxy.initiate_chat(\n", - " chatbot,\n", + " assistant,\n", " message=f\"For the video located in {target_video}, recognize the speech and transfer it into a script file, \"\n", " f\"then translate from {source_language} text to a {target_language} video subtitle text. \",\n", ")" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aeea924a", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { + "front_matter": { + "description": "Use tools to extract and translate the transcript of a video file.", + "tags": [ + "whisper", + "function call" + ] + }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", @@ -395,7 +376,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.10" + "version": "3.11.7" } }, "nbformat": 4, From a1a1a0b94a578998576c5d2d7ce643d94cfaea66 Mon Sep 17 00:00:00 2001 From: Jack Gerrits Date: Sun, 10 Mar 2024 07:24:22 -0400 Subject: [PATCH 2/3] Update agentchat_video_transcript_translate_with_whisper.ipynb --- notebook/agentchat_video_transcript_translate_with_whisper.ipynb | 1 + 1 file changed, 1 insertion(+) diff --git a/notebook/agentchat_video_transcript_translate_with_whisper.ipynb b/notebook/agentchat_video_transcript_translate_with_whisper.ipynb index ae36cc888992..773163632831 100644 --- a/notebook/agentchat_video_transcript_translate_with_whisper.ipynb +++ b/notebook/agentchat_video_transcript_translate_with_whisper.ipynb @@ -105,6 +105,7 @@ " is_termination_msg=lambda x: x.get(\"content\", \"\") and x.get(\"content\", \"\").rstrip().endswith(\"TERMINATE\"),\n", " human_input_mode=\"NEVER\",\n", " max_consecutive_auto_reply=10,\n", + " code_execution_config={}\n", ")\n", "\n", "\n", From b1f1fdd560b80036fbfb074fdf250d4a0ab11d62 Mon Sep 17 00:00:00 2001 From: Jack Gerrits Date: Mon, 11 Mar 2024 08:28:33 -0400 Subject: [PATCH 3/3] formatting --- .../agentchat_video_transcript_translate_with_whisper.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebook/agentchat_video_transcript_translate_with_whisper.ipynb b/notebook/agentchat_video_transcript_translate_with_whisper.ipynb index 773163632831..153338fa12a8 100644 --- a/notebook/agentchat_video_transcript_translate_with_whisper.ipynb +++ b/notebook/agentchat_video_transcript_translate_with_whisper.ipynb @@ -105,7 +105,7 @@ " is_termination_msg=lambda x: x.get(\"content\", \"\") and x.get(\"content\", \"\").rstrip().endswith(\"TERMINATE\"),\n", " human_input_mode=\"NEVER\",\n", " max_consecutive_auto_reply=10,\n", - " code_execution_config={}\n", + " code_execution_config={},\n", ")\n", "\n", "\n",
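The decorator-based registration syntax this series adopts reduces to the pattern below. This is a minimal, self-contained sketch for reference only, not part of the patches: the `echo` tool, the model name, and the placeholder API key are illustrative assumptions, while the notebook's real tools are `recognize_transcript_from_video` and `translate_transcript` above.

```python
from typing import Annotated

import autogen

# Illustrative LLM config with a placeholder key; the notebook builds its own config_list.
config_list = [{"model": "gpt-4", "api_key": "YOUR_API_KEY"}]

assistant = autogen.AssistantAgent(
    name="assistant",
    llm_config={"config_list": config_list},
)
user_proxy = autogen.UserProxyAgent(
    name="user_proxy",
    human_input_mode="NEVER",
    code_execution_config={},
)


# One decorated definition replaces both the hand-written JSON schema in
# llm_config["functions"] and the separate user_proxy.register_function(...)
# call: register_for_llm advertises the tool to the assistant (the Annotated
# hints become the parameter schema), and register_for_execution lets the
# user proxy actually run it when the assistant calls it.
@user_proxy.register_for_execution()
@assistant.register_for_llm(description="Echo the given text back to the user.")
def echo(text: Annotated[str, "Text to echo"]) -> str:
    return text
```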