Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add YouTube transcript extraction component and frontend integration #4502

Merged
merged 16 commits into from
Nov 12, 2024
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/backend/base/langflow/components/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from .wikipedia_api import WikipediaAPIComponent
from .wolfram_alpha_api import WolframAlphaAPIComponent
from .yahoo_finance import YfinanceToolComponent
from .youtube_transcripts import YouTubeTranscriptsComponent

with warnings.catch_warnings():
warnings.simplefilter("ignore", LangChainDeprecationWarning)
Expand All @@ -45,4 +46,5 @@
"WikipediaAPIComponent",
"WolframAlphaAPIComponent",
"YfinanceToolComponent",
"YouTubeTranscriptsComponent",
]
168 changes: 168 additions & 0 deletions src/backend/base/langflow/components/tools/youtube_transcripts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
from langchain.tools import StructuredTool
from langchain_community.document_loaders import YoutubeLoader
from langchain_community.document_loaders.youtube import TranscriptFormat
from pydantic import BaseModel, Field

from langflow.base.langchain_utilities.model import LCToolComponent
from langflow.field_typing import Tool
from langflow.inputs import DropdownInput, IntInput, MultilineInput
from langflow.schema import Data
from langflow.template import Output


class YouTubeTranscriptsComponent(LCToolComponent):
"""A component that extracts spoken content from YouTube videos as transcripts."""

display_name: str = "YouTube Transcripts"
description: str = "Extracts spoken content from YouTube videos as transcripts."
icon: str = "YouTube"

inputs = [
MultilineInput(
name="url", display_name="Video URL", info="Enter the YouTube video URL to get transcripts from."
),
DropdownInput(
name="transcript_format",
display_name="Transcript Format",
options=["text", "chunks"],
value="text",
info="The format of the transcripts. Either 'text' for a single output "
"or 'chunks' for timestamped chunks.",
),
IntInput(
name="chunk_size_seconds",
display_name="Chunk Size (seconds)",
value=60,
advanced=True,
info="The size of each transcript chunk in seconds. Only applicable when "
"'Transcript Format' is set to 'chunks'.",
),
MultilineInput(
name="language",
display_name="Language",
info="A comma-separated list of language codes in descending priority. " "Leave empty for default.",
),
DropdownInput(
name="translation",
display_name="Translation Language",
advanced=True,
options=["", "en", "es", "fr", "de", "it", "pt", "ru", "ja", "ko", "hi", "ar", "id"],
info="Translate the transcripts to the specified language. " "Leave empty for no translation.",
),
]

outputs = [
Output(name="transcripts", display_name="Data", method="build_youtube_transcripts"),
Output(name="transcripts_tool", display_name="Tool", method="build_youtube_tool"),
]

class YoutubeApiSchema(BaseModel):
"""Schema to define the input structure for the tool."""

url: str = Field(..., description="The YouTube URL to get transcripts from.")
transcript_format: str = Field(
"text",
description="The format of the transcripts. Either 'text' for a single "
"text output or 'chunks' for timestamped chunks.",
)
chunk_size_seconds: int = Field(
120,
description="The size of each transcript chunk in seconds. Only "
"applicable when 'Transcript Format' is set to 'chunks'.",
)
language: str = Field(
"",
description="A comma-separated list of language codes in descending " "priority. Leave empty for default.",
)
translation: str = Field(
"", description="Translate the transcripts to the specified language. " "Leave empty for no translation."
)

def build_youtube_transcripts(self) -> Data | list[Data]:
"""Method to build transcripts from the provided YouTube URL.

Returns:
Data | list[Data]: The transcripts of the video, either as a single
Data object or a list of Data objects.
"""
try:
loader = YoutubeLoader.from_youtube_url(
self.url,
transcript_format=TranscriptFormat.TEXT
if self.transcript_format == "text"
else TranscriptFormat.CHUNKS,
chunk_size_seconds=self.chunk_size_seconds,
language=self.language.split(",") if self.language else ["en"],
translation=self.translation if self.translation else None,
)

transcripts = loader.load()

if self.transcript_format == "text":
# Extract only the page_content from the Document
return Data(data={"transcripts": transcripts[0].page_content})
# For chunks, extract page_content and metadata separately
return [Data(data={"content": doc.page_content, "metadata": doc.metadata}) for doc in transcripts]

except Exception as exc: # noqa: BLE001
# Using a specific error type for the return value
return Data(data={"error": f"Failed to get YouTube transcripts: {exc!s}"})

def youtube_transcripts(
self,
url: str = "",
transcript_format: str = "text",
chunk_size_seconds: int = 120,
language: str = "",
translation: str = "",
) -> Data | list[Data]:
"""Helper method to handle transcripts outside of component calls.

Args:
url: The YouTube URL to get transcripts from.
transcript_format: Format of transcripts ('text' or 'chunks').
chunk_size_seconds: Size of each transcript chunk in seconds.
language: Comma-separated list of language codes.
translation: Target language for translation.

Returns:
Data | list[Data]: Video transcripts as single Data or list of Data.
"""
try:
loader = YoutubeLoader.from_youtube_url(
url,
transcript_format=TranscriptFormat.TEXT if transcript_format == "text" else TranscriptFormat.CHUNKS,
chunk_size_seconds=chunk_size_seconds,
language=language.split(",") if language else ["en"],
translation=translation if translation else None,
)

transcripts = loader.load()

if transcript_format == "text":
return Data(data={"transcript": transcripts[0].page_content})
return [Data(data={"content": doc.page_content, "metadata": doc.metadata}) for doc in transcripts]

except Exception as exc: # noqa: BLE001
return Data(data={"error": f"Failed to get YouTube transcripts: {exc!s}"})

def build_youtube_tool(self) -> Tool:
"""Method to build the transcripts tool.

Returns:
Tool: A structured tool that uses the transcripts method.

Raises:
RuntimeError: If tool creation fails.
"""
try:
return StructuredTool.from_function(
name="youtube_transcripts",
description="Get transcripts from YouTube videos.",
func=self.youtube_transcripts,
args_schema=self.YoutubeApiSchema,
)

except Exception as exc:
msg = f"Failed to build the YouTube transcripts tool: {exc!s}"
raise RuntimeError(msg) from exc
9 changes: 9 additions & 0 deletions src/frontend/src/icons/Youtube/index.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import React, { forwardRef } from "react";
import YouTubeIcon from "./youtube";

export const YouTubeSvgIcon = forwardRef<
SVGSVGElement,
React.PropsWithChildren<{}>
>((props, ref) => {
return <YouTubeIcon ref={ref} {...props} />;
});
Loading
Loading