From 695100d5d83e62be9810216756d9e7012b22a12e Mon Sep 17 00:00:00 2001 From: narumi Date: Mon, 16 Dec 2024 13:16:00 +0800 Subject: [PATCH] Support specifying YouTube transcript language --- src/markitdown/_markitdown.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 96997cf..645d230 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -344,8 +344,11 @@ def convert( assert isinstance(params["v"][0], str) video_id = str(params["v"][0]) try: + youtube_transcript_languages = kwargs.get( + "youtube_transcript_languages", ("en",) + ) # Must be a single transcript. - transcript = YouTubeTranscriptApi.get_transcript(video_id) # type: ignore + transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=youtube_transcript_languages) # type: ignore transcript_text = " ".join([part["text"] for part in transcript]) # type: ignore # Alternative formatting: # formatter = TextFormatter() @@ -1003,7 +1006,7 @@ def convert_response( self._append_ext(extensions, g) # Convert - result = self._convert(temp_path, extensions, url=response.url) + result = self._convert(temp_path, extensions, url=response.url, **kwargs) # Clean up finally: try: