Playlists

Added check for playlist and script to generate text file so you can DL/transcribe/summarize entire playlists. Also added first attempt at some tests
rmusser01 · May 9, 2024 · 91697a8 · 91697a8
1 parent 947ff8a
commit 91697a8
Show file tree

Hide file tree

Showing 3 changed files with 289 additions and 12 deletions.
diff --git a/Get_Playlist_URLs.py b/Get_Playlist_URLs.py
@@ -0,0 +1,37 @@
+import sys
+import yt_dlp
+
+def get_playlist_videos(playlist_url):
+    ydl_opts = {
+        'extract_flat': True,
+        'skip_download': True,
+        'quiet': True
+    }
+
+    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+        info = ydl.extract_info(playlist_url, download=False)
+
+        if 'entries' in info:
+            video_urls = [entry['url'] for entry in info['entries']]
+            playlist_title = info['title']
+            return video_urls, playlist_title
+        else:
+            print("No videos found in the playlist.")
+            return [], None
+
+def save_to_file(video_urls, filename):
+    with open(filename, 'w') as file:
+        file.write('\n'.join(video_urls))
+    print(f"Video URLs saved to {filename}")
+
+if __name__ == '__main__':
+    if len(sys.argv) < 2:
+        print("Please provide the playlist URL as a command-line argument.")
+        sys.exit(1)
+
+    playlist_url = sys.argv[1]
+    video_urls, playlist_title = get_playlist_videos(playlist_url)
+
+    if video_urls:
+        filename = f"{playlist_title}.txt"
+        save_to_file(video_urls, filename)
diff --git a/diarize.py b/diarize.py
@@ -279,7 +279,6 @@ def download_ffmpeg():
 
 def read_paths_from_file(file_path):
     """ Reads a file containing URLs or local file paths and returns them as a list. """
-    paths = []
     with open(file_path, 'r') as file:
         for line in file:
             line = line.strip()
@@ -354,7 +353,7 @@ def normalize_title(title):
 def get_youtube(video_url):
     ydl_opts = {
         'format': 'bestaudio[ext=m4a]',
-        'noplaylist': True,
+        'noplaylist': False,
         'quiet': True,
         'extract_flat': True
     }
@@ -366,6 +365,33 @@ def get_youtube(video_url):
 
 
 
+def get_playlist_videos(playlist_url):
+    ydl_opts = {
+        'extract_flat': True,
+        'skip_download': True,
+        'quiet': True
+    }
+
+    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+        info = ydl.extract_info(playlist_url, download=False)
+
+        if 'entries' in info:
+            video_urls = [entry['url'] for entry in info['entries']]
+            playlist_title = info['title']
+            return video_urls, playlist_title
+        else:
+            print("No videos found in the playlist.")
+            return [], None
+
+
+
+def save_to_file(video_urls, filename):
+    with open(filename, 'w') as file:
+        file.write('\n'.join(video_urls))
+    print(f"Video URLs saved to {filename}")
+
+
+
 def download_video(video_url, download_path, info_dict, download_video_flag):
     logging.debug("About to normalize downloaded video title")
     title = normalize_title(info_dict['title'])
@@ -1092,30 +1118,49 @@ def save_summary_to_file(summary, file_path):
 ####################################################################################################################################
 # Main()
 #
+
 def main(input_path, api_name=None, api_key=None, num_speakers=2, whisper_model="small.en", offset=0, vad_filter=False, download_video_flag=False):
     start_time = time.monotonic()
+    paths = []
     if os.path.isfile(input_path) and input_path.endswith('.txt'):
         logging.debug("MAIN: User passed in a text file, processing text file...")
         paths = read_paths_from_file(input_path)
     else:
-        paths = [input_path]
+        info_dict = get_youtube(input_path)
+        if 'entries' in info_dict:  # Check if the input is a playlist
+            logging.debug("MAIN: YouTube playlist detected")
+            print("\n\nSorry, but playlists aren't currently supported. You can run the following command to generate a text file that you can then pass into this script though!" + """\n\n\tpython Get_Playlist_URLs.py <Youtube Playlist URL>\n\n\tThen,\n\n\tpython diarizer.py <playlist text file name>\n\n""")
+        else:
+            paths = [input_path]
 
     results = []
+
     for path in paths:
         try:
             if path.startswith('http'):
                 logging.debug("MAIN: URL Detected")
                 info_dict = get_youtube(path)
                 if info_dict:
-                    logging.debug("MAIN: Creating path for video file...")
-                    download_path = create_download_directory(info_dict['title'])
-                    logging.debug("MAIN: Path created successfully")
-                    logging.debug("MAIN: Downloading video from yt_dlp...")
-                    video_path = download_video(path, download_path, info_dict, download_video_flag)
-                    logging.debug("MAIN: Video downloaded successfully")
-                    logging.debug("MAIN: Converting video file to WAV...")
-                    audio_file = convert_to_wav(video_path, offset)
-                    logging.debug("MAIN: Audio file converted succesfully")
+                    if 'entries' in info_dict:  # Check if the input is a playlist
+                        logging.debug("MAIN: Creating path for video file...")
+                        download_path = create_download_directory(info_dict['title'])
+                        logging.debug("MAIN: Path created successfully")
+                        logging.debug("MAIN: Downloading video from yt_dlp...")
+                        video_path = download_video(path, download_path, info_dict, download_video_flag)
+                        logging.debug("MAIN: Video downloaded successfully")
+                        logging.debug("MAIN: Converting video file to WAV...")
+                        audio_file = convert_to_wav(video_path, offset)
+                        logging.debug("MAIN: Audio file converted succesfully")
+                    else:
+                        logging.debug("MAIN: Creating path for video file...")
+                        download_path = create_download_directory(info_dict['title'])
+                        logging.debug("MAIN: Path created successfully")
+                        logging.debug("MAIN: Downloading video from yt_dlp...")
+                        video_path = download_video(path, download_path, info_dict, download_video_flag)
+                        logging.debug("MAIN: Video downloaded successfully")
+                        logging.debug("MAIN: Converting video file to WAV...")
+                        audio_file = convert_to_wav(video_path, offset)
+                        logging.debug("MAIN: Audio file converted succesfully")
             else:
                 if os.path.exists(path):
                     logging.debug("MAIN: Local file path detected")

diff --git a/test_transcription_summarization.py b/test_transcription_summarization.py
@@ -0,0 +1,195 @@
+import os
+import shutil
+import tempfile
+import unittest
+from unittest.mock import patch, MagicMock
+
+# Run this: python -m unittest test_transcription_summarization.py
+"""
+1. Test error handling:
+    You mentioned testing error handling in the comments, but there are no specific tests for it in the script. Consider adding tests that intentionally raise exceptions or simulate error conditions to ensure that the script handles them gracefully. Use assertRaises to check if the expected exceptions are raised.
+2. Test convert_to_wav:
+    In the test_convert_to_wav test, you can enhance it by creating a temporary video file using a library like cv2 or moviepy and then testing the conversion with the actual file. This will make the test more comprehensive.
+3. Test summarization functions:
+    You have a test for summarize_with_openai, but there are no tests for other summarization functions like summarize_with_claude, summarize_with_cohere, etc. Consider adding tests for each summarization function to ensure they work as expected.
+4. Parameterized tests:
+    You mentioned using parameterized tests in the comments, but the current script doesn't include any. Consider using the @parameterized decorator from the parameterized library to create parameterized tests. This allows you to test the script with different input paths, API names, models, and configurations without duplicating test code.
+5. Integration tests:
+    The test_main function is a good example of an integration test. Consider adding more integration tests that cover different scenarios and combinations of input paths, API names, and configurations to ensure the script works end-to-end.
+6. Test coverage:
+    Use a test coverage tool like coverage to measure the test coverage of your script. This helps identify areas that may require additional testing. You can run the tests with coverage and generate a coverage report to see which lines of code are covered by the tests.
+7. Naming conventions:
+    Follow consistent naming conventions for test methods and variables. For example, use test_ prefix for test method names and mock_ prefix for mocked objects.
+8. Docstrings and comments:
+    Add docstrings to each test method to describe what the test is verifying. This improves the readability and maintainability of the test script. You can also add comments to explain complex test setups or assertions.
+9. Integration tests:
+    Write integration tests that cover the entire flow of the script, from processing the input path to generating the summary.
+    Use a combination of real and mocked dependencies to test the integration between different components.
+10. Parameterized tests:
+    Use parameterized tests to test the script with different input paths, API names, models, and other configurations.
+    This allows you to cover a wide range of scenarios without duplicating test code.
+"""
+
+# Import the necessary functions and classes from your script
+from diarize import (
+    read_paths_from_file,
+    process_path,
+    process_local_file,
+    create_download_directory,
+    normalize_title,
+    get_youtube,
+    download_video,
+    convert_to_wav,
+    speech_to_text,
+    summarize_with_openai,
+    summarize_with_claude,
+    summarize_with_cohere,
+    summarize_with_groq,
+    summarize_with_llama,
+    summarize_with_oobabooga,
+    save_summary_to_file,
+    main
+)
+
+class TestTranscriptionSummarization(unittest.TestCase):
+    def setUp(self):
+        self.temp_dir = tempfile.mkdtemp()
+
+    def tearDown(self):
+        shutil.rmtree(self.temp_dir)
+
+    def test_read_paths_from_file(self):
+        # Create a temporary file with sample paths
+        file_path = os.path.join(self.temp_dir, 'paths.txt')
+        with open(file_path, 'w') as file:
+            file.write('path1\npath2\npath3')
+
+        # Call the function and check the returned paths
+        paths = read_paths_from_file(file_path)
+        self.assertEqual(paths, ['path1', 'path2', 'path3'])
+
+
+
+    @patch('diarize.process_local_file')
+    def test_process_local_file(self, mock_process_local_file):
+        mock_process_local_file.return_value = ('/path/to/download', {'title': 'Local Video'}, '/path/to/audio.wav')
+        result = process_path('/path/to/local/video.mp4')
+        self.assertEqual(result, ('/path/to/download', {'title': 'Local Video'}, '/path/to/audio.wav'))
+
+
+
+    def test_normalize_title(self):
+        title = 'Video Title / with \\ Special: Characters*'
+        normalized_title = normalize_title(title)
+        self.assertEqual(normalized_title, 'Video Title _ with _ Special_ Characters')
+
+
+
+    @patch('diarize.subprocess.run')
+    def test_convert_to_wav(self, mock_subprocess_run):
+        video_file_path = '/path/to/video.mp4'
+        audio_file_path = convert_to_wav(video_file_path)
+        self.assertEqual(audio_file_path, '/path/to/video.wav')
+        mock_subprocess_run.assert_called_once()
+
+
+
+    @patch('diarize.process_local_file')
+    def test_process_path(self, mock_process_local_file, mock_get_youtube):
+        # Test processing a URL
+        mock_get_youtube.return_value = {'title': 'Video Title'}
+        result = process_path('https://example.com/video.mp4')
+        self.assertEqual(result, {'title': 'Video Title'})
+
+        # Test processing a local file
+        mock_process_local_file.return_value = ('/path/to/download', {'title': 'Local Video'}, '/path/to/audio.wav')
+        result = process_path('/path/to/local/video.mp4')
+        self.assertEqual(result, ('/path/to/download', {'title': 'Local Video'}, '/path/to/audio.wav'))
+
+
+
+    def test_speech_to_text(self):
+        audio_file_path = '/path/to/audio.wav'
+        segments = speech_to_text(audio_file_path)
+        self.assertIsInstance(segments, list)
+        self.assertTrue(len(segments) > 0)
+        self.assertIn('start', segments[0])
+        self.assertIn('end', segments[0])
+        self.assertIn('text', segments[0])
+
+
+
+    @patch('diarize.requests.post')
+    def test_summarize_with_openai(self, mock_post):
+        mock_response = MagicMock()
+        mock_response.status_code = 200
+        mock_response.json.return_value = {'choices': [{'message': {'content': 'Summary'}}]}
+        mock_post.return_value = mock_response
+
+        summary = summarize_with_openai('api_key', '/path/to/audio.wav.segments.json', 'gpt-4-turbo')
+        self.assertEqual(summary, 'Summary')
+
+
+
+    def test_integration_local_file(self):
+        # Create a temporary video file
+        video_file_path = os.path.join(self.temp_dir, 'video.mp4')
+        with open(video_file_path, 'wb') as file:
+            file.write(b'dummy video content')
+
+        # Call the main function with the local file path
+        results = main(video_file_path)
+
+        # Check the expected results
+        self.assertEqual(len(results), 1)
+        self.assertEqual(results[0]['video_path'], video_file_path)
+        self.assertIsNotNone(results[0]['audio_file'])
+        self.assertIsInstance(results[0]['transcription'], list)
+
+
+
+    def test_save_summary_to_file(self):
+        summary = 'This is a summary.'
+        file_path = '/path/to/audio.wav.segments.json'
+        save_summary_to_file(summary, file_path)
+        summary_file_path = file_path.        self.assertTrue(os.path.exists(summary_file_path))
+        with open(summary_file_path, 'r') as file:
+            content = file.read()
+            self.assertEqual(content, summary)
+
+
+
+    @patch('diarize.get_youtube')
+    @patch('diarize.download_video')
+    @patch('diarize.convert_to_wav')
+    @patch('diarize.speech_to_text')
+    @patch('diarize.summarize_with_openai')
+    def test_main(self, mock_summarize, mock_speech_to_text, mock_convert_to_wav, mock_download_video, mock_get_youtube):
+        # Set up mock return values
+        mock_get_youtube.return_value = {'title': 'Video Title'}
+        mock_download_video.return_value = '/path/to/video.mp4'
+        mock_convert_to_wav.return_value = '/path/to/audio.wav'
+        mock_speech_to_text.return_value = [{'start': 0, 'end': 5, 'text': 'Hello'}]
+        mock_summarize.return_value = 'This is a summary.'
+
+        # Call the main function with sample arguments
+        results = main('https://example.com/video.mp4', api_name='openai', api_key='api_key')
+
+        # Check the expected results
+        self.assertEqual(len(results), 1)
+        self.assertEqual(results[0]['video_path'], 'https://example.com/video.mp4')
+        self.assertEqual(results[0]['audio_file'], '/path/to/audio.wav')
+        self.assertEqual(results[0]['transcription'], [{'start': 0, 'end': 5, 'text': 'Hello'}])
+        self.assertEqual(results[0]['summary'], 'This is a summary.')
+
+        # Check that the expected functions were called with the correct arguments
+        mock_get_youtube.assert_called_once_with('https://example.com/video.mp4')
+        mock_download_video.assert_called_once()
+        mock_convert_to_wav.assert_called_once()
+        mock_speech_to_text.assert_called_once()
+        mock_summarize.assert_called_once_with('api_key', '/path/to/audio.wav.segments.json', 'gpt-4-turbo')
+
+    # Add more test methods for other functions...
+
+if __name__ == '__main__':
+    unittest.main()