Skip to content

Commit

Permalink
Playlists
Browse files Browse the repository at this point in the history
Added check for playlist and script to generate text file so you can DL/transcribe/summarize entire playlists.

Also added first attempt at some tests
  • Loading branch information
rmusser01 committed May 9, 2024
1 parent 947ff8a commit 91697a8
Show file tree
Hide file tree
Showing 3 changed files with 289 additions and 12 deletions.
37 changes: 37 additions & 0 deletions Get_Playlist_URLs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import sys
import yt_dlp

def get_playlist_videos(playlist_url):
ydl_opts = {
'extract_flat': True,
'skip_download': True,
'quiet': True
}

with yt_dlp.YoutubeDL(ydl_opts) as ydl:
info = ydl.extract_info(playlist_url, download=False)

if 'entries' in info:
video_urls = [entry['url'] for entry in info['entries']]
playlist_title = info['title']
return video_urls, playlist_title
else:
print("No videos found in the playlist.")
return [], None

def save_to_file(video_urls, filename):
with open(filename, 'w') as file:
file.write('\n'.join(video_urls))
print(f"Video URLs saved to {filename}")

if __name__ == '__main__':
if len(sys.argv) < 2:
print("Please provide the playlist URL as a command-line argument.")
sys.exit(1)

playlist_url = sys.argv[1]
video_urls, playlist_title = get_playlist_videos(playlist_url)

if video_urls:
filename = f"{playlist_title}.txt"
save_to_file(video_urls, filename)
69 changes: 57 additions & 12 deletions diarize.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,6 @@ def download_ffmpeg():

def read_paths_from_file(file_path):
""" Reads a file containing URLs or local file paths and returns them as a list. """
paths = []
with open(file_path, 'r') as file:
for line in file:
line = line.strip()
Expand Down Expand Up @@ -354,7 +353,7 @@ def normalize_title(title):
def get_youtube(video_url):
ydl_opts = {
'format': 'bestaudio[ext=m4a]',
'noplaylist': True,
'noplaylist': False,
'quiet': True,
'extract_flat': True
}
Expand All @@ -366,6 +365,33 @@ def get_youtube(video_url):



def get_playlist_videos(playlist_url):
ydl_opts = {
'extract_flat': True,
'skip_download': True,
'quiet': True
}

with yt_dlp.YoutubeDL(ydl_opts) as ydl:
info = ydl.extract_info(playlist_url, download=False)

if 'entries' in info:
video_urls = [entry['url'] for entry in info['entries']]
playlist_title = info['title']
return video_urls, playlist_title
else:
print("No videos found in the playlist.")
return [], None



def save_to_file(video_urls, filename):
with open(filename, 'w') as file:
file.write('\n'.join(video_urls))
print(f"Video URLs saved to {filename}")



def download_video(video_url, download_path, info_dict, download_video_flag):
logging.debug("About to normalize downloaded video title")
title = normalize_title(info_dict['title'])
Expand Down Expand Up @@ -1092,30 +1118,49 @@ def save_summary_to_file(summary, file_path):
####################################################################################################################################
# Main()
#

def main(input_path, api_name=None, api_key=None, num_speakers=2, whisper_model="small.en", offset=0, vad_filter=False, download_video_flag=False):
start_time = time.monotonic()
paths = []
if os.path.isfile(input_path) and input_path.endswith('.txt'):
logging.debug("MAIN: User passed in a text file, processing text file...")
paths = read_paths_from_file(input_path)
else:
paths = [input_path]
info_dict = get_youtube(input_path)
if 'entries' in info_dict: # Check if the input is a playlist
logging.debug("MAIN: YouTube playlist detected")
print("\n\nSorry, but playlists aren't currently supported. You can run the following command to generate a text file that you can then pass into this script though!" + """\n\n\tpython Get_Playlist_URLs.py <Youtube Playlist URL>\n\n\tThen,\n\n\tpython diarizer.py <playlist text file name>\n\n""")
else:
paths = [input_path]

results = []

for path in paths:
try:
if path.startswith('http'):
logging.debug("MAIN: URL Detected")
info_dict = get_youtube(path)
if info_dict:
logging.debug("MAIN: Creating path for video file...")
download_path = create_download_directory(info_dict['title'])
logging.debug("MAIN: Path created successfully")
logging.debug("MAIN: Downloading video from yt_dlp...")
video_path = download_video(path, download_path, info_dict, download_video_flag)
logging.debug("MAIN: Video downloaded successfully")
logging.debug("MAIN: Converting video file to WAV...")
audio_file = convert_to_wav(video_path, offset)
logging.debug("MAIN: Audio file converted succesfully")
if 'entries' in info_dict: # Check if the input is a playlist
logging.debug("MAIN: Creating path for video file...")
download_path = create_download_directory(info_dict['title'])
logging.debug("MAIN: Path created successfully")
logging.debug("MAIN: Downloading video from yt_dlp...")
video_path = download_video(path, download_path, info_dict, download_video_flag)
logging.debug("MAIN: Video downloaded successfully")
logging.debug("MAIN: Converting video file to WAV...")
audio_file = convert_to_wav(video_path, offset)
logging.debug("MAIN: Audio file converted succesfully")
else:
logging.debug("MAIN: Creating path for video file...")
download_path = create_download_directory(info_dict['title'])
logging.debug("MAIN: Path created successfully")
logging.debug("MAIN: Downloading video from yt_dlp...")
video_path = download_video(path, download_path, info_dict, download_video_flag)
logging.debug("MAIN: Video downloaded successfully")
logging.debug("MAIN: Converting video file to WAV...")
audio_file = convert_to_wav(video_path, offset)
logging.debug("MAIN: Audio file converted succesfully")
else:
if os.path.exists(path):
logging.debug("MAIN: Local file path detected")
Expand Down
195 changes: 195 additions & 0 deletions test_transcription_summarization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
import os
import shutil
import tempfile
import unittest
from unittest.mock import patch, MagicMock

# Run this: python -m unittest test_transcription_summarization.py
"""
1. Test error handling:
You mentioned testing error handling in the comments, but there are no specific tests for it in the script. Consider adding tests that intentionally raise exceptions or simulate error conditions to ensure that the script handles them gracefully. Use assertRaises to check if the expected exceptions are raised.
2. Test convert_to_wav:
In the test_convert_to_wav test, you can enhance it by creating a temporary video file using a library like cv2 or moviepy and then testing the conversion with the actual file. This will make the test more comprehensive.
3. Test summarization functions:
You have a test for summarize_with_openai, but there are no tests for other summarization functions like summarize_with_claude, summarize_with_cohere, etc. Consider adding tests for each summarization function to ensure they work as expected.
4. Parameterized tests:
You mentioned using parameterized tests in the comments, but the current script doesn't include any. Consider using the @parameterized decorator from the parameterized library to create parameterized tests. This allows you to test the script with different input paths, API names, models, and configurations without duplicating test code.
5. Integration tests:
The test_main function is a good example of an integration test. Consider adding more integration tests that cover different scenarios and combinations of input paths, API names, and configurations to ensure the script works end-to-end.
6. Test coverage:
Use a test coverage tool like coverage to measure the test coverage of your script. This helps identify areas that may require additional testing. You can run the tests with coverage and generate a coverage report to see which lines of code are covered by the tests.
7. Naming conventions:
Follow consistent naming conventions for test methods and variables. For example, use test_ prefix for test method names and mock_ prefix for mocked objects.
8. Docstrings and comments:
Add docstrings to each test method to describe what the test is verifying. This improves the readability and maintainability of the test script. You can also add comments to explain complex test setups or assertions.
9. Integration tests:
Write integration tests that cover the entire flow of the script, from processing the input path to generating the summary.
Use a combination of real and mocked dependencies to test the integration between different components.
10. Parameterized tests:
Use parameterized tests to test the script with different input paths, API names, models, and other configurations.
This allows you to cover a wide range of scenarios without duplicating test code.
"""

# Import the necessary functions and classes from your script
from diarize import (
read_paths_from_file,
process_path,
process_local_file,
create_download_directory,
normalize_title,
get_youtube,
download_video,
convert_to_wav,
speech_to_text,
summarize_with_openai,
summarize_with_claude,
summarize_with_cohere,
summarize_with_groq,
summarize_with_llama,
summarize_with_oobabooga,
save_summary_to_file,
main
)

class TestTranscriptionSummarization(unittest.TestCase):
def setUp(self):
self.temp_dir = tempfile.mkdtemp()

def tearDown(self):
shutil.rmtree(self.temp_dir)

def test_read_paths_from_file(self):
# Create a temporary file with sample paths
file_path = os.path.join(self.temp_dir, 'paths.txt')
with open(file_path, 'w') as file:
file.write('path1\npath2\npath3')

# Call the function and check the returned paths
paths = read_paths_from_file(file_path)
self.assertEqual(paths, ['path1', 'path2', 'path3'])



@patch('diarize.process_local_file')
def test_process_local_file(self, mock_process_local_file):
mock_process_local_file.return_value = ('/path/to/download', {'title': 'Local Video'}, '/path/to/audio.wav')
result = process_path('/path/to/local/video.mp4')
self.assertEqual(result, ('/path/to/download', {'title': 'Local Video'}, '/path/to/audio.wav'))



def test_normalize_title(self):
title = 'Video Title / with \\ Special: Characters*'
normalized_title = normalize_title(title)
self.assertEqual(normalized_title, 'Video Title _ with _ Special_ Characters')



@patch('diarize.subprocess.run')
def test_convert_to_wav(self, mock_subprocess_run):
video_file_path = '/path/to/video.mp4'
audio_file_path = convert_to_wav(video_file_path)
self.assertEqual(audio_file_path, '/path/to/video.wav')
mock_subprocess_run.assert_called_once()



@patch('diarize.process_local_file')
def test_process_path(self, mock_process_local_file, mock_get_youtube):
# Test processing a URL
mock_get_youtube.return_value = {'title': 'Video Title'}
result = process_path('https://example.com/video.mp4')
self.assertEqual(result, {'title': 'Video Title'})

# Test processing a local file
mock_process_local_file.return_value = ('/path/to/download', {'title': 'Local Video'}, '/path/to/audio.wav')
result = process_path('/path/to/local/video.mp4')
self.assertEqual(result, ('/path/to/download', {'title': 'Local Video'}, '/path/to/audio.wav'))



def test_speech_to_text(self):
audio_file_path = '/path/to/audio.wav'
segments = speech_to_text(audio_file_path)
self.assertIsInstance(segments, list)
self.assertTrue(len(segments) > 0)
self.assertIn('start', segments[0])
self.assertIn('end', segments[0])
self.assertIn('text', segments[0])



@patch('diarize.requests.post')
def test_summarize_with_openai(self, mock_post):
mock_response = MagicMock()
mock_response.status_code = 200
mock_response.json.return_value = {'choices': [{'message': {'content': 'Summary'}}]}
mock_post.return_value = mock_response

summary = summarize_with_openai('api_key', '/path/to/audio.wav.segments.json', 'gpt-4-turbo')
self.assertEqual(summary, 'Summary')



def test_integration_local_file(self):
# Create a temporary video file
video_file_path = os.path.join(self.temp_dir, 'video.mp4')
with open(video_file_path, 'wb') as file:
file.write(b'dummy video content')

# Call the main function with the local file path
results = main(video_file_path)

# Check the expected results
self.assertEqual(len(results), 1)
self.assertEqual(results[0]['video_path'], video_file_path)
self.assertIsNotNone(results[0]['audio_file'])
self.assertIsInstance(results[0]['transcription'], list)



def test_save_summary_to_file(self):
summary = 'This is a summary.'
file_path = '/path/to/audio.wav.segments.json'
save_summary_to_file(summary, file_path)
summary_file_path = file_path. self.assertTrue(os.path.exists(summary_file_path))
with open(summary_file_path, 'r') as file:
content = file.read()
self.assertEqual(content, summary)



@patch('diarize.get_youtube')
@patch('diarize.download_video')
@patch('diarize.convert_to_wav')
@patch('diarize.speech_to_text')
@patch('diarize.summarize_with_openai')
def test_main(self, mock_summarize, mock_speech_to_text, mock_convert_to_wav, mock_download_video, mock_get_youtube):
# Set up mock return values
mock_get_youtube.return_value = {'title': 'Video Title'}
mock_download_video.return_value = '/path/to/video.mp4'
mock_convert_to_wav.return_value = '/path/to/audio.wav'
mock_speech_to_text.return_value = [{'start': 0, 'end': 5, 'text': 'Hello'}]
mock_summarize.return_value = 'This is a summary.'

# Call the main function with sample arguments
results = main('https://example.com/video.mp4', api_name='openai', api_key='api_key')

# Check the expected results
self.assertEqual(len(results), 1)
self.assertEqual(results[0]['video_path'], 'https://example.com/video.mp4')
self.assertEqual(results[0]['audio_file'], '/path/to/audio.wav')
self.assertEqual(results[0]['transcription'], [{'start': 0, 'end': 5, 'text': 'Hello'}])
self.assertEqual(results[0]['summary'], 'This is a summary.')

# Check that the expected functions were called with the correct arguments
mock_get_youtube.assert_called_once_with('https://example.com/video.mp4')
mock_download_video.assert_called_once()
mock_convert_to_wav.assert_called_once()
mock_speech_to_text.assert_called_once()
mock_summarize.assert_called_once_with('api_key', '/path/to/audio.wav.segments.json', 'gpt-4-turbo')

# Add more test methods for other functions...

if __name__ == '__main__':
unittest.main()

0 comments on commit 91697a8

Please sign in to comment.