-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
64 lines (56 loc) · 3.43 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
## author: 11unx0 (https://github.com/11unx0)
## project: https://github.com/11unx0/Transcriptor
import argparse
import tempfile

import gradio as gr
import whisper
from pytube import YouTube
# parameters
parser = argparse.ArgumentParser(description='Transcript Youtube videos and your own video/audio files. Uses Whisper. Developer: https://github.com/11unx0')
parser.add_argument('--device', type=str, choices=['cpu', 'cuda'], default='cpu', help='Device to run Whisper model on. Choices are "cpu" or "cuda". Default is "cpu".')
parser.add_argument('--model', type=str, choices=['tiny', 'base', 'small', 'medium', 'large', 'large-v2', 'large-v3'], default='medium', help='Choose a model for Whisper. Default is "medium". More details for models: https://github.com/11unx0/Transcriptor')
args = parser.parse_args()
# Whisper model load.
#whisper_model = whisper.load_model("base")
whisper_model = whisper.load_model(args.model, args.device) # tiny - base - small - medium - large - large-v2 - large-v3
# | Size | Parameters | English-only model | Multilingual model | Required VRAM | Relative speed |
# |:------:|:----------:|:------------------:|:------------------:|:-------------:|:--------------:|
# | tiny | 39 M | `tiny.en` | [tiny] | ~1 GB | ~32x |
# | base | 74 M | `base.en` | [base] | ~1 GB | ~16x |
# | small | 244 M | `small.en` | [small] | ~2 GB | ~6x |
# | medium | 769 M | `medium.en` | [medium] | ~5 GB | ~2x |
# | large | 1550 M | N/A | [large] | ~10 GB | 1x |
# | large-v2 | 1550 M | N/A | [large-v2] |
# | large-v3 | 1550 M | N/A | [large-v3] |
def transcribe(input_data, file_data=None):
if input_data.startswith("https://www.youtube.com/") or input_data.startswith("http://www.youtube.com/") or input_data.startswith("www.youtube.com/") or input_data.startswith("youtube.com/"):
try:
yt = YouTube(input_data)
video = yt.streams.filter(only_audio=True).first()
audio_file_path = video.download(output_path=".")
result = whisper_model.transcribe(audio_file_path)
return result['text'].strip()
except Exception as e:
return f"An error occurred: {str(e)}"
elif file_data is not None:
try:
result = whisper_model.transcribe(file_data.name)
return result['text'].strip()
except Exception as e:
return f"An error occurred: {str(e)}"
else:
return "Please enter a YouTube URL or upload an audio file."
input_text_url = gr.Textbox(label='YouTube URL', placeholder='Enter YouTube video URL or upload audio file')
input_file = gr.File(label="Upload Audio File")
iface = gr.Interface(
fn=transcribe,
inputs=[input_text_url, input_file], # Both input types
outputs="text",
title="11unx0's Transcriptor.",
description='''### Enter a YouTube video URL or upload an audio file to transcribe it.
### Uses Whisper models by OpenAI.
### Developer's Web Page: [https://11unx0.github.io](https://11unx0.github.io)
### Project Page: [https://github.com/11unx0/Transcriptor](https://github.com/11unx0/Transcriptor)''',
allow_flagging="never",
theme='Taithrah/Minimal'
)
iface.launch()