diff --git a/.gitignore b/.gitignore index 05975bf..f9f1e7c 100644 --- a/.gitignore +++ b/.gitignore @@ -153,3 +153,5 @@ cython_debug/ output/ assets/ + +.DS_Store diff --git a/melodytalk/main.py b/melodytalk/main.py index e110d21..0120b0f 100644 --- a/melodytalk/main.py +++ b/melodytalk/main.py @@ -226,7 +226,10 @@ def clear_input_audio(self): if not os.path.exists("checkpoints"): os.mkdir("checkpoints") parser = argparse.ArgumentParser() - parser.add_argument('--load', type=str, default="Text2Music_cuda:0, ExtractTrack_cuda:0, Text2MusicWithMelody_cuda:0") + parser.add_argument('--load', type=str, default="Text2Music_cuda:0, " + "ExtractTrack_cuda:0, " + "Text2MusicWithMelody_cuda:0," + "SimpleTracksMixing_cuda:0") args = parser.parse_args() load_dict = {e.split('_')[0].strip(): e.split('_')[1].strip() for e in args.load.split(',')} bot = ConversationBot(load_dict=load_dict) diff --git a/melodytalk/modules.py b/melodytalk/modules.py index 5856aa9..66fbf99 100644 --- a/melodytalk/modules.py +++ b/melodytalk/modules.py @@ -10,7 +10,7 @@ # source separation import demucs.separate -from utils import prompts, get_new_audio_name +from utils import prompts, get_new_audio_name, description_to_attributes, cut_dialogue_history # Initialze common models @@ -32,7 +32,7 @@ def __init__(self, device): def inference(self, text): music_filename = os.path.join("music", f"{str(uuid.uuid4())[:8]}.wav") - prompt = text + text = description_to_attributes(text) # convert text to attributes wav = self.model.generate([text], progress=False) wav = wav[0] # batch size is 1 audio_write(music_filename[:-4], @@ -47,8 +47,8 @@ def __init__(self, device): self.model = musicgen_model @prompts( - name="Generate music from user input text with melody condition", - description="useful if you want to generate, style transfer or remix music from a user input text with a given melody condition." 
+ name="Generate music from user input text with melody or track condition", + description="useful if you want to generate, style transfer or remix music from a user input text with a given melody or track condition." "like: remix the given melody with text description, or doing style transfer as text described with the given melody." "The input to this tool should be a comma separated string of two, " "representing the music_filename and the text description." @@ -56,6 +56,7 @@ def __init__(self, device): def inference(self, inputs): music_filename, text = inputs.split(",")[0].strip(), inputs.split(",")[1].strip() + text = description_to_attributes(text) # convert text to attributes print(f"Generating music from text with melody condition, Input Text: {text}, Melody: {music_filename}.") updated_music_filename = get_new_audio_name(music_filename, func_name="remix") melody, sr = torchaudio.load(music_filename) @@ -90,7 +91,7 @@ def __init__(self, device): def inference(self, inputs): music_filename, instrument, mode = inputs.split(",")[0].strip(), inputs.split(",")[1].strip(), inputs.split(",")[2].strip() - print(f"{mode}ing {instrument} track from {music_filename}.") + print(f"{mode} {instrument} track from {music_filename}.") if mode == "extract": instrument_mode = instrument diff --git a/melodytalk/utils.py b/melodytalk/utils.py index 10793c6..19b49b2 100644 --- a/melodytalk/utils.py +++ b/melodytalk/utils.py @@ -3,6 +3,9 @@ import torch import numpy as np import os +import openai + +openai.api_key = os.getenv("OPENAI_API_KEY") def cut_dialogue_history(history_memory, keep_last_n_words=500): if history_memory is None or len(history_memory) == 0: @@ -49,3 +52,35 @@ def get_new_audio_name(org_audio_name, func_name="update"): recent_prev_file_name = name_split[0] new_file_name = f'{this_new_uuid}_{func_name}_{recent_prev_file_name}_{most_org_file_name}.wav' return os.path.join(head, new_file_name) + +def description_to_attributes(description: str) -> str: + """ This 
function is a trick to concatenate key, bpm, (genre, mood, instrument) information to the description. + + :param description: + :return: + """ + + openai_prompt = f"""Please catch the bpm and key attributes from the original description text. If the description text does not mention it, do not add it. Here are two examples: + + Q: Generate a love pop song in C major of 120 bpm. + A: Generate a love pop song. bpm: 120. key: Cmaj. + + Q: Generate a love pop song in a minor. + A: Generate a love pop song. key: Amin. + + Q: {description}. + A: + """ + + response = openai.Completion.create( + model="text-davinci-003", + prompt=openai_prompt, + temperature=0, + max_tokens=100, + top_p=1, + frequency_penalty=0.0, + presence_penalty=0.0, + stop=["\n"] + ) + + return response.choices[0].text