From 18329bad34dad1ad1f6e977559cfec2978aed0e2 Mon Sep 17 00:00:00 2001 From: Yixiao Zhang Date: Fri, 25 Aug 2023 18:44:02 +0900 Subject: [PATCH] update --- melodytalk/dependencies/lpmc/__init__.py | 0 .../lpmc/music_captioning/__init__.py | 0 .../lpmc/music_captioning/captioning.py | 7 +- .../lpmc/music_captioning/model/bart.py | 2 +- .../dependencies/transplayer/__init__.py | 0 .../dependencies/transplayer/inference.py | 96 +++++ melodytalk/dependencies/transplayer/model.py | 399 ++++++++++++++++++ melodytalk/main.py | 49 ++- melodytalk/modules.py | 181 ++++++-- melodytalk/utils.py | 38 ++ 10 files changed, 719 insertions(+), 53 deletions(-) create mode 100644 melodytalk/dependencies/lpmc/__init__.py create mode 100644 melodytalk/dependencies/lpmc/music_captioning/__init__.py create mode 100644 melodytalk/dependencies/transplayer/__init__.py create mode 100644 melodytalk/dependencies/transplayer/inference.py create mode 100644 melodytalk/dependencies/transplayer/model.py diff --git a/melodytalk/dependencies/lpmc/__init__.py b/melodytalk/dependencies/lpmc/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/melodytalk/dependencies/lpmc/music_captioning/__init__.py b/melodytalk/dependencies/lpmc/music_captioning/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/melodytalk/dependencies/lpmc/music_captioning/captioning.py b/melodytalk/dependencies/lpmc/music_captioning/captioning.py index 6d14c5b..089b80d 100644 --- a/melodytalk/dependencies/lpmc/music_captioning/captioning.py +++ b/melodytalk/dependencies/lpmc/music_captioning/captioning.py @@ -45,9 +45,11 @@ def get_audio(audio_path, duration=10, target_sr=16000): audio = torch.from_numpy(np.stack(np.split(audio[:ceil * n_samples], ceil)).astype('float32')) return audio -def main(): +def main(audio_path=None): args = parser.parse_args() - captioning(args) + if audio_path is not None: + args.audio_path = audio_path + return captioning(args) def captioning(args): save_dir = f"exp/{args.framework}/{args.caption_type}/" @@ -74,6 +76,7 @@ def captioning(args): item = {"text":text,"time":time} inference[chunk] = item print(item) + return inference if __name__ == '__main__': main() diff --git a/melodytalk/dependencies/lpmc/music_captioning/model/bart.py b/melodytalk/dependencies/lpmc/music_captioning/model/bart.py index 308214c..a06bd60 100644 --- a/melodytalk/dependencies/lpmc/music_captioning/model/bart.py +++ b/melodytalk/dependencies/lpmc/music_captioning/model/bart.py @@ -4,7 +4,7 @@ import torch.nn as nn import torch.nn.functional as F import numpy as np -from lpmc.music_captioning.model.modules import AudioEncoder +from melodytalk.dependencies.lpmc.music_captioning.model.modules import AudioEncoder from transformers import BartForConditionalGeneration, BartTokenizer, BartConfig class BartCaptionModel(nn.Module): diff --git a/melodytalk/dependencies/transplayer/__init__.py b/melodytalk/dependencies/transplayer/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/melodytalk/dependencies/transplayer/inference.py b/melodytalk/dependencies/transplayer/inference.py new file mode 100644 index 0000000..8d6e120 --- /dev/null +++ b/melodytalk/dependencies/transplayer/inference.py @@ -0,0 +1,96 @@ +import librosa +import resampy + +def transform(filepath): + audio, sr = librosa.load(filepath) + if sr != 16000: + audio = resampy.resample(audio, sr, 16000) + cqt_representation = lr.cqt(audio, sr=sr, hop_length=256) + + cqt_magnitude = np.abs(cqt_representation) + + +import os +import argparse +import 
torch +import numpy as np +from math import ceil +from model import Generator + +device = 'cuda:0' + + +def pad_seq(x, base=32): + len_out = int(base * ceil(float(x.shape[0]) / base)) + len_pad = len_out - x.shape[0] + assert len_pad >= 0 + return np.pad(x, ((0, len_pad), (0, 0)), 'constant'), len_pad + + +def inference(input_file_path, + output_file_path, + org='piano', trg='piano', + cp_path=None): + G = Generator(dim_neck=32, + dim_emb=4, + dim_pre=512, + freq=32).eval().to(device) + if os.path.exists(cp_path): + save_info = torch.load(cp_path) + G.load_state_dict(save_info["model"]) + + # one-hot + ins_list = ['harp', 'trumpet', 'epiano', 'viola', 'piano', 'guitar', 'organ', 'flute'] + ins_org = org + ins_trg = trg + emb_org = ins_list.index(ins_org) + emb_trg = ins_list.index(ins_trg) + # emb_org = [i == ins_org for i in ins_list] + # emb_trg = [i == ins_trg for i in ins_list] + emb_org = torch.unsqueeze(torch.tensor(emb_org), dim=0).to(device) + emb_trg = torch.unsqueeze(torch.tensor(emb_trg), dim=0).to(device) + + x_org = np.log(np.load(config.feature_path).T)[:config.feature_len] + # x_org = np.load(config.spectrogram_path).T + x_org, len_pad = pad_seq(x_org) + x_org = torch.from_numpy(x_org[np.newaxis, :, :]).to(device) + + with torch.no_grad(): + _, x_identic_psnt, _ = G(x_org, emb_org, emb_org) + if len_pad == 0: + x_trg = x_identic_psnt[0, 0, :, :].cpu().numpy() + else: + x_trg = x_identic_psnt[0, 0, :-len_pad, :].cpu().numpy() + + np.save(os.path.basename(config.feature_path)[:-4] + "_" + ins_org + "_" + ins_org + ".npy", x_trg.T) + print("result saved.") + + with torch.no_grad(): + _, x_identic_psnt, _ = G(x_org, emb_org, emb_trg) + if len_pad == 0: + x_trg = x_identic_psnt[0, 0, :, :].cpu().numpy() + else: + x_trg = x_identic_psnt[0, 0, :-len_pad, :].cpu().numpy() + + np.save(os.path.basename(config.feature_path)[:-4] + "_" + ins_org + "_" + ins_trg + ".npy", x_trg.T) + print("result saved.") + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + + # Model configuration. + parser.add_argument('--lambda_cd', type=float, default=0, help='weight for hidden code loss') + # Training configuration. + parser.add_argument('--feature_path', type=str, default='../../data_syn/cropped/piano_all_00.wav_cqt.npy') + parser.add_argument('--feature_len', type=int, default=2400) + # parser.add_argument('--num_iters', type=int, default=1000000, help='number of total iterations') + # parser.add_argument('--len_crop', type=int, default=128, help='dataloader output sequence length') + + # Miscellaneous. 
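+    # NOTE: --cp_path should point at the pretrained TransPlayer (AutoVC-style)
+    # generator checkpoint; inference() only loads it when the file exists on disk.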
+ parser.add_argument('--cp_path', type=str, + default="../../autovc_cp/weights_log_cqt_down32_neck32_onehot4_withcross") + + config = parser.parse_args() + print(config) + inference(config) \ No newline at end of file diff --git a/melodytalk/dependencies/transplayer/model.py b/melodytalk/dependencies/transplayer/model.py new file mode 100644 index 0000000..0485c4a --- /dev/null +++ b/melodytalk/dependencies/transplayer/model.py @@ -0,0 +1,399 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np + + +class LinearNorm(torch.nn.Module): + def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'): + super(LinearNorm, self).__init__() + self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias) + + torch.nn.init.xavier_uniform_( + self.linear_layer.weight, + gain=torch.nn.init.calculate_gain(w_init_gain)) + + def forward(self, x): + return self.linear_layer(x) + + +class ConvNorm(torch.nn.Module): + def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, + padding=None, dilation=1, bias=True, w_init_gain='linear'): + super(ConvNorm, self).__init__() + if padding is None: + assert (kernel_size % 2 == 1) + padding = int(dilation * (kernel_size - 1) / 2) + + self.conv = torch.nn.Conv1d(in_channels, out_channels, + kernel_size=kernel_size, stride=stride, + padding=padding, dilation=dilation, + bias=bias) + + torch.nn.init.xavier_uniform_( + self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain)) + + def forward(self, signal): + conv_signal = self.conv(signal) + return conv_signal + + +class Embedder(nn.Module): + def __init__(self, dim_emb): + super(Embedder, self).__init__() + self.dim_emb = dim_emb + # one-hot + self.emb = nn.Embedding(8, dim_emb) + + def forward(self, onehot): # one-hot + return self.emb(onehot) + + +class Encoder(nn.Module): + """Encoder module: + """ + + def __init__(self, dim_neck, dim_emb, dim_pre, freq): + super(Encoder, self).__init__() + self.dim_neck = dim_neck + self.freq = freq + + convolutions = [] + for i in range(3): + conv_layer = nn.Sequential( + ConvNorm(84 + dim_emb if i == 0 else dim_pre, + dim_pre, + kernel_size=5, stride=1, + padding=2, + dilation=1, w_init_gain='relu'), + nn.BatchNorm1d(dim_pre)) + convolutions.append(conv_layer) + self.convolutions = nn.ModuleList(convolutions) + + self.lstm = nn.LSTM(dim_pre, dim_neck, 2, batch_first=True, bidirectional=True) + + def forward(self, x, c_org): + x = x.squeeze(1).transpose(2, 1) + + c_org = c_org.unsqueeze(-1).expand(-1, -1, x.size(-1)) + + x = torch.cat((x, c_org), dim=1) + + for conv in self.convolutions: + x = F.relu(conv(x)) + x = x.transpose(1, 2) + + self.lstm.flatten_parameters() + outputs, _ = self.lstm(x) + out_forward = outputs[:, :, :self.dim_neck] + out_backward = outputs[:, :, self.dim_neck:] + + codes = [] + for i in range(0, outputs.size(1), self.freq): + codes.append(torch.cat((out_forward[:, i + self.freq - 1, :], out_backward[:, i, :]), dim=-1)) + + return codes + + +class Decoder(nn.Module): + """Decoder module: + """ + + def __init__(self, dim_neck, dim_emb, dim_pre): + super(Decoder, self).__init__() + + self.lstm1 = nn.LSTM(dim_neck * 2 + dim_emb, dim_pre, 1, batch_first=True) + + convolutions = [] + for i in range(3): + conv_layer = nn.Sequential( + ConvNorm(dim_pre, + dim_pre, + kernel_size=5, stride=1, + padding=2, + dilation=1, w_init_gain='relu'), + nn.BatchNorm1d(dim_pre)) + convolutions.append(conv_layer) + self.convolutions = nn.ModuleList(convolutions) + + self.lstm2 = nn.LSTM(dim_pre, 1024, 2, 
batch_first=True) + + self.linear_projection = LinearNorm(1024, 84) + + def forward(self, x): + + # self.lstm1.flatten_parameters() + x, _ = self.lstm1(x) + x = x.transpose(1, 2) + + for conv in self.convolutions: + x = F.relu(conv(x)) + x = x.transpose(1, 2) + + outputs, _ = self.lstm2(x) + + decoder_output = self.linear_projection(outputs) + + return decoder_output + + +class Postnet(nn.Module): + """Postnet + - Five 1-d convolution with 512 channels and kernel size 5 + """ + + def __init__(self): + super(Postnet, self).__init__() + self.convolutions = nn.ModuleList() + + self.convolutions.append( + nn.Sequential( + ConvNorm(84, 512, + kernel_size=5, stride=1, + padding=2, + dilation=1, w_init_gain='tanh'), + nn.BatchNorm1d(512)) + ) + + for i in range(1, 5 - 1): + self.convolutions.append( + nn.Sequential( + ConvNorm(512, + 512, + kernel_size=5, stride=1, + padding=2, + dilation=1, w_init_gain='tanh'), + nn.BatchNorm1d(512)) + ) + + self.convolutions.append( + nn.Sequential( + ConvNorm(512, 84, + kernel_size=5, stride=1, + padding=2, + dilation=1, w_init_gain='linear'), + nn.BatchNorm1d(84)) + ) + + def forward(self, x): + for i in range(len(self.convolutions) - 1): + x = torch.tanh(self.convolutions[i](x)) + + x = self.convolutions[-1](x) + + return x + + +class Generator(nn.Module): + """Generator network.""" + + def __init__(self, dim_neck, dim_emb, dim_pre, freq): + super(Generator, self).__init__() + + self.embedder = Embedder(dim_emb) + self.encoder = Encoder(dim_neck, dim_emb, dim_pre, freq) + self.decoder = Decoder(dim_neck, dim_emb, dim_pre) + self.postnet = Postnet() + + def forward(self, x, c_org, c_trg): + # one-hot + c_org = self.embedder(c_org) + + codes = self.encoder(x, c_org) + if c_trg is None: # only encoder, don't decode to any target + return torch.cat(codes, dim=-1) + + tmp = [] + for code in codes: + tmp.append(code.unsqueeze(1).expand(-1, int(x.size(1) / len(codes)), -1)) + code_exp = torch.cat(tmp, dim=1) + + # one-hot + c_trg = self.embedder(c_trg) + + # encoder_outputs = torch.cat((code_exp, c_trg.unsqueeze(1).expand(-1,x.size(1),-1)), dim=-1) + encoder_outputs = torch.cat((code_exp, c_trg.unsqueeze(1).expand(-1, x.size(1), -1)), dim=-1) + + dec_outputs = self.decoder(encoder_outputs) + + postnet_outputs = self.postnet(dec_outputs.transpose(2, 1)) + postnet_outputs = dec_outputs + postnet_outputs.transpose(2, 1) + + dec_outputs = dec_outputs.unsqueeze(1) + postnet_outputs = postnet_outputs.unsqueeze(1) + + return dec_outputs, postnet_outputs, torch.cat(codes, dim=-1) + + +class ResBlock(nn.Module): + def __init__(self, dim, dilation=1, norm='in', activation='relu', pad_type='zero'): + super(ResBlock, self).__init__() + + model = [] + model += [ConvNorm(dim, dim, kernel_size=3, padding=1), nn.ReLU()] + model += [ConvNorm(dim, dim, kernel_size=3, padding=1), nn.ReLU()] + self.model = nn.Sequential(*model) + + def forward(self, x): + residual = x + out = self.model(x) + out += residual + return out + + +class Conv2dBlock(nn.Module): + def __init__(self, input_dim, output_dim, kernel_size, stride, + padding=0, dilation=1, norm='none', activation='relu', pad_type='zero'): + super(Conv2dBlock, self).__init__() + self.use_bias = True + # initialize padding + if pad_type == 'reflect': + self.pad = nn.ReflectionPad2d(padding) + elif pad_type == 'replicate': + self.pad = nn.ReplicationPad2d(padding) + elif pad_type == 'zero': + self.pad = nn.ZeroPad2d(padding) + else: + assert 0, "Unsupported padding type: {}".format(pad_type) + + # initialize normalization + norm_dim = 
output_dim + if norm == 'bn': + self.norm = nn.BatchNorm2d(norm_dim) + elif norm == 'in': + self.norm = nn.InstanceNorm2d(norm_dim) + elif norm == 'ln': + self.norm = LayerNorm(norm_dim) + elif norm == 'adain': + self.norm = AdaptiveInstanceNorm2d(norm_dim) + elif norm == 'none' or norm == 'spectral': + self.norm = None + else: + assert 0, "Unsupported normalization: {}".format(norm) + + # initialize activation + if activation == 'relu': + self.activation = nn.ReLU(inplace=True) + elif activation == 'lrelu': + self.activation = nn.LeakyReLU(0.2, inplace=True) + elif activation == 'prelu': + self.activation = nn.PReLU() + elif activation == 'selu': + self.activation = nn.SELU(inplace=True) + elif activation == 'tanh': + self.activation = nn.Tanh() + elif activation == 'none': + self.activation = None + else: + assert 0, "Unsupported activation: {}".format(activation) + + # initialize convolution + if norm == 'spectral': + self.conv = SpectralNorm( + nn.Conv2d(input_dim, output_dim, kernel_size, stride, dilation=dilation, bias=self.use_bias)) + else: + self.conv = nn.Conv2d(input_dim, output_dim, kernel_size, stride, dilation=dilation, bias=self.use_bias) + + def forward(self, x): + x = self.conv(self.pad(x)) + if self.norm: + x = self.norm(x) + if self.activation: + x = self.activation(x) + return x + + +class ConvTranspose2dBlock(nn.Module): + def __init__(self, input_dim, output_dim, kernel_size, stride, + padding=0, dilation=1, norm='none', activation='none', pad_type='zero'): + super(ConvTranspose2dBlock, self).__init__() + self.use_bias = True + # initialize padding + if pad_type == 'reflect': + self.pad = nn.ReflectionPad2d(padding) + elif pad_type == 'replicate': + self.pad = nn.ReplicationPad2d(padding) + elif pad_type == 'zero': + self.pad = nn.ZeroPad2d(padding) + else: + assert 0, "Unsupported padding type: {}".format(pad_type) + + # initialize normalization + norm_dim = output_dim + if norm == 'bn': + self.norm = nn.BatchNorm2d(norm_dim) + elif norm == 'in': + self.norm = nn.InstanceNorm2d(norm_dim) + elif norm == 'ln': + self.norm = LayerNorm(norm_dim) + elif norm == 'adain': + self.norm = AdaptiveInstanceNorm2d(norm_dim) + elif norm == 'none' or norm == 'spectral': + self.norm = None + else: + assert 0, "Unsupported normalization: {}".format(norm) + + # initialize activation + if activation == 'relu': + self.activation = nn.ReLU(inplace=True) + elif activation == 'none': + self.activation = None + else: + assert 0, "Unsupported activation: {}".format(activation) + + # initialize convolution + self.dconv = nn.ConvTranspose2d(input_dim, output_dim, kernel_size, stride, padding, bias=self.use_bias, + dilation=dilation) + + def forward(self, x): + x = self.dconv(x) + if self.norm: + x = self.norm(x) + if self.activation: + x = self.activation(x) + return x + + +class LinearBlock(nn.Module): + def __init__(self, input_dim, output_dim, norm='none', activation='relu'): + super(LinearBlock, self).__init__() + use_bias = True + # initialize fully connected layer + self.fc = nn.Linear(input_dim, output_dim, bias=use_bias) + + # initialize normalization + norm_dim = output_dim + if norm == 'bn': + self.norm = nn.BatchNorm1d(norm_dim) + elif norm == 'in': + self.norm = nn.InstanceNorm1d(norm_dim) + elif norm == 'ln': + self.norm = LayerNorm(norm_dim) + elif norm == 'none': + self.norm = None + else: + assert 0, "Unsupported normalization: {}".format(norm) + + # initialize activation + if activation == 'relu': + self.activation = nn.ReLU(inplace=True) + elif activation == 'lrelu': + 
self.activation = nn.LeakyReLU(0.2, inplace=True)
+        elif activation == 'prelu':
+            self.activation = nn.PReLU()
+        elif activation == 'selu':
+            self.activation = nn.SELU(inplace=True)
+        elif activation == 'tanh':
+            self.activation = nn.Tanh()
+        elif activation == 'none':
+            self.activation = None
+        else:
+            assert 0, "Unsupported activation: {}".format(activation)
+
+    def forward(self, x):
+        out = self.fc(x)
+        if self.norm:
+            out = self.norm(out)
+        if self.activation:
+            out = self.activation(out)
+        return out
\ No newline at end of file
diff --git a/melodytalk/main.py b/melodytalk/main.py
index 2fcef05..cfea79a 100644
--- a/melodytalk/main.py
+++ b/melodytalk/main.py
@@ -120,7 +120,11 @@ def __init__(self):
             "Text2MusicWithTitle": "cuda:0",
             "AddNewTrack": "cuda:0",
             "MusicInpainting": "cuda:0",
-            "Variation": "cuda:0",}
+            "Variation": "cuda:0",
+            "PitchShifting": "cuda:0",
+            "TimeStretching": "cuda:0",
+            "SingleSoundEffect": "cuda:0",
+            }
        template_dict = None # { "Text2MusicwithChord": "cuda:0"} # "Accompaniment": "cuda:0",
 
        print(f"Initializing MelodyTalk, load_dict={load_dict}, template_dict={template_dict}")
@@ -227,19 +231,24 @@ def clear_input_audio(self):
 
 
 with gr.Blocks(css="#chatbot .overflow-y-auto{height:500px}") as demo:
-    gr.Markdown(
-        """This is a demo to our work *MelodyTalk*.
-        """
-    )
+    gr.Markdown("""
+    ## MelodyTalk
+    ### MelodyTalk is a ChatGPT-based interface for making music loops. All supported tools are listed below.
+    ### Usage:
+    ### Step 1: Describe the music loop you want to make. You can specify the genre, instruments, BPM and mood in your text.
+    ### Step 2: You can fine-tune the generated music loop using the existing tools.
+    """)
 
     lang = gr.Radio(choices=['Chinese', 'English'], value=None, label='Language')
     chatbot = gr.Chatbot(elem_id="chatbot", label="MelodyTalk")
     state = gr.State([])
 
     with gr.Row(visible=False) as input_raws:
-        with gr.Column(scale=0.7):
+        with gr.Column(scale=0.55):
             txt = gr.Textbox(show_label=False, placeholder="Enter text and press enter, or upload an audio").style(
                 container=False)
+        with gr.Column(scale=0.15, min_width=0):
+            undo = gr.Button("Undo")
         with gr.Column(scale=0.15, min_width=0):
             clear = gr.Button("Clear")
         with gr.Column(scale=0.15, min_width=0):
@@ -253,6 +262,32 @@ def clear_input_audio(self):
         with gr.Column(scale=0.15, min_width=0):
             rec_submit = gr.Button("Submit")
 
+
+    gr.Markdown(
+        """| Task | Stage | Examples of text input | Backend models |
+        | --- | --- | --- | --- |
+        | Text to music | 1 | Generate a rock music loop with guitar and drums. | MusicGen |
+        | Drum pattern to music | 1 | Generate rock music with guitar based on this drum pattern. | MusicGen, CLAP |
+        | Impression to music | 1 | Generate a music loop that feels like the chorus of "Hey Jude". | ChatGPT, MusicGen |
+        | Stylistic rearrangement | 1 | Rearrange this music audio into jazz with a saxophone solo. | MusicGen |
+        | Music variation | 1 | Generate a music loop that sounds like this music. | VampNet |
+        | Add a track | 2 | Add a saxophone solo to this music loop. | MusicGen, CLAP |
+        | Remove a track | 2 | Remove the guitar from this music loop. | Demucs |
+        | Re-generation/inpainting | 2 | Re-generate the 3-5s part of the music loop. | VampNet |
+        | Pitch shifting | 2 | Shift this music by 3 semitones. | torchaudio |
+        | Speed changing | 2 | Speed up this music by a factor of 1.2. | torchaudio |
+        | Add sound effects | 2 | Add some reverb to the guitar solo. | pedalboard, automix-tools |
+        | Music captioning | N/A | Describe the current music loop. | LP-MusicCaps |
+        | * Replace instrument (unavailable) | 2 | Replace the guitar solo with piano. | Transplayer, automix-tools |
+        | * Timbre adjustment (unavailable) | 2 | Make the drums sound more metallic. | ChatGPT, pedalboard, automix-tools |
+        """
+    )
+
+    gr.Markdown("""
+    Currently, MelodyTalk does not yet support music content style transfer, such as 'make this music more relaxed.' Please wait for our future work.
+    """)
+
+
     lang.change(bot.init_agent, [lang], [input_raws, lang, txt, clear, record_raws])
     txt.submit(bot.run_text, [txt, state], [chatbot, state])
     txt.submit(lambda: "", None, txt)
@@ -265,5 +300,5 @@ def clear_input_audio(self):
     clear.click(lambda: [], None, chatbot)
     clear.click(lambda: [], None, state)
     clear.click(bot.clear_input_audio, None, rec_audio)
-    demo.launch(server_name="0.0.0.0", server_port=7860,
+    demo.launch(server_name="0.0.0.0", server_port=7862,
                 ssl_certfile="cert.pem", ssl_keyfile="key.pem", ssl_verify=False)
diff --git a/melodytalk/modules.py b/melodytalk/modules.py
index 153b2a7..3898c04 100644
--- a/melodytalk/modules.py
+++ b/melodytalk/modules.py
@@ -1,7 +1,10 @@
 from shutil import copyfile
 from dataclasses import dataclass
 
+import librosa
 import torch
+import torchaudio.functional
+import pedalboard
 
 # text2music
 from melodytalk.dependencies.audiocraft.models import MusicGen
@@ -13,11 +16,13 @@
 # Vampnet
 from melodytalk.dependencies.vampnet.interface import Interface
 from melodytalk.dependencies.vampnet.main import vamp
+# captioning
+from melodytalk.dependencies.lpmc.music_captioning.captioning import main as captioning
 
 from utils import *
 
 DURATION = 8
-GENERATION_CANDIDATE = 5
+GENERATION_CANDIDATE = 6
 
 # Initialze common models
 # musicgen_model = MusicGen.get_pretrained('large')
@@ -34,13 +39,16 @@
 
 # Vampnet
 interface = Interface(
-    coarse_ckpt="./models/vampnet/coarse.pth",
-    coarse2fine_ckpt="./models/vampnet/c2f.pth",
-    codec_ckpt="./models/vampnet/codec.pth",
-    wavebeat_ckpt="./models/wavebeat.pth",
+    coarse_ckpt="./dependencies/vampnet/models/vampnet/coarse.pth",
+    coarse2fine_ckpt="./dependencies/vampnet/models/vampnet/c2f.pth",
+    codec_ckpt="./dependencies/vampnet/models/vampnet/codec.pth",
+    wavebeat_ckpt="./dependencies/vampnet/models/wavebeat.pth",
     device="cuda" if torch.cuda.is_available() else "cpu",
 )
 
+# captioning model
+
+
 @dataclass
 class GlobalAttributes(object):
     # metadata
@@ -114,16 +122,16 @@ def __init__(self, device):
 
     @prompts(
         name="Generate music from user input when the input is a title of music",
-        description="useful if you want to generate music which is silimar and save it to a file."
-                    "like: generate music of love pop song, or generate music with piano and violin."
-                    "The input to this tool should be a comma separated string of two, "
-                    "representing the text description and the title."
+        description="useful if you want to generate music that feels like a real, existing song."
+                    "like: generate music that feels like 'hey jude', or generate music similar to 'let it be'."
+                    "The input to this tool should be a string, "
+                    "representing the music title."
) def inference(self, inputs): - text, title = inputs.split(",")[0].strip(), inputs.split(",")[1].strip() + title = inputs music_filename = os.path.join("music", f"{title}.wav") - text = music_title_to_description(text) # using chatGPT's knowledge base to convert title to description + text = music_title_to_description(title) # using chatGPT's knowledge base to convert title to description attribute_table.descriptions = text text = description_to_attributes(text) # convert text to attributes wav = self.model.generate([text], progress=False) @@ -182,10 +190,10 @@ def inference(self, inputs): music_filename, text = inputs.split(",")[0].strip(), inputs.split(",")[1].strip() text = description_to_attributes(text) print(f"Generating music from text with drum condition, Input text: {text}, Drum: {music_filename}.") - updated_music_filename = get_new_audio_name(music_filename, func_name="with_drum") + updated_music_filename = get_new_audio_name(music_filename, func_name="withdrum") drum, sr = torchaudio.load(music_filename) self.model.set_generation_params(duration=35) - wav = self.model.generate_continuation(prompt=drum[None].expand(GENERATION_CANDIDATE, -1, -1), prompt_sr=sr, + wav = self.model.generate_continuation(prompt=drum[None].expand(GENERATION_CANDIDATE, -1, -1), prompt_sample_rate=sr, descriptions=[text] * GENERATION_CANDIDATE, progress=False) self.model.set_generation_params(duration=DURATION) # cut drum prompt @@ -229,6 +237,7 @@ def inference(self, inputs): # select the best one by CLAP scores print(f"CLAP post filter for {len(splitted_audios)} candidates.") best_wav, _ = CLAP_post_filter(CLAP_model, attribute_table.descriptions, splitted_audios.cuda(), self.model.sample_rate) + best_wav = torch.from_numpy(librosa.effects.trim(best_wav.cpu().numpy())[0]) audio_write(updated_music_filename[:-4], best_wav.cpu(), self.model.sample_rate, strategy="loudness", loudness_compressor=True) print(f"\nProcessed AddNewTrack, Output Music: {updated_music_filename}.") @@ -334,7 +343,12 @@ def __init__(self): ) def inference(self, inputs): - pass + music_filename = inputs.strip() + print(f"Captioning the current music, Input Music: {music_filename}.") + captions = captioning(music_filename) + captions_text = captions[0]["text"] + print(f"\nProcessed MusicCaptioning, Output Captions: {captions_text}.") + return captions_text # class Text2MusicwithChord(object): @@ -375,7 +389,59 @@ def inference(self, inputs): # print(f"\nProcessed Text2Music, Input Text: {preprocessed_input}, Output Music: {music_filename}.") # return music_filename +class PitchShifting(object): + def __init__(self, device): + print("Initializing PitchShifting") + self.device = device + + @prompts( + name="Shift the pitch of the given music.", + description="useful if you want to shift the pitch of a music." + "Like: shift the pitch of this music by 3 semitones." + "The input to this tool should be a comma separated string of two, " + "representing the music_filename and the pitch shift value." 
+ ) + def inference(self, inputs): + music_filename, pitch_shift_value = inputs.split(",")[0].strip(), int(inputs.split(",")[1].strip()) + print(f"Shifting the pitch of the given music, Input Music: {music_filename}, Pitch Shift Value: {pitch_shift_value}.") + updated_music_filename = get_new_audio_name(music_filename, func_name="pitchshifting") + # load + wav, sr = torchaudio.load(music_filename) + # shift + wav = torchaudio.functional.pitch_shift(wav, sr, pitch_shift_value) + # write + audio_write(updated_music_filename[:-4], + wav.cpu(), sr, strategy="loudness", loudness_compressor=True) + print(f"\nProcessed PitchShifting, Output Music: {updated_music_filename}.") + return updated_music_filename + +class TimeStretching(object): + def __init__(self, device): + print("Initializing TimeStretching") + self.device = device + + @prompts( + name="Stretch the time of the given music.", + description="useful if you want to stretch the time of a music." + "Like: stretch the time of this music by 1.5." + "The input to this tool should be a comma separated string of two, " + "representing the music_filename and the time stretch value." + ) + + def inference(self, inputs): + music_filename, time_stretch_value = inputs.split(",")[0].strip(), float(inputs.split(",")[1].strip()) + print(f"Stretching the time of the given music, Input Music: {music_filename}, Time Stretch Value: {time_stretch_value}.") + updated_music_filename = get_new_audio_name(music_filename, func_name="timestretching") + # load + wav, sr = torchaudio.load(music_filename) + # stretch + wav = torchaudio.functional.speed(wav, sr, time_stretch_value)[0] + # write + audio_write(updated_music_filename[:-4], + wav.cpu(), sr, strategy="loudness", loudness_compressor=True) + print(f"\nProcessed TimeStretching, Output Music: {updated_music_filename}.") + return updated_music_filename class MusicInpainting(object): def __init__(self, device): @@ -394,7 +460,7 @@ def __init__(self, device): def inference(self, inputs): music_filename, start_time, end_time = inputs.split(",")[0].strip(), inputs.split(",")[1].strip(), inputs.split(",")[2].strip() print(f"Inpainting a specific time region of the given music, Input Music: {music_filename}, Start Time: {start_time}, End Time: {end_time}.") - updated_music_filename = get_new_audio_name(music_filename, func_name="inpainting_" + start_time + "_" + end_time) + updated_music_filename = get_new_audio_name(music_filename, func_name="inpainting") p_track, sr = torchaudio.load(music_filename) audio_length_in_second = p_track.shape[-1] / sr if float(end_time) > audio_length_in_second: @@ -434,34 +500,63 @@ def inference(self, inputs): print(f"\nProcessed Variation, Output Music: {updated_music_filename}.") return updated_music_filename -# class Accompaniment(object): -# template_model = True -# def __init__(self, Text2MusicWithMelody, ExtractTrack, SimpleTracksMixing): -# print("Initializing Accompaniment") -# self.Text2MusicWithMelody = Text2MusicWithMelody -# self.ExtractTrack = ExtractTrack -# self.SimpleTracksMixing = SimpleTracksMixing +class SingleSoundEffect(object): + def __init__(self, device): + print("Initializing SingleSoundEffect") + self.device = device + self.interface = interface + + @prompts( + name="Add a single sound effect to the given music.", + description="useful if you want to add a single sound effect, like reverb, high pass filter or chorus to the given music." + "like: add a reverb of recording studio to this music." 
+ "The input to this tool should be a comma separated string of two, " + "representing the music_filename and the original user message." + ) + + def inference(self, inputs): + music_filename, user_message = inputs.split(",")[0].strip(), inputs.split(",")[1].strip() + print(f"Add a single sound effect to the given music, Input Music: {music_filename}, Sound Effect Name: {user_message}.") + updated_music_filename = get_new_audio_name(music_filename, func_name="single_sound_effect") + sound_effect = add_single_sound_effect(user_message) + my_pedalboard = pedalboard.Pedalboard() + my_pedalboard.append(eval(sound_effect)) + input_audio, sr = torchaudio.load(music_filename) + output_audio = my_pedalboard(input_audio.numpy(), sample_rate=sr) + audio_write(updated_music_filename[:-4], + output_audio, sr, strategy="loudness", loudness_compressor=True) + print(f"\nProcessed SingleSoundEffect, Output Music: {updated_music_filename}.") + return updated_music_filename + + +# class TimbreTransfer(object): +# def __init__(self, device): +# print("Initializing TimbreTransfer") +# self.device = device +# self.interface = interface # -# @prompts( -# name="Generate accompaniment music from user input text, keeping the given melody or track", -# description="useful if you want to style transfer or remix music from a user input text with a given melody." -# "Unlike Text2MusicWithMelody, this tool will keep the given melody track instead of re-generate it." -# "Note that the user must assign a track (it must be one of `vocals`, `drums`, `bass`, `guitar`, `piano` or `other`) to keep." -# "like: keep the guitar track and remix the given music with text description, " -# "or generate accompaniment as text described with the given vocal track." -# "The input to this tool should be a comma separated string of three, " -# "representing the music_filename, track name, and the text description." -# ) +# @prompts( +# name="Transfer the timbre of the given music to another music.", +# description="useful if you want to transfer the timbre of the given music to another music." +# "like: transfer the timbre of this music to another music." +# "The input to this tool should be a comma separated string of two, " +# "representing the music_filename and the original user message." 
+# ) # # def inference(self, inputs): -# music_filename, track_name, text = inputs.split(",")[0].strip(), inputs.split(",")[1].strip(), inputs.split(",")[2].strip() -# print(f"Generating music from text with accompaniment condition, Input Text: {text}, Previous music: {music_filename}, Track: {track_name}.") -# # separate the track -# updated_main_track = self.ExtractTrack.inference(f"{music_filename}, {track_name}, extract") -# # generate music -# updated_new_music = self.Text2MusicWithMelody.inference(f"{updated_main_track}, {text}") -# # remove the track in accompaniment -# updated_accompaniment = self.ExtractTrack.inference(f"{updated_new_music}, {track_name}, remove") -# # mix -# updated_music_filename = self.SimpleTracksMixing.inference(f"{updated_main_track}, {updated_accompaniment}") -# return updated_music_filename \ No newline at end of file +# music_filename, user_message = inputs.split(",")[0].strip(), inputs.split(",")[1].strip() +# print(f"Transfer the timbre of the given music to another music, Input Music: {music_filename}, Target Music: {user_message}.") +# updated_music_filename = get_new_audio_name(music_filename, func_name="timbre_transfer") +# target_music_filename = get_new_audio_name(user_message, func_name="timbre_transfer") +# # load +# wav, sr = torchaudio.load(music_filename) +# target_wav, target_sr = torchaudio.load(user_message) +# # stretch +# wav = torchaudio.functional.time_stretch(wav, sr, target_sr/sr)[0] +# # write +# audio_write(updated_music_filename[:-4], +# wav.cpu(), sr, strategy="loudness", loudness_compressor=True) +# audio_write(target_music_filename[:-4], +# target_wav.cpu(), target_sr, strategy="loudness", loudness_compressor=True) +# print(f"\nProcessed TimbreTransfer, Output Music: {updated_music_filename}.") +# return updated_music_filename diff --git a/melodytalk/utils.py b/melodytalk/utils.py index 7e9b6af..8f74947 100644 --- a/melodytalk/utils.py +++ b/melodytalk/utils.py @@ -301,5 +301,43 @@ def split_audio_tensor_by_downbeats(input_audio_batch: torch.Tensor, sr: int = 3 return segments +def add_single_sound_effect(input: str) -> str: + openai_prompt = f"""You are asked to pick the most appropriate one of the APIs below to achieve the desired sound effects. You MUST loyally only assign the existing parameters to fine-tune the function. If you use the default param value, skip it. + + 1. Guitar-style effects: + - Chorus(rate_hz: float = 1.0, depth: float = 0.25, centre_delay_ms: float = 7.0, feedback: float = 0.0, mix: float = 0.5); + - Distortion(drive_db: float = 25); + - Phaser(rate_hz: float = 1.0, depth: float = 0.5, centre_frequency_hz: float = 1300.0, feedback: float = 0.0, mix: float = 0.5); + - Clipping(threshold_db: float = -6.0); + 2. Loudness and dynamic range effects: + - Compressor(threshold_db: float = 0, ratio: float = 1, attack_ms: float = 1.0, release_ms: float = 100); + - Gain(gain_db: float = 1.0); + - Limiter(threshold_db: float = -10.0, release_ms: float = 100.0); + 3. Equalizers and filters: + - HighpassFilter(cutoff_frequency_hz: float = 50); + - LadderFilter(mode: Mode = Mode.LPF12, cutoff_hz: float = 200, resonance: float = 0, drive: float = 1.0); + - LowpassFilter(cutoff_frequency_hz: float = 50); + 4. 
Spatial effects: + - Convolution(impulse_response_filename: str, mix: float = 1.0); + - Delay(delay_seconds: float = 0.5, feedback: float = 0.0, mix: float = 0.5); + - Reverb(room_size: float = 0.5, damping: float = 0.5, wet_level: float = 0.33, dry_level: float = 0.4, width: float = 1.0, freeze_mode: float = 0.0); + + Let us think step by step. + + Q: I want to use a 200hz highpass filter to this audio. + A: pedalboard.HighpassFilter(cutoff_frequency_hz=200); + + Q: {input}. + A: """ + response = openai.Completion.create( + model="text-davinci-003", + prompt=openai_prompt, + temperature=0, + max_tokens=100, + top_p=1, + frequency_penalty=0.0, + presence_penalty=0.0, + ) + return response.choices[0].text \ No newline at end of file
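
A minimal usage sketch (not part of the patch) of how the effect expression returned by add_single_sound_effect() is applied, mirroring SingleSoundEffect.inference() in melodytalk/modules.py. The soundfile dependency, the function name apply_effect_string, and the file paths are illustrative assumptions:

    import pedalboard
    import soundfile as sf

    def apply_effect_string(in_path: str, out_path: str,
                            effect_str: str = "pedalboard.Reverb(room_size=0.5)") -> None:
        # Load audio as float32; soundfile returns (num_frames, num_channels).
        audio, sr = sf.read(in_path, dtype="float32")
        board = pedalboard.Pedalboard()
        # effect_str is the Python expression produced by the LLM prompt,
        # e.g. "pedalboard.HighpassFilter(cutoff_frequency_hz=200)".
        board.append(eval(effect_str))
        # pedalboard processes (channels, samples), so transpose in and out.
        processed = board(audio.T, sample_rate=sr)
        sf.write(out_path, processed.T, sr)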