From 18329bad34dad1ad1f6e977559cfec2978aed0e2 Mon Sep 17 00:00:00 2001 From: Yixiao Zhang Date: Fri, 25 Aug 2023 18:44:02 +0900 Subject: [PATCH] update --- melodytalk/dependencies/lpmc/__init__.py | 0 .../lpmc/music_captioning/__init__.py | 0 .../lpmc/music_captioning/captioning.py | 7 +- .../lpmc/music_captioning/model/bart.py | 2 +- .../dependencies/transplayer/__init__.py | 0 .../dependencies/transplayer/inference.py | 96 +++++ melodytalk/dependencies/transplayer/model.py | 399 ++++++++++++++++++ melodytalk/main.py | 49 ++- melodytalk/modules.py | 181 ++++++-- melodytalk/utils.py | 38 ++ 10 files changed, 719 insertions(+), 53 deletions(-) create mode 100644 melodytalk/dependencies/lpmc/__init__.py create mode 100644 melodytalk/dependencies/lpmc/music_captioning/__init__.py create mode 100644 melodytalk/dependencies/transplayer/__init__.py create mode 100644 melodytalk/dependencies/transplayer/inference.py create mode 100644 melodytalk/dependencies/transplayer/model.py diff --git a/melodytalk/dependencies/lpmc/__init__.py b/melodytalk/dependencies/lpmc/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/melodytalk/dependencies/lpmc/music_captioning/__init__.py b/melodytalk/dependencies/lpmc/music_captioning/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/melodytalk/dependencies/lpmc/music_captioning/captioning.py b/melodytalk/dependencies/lpmc/music_captioning/captioning.py index 6d14c5b..089b80d 100644 --- a/melodytalk/dependencies/lpmc/music_captioning/captioning.py +++ b/melodytalk/dependencies/lpmc/music_captioning/captioning.py @@ -45,9 +45,11 @@ def get_audio(audio_path, duration=10, target_sr=16000): audio = torch.from_numpy(np.stack(np.split(audio[:ceil * n_samples], ceil)).astype('float32')) return audio -def main(): +def main(audio_path=None): args = parser.parse_args() - captioning(args) + if audio_path is not None: + args.audio_path = audio_path + return captioning(args) def captioning(args): save_dir = f"exp/{args.framework}/{args.caption_type}/" @@ -74,6 +76,7 @@ def captioning(args): item = {"text":text,"time":time} inference[chunk] = item print(item) + return inference if __name__ == '__main__': main() diff --git a/melodytalk/dependencies/lpmc/music_captioning/model/bart.py b/melodytalk/dependencies/lpmc/music_captioning/model/bart.py index 308214c..a06bd60 100644 --- a/melodytalk/dependencies/lpmc/music_captioning/model/bart.py +++ b/melodytalk/dependencies/lpmc/music_captioning/model/bart.py @@ -4,7 +4,7 @@ import torch.nn as nn import torch.nn.functional as F import numpy as np -from lpmc.music_captioning.model.modules import AudioEncoder +from melodytalk.dependencies.lpmc.music_captioning.model.modules import AudioEncoder from transformers import BartForConditionalGeneration, BartTokenizer, BartConfig class BartCaptionModel(nn.Module): diff --git a/melodytalk/dependencies/transplayer/__init__.py b/melodytalk/dependencies/transplayer/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/melodytalk/dependencies/transplayer/inference.py b/melodytalk/dependencies/transplayer/inference.py new file mode 100644 index 0000000..8d6e120 --- /dev/null +++ b/melodytalk/dependencies/transplayer/inference.py @@ -0,0 +1,96 @@ +import librosa +import resampy + +def transform(filepath): + audio, sr = librosa.load(filepath) + if sr != 16000: + audio = resampy.resample(audio, sr, 16000) + cqt_representation = lr.cqt(audio, sr=sr, hop_length=256) + + cqt_magnitude = np.abs(cqt_representation) + + +import os +import argparse +import 
torch +import numpy as np +from math import ceil +from model import Generator + +device = 'cuda:0' + + +def pad_seq(x, base=32): + len_out = int(base * ceil(float(x.shape[0]) / base)) + len_pad = len_out - x.shape[0] + assert len_pad >= 0 + return np.pad(x, ((0, len_pad), (0, 0)), 'constant'), len_pad + + +def inference(input_file_path, + output_file_path, + org='piano', trg='piano', + cp_path=None): + G = Generator(dim_neck=32, + dim_emb=4, + dim_pre=512, + freq=32).eval().to(device) + if os.path.exists(cp_path): + save_info = torch.load(cp_path) + G.load_state_dict(save_info["model"]) + + # one-hot + ins_list = ['harp', 'trumpet', 'epiano', 'viola', 'piano', 'guitar', 'organ', 'flute'] + ins_org = org + ins_trg = trg + emb_org = ins_list.index(ins_org) + emb_trg = ins_list.index(ins_trg) + # emb_org = [i == ins_org for i in ins_list] + # emb_trg = [i == ins_trg for i in ins_list] + emb_org = torch.unsqueeze(torch.tensor(emb_org), dim=0).to(device) + emb_trg = torch.unsqueeze(torch.tensor(emb_trg), dim=0).to(device) + + x_org = np.log(np.load(config.feature_path).T)[:config.feature_len] + # x_org = np.load(config.spectrogram_path).T + x_org, len_pad = pad_seq(x_org) + x_org = torch.from_numpy(x_org[np.newaxis, :, :]).to(device) + + with torch.no_grad(): + _, x_identic_psnt, _ = G(x_org, emb_org, emb_org) + if len_pad == 0: + x_trg = x_identic_psnt[0, 0, :, :].cpu().numpy() + else: + x_trg = x_identic_psnt[0, 0, :-len_pad, :].cpu().numpy() + + np.save(os.path.basename(config.feature_path)[:-4] + "_" + ins_org + "_" + ins_org + ".npy", x_trg.T) + print("result saved.") + + with torch.no_grad(): + _, x_identic_psnt, _ = G(x_org, emb_org, emb_trg) + if len_pad == 0: + x_trg = x_identic_psnt[0, 0, :, :].cpu().numpy() + else: + x_trg = x_identic_psnt[0, 0, :-len_pad, :].cpu().numpy() + + np.save(os.path.basename(config.feature_path)[:-4] + "_" + ins_org + "_" + ins_trg + ".npy", x_trg.T) + print("result saved.") + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + + # Model configuration. + parser.add_argument('--lambda_cd', type=float, default=0, help='weight for hidden code loss') + # Training configuration. + parser.add_argument('--feature_path', type=str, default='../../data_syn/cropped/piano_all_00.wav_cqt.npy') + parser.add_argument('--feature_len', type=int, default=2400) + # parser.add_argument('--num_iters', type=int, default=1000000, help='number of total iterations') + # parser.add_argument('--len_crop', type=int, default=128, help='dataloader output sequence length') + + # Miscellaneous. 
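+    # NOTE: --cp_path should point at the pretrained TransPlayer (AutoVC-style)
+    # generator checkpoint; inference() only loads it when the file exists on disk.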
+ parser.add_argument('--cp_path', type=str, + default="../../autovc_cp/weights_log_cqt_down32_neck32_onehot4_withcross") + + config = parser.parse_args() + print(config) + inference(config) \ No newline at end of file diff --git a/melodytalk/dependencies/transplayer/model.py b/melodytalk/dependencies/transplayer/model.py new file mode 100644 index 0000000..0485c4a --- /dev/null +++ b/melodytalk/dependencies/transplayer/model.py @@ -0,0 +1,399 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np + + +class LinearNorm(torch.nn.Module): + def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'): + super(LinearNorm, self).__init__() + self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias) + + torch.nn.init.xavier_uniform_( + self.linear_layer.weight, + gain=torch.nn.init.calculate_gain(w_init_gain)) + + def forward(self, x): + return self.linear_layer(x) + + +class ConvNorm(torch.nn.Module): + def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, + padding=None, dilation=1, bias=True, w_init_gain='linear'): + super(ConvNorm, self).__init__() + if padding is None: + assert (kernel_size % 2 == 1) + padding = int(dilation * (kernel_size - 1) / 2) + + self.conv = torch.nn.Conv1d(in_channels, out_channels, + kernel_size=kernel_size, stride=stride, + padding=padding, dilation=dilation, + bias=bias) + + torch.nn.init.xavier_uniform_( + self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain)) + + def forward(self, signal): + conv_signal = self.conv(signal) + return conv_signal + + +class Embedder(nn.Module): + def __init__(self, dim_emb): + super(Embedder, self).__init__() + self.dim_emb = dim_emb + # one-hot + self.emb = nn.Embedding(8, dim_emb) + + def forward(self, onehot): # one-hot + return self.emb(onehot) + + +class Encoder(nn.Module): + """Encoder module: + """ + + def __init__(self, dim_neck, dim_emb, dim_pre, freq): + super(Encoder, self).__init__() + self.dim_neck = dim_neck + self.freq = freq + + convolutions = [] + for i in range(3): + conv_layer = nn.Sequential( + ConvNorm(84 + dim_emb if i == 0 else dim_pre, + dim_pre, + kernel_size=5, stride=1, + padding=2, + dilation=1, w_init_gain='relu'), + nn.BatchNorm1d(dim_pre)) + convolutions.append(conv_layer) + self.convolutions = nn.ModuleList(convolutions) + + self.lstm = nn.LSTM(dim_pre, dim_neck, 2, batch_first=True, bidirectional=True) + + def forward(self, x, c_org): + x = x.squeeze(1).transpose(2, 1) + + c_org = c_org.unsqueeze(-1).expand(-1, -1, x.size(-1)) + + x = torch.cat((x, c_org), dim=1) + + for conv in self.convolutions: + x = F.relu(conv(x)) + x = x.transpose(1, 2) + + self.lstm.flatten_parameters() + outputs, _ = self.lstm(x) + out_forward = outputs[:, :, :self.dim_neck] + out_backward = outputs[:, :, self.dim_neck:] + + codes = [] + for i in range(0, outputs.size(1), self.freq): + codes.append(torch.cat((out_forward[:, i + self.freq - 1, :], out_backward[:, i, :]), dim=-1)) + + return codes + + +class Decoder(nn.Module): + """Decoder module: + """ + + def __init__(self, dim_neck, dim_emb, dim_pre): + super(Decoder, self).__init__() + + self.lstm1 = nn.LSTM(dim_neck * 2 + dim_emb, dim_pre, 1, batch_first=True) + + convolutions = [] + for i in range(3): + conv_layer = nn.Sequential( + ConvNorm(dim_pre, + dim_pre, + kernel_size=5, stride=1, + padding=2, + dilation=1, w_init_gain='relu'), + nn.BatchNorm1d(dim_pre)) + convolutions.append(conv_layer) + self.convolutions = nn.ModuleList(convolutions) + + self.lstm2 = nn.LSTM(dim_pre, 1024, 2, 
batch_first=True) + + self.linear_projection = LinearNorm(1024, 84) + + def forward(self, x): + + # self.lstm1.flatten_parameters() + x, _ = self.lstm1(x) + x = x.transpose(1, 2) + + for conv in self.convolutions: + x = F.relu(conv(x)) + x = x.transpose(1, 2) + + outputs, _ = self.lstm2(x) + + decoder_output = self.linear_projection(outputs) + + return decoder_output + + +class Postnet(nn.Module): + """Postnet + - Five 1-d convolution with 512 channels and kernel size 5 + """ + + def __init__(self): + super(Postnet, self).__init__() + self.convolutions = nn.ModuleList() + + self.convolutions.append( + nn.Sequential( + ConvNorm(84, 512, + kernel_size=5, stride=1, + padding=2, + dilation=1, w_init_gain='tanh'), + nn.BatchNorm1d(512)) + ) + + for i in range(1, 5 - 1): + self.convolutions.append( + nn.Sequential( + ConvNorm(512, + 512, + kernel_size=5, stride=1, + padding=2, + dilation=1, w_init_gain='tanh'), + nn.BatchNorm1d(512)) + ) + + self.convolutions.append( + nn.Sequential( + ConvNorm(512, 84, + kernel_size=5, stride=1, + padding=2, + dilation=1, w_init_gain='linear'), + nn.BatchNorm1d(84)) + ) + + def forward(self, x): + for i in range(len(self.convolutions) - 1): + x = torch.tanh(self.convolutions[i](x)) + + x = self.convolutions[-1](x) + + return x + + +class Generator(nn.Module): + """Generator network.""" + + def __init__(self, dim_neck, dim_emb, dim_pre, freq): + super(Generator, self).__init__() + + self.embedder = Embedder(dim_emb) + self.encoder = Encoder(dim_neck, dim_emb, dim_pre, freq) + self.decoder = Decoder(dim_neck, dim_emb, dim_pre) + self.postnet = Postnet() + + def forward(self, x, c_org, c_trg): + # one-hot + c_org = self.embedder(c_org) + + codes = self.encoder(x, c_org) + if c_trg is None: # only encoder, don't decode to any target + return torch.cat(codes, dim=-1) + + tmp = [] + for code in codes: + tmp.append(code.unsqueeze(1).expand(-1, int(x.size(1) / len(codes)), -1)) + code_exp = torch.cat(tmp, dim=1) + + # one-hot + c_trg = self.embedder(c_trg) + + # encoder_outputs = torch.cat((code_exp, c_trg.unsqueeze(1).expand(-1,x.size(1),-1)), dim=-1) + encoder_outputs = torch.cat((code_exp, c_trg.unsqueeze(1).expand(-1, x.size(1), -1)), dim=-1) + + dec_outputs = self.decoder(encoder_outputs) + + postnet_outputs = self.postnet(dec_outputs.transpose(2, 1)) + postnet_outputs = dec_outputs + postnet_outputs.transpose(2, 1) + + dec_outputs = dec_outputs.unsqueeze(1) + postnet_outputs = postnet_outputs.unsqueeze(1) + + return dec_outputs, postnet_outputs, torch.cat(codes, dim=-1) + + +class ResBlock(nn.Module): + def __init__(self, dim, dilation=1, norm='in', activation='relu', pad_type='zero'): + super(ResBlock, self).__init__() + + model = [] + model += [ConvNorm(dim, dim, kernel_size=3, padding=1), nn.ReLU()] + model += [ConvNorm(dim, dim, kernel_size=3, padding=1), nn.ReLU()] + self.model = nn.Sequential(*model) + + def forward(self, x): + residual = x + out = self.model(x) + out += residual + return out + + +class Conv2dBlock(nn.Module): + def __init__(self, input_dim, output_dim, kernel_size, stride, + padding=0, dilation=1, norm='none', activation='relu', pad_type='zero'): + super(Conv2dBlock, self).__init__() + self.use_bias = True + # initialize padding + if pad_type == 'reflect': + self.pad = nn.ReflectionPad2d(padding) + elif pad_type == 'replicate': + self.pad = nn.ReplicationPad2d(padding) + elif pad_type == 'zero': + self.pad = nn.ZeroPad2d(padding) + else: + assert 0, "Unsupported padding type: {}".format(pad_type) + + # initialize normalization + norm_dim = 
output_dim + if norm == 'bn': + self.norm = nn.BatchNorm2d(norm_dim) + elif norm == 'in': + self.norm = nn.InstanceNorm2d(norm_dim) + elif norm == 'ln': + self.norm = LayerNorm(norm_dim) + elif norm == 'adain': + self.norm = AdaptiveInstanceNorm2d(norm_dim) + elif norm == 'none' or norm == 'spectral': + self.norm = None + else: + assert 0, "Unsupported normalization: {}".format(norm) + + # initialize activation + if activation == 'relu': + self.activation = nn.ReLU(inplace=True) + elif activation == 'lrelu': + self.activation = nn.LeakyReLU(0.2, inplace=True) + elif activation == 'prelu': + self.activation = nn.PReLU() + elif activation == 'selu': + self.activation = nn.SELU(inplace=True) + elif activation == 'tanh': + self.activation = nn.Tanh() + elif activation == 'none': + self.activation = None + else: + assert 0, "Unsupported activation: {}".format(activation) + + # initialize convolution + if norm == 'spectral': + self.conv = SpectralNorm( + nn.Conv2d(input_dim, output_dim, kernel_size, stride, dilation=dilation, bias=self.use_bias)) + else: + self.conv = nn.Conv2d(input_dim, output_dim, kernel_size, stride, dilation=dilation, bias=self.use_bias) + + def forward(self, x): + x = self.conv(self.pad(x)) + if self.norm: + x = self.norm(x) + if self.activation: + x = self.activation(x) + return x + + +class ConvTranspose2dBlock(nn.Module): + def __init__(self, input_dim, output_dim, kernel_size, stride, + padding=0, dilation=1, norm='none', activation='none', pad_type='zero'): + super(ConvTranspose2dBlock, self).__init__() + self.use_bias = True + # initialize padding + if pad_type == 'reflect': + self.pad = nn.ReflectionPad2d(padding) + elif pad_type == 'replicate': + self.pad = nn.ReplicationPad2d(padding) + elif pad_type == 'zero': + self.pad = nn.ZeroPad2d(padding) + else: + assert 0, "Unsupported padding type: {}".format(pad_type) + + # initialize normalization + norm_dim = output_dim + if norm == 'bn': + self.norm = nn.BatchNorm2d(norm_dim) + elif norm == 'in': + self.norm = nn.InstanceNorm2d(norm_dim) + elif norm == 'ln': + self.norm = LayerNorm(norm_dim) + elif norm == 'adain': + self.norm = AdaptiveInstanceNorm2d(norm_dim) + elif norm == 'none' or norm == 'spectral': + self.norm = None + else: + assert 0, "Unsupported normalization: {}".format(norm) + + # initialize activation + if activation == 'relu': + self.activation = nn.ReLU(inplace=True) + elif activation == 'none': + self.activation = None + else: + assert 0, "Unsupported activation: {}".format(activation) + + # initialize convolution + self.dconv = nn.ConvTranspose2d(input_dim, output_dim, kernel_size, stride, padding, bias=self.use_bias, + dilation=dilation) + + def forward(self, x): + x = self.dconv(x) + if self.norm: + x = self.norm(x) + if self.activation: + x = self.activation(x) + return x + + +class LinearBlock(nn.Module): + def __init__(self, input_dim, output_dim, norm='none', activation='relu'): + super(LinearBlock, self).__init__() + use_bias = True + # initialize fully connected layer + self.fc = nn.Linear(input_dim, output_dim, bias=use_bias) + + # initialize normalization + norm_dim = output_dim + if norm == 'bn': + self.norm = nn.BatchNorm1d(norm_dim) + elif norm == 'in': + self.norm = nn.InstanceNorm1d(norm_dim) + elif norm == 'ln': + self.norm = LayerNorm(norm_dim) + elif norm == 'none': + self.norm = None + else: + assert 0, "Unsupported normalization: {}".format(norm) + + # initialize activation + if activation == 'relu': + self.activation = nn.ReLU(inplace=True) + elif activation == 'lrelu': + 
self.activation = nn.LeakyReLU(0.2, inplace=True)
+        elif activation == 'prelu':
+            self.activation = nn.PReLU()
+        elif activation == 'selu':
+            self.activation = nn.SELU(inplace=True)
+        elif activation == 'tanh':
+            self.activation = nn.Tanh()
+        elif activation == 'none':
+            self.activation = None
+        else:
+            assert 0, "Unsupported activation: {}".format(activation)
+
+    def forward(self, x):
+        out = self.fc(x)
+        if self.norm:
+            out = self.norm(out)
+        if self.activation:
+            out = self.activation(out)
+        return out
\ No newline at end of file
diff --git a/melodytalk/main.py b/melodytalk/main.py
index 2fcef05..cfea79a 100644
--- a/melodytalk/main.py
+++ b/melodytalk/main.py
@@ -120,7 +120,11 @@ def __init__(self):
             "Text2MusicWithTitle": "cuda:0",
             "AddNewTrack": "cuda:0",
             "MusicInpainting": "cuda:0",
-            "Variation": "cuda:0",}
+            "Variation": "cuda:0",
+            "PitchShifting": "cuda:0",
+            "TimeStretching": "cuda:0",
+            "SingleSoundEffect": "cuda:0",
+            }
        template_dict = None # { "Text2MusicwithChord": "cuda:0"} # "Accompaniment": "cuda:0",
 
        print(f"Initializing MelodyTalk, load_dict={load_dict}, template_dict={template_dict}")
@@ -227,19 +231,24 @@ def clear_input_audio(self):
 
 
 with gr.Blocks(css="#chatbot .overflow-y-auto{height:500px}") as demo:
-    gr.Markdown(
-        """This is a demo to our work *MelodyTalk*.
-        """
-    )
+    gr.Markdown("""
+    ## MelodyTalk
+    ### MelodyTalk is a ChatGPT-based interface for making music loops. All supported tools are listed below.
+    ### Usage:
+    ### Step 1: Describe the music loop you want to make. You can specify the genre, instruments, BPM and mood in your text.
+    ### Step 2: You can fine-tune the generated music loop using the existing tools.
+    """)
 
     lang = gr.Radio(choices=['Chinese', 'English'], value=None, label='Language')
     chatbot = gr.Chatbot(elem_id="chatbot", label="MelodyTalk")
     state = gr.State([])
 
     with gr.Row(visible=False) as input_raws:
-        with gr.Column(scale=0.7):
+        with gr.Column(scale=0.55):
             txt = gr.Textbox(show_label=False, placeholder="Enter text and press enter, or upload an audio").style(
                 container=False)
+        with gr.Column(scale=0.15, min_width=0):
+            undo = gr.Button("Undo")
         with gr.Column(scale=0.15, min_width=0):
             clear = gr.Button("Clear")
         with gr.Column(scale=0.15, min_width=0):
@@ -253,6 +262,32 @@ def clear_input_audio(self):
         with gr.Column(scale=0.15, min_width=0):
             rec_submit = gr.Button("Submit")
 
+
+    gr.Markdown(
+        """| Task | Stage | Examples of text input | Backend models |
+        | --- | --- | --- | --- |
+        | Text to music | 1 | Generate a rock music loop with guitar and drums. | MusicGen |
+        | Drum pattern to music | 1 | Generate rock music with guitar based on this drum pattern. | MusicGen, CLAP |
+        | Impression to music | 1 | Generate a music loop that feels like the chorus of "Hey Jude". | ChatGPT, MusicGen |
+        | Stylistic rearrangement | 1 | Rearrange this music audio into jazz with a saxophone solo. | MusicGen |
+        | Music variation | 1 | Generate a music loop that sounds like this music. | VampNet |
+        | Add a track | 2 | Add a saxophone solo to this music loop. | MusicGen, CLAP |
+        | Remove a track | 2 | Remove the guitar from this music loop. | Demucs |
+        | Re-generation/inpainting | 2 | Re-generate the 3-5s part of the music loop. | VampNet |
+        | Pitch shifting | 2 | Shift this music by 3 semitones. | torchaudio |
+        | Speed changing | 2 | Speed up this music by a factor of 1.2. | torchaudio |
+        | Add sound effects | 2 | Add some reverb to the guitar solo. | pedalboard, automix-tools |
+        | Music captioning | N/A | Describe the current music loop. | LP-MusicCaps |
+        | * Replace instrument (unavailable) | 2 | Replace the guitar solo with piano. | Transplayer, automix-tools |
+        | * Timbre adjustment (unavailable) | 2 | Make the drums sound more metallic. | ChatGPT, pedalboard, automix-tools |
+        """
+    )
+
+    gr.Markdown("""
+    Currently, MelodyTalk does not yet support music content style transfer, such as 'make this music more relaxed.' Please wait for our future work.
+    """)
+
+
     lang.change(bot.init_agent, [lang], [input_raws, lang, txt, clear, record_raws])
     txt.submit(bot.run_text, [txt, state], [chatbot, state])
     txt.submit(lambda: "", None, txt)
@@ -265,5 +300,5 @@ def clear_input_audio(self):
     clear.click(lambda: [], None, chatbot)
     clear.click(lambda: [], None, state)
     clear.click(bot.clear_input_audio, None, rec_audio)
-    demo.launch(server_name="0.0.0.0", server_port=7860,
+    demo.launch(server_name="0.0.0.0", server_port=7862,
                 ssl_certfile="cert.pem", ssl_keyfile="key.pem", ssl_verify=False)
diff --git a/melodytalk/modules.py b/melodytalk/modules.py
index 153b2a7..3898c04 100644
--- a/melodytalk/modules.py
+++ b/melodytalk/modules.py
@@ -1,7 +1,10 @@
 from shutil import copyfile
 from dataclasses import dataclass
 
+import librosa
 import torch
+import torchaudio.functional
+import pedalboard
 
 # text2music
 from melodytalk.dependencies.audiocraft.models import MusicGen
@@ -13,11 +16,13 @@
 # Vampnet
 from melodytalk.dependencies.vampnet.interface import Interface
 from melodytalk.dependencies.vampnet.main import vamp
+# captioning
+from melodytalk.dependencies.lpmc.music_captioning.captioning import main as captioning
 
 from utils import *
 
 DURATION = 8
-GENERATION_CANDIDATE = 5
+GENERATION_CANDIDATE = 6
 
 # Initialze common models
 # musicgen_model = MusicGen.get_pretrained('large')
@@ -34,13 +39,16 @@
 
 # Vampnet
 interface = Interface(
-    coarse_ckpt="./models/vampnet/coarse.pth",
-    coarse2fine_ckpt="./models/vampnet/c2f.pth",
-    codec_ckpt="./models/vampnet/codec.pth",
-    wavebeat_ckpt="./models/wavebeat.pth",
+    coarse_ckpt="./dependencies/vampnet/models/vampnet/coarse.pth",
+    coarse2fine_ckpt="./dependencies/vampnet/models/vampnet/c2f.pth",
+    codec_ckpt="./dependencies/vampnet/models/vampnet/codec.pth",
+    wavebeat_ckpt="./dependencies/vampnet/models/wavebeat.pth",
     device="cuda" if torch.cuda.is_available() else "cpu",
 )
 
+# captioning model
+
+
 @dataclass
 class GlobalAttributes(object):
     # metadata
@@ -114,16 +122,16 @@ def __init__(self, device):
 
     @prompts(
         name="Generate music from user input when the input is a title of music",
-        description="useful if you want to generate music which is silimar and save it to a file."
-                    "like: generate music of love pop song, or generate music with piano and violin."
-                    "The input to this tool should be a comma separated string of two, "
-                    "representing the text description and the title."
+        description="useful if you want to generate music that feels like a real, existing song."
+                    "like: generate music that feels like 'hey jude', or generate music similar to 'let it be'."
+                    "The input to this tool should be a string, "
+                    "representing the music title."
) def inference(self, inputs): - text, title = inputs.split(",")[0].strip(), inputs.split(",")[1].strip() + title = inputs music_filename = os.path.join("music", f"{title}.wav") - text = music_title_to_description(text) # using chatGPT's knowledge base to convert title to description + text = music_title_to_description(title) # using chatGPT's knowledge base to convert title to description attribute_table.descriptions = text text = description_to_attributes(text) # convert text to attributes wav = self.model.generate([text], progress=False) @@ -182,10 +190,10 @@ def inference(self, inputs): music_filename, text = inputs.split(",")[0].strip(), inputs.split(",")[1].strip() text = description_to_attributes(text) print(f"Generating music from text with drum condition, Input text: {text}, Drum: {music_filename}.") - updated_music_filename = get_new_audio_name(music_filename, func_name="with_drum") + updated_music_filename = get_new_audio_name(music_filename, func_name="withdrum") drum, sr = torchaudio.load(music_filename) self.model.set_generation_params(duration=35) - wav = self.model.generate_continuation(prompt=drum[None].expand(GENERATION_CANDIDATE, -1, -1), prompt_sr=sr, + wav = self.model.generate_continuation(prompt=drum[None].expand(GENERATION_CANDIDATE, -1, -1), prompt_sample_rate=sr, descriptions=[text] * GENERATION_CANDIDATE, progress=False) self.model.set_generation_params(duration=DURATION) # cut drum prompt @@ -229,6 +237,7 @@ def inference(self, inputs): # select the best one by CLAP scores print(f"CLAP post filter for {len(splitted_audios)} candidates.") best_wav, _ = CLAP_post_filter(CLAP_model, attribute_table.descriptions, splitted_audios.cuda(), self.model.sample_rate) + best_wav = torch.from_numpy(librosa.effects.trim(best_wav.cpu().numpy())[0]) audio_write(updated_music_filename[:-4], best_wav.cpu(), self.model.sample_rate, strategy="loudness", loudness_compressor=True) print(f"\nProcessed AddNewTrack, Output Music: {updated_music_filename}.") @@ -334,7 +343,12 @@ def __init__(self): ) def inference(self, inputs): - pass + music_filename = inputs.strip() + print(f"Captioning the current music, Input Music: {music_filename}.") + captions = captioning(music_filename) + captions_text = captions[0]["text"] + print(f"\nProcessed MusicCaptioning, Output Captions: {captions_text}.") + return captions_text # class Text2MusicwithChord(object): @@ -375,7 +389,59 @@ def inference(self, inputs): # print(f"\nProcessed Text2Music, Input Text: {preprocessed_input}, Output Music: {music_filename}.") # return music_filename +class PitchShifting(object): + def __init__(self, device): + print("Initializing PitchShifting") + self.device = device + + @prompts( + name="Shift the pitch of the given music.", + description="useful if you want to shift the pitch of a music." + "Like: shift the pitch of this music by 3 semitones." + "The input to this tool should be a comma separated string of two, " + "representing the music_filename and the pitch shift value." 
+ ) + def inference(self, inputs): + music_filename, pitch_shift_value = inputs.split(",")[0].strip(), int(inputs.split(",")[1].strip()) + print(f"Shifting the pitch of the given music, Input Music: {music_filename}, Pitch Shift Value: {pitch_shift_value}.") + updated_music_filename = get_new_audio_name(music_filename, func_name="pitchshifting") + # load + wav, sr = torchaudio.load(music_filename) + # shift + wav = torchaudio.functional.pitch_shift(wav, sr, pitch_shift_value) + # write + audio_write(updated_music_filename[:-4], + wav.cpu(), sr, strategy="loudness", loudness_compressor=True) + print(f"\nProcessed PitchShifting, Output Music: {updated_music_filename}.") + return updated_music_filename + +class TimeStretching(object): + def __init__(self, device): + print("Initializing TimeStretching") + self.device = device + + @prompts( + name="Stretch the time of the given music.", + description="useful if you want to stretch the time of a music." + "Like: stretch the time of this music by 1.5." + "The input to this tool should be a comma separated string of two, " + "representing the music_filename and the time stretch value." + ) + + def inference(self, inputs): + music_filename, time_stretch_value = inputs.split(",")[0].strip(), float(inputs.split(",")[1].strip()) + print(f"Stretching the time of the given music, Input Music: {music_filename}, Time Stretch Value: {time_stretch_value}.") + updated_music_filename = get_new_audio_name(music_filename, func_name="timestretching") + # load + wav, sr = torchaudio.load(music_filename) + # stretch + wav = torchaudio.functional.speed(wav, sr, time_stretch_value)[0] + # write + audio_write(updated_music_filename[:-4], + wav.cpu(), sr, strategy="loudness", loudness_compressor=True) + print(f"\nProcessed TimeStretching, Output Music: {updated_music_filename}.") + return updated_music_filename class MusicInpainting(object): def __init__(self, device): @@ -394,7 +460,7 @@ def __init__(self, device): def inference(self, inputs): music_filename, start_time, end_time = inputs.split(",")[0].strip(), inputs.split(",")[1].strip(), inputs.split(",")[2].strip() print(f"Inpainting a specific time region of the given music, Input Music: {music_filename}, Start Time: {start_time}, End Time: {end_time}.") - updated_music_filename = get_new_audio_name(music_filename, func_name="inpainting_" + start_time + "_" + end_time) + updated_music_filename = get_new_audio_name(music_filename, func_name="inpainting") p_track, sr = torchaudio.load(music_filename) audio_length_in_second = p_track.shape[-1] / sr if float(end_time) > audio_length_in_second: @@ -434,34 +500,63 @@ def inference(self, inputs): print(f"\nProcessed Variation, Output Music: {updated_music_filename}.") return updated_music_filename -# class Accompaniment(object): -# template_model = True -# def __init__(self, Text2MusicWithMelody, ExtractTrack, SimpleTracksMixing): -# print("Initializing Accompaniment") -# self.Text2MusicWithMelody = Text2MusicWithMelody -# self.ExtractTrack = ExtractTrack -# self.SimpleTracksMixing = SimpleTracksMixing +class SingleSoundEffect(object): + def __init__(self, device): + print("Initializing SingleSoundEffect") + self.device = device + self.interface = interface + + @prompts( + name="Add a single sound effect to the given music.", + description="useful if you want to add a single sound effect, like reverb, high pass filter or chorus to the given music." + "like: add a reverb of recording studio to this music." 
+ "The input to this tool should be a comma separated string of two, " + "representing the music_filename and the original user message." + ) + + def inference(self, inputs): + music_filename, user_message = inputs.split(",")[0].strip(), inputs.split(",")[1].strip() + print(f"Add a single sound effect to the given music, Input Music: {music_filename}, Sound Effect Name: {user_message}.") + updated_music_filename = get_new_audio_name(music_filename, func_name="single_sound_effect") + sound_effect = add_single_sound_effect(user_message) + my_pedalboard = pedalboard.Pedalboard() + my_pedalboard.append(eval(sound_effect)) + input_audio, sr = torchaudio.load(music_filename) + output_audio = my_pedalboard(input_audio.numpy(), sample_rate=sr) + audio_write(updated_music_filename[:-4], + output_audio, sr, strategy="loudness", loudness_compressor=True) + print(f"\nProcessed SingleSoundEffect, Output Music: {updated_music_filename}.") + return updated_music_filename + + +# class TimbreTransfer(object): +# def __init__(self, device): +# print("Initializing TimbreTransfer") +# self.device = device +# self.interface = interface # -# @prompts( -# name="Generate accompaniment music from user input text, keeping the given melody or track", -# description="useful if you want to style transfer or remix music from a user input text with a given melody." -# "Unlike Text2MusicWithMelody, this tool will keep the given melody track instead of re-generate it." -# "Note that the user must assign a track (it must be one of `vocals`, `drums`, `bass`, `guitar`, `piano` or `other`) to keep." -# "like: keep the guitar track and remix the given music with text description, " -# "or generate accompaniment as text described with the given vocal track." -# "The input to this tool should be a comma separated string of three, " -# "representing the music_filename, track name, and the text description." -# ) +# @prompts( +# name="Transfer the timbre of the given music to another music.", +# description="useful if you want to transfer the timbre of the given music to another music." +# "like: transfer the timbre of this music to another music." +# "The input to this tool should be a comma separated string of two, " +# "representing the music_filename and the original user message." 
+# ) # # def inference(self, inputs): -# music_filename, track_name, text = inputs.split(",")[0].strip(), inputs.split(",")[1].strip(), inputs.split(",")[2].strip() -# print(f"Generating music from text with accompaniment condition, Input Text: {text}, Previous music: {music_filename}, Track: {track_name}.") -# # separate the track -# updated_main_track = self.ExtractTrack.inference(f"{music_filename}, {track_name}, extract") -# # generate music -# updated_new_music = self.Text2MusicWithMelody.inference(f"{updated_main_track}, {text}") -# # remove the track in accompaniment -# updated_accompaniment = self.ExtractTrack.inference(f"{updated_new_music}, {track_name}, remove") -# # mix -# updated_music_filename = self.SimpleTracksMixing.inference(f"{updated_main_track}, {updated_accompaniment}") -# return updated_music_filename \ No newline at end of file +# music_filename, user_message = inputs.split(",")[0].strip(), inputs.split(",")[1].strip() +# print(f"Transfer the timbre of the given music to another music, Input Music: {music_filename}, Target Music: {user_message}.") +# updated_music_filename = get_new_audio_name(music_filename, func_name="timbre_transfer") +# target_music_filename = get_new_audio_name(user_message, func_name="timbre_transfer") +# # load +# wav, sr = torchaudio.load(music_filename) +# target_wav, target_sr = torchaudio.load(user_message) +# # stretch +# wav = torchaudio.functional.time_stretch(wav, sr, target_sr/sr)[0] +# # write +# audio_write(updated_music_filename[:-4], +# wav.cpu(), sr, strategy="loudness", loudness_compressor=True) +# audio_write(target_music_filename[:-4], +# target_wav.cpu(), target_sr, strategy="loudness", loudness_compressor=True) +# print(f"\nProcessed TimbreTransfer, Output Music: {updated_music_filename}.") +# return updated_music_filename diff --git a/melodytalk/utils.py b/melodytalk/utils.py index 7e9b6af..8f74947 100644 --- a/melodytalk/utils.py +++ b/melodytalk/utils.py @@ -301,5 +301,43 @@ def split_audio_tensor_by_downbeats(input_audio_batch: torch.Tensor, sr: int = 3 return segments +def add_single_sound_effect(input: str) -> str: + openai_prompt = f"""You are asked to pick the most appropriate one of the APIs below to achieve the desired sound effects. You MUST loyally only assign the existing parameters to fine-tune the function. If you use the default param value, skip it. + + 1. Guitar-style effects: + - Chorus(rate_hz: float = 1.0, depth: float = 0.25, centre_delay_ms: float = 7.0, feedback: float = 0.0, mix: float = 0.5); + - Distortion(drive_db: float = 25); + - Phaser(rate_hz: float = 1.0, depth: float = 0.5, centre_frequency_hz: float = 1300.0, feedback: float = 0.0, mix: float = 0.5); + - Clipping(threshold_db: float = -6.0); + 2. Loudness and dynamic range effects: + - Compressor(threshold_db: float = 0, ratio: float = 1, attack_ms: float = 1.0, release_ms: float = 100); + - Gain(gain_db: float = 1.0); + - Limiter(threshold_db: float = -10.0, release_ms: float = 100.0); + 3. Equalizers and filters: + - HighpassFilter(cutoff_frequency_hz: float = 50); + - LadderFilter(mode: Mode = Mode.LPF12, cutoff_hz: float = 200, resonance: float = 0, drive: float = 1.0); + - LowpassFilter(cutoff_frequency_hz: float = 50); + 4. 
Spatial effects: + - Convolution(impulse_response_filename: str, mix: float = 1.0); + - Delay(delay_seconds: float = 0.5, feedback: float = 0.0, mix: float = 0.5); + - Reverb(room_size: float = 0.5, damping: float = 0.5, wet_level: float = 0.33, dry_level: float = 0.4, width: float = 1.0, freeze_mode: float = 0.0); + + Let us think step by step. + + Q: I want to use a 200hz highpass filter to this audio. + A: pedalboard.HighpassFilter(cutoff_frequency_hz=200); + + Q: {input}. + A: """ + response = openai.Completion.create( + model="text-davinci-003", + prompt=openai_prompt, + temperature=0, + max_tokens=100, + top_p=1, + frequency_penalty=0.0, + presence_penalty=0.0, + ) + return response.choices[0].text \ No newline at end of file
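
A minimal usage sketch (not part of the patch) of how the effect expression returned by add_single_sound_effect() is applied, mirroring SingleSoundEffect.inference() in melodytalk/modules.py. The soundfile dependency, the function name apply_effect_string, and the file paths are illustrative assumptions:

    import pedalboard
    import soundfile as sf

    def apply_effect_string(in_path: str, out_path: str,
                            effect_str: str = "pedalboard.Reverb(room_size=0.5)") -> None:
        # Load audio as float32; soundfile returns (num_frames, num_channels).
        audio, sr = sf.read(in_path, dtype="float32")
        board = pedalboard.Pedalboard()
        # effect_str is the Python expression produced by the LLM prompt,
        # e.g. "pedalboard.HighpassFilter(cutoff_frequency_hz=200)".
        board.append(eval(effect_str))
        # pedalboard processes (channels, samples), so transpose in and out.
        processed = board(audio.T, sample_rate=sr)
        sf.write(out_path, processed.T, sr)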