
Commit

update
ldzhangyx committed Aug 31, 2023
1 parent 18329ba commit 087b752
Showing 5 changed files with 59 additions and 87 deletions.
Binary file modified .DS_Store
63 changes: 16 additions & 47 deletions melodytalk/dependencies/transplayer/inference.py
@@ -1,17 +1,6 @@
import librosa
import librosa as lr
import resampy

def transform(filepath):
audio, sr = librosa.load(filepath)
if sr != 16000:
audio = resampy.resample(audio, sr, 16000)
cqt_representation = lr.cqt(audio, sr=sr, hop_length=256)

cqt_magnitude = np.abs(cqt_representation)


import os
import argparse
import torch
import numpy as np
from math import ceil
@@ -26,11 +15,11 @@ def pad_seq(x, base=32):
assert len_pad >= 0
return np.pad(x, ((0, len_pad), (0, 0)), 'constant'), len_pad
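
To make the contract concrete, here is a small self-contained sketch of pad_seq: it right-pads the time axis of a (time, features) array to the next multiple of base and returns the pad length so it can be trimmed after inference. The len_pad line sits in the collapsed hunk above, so its reconstruction here from the ceil import is an assumption.

import numpy as np
from math import ceil

def pad_seq(x, base=32):
    # Right-pad the time axis up to the next multiple of `base`
    len_pad = ceil(x.shape[0] / base) * base - x.shape[0]
    assert len_pad >= 0
    return np.pad(x, ((0, len_pad), (0, 0)), 'constant'), len_pad

x = np.ones((100, 84))          # 100 CQT frames, 84 bins
padded, len_pad = pad_seq(x)    # padded.shape == (128, 84), len_pad == 28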


def inference(input_file_path,
output_file_path,
org='piano', trg='piano',
cp_path=None):
org='piano', trg='guitar',
feature_len=2400,
cp_path='weights.pth'):
G = Generator(dim_neck=32,
dim_emb=4,
dim_pre=512,
@@ -39,58 +28,38 @@ def inference(input_file_path,
save_info = torch.load(cp_path)
G.load_state_dict(save_info["model"])
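# Assumed checkpoint format, inferred from the load above: a dict saved as
# torch.save({"model": G.state_dict()}, cp_path)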

# process input
audio, sr = lr.load(input_file_path)
if sr != 16000:
    audio = resampy.resample(audio, sr, 16000)
    sr = 16000  # keep sr in sync with the resampled audio
cqt_representation = lr.cqt(audio, sr=sr, hop_length=256)
cqt_magnitude = np.abs(cqt_representation)

# one-hot
ins_list = ['harp', 'trumpet', 'epiano', 'viola', 'piano', 'guitar', 'organ', 'flute']
ins_org = org
ins_trg = trg
emb_org = ins_list.index(ins_org)
emb_trg = ins_list.index(ins_trg)
# emb_org = [i == ins_org for i in ins_list]
# emb_trg = [i == ins_trg for i in ins_list]
emb_org = torch.unsqueeze(torch.tensor(emb_org), dim=0).to(device)
emb_trg = torch.unsqueeze(torch.tensor(emb_trg), dim=0).to(device)
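# emb_org / emb_trg are integer instrument ids with shape (1,); the Generator
# is assumed to embed them internally (the one-hot variant is kept commented above)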

x_org = np.log(np.load(config.feature_path).T)[:config.feature_len]
x_org = np.log(cqt_magnitude.T)[:feature_len]
# x_org = np.load(config.spectrogram_path).T
x_org, len_pad = pad_seq(x_org)
x_org = torch.from_numpy(x_org[np.newaxis, :, :]).to(device)
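# Shape contract assumed at this point: lr.cqt gives (n_bins, T); after the
# log/transpose and pad_seq, x_org is (1, T_padded, n_bins) on `device`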

with torch.no_grad():
_, x_identic_psnt, _ = G(x_org, emb_org, emb_org)
if len_pad == 0:
x_trg = x_identic_psnt[0, 0, :, :].cpu().numpy()
else:
x_trg = x_identic_psnt[0, 0, :-len_pad, :].cpu().numpy()

np.save(os.path.basename(config.feature_path)[:-4] + "_" + ins_org + "_" + ins_org + ".npy", x_trg.T)
print("result saved.")

with torch.no_grad():
_, x_identic_psnt, _ = G(x_org, emb_org, emb_trg)
if len_pad == 0:
x_trg = x_identic_psnt[0, 0, :, :].cpu().numpy()
else:
x_trg = x_identic_psnt[0, 0, :-len_pad, :].cpu().numpy()

np.save(os.path.basename(config.feature_path)[:-4] + "_" + ins_org + "_" + ins_trg + ".npy", x_trg.T)
print("result saved.")
# save output: x_trg is log-magnitude CQT with shape (time, bins), so transpose
# back and undo the log before inverting
waveform = lr.icqt(np.exp(x_trg.T), sr=sr, hop_length=256)
lr.output.write_wav(output_file_path, waveform, sr)


if __name__ == '__main__':
parser = argparse.ArgumentParser()

# Model configuration.
parser.add_argument('--lambda_cd', type=float, default=0, help='weight for hidden code loss')
# Training configuration.
parser.add_argument('--feature_path', type=str, default='../../data_syn/cropped/piano_all_00.wav_cqt.npy')
parser.add_argument('--feature_len', type=int, default=2400)
# parser.add_argument('--num_iters', type=int, default=1000000, help='number of total iterations')
# parser.add_argument('--len_crop', type=int, default=128, help='dataloader output sequence length')

# Miscellaneous.
parser.add_argument('--cp_path', type=str,
default="../../autovc_cp/weights_log_cqt_down32_neck32_onehot4_withcross")

config = parser.parse_args()
print(config)
inference(config)
inference('/home/intern-2023-02/melodytalk/melodytalk/music/2d2c_piano_2333801f_2333801f.wav', 'output.wav')
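
For reference, a minimal usage sketch of the rewritten entry point (the file paths and checkpoint name here are illustrative, not from the repository):

from melodytalk.dependencies.transplayer.inference import inference

# Convert a piano recording into a guitar rendition of the same material
inference("input_piano.wav", "output_guitar.wav",
          org="piano", trg="guitar",
          feature_len=2400, cp_path="weights.pth")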
15 changes: 9 additions & 6 deletions melodytalk/main.py
@@ -118,7 +118,6 @@ def __init__(self):
"ReArrangement": "cuda:0",
"Text2MusicWithDrum": "cuda:0",
"Text2MusicWithTitle": "cuda:0",
"AddNewTrack": "cuda:0",
"MusicInpainting": "cuda:0",
"Variation": "cuda:0",
"PitchShifting": "cuda:0",
@@ -193,6 +192,13 @@ def run_text(self, text, state):
# f"Current Memory: {self.agent.memory.buffer}")
return state, state

def redo(self, state):
    # Drop the last exchange from the UI state, recover the latest user turn
    # from the agent memory, then replay it through run_text
    state = state[:-1]
    text = self.agent.memory.chat_memory.messages[-2].content
    self.agent.memory.chat_memory.messages = self.agent.memory.chat_memory.messages[:-2]
    return self.run_text(text, state)
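
# Memory layout assumed by redo() (LangChain schema; buffer contents illustrative):
#   chat_memory.messages == [..., HumanMessage("add drums"), AIMessage("saved to x.wav")]
# messages[-2] is the latest user turn; dropping the last two messages removes
# exactly one exchange before run_text() replays it.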

def run_audio(self, file, state, txt, lang):
music_filename = os.path.join('music', str(uuid.uuid4())[0:8] + ".wav")
print("Inputs:", file, state)
@@ -248,7 +254,7 @@ def clear_input_audio(self):
txt = gr.Textbox(show_label=False, placeholder="Enter text and press enter, or upload an audio").style(
container=False)
with gr.Column(scale=0.15, min_width=0):
undo = gr.Button("Undo")
redo = gr.Button("Redo")
with gr.Column(scale=0.15, min_width=0):
clear = gr.Button("Clear")
with gr.Column(scale=0.15, min_width=0):
@@ -271,15 +277,12 @@ def clear_input_audio(self):
| Impression to music | 1 | Generate a music loop that feels like "Hey Jude"'s choral part. | ChatGPT, MusicGen |
| Stylistic rearrangement | 1 | Rearrange this music audio as jazz with a saxophone solo. | MusicGen |
| Music variation | 1 | Generate a music loop that sounds like this music. | VampNet |
| Add a track | 2 | Add a saxophone solo to this music loop. | MusicGen, CLAP |
| Remove a track | 2 | Remove the guitar from this music loop. | Demucs |
| Re-generation/inpainting | 2 | Re-generate the 3-5 s section of the music loop. | VampNet |
| Pitch shifting | 2 | Shift this music by 3 semitones. | pedalboard |
| Speed changing | 2 | Speed this music up by a factor of 1.2. | torchaudio |
| Add sound effects | 2 | Add some reverb to the guitar solo. | pedalboard, automix-tools |
| Music captioning | N/A | Describe the current music loop. | LP-MusicCaps |
| * Replace instrument (unavailable) | 2 | Replace the guitar solo with piano. | Transplayer, automix-tools |
| * Timbre adjustment (unavailable) | 2 | Make the drums sound more metallic. | ChatGPT, pedalboard, automix-tools |
"""
)

@@ -292,7 +295,7 @@ def clear_input_audio(self):
txt.submit(bot.run_text, [txt, state], [chatbot, state])
txt.submit(lambda: "", None, txt)
btn.upload(bot.run_audio, [btn, state, txt, lang], [chatbot, state])

redo.click(bot.redo, [state], [chatbot, state])
rec_submit.click(bot.run_recording, [rec_audio, state, txt, lang], [chatbot, state])
rec_clear.click(bot.clear_input_audio, None, rec_audio)

66 changes: 33 additions & 33 deletions melodytalk/modules.py
@@ -517,46 +517,46 @@ def __init__(self, device):
def inference(self, inputs):
music_filename, user_message = inputs.split(",")[0].strip(), inputs.split(",")[1].strip()
print(f"Add a single sound effect to the given music, Input Music: {music_filename}, Sound Effect Name: {user_message}.")
updated_music_filename = get_new_audio_name(music_filename, func_name="single_sound_effect")
updated_music_filename = get_new_audio_name(music_filename, func_name="singlesoundeffect")
sound_effect = add_single_sound_effect(user_message)
my_pedalboard = pedalboard.Pedalboard()
my_pedalboard.append(eval(sound_effect))
input_audio, sr = torchaudio.load(music_filename)
output_audio = my_pedalboard(input_audio.numpy(), sample_rate=sr)
audio_write(updated_music_filename[:-4],
output_audio, sr, strategy="loudness", loudness_compressor=True)
torch.from_numpy(output_audio), sr, strategy="loudness", loudness_compressor=True)
print(f"\nProcessed SingleSoundEffect, Output Music: {updated_music_filename}.")
return updated_music_filename
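
For context, a standalone sketch of the pedalboard pattern this tool builds on (the effect and its parameters are illustrative):

import numpy as np
import pedalboard

board = pedalboard.Pedalboard()
board.append(pedalboard.HighpassFilter(cutoff_frequency_hz=200))

audio = np.zeros((2, 44100), dtype=np.float32)  # one second of stereo silence
processed = board(audio, sample_rate=44100)     # same shape out: (2, 44100)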


# class TimbreTransfer(object):
# def __init__(self, device):
# print("Initializing TimbreTransfer")
# self.device = device
# self.interface = interface
#
# @prompts(
# name="Transfer the timbre of the given music to another music.",
# description="useful if you want to transfer the timbre of the given music to another music."
# "like: transfer the timbre of this music to another music."
# "The input to this tool should be a comma separated string of two, "
# "representing the music_filename and the original user message."
# )
#
# def inference(self, inputs):
# music_filename, user_message = inputs.split(",")[0].strip(), inputs.split(",")[1].strip()
# print(f"Transfer the timbre of the given music to another music, Input Music: {music_filename}, Target Music: {user_message}.")
# updated_music_filename = get_new_audio_name(music_filename, func_name="timbre_transfer")
# target_music_filename = get_new_audio_name(user_message, func_name="timbre_transfer")
# # load
# wav, sr = torchaudio.load(music_filename)
# target_wav, target_sr = torchaudio.load(user_message)
# # stretch
# wav = torchaudio.functional.time_stretch(wav, sr, target_sr/sr)[0]
# # write
# audio_write(updated_music_filename[:-4],
# wav.cpu(), sr, strategy="loudness", loudness_compressor=True)
# audio_write(target_music_filename[:-4],
# target_wav.cpu(), target_sr, strategy="loudness", loudness_compressor=True)
# print(f"\nProcessed TimbreTransfer, Output Music: {updated_music_filename}.")
# return updated_music_filename
class TimbreTransfer(object):
def __init__(self, device):
print("Initializing TimbreTransfer")
self.device = device
self.interface = interface

@prompts(
name="Transfer the timbre of the given music to another music.",
description="useful if you want to transfer the timbre of the given music to another music."
"like: transfer the timbre of this music to another music."
"The input to this tool should be a comma separated string of two, "
"representing the music_filename and the original user message."
)

def inference(self, inputs):
music_filename, user_message = inputs.split(",")[0].strip(), inputs.split(",")[1].strip()
print(f"Transfer the timbre of the given music to another music, Input Music: {music_filename}, Target Music: {user_message}.")
updated_music_filename = get_new_audio_name(music_filename, func_name="timbre_transfer")
target_music_filename = get_new_audio_name(user_message, func_name="timbre_transfer")
# load
wav, sr = torchaudio.load(music_filename)
target_wav, target_sr = torchaudio.load(user_message)
# stretch
wav = torchaudio.functional.time_stretch(wav, sr, target_sr/sr)[0]
# write
audio_write(updated_music_filename[:-4],
wav.cpu(), sr, strategy="loudness", loudness_compressor=True)
audio_write(target_music_filename[:-4],
target_wav.cpu(), target_sr, strategy="loudness", loudness_compressor=True)
print(f"\nProcessed TimbreTransfer, Output Music: {updated_music_filename}.")
return updated_music_filename
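
Since this commit also reworks transplayer/inference.py, one plausible follow-up is to route this tool through that model rather than time-stretching; the wrapper below is a hypothetical sketch (the helper name, instrument arguments, and checkpoint path are all assumptions):

from melodytalk.dependencies.transplayer.inference import inference as transplayer_infer

def transfer_timbre(music_filename: str, output_filename: str,
                    source_ins: str = "piano", target_ins: str = "guitar",
                    cp_path: str = "weights.pth") -> str:
    # Delegate the conversion to the transplayer Generator checkpoint
    transplayer_infer(music_filename, output_filename,
                      org=source_ins, trg=target_ins, cp_path=cp_path)
    return output_filename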
2 changes: 1 addition & 1 deletion melodytalk/utils.py
@@ -325,7 +325,7 @@ def add_single_sound_effect(input: str) -> str:
Let us think step by step.
Q: I want to apply a 200 Hz highpass filter to this audio.
A: pedalboard.HighpassFilter(cutoff_frequency_hz=200);
A: pedalboard.HighpassFilter(cutoff_frequency_hz=200)
Q: {input}.
A: """
