From 087b752bf33c8cef5821d051d453d167ddfaff5a Mon Sep 17 00:00:00 2001
From: Yixiao Zhang
Date: Thu, 31 Aug 2023 17:37:15 +0900
Subject: [PATCH] update

---
 .DS_Store                                 | Bin 10244 -> 10244 bytes
 .../dependencies/transplayer/inference.py |  64 +++++------------
 melodytalk/main.py                        |  15 ++--
 melodytalk/modules.py                     |  66 +++++++++---------
 melodytalk/utils.py                       |   2 +-
 5 files changed, 60 insertions(+), 87 deletions(-)

diff --git a/.DS_Store b/.DS_Store
index 88ec8f8f707fbbb2911ce79d2653bc0cfc3b5b79..bd1d07a3693cd0d28df7a8d4376f39ef38e309b8 100644
GIT binary patch
delta 59
wcmZn(XbIRbSAgBjKu5vI)N-=FgyZI?0$FSv#t0Gn&C|qp@}r1Mo+h3L01M&~6aWAK

delta 59
zcmZn(XbIRbSAgBZKu5vEz<9F1gyZI?0$FSvrbZwUV@sfj{pM-nJNZ#WCQlR30{{wI
B5eWbQ

diff --git a/melodytalk/dependencies/transplayer/inference.py b/melodytalk/dependencies/transplayer/inference.py
index 8d6e120..6c73635 100644
--- a/melodytalk/dependencies/transplayer/inference.py
+++ b/melodytalk/dependencies/transplayer/inference.py
@@ -1,17 +1,7 @@
-import librosa
+import librosa as lr
+import soundfile as sf
 import resampy
-
-def transform(filepath):
-    audio, sr = librosa.load(filepath)
-    if sr != 16000:
-        audio = resampy.resample(audio, sr, 16000)
-    cqt_representation = lr.cqt(audio, sr=sr, hop_length=256)
-
-    cqt_magnitude = np.abs(cqt_representation)
-
-
 import os
-import argparse
 import torch
 import numpy as np
 from math import ceil
@@ -26,11 +16,11 @@ def pad_seq(x, base=32):
     assert len_pad >= 0
     return np.pad(x, ((0, len_pad), (0, 0)), 'constant'), len_pad
 
-
 def inference(input_file_path,
               output_file_path,
-              org='piano', trg='piano',
-              cp_path=None):
+              org='piano', trg='guitar',
+              feature_len=2400,
+              cp_path='weights.pth'):
     G = Generator(dim_neck=32,
                   dim_emb=4,
                   dim_pre=512,
@@ -39,32 +29,27 @@ def inference(input_file_path,
     save_info = torch.load(cp_path)
     G.load_state_dict(save_info["model"])
 
+    # process input: load the audio, resample to 16 kHz, and take the CQT magnitude
+    audio, sr = lr.load(input_file_path)
+    if sr != 16000:
+        audio, sr = resampy.resample(audio, sr, 16000), 16000  # keep sr consistent with the resampled audio
+    cqt_representation = lr.cqt(audio, sr=sr, hop_length=256)
+    cqt_magnitude = np.abs(cqt_representation)
+
     # one-hot
     ins_list = ['harp', 'trumpet', 'epiano', 'viola', 'piano', 'guitar', 'organ', 'flute']
     ins_org = org
     ins_trg = trg
     emb_org = ins_list.index(ins_org)
     emb_trg = ins_list.index(ins_trg)
-    # emb_org = [i == ins_org for i in ins_list]
-    # emb_trg = [i == ins_trg for i in ins_list]
     emb_org = torch.unsqueeze(torch.tensor(emb_org), dim=0).to(device)
     emb_trg = torch.unsqueeze(torch.tensor(emb_trg), dim=0).to(device)
 
-    x_org = np.log(np.load(config.feature_path).T)[:config.feature_len]
+    x_org = np.log(cqt_magnitude.T)[:feature_len]
     # x_org = np.load(config.spectrogram_path).T
     x_org, len_pad = pad_seq(x_org)
     x_org = torch.from_numpy(x_org[np.newaxis, :, :]).to(device)
 
-    with torch.no_grad():
-        _, x_identic_psnt, _ = G(x_org, emb_org, emb_org)
-        if len_pad == 0:
-            x_trg = x_identic_psnt[0, 0, :, :].cpu().numpy()
-        else:
-            x_trg = x_identic_psnt[0, 0, :-len_pad, :].cpu().numpy()
-
-    np.save(os.path.basename(config.feature_path)[:-4] + "_" + ins_org + "_" + ins_org + ".npy", x_trg.T)
-    print("result saved.")
-
     with torch.no_grad():
         _, x_identic_psnt, _ = G(x_org, emb_org, emb_trg)
         if len_pad == 0:
@@ -72,25 +57,10 @@ def inference(input_file_path,
         else:
             x_trg = x_identic_psnt[0, 0, :-len_pad, :].cpu().numpy()
 
-    np.save(os.path.basename(config.feature_path)[:-4] + "_" + ins_org + "_" + ins_trg + ".npy", x_trg.T)
-    print("result saved.")
+    # save output (soundfile is used since librosa.output was removed in librosa 0.8)
+    waveform = lr.icqt(np.exp(x_trg).T, sr=sr, hop_length=256)  # undo the log and transpose back to (bins, frames)
+    sf.write(output_file_path, waveform, sr)
 
 
 if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-
-    # Model configuration.
-    parser.add_argument('--lambda_cd', type=float, default=0, help='weight for hidden code loss')
-    # Training configuration.
-    parser.add_argument('--feature_path', type=str, default='../../data_syn/cropped/piano_all_00.wav_cqt.npy')
-    parser.add_argument('--feature_len', type=int, default=2400)
-    # parser.add_argument('--num_iters', type=int, default=1000000, help='number of total iterations')
-    # parser.add_argument('--len_crop', type=int, default=128, help='dataloader output sequence length')
-
-    # Miscellaneous.
-    parser.add_argument('--cp_path', type=str,
-                        default="../../autovc_cp/weights_log_cqt_down32_neck32_onehot4_withcross")
-
-    config = parser.parse_args()
-    print(config)
-    inference(config)
\ No newline at end of file
+    inference('/home/intern-2023-02/melodytalk/melodytalk/music/2d2c_piano_2333801f_2333801f.wav', 'output.wav')
\ No newline at end of file
diff --git a/melodytalk/main.py b/melodytalk/main.py
index cfea79a..47b28a5 100644
--- a/melodytalk/main.py
+++ b/melodytalk/main.py
@@ -118,7 +118,6 @@ def __init__(self):
             "ReArrangement": "cuda:0",
             "Text2MusicWithDrum": "cuda:0",
             "Text2MusicWithTitle": "cuda:0",
-            "AddNewTrack": "cuda:0",
             "MusicInpainting": "cuda:0",
             "Variation": "cuda:0",
             "PitchShifting": "cuda:0",
@@ -193,6 +192,13 @@ def run_text(self, text, state):
         #       f"Current Memory: {self.agent.memory.buffer}")
         return state, state
 
+    def redo(self, state):
+        # drop the last exchange from the chat state and the agent memory, then re-run the last user message
+        state = state[:-1]
+        text = self.agent.memory.chat_memory.messages[-2].content
+        self.agent.memory.chat_memory.messages = self.agent.memory.chat_memory.messages[:-2]
+        return self.run_text(text, state)
+
     def run_audio(self, file, state, txt, lang):
         music_filename = os.path.join('music', str(uuid.uuid4())[0:8] + ".wav")
         print("Inputs:", file, state)
@@ -248,7 +254,7 @@ def clear_input_audio(self):
                 txt = gr.Textbox(show_label=False, placeholder="Enter text and press enter, or upload an audio").style(
                     container=False)
             with gr.Column(scale=0.15, min_width=0):
-                undo = gr.Button("Undo")
+                redo = gr.Button("Redo")
             with gr.Column(scale=0.15, min_width=0):
                 clear = gr.Button("Clear")
             with gr.Column(scale=0.15, min_width=0):
@@ -271,15 +277,12 @@ def clear_input_audio(self):
             | Impression to music | 1 | Generate a music loop feels like "Hey Jude"'s choral part. | ChatGPT, MusicGen |
             | Stylistic rearrangement | 1 | Rearrange this music audio to jazz with saxophone solo. | MusicGen |
             | Music variation | 1 | Generate a music loop sounds like this music. | VampNet |
-            | Add a track | 2 | Add a saxophone solo to this music loop. | MusicGen, CLAP |
             | Remove a track | 2 | Remove the guitar from this music loop. | Demucs |
             | Re-generation/inpainting | 2 | Re-generate the 3-5s part of the music loop. | VampNet |
             | Pitch shifting | 2 | Shift this music by 3 semitone. | pedalboard |
             | Speed changing | 2 | Speed up this music by 1.2. | torchaudio |
             | Add sound effects| 2 | Add some reverb to the guitar solo. | pedalboard, automix-tools |
             | Music captioning | N/A | Describe the current music loop. | LP-MusicCaps |
-            | * Replace instrument (unavailable) | 2 | Replace the guitar solo by piano. | Transplayer, automix-tools |
-            | * Timbre adjustment (unavailable) | 2 | Make the drum to sound more metallic. | ChatGPT, pedalboard, automix-tools |
             """
         )
 
@@ -292,7 +295,7 @@ def clear_input_audio(self):
         txt.submit(bot.run_text, [txt, state], [chatbot, state])
         txt.submit(lambda: "", None, txt)
         btn.upload(bot.run_audio, [btn, state, txt, lang], [chatbot, state])
-
+        redo.click(bot.redo, [state], [chatbot, state])
         rec_submit.click(bot.run_recording, [rec_audio, state, txt, lang], [chatbot, state])
         rec_clear.click(bot.clear_input_audio, None, rec_audio)
 
diff --git a/melodytalk/modules.py b/melodytalk/modules.py
index 3898c04..a7d818b 100644
--- a/melodytalk/modules.py
+++ b/melodytalk/modules.py
@@ -517,46 +517,46 @@ def __init__(self, device):
     def inference(self, inputs):
         music_filename, user_message = inputs.split(",")[0].strip(), inputs.split(",")[1].strip()
         print(f"Add a single sound effect to the given music, Input Music: {music_filename}, Sound Effect Name: {user_message}.")
-        updated_music_filename = get_new_audio_name(music_filename, func_name="single_sound_effect")
+        updated_music_filename = get_new_audio_name(music_filename, func_name="singlesoundeffect")
         sound_effect = add_single_sound_effect(user_message)
         my_pedalboard = pedalboard.Pedalboard()
         my_pedalboard.append(eval(sound_effect))
         input_audio, sr = torchaudio.load(music_filename)
         output_audio = my_pedalboard(input_audio.numpy(), sample_rate=sr)
         audio_write(updated_music_filename[:-4],
-                    output_audio, sr, strategy="loudness", loudness_compressor=True)
+                    torch.from_numpy(output_audio), sr, strategy="loudness", loudness_compressor=True)
         print(f"\nProcessed SingleSoundEffect, Output Music: {updated_music_filename}.")
         return updated_music_filename
 
 
-# class TimbreTransfer(object):
-#     def __init__(self, device):
-#         print("Initializing TimbreTransfer")
-#         self.device = device
-#         self.interface = interface
-#
-#     @prompts(
-#         name="Transfer the timbre of the given music to another music.",
-#         description="useful if you want to transfer the timbre of the given music to another music."
-#                     "like: transfer the timbre of this music to another music."
-#                     "The input to this tool should be a comma separated string of two, "
-#                     "representing the music_filename and the original user message."
-#     )
-#
-#     def inference(self, inputs):
-#         music_filename, user_message = inputs.split(",")[0].strip(), inputs.split(",")[1].strip()
-#         print(f"Transfer the timbre of the given music to another music, Input Music: {music_filename}, Target Music: {user_message}.")
-#         updated_music_filename = get_new_audio_name(music_filename, func_name="timbre_transfer")
-#         target_music_filename = get_new_audio_name(user_message, func_name="timbre_transfer")
-#         # load
-#         wav, sr = torchaudio.load(music_filename)
-#         target_wav, target_sr = torchaudio.load(user_message)
-#         # stretch
-#         wav = torchaudio.functional.time_stretch(wav, sr, target_sr/sr)[0]
-#         # write
-#         audio_write(updated_music_filename[:-4],
-#                     wav.cpu(), sr, strategy="loudness", loudness_compressor=True)
-#         audio_write(target_music_filename[:-4],
-#                     target_wav.cpu(), target_sr, strategy="loudness", loudness_compressor=True)
-#         print(f"\nProcessed TimbreTransfer, Output Music: {updated_music_filename}.")
-#         return updated_music_filename
+class TimbreTransfer(object):
+    def __init__(self, device):
+        print("Initializing TimbreTransfer")
+        self.device = device
+        self.interface = interface
+
+    @prompts(
+        name="Transfer the timbre of the given music to another music.",
+        description="useful if you want to transfer the timbre of the given music to another music."
+                    "like: transfer the timbre of this music to another music."
+ "The input to this tool should be a comma separated string of two, " + "representing the music_filename and the original user message." + ) + + def inference(self, inputs): + music_filename, user_message = inputs.split(",")[0].strip(), inputs.split(",")[1].strip() + print(f"Transfer the timbre of the given music to another music, Input Music: {music_filename}, Target Music: {user_message}.") + updated_music_filename = get_new_audio_name(music_filename, func_name="timbre_transfer") + target_music_filename = get_new_audio_name(user_message, func_name="timbre_transfer") + # load + wav, sr = torchaudio.load(music_filename) + target_wav, target_sr = torchaudio.load(user_message) + # stretch + wav = torchaudio.functional.time_stretch(wav, sr, target_sr/sr)[0] + # write + audio_write(updated_music_filename[:-4], + wav.cpu(), sr, strategy="loudness", loudness_compressor=True) + audio_write(target_music_filename[:-4], + target_wav.cpu(), target_sr, strategy="loudness", loudness_compressor=True) + print(f"\nProcessed TimbreTransfer, Output Music: {updated_music_filename}.") + return updated_music_filename diff --git a/melodytalk/utils.py b/melodytalk/utils.py index 8f74947..85be6c6 100644 --- a/melodytalk/utils.py +++ b/melodytalk/utils.py @@ -325,7 +325,7 @@ def add_single_sound_effect(input: str) -> str: Let us think step by step. Q: I want to use a 200hz highpass filter to this audio. - A: pedalboard.HighpassFilter(cutoff_frequency_hz=200); + A: pedalboard.HighpassFilter(cutoff_frequency_hz=200) Q: {input}. A: """