new model: vampnet
ldzhangyx committed Aug 1, 2023
1 parent 7bcc344 commit f8ab42e
Showing 5 changed files with 242 additions and 8 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -154,5 +154,7 @@ cython_debug/
output/
assets/
melodytalk/music/
*.pth
*.wav

.DS_Store
112 changes: 112 additions & 0 deletions melodytalk/dependencies/vampnet/main.py
@@ -0,0 +1,112 @@
from pathlib import Path
from typing import Tuple
import yaml
import tempfile
import uuid
import shutil
from dataclasses import dataclass, asdict

import numpy as np
import audiotools as at
import argbind
import torch

import gradio as gr
from melodytalk.dependencies.vampnet.interface import Interface
from melodytalk.dependencies.vampnet import mask as pmask


def vamp(input_audio_path=None,
output_audio_path=None,
interface=None,
top_p=0,
prefix_s=0, # inpainting
suffix_s=0, # inpainting
rand_mask_intensity=1,
num_steps=36,
periodic_p=0, # periodic mask
periodic_w=0, # periodic mask
onset_mask_width=0, # onset mask
beat_mask_width=0, # beat mask
dropout=0, # dropout
beat_mask_downbeats=False,
n_conditioning_codebooks=0,
seed=0,
masktemp=1.5,
sampletemp=1.0,
typical_filtering=False,
typical_mass=0.15,
typical_min_tokens=64,
use_coarse2fine=True):
# preprocess files
# trim to 10s

# sig = at.AudioSignal(input_audio_path)
sig = at.AudioSignal(input_audio_path, duration=10)

z = interface.encode(sig)

ncc = n_conditioning_codebooks

# build the mask
mask = pmask.linear_random(z, rand_mask_intensity)
mask = pmask.mask_and(
mask, pmask.inpaint(
z,
interface.s2t(prefix_s),
interface.s2t(suffix_s)
)
)
mask = pmask.mask_and(
mask, pmask.periodic_mask(
z,
periodic_p,
periodic_w,
random_roll=True
)
)
if onset_mask_width > 0:
mask = pmask.mask_or(
mask, pmask.onset_mask(sig, z, interface, width=onset_mask_width)
)
if beat_mask_width > 0:
beat_mask = interface.make_beat_mask(
sig,
after_beat_s=(beat_mask_width / 1000),
mask_upbeats=not beat_mask_downbeats,
)
mask = pmask.mask_and(mask, beat_mask)

# these should be the last two mask ops
mask = pmask.dropout(mask, dropout)
mask = pmask.codebook_unmask(mask, ncc)
_top_p = top_p if top_p > 0 else None

_seed = seed if seed > 0 else None
zv, mask_z = interface.coarse_vamp(
z,
mask=mask,
sampling_steps=num_steps,
mask_temperature=masktemp * 10,
sampling_temperature=sampletemp,
return_mask=True,
typical_filtering=typical_filtering,
typical_mass=typical_mass,
typical_min_tokens=typical_min_tokens,
top_p=_top_p,
gen_fn=interface.coarse.generate,
seed=_seed,
)

if use_coarse2fine:
zv = interface.coarse_to_fine(
zv,
mask_temperature=masktemp * 10,
sampling_temperature=sampletemp,
mask=mask,
sampling_steps=num_steps,
seed=_seed,
)

sig = interface.to_signal(zv).cpu()
sig.write(output_audio_path)
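
A minimal usage sketch of this entry point for inpainting follows (the file names and the 3 s / 5 s split are assumptions for illustration, not part of the commit; the checkpoint paths mirror those used in melodytalk/modules.py):

import torch

from melodytalk.dependencies.vampnet.interface import Interface
from melodytalk.dependencies.vampnet.main import vamp

# Build the vampnet interface from local checkpoints (assumed to be downloaded already).
interface = Interface(
    coarse_ckpt="./models/vampnet/coarse.pth",
    coarse2fine_ckpt="./models/vampnet/c2f.pth",
    codec_ckpt="./models/vampnet/codec.pth",
    wavebeat_ckpt="./models/wavebeat.pth",
    device="cuda" if torch.cuda.is_available() else "cpu",
)

# vamp() trims the input to 10 s; keep the first 3 s and the last 5 s and regenerate the rest.
vamp(
    input_audio_path="music/example.wav",            # hypothetical input clip
    output_audio_path="music/example_inpainted.wav",
    interface=interface,
    prefix_s=3,
    suffix_s=5,
)

Calling vamp() with only prefix_s/suffix_s set mirrors what MusicInpainting does in modules.py: the default rand_mask_intensity of 1 masks the whole clip, and the inpaint mask then protects the prefix and suffix.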
7 changes: 5 additions & 2 deletions melodytalk/main.py
@@ -115,9 +115,12 @@ class ConversationBot(object):
def __init__(self):
load_dict = {"Text2Music": "cuda:0",
"ExtractTrack": "cuda:0",
"Text2MusicWithMelody": "cuda:0",
"ReArrangement": "cuda:0",
"Text2MusicWithDrum": "cuda:0",
"AddNewTrack": "cuda:0"}
"Text2MusicWithTitle": "cuda:0",
"AddNewTrack": "cuda:0",
"MusicInpainting": "cuda:0",
"Variation": "cuda:0",}
template_dict = None # { "Text2MusicwithChord": "cuda:0"} # "Accompaniment": "cuda:0",

print(f"Initializing MelodyTalk, load_dict={load_dict}, template_dict={template_dict}")
104 changes: 98 additions & 6 deletions melodytalk/modules.py
@@ -10,6 +10,9 @@
import demucs.separate
# CLAP
from melodytalk.dependencies import laion_clap
# Vampnet
from melodytalk.dependencies.vampnet.interface import Interface
from melodytalk.dependencies.vampnet.main import vamp

from utils import *

@@ -29,6 +32,15 @@
CLAP_model = laion_clap.CLAP_Module(enable_fusion=False, amodel="HTSAT-base", device="cuda")
CLAP_model.load_ckpt("/home/intern-2023-02/melodytalk/melodytalk/pretrained/music_audioset_epoch_15_esc_90.14.pt")

# Vampnet
interface = Interface(
coarse_ckpt="./models/vampnet/coarse.pth",
coarse2fine_ckpt="./models/vampnet/c2f.pth",
codec_ckpt="./models/vampnet/codec.pth",
wavebeat_ckpt="./models/wavebeat.pth",
device="cuda" if torch.cuda.is_available() else "cpu",
)
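# Note: these checkpoints (coarse, coarse-to-fine, codec and wavebeat weights) are assumed
# to have been downloaded to ./models/ beforehand; they are not created by this commit.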

@dataclass
class GlobalAttributes(object):
# metadata
@@ -94,18 +106,45 @@ def inference(self, text):
print(f"\nProcessed Text2Music, Input Text: {text}, Output Music: {music_filename}.")
return music_filename

- class Text2MusicWithMelody(object):
+ class Text2MusicWithTitle(object):
def __init__(self, device):
print("Initializing Text2MusicWithTitle")
self.device = device
self.model = musicgen_model

@prompts(
name="Generate music from user input when the input is a title of music",
description="useful if you want to generate music which is silimar and save it to a file."
"like: generate music of love pop song, or generate music with piano and violin."
"The input to this tool should be a comma separated string of two, "
"representing the text description and the title."
)

def inference(self, inputs):
text, title = inputs.split(",")[0].strip(), inputs.split(",")[1].strip()
music_filename = os.path.join("music", f"{title}.wav")
text = music_title_to_description(text) # using chatGPT's knowledge base to convert title to description
attribute_table.descriptions = text
text = description_to_attributes(text) # convert text to attributes
wav = self.model.generate([text], progress=False)
wav = wav[0] # batch size is 1
audio_write(music_filename[:-4],
wav.cpu(), self.model.sample_rate, strategy="loudness", loudness_compressor=True)
print(f"\nProcessed Text2MusicWithTitle, Input Text: {text}, Output Music: {music_filename}.")
return music_filename

class ReArrangement(object):
def __init__(self, device):
print("Initializing Text2MusicWithMelody")
self.device = device
self.model = musicgen_model

@prompts(
name="Generate music from user input text with given melody condition",
description="useful if you want to style transfer or remix music with a user input text describing the target style and the original music."
name="Generate a new music arrangement with text indicating new style and previous music.",
description="useful if you want to style transfer or rearrange music with a user input text describing the target style and the previous music."
"Please use Text2MusicWithDrum instead if the condition is a single drum track."
"You shall not use it when no previous music file in the history."
"like: remix the given melody with text description, or doing style transfer as text described with the given melody."
"like: remix the given melody with text description, or doing style transfer as text described from previous music."
"The input to this tool should be a comma separated string of two, "
"representing the music_filename and the text description."
)
@@ -328,8 +367,61 @@ def __init__(self):


class MusicInpainting(object):
def __init__(self):
raise NotImplementedError
def __init__(self, device):
print("Initializing MusicInpainting")
self.device = device
self.interface = interface

@prompts(
name="Inpaint a specific time region of the given music.",
description="useful if you want to inpaint or regenerate a specific region (must with explicit time start and ending) of music."
"like: re-generate the 3s-5s part of this music."
"The input to this tool should be a comma separated string of three, "
"representing the music_filename, the start time (in second), and the end time (in second)."
)

def inference(self, inputs):
music_filename, start_time, end_time = inputs.split(",")[0].strip(), inputs.split(",")[1].strip(), inputs.split(",")[2].strip()
print(f"Inpainting a specific time region of the given music, Input Music: {music_filename}, Start Time: {start_time}, End Time: {end_time}.")
updated_music_filename = get_new_audio_name(music_filename, func_name="inpainting_" + start_time + "_" + end_time)
p_track, sr = torchaudio.load(music_filename)
audio_length_in_second = p_track.shape[-1] / sr
if float(end_time) > audio_length_in_second:
print(f"Invalid end time, please check the input.")
end_time = audio_length_in_second
start_time, end_time = int(start_time), int(audio_length_in_second - float(end_time))
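# A worked example of the mapping above (values assumed for illustration): vamp() trims
# clips to 10 s, so for a request to regenerate 3 s-5 s of a 10 s file,
#   prefix_s = start_time            -> 3 (seconds kept untouched at the start)
#   suffix_s = length - end_time     -> 10 - 5 = 5 (seconds kept untouched at the end)
# and only the 3 s-5 s window is re-generated.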
vamp(input_audio_path=music_filename,
output_audio_path=updated_music_filename,
interface=self.interface,
prefix_s=start_time,
suffix_s=end_time)
print(f"\nProcessed MusicInpainting, Output Music: {updated_music_filename}.")
return updated_music_filename

class Variation(object):
def __init__(self, device):
print("Initializing Variation")
self.device = device
self.interface = interface

@prompts(
name="Generate a variation of given music.",
description="useful if you want to generate a variation of music, or re-generate the entire music track."
"like: re-generate this music, or, generate a variant."
"The input to this tool should be a single string, "
"representing the music_filename."
)

def inference(self, inputs):
music_filename = inputs
print(f"Generate a variation of given music, Input Music: {music_filename}.")
updated_music_filename = get_new_audio_name(music_filename, func_name="variation")
p_track, sr = torchaudio.load(music_filename)
vamp(input_audio_path=music_filename,
output_audio_path=updated_music_filename,
interface=self.interface,)
print(f"\nProcessed Variation, Output Music: {updated_music_filename}.")
return updated_music_filename

# class Accompaniment(object):
# template_model = True
25 changes: 25 additions & 0 deletions melodytalk/utils.py
@@ -122,6 +122,31 @@ def addtrack_demand_to_description(description: str) -> str:

return response.choices[0].text


def music_title_to_description(description: str, use_api: bool = False) -> str:
if use_api:
raise NotImplementedError

openai_prompt = f"""Please transfer the music title to a description including genre, instruments and moods.
Q: Let it go
A: an anime pop song with vocal and piano arrangement, constructing a quiet and hopeful atmosphere.
Q: {description}
A: """

response = openai.Completion.create(
model="text-davinci-003",
prompt=openai_prompt,
temperature=0,
max_tokens=100,
top_p=1,
frequency_penalty=0.0,
presence_penalty=0.0,
)

return response.choices[0].text
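
# Example call (the returned description is hypothetical, for illustration only):
#   music_title_to_description("Viva la Vida")
#   -> "an orchestral pop song driven by strings and drums, with a triumphant, anthemic mood"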

def merge_description(description_1: str, description_2: str) -> str:
openai_prompt = f"""Please merge two descriptions into one.
