new model: vampnet
ldzhangyx committed Aug 1, 2023
1 parent 7bcc344 commit f8ab42e
Showing 5 changed files with 242 additions and 8 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -154,5 +154,7 @@ cython_debug/
output/
assets/
melodytalk/music/
*.pth
*.wav

.DS_Store
112 changes: 112 additions & 0 deletions melodytalk/dependencies/vampnet/main.py
@@ -0,0 +1,112 @@
from pathlib import Path
from typing import Tuple
import yaml
import tempfile
import uuid
import shutil
from dataclasses import dataclass, asdict

import numpy as np
import audiotools as at
import argbind
import torch

import gradio as gr
from melodytalk.dependencies.vampnet.interface import Interface
from melodytalk.dependencies.vampnet import mask as pmask


def vamp(input_audio_path=None,
output_audio_path=None,
interface=None,
top_p=0,
prefix_s=0, # inpainting
suffix_s=0, # inpainting
rand_mask_intensity=1,
num_steps=36,
periodic_p=0, # periodic mask
periodic_w=0, # periodic mask
onset_mask_width=0, # onset mask
beat_mask_width=0, # beat mask
dropout=0, # dropout
beat_mask_downbeats=False,
n_conditioning_codebooks=0,
seed=0,
masktemp=1.5,
sampletemp=1.0,
typical_filtering=False,
typical_mass=0.15,
typical_min_tokens=64,
use_coarse2fine=True):
# preprocess files
# trim to 10s

# sig = at.AudioSignal(input_audio_path)
sig = at.AudioSignal(input_audio_path, duration=10)

z = interface.encode(sig)

ncc = n_conditioning_codebooks

# build the mask
mask = pmask.linear_random(z, rand_mask_intensity)
mask = pmask.mask_and(
mask, pmask.inpaint(
z,
interface.s2t(prefix_s),
interface.s2t(suffix_s)
)
)
mask = pmask.mask_and(
mask, pmask.periodic_mask(
z,
periodic_p,
periodic_w,
random_roll=True
)
)
if onset_mask_width > 0:
mask = pmask.mask_or(
mask, pmask.onset_mask(sig, z, interface, width=onset_mask_width)
)
if beat_mask_width > 0:
beat_mask = interface.make_beat_mask(
sig,
after_beat_s=(beat_mask_width / 1000),
mask_upbeats=not beat_mask_downbeats,
)
mask = pmask.mask_and(mask, beat_mask)

# these should be the last two mask ops
mask = pmask.dropout(mask, dropout)
mask = pmask.codebook_unmask(mask, ncc)
_top_p = top_p if top_p > 0 else None

_seed = seed if seed > 0 else None
zv, mask_z = interface.coarse_vamp(
z,
mask=mask,
sampling_steps=num_steps,
mask_temperature=masktemp * 10,
sampling_temperature=sampletemp,
return_mask=True,
typical_filtering=typical_filtering,
typical_mass=typical_mass,
typical_min_tokens=typical_min_tokens,
top_p=_top_p,
gen_fn=interface.coarse.generate,
seed=_seed,
)

if use_coarse2fine:
zv = interface.coarse_to_fine(
zv,
mask_temperature=masktemp * 10,
sampling_temperature=sampletemp,
mask=mask,
sampling_steps=num_steps,
seed=_seed,
)

sig = interface.to_signal(zv).cpu()
sig.write(output_audio_path)
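
A minimal usage sketch of this entry point for inpainting follows (the file names and the 3 s / 5 s split are assumptions for illustration, not part of the commit; the checkpoint paths mirror those used in melodytalk/modules.py):

import torch

from melodytalk.dependencies.vampnet.interface import Interface
from melodytalk.dependencies.vampnet.main import vamp

# Build the vampnet interface from local checkpoints (assumed to be downloaded already).
interface = Interface(
    coarse_ckpt="./models/vampnet/coarse.pth",
    coarse2fine_ckpt="./models/vampnet/c2f.pth",
    codec_ckpt="./models/vampnet/codec.pth",
    wavebeat_ckpt="./models/wavebeat.pth",
    device="cuda" if torch.cuda.is_available() else "cpu",
)

# vamp() trims the input to 10 s; keep the first 3 s and the last 5 s and regenerate the rest.
vamp(
    input_audio_path="music/example.wav",            # hypothetical input clip
    output_audio_path="music/example_inpainted.wav",
    interface=interface,
    prefix_s=3,
    suffix_s=5,
)

Calling vamp() with only prefix_s/suffix_s set mirrors what MusicInpainting does in modules.py: the default rand_mask_intensity of 1 masks the whole clip, and the inpaint mask then protects the prefix and suffix.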
7 changes: 5 additions & 2 deletions melodytalk/main.py
@@ -115,9 +115,12 @@ class ConversationBot(object):
def __init__(self):
load_dict = {"Text2Music": "cuda:0",
"ExtractTrack": "cuda:0",
"Text2MusicWithMelody": "cuda:0",
"ReArrangement": "cuda:0",
"Text2MusicWithDrum": "cuda:0",
"AddNewTrack": "cuda:0"}
"Text2MusicWithTitle": "cuda:0",
"AddNewTrack": "cuda:0",
"MusicInpainting": "cuda:0",
"Variation": "cuda:0",}
template_dict = None # { "Text2MusicwithChord": "cuda:0"} # "Accompaniment": "cuda:0",

print(f"Initializing MelodyTalk, load_dict={load_dict}, template_dict={template_dict}")
104 changes: 98 additions & 6 deletions melodytalk/modules.py
@@ -10,6 +10,9 @@
import demucs.separate
# CLAP
from melodytalk.dependencies import laion_clap
# Vampnet
from melodytalk.dependencies.vampnet.interface import Interface
from melodytalk.dependencies.vampnet.main import vamp

from utils import *

@@ -29,6 +32,15 @@
CLAP_model = laion_clap.CLAP_Module(enable_fusion=False, amodel="HTSAT-base", device="cuda")
CLAP_model.load_ckpt("/home/intern-2023-02/melodytalk/melodytalk/pretrained/music_audioset_epoch_15_esc_90.14.pt")

# Vampnet
interface = Interface(
coarse_ckpt="./models/vampnet/coarse.pth",
coarse2fine_ckpt="./models/vampnet/c2f.pth",
codec_ckpt="./models/vampnet/codec.pth",
wavebeat_ckpt="./models/wavebeat.pth",
device="cuda" if torch.cuda.is_available() else "cpu",
)
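# Note: these checkpoints (coarse, coarse-to-fine, codec and wavebeat weights) are assumed
# to have been downloaded to ./models/ beforehand; they are not created by this commit.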

@dataclass
class GlobalAttributes(object):
# metadata
@@ -94,18 +106,45 @@ def inference(self, text):
print(f"\nProcessed Text2Music, Input Text: {text}, Output Music: {music_filename}.")
return music_filename

- class Text2MusicWithMelody(object):
+ class Text2MusicWithTitle(object):
def __init__(self, device):
print("Initializing Text2MusicWithTitle")
self.device = device
self.model = musicgen_model

@prompts(
name="Generate music from user input when the input is a title of music",
description="useful if you want to generate music which is silimar and save it to a file."
"like: generate music of love pop song, or generate music with piano and violin."
"The input to this tool should be a comma separated string of two, "
"representing the text description and the title."
)

def inference(self, inputs):
text, title = inputs.split(",")[0].strip(), inputs.split(",")[1].strip()
music_filename = os.path.join("music", f"{title}.wav")
text = music_title_to_description(text) # using chatGPT's knowledge base to convert title to description
attribute_table.descriptions = text
text = description_to_attributes(text) # convert text to attributes
wav = self.model.generate([text], progress=False)
wav = wav[0] # batch size is 1
audio_write(music_filename[:-4],
wav.cpu(), self.model.sample_rate, strategy="loudness", loudness_compressor=True)
print(f"\nProcessed Text2MusicWithTitle, Input Text: {text}, Output Music: {music_filename}.")
return music_filename

class ReArrangement(object):
def __init__(self, device):
print("Initializing Text2MusicWithMelody")
self.device = device
self.model = musicgen_model

@prompts(
name="Generate music from user input text with given melody condition",
description="useful if you want to style transfer or remix music with a user input text describing the target style and the original music."
name="Generate a new music arrangement with text indicating new style and previous music.",
description="useful if you want to style transfer or rearrange music with a user input text describing the target style and the previous music."
"Please use Text2MusicWithDrum instead if the condition is a single drum track."
"You shall not use it when no previous music file in the history."
"like: remix the given melody with text description, or doing style transfer as text described with the given melody."
"like: remix the given melody with text description, or doing style transfer as text described from previous music."
"The input to this tool should be a comma separated string of two, "
"representing the music_filename and the text description."
)
@@ -328,8 +367,61 @@ def __init__(self):


class MusicInpainting(object):
def __init__(self):
raise NotImplementedError
def __init__(self, device):
print("Initializing MusicInpainting")
self.device = device
self.interface = interface

@prompts(
name="Inpaint a specific time region of the given music.",
description="useful if you want to inpaint or regenerate a specific region (must with explicit time start and ending) of music."
"like: re-generate the 3s-5s part of this music."
"The input to this tool should be a comma separated string of three, "
"representing the music_filename, the start time (in second), and the end time (in second)."
)

def inference(self, inputs):
music_filename, start_time, end_time = inputs.split(",")[0].strip(), inputs.split(",")[1].strip(), inputs.split(",")[2].strip()
print(f"Inpainting a specific time region of the given music, Input Music: {music_filename}, Start Time: {start_time}, End Time: {end_time}.")
updated_music_filename = get_new_audio_name(music_filename, func_name="inpainting_" + start_time + "_" + end_time)
p_track, sr = torchaudio.load(music_filename)
audio_length_in_second = p_track.shape[-1] / sr
if float(end_time) > audio_length_in_second:
print(f"Invalid end time, please check the input.")
end_time = audio_length_in_second
start_time, end_time = int(start_time), int(audio_length_in_second - float(end_time))
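# A worked example of the mapping above (values assumed for illustration): vamp() trims
# clips to 10 s, so for a request to regenerate 3 s-5 s of a 10 s file,
#   prefix_s = start_time            -> 3 (seconds kept untouched at the start)
#   suffix_s = length - end_time     -> 10 - 5 = 5 (seconds kept untouched at the end)
# and only the 3 s-5 s window is re-generated.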
vamp(input_audio_path=music_filename,
output_audio_path=updated_music_filename,
interface=self.interface,
prefix_s=start_time,
suffix_s=end_time)
print(f"\nProcessed MusicInpainting, Output Music: {updated_music_filename}.")
return updated_music_filename

class Variation(object):
def __init__(self, device):
print("Initializing Variation")
self.device = device
self.interface = interface

@prompts(
name="Generate a variation of given music.",
description="useful if you want to generate a variation of music, or re-generate the entire music track."
"like: re-generate this music, or, generate a variant."
"The input to this tool should be a single string, "
"representing the music_filename."
)

def inference(self, inputs):
music_filename = inputs
print(f"Generate a variation of given music, Input Music: {music_filename}.")
updated_music_filename = get_new_audio_name(music_filename, func_name="variation")
p_track, sr = torchaudio.load(music_filename)
vamp(input_audio_path=music_filename,
output_audio_path=updated_music_filename,
interface=self.interface,)
print(f"\nProcessed Variation, Output Music: {updated_music_filename}.")
return updated_music_filename

# class Accompaniment(object):
# template_model = True
25 changes: 25 additions & 0 deletions melodytalk/utils.py
@@ -122,6 +122,31 @@ def addtrack_demand_to_description(description: str) -> str:

return response.choices[0].text


def music_title_to_description(description: str, use_api: bool = False) -> str:
if use_api:
raise NotImplementedError

openai_prompt = f"""Please transfer the music title to a description including genre, instruments and moods.
Q: Let it go
A: an anime pop song with vocal and piano arrangement, constructing a quiet and hopeful atmosphere.
Q: {description}
A: """

response = openai.Completion.create(
model="text-davinci-003",
prompt=openai_prompt,
temperature=0,
max_tokens=100,
top_p=1,
frequency_penalty=0.0,
presence_penalty=0.0,
)

return response.choices[0].text
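
# Example call (the returned description is hypothetical, for illustration only):
#   music_title_to_description("Viva la Vida")
#   -> "an orchestral pop song driven by strings and drums, with a triumphant, anthemic mood"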

def merge_description(description_1: str, description_2: str) -> str:
openai_prompt = f"""Please merge two descriptions into one.
