Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
In addition to the specific generation tasks, Amphion includes several **vocoders** and **evaluation metrics**. A vocoder is an important module for producing high-quality audio signals, while evaluation metrics are critical for ensuring consistent metrics in generation tasks. Moreover, Amphion is dedicated to advancing audio generation in real-world applications, such as building **large-scale datasets** for speech synthesis.

## 🚀 News
- **2025/04/12**: We release [***Vevo1.5***](models/svc/vevosing/README.md), which extends Vevo and focuses on unified and controllable generation for both speech and singing voice. Vevo1.5 can be applied to a wide range of speech and singing voice generation tasks, including VC, TTS, AC, SVS, SVC, Speech/Singing Voice Editing, and more. [![blog](https://img.shields.io/badge/README-Blog-blue.svg)](https://veiled-army-9c5.notion.site/Vevo1-5-1d2ce17b49a280b5b444d3fa2300c93a)
- **2025/02/26**: We release [***Metis***](https://github.com/open-mmlab/Amphion/tree/main/models/tts/metis), a foundation model for unified speech generation. The system supports zero-shot text-to-speech, voice conversion, target speaker extraction, speech enhancement, and lip-to-speech. [![arXiv](https://img.shields.io/badge/arXiv-Paper-COLOR.svg)](https://arxiv.org/pdf/2502.03128) [![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-model-yellow)](https://huggingface.co/amphion/metis)
- **2025/02/26**: *The Emilia-Large dataset, featuring over 200,000 hours of data, is now available!!!* Emilia-Large combines the original 101k-hour Emilia dataset (licensed under `CC BY-NC 4.0`) with the brand-new 114k-hour **Emilia-YODAS dataset** (licensed under `CC BY 4.0`). Download at [![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Dataset-yellow)](https://huggingface.co/datasets/amphion/Emilia-Dataset). Check details at [![arXiv](https://img.shields.io/badge/arXiv-Paper-COLOR.svg)](https://arxiv.org/abs/2501.15907).
- **2025/01/30**: We release [Amphion v0.2 Technical Report](https://arxiv.org/abs/2501.15442), which provides a comprehensive overview of the Amphion updates in 2024. [![arXiv](https://img.shields.io/badge/arXiv-Paper-COLOR.svg)](https://arxiv.org/abs/2501.15442)
Expand Down
9 changes: 9 additions & 0 deletions bins/codec/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,19 @@
import torch

from models.codec.facodec.facodec_trainer import FAcodecTrainer
from models.codec.vevo.vqvae_trainer import (
VQVAETrainer,
)
from models.codec.coco.rep_coco_trainer import RepCocoTrainer

from utils.util import load_config


def build_trainer(args, cfg):
supported_trainer = {
"FAcodec": FAcodecTrainer,
"RepCoco": RepCocoTrainer,
"VQVAE": VQVAETrainer,
}

trainer_class = supported_trainer[cfg.model_type]
Expand Down Expand Up @@ -50,6 +56,9 @@ def main():
help="A specific name to note the experiment",
required=True,
)
parser.add_argument(
"--resume", action="store_true", help="The model name to restore"
)
parser.add_argument(
"--resume_type",
type=str,
Expand Down
54 changes: 36 additions & 18 deletions bins/svc/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,13 @@
from models.svc.comosvc.comosvc_trainer import ComoSVCTrainer
from models.svc.transformer.transformer_trainer import TransformerTrainer
from models.svc.vits.vits_trainer import VitsSVCTrainer
from models.svc.flow_matching_transformer.fmt_trainer import (
FlowMatchingTransformerTrainer,
)
from models.svc.autoregressive_transformer.ar_trainer import (
AutoregressiveTransformerTrainer,
)

from utils.util import load_config


Expand All @@ -20,6 +27,8 @@ def build_trainer(args, cfg):
"DiffComoSVC": ComoSVCTrainer,
"TransformerSVC": TransformerTrainer,
"VitsSVC": VitsSVCTrainer,
"AutoregressiveTransformer": AutoregressiveTransformerTrainer,
"FlowMatchingTransformer": FlowMatchingTransformerTrainer,
}

trainer_class = supported_trainer[cfg.model_type]
Expand Down Expand Up @@ -79,24 +88,33 @@ def main():
cfg = load_config(args.config)

# Data Augmentation
if (
type(cfg.preprocess.data_augment) == list
and len(cfg.preprocess.data_augment) > 0
):
new_datasets_list = []
for dataset in cfg.preprocess.data_augment:
new_datasets = [
f"{dataset}_pitch_shift" if cfg.preprocess.use_pitch_shift else None,
(
f"{dataset}_formant_shift"
if cfg.preprocess.use_formant_shift
else None
),
f"{dataset}_equalizer" if cfg.preprocess.use_equalizer else None,
f"{dataset}_time_stretch" if cfg.preprocess.use_time_stretch else None,
]
new_datasets_list.extend(filter(None, new_datasets))
cfg.dataset.extend(new_datasets_list)
if "data_augment" in cfg.preprocess:
if (
type(cfg.preprocess.data_augment) == list
and len(cfg.preprocess.data_augment) > 0
):
new_datasets_list = []
for dataset in cfg.preprocess.data_augment:
new_datasets = [
(
f"{dataset}_pitch_shift"
if cfg.preprocess.use_pitch_shift
else None
),
(
f"{dataset}_formant_shift"
if cfg.preprocess.use_formant_shift
else None
),
f"{dataset}_equalizer" if cfg.preprocess.use_equalizer else None,
(
f"{dataset}_time_stretch"
if cfg.preprocess.use_time_stretch
else None
),
]
new_datasets_list.extend(filter(None, new_datasets))
cfg.dataset.extend(new_datasets_list)

# CUDA settings
cuda_relevant()
Expand Down
96 changes: 96 additions & 0 deletions bins/vc/train.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import argparse
import torch

from models.vc.flow_matching_transformer.fmt_trainer import (
FlowMatchingTransformerTrainer,
)
from models.vc.autoregressive_transformer.ar_trainer import (
AutoregressiveTransformerTrainer,
)

from utils.util import load_config


def build_trainer(args, cfg):
    """Instantiate the trainer selected by ``cfg.model_type``.

    Args:
        args: Parsed command-line arguments, forwarded to the trainer.
        cfg: Loaded experiment configuration; ``cfg.model_type`` chooses
            the trainer implementation.

    Returns:
        An initialized trainer instance ready for ``train_loop()``.

    Raises:
        KeyError: If ``cfg.model_type`` is not one of the supported types.
    """
    supported_trainer = {
        "FlowMatchingTransformer": FlowMatchingTransformerTrainer,
        "AutoregressiveTransformer": AutoregressiveTransformerTrainer,
    }

    try:
        trainer_class = supported_trainer[cfg.model_type]
    except KeyError:
        # Keep KeyError for callers, but make the message actionable.
        raise KeyError(
            f"Unsupported model_type {cfg.model_type!r}; "
            f"expected one of {sorted(supported_trainer)}"
        ) from None
    trainer = trainer_class(args, cfg)
    return trainer


def cuda_relevant(deterministic=False):
    """Configure global CUDA/cuDNN runtime settings for training.

    Args:
        deterministic: When True, force deterministic algorithms and
            disable cuDNN autotuning; when False, favor speed over
            reproducibility.
    """
    # Release any cached GPU memory left over from earlier allocations.
    torch.cuda.empty_cache()

    # Enable cuDNN and allow TF32 matmul on Ampere-class (and newer) GPUs.
    backends = torch.backends
    backends.cuda.matmul.allow_tf32 = True
    backends.cudnn.enabled = True
    backends.cudnn.allow_tf32 = True

    # Reproducibility knobs: cuDNN benchmark autotuning is inherently
    # non-deterministic, so it is toggled inversely to `deterministic`.
    backends.cudnn.deterministic = deterministic
    backends.cudnn.benchmark = not deterministic
    torch.use_deterministic_algorithms(deterministic)


def main():
    """Parse command-line arguments, load the experiment config, and run training."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config",
        default="config.json",
        help="json files for configurations.",
        required=True,
    )
    parser.add_argument(
        "--exp_name",
        type=str,
        default="exp_name",
        help="A specific name to note the experiment",
        required=True,
    )
    parser.add_argument(
        # Fixed help text: this is a boolean flag, not a model name.
        "--resume",
        action="store_true",
        help="If specified, resume training from an existing checkpoint.",
    )
    parser.add_argument(
        "--log_level", default="warning", help="logging level (debug, info, warning)"
    )
    parser.add_argument(
        "--resume_type",
        type=str,
        default="resume",
        help="Resume training or finetuning.",
    )
    parser.add_argument(
        "--checkpoint_path",
        type=str,
        default=None,
        help="Checkpoint for resume training or finetuning.",
    )
    parser.add_argument(
        "--dataloader_seed",
        type=int,
        default=1,
        help="Seed for dataloader",
    )

    args = parser.parse_args()
    cfg = load_config(args.config)

    # CUDA settings
    cuda_relevant()

    # Build trainer
    trainer = build_trainer(args, cfg)
    # Keep PyTorch's own CPU threading minimal; dataloader workers and
    # accelerate handle parallelism.
    torch.set_num_threads(1)
    torch.set_num_interop_threads(1)
    trainer.train_loop()


if __name__ == "__main__":
    main()
30 changes: 19 additions & 11 deletions bins/vocoder/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from models.vocoders.gan.gan_vocoder_trainer import GANVocoderTrainer
from models.vocoders.diffusion.diffusion_vocoder_trainer import DiffusionVocoderTrainer
from models.vocoders.vocos.vocos_trainer import VocosTrainer

from utils.util import load_config

Expand All @@ -17,6 +18,7 @@ def build_trainer(args, cfg):
supported_trainer = {
"GANVocoder": GANVocoderTrainer,
"DiffusionVocoder": DiffusionVocoderTrainer,
"Vocos": VocosTrainer,
}

trainer_class = supported_trainer[cfg.model_type]
Expand Down Expand Up @@ -51,6 +53,11 @@ def main():
help="A specific name to note the experiment",
required=True,
)
parser.add_argument(
"--resume",
action="store_true",
help="If specified, to resume from the existing checkpoint.",
)
parser.add_argument(
"--resume_type",
type=str,
Expand All @@ -68,17 +75,18 @@ def main():
cfg = load_config(args.config)

# Data Augmentation
if cfg.preprocess.data_augment:
new_datasets_list = []
for dataset in cfg.preprocess.data_augment:
new_datasets = [
# f"{dataset}_pitch_shift",
# f"{dataset}_formant_shift",
f"{dataset}_equalizer",
f"{dataset}_time_stretch",
]
new_datasets_list.extend(new_datasets)
cfg.dataset.extend(new_datasets_list)
if "data_augment" in cfg.preprocess:
if cfg.preprocess.data_augment:
new_datasets_list = []
for dataset in cfg.preprocess.data_augment:
new_datasets = [
# f"{dataset}_pitch_shift",
# f"{dataset}_formant_shift",
f"{dataset}_equalizer",
f"{dataset}_time_stretch",
]
new_datasets_list.extend(new_datasets)
cfg.dataset.extend(new_datasets_list)

# CUDA settings
cuda_relevant()
Expand Down
92 changes: 92 additions & 0 deletions egs/codec/coco/contentstyle_fvq16384_12.5hz.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
{
"model_type": "RepCoco",
"dataset": {
"emilia": 1, // 101k hours, 34m samples
"singnet": 20 // 400 hours, 0.34m samples * 20 = 6.8m samples
},
"singnet_path": "[Please fill out your singing data path]/sing400.json",
"preprocess": {
"hop_size": 480,
"sample_rate": 24000,
"n_fft": 1920,
"num_mels": 128,
"win_size": 1920,
"fmin": 0,
"fmax": 12000,
"mel_var": 8.14,
"mel_mean": -4.92,
"f0_fmin": 50.0,
"f0_fmax": 1100.0,
"load_chromagram": true
},
"model": {
"coco": {
"coco_type": "content_style", // content, style, or content_style
"downsample_rate": 4, // The original frame rate is 50 Hz, downsample to 12.5 Hz
"codebook_size": 16384,
"hidden_size": 1024, // Representations Dim
"codebook_dim": 8,
"encoder": {
"vocos_dim": 384,
"vocos_intermediate_dim": 2048,
"vocos_num_layers": 12,
},
"decoder": {
"vocos_dim": 384,
"vocos_intermediate_dim": 2048,
"vocos_num_layers": 12,
},
"use_normed_whisper": true,
"whisper_stats_path": "models/svc/vevosing/config/whisper_stats.pt",
"whisper_dim": 1024,
"chromagram_dim": 24
},
"cond_sample_rate": 16000
},
"log_dir": "ckpts/coco",
"train": {
"max_epoch": 0,
"use_dynamic_batchsize": true,
"max_tokens": 18000,
"max_sentences": 90,
"lr_warmup_steps": 10000,
"lr_scheduler": "constant",
"num_train_steps": 1000000,
"adam": {
"lr": 1e-4,
"betas": [
0.5,
0.9
]
},
"ddp": false,
"random_seed": 114,
"batch_size": 32, // use batch_size if not use dynamic batchsize
"epochs": 5000,
"max_steps": 1000000,
"total_training_steps": 800000,
"save_summary_steps": 500,
"save_checkpoints_steps": 1000,
"save_checkpoints_backup_steps": 100000,
"valid_interval": 2000,
"keep_checkpoint_max": 100,
"gradient_accumulation_step": 1,
"tracker": [
"tensorboard"
],
"save_checkpoint_stride": [
1
],
"keep_last": [
5
],
"run_eval": [
true
],
"dataloader": {
"num_worker": 8,
"pin_memory": true
},
"use_emilia_dataset": true
}
}
18 changes: 18 additions & 0 deletions egs/codec/coco/contentstyle_fvq16384_12.5hz.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/bin/bash
######## Build Experiment Environment ###########
# Resolve the experiment dir (where this script lives) and the repo root
# three levels up, independent of the caller's working directory.
# `$(...)` replaces deprecated backticks; `&& pwd` prevents silently using
# the wrong directory if the cd fails; quoting survives paths with spaces.
exp_dir=$(cd "$(dirname "$0")" && pwd)
work_dir=$(dirname "$(dirname "$(dirname "$exp_dir")")")

export WORK_DIR="$work_dir"
export PYTHONPATH="$work_dir"
export PYTHONIOENCODING=UTF-8

######## Set Experiment Configuration ###########
exp_config="$exp_dir/contentstyle_fvq16384_12.5hz.json"
exp_name="contentstyle_fvq16384_12.5hz"

####### Train Model ###########
CUDA_VISIBLE_DEVICES="0" accelerate launch --main_process_port 14557 --mixed_precision="bf16" \
    "${work_dir}"/bins/codec/train.py \
    --config="$exp_config" \
    --exp_name="$exp_name" \
    --log_level debug
Loading