diff --git a/README.md b/README.md
index 8086494c06a269371a239c43ef0a10ed3317636e..de2864e758f527f5aa16a91ab1738b79acb9d675 100644
--- a/README.md
+++ b/README.md
@@ -456,6 +456,7 @@ DeepSparkHub甄选上百个应用算法和模型,覆盖AI和通用计算各领
 | Model | Framework | Dataset | IXUCA SDK |
 |-----------------------------------------------------------------------|--------------|-------------|-------|
+| [CosyVoice2-0.5B](audio/speech_synthesis/cosyvoice/deepspeed) | DeepSpeed | openslr | 4.3.0 |
 | [PP-TTS-FastSpeech2](audio/speech_synthesis/fastspeech2/paddlepaddle) | PaddlePaddle | CSMSC | 3.1.0 |
 | [PP-TTS-HiFiGAN](audio/speech_synthesis/hifigan/paddlepaddle) | PaddlePaddle | CSMSC | 3.1.0 |
 | [Tacotron2](audio/speech_synthesis/tacotron2/pytorch) | PyTorch | LJSpeech | 2.2.0 |
diff --git a/README_en.md b/README_en.md
index b31e48663f67f9e00b6404b42e18cd3ea514e035..3116386737b60d693bde9627ba883a1c87b74a59 100644
--- a/README_en.md
+++ b/README_en.md
@@ -458,6 +458,7 @@ individuals, healthcare, education, communication, energy, and more.
 | Model | Framework | Dataset | IXUCA SDK |
 |-----------------------------------------------------------------------|--------------|-------------|-------|
+| [CosyVoice2-0.5B](audio/speech_synthesis/cosyvoice/deepspeed) | DeepSpeed | openslr | 4.3.0 |
 | [PP-TTS-FastSpeech2](audio/speech_synthesis/fastspeech2/paddlepaddle) | PaddlePaddle | CSMSC | 3.1.0 |
 | [PP-TTS-HiFiGAN](audio/speech_synthesis/hifigan/paddlepaddle) | PaddlePaddle | CSMSC | 3.1.0 |
 | [Tacotron2](audio/speech_synthesis/tacotron2/pytorch) | PyTorch | LJSpeech | 2.2.0 |
diff --git a/audio/speech_synthesis/cosyvoice/deepspeed/README.md b/audio/speech_synthesis/cosyvoice/deepspeed/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..37c8e24754231d01a2f8e489c471743a6848a3a6
--- /dev/null
+++ b/audio/speech_synthesis/cosyvoice/deepspeed/README.md
@@ -0,0 +1,69 @@
+# CosyVoice2
+
+## Model Description
+
+CosyVoice2-0.5B is a compact speech synthesis model designed to generate natural, human-like speech. It can be used for tasks such as voice assistants, text-to-speech, and voice cloning. With 0.5 billion parameters, it is lightweight enough to run on devices with limited computing power, while still focusing on natural-sounding voices and easy customization.
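+
+As a quick illustration of the zero-shot voice-cloning workflow, the sketch below follows the upstream CosyVoice repository's Python API. The prompt-clip path and texts are placeholders, and the checkpoint is assumed to have been downloaded as described in "Prepare Resources" below:
+
+```python
+import sys
+sys.path.append('third_party/Matcha-TTS')  # run from the CosyVoice repo root
+
+import torchaudio
+from cosyvoice.cli.cosyvoice import CosyVoice2
+from cosyvoice.utils.file_utils import load_wav
+
+# Load the 0.5B checkpoint from the local pretrained_models directory.
+cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=False, load_trt=False, fp16=False)
+
+# A short 16 kHz reference recording of the target voice (placeholder path).
+prompt_speech_16k = load_wav('./asset/zero_shot_prompt.wav', 16000)
+
+# Clone the reference voice onto new text; one wav is saved per generated segment.
+for i, out in enumerate(cosyvoice.inference_zero_shot(
+        'Text to be synthesized in the cloned voice.',
+        'Transcript of the reference recording.',
+        prompt_speech_16k, stream=False)):
+    torchaudio.save(f'zero_shot_{i}.wav', out['tts_speech'], cosyvoice.sample_rate)
+```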
+
+## Supported Environments
+
+| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release |
+| :----: | :----: | :----: |
+| BI-V150 | 4.3.0 | 25.09 |
+
+## Model Preparation
+
+### Prepare Resources
+
+Pretrained model: <https://www.modelscope.cn/models/iic/CosyVoice2-0.5B>
+
+Dataset: <https://openslr.elda.org/resources/60/>
+
+```bash
+mkdir -p /root/datasets/openslr/libritts
+cd /root/datasets/openslr/libritts
+wget https://openslr.elda.org/resources/60/train-clean-100.tar.gz
+wget https://openslr.elda.org/resources/60/dev-clean.tar.gz
+tar -xvzf train-clean-100.tar.gz
+tar -xvzf dev-clean.tar.gz
+```
+
+### Install Dependencies
+
+Contact the Iluvatar administrator to get the missing packages:
+
+- transformers-4.45.2+corex.4.3.0-py3-none-any.whl
+- deepspeed-0.16.4+corex.4.3.0-cp310-cp310-linux_x86_64.whl
+
+```bash
+pip3 install -r requirements.txt
+git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git
+# If the submodule clone fails due to network issues, rerun the following command until it succeeds
+cd CosyVoice
+git submodule update --init --recursive
+
+mkdir -p pretrained_models
+# Download the CosyVoice2-0.5B model into the pretrained_models directory
+
+# If you encounter sox compatibility issues
+# Ubuntu
+sudo apt-get install sox libsox-dev
+# CentOS
+sudo yum install sox sox-devel
+```
+
+## Model Training
+
+```bash
+cp ../run_dpo.sh examples/libritts/cosyvoice2/
+cp ../run.sh examples/libritts/cosyvoice2/
+cd examples/libritts/cosyvoice2/
+
+export PYTHONPATH=../../../:../../../third_party/Matcha-TTS:$PYTHONPATH
+bash run.sh
+```
+
+## Model Results
+
+## References
+
+- [CosyVoice](https://github.com/FunAudioLLM/CosyVoice/commit/0a496c18f78ca993c63f6d880fcc60778bfc85c1)
\ No newline at end of file
diff --git a/audio/speech_synthesis/cosyvoice/deepspeed/requirements.txt b/audio/speech_synthesis/cosyvoice/deepspeed/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f3f0917dac4757f45b359a0514f0cda2169770e9
--- /dev/null
+++ b/audio/speech_synthesis/cosyvoice/deepspeed/requirements.txt
@@ -0,0 +1,40 @@
+--extra-index-url https://download.pytorch.org/whl/cu121
+--extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/ # https://github.com/microsoft/onnxruntime/issues/21684
+# NOTE: deepspeed/torch/transformers are commented out below; install the Iluvatar corex builds instead (see README.md)
+conformer==0.3.2
+#deepspeed==0.15.1; sys_platform == 'linux'
+diffusers==0.29.0
+fastapi==0.115.6
+fastapi-cli==0.0.4
+gdown==5.1.0
+gradio==5.4.0
+grpcio==1.57.0
+grpcio-tools==1.57.0
+hydra-core==1.3.2
+HyperPyYAML==1.2.2
+inflect==7.3.1
+librosa==0.10.2
+lightning==2.2.4
+matplotlib==3.7.5
+modelscope==1.20.0
+networkx==3.1
+omegaconf==2.3.0
+#onnx==1.16.0
+#onnxruntime-gpu==1.18.0; sys_platform == 'linux'
+onnxruntime==1.18.0; sys_platform == 'darwin' or sys_platform == 'win32'
+openai-whisper
+protobuf==4.25
+pyarrow==18.1.0
+pydantic==2.7.0
+pyworld==0.3.4
+rich==13.7.1
+soundfile==0.12.1
+tensorboard==2.14.0
+#tensorrt-cu12==10.0.1; sys_platform == 'linux'
+#tensorrt-cu12-bindings==10.0.1; sys_platform == 'linux'
+#tensorrt-cu12-libs==10.0.1; sys_platform == 'linux'
+#torch==2.3.1
+#torchaudio==2.3.1
+#transformers==4.40.1
+uvicorn==0.30.0
+wetext==0.0.4
+wget==3.2
\ No newline at end of file
diff --git a/audio/speech_synthesis/cosyvoice/deepspeed/run.sh b/audio/speech_synthesis/cosyvoice/deepspeed/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..cc0b7c2e06372d7b7f654f1370875a91a09ec205
--- /dev/null
+++ b/audio/speech_synthesis/cosyvoice/deepspeed/run.sh
@@ -0,0 +1,126 @@
+#!/bin/bash
+# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Copyright 2024 Alibaba Inc. All Rights Reserved.
+. ./path.sh || exit 1;
+
+stage=-1
+stop_stage=5
+
+data_url=www.openslr.org/resources/60
+data_dir=/root/datasets/openslr/libritts
+pretrained_model_dir=../../../pretrained_models/CosyVoice2-0.5B
+
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+  echo "Data Download"
+  # Data is downloaded manually as described in README.md; uncomment to fetch it here instead.
+  #for part in dev-clean train-clean-100; do
+  #  local/download_and_untar.sh ${data_dir} ${data_url} ${part}
+  #done
+fi
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+  echo "Data preparation, prepare wav.scp/text/utt2spk/spk2utt"
+  for x in train-clean-100 dev-clean; do
+    mkdir -p data/$x
+    python3 local/prepare_data.py --src_dir $data_dir/LibriTTS/$x --des_dir data/$x
+  done
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+  echo "Extract campplus speaker embedding, you will get spk2embedding.pt and utt2embedding.pt in data/$x dir"
+  for x in train-clean-100 dev-clean; do
+    python3 tools/extract_embedding.py --dir data/$x \
+      --onnx_path $pretrained_model_dir/campplus.onnx
+  done
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+  echo "Extract discrete speech token, you will get utt2speech_token.pt in data/$x dir"
+  for x in train-clean-100 dev-clean; do
+    python3 tools/extract_speech_token.py --dir data/$x \
+      --onnx_path $pretrained_model_dir/speech_tokenizer_v2.onnx
+  done
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+  echo "Prepare required parquet format data, you should have prepared wav.scp/text/utt2spk/spk2utt/utt2embedding.pt/spk2embedding.pt/utt2speech_token.pt"
+  for x in train-clean-100 dev-clean; do
+    mkdir -p data/$x/parquet
+    python3 tools/make_parquet_list.py --num_utts_per_parquet 1000 \
+      --num_processes 10 \
+      --src_dir data/$x \
+      --des_dir data/$x/parquet
+  done
+fi
+
+# train llm
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+job_id=1986
+dist_backend="nccl"
+num_workers=2
+prefetch=100
+train_engine=torch_ddp
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+  echo "Run training. Only llm training is supported for now"
+  if [ $train_engine == 'deepspeed' ]; then
+    echo "Notice deepspeed has its own optimizer config. Modify conf/ds_stage2.json if necessary"
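+    # Assumption: conf/ds_stage2.json is the upstream DeepSpeed config; the name
+    # suggests ZeRO stage 2 (optimizer-state/gradient partitioning). It only takes
+    # effect when train_engine is switched to 'deepspeed' above.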
+  fi
+  cat data/train-clean-100/parquet/data.list > data/train.data.list
+  cat data/dev-clean/parquet/data.list > data/dev.data.list
+  # NOTE will update llm/hift training later
+  for model in llm; do
+    torchrun --nnodes=1 --nproc_per_node=$num_gpus \
+      --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:1234" \
+      cosyvoice/bin/train.py \
+      --train_engine $train_engine \
+      --config conf/cosyvoice2.yaml \
+      --train_data data/train.data.list \
+      --cv_data data/dev.data.list \
+      --qwen_pretrain_path $pretrained_model_dir/CosyVoice-BlankEN \
+      --model $model \
+      --checkpoint $pretrained_model_dir/$model.pt \
+      --model_dir `pwd`/exp/cosyvoice2/$model/$train_engine \
+      --tensorboard_dir `pwd`/tensorboard/cosyvoice2/$model/$train_engine \
+      --ddp.dist_backend $dist_backend \
+      --num_workers ${num_workers} \
+      --prefetch ${prefetch} \
+      --pin_memory \
+      --use_amp \
+      --deepspeed_config ./conf/ds_stage2.json \
+      --deepspeed.save_states model+optimizer
+  done
+fi
+
+# average model
+average_num=5
+if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
+  for model in llm flow hifigan; do
+    # NOTE: must match the --model_dir used in the training stage (exp/cosyvoice2)
+    decode_checkpoint=`pwd`/exp/cosyvoice2/$model/$train_engine/${model}.pt
+    echo "do model average and final checkpoint is $decode_checkpoint"
+    python3 cosyvoice/bin/average_model.py \
+      --dst_model $decode_checkpoint \
+      --src_path `pwd`/exp/cosyvoice2/$model/$train_engine \
+      --num ${average_num} \
+      --val_best
+  done
+fi
+
+if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
+  echo "Export your model for inference speedup. Remember to copy your llm or flow model to model_dir"
+  python3 cosyvoice/bin/export_jit.py --model_dir $pretrained_model_dir
+  python3 cosyvoice/bin/export_onnx.py --model_dir $pretrained_model_dir
+fi
\ No newline at end of file
diff --git a/tests/model_info.json b/tests/model_info.json
index 87f01851f7d60254bfa04ee050b006be947453a4..12bb3d93905b5a0b53886273cda704ba317f7022 100644
--- a/tests/model_info.json
+++ b/tests/model_info.json
@@ -7511,6 +7511,30 @@
         "github_branch": "",
         "github_path": "",
         "priority": "P4"
+    },
+    {
+        "model_name": "cosyvoice",
+        "framework": "deepspeed",
+        "release_version": "25.09",
+        "release_sdk": "4.3.0",
+        "release_gpgpu": "BI-V150",
+        "latest_sdk": "",
+        "latest_gpgpu": "",
+        "category": "audio/speech_synthesis",
+        "toolbox": "",
+        "mdims": "",
+        "dataset": "openslr",
+        "license": "",
+        "model_path": "deepsparkhub/audio/speech_synthesis/cosyvoice/deepspeed",
+        "readme_file": "deepsparkhub/audio/speech_synthesis/cosyvoice/deepspeed/README.md",
+        "bitbucket_repo": "",
+        "bitbucket_branch": "",
+        "bitbucket_path": "",
+        "develop_owner": "",
+        "github_repo": "",
+        "github_branch": "",
+        "github_path": "",
+        "priority": "P4"
     }
 ]
}
\ No newline at end of file