From 5e714ecb4a40561c2a2e6a54ff8c4d787cea4ec4 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Mon, 19 Sep 2022 18:35:08 +0800 Subject: [PATCH] [doc]update api docs (#2406) * update apt docs, test=doc --- docs/source/api/paddlespeech.audio.rst | 3 + ...ddlespeech.audio.streamdata.autodecode.rst | 7 + .../paddlespeech.audio.streamdata.cache.rst | 7 + .../paddlespeech.audio.streamdata.compat.rst | 7 + ...espeech.audio.streamdata.extradatasets.rst | 7 + .../paddlespeech.audio.streamdata.filters.rst | 7 + .../paddlespeech.audio.streamdata.gopen.rst | 7 + ...paddlespeech.audio.streamdata.handlers.rst | 7 + .../api/paddlespeech.audio.streamdata.mix.rst | 7 + ...lespeech.audio.streamdata.paddle_utils.rst | 7 + ...paddlespeech.audio.streamdata.pipeline.rst | 7 + .../api/paddlespeech.audio.streamdata.rst | 28 ++ ...ddlespeech.audio.streamdata.shardlists.rst | 7 + ...lespeech.audio.streamdata.tariterators.rst | 7 + .../paddlespeech.audio.streamdata.utils.rst | 7 + .../paddlespeech.audio.streamdata.writer.rst | 7 + docs/source/api/paddlespeech.audio.text.rst | 16 + ...addlespeech.audio.text.text_featurizer.rst | 7 + .../api/paddlespeech.audio.text.utility.rst | 7 + ...addlespeech.audio.transform.add_deltas.rst | 7 + ...peech.audio.transform.channel_selector.rst | 7 + .../api/paddlespeech.audio.transform.cmvn.rst | 7 + ...addlespeech.audio.transform.functional.rst | 7 + .../paddlespeech.audio.transform.perturb.rst | 7 + .../api/paddlespeech.audio.transform.rst | 24 ++ ...dlespeech.audio.transform.spec_augment.rst | 7 + ...ddlespeech.audio.transform.spectrogram.rst | 7 + ...ch.audio.transform.transform_interface.rst | 7 + ...espeech.audio.transform.transformation.rst | 7 + .../api/paddlespeech.audio.transform.wpe.rst | 7 + .../paddlespeech.audio.utils.check_kwargs.rst | 7 + ...addlespeech.audio.utils.dynamic_import.rst | 7 + docs/source/api/paddlespeech.audio.utils.rst | 3 + .../paddlespeech.audio.utils.tensor_utils.rst | 7 + .../paddlespeech.kws.exps.mdtc.collate.rst | 7 + ...paddlespeech.kws.exps.mdtc.compute_det.rst | 7 + ...dlespeech.kws.exps.mdtc.plot_det_curve.rst | 7 + .../source/api/paddlespeech.kws.exps.mdtc.rst | 19 ++ .../api/paddlespeech.kws.exps.mdtc.score.rst | 7 + .../api/paddlespeech.kws.exps.mdtc.train.rst | 7 + docs/source/api/paddlespeech.kws.exps.rst | 15 + docs/source/api/paddlespeech.kws.rst | 1 + .../api/paddlespeech.resource.model_alias.rst | 7 + ...addlespeech.resource.pretrained_models.rst | 7 + .../api/paddlespeech.resource.resource.rst | 7 + docs/source/api/paddlespeech.resource.rst | 17 + docs/source/api/paddlespeech.rst | 2 + docs/source/api/paddlespeech.s2t.rst | 1 - docs/source/api/paddlespeech.server.utils.rst | 1 - docs/source/api/paddlespeech.t2s.datasets.rst | 1 + .../api/paddlespeech.t2s.datasets.sampler.rst | 7 + .../paddlespeech.t2s.exps.ernie_sat.align.rst | 7 + ...dlespeech.t2s.exps.ernie_sat.normalize.rst | 7 + ...lespeech.t2s.exps.ernie_sat.preprocess.rst | 7 + .../api/paddlespeech.t2s.exps.ernie_sat.rst | 21 ++ ...lespeech.t2s.exps.ernie_sat.synthesize.rst | 7 + ...eech.t2s.exps.ernie_sat.synthesize_e2e.rst | 7 + .../paddlespeech.t2s.exps.ernie_sat.train.rst | 7 + .../paddlespeech.t2s.exps.ernie_sat.utils.rst | 7 + .../api/paddlespeech.t2s.exps.fastspeech2.rst | 1 + ...espeech.t2s.exps.fastspeech2.vc2_infer.rst | 7 + docs/source/api/paddlespeech.t2s.exps.rst | 3 + .../paddlespeech.t2s.exps.stream_play_tts.rst | 7 + .../paddlespeech.t2s.exps.vits.normalize.rst | 7 + .../paddlespeech.t2s.exps.vits.preprocess.rst | 7 + .../source/api/paddlespeech.t2s.exps.vits.rst 
| 20 ++ .../paddlespeech.t2s.exps.vits.synthesize.rst | 7 + ...dlespeech.t2s.exps.vits.synthesize_e2e.rst | 7 + .../api/paddlespeech.t2s.exps.vits.train.rst | 7 + ...ddlespeech.t2s.exps.vits.voice_cloning.rst | 7 + ...paddlespeech.t2s.frontend.g2pw.dataset.rst | 7 + ...addlespeech.t2s.frontend.g2pw.onnx_api.rst | 7 + .../api/paddlespeech.t2s.frontend.g2pw.rst | 17 + .../paddlespeech.t2s.frontend.g2pw.utils.rst | 7 + ...paddlespeech.t2s.frontend.mix_frontend.rst | 7 + docs/source/api/paddlespeech.t2s.frontend.rst | 2 + ...espeech.t2s.models.ernie_sat.ernie_sat.rst | 7 + ...t2s.models.ernie_sat.ernie_sat_updater.rst | 7 + .../api/paddlespeech.t2s.models.ernie_sat.rst | 3 +- ...h.t2s.models.vits.monotonic_align.core.rst | 7 + ...speech.t2s.models.vits.monotonic_align.rst | 16 + ....t2s.models.vits.monotonic_align.setup.rst | 7 + .../api/paddlespeech.utils.dynamic_import.rst | 7 + docs/source/api/paddlespeech.utils.env.rst | 7 + docs/source/api/paddlespeech.utils.rst | 16 + docs/source/index.rst | 2 + .../t2s/models/ernie_sat/ernie_sat.py | 108 ++++--- .../t2s/models/vits/duration_predictor.py | 39 ++- paddlespeech/t2s/models/vits/flow.py | 111 ++++--- paddlespeech/t2s/models/vits/generator.py | 301 +++++++++++------- .../t2s/models/vits/posterior_encoder.py | 54 ++-- .../t2s/models/vits/residual_coupling.py | 99 ++++-- paddlespeech/t2s/models/vits/text_encoder.py | 69 ++-- paddlespeech/t2s/models/vits/vits.py | 153 ++++++--- .../t2s/models/vits/wavenet/residual_block.py | 24 +- .../t2s/models/vits/wavenet/wavenet.py | 72 +++-- paddlespeech/t2s/models/wavernn/wavernn.py | 20 +- 97 files changed, 1348 insertions(+), 375 deletions(-) create mode 100644 docs/source/api/paddlespeech.audio.streamdata.autodecode.rst create mode 100644 docs/source/api/paddlespeech.audio.streamdata.cache.rst create mode 100644 docs/source/api/paddlespeech.audio.streamdata.compat.rst create mode 100644 docs/source/api/paddlespeech.audio.streamdata.extradatasets.rst create mode 100644 docs/source/api/paddlespeech.audio.streamdata.filters.rst create mode 100644 docs/source/api/paddlespeech.audio.streamdata.gopen.rst create mode 100644 docs/source/api/paddlespeech.audio.streamdata.handlers.rst create mode 100644 docs/source/api/paddlespeech.audio.streamdata.mix.rst create mode 100644 docs/source/api/paddlespeech.audio.streamdata.paddle_utils.rst create mode 100644 docs/source/api/paddlespeech.audio.streamdata.pipeline.rst create mode 100644 docs/source/api/paddlespeech.audio.streamdata.rst create mode 100644 docs/source/api/paddlespeech.audio.streamdata.shardlists.rst create mode 100644 docs/source/api/paddlespeech.audio.streamdata.tariterators.rst create mode 100644 docs/source/api/paddlespeech.audio.streamdata.utils.rst create mode 100644 docs/source/api/paddlespeech.audio.streamdata.writer.rst create mode 100644 docs/source/api/paddlespeech.audio.text.rst create mode 100644 docs/source/api/paddlespeech.audio.text.text_featurizer.rst create mode 100644 docs/source/api/paddlespeech.audio.text.utility.rst create mode 100644 docs/source/api/paddlespeech.audio.transform.add_deltas.rst create mode 100644 docs/source/api/paddlespeech.audio.transform.channel_selector.rst create mode 100644 docs/source/api/paddlespeech.audio.transform.cmvn.rst create mode 100644 docs/source/api/paddlespeech.audio.transform.functional.rst create mode 100644 docs/source/api/paddlespeech.audio.transform.perturb.rst create mode 100644 docs/source/api/paddlespeech.audio.transform.rst create mode 100644 
docs/source/api/paddlespeech.audio.transform.spec_augment.rst create mode 100644 docs/source/api/paddlespeech.audio.transform.spectrogram.rst create mode 100644 docs/source/api/paddlespeech.audio.transform.transform_interface.rst create mode 100644 docs/source/api/paddlespeech.audio.transform.transformation.rst create mode 100644 docs/source/api/paddlespeech.audio.transform.wpe.rst create mode 100644 docs/source/api/paddlespeech.audio.utils.check_kwargs.rst create mode 100644 docs/source/api/paddlespeech.audio.utils.dynamic_import.rst create mode 100644 docs/source/api/paddlespeech.audio.utils.tensor_utils.rst create mode 100644 docs/source/api/paddlespeech.kws.exps.mdtc.collate.rst create mode 100644 docs/source/api/paddlespeech.kws.exps.mdtc.compute_det.rst create mode 100644 docs/source/api/paddlespeech.kws.exps.mdtc.plot_det_curve.rst create mode 100644 docs/source/api/paddlespeech.kws.exps.mdtc.rst create mode 100644 docs/source/api/paddlespeech.kws.exps.mdtc.score.rst create mode 100644 docs/source/api/paddlespeech.kws.exps.mdtc.train.rst create mode 100644 docs/source/api/paddlespeech.kws.exps.rst create mode 100644 docs/source/api/paddlespeech.resource.model_alias.rst create mode 100644 docs/source/api/paddlespeech.resource.pretrained_models.rst create mode 100644 docs/source/api/paddlespeech.resource.resource.rst create mode 100644 docs/source/api/paddlespeech.resource.rst create mode 100644 docs/source/api/paddlespeech.t2s.datasets.sampler.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.ernie_sat.align.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.ernie_sat.normalize.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.ernie_sat.preprocess.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.ernie_sat.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize_e2e.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.ernie_sat.train.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.ernie_sat.utils.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.fastspeech2.vc2_infer.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.stream_play_tts.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.vits.normalize.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.vits.preprocess.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.vits.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.vits.synthesize.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.vits.synthesize_e2e.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.vits.train.rst create mode 100644 docs/source/api/paddlespeech.t2s.exps.vits.voice_cloning.rst create mode 100644 docs/source/api/paddlespeech.t2s.frontend.g2pw.dataset.rst create mode 100644 docs/source/api/paddlespeech.t2s.frontend.g2pw.onnx_api.rst create mode 100644 docs/source/api/paddlespeech.t2s.frontend.g2pw.rst create mode 100644 docs/source/api/paddlespeech.t2s.frontend.g2pw.utils.rst create mode 100644 docs/source/api/paddlespeech.t2s.frontend.mix_frontend.rst create mode 100644 docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat.rst create mode 100644 docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat_updater.rst create mode 100644 docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.core.rst create mode 100644 docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.rst create mode 100644 
docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.setup.rst create mode 100644 docs/source/api/paddlespeech.utils.dynamic_import.rst create mode 100644 docs/source/api/paddlespeech.utils.env.rst create mode 100644 docs/source/api/paddlespeech.utils.rst diff --git a/docs/source/api/paddlespeech.audio.rst b/docs/source/api/paddlespeech.audio.rst index 5a3867f9620..4ed7e4672cf 100644 --- a/docs/source/api/paddlespeech.audio.rst +++ b/docs/source/api/paddlespeech.audio.rst @@ -20,4 +20,7 @@ Subpackages paddlespeech.audio.io paddlespeech.audio.metric paddlespeech.audio.sox_effects + paddlespeech.audio.streamdata + paddlespeech.audio.text + paddlespeech.audio.transform paddlespeech.audio.utils diff --git a/docs/source/api/paddlespeech.audio.streamdata.autodecode.rst b/docs/source/api/paddlespeech.audio.streamdata.autodecode.rst new file mode 100644 index 00000000000..1e45c13735c --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.autodecode.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.autodecode module +=============================================== + +.. automodule:: paddlespeech.audio.streamdata.autodecode + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.cache.rst b/docs/source/api/paddlespeech.audio.streamdata.cache.rst new file mode 100644 index 00000000000..393055e5475 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.cache.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.cache module +========================================== + +.. automodule:: paddlespeech.audio.streamdata.cache + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.compat.rst b/docs/source/api/paddlespeech.audio.streamdata.compat.rst new file mode 100644 index 00000000000..760695b20d4 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.compat.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.compat module +=========================================== + +.. automodule:: paddlespeech.audio.streamdata.compat + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.extradatasets.rst b/docs/source/api/paddlespeech.audio.streamdata.extradatasets.rst new file mode 100644 index 00000000000..74628e9630d --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.extradatasets.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.extradatasets module +================================================== + +.. automodule:: paddlespeech.audio.streamdata.extradatasets + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.filters.rst b/docs/source/api/paddlespeech.audio.streamdata.filters.rst new file mode 100644 index 00000000000..d26104279bc --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.filters.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.filters module +============================================ + +.. automodule:: paddlespeech.audio.streamdata.filters + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.gopen.rst b/docs/source/api/paddlespeech.audio.streamdata.gopen.rst new file mode 100644 index 00000000000..1cccb776376 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.gopen.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.gopen module +========================================== + +.. 
automodule:: paddlespeech.audio.streamdata.gopen + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.handlers.rst b/docs/source/api/paddlespeech.audio.streamdata.handlers.rst new file mode 100644 index 00000000000..7a4b3ce8e7c --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.handlers.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.handlers module +============================================= + +.. automodule:: paddlespeech.audio.streamdata.handlers + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.mix.rst b/docs/source/api/paddlespeech.audio.streamdata.mix.rst new file mode 100644 index 00000000000..908b35dd118 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.mix.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.mix module +======================================== + +.. automodule:: paddlespeech.audio.streamdata.mix + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.paddle_utils.rst b/docs/source/api/paddlespeech.audio.streamdata.paddle_utils.rst new file mode 100644 index 00000000000..2033430041b --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.paddle_utils.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.paddle\_utils module +================================================== + +.. automodule:: paddlespeech.audio.streamdata.paddle_utils + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.pipeline.rst b/docs/source/api/paddlespeech.audio.streamdata.pipeline.rst new file mode 100644 index 00000000000..ae05fbecc2e --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.pipeline.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.pipeline module +============================================= + +.. automodule:: paddlespeech.audio.streamdata.pipeline + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.rst b/docs/source/api/paddlespeech.audio.streamdata.rst new file mode 100644 index 00000000000..a1f4560a306 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.rst @@ -0,0 +1,28 @@ +paddlespeech.audio.streamdata package +===================================== + +.. automodule:: paddlespeech.audio.streamdata + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.audio.streamdata.autodecode + paddlespeech.audio.streamdata.cache + paddlespeech.audio.streamdata.compat + paddlespeech.audio.streamdata.extradatasets + paddlespeech.audio.streamdata.filters + paddlespeech.audio.streamdata.gopen + paddlespeech.audio.streamdata.handlers + paddlespeech.audio.streamdata.mix + paddlespeech.audio.streamdata.paddle_utils + paddlespeech.audio.streamdata.pipeline + paddlespeech.audio.streamdata.shardlists + paddlespeech.audio.streamdata.tariterators + paddlespeech.audio.streamdata.utils + paddlespeech.audio.streamdata.writer diff --git a/docs/source/api/paddlespeech.audio.streamdata.shardlists.rst b/docs/source/api/paddlespeech.audio.streamdata.shardlists.rst new file mode 100644 index 00000000000..ec1fe823619 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.shardlists.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.shardlists module +=============================================== + +.. 
automodule:: paddlespeech.audio.streamdata.shardlists + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.tariterators.rst b/docs/source/api/paddlespeech.audio.streamdata.tariterators.rst new file mode 100644 index 00000000000..b003b2d42f7 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.tariterators.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.tariterators module +================================================= + +.. automodule:: paddlespeech.audio.streamdata.tariterators + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.utils.rst b/docs/source/api/paddlespeech.audio.streamdata.utils.rst new file mode 100644 index 00000000000..f248b11310c --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.utils.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.utils module +========================================== + +.. automodule:: paddlespeech.audio.streamdata.utils + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.writer.rst b/docs/source/api/paddlespeech.audio.streamdata.writer.rst new file mode 100644 index 00000000000..7437241f37d --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.writer.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.writer module +=========================================== + +.. automodule:: paddlespeech.audio.streamdata.writer + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.text.rst b/docs/source/api/paddlespeech.audio.text.rst new file mode 100644 index 00000000000..a2018050a68 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.text.rst @@ -0,0 +1,16 @@ +paddlespeech.audio.text package +=============================== + +.. automodule:: paddlespeech.audio.text + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.audio.text.text_featurizer + paddlespeech.audio.text.utility diff --git a/docs/source/api/paddlespeech.audio.text.text_featurizer.rst b/docs/source/api/paddlespeech.audio.text.text_featurizer.rst new file mode 100644 index 00000000000..1a8262d089b --- /dev/null +++ b/docs/source/api/paddlespeech.audio.text.text_featurizer.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.text.text\_featurizer module +=============================================== + +.. automodule:: paddlespeech.audio.text.text_featurizer + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.text.utility.rst b/docs/source/api/paddlespeech.audio.text.utility.rst new file mode 100644 index 00000000000..90fcb25f60b --- /dev/null +++ b/docs/source/api/paddlespeech.audio.text.utility.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.text.utility module +====================================== + +.. automodule:: paddlespeech.audio.text.utility + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.add_deltas.rst b/docs/source/api/paddlespeech.audio.transform.add_deltas.rst new file mode 100644 index 00000000000..b4b596d6ee0 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.add_deltas.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.add\_deltas module +=============================================== + +.. 
automodule:: paddlespeech.audio.transform.add_deltas + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.channel_selector.rst b/docs/source/api/paddlespeech.audio.transform.channel_selector.rst new file mode 100644 index 00000000000..4828b590485 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.channel_selector.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.channel\_selector module +===================================================== + +.. automodule:: paddlespeech.audio.transform.channel_selector + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.cmvn.rst b/docs/source/api/paddlespeech.audio.transform.cmvn.rst new file mode 100644 index 00000000000..44655a1e4d6 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.cmvn.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.cmvn module +======================================== + +.. automodule:: paddlespeech.audio.transform.cmvn + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.functional.rst b/docs/source/api/paddlespeech.audio.transform.functional.rst new file mode 100644 index 00000000000..7877d2495ba --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.functional.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.functional module +============================================== + +.. automodule:: paddlespeech.audio.transform.functional + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.perturb.rst b/docs/source/api/paddlespeech.audio.transform.perturb.rst new file mode 100644 index 00000000000..e3615a5d145 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.perturb.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.perturb module +=========================================== + +.. automodule:: paddlespeech.audio.transform.perturb + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.rst b/docs/source/api/paddlespeech.audio.transform.rst new file mode 100644 index 00000000000..47a7303b342 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.rst @@ -0,0 +1,24 @@ +paddlespeech.audio.transform package +==================================== + +.. automodule:: paddlespeech.audio.transform + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.audio.transform.add_deltas + paddlespeech.audio.transform.channel_selector + paddlespeech.audio.transform.cmvn + paddlespeech.audio.transform.functional + paddlespeech.audio.transform.perturb + paddlespeech.audio.transform.spec_augment + paddlespeech.audio.transform.spectrogram + paddlespeech.audio.transform.transform_interface + paddlespeech.audio.transform.transformation + paddlespeech.audio.transform.wpe diff --git a/docs/source/api/paddlespeech.audio.transform.spec_augment.rst b/docs/source/api/paddlespeech.audio.transform.spec_augment.rst new file mode 100644 index 00000000000..f11a322410f --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.spec_augment.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.spec\_augment module +================================================= + +.. 
automodule:: paddlespeech.audio.transform.spec_augment + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.spectrogram.rst b/docs/source/api/paddlespeech.audio.transform.spectrogram.rst new file mode 100644 index 00000000000..6be0c32eea0 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.spectrogram.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.spectrogram module +=============================================== + +.. automodule:: paddlespeech.audio.transform.spectrogram + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.transform_interface.rst b/docs/source/api/paddlespeech.audio.transform.transform_interface.rst new file mode 100644 index 00000000000..ec8b20857af --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.transform_interface.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.transform\_interface module +======================================================== + +.. automodule:: paddlespeech.audio.transform.transform_interface + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.transformation.rst b/docs/source/api/paddlespeech.audio.transform.transformation.rst new file mode 100644 index 00000000000..94629b9afd6 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.transformation.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.transformation module +================================================== + +.. automodule:: paddlespeech.audio.transform.transformation + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.wpe.rst b/docs/source/api/paddlespeech.audio.transform.wpe.rst new file mode 100644 index 00000000000..85c75811494 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.wpe.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.wpe module +======================================= + +.. automodule:: paddlespeech.audio.transform.wpe + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.utils.check_kwargs.rst b/docs/source/api/paddlespeech.audio.utils.check_kwargs.rst new file mode 100644 index 00000000000..a18f27e65a7 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.utils.check_kwargs.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.utils.check\_kwargs module +============================================= + +.. automodule:: paddlespeech.audio.utils.check_kwargs + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.utils.dynamic_import.rst b/docs/source/api/paddlespeech.audio.utils.dynamic_import.rst new file mode 100644 index 00000000000..5d060ee15c3 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.utils.dynamic_import.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.utils.dynamic\_import module +=============================================== + +.. automodule:: paddlespeech.audio.utils.dynamic_import + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.utils.rst b/docs/source/api/paddlespeech.audio.utils.rst index db15927dad9..217afa8fbc9 100644 --- a/docs/source/api/paddlespeech.audio.utils.rst +++ b/docs/source/api/paddlespeech.audio.utils.rst @@ -12,8 +12,11 @@ Submodules .. 
toctree:: :maxdepth: 4 + paddlespeech.audio.utils.check_kwargs paddlespeech.audio.utils.download + paddlespeech.audio.utils.dynamic_import paddlespeech.audio.utils.error paddlespeech.audio.utils.log paddlespeech.audio.utils.numeric + paddlespeech.audio.utils.tensor_utils paddlespeech.audio.utils.time diff --git a/docs/source/api/paddlespeech.audio.utils.tensor_utils.rst b/docs/source/api/paddlespeech.audio.utils.tensor_utils.rst new file mode 100644 index 00000000000..93a1f70eb25 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.utils.tensor_utils.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.utils.tensor\_utils module +============================================= + +.. automodule:: paddlespeech.audio.utils.tensor_utils + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.kws.exps.mdtc.collate.rst b/docs/source/api/paddlespeech.kws.exps.mdtc.collate.rst new file mode 100644 index 00000000000..b533e8c42a3 --- /dev/null +++ b/docs/source/api/paddlespeech.kws.exps.mdtc.collate.rst @@ -0,0 +1,7 @@ +paddlespeech.kws.exps.mdtc.collate module +========================================= + +.. automodule:: paddlespeech.kws.exps.mdtc.collate + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.kws.exps.mdtc.compute_det.rst b/docs/source/api/paddlespeech.kws.exps.mdtc.compute_det.rst new file mode 100644 index 00000000000..45e09455500 --- /dev/null +++ b/docs/source/api/paddlespeech.kws.exps.mdtc.compute_det.rst @@ -0,0 +1,7 @@ +paddlespeech.kws.exps.mdtc.compute\_det module +============================================== + +.. automodule:: paddlespeech.kws.exps.mdtc.compute_det + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.kws.exps.mdtc.plot_det_curve.rst b/docs/source/api/paddlespeech.kws.exps.mdtc.plot_det_curve.rst new file mode 100644 index 00000000000..46a149b0bf1 --- /dev/null +++ b/docs/source/api/paddlespeech.kws.exps.mdtc.plot_det_curve.rst @@ -0,0 +1,7 @@ +paddlespeech.kws.exps.mdtc.plot\_det\_curve module +================================================== + +.. automodule:: paddlespeech.kws.exps.mdtc.plot_det_curve + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.kws.exps.mdtc.rst b/docs/source/api/paddlespeech.kws.exps.mdtc.rst new file mode 100644 index 00000000000..f6cad64e353 --- /dev/null +++ b/docs/source/api/paddlespeech.kws.exps.mdtc.rst @@ -0,0 +1,19 @@ +paddlespeech.kws.exps.mdtc package +================================== + +.. automodule:: paddlespeech.kws.exps.mdtc + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.kws.exps.mdtc.collate + paddlespeech.kws.exps.mdtc.compute_det + paddlespeech.kws.exps.mdtc.plot_det_curve + paddlespeech.kws.exps.mdtc.score + paddlespeech.kws.exps.mdtc.train diff --git a/docs/source/api/paddlespeech.kws.exps.mdtc.score.rst b/docs/source/api/paddlespeech.kws.exps.mdtc.score.rst new file mode 100644 index 00000000000..aa956b4cb2f --- /dev/null +++ b/docs/source/api/paddlespeech.kws.exps.mdtc.score.rst @@ -0,0 +1,7 @@ +paddlespeech.kws.exps.mdtc.score module +======================================= + +.. 
automodule:: paddlespeech.kws.exps.mdtc.score + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.kws.exps.mdtc.train.rst b/docs/source/api/paddlespeech.kws.exps.mdtc.train.rst new file mode 100644 index 00000000000..5e4ca401a20 --- /dev/null +++ b/docs/source/api/paddlespeech.kws.exps.mdtc.train.rst @@ -0,0 +1,7 @@ +paddlespeech.kws.exps.mdtc.train module +======================================= + +.. automodule:: paddlespeech.kws.exps.mdtc.train + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.kws.exps.rst b/docs/source/api/paddlespeech.kws.exps.rst new file mode 100644 index 00000000000..bf10d2c9fff --- /dev/null +++ b/docs/source/api/paddlespeech.kws.exps.rst @@ -0,0 +1,15 @@ +paddlespeech.kws.exps package +============================= + +.. automodule:: paddlespeech.kws.exps + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.kws.exps.mdtc diff --git a/docs/source/api/paddlespeech.kws.rst b/docs/source/api/paddlespeech.kws.rst index c2829a42eb8..d21d094c711 100644 --- a/docs/source/api/paddlespeech.kws.rst +++ b/docs/source/api/paddlespeech.kws.rst @@ -12,4 +12,5 @@ Subpackages .. toctree:: :maxdepth: 4 + paddlespeech.kws.exps paddlespeech.kws.models diff --git a/docs/source/api/paddlespeech.resource.model_alias.rst b/docs/source/api/paddlespeech.resource.model_alias.rst new file mode 100644 index 00000000000..b78e643acea --- /dev/null +++ b/docs/source/api/paddlespeech.resource.model_alias.rst @@ -0,0 +1,7 @@ +paddlespeech.resource.model\_alias module +========================================= + +.. automodule:: paddlespeech.resource.model_alias + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.resource.pretrained_models.rst b/docs/source/api/paddlespeech.resource.pretrained_models.rst new file mode 100644 index 00000000000..a020616933b --- /dev/null +++ b/docs/source/api/paddlespeech.resource.pretrained_models.rst @@ -0,0 +1,7 @@ +paddlespeech.resource.pretrained\_models module +=============================================== + +.. automodule:: paddlespeech.resource.pretrained_models + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.resource.resource.rst b/docs/source/api/paddlespeech.resource.resource.rst new file mode 100644 index 00000000000..8b51eda3c22 --- /dev/null +++ b/docs/source/api/paddlespeech.resource.resource.rst @@ -0,0 +1,7 @@ +paddlespeech.resource.resource module +===================================== + +.. automodule:: paddlespeech.resource.resource + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.resource.rst b/docs/source/api/paddlespeech.resource.rst new file mode 100644 index 00000000000..61fdd531785 --- /dev/null +++ b/docs/source/api/paddlespeech.resource.rst @@ -0,0 +1,17 @@ +paddlespeech.resource package +============================= + +.. automodule:: paddlespeech.resource + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. 
toctree:: + :maxdepth: 4 + + paddlespeech.resource.model_alias + paddlespeech.resource.pretrained_models + paddlespeech.resource.resource diff --git a/docs/source/api/paddlespeech.rst b/docs/source/api/paddlespeech.rst index e7a01bf7653..d06cd2c770d 100644 --- a/docs/source/api/paddlespeech.rst +++ b/docs/source/api/paddlespeech.rst @@ -16,8 +16,10 @@ Subpackages paddlespeech.cli paddlespeech.cls paddlespeech.kws + paddlespeech.resource paddlespeech.s2t paddlespeech.server paddlespeech.t2s paddlespeech.text + paddlespeech.utils paddlespeech.vector diff --git a/docs/source/api/paddlespeech.s2t.rst b/docs/source/api/paddlespeech.s2t.rst index 4be22cb878d..be9ef52f553 100644 --- a/docs/source/api/paddlespeech.s2t.rst +++ b/docs/source/api/paddlespeech.s2t.rst @@ -19,5 +19,4 @@ Subpackages paddlespeech.s2t.models paddlespeech.s2t.modules paddlespeech.s2t.training - paddlespeech.s2t.transform paddlespeech.s2t.utils diff --git a/docs/source/api/paddlespeech.server.utils.rst b/docs/source/api/paddlespeech.server.utils.rst index 9d1166392f8..b4051aee34d 100644 --- a/docs/source/api/paddlespeech.server.utils.rst +++ b/docs/source/api/paddlespeech.server.utils.rst @@ -18,7 +18,6 @@ Submodules paddlespeech.server.utils.config paddlespeech.server.utils.errors paddlespeech.server.utils.exception - paddlespeech.server.utils.log paddlespeech.server.utils.onnx_infer paddlespeech.server.utils.paddle_predictor paddlespeech.server.utils.util diff --git a/docs/source/api/paddlespeech.t2s.datasets.rst b/docs/source/api/paddlespeech.t2s.datasets.rst index b40eb2bf19c..dfbdb0b47ca 100644 --- a/docs/source/api/paddlespeech.t2s.datasets.rst +++ b/docs/source/api/paddlespeech.t2s.datasets.rst @@ -19,4 +19,5 @@ Submodules paddlespeech.t2s.datasets.get_feats paddlespeech.t2s.datasets.ljspeech paddlespeech.t2s.datasets.preprocess_utils + paddlespeech.t2s.datasets.sampler paddlespeech.t2s.datasets.vocoder_batch_fn diff --git a/docs/source/api/paddlespeech.t2s.datasets.sampler.rst b/docs/source/api/paddlespeech.t2s.datasets.sampler.rst new file mode 100644 index 00000000000..ed29c28d73b --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.datasets.sampler.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.datasets.sampler module +======================================== + +.. automodule:: paddlespeech.t2s.datasets.sampler + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.align.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.align.rst new file mode 100644 index 00000000000..a5e07aace39 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.align.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.ernie\_sat.align module +============================================= + +.. automodule:: paddlespeech.t2s.exps.ernie_sat.align + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.normalize.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.normalize.rst new file mode 100644 index 00000000000..3771311cb8c --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.normalize.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.ernie\_sat.normalize module +================================================= + +.. 
automodule:: paddlespeech.t2s.exps.ernie_sat.normalize + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.preprocess.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.preprocess.rst new file mode 100644 index 00000000000..8d4c24ffe58 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.preprocess.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.ernie\_sat.preprocess module +================================================== + +.. automodule:: paddlespeech.t2s.exps.ernie_sat.preprocess + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.rst new file mode 100644 index 00000000000..a611584205d --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.rst @@ -0,0 +1,21 @@ +paddlespeech.t2s.exps.ernie\_sat package +======================================== + +.. automodule:: paddlespeech.t2s.exps.ernie_sat + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.t2s.exps.ernie_sat.align + paddlespeech.t2s.exps.ernie_sat.normalize + paddlespeech.t2s.exps.ernie_sat.preprocess + paddlespeech.t2s.exps.ernie_sat.synthesize + paddlespeech.t2s.exps.ernie_sat.synthesize_e2e + paddlespeech.t2s.exps.ernie_sat.train + paddlespeech.t2s.exps.ernie_sat.utils diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize.rst new file mode 100644 index 00000000000..ecda2a51346 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.ernie\_sat.synthesize module +================================================== + +.. automodule:: paddlespeech.t2s.exps.ernie_sat.synthesize + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize_e2e.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize_e2e.rst new file mode 100644 index 00000000000..00fc4495266 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize_e2e.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.ernie\_sat.synthesize\_e2e module +======================================================= + +.. automodule:: paddlespeech.t2s.exps.ernie_sat.synthesize_e2e + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.train.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.train.rst new file mode 100644 index 00000000000..ba9a3334493 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.train.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.ernie\_sat.train module +============================================= + +.. automodule:: paddlespeech.t2s.exps.ernie_sat.train + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.utils.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.utils.rst new file mode 100644 index 00000000000..a2dd26c38fc --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.utils.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.ernie\_sat.utils module +============================================= + +.. 
automodule:: paddlespeech.t2s.exps.ernie_sat.utils + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.fastspeech2.rst b/docs/source/api/paddlespeech.t2s.exps.fastspeech2.rst index 3c98aa88256..fad1fd87f7d 100644 --- a/docs/source/api/paddlespeech.t2s.exps.fastspeech2.rst +++ b/docs/source/api/paddlespeech.t2s.exps.fastspeech2.rst @@ -16,3 +16,4 @@ Submodules paddlespeech.t2s.exps.fastspeech2.normalize paddlespeech.t2s.exps.fastspeech2.preprocess paddlespeech.t2s.exps.fastspeech2.train + paddlespeech.t2s.exps.fastspeech2.vc2_infer diff --git a/docs/source/api/paddlespeech.t2s.exps.fastspeech2.vc2_infer.rst b/docs/source/api/paddlespeech.t2s.exps.fastspeech2.vc2_infer.rst new file mode 100644 index 00000000000..70a9d6e1571 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.fastspeech2.vc2_infer.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.fastspeech2.vc2\_infer module +=================================================== + +.. automodule:: paddlespeech.t2s.exps.fastspeech2.vc2_infer + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.rst b/docs/source/api/paddlespeech.t2s.exps.rst index a688435eba0..bee18a97209 100644 --- a/docs/source/api/paddlespeech.t2s.exps.rst +++ b/docs/source/api/paddlespeech.t2s.exps.rst @@ -12,11 +12,13 @@ Subpackages .. toctree:: :maxdepth: 4 + paddlespeech.t2s.exps.ernie_sat paddlespeech.t2s.exps.fastspeech2 paddlespeech.t2s.exps.gan_vocoder paddlespeech.t2s.exps.speedyspeech paddlespeech.t2s.exps.tacotron2 paddlespeech.t2s.exps.transformer_tts + paddlespeech.t2s.exps.vits paddlespeech.t2s.exps.waveflow paddlespeech.t2s.exps.wavernn @@ -31,6 +33,7 @@ Submodules paddlespeech.t2s.exps.ort_predict paddlespeech.t2s.exps.ort_predict_e2e paddlespeech.t2s.exps.ort_predict_streaming + paddlespeech.t2s.exps.stream_play_tts paddlespeech.t2s.exps.syn_utils paddlespeech.t2s.exps.synthesize paddlespeech.t2s.exps.synthesize_e2e diff --git a/docs/source/api/paddlespeech.t2s.exps.stream_play_tts.rst b/docs/source/api/paddlespeech.t2s.exps.stream_play_tts.rst new file mode 100644 index 00000000000..cb22dde0c38 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.stream_play_tts.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.stream\_play\_tts module +============================================== + +.. automodule:: paddlespeech.t2s.exps.stream_play_tts + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.vits.normalize.rst b/docs/source/api/paddlespeech.t2s.exps.vits.normalize.rst new file mode 100644 index 00000000000..c5606f99830 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.vits.normalize.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.vits.normalize module +=========================================== + +.. automodule:: paddlespeech.t2s.exps.vits.normalize + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.vits.preprocess.rst b/docs/source/api/paddlespeech.t2s.exps.vits.preprocess.rst new file mode 100644 index 00000000000..50633c621ee --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.vits.preprocess.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.vits.preprocess module +============================================ + +.. 
automodule:: paddlespeech.t2s.exps.vits.preprocess + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.vits.rst b/docs/source/api/paddlespeech.t2s.exps.vits.rst new file mode 100644 index 00000000000..51a9418d598 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.vits.rst @@ -0,0 +1,20 @@ +paddlespeech.t2s.exps.vits package +================================== + +.. automodule:: paddlespeech.t2s.exps.vits + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.t2s.exps.vits.normalize + paddlespeech.t2s.exps.vits.preprocess + paddlespeech.t2s.exps.vits.synthesize + paddlespeech.t2s.exps.vits.synthesize_e2e + paddlespeech.t2s.exps.vits.train + paddlespeech.t2s.exps.vits.voice_cloning diff --git a/docs/source/api/paddlespeech.t2s.exps.vits.synthesize.rst b/docs/source/api/paddlespeech.t2s.exps.vits.synthesize.rst new file mode 100644 index 00000000000..4b22d069a5f --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.vits.synthesize.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.vits.synthesize module +============================================ + +.. automodule:: paddlespeech.t2s.exps.vits.synthesize + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.vits.synthesize_e2e.rst b/docs/source/api/paddlespeech.t2s.exps.vits.synthesize_e2e.rst new file mode 100644 index 00000000000..053ddfc8328 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.vits.synthesize_e2e.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.vits.synthesize\_e2e module +================================================= + +.. automodule:: paddlespeech.t2s.exps.vits.synthesize_e2e + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.vits.train.rst b/docs/source/api/paddlespeech.t2s.exps.vits.train.rst new file mode 100644 index 00000000000..31bd3a48f58 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.vits.train.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.vits.train module +======================================= + +.. automodule:: paddlespeech.t2s.exps.vits.train + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.vits.voice_cloning.rst b/docs/source/api/paddlespeech.t2s.exps.vits.voice_cloning.rst new file mode 100644 index 00000000000..d9be0f310f7 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.vits.voice_cloning.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.vits.voice\_cloning module +================================================ + +.. automodule:: paddlespeech.t2s.exps.vits.voice_cloning + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.frontend.g2pw.dataset.rst b/docs/source/api/paddlespeech.t2s.frontend.g2pw.dataset.rst new file mode 100644 index 00000000000..1635ec284c9 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.frontend.g2pw.dataset.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.frontend.g2pw.dataset module +============================================= + +.. 
automodule:: paddlespeech.t2s.frontend.g2pw.dataset + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.frontend.g2pw.onnx_api.rst b/docs/source/api/paddlespeech.t2s.frontend.g2pw.onnx_api.rst new file mode 100644 index 00000000000..b7d549070e2 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.frontend.g2pw.onnx_api.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.frontend.g2pw.onnx\_api module +=============================================== + +.. automodule:: paddlespeech.t2s.frontend.g2pw.onnx_api + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.frontend.g2pw.rst b/docs/source/api/paddlespeech.t2s.frontend.g2pw.rst new file mode 100644 index 00000000000..10a118b7650 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.frontend.g2pw.rst @@ -0,0 +1,17 @@ +paddlespeech.t2s.frontend.g2pw package +====================================== + +.. automodule:: paddlespeech.t2s.frontend.g2pw + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.t2s.frontend.g2pw.dataset + paddlespeech.t2s.frontend.g2pw.onnx_api + paddlespeech.t2s.frontend.g2pw.utils diff --git a/docs/source/api/paddlespeech.t2s.frontend.g2pw.utils.rst b/docs/source/api/paddlespeech.t2s.frontend.g2pw.utils.rst new file mode 100644 index 00000000000..ce942803792 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.frontend.g2pw.utils.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.frontend.g2pw.utils module +=========================================== + +.. automodule:: paddlespeech.t2s.frontend.g2pw.utils + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.frontend.mix_frontend.rst b/docs/source/api/paddlespeech.t2s.frontend.mix_frontend.rst new file mode 100644 index 00000000000..4505dddba61 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.frontend.mix_frontend.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.frontend.mix\_frontend module +============================================== + +.. automodule:: paddlespeech.t2s.frontend.mix_frontend + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.frontend.rst b/docs/source/api/paddlespeech.t2s.frontend.rst index 8fbf1e6ebc5..b6106861629 100644 --- a/docs/source/api/paddlespeech.t2s.frontend.rst +++ b/docs/source/api/paddlespeech.t2s.frontend.rst @@ -12,6 +12,7 @@ Subpackages .. toctree:: :maxdepth: 4 + paddlespeech.t2s.frontend.g2pw paddlespeech.t2s.frontend.normalizer paddlespeech.t2s.frontend.zh_normalization @@ -23,6 +24,7 @@ Submodules paddlespeech.t2s.frontend.arpabet paddlespeech.t2s.frontend.generate_lexicon + paddlespeech.t2s.frontend.mix_frontend paddlespeech.t2s.frontend.phonectic paddlespeech.t2s.frontend.punctuation paddlespeech.t2s.frontend.tone_sandhi diff --git a/docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat.rst b/docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat.rst new file mode 100644 index 00000000000..fce5a83cc58 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.models.ernie\_sat.ernie\_sat module +==================================================== + +.. 
automodule:: paddlespeech.t2s.models.ernie_sat.ernie_sat + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat_updater.rst b/docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat_updater.rst new file mode 100644 index 00000000000..8a697d6cf35 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat_updater.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.models.ernie\_sat.ernie\_sat\_updater module +============================================================= + +.. automodule:: paddlespeech.t2s.models.ernie_sat.ernie_sat_updater + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.models.ernie_sat.rst b/docs/source/api/paddlespeech.t2s.models.ernie_sat.rst index 680a85deaf2..aff7489c736 100644 --- a/docs/source/api/paddlespeech.t2s.models.ernie_sat.rst +++ b/docs/source/api/paddlespeech.t2s.models.ernie_sat.rst @@ -12,4 +12,5 @@ Submodules .. toctree:: :maxdepth: 4 - paddlespeech.t2s.models.ernie_sat.mlm + paddlespeech.t2s.models.ernie_sat.ernie_sat + paddlespeech.t2s.models.ernie_sat.ernie_sat_updater diff --git a/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.core.rst b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.core.rst new file mode 100644 index 00000000000..7aaba795299 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.core.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.models.vits.monotonic\_align.core module +========================================================= + +.. automodule:: paddlespeech.t2s.models.vits.monotonic_align.core + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.rst b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.rst new file mode 100644 index 00000000000..25c819a7efd --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.rst @@ -0,0 +1,16 @@ +paddlespeech.t2s.models.vits.monotonic\_align package +===================================================== + +.. automodule:: paddlespeech.t2s.models.vits.monotonic_align + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.t2s.models.vits.monotonic_align.core + paddlespeech.t2s.models.vits.monotonic_align.setup diff --git a/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.setup.rst b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.setup.rst new file mode 100644 index 00000000000..a93c3b8bf62 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.setup.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.models.vits.monotonic\_align.setup module +========================================================== + +.. automodule:: paddlespeech.t2s.models.vits.monotonic_align.setup + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.utils.dynamic_import.rst b/docs/source/api/paddlespeech.utils.dynamic_import.rst new file mode 100644 index 00000000000..daa4e6e7845 --- /dev/null +++ b/docs/source/api/paddlespeech.utils.dynamic_import.rst @@ -0,0 +1,7 @@ +paddlespeech.utils.dynamic\_import module +========================================= + +.. 
automodule:: paddlespeech.utils.dynamic_import + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.utils.env.rst b/docs/source/api/paddlespeech.utils.env.rst new file mode 100644 index 00000000000..e51278f8288 --- /dev/null +++ b/docs/source/api/paddlespeech.utils.env.rst @@ -0,0 +1,7 @@ +paddlespeech.utils.env module +============================= + +.. automodule:: paddlespeech.utils.env + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.utils.rst b/docs/source/api/paddlespeech.utils.rst new file mode 100644 index 00000000000..3d47626bbdc --- /dev/null +++ b/docs/source/api/paddlespeech.utils.rst @@ -0,0 +1,16 @@ +paddlespeech.utils package +========================== + +.. automodule:: paddlespeech.utils + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.utils.dynamic_import + paddlespeech.utils.env diff --git a/docs/source/index.rst b/docs/source/index.rst index 83474c5286d..8540d3fc67a 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -74,8 +74,10 @@ Contents paddlespeech.cli paddlespeech.cls paddlespeech.kws + paddlespeech.resource paddlespeech.s2t paddlespeech.server paddlespeech.t2s paddlespeech.text + paddlespeech.utils paddlespeech.vector diff --git a/paddlespeech/t2s/models/ernie_sat/ernie_sat.py b/paddlespeech/t2s/models/ernie_sat/ernie_sat.py index 08c43dc5f8d..eb42b33ed39 100644 --- a/paddlespeech/t2s/models/ernie_sat/ernie_sat.py +++ b/paddlespeech/t2s/models/ernie_sat/ernie_sat.py @@ -71,31 +71,53 @@ class MLMEncoder(nn.Layer): """Conformer encoder module. Args: - idim (int): Input dimension. - attention_dim (int): Dimension of attention. - attention_heads (int): The number of heads of multi head attention. - linear_units (int): The number of units of position-wise feed forward. - num_blocks (int): The number of decoder blocks. - dropout_rate (float): Dropout rate. - positional_dropout_rate (float): Dropout rate after adding positional encoding. - attention_dropout_rate (float): Dropout rate in attention. - input_layer (Union[str, paddle.nn.Layer]): Input layer type. - normalize_before (bool): Whether to use layer_norm before the first block. - concat_after (bool): Whether to concat attention layer's input and output. + idim (int): + Input dimension. + attention_dim (int): + Dimension of attention. + attention_heads (int): + The number of heads of multi head attention. + linear_units (int): + The number of units of position-wise feed forward. + num_blocks (int): + The number of decoder blocks. + dropout_rate (float): + Dropout rate. + positional_dropout_rate (float): + Dropout rate after adding positional encoding. + attention_dropout_rate (float): + Dropout rate in attention. + input_layer (Union[str, paddle.nn.Layer]): + Input layer type. + normalize_before (bool): + Whether to use layer_norm before the first block. + concat_after (bool): + Whether to concat attention layer's input and output. if True, additional linear will be applied. i.e. x -> x + linear(concat(x, att(x))) if False, no additional linear will be applied. i.e. x -> x + att(x) - positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear". - positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer. - macaron_style (bool): Whether to use macaron style for positionwise layer. - pos_enc_layer_type (str): Encoder positional encoding layer type. - selfattention_layer_type (str): Encoder attention layer type. 
- activation_type (str): Encoder activation function type. - use_cnn_module (bool): Whether to use convolution module. - zero_triu (bool): Whether to zero the upper triangular part of attention matrix. - cnn_module_kernel (int): Kernerl size of convolution module. - padding_idx (int): Padding idx for input_layer=embed. - stochastic_depth_rate (float): Maximum probability to skip the encoder layer. + positionwise_layer_type (str): + "linear", "conv1d", or "conv1d-linear". + positionwise_conv_kernel_size (int): + Kernel size of positionwise conv1d layer. + macaron_style (bool): + Whether to use macaron style for positionwise layer. + pos_enc_layer_type (str): + Encoder positional encoding layer type. + selfattention_layer_type (str): + Encoder attention layer type. + activation_type (str): + Encoder activation function type. + use_cnn_module (bool): + Whether to use convolution module. + zero_triu (bool): + Whether to zero the upper triangular part of attention matrix. + cnn_module_kernel (int): + Kernel size of convolution module. + padding_idx (int): + Padding idx for input_layer=embed. + stochastic_depth_rate (float): + Maximum probability to skip the encoder layer. """ @@ -320,12 +342,16 @@ def forward(self, xs: paddle.Tensor, masks: paddle.Tensor): """Encode input sequence. Args: - xs (paddle.Tensor): Input tensor (#batch, time, idim). - masks (paddle.Tensor): Mask tensor (#batch, time). + xs (paddle.Tensor): + Input tensor (#batch, time, idim). + masks (paddle.Tensor): + Mask tensor (#batch, time). Returns: - paddle.Tensor: Output tensor (#batch, time, attention_dim). - paddle.Tensor: Mask tensor (#batch, time). + paddle.Tensor: + Output tensor (#batch, time, attention_dim). + paddle.Tensor: + Mask tensor (#batch, time). """ xs = self.embed(xs) @@ -392,19 +418,27 @@ def inference( use_teacher_forcing: bool=True, ) -> List[paddle.Tensor]: ''' Args: - speech (paddle.Tensor): input speech (1, Tmax, D). - text (paddle.Tensor): input text (1, Tmax2). - masked_pos (paddle.Tensor): masked position of input speech (1, Tmax) - speech_mask (paddle.Tensor): mask of speech (1, 1, Tmax). - text_mask (paddle.Tensor): mask of text (1, 1, Tmax2). - speech_seg_pos (paddle.Tensor): n-th phone of each mel, 0<=n<=Tmax2 (1, Tmax). - text_seg_pos (paddle.Tensor): n-th phone of each phone, 0<=n<=Tmax2 (1, Tmax2). - span_bdy (List[int]): masked mel boundary of input speech (2,) - use_teacher_forcing (bool): whether to use teacher forcing + speech (paddle.Tensor): + input speech (1, Tmax, D). + text (paddle.Tensor): + input text (1, Tmax2). + masked_pos (paddle.Tensor): + masked position of input speech (1, Tmax). + speech_mask (paddle.Tensor): + mask of speech (1, 1, Tmax). + text_mask (paddle.Tensor): + mask of text (1, 1, Tmax2). + speech_seg_pos (paddle.Tensor): + n-th phone of each mel, 0<=n<=Tmax2 (1, Tmax). + text_seg_pos (paddle.Tensor): + n-th phone of each phone, 0<=n<=Tmax2 (1, Tmax2).
+ span_bdy (List[int]): + masked mel boundary of input speech (2,) + use_teacher_forcing (bool): + whether to use teacher forcing Returns: List[Tensor]: - eg: - [Tensor(shape=[1, 181, 80]), Tensor(shape=[80, 80]), Tensor(shape=[1, 67, 80])] + eg: [Tensor(shape=[1, 181, 80]), Tensor(shape=[80, 80]), Tensor(shape=[1, 67, 80])] ''' z_cache = None diff --git a/paddlespeech/t2s/models/vits/duration_predictor.py b/paddlespeech/t2s/models/vits/duration_predictor.py index 6197d569637..b0bb68d0f5d 100644 --- a/paddlespeech/t2s/models/vits/duration_predictor.py +++ b/paddlespeech/t2s/models/vits/duration_predictor.py @@ -48,12 +48,18 @@ def __init__( global_channels: int=-1, ): """Initialize StochasticDurationPredictor module. Args: - channels (int): Number of channels. - kernel_size (int): Kernel size. - dropout_rate (float): Dropout rate. - flows (int): Number of flows. - dds_conv_layers (int): Number of conv layers in DDS conv. - global_channels (int): Number of global conditioning channels. + channels (int): + Number of channels. + kernel_size (int): + Kernel size. + dropout_rate (float): + Dropout rate. + flows (int): + Number of flows. + dds_conv_layers (int): + Number of conv layers in DDS conv. + global_channels (int): + Number of global conditioning channels. """ super().__init__() @@ -108,14 +114,21 @@ def forward( noise_scale: float=1.0, ) -> paddle.Tensor: """Calculate forward propagation. Args: - x (Tensor): Input tensor (B, channels, T_text). - x_mask (Tensor): Mask tensor (B, 1, T_text). - w (Optional[Tensor]): Duration tensor (B, 1, T_text). - g (Optional[Tensor]): Global conditioning tensor (B, channels, 1) - inverse (bool): Whether to inverse the flow. - noise_scale (float): Noise scale value. + x (Tensor): + Input tensor (B, channels, T_text). + x_mask (Tensor): + Mask tensor (B, 1, T_text). + w (Optional[Tensor]): + Duration tensor (B, 1, T_text). + g (Optional[Tensor]): + Global conditioning tensor (B, channels, 1) + inverse (bool): + Whether to inverse the flow. + noise_scale (float): + Noise scale value. Returns: - Tensor: If not inverse, negative log-likelihood (NLL) tensor (B,). + Tensor: + If not inverse, negative log-likelihood (NLL) tensor (B,). If inverse, log-duration tensor (B, 1, T_text). """ # stop gradient diff --git a/paddlespeech/t2s/models/vits/flow.py b/paddlespeech/t2s/models/vits/flow.py index 3c8f89356e9..7593eb72733 100644 --- a/paddlespeech/t2s/models/vits/flow.py +++ b/paddlespeech/t2s/models/vits/flow.py @@ -34,11 +34,15 @@ def forward(self, x: paddle.Tensor, *args, inverse: bool=False, **kwargs ) -> Union[paddle.Tensor, Tuple[paddle.Tensor, paddle.Tensor]]: """Calculate forward propagation. Args: - x (Tensor): Input tensor (B, channels, T). - inverse (bool): Whether to inverse the flow. + x (Tensor): + Input tensor (B, channels, T). + inverse (bool): + Whether to inverse the flow. Returns: - Tensor: Flipped tensor (B, channels, T). - Tensor: Log-determinant tensor for NLL (B,) if not inverse. + Tensor: + Flipped tensor (B, channels, T). + Tensor: + Log-determinant tensor for NLL (B,) if not inverse. """ x = paddle.flip(x, [1]) if not inverse: @@ -60,13 +64,19 @@ def forward(self, ) -> Union[paddle.Tensor, Tuple[paddle.Tensor, paddle.Tensor]]: """Calculate forward propagation. Args: - x (Tensor): Input tensor (B, channels, T). - x_mask (Tensor): Mask tensor (B, 1, T). - inverse (bool): Whether to inverse the flow. - eps (float): Epsilon for log. + x (Tensor): + Input tensor (B, channels, T). + x_mask (Tensor): + Mask tensor (B, 1, T). 
+ inverse (bool): + Whether to inverse the flow. + eps (float): + Epsilon for log. Returns: - Tensor: Output tensor (B, channels, T). - Tensor: Log-determinant tensor for NLL (B,) if not inverse. + Tensor: + Output tensor (B, channels, T). + Tensor: + Log-determinant tensor for NLL (B,) if not inverse. """ if not inverse: y = paddle.log(paddle.clip(x, min=eps)) * x_mask @@ -83,7 +93,8 @@ class ElementwiseAffineFlow(nn.Layer): def __init__(self, channels: int): """Initialize ElementwiseAffineFlow module. Args: - channels (int): Number of channels. + channels (int): + Number of channels. """ super().__init__() self.channels = channels @@ -107,12 +118,17 @@ def forward(self, ) -> Union[paddle.Tensor, Tuple[paddle.Tensor, paddle.Tensor]]: """Calculate forward propagation. Args: - x (Tensor): Input tensor (B, channels, T). - x_mask (Tensor): Mask tensor (B, 1, T). - inverse (bool): Whether to inverse the flow. + x (Tensor): + Input tensor (B, channels, T). + x_mask (Tensor): + Mask tensor (B, 1, T). + inverse (bool): + Whether to inverse the flow. Returns: - Tensor: Output tensor (B, channels, T). - Tensor: Log-determinant tensor for NLL (B,) if not inverse. + Tensor: + Output tensor (B, channels, T). + Tensor: + Log-determinant tensor for NLL (B,) if not inverse. """ if not inverse: y = self.m + paddle.exp(self.logs) * x @@ -157,11 +173,16 @@ def __init__( eps: float=1e-5, ): """Initialize DilatedDepthSeparableConv module. Args: - channels (int): Number of channels. - kernel_size (int): Kernel size. - layers (int): Number of layers. - dropout_rate (float): Dropout rate. - eps (float): Epsilon for layer norm. + channels (int): + Number of channels. + kernel_size (int): + Kernel size. + layers (int): + Number of layers. + dropout_rate (float): + Dropout rate. + eps (float): + Epsilon for layer norm. """ super().__init__() @@ -198,11 +219,15 @@ def forward(self, g: Optional[paddle.Tensor]=None) -> paddle.Tensor: """Calculate forward propagation. Args: - x (Tensor): Input tensor (B, in_channels, T). - x_mask (Tensor): Mask tensor (B, 1, T). - g (Optional[Tensor]): Global conditioning tensor (B, global_channels, 1). + x (Tensor): + Input tensor (B, in_channels, T). + x_mask (Tensor): + Mask tensor (B, 1, T). + g (Optional[Tensor]): + Global conditioning tensor (B, global_channels, 1). Returns: - Tensor: Output tensor (B, channels, T). + Tensor: + Output tensor (B, channels, T). """ if g is not None: x = x + g @@ -225,12 +250,18 @@ def __init__( tail_bound: float=5.0, ): """Initialize ConvFlow module. Args: - in_channels (int): Number of input channels. - hidden_channels (int): Number of hidden channels. - kernel_size (int): Kernel size. - layers (int): Number of layers. - bins (int): Number of bins. - tail_bound (float): Tail bound value. + in_channels (int): + Number of input channels. + hidden_channels (int): + Number of hidden channels. + kernel_size (int): + Kernel size. + layers (int): + Number of layers. + bins (int): + Number of bins. + tail_bound (float): + Tail bound value. """ super().__init__() self.half_channels = in_channels // 2 @@ -275,13 +306,19 @@ def forward( ) -> Union[paddle.Tensor, Tuple[paddle.Tensor, paddle.Tensor]]: """Calculate forward propagation. Args: - x (Tensor): Input tensor (B, channels, T). - x_mask (Tensor): Mask tensor (B, 1, T). - g (Optional[Tensor]): Global conditioning tensor (B, channels, 1). - inverse (bool): Whether to inverse the flow. + x (Tensor): + Input tensor (B, channels, T). + x_mask (Tensor): + Mask tensor (B, 1, T). 
+ g (Optional[Tensor]): + Global conditioning tensor (B, channels, 1). + inverse (bool): + Whether to inverse the flow. Returns: - Tensor: Output tensor (B, channels, T). - Tensor: Log-determinant tensor for NLL (B,) if not inverse. + Tensor: + Output tensor (B, channels, T). + Tensor: + Log-determinant tensor for NLL (B,) if not inverse. """ xa, xb = x.split(2, 1) h = self.input_conv(xa) diff --git a/paddlespeech/t2s/models/vits/generator.py b/paddlespeech/t2s/models/vits/generator.py index 359b662586c..7ecc5161972 100644 --- a/paddlespeech/t2s/models/vits/generator.py +++ b/paddlespeech/t2s/models/vits/generator.py @@ -97,81 +97,104 @@ def __init__( stochastic_duration_predictor_dds_conv_layers: int=3, ): """Initialize VITS generator module. Args: - vocabs (int): Input vocabulary size. - aux_channels (int): Number of acoustic feature channels. - hidden_channels (int): Number of hidden channels. - spks (Optional[int]): Number of speakers. If set to > 1, assume that the + vocabs (int): + Input vocabulary size. + aux_channels (int): + Number of acoustic feature channels. + hidden_channels (int): + Number of hidden channels. + spks (Optional[int]): + Number of speakers. If set to > 1, assume that the sids will be provided as the input and use sid embedding layer. - langs (Optional[int]): Number of languages. If set to > 1, assume that the + langs (Optional[int]): + Number of languages. If set to > 1, assume that the lids will be provided as the input and use sid embedding layer. - spk_embed_dim (Optional[int]): Speaker embedding dimension. If set to > 0, + spk_embed_dim (Optional[int]): + Speaker embedding dimension. If set to > 0, assume that spembs will be provided as the input. - global_channels (int): Number of global conditioning channels. - segment_size (int): Segment size for decoder. - text_encoder_attention_heads (int): Number of heads in conformer block - of text encoder. - text_encoder_ffn_expand (int): Expansion ratio of FFN in conformer block - of text encoder. - text_encoder_blocks (int): Number of conformer blocks in text encoder. - text_encoder_positionwise_layer_type (str): Position-wise layer type in - conformer block of text encoder. - text_encoder_positionwise_conv_kernel_size (int): Position-wise convolution - kernel size in conformer block of text encoder. Only used when the - above layer type is conv1d or conv1d-linear. - text_encoder_positional_encoding_layer_type (str): Positional encoding layer - type in conformer block of text encoder. - text_encoder_self_attention_layer_type (str): Self-attention layer type in - conformer block of text encoder. - text_encoder_activation_type (str): Activation function type in conformer - block of text encoder. - text_encoder_normalize_before (bool): Whether to apply layer norm before - self-attention in conformer block of text encoder. - text_encoder_dropout_rate (float): Dropout rate in conformer block of - text encoder. - text_encoder_positional_dropout_rate (float): Dropout rate for positional - encoding in conformer block of text encoder. - text_encoder_attention_dropout_rate (float): Dropout rate for attention in - conformer block of text encoder. - text_encoder_conformer_kernel_size (int): Conformer conv kernel size. It - will be used when only use_conformer_conv_in_text_encoder = True. - use_macaron_style_in_text_encoder (bool): Whether to use macaron style FFN - in conformer block of text encoder. - use_conformer_conv_in_text_encoder (bool): Whether to use covolution in - conformer block of text encoder. 
- decoder_kernel_size (int): Decoder kernel size. - decoder_channels (int): Number of decoder initial channels. - decoder_upsample_scales (List[int]): List of upsampling scales in decoder. - decoder_upsample_kernel_sizes (List[int]): List of kernel size for - upsampling layers in decoder. - decoder_resblock_kernel_sizes (List[int]): List of kernel size for resblocks - in decoder. - decoder_resblock_dilations (List[List[int]]): List of list of dilations for - resblocks in decoder. - use_weight_norm_in_decoder (bool): Whether to apply weight normalization in - decoder. - posterior_encoder_kernel_size (int): Posterior encoder kernel size. - posterior_encoder_layers (int): Number of layers of posterior encoder. - posterior_encoder_stacks (int): Number of stacks of posterior encoder. - posterior_encoder_base_dilation (int): Base dilation of posterior encoder. - posterior_encoder_dropout_rate (float): Dropout rate for posterior encoder. - use_weight_norm_in_posterior_encoder (bool): Whether to apply weight - normalization in posterior encoder. - flow_flows (int): Number of flows in flow. - flow_kernel_size (int): Kernel size in flow. - flow_base_dilation (int): Base dilation in flow. - flow_layers (int): Number of layers in flow. - flow_dropout_rate (float): Dropout rate in flow - use_weight_norm_in_flow (bool): Whether to apply weight normalization in - flow. - use_only_mean_in_flow (bool): Whether to use only mean in flow. - stochastic_duration_predictor_kernel_size (int): Kernel size in stochastic - duration predictor. - stochastic_duration_predictor_dropout_rate (float): Dropout rate in - stochastic duration predictor. - stochastic_duration_predictor_flows (int): Number of flows in stochastic - duration predictor. - stochastic_duration_predictor_dds_conv_layers (int): Number of DDS conv - layers in stochastic duration predictor. + global_channels (int): + Number of global conditioning channels. + segment_size (int): + Segment size for decoder. + text_encoder_attention_heads (int): + Number of heads in conformer block of text encoder. + text_encoder_ffn_expand (int): + Expansion ratio of FFN in conformer block of text encoder. + text_encoder_blocks (int): + Number of conformer blocks in text encoder. + text_encoder_positionwise_layer_type (str): + Position-wise layer type in conformer block of text encoder. + text_encoder_positionwise_conv_kernel_size (int): + Position-wise convolution kernel size in conformer block of text encoder. Only used when the above layer type is conv1d or conv1d-linear. + text_encoder_positional_encoding_layer_type (str): + Positional encoding layer type in conformer block of text encoder. + text_encoder_self_attention_layer_type (str): + Self-attention layer type in conformer block of text encoder. + text_encoder_activation_type (str): + Activation function type in conformer block of text encoder. + text_encoder_normalize_before (bool): + Whether to apply layer norm before self-attention in conformer block of text encoder. + text_encoder_dropout_rate (float): + Dropout rate in conformer block of text encoder. + text_encoder_positional_dropout_rate (float): + Dropout rate for positional encoding in conformer block of text encoder. + text_encoder_attention_dropout_rate (float): + Dropout rate for attention in conformer block of text encoder. + text_encoder_conformer_kernel_size (int): + Conformer conv kernel size. It will be used only when use_conformer_conv_in_text_encoder = True.
+ use_macaron_style_in_text_encoder (bool): + Whether to use macaron style FFN in conformer block of text encoder. + use_conformer_conv_in_text_encoder (bool): + Whether to use convolution in conformer block of text encoder. + decoder_kernel_size (int): + Decoder kernel size. + decoder_channels (int): + Number of decoder initial channels. + decoder_upsample_scales (List[int]): + List of upsampling scales in decoder. + decoder_upsample_kernel_sizes (List[int]): + List of kernel size for upsampling layers in decoder. + decoder_resblock_kernel_sizes (List[int]): + List of kernel size for resblocks in decoder. + decoder_resblock_dilations (List[List[int]]): + List of list of dilations for resblocks in decoder. + use_weight_norm_in_decoder (bool): + Whether to apply weight normalization in decoder. + posterior_encoder_kernel_size (int): + Posterior encoder kernel size. + posterior_encoder_layers (int): + Number of layers of posterior encoder. + posterior_encoder_stacks (int): + Number of stacks of posterior encoder. + posterior_encoder_base_dilation (int): + Base dilation of posterior encoder. + posterior_encoder_dropout_rate (float): + Dropout rate for posterior encoder. + use_weight_norm_in_posterior_encoder (bool): + Whether to apply weight normalization in posterior encoder. + flow_flows (int): + Number of flows in flow. + flow_kernel_size (int): + Kernel size in flow. + flow_base_dilation (int): + Base dilation in flow. + flow_layers (int): + Number of layers in flow. + flow_dropout_rate (float): + Dropout rate in flow. + use_weight_norm_in_flow (bool): + Whether to apply weight normalization in flow. + use_only_mean_in_flow (bool): + Whether to use only mean in flow. + stochastic_duration_predictor_kernel_size (int): + Kernel size in stochastic duration predictor. + stochastic_duration_predictor_dropout_rate (float): + Dropout rate in stochastic duration predictor. + stochastic_duration_predictor_flows (int): + Number of flows in stochastic duration predictor. + stochastic_duration_predictor_dds_conv_layers (int): + Number of DDS conv layers in stochastic duration predictor. """ super().__init__() self.segment_size = segment_size @@ -272,27 +295,40 @@ def forward( paddle.Tensor, paddle.Tensor, ], ]: """Calculate forward propagation. Args: - text (Tensor): Text index tensor (B, T_text). - text_lengths (Tensor): Text length tensor (B,). - feats (Tensor): Feature tensor (B, aux_channels, T_feats). - feats_lengths (Tensor): Feature length tensor (B,). - sids (Optional[Tensor]): Speaker index tensor (B,) or (B, 1). - spembs (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim). - lids (Optional[Tensor]): Language index tensor (B,) or (B, 1). + text (Tensor): + Text index tensor (B, T_text). + text_lengths (Tensor): + Text length tensor (B,). + feats (Tensor): + Feature tensor (B, aux_channels, T_feats). + feats_lengths (Tensor): + Feature length tensor (B,). + sids (Optional[Tensor]): + Speaker index tensor (B,) or (B, 1). + spembs (Optional[Tensor]): + Speaker embedding tensor (B, spk_embed_dim). + lids (Optional[Tensor]): + Language index tensor (B,) or (B, 1). Returns: - Tensor: Waveform tensor (B, 1, segment_size * upsample_factor). - Tensor: Duration negative log-likelihood (NLL) tensor (B,). - Tensor: Monotonic attention weight tensor (B, 1, T_feats, T_text). - Tensor: Segments start index tensor (B,). - Tensor: Text mask tensor (B, 1, T_text). - Tensor: Feature mask tensor (B, 1, T_feats).
- tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]: - - Tensor: Posterior encoder hidden representation (B, H, T_feats). - - Tensor: Flow hidden representation (B, H, T_feats). - - Tensor: Expanded text encoder projected mean (B, H, T_feats). - - Tensor: Expanded text encoder projected scale (B, H, T_feats). - - Tensor: Posterior encoder projected mean (B, H, T_feats). - - Tensor: Posterior encoder projected scale (B, H, T_feats). + Tensor: + Waveform tensor (B, 1, segment_size * upsample_factor). + Tensor: + Duration negative log-likelihood (NLL) tensor (B,). + Tensor: + Monotonic attention weight tensor (B, 1, T_feats, T_text). + Tensor: + Segments start index tensor (B,). + Tensor: + Text mask tensor (B, 1, T_text). + Tensor: + Feature mask tensor (B, 1, T_feats). + tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]: + - Tensor: Posterior encoder hidden representation (B, H, T_feats). + - Tensor: Flow hidden representation (B, H, T_feats). + - Tensor: Expanded text encoder projected mean (B, H, T_feats). + - Tensor: Expanded text encoder projected scale (B, H, T_feats). + - Tensor: Posterior encoder projected mean (B, H, T_feats). + - Tensor: Posterior encoder projected scale (B, H, T_feats). """ # forward text encoder x, m_p, logs_p, x_mask = self.text_encoder(text, text_lengths) @@ -402,24 +438,40 @@ def inference( ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: """Run inference. Args: - text (Tensor): Input text index tensor (B, T_text,). - text_lengths (Tensor): Text length tensor (B,). - feats (Tensor): Feature tensor (B, aux_channels, T_feats,). - feats_lengths (Tensor): Feature length tensor (B,). - sids (Optional[Tensor]): Speaker index tensor (B,) or (B, 1). - spembs (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim). - lids (Optional[Tensor]): Language index tensor (B,) or (B, 1). - dur (Optional[Tensor]): Ground-truth duration (B, T_text,). If provided, + text (Tensor): + Input text index tensor (B, T_text,). + text_lengths (Tensor): + Text length tensor (B,). + feats (Tensor): + Feature tensor (B, aux_channels, T_feats,). + feats_lengths (Tensor): + Feature length tensor (B,). + sids (Optional[Tensor]): + Speaker index tensor (B,) or (B, 1). + spembs (Optional[Tensor]): + Speaker embedding tensor (B, spk_embed_dim). + lids (Optional[Tensor]): + Language index tensor (B,) or (B, 1). + dur (Optional[Tensor]): + Ground-truth duration (B, T_text,). If provided, skip the prediction of durations (i.e., teacher forcing). - noise_scale (float): Noise scale parameter for flow. - noise_scale_dur (float): Noise scale parameter for duration predictor. - alpha (float): Alpha parameter to control the speed of generated speech. - max_len (Optional[int]): Maximum length of acoustic feature sequence. - use_teacher_forcing (bool): Whether to use teacher forcing. + noise_scale (float): + Noise scale parameter for flow. + noise_scale_dur (float): + Noise scale parameter for duration predictor. + alpha (float): + Alpha parameter to control the speed of generated speech. + max_len (Optional[int]): + Maximum length of acoustic feature sequence. + use_teacher_forcing (bool): + Whether to use teacher forcing. Returns: - Tensor: Generated waveform tensor (B, T_wav). - Tensor: Monotonic attention weight tensor (B, T_feats, T_text). - Tensor: Duration tensor (B, T_text). + Tensor: + Generated waveform tensor (B, T_wav). + Tensor: + Monotonic attention weight tensor (B, T_feats, T_text). + Tensor: + Duration tensor (B, T_text). 
""" # encoder x, m_p, logs_p, x_mask = self.text_encoder(text, text_lengths) @@ -533,15 +585,23 @@ def voice_conversion( lids: Optional[paddle.Tensor]=None, ) -> paddle.Tensor: """Run voice conversion. Args: - feats (Tensor): Feature tensor (B, aux_channels, T_feats,). - feats_lengths (Tensor): Feature length tensor (B,). - sids_src (Optional[Tensor]): Speaker index tensor of source feature (B,) or (B, 1). - sids_tgt (Optional[Tensor]): Speaker index tensor of target feature (B,) or (B, 1). - spembs_src (Optional[Tensor]): Speaker embedding tensor of source feature (B, spk_embed_dim). - spembs_tgt (Optional[Tensor]): Speaker embedding tensor of target feature (B, spk_embed_dim). - lids (Optional[Tensor]): Language index tensor (B,) or (B, 1). + feats (Tensor): + Feature tensor (B, aux_channels, T_feats,). + feats_lengths (Tensor): + Feature length tensor (B,). + sids_src (Optional[Tensor]): + Speaker index tensor of source feature (B,) or (B, 1). + sids_tgt (Optional[Tensor]): + Speaker index tensor of target feature (B,) or (B, 1). + spembs_src (Optional[Tensor]): + Speaker embedding tensor of source feature (B, spk_embed_dim). + spembs_tgt (Optional[Tensor]): + Speaker embedding tensor of target feature (B, spk_embed_dim). + lids (Optional[Tensor]): + Language index tensor (B,) or (B, 1). Returns: - Tensor: Generated waveform tensor (B, T_wav). + Tensor: + Generated waveform tensor (B, T_wav). """ # encoder g_src = None @@ -602,10 +662,13 @@ def _generate_path(self, dur: paddle.Tensor, mask: paddle.Tensor) -> paddle.Tensor: """Generate path a.k.a. monotonic attention. Args: - dur (Tensor): Duration tensor (B, 1, T_text). - mask (Tensor): Attention mask tensor (B, 1, T_feats, T_text). + dur (Tensor): + Duration tensor (B, 1, T_text). + mask (Tensor): + Attention mask tensor (B, 1, T_feats, T_text). Returns: - Tensor: Path tensor (B, 1, T_feats, T_text). + Tensor: + Path tensor (B, 1, T_feats, T_text). """ b, _, t_y, t_x = paddle.shape(mask) cum_dur = paddle.cumsum(dur, -1) diff --git a/paddlespeech/t2s/models/vits/posterior_encoder.py b/paddlespeech/t2s/models/vits/posterior_encoder.py index 8532375576d..5e3d6b9ce34 100644 --- a/paddlespeech/t2s/models/vits/posterior_encoder.py +++ b/paddlespeech/t2s/models/vits/posterior_encoder.py @@ -52,17 +52,28 @@ def __init__( """Initilialize PosteriorEncoder module. Args: - in_channels (int): Number of input channels. - out_channels (int): Number of output channels. - hidden_channels (int): Number of hidden channels. - kernel_size (int): Kernel size in WaveNet. - layers (int): Number of layers of WaveNet. - stacks (int): Number of repeat stacking of WaveNet. - base_dilation (int): Base dilation factor. - global_channels (int): Number of global conditioning channels. - dropout_rate (float): Dropout rate. - bias (bool): Whether to use bias parameters in conv. - use_weight_norm (bool): Whether to apply weight norm. + in_channels (int): + Number of input channels. + out_channels (int): + Number of output channels. + hidden_channels (int): + Number of hidden channels. + kernel_size (int): + Kernel size in WaveNet. + layers (int): + Number of layers of WaveNet. + stacks (int): + Number of repeat stacking of WaveNet. + base_dilation (int): + Base dilation factor. + global_channels (int): + Number of global conditioning channels. + dropout_rate (float): + Dropout rate. + bias (bool): + Whether to use bias parameters in conv. + use_weight_norm (bool): + Whether to apply weight norm. 
""" super().__init__() @@ -99,15 +110,22 @@ def forward( """Calculate forward propagation. Args: - x (Tensor): Input tensor (B, in_channels, T_feats). - x_lengths (Tensor): Length tensor (B,). - g (Optional[Tensor]): Global conditioning tensor (B, global_channels, 1). + x (Tensor): + Input tensor (B, in_channels, T_feats). + x_lengths (Tensor): + Length tensor (B,). + g (Optional[Tensor]): + Global conditioning tensor (B, global_channels, 1). Returns: - Tensor: Encoded hidden representation tensor (B, out_channels, T_feats). - Tensor: Projected mean tensor (B, out_channels, T_feats). - Tensor: Projected scale tensor (B, out_channels, T_feats). - Tensor: Mask tensor for input tensor (B, 1, T_feats). + Tensor: + Encoded hidden representation tensor (B, out_channels, T_feats). + Tensor: + Projected mean tensor (B, out_channels, T_feats). + Tensor: + Projected scale tensor (B, out_channels, T_feats). + Tensor: + Mask tensor for input tensor (B, 1, T_feats). """ x_mask = make_non_pad_mask(x_lengths).unsqueeze(1) diff --git a/paddlespeech/t2s/models/vits/residual_coupling.py b/paddlespeech/t2s/models/vits/residual_coupling.py index c18beedd038..afa6d1fa73b 100644 --- a/paddlespeech/t2s/models/vits/residual_coupling.py +++ b/paddlespeech/t2s/models/vits/residual_coupling.py @@ -55,18 +55,30 @@ def __init__( """Initilize ResidualAffineCouplingBlock module. Args: - in_channels (int): Number of input channels. - hidden_channels (int): Number of hidden channels. - flows (int): Number of flows. - kernel_size (int): Kernel size for WaveNet. - base_dilation (int): Base dilation factor for WaveNet. - layers (int): Number of layers of WaveNet. - stacks (int): Number of stacks of WaveNet. - global_channels (int): Number of global channels. - dropout_rate (float): Dropout rate. - use_weight_norm (bool): Whether to use weight normalization in WaveNet. - bias (bool): Whether to use bias paramters in WaveNet. - use_only_mean (bool): Whether to estimate only mean. + in_channels (int): + Number of input channels. + hidden_channels (int): + Number of hidden channels. + flows (int): + Number of flows. + kernel_size (int): + Kernel size for WaveNet. + base_dilation (int): + Base dilation factor for WaveNet. + layers (int): + Number of layers of WaveNet. + stacks (int): + Number of stacks of WaveNet. + global_channels (int): + Number of global channels. + dropout_rate (float): + Dropout rate. + use_weight_norm (bool): + Whether to use weight normalization in WaveNet. + bias (bool): + Whether to use bias paramters in WaveNet. + use_only_mean (bool): + Whether to estimate only mean. """ super().__init__() @@ -97,10 +109,14 @@ def forward( """Calculate forward propagation. Args: - x (Tensor): Input tensor (B, in_channels, T). - x_mask (Tensor): Length tensor (B, 1, T). - g (Optional[Tensor]): Global conditioning tensor (B, global_channels, 1). - inverse (bool): Whether to inverse the flow. + x (Tensor): + Input tensor (B, in_channels, T). + x_mask (Tensor): + Length tensor (B, 1, T). + g (Optional[Tensor]): + Global conditioning tensor (B, global_channels, 1). + inverse (bool): + Whether to inverse the flow. Returns: Tensor: Output tensor (B, in_channels, T). @@ -134,17 +150,28 @@ def __init__( """Initialzie ResidualAffineCouplingLayer module. Args: - in_channels (int): Number of input channels. - hidden_channels (int): Number of hidden channels. - kernel_size (int): Kernel size for WaveNet. - base_dilation (int): Base dilation factor for WaveNet. - layers (int): Number of layers of WaveNet. 
- stacks (int): Number of stacks of WaveNet. - global_channels (int): Number of global channels. - dropout_rate (float): Dropout rate. - use_weight_norm (bool): Whether to use weight normalization in WaveNet. - bias (bool): Whether to use bias paramters in WaveNet. - use_only_mean (bool): Whether to estimate only mean. + in_channels (int): + Number of input channels. + hidden_channels (int): + Number of hidden channels. + kernel_size (int): + Kernel size for WaveNet. + base_dilation (int): + Base dilation factor for WaveNet. + layers (int): + Number of layers of WaveNet. + stacks (int): + Number of stacks of WaveNet. + global_channels (int): + Number of global channels. + dropout_rate (float): + Dropout rate. + use_weight_norm (bool): + Whether to use weight normalization in WaveNet. + bias (bool): + Whether to use bias parameters in WaveNet. + use_only_mean (bool): + Whether to estimate only mean. """ assert in_channels % 2 == 0, "in_channels should be divisible by 2" @@ -211,14 +238,20 @@ def forward( """Calculate forward propagation. Args: - x (Tensor): Input tensor (B, in_channels, T). - x_lengths (Tensor): Length tensor (B,). - g (Optional[Tensor]): Global conditioning tensor (B, global_channels, 1). - inverse (bool): Whether to inverse the flow. + x (Tensor): + Input tensor (B, in_channels, T). + x_lengths (Tensor): + Length tensor (B,). + g (Optional[Tensor]): + Global conditioning tensor (B, global_channels, 1). + inverse (bool): + Whether to inverse the flow. Returns: - Tensor: Output tensor (B, in_channels, T). - Tensor: Log-determinant tensor for NLL (B,) if not inverse. + Tensor: + Output tensor (B, in_channels, T). + Tensor: + Log-determinant tensor for NLL (B,) if not inverse. """ xa, xb = paddle.split(x, 2, axis=1) diff --git a/paddlespeech/t2s/models/vits/text_encoder.py b/paddlespeech/t2s/models/vits/text_encoder.py index 3afc7831ad1..799e0c75964 100644 --- a/paddlespeech/t2s/models/vits/text_encoder.py +++ b/paddlespeech/t2s/models/vits/text_encoder.py @@ -62,23 +62,40 @@ def __init__( """Initialize TextEncoder module. Args: - vocabs (int): Vocabulary size. - attention_dim (int): Attention dimension. - attention_heads (int): Number of attention heads. - linear_units (int): Number of linear units of positionwise layers. - blocks (int): Number of encoder blocks. - positionwise_layer_type (str): Positionwise layer type. - positionwise_conv_kernel_size (int): Positionwise layer's kernel size. - positional_encoding_layer_type (str): Positional encoding layer type. - self_attention_layer_type (str): Self-attention layer type. - activation_type (str): Activation function type. - normalize_before (bool): Whether to apply LayerNorm before attention. - use_macaron_style (bool): Whether to use macaron style components. - use_conformer_conv (bool): Whether to use conformer conv layers. - conformer_kernel_size (int): Conformer's conv kernel size. - dropout_rate (float): Dropout rate. - positional_dropout_rate (float): Dropout rate for positional encoding. - attention_dropout_rate (float): Dropout rate for attention. + vocabs (int): + Vocabulary size. + attention_dim (int): + Attention dimension. + attention_heads (int): + Number of attention heads. + linear_units (int): + Number of linear units of positionwise layers. + blocks (int): + Number of encoder blocks. + positionwise_layer_type (str): + Positionwise layer type. + positionwise_conv_kernel_size (int): + Positionwise layer's kernel size. + positional_encoding_layer_type (str): + Positional encoding layer type.
+ self_attention_layer_type (str): + Self-attention layer type. + activation_type (str): + Activation function type. + normalize_before (bool): + Whether to apply LayerNorm before attention. + use_macaron_style (bool): + Whether to use macaron style components. + use_conformer_conv (bool): + Whether to use conformer conv layers. + conformer_kernel_size (int): + Conformer's conv kernel size. + dropout_rate (float): + Dropout rate. + positional_dropout_rate (float): + Dropout rate for positional encoding. + attention_dropout_rate (float): + Dropout rate for attention. """ super().__init__() @@ -121,14 +138,20 @@ def forward( """Calculate forward propagation. Args: - x (Tensor): Input index tensor (B, T_text). - x_lengths (Tensor): Length tensor (B,). + x (Tensor): + Input index tensor (B, T_text). + x_lengths (Tensor): + Length tensor (B,). Returns: - Tensor: Encoded hidden representation (B, attention_dim, T_text). - Tensor: Projected mean tensor (B, attention_dim, T_text). - Tensor: Projected scale tensor (B, attention_dim, T_text). - Tensor: Mask tensor for input tensor (B, 1, T_text). + Tensor: + Encoded hidden representation (B, attention_dim, T_text). + Tensor: + Projected mean tensor (B, attention_dim, T_text). + Tensor: + Projected scale tensor (B, attention_dim, T_text). + Tensor: + Mask tensor for input tensor (B, 1, T_text). """ x = self.emb(x) * math.sqrt(self.attention_dim) diff --git a/paddlespeech/t2s/models/vits/vits.py b/paddlespeech/t2s/models/vits/vits.py index 983bf0a36f6..0ff3a546d0e 100644 --- a/paddlespeech/t2s/models/vits/vits.py +++ b/paddlespeech/t2s/models/vits/vits.py @@ -156,17 +156,25 @@ def __init__( init_type: str="xavier_uniform", ): """Initialize VITS module. Args: - idim (int): Input vocabrary size. - odim (int): Acoustic feature dimension. The actual output channels will + idim (int): + Input vocabulary size. + odim (int): + Acoustic feature dimension. The actual output channels will be 1 since VITS is the end-to-end text-to-wave model but for the compatibility odim is used to indicate the acoustic feature dimension. - sampling_rate (int): Sampling rate, not used for the training but it will + sampling_rate (int): + Sampling rate, not used for the training but it will be referred in saving waveform during the inference. - generator_type (str): Generator type. - generator_params (Dict[str, Any]): Parameter dict for generator. - discriminator_type (str): Discriminator type. - discriminator_params (Dict[str, Any]): Parameter dict for discriminator. - cache_generator_outputs (bool): Whether to cache generator outputs. + generator_type (str): + Generator type. + generator_params (Dict[str, Any]): + Parameter dict for generator. + discriminator_type (str): + Discriminator type. + discriminator_params (Dict[str, Any]): + Parameter dict for discriminator. + cache_generator_outputs (bool): + Whether to cache generator outputs. """ assert check_argument_types() super().__init__() @@ -218,14 +226,22 @@ def forward( forward_generator: bool=True, ) -> Dict[str, Any]: """Perform generator forward. Args: - text (Tensor): Text index tensor (B, T_text). - text_lengths (Tensor): Text length tensor (B,). - feats (Tensor): Feature tensor (B, T_feats, aux_channels). - feats_lengths (Tensor): Feature length tensor (B,). - sids (Optional[Tensor]): Speaker index tensor (B,) or (B, 1). - spembs (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim). - lids (Optional[Tensor]): Language index tensor (B,) or (B, 1). - forward_generator (bool): Whether to forward generator.
+ text (Tensor): + Text index tensor (B, T_text). + text_lengths (Tensor): + Text length tensor (B,). + feats (Tensor): + Feature tensor (B, T_feats, aux_channels). + feats_lengths (Tensor): + Feature length tensor (B,). + sids (Optional[Tensor]): + Speaker index tensor (B,) or (B, 1). + spembs (Optional[Tensor]): + Speaker embedding tensor (B, spk_embed_dim). + lids (Optional[Tensor]): + Language index tensor (B,) or (B, 1). + forward_generator (bool): + Whether to forward generator. Returns: """ @@ -259,13 +275,20 @@ def _forward_generator( lids: Optional[paddle.Tensor]=None, ) -> Dict[str, Any]: """Perform generator forward. Args: - text (Tensor): Text index tensor (B, T_text). - text_lengths (Tensor): Text length tensor (B,). - feats (Tensor): Feature tensor (B, T_feats, aux_channels). - feats_lengths (Tensor): Feature length tensor (B,). - sids (Optional[Tensor]): Speaker index tensor (B,) or (B, 1). - spembs (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim). - lids (Optional[Tensor]): Language index tensor (B,) or (B, 1). + text (Tensor): + Text index tensor (B, T_text). + text_lengths (Tensor): + Text length tensor (B,). + feats (Tensor): + Feature tensor (B, T_feats, aux_channels). + feats_lengths (Tensor): + Feature length tensor (B,). + sids (Optional[Tensor]): + Speaker index tensor (B,) or (B, 1). + spembs (Optional[Tensor]): + Speaker embedding tensor (B, spk_embed_dim). + lids (Optional[Tensor]): + Language index tensor (B,) or (B, 1). Returns: """ @@ -304,13 +327,20 @@ def _forward_discrminator( lids: Optional[paddle.Tensor]=None, ) -> Dict[str, Any]: """Perform discriminator forward. Args: - text (Tensor): Text index tensor (B, T_text). - text_lengths (Tensor): Text length tensor (B,). - feats (Tensor): Feature tensor (B, T_feats, aux_channels). - feats_lengths (Tensor): Feature length tensor (B,). - sids (Optional[Tensor]): Speaker index tensor (B,) or (B, 1). - spembs (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim). - lids (Optional[Tensor]): Language index tensor (B,) or (B, 1). + text (Tensor): + Text index tensor (B, T_text). + text_lengths (Tensor): + Text length tensor (B,). + feats (Tensor): + Feature tensor (B, T_feats, aux_channels). + feats_lengths (Tensor): + Feature length tensor (B,). + sids (Optional[Tensor]): + Speaker index tensor (B,) or (B, 1). + spembs (Optional[Tensor]): + Speaker embedding tensor (B, spk_embed_dim). + lids (Optional[Tensor]): + Language index tensor (B,) or (B, 1). Returns: """ @@ -353,22 +383,36 @@ def inference( use_teacher_forcing: bool=False, ) -> Dict[str, paddle.Tensor]: """Run inference. Args: - text (Tensor): Input text index tensor (T_text,). - feats (Tensor): Feature tensor (T_feats, aux_channels). - sids (Tensor): Speaker index tensor (1,). - spembs (Optional[Tensor]): Speaker embedding tensor (spk_embed_dim,). - lids (Tensor): Language index tensor (1,). - durations (Tensor): Ground-truth duration tensor (T_text,). - noise_scale (float): Noise scale value for flow. - noise_scale_dur (float): Noise scale value for duration predictor. - alpha (float): Alpha parameter to control the speed of generated speech. - max_len (Optional[int]): Maximum length. - use_teacher_forcing (bool): Whether to use teacher forcing. + text (Tensor): + Input text index tensor (T_text,). + feats (Tensor): + Feature tensor (T_feats, aux_channels). + sids (Tensor): + Speaker index tensor (1,). + spembs (Optional[Tensor]): + Speaker embedding tensor (spk_embed_dim,). + lids (Tensor): + Language index tensor (1,). 
+ durations (Tensor): + Ground-truth duration tensor (T_text,). + noise_scale (float): + Noise scale value for flow. + noise_scale_dur (float): + Noise scale value for duration predictor. + alpha (float): + Alpha parameter to control the speed of generated speech. + max_len (Optional[int]): + Maximum length. + use_teacher_forcing (bool): + Whether to use teacher forcing. Returns: Dict[str, Tensor]: - * wav (Tensor): Generated waveform tensor (T_wav,). - * att_w (Tensor): Monotonic attention weight tensor (T_feats, T_text). - * duration (Tensor): Predicted duration tensor (T_text,). + * wav (Tensor): + Generated waveform tensor (T_wav,). + * att_w (Tensor): + Monotonic attention weight tensor (T_feats, T_text). + * duration (Tensor): + Predicted duration tensor (T_text,). """ # setup text = text[None] @@ -417,15 +461,22 @@ def voice_conversion( lids: Optional[paddle.Tensor]=None, ) -> paddle.Tensor: """Run voice conversion. Args: - feats (Tensor): Feature tensor (T_feats, aux_channels). - sids_src (Optional[Tensor]): Speaker index tensor of source feature (1,). - sids_tgt (Optional[Tensor]): Speaker index tensor of target feature (1,). - spembs_src (Optional[Tensor]): Speaker embedding tensor of source feature (spk_embed_dim,). - spembs_tgt (Optional[Tensor]): Speaker embedding tensor of target feature (spk_embed_dim,). - lids (Optional[Tensor]): Language index tensor (1,). + feats (Tensor): + Feature tensor (T_feats, aux_channels). + sids_src (Optional[Tensor]): + Speaker index tensor of source feature (1,). + sids_tgt (Optional[Tensor]): + Speaker index tensor of target feature (1,). + spembs_src (Optional[Tensor]): + Speaker embedding tensor of source feature (spk_embed_dim,). + spembs_tgt (Optional[Tensor]): + Speaker embedding tensor of target feature (spk_embed_dim,). + lids (Optional[Tensor]): + Language index tensor (1,). Returns: Dict[str, Tensor]: - * wav (Tensor): Generated waveform tensor (T_wav,). + * wav (Tensor): + Generated waveform tensor (T_wav,). """ assert feats is not None feats = feats[None].transpose([0, 2, 1]) diff --git a/paddlespeech/t2s/models/vits/wavenet/residual_block.py b/paddlespeech/t2s/models/vits/wavenet/residual_block.py index 197e7497557..b5095e16872 100644 --- a/paddlespeech/t2s/models/vits/wavenet/residual_block.py +++ b/paddlespeech/t2s/models/vits/wavenet/residual_block.py @@ -39,14 +39,22 @@ def __init__( """Initialize ResidualBlock module. Args: - kernel_size (int): Kernel size of dilation convolution layer. - residual_channels (int): Number of channels for residual connection. - skip_channels (int): Number of channels for skip connection. - aux_channels (int): Number of local conditioning channels. - dropout (float): Dropout probability. - dilation (int): Dilation factor. - bias (bool): Whether to add bias parameter in convolution layers. - scale_residual (bool): Whether to scale the residual outputs. + kernel_size (int): + Kernel size of dilation convolution layer. + residual_channels (int): + Number of channels for residual connection. + skip_channels (int): + Number of channels for skip connection. + aux_channels (int): + Number of local conditioning channels. + dropout (float): + Dropout probability. + dilation (int): + Dilation factor. + bias (bool): + Whether to add bias parameter in convolution layers. + scale_residual (bool): + Whether to scale the residual outputs. 
""" super().__init__() diff --git a/paddlespeech/t2s/models/vits/wavenet/wavenet.py b/paddlespeech/t2s/models/vits/wavenet/wavenet.py index 44693dac614..04422939bc5 100644 --- a/paddlespeech/t2s/models/vits/wavenet/wavenet.py +++ b/paddlespeech/t2s/models/vits/wavenet/wavenet.py @@ -47,25 +47,42 @@ def __init__( """Initialize WaveNet module. Args: - in_channels (int): Number of input channels. - out_channels (int): Number of output channels. - kernel_size (int): Kernel size of dilated convolution. - layers (int): Number of residual block layers. - stacks (int): Number of stacks i.e., dilation cycles. - base_dilation (int): Base dilation factor. - residual_channels (int): Number of channels in residual conv. - gate_channels (int): Number of channels in gated conv. - skip_channels (int): Number of channels in skip conv. - aux_channels (int): Number of channels for local conditioning feature. - global_channels (int): Number of channels for global conditioning feature. - dropout_rate (float): Dropout rate. 0.0 means no dropout applied. - bias (bool): Whether to use bias parameter in conv layer. - use_weight_norm (bool): Whether to use weight norm. If set to true, it will - be applied to all of the conv layers. - use_first_conv (bool): Whether to use the first conv layers. - use_last_conv (bool): Whether to use the last conv layers. - scale_residual (bool): Whether to scale the residual outputs. - scale_skip_connect (bool): Whether to scale the skip connection outputs. + in_channels (int): + Number of input channels. + out_channels (int): + Number of output channels. + kernel_size (int): + Kernel size of dilated convolution. + layers (int): + Number of residual block layers. + stacks (int): + Number of stacks i.e., dilation cycles. + base_dilation (int): + Base dilation factor. + residual_channels (int): + Number of channels in residual conv. + gate_channels (int): + Number of channels in gated conv. + skip_channels (int): + Number of channels in skip conv. + aux_channels (int): + Number of channels for local conditioning feature. + global_channels (int): + Number of channels for global conditioning feature. + dropout_rate (float): + Dropout rate. 0.0 means no dropout applied. + bias (bool): + Whether to use bias parameter in conv layer. + use_weight_norm (bool): + Whether to use weight norm. If set to true, it will be applied to all of the conv layers. + use_first_conv (bool): + Whether to use the first conv layers. + use_last_conv (bool): + Whether to use the last conv layers. + scale_residual (bool): + Whether to scale the residual outputs. + scale_skip_connect (bool): + Whether to scale the skip connection outputs. """ super().__init__() @@ -128,15 +145,18 @@ def forward( """Calculate forward propagation. Args: - x (Tensor): Input noise signal (B, 1, T) if use_first_conv else - (B, residual_channels, T). - x_mask (Optional[Tensor]): Mask tensor (B, 1, T). - c (Optional[Tensor]): Local conditioning features (B, aux_channels, T). - g (Optional[Tensor]): Global conditioning features (B, global_channels, 1). + x (Tensor): + Input noise signal (B, 1, T) if use_first_conv else (B, residual_channels, T). + x_mask (Optional[Tensor]): + Mask tensor (B, 1, T). + c (Optional[Tensor]): + Local conditioning features (B, aux_channels, T). + g (Optional[Tensor]): + Global conditioning features (B, global_channels, 1). Returns: - Tensor: Output tensor (B, out_channels, T) if use_last_conv else - (B, residual_channels, T). 
+ Tensor: + Output tensor (B, out_channels, T) if use_last_conv else (B, residual_channels, T). """ # encode to hidden representation diff --git a/paddlespeech/t2s/models/wavernn/wavernn.py b/paddlespeech/t2s/models/wavernn/wavernn.py index 254edbb2df0..44e9f2d8df8 100644 --- a/paddlespeech/t2s/models/wavernn/wavernn.py +++ b/paddlespeech/t2s/models/wavernn/wavernn.py @@ -69,9 +69,11 @@ def __init__(self, def forward(self, x): ''' Args: - x (Tensor): Input tensor (B, in_dims, T). + x (Tensor): + Input tensor (B, in_dims, T). Returns: - Tensor: Output tensor (B, res_out_dims, T). + Tensor: + Output tensor (B, res_out_dims, T). ''' x = self.conv_in(x) @@ -119,10 +121,13 @@ def __init__(self, def forward(self, m): ''' Args: - c (Tensor): Input tensor (B, C_aux, T). + c (Tensor): + Input tensor (B, C_aux, T). Returns: - Tensor: Output tensor (B, (T - 2 * pad) * prob(upsample_scales), C_aux). - Tensor: Output tensor (B, (T - 2 * pad) * prob(upsample_scales), res_out_dims). + Tensor: + Output tensor (B, (T - 2 * pad) * prod(upsample_scales), C_aux). + Tensor: + Output tensor (B, (T - 2 * pad) * prod(upsample_scales), res_out_dims). ''' # aux: [B, C_aux, T] # -> [B, res_out_dims, T - 2 * aux_context_window] @@ -302,7 +307,8 @@ def generate(self, number of samples for crossfading between batches mu_law(bool) Returns: - wav sequence: Output (T' * prod(upsample_scales), out_channels, C_out). + wav sequence: + Output (T' * prod(upsample_scales), out_channels, C_out). """ self.eval() @@ -423,7 +429,7 @@ def pad_tensor(self, x, pad, side='both'): x(Tensor): mel, [1, n_frames, 80] pad(int): side(str, optional): (Default value = 'both') Returns: Tensor
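Note (illustrative, not part of the patch): the reformatted docstrings in paddlespeech/t2s/models/vits/vits.py above describe the public VITS.inference API (a text index tensor of shape (T_text,), optional noise_scale / noise_scale_dur / alpha, returning a dict with wav, att_w and duration). A minimal usage sketch under assumptions: the idim/odim values below are placeholders, the constructor is assumed to fall back to its documented defaults for the generator and discriminator parameters, and the import assumes the vits package re-exports the VITS class (otherwise import it from paddlespeech.t2s.models.vits.vits).

    import paddle
    from paddlespeech.t2s.models.vits import VITS

    # Placeholder vocabulary size and feature dimension; real values come from a recipe config.
    model = VITS(idim=68, odim=80)
    model.eval()

    # Text index tensor of shape (T_text,), as described in the inference docstring.
    text = paddle.randint(low=0, high=68, shape=[30])

    with paddle.no_grad():
        out = model.inference(
            text=text,
            noise_scale=0.667,    # noise scale for the flow
            noise_scale_dur=0.8,  # noise scale for the stochastic duration predictor
            alpha=1.0)            # speed control for the generated speech
    wav = out["wav"]              # generated waveform tensor (T_wav,)
    duration = out["duration"]    # predicted duration tensor (T_text,)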