diff --git a/examples/aishell/s0/conf/ds_stage2.json b/examples/aishell/s0/conf/ds_stage2.json
deleted file mode 100644
index 49884009a..000000000
--- a/examples/aishell/s0/conf/ds_stage2.json
+++ /dev/null
@@ -1,57 +0,0 @@
-{
-  "train_micro_batch_size_per_gpu": 1,
-  "gradient_accumulation_steps": 1,
-  "steps_per_print": 100,
-  "gradient_clipping": 0.0001,
-  "fp16": {
-    "enabled": false,
-    "auto_cast": false,
-    "loss_scale": 0,
-    "initial_scale_power": 8,
-    "loss_scale_window": 1000,
-    "hysteresis": 2,
-    "min_loss_scale": 1
-  },
-  "bf16": {
-    "enabled": false
-  },
-  "zero_force_ds_cpu_optimizer": false,
-  "zero_optimization": {
-    "stage": 2,
-    "offload_optimizer": {
-      "device": "none",
-      "pin_memory": true
-    },
-    "offload_param": {
-      "device": "none",
-      "pin_memory": true
-    },
-    "allgather_partitions": true,
-    "allgather_bucket_size": 1e7,
-    "overlap_comm": true,
-    "reduce_scatter": true,
-    "reduce_bucket_size": 1e7,
-    "contiguous_gradients" : true
-  },
-  "activation_checkpointing": {
-    "partition_activations": false,
-    "cpu_checkpointing": false,
-    "contiguous_memory_optimization": false,
-    "number_checkpoints": null,
-    "synchronize_checkpoint_boundary": false,
-    "profile": true
-  },
-  "flops_profiler": {
-    "enabled": false,
-    "profile_step": 100,
-    "module_depth": -1,
-    "top_modules": 1,
-    "detailed": true,
-    "output_file": null
-  },
-  "tensorboard": {
-    "enabled": true,
-    "output_path": "tensorboard/ds_logs/",
-    "job_name": "deepspeed"
-  }
-}
diff --git a/examples/aishell/s0/conf/train_u2++_conformer_1.8B.yaml b/examples/aishell/s0/conf/train_u2++_conformer_1.8B.yaml
deleted file mode 100644
index d4de4c440..000000000
--- a/examples/aishell/s0/conf/train_u2++_conformer_1.8B.yaml
+++ /dev/null
@@ -1,115 +0,0 @@
-# network architecture
-# encoder related
-encoder: conformer
-encoder_conf:
-    output_size: 2048 # dimension of attention
-    attention_heads: 16
-    linear_units: 8192 # the number of units of position-wise feed forward
-    num_blocks: 12 # the number of encoder blocks
-    dropout_rate: 0.1
-    positional_dropout_rate: 0.1
-    attention_dropout_rate: 0.1
-    input_layer: conv2d8 # encoder input type, you can choose conv2d, conv2d6 and conv2d8
-    normalize_before: true
-    cnn_module_kernel: 8
-    use_cnn_module: True
-    activation_type: 'swish'
-    pos_enc_layer_type: 'rel_pos'
-    selfattention_layer_type: 'rel_selfattn'
-    causal: true
-    use_dynamic_chunk: true
-    cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
-    use_dynamic_left_chunk: false
-
-# decoder related
-decoder: bitransformer
-decoder_conf:
-    attention_heads: 16
-    linear_units: 8192
-    num_blocks: 3
-    r_num_blocks: 3
-    dropout_rate: 0.1
-    positional_dropout_rate: 0.1
-    self_attention_dropout_rate: 0.1
-    src_attention_dropout_rate: 0.1
-
-tokenizer: char
-tokenizer_conf:
-    symbol_table_path: 'data/dict/lang_char.txt'
-    split_with_space: false
-    bpe_path: null
-    non_lang_syms_path: null
-    is_multilingual: false
-    num_languages: 1
-    special_tokens:
-        <blank>: 0
-        <unk>: 1
-        <sos>: 2
-        <eos>: 2
-
-ctc: ctc
-ctc_conf:
-    ctc_blank_id: 0
-
-cmvn: global_cmvn
-cmvn_conf:
-    cmvn_file: 'data/train/global_cmvn'
-    is_json_cmvn: true
-
-# hybrid CTC/attention
-model: asr_model
-model_conf:
-    ctc_weight: 0.3
-    lsm_weight: 0.1 # label smoothing option
-    length_normalized_loss: false
-    reverse_weight: 0.3
-
-dataset: asr
-dataset_conf:
-    filter_conf:
-        max_length: 40960
-        min_length: 0
-        token_max_length: 200
-        token_min_length: 1
-    resample_conf:
-        resample_rate: 16000
-    speed_perturb: true
-    fbank_conf:
-        num_mel_bins: 80
-        frame_shift: 10
-        frame_length: 25
-        dither: 1.0
-    spec_aug: true
-    spec_aug_conf:
-        num_t_mask: 2
-        num_f_mask: 2
-        max_t: 50
-        max_f: 10
-    spec_sub: true
-    spec_sub_conf:
-        num_t_sub: 3
-        max_t: 30
-    spec_trim: false
-    spec_trim_conf:
-        max_t: 50
-    shuffle: true
-    shuffle_conf:
-        shuffle_size: 1500
-    sort: true
-    sort_conf:
-        sort_size: 500 # sort_size should be less than shuffle_size
-    batch_conf:
-        batch_type: 'static' # static or dynamic
-        batch_size: 16
-
-grad_clip: 5
-accum_grad: 1
-max_epoch: 100
-log_interval: 100
-
-optim: adam
-optim_conf:
-    lr: 0.001
-scheduler: warmuplr # pytorch v1.1.0+ required
-scheduler_conf:
-    warmup_steps: 25000
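
Note for reviewers: for context on what the removed ds_stage2.json did, a ZeRO stage-2 JSON config like this is normally handed to deepspeed.initialize(). The sketch below is a minimal, hypothetical stand-in, not WeNet's actual entrypoint (which lives in wenet/bin/train.py and wires the config path through its own flags); the Linear model and config path here are placeholders for illustration only.

    # Minimal sketch: how a ZeRO stage-2 JSON config like the deleted
    # ds_stage2.json is typically consumed by DeepSpeed. The Linear model
    # is a hypothetical stand-in for the real 1.8B conformer.
    import deepspeed
    import torch

    model = torch.nn.Linear(80, 2048)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    # deepspeed.initialize wraps the model and optimizer according to the
    # JSON config (ZeRO stage 2, bucket sizes, tensorboard logging, ...)
    # and returns an engine whose backward()/step() drive training.
    engine, optimizer, _, _ = deepspeed.initialize(
        model=model,
        optimizer=optimizer,
        config="examples/aishell/s0/conf/ds_stage2.json",
    )

Such a script would be run under the DeepSpeed launcher (e.g. deepspeed --num_gpus 1 sketch.py), since initialize() expects a distributed environment to be set up.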