{ "activation_quant_modules": [ "bert.encoder.albert_layer_groups.0.albert_layers.0.attention.dense", "bert.encoder.albert_layer_groups.0.albert_layers.0.attention.key", "bert.encoder.albert_layer_groups.0.albert_layers.0.attention.query", "bert.encoder.albert_layer_groups.0.albert_layers.0.attention.value", "bert.encoder.albert_layer_groups.0.albert_layers.0.ffn", "bert.encoder.albert_layer_groups.0.albert_layers.0.ffn_output", "bert.encoder.embedding_hidden_mapping_in", "bert_encoder", "decoder.asr_res.0", "decoder.decode.0.conv1", "decoder.decode.0.conv1x1", "decoder.decode.0.conv2", "decoder.decode.0.norm1.fc", "decoder.decode.0.norm2.fc", "decoder.decode.1.conv1", "decoder.decode.1.conv1x1", "decoder.decode.1.conv2", "decoder.decode.1.norm1.fc", "decoder.decode.1.norm2.fc", "decoder.decode.2.conv1", "decoder.decode.2.conv1x1", "decoder.decode.2.conv2", "decoder.decode.2.norm1.fc", "decoder.decode.2.norm2.fc", "decoder.decode.3.conv1", "decoder.decode.3.conv1x1", "decoder.decode.3.conv2", "decoder.decode.3.norm1.fc", "decoder.decode.3.norm2.fc", "decoder.encode.conv1", "decoder.encode.conv1x1", "decoder.encode.conv2", "decoder.encode.norm1.fc", "decoder.encode.norm2.fc", "decoder.generator.conv_post", "decoder.generator.noise_convs.0", "decoder.generator.noise_convs.1", "decoder.generator.noise_res.0.adain1.0.fc", "decoder.generator.noise_res.0.adain1.1.fc", "decoder.generator.noise_res.0.adain1.2.fc", "decoder.generator.noise_res.0.adain2.0.fc", "decoder.generator.noise_res.0.adain2.1.fc", "decoder.generator.noise_res.0.adain2.2.fc", "decoder.generator.noise_res.0.convs1.0", "decoder.generator.noise_res.0.convs1.1", "decoder.generator.noise_res.0.convs1.2", "decoder.generator.noise_res.0.convs2.0", "decoder.generator.noise_res.0.convs2.1", "decoder.generator.noise_res.0.convs2.2", "decoder.generator.noise_res.1.adain1.0.fc", "decoder.generator.noise_res.1.adain1.1.fc", "decoder.generator.noise_res.1.adain1.2.fc", "decoder.generator.noise_res.1.adain2.0.fc", "decoder.generator.noise_res.1.adain2.1.fc", "decoder.generator.noise_res.1.adain2.2.fc", "decoder.generator.noise_res.1.convs1.0", "decoder.generator.noise_res.1.convs1.1", "decoder.generator.noise_res.1.convs1.2", "decoder.generator.noise_res.1.convs2.0", "decoder.generator.noise_res.1.convs2.1", "decoder.generator.noise_res.1.convs2.2", "decoder.generator.resblocks.0.adain1.0.fc", "decoder.generator.resblocks.0.adain1.1.fc", "decoder.generator.resblocks.0.adain1.2.fc", "decoder.generator.resblocks.0.adain2.0.fc", "decoder.generator.resblocks.0.adain2.1.fc", "decoder.generator.resblocks.0.adain2.2.fc", "decoder.generator.resblocks.0.convs1.0", "decoder.generator.resblocks.0.convs1.1", "decoder.generator.resblocks.0.convs1.2", "decoder.generator.resblocks.0.convs2.0", "decoder.generator.resblocks.0.convs2.1", "decoder.generator.resblocks.0.convs2.2", "decoder.generator.resblocks.1.adain1.0.fc", "decoder.generator.resblocks.1.adain1.1.fc", "decoder.generator.resblocks.1.adain1.2.fc", "decoder.generator.resblocks.1.adain2.0.fc", "decoder.generator.resblocks.1.adain2.1.fc", "decoder.generator.resblocks.1.adain2.2.fc", "decoder.generator.resblocks.1.convs1.0", "decoder.generator.resblocks.1.convs1.1", "decoder.generator.resblocks.1.convs1.2", "decoder.generator.resblocks.1.convs2.0", "decoder.generator.resblocks.1.convs2.1", "decoder.generator.resblocks.1.convs2.2", "decoder.generator.resblocks.2.adain1.0.fc", "decoder.generator.resblocks.2.adain1.1.fc", "decoder.generator.resblocks.2.adain1.2.fc", "decoder.generator.resblocks.2.adain2.0.fc", "decoder.generator.resblocks.2.adain2.1.fc", "decoder.generator.resblocks.2.adain2.2.fc", "decoder.generator.resblocks.2.convs1.0", "decoder.generator.resblocks.2.convs1.1", "decoder.generator.resblocks.2.convs1.2", "decoder.generator.resblocks.2.convs2.0", "decoder.generator.resblocks.2.convs2.1", "decoder.generator.resblocks.2.convs2.2", "decoder.generator.resblocks.3.adain1.0.fc", "decoder.generator.resblocks.3.adain1.1.fc", "decoder.generator.resblocks.3.adain1.2.fc", "decoder.generator.resblocks.3.adain2.0.fc", "decoder.generator.resblocks.3.adain2.1.fc", "decoder.generator.resblocks.3.adain2.2.fc", "decoder.generator.resblocks.3.convs1.0", "decoder.generator.resblocks.3.convs1.1", "decoder.generator.resblocks.3.convs1.2", "decoder.generator.resblocks.3.convs2.0", "decoder.generator.resblocks.3.convs2.1", "decoder.generator.resblocks.3.convs2.2", "predictor.F0.0.conv1", "predictor.F0.0.conv2", "predictor.F0.0.norm1.fc", "predictor.F0.0.norm2.fc", "predictor.F0.1.conv1", "predictor.F0.1.conv1x1", "predictor.F0.1.conv2", "predictor.F0.1.norm1.fc", "predictor.F0.1.norm2.fc", "predictor.F0.2.conv1", "predictor.F0.2.conv2", "predictor.F0.2.norm1.fc", "predictor.F0.2.norm2.fc", "predictor.F0_proj", "predictor.N.0.conv1", "predictor.N.0.conv2", "predictor.N.0.norm1.fc", "predictor.N.0.norm2.fc", "predictor.N.1.conv1", "predictor.N.1.conv1x1", "predictor.N.1.conv2", "predictor.N.1.norm1.fc", "predictor.N.1.norm2.fc", "predictor.N.2.conv1", "predictor.N.2.conv2", "predictor.N.2.norm1.fc", "predictor.N.2.norm2.fc", "predictor.N_proj", "predictor.lstm", "predictor.shared", "predictor.text_encoder.lstms.0", "predictor.text_encoder.lstms.1.fc", "predictor.text_encoder.lstms.2", "predictor.text_encoder.lstms.3.fc", "predictor.text_encoder.lstms.4", "predictor.text_encoder.lstms.5.fc", "text_encoder.cnn.0.0", "text_encoder.cnn.1.0", "text_encoder.cnn.2.0", "text_encoder.lstm" ], "asr_res_dim": 64, "decoder_out_dim": 512, "hidden_dim": 512, "istftnet": { "resblock_kernel_sizes": [ 3, 3 ], "upsample_rates": [ 10, 6 ], "upsample_initial_channel": 512, "resblock_dilation_sizes": [ [ 1, 3, 5 ], [ 1, 3, 5 ] ], "upsample_kernel_sizes": [ 20, 12 ], "gen_istft_n_fft": 20, "gen_istft_hop_size": 5 }, "max_conv_dim": 1024, "max_dur": 50, "model_type": "kitten_tts", "n_layer": 3, "n_mels": 80, "n_token": 178, "plbert": { "num_hidden_layers": 12, "num_attention_heads": 12, "hidden_size": 768, "intermediate_size": 2048, "max_position_embeddings": 512, "embedding_size": 128, "inner_group_num": 1, "num_hidden_groups": 1, "hidden_dropout_prob": 0.0, "attention_probs_dropout_prob": 0.0, "type_vocab_size": 2, "layer_norm_eps": 1e-12 }, "sample_rate": 24000, "speed_priors": {}, "style_dim": 128, "text_encoder_kernel_size": 5, "voice_aliases": { "Bella": "expr-voice-2-f", "Jasper": "expr-voice-2-m", "Luna": "expr-voice-3-f", "Bruno": "expr-voice-3-m", "Rosie": "expr-voice-4-f", "Hugo": "expr-voice-4-m", "Kiki": "expr-voice-5-f", "Leo": "expr-voice-5-m" }, "voices_path": "voices.npz" }