flexthink
Modified it to create a "lite" version for cases where you only need speaker embeddings
380887d
# ############################################################################
# Model: ECAPA big for Speaker verification
# ############################################################################

# Feature parameters
n_mels: 80

# Pretrain folder (HuggingFace)
pretrained_path: poonehmousavi/discrete_wavlm_spk_rec_ecapatdn

# Output parameters
save_folder: tmp

### Configuration for discrete SSL model
# ssl_model_type: hubert, wavlm, wav2vec2
# ssl_hub: facebook/hubert-large-ll60k, microsoft/wavlm-large, facebook/wav2vec2-large
ssl_model_type: wavlm # hubert, wavlm or wav2vec2
ssl_hub: microsoft/wavlm-large
ssl_folder: !ref <save_folder>/ssl_checkpoint
kmeans_repo_id: speechbrain/SSL_Quantization
kmeans_cache_dir: !ref <save_folder>/kmeans_checkpoint
kmeans_dataset: LibriSpeech-100-360-500
freeze_ssl: True
freeze_feature_extractor: True
num_clusters: 1000

### Config for Tokenizer
# Layer numbers should be among the layers supported by the discrete SSL model
# (a k-means model must be available for each selected layer).
# ssl_layer_num: [3, 7, 12, 23]
# deduplicate: [False, False, False, False]
# bpe_tokenizer_path: [null, null, null, null]
ssl_layer_num: [1, 3, 7, 12, 18, 23]
ssl_layer_num_selected: [1, 3, 7, 12, 18, 23]
num_codebooks: 6
deduplicate: [False, False, False, False, False, False]
bpe_tokenizer_path: [null, null, null, null, null, null]
sample_rate: 16000

# Feature parameters
encoder_dim: 1024

# Modules
tokenizer_config:
    SSL_layers: !ref <ssl_layer_num>
    deduplicates: !ref <deduplicate>
    bpe_tokenizers: !ref <bpe_tokenizer_path>

discrete_embedding_layer: !new:custom_interface.Discrete_EmbeddingLayer
    num_codebooks: !ref <num_codebooks>
    vocab_size: !ref <num_clusters>
    emb_dim: !ref <encoder_dim>
    available_layers: !ref <ssl_layer_num>
    layers: !ref <ssl_layer_num_selected>

attention_mlp: !new:custom_interface.AttentionMLP
    input_dim: !ref <encoder_dim>
    hidden_dim: !ref <encoder_dim>

embedding_model: !new:speechbrain.lobes.models.ECAPA_TDNN.ECAPA_TDNN
    input_size: !ref <encoder_dim>
    channels: [1024, 1024, 1024, 1024, 3072]
    kernel_sizes: [5, 3, 3, 3, 1]
    dilations: [1, 2, 3, 4, 1]
    groups: [1, 1, 1, 1, 1]
    attention_channels: 128
    lin_neurons: 192

modules:
    embedding_model: !ref <embedding_model>
    attention_mlp: !ref <attention_mlp>
    discrete_embedding_layer: !ref <discrete_embedding_layer>

pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        embedding_model: !ref <embedding_model>
        attention_mlp: !ref <attention_mlp>
        discrete_embedding_layer: !ref <discrete_embedding_layer>
    paths:
        embedding_model: !ref <pretrained_path>/embedding_model.ckpt
        attention_mlp: !ref <pretrained_path>/attention_mlp.ckpt
        discrete_embedding_layer: !ref <pretrained_path>/discrete_embedding_layer.ckpt
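
A minimal loading sketch, assuming the file above is saved locally as hyperparams.yaml next to the custom_interface.py module that defines Discrete_EmbeddingLayer and AttentionMLP (the class names come from the !new: tags above; the local file layout is an assumption, not part of this change):

# Load the HyperPyYAML file and fetch the pretrained checkpoints it references.
from hyperpyyaml import load_hyperpyyaml

with open("hyperparams.yaml") as fin:
    hparams = load_hyperpyyaml(fin)

# The pretrainer downloads embedding_model.ckpt, attention_mlp.ckpt and
# discrete_embedding_layer.ckpt from <pretrained_path> and loads them into the
# modules instantiated above.
pretrainer = hparams["pretrainer"]
pretrainer.collect_files()
pretrainer.load_collected()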
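
Once the checkpoints are loaded, the "lite" pipeline only needs discrete tokens as input. A hedged sketch of the forward pass follows; the tensor shapes and the attention pooling over codebooks mirror the public SpeechBrain discrete-SSL recipes, custom_interface.py may differ in detail, and the random tokens stand in for real WavLM k-means tokens:

import torch

# Dummy token batch: [batch, time, num_codebooks], one num_clusters vocabulary per codebook.
tokens = torch.randint(0, hparams["num_clusters"], (2, 50, hparams["num_codebooks"]))

embs = hparams["discrete_embedding_layer"](tokens)    # [B, T, num_codebooks, encoder_dim]
att_w = hparams["attention_mlp"](embs)                # attention weights over the codebooks
feats = torch.matmul(att_w.transpose(2, -1), embs).squeeze(-2)  # [B, T, encoder_dim]
spk_emb = hparams["embedding_model"](feats)           # [B, 1, 192] speaker embedding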