Spaces:
Build error
Build error
| import torch | |
| import gradio as gr | |
| import librosa | |
| import tempfile | |
| from typing import Optional | |
| from TTS.config import load_config | |
| from transformers import AutoFeatureExtractor, AutoModelForSeq2SeqLM, AutoTokenizer, pipeline | |
| from TTS.utils.manage import ModelManager | |
| from TTS.utils.synthesizer import Synthesizer | |
| first_generation = True | |
| device = 'cuda' if torch.cuda.is_available() else 'cpu' | |
| def load_and_fix_data(input_file, model_sampling_rate): | |
| speech, sample_rate = librosa.load(input_file) | |
| if len(speech.shape) > 1: | |
| speech = speech[:, 0] + speech[:, 1] | |
| if sample_rate != model_sampling_rate: | |
| speech = librosa.resample(speech, sample_rate, model_sampling_rate) | |
| return speech | |
| feature_extractor = AutoFeatureExtractor.from_pretrained("jonatasgrosman/wav2vec2-xls-r-1b-spanish") | |
| sampling_rate = feature_extractor.sampling_rate | |
| asr = pipeline("automatic-speech-recognition", model="jonatasgrosman/wav2vec2-xls-r-1b-spanish") | |
| prefix = '' | |
| model_checkpoint = "hackathon-pln-es/es_text_neutralizer" | |
| tokenizer = AutoTokenizer.from_pretrained(model_checkpoint) | |
| model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint) | |
| manager = ModelManager() | |
| MODEL_NAMES = manager.list_tts_models() | |
| def postproc(input_sentence, preds): | |
| try: | |
| preds = preds.replace('De el', 'Del').replace('de el', 'del').replace(' ', ' ') | |
| if preds[0].islower(): | |
| preds = preds.capitalize() | |
| preds = preds.replace(' . ', '. ').replace(' , ', ', ') | |
| # Nombres en mayusculas | |
| prev_letter = '' | |
| for word in input_sentence.split(' '): | |
| if word: | |
| if word[0].isupper(): | |
| if word.lower() in preds and word != input_sentence.split(' ')[0]: | |
| if prev_letter == '.': | |
| preds = preds.replace('. ' + word.lower() + ' ', '. ' + word + ' ') | |
| else: | |
| if word[-1] == '.': | |
| preds = preds.replace(word.lower(), word) | |
| else: | |
| preds = preds.replace(word.lower() + ' ', word + ' ') | |
| prev_letter = word[-1] | |
| preds = preds.strip() # quitar ultimo espacio | |
| except: | |
| pass | |
| return preds | |
| model_name = "es/mai/tacotron2-DDC" | |
| MAX_TXT_LEN = 100 | |
| def predict_and_ctc_lm_decode(input_file, speaker_idx: str=None): | |
| speech = load_and_fix_data(input_file, sampling_rate) | |
| transcribed_text = asr(speech, chunk_length_s=10, stride_length_s=1) | |
| transcribed_text = transcribed_text["text"] | |
| inputs = tokenizer([prefix + transcribed_text], return_tensors="pt", padding=True) | |
| with torch.no_grad(): | |
| if first_generation: | |
| output_sequence = model.generate( | |
| input_ids=inputs["input_ids"].to(device), | |
| attention_mask=inputs["attention_mask"].to(device), | |
| do_sample=False, # disable sampling to test if batching affects output | |
| ) | |
| else: | |
| output_sequence = model.generate( | |
| input_ids=inputs["input_ids"].to(device), | |
| attention_mask=inputs["attention_mask"].to(device), | |
| do_sample=False, | |
| num_beams=2, | |
| repetition_penalty=2.5, | |
| # length_penalty=1.0, | |
| early_stopping=True# disable sampling to test if batching affects output | |
| ) | |
| text = postproc(transcribed_text, | |
| preds=tokenizer.decode(output_sequence[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)) | |
| if len(text) > MAX_TXT_LEN: | |
| text = text[:MAX_TXT_LEN] | |
| print(f"Input text was cutoff since it went over the {MAX_TXT_LEN} character limit.") | |
| print(text, model_name) | |
| # download model | |
| model_path, config_path, model_item = manager.download_model(f"tts_models/{model_name}") | |
| vocoder_name: Optional[str] = model_item["default_vocoder"] | |
| # download vocoder | |
| vocoder_path = None | |
| vocoder_config_path = None | |
| if vocoder_name is not None: | |
| vocoder_path, vocoder_config_path, _ = manager.download_model(vocoder_name) | |
| # init synthesizer | |
| synthesizer = Synthesizer( | |
| model_path, config_path, None, None, vocoder_path, vocoder_config_path, | |
| ) | |
| # synthesize | |
| if synthesizer is None: | |
| raise NameError("model not found") | |
| wavs = synthesizer.tts(text, speaker_idx) | |
| # return output | |
| with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp: | |
| synthesizer.save_wav(wavs, fp) | |
| return fp.name | |
| description = """This is a Gradio demo for generating gender-neutralized audios. To use it, simply provide an audio input (via microphone or audio recording), which will then be transcribed and gender-neutralized using pre-trained models. Finally, with the help of Coqui's TTS model, gender neutralized audio is generated. | |
| Pre-trained model used for Spanish ASR: [jonatasgrosman/wav2vec2-xls-r-1b-spanish](https://huggingface.co/jonatasgrosman/wav2vec2-xls-r-1b-spanish) | |
| Pre-trained model used for Gender Neutralization: [hackathon-pln-es/es_text_neutralizer](https://huggingface.co/hackathon-pln-es/es_text_neutralizer) | |
| Pre-trained model used for TTS: 🐸💬 CoquiTTS => model_name = "es/mai/tacotron2-DDC" | |
| """ | |
| article = """ **ACKNOWLEDGEMENT:** | |
| **This project is based on the following Spaces:** | |
| [CoquiTTS](https://huggingface.co/spaces/coqui/CoquiTTS) | |
| [es_nlp_gender_neutralizer](https://huggingface.co/spaces/hackathon-pln-es/es_nlp_gender_neutralizer) | |
| [Hindi_ASR](https://huggingface.co/spaces/anuragshas/Hindi_ASR) | |
| """ | |
| gr.Interface( | |
| predict_and_ctc_lm_decode, | |
| inputs=[ | |
| gr.inputs.Audio(source="microphone", type="filepath", label="Record your audio") | |
| ], | |
| outputs=gr.outputs.Audio(label="Output"), | |
| examples=[["Example1.wav"],["Example2.wav"],["Example3.wav"]], | |
| title="Generate-Gender-Neutralized-Audios", | |
| description = description, | |
| article=article, | |
| layout="horizontal", | |
| theme="huggingface", | |
| ).launch(enable_queue=True, cache_examples=True) | |