```python
import re
import tempfile

import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from gtts import gTTS


class AccentConverter:
    def __init__(self):
        # Load a model/processor pair per accent.
        # Note: every accent currently points at the same English checkpoint;
        # replace these with accent-specific fine-tuned checkpoints when available.
        checkpoint = "facebook/wav2vec2-large-960h-lv60-self"
        self.models = {
            accent: {
                'model': Wav2Vec2ForCTC.from_pretrained(checkpoint),
                'processor': Wav2Vec2Processor.from_pretrained(checkpoint),
            }
            for accent in ('american', 'british', 'australian', 'indian')
        }

    def load_audio(self, file_path):
        waveform, sample_rate = torchaudio.load(file_path)
        return waveform, sample_rate

    def transcribe_audio(self, waveform, sample_rate, accent='american'):
        # Resample to the 16 kHz rate wav2vec2 expects
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(sample_rate, 16000)
            waveform = resampler(waveform)

        # Downmix multi-channel audio to mono; wav2vec2 expects a 1-D waveform
        if waveform.dim() > 1 and waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        # Process with wav2vec2
        inputs = self.models[accent]['processor'](
            waveform.squeeze(0),
            sampling_rate=16000,
            return_tensors="pt"
        )

        with torch.no_grad():
            logits = self.models[accent]['model'](inputs.input_values).logits

        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = self.models[accent]['processor'].batch_decode(predicted_ids)[0]
        return transcription

    def text_to_speech(self, text, accent='american', output_file='output.mp3'):
        # gTTS approximates English accents via the Google Translate top-level
        # domain (its `lang` parameter only accepts bare language codes like 'en')
        accent_map = {
            'american': 'com',
            'british': 'co.uk',
            'australian': 'com.au',
            'indian': 'co.in'
        }

        tts = gTTS(text=text, lang='en', tld=accent_map.get(accent, 'com'))
        tts.save(output_file)
        return output_file

    def detect_accent(self, audio_path):
        """Improved accent detection using acoustic features and ML"""
        try:
            waveform, sample_rate = self.load_audio(audio_path)

            # Extract MFCC features using torchaudio (assumes mono audio)
            mfcc = torchaudio.transforms.MFCC(
                sample_rate=sample_rate,
                n_mfcc=13,
                melkwargs={'n_fft': 400, 'hop_length': 160, 'n_mels': 23}
            )(waveform)

            # Summarize each coefficient over time (mean, std) -> 26-dim feature vector
            features = torch.cat([
                mfcc.mean(dim=-1),
                mfcc.std(dim=-1)
            ], dim=1).squeeze().numpy()

            # Load our trained accent classifier model
            classifier = torch.load('models/accent_classifier.pt')
            classifier.eval()

            # Predict accent
            with torch.no_grad():
                inputs = torch.from_numpy(features).unsqueeze(0).float()
                outputs = classifier(inputs)
                _, predicted = torch.max(outputs, 1)

            # Map prediction index to accent names
            accent_names = ['american', 'british', 'australian', 'indian']
            detected_accent = accent_names[predicted.item()]

            return detected_accent
        except Exception as e:
            print(f"Accent detection error: {e}")
            return "unknown"

    def convert_accent(self, input_file, source_accent, target_accent):
        waveform, sample_rate = self.load_audio(input_file)

        # Detect the source accent if not provided
        if source_accent == "auto":
            source_accent = self.detect_accent(input_file)
            print(f"Detected source accent: {source_accent}")

        # Transcribe with the source-accent model
        transcription = self.transcribe_audio(waveform, sample_rate, source_accent)

        # Apply accent conversion rules
        converted_text = self.apply_accent_rules(transcription, source_accent, target_accent)

        # Convert to speech with the target accent
        with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as tmp:
            output_file = tmp.name
        self.text_to_speech(converted_text, target_accent, output_file)

        return output_file, converted_text

    def apply_accent_rules(self, text, source_accent, target_accent):
        """Apply accent-specific transformation rules to text"""
        # Example rules for demonstration; a real system would be far more sophisticated
        rules = {
            'american_to_british': [
                (r'\btruck\b', 'lorry'),
                (r'\belevator\b', 'lift'),
                (r'\bapartment\b', 'flat')
            ],
            'british_to_american': [
                (r'\blorry\b', 'truck'),
                (r'\blift\b', 'elevator'),
                (r'\bflat\b', 'apartment')
            ],
            # Add more conversion rules here
        }

        conversion_key = f"{source_accent}_to_{target_accent}"
        if conversion_key in rules:
            for pattern, replacement in rules[conversion_key]:
                text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)

        return f"[Converted from {source_accent} to {target_accent}] {text}"
```
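
`detect_accent` expects a trained classifier saved at `models/accent_classifier.pt`, which is not defined above. Below is a minimal sketch of a compatible model, assuming a small feed-forward network over the 26-dimensional feature vector (13 MFCC means plus 13 standard deviations) and the four accent classes; the architecture and the training step are placeholders, not the actual classifier.

```python
import torch
import torch.nn as nn

# Hypothetical classifier compatible with detect_accent():
# input  = 26 features (13 MFCC means + 13 MFCC standard deviations)
# output = 4 logits, one per accent in ['american', 'british', 'australian', 'indian']
accent_classifier = nn.Sequential(
    nn.Linear(26, 64),
    nn.ReLU(),
    nn.Linear(64, 4),
)

# After training on labeled recordings (not shown here), save the whole module
# so detect_accent() can restore it with torch.load():
torch.save(accent_classifier, 'models/accent_classifier.pt')
```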
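
With a classifier checkpoint in place, end-to-end use of the class might look like the sketch below. The input path is a placeholder, and the output quality depends entirely on the rule set and the gTTS voice.

```python
converter = AccentConverter()

# Convert a recording to British vocabulary and voice;
# "auto" triggers detect_accent() on the input file.
output_path, converted_text = converter.convert_accent(
    "samples/speaker.wav",   # hypothetical input file
    source_accent="auto",
    target_accent="british",
)

print(f"Converted text: {converted_text}")
print(f"Synthesized speech saved to: {output_path}")
```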