|
|
```python |
|
|
import os
import re
import tempfile

import torch
import torchaudio
from gtts import gTTS
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
|
|
|
|
|
class AccentConverter:
    """Convert English speech between accents: transcribe, rewrite, re-synthesise.

    ``convert_accent`` drives the pipeline: the input audio is transcribed
    with a Wav2Vec2 CTC model, accent-specific word substitutions are applied
    to the transcript, and the result is synthesised to an MP3 with gTTS.

    Attributes:
        models: Maps each accent name to ``{'model': ..., 'processor': ...}``.
            Every accent refers to the same shared model/processor objects.
    """

    # Checkpoint registered for every accent.
    _CHECKPOINT = "facebook/wav2vec2-large-960h-lv60-self"

    # Sample rate the audio is resampled to before transcription.
    _TARGET_SAMPLE_RATE = 16000

    # Accent used whenever an unrecognised accent name is supplied.
    _DEFAULT_ACCENT = 'american'

    # Order matters: the accent classifier's output index is looked up here.
    _ACCENT_NAMES = ('american', 'british', 'australian', 'indian')

    # On-disk location of the serialized accent classifier.
    _CLASSIFIER_PATH = 'models/accent_classifier.pt'

    # gTTS picks regional English voices through the Google Translate host
    # (``tld``), not through the language code. 'com' is gTTS's default host.
    _TTS_TLDS = {
        'american': 'com',
        'british': 'co.uk',
        'australian': 'com.au',
        'indian': 'co.in',
    }

    # Vocabulary substitutions keyed by "<source>_to_<target>", compiled once.
    # Matching is case-insensitive; the replacement is inserted as written.
    _RULES = {
        'american_to_british': (
            (re.compile(r'\btruck\b', re.IGNORECASE), 'lorry'),
            (re.compile(r'\belevator\b', re.IGNORECASE), 'lift'),
            (re.compile(r'\bapartment\b', re.IGNORECASE), 'flat'),
        ),
        'british_to_american': (
            (re.compile(r'\blorry\b', re.IGNORECASE), 'truck'),
            (re.compile(r'\blift\b', re.IGNORECASE), 'elevator'),
            (re.compile(r'\bflat\b', re.IGNORECASE), 'apartment'),
        ),
    }

    def __init__(self):
        """Load the speech-recognition model once and register it per accent."""
        # All accents use the same checkpoint, so a single model/processor
        # pair is shared instead of loading four identical copies.
        model = Wav2Vec2ForCTC.from_pretrained(self._CHECKPOINT)
        processor = Wav2Vec2Processor.from_pretrained(self._CHECKPOINT)
        self.models = {
            accent: {'model': model, 'processor': processor}
            for accent in self._ACCENT_NAMES
        }
        # Populated lazily by _load_accent_classifier().
        self._accent_classifier = None

    @staticmethod
    def _to_mono(waveform):
        """Average a multi-channel (channels, frames) waveform down to one channel."""
        if waveform.dim() > 1 and waveform.size(0) > 1:
            return waveform.mean(dim=0, keepdim=True)
        return waveform

    def _load_accent_classifier(self):
        """Return the accent classifier, reading it from disk on first use only."""
        classifier = getattr(self, '_accent_classifier', None)
        if classifier is None:
            # NOTE(security): torch.load unpickles the file, which can execute
            # arbitrary code. Only ship a classifier file from a trusted source.
            classifier = torch.load(self._CLASSIFIER_PATH)
            classifier.eval()
            self._accent_classifier = classifier
        return classifier

    def load_audio(self, file_path):
        """Load an audio file with torchaudio.

        Args:
            file_path: Path of the audio file to read.

        Returns:
            The ``(waveform, sample_rate)`` pair returned by ``torchaudio.load``.
        """
        waveform, sample_rate = torchaudio.load(file_path)
        return waveform, sample_rate

    def transcribe_audio(self, waveform, sample_rate, accent='american'):
        """Transcribe a waveform to text with the model registered for ``accent``.

        Args:
            waveform: Audio tensor as returned by ``load_audio``.
            sample_rate: Sample rate of ``waveform`` in Hz.
            accent: Key into ``self.models``. Unrecognised names (including
                the ``"unknown"`` result of a failed detection) fall back to
                the default accent instead of raising ``KeyError``.

        Returns:
            The decoded transcription string.
        """
        # Stereo input would otherwise be treated as a batch of two clips.
        waveform = self._to_mono(waveform)

        if sample_rate != self._TARGET_SAMPLE_RATE:
            resampler = torchaudio.transforms.Resample(
                sample_rate, self._TARGET_SAMPLE_RATE
            )
            waveform = resampler(waveform)

        entry = self.models.get(accent)
        if entry is None:
            entry = self.models[self._DEFAULT_ACCENT]
        processor = entry['processor']
        model = entry['model']

        # The feature extractor is documented for numpy arrays / lists.
        samples = waveform.squeeze(0).detach().cpu().numpy()
        inputs = processor(
            samples,
            sampling_rate=self._TARGET_SAMPLE_RATE,
            return_tensors="pt",
        )

        with torch.no_grad():
            logits = model(inputs.input_values).logits

        # Greedy decoding: most likely token per frame.
        predicted_ids = torch.argmax(logits, dim=-1)
        return processor.batch_decode(predicted_ids)[0]

    def text_to_speech(self, text, accent='american', output_file='output.mp3'):
        """Synthesise ``text`` to an MP3 file in the requested accent.

        Args:
            text: Text to speak.
            accent: Accent name; unrecognised names use the default accent.
            output_file: Path the MP3 is written to.

        Returns:
            ``output_file``.
        """
        # Region-suffixed codes such as 'en-us' are deprecated in current gTTS
        # releases and fall back to plain 'en', which drops the accent. The
        # supported form is lang='en' with a regional tld.
        tld = self._TTS_TLDS.get(accent, self._TTS_TLDS[self._DEFAULT_ACCENT])
        tts = gTTS(text=text, lang='en', tld=tld)
        tts.save(output_file)
        return output_file

    def detect_accent(self, audio_path):
        """Classify the accent of an audio file from MFCC statistics.

        The per-coefficient mean and standard deviation over time of 13 MFCCs
        are fed to the classifier stored at ``models/accent_classifier.pt``.

        Args:
            audio_path: Path of the audio file to analyse.

        Returns:
            One of ``'american'``, ``'british'``, ``'australian'``,
            ``'indian'``, or ``"unknown"`` if any step fails (best effort:
            the error is printed, not raised).
        """
        try:
            waveform, sample_rate = self.load_audio(audio_path)
            # Keep the feature vector a fixed size regardless of channel count.
            waveform = self._to_mono(waveform)

            mfcc = torchaudio.transforms.MFCC(
                sample_rate=sample_rate,
                n_mfcc=13,
                melkwargs={'n_fft': 400, 'hop_length': 160, 'n_mels': 23},
            )(waveform)

            # Summarise each coefficient over the time axis: [means | stds].
            features = torch.cat(
                [mfcc.mean(dim=-1), mfcc.std(dim=-1)],
                dim=1,
            )

            classifier = self._load_accent_classifier()

            with torch.no_grad():
                inputs = features.reshape(1, -1).float()
                outputs = classifier(inputs)
                _, predicted = torch.max(outputs, 1)

            return self._ACCENT_NAMES[predicted.item()]

        except Exception as e:
            # Deliberate best effort: detection failure must not abort callers.
            print(f"Accent detection error: {e}")
            return "unknown"

    def convert_accent(self, input_file, source_accent, target_accent):
        """Convert the speech in ``input_file`` to ``target_accent``.

        Args:
            input_file: Path of the source audio file.
            source_accent: Accent of the input, or ``"auto"`` to detect it.
            target_accent: Accent to synthesise the output in.

        Returns:
            ``(output_file, converted_text)``: the path of a newly created
            temporary MP3 (the caller is responsible for deleting it) and the
            text that was synthesised.

        Raises:
            Exception: Whatever loading, transcription or synthesis raises;
                the temporary file is removed if synthesis fails.
        """
        waveform, sample_rate = self.load_audio(input_file)

        if source_accent == "auto":
            source_accent = self.detect_accent(input_file)
            print(f"Detected source accent: {source_accent}")

        transcription = self.transcribe_audio(waveform, sample_rate, source_accent)

        # NOTE(review): apply_accent_rules prefixes the text with a
        # "[Converted from ...]" tag, so that tag is also sent to TTS below.
        # Confirm this is intended.
        converted_text = self.apply_accent_rules(
            transcription, source_accent, target_accent
        )

        # Reserve a unique path; the handle is closed before gTTS writes to it.
        with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as tmp:
            output_file = tmp.name

        try:
            self.text_to_speech(converted_text, target_accent, output_file)
        except Exception:
            # Do not leave an empty temp file behind when synthesis fails.
            try:
                os.remove(output_file)
            except OSError:
                pass
            raise

        return output_file, converted_text

    def apply_accent_rules(self, text, source_accent, target_accent):
        """Apply accent-specific word substitutions to ``text``.

        Args:
            text: Text to transform.
            source_accent: Accent the text is currently in.
            target_accent: Accent to rewrite the text for.

        Returns:
            The text prefixed with
            ``"[Converted from <source> to <target>] "``. Substitutions are
            applied only for accent pairs that have rules; other pairs leave
            the text itself unchanged.
        """
        conversion_key = f"{source_accent}_to_{target_accent}"
        for pattern, replacement in self._RULES.get(conversion_key, ()):
            text = pattern.sub(replacement, text)

        return f"[Converted from {source_accent} to {target_accent}] {text}"
|
|
``` |