```python
import whisper
import tempfile
import os

class WhisperTranscriber:
    def __init__(self, model_size="base"):
        # Available models: tiny, base, small, medium, large
        self.model = whisper.load_model(model_size)

    def transcribe_audio(self, audio_path, language=None):
        """
        Transcribe audio using Whisper model
        
        Args:
            audio_path (str): Path to audio file
            language (str, optional): Language code (e.g., 'en'). If None, auto-detect
        
        Returns:
            dict: Transcription result containing text, segments, language, etc.
        """
        result = self.model.transcribe(audio_path, language=language)
        return result

    def transcribe_bytes(self, audio_bytes, temp_prefix="whisper_temp"):
        """
        Transcribe raw audio bytes by saving to temporary file
        
        Args:
            audio_bytes (bytes): Raw audio data
            temp_prefix (str): Prefix for temporary file
        
        Returns:
            dict: Transcription result
        """
        # Write to a named temp file so ffmpeg (invoked by Whisper) can open it
        # by path. delete=False plus manual cleanup keeps this portable to
        # Windows, where an open NamedTemporaryFile cannot be reopened by name.
        temp_file = tempfile.NamedTemporaryFile(prefix=temp_prefix, delete=False)
        try:
            temp_file.write(audio_bytes)
            temp_file.close()
            return self.transcribe_audio(temp_file.name)
        finally:
            os.unlink(temp_file.name)

    def detect_language(self, audio_path):
        """
        Detect the language of the audio
        
        Args:
            audio_path (str): Path to audio file
            
        Returns:
            str: Language code (e.g., 'en')
        """
        # Load audio and pad/trim it to fit 30 seconds
        audio = whisper.load_audio(audio_path)
        audio = whisper.pad_or_trim(audio)
        
        # Make log-Mel spectrogram and move it to the model's device.
        # n_mels must match the loaded model (80 for most sizes, 128 for large-v3).
        mel = whisper.log_mel_spectrogram(audio, n_mels=self.model.dims.n_mels).to(self.model.device)
        
        # Detect language
        _, probs = self.model.detect_language(mel)
        return max(probs, key=probs.get)

    def transcribe_with_timestamps(self, audio_path):
        """
        Get transcription with word-level timestamps
        
        Args:
            audio_path (str): Path to audio file
            
        Returns:
            dict: Transcription with word-level timestamps
        """
        result = self.model.transcribe(audio_path, word_timestamps=True)
        return result
```
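
A minimal usage sketch, assuming the `openai-whisper` package is installed and `ffmpeg` is on the PATH; `meeting.wav` is a placeholder path, not a file shipped with this code:

```python
# Usage sketch; "meeting.wav" stands in for a real audio file.
transcriber = WhisperTranscriber(model_size="base")

# Auto-detect the spoken language, then transcribe with that hint.
detected = transcriber.detect_language("meeting.wav")
print(f"Detected language: {detected}")

result = transcriber.transcribe_audio("meeting.wav", language=detected)
print(result["text"])

# Word-level timestamps: each segment carries a "words" list with start/end times.
timed = transcriber.transcribe_with_timestamps("meeting.wav")
for segment in timed["segments"]:
    for word in segment.get("words", []):
        print(f'{word["start"]:.2f}-{word["end"]:.2f}: {word["word"]}')
```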