Spaces:

HaryaniAnjali
/

Audio_File_Emotion_Classification

Sleeping

App Files Files Community

HaryaniAnjali committed on Apr 5

Commit

e5b4dac

verified ·

1 Parent(s): b2a2de0

Update app.py

Browse files

Files changed (1) hide show

app.py +113 -73

app.py CHANGED Viewed

@@ -3,112 +3,152 @@ import torch
 import librosa
 import numpy as np
 import os
-# Define PyTorch model class (must match the structure used during conversion)
 class EmotionClassifier(torch.nn.Module):
-    def __init__(self, input_shape, num_classes):
         super().__init__()
-        # Adjust this architecture to match your converted model
-        self.flatten = torch.nn.Flatten()
-        self.layers = torch.nn.Sequential(
-            torch.nn.Linear(input_shape, 128),
-            torch.nn.ReLU(),
-            torch.nn.Dropout(0.3),
-            torch.nn.Linear(128, 64),
-            torch.nn.ReLU(),
-            torch.nn.Dropout(0.3),
-            torch.nn.Linear(64, num_classes)
-        )
     def forward(self, x):
-        x = self.flatten(x)
-        return self.layers(x)
-# Create model instance
-input_shape = 13 * 128  # n_mfcc * max_length
-num_classes = 7  # Number of emotions
-model = EmotionClassifier(input_shape, num_classes)
-# Load the saved model weights
-model_path = os.path.join(os.path.dirname(__file__), 'emotion_model.pt')
-model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
-model.eval()
-# Define emotions
-emotions = ["neutral", "happy", "sad", "angry", "fearful", "disgust", "surprised"]
-def extract_features(audio_path, sample_rate=16000, n_mfcc=13, max_length=128):
-    """Extract MFCC features from an audio file"""
     try:
         audio, sr = librosa.load(audio_path, sr=sample_rate)
         # Extract MFCCs
-        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
-        # Pad or truncate to fixed length
-        if mfccs.shape[1] < max_length:
-            pad_width = max_length - mfccs.shape[1]
-            mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
-        else:
-            mfccs = mfccs[:, :max_length]
-        return mfccs
     except Exception as e:
-        print(f"Error in feature extraction: {e}")
         return None
 def predict_emotion(audio):
     """Predict emotion from audio input"""
     try:
-        # Process audio input
         if isinstance(audio, str):  # File path
             features = extract_features(audio)
         else:  # Audio array from microphone
-            # Handle microphone input
-            if isinstance(audio, tuple):
-                audio_array, sample_rate = audio
-            else:
-                audio_array = audio
-                sample_rate = 16000
-            # Convert to mono if stereo
-            if len(np.array(audio_array).shape) > 1:
-                audio_array = np.mean(audio_array, axis=1)
-            # Extract features
-            mfccs = librosa.feature.mfcc(y=np.array(audio_array), sr=sample_rate, n_mfcc=13)
-            # Pad or truncate to fixed length
-            max_length = 128
-            if mfccs.shape[1] < max_length:
-                pad_width = max_length - mfccs.shape[1]
-                mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
-            else:
-                mfccs = mfccs[:, :max_length]
-            features = mfccs
         if features is None:
-            return {emotion: 0.0 for emotion in emotions}
-        # Flatten the features (adjust based on your model's input expectations)
-        features_flat = features.reshape(1, -1)
-        # Convert to PyTorch tensor
-        features_tensor = torch.tensor(features_flat, dtype=torch.float32)
-        # Get predictions
         with torch.no_grad():
             outputs = model(features_tensor)
             probabilities = torch.nn.functional.softmax(outputs, dim=1)
-        # Format results
         result = {emotion: float(probabilities[0][i].item()) for i, emotion in enumerate(emotions)}
         return result
     except Exception as e:
         print(f"Error in prediction: {e}")
-        import traceback
         traceback.print_exc()
         return {emotion: 1/len(emotions) for emotion in emotions}
@@ -116,9 +156,9 @@ def predict_emotion(audio):
 demo = gr.Interface(
     fn=predict_emotion,
     inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
-    outputs=gr.Label(num_top_classes=7),
     title="Speech Emotion Recognition",
-    description="Upload an audio file or record your voice to identify the emotion. This model can detect neutral, happy, sad, angry, fearful, disgust, and surprised emotions."
 )
 demo.launch()

 import librosa
 import numpy as np
 import os
+import traceback
# PyTorch counterpart of the converted network; its layout must mirror the conversion.
class EmotionClassifier(torch.nn.Module):
    """Feed-forward classifier: a Linear+ReLU stage per hidden size, then a Linear head.

    Args:
        input_features: Width of the vector fed to the first Linear layer.
        hidden_sizes: Iterable of hidden-layer widths, applied in order.
        num_classes: Width of the final Linear layer (one output per class).

    The layers are stored as ``self.model`` (a ``torch.nn.Sequential``), so the
    parameter names in ``state_dict()`` have the form ``model.<index>.weight``
    and ``model.<index>.bias``.
    """

    def __init__(self, input_features, hidden_sizes, num_classes):
        super().__init__()
        # Consecutive pairs of this list are the (in, out) widths of each hidden stage.
        widths = [input_features, *hidden_sizes]
        stack = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack.append(torch.nn.Linear(fan_in, fan_out))
            stack.append(torch.nn.ReLU())
        # Output head: no activation here, forward() returns the raw layer output.
        stack.append(torch.nn.Linear(widths[-1], num_classes))
        self.model = torch.nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return its output."""
        return self.model(x)
# Class labels in output-index order; this list has to line up with the
# classes the model produces ("calm" is the 8th label).
emotions = [
    "neutral", "happy", "sad", "angry",
    "fearful", "disgust", "surprised", "calm",
]

# Build the network and restore its weights. Any failure is logged and leaves
# ``model`` set to None, which predict_emotion checks before running inference.
try:
    print("Loading PyTorch model...")
    # Dimensions taken from the Keras model: first-layer input width,
    # hidden-layer widths, and output-layer width.
    input_features, hidden_sizes, num_classes = 768, [256, 128, 64], 8
    model = EmotionClassifier(input_features, hidden_sizes, num_classes)
    # The weights file is expected next to this script.
    model_path = os.path.join(os.path.dirname(__file__), 'emotion_model.pt')
    model.load_state_dict(
        torch.load(model_path, map_location=torch.device('cpu'))
    )
    model.eval()
    print("Model loaded successfully")
except Exception as exc:
    print(f"Error loading model: {exc}")
    traceback.print_exc()
    model = None
def _mean_and_var(frames):
    """Return the per-row mean followed by the per-row variance of a 2-D matrix."""
    return np.hstack([np.mean(frames, axis=1), np.var(frames, axis=1)])


def extract_features(audio_path, sample_rate=16000, n_features=768):
    """Summarise an audio file as a fixed-length 1-D feature vector.

    The file is loaded with ``librosa.load(audio_path, sr=sample_rate)``. Four
    descriptors are computed (20 MFCCs, chroma, mel spectrogram, spectral
    contrast); each is reduced to its mean and variance over time, and the
    results are concatenated in that order. The vector is then zero-padded or
    truncated to exactly ``n_features`` entries.

    Args:
        audio_path: Path of the audio file to read.
        sample_rate: Sample rate passed to ``librosa.load``.
        n_features: Length of the returned vector. The default of 768 is the
            input width the model in this file is built with.

    Returns:
        A 1-D NumPy array of length ``n_features``, or ``None`` if loading or
        feature extraction fails (the error and traceback are printed).
    """
    try:
        print(f"Extracting features from {audio_path}")
        audio, sr = librosa.load(audio_path, sr=sample_rate)

        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=20)
        chroma = librosa.feature.chroma_stft(y=audio, sr=sr)
        # At most the first 200 mel bands are kept. librosa's documented
        # default is 128 bands, so this cap is presumably never reached here.
        mel = librosa.feature.melspectrogram(y=audio, sr=sr)[:200]
        contrast = librosa.feature.spectral_contrast(y=audio, sr=sr)

        # Order matters: the model sees these statistics positionally.
        features = np.hstack(
            [_mean_and_var(frames) for frames in (mfccs, chroma, mel, contrast)]
        )

        # NOTE(review): with librosa's default settings these statistics appear
        # to total well under 768 values, so much of the default-length vector
        # would be zero padding - confirm this matches the training pipeline.
        if len(features) < n_features:
            features = np.pad(features, (0, n_features - len(features)))
        else:
            features = features[:n_features]

        print(f"Extracted {len(features)} features")
        return features
    except Exception as e:
        # Best-effort boundary: callers treat None as "no features available".
        print(f"Error extracting features: {e}")
        traceback.print_exc()
        return None
 def predict_emotion(audio):
     """Predict emotion from audio input"""
+    if model is None:
+        return {emotion: 1/len(emotions) for emotion in emotions}
     try:
+        print(f"Processing audio input: {type(audio)}")
+        # Process audio based on input type
         if isinstance(audio, str):  # File path
             features = extract_features(audio)
         else:  # Audio array from microphone
+            # Save to a temporary file
+            import tempfile
+            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
+                if isinstance(audio, tuple):
+                    audio_array, sample_rate = audio
+                else:
+                    audio_array = audio
+                    sample_rate = 16000
+                import soundfile as sf
+                sf.write(temp_file.name, audio_array, sample_rate)
+                features = extract_features(temp_file.name)
+                # Clean up
+                os.remove(temp_file.name)
         if features is None:
+            return {emotion: 1/len(emotions) for emotion in emotions}
+        # Convert features to PyTorch tensor
+        features_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0)
+        # Make prediction
         with torch.no_grad():
             outputs = model(features_tensor)
             probabilities = torch.nn.functional.softmax(outputs, dim=1)
+        # Format result
         result = {emotion: float(probabilities[0][i].item()) for i, emotion in enumerate(emotions)}
+        print(f"Prediction result: {result}")
         return result
     except Exception as e:
         print(f"Error in prediction: {e}")
         traceback.print_exc()
         return {emotion: 1/len(emotions) for emotion in emotions}
 demo = gr.Interface(
     fn=predict_emotion,
     inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
+    outputs=gr.Label(num_top_classes=8),  # Updated to match the 8 emotions
     title="Speech Emotion Recognition",
+    description="Upload an audio file or record your voice to identify the emotion. This model can detect neutral, happy, sad, angry, fearful, disgust, surprised, and calm emotions."
 )
 demo.launch()