Spaces:

HaryaniAnjali
/

Audio_File_Emotion_Classification

Sleeping

App Files Files Community

HaryaniAnjali committed on Apr 5

Commit

f1445b2

verified ·

1 Parent(s): 7431bd1

Update app.py

Browse files

Files changed (1) hide show

app.py +83 -27

app.py CHANGED Viewed

@@ -1,43 +1,99 @@
 import gradio as gr
-from transformers import pipeline, AutoFeatureExtractor, AutoModelForAudioClassification
-import torch
 import librosa
 import numpy as np
-# Load model and feature extractor
-model_id = "your-username/speech-emotion-recognition-model"
-feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)
-model = AutoModelForAudioClassification.from_pretrained(model_id)
-# Define emotions
 emotions = ["neutral", "happy", "sad", "angry", "fearful", "disgust", "surprised"]
-def predict_emotion(audio_path):
-    # Load audio
-    audio, sampling_rate = librosa.load(audio_path, sr=16000)
-    # Process through feature extractor
-    inputs = feature_extractor(audio, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
-    # Get prediction
-    with torch.no_grad():
-        outputs = model(**inputs)
-        probs = torch.nn.functional.softmax(outputs.logits, dim=1)
-        predicted_class_id = torch.argmax(probs, dim=1).item()
-        predicted_label = emotions[predicted_class_id]
-        confidence = probs[0][predicted_class_id].item()
-    # Return result
-    result = {emotion: float(probs[0][i].item()) for i, emotion in enumerate(emotions)}
-    return result
-# Create Gradio interface
 demo = gr.Interface(
     fn=predict_emotion,
-    inputs=gr.Audio(source="microphone", type="filepath"),
     outputs=gr.Label(num_top_classes=7),
     title="Speech Emotion Recognition",
-    description="Upload audio or record your voice to identify the emotion. This model can detect neutral, happy, sad, angry, fearful, disgust, and surprised emotions."
 )
 demo.launch()

 import gradio as gr
+import tensorflow as tf
 import librosa
 import numpy as np
+import os
+# Load the Keras model once, at import time, from the .h5 file that sits next to this script
+model_path = os.path.join(os.path.dirname(__file__), 'wav2vec_model.h5')
+model = tf.keras.models.load_model(model_path)
+# Emotion labels; predict_emotion pairs position i of the model output with emotions[i]
 emotions = ["neutral", "happy", "sad", "angry", "fearful", "disgust", "surprised"]
+def extract_features(audio_path, sample_rate=16000, n_mfcc=13, max_length=128):
+    """Extract MFCC features from an audio file"""
+    try:
+        audio, sr = librosa.load(audio_path, sr=sample_rate)
+        # Extract MFCCs
+        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
+        # Pad or truncate to fixed length
+        if mfccs.shape[1] < max_length:
+            pad_width = max_length - mfccs.shape[1]
+            mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
+        else:
+            mfccs = mfccs[:, :max_length]
+        return mfccs
+    except Exception as e:
+        print(f"Error in feature extraction: {e}")
+        return None
+def predict_emotion(audio):
+    """Predict emotion from audio input
+    This function accepts both file path (when uploading) and audio array
+    (when recording via microphone) as input
+    """
+    try:
+        # Check if audio is a file path or audio array
+        if isinstance(audio, str):  # File path
+            features = extract_features(audio)
+        else:  # Audio array from microphone
+            # If audio is a tuple (audio array, sample rate)
+            if isinstance(audio, tuple):
+                audio_array, sample_rate = audio
+            else:
+                # If only audio array is provided, assume sample rate
+                audio_array = audio
+                sample_rate = 16000
+            # Convert to mono if stereo
+            if len(audio_array.shape) > 1:
+                audio_array = np.mean(audio_array, axis=1)
+            # Extract features
+            mfccs = librosa.feature.mfcc(y=audio_array, sr=sample_rate, n_mfcc=13)
+            # Pad or truncate to fixed length
+            max_length = 128
+            if mfccs.shape[1] < max_length:
+                pad_width = max_length - mfccs.shape[1]
+                mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
+            else:
+                mfccs = mfccs[:, :max_length]
+            features = mfccs
+        if features is None:
+            return {emotion: 0.0 for emotion in emotions}
+        # Reshape for model input
+        features = np.expand_dims(features, axis=0)
+        # Make prediction
+        predictions = model.predict(features)
+        # Format results
+        result = {emotion: float(predictions[0][i]) for i, emotion in enumerate(emotions)}
+        return result
+    except Exception as e:
+        print(f"Error in prediction: {e}")
+        return {emotion: 0.0 for emotion in emotions}
+# Build the Gradio UI: audio recorded from the microphone or uploaded is passed to predict_emotion as a file path
 demo = gr.Interface(
     fn=predict_emotion,
+    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
     outputs=gr.Label(num_top_classes=7),
     title="Speech Emotion Recognition",
+    description="Upload an audio file or record your voice to identify the emotion. This model can detect neutral, happy, sad, angry, fearful, disgust, and surprised emotions.",
+    examples=[
+        ["example1.wav"],  # NOTE(review): placeholder example; presumably example1.wav must exist next to app.py - confirm
+    ]
 )
 demo.launch()