HaryaniAnjali committed
Commit f1445b2 · verified · 1 Parent(s): 7431bd1

Update app.py

Files changed (1)
  1. app.py +83 -27
app.py CHANGED
@@ -1,43 +1,99 @@
  import gradio as gr
- from transformers import pipeline, AutoFeatureExtractor, AutoModelForAudioClassification
- import torch
  import librosa
  import numpy as np

- # Load model and feature extractor
- model_id = "your-username/speech-emotion-recognition-model"
- feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)
- model = AutoModelForAudioClassification.from_pretrained(model_id)

- # Define emotions
  emotions = ["neutral", "happy", "sad", "angry", "fearful", "disgust", "surprised"]

- def predict_emotion(audio_path):
-     # Load audio
-     audio, sampling_rate = librosa.load(audio_path, sr=16000)
-
-     # Process through feature extractor
-     inputs = feature_extractor(audio, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
-
-     # Get prediction
-     with torch.no_grad():
-         outputs = model(**inputs)
-         probs = torch.nn.functional.softmax(outputs.logits, dim=1)
-         predicted_class_id = torch.argmax(probs, dim=1).item()
-         predicted_label = emotions[predicted_class_id]
-         confidence = probs[0][predicted_class_id].item()

-     # Return result
-     result = {emotion: float(probs[0][i].item()) for i, emotion in enumerate(emotions)}
-     return result

- # Create Gradio interface
  demo = gr.Interface(
      fn=predict_emotion,
-     inputs=gr.Audio(source="microphone", type="filepath"),
      outputs=gr.Label(num_top_classes=7),
      title="Speech Emotion Recognition",
-     description="Upload audio or record your voice to identify the emotion. This model can detect neutral, happy, sad, angry, fearful, disgust, and surprised emotions."
  )

  demo.launch()
 
  import gradio as gr
+ import tensorflow as tf
  import librosa
  import numpy as np
+ import os

+ # Load the model directly from the .h5 file
+ model_path = os.path.join(os.path.dirname(__file__), 'wav2vec_model.h5')
+ model = tf.keras.models.load_model(model_path)

+ # Define emotions list
  emotions = ["neutral", "happy", "sad", "angry", "fearful", "disgust", "surprised"]

+ def extract_features(audio_path, sample_rate=16000, n_mfcc=13, max_length=128):
+     """Extract MFCC features from an audio file"""
+     try:
+         audio, sr = librosa.load(audio_path, sr=sample_rate)
+
+         # Extract MFCCs
+         mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
+
+         # Pad or truncate to fixed length
+         if mfccs.shape[1] < max_length:
+             pad_width = max_length - mfccs.shape[1]
+             mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
+         else:
+             mfccs = mfccs[:, :max_length]
+
+         return mfccs
+     except Exception as e:
+         print(f"Error in feature extraction: {e}")
+         return None
+
+ def predict_emotion(audio):
+     """Predict emotion from audio input

+     This function accepts both file path (when uploading) and audio array
+     (when recording via microphone) as input
+     """
+     try:
+         # Check if audio is a file path or audio array
+         if isinstance(audio, str):  # File path
+             features = extract_features(audio)
+         else:  # Audio array from microphone
+             # If audio is a tuple (audio array, sample rate)
+             if isinstance(audio, tuple):
+                 audio_array, sample_rate = audio
+             else:
+                 # If only audio array is provided, assume sample rate
+                 audio_array = audio
+                 sample_rate = 16000
+
+             # Convert to mono if stereo
+             if len(audio_array.shape) > 1:
+                 audio_array = np.mean(audio_array, axis=1)
+
+             # Extract features
+             mfccs = librosa.feature.mfcc(y=audio_array, sr=sample_rate, n_mfcc=13)
+
+             # Pad or truncate to fixed length
+             max_length = 128
+             if mfccs.shape[1] < max_length:
+                 pad_width = max_length - mfccs.shape[1]
+                 mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
+             else:
+                 mfccs = mfccs[:, :max_length]
+
+             features = mfccs
+
+         if features is None:
+             return {emotion: 0.0 for emotion in emotions}
+
+         # Reshape for model input
+         features = np.expand_dims(features, axis=0)
+
+         # Make prediction
+         predictions = model.predict(features)
+
+         # Format results
+         result = {emotion: float(predictions[0][i]) for i, emotion in enumerate(emotions)}
+         return result
+
+     except Exception as e:
+         print(f"Error in prediction: {e}")
+         return {emotion: 0.0 for emotion in emotions}

+ # Create Gradio interface with both file upload and microphone
  demo = gr.Interface(
      fn=predict_emotion,
+     inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
      outputs=gr.Label(num_top_classes=7),
      title="Speech Emotion Recognition",
+     description="Upload an audio file or record your voice to identify the emotion. This model can detect neutral, happy, sad, angry, fearful, disgust, and surprised emotions.",
+     examples=[
+         ["example1.wav"],  # Add example files here if you have them
+     ]
  )

  demo.launch()
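
For context: both extract_features and the microphone branch of predict_emotion reduce every input to a fixed-size MFCC matrix (13 coefficients by 128 frames) and add a batch axis before calling model.predict. A minimal standalone sketch of that shape contract, independent of the bundled wav2vec_model.h5 (the random audio below is only a stand-in for real speech and is not part of this commit):

import numpy as np
import librosa

# Reproduce the preprocessing from app.py: 13 MFCCs, padded/truncated to 128 frames.
sr = 16000
audio = np.random.randn(sr * 2).astype(np.float32)  # 2 s of noise as a stand-in for speech
mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
if mfccs.shape[1] < 128:
    mfccs = np.pad(mfccs, ((0, 0), (0, 128 - mfccs.shape[1])), mode="constant")
else:
    mfccs = mfccs[:, :128]
features = np.expand_dims(mfccs, axis=0)
print(features.shape)  # (1, 13, 128): the shape handed to model.predict in app.py

Any model loaded from wav2vec_model.h5 would therefore need an input layer compatible with (13, 128) feature matrices; the file itself is not included in this diff, so that compatibility is an assumption.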