HaryaniAnjali committed
Commit e5b4dac · verified · 1 Parent(s): b2a2de0

Update app.py

Files changed (1)
  1. app.py +113 -73
app.py CHANGED
@@ -3,112 +3,152 @@ import torch
 import librosa
 import numpy as np
 import os
+import traceback

-# Define PyTorch model class (must match the structure used during conversion)
+# Define your PyTorch model class to match the conversion
 class EmotionClassifier(torch.nn.Module):
-    def __init__(self, input_shape, num_classes):
+    def __init__(self, input_features, hidden_sizes, num_classes):
         super().__init__()
-        # Adjust this architecture to match your converted model
-        self.flatten = torch.nn.Flatten()
-        self.layers = torch.nn.Sequential(
-            torch.nn.Linear(input_shape, 128),
-            torch.nn.ReLU(),
-            torch.nn.Dropout(0.3),
-            torch.nn.Linear(128, 64),
-            torch.nn.ReLU(),
-            torch.nn.Dropout(0.3),
-            torch.nn.Linear(64, num_classes)
-        )
+
+        # Build the sequential model
+        layers = []
+        prev_size = input_features
+
+        # Add hidden layers
+        for size in hidden_sizes:
+            layers.append(torch.nn.Linear(prev_size, size))
+            layers.append(torch.nn.ReLU())
+            prev_size = size
+
+        # Add output layer
+        layers.append(torch.nn.Linear(prev_size, num_classes))
+
+        # Create the model
+        self.model = torch.nn.Sequential(*layers)

     def forward(self, x):
-        x = self.flatten(x)
-        return self.layers(x)
+        return self.model(x)

-# Create model instance
-input_shape = 13 * 128  # n_mfcc * max_length
-num_classes = 7  # Number of emotions
-model = EmotionClassifier(input_shape, num_classes)
+# Define emotions list - make sure this matches your model's output classes
+emotions = ["neutral", "happy", "sad", "angry", "fearful", "disgust", "surprised", "calm"]  # Added "calm" as the 8th emotion based on your model

-# Load the saved model weights
-model_path = os.path.join(os.path.dirname(__file__), 'emotion_model.pt')
-model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
-model.eval()
+# Load the PyTorch model
+try:
+    print("Loading PyTorch model...")
+
+    # Parameters determined from the Keras model
+    input_features = 768  # From the Keras model's first layer weights
+    hidden_sizes = [256, 128, 64]  # From the Keras model architecture
+    num_classes = 8  # From the Keras model's output layer
+
+    model = EmotionClassifier(input_features, hidden_sizes, num_classes)
+
+    model_path = os.path.join(os.path.dirname(__file__), 'emotion_model.pt')
+    model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
+    model.eval()
+    print("Model loaded successfully")
+except Exception as e:
+    print(f"Error loading model: {e}")
+    traceback.print_exc()
+    model = None

-# Define emotions
-emotions = ["neutral", "happy", "sad", "angry", "fearful", "disgust", "surprised"]
-
-def extract_features(audio_path, sample_rate=16000, n_mfcc=13, max_length=128):
-    """Extract MFCC features from an audio file"""
+def extract_features(audio_path, sample_rate=16000):
+    """Extract features from an audio file that match what your model expects"""
     try:
+        print(f"Extracting features from {audio_path}")
         audio, sr = librosa.load(audio_path, sr=sample_rate)

+        # We need to extract features that match what your model was trained on
+        # Based on your model, it seems to expect 768 features
+        # Let's extract MFCCs, spectral features, and more to get a rich feature set
+
         # Extract MFCCs
-        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
+        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=20)
+        mfccs_mean = np.mean(mfccs.T, axis=0)
+        mfccs_var = np.var(mfccs.T, axis=0)
+
+        # Extract spectral features
+        chroma = librosa.feature.chroma_stft(y=audio, sr=sr)
+        chroma_mean = np.mean(chroma.T, axis=0)
+        chroma_var = np.var(chroma.T, axis=0)
+
+        # Extract mel spectrogram
+        mel = librosa.feature.melspectrogram(y=audio, sr=sr)
+        mel_mean = np.mean(mel.T, axis=0)
+        mel_var = np.var(mel.T, axis=0)

-        # Pad or truncate to fixed length
-        if mfccs.shape[1] < max_length:
-            pad_width = max_length - mfccs.shape[1]
-            mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
-        else:
-            mfccs = mfccs[:, :max_length]
+        # Extract spectral contrast
+        contrast = librosa.feature.spectral_contrast(y=audio, sr=sr)
+        contrast_mean = np.mean(contrast.T, axis=0)
+        contrast_var = np.var(contrast.T, axis=0)

-        return mfccs
+        # Combine all features
+        features = np.hstack([
+            mfccs_mean, mfccs_var,
+            chroma_mean, chroma_var,
+            mel_mean[:200], mel_var[:200],  # Limit to 200 features to avoid exceeding 768
+            contrast_mean, contrast_var
+        ])
+
+        # Ensure we have exactly 768 features
+        if len(features) < 768:
+            # Pad with zeros if needed
+            features = np.pad(features, (0, 768 - len(features)))
+        elif len(features) > 768:
+            # Truncate if too many
+            features = features[:768]
+
+        print(f"Extracted {len(features)} features")
+        return features
     except Exception as e:
-        print(f"Error in feature extraction: {e}")
+        print(f"Error extracting features: {e}")
+        traceback.print_exc()
         return None

 def predict_emotion(audio):
     """Predict emotion from audio input"""
+    if model is None:
+        return {emotion: 1/len(emotions) for emotion in emotions}
+
     try:
-        # Process audio input
+        print(f"Processing audio input: {type(audio)}")
+
+        # Process audio based on input type
         if isinstance(audio, str):  # File path
             features = extract_features(audio)
         else:  # Audio array from microphone
-            # Handle microphone input
-            if isinstance(audio, tuple):
-                audio_array, sample_rate = audio
-            else:
-                audio_array = audio
-                sample_rate = 16000
-
-            # Convert to mono if stereo
-            if len(np.array(audio_array).shape) > 1:
-                audio_array = np.mean(audio_array, axis=1)
-
-            # Extract features
-            mfccs = librosa.feature.mfcc(y=np.array(audio_array), sr=sample_rate, n_mfcc=13)
-
-            # Pad or truncate to fixed length
-            max_length = 128
-            if mfccs.shape[1] < max_length:
-                pad_width = max_length - mfccs.shape[1]
-                mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
-            else:
-                mfccs = mfccs[:, :max_length]
-
-            features = mfccs
+            # Save to a temporary file
+            import tempfile
+            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
+                if isinstance(audio, tuple):
+                    audio_array, sample_rate = audio
+                else:
+                    audio_array = audio
+                    sample_rate = 16000
+
+                import soundfile as sf
+                sf.write(temp_file.name, audio_array, sample_rate)
+            features = extract_features(temp_file.name)
+            # Clean up
+            os.remove(temp_file.name)

         if features is None:
-            return {emotion: 0.0 for emotion in emotions}
+            return {emotion: 1/len(emotions) for emotion in emotions}

-        # Flatten the features (adjust based on your model's input expectations)
-        features_flat = features.reshape(1, -1)
+        # Convert features to PyTorch tensor
+        features_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0)

-        # Convert to PyTorch tensor
-        features_tensor = torch.tensor(features_flat, dtype=torch.float32)
-
-        # Get predictions
+        # Make prediction
         with torch.no_grad():
             outputs = model(features_tensor)
             probabilities = torch.nn.functional.softmax(outputs, dim=1)

-        # Format results
+        # Format result
         result = {emotion: float(probabilities[0][i].item()) for i, emotion in enumerate(emotions)}
+        print(f"Prediction result: {result}")
         return result
-
     except Exception as e:
         print(f"Error in prediction: {e}")
-        import traceback
         traceback.print_exc()
         return {emotion: 1/len(emotions) for emotion in emotions}

@@ -116,9 +156,9 @@ def predict_emotion(audio):
 demo = gr.Interface(
     fn=predict_emotion,
     inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
-    outputs=gr.Label(num_top_classes=7),
+    outputs=gr.Label(num_top_classes=8),  # Updated to match the 8 emotions
     title="Speech Emotion Recognition",
-    description="Upload an audio file or record your voice to identify the emotion. This model can detect neutral, happy, sad, angry, fearful, disgust, and surprised emotions."
+    description="Upload an audio file or record your voice to identify the emotion. This model can detect neutral, happy, sad, angry, fearful, disgust, surprised, and calm emotions."
 )

 demo.launch()
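Note on the 768-feature input: with librosa's default settings (12 chroma bins, 128 mel bands, 6 spectral-contrast bands), the concatenated statistics in the new extract_features come to well under 768 values, so the zero-padding branch is the one that normally runs. The standalone sketch below mirrors that feature layout to show the arithmetic; it is not part of the commit, and "sample.wav" is a placeholder for any short local speech clip.

# Standalone sketch mirroring the feature layout of the updated extract_features()
# (not part of the commit). "sample.wav" stands in for any short speech recording.
import librosa
import numpy as np

audio, sr = librosa.load("sample.wav", sr=16000)

mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=20)        # 20 coefficients
chroma = librosa.feature.chroma_stft(y=audio, sr=sr)           # 12 chroma bins by default
mel = librosa.feature.melspectrogram(y=audio, sr=sr)           # 128 mel bands by default
contrast = librosa.feature.spectral_contrast(y=audio, sr=sr)   # 7 rows (6 bands + 1)

features = np.hstack([
    np.mean(mfccs.T, axis=0), np.var(mfccs.T, axis=0),
    np.mean(chroma.T, axis=0), np.var(chroma.T, axis=0),
    np.mean(mel.T, axis=0)[:200], np.var(mel.T, axis=0)[:200],  # only 128 values exist, so [:200] keeps them all
    np.mean(contrast.T, axis=0), np.var(contrast.T, axis=0),
])

print(len(features))   # 334 = 40 + 24 + 256 + 14 with librosa defaults
features = np.pad(features, (0, 768 - len(features)))  # app.py zero-pads up to 768
print(features.shape)  # (768,), matching input_features in EmotionClassifier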