RonaldCeballos committed · Commit c5e79e0
1 Parent(s): 54985cb
Update app.py

app.py CHANGED
@@ -113,8 +113,63 @@ def load_model_from_hub(species_type, model_version):
         print(f"Error loading {species_type} - {model_version}: {str(e)}")
         return None, None
 
-def predict_species(species_type, model_version, audio_file):
-    """
+def predict_with_model(spec, model, classes):
+    """Predict species from spectrogram - Adapted from notebook"""
+    # Ensure correct dimensions (1025, 313)
+    if spec.shape != (1025, 313):
+        # Resize if needed
+        spec = resize_spectrogram(spec, (1025, 313))
+
+    # Preprocess for model - exactly as in notebook
+    arr = np.expand_dims(spec[..., np.newaxis], axis=0).astype('float32')
+    X = arr / np.max(arr)
+
+    # Predict
+    pred = model.predict(X, verbose=0)
+    pred_class_idx = np.argmax(pred)
+    pred_class = str(classes[pred_class_idx])
+    prob = float(pred[0][pred_class_idx])
+
+    return pred_class, prob
+
+def extract_chunks(audio_clean, sr, time=5):
+    """Extract audio chunks - Adapted from notebook's ext_chunks function"""
+    n_samples = sr * time
+    chunks = []
+    for i in range(0, len(audio_clean), n_samples):
+        start = i
+        end = i + n_samples
+
+        if end <= len(audio_clean):
+            chunk = audio_clean[start:end]
+        else:
+            # Circular padding - exactly as in notebook
+            faltan = end - len(audio_clean)
+            padding = audio_clean[:faltan]
+            chunk = np.concatenate([audio_clean[start:], padding])
+
+        chunks.append(chunk)
+    return np.array(chunks)
+
+def create_spectrogram(array_audio, n_fft=2048):
+    """Create spectrogram from audio array - Adapted from notebook's spectogram function"""
+    if isinstance(array_audio, np.ndarray):
+        dta = np.abs(librosa.stft(array_audio, n_fft=n_fft))
+        D = librosa.amplitude_to_db(dta, ref=np.max)
+    else:
+        dta = np.abs(librosa.stft(array_audio.numpy()))
+        D = librosa.amplitude_to_db(dta, ref=np.max)
+    return D
+
+def resize_spectrogram(spec, target_shape):
+    """Resize spectrogram to target shape"""
+    from scipy import ndimage
+    zoom_factors = (target_shape[0] / spec.shape[0], target_shape[1] / spec.shape[1])
+    resized = ndimage.zoom(spec, zoom_factors, order=1)
+    return resized
+
+def predict_species_all_chunks(species_type, model_version, audio_file):
+    """Main prediction function that processes all chunks"""
     global current_audio_path, current_audio_name
 
     if audio_file is None:
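Note on the new `extract_chunks`: the last window is filled by circular padding (wrapping back to the start of the clip) rather than zero-padding, so every chunk has exactly `sr * time` samples. A minimal standalone check of that behaviour, using a toy 11-sample signal and a made-up `sr=2`:

```python
import numpy as np

# Toy signal: 11 samples at a pretend sr=2 Hz, 5-second chunks -> n_samples=10.
audio = np.arange(11)
sr, time = 2, 5
n_samples = sr * time

chunks = []
for i in range(0, len(audio), n_samples):
    end = i + n_samples
    if end <= len(audio):
        chunk = audio[i:end]
    else:
        # Circular padding: reuse the head of the clip to fill the last window.
        padding = audio[:end - len(audio)]
        chunk = np.concatenate([audio[i:], padding])
    chunks.append(chunk)

print(np.array(chunks))
# [[ 0  1  2  3  4  5  6  7  8  9]
#  [10  0  1  2  3  4  5  6  7  8]]
```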
@@ -123,7 +178,6 @@ def predict_species(species_type, model_version, audio_file):
     try:
         # Store the current audio path and name for feedback
         current_audio_path = audio_file
-        # Extract the original audio file name
         current_audio_name = os.path.basename(audio_file)
 
         # Load model
@@ -131,24 +185,23 @@ def predict_species(species_type, model_version, audio_file):
         if model is None or classes is None:
             return pd.DataFrame({"Error": [f"Could not load {species_type} - {model_version} model"]})
 
-        # Process audio
+        # Process audio - using notebook approach
         wav, sr = torchaudio.load(audio_file)
         wav = wav.mean(dim=0)  # Convert to mono
 
-        # Extract 5-second chunks
+        # Extract 5-second chunks using notebook function
         chunks = extract_chunks(wav.numpy(), sr, time=5)
 
         results = []
 
         for i, chunk in enumerate(chunks):
-            # Create spectrogram
+            # Create spectrogram using notebook function
             spectrogram = create_spectrogram(chunk)
 
-            # Normalize
-
-            spectrogram = (spectrogram - np.mean(spectrogram)) / np.std(spectrogram)
+            # Normalize exactly as in notebook
+            spectrogram = (spectrogram - np.mean(spectrogram)) / np.std(spectrogram)
 
-            # Predict
+            # Predict using adapted notebook function
             species, confidence = predict_with_model(spectrogram, model, classes)
 
             time_start = i * 5
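Why `predict_with_model` insists on shape (1025, 313): `librosa.stft` with `n_fft=2048` returns `2048 // 2 + 1 = 1025` frequency bins, and a centered STFT over one chunk yields `1 + n_samples // hop_length` frames. With librosa's default hop of `n_fft // 4 = 512`, a 5-second chunk gives exactly 313 frames when the audio is sampled at 32 kHz — the sample rate is an assumption here, as the diff never states it:

```python
n_fft = 2048
hop = n_fft // 4           # librosa's default hop_length
sr = 32_000                # assumed sample rate; not stated in the diff
n_samples = sr * 5         # one 5-second chunk

freq_bins = n_fft // 2 + 1       # 1025 frequency bins
frames = 1 + n_samples // hop    # centered STFT -> 313 frames

print(freq_bins, frames)  # 1025 313
```

Chunks recorded at any other rate would miss the target shape and go through `resize_spectrogram` instead.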
@@ -174,56 +227,85 @@ def predict_species(species_type, model_version, audio_file):
         print(f"Prediction error: {str(e)}")
         return pd.DataFrame({"Error": [f"Error during analysis: {str(e)}"]})
 
-def extract_chunks(audio_clean, sr, time=5):
-    """
-
-    chunks = []
-    for i in range(0, len(audio_clean), n_samples):
-        start = i
-        end = i + n_samples
-
-        if end <= len(audio_clean):
-            chunk = audio_clean[start:end]
-        else:
-            # Circular padding
-            missing = end - len(audio_clean)
-            padding = audio_clean[:missing]
-            chunk = np.concatenate([audio_clean[start:], padding])
-
-        chunks.append(chunk)
-    return np.array(chunks)
-
-def create_spectrogram(array_audio, n_fft=2048):
-    """Create spectrogram from audio array"""
-    dta = np.abs(librosa.stft(array_audio, n_fft=n_fft))
-    D = librosa.amplitude_to_db(dta, ref=np.max)
-    return D
-
-def predict_with_model(spec, model, classes):
-    """Predict species from spectrogram"""
-    # Ensure correct dimensions (1025, 313)
-    if spec.shape != (1025, 313):
-        # Resize if needed
-        spec = resize_spectrogram(spec, (1025, 313))
+def predict_species_final(species_type, model_version, audio_file):
+    """Enhanced prediction with voting system across chunks"""
+    global current_audio_path, current_audio_name
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if audio_file is None:
+        return pd.DataFrame({"Info": ["Please upload an audio file"]})
+
+    try:
+        current_audio_path = audio_file
+        current_audio_name = os.path.basename(audio_file)
+
+        # Load model
+        model, classes = load_model_from_hub(species_type, model_version)
+        if model is None or classes is None:
+            return pd.DataFrame({"Error": [f"Could not load {species_type} - {model_version} model"]})
+
+        # Process audio
+        wav, sr = torchaudio.load(audio_file)
+        wav = wav.mean(dim=0)
+
+        # Extract chunks
+        chunks = extract_chunks(wav.numpy(), sr, time=5)
+
+        results = []
+        species_votes = {}
+
+        for i, chunk in enumerate(chunks):
+            # Create and normalize spectrogram
+            spectrogram = create_spectrogram(chunk)
+            spectrogram = (spectrogram - np.mean(spectrogram)) / np.std(spectrogram)
+
+            # Predict
+            species, confidence = predict_with_model(spectrogram, model, classes)
+
+            # Count votes for final prediction
+            if species in species_votes:
+                species_votes[species] += confidence
+            else:
+                species_votes[species] = confidence
+
+            time_start = i * 5
+            time_end = (i + 1) * 5
+
+            results.append({
+                'Segment': f'{i+1}',
+                'Time': f'{time_start}s - {time_end}s',
+                'Species': species,
+                'Confidence': f'{confidence:.1%}'
+            })
+
+        # Determine final prediction
+        if species_votes:
+            final_species = max(species_votes, key=species_votes.get)
+            final_confidence = species_votes[final_species] / len(chunks)
+
+            # Add final prediction row
+            final_row = pd.DataFrame({
+                'Segment': ['FINAL'],
+                'Time': ['Full Audio'],
+                'Species': [final_species],
+                'Confidence': [f'{final_confidence:.1%}']
+            })
+
+            results_df = pd.concat([pd.DataFrame(results), final_row], ignore_index=True)
+        else:
+            results_df = pd.DataFrame(results)
+
+        # Clean memory
+        del model
+        gc.collect()
+
+        if results_df.empty:
+            return pd.DataFrame({"Info": ["No valid segments detected in the audio"]})
+
+        return results_df
+
+    except Exception as e:
+        print(f"Prediction error: {str(e)}")
+        return pd.DataFrame({"Error": [f"Error during analysis: {str(e)}"]})
 
 def save_feedback_to_dataset(audio_file_path, original_audio_name, feedback_text, consent_given, species_type, model_version, results_df):
     """Save audio and feedback to private Hugging Face dataset"""
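The voting in `predict_species_final` is confidence-weighted: each chunk adds its confidence to its predicted class, the class with the largest sum wins, and that sum divided by the chunk count is reported as the final confidence. A minimal sketch of the aggregation with made-up per-chunk predictions:

```python
# Hypothetical per-chunk predictions: (species, confidence)
chunk_preds = [("sparrow", 0.9), ("sparrow", 0.8), ("crow", 0.95)]

species_votes = {}
for species, confidence in chunk_preds:
    species_votes[species] = species_votes.get(species, 0.0) + confidence

final_species = max(species_votes, key=species_votes.get)
final_confidence = species_votes[final_species] / len(chunk_preds)

print(final_species, f"{final_confidence:.1%}")  # sparrow 56.7%
```

Summing rather than averaging per class means a species predicted in many chunks can beat one predicted once with high confidence, which is usually the right call for a whole recording.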
@@ -341,6 +423,7 @@ with gr.Blocks(
     css="""
     .gradio-container { max-width: 1200px; margin: auto; }
     .consent-text { font-size: 0.9em; color: #666; }
+    .final-prediction { background-color: #e8f5e8 !important; font-weight: bold; }
     """
 ) as demo:
 
@@ -348,10 +431,16 @@ with gr.Blocks(
     current_results = gr.State(value=pd.DataFrame())
 
     gr.Markdown("""
-
+    ## Species Audio Classifier
     **Upload an audio file to identify species using AI models**
 
-    *Models are loaded from: [RonaldCeballos/SpeciesClassifiers](https://huggingface.co/RonaldCeballos/SpeciesClassifiers)*
+    *Based on your notebook implementation - Models are loaded from: [RonaldCeballos/SpeciesClassifiers](https://huggingface.co/RonaldCeballos/SpeciesClassifiers)*
+
+    🔍 **How it works:**
+    - Audio is split into 5-second segments
+    - Each segment is converted to a spectrogram
+    - AI model predicts species for each segment
+    - Final prediction is based on voting across all segments
     """)
 
     with gr.Row():
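The "How it works" list added above mirrors the call path wired up at the bottom of the file: the button hands the selector values and the uploaded file to `predict_species_final`, which returns per-segment rows plus a `FINAL` row. A hypothetical direct call (the argument values are invented for illustration):

```python
# Hypothetical inputs; in the app these come from the Gradio selectors
# and the audio uploader.
df = predict_species_final("Birds", "v1", "example_recording.wav")
print(df.tail(1))  # the appended 'FINAL' row with the voted species
```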
@@ -386,10 +475,11 @@ with gr.Blocks(
             gr.Markdown("""
             ### 💡 Instructions:
             1. Select species category
-            2. Choose model version
+            2. Choose model version
             3. Upload audio file (WAV, MP3, etc.)
             4. Click "Analyze Audio"
             5. Review results by 5-second segments
+            6. Final prediction shown at the bottom
             """)
 
         with gr.Column(scale=2):
@@ -399,12 +489,14 @@ with gr.Blocks(
                 label="🎧 Analyzed Chunks",
                 headers=["Chunks", "Time", "Species", "Confidence"],
                 wrap=True,
-                max_height=
+                max_height=500,
+                datatype=["str", "str", "str", "str"]
             )
 
             with gr.Accordion("💬 Submit Feedback for Model Improvement", open=False):
                 gr.Markdown("""
                 **Help us improve!** Submit your audio and feedback to our private dataset for model training.
+                *Using the same approach as your notebook implementation*
                 """)
 
                 consent_checkbox = gr.Checkbox(
@@ -426,7 +518,7 @@ with gr.Blocks(
 
     # Event handlers
     predict_btn.click(
-        fn=
+        fn=predict_species_final,  # Using the enhanced version with voting
         inputs=[species_selector, model_selector, audio_input],
         outputs=results_display
     ).then(
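`predict_btn.click(...).then(...)` chains a follow-up step that runs once the prediction has rendered; the diff cuts off before the `.then()` body, so the continuation below is a sketch only — piping the table into the `current_results` state is an assumption, not necessarily what the file does:

```python
# Gradio event chaining; the .then() body is not shown in the diff,
# so this continuation is hypothetical.
predict_btn.click(
    fn=predict_species_final,
    inputs=[species_selector, model_selector, audio_input],
    outputs=results_display,
).then(
    fn=lambda df: df,           # assumed pass-through
    inputs=results_display,
    outputs=current_results,    # cache results in gr.State for the feedback form
)
```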