Spaces:

GenSEC-LLM
/

Post-ASR-LLM-Transcription-Correction

Running

App Files Files Community

huckiyang committed on Mar 14

Commit

d9795b9

1 Parent(s): 44ea2d4

Optimize the data loading

Browse files

Files changed (1) hide show

app.py +89 -94

app.py CHANGED Viewed

@@ -37,114 +37,107 @@ def preprocess_text(text):
     text = re.sub(r'\s+', ' ', text).strip()
     return text
 # Calculate WER for a group of examples
 def calculate_wer(examples):
     if not examples:
         return 0.0
     try:
-        # First, let's examine the first example in detail
-        if examples and len(examples) > 0:
             example = examples[0]
-            print("\n===== EXAMPLE DATA INSPECTION =====")
-            print(f"Keys in example: {example.keys()}")
-            # Try different possible field names
-            possible_reference_fields = ["transcription", "reference", "ground_truth", "target"]
-            possible_hypothesis_fields = ["input1", "hypothesis", "asr_output", "source_text"]
-            for field in possible_reference_fields:
-                if field in example:
-                    print(f"Reference field '{field}' found with value: {str(example[field])[:100]}...")
-            for field in possible_hypothesis_fields:
-                if field in example:
-                    print(f"Hypothesis field '{field}' found with value: {str(example[field])[:100]}...")
-        # Filter valid examples in a single pass
-        valid_pairs = []
-        for ex in examples:
             try:
-                # First try the expected field names
-                if "transcription" in ex and "input1" in ex:
-                    reference = ex["transcription"]
-                    hypothesis = ex["input1"]
-                # Try alternate field pairs if the standard ones don't exist
-                elif "transcription" in ex and "hypothesis_concatenated" in ex and ex["hypothesis_concatenated"]:
-                    reference = ex["transcription"]
-                    hypothesis = ex["hypothesis_concatenated"].split('.')[0]  # Take first sentence
-                elif "reference" in ex and "hypothesis" in ex:
-                    reference = ex["reference"]
-                    hypothesis = ex["hypothesis"]
-                else:
-                    continue  # Skip this example if we can't find matching fields
-                # Clean and preprocess the text
-                reference = preprocess_text(reference)
-                hypothesis = preprocess_text(hypothesis)
-                # Only add if both have valid content
                 if reference and hypothesis:
-                    valid_pairs.append((reference, hypothesis))
             except Exception as ex_error:
                 print(f"Error processing example: {str(ex_error)}")
                 continue
-        if not valid_pairs:
             print("No valid pairs found for WER calculation")
             return np.nan
-        # Print sample pairs for debugging
-        print(f"\nSample pair for WER calculation:")
-        print(f"Reference: '{valid_pairs[0][0]}'")
-        print(f"Hypothesis: '{valid_pairs[0][1]}'")
-        print(f"Total valid pairs: {len(valid_pairs)}")
-        # Make sure we have enough valid examples
-        if len(valid_pairs) < 5:
-            print("WARNING: Very few valid pairs for WER calculation")
-            if len(valid_pairs) < 2:
-                print("Not enough data for reliable WER calculation")
-                return np.nan
-        # Unzip the pairs
-        references, hypotheses = zip(*valid_pairs)
-        # Calculate WER with additional transforms
-        try:
-            # Set up transformation pipeline for jiwer
-            transformation = jiwer.Compose([
-                jiwer.ToLowerCase(),
-                jiwer.RemoveMultipleSpaces(),
-                jiwer.Strip(),
-                jiwer.RemovePunctuation(),
-                jiwer.ReduceToListOfWords()
-            ])
-            # Calculate WER with transformations
-            wer = jiwer.wer(
-                references,
-                hypotheses,
-                truth_transform=transformation,
-                hypothesis_transform=transformation
-            )
-            print(f"Successfully calculated WER: {wer}")
-            return wer
-        except Exception as wer_error:
-            print(f"Error calculating WER with jiwer: {str(wer_error)}")
-            # Fallback: Calculate character error rate manually for one sample
-            try:
-                if valid_pairs:
-                    ref = valid_pairs[0][0]
-                    hyp = valid_pairs[0][1]
-                    distance = jiwer.transforms.cer(ref, hyp)
-                    print(f"Fallback CER for first sample: {distance}")
-                return np.nan
-            except:
-                return np.nan
     except Exception as e:
         print(f"Error in calculate_wer: {str(e)}")
@@ -163,14 +156,14 @@ def get_wer_metrics(dataset):
         examples_by_source = {}
         # Process all examples
-        for ex in dataset:
             try:
                 source = ex.get("source", "unknown")
                 if source not in examples_by_source:
                     examples_by_source[source] = []
                 examples_by_source[source].append(ex)
             except Exception as e:
-                print(f"Error processing example: {str(e)}")
                 continue
         # Get all unique sources
@@ -186,7 +179,7 @@ def get_wer_metrics(dataset):
                 if count > 0:
                     print(f"\nCalculating WER for source {source} with {count} examples")
-                    wer = calculate_wer(examples[:100])  # Start with a sample for debugging
                 else:
                     wer = np.nan
@@ -207,9 +200,10 @@ def get_wer_metrics(dataset):
         try:
             total_count = len(dataset)
             print(f"\nCalculating overall WER with a sample of examples")
-            # Use a sample for overall calculation to avoid overloading
-            sample_size = min(1000, total_count)
-            overall_wer = calculate_wer(dataset[:sample_size])
             results.append({
                 "Source": "OVERALL",
@@ -218,6 +212,7 @@ def get_wer_metrics(dataset):
             })
         except Exception as e:
             print(f"Error calculating overall metrics: {str(e)}")
             results.append({
                 "Source": "OVERALL",
                 "Count": len(dataset),
@@ -294,4 +289,4 @@ with gr.Blocks(title="ASR Text Correction Test Leaderboard") as demo:
     refresh_btn.click(refresh_and_report, outputs=[leaderboard, error_output])
 if __name__ == "__main__":
-    demo.launch()

     text = re.sub(r'\s+', ' ', text).strip()
     return text
+# Simple WER calculation
def calculate_simple_wer(reference, hypothesis):
    """Return the word error rate (WER) of *hypothesis* against *reference*.

    Both strings are split on whitespace and compared with a word-level
    Levenshtein distance (substitutions, insertions and deletions each cost
    1). The distance is divided by the number of reference words, so the
    result can exceed 1.0 when the hypothesis contains many extra words.

    The distance is computed locally instead of importing a private,
    underscore-prefixed helper from jiwer: such helpers are not part of the
    library's public API, and a failed import would raise on every call.

    Args:
        reference: Ground-truth transcription text.
        hypothesis: Transcription text to be scored.

    Returns:
        The WER as a float. Returns 1.0 (maximum error) when either input is
        empty/None or when the reference contains no words.
    """
    if not reference or not hypothesis:
        return 1.0  # Maximum error if either is empty

    ref_words = reference.split()
    hyp_words = hypothesis.split()

    # A whitespace-only reference has no words to normalise by.
    if not ref_words:
        return 1.0

    # Two-row dynamic programme: previous_row[j] holds the edit distance
    # between the reference words consumed so far and hyp_words[:j].
    previous_row = list(range(len(hyp_words) + 1))
    for ref_index, ref_word in enumerate(ref_words, start=1):
        current_row = [ref_index]
        for hyp_index, hyp_word in enumerate(hyp_words, start=1):
            deletion = previous_row[hyp_index] + 1
            insertion = current_row[hyp_index - 1] + 1
            substitution = previous_row[hyp_index - 1] + (ref_word != hyp_word)
            current_row.append(min(deletion, insertion, substitution))
        previous_row = current_row

    return float(previous_row[-1]) / float(len(ref_words))
 # Calculate WER for a group of examples
 def calculate_wer(examples):
     if not examples:
         return 0.0
     try:
+        # Check if examples is a Dataset or a list
+        is_dataset = hasattr(examples, 'features')
+        # Get the first example for inspection
+        if is_dataset and len(examples) > 0:
             example = examples[0]
+        elif not is_dataset and len(examples) > 0:
+            example = examples[0]
+        else:
+            print("No examples found")
+            return np.nan
+        print("\n===== EXAMPLE DATA INSPECTION =====")
+        print(f"Keys in example: {example.keys()}")
+        # Try different possible field names
+        possible_reference_fields = ["transcription", "reference", "ground_truth", "target"]
+        possible_hypothesis_fields = ["input1", "hypothesis", "asr_output", "source_text"]
+        for field in possible_reference_fields:
+            if field in example:
+                print(f"Reference field '{field}' found with value: {str(example[field])[:100]}...")
+        for field in possible_hypothesis_fields:
+            if field in example:
+                print(f"Hypothesis field '{field}' found with value: {str(example[field])[:100]}...")
+        # Process each example in the dataset
+        wer_values = []
+        # Determine how to iterate based on type
+        items_to_process = examples
+        if is_dataset:
+            # Limit to first 200 examples for efficiency
+            items_to_process = examples.select(range(min(200, len(examples))))
+        else:
+            items_to_process = examples[:200]  # First 200 examples
+        for ex in items_to_process:
             try:
+                # Try to get transcription and input1
+                transcription = ex.get("transcription")
+                # First try input1, then use first element from hypothesis if available
+                input1 = ex.get("input1")
+                if input1 is None and "hypothesis" in ex and ex["hypothesis"]:
+                    if isinstance(ex["hypothesis"], list) and len(ex["hypothesis"]) > 0:
+                        input1 = ex["hypothesis"][0]
+                    elif isinstance(ex["hypothesis"], str):
+                        input1 = ex["hypothesis"]
+                # Skip if either field is missing
+                if not transcription or not input1:
+                    continue
+                # Clean the text
+                reference = preprocess_text(transcription)
+                hypothesis = preprocess_text(input1)
+                # Calculate WER for this pair
                 if reference and hypothesis:
+                    pair_wer = calculate_simple_wer(reference, hypothesis)
+                    wer_values.append(pair_wer)
             except Exception as ex_error:
                 print(f"Error processing example: {str(ex_error)}")
                 continue
+        # Calculate average WER
+        if not wer_values:
             print("No valid pairs found for WER calculation")
             return np.nan
+        avg_wer = np.mean(wer_values)
+        print(f"Calculated {len(wer_values)} pairs with average WER: {avg_wer:.4f}")
+        return avg_wer
     except Exception as e:
         print(f"Error in calculate_wer: {str(e)}")
         examples_by_source = {}
         # Process all examples
+        for i, ex in enumerate(dataset):
             try:
                 source = ex.get("source", "unknown")
                 if source not in examples_by_source:
                     examples_by_source[source] = []
                 examples_by_source[source].append(ex)
             except Exception as e:
+                print(f"Error processing example {i}: {str(e)}")
                 continue
         # Get all unique sources
                 if count > 0:
                     print(f"\nCalculating WER for source {source} with {count} examples")
+                    wer = calculate_wer(examples)  # Now handles both lists and datasets
                 else:
                     wer = np.nan
         try:
             total_count = len(dataset)
             print(f"\nCalculating overall WER with a sample of examples")
+            # Sample for calculation
+            sample_size = min(500, total_count)
+            sample_dataset = dataset.select(range(sample_size))
+            overall_wer = calculate_wer(sample_dataset)
             results.append({
                 "Source": "OVERALL",
             })
         except Exception as e:
             print(f"Error calculating overall metrics: {str(e)}")
+            print(traceback.format_exc())
             results.append({
                 "Source": "OVERALL",
                 "Count": len(dataset),
     refresh_btn.click(refresh_and_report, outputs=[leaderboard, error_output])
 if __name__ == "__main__":
+    demo.launch()