Update app.py
app.py
CHANGED
@@ -160,9 +160,24 @@ else:
         progress = int((i / len(candidate_docs)) * 50) # First half of progress bar (0-50%)
         progress_bar.progress(progress)
 
-        # Process single document
-        result = score_pipe(doc)
-        scored_results.append(result[0])
+        # Process single document with truncation to avoid tensor size mismatch
+        try:
+            # Use the tokenizer to properly truncate the input
+            tokenizer = score_pipe.tokenizer
+            max_length = tokenizer.model_max_length # Usually 512 for RoBERTa
+
+            # Truncate the text using the tokenizer to ensure it fits
+            encoded_input = tokenizer(doc, truncation=True, max_length=max_length, return_tensors="pt")
+            # Decode back to text to get the truncated version
+            truncated_doc = tokenizer.decode(encoded_input["input_ids"][0], skip_special_tokens=True)
+
+            # Now process the truncated document
+            result = score_pipe(truncated_doc)
+            scored_results.append(result[0]) # Get the first result
+        except Exception as e:
+            st.warning(f"Error processing document {i}: {str(e)}")
+            # Add a placeholder result to maintain indexing
+            scored_results.append({"label": "ERROR", "score": 0})
 
         # Display occasional status updates for large datasets
         if i % max(1, len(candidate_docs) // 10) == 0:
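
Side note (not part of the commit): recent versions of transformers let a text-classification pipeline forward truncation arguments straight to its tokenizer, which would avoid the encode/decode round trip above; decoding and re-tokenizing runs the tokenizer twice and can subtly alter whitespace. A minimal sketch under that assumption, with a placeholder model since the Space's actual checkpoint is not shown in this diff:

from transformers import pipeline

# Placeholder checkpoint; substitute whatever model this Space actually loads.
score_pipe = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english")

doc = "some very long document " * 1000  # well past a 512-token window

# truncation/max_length are forwarded to the tokenizer, so over-long input
# is clipped to the model's window instead of raising a tensor size mismatch.
result = score_pipe(doc, truncation=True, max_length=512)
print(result[0])  # e.g. {'label': ..., 'score': ...}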