Commit
·
ffe6715
1
Parent(s):
0243855
document extraction process updated
Browse files- processors/document_extractor.py +232 -138
- requirements.txt +1 -0
processors/document_extractor.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
# DEPENDENCIES
|
| 2 |
import io
|
| 3 |
import os
|
|
|
|
| 4 |
import mimetypes
|
| 5 |
from typing import Any
|
| 6 |
from typing import Dict
|
|
@@ -14,16 +15,37 @@ from dataclasses import dataclass
|
|
| 14 |
|
| 15 |
# Document processing libraries
|
| 16 |
try:
|
| 17 |
-
|
| 18 |
-
import
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
except ImportError:
|
| 21 |
-
logger.warning("
|
| 22 |
-
|
| 23 |
|
| 24 |
try:
|
| 25 |
from docx import Document as DocxDocument
|
| 26 |
DOCX_AVAILABLE = True
|
|
|
|
| 27 |
except ImportError:
|
| 28 |
logger.warning("python-docx not available. Install: pip install python-docx")
|
| 29 |
DOCX_AVAILABLE = False
|
|
@@ -31,6 +53,7 @@ except ImportError:
|
|
| 31 |
try:
|
| 32 |
import chardet
|
| 33 |
CHARDET_AVAILABLE = True
|
|
|
|
| 34 |
except ImportError:
|
| 35 |
logger.warning("chardet not available. Install: pip install chardet")
|
| 36 |
CHARDET_AVAILABLE = False
|
|
@@ -38,6 +61,7 @@ except ImportError:
|
|
| 38 |
try:
|
| 39 |
from bs4 import BeautifulSoup
|
| 40 |
BS4_AVAILABLE = True
|
|
|
|
| 41 |
except ImportError:
|
| 42 |
logger.warning("BeautifulSoup not available. Install: pip install beautifulsoup4")
|
| 43 |
BS4_AVAILABLE = False
|
|
@@ -82,7 +106,7 @@ class DocumentExtractor:
|
|
| 82 |
|
| 83 |
Supported Formats:
|
| 84 |
- Plain text (.txt, .md, .log)
|
| 85 |
-
- PDF documents (.pdf)
|
| 86 |
- Microsoft Word (.doc, .docx)
|
| 87 |
- Rich Text Format (.rtf)
|
| 88 |
- HTML files (.html, .htm)
|
|
@@ -105,20 +129,17 @@ class DocumentExtractor:
|
|
| 105 |
MAX_FILE_SIZE = 50 * 1024 * 1024
|
| 106 |
|
| 107 |
|
| 108 |
-
def __init__(self, max_file_size: int = MAX_FILE_SIZE,
|
| 109 |
"""
|
| 110 |
Initialize document extractor
|
| 111 |
|
| 112 |
Arguments:
|
| 113 |
----------
|
| 114 |
-
max_file_size : Maximum file size in bytes
|
| 115 |
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
extract_metadata : Extract document metadata
|
| 119 |
"""
|
| 120 |
self.max_file_size = max_file_size
|
| 121 |
-
self.prefer_pdfplumber = prefer_pdfplumber
|
| 122 |
self.extract_metadata = extract_metadata
|
| 123 |
|
| 124 |
logger.info(f"DocumentExtractor initialized (max_size={max_file_size/1024/1024:.1f}MB)")
|
|
@@ -176,7 +197,7 @@ class DocumentExtractor:
|
|
| 176 |
result.file_path = str(file_path)
|
| 177 |
result.file_size_bytes = file_size
|
| 178 |
|
| 179 |
-
logger.info(f"Extracted {len(result.text)} chars from {file_path.name}")
|
| 180 |
return result
|
| 181 |
|
| 182 |
except Exception as e:
|
|
@@ -192,15 +213,15 @@ class DocumentExtractor:
|
|
| 192 |
|
| 193 |
Arguments:
|
| 194 |
----------
|
| 195 |
-
file_bytes : File content as bytes
|
| 196 |
|
| 197 |
-
filename
|
| 198 |
|
| 199 |
-
mime_type : MIME type (optional)
|
| 200 |
|
| 201 |
Returns:
|
| 202 |
--------
|
| 203 |
-
ExtractedDocument object
|
| 204 |
"""
|
| 205 |
try:
|
| 206 |
# Determine file type
|
|
@@ -244,7 +265,7 @@ class DocumentExtractor:
|
|
| 244 |
return result
|
| 245 |
|
| 246 |
except Exception as e:
|
| 247 |
-
logger.error(f"Error extracting from bytes: {e}")
|
| 248 |
return self._create_error_result(file_path = filename,
|
| 249 |
error = repr(e),
|
| 250 |
)
|
|
@@ -264,7 +285,8 @@ class DocumentExtractor:
|
|
| 264 |
with open(file_path, 'rb') as f:
|
| 265 |
raw_data = f.read()
|
| 266 |
detected = chardet.detect(raw_data)
|
| 267 |
-
|
|
|
|
| 268 |
encoding = detected['encoding']
|
| 269 |
logger.debug(f"Detected encoding: {encoding} (confidence: {detected['confidence']})")
|
| 270 |
|
|
@@ -340,20 +362,58 @@ class DocumentExtractor:
|
|
| 340 |
|
| 341 |
def _extract_pdf(self, file_path: Path) -> ExtractedDocument:
|
| 342 |
"""
|
| 343 |
-
Extract text from PDF files
|
| 344 |
"""
|
| 345 |
-
if not
|
| 346 |
-
return self._create_error_result(file_path = (file_path),
|
| 347 |
-
error = "PDF libraries not installed",
|
| 348 |
)
|
| 349 |
|
| 350 |
-
warnings
|
| 351 |
-
text
|
| 352 |
-
page_count
|
| 353 |
-
metadata
|
|
|
|
| 354 |
|
| 355 |
-
# Try
|
| 356 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 357 |
try:
|
| 358 |
with pdfplumber.open(file_path) as pdf:
|
| 359 |
page_count = len(pdf.pages)
|
|
@@ -365,74 +425,124 @@ class DocumentExtractor:
|
|
| 365 |
if page_text:
|
| 366 |
text += page_text + "\n\n"
|
| 367 |
|
|
|
|
|
|
|
| 368 |
if text.strip():
|
|
|
|
| 369 |
return ExtractedDocument(text = text.strip(),
|
| 370 |
file_path = str(file_path),
|
| 371 |
file_type = '.pdf',
|
| 372 |
file_size_bytes = file_path.stat().st_size,
|
| 373 |
page_count = page_count,
|
| 374 |
-
extraction_method =
|
| 375 |
metadata = metadata,
|
| 376 |
is_success = True,
|
| 377 |
error_message = None,
|
| 378 |
warnings = warnings,
|
| 379 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 380 |
except Exception as e:
|
| 381 |
warnings.append(f"pdfplumber failed: {repr(e)}, trying PyPDF2")
|
| 382 |
|
| 383 |
-
# Fallback
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 388 |
|
| 389 |
-
|
| 390 |
-
metadata = reader.metadata or {}
|
| 391 |
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
return self._create_error_result(file_path = str(file_path),
|
| 415 |
-
error = repr(e),
|
| 416 |
-
)
|
| 417 |
|
| 418 |
|
| 419 |
def _extract_pdf_bytes(self, file_bytes: bytes, filename: str) -> ExtractedDocument:
|
| 420 |
"""
|
| 421 |
-
Extract text from PDF bytes
|
| 422 |
"""
|
| 423 |
-
if not
|
| 424 |
return self._create_error_result(file_path = filename,
|
| 425 |
error = "PDF libraries not installed",
|
| 426 |
)
|
| 427 |
|
| 428 |
-
warnings
|
| 429 |
-
text
|
| 430 |
-
page_count
|
| 431 |
-
metadata
|
|
|
|
| 432 |
|
| 433 |
try:
|
| 434 |
-
# Try
|
| 435 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 436 |
try:
|
| 437 |
with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
|
| 438 |
page_count = len(pdf.pages)
|
|
@@ -444,47 +554,61 @@ class DocumentExtractor:
|
|
| 444 |
if page_text:
|
| 445 |
text += page_text + "\n\n"
|
| 446 |
|
|
|
|
|
|
|
| 447 |
if text.strip():
|
| 448 |
return ExtractedDocument(text = text.strip(),
|
| 449 |
file_path = filename,
|
| 450 |
file_type = '.pdf',
|
| 451 |
file_size_bytes = len(file_bytes),
|
| 452 |
page_count = page_count,
|
| 453 |
-
extraction_method =
|
| 454 |
metadata = metadata,
|
| 455 |
is_success = True,
|
| 456 |
error_message = None,
|
| 457 |
warnings = warnings,
|
| 458 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 459 |
except Exception as e:
|
| 460 |
warnings.append(f"pdfplumber failed: {repr(e)}, trying PyPDF2")
|
| 461 |
|
| 462 |
-
# Fallback
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
for page in reader.pages:
|
| 467 |
-
page_text = page.extract_text()
|
| 468 |
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 483 |
|
| 484 |
except Exception as e:
|
| 485 |
return self._create_error_result(file_path = filename,
|
| 486 |
error = repr(e),
|
| 487 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 488 |
|
| 489 |
|
| 490 |
def _extract_word(self, file_path: Path) -> ExtractedDocument:
|
|
@@ -519,7 +643,7 @@ class DocumentExtractor:
|
|
| 519 |
file_path = str(file_path),
|
| 520 |
file_type = file_path.suffix,
|
| 521 |
file_size_bytes = file_path.stat().st_size,
|
| 522 |
-
page_count = len(paragraphs),
|
| 523 |
extraction_method = 'python-docx',
|
| 524 |
metadata = metadata,
|
| 525 |
is_success = True,
|
|
@@ -659,7 +783,7 @@ class DocumentExtractor:
|
|
| 659 |
script.decompose()
|
| 660 |
|
| 661 |
# Get text
|
| 662 |
-
text = soup.get_text(separator='\n')
|
| 663 |
|
| 664 |
# Clean up whitespace
|
| 665 |
lines = (line.strip() for line in text.splitlines())
|
|
@@ -736,6 +860,7 @@ class DocumentExtractor:
|
|
| 736 |
|
| 737 |
# Check file size
|
| 738 |
file_size = file_path.stat().st_size
|
|
|
|
| 739 |
if (file_size > self.max_file_size):
|
| 740 |
return False, f"File too large: {file_size/1024/1024:.1f}MB (max: {self.max_file_size/1024/1024:.1f}MB)"
|
| 741 |
|
|
@@ -764,21 +889,22 @@ class DocumentExtractor:
|
|
| 764 |
|
| 765 |
|
| 766 |
# Convenience Functions
|
| 767 |
-
|
| 768 |
def extract_text(file_path: str, **kwargs) -> ExtractedDocument:
|
| 769 |
"""
|
| 770 |
Quick text extraction with default settings
|
| 771 |
|
| 772 |
Arguments:
|
| 773 |
----------
|
| 774 |
-
file_path : Path to document
|
| 775 |
-
|
|
|
|
| 776 |
|
| 777 |
Returns:
|
| 778 |
--------
|
| 779 |
-
ExtractedDocument object
|
| 780 |
"""
|
| 781 |
extractor = DocumentExtractor(**kwargs)
|
|
|
|
| 782 |
return extractor.extract(file_path)
|
| 783 |
|
| 784 |
|
|
@@ -788,56 +914,24 @@ def extract_from_upload(file_bytes: bytes, filename: str, **kwargs) -> Extracted
|
|
| 788 |
|
| 789 |
Arguments:
|
| 790 |
----------
|
| 791 |
-
file_bytes : File content as bytes
|
| 792 |
-
|
| 793 |
-
|
|
|
|
|
|
|
| 794 |
|
| 795 |
Returns:
|
| 796 |
--------
|
| 797 |
-
ExtractedDocument object
|
| 798 |
"""
|
| 799 |
extractor = DocumentExtractor(**kwargs)
|
|
|
|
| 800 |
return extractor.extract_from_bytes(file_bytes, filename)
|
| 801 |
|
| 802 |
|
| 803 |
# Export
|
| 804 |
-
__all__ = ['
|
|
|
|
| 805 |
'ExtractedDocument',
|
| 806 |
-
'extract_text',
|
| 807 |
'extract_from_upload',
|
| 808 |
]
|
| 809 |
-
|
| 810 |
-
|
| 811 |
-
# Testing
|
| 812 |
-
if __name__ == "__main__":
|
| 813 |
-
import sys
|
| 814 |
-
|
| 815 |
-
if len(sys.argv) > 1:
|
| 816 |
-
# Test with provided file
|
| 817 |
-
test_file = sys.argv[1]
|
| 818 |
-
print(f"Testing extraction on: {test_file}")
|
| 819 |
-
print("=" * 70)
|
| 820 |
-
|
| 821 |
-
result = extract_text(test_file)
|
| 822 |
-
|
| 823 |
-
print(f"Success: {result.is_success}")
|
| 824 |
-
print(f"File type: {result.file_type}")
|
| 825 |
-
print(f"Pages: {result.page_count}")
|
| 826 |
-
print(f"Method: {result.extraction_method}")
|
| 827 |
-
print(f"Text length: {len(result.text)} chars")
|
| 828 |
-
|
| 829 |
-
if result.warnings:
|
| 830 |
-
print(f"Warnings: {result.warnings}")
|
| 831 |
-
|
| 832 |
-
if result.error_message:
|
| 833 |
-
print(f"Error: {result.error_message}")
|
| 834 |
-
|
| 835 |
-
if result.text:
|
| 836 |
-
print(f"\nFirst 500 chars:")
|
| 837 |
-
print("-" * 70)
|
| 838 |
-
print(result.text[:500])
|
| 839 |
-
else:
|
| 840 |
-
print("Usage: python document_extractor.py <file_path>")
|
| 841 |
-
print("\nSupported formats:")
|
| 842 |
-
for ext in sorted(DocumentExtractor.SUPPORTED_EXTENSIONS):
|
| 843 |
-
print(f" {ext}")
|
|
|
|
| 1 |
# DEPENDENCIES
|
| 2 |
import io
|
| 3 |
import os
|
| 4 |
+
import re
|
| 5 |
import mimetypes
|
| 6 |
from typing import Any
|
| 7 |
from typing import Dict
|
|
|
|
| 15 |
|
| 16 |
# Document processing libraries
|
| 17 |
try:
|
| 18 |
+
# PyMuPDF - Primary PDF Extractor
|
| 19 |
+
import fitz
|
| 20 |
+
PYPDF_AVAILABLE = True
|
| 21 |
+
logger.info("PyMuPDF available for high-quality PDF extraction")
|
| 22 |
+
|
| 23 |
+
except ImportError:
|
| 24 |
+
logger.warning("PyMuPDF not available. Install: pip install PyMuPDF")
|
| 25 |
+
PYPDF_AVAILABLE = False
|
| 26 |
+
|
| 27 |
+
try:
|
| 28 |
+
# pdfplumber - Fallback 1 PDF Extractor
|
| 29 |
+
import pdfplumber
|
| 30 |
+
PDFPLUMBER_AVAILABLE = True
|
| 31 |
+
|
| 32 |
+
except ImportError:
|
| 33 |
+
logger.warning("pdfplumber not available. Install: pip install pdfplumber")
|
| 34 |
+
PDFPLUMBER_AVAILABLE = False
|
| 35 |
+
|
| 36 |
+
try:
|
| 37 |
+
# PyPDF2 - Fallback 2 PDF Extractor
|
| 38 |
+
import PyPDF2
|
| 39 |
+
PYPDF2_AVAILABLE = True
|
| 40 |
+
|
| 41 |
except ImportError:
|
| 42 |
+
logger.warning("PyPDF2 not available. Install: pip install PyPDF2")
|
| 43 |
+
PYPDF2_AVAILABLE = False
|
| 44 |
|
| 45 |
try:
|
| 46 |
from docx import Document as DocxDocument
|
| 47 |
DOCX_AVAILABLE = True
|
| 48 |
+
|
| 49 |
except ImportError:
|
| 50 |
logger.warning("python-docx not available. Install: pip install python-docx")
|
| 51 |
DOCX_AVAILABLE = False
|
|
|
|
| 53 |
try:
|
| 54 |
import chardet
|
| 55 |
CHARDET_AVAILABLE = True
|
| 56 |
+
|
| 57 |
except ImportError:
|
| 58 |
logger.warning("chardet not available. Install: pip install chardet")
|
| 59 |
CHARDET_AVAILABLE = False
|
|
|
|
| 61 |
try:
|
| 62 |
from bs4 import BeautifulSoup
|
| 63 |
BS4_AVAILABLE = True
|
| 64 |
+
|
| 65 |
except ImportError:
|
| 66 |
logger.warning("BeautifulSoup not available. Install: pip install beautifulsoup4")
|
| 67 |
BS4_AVAILABLE = False
|
|
|
|
| 106 |
|
| 107 |
Supported Formats:
|
| 108 |
- Plain text (.txt, .md, .log)
|
| 109 |
+
- PDF documents (.pdf) - Uses PyMuPDF as primary extractor
|
| 110 |
- Microsoft Word (.doc, .docx)
|
| 111 |
- Rich Text Format (.rtf)
|
| 112 |
- HTML files (.html, .htm)
|
|
|
|
| 129 |
MAX_FILE_SIZE = 50 * 1024 * 1024
|
| 130 |
|
| 131 |
|
| 132 |
+
def __init__(self, max_file_size: int = MAX_FILE_SIZE, extract_metadata: bool = True):
|
| 133 |
"""
|
| 134 |
Initialize document extractor
|
| 135 |
|
| 136 |
Arguments:
|
| 137 |
----------
|
| 138 |
+
max_file_size { int } : Maximum file size in bytes
|
| 139 |
|
| 140 |
+
extract_metadata { bool } : Extract document metadata
|
|
|
|
|
|
|
| 141 |
"""
|
| 142 |
self.max_file_size = max_file_size
|
|
|
|
| 143 |
self.extract_metadata = extract_metadata
|
| 144 |
|
| 145 |
logger.info(f"DocumentExtractor initialized (max_size={max_file_size/1024/1024:.1f}MB)")
|
|
|
|
| 197 |
result.file_path = str(file_path)
|
| 198 |
result.file_size_bytes = file_size
|
| 199 |
|
| 200 |
+
logger.info(f"Extracted {len(result.text)} chars from {file_path.name} using {result.extraction_method}")
|
| 201 |
return result
|
| 202 |
|
| 203 |
except Exception as e:
|
|
|
|
| 213 |
|
| 214 |
Arguments:
|
| 215 |
----------
|
| 216 |
+
file_bytes { bytes } : File content as bytes
|
| 217 |
|
| 218 |
+
filename { str } : Original filename
|
| 219 |
|
| 220 |
+
mime_type { str } : MIME type (optional)
|
| 221 |
|
| 222 |
Returns:
|
| 223 |
--------
|
| 224 |
+
{ ExtractedDocument } : ExtractedDocument object
|
| 225 |
"""
|
| 226 |
try:
|
| 227 |
# Determine file type
|
|
|
|
| 265 |
return result
|
| 266 |
|
| 267 |
except Exception as e:
|
| 268 |
+
logger.error(f"Error extracting from bytes: {repr(e)}")
|
| 269 |
return self._create_error_result(file_path = filename,
|
| 270 |
error = repr(e),
|
| 271 |
)
|
|
|
|
| 285 |
with open(file_path, 'rb') as f:
|
| 286 |
raw_data = f.read()
|
| 287 |
detected = chardet.detect(raw_data)
|
| 288 |
+
|
| 289 |
+
if (detected['confidence'] > 0.7):
|
| 290 |
encoding = detected['encoding']
|
| 291 |
logger.debug(f"Detected encoding: {encoding} (confidence: {detected['confidence']})")
|
| 292 |
|
|
|
|
| 362 |
|
| 363 |
def _extract_pdf(self, file_path: Path) -> ExtractedDocument:
|
| 364 |
"""
|
| 365 |
+
Extract text from PDF files, trying extractors in this priority order: PyMuPDF > pdfplumber > PyPDF2
|
| 366 |
"""
|
| 367 |
+
if not any([PYPDF_AVAILABLE, PDFPLUMBER_AVAILABLE, PYPDF2_AVAILABLE]):
|
| 368 |
+
return self._create_error_result(file_path = str(file_path),
|
| 369 |
+
error = "PDF libraries not installed. Install: pip install PyMuPDF",
|
| 370 |
)
|
| 371 |
|
| 372 |
+
warnings = list()
|
| 373 |
+
text = ""
|
| 374 |
+
page_count = 0
|
| 375 |
+
metadata = dict()
|
| 376 |
+
extraction_method = "unknown"
|
| 377 |
|
| 378 |
+
# Try PyMuPDF first : best quality & performance
|
| 379 |
+
if PYPDF_AVAILABLE:
|
| 380 |
+
try:
|
| 381 |
+
doc = fitz.open(file_path)
|
| 382 |
+
page_count = doc.page_count
|
| 383 |
+
metadata = doc.metadata
|
| 384 |
+
|
| 385 |
+
for page_num in range(page_count):
|
| 386 |
+
page = doc[page_num]
|
| 387 |
+
page_text = page.get_text()
|
| 388 |
+
|
| 389 |
+
if page_text:
|
| 390 |
+
text += page_text + "\n\n"
|
| 391 |
+
|
| 392 |
+
doc.close()
|
| 393 |
+
extraction_method = "PyMuPDF"
|
| 394 |
+
|
| 395 |
+
if text.strip():
|
| 396 |
+
logger.info(f"Successfully extracted PDF using PyMuPDF: {len(text)} chars")
|
| 397 |
+
return ExtractedDocument(text = text.strip(),
|
| 398 |
+
file_path = str(file_path),
|
| 399 |
+
file_type = '.pdf',
|
| 400 |
+
file_size_bytes = file_path.stat().st_size,
|
| 401 |
+
page_count = page_count,
|
| 402 |
+
extraction_method = extraction_method,
|
| 403 |
+
metadata = metadata,
|
| 404 |
+
is_success = True,
|
| 405 |
+
error_message = None,
|
| 406 |
+
warnings = warnings,
|
| 407 |
+
)
|
| 408 |
+
|
| 409 |
+
else:
|
| 410 |
+
warnings.append("PyMuPDF extracted no text")
|
| 411 |
+
|
| 412 |
+
except Exception as e:
|
| 413 |
+
warnings.append(f"PyMuPDF failed: {repr(e)}, trying pdfplumber")
|
| 414 |
+
|
| 415 |
+
# Fallback-1: Try pdfplumber
|
| 416 |
+
if PDFPLUMBER_AVAILABLE:
|
| 417 |
try:
|
| 418 |
with pdfplumber.open(file_path) as pdf:
|
| 419 |
page_count = len(pdf.pages)
|
|
|
|
| 425 |
if page_text:
|
| 426 |
text += page_text + "\n\n"
|
| 427 |
|
| 428 |
+
extraction_method = "pdfplumber"
|
| 429 |
+
|
| 430 |
if text.strip():
|
| 431 |
+
logger.info(f"Extracted PDF using pdfplumber: {len(text)} chars")
|
| 432 |
return ExtractedDocument(text = text.strip(),
|
| 433 |
file_path = str(file_path),
|
| 434 |
file_type = '.pdf',
|
| 435 |
file_size_bytes = file_path.stat().st_size,
|
| 436 |
page_count = page_count,
|
| 437 |
+
extraction_method = extraction_method,
|
| 438 |
metadata = metadata,
|
| 439 |
is_success = True,
|
| 440 |
error_message = None,
|
| 441 |
warnings = warnings,
|
| 442 |
)
|
| 443 |
+
|
| 444 |
+
else:
|
| 445 |
+
warnings.append("pdfplumber extracted no text")
|
| 446 |
+
|
| 447 |
except Exception as e:
|
| 448 |
warnings.append(f"pdfplumber failed: {repr(e)}, trying PyPDF2")
|
| 449 |
|
| 450 |
+
# Fallback-2: Try PyPDF2
|
| 451 |
+
if PYPDF2_AVAILABLE:
|
| 452 |
+
try:
|
| 453 |
+
with open(file_path, 'rb') as f:
|
| 454 |
+
reader = PyPDF2.PdfReader(f)
|
| 455 |
+
page_count = len(reader.pages)
|
| 456 |
+
|
| 457 |
+
if self.extract_metadata:
|
| 458 |
+
metadata = reader.metadata or {}
|
| 459 |
+
|
| 460 |
+
for page in reader.pages:
|
| 461 |
+
page_text = page.extract_text()
|
| 462 |
+
|
| 463 |
+
if page_text:
|
| 464 |
+
text += page_text + "\n\n"
|
| 465 |
|
| 466 |
+
extraction_method = "PyPDF2"
|
|
|
|
| 467 |
|
| 468 |
+
if not text.strip():
|
| 469 |
+
warnings.append("PDF appears to be image-based or encrypted")
|
| 470 |
+
|
| 471 |
+
return ExtractedDocument(text = text.strip(),
|
| 472 |
+
file_path = str(file_path),
|
| 473 |
+
file_type = '.pdf',
|
| 474 |
+
file_size_bytes = file_path.stat().st_size,
|
| 475 |
+
page_count = page_count,
|
| 476 |
+
extraction_method = extraction_method,
|
| 477 |
+
metadata = metadata,
|
| 478 |
+
is_success = bool(text.strip()),
|
| 479 |
+
error_message = None if text.strip() else "No text extracted from PDF",
|
| 480 |
+
warnings = warnings,
|
| 481 |
+
)
|
| 482 |
+
|
| 483 |
+
except Exception as e:
|
| 484 |
+
warnings.append(f"PyPDF2 failed: {repr(e)}")
|
| 485 |
+
|
| 486 |
+
# All extractors failed
|
| 487 |
+
return self._create_error_result(file_path = str(file_path),
|
| 488 |
+
error = "All PDF extractors failed: " + "; ".join(warnings),
|
| 489 |
+
)
|
|
|
|
|
|
|
|
|
|
| 490 |
|
| 491 |
|
| 492 |
def _extract_pdf_bytes(self, file_bytes: bytes, filename: str) -> ExtractedDocument:
|
| 493 |
"""
|
| 494 |
+
Extract text from PDF bytes, trying extractors in this priority order: PyMuPDF > pdfplumber > PyPDF2
|
| 495 |
"""
|
| 496 |
+
if not any([PYPDF_AVAILABLE, PDFPLUMBER_AVAILABLE, PYPDF2_AVAILABLE]):
|
| 497 |
return self._create_error_result(file_path = filename,
|
| 498 |
error = "PDF libraries not installed",
|
| 499 |
)
|
| 500 |
|
| 501 |
+
warnings = list()
|
| 502 |
+
text = ""
|
| 503 |
+
page_count = 0
|
| 504 |
+
metadata = dict()
|
| 505 |
+
extraction_method = "unknown"
|
| 506 |
|
| 507 |
try:
|
| 508 |
+
# Primary: Try PyMuPDF first
|
| 509 |
+
if PYPDF_AVAILABLE:
|
| 510 |
+
try:
|
| 511 |
+
doc = fitz.open(stream=file_bytes, filetype="pdf")
|
| 512 |
+
page_count = doc.page_count
|
| 513 |
+
metadata = doc.metadata
|
| 514 |
+
|
| 515 |
+
for page_num in range(page_count):
|
| 516 |
+
page = doc[page_num]
|
| 517 |
+
page_text = page.get_text()
|
| 518 |
+
|
| 519 |
+
if page_text:
|
| 520 |
+
text += page_text + "\n\n"
|
| 521 |
+
|
| 522 |
+
doc.close()
|
| 523 |
+
extraction_method = "PyMuPDF"
|
| 524 |
+
|
| 525 |
+
if text.strip():
|
| 526 |
+
return ExtractedDocument(text = text.strip(),
|
| 527 |
+
file_path = filename,
|
| 528 |
+
file_type = '.pdf',
|
| 529 |
+
file_size_bytes = len(file_bytes),
|
| 530 |
+
page_count = page_count,
|
| 531 |
+
extraction_method = extraction_method,
|
| 532 |
+
metadata = metadata,
|
| 533 |
+
is_success = True,
|
| 534 |
+
error_message = None,
|
| 535 |
+
warnings = warnings,
|
| 536 |
+
)
|
| 537 |
+
|
| 538 |
+
else:
|
| 539 |
+
warnings.append("PyMuPDF extracted no text")
|
| 540 |
+
|
| 541 |
+
except Exception as e:
|
| 542 |
+
warnings.append(f"PyMuPDF failed: {repr(e)}, trying pdfplumber")
|
| 543 |
+
|
| 544 |
+
# Fallback-1: Try pdfplumber
|
| 545 |
+
if PDFPLUMBER_AVAILABLE:
|
| 546 |
try:
|
| 547 |
with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
|
| 548 |
page_count = len(pdf.pages)
|
|
|
|
| 554 |
if page_text:
|
| 555 |
text += page_text + "\n\n"
|
| 556 |
|
| 557 |
+
extraction_method = "pdfplumber"
|
| 558 |
+
|
| 559 |
if text.strip():
|
| 560 |
return ExtractedDocument(text = text.strip(),
|
| 561 |
file_path = filename,
|
| 562 |
file_type = '.pdf',
|
| 563 |
file_size_bytes = len(file_bytes),
|
| 564 |
page_count = page_count,
|
| 565 |
+
extraction_method = extraction_method,
|
| 566 |
metadata = metadata,
|
| 567 |
is_success = True,
|
| 568 |
error_message = None,
|
| 569 |
warnings = warnings,
|
| 570 |
)
|
| 571 |
+
|
| 572 |
+
else:
|
| 573 |
+
warnings.append("pdfplumber extracted no text")
|
| 574 |
+
|
| 575 |
except Exception as e:
|
| 576 |
warnings.append(f"pdfplumber failed: {repr(e)}, trying PyPDF2")
|
| 577 |
|
| 578 |
+
# Fallback-2: Try PyPDF2
|
| 579 |
+
if PYPDF2_AVAILABLE:
|
| 580 |
+
reader = PyPDF2.PdfReader(io.BytesIO(file_bytes))
|
| 581 |
+
page_count = len(reader.pages)
|
|
|
|
|
|
|
| 582 |
|
| 583 |
+
for page in reader.pages:
|
| 584 |
+
page_text = page.extract_text()
|
| 585 |
+
|
| 586 |
+
if page_text:
|
| 587 |
+
text += page_text + "\n\n"
|
| 588 |
+
|
| 589 |
+
extraction_method = "PyPDF2"
|
| 590 |
+
|
| 591 |
+
return ExtractedDocument(text = text.strip(),
|
| 592 |
+
file_path = filename,
|
| 593 |
+
file_type = '.pdf',
|
| 594 |
+
file_size_bytes = len(file_bytes),
|
| 595 |
+
page_count = page_count,
|
| 596 |
+
extraction_method = extraction_method,
|
| 597 |
+
metadata = metadata,
|
| 598 |
+
is_success = bool(text.strip()),
|
| 599 |
+
error_message = None if text.strip() else "No text extracted",
|
| 600 |
+
warnings = warnings,
|
| 601 |
+
)
|
| 602 |
|
| 603 |
except Exception as e:
|
| 604 |
return self._create_error_result(file_path = filename,
|
| 605 |
error = repr(e),
|
| 606 |
)
|
| 607 |
+
|
| 608 |
+
# All extractors failed
|
| 609 |
+
return self._create_error_result(file_path = filename,
|
| 610 |
+
error = "All PDF extractors failed: " + "; ".join(warnings),
|
| 611 |
+
)
|
| 612 |
|
| 613 |
|
| 614 |
def _extract_word(self, file_path: Path) -> ExtractedDocument:
|
|
|
|
| 643 |
file_path = str(file_path),
|
| 644 |
file_type = file_path.suffix,
|
| 645 |
file_size_bytes = file_path.stat().st_size,
|
| 646 |
+
page_count = len(paragraphs),
|
| 647 |
extraction_method = 'python-docx',
|
| 648 |
metadata = metadata,
|
| 649 |
is_success = True,
|
|
|
|
| 783 |
script.decompose()
|
| 784 |
|
| 785 |
# Get text
|
| 786 |
+
text = soup.get_text(separator = '\n')
|
| 787 |
|
| 788 |
# Clean up whitespace
|
| 789 |
lines = (line.strip() for line in text.splitlines())
|
|
|
|
| 860 |
|
| 861 |
# Check file size
|
| 862 |
file_size = file_path.stat().st_size
|
| 863 |
+
|
| 864 |
if (file_size > self.max_file_size):
|
| 865 |
return False, f"File too large: {file_size/1024/1024:.1f}MB (max: {self.max_file_size/1024/1024:.1f}MB)"
|
| 866 |
|
|
|
|
| 889 |
|
| 890 |
|
| 891 |
# Convenience Functions
|
|
|
|
| 892 |
def extract_text(file_path: str, **kwargs) -> ExtractedDocument:
|
| 893 |
"""
|
| 894 |
Quick text extraction with default settings
|
| 895 |
|
| 896 |
Arguments:
|
| 897 |
----------
|
| 898 |
+
file_path { str } : Path to document
|
| 899 |
+
|
| 900 |
+
**kwargs : Override settings
|
| 901 |
|
| 902 |
Returns:
|
| 903 |
--------
|
| 904 |
+
{ ExtractedDocument } : ExtractedDocument object
|
| 905 |
"""
|
| 906 |
extractor = DocumentExtractor(**kwargs)
|
| 907 |
+
|
| 908 |
return extractor.extract(file_path)
|
| 909 |
|
| 910 |
|
|
|
|
| 914 |
|
| 915 |
Arguments:
|
| 916 |
----------
|
| 917 |
+
file_bytes { bytes } : File content as bytes
|
| 918 |
+
|
| 919 |
+
filename { str } : Original filename
|
| 920 |
+
|
| 921 |
+
**kwargs : Override settings
|
| 922 |
|
| 923 |
Returns:
|
| 924 |
--------
|
| 925 |
+
{ ExtractedDocument } : ExtractedDocument object
|
| 926 |
"""
|
| 927 |
extractor = DocumentExtractor(**kwargs)
|
| 928 |
+
|
| 929 |
return extractor.extract_from_bytes(file_bytes, filename)
|
| 930 |
|
| 931 |
|
| 932 |
# Export
|
| 933 |
+
__all__ = ['extract_text',
|
| 934 |
+
'DocumentExtractor',
|
| 935 |
'ExtractedDocument',
|
|
|
|
| 936 |
'extract_from_upload',
|
| 937 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
|
@@ -25,6 +25,7 @@ pandas==2.2.3
|
|
| 25 |
|
| 26 |
# Text Processing
|
| 27 |
python-docx==1.1.2
|
|
|
|
| 28 |
PyPDF2==3.0.1
|
| 29 |
pdfplumber==0.11.5
|
| 30 |
pymupdf==1.25.5
|
|
|
|
| 25 |
|
| 26 |
# Text Processing
|
| 27 |
python-docx==1.1.2
|
| 28 |
+
PyMuPDF>=1.23.0
|
| 29 |
PyPDF2==3.0.1
|
| 30 |
pdfplumber==0.11.5
|
| 31 |
pymupdf==1.25.5
|