satyaki-mitra committed on
Commit
ffe6715
·
1 Parent(s): 0243855

document extraction process updated

Browse files
Files changed (2) hide show
  1. processors/document_extractor.py +232 -138
  2. requirements.txt +1 -0
processors/document_extractor.py CHANGED
@@ -1,6 +1,7 @@
1
  # DEPENDENCIES
2
  import io
3
  import os
 
4
  import mimetypes
5
  from typing import Any
6
  from typing import Dict
@@ -14,16 +15,37 @@ from dataclasses import dataclass
14
 
15
  # Document processing libraries
16
  try:
17
- import PyPDF2
18
- import pdfplumber
19
- PDF_AVAILABLE = True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  except ImportError:
21
- logger.warning("PDF libraries not available. Install: pip install PyPDF2 pdfplumber")
22
- PDF_AVAILABLE = False
23
 
24
  try:
25
  from docx import Document as DocxDocument
26
  DOCX_AVAILABLE = True
 
27
  except ImportError:
28
  logger.warning("python-docx not available. Install: pip install python-docx")
29
  DOCX_AVAILABLE = False
@@ -31,6 +53,7 @@ except ImportError:
31
  try:
32
  import chardet
33
  CHARDET_AVAILABLE = True
 
34
  except ImportError:
35
  logger.warning("chardet not available. Install: pip install chardet")
36
  CHARDET_AVAILABLE = False
@@ -38,6 +61,7 @@ except ImportError:
38
  try:
39
  from bs4 import BeautifulSoup
40
  BS4_AVAILABLE = True
 
41
  except ImportError:
42
  logger.warning("BeautifulSoup not available. Install: pip install beautifulsoup4")
43
  BS4_AVAILABLE = False
@@ -82,7 +106,7 @@ class DocumentExtractor:
82
 
83
  Supported Formats:
84
  - Plain text (.txt, .md, .log)
85
- - PDF documents (.pdf)
86
  - Microsoft Word (.doc, .docx)
87
  - Rich Text Format (.rtf)
88
  - HTML files (.html, .htm)
@@ -105,20 +129,17 @@ class DocumentExtractor:
105
  MAX_FILE_SIZE = 50 * 1024 * 1024
106
 
107
 
108
- def __init__(self, max_file_size: int = MAX_FILE_SIZE, prefer_pdfplumber: bool = True, extract_metadata: bool = True):
109
  """
110
  Initialize document extractor
111
 
112
  Arguments:
113
  ----------
114
- max_file_size : Maximum file size in bytes
115
 
116
- prefer_pdfplumber : Use pdfplumber over PyPDF2 (better quality)
117
-
118
- extract_metadata : Extract document metadata
119
  """
120
  self.max_file_size = max_file_size
121
- self.prefer_pdfplumber = prefer_pdfplumber
122
  self.extract_metadata = extract_metadata
123
 
124
  logger.info(f"DocumentExtractor initialized (max_size={max_file_size/1024/1024:.1f}MB)")
@@ -176,7 +197,7 @@ class DocumentExtractor:
176
  result.file_path = str(file_path)
177
  result.file_size_bytes = file_size
178
 
179
- logger.info(f"Extracted {len(result.text)} chars from {file_path.name}")
180
  return result
181
 
182
  except Exception as e:
@@ -192,15 +213,15 @@ class DocumentExtractor:
192
 
193
  Arguments:
194
  ----------
195
- file_bytes : File content as bytes
196
 
197
- filename : Original filename
198
 
199
- mime_type : MIME type (optional)
200
 
201
  Returns:
202
  --------
203
- ExtractedDocument object
204
  """
205
  try:
206
  # Determine file type
@@ -244,7 +265,7 @@ class DocumentExtractor:
244
  return result
245
 
246
  except Exception as e:
247
- logger.error(f"Error extracting from bytes: {e}")
248
  return self._create_error_result(file_path = filename,
249
  error = repr(e),
250
  )
@@ -264,7 +285,8 @@ class DocumentExtractor:
264
  with open(file_path, 'rb') as f:
265
  raw_data = f.read()
266
  detected = chardet.detect(raw_data)
267
- if detected['confidence'] > 0.7:
 
268
  encoding = detected['encoding']
269
  logger.debug(f"Detected encoding: {encoding} (confidence: {detected['confidence']})")
270
 
@@ -340,20 +362,58 @@ class DocumentExtractor:
340
 
341
  def _extract_pdf(self, file_path: Path) -> ExtractedDocument:
342
  """
343
- Extract text from PDF files
344
  """
345
- if not PDF_AVAILABLE:
346
- return self._create_error_result(file_path = (file_path),
347
- error = "PDF libraries not installed",
348
  )
349
 
350
- warnings = list()
351
- text = ""
352
- page_count = 0
353
- metadata = dict()
 
354
 
355
- # Try pdfplumber first (better quality)
356
- if self.prefer_pdfplumber:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
357
  try:
358
  with pdfplumber.open(file_path) as pdf:
359
  page_count = len(pdf.pages)
@@ -365,74 +425,124 @@ class DocumentExtractor:
365
  if page_text:
366
  text += page_text + "\n\n"
367
 
 
 
368
  if text.strip():
 
369
  return ExtractedDocument(text = text.strip(),
370
  file_path = str(file_path),
371
  file_type = '.pdf',
372
  file_size_bytes = file_path.stat().st_size,
373
  page_count = page_count,
374
- extraction_method = 'pdfplumber',
375
  metadata = metadata,
376
  is_success = True,
377
  error_message = None,
378
  warnings = warnings,
379
  )
 
 
 
 
380
  except Exception as e:
381
  warnings.append(f"pdfplumber failed: {repr(e)}, trying PyPDF2")
382
 
383
- # Fallback to PyPDF2
384
- try:
385
- with open(file_path, 'rb') as f:
386
- reader = PyPDF2.PdfReader(f)
387
- page_count = len(reader.pages)
 
 
 
 
 
 
 
 
 
 
388
 
389
- if self.extract_metadata:
390
- metadata = reader.metadata or {}
391
 
392
- for page in reader.pages:
393
- page_text = page.extract_text()
394
-
395
- if page_text:
396
- text += page_text + "\n\n"
397
-
398
- if not text.strip():
399
- warnings.append("PDF appears to be image-based or encrypted")
400
-
401
- return ExtractedDocument(text = text.strip(),
402
- file_path = str(file_path),
403
- file_type = '.pdf',
404
- file_size_bytes = file_path.stat().st_size,
405
- page_count = page_count,
406
- extraction_method = 'PyPDF2',
407
- metadata = metadata,
408
- is_success = bool(text.strip()),
409
- error_message = None if text.strip() else "No text extracted",
410
- warnings = warnings,
411
- )
412
-
413
- except Exception as e:
414
- return self._create_error_result(file_path = str(file_path),
415
- error = repr(e),
416
- )
417
 
418
 
419
  def _extract_pdf_bytes(self, file_bytes: bytes, filename: str) -> ExtractedDocument:
420
  """
421
- Extract text from PDF bytes
422
  """
423
- if not PDF_AVAILABLE:
424
  return self._create_error_result(file_path = filename,
425
  error = "PDF libraries not installed",
426
  )
427
 
428
- warnings = list()
429
- text = ""
430
- page_count = 0
431
- metadata = dict()
 
432
 
433
  try:
434
- # Try pdfplumber
435
- if self.prefer_pdfplumber:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
436
  try:
437
  with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
438
  page_count = len(pdf.pages)
@@ -444,47 +554,61 @@ class DocumentExtractor:
444
  if page_text:
445
  text += page_text + "\n\n"
446
 
 
 
447
  if text.strip():
448
  return ExtractedDocument(text = text.strip(),
449
  file_path = filename,
450
  file_type = '.pdf',
451
  file_size_bytes = len(file_bytes),
452
  page_count = page_count,
453
- extraction_method = 'pdfplumber',
454
  metadata = metadata,
455
  is_success = True,
456
  error_message = None,
457
  warnings = warnings,
458
  )
 
 
 
 
459
  except Exception as e:
460
  warnings.append(f"pdfplumber failed: {repr(e)}, trying PyPDF2")
461
 
462
- # Fallback to PyPDF2
463
- reader = PyPDF2.PdfReader(io.BytesIO(file_bytes))
464
- page_count = len(reader.pages)
465
-
466
- for page in reader.pages:
467
- page_text = page.extract_text()
468
 
469
- if page_text:
470
- text += page_text + "\n\n"
471
-
472
- return ExtractedDocument(text = text.strip(),
473
- file_path = filename,
474
- file_type = '.pdf',
475
- file_size_bytes = len(file_bytes),
476
- page_count = page_count,
477
- extraction_method = 'PyPDF2',
478
- metadata = metadata,
479
- is_success = bool(text.strip()),
480
- error_message = None if text.strip() else "No text extracted",
481
- warnings = warnings,
482
- )
 
 
 
 
 
483
 
484
  except Exception as e:
485
  return self._create_error_result(file_path = filename,
486
  error = repr(e),
487
  )
 
 
 
 
 
488
 
489
 
490
  def _extract_word(self, file_path: Path) -> ExtractedDocument:
@@ -519,7 +643,7 @@ class DocumentExtractor:
519
  file_path = str(file_path),
520
  file_type = file_path.suffix,
521
  file_size_bytes = file_path.stat().st_size,
522
- page_count = len(paragraphs), # Approximate
523
  extraction_method = 'python-docx',
524
  metadata = metadata,
525
  is_success = True,
@@ -659,7 +783,7 @@ class DocumentExtractor:
659
  script.decompose()
660
 
661
  # Get text
662
- text = soup.get_text(separator='\n')
663
 
664
  # Clean up whitespace
665
  lines = (line.strip() for line in text.splitlines())
@@ -736,6 +860,7 @@ class DocumentExtractor:
736
 
737
  # Check file size
738
  file_size = file_path.stat().st_size
 
739
  if (file_size > self.max_file_size):
740
  return False, f"File too large: {file_size/1024/1024:.1f}MB (max: {self.max_file_size/1024/1024:.1f}MB)"
741
 
@@ -764,21 +889,22 @@ class DocumentExtractor:
764
 
765
 
766
  # Convenience Functions
767
-
768
  def extract_text(file_path: str, **kwargs) -> ExtractedDocument:
769
  """
770
  Quick text extraction with default settings
771
 
772
  Arguments:
773
  ----------
774
- file_path : Path to document
775
- **kwargs : Override settings
 
776
 
777
  Returns:
778
  --------
779
- ExtractedDocument object
780
  """
781
  extractor = DocumentExtractor(**kwargs)
 
782
  return extractor.extract(file_path)
783
 
784
 
@@ -788,56 +914,24 @@ def extract_from_upload(file_bytes: bytes, filename: str, **kwargs) -> Extracted
788
 
789
  Arguments:
790
  ----------
791
- file_bytes : File content as bytes
792
- filename : Original filename
793
- **kwargs : Override settings
 
 
794
 
795
  Returns:
796
  --------
797
- ExtractedDocument object
798
  """
799
  extractor = DocumentExtractor(**kwargs)
 
800
  return extractor.extract_from_bytes(file_bytes, filename)
801
 
802
 
803
  # Export
804
- __all__ = ['DocumentExtractor',
 
805
  'ExtractedDocument',
806
- 'extract_text',
807
  'extract_from_upload',
808
  ]
809
-
810
-
811
- # Testing
812
- if __name__ == "__main__":
813
- import sys
814
-
815
- if len(sys.argv) > 1:
816
- # Test with provided file
817
- test_file = sys.argv[1]
818
- print(f"Testing extraction on: {test_file}")
819
- print("=" * 70)
820
-
821
- result = extract_text(test_file)
822
-
823
- print(f"Success: {result.is_success}")
824
- print(f"File type: {result.file_type}")
825
- print(f"Pages: {result.page_count}")
826
- print(f"Method: {result.extraction_method}")
827
- print(f"Text length: {len(result.text)} chars")
828
-
829
- if result.warnings:
830
- print(f"Warnings: {result.warnings}")
831
-
832
- if result.error_message:
833
- print(f"Error: {result.error_message}")
834
-
835
- if result.text:
836
- print(f"\nFirst 500 chars:")
837
- print("-" * 70)
838
- print(result.text[:500])
839
- else:
840
- print("Usage: python document_extractor.py <file_path>")
841
- print("\nSupported formats:")
842
- for ext in sorted(DocumentExtractor.SUPPORTED_EXTENSIONS):
843
- print(f" {ext}")
 
1
  # DEPENDENCIES
2
  import io
3
  import os
4
+ import re
5
  import mimetypes
6
  from typing import Any
7
  from typing import Dict
 
15
 
16
  # Document processing libraries
17
  try:
18
+ # PyMuPDF - Primary PDF Extractor
19
+ import fitz
20
+ PYPDF_AVAILABLE = True
21
+ logger.info("PyMuPDF available for high-quality PDF extraction")
22
+
23
+ except ImportError:
24
+ logger.warning("PyMuPDF not available. Install: pip install PyMuPDF")
25
+ PYPDF_AVAILABLE = False
26
+
27
+ try:
28
+ # Fallback 1
29
+ import pdfplumber
30
+ PDFPLUMBER_AVAILABLE = True
31
+
32
+ except ImportError:
33
+ logger.warning("pdfplumber not available. Install: pip install pdfplumber")
34
+ PDFPLUMBER_AVAILABLE = False
35
+
36
+ try:
37
+ # Fallback 2
38
+ import PyPDF2
39
+ PYPDF2_AVAILABLE = True
40
+
41
  except ImportError:
42
+ logger.warning("PyPDF2 not available. Install: pip install PyPDF2")
43
+ PYPDF2_AVAILABLE = False
44
 
45
  try:
46
  from docx import Document as DocxDocument
47
  DOCX_AVAILABLE = True
48
+
49
  except ImportError:
50
  logger.warning("python-docx not available. Install: pip install python-docx")
51
  DOCX_AVAILABLE = False
 
53
  try:
54
  import chardet
55
  CHARDET_AVAILABLE = True
56
+
57
  except ImportError:
58
  logger.warning("chardet not available. Install: pip install chardet")
59
  CHARDET_AVAILABLE = False
 
61
  try:
62
  from bs4 import BeautifulSoup
63
  BS4_AVAILABLE = True
64
+
65
  except ImportError:
66
  logger.warning("BeautifulSoup not available. Install: pip install beautifulsoup4")
67
  BS4_AVAILABLE = False
 
106
 
107
  Supported Formats:
108
  - Plain text (.txt, .md, .log)
109
+ - PDF documents (.pdf) - Uses PyMuPDF as primary extractor
110
  - Microsoft Word (.doc, .docx)
111
  - Rich Text Format (.rtf)
112
  - HTML files (.html, .htm)
 
129
  MAX_FILE_SIZE = 50 * 1024 * 1024
130
 
131
 
132
+ def __init__(self, max_file_size: int = MAX_FILE_SIZE, extract_metadata: bool = True):
133
  """
134
  Initialize document extractor
135
 
136
  Arguments:
137
  ----------
138
+ max_file_size { int } : Maximum file size in bytes
139
 
140
+ extract_metadata { bool } : Extract document metadata
 
 
141
  """
142
  self.max_file_size = max_file_size
 
143
  self.extract_metadata = extract_metadata
144
 
145
  logger.info(f"DocumentExtractor initialized (max_size={max_file_size/1024/1024:.1f}MB)")
 
197
  result.file_path = str(file_path)
198
  result.file_size_bytes = file_size
199
 
200
+ logger.info(f"Extracted {len(result.text)} chars from {file_path.name} using {result.extraction_method}")
201
  return result
202
 
203
  except Exception as e:
 
213
 
214
  Arguments:
215
  ----------
216
+ file_bytes { bytes } : File content as bytes
217
 
218
+ filename { str } : Original filename
219
 
220
+ mime_type { str } : MIME type (optional)
221
 
222
  Returns:
223
  --------
224
+ { ExtractedDocument} : ExtractedDocument object
225
  """
226
  try:
227
  # Determine file type
 
265
  return result
266
 
267
  except Exception as e:
268
+ logger.error(f"Error extracting from bytes: {repr(e)}")
269
  return self._create_error_result(file_path = filename,
270
  error = repr(e),
271
  )
 
285
  with open(file_path, 'rb') as f:
286
  raw_data = f.read()
287
  detected = chardet.detect(raw_data)
288
+
289
+ if (detected['confidence'] > 0.7):
290
  encoding = detected['encoding']
291
  logger.debug(f"Detected encoding: {encoding} (confidence: {detected['confidence']})")
292
 
 
362
 
363
  def _extract_pdf(self, file_path: Path) -> ExtractedDocument:
364
  """
365
+ Extract text from PDF files in exactly this Priority order : PyMuPDF > pdfplumber > PyPDF2
366
  """
367
+ if not any([PYPDF_AVAILABLE, PDFPLUMBER_AVAILABLE, PYPDF2_AVAILABLE]):
368
+ return self._create_error_result(file_path = str(file_path),
369
+ error = "PDF libraries not installed. Install: pip install PyMuPDF",
370
  )
371
 
372
+ warnings = list()
373
+ text = ""
374
+ page_count = 0
375
+ metadata = dict()
376
+ extraction_method = "unknown"
377
 
378
+ # Try PyMuPDF first : best quality & performance
379
+ if PYPDF_AVAILABLE:
380
+ try:
381
+ doc = fitz.open(file_path)
382
+ page_count = doc.page_count
383
+ metadata = doc.metadata
384
+
385
+ for page_num in range(page_count):
386
+ page = doc[page_num]
387
+ page_text = page.get_text()
388
+
389
+ if page_text:
390
+ text += page_text + "\n\n"
391
+
392
+ doc.close()
393
+ extraction_method = "PyMuPDF"
394
+
395
+ if text.strip():
396
+ logger.info(f"Successfully extracted PDF using PyMuPDF: {len(text)} chars")
397
+ return ExtractedDocument(text = text.strip(),
398
+ file_path = str(file_path),
399
+ file_type = '.pdf',
400
+ file_size_bytes = file_path.stat().st_size,
401
+ page_count = page_count,
402
+ extraction_method = extraction_method,
403
+ metadata = metadata,
404
+ is_success = True,
405
+ error_message = None,
406
+ warnings = warnings,
407
+ )
408
+
409
+ else:
410
+ warnings.append("PyMuPDF extracted no text")
411
+
412
+ except Exception as e:
413
+ warnings.append(f"PyMuPDF failed: {repr(e)}, trying pdfplumber")
414
+
415
+ # Fallback-1: Try pdfplumber
416
+ if PDFPLUMBER_AVAILABLE:
417
  try:
418
  with pdfplumber.open(file_path) as pdf:
419
  page_count = len(pdf.pages)
 
425
  if page_text:
426
  text += page_text + "\n\n"
427
 
428
+ extraction_method = "pdfplumber"
429
+
430
  if text.strip():
431
+ logger.info(f"Extracted PDF using pdfplumber: {len(text)} chars")
432
  return ExtractedDocument(text = text.strip(),
433
  file_path = str(file_path),
434
  file_type = '.pdf',
435
  file_size_bytes = file_path.stat().st_size,
436
  page_count = page_count,
437
+ extraction_method = extraction_method,
438
  metadata = metadata,
439
  is_success = True,
440
  error_message = None,
441
  warnings = warnings,
442
  )
443
+
444
+ else:
445
+ warnings.append("pdfplumber extracted no text")
446
+
447
  except Exception as e:
448
  warnings.append(f"pdfplumber failed: {repr(e)}, trying PyPDF2")
449
 
450
+ # Fallback-2: Try PyPDF2
451
+ if PYPDF2_AVAILABLE:
452
+ try:
453
+ with open(file_path, 'rb') as f:
454
+ reader = PyPDF2.PdfReader(f)
455
+ page_count = len(reader.pages)
456
+
457
+ if self.extract_metadata:
458
+ metadata = reader.metadata or {}
459
+
460
+ for page in reader.pages:
461
+ page_text = page.extract_text()
462
+
463
+ if page_text:
464
+ text += page_text + "\n\n"
465
 
466
+ extraction_method = "PyPDF2"
 
467
 
468
+ if not text.strip():
469
+ warnings.append("PDF appears to be image-based or encrypted")
470
+
471
+ return ExtractedDocument(text = text.strip(),
472
+ file_path = str(file_path),
473
+ file_type = '.pdf',
474
+ file_size_bytes = file_path.stat().st_size,
475
+ page_count = page_count,
476
+ extraction_method = extraction_method,
477
+ metadata = metadata,
478
+ is_success = bool(text.strip()),
479
+ error_message = None if text.strip() else "No text extracted from PDF",
480
+ warnings = warnings,
481
+ )
482
+
483
+ except Exception as e:
484
+ warnings.append(f"PyPDF2 failed: {repr(e)}")
485
+
486
+ # All extractors failed
487
+ return self._create_error_result(file_path = str(file_path),
488
+ error = "All PDF extractors failed: " + "; ".join(warnings),
489
+ )
 
 
 
490
 
491
 
492
  def _extract_pdf_bytes(self, file_bytes: bytes, filename: str) -> ExtractedDocument:
493
  """
494
+ Extract text from PDF bytes in this priority order : PyMuPDF > pdfplumber > PyPDF2
495
  """
496
+ if not any([PYPDF_AVAILABLE, PDFPLUMBER_AVAILABLE, PYPDF2_AVAILABLE]):
497
  return self._create_error_result(file_path = filename,
498
  error = "PDF libraries not installed",
499
  )
500
 
501
+ warnings = list()
502
+ text = ""
503
+ page_count = 0
504
+ metadata = dict()
505
+ extraction_method = "unknown"
506
 
507
  try:
508
+ # Primary: Try PyMuPDF first
509
+ if PYPDF_AVAILABLE:
510
+ try:
511
+ doc = fitz.open(stream=file_bytes, filetype="pdf")
512
+ page_count = doc.page_count
513
+ metadata = doc.metadata
514
+
515
+ for page_num in range(page_count):
516
+ page = doc[page_num]
517
+ page_text = page.get_text()
518
+
519
+ if page_text:
520
+ text += page_text + "\n\n"
521
+
522
+ doc.close()
523
+ extraction_method = "PyMuPDF"
524
+
525
+ if text.strip():
526
+ return ExtractedDocument(text = text.strip(),
527
+ file_path = filename,
528
+ file_type = '.pdf',
529
+ file_size_bytes = len(file_bytes),
530
+ page_count = page_count,
531
+ extraction_method = extraction_method,
532
+ metadata = metadata,
533
+ is_success = True,
534
+ error_message = None,
535
+ warnings = warnings,
536
+ )
537
+
538
+ else:
539
+ warnings.append("PyMuPDF extracted no text")
540
+
541
+ except Exception as e:
542
+ warnings.append(f"PyMuPDF failed: {repr(e)}, trying pdfplumber")
543
+
544
+ # Fallback-1: Try pdfplumber
545
+ if PDFPLUMBER_AVAILABLE:
546
  try:
547
  with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
548
  page_count = len(pdf.pages)
 
554
  if page_text:
555
  text += page_text + "\n\n"
556
 
557
+ extraction_method = "pdfplumber"
558
+
559
  if text.strip():
560
  return ExtractedDocument(text = text.strip(),
561
  file_path = filename,
562
  file_type = '.pdf',
563
  file_size_bytes = len(file_bytes),
564
  page_count = page_count,
565
+ extraction_method = extraction_method,
566
  metadata = metadata,
567
  is_success = True,
568
  error_message = None,
569
  warnings = warnings,
570
  )
571
+
572
+ else:
573
+ warnings.append("pdfplumber extracted no text")
574
+
575
  except Exception as e:
576
  warnings.append(f"pdfplumber failed: {repr(e)}, trying PyPDF2")
577
 
578
+ # Fallback-2: Try PyPDF2
579
+ if PYPDF2_AVAILABLE:
580
+ reader = PyPDF2.PdfReader(io.BytesIO(file_bytes))
581
+ page_count = len(reader.pages)
 
 
582
 
583
+ for page in reader.pages:
584
+ page_text = page.extract_text()
585
+
586
+ if page_text:
587
+ text += page_text + "\n\n"
588
+
589
+ extraction_method = "PyPDF2"
590
+
591
+ return ExtractedDocument(text = text.strip(),
592
+ file_path = filename,
593
+ file_type = '.pdf',
594
+ file_size_bytes = len(file_bytes),
595
+ page_count = page_count,
596
+ extraction_method = extraction_method,
597
+ metadata = metadata,
598
+ is_success = bool(text.strip()),
599
+ error_message = None if text.strip() else "No text extracted",
600
+ warnings = warnings,
601
+ )
602
 
603
  except Exception as e:
604
  return self._create_error_result(file_path = filename,
605
  error = repr(e),
606
  )
607
+
608
+ # All extractors failed
609
+ return self._create_error_result(file_path = filename,
610
+ error = "All PDF extractors failed: " + "; ".join(warnings),
611
+ )
612
 
613
 
614
  def _extract_word(self, file_path: Path) -> ExtractedDocument:
 
643
  file_path = str(file_path),
644
  file_type = file_path.suffix,
645
  file_size_bytes = file_path.stat().st_size,
646
+ page_count = len(paragraphs),
647
  extraction_method = 'python-docx',
648
  metadata = metadata,
649
  is_success = True,
 
783
  script.decompose()
784
 
785
  # Get text
786
+ text = soup.get_text(separator = '\n')
787
 
788
  # Clean up whitespace
789
  lines = (line.strip() for line in text.splitlines())
 
860
 
861
  # Check file size
862
  file_size = file_path.stat().st_size
863
+
864
  if (file_size > self.max_file_size):
865
  return False, f"File too large: {file_size/1024/1024:.1f}MB (max: {self.max_file_size/1024/1024:.1f}MB)"
866
 
 
889
 
890
 
891
  # Convenience Functions
 
892
  def extract_text(file_path: str, **kwargs) -> ExtractedDocument:
893
  """
894
  Quick text extraction with default settings
895
 
896
  Arguments:
897
  ----------
898
+ file_path { str } : Path to document
899
+
900
+ **kwargs : Override settings
901
 
902
  Returns:
903
  --------
904
+ { ExtractedDocument } : ExtractedDocument object
905
  """
906
  extractor = DocumentExtractor(**kwargs)
907
+
908
  return extractor.extract(file_path)
909
 
910
 
 
914
 
915
  Arguments:
916
  ----------
917
+ file_bytes { bytes } : File content as bytes
918
+
919
+ filename { str } : Original filename
920
+
921
+ **kwargs : Override settings
922
 
923
  Returns:
924
  --------
925
+ { ExtractedDocument } : ExtractedDocument object
926
  """
927
  extractor = DocumentExtractor(**kwargs)
928
+
929
  return extractor.extract_from_bytes(file_bytes, filename)
930
 
931
 
932
  # Export
933
+ __all__ = ['extract_text',
934
+ 'DocumentExtractor',
935
  'ExtractedDocument',
 
936
  'extract_from_upload',
937
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -25,6 +25,7 @@ pandas==2.2.3
25
 
26
  # Text Processing
27
  python-docx==1.1.2
 
28
  PyPDF2==3.0.1
29
  pdfplumber==0.11.5
30
  pymupdf==1.25.5
 
25
 
26
  # Text Processing
27
  python-docx==1.1.2
28
+ PyMuPDF>=1.23.0
29
  PyPDF2==3.0.1
30
  pdfplumber==0.11.5
31
  pymupdf==1.25.5