Spaces:
Running
Running
⚡️ add warning for truncation
Browse files
Signed-off-by: peter szemraj <peterszemraj@gmail.com>
- app.py +17 -15
- pdf2text.py +3 -1
app.py
CHANGED
|
@@ -55,7 +55,7 @@ def load_uploaded_file(file_obj, temp_dir: Path = None):
|
|
| 55 |
return None
|
| 56 |
|
| 57 |
|
| 58 |
-
def convert_PDF(pdf_obj, language: str = "en"):
|
| 59 |
"""
|
| 60 |
convert_PDF - convert a PDF file to text
|
| 61 |
|
|
@@ -76,15 +76,18 @@ def convert_PDF(pdf_obj, language: str = "en"):
|
|
| 76 |
conversion_stats = convert_PDF_to_Text(
|
| 77 |
file_path,
|
| 78 |
ocr_model=ocr_model,
|
| 79 |
-
max_pages=
|
| 80 |
)
|
| 81 |
converted_txt = conversion_stats["converted_text"]
|
| 82 |
num_pages = conversion_stats["num_pages"]
|
|
|
|
| 83 |
# if alt_lang: # TODO: fix this
|
| 84 |
|
| 85 |
rt = round((time.perf_counter() - st) / 60, 2)
|
| 86 |
print(f"Runtime: {rt} minutes")
|
| 87 |
html = ""
|
|
|
|
|
|
|
| 88 |
html += f"<p>Runtime: {rt} minutes on CPU for {num_pages} pages</p>"
|
| 89 |
|
| 90 |
return converted_txt, html
|
|
@@ -125,20 +128,14 @@ if __name__ == "__main__":
|
|
| 125 |
gr.Markdown("Upload your own file:")
|
| 126 |
gr.Markdown("_If no file is uploaded, a sample PDF will be used_")
|
| 127 |
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
label="VM file path",
|
| 131 |
-
placeholder="When the file is uploaded, the path will appear here",
|
| 132 |
-
value=pdf_obj,
|
| 133 |
-
)
|
| 134 |
-
with gr.Row():
|
| 135 |
-
uploaded_file = gr.File(
|
| 136 |
label="Upload a PDF file",
|
| 137 |
file_count="single",
|
| 138 |
type="file",
|
| 139 |
value= _here / "example_file.pdf",
|
| 140 |
)
|
| 141 |
-
load_file_button = gr.Button("Load Uploaded File")
|
| 142 |
|
| 143 |
gr.Markdown("---")
|
| 144 |
|
|
@@ -150,10 +147,15 @@ if __name__ == "__main__":
|
|
| 150 |
OCR_text = gr.Textbox(
|
| 151 |
label="OCR Result", placeholder="The OCR text will appear here"
|
| 152 |
)
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
|
| 158 |
convert_button.click(
|
| 159 |
fn=convert_PDF, inputs=[uploaded_file], outputs=[OCR_text, out_placeholder]
|
|
|
|
| 55 |
return None
|
| 56 |
|
| 57 |
|
| 58 |
+
def convert_PDF(pdf_obj, language: str = "en", max_pages=20,):
|
| 59 |
"""
|
| 60 |
convert_PDF - convert a PDF file to text
|
| 61 |
|
|
|
|
| 76 |
conversion_stats = convert_PDF_to_Text(
|
| 77 |
file_path,
|
| 78 |
ocr_model=ocr_model,
|
| 79 |
+
max_pages=max_pages,
|
| 80 |
)
|
| 81 |
converted_txt = conversion_stats["converted_text"]
|
| 82 |
num_pages = conversion_stats["num_pages"]
|
| 83 |
+
was_truncated = conversion_stats["truncated"]
|
| 84 |
# if alt_lang: # TODO: fix this
|
| 85 |
|
| 86 |
rt = round((time.perf_counter() - st) / 60, 2)
|
| 87 |
print(f"Runtime: {rt} minutes")
|
| 88 |
html = ""
|
| 89 |
+
if was_truncated:
|
| 90 |
+
html += f"<p>WARNING - PDF was truncated to {max_pages} pages</p>"
|
| 91 |
html += f"<p>Runtime: {rt} minutes on CPU for {num_pages} pages</p>"
|
| 92 |
|
| 93 |
return converted_txt, html
|
|
|
|
| 128 |
gr.Markdown("Upload your own file:")
|
| 129 |
gr.Markdown("_If no file is uploaded, a sample PDF will be used_")
|
| 130 |
|
| 131 |
+
|
| 132 |
+
uploaded_file = gr.File(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
label="Upload a PDF file",
|
| 134 |
file_count="single",
|
| 135 |
type="file",
|
| 136 |
value= _here / "example_file.pdf",
|
| 137 |
)
|
| 138 |
+
# load_file_button = gr.Button("Load Uploaded File")
|
| 139 |
|
| 140 |
gr.Markdown("---")
|
| 141 |
|
|
|
|
| 147 |
OCR_text = gr.Textbox(
|
| 148 |
label="OCR Result", placeholder="The OCR text will appear here"
|
| 149 |
)
|
| 150 |
+
text_file = gr.File(
|
| 151 |
+
label="Download Text File",
|
| 152 |
+
file_count="single",
|
| 153 |
+
type="file",
|
| 154 |
+
interactive=False,
|
| 155 |
+
)
|
| 156 |
+
# load_file_button.click(
|
| 157 |
+
# fn=load_uploaded_file, inputs=uploaded_file, outputs=[pdf_obj]
|
| 158 |
+
# )
|
| 159 |
|
| 160 |
convert_button.click(
|
| 161 |
fn=convert_PDF, inputs=[uploaded_file], outputs=[OCR_text, out_placeholder]
|
pdf2text.py
CHANGED
|
@@ -591,12 +591,13 @@ def convert_PDF_to_Text(
|
|
| 591 |
ocr_model = ocr_predictor(pretrained=True) if ocr_model is None else ocr_model
|
| 592 |
logging.info(f"starting OCR on {PDF_file.name}")
|
| 593 |
doc = DocumentFile.from_pdf(PDF_file)
|
| 594 |
-
|
| 595 |
if len(doc) > max_pages:
|
| 596 |
logging.warning(
|
| 597 |
f"PDF has {len(doc)} pages, which is more than {max_pages}.. truncating"
|
| 598 |
)
|
| 599 |
doc = doc[:max_pages]
|
|
|
|
| 600 |
|
| 601 |
# Analyze
|
| 602 |
logging.info(f"running OCR on {len(doc)} pages")
|
|
@@ -616,6 +617,7 @@ def convert_PDF_to_Text(
|
|
| 616 |
"runtime": round(fn_rt, 2),
|
| 617 |
"date": str(date.today()),
|
| 618 |
"converted_text": ocr_results,
|
|
|
|
| 619 |
"length": len(ocr_results),
|
| 620 |
}
|
| 621 |
|
|
|
|
| 591 |
ocr_model = ocr_predictor(pretrained=True) if ocr_model is None else ocr_model
|
| 592 |
logging.info(f"starting OCR on {PDF_file.name}")
|
| 593 |
doc = DocumentFile.from_pdf(PDF_file)
|
| 594 |
+
truncated = False
|
| 595 |
if len(doc) > max_pages:
|
| 596 |
logging.warning(
|
| 597 |
f"PDF has {len(doc)} pages, which is more than {max_pages}.. truncating"
|
| 598 |
)
|
| 599 |
doc = doc[:max_pages]
|
| 600 |
+
truncated = True
|
| 601 |
|
| 602 |
# Analyze
|
| 603 |
logging.info(f"running OCR on {len(doc)} pages")
|
|
|
|
| 617 |
"runtime": round(fn_rt, 2),
|
| 618 |
"date": str(date.today()),
|
| 619 |
"converted_text": ocr_results,
|
| 620 |
+
"truncated": truncated,
|
| 621 |
"length": len(ocr_results),
|
| 622 |
}
|
| 623 |
|