OpScanIA

Sleeping

App Files Files Community

axiilay committed on Oct 20

Commit

73f54ee

1 Parent(s): 94fd0fd

display the output properly

Browse files

Files changed (1) hide show

app.py +59 -16

app.py CHANGED Viewed

@@ -20,14 +20,22 @@ model = model.eval()
 @spaces.GPU
 def process_image(image, model_size, task_type):
     """
-    Process image with DeepSeek-OCR
     Args:
         image: PIL Image or file path
         model_size: Model size configuration
         task_type: OCR task type
     """
-    # 在 GPU 函数内部移动模型到 GPU
     model_gpu = model.cuda().to(torch.bfloat16)
     # Create temporary directory for output
@@ -60,7 +68,7 @@ def process_image(image, model_size, task_type):
         config = size_configs.get(model_size, size_configs["Gundam (Recommended)"])
         # Run inference
-        result = model_gpu.infer(
             tokenizer,
             prompt=prompt,
             image_file=temp_image_path,
@@ -68,17 +76,34 @@ def process_image(image, model_size, task_type):
             base_size=config["base_size"],
             image_size=config["image_size"],
             crop_mode=config["crop_mode"],
-            save_results=True,
             test_compress=True,
-            eval_mode=True,
         )
-        print(f"====\nresult: {result}\n====\n")
-        return result
 # Create Gradio interface
-with gr.Blocks(title="DeepSeek-OCR") as demo:
     gr.Markdown(
         """
         # DeepSeek-OCR Document Recognition
@@ -96,7 +121,7 @@ with gr.Blocks(title="DeepSeek-OCR") as demo:
     )
     with gr.Row():
-        with gr.Column():
             image_input = gr.Image(
                 type="pil", label="Upload Image", sources=["upload", "clipboard"]
             )
@@ -113,12 +138,29 @@ with gr.Blocks(title="DeepSeek-OCR") as demo:
                 label="Task Type",
             )
             submit_btn = gr.Button("Process Image", variant="primary")
-        with gr.Column():
-            output_text = gr.Textbox(
-                label="OCR Result", lines=20, show_copy_button=True
-            )
     # Examples
     gr.Examples(
@@ -127,18 +169,19 @@ with gr.Blocks(title="DeepSeek-OCR") as demo:
             ["examples/receipt.jpg", "Base", "Free OCR"],
         ],
         inputs=[image_input, model_size, task_type],
-        outputs=output_text,
         fn=process_image,
         cache_examples=False,
     )
     submit_btn.click(
         fn=process_image,
-        inputs=[image_input, model_size, task_type],
-        outputs=output_text,
     )
 # Launch the app
 if __name__ == "__main__":
     demo.queue(max_size=20)
     demo.launch()

 @spaces.GPU
 def process_image(image, model_size, task_type):
     """
+    Process image with DeepSeek-OCR and return multiple output formats.
     Args:
         image: PIL Image or file path
         model_size: Model size configuration
         task_type: OCR task type
+    Returns:
+        A tuple containing:
+        - Path to the image with bounding boxes.
+        - The content of the markdown result file.
+        - The plain text OCR result.
     """
+    if image is None:
+        return None, "Please upload an image first.", "Please upload an image first."
     model_gpu = model.cuda().to(torch.bfloat16)
     # Create temporary directory for output
         config = size_configs.get(model_size, size_configs["Gundam (Recommended)"])
         # Run inference
+        plain_text_result = model_gpu.infer(
             tokenizer,
             prompt=prompt,
             image_file=temp_image_path,
             base_size=config["base_size"],
             image_size=config["image_size"],
             crop_mode=config["crop_mode"],
+            save_results=True,  # Ensure results are saved to disk
             test_compress=True,
+            # eval_mode=True,
+        )
+        # Define paths for the generated files
+        image_result_path = os.path.join(output_path, "result_with_boxes.jpg")
+        markdown_result_path = os.path.join(output_path, "result.mmd")
+        # Read the markdown file content if it exists
+        markdown_content = ""
+        if os.path.exists(markdown_result_path):
+            with open(markdown_result_path, "r", encoding="utf-8") as f:
+                markdown_content = f.read()
+        else:
+            markdown_content = "Markdown result was not generated. This is expected for 'Free OCR' task."
+        # Check if the annotated image exists
+        final_image_path = (
+            image_result_path if os.path.exists(image_result_path) else None
         )
+        # Return all three results. Gradio will handle the temporary file path for the image.
+        return final_image_path, markdown_content, plain_text_result
 # Create Gradio interface
+with gr.Blocks(title="DeepSeek-OCR", theme=gr.themes.Soft()) as demo:
     gr.Markdown(
         """
         # DeepSeek-OCR Document Recognition
     )
     with gr.Row():
+        with gr.Column(scale=1):
             image_input = gr.Image(
                 type="pil", label="Upload Image", sources=["upload", "clipboard"]
             )
                 label="Task Type",
             )
+            eval_mode_checkbox = gr.Checkbox(
+                value=False,
+                label="Enable Evaluation Mode",
+                info="Returns only plain text, but might be faster. Uncheck to get annotated image and markdown.",
+            )
             submit_btn = gr.Button("Process Image", variant="primary")
+        with gr.Column(scale=2):
+            with gr.Tabs():
+                with gr.TabItem("Annotated Image"):
+                    output_image = gr.Image(
+                        label="Result with Bounding Boxes", interactive=False
+                    )
+                with gr.TabItem("Markdown Output"):
+                    output_markdown = gr.Markdown(label="Markdown Formatted Result")
+                with gr.TabItem("Plain Text"):
+                    output_text = gr.Textbox(
+                        label="OCR Result (eval_mode == True)",
+                        lines=20,
+                        show_copy_button=True,
+                        interactive=False,
+                    )
     # Examples
     gr.Examples(
             ["examples/receipt.jpg", "Base", "Free OCR"],
         ],
         inputs=[image_input, model_size, task_type],
+        outputs=[output_image, output_markdown, output_text, eval_mode_checkbox],
         fn=process_image,
         cache_examples=False,
     )
     submit_btn.click(
         fn=process_image,
+        inputs=[image_input, model_size, task_type, eval_mode_checkbox],
+        outputs=[output_image, output_markdown, output_text],
     )
 # Launch the app
 if __name__ == "__main__":
     demo.queue(max_size=20)
     demo.launch()