import os
import sys

import fitz


def extract_images_from_pdf(input_pdf: str, output_dir: str = "extracted_images"):
    """Extract every image referenced by a PDF's pages and write each to disk.

    Each image is saved inside ``output_dir`` as ``page<P>_img<I>.<ext>``,
    where ``P`` is the 1-based page number, ``I`` is the 1-based position of
    the image in that page's image list, and ``ext`` is the extension
    reported by ``extract_image``. An existing file with the same name is
    overwritten. Progress lines and a final summary are printed to stdout.

    Args:
        input_pdf: Path to the PDF file.
        output_dir: Directory to save extracted images; created if it does
            not exist. Defaults to ``"extracted_images"``.

    Returns:
        List of saved image file paths, in the order they were written
        (empty when the PDF contains no images).

    Raises:
        Nothing is caught here: any error raised while opening the PDF,
        extracting an image, or writing a file propagates to the caller.
    """
    saved_images = []

    # The context manager closes the document even when extraction or a file
    # write fails part-way; a trailing close() call would be skipped then.
    with fitz.open(input_pdf) as pdf:
        # Create the directory only once the PDF has opened, so a bad input
        # path does not leave an empty directory behind.
        os.makedirs(output_dir, exist_ok=True)

        print(f"Extracting images from: {input_pdf}")

        for page_num, page in enumerate(pdf, start=1):
            for img_index, img in enumerate(page.get_images(full=True), start=1):
                # The first item of each image entry is its xref number,
                # which is what extract_image() takes.
                xref = img[0]
                base_image = pdf.extract_image(xref)

                image_filename = f"page{page_num}_img{img_index}.{base_image['ext']}"
                output_path = os.path.join(output_dir, image_filename)

                with open(output_path, "wb") as img_file:
                    img_file.write(base_image["image"])

                saved_images.append(output_path)
                print(f"Saved: {output_path}")

    if saved_images:
        print(f"✅ Extracted {len(saved_images)} images to: {output_dir}")
    else:
        print("⚠️ No images found in the PDF.")

    return saved_images


if __name__ == "__main__":
    # Guarded so that importing this module does not start an extraction run.
    # Usage: script.py [input_pdf [output_dir]]
    # With no arguments the original sample PDF and the function's default
    # output directory are used.
    cli_args = sys.argv[1:]
    source_pdf = (
        cli_args[0] if cli_args else "../CaptionCreator/media/Jebin passport.pdf"
    )
    if len(cli_args) > 1:
        extract_images_from_pdf(source_pdf, cli_args[1])
    else:
        extract_images_from_pdf(source_pdf)