Tools / PDF /reduce_pdf_size.py
jebin2's picture
new tool
9c32a2c
raw
history blame
1.37 kB
import fitz # PyMuPDF
import os
def extract_images_from_pdf(input_pdf: str, output_dir: str = "extracted_images"):
    """
    Extract all images from a PDF and save them as individual image files.

    Every image referenced by a page is written to ``output_dir`` as
    ``page<P>_img<I>.<ext>``, where ``P`` and ``I`` are 1-based page and
    per-page image indices and ``ext`` is the extension reported by PyMuPDF
    for that image. Progress messages are printed to stdout.

    Args:
        input_pdf (str): Path to the PDF file.
        output_dir (str): Directory to save extracted images. It is created
            if it does not exist. Default is 'extracted_images'.

    Returns:
        List of saved image file paths, in page order and then image order.
    """
    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    saved_images = []

    # Open the PDF as a context manager so the document is closed even when
    # extraction or writing an image file raises part-way through.
    with fitz.open(input_pdf) as pdf:
        print(f"Extracting images from: {input_pdf}")

        for page_num, page in enumerate(pdf, start=1):
            images = page.get_images(full=True)

            for img_index, img in enumerate(images, start=1):
                # The first item of each image entry is its xref number,
                # which is what extract_image() expects.
                xref = img[0]
                base_image = pdf.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]

                image_filename = f"page{page_num}_img{img_index}.{image_ext}"
                output_path = os.path.join(output_dir, image_filename)

                with open(output_path, "wb") as img_file:
                    img_file.write(image_bytes)

                saved_images.append(output_path)
                print(f"Saved: {output_path}")

    if saved_images:
        print(f"✅ Extracted {len(saved_images)} images to: {output_dir}")
    else:
        print("⚠️ No images found in the PDF.")

    return saved_images
# Example usage: runs only when this file is executed as a script, so that
# importing the module for its function does not trigger an extraction
# against a hard-coded local path.
if __name__ == "__main__":
    extract_images_from_pdf("../CaptionCreator/media/Jebin passport.pdf")