Code for How to Extract Images from PDF in Python Tutorial


View on GitHub

pdf_image_extractor_cli.py

import os
import fitz  # PyMuPDF
import io
from PIL import Image
import argparse

# Command-line tool: extract every embedded image from a PDF, keep the ones
# that meet a minimum size, and save them in the requested format.
parser = argparse.ArgumentParser(description="Extract images from a PDF file.")
parser.add_argument("file", help="PDF file to extract images from.")
parser.add_argument("-o", "--output", help="Output directory for the extracted images.", default="extracted_images")
parser.add_argument("-f", "--format", help="Desired output image format, default is PNG.", default="png")
parser.add_argument("-w", "--width", help="Minimum width for extracted images, default is 100.", default=100, type=int)
parser.add_argument("-he", "--height", help="Minimum height for extracted images, default is 100.", default=100, type=int)
# Parse the arguments
args = parser.parse_args()

# Output directory for the extracted images
output_dir = args.output
# Desired output image format (also used as the file extension)
output_format = args.format
# Minimum width and height for extracted images
min_width = args.width
min_height = args.height
# Create the output directory if it does not exist (no check-then-create race)
os.makedirs(output_dir, exist_ok=True)
# Translate the requested extension into the encoder name Pillow expects
# (e.g. "jpg" -> "JPEG"); fall back to the upper-cased value for anything
# that is not a registered extension, which is what was passed before.
save_format = Image.registered_extensions().get(
    f".{output_format.lower()}", output_format.upper()
)
# File path you want to extract images from
file = args.file
# Open the file
pdf_file = fitz.open(file)
try:
    # Iterate over PDF pages; pages are numbered from 1 in both the messages
    # and the output file names so the two always agree.
    for page_number, page in enumerate(pdf_file, start=1):
        # Get image list
        image_list = page.get_images(full=True)
        # Print the number of images found on this page
        if image_list:
            print(f"[+] Found a total of {len(image_list)} images in page {page_number}")
        else:
            print(f"[!] No images found on page {page_number}")
        # Iterate over the images on the page
        for image_index, img in enumerate(image_list, start=1):
            # The first entry of each image record is its XREF
            xref = img[0]
            # Extract the raw image bytes
            base_image = pdf_file.extract_image(xref)
            image_bytes = base_image["image"]
            # Load it to PIL
            image = Image.open(io.BytesIO(image_bytes))
            # Skip images below the minimum dimensions
            if image.width < min_width or image.height < min_height:
                print(f"[-] Skipping image {image_index} on page {page_number} due to its small size.")
                continue
            output_path = os.path.join(output_dir, f"image{page_number}_{image_index}.{output_format}")
            # Hand Pillow the path rather than an open file object so that it
            # opens and closes the output file itself (no leaked handles).
            image.save(output_path, format=save_format)
finally:
    # Always release the PDF document, even if extraction fails part-way
    pdf_file.close()

pdf_image_extractor.py

import os
import fitz  # PyMuPDF
import io
from PIL import Image

# Extract every embedded image from a PDF, keep the ones that meet a minimum
# size, and save them in the configured format.

# Output directory for the extracted images
output_dir = "extracted_images"
# Desired output image format (also used as the file extension)
output_format = "png"
# Minimum width and height for extracted images
min_width = 100
min_height = 100
# Create the output directory if it does not exist (no check-then-create race)
os.makedirs(output_dir, exist_ok=True)
# Translate the configured extension into the encoder name Pillow expects
# (e.g. "jpg" -> "JPEG"); fall back to the upper-cased value for anything
# that is not a registered extension, which is what was passed before.
save_format = Image.registered_extensions().get(
    f".{output_format.lower()}", output_format.upper()
)
# File path you want to extract images from
file = "1710.05006.pdf"
# Open the file
pdf_file = fitz.open(file)
try:
    # Iterate over PDF pages; pages are numbered from 1 in both the messages
    # and the output file names so the two always agree.
    for page_number, page in enumerate(pdf_file, start=1):
        # Get image list
        image_list = page.get_images(full=True)
        # Print the number of images found on this page
        if image_list:
            print(f"[+] Found a total of {len(image_list)} images in page {page_number}")
        else:
            print(f"[!] No images found on page {page_number}")
        # Iterate over the images on the page
        for image_index, img in enumerate(image_list, start=1):
            # The first entry of each image record is its XREF
            xref = img[0]
            # Extract the raw image bytes
            base_image = pdf_file.extract_image(xref)
            image_bytes = base_image["image"]
            # Load it to PIL
            image = Image.open(io.BytesIO(image_bytes))
            # Skip images below the minimum dimensions
            if image.width < min_width or image.height < min_height:
                print(f"[-] Skipping image {image_index} on page {page_number} due to its small size.")
                continue
            output_path = os.path.join(output_dir, f"image{page_number}_{image_index}.{output_format}")
            # Hand Pillow the path rather than an open file object so that it
            # opens and closes the output file itself (no leaked handles).
            image.save(output_path, format=save_format)
finally:
    # Always release the PDF document, even if extraction fails part-way
    pdf_file.close()