# convert_pdf2image.py
import fitz
from typing import Tuple
import os
def convert_pdf2img(input_file: str, pages: Tuple = None):
    """Render pages of a PDF to PNG images, one output file per page.

    Each selected page is rasterised with a zoom factor of 2 and written to
    the current working directory as ``<input basename>_page<N>.png``, where
    ``N`` is the 1-based page number. A short summary is printed to stdout.

    Args:
        input_file: Path of the PDF document to open.
        pages: Zero-based page indices to convert. ``None`` (the default)
            converts every page; a single integer is accepted as shorthand
            for a one-element tuple. Indices that do not exist in the
            document are ignored.

    Returns:
        The list of PNG file names written, in page order.
    """
    # Normalise the selection into a set of ints once so that membership is
    # an exact match. Comparing string forms (e.g. "1" in "(10, 11)") would
    # wrongly select page 1 when only pages 10 and 11 were requested.
    if pages is None:
        wanted = None
    elif isinstance(pages, int):
        wanted = {pages}
    else:
        wanted = {int(p) for p in pages}

    # Zoom factor 2 = twice the default resolution: text stays clear while
    # the output files remain small. Higher factors (4, 8) make text inside
    # embedded images more readable at the cost of much larger files.
    zoom_x = 2
    zoom_y = 2
    # Pure scaling matrix; no rotation is applied.
    mat = fitz.Matrix(zoom_x, zoom_y)

    # The output name prefix does not depend on the page, so compute it once.
    stem = os.path.splitext(os.path.basename(input_file))[0]

    output_files = []
    pdf_in = fitz.open(input_file)
    try:
        # len(document) is the page count and is independent of the
        # pageCount -> page_count rename in PyMuPDF.
        for pg in range(len(pdf_in)):
            if wanted is not None and pg not in wanted:
                continue
            page = pdf_in[pg]
            # PyMuPDF renamed its camelCase API to snake_case and later
            # removed the old names; prefer the new name and fall back to
            # the legacy one so both old and new installations work.
            render = getattr(page, "get_pixmap", None) or page.getPixmap
            pix = render(matrix=mat, alpha=False)
            output_file = f"{stem}_page{pg + 1}.png"
            save = getattr(pix, "save", None) or pix.writePNG
            save(output_file)
            output_files.append(output_file)
    finally:
        # Always release the document, even if rendering or saving fails.
        pdf_in.close()

    summary = {
        "File": input_file,
        "Pages": str(pages),
        "Output File(s)": str(output_files),
    }
    # Printing Summary
    print("## Summary ########################################################")
    print("\n".join("{}:{}".format(i, j) for i, j in summary.items()))
    print("###################################################################")
    return output_files
if __name__ == "__main__":
    import sys

    # Exit with a usage hint instead of an IndexError traceback when the
    # PDF path argument is missing.
    if len(sys.argv) < 2:
        sys.exit(f"Usage: {sys.argv[0]} <input.pdf>")
    # Convert every page of the PDF given as the first argument.
    convert_pdf2img(sys.argv[1])