From 15872471659f90d2396164e8fa358a09c190e26a Mon Sep 17 00:00:00 2001 From: Anton Akhmerov <anton.akhmerov@gmail.com> Date: Sat, 17 Mar 2018 23:21:52 +0100 Subject: [PATCH] Rewrite PDF processing to avoid temporary files * Use PyPDF2 (new dependency) to loop over images attached to a PDF this allows to detect and raise an error whenever there's >1 image/page (we cannot treat this correctly anyway) * rewrite helper functions to deal with image objects rather than reading from disk * This also means we don't need pdfimages anymore --- requirements.txt | 1 + zesje/helpers/pdf_helper.py | 167 +++++++++++++++++++----------------- 2 files changed, 89 insertions(+), 79 deletions(-) diff --git a/requirements.txt b/requirements.txt index 02d97ad3c..c84ade2ae 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,4 @@ numpy scipy pandas pyyaml +PyPDF2 diff --git a/zesje/helpers/pdf_helper.py b/zesje/helpers/pdf_helper.py index b8e39eb87..08b7d5153 100644 --- a/zesje/helpers/pdf_helper.py +++ b/zesje/helpers/pdf_helper.py @@ -2,16 +2,15 @@ import os from collections import namedtuple, ChainMap import functools import itertools -import subprocess import argparse -import shutil -import tempfile -import contextlib +from io import BytesIO import numpy as np import pandas import cv2 import zbar +from PIL import Image +import PyPDF2 from pony import orm @@ -52,7 +51,6 @@ def process_pdf(pdf_id, data_directory): config_path = os.path.join(data_directory, pdf.exam.yaml_path) output_directory = os.path.join(data_directory, pdf.exam.name + '_data') - try: # Read in exam metadata config = ExamMetadata(*yaml_helper.parse(yaml_helper.read(config_path))) @@ -60,43 +58,77 @@ def process_pdf(pdf_id, data_directory): report_error(f'Error while reading Exam metadata: {e}') raise - with make_temp_directory() as tmpdir: - # Extract pages as images - report_progress('Extracting pages') - try: - images = pdf_to_images(pdf_path, tmpdir) - except Exception as e: - report_error(f'Error while extracting pages: {e}') - raise - - # Process individual pages, ensuring we report the page numbers - # starting from 1. - failures = [] - for i, image in enumerate(images, 1): - report_progress(f'Processing page {i} / {len(images)}') + total = PyPDF2.PdfFileReader(open(pdf_path, "rb")).getNumPages() + failures = [] + try: + for image, page in extract_images(pdf_path): + report_progress(f'Processing page {page} / {total}') try: success, reason = process_page(output_directory, image, config) if not success: print(reason) - failures.append(images) + failures.append(page) except Exception as e: - report_error(f'Error processing page {i}: {e}') + report_error(f'Error processing page {page}: {e}') return + except Exception as e: + report_error(f"Failed to read pdf: {e}") + raise - if failures: - processed = len(images) - len(failures) - # images are named like '-nnnnn.jpg' - failures = [1 + int(os.path.basename(im)[1:-len('.jpg')]) - for im in failures] - report_error( - f'Processed {processed} / {len(images)} pages. ' - f'Failed on pages: {failures}' - ) - else: - report_success(f'processed {len(images)} pages') + if failures: + processed = total - len(failures) + report_error( + f'Processed {processed} / {total} pages. ' + f'Failed on pages: {failures}' + ) + else: + report_success(f'processed {total} pages') -def process_page(output_dir, image, exam_config): +def extract_images(filename): + """Yield all images from a PDF file. + + Adapted from https://stackoverflow.com/a/34116472/2217463 + + We raise if there are > 1 images / page + """ + reader = PyPDF2.PdfFileReader(open(filename, "rb")) + total = reader.getNumPages() + for pagenr in range(total): + page = reader.getPage(pagenr) + xObject = page['/Resources']['/XObject'].getObject() + + if sum((xObject[obj]['/Subtype'] == '/Image') + for obj in xObject) > 1: + raise RuntimeError('Page {pagenr} contains more than 1 image,' + 'likely not a scan') + + for obj in xObject: + if xObject[obj]['/Subtype'] == '/Image': + data = xObject[obj].getData() + filter = xObject[obj]['/Filter'] + + if filter == '/FlateDecode': + size = (xObject[obj]['/Width'], xObject[obj]['/Height']) + if xObject[obj]['/ColorSpace'] == '/DeviceRGB': + mode = "RGB" + else: + mode = "P" + img = Image.frombytes(mode, size, data) + else: + img = Image.open(BytesIO(data)) + + yield img, pagenr+1 + + +def write_pdf_status(pdf_id, status, message): + with orm.db_session: + pdf = PDF[pdf_id] + pdf.status = status + pdf.message = message + + +def process_page(output_dir, image_data, exam_config): """Incorporate a scanned image in the data structure. For each page perform the following steps: @@ -112,8 +144,7 @@ def process_page(output_dir, image, exam_config): ---------- output_dir : string Path where the processed image must be stored. - image : string - Path to the original + image_data : PIL Image exam_config : ExamMetadata instance Information about the exam to which this page should belong @@ -132,7 +163,9 @@ def process_page(output_dir, image, exam_config): Because the failure of steps 1-3 likely means we're reading a wrong pdf, we should stop processing any other pages if this function raises. """ - qr_data = extract_qr(image) + ext = image_data.format + image_data.load() + qr_data = extract_qr(image_data) if qr_data is None: return False, "Reading QR code failed" @@ -143,18 +176,17 @@ def process_page(output_dir, image, exam_config): qr_coords, widget_data = exam_config.qr_coords, exam_config.widget_data - rotate_and_shift(image, qr_data, qr_coords) + image_data = rotate_and_shift(image_data, qr_data, qr_coords) sub_nr = qr_data.sub_nr with orm.db_session: exam = Exam.get(name=qr_data.name) sub = Submission.get(copy_number=sub_nr, exam=exam) \ or Submission(copy_number=sub_nr, exam=exam) - _, ext = os.path.splitext(image) target = os.path.join(output_dir, f'{qr_data.name}_{sub_nr}') os.makedirs(target, exist_ok=True) - target_image = os.path.join(target, f'page{qr_data.page}{ext}') - shutil.move(image, target_image) + target_image = os.path.join(target, f'page{qr_data.page}.jpg') + image_data.save(target_image) # We may have added this page in previous uploads; the above # 'rename' then overwrites the previosly uploaded page, but # we only want a single 'Page' entry. @@ -182,38 +214,12 @@ def process_page(output_dir, image, exam_config): return True, '' -def pdf_to_images(pdf_path, output_path): - """Extract all images out of a pdf file.""" - # We convert everything to jpeg, which may be suboptimal, however some - # formats recognized by pdfimages aren't understood by opencv. - subprocess.run(['pdfimages', '-j', pdf_path, output_path]) - return sorted(os.path.join(output_path, f) - for f in os.listdir(output_path) - if f.endswith('.jpg')) - - -def write_pdf_status(pdf_id, status, message): - with orm.db_session: - pdf = PDF[pdf_id] - pdf.status = status - pdf.message = message - - -@contextlib.contextmanager -def make_temp_directory(): - temp_dir = tempfile.mkdtemp() - if not temp_dir.endswith('/'): - temp_dir += '/' - try: - yield temp_dir - finally: - shutil.rmtree(temp_dir) - - -def extract_qr(image_path): - image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE) - if image.shape[0] < image.shape[1]: - image = image.T +def extract_qr(image_data): + """Extract a QR code from a PIL Image.""" + grayscale = np.asarray(image_data.convert(mode='L')) + # Apply portrait orientation. + if grayscale.shape[0] < grayscale.shape[1]: + grayscale = grayscale.T # Empirically we observed that the most important parameter # for zbar to successfully read a qr code is the resolution @@ -221,9 +227,11 @@ def extract_qr(image_path): # zbar also seems to care about image orientation, hence # the loop over dirx/diry. for dirx, diry, factor in itertools.product([1, -1], [1, -1], [8, 5, 4, 3]): - flipped = image[::factor * dirx, ::factor * diry] + scaled = grayscale[::factor * dirx, ::factor * diry] scanner = zbar.Scanner() - results = scanner.scan(flipped) + results = scanner.scan(scaled) + if len(results) > 1: + raise RuntimeError("Found > 1 QR code on the page.") if results: try: version, name, page, copy = \ @@ -232,10 +240,10 @@ def extract_qr(image_path): return coords = np.array(results[0].position) # zbar doesn't respect array ordering! - if not np.isfortran(flipped): + if not np.isfortran(scaled): coords = coords[:, ::-1] coords *= [factor * dirx, factor * diry] - coords %= image.shape + coords %= grayscale.shape return ExtractedQR(version, name, int(page), int(copy), coords) else: return @@ -247,9 +255,10 @@ def guess_dpi(image_array): return resolutions[np.argmin(abs(resolutions - 25.4 * h / 297))] -def rotate_and_shift(image_path, extracted_qr, qr_coords): +def rotate_and_shift(image_data, extracted_qr, qr_coords): + """Roll the image such that QR occupies coords specified by the template.""" page, position = extracted_qr.page, extracted_qr.coords - image = cv2.imread(image_path) + image = np.asarray(image_data) if image.shape[0] < image.shape[1]: image = np.transpose(image, (1, 0, 2)) @@ -271,7 +280,7 @@ def rotate_and_shift(image_path, extracted_qr, qr_coords): shifted_image = np.roll(image, shift[0], axis=0) shifted_image = np.roll(shifted_image, shift[1], axis=1) - cv2.imwrite(image_path, shifted_image) + return Image.fromarray(shifted_image) def get_student_number(image_path, widget): -- GitLab