Commit b0ba9dd7 authored by Anton Akhmerov

Merge branch 'check-image-dimensions' into 'master'

Check image dimensions for image extraction

See merge request !162
parents 5dcde227 eac903d1
......@@ -7,6 +7,7 @@ from tempfile import NamedTemporaryFile
from flask import Flask
from io import BytesIO
import wand.image
from pikepdf import Pdf
from zesje.scans import decode_barcode, ExamMetadata, ExtractedBarcode
from zesje.database import db, _generate_exam_token
......@@ -267,6 +268,24 @@ def test_all_effects(
assert success is expected, reason
@pytest.mark.parametrize(
    'filename,expected',
    [
        ['blank-a4-2pages.pdf', AttributeError],
        ['single-image-a4.pdf', ValueError],
        ['two-images-a4.pdf', ValueError],
        ['flattened-a4-2pages.pdf', None],
    ],
    ids=['blank pdf', 'single image', 'two images', 'flattened pdf'])
def test_image_extraction_pike(datadir, filename, expected):
    """Run ``scans.extract_image_pikepdf`` on every page of a sample PDF.

    ``expected`` is either the exception type each page must raise, or
    ``None`` when extraction must succeed and return an image.
    """
    pdf_path = os.path.join(datadir, filename)

    with Pdf.open(pdf_path) as pdf_reader:
        page_count = len(pdf_reader.pages)
        for pagenr in range(page_count):
            if expected is None:
                # Success case: an image object must come back.
                img = scans.extract_image_pikepdf(pagenr, pdf_reader)
                assert img is not None
                continue

            # Failure case: every page must raise the expected error.
            with pytest.raises(expected):
                scans.extract_image_pikepdf(pagenr, pdf_reader)
@pytest.mark.parametrize('filename', [
'blank-a4-2pages.pdf',
'flattened-a4-2pages.pdf'],
......
......@@ -165,9 +165,15 @@ def extract_image_pikepdf(pagenr, reader):
"""Extracts an image as an array from the designated page
This method uses PikePDF to extract the image and only works
when there is a single image present on the page.
when there is a single image present on the page with the
same aspect ratio as the page.
Raises an error if not exactly one image is found on the page.
We do not check for the actual size of the image on the page,
since this size depends on the draw instruction rather than
the embedded image object available to pikepdf.
Raises an error if not exactly one image is present or the image
does not have the same aspect ratio as the page.
Parameters
----------
......@@ -183,7 +189,11 @@ def extract_image_pikepdf(pagenr, reader):
Raises
------
ValueError if not exactly one image is found on the page
ValueError
if not exactly one image is found on the page or the image
does not have the same aspect ratio as the page
AttributeError
if no XObject or MediaBox is present on the page
"""
page = reader.pages[pagenr]
......@@ -192,11 +202,23 @@ def extract_image_pikepdf(pagenr, reader):
if sum((xObject[obj].Subtype == '/Image')
for obj in xObject) != 1:
raise ValueError
raise ValueError('Not exactly 1 image present on the page')
for obj in xObject:
if xObject[obj].Subtype == '/Image':
pdfimage = PdfImage(xObject[obj])
pdf_width = float(page.MediaBox[2] - page.MediaBox[0])
pdf_height = float(page.MediaBox[3] - page.MediaBox[1])
ratio_width = pdfimage.width / pdf_width
ratio_height = pdfimage.height / pdf_height
# Check if the aspect ratio of the image is the same as the
# aspect ratio of the page up to a 3% relative error
if abs(ratio_width - ratio_height) > 0.03 * ratio_width:
raise ValueError('Image has incorrect dimensions')
return pdfimage.as_pil_image()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment