Commit a0b48f7b authored by Adrià Labay
Browse files

do not fail whole process on an unreadable file

parent 68af6d43
......@@ -112,7 +112,7 @@ def test_guess_missing_page_info():
def test_extract_pages_no_image(app):
file_content = b'1701'
result = [result for result in extract_pages_from_file(file_content, 'notanimage.nopng')]
result = list(extract_pages_from_file(file_content, 'notanimage.nopng'))
assert len(result) == 1
image, page_info, file_info, number, total = result[0]
......@@ -122,6 +122,28 @@ def test_extract_pages_no_image(app):
assert total == 1
def test_extract_pages_corrupted_image(app):
    """Bytes that are not a valid image, named like a PNG, yield one error entry.

    The extraction is expected to produce exactly one result whose first
    element is an ``Exception`` instance, numbered 1 out of a total of 1.

    ``app`` is not used in the body; presumably it is a pytest fixture that
    sets up the application -- confirm against the test configuration.
    """
    corrupted_bytes = b'1701'
    pages = list(extract_pages_from_file(corrupted_bytes, 'corrupted_image.png'))
    assert len(pages) == 1

    extracted, _page_info, _file_info, page_number, page_count = pages[0]
    assert isinstance(extracted, Exception)
    assert page_number == 1
    assert page_count == 1
def test_extract_pages_corrupted_pdf(app):
    """Bytes that are not a valid PDF, named like a PDF, yield one error entry.

    The extraction is expected to produce exactly one result whose first
    element is an ``Exception`` instance, numbered 1 out of a total of 1.

    ``app`` is not used in the body; presumably it is a pytest fixture that
    sets up the application -- confirm against the test configuration.
    """
    corrupted_bytes = b'1701'
    pages = list(extract_pages_from_file(corrupted_bytes, 'corrupted_pdf.pdf'))
    assert len(pages) == 1

    extracted, _page_info, _file_info, page_number, page_count = pages[0]
    assert isinstance(extracted, Exception)
    assert page_number == 1
    assert page_count == 1
def test_extract_pages_from_zip(app):
with BytesIO() as zip_bytes:
with zipfile.ZipFile(zip_bytes, 'w') as z, \
......
......@@ -66,7 +66,7 @@ def extract_pages_from_file(file_path_or_buffer, file_info, dpi=300):
number : int
The number of files extracted so far.
total : int
The total number of file to extract
The total number of files to extract.
"""
file_infos = list(extract_images_or_infos_from_file(file_path_or_buffer, file_info, dpi, only_info=True))
final_total = len(file_infos)
......@@ -170,10 +170,13 @@ def extract_image_from_image(file_path_or_buffer, file_info, only_info=False):
Same as `extract_images_or_infos_from_file`.
"""
if not only_info:
with Image.open(file_path_or_buffer) as image:
image = exif_transpose(image)
image = convert_to_rgb(image)
yield image, file_info
try:
with Image.open(file_path_or_buffer) as image:
image = exif_transpose(image)
image = convert_to_rgb(image)
yield image, file_info
except Exception as e:
yield e, file_info
else:
yield file_info
......@@ -200,34 +203,47 @@ def extract_images_from_pdf(file_path_or_buffer, file_info=None, dpi=300, only_i
if file_info is None:
file_info = []
with Pdf.open(file_path_or_buffer) as pdf_reader:
number_of_pages = len(pdf_reader.pages)
use_wand = False
for page_number, page in enumerate(pdf_reader.pages, start=1):
# Only include page number in file_info if there are multiple pages
if number_of_pages > 1:
file_info_page = _combine_file_info(file_info, page_number)
else:
file_info_page = file_info
if not only_info:
if not use_wand:
try:
# Try to use PikePDF, but catch any error it raises
img = extract_image_pikepdf(page)
except (ValueError, AttributeError, NotImplementedError, PdfError):
# Fallback to Wand if extracting with PikePDF failed
use_wand = True
if use_wand:
img = extract_image_wand(page, dpi)
img = convert_to_rgb(img)
yield img, file_info_page
else:
yield file_info_page
try:
with Pdf.open(file_path_or_buffer) as pdf_reader:
number_of_pages = len(pdf_reader.pages)
use_wand = False
for page_number, page in enumerate(pdf_reader.pages, start=1):
# Only include page number in file_info if there are multiple pages
if number_of_pages > 1:
file_info_page = _combine_file_info(file_info, page_number)
else:
file_info_page = file_info
if not only_info:
if not use_wand:
try:
# Try to use PikePDF, but catch any error it raises
img = extract_image_pikepdf(page)
except (ValueError, AttributeError, NotImplementedError, PdfError):
# Fallback to Wand if extracting with PikePDF failed
use_wand = True
except Exception as e:
yield e, file_info_page
continue
if use_wand:
try:
img = extract_image_wand(page, dpi)
except Exception as e:
yield e, file_info_page
continue
img = convert_to_rgb(img)
yield img, file_info_page
else:
yield file_info_page
except Exception as e:
if only_info:
yield file_info
else:
yield e, file_info
def extract_image_pikepdf(page):
......
......@@ -87,17 +87,19 @@ def _process_scan(scan_id, exam_layout):
try:
for image, page_info, file_info, number, total in extract_pages_from_file(scan.path, scan.name):
report_progress(f'Processing page {number} / {total}')
if not isinstance(image, Image.Image):
failures.append((file_info, 'File is not an image'))
try:
success, description = process_page_function(
image, page_info, file_info, exam_config, output_directory
)
if not success:
failures.append((file_info, description))
except Exception as e:
report_error(f'Error processing {readable_filename(file_info)}: {e}')
return
if isinstance(image, Exception):
failures.append((file_info, str(image)))
elif not isinstance(image, Image.Image):
failures.append((file_info, 'File is not an image.'))
else:
try:
success, description = process_page_function(
image, page_info, file_info, exam_config, output_directory
)
if not success:
failures.append((file_info, description))
except Exception as e:
report_error(f'Error processing {readable_filename(file_info)}: {e}')
except Exception as e:
report_error(f"Failed to read file {scan.name}: {e}")
raise
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment