From d8e2d9403780bac65de61cccaf2755ddbb77a8f1 Mon Sep 17 00:00:00 2001
From: Ruben Young On <r.d.youngon@student.tudelft.nl>
Date: Wed, 5 Jun 2019 22:20:37 +0200
Subject: [PATCH] PDFMiner is now used instead of PyMuPDF

---
 requirements.txt |  2 +-
 zesje/scans.py   | 75 ++++++++++++++++++++++++++++++++++++------------
 2 files changed, 57 insertions(+), 20 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 746660a0a..3645dc76a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -28,7 +28,7 @@ pyStrich # TODO: can we replace this with stuff from pylibdmtx?
 opencv-python
 git+https://github.com/mstamy2/PyPDF2
 pylibdmtx
-pymupdf
+pdfminer3
 
 # Exporting
 pandas
diff --git a/zesje/scans.py b/zesje/scans.py
index e8708dc58..f2ae277db 100644
--- a/zesje/scans.py
+++ b/zesje/scans.py
@@ -1,6 +1,5 @@
 import functools
 import itertools
-import fitz
 import math
 import os
 from collections import namedtuple, Counter
@@ -10,6 +9,8 @@ import signal
 import cv2
 import numpy as np
 import PyPDF2
+
+import pdfminer3
 from PIL import Image
 from wand.image import Image as WandImage
 from pylibdmtx import pylibdmtx
@@ -19,6 +20,14 @@ from .datamatrix import decode_raw_datamatrix
 from .images import guess_dpi, get_box
 from .factory import make_celery
 
+from pdfminer3.pdfparser import PDFParser
+from pdfminer3.pdfdocument import PDFDocument
+from pdfminer3.pdfpage import PDFPage
+from pdfminer3.pdfinterp import PDFResourceManager
+from pdfminer3.pdfinterp import PDFPageInterpreter
+from pdfminer3.layout import LAParams
+from pdfminer3.converter import PDFPageAggregator
+
 from flask import current_app
 
 ExtractedBarcode = namedtuple('ExtractedBarcode', ['token', 'copy', 'page'])
@@ -127,6 +136,31 @@ def exam_metadata(exam_id):
     )
 
 
+def parse_obj(lt_objs):
+    """
+    Returns all horizontal text boxes from a pdf page as one flat list.
+
+    Parameters
+    ----------
+    lt_objs : The list of layout objects in the page; figures are searched recursively.
+
+    Returns
+    -------
+    A flat list of tuples with the (x0, y0, x1, y1, text) values.
+
+    """
+    res = []
+
+    for obj in lt_objs:
+        if isinstance(obj, pdfminer3.layout.LTTextBoxHorizontal):
+            res.append((obj.bbox[0], obj.bbox[1], obj.bbox[2], obj.bbox[3], obj.get_text()))
+
+        elif isinstance(obj, pdfminer3.layout.LTFigure):
+            res.extend(parse_obj(obj._objs))  # extend keeps the result flat: callers index tuples
+
+    return res
+
+
 def get_question_title(problem):
     """
     Returns the question title of a problem
@@ -139,32 +173,35 @@ def get_question_title(problem):
     width = problem.widget.width
     height = problem.widget.height
 
-    # Loads the pdf page
-    doc = fitz.open(pdf_path)
+    with open(pdf_path, 'rb') as fp:  # the handle is released on every return path
 
-    page = doc.loadPage(problem.widget.page)
-    words = page.getTextWords()
+        parser = PDFParser(fp)
+        document = PDFDocument(parser)
+        rsrcmgr = PDFResourceManager()
+        laparams = LAParams()
+        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
+        interpreter = PDFPageInterpreter(rsrcmgr, device)
 
-    # Check if no words are found
-    if len(words) == 0:
-        return "Empty"
+        for page in PDFPage.create_pages(document):
+            interpreter.process_page(page)
+            layout = device.get_result()
 
-    # Finds the text in the problem widget
-    filtered_words = [word for word in words
-                      if word[1] > y and word[3] < y + height
-                      and word[0] > x and word[2] < x + width]
+            if layout.pageid == problem.widget.page + 1:
+                res = parse_obj(layout._objs)
 
-    min_y = min(word[1] for word in filtered_words)
+                filtered_words = [word[4] for word in res  # NOTE(review): 842 looks like the A4 height in pt
+                                  if word[1] < 842 - y and word[3] > 842 - (y + height)
+                                  and word[0] > x and word[2] < x + width]
 
-    margin = 2  # pts
-    first_line = [word for word in filtered_words if abs(word[1] - min_y) < margin]
+                if not filtered_words:
+                    return ''
 
-    problem_title = ''
+                right_line = filtered_words[0]
 
-    for word in first_line:
-        problem_title += ' ' + word[4]
+                lines = right_line.split('\n')
+                return lines[0]
 
-    return problem_title[1:]
+    return ''
 
 
 def extract_images(filename):
-- 
GitLab