Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
zesje
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Réouven ASSOULY
zesje
Commits
0b68654a
Commit
0b68654a
authored
7 years ago
by
Joseph Weston
Browse files
Options
Downloads
Patches
Plain Diff
process PDFs in a subprocess
parent
3004f066
Branches
Branches containing commit
Tags
Tags containing commit
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
zesje/helpers/pdf_helper.py
+246
-0
246 additions, 0 deletions
zesje/helpers/pdf_helper.py
zesje/resources/pdfs.py
+8
-2
8 additions, 2 deletions
zesje/resources/pdfs.py
with
254 additions
and
2 deletions
zesje/helpers/pdf_helper.py
0 → 100644
+
246
−
0
View file @
0b68654a
import
os
from
collections
import
namedtuple
,
ChainMap
import
functools
import
itertools
import
subprocess
import
argparse
import
shutil
import
tempfile
import
contextlib
import
numpy
as
np
import
pandas
import
cv2
import
zbar
from
flask
import
current_app
as
app
from
pony
import
orm
from
.
import
yaml_helper
from
..models
import
db
,
PDF
,
Exam
,
Problem
,
Page
,
Student
,
Submission
,
Solution
# Payload decoded from a page's QR code (exam name, page number and copy
# number) together with the coordinates at which the code was found.
ExtractedQR = namedtuple('ExtractedQR', 'name page sub_nr coords')

# The values unpacked from a parsed exam YAML file, in the order that
# ``yaml_helper.parse`` yields them.
ExamMetadata = namedtuple(
    'ExamMetadata',
    'version exam_name qr_coords widget_data',
)
def process_pdf(pdf_id):
    """Process a PDF, recording progress to a database

    This *must* be called from a subprocess of the
    Flask process, so that we inherit the bound DB instance
    and the app config.

    Every stage reports its state through ``write_pdf_status``.  A fatal
    problem is recorded with status 'error' before an exception leaves
    this function.  A page that cannot be processed does not abort the
    run: it is collected and listed in the final status message.

    Parameters
    ----------
    pdf_id : int
        The ID in the database of the PDF to process

    Raises
    ------
    RuntimeError
        If a QR code's version does not match the exam config, or if a
        QR code names a different exam than the one in the config.
    Exception
        Whatever reading the exam metadata, extracting the pages, or
        decoding the QR codes raised (re-raised after being reported).
    """
    data_directory = app.config['DATA_DIRECTORY']

    report_error = functools.partial(write_pdf_status, pdf_id, 'error')
    report_progress = functools.partial(write_pdf_status, pdf_id,
                                        'processing')
    report_success = functools.partial(write_pdf_status, pdf_id, 'success')

    # Only the path computation needs the entity; everything afterwards
    # works with plain strings.
    with orm.db_session:
        pdf = PDF[pdf_id]

        # TODO: paths
        pdf_filename = f'{pdf.id}.pdf'
        pdf_path = os.path.join(data_directory, 'pdfs', pdf_filename)
        config_path = os.path.join(data_directory, pdf.exam.yaml_path)
        output_directory = os.path.join(data_directory,
                                        pdf.exam.name + '_data')

    try:
        # Read in exam metadata
        config = ExamMetadata(
            *yaml_helper.parse(yaml_helper.read(config_path)))
    except Exception as e:
        report_error(f'Error while reading Exam metadata: {e}')
        raise

    with make_temp_directory() as tmpdir:
        # Extract pages as images
        report_progress('Extracting pages')
        try:
            images = pdf_to_images(pdf_path, tmpdir)
        except Exception as e:
            report_error(f'Error while extracting pages: {e}')
            raise

        # Extract QR codes.
        report_progress('Extracting page metadata')
        try:
            extracted_qrs = [extract_qr(image, config.version)
                             for image in images]
        except RuntimeError:
            report_error('Zesje version mismatch between config file and PDF')
            raise
        except Exception as e:
            report_error(f'Error while extracting QR codes: {e}')
            raise

        # Pages without a readable QR code (None) are skipped here and
        # counted as failures further down.
        if any(qr[0] != config.exam_name
               for qr in extracted_qrs if qr is not None):
            message = 'PDF is not from this exam'
            report_error(message)
            # There is no active exception at this point, so a bare
            # 'raise' would only produce "No active exception to
            # reraise"; raise an explicit error of the same type.
            raise RuntimeError(message)

        # Process individual pages
        failures = []
        # Count from 1 so that the last message reads 'N / N'.
        for i, (image, qr) in enumerate(zip(images, extracted_qrs), start=1):
            report_progress(f'Processing page {i} / {len(images)}')
            if qr is None:
                failures.append(image)
                continue
            try:
                process_page(output_directory, image, qr, config)
            except Exception as e:
                # Best effort: note the page and carry on with the rest.
                print(image, e)
                failures.append(image)

        if failures:
            processed = len(images) - len(failures)
            # images are named like '-nnnnn.jpg'
            failures = [int(os.path.basename(im)[1:-4]) for im in failures]
            report_error(
                f'Processed {processed} / {len(images)} pages. '
                f'Failed on pages: {failures}'
            )
        else:
            report_success(f'processed {len(images)} pages')
def process_page(output_dir, image, qr_data, exam_config):
    """Store one scanned page and register it in the database.

    The image is first aligned in place with ``rotate_and_shift`` and
    then moved to ``<output_dir>/<exam name>_<copy number>/page<N><ext>``.
    A ``Submission`` for the copy and a ``Page`` for the image are created
    when they do not exist yet, and every widget on the page gets its
    database entry updated or created.

    Parameters
    ----------
    output_dir : str
        Directory below which the per-submission directories live.
    image : str
        Path of the page image; the file is moved away from this path.
    qr_data : ExtractedQR
        Data decoded from the QR code on the page; must not be None.
    exam_config : ExamMetadata
        Exam configuration; ``qr_coords`` and ``widget_data`` are used.
        ``widget_data`` is filtered on its ``page`` column and addressed
        through ``.index`` / ``.loc`` (presumably a pandas DataFrame
        indexed by widget name -- confirm against ``yaml_helper``).
    """
    assert qr_data is not None
    qr_coords, widget_data = exam_config.qr_coords, exam_config.widget_data

    rotate_and_shift(image, qr_data, qr_coords)
    sub_nr = qr_data.sub_nr

    with orm.db_session:
        exam = Exam.get(name=qr_data.name)
        sub = (Submission.get(copy_number=sub_nr, exam=exam)
               or Submission(copy_number=sub_nr, exam=exam))

        _, ext = os.path.splitext(image)
        target = os.path.join(output_dir, f'{qr_data.name}_{sub_nr}')
        os.makedirs(target, exist_ok=True)
        target_image = os.path.join(target, f'page{qr_data.page}{ext}')
        # The image usually sits in a temporary directory, which may be on
        # another filesystem than the data directory; 'os.rename' fails
        # there, whereas 'shutil.move' falls back to copy + delete.
        shutil.move(image, target_image)

        # We may have added this page in previous uploads; the above
        # move then overwrites the previously uploaded page, but
        # we only want a single 'Page' entry.
        if Page.get(path=target_image, submission=sub) is None:
            Page(path=target_image, submission=sub)

        widgets_on_page = widget_data[widget_data.page == qr_data.page]
        for problem in widgets_on_page.index:
            if problem == 'studentnr':
                sub.signature_image_path = 'None'
                try:
                    number_widget = widgets_on_page.loc['studentnr']
                    number = get_student_number(target_image, number_widget)
                    sub.student = Student.get(id=int(number))
                except Exception:
                    # Deliberately best effort: the submission is kept
                    # even when the student number cannot be extracted.
                    pass
            else:
                prob = Problem.get(name=problem, exam=exam)
                sol = Solution.get(problem=prob, submission=sub)
                if sol:
                    sol.image_path = 'None'
                else:
                    Solution(problem=prob, submission=sub, image_path='None')
def pdf_to_images(pdf_path, output_path):
    """Extract all images out of a pdf file.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF handed to ``pdfimages``.
    output_path : str
        Passed to ``pdfimages`` as the image root and afterwards listed
        as a directory, so it has to name a directory and end with a
        path separator.

    Returns
    -------
    list of str
        Sorted paths of the '.jpg' files found in ``output_path``.

    Raises
    ------
    subprocess.CalledProcessError
        If ``pdfimages`` exits with a non-zero status.
    """
    # We convert everything to jpeg, which may be suboptimal, however some
    # formats recognized by pdfimages aren't understood by opencv.
    #
    # check=True: without it a failed extraction went unnoticed and simply
    # produced an empty list of pages.
    subprocess.run(['pdfimages', '-j', pdf_path, output_path], check=True)
    return sorted(
        os.path.join(output_path, name)
        for name in os.listdir(output_path)
        if name.endswith('.jpg')
    )
def write_pdf_status(pdf_id, status, message):
    """Store a status and a message on the PDF entry with ID `pdf_id`.

    The update runs in a database session of its own.
    """
    with orm.db_session:
        record = PDF[pdf_id]
        record.message = message
        record.status = status
@contextlib.contextmanager
def make_temp_directory():
    """Yield the path of a new temporary directory; remove it on exit.

    The yielded path always ends with '/'.  The directory and its
    contents are deleted when the ``with`` block is left, also when it
    is left through an exception.
    """
    created = tempfile.mkdtemp()
    path = created if created.endswith('/') else created + '/'
    try:
        yield path
    finally:
        shutil.rmtree(path)
def extract_qr(image_path, yaml_version, scale_factor=4):
    """Find and decode the QR code on a scanned page.

    The image is read in grayscale, subsampled by `scale_factor`, and
    transposed when it is wider than it is high.  It is then binarised
    at several thresholds and scanned in all four flip combinations
    until zbar reports a result.

    Parameters
    ----------
    image_path : str
        Path of the page image.
    yaml_version
        Version from the exam config; the QR code must carry
        'v<yaml_version>'.
    scale_factor : int
        Subsampling step applied to both image axes before scanning.

    Returns
    -------
    ExtractedQR or None
        None when no code is found, or when the payload of the first
        code found cannot be decoded into four ';'-separated fields.
        The coordinates are mapped back through the flip and multiplied
        by `scale_factor` (presumably pixel positions in the full-size
        image -- confirm).

    Raises
    ------
    RuntimeError
        If the version in the QR code differs from `yaml_version`.
    """
    full_size = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    image = full_size[::scale_factor, ::scale_factor]
    if image.shape[0] < image.shape[1]:
        image = image.T

    # Varied thresholds because zbar is picky about contrast.
    for threshold in (200, 150, 220):
        thresholded = 255 * (image > threshold)
        # zbar also cares about orientation.
        for direction in itertools.product([1, -1], [1, -1]):
            row_step, col_step = direction
            flipped = thresholded[::row_step, ::col_step]
            results = zbar.Scanner().scan(flipped.astype(np.uint8))
            if not results:
                continue

            symbol = results[0]
            try:
                version, name, page, copy = symbol.data.decode().split(';')
            except ValueError:
                # Payload is not text, or not made of exactly four fields.
                return None
            if version != f'v{yaml_version}':
                raise RuntimeError('Yaml format mismatch')

            coords = np.array(symbol.position)
            # zbar doesn't respect array ordering!
            if not np.isfortran(flipped):
                coords = coords[:, ::-1]
            # Undo the flip: negate along flipped axes, then wrap the
            # negative values back into the image extent.
            coords *= direction
            coords %= image.shape
            coords *= scale_factor
            return ExtractedQR(name, int(page), int(copy), coords)

    return None
def guess_dpi(image_array):
    """Return the standard resolution closest to that of a scanned page.

    The resolution is estimated from the first dimension of
    `image_array` under the assumption that it spans 297 mm (the height
    of an A4 sheet), and then snapped to the nearest entry of a fixed
    list of common DPI values.
    """
    height, *_ = image_array.shape
    candidates = np.array([1200, 600, 300, 200, 150, 120, 100, 75, 60, 50, 40])
    # 25.4 mm per inch: pixels per inch if `height` pixels cover 297 mm.
    estimate = 25.4 * height / 297
    closest = np.abs(candidates - estimate).argmin()
    return candidates[closest]
def rotate_and_shift(image_path, extracted_qr, qr_coords):
    """Align a page image with the expected QR code position, in place.

    The image is transposed when it is wider than it is high, mirrored
    along an axis when the QR code was found in the wrong half along
    that axis, and finally rolled (pixels wrap around the edges) so that
    the centre of the detected code coincides with the expected one.
    The result overwrites the file at `image_path`.

    Parameters
    ----------
    image_path : str
        Path of the image; read and then overwritten.
    extracted_qr : ExtractedQR
        Only the page number and the coordinates are used.
    qr_coords
        Table with a ``page`` column and ``top``/``bottom``/``left``/
        ``right`` columns.  The values are multiplied by the guessed DPI
        (presumably inches) and the vertical ones are subtracted from
        the image height (presumably measured from the bottom edge) --
        confirm against the exam YAML.
    """
    _, page, _, position = extracted_qr
    image = cv2.imread(image_path)

    if image.shape[0] < image.shape[1]:
        image = image.transpose(1, 0, 2)

    height, width, *_ = image.shape
    dpi = guess_dpi(image)

    # Expected centre of the QR code, in pixels.
    on_this_page = qr_coords[qr_coords.page == page]
    edges = on_this_page[['top', 'bottom', 'left', 'right']].values[0]
    box = dpi * edges
    expected_y = height - np.mean(box[:2])
    expected_x = np.mean(box[2:])

    # Centre of the QR code as it was detected.
    found_y, found_x = np.mean(position, axis=0)

    half_width = width / 2
    half_height = height / 2
    if (found_x > half_width) != (expected_x > half_width):
        image = image[:, ::-1]
        found_x = width - found_x
    if (found_y > half_height) != (expected_y > half_height):
        image = image[::-1]
        found_y = height - found_y

    offset = (expected_y - found_y, expected_x - found_x)
    shift = np.round(offset).astype(int)
    shifted_image = np.roll(
        np.roll(image, shift[0], axis=0), shift[1], axis=1)

    cv2.imwrite(image_path, shifted_image)
This diff is collapsed.
Click to expand it.
zesje/resources/pdfs.py
+
8
−
2
View file @
0b68654a
import
os
from
multiprocessing
import
Process
from
flask
import
abort
,
current_app
as
app
from
flask_restful
import
Resource
,
reqparse
...
...
@@ -7,7 +8,7 @@ from werkzeug.datastructures import FileStorage
from
pony
import
orm
from
..models
import
db
,
Exam
,
PDF
from
..helpers
import
pdf_helper
class
Pdfs
(
Resource
):
"""
Getting a list of uploaded PDFs, and uploading new ones.
"""
...
...
@@ -63,7 +64,6 @@ class Pdfs(Resource):
pdf
=
PDF
(
exam
=
Exam
[
exam_id
],
name
=
args
[
'
pdf
'
].
filename
,
status
=
'
processing
'
,
message
=
'
importing PDF
'
)
# TODO fire off subprocess
with
orm
.
db_session
:
try
:
path
=
os
.
path
.
join
(
app
.
config
[
'
PDF_DIRECTORY
'
],
f
'
{
pdf
.
id
}
.pdf
'
)
...
...
@@ -72,6 +72,12 @@ class Pdfs(Resource):
pdf
.
delete
()
raise
# Fire off background process
# TODO: save these into a process-local datastructure, or save
# it into the DB as well so that we can cull 'processing' tasks
# that are actually dead.
Process
(
target
=
pdf_helper
.
process_pdf
,
args
=
(
pdf
.
id
,)).
start
()
return
{
'
id
'
:
pdf
.
id
,
'
name
'
:
pdf
.
name
,
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment