From a9754f46bc7f422eb1d3bf9d2d62b2f68afede61 Mon Sep 17 00:00:00 2001
From: Anton Akhmerov <anton.akhmerov@gmail.com>
Date: Mon, 26 Mar 2018 02:18:28 +0200
Subject: [PATCH] implement exam summary statistics image

---
 zesje/api.py                    |   6 ++
 zesje/helpers/db_helper.py      |  73 ++++++++++++++++++++-
 zesje/resources/summary_plot.py | 110 ++++++++++++++++++++++++++++++++
 3 files changed, 187 insertions(+), 2 deletions(-)
 create mode 100644 zesje/resources/summary_plot.py

diff --git a/zesje/api.py b/zesje/api.py
index 84b630313..a9e8ffc93 100644
--- a/zesje/api.py
+++ b/zesje/api.py
@@ -8,6 +8,7 @@ from .resources.students import Students
 from .resources.submissions import Submissions
 from .resources import signature
 from .resources import images
+from .resources import summary_plot
 from .resources.problems import Problems
 from .resources.feedback import Feedback
 
@@ -47,3 +48,8 @@ api_bp.add_url_rule(
     'solution',
     images.get,
 )
+api_bp.add_url_rule(  # serves the exam summary statistics plot
+    rule='/images/summary/<int:exam_id>',
+    endpoint='exam_summary',
+    view_func=summary_plot.get,
+)
diff --git a/zesje/helpers/db_helper.py b/zesje/helpers/db_helper.py
index 30b970ac4..d22e59506 100644
--- a/zesje/helpers/db_helper.py
+++ b/zesje/helpers/db_helper.py
@@ -1,4 +1,8 @@
-from ..models import Problem
+import pandas
+from pony import orm
+from collections import namedtuple, OrderedDict, ChainMap
+
+from ..models import Exam, Problem, Student, Solution
 from . import yaml_helper
 
 def update_exam(exam, existing_yaml, new_yaml):
@@ -16,8 +20,73 @@ def update_exam(exam, existing_yaml, new_yaml):
     new_problem_names = list(name for name in new_widgets.index
                              if name != 'studentnr')
 
-    
     problems = list(Problem.select(lambda p: p.exam == exam)
                            .order_by(lambda p: p.id))
     for problem, name in zip(problems, new_problem_names):
         problem.name = name
+
+
+def solution_data(exam_id, student_id):
+    """Return a student's data and per-problem results for one exam.
+
+    Raises RuntimeError if the exam or the student does not exist.
+    """
+    with orm.db_session:
+        try:
+            exam, student = Exam[exam_id], Student[student_id]
+        except orm.ObjectNotFound as err:
+            # Entity[pk] raises on a missing row; it never returns None.
+            raise RuntimeError('Student did not make a '
+                               'submission for this exam') from err
+
+        results = []
+        for problem in exam.problems.order_by(Problem.id):
+            if not orm.count(problem.solutions.feedback):
+                continue  # Nobody received any grade for this problem
+            solutions = Solution.select(lambda s: s.problem == problem
+                                        and s.submission.student == student)
+            feedback = [{'short': fo.text, 'score': fo.score,
+                         'description': fo.description}
+                        for solution in solutions for fo in solution.feedback]
+            results.append({
+                'name': problem.name,
+                'max_score': orm.max(problem.feedback_options.score, default=0),
+                'feedback': feedback,
+                # A feedback option without a score counts as zero.
+                'score': sum(i['score'] or 0 for i in feedback),
+                'remarks': '\n\n'.join(sol.remarks for sol in solutions
+                                       if sol.remarks),
+            })
+
+        # Convert to a plain dict; 'total' is added to it below.
+        student = student.to_dict()
+
+    student['total'] = sum(i['score'] for i in results)
+    return student, results
+
+
+def full_exam_data(exam_id):
+    """Compute all grades of an exam as a pandas DataFrame.
+
+    Rows are student ids; columns are (problem name, field) pairs plus
+    ('total', 'total') holding the total score of each student.
+    """
+    with orm.db_session:
+        # A student with several submissions must be processed only once.
+        students = sorted(set(Exam[exam_id].submissions.student.id))
+        data = [solution_data(exam_id, student_id)
+                for student_id in students]
+
+    results = {}
+    for student, problems in data:
+        for problem in problems:
+            name = problem.pop('name')
+            problem[(name, 'remarks')] = problem.pop('remarks')
+            for fo in problem.pop('feedback'):
+                problem[(name, fo['short'])] = fo['score']
+            problem[(name, 'total')] = problem.pop('score')
+            problem.pop('max_score')
+        results[student['id']] = dict(ChainMap(
+            {('total', 'total'): student['total']}, *problems))
+
+    return pandas.DataFrame(results).T
diff --git a/zesje/resources/summary_plot.py b/zesje/resources/summary_plot.py
new file mode 100644
index 000000000..b2c4f940d
--- /dev/null
+++ b/zesje/resources/summary_plot.py
@@ -0,0 +1,110 @@
+import os
+from io import BytesIO
+from flask import abort, Response, current_app as app
+from pony import orm
+
+import pandas
+import numpy as np
+
+from ..models import Exam, Submission
+from ..helpers.db_helper import full_exam_data
+
+import matplotlib
+matplotlib.use('agg')
+import seaborn
+from matplotlib import pyplot
+
+@orm.db_session
+def get(exam_id):
+    """Plot exam summary statistics.
+
+    Parameters
+    ----------
+    exam_id : int
+
+    Returns
+    -------
+    Image (PNG mimetype); 404 if the exam is unknown or has no scores.
+    """
+    try:
+        exam = Exam[exam_id]
+    except (KeyError, orm.ObjectNotFound):  # Pony raises ObjectNotFound
+        abort(404)
+
+    scores = {problem.name: max(list(problem.feedback_options.score) + [0])
+              for problem in exam.problems}
+    scores['total'] = sum(scores.values())
+
+    full_scores = full_exam_data(exam_id)
+    if full_scores.empty:
+        abort(404)  # no submissions, nothing to plot
+    # Full exam data has multilevel columns (includes detailed feedback), we
+    # flatten them out first.
+    problem_scores = full_scores.iloc[
+        :, full_scores.columns.get_level_values(1) == 'total'
+    ].astype(float)
+    problem_scores.columns = problem_scores.columns.get_level_values(0)
+    # Exclude empty columns from statistics
+    problem_scores = problem_scores.loc[:, ~(problem_scores == 0).all()]
+    if 'total' not in problem_scores:
+        abort(404)  # nobody scored any points, nothing to plot
+    # Fix the column order (problems, then 'total') to match the tick labels.
+    problems = [name for name in problem_scores if name != 'total']
+    problem_scores = problem_scores[problems + ['total']]
+
+    seaborn.set()
+    seaborn.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})
+
+    cm = matplotlib.cm.magma
+    # define the bins and normalize
+    bounds = np.linspace(0, 1, 21)
+    norm = matplotlib.colors.BoundaryNorm(bounds, cm.N)
+
+    # Normalize by the rubric maximum, or by the best score if that is higher.
+    maxes = {name: max(problem_scores[name].max(), scores.get(name, 0))
+             for name in problem_scores}
+
+    # Rir: correlation of a problem with the total of all other problems.
+    total = problem_scores['total']
+    corrs = {name: round(column.corr(total - column), 2)
+             for name, column in problem_scores[problems].items()}
+
+    # Cronbach's alpha; its prefactor counts problems, not students.
+    k = len(problems)
+    item_var = problem_scores[problems].var().sum()
+    alpha = k / (k - 1) * (1 - item_var / total.var()) if k > 1 else np.nan
+
+    # One (y, width, left, color value) tuple per distinct score of a column.
+    bars = []
+    for row, name in enumerate(problem_scores):
+        cumulative = (problem_scores[name].value_counts(normalize=True)
+                      .sort_index().cumsum())
+        uppers = list(cumulative.values)
+        bars += [(-row, upper - lower, lower, num / maxes[name])
+                 for num, upper, lower
+                 in zip(cumulative.index, uppers, [0] + uppers[:-1])]
+    data = np.array(bars).T
+
+    fig = pyplot.figure(figsize=(12, 9))
+    try:
+        ax = fig.add_subplot(1, 1, 1)
+        ax.barh(data[0], data[1], 0.5, data[2],
+                color=cm(norm(data[3])), align='center')
+        ax.set_yticks(np.arange(0, -len(problem_scores.columns), -1))
+        ax.set_yticklabels(
+            [f'{i} ($Rir={corrs[i]:.2f}$)' for i in problems]
+            + [f'total: ($\\alpha = {alpha:.2f}$)']
+        )
+        ax.set_xlabel('fraction of students')
+        ax.set_xlim(-0.025, 1.025)
+        sm = matplotlib.cm.ScalarMappable(cmap=cm, norm=norm)
+        sm.set_array([])
+        colorbar = fig.colorbar(sm, ax=ax)
+        colorbar.set_ticks(np.linspace(0, 1, 11))
+        colorbar.set_label('score percentage')
+        fig.tight_layout()
+        image = BytesIO()
+        fig.savefig(image, format='png')
+    finally:
+        pyplot.close(fig)  # pyplot keeps every figure alive until closed
+    return Response(image.getvalue(), 200, mimetype='image/png')
-- 
GitLab