finish course and employee data cleanup

parent 85426489
.ipynb_checkpoints
# Ask Anton about getting this dataset.
employee_data.xlsx
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from IPython.display import JSON"
]
},
{
"cell_type": "code",
"execution_count": null,
......@@ -7,8 +16,11 @@
"outputs": [],
"source": [
"from collections import defaultdict, Counter\n",
"from itertools import chain\n",
"import json\n",
"from math import isnan\n",
"\n",
"import pandas\n",
"import requests\n",
"\n",
"def recursive_apply(data, f, condition):\n",
......@@ -197,13 +209,35 @@
" json.dump(full_course_data, f)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Postprocessing"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sorted(list(Counter(len(i) for i in teaching.values()).items()))"
"def normalize_instructors(instructors):\n",
" if instructors is None:\n",
" instructors = []\n",
"\n",
" result = defaultdict(list)\n",
" for entry in instructors:\n",
" if entry['@taal'] != 'EN':\n",
" continue\n",
" \n",
" employees = entry['medewerker']\n",
" if not isinstance(employees, list):\n",
" employees = [employees]\n",
" \n",
" result[entry['@label']] = employees\n",
"\n",
" return result\n"
]
},
{
......@@ -212,10 +246,91 @@
"metadata": {},
"outputs": [],
"source": [
"emails = []\n",
"with open('detailed_data.json') as f:\n",
" full_course_data = json.load(f)\n",
"\n",
"data = list(full_course_data.values())\n",
"\n",
"data = [i for i in data if i is not None]\n",
"\n",
"data = [i['vak'] for i in data]\n",
"\n",
"programs = {}\n",
"instructors = {}\n",
"\n",
"for i in data:\n",
" program = i.pop('opleiding')\n",
" code = program.pop('code')\n",
" program.update(program.pop('faculteit'))\n",
" programs[code] = program\n",
" program['type'] = program.pop('opleidingstype')['naamEN']\n",
" program['faculty'] = program.pop('organisatieOnderdeel')['afkortingEN']\n",
" i['program'] = code\n",
" \n",
" i['course'] = i.pop('cursusid')\n",
" i['ects'] = float(i['ects'])\n",
" i['studiejaar'] = i['studiejaar']['naam']\n",
" i['name_EN'] = i.pop('kortenaamEN')\n",
" i['name_NL'] = i.pop('kortenaamNL')\n",
" assert i['name_EN'] == i['langenaamEN'] and i['name_NL'] == i['langenaamNL']\n",
" del i['langenaamEN'], i['langenaamNL']\n",
"\n",
" default_extra = {'vakMedewerkers': [], 'vakUnsupportedInfoVelden': []}\n",
" extra = i.pop('extraUnsupportedInfo')\n",
" if extra is None:\n",
" extra = default_extra\n",
"\n",
" instructors = normalize_instructors(extra.get('vakMedewerkers'))\n",
" i['Instructor'] = sum((instructors[role] for role in ('Instructor', 'Co-Instructor', 'Module Manager')), [])\n",
" i['Responsible Instructor'] = sum((instructors[role] for role in ('Responsible Instructor', 'Course Coordinator')), [])\n",
" i['Responsible for assignments'] = sum((instructors[role] for role in ('Responsible for assignments', 'Co-responsible for assignments')), [])\n",
" i['original_instructors'] = instructors\n",
"\n",
" extra = extra.get('vakUnsupportedInfoVelden', [])\n",
" for entry in extra:\n",
" i[f'{entry[\"@label\"]} ({entry[\"@taal\"]})'] = entry['inhoud']\n",
"\n",
"\n",
"df = pandas.DataFrame(data)\n",
"df['English'] = df['Course Language (EN)'].map(lambda x: not isinstance(x, float) and 'English' in x)\n",
"df['Dutch'] = df['Course Language (EN)'].map(lambda x: not isinstance(x, float) and 'Dutch' in x)\n",
"\n",
"normalized_period = df['Education Period (EN)'].map(lambda x: x if isinstance(x, list) else [x])\n",
"\n",
"periods = '1A 1B 2A 2B 3A 3B 4A 4B'.split()\n",
"for period in periods:\n",
" df[f'period {period}'] = normalized_period.map(lambda x: period in x or period[0] in x)\n",
"\n",
"df['Collegerama'] = df['Collegerama (EN)'].map((lambda x: x == \"Yes\" or \"Yes\" in x), na_action='ignore')\n",
"df.index = df['course']\n",
"\n",
"for column in (\n",
" 'Collegerama (EN)', 'Collegerama (NL)', 'Onderwijsperiode (NL)', 'Education Period (EN)', 'Cursustaal (NL)', 'Course Language (EN)', 'Tentamenperiode (NL)',\n",
" 'Test and result scale (EN)', 'course',\n",
"):\n",
" del df[column]\n",
"\n",
"# Delete infrequently used columns.\n",
"frequencies = {col: len(df[col].dropna()) for col in df}\n",
"\n",
"for data in all_data:\n",
" emails += list(recursive_apply(data, (lambda x: x['emailAdresTU']), (lambda x: hasattr(x, '__getitem__') and 'emailAdresTU' in x)))"
"for column, frequency in frequencies.items():\n",
" if frequency < 100:\n",
" del df[column]\n",
"\n",
"# Email → name\n",
"instructors = dict(chain.from_iterable(\n",
" df.original_instructors\n",
" .map(lambda instructors: [\n",
" (i['email'], i['naam']) for j in instructors.values() for i in j\n",
" ])\n",
"))\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Reading the employee data"
]
},
{
......@@ -224,8 +339,55 @@
"metadata": {},
"outputs": [],
"source": [
"len(set(filter((lambda email: '@tudelft.nl' in email), emails)))"
"employee_data = pandas.read_excel('employee_data.xlsx')\n",
"\n",
"for column in employee_data:\n",
" if len(employee_data[column].dropna()) < 100:\n",
" del employee_data[column]\n",
"\n",
"for column in 'ID 27 23 24'.split():\n",
" del employee_data[column]\n",
"\n",
"employee_data = employee_data.rename(columns={\n",
" '1': 'first_name',\n",
" '2': 'last_name',\n",
" '4': 'unit',\n",
" '5': 'section',\n",
" '6': 'office',\n",
" '13': 'phone',\n",
" '22': 'full_name',\n",
" '25': 'email_prefix',\n",
"})\n",
"employee_data = employee_data[employee_data.unit.notna()]\n",
"employee_data = employee_data[employee_data.unit.str.startswith('TUDelft')]\n",
"employee_data.unit = employee_data.unit.str.replace('TUDelft-', '')\n",
"employee_data.email_prefix = employee_data.email_prefix.str.lower()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"employee_data[employee_data.unit.str.startswith('TNW')].unit.unique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"employee_data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment