Skip to content
Snippets Groups Projects
Commit a550645b authored by Viacheslav Ostroukh's avatar Viacheslav Ostroukh :bike:
Browse files

release v0.0.2

    Changes:
    ========

    - Allows launching kernels locally, if None is specified as the
      parameter list for `salloc` in the Jupyter config. This behaviour
      must be enabled in the Jupyter config with the parameter:
        c.SlurmKernelManager.default_kernel_local = True

    - Allows falling back to a local kernel, if the SLURM shell has
      failed to spawn. This behaviour must be enabled in the Jupyter
      config with the parameter:
        c.SlurmKernelManager.fallback_kernel_local = True

    - (Hopefully) better stack traces, if something goes wrong.
parents 9e0e438b a3d34fb6
Branches develop master
Tags v0.0.2
No related merge requests found
!!! WIP !!!
Custom KernelSpecManager for Jupyter for launching kernels remotely, using
[SLURM workload manager](https://slurm.schedmd.com/).
For each kernel found it will modify a command to launch kernel with SLURM.
Options for launching are configurable via the Jupyter config file.
version_info = (0, 0, 1)
version_info = (0, 0, 2)
__version__ = '.'.join(map(str, version_info))
import os
import pexpect
import random
import re
import signal
import socket
import sys
import tempfile
from jupyter_client.connect import port_names, write_connection_file
from jupyter_client.ioloop.manager import IOLoopKernelManager
from jupyter_client.localinterfaces import is_local_ip
from pexpect import EOF, TIMEOUT
from pexpect.popen_spawn import PopenSpawn
from six import raise_from
from socket import gethostbyname
from subprocess import list2cmdline, check_output
from traitlets import Unicode, Instance, Integer, Dict, List
from traitlets import Bool, Unicode, Instance, Integer, Dict, List
from .kernelspecmanager import SlurmKernelSpecManager
class SlurmAllocationError(Exception):
    """Raised when a SLURM allocation for a kernel cannot be obtained,
    e.g. `salloc` exits without opening a shell or times out.
    """
    pass
......@@ -90,6 +89,22 @@ class SlurmKernelManager(IOLoopKernelManager):
allow_none=False
)
# Config toggle: when a kernelspec provides no `salloc` argument list
# (None), launch the kernel locally instead of via SLURM with defaults.
default_kernel_local = Bool(
default_value=False,
help='Whether to default to launching a kernel locally or to launch '
'it via SLURM with default arguments.',
config=True,
allow_none=False
)
# Config toggle: when the SLURM allocation fails, fall back to a
# locally launched kernel instead of raising an error.
fallback_kernel_local = Bool(
default_value=False,
help='Whether to fallback to launching a kernel locally, if SLURM '
'allocation is not succeeded.',
config=True,
allow_none=False
)
default_kernel_job_name = Unicode(
default_value='jupyter_kernel',
help='Default name for a kernel SLURM job. Can be overridden in '
......@@ -114,6 +129,7 @@ class SlurmKernelManager(IOLoopKernelManager):
def __init__(self, **kwargs):
super(SlurmKernelManager, self).__init__(**kwargs)
self.slurm_jobid = None
self._ip_default = self.ip
def start_kernel(self, **kw):
"""Starts a kernel on a remote host, using 'salloc' and 'srun' to
......@@ -144,11 +160,27 @@ class SlurmKernelManager(IOLoopKernelManager):
self.slurm_jobid = None
# Launch a job and write a proper connection file for the kernel
shell, nodename, jobid = self._launch_slurm_shell(env)
launch_local = False
try:
shell, nodename, jobid = self._launch_slurm_shell(env)
if shell is None:
launch_local = True
except SlurmAllocationError:
if self.fallback_kernel_local:
shell, nodename, jobid = None, None, None
launch_local = True
else:
raise RuntimeError("Could not get SLURM allocation for the"
" kernel.")
if launch_local:
if not is_local_ip(self.ip):
self.ip = self._ip_default
return super(SlurmKernelManager, self).start_kernel(env=env, **kw)
self.slurm_jobid = jobid
self.ip = self._ip_by_nodename(nodename)
self.write_connection_file()
self.write_connection_file(remote=True)
# save kwargs for use in restart
self._launch_args = kw.copy()
......@@ -159,7 +191,7 @@ class SlurmKernelManager(IOLoopKernelManager):
kernel_cmdline = list2cmdline(kernel_cmd)
# launch the kernel subprocess
self.log.debug("Starting kernel: %s", kernel_cmd)
self.log.debug("Starting kernel: %s" % kernel_cmd)
self.log.debug("Kernel command: %s" % kernel_cmdline)
shell.sendline(kernel_cmdline)
self.kernel = shell
......@@ -215,14 +247,23 @@ class SlurmKernelManager(IOLoopKernelManager):
raise SlurmAllocationError("SLURM metadata for the kernel"
" not found.")
salloc_args = metadata.get(self._metadata_salloc_args, None)
self.log.debug("salloc_args: %s" % salloc_args)
if salloc_args is None:
if self.default_kernel_local:
return None, None, None
else:
salloc_args = []
# allocating a SLURM job with a shell and getting allocated node
# noinspection PyTypeChecker
cmd = ([self.salloc_command,
# Ensure that we have at least one CPU, can be overridden
'--cpus-per-task=1',
# Set job name, can be overridden
'--job-name=%s' % self.default_kernel_job_name] +
# output information with a node name in salloc output
metadata[self._metadata_salloc_args] +
salloc_args +
# Ensure that we have only one node allocated
['--nodes=1',
# Double verbosity even if user tries to do otherwise
......@@ -245,16 +286,38 @@ class SlurmKernelManager(IOLoopKernelManager):
# We can send the kernel launch command afterwards
node = job.match.groups()[0]
return job, str(node.decode()), int(jobid.strip())
except (pexpect.EOF, pexpect.TIMEOUT) as e:
_, _, tb = sys.exc_info()
except EOF:
if job:
job.kill(1)
raise SlurmAllocationError(
"`salloc` has exited without opening a shell.\nPlease check"
" whether you have provided correct parameters in "
"SlurmKernelSpecManager configuration.")
except TIMEOUT:
if job:
job.kill(1)
raise_from(SlurmAllocationError(e), tb)
raise SlurmAllocationError(
"Timeout while waiting for allocation of a job via `salloc`.\n"
"May be target partition is full?"
)
def write_connection_file(self):
def write_connection_file(self, remote=False):
"""Write connection info to JSON dict and assign
self.connection_file.
Parameters
----------
`remote` : boolean
If False, parent's method is called (local kernel and ports are
assigned via `socket.bind`).
If True, kernel is treated as remote one, and custom procedures
are used to generate a valid connection file.
"""
# Default is parent's realization, otherwise it will break calling
# parent's methods.
if not remote:
return super(SlurmKernelManager, self).write_connection_file()
if (self._connection_file_written and
os.path.exists(self.connection_file)):
return
......@@ -330,44 +393,54 @@ class SlurmKernelManager(IOLoopKernelManager):
def _kill_kernel(self):
    """Kill the running kernel.

    For a SLURM-backed kernel (``self.slurm_jobid`` is set) the pexpect
    shell holding the allocation is killed, which also releases the
    SLURM job.  For a locally launched kernel the parent's
    implementation is used.

    Raises
    ------
    RuntimeError
        If no kernel is currently running.
    """
    if self.has_kernel:
        if self.slurm_jobid:
            # Killing the salloc shell tears down the allocation too.
            self.kernel.kill(1)
            self.kernel.wait()
            self.kernel = None
            self.slurm_jobid = None
        else:
            # Kernel is local, calling parent's method
            super(SlurmKernelManager, self)._kill_kernel()
    else:
        raise RuntimeError("Cannot kill kernel. No kernel is running!")
def signal_kernel(self, signum):
    """Sends a signal to the shell, where the kernel is being executed.

    For a SLURM-backed kernel the signal is delivered to the remote
    process group via ``scancel --signal``; for a local kernel the
    parent's implementation is used.

    Parameters
    ----------
    `signum` : signal.Signals or int
        Signal to deliver to the kernel.

    Raises
    ------
    RuntimeError
        If no kernel is currently running.
    """
    if self.has_kernel:
        if self.slurm_jobid:
            try:
                # scancel accepts symbolic names without the SIG prefix.
                sig = re.match(r"SIG(\w+)", signum.name).groups()[0]
            except AttributeError:
                # Plain integers have no `.name`; pass the number itself.
                # (Original used str(signal), stringifying the module.)
                sig = str(signum)
            # Signaling kernel process group using scancel
            check_output([self.scancel_command,
                          '--signal=%s' % sig,
                          str(self.slurm_jobid)])
        else:
            # Kernel is local, calling parent's method
            super(SlurmKernelManager, self).signal_kernel(signum=signum)
    else:
        raise RuntimeError("Cannot signal kernel. No kernel is running!")
def is_alive(self):
    """Kernel is considered as alive, if its job exists and its state is
    RUNNING, or it is launched locally and parent tells so.

    Returns
    -------
    bool
        True when the SLURM job reports ``JobState=RUNNING`` (remote
        kernel), or when the parent reports the local kernel alive.
    """
    if self.slurm_jobid:
        # Query SLURM directly; `scontrol show job` prints `JobState=...`.
        out = PopenSpawn([self.scontrol_command,
                          "show", "job", str(self.slurm_jobid)],
                         timeout=self.slurm_timeout)
        try:
            out.expect(r"JobState=(\w+)")
            state = out.match.groups()[0]
            return str(state.decode()) == 'RUNNING'
        except EOF:
            # No JobState line in the output -- job no longer exists.
            return False
    else:
        # Kernel is local, delegate the liveness check to the parent.
        return super(SlurmKernelManager, self).is_alive()
# Backwards compatibility
# At the moment of writing functions below are not landed to the stable
......
......@@ -18,13 +18,14 @@ class SlurmKernelSpecManager(KernelSpecManager):
"""
profiles = Dict(
default_value={'default': []},
default_value={'default': None},
help=(
'Dictionary of profiles.'
'Keys are names, that are appended to the original kernel name.\n'
'They should be alphanumeric (no spaces).\n'
'Values are list of additional arguments, that are passed '
'to the `salloc` command.\n'
'to the `salloc` command, or None.\n'
'None invokes default behaviour of `SlurmKernelManager`.\n'
'See `man salloc` for more information.'
),
config=True,
......
......@@ -70,6 +70,14 @@ c.SlurmKernelManager.connection_files_dir = '/path/to/shared/folder/.connection_
# SLURM processes.
#c.SlurmKernelManager.slurm_timeout = 10
## Whether to default to launching a kernel locally or to launch it via SLURM
# with default arguments.
#c.SlurmKernelManager.default_kernel_local = False
## Whether to fall back to launching a kernel locally, if the SLURM
# allocation did not succeed.
#c.SlurmKernelManager.fallback_kernel_local = False
## Default name for a kernel SLURM job. Can be overridden in
# SlurmKernelSpecManager profile, passing argument `--job-name` to salloc.
#c.SlurmKernelManager.default_kernel_job_name = 'jupyter_kernel'
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment