Skip to content

Commit

Permalink
container: add custom annotation to specify the scheduler
Browse files Browse the repository at this point in the history
This commit adds a new feature to the container runtime that allows
users to set the scheduling policy of the container process.

The new feature is implemented as a custom annotation.  To set the
scheduling policy and priority, users can add a `run.oci.scheduler`
annotation with a value in the format POLICY[|OPTION][#PRIORITY].

If no scheduling policy or priority is specified, the container process
will use the current scheduling policy and priority.

Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
  • Loading branch information
giuseppe committed Mar 14, 2023
1 parent 5bdd930 commit 6ba6a00
Show file tree
Hide file tree
Showing 6 changed files with 135 additions and 6 deletions.
19 changes: 17 additions & 2 deletions crun.1
Original file line number Diff line number Diff line change
Expand Up @@ -685,6 +685,21 @@ wasm module is relayed back via crun.

.RE

.SH \fB\fCrun.oci.scheduler\fR
.PP
The \fB\fCrun.oci.scheduler\fR annotation allows you to set the scheduling
policy for the container process. The value of the annotation should
be in the format \fB\fCPOLICY[|OPTION][#PRIORITY]\fR, where \fB\fCPOLICY\fR is the
name of the scheduling policy, \fB\fCOPTION\fR can be \fB\fCSCHED_RESET_ON_FORK\fR
and \fB\fCPRIORITY\fR is an optional integer priority value.

.PP
It is an experimental feature and will be removed once the feature is in the
OCI runtime specs.

.PP
Please refer to \fB\fCsched_setscheduler(2)\fR for more information.

.SH tmpcopyup mount options
.PP
If the \fB\fCtmpcopyup\fR option is specified for a tmpfs, then the path that
Expand Down Expand Up @@ -814,8 +829,8 @@ For example, the mapping: \fB\fCuids=@1-3-10\fR, given a configuration like

.PP
will be converted to the absolute value \fB\fCuids=1-4-10\fR, where 4 is
calculated by adding 3 (container ID in the \fB\fCuids=\fR mapping)
+ 1 (\fB\fChostID - containerID\fR for the user namespace mapping where
calculated by adding 3 (container ID in the \fB\fCuids=\fR mapping) and 1
(\fB\fChostID - containerID\fR for the user namespace mapping where
\fB\fCcontainerID = 1\fR is found).

.PP
Expand Down
13 changes: 13 additions & 0 deletions crun.1.md
Original file line number Diff line number Diff line change
Expand Up @@ -541,6 +541,19 @@ workload natively. Accepts a `.wasm` binary as input and if `.wat` is
provided it will be automatically compiled into a wasm module. Stdout of
wasm module is relayed back via crun.

## `run.oci.scheduler`

The `run.oci.scheduler` annotation allows you to set the scheduling
policy for the container process. The value of the annotation should
be in the format `POLICY[|OPTION][#PRIORITY]`, where `POLICY` is the
name of the scheduling policy, `OPTION` can be `SCHED_RESET_ON_FORK`
and `PRIORITY` is an optional integer priority value.

It is an experimental feature and will be removed once the feature is in the
OCI runtime specs.

Please refer to `sched_setscheduler(2)` for more information.

## tmpcopyup mount options

If the `tmpcopyup` option is specified for a tmpfs, then the path that
Expand Down
1 change: 0 additions & 1 deletion src/libcrun/cgroup-systemd.c
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,6 @@ setup_rt_runtime (runtime_spec_schema_config_linux_resources *resources,
return 0;
}


static int
systemd_finalize (struct libcrun_cgroup_args *args, char **path_out,
int cgroup_mode, const char *suffix, libcrun_error_t *err)
Expand Down
4 changes: 4 additions & 0 deletions src/libcrun/container.c
Original file line number Diff line number Diff line change
Expand Up @@ -2376,6 +2376,10 @@ libcrun_container_run_internal (libcrun_container_t *container, libcrun_context_
if (UNLIKELY (ret < 0))
goto fail;

ret = libcrun_set_scheduler (pid, container, err);
if (UNLIKELY (ret < 0))
return ret;

/* The container is waiting for us to write back. In this phase we can launch the
prestart hooks. */
if (def->hooks && def->hooks->prestart_len)
Expand Down
102 changes: 99 additions & 3 deletions src/libcrun/linux.c
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@
#include <sys/xattr.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <sched.h>

#include <yajl/yajl_tree.h>
#include <yajl/yajl_gen.h>
Expand Down Expand Up @@ -4650,7 +4651,8 @@ libcrun_run_linux_container (libcrun_container_t *container, container_entrypoin
}

static int
join_process_parent_helper (pid_t child_pid, int sync_socket_fd,
join_process_parent_helper (libcrun_container_t *container,
pid_t child_pid, int sync_socket_fd,
libcrun_container_status_t *status,
bool need_move_to_cgroup, const char *sub_cgroup,
int *terminal_fd, libcrun_error_t *err)
Expand Down Expand Up @@ -4700,6 +4702,11 @@ join_process_parent_helper (pid_t child_pid, int sync_socket_fd,
if (UNLIKELY (ret < 0))
return ret;
}

/* Set the scheduler immediately after joining the cgroup. */
ret = libcrun_set_scheduler (pid, container, err);
if (UNLIKELY (ret < 0))
return ret;
}

/* The write unblocks the grandchild process so it can run once we setup
Expand Down Expand Up @@ -4944,6 +4951,16 @@ libcrun_join_process (libcrun_container_t *container, pid_t pid_to_join,

pid = syscall_clone3 (&clone3_args);

if (pid > 0)
{
/* We need to set the scheduler as soon as possible after joining the cgroup,
because if it is a RT scheduler, other processes in the container could already
take the entire cpu time and stall the new process. */
ret = libcrun_set_scheduler (pid, container, err);
if (UNLIKELY (ret < 0))
return ret;
}

/* On errors, fall back to fork(). */
if (pid < 0)
{
Expand All @@ -4961,8 +4978,9 @@ libcrun_join_process (libcrun_container_t *container, pid_t pid_to_join,
{
close_and_reset (&sync_socket_fd[1]);
sync_fd = sync_socket_fd[0];
return join_process_parent_helper (pid, sync_fd, status, need_move_to_cgroup,
sub_cgroup, terminal_fd, err);
return join_process_parent_helper (container, pid, sync_fd, status,
need_move_to_cgroup, sub_cgroup,
terminal_fd, err);
}

close_and_reset (&sync_socket_fd[0]);
Expand Down Expand Up @@ -5270,3 +5288,81 @@ libcrun_kill_linux (libcrun_container_status_t *status, int signal, libcrun_erro
return crun_make_error (err, errno, "kill container");
return 0;
}

/* Apply the scheduler requested through the `run.oci.scheduler` annotation
   to the process PID.

   The annotation value has the form POLICY[|OPTION][#PRIORITY], for example
   `SCHED_FIFO|SCHED_RESET_ON_FORK#10`.  Tokens before the `#` are separated
   by `|` and must match one of the names in the table below; the text after
   the `#` must be a base-10 integer within the priority range reported for
   the selected policy.

   Return 0 when the annotation is not present or when sched_setscheduler
   succeeds.  On any parse or syscall failure, return the result of
   crun_make_error with ERR filled in (callers treat a negative value as a
   failure).  */
int
libcrun_set_scheduler (pid_t pid, libcrun_container_t *container, libcrun_error_t *err)
{
  /* An entry with a non-zero OPTION_VALUE is a flag that is OR-ed into the
     policy; every other entry names a scheduling policy.  */
  static const struct
  {
    const char *name;
    int value;
    int option_value;
  } policies[] = {
    { "SCHED_OTHER", SCHED_OTHER, 0 },
    { "SCHED_BATCH", SCHED_BATCH, 0 },
    { "SCHED_IDLE", SCHED_IDLE, 0 },
    { "SCHED_FIFO", SCHED_FIFO, 0 },
    { "SCHED_RR", SCHED_RR, 0 },
    { "SCHED_RESET_ON_FORK", 0, SCHED_RESET_ON_FORK },
    { NULL, 0, 0 },
  };
  cleanup_free char *copy = NULL;
  struct sched_param param;
  bool policy_set = false;
  int policy = 0;
  int option = 0;
  char *v_priority;
  const char *v;
  char *sptr;
  int ret;

  v = find_annotation (container, "run.oci.scheduler");
  if (LIKELY (v == NULL))
    return 0;

  memset (&param, 0, sizeof (param));

  /* Work on a private copy: both the `#` split and strtok_r write NUL bytes
     into the string.  */
  copy = xstrdup (v);
  v_priority = strchr (copy, '#');
  if (v_priority)
    *v_priority = '\0';

  for (v = strtok_r (copy, "|", &sptr); v; v = strtok_r (NULL, "|", &sptr))
    {
      size_t i;

      for (i = 0; policies[i].name; i++)
        if (strcmp (v, policies[i].name) == 0)
          break;

      if (UNLIKELY (policies[i].name == NULL))
        return crun_make_error (err, 0, "invalid scheduler `%s`", v);

      if (policies[i].option_value)
        {
          option |= policies[i].option_value;
          continue;
        }

      /* Policies are plain enumerated values, not bit flags: OR-ing two of
         them would silently select a third, unrelated policy.  Accept a
         repeated name but reject conflicting ones.  */
      if (UNLIKELY (policy_set && policy != policies[i].value))
        return crun_make_error (err, 0, "conflicting scheduler policy `%s`", v);

      policy = policies[i].value;
      policy_set = true;
    }

  if (v_priority)
    {
      const char *digits = v_priority + 1;
      long long priority;
      char *ep = NULL;
      int prio_min, prio_max;

      errno = 0;
      priority = strtoll (digits, &ep, 10);
      /* `ep == digits` means that no digit was consumed, e.g. `POLICY#`.  */
      if (UNLIKELY (ep == digits || *ep != '\0'))
        return crun_make_error (err, EINVAL, "parse scheduler annotation");
      if (UNLIKELY (errno))
        return crun_make_error (err, errno, "parse scheduler annotation");

      prio_min = sched_get_priority_min (policy);
      prio_max = sched_get_priority_max (policy);
      if (UNLIKELY (prio_min < 0 || prio_max < 0))
        return crun_make_error (err, errno, "get scheduler priority range");

      /* Both bounds are `int`, so a value inside the range also fits in
         `param.sched_priority`.  */
      if (priority < prio_min || priority > prio_max)
        return crun_make_error (err, 0, "scheduler priority value `%lli` out of range", priority);

      param.sched_priority = (int) priority;
    }

  ret = sched_setscheduler (pid, option | policy, &param);
  if (UNLIKELY (ret < 0))
    return crun_make_error (err, errno, "sched_setscheduler");

  return 0;
}
2 changes: 2 additions & 0 deletions src/libcrun/linux.h
Original file line number Diff line number Diff line change
Expand Up @@ -116,4 +116,6 @@ int libcrun_create_dev (libcrun_container_t *container, int devfd,
int parse_idmapped_mount_option (runtime_spec_schema_config_schema *def, bool is_uids, char *option, char **out,
size_t *len, libcrun_error_t *err);

int libcrun_set_scheduler (pid_t pid, libcrun_container_t *container, libcrun_error_t *err);

#endif

0 comments on commit 6ba6a00

Please sign in to comment.