Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

container: add custom annotation to specify the scheduler #1164

Merged
merged 2 commits into from
Mar 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 17 additions & 2 deletions crun.1
Original file line number Diff line number Diff line change
Expand Up @@ -685,6 +685,21 @@ wasm module is relayed back via crun.

.RE

.SH \fB\fCrun.oci.scheduler\fR
.PP
The \fB\fCrun.oci.scheduler\fR annotation allows you to set the scheduling
policy for the container process. The value of the annotation must
be in the format \fB\fCPOLICY[|OPTION][#PRIORITY]\fR, where \fB\fCPOLICY\fR is the
name of the scheduling policy (\fB\fCSCHED_OTHER\fR, \fB\fCSCHED_BATCH\fR,
\fB\fCSCHED_IDLE\fR, \fB\fCSCHED_FIFO\fR or \fB\fCSCHED_RR\fR), \fB\fCOPTION\fR is an
optional flag (currently only \fB\fCSCHED_RESET_ON_FORK\fR), and \fB\fCPRIORITY\fR
is an optional integer priority value.

.PP
This is an experimental feature: the annotation will be removed once
scheduler configuration is part of the OCI runtime specification.

.PP
Please refer to \fB\fCsched_setscheduler(2)\fR for more information.

.SH tmpcopyup mount options
.PP
If the \fB\fCtmpcopyup\fR option is specified for a tmpfs, then the path that
Expand Down Expand Up @@ -814,8 +829,8 @@ For example, the mapping: \fB\fCuids=@1-3-10\fR, given a configuration like

.PP
will be converted to the absolute value \fB\fCuids=1-4-10\fR, where 4 is
calculated by adding 3 (container ID in the \fB\fCuids=\fR mapping) and 1
(\fB\fChostID - containerID\fR for the user namespace mapping where
\fB\fCcontainerID = 1\fR is found).

.PP
Expand Down
13 changes: 13 additions & 0 deletions crun.1.md
Original file line number Diff line number Diff line change
Expand Up @@ -541,6 +541,19 @@ workload natively. Accepts a `.wasm` binary as input and if `.wat` is
provided it will be automatically compiled into a wasm module. Stdout of
wasm module is relayed back via crun.

## `run.oci.scheduler`

The `run.oci.scheduler` annotation allows you to set the scheduling
policy for the container process. The value of the annotation must
be in the format `POLICY[|OPTION][#PRIORITY]`, where `POLICY` is the
name of the scheduling policy (`SCHED_OTHER`, `SCHED_BATCH`,
`SCHED_IDLE`, `SCHED_FIFO` or `SCHED_RR`), `OPTION` is an optional
flag (currently only `SCHED_RESET_ON_FORK`), and `PRIORITY` is an
optional integer priority value.

This is an experimental feature: the annotation will be removed once
scheduler configuration is part of the OCI runtime specification.

Please refer to `sched_setscheduler(2)` for more information.

## tmpcopyup mount options

If the `tmpcopyup` option is specified for a tmpfs, then the path that
Expand Down
73 changes: 72 additions & 1 deletion src/libcrun/cgroup-systemd.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include <signal.h>
#include <sys/vfs.h>
#include <inttypes.h>
#include <fcntl.h>
#include <time.h>

#ifdef HAVE_SYSTEMD
Expand Down Expand Up @@ -68,17 +69,78 @@ get_systemd_scope_and_slice (const char *id, const char *cgroup_path, char **sco
}
}

/* Write the realtime period and runtime configured in RESOURCES to the "cpu"
   cgroup located at PATH below CGROUP_ROOT.

   Unless PATH ends with ".scope", each value is written to the parent cgroup
   ("../cpu.rt_*_us") before it is written to the cgroup itself.  A value of
   zero is treated as "not configured" and is skipped.

   Returns 0 when there is nothing to do or every write succeeded; otherwise
   the error returned by the failing helper, with ERR filled in.  */
static int
setup_rt_runtime (runtime_spec_schema_config_linux_resources *resources,
                  const char *path, libcrun_error_t *err)
{
  cleanup_free char *cpu_cgroup_dir = NULL;
  cleanup_close int cgroup_fd = -1;
  bool update_parent = true;
  char value[64];
  size_t value_len;
  size_t i;
  int ret;

  /* No cpu section means there are no realtime settings to apply.  */
  if (resources == NULL || resources->cpu == NULL)
    return 0;

  /* A scope only gets its own files updated, never the ones of its parent.  */
  if (has_suffix (path, ".scope"))
    update_parent = false;

  ret = append_paths (&cpu_cgroup_dir, err, CGROUP_ROOT, "cpu", path, NULL);
  if (UNLIKELY (ret < 0))
    return ret;

  cgroup_fd = open (cpu_cgroup_dir, O_DIRECTORY | O_CLOEXEC);
  if (UNLIKELY (cgroup_fd < 0))
    return crun_make_error (err, errno, "open `%s`", cpu_cgroup_dir);

  if (resources->cpu->realtime_period)
    {
      /* Parent first, then the cgroup itself.  */
      const char *const targets[] = { "../cpu.rt_period_us", "cpu.rt_period_us" };

      value_len = sprintf (value, "%" PRIu64, resources->cpu->realtime_period);

      for (i = update_parent ? 0 : 1; i < 2; i++)
        {
          ret = write_file_at_with_flags (cgroup_fd, O_WRONLY, 0, targets[i], value, value_len, err);
          if (UNLIKELY (ret < 0))
            return ret;
        }
    }

  if (resources->cpu->realtime_runtime)
    {
      /* Parent first, then the cgroup itself.  */
      const char *const targets[] = { "../cpu.rt_runtime_us", "cpu.rt_runtime_us" };

      value_len = sprintf (value, "%" PRIu64, resources->cpu->realtime_runtime);

      for (i = update_parent ? 0 : 1; i < 2; i++)
        {
          ret = write_file_at_with_flags (cgroup_fd, O_WRONLY, 0, targets[i], value, value_len, err);
          if (UNLIKELY (ret < 0))
            return ret;
        }
    }

  return 0;
}

static int
systemd_finalize (struct libcrun_cgroup_args *args, char **path_out,
int cgroup_mode, const char *suffix, libcrun_error_t *err)
{
runtime_spec_schema_config_linux_resources *resources = args->resources;
cleanup_free char *cgroup_path = NULL;
cleanup_free char *content = NULL;
cleanup_free char *path = NULL;
pid_t pid = args->pid;
int ret;
char *from, *to;
char *saveptr = NULL;
cleanup_free char *cgroup_path = NULL;

xasprintf (&cgroup_path, "/proc/%d/cgroup", pid);
ret = read_all_file (cgroup_path, &content, NULL, err);
Expand Down Expand Up @@ -148,6 +210,11 @@ systemd_finalize (struct libcrun_cgroup_args *args, char **path_out,
}
}
}

ret = setup_rt_runtime (resources, path, err);
if (UNLIKELY (ret < 0))
return ret;

break;

case CGROUP_MODE_UNIFIED:
Expand Down Expand Up @@ -1071,6 +1138,10 @@ libcrun_update_resources_systemd (struct libcrun_cgroup_status *cgroup_status,
goto exit;
}

ret = setup_rt_runtime (resources, cgroup_status->path, err);
if (UNLIKELY (ret < 0))
goto exit;

ret = 0;

exit:
Expand Down
4 changes: 4 additions & 0 deletions src/libcrun/container.c
Original file line number Diff line number Diff line change
Expand Up @@ -2376,6 +2376,10 @@ libcrun_container_run_internal (libcrun_container_t *container, libcrun_context_
if (UNLIKELY (ret < 0))
goto fail;

ret = libcrun_set_scheduler (pid, container, err);
if (UNLIKELY (ret < 0))
return ret;

/* The container is waiting that we write back. In this phase we can launch the
prestart hooks. */
if (def->hooks && def->hooks->prestart_len)
Expand Down
102 changes: 99 additions & 3 deletions src/libcrun/linux.c
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@
#include <sys/xattr.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <sched.h>

#include <yajl/yajl_tree.h>
#include <yajl/yajl_gen.h>
Expand Down Expand Up @@ -4650,7 +4651,8 @@ libcrun_run_linux_container (libcrun_container_t *container, container_entrypoin
}

static int
join_process_parent_helper (pid_t child_pid, int sync_socket_fd,
join_process_parent_helper (libcrun_container_t *container,
pid_t child_pid, int sync_socket_fd,
libcrun_container_status_t *status,
bool need_move_to_cgroup, const char *sub_cgroup,
int *terminal_fd, libcrun_error_t *err)
Expand Down Expand Up @@ -4700,6 +4702,11 @@ join_process_parent_helper (pid_t child_pid, int sync_socket_fd,
if (UNLIKELY (ret < 0))
return ret;
}

/* Set the scheduler immediately after joining the cgroup. */
ret = libcrun_set_scheduler (pid, container, err);
if (UNLIKELY (ret < 0))
return ret;
}

/* The write unblocks the grandchild process so it can run once we setup
Expand Down Expand Up @@ -4944,6 +4951,16 @@ libcrun_join_process (libcrun_container_t *container, pid_t pid_to_join,

pid = syscall_clone3 (&clone3_args);

if (pid > 0)
{
/* We need to set the scheduler as soon as possible after joining the cgroup,
because if it is a RT scheduler, other processes in the container could already
take the entire cpu time and stall the new process. */
ret = libcrun_set_scheduler (pid, container, err);
if (UNLIKELY (ret < 0))
return ret;
}

/* On errors, fall back to fork(). */
if (pid < 0)
{
Expand All @@ -4961,8 +4978,9 @@ libcrun_join_process (libcrun_container_t *container, pid_t pid_to_join,
{
close_and_reset (&sync_socket_fd[1]);
sync_fd = sync_socket_fd[0];
return join_process_parent_helper (pid, sync_fd, status, need_move_to_cgroup,
sub_cgroup, terminal_fd, err);
return join_process_parent_helper (container, pid, sync_fd, status,
need_move_to_cgroup, sub_cgroup,
terminal_fd, err);
}

close_and_reset (&sync_socket_fd[0]);
Expand Down Expand Up @@ -5270,3 +5288,81 @@ libcrun_kill_linux (libcrun_container_status_t *status, int signal, libcrun_erro
return crun_make_error (err, errno, "kill container");
return 0;
}

/* Apply the scheduler requested through the `run.oci.scheduler` annotation
   to the process PID.

   The annotation value has the form `POLICY[|OPTION][#PRIORITY]`:
     - POLICY is one of SCHED_OTHER, SCHED_BATCH, SCHED_IDLE, SCHED_FIFO or
       SCHED_RR.  At most one policy may be given; when none is given the
       policy value 0 (SCHED_OTHER) is used.
     - OPTION is SCHED_RESET_ON_FORK and is OR-ed into the policy passed to
       sched_setscheduler(2).
     - PRIORITY is a decimal integer that must lie within the range reported
       by sched_get_priority_min/max for the chosen policy.

   Returns 0 when the annotation is not present or the scheduler was set.
   On a malformed annotation or a failing sched_setscheduler(2) the value
   produced by crun_make_error is returned (callers treat a negative value
   as failure) and ERR describes the problem.  */
int
libcrun_set_scheduler (pid_t pid, libcrun_container_t *container, libcrun_error_t *err)
{
  /* Names accepted in the annotation.  Entries with a non-zero OPTION_VALUE
     are flags; every other entry is a scheduling policy.  */
  static const struct
  {
    const char *name;
    int value;
    int option_value;
  } policies[] = {
    { "SCHED_OTHER", SCHED_OTHER, 0 },
    { "SCHED_BATCH", SCHED_BATCH, 0 },
    { "SCHED_IDLE", SCHED_IDLE, 0 },
    { "SCHED_FIFO", SCHED_FIFO, 0 },
    { "SCHED_RR", SCHED_RR, 0 },
    { "SCHED_RESET_ON_FORK", 0, SCHED_RESET_ON_FORK },
    { NULL, 0, 0 },
  };
  cleanup_free char *copy = NULL;
  struct sched_param param;
  bool policy_set = false;
  int policy = 0;
  int option = 0;
  char *v_priority;
  const char *v;
  char *sptr = NULL;
  int ret;

  /* Most containers do not carry the annotation: nothing to do then.  */
  v = find_annotation (container, "run.oci.scheduler");
  if (LIKELY (v == NULL))
    return 0;

  memset (&param, 0, sizeof (param));

  /* Work on a private copy, both strchr-splitting and strtok_r modify it.  */
  copy = xstrdup (v);
  v_priority = strchr (copy, '#');
  if (v_priority)
    *v_priority = '\0';

  for (v = strtok_r (copy, "|", &sptr); v; v = strtok_r (NULL, "|", &sptr))
    {
      size_t i;

      for (i = 0; policies[i].name; i++)
        if (strcmp (v, policies[i].name) == 0)
          break;

      if (UNLIKELY (policies[i].name == NULL))
        return crun_make_error (err, 0, "invalid scheduler `%s`", v);

      if (policies[i].option_value)
        {
          option |= policies[i].option_value;
          continue;
        }

      /* Policy values are an enumeration, not bit flags: OR-ing two of them
         (e.g. SCHED_FIFO | SCHED_RR) would silently select a third policy.  */
      if (UNLIKELY (policy_set))
        return crun_make_error (err, 0, "multiple scheduling policies specified, found `%s`", v);

      policy = policies[i].value;
      policy_set = true;
    }

  if (v_priority)
    {
      const char *text = v_priority + 1;
      long long priority;
      char *ep = NULL;

      errno = 0;
      priority = strtoll (text, &ep, 10);
      /* ep == text means no digits were consumed, e.g. an empty priority.  */
      if (UNLIKELY (ep == NULL || ep == text || *ep != '\0'))
        return crun_make_error (err, EINVAL, "parse scheduler annotation");
      if (UNLIKELY (errno))
        return crun_make_error (err, errno, "parse scheduler annotation");

      if (priority >= INT_MAX || priority <= INT_MIN
          || priority < sched_get_priority_min (policy) || priority > sched_get_priority_max (policy))
        return crun_make_error (err, 0, "scheduler priority value `%lli` out of range", priority);

      param.sched_priority = (int) priority;
    }

  ret = sched_setscheduler (pid, option | policy, &param);
  if (UNLIKELY (ret < 0))
    return crun_make_error (err, errno, "sched_setscheduler");

  return 0;
}
2 changes: 2 additions & 0 deletions src/libcrun/linux.h
Original file line number Diff line number Diff line change
Expand Up @@ -116,4 +116,6 @@ int libcrun_create_dev (libcrun_container_t *container, int devfd,
int parse_idmapped_mount_option (runtime_spec_schema_config_schema *def, bool is_uids, char *option, char **out,
size_t *len, libcrun_error_t *err);

/* Apply the scheduler described by the container's `run.oci.scheduler`
   annotation to the process PID.  Returns 0 when the annotation is not set or
   the scheduler was applied; on failure the error is reported through ERR and
   callers treat a negative return value as an error.  */
int libcrun_set_scheduler (pid_t pid, libcrun_container_t *container, libcrun_error_t *err);

#endif