diff --git a/fs/proc/base.c b/fs/proc/base.c index b9052be86e8d56a7aaa83d847b30bd58ded52a7a..8ae7c2be70c273cca91a1faff8551fd1f0cc6b0e 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3644,6 +3644,68 @@ static const struct inode_operations proc_tid_comm_inode_operations = { .permission = proc_tid_comm_permission, }; +#ifdef CONFIG_BPF_SCHED +static ssize_t pid_tag_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file_inode(file); + struct task_struct *tsk; + char buffer[PROC_NUMBUF]; + int err = 0, tag = 0; + + tsk = get_proc_task(inode); + if (!tsk) + return -ESRCH; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } + + err = kstrtoint(strstrip(buffer), 0, &tag); + if (err) + goto out; + + sched_settag(tsk, tag); + +out: + put_task_struct(tsk); + return err < 0 ? err : count; +} + +static int pid_tag_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *tsk; + + tsk = get_proc_task(inode); + if (!tsk) + return -ESRCH; + + seq_printf(m, "%ld\n", tsk->tag); + put_task_struct(tsk); + + return 0; +} + +static int pid_tag_open(struct inode *inode, struct file *flip) +{ + return single_open(flip, pid_tag_show, inode); +} + +static const struct file_operations proc_pid_tag_operations = { + .open = pid_tag_open, + .read = seq_read, + .write = pid_tag_write, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + /* * Tasks */ @@ -3751,6 +3813,9 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_ASCEND_SHARE_POOL ONE("sp_group", 0444, proc_sp_group_state), #endif +#ifdef CONFIG_BPF_SCHED + REG("tag", 0644, proc_pid_tag_operations), +#endif }; static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) diff --git a/include/linux/bpf_topology.h b/include/linux/bpf_topology.h new file mode 100644 index 0000000000000000000000000000000000000000..0c7ee492edde392c39d280ca8f8ff3cb4949e461 --- /dev/null +++ b/include/linux/bpf_topology.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
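For reference, the fs/proc/base.c hunk above registers the new tag file in tid_base_stuff, so it shows up per thread as /proc/<pid>/task/<tid>/tag; writes are parsed with kstrtoint() and applied through sched_settag(), and reads print the current tag as "%ld\n". A minimal userspace sketch of tagging the calling thread follows; it is illustrative only (the helper name is invented and error handling is trimmed), not part of the patch.

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

/* Illustrative only: write a decimal tag to the calling thread's tag file. */
static int set_thread_tag(long tag)
{
        char path[64];
        FILE *f;

        snprintf(path, sizeof(path), "/proc/%d/task/%ld/tag",
                 getpid(), syscall(SYS_gettid));
        f = fopen(path, "w");
        if (!f)
                return -1;
        fprintf(f, "%ld", tag);         /* parsed by kstrtoint() in pid_tag_write() */
        return fclose(f);
}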
+ */ + +#ifndef _LINUX_BPF_TOPOLOGY_H +#define _LINUX_BPF_TOPOLOGY_H + +#include + +struct bpf_cpu_topology { + int cpu; + int core_id; + int cluster_id; + int die_id; + int physical_package_id; + int numa_node; + struct cpumask thread_siblings; + struct cpumask core_siblings; + struct cpumask cluster_cpus; + struct cpumask die_cpus; + struct cpumask package_cpus; + struct cpumask node_cpu_lists; +}; + +struct bpf_cpumask_info { + unsigned int nums_possible_cpus; + unsigned int nums_active_cpus; + unsigned int nums_isolate_cpus; + unsigned int nr_cpu_ids; + unsigned int bpf_nr_cpumask_bits; + struct cpumask cpu_possible_cpumask; + struct cpumask cpu_active_cpumask; + struct cpumask cpu_isolate_cpumask; +}; + +#endif /* _LINUX_BPF_TOPOLOGY_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 47f462040f4dfc4c1baed9960ece7aa2ba8e8b4a..41df850003fbbb0ba4a0bfec8b4c4e7b41cb3259 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1397,7 +1397,12 @@ struct task_struct { */ randomized_struct_fields_end +#ifdef CONFIG_BPF_SCHED + /* Used to pad the tag of a task */ + long tag; +#else KABI_RESERVE(1) +#endif KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4) @@ -2176,4 +2181,84 @@ static inline int sched_qos_cpu_overload(void) return 0; } #endif + +#ifdef CONFIG_BPF_SCHED +extern void sched_settag(struct task_struct *tsk, s64 tag); + +struct bpf_sched_cpu_load { + unsigned long cfs_load_avg; + unsigned long cfs_runnable_avg; + unsigned long cfs_util_avg; + unsigned long rt_load_avg; + unsigned long rt_runnable_avg; + unsigned long rt_util_avg; + unsigned long irq_load_avg; + unsigned long irq_runnable_avg; + unsigned long irq_util_avg; +}; + +struct bpf_sched_cpu_nr_running { + unsigned int nr_running; + unsigned int cfs_nr_running; + unsigned int cfs_h_nr_running; + unsigned int cfs_idle_h_nr_running; + unsigned int rt_nr_running; + unsigned int rr_nr_running; +}; + +struct bpf_sched_cpu_idle_stat { + int available_idle; + unsigned int exit_latency; + unsigned long idle_stamp; + unsigned long avg_idle; +}; + +struct bpf_sched_cpu_capacity { + unsigned long capacity; + unsigned long capacity_orig; +}; + +struct cpumask_op_args { + unsigned int op_type; + void *arg1; + void *arg2; + void *arg3; + void *arg4; +}; + +enum cpumask_op_type { + CPUMASK_EMPTY, + CPUMASK_AND, + CPUMASK_ANDNOT, + CPUMASK_SUBSET, + CPUMASK_EQUAL, + CPUMASK_TEST_CPU, + CPUMASK_COPY, + CPUMASK_WEIGHT, + CPUMASK_NEXT, + CPUMASK_NEXT_WRAP, + CPUMASK_NEXT_AND, + CPUMASK_CPULIST_PARSE +}; + +struct sched_migrate_ctx { + struct task_struct *task; + struct cpumask *cpus_allowed; + struct cpumask *select_idle_mask; + int prev_cpu; + int curr_cpu; + int is_sync; + int want_affine; + int wake_flags; + int sd_flag; + int new_cpu; +}; + +struct sched_affine_ctx { + struct task_struct *task; + int prev_cpu; + int curr_cpu; + int is_sync; +}; +#endif #endif diff --git a/include/linux/sched_hook_defs.h b/include/linux/sched_hook_defs.h index e2f65e4b8895daba3cbafc76efc413be410da587..07b3063d6f5685874a4f9ccf17a63553f3a43baf 100644 --- a/include/linux/sched_hook_defs.h +++ b/include/linux/sched_hook_defs.h @@ -3,3 +3,8 @@ BPF_SCHED_HOOK(int, 0, cfs_check_preempt_tick, struct sched_entity *curr, unsign BPF_SCHED_HOOK(int, 0, cfs_check_preempt_wakeup, struct task_struct *curr, struct task_struct *p) BPF_SCHED_HOOK(int, 0, cfs_wakeup_preempt_entity, struct sched_entity *curr, struct sched_entity *se) +BPF_SCHED_HOOK(int, -1, cfs_select_rq, struct sched_migrate_ctx *ctx) +BPF_SCHED_HOOK(int, -1, cfs_wake_affine, struct 
sched_affine_ctx *ctx) +BPF_SCHED_HOOK(int, -1, cfs_select_rq_exit, struct sched_migrate_ctx *ctx) +BPF_SCHED_HOOK(int, 0, cfs_tag_pick_next_entity, struct sched_entity *curr, + struct sched_entity *next) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d5fbbc28b6a047a984d3c1325bf4ad3ae35d4a9d..1b2f54151cdb9a75d57f1fe91eee517509789a91 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3777,6 +3777,141 @@ union bpf_attr { * to be enabled. * Return * 1 if the sched entity belongs to a cgroup, 0 otherwise. + * + * long bpf_sched_tg_tag_of(struct task_group *tg) + * Description + * Return task group tag of *tg* if CONFIG_CGROUP_SCHED enabled. + * The bpf prog obtains the tags to detect different workloads. + * Return + * Task group tag, if CONFIG_CGROUP_SCHED enabled, 0 as default tag, or + * a negative error in case of failure. + * + * long bpf_sched_task_tag_of(struct task_struct *tsk) + * Description + * Return task tag of *tsk*.The bpf prog obtains the tags to detect + * different workloads. + * Return + * Task tag, if used, 0 as default tag, or a negative error in case of failure. + * + * long bpf_sched_entity_is_task(struct sched_entity *se) + * Description + * Checks whether the sched entity is a task. + * Return + * 1 if true, 0 otherwise. + * + * struct task_struct *bpf_sched_entity_to_task(struct sched_entity *se) + * Description + * Return task struct of *se* if se is a task. + * Return + * Task struct if se is a task, NULL otherwise. + * + * struct task_group *bpf_sched_entity_to_tg(struct sched_entity *se) + * Description + * Return task group of *se* if se is a task group. + * Return + * Task struct if se is a task group, NULL otherwise. + * + * int bpf_sched_set_tg_tag(struct task_group *tg, s64 tag) + * Description + * Set tag to *tg* and its descendants. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_sched_set_task_tag(struct task_struct *tsk, s64 tag) + * Description + * Set tag to *tsk*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_sched_cpu_load_of(int cpu, struct bpf_sched_cpu_load *ctx, int len) + * Description + * Get multiple types of *cpu* load and store in *ctx*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_sched_cpu_nr_running_of(int cpu, struct bpf_sched_cpu_nr_running *ctx, int len) + * Description + * Get multiple types of *cpu* nr running and store in *ctx*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_sched_cpu_idle_stat_of(int cpu, struct bpf_sched_cpu_idle_stat *ctx, int len) + * Description + * Get *cpu* idle state and store in *ctx*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_sched_cpu_capacity_of(int cpu, struct bpf_sched_cpu_capacity *ctx, int len) + * Description + * Get *cpu* capacity and store in *ctx*. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_init_cpu_topology(struct bpf_map *map, u64 flags) + * Description + * Initializing the cpu topology which used for bpf prog. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_get_cpumask_info(struct bpf_cpumask_info *cpus, int len) + * Description + * Get system cpus returned in *cpus*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_cpumask_op(struct cpumask_op_args *op, int len) + * Description + * A series of cpumask-related operations. 
Perform different + * operations based on *op*->type. The user must also fill in the other + * *op* fields based on *op*->type. *op*->type is one of: + * + * **CPUMASK_EMPTY** + * *(op->arg1) == 0 returned. + * **CPUMASK_AND** + * *(op->arg1) = *(op->arg2) & *(op->arg3) + * **CPUMASK_ANDNOT** + * *(op->arg1) = *(op->arg2) & ~*(op->arg3) + * **CPUMASK_SUBSET** + * *(op->arg1) & ~*(op->arg2) == 0 returned + * **CPUMASK_EQUAL** + * *(op->arg1) == *(op->arg2) returned + * **CPUMASK_TEST_CPU** + * test for a cpu *(int)(op->arg1) in *(op->arg2) + * returns 1 if *op*->arg1 is set in *op*->arg2, else returns 0 + * **CPUMASK_COPY** + * *(op->arg1) = *(op->arg2), return 0 always + * **CPUMASK_WEIGHT** + * count of bits in *(op->arg1) + * **CPUMASK_NEXT** + * get the next cpu in *(struct cpumask *)(op->arg2) + * *(int *)(op->arg1): the cpu prior to the place to search + * **CPUMASK_NEXT_WRAP** + * helper to implement for_each_cpu_wrap + * @op->arg1: the cpu prior to the place to search + * @op->arg2: the cpumask pointer + * @op->arg3: the start point of the iteration + * @op->arg4: assume @op->arg1 crossing @op->arg3 terminates the iteration + * returns >= nr_cpu_ids on completion + * **CPUMASK_NEXT_AND** + * get the next cpu in *(op->arg1) & *(op->arg2) + * **CPUMASK_CPULIST_PARSE** + * extract a cpumask from a user string of ranges. + * (char *)op->arg1 -> (struct cpumask *)(op->arg2) + * 0 on success, or a negative error in case of failure. + * Return + * See the per-operation descriptions above. + * + * int bpf_cpus_share_cache(int src_cpu, int dst_cpu) + * Description + * Check whether *src_cpu* and *dst_cpu* share cache. + * Return + * 1 if they share cache, 0 otherwise. + * + * int bpf_sched_set_task_cpus_ptr(struct sched_migrate_ctx *h_ctx, struct cpumask *cpus, int len) + * Description + * Set *cpus* as the *cpus_ptr* of the task in *h_ctx*. + * Return + * 0 on success, or a negative error in case of failure. 
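To make the calling convention concrete, here is a hedged sketch of a sched BPF program combining a few of the helpers documented above: bpf_cpumask_op() with CPUMASK_TEST_CPU, bpf_sched_cpu_load_of(), and bpf_cpus_share_cache(). The program name is invented, it assumes the includes and helper declarations used by the samples later in this patch, and it reads context fields directly where those samples go through bpf_probe_read_kernel().

SEC("sched/cfs_select_rq")
int BPF_PROG(sample_select_rq, struct sched_migrate_ctx *h_ctx)
{
        struct bpf_sched_cpu_load load = {};
        struct cpumask_op_args op = {};
        int prev_cpu = h_ctx->prev_cpu;

        /* CPUMASK_TEST_CPU: arg1 points to the cpu, arg2 is the mask to test. */
        op.op_type = CPUMASK_TEST_CPU;
        op.arg1 = &prev_cpu;
        op.arg2 = h_ctx->cpus_allowed;
        if (!bpf_cpumask_op(&op, sizeof(op)))
                return -1;              /* prev_cpu not allowed, use the default path */

        /* len must equal sizeof(load), otherwise the helper returns -EINVAL */
        if (bpf_sched_cpu_load_of(prev_cpu, &load, sizeof(load)))
                return -1;

        if (!load.cfs_util_avg &&
            bpf_cpus_share_cache(prev_cpu, h_ctx->curr_cpu))
                return prev_cpu;        /* keep the wakee where it ran last */

        return -1;                      /* negative: fall back to default selection */
}

A negative return matches the hook's default value (-1) declared in sched_hook_defs.h, so select_task_rq_fair() falls through to its normal selection logic.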
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3940,6 +4075,22 @@ union bpf_attr { FN(sched_entity_to_tgidpid), \ FN(sched_entity_to_cgrpid), \ FN(sched_entity_belongs_to_cgrp), \ + FN(sched_tg_tag_of), \ + FN(sched_task_tag_of), \ + FN(sched_entity_is_task), \ + FN(sched_entity_to_task), \ + FN(sched_entity_to_tg), \ + FN(sched_set_tg_tag), \ + FN(sched_set_task_tag), \ + FN(sched_cpu_load_of), \ + FN(sched_cpu_nr_running_of), \ + FN(sched_cpu_idle_stat_of), \ + FN(sched_cpu_capacity_of), \ + FN(init_cpu_topology), \ + FN(get_cpumask_info), \ + FN(cpumask_op), \ + FN(cpus_share_cache), \ + FN(sched_set_task_cpus_ptr), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/init/init_task.c b/init/init_task.c index 5fa18ed59d33e70edc516308306cb3bf8408a1a4..7003426df677bd526a7da7b0d2692c73485a6caa 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -213,6 +213,9 @@ struct task_struct init_task #ifdef CONFIG_SECCOMP_FILTER .seccomp = { .filter_count = ATOMIC_INIT(0) }, #endif +#ifdef CONFIG_BPF_SCHED + .tag = 0, +#endif }; EXPORT_SYMBOL(init_task); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 4bb5921a7d2177b3c11883a15bee72f13207f0fe..5fccf33196b5dff4c249e4fc608128f076348342 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -658,6 +658,10 @@ const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak; +const struct bpf_func_proto bpf_sched_tg_tag_of_proto __weak; +const struct bpf_func_proto bpf_sched_task_tag_of_proto __weak; +const struct bpf_func_proto bpf_sched_set_tg_tag_proto __weak; +const struct bpf_func_proto bpf_sched_set_task_tag_proto __weak; const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) @@ -697,6 +701,10 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ringbuf_discard_proto; case BPF_FUNC_ringbuf_query: return &bpf_ringbuf_query_proto; + case BPF_FUNC_sched_tg_tag_of: + return &bpf_sched_tg_tag_of_proto; + case BPF_FUNC_sched_task_tag_of: + return &bpf_sched_task_tag_of_proto; default: break; } @@ -715,6 +723,10 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_per_cpu_ptr_proto; case BPF_FUNC_this_cpu_ptr: return &bpf_this_cpu_ptr_proto; + case BPF_FUNC_sched_set_tg_tag: + return &bpf_sched_set_tg_tag_proto; + case BPF_FUNC_sched_set_task_tag: + return &bpf_sched_set_task_tag_proto; default: break; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d26104b258baba4020f1587504c81ec4b403894e..7acc2cd0081ff3efd1c2b952def036ac8d72507b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5020,10 +5020,10 @@ static bool check_btf_id_ok(const struct bpf_func_proto *fn) int i; for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) { - if (fn->arg_type[i] == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i]) + if (base_type(fn->arg_type[i]) == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i]) return false; - if (fn->arg_type[i] != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i]) + if (base_type(fn->arg_type[i]) != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i]) return false; } diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 8ae9e39eb83ab9cf9bd30f5005932a3ce924e370..c809d5c28424bf27e656c645663f1225028addfd 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -36,4 +36,5 @@ obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o 
obj-$(CONFIG_MEMBARRIER) += membarrier.o obj-$(CONFIG_CPU_ISOLATION) += isolation.o obj-$(CONFIG_PSI) += psi.o -obj-$(CONFIG_BPF_SCHED) += bpf_sched.o \ No newline at end of file +obj-$(CONFIG_BPF_SCHED) += bpf_sched.o +obj-$(CONFIG_BPF_SCHED) += bpf_topology.o \ No newline at end of file diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c index 2ce2afcacb179a5844da670264e8b77911034af7..7485077d5a933b3bba5667823c39eeff5d045260 100644 --- a/kernel/sched/bpf_sched.c +++ b/kernel/sched/bpf_sched.c @@ -4,6 +4,7 @@ #include #include #include +#include #include "sched.h" DEFINE_STATIC_KEY_FALSE(bpf_sched_enabled_key); @@ -26,6 +27,9 @@ BTF_SET_START(bpf_sched_hooks) #undef BPF_SCHED_HOOK BTF_SET_END(bpf_sched_hooks) +const struct bpf_func_proto bpf_init_cpu_topology_proto __weak; +const struct bpf_func_proto bpf_get_cpumask_info_proto __weak; + int bpf_sched_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog) { @@ -112,6 +116,414 @@ static const struct bpf_func_proto bpf_sched_entity_belongs_to_cgrp_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_1(bpf_sched_tg_tag_of, struct task_group *, tg) +{ + int ret = 0; + +#ifdef CONFIG_CGROUP_SCHED + if (tg == NULL) + return -EINVAL; + ret = tg->tag; +#endif + + return ret; +} + +BTF_ID_LIST_SINGLE(btf_sched_tg_ids, struct, task_group) + +const struct bpf_func_proto bpf_sched_tg_tag_of_proto = { + .func = bpf_sched_tg_tag_of, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_sched_tg_ids[0], +}; + +BPF_CALL_1(bpf_sched_task_tag_of, struct task_struct *, tsk) +{ + if (tsk == NULL) + return -EINVAL; + return tsk->tag; +} + +BTF_ID_LIST_SINGLE(btf_sched_task_ids, struct, task_struct) + +const struct bpf_func_proto bpf_sched_task_tag_of_proto = { + .func = bpf_sched_task_tag_of, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_sched_task_ids[0], +}; + +BPF_CALL_1(bpf_sched_entity_is_task, struct sched_entity *, se) +{ + return entity_is_task(se) ? 
1 : 0; +} + +static const struct bpf_func_proto bpf_sched_entity_is_task_proto = { + .func = bpf_sched_entity_is_task, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_sched_entity_ids[0], +}; + +BPF_CALL_1(bpf_sched_entity_to_task, struct sched_entity *, se) +{ + if (entity_is_task(se)) { + struct task_struct *tsk = task_of(se); + + return (unsigned long)tsk; + } + + return (unsigned long)NULL; +} + +static const struct bpf_func_proto bpf_sched_entity_to_task_proto = { + .func = bpf_sched_entity_to_task, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .ret_btf_id = &btf_sched_task_ids[0], + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_sched_entity_ids[0], +}; + +BPF_CALL_1(bpf_sched_entity_to_tg, struct sched_entity *, se) +{ +#if CONFIG_FAIR_GROUP_SCHED + if (!entity_is_task(se)) { + struct task_group *tg = group_cfs_rq(se)->tg; + + return (unsigned long)tg; + } +#endif + + return (unsigned long)NULL; +} + +static const struct bpf_func_proto bpf_sched_entity_to_tg_proto = { + .func = bpf_sched_entity_to_tg, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .ret_btf_id = &btf_sched_tg_ids[0], + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_sched_entity_ids[0], +}; + +BPF_CALL_2(bpf_sched_set_tg_tag, struct task_group *, tg, s64, tag) +{ +#if CONFIG_CGROUP_SCHED + if (tg == NULL || tg == &root_task_group) + return -EINVAL; + + if (tg->tag == tag) + return 0; + + rcu_read_lock(); + walk_tg_tree_from(tg, tg_change_tag, tg_nop, (void *)(&tag)); + rcu_read_unlock(); + + return 0; +#endif + return -EPERM; +} + +const struct bpf_func_proto bpf_sched_set_tg_tag_proto = { + .func = bpf_sched_set_tg_tag, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_sched_tg_ids[0], + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_sched_set_task_tag, struct task_struct *, tsk, s64, tag) +{ + if (tsk == NULL) + return -EINVAL; + + sched_settag(tsk, tag); + return 0; +} + +const struct bpf_func_proto bpf_sched_set_task_tag_proto = { + .func = bpf_sched_set_task_tag, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_sched_task_ids[0], + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_3(bpf_sched_cpu_load_of, int, cpu, + struct bpf_sched_cpu_load *, ctx, + int, len) +{ + struct rq *rq; + + if (len != sizeof(*ctx)) + return -EINVAL; + + if ((unsigned int)cpu >= nr_cpu_ids) + return -EINVAL; + + memset(ctx, 0, sizeof(struct bpf_sched_cpu_load)); +#ifdef CONFIG_SMP + rq = cpu_rq(cpu); + SCHED_WARN_ON(!rcu_read_lock_held()); + ctx->cfs_load_avg = rq->cfs.avg.load_avg; + ctx->cfs_runnable_avg = rq->cfs.avg.runnable_avg; + ctx->cfs_util_avg = rq->cfs.avg.util_avg; + ctx->rt_load_avg = rq->avg_rt.load_avg; + ctx->rt_runnable_avg = rq->avg_rt.runnable_avg; + ctx->rt_util_avg = rq->avg_rt.util_avg; +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ + ctx->irq_load_avg = rq->avg_irq.load_avg; + ctx->irq_runnable_avg = rq->avg_irq.runnable_avg; + ctx->irq_util_avg = rq->avg_irq.util_avg; +#endif +#endif + + return 0; +} + +static const struct bpf_func_proto bpf_sched_cpu_load_of_proto = { + .func = bpf_sched_cpu_load_of, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + +BPF_CALL_3(bpf_sched_cpu_nr_running_of, int, cpu, + struct bpf_sched_cpu_nr_running *, ctx, + int, len) +{ + struct rq *rq; + + if (len 
!= sizeof(*ctx)) + return -EINVAL; + + if ((unsigned int)cpu >= nr_cpu_ids) + return -EINVAL; + + SCHED_WARN_ON(!rcu_read_lock_held()); + + rq = cpu_rq(cpu); + ctx->nr_running = rq->nr_running; + ctx->cfs_nr_running = rq->cfs.nr_running; + ctx->cfs_h_nr_running = rq->cfs.h_nr_running; + ctx->cfs_idle_h_nr_running = rq->cfs.idle_h_nr_running; + ctx->rt_nr_running = rq->rt.rt_nr_running; + ctx->rr_nr_running = rq->rt.rr_nr_running; + + return 0; +} + +static const struct bpf_func_proto bpf_sched_cpu_nr_running_of_proto = { + .func = bpf_sched_cpu_nr_running_of, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + +BPF_CALL_3(bpf_sched_cpu_idle_stat_of, int, cpu, + struct bpf_sched_cpu_idle_stat *, ctx, + int, len) +{ + struct cpuidle_state *idle; + struct rq *rq; + + if (len != sizeof(*ctx)) + return -EINVAL; + + if ((unsigned int)cpu >= nr_cpu_ids) + return -EINVAL; + + memset(ctx, 0, sizeof(struct bpf_sched_cpu_idle_stat)); + SCHED_WARN_ON(!rcu_read_lock_held()); + ctx->available_idle = available_idle_cpu(cpu); + rq = cpu_rq(cpu); + idle = idle_get_state(rq); + if (idle) + ctx->exit_latency = idle->exit_latency; + +#ifdef CONFIG_SMP + ctx->idle_stamp = rq->idle_stamp; + ctx->avg_idle = rq->avg_idle; +#endif + + return 0; +} + +static const struct bpf_func_proto bpf_sched_cpu_idle_stat_of_proto = { + .func = bpf_sched_cpu_idle_stat_of, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + +BPF_CALL_3(bpf_sched_cpu_capacity_of, int, cpu, + struct bpf_sched_cpu_capacity *, ctx, + int, len) +{ + struct rq *rq; + + if (len != sizeof(*ctx)) + return -EINVAL; + + if ((unsigned int)cpu >= nr_cpu_ids) + return -EINVAL; + + memset(ctx, 0, sizeof(struct bpf_sched_cpu_capacity)); +#ifdef CONFIG_SMP + SCHED_WARN_ON(!rcu_read_lock_held()); + rq = cpu_rq(cpu); + ctx->capacity = rq->cpu_capacity; + ctx->capacity_orig = rq->cpu_capacity_orig; +#endif + + return 0; +} + +static const struct bpf_func_proto bpf_sched_cpu_capacity_of_proto = { + .func = bpf_sched_cpu_capacity_of, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + +BPF_CALL_2(bpf_cpumask_op, struct cpumask_op_args *, op, int, len) +{ + int ret; + + if (len != sizeof(*op) || !op->arg1) + return -EINVAL; + + switch (op->op_type) { + case CPUMASK_EMPTY: + return cpumask_empty((const struct cpumask *)op->arg1); + case CPUMASK_AND: + if (!op->arg2 || !op->arg3) + return -EINVAL; + return cpumask_and((struct cpumask *)op->arg1, + (const struct cpumask *)op->arg2, + (const struct cpumask *)op->arg3); + case CPUMASK_ANDNOT: + if (!op->arg2 || !op->arg3) + return -EINVAL; + cpumask_andnot((struct cpumask *)op->arg1, + (const struct cpumask *)op->arg2, + (const struct cpumask *)op->arg3); + break; + case CPUMASK_SUBSET: + if (!op->arg2) + return -EINVAL; + return cpumask_subset((const struct cpumask *)op->arg1, + (const struct cpumask *)op->arg2); + case CPUMASK_EQUAL: + if (!op->arg2) + return -EINVAL; + return cpumask_equal((const struct cpumask *)op->arg1, + (const struct cpumask *)op->arg2); + case CPUMASK_TEST_CPU: + if (!op->arg2) + return -EINVAL; + return cpumask_test_cpu(*(int *)op->arg1, op->arg2); + case CPUMASK_COPY: + if (!op->arg2) + return -EINVAL; + cpumask_copy((struct cpumask *)op->arg1, + (const struct cpumask *)op->arg2); + break; + case 
CPUMASK_WEIGHT: + return cpumask_weight((const struct cpumask *)op->arg1); + case CPUMASK_NEXT: + if (!op->arg2) + return -EINVAL; + return cpumask_next(*(int *)op->arg1, + (const struct cpumask *)op->arg2); + case CPUMASK_NEXT_WRAP: + if (!op->arg2 || !op->arg3 || !op->arg4) + return -EINVAL; + return cpumask_next_wrap(*(int *)op->arg1, + (const struct cpumask *)op->arg2, + *(int *)op->arg3, *(int *)op->arg4); + case CPUMASK_NEXT_AND: + if (!op->arg2 || !op->arg3) + return -EINVAL; + return cpumask_next_and(*(int *)op->arg1, + (const struct cpumask *)op->arg2, + (const struct cpumask *)op->arg3); + case CPUMASK_CPULIST_PARSE: + if (!op->arg2) + return -EINVAL; + + op->arg1 = (void *)strstrip((void *)op->arg1); + ret = cpulist_parse((void *)op->arg1, + (struct cpumask *)op->arg2); + return ret; + default: + return -EINVAL; + } + + return 0; +} + +static const struct bpf_func_proto bpf_cpumask_op_proto = { + .func = bpf_cpumask_op, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE, +}; + +BPF_CALL_2(bpf_cpus_share_cache, int, src_cpu, int, dst_cpu) +{ + if ((unsigned int)src_cpu >= nr_cpu_ids || + (unsigned int)dst_cpu >= nr_cpu_ids) + return 0; + + return cpus_share_cache(src_cpu, dst_cpu); +} + +static const struct bpf_func_proto bpf_cpus_share_cache_proto = { + .func = bpf_cpus_share_cache, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_3(bpf_sched_set_task_cpus_ptr, struct sched_migrate_ctx *, h_ctx, + struct cpumask *, cpus, int, len) +{ + if (len != sizeof(*cpus)) + return -EINVAL; + + h_ctx->task->cpus_ptr = cpus; + return 0; +} + +BTF_ID_LIST_SINGLE(bpf_sched_migrate_ctx_ids, struct, sched_migrate_ctx) + +static const struct bpf_func_proto bpf_sched_set_task_cpus_ptr_proto = { + .func = bpf_sched_set_task_cpus_ptr, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_sched_migrate_ctx_ids[0], + .arg2_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -124,6 +536,30 @@ bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sched_entity_to_cgrpid_proto; case BPF_FUNC_sched_entity_belongs_to_cgrp: return &bpf_sched_entity_belongs_to_cgrp_proto; + case BPF_FUNC_sched_entity_is_task: + return &bpf_sched_entity_is_task_proto; + case BPF_FUNC_sched_entity_to_task: + return &bpf_sched_entity_to_task_proto; + case BPF_FUNC_sched_entity_to_tg: + return &bpf_sched_entity_to_tg_proto; + case BPF_FUNC_sched_cpu_load_of: + return &bpf_sched_cpu_load_of_proto; + case BPF_FUNC_sched_cpu_nr_running_of: + return &bpf_sched_cpu_nr_running_of_proto; + case BPF_FUNC_sched_cpu_idle_stat_of: + return &bpf_sched_cpu_idle_stat_of_proto; + case BPF_FUNC_sched_cpu_capacity_of: + return &bpf_sched_cpu_capacity_of_proto; + case BPF_FUNC_init_cpu_topology: + return &bpf_init_cpu_topology_proto; + case BPF_FUNC_get_cpumask_info: + return &bpf_get_cpumask_info_proto; + case BPF_FUNC_cpumask_op: + return &bpf_cpumask_op_proto; + case BPF_FUNC_cpus_share_cache: + return &bpf_cpus_share_cache_proto; + case BPF_FUNC_sched_set_task_cpus_ptr: + return &bpf_sched_set_task_cpus_ptr_proto; default: return bpf_base_func_proto(func_id); } diff --git a/kernel/sched/bpf_topology.c b/kernel/sched/bpf_topology.c new file mode 100644 index 0000000000000000000000000000000000000000..9c2eda139e2a29990a562ef5a9fe5785afa747f2 
--- /dev/null +++ b/kernel/sched/bpf_topology.c @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include + +static void bpf_update_cpu_topology(struct bpf_cpu_topology *cpu_topology, int cpu) +{ + cpu_topology->cpu = cpu; + cpu_topology->core_id = topology_core_id(cpu); + cpu_topology->cluster_id = topology_cluster_id(cpu); + cpu_topology->die_id = topology_die_id(cpu); + cpu_topology->physical_package_id = topology_physical_package_id(cpu); + cpu_topology->numa_node = cpu_to_node(cpu); + cpumask_copy(&cpu_topology->thread_siblings, topology_sibling_cpumask(cpu)); + cpumask_copy(&cpu_topology->core_siblings, topology_core_cpumask(cpu)); + cpumask_copy(&cpu_topology->cluster_cpus, topology_cluster_cpumask(cpu)); + cpumask_copy(&cpu_topology->die_cpus, topology_die_cpumask(cpu)); + cpumask_copy(&cpu_topology->package_cpus, topology_core_cpumask(cpu)); + cpumask_copy(&cpu_topology->node_cpu_lists, cpumask_of_node(cpu_to_node(cpu))); +} + +BPF_CALL_2(bpf_init_cpu_topology, struct bpf_map *, map, u64, flags) +{ + const struct cpumask *cpu_map = cpu_active_mask; + int ret = 0; + int i = -1; + + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + for_each_cpu(i, cpu_map) { + struct bpf_cpu_topology topo; + + bpf_update_cpu_topology(&topo, i); + ret = map->ops->map_update_elem(map, &i, &topo, flags); + if (ret) { + int idx = i; + + for (; idx >= 0; idx--) + map->ops->map_delete_elem(map, &idx); + break; + } + } + + return ret; +} + +BTF_ID_LIST_SINGLE(bpf_cpu_topology_ids, struct, bpf_cpu_topology) + +const struct bpf_func_proto bpf_init_cpu_topology_proto = { + .func = bpf_init_cpu_topology, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_get_cpumask_info, struct bpf_cpumask_info *, cpus, + int, len) +{ + if (len != sizeof(*cpus)) + return -EINVAL; + + cpumask_copy(&cpus->cpu_possible_cpumask, cpu_possible_mask); + cpumask_copy(&cpus->cpu_active_cpumask, cpu_active_mask); + cpumask_copy(&cpus->cpu_isolate_cpumask, housekeeping_cpumask(HK_FLAG_DOMAIN)); + cpus->nums_possible_cpus = num_possible_cpus(); + cpus->nums_active_cpus = num_active_cpus(); + cpus->nums_isolate_cpus = cpumask_weight(&cpus->cpu_isolate_cpumask); + cpus->nr_cpu_ids = nr_cpu_ids; + cpus->bpf_nr_cpumask_bits = nr_cpumask_bits; + + return 0; +} + +const struct bpf_func_proto bpf_get_cpumask_info_proto = { + .func = bpf_get_cpumask_info, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE, +}; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 51c707897c8d696f315506cda91d227271615be9..c0fd31446c701235cf260c2e1fd089a1c1662ff4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3113,6 +3113,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #ifdef CONFIG_SMP p->wake_entry.u_flags = CSD_TYPE_TTWU; #endif +#ifdef 
CONFIG_BPF_SCHED + p->tag = 0; +#endif } DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); @@ -7755,6 +7758,13 @@ static void sched_free_group(struct task_group *tg) kmem_cache_free(task_group_cache, tg); } +#ifdef CONFIG_BPF_SCHED +static inline void tg_init_tag(struct task_group *tg, struct task_group *ptg) +{ + tg->tag = ptg->tag; +} +#endif + /* allocate runqueue etc for a new task group */ struct task_group *sched_create_group(struct task_group *parent) { @@ -7775,6 +7785,10 @@ struct task_group *sched_create_group(struct task_group *parent) if (!alloc_rt_sched_group(tg, parent)) goto err; +#ifdef CONFIG_BPF_SCHED + tg_init_tag(tg, parent); +#endif + alloc_uclamp_sched_group(tg, parent); return tg; @@ -7846,6 +7860,14 @@ static void sched_change_group(struct task_struct *tsk, int type) sched_change_qos_group(tsk, tg); #endif +#ifdef CONFIG_BPF_SCHED + /* + * This function has cleared and restored the task status, + * so we do not need to dequeue and enqueue the task again. + */ + tsk->tag = tg->tag; +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_change_group) tsk->sched_class->task_change_group(tsk, type); @@ -8618,6 +8640,80 @@ static inline s64 cpu_qos_read(struct cgroup_subsys_state *css, } #endif +#ifdef CONFIG_BPF_SCHED +void sched_settag(struct task_struct *tsk, s64 tag) +{ + int queued, running, queue_flags = + DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; + struct rq_flags rf; + struct rq *rq; + + if (tsk->tag == tag) + return; + + rq = task_rq_lock(tsk, &rf); + + running = task_current(rq, tsk); + queued = task_on_rq_queued(tsk); + + update_rq_clock(rq); + if (queued) + dequeue_task(rq, tsk, queue_flags); + if (running) + put_prev_task(rq, tsk); + + tsk->tag = tag; + + if (queued) + enqueue_task(rq, tsk, queue_flags); + if (running) + set_next_task(rq, tsk); + + task_rq_unlock(rq, tsk, &rf); +} + +int tg_change_tag(struct task_group *tg, void *data) +{ + struct css_task_iter it; + struct task_struct *tsk; + s64 tag = *(s64 *)data; + struct cgroup_subsys_state *css = &tg->css; + + tg->tag = tag; + + css_task_iter_start(css, 0, &it); + while ((tsk = css_task_iter_next(&it))) + sched_settag(tsk, tag); + css_task_iter_end(&it); + + return 0; +} + +static int cpu_tag_write(struct cgroup_subsys_state *css, + struct cftype *cftype, s64 tag) +{ + struct task_group *tg = css_tg(css); + + if (tg == &root_task_group) + return -EINVAL; + + if (tg->tag == tag) + return 0; + + rcu_read_lock(); + walk_tg_tree_from(tg, tg_change_tag, tg_nop, (void *)(&tag)); + rcu_read_unlock(); + + return 0; +} + +static inline s64 cpu_tag_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return css_tg(css)->tag; +} +#endif + static struct cftype cpu_legacy_files[] = { #ifdef CONFIG_FAIR_GROUP_SCHED { @@ -8679,6 +8775,13 @@ static struct cftype cpu_legacy_files[] = { .read_s64 = cpu_qos_read, .write_s64 = cpu_qos_write, }, +#endif +#ifdef CONFIG_BPF_SCHED + { + .name = "tag", + .read_s64 = cpu_tag_read, + .write_s64 = cpu_tag_write, + }, #endif { } /* Terminate */ }; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3395d102b43ec475b81747f865c3bb02ceb5470c..e9e054ba27601e1c0169eca79bd9e312c305550c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -513,6 +513,15 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) static inline int entity_before(struct sched_entity *a, struct sched_entity *b) { +#ifdef CONFIG_BPF_SCHED + if (bpf_sched_enabled()) { + int ret = bpf_sched_cfs_tag_pick_next_entity(a, b); + + if (ret == 1) + return 1; + } 
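The kernel/sched/core.c hunk above also exposes the tag per CPU cgroup through the cpu.tag cftype: cpu_tag_write() rejects the root group, then walks the task_group tree with walk_tg_tree_from() and retags every member task via sched_settag(). A hedged userspace sketch of driving that file follows; the cgroup v1 mount path and helper name are assumptions, not part of the patch.

#include <stdio.h>

/* Illustrative only: tag all tasks in a v1 "cpu" cgroup via cpu.tag. */
static int set_cgroup_tag(const char *cgrp, long tag)
{
        char path[256];
        FILE *f;

        snprintf(path, sizeof(path), "/sys/fs/cgroup/cpu/%s/cpu.tag", cgrp);
        f = fopen(path, "w");
        if (!f)
                return -1;
        fprintf(f, "%ld", tag); /* write_s64: propagates to descendant groups */
        return fclose(f);
}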
+#endif + return (s64)(a->vruntime - b->vruntime) < 0; } @@ -4482,8 +4491,11 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) if (ret < 0) return; - else if (ret > 0) + else if (ret > 0) { resched_curr(rq_of(cfs_rq)); + clear_buddies(cfs_rq, curr); + return; + } } #endif @@ -6020,6 +6032,22 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, { int target = nr_cpumask_bits; +#ifdef CONFIG_BPF_SCHED + if (bpf_sched_enabled()) { + struct sched_affine_ctx ctx; + int ret; + + ctx.task = p; + ctx.prev_cpu = prev_cpu; + ctx.curr_cpu = this_cpu; + ctx.is_sync = sync; + + ret = bpf_sched_cfs_wake_affine(&ctx); + if (ret >= 0 && ret < nr_cpumask_bits) + return ret; + } +#endif + if (sched_feat(WA_IDLE)) target = wake_affine_idle(this_cpu, prev_cpu, sync); @@ -6884,6 +6912,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int new_cpu = prev_cpu; int want_affine = 0; int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); +#ifdef CONFIG_BPF_SCHED + struct sched_migrate_ctx ctx; + int ret; +#endif time = schedstat_start_time(); @@ -6901,6 +6933,26 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f } rcu_read_lock(); +#ifdef CONFIG_BPF_SCHED + if (bpf_sched_enabled()) { + ctx.task = p; + ctx.prev_cpu = prev_cpu; + ctx.curr_cpu = cpu; + ctx.is_sync = sync; + ctx.wake_flags = wake_flags; + ctx.want_affine = want_affine; + ctx.sd_flag = sd_flag; + ctx.cpus_allowed = (void *)p->cpus_ptr; + ctx.select_idle_mask = this_cpu_cpumask_var_ptr(select_idle_mask); + + ret = bpf_sched_cfs_select_rq(&ctx); + if (ret >= 0) { + rcu_read_unlock(); + return ret; + } + } +#endif + for_each_domain(cpu, tmp) { /* * If both 'cpu' and 'prev_cpu' are part of this domain, @@ -6932,6 +6984,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (want_affine) current->recent_used_cpu = cpu; } + +#ifdef CONFIG_BPF_SCHED + if (bpf_sched_enabled()) { + ctx.new_cpu = new_cpu; + ret = bpf_sched_cfs_select_rq_exit(&ctx); + if (ret >= 0) + new_cpu = ret; + } +#endif + rcu_read_unlock(); schedstat_end_time(cpu_rq(cpu), time); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 42d5fb7d946437531a3511de026ad856722d14a1..1aeccc5111d77446c0a0970bf4d8575d5688cd4b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -455,7 +455,12 @@ struct task_group { struct uclamp_se uclamp[UCLAMP_CNT]; #endif +#ifdef CONFIG_BPF_SCHED + /* Used to pad the tag of a group */ + long tag; +#else KABI_RESERVE(1) +#endif KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4) @@ -493,6 +498,9 @@ static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) } extern int tg_nop(struct task_group *tg, void *data); +#ifdef CONFIG_BPF_SCHED +extern int tg_change_tag(struct task_group *tg, void *data); +#endif extern void free_fair_sched_group(struct task_group *tg); extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index aeebf5d12f32c09f7782e5f406fa715104ef7ac9..1d92e87565add19f149a227bfe25de5ef011091b 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -54,6 +54,9 @@ tprogs-y += task_fd_query tprogs-y += xdp_sample_pkts tprogs-y += ibumad tprogs-y += hbm +tprogs-y += sched_preempt +tprogs-y += sched_select_core +tprogs-y += sched_pick_task # Libbpf dependencies LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a @@ -111,6 +114,9 @@ task_fd_query-objs := bpf_load.o task_fd_query_user.o 
$(TRACE_HELPERS) xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS) ibumad-objs := bpf_load.o ibumad_user.o $(TRACE_HELPERS) hbm-objs := bpf_load.o hbm.o $(CGROUP_HELPERS) +sched_preempt-objs := sched_preempt_user.o +sched_select_core-objs := sched_select_core_user.o +sched_pick_task-objs := sched_pick_task_user.o # Tell kbuild to always build the programs always-y := $(tprogs-y) @@ -172,6 +178,9 @@ always-y += ibumad_kern.o always-y += hbm_out_kern.o always-y += hbm_edt_kern.o always-y += xdpsock_kern.o +always-y += sched_preempt_kern.o +always-y += sched_select_core_kern.o +always-y += sched_pick_task_kern.o ifeq ($(ARCH), arm) # Strip all except -D__LINUX_ARM_ARCH__ option needed to handle linux diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index c5ad528f046e038a3939180676d64c0a4363cd0e..4dfb45d254b17902b506032badcc860ce3a2ca73 100644 --- a/samples/bpf/bpf_load.c +++ b/samples/bpf/bpf_load.c @@ -87,6 +87,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) bool is_sockops = strncmp(event, "sockops", 7) == 0; bool is_sk_skb = strncmp(event, "sk_skb", 6) == 0; bool is_sk_msg = strncmp(event, "sk_msg", 6) == 0; + bool is_sched = strncmp(event, "sched/", 6) == 0; size_t insns_cnt = size / sizeof(struct bpf_insn); enum bpf_prog_type prog_type; char buf[256]; @@ -120,6 +121,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) prog_type = BPF_PROG_TYPE_SK_SKB; } else if (is_sk_msg) { prog_type = BPF_PROG_TYPE_SK_MSG; + } else if (is_sched) { + prog_type = BPF_PROG_TYPE_SCHED; } else { printf("Unknown event '%s'\n", event); return -1; @@ -137,7 +140,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) prog_fd[prog_cnt++] = fd; - if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk) + if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk || is_sched) return 0; if (is_socket || is_sockops || is_sk_skb || is_sk_msg) { @@ -643,7 +646,8 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map) memcmp(shname, "cgroup/", 7) == 0 || memcmp(shname, "sockops", 7) == 0 || memcmp(shname, "sk_skb", 6) == 0 || - memcmp(shname, "sk_msg", 6) == 0) { + memcmp(shname, "sk_msg", 6) == 0 || + memcmp(shname, "sched/", 6) == 0) { ret = load_and_attach(shname, data->d_buf, data->d_size); if (ret != 0) diff --git a/samples/bpf/sched_pick_task_kern.c b/samples/bpf/sched_pick_task_kern.c new file mode 100644 index 0000000000000000000000000000000000000000..b7a48abaf01adbb6eead63071e867539c3e65ca0 --- /dev/null +++ b/samples/bpf/sched_pick_task_kern.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ +#include +#include +#include +#include +#include +#include + +#define PICK_CURR 1 +#define PICK_NOMAL 0 +#define ERROR -1 + +enum task_type { + TASK_TYPE_OFFLINE = -1, + TASK_TYPE_ONLINE, + TASK_TYPE_MAX +}; + +/* + * Only implements the effect of the task selection strategy + * and needs to be used in conjunction with preempt and + * load balance. If quota is not configured, the priority + * inversion leads to system crash. + */ +SEC("sched/cfs_tag_pick_next_entity") +int BPF_PROG(sched_cfs_tag_pick_next_entity, struct sched_entity *curr, struct sched_entity *next) +{ + int curr_type = 0; + int next_type = 0; + + if (curr == NULL || next == NULL) + return PICK_NOMAL; + + curr_type = libbpf_sched_se_tag_of(curr); + next_type = libbpf_sched_se_tag_of(next); + + if (curr_type > next_type) + return PICK_CURR; + + return PICK_NOMAL; +} + +char _license[] SEC("license") = "GPL"; + diff --git a/samples/bpf/sched_pick_task_user.c b/samples/bpf/sched_pick_task_user.c new file mode 100644 index 0000000000000000000000000000000000000000..0c8a24393bd29b589cb6a1ea79ad7a70a9fe046e --- /dev/null +++ b/samples/bpf/sched_pick_task_user.c @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define TRACE_DIR "/sys/kernel/debug/tracing/" +#define BUF_SIZE (4096) + +/* read trace logs from debug fs */ +void read_trace_pipe(void) +{ + int trace_fd; + + trace_fd = open(TRACE_DIR "trace_pipe", O_RDONLY, 0); + if (trace_fd < 0) + return; + + while (1) { + static char buf[BUF_SIZE]; + ssize_t sz; + + sz = read(trace_fd, buf, sizeof(buf) - 1); + if (sz > 0) { + buf[sz] = 0; + puts(buf); + } + } +} + +int main(int argc, char **argv) +{ + char filename[256]; + struct bpf_object *obj; + struct bpf_program *prog; + struct bpf_link *link; + int err; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + /* Open BPF application */ + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 1; + } + + /* Load and verify BPF program */ + err = bpf_object__load(obj); + if (err) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + prog = bpf_object__find_program_by_name(obj, "sched_cfs_tag_pick_next_entity"); + if (libbpf_get_error(prog)) { + fprintf(stderr, "ERROR: finding a prog in obj file failed\n"); + goto cleanup; + } + + link = bpf_program__attach(prog); + if (libbpf_get_error(link)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + link = NULL; + goto cleanup; + } + + printf("preempt BPF started, hit Ctrl+C to stop!\n"); + + read_trace_pipe(); + +cleanup: + bpf_link__destroy(link); + bpf_object__close(obj); +out: + return 0; +} diff --git a/samples/bpf/sched_preempt_kern.c b/samples/bpf/sched_preempt_kern.c new file mode 100644 index 0000000000000000000000000000000000000000..788883f72deb0afbe2604e3c24931fe7a547aaaa --- /dev/null 
+++ b/samples/bpf/sched_preempt_kern.c @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ +#include +#include +#include +#include +#include + +unsigned long idea_runtime = 1000000UL; + +enum task_type { + TASK_TYPE_OFFLINE = -1, + TASK_TYPE_NORMAL, + TASK_TYPE_ONLINE, +}; + +#define getVal(P) \ + ({ \ + typeof(P) val = 0; \ + bpf_probe_read_kernel(&val, sizeof(val), &(P)); \ + val; \ + }) + +#define bprintk(fmt, ...) \ + ({ \ + char ____fmt[] = fmt; \ + bpf_trace_printk(____fmt, sizeof(____fmt), \ + ##__VA_ARGS__); \ + }) + +SEC("sched/cfs_check_preempt_wakeup") +int BPF_PROG(sched_cfs_check_preempt_wakeup, struct task_struct *curr, struct task_struct *p) +{ + long curr_type, p_type; + int ret = 0; + + curr_type = bpf_sched_task_tag_of(curr); + p_type = bpf_sched_task_tag_of(p); + + if (curr_type == TASK_TYPE_ONLINE && p_type == TASK_TYPE_OFFLINE) + ret = -1; + + if (curr_type == TASK_TYPE_OFFLINE && p_type == TASK_TYPE_ONLINE) + ret = 1; + + bprintk("check_preempt_wakeup: curr id = %d, p id = %d, preempt result is %d\n", + getVal(curr->pid), getVal(p->pid), ret); + + return ret; +} + +SEC("sched/cfs_check_preempt_tick") +int BPF_PROG(sched_cfs_check_preempt_tick, struct sched_entity *curr, unsigned long delta_exec) +{ + long curr_type = TASK_TYPE_NORMAL; + int ret = 0, id = 0; + int entity_is_task = bpf_sched_entity_is_task(curr); + + if (entity_is_task) { + struct task_struct *tsk = bpf_sched_entity_to_task(curr); + + if (tsk) { + curr_type = bpf_sched_task_tag_of(tsk); + id = getVal(tsk->pid); + } + } else { + struct task_group *tg = bpf_sched_entity_to_tg(curr); + + if (tg) { + curr_type = bpf_sched_tg_tag_of(tg); + id = bpf_sched_entity_to_cgrpid(curr); + } + } + + if (curr_type == TASK_TYPE_ONLINE) + ret = delta_exec >= idea_runtime ? 
1 : -1; + + bprintk("check_preempt_tick: delta = %lu, entity id = %d, preempt result = %d\n", + delta_exec, id, ret); + return ret; +} + +SEC("sched/cfs_wakeup_preempt_entity") +int BPF_PROG(sched_cfs_wakeup_preempt_entity, struct sched_entity *curr, struct sched_entity *se) +{ + long curr_type = TASK_TYPE_NORMAL; + long p_type = TASK_TYPE_NORMAL; + int curr_id = 0, p_id = 0; + int curr_is_task = bpf_sched_entity_is_task(curr); + int p_is_task = bpf_sched_entity_is_task(se); + int ret = 0; + + if (curr_is_task) { + struct task_struct *tsk = bpf_sched_entity_to_task(curr); + + if (tsk) { + curr_type = bpf_sched_task_tag_of(tsk); + curr_id = getVal(tsk->pid); + } + } else { + struct task_group *tg = bpf_sched_entity_to_tg(curr); + + if (tg) { + curr_type = bpf_sched_tg_tag_of(tg); + curr_id = bpf_sched_entity_to_cgrpid(curr); + } + } + + if (p_is_task) { + struct task_struct *p = bpf_sched_entity_to_task(se); + + if (p) { + p_type = bpf_sched_task_tag_of(p); + p_id = getVal(p->pid); + } + } else { + struct task_group *tg1 = bpf_sched_entity_to_tg(se); + + if (tg1) { + p_type = bpf_sched_tg_tag_of(tg1); + p_id = bpf_sched_entity_to_cgrpid(se); + } + } + + if (curr_type == TASK_TYPE_ONLINE && p_type == TASK_TYPE_OFFLINE) + ret = -1; + + if (curr_type == TASK_TYPE_OFFLINE && p_type == TASK_TYPE_ONLINE) + ret = 1; + + bprintk("wakeup_preempt_entity: curr entity id = %d, se entity id = %d, result = %d\n", + curr_id, p_id, ret); + return ret; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/sched_preempt_user.c b/samples/bpf/sched_preempt_user.c new file mode 100644 index 0000000000000000000000000000000000000000..92e64d04b6877407964fddfdd8a89c982b8a1c5b --- /dev/null +++ b/samples/bpf/sched_preempt_user.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX_PROGS (3) +#define TRACE_DIR "/sys/kernel/debug/tracing/" +#define BUF_SIZE (4096) + +int progindex[MAX_PROGS]; + +static void usage(void) +{ + printf("USAGE: sched_preempt [...]\n"); + printf(" -W # Test sched preempt wakeup\n"); + printf(" -T # Test sched preempt tick\n"); + printf(" -E # Test wakeup preempt entity\n"); + printf(" -h # Display this help\n"); +} + +/* read trace logs from debug fs */ +static void read_trace_pipe(void) +{ + int trace_fd; + + trace_fd = open(TRACE_DIR "trace_pipe", O_RDONLY, 0); + if (trace_fd < 0) + return; + + while (1) { + static char buf[BUF_SIZE]; + ssize_t sz; + + sz = read(trace_fd, buf, sizeof(buf) - 1); + if (sz > 0) { + buf[sz] = 0; + puts(buf); + } + } +} + +static inline bool check_attach_prog(int index) +{ + return progindex[index] ? 
true : false; +} + +int main(int argc, char **argv) +{ + int opt; + int index; + char filename[256]; + struct bpf_object *obj; + struct bpf_program *prog; + struct bpf_link *link[3] = {NULL}; + + char prognames[MAX_PROGS][256] = { + "sched_cfs_check_preempt_wakeup", + "sched_cfs_check_preempt_tick", + "sched_cfs_wakeup_preempt_entity", + }; + + while ((opt = getopt(argc, argv, "WTEh")) != -1) { + switch (opt) { + case 'W': + progindex[0] = 1; + break; + case 'T': + progindex[1] = 1; + break; + case 'E': + progindex[2] = 1; + break; + case 'h': + default: + usage(); + goto out; + } + } + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + goto out; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + for (index = 0; index < MAX_PROGS; ++index) { + if (check_attach_prog(index)) { + prog = bpf_object__find_program_by_name(obj, prognames[index]); + if (libbpf_get_error(prog)) { + fprintf(stderr, "ERROR: finding a prog:%s in obj file failed\n", + prognames[index]); + goto cleanup; + } + + link[index] = bpf_program__attach(prog); + if (libbpf_get_error(link[index])) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + link[index] = NULL; + goto cleanup; + } + } + } + + printf("preempt BPF started, hit Ctrl+C to stop!\n"); + + read_trace_pipe(); + +cleanup: + for (index = MAX_PROGS - 1; index >= 0; index--) + bpf_link__destroy(link[index]); + bpf_object__close(obj); + +out: + return 0; +} diff --git a/samples/bpf/sched_select_core_kern.c b/samples/bpf/sched_select_core_kern.c new file mode 100644 index 0000000000000000000000000000000000000000..18617e89b3957af3e8a4657ad0b2dca5374aeace --- /dev/null +++ b/samples/bpf/sched_select_core_kern.c @@ -0,0 +1,259 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Sample select core BPF program. + * 'cfs_select_rq' + * Replace the original core selection policy or + * implement dynamic CPU affinity. + * + * 'cfs_select_rq_exit' + * Restoring the CPU affinity of the task before exiting of + * 'select_task_rq_fair'. + * + * To be used with 'cfs_select_rq' hook to implement + * dynamic CPU affinity. + * + * 'cfs_wake_affine' + * Determine on which CPU task can run soonest. Allow user to + * implement deferent policies. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include + +#define STR_MAX (32) +#define SELECT_RQ_RANGE (-1) +#define SELECT_RQ_EXIT_CPU_VALID (-2) + +/* From kernel/sched/sched.h */ +#define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */ +#define WF_FORK 0x02 /* Child wakeup after fork */ +#define WF_MIGRATED 0x04 /* Internal use, task got migrated */ +#define WF_ON_CPU 0x08 /* Wakee is on_cpu */ + +#define TAG_ID(id) TAG_##id + +enum tag_id { + TAG_NONE, + TAG_ID(1), + TAG_ID(2), + TAG_MAX +}; + +struct tag_info { + long tag; + char buf[STR_MAX]; +}; + +struct tag_info tag_tbl[] = { + {TAG_NONE, ""}, + {TAG_ID(1), "0-3"}, + {TAG_ID(2), "4-7"}, + {TAG_MAX, ""}, +}; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, int); + __uint(max_entries, 1); +} map_idlest_cpu SEC(".maps"); + +int sysctl_sched_util_low_pct = 85; + +static inline bool prefer_cpus_valid(struct cpumask *prefer_cpus, + struct cpumask *cpus_allowed) +{ + return !libbpf_cpumask_empty(prefer_cpus) && + !libbpf_cpumask_equal(prefer_cpus, cpus_allowed) && + libbpf_cpumask_subset(prefer_cpus, cpus_allowed); +} + +static struct cpumask *select_better_cpus(struct task_struct *p, + struct cpumask *prefer_cpus, + int *idlest_cpu) +{ + unsigned long util_avg_sum = 0; + unsigned long tg_capacity = 0; + unsigned int weight; + long min_util = INT_MIN; + struct task_group *tg; + long spare; + int cpu; + + if (!prefer_cpus_valid(prefer_cpus, (void *)getVal(p->cpus_ptr))) + return (void *)getVal(p->cpus_ptr); + + tg = p->sched_task_group; + libbpf_for_each_cpu(cpu, prefer_cpus) { + if (idlest_cpu && libbpf_available_idle_cpu(cpu)) { + *idlest_cpu = cpu; + } else if (idlest_cpu) { + spare = (long)(libbpf_capacity_of(cpu) - libbpf_cfs_util_avg_of(cpu)); + if (spare > min_util) { + min_util = spare; + *idlest_cpu = cpu; + } + } + + if (libbpf_available_idle_cpu(cpu)) + return getVal(prefer_cpus); + + util_avg_sum += libbpf_cfs_util_avg_of(cpu); + tg_capacity += libbpf_capacity_of(cpu); + } + + weight = libbpf_cpumask_weight(prefer_cpus); + if (tg_capacity > weight && + util_avg_sum * 100 <= tg_capacity * sysctl_sched_util_low_pct) { + return getVal(prefer_cpus); + } + + return (void *)getVal(p->cpus_ptr); +} + +SEC("sched/cfs_select_rq") +int BPF_PROG(cfs_select_cpu_range, struct sched_migrate_ctx *h_ctx) +{ + struct cpumask *prefer_cpus = getVal(h_ctx->select_idle_mask); + struct task_struct *p = getVal(h_ctx->task); + struct cpumask *cpus_ptr; + int type = SELECT_RQ_RANGE; + long tag = getVal(p->tag); + int *idlest_cpu = 0; + int key = 0; + int ret; + + if (tag <= TAG_NONE || tag >= TAG_MAX) + return type; + + ret = libbpf_cpumask_cpulist_parse(tag_tbl[tag].buf, prefer_cpus); + if (ret) + return type; + + idlest_cpu = bpf_map_lookup_elem(&map_idlest_cpu, &key); + if (!idlest_cpu) + return type; + + cpus_ptr = select_better_cpus(p, prefer_cpus, idlest_cpu); + libbpf_sched_set_task_cpus_ptr((void *)h_ctx, getVal(cpus_ptr)); + + return type; +} + +SEC("sched/cfs_select_rq_exit") +int BPF_PROG(cfs_select_cpu_range_exit, struct sched_migrate_ctx *h_ctx) +{ + int *idlest_cpu; + int key = 0; + + idlest_cpu = bpf_map_lookup_elem(&map_idlest_cpu, &key); + if (!idlest_cpu) { + libbpf_sched_set_task_cpus_ptr(h_ctx, (void *)getVal(h_ctx->cpus_allowed)); + return SELECT_RQ_EXIT_CPU_VALID; + } + + if (!libbpf_cpumask_test_cpu(getVal(h_ctx->new_cpu), + (void *)getVal(h_ctx->task->cpus_ptr))) { + libbpf_sched_set_task_cpus_ptr(h_ctx, (void *)getVal(h_ctx->cpus_allowed)); + return 
*idlest_cpu; + } + + libbpf_sched_set_task_cpus_ptr(h_ctx, (void *)getVal(h_ctx->cpus_allowed)); + return SELECT_RQ_EXIT_CPU_VALID; +} + +static int find_idlest_cpu(struct task_struct *p, int parent) +{ + unsigned long min = INT_MAX; + int min_load_cpu = 0; + unsigned long load; + int cpu; + int i; + + for (i = 0, cpu = -1; i < NR_CPUS; i++) { + cpu = libbpf_cpumask_next(cpu, (void *)getVal(p->cpus_ptr)); + if (cpu >= libbpf_nr_cpus_ids()) + break; + + load = libbpf_cfs_load_avg_of(cpu); + if (load < min) { + min = load; + min_load_cpu = cpu; + } + } + + return min_load_cpu; +} + +static int select_idle_cpu(struct task_struct *p, int parent, int prev_cpu) +{ + int cpu; + + if (libbpf_available_idle_cpu(prev_cpu)) + return prev_cpu; + + if (libbpf_available_idle_cpu(parent)) + return prev_cpu; + + libbpf_for_each_cpu_wrap(cpu, (void *)getVal(p->cpus_ptr), prev_cpu) { + if (libbpf_available_idle_cpu(cpu)) + return cpu; + } + + return prev_cpu; +} + +SEC("sched/cfs_select_rq") +int BPF_PROG(cfs_select_cpu, struct sched_migrate_ctx *h_ctx) +{ + struct task_struct *p = getVal(h_ctx->task); + int wake_flags = getVal(h_ctx->wake_flags); + int prev_cpu = getVal(h_ctx->prev_cpu); + int cpu = getVal(h_ctx->curr_cpu); + int new_cpu; + + if (wake_flags == WF_FORK) { + /* Slow path */ + new_cpu = find_idlest_cpu(p, cpu); + } else { + /* Fast path */ + new_cpu = select_idle_cpu(p, cpu, prev_cpu); + } + + return new_cpu; +} + +SEC("sched/cfs_wake_affine") +int BPF_PROG(cfs_wake_affine, struct sched_affine_ctx *h_ctx) +{ + int prev_cpu = getVal(h_ctx->prev_cpu); + int curr_cpu = getVal(h_ctx->curr_cpu); + int sync = getVal(h_ctx->is_sync); + + if (libbpf_available_idle_cpu(curr_cpu) && + libbpf_cpus_share_cache(curr_cpu, prev_cpu)) + return libbpf_available_idle_cpu(prev_cpu) ? prev_cpu : curr_cpu; + + if (sync && libbpf_nr_running_of(curr_cpu) == 1) + return curr_cpu; + + return prev_cpu; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/sched_select_core_user.c b/samples/bpf/sched_select_core_user.c new file mode 100644 index 0000000000000000000000000000000000000000..99c98f3944780b4374f4701a5483e07412601be1 --- /dev/null +++ b/samples/bpf/sched_select_core_user.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void usage(void) +{ + printf("USAGE: test sched select core [...]\n"); + printf(" -W wakeup affine # Test sched wake wakeup\n"); + printf(" -C select core # Test sched select core\n"); + printf(" -R select core range # Test sched select core range\n"); + printf(" -h # Display this help\n"); +} + +#define TRACE_DIR "/sys/kernel/debug/tracing/" +#define BUF_SIZE (4096) + +/* read trace logs from debug fs */ +static void read_trace_pipe(void) +{ + int trace_fd; + + trace_fd = open(TRACE_DIR "trace_pipe", O_RDONLY, 0); + if (trace_fd < 0) + return; + + while (1) { + static char buf[BUF_SIZE]; + ssize_t sz; + + sz = read(trace_fd, buf, sizeof(buf) - 1); + if (sz > 0) { + buf[sz] = 0; + puts(buf); + } + } +} + +int main(int argc, char **argv) +{ + int opt; + char filename[256]; + char progname[4][256]; + struct bpf_object *obj; + struct bpf_program *prog[4] = {NULL}; + struct bpf_link *link[4] = {NULL}; + int prog_num = 1; + int i = 0; + + while ((opt = getopt(argc, argv, "C::R::W::E::")) != -1) { + switch (opt) { + case 'C': + snprintf(progname[0], sizeof(progname[0]), "cfs_select_cpu"); + break; + case 'R': + snprintf(progname[0], sizeof(progname[0]), "cfs_select_cpu_range"); + snprintf(progname[1], sizeof(progname[1]), "cfs_select_cpu_range_exit"); + prog_num = 2; + break; + case 'W': + snprintf(progname[0], sizeof(progname[0]), "cfs_wake_affine"); + break; + default: + usage(); + goto out; + } + } + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + goto out; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + for (i = 0; i < prog_num; i++) { + prog[i] = bpf_object__find_program_by_name(obj, progname[i]); + if (libbpf_get_error(prog[i])) { + fprintf(stderr, "ERROR: finding a prog %d in obj file failed\n", i); + goto cleanup; + } + + link[i] = bpf_program__attach(prog[i]); + if (libbpf_get_error(link[i])) { + fprintf(stderr, "ERROR: bpf_program__attach %d failed\n", i); + link[i] = NULL; + goto cleanup; + } + } + + printf("select rq BPF started, hit Ctrl+C to stop!\n"); + + read_trace_pipe(); + +cleanup: + for (; i >= 0; i--) + bpf_link__destroy(link[i]); + bpf_object__close(obj); +out: + return 0; +} diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index be21512ee7be4098360b51548665730e8aee1e98..fd89d2f2a86d4bf9a83144e37fafc92dd2b1ef8a 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -436,6 +436,16 @@ class PrinterHelpers(Printer): 'struct path', 'struct btf_ptr', 'struct sched_entity', + 'struct task_group', + 'struct bpf_sched_cpu_load', + 'struct bpf_sched_cpu_nr_running', + 'struct bpf_sched_cpu_idle_stat', + 'struct bpf_sched_cpu_capacity', + 'struct bpf_cpu_topology', + 'struct bpf_cpumask_info', + 'struct cpumask', + 'struct cpumask_op_args', + 'struct sched_migrate_ctx', ] known_types = { '...', @@ -480,6 +490,16 @@ class PrinterHelpers(Printer): 'struct path', 'struct btf_ptr', 'struct sched_entity', + 'struct task_group', + 'struct bpf_sched_cpu_load', + 'struct bpf_sched_cpu_nr_running', + 'struct bpf_sched_cpu_idle_stat', + 'struct bpf_sched_cpu_capacity', + 'struct bpf_cpu_topology', + 'struct bpf_cpumask_info', + 'struct cpumask', + 'struct cpumask_op_args', + 'struct 
sched_migrate_ctx', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b2a0b189b797de7358eaf3be76326a414930c5f4..1f70a8adc0d12f4c057203d8301917b35c514f84 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3777,6 +3777,141 @@ union bpf_attr { * to be enabled. * Return * 1 if the sched entity belongs to a cgroup, 0 otherwise. + * + * long bpf_sched_tg_tag_of(struct task_group *tg) + * Description + * Return task group tag of *tg* if CONFIG_CGROUP_SCHED enabled. + * The bpf prog obtains the tags to detect different workloads. + * Return + * Task group tag, if CONFIG_CGROUP_SCHED enabled, 0 as default tag, or + * a negative error in case of failure. + * + * long bpf_sched_task_tag_of(struct task_struct *tsk) + * Description + * Return task tag of *tsk*.The bpf prog obtains the tags to detect + * different workloads. + * Return + * Task tag, if used, 0 as default tag, or a negative error in case of failure. + * + * long bpf_sched_entity_is_task(struct sched_entity *se) + * Description + * Checks whether the sched entity is a task. + * Return + * 1 if true, 0 otherwise. + * + * struct task_struct *bpf_sched_entity_to_task(struct sched_entity *se) + * Description + * Return task struct of *se* if se is a task. + * Return + * Task struct if se is a task, NULL otherwise. + * + * struct task_group *bpf_sched_entity_to_tg(struct sched_entity *se) + * Description + * Return task group of *se* if se is a task group. + * Return + * Task struct if se is a task group, NULL otherwise. + * + * int bpf_sched_set_tg_tag(struct task_group *tg, s64 tag) + * Description + * Set tag to *tg* and its descendants. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_sched_set_task_tag(struct task_struct *tsk, s64 tag) + * Description + * Set tag to *tsk*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_sched_cpu_load_of(int cpu, struct bpf_sched_cpu_load *ctx, int len) + * Description + * Get multiple types of *cpu* load and store in *ctx*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_sched_cpu_nr_running_of(int cpu, struct bpf_sched_cpu_nr_running *ctx, int len) + * Description + * Get multiple types of *cpu* nr running and store in *ctx*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_sched_cpu_idle_stat_of(int cpu, struct bpf_sched_cpu_idle_stat *ctx, int len) + * Description + * Get *cpu* idle state and store in *ctx*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_sched_cpu_capacity_of(int cpu, struct bpf_sched_cpu_capacity *ctx, int len) + * Description + * Get *cpu* capacity and store in *ctx*. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_init_cpu_topology(struct bpf_map *map, u64 flags) + * Description + * Initializing the cpu topology which used for bpf prog. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_get_cpumask_info(struct bpf_cpumask_info *cpus, int len) + * Description + * Get system cpus returned in *cpus*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_cpumask_op(struct cpumask_op_args *op, int len) + * Description + * A series of cpumask-related operations. Perform different + * operations base on *op*->type. User also need fill other + * *op* field base on *op*->type. 
*op*->type is one of them + * + * **CPUMASK_EMPTY** + * *(op->arg1) == 0 returned. + * **CPUMASK_AND** + * *(op->arg1) = *(op->arg2) & *(op->arg3) + * **CPUMASK_ANDNOT** + * *(op->arg1) = *(op->arg2) & ~*(op->arg3) + * **CPUMASK_SUBSET** + * *(op->arg1) & ~*(op->arg2) == 0 returned + * **CPUMASK_EQUAL** + * *(op->arg1) == *(op->arg2) returned + * **CPUMASK_TEST_CPU** + * test for a cpu *(int)(op->arg1) in *(op->arg2) + * returns 1 if *op*->arg1 is set in *op*->arg2, else returns 0 + * **CPUMASK_COPY** + * *(op->arg1) = *(op->arg2), return 0 always + * **CPUMASK_WEIGHT** + * count of bits in *(op->arg1) + * **CPUMASK_NEXT** + * get the next cpu in *(struct cpumask *)(op->arg2) + * *(int *)(op->arg1): the cpu prior to the place to search + * **CPUMASK_NEXT_WRAP** + * helper to implement for_each_cpu_wrap + * @op->arg1: the cpu prior to the place to search + * @op->arg2: the cpumask pointer + * @op->arg3: the start point of the iteration + * @op->arg4: assume @op->arg1 crossing @op->arg3 terminates the iteration + * returns >= nr_cpu_ids on completion + * **CPUMASK_NEXT_AND** + * get the next cpu in *(op->arg1) & *(op->arg2) + * **CPUMASK_CPULIST_PARSE** + * extract a cpumask from a user string of ranges. + * (char *)op->arg1 -> (struct cpumask *)(op->arg2) + * 0 on success, or a negative error in case of failure. + * Return + * View above. + * + * int bpf_cpus_share_cache(int src_cpu, int dst_cpu) + * Description + * check src_cpu whether share cache with dst_cpu. + * Return + * true yes, false no. + * + * int bpf_sched_set_task_cpus_ptr(struct sched_migrate_ctx *h_ctx, struct cpumask *cpus, int len) + * Description + * set cpus_ptr in task. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3940,6 +4075,22 @@ union bpf_attr { FN(sched_entity_to_tgidpid), \ FN(sched_entity_to_cgrpid), \ FN(sched_entity_belongs_to_cgrp), \ + FN(sched_tg_tag_of), \ + FN(sched_task_tag_of), \ + FN(sched_entity_is_task), \ + FN(sched_entity_to_task), \ + FN(sched_entity_to_tg), \ + FN(sched_set_tg_tag), \ + FN(sched_set_task_tag), \ + FN(sched_cpu_load_of), \ + FN(sched_cpu_nr_running_of), \ + FN(sched_cpu_idle_stat_of), \ + FN(sched_cpu_capacity_of), \ + FN(init_cpu_topology), \ + FN(get_cpumask_info), \ + FN(cpumask_op), \ + FN(cpus_share_cache), \ + FN(sched_set_task_cpus_ptr), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/tools/lib/bpf/libbpf_sched.h b/tools/lib/bpf/libbpf_sched.h new file mode 100644 index 0000000000000000000000000000000000000000..6cb30e8e81f802ebfdbc791e3b8bb4ae78653886 --- /dev/null +++ b/tools/lib/bpf/libbpf_sched.h @@ -0,0 +1,473 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + */ +#ifndef __LIBBPF_LIBSCHED_H +#define __LIBBPF_LIBSCHED_H + +#include +#include +#include +#include +#include + +#define INVALID_PTR ((void *)(0UL)) +#define getVal(P) \ + ({ \ + typeof(P) val = 0; \ + bpf_probe_read_kernel(&val, sizeof(val), &(P)); \ + val; \ + }) + +static __always_inline long libbpf_cpumask_next(int n, struct cpumask *mask); +static __always_inline long libbpf_cpumask_next_wrap(int n, + struct cpumask *mask, + int start, int wrap); +static __always_inline long libbpf_cpumask_next_and(int n, + struct cpumask *mask1, + struct cpumask *mask2); +static __always_inline int libbpf_nr_cpus_ids(void); +static __always_inline int libbpf_nr_cpumask_bits(void); + +#if NR_CPUS == 1 + +#define libbpf_for_each_cpu(cpu, mask) \ + for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask) +#define libbpf_for_each_cpu_wrap(cpu, mask, start) \ + for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask, (void)(start)) +#define libbpf_for_each_cpu_and(cpu, mask1, mask2) \ + for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask1, (void)mask2) + +#else + +#define libbpf_for_each_cpu(cpu, mask) \ + for (int __i = 0, (cpu) = -1; \ + (cpu) = libbpf_cpumask_next((cpu), (mask)), \ + (cpu) < libbpf_nr_cpus_ids() && __i < NR_CPUS; __i++) + +#define libbpf_for_each_cpu_wrap(cpu, mask, start) \ + for (int __i = 0, (cpu) = libbpf_cpumask_next_wrap((start) - 1,\ + (mask), (start), false); \ + (cpu) < libbpf_nr_cpumask_bits() && __i < NR_CPUS; \ + (cpu) = libbpf_cpumask_next_wrap((cpu), (mask), (start),\ + true), __i++) + +#define libbpf_for_each_cpu_and(cpu, mask1, mask2) \ + for (int __i = 0, (cpu) = -1; \ + (cpu) = libbpf_cpumask_next_and((cpu), (mask1), (mask2)),\ + (cpu) < libbpf_nr_cpus_ids() && __i < NR_CPUS; __i++) + +#endif + +static __always_inline long libbpf_cpumask_copy(struct cpumask *dst, + struct cpumask *src) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_COPY; + op.arg1 = dst; + op.arg2 = src; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_empty(struct cpumask *mask) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_EMPTY; + op.arg1 = mask; + op.arg2 = INVALID_PTR; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_and(struct cpumask *dst, + struct cpumask *src1, + struct cpumask *src2) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_AND; + op.arg1 = dst; + op.arg2 = src1; + op.arg3 = src2; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_andnot(struct cpumask *dst, + struct cpumask *src1, + struct cpumask *src2) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_ANDNOT; + op.arg1 = dst; + op.arg2 = src1; + op.arg3 = src2; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_subset(struct cpumask *src1, + struct cpumask *src2) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_SUBSET; + op.arg1 = src1; + op.arg2 = src2; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_equal(struct cpumask *src1, + struct cpumask *src2) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_EQUAL; + op.arg1 = src1; + op.arg2 = src2; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_weight(struct 
cpumask *src1) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_WEIGHT; + op.arg1 = src1; + op.arg2 = INVALID_PTR; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_test_cpu(int cpu, + struct cpumask *mask) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_TEST_CPU; + op.arg1 = &cpu; + op.arg2 = mask; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_next(int n, struct cpumask *mask) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_NEXT; + op.arg1 = &n; + op.arg2 = mask; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_next_wrap(int n, + struct cpumask *mask, + int start, int wrap) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_NEXT_WRAP; + op.arg1 = &n; + op.arg2 = mask; + op.arg3 = &start; + op.arg4 = &wrap; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_next_and(int n, + struct cpumask *mask1, + struct cpumask *mask2) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_NEXT_AND; + op.arg1 = &n; + op.arg2 = mask1; + op.arg3 = mask2; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline long libbpf_cpumask_cpulist_parse(char *src1, + struct cpumask *src2) +{ + struct cpumask_op_args op; + + op.op_type = CPUMASK_CPULIST_PARSE; + op.arg1 = src1; + op.arg2 = src2; + op.arg3 = INVALID_PTR; + op.arg4 = INVALID_PTR; + return bpf_cpumask_op(&op, sizeof(op)); +} + +static __always_inline int libbpf_num_active_cpus(void) +{ + struct bpf_cpumask_info cpus; + + bpf_get_cpumask_info(&cpus, sizeof(cpus)); + return getVal(cpus.nums_active_cpus); +} + +static __always_inline int libbpf_num_possible_cpus(void) +{ + struct bpf_cpumask_info cpus; + + bpf_get_cpumask_info(&cpus, sizeof(cpus)); + return getVal(cpus.nums_possible_cpus); +} + +static __always_inline void libbpf_possible_cpus_mask(struct cpumask *mask) +{ + struct bpf_cpumask_info cpus; + + bpf_get_cpumask_info(&cpus, sizeof(cpus)); + libbpf_cpumask_copy(mask, &cpus.cpu_possible_cpumask); +} + +static __always_inline void libbpf_active_cpus_mask(struct cpumask *mask) +{ + struct bpf_cpumask_info cpus; + + bpf_get_cpumask_info(&cpus, sizeof(cpus)); + libbpf_cpumask_copy(mask, &cpus.cpu_active_cpumask); +} + +static __always_inline void libbpf_isolate_cpus_mask(struct cpumask *mask) +{ + struct bpf_cpumask_info cpus; + + bpf_get_cpumask_info(&cpus, sizeof(cpus)); + libbpf_cpumask_copy(mask, &cpus.cpu_isolate_cpumask); +} + +static __always_inline int libbpf_nr_cpus_ids(void) +{ + struct bpf_cpumask_info cpus; + + bpf_get_cpumask_info(&cpus, sizeof(cpus)); + return getVal(cpus.nr_cpu_ids); +} + +static __always_inline int libbpf_nr_cpumask_bits(void) +{ + struct bpf_cpumask_info cpus; + + bpf_get_cpumask_info(&cpus, sizeof(cpus)); + return getVal(cpus.bpf_nr_cpumask_bits); +} + +static __always_inline unsigned long libbpf_cfs_load_avg_of(int cpu) +{ + struct bpf_sched_cpu_load load; + + bpf_sched_cpu_load_of(cpu, &load, sizeof(load)); + return getVal(load.cfs_load_avg); +} + +static __always_inline unsigned long libbpf_cfs_runnable_avg_of(int cpu) +{ + struct bpf_sched_cpu_load load; + + bpf_sched_cpu_load_of(cpu, &load, sizeof(load)); + return getVal(load.cfs_runnable_avg); +} + +static __always_inline unsigned long libbpf_cfs_util_avg_of(int cpu) +{ + 
struct bpf_sched_cpu_load load; + + bpf_sched_cpu_load_of(cpu, &load, sizeof(load)); + return getVal(load.cfs_util_avg); +} + +static __always_inline unsigned long libbpf_rt_load_avg_of(int cpu) +{ + struct bpf_sched_cpu_load load; + + bpf_sched_cpu_load_of(cpu, &load, sizeof(load)); + return load.rt_load_avg; +} + +static __always_inline unsigned long libbpf_rt_runnable_avg_of(int cpu) +{ + struct bpf_sched_cpu_load load; + + bpf_sched_cpu_load_of(cpu, &load, sizeof(load)); + return load.rt_runnable_avg; +} + +static __always_inline unsigned long libbpf_rt_util_avg_of(int cpu) +{ + struct bpf_sched_cpu_load load; + + bpf_sched_cpu_load_of(cpu, &load, sizeof(load)); + return load.rt_util_avg; +} + +static __always_inline unsigned long libbpf_irq_load_avg_of(int cpu) +{ + struct bpf_sched_cpu_load load; + + bpf_sched_cpu_load_of(cpu, &load, sizeof(load)); + return load.irq_load_avg; +} + +static __always_inline unsigned long libbpf_irq_util_avg_of(int cpu) +{ + struct bpf_sched_cpu_load load; + + bpf_sched_cpu_load_of(cpu, &load, sizeof(load)); + return load.irq_util_avg; +} + +static __always_inline unsigned int libbpf_nr_running_of(int cpu) +{ + struct bpf_sched_cpu_nr_running running; + + bpf_sched_cpu_nr_running_of(cpu, &running, sizeof(running)); + return getVal(running.nr_running); +} + +static __always_inline unsigned int libbpf_cfs_nr_running_of(int cpu) +{ + struct bpf_sched_cpu_nr_running running; + + bpf_sched_cpu_nr_running_of(cpu, &running, sizeof(running)); + return getVal(running.cfs_nr_running); +} + +static __always_inline unsigned int libbpf_cfs_h_nr_running_of(int cpu) +{ + struct bpf_sched_cpu_nr_running running; + + bpf_sched_cpu_nr_running_of(cpu, &running, sizeof(running)); + return getVal(running.cfs_h_nr_running); +} + +static __always_inline unsigned int libbpf_cfs_idle_h_nr_running_of(int cpu) +{ + struct bpf_sched_cpu_nr_running running; + + bpf_sched_cpu_nr_running_of(cpu, &running, sizeof(running)); + return running.cfs_idle_h_nr_running; +} + +static __always_inline unsigned int libbpf_rt_nr_running_of(int cpu) +{ + struct bpf_sched_cpu_nr_running running; + + bpf_sched_cpu_nr_running_of(cpu, &running, sizeof(running)); + return getVal(running.rt_nr_running); +} + +static __always_inline unsigned int libbpf_rr_nr_running_of(int cpu) +{ + struct bpf_sched_cpu_nr_running running; + + bpf_sched_cpu_nr_running_of(cpu, &running, sizeof(running)); + return running.rr_nr_running; +} + +static __always_inline unsigned int libbpf_exit_latency_of(int cpu) +{ + struct bpf_sched_cpu_idle_stat stat; + + bpf_sched_cpu_idle_stat_of(cpu, &stat, sizeof(stat)); + return stat.exit_latency; +} + +static __always_inline unsigned long libbpf_idle_stamp_of(int cpu) +{ + struct bpf_sched_cpu_idle_stat stat; + + bpf_sched_cpu_idle_stat_of(cpu, &stat, sizeof(stat)); + return stat.idle_stamp; +} + +static __always_inline unsigned long libbpf_avg_idle_of(int cpu) +{ + struct bpf_sched_cpu_idle_stat stat; + + bpf_sched_cpu_idle_stat_of(cpu, &stat, sizeof(stat)); + return stat.avg_idle; +} + +static __always_inline unsigned long libbpf_available_idle_cpu(int cpu) +{ + struct bpf_sched_cpu_idle_stat stat; + + bpf_sched_cpu_idle_stat_of(cpu, &stat, sizeof(stat)); + return getVal(stat.available_idle); +} + +static __always_inline unsigned long libbpf_capacity_of(int cpu) +{ + struct bpf_sched_cpu_capacity cap; + + bpf_sched_cpu_capacity_of(cpu, &cap, sizeof(cap)); + return getVal(cap.capacity); +} + +static __always_inline unsigned long libbpf_capacity_orig_of(int cpu) +{ + struct 
bpf_sched_cpu_capacity cap; + + bpf_sched_cpu_capacity_of(cpu, &cap, sizeof(cap)); + return cap.capacity_orig; +} + +static __always_inline int libbpf_cpus_share_cache(int src_cpu, int dst_cpu) +{ + return bpf_cpus_share_cache(src_cpu, dst_cpu); +} + +static __always_inline int libbpf_sched_se_tag_of(struct sched_entity *se) +{ + int se_tag = 0; + + if (bpf_sched_entity_is_task(se)) { + struct task_struct *task = bpf_sched_entity_to_task(se); + + se_tag = bpf_sched_task_tag_of(task); + } else { + struct task_group *tg = bpf_sched_entity_to_tg(se); + + se_tag = bpf_sched_tg_tag_of(tg); + } + + return se_tag; +} + +static __always_inline void libbpf_sched_set_task_cpus_ptr( + struct sched_migrate_ctx *h_ctx, + struct cpumask *cpus) +{ + bpf_sched_set_task_cpus_ptr(h_ctx, cpus, sizeof(*cpus)); +} +#endif
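
For reference, the wrappers in libbpf_sched.h above are meant to be consumed from scheduler BPF programs in the same way as the samples earlier in this patch. Below is a minimal sketch, not part of the patch itself, of an alternative 'cfs_wake_affine' policy that prefers the less loaded of the two candidate CPUs via libbpf_cfs_load_avg_of(); the program name and the include list are illustrative assumptions and would need to match the samples/bpf build setup.

// SPDX-License-Identifier: GPL-2.0
/*
 * Sketch only: pick the less loaded wake_affine candidate.
 * The include paths below are assumptions; mirror whatever
 * sched_select_core_kern.c uses in the real tree.
 */
#include <linux/sched.h>
#include <uapi/linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include "libbpf_sched.h"

SEC("sched/cfs_wake_affine")
int BPF_PROG(cfs_wake_affine_least_loaded, struct sched_affine_ctx *h_ctx)
{
	int prev_cpu = getVal(h_ctx->prev_cpu);
	int curr_cpu = getVal(h_ctx->curr_cpu);

	/* An idle candidate always wins. */
	if (libbpf_available_idle_cpu(curr_cpu))
		return curr_cpu;
	if (libbpf_available_idle_cpu(prev_cpu))
		return prev_cpu;

	/* Otherwise compare CFS load, but only across a shared cache. */
	if (libbpf_cpus_share_cache(curr_cpu, prev_cpu) &&
	    libbpf_cfs_load_avg_of(curr_cpu) < libbpf_cfs_load_avg_of(prev_cpu))
		return curr_cpu;

	return prev_cpu;
}

char _license[] SEC("license") = "GPL";

Attaching such a program works exactly like the '-W' case in sched_select_core_user.c: open the object file, bpf_object__load() it, look the program up by name with bpf_object__find_program_by_name(), and attach it with bpf_program__attach().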