diff --git a/include/linux/sched.h b/include/linux/sched.h index d8c974338ce2528017446a8b8cc60056cd490f74..8e464b2b06a92f5a12845e592c06415856c75047 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -68,6 +68,11 @@ struct task_delay_info; struct task_group; struct io_uring_task; +#define NONE_BY_PASS 0x0000 +#define INIT_BY_PASS 0x0001 +#define IN_BY_PASS 0x0002 +#define END_BY_PASS 0x0004 + /* * Task state bitmask. NOTE! These bits are also * encoded in fs/proc/array.c: get_task_state(). @@ -500,6 +505,10 @@ struct sched_entity { unsigned long runnable_weight; #endif +#ifdef CONFIG_DTS + int by_pass; +#endif + #ifdef CONFIG_SMP /* * Per entity load average tracking. @@ -726,6 +735,15 @@ struct task_struct { int normal_prio; unsigned int rt_priority; +#ifdef CONFIG_DTS + /* + * by_pass indicate that the task is launched by direct-thread-switch. + * dts_shared_se is the schedule entity shared with DTS task. + */ + int by_pass; + struct sched_entity dts_shared_se; +#endif + const struct sched_class *sched_class; struct sched_entity se; struct sched_rt_entity rt; @@ -1821,6 +1839,7 @@ extern struct task_struct *find_get_task_by_vpid(pid_t nr); extern int wake_up_state(struct task_struct *tsk, unsigned int state); extern int wake_up_process(struct task_struct *tsk); +extern int wake_up_process_prefer_current_cpu(struct task_struct *tsk); extern void wake_up_new_task(struct task_struct *tsk); #ifdef CONFIG_SMP @@ -2195,3 +2214,7 @@ static inline int sched_qos_cpu_overload(void) } #endif #endif + +#ifdef CONFIG_DTS +extern int check_task_left_time(struct task_struct *task); +#endif \ No newline at end of file diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h index a89eb0accd5e2ee527be1e3e11b1117ff5bf94b4..e2ed0553046d8af5a970837440f01054a05d022c 100644 --- a/include/uapi/linux/futex.h +++ b/include/uapi/linux/futex.h @@ -21,10 +21,18 @@ #define FUTEX_WAKE_BITSET 10 #define FUTEX_WAIT_REQUEUE_PI 11 #define FUTEX_CMP_REQUEUE_PI 12 +#define FUTEX_SWAP 13 #define FUTEX_PRIVATE_FLAG 128 #define FUTEX_CLOCK_REALTIME 256 + +#ifdef CONFIG_DTS +#define FUTEX_FLAGS_DTS_MODE 512 +#define FUTEX_CMD_MASK ~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME | \ + FUTEX_FLAGS_DTS_MODE) +#else #define FUTEX_CMD_MASK ~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME) +#endif #define FUTEX_WAIT_PRIVATE (FUTEX_WAIT | FUTEX_PRIVATE_FLAG) #define FUTEX_WAKE_PRIVATE (FUTEX_WAKE | FUTEX_PRIVATE_FLAG) @@ -40,6 +48,8 @@ FUTEX_PRIVATE_FLAG) #define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \ FUTEX_PRIVATE_FLAG) +#define FUTEX_SWAP_PRIVATE (FUTEX_SWAP | FUTEX_PRIVATE_FLAG) + /* * Support for robust futexes: the kernel cleans up held futexes at diff --git a/init/Kconfig b/init/Kconfig index d860f28d3ac8c932a94d5386883c1e96d24906f9..75d549fd63bc7f0cc572b7d05474798b61073501 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1246,6 +1246,13 @@ config SCHED_STEAL If unsure, say N here. 
+config DTS + bool "Direct Thread Switch" + default y + depends on SCHED_STEAL + help + enable the direct thread switch mechanism in the futex_swap operation + config CHECKPOINT_RESTORE bool "Checkpoint/restore support" select PROC_CHILDREN diff --git a/kernel/futex.c b/kernel/futex.c index 98a6e1b80bfe462e9174dca612c3c6a010440a0b..bdff0c7586780f7a8e197ae7c635cf8955d720df 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -39,11 +39,16 @@ #include #include #include +#include +#include #include #include "locking/rtmutex_common.h" +#ifdef CONFIG_DTS +#include "sched/sched.h" +#endif /* * READ this before attempting to hack on futexes! * @@ -161,7 +166,7 @@ static int __read_mostly futex_cmpxchg_enabled; * NOMMU does not have per process address space. Let the compiler optimize * code away. */ -# define FLAGS_SHARED 0x00 +#define FLAGS_SHARED 0x00 #endif #define FLAGS_CLOCKRT 0x02 #define FLAGS_HAS_TIMEOUT 0x04 @@ -1181,7 +1186,7 @@ static int handle_exit_race(u32 __user *uaddr, u32 uval, * tsk->futex_state = } else { * FUTEX_STATE_DEAD; if (tsk->futex_state != * FUTEX_STATE_DEAD) - * return -EAGAIN; + * return -EAGAIN; * return -ESRCH; <--- FAIL * } * @@ -1584,16 +1589,16 @@ double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) } /* - * Wake up waiters matching bitset queued on this futex (uaddr). + * Prepare wake queue matching bitset queued on this futex (uaddr). */ static int -futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) +prepare_wake_q(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset, + struct wake_q_head *wake_q) { struct futex_hash_bucket *hb; struct futex_q *this, *next; union futex_key key = FUTEX_KEY_INIT; int ret; - DEFINE_WAKE_Q(wake_q); if (!bitset) return -EINVAL; @@ -1621,14 +1626,28 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) if (!(this->bitset & bitset)) continue; - mark_wake_futex(&wake_q, this); + mark_wake_futex(wake_q, this); if (++ret >= nr_wake) break; } } spin_unlock(&hb->lock); + return ret; +} + +/* + * Wake up waiters matching bitset queued on this futex (uaddr). + */ +static int +futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) +{ + int ret; + DEFINE_WAKE_Q(wake_q); + + ret = prepare_wake_q(uaddr, flags, nr_wake, bitset, &wake_q); wake_up_q(&wake_q); + return ret; } @@ -2571,14 +2590,232 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) return 0; } +#ifdef CONFIG_DTS +static int __direct_thread_switch(struct task_struct *next) +{ + int cpu = smp_processor_id(); + int success = 1; + struct rq_flags rf; + struct rq *rq = cpu_rq(cpu); + struct cfs_rq *cfs_rq = &rq->cfs; + struct task_struct *prev = rq->curr; + struct sched_entity *prev_se, *next_se; + unsigned long *switch_count = &prev->nvcsw; + unsigned long prev_state; + int next_state; + struct rq *src_rq_next; + bool locked; + + if (!prev->by_pass) { + prev_se = &prev->se; + } else { + prev_se = &prev->dts_shared_se; + } + + next_se = &next->se; + + prev->by_pass = NONE_BY_PASS; + next->by_pass = INIT_BY_PASS; + next->dts_shared_se = *prev_se; + prev_se->by_pass = NONE_BY_PASS; + next->dts_shared_se.by_pass = INIT_BY_PASS; + + /* task_struct::state is volatile so far */ + next_state = next->state; + src_rq_next = task_rq(next); + locked = true; + /* Deliver the execution to the callee. */ + if (next_state == TASK_RUNNING) { + printk("======*************************==\n"); + /* The next is running now. 
*/ + if (task_running(src_rq_next, next)) { + success = 0; + goto end; + } + /* The next task is runnable, and may stay in the current core's rq or other cores' rq. */ + /* Dequeue the next task's se (rather than dts_shared_se) to keep fairness and consistence. + * Enqueue the next task's se when the task expired. + */ + if (task_rq(next) != rq) { +#ifdef CONFIG_SCHED_STEAL + /* migrate */ + if (!steal_task(rq, &rf, &locked, next)) { + success = 0; + goto end; + } +#else + success = 0; + goto end; +#endif + } + replace_shared_entity(cfs_rq, next_se, &next->dts_shared_se); + } else if (next_state == TASK_INTERRUPTIBLE) { + /* + * // TODO + * The next task in the sleeping state caused by futex_swap, futex_wait, + * can be woken up here so far, but signals, and other interruptible situations + * need to be implemented here. + * P.S. We pick up the next task from the wake list of the corresponding futex_t. + */ + + /* Enqueue the shared_se and change the state without entering schedule() path. */ + if (!wake_up_process_prefer_current_cpu(next)) { + success = 0; + goto end; + } + + /* success to wakeup (set p->state = TASK_RUNNING) */ + /* dequeue the shared_se and set rq->curr = &next->dts_shared_se; */ + set_next_entity(cfs_rq, &next->dts_shared_se); + + } else { + printk("============================\n"); + success = 0; + goto end; + } + + sched_submit_work(prev); + preempt_disable(); + + local_irq_disable(); + rcu_note_context_switch(false); + + /* + * Make sure that signal_pending_state()->signal_pending() below + * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) + * done by the caller(futex_wait_queue_me) to avoid the race with signal_wake_up(): + * + * __set_current_state(@state) signal_wake_up() + * __direct_thread_switch() set_tsk_thread_flag(p, TIF_SIGPENDING) + * wake_up_state(p, state) + * LOCK rq->lock LOCK p->pi_state + * smp_mb__after_spinlock() smp_mb__after_spinlock() + * if (signal_pending_state()) if (p->state & @state) + * + * Also, the membarrier system call requires a full memory barrier + * after coming from user-space, before storing to rq->curr. + */ + rq_lock(rq, &rf); + smp_mb__after_spinlock(); + + /* + * We may fail to switch, so do not deactivate the current task before + * process the next. + */ + + /* + * We must load prev->state once (task_struct::state is volatile), such + * that: + * + * - we form a control dependency vs deactivate_task() below. + * - ptrace_{,un}freeze_traced() can change ->state underneath us. + */ + prev_state = prev->state; + if (prev_state) { + if (signal_pending_state(prev_state, prev)) { + prev->state = TASK_RUNNING; + } else { + prev->sched_contributes_to_load = + (prev_state & TASK_UNINTERRUPTIBLE) && + !(prev_state & TASK_NOLOAD) && + !(prev->flags & PF_FROZEN); + + if (prev->sched_contributes_to_load) + rq->nr_uninterruptible++; + + /* + * __schedule() ttwu() + * prev_state = prev->state; if (p->on_rq && ...) + * if (prev_state) goto out; + * p->on_rq = 0; smp_acquire__after_ctrl_dep(); + * p->state = TASK_WAKING + * + * Where __schedule() and ttwu() have matching control dependencies. + * + * After this, schedule() must not care about p->state any more. + */ + deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); + + if (prev->in_iowait) { + atomic_inc(&rq->nr_iowait); + delayacct_blkio_start(); + } + } + } + + rq->nr_switches++; + /* + * RCU users of rcu_dereference(rq->curr) may not see + * changes to task_struct made by pick_next_task(). 
*/ + RCU_INIT_POINTER(rq->curr, next); + /* + * The membarrier system call requires each architecture + * to have a full memory barrier after updating + * rq->curr, before returning to user-space. + * + * Here are the schemes providing that barrier on the + * various architectures: + * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. + * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. + * - finish_lock_switch() for weakly-ordered + * architectures where spin_unlock is a full barrier, + * - switch_to() for arm64 (weakly-ordered, spin_unlock + * is a RELEASE barrier), + */ + ++*switch_count; + + psi_sched_switch(prev, next, !task_on_rq_queued(prev)); + + trace_sched_switch(false, prev, next); + + /* the matching get_task_struct() was done earlier, in futex_wait_queue_me() */ + put_task_struct(next); + + rq = context_switch(rq, prev, next, &rf); + + balance_callback(rq); + sched_preempt_enable_no_resched(); + sched_update_worker(next); +end: + return success; +} + +/* + * return + * 0 on failure + * 1 on success + */ +static int direct_thread_switch(struct task_struct *next) +{ + if (next->sched_class != &fair_sched_class || + current == next) { + return 0; + } + + update_before_bypass(); + + if (!check_task_left_time(current)) { + /* To preserve fairness, do not suggest next as the preferred pick of the next scheduling decision. */ + /* DTS failed: mark for rescheduling. */ + return 0; + } + + return __direct_thread_switch(next); +} +#endif /* CONFIG_DTS */ + /** * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal * @hb: the futex hash bucket, must be locked by the caller * @q: the futex_q to queue up on * @timeout: the prepared hrtimer_sleeper, or null for no timeout + * @next: if present, wake next and hint to the scheduler that we'd + * prefer to execute it locally. */ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, - struct hrtimer_sleeper *timeout) + struct hrtimer_sleeper *timeout, + struct task_struct *next, int flags) { /* * The task state is guaranteed to be set before another task can @@ -2598,15 +2835,60 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, * has tried to wake us, and we can skip the call to schedule(). */ if (likely(!plist_node_empty(&q->list))) { +#ifdef CONFIG_DTS + int do_dts_switch = 0; +#endif /* * If the timer has already expired, current will already be * flagged for rescheduling. Only call schedule if there * is no timeout, or if it has yet to expire. */ - if (!timeout || timeout->task) - freezable_schedule(); + if (!timeout || timeout->task) { + if (next) { +#ifdef CONFIG_DTS + /* + * If we fail to switch to the next task directly, try to switch to + * the next task in the traditional way.
+ * + */ + if (flags & FUTEX_FLAGS_DTS_MODE) + do_dts_switch = direct_thread_switch(next); + + if (!do_dts_switch) +#endif + { +#ifdef CONFIG_SMP + wake_up_process_prefer_current_cpu(next); +#else + wake_up_process(next); +#endif + } + +#ifdef CONFIG_DTS + if (!do_dts_switch) +#endif + put_task_struct(next); + + next = NULL; + } +#ifdef CONFIG_DTS + if (!do_dts_switch) +#endif + freezable_schedule(); + } } __set_current_state(TASK_RUNNING); + + + + if (next) { +#ifdef CONFIG_DTS + direct_thread_switch(next); +#else + wake_up_process(next); + put_task_struct(next); +#endif + } } /** @@ -2682,7 +2964,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, } static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, - ktime_t *abs_time, u32 bitset) + ktime_t *abs_time, u32 bitset, struct task_struct *next) { struct hrtimer_sleeper timeout, *to; struct restart_block *restart; @@ -2706,7 +2988,8 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, goto out; /* queue_me and wait for wakeup, timeout, or a signal. */ - futex_wait_queue_me(hb, &q, to); + futex_wait_queue_me(hb, &q, to, next, flags); + next = NULL; /* If we were woken (and unqueued), we succeeded, whatever. */ ret = 0; @@ -2738,6 +3021,10 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ret = set_restart_fn(restart, futex_wait_restart); out: + if (next) { + wake_up_process(next); + put_task_struct(next); + } if (to) { hrtimer_cancel(&to->timer); destroy_hrtimer_on_stack(&to->timer); @@ -2745,7 +3032,6 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, return ret; } - static long futex_wait_restart(struct restart_block *restart) { u32 __user *uaddr = restart->futex.uaddr; @@ -2757,10 +3043,38 @@ static long futex_wait_restart(struct restart_block *restart) } restart->fn = do_no_restart_syscall; - return (long)futex_wait(uaddr, restart->futex.flags, - restart->futex.val, tp, restart->futex.bitset); + return (long)futex_wait(uaddr, restart->futex.flags, restart->futex.val, + tp, restart->futex.bitset, NULL); } +static int futex_swap(u32 __user *uaddr, unsigned int flags, u32 val, + ktime_t *abs_time, u32 __user *uaddr2) +{ + u32 bitset = FUTEX_BITSET_MATCH_ANY; + struct task_struct *next = NULL; + DEFINE_WAKE_Q(wake_q); + int ret; + + ret = prepare_wake_q(uaddr2, flags, 1, bitset, &wake_q); + if (ret < 0) + return ret; + if (!wake_q_empty(&wake_q)) { + /* At most one wakee can be present. Pull it out. */ + next = container_of(wake_q.first, struct task_struct, wake_q); + next->wake_q.next = NULL; + } + + /* Basic security test. (Are the two tasks in the same group?) */ + + /* Have any time slices to be used? */ + + /* + * The old one will go to sleep and enqueue the rq, meanwhile, get + * the new one to run. + */ + + return futex_wait(uaddr, flags, val, abs_time, bitset, next); +} /* * Userspace tried a 0 -> TID atomic transition of the futex value @@ -3222,7 +3536,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, } /* Queue the futex_q, drop the hb lock, wait for wakeup. 
*/ - futex_wait_queue_me(hb, &q, to); + futex_wait_queue_me(hb, &q, to, NULL, flags); spin_lock(&hb->lock); ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); @@ -3708,6 +4022,12 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, int cmd = op & FUTEX_CMD_MASK; unsigned int flags = 0; +#ifdef CONFIG_DTS + if (op & FUTEX_FLAGS_DTS_MODE) { + flags |= FUTEX_FLAGS_DTS_MODE; + } +#endif + if (!(op & FUTEX_PRIVATE_FLAG)) flags |= FLAGS_SHARED; @@ -3732,7 +4052,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, val3 = FUTEX_BITSET_MATCH_ANY; fallthrough; case FUTEX_WAIT_BITSET: - return futex_wait(uaddr, flags, val, timeout, val3); + return futex_wait(uaddr, flags, val, timeout, val3, NULL); case FUTEX_WAKE: val3 = FUTEX_BITSET_MATCH_ANY; fallthrough; @@ -3756,6 +4076,8 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, uaddr2); case FUTEX_CMP_REQUEUE_PI: return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); + case FUTEX_SWAP: + return futex_swap(uaddr, flags, val, timeout, uaddr2); } return -ENOSYS; } @@ -3772,7 +4094,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || cmd == FUTEX_WAIT_BITSET || - cmd == FUTEX_WAIT_REQUEUE_PI)) { + cmd == FUTEX_WAIT_REQUEUE_PI || cmd == FUTEX_SWAP)) { if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) return -EFAULT; if (get_timespec64(&ts, utime)) @@ -3781,7 +4103,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, return -EINVAL; t = timespec64_to_ktime(ts); - if (cmd == FUTEX_WAIT) + if (cmd == FUTEX_WAIT || cmd == FUTEX_SWAP) t = ktime_add_safe(ktime_get(), t); else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME)) t = timens_ktime_to_host(CLOCK_MONOTONIC, t); @@ -3968,14 +4290,14 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || cmd == FUTEX_WAIT_BITSET || - cmd == FUTEX_WAIT_REQUEUE_PI)) { + cmd == FUTEX_WAIT_REQUEUE_PI || cmd == FUTEX_SWAP)) { if (get_old_timespec32(&ts, utime)) return -EFAULT; if (!timespec64_valid(&ts)) return -EINVAL; t = timespec64_to_ktime(ts); - if (cmd == FUTEX_WAIT) + if (cmd == FUTEX_WAIT || cmd == FUTEX_SWAP) t = ktime_add_safe(ktime_get(), t); else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME)) t = timens_ktime_to_host(CLOCK_MONOTONIC, t); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c936c042297b1de96c654cad369c1060ddfb594e..39b9b9cbb73682988d2508841ab6afd43ba5078e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2824,7 +2824,11 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, struct rq_flags *rf) { - check_preempt_curr(rq, p, wake_flags); +#ifdef CONFIG_DTS + if (p->by_pass != INIT_BY_PASS) +#endif + check_preempt_curr(rq, p, wake_flags); + p->state = TASK_RUNNING; trace_sched_wakeup(p); @@ -3351,7 +3355,16 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) out: if (success) ttwu_stat(p, task_cpu(p), wake_flags); - preempt_enable(); +#ifdef CONFIG_DTS + if (p->by_pass == INIT_BY_PASS) { + p->by_pass = IN_BY_PASS; + p->se.by_pass = IN_BY_PASS; + p->dts_shared_se.by_pass = IN_BY_PASS; + preempt_enable_no_resched(); + } + else +#endif + preempt_enable(); return success; } @@ -3441,6 +3454,16 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.vruntime = 0; 
INIT_LIST_HEAD(&p->se.group_node); +#ifdef CONFIG_DTS + p->dts_shared_se.on_rq = 0; + p->dts_shared_se.exec_start = 0; + p->dts_shared_se.sum_exec_runtime = 0; + p->dts_shared_se.prev_sum_exec_runtime = 0; + p->dts_shared_se.nr_migrations = 0; + p->dts_shared_se.vruntime = 0; + INIT_LIST_HEAD(&p->dts_shared_se.group_node); +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = NULL; #endif @@ -3667,6 +3690,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) init_entity_runnable_average(&p->se); +#ifdef CONFIG_DTS + p->by_pass = NONE_BY_PASS; + p->se.by_pass = NONE_BY_PASS; + p->dts_shared_se.by_pass = NONE_BY_PASS; +#endif #ifdef CONFIG_SCHED_INFO if (likely(sched_info_on())) @@ -4046,6 +4074,11 @@ static struct rq *finish_task_switch(struct task_struct *prev) membarrier_mm_sync_core_before_usermode(mm); mmdrop(mm); } +#ifdef CONFIG_DTS + prev->by_pass = NONE_BY_PASS; + prev->se.by_pass = NONE_BY_PASS; + prev->dts_shared_se.by_pass = NONE_BY_PASS; +#endif if (unlikely(prev_state == TASK_DEAD)) { if (prev->sched_class->task_dead) prev->sched_class->task_dead(prev); @@ -4088,7 +4121,7 @@ static void __balance_callback(struct rq *rq) raw_spin_rq_unlock_irqrestore(rq, flags); } -static inline void balance_callback(struct rq *rq) +inline void balance_callback(struct rq *rq) { if (unlikely(rq->balance_callback)) __balance_callback(rq); @@ -4096,7 +4129,7 @@ static inline void balance_callback(struct rq *rq) #else -static inline void balance_callback(struct rq *rq) +inline void balance_callback(struct rq *rq) { } @@ -4133,7 +4166,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) /* * context_switch - switch to the new MM and the new thread's register state. */ -static __always_inline struct rq * +__always_inline struct rq * context_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next, struct rq_flags *rf) { @@ -4189,7 +4222,7 @@ context_switch(struct rq *rq, struct task_struct *prev, barrier(); return finish_task_switch(prev); -} +}EXPORT_SYMBOL(context_switch); /* * nr_running and nr_context_switches: @@ -5421,7 +5454,7 @@ void __noreturn do_task_dead(void) cpu_relax(); } -static inline void sched_submit_work(struct task_struct *tsk) +inline void sched_submit_work(struct task_struct *tsk) { unsigned int task_flags; @@ -5457,7 +5490,7 @@ static inline void sched_submit_work(struct task_struct *tsk) blk_schedule_flush_plug(tsk); } -static void sched_update_worker(struct task_struct *tsk) +void sched_update_worker(struct task_struct *tsk) { if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { if (tsk->flags & PF_WQ_WORKER) @@ -7737,6 +7770,11 @@ void sched_setnuma(struct task_struct *p, int nid) } #endif /* CONFIG_NUMA_BALANCING */ +int wake_up_process_prefer_current_cpu(struct task_struct *next) +{ + return try_to_wake_up(next, TASK_NORMAL, WF_CURRENT_CPU); +} + #ifdef CONFIG_HOTPLUG_CPU /* * Ensure that the idle task is using init_mm right before its CPU goes diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 20b482688e3a6132fcb1319309db910594489397..42aa3f6ca7cb8b0d894a1b99b7acd231eb7f04fc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -278,11 +278,6 @@ const struct sched_class fair_sched_class; */ #ifdef CONFIG_FAIR_GROUP_SCHED - -/* Walk up scheduling entities hierarchy */ -#define for_each_sched_entity(se) \ - for (; se; se = se->parent) - static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) { if (!path) @@ -443,8 +438,11 @@ find_matching_se(struct sched_entity **se, struct sched_entity 
**pse) #else /* !CONFIG_FAIR_GROUP_SCHED */ -#define for_each_sched_entity(se) \ - for (; se; se = NULL) +/* runqueue "owned" by this group */ +static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) +{ + return NULL; +} static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) { @@ -576,6 +574,28 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) &cfs_rq->tasks_timeline, leftmost); } +static void __traverse_cfs_rq(struct cfs_rq *cfs_rq, struct rb_node **node) +{ + struct sched_entity *entry; + + if (!*node) { + printk("TREE END\n"); + return; + } + + entry = rb_entry(*node, struct sched_entity, run_node); + + __traverse_cfs_rq(cfs_rq, &(*node)->rb_left); + printk("%p\n", entry); + __traverse_cfs_rq(cfs_rq, &(*node)->rb_right); +} + +void traverse_cfs_rq(struct cfs_rq *cfs_rq) +{ + struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node; + __traverse_cfs_rq(cfs_rq, link); +} + static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline); @@ -4326,7 +4346,11 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_stats_enqueue(cfs_rq, se, flags); check_spread(cfs_rq, se); if (!curr) - __enqueue_entity(cfs_rq, se); +#ifdef CONFIG_DTS + if (se->by_pass != INIT_BY_PASS) +#endif + __enqueue_entity(cfs_rq, se); + se->on_rq = 1; /* @@ -4449,6 +4473,11 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) unsigned long ideal_runtime, delta_exec; struct sched_entity *se; s64 delta; +#ifdef CONFIG_DTS + struct task_struct *curr_task = NULL; + if (curr->by_pass != NONE_BY_PASS) + curr_task = task_of_dts_shared_se(curr); +#endif ideal_runtime = sched_slice(cfs_rq, curr); delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; @@ -4459,7 +4488,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) * re-elected due to buddy favours. */ clear_buddies(cfs_rq, curr); - return; + goto end; } /* @@ -4468,19 +4497,72 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) * This also mitigates buddy induced latencies under load. */ if (delta_exec < sysctl_sched_min_granularity) - return; + goto end; se = __pick_first_entity(cfs_rq); delta = curr->vruntime - se->vruntime; if (delta < 0) - return; + goto end; - if (delta > ideal_runtime) + if (delta > ideal_runtime) { resched_curr(rq_of(cfs_rq)); + goto end; + } else { + return; + } +end: +#ifdef CONFIG_DTS + if (curr_task != NULL) { + curr_task->by_pass = END_BY_PASS; + curr_task->se.by_pass = END_BY_PASS; + curr_task->dts_shared_se.by_pass = END_BY_PASS; + } +#endif } -static void +#ifdef CONFIG_DTS +/* + * We dequeue the task's original se but do NOT change any scheduling information of the se. + * Correspondingly, enqueue the task's original se without any changes to its information + * when the shared se expires. // TODO + * Acquiring the shared se's stats, etc. still needs to be fixed when the task executes in DTS mode. // TODO + */ +void +replace_shared_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, struct sched_entity *shared_se) +{ + if (shared_se->on_rq) { + /* + * Any task has to be enqueued before it get to execute on + * a CPU. So account for the time it spent waiting on the + * runqueue.
*/ + // TODO + update_stats_wait_end(cfs_rq, shared_se); + __dequeue_entity(cfs_rq, se); /* the next task's se (not the shared se) should be dequeued */ + update_load_avg(cfs_rq, shared_se, UPDATE_TG); + } + + update_stats_curr_start(cfs_rq, shared_se); + cfs_rq->curr = shared_se; // subsequent update_curr() calls will update cfs_rq->curr + + /* + * Track our maximum slice length, if the CPU's load is at + * least twice that of our own weight (i.e. dont track it + * when there are only lesser-weight tasks around): + */ + if (schedstat_enabled() && + rq_of(cfs_rq)->cfs.load.weight >= 2*shared_se->load.weight) { + schedstat_set(shared_se->statistics.slice_max, + max((u64)schedstat_val(shared_se->statistics.slice_max), + shared_se->sum_exec_runtime - shared_se->prev_sum_exec_runtime)); + } + + shared_se->prev_sum_exec_runtime = shared_se->sum_exec_runtime; +} +#endif + +void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { clear_buddies(cfs_rq, se); @@ -4576,8 +4658,12 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); +/* prev's se may be either the task's own se or the shared se under the DTS mechanism */ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) { +#ifdef CONFIG_DTS + struct task_struct *task = task_of(prev); +#endif /* * If still on the runqueue then deactivate_task() * was not called and update_curr() has to be done: */ @@ -4598,6 +4684,13 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) update_load_avg(cfs_rq, prev, 0); } cfs_rq->curr = NULL; +#ifdef CONFIG_DTS + if (task->by_pass == END_BY_PASS) { + task->by_pass = NONE_BY_PASS; + task->se.by_pass = NONE_BY_PASS; + task->dts_shared_se.by_pass = NONE_BY_PASS; + } +#endif } static void @@ -5601,6 +5694,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) int task_new = !(flags & ENQUEUE_WAKEUP); unsigned int prev_nr = rq->cfs.h_nr_running; +#ifdef CONFIG_DTS + if (p->by_pass != NONE_BY_PASS) { + se = &p->dts_shared_se; + } +#endif + /* * The code below (indirectly) updates schedutil which looks at * the cfs_rq utilization to select a frequency.
@@ -5708,11 +5807,17 @@ static void set_next_buddy(struct sched_entity *se); static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; - struct sched_entity *se = &p->se; + struct sched_entity *se; int task_sleep = flags & DEQUEUE_SLEEP; int idle_h_nr_running = task_has_idle_policy(p); unsigned int prev_nr = rq->cfs.h_nr_running; bool was_sched_idle = sched_idle_rq(rq); +#ifdef CONFIG_DTS + if (p->by_pass != NONE_BY_PASS) + se = &p->dts_shared_se; + else +#endif + se = &p->se; util_est_dequeue(&rq->cfs, p); @@ -6858,6 +6963,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int want_affine = 0; int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); + if ((wake_flags & WF_CURRENT_CPU) && cpumask_test_cpu(cpu, p->cpus_ptr)) + return cpu; + time = schedstat_start_time(); /* @@ -7079,11 +7187,28 @@ static void set_skip_buddy(struct sched_entity *se) static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) { struct task_struct *curr = rq->curr; - struct sched_entity *se = &curr->se, *pse = &p->se; + struct sched_entity *se, *pse; struct cfs_rq *cfs_rq = task_cfs_rq(curr); int scale = cfs_rq->nr_running >= sched_nr_latency; int next_buddy_marked = 0; +#ifdef CONFIG_DTS + int curr_by_pass = curr->by_pass; + int p_by_pass = p->by_pass; + + if (curr_by_pass != NONE_BY_PASS) + se = &curr->dts_shared_se; + else +#endif + se = &curr->se; + +#ifdef CONFIG_DTS + if (p_by_pass != NONE_BY_PASS) + pse = &p->dts_shared_se; + else +#endif + pse = &p->se; + if (unlikely(se == pse)) return; @@ -7666,7 +7791,14 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf * least amount of cfs_rqs. */ if (prev != p) { - struct sched_entity *pse = &prev->se; + struct sched_entity *pse; +#ifdef CONFIG_DTS + if (prev->by_pass != NONE_BY_PASS) + pse = &prev->dts_shared_se; + else +#endif + pse = &prev->se; + while (!(cfs_rq = is_same_group(se, pse))) { int se_depth = se->depth; @@ -7815,8 +7947,15 @@ static struct task_struct *__pick_next_task_fair(struct rq *rq) */ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) { - struct sched_entity *se = &prev->se; + struct sched_entity *se; struct cfs_rq *cfs_rq; +#ifdef CONFIG_DTS + if (prev->by_pass != NONE_BY_PASS) + se = &prev->dts_shared_se; + else +#endif + se = &prev->se; + for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); @@ -7833,7 +7972,13 @@ static void yield_task_fair(struct rq *rq) { struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); - struct sched_entity *se = &curr->se; + struct sched_entity *se; +#ifdef CONFIG_DTS + if (curr->by_pass != NONE_BY_PASS) + se = &curr->dts_shared_se; + else +#endif + se = &curr->se; /* * Are we the only task in the tree? @@ -7864,6 +8009,13 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) { struct sched_entity *se = &p->se; +#ifdef CONFIG_DTS + /* DTS tasks DO NOT support being executed via the yield_to method. */ + if (p->by_pass != NONE_BY_PASS) { + return false; + } +#endif + /* throttled hierarchies are not runnable */ if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se))) return false; @@ -8301,7 +8453,7 @@ can_migrate_task_llc(struct task_struct *p, struct rq *rq, struct rq *dst_rq) /* * detach_task() -- detach the task for the migration from @src_rq to @dst_cpu.
*/ -static void detach_task(struct task_struct *p, struct rq *src_rq, int dst_cpu) +void detach_task(struct task_struct *p, struct rq *src_rq, int dst_cpu) { lockdep_assert_rq_held(src_rq); @@ -8480,6 +8632,10 @@ static void attach_task(struct rq *rq, struct task_struct *p) BUG_ON(task_rq(p) != rq); activate_task(rq, p, ENQUEUE_NOCLOCK); + +#ifdef CONFIG_DTS + if (p->by_pass != INIT_BY_PASS) +#endif check_preempt_curr(rq, p, 0); } @@ -11454,6 +11610,53 @@ static int steal_from(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked, return stolen; } +int steal_task(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked, + struct task_struct *tsk) +{ + struct rq_flags rf; + int stolen = 0; + int dst_cpu = dst_rq->cpu; + struct rq *src_rq = task_rq(tsk); + int src_cpu = task_cpu(tsk); + + if (!steal_enabled()) + return 0; + + if (!cpu_active(dst_cpu)) + return 0; + + if (dst_cpu == src_cpu) + return 0; + + if (*locked) { + rq_unpin_lock(dst_rq, dst_rf); + raw_spin_rq_unlock(dst_rq); + *locked = false; + } + rq_lock_irqsave(src_rq, &rf); + update_rq_clock(src_rq); + + if (!cpu_active(src_cpu)) + tsk = NULL; + else + detach_task(tsk, src_rq, dst_cpu); + + rq_unlock(src_rq, &rf); + + if (tsk) { + raw_spin_rq_lock(dst_rq); + rq_repin_lock(dst_rq, dst_rf); + *locked = true; + update_rq_clock(dst_rq); + attach_task(dst_rq, tsk); + stolen = 1; + schedstat_inc(dst_rq->steal); + } + local_irq_restore(rf.flags); + + return stolen; +} + /* * Conservative upper bound on the max cost of a steal, in nsecs (the typical * cost is 1-2 microsec). Do not steal if average idle time is less. @@ -11676,6 +11879,12 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) struct cfs_rq *cfs_rq; struct sched_entity *se = &curr->se; +#ifdef CONFIG_DTS + if (curr->by_pass != NONE_BY_PASS) { + se = &curr->dts_shared_se; + } +#endif + for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); entity_tick(cfs_rq, se, queued); @@ -12169,6 +12378,81 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task return rr_interval; } +void update_before_bypass(void) +{ + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + struct sched_entity *curr; + struct cfs_rq *cfs_rq; + +#ifdef CONFIG_DTS + if (current->by_pass != NONE_BY_PASS) + curr = &current->dts_shared_se; + else +#endif + curr = &current->se; + + cfs_rq = cfs_rq_of(curr); + + rq_lock(rq, &rf); + update_rq_clock(rq); + + /* + * Ensure that runnable average is periodically updated. + */ + update_load_avg(cfs_rq, curr, UPDATE_TG); + update_cfs_group(curr); + + /* + * Update run-time statistics of the 'current'. + */ + update_curr(cfs_rq); + + /* + * Ensure that runnable average is periodically updated. + */ + update_load_avg(cfs_rq, curr, UPDATE_TG); + update_cfs_group(curr); + + rq_unlock(rq, &rf); +} + +/* + * Return 1 if the task still has time left in its current slice, 0 otherwise. + * + */ +int check_task_left_time(struct task_struct *task) +{ + unsigned long ideal_runtime, delta_exec; + struct sched_entity *se; + struct cfs_rq *cfs_rq; +#ifdef CONFIG_DTS + if (task->by_pass != NONE_BY_PASS) + se = &task->dts_shared_se; + else +#endif + se = &task->se; + + cfs_rq = cfs_rq_of(se); + + ideal_runtime = sched_slice(cfs_rq, se); + delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime; + if (delta_exec > ideal_runtime) { + if (cfs_rq->nr_running > 1) { + resched_curr(rq_of(cfs_rq)); + /* + * The current task ran long enough, ensure it doesn't get + * re-elected due to buddy favours.
+ */ + clear_buddies(cfs_rq, se); + } + return 0; + } + + return 1; +} + /* * All the scheduling class methods: */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3bd6c988652af037831f4007c8b39c03a73eb52d..7f86a5b8707986dab0c53909b6116b04a39673db 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -526,6 +526,12 @@ extern void sched_offline_group(struct task_group *tg); extern void sched_move_task(struct task_struct *tsk); +#ifdef CONFIG_DTS +extern void replace_shared_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev_se, struct sched_entity *shared_se); +#endif + +extern void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se); + #ifdef CONFIG_FAIR_GROUP_SCHED extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); @@ -1384,12 +1390,30 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define raw_rq() raw_cpu_ptr(&runqueues) #ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_DTS +static inline struct task_struct *task_of_dts_shared_se(struct sched_entity *dts_shared_se) +{ + SCHED_WARN_ON(!entity_is_task(dts_shared_se)); + return container_of(dts_shared_se, struct task_struct, dts_shared_se); +} +#endif + static inline struct task_struct *task_of(struct sched_entity *se) { SCHED_WARN_ON(!entity_is_task(se)); +#ifdef CONFIG_DTS + if (se->by_pass != NONE_BY_PASS) + return task_of_dts_shared_se(se); + else +#endif return container_of(se, struct task_struct, se); } +/* Walk up scheduling entities hierarchy */ +#define for_each_sched_entity(se) \ + for (; se; se = se->parent) + + static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) { return p->se.cfs_rq; @@ -1409,17 +1433,40 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) #else +#ifdef CONFIG_DTS +static inline struct task_struct *task_of_dts_shared_se(struct sched_entity *dts_shared_se) +{ + return container_of(dts_shared_se, struct task_struct, dts_shared_se); +} + +static inline struct cfs_rq *cfs_rq_of_dts_shared_se(struct sched_entity *se) +{ + struct task_struct *p = task_of_dts_shared_se(se); + struct rq *rq = task_rq(p); + + return &rq->cfs; +} +#endif + static inline struct task_struct *task_of(struct sched_entity *se) { +#ifdef CONFIG_DTS + if (se->by_pass != NONE_BY_PASS) + return task_of_dts_shared_se(se); + else +#endif return container_of(se, struct task_struct, se); } +#define for_each_sched_entity(se) \ + for (; se; se = NULL) + static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) { return &task_rq(p)->cfs; } -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +static inline struct cfs_rq *cfs_rq_of_se(struct sched_entity *se) { struct task_struct *p = task_of(se); struct rq *rq = task_rq(p); @@ -1427,6 +1474,17 @@ static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) return &rq->cfs; } +static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +{ +#ifdef CONFIG_DTS + if (se->by_pass != NONE_BY_PASS) + return cfs_rq_of_dts_shared_se(se); + else +#endif + return cfs_rq_of_se(se); + +} + /* runqueue "owned" by this group */ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) { @@ -2060,6 +2118,7 @@ static inline int task_on_rq_migrating(struct task_struct *p) #define WF_FORK 0x02 /* Child wakeup after fork */ #define WF_MIGRATED 0x04 /* Internal use, task got migrated */ #define WF_ON_CPU 0x08 /* Wakee is on_cpu */ +#define WF_CURRENT_CPU 0x10 /* Prefer to move wakee to the current CPU */ /* * To aid in avoiding the subversion of "niceness" due to uneven distribution @@ 
-2625,6 +2684,7 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); +extern void traverse_cfs_rq(struct cfs_rq *cfs_rq); #ifdef CONFIG_SCHED_DEBUG extern bool sched_debug_enabled; @@ -3010,3 +3070,16 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) void swake_up_all_locked(struct swait_queue_head *q); void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); + +#ifdef CONFIG_DTS +extern void sched_submit_work(struct task_struct *tsk); +extern void sched_update_worker(struct task_struct *tsk); +extern struct rq *context_switch(struct rq *rq, struct task_struct *prev, + struct task_struct *next, struct rq_flags *rf); +#ifdef CONFIG_SCHED_STEAL +extern int steal_task(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked, + struct task_struct *tsk); +extern void update_before_bypass(void); +extern void balance_callback(struct rq *rq); +#endif +#endif \ No newline at end of file diff --git a/tools/testing/selftests/futex/functional/.gitignore b/tools/testing/selftests/futex/functional/.gitignore index 0efcd494daabfa5fc90ba1b027015e1564087433..d661ef0946ccd87ed1d6114c184ea48f5434778a 100644 --- a/tools/testing/selftests/futex/functional/.gitignore +++ b/tools/testing/selftests/futex/functional/.gitignore @@ -2,6 +2,7 @@ futex_requeue_pi futex_requeue_pi_mismatched_ops futex_requeue_pi_signal_restart +futex_swap futex_wait_private_mapped_file futex_wait_timeout futex_wait_uninitialized_heap diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile index 23207829ec752b52a56577a679f5baf3f3f51a46..6992fac38b1583190ce759342917158e4a3c8ee6 100644 --- a/tools/testing/selftests/futex/functional/Makefile +++ b/tools/testing/selftests/futex/functional/Makefile @@ -13,6 +13,7 @@ TEST_GEN_FILES := \ futex_requeue_pi \ futex_requeue_pi_signal_restart \ futex_requeue_pi_mismatched_ops \ + futex_swap \ futex_wait_uninitialized_heap \ futex_wait_private_mapped_file diff --git a/tools/testing/selftests/futex/functional/futex_swap.c b/tools/testing/selftests/futex/functional/futex_swap.c new file mode 100644 index 0000000000000000000000000000000000000000..501a0bfe2cd1ea4f083d77375ed8ee9a1597d68b --- /dev/null +++ b/tools/testing/selftests/futex/functional/futex_swap.c @@ -0,0 +1,225 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include +#include +#include +#include +#include +#include +#include "atomic.h" +#include "futextest.h" + +/* The futex the main thread waits on. */ +futex_t futex_main = FUTEX_INITIALIZER; +/* The futex the other thread waits on. */ +futex_t futex_other = FUTEX_INITIALIZER; + +/* The number of iterations to run (>1 => run benchmarks. */ +static int cfg_iterations = 1; + +/* If != 0, print diagnostic messages. */ +static int cfg_verbose; + +/* If == 0, do not use validation_counter. Useful for benchmarking. */ +static int cfg_validate = 1; + +/* How to swap threads. */ +#define SWAP_WAKE_WAIT 1 +#define SWAP_SWAP 2 +#define SWAP_SWAP_DTS 4 + +/* Futex values. */ +#define FUTEX_WAITING 0 +#define FUTEX_WAKEUP 1 + +#define FUTEX_FLAGS_DTS_MODE 512 + +/* An atomic counter used to validate proper swapping. 
*/ +static atomic_t validation_counter; + +void futex_swap_op(int mode, futex_t *futex_this, futex_t *futex_that) +{ + int ret; + int flags = 0; + + switch (mode) { + case SWAP_WAKE_WAIT: + futex_set(futex_this, FUTEX_WAITING); + futex_set(futex_that, FUTEX_WAKEUP); + futex_wake(futex_that, 1, FUTEX_PRIVATE_FLAG); + futex_wait(futex_this, FUTEX_WAITING, NULL, FUTEX_PRIVATE_FLAG); + if (*futex_this != FUTEX_WAKEUP) { + fprintf(stderr, "unexpected futex_this value on wakeup\n"); + exit(1); + } + break; + + case SWAP_SWAP_DTS: + flags |= FUTEX_FLAGS_DTS_MODE; + case SWAP_SWAP: + flags |= FUTEX_PRIVATE_FLAG; + futex_set(futex_this, FUTEX_WAITING); + futex_set(futex_that, FUTEX_WAKEUP); + ret = futex_swap(futex_this, FUTEX_WAITING, NULL, + futex_that, flags); + if (ret < 0 && errno == ENOSYS) { + /* futex_swap not implemented */ + perror("futex_swap"); + exit(1); + } + if (*futex_this != FUTEX_WAKEUP) { + fprintf(stderr, "unexpected futex_this value on wakeup\n"); + exit(1); + } + break; + + default: + fprintf(stderr, "unknown mode in %s\n", __func__); + exit(1); + } +} + +void *other_thread(void *arg) +{ + int mode = *((int *)arg); + int counter; + + if (cfg_verbose) + printf("%s started\n", __func__); + + futex_wait(&futex_other, 0, NULL, FUTEX_PRIVATE_FLAG); + + for (counter = 0; counter < cfg_iterations; ++counter) { + if (cfg_validate) { + int prev = 2 * counter + 1; + + if (prev != atomic_cmpxchg(&validation_counter, prev, + prev + 1)) { + fprintf(stderr, "swap validation failed\n"); + exit(1); + } + } + futex_swap_op(mode, &futex_other, &futex_main); + } + + if (cfg_verbose) + printf("%s finished: %d iteration(s)\n", __func__, counter); + + return NULL; +} + +void run_test(int mode) +{ + struct timespec start, stop; + int ret, counter; + pthread_t thread; + uint64_t duration; + + futex_set(&futex_other, FUTEX_WAITING); + atomic_set(&validation_counter, 0); + ret = pthread_create(&thread, NULL, &other_thread, &mode); + if (ret) { + perror("pthread_create"); + exit(1); + } + + ret = clock_gettime(CLOCK_MONOTONIC, &start); + if (ret) { + perror("clock_gettime"); + exit(1); + } + + for (counter = 0; counter < cfg_iterations; ++counter) { + if (cfg_validate) { + int prev = 2 * counter; + + if (prev != atomic_cmpxchg(&validation_counter, prev, + prev + 1)) { + fprintf(stderr, "swap validation failed\n"); + exit(1); + } + } + futex_swap_op(mode, &futex_main, &futex_other); + } + if (cfg_validate && validation_counter.val != 2 * cfg_iterations) { + fprintf(stderr, "final swap validation failed\n"); + exit(1); + } + + ret = clock_gettime(CLOCK_MONOTONIC, &stop); + if (ret) { + perror("clock_gettime"); + exit(1); + } + + duration = (stop.tv_sec - start.tv_sec) * 1000000000LL + + stop.tv_nsec - start.tv_nsec; + if (cfg_verbose || cfg_iterations > 1) { + printf("completed %d swap and back iterations in %lu ns: %lu ns per swap\n", + cfg_iterations, duration, + duration / (cfg_iterations * 2)); + } + + /* The remote thread is blocked; send it the final wake. 
*/ + futex_set(&futex_other, FUTEX_WAKEUP); + futex_wake(&futex_other, 1, FUTEX_PRIVATE_FLAG); + if (pthread_join(thread, NULL)) { + perror("pthread_join"); + exit(1); + } +} + +void usage(char *prog) +{ + printf("Usage: %s\n", prog); + printf(" -h Display this help message\n"); + printf(" -i N Use N iterations to benchmark\n"); + printf(" -n Do not validate swapping correctness\n"); + printf(" -v Print diagnostic messages\n"); + printf(" -d Benchmark with the direct-thread-switch(DTS) mechanism\n"); +} + +int main(int argc, char *argv[]) +{ + int c; + + while ((c = getopt(argc, argv, "hi:nvd")) != -1) { + switch (c) { + case 'h': + usage(basename(argv[0])); + exit(0); + case 'i': + cfg_iterations = atoi(optarg); + break; + case 'n': + cfg_validate = 0; + break; + case 'v': + cfg_verbose = 1; + break; + case 'd': + goto dts_test; + break; + default: + usage(basename(argv[0])); + exit(1); + } + } + + printf("\n\n------- running SWAP_WAKE_WAIT -----------\n\n"); + run_test(SWAP_WAKE_WAIT); + printf("PASS\n"); + + printf("\n\n------- running SWAP_SWAP -----------\n\n"); + run_test(SWAP_SWAP); + printf("PASS\n"); + +dts_test: + printf("\n\n---- running SWAP_SWAP with the direct-thread-switch(DTS) mechanism ----\n\n"); + run_test(SWAP_SWAP_DTS); + printf("PASS\n"); + + return 0; +} diff --git a/tools/testing/selftests/futex/include/futextest.h b/tools/testing/selftests/futex/include/futextest.h index ddbcfc9b7bac4aebb5bac2f249e26ecfd948aa84..4d6a0a18445ad6a5d723acd710f6dc455b69568c 100644 --- a/tools/testing/selftests/futex/include/futextest.h +++ b/tools/testing/selftests/futex/include/futextest.h @@ -38,6 +38,9 @@ typedef volatile u_int32_t futex_t; #ifndef FUTEX_CMP_REQUEUE_PI #define FUTEX_CMP_REQUEUE_PI 12 #endif +#ifndef FUTEX_SWAP +#define FUTEX_SWAP 13 +#endif #ifndef FUTEX_WAIT_REQUEUE_PI_PRIVATE #define FUTEX_WAIT_REQUEUE_PI_PRIVATE (FUTEX_WAIT_REQUEUE_PI | \ FUTEX_PRIVATE_FLAG) @@ -46,6 +49,9 @@ typedef volatile u_int32_t futex_t; #define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \ FUTEX_PRIVATE_FLAG) #endif +#ifndef FUTEX_SWAP_PRIVATE +#define FUTEX_SWAP_PRIVATE (FUTEX_SWAP | FUTEX_PRIVATE_FLAG) +#endif /** * futex() - SYS_futex syscall wrapper @@ -204,6 +210,19 @@ futex_cmp_requeue_pi(futex_t *uaddr, futex_t val, futex_t *uaddr2, int nr_wake, val, opflags); } +/** + * futex_swap() - block on uaddr and wake one task blocked on uaddr2. + * @uaddr: futex to block the current task on + * @timeout: relative timeout for the current task block + * @uaddr2: futex to wake tasks at (can be the same as uaddr) + */ +static inline int +futex_swap(futex_t *uaddr, futex_t val, struct timespec *timeout, + futex_t *uaddr2, int opflags) +{ + return futex(uaddr, FUTEX_SWAP, val, timeout, uaddr2, 0, opflags); +} + /** * futex_cmpxchg() - atomic compare and exchange * @uaddr: The address of the futex to be modified