From b87ae678de0e7f875419aedf334f34f4cf300a77 Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Wed, 22 Jul 2020 16:45:36 -0700 Subject: [PATCH 1/4] futex: introduce FUTEX_SWAP operation openeuler inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4L9RU CVE: NA ------------------- As Paul Turner presented at LPC in 2013 ... - pdf: http://pdxplumbers.osuosl.org/2013/ocw//system/presentations/1653/original/LPC%20-%20User%20Threading.pdf - video: https://www.youtube.com/watch?v=KXuZi9aeGTw ... Google has developed an M:N userspace threading subsystem backed by Google-private SwitchTo Linux Kernel API (page 17 in the pdf referenced above). This subsystem provides latency-sensitive services at Google with fine-grained user-space control/scheduling over what is running when, and this subsystem is used widely internally (called schedulers or fibers). This patchset is the first step to open-source this work. As explained in the linked pdf and video, SwitchTo API has three core operations: wait, resume, and swap (=switch). So this patchset adds a FUTEX_SWAP operation that, in addition to FUTEX_WAIT and FUTEX_WAKE, will provide a foundation on top of which user-space threading libraries can be built. Another common use case for FUTEX_SWAP is message passing a-la RPC between tasks: task/thread T1 prepares a message, wakes T2 to work on it, and waits for the results; when T2 is done, it wakes T1 and waits for more work to arrive. Currently the simplest way to implement this is a. T1: futex-wake T2, futex-wait b. T2: wakes, does what it has been woken to do c. T2: futex-wake T1, futex-wait With FUTEX_SWAP, steps a and c above can be reduced to one futex operation that runs 5-10 times faster. Patches in this patchset: Patch 1: (this patch) introduce FUTEX_SWAP futex operation that, internally, does wake + wait. The purpose of this patch is to work out the API. Patch 2: a first rough attempt to make FUTEX_SWAP faster than what wake + wait can do. Patch 3: a selftest that can also be used to benchmark FUTEX_SWAP vs FUTEX_WAKE + FUTEX_WAIT. Tested: see patch 3 in this patchset. Signed-off-by: Peter Oskolkov --- include/uapi/linux/futex.h | 2 + kernel/futex.c | 94 ++++++++++++++++++++++++++++++-------- 2 files changed, 78 insertions(+), 18 deletions(-) diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h index a89eb0accd5e..c1d151d97dea 100644 --- a/include/uapi/linux/futex.h +++ b/include/uapi/linux/futex.h @@ -21,6 +21,7 @@ #define FUTEX_WAKE_BITSET 10 #define FUTEX_WAIT_REQUEUE_PI 11 #define FUTEX_CMP_REQUEUE_PI 12 +#define FUTEX_SWAP 13 #define FUTEX_PRIVATE_FLAG 128 #define FUTEX_CLOCK_REALTIME 256 @@ -40,6 +41,7 @@ FUTEX_PRIVATE_FLAG) #define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \ FUTEX_PRIVATE_FLAG) +#define FUTEX_SWAP_PRIVATE (FUTEX_SWAP | FUTEX_PRIVATE_FLAG) /* * Support for robust futexes: the kernel cleans up held futexes at diff --git a/kernel/futex.c b/kernel/futex.c index 98a6e1b80bfe..8d0fadfe5a9c 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1181,7 +1181,7 @@ static int handle_exit_race(u32 __user *uaddr, u32 uval, * tsk->futex_state = } else { * FUTEX_STATE_DEAD; if (tsk->futex_state != * FUTEX_STATE_DEAD) - * return -EAGAIN; + * return -EAGAIN; * return -ESRCH; <--- FAIL * } * @@ -1584,16 +1584,16 @@ double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) } /* - * Wake up waiters matching bitset queued on this futex (uaddr). + * Prepare wake queue matching bitset queued on this futex (uaddr). 
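+ * Up to @nr_wake waiters whose ->bitset intersects @bitset are unqueued and
+ * collected on @wake_q instead of being woken directly; the number collected
+ * is returned. The caller is expected to run wake_up_q() on @wake_q after
+ * this function has dropped the hash bucket lock.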
*/ static int -futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) +prepare_wake_q(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset, + struct wake_q_head *wake_q) { struct futex_hash_bucket *hb; struct futex_q *this, *next; union futex_key key = FUTEX_KEY_INIT; int ret; - DEFINE_WAKE_Q(wake_q); if (!bitset) return -EINVAL; @@ -1621,14 +1621,28 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) if (!(this->bitset & bitset)) continue; - mark_wake_futex(&wake_q, this); + mark_wake_futex(wake_q, this); if (++ret >= nr_wake) break; } } spin_unlock(&hb->lock); + return ret; +} + +/* + * Wake up waiters matching bitset queued on this futex (uaddr). + */ +static int +futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) +{ + int ret; + DEFINE_WAKE_Q(wake_q); + + ret = prepare_wake_q(uaddr, flags, nr_wake, bitset, &wake_q); wake_up_q(&wake_q); + return ret; } @@ -2576,9 +2590,12 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) * @hb: the futex hash bucket, must be locked by the caller * @q: the futex_q to queue up on * @timeout: the prepared hrtimer_sleeper, or null for no timeout + * @next: if present, wake next and hint to the scheduler that we'd + * prefer to execute it locally. */ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, - struct hrtimer_sleeper *timeout) + struct hrtimer_sleeper *timeout, + struct task_struct *next) { /* * The task state is guaranteed to be set before another task can @@ -2603,10 +2620,26 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, * flagged for rescheduling. Only call schedule if there * is no timeout, or if it has yet to expire. */ - if (!timeout || timeout->task) + if (!timeout || timeout->task) { + if (next) { + /* + * wake_up_process() below will be replaced + * in the next patch with + * wake_up_process_prefer_current_cpu(). + */ + wake_up_process(next); + put_task_struct(next); + next = NULL; + } freezable_schedule(); + } } __set_current_state(TASK_RUNNING); + + if (next) { + wake_up_process(next); + put_task_struct(next); + } } /** @@ -2682,7 +2715,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, } static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, - ktime_t *abs_time, u32 bitset) + ktime_t *abs_time, u32 bitset, struct task_struct *next) { struct hrtimer_sleeper timeout, *to; struct restart_block *restart; @@ -2706,7 +2739,8 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, goto out; /* queue_me and wait for wakeup, timeout, or a signal. */ - futex_wait_queue_me(hb, &q, to); + futex_wait_queue_me(hb, &q, to, next); + next = NULL; /* If we were woken (and unqueued), we succeeded, whatever. 
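+	 * Note that the reference we held on @next, if any, has already been
+	 * handed off to futex_wait_queue_me() above, which wakes the task and
+	 * drops the reference; that is why @next was cleared before this point.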
*/ ret = 0; @@ -2738,6 +2772,10 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ret = set_restart_fn(restart, futex_wait_restart); out: + if (next) { + wake_up_process(next); + put_task_struct(next); + } if (to) { hrtimer_cancel(&to->timer); destroy_hrtimer_on_stack(&to->timer); @@ -2745,7 +2783,6 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, return ret; } - static long futex_wait_restart(struct restart_block *restart) { u32 __user *uaddr = restart->futex.uaddr; @@ -2757,10 +2794,29 @@ static long futex_wait_restart(struct restart_block *restart) } restart->fn = do_no_restart_syscall; - return (long)futex_wait(uaddr, restart->futex.flags, - restart->futex.val, tp, restart->futex.bitset); + return (long)futex_wait(uaddr, restart->futex.flags, restart->futex.val, + tp, restart->futex.bitset, NULL); } +static int futex_swap(u32 __user *uaddr, unsigned int flags, u32 val, + ktime_t *abs_time, u32 __user *uaddr2) +{ + u32 bitset = FUTEX_BITSET_MATCH_ANY; + struct task_struct *next = NULL; + DEFINE_WAKE_Q(wake_q); + int ret; + + ret = prepare_wake_q(uaddr2, flags, 1, bitset, &wake_q); + if (ret < 0) + return ret; + if (!wake_q_empty(&wake_q)) { + /* At most one wakee can be present. Pull it out. */ + next = container_of(wake_q.first, struct task_struct, wake_q); + next->wake_q.next = NULL; + } + + return futex_wait(uaddr, flags, val, abs_time, bitset, next); +} /* * Userspace tried a 0 -> TID atomic transition of the futex value @@ -3222,7 +3278,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, } /* Queue the futex_q, drop the hb lock, wait for wakeup. */ - futex_wait_queue_me(hb, &q, to); + futex_wait_queue_me(hb, &q, to, NULL); spin_lock(&hb->lock); ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); @@ -3732,7 +3788,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, val3 = FUTEX_BITSET_MATCH_ANY; fallthrough; case FUTEX_WAIT_BITSET: - return futex_wait(uaddr, flags, val, timeout, val3); + return futex_wait(uaddr, flags, val, timeout, val3, NULL); case FUTEX_WAKE: val3 = FUTEX_BITSET_MATCH_ANY; fallthrough; @@ -3756,6 +3812,8 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, uaddr2); case FUTEX_CMP_REQUEUE_PI: return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); + case FUTEX_SWAP: + return futex_swap(uaddr, flags, val, timeout, uaddr2); } return -ENOSYS; } @@ -3772,7 +3830,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || cmd == FUTEX_WAIT_BITSET || - cmd == FUTEX_WAIT_REQUEUE_PI)) { + cmd == FUTEX_WAIT_REQUEUE_PI || cmd == FUTEX_SWAP)) { if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) return -EFAULT; if (get_timespec64(&ts, utime)) @@ -3781,7 +3839,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, return -EINVAL; t = timespec64_to_ktime(ts); - if (cmd == FUTEX_WAIT) + if (cmd == FUTEX_WAIT || cmd == FUTEX_SWAP) t = ktime_add_safe(ktime_get(), t); else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME)) t = timens_ktime_to_host(CLOCK_MONOTONIC, t); @@ -3968,14 +4026,14 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || cmd == FUTEX_WAIT_BITSET || - cmd == FUTEX_WAIT_REQUEUE_PI)) { + cmd == FUTEX_WAIT_REQUEUE_PI || cmd == FUTEX_SWAP)) { if (get_old_timespec32(&ts, utime)) return -EFAULT; if (!timespec64_valid(&ts)) return -EINVAL; t = timespec64_to_ktime(ts); - if 
(cmd == FUTEX_WAIT)
+	if (cmd == FUTEX_WAIT || cmd == FUTEX_SWAP)
 		t = ktime_add_safe(ktime_get(), t);
 	else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME))
 		t = timens_ktime_to_host(CLOCK_MONOTONIC, t);
--
Gitee

From fc4a23541f862d575e1dbc77016d636820adaed6 Mon Sep 17 00:00:00 2001
From: Peter Oskolkov
Date: Wed, 22 Jul 2020 16:45:37 -0700
Subject: [PATCH 2/4] futex/sched: add wake_up_process_prefer_current_cpu, use
 in FUTEX_SWAP

openeuler inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4L9RU
CVE: NA

-------------------

As described in the previous patch in this patchset ("futex: introduce
FUTEX_SWAP operation"), it is often beneficial to wake a task and run it
on the same CPU where the current, going-to-sleep task is running.

Internally at Google, the switchto_switch syscall not only migrates the
wakee to the current CPU, but also moves the waker's load stats to the
wakee, thus ensuring that the migration to the current CPU does not
interfere with load balancing. switchto_switch also does the context
switch into the wakee, bypassing schedule().

This patchset does not go that far yet; it simply migrates the wakee to
the current CPU and calls schedule(). In follow-up patches I will try to
fine-tune the behavior by adjusting load stats and schedule(): our
internal switchto_switch is still about 2x faster than FUTEX_SWAP (see
numbers below).

And now about performance: the futex_swap benchmark from the last patch
in this patchset produces this typical output:

$ ./futex_swap -i 100000

------- running SWAP_WAKE_WAIT -----------

completed 100000 swap and back iterations in 820683263 ns: 4103 ns per swap
PASS

------- running SWAP_SWAP -----------

completed 100000 swap and back iterations in 124034476 ns: 620 ns per swap
PASS

In the above, the first benchmark (SWAP_WAKE_WAIT) calls FUTEX_WAKE, then
FUTEX_WAIT; the second benchmark (SWAP_SWAP) calls FUTEX_SWAP.

If the benchmark is restricted to a single cpu:

$ taskset -c 1 ./futex_swap -i 1000000

The numbers are very similar, as expected (with wake+wait being a bit
slower than swap due to two syscalls vs one).

Please also note that switchto_switch is about 2x faster than FUTEX_SWAP
because it does a context switch to the wakee immediately, bypassing
schedule(), so this is one of the options I'll explore in further patches
(if/when this initial patchset is accepted).

Tested: see the last patch in this patchset.

Signed-off-by: Peter Oskolkov
---
 include/linux/sched.h | 1 +
 kernel/futex.c        | 6 +++++-
 kernel/sched/core.c   | 5 +++++
 kernel/sched/fair.c   | 4 ++++
 kernel/sched/sched.h  | 1 +
 5 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0c7235c2a3d7..714386dd3e18 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1829,6 +1829,7 @@ extern struct task_struct *find_get_task_by_vpid(pid_t nr);
 
 extern int wake_up_state(struct task_struct *tsk, unsigned int state);
 extern int wake_up_process(struct task_struct *tsk);
+extern int wake_up_process_prefer_current_cpu(struct task_struct *tsk);
 extern void wake_up_new_task(struct task_struct *tsk);
 
 #ifdef CONFIG_SMP
diff --git a/kernel/futex.c b/kernel/futex.c
index 8d0fadfe5a9c..42f55d1da678 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2627,7 +2627,11 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
 				 * in the next patch with
 				 * wake_up_process_prefer_current_cpu().
*/ - wake_up_process(next); +#ifdef CONFIG_SMP + wake_up_process_prefer_current_cpu(next); +#else + wake_up_process(next); +#endif put_task_struct(next); next = NULL; } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9d8ee902b711..5bf355323381 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6931,6 +6931,11 @@ void sched_setnuma(struct task_struct *p, int nid) } #endif /* CONFIG_NUMA_BALANCING */ +int wake_up_process_prefer_current_cpu(struct task_struct *next) +{ + return try_to_wake_up(next, TASK_NORMAL, WF_CURRENT_CPU); +} + #ifdef CONFIG_HOTPLUG_CPU /* * Ensure that the idle task is using init_mm right before its CPU goes diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d99e5bd9798d..30fbcd06baa9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6899,6 +6899,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int ret; #endif + if ((wake_flags & WF_CURRENT_CPU) && cpumask_test_cpu(cpu, p->cpus_ptr)) + return cpu; + + time = schedstat_start_time(); if (sd_flag & SD_BALANCE_WAKE) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1aeccc5111d7..976cdb36fe08 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1862,6 +1862,7 @@ static inline int task_on_rq_migrating(struct task_struct *p) #define WF_FORK 0x02 /* Child wakeup after fork */ #define WF_MIGRATED 0x04 /* Internal use, task got migrated */ #define WF_ON_CPU 0x08 /* Wakee is on_cpu */ +#define WF_CURRENT_CPU 0x10 /* Prefer to move wakee to the current CPU */ /* * To aid in avoiding the subversion of "niceness" due to uneven distribution -- Gitee From 8871eed8a3f60bbd2e2a4151156f7e047bf62923 Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Wed, 22 Jul 2020 16:45:38 -0700 Subject: [PATCH 3/4] selftests/futex: add futex_swap selftest openeuler inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4L9RU CVE: NA ------------------- This is the final patch in FUTEX_SWAP patchset. It adds a test/benchmark to validate behavior and compare performance of a new FUTEX_SWAP futex operation. Detailed API design and behavior considerations are provided in the commit messages of the previous two patches. 
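For completeness, a minimal user-space sketch of the intended call pattern
is shown below. This assumes a libc without a FUTEX_SWAP wrapper, so the raw
futex syscall is used; the opcode value and argument order follow patch 1,
and the timeout, if given, is relative:

#include <stdint.h>
#include <time.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/futex.h>

#ifndef FUTEX_SWAP
#define FUTEX_SWAP 13	/* added by patch 1 of this series */
#endif

/*
 * Block on *uaddr (which must still contain @val) and, in the same
 * syscall, wake one waiter blocked on *uaddr2 (uaddr2 may equal uaddr).
 */
static long futex_swap(uint32_t *uaddr, uint32_t val,
		       const struct timespec *timeout, uint32_t *uaddr2)
{
	return syscall(SYS_futex, uaddr, FUTEX_SWAP | FUTEX_PRIVATE_FLAG,
		       val, timeout, uaddr2, 0);
}
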
Signed-off-by: Peter Oskolkov --- .../selftests/futex/functional/.gitignore | 1 + .../selftests/futex/functional/Makefile | 1 + .../selftests/futex/functional/futex_swap.c | 209 ++++++++++++++++++ .../selftests/futex/include/futextest.h | 19 ++ 4 files changed, 230 insertions(+) create mode 100644 tools/testing/selftests/futex/functional/futex_swap.c diff --git a/tools/testing/selftests/futex/functional/.gitignore b/tools/testing/selftests/futex/functional/.gitignore index 0efcd494daab..d661ef0946cc 100644 --- a/tools/testing/selftests/futex/functional/.gitignore +++ b/tools/testing/selftests/futex/functional/.gitignore @@ -2,6 +2,7 @@ futex_requeue_pi futex_requeue_pi_mismatched_ops futex_requeue_pi_signal_restart +futex_swap futex_wait_private_mapped_file futex_wait_timeout futex_wait_uninitialized_heap diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile index 23207829ec75..6992fac38b15 100644 --- a/tools/testing/selftests/futex/functional/Makefile +++ b/tools/testing/selftests/futex/functional/Makefile @@ -13,6 +13,7 @@ TEST_GEN_FILES := \ futex_requeue_pi \ futex_requeue_pi_signal_restart \ futex_requeue_pi_mismatched_ops \ + futex_swap \ futex_wait_uninitialized_heap \ futex_wait_private_mapped_file diff --git a/tools/testing/selftests/futex/functional/futex_swap.c b/tools/testing/selftests/futex/functional/futex_swap.c new file mode 100644 index 000000000000..9034d04372d3 --- /dev/null +++ b/tools/testing/selftests/futex/functional/futex_swap.c @@ -0,0 +1,209 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include +#include +#include +#include +#include +#include +#include "atomic.h" +#include "futextest.h" + +/* The futex the main thread waits on. */ +futex_t futex_main = FUTEX_INITIALIZER; +/* The futex the other thread wats on. */ +futex_t futex_other = FUTEX_INITIALIZER; + +/* The number of iterations to run (>1 => run benchmarks. */ +static int cfg_iterations = 1; + +/* If != 0, print diagnostic messages. */ +static int cfg_verbose; + +/* If == 0, do not use validation_counter. Useful for benchmarking. */ +static int cfg_validate = 1; + +/* How to swap threads. */ +#define SWAP_WAKE_WAIT 1 +#define SWAP_SWAP 2 + +/* Futex values. */ +#define FUTEX_WAITING 0 +#define FUTEX_WAKEUP 1 + +/* An atomic counter used to validate proper swapping. 
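+ * The two threads advance it in lock-step: before each swap the main thread
+ * cmpxchg()es the counter from 2*i to 2*i + 1 and the other thread from
+ * 2*i + 1 to 2*i + 2, so a missed or duplicated switch shows up as a failed
+ * cmpxchg or as a wrong final value.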
*/ +static atomic_t validation_counter; + +void futex_swap_op(int mode, futex_t *futex_this, futex_t *futex_that) +{ + int ret; + + switch (mode) { + case SWAP_WAKE_WAIT: + futex_set(futex_this, FUTEX_WAITING); + futex_set(futex_that, FUTEX_WAKEUP); + futex_wake(futex_that, 1, FUTEX_PRIVATE_FLAG); + futex_wait(futex_this, FUTEX_WAITING, NULL, FUTEX_PRIVATE_FLAG); + if (*futex_this != FUTEX_WAKEUP) { + fprintf(stderr, "unexpected futex_this value on wakeup\n"); + exit(1); + } + break; + + case SWAP_SWAP: + futex_set(futex_this, FUTEX_WAITING); + futex_set(futex_that, FUTEX_WAKEUP); + ret = futex_swap(futex_this, FUTEX_WAITING, NULL, + futex_that, FUTEX_PRIVATE_FLAG); + if (ret < 0 && errno == ENOSYS) { + /* futex_swap not implemented */ + perror("futex_swap"); + exit(1); + } + if (*futex_this != FUTEX_WAKEUP) { + fprintf(stderr, "unexpected futex_this value on wakeup\n"); + exit(1); + } + break; + + default: + fprintf(stderr, "unknown mode in %s\n", __func__); + exit(1); + } +} + +void *other_thread(void *arg) +{ + int mode = *((int *)arg); + int counter; + + if (cfg_verbose) + printf("%s started\n", __func__); + + futex_wait(&futex_other, 0, NULL, FUTEX_PRIVATE_FLAG); + + for (counter = 0; counter < cfg_iterations; ++counter) { + if (cfg_validate) { + int prev = 2 * counter + 1; + + if (prev != atomic_cmpxchg(&validation_counter, prev, + prev + 1)) { + fprintf(stderr, "swap validation failed\n"); + exit(1); + } + } + futex_swap_op(mode, &futex_other, &futex_main); + } + + if (cfg_verbose) + printf("%s finished: %d iteration(s)\n", __func__, counter); + + return NULL; +} + +void run_test(int mode) +{ + struct timespec start, stop; + int ret, counter; + pthread_t thread; + uint64_t duration; + + futex_set(&futex_other, FUTEX_WAITING); + atomic_set(&validation_counter, 0); + ret = pthread_create(&thread, NULL, &other_thread, &mode); + if (ret) { + perror("pthread_create"); + exit(1); + } + + ret = clock_gettime(CLOCK_MONOTONIC, &start); + if (ret) { + perror("clock_gettime"); + exit(1); + } + + for (counter = 0; counter < cfg_iterations; ++counter) { + if (cfg_validate) { + int prev = 2 * counter; + + if (prev != atomic_cmpxchg(&validation_counter, prev, + prev + 1)) { + fprintf(stderr, "swap validation failed\n"); + exit(1); + } + } + futex_swap_op(mode, &futex_main, &futex_other); + } + if (cfg_validate && validation_counter.val != 2 * cfg_iterations) { + fprintf(stderr, "final swap validation failed\n"); + exit(1); + } + + ret = clock_gettime(CLOCK_MONOTONIC, &stop); + if (ret) { + perror("clock_gettime"); + exit(1); + } + + duration = (stop.tv_sec - start.tv_sec) * 1000000000LL + + stop.tv_nsec - start.tv_nsec; + if (cfg_verbose || cfg_iterations > 1) { + printf("completed %d swap and back iterations in %lu ns: %lu ns per swap\n", + cfg_iterations, duration, + duration / (cfg_iterations * 2)); + } + + /* The remote thread is blocked; send it the final wake. 
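+	 * After its last iteration the other thread is still blocked inside
+	 * futex_swap_op() waiting on futex_other; store the expected
+	 * FUTEX_WAKEUP value and wake it once more so that it can return and
+	 * pthread_join() below succeeds.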
*/ + futex_set(&futex_other, FUTEX_WAKEUP); + futex_wake(&futex_other, 1, FUTEX_PRIVATE_FLAG); + if (pthread_join(thread, NULL)) { + perror("pthread_join"); + exit(1); + } +} + +void usage(char *prog) +{ + printf("Usage: %s\n", prog); + printf(" -h Display this help message\n"); + printf(" -i N Use N iterations to benchmark\n"); + printf(" -n Do not validate swapping correctness\n"); + printf(" -v Print diagnostic messages\n"); +} + +int main(int argc, char *argv[]) +{ + int c; + + while ((c = getopt(argc, argv, "hi:nv")) != -1) { + switch (c) { + case 'h': + usage(basename(argv[0])); + exit(0); + case 'i': + cfg_iterations = atoi(optarg); + break; + case 'n': + cfg_validate = 0; + break; + case 'v': + cfg_verbose = 1; + break; + default: + usage(basename(argv[0])); + exit(1); + } + } + + printf("\n\n------- running SWAP_WAKE_WAIT -----------\n\n"); + run_test(SWAP_WAKE_WAIT); + printf("PASS\n"); + + printf("\n\n------- running SWAP_SWAP -----------\n\n"); + run_test(SWAP_SWAP); + printf("PASS\n"); + + return 0; +} diff --git a/tools/testing/selftests/futex/include/futextest.h b/tools/testing/selftests/futex/include/futextest.h index ddbcfc9b7bac..4d6a0a18445a 100644 --- a/tools/testing/selftests/futex/include/futextest.h +++ b/tools/testing/selftests/futex/include/futextest.h @@ -38,6 +38,9 @@ typedef volatile u_int32_t futex_t; #ifndef FUTEX_CMP_REQUEUE_PI #define FUTEX_CMP_REQUEUE_PI 12 #endif +#ifndef FUTEX_SWAP +#define FUTEX_SWAP 13 +#endif #ifndef FUTEX_WAIT_REQUEUE_PI_PRIVATE #define FUTEX_WAIT_REQUEUE_PI_PRIVATE (FUTEX_WAIT_REQUEUE_PI | \ FUTEX_PRIVATE_FLAG) @@ -46,6 +49,9 @@ typedef volatile u_int32_t futex_t; #define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \ FUTEX_PRIVATE_FLAG) #endif +#ifndef FUTEX_SWAP_PRIVATE +#define FUTEX_SWAP_PRIVATE (FUTEX_WAIT_WAKE | FUTEX_PRIVATE_FLAG) +#endif /** * futex() - SYS_futex syscall wrapper @@ -204,6 +210,19 @@ futex_cmp_requeue_pi(futex_t *uaddr, futex_t val, futex_t *uaddr2, int nr_wake, val, opflags); } +/** + * futex_swap() - block on uaddr and wake one task blocked on uaddr2. + * @uaddr: futex to block the current task on + * @timeout: relative timeout for the current task block + * @uaddr2: futex to wake tasks at (can be the same as uaddr) + */ +static inline int +futex_swap(futex_t *uaddr, futex_t val, struct timespec *timeout, + futex_t *uaddr2, int opflags) +{ + return futex(uaddr, FUTEX_SWAP, val, timeout, uaddr2, 0, opflags); +} + /** * futex_cmpxchg() - atomic compare and exchange * @uaddr: The address of the futex to be modified -- Gitee From dad99a5750c4c942300c6626888a6ad52e62bf24 Mon Sep 17 00:00:00 2001 From: briansun Date: Tue, 25 Oct 2022 20:21:07 +0800 Subject: [PATCH 4/4] futex: introduce the direct-thread-switch mechanism openeuler inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4L9RU CVE: NA Reference: https://lore.kernel.org/lkml/20200722234538.166697-2-posk@posk.io/ ------------------- In some scenarios, we need to run several low-thrashing required threads together which act as logical operations like PV operations. This kind of thread always falls asleep and wakes other threads up, and thread switching requires the kernel to do several scheduling related overheads (Select the proper core to execute, wake the task up, enqueue the task, mark the task scheduling flag, pick the task at the proper time, dequeue the task and do context switching). These overheads mentioned above are not accepted for the low-thrashing threads. 
Therefore, we require a mechanism to decline the unnecessary overhead and to swap threads directly without affecting the fairness of CFS tasks. To achieve this goal, we implemented the direct-thread-switch mechanism based on the futex_swap patch*, which switches the DTS task directly with the shared schedule entity. Also, we ensured the kernel keeps secure and consistent basically. Signed-off-by: Zhi Song --- include/linux/sched.h | 22 ++ include/uapi/linux/futex.h | 8 + init/Kconfig | 7 + kernel/futex.c | 280 ++++++++++++++- kernel/sched/core.c | 49 ++- kernel/sched/fair.c | 318 +++++++++++++++++- kernel/sched/sched.h | 68 +++- .../selftests/futex/functional/futex_swap.c | 24 +- 8 files changed, 739 insertions(+), 37 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 714386dd3e18..9aa03c0957be 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -68,6 +68,11 @@ struct task_delay_info; struct task_group; struct io_uring_task; +#define NONE_BY_PASS 0x0000 +#define INIT_BY_PASS 0x0001 +#define IN_BY_PASS 0x0002 +#define END_BY_PASS 0x0004 + /* * Task state bitmask. NOTE! These bits are also * encoded in fs/proc/array.c: get_task_state(). @@ -500,6 +505,10 @@ struct sched_entity { unsigned long runnable_weight; #endif +#ifdef CONFIG_DTS + int by_pass; +#endif + #ifdef CONFIG_SMP /* * Per entity load average tracking. @@ -726,6 +735,15 @@ struct task_struct { int normal_prio; unsigned int rt_priority; +#ifdef CONFIG_DTS + /* + * by_pass indicate that the task is launched by direct-thread-switch. + * dts_shared_se is the schedule entity shared with DTS task. + */ + int by_pass; + struct sched_entity dts_shared_se; +#endif + const struct sched_class *sched_class; struct sched_entity se; struct sched_rt_entity rt; @@ -2194,6 +2212,10 @@ static inline int sched_qos_cpu_overload(void) } #endif +#ifdef CONFIG_DTS +extern int check_task_left_time(struct task_struct *task); +#endif + #ifdef CONFIG_BPF_SCHED extern void sched_settag(struct task_struct *tsk, s64 tag); diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h index c1d151d97dea..e2ed0553046d 100644 --- a/include/uapi/linux/futex.h +++ b/include/uapi/linux/futex.h @@ -25,7 +25,14 @@ #define FUTEX_PRIVATE_FLAG 128 #define FUTEX_CLOCK_REALTIME 256 + +#ifdef CONFIG_DTS +#define FUTEX_FLAGS_DTS_MODE 512 +#define FUTEX_CMD_MASK ~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME | \ + FUTEX_FLAGS_DTS_MODE) +#else #define FUTEX_CMD_MASK ~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME) +#endif #define FUTEX_WAIT_PRIVATE (FUTEX_WAIT | FUTEX_PRIVATE_FLAG) #define FUTEX_WAKE_PRIVATE (FUTEX_WAKE | FUTEX_PRIVATE_FLAG) @@ -43,6 +50,7 @@ FUTEX_PRIVATE_FLAG) #define FUTEX_SWAP_PRIVATE (FUTEX_SWAP | FUTEX_PRIVATE_FLAG) + /* * Support for robust futexes: the kernel cleans up held futexes at * thread exit time. diff --git a/init/Kconfig b/init/Kconfig index 1c607825c2db..e1be030628a3 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1261,6 +1261,13 @@ config SCHED_STEAL If unsure, say N here. 
+config DTS + bool "Direct Thread Switch" + default y + depends on SCHED_STEAL + help + enable the direct thread switch mechanism in the futex_swap operation + config CHECKPOINT_RESTORE bool "Checkpoint/restore support" select PROC_CHILDREN diff --git a/kernel/futex.c b/kernel/futex.c index 42f55d1da678..48da1e8b8948 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -39,11 +39,16 @@ #include #include #include +#include +#include #include #include "locking/rtmutex_common.h" +#ifdef CONFIG_DTS +#include "sched/sched.h" +#endif /* * READ this before attempting to hack on futexes! * @@ -161,7 +166,7 @@ static int __read_mostly futex_cmpxchg_enabled; * NOMMU does not have per process address space. Let the compiler optimize * code away. */ -# define FLAGS_SHARED 0x00 +#define FLAGS_SHARED 0x00 #endif #define FLAGS_CLOCKRT 0x02 #define FLAGS_HAS_TIMEOUT 0x04 @@ -2585,6 +2590,219 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) return 0; } +#ifdef CONFIG_DTS +static int __direct_thread_switch(struct task_struct *next) +{ + int cpu = smp_processor_id(); + int success = 1; + struct rq_flags rf; + struct rq *rq = cpu_rq(cpu); + struct cfs_rq *cfs_rq = &rq->cfs; + struct task_struct *prev = rq->curr; + struct sched_entity *prev_se, *next_se; + unsigned long *switch_count = &prev->nvcsw; + unsigned long prev_state; + int next_state; + struct rq *src_rq_next; + bool locked; + + preempt_disable(); + local_irq_disable(); + + if (!prev->by_pass) { + prev_se = &prev->se; + } else { + prev_se = &prev->dts_shared_se; + } + + next_se = &next->se; + + prev->by_pass = NONE_BY_PASS; + next->by_pass = INIT_BY_PASS; + next->dts_shared_se = *prev_se; + prev_se->by_pass = NONE_BY_PASS; + next->dts_shared_se.by_pass = INIT_BY_PASS; + + /* task_struct::state is volatile so far */ + next_state = next->state; + src_rq_next = task_rq(next); + locked = true; + /* Deliver the execution to the callee. */ + if (next_state == TASK_RUNNING) { + /* The next is running now. */ + if (task_running(src_rq_next, next)) { + success = 0; + goto end; + } + /* The next task is runnable, and may stay in the current core's rq or other cores' rq. */ + /* Dequeue the next task's se (rather than dts_shared_se) to keep fairness and consistence. + * Enqueue the next task's se when the task expired. + */ + if (task_rq(next) != rq) { +#ifdef CONFIG_SCHED_STEAL + /* migrate */ + if (!steal_task(rq, &rf, &locked, next)) { + success = 0; + goto end; + } +#else + success = 0; + goto end; +#endif + } + replace_shared_entity(cfs_rq, next_se, &next->dts_shared_se); + } else if (next_state == TASK_INTERRUPTIBLE) { + /* + * + * The next task in the sleeping state caused by futex_swap, futex_wait, + * can be woken up here so far, but signals, and other interruptible situations + * need to be implemented here. + * P.S. We pick up the next task from the wake list of the corresponding futex_t. + */ + + /* Enqueue the shared_se and change the state without entering schedule() path. 
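+		 * If that wakeup fails (the task was concurrently woken and is
+		 * already runnable), give up on the direct switch here; the
+		 * caller will fall back to the ordinary wake + schedule() path.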
*/ + if (!wake_up_process_prefer_current_cpu(next)) { + success = 0; + goto end; + } + + /* success to wakeup (set p->state = TASK_RUNNING) */ + /* dequeue the shared_se and set rq->curr = &next->dts_shared_se; */ + set_next_entity(cfs_rq, &next->dts_shared_se); + + } else { + success = 0; + goto end; + } + + /* increase rq->cfs.nr_running */ + cfs_rq->nr_running++; + + sched_submit_work(prev); + + rcu_note_context_switch(false); + + /* + * Make sure that signal_pending_state()->signal_pending() below + * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) + * done by the caller(futex_wait_queue_me) to avoid the race with signal_wake_up(): + * + * __set_current_state(@state) signal_wake_up() + * __direct_thread_switch() set_tsk_thread_flag(p, TIF_SIGPENDING) + * wake_up_state(p, state) + * LOCK rq->lock LOCK p->pi_state + * smp_mb__after_spinlock() smp_mb__after_spinlock() + * if (signal_pending_state()) if (p->state & @state) + * + * Also, the membarrier system call requires a full memory barrier + * after coming from user-space, before storing to rq->curr. + */ + rq_lock(rq, &rf); + smp_mb__after_spinlock(); + + /* + * We may fail to switch, so do not deactivate the current task before + * process the next. + */ + + /* + * We must load prev->state once (task_struct::state is volatile), such + * that: + * + * - we form a control dependency vs deactivate_task() below. + * - ptrace_{,un}freeze_traced() can change ->state underneath us. + */ + prev_state = prev->state; + if (prev_state) { + if (signal_pending_state(prev_state, prev)) { + prev->state = TASK_RUNNING; + } else { + prev->sched_contributes_to_load = + (prev_state & TASK_UNINTERRUPTIBLE) && + !(prev_state & TASK_NOLOAD) && + !(prev->flags & PF_FROZEN); + + if (prev->sched_contributes_to_load) + rq->nr_uninterruptible++; + + /* + * __schedule() ttwu() + * prev_state = prev->state; if (p->on_rq && ...) + * if (prev_state) goto out; + * p->on_rq = 0; smp_acquire__after_ctrl_dep(); + * p->state = TASK_WAKING + * + * Where __schedule() and ttwu() have matching control dependencies. + * + * After this, schedule() must not care about p->state any more. + */ + deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); + + if (prev->in_iowait) { + atomic_inc(&rq->nr_iowait); + delayacct_blkio_start(); + } + } + } + + rq->nr_switches++; + /* + * RCU users of rcu_dereference(rq->curr) may not see + * changes to task_struct made by pick_next_task(). + */ + RCU_INIT_POINTER(rq->curr, next); + /* + * The membarrier system call requires each architecture + * to have a full memory barrier after updating + * rq->curr, before returning to user-space. + * + * Here are the schemes providing that barrier on the + * various architectures: + * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. + * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. 
+ * - finish_lock_switch() for weakly-ordered + * architectures where spin_unlock is a full barrier, + * - switch_to() for arm64 (weakly-ordered, spin_unlock + * is a RELEASE barrier), + */ + ++*switch_count; + + psi_sched_switch(prev, next, !task_on_rq_queued(prev)); + + trace_sched_switch(false, prev, next); + + /* do the get_task_struct() in the futex_wait_queue_me() before */ + put_task_struct(next); + + rq = context_switch(rq, prev, next, &rf); + + balance_callback(rq); + sched_update_worker(next); +end: + sched_preempt_enable_no_resched(); + return success; +} + +/* + * return + * 0 for fail + * 1 for succeed + */ +static int direct_thread_switch(struct task_struct *next) +{ + if (next->sched_class != &fair_sched_class || + current == next) { + return 0; + } + + if (!check_task_left_time(current)) { + return 0; + } + + return __direct_thread_switch(next); +} +#endif /* CONFIG_DTS */ + /** * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal * @hb: the futex hash bucket, must be locked by the caller @@ -2595,7 +2813,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) */ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, struct hrtimer_sleeper *timeout, - struct task_struct *next) + struct task_struct *next, int flags) { /* * The task state is guaranteed to be set before another task can @@ -2615,6 +2833,9 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, * has tried to wake us, and we can skip the call to schedule(). */ if (likely(!plist_node_empty(&q->list))) { +#ifdef CONFIG_DTS + int do_dts_switch = 0; +#endif /* * If the timer has already expired, current will already be * flagged for rescheduling. Only call schedule if there @@ -2622,27 +2843,49 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, */ if (!timeout || timeout->task) { if (next) { +#ifdef CONFIG_DTS /* - * wake_up_process() below will be replaced - * in the next patch with - * wake_up_process_prefer_current_cpu(). + * If we fail to switch to the next task directly, try to switch to + * the next task in the traditional way. + * */ + if (flags & FUTEX_FLAGS_DTS_MODE) + do_dts_switch = direct_thread_switch(next); + + if (!do_dts_switch) +#endif + { #ifdef CONFIG_SMP - wake_up_process_prefer_current_cpu(next); + wake_up_process_prefer_current_cpu(next); #else - wake_up_process(next); + wake_up_process(next); +#endif + } + +#ifdef CONFIG_DTS + if (!do_dts_switch) #endif - put_task_struct(next); + put_task_struct(next); + next = NULL; } - freezable_schedule(); +#ifdef CONFIG_DTS + if (!do_dts_switch) +#endif + freezable_schedule(); } } __set_current_state(TASK_RUNNING); + + if (next) { +#ifdef CONFIG_DTS + direct_thread_switch(next); +#else wake_up_process(next); put_task_struct(next); +#endif } } @@ -2743,7 +2986,7 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, goto out; /* queue_me and wait for wakeup, timeout, or a signal. */ - futex_wait_queue_me(hb, &q, to, next); + futex_wait_queue_me(hb, &q, to, next, flags); next = NULL; /* If we were woken (and unqueued), we succeeded, whatever. */ @@ -2819,6 +3062,15 @@ static int futex_swap(u32 __user *uaddr, unsigned int flags, u32 val, next->wake_q.next = NULL; } + /* Basic security test. (Are the two tasks in the same group?) */ + + /* Have any time slices to be used? */ + + /* + * The old one will go to sleep and enqueue the rq, meanwhile, get + * the new one to run. 
+ */ + return futex_wait(uaddr, flags, val, abs_time, bitset, next); } @@ -3282,7 +3534,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, } /* Queue the futex_q, drop the hb lock, wait for wakeup. */ - futex_wait_queue_me(hb, &q, to, NULL); + futex_wait_queue_me(hb, &q, to, NULL, flags); spin_lock(&hb->lock); ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); @@ -3768,6 +4020,12 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, int cmd = op & FUTEX_CMD_MASK; unsigned int flags = 0; +#ifdef CONFIG_DTS + if (op & FUTEX_FLAGS_DTS_MODE) { + flags |= FUTEX_FLAGS_DTS_MODE; + } +#endif + if (!(op & FUTEX_PRIVATE_FLAG)) flags |= FLAGS_SHARED; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5bf355323381..a9093f6d98d9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2469,7 +2469,11 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, struct rq_flags *rf) { - check_preempt_curr(rq, p, wake_flags); +#ifdef CONFIG_DTS + if (p->by_pass != INIT_BY_PASS) +#endif + check_preempt_curr(rq, p, wake_flags); + p->state = TASK_RUNNING; trace_sched_wakeup(p); @@ -2996,7 +3000,16 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) out: if (success) ttwu_stat(p, task_cpu(p), wake_flags); - preempt_enable(); +#ifdef CONFIG_DTS + if (p->by_pass == INIT_BY_PASS) { + p->by_pass = IN_BY_PASS; + p->se.by_pass = IN_BY_PASS; + p->dts_shared_se.by_pass = IN_BY_PASS; + preempt_enable_no_resched(); + } + else +#endif + preempt_enable(); return success; } @@ -3086,6 +3099,16 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.vruntime = 0; INIT_LIST_HEAD(&p->se.group_node); +#ifdef CONFIG_DTS + p->dts_shared_se.on_rq = 0; + p->dts_shared_se.exec_start = 0; + p->dts_shared_se.sum_exec_runtime = 0; + p->dts_shared_se.prev_sum_exec_runtime = 0; + p->dts_shared_se.nr_migrations = 0; + p->dts_shared_se.vruntime = 0; + INIT_LIST_HEAD(&p->dts_shared_se.group_node); +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = NULL; #endif @@ -3315,6 +3338,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) init_entity_runnable_average(&p->se); +#ifdef CONFIG_DTS + p->by_pass = NONE_BY_PASS; + p->se.by_pass = NONE_BY_PASS; + p->dts_shared_se.by_pass = NONE_BY_PASS; +#endif #ifdef CONFIG_SCHED_INFO if (likely(sched_info_on())) @@ -3702,6 +3730,11 @@ static struct rq *finish_task_switch(struct task_struct *prev) membarrier_mm_sync_core_before_usermode(mm); mmdrop(mm); } +#ifdef CONFIG_DTS + prev->by_pass = NONE_BY_PASS; + prev->se.by_pass = NONE_BY_PASS; + prev->dts_shared_se.by_pass = NONE_BY_PASS; +#endif if (unlikely(prev_state == TASK_DEAD)) { if (prev->sched_class->task_dead) prev->sched_class->task_dead(prev); @@ -3744,7 +3777,7 @@ static void __balance_callback(struct rq *rq) raw_spin_unlock_irqrestore(&rq->lock, flags); } -static inline void balance_callback(struct rq *rq) +inline void balance_callback(struct rq *rq) { if (unlikely(rq->balance_callback)) __balance_callback(rq); @@ -3752,7 +3785,7 @@ static inline void balance_callback(struct rq *rq) #else -static inline void balance_callback(struct rq *rq) +inline void balance_callback(struct rq *rq) { } @@ -3789,7 +3822,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) /* * context_switch - switch to the new MM and the new thread's register state. 
*/ -static __always_inline struct rq * +__always_inline struct rq * context_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next, struct rq_flags *rf) { @@ -3846,7 +3879,7 @@ context_switch(struct rq *rq, struct task_struct *prev, barrier(); return finish_task_switch(prev); -} +}EXPORT_SYMBOL(context_switch); /* * nr_running and nr_context_switches: @@ -4615,7 +4648,7 @@ void __noreturn do_task_dead(void) cpu_relax(); } -static inline void sched_submit_work(struct task_struct *tsk) +inline void sched_submit_work(struct task_struct *tsk) { unsigned int task_flags; @@ -4651,7 +4684,7 @@ static inline void sched_submit_work(struct task_struct *tsk) blk_schedule_flush_plug(tsk); } -static void sched_update_worker(struct task_struct *tsk) +void sched_update_worker(struct task_struct *tsk) { if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { if (tsk->flags & PF_WQ_WORKER) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 30fbcd06baa9..d58a949c9519 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -572,6 +572,28 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less); } +static void __traverse_cfs_rq(struct cfs_rq *cfs_rq, struct rb_node **node) +{ + struct sched_entity *entry; + + if (!*node) { + printk("TREE END\n"); + return; + } + + entry = rb_entry(*node, struct sched_entity, run_node); + + __traverse_cfs_rq(cfs_rq, &(*node)->rb_left); + printk("%p\n", entry); + __traverse_cfs_rq(cfs_rq, &(*node)->rb_left); +} + +void traverse_cfs_rq(struct cfs_rq *cfs_rq) +{ + struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node; + __traverse_cfs_rq(cfs_rq, link); +} + static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline); @@ -2982,7 +3004,7 @@ adjust_rq_cfs_tasks(void (*list_op)(struct list_head *, struct list_head *), } #endif -static void +void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_add(&cfs_rq->load, se->load.weight); @@ -4340,7 +4362,11 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_stats_enqueue(cfs_rq, se, flags); check_spread(cfs_rq, se); if (!curr) - __enqueue_entity(cfs_rq, se); +#ifdef CONFIG_DTS + if (se->by_pass != INIT_BY_PASS) +#endif + __enqueue_entity(cfs_rq, se); + se->on_rq = 1; /* @@ -4463,6 +4489,12 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) unsigned long ideal_runtime, delta_exec; struct sched_entity *se; s64 delta; +#ifdef CONFIG_DTS + struct task_struct *curr_task = NULL; + + if (entity_is_task(curr) && curr->by_pass != NONE_BY_PASS) + curr_task = task_of_dts_shared_se(curr); +#endif ideal_runtime = sched_slice(cfs_rq, curr); delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; @@ -4488,7 +4520,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) * re-elected due to buddy favours. */ clear_buddies(cfs_rq, curr); - return; + goto end; } /* @@ -4497,19 +4529,72 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) * This also mitigates buddy induced latencies under load. 
*/ if (delta_exec < sysctl_sched_min_granularity) - return; + goto end; se = __pick_first_entity(cfs_rq); delta = curr->vruntime - se->vruntime; if (delta < 0) - return; + goto end; - if (delta > ideal_runtime) + if (delta > ideal_runtime) { resched_curr(rq_of(cfs_rq)); + goto end; + } else { + return; + } +end: +#ifdef CONFIG_DTS + if (curr_task) { + curr_task->by_pass = END_BY_PASS; + curr_task->se.by_pass = END_BY_PASS; + curr_task->dts_shared_se.by_pass = END_BY_PASS; + } +#endif } -static void +#ifdef CONFIG_DTS +/* + * We dequeue the task original se but we do NOT CHANGE any schedule infomation of se. + * Correspondingly, enqueue the task original se without any changes on se's information + * when the shared se expired. // TODO + * shared se's stats acquiring, etc NEEDs TO BE fixed when task execute in DTS mode. // TODO + */ +void +replace_shared_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, struct sched_entity *shared_se) +{ + if (shared_se->on_rq) { + /* + * Any task has to be enqueued before it get to execute on + * a CPU. So account for the time it spent waiting on the + * runqueue. + */ + // TODO + update_stats_wait_end(cfs_rq, shared_se); + __dequeue_entity(cfs_rq, se); /* the se of next task should be dequeued */ + update_load_avg(cfs_rq, shared_se, UPDATE_TG); + } + + update_stats_curr_start(cfs_rq, shared_se); + cfs_rq->curr = shared_se; // 后续update_curr是update cfs_rq->curr + + /* + * Track our maximum slice length, if the CPU's load is at + * least twice that of our own weight (i.e. dont track it + * when there are only lesser-weight tasks around): + */ + if (schedstat_enabled() && + rq_of(cfs_rq)->cfs.load.weight >= 2*shared_se->load.weight) { + schedstat_set(shared_se->statistics.slice_max, + max((u64)schedstat_val(shared_se->statistics.slice_max), + shared_se->sum_exec_runtime - shared_se->prev_sum_exec_runtime)); + } + + shared_se->prev_sum_exec_runtime = shared_se->sum_exec_runtime; +} +#endif + +void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { /* 'current' is not kept within the tree. */ @@ -4605,8 +4690,15 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); +/* the prev's value is unique or shared for the dts mechanism */ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) { +#ifdef CONFIG_DTS + struct task_struct *task = NULL; + + if (entity_is_task(prev)) + task = task_of(prev); +#endif /* * If still on the runqueue then deactivate_task() * was not called and update_curr() has to be done: @@ -4627,6 +4719,13 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) update_load_avg(cfs_rq, prev, 0); } cfs_rq->curr = NULL; +#ifdef CONFIG_DTS + if (task && task->by_pass == END_BY_PASS) { + task->by_pass = NONE_BY_PASS; + task->se.by_pass = NONE_BY_PASS; + task->dts_shared_se.by_pass = NONE_BY_PASS; + } +#endif } static void @@ -5630,6 +5729,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) int task_new = !(flags & ENQUEUE_WAKEUP); unsigned int prev_nr = rq->cfs.h_nr_running; +#ifdef CONFIG_DTS + if (p->by_pass != NONE_BY_PASS) { + se = &p->dts_shared_se; + } +#endif + /* * The code below (indirectly) updates schedutil which looks at * the cfs_rq utilization to select a frequency. 
@@ -5737,11 +5842,17 @@ static void set_next_buddy(struct sched_entity *se); static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; - struct sched_entity *se = &p->se; + struct sched_entity *se; int task_sleep = flags & DEQUEUE_SLEEP; int idle_h_nr_running = task_has_idle_policy(p); unsigned int prev_nr = rq->cfs.h_nr_running; bool was_sched_idle = sched_idle_rq(rq); +#ifdef CONFIG_DTS + if (p->by_pass != NONE_BY_PASS) + se = &p->dts_shared_se; + else +#endif + se = &p->se; util_est_dequeue(&rq->cfs, p); @@ -7159,11 +7270,28 @@ static void set_skip_buddy(struct sched_entity *se) static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) { struct task_struct *curr = rq->curr; - struct sched_entity *se = &curr->se, *pse = &p->se; + struct sched_entity *se, *pse; struct cfs_rq *cfs_rq = task_cfs_rq(curr); int scale = cfs_rq->nr_running >= sched_nr_latency; int next_buddy_marked = 0; +#ifdef CONFIG_DTS + int curr_by_pass = curr->by_pass; + int p_by_pass = p->by_pass; + + if (curr_by_pass != NONE_BY_PASS) + se = &curr->dts_shared_se; + else +#endif + se = &curr->se; + +#ifdef CONFIG_DTS + if (p_by_pass != NONE_BY_PASS) + pse = &p->dts_shared_se; + else +#endif + pse = &p->se; + if (unlikely(se == pse)) return; @@ -7718,13 +7846,25 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf p = task_of(se); + if (se == NULL) { + printk("CFS_RQ Nr_running: %d\n", rq->cfs.nr_running); + printk("RQ Nr_running: %d\n", rq->nr_running); + } + /* * Since we haven't yet done put_prev_entity and if the selected task * is a different task than we started out with, try and touch the * least amount of cfs_rqs. */ if (prev != p) { - struct sched_entity *pse = &prev->se; + struct sched_entity *pse; +#ifdef CONFIG_DTS + if (prev->by_pass != NONE_BY_PASS) + pse = &prev->dts_shared_se; + else +#endif + pse = &prev->se; + while (!(cfs_rq = is_same_group(se, pse))) { int se_depth = se->depth; @@ -7877,8 +8017,15 @@ static struct task_struct *__pick_next_task_fair(struct rq *rq) */ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) { - struct sched_entity *se = &prev->se; + struct sched_entity *se; struct cfs_rq *cfs_rq; +#ifdef CONFIG_DTS + if (prev->by_pass != NONE_BY_PASS) + se = &prev->dts_shared_se; + else +#endif + se = &prev->se; + for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); @@ -7895,7 +8042,13 @@ static void yield_task_fair(struct rq *rq) { struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); - struct sched_entity *se = &curr->se; + struct sched_entity *se; +#ifdef CONFIG_DTS + if (curr->by_pass != NONE_BY_PASS) + se = &curr->dts_shared_se; + else +#endif + se = &curr->se; /* * Are we the only task in the tree? @@ -7926,6 +8079,13 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) { struct sched_entity *se = &p->se; +#ifdef CONFIG_DTS + /* DTS tasks DO NOT support being executed by yeild_to method.*/ + if (p->by_pass != NONE_BY_PASS) { + return false; + } +#endif + /* throttled hierarchies are not runnable */ if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se))) return false; @@ -8363,7 +8523,7 @@ can_migrate_task_llc(struct task_struct *p, struct rq *rq, struct rq *dst_rq) /* * detach_task() -- detach the task for the migration from @src_rq to @dst_cpu. 
*/ -static void detach_task(struct task_struct *p, struct rq *src_rq, int dst_cpu) +void detach_task(struct task_struct *p, struct rq *src_rq, int dst_cpu) { lockdep_assert_held(&src_rq->lock); @@ -8573,6 +8733,10 @@ static void attach_task(struct rq *rq, struct task_struct *p) BUG_ON(task_rq(p) != rq); activate_task(rq, p, ENQUEUE_NOCLOCK); + +#ifdef CONFIG_DTS + if (p->by_pass != INIT_BY_PASS) +#endif check_preempt_curr(rq, p, 0); } @@ -11544,6 +11708,53 @@ static int steal_from(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked, return stolen; } +int steal_task(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked, + struct task_struct *tsk) +{ + struct rq_flags rf; + int stolen = 0; + int dst_cpu = dst_rq->cpu; + struct rq *src_rq = task_rq(tsk); + int src_cpu = task_cpu(tsk); + + if (!steal_enabled()) + return 0; + + if (!cpu_active(dst_cpu)) + return 0; + + if (dst_cpu == src_cpu) + return 0; + + if (*locked) { + rq_unpin_lock(dst_rq, dst_rf); + raw_spin_unlock(&dst_rq->lock); + *locked = false; + } + rq_lock_irqsave(src_rq, &rf); + update_rq_clock(src_rq); + + if (!cpu_active(src_cpu)) + tsk = NULL; + else + detach_task(tsk, src_rq, dst_cpu); + + rq_unlock(src_rq, &rf); + + if (tsk) { + raw_spin_lock(&dst_rq->lock); + rq_repin_lock(dst_rq, dst_rf); + *locked = true; + update_rq_clock(dst_rq); + attach_task(dst_rq, tsk); + stolen = 1; + schedstat_inc(dst_rq->steal); + } + local_irq_restore(rf.flags); + + return stolen; +} + /* * Conservative upper bound on the max cost of a steal, in nsecs (the typical * cost is 1-2 microsec). Do not steal if average idle time is less. @@ -11653,6 +11864,12 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) struct cfs_rq *cfs_rq; struct sched_entity *se = &curr->se; +#ifdef CONFIG_DTS + if (curr->by_pass != NONE_BY_PASS) { + se = &curr->dts_shared_se; + } +#endif + for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); entity_tick(cfs_rq, se, queued); @@ -12148,6 +12365,81 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task return rr_interval; } +void update_before_bypass(void) +{ + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + struct sched_entity *curr; + struct cfs_rq *cfs_rq; + +#ifdef CONFIG_DTS + if (current->by_pass != NONE_BY_PASS) + curr = ¤t->dts_shared_se; + else +#endif + curr = ¤t->se; + + cfs_rq = cfs_rq_of(curr); + + rq_lock(rq, &rf); + update_rq_clock(rq); + + /* + * Ensure that runnable average is periodically updated. + */ + update_load_avg(cfs_rq, curr, UPDATE_TG); + update_cfs_group(curr); + + /* + * Update run-time statistics of the 'current'. + */ + update_curr(cfs_rq); + + /* + * Ensure that runnable average is periodically updated. + */ + update_load_avg(cfs_rq, curr, UPDATE_TG); + update_cfs_group(curr); + + rq_unlock(rq, &rf); +} + +/* + * return 1: left time Y + * + */ +int check_task_left_time(struct task_struct *task) +{ + unsigned long ideal_runtime, delta_exec; + struct sched_entity *se; + struct cfs_rq *cfs_rq; +#ifdef CONFIG_DTS + if (task->by_pass != NONE_BY_PASS) + se = &task->dts_shared_se; + else +#endif + se = &task->se; + + cfs_rq = cfs_rq_of(se); + + ideal_runtime = sched_slice(cfs_rq, se); + delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime; + if (delta_exec > ideal_runtime) { + if (cfs_rq->nr_running > 1) { + resched_curr(rq_of(cfs_rq)); + /* + * The current task ran long enough, ensure it doesn't get + * re-elected due to buddy favours. 
+ */ + clear_buddies(cfs_rq, se); + } + return 0; + } + + return 1; +} + /* * All the scheduling class methods: */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 976cdb36fe08..456dfd0abf8d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -534,6 +534,12 @@ extern void sched_offline_group(struct task_group *tg); extern void sched_move_task(struct task_struct *tsk); +#ifdef CONFIG_DTS +extern void replace_shared_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev_se, struct sched_entity *shared_se); +#endif + +extern void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se); + #ifdef CONFIG_FAIR_GROUP_SCHED extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); @@ -1185,9 +1191,22 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define raw_rq() raw_cpu_ptr(&runqueues) #ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_DTS +static inline struct task_struct *task_of_dts_shared_se(struct sched_entity *dts_shared_se) +{ + SCHED_WARN_ON(!entity_is_task(dts_shared_se)); + return container_of(dts_shared_se, struct task_struct, dts_shared_se); +} +#endif + static inline struct task_struct *task_of(struct sched_entity *se) { SCHED_WARN_ON(!entity_is_task(se)); +#ifdef CONFIG_DTS + if (se->by_pass != NONE_BY_PASS) + return task_of_dts_shared_se(se); + else +#endif return container_of(se, struct task_struct, se); } @@ -1210,8 +1229,28 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) #else +#ifdef CONFIG_DTS +static inline struct task_struct *task_of_dts_shared_se(struct sched_entity *dts_shared_se) +{ + return container_of(dts_shared_se, struct task_struct, dts_shared_se); +} + +static inline struct cfs_rq *cfs_rq_of_dts_shared_se(struct sched_entity *se) +{ + struct task_struct *p = task_of_dts_shared_se(se); + struct rq *rq = task_rq(p); + + return &rq->cfs; +} +#endif + static inline struct task_struct *task_of(struct sched_entity *se) { +#ifdef CONFIG_DTS + if (se->by_pass != NONE_BY_PASS) + return task_of_dts_shared_se(se); + else +#endif return container_of(se, struct task_struct, se); } @@ -1220,7 +1259,7 @@ static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) return &task_rq(p)->cfs; } -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +static inline struct cfs_rq *cfs_rq_of_se(struct sched_entity *se) { struct task_struct *p = task_of(se); struct rq *rq = task_rq(p); @@ -1228,6 +1267,17 @@ static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) return &rq->cfs; } +static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +{ +#ifdef CONFIG_DTS + if (se->by_pass != NONE_BY_PASS) + return cfs_rq_of_dts_shared_se(se); + else +#endif + return cfs_rq_of_se(se); + +} + /* runqueue "owned" by this group */ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) { @@ -2404,6 +2454,7 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); +extern void traverse_cfs_rq(struct cfs_rq *cfs_rq); #ifdef CONFIG_SCHED_DEBUG extern bool sched_debug_enabled; @@ -2789,3 +2840,18 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) void swake_up_all_locked(struct swait_queue_head *q); void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); + +#ifdef CONFIG_DTS +extern void sched_submit_work(struct task_struct *tsk); +extern void sched_update_worker(struct task_struct *tsk); +extern 
struct rq *context_switch(struct rq *rq, struct task_struct *prev, + struct task_struct *next, struct rq_flags *rf); +extern void +account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se); +#ifdef CONFIG_SCHED_STEAL +extern int steal_task(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked, + struct task_struct *tsk); +extern void update_before_bypass(void); +extern void balance_callback(struct rq *rq); +#endif +#endif \ No newline at end of file diff --git a/tools/testing/selftests/futex/functional/futex_swap.c b/tools/testing/selftests/futex/functional/futex_swap.c index 9034d04372d3..8ce0266fcbc3 100644 --- a/tools/testing/selftests/futex/functional/futex_swap.c +++ b/tools/testing/selftests/futex/functional/futex_swap.c @@ -13,11 +13,11 @@ /* The futex the main thread waits on. */ futex_t futex_main = FUTEX_INITIALIZER; -/* The futex the other thread wats on. */ +/* The futex the other thread waits on. */ futex_t futex_other = FUTEX_INITIALIZER; /* The number of iterations to run (>1 => run benchmarks. */ -static int cfg_iterations = 1; +static int cfg_iterations = 5; /* If != 0, print diagnostic messages. */ static int cfg_verbose; @@ -28,17 +28,21 @@ static int cfg_validate = 1; /* How to swap threads. */ #define SWAP_WAKE_WAIT 1 #define SWAP_SWAP 2 +#define SWAP_SWAP_DTS 4 /* Futex values. */ #define FUTEX_WAITING 0 #define FUTEX_WAKEUP 1 +#define FUTEX_FLAGS_DTS_MODE 512 + /* An atomic counter used to validate proper swapping. */ static atomic_t validation_counter; void futex_swap_op(int mode, futex_t *futex_this, futex_t *futex_that) { int ret; + int flags = 0; switch (mode) { case SWAP_WAKE_WAIT: @@ -52,11 +56,14 @@ void futex_swap_op(int mode, futex_t *futex_this, futex_t *futex_that) } break; + case SWAP_SWAP_DTS: + flags |= FUTEX_FLAGS_DTS_MODE; case SWAP_SWAP: + flags |= FUTEX_PRIVATE_FLAG; futex_set(futex_this, FUTEX_WAITING); futex_set(futex_that, FUTEX_WAKEUP); ret = futex_swap(futex_this, FUTEX_WAITING, NULL, - futex_that, FUTEX_PRIVATE_FLAG); + futex_that, flags); if (ret < 0 && errno == ENOSYS) { /* futex_swap not implemented */ perror("futex_swap"); @@ -171,13 +178,14 @@ void usage(char *prog) printf(" -i N Use N iterations to benchmark\n"); printf(" -n Do not validate swapping correctness\n"); printf(" -v Print diagnostic messages\n"); + printf(" -d Benchmark with the direct-thread-switch(DTS) mechanism\n"); } int main(int argc, char *argv[]) { int c; - while ((c = getopt(argc, argv, "hi:nv")) != -1) { + while ((c = getopt(argc, argv, "hi:nvd")) != -1) { switch (c) { case 'h': usage(basename(argv[0])); @@ -191,6 +199,9 @@ int main(int argc, char *argv[]) case 'v': cfg_verbose = 1; break; + case 'd': + goto dts_test; + break; default: usage(basename(argv[0])); exit(1); @@ -205,5 +216,10 @@ int main(int argc, char *argv[]) run_test(SWAP_SWAP); printf("PASS\n"); +dts_test: + printf("\n\n---- running SWAP_SWAP with the direct-thread-switch(DTS) mechanism ----\n\n"); + run_test(SWAP_SWAP_DTS); + printf("PASS\n"); + return 0; } -- Gitee
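A note on running the updated selftest: the new -d option makes option
parsing jump straight to the DTS run, so it should be passed after any -i
option, e.g.:

$ ./futex_swap -i 100000 -d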