diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 4e1e8dacb3b82716d3a05004c1b0aa042be6cc4b..ceafa4be443a58dec9692aedfd95b899a932bdad 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -977,6 +977,7 @@ CONFIG_BLK_WBT_MQ=y # CONFIG_BLK_CGROUP_FC_APPID is not set CONFIG_BLK_CGROUP_IOCOST=y CONFIG_BLK_CGROUP_LEGACY_IOCOST=y +CONFIG_BLK_CGROUP_IOINFLIGHT=y # CONFIG_BLK_CGROUP_IOPRIO is not set CONFIG_BLK_DEBUG_FS=y CONFIG_BLK_DEBUG_FS_ZONED=y diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 1ca21b317a43a3d414458868f4073a7c9516e6cd..2f625e87da684ff6e922a64ad988c6153a9d87df 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -990,6 +990,7 @@ CONFIG_BLK_WBT_MQ=y # CONFIG_BLK_CGROUP_FC_APPID is not set CONFIG_BLK_CGROUP_IOCOST=y CONFIG_BLK_CGROUP_LEGACY_IOCOST=y +CONFIG_BLK_CGROUP_IOINFLIGHT=y # CONFIG_BLK_CGROUP_IOPRIO is not set CONFIG_BLK_DEBUG_FS=y CONFIG_BLK_DEBUG_FS_ZONED=y diff --git a/block/Kconfig b/block/Kconfig index 7018fdcaa459308ba68fd96fea6ca6dde6a5665f..1d338261b751b98efb85b088d03cfc22ec450fac 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -204,6 +204,16 @@ config BLK_CGROUP_LEGACY_IOCOST If unsure, say N. +config BLK_CGROUP_IOINFLIGHT + bool "Enable support for inflight based cgroup IO controller" + depends on BLK_CGROUP + select BLK_RQ_ALLOC_TIME + help + Enabling this option enables the .inf.qos interface for inflight + based proportional IO control. The IO controller distributes IO + capacity between different groups based on their share of the + overall weight distribution. + config BLK_CGROUP_IOPRIO bool "Cgroup I/O controller for assigning an I/O priority class" depends on BLK_CGROUP diff --git a/block/Makefile b/block/Makefile index 400731b162c08417772005071ead71e797915845..3585999387571c4909811d33a15946603e77a2fa 100644 --- a/block/Makefile +++ b/block/Makefile @@ -21,6 +21,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o obj-$(CONFIG_BLK_CGROUP_IOPRIO) += blk-ioprio.o obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o obj-$(CONFIG_BLK_CGROUP_IOCOST) += blk-iocost.o +obj-$(CONFIG_BLK_CGROUP_IOINFLIGHT) += blk-ioinf.o obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o diff --git a/block/blk-ioinf.c b/block/blk-ioinf.c new file mode 100644 index 0000000000000000000000000000000000000000..446a6a0f8b8433826d5ec1fc8d057608a1ab6af0 --- /dev/null +++ b/block/blk-ioinf.c @@ -0,0 +1,1315 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * IO inflight relative controller + */ + +#include +#include +#include +#include +#include +#include + +#include "blk-cgroup.h" +#include "blk-rq-qos.h" +#include "blk-mq.h" + +#define IOINFG_WEIGHT_UNINIT (CGROUP_WEIGHT_MAX + 1) +#define IOINF_MIN_INFLIGHT 3 +#define IOINFG_MIN_INFLIGHT 1 +/* default wake-up time in jiffies for backgroup job, see ioinf_timer_fn() */ +#define IOINF_TIMER_PERID (HZ / 2) +/* Minimum wait queue count for offline cgroups. */ +#define IOINFG_MIN_WQ_NR 8 +/* minimal number of samples for congestion control */ +#define IOINF_MIN_SAMPLES 100 + +/* scale inflight from 1/1000 to 100 */ +enum { + MIN_SCALE = 1, /* one thousandth. */ + DFL_SCALE = 100, /* one tenth. */ + SCALE_GRAN = 1000, /* The control granularity is 1/1000. */ + MAX_SCALE = 100000, /* A hundredfold. 
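+	 * For example, with params.inflight = 32 the effective budget
+	 * params.inflight * scale / SCALE_GRAN ranges from
+	 * 32 * 1 / 1000 (clamped up to IOINF_MIN_INFLIGHT) to
+	 * 32 * 100000 / 1000 = 3200 requests.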
*/ +}; + +/* io.inf.qos controls */ +enum { + INF_ENABLE, + INF_INFLIGHT, + INF_FLAGS, + + QOS_ENABLE, + QOS_RLAT, + QOS_WLAT, + QOS_RPCT, + QOS_WPCT, + + NR_QOS_CTRL_PARAMS +}; + +/* qos control params */ +struct ioinf_params { + bool enabled; + bool qos_enabled; + u32 inflight; + unsigned long flags; + u64 rlat; + u64 wlat; + u32 rpct; + u32 wpct; +}; + +struct ioinf_io_stat { + u64 nr; + u64 lat; + u64 met; +}; + +struct ioinf_lat_stat { + struct ioinf_io_stat read; + struct ioinf_io_stat write; +}; + +struct ioinf_rq_wait { + wait_queue_head_t *wait; + u32 wq_nr; + atomic_t next_wq; + atomic_t sleepers; + + atomic_t inflight; + u32 hinflight; + u32 max_inflight; + u32 last_max; + u32 exhausted; + u32 issued; +}; + +/* the global conrtol structure */ +struct ioinf { + struct rq_qos rqos; + + struct ioinf_params params; + u32 inflight; + u32 scale; + u32 old_scale; + u32 max_scale; + u32 scale_step; + + /* default time for ioinf_timer_fn */ + unsigned long inf_timer_perid; + struct timer_list inf_timer; + + /* global lock */ + spinlock_t lock; + + /* for offline cgroups */ + struct ioinf_rq_wait offline; + /* for online cgroups */ + struct ioinf_rq_wait online; + + /* timer for ioinf_wakeup_timer_fn */ + struct hrtimer wakeup_timer; + bool waking; + + struct ioinf_lat_stat last_stat; + struct ioinf_lat_stat cur_stat; + struct ioinf_lat_stat delta_stat; + struct ioinf_lat_stat __percpu *stat; +}; + +/* per disk-cgroup pair structure */ +struct ioinf_gq { + struct blkg_policy_data pd; + struct ioinf *inf; + + /* weight < 0: offline; weight > 0: online; weight == 0: unset */ + int user_weight; + int dfl_user_weight; +}; + +/* per cgroup structure, used to record default weight for all disks */ +struct ioinf_cgrp { + struct blkcg_policy_data cpd; + + /* weight < 0: offline; weight > 0: online; weight == 0: unset */ + int dfl_user_weight; +}; + +/* io-inflight flags bit */ +enum { + /* + * Cgroups with unset weight are not throttled and latency is not + * recorded. Without this flag, such cgroups are treated as offline. + */ + DEFAULT_NOLIMIT, + + /* If QoS not met, also throttle online, trading BW for latency. */ + THROTTLE_ONLINE, + + NR_INF_FLAGS +}; + +static inline int inf_test_flag(struct ioinf *inf, int bit) +{ + return test_bit(bit, &inf->params.flags); +} + +static int infg_user_weight(struct ioinf_gq *infg) +{ + if (infg->user_weight) + return infg->user_weight; + + /* if user doesn't set per disk weight, use the cgroup default weight */ + if (infg->dfl_user_weight) + return infg->dfl_user_weight; + + /* No limit for Cgroups with unset weight */ + if (inf_test_flag(infg->inf, DEFAULT_NOLIMIT)) + return 0; + + /* Cgroups with unset weight are treated as offline. 
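+	 * Returning -1 makes infg_offline() true for them, so they are
+	 * throttled against the offline inflight budget, which online
+	 * groups may reclaim from when they run short (see
+	 * ioinf_inflight_cb()).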
*/ + return -1; +} + +static bool infg_offline(struct ioinf_gq *infg) +{ + return infg_user_weight(infg) < 0; +} + +static bool infg_nolimit(struct ioinf_gq *infg) +{ + return infg_user_weight(infg) == 0; +} + +static struct ioinf *rqos_to_inf(struct rq_qos *rqos) +{ + return container_of(rqos, struct ioinf, rqos); +} + +static struct ioinf *q_to_inf(struct request_queue *q) +{ + return rqos_to_inf(rq_qos_id(q, RQ_QOS_INFLIGHT)); +} + +static struct ioinf_gq *pd_to_infg(struct blkg_policy_data *pd) +{ + if (!pd) + return NULL; + + return container_of(pd, struct ioinf_gq, pd); +} + +static struct blkcg_policy blkcg_policy_ioinf; + +static struct ioinf_gq *blkg_to_infg(struct blkcg_gq *blkg) +{ + return pd_to_infg(blkg_to_pd(blkg, &blkcg_policy_ioinf)); +} + +static struct ioinf_cgrp *blkcg_to_infcg(struct blkcg *blkcg) +{ + struct blkcg_policy_data *cpd = + blkcg_to_cpd(blkcg, &blkcg_policy_ioinf); + + return container_of(cpd, struct ioinf_cgrp, cpd); +} + +static struct blkcg_gq *ioinf_bio_blkg(struct bio *bio) +{ + struct blkcg_gq *blkg = bio->bi_blkg; + + if (!blkg || !blkg->online) + return NULL; + + if (blkg->blkcg->css.cgroup->level == 0) + return NULL; + + return blkg; +} + +static struct ioinf_gq *ioinf_bio_infg(struct bio *bio) +{ + struct ioinf_gq *infg; + struct blkcg_gq *blkg = ioinf_bio_blkg(bio); + + if (!blkg) + return NULL; + + infg = blkg_to_infg(blkg); + if (!infg) + return NULL; + + return infg; +} + +static void ioinf_set_hinflight(struct ioinf_rq_wait *rqw, u32 new) +{ + rqw->hinflight = new; + rqw->last_max = max(rqw->last_max >> 1, rqw->max_inflight); + rqw->max_inflight = IOINFG_MIN_INFLIGHT; +} + +static inline void ioinf_rqw_wake_up_all(struct ioinf_rq_wait *rqw) +{ + if (!atomic_read(&rqw->sleepers)) + return; + + for (int i = 0; i < rqw->wq_nr; i++) + wake_up_all(&rqw->wait[i]); +} + +static void ioinf_wake_up_all(struct ioinf *inf) +{ + ioinf_rqw_wake_up_all(&inf->online); + ioinf_rqw_wake_up_all(&inf->offline); +} + +static enum hrtimer_restart ioinf_wakeup_timer_fn(struct hrtimer *timer) +{ + struct ioinf *inf = container_of(timer, struct ioinf, wakeup_timer); + + WRITE_ONCE(inf->waking, false); + ioinf_wake_up_all(inf); + + return HRTIMER_NORESTART; +} + +void ioinf_done(struct ioinf *inf, struct ioinf_rq_wait *rqw) +{ + int inflight; + + if (!inf->params.enabled) + return; + + inflight = atomic_dec_return(&rqw->inflight); + if (inflight >= (int)rqw->hinflight) + return; + + if (!READ_ONCE(inf->waking) && atomic_read(&rqw->sleepers)) { + WRITE_ONCE(inf->waking, true); + hrtimer_start(&inf->wakeup_timer, 0, HRTIMER_MODE_REL); + } +} + +struct ioinf_rq_qos_wait_data { + struct wait_queue_entry wq; + struct task_struct *task; + struct ioinf_rq_wait *rqw; + struct ioinf *inf; + bool is_prio; + bool do_wakeup; + bool got_token; +}; + +static bool ioinf_inflight_cb(struct ioinf_rq_qos_wait_data *data) +{ + struct ioinf *inf = data->inf; + struct ioinf_rq_wait *rqw = data->rqw; + u32 inflight; + u32 sleepers = 0; + + if (!inf->params.enabled) + return true; + + if (!data->do_wakeup) + sleepers = atomic_read(&rqw->sleepers); +retry: + /* + * IOs which may cause priority inversions are + * dispatched directly, even if they're over limit. 
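+	 * They still bump rqw->inflight and rqw->issued below, so the
+	 * next ioinf_timer_fn() window accounts for them when the
+	 * budgets are recalculated.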
+ */ + inflight = atomic_read(&rqw->inflight); + if (inflight + sleepers < rqw->hinflight || data->is_prio) { + inflight = atomic_inc_return(&rqw->inflight); + + if (inflight > rqw->max_inflight) + rqw->max_inflight = inflight; + rqw->issued++; + return true; + } + + rqw->max_inflight = max(rqw->max_inflight, inflight + 1); + if (rqw == &inf->offline) { + rqw->exhausted++; + return false; + } + + if (inf->offline.hinflight > IOINFG_MIN_INFLIGHT) { + /* Reclaim half of the inflight budget from offline groups. */ + inf->offline.hinflight = inf->offline.hinflight >> 1; + inf->online.hinflight = inf->inflight - inf->offline.hinflight; + goto retry; + } + + rqw->exhausted++; + /* wake up ioinf_timer_fn() immediately to adjust scale */ + if (inf->scale < inf->max_scale || !inf_test_flag(inf, THROTTLE_ONLINE)) + timer_reduce(&inf->inf_timer, jiffies + 1); + return false; +} + +static int ioinf_wake_fn(struct wait_queue_entry *curr, + unsigned int mode, int wake_flags, void *key) +{ + struct ioinf_rq_qos_wait_data *data = container_of(curr, + struct ioinf_rq_qos_wait_data, wq); + + /* + * If we fail to get a budget, return -1 to interrupt + * the wake up loop in __wake_up_common. + */ + if (!ioinf_inflight_cb(data)) + return -1; + + data->got_token = true; + wake_up_process(data->task); + list_del_init_careful(&curr->entry); + return 1; +} + +static void ioinf_throttle(struct ioinf *inf, struct ioinf_rq_wait *rqw, + bool is_prio) +{ + bool has_sleeper; + u32 wq_idx; + struct ioinf_rq_qos_wait_data data = { + .wq = { + .func = ioinf_wake_fn, + .entry = LIST_HEAD_INIT(data.wq.entry), + }, + .task = current, + .rqw = rqw, + .inf = inf, + .is_prio = is_prio, + .do_wakeup = false, + }; + + if (!timer_pending(&inf->inf_timer)) + timer_reduce(&inf->inf_timer, jiffies + inf->inf_timer_perid); + + if (ioinf_inflight_cb(&data)) + return; + + data.do_wakeup = true; + wq_idx = atomic_fetch_inc(&rqw->next_wq) % rqw->wq_nr; + has_sleeper = !prepare_to_wait_exclusive(&rqw->wait[wq_idx], &data.wq, + TASK_UNINTERRUPTIBLE); + atomic_inc(&rqw->sleepers); + do { + /* The memory barrier in set_task_state saves us here. */ + if (data.got_token) + break; + if (!has_sleeper && ioinf_inflight_cb(&data)) { + finish_wait(&rqw->wait[wq_idx], &data.wq); + + /* + * We raced with rq_qos_wake_function() getting a token, + * which means we now have two. Put our local token + * and wake anyone else potentially waiting for one. 
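+	 * (This mirrors rq_qos_wait(); it is open-coded here so the
+	 * per-ioinf_rq_wait array of wait queues can be used.)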
+ */ + if (data.got_token) + ioinf_done(inf, rqw); + break; + } + io_schedule(); + has_sleeper = true; + set_current_state(TASK_UNINTERRUPTIBLE); + } while (1); + + finish_wait(&rqw->wait[wq_idx], &data.wq); + atomic_dec(&rqw->sleepers); +} + +static void ioinf_rqos_throttle(struct rq_qos *rqos, struct bio *bio) +{ + struct ioinf *inf = rqos_to_inf(rqos); + struct ioinf_gq *infg = ioinf_bio_infg(bio); + bool is_prio; + + if (!inf->params.enabled || !infg || infg_nolimit(infg)) + return; + + is_prio = bio_issue_as_root_blkg(bio) || fatal_signal_pending(current); + + if (infg_offline(infg)) { + ioinf_throttle(inf, &inf->offline, is_prio); + return; + } + + if (!inf->online.issued && !inf->params.qos_enabled) + inf->max_scale = inf->scale = inf->old_scale = SCALE_GRAN; + ioinf_throttle(inf, &inf->online, is_prio); +} + +static void ioinf_rqos_track(struct rq_qos *rqos, struct request *rq, + struct bio *bio) +{ + struct blkcg_gq *blkg = ioinf_bio_blkg(bio); + + if (!blkg) + return; + + rq->blkg = blkg; +} + +static void ioinf_record_lat(struct ioinf *inf, struct request *rq) +{ + u64 lat; + + lat = rq->io_end_time_ns ? rq->io_end_time_ns : blk_time_get_ns(); + lat -= rq->alloc_time_ns; + + switch (req_op(rq)) { + case REQ_OP_READ: + this_cpu_inc(inf->stat->read.nr); + this_cpu_add(inf->stat->read.lat, lat); + if (inf->params.qos_enabled && lat <= inf->params.rlat) + this_cpu_inc(inf->stat->read.met); + break; + case REQ_OP_WRITE: + this_cpu_inc(inf->stat->write.nr); + this_cpu_add(inf->stat->write.lat, lat); + if (inf->params.qos_enabled && lat <= inf->params.wlat) + this_cpu_inc(inf->stat->write.met); + break; + default: + break; + } +} + +static void ioinf_rqos_done_bio(struct rq_qos *rqos, struct bio *bio) +{ + struct blkcg_gq *blkg = ioinf_bio_blkg(bio); + struct ioinf_gq *infg; + struct ioinf *inf; + + if (!blkg || !bio_flagged(bio, BIO_QOS_THROTTLED)) + return; + + infg = blkg_to_infg(blkg); + if (!infg) + return; + + inf = infg->inf; + if (!inf->params.enabled || infg_nolimit(infg)) + return; + + if (infg_offline(infg)) + ioinf_done(inf, &inf->offline); + else + ioinf_done(inf, &inf->online); +} + +static void ioinf_rqos_done(struct rq_qos *rqos, struct request *rq) +{ + struct blkcg_gq *blkg = rq->blkg; + struct ioinf_gq *infg; + + if (!blkg) + return; + + rq->blkg = NULL; + + infg = blkg_to_infg(blkg); + if (!infg || !infg->inf->params.enabled || + infg_offline(infg) || infg_nolimit(infg)) + return; + + ioinf_record_lat(infg->inf, rq); +} + +static void ioinf_rqos_exit(struct rq_qos *rqos) +{ + struct ioinf *inf = rqos_to_inf(rqos); + + blkcg_deactivate_policy(rqos->disk, &blkcg_policy_ioinf); + + hrtimer_cancel(&inf->wakeup_timer); + timer_shutdown_sync(&inf->inf_timer); + ioinf_wake_up_all(inf); + kfree(inf->online.wait); + kfree(inf->offline.wait); + free_percpu(inf->stat); + kfree(inf); +} + +static inline u64 ioinf_qos_met_percent(struct ioinf_io_stat *io_stat) +{ + if (!io_stat->nr) + return 0; + return div_u64(io_stat->met * 100, io_stat->nr); +} + +static int ioinf_stat_show(void *data, struct seq_file *m) +{ + struct rq_qos *rqos = data; + struct ioinf *inf = rqos_to_inf(rqos); + struct ioinf_lat_stat *stat; + + if (!inf->params.enabled) { + seq_puts(m, "\tinf.qos disabled.\n"); + return 0; + } + + spin_lock_irq(&inf->lock); + + seq_printf(m, "scale %u/%u inflight %u->%u\n", + inf->scale, SCALE_GRAN, + inf->params.inflight, inf->inflight); + + seq_printf(m, "online inflight %d/%u, sleepers: %d\n", + atomic_read(&inf->online.inflight), + inf->online.hinflight, 
atomic_read(&inf->online.sleepers)); + seq_printf(m, "offline inflight %d/%u, sleepers: %d\n", + atomic_read(&inf->offline.inflight), + inf->offline.hinflight, atomic_read(&inf->offline.sleepers)); + + stat = &inf->delta_stat; + seq_puts(m, "online average latency:\n"); + seq_printf(m, "(%llu/%llu-%llu-%llu%%) (%llu/%llu-%llu-%llu%%)\n", + stat->read.met, stat->read.nr, stat->read.lat, + ioinf_qos_met_percent(&stat->read), + stat->write.met, stat->write.nr, stat->write.lat, + ioinf_qos_met_percent(&stat->write)); + spin_unlock_irq(&inf->lock); + + return 0; +} + +static const struct blk_mq_debugfs_attr ioinf_debugfs_attrs[] = { + {"stat", 0400, ioinf_stat_show}, + {}, +}; + +static struct rq_qos_ops ioinf_rqos_ops = { + .throttle = ioinf_rqos_throttle, + .done_bio = ioinf_rqos_done_bio, + .done = ioinf_rqos_done, + .track = ioinf_rqos_track, + .exit = ioinf_rqos_exit, + +#ifdef CONFIG_BLK_DEBUG_FS + .debugfs_attrs = ioinf_debugfs_attrs, +#endif +}; + +static void __inflight_scale_up(struct ioinf *inf, u32 aim, bool force) +{ + u32 new_scale; + + inf->old_scale = inf->scale; + if (aim < inf->inflight || inf->scale >= MAX_SCALE) + return; + + new_scale = DIV_ROUND_UP(aim * SCALE_GRAN, inf->params.inflight); + if (new_scale <= inf->old_scale) { + if (!force) + return; + new_scale = inf->scale + inf->scale_step; + } + + inf->scale = umin(new_scale, inf->max_scale); +} + +static void inflight_scale_up(struct ioinf *inf, u32 aim) +{ + __inflight_scale_up(inf, aim, false); +} + +static void inflight_force_scale_up(struct ioinf *inf, u32 aim) +{ + __inflight_scale_up(inf, aim, true); +} + +static void __inflight_scale_down(struct ioinf *inf, u32 aim, bool force) +{ + u32 new_scale; + + inf->old_scale = inf->scale; + if (inf->inflight <= IOINF_MIN_INFLIGHT || inf->scale <= MIN_SCALE) + return; + + new_scale = DIV_ROUND_UP(aim * SCALE_GRAN, inf->params.inflight); + if (new_scale >= inf->old_scale) { + if (!force) + return; + new_scale = inf->scale - inf->scale_step; + } + + inf->scale = new_scale; +} + +static void inflight_scale_down(struct ioinf *inf, u32 aim) +{ + __inflight_scale_down(inf, aim, false); +} + +static void inflight_force_scale_down(struct ioinf *inf, u32 aim) +{ + __inflight_scale_down(inf, aim, true); +} + +u32 ioinf_calc_budget(struct ioinf_rq_wait *rqw) +{ + u32 new_budget; + u64 exhausted = rqw->exhausted; + u64 issued = rqw->issued; + + new_budget = max(rqw->last_max, rqw->max_inflight); + /* How much budget is needed to avoid 'exhausted'? 
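+	 * The peak estimate is grown by the failure ratio: with
+	 * issued = 100 and exhausted = 25 the budget is bumped by 25%.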
*/ + if (exhausted && issued) + new_budget += div_u64(exhausted * new_budget, issued); + + return new_budget; +} + +static void ioinf_sample_cpu_lat(struct ioinf_lat_stat *cur, int cpu, + struct ioinf_lat_stat __percpu *stat) +{ + struct ioinf_lat_stat *pstat = per_cpu_ptr(stat, cpu); + + cur->read.nr += pstat->read.nr; + cur->read.lat += pstat->read.lat; + cur->read.met += pstat->read.met; + cur->write.nr += pstat->write.nr; + cur->write.lat += pstat->write.lat; + cur->write.met += pstat->write.met; +} + +static void ioinf_update_delta_stat(struct ioinf_lat_stat *cur, + struct ioinf_lat_stat *last, struct ioinf_lat_stat *delta) +{ + delta->read.nr += cur->read.nr - last->read.nr; + delta->read.met += cur->read.met - last->read.met; + delta->read.lat += cur->read.lat - last->read.lat; + if (delta->read.nr > 0) + delta->read.lat = div_u64(delta->read.lat, delta->read.nr); + + delta->write.nr += cur->write.nr - last->write.nr; + delta->write.met += cur->write.met - last->write.met; + delta->write.lat += cur->write.lat - last->write.lat; + if (delta->write.nr > 0) + delta->write.lat = div_u64(delta->write.lat, delta->write.nr); +} + +static void ioinf_sample_lat(struct ioinf *inf) +{ + int cpu; + + inf->last_stat = inf->cur_stat; + memset(&inf->cur_stat, 0, sizeof(struct ioinf_lat_stat)); + for_each_possible_cpu(cpu) + ioinf_sample_cpu_lat(&inf->cur_stat, cpu, inf->stat); + + if (!inf->params.qos_enabled) + memset(&inf->delta_stat, 0, sizeof(struct ioinf_lat_stat)); + if (inf->delta_stat.read.nr >= IOINF_MIN_SAMPLES) + memset(&inf->delta_stat.read, 0, sizeof(struct ioinf_io_stat)); + if (inf->delta_stat.write.nr >= IOINF_MIN_SAMPLES) + memset(&inf->delta_stat.write, 0, sizeof(struct ioinf_io_stat)); + ioinf_update_delta_stat(&inf->cur_stat, &inf->last_stat, + &inf->delta_stat); +} + +static int ioinf_online_busy(struct ioinf *inf) +{ + struct ioinf_lat_stat *stat = &inf->delta_stat; + int met_percent, unmet_percent = 0; + + if (stat->read.nr >= IOINF_MIN_SAMPLES) { + met_percent = ioinf_qos_met_percent(&stat->read); + unmet_percent = inf->params.rpct - met_percent; + } + if (stat->write.nr >= IOINF_MIN_SAMPLES) { + met_percent = ioinf_qos_met_percent(&stat->write); + if (unmet_percent < inf->params.wpct - met_percent) + unmet_percent = inf->params.wpct - met_percent; + } + + return unmet_percent; +} + +static +void ioinf_update_inflight(struct ioinf *inf, u32 new_online, u32 new_offline) +{ + inf->scale = clamp(inf->scale, MIN_SCALE, MAX_SCALE); + inf->inflight = inf->params.inflight * inf->scale / SCALE_GRAN; + if (inf->inflight < IOINF_MIN_INFLIGHT) { + inf->inflight = IOINF_MIN_INFLIGHT; + inf->scale = inf->inflight * SCALE_GRAN / inf->params.inflight; + } + + if (new_online < inf->inflight) + new_offline = inf->inflight - new_online; + else + new_offline = min(new_offline, IOINFG_MIN_INFLIGHT); + + if (inf_test_flag(inf, THROTTLE_ONLINE)) { + new_online = inf->inflight - new_offline; + } else { + inf->inflight = new_online + new_offline; + inf->scale = inf->inflight * SCALE_GRAN / inf->params.inflight; + } + + ioinf_set_hinflight(&inf->offline, new_offline); + inf->offline.exhausted = 0; + inf->offline.issued = 0; + + ioinf_set_hinflight(&inf->online, new_online); + inf->online.exhausted = 0; + inf->online.issued = 0; + + ioinf_wake_up_all(inf); +} + +static void ioinf_timer_fn(struct timer_list *timer) +{ + struct ioinf *inf = container_of(timer, struct ioinf, inf_timer); + struct ioinf_rq_wait *online = &inf->online; + struct ioinf_rq_wait *offline = &inf->offline; + unsigned long 
flags; + u32 online_budget, offline_budget, total_budget; + int unmet_percent = 0; + + spin_lock_irqsave(&inf->lock, flags); + ioinf_sample_lat(inf); + if (inf->params.qos_enabled) + unmet_percent = ioinf_online_busy(inf); + + online_budget = ioinf_calc_budget(online); + offline_budget = ioinf_calc_budget(offline); + total_budget = online_budget + offline_budget; + + if (unmet_percent < 0 && inf->max_scale < MAX_SCALE) + inf->max_scale++; + + if (unmet_percent > 0) { + inf->max_scale = inf->scale; + if (inf->max_scale > inf->scale_step) + inf->max_scale -= inf->scale_step; + total_budget = umin(online->hinflight + offline->hinflight, + total_budget); + total_budget -= total_budget * unmet_percent / 100; + inflight_force_scale_down(inf, total_budget); + } else if (inf->scale < inf->max_scale && online->exhausted) { + inflight_force_scale_up(inf, total_budget); + if (inf->scale > inf->max_scale) + inf->scale = (inf->old_scale + inf->max_scale + 1) / 2; + } else if (!online->issued && online_budget <= IOINFG_MIN_INFLIGHT) { + inf->max_scale = inf->scale = inf->old_scale = MAX_SCALE; + } else if (inf->scale < inf->max_scale && inf->params.qos_enabled) { + inflight_scale_up(inf, total_budget); + } else if (inf->old_scale < inf->scale) { + inflight_scale_down(inf, total_budget); + } + + ioinf_update_inflight(inf, online_budget, offline_budget); + + spin_unlock_irqrestore(&inf->lock, flags); + mod_timer(&inf->inf_timer, jiffies + inf->inf_timer_perid); +} + +static u32 ioinf_default_inflight(struct ioinf *inf) +{ + u32 inflight = inf->params.inflight * DFL_SCALE / SCALE_GRAN; + + if (inflight < IOINF_MIN_INFLIGHT) + inflight = IOINF_MIN_INFLIGHT; + inf->scale = DIV_ROUND_UP(inflight * SCALE_GRAN, inf->params.inflight); + inf->old_scale = inf->scale; + + return inf->params.inflight * inf->scale / SCALE_GRAN; +} + +static inline int ioinf_rqw_init(struct ioinf_rq_wait *rqw) +{ + int i; + + rqw->wait = kcalloc(rqw->wq_nr, sizeof(wait_queue_head_t), GFP_KERNEL); + if (!rqw->wait) + return -ENOMEM; + + for (i = 0; i < rqw->wq_nr; i++) + init_waitqueue_head(&rqw->wait[i]); + + return 0; +} + +static int blk_ioinf_init(struct gendisk *disk) +{ + struct ioinf *inf; + int ret = -ENOMEM; + + inf = kzalloc(sizeof(*inf), GFP_KERNEL); + if (!inf) + return ret; + + inf->stat = alloc_percpu(struct ioinf_lat_stat); + if (!inf->stat) + goto free_inf; + + inf->offline.wq_nr = umax(num_possible_cpus() / 2, IOINFG_MIN_WQ_NR); + ret = ioinf_rqw_init(&inf->offline); + if (ret) + goto free_stat; + + inf->online.wq_nr = 1; + ret = ioinf_rqw_init(&inf->online); + if (ret) + goto free_wq; + + spin_lock_init(&inf->lock); + inf->params.inflight = disk->queue->nr_requests; + inf->inflight = ioinf_default_inflight(inf); + inf->max_scale = MAX_SCALE; + inf->inf_timer_perid = IOINF_TIMER_PERID; + + inf->offline.hinflight = inf->inflight - IOINFG_MIN_INFLIGHT; + inf->online.hinflight = IOINFG_MIN_INFLIGHT; + + timer_setup(&inf->inf_timer, ioinf_timer_fn, 0); + hrtimer_init(&inf->wakeup_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + inf->wakeup_timer.function = ioinf_wakeup_timer_fn; + inf->waking = false; + + ret = rq_qos_add(&inf->rqos, disk, RQ_QOS_INFLIGHT, &ioinf_rqos_ops); + if (ret) + goto err_cancel_timer; + + ret = blkcg_activate_policy(disk, &blkcg_policy_ioinf); + if (ret) + goto err_del_qos; + return 0; + +err_del_qos: + rq_qos_del(&inf->rqos); +err_cancel_timer: + hrtimer_cancel(&inf->wakeup_timer); + timer_shutdown_sync(&inf->inf_timer); + kfree(inf->online.wait); +free_wq: + kfree(inf->offline.wait); +free_stat: + 
free_percpu(inf->stat); +free_inf: + kfree(inf); + return ret; +} + +static u64 ioinf_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + const char *dname = blkg_dev_name(pd->blkg); + struct ioinf_gq *infg = pd_to_infg(pd); + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct ioinf_cgrp *infcg = blkcg_to_infcg(blkcg); + + if (!infg->inf->params.enabled) + return 0; + + if (dname && infg_user_weight(infg) != infcg->dfl_user_weight) + seq_printf(sf, "%s %d\n", dname, infg_user_weight(infg)); + + return 0; +} + +static int ioinf_weight_show(struct seq_file *sf, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct ioinf_cgrp *infcg = blkcg_to_infcg(blkcg); + + if (infcg->dfl_user_weight == IOINFG_WEIGHT_UNINIT) + return 0; + + seq_printf(sf, "default %d\n", infcg->dfl_user_weight); + blkcg_print_blkgs(sf, blkcg, ioinf_weight_prfill, &blkcg_policy_ioinf, + seq_cft(sf)->private, false); + + return 0; +} + +static void ioinf_default_weight_update(struct blkcg *blkcg, int v) +{ + struct ioinf_cgrp *infcg = blkcg_to_infcg(blkcg); + struct blkcg_gq *blkg; + struct hlist_node *tmp; + struct ioinf_gq *infg; + + if (v == infcg->dfl_user_weight) + return; + + infcg->dfl_user_weight = v; + spin_lock_irq(&blkcg->lock); + hlist_for_each_entry_safe(blkg, tmp, &blkcg->blkg_list, blkcg_node) { + infg = blkg_to_infg(blkg); + if (infg && infg->dfl_user_weight != v) { + spin_unlock_irq(&blkcg->lock); + blk_mq_freeze_queue(infg->inf->rqos.disk->queue); + blk_mq_quiesce_queue(infg->inf->rqos.disk->queue); + infg->dfl_user_weight = v; + blk_mq_unquiesce_queue(infg->inf->rqos.disk->queue); + blk_mq_unfreeze_queue(infg->inf->rqos.disk->queue); + spin_lock_irq(&blkcg->lock); + } + } + spin_unlock_irq(&blkcg->lock); +} + +static void propagate_parent_weights(struct ioinf_gq *root) +{ + struct cgroup_subsys_state *pos_css; + struct blkcg_gq *blkg; + struct ioinf_gq *infg; + + rcu_read_lock(); + blkg_for_each_descendant_pre(blkg, pos_css, pd_to_blkg(&root->pd)) { + infg = blkg_to_infg(blkg); + if (infg && infg->user_weight != root->user_weight) + infg->user_weight = root->user_weight; + } + rcu_read_unlock(); +} + +static int infg_weight_write(struct blkcg *blkcg, char *buf) +{ + struct blkg_conf_ctx ctx; + struct ioinf_gq *infg; + int ret; + int v; + + blkg_conf_init(&ctx, buf); + ret = blkg_conf_prep(blkcg, &blkcg_policy_ioinf, &ctx); + if (ret) { + blkg_conf_exit(&ctx); + return ret; + } + + infg = blkg_to_infg(ctx.blkg); + if (!strncmp(ctx.body, "default", 7)) { + v = infg->dfl_user_weight; + } else if (kstrtoint(ctx.body, 0, &v) || abs(v) > CGROUP_WEIGHT_MAX) { + blkg_conf_exit(&ctx); + return -EINVAL; + } + + spin_unlock_irq(&bdev_get_queue(ctx.bdev)->queue_lock); + blk_mq_freeze_queue(infg->inf->rqos.disk->queue); + blk_mq_quiesce_queue(infg->inf->rqos.disk->queue); + infg->user_weight = v; + propagate_parent_weights(infg); + blk_mq_unquiesce_queue(infg->inf->rqos.disk->queue); + blk_mq_unfreeze_queue(infg->inf->rqos.disk->queue); + spin_lock_irq(&bdev_get_queue(ctx.bdev)->queue_lock); + + blkg_conf_exit(&ctx); + return 0; +} + +static ssize_t ioinf_weight_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct blkcg *blkcg = css_to_blkcg(of_css(of)); + int ret; + + if (!strchr(buf, ':')) { + int v; + + if (sscanf(buf, "default %d", &v) != 1 && kstrtoint(buf, 0, &v)) + return -EINVAL; + + if (abs(v) > CGROUP_WEIGHT_MAX) + return -EINVAL; + + ioinf_default_weight_update(blkcg, v); + return nbytes; + } + + ret = 
infg_weight_write(blkcg, buf); + return ret ? ret : nbytes; +} + +static u64 ioinf_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + const char *dname = blkg_dev_name(pd->blkg); + struct ioinf *inf = q_to_inf(pd->blkg->q); + struct ioinf_params params; + + if (!dname) + return 0; + + params = inf->params; + seq_printf(sf, "%s enable=%d inflight=%u flags=%lu qos_enable=%d", + dname, params.enabled, params.inflight, params.flags, + params.qos_enabled); + + if (inf->params.qos_enabled) + seq_printf(sf, " rlat=%llu rpct=%u wlat=%llu wpct=%u", + params.rlat, params.rpct, params.wlat, params.wpct); + + seq_putc(sf, '\n'); + return 0; +} + +static int ioinf_qos_show(struct seq_file *sf, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + + blkcg_print_blkgs(sf, blkcg, ioinf_qos_prfill, + &blkcg_policy_ioinf, seq_cft(sf)->private, false); + return 0; +} + +static const match_table_t qos_ctrl_tokens = { + { INF_ENABLE, "enable=%u" }, + { INF_INFLIGHT, "inflight=%u" }, + { INF_FLAGS, "flags=%u" }, + { QOS_ENABLE, "qos_enable=%u" }, + { QOS_RLAT, "rlat=%u" }, + { QOS_WLAT, "wlat=%u" }, + { QOS_RPCT, "rpct=%u" }, + { QOS_WPCT, "wpct=%u" }, + { NR_QOS_CTRL_PARAMS, NULL }, +}; + +static ssize_t ioinf_qos_write(struct kernfs_open_file *of, char *input, + size_t nbytes, loff_t off) +{ + struct blkg_conf_ctx ctx; + struct gendisk *disk; + struct ioinf *inf; + struct ioinf_params params = {0}; + char *body, *p; + int ret; + + blkg_conf_init(&ctx, input); + + ret = blkg_conf_open_bdev(&ctx); + if (ret) + goto err; + + body = ctx.body; + disk = ctx.bdev->bd_disk; + if (!queue_is_mq(disk->queue)) { + ret = -EOPNOTSUPP; + goto err; + } + + inf = q_to_inf(disk->queue); + if (!inf) { + ret = blk_ioinf_init(disk); + if (ret) + goto err; + inf = q_to_inf(disk->queue); + } + params = inf->params; + + while ((p = strsep(&body, " \t\n"))) { + substring_t args[MAX_OPT_ARGS]; + u64 v; + + if (!*p) + continue; + + switch (match_token(p, qos_ctrl_tokens, args)) { + case INF_ENABLE: + if (match_u64(&args[0], &v)) + goto einval; + params.enabled = !!v; + continue; + case INF_INFLIGHT: + if (match_u64(&args[0], &v) || v == 0) + goto einval; + params.inflight = v; + continue; + case INF_FLAGS: + if (match_u64(&args[0], &v) || v >= 1 << NR_INF_FLAGS) + goto einval; + params.flags = v; + continue; + case QOS_ENABLE: + if (match_u64(&args[0], &v)) + goto einval; + params.qos_enabled = !!v; + continue; + case QOS_RLAT: + if (match_u64(&args[0], &v) || v == 0) + goto einval; + params.rlat = v; + continue; + case QOS_WLAT: + if (match_u64(&args[0], &v) || v == 0) + goto einval; + params.wlat = v; + continue; + case QOS_RPCT: + if (match_u64(&args[0], &v) || v > 100) + goto einval; + params.rpct = v; + continue; + case QOS_WPCT: + if (match_u64(&args[0], &v) || v > 100) + goto einval; + params.wpct = v; + continue; + default: + goto einval; + } + } + + if (!params.enabled && !inf->params.enabled) + goto out; + + blk_mq_freeze_queue(disk->queue); + blk_mq_quiesce_queue(disk->queue); + + if (params.enabled && !inf->params.enabled) { + blk_stat_enable_accounting(disk->queue); + blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); + } else if (inf->params.enabled && !params.enabled) { + blk_stat_disable_accounting(disk->queue); + blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); + } + + spin_lock_irq(&inf->lock); + inf->params = params; + inf->old_scale = inf->max_scale = MAX_SCALE; + if (inf->inflight != params.inflight) { + inf->scale = SCALE_GRAN; + inf->scale_step = 
DIV_ROUND_UP(SCALE_GRAN, + inf->params.inflight); + ioinf_update_inflight(inf, inf->online.hinflight, + inf->offline.hinflight); + } + spin_unlock_irq(&inf->lock); + + blk_mq_unquiesce_queue(disk->queue); + blk_mq_unfreeze_queue(disk->queue); +out: + blkg_conf_exit(&ctx); + return nbytes; + +einval: + ret = -EINVAL; +err: + blkg_conf_exit(&ctx); + return ret; +} + +static struct cftype ioinf_files[] = { + { + .name = "inf.weight", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = ioinf_weight_show, + .write = ioinf_weight_write, + }, + { + .name = "inf.qos", + .flags = CFTYPE_ONLY_ON_ROOT, + .seq_show = ioinf_qos_show, + .write = ioinf_qos_write, + }, + {} +}; + +static struct cftype ioinf_legacy_files[] = { + { + .name = "inf.weight", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = ioinf_weight_show, + .write = ioinf_weight_write, + }, + { + .name = "inf.qos", + .flags = CFTYPE_ONLY_ON_ROOT, + .seq_show = ioinf_qos_show, + .write = ioinf_qos_write, + }, + {} +}; + +static struct blkcg_policy_data *ioinf_cpd_alloc(gfp_t gfp) +{ + struct ioinf_cgrp *infcg = kzalloc(sizeof(*infcg), gfp); + + if (!infcg) + return NULL; + + infcg->dfl_user_weight = IOINFG_WEIGHT_UNINIT; + return &infcg->cpd; +} + +static void ioinf_cpd_free(struct blkcg_policy_data *cpd) +{ + kfree(container_of(cpd, struct ioinf_cgrp, cpd)); +} + +static struct blkg_policy_data *ioinf_pd_alloc(struct gendisk *disk, + struct blkcg *blkcg, gfp_t gfp) +{ + struct ioinf_gq *infg = kzalloc_node(sizeof(*infg), gfp, disk->node_id); + + if (!infg) + return NULL; + + return &infg->pd; +} + +static void ioinf_pd_init(struct blkg_policy_data *pd) +{ + struct ioinf_gq *infg = pd_to_infg(pd); + struct blkcg_gq *blkg = pd_to_blkg(pd); + struct ioinf_cgrp *infcg = blkcg_to_infcg(blkg->blkcg); + struct blkcg_gq *parent = blkg->parent; + + infg->inf = q_to_inf(blkg->q); + if (!parent || parent->blkcg->css.cgroup->level == 0) { + infcg->dfl_user_weight = 0; + return; + } + + infg->user_weight = blkg_to_infg(parent)->user_weight; + infg->dfl_user_weight = blkg_to_infg(parent)->dfl_user_weight; + + /* Inherit the parent cgroup's dfl_user_weight if it was not set. 
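+	 * (IOINFG_WEIGHT_UNINIT marks a blkcg whose default weight has
+	 * never been set through the inf.weight interface.)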
*/ + if (infcg->dfl_user_weight == IOINFG_WEIGHT_UNINIT) { + struct ioinf_cgrp *parent_cgrp = blkcg_to_infcg(parent->blkcg); + + infcg->dfl_user_weight = parent_cgrp->dfl_user_weight; + } +} + +static void ioinf_pd_free(struct blkg_policy_data *pd) +{ + struct ioinf_gq *infg = pd_to_infg(pd); + + kfree(infg); +} + +static struct blkcg_policy blkcg_policy_ioinf = { + .dfl_cftypes = ioinf_files, + .legacy_cftypes = ioinf_legacy_files, + + .cpd_alloc_fn = ioinf_cpd_alloc, + .cpd_free_fn = ioinf_cpd_free, + + .pd_alloc_fn = ioinf_pd_alloc, + .pd_init_fn = ioinf_pd_init, + .pd_free_fn = ioinf_pd_free, +}; + +static int __init ioinf_init(void) +{ + return blkcg_policy_register(&blkcg_policy_ioinf); +} + +static void __exit ioinf_exit(void) +{ + blkcg_policy_unregister(&blkcg_policy_ioinf); +} + +MODULE_AUTHOR("Baokun Li, Yu Kuai and others"); +MODULE_DESCRIPTION("Block IO infligt I/O controller"); +MODULE_LICENSE("GPL"); +module_init(ioinf_init); +module_exit(ioinf_exit); diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index efe99cfae51da835683442ae862568bc9d3a7a94..b5af47bf99d40651e54bc800e31092efe62ab1d4 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -798,6 +798,8 @@ static const char *rq_qos_id_to_name(enum rq_qos_id id) return "latency"; case RQ_QOS_COST: return "cost"; + case RQ_QOS_INFLIGHT: + return "inflight"; } return "unknown"; } diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index 93d1ba69297377745a6975573564226a798ae825..d504a302ca0f1f5d576b86087908915b644df633 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -17,6 +17,7 @@ enum rq_qos_id { RQ_QOS_WBT, RQ_QOS_LATENCY, RQ_QOS_COST, + RQ_QOS_INFLIGHT, }; struct rq_wait { diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 4c4416fd2df73211f3fbfd4b93d14f02924f1984..81a733e1bef967cd54260ca398c6a3de29ce647b 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -115,6 +115,9 @@ struct request { #ifdef CONFIG_BLK_WBT unsigned short wbt_flags; +#endif +#ifdef CONFIG_BLK_CGROUP_IOINFLIGHT + struct blkcg_gq *blkg; #endif /* * rq sectors used for blk stats. It has the same value