Skip to content

Commit

Permalink
Revert "block/mq-deadline: Prioritize high-priority requests"
Browse files Browse the repository at this point in the history
This reverts commit fb92603.

Zhen reports that this commit slows down mq-deadline on a 128 thread
box, going from 258K IOPS to 170-180K. My testing shows that an Optane
gen2 drive on a 64 thread box drops from 2.3M IOPS to 1.2M IOPS.

Looking in detail at the code, the main culprit here is needing to sum
percpu counters in the dispatch hot path, leading to very high CPU
utilization there. To make matters worse, the code currently needs to
sum 2 percpu counters, and it does so in the most naive way of iterating
possible CPUs _twice_.

Since we're close to release, revert this commit and we can re-do it
with regular per-priority counters instead for the 5.15 kernel.

Link: https://lore.kernel.org/linux-block/20210826144039.2143-1-thunder.leizhen@huawei.com/
Reported-by: Zhen Lei <thunder.leizhen@huawei.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
  • Loading branch information
axboe committed Aug 26, 2021
1 parent b6d2b05 commit 7b05bf7
Showing 1 changed file with 5 additions and 37 deletions.
42 changes: 5 additions & 37 deletions block/mq-deadline.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,6 @@
*/
static const int read_expire = HZ / 2; /* max time before a read is submitted. */
static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
/*
* Time after which to dispatch lower priority requests even if higher
* priority requests are pending.
*/
static const int aging_expire = 10 * HZ;
static const int writes_starved = 2; /* max times reads can starve a write */
static const int fifo_batch = 16; /* # of sequential requests treated as one
by the above parameters. For throughput. */
Expand Down Expand Up @@ -103,7 +98,6 @@ struct deadline_data {
int writes_starved;
int front_merges;
u32 async_depth;
int aging_expire;

spinlock_t lock;
spinlock_t zone_lock;
Expand Down Expand Up @@ -369,11 +363,10 @@ deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio,

/*
* deadline_dispatch_requests selects the best request according to
* read/write expire, fifo_batch, etc and with a start time <= @latest.
* read/write expire, fifo_batch, etc
*/
static struct request *__dd_dispatch_request(struct deadline_data *dd,
struct dd_per_prio *per_prio,
u64 latest_start_ns)
struct dd_per_prio *per_prio)
{
struct request *rq, *next_rq;
enum dd_data_dir data_dir;
Expand All @@ -385,8 +378,6 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
if (!list_empty(&per_prio->dispatch)) {
rq = list_first_entry(&per_prio->dispatch, struct request,
queuelist);
if (rq->start_time_ns > latest_start_ns)
return NULL;
list_del_init(&rq->queuelist);
goto done;
}
Expand Down Expand Up @@ -464,8 +455,6 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
dd->batching = 0;

dispatch_request:
if (rq->start_time_ns > latest_start_ns)
return NULL;
/*
* rq is the selected appropriate request.
*/
Expand Down Expand Up @@ -494,32 +483,15 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
struct deadline_data *dd = hctx->queue->elevator->elevator_data;
const u64 now_ns = ktime_get_ns();
struct request *rq = NULL;
struct request *rq;
enum dd_prio prio;

spin_lock(&dd->lock);
/*
* Start with dispatching requests whose deadline expired more than
* aging_expire jiffies ago.
*/
for (prio = DD_BE_PRIO; prio <= DD_PRIO_MAX; prio++) {
rq = __dd_dispatch_request(dd, &dd->per_prio[prio], now_ns -
jiffies_to_nsecs(dd->aging_expire));
if (rq)
goto unlock;
}
/*
* Next, dispatch requests in priority order. Ignore lower priority
* requests if any higher priority requests are pending.
*/
for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
rq = __dd_dispatch_request(dd, &dd->per_prio[prio], now_ns);
if (rq || dd_queued(dd, prio))
rq = __dd_dispatch_request(dd, &dd->per_prio[prio]);
if (rq)
break;
}

unlock:
spin_unlock(&dd->lock);

return rq;
Expand Down Expand Up @@ -620,7 +592,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
dd->front_merges = 1;
dd->last_dir = DD_WRITE;
dd->fifo_batch = fifo_batch;
dd->aging_expire = aging_expire;
spin_lock_init(&dd->lock);
spin_lock_init(&dd->zone_lock);

Expand Down Expand Up @@ -842,7 +813,6 @@ static ssize_t __FUNC(struct elevator_queue *e, char *page) \
#define SHOW_JIFFIES(__FUNC, __VAR) SHOW_INT(__FUNC, jiffies_to_msecs(__VAR))
SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]);
SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]);
SHOW_JIFFIES(deadline_aging_expire_show, dd->aging_expire);
SHOW_INT(deadline_writes_starved_show, dd->writes_starved);
SHOW_INT(deadline_front_merges_show, dd->front_merges);
/* Bug fix: async_depth's show handler must report dd->async_depth, not
 * dd->front_merges (copy-paste error from the line above). Reading the
 * async_depth sysfs attribute otherwise returns the front_merges value. */
SHOW_INT(deadline_async_depth_show, dd->async_depth);
Expand Down Expand Up @@ -872,7 +842,6 @@ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)
STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies)
STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX);
STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX);
STORE_JIFFIES(deadline_aging_expire_store, &dd->aging_expire, 0, INT_MAX);
STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX);
STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1);
/* Bug fix: async_depth's store handler must write dd->async_depth, not
 * dd->front_merges. As written, setting the async_depth sysfs attribute
 * silently clobbered front_merges (and allowed it outside its 0..1 range)
 * while async_depth itself could never be changed from userspace. */
STORE_INT(deadline_async_depth_store, &dd->async_depth, 1, INT_MAX);
Expand All @@ -891,7 +860,6 @@ static struct elv_fs_entry deadline_attrs[] = {
DD_ATTR(front_merges),
DD_ATTR(async_depth),
DD_ATTR(fifo_batch),
DD_ATTR(aging_expire),
__ATTR_NULL
};

Expand Down

0 comments on commit 7b05bf7

Please sign in to comment.