From 2d33ca8dde21de6b9f2bfdccd4ab8708ca4cdb47 Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Fri, 10 Oct 2014 16:58:05 -0400 Subject: [PATCH] Redispatch ZIOs in deep call graphs The below excerpt of a backtrace is from a ztest failure when running ZoL's ztest: /#453 0x00007f03c8060b35 in vdev_queue_io_to_issue (vq=vq@entry=0x99f8a8) at ../../module/zfs/vdev_queue.c:706 /#454 0x00007f03c806106e in vdev_queue_io (zio=zio@entry=0x7f0350003de0) at ../../module/zfs/vdev_queue.c:747 /#455 0x00007f03c80818c1 in zio_vdev_io_start (zio=0x7f0350003de0) at ../../module/zfs/zio.c:2659 /#456 0x00007f03c807f243 in __zio_execute (zio=0x7f0350003de0) at ../../module/zfs/zio.c:1399 /#457 zio_nowait (zio=0x7f0350003de0) at ../../module/zfs/zio.c:1456 /#458 0x00007f03c805f71b in vdev_mirror_io_start (zio=0x7f0350003a10) at ../../module/zfs/vdev_mirror.c:374 /#459 0x00007f03c807f243 in __zio_execute (zio=0x7f0350003a10) at ../../module/zfs/zio.c:1399 /#460 zio_nowait (zio=0x7f0350003a10) at ../../module/zfs/zio.c:1456 /#461 0x00007f03c806464c in vdev_raidz_io_start (zio=0x7f0350003380) at ../../module/zfs/vdev_raidz.c:1607 /#462 0x00007f03c807f243 in __zio_execute (zio=0x7f0350003380) at ../../module/zfs/zio.c:1399 /#463 zio_nowait (zio=0x7f0350003380) at ../../module/zfs/zio.c:1456 /#464 0x00007f03c805f71b in vdev_mirror_io_start (zio=0x7f0350002fb0) at ../../module/zfs/vdev_mirror.c:374 /#465 0x00007f03c807f243 in __zio_execute (zio=0x7f0350002fb0) at ../../module/zfs/zio.c:1399 /#466 zio_nowait (zio=0x7f0350002fb0) at ../../module/zfs/zio.c:1456 /#467 0x00007f03c805ed43 in vdev_mirror_io_done (zio=0x7f033957ebf0) at ../../module/zfs/vdev_mirror.c:499 /#468 0x00007f03c807a0c0 in zio_vdev_io_done (zio=0x7f033957ebf0) at ../../module/zfs/zio.c:2707 /#469 0x00007f03c808285b in __zio_execute (zio=0x7f033957ebf0) at ../../module/zfs/zio.c:1399 /#470 zio_notify_parent (wait=ZIO_WAIT_DONE, zio=0x7f0390001330, pio=0x7f033957ebf0) at ../../module/zfs/zio.c:547 /#471 zio_done 
(zio=0x7f0390001330) at ../../module/zfs/zio.c:3278 /#472 0x00007f03c808285b in __zio_execute (zio=0x7f0390001330) at ../../module/zfs/zio.c:1399 /#473 zio_notify_parent (wait=ZIO_WAIT_DONE, zio=0x7f03b4013a00, pio=0x7f0390001330) at ../../module/zfs/zio.c:547 /#474 zio_done (zio=0x7f03b4013a00) at ../../module/zfs/zio.c:3278 /#475 0x00007f03c808285b in __zio_execute (zio=0x7f03b4013a00) at ../../module/zfs/zio.c:1399 /#476 zio_notify_parent (wait=ZIO_WAIT_DONE, zio=0x7f03b4014210, pio=0x7f03b4013a00) at ../../module/zfs/zio.c:547 /#477 zio_done (zio=0x7f03b4014210) at ../../module/zfs/zio.c:3278 /#478 0x00007f03c808285b in __zio_execute (zio=0x7f03b4014210) at ../../module/zfs/zio.c:1399 /#479 zio_notify_parent (wait=ZIO_WAIT_DONE, zio=0x7f03b4014620, pio=0x7f03b4014210) at ../../module/zfs/zio.c:547 /#480 zio_done (zio=0x7f03b4014620) at ../../module/zfs/zio.c:3278 /#481 0x00007f03c807a6d3 in __zio_execute (zio=0x7f03b4014620) at ../../module/zfs/zio.c:1399 /#482 zio_execute (zio=zio@entry=0x7f03b4014620) at ../../module/zfs/zio.c:1337 /#483 0x00007f03c8060b35 in vdev_queue_io_to_issue (vq=vq@entry=0x99f8a8) at ../../module/zfs/vdev_queue.c:706 /#484 0x00007f03c806106e in vdev_queue_io (zio=zio@entry=0x7f0350002be0) at ../../module/zfs/vdev_queue.c:747 /#485 0x00007f03c80818c1 in zio_vdev_io_start (zio=0x7f0350002be0) at ../../module/zfs/zio.c:2659 /#486 0x00007f03c807f243 in __zio_execute (zio=0x7f0350002be0) at ../../module/zfs/zio.c:1399 /#487 zio_nowait (zio=0x7f0350002be0) at ../../module/zfs/zio.c:1456 /#488 0x00007f03c805f71b in vdev_mirror_io_start (zio=0x7f0350002810) at ../../module/zfs/vdev_mirror.c:374 /#489 0x00007f03c807f243 in __zio_execute (zio=0x7f0350002810) at ../../module/zfs/zio.c:1399 /#490 zio_nowait (zio=0x7f0350002810) at ../../module/zfs/zio.c:1456 /#491 0x00007f03c8064593 in vdev_raidz_io_start (zio=0x7f0350001270) at ../../module/zfs/vdev_raidz.c:1591 /#492 0x00007f03c807f243 in __zio_execute (zio=0x7f0350001270) at 
../../module/zfs/zio.c:1399 /#493 zio_nowait (zio=0x7f0350001270) at ../../module/zfs/zio.c:1456 /#494 0x00007f03c805f71b in vdev_mirror_io_start (zio=0x7f0350001e60) at ../../module/zfs/vdev_mirror.c:374 /#495 0x00007f03c807f243 in __zio_execute (zio=0x7f0350001e60) at ../../module/zfs/zio.c:1399 /#496 zio_nowait (zio=0x7f0350001e60) at ../../module/zfs/zio.c:1456 /#497 0x00007f03c805ed43 in vdev_mirror_io_done (zio=0x7f033a0c39c0) at ../../module/zfs/vdev_mirror.c:499 /#498 0x00007f03c807a0c0 in zio_vdev_io_done (zio=0x7f033a0c39c0) at ../../module/zfs/zio.c:2707 /#499 0x00007f03c808285b in __zio_execute (zio=0x7f033a0c39c0) at ../../module/zfs/zio.c:1399 /#500 zio_notify_parent (wait=ZIO_WAIT_DONE, zio=0x7f03a8003c00, pio=0x7f033a0c39c0) at ../../module/zfs/zio.c:547 /#501 zio_done (zio=0x7f03a8003c00) at ../../module/zfs/zio.c:3278 /#502 0x00007f03c808285b in __zio_execute (zio=0x7f03a8003c00) at ../../module/zfs/zio.c:1399 /#503 zio_notify_parent (wait=ZIO_WAIT_DONE, zio=0x7f038800c400, pio=0x7f03a8003c00) at ../../module/zfs/zio.c:547 /#504 zio_done (zio=0x7f038800c400) at ../../module/zfs/zio.c:3278 /#505 0x00007f03c808285b in __zio_execute (zio=0x7f038800c400) at ../../module/zfs/zio.c:1399 /#506 zio_notify_parent (wait=ZIO_WAIT_DONE, zio=0x7f038800da00, pio=0x7f038800c400) at ../../module/zfs/zio.c:547 /#507 zio_done (zio=0x7f038800da00) at ../../module/zfs/zio.c:3278 /#508 0x00007f03c808285b in __zio_execute (zio=0x7f038800da00) at ../../module/zfs/zio.c:1399 /#509 zio_notify_parent (wait=ZIO_WAIT_DONE, zio=0x7f038800fd80, pio=0x7f038800da00) at ../../module/zfs/zio.c:547 /#510 zio_done (zio=0x7f038800fd80) at ../../module/zfs/zio.c:3278 /#511 0x00007f03c807a6d3 in __zio_execute (zio=0x7f038800fd80) at ../../module/zfs/zio.c:1399 /#512 zio_execute (zio=zio@entry=0x7f038800fd80) at ../../module/zfs/zio.c:1337 /#513 0x00007f03c8060b35 in vdev_queue_io_to_issue (vq=vq@entry=0x99f8a8) at ../../module/zfs/vdev_queue.c:706 /#514 0x00007f03c806119d in 
vdev_queue_io_done (zio=zio@entry=0x7f03a0010950) at ../../module/zfs/vdev_queue.c:775 /#515 0x00007f03c807a0e8 in zio_vdev_io_done (zio=0x7f03a0010950) at ../../module/zfs/zio.c:2686 /#516 0x00007f03c807a6d3 in __zio_execute (zio=0x7f03a0010950) at ../../module/zfs/zio.c:1399 /#517 zio_execute (zio=0x7f03a0010950) at ../../module/zfs/zio.c:1337 /#518 0x00007f03c7fcd0c4 in taskq_thread (arg=0x966d50) at ../../lib/libzpool/taskq.c:215 /#519 0x00007f03c7fc7937 in zk_thread_helper (arg=0x967e90) at ../../lib/libzpool/kernel.c:135 /#520 0x00007f03c78890a3 in start_thread (arg=0x7f03c2703700) at pthread_create.c:309 /#521 0x00007f03c75c50fd in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:111 This occurred when ztest was simulating a scrub under heavy I/O load. Under those circumstances, it was possible for a mix of noop I/Os for aggregation continuity and the I/O elevator to generate arbitrarily deep recursion. This patch modifies ZFS to propagate a recursion counter inside the zio_t objects such that IOs will be redispatched upon reaching a given recursion depth. We can detect long call chains and dispatch to another ZIO taskq. We cut in-line when we do this to minimize the potential for taskq exhaustion that can prevent a zio from notifying its parent. 
Signed-off-by: Richard Yao --- include/sys/zio.h | 1 + module/zfs/vdev_queue.c | 9 ++++++--- module/zfs/zio.c | 17 +++++++++++++++++ 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/include/sys/zio.h b/include/sys/zio.h index 69b00d0f4029..278b138e6cea 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -440,6 +440,7 @@ struct zio { uint64_t io_child_count; uint64_t io_phys_children; uint64_t io_parent_count; + uint64_t io_recursion_count; uint64_t *io_stall; zio_t *io_gang_leader; zio_gang_node_t *io_gang_tree; diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index 34e4420da733..07649d086508 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -650,6 +650,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) } zio_add_child(dio, aio); + dio->io_recursion_count = aio->io_recursion_count + 1; vdev_queue_io_remove(vq, dio); zio_vdev_io_bypass(dio); zio_execute(dio); @@ -661,7 +662,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) } static zio_t * -vdev_queue_io_to_issue(vdev_queue_t *vq) +vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t recursion_count) { zio_t *zio, *aio; zio_priority_t p; @@ -708,6 +709,7 @@ vdev_queue_io_to_issue(vdev_queue_t *vq) */ if (zio->io_flags & ZIO_FLAG_NODATA) { mutex_exit(&vq->vq_lock); + zio->io_recursion_count = recursion_count + 1; zio_vdev_io_bypass(zio); zio_execute(zio); mutex_enter(&vq->vq_lock); @@ -750,7 +752,7 @@ vdev_queue_io(zio_t *zio) mutex_enter(&vq->vq_lock); zio->io_timestamp = gethrtime(); vdev_queue_io_add(vq, zio); - nio = vdev_queue_io_to_issue(vq); + nio = vdev_queue_io_to_issue(vq, zio->io_recursion_count); mutex_exit(&vq->vq_lock); if (nio == NULL) @@ -781,7 +783,8 @@ vdev_queue_io_done(zio_t *zio) vq->vq_io_complete_ts = gethrtime(); vq->vq_io_delta_ts = vq->vq_io_complete_ts - zio->io_timestamp; - while ((nio = vdev_queue_io_to_issue(vq)) != NULL) { + while ((nio = vdev_queue_io_to_issue(vq, zio->io_recursion_count)) + != NULL) { 
mutex_exit(&vq->vq_lock); if (nio->io_done == vdev_queue_agg_io_done) { zio_nowait(nio); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 0ba167c62b59..89978689da1a 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -60,6 +60,7 @@ kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; int zio_bulk_flags = 0; int zio_delay_max = ZIO_DELAY_MAX; +int zio_recursion_threshold = 16; /* Threshold to redispatch zio_t objects */ /* * The following actions directly effect the spa's sync-to-convergence logic. @@ -516,6 +517,7 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait) if (*countp == 0 && pio->io_stall == countp) { pio->io_stall = NULL; + pio->io_recursion_count = zio->io_recursion_count + 1; mutex_exit(&pio->io_lock); __zio_execute(pio); } else { @@ -975,6 +977,8 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL) zio->io_logical->io_phys_children++; + zio->io_recursion_count = pio->io_recursion_count + 1; + return (zio); } @@ -1268,6 +1272,9 @@ zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) zio_type_t t = zio->io_type; int flags = (cutinline ? TQ_FRONT : 0); + /* Reset the notify counter */ + zio->io_recursion_count = 0; + /* * If we're a config writer or a probe, the normal issue and * interrupt threads may all be blocked waiting for the config lock. @@ -1391,6 +1398,16 @@ __zio_execute(zio_t *zio) cut = (stage == ZIO_STAGE_VDEV_IO_START) ? zio_requeue_io_start_cut_in_line : B_FALSE; + /* + * Deep call graphs can cause us to overrun the stack. + * Redispatch ZIO when we hit zio_recursion_threshold. 
+ */ + if (zio->io_recursion_count && + (zio->io_recursion_count >= zio_recursion_threshold)) { + zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE); + return; + } + /* * If we are in interrupt context and this pipeline stage * will grab a config lock that is held across I/O,