Fix deadlock in zfs_zget()

openzfs#180 occurred because of a race between inode eviction and zfs_zget(). openzfs/zfs@36df284 tried to address it by making an upcall to the VFS to learn whether an inode is being evicted and spin until it leaves eviction. This is a hack around the fact that we cannot ensure "SA" does immediate eviction by hooking into generic_drop_inode(), which is GPL exported and cannot be wrapped. Unfortunately, the act of calling ilookup to avoid this race during writeback creates a deadlock: [ 602.268492] INFO: task kworker/u24:6:891 blocked for more than 120 seconds. [ 602.268496] Tainted: P O 3.13.6 #1 [ 602.268498] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 602.268500] kworker/u24:6 D ffff88107fcd2e80 0 891 2 0x00000000 [ 602.268511] Workqueue: writeback bdi_writeback_workfn (flush-zfs-5) [ 602.268522] ffff8810370ff950 0000000000000002 ffff88103853d940 0000000000012e80 [ 602.268526] ffff8810370fffd8 0000000000012e80 ffff88103853d940 ffff880f5c8be098 [ 602.268530] ffff88107ffb6950 ffff8810370ff980 ffff88103a9a5b78 0000000000000000 [ 602.268534] Call Trace: [ 602.268541] [<ffffffff813dd1d4>] schedule+0x24/0x70 [ 602.268546] [<ffffffff8115fc09>] __wait_on_freeing_inode+0x99/0xc0 [ 602.268552] [<ffffffff810821c0>] ? autoremove_wake_function+0x40/0x40 [ 602.268555] [<ffffffff8115fdd8>] find_inode_fast+0x78/0xb0 [ 602.268559] [<ffffffff811608c5>] ilookup+0x65/0xd0 [ 602.268590] [<ffffffffa035c5ab>] zfs_zget+0xdb/0x260 [zfs] [ 602.268594] [<ffffffff813e013b>] ? __mutex_lock_slowpath+0x21b/0x360 [ 602.268613] [<ffffffffa03589d6>] zfs_get_data+0x46/0x340 [zfs] [ 602.268631] [<ffffffffa035fee1>] zil_add_block+0xa31/0xc00 [zfs] [ 602.268634] [<ffffffff813dfe79>] ? mutex_unlock+0x9/0x10 [ 602.268651] [<ffffffffa0360642>] zil_commit+0x12/0x20 [zfs] [ 602.268669] [<ffffffffa036a6e4>] zpl_putpage+0x174/0x840 [zfs] [ 602.268674] [<ffffffff811071ec>] do_writepages+0x1c/0x40 [ 602.268677] [<ffffffff8116df2b>] __writeback_single_inode+0x3b/0x2b0 [ 602.268680] [<ffffffff8116ecf7>] writeback_sb_inodes+0x247/0x420 [ 602.268684] [<ffffffff8116f5f3>] wb_writeback+0xe3/0x320 [ 602.268689] [<ffffffff81062cc1>] ? set_worker_desc+0x71/0x80 [ 602.268692] [<ffffffff81170b8e>] bdi_writeback_workfn+0xfe/0x490 [ 602.268696] [<ffffffff813e12b4>] ? _raw_spin_unlock_irq+0x14/0x40 [ 602.268700] [<ffffffff8106fd19>] ? finish_task_switch+0x59/0x130 [ 602.268703] [<ffffffff8106072c>] process_one_work+0x16c/0x490 [ 602.268706] [<ffffffff810613f3>] worker_thread+0x113/0x390 [ 602.268710] [<ffffffff810612e0>] ? manage_workers.isra.27+0x2a0/0x2a0 [ 602.268713] [<ffffffff81066edf>] kthread+0xdf/0x100 [ 602.268717] [<ffffffff8107877e>] ? arch_vtime_task_switch+0x8e/0xa0 [ 602.268720] [<ffffffff81066e00>] ? kthread_create_on_node+0x190/0x190 [ 602.268723] [<ffffffff813e71fc>] ret_from_fork+0x7c/0xb0 [ 602.268730] [<ffffffff81066e00>] ? kthread_create_on_node+0x190/0x190 The return value from igrab() gives us the information that ifind() provided without the risk of a deadlock. Ideally, we should ask upstream to export generic_drop_inode() so that we can wrap it to properly handle this situation, but until then, lets hook into the return value of ifind() so that we do not deadlock here. In addition, zfs_zget() should exit with a hold on the inode, but that was never done in the Linux port when the inode had already been constructed. This can lead to undefined behavior, such as inodes that are evicted when they have users. The function is modified to ensure that successful exit from this function has a hold on the inode, which the code expects. Signed-off-by: Richard Yao <ryao@gentoo.org> Please enter the commit message for your changes. Lines starting
ryao · Mar 25, 2014 · 62f270f · 62f270f · behlendorf · Apr 3, 2014
1 parent a15dac4
commit 62f270f
Showing 1 changed file with 16 additions and 22 deletions.
diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c
@@ -859,19 +859,15 @@ zfs_zget(zfs_sb_t *zsb, uint64_t obj_num, znode_t **zpp)
 	znode_t		*zp;
 	int err;
 	sa_handle_t	*hdl;
-	struct inode	*ip;
 
 	*zpp = NULL;
 
 again:
-	ip = ilookup(zsb->z_sb, obj_num);
-
 	ZFS_OBJ_HOLD_ENTER(zsb, obj_num);
 
 	err = sa_buf_hold(zsb->z_os, obj_num, NULL, &db);
 	if (err) {
 		ZFS_OBJ_HOLD_EXIT(zsb, obj_num);
-		iput(ip);
 		return (err);
 	}
 
@@ -882,25 +878,11 @@ zfs_zget(zfs_sb_t *zsb, uint64_t obj_num, znode_t **zpp)
 	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
 		sa_buf_rele(db, NULL);
 		ZFS_OBJ_HOLD_EXIT(zsb, obj_num);
-		iput(ip);
 		return (SET_ERROR(EINVAL));
 	}
 
 	hdl = dmu_buf_get_user(db);
 	if (hdl != NULL) {
-		if (ip == NULL) {
-			/*
-			 * ilookup returned NULL, which means
-			 * the znode is dying - but the SA handle isn't
-			 * quite dead yet, we need to drop any locks
-			 * we're holding, re-schedule the task and try again.
-			 */
-			sa_buf_rele(db, NULL);
-			ZFS_OBJ_HOLD_EXIT(zsb, obj_num);
-
-			schedule();
-			goto again;
-		}
 
 		zp = sa_get_userdata(hdl);
 
@@ -917,19 +899,31 @@ zfs_zget(zfs_sb_t *zsb, uint64_t obj_num, znode_t **zpp)
 		if (zp->z_unlinked) {
 			err = SET_ERROR(ENOENT);
 		} else {
-			igrab(ZTOI(zp));
+			/*
+			 * if igrab returns NULL, the znode is being evicted,
+			 * but the SA handle did not do immediate eviction like
+			 * we expected because GPL symbol exportrestrictions
+			 * prevent us from hooking into ->drop_inode() at this
+			 * time. As a workaround, we drop any locks we're
+			 * holding, re-schedule the task and try again.
+			 */
+			if (igrab(ZTOI(zp)) == NULL) {
+				mutex_exit(&zp->z_lock);
+				sa_buf_rele(db, NULL);
+				ZFS_OBJ_HOLD_EXIT(zsb, obj_num);
+
+				schedule();
+				goto again;
+			}
 			*zpp = zp;
 			err = 0;
 		}
 		sa_buf_rele(db, NULL);
 		mutex_exit(&zp->z_lock);
 		ZFS_OBJ_HOLD_EXIT(zsb, obj_num);
-		iput(ip);
 		return (err);
 	}
 
-	ASSERT3P(ip, ==, NULL);
-
 	/*
 	 * Not found create new znode/vnode but only if file exists.
 	 *