From 9de770fc5f552c9e5d2278237993f99ca9a9a335 Mon Sep 17 00:00:00 2001
From: Bobi Jam
Date: Thu, 21 Sep 2023 22:24:32 +0800
Subject: [PATCH] LU-16958 llite: migrate deadlock on not responding lock
 cancel

An lfs migrate race makes the MDS hang with the following backtrace:

[ 3683.248584] [<0>] ldlm_completion_ast+0x78d/0x8e0 [ptlrpc]
[ 3683.250122] [<0>] ldlm_cli_enqueue_local+0x2fd/0x840 [ptlrpc]
[ 3683.251363] [<0>] mdt_object_local_lock+0x50e/0xb10 [mdt]
[ 3683.252615] [<0>] mdt_object_lock_internal+0x187/0x430 [mdt]
[ 3683.253793] [<0>] mdt_object_lock_try+0x22/0xa0 [mdt]
[ 3683.254857] [<0>] mdt_getattr_name_lock+0x1317/0x1dc0 [mdt]
[ 3683.256016] [<0>] mdt_intent_getattr+0x264/0x440 [mdt]
[ 3683.257105] [<0>] mdt_intent_opc+0x452/0xa80 [mdt]
[ 3683.258126] [<0>] mdt_intent_policy+0x1fd/0x390 [mdt]
[ 3683.259191] [<0>] ldlm_lock_enqueue+0x469/0xa90 [ptlrpc]
[ 3683.260350] [<0>] ldlm_handle_enqueue0+0x61a/0x16c0 [ptlrpc]
[ 3683.261596] [<0>] tgt_enqueue+0xa4/0x200 [ptlrpc]
[ 3683.262662] [<0>] tgt_request_handle+0xc9c/0x1a40 [ptlrpc]
[ 3683.263860] [<0>] ptlrpc_server_handle_request+0x323/0xbd0 [ptlrpc]
[ 3683.265220] [<0>] ptlrpc_main+0xbf3/0x1540 [ptlrpc]
[ 3683.266303] [<0>] kthread+0x134/0x150
[ 3683.267111] [<0>] ret_from_fork+0x35/0x40

The deadlock happens as follows:

T1: vvp_io_init()
     ->ll_layout_refresh()      <= takes lli_layout_mutex
       ->ll_layout_intent()
       ->ll_take_md_lock()      <= takes the CR layout lock ref
     ->ll_layout_conf()
       ->vvp_prune()
         ->vvp_inode_ops()      <= releases lli_layout_mutex
       ->vvp_inode_ops()        <= tries to acquire lli_layout_mutex,
                                   the racer waits here for T2

T2: ll_file_write_iter()
     ->vvp_io_init()
       ->ll_layout_refresh()    <= takes lli_layout_mutex
         ->ll_layout_intent()   <= requests the layout from the MDT,
                                   the racer waits for the server...

The server wants to cancel the CR layout lock that T1 holds, which will
never happen. T1 may also have taken an extent ldlm lock while waiting
for the lli_layout_mutex held by T2, so ofd_destroy_hdl() never gets the
lock cancellation response from T1.

lli_layout_mutex is only needed to enqueue the layout lock from the
server exclusively, so ll_layout_conf() no longer involves
lli_layout_mutex at all.

Lustre-commit: TBD (from 7de620b53bea8a2fc252ceea4787f1226ce63a02)
Lustre-change: https://review.whamcloud.com/52388

Fixes: 8f2c1592c3 ("LU-16958 llite: migrate vs regular ops deadlock")
Signed-off-by: Bobi Jam
Change-Id: Ib94de2c63544c3a962199aa0537418255980ae8c
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/52451
Tested-by: jenkins
Tested-by: Andreas Dilger
Reviewed-by: Qian Yingjin
Reviewed-by: Andreas Dilger
---
 lustre/include/cl_object.h   |  2 --
 lustre/llite/file.c          |  9 ++----
 lustre/llite/vvp_object.c    | 14 --------
 lustre/lov/lov_cl_internal.h | 11 +++++--
 lustre/lov/lov_object.c      | 76 +++++++++++++++++++++-----------------
 5 files changed, 47 insertions(+), 65 deletions(-)

diff --git a/lustre/include/cl_object.h b/lustre/include/cl_object.h
index ae4de53..09745d4 100644
--- a/lustre/include/cl_object.h
+++ b/lustre/include/cl_object.h
@@ -308,8 +308,6 @@ enum coo_inode_opc {
         COIO_INODE_UNLOCK,
         COIO_SIZE_LOCK,
         COIO_SIZE_UNLOCK,
-        COIO_LAYOUT_LOCK,
-        COIO_LAYOUT_UNLOCK,
 };
 
 /**
diff --git a/lustre/llite/file.c b/lustre/llite/file.c
index e53b9cf..a1846dc 100644
--- a/lustre/llite/file.c
+++ b/lustre/llite/file.c
@@ -6497,10 +6497,6 @@ int ll_layout_refresh(struct inode *inode, __u32 *gen)
         LASSERT(fid_is_sane(ll_inode2fid(inode)));
         LASSERT(S_ISREG(inode->i_mode));
 
-        /* take layout lock mutex to enqueue layout lock exclusively. */
-        mutex_lock(&lli->lli_layout_mutex);
-        lli->lli_layout_lock_owner = current;
-
         while (1) {
                 /* mostly layout lock is caching on the local side, so try to
                  * match it before grabbing layout lock mutex. */
@@ -6514,15 +6510,16 @@ int ll_layout_refresh(struct inode *inode, __u32 *gen)
                         break;
                 }
 
+                /* take layout lock mutex to enqueue layout lock exclusively. */
+                mutex_lock(&lli->lli_layout_mutex);
                 rc = ll_layout_intent(inode, &intent);
+                mutex_unlock(&lli->lli_layout_mutex);
                 if (rc != 0)
                         break;
         }
 
         if (rc == 0)
                 *gen = ll_layout_version_get(lli);
-        lli->lli_layout_lock_owner = NULL;
-        mutex_unlock(&lli->lli_layout_mutex);
 
         RETURN(rc);
 }
diff --git a/lustre/llite/vvp_object.c b/lustre/llite/vvp_object.c
index d26a984..416fda0 100644
--- a/lustre/llite/vvp_object.c
+++ b/lustre/llite/vvp_object.c
@@ -268,20 +268,6 @@ static int vvp_inode_ops(const struct lu_env *env, struct cl_object *obj,
                 else
                         rc = -ENOLCK;
                 break;
-        case COIO_LAYOUT_LOCK:
-                if (lli->lli_layout_lock_owner != current) {
-                        mutex_lock(&lli->lli_layout_mutex);
-                        lli->lli_layout_lock_owner = current;
-                }
-                break;
-        case COIO_LAYOUT_UNLOCK:
-                if (lli->lli_layout_lock_owner == current) {
-                        lli->lli_layout_lock_owner = NULL;
-                        mutex_unlock(&lli->lli_layout_mutex);
-                } else {
-                        rc = -ENOLCK;
-                }
-                break;
         default:
                 rc = -EINVAL;
                 break;
diff --git a/lustre/lov/lov_cl_internal.h b/lustre/lov/lov_cl_internal.h
index 5bee437..05aa3e0 100644
--- a/lustre/lov/lov_cl_internal.h
+++ b/lustre/lov/lov_cl_internal.h
@@ -248,6 +248,12 @@ struct lov_mirror_entry {
         unsigned short  lre_end;        /* end index of this mirror */
 };
 
+enum lov_object_flags {
+        /* Layout is invalid, set when layout lock is lost */
+        LO_LAYOUT_INVALID       = 0x1,
+        LO_NEED_INODE_LOCK      = 0x2,
+};
+
 /**
  * lov-specific file state.
  *
@@ -278,10 +284,9 @@ struct lov_object {
          */
         enum lov_layout_type    lo_type;
         /**
-         * True if layout is invalid. This bit is cleared when layout lock
-         * is lost.
+         * Object flags.
          */
-        bool                    lo_layout_invalid;
+        unsigned long           lo_obj_flags;
         /**
          * How many IOs are on going on this object. Layout can be changed
          * only if there is no active IO.
diff --git a/lustre/lov/lov_object.c b/lustre/lov/lov_object.c
index fc6affa..3d6952e 100644
--- a/lustre/lov/lov_object.c
+++ b/lustre/lov/lov_object.c
@@ -178,7 +178,7 @@ static int lov_init_sub(const struct lu_env *env, struct lov_object *lov,
         old_obj = lu_object_locate(&parent->coh_lu, &lov_device_type);
         LASSERT(old_obj != NULL);
         old_lov = cl2lov(lu2cl(old_obj));
-        if (old_lov->lo_layout_invalid) {
+        if (test_bit(LO_LAYOUT_INVALID, &old_lov->lo_obj_flags)) {
                 /* the object's layout has already changed but isn't
                  * refreshed */
                 lu_object_unhash(env, &subobj->co_lu);
@@ -625,7 +625,7 @@ static int lov_init_composite(const struct lu_env *env, struct lov_device *dev,
         LASSERT(lsm->lsm_entry_count > 0);
         LASSERT(lov->lo_lsm == NULL);
         lov->lo_lsm = lsm_addref(lsm);
-        lov->lo_layout_invalid = true;
+        set_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags);
 
         dump_lsm(D_INODE, lsm);
 
@@ -967,7 +967,8 @@ static void lov_fini_released(const struct lu_env *env, struct lov_object *lov,
 static int lov_print_empty(const struct lu_env *env, void *cookie,
                            lu_printer_t p, const struct lu_object *o)
 {
-        (*p)(env, cookie, "empty %d\n", lu2lov(o)->lo_layout_invalid);
+        (*p)(env, cookie, "empty %d\n",
+             test_bit(LO_LAYOUT_INVALID, &lu2lov(o)->lo_obj_flags));
         return 0;
 }
 
@@ -980,8 +981,8 @@ static int lov_print_composite(const struct lu_env *env, void *cookie,
 
         (*p)(env, cookie, "entries: %d, %s, lsm{%p 0x%08X %d %u}:\n",
              lsm->lsm_entry_count,
-             lov->lo_layout_invalid ? "invalid" : "valid", lsm,
-             lsm->lsm_magic, atomic_read(&lsm->lsm_refc),
+             test_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags) ? "invalid" :
+             "valid", lsm, lsm->lsm_magic, atomic_read(&lsm->lsm_refc),
              lsm->lsm_layout_gen);
 
         for (i = 0; i < lsm->lsm_entry_count; i++) {
@@ -1010,8 +1011,8 @@ static int lov_print_released(const struct lu_env *env, void *cookie,
 
         (*p)(env, cookie,
              "released: %s, lsm{%p 0x%08X %d %u}:\n",
-             lov->lo_layout_invalid ? "invalid" : "valid", lsm,
-             lsm->lsm_magic, atomic_read(&lsm->lsm_refc),
+             test_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags) ? "invalid" :
+             "valid", lsm, lsm->lsm_magic, atomic_read(&lsm->lsm_refc),
              lsm->lsm_layout_gen);
         return 0;
 }
@@ -1024,7 +1025,8 @@ static int lov_print_foreign(const struct lu_env *env, void *cookie,
 
         (*p)(env, cookie,
              "foreign: %s, lsm{%p 0x%08X %d %u}:\n",
-             lov->lo_layout_invalid ? "invalid" : "valid", lsm,
+             test_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags) ?
+ "invalid" : "valid", lsm, lsm->lsm_magic, atomic_read(&lsm->lsm_refc), lsm->lsm_layout_gen); (*p)(env, cookie, @@ -1345,8 +1347,11 @@ static int lov_layout_change(const struct lu_env *unused, new_ops = &lov_dispatch[llt]; rc = cl_object_prune(env, &lov->lo_cl); - if (rc != 0) + if (rc != 0) { + if (rc == -EAGAIN) + set_bit(LO_NEED_INODE_LOCK, &lov->lo_obj_flags); GOTO(out, rc); + } rc = old_ops->llo_delete(env, lov, &lov->u); if (rc != 0) @@ -1435,9 +1440,8 @@ static int lov_conf_set(const struct lu_env *env, struct cl_object *obj, struct lov_object *lov = cl2lov(obj); int result = 0; struct cl_object *top = cl_object_top(obj); - bool unlock_inode = false; - bool lock_inode_size = false; - bool lock_layout = false; + bool lock_inode = false; + bool inode_size_locked = false; ENTRY; if (conf->coc_opc == OBJECT_CONF_SET && @@ -1453,12 +1457,12 @@ static int lov_conf_set(const struct lu_env *env, struct cl_object *obj, retry: lov_conf_lock(lov); if (conf->coc_opc == OBJECT_CONF_INVALIDATE) { - lov->lo_layout_invalid = true; + set_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags); GOTO(out, result = 0); } if (conf->coc_opc == OBJECT_CONF_WAIT) { - if (lov->lo_layout_invalid && + if (test_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags) && atomic_read(&lov->lo_active_ios) > 0) { lov_conf_unlock(lov); result = lov_layout_wait(env, lov); @@ -1493,20 +1497,21 @@ retry: (lov->lo_lsm->lsm_entries[0]->lsme_pattern == lsm->lsm_entries[0]->lsme_pattern))) { /* same version of layout */ - lov->lo_layout_invalid = false; + clear_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags); GOTO(out, result = 0); } /* will change layout - check if there still exists active IO. */ if (atomic_read(&lov->lo_active_ios) > 0) { - lov->lo_layout_invalid = true; + set_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags); GOTO(out, result = -EBUSY); } + clear_bit(LO_NEED_INODE_LOCK, &lov->lo_obj_flags); result = lov_layout_change(env, lov, lsm, conf); - lov->lo_layout_invalid = result != 0; if (result) { - if (result == -EAGAIN) { + if (result == -EAGAIN && + test_bit(LO_NEED_INODE_LOCK, &lov->lo_obj_flags)) { /** * we need unlocked lov conf and get inode lock. * It's possible we have already taken inode's size @@ -1514,55 +1519,45 @@ retry: * order, lest deadlock happens: * inode lock (ll_inode_lock()) * inode size lock (ll_inode_size_lock()) - * inode layout lock (ll_layout_refresh()) * lov conf lock (lov_conf_lock()) * * e.g. * vfs_setxattr inode locked * ll_lov_setstripe_ea_info inode size locked * ll_prep_inode - * ll_file_inode_init - * cl_conf_set - * lov_conf_set lov conf locked - * - * ll_migrate inode locked - * ... 
-                         *   ll_layout_refresh     inode layout locked
-                         *     ll_layout_conf
+                         *    cl_file_inode_init
                          *     cl_conf_set
                          *      lov_conf_set       lov conf locked
                          */
                         lov_conf_unlock(lov);
-                        if (cl_object_inode_ops(env, top, COIO_LAYOUT_UNLOCK,
-                                                NULL) == 0)
-                                lock_layout = true;
                         if (cl_object_inode_ops(env, top, COIO_SIZE_UNLOCK,
                                                 NULL) == 0)
-                                lock_inode_size = true;
+                                inode_size_locked = true;
                         /* take lock in order */
                         if (cl_object_inode_ops(
                                     env, top, COIO_INODE_LOCK, NULL) == 0)
-                                unlock_inode = true;
-                        if (lock_inode_size)
+                                lock_inode = true;
+                        if (inode_size_locked)
                                 cl_object_inode_ops(env, top, COIO_SIZE_LOCK,
                                                     NULL);
-                        if (lock_layout)
-                                cl_object_inode_ops(env, top, COIO_LAYOUT_LOCK,
-                                                    NULL);
                         goto retry;
                 }
+                set_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags);
+        } else {
+                clear_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags);
         }
         EXIT;
 out:
         lov_conf_unlock(lov);
-        if (unlock_inode)
+        if (lock_inode)
                 cl_object_inode_ops(env, top, COIO_INODE_UNLOCK, NULL);
         lov_lsm_put(lsm);
 
-        CDEBUG(D_INODE, DFID" lo_layout_invalid=%d\n",
-               PFID(lu_object_fid(lov2lu(lov))), lov->lo_layout_invalid);
+        CDEBUG(D_INODE, DFID" lo_layout_invalid=%u\n",
+               PFID(lu_object_fid(lov2lu(lov))),
+               test_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags));
 
         RETURN(result);
 }
@@ -2428,7 +2423,8 @@ static struct lov_stripe_md *lov_lsm_addref(struct lov_object *lov)
                 lsm = lsm_addref(lov->lo_lsm);
                 CDEBUG(D_INODE, "lsm %p addref %d/%d by %p.\n",
                        lsm, atomic_read(&lsm->lsm_refc),
-                       lov->lo_layout_invalid, current);
+                       test_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags),
+                       current);
         }
         lov_conf_thaw(lov);
         return lsm;
-- 
1.8.3.1
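Editor's note, not part of the patch: the core of the change to ll_layout_refresh()
is narrowing lli_layout_mutex so it only serializes the layout-lock enqueue
(ll_layout_intent()), leaving the ll_layout_conf()/invalidate path with no need
for the mutex. The standalone C sketch below mirrors that pattern with pthreads;
layout_refresh(), enqueue_layout(), layout_mutex, layout_gen and layout_cached
are hypothetical stand-ins chosen for illustration, not Lustre code or APIs.

/*
 * Minimal sketch (assumed names, not Lustre code): hold the mutex only
 * around the enqueue step, mirroring how the patch narrows the scope of
 * lli_layout_mutex inside ll_layout_refresh().
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t layout_mutex = PTHREAD_MUTEX_INITIALIZER;
static int layout_gen;          /* stand-in for the layout generation */
static bool layout_cached;      /* stand-in for the cached layout lock */

/* stand-in for ll_layout_intent(): fetch a fresh layout from the server */
static int enqueue_layout(void)
{
        layout_gen++;
        layout_cached = true;
        return 0;
}

/* sketch of the fixed refresh loop: the mutex covers only the enqueue, so
 * an invalidation callback (the ll_layout_conf() analogue) never needs it
 * and cannot deadlock against a concurrent refresher */
static int layout_refresh(int *gen)
{
        int rc = 0;

        while (1) {
                /* fast path: the layout lock is usually cached locally */
                if (layout_cached)
                        break;

                pthread_mutex_lock(&layout_mutex);
                rc = enqueue_layout();
                pthread_mutex_unlock(&layout_mutex);
                if (rc != 0)
                        break;
        }

        if (rc == 0)
                *gen = layout_gen;
        return rc;
}

int main(void)
{
        int gen = 0;

        if (layout_refresh(&gen) == 0)
                printf("layout generation %d\n", gen);
        return 0;
}

The design point the sketch illustrates is the one stated in the commit message:
once the mutex protects only the enqueue, the configuration/prune path can run
without it, which removes the T1/T2 lock-order cycle.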