From: Qian Yingjin Date: Wed, 7 Sep 2022 08:59:19 +0000 (-0400) Subject: LU-16139 statahead: avoid to block ptlrpcd interpret context X-Git-Tag: 2.15.53~141 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=2e0897439014338553a51fae338fb2c1b655f067;p=fs%2Flustre-release.git LU-16139 statahead: avoid to block ptlrpcd interpret context If a stat-ahead entry is a striped directory or a regular file with layout change, it will generate a new RPC and block ptlrpcd interpret context for a long time. However, it is dangerous of blocking in ptlrpcd thread as it may result in deadlock. The following is the stack trace for the timeout of replay-dual test_26: task:ptlrpcd_00_01 state:I stack: 0 pid: 8026 ppid: 2 osc_extent_wait+0x44d/0x560 [osc] osc_cache_wait_range+0x2b8/0x930 [osc] osc_io_fsync_end+0x67/0x80 [osc] cl_io_end+0x58/0x130 [obdclass] lov_io_end_wrapper+0xcf/0xe0 [lov] lov_io_fsync_end+0x6f/0x1c0 [lov] cl_io_end+0x58/0x130 [obdclass] cl_io_loop+0xa7/0x200 [obdclass] cl_sync_file_range+0x2c9/0x340 [lustre] vvp_prune+0x5d/0x1e0 [lustre] cl_object_prune+0x58/0x130 [obdclass] lov_layout_change.isra.47+0x1ba/0x640 [lov] lov_conf_set+0x38d/0x4e0 [lov] cl_conf_set+0x60/0x140 [obdclass] cl_file_inode_init+0xc8/0x380 [lustre] ll_update_inode+0x432/0x6e0 [lustre] ll_iget+0x227/0x320 [lustre] ll_prep_inode+0x344/0xb60 [lustre] ll_statahead_interpret_common.isra.26+0x69/0x830 [lustre] ll_statahead_interpret+0x2c8/0x5b0 [lustre] mdc_intent_getattr_async_interpret+0x14a/0x3e0 [mdc] ptlrpc_check_set+0x5b8/0x1fe0 [ptlrpc] ptlrpcd+0x6c6/0xa50 [ptlrpc] In this patch, we use work queue to handle the extra RPC and long wait in a separate thread for a striped directory and a regular file with layout change: (@ll_prep_inode->@lmv_revalidate_slaves); (@ll_prep_inode->@lov_layout_change->osc_cache_wait_range) Test-Parameters: testlist=replay-dual env=ONLY=26,ONLY_REPEAT=10 mdscount=2 mdtcount=4 Signed-off-by: Qian Yingjin Change-Id: I404a320620c4ec4caa608e675ecf324fcd26f1e0 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/48451 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Lai Siyao Reviewed-by: Alex Zhuravlev Reviewed-by: Oleg Drokin --- diff --git a/lustre/include/lustre_intent.h b/lustre/include/lustre_intent.h index 09aa504..5f3a717 100644 --- a/lustre/include/lustre_intent.h +++ b/lustre/include/lustre_intent.h @@ -47,8 +47,6 @@ struct lookup_intent { __u64 it_remote_lock_handle; struct ptlrpc_request *it_request; unsigned int it_lock_set:1; - unsigned int it_extra_rpc_check:1; - unsigned int it_extra_rpc_need:1; }; static inline int it_disposition(const struct lookup_intent *it, int flag) diff --git a/lustre/include/obd.h b/lustre/include/obd.h index 78ac884..3e03514 100644 --- a/lustre/include/obd.h +++ b/lustre/include/obd.h @@ -967,9 +967,7 @@ struct md_readdir_info { }; struct md_op_item; -typedef int (*md_op_item_cb_t)(struct req_capsule *pill, - struct md_op_item *item, - int rc); +typedef int (*md_op_item_cb_t)(struct md_op_item *item, int rc); struct md_op_item { struct md_op_data mop_data; @@ -979,6 +977,8 @@ struct md_op_item { md_op_item_cb_t mop_cb; void *mop_cbdata; struct inode *mop_dir; + struct req_capsule *mop_pill; + struct work_struct mop_work; }; struct obd_ops { diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h index 94d7025..a1c60fe 100644 --- a/lustre/llite/llite_internal.h +++ b/lustre/llite/llite_internal.h @@ -1537,12 +1537,6 @@ struct ll_statahead_info { atomic_t sai_cache_count; /* entry count in cache */ }; -struct ll_interpret_work { - struct work_struct lpw_work; - struct md_op_item *lpw_item; - struct req_capsule *lpw_pill; -}; - int ll_revalidate_statahead(struct inode *dir, struct dentry **dentry, bool unplug); int ll_start_statahead(struct inode *dir, struct dentry *dentry, bool agl); diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index 9db4ef5..1bc27da 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -3230,13 +3230,6 @@ int ll_prep_inode(struct inode **inode, struct req_capsule *pill, if (rc != 0) GOTO(out, rc); - if (S_ISDIR(md.body->mbo_mode) && md.lmv && lmv_dir_striped(md.lmv) && - it && it->it_extra_rpc_check) { - /* TODO: Check @lsm unchanged via @lsm_md_eq. */ - it->it_extra_rpc_need = 1; - GOTO(out, rc = -EAGAIN); - } - /* * clear default_lmv only if intent_getattr reply doesn't contain it. * but it needs to be done after iget, check this early because diff --git a/lustre/llite/statahead.c b/lustre/llite/statahead.c index 597405f..6152300 100644 --- a/lustre/llite/statahead.c +++ b/lustre/llite/statahead.c @@ -342,8 +342,7 @@ static void sa_fini_data(struct md_op_item *item) OBD_FREE_PTR(item); } -static int ll_statahead_interpret(struct req_capsule *pill, - struct md_op_item *item, int rc); +static int ll_statahead_interpret(struct md_op_item *item, int rc); /* * prepare arguments for async stat RPC. @@ -609,59 +608,6 @@ static void ll_agl_trigger(struct inode *inode, struct ll_statahead_info *sai) EXIT; } -static int ll_statahead_interpret_common(struct inode *dir, - struct ll_statahead_info *sai, - struct req_capsule *pill, - struct lookup_intent *it, - struct sa_entry *entry, - struct mdt_body *body) -{ - struct inode *child; - int rc; - - ENTRY; - - child = entry->se_inode; - rc = ll_prep_inode(&child, pill, dir->i_sb, it); - if (rc) - GOTO(out, rc); - - /* If encryption context was returned by MDT, put it in - * inode now to save an extra getxattr. - */ - if (body->mbo_valid & OBD_MD_ENCCTX) { - void *encctx = req_capsule_server_get(pill, &RMF_FILE_ENCCTX); - __u32 encctxlen = req_capsule_get_size(pill, &RMF_FILE_ENCCTX, - RCL_SERVER); - - if (encctxlen) { - CDEBUG(D_SEC, - "server returned encryption ctx for "DFID"\n", - PFID(ll_inode2fid(child))); - rc = ll_xattr_cache_insert(child, - xattr_for_enc(child), - encctx, encctxlen); - if (rc) - CWARN("%s: cannot set enc ctx for "DFID": rc = %d\n", - ll_i2sbi(child)->ll_fsname, - PFID(ll_inode2fid(child)), rc); - } - } - - CDEBUG(D_READA, "%s: setting %.*s"DFID" l_data to inode %p\n", - ll_i2sbi(dir)->ll_fsname, entry->se_qstr.len, - entry->se_qstr.name, PFID(ll_inode2fid(child)), child); - ll_set_lock_data(ll_i2sbi(dir)->ll_md_exp, child, it, NULL); - - entry->se_inode = child; - - if (agl_should_run(sai, child)) - ll_agl_add(sai, child, entry->se_index); - -out: - RETURN(rc); -} - static void ll_statahead_interpret_fini(struct ll_inode_info *lli, struct ll_statahead_info *sai, struct md_op_item *item, @@ -685,12 +631,11 @@ static void ll_statahead_interpret_fini(struct ll_inode_info *lli, spin_unlock(&lli->lli_sa_lock); } -static void ll_statahead_interpret_work(struct work_struct *data) +static void ll_statahead_interpret_work(struct work_struct *work) { - struct ll_interpret_work *work = container_of(data, - struct ll_interpret_work, lpw_work); - struct md_op_item *item = work->lpw_item; - struct req_capsule *pill = work->lpw_pill; + struct md_op_item *item = container_of(work, struct md_op_item, + mop_work); + struct req_capsule *pill = item->mop_pill; struct inode *dir = item->mop_dir; struct ll_inode_info *lli = ll_i2info(dir); struct ll_statahead_info *sai = lli->lli_sai; @@ -726,11 +671,43 @@ static void ll_statahead_interpret_work(struct work_struct *data) if (rc != 1) GOTO(out, rc = -EAGAIN); - LASSERT(it->it_extra_rpc_check == 0); - rc = ll_statahead_interpret_common(dir, sai, pill, it, entry, body); + rc = ll_prep_inode(&child, pill, dir->i_sb, it); + if (rc) + GOTO(out, rc); + + /* If encryption context was returned by MDT, put it in + * inode now to save an extra getxattr. + */ + if (body->mbo_valid & OBD_MD_ENCCTX) { + void *encctx = req_capsule_server_get(pill, &RMF_FILE_ENCCTX); + __u32 encctxlen = req_capsule_get_size(pill, &RMF_FILE_ENCCTX, + RCL_SERVER); + + if (encctxlen) { + CDEBUG(D_SEC, + "server returned encryption ctx for "DFID"\n", + PFID(ll_inode2fid(child))); + rc = ll_xattr_cache_insert(child, + xattr_for_enc(child), + encctx, encctxlen); + if (rc) + CWARN("%s: cannot set enc ctx for "DFID": rc = %d\n", + ll_i2sbi(child)->ll_fsname, + PFID(ll_inode2fid(child)), rc); + } + } + + CDEBUG(D_READA, "%s: setting %.*s"DFID" l_data to inode %p\n", + ll_i2sbi(dir)->ll_fsname, entry->se_qstr.len, + entry->se_qstr.name, PFID(ll_inode2fid(child)), child); + ll_set_lock_data(ll_i2sbi(dir)->ll_md_exp, child, it, NULL); + + entry->se_inode = child; + + if (agl_should_run(sai, child)) + ll_agl_add(sai, child, entry->se_index); out: ll_statahead_interpret_fini(lli, sai, item, entry, pill->rc_req, rc); - OBD_FREE_PTR(work); } /* @@ -738,14 +715,15 @@ out: * the inode and set lock data directly in the ptlrpcd context. It will wake up * the directory listing process if the dentry is the waiting one. */ -static int ll_statahead_interpret(struct req_capsule *pill, - struct md_op_item *item, int rc) +static int ll_statahead_interpret(struct md_op_item *item, int rc) { + struct req_capsule *pill = item->mop_pill; struct lookup_intent *it = &item->mop_it; struct inode *dir = item->mop_dir; struct ll_inode_info *lli = ll_i2info(dir); struct ll_statahead_info *sai = lli->lli_sai; struct sa_entry *entry = (struct sa_entry *)item->mop_cbdata; + struct work_struct *work = &item->mop_work; struct mdt_body *body; struct inode *child; __u64 handle = 0; @@ -786,49 +764,33 @@ static int ll_statahead_interpret(struct req_capsule *pill, entry->se_handle = it->it_lock_handle; /* * In ptlrpcd context, it is not allowed to generate new RPCs - * especially for striped directories. + * especially for striped directories or regular files with layout + * change. */ - it->it_extra_rpc_check = 1; - rc = ll_statahead_interpret_common(dir, sai, pill, it, entry, body); - if (rc == -EAGAIN && it->it_extra_rpc_need) { - struct ll_interpret_work *work; - - /* - * release ibits lock ASAP to avoid deadlock when statahead - * thread enqueues lock on parent in readdir and another - * process enqueues lock on child with parent lock held, eg. - * unlink. - */ - handle = it->it_lock_handle; - ll_intent_drop_lock(it); - ll_unlock_md_op_lsm(&item->mop_data); - it->it_extra_rpc_check = 0; - it->it_extra_rpc_need = 0; - - /* - * If the stat-ahead entry is a striped directory, there are two - * solutions: - * 1. It can drop the result, let the scanning process do stat() - * on the striped directory in synchronous way. By this way, it - * can avoid to generate new RPCs to obtain the attributes for - * slaves of the striped directory in the ptlrpcd context as it - * is dangerous of blocking in ptlrpcd thread. - * 2. Use work queue or the separate statahead thread to handle - * the extra RPCs (@ll_prep_inode->@lmv_revalidate_slaves). - * Here we adopt the second solution. - */ - OBD_ALLOC_GFP(work, sizeof(*work), GFP_ATOMIC); - if (work == NULL) - GOTO(out, rc = -ENOMEM); - - INIT_WORK(&work->lpw_work, ll_statahead_interpret_work); - work->lpw_item = item; - work->lpw_pill = pill; - ptlrpc_request_addref(pill->rc_req); - schedule_work(&work->lpw_work); - RETURN(0); - } + /* + * release ibits lock ASAP to avoid deadlock when statahead + * thread enqueues lock on parent in readdir and another + * process enqueues lock on child with parent lock held, eg. + * unlink. + */ + handle = it->it_lock_handle; + ll_intent_drop_lock(it); + ll_unlock_md_op_lsm(&item->mop_data); + /* + * If the statahead entry is a striped directory or regular file with + * layout change, it will generate a new RPC and long wait in the + * ptlrpcd context. + * However, it is dangerous of blocking in ptlrpcd thread. + * Here we use work queue or the separate statahead thread to handle + * the extra RPC and long wait: + * (@ll_prep_inode->@lmv_revalidate_slaves); + * (@ll_prep_inode->@lov_layout_change->osc_cache_wait_range); + */ + INIT_WORK(work, ll_statahead_interpret_work); + ptlrpc_request_addref(pill->rc_req); + schedule_work(work); + RETURN(0); out: ll_statahead_interpret_fini(lli, sai, item, entry, NULL, rc); RETURN(rc); diff --git a/lustre/mdc/mdc_locks.c b/lustre/mdc/mdc_locks.c index 4ad1de4..8397868 100644 --- a/lustre/mdc/mdc_locks.c +++ b/lustre/mdc/mdc_locks.c @@ -1407,7 +1407,8 @@ static int mdc_intent_getattr_async_interpret(const struct lu_env *env, EXIT; out: - item->mop_cb(&req->rq_pill, item, rc); + item->mop_pill = &req->rq_pill; + item->mop_cb(item, rc); return 0; }