From 9ae3b7e83cd19f4ffdc1e111496ca90971f12061 Mon Sep 17 00:00:00 2001 From: Jinshan Xiong Date: Thu, 11 Apr 2013 16:11:14 -0700 Subject: [PATCH] LU-3124 llite: To use an extra RPC to transfer layout To support wide striping, we have to use an extra RPC to transfer a large layout instead of using the LVB buffer in the completion AST for the layout lock, since that buffer doesn't reserve enough space. Also, to fix the problem in LU-2807, we decided to transfer the layout with an extra RPC if the lock has ever been blocked. In LU-2807, it turned out that we can't call mdt_object_find() in ptlrpc thread context, as the following may happen: 1. thread1's unlink reaches the MDT; 2. before unlink enqueues its lock, thread2 sends a getattr intent request that finds and holds the object; 3. unlink acquires the inodebits DLM lock; 4. thread3 enqueues a LAYOUT lock and is blocked; 5. thread2 is blocked acquiring the DLM lock as well; 6. unlink finishes and releases the lock (the object becomes dying), and the LAYOUT lock's completion_ast is invoked; 7. mdt_lvbo_fill() calls mdt_object_find() and waits for the dying object; this will never succeed because thread2 is blocked at the completion AST with the object held, so the threads are livelocked. By using an extra RPC to fetch the layout, we no longer have the above problem.
Signed-off-by: Jinshan Xiong Change-Id: If75ae92424ada6ef275e813a87a93acd426eabdc Reviewed-on: http://review.whamcloud.com/6042 Tested-by: Hudson Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Johann Lombardi Reviewed-by: Fan Yong Reviewed-by: Oleg Drokin --- lustre/ldlm/ldlm_lockd.c | 7 ++++ lustre/llite/file.c | 97 +++++++++++++++++++++++++++++++++++++++++------- lustre/mdt/mdt_lvb.c | 3 +- 3 files changed, 91 insertions(+), 16 deletions(-) diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index 31925ff..da54fcd 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -953,6 +953,13 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) /* server namespace, doesn't need lock */ lvb_len = ldlm_lvbo_size(lock); + /* LU-3124 & LU-2807: do not return the layout in the completion AST, + * because it may deadlock (LU-2807) or the client may not have enough + * space for a large layout. The layout will be returned to the client + * with an extra RPC to fetch xattr.lov */ + if (ldlm_has_layout(lock)) + lvb_len = 0; + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_CLIENT, lvb_len); rc = ptlrpc_request_pack(req, LUSTRE_DLM_VERSION, LDLM_CP_CALLBACK); if (rc) { diff --git a/lustre/llite/file.c b/lustre/llite/file.c index 842e438..09f9905 100644 --- a/lustre/llite/file.c +++ b/lustre/llite/file.c @@ -3236,6 +3236,72 @@ int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf) RETURN(result); } +/* Fetch layout from MDT via getxattr if lock->l_lvb_data is not set yet */ +static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock) + +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct obd_capa *oc; + struct ptlrpc_request *req; + struct mdt_body *body; + void *lvbdata; + void *lmm; + int lmmsize; + int rc; + ENTRY; + + if (lock->l_lvb_data != NULL) + RETURN(0); + + /* if layout lock was granted right away, the layout is returned + * within DLM_LVB of dlm reply; otherwise if the lock was 
ever + * blocked and then granted via completion ast, we have to fetch + * layout here. Please note that we can't use the LVB buffer in + * completion AST because it doesn't have a large enough buffer */ + oc = ll_mdscapa_get(inode); + rc = ll_get_max_mdsize(sbi, &lmmsize); + if (rc == 0) + rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc, + OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0, + lmmsize, 0, &req); + capa_put(oc); + if (rc < 0) + RETURN(rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL || body->eadatasize > lmmsize) + GOTO(out, rc = -EPROTO); + + lmmsize = body->eadatasize; + if (lmmsize == 0) /* empty layout */ + GOTO(out, rc = 0); + + lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize); + if (lmm == NULL) + GOTO(out, rc = -EFAULT); + + OBD_ALLOC_LARGE(lvbdata, lmmsize); + if (lvbdata == NULL) + GOTO(out, rc = -ENOMEM); + + memcpy(lvbdata, lmm, lmmsize); + lock_res_and_lock(lock); + if (lock->l_lvb_data == NULL) { + lock->l_lvb_data = lvbdata; + lock->l_lvb_len = lmmsize; + lvbdata = NULL; + } + unlock_res_and_lock(lock); + + if (lvbdata != NULL) + OBD_FREE_LARGE(lvbdata, lmmsize); + EXIT; + +out: + ptlrpc_req_finished(req); + return rc; +} + /** * Apply the layout to the inode. Layout lock is held and will be released * in this function. @@ -3250,6 +3316,7 @@ static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode, struct cl_object_conf conf; int rc = 0; bool lvb_ready; + bool wait_layout = false; ENTRY; LASSERT(lustre_handle_is_used(lockh)); @@ -3267,8 +3334,6 @@ static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode, /* checking lvb_ready is racy but this is okay. The worst case is * that multi processes may configure the file on the same time. 
*/ if (lvb_ready || !reconf) { - LDLM_LOCK_PUT(lock); - rc = -ENODATA; if (lvb_ready) { /* layout_gen must be valid if layout lock is not @@ -3276,10 +3341,13 @@ static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode, *gen = lli->lli_layout_gen; rc = 0; } - ldlm_lock_decref(lockh, mode); - RETURN(rc); + GOTO(out, rc); } + rc = ll_layout_fetch(inode, lock); + if (rc < 0) + GOTO(out, rc); + /* for layout lock, lmm is returned in lock's lvb. * lvb_data is immutable if the lock is held so it's safe to access it * without res lock. See the description in ldlm_lock_decref_internal() @@ -3298,11 +3366,8 @@ static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode, PFID(&lli->lli_fid), rc); } } - if (rc < 0) { - LDLM_LOCK_PUT(lock); - ldlm_lock_decref(lockh, mode); - RETURN(rc); - } + if (rc < 0) + GOTO(out, rc); /* set layout to file. Unlikely this will fail as old layout was * surely eliminated */ @@ -3312,15 +3377,20 @@ static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode, conf.coc_lock = lock; conf.u.coc_md = &md; rc = ll_layout_conf(inode, &conf); - LDLM_LOCK_PUT(lock); - - ldlm_lock_decref(lockh, mode); if (md.lsm != NULL) obd_free_memmd(sbi->ll_dt_exp, &md.lsm); + /* refresh layout failed, need to wait */ + wait_layout = rc == -EBUSY; + EXIT; + +out: + LDLM_LOCK_PUT(lock); + ldlm_lock_decref(lockh, mode); + /* wait for IO to complete if it's still being used. 
*/ - if (rc == -EBUSY) { + if (wait_layout) { CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n", ll_get_fsname(inode->i_sb, NULL, 0), inode, PFID(&lli->lli_fid)); @@ -3335,7 +3405,6 @@ static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode, CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n", PFID(&lli->lli_fid), rc); } - RETURN(rc); } diff --git a/lustre/mdt/mdt_lvb.c b/lustre/mdt/mdt_lvb.c index 9a830d0..6396d5c 100644 --- a/lustre/mdt/mdt_lvb.c +++ b/lustre/mdt/mdt_lvb.c @@ -114,8 +114,7 @@ static int mdt_lvbo_fill(struct ldlm_lock *lock, void *lvb, int lvblen) RETURN(rc); } - if (lock->l_resource->lr_type != LDLM_IBITS || - !(lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_LAYOUT)) + if (!ldlm_has_layout(lock)) RETURN(0); /* layout lock will be granted to client, fill in lvb with layout */ -- 1.8.3.1