From 62713a8530349a75b7202c9bfc6be121409a0203 Mon Sep 17 00:00:00 2001
From: Alexander Boyko
Date: Thu, 3 Dec 2015 09:57:36 +0300
Subject: [PATCH] LU-5282 mdc: fix panic at mdc_free_open()

An assertion failed for an open request whose rq_replay was still
set to 1:

ASSERTION(mod->mod_open_req->rq_replay == 0)

This situation is not fatal for the client and can happen when
mdc_close() fails. The fix allows such requests to be freed.
If mdc_close() fails, the MDS does not receive the close request
from the client, and in the worst case the client will be evicted.

The test recreates the issue where mdc_close() fails and the client
asserts:

ASSERTION( mod->mod_open_req->rq_replay == 0 ) failed

Signed-off-by: Alexander Boyko
Seagate-bug-id: MRP-3156
Change-Id: I5f98901f633355849fc107149eadc9cf171819af
Reviewed-on: http://review.whamcloud.com/17495
Reviewed-by: Alex Zhuravlev
Tested-by: Jenkins
Tested-by: Maloo
Reviewed-by: Andreas Dilger
Reviewed-by: Oleg Drokin
---
 lustre/include/obd_support.h |  1 +
 lustre/mdc/mdc_request.c     | 72 +++++++++++++++++++++++++++-----------------
 lustre/tests/sanity.sh       |  8 +++++
 3 files changed, 54 insertions(+), 27 deletions(-)

diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h
index c1eedb5..3016487 100644
--- a/lustre/include/obd_support.h
+++ b/lustre/include/obd_support.h
@@ -457,6 +457,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_MDC_GETATTR_ENQUEUE     0x803
 #define OBD_FAIL_MDC_RPCS_SEM            0x804
 #define OBD_FAIL_MDC_LIGHTWEIGHT         0x805
+#define OBD_FAIL_MDC_CLOSE               0x806
 
 #define OBD_FAIL_MGS                     0x900
 #define OBD_FAIL_MGS_ALL_REQUEST_NET     0x901
diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c
index e347423..f5dfcdc 100644
--- a/lustre/mdc/mdc_request.c
+++ b/lustre/mdc/mdc_request.c
@@ -699,9 +699,15 @@ static void mdc_free_open(struct md_open_data *mod)
 	    imp_connect_disp_stripe(mod->mod_open_req->rq_import))
 		committed = 1;
 
-	LASSERT(mod->mod_open_req->rq_replay == 0);
+	/**
+	 * No reason to assert here if the open request has
+	 * rq_replay == 1. It means that mdc_close failed and the
+	 * close request wasn't sent. It is not fatal to the client.
+	 * The worst case is eviction if the client holds an open lock.
+	 */
 
-	DEBUG_REQ(D_RPCTRACE, mod->mod_open_req, "free open request\n");
+	DEBUG_REQ(D_RPCTRACE, mod->mod_open_req, "free open request "
+		  "rq_replay = %d\n", mod->mod_open_req->rq_replay);
 
 	ptlrpc_request_committed(mod->mod_open_req, committed);
 	if (mod->mod_close_req)
@@ -760,15 +766,43 @@ static int mdc_close(struct obd_export *exp, struct md_op_data *op_data,
 	}
 
 	*request = NULL;
-	req = ptlrpc_request_alloc(class_exp2cliimp(exp), req_fmt);
-	if (req == NULL)
-		RETURN(-ENOMEM);
+	if (OBD_FAIL_CHECK(OBD_FAIL_MDC_CLOSE))
+		req = NULL;
+	else
+		req = ptlrpc_request_alloc(class_exp2cliimp(exp), req_fmt);
+
+	/* Ensure that this close's handle is fixed up during replay. */
+	if (likely(mod != NULL)) {
+		LASSERTF(mod->mod_open_req != NULL &&
+			 mod->mod_open_req->rq_type != LI_POISON,
+			 "POISONED open %p!\n", mod->mod_open_req);
+
+		mod->mod_close_req = req;
+
+		DEBUG_REQ(D_HA, mod->mod_open_req, "matched open");
+		/* We no longer want to preserve this open for replay even
+		 * though the open was committed. b=3632, b=3633 */
+		spin_lock(&mod->mod_open_req->rq_lock);
+		mod->mod_open_req->rq_replay = 0;
+		spin_unlock(&mod->mod_open_req->rq_lock);
+	} else {
+		CDEBUG(D_HA, "couldn't find open req; expecting close error\n");
+	}
+	if (req == NULL) {
+		/**
+		 * TODO: repeat close after errors
+		 */
+		CWARN("%s: close of FID "DFID" failed, file reference will be "
+		      "dropped when this client unmounts or is evicted\n",
+		      obd->obd_name, PFID(&op_data->op_fid1));
+		GOTO(out, rc = -ENOMEM);
+	}
 
-	rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_CLOSE);
-	if (rc) {
-		ptlrpc_request_free(req);
-		RETURN(rc);
-	}
+	rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_CLOSE);
+	if (rc) {
+		ptlrpc_request_free(req);
+		GOTO(out, rc);
+	}
 
 	/* To avoid a livelock (bug 7034), we need to send CLOSE RPCs to a
 	 * portal whose threads are not taking any DLM locks and are therefore
@@ -776,23 +810,6 @@ static int mdc_close(struct obd_export *exp, struct md_op_data *op_data,
 	req->rq_request_portal = MDS_READPAGE_PORTAL;
 	ptlrpc_at_set_req_timeout(req);
 
-	/* Ensure that this close's handle is fixed up during replay. */
-	if (likely(mod != NULL)) {
-		LASSERTF(mod->mod_open_req != NULL &&
-			 mod->mod_open_req->rq_type != LI_POISON,
-			 "POISONED open %p!\n", mod->mod_open_req);
-
-		mod->mod_close_req = req;
-
-		DEBUG_REQ(D_HA, mod->mod_open_req, "matched open");
-		/* We no longer want to preserve this open for replay even
-		 * though the open was committed. b=3632, b=3633 */
-		spin_lock(&mod->mod_open_req->rq_lock);
-		mod->mod_open_req->rq_replay = 0;
-		spin_unlock(&mod->mod_open_req->rq_lock);
-	} else {
-		CDEBUG(D_HA, "couldn't find open req; expecting close error\n");
-	}
 
 	mdc_close_pack(req, op_data);
 
@@ -837,6 +854,7 @@ static int mdc_close(struct obd_export *exp, struct md_op_data *op_data,
 		}
 	}
 
+out:
 	if (mod) {
 		if (rc != 0)
 			mod->mod_close_req = NULL;
diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh
index 66c1ccc..c8b46c8 100644
--- a/lustre/tests/sanity.sh
+++ b/lustre/tests/sanity.sh
@@ -13953,6 +13953,14 @@ test_256() {
 }
 run_test 256 "Check llog delete for empty and not full state"
 
+test_260() {
+#define OBD_FAIL_MDC_CLOSE               0x806
+	$LCTL set_param fail_loc=0x80000806
+	touch $DIR/$tfile
+
+}
+run_test 260 "Check mdc_close fail"
+
 cleanup_test_300() {
 	trap 0
 	umask $SAVE_UMASK
-- 
1.8.3.1
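
A note on the fault injection used by test_260, for context: the
fail_loc value 0x80000806 is OBD_FAIL_MDC_CLOSE (0x806) OR'd with the
one-shot flag OBD_FAIL_ONCE (0x80000000), so only the first mdc_close()
issued after the setting takes the injected error path; later closes
behave normally. The sketch below shows how the test could be made more
self-contained; the fail_loc reset, the unlink, and the comments are
illustrative additions, not lines from the submitted test:

	test_260() {
	#define OBD_FAIL_MDC_CLOSE               0x806
		# 0x80000000 is the one-shot OBD_FAIL_ONCE flag: only the
		# first close after this setting fails
		$LCTL set_param fail_loc=0x80000806
		# touch opens and closes the file; the injected failure
		# mirrors a ptlrpc_request_alloc() failure in mdc_close(),
		# which before this patch left rq_replay == 1 and tripped
		# the LASSERT in mdc_free_open()
		touch $DIR/$tfile
		# illustrative cleanup: clear fail_loc and drop the file
		# so later tests start from a clean state
		$LCTL set_param fail_loc=0
		rm -f $DIR/$tfile
	}
	run_test 260 "Check mdc_close fail"

Forcing req to NULL at the allocation site mirrors a genuine
ptlrpc_request_alloc() failure. Because the md_open_data bookkeeping now
runs before the NULL check, even a failed close clears rq_replay and
records mod_close_req, and mdc_free_open() no longer asserts on whatever
state remains.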