From ccaeafd40baad0b8e8d0ab544a18f2f5b0b5f9a6 Mon Sep 17 00:00:00 2001 From: yury Date: Fri, 13 Jun 2008 08:53:45 +0000 Subject: [PATCH] b=15230 r=nikita,shadow - fixed handling for OBD_FAIL_$PREF_$OPC_NET fail_ids in mdt. Former code did not check it correctly (due to typo with && instead of &) in mdt_req_handle() and they all did not work. In same time, some handlers like mdt_close() and mdt_enqueue() tried to check them again (result of some wrong fix) but again, did it not correctly. They returned 0 error without doing anything. This should have to emulate network failure. But as they did not allocate reply buffer and returned 0 error, they caused rs != NULL assert in ptlrpc. Fxing this also fixed replay-single.sh test_53* and replay_dual.sh test_12 and possibly others; - removed checking for NET fail_id in mdt_close() and mdt_enqueue() - sources of recent assert; - added sanity check in mdt_req_handle() for any other invalid situation about returning 0 error and not allocating reply buffers; - removed mdt_reply(), move its one line call into mdt_req_handle(). This was needed to simplify handling NET fail_ids in which case we should just return 0 and make sure that no reply is sent; - comments and cleanups; - in reply-dual.sh - remove test 8 from ALWAYS_EXCEPT. It passes in HEAD. Originally for placed into ALWAYS_EXCEPT for old mds code and later moved to HEAD test scripts but as mds in HEAD is completely new this bug is making any sense there; - in reply-single.sh - remove tests 0b 39 56 from ALWAYS_EXCEPT. They are passing in HEAD. Also they are obsolete and related to closed bugs. --- lustre/mdt/mdt_handler.c | 63 +++++++++++++------------------------------ lustre/mdt/mdt_open.c | 6 ----- lustre/tests/replay-dual.sh | 4 +-- lustre/tests/replay-single.sh | 4 +-- 4 files changed, 23 insertions(+), 54 deletions(-) diff --git a/lustre/mdt/mdt_handler.c b/lustre/mdt/mdt_handler.c index 618638b..82225d8 100644 --- a/lustre/mdt/mdt_handler.c +++ b/lustre/mdt/mdt_handler.c @@ -1144,7 +1144,7 @@ static int mdt_sendpage(struct mdt_thread_info *info, GOTO(free_desc, rc); if (OBD_FAIL_CHECK(OBD_FAIL_MDS_SENDPAGE)) - GOTO(abort_bulk, rc); + GOTO(abort_bulk, rc = 0); *lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, NULL, NULL); rc = l_wait_event(desc->bd_waitq, !ptlrpc_bulk_active(desc), lwi); @@ -1507,12 +1507,6 @@ static int mdt_reint(struct mdt_thread_info *info) ENTRY; - if (OBD_FAIL_CHECK_RESET(OBD_FAIL_MDS_REINT_NET, - OBD_FAIL_MDS_REINT_NET)) { - info->mti_fail_id = OBD_FAIL_MDS_REINT_NET; - RETURN(0); - } - opc = mdt_reint_opcode(info, reint_fmts); if (opc >= 0) { /* @@ -1652,11 +1646,6 @@ static int mdt_enqueue(struct mdt_thread_info *info) */ LASSERT(info->mti_dlm_req != NULL); - if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_ENQUEUE)) { - info->mti_fail_id = OBD_FAIL_LDLM_ENQUEUE; - return 0; - } - req = mdt_info_req(info); /* @@ -2050,22 +2039,18 @@ static int mdt_req_handle(struct mdt_thread_info *info, LASSERT(current->journal_info == NULL); /* - * Mask out OBD_FAIL_ONCE, because that will stop - * correct handling of failed req later in ldlm due to doing - * obd_fail_loc |= OBD_FAIL_ONCE without actually - * correct actions like it is done in target_send_reply_msg(). + * Checking for various OBD_FAIL_$PREF_$OPC_NET codes. _Do_ not try + * to put same checks into handlers like mdt_close(), mdt_reint(), + * etc., without talking to mdt authors first. Checking same thing + * there again is useless and returning 0 error wihtout packing reply + * is buggy! Handlers either pack reply or return error. + * + * We return 0 here and do not send any reply in order to emulate + * network failure. Do not send any reply in case any of NET related + * fail_id has occured. */ - if (h->mh_fail_id != 0) { - /* - * Set to info->mti_fail_id to handler fail_id, it will be used - * later, and better than use default fail_id. - */ - if (OBD_FAIL_CHECK_RESET(h->mh_fail_id && OBD_FAIL_MASK_LOC, - h->mh_fail_id & ~OBD_FAILED)) { - info->mti_fail_id = h->mh_fail_id; - RETURN(0); - } - } + if (OBD_FAIL_CHECK_ORSET(h->mh_fail_id, OBD_FAIL_ONCE)) + RETURN(0); rc = 0; flags = h->mh_flags; @@ -2112,6 +2097,11 @@ static int mdt_req_handle(struct mdt_thread_info *info, * only */ rc = h->mh_act(info); + if (rc == 0 && req->rq_reply_state == NULL) { + DEBUG_REQ(D_ERROR, req, "MDT \"handler\" %s did not pack " + "reply and returned 0 error\n", h->mh_name); + LBUG(); + } serious = is_serious(rc); rc = clear_serious(rc); } else @@ -2147,7 +2137,8 @@ static int mdt_req_handle(struct mdt_thread_info *info, LBUG(); } - RETURN(rc); + target_send_reply(req, rc, info->mti_fail_id); + RETURN(0); } void mdt_lock_handle_init(struct mdt_lock_handle *lh) @@ -2341,21 +2332,6 @@ static int mdt_recovery(struct mdt_thread_info *info) RETURN(+1); } -static int mdt_reply(struct ptlrpc_request *req, int rc, - struct mdt_thread_info *info) -{ - ENTRY; - -#if 0 - if (req->rq_reply_state == NULL && rc == 0) { - req->rq_status = rc; - lustre_pack_reply(req, 1, NULL, NULL); - } -#endif - target_send_reply(req, rc, info->mti_fail_id); - RETURN(0); -} - static int mdt_msg_check_version(struct lustre_msg *msg) { int rc; @@ -2459,7 +2435,6 @@ static int mdt_handle0(struct ptlrpc_request *req, supported); if (likely(h != NULL)) { rc = mdt_req_handle(info, h, req); - rc = mdt_reply(req, rc, info); } else { CERROR("The unsupported opc: 0x%x\n", lustre_msg_get_opc(msg) ); req->rq_status = -ENOTSUPP; diff --git a/lustre/mdt/mdt_open.c b/lustre/mdt/mdt_open.c index 0ac88c4..cdda73e 100644 --- a/lustre/mdt/mdt_open.c +++ b/lustre/mdt/mdt_open.c @@ -1170,12 +1170,6 @@ int mdt_close(struct mdt_thread_info *info) int rc, ret = 0; ENTRY; - if (OBD_FAIL_CHECK_RESET(OBD_FAIL_MDS_CLOSE_NET, - OBD_FAIL_MDS_CLOSE_NET)) { - info->mti_fail_id = OBD_FAIL_MDS_CLOSE_NET; - RETURN(0); - } - /* Close may come with the Size-on-MDS update. Unpack it. */ rc = mdt_close_unpack(info); if (rc) diff --git a/lustre/tests/replay-dual.sh b/lustre/tests/replay-dual.sh index e55cc72..dffb15c 100755 --- a/lustre/tests/replay-dual.sh +++ b/lustre/tests/replay-dual.sh @@ -2,8 +2,8 @@ set -e -# bug number: 13129 13129 6088 10124 -ALWAYS_EXCEPT="2 3 8 15c $REPLAY_DUAL_EXCEPT" +# bug number: 13129 13129 10124 +ALWAYS_EXCEPT="2 3 15c $REPLAY_DUAL_EXCEPT" SAVE_PWD=$PWD PTLDEBUG=${PTLDEBUG:--1} diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index 55ab4d1..ea4994f 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -18,8 +18,8 @@ GRANT_CHECK_LIST=${GRANT_CHECK_LIST:-""} # Skip these tests -# bug number: 2766 4176 11404 -ALWAYS_EXCEPT="0b 39 56 $REPLAY_SINGLE_EXCEPT" +# bug number: +ALWAYS_EXCEPT="$REPLAY_SINGLE_EXCEPT" # 63 min 7 min AT AT AT AT" [ "$SLOW" = "no" ] && EXCEPT_SLOW="1 2 3 4 6 12 16 44a 44b 65 66 67 68" -- 1.8.3.1