From: Vladimir Saveliev Date: Tue, 18 Jan 2022 21:48:29 +0000 (-0800) Subject: LU-13513 osp: make neterr not fatal for precreate_reserve X-Git-Url: https://git.whamcloud.com/gitweb?a=commitdiff_plain;h=57187cda861c57229f9993e2e3aa91c63989bb62;p=fs%2Flustre-release.git LU-13513 osp: make neterr not fatal for precreate_reserve When OST_CREATE (a non-resendable RPC) sent by the precreate thread fails with a network error, osp_pre_update_status() sets d->opd_pre_status to EIO. osp_precreate_reserve() considers EIO fatal and does not wait for another attempt from the precreate thread. That may cause mdt_intent_open() to return ENOSPC, confusing the caller. The ENOSPC comes from lod_alloc_rr(). osp_precreate_send(): in case of a network error, switch EIO to ENOTCONN so that osp_precreate_reserve() waits for the precreate thread to retry. A test illustrating the issue is added. Lustre-change: https://review.whamcloud.com/38472 Lustre-commit: 4bba67075aa3d8739d8ca99642ff2b2836774479 Cray-bug-id: LUS-8811 Signed-off-by: Vladimir Saveliev Change-Id: Iffaad9bd16f216f758c784b708e21b525c999b14 Reviewed-by: Andreas Dilger Reviewed-by: Alex Zhuravlev Reviewed-on: https://review.whamcloud.com/46179 Tested-by: jenkins Tested-by: Maloo --- diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index f7010080..7720c3c 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -455,6 +455,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_PTLRPC_RESEND_RACE 0x525 #define OBD_FAIL_PTLRPC_ROUND_XID 0x530 #define OBD_FAIL_PTLRPC_CONNECT_RACE 0x531 +#define OBD_FAIL_NET_ERROR_RPC 0x532 #define OBD_FAIL_OBD_PING_NET 0x600 /* OBD_FAIL_OBD_LOG_CANCEL_NET 0x601 obsolete since 1.5 */ @@ -711,6 +712,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_OSP_CANT_PROCESS_LLOG 0x2105 #define OBD_FAIL_OSP_INVALID_LOGID 0x2106 #define OBD_FAIL_OSP_CON_EVENT_DELAY 0x2107 +#define OBD_FAIL_OSP_PRECREATE_PAUSE 0x2108 /* barrier */ #define OBD_FAIL_MGS_BARRIER_READ_NET 0x2200 diff --git a/lustre/osp/osp_precreate.c b/lustre/osp/osp_precreate.c index 
1f2addb..670a1b5 100644 --- a/lustre/osp/osp_precreate.c +++ b/lustre/osp/osp_precreate.c @@ -675,6 +675,9 @@ static int osp_precreate_send(const struct lu_env *env, struct osp_device *d) if (rc) { CERROR("%s: can't precreate: rc = %d\n", d->opd_obd->obd_name, rc); + if (req->rq_net_err) + /* have osp_precreate_reserve() to wait for repeat */ + rc = -ENOTCONN; GOTO(out_req, rc); } LASSERT(req->rq_transno == 0); @@ -722,6 +725,9 @@ out_req: osp_pre_update_status(d, rc); wake_up(&d->opd_pre_user_waitq); + /* pause to let osp_precreate_reserve to go first */ + CFS_FAIL_TIMEOUT(OBD_FAIL_OSP_PRECREATE_PAUSE, 2); + ptlrpc_req_finished(req); RETURN(rc); } diff --git a/lustre/ptlrpc/events.c b/lustre/ptlrpc/events.c index ba15faa..037fd6b 100644 --- a/lustre/ptlrpc/events.c +++ b/lustre/ptlrpc/events.c @@ -56,6 +56,12 @@ void request_out_callback(struct lnet_event *ev) LASSERT(ev->type == LNET_EVENT_SEND || ev->type == LNET_EVENT_UNLINK); LASSERT(ev->unlinked); + if (unlikely(lustre_msg_get_opc(req->rq_reqmsg) == cfs_fail_val && + CFS_FAIL_CHECK_RESET(OBD_FAIL_NET_ERROR_RPC, + OBD_FAIL_OSP_PRECREATE_PAUSE | + CFS_FAIL_ONCE))) + ev->status = -ECONNABORTED; + DEBUG_REQ(D_NET, req, "type %d, status %d", ev->type, ev->status); /* Do not update imp_next_ping for connection request */ diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 9114d58..5fb9dbb 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -26808,6 +26808,34 @@ test_820() { } run_test 820 "update max EA from open intent" +test_822() { + local p="$TMP/$TESTSUITE-$TESTNAME.parameters" + + save_lustre_params mds1 \ + "osp.$FSNAME-OST*-osc-MDT0000.max_create_count" > $p + do_facet $SINGLEMDS "$LCTL set_param -n \ + osp.$FSNAME-OST*MDT0000.max_create_count=0" + do_facet $SINGLEMDS "$LCTL set_param -n \ + osp.$FSNAME-OST0000*MDT0000.max_create_count=20000" + + # wait for statfs update to clear OS_STATFS_NOPRECREATE + local maxage=$(do_facet mds1 $LCTL get_param -n \ + 
osp.$FSNAME-OST0000*MDT0000.maxage) + sleep $((maxage + 1)) + + #define OBD_FAIL_NET_ERROR_RPC 0x532 + do_facet mds1 "$LCTL set_param fail_loc=0x80000532 fail_val=5" + + stack_trap "restore_lustre_params < $p; rm $p" + + local count=$(do_facet $SINGLEMDS "lctl get_param -n \ + osp.$FSNAME-OST0000*MDT0000.create_count") + for i in $(seq 1 $count); do + touch $DIR/$tfile.${i} || error "touch failed" + done +} +run_test 822 "test precreate failure" + # # tests that do cleanup/setup should be run at the end #