From: Vladimir Saveliev Date: Mon, 2 Nov 2020 10:10:42 +0000 (+0300) Subject: LU-13513 osp: make neterr not fatal for precreate_reserve X-Git-Tag: 2.14.51~175 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=4bba67075aa3d8739d8ca99642ff2b2836774479 LU-13513 osp: make neterr not fatal for precreate_reserve When an OST_CREATE (a non-resendable RPC) sent by the precreate thread fails with a network error, osp_pre_update_status() sets d->opd_pre_status to EIO. osp_precreate_reserve() treats EIO as fatal and does not wait for another attempt by the precreate thread. That may cause mdt_intent_open() to return ENOSPC, which confuses the caller; the ENOSPC comes from lod_alloc_rr(). osp_precreate_send(): in case of a network error, switch EIO to ENOTCONN so that osp_precreate_reserve() waits for the precreate thread to retry. A test illustrating the issue is added. Cray-bug-id: LUS-8811 Signed-off-by: Vladimir Saveliev Change-Id: Iffaad9bd16f216f758c784b708e21b525c999b14 Reviewed-on: https://review.whamcloud.com/38472 Tested-by: jenkins Reviewed-by: Andreas Dilger Tested-by: Maloo Reviewed-by: Alex Zhuravlev Reviewed-by: Oleg Drokin --- diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index c79a62b..085a71f 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -453,6 +453,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_PTLRPC_RESEND_RACE 0x525 #define OBD_FAIL_PTLRPC_ROUND_XID 0x530 #define OBD_FAIL_PTLRPC_CONNECT_RACE 0x531 +#define OBD_FAIL_NET_ERROR_RPC 0x532 #define OBD_FAIL_OBD_PING_NET 0x600 /* OBD_FAIL_OBD_LOG_CANCEL_NET 0x601 obsolete since 1.5 */ @@ -703,6 +704,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_OSP_CANT_PROCESS_LLOG 0x2105 #define OBD_FAIL_OSP_INVALID_LOGID 0x2106 #define OBD_FAIL_OSP_CON_EVENT_DELAY 0x2107 +#define OBD_FAIL_OSP_PRECREATE_PAUSE 0x2108 /* barrier */ #define OBD_FAIL_MGS_BARRIER_READ_NET 0x2200 diff --git a/lustre/osp/osp_precreate.c b/lustre/osp/osp_precreate.c index bbbf7c4..60ecc66 100644 --- a/lustre/osp/osp_precreate.c +++ 
b/lustre/osp/osp_precreate.c @@ -675,6 +675,9 @@ static int osp_precreate_send(const struct lu_env *env, struct osp_device *d) if (rc) { CERROR("%s: can't precreate: rc = %d\n", d->opd_obd->obd_name, rc); + if (req->rq_net_err) + /* have osp_precreate_reserve() to wait for repeat */ + rc = -ENOTCONN; GOTO(out_req, rc); } LASSERT(req->rq_transno == 0); @@ -722,6 +725,9 @@ out_req: osp_pre_update_status(d, rc); wake_up(&d->opd_pre_user_waitq); + /* pause to let osp_precreate_reserve to go first */ + CFS_FAIL_TIMEOUT(OBD_FAIL_OSP_PRECREATE_PAUSE, 2); + ptlrpc_req_finished(req); RETURN(rc); } diff --git a/lustre/ptlrpc/events.c b/lustre/ptlrpc/events.c index 7296a23..8d569a7 100644 --- a/lustre/ptlrpc/events.c +++ b/lustre/ptlrpc/events.c @@ -56,6 +56,12 @@ void request_out_callback(struct lnet_event *ev) LASSERT(ev->type == LNET_EVENT_SEND || ev->type == LNET_EVENT_UNLINK); LASSERT(ev->unlinked); + if (unlikely(lustre_msg_get_opc(req->rq_reqmsg) == cfs_fail_val && + CFS_FAIL_CHECK_RESET(OBD_FAIL_NET_ERROR_RPC, + OBD_FAIL_OSP_PRECREATE_PAUSE | + CFS_FAIL_ONCE))) + ev->status = -ECONNABORTED; + DEBUG_REQ(D_NET, req, "type %d, status %d", ev->type, ev->status); /* Do not update imp_next_ping for connection request */ diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index b25855a..af78fe5 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -25435,6 +25435,34 @@ test_820() { } run_test 820 "update max EA from open intent" +test_822() { + local p="$TMP/$TESTSUITE-$TESTNAME.parameters" + + save_lustre_params mds1 \ + "osp.$FSNAME-OST*-osc-MDT0000.max_create_count" > $p + do_facet $SINGLEMDS "$LCTL set_param -n \ + osp.$FSNAME-OST*MDT0000.max_create_count=0" + do_facet $SINGLEMDS "$LCTL set_param -n \ + osp.$FSNAME-OST0000*MDT0000.max_create_count=20000" + + # wait for statfs update to clear OS_STATFS_NOPRECREATE + local maxage=$(do_facet mds1 $LCTL get_param -n \ + osp.$FSNAME-OST0000*MDT0000.maxage) + sleep $((maxage + 1)) + + #define 
OBD_FAIL_NET_ERROR_RPC 0x532 + do_facet mds1 "$LCTL set_param fail_loc=0x80000532 fail_val=5" + + stack_trap "restore_lustre_params < $p; rm $p" + + local count=$(do_facet $SINGLEMDS "lctl get_param -n \ + osp.$FSNAME-OST0000*MDT0000.create_count") + for i in $(seq 1 $count); do + touch $DIR/$tfile.${i} || error "touch failed" + done +} +run_test 822 "test precreate failure" + # # tests that do cleanup/setup should be run at the end #