Whamcloud - gitweb
LU-13513 osp: make neterr not fatal for precreate_reserve 72/38472/10
author Vladimir Saveliev <c17830@cray.com>
Mon, 2 Nov 2020 10:10:42 +0000 (13:10 +0300)
committer Oleg Drokin <green@whamcloud.com>
Fri, 26 Feb 2021 08:20:47 +0000 (08:20 +0000)
When an OST_CREATE RPC (which is not resendable) sent by the precreate
thread fails with a network error, osp_pre_update_status() sets
d->opd_pre_status to EIO. osp_precreate_reserve() treats EIO as fatal
and does not wait for another attempt by the precreate thread. That may
cause mdt_intent_open() to return ENOSPC, which confuses the caller.
The ENOSPC comes from lod_alloc_rr().

osp_precreate_send(): on a network error, return ENOTCONN instead of
EIO so that osp_precreate_reserve() waits for the precreate to be
retried.

A test illustrating the issue is added.

Cray-bug-id: LUS-8811
Signed-off-by: Vladimir Saveliev <c17830@cray.com>
Change-Id: Iffaad9bd16f216f758c784b708e21b525c999b14
Reviewed-on: https://review.whamcloud.com/38472
Tested-by: jenkins <devops@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Alex Zhuravlev <bzzz@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/obd_support.h
lustre/osp/osp_precreate.c
lustre/ptlrpc/events.c
lustre/tests/sanity.sh

index c79a62b..085a71f 100644 (file)
@@ -453,6 +453,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_PTLRPC_RESEND_RACE     0x525
 #define OBD_FAIL_PTLRPC_ROUND_XID       0x530
 #define OBD_FAIL_PTLRPC_CONNECT_RACE    0x531
 #define OBD_FAIL_PTLRPC_RESEND_RACE     0x525
 #define OBD_FAIL_PTLRPC_ROUND_XID       0x530
 #define OBD_FAIL_PTLRPC_CONNECT_RACE    0x531
+#define OBD_FAIL_NET_ERROR_RPC          0x532
 
 #define OBD_FAIL_OBD_PING_NET            0x600
 /*     OBD_FAIL_OBD_LOG_CANCEL_NET      0x601 obsolete since 1.5 */
 
 #define OBD_FAIL_OBD_PING_NET            0x600
 /*     OBD_FAIL_OBD_LOG_CANCEL_NET      0x601 obsolete since 1.5 */
@@ -703,6 +704,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_OSP_CANT_PROCESS_LLOG         0x2105
 #define OBD_FAIL_OSP_INVALID_LOGID             0x2106
 #define OBD_FAIL_OSP_CON_EVENT_DELAY           0x2107
 #define OBD_FAIL_OSP_CANT_PROCESS_LLOG         0x2105
 #define OBD_FAIL_OSP_INVALID_LOGID             0x2106
 #define OBD_FAIL_OSP_CON_EVENT_DELAY           0x2107
+#define OBD_FAIL_OSP_PRECREATE_PAUSE           0x2108
 
 /* barrier */
 #define OBD_FAIL_MGS_BARRIER_READ_NET          0x2200
 
 /* barrier */
 #define OBD_FAIL_MGS_BARRIER_READ_NET          0x2200
index bbbf7c4..60ecc66 100644 (file)
@@ -675,6 +675,9 @@ static int osp_precreate_send(const struct lu_env *env, struct osp_device *d)
        if (rc) {
                CERROR("%s: can't precreate: rc = %d\n", d->opd_obd->obd_name,
                       rc);
        if (rc) {
                CERROR("%s: can't precreate: rc = %d\n", d->opd_obd->obd_name,
                       rc);
+               if (req->rq_net_err)
+                       /* have osp_precreate_reserve() to wait for repeat */
+                       rc = -ENOTCONN;
                GOTO(out_req, rc);
        }
        LASSERT(req->rq_transno == 0);
                GOTO(out_req, rc);
        }
        LASSERT(req->rq_transno == 0);
@@ -722,6 +725,9 @@ out_req:
        osp_pre_update_status(d, rc);
        wake_up(&d->opd_pre_user_waitq);
 
        osp_pre_update_status(d, rc);
        wake_up(&d->opd_pre_user_waitq);
 
+       /* pause to let osp_precreate_reserve to go first */
+       CFS_FAIL_TIMEOUT(OBD_FAIL_OSP_PRECREATE_PAUSE, 2);
+
        ptlrpc_req_finished(req);
        RETURN(rc);
 }
        ptlrpc_req_finished(req);
        RETURN(rc);
 }
index 7296a23..8d569a7 100644 (file)
@@ -56,6 +56,12 @@ void request_out_callback(struct lnet_event *ev)
        LASSERT(ev->type == LNET_EVENT_SEND || ev->type == LNET_EVENT_UNLINK);
        LASSERT(ev->unlinked);
 
        LASSERT(ev->type == LNET_EVENT_SEND || ev->type == LNET_EVENT_UNLINK);
        LASSERT(ev->unlinked);
 
+       if (unlikely(lustre_msg_get_opc(req->rq_reqmsg) == cfs_fail_val &&
+                    CFS_FAIL_CHECK_RESET(OBD_FAIL_NET_ERROR_RPC,
+                                         OBD_FAIL_OSP_PRECREATE_PAUSE |
+                                         CFS_FAIL_ONCE)))
+               ev->status = -ECONNABORTED;
+
        DEBUG_REQ(D_NET, req, "type %d, status %d", ev->type, ev->status);
 
        /* Do not update imp_next_ping for connection request */
        DEBUG_REQ(D_NET, req, "type %d, status %d", ev->type, ev->status);
 
        /* Do not update imp_next_ping for connection request */
index b25855a..af78fe5 100755 (executable)
@@ -25435,6 +25435,34 @@ test_820() {
 }
 run_test 820 "update max EA from open intent"
 
 }
 run_test 820 "update max EA from open intent"
 
+test_822() {
+       local p="$TMP/$TESTSUITE-$TESTNAME.parameters"
+
+       save_lustre_params mds1 \
+               "osp.$FSNAME-OST*-osc-MDT0000.max_create_count" > $p
+       do_facet $SINGLEMDS "$LCTL set_param -n \
+                       osp.$FSNAME-OST*MDT0000.max_create_count=0"
+       do_facet $SINGLEMDS "$LCTL set_param -n \
+                       osp.$FSNAME-OST0000*MDT0000.max_create_count=20000"
+
+       # wait for statfs update to clear OS_STATFS_NOPRECREATE
+       local maxage=$(do_facet mds1 $LCTL get_param -n \
+                      osp.$FSNAME-OST0000*MDT0000.maxage)
+       sleep $((maxage + 1))
+
+       #define OBD_FAIL_NET_ERROR_RPC          0x532
+       do_facet mds1 "$LCTL set_param fail_loc=0x80000532 fail_val=5"
+
+       stack_trap "restore_lustre_params < $p; rm $p"
+
+       local count=$(do_facet $SINGLEMDS "lctl get_param -n \
+                     osp.$FSNAME-OST0000*MDT0000.create_count")
+       for i in $(seq 1 $count); do
+               touch $DIR/$tfile.${i} || error "touch failed"
+       done
+}
+run_test 822 "test precreate failure"
+
 #
 # tests that do cleanup/setup should be run at the end
 #
 #
 # tests that do cleanup/setup should be run at the end
 #