Whamcloud - gitweb
LU-11601 ptlrpc: IR doesn't reconnect after EAGAIN 57/33557/9
author: Sergey Cheremencev <c17829@cray.com>
Tue, 3 Jul 2018 12:45:01 +0000 (15:45 +0300)
committer: Oleg Drokin <green@whamcloud.com>
Sat, 13 Apr 2019 04:49:29 +0000 (04:49 +0000)
There is a chance that a client connects to an OST
before recovery starts, while the OST is not yet configured.
In that case the OST returns EAGAIN (target->obd_no_conn == 1).
This is not a problem when pinger_recov is enabled,
because ptlrpc_pinger_main will reconnect later.
But the client doesn't reconnect when pinger_recov is 0.

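Move the setting of imp_connect_error to ptlrpc_connect_interpret,
so that it stores only connection errors.
Make the pinger initiate recovery for a disconnected import when
imp_connect_error is -EAGAIN, even if pinger recovery is disabled.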

Cray-bug-id: LUS-2034
Change-Id: I35ad57e43825162f4056ad346e22a8dddea0e191
Signed-off-by: Sergey Cheremencev <c17829@cray.com>
Reviewed-on: https://es-gerrit.dev.cray.com/153542
Reviewed-by: Alexey Lyashkov <c17817@cray.com>
Tested-by: Elena Gryaznova <c17455@cray.com>
Reviewed-by: Vitaly Fertman <c17818@cray.com>
Reviewed-on: https://review.whamcloud.com/33557
Tested-by: Jenkins
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Alexandr Boyko <c17825@cray.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/obd_support.h
lustre/ofd/ofd_dev.c
lustre/ptlrpc/client.c
lustre/ptlrpc/import.c
lustre/ptlrpc/pinger.c
lustre/tests/recovery-small.sh

index 0d522cb..d4a2029 100644 (file)
@@ -342,6 +342,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_OST_INTEGRITY_CMP      0x244
 #define OBD_FAIL_OST_DISCONNECT_DELAY   0x245
 #define OBD_FAIL_OST_DELAY_TRANS        0x246
 #define OBD_FAIL_OST_INTEGRITY_CMP      0x244
 #define OBD_FAIL_OST_DISCONNECT_DELAY   0x245
 #define OBD_FAIL_OST_DELAY_TRANS        0x246
+#define OBD_FAIL_OST_PREPARE_DELAY      0x247
 
 #define OBD_FAIL_LDLM                    0x300
 #define OBD_FAIL_LDLM_NAMESPACE_NEW      0x301
 
 #define OBD_FAIL_LDLM                    0x300
 #define OBD_FAIL_LDLM_NAMESPACE_NEW      0x301
index a8cf364..1f0fbdc 100644 (file)
@@ -633,6 +633,8 @@ static int ofd_prepare(const struct lu_env *env, struct lu_device *pdev,
        LASSERTF(rc == 0, "register namespace failed: rc = %d\n", rc);
 
        target_recovery_init(&ofd->ofd_lut, tgt_request_handle);
        LASSERTF(rc == 0, "register namespace failed: rc = %d\n", rc);
 
        target_recovery_init(&ofd->ofd_lut, tgt_request_handle);
+       OBD_FAIL_TIMEOUT_ORSET(OBD_FAIL_OST_PREPARE_DELAY, OBD_FAIL_ONCE,
+                              (OBD_TIMEOUT_DEFAULT + 1) / 4);
        LASSERT(obd->obd_no_conn);
        spin_lock(&obd->obd_dev_lock);
        obd->obd_no_conn = 0;
        LASSERT(obd->obd_no_conn);
        spin_lock(&obd->obd_dev_lock);
        obd->obd_no_conn = 0;
index 0cd73d7..064c468 100644 (file)
@@ -1445,8 +1445,7 @@ static int after_reply(struct ptlrpc_request *req)
         ptlrpc_at_adj_net_latency(req,
                                   lustre_msg_get_service_time(req->rq_repmsg));
 
         ptlrpc_at_adj_net_latency(req,
                                   lustre_msg_get_service_time(req->rq_repmsg));
 
-        rc = ptlrpc_check_status(req);
-        imp->imp_connect_error = rc;
+       rc = ptlrpc_check_status(req);
 
        if (rc) {
                /*
 
        if (rc) {
                /*
index b91dc59..174c12b 100644 (file)
@@ -987,6 +987,7 @@ static int ptlrpc_connect_interpret(const struct lu_env *env,
                RETURN(0);
        }
 
                RETURN(0);
        }
 
+       imp->imp_connect_error = rc;
        if (rc) {
                struct ptlrpc_request *free_req;
                struct ptlrpc_request *tmp;
        if (rc) {
                struct ptlrpc_request *free_req;
                struct ptlrpc_request *tmp;
index ef808d8..dc6981b 100644 (file)
@@ -247,14 +247,15 @@ static void ptlrpc_pinger_process_import(struct obd_import *imp,
               ptlrpc_import_state_name(level), level, force, force_next,
               imp->imp_deactive, imp->imp_pingable, suppress);
 
               ptlrpc_import_state_name(level), level, force, force_next,
               imp->imp_deactive, imp->imp_pingable, suppress);
 
-        if (level == LUSTRE_IMP_DISCON && !imp_is_deactive(imp)) {
-                /* wait for a while before trying recovery again */
-                imp->imp_next_ping = ptlrpc_next_reconnect(imp);
-                if (!imp->imp_no_pinger_recover)
-                        ptlrpc_initiate_recovery(imp);
-        } else if (level != LUSTRE_IMP_FULL ||
-                   imp->imp_obd->obd_no_recov ||
-                   imp_is_deactive(imp)) {
+       if (level == LUSTRE_IMP_DISCON && !imp_is_deactive(imp)) {
+               /* wait for a while before trying recovery again */
+               imp->imp_next_ping = ptlrpc_next_reconnect(imp);
+               if (!imp->imp_no_pinger_recover ||
+                   imp->imp_connect_error == -EAGAIN)
+                       ptlrpc_initiate_recovery(imp);
+       } else if (level != LUSTRE_IMP_FULL ||
+                  imp->imp_obd->obd_no_recov ||
+                  imp_is_deactive(imp)) {
                CDEBUG(D_HA, "%s->%s: not pinging (in recovery "
                       "or recovery disabled: %s)\n",
                       imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd),
                CDEBUG(D_HA, "%s->%s: not pinging (in recovery "
                       "or recovery disabled: %s)\n",
                       imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd),
index 87f9986..231ad43 100755 (executable)
@@ -1756,26 +1756,45 @@ test_100()
 }
 run_test 100 "IR: Make sure normal recovery still works w/o IR"
 
 }
 run_test 100 "IR: Make sure normal recovery still works w/o IR"
 
-test_101()
+test_101a()
 {
 {
-        do_facet mgs $LCTL list_param mgs.*.ir_timeout ||
-                { skip "MGS without IR support"; return 0; }
+       do_facet mgs $LCTL list_param mgs.*.ir_timeout ||
+               skip "MGS without IR support"
 
 
-        set_ir_status full
+       set_ir_status full
 
 
-        local OST1_IMP=$(get_osc_import_name client ost1)
+       local ost1_imp=$(get_osc_import_name client ost1)
 
 
-        # disable pinger recovery
-        lctl set_param -n osc.$OST1_IMP.pinger_recov=0
+       # disable pinger recovery
+       lctl set_param -n osc.$ost1_imp.pinger_recov=0
+       stack_trap "$LCTL set_param -n osc.$ost1_imp.pinger_recov=1" EXIT
 
 
-        fail ost1
+       fail ost1
 
 
-        target_instance_match ost1 || error "instance mismatch"
-        nidtbl_versions_match || error "version must match"
+       target_instance_match ost1 || error "instance mismatch"
+       nidtbl_versions_match || error "version must match"
+}
+run_test 101a "IR: Make sure IR works w/o normal recovery"
 
 
-        lctl set_param -n osc.$OST1_IMP.pinger_recov=1
+test_101b()
+{
+       do_facet mgs $LCTL list_param mgs.*.ir_timeout ||
+               skip "MGS without IR support"
+
+       set_ir_status full
+
+       local ost1_imp=$(get_osc_import_name client ost1)
+
+#define OBD_FAIL_OST_PREPARE_DELAY      0x247
+       do_facet ost1 "$LCTL set_param fail_loc=0x247"
+       # disable pinger recovery
+       $LCTL set_param -n osc.$ost1_imp.pinger_recov=0
+       stack_trap "$LCTL set_param -n osc.$ost1_imp.pinger_recov=1" EXIT
+
+#OST may return EAGAIN if it is not configured yet
+       fail ost1
 }
 }
-run_test 101 "IR: Make sure IR works w/o normal recovery"
+run_test 101b "IR: Make sure IR works w/o normal recovery and proceed EAGAIN"
 
 test_102()
 {
 
 test_102()
 {