From: Sergey Cheremencev Date: Tue, 3 Jul 2018 12:45:01 +0000 (+0300) Subject: LU-11601 ptlrpc: IR doesn't reconnect after EAGAIN X-Git-Tag: 2.12.53~55 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=refs%2Fchanges%2F57%2F33557%2F9;p=fs%2Flustre-release.git LU-11601 ptlrpc: IR doesn't reconnect after EAGAIN There is a chance that a client connects to an OST before recovery starts, while the OST is not yet configured. In that case the OST returns EAGAIN (target->obd_no_conn == 1). This is not a problem when pinger_recov is enabled, because ptlrpc_pinger_main will reconnect later, but the client does not reconnect when pinger_recov is 0. Move the setting of imp_connect_error to ptlrpc_connect_interpret, so that it stores only connection errors. Cray-bug-id: LUS-2034 Change-Id: I35ad57e43825162f4056ad346e22a8dddea0e191 Signed-off-by: Sergey Cheremencev Reviewed-on: https://es-gerrit.dev.cray.com/153542 Reviewed-by: Alexey Lyashkov Tested-by: Elena Gryaznova Reviewed-by: Vitaly Fertman Reviewed-on: https://review.whamcloud.com/33557 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Alexandr Boyko Reviewed-by: Oleg Drokin --- diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 0d522cb..d4a2029 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -342,6 +342,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_OST_INTEGRITY_CMP 0x244 #define OBD_FAIL_OST_DISCONNECT_DELAY 0x245 #define OBD_FAIL_OST_DELAY_TRANS 0x246 +#define OBD_FAIL_OST_PREPARE_DELAY 0x247 #define OBD_FAIL_LDLM 0x300 #define OBD_FAIL_LDLM_NAMESPACE_NEW 0x301 diff --git a/lustre/ofd/ofd_dev.c b/lustre/ofd/ofd_dev.c index a8cf364..1f0fbdc 100644 --- a/lustre/ofd/ofd_dev.c +++ b/lustre/ofd/ofd_dev.c @@ -633,6 +633,8 @@ static int ofd_prepare(const struct lu_env *env, struct lu_device *pdev, LASSERTF(rc == 0, "register namespace failed: rc = %d\n", rc); target_recovery_init(&ofd->ofd_lut, tgt_request_handle); + 
OBD_FAIL_TIMEOUT_ORSET(OBD_FAIL_OST_PREPARE_DELAY, OBD_FAIL_ONCE, + (OBD_TIMEOUT_DEFAULT + 1) / 4); LASSERT(obd->obd_no_conn); spin_lock(&obd->obd_dev_lock); obd->obd_no_conn = 0; diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index 0cd73d7..064c468 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -1445,8 +1445,7 @@ static int after_reply(struct ptlrpc_request *req) ptlrpc_at_adj_net_latency(req, lustre_msg_get_service_time(req->rq_repmsg)); - rc = ptlrpc_check_status(req); - imp->imp_connect_error = rc; + rc = ptlrpc_check_status(req); if (rc) { /* diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index b91dc59..174c12b 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -987,6 +987,7 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, RETURN(0); } + imp->imp_connect_error = rc; if (rc) { struct ptlrpc_request *free_req; struct ptlrpc_request *tmp; diff --git a/lustre/ptlrpc/pinger.c b/lustre/ptlrpc/pinger.c index ef808d8..dc6981b 100644 --- a/lustre/ptlrpc/pinger.c +++ b/lustre/ptlrpc/pinger.c @@ -247,14 +247,15 @@ static void ptlrpc_pinger_process_import(struct obd_import *imp, ptlrpc_import_state_name(level), level, force, force_next, imp->imp_deactive, imp->imp_pingable, suppress); - if (level == LUSTRE_IMP_DISCON && !imp_is_deactive(imp)) { - /* wait for a while before trying recovery again */ - imp->imp_next_ping = ptlrpc_next_reconnect(imp); - if (!imp->imp_no_pinger_recover) - ptlrpc_initiate_recovery(imp); - } else if (level != LUSTRE_IMP_FULL || - imp->imp_obd->obd_no_recov || - imp_is_deactive(imp)) { + if (level == LUSTRE_IMP_DISCON && !imp_is_deactive(imp)) { + /* wait for a while before trying recovery again */ + imp->imp_next_ping = ptlrpc_next_reconnect(imp); + if (!imp->imp_no_pinger_recover || + imp->imp_connect_error == -EAGAIN) + ptlrpc_initiate_recovery(imp); + } else if (level != LUSTRE_IMP_FULL || + imp->imp_obd->obd_no_recov || + imp_is_deactive(imp)) { CDEBUG(D_HA, 
"%s->%s: not pinging (in recovery " "or recovery disabled: %s)\n", imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd), diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index 87f9986..231ad43 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -1756,26 +1756,45 @@ test_100() } run_test 100 "IR: Make sure normal recovery still works w/o IR" -test_101() +test_101a() { - do_facet mgs $LCTL list_param mgs.*.ir_timeout || - { skip "MGS without IR support"; return 0; } + do_facet mgs $LCTL list_param mgs.*.ir_timeout || + skip "MGS without IR support" - set_ir_status full + set_ir_status full - local OST1_IMP=$(get_osc_import_name client ost1) + local ost1_imp=$(get_osc_import_name client ost1) - # disable pinger recovery - lctl set_param -n osc.$OST1_IMP.pinger_recov=0 + # disable pinger recovery + lctl set_param -n osc.$ost1_imp.pinger_recov=0 + stack_trap "$LCTL set_param -n osc.$ost1_imp.pinger_recov=1" EXIT - fail ost1 + fail ost1 - target_instance_match ost1 || error "instance mismatch" - nidtbl_versions_match || error "version must match" + target_instance_match ost1 || error "instance mismatch" + nidtbl_versions_match || error "version must match" +} +run_test 101a "IR: Make sure IR works w/o normal recovery" - lctl set_param -n osc.$OST1_IMP.pinger_recov=1 +test_101b() +{ + do_facet mgs $LCTL list_param mgs.*.ir_timeout || + skip "MGS without IR support" + + set_ir_status full + + local ost1_imp=$(get_osc_import_name client ost1) + +#define OBD_FAIL_OST_PREPARE_DELAY 0x247 + do_facet ost1 "$LCTL set_param fail_loc=0x247" + # disable pinger recovery + $LCTL set_param -n osc.$ost1_imp.pinger_recov=0 + stack_trap "$LCTL set_param -n osc.$ost1_imp.pinger_recov=1" EXIT + +#OST may return EAGAIN if it is not configured yet + fail ost1 } -run_test 101 "IR: Make sure IR works w/o normal recovery" +run_test 101b "IR: Make sure IR works w/o normal recovery and proceed EAGAIN" test_102() {