There is a chance that a client connects to an OST
before recovery starts, while the OST is not yet configured.
In that case the OST returns EAGAIN (target->obd_no_conn == 1).
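This is not a problem when pinger_recov is enabled,
because ptlrpc_pinger_main will reconnect later.
But the client does not reconnect when pinger_recov is 0.
Let the pinger initiate recovery in that case as well
when imp_connect_error is -EAGAIN.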
Move the setting of imp_connect_error to ptlrpc_connect_interpret,
so that it stores only connection errors.
Cray-bug-id: LUS-2034
Change-Id: I35ad57e43825162f4056ad346e22a8dddea0e191
Signed-off-by: Sergey Cheremencev <c17829@cray.com>
Reviewed-on: https://es-gerrit.dev.cray.com/153542
Reviewed-by: Alexey Lyashkov <c17817@cray.com>
Tested-by: Elena Gryaznova <c17455@cray.com>
Reviewed-by: Vitaly Fertman <c17818@cray.com>
Reviewed-on: https://review.whamcloud.com/33557
Tested-by: Jenkins
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Alexandr Boyko <c17825@cray.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
#define OBD_FAIL_OST_INTEGRITY_CMP 0x244
#define OBD_FAIL_OST_DISCONNECT_DELAY 0x245
#define OBD_FAIL_OST_DELAY_TRANS 0x246
#define OBD_FAIL_OST_INTEGRITY_CMP 0x244
#define OBD_FAIL_OST_DISCONNECT_DELAY 0x245
#define OBD_FAIL_OST_DELAY_TRANS 0x246
+#define OBD_FAIL_OST_PREPARE_DELAY 0x247
#define OBD_FAIL_LDLM 0x300
#define OBD_FAIL_LDLM_NAMESPACE_NEW 0x301
#define OBD_FAIL_LDLM 0x300
#define OBD_FAIL_LDLM_NAMESPACE_NEW 0x301
LASSERTF(rc == 0, "register namespace failed: rc = %d\n", rc);
target_recovery_init(&ofd->ofd_lut, tgt_request_handle);
LASSERTF(rc == 0, "register namespace failed: rc = %d\n", rc);
target_recovery_init(&ofd->ofd_lut, tgt_request_handle);
+ OBD_FAIL_TIMEOUT_ORSET(OBD_FAIL_OST_PREPARE_DELAY, OBD_FAIL_ONCE,
+ (OBD_TIMEOUT_DEFAULT + 1) / 4);
LASSERT(obd->obd_no_conn);
spin_lock(&obd->obd_dev_lock);
obd->obd_no_conn = 0;
LASSERT(obd->obd_no_conn);
spin_lock(&obd->obd_dev_lock);
obd->obd_no_conn = 0;
ptlrpc_at_adj_net_latency(req,
lustre_msg_get_service_time(req->rq_repmsg));
ptlrpc_at_adj_net_latency(req,
lustre_msg_get_service_time(req->rq_repmsg));
- rc = ptlrpc_check_status(req);
- imp->imp_connect_error = rc;
+ rc = ptlrpc_check_status(req);
+ imp->imp_connect_error = rc;
if (rc) {
struct ptlrpc_request *free_req;
struct ptlrpc_request *tmp;
if (rc) {
struct ptlrpc_request *free_req;
struct ptlrpc_request *tmp;
ptlrpc_import_state_name(level), level, force, force_next,
imp->imp_deactive, imp->imp_pingable, suppress);
ptlrpc_import_state_name(level), level, force, force_next,
imp->imp_deactive, imp->imp_pingable, suppress);
- if (level == LUSTRE_IMP_DISCON && !imp_is_deactive(imp)) {
- /* wait for a while before trying recovery again */
- imp->imp_next_ping = ptlrpc_next_reconnect(imp);
- if (!imp->imp_no_pinger_recover)
- ptlrpc_initiate_recovery(imp);
- } else if (level != LUSTRE_IMP_FULL ||
- imp->imp_obd->obd_no_recov ||
- imp_is_deactive(imp)) {
+ if (level == LUSTRE_IMP_DISCON && !imp_is_deactive(imp)) {
+ /* wait for a while before trying recovery again */
+ imp->imp_next_ping = ptlrpc_next_reconnect(imp);
+ if (!imp->imp_no_pinger_recover ||
+ imp->imp_connect_error == -EAGAIN)
+ ptlrpc_initiate_recovery(imp);
+ } else if (level != LUSTRE_IMP_FULL ||
+ imp->imp_obd->obd_no_recov ||
+ imp_is_deactive(imp)) {
CDEBUG(D_HA, "%s->%s: not pinging (in recovery "
"or recovery disabled: %s)\n",
imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd),
CDEBUG(D_HA, "%s->%s: not pinging (in recovery "
"or recovery disabled: %s)\n",
imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd),
}
run_test 100 "IR: Make sure normal recovery still works w/o IR"
}
run_test 100 "IR: Make sure normal recovery still works w/o IR"
- do_facet mgs $LCTL list_param mgs.*.ir_timeout ||
- { skip "MGS without IR support"; return 0; }
+ do_facet mgs $LCTL list_param mgs.*.ir_timeout ||
+ skip "MGS without IR support"
- local OST1_IMP=$(get_osc_import_name client ost1)
+ local ost1_imp=$(get_osc_import_name client ost1)
- # disable pinger recovery
- lctl set_param -n osc.$OST1_IMP.pinger_recov=0
+ # disable pinger recovery
+ lctl set_param -n osc.$ost1_imp.pinger_recov=0
+ stack_trap "$LCTL set_param -n osc.$ost1_imp.pinger_recov=1" EXIT
- target_instance_match ost1 || error "instance mismatch"
- nidtbl_versions_match || error "version must match"
+ target_instance_match ost1 || error "instance mismatch"
+ nidtbl_versions_match || error "version must match"
+}
+run_test 101a "IR: Make sure IR works w/o normal recovery"
- lctl set_param -n osc.$OST1_IMP.pinger_recov=1
+test_101b()
+{
+ do_facet mgs $LCTL list_param mgs.*.ir_timeout ||
+ skip "MGS without IR support"
+
+ set_ir_status full
+
+ local ost1_imp=$(get_osc_import_name client ost1)
+
+#define OBD_FAIL_OST_PREPARE_DELAY 0x247
+ do_facet ost1 "$LCTL set_param fail_loc=0x247"
+ # disable pinger recovery
+ $LCTL set_param -n osc.$ost1_imp.pinger_recov=0
+ stack_trap "$LCTL set_param -n osc.$ost1_imp.pinger_recov=1" EXIT
+
+#OST may return EAGAIN if it is not configured yet
+ fail ost1
-run_test 101 "IR: Make sure IR works w/o normal recovery"
+run_test 101b "IR: Make sure IR works w/o normal recovery and proceed EAGAIN"