From f0d5693584bc70913541201c54c3666a473715b2 Mon Sep 17 00:00:00 2001 From: Niu Yawei Date: Fri, 11 Nov 2016 06:33:03 -0500 Subject: [PATCH] LU-8826 recovery: don't shrink recovery hard time 1. Don't shrink the recovery hard time limit; otherwise the soft limit will be very close to the hard limit, and the hard timer can easily expire during the first recovery stage (waiting for all clients to connect), so VBR won't take effect at all. Added replay-vbr test_13 to verify this. 2. When exp_need_sync is set, all transactions on MDT/OST should be synchronous. 3. Removed unused target_client_add_cb(). Signed-off-by: Niu Yawei Change-Id: Idbb011f2772d5c779c53c3e990ceef5a386e4b5b Reviewed-on: https://review.whamcloud.com/23716 Tested-by: Jenkins Reviewed-by: Lai Siyao Tested-by: Maloo Reviewed-by: Alex Zhuravlev Reviewed-by: Oleg Drokin --- lustre/include/lustre_lib.h | 2 -- lustre/include/obd_support.h | 2 ++ lustre/ldlm/ldlm_lib.c | 27 ++++++++++-------------- lustre/obdclass/obd_mount_server.c | 11 ++++------ lustre/ofd/ofd_obd.c | 2 +- lustre/target/tgt_lastrcvd.c | 11 +++++++++- lustre/tests/replay-vbr.sh | 42 ++++++++++++++++++++++++++++++++++++++ 7 files changed, 70 insertions(+), 27 deletions(-) diff --git a/lustre/include/lustre_lib.h b/lustre/include/lustre_lib.h index b4eb57f..c07366d 100644 --- a/lustre/include/lustre_lib.h +++ b/lustre/include/lustre_lib.h @@ -65,8 +65,6 @@ struct l_wait_info; #define LP_POISON ((void *)LL_POISON) #ifdef HAVE_SERVER_SUPPORT -void target_client_add_cb(struct obd_device *obd, __u64 transno, void *cb_data, - int error); int rev_import_init(struct obd_export *exp); int target_handle_connect(struct ptlrpc_request *req); int target_handle_disconnect(struct ptlrpc_request *req); diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index d9d6359..480ea07 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -452,6 +452,8 @@ extern char obd_jobid_var[]; #define OBD_FAIL_TGT_REPLAY_RECONNECT 0x715 #define 
OBD_FAIL_TGT_MOUNT_RACE 0x716 #define OBD_FAIL_TGT_REPLAY_TIMEOUT 0x717 +#define OBD_FAIL_TGT_CLIENT_DEL 0x718 +#define OBD_FAIL_TGT_SLUGGISH_NET 0x719 #define OBD_FAIL_MDC_REVALIDATE_PAUSE 0x800 #define OBD_FAIL_MDC_ENQUEUE_PAUSE 0x801 diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index b916701..acb3aae 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -834,20 +834,6 @@ out_already: RETURN(EALREADY); } -void target_client_add_cb(struct obd_device *obd, __u64 transno, void *cb_data, - int error) -{ - struct obd_export *exp = cb_data; - - CDEBUG(D_RPCTRACE, "%s: committing for initial connect of %s\n", - obd->obd_name, exp->exp_client_uuid.uuid); - - spin_lock(&exp->exp_lock); - exp->exp_need_sync = 0; - spin_unlock(&exp->exp_lock); - class_export_cb_put(exp); -} - static void check_and_start_recovery_timer(struct obd_device *obd, struct ptlrpc_request *req, int new_client); @@ -1794,8 +1780,13 @@ static void extend_recovery_timer(struct obd_device *obd, int drt, bool extend) to = drt; } - if (to > obd->obd_recovery_time_hard) - to = obd->obd_recovery_time_hard; + if (to > obd->obd_recovery_time_hard) { + to = obd->obd_recovery_time_hard; + CWARN("%s: extended recovery timer reaching hard " + "limit: %d, extend: %d\n", + obd->obd_name, to, extend); + } + if (obd->obd_recovery_timeout < to) { obd->obd_recovery_timeout = to; end = obd->obd_recovery_start + to; @@ -1839,6 +1830,10 @@ check_and_start_recovery_timer(struct obd_device *obd, * and reuse service_time to limit stack usage. */ service_time = at_est2timeout(service_time); + if (OBD_FAIL_CHECK(OBD_FAIL_TGT_SLUGGISH_NET) && + service_time < at_extra) + service_time = at_extra; + /* We expect other clients to timeout within service_time, then try * to reconnect, then try the failover server. The max delay between * connect attempts is SWITCH_MAX + SWITCH_INC + INITIAL. 
*/ diff --git a/lustre/obdclass/obd_mount_server.c b/lustre/obdclass/obd_mount_server.c index 8b5bf91..3f621dc 100644 --- a/lustre/obdclass/obd_mount_server.c +++ b/lustre/obdclass/obd_mount_server.c @@ -1904,23 +1904,20 @@ void server_calc_timeout(struct lustre_sb_info *lsi, struct obd_device *obd) if (has_ir) { int new_soft = soft; - int new_hard = hard; /* adjust timeout value by imperative recovery */ - new_soft = (soft * factor) / OBD_IR_FACTOR_MAX; - new_hard = (hard * factor) / OBD_IR_FACTOR_MAX; - /* make sure the timeout is not too short */ new_soft = max(min, new_soft); - new_hard = max(new_soft, new_hard); LCONSOLE_INFO("%s: Imperative Recovery enabled, recovery " "window shrunk from %d-%d down to %d-%d\n", - obd->obd_name, soft, hard, new_soft, new_hard); + obd->obd_name, soft, hard, new_soft, hard); soft = new_soft; - hard = new_hard; + } else { + LCONSOLE_INFO("%s: Imperative Recovery not enabled, recovery " + "window %d-%d\n", obd->obd_name, soft, hard); } /* we're done */ diff --git a/lustre/ofd/ofd_obd.c b/lustre/ofd/ofd_obd.c index 957aac5..28debc1 100644 --- a/lustre/ofd/ofd_obd.c +++ b/lustre/ofd/ofd_obd.c @@ -237,7 +237,7 @@ static int ofd_parse_connect_data(const struct lu_env *env, if (!(lsd->lsd_feature_compat & OBD_COMPAT_OST)) { /* this will only happen on the first connect */ lsd->lsd_feature_compat |= OBD_COMPAT_OST; - /* sync is not needed here as lut_client_add will + /* sync is not needed here as tgt_client_new will * set exp_need_sync flag */ tgt_server_data_update(env, &ofd->ofd_lut, 0); } diff --git a/lustre/target/tgt_lastrcvd.c b/lustre/target/tgt_lastrcvd.c index c3811cd..8be1a07 100644 --- a/lustre/target/tgt_lastrcvd.c +++ b/lustre/target/tgt_lastrcvd.c @@ -812,6 +812,10 @@ out: OBD_FREE_PTR(ccb); } +/** + * Add commit callback function, it returns a non-zero value to inform + * caller to use sync transaction if necessary. 
+ */ int tgt_last_commit_cb_add(struct thandle *th, struct lu_target *tgt, struct obd_export *exp, __u64 transno) { @@ -842,7 +846,9 @@ int tgt_last_commit_cb_add(struct thandle *th, struct lu_target *tgt, /* report failure to force synchronous operation */ return -EPERM; - return rc; + /* if exp_need_sync is set, return non-zero value to force + * a sync transaction. */ + return rc ? rc : exp->exp_need_sync; } struct tgt_new_client_callback { @@ -1073,6 +1079,9 @@ int tgt_client_del(const struct lu_env *env, struct obd_export *exp) if (exp->exp_flags & OBD_OPT_FAILOVER) RETURN(0); + if (OBD_FAIL_CHECK(OBD_FAIL_TGT_CLIENT_DEL)) + RETURN(0); + /* Make sure the server's last_transno is up to date. * This should be done before zeroing client slot so last_transno will * be in server data or in client data in case of failure */ diff --git a/lustre/tests/replay-vbr.sh b/lustre/tests/replay-vbr.sh index c475a8c..3fb72d3 100755 --- a/lustre/tests/replay-vbr.sh +++ b/lustre/tests/replay-vbr.sh @@ -1184,6 +1184,48 @@ test_12a() { # former test_2a } run_test 12a "lost data due to missed REMOTE client during replay" +test_13() { # LU-8826 + local var=${SINGLEMDS}_svc + + if combined_mgs_mds ; then + skip "Needs separate MGS to enable IR" + return 0 + fi + + do_facet $SINGLEMDS "$LCTL set_param mdd.${!var}.sync_permission=0" + do_facet $SINGLEMDS "$LCTL set_param mdt.${!var}.commit_on_sharing=0" + + zconf_mount $CLIENT2 $MOUNT2 + do_node $CLIENT1 openfile -f O_RDWR:O_CREAT -m 0644 $DIR/$tfile + + # set ir_timeout to a reasonable small value + local ir_timeout=$(do_facet mgs $LCTL get_param -n mgs.*.ir_timeout) + do_facet mgs $LCTL set_param mgs.*.ir_timeout=5 + # make sure IR functional + sleep 5 + + replay_barrier $SINGLEMDS + do_node $CLIENT1 chmod 666 $DIR/$tfile + do_node $CLIENT2 chmod 777 $DIR2/$tfile + + # make sure client data of $CLIENT2:$MOUNT2 is remained + # define OBD_FAIL_TGT_CLIENT_DEL 0x718 + do_facet $SINGLEMDS $LCTL set_param fail_loc=0x718 + zconf_umount 
$CLIENT2 $MOUNT2 + # define OBD_FAIL_TGT_SLUGGISH_NET 0x719 + do_facet $SINGLEMDS $LCTL set_param fail_loc=0x719 + facet_failover $SINGLEMDS + + client_up $CLIENT1 || error "$CLIENT1 evicted" + + do_facet $SINGLEMDS $LCTL set_param fail_loc=0 + do_facet mgs $LCTL set_param mgs.*.ir_timeout=$ir_timeout + + do_node $CLIENT1 $CHECKSTAT -p 0666 $DIR/$tfile || + error "$DIR/$tfile-a: unexpected state" +} +run_test 13 "Shouldn't give up VBR easily on sluggish network" + #restore COS setting restore_lustre_params < $cos_param_file rm -f $cos_param_file -- 1.8.3.1