The client doesn't restore the import state correctly
on reconnect during replay. It resends lock replay
even though the final ping was already queued by the server.
Server fails with "target_queue_recovery_request())
ASSERTION( req->rq_export->exp_lock_replay_needed ) failed"
Add imp_replay_state to store last replay state.
imp_state is restored from imp_replay_state
during reconnect.
Lustre-commit:
f61cec84e61e4be07ab741dd0fbeac3b4a388eef
Lustre-change: http://review.whamcloud.com/#/c/12015
The new test for replay-single will only work for lustre
versions that contain commit
f61cec84. This patch does
Lustre version checking to avoid failing on platforms that
are not fixed.
Lustre-commit:
afde9f17260650d0cb80d53613fb5afda0a39384
Lustre-change: http://review.whamcloud.com/#/c/12942
Xyratex-bug-id: MRP-2022
Signed-off-by: Andriy Skulysh <Andriy_Skulysh@xyratex.com>
Signed-off-by: Bob Glossman <bob.glossman@intel.com>
Change-Id: Iaa14fe968cc31f266b605785df4fa676083fbca4
Reviewed-on: http://review.whamcloud.com/12163
Tested-by: Jenkins
Reviewed-by: Jian Yu <jian.yu@intel.com>
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: James Simmons <uja.ornl@gmail.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
cfs_atomic_t imp_timeouts;
/** Current import state */
enum lustre_imp_state imp_state;
+ /** Last replay state */
+ enum lustre_imp_state imp_replay_state;
/** History of import states */
struct import_state_hist imp_state_hist[IMP_STATE_HIST_LEN];
int imp_state_hist_idx;
#define OBD_FAIL_TGT_CLIENT_ADD 0x711
#define OBD_FAIL_TGT_RCVG_FLAG 0x712
#define OBD_FAIL_TGT_REPLAY_DELAY2 0x714
+#define OBD_FAIL_TGT_REPLAY_RECONNECT 0x715
#define OBD_FAIL_MDC_REVALIDATE_PAUSE 0x800
#define OBD_FAIL_MDC_ENQUEUE_PAUSE 0x801
* The third stage: reply on final pings, at this moment all clients
* must have request in final queue
*/
+ CFS_FAIL_TIMEOUT(OBD_FAIL_TGT_REPLAY_RECONNECT, cfs_fail_val);
CDEBUG(D_INFO, "3: final stage - process recovery completion pings\n");
/** Update server last boot epoch */
tgt_boot_epoch_update(lut);
static void __import_set_state(struct obd_import *imp,
enum lustre_imp_state state)
{
+ switch (state) {
+ case LUSTRE_IMP_CLOSED:
+ case LUSTRE_IMP_NEW:
+ case LUSTRE_IMP_DISCON:
+ case LUSTRE_IMP_CONNECTING:
+ break;
+ case LUSTRE_IMP_REPLAY_WAIT:
+ imp->imp_replay_state = LUSTRE_IMP_REPLAY_LOCKS;
+ break;
+ default:
+ imp->imp_replay_state = LUSTRE_IMP_REPLAY;
+ }
imp->imp_state = state;
imp->imp_state_hist[imp->imp_state_hist_idx].ish_state = state;
imp->imp_state_hist[imp->imp_state_hist_idx].ish_time =
imp->imp_resend_replay = 1;
spin_unlock(&imp->imp_lock);
- IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY);
+ IMPORT_SET_STATE(imp, imp->imp_replay_state);
} else {
IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER);
}
int rc;
ENTRY;
+ if (CFS_FAIL_CHECK(OBD_FAIL_TGT_REPLAY_RECONNECT)) {
+ /* don't send early reply */
+ RETURN(1);
+ }
+
/* deadline is when the client expects us to reply, margin is the
difference between clients' and servers' expectations */
DEBUG_REQ(D_ADAPTTO, req,
}
run_test 90 "lfs find identifies the missing striped file segments"
+test_93() {
+ local server_version=$(lustre_version_code $SINGLEMDS)
+ [[ $server_version -ge $(version_code 2.6.90) ]] ||
+ [[ $server_version -ge $(version_code 2.5.4) &&
+ $server_version -lt $(version_code 2.5.50) ]] ||
+ { skip "Need MDS version 2.5.4+ or 2.6.90+"; return; }
+
+ cancel_lru_locks osc
+
+ $SETSTRIPE -i 0 -c 1 $DIR/$tfile
+ dd if=/dev/zero of=$DIR/$tfile bs=1024 count=1
+#define OBD_FAIL_TGT_REPLAY_RECONNECT 0x715
+ # We need to emulate a state in which the OST is waiting for other clients
+ # that have not completed recovery. The final ping is queued, but the reply
+ # is only sent on recovery completion. This is done by sleeping before the
+ # final pings are processed.
+ do_facet ost1 "$LCTL set_param fail_val=40"
+ do_facet ost1 "$LCTL set_param fail_loc=0x715"
+ fail ost1
+}
+run_test 93 "replay + reconnect"
+
complete $SECONDS
check_and_cleanup_lustre
exit_status