From: wang di Date: Mon, 20 Jul 2015 17:36:52 +0000 (-0700) Subject: LU-6880 update: after reply move dtrq to finish list X-Git-Tag: 2.7.59~17 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=2a874ec011e680f49405a7e901d8d0d35dcb4f1a LU-6880 update: after reply move dtrq to finish list An update replay request (dtrq) is now moved to a replay finish list after update replay has finished, so that if a client sends a replay request with the same transno, the target can check whether the request has already been redone by update replay. As with normal replay, update replay should also update obd_next_recovery_transno. Add replay-single test case 70c to verify this. Signed-off-by: wang di Change-Id: I2fae2a6f8264f55d069997e23a50e71b6a9f39db Reviewed-on: http://review.whamcloud.com/15682 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: James Simmons Reviewed-by: Lai Siyao Reviewed-by: Oleg Drokin --- diff --git a/lustre/include/lu_target.h b/lustre/include/lu_target.h index 73eb250..bf62975 100644 --- a/lustre/include/lu_target.h +++ b/lustre/include/lu_target.h @@ -56,6 +56,7 @@ struct distribute_txn_replay_req { struct list_head dtrq_list; __u64 dtrq_master_transno; __u64 dtrq_batchid; + __u64 dtrq_xid; /* all of sub updates are linked here */ struct list_head dtrq_sub_list; @@ -104,11 +105,10 @@ struct target_distribute_txn_data { /* recovery update */ distribute_txn_replay_handler_t tdtd_replay_handler; struct list_head tdtd_replay_list; + struct list_head tdtd_replay_finish_list; spinlock_t tdtd_replay_list_lock; /* last replay update transno */ - __u64 tdtd_last_update_transno; __u32 tdtd_replay_ready:1; - }; struct lu_target { @@ -475,7 +475,10 @@ distribute_txn_get_next_req(struct target_distribute_txn_data *tdtd); void dtrq_destroy(struct distribute_txn_replay_req *dtrq); struct distribute_txn_replay_req_sub * dtrq_sub_lookup(struct distribute_txn_replay_req *dtrq, __u32 mdt_index); - +struct distribute_txn_replay_req * 
+distribute_txn_lookup_finish_list(struct target_distribute_txn_data *tdtd, + __u64 transno); +bool is_req_replayed_by_update(struct ptlrpc_request *req); enum { ESERIOUS = 0x0001000 }; diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 9b51752..4cb82a6 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -1502,8 +1502,11 @@ static void target_finish_recovery(struct lu_target *lut) spin_unlock(&obd->obd_recovery_task_lock); if (lut->lut_tdtd != NULL && - !list_empty(&lut->lut_tdtd->tdtd_replay_list)) + (!list_empty(&lut->lut_tdtd->tdtd_replay_list) || + !list_empty(&lut->lut_tdtd->tdtd_replay_finish_list))) { dtrq_list_dump(lut->lut_tdtd, D_ERROR); + dtrq_list_destroy(lut->lut_tdtd); + } obd->obd_recovery_end = cfs_time_current_sec(); @@ -1770,6 +1773,7 @@ static int check_for_next_transno(struct lu_target *lut) { struct ptlrpc_request *req = NULL; struct obd_device *obd = lut->lut_obd; + struct target_distribute_txn_data *tdtd = lut->lut_tdtd; int wake_up = 0, connected, completed, queue_len; __u64 req_transno = 0; __u64 update_transno = 0; @@ -1783,12 +1787,8 @@ static int check_for_next_transno(struct lu_target *lut) req_transno = lustre_msg_get_transno(req->rq_reqmsg); } - if (lut->lut_tdtd != NULL) { - struct target_distribute_txn_data *tdtd; - - tdtd = lut->lut_tdtd; - update_transno = distribute_txn_get_next_transno(lut->lut_tdtd); - } + if (tdtd != NULL) + update_transno = distribute_txn_get_next_transno(tdtd); connected = atomic_read(&obd->obd_connected_clients); completed = connected - atomic_read(&obd->obd_req_replay_clients); @@ -1806,6 +1806,13 @@ static int check_for_next_transno(struct lu_target *lut) } else if (obd->obd_recovery_expired) { CDEBUG(D_HA, "waking for expired recovery\n"); wake_up = 1; + } else if (tdtd != NULL && req != NULL && + is_req_replayed_by_update(req)) { + LASSERTF(req_transno < next_transno, "req_transno "LPU64 + "next_transno"LPU64"\n", req_transno, next_transno); + CDEBUG(D_HA, "waking for 
duplicate req ("LPU64")\n", + req_transno); + wake_up = 1; } else if (req_transno == next_transno || (update_transno != 0 && update_transno <= next_transno)) { CDEBUG(D_HA, "waking for next ("LPD64")\n", next_transno); @@ -2169,23 +2176,35 @@ static void replay_request_or_update(struct lu_env *env, spin_lock(&obd->obd_recovery_task_lock); transno = get_next_transno(lut, &type); - if (type == REQUEST_RECOVERY && tdtd != NULL && - transno == tdtd->tdtd_last_update_transno) { + if (type == REQUEST_RECOVERY && transno != 0) { /* Drop replay request from client side, if the * replay has been executed by update with the * same transno */ req = list_entry(obd->obd_req_replay_queue.next, struct ptlrpc_request, rq_list); + list_del_init(&req->rq_list); obd->obd_requests_queued_for_recovery--; spin_unlock(&obd->obd_recovery_task_lock); - drop_duplicate_replay_req(env, obd, req); - } else if (type == REQUEST_RECOVERY && transno != 0) { - req = list_entry(obd->obd_req_replay_queue.next, - struct ptlrpc_request, rq_list); - list_del_init(&req->rq_list); - obd->obd_requests_queued_for_recovery--; - spin_unlock(&obd->obd_recovery_task_lock); + + /* Let's check if the request has been redone by + * update replay */ + if (is_req_replayed_by_update(req)) { + struct distribute_txn_replay_req *dtrq; + + dtrq = distribute_txn_lookup_finish_list(tdtd, + req->rq_xid); + LASSERT(dtrq != NULL); + spin_lock(&tdtd->tdtd_replay_list_lock); + list_del_init(&dtrq->dtrq_list); + spin_unlock(&tdtd->tdtd_replay_list_lock); + dtrq_destroy(dtrq); + + drop_duplicate_replay_req(env, obd, req); + + continue; + } + LASSERT(trd->trd_processing_task == current_pid()); DEBUG_REQ(D_HA, req, "processing t"LPD64" from %s", lustre_msg_get_transno(req->rq_reqmsg), @@ -2214,13 +2233,27 @@ static void replay_request_or_update(struct lu_env *env, tdtd->tdtd_replay_handler(env, tdtd, dtrq); lu_context_exit(&thread->t_env->le_ctx); extend_recovery_timer(obd, obd_timeout, true); - 
LASSERT(tdtd->tdtd_last_update_transno <= transno); - tdtd->tdtd_last_update_transno = transno; + + /* Add it to the replay finish list */ + spin_lock(&tdtd->tdtd_replay_list_lock); + if (dtrq->dtrq_xid != 0) { + CDEBUG(D_HA, "Move x"LPU64" t"LPU64 + " to finish list\n", dtrq->dtrq_xid, + dtrq->dtrq_master_transno); + list_add(&dtrq->dtrq_list, + &tdtd->tdtd_replay_finish_list); + } else { + dtrq_destroy(dtrq); + } + spin_unlock(&tdtd->tdtd_replay_list_lock); + spin_lock(&obd->obd_recovery_task_lock); - if (transno > obd->obd_next_recovery_transno) - obd->obd_next_recovery_transno = transno; + if (transno == obd->obd_next_recovery_transno) + obd->obd_next_recovery_transno++; + else if (transno > obd->obd_next_recovery_transno) + obd->obd_next_recovery_transno = transno + 1; spin_unlock(&obd->obd_recovery_task_lock); - dtrq_destroy(dtrq); + } else { spin_unlock(&obd->obd_recovery_task_lock); LASSERT(list_empty(&obd->obd_req_replay_queue)); @@ -2551,8 +2584,13 @@ int target_queue_recovery_request(struct ptlrpc_request *req, CDEBUG(D_HA, "Next recovery transno: "LPU64 ", current: "LPU64", replaying\n", obd->obd_next_recovery_transno, transno); + + /* If the request has been replayed by update replay, then sends this + * request to the recovery thread (replay_request_or_update()), where + * it will be handled */ spin_lock(&obd->obd_recovery_task_lock); - if (transno < obd->obd_next_recovery_transno) { + if (transno < obd->obd_next_recovery_transno && + !is_req_replayed_by_update(req)) { /* Processing the queue right now, don't re-add. 
*/ LASSERT(list_empty(&req->rq_list)); spin_unlock(&obd->obd_recovery_task_lock); diff --git a/lustre/lod/lod_dev.c b/lustre/lod/lod_dev.c index 8ec09b8..7886704 100644 --- a/lustre/lod/lod_dev.c +++ b/lustre/lod/lod_dev.c @@ -729,7 +729,7 @@ static int lod_prepare_distribute_txn(const struct lu_env *env, RETURN(-ENOMEM); lut = lod2lu_dev(lod)->ld_site->ls_tgt; - + tdtd->tdtd_dt = &lod->lod_dt_dev; rc = distribute_txn_init(env, lut, tdtd, lu_site2seq(lod2lu_dev(lod)->ld_site)->ss_node_id); @@ -740,12 +740,6 @@ static int lod_prepare_distribute_txn(const struct lu_env *env, RETURN(rc); } - tdtd->tdtd_dt = &lod->lod_dt_dev; - INIT_LIST_HEAD(&tdtd->tdtd_replay_list); - spin_lock_init(&tdtd->tdtd_replay_list_lock); - tdtd->tdtd_replay_handler = distribute_txn_replay_handle; - tdtd->tdtd_replay_ready = 0; - lut->lut_tdtd = tdtd; RETURN(0); diff --git a/lustre/target/update_recovery.c b/lustre/target/update_recovery.c index 8d7f5cc..7cece02 100644 --- a/lustre/target/update_recovery.c +++ b/lustre/target/update_recovery.c @@ -539,6 +539,11 @@ void dtrq_list_destroy(struct target_distribute_txn_data *tdtd) list_del_init(&dtrq->dtrq_list); dtrq_destroy(dtrq); } + list_for_each_entry_safe(dtrq, tmp, &tdtd->tdtd_replay_finish_list, + dtrq_list) { + list_del_init(&dtrq->dtrq_list); + dtrq_destroy(dtrq); + } spin_unlock(&tdtd->tdtd_replay_list_lock); } EXPORT_SYMBOL(dtrq_list_destroy); @@ -597,6 +602,40 @@ __u64 distribute_txn_get_next_transno(struct target_distribute_txn_data *tdtd) } EXPORT_SYMBOL(distribute_txn_get_next_transno); +struct distribute_txn_replay_req * +distribute_txn_lookup_finish_list(struct target_distribute_txn_data *tdtd, + __u64 xid) +{ + struct distribute_txn_replay_req *dtrq = NULL; + struct distribute_txn_replay_req *iter; + + spin_lock(&tdtd->tdtd_replay_list_lock); + list_for_each_entry(iter, &tdtd->tdtd_replay_finish_list, dtrq_list) { + if (iter->dtrq_xid == xid) { + dtrq = iter; + break; + } + } + spin_unlock(&tdtd->tdtd_replay_list_lock); + 
return dtrq; +} + +bool is_req_replayed_by_update(struct ptlrpc_request *req) +{ + struct lu_target *tgt = class_exp2tgt(req->rq_export); + struct distribute_txn_replay_req *dtrq; + + if (tgt->lut_tdtd == NULL) + return false; + + dtrq = distribute_txn_lookup_finish_list(tgt->lut_tdtd, req->rq_xid); + if (dtrq == NULL) + return false; + + return true; +} +EXPORT_SYMBOL(is_req_replayed_by_update); + /** * Check if the update of one object is committed * @@ -1025,6 +1064,7 @@ static void update_recovery_update_ses(struct lu_env *env, struct target_distribute_txn_data *tdtd, struct thandle *th, struct thandle *master_th, + struct distribute_txn_replay_req *dtrq, struct tx_arg *ta_arg) { struct tgt_session_info *tsi; @@ -1068,6 +1108,7 @@ static void update_recovery_update_ses(struct lu_env *env, tsi->tsi_opdata = lrd->lrd_data; tsi->tsi_result = lrd->lrd_result; tsi->tsi_client_gen = lrd->lrd_client_gen; + dtrq->dtrq_xid = lrd->lrd_xid; top_th = container_of(th, struct top_thandle, tt_super); top_th->tt_master_sub_thandle = master_th; cfs_hash_putref(hash); @@ -1335,7 +1376,7 @@ int distribute_txn_replay_handle(struct lu_env *env, * tgt_last_rcvd_update() can be called correctly */ if (rc == 0 && dt_obj == tdtd->tdtd_lut->lut_reply_data) update_recovery_update_ses(env, tdtd, th, - st->st_sub_th, ta_arg); + st->st_sub_th, dtrq, ta_arg); if (unlikely(rc < 0)) { CDEBUG(D_HA, "error during execution of #%u from" diff --git a/lustre/target/update_trans.c b/lustre/target/update_trans.c index 00bc034..d3fd4c1 100644 --- a/lustre/target/update_trans.c +++ b/lustre/target/update_trans.c @@ -1645,8 +1645,13 @@ int distribute_txn_init(const struct lu_env *env, int rc; ENTRY; - spin_lock_init(&tdtd->tdtd_batchid_lock); INIT_LIST_HEAD(&tdtd->tdtd_list); + INIT_LIST_HEAD(&tdtd->tdtd_replay_finish_list); + INIT_LIST_HEAD(&tdtd->tdtd_replay_list); + spin_lock_init(&tdtd->tdtd_batchid_lock); + spin_lock_init(&tdtd->tdtd_replay_list_lock); + tdtd->tdtd_replay_handler = 
distribute_txn_replay_handle; + tdtd->tdtd_replay_ready = 0; tdtd->tdtd_batchid = lut->lut_last_transno + 1; diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index 61d429bb..d5e7404 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -2099,6 +2099,74 @@ test_70b () { run_test 70b "dbench ${MDSCOUNT}mdts recovery; $CLIENTCOUNT clients" # end multi-client tests +random_fail_mdt() { + local max_index=$1 + local duration=$2 + local monitor_pid=$3 + local elapsed + local start_ts=$(date +%s) + local num_failovers=0 + local fail_index + + elapsed=$(($(date +%s) - start_ts)) + while [ $elapsed -lt $duration ]; do + fail_index=$((RANDOM%max_index+1)) + kill -0 $monitor_pid || + error "$monitor_pid stopped" + sleep 120 + replay_barrier mds$fail_index + sleep 10 + # Increment the number of failovers + num_failovers=$((num_failovers+1)) + log "$TESTNAME fail mds$fail_index $num_failovers times" + fail mds$fail_index + elapsed=$(($(date +%s) - start_ts)) + done +} + +cleanup_70c() { + trap 0 + kill -9 $tar_70c_pid +} +test_70c () { + local clients=${CLIENTS:-$HOSTNAME} + local rc=0 + + zconf_mount_clients $clients $MOUNT + + local duration=300 + [ "$SLOW" = "no" ] && duration=180 + # set duration to 600 because it takes some time to boot node + [ "$FAILURE_MODE" = HARD ] && duration=600 + + local elapsed + local start_ts=$(date +%s) + + trap cleanup_70c EXIT + ( + while true; do + test_mkdir -p -c$MDSCOUNT $DIR/$tdir || break + if [ $MDSCOUNT -ge 2 ]; then + $LFS setdirstripe -D -c$MDSCOUNT $DIR/$tdir || + error "set default dirstripe failed" + fi + cd $DIR/$tdir || break + tar cf - /etc | tar xf - || error "tar failed" + cd $DIR || break + rm -rf $DIR/$tdir || break + done + )& + tar_70c_pid=$! 
+ echo "Started tar $tar_70c_pid" + + random_fail_mdt $MDSCOUNT $duration $tar_70c_pid + kill -0 $tar_70c_pid || error "tar $tar_70c_pid stopped" + + cleanup_70c + true +} +run_test 70c "tar ${MDSCOUNT}mdts recovery" + test_73a() { multiop_bg_pause $DIR/$tfile O_tSc || error "multiop_bg_pause $DIR/$tfile failed"