Whamcloud - gitweb
LU-1571 mdt: Do not update xid for open replay req
author Wang Di <di.wang@whamcloud.com>
Sat, 15 Sep 2012 14:34:15 +0000 (07:34 -0700)
committer Oleg Drokin <green@whamcloud.com>
Sat, 27 Oct 2012 04:32:01 +0000 (00:32 -0400)
Do not update last_xid for an open replay request;
otherwise the following resend (after replay)
cannot be matched with the correct xid.

Remove the unnecessary mti_transno zero check in
mdt_empty_transno.

Signed-off-by: wang di <di.wang@whamcloud.com>
Change-Id: I2a05f3ac05b301ae31641a1dc51f8c4eed96427d
Reviewed-on: http://review.whamcloud.com/3195
Tested-by: Hudson
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Tested-by: Maloo <whamcloud.maloo@gmail.com>
Reviewed-by: Alex Zhuravlev <bzzz@whamcloud.com>
lustre/mdt/mdt_open.c
lustre/mdt/mdt_recovery.c
lustre/tests/recovery-small.sh

index bdee514..d1fe097 100644 (file)
@@ -577,13 +577,24 @@ static void mdt_empty_transno(struct mdt_thread_info *info, int rc)
                 RETURN_EXIT;
 
         cfs_spin_lock(&mdt->mdt_lut.lut_translock);
                 RETURN_EXIT;
 
         cfs_spin_lock(&mdt->mdt_lut.lut_translock);
-        if (info->mti_transno == 0) {
+       if (rc != 0) {
+               if (info->mti_transno != 0) {
+                       struct obd_export *exp = req->rq_export;
+
+                       CERROR("%s: replay trans "LPU64" NID %s: rc = %d\n",
+                               mdt->mdt_md_dev.md_lu_dev.ld_obd->obd_name,
+                               info->mti_transno,
+                               libcfs_nid2str(exp->exp_connection->c_peer.nid),
+                               rc);
+                       RETURN_EXIT;
+               }
+       } else if (info->mti_transno == 0) {
                 info->mti_transno = ++ mdt->mdt_lut.lut_last_transno;
         } else {
                 /* should be replay */
                 if (info->mti_transno > mdt->mdt_lut.lut_last_transno)
                         mdt->mdt_lut.lut_last_transno = info->mti_transno;
                 info->mti_transno = ++ mdt->mdt_lut.lut_last_transno;
         } else {
                 /* should be replay */
                 if (info->mti_transno > mdt->mdt_lut.lut_last_transno)
                         mdt->mdt_lut.lut_last_transno = info->mti_transno;
-        }
+       }
         cfs_spin_unlock(&mdt->mdt_lut.lut_translock);
 
         CDEBUG(D_INODE, "transno = "LPU64", last_committed = "LPU64"\n",
         cfs_spin_unlock(&mdt->mdt_lut.lut_translock);
 
         CDEBUG(D_INODE, "transno = "LPU64", last_committed = "LPU64"\n",
@@ -598,10 +609,23 @@ static void mdt_empty_transno(struct mdt_thread_info *info, int rc)
         LASSERT(ted);
         cfs_mutex_lock(&ted->ted_lcd_lock);
         lcd = ted->ted_lcd;
         LASSERT(ted);
         cfs_mutex_lock(&ted->ted_lcd_lock);
         lcd = ted->ted_lcd;
+       if (info->mti_transno < lcd->lcd_last_transno &&
+           info->mti_transno != 0) {
+               /* This should happen during replay. Do not update
+                * last rcvd info if replay req transno < last transno,
+                * otherwise the following resend(after replay) can not
+                * be checked correctly by xid */
+               cfs_mutex_unlock(&ted->ted_lcd_lock);
+               CDEBUG(D_HA, "%s: transno = "LPU64" < last_transno = "LPU64"\n",
+                       mdt->mdt_md_dev.md_lu_dev.ld_obd->obd_name,
+                       info->mti_transno, lcd->lcd_last_transno);
+               RETURN_EXIT;
+       }
+
         if (lustre_msg_get_opc(req->rq_reqmsg) == MDS_CLOSE ||
             lustre_msg_get_opc(req->rq_reqmsg) == MDS_DONE_WRITING) {
         if (lustre_msg_get_opc(req->rq_reqmsg) == MDS_CLOSE ||
             lustre_msg_get_opc(req->rq_reqmsg) == MDS_DONE_WRITING) {
-                if (info->mti_transno != 0)
-                        lcd->lcd_last_close_transno = info->mti_transno;
+               if (info->mti_transno != 0)
+                       lcd->lcd_last_close_transno = info->mti_transno;
                 lcd->lcd_last_close_xid = req->rq_xid;
                 lcd->lcd_last_close_result = rc;
         } else {
                 lcd->lcd_last_close_xid = req->rq_xid;
                 lcd->lcd_last_close_result = rc;
         } else {
@@ -613,9 +637,10 @@ static void mdt_empty_transno(struct mdt_thread_info *info, int rc)
                         lcd->lcd_pre_versions[2] = pre_versions[2];
                         lcd->lcd_pre_versions[3] = pre_versions[3];
                 }
                         lcd->lcd_pre_versions[2] = pre_versions[2];
                         lcd->lcd_pre_versions[3] = pre_versions[3];
                 }
-                if (info->mti_transno != 0)
-                        lcd->lcd_last_transno = info->mti_transno;
-                lcd->lcd_last_xid = req->rq_xid;
+               if (info->mti_transno != 0)
+                       lcd->lcd_last_transno = info->mti_transno;
+
+               lcd->lcd_last_xid = req->rq_xid;
                 lcd->lcd_last_result = rc;
                 lcd->lcd_last_data = info->mti_opdata;
         }
                 lcd->lcd_last_result = rc;
                 lcd->lcd_last_data = info->mti_opdata;
         }
index 98bae6f..30e8ece 100644 (file)
@@ -515,8 +515,9 @@ static int mdt_txn_stop_cb(const struct lu_env *env,
         cfs_spin_lock(&mdt->mdt_lut.lut_translock);
         if (txn->th_result != 0) {
                 if (mti->mti_transno != 0) {
         cfs_spin_lock(&mdt->mdt_lut.lut_translock);
         if (txn->th_result != 0) {
                 if (mti->mti_transno != 0) {
-                        CERROR("Replay transno "LPU64" failed: rc %d\n",
-                               mti->mti_transno, txn->th_result);
+                       CERROR("Replay transno "LPU64" failed: rc %d\n",
+                               mti->mti_transno, txn->th_result);
+                       return 0;
                 }
         } else if (mti->mti_transno == 0) {
                 mti->mti_transno = ++ mdt->mdt_lut.lut_last_transno;
                 }
         } else if (mti->mti_transno == 0) {
                 mti->mti_transno = ++ mdt->mdt_lut.lut_last_transno;
index aba5353..5c19142 100755 (executable)
@@ -1542,6 +1542,29 @@ test_106() { # LU-1789
 }
 run_test 106 "lightweight connection support"
 
 }
 run_test 106 "lightweight connection support"
 
+test_107 () {
+       local CLIENT_PID
+       local close_pid
+
+       mkdir -p $DIR/$tdir
+       # OBD_FAIL_MDS_REINT_NET_REP   0x119
+       do_facet $SINGLEMDS lctl set_param fail_loc=0x119
+       multiop $DIR/$tdir D_c &
+       close_pid=$!
+       mkdir $DIR/$tdir/dir_106 &
+       CLIENT_PID=$!
+       do_facet $SINGLEMDS lctl set_param fail_loc=0
+       fail $SINGLEMDS
+
+       wait $CLIENT_PID || rc=$?
+       checkstat -t dir $DIR/$tdir/dir_106 || return 1
+
+       kill -USR1 $close_pid
+       wait $close_pid || return 2
+
+       return $rc
+}
+run_test 107 "drop reint reply, then restart MDT"
 complete $SECONDS
 check_and_cleanup_lustre
 exit_status
 complete $SECONDS
 check_and_cleanup_lustre
 exit_status