Whamcloud - gitweb
LU-5319 tests: testcases for multiple modify RPCs feature
[fs/lustre-release.git] / lustre / ldlm / ldlm_lib.c
index 38c0b93..f56b452 100644 (file)
@@ -43,6 +43,7 @@
 
 #define DEBUG_SUBSYSTEM S_LDLM
 
+#include <linux/kthread.h>
 #include <libcfs/libcfs.h>
 #include <obd.h>
 #include <obd_class.h>
@@ -259,14 +260,15 @@ static int osc_on_mdt(char *obdname)
  */
 int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg)
 {
-        struct client_obd *cli = &obddev->u.cli;
-        struct obd_import *imp;
-        struct obd_uuid server_uuid;
-        int rq_portal, rp_portal, connect_op;
-        char *name = obddev->obd_type->typ_name;
-        ldlm_ns_type_t ns_type = LDLM_NS_TYPE_UNKNOWN;
-        int rc;
-        ENTRY;
+       struct client_obd *cli = &obddev->u.cli;
+       struct obd_import *imp;
+       struct obd_uuid server_uuid;
+       int rq_portal, rp_portal, connect_op;
+       char *name = obddev->obd_type->typ_name;
+       ldlm_ns_type_t ns_type = LDLM_NS_TYPE_UNKNOWN;
+       int rc;
+       char *cli_name = lustre_cfg_buf(lcfg, 0);
+       ENTRY;
 
         /* In a more perfect world, we would hang a ptlrpc_client off of
          * obd_type and just use the values from there. */
@@ -282,7 +284,12 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg)
                rq_portal = MDS_REQUEST_PORTAL;
                rp_portal = MDC_REPLY_PORTAL;
                connect_op = MDS_CONNECT;
-               cli->cl_sp_me = LUSTRE_SP_CLI;
+               if (is_lwp_on_ost(cli_name))
+                       cli->cl_sp_me = LUSTRE_SP_OST;
+               else if (is_lwp_on_mdt(cli_name))
+                       cli->cl_sp_me = LUSTRE_SP_MDT;
+               else
+                       cli->cl_sp_me = LUSTRE_SP_CLI;
                cli->cl_sp_to = LUSTRE_SP_MDT;
                ns_type = LDLM_NS_TYPE_MDC;
        } else if (!strcmp(name, LUSTRE_OSP_NAME)) {
@@ -300,7 +307,7 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg)
                        rq_portal = OST_REQUEST_PORTAL;
                }
                rp_portal = OSC_REPLY_PORTAL;
-               cli->cl_sp_me = LUSTRE_SP_CLI;
+               cli->cl_sp_me = LUSTRE_SP_MDT;
         } else if (!strcmp(name, LUSTRE_MGC_NAME)) {
                 rq_portal = MGS_REQUEST_PORTAL;
                 rp_portal = MGC_REPLY_PORTAL;
@@ -414,6 +421,23 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg)
                else
                        cli->cl_max_rpcs_in_flight = OBD_MAX_RIF_DEFAULT;
         }
+
+       spin_lock_init(&cli->cl_mod_rpcs_lock);
+       spin_lock_init(&cli->cl_mod_rpcs_hist.oh_lock);
+       cli->cl_max_mod_rpcs_in_flight = 0;
+       cli->cl_mod_rpcs_in_flight = 0;
+       cli->cl_close_rpcs_in_flight = 0;
+       init_waitqueue_head(&cli->cl_mod_rpcs_waitq);
+       cli->cl_mod_tag_bitmap = NULL;
+
+       if (connect_op == MDS_CONNECT) {
+               cli->cl_max_mod_rpcs_in_flight = cli->cl_max_rpcs_in_flight - 1;
+               OBD_ALLOC(cli->cl_mod_tag_bitmap,
+                         BITS_TO_LONGS(OBD_MAX_RIF_MAX) * sizeof(long));
+               if (cli->cl_mod_tag_bitmap == NULL)
+                       GOTO(err, rc = -ENOMEM);
+       }
+
         rc = ldlm_get_ref();
         if (rc) {
                 CERROR("ldlm_get_ref failed: %d\n", rc);
@@ -438,10 +462,9 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg)
                 GOTO(err_import, rc);
         }
 
-        cli->cl_import = imp;
-        /* cli->cl_max_mds_{easize,cookiesize} updated by mdc_init_ea_size() */
-        cli->cl_max_mds_easize = sizeof(struct lov_mds_md_v3);
-        cli->cl_max_mds_cookiesize = sizeof(struct llog_cookie);
+       cli->cl_import = imp;
+       /* cli->cl_max_mds_easize updated by mdc_init_ea_size() */
+       cli->cl_max_mds_easize = sizeof(struct lov_mds_md_v3);
 
         if (LUSTRE_CFG_BUFLEN(lcfg, 3) > 0) {
                 if (!strcmp(lustre_cfg_string(lcfg, 3), "inactive")) {
@@ -464,8 +487,6 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg)
                 GOTO(err_import, rc = -ENOMEM);
         }
 
-        cli->cl_qchk_stat = CL_NOT_QUOTACHECKED;
-
         RETURN(rc);
 
 err_import:
@@ -473,6 +494,10 @@ err_import:
 err_ldlm:
         ldlm_put_ref();
 err:
+       if (cli->cl_mod_tag_bitmap != NULL)
+               OBD_FREE(cli->cl_mod_tag_bitmap,
+                        BITS_TO_LONGS(OBD_MAX_RIF_MAX) * sizeof(long));
+       cli->cl_mod_tag_bitmap = NULL;
         RETURN(rc);
 
 }
@@ -480,6 +505,7 @@ EXPORT_SYMBOL(client_obd_setup);
 
 int client_obd_cleanup(struct obd_device *obddev)
 {
+       struct client_obd *cli = &obddev->u.cli;
        ENTRY;
 
        ldlm_namespace_free_post(obddev->obd_namespace);
@@ -489,6 +515,12 @@ int client_obd_cleanup(struct obd_device *obddev)
        LASSERT(obddev->u.cli.cl_import == NULL);
 
        ldlm_put_ref();
+
+       if (cli->cl_mod_tag_bitmap != NULL)
+               OBD_FREE(cli->cl_mod_tag_bitmap,
+                        BITS_TO_LONGS(OBD_MAX_RIF_MAX) * sizeof(long));
+       cli->cl_mod_tag_bitmap = NULL;
+
        RETURN(0);
 }
 EXPORT_SYMBOL(client_obd_cleanup);
@@ -504,6 +536,7 @@ int client_connect_import(const struct lu_env *env,
        struct obd_connect_data *ocd;
        struct lustre_handle    conn    = { 0 };
        int                     rc;
+       bool                    is_mdc = false;
        ENTRY;
 
         *exp = NULL;
@@ -528,6 +561,10 @@ int client_connect_import(const struct lu_env *env,
         ocd = &imp->imp_connect_data;
         if (data) {
                 *ocd = *data;
+               is_mdc = strncmp(imp->imp_obd->obd_type->typ_name,
+                                LUSTRE_MDC_NAME, 3) == 0;
+               if (is_mdc)
+                       data->ocd_connect_flags |= OBD_CONNECT_MULTIMODRPCS;
                 imp->imp_connect_flags_orig = data->ocd_connect_flags;
         }
 
@@ -543,6 +580,10 @@ int client_connect_import(const struct lu_env *env,
                          ocd->ocd_connect_flags, "old "LPX64", new "LPX64"\n",
                          data->ocd_connect_flags, ocd->ocd_connect_flags);
                 data->ocd_connect_flags = ocd->ocd_connect_flags;
+               /* clear the flag as it was not set and is not known
+                * by upper layers */
+               if (is_mdc)
+                       data->ocd_connect_flags &= ~OBD_CONNECT_MULTIMODRPCS;
         }
 
         ptlrpc_pinger_add_import(imp);
@@ -647,8 +688,7 @@ int server_disconnect_export(struct obd_export *exp)
        if (exp->exp_imp_reverse)
                ptlrpc_cleanup_imp(exp->exp_imp_reverse);
 
-       if (exp->exp_obd->obd_namespace != NULL)
-               ldlm_cancel_locks_for_export(exp);
+       ldlm_bl_thread_wakeup();
 
         /* complete all outstanding replies */
        spin_lock(&exp->exp_lock);
@@ -773,6 +813,8 @@ int target_handle_connect(struct ptlrpc_request *req)
         char *target_start;
         int target_len;
        bool     mds_conn = false, lw_client = false;
+       bool     mds_mds_conn = false;
+       bool     new_mds_mds_conn = false;
         struct obd_connect_data *data, *tmpdata;
         int size, tmpsize;
         lnet_nid_t *client_nid = NULL;
@@ -870,6 +912,20 @@ int target_handle_connect(struct ptlrpc_request *req)
         if (rc)
                 GOTO(out, rc);
 
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 53, 0)
+       /* Don't allow clients to connect that are using old 1.8 format
+        * protocol conventions (LUSTRE_MSG_MAGIC_v1, !MSGHDR_CKSUM_INCOMPAT18,
+        * ldlm_flock_policy_wire format, MDT_ATTR_xTIME_SET, etc).  The
+        * FULL20 flag should be set on all connections since 2.0, but no
+        * longer affects behaviour.
+        *
+        * Later this check will be disabled and the flag can be retired
+        * completely once interop with 3.0 is no longer needed.
+        */
+       if (!(data->ocd_connect_flags & OBD_CONNECT_FULL20))
+               GOTO(out, rc = -EPROTO);
+#endif
+
        if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT) {
                if (data->ocd_version < LUSTRE_VERSION_CODE -
                                               LUSTRE_VERSION_ALLOWED_OFFSET ||
@@ -895,10 +951,15 @@ int target_handle_connect(struct ptlrpc_request *req)
                }
        }
 
+       /* Note: lw_client is needed in MDS-MDS failover during update log
+        * processing, so we need to allow lw_client to be connected at
+        * any time, instead of only on the initial connection */
+       lw_client = (data->ocd_connect_flags & OBD_CONNECT_LIGHTWEIGHT) != 0;
+
        if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_INITIAL) {
                mds_conn = (data->ocd_connect_flags & OBD_CONNECT_MDS) != 0;
-               lw_client = (data->ocd_connect_flags &
-                            OBD_CONNECT_LIGHTWEIGHT) != 0;
+               mds_mds_conn = (data->ocd_connect_flags &
+                               OBD_CONNECT_MDS_MDS) != 0;
 
                /* OBD_CONNECT_MNE_SWAB is defined as OBD_CONNECT_MDS_MDS
                 * for Imperative Recovery connection from MGC to MGS.
@@ -1050,7 +1111,9 @@ no_export:
 
         if (export == NULL) {
                /* allow lightweight connections during recovery */
-               if (target->obd_recovering && !lw_client) {
+               /* allow "new" MDT to be connected during recovery, since we
+                * need to retrieve recovery update records from it */
+               if (target->obd_recovering && !lw_client && !mds_mds_conn) {
                         cfs_time_t t;
                        int     c; /* connected */
                        int     i; /* in progress */
@@ -1064,30 +1127,33 @@ no_export:
                        t = cfs_timer_deadline(&target->obd_recovery_timer);
                        t = cfs_time_sub(t, cfs_time_current());
                        t = cfs_duration_sec(t);
-                       LCONSOLE_WARN("%s: Denying connection for new client "
-                                     "%s (at %s), waiting for all %d known "
-                                     "clients (%d recovered, %d in progress, "
-                                     "and %d evicted) to recover in %d:%.02d\n",
+                       LCONSOLE_WARN("%s: Denying connection for new client %s"
+                                     "(at %s), waiting for %d known clients "
+                                     "(%d recovered, %d in progress, and %d "
+                                     "evicted) to recover in %d:%.02d\n",
                                      target->obd_name, cluuid.uuid,
                                      libcfs_nid2str(req->rq_peer.nid), k,
                                      c - i, i, s, (int)t / 60,
                                      (int)t % 60);
-                        rc = -EBUSY;
-                } else {
+                       rc = -EBUSY;
+               } else {
 dont_check_exports:
-                        rc = obd_connect(req->rq_svc_thread->t_env,
-                                         &export, target, &cluuid, data,
-                                         client_nid);
+                       rc = obd_connect(req->rq_svc_thread->t_env,
+                                        &export, target, &cluuid, data,
+                                        client_nid);
                        if (mds_conn && OBD_FAIL_CHECK(OBD_FAIL_TGT_RCVG_FLAG))
                                lustre_msg_add_op_flags(req->rq_repmsg,
-                                               MSG_CONNECT_RECOVERING);
-                        if (rc == 0)
-                                conn.cookie = export->exp_handle.h_cookie;
-                }
-        } else {
-                rc = obd_reconnect(req->rq_svc_thread->t_env,
-                                   export, target, &cluuid, data, client_nid);
-        }
+                                                       MSG_CONNECT_RECOVERING);
+                       if (rc == 0)
+                               conn.cookie = export->exp_handle.h_cookie;
+
+                       if (mds_mds_conn)
+                               new_mds_mds_conn = true;
+               }
+       } else {
+               rc = obd_reconnect(req->rq_svc_thread->t_env,
+                                  export, target, &cluuid, data, client_nid);
+       }
        if (rc)
                GOTO(out, rc);
 
@@ -1200,6 +1266,14 @@ dont_check_exports:
 
                atomic_inc(&target->obd_req_replay_clients);
                atomic_inc(&target->obd_lock_replay_clients);
+               /* Note: MDS-MDS connection is allowed to be connected during
+                * recovery, no matter if the exports need to be recovered,
+                * because we need to retrieve update logs from all other MDTs.
+                * So if the MDS-MDS export is new, obd_max_recoverable_clients
+                * also needs to be increased to match other recovery checking
+                * conditions. */
+               if (new_mds_mds_conn)
+                       target->obd_max_recoverable_clients++;
                if (atomic_inc_return(&target->obd_connected_clients) ==
                    target->obd_max_recoverable_clients)
                        wake_up(&target->obd_next_transno_waitq);
@@ -1236,17 +1310,12 @@ dont_check_exports:
         * ptlrpc_handle_server_req_in->lustre_unpack_msg(). */
         revimp->imp_msg_magic = req->rq_reqmsg->lm_magic;
 
-       if ((data->ocd_connect_flags & OBD_CONNECT_AT) &&
-           (revimp->imp_msg_magic != LUSTRE_MSG_MAGIC_V1))
+       if (data->ocd_connect_flags & OBD_CONNECT_AT)
                revimp->imp_msghdr_flags |= MSGHDR_AT_SUPPORT;
        else
                revimp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT;
 
-       if ((data->ocd_connect_flags & OBD_CONNECT_FULL20) &&
-            (revimp->imp_msg_magic != LUSTRE_MSG_MAGIC_V1))
-                revimp->imp_msghdr_flags |= MSGHDR_CKSUM_INCOMPAT18;
-        else
-                revimp->imp_msghdr_flags &= ~MSGHDR_CKSUM_INCOMPAT18;
+       revimp->imp_msghdr_flags |= MSGHDR_CKSUM_INCOMPAT18;
 
        rc = sptlrpc_import_sec_adapt(revimp, req->rq_svc_ctx, &req->rq_flvr);
        if (rc) {
@@ -1347,6 +1416,7 @@ static int target_exp_enqueue_req_replay(struct ptlrpc_request *req)
         __u64                  transno = lustre_msg_get_transno(req->rq_reqmsg);
         struct obd_export     *exp = req->rq_export;
         struct ptlrpc_request *reqiter;
+       struct ptlrpc_request *dup_req = NULL;
         int                    dup = 0;
 
         LASSERT(exp);
@@ -1355,6 +1425,7 @@ static int target_exp_enqueue_req_replay(struct ptlrpc_request *req)
        list_for_each_entry(reqiter, &exp->exp_req_replay_queue,
                                 rq_replay_list) {
                 if (lustre_msg_get_transno(reqiter->rq_reqmsg) == transno) {
+                       dup_req = reqiter;
                         dup = 1;
                         break;
                 }
@@ -1366,6 +1437,16 @@ static int target_exp_enqueue_req_replay(struct ptlrpc_request *req)
                      (MSG_RESENT | MSG_REPLAY)) != (MSG_RESENT | MSG_REPLAY))
                         CERROR("invalid flags %x of resent replay\n",
                                lustre_msg_get_flags(req->rq_reqmsg));
+
+               if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) {
+                       __u32 new_conn;
+
+                       new_conn = lustre_msg_get_conn_cnt(req->rq_reqmsg);
+                       if (new_conn >
+                           lustre_msg_get_conn_cnt(dup_req->rq_reqmsg))
+                               lustre_msg_set_conn_cnt(dup_req->rq_reqmsg,
+                                                       new_conn);
+               }
         } else {
                list_add_tail(&req->rq_replay_list,
                                   &exp->exp_req_replay_queue);
@@ -1385,8 +1466,9 @@ static void target_exp_dequeue_req_replay(struct ptlrpc_request *req)
        spin_unlock(&req->rq_export->exp_lock);
 }
 
-static void target_finish_recovery(struct obd_device *obd)
+static void target_finish_recovery(struct lu_target *lut)
 {
+       struct obd_device *obd = lut->lut_obd;
         ENTRY;
 
        /* Only log a recovery message when recovery has occurred. */
@@ -1419,6 +1501,10 @@ static void target_finish_recovery(struct obd_device *obd)
        }
        spin_unlock(&obd->obd_recovery_task_lock);
 
+       if (lut->lut_tdtd != NULL &&
+           !list_empty(&lut->lut_tdtd->tdtd_replay_list))
+               dtrq_list_dump(lut->lut_tdtd, D_ERROR);
+
         obd->obd_recovery_end = cfs_time_current_sec();
 
        /* When recovery finished, cleanup orphans on MDS and OST. */
@@ -1495,6 +1581,7 @@ void target_cleanup_recovery(struct obd_device *obd)
                return;
        }
        obd->obd_recovering = obd->obd_abort_recovery = 0;
+       obd->obd_force_abort_recovery = 0;
        spin_unlock(&obd->obd_dev_lock);
 
        spin_lock(&obd->obd_recovery_task_lock);
@@ -1535,7 +1622,8 @@ static void target_start_recovery_timer(struct obd_device *obd)
                return;
 
        spin_lock(&obd->obd_dev_lock);
-       if (!obd->obd_recovering || obd->obd_abort_recovery) {
+       if (!obd->obd_recovering || obd->obd_abort_recovery ||
+           obd->obd_force_abort_recovery) {
                spin_unlock(&obd->obd_dev_lock);
                return;
        }
@@ -1576,7 +1664,8 @@ static void extend_recovery_timer(struct obd_device *obd, int drt, bool extend)
        int to;
 
        spin_lock(&obd->obd_dev_lock);
-       if (!obd->obd_recovering || obd->obd_abort_recovery) {
+       if (!obd->obd_recovering || obd->obd_abort_recovery ||
+           obd->obd_force_abort_recovery) {
                spin_unlock(&obd->obd_dev_lock);
                 return;
         }
@@ -1678,23 +1767,14 @@ static inline int exp_finished(struct obd_export *exp)
         return (exp->exp_in_recovery && !exp->exp_lock_replay_needed);
 }
 
-/** Checking routines for recovery */
-static int check_for_clients(struct obd_device *obd)
-{
-       unsigned int clnts = atomic_read(&obd->obd_connected_clients);
-
-       if (obd->obd_abort_recovery || obd->obd_recovery_expired)
-               return 1;
-       LASSERT(clnts <= obd->obd_max_recoverable_clients);
-       return (clnts + obd->obd_stale_clients ==
-               obd->obd_max_recoverable_clients);
-}
-
-static int check_for_next_transno(struct obd_device *obd)
+static int check_for_next_transno(struct lu_target *lut)
 {
        struct ptlrpc_request *req = NULL;
+       struct obd_device *obd = lut->lut_obd;
        int wake_up = 0, connected, completed, queue_len;
-       __u64 next_transno, req_transno;
+       __u64 req_transno = 0;
+       __u64 update_transno = 0;
+       __u64 next_transno = 0;
        ENTRY;
 
        spin_lock(&obd->obd_recovery_task_lock);
@@ -1702,8 +1782,13 @@ static int check_for_next_transno(struct obd_device *obd)
                req = list_entry(obd->obd_req_replay_queue.next,
                                     struct ptlrpc_request, rq_list);
                req_transno = lustre_msg_get_transno(req->rq_reqmsg);
-       } else {
-               req_transno = 0;
+       }
+
+       if (lut->lut_tdtd != NULL) {
+               struct target_distribute_txn_data *tdtd;
+
+               tdtd = lut->lut_tdtd;
+               update_transno = distribute_txn_get_next_transno(lut->lut_tdtd);
        }
 
        connected = atomic_read(&obd->obd_connected_clients);
@@ -1716,13 +1801,14 @@ static int check_for_next_transno(struct obd_device *obd)
               obd->obd_max_recoverable_clients, connected, completed,
               queue_len, req_transno, next_transno);
 
-       if (obd->obd_abort_recovery) {
+       if (obd->obd_abort_recovery || obd->obd_force_abort_recovery) {
                CDEBUG(D_HA, "waking for aborted recovery\n");
                wake_up = 1;
        } else if (obd->obd_recovery_expired) {
                CDEBUG(D_HA, "waking for expired recovery\n");
                wake_up = 1;
-       } else if (req_transno == next_transno) {
+       } else if (req_transno == next_transno ||
+                  (update_transno != 0 && update_transno <= next_transno)) {
                CDEBUG(D_HA, "waking for next ("LPD64")\n", next_transno);
                wake_up = 1;
        } else if (queue_len > 0 &&
@@ -1738,10 +1824,10 @@ static int check_for_next_transno(struct obd_device *obd)
                CDEBUG(d_lvl,
                       "%s: waking for gap in transno, VBR is %s (skip: "
                       LPD64", ql: %d, comp: %d, conn: %d, next: "LPD64
-                      ", last_committed: "LPD64")\n",
+                      ", next_update "LPD64" last_committed: "LPD64")\n",
                       obd->obd_name, obd->obd_version_recov ? "ON" : "OFF",
                       next_transno, queue_len, completed, connected,
-                      req_transno, obd->obd_last_committed);
+                      req_transno, update_transno, obd->obd_last_committed);
                obd->obd_next_recovery_transno = req_transno;
                wake_up = 1;
        } else if (atomic_read(&obd->obd_req_replay_clients) == 0) {
@@ -1757,8 +1843,9 @@ static int check_for_next_transno(struct obd_device *obd)
        return wake_up;
 }
 
-static int check_for_next_lock(struct obd_device *obd)
+static int check_for_next_lock(struct lu_target *lut)
 {
+       struct obd_device *obd = lut->lut_obd;
        int wake_up = 0;
 
        spin_lock(&obd->obd_recovery_task_lock);
@@ -1768,7 +1855,7 @@ static int check_for_next_lock(struct obd_device *obd)
        } else if (atomic_read(&obd->obd_lock_replay_clients) == 0) {
                CDEBUG(D_HA, "waking for completed lock replay\n");
                wake_up = 1;
-       } else if (obd->obd_abort_recovery) {
+       } else if (obd->obd_abort_recovery || obd->obd_force_abort_recovery) {
                CDEBUG(D_HA, "waking for aborted recovery\n");
                wake_up = 1;
        } else if (obd->obd_recovery_expired) {
@@ -1785,10 +1872,11 @@ static int check_for_next_lock(struct obd_device *obd)
  * check its status with help of check_routine
  * evict dead clients via health_check
  */
-static int target_recovery_overseer(struct obd_device *obd,
-                                   int (*check_routine)(struct obd_device *),
+static int target_recovery_overseer(struct lu_target *lut,
+                                   int (*check_routine)(struct lu_target *),
                                    int (*health_check)(struct obd_export *))
 {
+       struct obd_device       *obd = lut->lut_obd;
 repeat:
        if ((obd->obd_recovery_start != 0) && (cfs_time_current_sec() >=
              (obd->obd_recovery_start + obd->obd_recovery_time_hard))) {
@@ -1797,11 +1885,11 @@ repeat:
        }
 
        while (wait_event_timeout(obd->obd_next_transno_waitq,
-                                 check_routine(obd),
+                                 check_routine(lut),
                                  msecs_to_jiffies(60 * MSEC_PER_SEC)) == 0)
                /* wait indefinitely for event, but don't trigger watchdog */;
 
-       if (obd->obd_abort_recovery) {
+       if (obd->obd_abort_recovery || obd->obd_force_abort_recovery) {
                CWARN("recovery is aborted, evict exports in recovery\n");
                /** evict exports which didn't finish recovery yet */
                class_disconnect_stale_exports(obd, exp_finished);
@@ -1828,50 +1916,13 @@ repeat:
        return 0;
 }
 
-static struct ptlrpc_request *target_next_replay_req(struct obd_device *obd)
-{
-       struct ptlrpc_request *req = NULL;
-       ENTRY;
-
-       CDEBUG(D_HA, "Waiting for transno "LPD64"\n",
-               obd->obd_next_recovery_transno);
-
-       CFS_FAIL_TIMEOUT(OBD_FAIL_TGT_REPLAY_DELAY2, cfs_fail_val);
-       /** It is needed to extend recovery window above recovery_time_soft.
-        *  Extending is possible only in the end of recovery window
-        *  (see more details in handle_recovery_req).
-        */
-       CFS_FAIL_TIMEOUT_MS(OBD_FAIL_TGT_REPLAY_DELAY, 300);
-
-       if (target_recovery_overseer(obd, check_for_next_transno,
-                                    exp_req_replay_healthy)) {
-               abort_req_replay_queue(obd);
-               abort_lock_replay_queue(obd);
-       }
-
-       spin_lock(&obd->obd_recovery_task_lock);
-       if (!list_empty(&obd->obd_req_replay_queue)) {
-               req = list_entry(obd->obd_req_replay_queue.next,
-                                    struct ptlrpc_request, rq_list);
-               list_del_init(&req->rq_list);
-               obd->obd_requests_queued_for_recovery--;
-               spin_unlock(&obd->obd_recovery_task_lock);
-       } else {
-               spin_unlock(&obd->obd_recovery_task_lock);
-               LASSERT(list_empty(&obd->obd_req_replay_queue));
-               LASSERT(atomic_read(&obd->obd_req_replay_clients) == 0);
-               /** evict exports failed VBR */
-               class_disconnect_stale_exports(obd, exp_vbr_healthy);
-       }
-       RETURN(req);
-}
-
-static struct ptlrpc_request *target_next_replay_lock(struct obd_device *obd)
+static struct ptlrpc_request *target_next_replay_lock(struct lu_target *lut)
 {
+       struct obd_device       *obd = lut->lut_obd;
        struct ptlrpc_request *req = NULL;
 
        CDEBUG(D_HA, "Waiting for lock\n");
-       if (target_recovery_overseer(obd, check_for_next_lock,
+       if (target_recovery_overseer(lut, check_for_next_lock,
                                     exp_lock_replay_healthy))
                abort_lock_replay_queue(obd);
 
@@ -1965,6 +2016,318 @@ static void handle_recovery_req(struct ptlrpc_thread *thread,
        EXIT;
 }
 
+/**
+ * Checking routine for recovery.
+ *
+ * \param[in] lut      target whose obd device and distribute txn data
+ *                     are examined
+ *
+ * \retval 1   recovery was force-aborted, or enough clients are accounted
+ *             for (or recovery was aborted/expired) and, when lut_tdtd is
+ *             set, its tdtd_replay_ready flag is set
+ * \retval 0   clients are still missing, or lut_tdtd is not replay ready
+ *             yet (the recovery timer is extended in that case)
+ */
+static int check_for_recovery_ready(struct lu_target *lut)
+{
+       struct obd_device *obd = lut->lut_obd;
+       struct target_distribute_txn_data *tdtd;
+       unsigned int connected = atomic_read(&obd->obd_connected_clients);
+
+       CDEBUG(D_HA, "connected %d stale %d max_recoverable_clients %d"
+              " abort %d expired %d\n", connected, obd->obd_stale_clients,
+              obd->obd_max_recoverable_clients, obd->obd_abort_recovery,
+              obd->obd_recovery_expired);
+
+       /* A forced abort short-circuits every other check. */
+       if (obd->obd_force_abort_recovery)
+               return 1;
+
+       /* The client count only matters while recovery is neither aborted
+        * nor expired. */
+       if (!(obd->obd_abort_recovery || obd->obd_recovery_expired)) {
+               LASSERT(connected <= obd->obd_max_recoverable_clients);
+               if (connected + obd->obd_stale_clients <
+                   obd->obd_max_recoverable_clients)
+                       return 0;
+       }
+
+       tdtd = lut->lut_tdtd;
+       if (tdtd == NULL)
+               return 1;
+
+       if (tdtd->tdtd_replay_ready) {
+               dtrq_list_dump(tdtd, D_HA);
+               return 1;
+       }
+
+       /* Let's extend recovery timer, in case the recovery
+        * timer expired, and some clients got evicted */
+       extend_recovery_timer(obd, obd->obd_recovery_timeout, true);
+       return 0;
+}
+
+/* Kinds of replay work; get_next_transno() stores one of these through
+ * its "type" output argument to tell the caller where the returned
+ * transno came from. */
+enum {
+       REQUEST_RECOVERY = 1,
+       UPDATE_RECOVERY = 2,
+};
+
+/**
+ * Peek at the transno of the first request on obd_req_replay_queue.
+ *
+ * \param[in] obd      device whose request replay queue is inspected
+ *
+ * \retval             transno of the first queued request,
+ *                     or 0 if the queue is empty
+ */
+static __u64 get_next_replay_req_transno(struct obd_device *obd)
+{
+       struct ptlrpc_request *head;
+
+       if (list_empty(&obd->obd_req_replay_queue))
+               return 0;
+
+       head = list_entry(obd->obd_req_replay_queue.next,
+                         struct ptlrpc_request, rq_list);
+       return lustre_msg_get_transno(head->rq_reqmsg);
+}
+/**
+ * Pick the next transno to replay, from either the request replay queue
+ * or the distribute txn (update) replay data.
+ *
+ * \param[in]  lut     target under recovery
+ * \param[out] type    if not NULL, set to REQUEST_RECOVERY or
+ *                     UPDATE_RECOVERY according to the source chosen
+ *
+ * \retval             the chosen transno; the update transno is chosen
+ *                     when no request is queued, or when it is non-zero
+ *                     and not greater than the request transno
+ */
+__u64 get_next_transno(struct lu_target *lut, int *type)
+{
+       struct target_distribute_txn_data *tdtd = lut->lut_tdtd;
+       __u64 req_transno;
+       __u64 upd_transno;
+       ENTRY;
+
+       req_transno = get_next_replay_req_transno(lut->lut_obd);
+       if (type != NULL)
+               *type = REQUEST_RECOVERY;
+
+       if (tdtd == NULL)
+               RETURN(req_transno);
+
+       upd_transno = distribute_txn_get_next_transno(tdtd);
+
+       /* Keep the request transno when a request is queued and there is
+        * either no update pending or the update comes strictly later. */
+       if (req_transno != 0 &&
+           (upd_transno == 0 || req_transno < upd_transno))
+               RETURN(req_transno);
+
+       if (type != NULL)
+               *type = UPDATE_RECOVERY;
+
+       RETURN(upd_transno);
+}
+
+/**
+ * drop duplicate replay request
+ *
+ * Because the operation has been replayed by update recovery, the request
+ * with the same transno will be dropped and also notify the client to send
+ * next replay request.
+ *
+ * For an MDS_REINT request a reply carrying the request's transno is
+ * packed and sent; for any other opcode only an error is logged. In both
+ * cases the request is removed from the export replay queue, its copy is
+ * released and obd_replayed_requests is incremented.
+ *
+ * \param[in] env      execution environment (not used by this function)
+ * \param[in] obd      failover obd device
+ * \param[in] req      request to be dropped
+ */
+static void drop_duplicate_replay_req(struct lu_env *env,
+                                     struct obd_device *obd,
+                                     struct ptlrpc_request *req)
+{
+       DEBUG_REQ(D_HA, req, "remove t"LPD64" from %s because of duplicate"
+                 " update records are found.\n",
+                 lustre_msg_get_transno(req->rq_reqmsg),
+                 libcfs_nid2str(req->rq_peer.nid));
+
+       /* Right now, only for MDS reint operation update replay and
+        * normal request replay can have the same transno */
+       if (lustre_msg_get_opc(req->rq_reqmsg) == MDS_REINT) {
+               req_capsule_set(&req->rq_pill, &RQF_MDS_REINT);
+               /* NOTE(review): rq_repmsg is used below even if packing
+                * failed - confirm it is always valid at that point */
+               req->rq_status = req_capsule_server_pack(&req->rq_pill);
+               if (likely(req->rq_export))
+                       target_committed_to_req(req);
+               lustre_msg_set_transno(req->rq_repmsg, req->rq_transno);
+               target_send_reply(req, req->rq_status, 0);
+       } else {
+               /* the two adjacent literals used to concatenate into
+                * "wrong opcfrom"; keep the separating space */
+               DEBUG_REQ(D_ERROR, req, "wrong opc from %s\n",
+                         libcfs_nid2str(req->rq_peer.nid));
+       }
+       target_exp_dequeue_req_replay(req);
+       target_request_copy_put(req);
+       obd->obd_replayed_requests++;
+}
+
+/**
+ * Update last_rcvd of the update
+ *
+ * Because update recovery might update the last_rcvd by updates, i.e.
+ * it will not update the last_rcvd information in memory, so we need
+ * to refresh this information in memory after update recovery.
+ *
+ * The update records of \a dtrq are scanned for OUT_WRITE operations on
+ * the last_rcvd object of \a lut; the client data found there is used to
+ * bring lcd_last_xid and lcd_last_result of the matching export up to
+ * date when the recorded xid is newer.
+ *
+ * \param[in] env      execution environment (not used by this function)
+ * \param[in] lut      target under recovery; supplies the obd device
+ *                     and the last_rcvd object
+ * \param[in] dtrq     the update replay request being replayed.
+ */
+static void target_update_lcd(struct lu_env *env, struct lu_target *lut,
+                             struct distribute_txn_replay_req *dtrq)
+{
+       struct obd_device       *obd = lut->lut_obd;
+       struct obd_export       *export;
+       struct tg_export_data   *ted;
+       struct distribute_txn_replay_req_sub    *dtrqs;
+       struct seq_server_site *site;
+       struct update_records   *ur;
+       const struct lu_fid     *fid;
+       struct update_ops       *ops;
+       struct update_params    *params;
+       struct update_op        *op;
+       __u32                   mdt_index;
+       unsigned int            i;
+       struct lsd_client_data  *lcd = NULL;
+
+       /* If the updates have already been executed (committed) on the
+        * recovery target, i.e. the updates are not being executed on the
+        * target, we do not need to update it in memory */
+       site = lu_site2seq(obd->obd_lu_dev->ld_site);
+       mdt_index = site->ss_node_id;
+       dtrqs = dtrq_sub_lookup(dtrq, mdt_index);
+       if (dtrqs != NULL)
+               return;
+
+       /* nothing to scan without update records */
+       if (dtrq->dtrq_lur == NULL)
+               return;
+
+       /* Find the update last_rcvd record */
+       fid = lu_object_fid(&lut->lut_last_rcvd->do_lu);
+       ur = &dtrq->dtrq_lur->lur_update_rec;
+       ops = &ur->ur_ops;
+       params = update_records_get_params(ur);
+       /* The loop never breaks, so when several operations qualify the
+        * last one determines lcd. */
+       for (i = 0, op = &ops->uops_op[0]; i < ur->ur_update_count;
+            i++, op = update_op_next_op(op)) {
+               __u64 pos;
+               __u16 size;
+               void *buf;
+
+               /* only writes to the last_rcvd object are of interest */
+               if (!lu_fid_eq(&op->uop_fid, fid))
+                       continue;
+
+               if (op->uop_type != OUT_WRITE)
+                       continue;
+
+               /* presumably parameter 1 holds the write offset and a
+                * zero offset is not a per-client slot - TODO confirm */
+               buf = update_params_get_param_buf(params, op->uop_params_off[1],
+                                                 ur->ur_param_count, NULL);
+               if (buf == NULL)
+                       continue;
+
+               pos = le64_to_cpu(*(__u64 *)buf);
+               if (pos == 0)
+                       continue;
+
+               /* parameter 0 is taken as the written data; it is accepted
+                * only if it is exactly one lsd_client_data in size */
+               buf = update_params_get_param_buf(params, op->uop_params_off[0],
+                                                 ur->ur_param_count, &size);
+               if (buf == NULL)
+                       continue;
+
+               if (size != sizeof(*lcd))
+                       continue;
+               lcd = buf;
+       }
+
+       /* no usable client record, or one without a client uuid */
+       if (lcd == NULL || lcd->lcd_uuid[0] == '\0')
+               return;
+
+       /* locate the export then update the exp_target_data if needed */
+       export = cfs_hash_lookup(obd->obd_uuid_hash, lcd->lcd_uuid);
+       if (export == NULL)
+               return;
+
+       ted = &export->exp_target_data;
+       /* only move forward: older or equal xids leave the export as is */
+       if (lcd->lcd_last_xid > ted->ted_lcd->lcd_last_xid) {
+               CDEBUG(D_HA, "%s update xid from "LPU64" to "LPU64"\n",
+                      lut->lut_obd->obd_name, ted->ted_lcd->lcd_last_xid,
+                      lcd->lcd_last_xid);
+               ted->ted_lcd->lcd_last_xid = lcd->lcd_last_xid;
+               ted->ted_lcd->lcd_last_result = lcd->lcd_last_result;
+       }
+       /* presumably drops the reference obtained by the hash lookup -
+        * TODO confirm */
+       class_export_put(export);
+}
+
+static void replay_request_or_update(struct lu_env *env,
+                                    struct lu_target *lut,
+                                    struct target_recovery_data *trd,
+                                    struct ptlrpc_thread *thread)
+{
+       struct obd_device *obd = lut->lut_obd;
+       struct ptlrpc_request *req = NULL;
+       int                     type;
+       __u64                   transno;
+       ENTRY;
+
+       CDEBUG(D_HA, "Waiting for transno "LPD64"\n",
+              obd->obd_next_recovery_transno);
+
+       /* Replay all requests and updates, taking the next one by transno */
+       do {
+               struct target_distribute_txn_data *tdtd = lut->lut_tdtd;
+
+               CFS_FAIL_TIMEOUT(OBD_FAIL_TGT_REPLAY_DELAY2, cfs_fail_val);
+
+               /** This delay is needed to extend the recovery window beyond
+                *  recovery_time_soft. Extending is possible only at the
+                *  end of the recovery window (see handle_recovery_req()
+                *  for more details).
+                */
+               CFS_FAIL_TIMEOUT_MS(OBD_FAIL_TGT_REPLAY_DELAY, 300);
+
+               if (target_recovery_overseer(lut, check_for_next_transno,
+                                            exp_req_replay_healthy)) {
+                       abort_req_replay_queue(obd);
+                       abort_lock_replay_queue(obd);
+               }
+
+               spin_lock(&obd->obd_recovery_task_lock);
+               transno = get_next_transno(lut, &type);
+               if (type == REQUEST_RECOVERY && tdtd != NULL &&
+                   transno == tdtd->tdtd_last_update_transno) {
+                       /* Drop the replay request from the client side if
+                        * it has already been executed by an update with
+                        * the same transno */
+                       req = list_entry(obd->obd_req_replay_queue.next,
+                                       struct ptlrpc_request, rq_list);
+                       list_del_init(&req->rq_list);
+                       obd->obd_requests_queued_for_recovery--;
+                       spin_unlock(&obd->obd_recovery_task_lock);
+                       drop_duplicate_replay_req(env, obd, req);
+               } else if (type == REQUEST_RECOVERY && transno != 0) {
+                       req = list_entry(obd->obd_req_replay_queue.next,
+                                            struct ptlrpc_request, rq_list);
+                       list_del_init(&req->rq_list);
+                       obd->obd_requests_queued_for_recovery--;
+                       spin_unlock(&obd->obd_recovery_task_lock);
+                       LASSERT(trd->trd_processing_task == current_pid());
+                       DEBUG_REQ(D_HA, req, "processing t"LPD64" from %s",
+                                 lustre_msg_get_transno(req->rq_reqmsg),
+                                 libcfs_nid2str(req->rq_peer.nid));
+
+                       handle_recovery_req(thread, req,
+                                           trd->trd_recovery_handler);
+                       /**
+                        * bz18031: increase next_recovery_transno before
+                        * target_request_copy_put() drops the exp_rpc reference
+                        */
+                       spin_lock(&obd->obd_recovery_task_lock);
+                       obd->obd_next_recovery_transno++;
+                       spin_unlock(&obd->obd_recovery_task_lock);
+                       target_exp_dequeue_req_replay(req);
+                       target_request_copy_put(req);
+                       obd->obd_replayed_requests++;
+               } else if (type == UPDATE_RECOVERY && transno != 0) {
+                       struct distribute_txn_replay_req *dtrq;
+
+                       spin_unlock(&obd->obd_recovery_task_lock);
+
+                       LASSERT(tdtd != NULL);
+                       dtrq = distribute_txn_get_next_req(tdtd);
+                       lu_context_enter(&thread->t_env->le_ctx);
+                       tdtd->tdtd_replay_handler(env, tdtd, dtrq);
+                       lu_context_exit(&thread->t_env->le_ctx);
+                       extend_recovery_timer(obd, obd_timeout, true);
+                       LASSERT(tdtd->tdtd_last_update_transno <= transno);
+                       tdtd->tdtd_last_update_transno = transno;
+                       spin_lock(&obd->obd_recovery_task_lock);
+                       if (transno > obd->obd_next_recovery_transno)
+                               obd->obd_next_recovery_transno = transno;
+                       spin_unlock(&obd->obd_recovery_task_lock);
+                       target_update_lcd(env, lut, dtrq);
+                       dtrq_destroy(dtrq);
+               } else {
+                       spin_unlock(&obd->obd_recovery_task_lock);
+                       LASSERT(list_empty(&obd->obd_req_replay_queue));
+                       LASSERT(atomic_read(&obd->obd_req_replay_clients) == 0);
+                       /** evict exports that failed VBR */
+                       class_disconnect_stale_exports(obd, exp_vbr_healthy);
+                       break;
+               }
+       } while (1);
+}
+
 static int target_recovery_thread(void *arg)
 {
         struct lu_target *lut = arg;
@@ -2010,43 +2373,28 @@ static int target_recovery_thread(void *arg)
        spin_unlock(&obd->obd_dev_lock);
        complete(&trd->trd_starting);
 
-        /* first of all, we have to know the first transno to replay */
-        if (target_recovery_overseer(obd, check_for_clients,
-                                     exp_connect_healthy)) {
-                abort_req_replay_queue(obd);
-                abort_lock_replay_queue(obd);
-        }
+       /* first of all, we have to know the first transno to replay */
+       if (target_recovery_overseer(lut, check_for_recovery_ready,
+                                    exp_connect_healthy)) {
+               abort_req_replay_queue(obd);
+               abort_lock_replay_queue(obd);
+               if (lut->lut_tdtd != NULL)
+                       dtrq_list_destroy(lut->lut_tdtd);
+       }
 
-       /* next stage: replay requests */
+       /* next stage: replay requests or update */
        delta = jiffies;
        CDEBUG(D_INFO, "1: request replay stage - %d clients from t"LPU64"\n",
               atomic_read(&obd->obd_req_replay_clients),
               obd->obd_next_recovery_transno);
-       while ((req = target_next_replay_req(obd))) {
-               LASSERT(trd->trd_processing_task == current_pid());
-               DEBUG_REQ(D_HA, req, "processing t"LPD64" from %s",
-                         lustre_msg_get_transno(req->rq_reqmsg),
-                         libcfs_nid2str(req->rq_peer.nid));
-                handle_recovery_req(thread, req,
-                                    trd->trd_recovery_handler);
-                /**
-                 * bz18031: increase next_recovery_transno before
-                 * target_request_copy_put() will drop exp_rpc reference
-                 */
-               spin_lock(&obd->obd_recovery_task_lock);
-               obd->obd_next_recovery_transno++;
-               spin_unlock(&obd->obd_recovery_task_lock);
-                target_exp_dequeue_req_replay(req);
-                target_request_copy_put(req);
-                obd->obd_replayed_requests++;
-        }
+       replay_request_or_update(env, lut, trd, thread);
 
        /**
         * The second stage: replay locks
         */
        CDEBUG(D_INFO, "2: lock replay stage - %d clients\n",
               atomic_read(&obd->obd_lock_replay_clients));
-       while ((req = target_next_replay_lock(obd))) {
+       while ((req = target_next_replay_lock(lut))) {
                LASSERT(trd->trd_processing_task == current_pid());
                DEBUG_REQ(D_HA, req, "processing lock from %s: ",
                          libcfs_nid2str(req->rq_peer.nid));
@@ -2093,7 +2441,7 @@ static int target_recovery_thread(void *arg)
                libcfs_debug_dumplog();
        }
 
-        target_finish_recovery(obd);
+       target_finish_recovery(lut);
 
         lu_context_fini(&env->le_ctx);
         trd->trd_processing_task = 0;
@@ -2169,6 +2517,7 @@ static void target_recovery_expired(unsigned long castmeharder)
 void target_recovery_init(struct lu_target *lut, svc_handler_t handler)
 {
         struct obd_device *obd = lut->lut_obd;
+
         if (obd->obd_max_recoverable_clients == 0) {
                 /** Update server last boot epoch */
                 tgt_boot_epoch_update(lut);
@@ -2188,7 +2537,6 @@ void target_recovery_init(struct lu_target *lut, svc_handler_t handler)
 }
 EXPORT_SYMBOL(target_recovery_init);
 
-
 static int target_process_req_flags(struct obd_device *obd,
                                     struct ptlrpc_request *req)
 {
@@ -2409,20 +2757,23 @@ int target_pack_pool_reply(struct ptlrpc_request *req)
 static int target_send_reply_msg(struct ptlrpc_request *req,
                                 int rc, int fail_id)
 {
-        if (OBD_FAIL_CHECK_ORSET(fail_id & ~OBD_FAIL_ONCE, OBD_FAIL_ONCE)) {
-                DEBUG_REQ(D_ERROR, req, "dropping reply");
-                return (-ECOMM);
-        }
+       if (OBD_FAIL_CHECK_ORSET(fail_id & ~OBD_FAIL_ONCE, OBD_FAIL_ONCE)) {
+               DEBUG_REQ(D_ERROR, req, "dropping reply");
+               return -ECOMM;
+       }
+       if (unlikely(lustre_msg_get_opc(req->rq_reqmsg) == MDS_REINT &&
+                    OBD_FAIL_CHECK(OBD_FAIL_MDS_REINT_MULTI_NET_REP)))
+               return -ECOMM;
 
-        if (unlikely(rc)) {
-                DEBUG_REQ(D_NET, req, "processing error (%d)", rc);
-                req->rq_status = rc;
-                return (ptlrpc_send_error(req, 1));
-        } else {
-                DEBUG_REQ(D_NET, req, "sending reply");
-        }
+       if (unlikely(rc)) {
+               DEBUG_REQ(D_NET, req, "processing error (%d)", rc);
+               req->rq_status = rc;
+               return ptlrpc_send_error(req, 1);
+       } else {
+               DEBUG_REQ(D_NET, req, "sending reply");
+       }
 
-        return (ptlrpc_send_reply(req, PTLRPC_REPLY_MAYBE_DIFFICULT));
+       return ptlrpc_send_reply(req, PTLRPC_REPLY_MAYBE_DIFFICULT);
 }
 
 void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id)
@@ -2668,6 +3019,13 @@ int target_bulk_io(struct obd_export *exp, struct ptlrpc_bulk_desc *desc,
        } else {
                if (req->rq_bulk_read)
                        rc = sptlrpc_svc_wrap_bulk(req, desc);
+
+               if ((exp->exp_connect_data.ocd_connect_flags &
+                    OBD_CONNECT_BULK_MBITS) != 0)
+                       req->rq_mbits = lustre_msg_get_mbits(req->rq_reqmsg);
+               else /* old version, bulk matchbits is rq_xid */
+                       req->rq_mbits = req->rq_xid;
+
                if (rc == 0)
                        rc = ptlrpc_start_bulk_transfer(desc);
        }