LU-3540 lod: update recovery thread

[fs/lustre-release.git] / lustre / ldlm / ldlm_lib.c
diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c

index 0c878dc..2c288c2 100644 (file)
--- a/lustre/ldlm/ldlm_lib.c
+++ b/lustre/ldlm/ldlm_lib.c
@@ -27,7 +27,7 @@
   * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
   * Use is subject to license terms.
   *
- * Copyright (c) 2010, 2013, Intel Corporation.
+ * Copyright (c) 2010, 2014, Intel Corporation.
   */
  /*
   * This file is part of Lustre, http://www.lustre.org/
@@ -43,6 +43,7 @@
  
  #define DEBUG_SUBSYSTEM S_LDLM
  
+#include <linux/kthread.h>
  #include <libcfs/libcfs.h>
  #include <obd.h>
  #include <obd_class.h>
@@ -353,7 +354,7 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg)
         INIT_LIST_HEAD(&cli->cl_loi_hp_ready_list);
         INIT_LIST_HEAD(&cli->cl_loi_write_list);
         INIT_LIST_HEAD(&cli->cl_loi_read_list);
-       client_obd_list_lock_init(&cli->cl_loi_list_lock);
+       spin_lock_init(&cli->cl_loi_list_lock);
         atomic_set(&cli->cl_pending_w_pages, 0);
         atomic_set(&cli->cl_pending_r_pages, 0);
         cli->cl_r_in_flight = 0;
@@ -372,7 +373,7 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg)
         atomic_long_set(&cli->cl_lru_busy, 0);
         atomic_long_set(&cli->cl_lru_in_list, 0);
         INIT_LIST_HEAD(&cli->cl_lru_list);
-       client_obd_list_lock_init(&cli->cl_lru_list_lock);
+       spin_lock_init(&cli->cl_lru_list_lock);
         atomic_long_set(&cli->cl_unstable_count, 0);
  
         init_waitqueue_head(&cli->cl_destroy_waitq);
@@ -438,10 +439,9 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg)
                  GOTO(err_import, rc);
          }
  
-        cli->cl_import = imp;
-        /* cli->cl_max_mds_{easize,cookiesize} updated by mdc_init_ea_size() */
-        cli->cl_max_mds_easize = sizeof(struct lov_mds_md_v3);
-        cli->cl_max_mds_cookiesize = sizeof(struct llog_cookie);
+       cli->cl_import = imp;
+       /* cli->cl_max_mds_easize updated by mdc_init_ea_size() */
+       cli->cl_max_mds_easize = sizeof(struct lov_mds_md_v3);
  
          if (LUSTRE_CFG_BUFLEN(lcfg, 3) > 0) {
                  if (!strcmp(lustre_cfg_string(lcfg, 3), "inactive")) {
@@ -464,8 +464,6 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg)
                  GOTO(err_import, rc = -ENOMEM);
          }
  
-        cli->cl_qchk_stat = CL_NOT_QUOTACHECKED;
-
          RETURN(rc);
  
  err_import:
@@ -508,7 +506,7 @@ int client_connect_import(const struct lu_env *env,
  
          *exp = NULL;
         down_write(&cli->cl_sem);
-        if (cli->cl_conn_count > 0 )
+       if (cli->cl_conn_count > 0)
                  GOTO(out_sem, rc = -EALREADY);
  
          rc = class_connect(&conn, obd, cluuid);
@@ -580,17 +578,17 @@ int client_disconnect_export(struct obd_export *exp)
          imp = cli->cl_import;
  
         down_write(&cli->cl_sem);
-        CDEBUG(D_INFO, "disconnect %s - %d\n", obd->obd_name,
-               cli->cl_conn_count);
+       CDEBUG(D_INFO, "disconnect %s - %zu\n", obd->obd_name,
+               cli->cl_conn_count);
  
-        if (!cli->cl_conn_count) {
+       if (cli->cl_conn_count == 0) {
                  CERROR("disconnecting disconnected device (%s)\n",
                         obd->obd_name);
                  GOTO(out_disconnect, rc = -EINVAL);
          }
  
          cli->cl_conn_count--;
-        if (cli->cl_conn_count)
+       if (cli->cl_conn_count != 0)
                  GOTO(out_disconnect, rc = 0);
  
         /* Mark import deactivated now, so we don't try to reconnect if any
@@ -752,7 +750,6 @@ void target_client_add_cb(struct obd_device *obd, __u64 transno, void *cb_data,
         spin_unlock(&exp->exp_lock);
         class_export_cb_put(exp);
  }
-EXPORT_SYMBOL(target_client_add_cb);
  
  static void
  check_and_start_recovery_timer(struct obd_device *obd,
@@ -774,6 +771,8 @@ int target_handle_connect(struct ptlrpc_request *req)
          char *target_start;
          int target_len;
         bool     mds_conn = false, lw_client = false;
+       bool     mds_mds_conn = false;
+       bool     new_mds_mds_conn = false;
          struct obd_connect_data *data, *tmpdata;
          int size, tmpsize;
          lnet_nid_t *client_nid = NULL;
@@ -871,6 +870,20 @@ int target_handle_connect(struct ptlrpc_request *req)
          if (rc)
                  GOTO(out, rc);
  
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 53, 0)
+       /* Don't allow clients to connect that are using old 1.8 format
+        * protocol conventions (LUSTRE_MSG_MAGIC_v1, !MSGHDR_CKSUM_INCOMPAT18,
+        * ldlm_flock_policy_wire format, MDT_ATTR_xTIME_SET, etc).  The
+        * FULL20 flag should be set on all connections since 2.0, but no
+        * longer affects behaviour.
+        *
+        * Later this check will be disabled and the flag can be retired
+        * completely once interop with 3.0 is no longer needed.
+        */
+       if (!(data->ocd_connect_flags & OBD_CONNECT_FULL20))
+               GOTO(out, rc = -EPROTO);
+#endif
+
         if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT) {
                 if (data->ocd_version < LUSTRE_VERSION_CODE -
                                                LUSTRE_VERSION_ALLOWED_OFFSET ||
@@ -896,10 +909,47 @@ int target_handle_connect(struct ptlrpc_request *req)
                 }
         }
  
+       /* Note: lw_client is needed in MDS-MDS failover during update log
+        * processing, so we need to allow lw_client to be connected at
+        * any time, instead of only on the initial connection */
+       lw_client = (data->ocd_connect_flags & OBD_CONNECT_LIGHTWEIGHT) != 0;
+
         if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_INITIAL) {
                 mds_conn = (data->ocd_connect_flags & OBD_CONNECT_MDS) != 0;
-               lw_client = (data->ocd_connect_flags &
-                            OBD_CONNECT_LIGHTWEIGHT) != 0;
+               mds_mds_conn = (data->ocd_connect_flags &
+                               OBD_CONNECT_MDS_MDS) != 0;
+
+               /* OBD_CONNECT_MNE_SWAB is defined as OBD_CONNECT_MDS_MDS
+                * for Imperative Recovery connections from MGC to MGS.
+                *
+                * By checking OBD_CONNECT_FID, we can distinguish whether
+                * the OBD_CONNECT_MDS_MDS/OBD_CONNECT_MNE_SWAB is from
+                * MGC or MDT. */
+               if (!lw_client &&
+                   (data->ocd_connect_flags & OBD_CONNECT_MDS_MDS) &&
+                   (data->ocd_connect_flags & OBD_CONNECT_FID) &&
+                   (data->ocd_connect_flags & OBD_CONNECT_VERSION)) {
+                       __u32 major = OBD_OCD_VERSION_MAJOR(data->ocd_version);
+                       __u32 minor = OBD_OCD_VERSION_MINOR(data->ocd_version);
+                       __u32 patch = OBD_OCD_VERSION_PATCH(data->ocd_version);
+
+                       /* We do not support the MDT-MDT interoperations with
+                        * different version MDT because of protocol changes. */
+                       if (unlikely(major != LUSTRE_MAJOR ||
+                                    minor != LUSTRE_MINOR ||
+                                    abs(patch - LUSTRE_PATCH) > 3)) {
+                               LCONSOLE_WARN("%s (%u.%u.%u.%u) refused the "
+                                       "connection from different version MDT "
+                                       "(%d.%d.%d.%d) %s %s\n",
+                                       target->obd_name, LUSTRE_MAJOR,
+                                       LUSTRE_MINOR, LUSTRE_PATCH, LUSTRE_FIX,
+                                       major, minor, patch,
+                                       OBD_OCD_VERSION_FIX(data->ocd_version),
+                                       libcfs_nid2str(req->rq_peer.nid), str);
+
+                               GOTO(out, rc = -EPROTO);
+                       }
+               }
         }
  
          /* lctl gets a backstage, all-access pass. */
@@ -1019,7 +1069,9 @@ no_export:
  
          if (export == NULL) {
                 /* allow lightweight connections during recovery */
-               if (target->obd_recovering && !lw_client) {
+               /* allow "new" MDT to be connected during recovery, since we
+                * need to retrieve recovery update records from it */
+               if (target->obd_recovering && !lw_client && !mds_mds_conn) {
                          cfs_time_t t;
                         int     c; /* connected */
                         int     i; /* in progress */
@@ -1033,47 +1085,38 @@ no_export:
                         t = cfs_timer_deadline(&target->obd_recovery_timer);
                         t = cfs_time_sub(t, cfs_time_current());
                         t = cfs_duration_sec(t);
-                       LCONSOLE_WARN("%s: Denying connection for new client "
-                                     "%s (at %s), waiting for all %d known "
-                                     "clients (%d recovered, %d in progress, "
-                                     "and %d evicted) to recover in %d:%.02d\n",
+                       LCONSOLE_WARN("%s: Denying connection for new client %s"
+                                     " (at %s), waiting for %d known clients "
+                                     "(%d recovered, %d in progress, and %d "
+                                     "evicted) to recover in %d:%.02d\n",
                                       target->obd_name, cluuid.uuid,
                                       libcfs_nid2str(req->rq_peer.nid), k,
                                       c - i, i, s, (int)t / 60,
                                       (int)t % 60);
-                        rc = -EBUSY;
-                } else {
+                       rc = -EBUSY;
+               } else {
  dont_check_exports:
-                        rc = obd_connect(req->rq_svc_thread->t_env,
-                                         &export, target, &cluuid, data,
-                                         client_nid);
+                       rc = obd_connect(req->rq_svc_thread->t_env,
+                                        &export, target, &cluuid, data,
+                                        client_nid);
                         if (mds_conn && OBD_FAIL_CHECK(OBD_FAIL_TGT_RCVG_FLAG))
                                 lustre_msg_add_op_flags(req->rq_repmsg,
-                                               MSG_CONNECT_RECOVERING);
-                        if (rc == 0)
-                                conn.cookie = export->exp_handle.h_cookie;
-                }
-        } else {
-                rc = obd_reconnect(req->rq_svc_thread->t_env,
-                                   export, target, &cluuid, data, client_nid);
-        }
-        if (rc)
-                GOTO(out, rc);
+                                                       MSG_CONNECT_RECOVERING);
+                       if (rc == 0)
+                               conn.cookie = export->exp_handle.h_cookie;
  
-#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 6, 53, 0)
-       /* 2.2.0 clients always swab nidtbl entries due to a bug, so server
-        * will do the swabbing for if the client is using the same endianness.
-        *
-        * This fixup is version-limited, because we don't want to carry the
-        * OBD_CONNECT_MNE_SWAB flag around forever, just so long as we need
-        * interop with unpatched 2.2 clients.  For newer clients, servers
-        * will never do MNE swabbing, let the client handle that.  LU-1644 */
-       export->exp_need_mne_swab = !ptlrpc_req_need_swab(req) &&
-                       !(data->ocd_connect_flags & OBD_CONNECT_MNE_SWAB);
-#endif
+                       if (mds_mds_conn)
+                               new_mds_mds_conn = true;
+               }
+       } else {
+               rc = obd_reconnect(req->rq_svc_thread->t_env,
+                                  export, target, &cluuid, data, client_nid);
+       }
+       if (rc)
+               GOTO(out, rc);
  
-        LASSERT(target->u.obt.obt_magic == OBT_MAGIC);
-        data->ocd_instance = target->u.obt.obt_instance;
+       LASSERT(target->u.obt.obt_magic == OBT_MAGIC);
+       data->ocd_instance = target->u.obt.obt_instance;
  
          /* Return only the parts of obd_connect_data that we understand, so the
           * client knows that we don't understand the rest. */
@@ -1181,6 +1224,14 @@ dont_check_exports:
  
                 atomic_inc(&target->obd_req_replay_clients);
                 atomic_inc(&target->obd_lock_replay_clients);
+               /* Note: an MDS-MDS connection is allowed during recovery
+                * whether or not its export needs to be recovered, because
+                * we need to retrieve update logs from all other MDTs.
+                * So if the MDS-MDS export is new, obd_max_recoverable_clients
+                * also needs to be increased to match the other recovery
+                * checking conditions. */
+               if (new_mds_mds_conn)
+                       target->obd_max_recoverable_clients++;
                 if (atomic_inc_return(&target->obd_connected_clients) ==
                     target->obd_max_recoverable_clients)
                         wake_up(&target->obd_next_transno_waitq);
@@ -1217,17 +1268,12 @@ dont_check_exports:
          * ptlrpc_handle_server_req_in->lustre_unpack_msg(). */
          revimp->imp_msg_magic = req->rq_reqmsg->lm_magic;
  
-       if ((data->ocd_connect_flags & OBD_CONNECT_AT) &&
-           (revimp->imp_msg_magic != LUSTRE_MSG_MAGIC_V1))
+       if (data->ocd_connect_flags & OBD_CONNECT_AT)
                 revimp->imp_msghdr_flags |= MSGHDR_AT_SUPPORT;
         else
                 revimp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT;
  
-       if ((data->ocd_connect_flags & OBD_CONNECT_FULL20) &&
-            (revimp->imp_msg_magic != LUSTRE_MSG_MAGIC_V1))
-                revimp->imp_msghdr_flags |= MSGHDR_CKSUM_INCOMPAT18;
-        else
-                revimp->imp_msghdr_flags &= ~MSGHDR_CKSUM_INCOMPAT18;
+       revimp->imp_msghdr_flags |= MSGHDR_CKSUM_INCOMPAT18;
  
         rc = sptlrpc_import_sec_adapt(revimp, req->rq_svc_ctx, &req->rq_flvr);
         if (rc) {
@@ -1261,7 +1307,6 @@ out:
                 req->rq_status = rc;
         RETURN(rc);
  }
-EXPORT_SYMBOL(target_handle_connect);
  
  int target_handle_disconnect(struct ptlrpc_request *req)
  {
@@ -1277,7 +1322,6 @@ int target_handle_disconnect(struct ptlrpc_request *req)
  
          RETURN(0);
  }
-EXPORT_SYMBOL(target_handle_disconnect);
  
  void target_destroy_export(struct obd_export *exp)
  {
@@ -1368,8 +1412,9 @@ static void target_exp_dequeue_req_replay(struct ptlrpc_request *req)
         spin_unlock(&req->rq_export->exp_lock);
  }
  
-static void target_finish_recovery(struct obd_device *obd)
+static void target_finish_recovery(struct lu_target *lut)
  {
+       struct obd_device *obd = lut->lut_obd;
          ENTRY;
  
         /* Only log a recovery message when recovery has occurred. */
@@ -1402,6 +1447,10 @@ static void target_finish_recovery(struct obd_device *obd)
         }
         spin_unlock(&obd->obd_recovery_task_lock);
  
+       if (lut->lut_tdtd != NULL &&
+           !list_empty(&lut->lut_tdtd->tdtd_replay_list))
+               dtrq_list_dump(lut->lut_tdtd, D_ERROR);
+
          obd->obd_recovery_end = cfs_time_current_sec();
  
         /* When recovery finished, cleanup orphans on MDS and OST. */
@@ -1478,6 +1527,7 @@ void target_cleanup_recovery(struct obd_device *obd)
                 return;
         }
         obd->obd_recovering = obd->obd_abort_recovery = 0;
+       obd->obd_force_abort_recovery = 0;
         spin_unlock(&obd->obd_dev_lock);
  
         spin_lock(&obd->obd_recovery_task_lock);
@@ -1486,7 +1536,7 @@ void target_cleanup_recovery(struct obd_device *obd)
         spin_unlock(&obd->obd_recovery_task_lock);
  
         list_for_each_entry_safe(req, n, &clean_list, rq_list) {
-               LASSERT(req->rq_reply_state == 0);
+               LASSERT(req->rq_reply_state == NULL);
                 target_exp_dequeue_req_replay(req);
                 target_request_copy_put(req);
         }
@@ -1497,7 +1547,7 @@ void target_cleanup_recovery(struct obd_device *obd)
         spin_unlock(&obd->obd_recovery_task_lock);
  
         list_for_each_entry_safe(req, n, &clean_list, rq_list) {
-                LASSERT(req->rq_reply_state == 0);
+               LASSERT(req->rq_reply_state == NULL);
                  target_request_copy_put(req);
          }
  
@@ -1511,7 +1561,6 @@ void target_cancel_recovery_timer(struct obd_device *obd)
          CDEBUG(D_HA, "%s: cancel recovery timer\n", obd->obd_name);
          cfs_timer_disarm(&obd->obd_recovery_timer);
  }
-EXPORT_SYMBOL(target_cancel_recovery_timer);
  
  static void target_start_recovery_timer(struct obd_device *obd)
  {
@@ -1519,7 +1568,8 @@ static void target_start_recovery_timer(struct obd_device *obd)
                 return;
  
         spin_lock(&obd->obd_dev_lock);
-       if (!obd->obd_recovering || obd->obd_abort_recovery) {
+       if (!obd->obd_recovering || obd->obd_abort_recovery ||
+           obd->obd_force_abort_recovery) {
                 spin_unlock(&obd->obd_dev_lock);
                 return;
         }
@@ -1560,7 +1610,8 @@ static void extend_recovery_timer(struct obd_device *obd, int drt, bool extend)
         int to;
  
         spin_lock(&obd->obd_dev_lock);
-       if (!obd->obd_recovering || obd->obd_abort_recovery) {
+       if (!obd->obd_recovering || obd->obd_abort_recovery ||
+           obd->obd_force_abort_recovery) {
                 spin_unlock(&obd->obd_dev_lock);
                  return;
          }
@@ -1575,22 +1626,20 @@ static void extend_recovery_timer(struct obd_device *obd, int drt, bool extend)
                  to += drt - left;
          } else if (!extend && (drt > to)) {
                  to = drt;
-                /* reduce drt by already passed time */
-                drt -= obd->obd_recovery_timeout - left;
          }
  
          if (to > obd->obd_recovery_time_hard)
                  to = obd->obd_recovery_time_hard;
-       if (obd->obd_recovery_timeout < to ||
-           obd->obd_recovery_timeout == obd->obd_recovery_time_hard) {
+       if (obd->obd_recovery_timeout < to) {
                  obd->obd_recovery_timeout = to;
-                cfs_timer_arm(&obd->obd_recovery_timer,
-                              cfs_time_shift(drt));
+               end = obd->obd_recovery_start + to;
+               cfs_timer_arm(&obd->obd_recovery_timer,
+                               cfs_time_shift(end - now));
          }
         spin_unlock(&obd->obd_dev_lock);
  
         CDEBUG(D_HA, "%s: recovery timer will expire in %u seconds\n",
-              obd->obd_name, (unsigned)drt);
+               obd->obd_name, (unsigned)cfs_time_sub(end, now));
  }
  
  /* Reset the timer with each new client connection */
@@ -1664,23 +1713,14 @@ static inline int exp_finished(struct obd_export *exp)
          return (exp->exp_in_recovery && !exp->exp_lock_replay_needed);
  }
  
-/** Checking routines for recovery */
-static int check_for_clients(struct obd_device *obd)
-{
-       unsigned int clnts = atomic_read(&obd->obd_connected_clients);
-
-       if (obd->obd_abort_recovery || obd->obd_recovery_expired)
-               return 1;
-       LASSERT(clnts <= obd->obd_max_recoverable_clients);
-       return (clnts + obd->obd_stale_clients ==
-               obd->obd_max_recoverable_clients);
-}
-
-static int check_for_next_transno(struct obd_device *obd)
+static int check_for_next_transno(struct lu_target *lut)
  {
         struct ptlrpc_request *req = NULL;
+       struct obd_device *obd = lut->lut_obd;
         int wake_up = 0, connected, completed, queue_len;
-       __u64 next_transno, req_transno;
+       __u64 req_transno = 0;
+       __u64 update_transno = 0;
+       __u64 next_transno = 0;
         ENTRY;
  
         spin_lock(&obd->obd_recovery_task_lock);
@@ -1688,8 +1728,14 @@ static int check_for_next_transno(struct obd_device *obd)
                 req = list_entry(obd->obd_req_replay_queue.next,
                                      struct ptlrpc_request, rq_list);
                 req_transno = lustre_msg_get_transno(req->rq_reqmsg);
-       } else {
-               req_transno = 0;
+       }
+
+       if (lut->lut_tdtd != NULL) {
+               struct target_distribute_txn_data *tdtd = lut->lut_tdtd;
+
+               /* Assign the function-scope update_transno; redeclaring it
+                * in this block would leave the outer value stuck at 0. */
+               update_transno = distribute_txn_get_next_transno(tdtd);
         }
  
         connected = atomic_read(&obd->obd_connected_clients);
@@ -1702,13 +1748,14 @@ static int check_for_next_transno(struct obd_device *obd)
                obd->obd_max_recoverable_clients, connected, completed,
                queue_len, req_transno, next_transno);
  
-       if (obd->obd_abort_recovery) {
+       if (obd->obd_abort_recovery || obd->obd_force_abort_recovery) {
                 CDEBUG(D_HA, "waking for aborted recovery\n");
                 wake_up = 1;
         } else if (obd->obd_recovery_expired) {
                 CDEBUG(D_HA, "waking for expired recovery\n");
                 wake_up = 1;
-       } else if (req_transno == next_transno) {
+       } else if (req_transno == next_transno || (update_transno != 0 &&
+                                          update_transno <= next_transno)) {
                 CDEBUG(D_HA, "waking for next ("LPD64")\n", next_transno);
                 wake_up = 1;
         } else if (queue_len > 0 &&
@@ -1724,10 +1771,10 @@ static int check_for_next_transno(struct obd_device *obd)
                 CDEBUG(d_lvl,
                        "%s: waking for gap in transno, VBR is %s (skip: "
                        LPD64", ql: %d, comp: %d, conn: %d, next: "LPD64
-                      ", last_committed: "LPD64")\n",
+                      ", next_update "LPD64" last_committed: "LPD64")\n",
                        obd->obd_name, obd->obd_version_recov ? "ON" : "OFF",
                        next_transno, queue_len, completed, connected,
-                      req_transno, obd->obd_last_committed);
+                      req_transno, update_transno, obd->obd_last_committed);
                 obd->obd_next_recovery_transno = req_transno;
                 wake_up = 1;
         } else if (atomic_read(&obd->obd_req_replay_clients) == 0) {
@@ -1743,8 +1790,9 @@ static int check_for_next_transno(struct obd_device *obd)
         return wake_up;
  }
  
-static int check_for_next_lock(struct obd_device *obd)
+static int check_for_next_lock(struct lu_target *lut)
  {
+       struct obd_device *obd = lut->lut_obd;
         int wake_up = 0;
  
         spin_lock(&obd->obd_recovery_task_lock);
@@ -1754,7 +1802,7 @@ static int check_for_next_lock(struct obd_device *obd)
         } else if (atomic_read(&obd->obd_lock_replay_clients) == 0) {
                 CDEBUG(D_HA, "waking for completed lock replay\n");
                 wake_up = 1;
-       } else if (obd->obd_abort_recovery) {
+       } else if (obd->obd_abort_recovery || obd->obd_force_abort_recovery) {
                 CDEBUG(D_HA, "waking for aborted recovery\n");
                 wake_up = 1;
         } else if (obd->obd_recovery_expired) {
@@ -1771,13 +1819,24 @@ static int check_for_next_lock(struct obd_device *obd)
   * check its status with help of check_routine
   * evict dead clients via health_check
   */
-static int target_recovery_overseer(struct obd_device *obd,
-                                   int (*check_routine)(struct obd_device *),
+static int target_recovery_overseer(struct lu_target *lut,
+                                   int (*check_routine)(struct lu_target *),
                                     int (*health_check)(struct obd_export *))
  {
+       struct obd_device       *obd = lut->lut_obd;
  repeat:
-       wait_event(obd->obd_next_transno_waitq, check_routine(obd));
-       if (obd->obd_abort_recovery) {
+       if ((obd->obd_recovery_start != 0) && (cfs_time_current_sec() >=
+             (obd->obd_recovery_start + obd->obd_recovery_time_hard))) {
+               CWARN("recovery is aborted by hard timeout\n");
+               obd->obd_abort_recovery = 1;
+       }
+
+       while (wait_event_timeout(obd->obd_next_transno_waitq,
+                                 check_routine(lut),
+                                 msecs_to_jiffies(60 * MSEC_PER_SEC)) == 0)
+               /* wait indefinitely for event, but don't trigger watchdog */;
+
+       if (obd->obd_abort_recovery || obd->obd_force_abort_recovery) {
                 CWARN("recovery is aborted, evict exports in recovery\n");
                 /** evict exports which didn't finish recovery yet */
                 class_disconnect_stale_exports(obd, exp_finished);
@@ -1804,45 +1863,13 @@ repeat:
         return 0;
  }
  
-static struct ptlrpc_request *target_next_replay_req(struct obd_device *obd)
-{
-       struct ptlrpc_request *req = NULL;
-       ENTRY;
-
-       CDEBUG(D_HA, "Waiting for transno "LPD64"\n",
-               obd->obd_next_recovery_transno);
-
-       CFS_FAIL_TIMEOUT(OBD_FAIL_TGT_REPLAY_DELAY2, cfs_fail_val);
-
-       if (target_recovery_overseer(obd, check_for_next_transno,
-                                    exp_req_replay_healthy)) {
-               abort_req_replay_queue(obd);
-               abort_lock_replay_queue(obd);
-       }
-
-       spin_lock(&obd->obd_recovery_task_lock);
-       if (!list_empty(&obd->obd_req_replay_queue)) {
-               req = list_entry(obd->obd_req_replay_queue.next,
-                                    struct ptlrpc_request, rq_list);
-               list_del_init(&req->rq_list);
-               obd->obd_requests_queued_for_recovery--;
-               spin_unlock(&obd->obd_recovery_task_lock);
-       } else {
-               spin_unlock(&obd->obd_recovery_task_lock);
-               LASSERT(list_empty(&obd->obd_req_replay_queue));
-               LASSERT(atomic_read(&obd->obd_req_replay_clients) == 0);
-               /** evict exports failed VBR */
-               class_disconnect_stale_exports(obd, exp_vbr_healthy);
-       }
-       RETURN(req);
-}
-
-static struct ptlrpc_request *target_next_replay_lock(struct obd_device *obd)
+static struct ptlrpc_request *target_next_replay_lock(struct lu_target *lut)
  {
+       struct obd_device       *obd = lut->lut_obd;
         struct ptlrpc_request *req = NULL;
  
         CDEBUG(D_HA, "Waiting for lock\n");
-       if (target_recovery_overseer(obd, check_for_next_lock,
+       if (target_recovery_overseer(lut, check_for_next_lock,
                                      exp_lock_replay_healthy))
                 abort_lock_replay_queue(obd);
  
@@ -1936,6 +1963,318 @@ static void handle_recovery_req(struct ptlrpc_thread *thread,
         EXIT;
  }
  
+/** Checking routine for recovery start: returns 1 when ready, otherwise 0 */
+static int check_for_recovery_ready(struct lu_target *lut)
+{
+       struct obd_device *obd = lut->lut_obd;
+       unsigned int clnts = atomic_read(&obd->obd_connected_clients);
+
+       CDEBUG(D_HA, "connected %d stale %d max_recoverable_clients %d"
+              " abort %d expired %d\n", clnts, obd->obd_stale_clients,
+              obd->obd_max_recoverable_clients, obd->obd_abort_recovery,
+              obd->obd_recovery_expired);
+
+       if (obd->obd_force_abort_recovery)
+               return 1;       /* forced abort skips every other check */
+
+       if (!obd->obd_abort_recovery && !obd->obd_recovery_expired) {
+               LASSERT(clnts <= obd->obd_max_recoverable_clients);
+               if (clnts + obd->obd_stale_clients <
+                   obd->obd_max_recoverable_clients)
+                       return 0;       /* not all clients accounted for */
+       }
+
+       if (lut->lut_tdtd != NULL) {
+               if (!lut->lut_tdtd->tdtd_replay_ready) {
+                       /* Update replay is not ready: extend the recovery
+                        * timer in case it expired and evicted clients */
+                       extend_recovery_timer(obd, obd->obd_recovery_timeout,
+                                             true);
+                       return 0;
+               } else {
+                       dtrq_list_dump(lut->lut_tdtd, D_HA);
+               }
+       }
+
+       return 1;
+}
+
+enum {
+       REQUEST_RECOVERY = 1,   /* next transno is a queued replay request */
+       UPDATE_RECOVERY = 2,    /* next transno comes from the tdtd updates */
+};
+
+/* Peek at the transno of the first queued replay request; 0 if none. */
+static __u64 get_next_replay_req_transno(struct obd_device *obd)
+{
+       struct ptlrpc_request *head;
+
+       if (list_empty(&obd->obd_req_replay_queue))
+               return 0;
+
+       head = list_entry(obd->obd_req_replay_queue.next,
+                         struct ptlrpc_request, rq_list);
+
+       return lustre_msg_get_transno(head->rq_reqmsg);
+}
+
+/* Next transno to replay; *type reports request vs. update recovery. */
+__u64 get_next_transno(struct lu_target *lut, int *type)
+{
+       struct target_distribute_txn_data *tdtd = lut->lut_tdtd;
+       int     source = REQUEST_RECOVERY;
+       __u64   next;
+       __u64   upd;
+       ENTRY;
+
+       next = get_next_replay_req_transno(lut->lut_obd);
+       if (tdtd != NULL) {
+               upd = distribute_txn_get_next_transno(tdtd);
+               /* The update is chosen when no request is queued, or when
+                * it is non-zero and not above the request's transno. */
+               if (next == 0 || (upd != 0 && next >= upd)) {
+                       next = upd;
+                       source = UPDATE_RECOVERY;
+               }
+       }
+
+       if (type != NULL)
+               *type = source;
+
+       RETURN(next);
+}
+
+/**
+ * Drop a duplicate replay request.
+ *
+ * The operation has already been replayed by update recovery, so the client
+ * request carrying the same transno is dropped; for MDS_REINT a reply is
+ * still sent so that the client goes on to send its next replay request.
+ *
+ * \param[in] env      execution environment
+ * \param[in] obd      failover obd device
+ * \param[in] req      request to be dropped
+ */
+static void drop_duplicate_replay_req(struct lu_env *env,
+                                     struct obd_device *obd,
+                                     struct ptlrpc_request *req)
+{
+       DEBUG_REQ(D_HA, req, "remove t"LPD64" from %s because of duplicate"
+                 " update records are found.\n",
+                 lustre_msg_get_transno(req->rq_reqmsg),
+                 libcfs_nid2str(req->rq_peer.nid));
+
+       /* Right now, only for MDS reint operation update replay and
+        * normal request replay can have the same transno */
+       if (lustre_msg_get_opc(req->rq_reqmsg) == MDS_REINT) {
+               req_capsule_set(&req->rq_pill, &RQF_MDS_REINT);
+               req->rq_status = req_capsule_server_pack(&req->rq_pill);
+               if (likely(req->rq_export))
+                       target_committed_to_req(req);
+               lustre_msg_set_transno(req->rq_repmsg, req->rq_transno);
+               target_send_reply(req, req->rq_status, 0);
+       } else {
+               DEBUG_REQ(D_ERROR, req, "wrong opc from %s\n",
+                         libcfs_nid2str(req->rq_peer.nid));
+       }
+       target_exp_dequeue_req_replay(req);
+       target_request_copy_put(req);
+       obd->obd_replayed_requests++;
+}
+
+/**
+ * Refresh the in-memory last_rcvd client data after an update replay
+ *
+ * Update recovery may write last_rcvd through the update records without
+ * refreshing the in-memory copy, so refresh it here after update recovery.
+ *
+ * \param[in] env      execution environment (not used in this function)
+ * \param[in] lut      target under recovery
+ * \param[in] dtrq     the update replay request being replayed.
+ */
+static void target_update_lcd(struct lu_env *env, struct lu_target *lut,
+                             struct distribute_txn_replay_req *dtrq)
+{
+       struct obd_device       *obd = lut->lut_obd;
+       struct obd_export       *export;
+       struct tg_export_data   *ted;
+       struct distribute_txn_replay_req_sub    *dtrqs;
+       struct seq_server_site *site;
+       struct update_records   *ur;
+       const struct lu_fid     *fid;
+       struct update_ops       *ops;
+       struct update_params    *params;
+       struct update_op        *op;
+       __u32                   mdt_index;
+       unsigned int            i;
+       struct lsd_client_data  *lcd = NULL;
+
+       /* If the updates have already been executed (committed) on the
+        * recovery target, they are not being replayed on it now, so the
+        * in-memory data does not need to be updated */
+       site = lu_site2seq(obd->obd_lu_dev->ld_site);
+       mdt_index = site->ss_node_id;
+       dtrqs = dtrq_sub_lookup(dtrq, mdt_index);
+       if (dtrqs != NULL)
+               return;
+
+       if (dtrq->dtrq_lur == NULL)
+               return;
+
+       /* Find the update last_rcvd record */
+       fid = lu_object_fid(&lut->lut_last_rcvd->do_lu);
+       ur = &dtrq->dtrq_lur->lur_update_rec;
+       ops = &ur->ur_ops;
+       params = update_records_get_params(ur);
+       for (i = 0, op = &ops->uops_op[0]; i < ur->ur_update_count;
+            i++, op = update_op_next_op(op)) {
+               __u64 pos;
+               __u16 size;
+               void *buf;
+
+               if (!lu_fid_eq(&op->uop_fid, fid))
+                       continue;
+
+               if (op->uop_type != OUT_WRITE)
+                       continue;
+
+               buf = update_params_get_param_buf(params, op->uop_params_off[1],
+                                                 ur->ur_param_count, NULL);
+               if (buf == NULL)
+                       continue;
+
+               pos = le64_to_cpu(*(__u64 *)buf);
+               if (pos == 0)
+                       continue;
+
+               buf = update_params_get_param_buf(params, op->uop_params_off[0],
+                                                 ur->ur_param_count, &size);
+               if (buf == NULL)
+                       continue;
+
+               if (size != sizeof(*lcd))
+                       continue;
+               lcd = buf;      /* no break: the last matching op wins */
+       }
+
+       if (lcd == NULL || lcd->lcd_uuid[0] == '\0')
+               return;
+
+       /* locate the export then update the exp_target_data if needed */
+       export = cfs_hash_lookup(obd->obd_uuid_hash, lcd->lcd_uuid);
+       if (export == NULL)
+               return;
+
+       ted = &export->exp_target_data;
+       if (lcd->lcd_last_xid > ted->ted_lcd->lcd_last_xid) {
+               CDEBUG(D_HA, "%s update xid from "LPU64" to "LPU64"\n",
+                      lut->lut_obd->obd_name, ted->ted_lcd->lcd_last_xid,
+                      lcd->lcd_last_xid);
+               ted->ted_lcd->lcd_last_xid = lcd->lcd_last_xid;
+               ted->ted_lcd->lcd_last_result = lcd->lcd_last_result;
+       }
+       class_export_put(export);
+}
+
+static void replay_request_or_update(struct lu_env *env,
+                                    struct lu_target *lut,
+                                    struct target_recovery_data *trd,
+                                    struct ptlrpc_thread *thread)
+{
+       struct obd_device *obd = lut->lut_obd;
+       struct ptlrpc_request *req = NULL;
+       int                     type;
+       __u64                   transno;
+       ENTRY;
+
+       CDEBUG(D_HA, "Waiting for transno "LPD64"\n",
+              obd->obd_next_recovery_transno);
+
+       /* Replay every queued request and update, chosen by transno */
+       do {
+               struct target_distribute_txn_data *tdtd = lut->lut_tdtd;
+
+               CFS_FAIL_TIMEOUT(OBD_FAIL_TGT_REPLAY_DELAY2, cfs_fail_val);
+
+               /** It is needed to extend recovery window above
+                *  recovery_time_soft. Extending is possible only in the
+                *  end of recovery window (see more details in
+                *  handle_recovery_req()).
+                */
+               CFS_FAIL_TIMEOUT_MS(OBD_FAIL_TGT_REPLAY_DELAY, 300);
+
+               if (target_recovery_overseer(lut, check_for_next_transno,
+                                            exp_req_replay_healthy)) {
+                       abort_req_replay_queue(obd);
+                       abort_lock_replay_queue(obd);
+               }
+
+               spin_lock(&obd->obd_recovery_task_lock);
+               transno = get_next_transno(lut, &type);
+               if (type == REQUEST_RECOVERY && tdtd != NULL &&
+                   transno == tdtd->tdtd_last_update_transno) {
+                       /* Drop the replay request from the client side if
+                        * the operation has already been executed by an
+                        * update with the same transno */
+                       req = list_entry(obd->obd_req_replay_queue.next,
+                                       struct ptlrpc_request, rq_list);
+                       list_del_init(&req->rq_list);
+                       obd->obd_requests_queued_for_recovery--;
+                       spin_unlock(&obd->obd_recovery_task_lock);
+                       drop_duplicate_replay_req(env, obd, req);
+               } else if (type == REQUEST_RECOVERY && transno != 0) {
+                       req = list_entry(obd->obd_req_replay_queue.next,
+                                            struct ptlrpc_request, rq_list);
+                       list_del_init(&req->rq_list);
+                       obd->obd_requests_queued_for_recovery--;
+                       spin_unlock(&obd->obd_recovery_task_lock);
+                       LASSERT(trd->trd_processing_task == current_pid());
+                       DEBUG_REQ(D_HA, req, "processing t"LPD64" from %s",
+                                 lustre_msg_get_transno(req->rq_reqmsg),
+                                 libcfs_nid2str(req->rq_peer.nid));
+
+                       handle_recovery_req(thread, req,
+                                           trd->trd_recovery_handler);
+                       /**
+                        * bz18031: increase next_recovery_transno before
+                        * target_request_copy_put() will drop exp_rpc reference
+                        */
+                       spin_lock(&obd->obd_recovery_task_lock);
+                       obd->obd_next_recovery_transno++;
+                       spin_unlock(&obd->obd_recovery_task_lock);
+                       target_exp_dequeue_req_replay(req);
+                       target_request_copy_put(req);
+                       obd->obd_replayed_requests++;
+               } else if (type == UPDATE_RECOVERY && transno != 0) {
+                       struct distribute_txn_replay_req *dtrq;
+
+                       spin_unlock(&obd->obd_recovery_task_lock);
+
+                       LASSERT(tdtd != NULL);
+                       dtrq = distribute_txn_get_next_req(tdtd);
+                       lu_context_enter(&thread->t_env->le_ctx);
+                       tdtd->tdtd_replay_handler(env, tdtd, dtrq);
+                       lu_context_exit(&thread->t_env->le_ctx);
+                       extend_recovery_timer(obd, obd_timeout, true);
+                       LASSERT(tdtd->tdtd_last_update_transno <= transno);
+                       tdtd->tdtd_last_update_transno = transno;
+                       spin_lock(&obd->obd_recovery_task_lock);
+                       if (transno > obd->obd_next_recovery_transno)
+                               obd->obd_next_recovery_transno = transno;
+                       spin_unlock(&obd->obd_recovery_task_lock);
+                       target_update_lcd(env, lut, dtrq);
+                       dtrq_destory(dtrq);
+               } else {
+                       spin_unlock(&obd->obd_recovery_task_lock);
+                       LASSERT(list_empty(&obd->obd_req_replay_queue));
+                       LASSERT(atomic_read(&obd->obd_req_replay_clients) == 0);
+                       /** evict exports that failed VBR */
+                       class_disconnect_stale_exports(obd, exp_vbr_healthy);
+                       break;  /* nothing left to replay: leave the loop */
+               }
+       } while (1);
+}
+
  static int target_recovery_thread(void *arg)
  {
          struct lu_target *lut = arg;
@@ -1981,43 +2320,28 @@ static int target_recovery_thread(void *arg)
         spin_unlock(&obd->obd_dev_lock);
         complete(&trd->trd_starting);
  
-        /* first of all, we have to know the first transno to replay */
-        if (target_recovery_overseer(obd, check_for_clients,
-                                     exp_connect_healthy)) {
-                abort_req_replay_queue(obd);
-                abort_lock_replay_queue(obd);
-        }
+       /* first of all, we have to know the first transno to replay */
+       if (target_recovery_overseer(lut, check_for_recovery_ready,
+                                    exp_connect_healthy)) {
+               abort_req_replay_queue(obd);
+               abort_lock_replay_queue(obd);
+               if (lut->lut_tdtd != NULL)
+                       dtrq_list_destroy(lut->lut_tdtd);
+       }
  
-       /* next stage: replay requests */
+       /* next stage: replay requests or update */
         delta = jiffies;
         CDEBUG(D_INFO, "1: request replay stage - %d clients from t"LPU64"\n",
                atomic_read(&obd->obd_req_replay_clients),
                obd->obd_next_recovery_transno);
-       while ((req = target_next_replay_req(obd))) {
-               LASSERT(trd->trd_processing_task == current_pid());
-               DEBUG_REQ(D_HA, req, "processing t"LPD64" from %s",
-                         lustre_msg_get_transno(req->rq_reqmsg),
-                         libcfs_nid2str(req->rq_peer.nid));
-                handle_recovery_req(thread, req,
-                                    trd->trd_recovery_handler);
-                /**
-                 * bz18031: increase next_recovery_transno before
-                 * target_request_copy_put() will drop exp_rpc reference
-                 */
-               spin_lock(&obd->obd_recovery_task_lock);
-               obd->obd_next_recovery_transno++;
-               spin_unlock(&obd->obd_recovery_task_lock);
-                target_exp_dequeue_req_replay(req);
-                target_request_copy_put(req);
-                obd->obd_replayed_requests++;
-        }
+       replay_request_or_update(env, lut, trd, thread);
  
         /**
          * The second stage: replay locks
          */
         CDEBUG(D_INFO, "2: lock replay stage - %d clients\n",
                atomic_read(&obd->obd_lock_replay_clients));
-       while ((req = target_next_replay_lock(obd))) {
+       while ((req = target_next_replay_lock(lut))) {
                 LASSERT(trd->trd_processing_task == current_pid());
                 DEBUG_REQ(D_HA, req, "processing lock from %s: ",
                           libcfs_nid2str(req->rq_peer.nid));
@@ -2031,6 +2355,7 @@ static int target_recovery_thread(void *arg)
           * The third stage: reply on final pings, at this moment all clients
           * must have request in final queue
           */
+       CFS_FAIL_TIMEOUT(OBD_FAIL_TGT_REPLAY_RECONNECT, cfs_fail_val);
          CDEBUG(D_INFO, "3: final stage - process recovery completion pings\n");
          /** Update server last boot epoch */
          tgt_boot_epoch_update(lut);
@@ -2053,9 +2378,9 @@ static int target_recovery_thread(void *arg)
                  * export is being evicted */
                 ptlrpc_update_export_timer(req->rq_export, 0);
                 target_request_copy_put(req);
-        }
+       }
  
-       delta = (jiffies - delta) / HZ;
+       delta = jiffies_to_msecs(jiffies - delta) / MSEC_PER_SEC;
         CDEBUG(D_INFO,"4: recovery completed in %lus - %d/%d reqs/locks\n",
               delta, obd->obd_replayed_requests, obd->obd_replayed_locks);
         if (delta > OBD_RECOVERY_TIME_SOFT) {
@@ -2063,7 +2388,7 @@ static int target_recovery_thread(void *arg)
                 libcfs_debug_dumplog();
         }
  
-        target_finish_recovery(obd);
+       target_finish_recovery(lut);
  
          lu_context_fini(&env->le_ctx);
          trd->trd_processing_task = 0;
@@ -2139,6 +2464,7 @@ static void target_recovery_expired(unsigned long castmeharder)
  void target_recovery_init(struct lu_target *lut, svc_handler_t handler)
  {
          struct obd_device *obd = lut->lut_obd;
+
          if (obd->obd_max_recoverable_clients == 0) {
                  /** Update server last boot epoch */
                  tgt_boot_epoch_update(lut);
@@ -2158,7 +2484,6 @@ void target_recovery_init(struct lu_target *lut, svc_handler_t handler)
  }
  EXPORT_SYMBOL(target_recovery_init);
  
-
  static int target_process_req_flags(struct obd_device *obd,
                                      struct ptlrpc_request *req)
  {
@@ -2323,14 +2648,12 @@ added:
         wake_up(&obd->obd_next_transno_waitq);
         RETURN(0);
  }
-EXPORT_SYMBOL(target_queue_recovery_request);
  
  int target_handle_ping(struct ptlrpc_request *req)
  {
          obd_ping(req->rq_svc_thread->t_env, req->rq_export);
          return req_capsule_server_pack(&req->rq_pill);
  }
-EXPORT_SYMBOL(target_handle_ping);
  
  void target_committed_to_req(struct ptlrpc_request *req)
  {
@@ -2347,7 +2670,6 @@ void target_committed_to_req(struct ptlrpc_request *req)
          CDEBUG(D_INFO, "last_committed "LPU64", transno "LPU64", xid "LPU64"\n",
                 exp->exp_last_committed, req->rq_transno, req->rq_xid);
  }
-EXPORT_SYMBOL(target_committed_to_req);
  
  #endif /* HAVE_SERVER_SUPPORT */
  
@@ -2378,9 +2700,9 @@ int target_pack_pool_reply(struct ptlrpc_request *req)
  
          RETURN(0);
  }
-EXPORT_SYMBOL(target_pack_pool_reply);
  
-int target_send_reply_msg(struct ptlrpc_request *req, int rc, int fail_id)
+static int target_send_reply_msg(struct ptlrpc_request *req,
+                                int rc, int fail_id)
  {
          if (OBD_FAIL_CHECK_ORSET(fail_id & ~OBD_FAIL_ONCE, OBD_FAIL_ONCE)) {
                  DEBUG_REQ(D_ERROR, req, "dropping reply");
@@ -2489,7 +2811,6 @@ void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id)
         spin_unlock(&svcpt->scp_rep_lock);
         EXIT;
  }
-EXPORT_SYMBOL(target_send_reply);
  
  ldlm_mode_t lck_compat_array[] = {
         [LCK_EX]    = LCK_COMPAT_EX,
@@ -2512,6 +2833,7 @@ int ldlm_error2errno(ldlm_error_t error)
  
          switch (error) {
          case ELDLM_OK:
+       case ELDLM_LOCK_MATCHED:
                  result = 0;
                  break;
          case ELDLM_LOCK_CHANGED:
@@ -2578,7 +2900,6 @@ ldlm_error_t ldlm_errno2error(int err_no)
          }
          return error;
  }
-EXPORT_SYMBOL(ldlm_errno2error);
  
  #if LUSTRE_TRACKS_LOCK_EXP_REFS
  void ldlm_dump_export_locks(struct obd_export *exp)
@@ -2607,9 +2928,13 @@ static int target_bulk_timeout(void *data)
          RETURN(1);
  }
  
-static inline char *bulk2type(struct ptlrpc_bulk_desc *desc)
+static inline const char *bulk2type(struct ptlrpc_request *req)
  {
-        return desc->bd_type == BULK_GET_SINK ? "GET" : "PUT";
+       if (req->rq_bulk_read)
+               return "READ";
+       if (req->rq_bulk_write)
+               return "WRITE";
+       return "UNKNOWN";
  }
  
  int target_bulk_io(struct obd_export *exp, struct ptlrpc_bulk_desc *desc,
@@ -2636,7 +2961,7 @@ int target_bulk_io(struct obd_export *exp, struct ptlrpc_bulk_desc *desc,
             exp->exp_conn_cnt > lustre_msg_get_conn_cnt(req->rq_reqmsg)) {
                 rc = -ENOTCONN;
         } else {
-               if (desc->bd_type == BULK_PUT_SINK)
+               if (req->rq_bulk_read)
                         rc = sptlrpc_svc_wrap_bulk(req, desc);
                 if (rc == 0)
                         rc = ptlrpc_start_bulk_transfer(desc);
@@ -2644,7 +2969,7 @@ int target_bulk_io(struct obd_export *exp, struct ptlrpc_bulk_desc *desc,
  
         if (rc < 0) {
                 DEBUG_REQ(D_ERROR, req, "bulk %s failed: rc %d",
-                         bulk2type(desc), rc);
+                         bulk2type(req), rc);
                 RETURN(rc);
         }
  
@@ -2662,6 +2987,7 @@ int target_bulk_io(struct obd_export *exp, struct ptlrpc_bulk_desc *desc,
                 long timeoutl = deadline - cfs_time_current_sec();
                 cfs_duration_t timeout = timeoutl <= 0 ?
                                          CFS_TICK : cfs_time_seconds(timeoutl);
+               time_t  rq_deadline;
  
                 *lwi = LWI_TIMEOUT_INTERVAL(timeout, cfs_time_seconds(1),
                                             target_bulk_timeout, desc);
@@ -2673,39 +2999,45 @@ int target_bulk_io(struct obd_export *exp, struct ptlrpc_bulk_desc *desc,
                                   lwi);
                 LASSERT(rc == 0 || rc == -ETIMEDOUT);
                 /* Wait again if we changed rq_deadline. */
+               rq_deadline = ACCESS_ONCE(req->rq_deadline);
                 deadline = start + bulk_timeout;
-               if (deadline > req->rq_deadline)
-                       deadline = req->rq_deadline;
+               if (deadline > rq_deadline)
+                       deadline = rq_deadline;
         } while ((rc == -ETIMEDOUT) &&
                  (deadline > cfs_time_current_sec()));
  
         if (rc == -ETIMEDOUT) {
                 DEBUG_REQ(D_ERROR, req, "timeout on bulk %s after %ld%+lds",
-                         bulk2type(desc), deadline - start,
+                         bulk2type(req), deadline - start,
                           cfs_time_current_sec() - deadline);
                 ptlrpc_abort_bulk(desc);
         } else if (exp->exp_failed) {
                 DEBUG_REQ(D_ERROR, req, "Eviction on bulk %s",
-                         bulk2type(desc));
+                         bulk2type(req));
                 rc = -ENOTCONN;
                 ptlrpc_abort_bulk(desc);
         } else if (exp->exp_conn_cnt >
                    lustre_msg_get_conn_cnt(req->rq_reqmsg)) {
                 DEBUG_REQ(D_ERROR, req, "Reconnect on bulk %s",
-                         bulk2type(desc));
+                         bulk2type(req));
                 /* We don't reply anyway. */
                 rc = -ETIMEDOUT;
                 ptlrpc_abort_bulk(desc);
-       } else if (desc->bd_failure ||
-                  desc->bd_nob_transferred != desc->bd_nob) {
-               DEBUG_REQ(D_ERROR, req, "%s bulk %s %d(%d)",
-                         desc->bd_failure ? "network error on" : "truncated",
-                         bulk2type(desc), desc->bd_nob_transferred,
-                         desc->bd_nob);
-               /* XXX Should this be a different errno? */
+       } else if (desc->bd_failure) {
+               DEBUG_REQ(D_ERROR, req, "network error on bulk %s",
+                         bulk2type(req));
+               /* XXX should this be a different errno? */
                 rc = -ETIMEDOUT;
-       } else if (desc->bd_type == BULK_GET_SINK) {
-               rc = sptlrpc_svc_unwrap_bulk(req, desc);
+       } else {
+               if (req->rq_bulk_write)
+                       rc = sptlrpc_svc_unwrap_bulk(req, desc);
+               if (rc == 0 && desc->bd_nob_transferred != desc->bd_nob) {
+                       DEBUG_REQ(D_ERROR, req, "truncated bulk %s %d(%d)",
+                                 bulk2type(req), desc->bd_nob_transferred,
+                                 desc->bd_nob);
+                       /* XXX should this be a different errno? */
+                       rc = -ETIMEDOUT;
+               }
         }
  
         RETURN(rc);