Whamcloud - gitweb
Branch HEAD
[fs/lustre-release.git] / lustre / ldlm / ldlm_lib.c
index a9b6f08..b4e9d61 100644 (file)
@@ -99,7 +99,7 @@ static int import_set_conn(struct obd_import *imp, struct obd_uuid *uuid,
         if (create) {
                 imp_conn->oic_conn = ptlrpc_conn;
                 imp_conn->oic_uuid = *uuid;
-                item->oic_last_attempt = 0;
+                imp_conn->oic_last_attempt = 0;
                 if (priority)
                         list_add(&imp_conn->oic_item, &imp->imp_conn_list);
                 else
@@ -188,7 +188,7 @@ out:
         RETURN(rc);
 }
 
-static void destroy_import(struct obd_import *imp)
+void client_destroy_import(struct obd_import *imp)
 {
         /* drop security policy instance after all rpc finished/aborted
          * to let all busy contexts be released. */
@@ -277,6 +277,7 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg)
                 cli->cl_dirty_max = num_physpages << (CFS_PAGE_SHIFT - 3);
         CFS_INIT_LIST_HEAD(&cli->cl_cache_waiters);
         CFS_INIT_LIST_HEAD(&cli->cl_loi_ready_list);
+        CFS_INIT_LIST_HEAD(&cli->cl_loi_hp_ready_list);
         CFS_INIT_LIST_HEAD(&cli->cl_loi_write_list);
         CFS_INIT_LIST_HEAD(&cli->cl_loi_read_list);
         client_obd_list_lock_init(&cli->cl_loi_list_lock);
@@ -385,37 +386,45 @@ int client_obd_cleanup(struct obd_device *obddev)
 
 /* ->o_connect() method for client side (OSC and MDC and MGC) */
 int client_connect_import(const struct lu_env *env,
-                          struct lustre_handle *dlm_handle,
+                          struct obd_export **exp,
                           struct obd_device *obd, struct obd_uuid *cluuid,
                           struct obd_connect_data *data, void *localdata)
 {
         struct client_obd *cli = &obd->u.cli;
         struct obd_import *imp = cli->cl_import;
-        struct obd_export *exp;
         struct obd_connect_data *ocd;
         struct ldlm_namespace *to_be_freed = NULL;
+        struct lustre_handle conn = { 0 };
         int rc;
         ENTRY;
 
+        *exp = NULL;
         down_write(&cli->cl_sem);
-        rc = class_connect(dlm_handle, obd, cluuid);
+        if (cli->cl_conn_count > 0 )
+                GOTO(out_sem, rc = -EALREADY);
+
+        rc = class_connect(&conn, obd, cluuid);
         if (rc)
                 GOTO(out_sem, rc);
-
+                
         cli->cl_conn_count++;
-        if (cli->cl_conn_count > 1)
-                GOTO(out_sem, rc);
-        exp = class_conn2export(dlm_handle);
+        *exp = class_conn2export(&conn);
 
         if (obd->obd_namespace != NULL)
                 CERROR("already have namespace!\n");
+
+        /*
+         * Deadlock case - bug 18380
+         */
+        up_write(&cli->cl_sem);
         obd->obd_namespace = ldlm_namespace_new(obd, obd->obd_name,
                                                 LDLM_NAMESPACE_CLIENT,
                                                 LDLM_NAMESPACE_GREEDY);
+        down_write(&cli->cl_sem);
         if (obd->obd_namespace == NULL)
                 GOTO(out_disco, rc = -ENOMEM);
 
-        imp->imp_dlm_handle = *dlm_handle;
+        imp->imp_dlm_handle = conn;
         rc = ptlrpc_init_import(imp);
         if (rc != 0)
                 GOTO(out_ldlm, rc);
@@ -431,7 +440,7 @@ int client_connect_import(const struct lu_env *env,
                 LASSERT (imp->imp_state == LUSTRE_IMP_DISCON);
                 GOTO(out_ldlm, rc);
         }
-        LASSERT(exp->exp_connection);
+        LASSERT((*exp)->exp_connection);
 
         if (data) {
                 LASSERTF((ocd->ocd_connect_flags & data->ocd_connect_flags) ==
@@ -451,9 +460,8 @@ out_ldlm:
                 obd->obd_namespace = NULL;
 out_disco:
                 cli->cl_conn_count--;
-                class_disconnect(exp);
-        } else {
-                class_export_put(exp);
+                class_disconnect(*exp);
+                *exp = NULL;
         }
 out_sem:
         up_write(&cli->cl_sem);
@@ -485,12 +493,12 @@ int client_disconnect_export(struct obd_export *exp)
         if (!cli->cl_conn_count) {
                 CERROR("disconnecting disconnected device (%s)\n",
                        obd->obd_name);
-                GOTO(out_sem, rc = -EINVAL);
+                GOTO(out_disconnect, rc = -EINVAL);
         }
 
         cli->cl_conn_count--;
         if (cli->cl_conn_count)
-                GOTO(out_no_disconnect, rc = 0);
+                GOTO(out_disconnect, rc = 0);
 
         /* Mark import deactivated now, so we don't try to reconnect if any
          * of the cleanup RPCs fails (e.g. ldlm cancel, etc).  We don't
@@ -513,7 +521,13 @@ int client_disconnect_export(struct obd_export *exp)
                 to_be_freed = obd->obd_namespace;
         }
 
+        /*
+         * there's no need to hold sem while disconnecting an import,
+         * and actually it may cause deadlock in gss.
+         */
+        up_write(&cli->cl_sem);
         rc = ptlrpc_disconnect_import(imp, 0);
+        down_write(&cli->cl_sem);
 
         ptlrpc_invalidate_import(imp);
         /* set obd_namespace to NULL only after invalidate, because we can have
@@ -525,15 +539,18 @@ int client_disconnect_export(struct obd_export *exp)
                 ptlrpc_free_rq_pool(imp->imp_rq_pool);
                 imp->imp_rq_pool = NULL;
         }
-        destroy_import(imp);
+        client_destroy_import(imp);
         cli->cl_import = NULL;
 
         EXIT;
- out_no_disconnect:
+
+ out_disconnect:
+        /* use server style - class_disconnect should always be called for
+         * o_disconnect */
         err = class_disconnect(exp);
         if (!rc && err)
                 rc = err;
- out_sem:
+
         up_write(&cli->cl_sem);
         if (to_be_freed)
                 ldlm_namespace_free_post(to_be_freed);
@@ -545,11 +562,12 @@ int client_disconnect_export(struct obd_export *exp)
  * from old lib/target.c
  * -------------------------------------------------------------------------- */
 
-int target_handle_reconnect(struct lustre_handle *conn, struct obd_export *exp,
-                            struct obd_uuid *cluuid, int mds_conn)
+static int target_handle_reconnect(struct lustre_handle *conn,
+                                   struct obd_export *exp,
+                                   struct obd_uuid *cluuid)
 {
         ENTRY;
-        if (exp->exp_connection && exp->exp_imp_reverse && !mds_conn) {
+        if (exp->exp_connection && exp->exp_imp_reverse) {
                 struct lustre_handle *hdl;
                 hdl = &exp->exp_imp_reverse->imp_remote_handle;
                 /* Might be a re-connect after a partition. */
@@ -725,18 +743,35 @@ int target_handle_connect(struct ptlrpc_request *req)
         if (obd_uuid_equals(&cluuid, &target->obd_uuid))
                 goto dont_check_exports;
 
-        spin_lock(&target->obd_dev_lock);
         export = lustre_hash_lookup(target->obd_uuid_hash, &cluuid);
+        if (!export)
+                goto no_export;
 
-        if (export != NULL && export->exp_connecting) { /* bug 9635, et. al. */
+        /* we've found an export in the hash */
+        if (export->exp_connecting) { /* bug 9635, et. al. */
                 CWARN("%s: exp %p already connecting\n",
                       export->exp_obd->obd_name, export);
                 class_export_put(export);
                 export = NULL;
                 rc = -EALREADY;
-        } else if (export != NULL && export->exp_connection != NULL &&
-                   req->rq_peer.nid != export->exp_connection->c_peer.nid &&
-                   !mds_conn) {
+        } else if (mds_conn && export->exp_connection) {
+                if (req->rq_peer.nid != export->exp_connection->c_peer.nid)
+                        /* mds reconnected after failover */
+                        CWARN("%s: received MDS connection from NID %s,"
+                              " removing former export from NID %s\n",
+                            target->obd_name, libcfs_nid2str(req->rq_peer.nid),
+                            libcfs_nid2str(export->exp_connection->c_peer.nid));
+                else
+                        /* new mds connection from the same nid */
+                        CWARN("%s: received new MDS connection from NID %s,"
+                              " removing former export from same NID\n",
+                            target->obd_name, libcfs_nid2str(req->rq_peer.nid));
+                class_fail_export(export);
+                class_export_put(export);
+                export = NULL;
+                rc = 0;
+        } else if (export->exp_connection != NULL &&
+                   req->rq_peer.nid != export->exp_connection->c_peer.nid) {
                 /* in mds failover we have static uuid but nid can be
                  * changed*/
                 CWARN("%s: cookie %s seen on new NID %s when "
@@ -747,20 +782,19 @@ int target_handle_connect(struct ptlrpc_request *req)
                 rc = -EALREADY;
                 class_export_put(export);
                 export = NULL;
-        } else if (export != NULL) {
+        } else {
                 spin_lock(&export->exp_lock);
                 export->exp_connecting = 1;
                 spin_unlock(&export->exp_lock);
                 class_export_put(export);
-                spin_unlock(&target->obd_dev_lock);
                 LASSERT(export->exp_obd == target);
 
-                rc = target_handle_reconnect(&conn, export, &cluuid, mds_conn);
+                rc = target_handle_reconnect(&conn, export, &cluuid);
         }
 
         /* If we found an export, we already unlocked. */
         if (!export) {
-                spin_unlock(&target->obd_dev_lock);
+no_export:
                 OBD_FAIL_TIMEOUT(OBD_FAIL_TGT_DELAY_CONNECT, 2 * obd_timeout);
         } else if (req->rq_export == NULL &&
                    atomic_read(&export->exp_rpc_count) > 0) {
@@ -771,23 +805,25 @@ int target_handle_connect(struct ptlrpc_request *req)
                 GOTO(out, rc = -EBUSY);
         } else if (req->rq_export != NULL &&
                    (atomic_read(&export->exp_rpc_count) > 1)) {
+                /* the current connect rpc has increased exp_rpc_count */
                 CWARN("%s: refuse reconnection from %s@%s to 0x%p/%d\n",
                       target->obd_name, cluuid.uuid,
                       libcfs_nid2str(req->rq_peer.nid),
-                      export, atomic_read(&export->exp_rpc_count));
+                      export, atomic_read(&export->exp_rpc_count) - 1);
+                spin_lock(&export->exp_lock);
+                if (req->rq_export->exp_conn_cnt <
+                    lustre_msg_get_conn_cnt(req->rq_reqmsg))
+                        /* try to abort active requests */
+                        req->rq_export->exp_abort_active_req = 1;
+                spin_unlock(&export->exp_lock);
                 GOTO(out, rc = -EBUSY);
-        } else if (lustre_msg_get_conn_cnt(req->rq_reqmsg) == 1 &&
-                   !mds_conn) {
+        } else if (lustre_msg_get_conn_cnt(req->rq_reqmsg) == 1) {
                 CERROR("%s: NID %s (%s) reconnected with 1 conn_cnt; "
                        "cookies not random?\n", target->obd_name,
                        libcfs_nid2str(req->rq_peer.nid), cluuid.uuid);
                 GOTO(out, rc = -EALREADY);
         } else {
                 OBD_FAIL_TIMEOUT(OBD_FAIL_TGT_DELAY_RECONNECT, 2 * obd_timeout);
-                if (req->rq_export == NULL && mds_conn)
-                       export->exp_last_request_time =
-                               max(export->exp_last_request_time,
-                                   (time_t)cfs_time_current_sec());
         }
 
         if (rc < 0) {
@@ -840,12 +876,17 @@ int target_handle_connect(struct ptlrpc_request *req)
                 } else {
 dont_check_exports:
                         rc = obd_connect(req->rq_svc_thread->t_env,
-                                         &conn, target, &cluuid, data,
+                                         &export, target, &cluuid, data,
                                          client_nid);
+                        if (rc == 0)
+                                conn.cookie = export->exp_handle.h_cookie;
                 }
         } else {
                 rc = obd_reconnect(req->rq_svc_thread->t_env,
                                    export, target, &cluuid, data, client_nid);
+                if (rc == 0)
+                        /* previously done via class_conn2export */
+                        class_export_get(export);
         }
         if (rc)
                 GOTO(out, rc);
@@ -863,15 +904,6 @@ dont_check_exports:
 
         lustre_msg_set_handle(req->rq_repmsg, &conn);
 
-        /* ownership of this export ref transfers to the request AFTER we
-         * drop any previous reference the request had, but we don't want
-         * that to go to zero before we get our new export reference. */
-        export = class_conn2export(&conn);
-        if (!export) {
-                DEBUG_REQ(D_ERROR, req, "Missing export!");
-                GOTO(out, rc = -ENODEV);
-        }
-
         /* If the client and the server are the same node, we will already
          * have an export that really points to the client's DLM export,
          * because we have a shared handles table.
@@ -885,9 +917,7 @@ dont_check_exports:
         req->rq_export = export;
 
         spin_lock(&export->exp_lock);
-        if (mds_conn) {
-                lustre_msg_set_conn_cnt(req->rq_repmsg, export->exp_conn_cnt + 1);
-        } else if (export->exp_conn_cnt >= lustre_msg_get_conn_cnt(req->rq_reqmsg)) {
+        if (export->exp_conn_cnt >= lustre_msg_get_conn_cnt(req->rq_reqmsg)) {
                 spin_unlock(&export->exp_lock);
                 CERROR("%s: %s already connected at higher conn_cnt: %d > %d\n",
                        cluuid.uuid, libcfs_nid2str(req->rq_peer.nid),
@@ -897,6 +927,7 @@ dont_check_exports:
                 GOTO(out, rc = -EALREADY);
         }
         export->exp_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg);
+        export->exp_abort_active_req = 0;
 
         /* request from liblustre?  Don't evict it for not pinging. */
         if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT) {
@@ -942,7 +973,7 @@ dont_check_exports:
                 atomic_inc(&target->obd_lock_replay_clients);
                 if (target->obd_connected_clients ==
                     target->obd_max_recoverable_clients)
-                        wake_up(&target->obd_next_transno_waitq);
+                        cfs_waitq_signal(&target->obd_next_transno_waitq);
         }
         spin_unlock_bh(&target->obd_processing_task_lock);
         tmp = req_capsule_client_get(&req->rq_pill, &RMF_CONN);
@@ -966,7 +997,7 @@ dont_check_exports:
                  */
                 sptlrpc_import_inval_all_ctx(export->exp_imp_reverse);
 
-                destroy_import(export->exp_imp_reverse);
+                client_destroy_import(export->exp_imp_reverse);
         }
 
         /* for the rest part, we return -ENOTCONN in case of errors
@@ -994,8 +1025,7 @@ dont_check_exports:
         else
                 revimp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT;
 
-        rc = sptlrpc_import_sec_adapt(revimp, req->rq_svc_ctx,
-                                      req->rq_flvr.sf_rpc);
+        rc = sptlrpc_import_sec_adapt(revimp, req->rq_svc_ctx, &req->rq_flvr);
         if (rc) {
                 CERROR("Failed to get sec for reverse import: %d\n", rc);
                 export->exp_imp_reverse = NULL;
@@ -1036,7 +1066,7 @@ void target_destroy_export(struct obd_export *exp)
         /* exports created from last_rcvd data, and "fake"
            exports created by lctl don't have an import */
         if (exp->exp_imp_reverse != NULL)
-                destroy_import(exp->exp_imp_reverse);
+                client_destroy_import(exp->exp_imp_reverse);
 
         /* We cancel locks at disconnect time, but this will catch any locks
          * granted in a race with recovery-induced disconnect. */
@@ -1170,7 +1200,7 @@ static void target_finish_recovery(struct obd_device *obd)
             list_empty(&obd->obd_final_req_queue)) {
                 obd->obd_processing_task = 0;
         } else {
-                CERROR("%s: Recovery queues ( %s%s%s) are empty\n",
+                CERROR("%s: Recovery queues ( %s%s%s) are not empty\n",
                        obd->obd_name,
                        list_empty(&obd->obd_req_replay_queue) ? "" : "req ",
                        list_empty(&obd->obd_lock_replay_queue) ? "" : "lock ",
@@ -1183,8 +1213,9 @@ static void target_finish_recovery(struct obd_device *obd)
         /* when recovery finished, cleanup orphans on mds and ost */
         if (OBT(obd) && OBP(obd, postrecov)) {
                 int rc = OBP(obd, postrecov)(obd);
-                LCONSOLE_WARN("%s: recovery %s: rc %d\n", obd->obd_name,
-                              rc < 0 ? "failed" : "complete", rc);
+                if (rc < 0)
+                        LCONSOLE_WARN("%s: Post recovery failed, rc %d\n",
+                                      obd->obd_name, rc);
         }
 
         obd->obd_recovery_end = cfs_time_current_sec();
@@ -1221,7 +1252,7 @@ static void abort_lock_replay_queue(struct obd_device *obd)
         spin_lock_bh(&obd->obd_processing_task_lock);
         list_splice_init(&obd->obd_lock_replay_queue, &abort_list);
         spin_unlock_bh(&obd->obd_processing_task_lock);
-        list_for_each_entry_safe(req, n, &obd->obd_lock_replay_queue, rq_list){
+        list_for_each_entry_safe(req, n, &abort_list, rq_list){
                 DEBUG_REQ(D_ERROR, req, "aborted:");
                 req->rq_status = -ENOTCONN;
                 if (ptlrpc_error(req)) {
@@ -1338,7 +1369,7 @@ static void check_and_start_recovery_timer(struct obd_device *obd)
                 spin_unlock_bh(&obd->obd_processing_task_lock);
                 return;
         }
-        CWARN("%s: starting recovery timer\n", obd->obd_name);
+        CDEBUG(D_HA, "%s: starting recovery timer\n", obd->obd_name);
         obd->obd_recovery_start = cfs_time_current_sec();
         /* minimum */
         obd->obd_recovery_timeout = OBD_RECOVERY_FACTOR * obd_timeout;
@@ -1446,7 +1477,8 @@ static int check_for_next_transno(struct obd_device *obd)
                  * to replay requests that demand on already committed ones
                  * also, we can replay first non-committed transation */
                 LASSERT(req_transno != 0);
-                if (req_transno == obd->obd_last_committed + 1) {
+                if (obd->obd_version_recov ||
+                    req_transno == obd->obd_last_committed + 1) {
                         obd->obd_next_recovery_transno = req_transno;
                 } else if (req_transno > obd->obd_last_committed) {
                         /* can't continue recovery: have no needed transno */
@@ -1480,7 +1512,6 @@ static struct ptlrpc_request *target_next_replay_req(struct obd_device *obd)
         } else if (!list_empty(&obd->obd_req_replay_queue)) {
                 req = list_entry(obd->obd_req_replay_queue.next,
                                  struct ptlrpc_request, rq_list);
-                target_exp_dequeue_req_replay(req);
                 list_del_init(&req->rq_list);
                 obd->obd_requests_queued_for_recovery--;
         } else {
@@ -1545,6 +1576,11 @@ static struct ptlrpc_request *target_next_final_ping(struct obd_device *obd)
                 req = list_entry(obd->obd_final_req_queue.next,
                                  struct ptlrpc_request, rq_list);
                 list_del_init(&req->rq_list);
+                if (req->rq_export->exp_in_recovery) {
+                        spin_lock(&req->rq_export->exp_lock);
+                        req->rq_export->exp_in_recovery = 0;
+                        spin_unlock(&req->rq_export->exp_lock);
+                }
         } else {
                 req = NULL;
         }
@@ -1552,6 +1588,11 @@ static struct ptlrpc_request *target_next_final_ping(struct obd_device *obd)
         return req;
 }
 
+static inline int req_vbr_done(struct obd_export *exp)
+{
+        return (exp->exp_vbr_failed == 0);
+}
+
 static inline int req_replay_done(struct obd_export *exp)
 {
         return (exp->exp_req_replay_needed == 0);
@@ -1569,7 +1610,7 @@ static inline int connect_done(struct obd_export *exp)
 
 static int check_for_clients(struct obd_device *obd)
 {
-        if (obd->obd_abort_recovery)
+        if (obd->obd_abort_recovery || obd->obd_version_recov)
                 return 1;
         LASSERT(obd->obd_connected_clients <= obd->obd_max_recoverable_clients);
         if (obd->obd_no_conn == 0 &&
@@ -1606,21 +1647,27 @@ static int handle_recovery_req(struct ptlrpc_thread *thread,
         if (!req_replay_done(req->rq_export) ||
             !lock_replay_done(req->rq_export))
                 reset_recovery_timer(class_exp2obd(req->rq_export),
-                       AT_OFF ? obd_timeout :
+                                     AT_OFF ? obd_timeout :
                        at_get(&req->rq_rqbd->rqbd_service->srv_at_estimate), 1);
+
+        /**
+         * bz18031: increase next_recovery_transno before ptlrpc_free_clone()
+         * will drop exp_rpc reference
+         */
+        if (!req_replay_done(req->rq_export)) {
+                spin_lock_bh(&req->rq_export->exp_obd->obd_processing_task_lock);
+                req->rq_export->exp_obd->obd_next_recovery_transno++;
+                spin_unlock_bh(&req->rq_export->exp_obd->obd_processing_task_lock);
+                target_exp_dequeue_req_replay(req);
+        }
         ptlrpc_free_clone(req);
         RETURN(0);
 }
 
-static void resume_recovery_timer(struct obd_device *obd)
-{
-        /* to be safe, make it at least OBD_RECOVERY_FACTOR * obd_timeout */
-        reset_recovery_timer(obd, OBD_RECOVERY_FACTOR * obd_timeout, 1);
-}
-
 static int target_recovery_thread(void *arg)
 {
-        struct obd_device *obd = arg;
+        struct lu_target *lut = arg;
+        struct obd_device *obd = lut->lut_obd;
         struct ptlrpc_request *req;
         struct target_recovery_data *trd = &obd->obd_recovery_data;
         struct l_wait_info lwi = { 0 };
@@ -1646,8 +1693,8 @@ static int target_recovery_thread(void *arg)
         env.le_ctx.lc_thread = thread;
 
         CERROR("%s: started recovery thread pid %d\n", obd->obd_name,
-               current->pid);
-        trd->trd_processing_task = current->pid;
+               cfs_curproc_pid());
+        trd->trd_processing_task = cfs_curproc_pid();
 
         obd->obd_recovering = 1;
         complete(&trd->trd_starting);
@@ -1657,84 +1704,87 @@ static int target_recovery_thread(void *arg)
         l_wait_event(obd->obd_next_transno_waitq,
                      check_for_clients(obd), &lwi);
 
-        spin_lock_bh(&obd->obd_processing_task_lock);
-        target_cancel_recovery_timer(obd);
-        spin_unlock_bh(&obd->obd_processing_task_lock);
-
         /* If some clients haven't connected in time, evict them */
-        if (obd->obd_abort_recovery) {
+        if (obd->obd_connected_clients < obd->obd_max_recoverable_clients) {
                 CWARN("Some clients haven't connect in time (%d/%d),"
                        "evict them\n", obd->obd_connected_clients,
                        obd->obd_max_recoverable_clients);
-                obd->obd_abort_recovery = obd->obd_stopping;
-                class_disconnect_stale_exports(obd, connect_done);
+                class_disconnect_stale_exports(obd, connect_done,
+                                               exp_flags_from_obd(obd) |
+                                               OBD_OPT_ABORT_RECOV);
+                /**
+                 * if recovery proceeds with versions then some clients may be
+                 * timed out waiting for others and trying to reconnect.
+                 * Extend timer for such reconnect cases.
+                 */
+                if (obd->obd_version_recov)
+                        reset_recovery_timer(obd, RECONNECT_DELAY_MAX * 2, 1);
         }
+
         /* next stage: replay requests */
         delta = jiffies;
         obd->obd_req_replaying = 1;
         CDEBUG(D_INFO, "1: request replay stage - %d clients from t"LPU64"\n",
-              atomic_read(&obd->obd_req_replay_clients),
-              obd->obd_next_recovery_transno);
-        resume_recovery_timer(obd);
+               atomic_read(&obd->obd_req_replay_clients),
+               obd->obd_next_recovery_transno);
         while ((req = target_next_replay_req(obd))) {
-                LASSERT(trd->trd_processing_task == current->pid);
+                LASSERT(trd->trd_processing_task == cfs_curproc_pid());
                 DEBUG_REQ(D_HA, req, "processing t"LPD64" from %s",
                           lustre_msg_get_transno(req->rq_reqmsg),
                           libcfs_nid2str(req->rq_peer.nid));
                 handle_recovery_req(thread, req,
                                     trd->trd_recovery_handler);
                 obd->obd_replayed_requests++;
-                spin_lock_bh(&obd->obd_processing_task_lock);
-                obd->obd_next_recovery_transno++;
-                spin_unlock_bh(&obd->obd_processing_task_lock);
         }
 
-        spin_lock_bh(&obd->obd_processing_task_lock);
-        target_cancel_recovery_timer(obd);
-        spin_unlock_bh(&obd->obd_processing_task_lock);
-
         /* If some clients haven't replayed requests in time, evict them */
         if (obd->obd_abort_recovery) {
-                CDEBUG(D_ERROR, "req replay timed out, aborting ...\n");
-                obd->obd_abort_recovery = obd->obd_stopping;
-                class_disconnect_stale_exports(obd, req_replay_done);
+                CDEBUG(D_WARNING, "req replay is aborted\n");
+                class_disconnect_stale_exports(obd, req_replay_done,
+                                               exp_flags_from_obd(obd) |
+                                               OBD_OPT_ABORT_RECOV);
                 abort_req_replay_queue(obd);
         }
+        LASSERT(list_empty(&obd->obd_req_replay_queue));
 
         /* The second stage: replay locks */
         CDEBUG(D_INFO, "2: lock replay stage - %d clients\n",
                atomic_read(&obd->obd_lock_replay_clients));
-        resume_recovery_timer(obd);
         while ((req = target_next_replay_lock(obd))) {
-                LASSERT(trd->trd_processing_task == current->pid);
-                DEBUG_REQ(D_HA|D_WARNING, req, "processing lock from %s: ",
+                LASSERT(trd->trd_processing_task == cfs_curproc_pid());
+                DEBUG_REQ(D_HA, req, "processing lock from %s: ",
                           libcfs_nid2str(req->rq_peer.nid));
                 handle_recovery_req(thread, req,
                                     trd->trd_recovery_handler);
                 obd->obd_replayed_locks++;
         }
 
-        spin_lock_bh(&obd->obd_processing_task_lock);
-        target_cancel_recovery_timer(obd);
-        spin_unlock_bh(&obd->obd_processing_task_lock);
         /* If some clients haven't replayed requests in time, evict them */
         if (obd->obd_abort_recovery) {
-                int stale;
-                CERROR("lock replay timed out, aborting ...\n");
-                obd->obd_abort_recovery = obd->obd_stopping;
-                stale = class_disconnect_stale_exports(obd, lock_replay_done);
+                CERROR("lock replay is aborted\n");
+                class_disconnect_stale_exports(obd, lock_replay_done,
+                                               exp_flags_from_obd(obd) |
+                                               OBD_OPT_ABORT_RECOV);
                 abort_lock_replay_queue(obd);
         }
+        LASSERT(list_empty(&obd->obd_lock_replay_queue));
 
+        /* The third stage: reply on final pings */
+        CDEBUG(D_INFO, "3: final stage - process recovery completion pings\n");
+        /** evict exports that failed VBR */
+        class_disconnect_stale_exports(obd, req_vbr_done,
+                                       exp_flags_from_obd(obd) |
+                                       OBD_OPT_ABORT_RECOV);
+        /** Update server last boot epoch */
+        lut_boot_epoch_update(lut);
         /* We drop recoverying flag to forward all new requests
          * to regular mds_handle() since now */
         spin_lock_bh(&obd->obd_processing_task_lock);
         obd->obd_recovering = obd->obd_abort_recovery = 0;
+        target_cancel_recovery_timer(obd);
         spin_unlock_bh(&obd->obd_processing_task_lock);
-        /* The third stage: reply on final pings */
-        CDEBUG(D_INFO, "3: final stage - process recovery completion pings\n");
         while ((req = target_next_final_ping(obd))) {
-                LASSERT(trd->trd_processing_task == current->pid);
+                LASSERT(trd->trd_processing_task == cfs_curproc_pid());
                 DEBUG_REQ(D_HA, req, "processing final ping from %s: ",
                           libcfs_nid2str(req->rq_peer.nid));
                 handle_recovery_req(thread, req,
@@ -1746,7 +1796,7 @@ static int target_recovery_thread(void *arg)
               delta, obd->obd_replayed_requests, obd->obd_replayed_locks);
         LASSERT(atomic_read(&obd->obd_req_replay_clients) == 0);
         LASSERT(atomic_read(&obd->obd_lock_replay_clients) == 0);
-        if (delta > obd_timeout * 2) {
+        if (delta > obd_timeout * OBD_RECOVERY_FACTOR) {
                 CWARN("too long recovery - read logs\n");
                 libcfs_debug_dumplog();
         }
@@ -1759,8 +1809,10 @@ static int target_recovery_thread(void *arg)
         RETURN(rc);
 }
 
-int target_start_recovery_thread(struct obd_device *obd, svc_handler_t handler)
+static int target_start_recovery_thread(struct lu_target *lut,
+                                        svc_handler_t handler)
 {
+        struct obd_device *obd = lut->lut_obd;
         int rc = 0;
         struct target_recovery_data *trd = &obd->obd_recovery_data;
 
@@ -1769,7 +1821,7 @@ int target_start_recovery_thread(struct obd_device *obd, svc_handler_t handler)
         init_completion(&trd->trd_finishing);
         trd->trd_recovery_handler = handler;
 
-        if (kernel_thread(target_recovery_thread, obd, 0) > 0) {
+        if (kernel_thread(target_recovery_thread, lut, 0) > 0) {
                 wait_for_completion(&trd->trd_starting);
                 LASSERT(obd->obd_recovering != 0);
         } else
@@ -1785,7 +1837,7 @@ void target_stop_recovery_thread(struct obd_device *obd)
                 struct target_recovery_data *trd = &obd->obd_recovery_data;
                 CERROR("%s: Aborting recovery\n", obd->obd_name);
                 obd->obd_abort_recovery = 1;
-                wake_up(&obd->obd_next_transno_waitq);
+                cfs_waitq_signal(&obd->obd_next_transno_waitq);
                 spin_unlock_bh(&obd->obd_processing_task_lock);
                 wait_for_completion(&trd->trd_finishing);
         } else {
@@ -1804,26 +1856,34 @@ EXPORT_SYMBOL(target_recovery_fini);
 static void target_recovery_expired(unsigned long castmeharder)
 {
         struct obd_device *obd = (struct obd_device *)castmeharder;
-        LCONSOLE_WARN("%s: recovery timed out; %d clients never reconnected "
-                      "after %lds (%d clients did)\n",
-                      obd->obd_name, obd->obd_recoverable_clients,
-                      cfs_time_current_sec()- obd->obd_recovery_start,
-                      obd->obd_connected_clients);
+        CDEBUG(D_HA, "%s: recovery timed out; %d clients never reconnected "
+               "after %lds (%d clients did)\n",
+               obd->obd_name, obd->obd_recoverable_clients,
+               cfs_time_current_sec()- obd->obd_recovery_start,
+               obd->obd_connected_clients);
+
         spin_lock_bh(&obd->obd_processing_task_lock);
-        if (obd->obd_recovering)
-                obd->obd_abort_recovery = 1;
+        obd->obd_version_recov = 1;
+        CDEBUG(D_INFO, "VBR is used for %d clients from t"LPU64"\n",
+               atomic_read(&obd->obd_req_replay_clients),
+               obd->obd_next_recovery_transno);
         cfs_waitq_signal(&obd->obd_next_transno_waitq);
         spin_unlock_bh(&obd->obd_processing_task_lock);
 }
 
-void target_recovery_init(struct obd_device *obd, svc_handler_t handler)
+void target_recovery_init(struct lu_target *lut, svc_handler_t handler)
 {
-        if (obd->obd_max_recoverable_clients == 0)
+        struct obd_device *obd = lut->lut_obd;
+        if (obd->obd_max_recoverable_clients == 0) {
+                /** Update server last boot epoch */
+                lut_boot_epoch_update(lut);
                 return;
+        }
 
         CWARN("RECOVERY: service %s, %d recoverable clients, "
               "last_transno "LPU64"\n", obd->obd_name,
               obd->obd_max_recoverable_clients, obd->obd_last_committed);
+        LASSERT(obd->obd_stopping == 0);
         obd->obd_next_recovery_transno = obd->obd_last_committed + 1;
         obd->obd_recovery_start = 0;
         obd->obd_recovery_end = 0;
@@ -1831,13 +1891,14 @@ void target_recovery_init(struct obd_device *obd, svc_handler_t handler)
         /* bz13079: this should be set to desired value for ost but not for mds */
         obd->obd_recovery_max_time = OBD_RECOVERY_MAX_TIME;
         cfs_timer_init(&obd->obd_recovery_timer, target_recovery_expired, obd);
-        target_start_recovery_thread(obd, handler);
+        target_start_recovery_thread(lut, handler);
 }
 EXPORT_SYMBOL(target_recovery_init);
 
 #endif
 
-int target_process_req_flags(struct obd_device *obd, struct ptlrpc_request *req)
+static int target_process_req_flags(struct obd_device *obd,
+                                    struct ptlrpc_request *req)
 {
         struct obd_export *exp = req->rq_export;
         LASSERT(exp != NULL);
@@ -1854,7 +1915,6 @@ int target_process_req_flags(struct obd_device *obd, struct ptlrpc_request *req)
                         obd->obd_recoverable_clients--;
                         if (atomic_read(&obd->obd_req_replay_clients) == 0)
                                 CDEBUG(D_HA, "all clients have replayed reqs\n");
-                        wake_up(&obd->obd_next_transno_waitq);
                 }
                 spin_unlock_bh(&obd->obd_processing_task_lock);
         }
@@ -1870,7 +1930,6 @@ int target_process_req_flags(struct obd_device *obd, struct ptlrpc_request *req)
                         atomic_dec(&obd->obd_lock_replay_clients);
                         if (atomic_read(&obd->obd_lock_replay_clients) == 0)
                                 CDEBUG(D_HA, "all clients have replayed locks\n");
-                        wake_up(&obd->obd_next_transno_waitq);
                 }
                 spin_unlock_bh(&obd->obd_processing_task_lock);
         }
@@ -1884,7 +1943,6 @@ int target_queue_recovery_request(struct ptlrpc_request *req,
         struct list_head *tmp;
         int inserted = 0;
         __u64 transno = lustre_msg_get_transno(req->rq_reqmsg);
-
         ENTRY;
 
         if (obd->obd_recovery_data.trd_processing_task == cfs_curproc_pid()) {
@@ -1902,6 +1960,7 @@ int target_queue_recovery_request(struct ptlrpc_request *req,
                         RETURN(-ENOMEM);
                 DEBUG_REQ(D_HA, req, "queue final req");
                 spin_lock_bh(&obd->obd_processing_task_lock);
+                cfs_waitq_signal(&obd->obd_next_transno_waitq);
                 if (obd->obd_recovering)
                         list_add_tail(&req->rq_list, &obd->obd_final_req_queue);
                 else {
@@ -1923,6 +1982,7 @@ int target_queue_recovery_request(struct ptlrpc_request *req,
                         RETURN(-ENOMEM);
                 DEBUG_REQ(D_HA, req, "queue lock replay req");
                 spin_lock_bh(&obd->obd_processing_task_lock);
+                cfs_waitq_signal(&obd->obd_next_transno_waitq);
                 LASSERT(obd->obd_recovering);
                 /* usually due to recovery abort */
                 if (!req->rq_export->exp_in_recovery) {
@@ -1933,7 +1993,6 @@ int target_queue_recovery_request(struct ptlrpc_request *req,
                 LASSERT(req->rq_export->exp_lock_replay_needed);
                 list_add_tail(&req->rq_list, &obd->obd_lock_replay_queue);
                 spin_unlock_bh(&obd->obd_processing_task_lock);
-                wake_up(&obd->obd_next_transno_waitq);
                 RETURN(0);
         }
 
@@ -2018,7 +2077,7 @@ int target_queue_recovery_request(struct ptlrpc_request *req,
                 list_add_tail(&req->rq_list, &obd->obd_req_replay_queue);
 
         obd->obd_requests_queued_for_recovery++;
-        wake_up(&obd->obd_next_transno_waitq);
+        cfs_waitq_signal(&obd->obd_next_transno_waitq);
         spin_unlock_bh(&obd->obd_processing_task_lock);
         RETURN(0);
 }
@@ -2130,22 +2189,20 @@ void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id)
         rs->rs_xid       = req->rq_xid;
         rs->rs_transno   = req->rq_transno;
         rs->rs_export    = exp;
+        rs->rs_opc       = lustre_msg_get_opc(rs->rs_msg);
 
-        spin_lock(&obd->obd_uncommitted_replies_lock);
-
+        spin_lock(&exp->exp_uncommitted_replies_lock);
         CDEBUG(D_NET, "rs transno = "LPU64", last committed = "LPU64"\n",
-               rs->rs_transno, obd->obd_last_committed);
-        if (rs->rs_transno > obd->obd_last_committed) {
+               rs->rs_transno, exp->exp_last_committed);
+        if (rs->rs_transno > exp->exp_last_committed) {
                 /* not committed already */
-                list_add_tail (&rs->rs_obd_list,
-                               &obd->obd_uncommitted_replies);
+                list_add_tail(&rs->rs_obd_list,
+                              &exp->exp_uncommitted_replies);
         }
+        spin_unlock (&exp->exp_uncommitted_replies_lock);
 
-        spin_unlock (&obd->obd_uncommitted_replies_lock);
-        spin_lock (&exp->exp_lock);
-
-        list_add_tail (&rs->rs_exp_list, &exp->exp_outstanding_replies);
-
+        spin_lock(&exp->exp_lock);
+        list_add_tail(&rs->rs_exp_list, &exp->exp_outstanding_replies);
         spin_unlock(&exp->exp_lock);
 
         netrc = target_send_reply_msg (req, rc, fail_id);
@@ -2166,7 +2223,7 @@ void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id)
         }
 
         spin_lock(&rs->rs_lock);
-        if (rs->rs_transno <= obd->obd_last_committed ||
+        if (rs->rs_transno <= exp->exp_last_committed ||
             (!rs->rs_on_net && !rs->rs_no_ack) ||
              list_empty(&rs->rs_exp_list) ||     /* completed already */
              list_empty(&rs->rs_obd_list)) {
@@ -2189,26 +2246,19 @@ int target_handle_ping(struct ptlrpc_request *req)
 
 void target_committed_to_req(struct ptlrpc_request *req)
 {
-        struct obd_device *obd;
-
-        if (req == NULL || req->rq_export == NULL)
-                return;
-
-        obd = req->rq_export->exp_obd;
-        if (obd == NULL)
-                return;
+        struct obd_export *exp = req->rq_export;
 
-        if (!obd->obd_no_transno && req->rq_repmsg != NULL)
+        if (!exp->exp_obd->obd_no_transno && req->rq_repmsg != NULL)
                 lustre_msg_set_last_committed(req->rq_repmsg,
-                                              obd->obd_last_committed);
+                                              exp->exp_last_committed);
         else
                 DEBUG_REQ(D_IOCTL, req, "not sending last_committed update (%d/"
-                          "%d)", obd->obd_no_transno, req->rq_repmsg == NULL);
+                          "%d)", exp->exp_obd->obd_no_transno,
+                          req->rq_repmsg == NULL);
 
         CDEBUG(D_INFO, "last_committed "LPU64", transno "LPU64", xid "LPU64"\n",
-               obd->obd_last_committed, req->rq_transno, req->rq_xid);
+               exp->exp_last_committed, req->rq_transno, req->rq_xid);
 }
-
 EXPORT_SYMBOL(target_committed_to_req);
 
 int target_handle_qc_callback(struct ptlrpc_request *req)
@@ -2250,12 +2300,11 @@ int target_handle_dqacq_callback(struct ptlrpc_request *req)
 
         LASSERT(req->rq_export);
 
-        OBD_ALLOC(qdata, sizeof(struct qunit_data));
-        if (!qdata)
-                RETURN(-ENOMEM);
-        rc = quota_get_qdata(req, qdata, QUOTA_REQUEST, QUOTA_EXPORT);
-        if (rc < 0) {
+        qdata = quota_get_qdata(req, QUOTA_REQUEST, QUOTA_EXPORT);
+        if (IS_ERR(qdata)) {
+                rc = PTR_ERR(qdata);
                 CDEBUG(D_ERROR, "Can't unpack qunit_data(rc: %d)\n", rc);
+                req->rq_status = rc;
                 GOTO(out, rc);
         }
 
@@ -2263,7 +2312,7 @@ int target_handle_dqacq_callback(struct ptlrpc_request *req)
         if (!obd->obd_observer || !obd->obd_observer->obd_observer) {
                 CERROR("Can't find the observer, it is recovering\n");
                 req->rq_status = -EAGAIN;
-                GOTO(send_reply, rc = -EAGAIN);
+                GOTO(out, rc);
         }
 
         master_obd = obd->obd_observer->obd_observer;
@@ -2277,7 +2326,6 @@ int target_handle_dqacq_callback(struct ptlrpc_request *req)
                 CDEBUG(D_QUOTA, "quota_type not processed yet, return "
                        "-EAGAIN\n");
                 req->rq_status = -EAGAIN;
-                rc = ptlrpc_reply(req);
                 GOTO(out, rc);
         }
 
@@ -2290,7 +2338,6 @@ int target_handle_dqacq_callback(struct ptlrpc_request *req)
                 CDEBUG(D_QUOTA, "quota_ctxt is not ready yet, return "
                        "-EAGAIN\n");
                 req->rq_status = -EAGAIN;
-                rc = ptlrpc_reply(req);
                 GOTO(out, rc);
         }
 
@@ -2300,24 +2347,22 @@ int target_handle_dqacq_callback(struct ptlrpc_request *req)
         up_read(&obt->obt_rwsem);
         if (rc && rc != -EDQUOT)
                 CDEBUG(rc == -EBUSY  ? D_QUOTA : D_ERROR,
-                       "dqacq failed! (rc:%d)\n", rc);
+                       "dqacq/dqrel failed! (rc:%d)\n", rc);
         req->rq_status = rc;
 
-        /* there are three forms of qunit(historic causes), so we need to
-         * adjust the same form to different forms slaves needed */
         rc = quota_copy_qdata(req, qdata, QUOTA_REPLY, QUOTA_EXPORT);
         if (rc < 0) {
-                CDEBUG(D_ERROR, "Can't pack qunit_data(rc: %d)\n", rc);
+                CERROR("Can't pack qunit_data(rc: %d)\n", rc);
                 GOTO(out, rc);
         }
 
         /* Block the quota req. b=14840 */
         OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_BLOCK_QUOTA_REQ, obd_timeout);
-send_reply:
-        rc = ptlrpc_reply(req);
+        EXIT;
+
 out:
-        OBD_FREE(qdata, sizeof(struct qunit_data));
-        RETURN(rc);
+        rc = ptlrpc_reply(req);
+        return rc;
 #else
         return 0;
 #endif /* !__KERNEL__ */