LU-1522 recovery: rework LU-1166 patch in different way
[fs/lustre-release.git] lustre/ldlm/ldlm_lib.c
index 2483afb..290005e 100644
@@ -1,6 +1,4 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
+/*
  * GPL HEADER START
  *
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
@@ -36,9 +34,6 @@
  * Lustre is a trademark of Sun Microsystems, Inc.
  */
 
-#ifndef EXPORT_SYMTAB
-# define EXPORT_SYMTAB
-#endif
 #define DEBUG_SUBSYSTEM S_LDLM
 
 #ifdef __KERNEL__
@@ -228,6 +223,29 @@ void client_destroy_import(struct obd_import *imp)
 }
 EXPORT_SYMBOL(client_destroy_import);
 
+/**
+ * Check whether the osc is on the MDT or not.
+ * In the config log:
+ * osc on MDT
+ *     setup 0:{fsname}-OSTxxxx-osc[-MDTxxxx] 1:lustre-OST0000_UUID 2:NID
+ * osc on client
+ *     setup 0:{fsname}-OSTxxxx-osc 1:lustre-OST0000_UUID 2:NID
+ *
+ **/
+static int osc_on_mdt(char *obdname)
+{
+       char *ptr;
+
+       ptr = strrchr(obdname, '-');
+       if (ptr == NULL)
+               return 0;
+
+       if (strncmp(ptr + 1, "MDT", 3) == 0)
+               return 1;
+
+       return 0;
+}
+
 /* configure an RPC client OBD device
  *
  * lcfg parameters:
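For illustration, the new osc_on_mdt() helper only looks at the last '-'-separated component of the obd name. A stand-alone sketch (not part of the patch) showing how the two name forms from the comment above classify:

    #include <stdio.h>
    #include <string.h>

    /* Copy of the check added above, for illustration only. */
    static int osc_on_mdt(char *obdname)
    {
            char *ptr = strrchr(obdname, '-');

            if (ptr == NULL)
                    return 0;
            return strncmp(ptr + 1, "MDT", 3) == 0 ? 1 : 0;
    }

    int main(void)
    {
            /* osc set up on an MDT: name ends in "-MDTxxxx" */
            printf("%d\n", osc_on_mdt("lustre-OST0000-osc-MDT0000")); /* 1 */
            /* osc set up on a client: last component is "osc" */
            printf("%d\n", osc_on_mdt("lustre-OST0000-osc"));         /* 0 */
            return 0;
    }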
@@ -318,6 +336,8 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg)
         CFS_INIT_LIST_HEAD(&cli->cl_loi_write_list);
         CFS_INIT_LIST_HEAD(&cli->cl_loi_read_list);
         client_obd_list_lock_init(&cli->cl_loi_list_lock);
+       cfs_atomic_set(&cli->cl_pending_w_pages, 0);
+       cfs_atomic_set(&cli->cl_pending_r_pages, 0);
         cli->cl_r_in_flight = 0;
         cli->cl_w_in_flight = 0;
 
@@ -355,9 +375,11 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg)
         } else if (cfs_num_physpages >> (20 - CFS_PAGE_SHIFT) <= 512 /* MB */) {
                 cli->cl_max_rpcs_in_flight = 4;
         } else {
-                cli->cl_max_rpcs_in_flight = OSC_MAX_RIF_DEFAULT;
+               if (osc_on_mdt(obddev->obd_name))
+                       cli->cl_max_rpcs_in_flight = MDS_OSC_MAX_RIF_DEFAULT;
+               else
+                       cli->cl_max_rpcs_in_flight = OSC_MAX_RIF_DEFAULT;
         }
-
         rc = ldlm_get_ref();
         if (rc) {
                 CERROR("ldlm_get_ref failed: %d\n", rc);
@@ -577,6 +599,7 @@ int client_disconnect_export(struct obd_export *exp)
         RETURN(rc);
 }
 
+#ifdef HAVE_SERVER_SUPPORT
 int server_disconnect_export(struct obd_export *exp)
 {
         int rc;
@@ -597,18 +620,20 @@ int server_disconnect_export(struct obd_export *exp)
                 struct ptlrpc_reply_state *rs =
                         cfs_list_entry(exp->exp_outstanding_replies.next,
                                        struct ptlrpc_reply_state, rs_exp_list);
-                struct ptlrpc_service *svc = rs->rs_service;
+               struct ptlrpc_service_part *svcpt = rs->rs_svcpt;
 
-                cfs_spin_lock(&svc->srv_rs_lock);
-                cfs_list_del_init(&rs->rs_exp_list);
-                cfs_spin_lock(&rs->rs_lock);
-                ptlrpc_schedule_difficult_reply(rs);
-                cfs_spin_unlock(&rs->rs_lock);
-                cfs_spin_unlock(&svc->srv_rs_lock);
-        }
-        cfs_spin_unlock(&exp->exp_lock);
+               cfs_spin_lock(&svcpt->scp_rep_lock);
 
-        RETURN(rc);
+               cfs_list_del_init(&rs->rs_exp_list);
+               cfs_spin_lock(&rs->rs_lock);
+               ptlrpc_schedule_difficult_reply(rs);
+               cfs_spin_unlock(&rs->rs_lock);
+
+               cfs_spin_unlock(&svcpt->scp_rep_lock);
+       }
+       cfs_spin_unlock(&exp->exp_lock);
+
+       RETURN(rc);
 }
 
 /* --------------------------------------------------------------------------
@@ -630,22 +655,25 @@ static int target_handle_reconnect(struct lustre_handle *conn,
 
                 /* Might be a re-connect after a partition. */
                 if (!memcmp(&conn->cookie, &hdl->cookie, sizeof conn->cookie)) {
-                        if (target->obd_recovering)
+                        if (target->obd_recovering) {
+                                int timeout = cfs_duration_sec(cfs_time_sub(
+                                        cfs_timer_deadline(
+                                        &target->obd_recovery_timer),
+                                        cfs_time_current()));
+
                                 LCONSOLE_WARN("%s: Client %s (at %s) reconnect"
-                                        "ing, waiting for %d clients in "
-                                        "recovery for %lds\n", target->obd_name,
+                                        "ing, waiting for %d clients in recov"
+                                        "ery for %d:%.02d\n", target->obd_name,
                                         obd_uuid2str(&exp->exp_client_uuid),
                                         obd_export_nid2str(exp),
                                         target->obd_max_recoverable_clients,
-                                        cfs_duration_sec(cfs_time_sub(
-                                        cfs_timer_deadline(
-                                        &target->obd_recovery_timer),
-                                        cfs_time_current())));
-                        else
+                                        timeout / 60, timeout % 60);
+                        } else {
                                 LCONSOLE_WARN("%s: Client %s (at %s) "
                                         "reconnecting\n", target->obd_name,
                                         obd_uuid2str(&exp->exp_client_uuid),
                                         obd_export_nid2str(exp));
+                        }
 
                         conn->cookie = exp->exp_handle.h_cookie;
                         /* target_handle_connect() treats EALREADY and
@@ -653,14 +681,13 @@ static int target_handle_reconnect(struct lustre_handle *conn,
                          * doing a valid reconnect from the same client. */
                         RETURN(EALREADY);
                 } else {
-                        LCONSOLE_WARN("%s: The server has already connected "
-                                      "client %s (at %s) with handle " LPX64
-                                      ", rejecting a client with the same "
-                                      "uuid trying to reconnect with "
-                                      "handle " LPX64, target->obd_name,
-                                      obd_uuid2str(&exp->exp_client_uuid),
-                                      obd_export_nid2str(exp),
-                                      hdl->cookie, conn->cookie);
+                       LCONSOLE_WARN("%s: already connected client %s (at %s) "
+                                     "with handle "LPX64". Rejecting client "
+                                     "with the same UUID trying to reconnect "
+                                     "with handle "LPX64"\n", target->obd_name,
+                                     obd_uuid2str(&exp->exp_client_uuid),
+                                     obd_export_nid2str(exp),
+                                     hdl->cookie, conn->cookie);
                         memset(conn, 0, sizeof *conn);
                         /* target_handle_connect() treats EALREADY and
                          * -EALREADY differently.  -EALREADY is an error
@@ -704,7 +731,7 @@ check_and_start_recovery_timer(struct obd_device *obd,
 
 int target_handle_connect(struct ptlrpc_request *req)
 {
-        struct obd_device *target, *targref = NULL;
+       struct obd_device *target = NULL, *targref = NULL;
         struct obd_export *export = NULL;
         struct obd_import *revimp;
         struct lustre_handle conn;
@@ -735,17 +762,29 @@ int target_handle_connect(struct ptlrpc_request *req)
         if (!target)
                 target = class_name2obd(str);
 
-        if (!target || target->obd_stopping || !target->obd_set_up) {
-                deuuidify(str, NULL, &target_start, &target_len);
-                LCONSOLE_ERROR_MSG(0x137, "%.*s: Not available for connect "
-                                   "from %s (%s)\n", target_len, target_start,
-                                   libcfs_nid2str(req->rq_peer.nid), !target ?
-                                   "no target" : (target->obd_stopping ?
-                                   "stopping" : "not set up"));
-                GOTO(out, rc = -ENODEV);
-        }
+       if (!target) {
+               deuuidify(str, NULL, &target_start, &target_len);
+               LCONSOLE_ERROR_MSG(0x137, "UUID '%s' is not available for "
+                                  "connect (no target)\n", str);
+               GOTO(out, rc = -ENODEV);
+       }
+
+       cfs_spin_lock(&target->obd_dev_lock);
+       if (target->obd_stopping || !target->obd_set_up) {
+               cfs_spin_unlock(&target->obd_dev_lock);
+
+               deuuidify(str, NULL, &target_start, &target_len);
+               LCONSOLE_ERROR_MSG(0x137, "%.*s: Not available for connect "
+                                  "from %s (%s)\n", target_len, target_start,
+                                  libcfs_nid2str(req->rq_peer.nid), 
+                                  (target->obd_stopping ?
+                                  "stopping" : "not set up"));
+               GOTO(out, rc = -ENODEV);
+       }
 
         if (target->obd_no_conn) {
+               cfs_spin_unlock(&target->obd_dev_lock);
+
                 LCONSOLE_WARN("%s: Temporarily refusing client connection "
                               "from %s\n", target->obd_name,
                               libcfs_nid2str(req->rq_peer.nid));
@@ -757,6 +796,8 @@ int target_handle_connect(struct ptlrpc_request *req)
            Really, class_uuid2obd should take the ref. */
         targref = class_incref(target, __FUNCTION__, cfs_current());
 
+       target->obd_conn_inprogress++;
+       cfs_spin_unlock(&target->obd_dev_lock);
 
         str = req_capsule_client_get(&req->rq_pill, &RMF_CLUUID);
         if (str == NULL) {
@@ -960,34 +1001,28 @@ no_export:
 
                         t = cfs_timer_deadline(&target->obd_recovery_timer);
                         t = cfs_time_sub(t, cfs_time_current());
+                        t = cfs_duration_sec(t);
                         LCONSOLE_WARN("%s: Denying connection for new client "
                                       "%s (at %s), waiting for %d clients in "
-                                      "recovery for "CFS_TIME_T"s\n",
+                                      "recovery for %d:%.02d\n",
                                       target->obd_name,
                                       libcfs_nid2str(req->rq_peer.nid),
                                       cluuid.uuid,
                                       cfs_atomic_read(&target-> \
                                                       obd_lock_replay_clients),
-                                      cfs_duration_sec(t));
+                                      (int)t / 60, (int)t % 60);
                         rc = -EBUSY;
                 } else {
 dont_check_exports:
                         rc = obd_connect(req->rq_svc_thread->t_env,
                                          &export, target, &cluuid, data,
                                          client_nid);
-                        if (rc == 0) {
+                        if (rc == 0)
                                 conn.cookie = export->exp_handle.h_cookie;
-                                /* LU-1092 reconnect put export refcount in the
-                                 * end, connect needs take one here too. */
-                                class_export_get(export);
-                        }
                 }
         } else {
                 rc = obd_reconnect(req->rq_svc_thread->t_env,
                                    export, target, &cluuid, data, client_nid);
-                if (rc == 0)
-                        /* prevous done via class_conn2export */
-                        class_export_get(export);
         }
         if (rc)
                 GOTO(out, rc);
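Both recovery-wait messages above (the reconnect warning and the "Denying connection" warning) now print the remaining recovery window as minutes:seconds instead of raw seconds. A minimal sketch of the formatting, with a made-up 125-second remainder:

    #include <stdio.h>

    int main(void)
    {
            int timeout = 125;   /* example: seconds left on obd_recovery_timer */

            /* "%d:%.02d" with timeout / 60 and timeout % 60 prints "2:05" */
            printf("waiting for recovery for %d:%.02d\n", timeout / 60, timeout % 60);
            return 0;
    }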
@@ -1023,7 +1058,8 @@ dont_check_exports:
         if (req->rq_export != NULL)
                 class_export_put(req->rq_export);
 
-        req->rq_export = export;
+       /* request takes one export refcount */
+       req->rq_export = class_export_get(export);
 
         cfs_spin_lock(&export->exp_lock);
         if (export->exp_conn_cnt >= lustre_msg_get_conn_cnt(req->rq_reqmsg)) {
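The export reference counting also changes here: the extra class_export_get() calls after obd_connect()/obd_reconnect() are dropped (the removed LU-1092 lines above), and the reference is now taken exactly where the request caches the export pointer, to be balanced by class_export_put() when that cache is released. A toy illustration of that "one reference per cached pointer" rule, in plain C rather than the Lustre helpers:

    #include <stdio.h>

    struct export { int refcount; };

    static struct export *export_get(struct export *e) { e->refcount++; return e; }
    static void export_put(struct export *e)           { e->refcount--; }

    int main(void)
    {
            struct export exp = { .refcount = 1 };   /* initial reference from creation */
            struct export *rq_export;

            rq_export = export_get(&exp);            /* request caches the pointer: +1 */
            printf("cached:   refcount=%d\n", exp.refcount);   /* 2 */

            export_put(rq_export);                   /* request drops its cache:    -1 */
            rq_export = NULL;
            printf("released: refcount=%d\n", exp.refcount);   /* 1 */
            return 0;
    }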
@@ -1071,14 +1107,18 @@ dont_check_exports:
                              &export->exp_connection->c_peer.nid,
                              &export->exp_nid_hash);
         }
-        /**
-          class_disconnect->class_export_recovery_cleanup() race
-         */
+
         if (target->obd_recovering && !export->exp_in_recovery) {
                 int has_transno;
                 __u64 transno = data->ocd_transno;
 
                 cfs_spin_lock(&export->exp_lock);
+               /* possible race with class_disconnect_stale_exports,
+                * export may be already in the eviction process */
+               if (export->exp_failed) {
+                       cfs_spin_unlock(&export->exp_lock);
+                       GOTO(out, rc = -ENODEV);
+               }
                 export->exp_in_recovery = 1;
                 export->exp_req_replay_needed = 1;
                 export->exp_lock_replay_needed = 1;
@@ -1116,8 +1156,9 @@ dont_check_exports:
 
         if (export->exp_imp_reverse != NULL) {
                 /* destroyed import can be still referenced in ctxt */
-                obd_set_info_async(export, sizeof(KEY_REVIMP_UPD),
-                                   KEY_REVIMP_UPD, 0, NULL, NULL);
+                obd_set_info_async(req->rq_svc_thread->t_env, export,
+                                   sizeof(KEY_REVIMP_UPD), KEY_REVIMP_UPD,
+                                   0, NULL, NULL);
 
                 client_destroy_import(export->exp_imp_reverse);
         }
@@ -1169,8 +1210,13 @@ out:
 
                 class_export_put(export);
         }
-        if (targref)
+        if (targref) {
+               cfs_spin_lock(&target->obd_dev_lock);
+               target->obd_conn_inprogress--;
+               cfs_spin_unlock(&target->obd_dev_lock);
+
                 class_decref(targref, __FUNCTION__, cfs_current());
+       }
         if (rc)
                 req->rq_status = rc;
         RETURN(rc);
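The new obd_conn_inprogress counter is incremented under obd_dev_lock right after the stopping/set-up checks earlier in target_handle_connect(), and decremented here in the out: path. Presumably the cleanup code (outside this file) uses it to wait for in-flight connects to drain before tearing the device down. A generic sketch of the pattern, using plain pthreads instead of cfs_spin_lock() and made-up names:

    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t dev_lock = PTHREAD_MUTEX_INITIALIZER;
    static bool stopping;
    static int conn_inprogress;

    /* Connect path: check device state and claim a slot atomically. */
    static int connect_begin(void)
    {
            pthread_mutex_lock(&dev_lock);
            if (stopping) {
                    pthread_mutex_unlock(&dev_lock);
                    return -1;              /* -ENODEV in the real code */
            }
            conn_inprogress++;              /* visible to cleanup before the lock drops */
            pthread_mutex_unlock(&dev_lock);
            return 0;
    }

    /* Connect path: release the slot on the way out, success or failure. */
    static void connect_end(void)
    {
            pthread_mutex_lock(&dev_lock);
            conn_inprogress--;
            pthread_mutex_unlock(&dev_lock);
    }

    int main(void)
    {
            if (connect_begin() == 0)
                    connect_end();
            return 0;
    }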
@@ -1515,7 +1561,7 @@ check_and_start_recovery_timer(struct obd_device *obd,
         if (!new_client && service_time)
                 /* Teach server about old server's estimates, as first guess
                  * at how long new requests will take. */
-                at_measured(&req->rq_rqbd->rqbd_service->srv_at_estimate,
+               at_measured(&req->rq_rqbd->rqbd_svcpt->scp_at_estimate,
                             service_time);
 
         target_start_recovery_timer(obd);
@@ -1689,8 +1735,8 @@ repeat:
         } else if (obd->obd_recovery_expired) {
                 obd->obd_recovery_expired = 0;
                 /** If some clients died being recovered, evict them */
-                CDEBUG(D_WARNING,
-                       "recovery is timed out, evict stale exports\n");
+                LCONSOLE_WARN("%s: recovery is timed out, "
+                              "evict stale exports\n", obd->obd_name);
                 /** evict cexports with no replay in queue, they are stalled */
                 class_disconnect_stale_exports(obd, health_check);
                 /** continue with VBR */
@@ -1826,15 +1872,17 @@ static int handle_recovery_req(struct ptlrpc_thread *thread,
                  * this client may come in recovery time
                  */
                 if (!AT_OFF) {
-                        struct ptlrpc_service *svc = req->rq_rqbd->rqbd_service;
-                        /* If the server sent early reply for this request,
-                         * the client will recalculate the timeout according to
-                         * current server estimate service time, so we will
-                         * use the maxium timeout here for waiting the client
-                         * sending the next req */
-                        to = max((int)at_est2timeout(
-                                 at_get(&svc->srv_at_estimate)),
-                                 (int)lustre_msg_get_timeout(req->rq_reqmsg));
+                       struct ptlrpc_service_part *svcpt;
+
+                       svcpt = req->rq_rqbd->rqbd_svcpt;
+                       /* If the server sent an early reply for this request,
+                        * the client will recalculate the timeout according to
+                        * the current server estimate of service time, so use
+                        * the maximum timeout here while waiting for the client
+                        * to send the next request. */
+                       to = max((int)at_est2timeout(
+                                at_get(&svcpt->scp_at_estimate)),
+                                (int)lustre_msg_get_timeout(req->rq_reqmsg));
                         /* Add net_latency (see ptlrpc_replay_req) */
                         to += lustre_msg_get_service_time(req->rq_reqmsg);
                 }
@@ -1874,7 +1922,7 @@ static int target_recovery_thread(void *arg)
                 RETURN(-ENOMEM);
         }
 
-        rc = lu_context_init(&env->le_ctx, LCT_MD_THREAD);
+        rc = lu_context_init(&env->le_ctx, LCT_MD_THREAD | LCT_DT_THREAD);
         if (rc) {
                 OBD_FREE_PTR(thread);
                 OBD_FREE_PTR(env);
@@ -2065,7 +2113,7 @@ void target_recovery_init(struct lu_target *lut, svc_handler_t handler)
 }
 EXPORT_SYMBOL(target_recovery_init);
 
-#endif
+#endif /* __KERNEL__ */
 
 static int target_process_req_flags(struct obd_device *obd,
                                     struct ptlrpc_request *req)
@@ -2174,8 +2222,9 @@ int target_queue_recovery_request(struct ptlrpc_request *req,
          * Also, a resent, replayed request that has already been
          * handled will pass through here and be processed immediately.
          */
-        CWARN("Next recovery transno: "LPU64", current: "LPU64", replaying\n",
-              obd->obd_next_recovery_transno, transno);
+        CDEBUG(D_HA, "Next recovery transno: "LPU64
+               ", current: "LPU64", replaying\n",
+               obd->obd_next_recovery_transno, transno);
         cfs_spin_lock(&obd->obd_recovery_task_lock);
         if (transno < obd->obd_next_recovery_transno) {
                 /* Processing the queue right now, don't re-add. */
@@ -2234,6 +2283,31 @@ int target_queue_recovery_request(struct ptlrpc_request *req,
         RETURN(0);
 }
 
+int target_handle_ping(struct ptlrpc_request *req)
+{
+        obd_ping(req->rq_svc_thread->t_env, req->rq_export);
+        return req_capsule_server_pack(&req->rq_pill);
+}
+
+void target_committed_to_req(struct ptlrpc_request *req)
+{
+        struct obd_export *exp = req->rq_export;
+
+        if (!exp->exp_obd->obd_no_transno && req->rq_repmsg != NULL)
+                lustre_msg_set_last_committed(req->rq_repmsg,
+                                              exp->exp_last_committed);
+        else
+                DEBUG_REQ(D_IOCTL, req, "not sending last_committed update (%d/"
+                          "%d)", exp->exp_obd->obd_no_transno,
+                          req->rq_repmsg == NULL);
+
+        CDEBUG(D_INFO, "last_committed "LPU64", transno "LPU64", xid "LPU64"\n",
+               exp->exp_last_committed, req->rq_transno, req->rq_xid);
+}
+EXPORT_SYMBOL(target_committed_to_req);
+
+#endif /* HAVE_SERVER_SUPPORT */
+
 /**
  * Packs current SLV and Limit into \a req.
  */
@@ -2286,10 +2360,10 @@ int target_send_reply_msg(struct ptlrpc_request *req, int rc, int fail_id)
 
 void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id)
 {
+       struct ptlrpc_service_part *svcpt;
         int                        netrc;
         struct ptlrpc_reply_state *rs;
         struct obd_export         *exp;
-        struct ptlrpc_service     *svc;
         ENTRY;
 
         if (req->rq_no_reply) {
@@ -2297,7 +2371,7 @@ void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id)
                 return;
         }
 
-        svc = req->rq_rqbd->rqbd_service;
+       svcpt = req->rq_rqbd->rqbd_svcpt;
         rs = req->rq_reply_state;
         if (rs == NULL || !rs->rs_difficult) {
                 /* no notifiers */
@@ -2309,7 +2383,7 @@ void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id)
         /* must be an export if locks saved */
         LASSERT (req->rq_export != NULL);
         /* req/reply consistent */
-        LASSERT (rs->rs_service == svc);
+       LASSERT(rs->rs_svcpt == svcpt);
 
         /* "fresh" reply */
         LASSERT (!rs->rs_scheduled);
@@ -2346,9 +2420,9 @@ void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id)
 
         netrc = target_send_reply_msg (req, rc, fail_id);
 
-        cfs_spin_lock(&svc->srv_rs_lock);
+       cfs_spin_lock(&svcpt->scp_rep_lock);
 
-        cfs_atomic_inc(&svc->srv_n_difficult_replies);
+       cfs_atomic_inc(&svcpt->scp_nreps_difficult);
 
         if (netrc != 0) {
                 /* error sending: reply is off the net.  Also we need +1
@@ -2368,36 +2442,13 @@ void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id)
                 CDEBUG(D_HA, "Schedule reply immediately\n");
                 ptlrpc_dispatch_difficult_reply(rs);
         } else {
-                cfs_list_add (&rs->rs_list, &svc->srv_active_replies);
-                rs->rs_scheduled = 0;           /* allow notifier to schedule */
-        }
-        cfs_spin_unlock(&rs->rs_lock);
-        cfs_spin_unlock(&svc->srv_rs_lock);
-        EXIT;
-}
-
-int target_handle_ping(struct ptlrpc_request *req)
-{
-        obd_ping(req->rq_export);
-        return req_capsule_server_pack(&req->rq_pill);
-}
-
-void target_committed_to_req(struct ptlrpc_request *req)
-{
-        struct obd_export *exp = req->rq_export;
-
-        if (!exp->exp_obd->obd_no_transno && req->rq_repmsg != NULL)
-                lustre_msg_set_last_committed(req->rq_repmsg,
-                                              exp->exp_last_committed);
-        else
-                DEBUG_REQ(D_IOCTL, req, "not sending last_committed update (%d/"
-                          "%d)", exp->exp_obd->obd_no_transno,
-                          req->rq_repmsg == NULL);
-
-        CDEBUG(D_INFO, "last_committed "LPU64", transno "LPU64", xid "LPU64"\n",
-               exp->exp_last_committed, req->rq_transno, req->rq_xid);
+               cfs_list_add(&rs->rs_list, &svcpt->scp_rep_active);
+               rs->rs_scheduled = 0;   /* allow notifier to schedule */
+       }
+       cfs_spin_unlock(&rs->rs_lock);
+       cfs_spin_unlock(&svcpt->scp_rep_lock);
+       EXIT;
 }
-EXPORT_SYMBOL(target_committed_to_req);
 
 int target_handle_qc_callback(struct ptlrpc_request *req)
 {
@@ -2611,12 +2662,13 @@ void ldlm_dump_export_locks(struct obd_export *exp)
             CERROR("dumping locks for export %p,"
                    "ignore if the unmount doesn't hang\n", exp);
             cfs_list_for_each_entry(lock, &exp->exp_locks_list, l_exp_refs_link)
-                ldlm_lock_dump(D_ERROR, lock, 0);
+                LDLM_ERROR(lock, "lock:");
         }
         cfs_spin_unlock(&exp->exp_locks_list_guard);
 }
 #endif
 
+#ifdef HAVE_SERVER_SUPPORT
 static int target_bulk_timeout(void *data)
 {
         ENTRY;
@@ -2720,3 +2772,5 @@ int target_bulk_io(struct obd_export *exp, struct ptlrpc_bulk_desc *desc,
         RETURN(rc);
 }
 EXPORT_SYMBOL(target_bulk_io);
+
+#endif /* HAVE_SERVER_SUPPORT */