Whamcloud - gitweb
Branch HEAD
[fs/lustre-release.git] / lustre / ptlrpc / client.c
index 704f66f..de5f94b 100644 (file)
@@ -94,7 +94,7 @@ static inline struct ptlrpc_bulk_desc *new_bulk(int npages, int type, int portal
         cfs_waitq_init(&desc->bd_waitq);
         desc->bd_max_iov = npages;
         desc->bd_iov_count = 0;
-        desc->bd_md_h = LNET_INVALID_HANDLE;
+        LNetInvalidateHandle(&desc->bd_md_h);
         desc->bd_portal = portal;
         desc->bd_type = type;
 
@@ -207,8 +207,7 @@ void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req)
         idx = import_at_get_index(req->rq_import,
                                   req->rq_request_portal);
         serv_est = at_get(&at->iat_service_estimate[idx]);
-        /* add an arbitrary minimum: 125% +5 sec */
-        req->rq_timeout = serv_est + (serv_est >> 2) + 5;
+        req->rq_timeout = at_est2timeout(serv_est);
         /* We could get even fancier here, using history to predict increased
            loading... */
 
@@ -218,17 +217,19 @@ void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req)
 }
 
 /* Adjust max service estimate based on server value */
-static void ptlrpc_at_adj_service(struct ptlrpc_request *req)
+static void ptlrpc_at_adj_service(struct ptlrpc_request *req,
+                                  unsigned int serv_est)
 {
         int idx;
-        unsigned int serv_est, oldse;
-        struct imp_at *at = &req->rq_import->imp_at;
+        unsigned int oldse;
+        struct imp_at *at;
 
-        LASSERT(req->rq_import);
+        /* only adjust the estimate when not in recovery */
+        if (!(req->rq_send_state & (LUSTRE_IMP_FULL | LUSTRE_IMP_CONNECTING)))
+                return;
 
-        /* service estimate is returned in the repmsg timeout field,
-           may be 0 on err */
-        serv_est = lustre_msg_get_timeout(req->rq_repmsg);
+        LASSERT(req->rq_import);
+        at = &req->rq_import->imp_at;
 
         idx = import_at_get_index(req->rq_import, req->rq_request_portal);
         /* max service estimates are tracked on the server side,
@@ -248,21 +249,22 @@ int ptlrpc_at_get_net_latency(struct ptlrpc_request *req)
 }
 
 /* Adjust expected network latency */
-static void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req)
+static void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req,
+                                      unsigned int service_time)
 {
-        unsigned int st, nl, oldnl;
-        struct imp_at *at = &req->rq_import->imp_at;
+        unsigned int nl, oldnl;
+        struct imp_at *at;
         time_t now = cfs_time_current_sec();
 
         LASSERT(req->rq_import);
-
-        st = lustre_msg_get_service_time(req->rq_repmsg);
+        at = &req->rq_import->imp_at;
 
         /* Network latency is total time less server processing time */
-        nl = max_t(int, now - req->rq_sent - st, 0) + 1/*st rounding*/;
-        if (st > now - req->rq_sent + 3 /* bz16408 */)
+        nl = max_t(int, now - req->rq_sent - service_time, 0) +1/*st rounding*/;
+        if (service_time > now - req->rq_sent + 3 /* bz16408 */)
                 CWARN("Reported service time %u > total measured time "
-                       CFS_DURATION_T"\n", st, cfs_time_sub(now, req->rq_sent));
+                      CFS_DURATION_T"\n", service_time,
+                      cfs_time_sub(now, req->rq_sent));
 
         oldnl = at_add(&at->iat_net_latency, nl);
         if (oldnl != 0)
@@ -299,45 +301,53 @@ static int unpack_reply(struct ptlrpc_request *req)
  * Handle an early reply message, called with the rq_lock held.
  * If anything goes wrong just ignore it - same as if it never happened
  */
-static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req) {
-        time_t          olddl;
-        int             rc;
+static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req)
+{
+        struct ptlrpc_request *early_req;
+        time_t                 olddl;
+        int                    rc;
         ENTRY;
 
         req->rq_early = 0;
         spin_unlock(&req->rq_lock);
 
-        rc = sptlrpc_cli_unwrap_early_reply(req);
-        if (rc)
-                GOTO(out, rc);
-
-        rc = unpack_reply(req);
-        if (rc)
-                GOTO(out_cleanup, rc);
+        rc = sptlrpc_cli_unwrap_early_reply(req, &early_req);
+        if (rc) {
+                spin_lock(&req->rq_lock); /* caller expects it held */
+                RETURN(rc);
+        }
 
-        /* Expecting to increase the service time estimate here */
-        ptlrpc_at_adj_service(req);
-        ptlrpc_at_adj_net_latency(req);
+        rc = unpack_reply(early_req);
+        if (rc == 0) {
+                /* Expect the service estimate to grow (early_req's values) */
+                ptlrpc_at_adj_service(req,
+                        lustre_msg_get_timeout(early_req->rq_repmsg));
+                ptlrpc_at_adj_net_latency(req,
+                        lustre_msg_get_service_time(early_req->rq_repmsg));
+        }
 
-        /* Adjust the local timeout for this req */
-        ptlrpc_at_set_req_timeout(req);
+        sptlrpc_cli_finish_early_reply(early_req); /* done with early_req */
 
-        olddl = req->rq_deadline;
-        /* server assumes it now has rq_timeout from when it sent the
-           early reply, so client should give it at least that long. */
-        req->rq_deadline = cfs_time_current_sec() + req->rq_timeout +
-                    ptlrpc_at_get_net_latency(req);
-
-        DEBUG_REQ(D_ADAPTTO, req,
-                  "Early reply #%d, new deadline in "CFS_DURATION_T"s ("
-                  CFS_DURATION_T"s)", req->rq_early_count,
-                  cfs_time_sub(req->rq_deadline, cfs_time_current_sec()),
-                  cfs_time_sub(req->rq_deadline, olddl));
-
-out_cleanup:
-        sptlrpc_cli_finish_early_reply(req);
-out:
         spin_lock(&req->rq_lock);
+
+        if (rc == 0) {
+                /* Recompute this req's rq_timeout from the AT estimate */
+                ptlrpc_at_set_req_timeout(req);
+
+                olddl = req->rq_deadline;
+                /* server assumes it now has rq_timeout from when it sent the
+                   early reply, so client should give it at least that long. */
+                req->rq_deadline = cfs_time_current_sec() + req->rq_timeout +
+                            ptlrpc_at_get_net_latency(req);
+
+                DEBUG_REQ(D_ADAPTTO, req,
+                          "Early reply #%d, new deadline in "CFS_DURATION_T"s "
+                          "("CFS_DURATION_T"s)", req->rq_early_count,
+                          cfs_time_sub(req->rq_deadline,
+                                       cfs_time_current_sec()),
+                          cfs_time_sub(req->rq_deadline, olddl));
+        }
+
         RETURN(rc);
 }
 
@@ -568,9 +578,10 @@ struct ptlrpc_request *__ptlrpc_request_alloc(struct obd_import *imp,
                 OBD_ALLOC_PTR(request);
 
         if (request) {
-                LASSERT((unsigned long)imp > 0x1000);
+                LASSERTF((unsigned long)imp > 0x1000, "%p", imp);
                 LASSERT(imp != LP_POISON);
-                LASSERT((unsigned long)imp->imp_client > 0x1000);
+                LASSERTF((unsigned long)imp->imp_client > 0x1000, "%p",
+                        imp->imp_client);
                 LASSERT(imp->imp_client != LP_POISON);
 
                 request->rq_import = class_import_get(imp);
@@ -715,14 +726,13 @@ void ptlrpc_set_destroy(struct ptlrpc_request_set *set)
                 if (req->rq_phase == RQ_PHASE_NEW) {
 
                         if (req->rq_interpret_reply != NULL) {
-                                int (*interpreter)(struct ptlrpc_request *,
-                                                   void *, int) =
+                                ptlrpc_interpterer_t interpreter =
                                         req->rq_interpret_reply;
 
                                 /* higher level (i.e. LOV) failed;
                                  * let the sub reqs clean up */
                                 req->rq_status = -EBADR;
-                                interpreter(req, &req->rq_async_args,
+                                interpreter(NULL, req, &req->rq_async_args,
                                             req->rq_status);
                         }
                         set->set_remaining--;
@@ -976,8 +986,9 @@ static int after_reply(struct ptlrpc_request *req)
         }
 
         OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_PAUSE_REP, obd_fail_val);
-        ptlrpc_at_adj_service(req);
-        ptlrpc_at_adj_net_latency(req);
+        ptlrpc_at_adj_service(req, lustre_msg_get_timeout(req->rq_repmsg));
+        ptlrpc_at_adj_net_latency(req,
+                                  lustre_msg_get_service_time(req->rq_repmsg));
 
         rc = ptlrpc_check_status(req);
         imp->imp_connect_error = rc;
@@ -1117,7 +1128,7 @@ static int ptlrpc_send_new_req(struct ptlrpc_request *req)
 }
 
 /* this sends any unsent RPCs in @set and returns TRUE if all are sent */
-int ptlrpc_check_set(struct ptlrpc_request_set *set)
+int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set)
 {
         struct list_head *tmp;
         int force_timer_recalc = 0;
@@ -1223,7 +1234,11 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set)
                                 spin_unlock(&imp->imp_lock);
 
                                 req->rq_waiting = 0;
-                                if (req->rq_resend) {
+
+                                if (req->rq_timedout||req->rq_resend) {
+                                        /* The request is being re-sent
+                                         * anyway, so mark it as a resend. */
+                                        req->rq_resend = 1;
                                         lustre_msg_add_flags(req->rq_reqmsg,
                                                              MSG_RESENT);
                                         if (req->rq_bulk) {
@@ -1343,9 +1358,10 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set)
                         ptlrpc_unregister_bulk (req);
 
                 if (req->rq_interpret_reply != NULL) {
-                        int (*interpreter)(struct ptlrpc_request *,void *,int) =
+                        ptlrpc_interpterer_t interpreter =
                                 req->rq_interpret_reply;
-                        req->rq_status = interpreter(req, &req->rq_async_args,
+                        req->rq_status = interpreter(NULL, req,
+                                                     &req->rq_async_args,
                                                      req->rq_status);
                 }
                 req->rq_phase = RQ_PHASE_COMPLETE;
@@ -1568,7 +1584,8 @@ int ptlrpc_set_wait(struct ptlrpc_request_set *set)
                 lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(timeout ? timeout : 1),
                                        ptlrpc_expired_set,
                                        ptlrpc_interrupted_set, set);
-                rc = l_wait_event(set->set_waitq, ptlrpc_check_set(set), &lwi);
+                rc = l_wait_event(set->set_waitq,
+                                  ptlrpc_check_set(NULL, set), &lwi);
 
                 LASSERT(rc == 0 || rc == -EINTR || rc == -ETIMEDOUT);
 
@@ -1673,11 +1690,6 @@ static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked)
         EXIT;
 }
 
-void ptlrpc_free_req(struct ptlrpc_request *request)
-{
-        __ptlrpc_free_req(request, 0);
-}
-
 static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked);
 void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request)
 {
@@ -2079,8 +2091,8 @@ repeat:
                   "-- sleeping for "CFS_DURATION_T" ticks", timeout);
         lwi = LWI_TIMEOUT_INTR(timeout, expired_request, interrupted_request,
                                req);
-        rc = l_wait_event(req->rq_reply_waitq, ptlrpc_check_reply(req), &lwi);
-        if (rc == -ETIMEDOUT && ((req->rq_deadline > cfs_time_current_sec()) ||
+        brc = l_wait_event(req->rq_reply_waitq, ptlrpc_check_reply(req), &lwi);
+        if (brc == -ETIMEDOUT && ((req->rq_deadline > cfs_time_current_sec()) ||
                                  ptlrpc_check_and_wait_suspend(req)))
                 goto repeat;
 
@@ -2105,7 +2117,7 @@ after_send:
         if (req->rq_err) {
                 DEBUG_REQ(D_RPCTRACE, req, "err rc=%d status=%d",
                           rc, req->rq_status);
-                GOTO(out, rc = -EIO);
+                GOTO(out, rc = rc ? rc : -EIO);
         }
 
         if (req->rq_intr) {
@@ -2182,7 +2194,8 @@ struct ptlrpc_replay_async_args {
         int praa_old_status;
 };
 
-static int ptlrpc_replay_interpret(struct ptlrpc_request *req,
+static int ptlrpc_replay_interpret(const struct lu_env *env,
+                                   struct ptlrpc_request *req,
                                     void * data, int rc)
 {
         struct ptlrpc_replay_async_args *aa = data;
@@ -2355,8 +2368,37 @@ void ptlrpc_abort_set(struct ptlrpc_request_set *set)
         }
 }
 
-static __u64 ptlrpc_last_xid = 0;
-spinlock_t ptlrpc_last_xid_lock;
+static __u64 ptlrpc_last_xid;
+static spinlock_t ptlrpc_last_xid_lock;
+
+/* Initialize the XID for the node.  This is common among all requests on
+ * this node, and only requires the property that it is monotonically
+ * increasing.  It does not need to be sequential.  Since this is also used
+ * as the RDMA match bits, it is important that a single client NOT have
+ * the same match bits for two different in-flight requests, hence we do
+ * NOT want to have an XID per target or similar.
+ *
+ * To avoid an unlikely collision between match bits after a client reboot
+ * (which would deliver old data into the wrong buffer) we initialize the XID
+ * from the current time, assuming a maximum RPC rate of 1M RPC/s; the time
+ * is widened to 64 bits before shifting so a 32-bit time_t cannot overflow.
+ * If the time is clearly incorrect, we instead use a 62-bit random number;
+ * worst case it overflows at 1M RPC/s in 9133 years, or permutations thereof.
+ */
+#define YEAR_2004 (1ULL << 30)
+void ptlrpc_init_xid(void)
+{
+        time_t now = cfs_time_current_sec();
+
+        spin_lock_init(&ptlrpc_last_xid_lock);
+        if (now < YEAR_2004) {
+                ll_get_random_bytes(&ptlrpc_last_xid, sizeof(ptlrpc_last_xid));
+                ptlrpc_last_xid >>= 2;
+                ptlrpc_last_xid |= (1ULL << 61);
+        } else {
+                ptlrpc_last_xid = (__u64)now << 20;
+        }
+}
 
 __u64 ptlrpc_next_xid(void)
 {
@@ -2369,10 +2411,16 @@ __u64 ptlrpc_next_xid(void)
 
 __u64 ptlrpc_sample_next_xid(void)
 {
+#if BITS_PER_LONG == 32
+        /* a 64-bit read could tear on 32-bit systems, so read under lock */
         __u64 tmp;
         spin_lock(&ptlrpc_last_xid_lock);
         tmp = ptlrpc_last_xid + 1;
         spin_unlock(&ptlrpc_last_xid_lock);
         return tmp;
+#else
+        /* No lock needed: the value may be stale once returned anyway */
+        return ptlrpc_last_xid + 1;
+#endif
 }
 EXPORT_SYMBOL(ptlrpc_sample_next_xid);