/* Adaptive Timeout stuff */
#define D_ADAPTTO D_OTHER
#define AT_BINS 4 /* "bin" means "N seconds of history" */
-#define AT_TIMEBASE_DEFAULT 600 /* remembered history (sec) (should be
- evenly divisible by AT_BINS) */
#define AT_FLG_NOHIST 0x1 /* use last reported value only */
#define AT_FLG_MIN 0x2 /* use a minimum limit */
struct adaptive_timeout {
time_t at_binstart; /* bin start time */
- time_t at_binlimit; /* bin time limit */
unsigned int at_hist[AT_BINS]; /* timeout history bins */
unsigned int at_flags;
unsigned int at_current; /* current timeout value */
unsigned int at_worst_ever; /* worst-ever timeout value */
time_t at_worst_time; /* worst-ever timeout timestamp */
spinlock_t at_lock;
};
/* import.c */
-static inline void at_init(struct adaptive_timeout *at, int val, int timebase,
- int flags) {
+static inline void at_init(struct adaptive_timeout *at, int val, int flags) {
memset(at, 0, sizeof(*at));
- at->at_binlimit = timebase / AT_BINS;
at->at_current = val;
at->at_worst_ever = val;
at->at_worst_time = cfs_time_current_sec();
}

/* Current estimate, clamped to the global ceiling. (Callers check
AT_OFF first, since adaptive_timeout_max is 0 when AT is disabled.) */
static inline unsigned int at_get(struct adaptive_timeout *at)
{
return min(at->at_current, adaptive_timeout_max);
}
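
/* Usage sketch (illustrative only, not part of this patch): with the
default adaptive_timeout_history of 600s and AT_BINS = 4, each bin
covers 600 / 4 = 150s of history. A latency estimator seeded at the
configured floor might look like: */
#if 0 /* example, not compiled */
struct adaptive_timeout net_latency;
unsigned int measured_latency, timeout;

at_init(&net_latency, adaptive_timeout_min, AT_FLG_MIN);
/* after each reply, feed in the newly measured latency... */
at_add(&net_latency, measured_latency);
/* ...and read back the current worst-case estimate */
timeout = at_get(&net_latency);
#endif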
-void at_add(struct adaptive_timeout *at, unsigned int val);
+int at_add(struct adaptive_timeout *at, unsigned int val);
int import_at_get_index(struct obd_import *imp, int portal);
int import_at_get_ldlm(struct obd_import *imp);
#define AT_OFF (adaptive_timeout_max == 0)
void target_cleanup_recovery(struct obd_device *obd);
int target_queue_recovery_request(struct ptlrpc_request *req,
struct obd_device *obd);
-int target_queue_final_reply(struct ptlrpc_request *req, int rc);
+int target_queue_last_replay_reply(struct ptlrpc_request *req, int rc);
void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id);
/* client.c */
__u64 rq_history_seq; /* history sequence # */
int rq_status;
spinlock_t rq_lock;
- /* client-side flags. */
+ /* client-side flags are serialized by rq_lock */
unsigned int rq_intr:1, rq_replied:1, rq_err:1,
rq_timedout:1, rq_resend:1, rq_restart:1,
/*
rq_replay:1,
rq_no_resend:1, rq_waiting:1, rq_receiving_reply:1,
rq_no_delay:1, rq_net_err:1, rq_early:1, rq_must_unlink:1,
- /* server-side: */
- rq_final:1; /* packed final reply */
+ /* server-side flags */
+ rq_packed_final:1; /* packed final reply */
enum rq_phase rq_phase; /* one of RQ_PHASE_* */
atomic_t rq_refcount; /* client-side refcount for SENT race,
server-side refcount for multiple replies */
/* server-side... */
struct timeval rq_arrival_time; /* request arrival time */
struct ptlrpc_reply_state *rq_reply_state; /* separated reply state */
- struct semaphore rq_rs_sem; /* one reply at a time */
struct ptlrpc_request_buffer_desc *rq_rqbd; /* incoming request buffer*/
#ifdef CRAY_XT3
__u32 rq_uid; /* peer uid, used in MDS only */
/* client outgoing req */
time_t rq_sent; /* when request/reply sent (secs) */
- time_t rq_deadline; /* when request must finish */
+ volatile time_t rq_deadline; /* when request must finish. volatile
+ so waiters re-read the value a server's early reply may have
+ extended, rather than trusting a stale cached copy */
int rq_timeout; /* service time estimate (secs) */
/* Multi-rpc bits */
struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid);
static inline int
-ptlrpc_client_receiving_reply (struct ptlrpc_request *req)
+ptlrpc_client_recv_or_unlink (struct ptlrpc_request *req)
{
int rc;
spin_lock(&req->rq_lock);
- rc = req->rq_receiving_reply;
- spin_unlock(&req->rq_lock);
- return (rc);
-}
-
-static inline int
-ptlrpc_client_must_unlink (struct ptlrpc_request *req)
-{
- int rc;
-
- spin_lock(&req->rq_lock);
- rc = req->rq_must_unlink;
- spin_unlock(&req->rq_lock);
- return (rc);
-}
-
-static inline int
-ptlrpc_client_replied (struct ptlrpc_request *req)
-{
- int rc;
-
- spin_lock(&req->rq_lock);
- rc = req->rq_replied;
+ rc = req->rq_receiving_reply || req->rq_must_unlink;
spin_unlock(&req->rq_lock);
return (rc);
}
ptlrpc_rs_decref(req->rq_reply_state);
req->rq_reply_state = NULL;
req->rq_repmsg = NULL;
- up(&req->rq_rs_sem); /* held since lustre_pack_reply */
-}
-
-/* Check if we already packed a normal (non-early) reply.
- Single thread only! */
-static inline int lustre_packed_reply(struct ptlrpc_request *req)
-{
- return req->rq_final;
}
static inline __u32 lustre_request_magic(struct ptlrpc_request *req)
ptlrpc_req_set_repsize(struct ptlrpc_request *req, int count, int *lens)
{
int size = lustre_msg_size(req->rq_reqmsg->lm_magic, count, lens);
+
req->rq_replen = size + lustre_msg_early_size();
if (req->rq_reqmsg->lm_magic == LUSTRE_MSG_MAGIC_V2)
req->rq_reqmsg->lm_repsize = size;
oti->oti_xid = req->rq_xid;
- if (req->rq_reqmsg && req->rq_repmsg && req->rq_reply_state)
+ if ((req->rq_reqmsg != NULL) && (req->rq_repmsg != NULL))
oti->oti_transno = lustre_msg_get_transno(req->rq_repmsg);
oti->oti_thread_id = req->rq_svc_thread ? req->rq_svc_thread->t_id : -1;
oti->oti_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg);
extern unsigned int ldlm_timeout; /* seconds */
extern unsigned int adaptive_timeout_min; /* seconds */
extern unsigned int adaptive_timeout_max; /* seconds */
+extern unsigned int adaptive_timeout_history; /* seconds */
extern unsigned int obd_sync_filter;
extern unsigned int obd_max_dirty_pages;
extern atomic_t obd_dirty_pages;
#define INITIAL_CONNECT_TIMEOUT max_t(int,CONNECTION_SWITCH_MIN,obd_timeout/2)
#endif
#define LND_TIMEOUT 50 /* LNET LND-level RPC timeout */
-#define FOREVER 300 /* Something taking this long is broken */
+#define LONG_UNLINK 300 /* An unlink taking this long is broken */
#define OBD_FAIL_MDS 0x100
return req->rq_export->exp_obd;
}
-int target_queue_final_reply(struct ptlrpc_request *req, int rc)
+int target_queue_last_replay_reply(struct ptlrpc_request *req, int rc)
{
struct obd_device *obd = target_req2obd(req);
struct ptlrpc_request *saved_req;
struct lustre_msg *reqmsg;
int recovery_done = 0;
- LASSERT ((rc == 0) == lustre_packed_reply(req));
+ LASSERT ((rc == 0) == req->rq_packed_final);
- if (!lustre_packed_reply(req)) {
+ if (!req->rq_packed_final) {
/* Just like ptlrpc_error, but without the sending. */
rc = lustre_pack_reply(req, 1, NULL, NULL);
if (rc)
- CERROR("pack error %d\n", rc);
+ return rc;
req->rq_type = PTL_RPC_MSG_ERR;
}
- LASSERT (!req->rq_reply_state->rs_difficult);
+ LASSERT(!req->rq_reply_state->rs_difficult);
LASSERT(list_empty(&req->rq_list));
/* XXX a bit like the request-dup code in queue_recovery_request */
OBD_ALLOC(saved_req, sizeof *saved_req);
if (!saved_req)
- LBUG();
+ return -ENOMEM;
OBD_ALLOC(reqmsg, req->rq_reqlen);
- if (!reqmsg)
- LBUG();
+ if (!reqmsg) {
+ OBD_FREE(saved_req, sizeof *saved_req);
+ return -ENOMEM;
+ }
*saved_req = *req;
memcpy(reqmsg, req->rq_reqmsg, req->rq_reqlen);
EXIT;
out:
req->rq_status = rc ?: err; /* return either error - bug 11190 */
- if (!lustre_packed_reply(req)) {
+ if (!req->rq_packed_final) {
err = lustre_pack_reply(req, 1, NULL, NULL);
if (rc == 0)
rc = err;
static int ldlm_callback_reply(struct ptlrpc_request *req, int rc)
{
req->rq_status = rc;
- if (!lustre_packed_reply(req)) {
+ if (!req->rq_packed_final) {
rc = lustre_pack_reply(req, 1, NULL, NULL);
if (rc)
return rc;
EXPORT_SYMBOL(target_handle_ping);
EXPORT_SYMBOL(target_pack_pool_reply);
EXPORT_SYMBOL(target_handle_disconnect);
-EXPORT_SYMBOL(target_queue_final_reply);
+EXPORT_SYMBOL(target_queue_last_replay_reply);
/* l_lock.c */
EXPORT_SYMBOL(lock_res_and_lock);
{ "mds_conn_uuid", lprocfs_rd_conn_uuid, 0, 0 },
{ "max_rpcs_in_flight", mdc_rd_max_rpcs_in_flight,
mdc_wr_max_rpcs_in_flight, 0 },
- { "timeouts", lprocfs_rd_timeouts, lprocfs_wr_timeouts, 0 },
+ { "timeouts", lprocfs_rd_timeouts, 0, 0 },
{ 0 }
};
pop_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc);
default:
mds_exit_ucred(&uc, mds);
- if (!lustre_packed_reply(req)) {
+ if (!req->rq_packed_final) {
req->rq_status = rc;
lustre_pack_reply(req, 1, NULL, NULL);
}
out_pop:
pop_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc);
out_ucred:
- if (!lustre_packed_reply(req)) {
+ if (!req->rq_packed_final) {
req->rq_status = rc;
lustre_pack_reply(req, 1, NULL, NULL);
}
if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_LAST_REPLAY) {
if (obd && obd->obd_recovering) {
DEBUG_REQ(D_HA, req, "LAST_REPLAY, queuing reply");
- return target_queue_final_reply(req, rc);
+ return target_queue_last_replay_reply(req, rc);
}
/* Lost a race with recovery; let the error path DTRT. */
rc = req->rq_status = -ENOTCONN;
unsigned int obd_dump_on_eviction;
unsigned int obd_timeout = OBD_TIMEOUT_DEFAULT; /* seconds */
unsigned int ldlm_timeout = LDLM_TIMEOUT_DEFAULT; /* seconds */
+/* Covers the maximum expected network latency */
unsigned int adaptive_timeout_min = 10; /* seconds */
unsigned int adaptive_timeout_max = 600; /* seconds */
+/* We remember the slowest event that took place within history */
+unsigned int adaptive_timeout_history = 600; /* seconds */
unsigned int obd_max_dirty_pages = 256;
atomic_t obd_dirty_pages;
EXPORT_SYMBOL(ldlm_timeout);
EXPORT_SYMBOL(adaptive_timeout_min);
EXPORT_SYMBOL(adaptive_timeout_max);
+EXPORT_SYMBOL(adaptive_timeout_history);
EXPORT_SYMBOL(obd_max_dirty_pages);
EXPORT_SYMBOL(obd_dirty_pages);
EXPORT_SYMBOL(ptlrpc_put_connection_superhack);
Since we can't say for sure how slow a network might be, we use
a user-defined max expected network latency. We will adapt to slow
increases, but a sudden jump can still kill us. */
- at_init(&at->iat_net_latency, adaptive_timeout_min, AT_TIMEBASE_DEFAULT,
- AT_FLG_MIN);
+ at_init(&at->iat_net_latency, adaptive_timeout_min, AT_FLG_MIN);
for (i = 0; i < IMP_AT_MAX_PORTALS; i++) {
/* max service estimates are tracked on the server side, so
don't use the AT history here, just use the last reported
val. (But keep hist for proc histogram, worst_ever) */
at_init(&at->iat_service_estimate[i], INITIAL_CONNECT_TIMEOUT,
- AT_TIMEBASE_DEFAULT, AT_FLG_NOHIST);
+ AT_FLG_NOHIST);
}
at->iat_drain = 0;
}
OBD_ALLOC_FAIL_RATE, /* memory allocation random failure rate */
ADAPTIVE_MIN, /* Adaptive timeout lower limit */
ADAPTIVE_MAX, /* Adaptive timeout upper limit */
+ ADAPTIVE_HISTORY, /* Adaptive timeout timebase */
};
int LL_PROC_PROTO(proc_fail_loc)
.mode = 0644,
.proc_handler = &proc_dointvec
},
+ {
+ .ctl_name = ADAPTIVE_HISTORY,
+ .procname = "adaptive_history",
+ .data = &adaptive_timeout_history,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
{ 0 }
};
"adaptive timeouts off, using obd_timeout %u\n",
obd_timeout);
- rc += snprintf(page + rc, count - rc,
- "%-10s : %ld sec\n", "timebase",
- imp->imp_at.iat_net_latency.at_binlimit * AT_BINS);
-
now = cfs_time_current_sec();
/* Some network health info for kicks */
return rc;
}
-int lprocfs_wr_timeouts(struct file *file, const char *buffer,
- unsigned long count, void *data)
-{
- struct obd_device *obd = (struct obd_device *)data;
- struct obd_import *imp;
- time_t bval;
- int val, i, rc;
-
- LASSERT(obd != NULL);
- LPROCFS_CLIMP_CHECK(obd);
- imp = obd->u.cli.cl_import;
-
- rc = lprocfs_write_helper(buffer, count, &val);
- if (rc < 0)
- return rc;
- if (val <= 0)
- return -ERANGE;
-
- bval = max(1, val / AT_BINS);
- spin_lock(&imp->imp_at.iat_net_latency.at_lock);
- imp->imp_at.iat_net_latency.at_binlimit = bval;
- spin_unlock(&imp->imp_at.iat_net_latency.at_lock);
- for(i = 0; i < IMP_AT_MAX_PORTALS; i++) {
- spin_lock(&imp->imp_at.iat_service_estimate[i].at_lock);
- imp->imp_at.iat_service_estimate[i].at_binlimit = bval;
- spin_unlock(&imp->imp_at.iat_service_estimate[i].at_lock);
- }
-
- LPROCFS_CLIMP_EXIT(obd);
- return count;
-}
-
static const char *obd_connect_names[] = {
"read_only",
"lov_index",
EXPORT_SYMBOL(lprocfs_rd_numrefs);
EXPORT_SYMBOL(lprocfs_at_hist_helper);
EXPORT_SYMBOL(lprocfs_rd_timeouts);
-EXPORT_SYMBOL(lprocfs_wr_timeouts);
EXPORT_SYMBOL(lprocfs_rd_blksize);
EXPORT_SYMBOL(lprocfs_rd_kbytestotal);
EXPORT_SYMBOL(lprocfs_rd_kbytesfree);
{ "prealloc_next_id", osc_rd_prealloc_next_id, 0, 0 },
{ "prealloc_last_id", osc_rd_prealloc_last_id, 0, 0 },
{ "checksums", osc_rd_checksum, osc_wr_checksum, 0 },
- { "timeouts", lprocfs_rd_timeouts, lprocfs_wr_timeouts, 0 },
+ { "timeouts", lprocfs_rd_timeouts, 0, 0 },
{ 0 }
};
if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_LAST_REPLAY) {
if (obd && obd->obd_recovering) {
DEBUG_REQ(D_HA, req, "LAST_REPLAY, queuing reply");
- return target_queue_final_reply(req, rc);
+ return target_queue_last_replay_reply(req, rc);
}
/* Lost a race with recovery; let the error path DTRT. */
rc = req->rq_status = -ENOTCONN;
}
/* Set server timelimit for this req */
-static void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req) {
+static void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req)
+{
__u32 serv_est;
int idx;
struct imp_at *at;
+
LASSERT(req->rq_import);
- if (AT_OFF ||
- ((idx = import_at_get_index(req->rq_import,
- req->rq_request_portal)) < 0)) {
+ if (AT_OFF) {
/* non-AT settings */
req->rq_timeout = req->rq_import->imp_server_timeout ?
obd_timeout / 2 : obd_timeout;
- goto out;
+ lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout);
+ return;
}
at = &req->rq_import->imp_at;
+ idx = import_at_get_index(req->rq_import,
+ req->rq_request_portal);
serv_est = at_get(&at->iat_service_estimate[idx]);
/* add an arbitrary minimum: 125% + 5 sec */
req->rq_timeout = serv_est + (serv_est >> 2) + 5;
req->rq_timeout);
}
-out:
/* Let the server know what this RPC timeout is by putting it in the
reqmsg*/
lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout);
}
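
/* Worked example (illustrative, values assumed): with a server-side
service estimate of 20s, the client sets
rq_timeout = 20 + (20 >> 2) + 5 = 30 sec,
so the server's estimate can grow by 25% (plus 5s) before the client
would time out and resend. */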
/* Adjust max service estimate based on server value */
-static void ptlrpc_at_adj_service(struct ptlrpc_request *req) {
+static void ptlrpc_at_adj_service(struct ptlrpc_request *req)
+{
int idx;
unsigned int serv_est, oldse;
struct imp_at *at = &req->rq_import->imp_at;
+
LASSERT(req->rq_import);
/* service estimate is returned in the repmsg timeout field,
serv_est = lustre_msg_get_timeout(req->rq_repmsg);
idx = import_at_get_index(req->rq_import, req->rq_request_portal);
- if (idx < 0)
- return;
-
- oldse = at_get(&at->iat_service_estimate[idx]);
/* max service estimates are tracked on the server side,
so just keep minimal history here */
- at_add(&at->iat_service_estimate[idx], serv_est);
-
- if (at_get(&at->iat_service_estimate[idx]) != oldse)
+ oldse = at_add(&at->iat_service_estimate[idx], serv_est);
+ if (oldse != 0)
CDEBUG(D_ADAPTTO, "The RPC service estimate for %s ptl %d "
- "has changed %ds to %u\n",
- req->rq_import->imp_obd->obd_name, req->rq_request_portal,
- at_get(&at->iat_service_estimate[idx]) - oldse, serv_est);
+ "has changed from %d to %d\n",
+ req->rq_import->imp_obd->obd_name, req->rq_request_portal,
+ oldse, at_get(&at->iat_service_estimate[idx]));
}
/* Expected network latency per remote node (secs) */
-int ptlrpc_at_get_net_latency(struct ptlrpc_request *req) {
+int ptlrpc_at_get_net_latency(struct ptlrpc_request *req)
+{
return AT_OFF ? 0 : at_get(&req->rq_import->imp_at.iat_net_latency);
}
/* Adjust expected network latency */
-static void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req) {
+static void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req)
+{
unsigned int st, nl, oldnl;
struct imp_at *at = &req->rq_import->imp_at;
time_t now = cfs_time_current_sec();
+
LASSERT(req->rq_import);
st = lustre_msg_get_service_time(req->rq_repmsg);
CERROR("Reported service time %u > total measured time %ld\n",
st, now - req->rq_sent);
- oldnl = at_get(&at->iat_net_latency);
- at_add(&at->iat_net_latency, nl);
-
- if (at_get(&at->iat_net_latency) != oldnl)
- CDEBUG(D_ADAPTTO, "The network latency for %s (nid %s)"
- " has changed %ds to %u\n",
+ oldnl = at_add(&at->iat_net_latency, nl);
+ if (oldnl != 0)
+ CDEBUG(D_ADAPTTO, "The network latency for %s (nid %s) "
+ "has changed from %d to %d\n",
req->rq_import->imp_obd->obd_name,
obd_uuid2str(
&req->rq_import->imp_connection->c_remote_uuid),
- at_get(&at->iat_net_latency) - oldnl, nl);
+ oldnl, at_get(&at->iat_net_latency));
}
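
/* Worked example (illustrative, values assumed): a request sent at
time t gets its reply at t + 12s, and the server reports 10s of
service time, so the sample fed to at_add() above is
nl = 12 - 10 = 2 sec of round-trip network latency. */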
-static int unpack_reply(struct ptlrpc_request *req) {
+static int unpack_reply(struct ptlrpc_request *req)
+{
int rc;
/* Clear reply swab mask; we may have already swabbed an early reply */
/* Handle an early reply message.
We can't risk the real reply coming in and changing rq_repmsg,
so this fn must be called under the rq_lock */
-static int ptlrpc_at_early_reply(struct ptlrpc_request *req) {
+static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req)
+{
struct lustre_msg *oldmsg, *msgcpy;
time_t olddl;
int oldlen, rc;
GOTO(out, rc = 1);
if (req->rq_early) {
- ptlrpc_at_early_reply(req);
+ ptlrpc_at_recv_early_reply(req);
GOTO(out, rc = 0); /* keep waiting */
}
force_timer_recalc = 1;
}
+ spin_lock(&req->rq_lock);
+
/* Still waiting for a reply? */
- if (ptlrpc_client_receiving_reply(req))
+ if (req->rq_receiving_reply) {
+ spin_unlock(&req->rq_lock);
continue;
+ }
- spin_lock(&req->rq_lock);
if (req->rq_early) {
- ptlrpc_at_early_reply(req);
+ ptlrpc_at_recv_early_reply(req);
spin_unlock(&req->rq_lock);
continue;
}
- spin_unlock(&req->rq_lock);
/* Did we actually receive a reply? */
- if (!ptlrpc_client_replied(req))
+ if (!req->rq_replied) {
+ spin_unlock(&req->rq_lock);
continue;
+ }
+
+ spin_unlock(&req->rq_lock);
spin_lock(&imp->imp_lock);
list_del_init(&req->rq_list);
req = list_entry(tmp, struct ptlrpc_request, rq_set_chain);
/* request in-flight? */
- if (!((req->rq_phase == RQ_PHASE_RPC && !req->rq_waiting) ||
+ if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) ||
(req->rq_phase == RQ_PHASE_BULK)))
continue;
if (req->rq_deadline <= now) { /* actually expired already */
timeout = 1; /* ASAP */
break;
- } else if (timeout == 0 || timeout > req->rq_deadline - now) {
+ }
+
+ if ((timeout == 0) || (timeout > (req->rq_deadline - now))) {
timeout = req->rq_deadline - now;
}
}
ENTRY;
LASSERT(!in_interrupt ()); /* might sleep */
-
- if (!ptlrpc_client_receiving_reply(request) &&
- !ptlrpc_client_must_unlink(request))
+ if (!ptlrpc_client_recv_or_unlink(request))
/* Nothing left to do */
return;
for (;;) {
/* Network access will complete in finite time but the HUGE
* timeout lets us CWARN for visibility of sluggish NALs */
- lwi = LWI_TIMEOUT(cfs_time_seconds(FOREVER), NULL, NULL);
- rc = l_wait_event (*wq,
- !ptlrpc_client_receiving_reply(request) &&
- !ptlrpc_client_must_unlink(request),
+ lwi = LWI_TIMEOUT(cfs_time_seconds(LONG_UNLINK), NULL, NULL);
+ rc = l_wait_event (*wq, !ptlrpc_client_recv_or_unlink(request),
&lwi);
if (rc == 0)
return;
LASSERT (rc == -ETIMEDOUT);
DEBUG_REQ(D_WARNING, request, "Unexpectedly long timeout "
- "rvcng=%d unlnk=%d",
- ptlrpc_client_receiving_reply(request),
- ptlrpc_client_must_unlink(request));
+ "rvcng=%d unlnk=%d", request->rq_receiving_reply,
+ request->rq_must_unlink);
}
EXIT;
}
lustre_msg_get_status(req->rq_reqmsg), req->rq_xid,
libcfs_nid2str(imp->imp_connection->c_peer.nid),
lustre_msg_get_opc(req->rq_reqmsg));
+
spin_lock(&imp->imp_lock);
list_del_init(&req->rq_list);
spin_unlock(&imp->imp_lock);
out:
if (req->rq_bulk != NULL) {
- while(rc >= 0) {
+ if (rc >= 0) {
/* success so far. Note that anything going wrong
* with bulk now, is EXTREMELY strange, since the
* server must have believed that the bulk
* transferred OK before she replied with success to
* me. */
timeoutl = req->rq_deadline - cfs_time_current_sec();
timeout = (timeoutl <= 0) ? CFS_TICK :
cfs_time_seconds(timeoutl);

lwi = LWI_TIMEOUT(timeout, NULL, NULL);
brc = l_wait_event(req->rq_reply_waitq,
!ptlrpc_bulk_active(req->rq_bulk),
&lwi);
LASSERT(brc == 0 || brc == -ETIMEDOUT);
- if ((brc == -ETIMEDOUT) &&
- (req->rq_deadline > cfs_time_current_sec()))
- /* Wait again if we changed deadline */
- continue;
if (brc != 0) {
+ LASSERT(brc == -ETIMEDOUT);
DEBUG_REQ(D_ERROR, req, "bulk timed out");
rc = brc;
} else if (!req->rq_bulk->bd_success) {
DEBUG_REQ(D_ERROR, req, "bulk transfer failed");
rc = -EIO;
}
- break;
}
if (rc < 0)
ptlrpc_unregister_bulk (req);
LASSERT(ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_UNLINK);
LASSERT(ev->md.start == req->rq_repbuf);
LASSERT(ev->mlength <= req->rq_replen);
+ /* We've set LNET_MD_MANAGE_REMOTE for all outgoing requests
+ for adaptive timeouts' early reply. */
+ LASSERT((ev->md.options & LNET_MD_MANAGE_REMOTE) != 0);
spin_lock(&req->rq_lock);
goto out_wake;
}
- /* We've set LNET_MD_MANAGE_REMOTE for all outgoing requests
- for adaptive timeouts' early reply. */
- LASSERT(ev->md.options & LNET_MD_MANAGE_REMOTE);
-
if ((ev->offset == 0) &&
- lustre_msg_get_flags(req->rq_reqmsg) & MSG_AT_SUPPORT) {
+ ((lustre_msg_get_flags(req->rq_reqmsg) & MSG_AT_SUPPORT) != 0)) {
/* Early reply */
DEBUG_REQ(D_ADAPTTO, req,
"Early reply received: mlen=%u offset=%d replen=%d "
req->rq_uid = ev->uid;
#endif
spin_lock_init(&req->rq_lock);
- sema_init(&req->rq_rs_sem, 1);
CFS_INIT_LIST_HEAD(&req->rq_timed_list);
atomic_set(&req->rq_refcount, 1);
if (ev->type == LNET_EVENT_PUT)
timeout = (int)(last - cfs_time_current_sec());
if (timeout > 0) {
lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(timeout),
- HZ, NULL, NULL);
+ cfs_time_seconds(1), NULL, NULL);
rc = l_wait_event(imp->imp_recovery_waitq,
(atomic_read(&imp->imp_inflight) == 0),
&lwi);
if (ptlrpc_import_in_recovery(imp)) {
struct l_wait_info lwi;
cfs_duration_t timeout;
- int idx;
- if (AT_OFF || (idx = import_at_get_index(imp,
- imp->imp_client->cli_request_portal)) < 0)
+ if (AT_OFF) {
timeout = cfs_time_seconds(obd_timeout);
- else
+ } else {
+ int idx = import_at_get_index(imp,
+ imp->imp_client->cli_request_portal);
timeout = cfs_time_seconds(
at_get(&imp->imp_at.iat_service_estimate[idx]));
+ }
lwi = LWI_TIMEOUT_INTR(cfs_timeout_cap(timeout),
back_to_sleep, LWI_ON_SIGNAL_NOOP, NULL);
rc = l_wait_event(imp->imp_recovery_waitq,
This gives us a max of the last binlimit*AT_BINS secs without the storage,
but still smoothing out a return to normalcy from a slow response.
(E.g. remember the maximum latency in each minute of the last 4 minutes.) */
-void at_add(struct adaptive_timeout *at, unsigned int val) {
- /*unsigned int old = at->at_current;*/
+int at_add(struct adaptive_timeout *at, unsigned int val)
+{
+ unsigned int old = at->at_current;
time_t now = cfs_time_current_sec();
+ time_t binlimit = max_t(time_t, adaptive_timeout_history / AT_BINS, 1);
LASSERT(at);
#if 0
- CDEBUG(D_INFO, "add %u to %p time=%lu tb=%lu v=%u (%u %u %u %u)\n",
- val, at, now - at->at_binstart, at->at_binlimit, at->at_current,
+ CDEBUG(D_INFO, "add %u to %p time=%lu v=%u (%u %u %u %u)\n",
+ val, at, now - at->at_binstart, at->at_current,
at->at_hist[0], at->at_hist[1], at->at_hist[2], at->at_hist[3]);
#endif
if (val == 0)
/* 0's don't count, because we never want our timeout to
drop to 0, and because 0 could mean an error */
- return;
+ return 0;
spin_lock(&at->at_lock);
at->at_worst_time = now;
at->at_hist[0] = val;
at->at_binstart = now;
- } else if (now - at->at_binstart < at->at_binlimit ) {
+ } else if (now - at->at_binstart < binlimit) {
/* in bin 0 */
at->at_hist[0] = max(val, at->at_hist[0]);
at->at_current = max(val, at->at_current);
int i, shift;
unsigned int maxv = val;
/* move bins over */
- shift = (now - at->at_binstart) / at->at_binlimit;
+ shift = (now - at->at_binstart) / binlimit;
LASSERT(shift > 0);
for(i = AT_BINS - 1; i >= 0; i--) {
if (i >= shift) {
}
at->at_hist[0] = val;
at->at_current = maxv;
- at->at_binstart += shift * at->at_binlimit;
+ at->at_binstart += shift * binlimit;
}
if ((at->at_flags & AT_FLG_MIN) &&
at->at_hist[0], at->at_hist[1], at->at_hist[2],
at->at_hist[3]);
#endif
+
+ /* if we changed, report the old value */
+ old = (at->at_current != old) ? old : 0;
+
spin_unlock(&at->at_lock);
+ return old;
}
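
/* Bin-shift walkthrough (illustrative, values assumed): with
adaptive_timeout_history = 600 and AT_BINS = 4, binlimit = 150s.
Say at_add() saw 40 at t=0 and sees 10 at t=400; the samples land
(400 - 0) / 150 = 2 bins apart, so the history shifts right by 2:
  before: at_hist = {40, 0, 0, 0},  at_current = 40
  after:  at_hist = {10, 0, 40, 0}, at_current = max of bins = 40
The 40 only drops out once it shifts past bin 3, roughly
AT_BINS * binlimit = 600s after it was reported; at_current then
relaxes to the recent maximum. */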
/* Find the imp_at index for a given portal; assign if space available */
-int import_at_get_index(struct obd_import *imp, int portal) {
+int import_at_get_index(struct obd_import *imp, int portal)
+{
struct imp_at *at = &imp->imp_at;
int i;
/* unused */
break;
}
+
+ /* Out of slots -- increase IMP_AT_MAX_PORTALS if this fires */
+ LASSERT(i < IMP_AT_MAX_PORTALS);
- if (i >= IMP_AT_MAX_PORTALS) {
- CERROR("Tried to use more than %d portals, not enough room "
- "in adaptive timeout stats.\n", IMP_AT_MAX_PORTALS);
- i = -1;
- goto out;
- }
at->iat_portal[i] = portal;
-
out:
spin_unlock(&imp->imp_lock);
return i;
Since any early reply will only affect the RPC wait time, and not
any local lock timer we set based on the return value here,
we should be conservative. */
-int import_at_get_ldlm(struct obd_import *imp) {
+int import_at_get_ldlm(struct obd_import *imp)
+{
int idx, tot;
if (!imp || !imp->imp_client || AT_OFF)
return obd_timeout;
- tot = at_get(&imp->imp_at.iat_net_latency);
idx = import_at_get_index(imp, imp->imp_client->cli_request_portal);
- if (idx < 0)
- tot += obd_timeout;
- else
- tot += at_get(&imp->imp_at.iat_service_estimate[idx]);
+ tot = at_get(&imp->imp_at.iat_net_latency) +
+ at_get(&imp->imp_at.iat_service_estimate[idx]);
/* add an arbitrary minimum: 150% + 10 sec */
tot += (tot >> 1) + 10;
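
/* Worked example (illustrative, values assumed): with a 2s network
latency estimate and a 30s service estimate,
tot = 2 + 30 = 32, then 32 + (32 >> 1) + 10 = 58 sec
is used as the conservative ldlm timeout. */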
"adaptive timeouts off, using obd_timeout %u\n",
obd_timeout);
rc += snprintf(page + rc, count - rc,
- "%10s : %ld sec\n", "timebase",
- svc->srv_at_estimate.at_binlimit * AT_BINS);
- rc += snprintf(page + rc, count - rc,
"%10s : cur %3u worst %3u (at %ld, "DHMS_FMT" ago) ",
"service", cur, worst, worstt,
DHMS_VARS(&ts));
return rc;
}
-static int ptlrpc_lprocfs_wr_timeouts(struct file *file, const char *buffer,
- unsigned long count, void *data)
-{
- struct ptlrpc_service *svc = data;
- int val, rc;
-
- rc = lprocfs_write_helper(buffer, count, &val);
- if (rc < 0)
- return rc;
- if (val <= 0)
- return -ERANGE;
-
- spin_lock(&svc->srv_at_estimate.at_lock);
- svc->srv_at_estimate.at_binlimit = max(1, val / AT_BINS);
- spin_unlock(&svc->srv_at_estimate.at_lock);
-
- return count;
-}
-
void ptlrpc_lprocfs_register_service(struct proc_dir_entry *entry,
struct ptlrpc_service *svc)
{
.read_fptr = ptlrpc_lprocfs_read_req_history_max,
.data = svc},
{.name = "timeouts",
- .write_fptr = ptlrpc_lprocfs_wr_timeouts,
.read_fptr = ptlrpc_lprocfs_rd_timeouts,
.data = svc},
{NULL}
req->rq_arrival_time.tv_sec, 1);
if (!(flags & PTLRPC_REPLY_EARLY) &&
(req->rq_type != PTL_RPC_MSG_ERR)) {
- int oldse = at_get(&svc->srv_at_estimate);
/* early replies and errors don't count toward our service
time estimate */
- at_add(&svc->srv_at_estimate, service_time);
- if (service_time > oldse)
+ int oldse = at_add(&svc->srv_at_estimate, service_time);
+ if (oldse != 0)
DEBUG_REQ(D_ADAPTTO, req,
- "svc %s increased estimate from %d to %d",
- svc->srv_name, oldse, service_time);
+ "svc %s changed estimate from %d to %d",
+ svc->srv_name, oldse,
+ at_get(&svc->srv_at_estimate));
}
/* Report actual service time for client latency calc */
lustre_msg_set_service_time(req->rq_repmsg, service_time);
int size;
ENTRY;
- /* Insure only 1 reply at a time: hold sem until ptlrpc_req_drop_rs */
- down(&req->rq_rs_sem);
LASSERT(req->rq_reply_state == NULL);
- if (req->rq_final && (flags & LPRFL_EARLY_REPLY)) {
+ if (req->rq_packed_final) {
/* Already packed final, no more early */
- up(&req->rq_rs_sem);
RETURN(-EALREADY);
}
+ if ((flags & LPRFL_EARLY_REPLY) == 0)
+ req->rq_packed_final = 1;
msg_len = lustre_msg_size_v1(count, lens);
size = sizeof(struct ptlrpc_reply_state) + msg_len;
OBD_ALLOC(rs, size);
if (unlikely(rs == NULL)) {
rs = lustre_get_emerg_rs(req->rq_rqbd->rqbd_service, size);
- if (!rs) {
- up(&req->rq_rs_sem);
+ if (!rs)
RETURN (-ENOMEM);
- }
}
atomic_set(&rs->rs_refcount, 1); /* 1 ref for rq_reply_state */
rs->rs_cb_id.cbid_fn = reply_out_callback;
req->rq_replen = msg_len;
req->rq_reply_state = rs;
req->rq_repmsg = rs->rs_msg;
- if (!flags)
- req->rq_final = 1; /* checked in lustre_packed_reply */
lustre_init_msg_v1(rs->rs_msg, count, lens, bufs);
int size;
ENTRY;
- /* Insure only 1 reply at a time: hold sem until ptlrpc_req_drop_rs */
- down(&req->rq_rs_sem);
LASSERT(req->rq_reply_state == NULL);
- if (req->rq_final && (flags & LPRFL_EARLY_REPLY)) {
+ if (req->rq_packed_final) {
/* Already packed final, no more early */
- up(&req->rq_rs_sem);
RETURN(-EALREADY);
}
+ if ((flags & LPRFL_EARLY_REPLY) == 0)
+ req->rq_packed_final = 1;
msg_len = lustre_msg_size_v2(count, lens);
size = sizeof(struct ptlrpc_reply_state) + msg_len;
OBD_ALLOC(rs, size);
if (unlikely(rs == NULL)) {
rs = lustre_get_emerg_rs(req->rq_rqbd->rqbd_service, size);
- if (!rs) {
- up(&req->rq_rs_sem);
+ if (!rs)
RETURN (-ENOMEM);
- }
}
atomic_set(&rs->rs_refcount, 1); /* 1 ref for rq_reply_state */
rs->rs_cb_id.cbid_fn = reply_out_callback;
req->rq_replen = msg_len;
req->rq_reply_state = rs;
req->rq_repmsg = rs->rs_msg;
- if (!(flags & LPRFL_EARLY_REPLY))
- req->rq_final = 1; /* checked in lustre_packed_reply */
/* server side, no rq_repbuf */
lustre_init_msg_v2(rs->rs_msg, count, lens, bufs);
case LUSTRE_MSG_MAGIC_V2_SWABBED: {
struct ptlrpc_body *pb;
- pb = lustre_msg_buf(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+ pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
if (!pb) {
CERROR("invalid msg %p: no ptlrpc body!\n", msg);
return 0;
case LUSTRE_MSG_MAGIC_V2_SWABBED: {
struct ptlrpc_body *pb;
- pb = lustre_msg_buf(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+ pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
if (!pb) {
CERROR("invalid msg %p: no ptlrpc body!\n", msg);
return 0;
CFS_MODULE_PARM(test_req_buffer_pressure, "i", int, 0444,
"set non-zero to put pressure on request buffer pools");
+static int at_early_margin = 3;
+CFS_MODULE_PARM(at_early_margin, "i", int, 0644,
+ "How far before the deadline we send an early reply");
+
+static int at_extra = 10;
+CFS_MODULE_PARM(at_extra, "i", int, 0644,
+ "How much extra time we give with an early reply");
+
/* forward ref */
static int ptlrpc_server_post_idle_rqbds (struct ptlrpc_service *svc);
cfs_timer_init(&service->srv_at_timer, ptlrpc_at_timer, service);
/* At SOW, service time should be quick; 10s seems generous. If client
timeout is less than this, we'll be sending an early reply. */
- at_init(&service->srv_at_estimate, 10, AT_TIMEBASE_DEFAULT, 0);
+ at_init(&service->srv_at_estimate, 10, 0);
spin_lock (&ptlrpc_all_services_lock);
list_add (&service->srv_list, &ptlrpc_all_services);
return NULL;
}
-static void ptlrpc_server_decref(struct ptlrpc_request *req)
+static void ptlrpc_server_req_decref(struct ptlrpc_request *req)
{
struct ptlrpc_request_buffer_desc *rqbd = req->rq_rqbd;
{
list_del(&req->rq_list);
ptlrpc_req_drop_rs(req);
- ptlrpc_server_decref(req);
+ ptlrpc_server_req_decref(req);
}
static void ptlrpc_server_free_request(struct ptlrpc_request *req)
struct list_head *tmp;
struct list_head *nxt;
- DEBUG_REQ(D_INFO, req, "free req");
+ if (req->rq_phase != RQ_PHASE_NEW) /* still NEW: the msg may have bad
+ magic, so DEBUG_REQ can't print it */
+ DEBUG_REQ(D_INFO, req, "free req");
spin_lock(&svc->srv_at_lock);
list_del_init(&req->rq_timed_list);
spin_unlock(&svc->srv_at_lock);
return 0;
}
-/* If closest exipiration is within EARLY_MIN, send early replies to everybody
- expiring within EARLY_MAX, asking for AT_EXTRA time */
-#define AT_EARLY_MIN 2 /* Min time needed to send an early reply */
-#define AT_EARLY_MAX 5 /* Dont send early replies if deadline is beyond */
-#define AT_EXTRA 10 /* Early replies add this time to client timeout */
-
static void ptlrpc_at_set_timer(struct ptlrpc_service *svc)
{
struct ptlrpc_request *rq;
/* Set timer for closest deadline */
rq = list_entry(svc->srv_at_list.next, struct ptlrpc_request,
rq_timed_list);
- next = rq->rq_deadline - cfs_time_current_sec() - AT_EARLY_MIN;
+ next = rq->rq_deadline - cfs_time_current_sec() - at_early_margin;
if (next <= 0)
ptlrpc_at_timer((unsigned long)svc);
else
cfs_timer_arm(&svc->srv_at_timer, cfs_time_shift(next));
spin_unlock(&svc->srv_at_lock);
- CDEBUG(D_ADAPTTO, "armed %s at %+lds\n", svc->srv_name, next);
+ CDEBUG(D_INFO, "armed %s at %+lds\n", svc->srv_name, next);
}
/* Add rpc to early reply check list */
int extra_time)
{
struct ptlrpc_service *svc = req->rq_rqbd->rqbd_service;
+ struct ptlrpc_request *reqcopy;
+ struct lustre_msg *reqmsg;
long deadline = req->rq_deadline - cfs_time_current_sec();
int rc;
ENTRY;
if (AT_OFF)
RETURN(0);
+ if (deadline < 0) {
+ CERROR("Already past deadline (%+lds), not sending early "
+ "reply\n", deadline);
+ /* Return an error so we're not re-added to the timed list. */
+ RETURN(-ETIMEDOUT);
+ }
+
if ((lustre_msg_get_flags(req->rq_reqmsg) & MSG_AT_SUPPORT) == 0) {
CERROR("Wanted to ask client for more time, but no AT "
"support\n");
RETURN(-ENOSYS);
}
- rc = lustre_pack_reply_flags(req, 1, NULL, NULL, LPRFL_EARLY_REPLY);
+ OBD_ALLOC(reqcopy, sizeof *reqcopy);
+ if (reqcopy == NULL)
+ RETURN(-ENOMEM);
+ OBD_ALLOC(reqmsg, req->rq_reqlen);
+ if (!reqmsg) {
+ OBD_FREE(reqcopy, sizeof *reqcopy);
+ RETURN(-ENOMEM);
+ }
+
+ *reqcopy = *req;
+ /* We need the reqmsg for the magic */
+ reqcopy->rq_reqmsg = reqmsg;
+ memcpy(reqmsg, req->rq_reqmsg, req->rq_reqlen);
+
+ rc = lustre_pack_reply_flags(reqcopy, 1, NULL, NULL, LPRFL_EARLY_REPLY);
if (rc)
- /* EALREADY means final reply was already packed */
- RETURN(rc);
+ GOTO(out, rc);
if (extra_time) {
/* Fake our processing time into the future to ask the
clients for some extra amount of time */
extra_time += cfs_time_current_sec() -
- req->rq_arrival_time.tv_sec;
+ reqcopy->rq_arrival_time.tv_sec;
at_add(&svc->srv_at_estimate, extra_time);
}
- req->rq_early_count++; /* number sent, server side */
- rc = ptlrpc_send_reply(req, PTLRPC_REPLY_EARLY);
+ rc = ptlrpc_send_reply(reqcopy, PTLRPC_REPLY_EARLY);
if (!rc) {
/* Adjust our own deadline to what we told the client */
req->rq_deadline = req->rq_arrival_time.tv_sec +
at_get(&svc->srv_at_estimate);
+ req->rq_early_count++; /* number sent, server side */
} else {
DEBUG_REQ(D_ERROR, req, "Early reply send failed %d", rc);
}
- /* Reset reply */
- req->rq_rep_swab_mask = 0;
/* Free the (early) reply state from lustre_pack_reply.
(ptlrpc_send_reply takes its own rs ref, so this is safe here) */
- ptlrpc_req_drop_rs(req);
-
+ ptlrpc_req_drop_rs(reqcopy);
+out:
+ OBD_FREE(reqmsg, req->rq_reqlen);
+ OBD_FREE(reqcopy, sizeof *reqcopy);
RETURN(rc);
}
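
/* Timeline sketch (illustrative; assumes the default at_early_margin=3
and at_extra=10). A request arrives at time T with a 30s client
timeout, so rq_deadline = T + 30:
  T+27: the at-timer fires (3s margin); we copy the request, pack an
        early reply, and pad srv_at_estimate by ~at_extra seconds
  T+27: rq_deadline moves to rq_arrival_time + at_get(srv_at_estimate)
        on the server; the client extends its deadline to match when
        the early reply arrives, so it keeps waiting instead of
        resending. */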
-/* Check if we need to send any early replies, and send them */
+/* Send early replies to everybody expiring within at_early_margin
+ asking for at_extra time */
static int ptlrpc_at_check_timed(struct ptlrpc_service *svc)
{
struct ptlrpc_request *rq, *n;
rq = list_entry(svc->srv_at_list.next, struct ptlrpc_request,
rq_timed_list);
first = (int)(rq->rq_deadline - now);
- if (first > AT_EARLY_MIN) {
+ if (first > at_early_margin) {
/* We've still got plenty of time. Reset the timer. */
spin_unlock(&svc->srv_at_lock);
ptlrpc_at_set_timer(svc);
server will take. Send early replies to everyone expiring soon. */
CFS_INIT_LIST_HEAD(&work_list);
list_for_each_entry_safe(rq, n, &svc->srv_at_list, rq_timed_list) {
- if (rq->rq_deadline <= now + AT_EARLY_MAX) {
+ if (rq->rq_deadline <= now + at_early_margin) {
list_move(&rq->rq_timed_list, &work_list);
counter++;
} else {
spin_unlock(&svc->srv_at_lock);
CDEBUG(D_ADAPTTO, "timeout in %+ds, asking for %d secs on %d early "
- "replies\n", first, AT_EXTRA, counter);
+ "replies\n", first, at_extra, counter);
if (first < 0)
/* We're already past request deadlines before we even get a
chance to send early replies */
deleted, and is safe to take a ref to keep the req around */
atomic_inc(&rq->rq_refcount);
spin_unlock(&svc->srv_at_lock);
- if (!rq->rq_final &&
- (ptlrpc_at_send_early_reply(rq, AT_EXTRA) == 0)) {
+ if (!rq->rq_packed_final &&
+ (ptlrpc_at_send_early_reply(rq, at_extra) == 0)) {
counter++;
ptlrpc_at_add_timed(rq);
- } else if (rq->rq_final) {
- DEBUG_REQ(D_ADAPTTO, rq, "already packed final reply, "
- "not sending early");
}
- ptlrpc_server_decref(rq);
+ ptlrpc_server_req_decref(rq);
spin_lock(&svc->srv_at_lock);
}
spin_unlock(&svc->srv_at_lock);
RETURN(0);
}
-/* Handle freshly incoming reqs, check timeout, send early reply if needed,
+/* Handle freshly incoming reqs, add to timed early reply list,
pass on to regular request queue */
static int
ptlrpc_server_handle_req_in(struct ptlrpc_service *svc)
export = class_export_rpc_get(request->rq_export);
}
+ /* Discard requests whose deadline has already expired. (The
+ deadline is extended whenever we send an early reply.) */
+ if (cfs_time_current_sec() > request->rq_deadline) {
+ CERROR("Dropping timed-out opc %d request from %s"
+ ": deadline %lds ago\n",
+ lustre_msg_get_opc(request->rq_reqmsg),
+ libcfs_id2str(request->rq_peer),
+ cfs_time_current_sec() - request->rq_deadline);
+ goto put_rpc_export;
+ }
+
request->rq_phase = RQ_PHASE_INTERPRET;
CDEBUG(D_RPCTRACE, "Handling RPC pname:cluuid+ref:pid:xid:nid:opc "
libcfs_id2str(request->rq_peer),
lustre_msg_get_opc(request->rq_reqmsg));
+put_rpc_export:
if (export != NULL)
class_export_rpc_put(export);
#ifdef WITH_GROUP_INFO
struct group_info *ginfo = NULL;
#endif
- int counter, rc = 0;
+ int counter = 0, rc = 0;
ENTRY;
ptlrpc_daemonize(data->name);
if (!list_empty(&svc->srv_reply_queue))
ptlrpc_server_handle_reply (svc);
- counter = 0;
- while(!list_empty(&svc->srv_req_in_queue)) {
+ if (!list_empty(&svc->srv_req_in_queue)) {
/* Process all incoming reqs before handling any */
ptlrpc_server_handle_req_in(svc);
/* but limit ourselves in case of flood */
- if (counter++ > 1000)
- break;
+ if (counter++ < 1000)
+ continue;
+ counter = 0;
}
if (svc->srv_at_check)
/* Network access will complete in finite time but the HUGE
* timeout lets us CWARN for visibility of sluggish NALs */
- lwi = LWI_TIMEOUT(cfs_time_seconds(FOREVER), NULL, NULL);
+ lwi = LWI_TIMEOUT(cfs_time_seconds(LONG_UNLINK), NULL, NULL);
rc = l_wait_event(service->srv_waitq,
service->srv_nrqbd_receiving == 0,
&lwi);
}
run_test 61c "test race mds llog sync vs llog cleanup"
-
+#Adaptive Timeouts
at_start() #bug 3055
{
if [ -z "$ATOLDBASE" ]; then
- ATOLDBASE=$(do_facet mds "grep timebase $LPROC/mdt/MDS/mds/timeouts" | awk '{print $3}' )
+ ATOLDBASE=$(do_facet mds "sysctl -n lustre.adaptive_history")
# speed up the timebase so we can check decreasing AT
- do_facet mds "echo 8 >> $LPROC/mdt/MDS/mds/timeouts"
- do_facet mds "echo 8 >> $LPROC/mdt/MDS/mds_readpage/timeouts"
- do_facet mds "echo 8 >> $LPROC/mdt/MDS/mds_setattr/timeouts"
- do_facet ost1 "echo 8 >> $LPROC/ost/OSS/ost/timeouts"
+ do_facet mds "sysctl -w lustre.adaptive_history=8"
+ do_facet ost1 "sysctl -w lustre.adaptive_history=8"
fi
}
at_start
$LCTL dk > /dev/null
# slow down a request
- sysctl -w lustre.fail_val=30000
+ do_facet mds sysctl -w lustre.fail_val=30000
#define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a
- sysctl -w lustre.fail_loc=0x8000050a
+ do_facet mds sysctl -w lustre.fail_loc=0x8000050a
createmany -o $DIR/$tfile 10 > /dev/null
unlinkmany $DIR/$tfile 10 > /dev/null
# check for log message
run_test 67b "AT: verify instant slowdown doesn't induce reconnects"
if [ -n "$ATOLDBASE" ]; then
- do_facet mds "echo $ATOLDBASE >> $LPROC/mdt/MDS/mds/timeouts"
- do_facet mds "echo $ATOLDBASE >> $LPROC/mdt/MDS/mds_readpage/timeouts"
- do_facet mds "echo $ATOLDBASE >> $LPROC/mdt/MDS/mds_setattr/timeouts"
- do_facet ost1 "echo $ATOLDBASE >> $LPROC/ost/OSS/ost/timeouts"
+ do_facet mds "sysctl -w lustre.adaptive_history=$ATOLDBASE"
+ do_facet ost1 "sysctl -w lustre.adaptive_history=$ATOLDBASE"
fi
# end of AT tests includes above lines