Whamcloud - gitweb
b=3055
authornathan <nathan>
Mon, 3 Sep 2007 06:05:55 +0000 (06:05 +0000)
committernathan <nathan>
Mon, 3 Sep 2007 06:05:55 +0000 (06:05 +0000)
i=adilger
i=eeb (pending)
adaptive timeouts

59 files changed:
lustre/ChangeLog
lustre/include/linux/lustre_fsfilt.h
lustre/include/lprocfs_status.h
lustre/include/lustre/lustre_idl.h
lustre/include/lustre_import.h
lustre/include/lustre_lib.h
lustre/include/lustre_net.h
lustre/include/obd.h
lustre/include/obd_support.h
lustre/ldlm/ldlm_lib.c
lustre/ldlm/ldlm_lockd.c
lustre/ldlm/ldlm_request.c
lustre/liblustre/llite_lib.c
lustre/liblustre/super.c
lustre/llite/llite_lib.c
lustre/mdc/lproc_mdc.c
lustre/mds/handler.c
lustre/mds/mds_fs.c
lustre/mds/mds_internal.h
lustre/mds/mds_lov.c
lustre/mgc/mgc_request.c
lustre/mgs/mgs_handler.c
lustre/mgs/mgs_internal.h
lustre/obdclass/class_obd.c
lustre/obdclass/genops.c
lustre/obdclass/linux/linux-module.c
lustre/obdclass/linux/linux-sysctl.c
lustre/obdclass/lprocfs_status.c
lustre/obdclass/obd_config.c
lustre/obdclass/obd_mount.c
lustre/obdfilter/filter.c
lustre/obdfilter/filter_internal.h
lustre/obdfilter/filter_io.c
lustre/obdfilter/filter_io_24.c
lustre/obdfilter/filter_io_26.c
lustre/osc/lproc_osc.c
lustre/osc/osc_create.c
lustre/ost/ost_handler.c
lustre/ost/ost_internal.h
lustre/ptlrpc/client.c
lustre/ptlrpc/events.c
lustre/ptlrpc/import.c
lustre/ptlrpc/lproc_ptlrpc.c
lustre/ptlrpc/niobuf.c
lustre/ptlrpc/pack_generic.c
lustre/ptlrpc/pinger.c
lustre/ptlrpc/ptlrpc_internal.h
lustre/ptlrpc/ptlrpc_module.c
lustre/ptlrpc/service.c
lustre/ptlrpc/wiretest.c
lustre/tests/conf-sanity.sh
lustre/tests/recovery-small.sh
lustre/tests/replay-dual.sh
lustre/tests/replay-single.sh
lustre/tests/sanity.sh
lustre/tests/sanityN.sh
lustre/tests/test-framework.sh
lustre/utils/wirecheck.c
lustre/utils/wiretest.c

index 8111ab8..8d68675 100644 (file)
        * Recommended e2fsprogs version: 1.40.2-cfs1
        * Note that reiserfs quotas are disabled on SLES 10 in this kernel.
 
+Severity   : enhancement
+Bugzilla   : 3055
+Description: Adaptive timeouts
+Details    : RPC timeouts adapt to changing server load and network
+            conditions to reduce resend attempts and improve recovery time.
+
 Severity   : major
 Bugzilla   : 12932
 Description: obd_health_check_timeout too short
@@ -923,6 +929,7 @@ Bugzilla   : 12860
 Description: mds_lov_synchronize race leads to various problems
 Details    : simultaneous MDT->OST connections at startup can cause the 
             sync to abort, leaving the OSC in a bad state.
+
 Severity   : normal
 Bugzilla   : 12975
 Frequency  : rare
index a95cc61..cee0c21 100644 (file)
@@ -164,14 +164,14 @@ static inline lvfs_sbdev_type fsfilt_journal_sbdev(struct obd_device *obd,
 #define FSFILT_OP_JOIN          11
 #define FSFILT_OP_NOOP          15
 
-#define fsfilt_check_slow(obd, start, timeout, msg)                     \
+#define fsfilt_check_slow(obd, start, msg)                              \
 do {                                                                    \
         if (time_before(jiffies, start + 15 * HZ))                      \
                 break;                                                  \
         else if (time_before(jiffies, start + 30 * HZ))                 \
                 CDEBUG(D_VFSTRACE, "%s: slow %s %lus\n", obd->obd_name, \
                        msg, (jiffies-start) / HZ);                      \
-        else if (time_before(jiffies, start + timeout / 2 * HZ))        \
+        else if (time_before(jiffies, start + DISK_TIMEOUT * HZ))       \
                 CWARN("%s: slow %s %lus\n", obd->obd_name, msg,         \
                       (jiffies - start) / HZ);                          \
         else                                                            \
@@ -202,7 +202,7 @@ static inline void *fsfilt_start_log(struct obd_device *obd,
                         LBUG();
                 }
         }
-        fsfilt_check_slow(obd, now, obd_timeout, "journal start");
+        fsfilt_check_slow(obd, now, "journal start");
         return handle;
 }
 
@@ -237,7 +237,7 @@ static inline void *fsfilt_brw_start_log(struct obd_device *obd, int objcount,
                         LBUG();
                 }
         }
-        fsfilt_check_slow(obd, now, obd_timeout, "journal start");
+        fsfilt_check_slow(obd, now, "journal start");
 
         return handle;
 }
@@ -257,7 +257,7 @@ static inline int fsfilt_extend(struct obd_device *obd, struct inode *inode,
         int rc = obd->obd_fsops->fs_extend(inode, nblocks, handle);
         CDEBUG(D_INFO, "extending handle %p with %u blocks\n", handle, nblocks);
 
-        fsfilt_check_slow(obd, now, obd_timeout, "journal extend");
+        fsfilt_check_slow(obd, now, "journal extend");
 
         return rc;
 }
@@ -269,7 +269,7 @@ static inline int fsfilt_commit(struct obd_device *obd, struct inode *inode,
         int rc = obd->obd_fsops->fs_commit(inode, handle, force_sync);
         CDEBUG(D_INFO, "committing handle %p\n", handle);
 
-        fsfilt_check_slow(obd, now, obd_timeout, "journal start");
+        fsfilt_check_slow(obd, now, "journal start");
 
         return rc;
 }
@@ -282,7 +282,7 @@ static inline int fsfilt_commit_async(struct obd_device *obd,
         int rc = obd->obd_fsops->fs_commit_async(inode, handle, wait_handle);
 
         CDEBUG(D_INFO, "committing handle %p (async)\n", *wait_handle);
-        fsfilt_check_slow(obd, now, obd_timeout, "journal start");
+        fsfilt_check_slow(obd, now, "journal start");
 
         return rc;
 }
@@ -293,7 +293,7 @@ static inline int fsfilt_commit_wait(struct obd_device *obd,
         unsigned long now = jiffies;
         int rc = obd->obd_fsops->fs_commit_wait(inode, handle);
         CDEBUG(D_INFO, "waiting for completion %p\n", handle);
-        fsfilt_check_slow(obd, now, obd_timeout, "journal start");
+        fsfilt_check_slow(obd, now, "journal start");
         return rc;
 }
 
@@ -303,7 +303,7 @@ static inline int fsfilt_setattr(struct obd_device *obd, struct dentry *dentry,
         unsigned long now = jiffies;
         int rc;
         rc = obd->obd_fsops->fs_setattr(dentry, handle, iattr, do_trunc);
-        fsfilt_check_slow(obd, now, obd_timeout, "setattr");
+        fsfilt_check_slow(obd, now, "setattr");
         return rc;
 }
 
index da1dc16..beba48e 100644 (file)
@@ -150,6 +150,23 @@ struct obd_device;
 struct file;
 struct obd_histogram;
 
+/* Days / hours / mins / seconds format */
+struct dhms {
+        int d,h,m,s;
+};
+static inline void s2dhms(struct dhms *ts, time_t secs)
+{
+        ts->d = secs / 86400;
+        secs = secs % 86400;
+        ts->h = secs / 3600;
+        secs = secs % 3600;
+        ts->m = secs / 60;
+        ts->s = secs % 60;
+}
+#define DHMS_FMT "%dd%dh%02dm%02ds"
+#define DHMS_VARS(x) (x)->d, (x)->h, (x)->m, (x)->s
+
+
 #ifdef LPROCFS
 
 /* Two optimized LPROCFS counter increment functions are provided:
@@ -271,6 +288,13 @@ extern int lprocfs_rd_num_exports(char *page, char **start, off_t off,
                                   int count, int *eof, void *data);
 extern int lprocfs_rd_numrefs(char *page, char **start, off_t off,
                               int count, int *eof, void *data);
+struct adaptive_timeout;
+extern int lprocfs_at_hist_helper(char *page, int count, int rc, 
+                                  struct adaptive_timeout *at);
+extern int lprocfs_rd_timeouts(char *page, char **start, off_t off,
+                               int count, int *eof, void *data);
+extern int lprocfs_wr_timeouts(struct file *file, const char *buffer,
+                               unsigned long count, void *data);
 extern int lprocfs_wr_evict_client(struct file *file, const char *buffer,
                                    unsigned long count, void *data);
 extern int lprocfs_wr_ping(struct file *file, const char *buffer,
@@ -449,6 +473,16 @@ static inline int lprocfs_rd_num_exports(char *page, char **start, off_t off,
 static inline int lprocfs_rd_numrefs(char *page, char **start, off_t off,
                                      int count, int *eof, void *data)
 { return 0; }
+struct adaptive_timeout;
+static inline int lprocfs_at_hist_helper(char *page, int count, int rc, 
+                                         struct adaptive_timeout *at)
+{ return 0; }
+static inline int lprocfs_rd_timeouts(char *page, char **start, off_t off,
+                                      int count, int *eof, void *data)
+{ return 0; }
+static inline int lprocfs_wr_timeouts(struct file *file, const char *buffer,
+                                      unsigned long count, void *data)
+{ return 0; }
 static inline int lprocfs_wr_evict_client(struct file *file, const char *buffer,
                                           unsigned long count, void *data)
 { return 0; }
index e425c82..094998e 100644 (file)
@@ -188,7 +188,7 @@ struct lustre_msg_v2 {
         __u32 lm_secflvr;
         __u32 lm_magic;
         __u32 lm_repsize;
-        __u32 lm_timeout;
+        __u32 lm_cksum;
         __u32 lm_padding_1;
         __u32 lm_padding_2;
         __u32 lm_padding_3;
@@ -209,11 +209,11 @@ struct ptlrpc_body {
         __u32 pb_flags;
         __u32 pb_op_flags;
         __u32 pb_conn_cnt;
+        __u32 pb_timeout;  /* for req, the deadline, for rep, the service est */
+        __u32 pb_service_time; /* for rep, actual service time */
         __u32 pb_padding_1;
         __u32 pb_padding_2;
         __u32 pb_padding_3;
-        __u32 pb_padding_4;
-        __u32 pb_padding_5;
 };
 
 extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb);
@@ -247,6 +247,7 @@ extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb);
 #define MSG_LAST_REPLAY        1
 #define MSG_RESENT             2
 #define MSG_REPLAY             4
+#define MSG_AT_SUPPORT         8
 
 /*
  * Flags for all connect opcodes (MDS_CONNECT, OST_CONNECT)
@@ -293,14 +294,14 @@ extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb);
                                 OBD_CONNECT_ACL | OBD_CONNECT_XATTR | \
                                 OBD_CONNECT_IBITS | OBD_CONNECT_JOIN | \
                                 OBD_CONNECT_NODEVOH | OBD_CONNECT_ATTRFID | \
-                                OBD_CONNECT_CANCELSET)
+                                OBD_CONNECT_CANCELSET | OBD_CONNECT_AT)
 #define OST_CONNECT_SUPPORTED  (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \
                                 OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \
                                 OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_INDEX | \
                                 OBD_CONNECT_BRW_SIZE | OBD_CONNECT_QUOTA64 | \
-                                OBD_CONNECT_CANCELSET)
+                                OBD_CONNECT_CANCELSET | OBD_CONNECT_AT)
 #define ECHO_CONNECT_SUPPORTED (0)
-#define MGS_CONNECT_SUPPORTED  (OBD_CONNECT_VERSION)
+#define MGS_CONNECT_SUPPORTED  (OBD_CONNECT_VERSION | OBD_CONNECT_AT)
 
 #define MAX_QUOTA_COUNT32 ((0xffffffffULL >> QUOTABLOCK_BITS) << QUOTABLOCK_BITS)
 
index 672d045..42dd2a8 100644 (file)
@@ -8,6 +8,26 @@
 #include <lustre_handles.h>
 #include <lustre/lustre_idl.h>
 
+
+/* Adaptive Timeout stuff */
+#define D_ADAPTTO D_OTHER
+#define AT_BINS 4                  /* "bin" means "N seconds of history" */
+#define AT_TIMEBASE_DEFAULT 600    /* remembered history (sec) (should be
+                                      evenly divisible by AT_BINS) */
+#define AT_FLG_NOHIST 0x1          /* use last reported value only */
+#define AT_FLG_MIN    0x2          /* use a minimum limit */
+
+struct adaptive_timeout {
+        time_t       at_binstart;         /* bin start time */
+        time_t       at_binlimit;         /* bin time limit */
+        unsigned int at_hist[AT_BINS];    /* timeout history bins */
+        unsigned int at_flags;
+        unsigned int at_current;          /* current timeout value */
+        unsigned int at_worst_ever;       /* worst-ever timeout value */
+        time_t       at_worst_time;       /* worst-ever timeout timestamp */
+        spinlock_t   at_lock;
+};
+
 enum lustre_imp_state {
         LUSTRE_IMP_CLOSED     = 1,
         LUSTRE_IMP_NEW        = 2,
@@ -48,6 +68,14 @@ struct obd_import_conn {
         __u64                     oic_last_attempt; /* jiffies, 64-bit */
 };
 
+#define IMP_AT_MAX_PORTALS 4
+struct imp_at {
+        int                     iat_portal[IMP_AT_MAX_PORTALS];
+        struct adaptive_timeout iat_net_latency;
+        struct adaptive_timeout iat_service_estimate[IMP_AT_MAX_PORTALS];
+        time_t                  iat_drain; /* hack to slow reconnect reqs */
+};
+
 struct obd_import {
         struct portals_handle     imp_handle;
         atomic_t                  imp_refcount;
@@ -106,25 +134,35 @@ struct obd_import {
         __u64                     imp_connect_flags_orig;
 
         __u32                     imp_msg_magic;
+        __u32                     imp_msg_flags;          /* adjusted based on server capability */
 
-        struct ptlrpc_request_pool *imp_rq_pool; /* emergency request pool */
-};
-
-typedef void (*obd_import_callback)(struct obd_import *imp, void *closure,
-                                    int event, void *event_arg, void *cb_data);
+        struct ptlrpc_request_pool *imp_rq_pool;          /* emergency request pool */
 
-struct obd_import_observer {
-        struct list_head     oio_chain;
-        obd_import_callback  oio_cb;
-        void                *oio_cb_data;
+        struct imp_at             imp_at;                 /* adaptive timeout data */
+        time_t                    imp_last_reply_time;    /* for health check */
 };
 
-void class_observe_import(struct obd_import *imp, obd_import_callback cb,
-                          void *cb_data);
-void class_unobserve_import(struct obd_import *imp, obd_import_callback cb,
-                            void *cb_data);
-void class_notify_import_observers(struct obd_import *imp, int event,
-                                   void *event_arg);
+/* import.c */
+static inline void at_init(struct adaptive_timeout *at, int val, int timebase,
+                           int flags) {
+        memset(at, 0, sizeof(*at));
+        at->at_binlimit = timebase / AT_BINS;
+        at->at_current = val;
+        at->at_worst_ever = val;
+        at->at_worst_time = cfs_time_current_sec();
+        at->at_flags = flags;
+        spin_lock_init(&at->at_lock);
+}
+static inline int at_get(struct adaptive_timeout *at) {
+        extern unsigned int adaptive_timeout_max;
+        if (adaptive_timeout_max)
+                return min(at->at_current, adaptive_timeout_max);
+        return at->at_current;
+}
+void at_add(struct adaptive_timeout *at, unsigned int val);
+int import_at_get_index(struct obd_import *imp, int portal);
+int import_at_get_ldlm(struct obd_import *imp);
+#define AT_OFF (adaptive_timeout_max == 0)
 
 /* genops.c */
 struct obd_export;
index 092b3a6..1e1a116 100644 (file)
@@ -73,9 +73,8 @@ int target_handle_dqacq_callback(struct ptlrpc_request *req);
 #endif
 
 void target_cancel_recovery_timer(struct obd_device *obd);
-
-#define OBD_RECOVERY_TIMEOUT (obd_timeout * 5 / 2) /* *waves hands* */
-void target_start_recovery_timer(struct obd_device *obd, svc_handler_t handler);
+void target_start_recovery_timer(struct obd_device *obd, svc_handler_t handler,
+                                 struct ptlrpc_request *req);
 void target_abort_recovery(void *data);
 void target_cleanup_recovery(struct obd_device *obd);
 int target_queue_recovery_request(struct ptlrpc_request *req,
index 76a5eaf..9153574 100644 (file)
@@ -255,11 +255,12 @@ struct ptlrpc_request_pool {
 struct ptlrpc_request {
         int rq_type; /* one of PTL_RPC_MSG_* */
         struct list_head rq_list;
+        struct list_head rq_timed_list;         /* server-side early replies */
         struct list_head rq_history_list;       /* server-side history */
         __u64            rq_history_seq;        /* history sequence # */
         int rq_status;
         spinlock_t rq_lock;
-        /* client-side flags */
+        /* client-side flags. */
         unsigned int rq_intr:1, rq_replied:1, rq_err:1,
                 rq_timedout:1, rq_resend:1, rq_restart:1,
                 /*
@@ -273,9 +274,12 @@ struct ptlrpc_request {
                  */
                 rq_replay:1,
                 rq_no_resend:1, rq_waiting:1, rq_receiving_reply:1,
-                rq_no_delay:1, rq_net_err:1;
+                rq_no_delay:1, rq_net_err:1, rq_early:1, rq_must_unlink:1,
+                /* server-side: */
+                rq_final:1;  /* packed final reply */
         enum rq_phase rq_phase; /* one of RQ_PHASE_* */
-        atomic_t rq_refcount;   /* client-side refcount for SENT race */
+        atomic_t rq_refcount;   /* client-side refcount for SENT race,
+                                   server-side refcounf for multiple replies */
 
         struct ptlrpc_thread *rq_svc_thread; /* initial thread servicing req */
 
@@ -287,8 +291,8 @@ struct ptlrpc_request {
         int rq_reqlen;
         struct lustre_msg *rq_reqmsg;
 
-        int rq_timeout;         /* time to wait for reply (seconds) */
         int rq_replen;
+        struct lustre_msg *rq_repbuf; /* client only, buf may be bigger than msg */
         struct lustre_msg *rq_repmsg;
         __u64 rq_transno;
         __u64 rq_xid;
@@ -300,6 +304,8 @@ struct ptlrpc_request {
         int rq_import_generation;
         enum lustre_imp_state rq_send_state;
 
+        int rq_early_count;           /* how many early replies (for stats) */
+
         /* client+server request */
         lnet_handle_md_t     rq_req_md_h;
         struct ptlrpc_cb_id  rq_req_cbid;
@@ -307,6 +313,7 @@ struct ptlrpc_request {
         /* server-side... */
         struct timeval       rq_arrival_time;       /* request arrival time */
         struct ptlrpc_reply_state *rq_reply_state;  /* separated reply state */
+        struct semaphore     rq_rs_sem;             /* one reply at a time */
         struct ptlrpc_request_buffer_desc *rq_rqbd; /* incoming request buffer*/
 #ifdef CRAY_XT3
         __u32                rq_uid;            /* peer uid, used in MDS only */
@@ -327,7 +334,11 @@ struct ptlrpc_request {
         void  *rq_cb_data;
 
         struct ptlrpc_bulk_desc *rq_bulk;       /* client side bulk */
-        time_t rq_sent;                         /* when request sent, seconds */
+
+        /* client outgoing req */
+        time_t rq_sent;                  /* when request/reply sent (secs) */
+        time_t rq_deadline;              /* when request must finish */
+        int    rq_timeout;               /* service time estimate (secs) */
 
         /* Multi-rpc bits */
         struct list_head rq_set_chain;
@@ -526,13 +537,21 @@ struct ptlrpc_service {
         int              srv_n_difficult_replies; /* # 'difficult' replies */
         int              srv_n_active_reqs;     /* # reqs being served */
         cfs_duration_t   srv_rqbd_timeout;      /* timeout before re-posting reqs, in tick */
-        int              srv_watchdog_timeout;  /* soft watchdog timeout, in ms */
+        int              srv_watchdog_factor;   /* soft watchdog timeout mutiplier */
         unsigned         srv_cpu_affinity:1;    /* bind threads to CPUs */
+        unsigned         srv_at_check:1;        /* check early replies */
 
         __u32            srv_req_portal;
         __u32            srv_rep_portal;
-
-        int               srv_n_queued_reqs;    /* # reqs waiting to be served */
+        
+        /* AT stuff */
+        struct adaptive_timeout srv_at_estimate;/* estimated service time */
+        spinlock_t        srv_at_lock;
+        struct list_head  srv_at_list;          /* reqs waiting for replies */
+        cfs_timer_t       srv_at_timer;         /* early reply timer */
+
+        int               srv_n_queued_reqs;    /* # reqs in either of the queues below */
+        struct list_head  srv_req_in_queue;     /* incoming reqs */
         struct list_head  srv_request_queue;    /* reqs waiting for service */
 
         struct list_head  srv_request_history;  /* request history */
@@ -624,10 +643,13 @@ static inline int ptlrpc_bulk_active (struct ptlrpc_bulk_desc *desc)
         return (rc);
 }
 
-int ptlrpc_send_reply(struct ptlrpc_request *req, int);
+#define PTLRPC_REPLY_MAYBE_DIFFICULT 0x01
+#define PTLRPC_REPLY_EARLY           0x02
+int ptlrpc_send_reply(struct ptlrpc_request *req, int flags);
 int ptlrpc_reply(struct ptlrpc_request *req);
 int ptlrpc_error(struct ptlrpc_request *req);
 void ptlrpc_resend_req(struct ptlrpc_request *request);
+int ptlrpc_at_get_net_latency(struct ptlrpc_request *req);
 int ptl_send_rpc(struct ptlrpc_request *request, int noreply);
 int ptlrpc_register_rqbd (struct ptlrpc_request_buffer_desc *rqbd);
 
@@ -649,6 +671,17 @@ ptlrpc_client_receiving_reply (struct ptlrpc_request *req)
 }
 
 static inline int
+ptlrpc_client_must_unlink (struct ptlrpc_request *req)
+{
+        int           rc;
+
+        spin_lock(&req->rq_lock);
+        rc = req->rq_must_unlink;
+        spin_unlock(&req->rq_lock);
+        return (rc);
+}
+
+static inline int
 ptlrpc_client_replied (struct ptlrpc_request *req)
 {
         int           rc;
@@ -722,7 +755,7 @@ void ptlrpc_schedule_difficult_reply (struct ptlrpc_reply_state *rs);
 struct ptlrpc_service *ptlrpc_init_svc(int nbufs, int bufsize, int max_req_size,
                                        int max_reply_size,
                                        int req_portal, int rep_portal,
-                                       int watchdog_timeout, /* in ms */
+                                       int watchdog_factor,
                                        svc_handler_t, char *name,
                                        cfs_proc_dir_entry_t *proc_entry,
                                        svcreq_printfn_t, 
@@ -759,10 +792,14 @@ int lustre_pack_request(struct ptlrpc_request *, __u32 magic, int count,
                         int *lens, char **bufs);
 int lustre_pack_reply(struct ptlrpc_request *, int count, int *lens,
                       char **bufs);
+#define LPRFL_EARLY_REPLY 1
+int lustre_pack_reply_flags(struct ptlrpc_request *, int count, int *lens,
+                            char **bufs, int flags);
 void lustre_shrink_reply(struct ptlrpc_request *req, int segment,
                          unsigned int newlen, int move_data);
 void lustre_free_reply_state(struct ptlrpc_reply_state *rs);
 int lustre_msg_size(__u32 magic, int count, int *lengths);
+int lustre_msg_early_size(void);
 int lustre_unpack_msg(struct lustre_msg *m, int len);
 void *lustre_msg_buf(struct lustre_msg *m, int n, int minlen);
 int lustre_msg_buflen(struct lustre_msg *m, int n);
@@ -791,7 +828,12 @@ __u64 lustre_msg_get_last_committed(struct lustre_msg *msg);
 __u64 lustre_msg_get_transno(struct lustre_msg *msg);
 int   lustre_msg_get_status(struct lustre_msg *msg);
 __u32 lustre_msg_get_conn_cnt(struct lustre_msg *msg);
+int lustre_msg_is_v1(struct lustre_msg *msg);
 __u32 lustre_msg_get_magic(struct lustre_msg *msg);
+__u32 lustre_msg_get_timeout(struct lustre_msg *msg);
+__u32 lustre_msg_get_service_time(struct lustre_msg *msg);
+__u32 lustre_msg_get_cksum(struct lustre_msg *msg);
+__u32 lustre_msg_calc_cksum(struct lustre_msg *msg);
 void lustre_msg_set_handle(struct lustre_msg *msg,struct lustre_handle *handle);
 void lustre_msg_set_type(struct lustre_msg *msg, __u32 type);
 void lustre_msg_set_opc(struct lustre_msg *msg, __u32 opc);
@@ -800,6 +842,9 @@ void lustre_msg_set_last_committed(struct lustre_msg *msg,__u64 last_committed);
 void lustre_msg_set_transno(struct lustre_msg *msg, __u64 transno);
 void lustre_msg_set_status(struct lustre_msg *msg, __u32 status);
 void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt);
+void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout);
+void lustre_msg_set_service_time(struct lustre_msg *msg, __u32 service_time);
+void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum);
 
 static inline void
 ptlrpc_rs_addref(struct ptlrpc_reply_state *rs)
@@ -816,6 +861,24 @@ ptlrpc_rs_decref(struct ptlrpc_reply_state *rs)
                 lustre_free_reply_state(rs);
 }
 
+/* Should only be called once per req */
+static inline void ptlrpc_req_drop_rs(struct ptlrpc_request *req)
+{
+        if (req->rq_reply_state == NULL) 
+                return; /* shouldn't occur */
+        ptlrpc_rs_decref(req->rq_reply_state);
+        req->rq_reply_state = NULL;
+        req->rq_repmsg = NULL;
+        up(&req->rq_rs_sem); /* held since lustre_pack_reply */
+}
+
+/* Check if we already packed a normal (non-early) reply.
+   Single thread only! */
+static inline int lustre_packed_reply(struct ptlrpc_request *req)
+{
+        return req->rq_final;
+}
+
 static inline __u32 lustre_request_magic(struct ptlrpc_request *req)
 {
         return lustre_msg_get_magic(req->rq_reqmsg);
@@ -839,9 +902,10 @@ static inline int ptlrpc_req_get_repsize(struct ptlrpc_request *req)
 static inline void
 ptlrpc_req_set_repsize(struct ptlrpc_request *req, int count, int *lens)
 {
-        req->rq_replen = lustre_msg_size(req->rq_reqmsg->lm_magic, count, lens);
+        int size = lustre_msg_size(req->rq_reqmsg->lm_magic, count, lens);
+        req->rq_replen = size + lustre_msg_early_size();
         if (req->rq_reqmsg->lm_magic == LUSTRE_MSG_MAGIC_V2)
-                req->rq_reqmsg->lm_repsize = req->rq_replen;
+                req->rq_reqmsg->lm_repsize = size;
 }
 
 /* ldlm/ldlm_lib.c */
index 817058d..6551ac2 100644 (file)
@@ -613,7 +613,7 @@ static inline void oti_init(struct obd_trans_info *oti,
 
         oti->oti_xid = req->rq_xid;
 
-        if (req->rq_repmsg && req->rq_reqmsg != 0)
+        if (req->rq_reqmsg && req->rq_repmsg && req->rq_reply_state)
                 oti->oti_transno = lustre_msg_get_transno(req->rq_repmsg);
         oti->oti_thread_id = req->rq_svc_thread ? req->rq_svc_thread->t_id : -1;
         oti->oti_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg);
@@ -767,7 +767,8 @@ struct obd_device {
         struct list_head                 obd_recovery_queue;
         struct list_head                 obd_delayed_reply_queue;
         time_t                           obd_recovery_start;
-        time_t                           obd_recovery_end;
+        time_t                           obd_recovery_end; /* for lprocfs_status */
+        int                              obd_recovery_timeout;
 
         union {
                 struct obd_device_target obt;
index 5db0110..f1c1896 100644 (file)
@@ -34,11 +34,12 @@ extern unsigned int obd_fail_val;
 extern unsigned int obd_debug_peer_on_timeout;
 extern unsigned int obd_dump_on_timeout;
 extern unsigned int obd_dump_on_eviction;
+/* obd_timeout should only be used for recovery, not for 
+   networking / disk / timings affected by load (use Adaptive Timeouts) */
 extern unsigned int obd_timeout;          /* seconds */
-#define PING_INTERVAL max(obd_timeout / 4, 1U)
-#define RECONNECT_INTERVAL max(obd_timeout / 10, 10U)
-extern unsigned int ldlm_timeout;
-extern unsigned int obd_health_check_timeout;
+extern unsigned int ldlm_timeout;         /* seconds */
+extern unsigned int adaptive_timeout_min; /* seconds */
+extern unsigned int adaptive_timeout_max; /* seconds */
 extern unsigned int obd_sync_filter;
 extern unsigned int obd_max_dirty_pages;
 extern atomic_t obd_dirty_pages;
@@ -47,11 +48,31 @@ extern int obd_race_state;
 extern unsigned int obd_alloc_fail_rate;
 
 /* Timeout definitions */
-#define LDLM_TIMEOUT_DEFAULT 20
 #define OBD_TIMEOUT_DEFAULT 100
-#define HEALTH_CHECK_COEF 3 / 2
-#define HEALTH_CHECK_TIMEOUT_DEFAULT (OBD_TIMEOUT_DEFAULT * HEALTH_CHECK_COEF)
-#define HEALTH_CHECK_TIMEOUT (obd_timeout * HEALTH_CHECK_COEF)
+#define LDLM_TIMEOUT_DEFAULT 20
+/* Time to wait for all clients to reconnect during recovery */
+/* Should be very conservative; must catch the first reconnect after reboot */
+#define OBD_RECOVERY_TIMEOUT (obd_timeout * 5 / 2)
+/* Change recovery-small 26b time if you change this */
+#define PING_INTERVAL max(obd_timeout / 4, 1U)
+/* Client may skip 1 ping; wait for 2.5 */
+#define PING_EVICT_TIMEOUT (PING_INTERVAL * 5 / 2)
+#define DISK_TIMEOUT 50          /* Beyond this we warn about disk speed */
+#define CONNECTION_SWITCH_MIN 5  /* Connection switching rate limiter */
+#ifndef CRAY_XT3
+/* In general this should be low to have quick detection of a system 
+   running on a backup server. */
+#define INITIAL_CONNECT_TIMEOUT max_t(int,CONNECTION_SWITCH_MIN,obd_timeout/20)
+#else
+/* ...but for very large systems (e.g. CRAY) we need to keep the initial 
+   connect t.o. high (bz 10803), because they will nearly ALWAYS be doing the
+   connects for the first time (clients "reboot" after every process, so no
+   chance to generate adaptive timeout data. */
+#define INITIAL_CONNECT_TIMEOUT max_t(int,CONNECTION_SWITCH_MIN,obd_timeout/2)
+#endif
+#define LND_TIMEOUT 50           /* LNET LND-level RPC timeout */
+#define FOREVER 300              /* Something taking this long is broken */
+
 
 #define OBD_FAIL_MDS                     0x100
 #define OBD_FAIL_MDS_HANDLE_UNPACK       0x101
@@ -131,7 +152,7 @@ extern unsigned int obd_alloc_fail_rate;
 #define OBD_FAIL_OST_BRW_READ_BULK       0x20f
 #define OBD_FAIL_OST_SYNC_NET            0x210
 #define OBD_FAIL_OST_ALL_REPLY_NET       0x211
-#define OBD_FAIL_OST_ALL_REQUESTS_NET    0x212
+#define OBD_FAIL_OST_ALL_REQUEST_NET     0x212
 #define OBD_FAIL_OST_LDLM_REPLY_NET      0x213
 #define OBD_FAIL_OST_BRW_PAUSE_BULK      0x214
 #define OBD_FAIL_OST_ENOSPC              0x215
@@ -147,6 +168,7 @@ extern unsigned int obd_alloc_fail_rate;
 #define OBD_FAIL_OST_HOLD_WRITE_RPC      0x21f
 #define OBD_FAIL_OST_LLOG_RECOVERY_TIMEOUT 0x221
 #define OBD_FAIL_OST_CANCEL_COOKIE_TIMEOUT 0x222
+#define OBD_FAIL_OST_PAUSE_CREATE        0x223
 
 #define OBD_FAIL_LDLM                    0x300
 #define OBD_FAIL_LDLM_NAMESPACE_NEW      0x301
@@ -187,6 +209,8 @@ extern unsigned int obd_alloc_fail_rate;
 #define OBD_FAIL_PTLRPC_DELAY_SEND       0x506
 #define OBD_FAIL_PTLRPC_DELAY_RECOV      0x507
 #define OBD_FAIL_PTLRPC_CLIENT_BULK_CB   0x508
+#define OBD_FAIL_PTLRPC_PAUSE_REQ        0x50a
+#define OBD_FAIL_PTLRPC_PAUSE_REP        0x50c
 
 #define OBD_FAIL_OBD_PING_NET            0x600
 #define OBD_FAIL_OBD_LOG_CANCEL_NET      0x601
@@ -208,9 +232,9 @@ extern unsigned int obd_alloc_fail_rate;
 #define OBD_FAIL_MGS                     0x900
 #define OBD_FAIL_MGS_ALL_REQUEST_NET     0x901
 #define OBD_FAIL_MGS_ALL_REPLY_NET       0x902
-#define OBD_FAIL_MGC_PROCESS_LOG         0x903
-#define OBD_FAIL_MGS_SLOW_REQUEST_NET    0x904
-#define OBD_FAIL_MGS_SLOW_TARGET_REG     0x905
+#define OBD_FAIL_MGC_PAUSE_PROCESS_LOG   0x903
+#define OBD_FAIL_MGS_PAUSE_REQ           0x904
+#define OBD_FAIL_MGS_PAUSE_TARGET_REG    0x905
 
 #define OBD_FAIL_QUOTA_QD_COUNT_32BIT    0xA00
 
index 2ea0d26..48229aa 100644 (file)
@@ -729,15 +729,15 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler)
                 OBD_FAIL_TIMEOUT(OBD_FAIL_TGT_DELAY_CONNECT, 2 * obd_timeout);
         } else if (req->rq_export == NULL &&
                    atomic_read(&export->exp_rpc_count) > 0) {
-                CWARN("%s: refuse connection from %s/%s to 0x%p/%d\n",
-                      target->obd_name, cluuid.uuid,
+                CWARN("%s: refuse connection from %s/%s to 0x%p; still busy "
+                      "with %d references\n", target->obd_name, cluuid.uuid,
                       libcfs_nid2str(req->rq_peer.nid),
                       export, atomic_read(&export->exp_refcount));
                 GOTO(out, rc = -EBUSY);
         } else if (req->rq_export != NULL &&
                    atomic_read(&export->exp_rpc_count) > 1) {
-                CWARN("%s: refuse reconnection from %s@%s to 0x%p/%d\n",
-                      target->obd_name, cluuid.uuid,
+                CWARN("%s: refuse reconnection from %s@%s to 0x%p; still busy "
+                      "with %d active RPCs\n", target->obd_name, cluuid.uuid,
                       libcfs_nid2str(req->rq_peer.nid),
                       export, atomic_read(&export->exp_rpc_count));
                 GOTO(out, rc = -EBUSY);
@@ -763,7 +763,7 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler)
         /* If this is the first client, start the recovery timer */
         if (target->obd_recovering) {
                 lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECOVERING);
-                target_start_recovery_timer(target, handler);
+                target_start_recovery_timer(target, handler, req);
         }
 
         /* Tell the client if we support replayable requests */
@@ -862,8 +862,10 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler)
                                     &export->exp_nid_hash);
         spin_unlock(&target->obd_dev_lock);
 
-        if (lustre_msg_get_op_flags(req->rq_repmsg) & MSG_CONNECT_RECONNECT)
-                GOTO(out, rc = 0);
+        if (lustre_msg_get_op_flags(req->rq_repmsg) & MSG_CONNECT_RECONNECT) {
+                revimp = class_import_get(export->exp_imp_reverse);
+                GOTO(set_flags, rc = 0);
+        }
 
         if (target->obd_recovering)
                 target->obd_connected_clients++;
@@ -880,10 +882,13 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler)
         revimp->imp_remote_handle = conn;
         revimp->imp_dlm_fake = 1;
         revimp->imp_state = LUSTRE_IMP_FULL;
-
+set_flags:
         if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_NEXT_VER) {
+                /* Client wants v2 */
                 revimp->imp_msg_magic = LUSTRE_MSG_MAGIC_V2;
                 lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_NEXT_VER);
+                if (export->exp_connect_flags & OBD_CONNECT_AT)
+                        revimp->imp_msg_flags |= MSG_AT_SUPPORT;
         }
 
         class_import_put(revimp);
@@ -892,7 +897,7 @@ out:
                 spin_lock(&export->exp_lock);
                 export->exp_connecting = 0;
                 spin_unlock(&export->exp_lock);
-       }
+        }
         if (targref)
                 class_decref(targref);
         if (rc)
@@ -934,11 +939,7 @@ void target_destroy_export(struct obd_export *exp)
 
 static void target_release_saved_req(struct ptlrpc_request *req)
 {
-        if (req->rq_reply_state != NULL) {
-                ptlrpc_rs_decref(req->rq_reply_state);
-                /* req->rq_reply_state = NULL; */
-        }
-
+        ptlrpc_req_drop_rs(req);
         class_export_put(req->rq_export);
         OBD_FREE(req->rq_reqmsg, req->rq_reqlen);
         OBD_FREE(req, sizeof *req);
@@ -968,7 +969,7 @@ static void target_finish_recovery(struct obd_device *obd)
                 ptlrpc_reply(req);
                 target_release_saved_req(req);
         }
-        obd->obd_recovery_end = CURRENT_SECONDS;
+        obd->obd_recovery_end = cfs_time_current_sec();
 }
 
 static void abort_recovery_queue(struct obd_device *obd)
@@ -1080,38 +1081,44 @@ void target_cancel_recovery_timer(struct obd_device *obd)
         cfs_timer_disarm(&obd->obd_recovery_timer);
 }
 
-static void reset_recovery_timer(struct obd_device *obd)
+static void reset_recovery_timer(struct obd_device *obd,
+                                 struct ptlrpc_request *req, int first)
 {
         spin_lock_bh(&obd->obd_processing_task_lock);
         if (!obd->obd_recovering) {
                 spin_unlock_bh(&obd->obd_processing_task_lock);
                 return;
         }
+        /* Track the client's largest expected replay time */
+        obd->obd_recovery_timeout = 
+                max((first ? (int)OBD_RECOVERY_TIMEOUT : 
+                     obd->obd_recovery_timeout),
+                    (int)lustre_msg_get_timeout(req->rq_reqmsg));
         cfs_timer_arm(&obd->obd_recovery_timer, 
-                      cfs_time_shift(OBD_RECOVERY_TIMEOUT));
+                      cfs_time_shift(obd->obd_recovery_timeout));
         spin_unlock_bh(&obd->obd_processing_task_lock);
-        CDEBUG(D_HA, "%s: timer will expire in %u seconds\n", obd->obd_name,
-               OBD_RECOVERY_TIMEOUT);
-        /* Only used for lprocfs_status */
-        obd->obd_recovery_end = CURRENT_SECONDS + OBD_RECOVERY_TIMEOUT;
+        CDEBUG(D_HA, "%s: recovery timer will expire in %u seconds\n", 
+               obd->obd_name, obd->obd_recovery_timeout);
+        obd->obd_recovery_end = cfs_time_current_sec() + 
+                obd->obd_recovery_timeout;
 }
 
 
 /* Only start it the first time called */
-void target_start_recovery_timer(struct obd_device *obd, svc_handler_t handler)
+void target_start_recovery_timer(struct obd_device *obd, svc_handler_t handler,
+                                 struct ptlrpc_request *req)
 {
         spin_lock_bh(&obd->obd_processing_task_lock);
         if (obd->obd_recovery_handler) {
                 spin_unlock_bh(&obd->obd_processing_task_lock);
                 return;
         }
-        CWARN("%s: starting recovery timer (%us)\n", obd->obd_name,
-              OBD_RECOVERY_TIMEOUT);
+        CWARN("%s: starting recovery timer\n", obd->obd_name);
         obd->obd_recovery_handler = handler;
         cfs_timer_init(&obd->obd_recovery_timer, target_recovery_expired, obd);
         spin_unlock_bh(&obd->obd_processing_task_lock);
 
-        reset_recovery_timer(obd);
+        reset_recovery_timer(obd, req, 1);
 }
 
 static int check_for_next_transno(struct obd_device *obd)
@@ -1193,14 +1200,11 @@ static void process_recovery_queue(struct obd_device *obd)
                 DEBUG_REQ(D_HA, req, "processing: ");
                 (void)obd->obd_recovery_handler(req);
                 obd->obd_replayed_requests++;
-                reset_recovery_timer(obd);
+                reset_recovery_timer(obd, req, 0);
                 /* bug 1580: decide how to properly sync() in recovery */
                 //mds_fsync_super(obd->u.obt.obt_sb);
                 class_export_put(req->rq_export);
-                if (req->rq_reply_state != NULL) {
-                        ptlrpc_rs_decref(req->rq_reply_state);
-                        /* req->rq_reply_state = NULL; */
-                }
+                ptlrpc_req_drop_rs(req);
                 OBD_FREE(req->rq_reqmsg, req->rq_reqlen);
                 OBD_FREE(req, sizeof *req);
                 spin_lock_bh(&obd->obd_processing_task_lock);
@@ -1332,12 +1336,13 @@ int target_queue_final_reply(struct ptlrpc_request *req, int rc)
         struct lustre_msg *reqmsg;
         int recovery_done = 0;
 
-        LASSERT ((rc == 0) == (req->rq_reply_state != NULL));
+        LASSERT ((rc == 0) == lustre_packed_reply(req));
 
-        if (rc) {
+        if (!lustre_packed_reply(req)) {
                 /* Just like ptlrpc_error, but without the sending. */
                 rc = lustre_pack_reply(req, 1, NULL, NULL);
-                LASSERT(rc == 0); /* XXX handle this */
+                if (rc)
+                        CERROR("pack error %d\n", rc);
                 req->rq_type = PTL_RPC_MSG_ERR;
         }
 
@@ -1417,7 +1422,7 @@ target_send_reply_msg (struct ptlrpc_request *req, int rc, int fail_id)
                 DEBUG_REQ(D_NET, req, "sending reply");
         }
 
-        return (ptlrpc_send_reply(req, 1));
+        return (ptlrpc_send_reply(req, PTLRPC_REPLY_MAYBE_DIFFICULT));
 }
 
 void
index 7fd3d0b..15bd1b8 100644 (file)
@@ -58,10 +58,10 @@ inline cfs_time_t round_timeout(cfs_time_t timeout)
         return cfs_time_seconds((int)cfs_duration_sec(cfs_time_sub(timeout, 0)) + 1);
 }
 
-/* timeout for initial callback (AST) reply */
-static inline unsigned int ldlm_get_rq_timeout(unsigned int ldlm_timeout,
-                                               unsigned int obd_timeout)
+/* timeout for initial callback (AST) reply (bz10399) */
+static inline unsigned int ldlm_get_rq_timeout(void)
 {
+        /* Non-AT value */
         unsigned int timeout = min(ldlm_timeout, obd_timeout / 3);
 
         return timeout < 1 ? 1 : timeout;
@@ -204,15 +204,15 @@ static void waiting_locks_callback(unsigned long unused)
                 lock = list_entry(waiting_locks_list.next, struct ldlm_lock,
                                   l_pending_chain);
 
-                if (cfs_time_after(lock->l_callback_timeout, cfs_time_current()) ||
-                    (lock->l_req_mode == LCK_GROUP))
+                if (cfs_time_after(lock->l_callback_timeout, cfs_time_current())
+                    || (lock->l_req_mode == LCK_GROUP))
                         break;
 
-                LDLM_ERROR(lock, "lock callback timer expired: evicting client "
-                           "%s@%s nid %s ",lock->l_export->exp_client_uuid.uuid,
-                           lock->l_export->exp_connection->c_remote_uuid.uuid,
-                           libcfs_nid2str(lock->l_export->exp_connection->c_peer.nid));
-
+                LDLM_ERROR(lock, "lock callback timer expired after %lds: "
+                           "evicting client at %s ",
+                           cfs_time_current_sec()- lock->l_enqueued_time.tv_sec,
+                           libcfs_nid2str(
+                                   lock->l_export->exp_connection->c_peer.nid));
                 if (lock == last) {
                         LDLM_ERROR(lock, "waiting on lock multiple times");
                         CERROR("wll %p n/p %p/%p, l_pending %p n/p %p/%p\n",
@@ -270,20 +270,23 @@ static void waiting_locks_callback(unsigned long unused)
  */
 static int __ldlm_add_waiting_lock(struct ldlm_lock *lock)
 {
+        int timeout = obd_timeout / 2;  /* Non-AT value */
         cfs_time_t timeout_rounded;
 
         if (!list_empty(&lock->l_pending_chain))
                 return 0;
 
-        lock->l_callback_timeout =cfs_time_add(cfs_time_current(),
-                                               cfs_time_seconds(obd_timeout)/2);
+        if (lock->l_export && !AT_OFF) 
+                timeout = import_at_get_ldlm(lock->l_export->exp_imp_reverse);
+
+        lock->l_callback_timeout = cfs_time_shift(timeout);
 
         timeout_rounded = round_timeout(lock->l_callback_timeout);
 
-        if (cfs_time_before(timeout_rounded, cfs_timer_deadline(&waiting_locks_timer)) ||
+        if (cfs_time_before(timeout_rounded, 
+                            cfs_timer_deadline(&waiting_locks_timer)) ||
             !cfs_timer_is_armed(&waiting_locks_timer)) {
                 cfs_timer_arm(&waiting_locks_timer, timeout_rounded);
-
         }
         list_add_tail(&lock->l_pending_chain, &waiting_locks_list); /* FIFO */
         return 1;
@@ -614,7 +617,9 @@ int ldlm_server_blocking_ast(struct ldlm_lock *lock,
         }
 
         req->rq_send_state = LUSTRE_IMP_FULL;
-        req->rq_timeout = ldlm_get_rq_timeout(ldlm_timeout, obd_timeout);
+        /* ptlrpc_prep_req already set timeout */
+        if (AT_OFF)
+                req->rq_timeout = ldlm_get_rq_timeout();
 
         if (lock->l_export && lock->l_export->exp_ldlm_stats)
                 lprocfs_counter_incr(lock->l_export->exp_ldlm_stats,
@@ -644,7 +649,8 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data)
         total_enqueue_wait = cfs_timeval_sub(&granted_time,
                                              &lock->l_enqueued_time, NULL);
 
-        if (total_enqueue_wait / 1000000 > obd_timeout)
+        if (total_enqueue_wait / ONE_MILLION > obd_timeout)
+                /* non-fatal with AT - change to LDLM_DEBUG? */
                 LDLM_ERROR(lock, "enqueue wait took %luus from %lu",
                            total_enqueue_wait, lock->l_enqueued_time.tv_sec);
 
@@ -687,7 +693,9 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data)
         ptlrpc_req_set_repsize(req, 1, NULL);
 
         req->rq_send_state = LUSTRE_IMP_FULL;
-        req->rq_timeout = ldlm_get_rq_timeout(ldlm_timeout, obd_timeout);
+        /* ptlrpc_prep_req already set timeout */
+        if (AT_OFF)
+                req->rq_timeout = ldlm_get_rq_timeout();
 
         /* We only send real blocking ASTs after the lock is granted */
         lock_res_and_lock(lock);
@@ -750,7 +758,9 @@ int ldlm_server_glimpse_ast(struct ldlm_lock *lock, void *data)
         ptlrpc_req_set_repsize(req, 2, size);
 
         req->rq_send_state = LUSTRE_IMP_FULL;
-        req->rq_timeout = ldlm_get_rq_timeout(ldlm_timeout, obd_timeout);
+        /* ptlrpc_prep_req already set timeout */
+        if (AT_OFF)
+                req->rq_timeout = ldlm_get_rq_timeout();
 
         if (lock->l_export && lock->l_export->exp_ldlm_stats)
                 lprocfs_counter_incr(lock->l_export->exp_ldlm_stats,
@@ -1010,7 +1020,7 @@ existing_lock:
         EXIT;
  out:
         req->rq_status = rc ?: err;  /* return either error - bug 11190 */
-        if (req->rq_reply_state == NULL) {
+        if (!lustre_packed_reply(req)) {
                 err = lustre_pack_reply(req, 1, NULL, NULL);
                 if (rc == 0)
                         rc = err;
@@ -1340,7 +1350,7 @@ static void ldlm_handle_gl_callback(struct ptlrpc_request *req,
 static int ldlm_callback_reply(struct ptlrpc_request *req, int rc)
 {
         req->rq_status = rc;
-        if (req->rq_reply_state == NULL) {
+        if (!lustre_packed_reply(req)) {
                 rc = lustre_pack_reply(req, 1, NULL, NULL);
                 if (rc)
                         return rc;
@@ -1727,7 +1737,7 @@ static int ldlm_setup(void)
         ldlm_state->ldlm_cb_service =
                 ptlrpc_init_svc(LDLM_NBUFS, LDLM_BUFSIZE, LDLM_MAXREQSIZE,
                                 LDLM_MAXREPSIZE, LDLM_CB_REQUEST_PORTAL,
-                                LDLM_CB_REPLY_PORTAL, ldlm_timeout * 900,
+                                LDLM_CB_REPLY_PORTAL, 1800,
                                 ldlm_callback_handler, "ldlm_cbd",
                                 ldlm_svc_proc_dir, NULL,
                                 LDLM_THREADS_AUTO_MIN, LDLM_THREADS_AUTO_MAX,
@@ -1741,7 +1751,7 @@ static int ldlm_setup(void)
         ldlm_state->ldlm_cancel_service =
                 ptlrpc_init_svc(LDLM_NBUFS, LDLM_BUFSIZE, LDLM_MAXREQSIZE,
                                 LDLM_MAXREPSIZE, LDLM_CANCEL_REQUEST_PORTAL,
-                                LDLM_CANCEL_REPLY_PORTAL, ldlm_timeout * 6000,
+                                LDLM_CANCEL_REPLY_PORTAL, 6000,
                                 ldlm_cancel_handler, "ldlm_canceld",
                                 ldlm_svc_proc_dir, NULL,
                                 LDLM_THREADS_AUTO_MIN, LDLM_THREADS_AUTO_MAX,
index c1b0030..d9f5dcf 100644 (file)
@@ -61,7 +61,8 @@ int ldlm_expired_completion_wait(void *data)
                 LDLM_ERROR(lock, "lock timed out (enqueued at %lu, %lus ago); "
                            "not entering recovery in server code, just going "
                            "back to sleep", lock->l_enqueued_time.tv_sec,
-                           CURRENT_SECONDS - lock->l_enqueued_time.tv_sec);
+                           cfs_time_current_sec() -
+                           lock->l_enqueued_time.tv_sec);
                 if (cfs_time_after(cfs_time_current(), next_dump)) {
                         last_dump = next_dump;
                         next_dump = cfs_time_shift(300);
@@ -91,6 +92,7 @@ int ldlm_completion_ast(struct ldlm_lock *lock, int flags, void *data)
         struct obd_device *obd;
         struct obd_import *imp = NULL;
         struct l_wait_info lwi;
+        __u32 timeout = obd_timeout; /* Non-AT value */
         int rc = 0;
         ENTRY;
 
@@ -115,8 +117,10 @@ noreproc:
         obd = class_exp2obd(lock->l_conn_export);
 
         /* if this is a local lock, then there is no import */
-        if (obd != NULL)
+        if (obd != NULL) {
                 imp = obd->u.cli.cl_import;
+                timeout = import_at_get_ldlm(imp);
+        }
 
         lwd.lwd_lock = lock;
 
@@ -124,7 +128,7 @@ noreproc:
                 LDLM_DEBUG(lock, "waiting indefinitely because of NO_TIMEOUT");
                 lwi = LWI_INTR(interrupted_completion_wait, &lwd);
         } else {
-                lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(obd_timeout),
+                lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(timeout),
                                        ldlm_expired_completion_wait,
                                        interrupted_completion_wait, &lwd);
         }
index 4da1a09..9c19665 100644 (file)
@@ -155,7 +155,7 @@ int liblustre_process_log(struct config_llog_instance *cfg,
         if (ocd == NULL)
                 GOTO(out_cleanup, rc = -ENOMEM);
 
-        ocd->ocd_connect_flags = OBD_CONNECT_VERSION;
+        ocd->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_AT;
         ocd->ocd_version = LUSTRE_VERSION_CODE;
 
         rc = obd_connect(&mgc_conn, obd, &mgc_uuid, ocd);
@@ -286,7 +286,7 @@ int _sysio_lustre_init(void)
                         obd_timeout);
         }
 
-       /* debug peer on timeout? */
+        /* debug peer on timeout? */
         envstr = getenv("LIBLUSTRE_DEBUG_PEER_ON_TIMEOUT");
         if (envstr != NULL) {
                 obd_debug_peer_on_timeout = 
index fa7ff8f..eaa21ec 100644 (file)
@@ -1935,7 +1935,8 @@ llu_fsswop_mount(const char *source,
         obd_set_info_async(obd->obd_self_export, strlen("async"), "async",
                            sizeof(async), &async, NULL);
 
-        ocd.ocd_connect_flags = OBD_CONNECT_IBITS | OBD_CONNECT_VERSION;
+        ocd.ocd_connect_flags = OBD_CONNECT_IBITS | OBD_CONNECT_VERSION |
+                OBD_CONNECT_AT;
         ocd.ocd_ibits_known = MDS_INODELOCK_FULL;
         ocd.ocd_version = LUSTRE_VERSION_CODE;
 
@@ -1968,7 +1969,7 @@ llu_fsswop_mount(const char *source,
         obd->obd_upcall.onu_upcall = ll_ocd_update;
 
         ocd.ocd_connect_flags = OBD_CONNECT_SRVLOCK | OBD_CONNECT_REQPORTAL |
-                                OBD_CONNECT_VERSION | OBD_CONNECT_TRUNCLOCK;
+                OBD_CONNECT_VERSION | OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_AT;
         ocd.ocd_version = LUSTRE_VERSION_CODE;
         err = obd_connect(&osc_conn, obd, &sbi->ll_sb_uuid, &ocd);
         if (err) {
index 1ef3295..0719ac2 100644 (file)
@@ -156,10 +156,9 @@ static int client_common_fill_super(struct super_block *sb,
         }
 
         /* indicate the features supported by this client */
-        data->ocd_connect_flags = OBD_CONNECT_IBITS | OBD_CONNECT_NODEVOH |
-                                  OBD_CONNECT_JOIN |
-                                  OBD_CONNECT_ATTRFID | OBD_CONNECT_VERSION |
-                                  OBD_CONNECT_CANCELSET;
+        data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_IBITS |
+                OBD_CONNECT_JOIN | OBD_CONNECT_ATTRFID | OBD_CONNECT_NODEVOH |
+                OBD_CONNECT_CANCELSET | OBD_CONNECT_AT;
 #ifdef CONFIG_FS_POSIX_ACL
         data->ocd_connect_flags |= OBD_CONNECT_ACL;
 #endif
@@ -257,9 +256,9 @@ static int client_common_fill_super(struct super_block *sb,
                 GOTO(out_mdc, err = -ENODEV);
         }
 
-        data->ocd_connect_flags = OBD_CONNECT_GRANT | OBD_CONNECT_VERSION |
-                                  OBD_CONNECT_REQPORTAL | OBD_CONNECT_BRW_SIZE |
-                                  OBD_CONNECT_SRVLOCK | OBD_CONNECT_CANCELSET;
+        data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_GRANT |
+                OBD_CONNECT_REQPORTAL | OBD_CONNECT_BRW_SIZE | 
+                OBD_CONNECT_SRVLOCK | OBD_CONNECT_CANCELSET | OBD_CONNECT_AT;
 
         CDEBUG(D_RPCTRACE, "ocd_connect_flags: "LPX64" ocd_version: %d "
                "ocd_grant: %d\n", data->ocd_connect_flags,
index 27107cd..cfba05b 100644 (file)
@@ -74,11 +74,12 @@ static struct lprocfs_vars lprocfs_obd_vars[] = {
         { "kbytesavail",     lprocfs_rd_kbytesavail, 0, 0 },
         { "filestotal",      lprocfs_rd_filestotal,  0, 0 },
         { "filesfree",       lprocfs_rd_filesfree,   0, 0 },
-        //{ "filegroups",      lprocfs_rd_filegroups,  0, 0 },
+        /*{ "filegroups",      lprocfs_rd_filegroups,  0, 0 },*/
         { "mds_server_uuid", lprocfs_rd_server_uuid, 0, 0 },
         { "mds_conn_uuid",   lprocfs_rd_conn_uuid,   0, 0 },
         { "max_rpcs_in_flight", mdc_rd_max_rpcs_in_flight,
                                 mdc_wr_max_rpcs_in_flight, 0 },
+        { "timeouts",        lprocfs_rd_timeouts, lprocfs_wr_timeouts, 0 },
         { 0 }
 };
 
index 4fb8b47..db7b876 100644 (file)
@@ -79,6 +79,7 @@ static int mds_sendpage(struct ptlrpc_request *req, struct file *file,
         struct ptlrpc_bulk_desc *desc;
         struct l_wait_info lwi;
         struct page **pages;
+        int timeout;
         int rc = 0, npages, i, tmpcount, tmpsize = 0;
         ENTRY;
 
@@ -130,7 +131,12 @@ static int mds_sendpage(struct ptlrpc_request *req, struct file *file,
                 GOTO(abort_bulk, rc);
         }
 
-        lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, NULL, NULL);
+        timeout = (int)req->rq_deadline - (int)cfs_time_current_sec();
+        if (timeout < 0) {
+                CERROR("Req deadline already passed %lu (now: %lu)\n",
+                       req->rq_deadline, cfs_time_current_sec());
+        }
+        lwi = LWI_TIMEOUT(max(timeout, 1) * HZ, NULL, NULL);
         rc = l_wait_event(desc->bd_waitq, !ptlrpc_bulk_active(desc), &lwi);
         LASSERT (rc == 0 || rc == -ETIMEDOUT);
 
@@ -1022,7 +1028,7 @@ static int mds_getattr_lock(struct ptlrpc_request *req, int offset,
                 pop_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc);
         default:
                 mds_exit_ucred(&uc, mds);
-                if (req->rq_reply_state == NULL) {
+                if (!lustre_packed_reply(req)) {
                         req->rq_status = rc;
                         lustre_pack_reply(req, 1, NULL, NULL);
                 }
@@ -1072,7 +1078,7 @@ static int mds_getattr(struct ptlrpc_request *req, int offset)
 out_pop:
         pop_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc);
 out_ucred:
-        if (req->rq_reply_state == NULL) {
+        if (!lustre_packed_reply(req)) {
                 req->rq_status = rc;
                 lustre_pack_reply(req, 1, NULL, NULL);
         }
@@ -1097,13 +1103,15 @@ static int mds_obd_statfs(struct obd_device *obd, struct obd_statfs *osfs,
 static int mds_statfs(struct ptlrpc_request *req)
 {
         struct obd_device *obd = req->rq_export->exp_obd;
+        struct ptlrpc_service *svc = req->rq_rqbd->rqbd_service;
         int rc, size[2] = { sizeof(struct ptlrpc_body),
                             sizeof(struct obd_statfs) };
         ENTRY;
 
         /* This will trigger a watchdog timeout */
         OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_STATFS_LCW_SLEEP,
-                         (MDS_SERVICE_WATCHDOG_TIMEOUT / 1000) + 1);
+                         (MDS_SERVICE_WATCHDOG_FACTOR * 
+                          at_get(&svc->srv_at_estimate) / 1000) + 1);
         OBD_COUNTER_INCREMENT(obd, statfs);
 
         rc = lustre_pack_reply(req, 2, size, NULL);
@@ -2018,8 +2026,8 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf)
                               obd->obd_recoverable_clients,
                               (obd->obd_recoverable_clients == 1) ?
                               "client" : "clients",
-                              (int)(OBD_RECOVERY_TIMEOUT) / 60,
-                              (int)(OBD_RECOVERY_TIMEOUT) % 60,
+                              obd->obd_recovery_timeout / 60,
+                              obd->obd_recovery_timeout % 60,
                               obd->obd_name);
         } else {
                 LCONSOLE_INFO("MDT %s now serving %s (%s%s%s) with recovery "
@@ -2539,7 +2547,7 @@ static int mdt_setup(struct obd_device *obd, obd_count len, void *buf)
         mds->mds_service =
                 ptlrpc_init_svc(MDS_NBUFS, MDS_BUFSIZE, MDS_MAXREQSIZE,
                                 MDS_MAXREPSIZE, MDS_REQUEST_PORTAL,
-                                MDC_REPLY_PORTAL, MDS_SERVICE_WATCHDOG_TIMEOUT,
+                                MDC_REPLY_PORTAL, MDS_SERVICE_WATCHDOG_FACTOR,
                                 mds_handle, LUSTRE_MDS_NAME,
                                 obd->obd_proc_entry, NULL, 
                                 mds_min_threads, mds_max_threads, "ll_mdt");
@@ -2556,7 +2564,7 @@ static int mdt_setup(struct obd_device *obd, obd_count len, void *buf)
         mds->mds_setattr_service =
                 ptlrpc_init_svc(MDS_NBUFS, MDS_BUFSIZE, MDS_MAXREQSIZE,
                                 MDS_MAXREPSIZE, MDS_SETATTR_PORTAL,
-                                MDC_REPLY_PORTAL, MDS_SERVICE_WATCHDOG_TIMEOUT,
+                                MDC_REPLY_PORTAL, MDS_SERVICE_WATCHDOG_FACTOR,
                                 mds_handle, "mds_setattr",
                                 obd->obd_proc_entry, NULL,
                                 mds_min_threads, mds_max_threads,
@@ -2573,7 +2581,7 @@ static int mdt_setup(struct obd_device *obd, obd_count len, void *buf)
         mds->mds_readpage_service =
                 ptlrpc_init_svc(MDS_NBUFS, MDS_BUFSIZE, MDS_MAXREQSIZE,
                                 MDS_MAXREPSIZE, MDS_READPAGE_PORTAL,
-                                MDC_REPLY_PORTAL, MDS_SERVICE_WATCHDOG_TIMEOUT,
+                                MDC_REPLY_PORTAL, MDS_SERVICE_WATCHDOG_FACTOR,
                                 mds_handle, "mds_readpage",
                                 obd->obd_proc_entry, NULL, 
                                 MDS_THREADS_MIN_READPAGE, mds_max_threads,
index 70fb742..2c34203 100644 (file)
@@ -446,9 +446,9 @@ static int mds_init_server_data(struct obd_device *obd, struct file *file)
                 obd->obd_next_recovery_transno = obd->obd_last_committed + 1;
                 obd->obd_recovering = 1;
                 obd->obd_recovery_start = CURRENT_SECONDS;
-                /* Only used for lprocfs_status */
+                obd->obd_recovery_timeout = OBD_RECOVERY_TIMEOUT;
                 obd->obd_recovery_end = obd->obd_recovery_start +
-                        OBD_RECOVERY_TIMEOUT;
+                        obd->obd_recovery_timeout;
         }
 
         mds->mds_mount_count = mount_count + 1;
index c5355e9..d7c0db5 100644 (file)
@@ -26,7 +26,7 @@ struct mds_client_data {
         __u8 mcd_padding[LR_CLIENT_SIZE - 88];
 };
 
-#define MDS_SERVICE_WATCHDOG_TIMEOUT (obd_timeout * 1000)
+#define MDS_SERVICE_WATCHDOG_FACTOR 2000
 
 #define MAX_ATIME_DIFF 60
 
index e2cb1c8..9633c97 100644 (file)
@@ -337,7 +337,7 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name)
         if (data == NULL)
                 RETURN(-ENOMEM);
         data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_INDEX |
-                                  OBD_CONNECT_REQPORTAL | OBD_CONNECT_QUOTA64;
+                OBD_CONNECT_REQPORTAL | OBD_CONNECT_QUOTA64 | OBD_CONNECT_AT;
         data->ocd_version = LUSTRE_VERSION_CODE;
         /* NB: lov_connect() needs to fill in .ocd_index for each OST */
         rc = obd_connect(&conn, mds->mds_osc_obd, &obd->obd_uuid, data);
@@ -575,8 +575,8 @@ int mds_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
                 void *handle;
                 struct inode *inode = obd->u.obt.obt_sb->s_root->d_inode;
                 BDEVNAME_DECLARE_STORAGE(tmp);
-                CERROR("*** setting device %s read-only ***\n",
-                       ll_bdevname(obd->u.obt.obt_sb, tmp));
+                LCONSOLE_WARN("*** setting obd %s device '%s' read-only ***\n",
+                       obd->obd_name, ll_bdevname(obd->u.obt.obt_sb, tmp));
 
                 handle = fsfilt_start(obd, inode, FSFILT_OP_MKNOD, NULL);
                 if (!IS_ERR(handle))
index 762cbd8..6020017 100644 (file)
@@ -1051,7 +1051,7 @@ static int mgc_process_log(struct obd_device *mgc,
         if (cld->cld_stopping) 
                 RETURN(0);
 
-        OBD_FAIL_TIMEOUT(OBD_FAIL_MGC_PROCESS_LOG, 20);
+        OBD_FAIL_TIMEOUT(OBD_FAIL_MGC_PAUSE_PROCESS_LOG, 20);
 
         lsi = s2lsi(cld->cld_cfg.cfg_sb);
 
index e93134d..e17634d 100644 (file)
@@ -173,7 +173,7 @@ static int mgs_setup(struct obd_device *obd, obd_count len, void *buf)
         mgs->mgs_service =
                 ptlrpc_init_svc(MGS_NBUFS, MGS_BUFSIZE, MGS_MAXREQSIZE,
                                 MGS_MAXREPSIZE, MGS_REQUEST_PORTAL,
-                                MGC_REPLY_PORTAL, MGS_SERVICE_WATCHDOG_TIMEOUT,
+                                MGC_REPLY_PORTAL, 2000,
                                 mgs_handle, LUSTRE_MGS_NAME,
                                 obd->obd_proc_entry, NULL,
                                 MGS_THREADS_AUTO_MIN, MGS_THREADS_AUTO_MAX,
@@ -379,7 +379,7 @@ static int mgs_handle_target_reg(struct ptlrpc_request *req)
                                    obd->obd_name, lockrc);
         }
 
-        OBD_FAIL_TIMEOUT(OBD_FAIL_MGS_SLOW_TARGET_REG, 10);
+        OBD_FAIL_TIMEOUT(OBD_FAIL_MGS_PAUSE_TARGET_REG, 10);
 
         /* Log writing contention is handled by the fsdb_sem */
 
@@ -457,8 +457,8 @@ int mgs_handle(struct ptlrpc_request *req)
         int opc, rc = 0;
         ENTRY;
 
-        OBD_FAIL_TIMEOUT(OBD_FAIL_MGS_SLOW_REQUEST_NET, 2);
-        OBD_FAIL_RETURN(OBD_FAIL_MGS_ALL_REQUEST_NET | OBD_FAIL_ONCE, 0);
+        OBD_FAIL_TIMEOUT_MS(OBD_FAIL_MGS_PAUSE_REQ, obd_fail_val);
+        OBD_FAIL_RETURN(OBD_FAIL_MGS_ALL_REQUEST_NET, 0);
 
         LASSERT(current->journal_info == NULL);
         opc = lustre_msg_get_opc(req->rq_reqmsg);
index 81c94ad..5647659 100644 (file)
@@ -16,9 +16,6 @@
 #include <lustre_export.h>
 
 
-/* in ms */
-#define MGS_SERVICE_WATCHDOG_TIMEOUT (obd_timeout * 1000)
-
 /* mgs_llog.c */
 int class_dentry_readdir(struct obd_device *obd, struct dentry *dir,
                          struct vfsmount *inmnt, 
index fbfcd09..17e174b 100644 (file)
@@ -63,9 +63,10 @@ unsigned int obd_alloc_fail_rate;
 unsigned int obd_debug_peer_on_timeout;
 unsigned int obd_dump_on_timeout;
 unsigned int obd_dump_on_eviction;
-unsigned int obd_timeout = OBD_TIMEOUT_DEFAULT; /* seconds */
+unsigned int obd_timeout = OBD_TIMEOUT_DEFAULT;   /* seconds */
 unsigned int ldlm_timeout = LDLM_TIMEOUT_DEFAULT; /* seconds */
-unsigned int obd_health_check_timeout = HEALTH_CHECK_TIMEOUT_DEFAULT; /* seconds */
+unsigned int adaptive_timeout_min = 10;           /* seconds */
+unsigned int adaptive_timeout_max = 600;          /* seconds */
 unsigned int obd_max_dirty_pages = 256;
 atomic_t obd_dirty_pages;
 
@@ -384,7 +385,8 @@ EXPORT_SYMBOL(obd_dump_on_timeout);
 EXPORT_SYMBOL(obd_dump_on_eviction);
 EXPORT_SYMBOL(obd_timeout);
 EXPORT_SYMBOL(ldlm_timeout);
-EXPORT_SYMBOL(obd_health_check_timeout);
+EXPORT_SYMBOL(adaptive_timeout_min);
+EXPORT_SYMBOL(adaptive_timeout_max);
 EXPORT_SYMBOL(obd_max_dirty_pages);
 EXPORT_SYMBOL(obd_dirty_pages);
 EXPORT_SYMBOL(ptlrpc_put_connection_superhack);
index f04e3ae..86cafe3 100644 (file)
@@ -670,7 +670,7 @@ struct obd_export *class_new_export(struct obd_device *obd,
 
         CFS_INIT_LIST_HEAD(&export->exp_handle.h_link);
         class_handle_hash(&export->exp_handle, export_handle_addref);
-        export->exp_last_request_time = CURRENT_SECONDS;
+        export->exp_last_request_time = cfs_time_current_sec();
         spin_lock_init(&export->exp_lock);
         INIT_HLIST_NODE(&export->exp_uuid_hash);
         INIT_HLIST_NODE(&export->exp_nid_hash);
@@ -791,6 +791,24 @@ void class_import_destroy(struct obd_import *import)
 }
 EXPORT_SYMBOL(class_import_put);
 
+static void init_imp_at(struct imp_at *at) {
+        int i;
+        /* We need enough time to get an early response on a slow network.
+           Since we can't say for sure how slow a network might be, we use
+           a user-defined max expected network latency. We will adapt to slow
+           increases, but a sudden jump can still kill us. */
+        at_init(&at->iat_net_latency, adaptive_timeout_min, AT_TIMEBASE_DEFAULT, 
+                AT_FLG_MIN);
+        for (i = 0; i < IMP_AT_MAX_PORTALS; i++) {
+                /* max service estimates are tracked on the server side, so
+                   don't use the AT history here, just use the last reported
+                   val. (But keep hist for proc histogram, worst_ever) */
+                at_init(&at->iat_service_estimate[i], INITIAL_CONNECT_TIMEOUT,
+                        AT_TIMEBASE_DEFAULT, AT_FLG_NOHIST);
+        }
+        at->iat_drain = 0;
+}
+
 struct obd_import *class_new_import(struct obd_device *obd)
 {
         struct obd_import *imp;
@@ -816,6 +834,7 @@ struct obd_import *class_new_import(struct obd_device *obd)
         CFS_INIT_LIST_HEAD(&imp->imp_conn_list);
         CFS_INIT_LIST_HEAD(&imp->imp_handle.h_link);
         class_handle_hash(&imp->imp_handle, import_handle_addref);
+        init_imp_at(&imp->imp_at);
 
         /* the default magic is V1, will be used in connect RPC, and
          * then adjusted according to the flags in request/reply. */
@@ -946,9 +965,9 @@ static void class_disconnect_export_list(struct list_head *list, int flags)
                 spin_unlock(&fake_exp->exp_lock);
 
                 rc = obd_disconnect(fake_exp);
+                CDEBUG(D_HA, "disconnected export at %s (%p): rc %d\n",
+                       obd_export_nid2str(exp), exp, rc);
                 class_export_put(exp);
-                CDEBUG(D_HA, "disconnecting export %s (%p): rc %d\n",
-                       exp->exp_client_uuid.uuid, exp, rc);
         }
         EXIT;
 }
index 9d57d4f..c651261 100644 (file)
@@ -286,27 +286,6 @@ static int obd_proc_read_health(char *page, char **start, off_t off,
         return rc;
 }
 
-static int obd_proc_rd_health_timeout(char *page, char **start, off_t off,
-                                      int count, int *eof, void *data)
-{
-        *eof = 1;
-        return snprintf(page, count, "%d\n", obd_health_check_timeout);
-}
-
-static int obd_proc_wr_health_timeout(struct file *file, const char *buffer,
-                                      unsigned long count, void *data)
-{
-        int val, rc;
-
-        rc = lprocfs_write_helper(buffer, count, &val);
-        if (rc)
-                return rc;
-
-        obd_health_check_timeout = val;
-
-        return count;
-}
-
 /* Root for /proc/fs/lustre */
 struct proc_dir_entry *proc_lustre_root = NULL;
 
@@ -314,8 +293,6 @@ struct lprocfs_vars lprocfs_base[] = {
         { "version", obd_proc_read_version, NULL, NULL },
         { "pinger", obd_proc_read_pinger, NULL, NULL },
         { "health_check", obd_proc_read_health, NULL, NULL },
-        { "health_check_timeout", obd_proc_rd_health_timeout,
-           obd_proc_wr_health_timeout, NULL },
         { 0 }
 };
 #else
index 7d919b8..89c0cb9 100644 (file)
@@ -62,6 +62,8 @@ enum {
         OBD_DUMP_ON_EVICTION,   /* dump kernel debug log upon eviction */
         OBD_DEBUG_PEER_ON_TIMEOUT, /* dump peer debug when RPC times out */
         OBD_ALLOC_FAIL_RATE,    /* memory allocation random failure rate */
+        ADAPTIVE_MIN,           /* Adaptive timeout lower limit */
+        ADAPTIVE_MAX,           /* Adaptive timeout upper limit */
 };
 
 int LL_PROC_PROTO(proc_fail_loc)
@@ -177,7 +179,7 @@ static cfs_sysctl_table_t obd_table[] = {
                 .mode     = 0644,
                 .proc_handler = &proc_dointvec
         },
-       {
+        {
                 .ctl_name = OBD_LDLM_TIMEOUT,
                 .procname = "ldlm_timeout",
                 .data     = &ldlm_timeout,
@@ -195,6 +197,22 @@ static cfs_sysctl_table_t obd_table[] = {
                 .proc_handler = &proc_alloc_fail_rate
         },
 #endif
+        {
+                .ctl_name = ADAPTIVE_MIN,
+                .procname = "adaptive_min",
+                .data     = &adaptive_timeout_min,
+                .maxlen   = sizeof(int),
+                .mode     = 0644,
+                .proc_handler = &proc_dointvec
+        },
+        {
+                .ctl_name = ADAPTIVE_MAX,
+                .procname = "adaptive_max",
+                .data     = &adaptive_timeout_max,
+                .maxlen   = sizeof(int),
+                .mode     = 0644,
+                .proc_handler = &proc_dointvec
+        },
         { 0 }
 };
 
index 1b2ba7a..84a8251 100644 (file)
@@ -490,6 +490,111 @@ int lprocfs_rd_conn_uuid(char *page, char **start, off_t off, int count,
         return rc;
 }
 
+int lprocfs_at_hist_helper(char *page, int count, int rc, 
+                           struct adaptive_timeout *at)
+{
+        int i;
+        for (i = 0; i < AT_BINS; i++)
+                rc += snprintf(page + rc, count - rc, "%3u ", at->at_hist[i]); 
+        rc += snprintf(page + rc, count - rc, "\n");
+        return rc;
+}
+
+/* See also ptlrpc_lprocfs_rd_timeouts */
+int lprocfs_rd_timeouts(char *page, char **start, off_t off, int count,
+                        int *eof, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+        struct obd_import *imp;
+        unsigned int cur, worst;
+        time_t now, worstt;
+        struct dhms ts;
+        int i, rc = 0;
+
+        LASSERT(obd != NULL);
+        LPROCFS_CLIMP_CHECK(obd);
+        imp = obd->u.cli.cl_import;
+        *eof = 1;
+
+        if (AT_OFF)
+                rc += snprintf(page + rc, count - rc,
+                              "adaptive timeouts off, using obd_timeout %u\n",
+                              obd_timeout);
+        
+        rc += snprintf(page + rc, count - rc, 
+                       "%-10s : %ld sec\n", "timebase",
+                       imp->imp_at.iat_net_latency.at_binlimit * AT_BINS);
+        
+        now = cfs_time_current_sec();
+
+        /* Some network health info for kicks */
+        s2dhms(&ts, now - imp->imp_last_reply_time);
+        rc += snprintf(page + rc, count - rc, 
+                       "%-10s : %ld, "DHMS_FMT" ago\n",
+                       "last reply", imp->imp_last_reply_time, DHMS_VARS(&ts));
+
+
+        cur = at_get(&imp->imp_at.iat_net_latency);
+        worst = imp->imp_at.iat_net_latency.at_worst_ever;
+        worstt = imp->imp_at.iat_net_latency.at_worst_time;
+        s2dhms(&ts, now - worstt);
+        rc += snprintf(page + rc, count - rc, 
+                       "%-10s : cur %3u  worst %3u (at %ld, "DHMS_FMT" ago) ",
+                       "network", cur, worst, worstt, DHMS_VARS(&ts)); 
+        rc = lprocfs_at_hist_helper(page, count, rc,
+                                    &imp->imp_at.iat_net_latency); 
+
+        for(i = 0; i < IMP_AT_MAX_PORTALS; i++) {
+                if (imp->imp_at.iat_portal[i] == 0)
+                        break;
+                cur = at_get(&imp->imp_at.iat_service_estimate[i]);
+                worst = imp->imp_at.iat_service_estimate[i].at_worst_ever;
+                worstt = imp->imp_at.iat_service_estimate[i].at_worst_time;
+                s2dhms(&ts, now - worstt);
+                rc += snprintf(page + rc, count - rc,
+                               "portal %-2d  : cur %3u  worst %3u (at %ld, "
+                               DHMS_FMT" ago) ", imp->imp_at.iat_portal[i], 
+                               cur, worst, worstt, DHMS_VARS(&ts));
+                rc = lprocfs_at_hist_helper(page, count, rc,
+                                          &imp->imp_at.iat_service_estimate[i]);
+        }
+
+        LPROCFS_CLIMP_EXIT(obd);
+        return rc;
+}
+
+int lprocfs_wr_timeouts(struct file *file, const char *buffer,
+                        unsigned long count, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+        struct obd_import *imp;
+        time_t bval;
+        int val, i, rc;
+
+        LASSERT(obd != NULL);
+        LPROCFS_CLIMP_CHECK(obd);
+        imp = obd->u.cli.cl_import;
+         
+        rc = lprocfs_write_helper(buffer, count, &val);
+        if (rc < 0)
+                return rc;
+        if (val <= 0)
+                return -ERANGE;
+
+        bval = max(1, val / AT_BINS);
+        spin_lock(&imp->imp_at.iat_net_latency.at_lock);
+        imp->imp_at.iat_net_latency.at_binlimit = bval;
+        spin_unlock(&imp->imp_at.iat_net_latency.at_lock);
+        for(i = 0; i < IMP_AT_MAX_PORTALS; i++) {
+                spin_lock(&imp->imp_at.iat_service_estimate[i].at_lock);
+                imp->imp_at.iat_service_estimate[i].at_binlimit = bval;
+                spin_unlock(&imp->imp_at.iat_service_estimate[i].at_lock);
+        }
+        
+        LPROCFS_CLIMP_EXIT(obd);
+        return count;
+}
+
 static const char *obd_connect_names[] = {
         "read_only",
         "lov_index",
@@ -694,7 +799,7 @@ static int lprocfs_stats_seq_show(struct seq_file *p, void *v)
        struct lprocfs_stats *stats = p->private;
        struct lprocfs_counter  *cntr = v;
        struct lprocfs_counter  t, ret = { .lc_min = ~(__u64)0 };
-       int i, idx, rc;
+       int i, idx, rc = 0;
 
        if (cntr == &(stats->ls_percpu[0])->lp_cntr[0]) {
                struct timeval now;
@@ -729,6 +834,9 @@ static int lprocfs_stats_seq_show(struct seq_file *p, void *v)
                ret.lc_sumsquare += t.lc_sumsquare;
        }
 
+       if (ret.lc_count == 0)
+               goto out;
+
        rc = seq_printf(p, "%-25s "LPU64" samples [%s]", cntr->lc_name,
                        ret.lc_count, cntr->lc_units);
        if (rc < 0)
@@ -1395,7 +1503,9 @@ EXPORT_SYMBOL(lprocfs_rd_server_uuid);
 EXPORT_SYMBOL(lprocfs_rd_conn_uuid);
 EXPORT_SYMBOL(lprocfs_rd_num_exports);
 EXPORT_SYMBOL(lprocfs_rd_numrefs);
-
+EXPORT_SYMBOL(lprocfs_at_hist_helper);
+EXPORT_SYMBOL(lprocfs_rd_timeouts);
+EXPORT_SYMBOL(lprocfs_wr_timeouts);
 EXPORT_SYMBOL(lprocfs_rd_blksize);
 EXPORT_SYMBOL(lprocfs_rd_kbytestotal);
 EXPORT_SYMBOL(lprocfs_rd_kbytesfree);
index 4c01e23..a1d9fd7 100644 (file)
@@ -760,7 +760,6 @@ int class_process_config(struct lustre_cfg *lcfg)
                 CDEBUG(D_IOCTL, "changing lustre timeout from %d to %d\n",
                        obd_timeout, lcfg->lcfg_num);
                 obd_timeout = max(lcfg->lcfg_num, 1U);
-                obd_health_check_timeout = HEALTH_CHECK_TIMEOUT;
                 GOTO(out, err = 0);
         }
         case LCFG_SET_UPCALL: {
index 2843b54..b0bc3fa 100644 (file)
@@ -188,8 +188,10 @@ int server_put_mount(char *name, struct vfsmount *mnt)
                 CERROR("Can't find mount for %s\n", name);
                 RETURN(-ENOENT);
         }
-        lsi = s2lsi(lmi->lmi_sb);
         LASSERT(lmi->lmi_mnt == mnt);
+        /* Note if lsi is poisoned, it's probably because server_wait_finished
+           didn't wait long enough and the sb was freed at the umount. */
+        lsi = s2lsi(lmi->lmi_sb);
 
         CDEBUG(D_MOUNT, "put_mnt %p from %s, refs=%d, vfscount=%d\n",
                lmi->lmi_mnt, name, atomic_read(&lsi->lsi_mounts), count);
@@ -518,7 +520,7 @@ DECLARE_MUTEX(mgc_start_lock);
 static int lustre_start_mgc(struct super_block *sb)
 {
         struct lustre_handle mgc_conn = {0, };
-        struct obd_connect_data ocd = { 0 };
+        struct obd_connect_data *data = NULL;
         struct lustre_sb_info *lsi = s2lsi(sb);
         struct obd_device *obd;
         struct obd_export *exp;
@@ -698,8 +700,14 @@ static int lustre_start_mgc(struct super_block *sb)
                 /* nonfatal */
                 CERROR("can't set %s %d\n", KEY_INIT_RECOV_BACKUP, rc);
 
+        OBD_ALLOC_PTR(data);
+        if (data == NULL)
+                GOTO(out, rc = -ENOMEM);
+        data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_AT;
+        data->ocd_version = LUSTRE_VERSION_CODE;
         /* We connect to the MGS at setup, and don't disconnect until cleanup */
-        rc = obd_connect(&mgc_conn, obd, &(obd->obd_uuid), &ocd);
+        rc = obd_connect(&mgc_conn, obd, &(obd->obd_uuid), data);
+        OBD_FREE_PTR(data);
         if (rc) {
                 CERROR("connect failed %d\n", rc);
                 GOTO(out, rc);
@@ -1296,7 +1304,7 @@ out_free:
         RETURN(ERR_PTR(rc));
 }
 
-/* We have to wait for everything to finish, including lnet lnd expires,
+/* We have to wait for everything to finish, including lnet lnd expires, 
    before it is safe to free the sb */
 static void server_wait_finished(struct vfsmount *mnt)
 {
@@ -1307,9 +1315,9 @@ static void server_wait_finished(struct vfsmount *mnt)
         init_waitqueue_head(&waitq);
 
         while (atomic_read(&mnt->mnt_count) > 1) {
-                if (waited && (waited % 30 == 0)) 
+                if (waited && (waited % 30 == 0))
                         LCONSOLE_WARN("Mount still busy with %d refs after "
-                                      "%d secs\n", atomic_read(&mnt->mnt_count), 
+                                      "%d secs\n", atomic_read(&mnt->mnt_count),
                                       waited);
                 /* Wait for a bit */
                 waited += 3;
index f6251c3..423e3d2 100644 (file)
@@ -548,7 +548,7 @@ static int filter_init_export(struct obd_export *exp)
 {
         spin_lock_init(&exp->exp_filter_data.fed_lock);
         INIT_LIST_HEAD(&exp->exp_filter_data.fed_mod_list);
-       
+        
         spin_lock(&exp->exp_lock);
         exp->exp_connecting = 1;
         spin_unlock(&exp->exp_lock);
@@ -805,9 +805,9 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp)
                 obd->obd_next_recovery_transno = obd->obd_last_committed + 1;
                 obd->obd_recovering = 1;
                 obd->obd_recovery_start = CURRENT_SECONDS;
-                /* Only used for lprocfs_status */
+                obd->obd_recovery_timeout = OBD_RECOVERY_TIMEOUT;
                 obd->obd_recovery_end = obd->obd_recovery_start +
-                        OBD_RECOVERY_TIMEOUT;
+                        obd->obd_recovery_timeout;
         }
 
 out:
@@ -1156,7 +1156,7 @@ struct dentry *filter_parent_lock(struct obd_device *obd, obd_gr group,
                 return dparent;
 
         rc = filter_lock_dentry(obd, dparent);
-        fsfilt_check_slow(obd, now, obd_timeout, "parent lock");
+        fsfilt_check_slow(obd, now, "parent lock");
         return rc ? ERR_PTR(rc) : dparent;
 }
 
@@ -1742,8 +1742,8 @@ int filter_common_setup(struct obd_device *obd, obd_count len, void *buf,
                               obd->obd_recoverable_clients,
                               (obd->obd_recoverable_clients == 1)
                               ? "client" : "clients",
-                              (int)(OBD_RECOVERY_TIMEOUT) / 60,
-                              (int)(OBD_RECOVERY_TIMEOUT) % 60,
+                              obd->obd_recovery_timeout / 60,
+                              obd->obd_recovery_timeout % 60,
                               obd->obd_name);
         } else {
                 LCONSOLE_INFO("OST %s now serving %s (%s%s%s) with recovery "
@@ -2821,7 +2821,7 @@ static int filter_precreate(struct obd_device *obd, struct obdo *oa,
         struct dentry *dchild = NULL, *dparent = NULL;
         struct filter_obd *filter;
         int err = 0, rc = 0, recreate_obj = 0, i;
-        unsigned long enough_time = jiffies + min(obd_timeout * HZ / 4, 10U*HZ);
+        cfs_time_t enough_time = cfs_time_shift(DISK_TIMEOUT/2);
         obd_id next_id;
         void *handle = NULL;
         ENTRY;
@@ -2952,7 +2952,7 @@ static int filter_precreate(struct obd_device *obd, struct obdo *oa,
 
                 if (rc)
                         break;
-                if (time_after(jiffies, enough_time)) {
+                if (cfs_time_after(cfs_time_current(), enough_time)) {
                         CDEBUG(D_RPCTRACE,
                                "%s: precreate slow - want %d got %d \n",
                                obd->obd_name, *num, i);
@@ -3441,8 +3441,8 @@ int filter_iocontrol(unsigned int cmd, struct obd_export *exp,
                 struct super_block *sb = obd->u.obt.obt_sb;
                 struct inode *inode = sb->s_root->d_inode;
                 BDEVNAME_DECLARE_STORAGE(tmp);
-                CERROR("*** setting device %s read-only ***\n",
-                       ll_bdevname(sb, tmp));
+                LCONSOLE_WARN("*** setting obd %s device '%s' read-only ***\n",
+                              obd->obd_name, ll_bdevname(sb, tmp));
 
                 handle = fsfilt_start(obd, inode, FSFILT_OP_MKNOD, NULL);
                 if (!IS_ERR(handle))
index 65e5443..03eabfe 100644 (file)
@@ -29,8 +29,6 @@
 #define FILTER_GRANT_CHUNK (2ULL * PTLRPC_MAX_BRW_SIZE)
 #define GRANT_FOR_LLOG(obd) 16
 
-#define FILTER_RECOVERY_TIMEOUT (obd_timeout * 5 * HZ / 2) /* *waves hands* */
-
 extern struct file_operations filter_per_export_stats_fops;
 
 /* Data stored per client in the last_rcvd file.  In le32 order. */
@@ -67,6 +65,7 @@ struct filter_mod_data {
 #else
 #define FILTER_FMD_MAX_NUM_DEFAULT  32
 #endif
+/* Client cache seconds */
 #define FILTER_FMD_MAX_AGE_DEFAULT ((obd_timeout + 10) * HZ)
 
 struct filter_mod_data *filter_fmd_find(struct obd_export *exp,
index 72e3344..c0f8651 100644 (file)
@@ -304,7 +304,7 @@ static int filter_preprw_read(int cmd, struct obd_export *exp, struct obdo *oa,
         inode = dentry->d_inode;
 
         obdo_to_inode(inode, oa, OBD_MD_FLATIME);
-        fsfilt_check_slow(obd, now, obd_timeout, "preprw_read setup");
+        fsfilt_check_slow(obd, now, "preprw_read setup");
 
         for (i = 0, lnb = res, rnb = nb; i < obj->ioo_bufcnt;
              i++, rnb++, lnb++) {
@@ -337,7 +337,7 @@ static int filter_preprw_read(int cmd, struct obd_export *exp, struct obdo *oa,
                 filter_iobuf_add_page(obd, iobuf, inode, lnb->page);
         }
 
-        fsfilt_check_slow(obd, now, obd_timeout, "start_page_read");
+        fsfilt_check_slow(obd, now, "start_page_read");
 
         rc = filter_direct_io(OBD_BRW_READ, dentry, iobuf,
                               exp, NULL, NULL, NULL);
@@ -558,7 +558,7 @@ static int filter_preprw_write(int cmd, struct obd_export *exp, struct obdo *oa,
         fso.fso_dentry = dentry;
         fso.fso_bufcnt = obj->ioo_bufcnt;
 
-        fsfilt_check_slow(exp->exp_obd, now, obd_timeout, "preprw_write setup");
+        fsfilt_check_slow(exp->exp_obd, now, "preprw_write setup");
 
         /* Don't update inode timestamps if this write is older than a
          * setattr which modifies the timestamps. b=10150 */
@@ -661,7 +661,7 @@ static int filter_preprw_write(int cmd, struct obd_export *exp, struct obdo *oa,
         rc = filter_direct_io(OBD_BRW_READ, dentry, iobuf, exp,
                               NULL, NULL, NULL);
 
-        fsfilt_check_slow(exp->exp_obd, now, obd_timeout, "start_page_write");
+        fsfilt_check_slow(exp->exp_obd, now, "start_page_write");
 
         lprocfs_counter_add(exp->exp_obd->obd_stats, LPROC_FILTER_WRITE_BYTES,
                             tot_bytes);
index 0ff1a0f..fe5e6c6 100644 (file)
@@ -474,7 +474,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, int objcount,
                 GOTO(cleanup, rc);
         }
 
-        fsfilt_check_slow(obd, now, obd_timeout, "brw_start");
+        fsfilt_check_slow(obd, now, "brw_start");
 
         i = OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME;
 
@@ -510,7 +510,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, int objcount,
         if (rc == 0)
                 obdo_from_inode(oa, inode, FILTER_VALID_FLAGS);
 
-        fsfilt_check_slow(obd, now, obd_timeout, "direct_io");
+        fsfilt_check_slow(obd, now, "direct_io");
 
         err = fsfilt_commit_wait(obd, inode, wait_handle);
         if (err) {
@@ -521,7 +521,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, int objcount,
                 LASSERTF(oti->oti_transno <= obd->obd_last_committed,
                          "oti_transno "LPU64" last_committed "LPU64"\n",
                          oti->oti_transno, obd->obd_last_committed);
-        fsfilt_check_slow(obd, now, obd_timeout, "commitrw commit");
+        fsfilt_check_slow(obd, now, "commitrw commit");
 
 cleanup:
         filter_grant_commit(exp, niocount, res);
index 652e6c1..20550d5 100644 (file)
@@ -694,7 +694,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa,
         DQUOT_INIT(inode);
 
         LOCK_INODE_MUTEX(inode);
-        fsfilt_check_slow(obd, now, obd_timeout, "i_mutex");
+        fsfilt_check_slow(obd, now, "i_mutex");
         oti->oti_handle = fsfilt_brw_start(obd, objcount, &fso, niocount, res,
                                            oti);
         if (IS_ERR(oti->oti_handle)) {
@@ -707,7 +707,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa,
         }
         /* have to call fsfilt_commit() from this point on */
 
-        fsfilt_check_slow(obd, now, obd_timeout, "brw_start");
+        fsfilt_check_slow(obd, now, "brw_start");
 
         i = OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME;
 
@@ -759,7 +759,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa,
 
         lquota_getflag(filter_quota_interface_ref, obd, oa);
 
-        fsfilt_check_slow(obd, now, obd_timeout, "direct_io");
+        fsfilt_check_slow(obd, now, "direct_io");
 
         err = fsfilt_commit_wait(obd, inode, wait_handle);
         if (err) {
@@ -772,7 +772,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa,
                          "oti_transno "LPU64" last_committed "LPU64"\n",
                          oti->oti_transno, obd->obd_last_committed);
 
-        fsfilt_check_slow(obd, now, obd_timeout, "commitrw commit");
+        fsfilt_check_slow(obd, now, "commitrw commit");
 
 cleanup:
         filter_grant_commit(exp, niocount, res);
index 4a0f183..7a50e66 100644 (file)
@@ -329,6 +329,7 @@ static struct lprocfs_vars lprocfs_obd_vars[] = {
         { "prealloc_next_id", osc_rd_prealloc_next_id, 0, 0 },
         { "prealloc_last_id", osc_rd_prealloc_last_id, 0, 0 },
         { "checksums",       osc_rd_checksum, osc_wr_checksum, 0 },
+        { "timeouts",        lprocfs_rd_timeouts, lprocfs_wr_timeouts, 0 },
         { 0 }
 };
 
index 4221588..671089d 100644 (file)
@@ -362,8 +362,8 @@ int osc_create(struct obd_export *exp, struct obdo *oa,
                         CDEBUG(D_HA,"%s: oscc recovery in progress, waiting\n",
                                oscc->oscc_obd->obd_name);
 
-                        lwi = LWI_TIMEOUT(cfs_timeout_cap(cfs_time_seconds(obd_timeout/4)),
-                                          NULL, NULL);
+                        lwi = LWI_TIMEOUT(cfs_timeout_cap(cfs_time_seconds(
+                                obd_timeout / 4)), NULL, NULL);
                         rc = l_wait_event(oscc->oscc_waitq,
                                           !oscc_recovering(oscc), &lwi);
                         LASSERT(rc == 0 || rc == -ETIMEDOUT);
index 803bddc..52262ec 100644 (file)
@@ -728,13 +728,14 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti)
          * If getting the lock took more time than
          * client was willing to wait, drop it. b=11330
          */
-        if (cfs_time_current_sec() > req->rq_arrival_time.tv_sec + obd_timeout || 
+        if (cfs_time_current_sec() > req->rq_deadline || 
             OBD_FAIL_CHECK(OBD_FAIL_OST_DROP_REQ)) {
                 no_reply = 1;
                 CERROR("Dropping timed-out read from %s because locking"
-                       "object "LPX64" took %ld seconds.\n",
+                       "object "LPX64" took %ld seconds (limit was %ld).\n",
                        libcfs_id2str(req->rq_peer), ioo->ioo_id,
-                       cfs_time_current_sec() - req->rq_arrival_time.tv_sec);
+                       cfs_time_current_sec() - req->rq_arrival_time.tv_sec,
+                       req->rq_deadline - req->rq_arrival_time.tv_sec);
                 goto out_lock;
         }
 
@@ -789,12 +790,24 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti)
                 else
                         rc = ptlrpc_start_bulk_transfer(desc);
                 if (rc == 0) {
-                        lwi = LWI_TIMEOUT_INTERVAL(obd_timeout * HZ / 4, HZ,
-                                                   ost_bulk_timeout, desc);
-                        rc = l_wait_event(desc->bd_waitq,
-                                          !ptlrpc_bulk_active(desc) ||
-                                          desc->bd_export->exp_failed, &lwi);
-                        LASSERT(rc == 0 || rc == -ETIMEDOUT);
+                        do {
+                                long timeoutl = req->rq_deadline - 
+                                        cfs_time_current_sec();
+                                cfs_duration_t timeout = (timeoutl <= 0 || rc) ? 
+                                        CFS_TICK : cfs_time_seconds(timeoutl);
+                                lwi = LWI_TIMEOUT_INTERVAL(timeout, 
+                                                           cfs_time_seconds(1),
+                                                           ost_bulk_timeout, 
+                                                           desc);
+                                rc = l_wait_event(desc->bd_waitq, 
+                                                  !ptlrpc_bulk_active(desc) ||
+                                                  desc->bd_export->exp_failed,
+                                                  &lwi);
+                                LASSERT(rc == 0 || rc == -ETIMEDOUT);
+                                /* Wait again if we changed deadline */
+                        } while ((rc == -ETIMEDOUT) && 
+                                 (req->rq_deadline > cfs_time_current_sec()));
+
                         if (rc == -ETIMEDOUT) {
                                 DEBUG_REQ(D_ERROR, req, "timeout on bulk PUT");
                                 ptlrpc_abort_bulk(desc);
@@ -848,11 +861,8 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti)
                 req->rq_status = rc;
                 ptlrpc_error(req);
         } else {
-                if (req->rq_reply_state != NULL) {
-                        /* reply out callback would free */
-                        ptlrpc_rs_decref(req->rq_reply_state);
-                        req->rq_reply_state = NULL;
-                }
+                /* reply out callback would free */
+                ptlrpc_req_drop_rs(req);
                 CWARN("%s: ignoring bulk IO comm error with %s@%s id %s - "
                       "client will retry\n",
                       req->rq_export->exp_obd->obd_name,
@@ -977,13 +987,14 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
          * If getting the lock took more time than
          * client was willing to wait, drop it. b=11330
          */
-        if (cfs_time_current_sec() > req->rq_arrival_time.tv_sec + obd_timeout || 
+        if (cfs_time_current_sec() > req->rq_deadline || 
             OBD_FAIL_CHECK(OBD_FAIL_OST_DROP_REQ)) {
                 no_reply = 1;
-                CERROR("Dropping timed-out write from %s because locking"
-                       "object "LPX64" took %ld seconds.\n",
+                CERROR("Dropping timed-out write from %s because locking "
+                       "object "LPX64" took %ld seconds (limit was %ld).\n",
                        libcfs_id2str(req->rq_peer), ioo->ioo_id,
-                       cfs_time_current_sec() - req->rq_arrival_time.tv_sec);
+                       cfs_time_current_sec() - req->rq_arrival_time.tv_sec,
+                       req->rq_deadline - req->rq_arrival_time.tv_sec);
                 goto out_lock;
         }
 
@@ -1019,11 +1030,21 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
         else
                 rc = ptlrpc_start_bulk_transfer (desc);
         if (rc == 0) {
-                lwi = LWI_TIMEOUT_INTERVAL(obd_timeout * HZ / 2, HZ,
-                                           ost_bulk_timeout, desc);
-                rc = l_wait_event(desc->bd_waitq, !ptlrpc_bulk_active(desc) ||
-                                  desc->bd_export->exp_failed, &lwi);
-                LASSERT(rc == 0 || rc == -ETIMEDOUT);
+                do {
+                        long timeoutl = req->rq_deadline - 
+                                cfs_time_current_sec();
+                        cfs_duration_t timeout = (timeoutl <= 0 || rc) ? 
+                                CFS_TICK : cfs_time_seconds(timeoutl);
+                        lwi = LWI_TIMEOUT_INTERVAL(timeout, cfs_time_seconds(1),
+                                                   ost_bulk_timeout, desc);
+                        rc = l_wait_event(desc->bd_waitq, 
+                                          !ptlrpc_bulk_active(desc) ||
+                                          desc->bd_export->exp_failed, &lwi);
+                        LASSERT(rc == 0 || rc == -ETIMEDOUT);
+                        /* Wait again if we changed deadline */
+                } while ((rc == -ETIMEDOUT) && 
+                         (req->rq_deadline > cfs_time_current_sec()));
+
                 if (rc == -ETIMEDOUT) {
                         DEBUG_REQ(D_ERROR, req, "timeout on bulk GET");
                         ptlrpc_abort_bulk(desc);
@@ -1149,11 +1170,8 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
                 req->rq_status = rc;
                 ptlrpc_error(req);
         } else {
-                if (req->rq_reply_state != NULL) {
-                        /* reply out callback would free */
-                        ptlrpc_rs_decref(req->rq_reply_state);
-                        req->rq_reply_state = NULL;
-                }
+                /* reply out callback would free */
+                ptlrpc_req_drop_rs(req);
                 CWARN("%s: ignoring bulk IO comm error with %s@%s id %s - "
                       "client will retry\n",
                       req->rq_export->exp_obd->obd_name,
@@ -1427,6 +1445,7 @@ static int ost_handle(struct ptlrpc_request *req)
         case OST_CREATE:
                 CDEBUG(D_INODE, "create\n");
                 OBD_FAIL_RETURN(OBD_FAIL_OST_CREATE_NET, 0);
+                OBD_FAIL_TIMEOUT_MS(OBD_FAIL_OST_PAUSE_CREATE, obd_fail_val);
                 if (OBD_FAIL_CHECK_ONCE(OBD_FAIL_OST_ENOSPC))
                         GOTO(out, rc = -ENOSPC);
                 if (OBD_FAIL_CHECK_ONCE(OBD_FAIL_OST_EROFS))
@@ -1702,8 +1721,8 @@ static int ost_setup(struct obd_device *obd, obd_count len, void *buf)
         ost->ost_service =
                 ptlrpc_init_svc(OST_NBUFS, OST_BUFSIZE, OST_MAXREQSIZE,
                                 OST_MAXREPSIZE, OST_REQUEST_PORTAL,
-                                OSC_REPLY_PORTAL,
-                                obd_timeout * 1000, ost_handle, LUSTRE_OSS_NAME,
+                                OSC_REPLY_PORTAL, OSS_SERVICE_WATCHDOG_FACTOR, 
+                                ost_handle, LUSTRE_OSS_NAME,
                                 obd->obd_proc_entry, ost_print_req,
                                 oss_min_threads, oss_max_threads, "ll_ost");
         if (ost->ost_service == NULL) {
@@ -1730,8 +1749,8 @@ static int ost_setup(struct obd_device *obd, obd_count len, void *buf)
         ost->ost_create_service =
                 ptlrpc_init_svc(OST_NBUFS, OST_BUFSIZE, OST_MAXREQSIZE,
                                 OST_MAXREPSIZE, OST_CREATE_PORTAL,
-                                OSC_REPLY_PORTAL,
-                                obd_timeout * 1000, ost_handle, "ost_create",
+                                OSC_REPLY_PORTAL, OSS_SERVICE_WATCHDOG_FACTOR,
+                                ost_handle, "ost_create",
                                 obd->obd_proc_entry, ost_print_req,
                                 oss_min_create_threads,
                                 oss_max_create_threads,
@@ -1748,8 +1767,8 @@ static int ost_setup(struct obd_device *obd, obd_count len, void *buf)
         ost->ost_io_service =
                 ptlrpc_init_svc(OST_NBUFS, OST_BUFSIZE, OST_MAXREQSIZE,
                                 OST_MAXREPSIZE, OST_IO_PORTAL,
-                                OSC_REPLY_PORTAL,
-                                obd_timeout * 1000, ost_handle, "ost_io",
+                                OSC_REPLY_PORTAL, OSS_SERVICE_WATCHDOG_FACTOR, 
+                                ost_handle, "ost_io",
                                 obd->obd_proc_entry, ost_print_req,
                                 oss_min_threads, oss_max_threads, "ll_ost_io");
         if (ost->ost_io_service == NULL) {
index 82a40b9..e572ead 100644 (file)
@@ -11,6 +11,8 @@ extern void ost_print_req(void *seq_file, struct ptlrpc_request *req);
 # define ost_print_req NULL
 #endif
 
+#define OSS_SERVICE_WATCHDOG_FACTOR 2000
+
 /*
  * tunables for per-thread page pool (bug 5137)
  */
@@ -37,7 +39,7 @@ struct ost_thread_local_cache {
 
 struct ost_thread_local_cache *ost_tls(struct ptlrpc_request *r);
 
-#define OSS_DEF_CREATE_THREADS  1UL
+#define OSS_DEF_CREATE_THREADS  2UL
 #define OSS_MAX_CREATE_THREADS 16UL
 
 /* Quota stuff */
index c3f3f20..75b62b9 100644 (file)
@@ -189,6 +189,190 @@ void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc)
         EXIT;
 }
 
+/* Set server timelimit for this req */
+static void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req) {
+        __u32 serv_est;
+        int idx;
+        struct imp_at *at;
+        LASSERT(req->rq_import);
+
+        if (AT_OFF || 
+            ((idx = import_at_get_index(req->rq_import, 
+                                        req->rq_request_portal)) < 0)) {
+                /* non-AT settings */
+                req->rq_timeout = req->rq_import->imp_server_timeout ? 
+                        obd_timeout / 2 : obd_timeout;
+                goto out;
+        }
+
+        at = &req->rq_import->imp_at;
+        serv_est = at_get(&at->iat_service_estimate[idx]);
+        /* add an arbitrary minimum: 125% +5 sec */
+        req->rq_timeout = serv_est + (serv_est >> 2) + 5;
+        /* We could get even fancier here, using history to predict increased
+           loading... */
+             
+        if (at->iat_drain > req->rq_timeout) {
+                /* If we're trying to drain the network queues, give this 
+                   req a long timeout */
+                req->rq_timeout = at->iat_drain;
+                CDEBUG(D_ADAPTTO, "waiting %ds to let queues drain\n",
+                       req->rq_timeout);
+        }
+
+out:
+        /* Let the server know what this RPC timeout is by putting it in the 
+           reqmsg*/
+        lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout);
+}
+
+/* Adjust max service estimate based on server value */
+static void ptlrpc_at_adj_service(struct ptlrpc_request *req) {
+        int idx;
+        unsigned int serv_est, oldse;
+        struct imp_at *at = &req->rq_import->imp_at;
+        LASSERT(req->rq_import);
+        
+        /* service estimate is returned in the repmsg timeout field,
+           may be 0 on err */
+        serv_est = lustre_msg_get_timeout(req->rq_repmsg);
+
+        idx = import_at_get_index(req->rq_import, req->rq_request_portal);
+        if (idx < 0) 
+                return;
+
+        oldse = at_get(&at->iat_service_estimate[idx]);
+        /* max service estimates are tracked on the server side,
+           so just keep minimal history here */
+        at_add(&at->iat_service_estimate[idx], serv_est);
+
+        if (at_get(&at->iat_service_estimate[idx]) != oldse)
+                CDEBUG(D_ADAPTTO, "The RPC service estimate for %s ptl %d "
+                      "has changed %ds to %u\n", 
+                      req->rq_import->imp_obd->obd_name, req->rq_request_portal,
+                      at_get(&at->iat_service_estimate[idx]) - oldse, serv_est);
+}
+
+/* Expected network latency per remote node (secs) */
+int ptlrpc_at_get_net_latency(struct ptlrpc_request *req) {
+        return AT_OFF ? 0 : at_get(&req->rq_import->imp_at.iat_net_latency);
+}
+
+/* Adjust expected network latency */
+static void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req) {
+        unsigned int st, nl, oldnl;
+        struct imp_at *at = &req->rq_import->imp_at;
+        time_t now = cfs_time_current_sec();
+        LASSERT(req->rq_import);
+        
+        st = lustre_msg_get_service_time(req->rq_repmsg);
+        
+        /* Network latency is total time less server processing time */
+        nl = max_t(int, now - req->rq_sent - st, 0) + 1/*st rounding*/;
+        if (st > now - req->rq_sent + 1 /* rounding */) 
+                CERROR("Reported service time %u > total measured time %ld\n",
+                       st, now - req->rq_sent);
+
+        oldnl = at_get(&at->iat_net_latency);
+        at_add(&at->iat_net_latency, nl);
+
+        if (at_get(&at->iat_net_latency) != oldnl)
+                CDEBUG(D_ADAPTTO, "The network latency for %s (nid %s)"
+                       " has changed %ds to %u\n", 
+                       req->rq_import->imp_obd->obd_name,
+                       obd_uuid2str(
+                               &req->rq_import->imp_connection->c_remote_uuid),
+                       at_get(&at->iat_net_latency) - oldnl, nl);
+}
+
+static int unpack_reply(struct ptlrpc_request *req) {
+        int rc;
+
+        /* Clear reply swab mask; we may have already swabbed an early reply */
+        req->rq_rep_swab_mask = 0;
+
+        rc = lustre_unpack_msg(req->rq_repmsg, req->rq_nob_received);
+        if (rc) {
+                DEBUG_REQ(D_ERROR, req, "unpack_rep failed: %d", rc);
+                return(-EPROTO);
+        }
+
+        rc = lustre_unpack_rep_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF);
+        if (rc) {
+                DEBUG_REQ(D_ERROR, req, "unpack ptlrpc body failed: %d", rc);
+                return(-EPROTO);
+        }
+        return 0;
+}
+
+/* Handle an early reply message.
+   We can't risk the real reply coming in and changing rq_repmsg, 
+   so this fn must be called under the rq_lock */
+static int ptlrpc_at_early_reply(struct ptlrpc_request *req) {
+        struct lustre_msg *oldmsg, *msgcpy;
+        time_t olddl;
+        int oldlen, rc;
+        ENTRY;
+
+        req->rq_early = 0;
+
+        rc = unpack_reply(req);
+        if (rc) 
+                /* Let's just ignore it - same as if it never got here */ 
+                RETURN(rc);
+
+        /* We've got to make sure another early reply doesn't land on
+           top of our current repbuf.  Make a copy and verify checksum. */
+        oldlen = req->rq_replen;
+        spin_unlock(&req->rq_lock);
+        OBD_ALLOC(msgcpy, oldlen);
+        if (!msgcpy) {
+                spin_lock(&req->rq_lock);
+                RETURN(-ENOMEM);
+        }
+        spin_lock(&req->rq_lock);
+        /* Another reply might have changed the repmsg and replen while 
+           we dropped the lock; doesn't really matter, just use the latest.
+           If it doesn't fit in oldlen, checksum will be wrong. */
+        oldmsg = req->rq_repmsg;
+        memcpy(msgcpy, oldmsg, oldlen);
+        if (lustre_msg_get_cksum(msgcpy) != 
+            lustre_msg_calc_cksum(msgcpy)) {
+                CDEBUG(D_ADAPTTO, "Early reply checksum mismatch, "
+                       "discarding %x != %x\n", lustre_msg_get_cksum(msgcpy),
+                       lustre_msg_calc_cksum(msgcpy));
+                GOTO(out, rc = -EINVAL); 
+        }
+
+        /* Our copied msg is valid, now we can adjust the timeouts without 
+           worrying that a new reply will land on the copy. */
+        req->rq_repmsg = msgcpy;
+
+        /* Expecting to increase the service time estimate here */
+        ptlrpc_at_adj_service(req);
+        ptlrpc_at_adj_net_latency(req);
+
+        /* Adjust the local timeout for this req */
+        ptlrpc_at_set_req_timeout(req);
+
+        olddl = req->rq_deadline;
+        /* server assumes it now has rq_timeout from when it sent the 
+           early reply, so client should give it at least that long. */
+        req->rq_deadline = cfs_time_current_sec() + req->rq_timeout + 
+                    ptlrpc_at_get_net_latency(req);
+
+        DEBUG_REQ(D_ADAPTTO, req, 
+                  "Early reply #%d, new deadline in %lds (%+lds)", 
+                  req->rq_early_count, req->rq_deadline -
+                  cfs_time_current_sec(), req->rq_deadline - olddl);
+        
+        req->rq_repmsg = oldmsg;
+        
+out:
+        OBD_FREE(msgcpy, oldlen);
+        RETURN(rc);
+}
+
 void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool)
 {
         struct list_head *l, *tmp;
@@ -342,11 +526,6 @@ ptlrpc_prep_req_pool(struct obd_import *imp, __u32 version, int opcode,
         }
 
         lustre_msg_add_version(request->rq_reqmsg, version);
-
-        if (imp->imp_server_timeout)
-                request->rq_timeout = obd_timeout / 2;
-        else
-                request->rq_timeout = obd_timeout;
         request->rq_send_state = LUSTRE_IMP_FULL;
         request->rq_type = PTL_RPC_MSG_REQUEST;
         request->rq_import = class_import_get(imp);
@@ -363,6 +542,8 @@ ptlrpc_prep_req_pool(struct obd_import *imp, __u32 version, int opcode,
         /* XXX FIXME bug 249 */
         request->rq_request_portal = imp->imp_client->cli_request_portal;
         request->rq_reply_portal = imp->imp_client->cli_reply_portal;
+        
+        ptlrpc_at_set_req_timeout(request);
 
         spin_lock_init(&request->rq_lock);
         CFS_INIT_LIST_HEAD(&request->rq_list);
@@ -373,7 +554,7 @@ ptlrpc_prep_req_pool(struct obd_import *imp, __u32 version, int opcode,
         atomic_set(&request->rq_refcount, 1);
 
         lustre_msg_set_opc(request->rq_reqmsg, opcode);
-        lustre_msg_set_flags(request->rq_reqmsg, 0);
+        lustre_msg_set_flags(request->rq_reqmsg, imp->imp_msg_flags);
 
         RETURN(request);
 }
@@ -558,6 +739,12 @@ static int ptlrpc_check_reply(struct ptlrpc_request *req)
 
         if (req->rq_restart)
                 GOTO(out, rc = 1);
+
+        if (req->rq_early) {
+                ptlrpc_at_early_reply(req);
+                GOTO(out, rc = 0); /* keep waiting */
+        }
+
         EXIT;
  out:
         spin_unlock(&req->rq_lock);
@@ -603,22 +790,14 @@ static int after_reply(struct ptlrpc_request *req)
         /* NB Until this point, the whole of the incoming message,
          * including buflens, status etc is in the sender's byte order. */
 
-#if SWAB_PARANOIA
-        /* Clear reply swab mask; this is a new reply in sender's byte order */
-        req->rq_rep_swab_mask = 0;
-#endif
         LASSERT (req->rq_nob_received <= req->rq_replen);
-        rc = lustre_unpack_msg(req->rq_repmsg, req->rq_nob_received);
-        if (rc) {
-                DEBUG_REQ(D_ERROR, req, "unpack_rep failed: %d", rc);
-                RETURN(-EPROTO);
-        }
+        rc = unpack_reply(req);
+        if (rc) 
+                RETURN(rc);
 
-        rc = lustre_unpack_rep_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF);
-        if (rc) {
-                DEBUG_REQ(D_ERROR, req, "unpack ptlrpc body failed: %d", rc);
-                RETURN(-EPROTO);
-        }
+        OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_PAUSE_REP, obd_fail_val);
+        ptlrpc_at_adj_service(req);
+        ptlrpc_at_adj_net_latency(req);
 
         if (lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_REPLY &&
             lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_ERR) {
@@ -865,6 +1044,14 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set)
                         /* Still waiting for a reply? */
                         if (ptlrpc_client_receiving_reply(req))
                                 continue;
+                        
+                        spin_lock(&req->rq_lock);
+                        if (req->rq_early) {
+                                ptlrpc_at_early_reply(req);
+                                spin_unlock(&req->rq_lock);
+                                continue;
+                        }
+                        spin_unlock(&req->rq_lock);
 
                         /* Did we actually receive a reply? */
                         if (!ptlrpc_client_replied(req))
@@ -950,15 +1137,25 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set)
         RETURN(set->set_remaining == 0 || force_timer_recalc);
 }
 
+/* Return 1 if we should give up, else 0 */
 int ptlrpc_expire_one_request(struct ptlrpc_request *req)
 {
         struct obd_import *imp = req->rq_import;
         int rc = 0;
         ENTRY;
 
-        DEBUG_REQ(D_ERROR|D_NETERROR, req, "%s (sent at %lu, %lus ago)",
+        DEBUG_REQ(D_NETERROR, req, "%s (sent at %lu, %lus ago)",
                   req->rq_net_err ? "network error" : "timeout",
-                  (long)req->rq_sent, CURRENT_SECONDS - req->rq_sent);
+                  (long)req->rq_sent, cfs_time_current_sec() - req->rq_sent);
+
+        if (imp) {
+                LCONSOLE_WARN("Request x"LPU64" sent from %s to NID %s %lus ago"
+                              " has timed out (limit %lus).\n", req->rq_xid,
+                              req->rq_import->imp_obd->obd_name,
+                              libcfs_nid2str(imp->imp_connection->c_peer.nid),
+                              cfs_time_current_sec() - req->rq_sent,
+                              req->rq_deadline - req->rq_sent);
+        }
 
         if (imp != NULL && obd_debug_peer_on_timeout)
                 LNetCtl(IOC_LIBCFS_DEBUG_PEER, &imp->imp_connection->c_peer);
@@ -988,6 +1185,9 @@ int ptlrpc_expire_one_request(struct ptlrpc_request *req)
          * then error it out here. */
         if (req->rq_send_state != LUSTRE_IMP_FULL ||
             imp->imp_obd->obd_no_recov) {
+                DEBUG_REQ(D_RPCTRACE, req, "err -110, sent_state=%s (now=%s)",
+                          ptlrpc_import_state_name(req->rq_send_state),
+                          ptlrpc_import_state_name(imp->imp_state));
                 spin_lock(&req->rq_lock);
                 req->rq_status = -ETIMEDOUT;
                 req->rq_err = 1;
@@ -995,7 +1195,8 @@ int ptlrpc_expire_one_request(struct ptlrpc_request *req)
                 RETURN(1);
         }
 
-        /* if request can't be resend we can't wait answer after timeout */
+        /* if a request can't be resent we can't wait for an answer after 
+           the timeout */
         if (req->rq_no_resend) {
                 DEBUG_REQ(D_RPCTRACE, req, "TIMEOUT-NORESEND:");
                 rc = 1;
@@ -1010,7 +1211,7 @@ int ptlrpc_expired_set(void *data)
 {
         struct ptlrpc_request_set *set = data;
         struct list_head          *tmp;
-        time_t                     now = CURRENT_SECONDS;
+        time_t                     now = cfs_time_current_sec();
         ENTRY;
 
         LASSERT(set != NULL);
@@ -1027,7 +1228,7 @@ int ptlrpc_expired_set(void *data)
                         continue;
 
                 if (req->rq_timedout ||           /* already dealt with */
-                    req->rq_sent + req->rq_timeout > now) /* not expired */
+                    req->rq_deadline > now) /* not expired */
                         continue;
 
                 /* deal with this guy */
@@ -1067,11 +1268,11 @@ void ptlrpc_interrupted_set(void *data)
         }
 }
 
+/* get the smallest timeout in the set; this does NOT set a timeout. */
 int ptlrpc_set_next_timeout(struct ptlrpc_request_set *set)
 {
         struct list_head      *tmp;
-        time_t                 now = CURRENT_SECONDS;
-        time_t                 deadline;
+        time_t                 now = cfs_time_current_sec();
         int                    timeout = 0;
         struct ptlrpc_request *req;
         ENTRY;
@@ -1089,11 +1290,12 @@ int ptlrpc_set_next_timeout(struct ptlrpc_request_set *set)
                 if (req->rq_timedout)   /* already timed out */
                         continue;
 
-                deadline = req->rq_sent + req->rq_timeout;
-                if (deadline <= now)    /* actually expired already */
+                if (req->rq_deadline <= now) {  /* actually expired already */
                         timeout = 1;    /* ASAP */
-                else if (timeout == 0 || timeout > deadline - now)
-                        timeout = deadline - now;
+                        break;
+                } else if (timeout == 0 || timeout > req->rq_deadline - now) {
+                        timeout = req->rq_deadline - now;
+                }
         }
         RETURN(timeout);
 }
@@ -1197,8 +1399,9 @@ static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked)
                 LBUG();
         }
 
-        if (request->rq_repmsg != NULL) {
-                OBD_FREE(request->rq_repmsg, request->rq_replen);
+        if (request->rq_repbuf != NULL) {
+                OBD_FREE(request->rq_repbuf, request->rq_replen);
+                request->rq_repbuf = NULL;
                 request->rq_repmsg = NULL;
         }
         if (request->rq_export != NULL) {
@@ -1285,13 +1488,16 @@ void ptlrpc_unregister_reply (struct ptlrpc_request *request)
 
         LASSERT(!in_interrupt ());             /* might sleep */
 
-        if (!ptlrpc_client_receiving_reply(request))
+        if (!ptlrpc_client_receiving_reply(request) && 
+            !ptlrpc_client_must_unlink(request))
+                /* Nothing left to do */
                 return;
 
         LNetMDUnlink (request->rq_reply_md_h);
 
         /* We have to l_wait_event() whatever the result, to give liblustre
-         * a chance to run reply_in_callback() */
+         * a chance to run reply_in_callback(), and to make sure we've 
+         * unlinked before returning a req to the pool */
 
         if (request->rq_set != NULL)
                 wq = &request->rq_set->set_waitq;
@@ -1301,13 +1507,19 @@ void ptlrpc_unregister_reply (struct ptlrpc_request *request)
         for (;;) {
                 /* Network access will complete in finite time but the HUGE
                  * timeout lets us CWARN for visibility of sluggish NALs */
-                lwi = LWI_TIMEOUT(cfs_time_seconds(300), NULL, NULL);
-                rc = l_wait_event (*wq, !ptlrpc_client_receiving_reply(request), &lwi);
+                lwi = LWI_TIMEOUT(cfs_time_seconds(FOREVER), NULL, NULL);
+                rc = l_wait_event (*wq, 
+                                   !ptlrpc_client_receiving_reply(request) &&
+                                   !ptlrpc_client_must_unlink(request),
+                                   &lwi);
                 if (rc == 0)
                         return;
 
                 LASSERT (rc == -ETIMEDOUT);
-                DEBUG_REQ(D_WARNING, request, "Unexpectedly long timeout");
+                DEBUG_REQ(D_WARNING, request, "Unexpectedly long timeout "
+                          "rvcng=%d unlnk=%d", 
+                          ptlrpc_client_receiving_reply(request),
+                          ptlrpc_client_must_unlink(request));
         }
         EXIT;
 }
@@ -1419,14 +1631,6 @@ void ptlrpc_restart_req(struct ptlrpc_request *req)
         spin_unlock(&req->rq_lock);
 }
 
-static int expired_request(void *data)
-{
-        struct ptlrpc_request *req = data;
-        ENTRY;
-
-        RETURN(ptlrpc_expire_one_request(req));
-}
-
 static void interrupted_request(void *data)
 {
         struct ptlrpc_request *req = data;
@@ -1495,7 +1699,8 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req)
         int brc;
         struct l_wait_info lwi;
         struct obd_import *imp = req->rq_import;
-        cfs_duration_t timeout = 0;
+        cfs_duration_t timeout = CFS_TICK;
+        long timeoutl;
         ENTRY;
 
         LASSERT(req->rq_set == NULL);
@@ -1543,6 +1748,7 @@ restart:
                 list_del_init(&req->rq_list);
 
                 if (req->rq_err) {
+                        /* rq_status was set locally */
                         rc = -EIO;
                 }
                 else if (req->rq_intr) {
@@ -1589,17 +1795,27 @@ restart:
         spin_unlock(&imp->imp_lock);
 
         rc = ptl_send_rpc(req, 0);
-        if (rc) {
+        if (rc) 
                 DEBUG_REQ(D_HA, req, "send failed (%d); recovering", rc);
-                timeout = CFS_TICK;
-        } else {
-                timeout = cfs_timeout_cap(cfs_time_seconds(req->rq_timeout));
+        do {
+                timeoutl = req->rq_deadline - cfs_time_current_sec();
+                timeout = (timeoutl <= 0 || rc) ? CFS_TICK :
+                        cfs_time_seconds(timeoutl);
                 DEBUG_REQ(D_NET, req, 
-                          "-- sleeping for "CFS_DURATION_T" jiffies", timeout);
+                          "-- sleeping for "CFS_DURATION_T" ticks", timeout);
+                lwi = LWI_TIMEOUT_INTR(timeout, NULL, interrupted_request, req);
+                brc = l_wait_event(req->rq_reply_waitq, ptlrpc_check_reply(req),
+                                  &lwi);
+                /* Wait again if we changed deadline */
+        } while ((brc == -ETIMEDOUT) && 
+                 (req->rq_deadline > cfs_time_current_sec()));
+
+        if ((brc == -ETIMEDOUT) && !ptlrpc_expire_one_request(req)) {
+                /* Wait forever for reconnect / replay or failure */
+                lwi = LWI_INTR(interrupted_request, req);
+                rc = l_wait_event(req->rq_reply_waitq, ptlrpc_check_reply(req),
+                                  &lwi);
         }
-        lwi = LWI_TIMEOUT_INTR(timeout, expired_request, interrupted_request,
-                               req);
-        l_wait_event(req->rq_reply_waitq, ptlrpc_check_reply(req), &lwi);
 
         CDEBUG(D_RPCTRACE, "Completed RPC pname:cluuid:pid:xid:nid:opc "
                "%s:%s:%d:"LPU64":%s:%d\n", cfs_curproc_comm(),
@@ -1607,7 +1823,6 @@ restart:
                lustre_msg_get_status(req->rq_reqmsg), req->rq_xid,
                libcfs_nid2str(imp->imp_connection->c_peer.nid),
                lustre_msg_get_opc(req->rq_reqmsg));
-
         spin_lock(&imp->imp_lock);
         list_del_init(&req->rq_list);
         spin_unlock(&imp->imp_lock);
@@ -1617,16 +1832,11 @@ restart:
          * req->rq_receiving_reply is clear and returns. */
         ptlrpc_unregister_reply (req);
 
-        if (req->rq_err)
-                GOTO(out, rc = -EIO);
 
-        /* Resend if we need to, unless we were interrupted. */
-        if (req->rq_resend && !req->rq_intr) {
-                /* ...unless we were specifically told otherwise. */
-                if (req->rq_no_resend)
-                        GOTO(out, rc = -ETIMEDOUT);
-                spin_lock(&imp->imp_lock);
-                goto restart;
+        if (req->rq_err) {
+                DEBUG_REQ(D_RPCTRACE, req, "err rc=%d status=%d", 
+                          rc, req->rq_status);
+                GOTO(out, rc = -EIO);
         }
 
         if (req->rq_intr) {
@@ -1637,6 +1847,15 @@ restart:
                 GOTO(out, rc = -EINTR);
         }
 
+        /* Resend if we need to */
+        if (req->rq_resend) {
+                /* ...unless we were specifically told otherwise. */
+                if (req->rq_no_resend)
+                        GOTO(out, rc = -ETIMEDOUT);
+                spin_lock(&imp->imp_lock);
+                goto restart;
+        }
+
         if (req->rq_timedout) {                 /* non-recoverable timeout */
                 GOTO(out, rc = -ETIMEDOUT);
         }
@@ -1657,25 +1876,33 @@ restart:
 
  out:
         if (req->rq_bulk != NULL) {
-                if (rc >= 0) {
+                while(rc >= 0) {
                         /* success so far.  Note that anything going wrong
                          * with bulk now, is EXTREMELY strange, since the
                          * server must have believed that the bulk
                          * tranferred OK before she replied with success to
                          * me. */
+                        timeoutl = req->rq_deadline - cfs_time_current_sec();
+                        timeout = (timeoutl <= 0) ? CFS_TICK :
+                                cfs_time_seconds(timeoutl);
+
                         lwi = LWI_TIMEOUT(timeout, NULL, NULL);
                         brc = l_wait_event(req->rq_reply_waitq,
                                            !ptlrpc_bulk_active(req->rq_bulk),
                                            &lwi);
                         LASSERT(brc == 0 || brc == -ETIMEDOUT);
+                        if ((brc == -ETIMEDOUT) &&
+                            (req->rq_deadline > cfs_time_current_sec()))
+                                /* Wait again if we changed deadline */
+                                continue;
                         if (brc != 0) {
-                                LASSERT(brc == -ETIMEDOUT);
                                 DEBUG_REQ(D_ERROR, req, "bulk timed out");
                                 rc = brc;
                         } else if (!req->rq_bulk->bd_success) {
                                 DEBUG_REQ(D_ERROR, req, "bulk transfer failed");
                                 rc = -EIO;
                         }
+                        break;
                 }
                 if (rc < 0)
                         ptlrpc_unregister_bulk (req);
@@ -1769,10 +1996,11 @@ int ptlrpc_replay_req(struct ptlrpc_request *req)
         aa->praa_old_state = req->rq_send_state;
         req->rq_send_state = LUSTRE_IMP_REPLAY;
         req->rq_phase = RQ_PHASE_NEW;
-        aa->praa_old_status = lustre_msg_get_status(req->rq_repmsg);
+        if (req->rq_repmsg)
+                aa->praa_old_status = lustre_msg_get_status(req->rq_repmsg);
         req->rq_status = 0;
-
         req->rq_interpret_reply = ptlrpc_replay_interpret;
+
         atomic_inc(&req->rq_import->imp_replay_inflight);
         ptlrpc_request_addref(req); /* ptlrpcd needs a ref */
 
@@ -1804,6 +2032,7 @@ void ptlrpc_abort_inflight(struct obd_import *imp)
                 spin_lock (&req->rq_lock);
                 if (req->rq_import_generation < imp->imp_generation) {
                         req->rq_err = 1;
+                        req->rq_status = -EINTR;
                         ptlrpc_wake_client_req(req);
                 }
                 spin_unlock (&req->rq_lock);
@@ -1818,6 +2047,7 @@ void ptlrpc_abort_inflight(struct obd_import *imp)
                 spin_lock (&req->rq_lock);
                 if (req->rq_import_generation < imp->imp_generation) {
                         req->rq_err = 1;
+                        req->rq_status = -EINTR;
                         ptlrpc_wake_client_req(req);
                 }
                 spin_unlock (&req->rq_lock);
@@ -1854,3 +2084,4 @@ __u64 ptlrpc_sample_next_xid(void)
         return tmp;
 }
 EXPORT_SYMBOL(ptlrpc_sample_next_xid);
+
index 25f1330..9df83cb 100644 (file)
@@ -82,30 +82,71 @@ void reply_in_callback(lnet_event_t *ev)
         struct ptlrpc_request *req = cbid->cbid_arg;
         ENTRY;
 
-        LASSERT (ev->type == LNET_EVENT_PUT ||
-                 ev->type == LNET_EVENT_UNLINK);
-        LASSERT (ev->unlinked);
-        LASSERT (ev->md.start == req->rq_repmsg);
-        LASSERT (ev->offset == 0);
-        LASSERT (ev->mlength <= req->rq_replen);
-        
         DEBUG_REQ((ev->status == 0) ? D_NET : D_ERROR, req,
                   "type %d, status %d", ev->type, ev->status);
+        
+        LASSERT(ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_UNLINK);
+        LASSERT(ev->md.start == req->rq_repbuf);
+        LASSERT(ev->mlength <= req->rq_replen);
 
         spin_lock(&req->rq_lock);
 
-        LASSERT (req->rq_receiving_reply);
         req->rq_receiving_reply = 0;
+        req->rq_early = 0;
+        
+        if (ev->status)
+                goto out_wake;
+        if (ev->type == LNET_EVENT_UNLINK) {
+                req->rq_must_unlink = 0;
+                DEBUG_REQ(D_RPCTRACE, req, "unlink");
+                goto out_wake;
+        }
 
-        if (ev->type == LNET_EVENT_PUT && ev->status == 0) {
+        /* We've set LNET_MD_MANAGE_REMOTE for all outgoing requests
+           for adaptive timeouts' early reply. */
+        LASSERT(ev->md.options & LNET_MD_MANAGE_REMOTE);
+
+        if ((ev->offset == 0) && 
+            lustre_msg_get_flags(req->rq_reqmsg) & MSG_AT_SUPPORT) {
+                /* Early reply */
+                DEBUG_REQ(D_ADAPTTO, req,
+                          "Early reply received: mlen=%u offset=%d replen=%d "
+                          "replied=%d", ev->mlength, ev->offset, req->rq_replen,
+                          req->rq_replied);
+                
+                LASSERT(ev->mlength == lustre_msg_early_size());
+                req->rq_early_count++; /* number received, client side */
+                if (req->rq_replied) 
+                        /* If we already got the real reply, ignore the early 
+                           reply, but signal so we can unlink */
+                        goto out_wake;
+                req->rq_early = 1;
+                req->rq_nob_received = ev->mlength;
+                /* repmsg points to early reply */
+                req->rq_repmsg = req->rq_repbuf;
+                /* And we're still receiving */
+                req->rq_receiving_reply = 1;
+        } else {
+                /* Real reply */
                 req->rq_replied = 1;
                 req->rq_nob_received = ev->mlength;
+                /* repmsg points to real reply */
+                req->rq_repmsg = (struct lustre_msg *)((char *)req->rq_repbuf +
+                                                       ev->offset);
+                /* LNetMDUnlink can't be called under the LNET_LOCK,
+                   so we must unlink in ptlrpc_unregister_reply */
+                DEBUG_REQ(D_INFO, req, 
+                          "reply in flags=%x mlen=%u offset=%d replen=%d",
+                          lustre_msg_get_flags(req->rq_reqmsg),
+                          ev->mlength, ev->offset, req->rq_replen);
         }
 
+        req->rq_import->imp_last_reply_time = cfs_time_current_sec();
+
+out_wake:
         /* NB don't unlock till after wakeup; req can disappear under us
          * since we don't have our own ref */
         ptlrpc_wake_client_req(req);
-
         spin_unlock(&req->rq_lock);
         EXIT;
 }
@@ -212,6 +253,12 @@ void request_in_callback(lnet_event_t *ev)
 #ifdef CRAY_XT3
         req->rq_uid = ev->uid;
 #endif
+        spin_lock_init(&req->rq_lock);
+        sema_init(&req->rq_rs_sem, 1);
+        CFS_INIT_LIST_HEAD(&req->rq_timed_list);
+        atomic_set(&req->rq_refcount, 1);
+        if (ev->type == LNET_EVENT_PUT)
+                DEBUG_REQ(D_RPCTRACE, req, "incoming req");
 
         spin_lock(&service->srv_lock);
 
@@ -237,7 +284,7 @@ void request_in_callback(lnet_event_t *ev)
                 rqbd->rqbd_refcount++;
         }
 
-        list_add_tail(&req->rq_list, &service->srv_request_queue);
+        list_add_tail(&req->rq_list, &service->srv_req_in_queue);
         service->srv_n_queued_reqs++;
 
         /* NB everything can disappear under us once the request
index 66adf40..c5106ba 100644 (file)
@@ -185,8 +185,11 @@ void ptlrpc_deactivate_import(struct obd_import *imp)
  */
 void ptlrpc_invalidate_import(struct obd_import *imp)
 {
+        struct list_head *tmp, *n;
+        struct ptlrpc_request *req;
         struct l_wait_info lwi;
-        int rc;
+        time_t last = 0;
+        int timeout, rc = 0;
 
         atomic_inc(&imp->imp_inval_count);
 
@@ -196,16 +199,41 @@ void ptlrpc_invalidate_import(struct obd_import *imp)
         LASSERT(imp->imp_invalid);
 
         /* wait for all requests to error out and call completion callbacks */
-        lwi = LWI_TIMEOUT_INTERVAL(cfs_timeout_cap(cfs_time_seconds(obd_timeout)), 
-                                   HZ, NULL, NULL);
-        rc = l_wait_event(imp->imp_recovery_waitq,
-                          (atomic_read(&imp->imp_inflight) == 0),
-                          &lwi);
+        spin_lock(&imp->imp_lock);
+        list_for_each_safe(tmp, n, &imp->imp_sending_list) {
+                req = list_entry(tmp, struct ptlrpc_request, rq_list);
+                last = max(last, req->rq_deadline);
+        }
+        list_for_each_safe(tmp, n, &imp->imp_delayed_list) {
+                req = list_entry(tmp, struct ptlrpc_request, rq_list);
+                last = max(last, req->rq_deadline);
+        }
+        spin_unlock(&imp->imp_lock);
 
-        if (rc)
-                CDEBUG(D_HA, "%s: rc = %d waiting for callback (%d != 0)\n",
+        timeout = (int)(last - cfs_time_current_sec());
+        if (timeout > 0) {
+                lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(timeout),
+                                           HZ, NULL, NULL);
+                rc = l_wait_event(imp->imp_recovery_waitq,
+                                  (atomic_read(&imp->imp_inflight) == 0),
+                                  &lwi);
+        }
+
+        if (atomic_read(&imp->imp_inflight)) {
+                CERROR("%s: rc = %d waiting for callback (%d != 0)\n",
                        obd2cli_tgt(imp->imp_obd), rc,
                        atomic_read(&imp->imp_inflight));
+                spin_lock(&imp->imp_lock);
+                list_for_each_safe(tmp, n, &imp->imp_sending_list) {
+                        req = list_entry(tmp, struct ptlrpc_request, rq_list);
+                        DEBUG_REQ(D_ERROR, req, "still on sending list");
+                }
+                list_for_each_safe(tmp, n, &imp->imp_delayed_list) {
+                        req = list_entry(tmp, struct ptlrpc_request, rq_list);
+                        DEBUG_REQ(D_ERROR, req, "still on delayed list");
+                }
+                spin_unlock(&imp->imp_lock);
+        }
 
         obd_import_event(imp->imp_obd, imp, IMP_EVENT_INVALIDATE);
 
@@ -257,6 +285,7 @@ static int import_select_connection(struct obd_import *imp)
 {
         struct obd_import_conn *imp_conn = NULL, *conn;
         struct obd_export *dlmexp;
+        int tried_all = 1;
         ENTRY;
 
         spin_lock(&imp->imp_lock);
@@ -273,44 +302,57 @@ static int import_select_connection(struct obd_import *imp)
                        imp->imp_obd->obd_name,
                        libcfs_nid2str(conn->oic_conn->c_peer.nid),
                        conn->oic_last_attempt);
-                /* Throttle the reconnect rate to once per RECONNECT_INTERVAL */
-                if (cfs_time_before_64(conn->oic_last_attempt + 
-                                       RECONNECT_INTERVAL * HZ,
-                                       cfs_time_current_64())) {
-                        /* If we have never tried this connection since the
-                           the last successful attempt, go with this one */
-                        if (cfs_time_beforeq_64(conn->oic_last_attempt,
-                                               imp->imp_last_success_conn)) {
-                                imp_conn = conn;
-                                break;
-                        }
+                
+                /* Don't thrash connections */
+                if (cfs_time_before_64(cfs_time_current_64(),
+                                     conn->oic_last_attempt + 
+                                     cfs_time_seconds(CONNECTION_SWITCH_MIN))) {
+                        continue;
+                }
 
-                        /* Both of these connections have already been tried
-                           since the last successful connection; just choose the
-                           least recently used */
-                        if (!imp_conn)
-                                imp_conn = conn;
-                        else if (cfs_time_before_64(conn->oic_last_attempt,
-                                                    imp_conn->oic_last_attempt))
-                                imp_conn = conn;
-                } else {
-                        /* Exceptionally unlikely case caused by the node
-                         * booting and attempting to mount lustre faster than
-                         * than RECONNECT_INTERVAL seconds. */
-                        if (unlikely(conn->oic_last_attempt == 0)) {
-                                imp_conn = conn;
-                                break;
-                        }
+                /* If we have not tried this connection since the
+                   the last successful attempt, go with this one */
+                if ((conn->oic_last_attempt == 0) ||
+                    cfs_time_beforeq_64(conn->oic_last_attempt,
+                                       imp->imp_last_success_conn)) {
+                        imp_conn = conn;
+                        tried_all = 0;
+                        break;
                 }
+
+                /* If all of the connections have already been tried
+                   since the last successful connection; just choose the
+                   least recently used */
+                if (!imp_conn)
+                        imp_conn = conn;
+                else if (cfs_time_before_64(conn->oic_last_attempt,
+                                            imp_conn->oic_last_attempt))
+                        imp_conn = conn;
         }
 
         /* if not found, simply choose the current one */
         if (!imp_conn) {
                 LASSERT(imp->imp_conn_current);
                 imp_conn = imp->imp_conn_current;
+                tried_all = 0;
         }
         LASSERT(imp_conn->oic_conn);
 
+        /* If we've tried everything, and we're back to the beginning of the
+           list, wait for LND_TIMEOUT to give the queues a chance to drain. */
+        if (tried_all && (imp->imp_conn_list.next == &imp_conn->oic_item)) {
+                int must_wait;
+                LASSERT(imp_conn->oic_last_attempt);
+                must_wait = LND_TIMEOUT -
+                        (int)cfs_duration_sec(cfs_time_current_64() - 
+                                              imp_conn->oic_last_attempt);
+                imp->imp_at.iat_drain = max(0, must_wait);
+                CWARN("Tried all connections, %lus drain time\n",
+                      imp->imp_at.iat_drain);
+        } else {
+                imp->imp_at.iat_drain = 0;
+        }
+
         imp_conn->oic_last_attempt = cfs_time_current_64();
 
         /* switch connection, don't mind if it's same as the current one */
@@ -455,21 +497,19 @@ int ptlrpc_connect_import(struct obd_import *imp, char *new_uuid)
 
         aa->pcaa_peer_committed = committed_before_reconnect;
         aa->pcaa_initial_connect = initial_connect;
-
         if (aa->pcaa_initial_connect) {
                 spin_lock(&imp->imp_lock);
                 imp->imp_replayable = 1;
                 spin_unlock(&imp->imp_lock);
-                /* On an initial connect, we don't know which one of a
-                   failover server pair is up.  Don't wait long. */
-#ifdef CRAY_XT3
-                request->rq_timeout = max((int)(obd_timeout / 2), 5);
-#else
-                request->rq_timeout = max((int)(obd_timeout / 20), 5);
-#endif
+                if (AT_OFF)
+                        /* AT will use INITIAL_CONNECT_TIMEOUT the first
+                           time, adaptive after that. */
+                        request->rq_timeout = INITIAL_CONNECT_TIMEOUT;
         }
 
-        DEBUG_REQ(D_RPCTRACE, request, "(re)connect request");
+        DEBUG_REQ(D_RPCTRACE, request, "%sconnect request %d",
+                  aa->pcaa_initial_connect ? "initial " : "re", 
+                  imp->imp_conn_cnt);
         ptlrpcd_add_req(request);
         rc = 0;
 out:
@@ -528,6 +568,7 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request,
         ENTRY;
 
         spin_lock(&imp->imp_lock);
+        imp->imp_at.iat_drain = 0;
         if (imp->imp_state == LUSTRE_IMP_CLOSED) {
                 spin_unlock(&imp->imp_lock);
                 RETURN(0);
@@ -633,6 +674,7 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request,
                 imp->imp_last_replay_transno = 0;
                 IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY);
         } else {
+                DEBUG_REQ(D_HA, request, "evicting, flags=%x", msg_flags);
                 imp->imp_remote_handle =
                                 *lustre_msg_get_handle(request->rq_repmsg);
                 IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED);
@@ -745,6 +787,20 @@ finish:
                                 ocd->ocd_brw_size >> CFS_PAGE_SHIFT;
                 }
 
+                if ((ocd->ocd_connect_flags & OBD_CONNECT_AT) &&
+                    (imp->imp_msg_magic == LUSTRE_MSG_MAGIC_V2))
+                        /* We need a per-message support flag, because 
+                           a. we don't know if the incoming connect reply
+                              supports AT or not (in reply_in_callback)
+                              until we unpack it.
+                           b. failovered server means export and flags are gone
+                              (in ptlrpc_send_reply).
+                           Can only be set when we know AT is supported at 
+                           both ends */
+                        imp->imp_msg_flags |= MSG_AT_SUPPORT;
+                else
+                        imp->imp_msg_flags &= ~MSG_AT_SUPPORT;
+
                 LASSERT((cli->cl_max_pages_per_rpc <= PTLRPC_MAX_BRW_PAGES) &&
                         (cli->cl_max_pages_per_rpc > 0));
         }
@@ -991,13 +1047,19 @@ int ptlrpc_disconnect_import(struct obd_import *imp, int noclose)
 
         if (ptlrpc_import_in_recovery(imp)) {
                 struct l_wait_info lwi;
-                cfs_duration_t timeout = cfs_time_seconds(obd_timeout);
+                cfs_duration_t timeout;
+                int idx;
 
+                if (AT_OFF || (idx = import_at_get_index(imp, 
+                                      imp->imp_client->cli_request_portal)) < 0)
+                        timeout = cfs_time_seconds(obd_timeout);
+                else
+                        timeout = cfs_time_seconds(
+                                at_get(&imp->imp_at.iat_service_estimate[idx]));
                 lwi = LWI_TIMEOUT_INTR(cfs_timeout_cap(timeout), 
                                        back_to_sleep, LWI_ON_SIGNAL_NOOP, NULL);
                 rc = l_wait_event(imp->imp_recovery_waitq,
                                   !ptlrpc_import_in_recovery(imp), &lwi);
-
         }
 
         spin_lock(&imp->imp_lock);
@@ -1012,11 +1074,18 @@ int ptlrpc_disconnect_import(struct obd_import *imp, int noclose)
                  * it fails.  We can get through the above with a down server
                  * if the client doesn't know the server is gone yet. */
                 req->rq_no_resend = 1;
-#ifdef CRAY_XT3
-                req->rq_timeout = obd_timeout / 3;
+                
+#ifndef CRAY_XT3
+                /* We want client umounts to happen quickly, no matter the 
+                   server state... */
+                req->rq_timeout = min(req->rq_timeout, INITIAL_CONNECT_TIMEOUT);
 #else
-                req->rq_timeout = 5;
+                /* ... but we always want liblustre clients to nicely 
+                   disconnect, so only use the adaptive value. */
+                if (AT_OFF)
+                        req->rq_timeout = obd_timeout / 3;
 #endif
+
                 IMPORT_SET_STATE(imp, LUSTRE_IMP_CONNECTING);
                 req->rq_send_state =  LUSTRE_IMP_CONNECTING;
                 ptlrpc_req_set_repsize(req, 1, NULL);
@@ -1031,6 +1100,8 @@ out:
         else
                 IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CLOSED);
         memset(&imp->imp_remote_handle, 0, sizeof(imp->imp_remote_handle));
+        /* Try all connections in the future - bz 12758 */ 
+        imp->imp_last_recon = 0;
         spin_unlock(&imp->imp_lock);
 
         RETURN(rc);
@@ -1044,3 +1115,142 @@ void ptlrpc_import_setasync(struct obd_import *imp, int count)
         LNetSetAsync(imp->imp_connection->c_peer, count);
 }
 
+
+/* Adaptive Timeout utils */
+
+/* Bin into timeslices using AT_BINS bins.
+   This gives us a max of the last binlimit*AT_BINS secs without the storage,
+   but still smoothing out a return to normalcy from a slow response.
+   (E.g. remember the maximum latency in each minute of the last 4 minutes.) */
+void at_add(struct adaptive_timeout *at, unsigned int val) {
+        /*unsigned int old = at->at_current;*/
+        time_t now = cfs_time_current_sec();
+
+        LASSERT(at);
+#if 0
+        CDEBUG(D_INFO, "add %u to %p time=%lu tb=%lu v=%u (%u %u %u %u)\n", 
+               val, at, now - at->at_binstart, at->at_binlimit, at->at_current,
+               at->at_hist[0], at->at_hist[1], at->at_hist[2], at->at_hist[3]);
+#endif
+        if (val == 0) 
+                /* 0's don't count, because we never want our timeout to 
+                   drop to 0, and because 0 could mean an error */
+                return;
+
+        spin_lock(&at->at_lock);
+
+        if (unlikely(at->at_binstart == 0)) {
+                /* Special case to remove default from history */
+                at->at_current = val;
+                at->at_worst_ever = val;
+                at->at_worst_time = now;
+                at->at_hist[0] = val;
+                at->at_binstart = now;
+        } else if (now - at->at_binstart < at->at_binlimit ) {
+                /* in bin 0 */
+                at->at_hist[0] = max(val, at->at_hist[0]);
+                at->at_current = max(val, at->at_current);
+        } else {
+                int i, shift;
+                unsigned int maxv = val;
+                /* move bins over */
+                shift = (now - at->at_binstart) / at->at_binlimit;
+                LASSERT(shift > 0);
+                for(i = AT_BINS - 1; i >= 0; i--) {
+                        if (i >= shift) {
+                                at->at_hist[i] = at->at_hist[i - shift];
+                                maxv = max(maxv, at->at_hist[i]);
+                        } else {
+                                at->at_hist[i] = 0;
+                        }
+                }
+                at->at_hist[0] = val;
+                at->at_current = maxv;
+                at->at_binstart += shift * at->at_binlimit;
+        }
+
+        if ((at->at_flags & AT_FLG_MIN) && 
+            (at->at_current < adaptive_timeout_min))
+                at->at_current = adaptive_timeout_min;
+
+        if (at->at_current > at->at_worst_ever) {
+                at->at_worst_ever = at->at_current;
+                at->at_worst_time = now;
+        }
+
+        if (at->at_flags & AT_FLG_NOHIST)
+                /* Only keep last reported val; keeping the rest of the history
+                   for proc only */
+                at->at_current = val;
+
+#if 0
+        if (at->at_current != old)
+                CDEBUG(D_ADAPTTO, "AT change: old=%u new=%u delta=%d (val=%u) "
+                       "hist %u %u %u %u\n",
+                       old, at->at_current, at->at_current - old, val,
+                       at->at_hist[0], at->at_hist[1], at->at_hist[2],
+                       at->at_hist[3]);
+#endif
+        spin_unlock(&at->at_lock);
+}
+
+/* Find the imp_at index for a given portal; assign if space available */
+int import_at_get_index(struct obd_import *imp, int portal) {
+        struct imp_at *at = &imp->imp_at;
+        int i;
+
+        for (i = 0; i < IMP_AT_MAX_PORTALS; i++) {
+                if (at->iat_portal[i] == portal) 
+                        return i;
+                if (at->iat_portal[i] == 0)
+                        /* unused */
+                        break;
+        }
+
+        /* Not found in list, add it under a lock */
+        spin_lock(&imp->imp_lock);
+
+        /* Check unused under lock */
+        for (; i < IMP_AT_MAX_PORTALS; i++) {
+                if (at->iat_portal[i] == portal) 
+                        goto out;
+                if (at->iat_portal[i] == 0)
+                        /* unused */
+                        break;
+        }
+
+        if (i >= IMP_AT_MAX_PORTALS) {
+                CERROR("Tried to use more than %d portals, not enough room "
+                       "in adaptive timeout stats.\n", IMP_AT_MAX_PORTALS);
+                i = -1;
+                goto out;
+        }
+        at->iat_portal[i] = portal;
+
+out:
+        spin_unlock(&imp->imp_lock);
+        return i;
+}
+
+/* Get total expected lock callback time (net + service).
+   Since any early reply will only affect the RPC wait time, and not
+   any local lock timer we set based on the return value here,
+   we should be conservative. */
+int import_at_get_ldlm(struct obd_import *imp) {
+        int idx, tot;
+        
+        if (!imp || !imp->imp_client || AT_OFF)
+                return obd_timeout;
+        
+        tot = at_get(&imp->imp_at.iat_net_latency);
+        idx = import_at_get_index(imp, imp->imp_client->cli_request_portal);
+        if (idx < 0)
+                tot += obd_timeout;
+        else
+                tot += at_get(&imp->imp_at.iat_service_estimate[idx]);
+
+        /* add an arbitrary minimum: 150% + 10 sec */
+        tot += (tot >> 1) + 10;
+        return tot;
+}
+
index 9d9ca6c..3901854 100644 (file)
@@ -153,6 +153,8 @@ void ptlrpc_lprocfs_register(struct proc_dir_entry *root, char *dir,
                              svc_counter_config, "req_qdepth", "reqs");
         lprocfs_counter_init(svc_stats, PTLRPC_REQACTIVE_CNTR,
                              svc_counter_config, "req_active", "reqs");
+        lprocfs_counter_init(svc_stats, PTLRPC_TIMEOUT,
+                             svc_counter_config, "adaptive_timeout", "sec");
         lprocfs_counter_init(svc_stats, PTLRPC_REQBUF_AVAIL_CNTR,
                              svc_counter_config, "reqbuf_avail", "bufs");
         for (i = 0; i < LUSTRE_MAX_OPCODES; i++) {
@@ -385,6 +387,56 @@ ptlrpc_lprocfs_svc_req_history_open(struct inode *inode, struct file *file)
         return 0;
 }
 
+/* See also lprocfs_rd_timeouts */
+static int ptlrpc_lprocfs_rd_timeouts(char *page, char **start, off_t off,
+                                      int count, int *eof, void *data)
+{
+        struct ptlrpc_service *svc = data;
+        unsigned int cur, worst;
+        time_t worstt;
+        struct dhms ts;
+        int rc = 0;
+
+        *eof = 1;
+        cur = at_get(&svc->srv_at_estimate);
+        worst = svc->srv_at_estimate.at_worst_ever;
+        worstt = svc->srv_at_estimate.at_worst_time;
+        s2dhms(&ts, cfs_time_current_sec() - worstt);
+        if (AT_OFF)
+                rc += snprintf(page + rc, count - rc,
+                              "adaptive timeouts off, using obd_timeout %u\n",
+                              obd_timeout);
+        rc += snprintf(page + rc, count - rc, 
+                       "%10s : %ld sec\n", "timebase",
+                       svc->srv_at_estimate.at_binlimit * AT_BINS);
+        rc += snprintf(page + rc, count - rc, 
+                       "%10s : cur %3u  worst %3u (at %ld, "DHMS_FMT" ago) ",
+                       "service", cur, worst, worstt, 
+                       DHMS_VARS(&ts));
+        rc = lprocfs_at_hist_helper(page, count, rc,
+                                    &svc->srv_at_estimate);
+        return rc;
+}               
+
+static int ptlrpc_lprocfs_wr_timeouts(struct file *file, const char *buffer,
+                                      unsigned long count, void *data)
+{
+        struct ptlrpc_service *svc = data;
+        int val, rc;
+         
+        rc = lprocfs_write_helper(buffer, count, &val);
+        if (rc < 0)
+                return rc;
+        if (val <= 0)
+                return -ERANGE;
+
+        spin_lock(&svc->srv_at_estimate.at_lock);
+        svc->srv_at_estimate.at_binlimit = max(1, val / AT_BINS);
+        spin_unlock(&svc->srv_at_estimate.at_lock);
+
+        return count;
+}
+
 void ptlrpc_lprocfs_register_service(struct proc_dir_entry *entry,
                                      struct ptlrpc_service *svc)
 {
@@ -397,6 +449,10 @@ void ptlrpc_lprocfs_register_service(struct proc_dir_entry *entry,
                  .write_fptr = ptlrpc_lprocfs_write_req_history_max,
                  .read_fptr  = ptlrpc_lprocfs_read_req_history_max,
                  .data       = svc},
+                {.name       = "timeouts",
+                 .write_fptr = ptlrpc_lprocfs_wr_timeouts,
+                 .read_fptr  = ptlrpc_lprocfs_rd_timeouts,
+                 .data       = svc},
                 {NULL}
         };
         static struct file_operations req_history_fops = {
index 5a863ec..49bbcde 100644 (file)
@@ -35,7 +35,8 @@
 
 static int ptl_send_buf (lnet_handle_md_t *mdh, void *base, int len,
                          lnet_ack_req_t ack, struct ptlrpc_cb_id *cbid,
-                         struct ptlrpc_connection *conn, int portal, __u64 xid)
+                         struct ptlrpc_connection *conn, int portal, __u64 xid,
+                         unsigned int offset)
 {
         int              rc;
         lnet_md_t         md;
@@ -65,11 +66,11 @@ static int ptl_send_buf (lnet_handle_md_t *mdh, void *base, int len,
                 RETURN (-ENOMEM);
         }
 
-        CDEBUG(D_NET, "Sending %d bytes to portal %d, xid "LPD64"\n",
-               len, portal, xid);
+        CDEBUG(D_NET, "Sending %d bytes to portal %d, xid "LPD64", offset %u\n",
+               len, portal, xid, offset);
 
         rc = LNetPut (conn->c_self, *mdh, ack, 
-                      conn->c_peer, portal, xid, 0, 0);
+                      conn->c_peer, portal, xid, offset, 0);
         if (rc != 0) {
                 int rc2;
                 /* We're going to get an UNLINK event when I unlink below,
@@ -305,11 +306,13 @@ void ptlrpc_unregister_bulk (struct ptlrpc_request *req)
         }
 }
 
-int ptlrpc_send_reply (struct ptlrpc_request *req, int may_be_difficult)
+int ptlrpc_send_reply (struct ptlrpc_request *req, int flags)
 {
         struct ptlrpc_service     *svc = req->rq_rqbd->rqbd_service;
         struct ptlrpc_reply_state *rs = req->rq_reply_state;
         struct ptlrpc_connection  *conn;
+        int                        service_time;
+        unsigned int               offset = 0;
         int                        rc;
 
         /* We must already have a reply buffer (only ptlrpc_error() may be
@@ -320,10 +323,12 @@ int ptlrpc_send_reply (struct ptlrpc_request *req, int may_be_difficult)
         LASSERT (req->rq_repmsg != NULL);
         LASSERT (rs != NULL);
         LASSERT (req->rq_repmsg == rs->rs_msg);
-        LASSERT (may_be_difficult || !rs->rs_difficult);
+        LASSERT ((flags & PTLRPC_REPLY_MAYBE_DIFFICULT) || !rs->rs_difficult);
         LASSERT (rs->rs_cb_id.cbid_fn == reply_out_callback);
         LASSERT (rs->rs_cb_id.cbid_arg == rs);
 
+        /* There may be no rq_export during failover */
+
         if (req->rq_export && req->rq_export->exp_obd &&
             req->rq_export->exp_obd->obd_fail) {
                 /* Failed obd's only send ENODEV */
@@ -339,6 +344,44 @@ int ptlrpc_send_reply (struct ptlrpc_request *req, int may_be_difficult)
         lustre_msg_set_type(req->rq_repmsg, req->rq_type);
         lustre_msg_set_status(req->rq_repmsg, req->rq_status);
         lustre_msg_set_opc(req->rq_repmsg, lustre_msg_get_opc(req->rq_reqmsg));
+        
+        service_time = max_t(int, cfs_time_current_sec() -
+                             req->rq_arrival_time.tv_sec, 1);
+        if (!(flags & PTLRPC_REPLY_EARLY) && 
+            (req->rq_type != PTL_RPC_MSG_ERR)) {
+                int oldse = at_get(&svc->srv_at_estimate);
+                /* early replies and errors don't count toward our service
+                   time estimate */
+                at_add(&svc->srv_at_estimate, service_time);
+                if (service_time > oldse)
+                        DEBUG_REQ(D_ADAPTTO, req,
+                                  "svc %s increased estimate from %d to %d",
+                                  svc->srv_name, oldse, service_time);
+        }
+        /* Report actual service time for client latency calc */
+        lustre_msg_set_service_time(req->rq_repmsg, service_time);
+        /* Report service time estimate for future client reqs */
+        lustre_msg_set_timeout(req->rq_repmsg, at_get(&svc->srv_at_estimate));
+        
+        if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_AT_SUPPORT) {
+                /* early replies go to offset 0, regular replies go after that*/
+                if (flags & PTLRPC_REPLY_EARLY) {
+                        offset = 0;
+                        /* Include a checksum on early replies - must be done
+                           after all other lustre_msg_set_* */
+                        lustre_msg_set_cksum(req->rq_repmsg, 
+                                         lustre_msg_calc_cksum(req->rq_repmsg));
+                } else {
+                        offset = lustre_msg_early_size();
+                }
+        } else {
+                CDEBUG(D_ADAPTTO, "No early reply support: flags=%#x "
+                       "req_flags=%#x magic=%d:%x/%x len=%d\n", 
+                      flags, lustre_msg_get_flags(req->rq_reqmsg),
+                      lustre_msg_is_v1(req->rq_reqmsg),
+                      lustre_msg_get_magic(req->rq_reqmsg),
+                      lustre_msg_get_magic(req->rq_repmsg), req->rq_replen);
+        }
 
         if (req->rq_export == NULL || req->rq_export->exp_connection == NULL)
                 conn = ptlrpc_get_connection(req->rq_peer, req->rq_self, NULL);
@@ -349,16 +392,18 @@ int ptlrpc_send_reply (struct ptlrpc_request *req, int may_be_difficult)
                 CERROR("not replying on NULL connection\n"); /* bug 9635 */
                 return -ENOTCONN;
         }
+        
         atomic_inc (&svc->srv_outstanding_replies);
         ptlrpc_rs_addref(rs);                   /* +1 ref for the network */
-
+        req->rq_sent = cfs_time_current_sec();
+        
         rc = ptl_send_buf (&rs->rs_md_h, req->rq_repmsg, req->rq_replen,
                            rs->rs_difficult ? LNET_ACK_REQ : LNET_NOACK_REQ,
-                           &rs->rs_cb_id, conn,
-                           svc->srv_rep_portal, req->rq_xid);
+                           &rs->rs_cb_id, conn, svc->srv_rep_portal,
+                           req->rq_xid, offset);
         if (rc != 0) {
                 atomic_dec (&svc->srv_outstanding_replies);
-                ptlrpc_rs_decref(rs);
+                ptlrpc_req_drop_rs(req);
         }
         ptlrpc_put_connection(conn);
         return rc;
@@ -428,10 +473,11 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
 
         if (!noreply) {
                 LASSERT (request->rq_replen != 0);
-                if (request->rq_repmsg == NULL)
-                        OBD_ALLOC(request->rq_repmsg, request->rq_replen);
-                if (request->rq_repmsg == NULL)
+                if (request->rq_repbuf == NULL)
+                        OBD_ALLOC(request->rq_repbuf, request->rq_replen);
+                if (request->rq_repbuf == NULL)
                         GOTO(cleanup_bulk, rc = -ENOMEM);
+                request->rq_repmsg = NULL;
 
                 rc = LNetMEAttach(request->rq_reply_portal,/*XXX FIXME bug 249*/
                                   connection->c_peer, request->rq_xid, 0,
@@ -446,6 +492,8 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
         spin_lock(&request->rq_lock);
         /* If the MD attach succeeds, there _will_ be a reply_in callback */
         request->rq_receiving_reply = !noreply;
+        /* We are responsible for unlinking the reply buffer */
+        request->rq_must_unlink = !noreply;
         /* Clear any flags that may be present from previous sends. */
         request->rq_replied = 0;
         request->rq_err = 0;
@@ -456,14 +504,19 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
         spin_unlock(&request->rq_lock);
 
         if (!noreply) {
-                reply_md.start     = request->rq_repmsg;
+                reply_md.start     = request->rq_repbuf;
                 reply_md.length    = request->rq_replen;
-                reply_md.threshold = 1;
-                reply_md.options   = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT;
+                /* Allow multiple early replies */
+                reply_md.threshold = LNET_MD_THRESH_INF;
+                /* Manage remote for early replies */
+                reply_md.options   = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT |
+                        LNET_MD_MANAGE_REMOTE;
                 reply_md.user_ptr  = &request->rq_reply_cbid;
                 reply_md.eq_handle = ptlrpc_eq_h;
 
-                rc = LNetMDAttach(reply_me_h, reply_md, LNET_UNLINK, 
+                /* We must see the unlink callback to unset rq_must_unlink,
+                   so we can't auto-unlink */
+                rc = LNetMDAttach(reply_me_h, reply_md, LNET_RETAIN, 
                                  &request->rq_reply_md_h);
                 if (rc != 0) {
                         CERROR("LNetMDAttach failed: %d\n", rc);
@@ -487,14 +540,22 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
 
         OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_SEND, request->rq_timeout + 5);
 
-        request->rq_sent = CURRENT_SECONDS;
+        request->rq_sent = cfs_time_current_sec();
+        /* We give the server rq_timeout secs to process the req, and 
+           add the network latency for our local timeout. */
+        request->rq_deadline = request->rq_sent + request->rq_timeout + 
+                    ptlrpc_at_get_net_latency(request);
+
         ptlrpc_pinger_sending_on_import(request->rq_import);
+        
+        DEBUG_REQ(D_INFO, request, "send flg=%x",
+                  lustre_msg_get_flags(request->rq_reqmsg));
         rc = ptl_send_buf(&request->rq_req_md_h,
                           request->rq_reqmsg, request->rq_reqlen,
                           LNET_NOACK_REQ, &request->rq_req_cbid,
                           connection,
                           request->rq_request_portal,
-                          request->rq_xid);
+                          request->rq_xid, 0);
         if (rc == 0) {
                 ptlrpc_lprocfs_rpc_sent(request);
                 RETURN(rc);
@@ -506,8 +567,7 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
 
         if (noreply)
                 RETURN(rc);
-        else
-                GOTO(cleanup_me, rc);
+
  cleanup_me:
         /* MEUnlink is safe; the PUT didn't even get off the ground, and
          * nobody apart from the PUT's target has the right nid+XID to
@@ -518,8 +578,9 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
         LASSERT (!request->rq_receiving_reply);
 
  cleanup_repmsg:
-        OBD_FREE(request->rq_repmsg, request->rq_replen);
-        request->rq_repmsg = NULL;
+        OBD_FREE(request->rq_repbuf, request->rq_replen);
+        request->rq_repbuf = NULL;
+        request->rq_repmsg = NULL; //remove
 
  cleanup_bulk:
         if (request->rq_bulk != NULL)
index 6f53063..09ec8e1 100644 (file)
@@ -83,6 +83,15 @@ int lustre_msg_check_version(struct lustre_msg *msg, __u32 version)
         }
 }
 
+/* early reply size */
+int lustre_msg_early_size() {
+        static int size = 0;
+        if (!size) 
+                size = lustre_msg_size(LUSTRE_MSG_MAGIC_V2, 1, NULL);
+        return size;
+}
+EXPORT_SYMBOL(lustre_msg_early_size);
+
 static inline int lustre_msg_size_v1(int count, int *lengths)
 {
         int size;
@@ -244,6 +253,7 @@ static int lustre_pack_request_v2(struct ptlrpc_request *req,
         lustre_init_msg_v2(req->rq_reqmsg, count, lens, bufs);
         lustre_msg_add_version(req->rq_reqmsg, PTLRPC_MSG_VERSION);
         lustre_set_req_swabbed(req, MSG_PTLRPC_BODY_OFF);
+
         return 0;
 }
 
@@ -329,22 +339,31 @@ out:
 }
 
 static int lustre_pack_reply_v1(struct ptlrpc_request *req, int count,
-                                int *lens, char **bufs)
+                                int *lens, char **bufs, int flags)
 {
         struct ptlrpc_reply_state *rs;
         int                        msg_len;
         int                        size;
         ENTRY;
 
-        LASSERT (req->rq_reply_state == NULL);
+        /* Insure only 1 reply at a time: hold sem until ptlrpc_req_drop_rs */
+        down(&req->rq_rs_sem);
+        LASSERT(req->rq_reply_state == NULL);
+        if (req->rq_final && (flags & LPRFL_EARLY_REPLY)) {
+                /* Already packed final, no more early */
+                up(&req->rq_rs_sem); 
+                RETURN(-EALREADY);
+        }
 
         msg_len = lustre_msg_size_v1(count, lens);
         size = sizeof(struct ptlrpc_reply_state) + msg_len;
         OBD_ALLOC(rs, size);
         if (unlikely(rs == NULL)) {
                 rs = lustre_get_emerg_rs(req->rq_rqbd->rqbd_service, size);
-                if (!rs)
+                if (!rs) {
+                        up(&req->rq_rs_sem);
                         RETURN (-ENOMEM);
+                }
         }
         atomic_set(&rs->rs_refcount, 1);        /* 1 ref for rq_reply_state */
         rs->rs_cb_id.cbid_fn = reply_out_callback;
@@ -358,6 +377,9 @@ static int lustre_pack_reply_v1(struct ptlrpc_request *req, int count,
         req->rq_replen = msg_len;
         req->rq_reply_state = rs;
         req->rq_repmsg = rs->rs_msg;
+        if (!flags)
+                req->rq_final = 1; /* checked in lustre_packed_reply */
+
         lustre_init_msg_v1(rs->rs_msg, count, lens, bufs);
 
         PTLRPC_RS_DEBUG_LRU_ADD(rs);
@@ -366,22 +388,31 @@ static int lustre_pack_reply_v1(struct ptlrpc_request *req, int count,
 }
 
 static int lustre_pack_reply_v2(struct ptlrpc_request *req, int count,
-                                int *lens, char **bufs)
+                                int *lens, char **bufs, int flags)
 {
         struct ptlrpc_reply_state *rs;
         int                        msg_len;
         int                        size;
         ENTRY;
 
+        /* Insure only 1 reply at a time: hold sem until ptlrpc_req_drop_rs */
+        down(&req->rq_rs_sem); 
         LASSERT(req->rq_reply_state == NULL);
+        if (req->rq_final && (flags & LPRFL_EARLY_REPLY)) {
+                /* Already packed final, no more early */
+                up(&req->rq_rs_sem); 
+                RETURN(-EALREADY);
+        }
 
         msg_len = lustre_msg_size_v2(count, lens);
         size = sizeof(struct ptlrpc_reply_state) + msg_len;
         OBD_ALLOC(rs, size);
         if (unlikely(rs == NULL)) {
                 rs = lustre_get_emerg_rs(req->rq_rqbd->rqbd_service, size);
-                if (!rs)
+                if (!rs) {
+                        up(&req->rq_rs_sem);
                         RETURN (-ENOMEM);
+                }
         }
         atomic_set(&rs->rs_refcount, 1);        /* 1 ref for rq_reply_state */
         rs->rs_cb_id.cbid_fn = reply_out_callback;
@@ -395,6 +426,10 @@ static int lustre_pack_reply_v2(struct ptlrpc_request *req, int count,
         req->rq_replen = msg_len;
         req->rq_reply_state = rs;
         req->rq_repmsg = rs->rs_msg;
+        if (!(flags & LPRFL_EARLY_REPLY))
+                req->rq_final = 1; /* checked in lustre_packed_reply */
+
+        /* server side, no rq_repbuf */
         lustre_init_msg_v2(rs->rs_msg, count, lens, bufs);
         lustre_msg_add_version(rs->rs_msg, PTLRPC_MSG_VERSION);
         lustre_set_rep_swabbed(req, MSG_PTLRPC_BODY_OFF);
@@ -404,8 +439,8 @@ static int lustre_pack_reply_v2(struct ptlrpc_request *req, int count,
         RETURN(0);
 }
 
-int lustre_pack_reply(struct ptlrpc_request *req, int count, int *lens,
-                      char **bufs)
+int lustre_pack_reply_flags(struct ptlrpc_request *req, int count, int *lens,
+                            char **bufs, int flags)
 {
         int size[] = { sizeof(struct ptlrpc_body) };
 
@@ -421,10 +456,10 @@ int lustre_pack_reply(struct ptlrpc_request *req, int count, int *lens,
         case LUSTRE_MSG_MAGIC_V1:
         case LUSTRE_MSG_MAGIC_V1_SWABBED:
                 return lustre_pack_reply_v1(req, count - 1, lens + 1,
-                                            bufs ? bufs + 1 : NULL);
+                                            bufs ? bufs + 1 : NULL, flags);
         case LUSTRE_MSG_MAGIC_V2:
         case LUSTRE_MSG_MAGIC_V2_SWABBED:
-                return lustre_pack_reply_v2(req, count, lens, bufs);
+                return lustre_pack_reply_v2(req, count, lens, bufs, flags);
         default:
                 LASSERTF(0, "incorrect message magic: %08x\n",
                          req->rq_reqmsg->lm_magic);
@@ -432,6 +467,13 @@ int lustre_pack_reply(struct ptlrpc_request *req, int count, int *lens,
         }
 }
 
+int lustre_pack_reply(struct ptlrpc_request *req, int count, int *lens,
+                      char **bufs)
+{
+        return lustre_pack_reply_flags(req, count, lens, bufs, 0);
+}
+
+
 void *lustre_msg_buf_v1(void *msg, int n, int min_size)
 {
         struct lustre_msg_v1 *m = (struct lustre_msg_v1 *)msg;
@@ -717,7 +759,7 @@ static int lustre_unpack_msg_v2(struct lustre_msg_v2 *m, int len)
                 __swab32s(&m->lm_bufcount);
                 __swab32s(&m->lm_secflvr);
                 __swab32s(&m->lm_repsize);
-                __swab32s(&m->lm_timeout);
+                __swab32s(&m->lm_cksum);
                 CLASSERT(offsetof(typeof(*m), lm_padding_1) != 0);
                 CLASSERT(offsetof(typeof(*m), lm_padding_2) != 0);
                 CLASSERT(offsetof(typeof(*m), lm_padding_3) != 0);
@@ -1406,6 +1448,17 @@ __u32 lustre_msg_get_conn_cnt(struct lustre_msg *msg)
         }
 }
 
+int lustre_msg_is_v1(struct lustre_msg *msg)
+{
+        switch (msg->lm_magic) {
+        case LUSTRE_MSG_MAGIC_V1:
+        case LUSTRE_MSG_MAGIC_V1_SWABBED:
+                return 1;
+        default:
+                return 0;
+        }
+}
+
 __u32 lustre_msg_get_magic(struct lustre_msg *msg)
 {
         switch (msg->lm_magic) {
@@ -1420,6 +1473,88 @@ __u32 lustre_msg_get_magic(struct lustre_msg *msg)
         }
 }
 
+__u32 lustre_msg_get_timeout(struct lustre_msg *msg)
+{
+        switch (msg->lm_magic) {
+        case LUSTRE_MSG_MAGIC_V1:
+        case LUSTRE_MSG_MAGIC_V1_SWABBED:
+                return 0;
+        case LUSTRE_MSG_MAGIC_V2:
+        case LUSTRE_MSG_MAGIC_V2_SWABBED: {
+                struct ptlrpc_body *pb;
+                
+                pb = lustre_msg_buf(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+                if (!pb) {
+                        CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+                        return 0;
+                        
+                }
+                return pb->pb_timeout;
+        }
+        default:
+                CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+                return 0;
+        }
+}
+
+__u32 lustre_msg_get_service_time(struct lustre_msg *msg)
+{
+        switch (msg->lm_magic) {
+        case LUSTRE_MSG_MAGIC_V1:
+        case LUSTRE_MSG_MAGIC_V1_SWABBED:
+                return 0;
+        case LUSTRE_MSG_MAGIC_V2:
+        case LUSTRE_MSG_MAGIC_V2_SWABBED: {
+                struct ptlrpc_body *pb;
+        
+                pb = lustre_msg_buf(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+                if (!pb) {
+                        CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+                        return 0;
+                        
+                }
+                return pb->pb_service_time;
+        }
+        default:
+                CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+                return 0;
+        }
+}
+
+__u32 lustre_msg_get_cksum(struct lustre_msg *msg)
+{
+        switch (msg->lm_magic) {
+        case LUSTRE_MSG_MAGIC_V1:
+        case LUSTRE_MSG_MAGIC_V1_SWABBED:
+                return 0;
+        case LUSTRE_MSG_MAGIC_V2:
+        case LUSTRE_MSG_MAGIC_V2_SWABBED:
+                return msg->lm_cksum;
+        default:
+                CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+                return 0;
+        }
+}
+
+__u32 lustre_msg_calc_cksum(struct lustre_msg *msg)
+{
+        switch (msg->lm_magic) {
+        case LUSTRE_MSG_MAGIC_V1:
+        case LUSTRE_MSG_MAGIC_V1_SWABBED:
+                return 0;
+        case LUSTRE_MSG_MAGIC_V2:
+        case LUSTRE_MSG_MAGIC_V2_SWABBED: {
+                struct ptlrpc_body *pb;
+                pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+                LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+                return crc32_le(~(__u32)0, (char *)pb, sizeof(*pb));
+        }
+        default:
+                CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+                return 0;
+        }
+}
+
 void lustre_msg_set_handle(struct lustre_msg *msg, struct lustre_handle *handle)
 {
         switch (msg->lm_magic) {
@@ -1572,6 +1707,56 @@ void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt)
         }
 }
 
+void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout)
+{
+        switch (msg->lm_magic) {
+        case LUSTRE_MSG_MAGIC_V1:
+                return;
+        case LUSTRE_MSG_MAGIC_V2: {
+                struct ptlrpc_body *pb;
+                
+                pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+                LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+                pb->pb_timeout = timeout;
+                return;
+        }
+        default:
+                LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+        }
+}
+
+void lustre_msg_set_service_time(struct lustre_msg *msg, __u32 service_time)
+{
+        switch (msg->lm_magic) {
+        case LUSTRE_MSG_MAGIC_V1:
+                return;
+        case LUSTRE_MSG_MAGIC_V2: {
+                struct ptlrpc_body *pb;
+
+                pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+                LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+                pb->pb_service_time = service_time;
+                return;
+        }
+        default:
+                LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+        }
+}
+
+void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum)
+{
+        switch (msg->lm_magic) {
+        case LUSTRE_MSG_MAGIC_V1:
+                return;
+        case LUSTRE_MSG_MAGIC_V2:
+                msg->lm_cksum = cksum;
+                return;
+        default:
+                LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+        }
+}
+
+
 /* byte flipping routines for all wire types declared in
  * lustre_idl.h implemented here.
  */
@@ -1588,11 +1773,11 @@ void lustre_swab_ptlrpc_body(struct ptlrpc_body *b)
         __swab32s (&b->pb_flags);
         __swab32s (&b->pb_op_flags);
         __swab32s (&b->pb_conn_cnt);
+        __swab32s (&b->pb_timeout);
+        __swab32s (&b->pb_service_time);
         CLASSERT(offsetof(typeof(*b), pb_padding_1) != 0);
         CLASSERT(offsetof(typeof(*b), pb_padding_2) != 0);
         CLASSERT(offsetof(typeof(*b), pb_padding_3) != 0);
-        CLASSERT(offsetof(typeof(*b), pb_padding_4) != 0);
-        CLASSERT(offsetof(typeof(*b), pb_padding_5) != 0);
 }
 
 void lustre_swab_connect(struct obd_connect_data *ocd)
@@ -2134,8 +2319,8 @@ void _debug_req(struct ptlrpc_request *req, __u32 mask,
         va_start(args, fmt);
         libcfs_debug_vmsg2(data->msg_cdls, data->msg_subsys, mask, data->msg_file,
                            data->msg_fn, data->msg_line, fmt, args,
-                           " req@%p x"LPD64"/t"LPD64" o%d->%s@%s:%d lens %d/%d ref %d fl "
-                           REQ_FLAGS_FMT"/%x/%x rc %d/%d\n",
+                           " req@%p x"LPD64"/t"LPD64" o%d->%s@%s:%d lens %d/%d "
+                           "e %d ref %d fl "REQ_FLAGS_FMT"/%x/%x rc %d/%d\n",
                            req, req->rq_xid, req->rq_transno,
                            req->rq_reqmsg ? lustre_msg_get_opc(req->rq_reqmsg) : -1,
                            req->rq_import ? obd2cli_tgt(req->rq_import->imp_obd) :
@@ -2147,8 +2332,8 @@ void _debug_req(struct ptlrpc_request *req, __u32 mask,
                                 (char *)req->rq_export->exp_connection->c_remote_uuid.uuid : "<?>",
                            (req->rq_import && req->rq_import->imp_client) ?
                                 req->rq_import->imp_client->cli_request_portal : -1,
-                           req->rq_reqlen, req->rq_replen, atomic_read(&req->rq_refcount),
-                           DEBUG_REQ_FLAGS(req),
+                           req->rq_reqlen, req->rq_replen, req->rq_early_count,
+                           atomic_read(&req->rq_refcount), DEBUG_REQ_FLAGS(req),
                            req->rq_reqmsg ? lustre_msg_get_flags(req->rq_reqmsg) : 0,
                            req->rq_repmsg ? lustre_msg_get_flags(req->rq_repmsg) : 0,
                            req->rq_status,
index 52d9574..4f320c2 100644 (file)
@@ -65,9 +65,12 @@ int ptlrpc_ping(struct obd_import *imp)
 
 static void ptlrpc_update_next_ping(struct obd_import *imp)
 {
-        imp->imp_next_ping = cfs_time_shift(
-                                (imp->imp_state == LUSTRE_IMP_DISCON ?
-                                 RECONNECT_INTERVAL : PING_INTERVAL));
+        int time = (imp->imp_state != LUSTRE_IMP_DISCON) ? PING_INTERVAL :
+                /* FIXME should this be limited to LND_TIMEOUT so we don't
+                   build up pings in LND output queues? */
+                max_t(int, CONNECTION_SWITCH_MIN, 
+                      at_get(&imp->imp_at.iat_net_latency));
+        imp->imp_next_ping = cfs_time_shift(time);
 }
 
 void ptlrpc_ping_import_soon(struct obd_import *imp)
@@ -364,7 +367,7 @@ static int ping_evictor_main(void *arg)
                 obd = pet_exp->exp_obd;
                 spin_unlock(&pet_lock);
 
-                expire_time = CURRENT_SECONDS - (3 * obd_timeout / 2);
+                expire_time = cfs_time_current_sec() - PING_EVICT_TIMEOUT;
 
                 CDEBUG(D_HA, "evicting all exports of obd %s older than %ld\n",
                        obd->obd_name, expire_time);
@@ -386,7 +389,7 @@ static int ping_evictor_main(void *arg)
                                               " it.\n", obd->obd_name,
                                               obd_uuid2str(&exp->exp_client_uuid),
                                               obd_export_nid2str(exp),
-                                              (long)(CURRENT_SECONDS -
+                                              (long)(cfs_time_current_sec() -
                                                      exp->exp_last_request_time));
                                 CDEBUG(D_HA, "Last request was at %ld\n",
                                        exp->exp_last_request_time);
index 8648fd7..25ec036 100644 (file)
@@ -115,6 +115,7 @@ enum {
         PTLRPC_REQWAIT_CNTR = 0,
         PTLRPC_REQQDEPTH_CNTR,
         PTLRPC_REQACTIVE_CNTR,
+        PTLRPC_TIMEOUT,
         PTLRPC_REQBUF_AVAIL_CNTR,
         PTLRPC_LAST_CNTR
 };
index 2ecf093..ba4c379 100644 (file)
@@ -176,6 +176,7 @@ EXPORT_SYMBOL(lustre_msg_swabbed);
 EXPORT_SYMBOL(lustre_msg_check_version);
 EXPORT_SYMBOL(lustre_pack_request);
 EXPORT_SYMBOL(lustre_pack_reply);
+EXPORT_SYMBOL(lustre_pack_reply_flags);
 EXPORT_SYMBOL(lustre_shrink_reply);
 EXPORT_SYMBOL(lustre_free_reply_state);
 EXPORT_SYMBOL(lustre_msg_size);
@@ -231,6 +232,7 @@ EXPORT_SYMBOL(lustre_msg_get_last_committed);
 EXPORT_SYMBOL(lustre_msg_get_transno);
 EXPORT_SYMBOL(lustre_msg_get_status);
 EXPORT_SYMBOL(lustre_msg_get_conn_cnt);
+EXPORT_SYMBOL(lustre_msg_is_v1);
 EXPORT_SYMBOL(lustre_msg_get_magic);
 EXPORT_SYMBOL(lustre_msg_set_handle);
 EXPORT_SYMBOL(lustre_msg_set_type);
index af8ffbf..42918aa 100644 (file)
@@ -251,10 +251,20 @@ ptlrpc_server_post_idle_rqbds (struct ptlrpc_service *svc)
         return (-1);
 }
 
+static void ptlrpc_at_timer(unsigned long castmeharder)
+{
+        struct ptlrpc_service *svc = (struct ptlrpc_service *)castmeharder;
+        CDEBUG(D_ADAPTTO, "at timer %s hit at %ld%s\n",
+               svc->srv_name, cfs_time_current_sec(), 
+               list_empty(&svc->srv_at_list) ? ", empty" : ""); 
+        svc->srv_at_check = 1;
+        cfs_waitq_signal(&svc->srv_waitq);
+}
+
 /* @threadname should be 11 characters or less - 3 will be added on */
 struct ptlrpc_service *
 ptlrpc_init_svc(int nbufs, int bufsize, int max_req_size, int max_reply_size,
-                int req_portal, int rep_portal, int watchdog_timeout,
+                int req_portal, int rep_portal, int watchdog_factor,
                 svc_handler_t handler, char *name,
                 cfs_proc_dir_entry_t *proc_entry,
                 svcreq_printfn_t svcreq_printfn, 
@@ -283,7 +293,7 @@ ptlrpc_init_svc(int nbufs, int bufsize, int max_req_size, int max_reply_size,
         service->srv_buf_size = bufsize;
         service->srv_rep_portal = rep_portal;
         service->srv_req_portal = req_portal;
-        service->srv_watchdog_timeout = watchdog_timeout;
+        service->srv_watchdog_factor = watchdog_factor;
         service->srv_handler = handler;
         service->srv_request_history_print_fn = svcreq_printfn;
         service->srv_request_seq = 1;           /* valid seq #s start at 1 */
@@ -305,6 +315,14 @@ ptlrpc_init_svc(int nbufs, int bufsize, int max_req_size, int max_reply_size,
         CFS_INIT_LIST_HEAD(&service->srv_free_rs_list);
         cfs_waitq_init(&service->srv_free_rs_waitq);
 
+        spin_lock_init(&service->srv_at_lock);
+        CFS_INIT_LIST_HEAD(&service->srv_req_in_queue);
+        CFS_INIT_LIST_HEAD(&service->srv_at_list);
+        cfs_timer_init(&service->srv_at_timer, ptlrpc_at_timer, service);
+        /* At SOW, service time should be quick; 10s seems generous. If client 
+           timeout is less than this, we'll be sending an early reply. */
+        at_init(&service->srv_at_estimate, 10, AT_TIMEBASE_DEFAULT, 0);
+
         spin_lock (&ptlrpc_all_services_lock);
         list_add (&service->srv_list, &ptlrpc_all_services);
         spin_unlock (&ptlrpc_all_services_lock);
@@ -334,27 +352,37 @@ failed:
         return NULL;
 }
 
-static void __ptlrpc_server_free_request(struct ptlrpc_request *req)
+static void ptlrpc_server_decref(struct ptlrpc_request *req)
 {
         struct ptlrpc_request_buffer_desc *rqbd = req->rq_rqbd;
 
-        list_del(&req->rq_list);
-
-        if (req->rq_reply_state != NULL) {
-                ptlrpc_rs_decref(req->rq_reply_state);
-                req->rq_reply_state = NULL;
-        }
+        if (!atomic_dec_and_test(&req->rq_refcount))
+                return;
 
         if (req != &rqbd->rqbd_req) {
                 /* NB request buffers use an embedded
                  * req if the incoming req unlinked the
                  * MD; this isn't one of them! */
                 OBD_FREE(req, sizeof(*req));
+        } else {
+                struct ptlrpc_service *svc = rqbd->rqbd_service;
+                /* schedule request buffer for re-use.
+                 * NB I can only do this after I've disposed of their
+                 * reqs; particularly the embedded req */
+                spin_lock(&svc->srv_lock);
+                list_add_tail(&rqbd->rqbd_list, &svc->srv_idle_rqbds);
+                spin_unlock(&svc->srv_lock);
         }
 }
 
-static void
-ptlrpc_server_free_request(struct ptlrpc_request *req)
+static void __ptlrpc_server_free_request(struct ptlrpc_request *req)
+{
+        list_del(&req->rq_list);
+        ptlrpc_req_drop_rs(req);
+        ptlrpc_server_decref(req);
+}
+
+static void ptlrpc_server_free_request(struct ptlrpc_request *req)
 {
         struct ptlrpc_request_buffer_desc *rqbd = req->rq_rqbd;
         struct ptlrpc_service             *svc = rqbd->rqbd_service;
@@ -362,6 +390,11 @@ ptlrpc_server_free_request(struct ptlrpc_request *req)
         struct list_head                  *tmp;
         struct list_head                  *nxt;
 
+        DEBUG_REQ(D_INFO, req, "free req");
+        spin_lock(&svc->srv_at_lock);
+        list_del_init(&req->rq_timed_list);
+        spin_unlock(&svc->srv_at_lock);
+
         spin_lock(&svc->srv_lock);
 
         svc->srv_n_active_reqs--;
@@ -407,11 +440,6 @@ ptlrpc_server_free_request(struct ptlrpc_request *req)
                         }
 
                         spin_lock(&svc->srv_lock);
-
-                        /* schedule request buffer for re-use.
-                         * NB I can only do this after I've disposed of their
-                         * reqs; particularly the embedded req */
-                        list_add_tail(&rqbd->rqbd_list, &svc->srv_idle_rqbds);
                 }
         } else if (req->rq_reply_state && req->rq_reply_state->rs_prealloc) {
                  /* If we are low on memory, we are not interested in
@@ -421,7 +449,6 @@ ptlrpc_server_free_request(struct ptlrpc_request *req)
         }
 
         spin_unlock(&svc->srv_lock);
-
 }
 
 /* This function makes sure dead exports are evicted in a timely manner.
@@ -442,7 +469,7 @@ static void ptlrpc_update_export_timer(struct obd_export *exp, long extra_delay)
            at the exact right moment.  Eventually, all silent exports
            will make it to the top of the list. */
         exp->exp_last_request_time = max(exp->exp_last_request_time,
-                                         (time_t)CURRENT_SECONDS + extra_delay);
+                                         cfs_time_current_sec() + extra_delay);
 
         CDEBUG(D_INFO, "updating export %s at %ld\n",
                exp->exp_client_uuid.uuid,
@@ -477,21 +504,21 @@ static void ptlrpc_update_export_timer(struct obd_export *exp, long extra_delay)
         /* Note - racing to start/reset the obd_eviction timer is safe */
         if (exp->exp_obd->obd_eviction_timer == 0) {
                 /* Check if the oldest entry is expired. */
-                if (CURRENT_SECONDS > (oldest_time +
-                                       (3 * obd_timeout / 2) + extra_delay)) {
+                if (cfs_time_current_sec() > (oldest_time + PING_EVICT_TIMEOUT +
+                                              extra_delay)) {
                         /* We need a second timer, in case the net was down and
                          * it just came back. Since the pinger may skip every
                          * other PING_INTERVAL (see note in ptlrpc_pinger_main),
                          * we better wait for 3. */
-                        exp->exp_obd->obd_eviction_timer = CURRENT_SECONDS +
-                                3 * PING_INTERVAL;
+                        exp->exp_obd->obd_eviction_timer =
+                                cfs_time_current_sec() + 3 * PING_INTERVAL;
                         CDEBUG(D_HA, "%s: Think about evicting %s from %ld\n",
                                exp->exp_obd->obd_name, obd_export_nid2str(exp),
                                oldest_time);
                 }
         } else {
-                if (CURRENT_SECONDS > (exp->exp_obd->obd_eviction_timer +
-                                       extra_delay)) {
+                if (cfs_time_current_sec() > 
+                    (exp->exp_obd->obd_eviction_timer + extra_delay)) {
                         /* The evictor won't evict anyone who we've heard from
                          * recently, so we don't have to check before we start
                          * it. */
@@ -503,6 +530,344 @@ static void ptlrpc_update_export_timer(struct obd_export *exp, long extra_delay)
         EXIT;
 }
 
+static int ptlrpc_check_req(struct ptlrpc_request *req)
+{
+        if (lustre_msg_get_conn_cnt(req->rq_reqmsg) < 
+            req->rq_export->exp_conn_cnt) {
+                DEBUG_REQ(D_ERROR, req,
+                          "DROPPING req from old connection %d < %d",
+                          lustre_msg_get_conn_cnt(req->rq_reqmsg),
+                          req->rq_export->exp_conn_cnt);
+                return -EEXIST;
+        }
+        if (req->rq_export->exp_obd && req->rq_export->exp_obd->obd_fail) {
+             /* Failing over, don't handle any more reqs, send
+                error response instead. */
+                CDEBUG(D_RPCTRACE, "Dropping req %p for failed obd %s\n",
+                       req, req->rq_export->exp_obd->obd_name);
+                req->rq_status = -ENODEV;
+                ptlrpc_error(req);
+                return -ENODEV;
+        }
+        return 0;
+}
+
+/* If closest exipiration is within EARLY_MIN, send early replies to everybody
+   expiring within EARLY_MAX, asking for AT_EXTRA time */
+#define AT_EARLY_MIN  2   /* Min time needed to send an early reply */
+#define AT_EARLY_MAX  5   /* Dont send early replies if deadline is beyond */
+#define AT_EXTRA 10       /* Early replies add this time to client timeout */
+
+static void ptlrpc_at_set_timer(struct ptlrpc_service *svc)
+{
+        struct ptlrpc_request *rq;
+        time_t next;
+
+        spin_lock(&svc->srv_at_lock);
+        if (list_empty(&svc->srv_at_list)) {
+                cfs_timer_disarm(&svc->srv_at_timer);
+                spin_unlock(&svc->srv_at_lock);
+                return;
+        }
+
+        /* Set timer for closest deadline */
+        rq = list_entry(svc->srv_at_list.next, struct ptlrpc_request, 
+                        rq_timed_list);
+        next = rq->rq_deadline - cfs_time_current_sec() - AT_EARLY_MIN;
+        if (next <= 0) 
+                ptlrpc_at_timer((unsigned long)svc);
+        else
+                cfs_timer_arm(&svc->srv_at_timer, cfs_time_shift(next));
+        spin_unlock(&svc->srv_at_lock);
+        CDEBUG(D_ADAPTTO, "armed %s at %+lds\n", svc->srv_name, next);
+}
+
+/* Add rpc to early reply check list */
+static int ptlrpc_at_add_timed(struct ptlrpc_request *req)
+{
+        struct ptlrpc_service *svc = req->rq_rqbd->rqbd_service;
+        struct ptlrpc_request *rq;
+        int found = 0;
+
+        if (AT_OFF) 
+                return(0);
+
+        if ((lustre_msg_get_flags(req->rq_reqmsg) & MSG_AT_SUPPORT) == 0) 
+                return(-ENOSYS);
+        
+        DEBUG_REQ(D_ADAPTTO, req, "add timed %lds", 
+                  req->rq_deadline - cfs_time_current_sec());
+        
+        spin_lock(&svc->srv_at_lock);
+
+        LASSERT(list_empty(&req->rq_timed_list));
+        /* Add to sorted list.  Presumably latest rpcs will have the latest
+           deadlines, so search backward. */
+        list_for_each_entry_reverse(rq, &svc->srv_at_list, rq_timed_list) {
+                if (req->rq_deadline > rq->rq_deadline) {
+                        list_add(&req->rq_timed_list, &rq->rq_timed_list);
+                        found++;
+                        break;
+                }
+        }
+        if (!found)
+                /* Add to front if shortest deadline or list empty */
+                list_add(&req->rq_timed_list, &svc->srv_at_list);
+
+        /* Check if we're the head of the list */
+        found = (svc->srv_at_list.next == &req->rq_timed_list);
+
+        spin_unlock(&svc->srv_at_lock);
+
+        if (found)
+                ptlrpc_at_set_timer(svc);
+
+        return 0;
+}            
+
+static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req, 
+                                      int extra_time)
+{
+        struct ptlrpc_service *svc = req->rq_rqbd->rqbd_service;
+        long deadline = req->rq_deadline - cfs_time_current_sec();
+        int rc;
+        ENTRY;
+                            
+        /* deadline is when the client expects us to reply, margin is the 
+           difference between clients' and servers' expectations */
+        DEBUG_REQ(D_ADAPTTO, req, 
+                  "%ssending early reply (deadline %+lds, margin %+lds) for "
+                  "%d+%d", AT_OFF ? "AT off - not " : "",
+                  deadline, deadline - at_get(&svc->srv_at_estimate), 
+                  at_get(&svc->srv_at_estimate), extra_time);
+
+        if (AT_OFF) 
+                RETURN(0);
+        
+        if ((lustre_msg_get_flags(req->rq_reqmsg) & MSG_AT_SUPPORT) == 0) {
+                CERROR("Wanted to ask client for more time, but no AT "
+                       "support\n");
+                RETURN(-ENOSYS);
+        }
+
+        rc = lustre_pack_reply_flags(req, 1, NULL, NULL, LPRFL_EARLY_REPLY);
+        if (rc) 
+                /* EALREADY means final reply was already packed */
+                RETURN(rc);
+
+        if (extra_time) {
+                /* Fake our processing time into the future to ask the 
+                   clients for some extra amount of time */
+                extra_time += cfs_time_current_sec() -
+                        req->rq_arrival_time.tv_sec;
+                at_add(&svc->srv_at_estimate, extra_time);
+        }
+
+        req->rq_early_count++; /* number sent, server side */
+        rc = ptlrpc_send_reply(req, PTLRPC_REPLY_EARLY);
+
+        if (!rc) {
+                /* Adjust our own deadline to what we told the client */
+                req->rq_deadline = req->rq_arrival_time.tv_sec + 
+                        at_get(&svc->srv_at_estimate);
+        } else {
+                DEBUG_REQ(D_ERROR, req, "Early reply send failed %d", rc);
+        }
+
+        /* Reset reply */
+        req->rq_rep_swab_mask = 0;
+        /* Free the (early) reply state from lustre_pack_reply. 
+           (ptlrpc_send_reply takes it's own rs ref, so this is safe here) */
+        ptlrpc_req_drop_rs(req);
+
+        RETURN(rc);
+}
+
+/* Check if we need to send any early replies, and send them */
+static int ptlrpc_at_check_timed(struct ptlrpc_service *svc)
+{
+        struct ptlrpc_request *rq, *n;
+        struct list_head work_list;
+        time_t now = cfs_time_current_sec();
+        int first, counter = 0;
+        ENTRY;
+
+        if (AT_OFF) 
+                RETURN(0);
+
+        spin_lock(&svc->srv_at_lock);
+        if (svc->srv_at_check == 0) {
+                spin_unlock(&svc->srv_at_lock);
+                RETURN(0);
+        }
+        svc->srv_at_check = 0;
+        
+        if (list_empty(&svc->srv_at_list)) {
+                spin_unlock(&svc->srv_at_lock);
+                RETURN(0);      
+        }
+
+        /* The timer went off, but maybe the nearest rpc already completed. */
+        rq = list_entry(svc->srv_at_list.next, struct ptlrpc_request,
+                        rq_timed_list);
+        first = (int)(rq->rq_deadline - now);
+        if (first > AT_EARLY_MIN) {
+                /* We've still got plenty of time.  Reset the timer. */
+                spin_unlock(&svc->srv_at_lock);
+                ptlrpc_at_set_timer(svc);
+                RETURN(0);      
+        }
+
+        /* We're close to a timeout, and we don't know how much longer the 
+           server will take. Send early replies to everyone expiring soon. */
+        CFS_INIT_LIST_HEAD(&work_list);
+        list_for_each_entry_safe(rq, n, &svc->srv_at_list, rq_timed_list) {
+                if (rq->rq_deadline <= now + AT_EARLY_MAX) {
+                        list_move(&rq->rq_timed_list, &work_list);
+                        counter++;
+                } else {
+                        break;
+                }
+        }
+
+        spin_unlock(&svc->srv_at_lock);
+
+        CDEBUG(D_ADAPTTO, "timeout in %+ds, asking for %d secs on %d early "
+               "replies\n", first, AT_EXTRA, counter);
+        if (first < 0)
+                /* We're already past request deadlines before we even get a 
+                   chance to send early replies */
+                LCONSOLE_WARN("%s: This server is not able to keep up with "
+                              "request traffic (cpu-bound).\n", svc->srv_name);
+
+        /* ptlrpc_server_free_request may delete an entry out of the work
+           list */
+        counter = 0;
+        spin_lock(&svc->srv_at_lock);
+        while (!list_empty(&work_list)) {
+                rq = list_entry(work_list.next, struct ptlrpc_request,
+                                rq_timed_list);
+                list_del_init(&rq->rq_timed_list);
+                /* if the entry is still in the worklist, it hasn't been
+                   deleted, and is safe to take a ref to keep the req around */
+                atomic_inc(&rq->rq_refcount);
+                spin_unlock(&svc->srv_at_lock);
+                if (!rq->rq_final && 
+                    (ptlrpc_at_send_early_reply(rq, AT_EXTRA) == 0)) {
+                        counter++;
+                        ptlrpc_at_add_timed(rq);
+                } else if (rq->rq_final) {
+                        DEBUG_REQ(D_ADAPTTO, rq, "already packed final reply, "
+                                  "not sending early");
+                }
+                ptlrpc_server_decref(rq);
+                spin_lock(&svc->srv_at_lock);
+        }
+        spin_unlock(&svc->srv_at_lock);
+
+        if (!counter)
+                /* Nothing added to timed list, so we have to kick the timer
+                   ourselves. */
+                ptlrpc_at_set_timer(svc);
+
+        RETURN(0);      
+}
+
+/* Handle freshly incoming reqs, check timeout, send early reply if needed,
+   pass on to regular request queue */
+static int
+ptlrpc_server_handle_req_in(struct ptlrpc_service *svc)
+{
+        struct ptlrpc_request *req;
+        __u32                  deadline;
+        int                    rc;
+        ENTRY;
+
+        LASSERT(svc);
+
+        spin_lock(&svc->srv_lock);
+        if (list_empty(&svc->srv_req_in_queue)) {
+                spin_unlock(&svc->srv_lock);
+                RETURN(0);
+        }
+
+        req = list_entry(svc->srv_req_in_queue.next,
+                         struct ptlrpc_request, rq_list);
+        list_del_init (&req->rq_list);
+        /* Consider this still a "queued" request as far as stats are
+           concerned */
+        spin_unlock(&svc->srv_lock);
+        
+#if SWAB_PARANOIA
+        /* Clear request swab mask; this is a new request */
+        req->rq_req_swab_mask = 0;
+#endif
+        rc = lustre_unpack_msg(req->rq_reqmsg, req->rq_reqlen);
+        if (rc != 0) {
+                CERROR ("error unpacking request: ptl %d from %s"
+                        " xid "LPU64"\n", svc->srv_req_portal,
+                        libcfs_id2str(req->rq_peer), req->rq_xid);
+                goto err_req;
+        }
+
+        rc = lustre_unpack_req_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF);
+        if (rc) {
+                CERROR ("error unpacking ptlrpc body: ptl %d from %s"
+                        " xid "LPU64"\n", svc->srv_req_portal,
+                        libcfs_id2str(req->rq_peer), req->rq_xid);
+                goto err_req;
+        }
+
+        rc = -EINVAL;
+        if (lustre_msg_get_type(req->rq_reqmsg) != PTL_RPC_MSG_REQUEST) {
+                CERROR("wrong packet type received (type=%u) from %s\n",
+                       lustre_msg_get_type(req->rq_reqmsg),
+                       libcfs_id2str(req->rq_peer));
+                goto err_req;
+        }
+
+        CDEBUG(D_NET, "got req "LPD64"\n", req->rq_xid);
+        
+        req->rq_export = class_conn2export(
+                lustre_msg_get_handle(req->rq_reqmsg));
+        if (req->rq_export) {
+                rc = ptlrpc_check_req(req);
+                class_export_put(req->rq_export);
+                if (rc) 
+                        goto err_req;
+        }
+
+        /* req_in handling should/must be fast */
+        if (cfs_time_current_sec() - req->rq_arrival_time.tv_sec > 5) 
+                DEBUG_REQ(D_WARNING, req, "Slow req_in handling %lus",
+                          cfs_time_current_sec() - req->rq_arrival_time.tv_sec);
+
+        /* Set rpc server deadline and add it to the timed list */
+        deadline = (lustre_msg_get_flags(req->rq_reqmsg) & MSG_AT_SUPPORT) ? 
+                /* The max time the client expects us to take */
+                lustre_msg_get_timeout(req->rq_reqmsg) : obd_timeout;
+        LASSERT(deadline > 0);
+        req->rq_deadline = req->rq_arrival_time.tv_sec + deadline;
+        
+        ptlrpc_at_add_timed(req);
+
+        /* Move it over to the request processing queue */
+        spin_lock(&svc->srv_lock);
+        list_add_tail(&req->rq_list, &svc->srv_request_queue);
+        cfs_waitq_signal(&svc->srv_waitq);
+        spin_unlock(&svc->srv_lock);
+        RETURN(1);
+
+err_req:
+        spin_lock(&svc->srv_lock);
+        svc->srv_n_queued_reqs--;
+        svc->srv_n_active_reqs++;
+        spin_unlock(&svc->srv_lock);
+        ptlrpc_server_free_request(req);
+
+        RETURN(1);
+}
+
 static int
 ptlrpc_server_handle_request(struct ptlrpc_service *svc,
                              struct ptlrpc_thread *thread)
@@ -512,18 +877,23 @@ ptlrpc_server_handle_request(struct ptlrpc_service *svc,
         struct timeval         work_start;
         struct timeval         work_end;
         long                   timediff;
-        int                    rc, reply;
+        int                    rc;
         ENTRY;
 
         LASSERT(svc);
 
         spin_lock(&svc->srv_lock);
         if (list_empty (&svc->srv_request_queue) ||
-            (svc->srv_n_difficult_replies != 0 &&
+            (
+#ifndef __KERNEL__
+             /* !@%$# liblustre only has 1 thread */
+             svc->srv_n_difficult_replies != 0 &&
+#endif
              svc->srv_n_active_reqs >= (svc->srv_threads_running - 1))) {
-                /* If all the other threads are handling requests, I must
-                 * remain free to handle any 'difficult' reply that might
-                 * block them */
+                /* Don't handle regular requests in the last thread, in order               * remain free to handle any 'difficult' replies (that might
+                 * to handle difficult replies (which might block other threads)
+                 * as well as handle any incoming reqs, early replies, etc. 
+                 * That means we always need at least 2 service threads. */
                 spin_unlock(&svc->srv_lock);
                 RETURN(0);
         }
@@ -545,78 +915,23 @@ ptlrpc_server_handle_request(struct ptlrpc_service *svc,
                                     svc->srv_n_queued_reqs);
                 lprocfs_counter_add(svc->srv_stats, PTLRPC_REQACTIVE_CNTR,
                                     svc->srv_n_active_reqs);
+                lprocfs_counter_add(svc->srv_stats, PTLRPC_TIMEOUT,
+                                    at_get(&svc->srv_at_estimate));
         }
-
-#if SWAB_PARANOIA
-        /* Clear request swab mask; this is a new request */
-        request->rq_req_swab_mask = 0;
-#endif
-        rc = lustre_unpack_msg(request->rq_reqmsg, request->rq_reqlen);
-        if (rc != 0) {
-                CERROR ("error unpacking request: ptl %d from %s"
-                        " xid "LPU64"\n", svc->srv_req_portal,
-                        libcfs_id2str(request->rq_peer), request->rq_xid);
-                goto out_req;
-        }
-
-        rc = lustre_unpack_req_ptlrpc_body(request, MSG_PTLRPC_BODY_OFF);
-        if (rc) {
-                CERROR ("error unpacking ptlrpc body: ptl %d from %s"
-                        " xid "LPU64"\n", svc->srv_req_portal,
-                        libcfs_id2str(request->rq_peer), request->rq_xid);
-                goto out_req;
-        }
-
-        rc = -EINVAL;
-        if (lustre_msg_get_type(request->rq_reqmsg) != PTL_RPC_MSG_REQUEST) {
-                CERROR("wrong packet type received (type=%u) from %s\n",
-                       lustre_msg_get_type(request->rq_reqmsg),
-                       libcfs_id2str(request->rq_peer));
-                goto out_req;
-        }
-
+        
         CDEBUG(D_NET, "got req "LPD64"\n", request->rq_xid);
-
+        
         request->rq_svc_thread = thread;
         request->rq_export = class_conn2export(
                                      lustre_msg_get_handle(request->rq_reqmsg));
 
         if (request->rq_export) {
-                if (lustre_msg_get_conn_cnt(request->rq_reqmsg) <
-                    request->rq_export->exp_conn_cnt) {
-                        DEBUG_REQ(D_ERROR, request,
-                                  "DROPPING req from old connection %d < %d",
-                                  lustre_msg_get_conn_cnt(request->rq_reqmsg),
-                                  request->rq_export->exp_conn_cnt);
-                        goto put_conn;
-                }
-                if (request->rq_export->exp_obd &&
-                    request->rq_export->exp_obd->obd_fail) {
-                        /* Failing over, don't handle any more reqs, send
-                           error response instead. */
-                        CDEBUG(D_RPCTRACE,"Dropping req %p for failed obd %s\n",
-                               request, request->rq_export->exp_obd->obd_name);
-                        request->rq_status = -ENODEV;
-                        ptlrpc_error(request);
+                if (ptlrpc_check_req(request))
                         goto put_conn;
-                }
-
-                ptlrpc_update_export_timer(request->rq_export, timediff/500000);
+                ptlrpc_update_export_timer(request->rq_export, timediff >> 19);
                 export = class_export_rpc_get(request->rq_export);
         }
 
-        /* Discard requests queued for longer than my timeout.  If the
-         * client's timeout is similar to mine, she'll be timing out this
-         * REQ anyway (bug 1502) */
-        if (timediff / 1000000 > (long)obd_timeout) {
-                CERROR("Dropping timed-out opc %d request from %s"
-                       ": %ld seconds old\n",
-                       lustre_msg_get_opc(request->rq_reqmsg),
-                       libcfs_id2str(request->rq_peer),
-                       timediff / 1000000);
-                goto put_rpc_export;
-        }
-
         request->rq_phase = RQ_PHASE_INTERPRET;
 
         CDEBUG(D_RPCTRACE, "Handling RPC pname:cluuid+ref:pid:xid:nid:opc "
@@ -629,8 +944,10 @@ ptlrpc_server_handle_request(struct ptlrpc_service *svc,
                libcfs_id2str(request->rq_peer),
                lustre_msg_get_opc(request->rq_reqmsg));
 
-        rc = svc->srv_handler(request);
+        OBD_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_PAUSE_REQ, obd_fail_val);
 
+        rc = svc->srv_handler(request);
+        
         request->rq_phase = RQ_PHASE_COMPLETE;
 
         CDEBUG(D_RPCTRACE, "Handled RPC pname:cluuid+ref:pid:xid:nid:opc "
@@ -643,7 +960,6 @@ ptlrpc_server_handle_request(struct ptlrpc_service *svc,
                libcfs_id2str(request->rq_peer),
                lustre_msg_get_opc(request->rq_reqmsg));
 
-put_rpc_export:
         if (export != NULL)
                 class_export_rpc_put(export);
 
@@ -651,31 +967,23 @@ put_conn:
         if (request->rq_export != NULL)
                 class_export_put(request->rq_export);
 
-        reply = request->rq_reply_state && request->rq_repmsg;  /* bug 11169 */
+        if (cfs_time_current_sec() > request->rq_deadline) {
+                DEBUG_REQ(D_WARNING, request, "Request took longer than"
+                          " estimated (%+lds); client may timeout.",
+                          cfs_time_current_sec() - request->rq_deadline);
+        }
 
         do_gettimeofday(&work_end);
         timediff = cfs_timeval_sub(&work_end, &work_start, NULL);
-        if (timediff / 1000000 > (long)obd_timeout)
-                CERROR("request "LPU64" opc %u from %s processed in %lds "
-                       "trans "LPU64" rc %d/%d\n",
-                       request->rq_xid, lustre_msg_get_opc(request->rq_reqmsg),
-                       libcfs_id2str(request->rq_peer),
-                       cfs_timeval_sub(&work_end, &request->rq_arrival_time,
-                                       NULL) / 1000000,
-                       reply ? lustre_msg_get_transno(request->rq_repmsg) :
-                               request->rq_transno,
-                       request->rq_status,
-                       reply ? lustre_msg_get_status(request->rq_repmsg): -999);
-        else
-                CDEBUG(D_RPCTRACE,"request "LPU64" opc %u from %s processed in "
-                       "%ldus (%ldus total) trans "LPU64" rc %d/%d\n",
-                       request->rq_xid, lustre_msg_get_opc(request->rq_reqmsg),
-                       libcfs_id2str(request->rq_peer), timediff,
-                       cfs_timeval_sub(&work_end, &request->rq_arrival_time,
-                                       NULL),
-                       request->rq_transno, request->rq_status,
-                       reply ? lustre_msg_get_status(request->rq_repmsg): -999);
-
+        CDEBUG(D_RPCTRACE, "request "LPU64" opc %u from %s processed in "
+               "%ldus (%ldus total) trans "LPU64" rc %d/%d\n",
+               request->rq_xid, lustre_msg_get_opc(request->rq_reqmsg),
+               libcfs_id2str(request->rq_peer), timediff,
+               cfs_timeval_sub(&work_end, &request->rq_arrival_time, NULL),
+               request->rq_repmsg ? lustre_msg_get_transno(request->rq_repmsg) :
+               request->rq_transno, request->rq_status,
+               request->rq_repmsg ? lustre_msg_get_status(request->rq_repmsg): 
+               -999);
         if (svc->srv_stats != NULL) {
                 int opc = opcode_offset(lustre_msg_get_opc(request->rq_reqmsg));
                 if (opc > 0) {
@@ -685,8 +993,13 @@ put_conn:
                                             timediff);
                 }
         }
+        if (request->rq_early_count) {
+                DEBUG_REQ(D_ADAPTTO, request, 
+                          "sent %d early replies before finishing in %lds",
+                          request->rq_early_count,
+                          work_end.tv_sec - request->rq_arrival_time.tv_sec);
+        }
 
-out_req:
         ptlrpc_server_free_request(request);
 
         RETURN(1);
@@ -813,7 +1126,9 @@ liblustre_check_services (void *arg)
                 svc->srv_threads_running++;
 
                 do {
-                        rc = ptlrpc_server_handle_reply(svc);
+                        rc = ptlrpc_server_handle_req_in(svc);
+                        rc |= ptlrpc_server_handle_reply(svc);
+                        rc |= ptlrpc_at_check_timed(svc);
                         rc |= ptlrpc_server_handle_request(svc, NULL);
                         rc |= (ptlrpc_server_post_idle_rqbds(svc) > 0);
                         did_something |= rc;
@@ -882,7 +1197,7 @@ static int ptlrpc_main(void *arg)
 #ifdef WITH_GROUP_INFO
         struct group_info *ginfo = NULL;
 #endif
-        int rc = 0;
+        int counter, rc = 0;
         ENTRY;
 
         ptlrpc_daemonize(data->name);
@@ -936,7 +1251,9 @@ static int ptlrpc_main(void *arg)
          */
         cfs_waitq_signal(&thread->t_ctl_waitq);
 
-        watchdog = lc_watchdog_add(svc->srv_watchdog_timeout, NULL, NULL);
+        watchdog = lc_watchdog_add((AT_OFF ? obd_timeout : 
+                                   at_get(&svc->srv_at_estimate)) * 
+                                   svc->srv_watchdog_factor, NULL, NULL);
 
         spin_lock(&svc->srv_lock);
         svc->srv_threads_running++;
@@ -964,14 +1281,17 @@ static int ptlrpc_main(void *arg)
                                svc->srv_n_difficult_replies == 0) ||
                               (!list_empty(&svc->srv_idle_rqbds) &&
                                svc->srv_rqbd_timeout == 0) ||
-                              !list_empty (&svc->srv_reply_queue) ||
-                              (!list_empty (&svc->srv_request_queue) &&
-                               (svc->srv_n_difficult_replies == 0 ||
-                                svc->srv_n_active_reqs <
-                                (svc->srv_threads_running - 1))),
+                              !list_empty(&svc->srv_req_in_queue) ||
+                              !list_empty(&svc->srv_reply_queue) ||
+                              (!list_empty(&svc->srv_request_queue) &&
+                               (svc->srv_n_active_reqs <
+                                (svc->srv_threads_running - 1))) ||
+                              svc->srv_at_check,
                               &lwi);
 
-                lc_watchdog_touch(watchdog);
+                lc_watchdog_touch_ms(watchdog, (AT_OFF ? obd_timeout : 
+                                     at_get(&svc->srv_at_estimate)) * 
+                                     svc->srv_watchdog_factor);
 
                 ptlrpc_check_rqbd_pool(svc);
 
@@ -981,15 +1301,24 @@ static int ptlrpc_main(void *arg)
                         ptlrpc_start_thread(dev, svc);
                 }
 
-                if (!list_empty (&svc->srv_reply_queue))
+                if (!list_empty(&svc->srv_reply_queue))
                         ptlrpc_server_handle_reply (svc);
 
-                /* only handle requests if there are no difficult replies
-                 * outstanding, or I'm not the last thread handling
-                 * requests */
+                counter = 0;
+                while(!list_empty(&svc->srv_req_in_queue)) {
+                        /* Process all incoming reqs before handling any */
+                        ptlrpc_server_handle_req_in(svc);
+                        /* but limit ourselves in case of flood */ 
+                        if (counter++ > 1000)
+                                break;
+                }
+
+                if (svc->srv_at_check) 
+                        ptlrpc_at_check_timed(svc);
+
+                /* don't handle requests in the last thread */
                 if (!list_empty (&svc->srv_request_queue) &&
-                    (svc->srv_n_difficult_replies == 0 ||
-                     svc->srv_n_active_reqs < (svc->srv_threads_running - 1)))
+                    (svc->srv_n_active_reqs < (svc->srv_threads_running - 1)))
                         ptlrpc_server_handle_request(svc, thread);
 
                 if (!list_empty(&svc->srv_idle_rqbds) &&
@@ -1068,7 +1397,9 @@ int ptlrpc_start_threads(struct obd_device *dev, struct ptlrpc_service *svc)
         int i, rc = 0;
         ENTRY;
 
-        LASSERT(svc->srv_threads_min > 0);
+        /* We require 2 threads min - see note in 
+           ptlrpc_server_handle_request */
+        LASSERT(svc->srv_threads_min >= 2);
         for (i = 0; i < svc->srv_threads_min; i++) {
                 rc = ptlrpc_start_thread(dev, svc);
                 if (rc) {
@@ -1149,6 +1480,8 @@ int ptlrpc_unregister_service(struct ptlrpc_service *service)
         struct list_head     *tmp;
         struct ptlrpc_reply_state *rs, *t;
 
+        cfs_timer_disarm(&service->srv_at_timer);
+
         ptlrpc_stop_all_threads(service);
         LASSERT(list_empty(&service->srv_threads));
 
@@ -1190,7 +1523,7 @@ int ptlrpc_unregister_service(struct ptlrpc_service *service)
 
                 /* Network access will complete in finite time but the HUGE
                  * timeout lets us CWARN for visibility of sluggish NALs */
-                lwi = LWI_TIMEOUT(cfs_time_seconds(300), NULL, NULL);
+                lwi = LWI_TIMEOUT(cfs_time_seconds(FOREVER), NULL, NULL);
                 rc = l_wait_event(service->srv_waitq,
                                   service->srv_nrqbd_receiving == 0,
                                   &lwi);
@@ -1212,6 +1545,17 @@ int ptlrpc_unregister_service(struct ptlrpc_service *service)
         /* purge the request queue.  NB No new replies (rqbds all unlinked)
          * and no service threads, so I'm the only thread noodling the
          * request queue now */
+        while (!list_empty(&service->srv_req_in_queue)) {
+                struct ptlrpc_request *req =
+                        list_entry(service->srv_req_in_queue.next,
+                                   struct ptlrpc_request,
+                                   rq_list);
+
+                list_del(&req->rq_list);
+                service->srv_n_queued_reqs--;
+                service->srv_n_active_reqs++;
+                ptlrpc_server_free_request(req);
+        }
         while (!list_empty(&service->srv_request_queue)) {
                 struct ptlrpc_request *req =
                         list_entry(service->srv_request_queue.next,
@@ -1221,7 +1565,6 @@ int ptlrpc_unregister_service(struct ptlrpc_service *service)
                 list_del(&req->rq_list);
                 service->srv_n_queued_reqs--;
                 service->srv_n_active_reqs++;
-
                 ptlrpc_server_free_request(req);
         }
         LASSERT(service->srv_n_queued_reqs == 0);
@@ -1261,6 +1604,9 @@ int ptlrpc_unregister_service(struct ptlrpc_service *service)
                 OBD_FREE(rs, service->srv_max_reply_size);
         }
 
+        /* In case somebody rearmed this in the meantime */
+        cfs_timer_disarm(&service->srv_at_timer);
+
         OBD_FREE(service, sizeof(*service));
         return 0;
 }
@@ -1274,31 +1620,31 @@ int ptlrpc_service_health_check(struct ptlrpc_service *svc)
 {
         struct ptlrpc_request *request;
         struct timeval         right_now;
-        long                   timediff, cutoff;
-        int                    rc = 0;
+        long                   timediff;
 
         if (svc == NULL)
                 return 0;
 
-        spin_lock(&svc->srv_lock);
-
-        if (list_empty(&svc->srv_request_queue))
-                goto out;
+        do_gettimeofday(&right_now);
 
+        spin_lock(&svc->srv_lock);
+        if (list_empty(&svc->srv_request_queue)) {
+                spin_unlock(&svc->srv_lock);
+                return 0;
+        }
+        
+        /* How long has the next entry been waiting? */
         request = list_entry(svc->srv_request_queue.next,
                              struct ptlrpc_request, rq_list);
-
-        do_gettimeofday(&right_now);
         timediff = cfs_timeval_sub(&right_now, &request->rq_arrival_time, NULL);
+        spin_unlock(&svc->srv_lock);
 
-        cutoff = obd_health_check_timeout;
-
-        if (timediff / 1000000 > cutoff) {
-                rc = -1;
-                goto out;
+        if ((timediff / ONE_MILLION) > (AT_OFF ? obd_timeout * 3/2 : 
+                                        adaptive_timeout_max)) {
+                CERROR("%s: unhealthy - request has been waiting %lds\n",
+                       svc->srv_name, timediff / ONE_MILLION);
+                return (-1);
         }
 
- out:
-        spin_unlock(&svc->srv_lock);
-        return rc;
+        return 0;
 }
index c29cb6f..f9ca561 100644 (file)
@@ -12,7 +12,7 @@ void lustre_assert_wire_constants(void)
 {
         /* Wire protocol assertions generated by 'wirecheck'
          * (make -C lustre/utils newwirecheck)
-         * running on Linux pancake 2.6.18-skas3-v9-pre9 #2 Tue Oct 17 13:08:24 PDT 2006 i686 i686 i3
+         * running on Linux pancake 2.6.18-skas3-v9-pre9 #1 Tue Feb 20 10:37:58 PST 2007 i686 i686 i3
          * with gcc version 3.4.4 */
 
 
@@ -35,6 +35,8 @@ void lustre_assert_wire_constants(void)
                  (long long)MSG_RESENT);
         LASSERTF(MSG_REPLAY == 4, " found %lld\n",
                  (long long)MSG_REPLAY);
+        LASSERTF(MSG_AT_SUPPORT == 8, " found %lld\n",
+                 (long long)MSG_AT_SUPPORT);
         LASSERTF(MSG_CONNECT_RECOVERING == 1, " found %lld\n",
                  (long long)MSG_CONNECT_RECOVERING);
         LASSERTF(MSG_CONNECT_RECONNECT == 2, " found %lld\n",
@@ -298,10 +300,10 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct lustre_msg_v2, lm_repsize));
         LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize));
-        LASSERTF((int)offsetof(struct lustre_msg_v2, lm_timeout) == 16, " found %lld\n",
-                 (long long)(int)offsetof(struct lustre_msg_v2, lm_timeout));
-        LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_timeout) == 4, " found %lld\n",
-                 (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_timeout));
+        LASSERTF((int)offsetof(struct lustre_msg_v2, lm_cksum) == 16, " found %lld\n",
+                 (long long)(int)offsetof(struct lustre_msg_v2, lm_cksum));
+        LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum));
         LASSERTF((int)offsetof(struct lustre_msg_v2, lm_padding_1) == 20, " found %lld\n",
                  (long long)(int)offsetof(struct lustre_msg_v2, lm_padding_1));
         LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_1) == 4, " found %lld\n",
@@ -371,26 +373,26 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct ptlrpc_body, pb_conn_cnt));
         LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_conn_cnt) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_conn_cnt));
-        LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_1) == 68, " found %lld\n",
+        LASSERTF((int)offsetof(struct ptlrpc_body, pb_timeout) == 68, " found %lld\n",
+                 (long long)(int)offsetof(struct ptlrpc_body, pb_timeout));
+        LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_timeout) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_timeout));
+        LASSERTF((int)offsetof(struct ptlrpc_body, pb_service_time) == 72, " found %lld\n",
+                 (long long)(int)offsetof(struct ptlrpc_body, pb_service_time));
+        LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_service_time) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_service_time));
+        LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_1) == 76, " found %lld\n",
                  (long long)(int)offsetof(struct ptlrpc_body, pb_padding_1));
         LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_padding_1) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_padding_1));
-        LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_2) == 72, " found %lld\n",
+        LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_2) == 80, " found %lld\n",
                  (long long)(int)offsetof(struct ptlrpc_body, pb_padding_2));
         LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_padding_2) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_padding_2));
-        LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_3) == 76, " found %lld\n",
+        LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_3) == 84, " found %lld\n",
                  (long long)(int)offsetof(struct ptlrpc_body, pb_padding_3));
         LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_padding_3) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_padding_3));
-        LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_4) == 80, " found %lld\n",
-                 (long long)(int)offsetof(struct ptlrpc_body, pb_padding_4));
-        LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_padding_4) == 4, " found %lld\n",
-                 (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_padding_4));
-        LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_5) == 84, " found %lld\n",
-                 (long long)(int)offsetof(struct ptlrpc_body, pb_padding_5));
-        LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_padding_5) == 4, " found %lld\n",
-                 (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_padding_5));
 
         /* Checks for struct obd_connect_data */
         LASSERTF((int)sizeof(struct obd_connect_data) == 72, " found %lld\n",
@@ -465,7 +467,9 @@ void lustre_assert_wire_constants(void)
         CLASSERT(OBD_CONNECT_FID_CAPA == 0x100000ULL);
         CLASSERT(OBD_CONNECT_OSS_CAPA == 0x200000ULL);
         CLASSERT(OBD_CONNECT_CANCELSET == 0x400000ULL);
-                
+        CLASSERT(OBD_CONNECT_SOM == 0x00800000ULL);
+        CLASSERT(OBD_CONNECT_AT == 0x01000000ULL);
+
         /* Checks for struct obdo */
         LASSERTF((int)sizeof(struct obdo) == 208, " found %lld\n",
                  (long long)(int)sizeof(struct obdo));
index c5aedce..34214c1 100644 (file)
@@ -15,8 +15,8 @@ ONLY=${ONLY:-"$*"}
 #              xml xml xml xml xml xml dumb FIXME
 MOUNTCONFSKIP="10  11  12  13  13b 14  15   18"
 
-# bug number for skipped test:
-ALWAYS_EXCEPT=" $CONF_SANITY_EXCEPT $MOUNTCONFSKIP"
+# bug number for skipped test:                     13369
+ALWAYS_EXCEPT=" $CONF_SANITY_EXCEPT $MOUNTCONFSKIP 34a"
 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
 
 SRCDIR=`dirname $0`
@@ -1066,7 +1066,7 @@ test_31() { # bug 10734
         mount -t lustre 4.3.2.1@tcp:/lustre $MOUNT || true
        cleanup
 }
-run_test 31 "Connect to non-existent node (shouldn't crash)"
+run_test 31 "Connect to non-existent node (returns errors, should not crash)"
 
 test_32a() {
         # XXX - make this run on client-only systems with real hardware on
@@ -1108,6 +1108,7 @@ test_32a() {
        load_modules
 
         # mount a second time to make sure we didnt leave upgrade flag on
+       load_modules
         $TUNEFS --dryrun $TMP/$tdir/mds || error "tunefs failed"
        load_modules
         start mds $TMP/$tdir/mds "-o loop,exclude=lustre-OST0000" || return 12
@@ -1128,32 +1129,36 @@ test_32b() {
        unzip -o -j -d $TMP/$tdir disk1_4.zip || { skip "Cant unzip disk1_4, skipping" && return ; }
        load_modules
        sysctl lnet.debug=$PTLDEBUG
+       NEWNAME=sofia
 
        # writeconf will cause servers to register with their current nids
-       $TUNEFS --writeconf $TMP/$tdir/mds || error "tunefs failed"
+       $TUNEFS --writeconf --fsname=$NEWNAME $TMP/$tdir/mds || error "tunefs failed"
        start mds $TMP/$tdir/mds "-o loop" || return 3
-        local UUID=$(cat $LPROC/mds/lustre-MDT0000/uuid)
+        local UUID=$(cat $LPROC/mds/${NEWNAME}-MDT0000/uuid)
        echo MDS uuid $UUID
        [ "$UUID" == "mdsA_UUID" ] || error "UUID is wrong: $UUID" 
 
-       $TUNEFS --mgsnode=`hostname` $TMP/$tdir/ost1 || error "tunefs failed"
+       $TUNEFS --mgsnode=`hostname` --fsname=$NEWNAME --writeconf $TMP/$tdir/ost1 || error "tunefs failed"
        start ost1 $TMP/$tdir/ost1 "-o loop" || return 5
-        UUID=$(cat $LPROC/obdfilter/lustre-OST0000/uuid)
+        UUID=$(cat $LPROC/obdfilter/${NEWNAME}-OST0000/uuid)
        echo OST uuid $UUID
-       [ "$UUID" == "ost1_UUID" ] || error "UUID is wrong: $UUID" 
+       [ "$UUID" == "ost1_UUID" ] || error "UUID is wrong: $UUID"
 
        echo "OSC changes should succeed:" 
-       $LCTL conf_param lustre-OST0000.osc.max_dirty_mb=15 || return 7
-       $LCTL conf_param lustre-OST0000.failover.node=$NID || return 8
+       $LCTL conf_param ${NEWNAME}-OST0000.osc.max_dirty_mb=15 || return 7
+       $LCTL conf_param ${NEWNAME}-OST0000.failover.node=$NID || return 8
        echo "ok."
        echo "MDC changes should succeed:" 
-       $LCTL conf_param lustre-MDT0000.mdc.max_rpcs_in_flight=9 || return 9
+       $LCTL conf_param ${NEWNAME}-MDT0000.mdc.max_rpcs_in_flight=9 || return 9
        echo "ok."
 
        # MDT and OST should have registered with new nids, so we should have
        # a fully-functioning client
        echo "Check client and old fs contents"
+       OLDFS=$FSNAME
+       FSNAME=$NEWNAME
        mount_client $MOUNT
+       FSNAME=$OLDFS
        set_and_check client "cat $LPROC/mdc/*/max_rpcs_in_flight" "lustre-MDT0000.mdc.max_rpcs_in_flight" || return 11
        [ "$(cksum $MOUNT/passwd | cut -d' ' -f 1,2)" == "2479747619 779" ] || return 12  
        echo "ok."
index 89c2acb..f927fb6 100755 (executable)
@@ -5,7 +5,7 @@ set -e
 #         bug  5493
 ALWAYS_EXCEPT="52 $RECOVERY_SMALL_EXCEPT"
 
-PTLDEBUG=${PTLDEBUG:--1}
+#PTLDEBUG=${PTLDEBUG:--1}
 LUSTRE=${LUSTRE:-`dirname $0`/..}
 . $LUSTRE/tests/test-framework.sh
 init_test_env $@
@@ -253,8 +253,9 @@ test_18b() {
     do_facet client cp $SAMPLE_FILE $f
     sync
     ost_evict_client
-    # allow recovery to complete
-    sleep $((TIMEOUT + 2))
+    # force reconnect
+    df $MOUNT > /dev/null 2>&1
+    sleep 2
     # my understanding is that there should be nothing in the page
     # cache after the client reconnects?     
     rc=0
@@ -593,12 +594,13 @@ test_26b() {      # bug 10140 - evict dead exports by pinger
        OST_FILE=$LPROC/obdfilter/${ost1_svc}/num_exports
         OST_NEXP1="`do_facet ost1 cat $OST_FILE | cut -d' ' -f2`"
        echo starting with $OST_NEXP1 OST and $MDS_NEXP1 MDS exports
+       #force umount a client; exports should get evicted
        zconf_umount `hostname` $MOUNT2 -f
-       # evictor takes up to 2.25x to evict.  But if there's a 
-       # race to start the evictor from various obds, the loser
-       # might have to wait for the next ping.
-       echo Waiting for $(($TIMEOUT * 4)) secs
-       sleep $(($TIMEOUT * 4))
+       # evictor takes PING_EVICT_TIMEOUT + 3 * PING_INTERVAL to evict.  
+        # But if there's a race to start the evictor from various obds, 
+        # the loser might have to wait for the next ping.
+       echo Waiting for $(($TIMEOUT * 8)) secs
+       sleep $(($TIMEOUT * 8))
         OST_NEXP2="`do_facet ost1 cat $OST_FILE | cut -d' ' -f2`"
         MDS_NEXP2="`do_facet mds cat $MDS_FILE | cut -d' ' -f2`"
        echo ending with $OST_NEXP2 OST and $MDS_NEXP2 MDS exports
@@ -646,25 +648,38 @@ run_test 28 "handle error adding new clients (bug 6086)"
 
 test_50() {
        mkdir -p $DIR/$tdir
+       debugsave
+       sysctl -w lnet.debug="-dlmtrace -ha"
        # put a load of file creates/writes/deletes
-       writemany -q $DIR/$tdir/$tfile 0 5 &
+       writemany -a -q $DIR/$tdir/$tfile 0 5 &
        CLIENT_PID=$!
        echo writemany pid $CLIENT_PID
        sleep 10
+       ps $CLIENT_PID > /dev/null || error "Writemany died 1.1"
        FAILURE_MODE="SOFT"
+       $LCTL mark "$TESTNAME fail mds 1"
        fail mds
+       ps $CLIENT_PID > /dev/null || error "Writemany died 2.1"
        # wait for client to reconnect to MDS
        sleep 60
+       ps $CLIENT_PID > /dev/null || error "Writemany died 2.2"
+       $LCTL mark "$TESTNAME fail mds 2"
        fail mds
+       ps $CLIENT_PID > /dev/null || error "Writemany died 3.1"
        sleep 60
+       ps $CLIENT_PID > /dev/null || error "Writemany died 3.2"
+       $LCTL mark "$TESTNAME fail mds 3"
        fail mds
        # client process should see no problems even though MDS went down
+       ps $CLIENT_PID > /dev/null || error "Writemany died 4.1"
        sleep $TIMEOUT
+       ps $CLIENT_PID > /dev/null || error "Writemany died 4.2"
         kill -USR1 $CLIENT_PID
        wait $CLIENT_PID 
        rc=$?
        echo writemany returned $rc
        #these may fail because of eviction due to slow AST response.
+       debugrestore
        return $rc
 }
 run_test 50 "failover MDS under load"
@@ -672,7 +687,7 @@ run_test 50 "failover MDS under load"
 test_51() {
        mkdir -p $DIR/$tdir
        # put a load of file creates/writes/deletes
-       writemany -q $DIR/$tdir/$tfile 0 5 &
+       writemany -a -q $DIR/$tdir/$tfile 0 5 &
        CLIENT_PID=$!
        sleep 1
        FAILURE_MODE="SOFT"
@@ -683,7 +698,10 @@ test_51() {
         for i in $SEQ
           do
           echo failover in $i sec
+         ps $CLIENT_PID > /dev/null || error "Writemany died $i.1"
           sleep $i
+         ps $CLIENT_PID > /dev/null || error "Writemany died $i.1"
+         $LCTL mark "$TESTNAME fail mds $i"
           facet_failover mds
         done
        # client process should see no problems even though MDS went down
@@ -703,6 +721,7 @@ test_52_guts() {
        echo writemany pid $CLIENT_PID
        sleep 10
        FAILURE_MODE="SOFT"
+       $LCTL mark "$TESTNAME fail ost $1"
        fail ost1
        rc=0
        wait $CLIENT_PID || rc=$?
@@ -716,16 +735,16 @@ test_52_guts() {
 
 test_52() {
        mkdir -p $DIR/$tdir
-       test_52_guts
+       test_52_guts 1
        rc=$?
        [ $rc -ne 0 ] && { return $rc; }
        # wait for client to reconnect to OST
        sleep 30
-       test_52_guts
+       test_52_guts 2
        rc=$?
        [ $rc -ne 0 ] && { return $rc; }
        sleep 30
-       test_52_guts
+       test_52_guts 3
        rc=$?
        client_reconnect
        #return $rc
index ab25f6b..8fa5dc1 100755 (executable)
@@ -204,7 +204,7 @@ test_12() {
 #define OBD_FAIL_LDLM_ENQUEUE            0x302
     do_facet mds sysctl -w lustre.fail_loc=0x80000302
     facet_failover mds
-    df $MOUNT || return 1
+    df $MOUNT || { kill -USR1 $MULTIPID  && return 1; }
     do_facet mds sysctl -w lustre.fail_loc=0
 
     ls $DIR/$tfile
index da1b1d3..5ddab23 100755 (executable)
@@ -916,13 +916,19 @@ run_test 43 "mds osc import failure during recovery; don't LBUG"
 test_44() {
     mdcdev=`awk '/-mdc-/ {print $1}' $LPROC/devices`
     [ "$mdcdev" ] || exit 2
+    # adaptive timeouts slow this way down
+    MDS_AT_MAX=$(do_facet mds "sysctl -n lustre.adaptive_max")
+    do_facet mds "sysctl -w lustre.adaptive_max=40"
     for i in `seq 1 10`; do
+       echo "$i of 10 ($(date +%s))"
+       do_facet mds "grep service $LPROC/mdt/MDS/mds/timeouts"
        #define OBD_FAIL_TGT_CONN_RACE     0x701
        do_facet mds "sysctl -w lustre.fail_loc=0x80000701"
        $LCTL --device $mdcdev recover
        df $MOUNT
     done
     do_facet mds "sysctl -w lustre.fail_loc=0"
+    do_facet mds "sysctl -w lustre.adaptive_max=$MDS_AT_MAX"
     return 0
 }
 run_test 44 "race in target handle connect"
@@ -931,6 +937,8 @@ test_44b() {
     mdcdev=`awk '/-mdc-/ {print $1}' $LPROC/devices`
     [ "$mdcdev" ] || exit 2
     for i in `seq 1 10`; do
+       echo "$i of 10 ($(date +%s))"
+       do_facet mds "grep service $LPROC/mdt/MDS/mds/timeouts"
        #define OBD_FAIL_TGT_DELAY_RECONNECT 0x704
        do_facet mds "sysctl -w lustre.fail_loc=0x80000704"
        $LCTL --device $mdcdev recover
@@ -1130,7 +1138,7 @@ test_60() {
 run_test 60 "test llog post recovery init vs llog unlink"
 
 #test race  llog recovery thread vs llog cleanup
-test_61() {
+test_61a() {
     mkdir $DIR/$tdir
     createmany -o $DIR/$tdir/$tfile-%d 800
     replay_barrier ost1 
@@ -1145,7 +1153,7 @@ test_61() {
     $CHECKSTAT -t file $DIR/$tdir/$tfile-* && return 1
     rmdir $DIR/$tdir
 }
-run_test 61 "test race llog recovery vs llog cleanup"
+run_test 61a "test race llog recovery vs llog cleanup"
 
 #test race  mds llog sync vs llog cleanup
 test_61b() {
@@ -1169,6 +1177,138 @@ test_61c() {
 }
 run_test 61c "test race mds llog sync vs llog cleanup"
 
+
+at_start() #bug 3055
+{
+    if [ -z "$ATOLDBASE" ]; then
+       ATOLDBASE=$(do_facet mds "grep timebase $LPROC/mdt/MDS/mds/timeouts" | awk '{print $3}' )
+        # speed up the timebase so we can check decreasing AT
+       do_facet mds "echo 8 >> $LPROC/mdt/MDS/mds/timeouts"
+       do_facet mds "echo 8 >> $LPROC/mdt/MDS/mds_readpage/timeouts"
+       do_facet mds "echo 8 >> $LPROC/mdt/MDS/mds_setattr/timeouts"
+       do_facet ost1 "echo 8 >> $LPROC/ost/OSS/ost/timeouts"
+    fi
+}
+
+test_65() #bug 3055
+{
+    at_start
+    $LCTL dk > /dev/null
+    # slow down a request
+    sysctl -w lustre.fail_val=30000
+#define OBD_FAIL_PTLRPC_PAUSE_REQ        0x50a
+    sysctl -w lustre.fail_loc=0x8000050a
+    createmany -o $DIR/$tfile 10 > /dev/null
+    unlinkmany $DIR/$tfile 10 > /dev/null
+    # check for log message
+    $LCTL dk | grep "Early reply #" || error "No early reply" 
+    # client should show 30s timeouts
+    grep portal $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts
+    sleep 9
+    grep portal $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts
+}
+run_test 65 "AT: verify early replies"
+
+test_66a() #bug 3055
+{
+    at_start
+    grep "portal 12" $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts
+    # adjust 5s at a time so no early reply is sent (within deadline)
+    do_facet mds "sysctl -w lustre.fail_val=5000"
+#define OBD_FAIL_PTLRPC_PAUSE_REQ        0x50a
+    do_facet mds "sysctl -w lustre.fail_loc=0x8000050a"
+    createmany -o $DIR/$tfile 20 > /dev/null
+    unlinkmany $DIR/$tfile 20 > /dev/null
+    grep "portal 12" $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts
+    do_facet mds "sysctl -w lustre.fail_val=10000"
+    do_facet mds "sysctl -w lustre.fail_loc=0x8000050a"
+    createmany -o $DIR/$tfile 20 > /dev/null
+    unlinkmany $DIR/$tfile 20 > /dev/null
+    grep "portal 12" $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts
+    do_facet mds "sysctl -w lustre.fail_loc=0"
+    sleep 9
+    createmany -o $DIR/$tfile 20 > /dev/null
+    unlinkmany $DIR/$tfile 20 > /dev/null
+    grep portal $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts | grep "portal 12"
+    CUR=$(awk '/portal 12/ {print $5}' $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts)
+    WORST=$(awk '/portal 12/ {print $7}' $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts)
+    echo "Current MDT timeout $CUR, worst $WORST"
+    [ $CUR -lt $WORST ] || error "Current $CUR should be less than worst $WORST" 
+}
+run_test 66a "AT: verify MDT service time adjusts with no early replies"
+
+test_66b() #bug 3055
+{
+    at_start
+    ORIG=$(awk '/network/ {print $4}' $LPROC/mdc/lustre-*/timeouts)
+    sysctl -w lustre.fail_val=$(($ORIG + 5))
+#define OBD_FAIL_PTLRPC_PAUSE_REP      0x50c
+    sysctl -w lustre.fail_loc=0x50c
+    ls $DIR/$tfile > /dev/null 2>&1
+    sysctl -w lustre.fail_loc=0
+    CUR=$(awk '/network/ {print $4}' $LPROC/mdc/${FSNAME}-*/timeouts)
+    WORST=$(awk '/network/ {print $6}' $LPROC/mdc/${FSNAME}-*/timeouts)
+    echo "network timeout orig $ORIG, cur $CUR, worst $WORST"
+    [ $WORST -gt $ORIG ] || error "Worst $WORST should be worse than orig $ORIG" 
+}
+run_test 66b "AT: verify net latency adjusts"
+
+test_67a() #bug 3055
+{
+    at_start
+    CONN1=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats)
+    # sleeping threads may drive values above this
+    do_facet ost1 "sysctl -w lustre.fail_val=400"
+#define OBD_FAIL_PTLRPC_PAUSE_REQ    0x50a
+    do_facet ost1 "sysctl -w lustre.fail_loc=0x50a"
+    createmany -o $DIR/$tfile 20 > /dev/null
+    unlinkmany $DIR/$tfile 20 > /dev/null
+    do_facet ost1 "sysctl -w lustre.fail_loc=0"
+    CONN2=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats)
+    ATTEMPTS=$(($CONN2 - $CONN1))
+    echo "$ATTEMPTS osc reconnect attemps on gradual slow"
+    [ $ATTEMPTS -gt 0 ] && error "AT should have prevented reconnect"
+    return 0
+}
+run_test 67a "AT: verify slow request processing doesn't induce reconnects"
+
+test_67b() #bug 3055
+{
+    at_start
+    CONN1=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats)
+#define OBD_FAIL_OST_PAUSE_CREATE        0x223
+    do_facet ost1 "sysctl -w lustre.fail_val=20000"
+    do_facet ost1 "sysctl -w lustre.fail_loc=0x80000223"
+    cp /etc/profile $DIR/$tfile || error "cp failed"
+    client_reconnect
+    cat $LPROC/ost/OSS/ost_create/timeouts
+    log "phase 2"
+    CONN2=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats)
+    ATTEMPTS=$(($CONN2 - $CONN1))
+    echo "$ATTEMPTS osc reconnect attemps on instant slow"
+    # do it again; should not timeout
+    do_facet ost1 "sysctl -w lustre.fail_loc=0x80000223"
+    cp /etc/profile $DIR/$tfile || error "cp failed"
+    do_facet ost1 "sysctl -w lustre.fail_loc=0"
+    client_reconnect
+    cat $LPROC/ost/OSS/ost_create/timeouts
+    CONN3=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats)
+    ATTEMPTS=$(($CONN3 - $CONN2))
+    echo "$ATTEMPTS osc reconnect attemps on 2nd slow"
+    [ $ATTEMPTS -gt 0 ] && error "AT should have prevented reconnect"
+    return 0
+}
+run_test 67b "AT: verify instant slowdown doesn't induce reconnects"
+
+if [ -n "$ATOLDBASE" ]; then
+    do_facet mds "echo $ATOLDBASE >> $LPROC/mdt/MDS/mds/timeouts" 
+    do_facet mds "echo $ATOLDBASE >> $LPROC/mdt/MDS/mds_readpage/timeouts"
+    do_facet mds "echo $ATOLDBASE >> $LPROC/mdt/MDS/mds_setattr/timeouts"
+    do_facet ost1 "echo $ATOLDBASE >> $LPROC/ost/OSS/ost/timeouts"
+fi
+# end of AT tests includes above lines
+
+
 equals_msg `basename $0`: test complete, cleaning up
 check_and_cleanup_lustre
 [ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG || true
index f09d249..2a7c03a 100644 (file)
@@ -783,7 +783,7 @@ test_27a() {
        $SETSTRIPE $DIR/d27/f0 65536 0 1 || error "lstripe failed"
        $CHECKSTAT -t file $DIR/d27/f0 || error "checkstat failed"
        pass
-       log "== test_27b: write to one stripe file ========================="
+       log "== test 27b: write to one stripe file ========================="
        cp /etc/hosts $DIR/d27/f0 || error
 }
 run_test 27a "one stripe file =================================="
@@ -795,7 +795,7 @@ test_27c() {
        [ `$GETSTRIPE $DIR/d27/f01 | grep -A 10 obdidx | wc -l` -eq 4 ] ||
                error "two-stripe file doesn't have two stripes"
        pass
-       log "== test_27d: write to two stripe file file f01 ================"
+       log "== test 27d: write to two stripe file file f01 ================"
        dd if=/dev/zero of=$DIR/d27/f01 bs=4k count=4 || error "dd failed"
 }
 run_test 27c "create two stripe file f01 ======================="
@@ -3211,7 +3211,7 @@ test_99b() {
        TOIGNORE=$(find . -type l -printf '-I %f\n' -o \
                        ! -perm +4 -printf '-I %f\n')
        $RUNAS cvs -d $DIR/d99cvsroot import -m "nomesg" $TOIGNORE \
-               d99reposname vtag rtag || error "cvs import failed"
+               d99reposname vtag rtag > /dev/null || error "cvs import failed"
 }
 run_test 99b "cvs import ======================================="
 
@@ -3221,7 +3221,7 @@ test_99c() {
        mkdir -p $DIR/d99reposname || error "mkdir $DIR/d99reposname failed"
        chown $RUNAS_ID $DIR/d99reposname || \
                error "chown $DIR/d99reposname failed"
-       $RUNAS cvs -d $DIR/d99cvsroot co d99reposname || \
+       $RUNAS cvs -d $DIR/d99cvsroot co d99reposname > /dev/null || \
                error "cvs co d99reposname failed"
 }
 run_test 99c "cvs checkout ====================================="
@@ -3622,20 +3622,20 @@ test_103 () {
     echo "performing cp ..."
     run_acl_subtest cp || error
     echo "performing getfacl-noacl..."
-    run_acl_subtest getfacl-noacl || error
+    run_acl_subtest getfacl-noacl > /dev/null || error
     echo "performing misc..."
-    run_acl_subtest misc || error
+    run_acl_subtest misc > /dev/null || error
 #    XXX add back permission test when we support supplementary groups.
 #    echo "performing permissions..."
 #    run_acl_subtest permissions || error
     echo "performing setfacl..."
-    run_acl_subtest setfacl || error
+    run_acl_subtest setfacl > /dev/null || error
 
     # inheritance test got from HP
     echo "performing inheritance..."
     cp $SAVE_PWD/acl/make-tree . || error
     chmod +x make-tree || error
-    run_acl_subtest inheritance || error
+    run_acl_subtest inheritance > /dev/null || error
     rm -f make-tree
 
     cd $SAVE_PWD
index 5cd1286..3b87e09 100644 (file)
@@ -4,7 +4,7 @@ set -e
 
 ONLY=${ONLY:-"$*"}
 # bug number for skipped test:  3192
-ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"14b"}
+ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"14b $SANITYN_EXCEPT"}
 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
 
 [ "$SLOW" = "no" ] && EXCEPT="$EXCEPT 16"
index 7c5dc63..8fed513 100644 (file)
@@ -225,8 +225,8 @@ start() {
     do_facet ${facet} mount -t lustre $@ ${device} ${MOUNT%/*}/${facet} 
     RC=${PIPESTATUS[0]}
     if [ $RC -ne 0 ]; then
-        echo mount -t lustre $@ ${device} ${MOUNT%/*}/${facet} 
-        echo Start of ${device} on ${facet} failed ${RC}
+        echo "mount -t lustre $@ ${device} ${MOUNT%/*}/${facet}" 
+        echo "Start of ${device} on ${facet} failed ${RC}"
     else 
         do_facet ${facet} sync
         label=$(do_facet ${facet} "e2label ${device}")
@@ -586,7 +586,7 @@ do_facet() {
     shift
     HOST=`facet_active_host $facet`
     [ -z $HOST ] && echo No host defined for facet ${facet} && exit 1
-    do_node $HOST $@
+    do_node $HOST "$@"
 }
 
 add() {
index dfd2604..a843db8 100644 (file)
@@ -109,7 +109,7 @@ check_lustre_msg_v2(void)
         CHECK_MEMBER(lustre_msg_v2, lm_secflvr);
         CHECK_MEMBER(lustre_msg_v2, lm_magic);
         CHECK_MEMBER(lustre_msg_v2, lm_repsize);
-        CHECK_MEMBER(lustre_msg_v2, lm_timeout);
+        CHECK_MEMBER(lustre_msg_v2, lm_cksum);
         CHECK_MEMBER(lustre_msg_v2, lm_padding_1);
         CHECK_MEMBER(lustre_msg_v2, lm_padding_2);
         CHECK_MEMBER(lustre_msg_v2, lm_padding_3);
@@ -133,11 +133,11 @@ check_ptlrpc_body(void)
         CHECK_MEMBER(ptlrpc_body, pb_flags);
         CHECK_MEMBER(ptlrpc_body, pb_op_flags);
         CHECK_MEMBER(ptlrpc_body, pb_conn_cnt);
+        CHECK_MEMBER(ptlrpc_body, pb_timeout);
+        CHECK_MEMBER(ptlrpc_body, pb_service_time);
         CHECK_MEMBER(ptlrpc_body, pb_padding_1);
         CHECK_MEMBER(ptlrpc_body, pb_padding_2);
         CHECK_MEMBER(ptlrpc_body, pb_padding_3);
-        CHECK_MEMBER(ptlrpc_body, pb_padding_4);
-        CHECK_MEMBER(ptlrpc_body, pb_padding_5);
 }
 
 static void check_obd_connect_data(void)
@@ -1069,6 +1069,7 @@ main(int argc, char **argv)
         CHECK_VALUE(MSG_LAST_REPLAY);
         CHECK_VALUE(MSG_RESENT);
         CHECK_VALUE(MSG_REPLAY);
+        CHECK_VALUE(MSG_AT_SUPPORT);
 
         CHECK_VALUE(MSG_CONNECT_RECOVERING);
         CHECK_VALUE(MSG_CONNECT_RECONNECT);
index e07146f..992292c 100644 (file)
@@ -28,7 +28,7 @@ void lustre_assert_wire_constants(void)
 {
         /* Wire protocol assertions generated by 'wirecheck'
          * (make -C lustre/utils newwirecheck)
-         * running on Linux pancake 2.6.18-skas3-v9-pre9 #2 Tue Oct 17 13:08:24 PDT 2006 i686 i686 i3
+         * running on Linux pancake 2.6.18-skas3-v9-pre9 #1 Tue Feb 20 10:37:58 PST 2007 i686 i686 i3
          * with gcc version 3.4.4 */
 
 
@@ -51,6 +51,8 @@ void lustre_assert_wire_constants(void)
                  (long long)MSG_RESENT);
         LASSERTF(MSG_REPLAY == 4, " found %lld\n",
                  (long long)MSG_REPLAY);
+        LASSERTF(MSG_AT_SUPPORT == 8, " found %lld\n",
+                 (long long)MSG_AT_SUPPORT);
         LASSERTF(MSG_CONNECT_RECOVERING == 1, " found %lld\n",
                  (long long)MSG_CONNECT_RECOVERING);
         LASSERTF(MSG_CONNECT_RECONNECT == 2, " found %lld\n",
@@ -314,10 +316,10 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct lustre_msg_v2, lm_repsize));
         LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize));
-        LASSERTF((int)offsetof(struct lustre_msg_v2, lm_timeout) == 16, " found %lld\n",
-                 (long long)(int)offsetof(struct lustre_msg_v2, lm_timeout));
-        LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_timeout) == 4, " found %lld\n",
-                 (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_timeout));
+        LASSERTF((int)offsetof(struct lustre_msg_v2, lm_cksum) == 16, " found %lld\n",
+                 (long long)(int)offsetof(struct lustre_msg_v2, lm_cksum));
+        LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum));
         LASSERTF((int)offsetof(struct lustre_msg_v2, lm_padding_1) == 20, " found %lld\n",
                  (long long)(int)offsetof(struct lustre_msg_v2, lm_padding_1));
         LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_1) == 4, " found %lld\n",
@@ -387,26 +389,26 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct ptlrpc_body, pb_conn_cnt));
         LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_conn_cnt) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_conn_cnt));
-        LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_1) == 68, " found %lld\n",
+        LASSERTF((int)offsetof(struct ptlrpc_body, pb_timeout) == 68, " found %lld\n",
+                 (long long)(int)offsetof(struct ptlrpc_body, pb_timeout));
+        LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_timeout) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_timeout));
+        LASSERTF((int)offsetof(struct ptlrpc_body, pb_service_time) == 72, " found %lld\n",
+                 (long long)(int)offsetof(struct ptlrpc_body, pb_service_time));
+        LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_service_time) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_service_time));
+        LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_1) == 76, " found %lld\n",
                  (long long)(int)offsetof(struct ptlrpc_body, pb_padding_1));
         LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_padding_1) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_padding_1));
-        LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_2) == 72, " found %lld\n",
+        LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_2) == 80, " found %lld\n",
                  (long long)(int)offsetof(struct ptlrpc_body, pb_padding_2));
         LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_padding_2) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_padding_2));
-        LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_3) == 76, " found %lld\n",
+        LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_3) == 84, " found %lld\n",
                  (long long)(int)offsetof(struct ptlrpc_body, pb_padding_3));
         LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_padding_3) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_padding_3));
-        LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_4) == 80, " found %lld\n",
-                 (long long)(int)offsetof(struct ptlrpc_body, pb_padding_4));
-        LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_padding_4) == 4, " found %lld\n",
-                 (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_padding_4));
-        LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_5) == 84, " found %lld\n",
-                 (long long)(int)offsetof(struct ptlrpc_body, pb_padding_5));
-        LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_padding_5) == 4, " found %lld\n",
-                 (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_padding_5));
 
         /* Checks for struct obd_connect_data */
         LASSERTF((int)sizeof(struct obd_connect_data) == 72, " found %lld\n",
@@ -481,7 +483,9 @@ void lustre_assert_wire_constants(void)
         CLASSERT(OBD_CONNECT_FID_CAPA == 0x100000ULL);
         CLASSERT(OBD_CONNECT_OSS_CAPA == 0x200000ULL);
         CLASSERT(OBD_CONNECT_CANCELSET == 0x400000ULL);
+        CLASSERT(OBD_CONNECT_SOM == 0x00800000ULL);
+        CLASSERT(OBD_CONNECT_AT == 0x01000000ULL);
+
         /* Checks for struct obdo */
         LASSERTF((int)sizeof(struct obdo) == 208, " found %lld\n",
                  (long long)(int)sizeof(struct obdo));