* Recommended e2fsprogs version: 1.40.2-cfs1
* Note that reiserfs quotas are disabled on SLES 10 in this kernel.
+Severity : enhancement
+Bugzilla : 3055
+Description: Adaptive timeouts
+Details : RPC timeouts adapt to changing server load and network
+ conditions to reduce resend attempts and improve recovery time.
+
Severity : major
Bugzilla : 12932
Description: mds_lov_synchronize race leads to various problems
Details : simultaneous MDT->OST connections at startup can cause the
sync to abort, leaving the OSC in a bad state.
+
Severity : normal
Bugzilla : 12975
Frequency : rare
#define FSFILT_OP_JOIN 11
#define FSFILT_OP_NOOP 15
-#define fsfilt_check_slow(obd, start, timeout, msg) \
+#define fsfilt_check_slow(obd, start, msg) \
do { \
if (time_before(jiffies, start + 15 * HZ)) \
break; \
else if (time_before(jiffies, start + 30 * HZ)) \
CDEBUG(D_VFSTRACE, "%s: slow %s %lus\n", obd->obd_name, \
msg, (jiffies-start) / HZ); \
- else if (time_before(jiffies, start + timeout / 2 * HZ)) \
+ else if (time_before(jiffies, start + DISK_TIMEOUT * HZ)) \
CWARN("%s: slow %s %lus\n", obd->obd_name, msg, \
(jiffies - start) / HZ); \
else \
CERROR("%s: slow %s %lus\n", obd->obd_name, msg, \
(jiffies - start) / HZ); \
} while (0)
- fsfilt_check_slow(obd, now, obd_timeout, "journal start");
+ fsfilt_check_slow(obd, now, "journal start");
return handle;
}
LBUG();
}
}
- fsfilt_check_slow(obd, now, obd_timeout, "journal start");
+ fsfilt_check_slow(obd, now, "journal start");
return handle;
}
int rc = obd->obd_fsops->fs_extend(inode, nblocks, handle);
CDEBUG(D_INFO, "extending handle %p with %u blocks\n", handle, nblocks);
- fsfilt_check_slow(obd, now, obd_timeout, "journal extend");
+ fsfilt_check_slow(obd, now, "journal extend");
return rc;
}
int rc = obd->obd_fsops->fs_commit(inode, handle, force_sync);
CDEBUG(D_INFO, "committing handle %p\n", handle);
- fsfilt_check_slow(obd, now, obd_timeout, "journal start");
+ fsfilt_check_slow(obd, now, "journal start");
return rc;
}
int rc = obd->obd_fsops->fs_commit_async(inode, handle, wait_handle);
CDEBUG(D_INFO, "committing handle %p (async)\n", *wait_handle);
- fsfilt_check_slow(obd, now, obd_timeout, "journal start");
+ fsfilt_check_slow(obd, now, "journal start");
return rc;
}
unsigned long now = jiffies;
int rc = obd->obd_fsops->fs_commit_wait(inode, handle);
CDEBUG(D_INFO, "waiting for completion %p\n", handle);
- fsfilt_check_slow(obd, now, obd_timeout, "journal start");
+ fsfilt_check_slow(obd, now, "journal start");
return rc;
}
unsigned long now = jiffies;
int rc;
rc = obd->obd_fsops->fs_setattr(dentry, handle, iattr, do_trunc);
- fsfilt_check_slow(obd, now, obd_timeout, "setattr");
+ fsfilt_check_slow(obd, now, "setattr");
return rc;
}
struct file;
struct obd_histogram;
+/* Days / hours / mins / seconds format */
+struct dhms {
+ int d,h,m,s;
+};
+static inline void s2dhms(struct dhms *ts, time_t secs)
+{
+ ts->d = secs / 86400;
+ secs = secs % 86400;
+ ts->h = secs / 3600;
+ secs = secs % 3600;
+ ts->m = secs / 60;
+ ts->s = secs % 60;
+}
+#define DHMS_FMT "%dd%dh%02dm%02ds"
+#define DHMS_VARS(x) (x)->d, (x)->h, (x)->m, (x)->s
+
+
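As a usage sketch (the variable names here are assumptions, not part of the patch), these helpers turn a seconds count into a readable age, e.g. for the import health info printed by lprocfs_rd_timeouts() further down:

/* hypothetical caller: report how long ago the last reply arrived */
struct dhms ts;

s2dhms(&ts, cfs_time_current_sec() - imp->imp_last_reply_time);
CDEBUG(D_ADAPTTO, "last reply "DHMS_FMT" ago\n", DHMS_VARS(&ts));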
#ifdef LPROCFS
/* Two optimized LPROCFS counter increment functions are provided:
int count, int *eof, void *data);
extern int lprocfs_rd_numrefs(char *page, char **start, off_t off,
int count, int *eof, void *data);
+struct adaptive_timeout;
+extern int lprocfs_at_hist_helper(char *page, int count, int rc,
+ struct adaptive_timeout *at);
+extern int lprocfs_rd_timeouts(char *page, char **start, off_t off,
+ int count, int *eof, void *data);
+extern int lprocfs_wr_timeouts(struct file *file, const char *buffer,
+ unsigned long count, void *data);
extern int lprocfs_wr_evict_client(struct file *file, const char *buffer,
unsigned long count, void *data);
extern int lprocfs_wr_ping(struct file *file, const char *buffer,
static inline int lprocfs_rd_numrefs(char *page, char **start, off_t off,
int count, int *eof, void *data)
{ return 0; }
+struct adaptive_timeout;
+static inline int lprocfs_at_hist_helper(char *page, int count, int rc,
+ struct adaptive_timeout *at)
+{ return 0; }
+static inline int lprocfs_rd_timeouts(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{ return 0; }
+static inline int lprocfs_wr_timeouts(struct file *file, const char *buffer,
+ unsigned long count, void *data)
+{ return 0; }
static inline int lprocfs_wr_evict_client(struct file *file, const char *buffer,
unsigned long count, void *data)
{ return 0; }
__u32 lm_secflvr;
__u32 lm_magic;
__u32 lm_repsize;
- __u32 lm_timeout;
+ __u32 lm_cksum;
__u32 lm_padding_1;
__u32 lm_padding_2;
__u32 lm_padding_3;
__u32 pb_flags;
__u32 pb_op_flags;
__u32 pb_conn_cnt;
+ __u32 pb_timeout; /* for req, the deadline, for rep, the service est */
+ __u32 pb_service_time; /* for rep, actual service time */
__u32 pb_padding_1;
__u32 pb_padding_2;
__u32 pb_padding_3;
- __u32 pb_padding_4;
- __u32 pb_padding_5;
};
extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb);
#define MSG_LAST_REPLAY 1
#define MSG_RESENT 2
#define MSG_REPLAY 4
+#define MSG_AT_SUPPORT 8
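A sketch of how a sender might honor the new flag (assumed placement; the actual check lives in the ptlrpc packing path, which these hunks do not show):

/* only advertise AT timing to a peer that negotiated MSG_AT_SUPPORT
 * during connect (see the set_flags: hunk in ldlm_lib.c below) */
if (imp->imp_msg_flags & MSG_AT_SUPPORT)
        lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout);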
/*
* Flags for all connect opcodes (MDS_CONNECT, OST_CONNECT)
OBD_CONNECT_ACL | OBD_CONNECT_XATTR | \
OBD_CONNECT_IBITS | OBD_CONNECT_JOIN | \
OBD_CONNECT_NODEVOH | OBD_CONNECT_ATTRFID | \
- OBD_CONNECT_CANCELSET)
+ OBD_CONNECT_CANCELSET | OBD_CONNECT_AT)
#define OST_CONNECT_SUPPORTED (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \
OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \
OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_INDEX | \
OBD_CONNECT_BRW_SIZE | OBD_CONNECT_QUOTA64 | \
- OBD_CONNECT_CANCELSET)
+ OBD_CONNECT_CANCELSET | OBD_CONNECT_AT)
#define ECHO_CONNECT_SUPPORTED (0)
-#define MGS_CONNECT_SUPPORTED (OBD_CONNECT_VERSION)
+#define MGS_CONNECT_SUPPORTED (OBD_CONNECT_VERSION | OBD_CONNECT_AT)
#define MAX_QUOTA_COUNT32 ((0xffffffffULL >> QUOTABLOCK_BITS) << QUOTABLOCK_BITS)
#include <lustre_handles.h>
#include <lustre/lustre_idl.h>
+
+/* Adaptive Timeout stuff */
+#define D_ADAPTTO D_OTHER
+#define AT_BINS 4 /* "bin" means "N seconds of history" */
+#define AT_TIMEBASE_DEFAULT 600 /* remembered history (sec) (should be
+ evenly divisible by AT_BINS) */
+#define AT_FLG_NOHIST 0x1 /* use last reported value only */
+#define AT_FLG_MIN 0x2 /* use a minimum limit */
+
+struct adaptive_timeout {
+ time_t at_binstart; /* bin start time */
+ time_t at_binlimit; /* bin time limit */
+ unsigned int at_hist[AT_BINS]; /* timeout history bins */
+ unsigned int at_flags;
+ unsigned int at_current; /* current timeout value */
+ unsigned int at_worst_ever; /* worst-ever timeout value */
+ time_t at_worst_time; /* worst-ever timeout timestamp */
+ spinlock_t at_lock;
+};
+
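at_add(), the mutator for this structure, is only declared in these hunks; its body lands in ptlrpc/import.c, which is not shown. A minimal sketch of the intended bookkeeping, assuming at_hist[0] is the newest bin and the current estimate is the maximum over the remembered bins:

/* Sketch only -- not the ptlrpc/import.c implementation. */
void at_add(struct adaptive_timeout *at, unsigned int val)
{
        extern unsigned int adaptive_timeout_min;
        time_t now = cfs_time_current_sec();
        unsigned int maxv = val;
        int i, shift;

        spin_lock(&at->at_lock);

        if (val > at->at_worst_ever) {
                at->at_worst_ever = val;
                at->at_worst_time = now;
        }

        /* age out bins older than at_binlimit; newest history in [0] */
        shift = (now - at->at_binstart) / at->at_binlimit;
        if (shift > 0) {
                for (i = AT_BINS - 1; i >= 0; i--)
                        at->at_hist[i] = (i >= shift) ?
                                at->at_hist[i - shift] : 0;
                at->at_binstart += shift * at->at_binlimit;
        }
        if (val > at->at_hist[0])
                at->at_hist[0] = val;

        if (at->at_flags & AT_FLG_NOHIST) {
                /* trust only the last reported value (the server tracks
                 * its own max); bins are kept for the /proc histogram */
                at->at_current = val;
        } else {
                for (i = 0; i < AT_BINS; i++)
                        if (at->at_hist[i] > maxv)
                                maxv = at->at_hist[i];
                at->at_current = maxv;
        }

        if ((at->at_flags & AT_FLG_MIN) &&
            at->at_current < adaptive_timeout_min)
                at->at_current = adaptive_timeout_min;

        spin_unlock(&at->at_lock);
}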
enum lustre_imp_state {
LUSTRE_IMP_CLOSED = 1,
LUSTRE_IMP_NEW = 2,
__u64 oic_last_attempt; /* jiffies, 64-bit */
};
+#define IMP_AT_MAX_PORTALS 4
+struct imp_at {
+ int iat_portal[IMP_AT_MAX_PORTALS];
+ struct adaptive_timeout iat_net_latency;
+ struct adaptive_timeout iat_service_estimate[IMP_AT_MAX_PORTALS];
+ time_t iat_drain; /* hack to slow reconnect reqs */
+};
+
struct obd_import {
struct portals_handle imp_handle;
atomic_t imp_refcount;
__u64 imp_connect_flags_orig;
__u32 imp_msg_magic;
+ __u32 imp_msg_flags; /* adjusted based on server capability */
- struct ptlrpc_request_pool *imp_rq_pool; /* emergency request pool */
-};
-
-typedef void (*obd_import_callback)(struct obd_import *imp, void *closure,
- int event, void *event_arg, void *cb_data);
+ struct ptlrpc_request_pool *imp_rq_pool; /* emergency request pool */
-struct obd_import_observer {
- struct list_head oio_chain;
- obd_import_callback oio_cb;
- void *oio_cb_data;
+ struct imp_at imp_at; /* adaptive timeout data */
+ time_t imp_last_reply_time; /* for health check */
};
-void class_observe_import(struct obd_import *imp, obd_import_callback cb,
- void *cb_data);
-void class_unobserve_import(struct obd_import *imp, obd_import_callback cb,
- void *cb_data);
-void class_notify_import_observers(struct obd_import *imp, int event,
- void *event_arg);
+/* import.c */
+static inline void at_init(struct adaptive_timeout *at, int val, int timebase,
+ int flags) {
+ memset(at, 0, sizeof(*at));
+ at->at_binlimit = timebase / AT_BINS;
+ at->at_current = val;
+ at->at_worst_ever = val;
+ at->at_worst_time = cfs_time_current_sec();
+ at->at_flags = flags;
+ spin_lock_init(&at->at_lock);
+}
+static inline int at_get(struct adaptive_timeout *at) {
+ extern unsigned int adaptive_timeout_max;
+ if (adaptive_timeout_max)
+ return min(at->at_current, adaptive_timeout_max);
+ return at->at_current;
+}
+void at_add(struct adaptive_timeout *at, unsigned int val);
+int import_at_get_index(struct obd_import *imp, int portal);
+int import_at_get_ldlm(struct obd_import *imp);
+#define AT_OFF (adaptive_timeout_max == 0)
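Putting the pieces together, a client-side sketch (assumed composition, not a hunk from this patch) of how a request deadline could be derived from these helpers:

/* pick the AT slot for this request's portal and size the timeout
 * from the server's service estimate; fall back to the static value */
int idx = import_at_get_index(imp, req->rq_request_portal);

if (AT_OFF)
        req->rq_timeout = obd_timeout;
else
        req->rq_timeout = at_get(&imp->imp_at.iat_service_estimate[idx]);
/* allow the reply time to cross the wire before declaring a timeout */
req->rq_deadline = req->rq_sent + req->rq_timeout +
                   at_get(&imp->imp_at.iat_net_latency);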
/* genops.c */
struct obd_export;
#endif
void target_cancel_recovery_timer(struct obd_device *obd);
-
-#define OBD_RECOVERY_TIMEOUT (obd_timeout * 5 / 2) /* *waves hands* */
-void target_start_recovery_timer(struct obd_device *obd, svc_handler_t handler);
+void target_start_recovery_timer(struct obd_device *obd, svc_handler_t handler,
+ struct ptlrpc_request *req);
void target_abort_recovery(void *data);
void target_cleanup_recovery(struct obd_device *obd);
int target_queue_recovery_request(struct ptlrpc_request *req,
struct ptlrpc_request {
int rq_type; /* one of PTL_RPC_MSG_* */
struct list_head rq_list;
+ struct list_head rq_timed_list; /* server-side early replies */
struct list_head rq_history_list; /* server-side history */
__u64 rq_history_seq; /* history sequence # */
int rq_status;
spinlock_t rq_lock;
- /* client-side flags */
+ /* client-side flags. */
unsigned int rq_intr:1, rq_replied:1, rq_err:1,
rq_timedout:1, rq_resend:1, rq_restart:1,
/*
*/
rq_replay:1,
rq_no_resend:1, rq_waiting:1, rq_receiving_reply:1,
- rq_no_delay:1, rq_net_err:1;
+ rq_no_delay:1, rq_net_err:1, rq_early:1, rq_must_unlink:1,
+ /* server-side: */
+ rq_final:1; /* packed final reply */
enum rq_phase rq_phase; /* one of RQ_PHASE_* */
- atomic_t rq_refcount; /* client-side refcount for SENT race */
+ atomic_t rq_refcount; /* client-side refcount for SENT race,
+ server-side refcount for multiple replies */
struct ptlrpc_thread *rq_svc_thread; /* initial thread servicing req */
int rq_reqlen;
struct lustre_msg *rq_reqmsg;
- int rq_timeout; /* time to wait for reply (seconds) */
int rq_replen;
+ struct lustre_msg *rq_repbuf; /* client only, buf may be bigger than msg */
struct lustre_msg *rq_repmsg;
__u64 rq_transno;
__u64 rq_xid;
int rq_import_generation;
enum lustre_imp_state rq_send_state;
+ int rq_early_count; /* how many early replies (for stats) */
+
/* client+server request */
lnet_handle_md_t rq_req_md_h;
struct ptlrpc_cb_id rq_req_cbid;
/* server-side... */
struct timeval rq_arrival_time; /* request arrival time */
struct ptlrpc_reply_state *rq_reply_state; /* separated reply state */
+ struct semaphore rq_rs_sem; /* one reply at a time */
struct ptlrpc_request_buffer_desc *rq_rqbd; /* incoming request buffer*/
#ifdef CRAY_XT3
__u32 rq_uid; /* peer uid, used in MDS only */
void *rq_cb_data;
struct ptlrpc_bulk_desc *rq_bulk; /* client side bulk */
- time_t rq_sent; /* when request sent, seconds */
+
+ /* client outgoing req */
+ time_t rq_sent; /* when request/reply sent (secs) */
+ time_t rq_deadline; /* when request must finish */
+ int rq_timeout; /* service time estimate (secs) */
/* Multi-rpc bits */
struct list_head rq_set_chain;
int srv_n_difficult_replies; /* # 'difficult' replies */
int srv_n_active_reqs; /* # reqs being served */
cfs_duration_t srv_rqbd_timeout; /* timeout before re-posting reqs, in tick */
- int srv_watchdog_timeout; /* soft watchdog timeout, in ms */
+ int srv_watchdog_factor; /* soft watchdog timeout multiplier */
unsigned srv_cpu_affinity:1; /* bind threads to CPUs */
+ unsigned srv_at_check:1; /* check early replies */
__u32 srv_req_portal;
__u32 srv_rep_portal;
-
- int srv_n_queued_reqs; /* # reqs waiting to be served */
+
+ /* AT stuff */
+ struct adaptive_timeout srv_at_estimate;/* estimated service time */
+ spinlock_t srv_at_lock;
+ struct list_head srv_at_list; /* reqs waiting for replies */
+ cfs_timer_t srv_at_timer; /* early reply timer */
+
+ int srv_n_queued_reqs; /* # reqs in either of the queues below */
+ struct list_head srv_req_in_queue; /* incoming reqs */
struct list_head srv_request_queue; /* reqs waiting for service */
struct list_head srv_request_history; /* request history */
return (rc);
}
-int ptlrpc_send_reply(struct ptlrpc_request *req, int);
+#define PTLRPC_REPLY_MAYBE_DIFFICULT 0x01
+#define PTLRPC_REPLY_EARLY 0x02
+int ptlrpc_send_reply(struct ptlrpc_request *req, int flags);
int ptlrpc_reply(struct ptlrpc_request *req);
int ptlrpc_error(struct ptlrpc_request *req);
void ptlrpc_resend_req(struct ptlrpc_request *request);
+int ptlrpc_at_get_net_latency(struct ptlrpc_request *req);
int ptl_send_rpc(struct ptlrpc_request *request, int noreply);
int ptlrpc_register_rqbd (struct ptlrpc_request_buffer_desc *rqbd);
}
static inline int
+ptlrpc_client_must_unlink (struct ptlrpc_request *req)
+{
+ int rc;
+
+ spin_lock(&req->rq_lock);
+ rc = req->rq_must_unlink;
+ spin_unlock(&req->rq_lock);
+ return (rc);
+}
+
+static inline int
ptlrpc_client_replied (struct ptlrpc_request *req)
{
int rc;
struct ptlrpc_service *ptlrpc_init_svc(int nbufs, int bufsize, int max_req_size,
int max_reply_size,
int req_portal, int rep_portal,
- int watchdog_timeout, /* in ms */
+ int watchdog_factor,
svc_handler_t, char *name,
cfs_proc_dir_entry_t *proc_entry,
svcreq_printfn_t,
int *lens, char **bufs);
int lustre_pack_reply(struct ptlrpc_request *, int count, int *lens,
char **bufs);
+#define LPRFL_EARLY_REPLY 1
+int lustre_pack_reply_flags(struct ptlrpc_request *, int count, int *lens,
+ char **bufs, int flags);
void lustre_shrink_reply(struct ptlrpc_request *req, int segment,
unsigned int newlen, int move_data);
void lustre_free_reply_state(struct ptlrpc_reply_state *rs);
int lustre_msg_size(__u32 magic, int count, int *lengths);
+int lustre_msg_early_size(void);
int lustre_unpack_msg(struct lustre_msg *m, int len);
void *lustre_msg_buf(struct lustre_msg *m, int n, int minlen);
int lustre_msg_buflen(struct lustre_msg *m, int n);
__u64 lustre_msg_get_transno(struct lustre_msg *msg);
int lustre_msg_get_status(struct lustre_msg *msg);
__u32 lustre_msg_get_conn_cnt(struct lustre_msg *msg);
+int lustre_msg_is_v1(struct lustre_msg *msg);
__u32 lustre_msg_get_magic(struct lustre_msg *msg);
+__u32 lustre_msg_get_timeout(struct lustre_msg *msg);
+__u32 lustre_msg_get_service_time(struct lustre_msg *msg);
+__u32 lustre_msg_get_cksum(struct lustre_msg *msg);
+__u32 lustre_msg_calc_cksum(struct lustre_msg *msg);
void lustre_msg_set_handle(struct lustre_msg *msg,struct lustre_handle *handle);
void lustre_msg_set_type(struct lustre_msg *msg, __u32 type);
void lustre_msg_set_opc(struct lustre_msg *msg, __u32 opc);
void lustre_msg_set_transno(struct lustre_msg *msg, __u64 transno);
void lustre_msg_set_status(struct lustre_msg *msg, __u32 status);
void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt);
+void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout);
+void lustre_msg_set_service_time(struct lustre_msg *msg, __u32 service_time);
+void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum);
static inline void
ptlrpc_rs_addref(struct ptlrpc_reply_state *rs)
lustre_free_reply_state(rs);
}
+/* Should only be called once per req */
+static inline void ptlrpc_req_drop_rs(struct ptlrpc_request *req)
+{
+ if (req->rq_reply_state == NULL)
+ return; /* shouldn't occur */
+ ptlrpc_rs_decref(req->rq_reply_state);
+ req->rq_reply_state = NULL;
+ req->rq_repmsg = NULL;
+ up(&req->rq_rs_sem); /* held since lustre_pack_reply */
+}
+
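Hunks further down (ldlm_lib.c, ost_handler.c) convert their open-coded cleanup to this helper; the before/after shape is roughly:

/* before (pre-AT): drop the reply state by hand */
if (req->rq_reply_state != NULL) {
        ptlrpc_rs_decref(req->rq_reply_state);
        req->rq_reply_state = NULL;
}
/* after: one call that also clears rq_repmsg and releases rq_rs_sem */
ptlrpc_req_drop_rs(req);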
+/* Check if we already packed a normal (non-early) reply.
+ Single thread only! */
+static inline int lustre_packed_reply(struct ptlrpc_request *req)
+{
+ return req->rq_final;
+}
+
static inline __u32 lustre_request_magic(struct ptlrpc_request *req)
{
return lustre_msg_get_magic(req->rq_reqmsg);
static inline void
ptlrpc_req_set_repsize(struct ptlrpc_request *req, int count, int *lens)
{
- req->rq_replen = lustre_msg_size(req->rq_reqmsg->lm_magic, count, lens);
+ int size = lustre_msg_size(req->rq_reqmsg->lm_magic, count, lens);
+ req->rq_replen = size + lustre_msg_early_size();
if (req->rq_reqmsg->lm_magic == LUSTRE_MSG_MAGIC_V2)
- req->rq_reqmsg->lm_repsize = req->rq_replen;
+ req->rq_reqmsg->lm_repsize = size;
}
/* ldlm/ldlm_lib.c */
oti->oti_xid = req->rq_xid;
- if (req->rq_repmsg && req->rq_reqmsg != 0)
+ if (req->rq_reqmsg && req->rq_repmsg && req->rq_reply_state)
oti->oti_transno = lustre_msg_get_transno(req->rq_repmsg);
oti->oti_thread_id = req->rq_svc_thread ? req->rq_svc_thread->t_id : -1;
oti->oti_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg);
struct list_head obd_recovery_queue;
struct list_head obd_delayed_reply_queue;
time_t obd_recovery_start;
- time_t obd_recovery_end;
+ time_t obd_recovery_end; /* for lprocfs_status */
+ int obd_recovery_timeout;
union {
struct obd_device_target obt;
extern unsigned int obd_debug_peer_on_timeout;
extern unsigned int obd_dump_on_timeout;
extern unsigned int obd_dump_on_eviction;
+/* obd_timeout should only be used for recovery, not for
+ networking / disk / timings affected by load (use Adaptive Timeouts) */
extern unsigned int obd_timeout; /* seconds */
-#define PING_INTERVAL max(obd_timeout / 4, 1U)
-#define RECONNECT_INTERVAL max(obd_timeout / 10, 10U)
-extern unsigned int ldlm_timeout;
-extern unsigned int obd_health_check_timeout;
+extern unsigned int ldlm_timeout; /* seconds */
+extern unsigned int adaptive_timeout_min; /* seconds */
+extern unsigned int adaptive_timeout_max; /* seconds */
extern unsigned int obd_sync_filter;
extern unsigned int obd_max_dirty_pages;
extern atomic_t obd_dirty_pages;
extern unsigned int obd_alloc_fail_rate;
/* Timeout definitions */
-#define LDLM_TIMEOUT_DEFAULT 20
#define OBD_TIMEOUT_DEFAULT 100
-#define HEALTH_CHECK_COEF 3 / 2
-#define HEALTH_CHECK_TIMEOUT_DEFAULT (OBD_TIMEOUT_DEFAULT * HEALTH_CHECK_COEF)
-#define HEALTH_CHECK_TIMEOUT (obd_timeout * HEALTH_CHECK_COEF)
+#define LDLM_TIMEOUT_DEFAULT 20
+/* Time to wait for all clients to reconnect during recovery */
+/* Should be very conservative; must catch the first reconnect after reboot */
+#define OBD_RECOVERY_TIMEOUT (obd_timeout * 5 / 2)
+/* Change recovery-small 26b time if you change this */
+#define PING_INTERVAL max(obd_timeout / 4, 1U)
+/* Client may skip 1 ping; wait for 2.5 */
+#define PING_EVICT_TIMEOUT (PING_INTERVAL * 5 / 2)
+#define DISK_TIMEOUT 50 /* Beyond this we warn about disk speed */
+#define CONNECTION_SWITCH_MIN 5 /* Connection switching rate limiter */
+#ifndef CRAY_XT3
+/* In general this should be low to have quick detection of a system
+ running on a backup server. */
+#define INITIAL_CONNECT_TIMEOUT max_t(int,CONNECTION_SWITCH_MIN,obd_timeout/20)
+#else
+/* ...but for very large systems (e.g. CRAY) we need to keep the initial
+ connect t.o. high (bz 10803), because they will nearly ALWAYS be doing the
+ connects for the first time (clients "reboot" after every process, so no
+ chance to generate adaptive timeout data). */
+#define INITIAL_CONNECT_TIMEOUT max_t(int,CONNECTION_SWITCH_MIN,obd_timeout/2)
+#endif
+#define LND_TIMEOUT 50 /* LNET LND-level RPC timeout */
+#define FOREVER 300 /* Something taking this long is broken */
+
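A quick standalone check (not part of the patch) of the two INITIAL_CONNECT_TIMEOUT variants with the default obd_timeout of 100 seconds:

#include <stdio.h>

/* mirrors max_t(int, CONNECTION_SWITCH_MIN, obd_timeout / divisor) */
static int initial_connect_timeout(int obd_timeout, int divisor)
{
        int v = obd_timeout / divisor;
        return v > 5 ? v : 5;          /* CONNECTION_SWITCH_MIN == 5 */
}

int main(void)
{
        printf("generic:  %d s\n", initial_connect_timeout(100, 20)); /* 5 */
        printf("CRAY_XT3: %d s\n", initial_connect_timeout(100, 2)); /* 50 */
        return 0;
}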
#define OBD_FAIL_MDS 0x100
#define OBD_FAIL_MDS_HANDLE_UNPACK 0x101
#define OBD_FAIL_OST_BRW_READ_BULK 0x20f
#define OBD_FAIL_OST_SYNC_NET 0x210
#define OBD_FAIL_OST_ALL_REPLY_NET 0x211
-#define OBD_FAIL_OST_ALL_REQUESTS_NET 0x212
+#define OBD_FAIL_OST_ALL_REQUEST_NET 0x212
#define OBD_FAIL_OST_LDLM_REPLY_NET 0x213
#define OBD_FAIL_OST_BRW_PAUSE_BULK 0x214
#define OBD_FAIL_OST_ENOSPC 0x215
#define OBD_FAIL_OST_HOLD_WRITE_RPC 0x21f
#define OBD_FAIL_OST_LLOG_RECOVERY_TIMEOUT 0x221
#define OBD_FAIL_OST_CANCEL_COOKIE_TIMEOUT 0x222
+#define OBD_FAIL_OST_PAUSE_CREATE 0x223
#define OBD_FAIL_LDLM 0x300
#define OBD_FAIL_LDLM_NAMESPACE_NEW 0x301
#define OBD_FAIL_PTLRPC_DELAY_SEND 0x506
#define OBD_FAIL_PTLRPC_DELAY_RECOV 0x507
#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB 0x508
+#define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a
+#define OBD_FAIL_PTLRPC_PAUSE_REP 0x50c
#define OBD_FAIL_OBD_PING_NET 0x600
#define OBD_FAIL_OBD_LOG_CANCEL_NET 0x601
#define OBD_FAIL_MGS 0x900
#define OBD_FAIL_MGS_ALL_REQUEST_NET 0x901
#define OBD_FAIL_MGS_ALL_REPLY_NET 0x902
-#define OBD_FAIL_MGC_PROCESS_LOG 0x903
-#define OBD_FAIL_MGS_SLOW_REQUEST_NET 0x904
-#define OBD_FAIL_MGS_SLOW_TARGET_REG 0x905
+#define OBD_FAIL_MGC_PAUSE_PROCESS_LOG 0x903
+#define OBD_FAIL_MGS_PAUSE_REQ 0x904
+#define OBD_FAIL_MGS_PAUSE_TARGET_REG 0x905
#define OBD_FAIL_QUOTA_QD_COUNT_32BIT 0xA00
OBD_FAIL_TIMEOUT(OBD_FAIL_TGT_DELAY_CONNECT, 2 * obd_timeout);
} else if (req->rq_export == NULL &&
atomic_read(&export->exp_rpc_count) > 0) {
- CWARN("%s: refuse connection from %s/%s to 0x%p/%d\n",
- target->obd_name, cluuid.uuid,
+ CWARN("%s: refuse connection from %s/%s to 0x%p; still busy "
+ "with %d references\n", target->obd_name, cluuid.uuid,
libcfs_nid2str(req->rq_peer.nid),
export, atomic_read(&export->exp_refcount));
GOTO(out, rc = -EBUSY);
} else if (req->rq_export != NULL &&
atomic_read(&export->exp_rpc_count) > 1) {
- CWARN("%s: refuse reconnection from %s@%s to 0x%p/%d\n",
- target->obd_name, cluuid.uuid,
+ CWARN("%s: refuse reconnection from %s@%s to 0x%p; still busy "
+ "with %d active RPCs\n", target->obd_name, cluuid.uuid,
libcfs_nid2str(req->rq_peer.nid),
export, atomic_read(&export->exp_rpc_count));
GOTO(out, rc = -EBUSY);
/* If this is the first client, start the recovery timer */
if (target->obd_recovering) {
lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECOVERING);
- target_start_recovery_timer(target, handler);
+ target_start_recovery_timer(target, handler, req);
}
/* Tell the client if we support replayable requests */
&export->exp_nid_hash);
spin_unlock(&target->obd_dev_lock);
- if (lustre_msg_get_op_flags(req->rq_repmsg) & MSG_CONNECT_RECONNECT)
- GOTO(out, rc = 0);
+ if (lustre_msg_get_op_flags(req->rq_repmsg) & MSG_CONNECT_RECONNECT) {
+ revimp = class_import_get(export->exp_imp_reverse);
+ GOTO(set_flags, rc = 0);
+ }
if (target->obd_recovering)
target->obd_connected_clients++;
revimp->imp_remote_handle = conn;
revimp->imp_dlm_fake = 1;
revimp->imp_state = LUSTRE_IMP_FULL;
-
+set_flags:
if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_NEXT_VER) {
+ /* Client wants v2 */
revimp->imp_msg_magic = LUSTRE_MSG_MAGIC_V2;
lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_NEXT_VER);
+ if (export->exp_connect_flags & OBD_CONNECT_AT)
+ revimp->imp_msg_flags |= MSG_AT_SUPPORT;
}
class_import_put(revimp);
spin_lock(&export->exp_lock);
export->exp_connecting = 0;
spin_unlock(&export->exp_lock);
- }
+ }
if (targref)
class_decref(targref);
if (rc)
static void target_release_saved_req(struct ptlrpc_request *req)
{
- if (req->rq_reply_state != NULL) {
- ptlrpc_rs_decref(req->rq_reply_state);
- /* req->rq_reply_state = NULL; */
- }
-
+ ptlrpc_req_drop_rs(req);
class_export_put(req->rq_export);
OBD_FREE(req->rq_reqmsg, req->rq_reqlen);
OBD_FREE(req, sizeof *req);
ptlrpc_reply(req);
target_release_saved_req(req);
}
- obd->obd_recovery_end = CURRENT_SECONDS;
+ obd->obd_recovery_end = cfs_time_current_sec();
}
static void abort_recovery_queue(struct obd_device *obd)
cfs_timer_disarm(&obd->obd_recovery_timer);
}
-static void reset_recovery_timer(struct obd_device *obd)
+static void reset_recovery_timer(struct obd_device *obd,
+ struct ptlrpc_request *req, int first)
{
spin_lock_bh(&obd->obd_processing_task_lock);
if (!obd->obd_recovering) {
spin_unlock_bh(&obd->obd_processing_task_lock);
return;
}
+ /* Track the client's largest expected replay time */
+ obd->obd_recovery_timeout =
+ max((first ? (int)OBD_RECOVERY_TIMEOUT :
+ obd->obd_recovery_timeout),
+ (int)lustre_msg_get_timeout(req->rq_reqmsg));
cfs_timer_arm(&obd->obd_recovery_timer,
- cfs_time_shift(OBD_RECOVERY_TIMEOUT));
+ cfs_time_shift(obd->obd_recovery_timeout));
spin_unlock_bh(&obd->obd_processing_task_lock);
- CDEBUG(D_HA, "%s: timer will expire in %u seconds\n", obd->obd_name,
- OBD_RECOVERY_TIMEOUT);
- /* Only used for lprocfs_status */
- obd->obd_recovery_end = CURRENT_SECONDS + OBD_RECOVERY_TIMEOUT;
+ CDEBUG(D_HA, "%s: recovery timer will expire in %u seconds\n",
+ obd->obd_name, obd->obd_recovery_timeout);
+ obd->obd_recovery_end = cfs_time_current_sec() +
+ obd->obd_recovery_timeout;
}
/* Only start it the first time called */
-void target_start_recovery_timer(struct obd_device *obd, svc_handler_t handler)
+void target_start_recovery_timer(struct obd_device *obd, svc_handler_t handler,
+ struct ptlrpc_request *req)
{
spin_lock_bh(&obd->obd_processing_task_lock);
if (obd->obd_recovery_handler) {
spin_unlock_bh(&obd->obd_processing_task_lock);
return;
}
- CWARN("%s: starting recovery timer (%us)\n", obd->obd_name,
- OBD_RECOVERY_TIMEOUT);
+ CWARN("%s: starting recovery timer\n", obd->obd_name);
obd->obd_recovery_handler = handler;
cfs_timer_init(&obd->obd_recovery_timer, target_recovery_expired, obd);
spin_unlock_bh(&obd->obd_processing_task_lock);
- reset_recovery_timer(obd);
+ reset_recovery_timer(obd, req, 1);
}
static int check_for_next_transno(struct obd_device *obd)
DEBUG_REQ(D_HA, req, "processing: ");
(void)obd->obd_recovery_handler(req);
obd->obd_replayed_requests++;
- reset_recovery_timer(obd);
+ reset_recovery_timer(obd, req, 0);
/* bug 1580: decide how to properly sync() in recovery */
//mds_fsync_super(obd->u.obt.obt_sb);
class_export_put(req->rq_export);
- if (req->rq_reply_state != NULL) {
- ptlrpc_rs_decref(req->rq_reply_state);
- /* req->rq_reply_state = NULL; */
- }
+ ptlrpc_req_drop_rs(req);
OBD_FREE(req->rq_reqmsg, req->rq_reqlen);
OBD_FREE(req, sizeof *req);
spin_lock_bh(&obd->obd_processing_task_lock);
struct lustre_msg *reqmsg;
int recovery_done = 0;
- LASSERT ((rc == 0) == (req->rq_reply_state != NULL));
+ LASSERT ((rc == 0) == lustre_packed_reply(req));
- if (rc) {
+ if (!lustre_packed_reply(req)) {
/* Just like ptlrpc_error, but without the sending. */
rc = lustre_pack_reply(req, 1, NULL, NULL);
- LASSERT(rc == 0); /* XXX handle this */
+ if (rc)
+ CERROR("pack error %d\n", rc);
req->rq_type = PTL_RPC_MSG_ERR;
}
DEBUG_REQ(D_NET, req, "sending reply");
}
- return (ptlrpc_send_reply(req, 1));
+ return (ptlrpc_send_reply(req, PTLRPC_REPLY_MAYBE_DIFFICULT));
}
void
return cfs_time_seconds((int)cfs_duration_sec(cfs_time_sub(timeout, 0)) + 1);
}
-/* timeout for initial callback (AST) reply */
-static inline unsigned int ldlm_get_rq_timeout(unsigned int ldlm_timeout,
- unsigned int obd_timeout)
+/* timeout for initial callback (AST) reply (bz10399) */
+static inline unsigned int ldlm_get_rq_timeout(void)
{
+ /* Non-AT value */
unsigned int timeout = min(ldlm_timeout, obd_timeout / 3);
return timeout < 1 ? 1 : timeout;
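With the defaults (ldlm_timeout = 20, obd_timeout = 100) this works out to min(20, 100 / 3) = min(20, 33) = 20 seconds for the initial AST reply when adaptive timeouts are off.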
lock = list_entry(waiting_locks_list.next, struct ldlm_lock,
l_pending_chain);
- if (cfs_time_after(lock->l_callback_timeout, cfs_time_current()) ||
- (lock->l_req_mode == LCK_GROUP))
+ if (cfs_time_after(lock->l_callback_timeout, cfs_time_current())
+ || (lock->l_req_mode == LCK_GROUP))
break;
- LDLM_ERROR(lock, "lock callback timer expired: evicting client "
- "%s@%s nid %s ",lock->l_export->exp_client_uuid.uuid,
- lock->l_export->exp_connection->c_remote_uuid.uuid,
- libcfs_nid2str(lock->l_export->exp_connection->c_peer.nid));
-
+ LDLM_ERROR(lock, "lock callback timer expired after %lds: "
+ "evicting client at %s ",
+ cfs_time_current_sec() - lock->l_enqueued_time.tv_sec,
+ libcfs_nid2str(
+ lock->l_export->exp_connection->c_peer.nid));
if (lock == last) {
LDLM_ERROR(lock, "waiting on lock multiple times");
CERROR("wll %p n/p %p/%p, l_pending %p n/p %p/%p\n",
*/
static int __ldlm_add_waiting_lock(struct ldlm_lock *lock)
{
+ int timeout = obd_timeout / 2; /* Non-AT value */
cfs_time_t timeout_rounded;
if (!list_empty(&lock->l_pending_chain))
return 0;
- lock->l_callback_timeout =cfs_time_add(cfs_time_current(),
- cfs_time_seconds(obd_timeout)/2);
+ if (lock->l_export && !AT_OFF)
+ timeout = import_at_get_ldlm(lock->l_export->exp_imp_reverse);
+
+ lock->l_callback_timeout = cfs_time_shift(timeout);
timeout_rounded = round_timeout(lock->l_callback_timeout);
- if (cfs_time_before(timeout_rounded, cfs_timer_deadline(&waiting_locks_timer)) ||
+ if (cfs_time_before(timeout_rounded,
+ cfs_timer_deadline(&waiting_locks_timer)) ||
!cfs_timer_is_armed(&waiting_locks_timer)) {
cfs_timer_arm(&waiting_locks_timer, timeout_rounded);
-
}
list_add_tail(&lock->l_pending_chain, &waiting_locks_list); /* FIFO */
return 1;
}
req->rq_send_state = LUSTRE_IMP_FULL;
- req->rq_timeout = ldlm_get_rq_timeout(ldlm_timeout, obd_timeout);
+ /* ptlrpc_prep_req already set timeout */
+ if (AT_OFF)
+ req->rq_timeout = ldlm_get_rq_timeout();
if (lock->l_export && lock->l_export->exp_ldlm_stats)
lprocfs_counter_incr(lock->l_export->exp_ldlm_stats,
total_enqueue_wait = cfs_timeval_sub(&granted_time,
&lock->l_enqueued_time, NULL);
- if (total_enqueue_wait / 1000000 > obd_timeout)
+ if (total_enqueue_wait / ONE_MILLION > obd_timeout)
+ /* non-fatal with AT - change to LDLM_DEBUG? */
LDLM_ERROR(lock, "enqueue wait took %luus from %lu",
total_enqueue_wait, lock->l_enqueued_time.tv_sec);
ptlrpc_req_set_repsize(req, 1, NULL);
req->rq_send_state = LUSTRE_IMP_FULL;
- req->rq_timeout = ldlm_get_rq_timeout(ldlm_timeout, obd_timeout);
+ /* ptlrpc_prep_req already set timeout */
+ if (AT_OFF)
+ req->rq_timeout = ldlm_get_rq_timeout();
/* We only send real blocking ASTs after the lock is granted */
lock_res_and_lock(lock);
ptlrpc_req_set_repsize(req, 2, size);
req->rq_send_state = LUSTRE_IMP_FULL;
- req->rq_timeout = ldlm_get_rq_timeout(ldlm_timeout, obd_timeout);
+ /* ptlrpc_prep_req already set timeout */
+ if (AT_OFF)
+ req->rq_timeout = ldlm_get_rq_timeout();
if (lock->l_export && lock->l_export->exp_ldlm_stats)
lprocfs_counter_incr(lock->l_export->exp_ldlm_stats,
EXIT;
out:
req->rq_status = rc ?: err; /* return either error - bug 11190 */
- if (req->rq_reply_state == NULL) {
+ if (!lustre_packed_reply(req)) {
err = lustre_pack_reply(req, 1, NULL, NULL);
if (rc == 0)
rc = err;
static int ldlm_callback_reply(struct ptlrpc_request *req, int rc)
{
req->rq_status = rc;
- if (req->rq_reply_state == NULL) {
+ if (!lustre_packed_reply(req)) {
rc = lustre_pack_reply(req, 1, NULL, NULL);
if (rc)
return rc;
ldlm_state->ldlm_cb_service =
ptlrpc_init_svc(LDLM_NBUFS, LDLM_BUFSIZE, LDLM_MAXREQSIZE,
LDLM_MAXREPSIZE, LDLM_CB_REQUEST_PORTAL,
- LDLM_CB_REPLY_PORTAL, ldlm_timeout * 900,
+ LDLM_CB_REPLY_PORTAL, 1800,
ldlm_callback_handler, "ldlm_cbd",
ldlm_svc_proc_dir, NULL,
LDLM_THREADS_AUTO_MIN, LDLM_THREADS_AUTO_MAX,
ldlm_state->ldlm_cancel_service =
ptlrpc_init_svc(LDLM_NBUFS, LDLM_BUFSIZE, LDLM_MAXREQSIZE,
LDLM_MAXREPSIZE, LDLM_CANCEL_REQUEST_PORTAL,
- LDLM_CANCEL_REPLY_PORTAL, ldlm_timeout * 6000,
+ LDLM_CANCEL_REPLY_PORTAL, 6000,
ldlm_cancel_handler, "ldlm_canceld",
ldlm_svc_proc_dir, NULL,
LDLM_THREADS_AUTO_MIN, LDLM_THREADS_AUTO_MAX,
LDLM_ERROR(lock, "lock timed out (enqueued at %lu, %lus ago); "
"not entering recovery in server code, just going "
"back to sleep", lock->l_enqueued_time.tv_sec,
- CURRENT_SECONDS - lock->l_enqueued_time.tv_sec);
+ cfs_time_current_sec() -
+ lock->l_enqueued_time.tv_sec);
if (cfs_time_after(cfs_time_current(), next_dump)) {
last_dump = next_dump;
next_dump = cfs_time_shift(300);
struct obd_device *obd;
struct obd_import *imp = NULL;
struct l_wait_info lwi;
+ __u32 timeout = obd_timeout; /* Non-AT value */
int rc = 0;
ENTRY;
obd = class_exp2obd(lock->l_conn_export);
/* if this is a local lock, then there is no import */
- if (obd != NULL)
+ if (obd != NULL) {
imp = obd->u.cli.cl_import;
+ timeout = import_at_get_ldlm(imp);
+ }
lwd.lwd_lock = lock;
LDLM_DEBUG(lock, "waiting indefinitely because of NO_TIMEOUT");
lwi = LWI_INTR(interrupted_completion_wait, &lwd);
} else {
- lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(obd_timeout),
+ lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(timeout),
ldlm_expired_completion_wait,
interrupted_completion_wait, &lwd);
}
if (ocd == NULL)
GOTO(out_cleanup, rc = -ENOMEM);
- ocd->ocd_connect_flags = OBD_CONNECT_VERSION;
+ ocd->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_AT;
ocd->ocd_version = LUSTRE_VERSION_CODE;
rc = obd_connect(&mgc_conn, obd, &mgc_uuid, ocd);
obd_timeout);
}
- /* debug peer on timeout? */
+ /* debug peer on timeout? */
envstr = getenv("LIBLUSTRE_DEBUG_PEER_ON_TIMEOUT");
if (envstr != NULL) {
obd_debug_peer_on_timeout =
obd_set_info_async(obd->obd_self_export, strlen("async"), "async",
sizeof(async), &async, NULL);
- ocd.ocd_connect_flags = OBD_CONNECT_IBITS | OBD_CONNECT_VERSION;
+ ocd.ocd_connect_flags = OBD_CONNECT_IBITS | OBD_CONNECT_VERSION |
+ OBD_CONNECT_AT;
ocd.ocd_ibits_known = MDS_INODELOCK_FULL;
ocd.ocd_version = LUSTRE_VERSION_CODE;
obd->obd_upcall.onu_upcall = ll_ocd_update;
ocd.ocd_connect_flags = OBD_CONNECT_SRVLOCK | OBD_CONNECT_REQPORTAL |
- OBD_CONNECT_VERSION | OBD_CONNECT_TRUNCLOCK;
+ OBD_CONNECT_VERSION | OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_AT;
ocd.ocd_version = LUSTRE_VERSION_CODE;
err = obd_connect(&osc_conn, obd, &sbi->ll_sb_uuid, &ocd);
if (err) {
}
/* indicate the features supported by this client */
- data->ocd_connect_flags = OBD_CONNECT_IBITS | OBD_CONNECT_NODEVOH |
- OBD_CONNECT_JOIN |
- OBD_CONNECT_ATTRFID | OBD_CONNECT_VERSION |
- OBD_CONNECT_CANCELSET;
+ data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_IBITS |
+ OBD_CONNECT_JOIN | OBD_CONNECT_ATTRFID | OBD_CONNECT_NODEVOH |
+ OBD_CONNECT_CANCELSET | OBD_CONNECT_AT;
#ifdef CONFIG_FS_POSIX_ACL
data->ocd_connect_flags |= OBD_CONNECT_ACL;
#endif
GOTO(out_mdc, err = -ENODEV);
}
- data->ocd_connect_flags = OBD_CONNECT_GRANT | OBD_CONNECT_VERSION |
- OBD_CONNECT_REQPORTAL | OBD_CONNECT_BRW_SIZE |
- OBD_CONNECT_SRVLOCK | OBD_CONNECT_CANCELSET;
+ data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_GRANT |
+ OBD_CONNECT_REQPORTAL | OBD_CONNECT_BRW_SIZE |
+ OBD_CONNECT_SRVLOCK | OBD_CONNECT_CANCELSET | OBD_CONNECT_AT;
CDEBUG(D_RPCTRACE, "ocd_connect_flags: "LPX64" ocd_version: %d "
"ocd_grant: %d\n", data->ocd_connect_flags,
{ "kbytesavail", lprocfs_rd_kbytesavail, 0, 0 },
{ "filestotal", lprocfs_rd_filestotal, 0, 0 },
{ "filesfree", lprocfs_rd_filesfree, 0, 0 },
- //{ "filegroups", lprocfs_rd_filegroups, 0, 0 },
+ /*{ "filegroups", lprocfs_rd_filegroups, 0, 0 },*/
{ "mds_server_uuid", lprocfs_rd_server_uuid, 0, 0 },
{ "mds_conn_uuid", lprocfs_rd_conn_uuid, 0, 0 },
{ "max_rpcs_in_flight", mdc_rd_max_rpcs_in_flight,
mdc_wr_max_rpcs_in_flight, 0 },
+ { "timeouts", lprocfs_rd_timeouts, lprocfs_wr_timeouts, 0 },
{ 0 }
};
struct ptlrpc_bulk_desc *desc;
struct l_wait_info lwi;
struct page **pages;
+ int timeout;
int rc = 0, npages, i, tmpcount, tmpsize = 0;
ENTRY;
GOTO(abort_bulk, rc);
}
- lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, NULL, NULL);
+ timeout = (int)req->rq_deadline - (int)cfs_time_current_sec();
+ if (timeout < 0) {
+ CERROR("Req deadline already passed %lu (now: %lu)\n",
+ req->rq_deadline, cfs_time_current_sec());
+ }
+ lwi = LWI_TIMEOUT(max(timeout, 1) * HZ, NULL, NULL);
rc = l_wait_event(desc->bd_waitq, !ptlrpc_bulk_active(desc), &lwi);
LASSERT (rc == 0 || rc == -ETIMEDOUT);
pop_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc);
default:
mds_exit_ucred(&uc, mds);
- if (req->rq_reply_state == NULL) {
+ if (!lustre_packed_reply(req)) {
req->rq_status = rc;
lustre_pack_reply(req, 1, NULL, NULL);
}
out_pop:
pop_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc);
out_ucred:
- if (req->rq_reply_state == NULL) {
+ if (!lustre_packed_reply(req)) {
req->rq_status = rc;
lustre_pack_reply(req, 1, NULL, NULL);
}
static int mds_statfs(struct ptlrpc_request *req)
{
struct obd_device *obd = req->rq_export->exp_obd;
+ struct ptlrpc_service *svc = req->rq_rqbd->rqbd_service;
int rc, size[2] = { sizeof(struct ptlrpc_body),
sizeof(struct obd_statfs) };
ENTRY;
/* This will trigger a watchdog timeout */
OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_STATFS_LCW_SLEEP,
- (MDS_SERVICE_WATCHDOG_TIMEOUT / 1000) + 1);
+ (MDS_SERVICE_WATCHDOG_FACTOR *
+ at_get(&svc->srv_at_estimate) / 1000) + 1);
OBD_COUNTER_INCREMENT(obd, statfs);
rc = lustre_pack_reply(req, 2, size, NULL);
obd->obd_recoverable_clients,
(obd->obd_recoverable_clients == 1) ?
"client" : "clients",
- (int)(OBD_RECOVERY_TIMEOUT) / 60,
- (int)(OBD_RECOVERY_TIMEOUT) % 60,
+ obd->obd_recovery_timeout / 60,
+ obd->obd_recovery_timeout % 60,
obd->obd_name);
} else {
LCONSOLE_INFO("MDT %s now serving %s (%s%s%s) with recovery "
mds->mds_service =
ptlrpc_init_svc(MDS_NBUFS, MDS_BUFSIZE, MDS_MAXREQSIZE,
MDS_MAXREPSIZE, MDS_REQUEST_PORTAL,
- MDC_REPLY_PORTAL, MDS_SERVICE_WATCHDOG_TIMEOUT,
+ MDC_REPLY_PORTAL, MDS_SERVICE_WATCHDOG_FACTOR,
mds_handle, LUSTRE_MDS_NAME,
obd->obd_proc_entry, NULL,
mds_min_threads, mds_max_threads, "ll_mdt");
mds->mds_setattr_service =
ptlrpc_init_svc(MDS_NBUFS, MDS_BUFSIZE, MDS_MAXREQSIZE,
MDS_MAXREPSIZE, MDS_SETATTR_PORTAL,
- MDC_REPLY_PORTAL, MDS_SERVICE_WATCHDOG_TIMEOUT,
+ MDC_REPLY_PORTAL, MDS_SERVICE_WATCHDOG_FACTOR,
mds_handle, "mds_setattr",
obd->obd_proc_entry, NULL,
mds_min_threads, mds_max_threads,
mds->mds_readpage_service =
ptlrpc_init_svc(MDS_NBUFS, MDS_BUFSIZE, MDS_MAXREQSIZE,
MDS_MAXREPSIZE, MDS_READPAGE_PORTAL,
- MDC_REPLY_PORTAL, MDS_SERVICE_WATCHDOG_TIMEOUT,
+ MDC_REPLY_PORTAL, MDS_SERVICE_WATCHDOG_FACTOR,
mds_handle, "mds_readpage",
obd->obd_proc_entry, NULL,
MDS_THREADS_MIN_READPAGE, mds_max_threads,
obd->obd_next_recovery_transno = obd->obd_last_committed + 1;
obd->obd_recovering = 1;
obd->obd_recovery_start = CURRENT_SECONDS;
- /* Only used for lprocfs_status */
+ obd->obd_recovery_timeout = OBD_RECOVERY_TIMEOUT;
obd->obd_recovery_end = obd->obd_recovery_start +
- OBD_RECOVERY_TIMEOUT;
+ obd->obd_recovery_timeout;
}
mds->mds_mount_count = mount_count + 1;
__u8 mcd_padding[LR_CLIENT_SIZE - 88];
};
-#define MDS_SERVICE_WATCHDOG_TIMEOUT (obd_timeout * 1000)
+#define MDS_SERVICE_WATCHDOG_FACTOR 2000
#define MAX_ATIME_DIFF 60
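Unlike the old MDS_SERVICE_WATCHDOG_TIMEOUT (a fixed obd_timeout * 1000 ms), the factor scales with the measured service time; for example, using the at_get() call from the mds_statfs hunk above (watchdog_ms is an illustrative name):

/* e.g. srv_at_estimate currently reports 30 s:
 * 2000 * 30 = 60000 ms, i.e. the watchdog fires at 2x the estimate */
watchdog_ms = MDS_SERVICE_WATCHDOG_FACTOR * at_get(&svc->srv_at_estimate);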
if (data == NULL)
RETURN(-ENOMEM);
data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_INDEX |
- OBD_CONNECT_REQPORTAL | OBD_CONNECT_QUOTA64;
+ OBD_CONNECT_REQPORTAL | OBD_CONNECT_QUOTA64 | OBD_CONNECT_AT;
data->ocd_version = LUSTRE_VERSION_CODE;
/* NB: lov_connect() needs to fill in .ocd_index for each OST */
rc = obd_connect(&conn, mds->mds_osc_obd, &obd->obd_uuid, data);
void *handle;
struct inode *inode = obd->u.obt.obt_sb->s_root->d_inode;
BDEVNAME_DECLARE_STORAGE(tmp);
- CERROR("*** setting device %s read-only ***\n",
- ll_bdevname(obd->u.obt.obt_sb, tmp));
+ LCONSOLE_WARN("*** setting obd %s device '%s' read-only ***\n",
+ obd->obd_name, ll_bdevname(obd->u.obt.obt_sb, tmp));
handle = fsfilt_start(obd, inode, FSFILT_OP_MKNOD, NULL);
if (!IS_ERR(handle))
if (cld->cld_stopping)
RETURN(0);
- OBD_FAIL_TIMEOUT(OBD_FAIL_MGC_PROCESS_LOG, 20);
+ OBD_FAIL_TIMEOUT(OBD_FAIL_MGC_PAUSE_PROCESS_LOG, 20);
lsi = s2lsi(cld->cld_cfg.cfg_sb);
mgs->mgs_service =
ptlrpc_init_svc(MGS_NBUFS, MGS_BUFSIZE, MGS_MAXREQSIZE,
MGS_MAXREPSIZE, MGS_REQUEST_PORTAL,
- MGC_REPLY_PORTAL, MGS_SERVICE_WATCHDOG_TIMEOUT,
+ MGC_REPLY_PORTAL, 2000,
mgs_handle, LUSTRE_MGS_NAME,
obd->obd_proc_entry, NULL,
MGS_THREADS_AUTO_MIN, MGS_THREADS_AUTO_MAX,
obd->obd_name, lockrc);
}
- OBD_FAIL_TIMEOUT(OBD_FAIL_MGS_SLOW_TARGET_REG, 10);
+ OBD_FAIL_TIMEOUT(OBD_FAIL_MGS_PAUSE_TARGET_REG, 10);
/* Log writing contention is handled by the fsdb_sem */
int opc, rc = 0;
ENTRY;
- OBD_FAIL_TIMEOUT(OBD_FAIL_MGS_SLOW_REQUEST_NET, 2);
- OBD_FAIL_RETURN(OBD_FAIL_MGS_ALL_REQUEST_NET | OBD_FAIL_ONCE, 0);
+ OBD_FAIL_TIMEOUT_MS(OBD_FAIL_MGS_PAUSE_REQ, obd_fail_val);
+ OBD_FAIL_RETURN(OBD_FAIL_MGS_ALL_REQUEST_NET, 0);
LASSERT(current->journal_info == NULL);
opc = lustre_msg_get_opc(req->rq_reqmsg);
#include <lustre_export.h>
-/* in ms */
-#define MGS_SERVICE_WATCHDOG_TIMEOUT (obd_timeout * 1000)
-
/* mgs_llog.c */
int class_dentry_readdir(struct obd_device *obd, struct dentry *dir,
struct vfsmount *inmnt,
unsigned int obd_debug_peer_on_timeout;
unsigned int obd_dump_on_timeout;
unsigned int obd_dump_on_eviction;
-unsigned int obd_timeout = OBD_TIMEOUT_DEFAULT; /* seconds */
+unsigned int obd_timeout = OBD_TIMEOUT_DEFAULT; /* seconds */
unsigned int ldlm_timeout = LDLM_TIMEOUT_DEFAULT; /* seconds */
-unsigned int obd_health_check_timeout = HEALTH_CHECK_TIMEOUT_DEFAULT; /* seconds */
+unsigned int adaptive_timeout_min = 10; /* seconds */
+unsigned int adaptive_timeout_max = 600; /* seconds */
unsigned int obd_max_dirty_pages = 256;
atomic_t obd_dirty_pages;
EXPORT_SYMBOL(obd_dump_on_eviction);
EXPORT_SYMBOL(obd_timeout);
EXPORT_SYMBOL(ldlm_timeout);
-EXPORT_SYMBOL(obd_health_check_timeout);
+EXPORT_SYMBOL(adaptive_timeout_min);
+EXPORT_SYMBOL(adaptive_timeout_max);
EXPORT_SYMBOL(obd_max_dirty_pages);
EXPORT_SYMBOL(obd_dirty_pages);
EXPORT_SYMBOL(ptlrpc_put_connection_superhack);
CFS_INIT_LIST_HEAD(&export->exp_handle.h_link);
class_handle_hash(&export->exp_handle, export_handle_addref);
- export->exp_last_request_time = CURRENT_SECONDS;
+ export->exp_last_request_time = cfs_time_current_sec();
spin_lock_init(&export->exp_lock);
INIT_HLIST_NODE(&export->exp_uuid_hash);
INIT_HLIST_NODE(&export->exp_nid_hash);
}
EXPORT_SYMBOL(class_import_put);
+static void init_imp_at(struct imp_at *at) {
+ int i;
+ /* We need enough time to get an early response on a slow network.
+ Since we can't say for sure how slow a network might be, we use
+ a user-defined max expected network latency. We will adapt to slow
+ increases, but a sudden jump can still kill us. */
+ at_init(&at->iat_net_latency, adaptive_timeout_min, AT_TIMEBASE_DEFAULT,
+ AT_FLG_MIN);
+ for (i = 0; i < IMP_AT_MAX_PORTALS; i++) {
+ /* max service estimates are tracked on the server side, so
+ don't use the AT history here, just use the last reported
+ val. (But keep hist for proc histogram, worst_ever) */
+ at_init(&at->iat_service_estimate[i], INITIAL_CONNECT_TIMEOUT,
+ AT_TIMEBASE_DEFAULT, AT_FLG_NOHIST);
+ }
+ at->iat_drain = 0;
+}
+
struct obd_import *class_new_import(struct obd_device *obd)
{
struct obd_import *imp;
CFS_INIT_LIST_HEAD(&imp->imp_conn_list);
CFS_INIT_LIST_HEAD(&imp->imp_handle.h_link);
class_handle_hash(&imp->imp_handle, import_handle_addref);
+ init_imp_at(&imp->imp_at);
/* the default magic is V1, will be used in connect RPC, and
* then adjusted according to the flags in request/reply. */
spin_unlock(&fake_exp->exp_lock);
rc = obd_disconnect(fake_exp);
+ CDEBUG(D_HA, "disconnected export at %s (%p): rc %d\n",
+ obd_export_nid2str(exp), exp, rc);
class_export_put(exp);
- CDEBUG(D_HA, "disconnecting export %s (%p): rc %d\n",
- exp->exp_client_uuid.uuid, exp, rc);
}
EXIT;
}
return rc;
}
-static int obd_proc_rd_health_timeout(char *page, char **start, off_t off,
- int count, int *eof, void *data)
-{
- *eof = 1;
- return snprintf(page, count, "%d\n", obd_health_check_timeout);
-}
-
-static int obd_proc_wr_health_timeout(struct file *file, const char *buffer,
- unsigned long count, void *data)
-{
- int val, rc;
-
- rc = lprocfs_write_helper(buffer, count, &val);
- if (rc)
- return rc;
-
- obd_health_check_timeout = val;
-
- return count;
-}
-
/* Root for /proc/fs/lustre */
struct proc_dir_entry *proc_lustre_root = NULL;
{ "version", obd_proc_read_version, NULL, NULL },
{ "pinger", obd_proc_read_pinger, NULL, NULL },
{ "health_check", obd_proc_read_health, NULL, NULL },
- { "health_check_timeout", obd_proc_rd_health_timeout,
- obd_proc_wr_health_timeout, NULL },
{ 0 }
};
#else
OBD_DUMP_ON_EVICTION, /* dump kernel debug log upon eviction */
OBD_DEBUG_PEER_ON_TIMEOUT, /* dump peer debug when RPC times out */
OBD_ALLOC_FAIL_RATE, /* memory allocation random failure rate */
+ ADAPTIVE_MIN, /* Adaptive timeout lower limit */
+ ADAPTIVE_MAX, /* Adaptive timeout upper limit */
};
int LL_PROC_PROTO(proc_fail_loc)
.mode = 0644,
.proc_handler = &proc_dointvec
},
- {
+ {
.ctl_name = OBD_LDLM_TIMEOUT,
.procname = "ldlm_timeout",
.data = &ldlm_timeout,
.proc_handler = &proc_alloc_fail_rate
},
#endif
+ {
+ .ctl_name = ADAPTIVE_MIN,
+ .procname = "adaptive_min",
+ .data = &adaptive_timeout_min,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = ADAPTIVE_MAX,
+ .procname = "adaptive_max",
+ .data = &adaptive_timeout_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
{ 0 }
};
return rc;
}
+int lprocfs_at_hist_helper(char *page, int count, int rc,
+ struct adaptive_timeout *at)
+{
+ int i;
+ for (i = 0; i < AT_BINS; i++)
+ rc += snprintf(page + rc, count - rc, "%3u ", at->at_hist[i]);
+ rc += snprintf(page + rc, count - rc, "\n");
+ return rc;
+}
+
+/* See also ptlrpc_lprocfs_rd_timeouts */
+int lprocfs_rd_timeouts(char *page, char **start, off_t off, int count,
+ int *eof, void *data)
+{
+ struct obd_device *obd = (struct obd_device *)data;
+ struct obd_import *imp;
+ unsigned int cur, worst;
+ time_t now, worstt;
+ struct dhms ts;
+ int i, rc = 0;
+
+ LASSERT(obd != NULL);
+ LPROCFS_CLIMP_CHECK(obd);
+ imp = obd->u.cli.cl_import;
+ *eof = 1;
+
+ if (AT_OFF)
+ rc += snprintf(page + rc, count - rc,
+ "adaptive timeouts off, using obd_timeout %u\n",
+ obd_timeout);
+
+ rc += snprintf(page + rc, count - rc,
+ "%-10s : %ld sec\n", "timebase",
+ imp->imp_at.iat_net_latency.at_binlimit * AT_BINS);
+
+ now = cfs_time_current_sec();
+
+ /* Some network health info for kicks */
+ s2dhms(&ts, now - imp->imp_last_reply_time);
+ rc += snprintf(page + rc, count - rc,
+ "%-10s : %ld, "DHMS_FMT" ago\n",
+ "last reply", imp->imp_last_reply_time, DHMS_VARS(&ts));
+
+
+ cur = at_get(&imp->imp_at.iat_net_latency);
+ worst = imp->imp_at.iat_net_latency.at_worst_ever;
+ worstt = imp->imp_at.iat_net_latency.at_worst_time;
+ s2dhms(&ts, now - worstt);
+ rc += snprintf(page + rc, count - rc,
+ "%-10s : cur %3u worst %3u (at %ld, "DHMS_FMT" ago) ",
+ "network", cur, worst, worstt, DHMS_VARS(&ts));
+ rc = lprocfs_at_hist_helper(page, count, rc,
+ &imp->imp_at.iat_net_latency);
+
+ for(i = 0; i < IMP_AT_MAX_PORTALS; i++) {
+ if (imp->imp_at.iat_portal[i] == 0)
+ break;
+ cur = at_get(&imp->imp_at.iat_service_estimate[i]);
+ worst = imp->imp_at.iat_service_estimate[i].at_worst_ever;
+ worstt = imp->imp_at.iat_service_estimate[i].at_worst_time;
+ s2dhms(&ts, now - worstt);
+ rc += snprintf(page + rc, count - rc,
+ "portal %-2d : cur %3u worst %3u (at %ld, "
+ DHMS_FMT" ago) ", imp->imp_at.iat_portal[i],
+ cur, worst, worstt, DHMS_VARS(&ts));
+ rc = lprocfs_at_hist_helper(page, count, rc,
+ &imp->imp_at.iat_service_estimate[i]);
+ }
+
+ LPROCFS_CLIMP_EXIT(obd);
+ return rc;
+}
+
+int lprocfs_wr_timeouts(struct file *file, const char *buffer,
+ unsigned long count, void *data)
+{
+ struct obd_device *obd = (struct obd_device *)data;
+ struct obd_import *imp;
+ time_t bval;
+ int val, i, rc;
+
+ LASSERT(obd != NULL);
+
+ rc = lprocfs_write_helper(buffer, count, &val);
+ if (rc < 0)
+ return rc;
+ if (val <= 0)
+ return -ERANGE;
+
+ LPROCFS_CLIMP_CHECK(obd);
+ imp = obd->u.cli.cl_import;
+
+ bval = max_t(time_t, 1, val / AT_BINS);
+ spin_lock(&imp->imp_at.iat_net_latency.at_lock);
+ imp->imp_at.iat_net_latency.at_binlimit = bval;
+ spin_unlock(&imp->imp_at.iat_net_latency.at_lock);
+ for(i = 0; i < IMP_AT_MAX_PORTALS; i++) {
+ spin_lock(&imp->imp_at.iat_service_estimate[i].at_lock);
+ imp->imp_at.iat_service_estimate[i].at_binlimit = bval;
+ spin_unlock(&imp->imp_at.iat_service_estimate[i].at_lock);
+ }
+
+ LPROCFS_CLIMP_EXIT(obd);
+ return count;
+}
+
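In other words, writing N to the timeouts file resets the adaptive-timeout history window: each of the AT_BINS bins gets a limit of N / AT_BINS seconds, so writing 600 with AT_BINS = 4 keeps four 150-second bins, matching the AT_TIMEBASE_DEFAULT of 600 seconds that the read side reports as "timebase".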
static const char *obd_connect_names[] = {
"read_only",
"lov_index",
struct lprocfs_stats *stats = p->private;
struct lprocfs_counter *cntr = v;
struct lprocfs_counter t, ret = { .lc_min = ~(__u64)0 };
- int i, idx, rc;
+ int i, idx, rc = 0;
if (cntr == &(stats->ls_percpu[0])->lp_cntr[0]) {
struct timeval now;
ret.lc_sumsquare += t.lc_sumsquare;
}
+ if (ret.lc_count == 0)
+ goto out;
+
rc = seq_printf(p, "%-25s "LPU64" samples [%s]", cntr->lc_name,
ret.lc_count, cntr->lc_units);
if (rc < 0)
EXPORT_SYMBOL(lprocfs_rd_conn_uuid);
EXPORT_SYMBOL(lprocfs_rd_num_exports);
EXPORT_SYMBOL(lprocfs_rd_numrefs);
-
+EXPORT_SYMBOL(lprocfs_at_hist_helper);
+EXPORT_SYMBOL(lprocfs_rd_timeouts);
+EXPORT_SYMBOL(lprocfs_wr_timeouts);
EXPORT_SYMBOL(lprocfs_rd_blksize);
EXPORT_SYMBOL(lprocfs_rd_kbytestotal);
EXPORT_SYMBOL(lprocfs_rd_kbytesfree);
CDEBUG(D_IOCTL, "changing lustre timeout from %d to %d\n",
obd_timeout, lcfg->lcfg_num);
obd_timeout = max(lcfg->lcfg_num, 1U);
- obd_health_check_timeout = HEALTH_CHECK_TIMEOUT;
GOTO(out, err = 0);
}
case LCFG_SET_UPCALL: {
CERROR("Can't find mount for %s\n", name);
RETURN(-ENOENT);
}
- lsi = s2lsi(lmi->lmi_sb);
LASSERT(lmi->lmi_mnt == mnt);
+ /* Note if lsi is poisoned, it's probably because server_wait_finished
+ didn't wait long enough and the sb was freed at the umount. */
+ lsi = s2lsi(lmi->lmi_sb);
CDEBUG(D_MOUNT, "put_mnt %p from %s, refs=%d, vfscount=%d\n",
lmi->lmi_mnt, name, atomic_read(&lsi->lsi_mounts), count);
static int lustre_start_mgc(struct super_block *sb)
{
struct lustre_handle mgc_conn = {0, };
- struct obd_connect_data ocd = { 0 };
+ struct obd_connect_data *data = NULL;
struct lustre_sb_info *lsi = s2lsi(sb);
struct obd_device *obd;
struct obd_export *exp;
/* nonfatal */
CERROR("can't set %s %d\n", KEY_INIT_RECOV_BACKUP, rc);
+ OBD_ALLOC_PTR(data);
+ if (data == NULL)
+ GOTO(out, rc = -ENOMEM);
+ data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_AT;
+ data->ocd_version = LUSTRE_VERSION_CODE;
/* We connect to the MGS at setup, and don't disconnect until cleanup */
- rc = obd_connect(&mgc_conn, obd, &(obd->obd_uuid), &ocd);
+ rc = obd_connect(&mgc_conn, obd, &(obd->obd_uuid), data);
+ OBD_FREE_PTR(data);
if (rc) {
CERROR("connect failed %d\n", rc);
GOTO(out, rc);
RETURN(ERR_PTR(rc));
}
-/* We have to wait for everything to finish, including lnet lnd expires,
+/* We have to wait for everything to finish, including lnet lnd expires,
before it is safe to free the sb */
static void server_wait_finished(struct vfsmount *mnt)
{
init_waitqueue_head(&waitq);
while (atomic_read(&mnt->mnt_count) > 1) {
- if (waited && (waited % 30 == 0))
+ if (waited && (waited % 30 == 0))
LCONSOLE_WARN("Mount still busy with %d refs after "
- "%d secs\n", atomic_read(&mnt->mnt_count),
+ "%d secs\n", atomic_read(&mnt->mnt_count),
waited);
/* Wait for a bit */
waited += 3;
{
spin_lock_init(&exp->exp_filter_data.fed_lock);
INIT_LIST_HEAD(&exp->exp_filter_data.fed_mod_list);
-
+
spin_lock(&exp->exp_lock);
exp->exp_connecting = 1;
spin_unlock(&exp->exp_lock);
obd->obd_next_recovery_transno = obd->obd_last_committed + 1;
obd->obd_recovering = 1;
obd->obd_recovery_start = CURRENT_SECONDS;
- /* Only used for lprocfs_status */
+ obd->obd_recovery_timeout = OBD_RECOVERY_TIMEOUT;
obd->obd_recovery_end = obd->obd_recovery_start +
- OBD_RECOVERY_TIMEOUT;
+ obd->obd_recovery_timeout;
}
out:
return dparent;
rc = filter_lock_dentry(obd, dparent);
- fsfilt_check_slow(obd, now, obd_timeout, "parent lock");
+ fsfilt_check_slow(obd, now, "parent lock");
return rc ? ERR_PTR(rc) : dparent;
}
obd->obd_recoverable_clients,
(obd->obd_recoverable_clients == 1)
? "client" : "clients",
- (int)(OBD_RECOVERY_TIMEOUT) / 60,
- (int)(OBD_RECOVERY_TIMEOUT) % 60,
+ obd->obd_recovery_timeout / 60,
+ obd->obd_recovery_timeout % 60,
obd->obd_name);
} else {
LCONSOLE_INFO("OST %s now serving %s (%s%s%s) with recovery "
struct dentry *dchild = NULL, *dparent = NULL;
struct filter_obd *filter;
int err = 0, rc = 0, recreate_obj = 0, i;
- unsigned long enough_time = jiffies + min(obd_timeout * HZ / 4, 10U*HZ);
+ cfs_time_t enough_time = cfs_time_shift(DISK_TIMEOUT/2);
obd_id next_id;
void *handle = NULL;
ENTRY;
if (rc)
break;
- if (time_after(jiffies, enough_time)) {
+ if (cfs_time_after(cfs_time_current(), enough_time)) {
CDEBUG(D_RPCTRACE,
"%s: precreate slow - want %d got %d \n",
obd->obd_name, *num, i);
struct super_block *sb = obd->u.obt.obt_sb;
struct inode *inode = sb->s_root->d_inode;
BDEVNAME_DECLARE_STORAGE(tmp);
- CERROR("*** setting device %s read-only ***\n",
- ll_bdevname(sb, tmp));
+ LCONSOLE_WARN("*** setting obd %s device '%s' read-only ***\n",
+ obd->obd_name, ll_bdevname(sb, tmp));
handle = fsfilt_start(obd, inode, FSFILT_OP_MKNOD, NULL);
if (!IS_ERR(handle))
#define FILTER_GRANT_CHUNK (2ULL * PTLRPC_MAX_BRW_SIZE)
#define GRANT_FOR_LLOG(obd) 16
-#define FILTER_RECOVERY_TIMEOUT (obd_timeout * 5 * HZ / 2) /* *waves hands* */
-
extern struct file_operations filter_per_export_stats_fops;
/* Data stored per client in the last_rcvd file. In le32 order. */
#else
#define FILTER_FMD_MAX_NUM_DEFAULT 32
#endif
+/* Client cache seconds */
#define FILTER_FMD_MAX_AGE_DEFAULT ((obd_timeout + 10) * HZ)
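With the default obd_timeout of 100 seconds this works out to (100 + 10) * HZ jiffies, i.e. client modification data is cached for about 110 seconds.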
struct filter_mod_data *filter_fmd_find(struct obd_export *exp,
inode = dentry->d_inode;
obdo_to_inode(inode, oa, OBD_MD_FLATIME);
- fsfilt_check_slow(obd, now, obd_timeout, "preprw_read setup");
+ fsfilt_check_slow(obd, now, "preprw_read setup");
for (i = 0, lnb = res, rnb = nb; i < obj->ioo_bufcnt;
i++, rnb++, lnb++) {
filter_iobuf_add_page(obd, iobuf, inode, lnb->page);
}
- fsfilt_check_slow(obd, now, obd_timeout, "start_page_read");
+ fsfilt_check_slow(obd, now, "start_page_read");
rc = filter_direct_io(OBD_BRW_READ, dentry, iobuf,
exp, NULL, NULL, NULL);
fso.fso_dentry = dentry;
fso.fso_bufcnt = obj->ioo_bufcnt;
- fsfilt_check_slow(exp->exp_obd, now, obd_timeout, "preprw_write setup");
+ fsfilt_check_slow(exp->exp_obd, now, "preprw_write setup");
/* Don't update inode timestamps if this write is older than a
* setattr which modifies the timestamps. b=10150 */
rc = filter_direct_io(OBD_BRW_READ, dentry, iobuf, exp,
NULL, NULL, NULL);
- fsfilt_check_slow(exp->exp_obd, now, obd_timeout, "start_page_write");
+ fsfilt_check_slow(exp->exp_obd, now, "start_page_write");
lprocfs_counter_add(exp->exp_obd->obd_stats, LPROC_FILTER_WRITE_BYTES,
tot_bytes);
GOTO(cleanup, rc);
}
- fsfilt_check_slow(obd, now, obd_timeout, "brw_start");
+ fsfilt_check_slow(obd, now, "brw_start");
i = OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME;
if (rc == 0)
obdo_from_inode(oa, inode, FILTER_VALID_FLAGS);
- fsfilt_check_slow(obd, now, obd_timeout, "direct_io");
+ fsfilt_check_slow(obd, now, "direct_io");
err = fsfilt_commit_wait(obd, inode, wait_handle);
if (err) {
LASSERTF(oti->oti_transno <= obd->obd_last_committed,
"oti_transno "LPU64" last_committed "LPU64"\n",
oti->oti_transno, obd->obd_last_committed);
- fsfilt_check_slow(obd, now, obd_timeout, "commitrw commit");
+ fsfilt_check_slow(obd, now, "commitrw commit");
cleanup:
filter_grant_commit(exp, niocount, res);
DQUOT_INIT(inode);
LOCK_INODE_MUTEX(inode);
- fsfilt_check_slow(obd, now, obd_timeout, "i_mutex");
+ fsfilt_check_slow(obd, now, "i_mutex");
oti->oti_handle = fsfilt_brw_start(obd, objcount, &fso, niocount, res,
oti);
if (IS_ERR(oti->oti_handle)) {
}
/* have to call fsfilt_commit() from this point on */
- fsfilt_check_slow(obd, now, obd_timeout, "brw_start");
+ fsfilt_check_slow(obd, now, "brw_start");
i = OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME;
lquota_getflag(filter_quota_interface_ref, obd, oa);
- fsfilt_check_slow(obd, now, obd_timeout, "direct_io");
+ fsfilt_check_slow(obd, now, "direct_io");
err = fsfilt_commit_wait(obd, inode, wait_handle);
if (err) {
"oti_transno "LPU64" last_committed "LPU64"\n",
oti->oti_transno, obd->obd_last_committed);
- fsfilt_check_slow(obd, now, obd_timeout, "commitrw commit");
+ fsfilt_check_slow(obd, now, "commitrw commit");
cleanup:
filter_grant_commit(exp, niocount, res);
{ "prealloc_next_id", osc_rd_prealloc_next_id, 0, 0 },
{ "prealloc_last_id", osc_rd_prealloc_last_id, 0, 0 },
{ "checksums", osc_rd_checksum, osc_wr_checksum, 0 },
+ { "timeouts", lprocfs_rd_timeouts, lprocfs_wr_timeouts, 0 },
{ 0 }
};
CDEBUG(D_HA,"%s: oscc recovery in progress, waiting\n",
oscc->oscc_obd->obd_name);
- lwi = LWI_TIMEOUT(cfs_timeout_cap(cfs_time_seconds(obd_timeout/4)),
- NULL, NULL);
+ lwi = LWI_TIMEOUT(cfs_timeout_cap(cfs_time_seconds(
+ obd_timeout / 4)), NULL, NULL);
rc = l_wait_event(oscc->oscc_waitq,
!oscc_recovering(oscc), &lwi);
LASSERT(rc == 0 || rc == -ETIMEDOUT);
* If getting the lock took more time than
* client was willing to wait, drop it. b=11330
*/
- if (cfs_time_current_sec() > req->rq_arrival_time.tv_sec + obd_timeout ||
+ if (cfs_time_current_sec() > req->rq_deadline ||
OBD_FAIL_CHECK(OBD_FAIL_OST_DROP_REQ)) {
no_reply = 1;
CERROR("Dropping timed-out read from %s because locking"
- "object "LPX64" took %ld seconds.\n",
+ "object "LPX64" took %ld seconds (limit was %ld).\n",
libcfs_id2str(req->rq_peer), ioo->ioo_id,
- cfs_time_current_sec() - req->rq_arrival_time.tv_sec);
+ cfs_time_current_sec() - req->rq_arrival_time.tv_sec,
+ req->rq_deadline - req->rq_arrival_time.tv_sec);
goto out_lock;
}
else
rc = ptlrpc_start_bulk_transfer(desc);
if (rc == 0) {
- lwi = LWI_TIMEOUT_INTERVAL(obd_timeout * HZ / 4, HZ,
- ost_bulk_timeout, desc);
- rc = l_wait_event(desc->bd_waitq,
- !ptlrpc_bulk_active(desc) ||
- desc->bd_export->exp_failed, &lwi);
- LASSERT(rc == 0 || rc == -ETIMEDOUT);
+ do {
+ long timeoutl = req->rq_deadline -
+ cfs_time_current_sec();
+ cfs_duration_t timeout = (timeoutl <= 0 || rc) ?
+ CFS_TICK : cfs_time_seconds(timeoutl);
+ lwi = LWI_TIMEOUT_INTERVAL(timeout,
+ cfs_time_seconds(1),
+ ost_bulk_timeout,
+ desc);
+ rc = l_wait_event(desc->bd_waitq,
+ !ptlrpc_bulk_active(desc) ||
+ desc->bd_export->exp_failed,
+ &lwi);
+ LASSERT(rc == 0 || rc == -ETIMEDOUT);
+ /* Wait again if we changed deadline */
+ } while ((rc == -ETIMEDOUT) &&
+ (req->rq_deadline > cfs_time_current_sec()));
+
if (rc == -ETIMEDOUT) {
DEBUG_REQ(D_ERROR, req, "timeout on bulk PUT");
ptlrpc_abort_bulk(desc);
req->rq_status = rc;
ptlrpc_error(req);
} else {
- if (req->rq_reply_state != NULL) {
- /* reply out callback would free */
- ptlrpc_rs_decref(req->rq_reply_state);
- req->rq_reply_state = NULL;
- }
+ /* reply out callback would free */
+ ptlrpc_req_drop_rs(req);
CWARN("%s: ignoring bulk IO comm error with %s@%s id %s - "
"client will retry\n",
req->rq_export->exp_obd->obd_name,
* If getting the lock took more time than
* client was willing to wait, drop it. b=11330
*/
- if (cfs_time_current_sec() > req->rq_arrival_time.tv_sec + obd_timeout ||
+ if (cfs_time_current_sec() > req->rq_deadline ||
OBD_FAIL_CHECK(OBD_FAIL_OST_DROP_REQ)) {
no_reply = 1;
- CERROR("Dropping timed-out write from %s because locking"
- "object "LPX64" took %ld seconds.\n",
+ CERROR("Dropping timed-out write from %s because locking "
+ "object "LPX64" took %ld seconds (limit was %ld).\n",
libcfs_id2str(req->rq_peer), ioo->ioo_id,
- cfs_time_current_sec() - req->rq_arrival_time.tv_sec);
+ cfs_time_current_sec() - req->rq_arrival_time.tv_sec,
+ req->rq_deadline - req->rq_arrival_time.tv_sec);
goto out_lock;
}
else
rc = ptlrpc_start_bulk_transfer (desc);
if (rc == 0) {
- lwi = LWI_TIMEOUT_INTERVAL(obd_timeout * HZ / 2, HZ,
- ost_bulk_timeout, desc);
- rc = l_wait_event(desc->bd_waitq, !ptlrpc_bulk_active(desc) ||
- desc->bd_export->exp_failed, &lwi);
- LASSERT(rc == 0 || rc == -ETIMEDOUT);
+ do {
+ long timeoutl = req->rq_deadline -
+ cfs_time_current_sec();
+ cfs_duration_t timeout = (timeoutl <= 0 || rc) ?
+ CFS_TICK : cfs_time_seconds(timeoutl);
+ lwi = LWI_TIMEOUT_INTERVAL(timeout, cfs_time_seconds(1),
+ ost_bulk_timeout, desc);
+ rc = l_wait_event(desc->bd_waitq,
+ !ptlrpc_bulk_active(desc) ||
+ desc->bd_export->exp_failed, &lwi);
+ LASSERT(rc == 0 || rc == -ETIMEDOUT);
+ /* Wait again if we changed deadline */
+ } while ((rc == -ETIMEDOUT) &&
+ (req->rq_deadline > cfs_time_current_sec()));
+
if (rc == -ETIMEDOUT) {
DEBUG_REQ(D_ERROR, req, "timeout on bulk GET");
ptlrpc_abort_bulk(desc);
req->rq_status = rc;
ptlrpc_error(req);
} else {
- if (req->rq_reply_state != NULL) {
- /* reply out callback would free */
- ptlrpc_rs_decref(req->rq_reply_state);
- req->rq_reply_state = NULL;
- }
+ /* reply out callback would free */
+ ptlrpc_req_drop_rs(req);
CWARN("%s: ignoring bulk IO comm error with %s@%s id %s - "
"client will retry\n",
req->rq_export->exp_obd->obd_name,
case OST_CREATE:
CDEBUG(D_INODE, "create\n");
OBD_FAIL_RETURN(OBD_FAIL_OST_CREATE_NET, 0);
+ OBD_FAIL_TIMEOUT_MS(OBD_FAIL_OST_PAUSE_CREATE, obd_fail_val);
if (OBD_FAIL_CHECK_ONCE(OBD_FAIL_OST_ENOSPC))
GOTO(out, rc = -ENOSPC);
if (OBD_FAIL_CHECK_ONCE(OBD_FAIL_OST_EROFS))
ost->ost_service =
ptlrpc_init_svc(OST_NBUFS, OST_BUFSIZE, OST_MAXREQSIZE,
OST_MAXREPSIZE, OST_REQUEST_PORTAL,
- OSC_REPLY_PORTAL,
- obd_timeout * 1000, ost_handle, LUSTRE_OSS_NAME,
+ OSC_REPLY_PORTAL, OSS_SERVICE_WATCHDOG_FACTOR,
+ ost_handle, LUSTRE_OSS_NAME,
obd->obd_proc_entry, ost_print_req,
oss_min_threads, oss_max_threads, "ll_ost");
if (ost->ost_service == NULL) {
ost->ost_create_service =
ptlrpc_init_svc(OST_NBUFS, OST_BUFSIZE, OST_MAXREQSIZE,
OST_MAXREPSIZE, OST_CREATE_PORTAL,
- OSC_REPLY_PORTAL,
- obd_timeout * 1000, ost_handle, "ost_create",
+ OSC_REPLY_PORTAL, OSS_SERVICE_WATCHDOG_FACTOR,
+ ost_handle, "ost_create",
obd->obd_proc_entry, ost_print_req,
oss_min_create_threads,
oss_max_create_threads,
ost->ost_io_service =
ptlrpc_init_svc(OST_NBUFS, OST_BUFSIZE, OST_MAXREQSIZE,
OST_MAXREPSIZE, OST_IO_PORTAL,
- OSC_REPLY_PORTAL,
- obd_timeout * 1000, ost_handle, "ost_io",
+ OSC_REPLY_PORTAL, OSS_SERVICE_WATCHDOG_FACTOR,
+ ost_handle, "ost_io",
obd->obd_proc_entry, ost_print_req,
oss_min_threads, oss_max_threads, "ll_ost_io");
if (ost->ost_io_service == NULL) {
# define ost_print_req NULL
#endif
+#define OSS_SERVICE_WATCHDOG_FACTOR 2000
+
/*
* tunables for per-thread page pool (bug 5137)
*/
struct ost_thread_local_cache *ost_tls(struct ptlrpc_request *r);
-#define OSS_DEF_CREATE_THREADS 1UL
+#define OSS_DEF_CREATE_THREADS 2UL
#define OSS_MAX_CREATE_THREADS 16UL
/* Quota stuff */
EXIT;
}
+/* Set server timelimit for this req */
+static void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req)
+{
+ __u32 serv_est;
+ int idx;
+ struct imp_at *at;
+ LASSERT(req->rq_import);
+
+ if (AT_OFF ||
+ ((idx = import_at_get_index(req->rq_import,
+ req->rq_request_portal)) < 0)) {
+ /* non-AT settings */
+ req->rq_timeout = req->rq_import->imp_server_timeout ?
+ obd_timeout / 2 : obd_timeout;
+ goto out;
+ }
+
+ at = &req->rq_import->imp_at;
+ serv_est = at_get(&at->iat_service_estimate[idx]);
+ /* add an arbitrary minimum: 125% +5 sec */
+ req->rq_timeout = serv_est + (serv_est >> 2) + 5;
+ /* We could get even fancier here, using history to predict increased
+ loading... */
+
+ if (at->iat_drain > req->rq_timeout) {
+ /* If we're trying to drain the network queues, give this
+ req a long timeout */
+ req->rq_timeout = at->iat_drain;
+ CDEBUG(D_ADAPTTO, "waiting %ds to let queues drain\n",
+ req->rq_timeout);
+ }
+
+out:
+ /* Let the server know what this RPC timeout is by putting it in the
+       reqmsg */
+ lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout);
+}
+
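
A quick standalone model of the padding rule above (illustration only, not
part of the patch): the client allows 125% of the portal's service estimate
plus 5 seconds, so a 40s estimate yields a 55s RPC timeout.

    #include <stdio.h>

    /* Same arithmetic as ptlrpc_at_set_req_timeout(): 125% + 5 sec. */
    static unsigned int at_req_timeout(unsigned int serv_est)
    {
            return serv_est + (serv_est >> 2) + 5;
    }

    int main(void)
    {
            unsigned int est;

            for (est = 5; est <= 80; est *= 2)
                    printf("estimate %2us -> rpc timeout %3us\n",
                           est, at_req_timeout(est));
            return 0;
    }
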
+/* Adjust max service estimate based on server value */
+static void ptlrpc_at_adj_service(struct ptlrpc_request *req)
+{
+ int idx;
+ unsigned int serv_est, oldse;
+ struct imp_at *at = &req->rq_import->imp_at;
+ LASSERT(req->rq_import);
+
+ /* service estimate is returned in the repmsg timeout field,
+ may be 0 on err */
+ serv_est = lustre_msg_get_timeout(req->rq_repmsg);
+
+ idx = import_at_get_index(req->rq_import, req->rq_request_portal);
+ if (idx < 0)
+ return;
+
+ oldse = at_get(&at->iat_service_estimate[idx]);
+ /* max service estimates are tracked on the server side,
+ so just keep minimal history here */
+ at_add(&at->iat_service_estimate[idx], serv_est);
+
+ if (at_get(&at->iat_service_estimate[idx]) != oldse)
+ CDEBUG(D_ADAPTTO, "The RPC service estimate for %s ptl %d "
+ "has changed %ds to %u\n",
+ req->rq_import->imp_obd->obd_name, req->rq_request_portal,
+ at_get(&at->iat_service_estimate[idx]) - oldse, serv_est);
+}
+
+/* Expected network latency per remote node (secs) */
+int ptlrpc_at_get_net_latency(struct ptlrpc_request *req)
+{
+ return AT_OFF ? 0 : at_get(&req->rq_import->imp_at.iat_net_latency);
+}
+
+/* Adjust expected network latency */
+static void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req)
+{
+ unsigned int st, nl, oldnl;
+ struct imp_at *at = &req->rq_import->imp_at;
+ time_t now = cfs_time_current_sec();
+ LASSERT(req->rq_import);
+
+ st = lustre_msg_get_service_time(req->rq_repmsg);
+
+ /* Network latency is total time less server processing time */
+        nl = max_t(int, now - req->rq_sent - st, 0) + 1 /* st rounding */;
+ if (st > now - req->rq_sent + 1 /* rounding */)
+ CERROR("Reported service time %u > total measured time %ld\n",
+ st, now - req->rq_sent);
+
+ oldnl = at_get(&at->iat_net_latency);
+ at_add(&at->iat_net_latency, nl);
+
+ if (at_get(&at->iat_net_latency) != oldnl)
+ CDEBUG(D_ADAPTTO, "The network latency for %s (nid %s)"
+ " has changed %ds to %u\n",
+ req->rq_import->imp_obd->obd_name,
+ obd_uuid2str(
+ &req->rq_import->imp_connection->c_remote_uuid),
+ at_get(&at->iat_net_latency) - oldnl, nl);
+}
+
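
The latency derivation above, modeled standalone (illustration only): the
round-trip wall time minus the server-reported service time, clamped at
zero, plus one second of rounding slop.

    #include <stdio.h>
    #include <time.h>

    static unsigned int net_latency(time_t sent, time_t now, unsigned int st)
    {
            long nl = (long)(now - sent) - (long)st;

            return (nl > 0 ? (unsigned int)nl : 0) + 1 /* st rounding */;
    }

    int main(void)
    {
            time_t now = time(NULL);

            /* sent 12s ago, server reports 10s of service -> 3s estimate */
            printf("net latency: %us\n", net_latency(now - 12, now, 10));
            return 0;
    }
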
+static int unpack_reply(struct ptlrpc_request *req)
+{
+ int rc;
+
+ /* Clear reply swab mask; we may have already swabbed an early reply */
+ req->rq_rep_swab_mask = 0;
+
+ rc = lustre_unpack_msg(req->rq_repmsg, req->rq_nob_received);
+ if (rc) {
+ DEBUG_REQ(D_ERROR, req, "unpack_rep failed: %d", rc);
+ return(-EPROTO);
+ }
+
+ rc = lustre_unpack_rep_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF);
+ if (rc) {
+ DEBUG_REQ(D_ERROR, req, "unpack ptlrpc body failed: %d", rc);
+ return(-EPROTO);
+ }
+ return 0;
+}
+
+/* Handle an early reply message.
+ We can't risk the real reply coming in and changing rq_repmsg,
+ so this fn must be called under the rq_lock */
+static int ptlrpc_at_early_reply(struct ptlrpc_request *req)
+{
+ struct lustre_msg *oldmsg, *msgcpy;
+ time_t olddl;
+ int oldlen, rc;
+ ENTRY;
+
+ req->rq_early = 0;
+
+ rc = unpack_reply(req);
+ if (rc)
+ /* Let's just ignore it - same as if it never got here */
+ RETURN(rc);
+
+ /* We've got to make sure another early reply doesn't land on
+ top of our current repbuf. Make a copy and verify checksum. */
+ oldlen = req->rq_replen;
+ spin_unlock(&req->rq_lock);
+ OBD_ALLOC(msgcpy, oldlen);
+ if (!msgcpy) {
+ spin_lock(&req->rq_lock);
+ RETURN(-ENOMEM);
+ }
+ spin_lock(&req->rq_lock);
+ /* Another reply might have changed the repmsg and replen while
+ we dropped the lock; doesn't really matter, just use the latest.
+ If it doesn't fit in oldlen, checksum will be wrong. */
+ oldmsg = req->rq_repmsg;
+ memcpy(msgcpy, oldmsg, oldlen);
+ if (lustre_msg_get_cksum(msgcpy) !=
+ lustre_msg_calc_cksum(msgcpy)) {
+ CDEBUG(D_ADAPTTO, "Early reply checksum mismatch, "
+ "discarding %x != %x\n", lustre_msg_get_cksum(msgcpy),
+ lustre_msg_calc_cksum(msgcpy));
+ GOTO(out, rc = -EINVAL);
+ }
+
+ /* Our copied msg is valid, now we can adjust the timeouts without
+ worrying that a new reply will land on the copy. */
+ req->rq_repmsg = msgcpy;
+
+ /* Expecting to increase the service time estimate here */
+ ptlrpc_at_adj_service(req);
+ ptlrpc_at_adj_net_latency(req);
+
+ /* Adjust the local timeout for this req */
+ ptlrpc_at_set_req_timeout(req);
+
+ olddl = req->rq_deadline;
+ /* server assumes it now has rq_timeout from when it sent the
+ early reply, so client should give it at least that long. */
+ req->rq_deadline = cfs_time_current_sec() + req->rq_timeout +
+ ptlrpc_at_get_net_latency(req);
+
+ DEBUG_REQ(D_ADAPTTO, req,
+ "Early reply #%d, new deadline in %lds (%+lds)",
+ req->rq_early_count, req->rq_deadline -
+ cfs_time_current_sec(), req->rq_deadline - olddl);
+
+ req->rq_repmsg = oldmsg;
+
+out:
+ OBD_FREE(msgcpy, oldlen);
+ RETURN(rc);
+}
+
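
The snapshot-then-verify idiom in ptlrpc_at_early_reply() in isolation
(illustration only; the rotate/xor checksum is a stand-in for
lustre_msg_calc_cksum()): copy the shared buffer first and validate only
the private copy, so a racing writer can at worst make the checksum fail.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static unsigned int cksum(const unsigned char *buf, size_t len)
    {
            unsigned int c = 0;

            while (len--)
                    c = ((c << 1) | (c >> 31)) ^ *buf++;
            return c;
    }

    /* Returns 0 if the snapshot checksums correctly, -1 otherwise. */
    static int validate_snapshot(const unsigned char *live, size_t len,
                                 unsigned int wire_cksum)
    {
            unsigned char *copy = malloc(len);
            int ok;

            if (copy == NULL)
                    return -1;
            memcpy(copy, live, len);             /* snapshot the live buffer */
            ok = cksum(copy, len) == wire_cksum; /* verify the copy only */
            free(copy);
            return ok ? 0 : -1;
    }

    int main(void)
    {
            unsigned char buf[] = "early reply payload";
            unsigned int c = cksum(buf, sizeof(buf));

            printf("valid: %d\n", validate_snapshot(buf, sizeof(buf), c) == 0);
            return 0;
    }
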
void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool)
{
struct list_head *l, *tmp;
}
lustre_msg_add_version(request->rq_reqmsg, version);
-
- if (imp->imp_server_timeout)
- request->rq_timeout = obd_timeout / 2;
- else
- request->rq_timeout = obd_timeout;
request->rq_send_state = LUSTRE_IMP_FULL;
request->rq_type = PTL_RPC_MSG_REQUEST;
request->rq_import = class_import_get(imp);
/* XXX FIXME bug 249 */
request->rq_request_portal = imp->imp_client->cli_request_portal;
request->rq_reply_portal = imp->imp_client->cli_reply_portal;
+
+ ptlrpc_at_set_req_timeout(request);
spin_lock_init(&request->rq_lock);
CFS_INIT_LIST_HEAD(&request->rq_list);
atomic_set(&request->rq_refcount, 1);
lustre_msg_set_opc(request->rq_reqmsg, opcode);
- lustre_msg_set_flags(request->rq_reqmsg, 0);
+ lustre_msg_set_flags(request->rq_reqmsg, imp->imp_msg_flags);
RETURN(request);
}
if (req->rq_restart)
GOTO(out, rc = 1);
+
+ if (req->rq_early) {
+ ptlrpc_at_early_reply(req);
+ GOTO(out, rc = 0); /* keep waiting */
+ }
+
EXIT;
out:
spin_unlock(&req->rq_lock);
/* NB Until this point, the whole of the incoming message,
* including buflens, status etc is in the sender's byte order. */
-#if SWAB_PARANOIA
- /* Clear reply swab mask; this is a new reply in sender's byte order */
- req->rq_rep_swab_mask = 0;
-#endif
LASSERT (req->rq_nob_received <= req->rq_replen);
- rc = lustre_unpack_msg(req->rq_repmsg, req->rq_nob_received);
- if (rc) {
- DEBUG_REQ(D_ERROR, req, "unpack_rep failed: %d", rc);
- RETURN(-EPROTO);
- }
+ rc = unpack_reply(req);
+ if (rc)
+ RETURN(rc);
- rc = lustre_unpack_rep_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF);
- if (rc) {
- DEBUG_REQ(D_ERROR, req, "unpack ptlrpc body failed: %d", rc);
- RETURN(-EPROTO);
- }
+ OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_PAUSE_REP, obd_fail_val);
+ ptlrpc_at_adj_service(req);
+ ptlrpc_at_adj_net_latency(req);
if (lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_REPLY &&
lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_ERR) {
/* Still waiting for a reply? */
if (ptlrpc_client_receiving_reply(req))
continue;
+
+ spin_lock(&req->rq_lock);
+ if (req->rq_early) {
+ ptlrpc_at_early_reply(req);
+ spin_unlock(&req->rq_lock);
+ continue;
+ }
+ spin_unlock(&req->rq_lock);
/* Did we actually receive a reply? */
if (!ptlrpc_client_replied(req))
RETURN(set->set_remaining == 0 || force_timer_recalc);
}
+/* Return 1 if we should give up, else 0 */
int ptlrpc_expire_one_request(struct ptlrpc_request *req)
{
struct obd_import *imp = req->rq_import;
int rc = 0;
ENTRY;
- DEBUG_REQ(D_ERROR|D_NETERROR, req, "%s (sent at %lu, %lus ago)",
+ DEBUG_REQ(D_NETERROR, req, "%s (sent at %lu, %lus ago)",
req->rq_net_err ? "network error" : "timeout",
- (long)req->rq_sent, CURRENT_SECONDS - req->rq_sent);
+ (long)req->rq_sent, cfs_time_current_sec() - req->rq_sent);
+
+ if (imp) {
+ LCONSOLE_WARN("Request x"LPU64" sent from %s to NID %s %lus ago"
+ " has timed out (limit %lus).\n", req->rq_xid,
+ req->rq_import->imp_obd->obd_name,
+ libcfs_nid2str(imp->imp_connection->c_peer.nid),
+ cfs_time_current_sec() - req->rq_sent,
+ req->rq_deadline - req->rq_sent);
+ }
if (imp != NULL && obd_debug_peer_on_timeout)
LNetCtl(IOC_LIBCFS_DEBUG_PEER, &imp->imp_connection->c_peer);
* then error it out here. */
if (req->rq_send_state != LUSTRE_IMP_FULL ||
imp->imp_obd->obd_no_recov) {
+ DEBUG_REQ(D_RPCTRACE, req, "err -110, sent_state=%s (now=%s)",
+ ptlrpc_import_state_name(req->rq_send_state),
+ ptlrpc_import_state_name(imp->imp_state));
spin_lock(&req->rq_lock);
req->rq_status = -ETIMEDOUT;
req->rq_err = 1;
RETURN(1);
}
- /* if request can't be resend we can't wait answer after timeout */
+ /* if a request can't be resent we can't wait for an answer after
+ the timeout */
if (req->rq_no_resend) {
DEBUG_REQ(D_RPCTRACE, req, "TIMEOUT-NORESEND:");
rc = 1;
{
struct ptlrpc_request_set *set = data;
struct list_head *tmp;
- time_t now = CURRENT_SECONDS;
+ time_t now = cfs_time_current_sec();
ENTRY;
LASSERT(set != NULL);
continue;
if (req->rq_timedout || /* already dealt with */
- req->rq_sent + req->rq_timeout > now) /* not expired */
+ req->rq_deadline > now) /* not expired */
continue;
/* deal with this guy */
}
}
+/* get the smallest timeout in the set; this does NOT set a timeout. */
int ptlrpc_set_next_timeout(struct ptlrpc_request_set *set)
{
struct list_head *tmp;
- time_t now = CURRENT_SECONDS;
- time_t deadline;
+ time_t now = cfs_time_current_sec();
int timeout = 0;
struct ptlrpc_request *req;
ENTRY;
if (req->rq_timedout) /* already timed out */
continue;
- deadline = req->rq_sent + req->rq_timeout;
- if (deadline <= now) /* actually expired already */
+ if (req->rq_deadline <= now) { /* actually expired already */
timeout = 1; /* ASAP */
- else if (timeout == 0 || timeout > deadline - now)
- timeout = deadline - now;
+ break;
+ } else if (timeout == 0 || timeout > req->rq_deadline - now) {
+ timeout = req->rq_deadline - now;
+ }
}
RETURN(timeout);
}
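
The deadline scan above reduces to a minimum over per-request deadlines
with an early exit once anything has already expired; a minimal model
(illustration only):

    #include <stdio.h>
    #include <time.h>

    static long next_timeout(const time_t *deadlines, int n, time_t now)
    {
            long timeout = 0;
            int i;

            for (i = 0; i < n; i++) {
                    if (deadlines[i] <= now)
                            return 1;               /* expired: poll ASAP */
                    if (timeout == 0 || timeout > deadlines[i] - now)
                            timeout = deadlines[i] - now;
            }
            return timeout;
    }

    int main(void)
    {
            time_t now = time(NULL);
            time_t dl[] = { now + 30, now + 7, now + 300 };

            printf("wake in %lds\n", next_timeout(dl, 3, now));
            return 0;
    }
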
LBUG();
}
- if (request->rq_repmsg != NULL) {
- OBD_FREE(request->rq_repmsg, request->rq_replen);
+ if (request->rq_repbuf != NULL) {
+ OBD_FREE(request->rq_repbuf, request->rq_replen);
+ request->rq_repbuf = NULL;
request->rq_repmsg = NULL;
}
if (request->rq_export != NULL) {
LASSERT(!in_interrupt ()); /* might sleep */
- if (!ptlrpc_client_receiving_reply(request))
+ if (!ptlrpc_client_receiving_reply(request) &&
+ !ptlrpc_client_must_unlink(request))
+ /* Nothing left to do */
return;
LNetMDUnlink (request->rq_reply_md_h);
/* We have to l_wait_event() whatever the result, to give liblustre
- * a chance to run reply_in_callback() */
+ * a chance to run reply_in_callback(), and to make sure we've
+ * unlinked before returning a req to the pool */
if (request->rq_set != NULL)
wq = &request->rq_set->set_waitq;
for (;;) {
/* Network access will complete in finite time but the HUGE
* timeout lets us CWARN for visibility of sluggish NALs */
- lwi = LWI_TIMEOUT(cfs_time_seconds(300), NULL, NULL);
- rc = l_wait_event (*wq, !ptlrpc_client_receiving_reply(request), &lwi);
+ lwi = LWI_TIMEOUT(cfs_time_seconds(FOREVER), NULL, NULL);
+ rc = l_wait_event (*wq,
+ !ptlrpc_client_receiving_reply(request) &&
+ !ptlrpc_client_must_unlink(request),
+ &lwi);
if (rc == 0)
return;
LASSERT (rc == -ETIMEDOUT);
- DEBUG_REQ(D_WARNING, request, "Unexpectedly long timeout");
+ DEBUG_REQ(D_WARNING, request, "Unexpectedly long timeout "
+ "rvcng=%d unlnk=%d",
+ ptlrpc_client_receiving_reply(request),
+ ptlrpc_client_must_unlink(request));
}
EXIT;
}
spin_unlock(&req->rq_lock);
}
-static int expired_request(void *data)
-{
- struct ptlrpc_request *req = data;
- ENTRY;
-
- RETURN(ptlrpc_expire_one_request(req));
-}
-
static void interrupted_request(void *data)
{
struct ptlrpc_request *req = data;
int brc;
struct l_wait_info lwi;
struct obd_import *imp = req->rq_import;
- cfs_duration_t timeout = 0;
+ cfs_duration_t timeout = CFS_TICK;
+ long timeoutl;
ENTRY;
LASSERT(req->rq_set == NULL);
list_del_init(&req->rq_list);
if (req->rq_err) {
+ /* rq_status was set locally */
rc = -EIO;
}
else if (req->rq_intr) {
spin_unlock(&imp->imp_lock);
rc = ptl_send_rpc(req, 0);
- if (rc) {
+ if (rc)
DEBUG_REQ(D_HA, req, "send failed (%d); recovering", rc);
- timeout = CFS_TICK;
- } else {
- timeout = cfs_timeout_cap(cfs_time_seconds(req->rq_timeout));
+ do {
+ timeoutl = req->rq_deadline - cfs_time_current_sec();
+ timeout = (timeoutl <= 0 || rc) ? CFS_TICK :
+ cfs_time_seconds(timeoutl);
DEBUG_REQ(D_NET, req,
- "-- sleeping for "CFS_DURATION_T" jiffies", timeout);
+ "-- sleeping for "CFS_DURATION_T" ticks", timeout);
+ lwi = LWI_TIMEOUT_INTR(timeout, NULL, interrupted_request, req);
+ brc = l_wait_event(req->rq_reply_waitq, ptlrpc_check_reply(req),
+ &lwi);
+ /* Wait again if we changed deadline */
+ } while ((brc == -ETIMEDOUT) &&
+ (req->rq_deadline > cfs_time_current_sec()));
+
+ if ((brc == -ETIMEDOUT) && !ptlrpc_expire_one_request(req)) {
+ /* Wait forever for reconnect / replay or failure */
+ lwi = LWI_INTR(interrupted_request, req);
+ rc = l_wait_event(req->rq_reply_waitq, ptlrpc_check_reply(req),
+ &lwi);
}
- lwi = LWI_TIMEOUT_INTR(timeout, expired_request, interrupted_request,
- req);
- l_wait_event(req->rq_reply_waitq, ptlrpc_check_reply(req), &lwi);
CDEBUG(D_RPCTRACE, "Completed RPC pname:cluuid:pid:xid:nid:opc "
"%s:%s:%d:"LPU64":%s:%d\n", cfs_curproc_comm(),
lustre_msg_get_status(req->rq_reqmsg), req->rq_xid,
libcfs_nid2str(imp->imp_connection->c_peer.nid),
lustre_msg_get_opc(req->rq_reqmsg));
-
spin_lock(&imp->imp_lock);
list_del_init(&req->rq_list);
spin_unlock(&imp->imp_lock);
* req->rq_receiving_reply is clear and returns. */
ptlrpc_unregister_reply (req);
- if (req->rq_err)
- GOTO(out, rc = -EIO);
- /* Resend if we need to, unless we were interrupted. */
- if (req->rq_resend && !req->rq_intr) {
- /* ...unless we were specifically told otherwise. */
- if (req->rq_no_resend)
- GOTO(out, rc = -ETIMEDOUT);
- spin_lock(&imp->imp_lock);
- goto restart;
+ if (req->rq_err) {
+ DEBUG_REQ(D_RPCTRACE, req, "err rc=%d status=%d",
+ rc, req->rq_status);
+ GOTO(out, rc = -EIO);
}
if (req->rq_intr) {
GOTO(out, rc = -EINTR);
}
+ /* Resend if we need to */
+ if (req->rq_resend) {
+ /* ...unless we were specifically told otherwise. */
+ if (req->rq_no_resend)
+ GOTO(out, rc = -ETIMEDOUT);
+ spin_lock(&imp->imp_lock);
+ goto restart;
+ }
+
if (req->rq_timedout) { /* non-recoverable timeout */
GOTO(out, rc = -ETIMEDOUT);
}
out:
if (req->rq_bulk != NULL) {
- if (rc >= 0) {
+        while (rc >= 0) {
/* success so far. Note that anything going wrong
* with bulk now, is EXTREMELY strange, since the
* server must have believed that the bulk
             * transferred OK before she replied with success to
* me. */
+ timeoutl = req->rq_deadline - cfs_time_current_sec();
+ timeout = (timeoutl <= 0) ? CFS_TICK :
+ cfs_time_seconds(timeoutl);
+
lwi = LWI_TIMEOUT(timeout, NULL, NULL);
brc = l_wait_event(req->rq_reply_waitq,
!ptlrpc_bulk_active(req->rq_bulk),
&lwi);
LASSERT(brc == 0 || brc == -ETIMEDOUT);
+ if ((brc == -ETIMEDOUT) &&
+ (req->rq_deadline > cfs_time_current_sec()))
+ /* Wait again if we changed deadline */
+ continue;
if (brc != 0) {
- LASSERT(brc == -ETIMEDOUT);
DEBUG_REQ(D_ERROR, req, "bulk timed out");
rc = brc;
} else if (!req->rq_bulk->bd_success) {
DEBUG_REQ(D_ERROR, req, "bulk transfer failed");
rc = -EIO;
}
+ break;
}
if (rc < 0)
ptlrpc_unregister_bulk (req);
aa->praa_old_state = req->rq_send_state;
req->rq_send_state = LUSTRE_IMP_REPLAY;
req->rq_phase = RQ_PHASE_NEW;
- aa->praa_old_status = lustre_msg_get_status(req->rq_repmsg);
+ if (req->rq_repmsg)
+ aa->praa_old_status = lustre_msg_get_status(req->rq_repmsg);
req->rq_status = 0;
-
req->rq_interpret_reply = ptlrpc_replay_interpret;
+
atomic_inc(&req->rq_import->imp_replay_inflight);
ptlrpc_request_addref(req); /* ptlrpcd needs a ref */
spin_lock (&req->rq_lock);
if (req->rq_import_generation < imp->imp_generation) {
req->rq_err = 1;
+ req->rq_status = -EINTR;
ptlrpc_wake_client_req(req);
}
spin_unlock (&req->rq_lock);
spin_lock (&req->rq_lock);
if (req->rq_import_generation < imp->imp_generation) {
req->rq_err = 1;
+ req->rq_status = -EINTR;
ptlrpc_wake_client_req(req);
}
spin_unlock (&req->rq_lock);
return tmp;
}
EXPORT_SYMBOL(ptlrpc_sample_next_xid);
+
struct ptlrpc_request *req = cbid->cbid_arg;
ENTRY;
- LASSERT (ev->type == LNET_EVENT_PUT ||
- ev->type == LNET_EVENT_UNLINK);
- LASSERT (ev->unlinked);
- LASSERT (ev->md.start == req->rq_repmsg);
- LASSERT (ev->offset == 0);
- LASSERT (ev->mlength <= req->rq_replen);
-
DEBUG_REQ((ev->status == 0) ? D_NET : D_ERROR, req,
"type %d, status %d", ev->type, ev->status);
+
+ LASSERT(ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_UNLINK);
+ LASSERT(ev->md.start == req->rq_repbuf);
+ LASSERT(ev->mlength <= req->rq_replen);
spin_lock(&req->rq_lock);
- LASSERT (req->rq_receiving_reply);
req->rq_receiving_reply = 0;
+ req->rq_early = 0;
+
+ if (ev->status)
+ goto out_wake;
+ if (ev->type == LNET_EVENT_UNLINK) {
+ req->rq_must_unlink = 0;
+ DEBUG_REQ(D_RPCTRACE, req, "unlink");
+ goto out_wake;
+ }
- if (ev->type == LNET_EVENT_PUT && ev->status == 0) {
+ /* We've set LNET_MD_MANAGE_REMOTE for all outgoing requests
+ for adaptive timeouts' early reply. */
+ LASSERT(ev->md.options & LNET_MD_MANAGE_REMOTE);
+
+ if ((ev->offset == 0) &&
+ lustre_msg_get_flags(req->rq_reqmsg) & MSG_AT_SUPPORT) {
+ /* Early reply */
+ DEBUG_REQ(D_ADAPTTO, req,
+ "Early reply received: mlen=%u offset=%d replen=%d "
+ "replied=%d", ev->mlength, ev->offset, req->rq_replen,
+ req->rq_replied);
+
+ LASSERT(ev->mlength == lustre_msg_early_size());
+ req->rq_early_count++; /* number received, client side */
+ if (req->rq_replied)
+ /* If we already got the real reply, ignore the early
+ reply, but signal so we can unlink */
+ goto out_wake;
+ req->rq_early = 1;
+ req->rq_nob_received = ev->mlength;
+ /* repmsg points to early reply */
+ req->rq_repmsg = req->rq_repbuf;
+ /* And we're still receiving */
+ req->rq_receiving_reply = 1;
+ } else {
+ /* Real reply */
req->rq_replied = 1;
req->rq_nob_received = ev->mlength;
+ /* repmsg points to real reply */
+ req->rq_repmsg = (struct lustre_msg *)((char *)req->rq_repbuf +
+ ev->offset);
+ /* LNetMDUnlink can't be called under the LNET_LOCK,
+ so we must unlink in ptlrpc_unregister_reply */
+ DEBUG_REQ(D_INFO, req,
+ "reply in flags=%x mlen=%u offset=%d replen=%d",
+ lustre_msg_get_flags(req->rq_reqmsg),
+ ev->mlength, ev->offset, req->rq_replen);
}
+ req->rq_import->imp_last_reply_time = cfs_time_current_sec();
+
+out_wake:
/* NB don't unlock till after wakeup; req can disappear under us
* since we don't have our own ref */
ptlrpc_wake_client_req(req);
-
spin_unlock(&req->rq_lock);
EXIT;
}
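
Because the reply MD is posted with LNET_MD_MANAGE_REMOTE, the sender
chooses the landing offset, and the callback above classifies replies by
it: offset 0 with the fixed early-reply size means an early reply and the
buffer stays armed; anything else is the real reply. A toy classifier
(illustration only; EARLY_SIZE is a hypothetical stand-in for
lustre_msg_early_size()):

    #include <stdio.h>

    #define EARLY_SIZE 128 /* hypothetical fixed early-reply message size */

    static const char *classify(unsigned int offset, unsigned int mlength)
    {
            if (offset == 0 && mlength == EARLY_SIZE)
                    return "early reply (keep receiving)";
            return "real reply (stop receiving, unlink later)";
    }

    int main(void)
    {
            printf("%s\n", classify(0, EARLY_SIZE));
            printf("%s\n", classify(EARLY_SIZE, 4096));
            return 0;
    }
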
#ifdef CRAY_XT3
req->rq_uid = ev->uid;
#endif
+ spin_lock_init(&req->rq_lock);
+ sema_init(&req->rq_rs_sem, 1);
+ CFS_INIT_LIST_HEAD(&req->rq_timed_list);
+ atomic_set(&req->rq_refcount, 1);
+ if (ev->type == LNET_EVENT_PUT)
+ DEBUG_REQ(D_RPCTRACE, req, "incoming req");
spin_lock(&service->srv_lock);
rqbd->rqbd_refcount++;
}
- list_add_tail(&req->rq_list, &service->srv_request_queue);
+ list_add_tail(&req->rq_list, &service->srv_req_in_queue);
service->srv_n_queued_reqs++;
/* NB everything can disappear under us once the request
*/
void ptlrpc_invalidate_import(struct obd_import *imp)
{
+ struct list_head *tmp, *n;
+ struct ptlrpc_request *req;
struct l_wait_info lwi;
- int rc;
+ time_t last = 0;
+ int timeout, rc = 0;
atomic_inc(&imp->imp_inval_count);
LASSERT(imp->imp_invalid);
/* wait for all requests to error out and call completion callbacks */
- lwi = LWI_TIMEOUT_INTERVAL(cfs_timeout_cap(cfs_time_seconds(obd_timeout)),
- HZ, NULL, NULL);
- rc = l_wait_event(imp->imp_recovery_waitq,
- (atomic_read(&imp->imp_inflight) == 0),
- &lwi);
+ spin_lock(&imp->imp_lock);
+ list_for_each_safe(tmp, n, &imp->imp_sending_list) {
+ req = list_entry(tmp, struct ptlrpc_request, rq_list);
+ last = max(last, req->rq_deadline);
+ }
+ list_for_each_safe(tmp, n, &imp->imp_delayed_list) {
+ req = list_entry(tmp, struct ptlrpc_request, rq_list);
+ last = max(last, req->rq_deadline);
+ }
+ spin_unlock(&imp->imp_lock);
- if (rc)
- CDEBUG(D_HA, "%s: rc = %d waiting for callback (%d != 0)\n",
+ timeout = (int)(last - cfs_time_current_sec());
+ if (timeout > 0) {
+ lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(timeout),
+ HZ, NULL, NULL);
+ rc = l_wait_event(imp->imp_recovery_waitq,
+ (atomic_read(&imp->imp_inflight) == 0),
+ &lwi);
+ }
+
+ if (atomic_read(&imp->imp_inflight)) {
+ CERROR("%s: rc = %d waiting for callback (%d != 0)\n",
obd2cli_tgt(imp->imp_obd), rc,
atomic_read(&imp->imp_inflight));
+ spin_lock(&imp->imp_lock);
+ list_for_each_safe(tmp, n, &imp->imp_sending_list) {
+ req = list_entry(tmp, struct ptlrpc_request, rq_list);
+ DEBUG_REQ(D_ERROR, req, "still on sending list");
+ }
+ list_for_each_safe(tmp, n, &imp->imp_delayed_list) {
+ req = list_entry(tmp, struct ptlrpc_request, rq_list);
+ DEBUG_REQ(D_ERROR, req, "still on delayed list");
+ }
+ spin_unlock(&imp->imp_lock);
+ }
obd_import_event(imp->imp_obd, imp, IMP_EVENT_INVALIDATE);
{
struct obd_import_conn *imp_conn = NULL, *conn;
struct obd_export *dlmexp;
+ int tried_all = 1;
ENTRY;
spin_lock(&imp->imp_lock);
imp->imp_obd->obd_name,
libcfs_nid2str(conn->oic_conn->c_peer.nid),
conn->oic_last_attempt);
- /* Throttle the reconnect rate to once per RECONNECT_INTERVAL */
- if (cfs_time_before_64(conn->oic_last_attempt +
- RECONNECT_INTERVAL * HZ,
- cfs_time_current_64())) {
- /* If we have never tried this connection since the
- the last successful attempt, go with this one */
- if (cfs_time_beforeq_64(conn->oic_last_attempt,
- imp->imp_last_success_conn)) {
- imp_conn = conn;
- break;
- }
+
+ /* Don't thrash connections */
+ if (cfs_time_before_64(cfs_time_current_64(),
+ conn->oic_last_attempt +
+ cfs_time_seconds(CONNECTION_SWITCH_MIN))) {
+ continue;
+ }
- /* Both of these connections have already been tried
- since the last successful connection; just choose the
- least recently used */
- if (!imp_conn)
- imp_conn = conn;
- else if (cfs_time_before_64(conn->oic_last_attempt,
- imp_conn->oic_last_attempt))
- imp_conn = conn;
- } else {
- /* Exceptionally unlikely case caused by the node
- * booting and attempting to mount lustre faster than
- * than RECONNECT_INTERVAL seconds. */
- if (unlikely(conn->oic_last_attempt == 0)) {
- imp_conn = conn;
- break;
- }
+                /* If we have not tried this connection since the
+                   last successful attempt, go with this one */
+ if ((conn->oic_last_attempt == 0) ||
+ cfs_time_beforeq_64(conn->oic_last_attempt,
+ imp->imp_last_success_conn)) {
+ imp_conn = conn;
+ tried_all = 0;
+ break;
}
+
+ /* If all of the connections have already been tried
+                   since the last successful connection, just choose the
+ least recently used */
+ if (!imp_conn)
+ imp_conn = conn;
+ else if (cfs_time_before_64(conn->oic_last_attempt,
+ imp_conn->oic_last_attempt))
+ imp_conn = conn;
}
/* if not found, simply choose the current one */
if (!imp_conn) {
LASSERT(imp->imp_conn_current);
imp_conn = imp->imp_conn_current;
+ tried_all = 0;
}
LASSERT(imp_conn->oic_conn);
+ /* If we've tried everything, and we're back to the beginning of the
+ list, wait for LND_TIMEOUT to give the queues a chance to drain. */
+ if (tried_all && (imp->imp_conn_list.next == &imp_conn->oic_item)) {
+ int must_wait;
+ LASSERT(imp_conn->oic_last_attempt);
+ must_wait = LND_TIMEOUT -
+ (int)cfs_duration_sec(cfs_time_current_64() -
+ imp_conn->oic_last_attempt);
+ imp->imp_at.iat_drain = max(0, must_wait);
+                CWARN("Tried all connections, %ds drain time\n",
+ imp->imp_at.iat_drain);
+ } else {
+ imp->imp_at.iat_drain = 0;
+ }
+
imp_conn->oic_last_attempt = cfs_time_current_64();
/* switch connection, don't mind if it's same as the current one */
aa->pcaa_peer_committed = committed_before_reconnect;
aa->pcaa_initial_connect = initial_connect;
-
if (aa->pcaa_initial_connect) {
spin_lock(&imp->imp_lock);
imp->imp_replayable = 1;
spin_unlock(&imp->imp_lock);
- /* On an initial connect, we don't know which one of a
- failover server pair is up. Don't wait long. */
-#ifdef CRAY_XT3
- request->rq_timeout = max((int)(obd_timeout / 2), 5);
-#else
- request->rq_timeout = max((int)(obd_timeout / 20), 5);
-#endif
+ if (AT_OFF)
+ /* AT will use INITIAL_CONNECT_TIMEOUT the first
+ time, adaptive after that. */
+ request->rq_timeout = INITIAL_CONNECT_TIMEOUT;
}
- DEBUG_REQ(D_RPCTRACE, request, "(re)connect request");
+ DEBUG_REQ(D_RPCTRACE, request, "%sconnect request %d",
+ aa->pcaa_initial_connect ? "initial " : "re",
+ imp->imp_conn_cnt);
ptlrpcd_add_req(request);
rc = 0;
out:
ENTRY;
spin_lock(&imp->imp_lock);
+ imp->imp_at.iat_drain = 0;
if (imp->imp_state == LUSTRE_IMP_CLOSED) {
spin_unlock(&imp->imp_lock);
RETURN(0);
imp->imp_last_replay_transno = 0;
IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY);
} else {
+ DEBUG_REQ(D_HA, request, "evicting, flags=%x", msg_flags);
imp->imp_remote_handle =
*lustre_msg_get_handle(request->rq_repmsg);
IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED);
ocd->ocd_brw_size >> CFS_PAGE_SHIFT;
}
+ if ((ocd->ocd_connect_flags & OBD_CONNECT_AT) &&
+ (imp->imp_msg_magic == LUSTRE_MSG_MAGIC_V2))
+ /* We need a per-message support flag, because
+ a. we don't know if the incoming connect reply
+ supports AT or not (in reply_in_callback)
+ until we unpack it.
+                   b. a failed-over server means the export and flags are gone
+ (in ptlrpc_send_reply).
+ Can only be set when we know AT is supported at
+ both ends */
+ imp->imp_msg_flags |= MSG_AT_SUPPORT;
+ else
+ imp->imp_msg_flags &= ~MSG_AT_SUPPORT;
+
LASSERT((cli->cl_max_pages_per_rpc <= PTLRPC_MAX_BRW_PAGES) &&
(cli->cl_max_pages_per_rpc > 0));
}
if (ptlrpc_import_in_recovery(imp)) {
struct l_wait_info lwi;
- cfs_duration_t timeout = cfs_time_seconds(obd_timeout);
+ cfs_duration_t timeout;
+ int idx;
+ if (AT_OFF || (idx = import_at_get_index(imp,
+ imp->imp_client->cli_request_portal)) < 0)
+ timeout = cfs_time_seconds(obd_timeout);
+ else
+ timeout = cfs_time_seconds(
+ at_get(&imp->imp_at.iat_service_estimate[idx]));
lwi = LWI_TIMEOUT_INTR(cfs_timeout_cap(timeout),
back_to_sleep, LWI_ON_SIGNAL_NOOP, NULL);
rc = l_wait_event(imp->imp_recovery_waitq,
!ptlrpc_import_in_recovery(imp), &lwi);
-
}
spin_lock(&imp->imp_lock);
* it fails. We can get through the above with a down server
* if the client doesn't know the server is gone yet. */
req->rq_no_resend = 1;
-#ifdef CRAY_XT3
- req->rq_timeout = obd_timeout / 3;
+
+#ifndef CRAY_XT3
+ /* We want client umounts to happen quickly, no matter the
+ server state... */
+ req->rq_timeout = min(req->rq_timeout, INITIAL_CONNECT_TIMEOUT);
#else
- req->rq_timeout = 5;
+ /* ... but we always want liblustre clients to nicely
+ disconnect, so only use the adaptive value. */
+ if (AT_OFF)
+ req->rq_timeout = obd_timeout / 3;
#endif
+
IMPORT_SET_STATE(imp, LUSTRE_IMP_CONNECTING);
req->rq_send_state = LUSTRE_IMP_CONNECTING;
ptlrpc_req_set_repsize(req, 1, NULL);
else
IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CLOSED);
memset(&imp->imp_remote_handle, 0, sizeof(imp->imp_remote_handle));
+ /* Try all connections in the future - bz 12758 */
+ imp->imp_last_recon = 0;
spin_unlock(&imp->imp_lock);
RETURN(rc);
LNetSetAsync(imp->imp_connection->c_peer, count);
}
+
+/* Adaptive Timeout utils */
+
+/* Bin into timeslices using AT_BINS bins.
+   This gives us the max over the last binlimit*AT_BINS secs without having
+   to store every sample, while still smoothing out a return to normalcy
+   after a slow response.
+ (E.g. remember the maximum latency in each minute of the last 4 minutes.) */
+void at_add(struct adaptive_timeout *at, unsigned int val)
+{
+ /*unsigned int old = at->at_current;*/
+ time_t now = cfs_time_current_sec();
+
+ LASSERT(at);
+#if 0
+ CDEBUG(D_INFO, "add %u to %p time=%lu tb=%lu v=%u (%u %u %u %u)\n",
+ val, at, now - at->at_binstart, at->at_binlimit, at->at_current,
+ at->at_hist[0], at->at_hist[1], at->at_hist[2], at->at_hist[3]);
+#endif
+ if (val == 0)
+ /* 0's don't count, because we never want our timeout to
+ drop to 0, and because 0 could mean an error */
+ return;
+
+ spin_lock(&at->at_lock);
+
+ if (unlikely(at->at_binstart == 0)) {
+ /* Special case to remove default from history */
+ at->at_current = val;
+ at->at_worst_ever = val;
+ at->at_worst_time = now;
+ at->at_hist[0] = val;
+ at->at_binstart = now;
+        } else if (now - at->at_binstart < at->at_binlimit) {
+ /* in bin 0 */
+ at->at_hist[0] = max(val, at->at_hist[0]);
+ at->at_current = max(val, at->at_current);
+ } else {
+ int i, shift;
+ unsigned int maxv = val;
+ /* move bins over */
+ shift = (now - at->at_binstart) / at->at_binlimit;
+ LASSERT(shift > 0);
+ for(i = AT_BINS - 1; i >= 0; i--) {
+ if (i >= shift) {
+ at->at_hist[i] = at->at_hist[i - shift];
+ maxv = max(maxv, at->at_hist[i]);
+ } else {
+ at->at_hist[i] = 0;
+ }
+ }
+ at->at_hist[0] = val;
+ at->at_current = maxv;
+ at->at_binstart += shift * at->at_binlimit;
+ }
+
+ if ((at->at_flags & AT_FLG_MIN) &&
+ (at->at_current < adaptive_timeout_min))
+ at->at_current = adaptive_timeout_min;
+
+ if (at->at_current > at->at_worst_ever) {
+ at->at_worst_ever = at->at_current;
+ at->at_worst_time = now;
+ }
+
+ if (at->at_flags & AT_FLG_NOHIST)
+                /* Only use the last reported val; the rest of the history
+                   is kept for proc display only */
+ at->at_current = val;
+
+#if 0
+ if (at->at_current != old)
+ CDEBUG(D_ADAPTTO, "AT change: old=%u new=%u delta=%d (val=%u) "
+ "hist %u %u %u %u\n",
+ old, at->at_current, at->at_current - old, val,
+ at->at_hist[0], at->at_hist[1], at->at_hist[2],
+ at->at_hist[3]);
+#endif
+ spin_unlock(&at->at_lock);
+}
+
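
A userspace model of the binning above (illustration only; locking and
the AT_FLG_MIN/AT_FLG_NOHIST handling are omitted): AT_BINS per-slice
maxima track the max over the last AT_BINS * binlimit seconds in constant
space, and a spike ages out one bin at a time.

    #include <stdio.h>

    #define AT_BINS 4

    struct at {
            unsigned int hist[AT_BINS];
            unsigned int current;
            long binstart;          /* start time of bin 0 */
            long binlimit;          /* seconds per bin */
    };

    static void at_add_model(struct at *at, long now, unsigned int val)
    {
            if (at->binstart == 0) {                        /* first sample */
                    at->hist[0] = at->current = val;
                    at->binstart = now;
            } else if (now - at->binstart < at->binlimit) { /* still bin 0 */
                    if (val > at->hist[0])
                            at->hist[0] = val;
                    if (val > at->current)
                            at->current = val;
            } else {                        /* shift bins, recompute the max */
                    int i, shift = (now - at->binstart) / at->binlimit;
                    unsigned int maxv = val;

                    for (i = AT_BINS - 1; i >= 0; i--) {
                            at->hist[i] = (i >= shift) ? at->hist[i - shift] : 0;
                            if (at->hist[i] > maxv)
                                    maxv = at->hist[i];
                    }
                    at->hist[0] = val;
                    at->current = maxv;
                    at->binstart += (long)shift * at->binlimit;
            }
    }

    int main(void)
    {
            struct at at = { .binlimit = 60 };
            long t = 1000;

            at_add_model(&at, t, 50);        /* one slow response */
            at_add_model(&at, t + 90, 10);   /* next minute: back to normal */
            at_add_model(&at, t + 300, 10);  /* 5 minutes on: spike aged out */
            printf("current estimate: %us\n", at.current);  /* prints 10 */
            return 0;
    }
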
+/* Find the imp_at index for a given portal; assign if space available */
+int import_at_get_index(struct obd_import *imp, int portal)
+{
+ struct imp_at *at = &imp->imp_at;
+ int i;
+
+ for (i = 0; i < IMP_AT_MAX_PORTALS; i++) {
+ if (at->iat_portal[i] == portal)
+ return i;
+ if (at->iat_portal[i] == 0)
+ /* unused */
+ break;
+ }
+
+ /* Not found in list, add it under a lock */
+ spin_lock(&imp->imp_lock);
+
+ /* Check unused under lock */
+ for (; i < IMP_AT_MAX_PORTALS; i++) {
+ if (at->iat_portal[i] == portal)
+ goto out;
+ if (at->iat_portal[i] == 0)
+ /* unused */
+ break;
+ }
+
+ if (i >= IMP_AT_MAX_PORTALS) {
+ CERROR("Tried to use more than %d portals, not enough room "
+ "in adaptive timeout stats.\n", IMP_AT_MAX_PORTALS);
+ i = -1;
+ goto out;
+ }
+ at->iat_portal[i] = portal;
+
+out:
+ spin_unlock(&imp->imp_lock);
+ return i;
+}
+
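
import_at_get_index() scans without the lock on the fast path and
re-checks under imp_lock before claiming a slot, which is safe because
entries are append-only and never removed. The same shape standalone
(illustration only; pthreads stand in for the kernel spinlock, and the
portal numbers are arbitrary):

    #include <pthread.h>
    #include <stdio.h>

    #define MAX_PORTALS 8

    static int portals[MAX_PORTALS];        /* 0 means "slot unused" */
    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    static int get_index(int portal)
    {
            int i;

            for (i = 0; i < MAX_PORTALS; i++) {     /* lockless fast path */
                    if (portals[i] == portal)
                            return i;
                    if (portals[i] == 0)
                            break;                  /* first unused slot */
            }

            pthread_mutex_lock(&lock);
            for (; i < MAX_PORTALS; i++) {          /* re-check under lock */
                    if (portals[i] == portal)
                            goto out;
                    if (portals[i] == 0) {
                            portals[i] = portal;    /* claim the slot */
                            goto out;
                    }
            }
            i = -1;                                 /* table full */
    out:
            pthread_mutex_unlock(&lock);
            return i;
    }

    int main(void)
    {
            int a = get_index(6);
            int b = get_index(28);
            int c = get_index(6);

            printf("%d %d %d\n", a, b, c);          /* prints "0 1 0" */
            return 0;
    }
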
+/* Get total expected lock callback time (net + service).
+ Since any early reply will only affect the RPC wait time, and not
+ any local lock timer we set based on the return value here,
+ we should be conservative. */
+int import_at_get_ldlm(struct obd_import *imp)
+{
+ int idx, tot;
+
+ if (!imp || !imp->imp_client || AT_OFF)
+ return obd_timeout;
+
+ tot = at_get(&imp->imp_at.iat_net_latency);
+ idx = import_at_get_index(imp, imp->imp_client->cli_request_portal);
+ if (idx < 0)
+ tot += obd_timeout;
+ else
+ tot += at_get(&imp->imp_at.iat_service_estimate[idx]);
+
+ /* add an arbitrary minimum: 150% + 10 sec */
+ tot += (tot >> 1) + 10;
+ return tot;
+}
+
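
Worked numbers for the budget above (illustration only): with 2s of
measured net latency and a 30s service estimate, the lock callback budget
is 32 + 16 + 10 = 58s, deliberately generous because an early reply can
stretch the RPC wait but not a local lock timer derived from this value.

    #include <stdio.h>

    /* Same arithmetic as import_at_get_ldlm(): 150% + 10 sec. */
    static int ldlm_timeout_budget(int net, int service)
    {
            int tot = net + service;

            return tot + (tot >> 1) + 10;
    }

    int main(void)
    {
            printf("budget: %ds\n", ldlm_timeout_budget(2, 30)); /* 58s */
            return 0;
    }
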
svc_counter_config, "req_qdepth", "reqs");
lprocfs_counter_init(svc_stats, PTLRPC_REQACTIVE_CNTR,
svc_counter_config, "req_active", "reqs");
+ lprocfs_counter_init(svc_stats, PTLRPC_TIMEOUT,
+ svc_counter_config, "adaptive_timeout", "sec");
lprocfs_counter_init(svc_stats, PTLRPC_REQBUF_AVAIL_CNTR,
svc_counter_config, "reqbuf_avail", "bufs");
for (i = 0; i < LUSTRE_MAX_OPCODES; i++) {
return 0;
}
+/* See also lprocfs_rd_timeouts */
+static int ptlrpc_lprocfs_rd_timeouts(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{
+ struct ptlrpc_service *svc = data;
+ unsigned int cur, worst;
+ time_t worstt;
+ struct dhms ts;
+ int rc = 0;
+
+ *eof = 1;
+ cur = at_get(&svc->srv_at_estimate);
+ worst = svc->srv_at_estimate.at_worst_ever;
+ worstt = svc->srv_at_estimate.at_worst_time;
+ s2dhms(&ts, cfs_time_current_sec() - worstt);
+ if (AT_OFF)
+ rc += snprintf(page + rc, count - rc,
+ "adaptive timeouts off, using obd_timeout %u\n",
+ obd_timeout);
+ rc += snprintf(page + rc, count - rc,
+ "%10s : %ld sec\n", "timebase",
+ svc->srv_at_estimate.at_binlimit * AT_BINS);
+ rc += snprintf(page + rc, count - rc,
+ "%10s : cur %3u worst %3u (at %ld, "DHMS_FMT" ago) ",
+ "service", cur, worst, worstt,
+ DHMS_VARS(&ts));
+ rc = lprocfs_at_hist_helper(page, count, rc,
+ &svc->srv_at_estimate);
+ return rc;
+}
+
+static int ptlrpc_lprocfs_wr_timeouts(struct file *file, const char *buffer,
+ unsigned long count, void *data)
+{
+ struct ptlrpc_service *svc = data;
+ int val, rc;
+
+ rc = lprocfs_write_helper(buffer, count, &val);
+ if (rc < 0)
+ return rc;
+ if (val <= 0)
+ return -ERANGE;
+
+ spin_lock(&svc->srv_at_estimate.at_lock);
+ svc->srv_at_estimate.at_binlimit = max(1, val / AT_BINS);
+ spin_unlock(&svc->srv_at_estimate.at_lock);
+
+ return count;
+}
+
void ptlrpc_lprocfs_register_service(struct proc_dir_entry *entry,
struct ptlrpc_service *svc)
{
.write_fptr = ptlrpc_lprocfs_write_req_history_max,
.read_fptr = ptlrpc_lprocfs_read_req_history_max,
.data = svc},
+ {.name = "timeouts",
+ .write_fptr = ptlrpc_lprocfs_wr_timeouts,
+ .read_fptr = ptlrpc_lprocfs_rd_timeouts,
+ .data = svc},
{NULL}
};
static struct file_operations req_history_fops = {
static int ptl_send_buf (lnet_handle_md_t *mdh, void *base, int len,
lnet_ack_req_t ack, struct ptlrpc_cb_id *cbid,
- struct ptlrpc_connection *conn, int portal, __u64 xid)
+ struct ptlrpc_connection *conn, int portal, __u64 xid,
+ unsigned int offset)
{
int rc;
lnet_md_t md;
RETURN (-ENOMEM);
}
- CDEBUG(D_NET, "Sending %d bytes to portal %d, xid "LPD64"\n",
- len, portal, xid);
+ CDEBUG(D_NET, "Sending %d bytes to portal %d, xid "LPD64", offset %u\n",
+ len, portal, xid, offset);
rc = LNetPut (conn->c_self, *mdh, ack,
- conn->c_peer, portal, xid, 0, 0);
+ conn->c_peer, portal, xid, offset, 0);
if (rc != 0) {
int rc2;
/* We're going to get an UNLINK event when I unlink below,
}
}
-int ptlrpc_send_reply (struct ptlrpc_request *req, int may_be_difficult)
+int ptlrpc_send_reply (struct ptlrpc_request *req, int flags)
{
struct ptlrpc_service *svc = req->rq_rqbd->rqbd_service;
struct ptlrpc_reply_state *rs = req->rq_reply_state;
struct ptlrpc_connection *conn;
+ int service_time;
+ unsigned int offset = 0;
int rc;
/* We must already have a reply buffer (only ptlrpc_error() may be
LASSERT (req->rq_repmsg != NULL);
LASSERT (rs != NULL);
LASSERT (req->rq_repmsg == rs->rs_msg);
- LASSERT (may_be_difficult || !rs->rs_difficult);
+ LASSERT ((flags & PTLRPC_REPLY_MAYBE_DIFFICULT) || !rs->rs_difficult);
LASSERT (rs->rs_cb_id.cbid_fn == reply_out_callback);
LASSERT (rs->rs_cb_id.cbid_arg == rs);
+ /* There may be no rq_export during failover */
+
if (req->rq_export && req->rq_export->exp_obd &&
req->rq_export->exp_obd->obd_fail) {
/* Failed obd's only send ENODEV */
lustre_msg_set_type(req->rq_repmsg, req->rq_type);
lustre_msg_set_status(req->rq_repmsg, req->rq_status);
lustre_msg_set_opc(req->rq_repmsg, lustre_msg_get_opc(req->rq_reqmsg));
+
+ service_time = max_t(int, cfs_time_current_sec() -
+ req->rq_arrival_time.tv_sec, 1);
+ if (!(flags & PTLRPC_REPLY_EARLY) &&
+ (req->rq_type != PTL_RPC_MSG_ERR)) {
+ int oldse = at_get(&svc->srv_at_estimate);
+ /* early replies and errors don't count toward our service
+ time estimate */
+ at_add(&svc->srv_at_estimate, service_time);
+ if (service_time > oldse)
+ DEBUG_REQ(D_ADAPTTO, req,
+ "svc %s increased estimate from %d to %d",
+ svc->srv_name, oldse, service_time);
+ }
+ /* Report actual service time for client latency calc */
+ lustre_msg_set_service_time(req->rq_repmsg, service_time);
+ /* Report service time estimate for future client reqs */
+ lustre_msg_set_timeout(req->rq_repmsg, at_get(&svc->srv_at_estimate));
+
+ if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_AT_SUPPORT) {
+ /* early replies go to offset 0, regular replies go after that*/
+ if (flags & PTLRPC_REPLY_EARLY) {
+ offset = 0;
+ /* Include a checksum on early replies - must be done
+ after all other lustre_msg_set_* */
+ lustre_msg_set_cksum(req->rq_repmsg,
+ lustre_msg_calc_cksum(req->rq_repmsg));
+ } else {
+ offset = lustre_msg_early_size();
+ }
+ } else {
+ CDEBUG(D_ADAPTTO, "No early reply support: flags=%#x "
+ "req_flags=%#x magic=%d:%x/%x len=%d\n",
+ flags, lustre_msg_get_flags(req->rq_reqmsg),
+ lustre_msg_is_v1(req->rq_reqmsg),
+ lustre_msg_get_magic(req->rq_reqmsg),
+ lustre_msg_get_magic(req->rq_repmsg), req->rq_replen);
+ }
if (req->rq_export == NULL || req->rq_export->exp_connection == NULL)
conn = ptlrpc_get_connection(req->rq_peer, req->rq_self, NULL);
CERROR("not replying on NULL connection\n"); /* bug 9635 */
return -ENOTCONN;
}
+
atomic_inc (&svc->srv_outstanding_replies);
ptlrpc_rs_addref(rs); /* +1 ref for the network */
-
+ req->rq_sent = cfs_time_current_sec();
+
rc = ptl_send_buf (&rs->rs_md_h, req->rq_repmsg, req->rq_replen,
rs->rs_difficult ? LNET_ACK_REQ : LNET_NOACK_REQ,
- &rs->rs_cb_id, conn,
- svc->srv_rep_portal, req->rq_xid);
+ &rs->rs_cb_id, conn, svc->srv_rep_portal,
+ req->rq_xid, offset);
if (rc != 0) {
atomic_dec (&svc->srv_outstanding_replies);
- ptlrpc_rs_decref(rs);
+ ptlrpc_req_drop_rs(req);
}
ptlrpc_put_connection(conn);
return rc;
if (!noreply) {
LASSERT (request->rq_replen != 0);
- if (request->rq_repmsg == NULL)
- OBD_ALLOC(request->rq_repmsg, request->rq_replen);
- if (request->rq_repmsg == NULL)
+ if (request->rq_repbuf == NULL)
+ OBD_ALLOC(request->rq_repbuf, request->rq_replen);
+ if (request->rq_repbuf == NULL)
GOTO(cleanup_bulk, rc = -ENOMEM);
+ request->rq_repmsg = NULL;
rc = LNetMEAttach(request->rq_reply_portal,/*XXX FIXME bug 249*/
connection->c_peer, request->rq_xid, 0,
spin_lock(&request->rq_lock);
/* If the MD attach succeeds, there _will_ be a reply_in callback */
request->rq_receiving_reply = !noreply;
+ /* We are responsible for unlinking the reply buffer */
+ request->rq_must_unlink = !noreply;
/* Clear any flags that may be present from previous sends. */
request->rq_replied = 0;
request->rq_err = 0;
spin_unlock(&request->rq_lock);
if (!noreply) {
- reply_md.start = request->rq_repmsg;
+ reply_md.start = request->rq_repbuf;
reply_md.length = request->rq_replen;
- reply_md.threshold = 1;
- reply_md.options = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT;
+ /* Allow multiple early replies */
+ reply_md.threshold = LNET_MD_THRESH_INF;
+ /* Manage remote for early replies */
+ reply_md.options = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT |
+ LNET_MD_MANAGE_REMOTE;
reply_md.user_ptr = &request->rq_reply_cbid;
reply_md.eq_handle = ptlrpc_eq_h;
- rc = LNetMDAttach(reply_me_h, reply_md, LNET_UNLINK,
+ /* We must see the unlink callback to unset rq_must_unlink,
+ so we can't auto-unlink */
+ rc = LNetMDAttach(reply_me_h, reply_md, LNET_RETAIN,
&request->rq_reply_md_h);
if (rc != 0) {
CERROR("LNetMDAttach failed: %d\n", rc);
OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_SEND, request->rq_timeout + 5);
- request->rq_sent = CURRENT_SECONDS;
+ request->rq_sent = cfs_time_current_sec();
+ /* We give the server rq_timeout secs to process the req, and
+ add the network latency for our local timeout. */
+ request->rq_deadline = request->rq_sent + request->rq_timeout +
+ ptlrpc_at_get_net_latency(request);
+
ptlrpc_pinger_sending_on_import(request->rq_import);
+
+ DEBUG_REQ(D_INFO, request, "send flg=%x",
+ lustre_msg_get_flags(request->rq_reqmsg));
rc = ptl_send_buf(&request->rq_req_md_h,
request->rq_reqmsg, request->rq_reqlen,
LNET_NOACK_REQ, &request->rq_req_cbid,
connection,
request->rq_request_portal,
- request->rq_xid);
+ request->rq_xid, 0);
if (rc == 0) {
ptlrpc_lprocfs_rpc_sent(request);
RETURN(rc);
if (noreply)
RETURN(rc);
- else
- GOTO(cleanup_me, rc);
+
cleanup_me:
/* MEUnlink is safe; the PUT didn't even get off the ground, and
* nobody apart from the PUT's target has the right nid+XID to
LASSERT (!request->rq_receiving_reply);
cleanup_repmsg:
- OBD_FREE(request->rq_repmsg, request->rq_replen);
- request->rq_repmsg = NULL;
+ OBD_FREE(request->rq_repbuf, request->rq_replen);
+ request->rq_repbuf = NULL;
+        request->rq_repmsg = NULL;
cleanup_bulk:
if (request->rq_bulk != NULL)
}
}
+/* early reply size */
+int lustre_msg_early_size(void)
+{
+ static int size = 0;
+ if (!size)
+ size = lustre_msg_size(LUSTRE_MSG_MAGIC_V2, 1, NULL);
+ return size;
+}
+EXPORT_SYMBOL(lustre_msg_early_size);
+
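
One reply buffer now serves both message kinds: early replies are PUT at
offset 0 and are always exactly lustre_msg_early_size() bytes (a bare
ptlrpc_body), while the real reply is PUT just past that region, so
neither can overwrite the other. A trivial layout demo (illustration
only; EARLY_SIZE is a made-up value):

    #include <stdio.h>
    #include <string.h>

    #define EARLY_SIZE 128  /* hypothetical lustre_msg_early_size() value */

    int main(void)
    {
            char repbuf[1024];

            memcpy(repbuf, "early", 6);               /* offset 0 */
            memcpy(repbuf + EARLY_SIZE, "real", 5);   /* past early region */
            printf("early=\"%s\" real=\"%s\"\n", repbuf, repbuf + EARLY_SIZE);
            return 0;
    }
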
static inline int lustre_msg_size_v1(int count, int *lengths)
{
int size;
lustre_init_msg_v2(req->rq_reqmsg, count, lens, bufs);
lustre_msg_add_version(req->rq_reqmsg, PTLRPC_MSG_VERSION);
lustre_set_req_swabbed(req, MSG_PTLRPC_BODY_OFF);
+
return 0;
}
}
static int lustre_pack_reply_v1(struct ptlrpc_request *req, int count,
- int *lens, char **bufs)
+ int *lens, char **bufs, int flags)
{
struct ptlrpc_reply_state *rs;
int msg_len;
int size;
ENTRY;
- LASSERT (req->rq_reply_state == NULL);
+        /* Ensure only 1 reply at a time: hold sem until ptlrpc_req_drop_rs */
+ down(&req->rq_rs_sem);
+ LASSERT(req->rq_reply_state == NULL);
+ if (req->rq_final && (flags & LPRFL_EARLY_REPLY)) {
+ /* Already packed final, no more early */
+ up(&req->rq_rs_sem);
+ RETURN(-EALREADY);
+ }
msg_len = lustre_msg_size_v1(count, lens);
size = sizeof(struct ptlrpc_reply_state) + msg_len;
OBD_ALLOC(rs, size);
if (unlikely(rs == NULL)) {
rs = lustre_get_emerg_rs(req->rq_rqbd->rqbd_service, size);
- if (!rs)
+ if (!rs) {
+ up(&req->rq_rs_sem);
RETURN (-ENOMEM);
+ }
}
atomic_set(&rs->rs_refcount, 1); /* 1 ref for rq_reply_state */
rs->rs_cb_id.cbid_fn = reply_out_callback;
req->rq_replen = msg_len;
req->rq_reply_state = rs;
req->rq_repmsg = rs->rs_msg;
+        if (!(flags & LPRFL_EARLY_REPLY))
+ req->rq_final = 1; /* checked in lustre_packed_reply */
+
lustre_init_msg_v1(rs->rs_msg, count, lens, bufs);
PTLRPC_RS_DEBUG_LRU_ADD(rs);
}
static int lustre_pack_reply_v2(struct ptlrpc_request *req, int count,
- int *lens, char **bufs)
+ int *lens, char **bufs, int flags)
{
struct ptlrpc_reply_state *rs;
int msg_len;
int size;
ENTRY;
+        /* Ensure only 1 reply at a time: hold sem until ptlrpc_req_drop_rs */
+ down(&req->rq_rs_sem);
LASSERT(req->rq_reply_state == NULL);
+ if (req->rq_final && (flags & LPRFL_EARLY_REPLY)) {
+ /* Already packed final, no more early */
+ up(&req->rq_rs_sem);
+ RETURN(-EALREADY);
+ }
msg_len = lustre_msg_size_v2(count, lens);
size = sizeof(struct ptlrpc_reply_state) + msg_len;
OBD_ALLOC(rs, size);
if (unlikely(rs == NULL)) {
rs = lustre_get_emerg_rs(req->rq_rqbd->rqbd_service, size);
- if (!rs)
+ if (!rs) {
+ up(&req->rq_rs_sem);
RETURN (-ENOMEM);
+ }
}
atomic_set(&rs->rs_refcount, 1); /* 1 ref for rq_reply_state */
rs->rs_cb_id.cbid_fn = reply_out_callback;
req->rq_replen = msg_len;
req->rq_reply_state = rs;
req->rq_repmsg = rs->rs_msg;
+ if (!(flags & LPRFL_EARLY_REPLY))
+ req->rq_final = 1; /* checked in lustre_packed_reply */
+
+ /* server side, no rq_repbuf */
lustre_init_msg_v2(rs->rs_msg, count, lens, bufs);
lustre_msg_add_version(rs->rs_msg, PTLRPC_MSG_VERSION);
lustre_set_rep_swabbed(req, MSG_PTLRPC_BODY_OFF);
RETURN(0);
}
-int lustre_pack_reply(struct ptlrpc_request *req, int count, int *lens,
- char **bufs)
+int lustre_pack_reply_flags(struct ptlrpc_request *req, int count, int *lens,
+ char **bufs, int flags)
{
int size[] = { sizeof(struct ptlrpc_body) };
case LUSTRE_MSG_MAGIC_V1:
case LUSTRE_MSG_MAGIC_V1_SWABBED:
return lustre_pack_reply_v1(req, count - 1, lens + 1,
- bufs ? bufs + 1 : NULL);
+ bufs ? bufs + 1 : NULL, flags);
case LUSTRE_MSG_MAGIC_V2:
case LUSTRE_MSG_MAGIC_V2_SWABBED:
- return lustre_pack_reply_v2(req, count, lens, bufs);
+ return lustre_pack_reply_v2(req, count, lens, bufs, flags);
default:
LASSERTF(0, "incorrect message magic: %08x\n",
req->rq_reqmsg->lm_magic);
}
}
+int lustre_pack_reply(struct ptlrpc_request *req, int count, int *lens,
+ char **bufs)
+{
+ return lustre_pack_reply_flags(req, count, lens, bufs, 0);
+}
+
void *lustre_msg_buf_v1(void *msg, int n, int min_size)
{
struct lustre_msg_v1 *m = (struct lustre_msg_v1 *)msg;
__swab32s(&m->lm_bufcount);
__swab32s(&m->lm_secflvr);
__swab32s(&m->lm_repsize);
- __swab32s(&m->lm_timeout);
+ __swab32s(&m->lm_cksum);
CLASSERT(offsetof(typeof(*m), lm_padding_1) != 0);
CLASSERT(offsetof(typeof(*m), lm_padding_2) != 0);
CLASSERT(offsetof(typeof(*m), lm_padding_3) != 0);
}
}
+int lustre_msg_is_v1(struct lustre_msg *msg)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V1:
+ case LUSTRE_MSG_MAGIC_V1_SWABBED:
+ return 1;
+ default:
+ return 0;
+ }
+}
+
__u32 lustre_msg_get_magic(struct lustre_msg *msg)
{
switch (msg->lm_magic) {
}
}
+__u32 lustre_msg_get_timeout(struct lustre_msg *msg)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V1:
+ case LUSTRE_MSG_MAGIC_V1_SWABBED:
+ return 0;
+ case LUSTRE_MSG_MAGIC_V2:
+ case LUSTRE_MSG_MAGIC_V2_SWABBED: {
+ struct ptlrpc_body *pb;
+
+ pb = lustre_msg_buf(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+ if (!pb) {
+ CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+ return 0;
+
+ }
+ return pb->pb_timeout;
+ }
+ default:
+ CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+ return 0;
+ }
+}
+
+__u32 lustre_msg_get_service_time(struct lustre_msg *msg)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V1:
+ case LUSTRE_MSG_MAGIC_V1_SWABBED:
+ return 0;
+ case LUSTRE_MSG_MAGIC_V2:
+ case LUSTRE_MSG_MAGIC_V2_SWABBED: {
+ struct ptlrpc_body *pb;
+
+ pb = lustre_msg_buf(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+ if (!pb) {
+ CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+ return 0;
+
+ }
+ return pb->pb_service_time;
+ }
+ default:
+ CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+ return 0;
+ }
+}
+
+__u32 lustre_msg_get_cksum(struct lustre_msg *msg)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V1:
+ case LUSTRE_MSG_MAGIC_V1_SWABBED:
+ return 0;
+ case LUSTRE_MSG_MAGIC_V2:
+ case LUSTRE_MSG_MAGIC_V2_SWABBED:
+ return msg->lm_cksum;
+ default:
+ CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+ return 0;
+ }
+}
+
+__u32 lustre_msg_calc_cksum(struct lustre_msg *msg)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V1:
+ case LUSTRE_MSG_MAGIC_V1_SWABBED:
+ return 0;
+ case LUSTRE_MSG_MAGIC_V2:
+ case LUSTRE_MSG_MAGIC_V2_SWABBED: {
+ struct ptlrpc_body *pb;
+ pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+ LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+ return crc32_le(~(__u32)0, (char *)pb, sizeof(*pb));
+ }
+ default:
+ CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+ return 0;
+ }
+}
+
void lustre_msg_set_handle(struct lustre_msg *msg, struct lustre_handle *handle)
{
switch (msg->lm_magic) {
}
}
+void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V1:
+ return;
+ case LUSTRE_MSG_MAGIC_V2: {
+ struct ptlrpc_body *pb;
+
+ pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+ LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+ pb->pb_timeout = timeout;
+ return;
+ }
+ default:
+ LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+ }
+}
+
+void lustre_msg_set_service_time(struct lustre_msg *msg, __u32 service_time)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V1:
+ return;
+ case LUSTRE_MSG_MAGIC_V2: {
+ struct ptlrpc_body *pb;
+
+ pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+ LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+ pb->pb_service_time = service_time;
+ return;
+ }
+ default:
+ LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+ }
+}
+
+void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V1:
+ return;
+ case LUSTRE_MSG_MAGIC_V2:
+ msg->lm_cksum = cksum;
+ return;
+ default:
+ LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+ }
+}
+
/* byte flipping routines for all wire types declared in
* lustre_idl.h implemented here.
*/
__swab32s (&b->pb_flags);
__swab32s (&b->pb_op_flags);
__swab32s (&b->pb_conn_cnt);
+ __swab32s (&b->pb_timeout);
+ __swab32s (&b->pb_service_time);
CLASSERT(offsetof(typeof(*b), pb_padding_1) != 0);
CLASSERT(offsetof(typeof(*b), pb_padding_2) != 0);
CLASSERT(offsetof(typeof(*b), pb_padding_3) != 0);
- CLASSERT(offsetof(typeof(*b), pb_padding_4) != 0);
- CLASSERT(offsetof(typeof(*b), pb_padding_5) != 0);
}
void lustre_swab_connect(struct obd_connect_data *ocd)
va_start(args, fmt);
libcfs_debug_vmsg2(data->msg_cdls, data->msg_subsys, mask, data->msg_file,
data->msg_fn, data->msg_line, fmt, args,
- " req@%p x"LPD64"/t"LPD64" o%d->%s@%s:%d lens %d/%d ref %d fl "
- REQ_FLAGS_FMT"/%x/%x rc %d/%d\n",
+ " req@%p x"LPD64"/t"LPD64" o%d->%s@%s:%d lens %d/%d "
+ "e %d ref %d fl "REQ_FLAGS_FMT"/%x/%x rc %d/%d\n",
req, req->rq_xid, req->rq_transno,
req->rq_reqmsg ? lustre_msg_get_opc(req->rq_reqmsg) : -1,
req->rq_import ? obd2cli_tgt(req->rq_import->imp_obd) :
(char *)req->rq_export->exp_connection->c_remote_uuid.uuid : "<?>",
(req->rq_import && req->rq_import->imp_client) ?
req->rq_import->imp_client->cli_request_portal : -1,
- req->rq_reqlen, req->rq_replen, atomic_read(&req->rq_refcount),
- DEBUG_REQ_FLAGS(req),
+ req->rq_reqlen, req->rq_replen, req->rq_early_count,
+ atomic_read(&req->rq_refcount), DEBUG_REQ_FLAGS(req),
req->rq_reqmsg ? lustre_msg_get_flags(req->rq_reqmsg) : 0,
req->rq_repmsg ? lustre_msg_get_flags(req->rq_repmsg) : 0,
req->rq_status,
static void ptlrpc_update_next_ping(struct obd_import *imp)
{
- imp->imp_next_ping = cfs_time_shift(
- (imp->imp_state == LUSTRE_IMP_DISCON ?
- RECONNECT_INTERVAL : PING_INTERVAL));
+ int time = (imp->imp_state != LUSTRE_IMP_DISCON) ? PING_INTERVAL :
+ /* FIXME should this be limited to LND_TIMEOUT so we don't
+ build up pings in LND output queues? */
+ max_t(int, CONNECTION_SWITCH_MIN,
+ at_get(&imp->imp_at.iat_net_latency));
+ imp->imp_next_ping = cfs_time_shift(time);
}
void ptlrpc_ping_import_soon(struct obd_import *imp)
obd = pet_exp->exp_obd;
spin_unlock(&pet_lock);
- expire_time = CURRENT_SECONDS - (3 * obd_timeout / 2);
+ expire_time = cfs_time_current_sec() - PING_EVICT_TIMEOUT;
CDEBUG(D_HA, "evicting all exports of obd %s older than %ld\n",
obd->obd_name, expire_time);
" it.\n", obd->obd_name,
obd_uuid2str(&exp->exp_client_uuid),
obd_export_nid2str(exp),
- (long)(CURRENT_SECONDS -
+ (long)(cfs_time_current_sec() -
exp->exp_last_request_time));
CDEBUG(D_HA, "Last request was at %ld\n",
exp->exp_last_request_time);
PTLRPC_REQWAIT_CNTR = 0,
PTLRPC_REQQDEPTH_CNTR,
PTLRPC_REQACTIVE_CNTR,
+ PTLRPC_TIMEOUT,
PTLRPC_REQBUF_AVAIL_CNTR,
PTLRPC_LAST_CNTR
};
EXPORT_SYMBOL(lustre_msg_check_version);
EXPORT_SYMBOL(lustre_pack_request);
EXPORT_SYMBOL(lustre_pack_reply);
+EXPORT_SYMBOL(lustre_pack_reply_flags);
EXPORT_SYMBOL(lustre_shrink_reply);
EXPORT_SYMBOL(lustre_free_reply_state);
EXPORT_SYMBOL(lustre_msg_size);
EXPORT_SYMBOL(lustre_msg_get_transno);
EXPORT_SYMBOL(lustre_msg_get_status);
EXPORT_SYMBOL(lustre_msg_get_conn_cnt);
+EXPORT_SYMBOL(lustre_msg_is_v1);
EXPORT_SYMBOL(lustre_msg_get_magic);
EXPORT_SYMBOL(lustre_msg_set_handle);
EXPORT_SYMBOL(lustre_msg_set_type);
return (-1);
}
+static void ptlrpc_at_timer(unsigned long castmeharder)
+{
+ struct ptlrpc_service *svc = (struct ptlrpc_service *)castmeharder;
+ CDEBUG(D_ADAPTTO, "at timer %s hit at %ld%s\n",
+ svc->srv_name, cfs_time_current_sec(),
+ list_empty(&svc->srv_at_list) ? ", empty" : "");
+ svc->srv_at_check = 1;
+ cfs_waitq_signal(&svc->srv_waitq);
+}
+
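
The timer body above does no real work in timer context; it just raises
srv_at_check and wakes the service threads, which run the deadline scan
where they can sleep. The same flag-and-wake shape in userspace
(illustration only):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t wake = PTHREAD_COND_INITIALIZER;
    static int at_check;

    static void timer_fires(void)           /* cf. ptlrpc_at_timer() */
    {
            pthread_mutex_lock(&lock);
            at_check = 1;                   /* just flag and signal */
            pthread_cond_signal(&wake);
            pthread_mutex_unlock(&lock);
    }

    static void *service_thread(void *arg)
    {
            pthread_mutex_lock(&lock);
            while (!at_check)
                    pthread_cond_wait(&wake, &lock);
            at_check = 0;
            pthread_mutex_unlock(&lock);
            printf("deadline scan runs in thread context\n");
            return arg;
    }

    int main(void)
    {
            pthread_t t;

            pthread_create(&t, NULL, service_thread, NULL);
            timer_fires();
            pthread_join(t, NULL);
            return 0;
    }
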
/* @threadname should be 11 characters or less - 3 will be added on */
struct ptlrpc_service *
ptlrpc_init_svc(int nbufs, int bufsize, int max_req_size, int max_reply_size,
- int req_portal, int rep_portal, int watchdog_timeout,
+ int req_portal, int rep_portal, int watchdog_factor,
svc_handler_t handler, char *name,
cfs_proc_dir_entry_t *proc_entry,
svcreq_printfn_t svcreq_printfn,
service->srv_buf_size = bufsize;
service->srv_rep_portal = rep_portal;
service->srv_req_portal = req_portal;
- service->srv_watchdog_timeout = watchdog_timeout;
+ service->srv_watchdog_factor = watchdog_factor;
service->srv_handler = handler;
service->srv_request_history_print_fn = svcreq_printfn;
service->srv_request_seq = 1; /* valid seq #s start at 1 */
CFS_INIT_LIST_HEAD(&service->srv_free_rs_list);
cfs_waitq_init(&service->srv_free_rs_waitq);
+ spin_lock_init(&service->srv_at_lock);
+ CFS_INIT_LIST_HEAD(&service->srv_req_in_queue);
+ CFS_INIT_LIST_HEAD(&service->srv_at_list);
+ cfs_timer_init(&service->srv_at_timer, ptlrpc_at_timer, service);
+ /* At SOW, service time should be quick; 10s seems generous. If client
+ timeout is less than this, we'll be sending an early reply. */
+ at_init(&service->srv_at_estimate, 10, AT_TIMEBASE_DEFAULT, 0);
+
spin_lock (&ptlrpc_all_services_lock);
list_add (&service->srv_list, &ptlrpc_all_services);
spin_unlock (&ptlrpc_all_services_lock);
return NULL;
}
-static void __ptlrpc_server_free_request(struct ptlrpc_request *req)
+static void ptlrpc_server_decref(struct ptlrpc_request *req)
{
struct ptlrpc_request_buffer_desc *rqbd = req->rq_rqbd;
- list_del(&req->rq_list);
-
- if (req->rq_reply_state != NULL) {
- ptlrpc_rs_decref(req->rq_reply_state);
- req->rq_reply_state = NULL;
- }
+ if (!atomic_dec_and_test(&req->rq_refcount))
+ return;
if (req != &rqbd->rqbd_req) {
/* NB request buffers use an embedded
* req if the incoming req unlinked the
* MD; this isn't one of them! */
OBD_FREE(req, sizeof(*req));
+ } else {
+ struct ptlrpc_service *svc = rqbd->rqbd_service;
+ /* schedule request buffer for re-use.
+ * NB I can only do this after I've disposed of their
+ * reqs; particularly the embedded req */
+ spin_lock(&svc->srv_lock);
+ list_add_tail(&rqbd->rqbd_list, &svc->srv_idle_rqbds);
+ spin_unlock(&svc->srv_lock);
}
}
-static void
-ptlrpc_server_free_request(struct ptlrpc_request *req)
+static void __ptlrpc_server_free_request(struct ptlrpc_request *req)
+{
+ list_del(&req->rq_list);
+ ptlrpc_req_drop_rs(req);
+ ptlrpc_server_decref(req);
+}
+
+static void ptlrpc_server_free_request(struct ptlrpc_request *req)
{
struct ptlrpc_request_buffer_desc *rqbd = req->rq_rqbd;
struct ptlrpc_service *svc = rqbd->rqbd_service;
struct list_head *tmp;
struct list_head *nxt;
+ DEBUG_REQ(D_INFO, req, "free req");
+ spin_lock(&svc->srv_at_lock);
+ list_del_init(&req->rq_timed_list);
+ spin_unlock(&svc->srv_at_lock);
+
spin_lock(&svc->srv_lock);
svc->srv_n_active_reqs--;
}
spin_lock(&svc->srv_lock);
-
- /* schedule request buffer for re-use.
- * NB I can only do this after I've disposed of their
- * reqs; particularly the embedded req */
- list_add_tail(&rqbd->rqbd_list, &svc->srv_idle_rqbds);
}
} else if (req->rq_reply_state && req->rq_reply_state->rs_prealloc) {
/* If we are low on memory, we are not interested in
}
spin_unlock(&svc->srv_lock);
-
}
/* This function makes sure dead exports are evicted in a timely manner.
at the exact right moment. Eventually, all silent exports
will make it to the top of the list. */
exp->exp_last_request_time = max(exp->exp_last_request_time,
- (time_t)CURRENT_SECONDS + extra_delay);
+ cfs_time_current_sec() + extra_delay);
CDEBUG(D_INFO, "updating export %s at %ld\n",
exp->exp_client_uuid.uuid,
/* Note - racing to start/reset the obd_eviction timer is safe */
if (exp->exp_obd->obd_eviction_timer == 0) {
/* Check if the oldest entry is expired. */
- if (CURRENT_SECONDS > (oldest_time +
- (3 * obd_timeout / 2) + extra_delay)) {
+ if (cfs_time_current_sec() > (oldest_time + PING_EVICT_TIMEOUT +
+ extra_delay)) {
/* We need a second timer, in case the net was down and
* it just came back. Since the pinger may skip every
* other PING_INTERVAL (see note in ptlrpc_pinger_main),
* we better wait for 3. */
- exp->exp_obd->obd_eviction_timer = CURRENT_SECONDS +
- 3 * PING_INTERVAL;
+ exp->exp_obd->obd_eviction_timer =
+ cfs_time_current_sec() + 3 * PING_INTERVAL;
CDEBUG(D_HA, "%s: Think about evicting %s from %ld\n",
exp->exp_obd->obd_name, obd_export_nid2str(exp),
oldest_time);
}
} else {
- if (CURRENT_SECONDS > (exp->exp_obd->obd_eviction_timer +
- extra_delay)) {
+ if (cfs_time_current_sec() >
+ (exp->exp_obd->obd_eviction_timer + extra_delay)) {
/* The evictor won't evict anyone who we've heard from
* recently, so we don't have to check before we start
* it. */
EXIT;
}
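
(Aside: a standalone sketch of the two-stage eviction decision above; PING_EVICT_TIMEOUT, PING_INTERVAL, and all timestamps are assumed stand-in values, not the kernel constants.)

/* Sketch of the two-stage eviction decision: expire on
 * PING_EVICT_TIMEOUT, then arm a second timer so a network outage
 * that just ended gets time to re-ping before eviction starts. */
#include <stdio.h>

#define PING_INTERVAL       25                  /* assumed */
#define PING_EVICT_TIMEOUT (PING_INTERVAL * 6)  /* assumed relation */

int main(void)
{
    long now = 1000, oldest = 800, extra_delay = 0, evict_timer = 0;

    if (evict_timer == 0) {
        if (now > oldest + PING_EVICT_TIMEOUT + extra_delay)
            /* oldest export expired: start the second timer */
            evict_timer = now + 3 * PING_INTERVAL;
    } else if (now > evict_timer + extra_delay) {
        printf("start evictor\n");
    }
    printf("eviction timer armed for %ld\n", evict_timer);
    return 0;
}
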
+static int ptlrpc_check_req(struct ptlrpc_request *req)
+{
+ if (lustre_msg_get_conn_cnt(req->rq_reqmsg) <
+ req->rq_export->exp_conn_cnt) {
+ DEBUG_REQ(D_ERROR, req,
+ "DROPPING req from old connection %d < %d",
+ lustre_msg_get_conn_cnt(req->rq_reqmsg),
+ req->rq_export->exp_conn_cnt);
+ return -EEXIST;
+ }
+ if (req->rq_export->exp_obd && req->rq_export->exp_obd->obd_fail) {
+ /* Failing over, don't handle any more reqs, send
+ error response instead. */
+ CDEBUG(D_RPCTRACE, "Dropping req %p for failed obd %s\n",
+ req, req->rq_export->exp_obd->obd_name);
+ req->rq_status = -ENODEV;
+ ptlrpc_error(req);
+ return -ENODEV;
+ }
+ return 0;
+}
+
+/* If the closest expiration is within AT_EARLY_MIN, send early replies to
+ everybody expiring within AT_EARLY_MAX, asking for AT_EXTRA time */
+#define AT_EARLY_MIN 2 /* Min time needed to send an early reply */
+#define AT_EARLY_MAX 5 /* Don't send early replies if deadline is beyond */
+#define AT_EXTRA 10 /* Early replies add this time to client timeout */
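
(Aside: a worked example of the window these macros define, as a standalone sketch rather than patch code; the deadlines and the value of "now" are hypothetical.)

/* Sketch of the early-reply selection window defined above.
 * Deadlines are absolute seconds; values are hypothetical. */
#include <stdio.h>

#define AT_EARLY_MIN 2
#define AT_EARLY_MAX 5
#define AT_EXTRA     10

int main(void)
{
    long now = 1000;
    long deadlines[] = { 1001, 1004, 1008 }; /* hypothetical queued reqs */
    int i;

    /* Once the timer fires (nearest deadline within AT_EARLY_MIN),
     * everything expiring within AT_EARLY_MAX gets an early reply. */
    for (i = 0; i < 3; i++) {
        long left = deadlines[i] - now;
        if (left <= AT_EARLY_MAX)
            printf("req %d: %lds left -> early reply, ask +%ds\n",
                   i, left, AT_EXTRA);
        else
            printf("req %d: %lds left -> leave on timed list\n", i, left);
    }
    return 0;
}
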
+
+static void ptlrpc_at_set_timer(struct ptlrpc_service *svc)
+{
+ struct ptlrpc_request *rq;
+ time_t next;
+
+ spin_lock(&svc->srv_at_lock);
+ if (list_empty(&svc->srv_at_list)) {
+ cfs_timer_disarm(&svc->srv_at_timer);
+ spin_unlock(&svc->srv_at_lock);
+ return;
+ }
+
+ /* Set timer for closest deadline */
+ rq = list_entry(svc->srv_at_list.next, struct ptlrpc_request,
+ rq_timed_list);
+ next = rq->rq_deadline - cfs_time_current_sec() - AT_EARLY_MIN;
+ if (next <= 0)
+ ptlrpc_at_timer((unsigned long)svc);
+ else
+ cfs_timer_arm(&svc->srv_at_timer, cfs_time_shift(next));
+ spin_unlock(&svc->srv_at_lock);
+ CDEBUG(D_ADAPTTO, "armed %s at %+lds\n", svc->srv_name, next);
+}
+
+/* Add rpc to early reply check list */
+static int ptlrpc_at_add_timed(struct ptlrpc_request *req)
+{
+ struct ptlrpc_service *svc = req->rq_rqbd->rqbd_service;
+ struct ptlrpc_request *rq;
+ int found = 0;
+
+ if (AT_OFF)
+ return(0);
+
+ if ((lustre_msg_get_flags(req->rq_reqmsg) & MSG_AT_SUPPORT) == 0)
+ return(-ENOSYS);
+
+ DEBUG_REQ(D_ADAPTTO, req, "add timed %lds",
+ req->rq_deadline - cfs_time_current_sec());
+
+ spin_lock(&svc->srv_at_lock);
+
+ LASSERT(list_empty(&req->rq_timed_list));
+ /* Add to sorted list. Presumably latest rpcs will have the latest
+ deadlines, so search backward. */
+ list_for_each_entry_reverse(rq, &svc->srv_at_list, rq_timed_list) {
+ if (req->rq_deadline > rq->rq_deadline) {
+ list_add(&req->rq_timed_list, &rq->rq_timed_list);
+ found++;
+ break;
+ }
+ }
+ if (!found)
+ /* Add to front if shortest deadline or list empty */
+ list_add(&req->rq_timed_list, &svc->srv_at_list);
+
+ /* Check if we're the head of the list */
+ found = (svc->srv_at_list.next == &req->rq_timed_list);
+
+ spin_unlock(&svc->srv_at_lock);
+
+ if (found)
+ ptlrpc_at_set_timer(svc);
+
+ return 0;
+}
+
+static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req,
+ int extra_time)
+{
+ struct ptlrpc_service *svc = req->rq_rqbd->rqbd_service;
+ long deadline = req->rq_deadline - cfs_time_current_sec();
+ int rc;
+ ENTRY;
+
+ /* deadline is when the client expects us to reply, margin is the
+ difference between clients' and servers' expectations */
+ DEBUG_REQ(D_ADAPTTO, req,
+ "%ssending early reply (deadline %+lds, margin %+lds) for "
+ "%d+%d", AT_OFF ? "AT off - not " : "",
+ deadline, deadline - at_get(&svc->srv_at_estimate),
+ at_get(&svc->srv_at_estimate), extra_time);
+
+ if (AT_OFF)
+ RETURN(0);
+
+ if ((lustre_msg_get_flags(req->rq_reqmsg) & MSG_AT_SUPPORT) == 0) {
+ CERROR("Wanted to ask client for more time, but no AT "
+ "support\n");
+ RETURN(-ENOSYS);
+ }
+
+ rc = lustre_pack_reply_flags(req, 1, NULL, NULL, LPRFL_EARLY_REPLY);
+ if (rc)
+ /* EALREADY means final reply was already packed */
+ RETURN(rc);
+
+ if (extra_time) {
+ /* Fake our processing time into the future to ask the
+ clients for some extra amount of time */
+ extra_time += cfs_time_current_sec() -
+ req->rq_arrival_time.tv_sec;
+ at_add(&svc->srv_at_estimate, extra_time);
+ }
+
+ req->rq_early_count++; /* number sent, server side */
+ rc = ptlrpc_send_reply(req, PTLRPC_REPLY_EARLY);
+
+ if (!rc) {
+ /* Adjust our own deadline to what we told the client */
+ req->rq_deadline = req->rq_arrival_time.tv_sec +
+ at_get(&svc->srv_at_estimate);
+ } else {
+ DEBUG_REQ(D_ERROR, req, "Early reply send failed %d", rc);
+ }
+
+ /* Reset reply */
+ req->rq_rep_swab_mask = 0;
+ /* Free the (early) reply state from lustre_pack_reply.
+ (ptlrpc_send_reply takes its own rs ref, so this is safe here) */
+ ptlrpc_req_drop_rs(req);
+
+ RETURN(rc);
+}
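
(Aside: the deadline extension above can be followed with a toy model of the service estimate; at_add()/at_get() are reduced here to a simple running maximum, which only approximates the real adaptive-timeout state, and the timestamps are assumed.)

/* Sketch of the deadline extension performed after an early reply:
 * fake the processing time into the future, then recompute the
 * server-side deadline from arrival time plus the inflated estimate. */
#include <stdio.h>

static int estimate = 10;           /* srv_at_estimate stand-in */

static void at_add(int v) { if (v > estimate) estimate = v; }
static int  at_get(void)  { return estimate; }

int main(void)
{
    long arrival = 1000, now = 1008; /* hypothetical */
    int extra = 10;                  /* AT_EXTRA */

    /* elapsed service time so far, plus the extra time requested */
    at_add((int)(now - arrival) + extra);

    /* new deadline matches what the early reply told the client */
    printf("new deadline = %ld (arrival %ld + estimate %d)\n",
           arrival + at_get(), arrival, at_get());
    return 0;
}
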
+
+/* Check if we need to send any early replies, and send them */
+static int ptlrpc_at_check_timed(struct ptlrpc_service *svc)
+{
+ struct ptlrpc_request *rq, *n;
+ struct list_head work_list;
+ time_t now = cfs_time_current_sec();
+ int first, counter = 0;
+ ENTRY;
+
+ if (AT_OFF)
+ RETURN(0);
+
+ spin_lock(&svc->srv_at_lock);
+ if (svc->srv_at_check == 0) {
+ spin_unlock(&svc->srv_at_lock);
+ RETURN(0);
+ }
+ svc->srv_at_check = 0;
+
+ if (list_empty(&svc->srv_at_list)) {
+ spin_unlock(&svc->srv_at_lock);
+ RETURN(0);
+ }
+
+ /* The timer went off, but maybe the nearest rpc already completed. */
+ rq = list_entry(svc->srv_at_list.next, struct ptlrpc_request,
+ rq_timed_list);
+ first = (int)(rq->rq_deadline - now);
+ if (first > AT_EARLY_MIN) {
+ /* We've still got plenty of time. Reset the timer. */
+ spin_unlock(&svc->srv_at_lock);
+ ptlrpc_at_set_timer(svc);
+ RETURN(0);
+ }
+
+ /* We're close to a timeout, and we don't know how much longer the
+ server will take. Send early replies to everyone expiring soon. */
+ CFS_INIT_LIST_HEAD(&work_list);
+ list_for_each_entry_safe(rq, n, &svc->srv_at_list, rq_timed_list) {
+ if (rq->rq_deadline <= now + AT_EARLY_MAX) {
+ list_move(&rq->rq_timed_list, &work_list);
+ counter++;
+ } else {
+ break;
+ }
+ }
+
+ spin_unlock(&svc->srv_at_lock);
+
+ CDEBUG(D_ADAPTTO, "timeout in %+ds, asking for %d secs on %d early "
+ "replies\n", first, AT_EXTRA, counter);
+ if (first < 0)
+ /* We're already past request deadlines before we even get a
+ chance to send early replies */
+ LCONSOLE_WARN("%s: This server is not able to keep up with "
+ "request traffic (cpu-bound).\n", svc->srv_name);
+
+ /* ptlrpc_server_free_request may delete an entry out of the work
+ list */
+ counter = 0;
+ spin_lock(&svc->srv_at_lock);
+ while (!list_empty(&work_list)) {
+ rq = list_entry(work_list.next, struct ptlrpc_request,
+ rq_timed_list);
+ list_del_init(&rq->rq_timed_list);
+ /* if the entry is still in the worklist, it hasn't been
+ deleted, and is safe to take a ref to keep the req around */
+ atomic_inc(&rq->rq_refcount);
+ spin_unlock(&svc->srv_at_lock);
+ if (!rq->rq_final &&
+ (ptlrpc_at_send_early_reply(rq, AT_EXTRA) == 0)) {
+ counter++;
+ ptlrpc_at_add_timed(rq);
+ } else if (rq->rq_final) {
+ DEBUG_REQ(D_ADAPTTO, rq, "already packed final reply, "
+ "not sending early");
+ }
+ ptlrpc_server_decref(rq);
+ spin_lock(&svc->srv_at_lock);
+ }
+ spin_unlock(&svc->srv_at_lock);
+
+ if (!counter)
+ /* Nothing added to timed list, so we have to kick the timer
+ ourselves. */
+ ptlrpc_at_set_timer(svc);
+
+ RETURN(0);
+}
+
+/* Handle freshly incoming reqs, check timeout, send early reply if needed,
+ pass on to regular request queue */
+static int
+ptlrpc_server_handle_req_in(struct ptlrpc_service *svc)
+{
+ struct ptlrpc_request *req;
+ __u32 deadline;
+ int rc;
+ ENTRY;
+
+ LASSERT(svc);
+
+ spin_lock(&svc->srv_lock);
+ if (list_empty(&svc->srv_req_in_queue)) {
+ spin_unlock(&svc->srv_lock);
+ RETURN(0);
+ }
+
+ req = list_entry(svc->srv_req_in_queue.next,
+ struct ptlrpc_request, rq_list);
+ list_del_init (&req->rq_list);
+ /* Consider this still a "queued" request as far as stats are
+ concerned */
+ spin_unlock(&svc->srv_lock);
+
+#if SWAB_PARANOIA
+ /* Clear request swab mask; this is a new request */
+ req->rq_req_swab_mask = 0;
+#endif
+ rc = lustre_unpack_msg(req->rq_reqmsg, req->rq_reqlen);
+ if (rc != 0) {
+ CERROR ("error unpacking request: ptl %d from %s"
+ " xid "LPU64"\n", svc->srv_req_portal,
+ libcfs_id2str(req->rq_peer), req->rq_xid);
+ goto err_req;
+ }
+
+ rc = lustre_unpack_req_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF);
+ if (rc) {
+ CERROR ("error unpacking ptlrpc body: ptl %d from %s"
+ " xid "LPU64"\n", svc->srv_req_portal,
+ libcfs_id2str(req->rq_peer), req->rq_xid);
+ goto err_req;
+ }
+
+ rc = -EINVAL;
+ if (lustre_msg_get_type(req->rq_reqmsg) != PTL_RPC_MSG_REQUEST) {
+ CERROR("wrong packet type received (type=%u) from %s\n",
+ lustre_msg_get_type(req->rq_reqmsg),
+ libcfs_id2str(req->rq_peer));
+ goto err_req;
+ }
+
+ CDEBUG(D_NET, "got req "LPD64"\n", req->rq_xid);
+
+ req->rq_export = class_conn2export(
+ lustre_msg_get_handle(req->rq_reqmsg));
+ if (req->rq_export) {
+ rc = ptlrpc_check_req(req);
+ class_export_put(req->rq_export);
+ if (rc)
+ goto err_req;
+ }
+
+ /* req_in handling should/must be fast */
+ if (cfs_time_current_sec() - req->rq_arrival_time.tv_sec > 5)
+ DEBUG_REQ(D_WARNING, req, "Slow req_in handling %lus",
+ cfs_time_current_sec() - req->rq_arrival_time.tv_sec);
+
+ /* Set rpc server deadline and add it to the timed list */
+ deadline = (lustre_msg_get_flags(req->rq_reqmsg) & MSG_AT_SUPPORT) ?
+ /* The max time the client expects us to take */
+ lustre_msg_get_timeout(req->rq_reqmsg) : obd_timeout;
+ LASSERT(deadline > 0);
+ req->rq_deadline = req->rq_arrival_time.tv_sec + deadline;
+
+ ptlrpc_at_add_timed(req);
+
+ /* Move it over to the request processing queue */
+ spin_lock(&svc->srv_lock);
+ list_add_tail(&req->rq_list, &svc->srv_request_queue);
+ cfs_waitq_signal(&svc->srv_waitq);
+ spin_unlock(&svc->srv_lock);
+ RETURN(1);
+
+err_req:
+ spin_lock(&svc->srv_lock);
+ svc->srv_n_queued_reqs--;
+ svc->srv_n_active_reqs++;
+ spin_unlock(&svc->srv_lock);
+ ptlrpc_server_free_request(req);
+
+ RETURN(1);
+}
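
(Aside: a small sketch of the deadline assignment in req_in handling; MSG_AT_SUPPORT's value matches the wirecheck assertions below, but OBD_TIMEOUT and the client-supplied timeout are assumed sample values.)

/* Sketch of the rpc deadline assignment above: AT-aware clients send
 * the timeout they expect us to meet; older clients fall back to the
 * static obd_timeout. */
#include <stdio.h>

#define MSG_AT_SUPPORT 8
#define OBD_TIMEOUT    100          /* stand-in for obd_timeout */

static long req_deadline(long arrival, unsigned flags, int client_timeout)
{
    int t = (flags & MSG_AT_SUPPORT) ? client_timeout : OBD_TIMEOUT;
    return arrival + t;
}

int main(void)
{
    printf("AT client:  deadline %ld\n", req_deadline(1000, MSG_AT_SUPPORT, 30));
    printf("old client: deadline %ld\n", req_deadline(1000, 0, 30));
    return 0;
}
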
+
static int
ptlrpc_server_handle_request(struct ptlrpc_service *svc,
struct ptlrpc_thread *thread)
struct timeval work_start;
struct timeval work_end;
long timediff;
- int rc, reply;
+ int rc;
ENTRY;
LASSERT(svc);
spin_lock(&svc->srv_lock);
if (list_empty (&svc->srv_request_queue) ||
- (svc->srv_n_difficult_replies != 0 &&
+ (
+#ifndef __KERNEL__
+ /* !@%$# liblustre only has 1 thread */
+ svc->srv_n_difficult_replies != 0 &&
+#endif
svc->srv_n_active_reqs >= (svc->srv_threads_running - 1))) {
- /* If all the other threads are handling requests, I must
- * remain free to handle any 'difficult' reply that might
- * block them */
+ /* Don't handle regular requests in the last thread, in order
+ * to handle difficult replies (which might block other threads)
+ * as well as handle any incoming reqs, early replies, etc.
+ * That means we always need at least 2 service threads. */
spin_unlock(&svc->srv_lock);
RETURN(0);
}
svc->srv_n_queued_reqs);
lprocfs_counter_add(svc->srv_stats, PTLRPC_REQACTIVE_CNTR,
svc->srv_n_active_reqs);
+ lprocfs_counter_add(svc->srv_stats, PTLRPC_TIMEOUT,
+ at_get(&svc->srv_at_estimate));
}
-
-#if SWAB_PARANOIA
- /* Clear request swab mask; this is a new request */
- request->rq_req_swab_mask = 0;
-#endif
- rc = lustre_unpack_msg(request->rq_reqmsg, request->rq_reqlen);
- if (rc != 0) {
- CERROR ("error unpacking request: ptl %d from %s"
- " xid "LPU64"\n", svc->srv_req_portal,
- libcfs_id2str(request->rq_peer), request->rq_xid);
- goto out_req;
- }
-
- rc = lustre_unpack_req_ptlrpc_body(request, MSG_PTLRPC_BODY_OFF);
- if (rc) {
- CERROR ("error unpacking ptlrpc body: ptl %d from %s"
- " xid "LPU64"\n", svc->srv_req_portal,
- libcfs_id2str(request->rq_peer), request->rq_xid);
- goto out_req;
- }
-
- rc = -EINVAL;
- if (lustre_msg_get_type(request->rq_reqmsg) != PTL_RPC_MSG_REQUEST) {
- CERROR("wrong packet type received (type=%u) from %s\n",
- lustre_msg_get_type(request->rq_reqmsg),
- libcfs_id2str(request->rq_peer));
- goto out_req;
- }
-
+
CDEBUG(D_NET, "got req "LPD64"\n", request->rq_xid);
-
+
request->rq_svc_thread = thread;
request->rq_export = class_conn2export(
lustre_msg_get_handle(request->rq_reqmsg));
if (request->rq_export) {
- if (lustre_msg_get_conn_cnt(request->rq_reqmsg) <
- request->rq_export->exp_conn_cnt) {
- DEBUG_REQ(D_ERROR, request,
- "DROPPING req from old connection %d < %d",
- lustre_msg_get_conn_cnt(request->rq_reqmsg),
- request->rq_export->exp_conn_cnt);
- goto put_conn;
- }
- if (request->rq_export->exp_obd &&
- request->rq_export->exp_obd->obd_fail) {
- /* Failing over, don't handle any more reqs, send
- error response instead. */
- CDEBUG(D_RPCTRACE,"Dropping req %p for failed obd %s\n",
- request, request->rq_export->exp_obd->obd_name);
- request->rq_status = -ENODEV;
- ptlrpc_error(request);
+ if (ptlrpc_check_req(request))
goto put_conn;
- }
-
- ptlrpc_update_export_timer(request->rq_export, timediff/500000);
+ ptlrpc_update_export_timer(request->rq_export, timediff >> 19);
export = class_export_rpc_get(request->rq_export);
}
- /* Discard requests queued for longer than my timeout. If the
- * client's timeout is similar to mine, she'll be timing out this
- * REQ anyway (bug 1502) */
- if (timediff / 1000000 > (long)obd_timeout) {
- CERROR("Dropping timed-out opc %d request from %s"
- ": %ld seconds old\n",
- lustre_msg_get_opc(request->rq_reqmsg),
- libcfs_id2str(request->rq_peer),
- timediff / 1000000);
- goto put_rpc_export;
- }
-
request->rq_phase = RQ_PHASE_INTERPRET;
CDEBUG(D_RPCTRACE, "Handling RPC pname:cluuid+ref:pid:xid:nid:opc "
libcfs_id2str(request->rq_peer),
lustre_msg_get_opc(request->rq_reqmsg));
- rc = svc->srv_handler(request);
+ OBD_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_PAUSE_REQ, obd_fail_val);
+ rc = svc->srv_handler(request);
+
request->rq_phase = RQ_PHASE_COMPLETE;
CDEBUG(D_RPCTRACE, "Handled RPC pname:cluuid+ref:pid:xid:nid:opc "
libcfs_id2str(request->rq_peer),
lustre_msg_get_opc(request->rq_reqmsg));
-put_rpc_export:
if (export != NULL)
class_export_rpc_put(export);
if (request->rq_export != NULL)
class_export_put(request->rq_export);
- reply = request->rq_reply_state && request->rq_repmsg; /* bug 11169 */
+ if (cfs_time_current_sec() > request->rq_deadline) {
+ DEBUG_REQ(D_WARNING, request, "Request took longer than"
+ " estimated (%+lds); client may timeout.",
+ cfs_time_current_sec() - request->rq_deadline);
+ }
do_gettimeofday(&work_end);
timediff = cfs_timeval_sub(&work_end, &work_start, NULL);
- if (timediff / 1000000 > (long)obd_timeout)
- CERROR("request "LPU64" opc %u from %s processed in %lds "
- "trans "LPU64" rc %d/%d\n",
- request->rq_xid, lustre_msg_get_opc(request->rq_reqmsg),
- libcfs_id2str(request->rq_peer),
- cfs_timeval_sub(&work_end, &request->rq_arrival_time,
- NULL) / 1000000,
- reply ? lustre_msg_get_transno(request->rq_repmsg) :
- request->rq_transno,
- request->rq_status,
- reply ? lustre_msg_get_status(request->rq_repmsg): -999);
- else
- CDEBUG(D_RPCTRACE,"request "LPU64" opc %u from %s processed in "
- "%ldus (%ldus total) trans "LPU64" rc %d/%d\n",
- request->rq_xid, lustre_msg_get_opc(request->rq_reqmsg),
- libcfs_id2str(request->rq_peer), timediff,
- cfs_timeval_sub(&work_end, &request->rq_arrival_time,
- NULL),
- request->rq_transno, request->rq_status,
- reply ? lustre_msg_get_status(request->rq_repmsg): -999);
-
+ CDEBUG(D_RPCTRACE, "request "LPU64" opc %u from %s processed in "
+ "%ldus (%ldus total) trans "LPU64" rc %d/%d\n",
+ request->rq_xid, lustre_msg_get_opc(request->rq_reqmsg),
+ libcfs_id2str(request->rq_peer), timediff,
+ cfs_timeval_sub(&work_end, &request->rq_arrival_time, NULL),
+ request->rq_repmsg ? lustre_msg_get_transno(request->rq_repmsg) :
+ request->rq_transno, request->rq_status,
+ request->rq_repmsg ? lustre_msg_get_status(request->rq_repmsg):
+ -999);
if (svc->srv_stats != NULL) {
int opc = opcode_offset(lustre_msg_get_opc(request->rq_reqmsg));
if (opc > 0) {
timediff);
}
}
+ if (request->rq_early_count) {
+ DEBUG_REQ(D_ADAPTTO, request,
+ "sent %d early replies before finishing in %lds",
+ request->rq_early_count,
+ work_end.tv_sec - request->rq_arrival_time.tv_sec);
+ }
-out_req:
ptlrpc_server_free_request(request);
RETURN(1);
svc->srv_threads_running++;
do {
- rc = ptlrpc_server_handle_reply(svc);
+ rc = ptlrpc_server_handle_req_in(svc);
+ rc |= ptlrpc_server_handle_reply(svc);
+ rc |= ptlrpc_at_check_timed(svc);
rc |= ptlrpc_server_handle_request(svc, NULL);
rc |= (ptlrpc_server_post_idle_rqbds(svc) > 0);
did_something |= rc;
#ifdef WITH_GROUP_INFO
struct group_info *ginfo = NULL;
#endif
- int rc = 0;
+ int counter, rc = 0;
ENTRY;
ptlrpc_daemonize(data->name);
*/
cfs_waitq_signal(&thread->t_ctl_waitq);
- watchdog = lc_watchdog_add(svc->srv_watchdog_timeout, NULL, NULL);
+ watchdog = lc_watchdog_add((AT_OFF ? obd_timeout :
+ at_get(&svc->srv_at_estimate)) *
+ svc->srv_watchdog_factor, NULL, NULL);
spin_lock(&svc->srv_lock);
svc->srv_threads_running++;
svc->srv_n_difficult_replies == 0) ||
(!list_empty(&svc->srv_idle_rqbds) &&
svc->srv_rqbd_timeout == 0) ||
- !list_empty (&svc->srv_reply_queue) ||
- (!list_empty (&svc->srv_request_queue) &&
- (svc->srv_n_difficult_replies == 0 ||
- svc->srv_n_active_reqs <
- (svc->srv_threads_running - 1))),
+ !list_empty(&svc->srv_req_in_queue) ||
+ !list_empty(&svc->srv_reply_queue) ||
+ (!list_empty(&svc->srv_request_queue) &&
+ (svc->srv_n_active_reqs <
+ (svc->srv_threads_running - 1))) ||
+ svc->srv_at_check,
&lwi);
- lc_watchdog_touch(watchdog);
+ lc_watchdog_touch_ms(watchdog, (AT_OFF ? obd_timeout :
+ at_get(&svc->srv_at_estimate)) *
+ svc->srv_watchdog_factor);
ptlrpc_check_rqbd_pool(svc);
ptlrpc_start_thread(dev, svc);
}
- if (!list_empty (&svc->srv_reply_queue))
+ if (!list_empty(&svc->srv_reply_queue))
ptlrpc_server_handle_reply (svc);
- /* only handle requests if there are no difficult replies
- * outstanding, or I'm not the last thread handling
- * requests */
+ counter = 0;
+ while (!list_empty(&svc->srv_req_in_queue)) {
+ /* Process all incoming reqs before handling any */
+ ptlrpc_server_handle_req_in(svc);
+ /* but limit ourselves in case of flood */
+ if (counter++ > 1000)
+ break;
+ }
+
+ if (svc->srv_at_check)
+ ptlrpc_at_check_timed(svc);
+
+ /* don't handle requests in the last thread */
if (!list_empty (&svc->srv_request_queue) &&
- (svc->srv_n_difficult_replies == 0 ||
- svc->srv_n_active_reqs < (svc->srv_threads_running - 1)))
+ (svc->srv_n_active_reqs < (svc->srv_threads_running - 1)))
ptlrpc_server_handle_request(svc, thread);
if (!list_empty(&svc->srv_idle_rqbds) &&
int i, rc = 0;
ENTRY;
- LASSERT(svc->srv_threads_min > 0);
+ /* We require 2 threads min - see note in
+ ptlrpc_server_handle_request */
+ LASSERT(svc->srv_threads_min >= 2);
for (i = 0; i < svc->srv_threads_min; i++) {
rc = ptlrpc_start_thread(dev, svc);
if (rc) {
struct list_head *tmp;
struct ptlrpc_reply_state *rs, *t;
+ cfs_timer_disarm(&service->srv_at_timer);
+
ptlrpc_stop_all_threads(service);
LASSERT(list_empty(&service->srv_threads));
/* Network access will complete in finite time but the HUGE
* timeout lets us CWARN for visibility of sluggish NALs */
- lwi = LWI_TIMEOUT(cfs_time_seconds(300), NULL, NULL);
+ lwi = LWI_TIMEOUT(cfs_time_seconds(FOREVER), NULL, NULL);
rc = l_wait_event(service->srv_waitq,
service->srv_nrqbd_receiving == 0,
&lwi);
/* purge the request queue. NB No new replies (rqbds all unlinked)
* and no service threads, so I'm the only thread noodling the
* request queue now */
+ while (!list_empty(&service->srv_req_in_queue)) {
+ struct ptlrpc_request *req =
+ list_entry(service->srv_req_in_queue.next,
+ struct ptlrpc_request,
+ rq_list);
+
+ list_del(&req->rq_list);
+ service->srv_n_queued_reqs--;
+ service->srv_n_active_reqs++;
+ ptlrpc_server_free_request(req);
+ }
while (!list_empty(&service->srv_request_queue)) {
struct ptlrpc_request *req =
list_entry(service->srv_request_queue.next,
list_del(&req->rq_list);
service->srv_n_queued_reqs--;
service->srv_n_active_reqs++;
-
ptlrpc_server_free_request(req);
}
LASSERT(service->srv_n_queued_reqs == 0);
OBD_FREE(rs, service->srv_max_reply_size);
}
+ /* In case somebody rearmed this in the meantime */
+ cfs_timer_disarm(&service->srv_at_timer);
+
OBD_FREE(service, sizeof(*service));
return 0;
}
{
struct ptlrpc_request *request;
struct timeval right_now;
- long timediff, cutoff;
- int rc = 0;
+ long timediff;
if (svc == NULL)
return 0;
- spin_lock(&svc->srv_lock);
-
- if (list_empty(&svc->srv_request_queue))
- goto out;
+ do_gettimeofday(&right_now);
+ spin_lock(&svc->srv_lock);
+ if (list_empty(&svc->srv_request_queue)) {
+ spin_unlock(&svc->srv_lock);
+ return 0;
+ }
+
+ /* How long has the next entry been waiting? */
request = list_entry(svc->srv_request_queue.next,
struct ptlrpc_request, rq_list);
-
- do_gettimeofday(&right_now);
timediff = cfs_timeval_sub(&right_now, &request->rq_arrival_time, NULL);
+ spin_unlock(&svc->srv_lock);
- cutoff = obd_health_check_timeout;
-
- if (timediff / 1000000 > cutoff) {
- rc = -1;
- goto out;
+ if ((timediff / ONE_MILLION) > (AT_OFF ? obd_timeout * 3/2 :
+ adaptive_timeout_max)) {
+ CERROR("%s: unhealthy - request has been waiting %lds\n",
+ svc->srv_name, timediff / ONE_MILLION);
+ return (-1);
}
- out:
- spin_unlock(&svc->srv_lock);
- return rc;
+ return 0;
}
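
(Aside: the health threshold above, restated as a standalone sketch; ONE_MILLION matches its usual definition, while OBD_TIMEOUT and AT_MAX stand in for obd_timeout and adaptive_timeout_max.)

/* Sketch of the service health check: a service is unhealthy when
 * its oldest queued request has waited past the threshold, which is
 * 1.5 * obd_timeout with AT off, or the AT ceiling with AT on. */
#include <stdio.h>

#define ONE_MILLION 1000000L
#define OBD_TIMEOUT 100             /* stand-in for obd_timeout */
#define AT_MAX      600             /* stand-in for adaptive_timeout_max */

static int svc_health(long waited_us, int at_off)
{
    long limit = at_off ? OBD_TIMEOUT * 3 / 2 : AT_MAX;
    return (waited_us / ONE_MILLION) > limit ? -1 : 0;
}

int main(void)
{
    printf("AT on,  500s wait: %d\n", svc_health(500 * ONE_MILLION, 0));
    printf("AT off, 200s wait: %d\n", svc_health(200 * ONE_MILLION, 1));
    return 0;
}
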
{
/* Wire protocol assertions generated by 'wirecheck'
* (make -C lustre/utils newwirecheck)
- * running on Linux pancake 2.6.18-skas3-v9-pre9 #2 Tue Oct 17 13:08:24 PDT 2006 i686 i686 i3
+ * running on Linux pancake 2.6.18-skas3-v9-pre9 #1 Tue Feb 20 10:37:58 PST 2007 i686 i686 i3
* with gcc version 3.4.4 */
(long long)MSG_RESENT);
LASSERTF(MSG_REPLAY == 4, " found %lld\n",
(long long)MSG_REPLAY);
+ LASSERTF(MSG_AT_SUPPORT == 8, " found %lld\n",
+ (long long)MSG_AT_SUPPORT);
LASSERTF(MSG_CONNECT_RECOVERING == 1, " found %lld\n",
(long long)MSG_CONNECT_RECOVERING);
LASSERTF(MSG_CONNECT_RECONNECT == 2, " found %lld\n",
(long long)(int)offsetof(struct lustre_msg_v2, lm_repsize));
LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize) == 4, " found %lld\n",
(long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize));
- LASSERTF((int)offsetof(struct lustre_msg_v2, lm_timeout) == 16, " found %lld\n",
- (long long)(int)offsetof(struct lustre_msg_v2, lm_timeout));
- LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_timeout) == 4, " found %lld\n",
- (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_timeout));
+ LASSERTF((int)offsetof(struct lustre_msg_v2, lm_cksum) == 16, " found %lld\n",
+ (long long)(int)offsetof(struct lustre_msg_v2, lm_cksum));
+ LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum) == 4, " found %lld\n",
+ (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum));
LASSERTF((int)offsetof(struct lustre_msg_v2, lm_padding_1) == 20, " found %lld\n",
(long long)(int)offsetof(struct lustre_msg_v2, lm_padding_1));
LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_1) == 4, " found %lld\n",
(long long)(int)offsetof(struct ptlrpc_body, pb_conn_cnt));
LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_conn_cnt) == 4, " found %lld\n",
(long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_conn_cnt));
- LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_1) == 68, " found %lld\n",
+ LASSERTF((int)offsetof(struct ptlrpc_body, pb_timeout) == 68, " found %lld\n",
+ (long long)(int)offsetof(struct ptlrpc_body, pb_timeout));
+ LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_timeout) == 4, " found %lld\n",
+ (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_timeout));
+ LASSERTF((int)offsetof(struct ptlrpc_body, pb_service_time) == 72, " found %lld\n",
+ (long long)(int)offsetof(struct ptlrpc_body, pb_service_time));
+ LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_service_time) == 4, " found %lld\n",
+ (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_service_time));
+ LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_1) == 76, " found %lld\n",
(long long)(int)offsetof(struct ptlrpc_body, pb_padding_1));
LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_padding_1) == 4, " found %lld\n",
(long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_padding_1));
- LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_2) == 72, " found %lld\n",
+ LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_2) == 80, " found %lld\n",
(long long)(int)offsetof(struct ptlrpc_body, pb_padding_2));
LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_padding_2) == 4, " found %lld\n",
(long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_padding_2));
- LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_3) == 76, " found %lld\n",
+ LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_3) == 84, " found %lld\n",
(long long)(int)offsetof(struct ptlrpc_body, pb_padding_3));
LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_padding_3) == 4, " found %lld\n",
(long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_padding_3));
- LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_4) == 80, " found %lld\n",
- (long long)(int)offsetof(struct ptlrpc_body, pb_padding_4));
- LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_padding_4) == 4, " found %lld\n",
- (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_padding_4));
- LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_5) == 84, " found %lld\n",
- (long long)(int)offsetof(struct ptlrpc_body, pb_padding_5));
- LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_padding_5) == 4, " found %lld\n",
- (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_padding_5));
/* Checks for struct obd_connect_data */
LASSERTF((int)sizeof(struct obd_connect_data) == 72, " found %lld\n",
CLASSERT(OBD_CONNECT_FID_CAPA == 0x100000ULL);
CLASSERT(OBD_CONNECT_OSS_CAPA == 0x200000ULL);
CLASSERT(OBD_CONNECT_CANCELSET == 0x400000ULL);
-
+ CLASSERT(OBD_CONNECT_SOM == 0x00800000ULL);
+ CLASSERT(OBD_CONNECT_AT == 0x01000000ULL);
+
/* Checks for struct obdo */
LASSERTF((int)sizeof(struct obdo) == 208, " found %lld\n",
(long long)(int)sizeof(struct obdo));
# xml xml xml xml xml xml dumb FIXME
MOUNTCONFSKIP="10 11 12 13 13b 14 15 18"
-# bug number for skipped test:
-ALWAYS_EXCEPT=" $CONF_SANITY_EXCEPT $MOUNTCONFSKIP"
+# bug number for skipped test: 13369
+ALWAYS_EXCEPT=" $CONF_SANITY_EXCEPT $MOUNTCONFSKIP 34a"
# UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
SRCDIR=`dirname $0`
mount -t lustre 4.3.2.1@tcp:/lustre $MOUNT || true
cleanup
}
-run_test 31 "Connect to non-existent node (shouldn't crash)"
+run_test 31 "Connect to non-existent node (returns errors, should not crash)"
test_32a() {
# XXX - make this run on client-only systems with real hardware on
load_modules
# mount a second time to make sure we didnt leave upgrade flag on
+ load_modules
$TUNEFS --dryrun $TMP/$tdir/mds || error "tunefs failed"
load_modules
start mds $TMP/$tdir/mds "-o loop,exclude=lustre-OST0000" || return 12
unzip -o -j -d $TMP/$tdir disk1_4.zip || { skip "Can't unzip disk1_4, skipping" && return ; }
load_modules
sysctl lnet.debug=$PTLDEBUG
+ NEWNAME=sofia
# writeconf will cause servers to register with their current nids
- $TUNEFS --writeconf $TMP/$tdir/mds || error "tunefs failed"
+ $TUNEFS --writeconf --fsname=$NEWNAME $TMP/$tdir/mds || error "tunefs failed"
start mds $TMP/$tdir/mds "-o loop" || return 3
- local UUID=$(cat $LPROC/mds/lustre-MDT0000/uuid)
+ local UUID=$(cat $LPROC/mds/${NEWNAME}-MDT0000/uuid)
echo MDS uuid $UUID
[ "$UUID" == "mdsA_UUID" ] || error "UUID is wrong: $UUID"
- $TUNEFS --mgsnode=`hostname` $TMP/$tdir/ost1 || error "tunefs failed"
+ $TUNEFS --mgsnode=`hostname` --fsname=$NEWNAME --writeconf $TMP/$tdir/ost1 || error "tunefs failed"
start ost1 $TMP/$tdir/ost1 "-o loop" || return 5
- UUID=$(cat $LPROC/obdfilter/lustre-OST0000/uuid)
+ UUID=$(cat $LPROC/obdfilter/${NEWNAME}-OST0000/uuid)
echo OST uuid $UUID
- [ "$UUID" == "ost1_UUID" ] || error "UUID is wrong: $UUID"
+ [ "$UUID" == "ost1_UUID" ] || error "UUID is wrong: $UUID"
echo "OSC changes should succeed:"
- $LCTL conf_param lustre-OST0000.osc.max_dirty_mb=15 || return 7
- $LCTL conf_param lustre-OST0000.failover.node=$NID || return 8
+ $LCTL conf_param ${NEWNAME}-OST0000.osc.max_dirty_mb=15 || return 7
+ $LCTL conf_param ${NEWNAME}-OST0000.failover.node=$NID || return 8
echo "ok."
echo "MDC changes should succeed:"
- $LCTL conf_param lustre-MDT0000.mdc.max_rpcs_in_flight=9 || return 9
+ $LCTL conf_param ${NEWNAME}-MDT0000.mdc.max_rpcs_in_flight=9 || return 9
echo "ok."
# MDT and OST should have registered with new nids, so we should have
# a fully-functioning client
echo "Check client and old fs contents"
+ OLDFS=$FSNAME
+ FSNAME=$NEWNAME
mount_client $MOUNT
+ FSNAME=$OLDFS
set_and_check client "cat $LPROC/mdc/*/max_rpcs_in_flight" "lustre-MDT0000.mdc.max_rpcs_in_flight" || return 11
[ "$(cksum $MOUNT/passwd | cut -d' ' -f 1,2)" == "2479747619 779" ] || return 12
echo "ok."
# bug 5493
ALWAYS_EXCEPT="52 $RECOVERY_SMALL_EXCEPT"
-PTLDEBUG=${PTLDEBUG:--1}
+#PTLDEBUG=${PTLDEBUG:--1}
LUSTRE=${LUSTRE:-`dirname $0`/..}
. $LUSTRE/tests/test-framework.sh
init_test_env $@
do_facet client cp $SAMPLE_FILE $f
sync
ost_evict_client
- # allow recovery to complete
- sleep $((TIMEOUT + 2))
+ # force reconnect
+ df $MOUNT > /dev/null 2>&1
+ sleep 2
# my understanding is that there should be nothing in the page
# cache after the client reconnects?
rc=0
OST_FILE=$LPROC/obdfilter/${ost1_svc}/num_exports
OST_NEXP1="`do_facet ost1 cat $OST_FILE | cut -d' ' -f2`"
echo starting with $OST_NEXP1 OST and $MDS_NEXP1 MDS exports
+ # force umount a client; exports should get evicted
zconf_umount `hostname` $MOUNT2 -f
- # evictor takes up to 2.25x to evict. But if there's a
- # race to start the evictor from various obds, the loser
- # might have to wait for the next ping.
- echo Waiting for $(($TIMEOUT * 4)) secs
- sleep $(($TIMEOUT * 4))
+ # evictor takes PING_EVICT_TIMEOUT + 3 * PING_INTERVAL to evict.
+ # But if there's a race to start the evictor from various obds,
+ # the loser might have to wait for the next ping.
+ echo Waiting for $(($TIMEOUT * 8)) secs
+ sleep $(($TIMEOUT * 8))
OST_NEXP2="`do_facet ost1 cat $OST_FILE | cut -d' ' -f2`"
MDS_NEXP2="`do_facet mds cat $MDS_FILE | cut -d' ' -f2`"
echo ending with $OST_NEXP2 OST and $MDS_NEXP2 MDS exports
test_50() {
mkdir -p $DIR/$tdir
+ debugsave
+ sysctl -w lnet.debug="-dlmtrace -ha"
# put a load of file creates/writes/deletes
- writemany -q $DIR/$tdir/$tfile 0 5 &
+ writemany -a -q $DIR/$tdir/$tfile 0 5 &
CLIENT_PID=$!
echo writemany pid $CLIENT_PID
sleep 10
+ ps $CLIENT_PID > /dev/null || error "Writemany died 1.1"
FAILURE_MODE="SOFT"
+ $LCTL mark "$TESTNAME fail mds 1"
fail mds
+ ps $CLIENT_PID > /dev/null || error "Writemany died 2.1"
# wait for client to reconnect to MDS
sleep 60
+ ps $CLIENT_PID > /dev/null || error "Writemany died 2.2"
+ $LCTL mark "$TESTNAME fail mds 2"
fail mds
+ ps $CLIENT_PID > /dev/null || error "Writemany died 3.1"
sleep 60
+ ps $CLIENT_PID > /dev/null || error "Writemany died 3.2"
+ $LCTL mark "$TESTNAME fail mds 3"
fail mds
# client process should see no problems even though MDS went down
+ ps $CLIENT_PID > /dev/null || error "Writemany died 4.1"
sleep $TIMEOUT
+ ps $CLIENT_PID > /dev/null || error "Writemany died 4.2"
kill -USR1 $CLIENT_PID
wait $CLIENT_PID
rc=$?
echo writemany returned $rc
#these may fail because of eviction due to slow AST response.
+ debugrestore
return $rc
}
run_test 50 "failover MDS under load"
test_51() {
mkdir -p $DIR/$tdir
# put a load of file creates/writes/deletes
- writemany -q $DIR/$tdir/$tfile 0 5 &
+ writemany -a -q $DIR/$tdir/$tfile 0 5 &
CLIENT_PID=$!
sleep 1
FAILURE_MODE="SOFT"
for i in $SEQ
do
echo failover in $i sec
+ ps $CLIENT_PID > /dev/null || error "Writemany died $i.1"
sleep $i
+ ps $CLIENT_PID > /dev/null || error "Writemany died $i.2"
+ $LCTL mark "$TESTNAME fail mds $i"
facet_failover mds
done
# client process should see no problems even though MDS went down
echo writemany pid $CLIENT_PID
sleep 10
FAILURE_MODE="SOFT"
+ $LCTL mark "$TESTNAME fail ost $1"
fail ost1
rc=0
wait $CLIENT_PID || rc=$?
test_52() {
mkdir -p $DIR/$tdir
- test_52_guts
+ test_52_guts 1
rc=$?
[ $rc -ne 0 ] && { return $rc; }
# wait for client to reconnect to OST
sleep 30
- test_52_guts
+ test_52_guts 2
rc=$?
[ $rc -ne 0 ] && { return $rc; }
sleep 30
- test_52_guts
+ test_52_guts 3
rc=$?
client_reconnect
#return $rc
#define OBD_FAIL_LDLM_ENQUEUE 0x302
do_facet mds sysctl -w lustre.fail_loc=0x80000302
facet_failover mds
- df $MOUNT || return 1
+ df $MOUNT || { kill -USR1 $MULTIPID && return 1; }
do_facet mds sysctl -w lustre.fail_loc=0
ls $DIR/$tfile
test_44() {
mdcdev=`awk '/-mdc-/ {print $1}' $LPROC/devices`
[ "$mdcdev" ] || exit 2
+ # adaptive timeouts slow this test way down
+ MDS_AT_MAX=$(do_facet mds "sysctl -n lustre.adaptive_max")
+ do_facet mds "sysctl -w lustre.adaptive_max=40"
for i in `seq 1 10`; do
+ echo "$i of 10 ($(date +%s))"
+ do_facet mds "grep service $LPROC/mdt/MDS/mds/timeouts"
#define OBD_FAIL_TGT_CONN_RACE 0x701
do_facet mds "sysctl -w lustre.fail_loc=0x80000701"
$LCTL --device $mdcdev recover
df $MOUNT
done
do_facet mds "sysctl -w lustre.fail_loc=0"
+ do_facet mds "sysctl -w lustre.adaptive_max=$MDS_AT_MAX"
return 0
}
run_test 44 "race in target handle connect"
mdcdev=`awk '/-mdc-/ {print $1}' $LPROC/devices`
[ "$mdcdev" ] || exit 2
for i in `seq 1 10`; do
+ echo "$i of 10 ($(date +%s))"
+ do_facet mds "grep service $LPROC/mdt/MDS/mds/timeouts"
#define OBD_FAIL_TGT_DELAY_RECONNECT 0x704
do_facet mds "sysctl -w lustre.fail_loc=0x80000704"
$LCTL --device $mdcdev recover
run_test 60 "test llog post recovery init vs llog unlink"
#test race llog recovery thread vs llog cleanup
-test_61() {
+test_61a() {
mkdir $DIR/$tdir
createmany -o $DIR/$tdir/$tfile-%d 800
replay_barrier ost1
$CHECKSTAT -t file $DIR/$tdir/$tfile-* && return 1
rmdir $DIR/$tdir
}
-run_test 61 "test race llog recovery vs llog cleanup"
+run_test 61a "test race llog recovery vs llog cleanup"
#test race mds llog sync vs llog cleanup
test_61b() {
}
run_test 61c "test race mds llog sync vs llog cleanup"
+
+at_start() #bug 3055
+{
+ if [ -z "$ATOLDBASE" ]; then
+ ATOLDBASE=$(do_facet mds "grep timebase $LPROC/mdt/MDS/mds/timeouts" | awk '{print $3}' )
+ # speed up the timebase so we can check decreasing AT
+ do_facet mds "echo 8 >> $LPROC/mdt/MDS/mds/timeouts"
+ do_facet mds "echo 8 >> $LPROC/mdt/MDS/mds_readpage/timeouts"
+ do_facet mds "echo 8 >> $LPROC/mdt/MDS/mds_setattr/timeouts"
+ do_facet ost1 "echo 8 >> $LPROC/ost/OSS/ost/timeouts"
+ fi
+}
+
+test_65() #bug 3055
+{
+ at_start
+ $LCTL dk > /dev/null
+ # slow down a request
+ sysctl -w lustre.fail_val=30000
+#define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a
+ sysctl -w lustre.fail_loc=0x8000050a
+ createmany -o $DIR/$tfile 10 > /dev/null
+ unlinkmany $DIR/$tfile 10 > /dev/null
+ # check for log message
+ $LCTL dk | grep "Early reply #" || error "No early reply"
+ # client should show 30s timeouts
+ grep portal $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts
+ sleep 9
+ grep portal $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts
+}
+run_test 65 "AT: verify early replies"
+
+test_66a() #bug 3055
+{
+ at_start
+ grep "portal 12" $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts
+ # adjust 5s at a time so no early reply is sent (within deadline)
+ do_facet mds "sysctl -w lustre.fail_val=5000"
+#define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a
+ do_facet mds "sysctl -w lustre.fail_loc=0x8000050a"
+ createmany -o $DIR/$tfile 20 > /dev/null
+ unlinkmany $DIR/$tfile 20 > /dev/null
+ grep "portal 12" $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts
+ do_facet mds "sysctl -w lustre.fail_val=10000"
+ do_facet mds "sysctl -w lustre.fail_loc=0x8000050a"
+ createmany -o $DIR/$tfile 20 > /dev/null
+ unlinkmany $DIR/$tfile 20 > /dev/null
+ grep "portal 12" $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts
+ do_facet mds "sysctl -w lustre.fail_loc=0"
+ sleep 9
+ createmany -o $DIR/$tfile 20 > /dev/null
+ unlinkmany $DIR/$tfile 20 > /dev/null
+ grep portal $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts | grep "portal 12"
+ CUR=$(awk '/portal 12/ {print $5}' $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts)
+ WORST=$(awk '/portal 12/ {print $7}' $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts)
+ echo "Current MDT timeout $CUR, worst $WORST"
+ [ $CUR -lt $WORST ] || error "Current $CUR should be less than worst $WORST"
+}
+run_test 66a "AT: verify MDT service time adjusts with no early replies"
+
+test_66b() #bug 3055
+{
+ at_start
+ ORIG=$(awk '/network/ {print $4}' $LPROC/mdc/${FSNAME}-*/timeouts)
+ sysctl -w lustre.fail_val=$(($ORIG + 5))
+#define OBD_FAIL_PTLRPC_PAUSE_REP 0x50c
+ sysctl -w lustre.fail_loc=0x50c
+ ls $DIR/$tfile > /dev/null 2>&1
+ sysctl -w lustre.fail_loc=0
+ CUR=$(awk '/network/ {print $4}' $LPROC/mdc/${FSNAME}-*/timeouts)
+ WORST=$(awk '/network/ {print $6}' $LPROC/mdc/${FSNAME}-*/timeouts)
+ echo "network timeout orig $ORIG, cur $CUR, worst $WORST"
+ [ $WORST -gt $ORIG ] || error "Worst $WORST should be worse than orig $ORIG"
+}
+run_test 66b "AT: verify net latency adjusts"
+
+test_67a() #bug 3055
+{
+ at_start
+ CONN1=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats)
+ # sleeping threads may drive values above this
+ do_facet ost1 "sysctl -w lustre.fail_val=400"
+#define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a
+ do_facet ost1 "sysctl -w lustre.fail_loc=0x50a"
+ createmany -o $DIR/$tfile 20 > /dev/null
+ unlinkmany $DIR/$tfile 20 > /dev/null
+ do_facet ost1 "sysctl -w lustre.fail_loc=0"
+ CONN2=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats)
+ ATTEMPTS=$(($CONN2 - $CONN1))
+ echo "$ATTEMPTS osc reconnect attemps on gradual slow"
+ [ $ATTEMPTS -gt 0 ] && error "AT should have prevented reconnect"
+ return 0
+}
+run_test 67a "AT: verify slow request processing doesn't induce reconnects"
+
+test_67b() #bug 3055
+{
+ at_start
+ CONN1=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats)
+#define OBD_FAIL_OST_PAUSE_CREATE 0x223
+ do_facet ost1 "sysctl -w lustre.fail_val=20000"
+ do_facet ost1 "sysctl -w lustre.fail_loc=0x80000223"
+ cp /etc/profile $DIR/$tfile || error "cp failed"
+ client_reconnect
+ cat $LPROC/ost/OSS/ost_create/timeouts
+ log "phase 2"
+ CONN2=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats)
+ ATTEMPTS=$(($CONN2 - $CONN1))
+ echo "$ATTEMPTS osc reconnect attemps on instant slow"
+ # do it again; should not timeout
+ do_facet ost1 "sysctl -w lustre.fail_loc=0x80000223"
+ cp /etc/profile $DIR/$tfile || error "cp failed"
+ do_facet ost1 "sysctl -w lustre.fail_loc=0"
+ client_reconnect
+ cat $LPROC/ost/OSS/ost_create/timeouts
+ CONN3=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats)
+ ATTEMPTS=$(($CONN3 - $CONN2))
+ echo "$ATTEMPTS osc reconnect attemps on 2nd slow"
+ [ $ATTEMPTS -gt 0 ] && error "AT should have prevented reconnect"
+ return 0
+}
+run_test 67b "AT: verify instant slowdown doesn't induce reconnects"
+
+if [ -n "$ATOLDBASE" ]; then
+ do_facet mds "echo $ATOLDBASE >> $LPROC/mdt/MDS/mds/timeouts"
+ do_facet mds "echo $ATOLDBASE >> $LPROC/mdt/MDS/mds_readpage/timeouts"
+ do_facet mds "echo $ATOLDBASE >> $LPROC/mdt/MDS/mds_setattr/timeouts"
+ do_facet ost1 "echo $ATOLDBASE >> $LPROC/ost/OSS/ost/timeouts"
+fi
+# end of AT tests (the ATOLDBASE restore above belongs to them)
+
+
equals_msg `basename $0`: test complete, cleaning up
check_and_cleanup_lustre
[ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG || true
$SETSTRIPE $DIR/d27/f0 65536 0 1 || error "lstripe failed"
$CHECKSTAT -t file $DIR/d27/f0 || error "checkstat failed"
pass
- log "== test_27b: write to one stripe file ========================="
+ log "== test 27b: write to one stripe file ========================="
cp /etc/hosts $DIR/d27/f0 || error
}
run_test 27a "one stripe file =================================="
[ `$GETSTRIPE $DIR/d27/f01 | grep -A 10 obdidx | wc -l` -eq 4 ] ||
error "two-stripe file doesn't have two stripes"
pass
- log "== test_27d: write to two stripe file file f01 ================"
+ log "== test 27d: write to two stripe file file f01 ================"
dd if=/dev/zero of=$DIR/d27/f01 bs=4k count=4 || error "dd failed"
}
run_test 27c "create two stripe file f01 ======================="
TOIGNORE=$(find . -type l -printf '-I %f\n' -o \
! -perm +4 -printf '-I %f\n')
$RUNAS cvs -d $DIR/d99cvsroot import -m "nomesg" $TOIGNORE \
- d99reposname vtag rtag || error "cvs import failed"
+ d99reposname vtag rtag > /dev/null || error "cvs import failed"
}
run_test 99b "cvs import ======================================="
mkdir -p $DIR/d99reposname || error "mkdir $DIR/d99reposname failed"
chown $RUNAS_ID $DIR/d99reposname || \
error "chown $DIR/d99reposname failed"
- $RUNAS cvs -d $DIR/d99cvsroot co d99reposname || \
+ $RUNAS cvs -d $DIR/d99cvsroot co d99reposname > /dev/null || \
error "cvs co d99reposname failed"
}
run_test 99c "cvs checkout ====================================="
echo "performing cp ..."
run_acl_subtest cp || error
echo "performing getfacl-noacl..."
- run_acl_subtest getfacl-noacl || error
+ run_acl_subtest getfacl-noacl > /dev/null || error
echo "performing misc..."
- run_acl_subtest misc || error
+ run_acl_subtest misc > /dev/null || error
# XXX add back permission test when we support supplementary groups.
# echo "performing permissions..."
# run_acl_subtest permissions || error
echo "performing setfacl..."
- run_acl_subtest setfacl || error
+ run_acl_subtest setfacl > /dev/null || error
# inheritance test got from HP
echo "performing inheritance..."
cp $SAVE_PWD/acl/make-tree . || error
chmod +x make-tree || error
- run_acl_subtest inheritance || error
+ run_acl_subtest inheritance > /dev/null || error
rm -f make-tree
cd $SAVE_PWD
ONLY=${ONLY:-"$*"}
# bug number for skipped test: 3192
-ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"14b"}
+ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"14b $SANITYN_EXCEPT"}
# UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
[ "$SLOW" = "no" ] && EXCEPT="$EXCEPT 16"
do_facet ${facet} mount -t lustre $@ ${device} ${MOUNT%/*}/${facet}
RC=${PIPESTATUS[0]}
if [ $RC -ne 0 ]; then
- echo mount -t lustre $@ ${device} ${MOUNT%/*}/${facet}
- echo Start of ${device} on ${facet} failed ${RC}
+ echo "mount -t lustre $@ ${device} ${MOUNT%/*}/${facet}"
+ echo "Start of ${device} on ${facet} failed ${RC}"
else
do_facet ${facet} sync
label=$(do_facet ${facet} "e2label ${device}")
shift
HOST=`facet_active_host $facet`
[ -z $HOST ] && echo No host defined for facet ${facet} && exit 1
- do_node $HOST $@
+ do_node $HOST "$@"
}
add() {
CHECK_MEMBER(lustre_msg_v2, lm_secflvr);
CHECK_MEMBER(lustre_msg_v2, lm_magic);
CHECK_MEMBER(lustre_msg_v2, lm_repsize);
- CHECK_MEMBER(lustre_msg_v2, lm_timeout);
+ CHECK_MEMBER(lustre_msg_v2, lm_cksum);
CHECK_MEMBER(lustre_msg_v2, lm_padding_1);
CHECK_MEMBER(lustre_msg_v2, lm_padding_2);
CHECK_MEMBER(lustre_msg_v2, lm_padding_3);
CHECK_MEMBER(ptlrpc_body, pb_flags);
CHECK_MEMBER(ptlrpc_body, pb_op_flags);
CHECK_MEMBER(ptlrpc_body, pb_conn_cnt);
+ CHECK_MEMBER(ptlrpc_body, pb_timeout);
+ CHECK_MEMBER(ptlrpc_body, pb_service_time);
CHECK_MEMBER(ptlrpc_body, pb_padding_1);
CHECK_MEMBER(ptlrpc_body, pb_padding_2);
CHECK_MEMBER(ptlrpc_body, pb_padding_3);
- CHECK_MEMBER(ptlrpc_body, pb_padding_4);
- CHECK_MEMBER(ptlrpc_body, pb_padding_5);
}
static void check_obd_connect_data(void)
CHECK_VALUE(MSG_LAST_REPLAY);
CHECK_VALUE(MSG_RESENT);
CHECK_VALUE(MSG_REPLAY);
+ CHECK_VALUE(MSG_AT_SUPPORT);
CHECK_VALUE(MSG_CONNECT_RECOVERING);
CHECK_VALUE(MSG_CONNECT_RECONNECT);
{
/* Wire protocol assertions generated by 'wirecheck'
* (make -C lustre/utils newwirecheck)
- * running on Linux pancake 2.6.18-skas3-v9-pre9 #2 Tue Oct 17 13:08:24 PDT 2006 i686 i686 i3
+ * running on Linux pancake 2.6.18-skas3-v9-pre9 #1 Tue Feb 20 10:37:58 PST 2007 i686 i686 i3
* with gcc version 3.4.4 */
(long long)MSG_RESENT);
LASSERTF(MSG_REPLAY == 4, " found %lld\n",
(long long)MSG_REPLAY);
+ LASSERTF(MSG_AT_SUPPORT == 8, " found %lld\n",
+ (long long)MSG_AT_SUPPORT);
LASSERTF(MSG_CONNECT_RECOVERING == 1, " found %lld\n",
(long long)MSG_CONNECT_RECOVERING);
LASSERTF(MSG_CONNECT_RECONNECT == 2, " found %lld\n",
(long long)(int)offsetof(struct lustre_msg_v2, lm_repsize));
LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize) == 4, " found %lld\n",
(long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize));
- LASSERTF((int)offsetof(struct lustre_msg_v2, lm_timeout) == 16, " found %lld\n",
- (long long)(int)offsetof(struct lustre_msg_v2, lm_timeout));
- LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_timeout) == 4, " found %lld\n",
- (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_timeout));
+ LASSERTF((int)offsetof(struct lustre_msg_v2, lm_cksum) == 16, " found %lld\n",
+ (long long)(int)offsetof(struct lustre_msg_v2, lm_cksum));
+ LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum) == 4, " found %lld\n",
+ (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum));
LASSERTF((int)offsetof(struct lustre_msg_v2, lm_padding_1) == 20, " found %lld\n",
(long long)(int)offsetof(struct lustre_msg_v2, lm_padding_1));
LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_1) == 4, " found %lld\n",
(long long)(int)offsetof(struct ptlrpc_body, pb_conn_cnt));
LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_conn_cnt) == 4, " found %lld\n",
(long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_conn_cnt));
- LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_1) == 68, " found %lld\n",
+ LASSERTF((int)offsetof(struct ptlrpc_body, pb_timeout) == 68, " found %lld\n",
+ (long long)(int)offsetof(struct ptlrpc_body, pb_timeout));
+ LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_timeout) == 4, " found %lld\n",
+ (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_timeout));
+ LASSERTF((int)offsetof(struct ptlrpc_body, pb_service_time) == 72, " found %lld\n",
+ (long long)(int)offsetof(struct ptlrpc_body, pb_service_time));
+ LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_service_time) == 4, " found %lld\n",
+ (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_service_time));
+ LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_1) == 76, " found %lld\n",
(long long)(int)offsetof(struct ptlrpc_body, pb_padding_1));
LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_padding_1) == 4, " found %lld\n",
(long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_padding_1));
- LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_2) == 72, " found %lld\n",
+ LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_2) == 80, " found %lld\n",
(long long)(int)offsetof(struct ptlrpc_body, pb_padding_2));
LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_padding_2) == 4, " found %lld\n",
(long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_padding_2));
- LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_3) == 76, " found %lld\n",
+ LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_3) == 84, " found %lld\n",
(long long)(int)offsetof(struct ptlrpc_body, pb_padding_3));
LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_padding_3) == 4, " found %lld\n",
(long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_padding_3));
- LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_4) == 80, " found %lld\n",
- (long long)(int)offsetof(struct ptlrpc_body, pb_padding_4));
- LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_padding_4) == 4, " found %lld\n",
- (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_padding_4));
- LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_5) == 84, " found %lld\n",
- (long long)(int)offsetof(struct ptlrpc_body, pb_padding_5));
- LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_padding_5) == 4, " found %lld\n",
- (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_padding_5));
/* Checks for struct obd_connect_data */
LASSERTF((int)sizeof(struct obd_connect_data) == 72, " found %lld\n",
CLASSERT(OBD_CONNECT_FID_CAPA == 0x100000ULL);
CLASSERT(OBD_CONNECT_OSS_CAPA == 0x200000ULL);
CLASSERT(OBD_CONNECT_CANCELSET == 0x400000ULL);
-
+ CLASSERT(OBD_CONNECT_SOM == 0x00800000ULL);
+ CLASSERT(OBD_CONNECT_AT == 0x01000000ULL);
+
/* Checks for struct obdo */
LASSERTF((int)sizeof(struct obdo) == 208, " found %lld\n",
(long long)(int)sizeof(struct obdo));