Port Adaptive Timeouts from b1_6 to HEAD.
b=14071
r=nathan
r=adilger
Description: lfs find on -1 stripe looping in lsm_lmm_verify_common()
Details : Avoid lov_verify_lmm_common() on directory with -1 stripe count.
-Severity : major
-Bugzilla : 12932
-Description: obd_health_check_timeout too short
-Details : set obd_health_check_timeout as 1.5x of obd_timeout
+Severity : enhancement
+Bugzilla : 3055
+Description: Adaptive timeouts
+Details : RPC timeouts adapt to changing server load and network
+ conditions to reduce resend attempts and improve recovery time.
Severity : normal
Bugzilla : 12192
OBD_CONNECT_OSS_CAPA |
OBD_CONNECT_IBITS |
OBD_CONNECT_MDS_MDS |
- OBD_CONNECT_FID;
+ OBD_CONNECT_FID |
+ OBD_CONNECT_AT;
rc = obd_connect(env, conn, mdc, &mdc->obd_uuid, ocd, NULL);
OBD_FREE_PTR(ocd);
if (rc) {
req->rq_request_portal = (opc == SEQ_ALLOC_SUPER) ?
SEQ_CONTROLLER_PORTAL : SEQ_DATA_PORTAL;
}
+ ptlrpc_at_set_req_timeout(req);
mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
rc = ptlrpc_queue_wait(req);
ptlrpc_request_set_replen(req);
req->rq_request_portal = FLD_REQUEST_PORTAL;
+ ptlrpc_at_set_req_timeout(req);
if (fld_op != FLD_LOOKUP)
mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
#define FSFILT_OP_JOIN 11
#define FSFILT_OP_NOOP 15
-#define __fsfilt_check_slow(obd, start, timeout, msg) \
+#define __fsfilt_check_slow(obd, start, msg) \
do { \
if (time_before(jiffies, start + 15 * HZ)) \
break; \
else if (time_before(jiffies, start + 30 * HZ)) \
CDEBUG(D_VFSTRACE, "%s: slow %s %lus\n", obd->obd_name, \
msg, (jiffies-start) / HZ); \
- else if (time_before(jiffies, start + timeout / 2 * HZ)) \
+ else if (time_before(jiffies, start + DISK_TIMEOUT * HZ)) \
CWARN("%s: slow %s %lus\n", obd->obd_name, msg, \
(jiffies - start) / HZ); \
else \
(jiffies - start) / HZ); \
} while (0)
-#define fsfilt_check_slow(obd, start, timeout, msg) \
-do { \
- __fsfilt_check_slow(obd, start, timeout, msg); \
- start = jiffies; \
+#define fsfilt_check_slow(obd, start, msg) \
+do { \
+ __fsfilt_check_slow(obd, start, msg); \
+ start = jiffies; \
} while (0)
static inline void *fsfilt_start_log(struct obd_device *obd,
LBUG();
}
}
- fsfilt_check_slow(obd, now, obd_timeout, "journal start");
+ fsfilt_check_slow(obd, now, "journal start");
return handle;
}
LBUG();
}
}
- fsfilt_check_slow(obd, now, obd_timeout, "journal start");
+ fsfilt_check_slow(obd, now, "journal start");
return handle;
}
int rc = obd->obd_fsops->fs_extend(inode, nblocks, handle);
CDEBUG(D_INFO, "extending handle %p with %u blocks\n", handle, nblocks);
- fsfilt_check_slow(obd, now, obd_timeout, "journal extend");
+ fsfilt_check_slow(obd, now, "journal extend");
return rc;
}
int rc = obd->obd_fsops->fs_commit(inode, handle, force_sync);
CDEBUG(D_INFO, "committing handle %p\n", handle);
- fsfilt_check_slow(obd, now, obd_timeout, "journal start");
+ fsfilt_check_slow(obd, now, "journal start");
return rc;
}
int rc = obd->obd_fsops->fs_commit_async(inode, handle, wait_handle);
CDEBUG(D_INFO, "committing handle %p (async)\n", *wait_handle);
- fsfilt_check_slow(obd, now, obd_timeout, "journal start");
+ fsfilt_check_slow(obd, now, "journal start");
return rc;
}
unsigned long now = jiffies;
int rc = obd->obd_fsops->fs_commit_wait(inode, handle);
CDEBUG(D_INFO, "waiting for completion %p\n", handle);
- fsfilt_check_slow(obd, now, obd_timeout, "journal start");
+ fsfilt_check_slow(obd, now, "journal start");
return rc;
}
unsigned long now = jiffies;
int rc;
rc = obd->obd_fsops->fs_setattr(dentry, handle, iattr, do_trunc);
- fsfilt_check_slow(obd, now, obd_timeout, "setattr");
+ fsfilt_check_slow(obd, now, "setattr");
return rc;
}
struct file;
struct obd_histogram;
+/* Days / hours / mins / seconds format */
+struct dhms {
+ int d,h,m,s;
+};
+static inline void s2dhms(struct dhms *ts, time_t secs)
+{
+ ts->d = secs / 86400;
+ secs = secs % 86400;
+ ts->h = secs / 3600;
+ secs = secs % 3600;
+ ts->m = secs / 60;
+ ts->s = secs % 60;
+}
+#define DHMS_FMT "%dd%dh%02dm%02ds"
+#define DHMS_VARS(x) (x)->d, (x)->h, (x)->m, (x)->s
+
+
#ifdef LPROCFS
static inline int lprocfs_stats_lock(struct lprocfs_stats *stats, int type)
int count, int *eof, void *data);
extern int lprocfs_rd_numrefs(char *page, char **start, off_t off,
int count, int *eof, void *data);
+struct adaptive_timeout;
+extern int lprocfs_at_hist_helper(char *page, int count, int rc,
+ struct adaptive_timeout *at);
+extern int lprocfs_rd_timeouts(char *page, char **start, off_t off,
+ int count, int *eof, void *data);
+extern int lprocfs_wr_timeouts(struct file *file, const char *buffer,
+ unsigned long count, void *data);
extern int lprocfs_wr_evict_client(struct file *file, const char *buffer,
unsigned long count, void *data);
extern int lprocfs_wr_ping(struct file *file, const char *buffer,
#define LPROC_SEQ_FOPS_RO(name) __LPROC_SEQ_FOPS(name, NULL)
#define LPROC_SEQ_FOPS(name) __LPROC_SEQ_FOPS(name, name##_seq_write)
+/* lproc_ptlrpc.c */
+struct ptlrpc_request;
+extern void target_print_req(void *seq_file, struct ptlrpc_request *req);
+
/* lprocfs_status.c: read recovery max time bz13079 */
int lprocfs_obd_rd_recovery_maxtime(char *page, char **start, off_t off,
int count, int *eof, void *data);
static inline int lprocfs_rd_numrefs(char *page, char **start, off_t off,
int count, int *eof, void *data)
{ return 0; }
+struct adaptive_timeout;
+static inline int lprocfs_at_hist_helper(char *page, int count, int rc,
+ struct adaptive_timeout *at)
+{ return 0; }
+static inline int lprocfs_rd_timeouts(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{ return 0; }
+static inline int lprocfs_wr_timeouts(struct file *file, const char *buffer,
+ unsigned long count, void *data)
+{ return 0; }
static inline int lprocfs_wr_evict_client(struct file *file, const char *buffer,
unsigned long count, void *data)
{ return 0; }
#define LPROC_SEQ_FOPS_RO(name)
#define LPROC_SEQ_FOPS(name)
+/* lproc_ptlrpc.c */
+#define target_print_req NULL
+
#endif /* LPROCFS */
#endif /* LPROCFS_SNMP_H */
tgt->cookie = src->cookie;
}
+/* flags for lm_flags */
+#define MSGHDR_AT_SUPPORT 0x1
+
#define lustre_msg lustre_msg_v2
/* we depend on this structure to be 8-byte aligned */
/* this type is only endian-adjusted in lustre_unpack_msg() */
__u32 lm_secflvr;
__u32 lm_magic;
__u32 lm_repsize;
- __u32 lm_timeout;
- __u32 lm_padding_1;
+ __u32 lm_cksum;
+ __u32 lm_flags;
__u32 lm_padding_2;
__u32 lm_padding_3;
__u32 lm_buflens[0];
__u32 pb_flags;
__u32 pb_op_flags;
__u32 pb_conn_cnt;
- __u32 pb_padding_1;
- __u32 pb_padding_2;
+ __u32 pb_timeout; /* for req, the deadline, for rep, the service est */
+ __u32 pb_service_time; /* for rep, actual service time */
__u32 pb_limit;
__u64 pb_slv;
};
#define MSG_OP_FLAG_SHIFT 16
/* Flags that apply to all requests are in the bottom 16 bits */
-#define MSG_GEN_FLAG_MASK 0x0000ffff
-#define MSG_LAST_REPLAY 1
-#define MSG_RESENT 2
-#define MSG_REPLAY 4
-#define MSG_REQ_REPLAY_DONE 8
-#define MSG_LOCK_REPLAY_DONE 16
+#define MSG_GEN_FLAG_MASK 0x0000ffff
+#define MSG_LAST_REPLAY 0x0001
+#define MSG_RESENT 0x0002
+#define MSG_REPLAY 0x0004
+/* #define MSG_AT_SUPPORT 0x0008
+ * This was used in early prototypes of adaptive timeouts, and while there
+ * shouldn't be any users of that code, there also isn't a need for using this
+ * bit. Defer usage until at least 1.10 to avoid potential conflict. */
+#define MSG_REQ_REPLAY_DONE 0x0010
+#define MSG_LOCK_REPLAY_DONE 0x0020
/*
* Flags for all connect opcodes (MDS_CONNECT, OST_CONNECT)
OBD_CONNECT_MDS_CAPA | OBD_CONNECT_OSS_CAPA | \
OBD_CONNECT_MDS_MDS | OBD_CONNECT_CANCELSET | \
OBD_CONNECT_FID | \
- LRU_RESIZE_CONNECT_FLAG)
+ LRU_RESIZE_CONNECT_FLAG | OBD_CONNECT_AT)
#define OST_CONNECT_SUPPORTED (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \
OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \
OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_INDEX | \
OBD_CONNECT_BRW_SIZE | OBD_CONNECT_QUOTA64 | \
OBD_CONNECT_OSS_CAPA | OBD_CONNECT_CANCELSET | \
- OBD_CONNECT_CKSUM | \
- LRU_RESIZE_CONNECT_FLAG)
+ OBD_CONNECT_CKSUM | LRU_RESIZE_CONNECT_FLAG | \
+ OBD_CONNECT_AT)
#define ECHO_CONNECT_SUPPORTED (0)
-#define MGS_CONNECT_SUPPORTED (OBD_CONNECT_VERSION)
+#define MGS_CONNECT_SUPPORTED (OBD_CONNECT_VERSION | OBD_CONNECT_AT)
#define MAX_QUOTA_COUNT32 (0xffffffffULL)
* Backward link to obd, required for ldlm pool to store new SLV.
*/
struct obd_device *ns_obd;
+
+ struct adaptive_timeout ns_at_estimate;/* estimated lock callback time*/
};
static inline int ns_is_client(struct ldlm_namespace *ns)
struct ldlm_export_data exp_ldlm_data;
struct list_head exp_outstanding_replies;
time_t exp_last_request_time;
+ struct list_head exp_req_replay_queue;
spinlock_t exp_lock; /* protects flags int below */
/* ^ protects exp_outstanding_replies too */
__u64 exp_connect_flags;
#include <lustre_handles.h>
#include <lustre/lustre_idl.h>
+
+/* Adaptive Timeout stuff */
+#define D_ADAPTTO D_OTHER
+#define AT_BINS 4 /* "bin" means "N seconds of history" */
+#define AT_FLG_NOHIST 0x1 /* use last reported value only */
+
+struct adaptive_timeout {
+ time_t at_binstart; /* bin start time */
+ unsigned int at_hist[AT_BINS]; /* timeout history bins */
+ unsigned int at_flags;
+ unsigned int at_current; /* current timeout value */
+ unsigned int at_worst_ever; /* worst-ever timeout value */
+ time_t at_worst_time; /* worst-ever timeout timestamp */
+ spinlock_t at_lock;
+};
+
enum lustre_imp_state {
LUSTRE_IMP_CLOSED = 1,
LUSTRE_IMP_NEW = 2,
__u64 oic_last_attempt; /* jiffies, 64-bit */
};
+#define IMP_AT_MAX_PORTALS 8
+struct imp_at {
+ int iat_portal[IMP_AT_MAX_PORTALS];
+ struct adaptive_timeout iat_net_latency;
+ struct adaptive_timeout iat_service_estimate[IMP_AT_MAX_PORTALS];
+};
+
struct obd_import {
struct portals_handle imp_handle;
atomic_t imp_refcount;
int imp_connect_error;
__u32 imp_msg_magic;
+ __u32 imp_msghdr_flags; /* adjusted based on server capability */
- struct ptlrpc_request_pool *imp_rq_pool; /* emergency request pool */
+ struct ptlrpc_request_pool *imp_rq_pool; /* emergency request pool */
+
+ struct imp_at imp_at; /* adaptive timeout data */
+ time_t imp_last_reply_time; /* for health check */
};
typedef void (*obd_import_callback)(struct obd_import *imp, void *closure,
void class_notify_import_observers(struct obd_import *imp, int event,
void *event_arg);
+/* import.c */
+static inline void at_init(struct adaptive_timeout *at, int val, int flags) {
+ memset(at, 0, sizeof(*at));
+ at->at_current = val;
+ at->at_worst_ever = val;
+ at->at_worst_time = cfs_time_current_sec();
+ at->at_flags = flags;
+ spin_lock_init(&at->at_lock);
+}
+static inline int at_get(struct adaptive_timeout *at) {
+ return at->at_current;
+}
+int at_add(struct adaptive_timeout *at, unsigned int val);
+int import_at_get_index(struct obd_import *imp, int portal);
+extern unsigned int at_max;
+#define AT_OFF (at_max == 0)
+
/* genops.c */
struct obd_export;
extern struct obd_import *class_exp2cliimp(struct obd_export *);
#define target_handle_qc_callback(req) (0)
#endif
-void target_cancel_recovery_timer(struct obd_device *obd);
-
-#define OBD_RECOVERY_TIMEOUT (obd_timeout * 5 / 2) /* *waves hands* */
#define OBD_RECOVERY_MAX_TIME (obd_timeout * 18) /* b13079 */
-void target_start_recovery_timer(struct obd_device *obd);
+
+void target_cancel_recovery_timer(struct obd_device *obd);
int target_start_recovery_thread(struct obd_device *obd,
- svc_handler_t handler);
+ svc_handler_t handler);
void target_stop_recovery_thread(struct obd_device *obd);
void target_cleanup_recovery(struct obd_device *obd);
int target_queue_recovery_request(struct ptlrpc_request *req,
struct obd_device *obd);
-int target_queue_final_reply(struct ptlrpc_request *req, int rc);
void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id);
/* client.c */
struct ptlrpc_request {
int rq_type; /* one of PTL_RPC_MSG_* */
struct list_head rq_list;
+ struct list_head rq_timed_list; /* server-side early replies */
struct list_head rq_history_list; /* server-side history */
__u64 rq_history_seq; /* history sequence # */
int rq_status;
spinlock_t rq_lock;
- /* client-side flags */
+ /* client-side flags are serialized by rq_lock */
unsigned long rq_intr:1, rq_replied:1, rq_err:1,
rq_timedout:1, rq_resend:1, rq_restart:1,
/*
/* this is the last request in the sequence. */
rq_sequence:1,
rq_no_resend:1, rq_waiting:1, rq_receiving_reply:1,
- rq_no_delay:1, rq_net_err:1, rq_wait_ctx:1;
+ rq_no_delay:1, rq_net_err:1, rq_wait_ctx:1,
+ rq_early:1, rq_must_unlink:1,
+ /* server-side flags */
+ rq_packed_final:1, /* packed final reply */
+ rq_sent_final:1; /* stop sending early replies */
+
enum rq_phase rq_phase; /* one of RQ_PHASE_* */
- atomic_t rq_refcount; /* client-side refcount for SENT race */
- atomic_t rq_refcount; /* client-side refcount for SENT race,
+ atomic_t rq_refcount; /* client-side refcount for SENT race,
server-side refcount for multiple replies */
struct ptlrpc_thread *rq_svc_thread; /* initial thread servicing req */
int rq_reqlen;
struct lustre_msg *rq_reqmsg;
- int rq_timeout; /* time to wait for reply (seconds) */
int rq_replen;
struct lustre_msg *rq_repmsg;
__u64 rq_transno;
/* (server side), pointed directly into req buffer */
struct ptlrpc_user_desc *rq_user_desc;
+ /* early replies go to offset 0, regular replies go after that */
+ unsigned int rq_reply_off;
+
/* various buffer pointers */
struct lustre_msg *rq_reqbuf; /* req wrapper */
int rq_reqbuf_len; /* req wrapper buf len */
int rq_reqdata_len; /* req wrapper msg len */
- struct lustre_msg *rq_repbuf; /* rep wrapper */
- int rq_repbuf_len; /* rep wrapper buf len */
+ char *rq_repbuf; /* rep buffer */
+ int rq_repbuf_len; /* rep buffer len */
+ struct lustre_msg *rq_repdata; /* rep wrapper msg */
int rq_repdata_len; /* rep wrapper msg len */
struct lustre_msg *rq_clrbuf; /* only in priv mode */
int rq_clrbuf_len; /* only in priv mode */
int rq_import_generation;
enum lustre_imp_state rq_send_state;
+ int rq_early_count; /* how many early replies (for stats) */
+
/* client+server request */
lnet_handle_md_t rq_req_md_h;
struct ptlrpc_cb_id rq_req_cbid;
void (*rq_commit_cb)(struct ptlrpc_request *);
void *rq_cb_data;
- struct ptlrpc_bulk_desc *rq_bulk; /* client side bulk */
- time_t rq_sent; /* when request sent, seconds,
- * or time when request should
- * be sent */
+ struct ptlrpc_bulk_desc *rq_bulk;/* client side bulk */
+
+ /* client outgoing req */
+ time_t rq_sent; /* when request/reply sent (secs), or
+ * time when request should be sent */
+
+ volatile time_t rq_deadline; /* when request must finish. volatile
+ so that servers' early reply updates to the deadline aren't
+ kept in per-cpu cache */
+ int rq_timeout; /* service time estimate (secs) */
+
/* Multi-rpc bits */
struct list_head rq_set_chain;
struct ptlrpc_request_set *rq_set;
int srv_n_difficult_replies; /* # 'difficult' replies */
int srv_n_active_reqs; /* # reqs being served */
cfs_duration_t srv_rqbd_timeout; /* timeout before re-posting reqs, in tick */
- int srv_watchdog_timeout; /* soft watchdog timeout, in ms */
+ int srv_watchdog_factor; /* soft watchdog timeout multiplier */
unsigned srv_cpu_affinity:1; /* bind threads to CPUs */
+ unsigned srv_at_check:1; /* check early replies */
+ cfs_time_t srv_at_checktime; /* debug */
__u32 srv_req_portal;
__u32 srv_rep_portal;
- int srv_n_queued_reqs; /* # reqs waiting to be served */
+ /* AT stuff */
+ struct adaptive_timeout srv_at_estimate;/* estimated rpc service time */
+ spinlock_t srv_at_lock;
+ struct list_head srv_at_list; /* reqs waiting for replies */
+ cfs_timer_t srv_at_timer; /* early reply timer */
+
+ int srv_n_queued_reqs; /* # reqs in either of the queues below */
+ struct list_head srv_req_in_queue; /* incoming reqs */
struct list_head srv_request_queue; /* reqs waiting for service */
struct list_head srv_request_history; /* request history */
return (rc);
}
-int ptlrpc_send_reply(struct ptlrpc_request *req, int);
+#define PTLRPC_REPLY_MAYBE_DIFFICULT 0x01
+#define PTLRPC_REPLY_EARLY 0x02
+int ptlrpc_send_reply(struct ptlrpc_request *req, int flags);
int ptlrpc_reply(struct ptlrpc_request *req);
int ptlrpc_send_error(struct ptlrpc_request *req, int difficult);
int ptlrpc_error(struct ptlrpc_request *req);
void ptlrpc_resend_req(struct ptlrpc_request *request);
+int ptlrpc_at_get_net_latency(struct ptlrpc_request *req);
int ptl_send_rpc(struct ptlrpc_request *request, int noreply);
int ptlrpc_register_rqbd (struct ptlrpc_request_buffer_desc *rqbd);
struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid);
static inline int
-ptlrpc_client_receiving_reply (struct ptlrpc_request *req)
+ptlrpc_client_recv_or_unlink (struct ptlrpc_request *req)
{
int rc;
spin_lock(&req->rq_lock);
- rc = req->rq_receiving_reply;
- spin_unlock(&req->rq_lock);
- return (rc);
-}
-
-static inline int
-ptlrpc_client_replied (struct ptlrpc_request *req)
-{
- int rc;
-
- spin_lock(&req->rq_lock);
- rc = req->rq_replied;
+ rc = req->rq_receiving_reply || req->rq_must_unlink;
spin_unlock(&req->rq_lock);
return (rc);
}
void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq);
struct ptlrpc_request_pool *ptlrpc_init_rq_pool(int, int,
void (*populate_pool)(struct ptlrpc_request_pool *, int));
+void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req);
struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp,
const struct req_format *format);
struct ptlrpc_request *ptlrpc_request_alloc_pool(struct obd_import *imp,
int psc_max_reply_size;
int psc_req_portal;
int psc_rep_portal;
- int psc_watchdog_timeout; /* in ms */
+ int psc_watchdog_factor;
int psc_min_threads;
int psc_max_threads;
__u32 psc_ctx_tags;
struct ptlrpc_service *ptlrpc_init_svc(int nbufs, int bufsize, int max_req_size,
int max_reply_size,
int req_portal, int rep_portal,
- int watchdog_timeout, /* in ms */
+ int watchdog_factor,
svc_handler_t, char *name,
cfs_proc_dir_entry_t *proc_entry,
svcreq_printfn_t,
int lustre_pack_reply(struct ptlrpc_request *, int count, int *lens,
char **bufs);
int lustre_pack_reply_v2(struct ptlrpc_request *req, int count,
- int *lens, char **bufs);
+ int *lens, char **bufs, int flags);
+#define LPRFL_EARLY_REPLY 1
+int lustre_pack_reply_flags(struct ptlrpc_request *, int count, int *lens,
+ char **bufs, int flags);
int lustre_shrink_msg(struct lustre_msg *msg, int segment,
unsigned int newlen, int move_data);
void lustre_free_reply_state(struct ptlrpc_reply_state *rs);
int lustre_msg_size(__u32 magic, int count, int *lengths);
int lustre_msg_size_v2(int count, int *lengths);
int lustre_packed_msg_size(struct lustre_msg *msg);
+int lustre_msg_early_size(void);
int lustre_unpack_msg(struct lustre_msg *m, int len);
void *lustre_msg_buf_v2(struct lustre_msg_v2 *m, int n, int min_size);
void *lustre_msg_buf(struct lustre_msg *m, int n, int minlen);
void *swabber);
void *lustre_swab_repbuf(struct ptlrpc_request *req, int n, int minlen,
void *swabber);
+__u32 lustre_msghdr_get_flags(struct lustre_msg *msg);
+void lustre_msghdr_set_flags(struct lustre_msg *msg, __u32 flags);
__u32 lustre_msg_get_flags(struct lustre_msg *msg);
void lustre_msg_add_flags(struct lustre_msg *msg, int flags);
void lustre_msg_set_flags(struct lustre_msg *msg, int flags);
void lustre_msg_set_limit(struct lustre_msg *msg, __u64 limit);
int lustre_msg_get_status(struct lustre_msg *msg);
__u32 lustre_msg_get_conn_cnt(struct lustre_msg *msg);
+int lustre_msg_is_v1(struct lustre_msg *msg);
__u32 lustre_msg_get_magic(struct lustre_msg *msg);
+__u32 lustre_msg_get_timeout(struct lustre_msg *msg);
+__u32 lustre_msg_get_service_time(struct lustre_msg *msg);
+__u32 lustre_msg_get_cksum(struct lustre_msg *msg);
+__u32 lustre_msg_calc_cksum(struct lustre_msg *msg);
void lustre_msg_set_handle(struct lustre_msg *msg,struct lustre_handle *handle);
void lustre_msg_set_type(struct lustre_msg *msg, __u32 type);
void lustre_msg_set_opc(struct lustre_msg *msg, __u32 opc);
void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt);
void ptlrpc_req_set_repsize(struct ptlrpc_request *req, int count, int *sizes);
void ptlrpc_request_set_replen(struct ptlrpc_request *req);
+void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout);
+void lustre_msg_set_service_time(struct lustre_msg *msg, __u32 service_time);
+void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum);
static inline void
lustre_shrink_reply(struct ptlrpc_request *req, int segment,
lustre_free_reply_state(rs);
}
+/* Should only be called once per req */
+static inline void ptlrpc_req_drop_rs(struct ptlrpc_request *req)
+{
+ if (req->rq_reply_state == NULL)
+ return; /* shouldn't occur */
+ ptlrpc_rs_decref(req->rq_reply_state);
+ req->rq_reply_state = NULL;
+ req->rq_repmsg = NULL;
+}
+
static inline __u32 lustre_request_magic(struct ptlrpc_request *req)
{
return lustre_msg_get_magic(req->rq_reqmsg);
void sptlrpc_cli_free_repbuf(struct ptlrpc_request *req);
int sptlrpc_cli_enlarge_reqbuf(struct ptlrpc_request *req,
int segment, int newsize);
+int sptlrpc_cli_unwrap_early_reply(struct ptlrpc_request *req);
+int sptlrpc_cli_finish_early_reply(struct ptlrpc_request *req);
+
void sptlrpc_request_out_callback(struct ptlrpc_request *req);
/*
spinlock_t obd_uncommitted_replies_lock;
cfs_timer_t obd_recovery_timer;
time_t obd_recovery_start; /* seconds */
- time_t obd_recovery_end; /* seconds */
+ time_t obd_recovery_end; /* seconds, for lprocfs_status */
time_t obd_recovery_max_time; /* seconds, bz13079 */
+ int obd_recovery_timeout;
/* new recovery stuff from CMD2 */
struct target_recovery_data obd_recovery_data;
extern unsigned int obd_debug_peer_on_timeout;
extern unsigned int obd_dump_on_timeout;
extern unsigned int obd_dump_on_eviction;
+/* obd_timeout should only be used for recovery, not for
+ networking / disk / timings affected by load (use Adaptive Timeouts) */
extern unsigned int obd_timeout; /* seconds */
-#define PING_INTERVAL max(obd_timeout / 4, 1U)
-#define RECONNECT_INTERVAL max(obd_timeout / 10, 10U)
-extern unsigned int ldlm_timeout;
-extern unsigned int obd_health_check_timeout;
+extern unsigned int ldlm_timeout; /* seconds */
extern unsigned int obd_sync_filter;
extern unsigned int obd_max_dirty_pages;
extern atomic_t obd_dirty_pages;
extern unsigned int obd_alloc_fail_rate;
int __obd_fail_check_set(__u32 id, __u32 value, int set);
+int __obd_fail_timeout_set(__u32 id, __u32 value, int ms, int set);
/* lvfs.c */
int obd_alloc_fail(const void *ptr, const char *name, const char *type,
size_t size, const char *file, int line);
/* Timeout definitions */
-#define LDLM_TIMEOUT_DEFAULT 20
#define OBD_TIMEOUT_DEFAULT 100
-#define HEALTH_CHECK_COEF 3 / 2
-#define HEALTH_CHECK_TIMEOUT_DEFAULT (OBD_TIMEOUT_DEFAULT * HEALTH_CHECK_COEF)
-#define HEALTH_CHECK_TIMEOUT (obd_timeout * HEALTH_CHECK_COEF)
+#define LDLM_TIMEOUT_DEFAULT 20
+/* Time to wait for all clients to reconnect during recovery */
+/* Should be very conservative; must catch the first reconnect after reboot */
+#define OBD_RECOVERY_FACTOR (3) /* times obd_timeout */
+/* Change recovery-small 26b time if you change this */
+#define PING_INTERVAL max(obd_timeout / 4, 1U)
+/* Client may skip 1 ping; we must wait at least 2.5. But for multiple
+ * failover targets the client only pings one server at a time, and pings
+ * can be lost on a loaded network. Since eviction has serious consequences,
+ * and there's no urgent need to evict a client just because it's idle, we
+ * should be very conservative here. */
+#define PING_EVICT_TIMEOUT (PING_INTERVAL * 6)
+#define DISK_TIMEOUT 50 /* Beyond this we warn about disk speed */
+#define CONNECTION_SWITCH_MIN 5U /* Connection switching rate limiter */
+ /* Max connect interval for nonresponsive servers; ~50s to avoid building up
+ connect requests in the LND queues, but within obd_timeout so we don't
+ miss the recovery window */
+#define CONNECTION_SWITCH_MAX min(50U, max(CONNECTION_SWITCH_MIN,obd_timeout))
+#define CONNECTION_SWITCH_INC 5 /* Connection timeout backoff */
+#ifndef CRAY_XT3
+/* In general this should be low to have quick detection of a system
+ running on a backup server. (If it's too low, import_select_connection
+ will increase the timeout anyhow.) */
+#define INITIAL_CONNECT_TIMEOUT max(CONNECTION_SWITCH_MIN,obd_timeout/20)
+#else
+/* ...but for very large systems (e.g. CRAY) we need to keep the initial
+ connect t.o. high (bz 10803), because they will nearly ALWAYS be doing the
+ connects for the first time (clients "reboot" after every process, so no
+ chance to generate adaptive timeout data). */
+#define INITIAL_CONNECT_TIMEOUT max(CONNECTION_SWITCH_MIN,obd_timeout/2)
+#endif
+#define LONG_UNLINK 300 /* Unlink should happen before now */
+
#define OBD_FAIL_MDS 0x100
#define OBD_FAIL_MDS_HANDLE_UNPACK 0x101
#define OBD_FAIL_OST_BRW_READ_BULK 0x20f
#define OBD_FAIL_OST_SYNC_NET 0x210
#define OBD_FAIL_OST_ALL_REPLY_NET 0x211
-#define OBD_FAIL_OST_ALL_REQUESTS_NET 0x212
+#define OBD_FAIL_OST_ALL_REQUEST_NET 0x212
#define OBD_FAIL_OST_LDLM_REPLY_NET 0x213
#define OBD_FAIL_OST_BRW_PAUSE_BULK 0x214
#define OBD_FAIL_OST_ENOSPC 0x215
#define OBD_FAIL_OST_BRW_WRITE_BULK2 0x220
#define OBD_FAIL_OST_LLOG_RECOVERY_TIMEOUT 0x221
#define OBD_FAIL_OST_CANCEL_COOKIE_TIMEOUT 0x222
+#define OBD_FAIL_OST_PAUSE_CREATE 0x223
+#define OBD_FAIL_OST_BRW_PAUSE_PACK 0x224
#define OBD_FAIL_OST_CONNECT_NET2 0x225
#define OBD_FAIL_LDLM 0x300
#define OBD_FAIL_LDLM_GLIMPSE 0x30f
#define OBD_FAIL_LDLM_CANCEL_RACE 0x310
#define OBD_FAIL_LDLM_CANCEL_EVICT_RACE 0x311
-/*
#define OBD_FAIL_LDLM_PAUSE_CANCEL 0x312
-*/
#define OBD_FAIL_LDLM_CLOSE_THREAD 0x313
#define OBD_FAIL_LDLM_CANCEL_BL_CB_RACE 0x314
#define OBD_FAIL_PTLRPC_DROP_RPC 0x505
#define OBD_FAIL_PTLRPC_DELAY_SEND 0x506
#define OBD_FAIL_PTLRPC_DELAY_RECOV 0x507
+#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB 0x508
+#define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a
+#define OBD_FAIL_PTLRPC_PAUSE_REP 0x50c
#define OBD_FAIL_OBD_PING_NET 0x600
#define OBD_FAIL_OBD_LOG_CANCEL_NET 0x601
#define OBD_FAIL_TGT_DELAY_RECONNECT 0x704
#define OBD_FAIL_TGT_DELAY_PRECREATE 0x705
#define OBD_FAIL_TGT_TOOMANY_THREADS 0x706
+#define OBD_FAIL_TGT_REPLAY_DROP 0x707
#define OBD_FAIL_MDC_REVALIDATE_PAUSE 0x800
#define OBD_FAIL_MDC_ENQUEUE_PAUSE 0x801
+#define OBD_FAIL_MDC_OLD_EXT_FLAGS 0x802
#define OBD_FAIL_MDC_GETATTR_ENQUEUE 0x803
#define OBD_FAIL_MGS 0x900
#define OBD_FAIL_MGS_ALL_REQUEST_NET 0x901
#define OBD_FAIL_MGS_ALL_REPLY_NET 0x902
-#define OBD_FAIL_MGC_PROCESS_LOG 0x903
-#define OBD_FAIL_MGS_SLOW_REQUEST_NET 0x904
-#define OBD_FAIL_MGS_SLOW_TARGET_REG 0x905
+#define OBD_FAIL_MGC_PAUSE_PROCESS_LOG 0x903
+#define OBD_FAIL_MGS_PAUSE_REQ 0x904
+#define OBD_FAIL_MGS_PAUSE_TARGET_REG 0x905
-#define OBD_FAIL_QUOTA_QD_COUNT_32BIT 0xa00
+#define OBD_FAIL_QUOTA_QD_COUNT_32BIT 0xA00
-#define OBD_FAIL_LPROC_REMOVE 0xb00
+#define OBD_FAIL_LPROC_REMOVE 0xB00
-#define OBD_FAIL_GENERAL_ALLOC 0xc00
+#define OBD_FAIL_GENERAL_ALLOC 0xC00
#define OBD_FAIL_SEQ 0x1000
#define OBD_FAIL_SEQ_QUERY_NET 0x1001
#define OBD_FAIL_SEC_CTX_INIT_NET 0x1201
#define OBD_FAIL_SEC_CTX_INIT_CONT_NET 0x1202
#define OBD_FAIL_SEC_CTX_FINI_NET 0x1203
+#define OBD_FAIL_SEC_CTX_HDL_PAUSE 0x1204
/* Failure injection control */
#define OBD_FAIL_MASK_SYS 0x0000FF00
obd_fail_check_set(id, value, OBD_FAIL_LOC_RESET)
-static inline int obd_fail_timeout_set(__u32 id, __u32 value, int secs, int set)
+static inline int obd_fail_timeout_set(__u32 id, __u32 value, int ms, int set)
{
- int ret = 0;
- if (unlikely(OBD_FAIL_PRECHECK(id) &&
- (ret = __obd_fail_check_set(id, value, set)))) {
- CERROR("obd_fail_timeout id %x sleeping for %d secs\n",
- id, secs);
- set_current_state(TASK_UNINTERRUPTIBLE);
- cfs_schedule_timeout(CFS_TASK_UNINT, cfs_time_seconds(secs));
- set_current_state(TASK_RUNNING);
- CERROR("obd_fail_timeout id %x awake\n", id);
- }
- return ret;
+ if (unlikely(OBD_FAIL_PRECHECK(id)))
+ return __obd_fail_timeout_set(id, value, ms, set);
+ else
+ return 0;
}
-/* If id hit obd_fail_loc, sleep secs */
+/* If id hit obd_fail_loc, sleep for seconds or milliseconds */
#define OBD_FAIL_TIMEOUT(id, secs) \
- obd_fail_timeout_set(id, 0, secs, OBD_FAIL_LOC_NOSET)
+ obd_fail_timeout_set(id, 0, secs * 1000, OBD_FAIL_LOC_NOSET)
+
+#define OBD_FAIL_TIMEOUT_MS(id, ms) \
+ obd_fail_timeout_set(id, 0, ms, OBD_FAIL_LOC_NOSET)
-/* If id hit obd_fail_loc, obd_fail_loc |= value and sleep secs */
+/* If id hit obd_fail_loc, obd_fail_loc |= value and
+ * sleep seconds or milliseconds */
#define OBD_FAIL_TIMEOUT_ORSET(id, value, secs) \
- obd_fail_timeout_set(id, value, secs, OBD_FAIL_LOC_ORSET)
+ obd_fail_timeout_set(id, value, secs * 1000, OBD_FAIL_LOC_ORSET)
+
+#define OBD_FAIL_TIMEOUT_MS_ORSET(id, value, ms) \
+ obd_fail_timeout_set(id, value, ms, OBD_FAIL_LOC_ORSET)
#ifdef __KERNEL__
static inline void obd_fail_write(int id, struct super_block *sb)
int count, int max, int cancel_flags, int flags);
int ldlm_cancel_lru_estimate(struct ldlm_namespace *ns, int count, int max,
int flags);
+extern int ldlm_enqueue_min;
+int ldlm_get_enq_timeout(struct ldlm_lock *lock);
/* ldlm_resource.c */
int ldlm_resource_putref_locked(struct ldlm_resource *res);
spin_unlock(&exp->exp_lock);
}
EXPORT_SYMBOL(target_client_add_cb);
+static void
+target_start_and_reset_recovery_timer(struct obd_device *obd,
+ struct ptlrpc_request *req,
+ int new_client);
int target_handle_connect(struct ptlrpc_request *req)
{
(time_t)cfs_time_current_sec());
}
- /* We want to handle EALREADY but *not* -EALREADY from
- * target_handle_reconnect(), return reconnection state in a flag */
- if (rc == EALREADY) {
- lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECONNECT);
- rc = 0;
- } else if (rc) {
+ if (rc < 0) {
GOTO(out, rc);
}
- /* Tell the client if we're in recovery. */
- /* If this is the first client, start the recovery timer */
+
CWARN("%s: connection from %s@%s %st"LPU64" exp %p cur %ld last %ld\n",
target->obd_name, cluuid.uuid, libcfs_nid2str(req->rq_peer.nid),
target->obd_recovering ? "recovering/" : "", data->ocd_transno,
export, (long)cfs_time_current_sec(),
export ? (long)export->exp_last_request_time : 0);
-
+ /* Tell the client if we're in recovery. */
if (target->obd_recovering) {
lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECOVERING);
- target_start_recovery_timer(target);
+ /* If this is the first time a client connects,
+ reset the recovery timer */
+ if (rc == 0)
+ target_start_and_reset_recovery_timer(target, req,
+ !export);
+ }
+
+ /* We want to handle EALREADY but *not* -EALREADY from
+ * target_handle_reconnect(), return reconnection state in a flag */
+ if (rc == EALREADY) {
+ lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECONNECT);
+ rc = 0;
+ } else {
+ LASSERT(rc == 0);
}
/* Tell the client if we support replayable requests */
revimp->imp_state = LUSTRE_IMP_FULL;
revimp->imp_msg_magic = req->rq_reqmsg->lm_magic;
+ if ((export->exp_connect_flags & OBD_CONNECT_AT) &&
+ (revimp->imp_msg_magic != LUSTRE_MSG_MAGIC_V1))
+ revimp->imp_msghdr_flags |= MSGHDR_AT_SUPPORT;
+ else
+ revimp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT;
+
rc = sptlrpc_import_sec_adapt(revimp, req->rq_svc_ctx,
req->rq_flvr.sf_rpc);
if (rc) {
class_export_get(copy_req->rq_export);
CFS_INIT_LIST_HEAD(©_req->rq_list);
+ CFS_INIT_LIST_HEAD(©_req->rq_replay_list);
sptlrpc_svc_ctx_addref(copy_req);
if (copy_req->rq_reply_state) {
return copy_req;
}
-void ptlrpc_free_clone( struct ptlrpc_request *req)
+void ptlrpc_free_clone(struct ptlrpc_request *req)
{
- if (req->rq_reply_state) {
- ptlrpc_rs_decref(req->rq_reply_state);
- req->rq_reply_state = NULL;
- }
+ LASSERT(list_empty(&req->rq_replay_list));
+ ptlrpc_req_drop_rs(req);
sptlrpc_svc_ctx_decref(req);
class_export_put(req->rq_export);
list_del(&req->rq_list);
OBD_FREE_PTR(req);
}
+static int target_exp_enqueue_req_replay(struct ptlrpc_request *req)
+{
+ __u64 transno = lustre_msg_get_transno(req->rq_reqmsg);
+ struct obd_export *exp = req->rq_export;
+ struct ptlrpc_request *reqiter;
+ int dup = 0;
+
+ LASSERT(exp);
+
+ spin_lock(&exp->exp_lock);
+ list_for_each_entry(reqiter, &exp->exp_req_replay_queue,
+ rq_replay_list) {
+ if (lustre_msg_get_transno(reqiter->rq_reqmsg) == transno) {
+ dup = 1;
+ break;
+ }
+ }
+
+ if (dup) {
+ /* we expect it with RESENT and REPLAY flags */
+ if ((lustre_msg_get_flags(req->rq_reqmsg) &
+ (MSG_RESENT | MSG_REPLAY)) != (MSG_RESENT | MSG_REPLAY))
+ CERROR("invalid flags %x of resent replay\n",
+ lustre_msg_get_flags(req->rq_reqmsg));
+ } else {
+ list_add_tail(&req->rq_replay_list, &exp->exp_req_replay_queue);
+ }
+
+ spin_unlock(&exp->exp_lock);
+ return dup;
+}
+
+static void target_exp_dequeue_req_replay(struct ptlrpc_request *req)
+{
+ LASSERT(!list_empty(&req->rq_replay_list));
+ LASSERT(req->rq_export);
+
+ spin_lock(&req->rq_export->exp_lock);
+ list_del_init(&req->rq_replay_list);
+ spin_unlock(&req->rq_export->exp_lock);
+}
+
#ifdef __KERNEL__
static void target_finish_recovery(struct obd_device *obd)
{
DEBUG_REQ(D_ERROR, req,
"failed abort_req_reply; skipping");
}
+ target_exp_dequeue_req_replay(req);
ptlrpc_free_clone(req);
}
}
list_for_each_entry_safe(req, n, &obd->obd_req_replay_queue, rq_list) {
LASSERT (req->rq_reply_state == 0);
+ target_exp_dequeue_req_replay(req);
ptlrpc_free_clone(req);
}
list_for_each_entry_safe(req, n, &obd->obd_lock_replay_queue, rq_list){
static void target_recovery_expired(unsigned long castmeharder)
{
struct obd_device *obd = (struct obd_device *)castmeharder;
- CERROR("%s: recovery timed out, aborting\n", obd->obd_name);
+ LCONSOLE_WARN("%s: recovery timed out; %d clients never reconnected "
+ "after %lds (%d clients did)\n",
+ obd->obd_name, obd->obd_recoverable_clients,
+ cfs_time_current_sec()- obd->obd_recovery_start,
+ obd->obd_connected_clients);
spin_lock_bh(&obd->obd_processing_task_lock);
if (obd->obd_recovering)
obd->obd_abort_recovery = 1;
CDEBUG(D_HA, "%s: cancel recovery timer\n", obd->obd_name);
cfs_timer_disarm(&obd->obd_recovery_timer);
}
-
-static void reset_recovery_timer(struct obd_device *obd)
+
+/* extend = 1 means require at least "duration" seconds left in the timer,
+ extend = 0 means set the total duration (start_recovery_timer) */
+static void reset_recovery_timer(struct obd_device *obd, int duration,
+ int extend)
{
- time_t timeout_shift = OBD_RECOVERY_TIMEOUT;
+ cfs_time_t now = cfs_time_current_sec();
+ cfs_duration_t left;
+
spin_lock_bh(&obd->obd_processing_task_lock);
- if (!obd->obd_recovering) {
+ if (!obd->obd_recovering || obd->obd_abort_recovery) {
spin_unlock_bh(&obd->obd_processing_task_lock);
return;
}
- if (cfs_time_current_sec() + OBD_RECOVERY_TIMEOUT >
- obd->obd_recovery_start + obd->obd_recovery_max_time)
- timeout_shift = obd->obd_recovery_start +
- obd->obd_recovery_max_time - cfs_time_current_sec();
- cfs_timer_arm(&obd->obd_recovery_timer, cfs_time_shift(timeout_shift));
+
+ /* seconds remaining before the current recovery deadline */
+ left = cfs_time_sub(obd->obd_recovery_end, now);
+
+ if (extend && (duration > left))
+ obd->obd_recovery_timeout += duration - left;
+ else if (!extend && (duration > obd->obd_recovery_timeout))
+ /* Track the client's largest expected replay time */
+ obd->obd_recovery_timeout = duration;
+#ifdef CRAY_XT3
+ /*
+ * If total recovery time already exceed the
+ * obd_recovery_max_time, then CRAY XT3 will
+ * abort the recovery
+ */
+ if(obd->obd_recovery_timeout > obd->obd_recovery_max_time)
+ obd->obd_recovery_timeout = obd->obd_recovery_max_time;
+#endif
+ obd->obd_recovery_end = obd->obd_recovery_start +
+ obd->obd_recovery_timeout;
+ /* (re)arm the timer only while the deadline is still in the future */
+ if (!cfs_timer_is_armed(&obd->obd_recovery_timer) ||
+ cfs_time_before(now, obd->obd_recovery_end)) {
+ left = cfs_time_sub(obd->obd_recovery_end, now);
+ cfs_timer_arm(&obd->obd_recovery_timer, cfs_time_shift(left));
+ }
spin_unlock_bh(&obd->obd_processing_task_lock);
- CDEBUG(D_HA, "%s: timer will expire in %u seconds\n", obd->obd_name,
- (unsigned int)timeout_shift);
- /* Only used for lprocfs_status */
- obd->obd_recovery_end = cfs_time_current_sec() + timeout_shift;
+ CDEBUG(D_HA, "%s: recovery timer will expire in %u seconds\n",
+ obd->obd_name, (unsigned)left);
}
+/* Re-arm the recovery timer when a new replay stage begins; the timer must
+ * be disarmed on entry (enforced by the LASSERT). */
+static void resume_recovery_timer(struct obd_device *obd)
+{
+ LASSERT(!cfs_timer_is_armed(&obd->obd_recovery_timer));
+
+ /* to be safe, make it at least OBD_RECOVERY_FACTOR * obd_timeout */
+ reset_recovery_timer(obd, OBD_RECOVERY_FACTOR * obd_timeout, 1);
+}
-/* Only start it the first time called */
-void target_start_recovery_timer(struct obd_device *obd)
+/* Start the recovery timer on the first client connection; later calls are
+ * no-ops while the timer is already armed. */
+static void check_and_start_recovery_timer(struct obd_device *obd)
{
spin_lock_bh(&obd->obd_processing_task_lock);
- if (obd->obd_recovery_handler
- || timer_pending((struct timer_list *)&obd->obd_recovery_timer)) {
+ if (cfs_timer_is_armed(&obd->obd_recovery_timer)) {
spin_unlock_bh(&obd->obd_processing_task_lock);
return;
}
- CWARN("%s: starting recovery timer (%us)\n", obd->obd_name,
- OBD_RECOVERY_TIMEOUT);
- cfs_timer_init(&obd->obd_recovery_timer, target_recovery_expired, obd);
+ CWARN("%s: starting recovery timer\n", obd->obd_name);
+ obd->obd_recovery_start = cfs_time_current_sec();
+ /* minimum */
+ obd->obd_recovery_timeout = OBD_RECOVERY_FACTOR * obd_timeout;
spin_unlock_bh(&obd->obd_processing_task_lock);
- reset_recovery_timer(obd, obd->obd_recovery_timeout, 0);
+ reset_recovery_timer(obd, obd->obd_recovery_timeout, 0);
+}
+
+/* Reset the timer with each new client connection */
+/*
+ * This timer is actually reconnect_timer, which is for making sure
+ * the total recovery window is at least as big as my reconnect
+ * attempt timing. So the initial recovery time_out will be set to
+ * OBD_RECOVERY_FACTOR * obd_timeout. If the timeout coming
+ * from client is bigger than this, then the recovery time_out will
+ * be extend to make sure the client could be reconnected, in the
+ * process, the timeout from the new client should be ignored.
+ */
+
+static void
+target_start_and_reset_recovery_timer(struct obd_device *obd,
+ struct ptlrpc_request *req,
+ int new_client)
+{
+ int req_timeout = OBD_RECOVERY_FACTOR *
+ lustre_msg_get_timeout(req->rq_reqmsg);
+
+ check_and_start_recovery_timer(obd);
+
+ /* only previously-connected clients may widen the window (see above) */
+ if (req_timeout > obd->obd_recovery_timeout && !new_client)
+ reset_recovery_timer(obd, req_timeout, 0);
+}
#ifdef __KERNEL__
} else if (!list_empty(&obd->obd_req_replay_queue)) {
req = list_entry(obd->obd_req_replay_queue.next,
struct ptlrpc_request, rq_list);
+ target_exp_dequeue_req_replay(req);
list_del_init(&req->rq_list);
obd->obd_requests_queued_for_recovery--;
} else {
/* don't reset timer for final stage */
if (!req_replay_done(req->rq_export) ||
!lock_replay_done(req->rq_export))
- reset_recovery_timer(class_exp2obd(req->rq_export));
+ /* OBD_RECOVERY_FACTOR must scale both the AT estimate and
+ the static obd_timeout fallback, so parenthesize the
+ conditional ( ?: binds looser than * ) */
+ reset_recovery_timer(class_exp2obd(req->rq_export),
+ OBD_RECOVERY_FACTOR * (AT_OFF ? obd_timeout :
+ at_get(&req->rq_rqbd->rqbd_service->srv_at_estimate)), 1);
ptlrpc_free_clone(req);
RETURN(0);
}
CDEBUG(D_INFO, "1: request replay stage - %d clients from t"LPU64"\n",
atomic_read(&obd->obd_req_replay_clients),
obd->obd_next_recovery_transno);
+ resume_recovery_timer(obd);
while ((req = target_next_replay_req(obd))) {
LASSERT(trd->trd_processing_task == current->pid);
DEBUG_REQ(D_HA, req, "processing t"LPD64" from %s",
class_disconnect_stale_exports(obd, req_replay_done);
abort_req_replay_queue(obd);
}
+
/* The second stage: replay locks */
CDEBUG(D_INFO, "2: lock replay stage - %d clients\n",
atomic_read(&obd->obd_lock_replay_clients));
+ resume_recovery_timer(obd);
while ((req = target_next_replay_lock(obd))) {
LASSERT(trd->trd_processing_task == current->pid);
DEBUG_REQ(D_HA|D_WARNING, req, "processing lock from %s: ",
"last_transno "LPU64"\n", obd->obd_name,
obd->obd_max_recoverable_clients, obd->obd_last_committed);
obd->obd_next_recovery_transno = obd->obd_last_committed + 1;
- target_start_recovery_thread(obd, handler);
- obd->obd_recovery_start = cfs_time_current_sec();
- /* Only used for lprocfs_status */
- obd->obd_recovery_end = obd->obd_recovery_start + OBD_RECOVERY_TIMEOUT;
+ obd->obd_recovery_start = 0;
+ obd->obd_recovery_end = 0;
+ obd->obd_recovery_timeout = OBD_RECOVERY_FACTOR * obd_timeout;
/* bz13079: this should be set to desired value for ost but not for mds */
obd->obd_recovery_max_time = OBD_RECOVERY_MAX_TIME;
+ cfs_timer_init(&obd->obd_recovery_timer, target_recovery_expired, obd);
+ target_start_recovery_thread(obd, handler);
}
EXPORT_SYMBOL(target_recovery_init);
}
spin_unlock_bh(&obd->obd_processing_task_lock);
- /* A resent, replayed request that is still on the queue; just drop it.
- The queued request will handle this. */
- if ((lustre_msg_get_flags(req->rq_reqmsg) & (MSG_RESENT|MSG_REPLAY)) ==
- (MSG_RESENT | MSG_REPLAY)) {
- DEBUG_REQ(D_ERROR, req, "dropping resent queued req");
+ if (OBD_FAIL_CHECK(OBD_FAIL_TGT_REPLAY_DROP))
RETURN(0);
- }
req = ptlrpc_clone_req(req);
if (req == NULL)
}
LASSERT(req->rq_export->exp_req_replay_needed);
+ if (target_exp_enqueue_req_replay(req)) {
+ spin_unlock_bh(&obd->obd_processing_task_lock);
+ DEBUG_REQ(D_ERROR, req, "dropping resent queued req");
+ ptlrpc_free_clone(req);
+ RETURN(0);
+ }
+
/* XXX O(n^2) */
list_for_each(tmp, &obd->obd_req_replay_queue) {
struct ptlrpc_request *reqiter =
inserted = 1;
break;
}
+
+ if (unlikely(lustre_msg_get_transno(reqiter->rq_reqmsg) ==
+ transno)) {
+ DEBUG_REQ(D_ERROR, req, "dropping replay: transno "
+ "has been claimed by another client");
+ spin_unlock_bh(&obd->obd_processing_task_lock);
+ target_exp_dequeue_req_replay(req);
+ ptlrpc_free_clone(req);
+ RETURN(0);
+ }
}
if (!inserted)
wake_up(&obd->obd_next_transno_waitq);
spin_unlock_bh(&obd->obd_processing_task_lock);
RETURN(0);
-
}
struct obd_device * target_req2obd(struct ptlrpc_request *req)
DEBUG_REQ(D_NET, req, "sending reply");
}
- return (ptlrpc_send_reply(req, 1));
+ return (ptlrpc_send_reply(req, PTLRPC_REPLY_MAYBE_DIFFICULT));
}
void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id)
lustre_msg_set_last_committed(req->rq_repmsg,
obd->obd_last_committed);
else
- DEBUG_REQ(D_IOCTL, req, "not sending last_committed update");
+ DEBUG_REQ(D_IOCTL, req, "not sending last_committed update (%d/"
+ "%d)", obd->obd_no_transno, req->rq_repmsg == NULL);
CDEBUG(D_INFO, "last_committed "LPU64", transno "LPU64", xid "LPU64"\n",
obd->obd_last_committed, req->rq_transno, req->rq_xid);
return cfs_time_seconds((int)cfs_duration_sec(cfs_time_sub(timeout, 0)) + 1);
}
-/* timeout for initial callback (AST) reply */
-static inline unsigned int ldlm_get_rq_timeout(unsigned int ldlm_timeout,
- unsigned int obd_timeout)
+/* timeout for initial callback (AST) reply (bz10399) */
+static inline unsigned int ldlm_get_rq_timeout(void)
{
+ /* Non-AT value */
unsigned int timeout = min(ldlm_timeout, obd_timeout / 3);
return timeout < 1 ? 1 : timeout;
goto repeat;
}
- LDLM_ERROR(lock, "lock callback timer expired: evicting client "
- "%s@%s nid %s\n",
- lock->l_export->exp_client_uuid.uuid,
- lock->l_export->exp_connection->c_remote_uuid.uuid,
- libcfs_nid2str(lock->l_export->exp_connection->c_peer.nid));
+ LDLM_ERROR(lock, "lock callback timer expired after %lds: "
+ "evicting client at %s ",
+ cfs_time_current_sec()- lock->l_enqueued_time.tv_sec,
+ libcfs_nid2str(
+ lock->l_export->exp_connection->c_peer.nid));
last = lock;
*/
static int __ldlm_add_waiting_lock(struct ldlm_lock *lock)
{
+ int timeout;
cfs_time_t timeout_rounded;
if (!list_empty(&lock->l_pending_chain))
return 0;
- lock->l_callback_timeout =cfs_time_add(cfs_time_current(),
- cfs_time_seconds(obd_timeout)/2);
+ /* adaptive, per-namespace estimate replaces the static obd_timeout/2 */
+ timeout = ldlm_get_enq_timeout(lock);
+
+ lock->l_callback_timeout = cfs_time_shift(timeout);
timeout_rounded = round_timeout(lock->l_callback_timeout);
- if (cfs_time_before(timeout_rounded, cfs_timer_deadline(&waiting_locks_timer)) ||
+ if (cfs_time_before(timeout_rounded,
+ cfs_timer_deadline(&waiting_locks_timer)) ||
!cfs_timer_is_armed(&waiting_locks_timer)) {
cfs_timer_arm(&waiting_locks_timer, timeout_rounded);
-
}
+ /* if the new lock has a shorter timeout than something earlier on
+ the list, we'll wait the longer amount of time; no big deal. */
list_add_tail(&lock->l_pending_chain, &waiting_locks_list); /* FIFO */
return 1;
}
}
req->rq_send_state = LUSTRE_IMP_FULL;
- req->rq_timeout = ldlm_get_rq_timeout(ldlm_timeout, obd_timeout);
+ /* ptlrpc_prep_req already set timeout */
+ if (AT_OFF)
+ req->rq_timeout = ldlm_get_rq_timeout();
if (lock->l_export && lock->l_export->exp_ldlm_stats)
lprocfs_counter_incr(lock->l_export->exp_ldlm_stats,
total_enqueue_wait = cfs_timeval_sub(&granted_time,
&lock->l_enqueued_time, NULL);
- if (total_enqueue_wait / 1000000 > obd_timeout)
+ if (total_enqueue_wait / ONE_MILLION > obd_timeout)
+ /* non-fatal with AT - change to LDLM_DEBUG? */
LDLM_ERROR(lock, "enqueue wait took %luus from "CFS_TIME_T,
total_enqueue_wait, lock->l_enqueued_time.tv_sec);
LDLM_DEBUG(lock, "server preparing completion AST (after %ldus wait)",
total_enqueue_wait);
+ /* Server-side enqueue wait time estimate, used in
+ __ldlm_add_waiting_lock to set future enqueue timers */
+ at_add(&lock->l_resource->lr_namespace->ns_at_estimate,
+ total_enqueue_wait / ONE_MILLION);
+
ptlrpc_request_set_replen(req);
+
req->rq_send_state = LUSTRE_IMP_FULL;
- req->rq_timeout = ldlm_get_rq_timeout(ldlm_timeout, obd_timeout);
+ /* ptlrpc_prep_req already set timeout */
+ if (AT_OFF)
+ req->rq_timeout = ldlm_get_rq_timeout();
/* We only send real blocking ASTs after the lock is granted */
lock_res_and_lock(lock);
req->rq_send_state = LUSTRE_IMP_FULL;
- req->rq_timeout = ldlm_get_rq_timeout(ldlm_timeout, obd_timeout);
+ /* ptlrpc_prep_req already set timeout */
+ if (AT_OFF)
+ req->rq_timeout = ldlm_get_rq_timeout();
if (lock->l_export && lock->l_export->exp_ldlm_stats)
lprocfs_counter_incr(lock->l_export->exp_ldlm_stats,
EXIT;
out:
req->rq_status = err;
- if (req->rq_reply_state == NULL) {
+ if (!req->rq_packed_final) {
err = lustre_pack_reply(req, 1, NULL, NULL);
if (rc == 0)
rc = err;
return rc;
}
-/* Cancel all the locks, which handles are packed into ldlm_request */
+/* Cancel all the locks whos handles are packed into ldlm_request */
int ldlm_request_cancel(struct ptlrpc_request *req,
const struct ldlm_request *dlm_req, int first)
{
return 0;
req->rq_status = rc;
- if (req->rq_reply_state == NULL) {
+ if (!req->rq_packed_final) {
rc = lustre_pack_reply(req, 1, NULL, NULL);
if (rc)
return rc;
ldlm_state->ldlm_cb_service =
ptlrpc_init_svc(LDLM_NBUFS, LDLM_BUFSIZE, LDLM_MAXREQSIZE,
LDLM_MAXREPSIZE, LDLM_CB_REQUEST_PORTAL,
- LDLM_CB_REPLY_PORTAL, ldlm_timeout * 900,
+ LDLM_CB_REPLY_PORTAL, 1800,
ldlm_callback_handler, "ldlm_cbd",
ldlm_svc_proc_dir, NULL,
ldlm_min_threads, ldlm_max_threads,
ldlm_state->ldlm_cancel_service =
ptlrpc_init_svc(LDLM_NBUFS, LDLM_BUFSIZE, LDLM_MAXREQSIZE,
LDLM_MAXREPSIZE, LDLM_CANCEL_REQUEST_PORTAL,
- LDLM_CANCEL_REPLY_PORTAL, ldlm_timeout * 6000,
+ LDLM_CANCEL_REPLY_PORTAL, 6000,
ldlm_cancel_handler, "ldlm_canceld",
ldlm_svc_proc_dir, NULL,
ldlm_min_threads, ldlm_max_threads,
#include "ldlm_internal.h"
+int ldlm_enqueue_min = OBD_TIMEOUT_DEFAULT;
+CFS_MODULE_PARM(ldlm_enqueue_min, "i", int, 0644,
+ "lock enqueue timeout minimum");
+
static void interrupted_completion_wait(void *data)
{
}
CFS_DURATION_T"s ago); not entering recovery in "
"server code, just going back to sleep",
lock->l_enqueued_time.tv_sec,
- cfs_time_current_sec() - lock->l_enqueued_time.tv_sec);
+ cfs_time_current_sec() -
+ lock->l_enqueued_time.tv_sec);
if (cfs_time_after(cfs_time_current(), next_dump)) {
last_dump = next_dump;
next_dump = cfs_time_shift(300);
RETURN(0);
}
+/* We use the same basis for both server side and client side functions
+ from a single node. */
+/* Returns the number of seconds to allow for a lock enqueue/callback:
+ obd_timeout/2 with adaptive timeouts off, otherwise 150% of the
+ namespace AT estimate, never below ldlm_enqueue_min. */
+int ldlm_get_enq_timeout(struct ldlm_lock *lock)
+{
+ int timeout = at_get(&lock->l_resource->lr_namespace->ns_at_estimate);
+ if (AT_OFF)
+ return obd_timeout / 2;
+ /* Since these are non-updating timeouts, we should be conservative.
+ It would be nice to have some kind of "early reply" mechanism for
+ lock callbacks too... */
+ timeout = timeout + (timeout >> 1); /* 150% */
+ return max(timeout, ldlm_enqueue_min);
+}
+
static int is_granted_or_cancelled(struct ldlm_lock *lock)
{
int ret = 0;
struct obd_device *obd;
struct obd_import *imp = NULL;
struct l_wait_info lwi;
+ __u32 timeout;
int rc = 0;
ENTRY;
obd = class_exp2obd(lock->l_conn_export);
/* if this is a local lock, then there is no import */
- if (obd != NULL)
+ if (obd != NULL) {
imp = obd->u.cli.cl_import;
+ }
+
+ /* Wait a long time for enqueue - server may have to callback a
+ lock from another client. Server will evict the other client if it
+ doesn't respond reasonably, and then give us the lock. */
+ timeout = ldlm_get_enq_timeout(lock) * 2;
lwd.lwd_lock = lock;
LDLM_DEBUG(lock, "waiting indefinitely because of NO_TIMEOUT");
lwi = LWI_INTR(interrupted_completion_wait, &lwd);
} else {
- lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(obd_timeout),
+ lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(timeout),
ldlm_expired_completion_wait,
interrupted_completion_wait, &lwd);
}
RETURN(rc);
}
- LDLM_DEBUG(lock, "client-side enqueue waking up: granted");
+ LDLM_DEBUG(lock, "client-side enqueue waking up: granted after %lds",
+ cfs_time_current_sec() - lock->l_enqueued_time.tv_sec);
+
+ /* Update our time estimate */
+ at_add(&lock->l_resource->lr_namespace->ns_at_estimate,
+ cfs_time_current_sec() - lock->l_enqueued_time.tv_sec);
+
RETURN(0);
}
LASSERT(exp != NULL);
LASSERT(count > 0);
+ OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_PAUSE_CANCEL, obd_fail_val);
+
if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_RACE))
RETURN(count);
req->rq_no_resend = 1;
req->rq_no_delay = 1;
- /* XXX FIXME bug 249 */
req->rq_request_portal = LDLM_CANCEL_REQUEST_PORTAL;
req->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL;
+ ptlrpc_at_set_req_timeout(req);
ldlm_cancel_pack(req, cancels, count);
GOTO(out_proc, rc);
}
+ at_init(&ns->ns_at_estimate, ldlm_enqueue_min, 0);
+
ldlm_namespace_register(ns, client);
RETURN(ns);
out_proc:
if (ocd == NULL)
GOTO(out_cleanup, rc = -ENOMEM);
- ocd->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_FID;
+ ocd->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_FID |
+ OBD_CONNECT_AT;
#ifdef LIBLUSTRE_POSIX_ACL
ocd->ocd_connect_flags |= OBD_CONNECT_ACL;
#endif
obd_timeout);
}
- /* debug peer on timeout? */
+ /* debug peer on timeout? */
envstr = getenv("LIBLUSTRE_DEBUG_PEER_ON_TIMEOUT");
if (envstr != NULL) {
obd_debug_peer_on_timeout =
sizeof(async), &async, NULL);
ocd.ocd_connect_flags = OBD_CONNECT_IBITS | OBD_CONNECT_VERSION |
- OBD_CONNECT_FID;
+ OBD_CONNECT_FID | OBD_CONNECT_AT;
#ifdef LIBLUSTRE_POSIX_ACL
ocd.ocd_connect_flags |= OBD_CONNECT_ACL;
#endif
ocd.ocd_connect_flags = OBD_CONNECT_SRVLOCK | OBD_CONNECT_REQPORTAL |
OBD_CONNECT_VERSION | OBD_CONNECT_TRUNCLOCK |
- OBD_CONNECT_FID;
+ OBD_CONNECT_FID | OBD_CONNECT_AT;
ocd.ocd_version = LUSTRE_VERSION_CODE;
err = obd_connect(NULL, &dt_conn, obd, &sbi->ll_sb_uuid, &ocd, NULL);
if (err) {
OBD_CONNECT_JOIN | OBD_CONNECT_ATTRFID |
OBD_CONNECT_VERSION | OBD_CONNECT_MDS_CAPA |
OBD_CONNECT_OSS_CAPA | OBD_CONNECT_CANCELSET|
- OBD_CONNECT_FID;
+ OBD_CONNECT_FID | OBD_CONNECT_AT;
#ifdef HAVE_LRU_RESIZE_SUPPORT
if (sbi->ll_flags & LL_SBI_LRU_RESIZE)
data->ocd_connect_flags = OBD_CONNECT_GRANT | OBD_CONNECT_VERSION |
OBD_CONNECT_REQPORTAL | OBD_CONNECT_BRW_SIZE |
OBD_CONNECT_CANCELSET | OBD_CONNECT_FID |
- OBD_CONNECT_SRVLOCK | OBD_CONNECT_TRUNCLOCK;
+ OBD_CONNECT_SRVLOCK | OBD_CONNECT_TRUNCLOCK|
+ OBD_CONNECT_AT;
if (sbi->ll_flags & LL_SBI_OSS_CAPA)
data->ocd_connect_flags |= OBD_CONNECT_OSS_CAPA;
}
EXPORT_SYMBOL(__obd_fail_check_set);
+/* Like __obd_fail_check_set(), but when the fail check fires also sleep
+ * uninterruptibly for "ms" milliseconds before returning. Returns the
+ * result of the underlying fail check. */
+int __obd_fail_timeout_set(__u32 id, __u32 value, int ms, int set)
+{
+ int ret = 0;
+
+ ret = __obd_fail_check_set(id, value, set);
+ if (ret) {
+ CERROR("obd_fail_timeout id %x sleeping for %dms\n",
+ id, ms);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ cfs_schedule_timeout(CFS_TASK_UNINT,
+ cfs_time_seconds(ms) / 1000);
+ set_current_state(TASK_RUNNING);
+ CERROR("obd_fail_timeout id %x awake\n", id);
+ }
+ return ret;
+}
+
#ifdef LPROCFS
void lprocfs_counter_add(struct lprocfs_stats *stats, int idx,
long amount)
{ "kbytesavail", lprocfs_rd_kbytesavail, 0, 0 },
{ "filestotal", lprocfs_rd_filestotal, 0, 0 },
{ "filesfree", lprocfs_rd_filesfree, 0, 0 },
- //{ "filegroups", lprocfs_rd_filegroups, 0, 0 },
+ /*{ "filegroups", lprocfs_rd_filegroups, 0, 0 },*/
{ "mds_server_uuid", lprocfs_rd_server_uuid, 0, 0 },
{ "mds_conn_uuid", lprocfs_rd_conn_uuid, 0, 0 },
{ "max_rpcs_in_flight", mdc_rd_max_rpcs_in_flight,
mdc_wr_max_rpcs_in_flight, 0 },
+ { "timeouts", lprocfs_rd_timeouts, 0, 0 },
{ 0 }
};
}
if (op_data->op_attr.ia_valid & ATTR_FROM_OPEN) {
- req->rq_request_portal = MDS_SETATTR_PORTAL; //XXX FIXME bug 249
+ req->rq_request_portal = MDS_SETATTR_PORTAL;
+ ptlrpc_at_set_req_timeout(req);
rpc_lock = obd->u.cli.cl_setattr_lock;
} else {
rpc_lock = obd->u.cli.cl_rpc_lock;
/* To avoid a livelock (bug 7034), we need to send CLOSE RPCs to a
* portal whose threads are not taking any DLM locks and are therefore
* always progressing */
- /* XXX FIXME bug 249 */
req->rq_request_portal = MDS_READPAGE_PORTAL;
+ ptlrpc_at_set_req_timeout(req);
/* Ensure that this close's handle is fixed up during replay. */
if (likely(mod != NULL))
}
req->rq_request_portal = MDS_READPAGE_PORTAL;
+ ptlrpc_at_set_req_timeout(req);
desc = ptlrpc_prep_bulk_imp(req, 1, BULK_GET_SOURCE, MDS_BULK_PORTAL);
if (desc == NULL)
RETURN(rc);
}
- /* XXX FIXME bug 249 */
req->rq_request_portal = MDS_READPAGE_PORTAL;
+ ptlrpc_at_set_req_timeout(req);
+
desc = ptlrpc_prep_bulk_imp(req, 1, BULK_PUT_SINK, MDS_BULK_PORTAL);
if (desc == NULL) {
ptlrpc_request_free(req);
OBD_ALLOC(data, sizeof(*data));
if (data == NULL)
RETURN(-ENOMEM);
- data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_INDEX |
+ data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_INDEX |
OBD_CONNECT_REQPORTAL | OBD_CONNECT_QUOTA64 |
- OBD_CONNECT_OSS_CAPA | OBD_CONNECT_FID;
+ OBD_CONNECT_OSS_CAPA | OBD_CONNECT_FID |
+ OBD_CONNECT_AT;
#ifdef HAVE_LRU_RESIZE_SUPPORT
data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE;
#endif
static int mdt_statfs(struct mdt_thread_info *info)
{
- struct md_device *next = info->mti_mdt->mdt_child;
- struct obd_statfs *osfs;
- int rc;
+ struct md_device *next = info->mti_mdt->mdt_child;
+ struct ptlrpc_service *svc;
+ struct obd_statfs *osfs;
+ int rc;
ENTRY;
+ svc = info->mti_pill->rc_req->rq_rqbd->rqbd_service;
+
/* This will trigger a watchdog timeout */
OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_STATFS_LCW_SLEEP,
- (MDT_SERVICE_WATCHDOG_TIMEOUT / 1000) + 1);
+ (MDT_SERVICE_WATCHDOG_FACTOR *
+ at_get(&svc->srv_at_estimate) / 1000) + 1);
rc = mdt_check_ucred(info);
if (rc)
struct l_wait_info *lwi = &info->mti_u.rdpg.mti_wait_info;
int tmpcount;
int tmpsize;
+ int timeout;
int i;
int rc;
ENTRY;
if (OBD_FAIL_CHECK(OBD_FAIL_MDS_SENDPAGE))
GOTO(abort_bulk, rc = 0);
- *lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, NULL, NULL);
+ timeout = (int) req->rq_deadline - cfs_time_current_sec();
+ if (timeout < 0)
+ CERROR("Req deadline already passed %lu (now: %lu)\n",
+ req->rq_deadline, cfs_time_current_sec());
+ *lwi = LWI_TIMEOUT(max(timeout, 1) * HZ, NULL, NULL);
rc = l_wait_event(desc->bd_waitq, !ptlrpc_bulk_active(desc), lwi);
LASSERT (rc == 0 || rc == -ETIMEDOUT);
sptlrpc_svc_ctx_invalidate(req);
}
+ OBD_FAIL_TIMEOUT(OBD_FAIL_SEC_CTX_HDL_PAUSE, obd_fail_val);
+
return rc;
}
procfs_entry = m->mdt_md_dev.md_lu_dev.ld_obd->obd_proc_entry;
conf = (typeof(conf)) {
- .psc_nbufs = MDS_NBUFS,
- .psc_bufsize = MDS_BUFSIZE,
- .psc_max_req_size = MDS_MAXREQSIZE,
- .psc_max_reply_size = MDS_MAXREPSIZE,
- .psc_req_portal = MDS_REQUEST_PORTAL,
- .psc_rep_portal = MDC_REPLY_PORTAL,
- .psc_watchdog_timeout = MDT_SERVICE_WATCHDOG_TIMEOUT,
+ .psc_nbufs = MDS_NBUFS,
+ .psc_bufsize = MDS_BUFSIZE,
+ .psc_max_req_size = MDS_MAXREQSIZE,
+ .psc_max_reply_size = MDS_MAXREPSIZE,
+ .psc_req_portal = MDS_REQUEST_PORTAL,
+ .psc_rep_portal = MDC_REPLY_PORTAL,
+ .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR,
/*
* We'd like to have a mechanism to set this on a per-device
* basis, but alas...
*/
- .psc_min_threads = min(max(mdt_num_threads, MDT_MIN_THREADS),
- MDT_MAX_THREADS),
- .psc_max_threads = MDT_MAX_THREADS,
- .psc_ctx_tags = LCT_MD_THREAD
+ .psc_min_threads = min(max(mdt_num_threads, MDT_MIN_THREADS),
+ MDT_MAX_THREADS),
+ .psc_max_threads = MDT_MAX_THREADS,
+ .psc_ctx_tags = LCT_MD_THREAD
};
m->mdt_ldlm_client = &m->mdt_md_dev.md_lu_dev.ld_obd->obd_ldlm_client;
m->mdt_regular_service =
ptlrpc_init_svc_conf(&conf, mdt_regular_handle, LUSTRE_MDT_NAME,
- procfs_entry, NULL, LUSTRE_MDT_NAME);
+ procfs_entry, target_print_req,
+ LUSTRE_MDT_NAME);
if (m->mdt_regular_service == NULL)
RETURN(-ENOMEM);
* ideally.
*/
conf = (typeof(conf)) {
- .psc_nbufs = MDS_NBUFS,
- .psc_bufsize = MDS_BUFSIZE,
- .psc_max_req_size = MDS_MAXREQSIZE,
- .psc_max_reply_size = MDS_MAXREPSIZE,
- .psc_req_portal = MDS_READPAGE_PORTAL,
- .psc_rep_portal = MDC_REPLY_PORTAL,
- .psc_watchdog_timeout = MDT_SERVICE_WATCHDOG_TIMEOUT,
- .psc_min_threads = min(max(mdt_num_threads, MDT_MIN_THREADS),
- MDT_MAX_THREADS),
- .psc_max_threads = MDT_MAX_THREADS,
- .psc_ctx_tags = LCT_MD_THREAD
+ .psc_nbufs = MDS_NBUFS,
+ .psc_bufsize = MDS_BUFSIZE,
+ .psc_max_req_size = MDS_MAXREQSIZE,
+ .psc_max_reply_size = MDS_MAXREPSIZE,
+ .psc_req_portal = MDS_READPAGE_PORTAL,
+ .psc_rep_portal = MDC_REPLY_PORTAL,
+ .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR,
+ .psc_min_threads = min(max(mdt_num_threads, MDT_MIN_THREADS),
+ MDT_MAX_THREADS),
+ .psc_max_threads = MDT_MAX_THREADS,
+ .psc_ctx_tags = LCT_MD_THREAD
};
m->mdt_readpage_service =
ptlrpc_init_svc_conf(&conf, mdt_readpage_handle,
LUSTRE_MDT_NAME "_readpage",
- procfs_entry, NULL, "mdt_rdpg");
+ procfs_entry, target_print_req,"mdt_rdpg");
if (m->mdt_readpage_service == NULL) {
CERROR("failed to start readpage service\n");
* setattr service configuration.
*/
conf = (typeof(conf)) {
- .psc_nbufs = MDS_NBUFS,
- .psc_bufsize = MDS_BUFSIZE,
- .psc_max_req_size = MDS_MAXREQSIZE,
- .psc_max_reply_size = MDS_MAXREPSIZE,
- .psc_req_portal = MDS_SETATTR_PORTAL,
- .psc_rep_portal = MDC_REPLY_PORTAL,
- .psc_watchdog_timeout = MDT_SERVICE_WATCHDOG_TIMEOUT,
+ .psc_nbufs = MDS_NBUFS,
+ .psc_bufsize = MDS_BUFSIZE,
+ .psc_max_req_size = MDS_MAXREQSIZE,
+ .psc_max_reply_size = MDS_MAXREPSIZE,
+ .psc_req_portal = MDS_SETATTR_PORTAL,
+ .psc_rep_portal = MDC_REPLY_PORTAL,
+ .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR,
.psc_min_threads = min(max(mdt_num_threads, MDT_MIN_THREADS),
- MDT_MAX_THREADS),
- .psc_max_threads = MDT_MAX_THREADS,
- .psc_ctx_tags = LCT_MD_THREAD
+ MDT_MAX_THREADS),
+ .psc_max_threads = MDT_MAX_THREADS,
+ .psc_ctx_tags = LCT_MD_THREAD
};
m->mdt_setattr_service =
ptlrpc_init_svc_conf(&conf, mdt_regular_handle,
LUSTRE_MDT_NAME "_setattr",
- procfs_entry, NULL, "mdt_attr");
+ procfs_entry, target_print_req,"mdt_attr");
if (!m->mdt_setattr_service) {
CERROR("failed to start setattr service\n");
* sequence controller service configuration
*/
conf = (typeof(conf)) {
- .psc_nbufs = MDS_NBUFS,
- .psc_bufsize = MDS_BUFSIZE,
- .psc_max_req_size = SEQ_MAXREQSIZE,
- .psc_max_reply_size = SEQ_MAXREPSIZE,
- .psc_req_portal = SEQ_CONTROLLER_PORTAL,
- .psc_rep_portal = MDC_REPLY_PORTAL,
- .psc_watchdog_timeout = MDT_SERVICE_WATCHDOG_TIMEOUT,
- .psc_min_threads = SEQ_NUM_THREADS,
- .psc_max_threads = SEQ_NUM_THREADS,
- .psc_ctx_tags = LCT_MD_THREAD|LCT_DT_THREAD
+ .psc_nbufs = MDS_NBUFS,
+ .psc_bufsize = MDS_BUFSIZE,
+ .psc_max_req_size = SEQ_MAXREQSIZE,
+ .psc_max_reply_size = SEQ_MAXREPSIZE,
+ .psc_req_portal = SEQ_CONTROLLER_PORTAL,
+ .psc_rep_portal = MDC_REPLY_PORTAL,
+ .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR,
+ .psc_min_threads = SEQ_NUM_THREADS,
+ .psc_max_threads = SEQ_NUM_THREADS,
+ .psc_ctx_tags = LCT_MD_THREAD|LCT_DT_THREAD
};
m->mdt_mdsc_service =
ptlrpc_init_svc_conf(&conf, mdt_mdsc_handle,
LUSTRE_MDT_NAME"_mdsc",
- procfs_entry, NULL, "mdt_mdsc");
+ procfs_entry, target_print_req,"mdt_mdsc");
if (!m->mdt_mdsc_service) {
CERROR("failed to start seq controller service\n");
GOTO(err_mdt_svc, rc = -ENOMEM);
* metadata sequence server service configuration
*/
conf = (typeof(conf)) {
- .psc_nbufs = MDS_NBUFS,
- .psc_bufsize = MDS_BUFSIZE,
- .psc_max_req_size = SEQ_MAXREQSIZE,
- .psc_max_reply_size = SEQ_MAXREPSIZE,
- .psc_req_portal = SEQ_METADATA_PORTAL,
- .psc_rep_portal = MDC_REPLY_PORTAL,
- .psc_watchdog_timeout = MDT_SERVICE_WATCHDOG_TIMEOUT,
- .psc_min_threads = SEQ_NUM_THREADS,
- .psc_max_threads = SEQ_NUM_THREADS,
- .psc_ctx_tags = LCT_MD_THREAD|LCT_DT_THREAD
+ .psc_nbufs = MDS_NBUFS,
+ .psc_bufsize = MDS_BUFSIZE,
+ .psc_max_req_size = SEQ_MAXREQSIZE,
+ .psc_max_reply_size = SEQ_MAXREPSIZE,
+ .psc_req_portal = SEQ_METADATA_PORTAL,
+ .psc_rep_portal = MDC_REPLY_PORTAL,
+ .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR,
+ .psc_min_threads = SEQ_NUM_THREADS,
+ .psc_max_threads = SEQ_NUM_THREADS,
+ .psc_ctx_tags = LCT_MD_THREAD|LCT_DT_THREAD
};
m->mdt_mdss_service =
ptlrpc_init_svc_conf(&conf, mdt_mdss_handle,
LUSTRE_MDT_NAME"_mdss",
- procfs_entry, NULL, "mdt_mdss");
+ procfs_entry, target_print_req,"mdt_mdss");
if (!m->mdt_mdss_service) {
CERROR("failed to start metadata seq server service\n");
GOTO(err_mdt_svc, rc = -ENOMEM);
* controller which manages space.
*/
conf = (typeof(conf)) {
- .psc_nbufs = MDS_NBUFS,
- .psc_bufsize = MDS_BUFSIZE,
- .psc_max_req_size = SEQ_MAXREQSIZE,
- .psc_max_reply_size = SEQ_MAXREPSIZE,
- .psc_req_portal = SEQ_DATA_PORTAL,
- .psc_rep_portal = OSC_REPLY_PORTAL,
- .psc_watchdog_timeout = MDT_SERVICE_WATCHDOG_TIMEOUT,
- .psc_min_threads = SEQ_NUM_THREADS,
- .psc_max_threads = SEQ_NUM_THREADS,
- .psc_ctx_tags = LCT_MD_THREAD|LCT_DT_THREAD
+ .psc_nbufs = MDS_NBUFS,
+ .psc_bufsize = MDS_BUFSIZE,
+ .psc_max_req_size = SEQ_MAXREQSIZE,
+ .psc_max_reply_size = SEQ_MAXREPSIZE,
+ .psc_req_portal = SEQ_DATA_PORTAL,
+ .psc_rep_portal = OSC_REPLY_PORTAL,
+ .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR,
+ .psc_min_threads = SEQ_NUM_THREADS,
+ .psc_max_threads = SEQ_NUM_THREADS,
+ .psc_ctx_tags = LCT_MD_THREAD|LCT_DT_THREAD
};
m->mdt_dtss_service =
ptlrpc_init_svc_conf(&conf, mdt_dtss_handle,
LUSTRE_MDT_NAME"_dtss",
- procfs_entry, NULL, "mdt_dtss");
+ procfs_entry, target_print_req,"mdt_dtss");
if (!m->mdt_dtss_service) {
CERROR("failed to start data seq server service\n");
GOTO(err_mdt_svc, rc = -ENOMEM);
/* FLD service start */
conf = (typeof(conf)) {
- .psc_nbufs = MDS_NBUFS,
- .psc_bufsize = MDS_BUFSIZE,
- .psc_max_req_size = FLD_MAXREQSIZE,
- .psc_max_reply_size = FLD_MAXREPSIZE,
- .psc_req_portal = FLD_REQUEST_PORTAL,
- .psc_rep_portal = MDC_REPLY_PORTAL,
- .psc_watchdog_timeout = MDT_SERVICE_WATCHDOG_TIMEOUT,
- .psc_min_threads = FLD_NUM_THREADS,
- .psc_max_threads = FLD_NUM_THREADS,
- .psc_ctx_tags = LCT_DT_THREAD|LCT_MD_THREAD
+ .psc_nbufs = MDS_NBUFS,
+ .psc_bufsize = MDS_BUFSIZE,
+ .psc_max_req_size = FLD_MAXREQSIZE,
+ .psc_max_reply_size = FLD_MAXREPSIZE,
+ .psc_req_portal = FLD_REQUEST_PORTAL,
+ .psc_rep_portal = MDC_REPLY_PORTAL,
+ .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR,
+ .psc_min_threads = FLD_NUM_THREADS,
+ .psc_max_threads = FLD_NUM_THREADS,
+ .psc_ctx_tags = LCT_DT_THREAD|LCT_MD_THREAD
};
m->mdt_fld_service =
ptlrpc_init_svc_conf(&conf, mdt_fld_handle,
LUSTRE_MDT_NAME"_fld",
- procfs_entry, NULL, "mdt_fld");
+ procfs_entry, target_print_req, "mdt_fld");
if (!m->mdt_fld_service) {
CERROR("failed to start fld service\n");
GOTO(err_mdt_svc, rc = -ENOMEM);
* mds-mds requests be not blocked during recovery.
*/
conf = (typeof(conf)) {
- .psc_nbufs = MDS_NBUFS,
- .psc_bufsize = MDS_BUFSIZE,
- .psc_max_req_size = MDS_MAXREQSIZE,
- .psc_max_reply_size = MDS_MAXREPSIZE,
- .psc_req_portal = MDS_MDS_PORTAL,
- .psc_rep_portal = MDC_REPLY_PORTAL,
- .psc_watchdog_timeout = MDT_SERVICE_WATCHDOG_TIMEOUT,
- .psc_min_threads = min(max(mdt_num_threads, MDT_MIN_THREADS),
- MDT_MAX_THREADS),
- .psc_max_threads = MDT_MAX_THREADS,
- .psc_ctx_tags = LCT_MD_THREAD
+ .psc_nbufs = MDS_NBUFS,
+ .psc_bufsize = MDS_BUFSIZE,
+ .psc_max_req_size = MDS_MAXREQSIZE,
+ .psc_max_reply_size = MDS_MAXREPSIZE,
+ .psc_req_portal = MDS_MDS_PORTAL,
+ .psc_rep_portal = MDC_REPLY_PORTAL,
+ .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR,
+ .psc_min_threads = min(max(mdt_num_threads, MDT_MIN_THREADS),
+ MDT_MAX_THREADS),
+ .psc_max_threads = MDT_MAX_THREADS,
+ .psc_ctx_tags = LCT_MD_THREAD
};
- m->mdt_xmds_service = ptlrpc_init_svc_conf(&conf, mdt_xmds_handle,
- LUSTRE_MDT_NAME "_mds",
- procfs_entry, NULL, "mdt_xmds");
+ m->mdt_xmds_service =
+ ptlrpc_init_svc_conf(&conf, mdt_xmds_handle,
+ LUSTRE_MDT_NAME "_mds",
+ procfs_entry, target_print_req,"mdt_xmds");
if (m->mdt_xmds_service == NULL) {
CERROR("failed to start readpage service\n");
struct lprocfs_stats *mdt_stats;
};
-/*XXX copied from mds_internal.h */
-#define MDT_SERVICE_WATCHDOG_TIMEOUT (obd_timeout * 1000)
+#define MDT_SERVICE_WATCHDOG_FACTOR (2000)
#define MDT_ROCOMPAT_SUPP (OBD_ROCOMPAT_LOVOBJID)
#define MDT_INCOMPAT_SUPP (OBD_INCOMPAT_MDT | OBD_INCOMPAT_COMMON_LR)
if (cld->cld_stopping)
RETURN(0);
- OBD_FAIL_TIMEOUT(OBD_FAIL_MGC_PROCESS_LOG, 20);
+ OBD_FAIL_TIMEOUT(OBD_FAIL_MGC_PAUSE_PROCESS_LOG, 20);
lsi = s2lsi(cld->cld_cfg.cfg_sb);
mgs->mgs_service =
ptlrpc_init_svc(MGS_NBUFS, MGS_BUFSIZE, MGS_MAXREQSIZE,
MGS_MAXREPSIZE, MGS_REQUEST_PORTAL,
- MGC_REPLY_PORTAL, MGS_SERVICE_WATCHDOG_TIMEOUT,
+ MGC_REPLY_PORTAL, 2000,
mgs_handle, LUSTRE_MGS_NAME,
- obd->obd_proc_entry, NULL,
+ obd->obd_proc_entry, target_print_req,
MGS_THREADS_AUTO_MIN, MGS_THREADS_AUTO_MAX,
"ll_mgs", LCT_MD_THREAD);
obd->obd_name, lockrc);
}
- OBD_FAIL_TIMEOUT(OBD_FAIL_MGS_SLOW_TARGET_REG, 10);
+ OBD_FAIL_TIMEOUT(OBD_FAIL_MGS_PAUSE_TARGET_REG, 10);
/* Log writing contention is handled by the fsdb_sem */
ENTRY;
req_capsule_init(&req->rq_pill, req, RCL_SERVER);
- OBD_FAIL_TIMEOUT(OBD_FAIL_MGS_SLOW_REQUEST_NET, 2);
+ OBD_FAIL_TIMEOUT_MS(OBD_FAIL_MGS_PAUSE_REQ, obd_fail_val);
+ if (OBD_FAIL_CHECK(OBD_FAIL_MGS_ALL_REQUEST_NET))
+ RETURN(0);
LASSERT(current->journal_info == NULL);
opc = lustre_msg_get_opc(req->rq_reqmsg);
#include <lustre_log.h>
#include <lustre_export.h>
-/* in ms */
-#define MGS_SERVICE_WATCHDOG_TIMEOUT (obd_timeout * 1000)
-
/* mgs_llog.c */
int class_dentry_readdir(struct obd_device *obd, struct dentry *dir,
struct vfsmount *inmnt,
unsigned int obd_debug_peer_on_timeout;
unsigned int obd_dump_on_timeout;
unsigned int obd_dump_on_eviction;
-unsigned int obd_timeout = OBD_TIMEOUT_DEFAULT; /* seconds */
+unsigned int obd_timeout = OBD_TIMEOUT_DEFAULT; /* seconds */
unsigned int ldlm_timeout = LDLM_TIMEOUT_DEFAULT; /* seconds */
-unsigned int obd_health_check_timeout = HEALTH_CHECK_TIMEOUT_DEFAULT; /* seconds */
unsigned int obd_max_dirty_pages = 256;
atomic_t obd_dirty_pages;
EXPORT_SYMBOL(obd_dump_on_eviction);
EXPORT_SYMBOL(obd_timeout);
EXPORT_SYMBOL(ldlm_timeout);
-EXPORT_SYMBOL(obd_health_check_timeout);
EXPORT_SYMBOL(obd_max_dirty_pages);
EXPORT_SYMBOL(obd_dirty_pages);
EXPORT_SYMBOL(ptlrpc_put_connection_superhack);
ptlrpc_put_connection_superhack(exp->exp_connection);
LASSERT(list_empty(&exp->exp_outstanding_replies));
+ LASSERT(list_empty(&exp->exp_req_replay_queue));
obd_destroy_export(exp);
OBD_FREE_RCU(exp, sizeof(*exp), &exp->exp_handle);
atomic_set(&export->exp_rpc_count, 0);
export->exp_obd = obd;
CFS_INIT_LIST_HEAD(&export->exp_outstanding_replies);
+ CFS_INIT_LIST_HEAD(&export->exp_req_replay_queue);
/* XXX this should be in LDLM init */
CFS_INIT_LIST_HEAD(&export->exp_ldlm_data.led_held_locks);
spin_lock_init(&export->exp_ldlm_data.led_lock);
EXIT;
}
+static void init_imp_at(struct imp_at *at) {
+ int i;
+ at_init(&at->iat_net_latency, 0, 0);
+ for (i = 0; i < IMP_AT_MAX_PORTALS; i++) {
+                /* Max service estimates are tracked on the server side, so
+                   don't use the AT history here; just use the last reported
+                   value. (But keep the history for the /proc histogram and worst_ever.) */
+ at_init(&at->iat_service_estimate[i], INITIAL_CONNECT_TIMEOUT,
+ AT_FLG_NOHIST);
+ }
+}
+
struct obd_import *class_new_import(struct obd_device *obd)
{
struct obd_import *imp;
CFS_INIT_LIST_HEAD(&imp->imp_conn_list);
CFS_INIT_LIST_HEAD(&imp->imp_handle.h_link);
class_handle_hash(&imp->imp_handle, import_handle_addref);
+ init_imp_at(&imp->imp_at);
/* the default magic is V2, will be used in connect RPC, and
* then adjusted according to the flags in request/reply. */
fake_exp->exp_flags = flags;
spin_unlock(&fake_exp->exp_lock);
+ CDEBUG(D_HA, "%s: disconnecting export at %s (%p), "
+ "last request at %ld\n",
+ exp->exp_obd->obd_name, obd_export_nid2str(exp),
+ exp, exp->exp_last_request_time);
rc = obd_disconnect(fake_exp);
class_export_put(exp);
- CDEBUG(D_HA, "disconnecting export %s (%p): rc %d\n",
- exp->exp_client_uuid.uuid, exp, rc);
}
EXIT;
}
return rc;
}
-static int obd_proc_rd_health_timeout(char *page, char **start, off_t off,
- int count, int *eof, void *data)
-{
- *eof = 1;
- return snprintf(page, count, "%d\n", obd_health_check_timeout);
-}
-
-static int obd_proc_wr_health_timeout(struct file *file, const char *buffer,
- unsigned long count, void *data)
-{
- int val, rc;
-
- rc = lprocfs_write_helper(buffer, count, &val);
- if (rc)
- return rc;
-
- obd_health_check_timeout = val;
-
- return count;
-}
-
/* Root for /proc/fs/lustre */
struct proc_dir_entry *proc_lustre_root = NULL;
{ "version", obd_proc_read_version, NULL, NULL },
{ "pinger", obd_proc_read_pinger, NULL, NULL },
{ "health_check", obd_proc_read_health, NULL, NULL },
- { "health_check_timeout", obd_proc_rd_health_timeout,
- obd_proc_wr_health_timeout, NULL },
{ 0 }
};
#else
return rc;
}
+int lprocfs_at_hist_helper(char *page, int count, int rc,
+ struct adaptive_timeout *at)
+{
+ int i;
+ for (i = 0; i < AT_BINS; i++)
+ rc += snprintf(page + rc, count - rc, "%3u ", at->at_hist[i]);
+ rc += snprintf(page + rc, count - rc, "\n");
+ return rc;
+}
+
+/* See also ptlrpc_lprocfs_rd_timeouts */
+int lprocfs_rd_timeouts(char *page, char **start, off_t off, int count,
+ int *eof, void *data)
+{
+ struct obd_device *obd = (struct obd_device *)data;
+ struct obd_import *imp;
+ unsigned int cur, worst;
+ time_t now, worstt;
+ struct dhms ts;
+ int i, rc = 0;
+
+ LASSERT(obd != NULL);
+ LPROCFS_CLIMP_CHECK(obd);
+ imp = obd->u.cli.cl_import;
+ *eof = 1;
+
+ now = cfs_time_current_sec();
+
+ /* Some network health info for kicks */
+ s2dhms(&ts, now - imp->imp_last_reply_time);
+ rc += snprintf(page + rc, count - rc,
+ "%-10s : %ld, "DHMS_FMT" ago\n",
+ "last reply", imp->imp_last_reply_time, DHMS_VARS(&ts));
+
+
+ cur = at_get(&imp->imp_at.iat_net_latency);
+ worst = imp->imp_at.iat_net_latency.at_worst_ever;
+ worstt = imp->imp_at.iat_net_latency.at_worst_time;
+ s2dhms(&ts, now - worstt);
+ rc += snprintf(page + rc, count - rc,
+ "%-10s : cur %3u worst %3u (at %ld, "DHMS_FMT" ago) ",
+ "network", cur, worst, worstt, DHMS_VARS(&ts));
+ rc = lprocfs_at_hist_helper(page, count, rc,
+ &imp->imp_at.iat_net_latency);
+
+ for(i = 0; i < IMP_AT_MAX_PORTALS; i++) {
+ if (imp->imp_at.iat_portal[i] == 0)
+ break;
+ cur = at_get(&imp->imp_at.iat_service_estimate[i]);
+ worst = imp->imp_at.iat_service_estimate[i].at_worst_ever;
+ worstt = imp->imp_at.iat_service_estimate[i].at_worst_time;
+ s2dhms(&ts, now - worstt);
+ rc += snprintf(page + rc, count - rc,
+ "portal %-2d : cur %3u worst %3u (at %ld, "
+ DHMS_FMT" ago) ", imp->imp_at.iat_portal[i],
+ cur, worst, worstt, DHMS_VARS(&ts));
+ rc = lprocfs_at_hist_helper(page, count, rc,
+ &imp->imp_at.iat_service_estimate[i]);
+ }
+
+ LPROCFS_CLIMP_EXIT(obd);
+ return rc;
+}
+
static const char *obd_connect_names[] = {
"read_only",
"lov_index",
if (lprocfs_obd_snprintf(&page, size, &len, "status: ") <= 0)
goto out;
-
if (obd->obd_max_recoverable_clients == 0) {
if (lprocfs_obd_snprintf(&page, size, &len, "INACTIVE\n") <= 0)
goto out;
if (obd->obd_recovering == 0) {
if (lprocfs_obd_snprintf(&page, size, &len, "COMPLETE\n") <= 0)
goto out;
-
- if (lprocfs_obd_snprintf(&page, size, &len, "recovery_start: %lu\n",
- obd->obd_recovery_start) <= 0)
+ if (lprocfs_obd_snprintf(&page, size, &len,
+ "recovery_start: %lu\n",
+ obd->obd_recovery_start) <= 0)
goto out;
-
- if (lprocfs_obd_snprintf(&page, size, &len, "recovery_end: %lu\n",
- obd->obd_recovery_end) <= 0)
+ if (lprocfs_obd_snprintf(&page, size, &len,
+ "recovery_duration: %lu\n",
+ obd->obd_recovery_end -
+ obd->obd_recovery_start) <= 0)
goto out;
-
- /* Number of clients have have completed recovery */
- if (lprocfs_obd_snprintf(&page, size, &len, "recovered_clients: %d\n",
- obd->obd_max_recoverable_clients - obd->obd_recoverable_clients) <= 0)
+ /* Number of clients that have completed recovery */
+ if (lprocfs_obd_snprintf(&page, size, &len,
+ "completed_clients: %d/%d\n",
+ obd->obd_max_recoverable_clients -
+ obd->obd_recoverable_clients,
+ obd->obd_max_recoverable_clients) <= 0)
goto out;
-
- if (lprocfs_obd_snprintf(&page, size, &len, "unrecovered_clients: %d\n",
- obd->obd_recoverable_clients) <= 0)
+ if (lprocfs_obd_snprintf(&page, size, &len,
+ "replayed_requests: %d\n",
+ obd->obd_replayed_requests) <= 0)
goto out;
-
- if (lprocfs_obd_snprintf(&page, size, &len, "last_transno: "LPD64"\n",
- obd->obd_next_recovery_transno - 1) <= 0)
+ if (lprocfs_obd_snprintf(&page, size, &len,
+ "last_transno: "LPD64"\n",
+ obd->obd_next_recovery_transno - 1)<=0)
goto out;
-
- lprocfs_obd_snprintf(&page, size, &len, "replayed_requests: %d\n", obd->obd_replayed_requests);
goto fclose;
}
if (lprocfs_obd_snprintf(&page, size, &len, "RECOVERING\n") <= 0)
goto out;
-
if (lprocfs_obd_snprintf(&page, size, &len, "recovery_start: %lu\n",
- obd->obd_recovery_start) <= 0)
+ obd->obd_recovery_start) <= 0)
goto out;
-
- if (lprocfs_obd_snprintf(&page, size, &len, "time remaining: %lu\n",
- cfs_time_current_sec() >= obd->obd_recovery_end ? 0 :
- obd->obd_recovery_end - cfs_time_current_sec()) <= 0)
+ if (lprocfs_obd_snprintf(&page, size, &len, "time_remaining: %lu\n",
+ cfs_time_current_sec() >= obd->obd_recovery_end ? 0 :
+ obd->obd_recovery_end - cfs_time_current_sec()) <= 0)
goto out;
-
- if(lprocfs_obd_snprintf(&page, size, &len, "connected_clients: %d/%d\n",
- obd->obd_connected_clients,
- obd->obd_max_recoverable_clients) <= 0)
+ if (lprocfs_obd_snprintf(&page, size, &len,"connected_clients: %d/%d\n",
+ obd->obd_connected_clients,
+ obd->obd_max_recoverable_clients) <= 0)
goto out;
-
- /* Number of clients have have completed recovery */
- if (lprocfs_obd_snprintf(&page, size, &len, "completed_clients: %d/%d\n",
- obd->obd_max_recoverable_clients - obd->obd_recoverable_clients,
+ /* Number of clients that have completed recovery */
+ if (lprocfs_obd_snprintf(&page, size, &len,"completed_clients: %d/%d\n",
+ obd->obd_max_recoverable_clients -
+ obd->obd_recoverable_clients,
obd->obd_max_recoverable_clients) <= 0)
goto out;
-
- if (lprocfs_obd_snprintf(&page, size, &len, "replayed_requests: %d/??\n",
+ if (lprocfs_obd_snprintf(&page, size, &len,"replayed_requests: %d/??\n",
obd->obd_replayed_requests) <= 0)
goto out;
-
if (lprocfs_obd_snprintf(&page, size, &len, "queued_requests: %d\n",
obd->obd_requests_queued_for_recovery) <= 0)
goto out;
EXPORT_SYMBOL(lprocfs_rd_conn_uuid);
EXPORT_SYMBOL(lprocfs_rd_num_exports);
EXPORT_SYMBOL(lprocfs_rd_numrefs);
-
+EXPORT_SYMBOL(lprocfs_at_hist_helper);
+EXPORT_SYMBOL(lprocfs_rd_timeouts);
EXPORT_SYMBOL(lprocfs_rd_blksize);
EXPORT_SYMBOL(lprocfs_rd_kbytestotal);
EXPORT_SYMBOL(lprocfs_rd_kbytesfree);
CDEBUG(D_IOCTL, "changing lustre timeout from %d to %d\n",
obd_timeout, lcfg->lcfg_num);
obd_timeout = max(lcfg->lcfg_num, 1U);
- obd_health_check_timeout = HEALTH_CHECK_TIMEOUT;
GOTO(out, err = 0);
}
case LCFG_SET_UPCALL: {
static int lustre_start_mgc(struct super_block *sb)
{
struct lustre_handle mgc_conn = {0, };
- struct obd_connect_data ocd = { 0 };
+ struct obd_connect_data *data = NULL;
struct lustre_sb_info *lsi = s2lsi(sb);
struct obd_device *obd;
struct obd_export *exp;
/* nonfatal */
CWARN("can't set %s %d\n", KEY_INIT_RECOV_BACKUP, rc);
/* We connect to the MGS at setup, and don't disconnect until cleanup */
-
- ocd.ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_FID;
- ocd.ocd_version = LUSTRE_VERSION_CODE;
-
- rc = obd_connect(NULL, &mgc_conn, obd, &(obd->obd_uuid), &ocd, NULL);
+ OBD_ALLOC_PTR(data);
+ if (data == NULL)
+ GOTO(out, rc = -ENOMEM);
+ data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_FID |
+ OBD_CONNECT_AT;
+ data->ocd_version = LUSTRE_VERSION_CODE;
+ rc = obd_connect(NULL, &mgc_conn, obd, &(obd->obd_uuid), data, NULL);
+ OBD_FREE_PTR(data);
if (rc) {
CERROR("connect failed %d\n", rc);
GOTO(out, rc);
return ERR_PTR(-ENOENT);
rc = filter_lock_dentry(obd, dparent);
- fsfilt_check_slow(obd, now, obd_timeout, "parent lock");
+ fsfilt_check_slow(obd, now, "parent lock");
return rc ? ERR_PTR(rc) : dparent;
}
if (obd->obd_recovering) {
LCONSOLE_WARN("OST %s now serving %s (%s%s%s), but will be in "
- "recovery until %d %s reconnect, or if no clients"
- " reconnect for %d:%.02d; during that time new "
- "clients will not be allowed to connect. "
+ "recovery for at least %d:%.02d, or until %d "
+ "client%s reconnect. During this time new clients"
+ " will not be allowed to connect. "
"Recovery progress can be monitored by watching "
"/proc/fs/lustre/obdfilter/%s/recovery_status.\n",
obd->obd_name, lustre_cfg_string(lcfg, 1),
label ?: "", label ? "/" : "", str,
+ obd->obd_recovery_timeout / 60,
+ obd->obd_recovery_timeout % 60,
obd->obd_max_recoverable_clients,
- (obd->obd_max_recoverable_clients == 1)
- ? "client" : "clients",
- (int)(OBD_RECOVERY_TIMEOUT) / 60,
- (int)(OBD_RECOVERY_TIMEOUT) % 60,
+ (obd->obd_max_recoverable_clients == 1) ? "":"s",
obd->obd_name);
} else {
LCONSOLE_INFO("OST %s now serving %s (%s%s%s) with recovery "
struct filter_obd *filter;
struct obd_statfs *osfs;
int err = 0, rc = 0, recreate_obj = 0, i;
- unsigned long enough_time = jiffies + min(obd_timeout * HZ / 4, 10U*HZ);
+ cfs_time_t enough_time = cfs_time_shift(DISK_TIMEOUT/2);
obd_id next_id;
void *handle = NULL;
ENTRY;
#define FILTER_GRANT_CHUNK (2ULL * PTLRPC_MAX_BRW_SIZE)
#define GRANT_FOR_LLOG(obd) 16
-#define FILTER_RECOVERY_TIMEOUT (obd_timeout * 5 * HZ / 2) /* *waves hands* */
-
extern struct file_operations filter_per_export_stats_fops;
extern struct file_operations filter_per_nid_stats_fops;
#else
#define FILTER_FMD_MAX_NUM_DEFAULT 32
#endif
+/* Client cache seconds */
#define FILTER_FMD_MAX_AGE_DEFAULT ((obd_timeout + 10) * HZ)
struct filter_mod_data *filter_fmd_find(struct obd_export *exp,
inode = dentry->d_inode;
obdo_to_inode(inode, oa, OBD_MD_FLATIME);
- fsfilt_check_slow(obd, now, obd_timeout, "preprw_read setup");
+ fsfilt_check_slow(obd, now, "preprw_read setup");
for (i = 0, lnb = res, rnb = nb; i < obj->ioo_bufcnt;
i++, rnb++, lnb++) {
filter_iobuf_add_page(obd, iobuf, inode, lnb->page);
}
- fsfilt_check_slow(obd, now, obd_timeout, "start_page_read");
+ fsfilt_check_slow(obd, now, "start_page_read");
rc = filter_direct_io(OBD_BRW_READ, dentry, iobuf,
exp, NULL, NULL, NULL);
fso.fso_dentry = dentry;
fso.fso_bufcnt = obj->ioo_bufcnt;
- fsfilt_check_slow(exp->exp_obd, now, obd_timeout, "preprw_write setup");
+ fsfilt_check_slow(exp->exp_obd, now, "preprw_write setup");
/* Don't update inode timestamps if this write is older than a
* setattr which modifies the timestamps. b=10150 */
rc = filter_direct_io(OBD_BRW_READ, dentry, iobuf, exp,
NULL, NULL, NULL);
- fsfilt_check_slow(exp->exp_obd, now, obd_timeout, "start_page_write");
+ fsfilt_check_slow(exp->exp_obd, now, "start_page_write");
if (exp->exp_nid_stats && exp->exp_nid_stats->nid_stats)
lprocfs_counter_add(exp->exp_nid_stats->nid_stats,
DQUOT_INIT(inode);
LOCK_INODE_MUTEX(inode);
- fsfilt_check_slow(obd, now, obd_timeout, "i_mutex");
+ fsfilt_check_slow(obd, now, "i_mutex");
oti->oti_handle = fsfilt_brw_start(obd, objcount, &fso, niocount, res,
oti);
if (IS_ERR(oti->oti_handle)) {
}
/* have to call fsfilt_commit() from this point on */
- fsfilt_check_slow(obd, now, obd_timeout, "brw_start");
+ fsfilt_check_slow(obd, now, "brw_start");
i = OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME;
lquota_getflag(filter_quota_interface_ref, obd, oa);
- fsfilt_check_slow(obd, now, obd_timeout, "direct_io");
+ fsfilt_check_slow(obd, now, "direct_io");
err = fsfilt_commit_wait(obd, inode, wait_handle);
if (err) {
"oti_transno "LPU64" last_committed "LPU64"\n",
oti->oti_transno, obd->obd_last_committed);
- fsfilt_check_slow(obd, now, obd_timeout, "commitrw commit");
+ fsfilt_check_slow(obd, now, "commitrw commit");
cleanup:
filter_grant_commit(exp, niocount, res);
{ "checksums", osc_rd_checksum, osc_wr_checksum, 0 },
{ "checksum_type", osc_rd_checksum_type, osc_wd_checksum_type, 0 },
{ "resend_count", osc_rd_resend_count, osc_wr_resend_count, 0},
+ { "timeouts", lprocfs_rd_timeouts, 0, 0 },
{ 0 }
};
RETURN(-ENOMEM);
}
- request->rq_request_portal = OST_CREATE_PORTAL; //XXX FIXME bug 249
+ request->rq_request_portal = OST_CREATE_PORTAL;
+ ptlrpc_at_set_req_timeout(request);
body = lustre_msg_buf(request->rq_reqmsg, REQ_REC_OFF, sizeof(*body));
spin_lock(&oscc->oscc_lock);
CDEBUG(D_HA,"%s: oscc recovery in progress, waiting\n",
oscc->oscc_obd->obd_name);
- lwi = LWI_TIMEOUT(cfs_timeout_cap(cfs_time_seconds(obd_timeout/4)),
- NULL, NULL);
+ lwi = LWI_TIMEOUT(cfs_timeout_cap(cfs_time_seconds(
+ obd_timeout / 4)), NULL, NULL);
rc = l_wait_event(oscc->oscc_waitq,
!oscc_recovering(oscc), &lwi);
LASSERT(rc == 0 || rc == -ETIMEDOUT);
RETURN(rc);
}
req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */
+ ptlrpc_at_set_req_timeout(req);
osc_pack_req_body(req, oinfo);
/* overload the size and blocks fields in the oa with start/end */
req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */
req->rq_interpret_reply = osc_destroy_interpret;
+ ptlrpc_at_set_req_timeout(req);
if (oti != NULL && oa->o_valid & OBD_MD_FLCOOKIE)
memcpy(obdo_logcookie(oa), oti->oti_logcookies,
RETURN(rc);
}
req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */
+ ptlrpc_at_set_req_timeout(req);
if (opc == OST_WRITE)
desc = ptlrpc_prep_bulk_imp(req, page_count,
RETURN(rc);
}
ptlrpc_request_set_replen(req);
- req->rq_request_portal = OST_CREATE_PORTAL; //XXX FIXME bug 249
+ req->rq_request_portal = OST_CREATE_PORTAL;
+ ptlrpc_at_set_req_timeout(req);
+
if (oinfo->oi_flags & OBD_STATFS_NODELAY) {
/* procfs requests not want stat in wait for avoid deadlock */
req->rq_no_resend = 1;
RETURN(rc);
}
ptlrpc_request_set_replen(req);
- req->rq_request_portal = OST_CREATE_PORTAL; //XXX FIXME bug 249
+ req->rq_request_portal = OST_CREATE_PORTAL;
+ ptlrpc_at_set_req_timeout(req);
if (flags & OBD_STATFS_NODELAY) {
/* procfs requests not want stat in wait for avoid deadlock */
lvars->obd_vars = lprocfs_ost_obd_vars;
}
-void
-ost_print_req(void *seq_file, struct ptlrpc_request *req)
-{
- /* Called holding srv_lock with irqs disabled.
- * Print specific req contents and a newline.
- * CAVEAT EMPTOR: check request message length before printing!!!
- * You might have received any old crap so you must be just as
- * careful here as the service's request parser!!! */
- struct seq_file *sf = seq_file;
-
- switch (req->rq_phase) {
- case RQ_PHASE_NEW:
- /* still awaiting a service thread's attention, or rejected
- * because the generic request message didn't unpack */
- seq_printf(sf, "<not swabbed>\n");
- break;
-
- case RQ_PHASE_INTERPRET:
- /* being handled, so basic msg swabbed, and opc is valid
- * but racing with ost_handle() */
- seq_printf(sf, "opc %d\n", lustre_msg_get_opc(req->rq_reqmsg));
- break;
-
- case RQ_PHASE_COMPLETE:
- /* been handled by ost_handle() reply state possibly still
- * volatile */
- seq_printf(sf, "opc %d\n", lustre_msg_get_opc(req->rq_reqmsg));
- break;
-
- default:
- LBUG();
- }
-}
#endif /* LPROCFS */
* If getting the lock took more time than
* client was willing to wait, drop it. b=11330
*/
- if (cfs_time_current_sec() > req->rq_arrival_time.tv_sec + obd_timeout ||
+ if (cfs_time_current_sec() > req->rq_deadline ||
OBD_FAIL_CHECK(OBD_FAIL_OST_DROP_REQ)) {
no_reply = 1;
CERROR("Dropping timed-out read from %s because locking"
- "object "LPX64" took %ld seconds.\n",
+ "object "LPX64" took %ld seconds (limit was %ld).\n",
libcfs_id2str(req->rq_peer), ioo->ioo_id,
- cfs_time_current_sec() - req->rq_arrival_time.tv_sec);
+ cfs_time_current_sec() - req->rq_arrival_time.tv_sec,
+ req->rq_deadline - req->rq_arrival_time.tv_sec);
GOTO(out_lock, rc = -ETIMEDOUT);
}
}
if (rc == 0) {
- lwi = LWI_TIMEOUT_INTERVAL(obd_timeout * HZ / 4, HZ,
- ost_bulk_timeout, desc);
- rc = l_wait_event(desc->bd_waitq,
- !ptlrpc_bulk_active(desc) ||
- exp->exp_failed, &lwi);
- LASSERT(rc == 0 || rc == -ETIMEDOUT);
+ time_t start = cfs_time_current_sec();
+ do {
+ long timeoutl = req->rq_deadline -
+ cfs_time_current_sec();
+ cfs_duration_t timeout = (timeoutl <= 0 || rc) ?
+ CFS_TICK : cfs_time_seconds(timeoutl);
+ lwi = LWI_TIMEOUT_INTERVAL(timeout,
+ cfs_time_seconds(1),
+ ost_bulk_timeout,
+ desc);
+ rc = l_wait_event(desc->bd_waitq,
+ !ptlrpc_bulk_active(desc) ||
+ exp->exp_failed, &lwi);
+ LASSERT(rc == 0 || rc == -ETIMEDOUT);
+ /* Wait again if we changed deadline */
+ } while ((rc == -ETIMEDOUT) &&
+ (req->rq_deadline > cfs_time_current_sec()));
+
if (rc == -ETIMEDOUT) {
- DEBUG_REQ(D_ERROR, req, "timeout on bulk PUT");
+ DEBUG_REQ(D_ERROR, req,
+ "timeout on bulk PUT after %ld%+lds",
+ req->rq_deadline - start,
+ cfs_time_current_sec() -
+ req->rq_deadline);
ptlrpc_abort_bulk(desc);
} else if (exp->exp_failed) {
DEBUG_REQ(D_ERROR, req, "Eviction on bulk PUT");
req->rq_status = rc;
ptlrpc_error(req);
} else {
- if (req->rq_reply_state != NULL) {
- /* reply out callback would free */
- ptlrpc_rs_decref(req->rq_reply_state);
- req->rq_reply_state = NULL;
- }
+ /* reply out callback would free */
+ ptlrpc_req_drop_rs(req);
CWARN("%s: ignoring bulk IO comm error with %s@%s id %s - "
"client will retry\n",
exp->exp_obd->obd_name,
rc = lustre_pack_reply(req, 3, size, NULL);
if (rc != 0)
GOTO(out, rc);
+ OBD_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_PACK, obd_fail_val);
rcs = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF + 1,
niocount * sizeof(*rcs));
* If getting the lock took more time than
* client was willing to wait, drop it. b=11330
*/
- if (cfs_time_current_sec() > req->rq_arrival_time.tv_sec + obd_timeout ||
+ if (cfs_time_current_sec() > req->rq_deadline ||
OBD_FAIL_CHECK(OBD_FAIL_OST_DROP_REQ)) {
no_reply = 1;
- CERROR("Dropping timed-out write from %s because locking"
- "object "LPX64" took %ld seconds.\n",
+ CERROR("Dropping timed-out write from %s because locking "
+ "object "LPX64" took %ld seconds (limit was %ld).\n",
libcfs_id2str(req->rq_peer), ioo->ioo_id,
- cfs_time_current_sec() - req->rq_arrival_time.tv_sec);
+ cfs_time_current_sec() - req->rq_arrival_time.tv_sec,
+ req->rq_deadline - req->rq_arrival_time.tv_sec);
GOTO(out_lock, rc = -ETIMEDOUT);
}
else
rc = ptlrpc_start_bulk_transfer (desc);
if (rc == 0) {
- lwi = LWI_TIMEOUT_INTERVAL(obd_timeout * HZ / 2, HZ,
- ost_bulk_timeout, desc);
- rc = l_wait_event(desc->bd_waitq, !ptlrpc_bulk_active(desc) ||
- desc->bd_export->exp_failed, &lwi);
- LASSERT(rc == 0 || rc == -ETIMEDOUT);
+ time_t start = cfs_time_current_sec();
+ do {
+ long timeoutl = req->rq_deadline -
+ cfs_time_current_sec();
+ cfs_duration_t timeout = (timeoutl <= 0 || rc) ?
+ CFS_TICK : cfs_time_seconds(timeoutl);
+ lwi = LWI_TIMEOUT_INTERVAL(timeout, cfs_time_seconds(1),
+ ost_bulk_timeout, desc);
+ rc = l_wait_event(desc->bd_waitq,
+ !ptlrpc_bulk_active(desc) ||
+ desc->bd_export->exp_failed, &lwi);
+ LASSERT(rc == 0 || rc == -ETIMEDOUT);
+ /* Wait again if we changed deadline */
+ } while ((rc == -ETIMEDOUT) &&
+ (req->rq_deadline > cfs_time_current_sec()));
+
if (rc == -ETIMEDOUT) {
- DEBUG_REQ(D_ERROR, req, "timeout on bulk GET");
+ DEBUG_REQ(D_ERROR, req,
+ "timeout on bulk GET after %ld%+lds",
+ req->rq_deadline - start,
+ cfs_time_current_sec() -
+ req->rq_deadline);
ptlrpc_abort_bulk(desc);
} else if (desc->bd_export->exp_failed) {
DEBUG_REQ(D_ERROR, req, "Eviction on bulk GET");
req->rq_status = rc;
ptlrpc_error(req);
} else {
- if (req->rq_reply_state != NULL) {
- /* reply out callback would free */
- ptlrpc_rs_decref(req->rq_reply_state);
- req->rq_reply_state = NULL;
- }
+ /* reply out callback would free */
+ ptlrpc_req_drop_rs(req);
CWARN("%s: ignoring bulk IO comm error with %s@%s id %s - "
"client will retry\n",
exp->exp_obd->obd_name,
ost->ost_service =
ptlrpc_init_svc(OST_NBUFS, OST_BUFSIZE, OST_MAXREQSIZE,
OST_MAXREPSIZE, OST_REQUEST_PORTAL,
- OSC_REPLY_PORTAL,
- OST_WATCHDOG_TIMEOUT, ost_handle,
- LUSTRE_OSS_NAME, obd->obd_proc_entry,
- ost_print_req, oss_min_threads,
- oss_max_threads, "ll_ost",
- LCT_DT_THREAD);
+ OSC_REPLY_PORTAL, OSS_SERVICE_WATCHDOG_FACTOR,
+ ost_handle, LUSTRE_OSS_NAME,
+ obd->obd_proc_entry, target_print_req,
+ oss_min_threads, oss_max_threads,
+ "ll_ost", LCT_DT_THREAD);
if (ost->ost_service == NULL) {
CERROR("failed to start service\n");
GOTO(out_lprocfs, rc = -ENOMEM);
if (oss_num_create_threads) {
if (oss_num_create_threads > OSS_MAX_CREATE_THREADS)
oss_num_create_threads = OSS_MAX_CREATE_THREADS;
- if (oss_num_create_threads < OSS_DEF_CREATE_THREADS)
- oss_num_create_threads = OSS_DEF_CREATE_THREADS;
+ if (oss_num_create_threads < OSS_MIN_CREATE_THREADS)
+ oss_num_create_threads = OSS_MIN_CREATE_THREADS;
oss_min_create_threads = oss_max_create_threads =
oss_num_create_threads;
} else {
- oss_min_create_threads = OSS_DEF_CREATE_THREADS;
+ oss_min_create_threads = OSS_MIN_CREATE_THREADS;
oss_max_create_threads = OSS_MAX_CREATE_THREADS;
}
ost->ost_create_service =
ptlrpc_init_svc(OST_NBUFS, OST_BUFSIZE, OST_MAXREQSIZE,
OST_MAXREPSIZE, OST_CREATE_PORTAL,
- OSC_REPLY_PORTAL,
- OST_WATCHDOG_TIMEOUT, ost_handle, "ost_create",
- obd->obd_proc_entry, ost_print_req,
- oss_min_create_threads,
- oss_max_create_threads,
+ OSC_REPLY_PORTAL, OSS_SERVICE_WATCHDOG_FACTOR,
+ ost_handle, "ost_create",
+ obd->obd_proc_entry, target_print_req,
+ oss_min_create_threads, oss_max_create_threads,
"ll_ost_creat", LCT_DT_THREAD);
if (ost->ost_create_service == NULL) {
CERROR("failed to start OST create service\n");
ost->ost_io_service =
ptlrpc_init_svc(OST_NBUFS, OST_BUFSIZE, OST_MAXREQSIZE,
OST_MAXREPSIZE, OST_IO_PORTAL,
- OSC_REPLY_PORTAL,
- OST_WATCHDOG_TIMEOUT, ost_handle, "ost_io",
- obd->obd_proc_entry, ost_print_req,
+ OSC_REPLY_PORTAL, OSS_SERVICE_WATCHDOG_FACTOR,
+ ost_handle, "ost_io",
+ obd->obd_proc_entry, target_print_req,
oss_min_threads, oss_max_threads,
"ll_ost_io", LCT_DT_THREAD);
if (ost->ost_io_service == NULL) {
#ifndef OST_INTERNAL_H
#define OST_INTERNAL_H
-#ifdef LPROCFS
-extern void ost_print_req(void *seq_file, struct ptlrpc_request *req);
-#else
-# define ost_print_req NULL
-#endif
+#define OSS_SERVICE_WATCHDOG_FACTOR 2000
/*
* tunables for per-thread page pool (bug 5137)
struct ost_thread_local_cache *ost_tls(struct ptlrpc_request *r);
-#define OSS_DEF_CREATE_THREADS 1UL
+#define OSS_MIN_CREATE_THREADS 2UL
#define OSS_MAX_CREATE_THREADS 16UL
/* Quota stuff */
EXIT;
}
+/* Set the server-side time limit (deadline) for this req */
+void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req)
+{
+ __u32 serv_est;
+ int idx;
+ struct imp_at *at;
+
+ LASSERT(req->rq_import);
+
+ if (AT_OFF) {
+ /* non-AT settings */
+ req->rq_timeout = req->rq_import->imp_server_timeout ?
+ obd_timeout / 2 : obd_timeout;
+ lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout);
+ return;
+ }
+
+ at = &req->rq_import->imp_at;
+ idx = import_at_get_index(req->rq_import,
+ req->rq_request_portal);
+ serv_est = at_get(&at->iat_service_estimate[idx]);
+ /* add an arbitrary minimum: 125% +5 sec */
+ req->rq_timeout = serv_est + (serv_est >> 2) + 5;
+ /* We could get even fancier here, using history to predict increased
+ loading... */
+
+        /* Let the server know what this RPC timeout is by putting it in
+           the reqmsg */
+ lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout);
+}
+
+/* Adjust max service estimate based on server value */
+static void ptlrpc_at_adj_service(struct ptlrpc_request *req)
+{
+ int idx;
+ unsigned int serv_est, oldse;
+ struct imp_at *at = &req->rq_import->imp_at;
+
+ LASSERT(req->rq_import);
+
+ /* service estimate is returned in the repmsg timeout field,
+ may be 0 on err */
+ serv_est = lustre_msg_get_timeout(req->rq_repmsg);
+
+ idx = import_at_get_index(req->rq_import, req->rq_request_portal);
+ /* max service estimates are tracked on the server side,
+ so just keep minimal history here */
+ oldse = at_add(&at->iat_service_estimate[idx], serv_est);
+ if (oldse != 0)
+ CDEBUG(D_ADAPTTO, "The RPC service estimate for %s ptl %d "
+ "has changed from %d to %d\n",
+ req->rq_import->imp_obd->obd_name,req->rq_request_portal,
+ oldse, at_get(&at->iat_service_estimate[idx]));
+}
+
+/* Expected network latency per remote node (secs) */
+int ptlrpc_at_get_net_latency(struct ptlrpc_request *req)
+{
+ return AT_OFF ? 0 : at_get(&req->rq_import->imp_at.iat_net_latency);
+}
+
+/* Adjust expected network latency */
+static void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req)
+{
+ unsigned int st, nl, oldnl;
+ struct imp_at *at = &req->rq_import->imp_at;
+ time_t now = cfs_time_current_sec();
+
+ LASSERT(req->rq_import);
+
+ st = lustre_msg_get_service_time(req->rq_repmsg);
+
+ /* Network latency is total time less server processing time */
+ nl = max_t(int, now - req->rq_sent - st, 0) + 1/*st rounding*/;
+ if (st > now - req->rq_sent + 2 /* rounding */)
+ CERROR("Reported service time %u > total measured time %ld\n",
+ st, now - req->rq_sent);
+
+ oldnl = at_add(&at->iat_net_latency, nl);
+ if (oldnl != 0)
+ CDEBUG(D_ADAPTTO, "The network latency for %s (nid %s) "
+ "has changed from %d to %d\n",
+ req->rq_import->imp_obd->obd_name,
+ obd_uuid2str(
+ &req->rq_import->imp_connection->c_remote_uuid),
+ oldnl, at_get(&at->iat_net_latency));
+}
+
+static int unpack_reply(struct ptlrpc_request *req)
+{
+ int rc;
+
+ /* Clear reply swab mask; we may have already swabbed an early reply */
+ req->rq_rep_swab_mask = 0;
+
+ rc = lustre_unpack_msg(req->rq_repmsg, req->rq_replen);
+ if (rc) {
+ DEBUG_REQ(D_ERROR, req, "unpack_rep failed: %d", rc);
+ return(-EPROTO);
+ }
+
+ rc = lustre_unpack_rep_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF);
+ if (rc) {
+ DEBUG_REQ(D_ERROR, req, "unpack ptlrpc body failed: %d", rc);
+ return(-EPROTO);
+ }
+ return 0;
+}
+
+/*
+ * Handle an early reply message, called with the rq_lock held.
+ * If anything goes wrong just ignore it - same as if it never happened
+ */
+static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req) {
+ time_t olddl;
+ int rc;
+ ENTRY;
+
+ req->rq_early = 0;
+ spin_unlock(&req->rq_lock);
+
+ rc = sptlrpc_cli_unwrap_early_reply(req);
+ if (rc)
+ GOTO(out, rc);
+
+ rc = unpack_reply(req);
+ if (rc)
+ GOTO(out_cleanup, rc);
+
+ /* Expecting to increase the service time estimate here */
+ ptlrpc_at_adj_service(req);
+ ptlrpc_at_adj_net_latency(req);
+
+ /* Adjust the local timeout for this req */
+ ptlrpc_at_set_req_timeout(req);
+
+ olddl = req->rq_deadline;
+ /* server assumes it now has rq_timeout from when it sent the
+ early reply, so client should give it at least that long. */
+ req->rq_deadline = cfs_time_current_sec() + req->rq_timeout +
+ ptlrpc_at_get_net_latency(req);
+
+ DEBUG_REQ(D_ADAPTTO, req,
+ "Early reply #%d, new deadline in %lds (%+lds)",
+ req->rq_early_count, req->rq_deadline -
+ cfs_time_current_sec(), req->rq_deadline - olddl);
+
+out_cleanup:
+ sptlrpc_cli_finish_early_reply(req);
+out:
+ spin_lock(&req->rq_lock);
+ RETURN(rc);
+}
+
void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool)
{
struct list_head *l, *tmp;
}
lustre_msg_add_version(request->rq_reqmsg, version);
-
- if (imp->imp_server_timeout)
- request->rq_timeout = obd_timeout / 2;
- else
- request->rq_timeout = obd_timeout;
request->rq_send_state = LUSTRE_IMP_FULL;
request->rq_type = PTL_RPC_MSG_REQUEST;
request->rq_export = NULL;
request->rq_phase = RQ_PHASE_NEW;
- /* XXX FIXME bug 249 */
request->rq_request_portal = imp->imp_client->cli_request_portal;
request->rq_reply_portal = imp->imp_client->cli_reply_portal;
+ ptlrpc_at_set_req_timeout(request);
+
spin_lock_init(&request->rq_lock);
CFS_INIT_LIST_HEAD(&request->rq_list);
+ CFS_INIT_LIST_HEAD(&request->rq_timed_list);
CFS_INIT_LIST_HEAD(&request->rq_replay_list);
CFS_INIT_LIST_HEAD(&request->rq_mod_list);
CFS_INIT_LIST_HEAD(&request->rq_ctx_chain);
atomic_set(&request->rq_refcount, 1);
lustre_msg_set_opc(request->rq_reqmsg, opcode);
- lustre_msg_set_flags(request->rq_reqmsg, 0);
RETURN(0);
out_ctx:
if (req->rq_restart)
GOTO(out, rc = 1);
+
+ if (req->rq_early) {
+ ptlrpc_at_recv_early_reply(req);
+ GOTO(out, rc = 0); /* keep waiting */
+ }
+
EXIT;
out:
spin_unlock(&req->rq_lock);
* including buflens, status etc is in the sender's byte order.
*/
- /*
- * Clear reply swab mask; this is a new reply in sender's byte order.
- */
- req->rq_rep_swab_mask = 0;
-
rc = sptlrpc_cli_unwrap_reply(req);
if (rc) {
DEBUG_REQ(D_ERROR, req, "unwrap reply failed (%d):", rc);
if (req->rq_resend)
RETURN(0);
- rc = lustre_unpack_msg(req->rq_repmsg, req->rq_replen);
- if (rc) {
- DEBUG_REQ(D_ERROR, req, "unpack_rep failed: %d", rc);
- RETURN(-EPROTO);
- }
-
- rc = lustre_unpack_rep_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF);
- if (rc) {
- DEBUG_REQ(D_ERROR, req, "unpack ptlrpc body failed: %d", rc);
- RETURN(-EPROTO);
- }
+ rc = unpack_reply(req);
+ if (rc)
+ RETURN(rc);
do_gettimeofday(&work_start);
timediff = cfs_timeval_sub(&work_start, &req->rq_arrival_time, NULL);
RETURN(-EPROTO);
}
+ OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_PAUSE_REP, obd_fail_val);
+ ptlrpc_at_adj_service(req);
+ ptlrpc_at_adj_net_latency(req);
+
rc = ptlrpc_check_status(req);
imp->imp_connect_error = rc;
force_timer_recalc = 1;
}
+ spin_lock(&req->rq_lock);
+
+ if (req->rq_early) {
+ ptlrpc_at_recv_early_reply(req);
+ spin_unlock(&req->rq_lock);
+ continue;
+ }
+
/* Still waiting for a reply? */
- if (ptlrpc_client_receiving_reply(req))
+ if (req->rq_receiving_reply) {
+ spin_unlock(&req->rq_lock);
continue;
+ }
/* Did we actually receive a reply? */
- if (!ptlrpc_client_replied(req))
+ if (!req->rq_replied) {
+ spin_unlock(&req->rq_lock);
continue;
+ }
+
+ spin_unlock(&req->rq_lock);
spin_lock(&imp->imp_lock);
list_del_init(&req->rq_list);
RETURN(set->set_remaining == 0 || force_timer_recalc);
}
+/* Return 1 if we should give up, else 0 */
int ptlrpc_expire_one_request(struct ptlrpc_request *req)
{
struct obd_import *imp = req->rq_import;
int rc = 0;
ENTRY;
- DEBUG_REQ(D_ERROR|D_NETERROR, req, "%s (sent at %lu, "CFS_DURATION_T"s ago)",
+ DEBUG_REQ(D_ERROR|D_NETERROR, req,
+ "%s (sent at %lu, "CFS_DURATION_T"s ago)",
req->rq_net_err ? "network error" : "timeout",
(long)req->rq_sent, cfs_time_current_sec() - req->rq_sent);
+ if (imp) {
+ LCONSOLE_WARN("Request x"LPU64" sent from %s to NID %s %lus ago"
+ " has timed out (limit %lus).\n", req->rq_xid,
+ req->rq_import->imp_obd->obd_name,
+ libcfs_nid2str(imp->imp_connection->c_peer.nid),
+ cfs_time_current_sec() - req->rq_sent,
+ req->rq_deadline - req->rq_sent);
+ }
+
if (imp != NULL && obd_debug_peer_on_timeout)
LNetCtl(IOC_LIBCFS_DEBUG_PEER, &imp->imp_connection->c_peer);
if (req->rq_ctx_init || req->rq_ctx_fini ||
req->rq_send_state != LUSTRE_IMP_FULL ||
imp->imp_obd->obd_no_recov) {
+ DEBUG_REQ(D_RPCTRACE, req, "err -110, sent_state=%s (now=%s)",
+ ptlrpc_import_state_name(req->rq_send_state),
+ ptlrpc_import_state_name(imp->imp_state));
spin_lock(&req->rq_lock);
req->rq_status = -ETIMEDOUT;
req->rq_err = 1;
RETURN(1);
}
- /* if request can't be resend we can't wait answer after timeout */
+ /* if a request can't be resent we can't wait for an answer after
+ the timeout */
if (req->rq_no_resend) {
DEBUG_REQ(D_RPCTRACE, req, "TIMEOUT-NORESEND:");
rc = 1;
list_entry(tmp, struct ptlrpc_request, rq_set_chain);
/* request in-flight? */
- if (!((req->rq_phase == RQ_PHASE_RPC && !req->rq_waiting &&
+ if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting &&
!req->rq_resend) ||
(req->rq_phase == RQ_PHASE_BULK)))
continue;
if (req->rq_timedout || /* already dealt with */
- req->rq_sent + req->rq_timeout > now) /* not expired */
+ req->rq_deadline > now) /* not expired */
continue;
/* deal with this guy */
}
}
+/* get the smallest timeout in the set; this does NOT set a timeout. */
int ptlrpc_set_next_timeout(struct ptlrpc_request_set *set)
{
struct list_head *tmp;
time_t now = cfs_time_current_sec();
- time_t deadline;
int timeout = 0;
struct ptlrpc_request *req;
+ int deadline;
ENTRY;
SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
struct l_wait_info lwi;
LASSERT(!in_interrupt ()); /* might sleep */
-
- if (!ptlrpc_client_receiving_reply(request))
+ if (!ptlrpc_client_recv_or_unlink(request))
+ /* Nothing left to do */
return;
LNetMDUnlink (request->rq_reply_md_h);
/* We have to l_wait_event() whatever the result, to give liblustre
- * a chance to run reply_in_callback() */
+ * a chance to run reply_in_callback(), and to make sure we've
+ * unlinked before returning a req to the pool */
if (request->rq_set != NULL)
wq = &request->rq_set->set_waitq;
for (;;) {
/* Network access will complete in finite time but the HUGE
* timeout lets us CWARN for visibility of sluggish NALs */
- lwi = LWI_TIMEOUT(cfs_time_seconds(300), NULL, NULL);
- rc = l_wait_event (*wq, !ptlrpc_client_receiving_reply(request), &lwi);
+ lwi = LWI_TIMEOUT(cfs_time_seconds(LONG_UNLINK), NULL, NULL);
+ rc = l_wait_event (*wq, !ptlrpc_client_recv_or_unlink(request),
+ &lwi);
if (rc == 0)
return;
LASSERT (rc == -ETIMEDOUT);
- DEBUG_REQ(D_WARNING, request, "Unexpectedly long timeout");
+ DEBUG_REQ(D_WARNING, request, "Unexpectedly long timeout "
+ "rvcng=%d unlnk=%d", request->rq_receiving_reply,
+ request->rq_must_unlink);
}
}
imp->imp_generation == imp->imp_last_generation_checked) {
CDEBUG(D_RPCTRACE, "%s: skip recheck: last_committed "LPU64"\n",
imp->imp_obd->obd_name, imp->imp_peer_committed_transno);
+ EXIT;
return;
}
if (ptlrpc_check_suspend())
RETURN(1);
+ /* deadline may have changed with an early reply */
+ if (req->rq_deadline > cfs_time_current_sec())
+ RETURN(1);
+
RETURN(ptlrpc_expire_one_request(req));
}
int brc;
struct l_wait_info lwi;
struct obd_import *imp = req->rq_import;
- cfs_duration_t timeout = 0;
+ cfs_duration_t timeout = CFS_TICK;
+ long timeoutl;
ENTRY;
LASSERT(req->rq_set == NULL);
list_del_init(&req->rq_list);
if (req->rq_err) {
+ /* rq_status was set locally */
rc = -EIO;
}
else if (req->rq_intr) {
}
rc = ptl_send_rpc(req, 0);
- if (rc) {
+ if (rc)
DEBUG_REQ(D_HA, req, "send failed (%d); recovering", rc);
- timeout = CFS_TICK;
- } else {
- timeout = cfs_timeout_cap(cfs_time_seconds(req->rq_timeout));
- DEBUG_REQ(D_NET, req,
- "-- sleeping for "CFS_DURATION_T" jiffies", timeout);
- }
+
repeat:
+ timeoutl = req->rq_deadline - cfs_time_current_sec();
+ timeout = (timeoutl <= 0 || rc) ? CFS_TICK :
+ cfs_time_seconds(timeoutl);
+ DEBUG_REQ(D_NET, req,
+ "-- sleeping for "CFS_DURATION_T" ticks", timeout);
lwi = LWI_TIMEOUT_INTR(timeout, expired_request, interrupted_request,
req);
rc = l_wait_event(req->rq_reply_waitq, ptlrpc_check_reply(req), &lwi);
- if (rc == -ETIMEDOUT && ptlrpc_check_and_wait_suspend(req))
+ if (rc == -ETIMEDOUT && ((req->rq_deadline > cfs_time_current_sec()) ||
+ ptlrpc_check_and_wait_suspend(req)))
goto repeat;
after_send:
* req->rq_receiving_reply is clear and returns. */
ptlrpc_unregister_reply (req);
- if (req->rq_err)
- GOTO(out, rc = -EIO);
- /* Resend if we need to, unless we were interrupted. */
- if (req->rq_resend && !req->rq_intr) {
- /* ...unless we were specifically told otherwise. */
- if (req->rq_no_resend)
- GOTO(out, rc = -ETIMEDOUT);
- spin_lock(&imp->imp_lock);
- goto restart;
+ if (req->rq_err) {
+ DEBUG_REQ(D_RPCTRACE, req, "err rc=%d status=%d",
+ rc, req->rq_status);
+ GOTO(out, rc = -EIO);
}
if (req->rq_intr) {
GOTO(out, rc = -EINTR);
}
+ /* Resend if we need to */
+ if (req->rq_resend) {
+ /* ...unless we were specifically told otherwise. */
+ if (req->rq_no_resend)
+ GOTO(out, rc = -ETIMEDOUT);
+ spin_lock(&imp->imp_lock);
+ goto restart;
+ }
+
if (req->rq_timedout) { /* non-recoverable timeout */
GOTO(out, rc = -ETIMEDOUT);
}
ENTRY;
LASSERT(req->rq_import->imp_state == LUSTRE_IMP_REPLAY);
-
/* Not handling automatic bulk replay yet (or ever?) */
LASSERT(req->rq_bulk == NULL);
- DEBUG_REQ(D_HA, req, "REPLAY");
-
LASSERT (sizeof (*aa) <= sizeof (req->rq_async_args));
aa = (struct ptlrpc_replay_async_args *)&req->rq_async_args;
memset(aa, 0, sizeof *aa);
aa->praa_old_state = req->rq_send_state;
req->rq_send_state = LUSTRE_IMP_REPLAY;
req->rq_phase = RQ_PHASE_NEW;
- aa->praa_old_status = lustre_msg_get_status(req->rq_repmsg);
+ if (req->rq_repmsg)
+ aa->praa_old_status = lustre_msg_get_status(req->rq_repmsg);
req->rq_status = 0;
-
req->rq_interpret_reply = ptlrpc_replay_interpret;
+ /* Readjust the timeout for current conditions */
+ ptlrpc_at_set_req_timeout(req);
+
+ DEBUG_REQ(D_HA, req, "REPLAY");
+
atomic_inc(&req->rq_import->imp_replay_inflight);
ptlrpc_request_addref(req); /* ptlrpcd needs a ref */
spin_lock (&req->rq_lock);
if (req->rq_import_generation < imp->imp_generation) {
req->rq_err = 1;
+ req->rq_status = -EINTR;
ptlrpc_wake_client_req(req);
}
spin_unlock (&req->rq_lock);
spin_lock (&req->rq_lock);
if (req->rq_import_generation < imp->imp_generation) {
req->rq_err = 1;
+ req->rq_status = -EINTR;
ptlrpc_wake_client_req(req);
}
spin_unlock (&req->rq_lock);
return tmp;
}
EXPORT_SYMBOL(ptlrpc_sample_next_xid);
+
struct ptlrpc_request *req = cbid->cbid_arg;
ENTRY;
- LASSERT (ev->type == LNET_EVENT_PUT ||
- ev->type == LNET_EVENT_UNLINK);
- LASSERT (ev->unlinked);
- LASSERT (ev->md.start == req->rq_repbuf);
- LASSERT (ev->offset == 0);
- LASSERT (ev->mlength <= req->rq_repbuf_len);
-
DEBUG_REQ((ev->status == 0) ? D_NET : D_ERROR, req,
"type %d, status %d", ev->type, ev->status);
+ LASSERT (ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_UNLINK);
+ LASSERT (ev->md.start == req->rq_repbuf);
+ LASSERT (ev->mlength <= req->rq_repbuf_len);
+ /* We've set LNET_MD_MANAGE_REMOTE for all outgoing requests
+ for adaptive timeouts' early reply. */
+ LASSERT((ev->md.options & LNET_MD_MANAGE_REMOTE) != 0);
+
spin_lock(&req->rq_lock);
- LASSERT (req->rq_receiving_reply);
req->rq_receiving_reply = 0;
+ req->rq_early = 0;
+
+ if (ev->status)
+ goto out_wake;
+ if (ev->type == LNET_EVENT_UNLINK) {
+ req->rq_must_unlink = 0;
+ DEBUG_REQ(D_RPCTRACE, req, "unlink");
+ goto out_wake;
+ }
- if (ev->type == LNET_EVENT_PUT && ev->status == 0) {
+ if ((ev->offset == 0) &&
+ ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT))) {
+ /* Early reply */
+ DEBUG_REQ(D_ADAPTTO, req,
+ "Early reply received: mlen=%u offset=%d replen=%d "
+ "replied=%d unlinked=%d", ev->mlength, ev->offset,
+ req->rq_replen, req->rq_replied, ev->unlinked);
+
+ req->rq_early_count++; /* number received, client side */
+ if (req->rq_replied) {
+ /* If we already got the real reply, then we need to
+ * check if lnet_finalize() unlinked the md. In that
+ * case, there will be no further callback of type
+ * LNET_EVENT_UNLINK.
+ */
+ if (ev->unlinked)
+ req->rq_must_unlink = 0;
+ else
+ DEBUG_REQ(D_RPCTRACE, req, "unlinked in reply");
+ goto out_wake;
+ }
+ req->rq_early = 1;
+ req->rq_reply_off = ev->offset;
+ req->rq_nob_received = ev->mlength;
+ /* And we're still receiving */
+ req->rq_receiving_reply = 1;
+ } else {
+ /* Real reply */
req->rq_replied = 1;
+ req->rq_reply_off = ev->offset;
req->rq_nob_received = ev->mlength;
+ /* LNetMDUnlink can't be called under the LNET_LOCK,
+ so we must unlink in ptlrpc_unregister_reply */
+ DEBUG_REQ(D_INFO, req,
+ "reply in flags=%x mlen=%u offset=%d replen=%d",
+ lustre_msg_get_flags(req->rq_reqmsg),
+ ev->mlength, ev->offset, req->rq_replen);
}
+ req->rq_import->imp_last_reply_time = cfs_time_current_sec();
+
+out_wake:
/* NB don't unlock till after wakeup; req can disappear under us
* since we don't have our own ref */
ptlrpc_wake_client_req(req);
-
spin_unlock(&req->rq_lock);
EXIT;
}
#ifdef CRAY_XT3
req->rq_uid = ev->uid;
#endif
+ spin_lock_init(&req->rq_lock);
+ CFS_INIT_LIST_HEAD(&req->rq_timed_list);
+ atomic_set(&req->rq_refcount, 1);
+ if (ev->type == LNET_EVENT_PUT)
+ DEBUG_REQ(D_RPCTRACE, req, "incoming req");
CDEBUG(D_RPCTRACE, "peer: %s\n", libcfs_id2str(req->rq_peer));
rqbd->rqbd_refcount++;
}
- list_add_tail(&req->rq_list, &service->srv_request_queue);
+ list_add_tail(&req->rq_list, &service->srv_req_in_queue);
service->srv_n_queued_reqs++;
/* NB everything can disappear under us once the request
switch (RPC_FLVR_SVC(req->rq_flvr.sf_rpc)) {
case SPTLRPC_SVC_NULL:
- vmsg = req->rq_repbuf;
+ vmsg = req->rq_repdata;
voff = vmsg->lm_bufcount - 1;
LASSERT(vmsg && vmsg->lm_bufcount >= 3);
break;
case SPTLRPC_SVC_AUTH:
case SPTLRPC_SVC_INTG:
- vmsg = req->rq_repbuf;
+ vmsg = req->rq_repdata;
voff = vmsg->lm_bufcount - 2;
LASSERT(vmsg && vmsg->lm_bufcount >= 4);
LASSERT(rmsg && rmsg->lm_bufcount >= 4);
break;
case SPTLRPC_SVC_PRIV:
- vmsg = req->rq_repbuf;
+ vmsg = req->rq_repdata;
voff = vmsg->lm_bufcount - 1;
LASSERT(vmsg && vmsg->lm_bufcount >= 2);
goto out_copy;
}
- lsize = ctx_init_parse_reply(req->rq_repbuf,
+ LASSERT(req->rq_repdata);
+ lsize = ctx_init_parse_reply(req->rq_repdata,
param.reply_buf, param.reply_buf_size);
if (lsize < 0) {
param.status = (int) lsize;
/*
* the timeout is only for the case that upcall child process die abnormally.
- * in any other cases it should finally update kernel key. so we set this
- * timeout value excessive long.
+ * in any other case it should finally update the kernel key.
+ *
+ * FIXME we'd better incorporate the client & server side upcall timeouts
+ * into the framework of Adaptive Timeouts, but we need to figure out how to
+ * make sure that the kernel knows whether the upcall process is in progress
+ * or has died unexpectedly.
*/
#define KEYRING_UPCALL_TIMEOUT (obd_timeout + obd_timeout)
for (;;) {
key = request_key(&gss_key_type, desc, NULL);
if (IS_ERR(key)) {
- CWARN("No more key found for current user\n");
+ CDEBUG(D_SEC, "No more key found for current user\n");
break;
}
grctx->src_init = 1;
grctx->src_reserve_len = size_round4(rsip->out_token.len);
- rc = lustre_pack_reply_v2(req, 1, &replen, NULL);
+ rc = lustre_pack_reply_v2(req, 1, &replen, NULL, 0);
if (rc) {
CERROR("failed to pack reply: %d\n", rc);
GOTO(out, rc = SECSVC_DROP);
#include <obd.h>
#include <obd_class.h>
#include <obd_support.h>
+#include <obd_cksum.h>
#include <lustre/lustre_idl.h>
#include <lustre_net.h>
#include <lustre_import.h>
#include <linux/crypto.h>
+/*
+ * early replies have a fixed size in privacy and integrity mode
+ * respectively, so we calculate the sizes only once.
+ */
+static int gss_at_reply_off_integ;
+static int gss_at_reply_off_priv;
+
static inline int msg_last_segidx(struct lustre_msg *msg)
{
/*
* payload should be obtained from mechanism. but currently since we
* only support kerberos, we could simply use fixed value.
- * krb5 header: 16
- * krb5 checksum: 20
+ * krb5 "meta" data:
+ * - krb5 header: 16
+ * - krb5 checksum: 20
+ *
+ * for privacy mode, the payload also includes the cipher text, which has the
+ * same size as the plain text, plus a possible confounder and padding, both
+ * at maximum cipher block size.
*/
#define GSS_KRB5_INTEG_MAX_PAYLOAD (40)
static inline
-int gss_estimate_payload(struct gss_ctx *mechctx, int msgsize, int privacy)
+int gss_mech_payload(struct gss_ctx *mechctx, int msgsize, int privacy)
{
- if (privacy) {
- /* we suppose max cipher block size is 16 bytes. here we
- * add 16 for confounder and 16 for padding. */
- return GSS_KRB5_INTEG_MAX_PAYLOAD + msgsize + 16 + 16 + 16;
- } else {
+ if (privacy)
+ return GSS_KRB5_INTEG_MAX_PAYLOAD + 16 + 16 + 16 + msgsize;
+ else
return GSS_KRB5_INTEG_MAX_PAYLOAD;
- }
}
/*
* cred APIs *
***************************************/
-static inline
-int gss_cli_payload(struct ptlrpc_cli_ctx *ctx,
- int msgsize, int privacy)
+static inline int gss_cli_payload(struct ptlrpc_cli_ctx *ctx,
+ int msgsize, int privacy)
{
- return gss_estimate_payload(NULL, msgsize, privacy);
+ return gss_mech_payload(NULL, msgsize, privacy);
}
int gss_cli_ctx_match(struct ptlrpc_cli_ctx *ctx, struct vfs_cred *vcred)
{
struct gss_cli_ctx *gctx;
struct gss_header *ghdr, *reqhdr;
- struct lustre_msg *msg = req->rq_repbuf;
+ struct lustre_msg *msg = req->rq_repdata;
__u32 major;
- int rc = 0;
+ int pack_bulk, early = 0, rc = 0;
ENTRY;
LASSERT(req->rq_cli_ctx == ctx);
LASSERT(msg);
- req->rq_repdata_len = req->rq_nob_received;
gctx = container_of(ctx, struct gss_cli_ctx, gc_base);
+ if ((char *) msg < req->rq_repbuf ||
+ (char *) msg >= req->rq_repbuf + req->rq_repbuf_len)
+ early = 1;
+
/* special case for context negotiation, rq_repmsg/rq_replen actually
- * are not used currently. */
- if (req->rq_ctx_init) {
+ * are not used currently. but early reply always be treated normally */
+ if (req->rq_ctx_init && !early) {
req->rq_repmsg = lustre_msg_buf(msg, 1, 0);
req->rq_replen = msg->lm_buflens[1];
RETURN(0);
switch (ghdr->gh_proc) {
case PTLRPC_GSS_PROC_DATA:
- if (!equi(req->rq_pack_bulk == 1,
- ghdr->gh_flags & LUSTRE_GSS_PACK_BULK)) {
+ pack_bulk = ghdr->gh_flags & LUSTRE_GSS_PACK_BULK;
+
+ if (!early && !equi(req->rq_pack_bulk == 1, pack_bulk)) {
CERROR("%s bulk flag in reply\n",
req->rq_pack_bulk ? "missing" : "unexpected");
RETURN(-EPROTO);
if (major != GSS_S_COMPLETE)
RETURN(-EPERM);
- req->rq_repmsg = lustre_msg_buf(msg, 1, 0);
- req->rq_replen = msg->lm_buflens[1];
+ if (early && reqhdr->gh_svc == SPTLRPC_SVC_NULL) {
+ __u32 cksum;
- if (req->rq_pack_bulk) {
- /* FIXME */
+ cksum = crc32_le(!(__u32) 0,
+ lustre_msg_buf(msg, 1, 0),
+ lustre_msg_buflen(msg, 1));
+ if (cksum != msg->lm_cksum) {
+ CWARN("early reply checksum mismatch: "
+ "%08x != %08x\n", cksum, msg->lm_cksum);
+ RETURN(-EPROTO);
+ }
+ }
+
+ if (pack_bulk) {
/* bulk checksum is right after the lustre msg */
if (msg->lm_bufcount < 3) {
CERROR("Invalid reply bufcount %u\n",
}
rc = bulk_sec_desc_unpack(msg, 2);
+ if (rc) {
+ CERROR("unpack bulk desc: %d\n", rc);
+ RETURN(rc);
+ }
}
+
+ req->rq_repmsg = lustre_msg_buf(msg, 1, 0);
+ req->rq_replen = msg->lm_buflens[1];
break;
case PTLRPC_GSS_PROC_ERR:
- rc = gss_cli_ctx_handle_err_notify(ctx, req, ghdr);
+ if (early) {
+ CERROR("server return error with early reply\n");
+ rc = -EPROTO;
+ } else {
+ rc = gss_cli_ctx_handle_err_notify(ctx, req, ghdr);
+ }
break;
default:
CERROR("unknown gss proc %d\n", ghdr->gh_proc);
{
struct gss_cli_ctx *gctx;
struct gss_header *ghdr;
- int msglen, rc;
+ struct lustre_msg *msg = req->rq_repdata;
+ int msglen, pack_bulk, early = 0, rc;
__u32 major;
ENTRY;
- LASSERT(req->rq_repbuf);
LASSERT(req->rq_cli_ctx == ctx);
+ LASSERT(req->rq_ctx_init == 0);
+ LASSERT(msg);
gctx = container_of(ctx, struct gss_cli_ctx, gc_base);
- ghdr = gss_swab_header(req->rq_repbuf, 0);
+ if ((char *) msg < req->rq_repbuf ||
+ (char *) msg >= req->rq_repbuf + req->rq_repbuf_len)
+ early = 1;
+
+ ghdr = gss_swab_header(msg, 0);
if (ghdr == NULL) {
CERROR("can't decode gss header\n");
RETURN(-EPROTO);
switch (ghdr->gh_proc) {
case PTLRPC_GSS_PROC_DATA:
- if (!equi(req->rq_pack_bulk == 1,
- ghdr->gh_flags & LUSTRE_GSS_PACK_BULK)) {
+ pack_bulk = ghdr->gh_flags & LUSTRE_GSS_PACK_BULK;
+
+ if (!early && !equi(req->rq_pack_bulk == 1, pack_bulk)) {
CERROR("%s bulk flag in reply\n",
req->rq_pack_bulk ? "missing" : "unexpected");
RETURN(-EPROTO);
}
- if (lustre_msg_swabbed(req->rq_repbuf))
+ if (lustre_msg_swabbed(msg))
gss_header_swabber(ghdr);
- major = gss_unseal_msg(gctx->gc_mechctx, req->rq_repbuf,
- &msglen, req->rq_repbuf_len);
+ /* use rq_repdata_len as the buffer size, which assumes unsealing
+ * doesn't need extra memory space. for precise control, we'd
+ * better calculate the actual buffer size as
+ * (repbuf_len - offset - repdata_len) */
+ major = gss_unseal_msg(gctx->gc_mechctx, msg,
+ &msglen, req->rq_repdata_len);
if (major != GSS_S_COMPLETE) {
rc = -EPERM;
break;
}
- if (lustre_unpack_msg(req->rq_repbuf, msglen)) {
+ if (lustre_unpack_msg(msg, msglen)) {
CERROR("Failed to unpack after decryption\n");
RETURN(-EPROTO);
}
- req->rq_repdata_len = msglen;
- if (req->rq_repbuf->lm_bufcount < 1) {
+ if (msg->lm_bufcount < 1) {
CERROR("Invalid reply buffer: empty\n");
RETURN(-EPROTO);
}
- if (req->rq_pack_bulk) {
- if (req->rq_repbuf->lm_bufcount < 2) {
- CERROR("Too few request buffer segments %d\n",
- req->rq_repbuf->lm_bufcount);
+ if (pack_bulk) {
+ if (msg->lm_bufcount < 2) {
+ CERROR("bufcount %u: missing bulk sec desc\n",
+ msg->lm_bufcount);
RETURN(-EPROTO);
}
/* bulk checksum is the last segment */
- if (bulk_sec_desc_unpack(req->rq_repbuf,
- req->rq_repbuf->lm_bufcount-1))
+ if (bulk_sec_desc_unpack(msg, msg->lm_bufcount-1))
RETURN(-EPROTO);
}
- req->rq_repmsg = lustre_msg_buf(req->rq_repbuf, 0, 0);
- req->rq_replen = req->rq_repbuf->lm_buflens[0];
+ req->rq_repmsg = lustre_msg_buf(msg, 0, 0);
+ req->rq_replen = msg->lm_buflens[0];
rc = 0;
break;
struct ptlrpc_request *req,
int svc, int msgsize)
{
- int txtsize;
- int buflens[4], bufcnt = 2;
+ int txtsize;
+ int buflens[4], bufcnt = 2;
+ int alloc_size;
/*
* on-wire data layout:
else if (svc != SPTLRPC_SVC_NULL)
buflens[bufcnt++] = gss_cli_payload(req->rq_cli_ctx, txtsize,0);
- return do_alloc_repbuf(req, lustre_msg_size_v2(bufcnt, buflens));
+ alloc_size = lustre_msg_size_v2(bufcnt, buflens);
+
+ /* add space for early reply */
+ alloc_size += gss_at_reply_off_integ;
+
+ return do_alloc_repbuf(req, alloc_size);
}
static
struct ptlrpc_request *req,
int msgsize)
{
- int txtsize;
- int buflens[3], bufcnt;
+ int txtsize;
+ int buflens[3], bufcnt;
+ int alloc_size;
/* Inner (clear) buffers
* - lustre message
buflens[1] = gss_cli_payload(req->rq_cli_ctx, buflens[0], 0);
buflens[2] = gss_cli_payload(req->rq_cli_ctx, txtsize, 1);
- return do_alloc_repbuf(req, lustre_msg_size_v2(bufcnt, buflens));
+ alloc_size = lustre_msg_size_v2(bufcnt, buflens);
+
+ /* add space for early reply */
+ alloc_size += gss_at_reply_off_priv;
+
+ return do_alloc_repbuf(req, alloc_size);
}
int gss_alloc_repbuf(struct ptlrpc_sec *sec,
RETURN(rc);
rs->rs_repdata_len = rc;
+
+ if (likely(req->rq_packed_final)) {
+ req->rq_reply_off = gss_at_reply_off_integ;
+ } else {
+ if (svc == SPTLRPC_SVC_NULL)
+ rs->rs_repbuf->lm_cksum = crc32_le(!(__u32) 0,
+ lustre_msg_buf(rs->rs_repbuf, 1, 0),
+ lustre_msg_buflen(rs->rs_repbuf, 1));
+ req->rq_reply_off = 0;
+ }
+
RETURN(0);
}
grctx->src_err_notify = 1;
grctx->src_reserve_len = 0;
- rc = lustre_pack_reply_v2(req, 1, &replen, NULL);
+ rc = lustre_pack_reply_v2(req, 1, &replen, NULL, 0);
if (rc) {
CERROR("could not pack reply, err %d\n", rc);
RETURN(rc);
}
static inline
-int gss_svc_payload(struct gss_svc_reqctx *grctx, int msgsize, int privacy)
+int gss_svc_payload(struct gss_svc_reqctx *grctx, int early,
+ int msgsize, int privacy)
{
- if (gss_svc_reqctx_is_special(grctx))
+ /* an early reply should be treated normally, but it actually shares
+ * the same ctx with the original request, so in this case we should
+ * ignore the special ctx's special flags */
+ if (early == 0 && gss_svc_reqctx_is_special(grctx))
return grctx->src_reserve_len;
- return gss_estimate_payload(NULL, msgsize, privacy);
+ return gss_mech_payload(NULL, msgsize, privacy);
}
int gss_svc_alloc_rs(struct ptlrpc_request *req, int msglen)
{
struct gss_svc_reqctx *grctx;
struct ptlrpc_reply_state *rs;
- int privacy, svc, bsd_off = 0;
+ int early, privacy, svc, bsd_off = 0;
int ibuflens[2], ibufcnt = 0;
int buflens[4], bufcnt;
int txtsize, wmsg_size, rs_size;
}
svc = RPC_FLVR_SVC(req->rq_flvr.sf_rpc);
+ early = (req->rq_packed_final == 0);
grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx);
- if (gss_svc_reqctx_is_special(grctx))
+ if (!early && gss_svc_reqctx_is_special(grctx))
privacy = 0;
else
privacy = (svc == SPTLRPC_SVC_PRIV);
/* wrapper buffer */
bufcnt = 3;
buflens[0] = PTLRPC_GSS_HEADER_SIZE;
- buflens[1] = gss_svc_payload(grctx, buflens[0], 0);
- buflens[2] = gss_svc_payload(grctx, txtsize, 1);
+ buflens[1] = gss_svc_payload(grctx, early, buflens[0], 0);
+ buflens[2] = gss_svc_payload(grctx, early, txtsize, 1);
} else {
bufcnt = 2;
buflens[0] = PTLRPC_GSS_HEADER_SIZE;
bufcnt++;
}
- if (gss_svc_reqctx_is_special(grctx) ||
+ if ((!early && gss_svc_reqctx_is_special(grctx)) ||
svc != SPTLRPC_SVC_NULL)
- buflens[bufcnt++] = gss_svc_payload(grctx, txtsize, 0);
+ buflens[bufcnt++] = gss_svc_payload(grctx, early,
+ txtsize, 0);
}
wmsg_size = lustre_msg_size_v2(bufcnt, buflens);
msgobj.data = (__u8 *) rs->rs_repbuf;
/* allocate temporary cipher buffer */
- cipher_buflen = gss_estimate_payload(gctx->gsc_mechctx, msglen, 1);
+ cipher_buflen = gss_mech_payload(gctx->gsc_mechctx, msglen, 1);
OBD_ALLOC(cipher_buf, cipher_buflen);
if (!cipher_buf)
RETURN(-ENOMEM);
/* we are about to override data at rs->rs_repbuf, nullify pointers
* to which to catch further illegal usage. */
- grctx->src_repbsd = NULL;
- grctx->src_repbsd_size = 0;
+ if (req->rq_pack_bulk) {
+ grctx->src_repbsd = NULL;
+ grctx->src_repbsd_size = 0;
+ }
/* now the real wire data */
buflens[0] = PTLRPC_GSS_HEADER_SIZE;
- buflens[1] = gss_estimate_payload(gctx->gsc_mechctx, buflens[0], 0);
+ buflens[1] = gss_mech_payload(gctx->gsc_mechctx, buflens[0], 0);
buflens[2] = cipher_obj.len;
LASSERT(lustre_msg_size_v2(3, buflens) <= rs->rs_repbuf_len);
rs->rs_repdata_len = lustre_shrink_msg(rs->rs_repbuf, 2,
cipher_obj.len, 0);
+ /* reply offset */
+ if (likely(req->rq_packed_final))
+ req->rq_reply_off = gss_at_reply_off_priv;
+ else
+ req->rq_reply_off = 0;
+
/* to catch upper layer's further access */
rs->rs_msg = NULL;
req->rq_repmsg = NULL;
{
struct ptlrpc_reply_state *rs = req->rq_reply_state;
struct gss_svc_reqctx *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx);
- struct gss_wire_ctx *gw;
- int rc;
+ struct gss_wire_ctx *gw = &grctx->src_wirectx;
+ int early, rc;
ENTRY;
- if (gss_svc_reqctx_is_special(grctx))
+ early = (req->rq_packed_final == 0);
+
+ if (!early && gss_svc_reqctx_is_special(grctx)) {
+ LASSERT(rs->rs_repdata_len != 0);
+
+ req->rq_reply_off = gss_at_reply_off_integ;
RETURN(0);
+ }
- gw = &grctx->src_wirectx;
- if (gw->gw_proc != PTLRPC_GSS_PROC_DATA &&
+ /* early reply could happen in many cases */
+ if (!early &&
+ gw->gw_proc != PTLRPC_GSS_PROC_DATA &&
gw->gw_proc != PTLRPC_GSS_PROC_DESTROY) {
CERROR("proc %d not support\n", gw->gw_proc);
RETURN(-EINVAL);
LASSERT(rs->rs_svc_ctx);
grctx = container_of(rs->rs_svc_ctx, struct gss_svc_reqctx, src_base);
- /* paranoid, maybe not necessary */
- grctx->src_reqbsd = NULL;
- grctx->src_repbsd = NULL;
-
gss_svc_reqctx_decref(grctx);
rs->rs_svc_ctx = NULL;
return -ENOMEM;
}
+static void gss_init_at_reply_offset(void)
+{
+ int buflens[3], clearsize;
+
+ buflens[0] = PTLRPC_GSS_HEADER_SIZE;
+ buflens[1] = lustre_msg_early_size();
+ buflens[2] = gss_cli_payload(NULL, buflens[1], 0);
+ gss_at_reply_off_integ = lustre_msg_size_v2(3, buflens);
+
+ buflens[0] = lustre_msg_early_size();
+ clearsize = lustre_msg_size_v2(1, buflens);
+ buflens[0] = PTLRPC_GSS_HEADER_SIZE;
+ buflens[1] = gss_cli_payload(NULL, clearsize, 0);
+ buflens[2] = gss_cli_payload(NULL, clearsize, 1);
+ gss_at_reply_off_priv = lustre_msg_size_v2(3, buflens);
+}
+
int __init sptlrpc_gss_init(void)
{
int rc;
goto out_keyring;
#endif
+ gss_init_at_reply_offset();
+
return 0;
#ifdef HAVE_GSS_PIPEFS
*/
void ptlrpc_invalidate_import(struct obd_import *imp)
{
+ struct list_head *tmp, *n;
+ struct ptlrpc_request *req;
struct l_wait_info lwi;
int rc;
LASSERT(imp->imp_invalid);
- /* wait for all requests to error out and call completion callbacks */
- lwi = LWI_TIMEOUT_INTERVAL(cfs_timeout_cap(cfs_time_seconds(obd_timeout)),
- HZ, NULL, NULL);
+ /* wait for all requests to error out and call completion callbacks.
+ Cap it at obd_timeout -- these should all have been locally
+ cancelled by ptlrpc_abort_inflight. */
+ lwi = LWI_TIMEOUT_INTERVAL(
+ cfs_timeout_cap(cfs_time_seconds(obd_timeout)),
+ cfs_time_seconds(1), NULL, NULL);
rc = l_wait_event(imp->imp_recovery_waitq,
(atomic_read(&imp->imp_inflight) == 0), &lwi);
if (rc) {
- struct list_head *tmp, *n;
- struct ptlrpc_request *req;
-
CERROR("%s: rc = %d waiting for callback (%d != 0)\n",
- obd2cli_tgt(imp->imp_obd), rc,
- atomic_read(&imp->imp_inflight));
+ obd2cli_tgt(imp->imp_obd), rc,
+ atomic_read(&imp->imp_inflight));
spin_lock(&imp->imp_lock);
list_for_each_safe(tmp, n, &imp->imp_sending_list) {
req = list_entry(tmp, struct ptlrpc_request, rq_list);
{
struct obd_import_conn *imp_conn = NULL, *conn;
struct obd_export *dlmexp;
+ int tried_all = 1;
ENTRY;
spin_lock(&imp->imp_lock);
imp->imp_obd->obd_name,
libcfs_nid2str(conn->oic_conn->c_peer.nid),
conn->oic_last_attempt);
- /* Throttle the reconnect rate to once per RECONNECT_INTERVAL */
- if (cfs_time_before_64(conn->oic_last_attempt +
- RECONNECT_INTERVAL * HZ,
- cfs_time_current_64())) {
- /* If we have never tried this connection since the
- the last successful attempt, go with this one */
- if (cfs_time_beforeq_64(conn->oic_last_attempt,
- imp->imp_last_success_conn)) {
- imp_conn = conn;
- break;
- }
+ /* Don't thrash connections */
+ if (cfs_time_before_64(cfs_time_current_64(),
+ conn->oic_last_attempt +
+ cfs_time_seconds(CONNECTION_SWITCH_MIN))) {
+ continue;
+ }
- /* Both of these connections have already been tried
- since the last successful connection; just choose the
- least recently used */
- if (!imp_conn)
- imp_conn = conn;
- else if (cfs_time_before_64(conn->oic_last_attempt,
- imp_conn->oic_last_attempt))
- imp_conn = conn;
+ /* If we have not tried this connection since the
+ the last successful attempt, go with this one */
+ if ((conn->oic_last_attempt == 0) ||
+ cfs_time_beforeq_64(conn->oic_last_attempt,
+ imp->imp_last_success_conn)) {
+ imp_conn = conn;
+ tried_all = 0;
+ break;
}
+
+ /* If all of the connections have already been tried
+ since the last successful connection; just choose the
+ least recently used */
+ if (!imp_conn)
+ imp_conn = conn;
+ else if (cfs_time_before_64(conn->oic_last_attempt,
+ imp_conn->oic_last_attempt))
+ imp_conn = conn;
}
/* if not found, simply choose the current one */
if (!imp_conn) {
LASSERT(imp->imp_conn_current);
imp_conn = imp->imp_conn_current;
+ tried_all = 0;
}
LASSERT(imp_conn->oic_conn);
+ /* If we've tried everything, and we're back to the beginning of the
+ list, increase our timeout and try again. It will be reset when
+ we do finally connect. (FIXME: really we should wait for all network
+ state associated with the last connection attempt to drain before
+ trying to reconnect on it.) */
+ if (tried_all && (imp->imp_conn_list.next == &imp_conn->oic_item) &&
+ !imp->imp_recon_bk /* not retrying */) {
+ if (at_get(&imp->imp_at.iat_net_latency) <
+ CONNECTION_SWITCH_MAX) {
+ at_add(&imp->imp_at.iat_net_latency,
+ at_get(&imp->imp_at.iat_net_latency) +
+ CONNECTION_SWITCH_INC);
+ }
+ LASSERT(imp_conn->oic_last_attempt);
+ CWARN("%s: tried all connections, increasing latency to %ds\n",
+ imp->imp_obd->obd_name,
+ at_get(&imp->imp_at.iat_net_latency));
+ }
+
imp_conn->oic_last_attempt = cfs_time_current_64();
/* switch connection, don't mind if it's same as the current one */
/* Reset connect flags to the originally requested flags, in case
* the server is updated on-the-fly we will get the new features. */
imp->imp_connect_data.ocd_connect_flags = imp->imp_connect_flags_orig;
+ imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT;
+
rc = obd_reconnect(NULL, imp->imp_obd->obd_self_export, obd,
&obd->obd_uuid, &imp->imp_connect_data);
if (rc)
spin_lock(&imp->imp_lock);
imp->imp_replayable = 1;
spin_unlock(&imp->imp_lock);
- /* On an initial connect, we don't know which one of a
- failover server pair is up. Don't wait long. */
-#ifdef CRAY_XT3
- request->rq_timeout = max((int)(obd_timeout / 2), 5);
-#else
- request->rq_timeout = max((int)(obd_timeout / 20), 5);
-#endif
lustre_msg_add_op_flags(request->rq_reqmsg,
MSG_CONNECT_INITIAL);
+ if (AT_OFF)
+ /* AT will use INITIAL_CONNECT_TIMEOUT the first
+ time, adaptive after that. */
+ request->rq_timeout = INITIAL_CONNECT_TIMEOUT;
}
if (set_transno)
}
if (imp->imp_invalid) {
+ CDEBUG(D_HA, "%s: reconnected but import is invalid; "
+ "marking evicted\n", imp->imp_obd->obd_name);
IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED);
} else if (MSG_CONNECT_RECOVERING & msg_flags) {
CDEBUG(D_HA, "%s: reconnected to %s during replay\n",
imp->imp_last_replay_transno = 0;
IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY);
} else {
+ DEBUG_REQ(D_HA, request, "%s: evicting (reconnect/recover flags"
+ " not set: %x)", imp->imp_obd->obd_name, msg_flags);
imp->imp_remote_handle =
*lustre_msg_get_handle(request->rq_repmsg);
IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED);
imp->imp_obd->obd_namespace->ns_orig_connect_flags =
ocd->ocd_connect_flags;
+ if ((ocd->ocd_connect_flags & OBD_CONNECT_AT) &&
+ (imp->imp_msg_magic == LUSTRE_MSG_MAGIC_V2))
+ /* We need a per-message support flag, because
+ a. we don't know if the incoming connect reply
+ supports AT or not (in reply_in_callback)
+ until we unpack it.
+ b. failovered server means export and flags are gone
+ (in ptlrpc_send_reply).
+ Can only be set when we know AT is supported at
+ both ends */
+ imp->imp_msghdr_flags |= MSGHDR_AT_SUPPORT;
+ else
+ imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT;
+
LASSERT((cli->cl_max_pages_per_rpc <= PTLRPC_MAX_BRW_PAGES) &&
(cli->cl_max_pages_per_rpc > 0));
}
if (ptlrpc_import_in_recovery(imp)) {
struct l_wait_info lwi;
cfs_duration_t timeout;
- if (imp->imp_server_timeout)
- timeout = cfs_time_seconds(obd_timeout / 2);
- else
- timeout = cfs_time_seconds(obd_timeout);
-
- timeout = MAX(timeout * HZ, 1);
+
+
+ if (AT_OFF) {
+ if (imp->imp_server_timeout)
+ timeout = cfs_time_seconds(obd_timeout / 2);
+ else
+ timeout = cfs_time_seconds(obd_timeout);
+ } else {
+ int idx = import_at_get_index(imp,
+ imp->imp_client->cli_request_portal);
+ timeout = cfs_time_seconds(
+ at_get(&imp->imp_at.iat_service_estimate[idx]));
+ }
lwi = LWI_TIMEOUT_INTR(cfs_timeout_cap(timeout),
back_to_sleep, LWI_ON_SIGNAL_NOOP, NULL);
* it fails. We can get through the above with a down server
* if the client doesn't know the server is gone yet. */
req->rq_no_resend = 1;
-#ifdef CRAY_XT3
- req->rq_timeout = obd_timeout / 3;
+
+#ifndef CRAY_XT3
+ /* We want client umounts to happen quickly, no matter the
+ server state... */
+ req->rq_timeout = min_t(int, req->rq_timeout,
+ INITIAL_CONNECT_TIMEOUT);
#else
- req->rq_timeout = 5;
+ /* ... but we always want liblustre clients to nicely
+ disconnect, so only use the adaptive value. */
+ if (AT_OFF)
+ req->rq_timeout = obd_timeout / 3;
#endif
+
IMPORT_SET_STATE(imp, LUSTRE_IMP_CONNECTING);
req->rq_send_state = LUSTRE_IMP_CONNECTING;
ptlrpc_request_set_replen(req);
IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CLOSED);
memset(&imp->imp_remote_handle, 0, sizeof(imp->imp_remote_handle));
imp->imp_conn_cnt = 0;
+ /* Try all connections in the future - bz 12758 */
imp->imp_last_recon = 0;
spin_unlock(&imp->imp_lock);
RETURN(rc);
}
+
+/* Adaptive Timeout utils */
+extern unsigned int at_min, at_max, at_history;
+
+/* Bin into timeslices using AT_BINS bins.
+ This gives us a max over the last binlimit*AT_BINS secs without the storage,
+ but still smoothing out a return to normalcy from a slow response.
+ (E.g. remember the maximum latency in each minute of the last 4 minutes.) */
+int at_add(struct adaptive_timeout *at, unsigned int val)
+{
+ unsigned int old = at->at_current;
+ time_t now = cfs_time_current_sec();
+ time_t binlimit = max_t(time_t, at_history / AT_BINS, 1);
+
+ LASSERT(at);
+#if 0
+ CDEBUG(D_INFO, "add %u to %p time=%lu v=%u (%u %u %u %u)\n",
+ val, at, now - at->at_binstart, at->at_current,
+ at->at_hist[0], at->at_hist[1], at->at_hist[2], at->at_hist[3]);
+#endif
+ if (val == 0)
+ /* 0's don't count, because we never want our timeout to
+ drop to 0, and because 0 could mean an error */
+ return 0;
+
+ spin_lock(&at->at_lock);
+
+ if (unlikely(at->at_binstart == 0)) {
+ /* Special case to remove default from history */
+ at->at_current = val;
+ at->at_worst_ever = val;
+ at->at_worst_time = now;
+ at->at_hist[0] = val;
+ at->at_binstart = now;
+ } else if (now - at->at_binstart < binlimit ) {
+ /* in bin 0 */
+ at->at_hist[0] = max(val, at->at_hist[0]);
+ at->at_current = max(val, at->at_current);
+ } else {
+ int i, shift;
+ unsigned int maxv = val;
+ /* move bins over */
+ shift = (now - at->at_binstart) / binlimit;
+ LASSERT(shift > 0);
+ for(i = AT_BINS - 1; i >= 0; i--) {
+ if (i >= shift) {
+ at->at_hist[i] = at->at_hist[i - shift];
+ maxv = max(maxv, at->at_hist[i]);
+ } else {
+ at->at_hist[i] = 0;
+ }
+ }
+ at->at_hist[0] = val;
+ at->at_current = maxv;
+ at->at_binstart += shift * binlimit;
+ }
+
+ if (at->at_current > at->at_worst_ever) {
+ at->at_worst_ever = at->at_current;
+ at->at_worst_time = now;
+ }
+
+ if (at->at_flags & AT_FLG_NOHIST)
+ /* Only keep the last reported val; the rest of the history
+ is kept for proc display only */
+ at->at_current = val;
+
+ if (at_max > 0)
+ at->at_current = min(at->at_current, at_max);
+ at->at_current = max(at->at_current, at_min);
+
+#if 0
+ if (at->at_current != old)
+ CDEBUG(D_ADAPTTO, "AT %p change: old=%u new=%u delta=%d "
+ "(val=%u) hist %u %u %u %u\n", at,
+ old, at->at_current, at->at_current - old, val,
+ at->at_hist[0], at->at_hist[1], at->at_hist[2],
+ at->at_hist[3]);
+#endif
+
+ /* if we changed, report the old value */
+ old = (at->at_current != old) ? old : 0;
+
+ spin_unlock(&at->at_lock);
+ return old;
+}
+
+/* Find the imp_at index for a given portal; assign if space available */
+int import_at_get_index(struct obd_import *imp, int portal)
+{
+ struct imp_at *at = &imp->imp_at;
+ int i;
+
+ for (i = 0; i < IMP_AT_MAX_PORTALS; i++) {
+ if (at->iat_portal[i] == portal)
+ return i;
+ if (at->iat_portal[i] == 0)
+ /* unused */
+ break;
+ }
+
+ /* Not found in list, add it under a lock */
+ spin_lock(&imp->imp_lock);
+
+ /* Check unused under lock */
+ for (; i < IMP_AT_MAX_PORTALS; i++) {
+ if (at->iat_portal[i] == portal)
+ goto out;
+ if (at->iat_portal[i] == 0)
+ /* unused */
+ break;
+ }
+
+ /* Not enough portals? */
+ LASSERT(i < IMP_AT_MAX_PORTALS);
+
+ at->iat_portal[i] = portal;
+out:
+ spin_unlock(&imp->imp_lock);
+ return i;
+}
+
svc_counter_config, "req_qdepth", "reqs");
lprocfs_counter_init(svc_stats, PTLRPC_REQACTIVE_CNTR,
svc_counter_config, "req_active", "reqs");
+ lprocfs_counter_init(svc_stats, PTLRPC_TIMEOUT,
+ svc_counter_config, "req_timeout", "sec");
lprocfs_counter_init(svc_stats, PTLRPC_REQBUF_AVAIL_CNTR,
svc_counter_config, "reqbuf_avail", "bufs");
for (i = 0; i < EXTRA_LAST_OPC; i++) {
return srhi;
}
+/* common ost/mdt srv_request_history_print_fn */
+void target_print_req(void *seq_file, struct ptlrpc_request *req)
+{
+ /* Called holding srv_lock with irqs disabled.
+ * Print specific req contents and a newline.
+ * CAVEAT EMPTOR: check request message length before printing!!!
+ * You might have received any old crap so you must be just as
+ * careful here as the service's request parser!!! */
+ struct seq_file *sf = seq_file;
+
+ switch (req->rq_phase) {
+ case RQ_PHASE_NEW:
+ /* still awaiting a service thread's attention, or rejected
+ * because the generic request message didn't unpack */
+ seq_printf(sf, "<not swabbed>\n");
+ break;
+ case RQ_PHASE_INTERPRET:
+ /* being handled, so basic msg swabbed, and opc is valid
+ * but racing with mds_handle() */
+ case RQ_PHASE_COMPLETE:
+ /* been handled by mds_handle() reply state possibly still
+ * volatile */
+ seq_printf(sf, "opc %d\n", lustre_msg_get_opc(req->rq_reqmsg));
+ break;
+ default:
+ DEBUG_REQ(D_ERROR, req, "bad phase %d", req->rq_phase);
+ }
+}
+EXPORT_SYMBOL(target_print_req);
+
static int ptlrpc_lprocfs_svc_req_history_show(struct seq_file *s, void *iter)
{
struct ptlrpc_service *svc = s->private;
* must be just as careful as the service's request
* parser. Currently I only print stuff here I know is OK
* to look at coz it was set up in request_in_callback()!!! */
- seq_printf(s, LPD64":%s:%s:"LPD64":%d:%s ",
+ seq_printf(s, LPD64":%s:%s:x"LPD64":%d:%s:%ld:%lds(%+lds) ",
req->rq_history_seq, libcfs_nid2str(req->rq_self),
libcfs_id2str(req->rq_peer), req->rq_xid,
- req->rq_reqlen,ptlrpc_rqphase2str(req));
-
+ req->rq_reqlen, ptlrpc_rqphase2str(req),
+ req->rq_arrival_time.tv_sec,
+ req->rq_sent - req->rq_arrival_time.tv_sec,
+ req->rq_sent - req->rq_deadline);
if (svc->srv_request_history_print_fn == NULL)
seq_printf(s, "\n");
else
return 0;
}
+/* See also lprocfs_rd_timeouts */
+static int ptlrpc_lprocfs_rd_timeouts(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{
+ struct ptlrpc_service *svc = data;
+ unsigned int cur, worst;
+ time_t worstt;
+ struct dhms ts;
+ int rc = 0;
+
+ *eof = 1;
+ cur = at_get(&svc->srv_at_estimate);
+ worst = svc->srv_at_estimate.at_worst_ever;
+ worstt = svc->srv_at_estimate.at_worst_time;
+ s2dhms(&ts, cfs_time_current_sec() - worstt);
+ if (AT_OFF)
+ rc += snprintf(page + rc, count - rc,
+ "adaptive timeouts off, using obd_timeout %u\n",
+ obd_timeout);
+ rc += snprintf(page + rc, count - rc,
+ "%10s : cur %3u worst %3u (at %ld, "DHMS_FMT" ago) ",
+ "service", cur, worst, worstt,
+ DHMS_VARS(&ts));
+ rc = lprocfs_at_hist_helper(page, count, rc,
+ &svc->srv_at_estimate);
+ return rc;
+}
+
void ptlrpc_lprocfs_register_service(struct proc_dir_entry *entry,
struct ptlrpc_service *svc)
{
.write_fptr = ptlrpc_lprocfs_write_req_history_max,
.read_fptr = ptlrpc_lprocfs_read_req_history_max,
.data = svc},
+ {.name = "timeouts",
+ .read_fptr = ptlrpc_lprocfs_rd_timeouts,
+ .data = svc},
{NULL}
};
static struct file_operations req_history_fops = {
static int ptl_send_buf (lnet_handle_md_t *mdh, void *base, int len,
lnet_ack_req_t ack, struct ptlrpc_cb_id *cbid,
- struct ptlrpc_connection *conn, int portal, __u64 xid)
+ struct ptlrpc_connection *conn, int portal, __u64 xid,
+ unsigned int offset)
{
int rc;
lnet_md_t md;
RETURN (-ENOMEM);
}
- CDEBUG(D_NET, "Sending %d bytes to portal %d, xid "LPD64"\n",
- len, portal, xid);
+ CDEBUG(D_NET, "Sending %d bytes to portal %d, xid "LPD64", offset %u\n",
+ len, portal, xid, offset);
rc = LNetPut (conn->c_self, *mdh, ack,
- conn->c_peer, portal, xid, 0, 0);
+ conn->c_peer, portal, xid, offset, 0);
if (unlikely(rc != 0)) {
int rc2;
/* We're going to get an UNLINK event when I unlink below,
}
}
-int ptlrpc_send_reply (struct ptlrpc_request *req, int may_be_difficult)
+static void ptlrpc_at_set_reply(struct ptlrpc_request *req, int flags)
+{
+ struct ptlrpc_service *svc = req->rq_rqbd->rqbd_service;
+ int service_time = max_t(int, cfs_time_current_sec() -
+ req->rq_arrival_time.tv_sec, 1);
+
+ if (!(flags & PTLRPC_REPLY_EARLY) &&
+ (req->rq_type != PTL_RPC_MSG_ERR)) {
+ /* early replies and errors don't count toward our service
+ time estimate */
+ int oldse = at_add(&svc->srv_at_estimate, service_time);
+ if (oldse != 0)
+ DEBUG_REQ(D_ADAPTTO, req,
+ "svc %s changed estimate from %d to %d",
+ svc->srv_name, oldse,
+ at_get(&svc->srv_at_estimate));
+ }
+ /* Report actual service time for client latency calc */
+ lustre_msg_set_service_time(req->rq_repmsg, service_time);
+ /* Report service time estimate for future client reqs, but report 0
+ * (to be ignored by client) if it's an error reply during recovery.
+ * (bz15815) */
+ if (req->rq_type == PTL_RPC_MSG_ERR &&
+ (req->rq_export == NULL || req->rq_export->exp_obd->obd_recovering))
+ lustre_msg_set_timeout(req->rq_repmsg, 0);
+ else
+ lustre_msg_set_timeout(req->rq_repmsg,
+ at_get(&svc->srv_at_estimate));
+
+ if (req->rq_reqmsg &&
+ !(lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) {
+ CDEBUG(D_ADAPTTO, "No early reply support: flags=%#x "
+ "req_flags=%#x magic=%d:%x/%x len=%d\n",
+ flags, lustre_msg_get_flags(req->rq_reqmsg),
+ lustre_msg_is_v1(req->rq_reqmsg),
+ lustre_msg_get_magic(req->rq_reqmsg),
+ lustre_msg_get_magic(req->rq_repmsg), req->rq_replen);
+ }
+}
+
+int ptlrpc_send_reply (struct ptlrpc_request *req, int flags)
{
struct ptlrpc_service *svc = req->rq_rqbd->rqbd_service;
struct ptlrpc_reply_state *rs = req->rq_reply_state;
LASSERT (req->rq_no_reply == 0);
LASSERT (req->rq_reqbuf != NULL);
LASSERT (rs != NULL);
- LASSERT (may_be_difficult || !rs->rs_difficult);
+ LASSERT ((flags & PTLRPC_REPLY_MAYBE_DIFFICULT) || !rs->rs_difficult);
LASSERT (req->rq_repmsg != NULL);
LASSERT (req->rq_repmsg == rs->rs_msg);
LASSERT (rs->rs_cb_id.cbid_fn == reply_out_callback);
LASSERT (rs->rs_cb_id.cbid_arg == rs);
+ /* There may be no rq_export during failover */
+
if (unlikely(req->rq_export && req->rq_export->exp_obd &&
req->rq_export->exp_obd->obd_fail)) {
/* Failed obd's only send ENODEV */
target_pack_pool_reply(req);
+ ptlrpc_at_set_reply(req, flags);
+
if (req->rq_export == NULL || req->rq_export->exp_connection == NULL)
conn = ptlrpc_get_connection(req->rq_peer, req->rq_self, NULL);
else
if (unlikely(rc))
goto out;
+ req->rq_sent = cfs_time_current_sec();
+
rc = ptl_send_buf (&rs->rs_md_h, rs->rs_repbuf, rs->rs_repdata_len,
rs->rs_difficult ? LNET_ACK_REQ : LNET_NOACK_REQ,
- &rs->rs_cb_id, conn,
- svc->srv_rep_portal, req->rq_xid);
+ &rs->rs_cb_id, conn, svc->srv_rep_portal,
+ req->rq_xid, req->rq_reply_off);
out:
if (unlikely(rc != 0)) {
atomic_dec (&svc->srv_outstanding_replies);
- ptlrpc_rs_decref(rs);
+ ptlrpc_req_drop_rs(req);
}
ptlrpc_put_connection(conn);
return rc;
lustre_msg_set_type(request->rq_reqmsg, PTL_RPC_MSG_REQUEST);
lustre_msg_set_conn_cnt(request->rq_reqmsg,
request->rq_import->imp_conn_cnt);
+ lustre_msghdr_set_flags(request->rq_reqmsg,
+ request->rq_import->imp_msghdr_flags);
rc = sptlrpc_cli_wrap_request(request);
if (rc)
if (!noreply) {
LASSERT (request->rq_replen != 0);
if (request->rq_repbuf == NULL) {
+ LASSERT(request->rq_repdata == NULL);
+ LASSERT(request->rq_repmsg == NULL);
rc = sptlrpc_cli_alloc_repbuf(request,
request->rq_replen);
if (rc)
GOTO(cleanup_bulk, rc);
+ } else {
+ request->rq_repdata = NULL;
+ request->rq_repmsg = NULL;
}
rc = LNetMEAttach(request->rq_reply_portal,/*XXX FIXME bug 249*/
spin_lock(&request->rq_lock);
/* If the MD attach succeeds, there _will_ be a reply_in callback */
request->rq_receiving_reply = !noreply;
+ /* We are responsible for unlinking the reply buffer */
+ request->rq_must_unlink = !noreply;
/* Clear any flags that may be present from previous sends. */
request->rq_replied = 0;
request->rq_err = 0;
if (!noreply) {
reply_md.start = request->rq_repbuf;
reply_md.length = request->rq_repbuf_len;
- reply_md.threshold = 1;
- reply_md.options = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT;
+ /* Allow multiple early replies */
+ reply_md.threshold = LNET_MD_THRESH_INF;
+ /* Manage remote for early replies */
+ reply_md.options = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT |
+ LNET_MD_MANAGE_REMOTE;
reply_md.user_ptr = &request->rq_reply_cbid;
reply_md.eq_handle = ptlrpc_eq_h;
- rc = LNetMDAttach(reply_me_h, reply_md, LNET_UNLINK,
- &request->rq_reply_md_h);
+ /* We must see the unlink callback to unset rq_must_unlink,
+ so we can't auto-unlink */
+ rc = LNetMDAttach(reply_me_h, reply_md, LNET_RETAIN,
+ &request->rq_reply_md_h);
if (rc != 0) {
CERROR("LNetMDAttach failed: %d\n", rc);
LASSERT (rc == -ENOMEM);
OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_SEND, request->rq_timeout + 5);
- request->rq_sent = cfs_time_current_sec();
do_gettimeofday(&request->rq_arrival_time);
+ request->rq_sent = cfs_time_current_sec();
+ /* We give the server rq_timeout secs to process the req, and
+ add the network latency for our local timeout. */
+ request->rq_deadline = request->rq_sent + request->rq_timeout +
+ ptlrpc_at_get_net_latency(request);
+
ptlrpc_pinger_sending_on_import(request->rq_import);
+
+ DEBUG_REQ(D_INFO, request, "send flg=%x",
+ lustre_msg_get_flags(request->rq_reqmsg));
rc = ptl_send_buf(&request->rq_req_md_h,
request->rq_reqbuf, request->rq_reqdata_len,
LNET_NOACK_REQ, &request->rq_req_cbid,
connection,
request->rq_request_portal,
- request->rq_xid);
+ request->rq_xid, 0);
if (rc == 0) {
ptlrpc_lprocfs_rpc_sent(request);
RETURN(rc);
ptlrpc_req_finished(request);
if (noreply)
RETURN(rc);
- else
- GOTO(cleanup_me, rc);
+
cleanup_me:
/* MEUnlink is safe; the PUT didn't even get off the ground, and
* nobody apart from the PUT's target has the right nid+XID to
#include <obd_support.h>
#include <obd_class.h>
#include <lustre_net.h>
+#include <obd_cksum.h>
static inline int lustre_msg_hdr_size_v2(int count)
{
}
}
+/* early reply size */
+int lustre_msg_early_size() {
+ static int size = 0;
+ if (!size)
+ size = lustre_msg_size(LUSTRE_MSG_MAGIC_V2, 1, NULL);
+ return size;
+}
+EXPORT_SYMBOL(lustre_msg_early_size);
+
int lustre_msg_size_v2(int count, int *lengths)
{
int size;
}
int lustre_pack_reply_v2(struct ptlrpc_request *req, int count,
- int *lens, char **bufs)
+ int *lens, char **bufs, int flags)
{
struct ptlrpc_reply_state *rs;
int msg_len, rc;
LASSERT(req->rq_reply_state == NULL);
+ if ((flags & LPRFL_EARLY_REPLY) == 0)
+ req->rq_packed_final = 1;
+
msg_len = lustre_msg_size_v2(count, lens);
rc = sptlrpc_svc_alloc_rs(req, msg_len);
if (rc)
req->rq_replen = msg_len;
req->rq_reply_state = rs;
req->rq_repmsg = rs->rs_msg;
+
lustre_init_msg_v2(rs->rs_msg, count, lens, bufs);
lustre_msg_add_version(rs->rs_msg, PTLRPC_MSG_VERSION);
lustre_set_rep_swabbed(req, MSG_PTLRPC_BODY_OFF);
}
EXPORT_SYMBOL(lustre_pack_reply_v2);
-int lustre_pack_reply(struct ptlrpc_request *req, int count, int *lens,
- char **bufs)
+int lustre_pack_reply_flags(struct ptlrpc_request *req, int count, int *lens,
+ char **bufs, int flags)
{
int rc = 0;
int size[] = { sizeof(struct ptlrpc_body) };
switch (req->rq_reqmsg->lm_magic) {
case LUSTRE_MSG_MAGIC_V2:
case LUSTRE_MSG_MAGIC_V2_SWABBED:
- rc = lustre_pack_reply_v2(req, count, lens, bufs);
+ rc = lustre_pack_reply_v2(req, count, lens, bufs, flags);
break;
default:
LASSERTF(0, "incorrect message magic: %08x\n",
return rc;
}
+int lustre_pack_reply(struct ptlrpc_request *req, int count, int *lens,
+ char **bufs)
+{
+ return lustre_pack_reply_flags(req, count, lens, bufs, 0);
+}
+
void *lustre_msg_buf_v2(struct lustre_msg_v2 *m, int n, int min_size)
{
int i, offset, buflen, bufcount;
__swab32s(&m->lm_bufcount);
__swab32s(&m->lm_secflvr);
__swab32s(&m->lm_repsize);
- __swab32s(&m->lm_timeout);
- CLASSERT(offsetof(typeof(*m), lm_padding_1) != 0);
+ __swab32s(&m->lm_cksum);
+ __swab32s(&m->lm_flags);
CLASSERT(offsetof(typeof(*m), lm_padding_2) != 0);
CLASSERT(offsetof(typeof(*m), lm_padding_3) != 0);
}
return lustre_swab_buf(req->rq_repmsg, index, min_size, swabber);
}
+__u32 lustre_msghdr_get_flags(struct lustre_msg *msg)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V1:
+ case LUSTRE_MSG_MAGIC_V1_SWABBED:
+ return 0;
+ case LUSTRE_MSG_MAGIC_V2:
+ case LUSTRE_MSG_MAGIC_V2_SWABBED:
+ /* already in host endian */
+ return msg->lm_flags;
+ default:
+ LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+ return 0;
+ }
+}
+
+void lustre_msghdr_set_flags(struct lustre_msg *msg, __u32 flags)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V1:
+ return;
+ case LUSTRE_MSG_MAGIC_V2:
+ msg->lm_flags = flags;
+ return;
+ default:
+ LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+ }
+}
+
__u32 lustre_msg_get_flags(struct lustre_msg *msg)
{
switch (msg->lm_magic) {
}
}
+int lustre_msg_is_v1(struct lustre_msg *msg)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V1:
+ case LUSTRE_MSG_MAGIC_V1_SWABBED:
+ return 1;
+ default:
+ return 0;
+ }
+}
+
__u32 lustre_msg_get_magic(struct lustre_msg *msg)
{
switch (msg->lm_magic) {
}
}
+__u32 lustre_msg_get_timeout(struct lustre_msg *msg)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V1:
+ case LUSTRE_MSG_MAGIC_V1_SWABBED:
+ return 0;
+ case LUSTRE_MSG_MAGIC_V2:
+ case LUSTRE_MSG_MAGIC_V2_SWABBED: {
+ struct ptlrpc_body *pb;
+
+ pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+ if (!pb) {
+ CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+ return 0;
+
+ }
+ return pb->pb_timeout;
+ }
+ default:
+ CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+ return 0;
+ }
+}
+
+__u32 lustre_msg_get_service_time(struct lustre_msg *msg)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V1:
+ case LUSTRE_MSG_MAGIC_V1_SWABBED:
+ return 0;
+ case LUSTRE_MSG_MAGIC_V2:
+ case LUSTRE_MSG_MAGIC_V2_SWABBED: {
+ struct ptlrpc_body *pb;
+
+ pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+ if (!pb) {
+ CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+ return 0;
+
+ }
+ return pb->pb_service_time;
+ }
+ default:
+ CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+ return 0;
+ }
+}
+
+__u32 lustre_msg_get_cksum(struct lustre_msg *msg)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V1:
+ case LUSTRE_MSG_MAGIC_V1_SWABBED:
+ return 0;
+ case LUSTRE_MSG_MAGIC_V2:
+ case LUSTRE_MSG_MAGIC_V2_SWABBED:
+ return msg->lm_cksum;
+ default:
+ CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+ return 0;
+ }
+}
+
+__u32 lustre_msg_calc_cksum(struct lustre_msg *msg)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V1:
+ case LUSTRE_MSG_MAGIC_V1_SWABBED:
+ return 0;
+ case LUSTRE_MSG_MAGIC_V2:
+ case LUSTRE_MSG_MAGIC_V2_SWABBED: {
+ struct ptlrpc_body *pb;
+ pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+ LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+ return crc32_le(~(__u32)0, (char *)pb, sizeof(*pb));
+ }
+ default:
+ CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+ return 0;
+ }
+}
+
void lustre_msg_set_handle(struct lustre_msg *msg, struct lustre_handle *handle)
{
switch (msg->lm_magic) {
}
}
+void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V1:
+ return;
+ case LUSTRE_MSG_MAGIC_V2: {
+ struct ptlrpc_body *pb;
+
+ pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+ LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+ pb->pb_timeout = timeout;
+ return;
+ }
+ default:
+ LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+ }
+}
+
+void lustre_msg_set_service_time(struct lustre_msg *msg, __u32 service_time)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V1:
+ return;
+ case LUSTRE_MSG_MAGIC_V2: {
+ struct ptlrpc_body *pb;
+
+ pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb));
+ LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+ pb->pb_service_time = service_time;
+ return;
+ }
+ default:
+ LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+ }
+}
+
+void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V1:
+ return;
+ case LUSTRE_MSG_MAGIC_V2:
+ msg->lm_cksum = cksum;
+ return;
+ default:
+ LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+ }
+}
+
+
void ptlrpc_request_set_replen(struct ptlrpc_request *req)
{
int count = req_capsule_filled_sizes(&req->rq_pill, RCL_SERVER);
__swab32s (&b->pb_flags);
__swab32s (&b->pb_op_flags);
__swab32s (&b->pb_conn_cnt);
- CLASSERT(offsetof(typeof(*b), pb_padding_1) != 0);
- CLASSERT(offsetof(typeof(*b), pb_padding_2) != 0);
+ __swab32s (&b->pb_timeout);
+ __swab32s (&b->pb_service_time);
__swab32s (&b->pb_limit);
__swab64s (&b->pb_slv);
}
va_start(args, fmt);
libcfs_debug_vmsg2(data->msg_cdls, data->msg_subsys, mask, data->msg_file,
data->msg_fn, data->msg_line, fmt, args,
- " req@%p x"LPD64"/t"LPD64"("LPD64") o%d->%s@%s:%d lens"
- " %d/%d ref %d fl "REQ_FLAGS_FMT"/%x/%x rc %d/%d\n",
+ " req@%p x"LPD64"/t"LPD64"("LPD64") o%d->%s@%s:%d/%d"
+ " lens %d/%d e %d to %d dl %ld ref %d "
+ "fl "REQ_FLAGS_FMT"/%x/%x rc %d/%d\n",
req, req->rq_xid, req->rq_transno,
req->rq_reqmsg ? lustre_msg_get_transno(req->rq_reqmsg) : 0,
req->rq_reqmsg ? lustre_msg_get_opc(req->rq_reqmsg) : -1,
(char *)req->rq_import->imp_connection->c_remote_uuid.uuid :
req->rq_export ?
(char *)req->rq_export->exp_connection->c_remote_uuid.uuid : "<?>",
- (req->rq_import && req->rq_import->imp_client) ?
- req->rq_import->imp_client->cli_request_portal : -1,
- req->rq_reqlen, req->rq_replen, atomic_read(&req->rq_refcount),
- DEBUG_REQ_FLAGS(req),
+ req->rq_request_portal, req->rq_reply_portal,
+ req->rq_reqlen, req->rq_replen,
+ req->rq_early_count, req->rq_timeout, req->rq_deadline,
+ atomic_read(&req->rq_refcount), DEBUG_REQ_FLAGS(req),
req->rq_reqmsg && req_ptlrpc_body_swabbed(req) ?
lustre_msg_get_flags(req->rq_reqmsg) : -1,
req->rq_repmsg && rep_ptlrpc_body_swabbed(req) ?
imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd));
req->rq_no_resend = req->rq_no_delay = 1;
ptlrpc_request_set_replen(req);
- req->rq_timeout = PING_INTERVAL;
ptlrpcd_add_req(req);
RETURN(0);
void ptlrpc_update_next_ping(struct obd_import *imp)
{
#ifdef ENABLE_PINGER
- imp->imp_next_ping = cfs_time_shift(
- (imp->imp_state == LUSTRE_IMP_DISCON ?
- RECONNECT_INTERVAL : PING_INTERVAL));
+ int time = PING_INTERVAL;
+ if (imp->imp_state == LUSTRE_IMP_DISCON) {
+ int dtime = max_t(int, CONNECTION_SWITCH_MIN,
+ AT_OFF ? 0 :
+ at_get(&imp->imp_at.iat_net_latency));
+ time = min(time, dtime);
+ }
+ imp->imp_next_ping = cfs_time_shift(time);
#endif /* ENABLE_PINGER */
}
obd = pet_exp->exp_obd;
spin_unlock(&pet_lock);
- expire_time = cfs_time_current_sec() - (3 * obd_timeout / 2);
+ expire_time = cfs_time_current_sec() - PING_EVICT_TIMEOUT;
CDEBUG(D_HA, "evicting all exports of obd %s older than %ld\n",
obd->obd_name, expire_time);
EXPORT_SYMBOL(ptlrpc_init_rq_pool);
EXPORT_SYMBOL(ptlrpc_free_rq_pool);
EXPORT_SYMBOL(ptlrpc_prep_req_pool);
+EXPORT_SYMBOL(ptlrpc_at_set_req_timeout);
EXPORT_SYMBOL(ptlrpc_request_alloc);
EXPORT_SYMBOL(ptlrpc_request_alloc_pool);
EXPORT_SYMBOL(ptlrpc_request_free);
EXPORT_SYMBOL(lustre_msg_check_version);
EXPORT_SYMBOL(lustre_pack_request);
EXPORT_SYMBOL(lustre_pack_reply);
+EXPORT_SYMBOL(lustre_pack_reply_flags);
EXPORT_SYMBOL(lustre_shrink_msg);
EXPORT_SYMBOL(lustre_free_reply_state);
EXPORT_SYMBOL(lustre_msg_size);
EXPORT_SYMBOL(lustre_msg_set_slv);
EXPORT_SYMBOL(lustre_msg_set_limit);
EXPORT_SYMBOL(lustre_msg_get_conn_cnt);
+EXPORT_SYMBOL(lustre_msg_is_v1);
EXPORT_SYMBOL(lustre_msg_get_magic);
EXPORT_SYMBOL(lustre_msg_set_handle);
EXPORT_SYMBOL(lustre_msg_set_type);
break;
}
- /* XXX FIXME bug 249, 5515 */
+ /* bug 5515 */
request->rq_request_portal = LDLM_CANCEL_REQUEST_PORTAL;
request->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL;
+ ptlrpc_at_set_req_timeout(request);
ptlrpc_request_set_replen(request);
mutex_down(&llcd->llcd_ctxt->loc_sem);
if (number >= SPTLRPC_POLICY_MAX)
return NULL;
-again:
- read_lock(&policy_lock);
- policy = policies[number];
- if (policy && !try_module_get(policy->sp_owner))
- policy = NULL;
- if (policy == NULL)
- flag = atomic_read(&loaded);
- read_unlock(&policy_lock);
-
- /* if failure, try to load gss module, once */
- if (unlikely(policy == NULL) && flag == 0 &&
- number == SPTLRPC_POLICY_GSS) {
+ while (1) {
+ read_lock(&policy_lock);
+ policy = policies[number];
+ if (policy && !try_module_get(policy->sp_owner))
+ policy = NULL;
+ if (policy == NULL)
+ flag = atomic_read(&loaded);
+ read_unlock(&policy_lock);
+
+ if (policy != NULL || flag != 0 ||
+ number != SPTLRPC_POLICY_GSS)
+ break;
+
+ /* try to load gss module, once */
mutex_down(&load_mutex);
if (atomic_read(&loaded) == 0) {
- if (request_module("ptlrpc_gss") != 0)
- CERROR("Unable to load module ptlrpc_gss\n");
- else
+ if (request_module("ptlrpc_gss") == 0)
CWARN("module ptlrpc_gss loaded on demand\n");
+ else
+ CERROR("Unable to load module ptlrpc_gss\n");
atomic_set(&loaded, 1);
}
mutex_up(&load_mutex);
-
- goto again;
}
return policy;
return SPTLRPC_FLVR_PLAIN;
if (!strcmp(name, "krb5n"))
return SPTLRPC_FLVR_KRB5N;
+ if (!strcmp(name, "krb5a"))
+ return SPTLRPC_FLVR_KRB5A;
if (!strcmp(name, "krb5i"))
return SPTLRPC_FLVR_KRB5I;
if (!strcmp(name, "krb5p"))
RETURN(rc);
}
-/*
- * rq_nob_received is the actual received data length
- */
-int sptlrpc_cli_unwrap_reply(struct ptlrpc_request *req)
+static int do_cli_unwrap_reply(struct ptlrpc_request *req)
{
struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx;
int rc;
LASSERT(ctx);
LASSERT(ctx->cc_sec);
- LASSERT(ctx->cc_ops);
LASSERT(req->rq_repbuf);
+ LASSERT(req->rq_repdata);
+ LASSERT(req->rq_repmsg == NULL);
- req->rq_repdata_len = req->rq_nob_received;
-
- if (req->rq_nob_received < sizeof(struct lustre_msg)) {
+ if (req->rq_repdata_len < sizeof(struct lustre_msg)) {
CERROR("replied data length %d too small\n",
- req->rq_nob_received);
+ req->rq_repdata_len);
RETURN(-EPROTO);
}
+ /* v2 message, check request/reply policy match */
+ rpc_flvr = WIRE_FLVR_RPC(req->rq_repdata->lm_secflvr);
- /*
- * v2 message, check request/reply policy match
- */
- rpc_flvr = WIRE_FLVR_RPC(req->rq_repbuf->lm_secflvr);
-
- if (req->rq_repbuf->lm_magic == LUSTRE_MSG_MAGIC_V2_SWABBED)
+ if (req->rq_repdata->lm_magic == LUSTRE_MSG_MAGIC_V2_SWABBED)
__swab16s(&rpc_flvr);
if (RPC_FLVR_POLICY(rpc_flvr) !=
- RPC_FLVR_POLICY(req->rq_flvr.sf_rpc)) {
+ RPC_FLVR_POLICY(req->rq_flvr.sf_rpc)) {
CERROR("request policy was %u while reply with %u\n",
- RPC_FLVR_POLICY(req->rq_flvr.sf_rpc),
- RPC_FLVR_POLICY(rpc_flvr));
+ RPC_FLVR_POLICY(req->rq_flvr.sf_rpc),
+ RPC_FLVR_POLICY(rpc_flvr));
RETURN(-EPROTO);
}
/* do nothing if it's null policy; otherwise unpack the
- * wrapper message
- */
+ * wrapper message */
if (RPC_FLVR_POLICY(rpc_flvr) != SPTLRPC_POLICY_NULL &&
- lustre_unpack_msg(req->rq_repbuf, req->rq_nob_received))
+ lustre_unpack_msg(req->rq_repdata, req->rq_repdata_len))
RETURN(-EPROTO);
switch (RPC_FLVR_SVC(req->rq_flvr.sf_rpc)) {
RETURN(rc);
}
+/*
+ * upon this be called, the reply buffer should have been un-posted,
+ * so nothing is going to change.
+ */
+int sptlrpc_cli_unwrap_reply(struct ptlrpc_request *req)
+{
+ LASSERT(req->rq_repbuf);
+ LASSERT(req->rq_repdata == NULL);
+ LASSERT(req->rq_repmsg == NULL);
+ LASSERT(req->rq_reply_off + req->rq_nob_received <= req->rq_repbuf_len);
+
+ if (req->rq_reply_off == 0) {
+ CERROR("real reply with offset 0\n");
+ return -EPROTO;
+ }
+
+ if (req->rq_reply_off % 8 != 0) {
+ CERROR("reply at odd offset %u\n", req->rq_reply_off);
+ return -EPROTO;
+ }
+
+ req->rq_repdata = (struct lustre_msg *)
+ (req->rq_repbuf + req->rq_reply_off);
+ req->rq_repdata_len = req->rq_nob_received;
+
+ return do_cli_unwrap_reply(req);
+}
+
+/*
+ * When called, the receive buffer might still be posted, so the reply data
+ * might change at any time, whether or not we're holding rq_lock. We
+ * expect the rq_reply_off be 0, rq_nob_received is the early reply size.
+ *
+ * we allocate a separate buffer to hold early reply data, pointed by
+ * rq_repdata, rq_repdata_len is the early reply size, and round up to power2
+ * is the actual buffer size.
+ *
+ * caller _must_ call sptlrpc_cli_finish_early_reply() after this, before
+ * processing another early reply or real reply, to restore ptlrpc_request
+ * to normal status.
+ */
+int sptlrpc_cli_unwrap_early_reply(struct ptlrpc_request *req)
+{
+ struct lustre_msg *early_buf;
+ int early_bufsz, early_size;
+ int rc;
+ ENTRY;
+
+ LASSERT(req->rq_repbuf);
+ LASSERT(req->rq_repdata == NULL);
+ LASSERT(req->rq_repmsg == NULL);
+
+ early_size = req->rq_nob_received;
+ if (early_size < sizeof(struct lustre_msg)) {
+ CERROR("early reply length %d too small\n", early_size);
+ RETURN(-EPROTO);
+ }
+
+ early_bufsz = size_roundup_power2(early_size);
+ OBD_ALLOC(early_buf, early_bufsz);
+ if (early_buf == NULL)
+ RETURN(-ENOMEM);
+
+ /* copy data out, do it inside spinlock */
+ spin_lock(&req->rq_lock);
+
+ if (req->rq_replied) {
+ spin_unlock(&req->rq_lock);
+ GOTO(err_free, rc = -EALREADY);
+ }
+
+ if (req->rq_reply_off != 0) {
+ CERROR("early reply with offset %u\n", req->rq_reply_off);
+ GOTO(err_free, rc = -EPROTO);
+ }
+
+ if (req->rq_nob_received != early_size) {
+ /* even if another early reply arrived, the size should be the same */
+ CWARN("data size has changed from %u to %u\n",
+ early_size, req->rq_nob_received);
+ spin_unlock(&req->rq_lock);
+ GOTO(err_free, rc = -EINVAL);
+ }
+
+ if (req->rq_nob_received < sizeof(struct lustre_msg)) {
+ CERROR("early reply length %d too small\n",
+ req->rq_nob_received);
+ spin_unlock(&req->rq_lock);
+ GOTO(err_free, rc = -EALREADY);
+ }
+
+ memcpy(early_buf, req->rq_repbuf, early_size);
+ spin_unlock(&req->rq_lock);
+
+ req->rq_repdata = early_buf;
+ req->rq_repdata_len = early_size;
+
+ rc = do_cli_unwrap_reply(req);
+
+ /* treat resend as an error case. in fact the server should never ask
+ * for a resend via early reply. */
+ if (req->rq_resend) {
+ req->rq_resend = 0;
+ rc = -EPROTO;
+ }
+
+ if (rc) {
+ LASSERT(req->rq_repmsg == NULL);
+ req->rq_repdata = NULL;
+ req->rq_repdata_len = 0;
+ GOTO(err_free, rc);
+ }
+
+ LASSERT(req->rq_repmsg);
+ RETURN(0);
+
+err_free:
+ OBD_FREE(early_buf, early_bufsz);
+ RETURN(rc);
+}
+
+int sptlrpc_cli_finish_early_reply(struct ptlrpc_request *req)
+{
+ int early_bufsz;
+
+ LASSERT(req->rq_repdata);
+ LASSERT(req->rq_repdata_len);
+ LASSERT(req->rq_repmsg);
+
+ early_bufsz = size_roundup_power2(req->rq_repdata_len);
+ OBD_FREE(req->rq_repdata, early_bufsz);
+
+ req->rq_repdata = NULL;
+ req->rq_repdata_len = 0;
+ req->rq_repmsg = NULL;
+ return 0;
+}
+
/**************************************************
* sec ID *
**************************************************/
#define IDLE_IDX_MAX (100)
#define IDLE_IDX_WEIGHT (3)
-#define CACHE_QUIESCENCE_PERIOD (20)
+#define CACHE_QUIESCENT_PERIOD (20)
static struct ptlrpc_enc_page_pool {
/*
/*
* statistics
*/
+ unsigned long epp_st_max_pages; /* # of pages ever reached */
unsigned int epp_st_grows; /* # of grows */
unsigned int epp_st_grow_fails; /* # of add pages failures */
unsigned int epp_st_shrinks; /* # of shrinks */
unsigned long epp_st_access; /* # of access */
unsigned long epp_st_missings; /* # of cache missing */
unsigned long epp_st_lowfree; /* lowest free pages reached */
- unsigned long epp_st_max_wqlen; /* highest waitqueue length */
+ unsigned int epp_st_max_wqlen; /* highest waitqueue length */
cfs_time_t epp_st_max_wait; /* in jeffies */
/*
* pointers to pools
"idle index: %lu/100\n"
"last shrink: %lds\n"
"last access: %lds\n"
+ "max pages reached: %lu\n"
"grows: %u\n"
"grows failure: %u\n"
"shrinks: %u\n"
"cache access: %lu\n"
"cache missing: %lu\n"
"low free mark: %lu\n"
- "max waitqueue depth: %lu\n"
+ "max waitqueue depth: %u\n"
"max wait time: "CFS_TIME_T"/%u\n"
,
num_physpages,
page_pools.epp_idle_idx,
cfs_time_current_sec() - page_pools.epp_last_shrink,
cfs_time_current_sec() - page_pools.epp_last_access,
+ page_pools.epp_st_max_pages,
page_pools.epp_st_grows,
page_pools.epp_st_grow_fails,
page_pools.epp_st_shrinks,
static void enc_pools_release_free_pages(long npages)
{
int p_idx, g_idx;
+ int p_idx_max1, p_idx_max2;
+ LASSERT(npages > 0);
LASSERT(npages <= page_pools.epp_free_pages);
+ LASSERT(page_pools.epp_free_pages <= page_pools.epp_total_pages);
- p_idx = (page_pools.epp_free_pages - 1) / PAGES_PER_POOL;
- g_idx = (page_pools.epp_free_pages - 1) % PAGES_PER_POOL;
- LASSERT(page_pools.epp_pools[p_idx]);
+ /* max pool index before the release */
+ p_idx_max2 = (page_pools.epp_total_pages - 1) / PAGES_PER_POOL;
page_pools.epp_free_pages -= npages;
page_pools.epp_total_pages -= npages;
- while (npages-- > 0) {
+ /* max pool index after the release */
+ p_idx_max1 = page_pools.epp_total_pages == 0 ? 0 :
+ ((page_pools.epp_total_pages - 1) / PAGES_PER_POOL);
+
+ p_idx = page_pools.epp_free_pages / PAGES_PER_POOL;
+ g_idx = page_pools.epp_free_pages % PAGES_PER_POOL;
+ LASSERT(page_pools.epp_pools[p_idx]);
+
+ while (npages--) {
+ LASSERT(page_pools.epp_pools[p_idx]);
LASSERT(page_pools.epp_pools[p_idx][g_idx] != NULL);
cfs_free_page(page_pools.epp_pools[p_idx][g_idx]);
page_pools.epp_pools[p_idx][g_idx] = NULL;
- if (g_idx-- == 0) {
- p_idx--;
- g_idx = PAGES_PER_POOL - 1;
-
- LASSERT(page_pools.epp_pools[p_idx]);
+ if (++g_idx == PAGES_PER_POOL) {
+ p_idx++;
+ g_idx = 0;
}
+ };
+
+ /* free unused pools */
+ while (p_idx_max1 < p_idx_max2) {
+ LASSERT(page_pools.epp_pools[p_idx_max2]);
+ OBD_FREE(page_pools.epp_pools[p_idx_max2], CFS_PAGE_SIZE);
+ page_pools.epp_pools[p_idx_max2] = NULL;
+ p_idx_max2--;
}
}
spin_lock(&page_pools.epp_lock);
- if (nr_to_scan) {
- if (nr_to_scan > page_pools.epp_free_pages)
- nr_to_scan = page_pools.epp_free_pages;
+ if (nr_to_scan > page_pools.epp_free_pages)
+ nr_to_scan = page_pools.epp_free_pages;
+ if (nr_to_scan > 0) {
enc_pools_release_free_pages(nr_to_scan);
CDEBUG(D_SEC, "released %d pages, %ld left\n",
nr_to_scan, page_pools.epp_free_pages);
* if no pool access for a long time, we consider it's fully idle
*/
if (cfs_time_current_sec() - page_pools.epp_last_access >
- CACHE_QUIESCENCE_PERIOD)
+ CACHE_QUIESCENT_PERIOD)
page_pools.epp_idle_idx = IDLE_IDX_MAX;
LASSERT(page_pools.epp_idle_idx <= IDLE_IDX_MAX);
* (1) fill all the free slots of current pools.
*/
/* free slots are those left by rent pages, and the extra ones with
- * index >= eep_total_pages, locate at the tail of last pool. */
+ * index >= total_pages, locate at the tail of last pool. */
freeslot = page_pools.epp_total_pages % PAGES_PER_POOL;
if (freeslot != 0)
freeslot = PAGES_PER_POOL - freeslot;
page_pools.epp_free_pages += npages;
page_pools.epp_st_lowfree = page_pools.epp_free_pages;
+ if (page_pools.epp_total_pages > page_pools.epp_st_max_pages)
+ page_pools.epp_st_max_pages = page_pools.epp_total_pages;
+
CDEBUG(D_SEC, "add %d pages to total %lu\n", npages,
page_pools.epp_total_pages);
page_pools.epp_total_pages = 0;
page_pools.epp_free_pages = 0;
+ page_pools.epp_st_max_pages = 0;
page_pools.epp_st_grows = 0;
page_pools.epp_st_grow_fails = 0;
page_pools.epp_st_shrinks = 0;
LASSERT(cleaned == page_pools.epp_total_pages);
enc_pools_free();
+
+ if (page_pools.epp_st_access > 0) {
+ CWARN("max pages %lu, grows %u, grow fails %u, shrinks %u, "
+ "access %lu, missing %lu, max qlen %u, max wait "
+ CFS_TIME_T"/%d\n",
+ page_pools.epp_st_max_pages, page_pools.epp_st_grows,
+ page_pools.epp_st_grow_fails,
+ page_pools.epp_st_shrinks, page_pools.epp_st_access,
+ page_pools.epp_st_missings, page_pools.epp_st_max_wqlen,
+ page_pools.epp_st_max_wait, HZ);
+ }
}
#else /* !__KERNEL__ */
#endif
#include <obd_support.h>
+#include <obd_cksum.h>
#include <obd_class.h>
#include <lustre_net.h>
#include <lustre_sec.h>
}
}
-static
-int null_ctx_refresh(struct ptlrpc_cli_ctx *ctx)
+static int null_ctx_refresh(struct ptlrpc_cli_ctx *ctx)
{
/* should never reach here */
LBUG();
static
int null_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req)
{
- req->rq_repmsg = req->rq_repbuf;
+ __u32 cksums, cksumc;
+
+ LASSERT(req->rq_repdata);
+
+ /* a real reply's rq_repdata points inside rq_repbuf; an early reply's
+ * rq_repdata points to a separately allocated space */
+ if ((char *) req->rq_repdata < req->rq_repbuf ||
+ (char *) req->rq_repdata >= req->rq_repbuf + req->rq_repbuf_len) {
+ cksums = req->rq_repdata->lm_cksum;
+ req->rq_repdata->lm_cksum = 0;
+
+ if (req->rq_repdata->lm_magic == LUSTRE_MSG_MAGIC_V2_SWABBED)
+ __swab32s(&cksums);
+
+ cksumc = crc32_le(!(__u32) 0, (char *) req->rq_repdata,
+ req->rq_repdata_len);
+ if (cksumc != cksums) {
+ CWARN("early reply checksum mismatch: %08x != %08x\n",
+ cksumc, cksums);
+ return -EINVAL;
+ }
+ }
+
+ req->rq_repmsg = req->rq_repdata;
req->rq_replen = req->rq_repdata_len;
return 0;
}
-static struct ptlrpc_ctx_ops null_ctx_ops = {
- .refresh = null_ctx_refresh,
- .sign = null_ctx_sign,
- .verify = null_ctx_verify,
-};
-
-static struct ptlrpc_svc_ctx null_svc_ctx = {
- .sc_refcount = ATOMIC_INIT(1),
- .sc_policy = &null_policy,
-};
-
static
struct ptlrpc_sec *null_create_sec(struct obd_import *imp,
struct ptlrpc_svc_ctx *svc_ctx,
struct ptlrpc_request *req,
int msgsize)
{
+ /* add space for early reply */
+ msgsize += lustre_msg_early_size();
+
msgsize = size_roundup_power2(msgsize);
OBD_ALLOC(req->rq_repbuf, msgsize);
void null_free_repbuf(struct ptlrpc_sec *sec,
struct ptlrpc_request *req)
{
+ LASSERT(req->rq_repbuf);
+
OBD_FREE(req->rq_repbuf, req->rq_repbuf_len);
req->rq_repbuf = NULL;
req->rq_repbuf_len = 0;
return 0;
}
+static struct ptlrpc_svc_ctx null_svc_ctx = {
+ .sc_refcount = ATOMIC_INIT(1),
+ .sc_policy = &null_policy,
+};
+
static
int null_accept(struct ptlrpc_request *req)
{
struct ptlrpc_reply_state *rs = req->rq_reply_state;
LASSERT(rs);
+
rs->rs_repbuf->lm_secflvr = SPTLRPC_FLVR_NULL;
rs->rs_repdata_len = req->rq_replen;
+
+ if (likely(req->rq_packed_final)) {
+ req->rq_reply_off = lustre_msg_early_size();
+ } else {
+ rs->rs_repbuf->lm_cksum =
+ crc32_le(!(__u32) 0, (char *) rs->rs_repbuf,
+ rs->rs_repdata_len);
+ req->rq_reply_off = 0;
+ }
+
return 0;
}
+static struct ptlrpc_ctx_ops null_ctx_ops = {
+ .refresh = null_ctx_refresh,
+ .sign = null_ctx_sign,
+ .verify = null_ctx_verify,
+};
+
static struct ptlrpc_sec_cops null_sec_cops = {
.create_sec = null_create_sec,
.destroy_sec = null_destroy_sec,
.sp_sops = &null_sec_sops,
};
-static
-void null_init_internal(void)
+static void null_init_internal(void)
{
static HLIST_HEAD(__list);
#endif
#include <obd_support.h>
+#include <obd_cksum.h>
#include <obd_class.h>
#include <lustre_net.h>
#include <lustre_sec.h>
static struct ptlrpc_ctx_ops plain_ctx_ops;
static struct ptlrpc_svc_ctx plain_svc_ctx;
+static unsigned int plain_at_offset;
+
/*
* flavor flags (maximum 8 flags)
*/
static
int plain_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req)
{
- struct lustre_msg *msg = req->rq_repbuf;
+ struct lustre_msg *msg = req->rq_repdata;
+ int early = 0;
+ __u32 cksum;
ENTRY;
if (msg->lm_bufcount != PLAIN_PACK_SEGMENTS) {
RETURN(-EPROTO);
}
+ /* find out if it's an early reply */
+ if ((char *) msg < req->rq_repbuf ||
+ (char *) msg >= req->rq_repbuf + req->rq_repbuf_len)
+ early = 1;
+
/* expect no user desc in reply */
if (PLAIN_WFLVR_HAS_USER(msg->lm_secflvr)) {
CERROR("Unexpected udesc flag in reply\n");
RETURN(-EPROTO);
}
- /* whether we sent with bulk or not, we expect the same in reply */
- if (!equi(req->rq_pack_bulk == 1,
- PLAIN_WFLVR_HAS_BULK(msg->lm_secflvr))) {
- CERROR("%s bulk checksum in reply\n",
- req->rq_pack_bulk ? "Missing" : "Unexpected");
- RETURN(-EPROTO);
- }
+ if (unlikely(early)) {
+ cksum = crc32_le(!(__u32) 0,
+ lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0),
+ lustre_msg_buflen(msg, PLAIN_PACK_MSG_OFF));
+ if (cksum != msg->lm_cksum) {
+ CWARN("early reply checksum mismatch: %08x != %08x\n",
+ cpu_to_le32(cksum), msg->lm_cksum);
+ RETURN(-EINVAL);
+ }
+ } else {
+ /* whether we sent with bulk or not, we expect the same
+ * in reply, except for early reply */
+ if (!early &&
+ !equi(req->rq_pack_bulk == 1,
+ PLAIN_WFLVR_HAS_BULK(msg->lm_secflvr))) {
+ CERROR("%s bulk checksum in reply\n",
+ req->rq_pack_bulk ? "Missing" : "Unexpected");
+ RETURN(-EPROTO);
+ }
- if (req->rq_pack_bulk &&
- bulk_sec_desc_unpack(msg, PLAIN_PACK_BULK_OFF)) {
- CERROR("Mal-formed bulk checksum reply\n");
- RETURN(-EINVAL);
+ if (PLAIN_WFLVR_HAS_BULK(msg->lm_secflvr) &&
+ bulk_sec_desc_unpack(msg, PLAIN_PACK_BULK_OFF)) {
+ CERROR("Mal-formed bulk checksum reply\n");
+ RETURN(-EINVAL);
+ }
}
req->rq_repmsg = lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0);
- req->rq_replen = msg->lm_buflens[PLAIN_PACK_MSG_OFF];
+ req->rq_replen = lustre_msg_buflen(msg, PLAIN_PACK_MSG_OFF);
RETURN(0);
}
{
LASSERT(req->rq_pack_bulk);
LASSERT(req->rq_reqbuf->lm_bufcount == PLAIN_PACK_SEGMENTS);
- LASSERT(req->rq_repbuf->lm_bufcount == PLAIN_PACK_SEGMENTS);
+ LASSERT(req->rq_repdata->lm_bufcount == PLAIN_PACK_SEGMENTS);
return bulk_csum_cli_reply(desc, req->rq_bulk_read,
req->rq_reqbuf, PLAIN_PACK_BULK_OFF,
- req->rq_repbuf, PLAIN_PACK_BULK_OFF);
+ req->rq_repdata, PLAIN_PACK_BULK_OFF);
}
/****************************************
if (req->rq_pack_bulk) {
LASSERT(req->rq_bulk_read || req->rq_bulk_write);
-
buflens[PLAIN_PACK_BULK_OFF] = bulk_sec_desc_size(
req->rq_flvr.sf_bulk_hash, 0,
req->rq_bulk_read);
}
alloc_len = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens);
+
+ /* add space for early reply */
+ alloc_len += plain_at_offset;
+
alloc_len = size_roundup_power2(alloc_len);
OBD_ALLOC(req->rq_repbuf, alloc_len);
msg->lm_secflvr |= PLAIN_WFLVR_FLAG_BULK;
rs->rs_repdata_len = len;
+
+ if (likely(req->rq_packed_final)) {
+ req->rq_reply_off = plain_at_offset;
+ } else {
+ msg->lm_cksum = crc32_le(!(__u32) 0,
+ lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0),
+ lustre_msg_buflen(msg, PLAIN_PACK_MSG_OFF));
+ req->rq_reply_off = 0;
+ }
+
RETURN(0);
}
int sptlrpc_plain_init(void)
{
+ int buflens[PLAIN_PACK_SEGMENTS] = { 0, };
int rc;
+ buflens[PLAIN_PACK_MSG_OFF] = lustre_msg_early_size();
+ plain_at_offset = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens);
+
rc = sptlrpc_register_policy(&plain_policy);
if (rc)
CERROR("failed to register: %d\n", rc);
#include <lnet/types.h>
#include "ptlrpc_internal.h"
+/* The following are visible and mutable through /sys/module/ptlrpc */
int test_req_buffer_pressure = 0;
CFS_MODULE_PARM(test_req_buffer_pressure, "i", int, 0444,
"set non-zero to put pressure on request buffer pools");
+unsigned int at_min = 0;
+CFS_MODULE_PARM(at_min, "i", int, 0644,
+ "Adaptive timeout minimum (sec)");
+unsigned int at_max = 600;
+EXPORT_SYMBOL(at_max);
+CFS_MODULE_PARM(at_max, "i", int, 0644,
+ "Adaptive timeout maximum (sec)");
+unsigned int at_history = 600;
+CFS_MODULE_PARM(at_history, "i", int, 0644,
+ "Adaptive timeouts remember the slowest event that took place "
+ "within this period (sec)");
+static int at_early_margin = 5;
+CFS_MODULE_PARM(at_early_margin, "i", int, 0644,
+ "How soon before an RPC deadline to send an early reply");
+static int at_extra = 30;
+CFS_MODULE_PARM(at_extra, "i", int, 0644,
+ "How much extra time to give with each early reply");
+
/* forward ref */
static int ptlrpc_server_post_idle_rqbds (struct ptlrpc_service *svc);
return ptlrpc_init_svc(c->psc_nbufs, c->psc_bufsize,
c->psc_max_req_size, c->psc_max_reply_size,
c->psc_req_portal, c->psc_rep_portal,
- c->psc_watchdog_timeout,
+ c->psc_watchdog_factor,
h, name, proc_entry,
prntfn, c->psc_min_threads, c->psc_max_threads,
threadname, c->psc_ctx_tags);
}
EXPORT_SYMBOL(ptlrpc_init_svc_conf);
+static void ptlrpc_at_timer(unsigned long castmeharder)
+{
+ struct ptlrpc_service *svc = (struct ptlrpc_service *)castmeharder;
+ CDEBUG(D_INFO, "at timer %s hit at %ld%s\n",
+ svc->srv_name, cfs_time_current_sec(),
+ list_empty(&svc->srv_at_list) ? ", empty" : "");
+ svc->srv_at_check = 1;
+ svc->srv_at_checktime = cfs_time_current();
+ cfs_waitq_signal(&svc->srv_waitq);
+}
+
/* @threadname should be 11 characters or less - 3 will be added on */
struct ptlrpc_service *
ptlrpc_init_svc(int nbufs, int bufsize, int max_req_size, int max_reply_size,
- int req_portal, int rep_portal, int watchdog_timeout,
+ int req_portal, int rep_portal, int watchdog_factor,
svc_handler_t handler, char *name,
cfs_proc_dir_entry_t *proc_entry,
svcreq_printfn_t svcreq_printfn,
service->srv_buf_size = bufsize;
service->srv_rep_portal = rep_portal;
service->srv_req_portal = req_portal;
- service->srv_watchdog_timeout = watchdog_timeout;
+ service->srv_watchdog_factor = watchdog_factor;
service->srv_handler = handler;
service->srv_request_history_print_fn = svcreq_printfn;
service->srv_request_seq = 1; /* valid seq #s start at 1 */
CFS_INIT_LIST_HEAD(&service->srv_free_rs_list);
cfs_waitq_init(&service->srv_free_rs_waitq);
+ spin_lock_init(&service->srv_at_lock);
+ CFS_INIT_LIST_HEAD(&service->srv_req_in_queue);
+ CFS_INIT_LIST_HEAD(&service->srv_at_list);
+ cfs_timer_init(&service->srv_at_timer, ptlrpc_at_timer, service);
+ /* At SOW, service time should be quick; 10s seems generous. If client
+ timeout is less than this, we'll be sending an early reply. */
+ at_init(&service->srv_at_estimate, 10, 0);
+
spin_lock (&ptlrpc_all_services_lock);
list_add (&service->srv_list, &ptlrpc_all_services);
spin_unlock (&ptlrpc_all_services_lock);
return NULL;
}
-static void __ptlrpc_server_free_request(struct ptlrpc_request *req)
+static void ptlrpc_server_req_decref(struct ptlrpc_request *req)
{
struct ptlrpc_request_buffer_desc *rqbd = req->rq_rqbd;
- list_del(&req->rq_list);
-
- if (req->rq_reply_state != NULL) {
- ptlrpc_rs_decref(req->rq_reply_state);
- req->rq_reply_state = NULL;
- }
+ if (!atomic_dec_and_test(&req->rq_refcount))
+ return;
sptlrpc_svc_ctx_decref(req);
+ LASSERT(list_empty(&req->rq_timed_list));
if (req != &rqbd->rqbd_req) {
/* NB request buffers use an embedded
* req if the incoming req unlinked the
* MD; this isn't one of them! */
OBD_FREE(req, sizeof(*req));
+ } else {
+ struct ptlrpc_service *svc = rqbd->rqbd_service;
+ /* schedule request buffer for re-use.
+ * NB I can only do this after I've disposed of their
+ * reqs; particularly the embedded req */
+ spin_lock(&svc->srv_lock);
+ list_add_tail(&rqbd->rqbd_list, &svc->srv_idle_rqbds);
+ spin_unlock(&svc->srv_lock);
}
}
+static void __ptlrpc_server_free_request(struct ptlrpc_request *req)
+{
+ list_del(&req->rq_list);
+ ptlrpc_req_drop_rs(req);
+ ptlrpc_server_req_decref(req);
+}
+
static void
ptlrpc_server_free_request(struct ptlrpc_request *req)
{
struct list_head *tmp;
struct list_head *nxt;
+ if (req->rq_phase != RQ_PHASE_NEW) /* incorrect message magic */
+ DEBUG_REQ(D_INFO, req, "free req");
+ spin_lock(&svc->srv_at_lock);
+ req->rq_sent_final = 1;
+ list_del_init(&req->rq_timed_list);
+ spin_unlock(&svc->srv_at_lock);
+
spin_lock(&svc->srv_lock);
svc->srv_n_active_reqs--;
}
spin_lock(&svc->srv_lock);
-
- /* schedule request buffer for re-use.
- * NB I can only do this after I've disposed of their
- * reqs; particularly the embedded req */
- list_add_tail(&rqbd->rqbd_list, &svc->srv_idle_rqbds);
}
} else if (req->rq_reply_state && req->rq_reply_state->rs_prealloc) {
/* If we are low on memory, we are not interested in
/* Note - racing to start/reset the obd_eviction timer is safe */
if (exp->exp_obd->obd_eviction_timer == 0) {
/* Check if the oldest entry is expired. */
- if (cfs_time_current_sec() > (oldest_time +
- (3 * obd_timeout / 2) + extra_delay)) {
+ if (cfs_time_current_sec() > (oldest_time + PING_EVICT_TIMEOUT +
+ extra_delay)) {
/* We need a second timer, in case the net was down and
* it just came back. Since the pinger may skip every
* other PING_INTERVAL (see note in ptlrpc_pinger_main),
* we better wait for 3. */
- exp->exp_obd->obd_eviction_timer = cfs_time_current_sec() +
- 3 * PING_INTERVAL;
+ exp->exp_obd->obd_eviction_timer =
+ cfs_time_current_sec() + 3 * PING_INTERVAL;
CDEBUG(D_HA, "%s: Think about evicting %s from "CFS_TIME_T"\n",
exp->exp_obd->obd_name, obd_export_nid2str(exp),
oldest_time);
}
} else {
- if (cfs_time_current_sec() > (exp->exp_obd->obd_eviction_timer +
- extra_delay)) {
+ if (cfs_time_current_sec() >
+ (exp->exp_obd->obd_eviction_timer + extra_delay)) {
/* The evictor won't evict anyone who we've heard from
* recently, so we don't have to check before we start
* it. */
#endif
+static int ptlrpc_check_req(struct ptlrpc_request *req)
+{
+ if (unlikely(lustre_msg_get_conn_cnt(req->rq_reqmsg) <
+ req->rq_export->exp_conn_cnt)) {
+ DEBUG_REQ(D_ERROR, req,
+ "DROPPING req from old connection %d < %d",
+ lustre_msg_get_conn_cnt(req->rq_reqmsg),
+ req->rq_export->exp_conn_cnt);
+ return -EEXIST;
+ }
+ if (unlikely(req->rq_export->exp_obd &&
+ req->rq_export->exp_obd->obd_fail)) {
+ /* Failing over, don't handle any more reqs, send
+ error response instead. */
+ CDEBUG(D_RPCTRACE, "Dropping req %p for failed obd %s\n",
+ req, req->rq_export->exp_obd->obd_name);
+ req->rq_status = -ENODEV;
+ ptlrpc_error(req);
+ return -ENODEV;
+ }
+
+ return 0;
+}
+
+static void ptlrpc_at_set_timer(struct ptlrpc_service *svc)
+{
+ struct ptlrpc_request *rq;
+ __s32 next;
+
+ spin_lock(&svc->srv_at_lock);
+ if (list_empty(&svc->srv_at_list)) {
+ cfs_timer_disarm(&svc->srv_at_timer);
+ spin_unlock(&svc->srv_at_lock);
+ return;
+ }
+
+ /* Set timer for closest deadline */
+ rq = list_entry(svc->srv_at_list.next, struct ptlrpc_request,
+ rq_timed_list);
+ next = (__s32)(rq->rq_deadline - cfs_time_current_sec() -
+ at_early_margin);
+ if (next <= 0)
+ ptlrpc_at_timer((unsigned long)svc);
+ else
+ cfs_timer_arm(&svc->srv_at_timer, cfs_time_shift(next));
+ spin_unlock(&svc->srv_at_lock);
+ CDEBUG(D_INFO, "armed %s at %+lds\n", svc->srv_name, next);
+}
+
+/* Add rpc to early reply check list */
+static int ptlrpc_at_add_timed(struct ptlrpc_request *req)
+{
+ struct ptlrpc_service *svc = req->rq_rqbd->rqbd_service;
+ struct ptlrpc_request *rq;
+ int found = 0;
+
+ if (AT_OFF)
+ return(0);
+
+ if (req->rq_no_reply)
+ return 0;
+
+ if ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) == 0)
+ return(-ENOSYS);
+
+ DEBUG_REQ(D_ADAPTTO, req, "add timed %lds",
+ req->rq_deadline - cfs_time_current_sec());
+
+ spin_lock(&svc->srv_at_lock);
+
+ if (unlikely(req->rq_sent_final)) {
+ spin_unlock(&svc->srv_at_lock);
+ return 0;
+ }
+
+ LASSERT(list_empty(&req->rq_timed_list));
+ /* Add to sorted list. Presumably latest rpcs will have the latest
+ deadlines, so search backward. */
+ list_for_each_entry_reverse(rq, &svc->srv_at_list, rq_timed_list) {
+ if (req->rq_deadline > rq->rq_deadline) {
+ list_add(&req->rq_timed_list, &rq->rq_timed_list);
+ found++;
+ break;
+ }
+ }
+ if (!found)
+ /* Add to front if shortest deadline or list empty */
+ list_add(&req->rq_timed_list, &svc->srv_at_list);
+
+ /* Check if we're the head of the list */
+ found = (svc->srv_at_list.next == &req->rq_timed_list);
+
+ spin_unlock(&svc->srv_at_lock);
+
+ if (found)
+ ptlrpc_at_set_timer(svc);
+
+ return 0;
+}
+
+static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req,
+ int extra_time)
+{
+ struct ptlrpc_service *svc = req->rq_rqbd->rqbd_service;
+ struct ptlrpc_request *reqcopy;
+ struct lustre_msg *reqmsg;
+ long olddl = req->rq_deadline - cfs_time_current_sec();
+ time_t newdl;
+ int rc;
+ ENTRY;
+
+ /* deadline is when the client expects us to reply, margin is the
+ difference between clients' and servers' expectations */
+ DEBUG_REQ(D_ADAPTTO, req,
+ "%ssending early reply (deadline %+lds, margin %+lds) for "
+ "%d+%d", AT_OFF ? "AT off - not " : "",
+ olddl, olddl - at_get(&svc->srv_at_estimate),
+ at_get(&svc->srv_at_estimate), extra_time);
+
+ if (AT_OFF)
+ RETURN(0);
+
+ if (olddl < 0) {
+ CDEBUG(D_WARNING, "x"LPU64": Already past deadline (%+lds), not"
+ " sending early reply. Increase at_early_margin (%d)?\n",
+ req->rq_xid, olddl, at_early_margin);
+ /* Return an error so we're not re-added to the timed list. */
+ RETURN(-ETIMEDOUT);
+ }
+
+ if ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) == 0){
+ CDEBUG(D_INFO, "Wanted to ask client for more time, but no AT "
+ "support\n");
+ RETURN(-ENOSYS);
+ }
+
+ if (extra_time) {
+ /* Fake our processing time into the future to ask the
+ clients for some extra amount of time */
+ extra_time += cfs_time_current_sec() -
+ req->rq_arrival_time.tv_sec;
+ at_add(&svc->srv_at_estimate, extra_time);
+ }
+
+ newdl = req->rq_arrival_time.tv_sec + at_get(&svc->srv_at_estimate);
+ if (req->rq_deadline >= newdl) {
+ /* We're not adding any time, no need to send an early reply
+ (e.g. maybe at adaptive_max) */
+ CDEBUG(D_ADAPTTO, "x"LPU64": Couldn't add any time (%ld/%ld), "
+ "not sending early reply\n", req->rq_xid, olddl,
+ newdl - cfs_time_current_sec());
+ RETURN(-ETIMEDOUT);
+ }
+
+ OBD_ALLOC(reqcopy, sizeof *reqcopy);
+ if (reqcopy == NULL)
+ RETURN(-ENOMEM);
+ OBD_ALLOC(reqmsg, req->rq_reqlen);
+ if (!reqmsg) {
+ OBD_FREE(reqcopy, sizeof *reqcopy);
+ RETURN(-ENOMEM);
+ }
+
+ *reqcopy = *req;
+ reqcopy->rq_reply_state = NULL;
+ reqcopy->rq_rep_swab_mask = 0;
+ reqcopy->rq_pack_bulk = 0;
+ reqcopy->rq_pack_udesc = 0;
+ reqcopy->rq_packed_final = 0;
+ sptlrpc_svc_ctx_addref(reqcopy);
+ /* We only need the reqmsg for the magic */
+ reqcopy->rq_reqmsg = reqmsg;
+ memcpy(reqmsg, req->rq_reqmsg, req->rq_reqlen);
+
+ if (req->rq_sent_final) {
+ CDEBUG(D_ADAPTTO, "x"LPU64": normal reply already sent out, "
+ "abort sending early reply\n", req->rq_xid);
+ GOTO(out, rc = 0);
+ }
+
+ /* Connection ref */
+ reqcopy->rq_export = class_conn2export(
+ lustre_msg_get_handle(reqcopy->rq_reqmsg));
+ if (reqcopy->rq_export == NULL)
+ GOTO(out, rc = -ENODEV);
+
+ /* RPC ref */
+ class_export_rpc_get(reqcopy->rq_export);
+ if (reqcopy->rq_export->exp_obd &&
+ reqcopy->rq_export->exp_obd->obd_fail)
+ GOTO(out_put, rc = -ENODEV);
+
+ rc = lustre_pack_reply_flags(reqcopy, 1, NULL, NULL, LPRFL_EARLY_REPLY);
+ if (rc)
+ GOTO(out_put, rc);
+
+ rc = ptlrpc_send_reply(reqcopy, PTLRPC_REPLY_EARLY);
+
+ if (!rc) {
+ /* Adjust our own deadline to what we told the client */
+ req->rq_deadline = newdl;
+ req->rq_early_count++; /* number sent, server side */
+ } else {
+ DEBUG_REQ(D_ERROR, req, "Early reply send failed %d", rc);
+ }
+
+ /* Free the (early) reply state from lustre_pack_reply.
+ (ptlrpc_send_reply takes its own rs ref, so this is safe here) */
+ ptlrpc_req_drop_rs(reqcopy);
+
+out_put:
+ class_export_rpc_put(reqcopy->rq_export);
+ class_export_put(reqcopy->rq_export);
+out:
+ sptlrpc_svc_ctx_decref(reqcopy);
+ OBD_FREE(reqmsg, req->rq_reqlen);
+ OBD_FREE(reqcopy, sizeof *reqcopy);
+ RETURN(rc);
+}
+
+/* Send early replies to everybody expiring within at_early_margin
+ asking for at_extra time */
+static int ptlrpc_at_check_timed(struct ptlrpc_service *svc)
+{
+ struct ptlrpc_request *rq, *n;
+ struct list_head work_list;
+ time_t now = cfs_time_current_sec();
+ cfs_duration_t delay;
+ int first, counter = 0;
+ ENTRY;
+
+ spin_lock(&svc->srv_at_lock);
+ if (svc->srv_at_check == 0) {
+ spin_unlock(&svc->srv_at_lock);
+ RETURN(0);
+ }
+ delay = cfs_time_sub(cfs_time_current(), svc->srv_at_checktime);
+ svc->srv_at_check = 0;
+
+ if (list_empty(&svc->srv_at_list)) {
+ spin_unlock(&svc->srv_at_lock);
+ RETURN(0);
+ }
+
+ /* The timer went off, but maybe the nearest rpc already completed. */
+ rq = list_entry(svc->srv_at_list.next, struct ptlrpc_request,
+ rq_timed_list);
+ first = (int)(rq->rq_deadline - now);
+ if (first > at_early_margin) {
+ /* We've still got plenty of time. Reset the timer. */
+ spin_unlock(&svc->srv_at_lock);
+ ptlrpc_at_set_timer(svc);
+ RETURN(0);
+ }
+
+ /* We're close to a timeout, and we don't know how much longer the
+ server will take. Send early replies to everyone expiring soon. */
+ CFS_INIT_LIST_HEAD(&work_list);
+ list_for_each_entry_safe(rq, n, &svc->srv_at_list, rq_timed_list) {
+ if (rq->rq_deadline <= now + at_early_margin) {
+ list_move_tail(&rq->rq_timed_list, &work_list);
+ counter++;
+ } else {
+ break;
+ }
+ }
+
+ spin_unlock(&svc->srv_at_lock);
+
+ /* we have a new earliest deadline, restart the timer */
+ ptlrpc_at_set_timer(svc);
+
+ CDEBUG(D_ADAPTTO, "timeout in %+ds, asking for %d secs on %d early "
+ "replies\n", first, at_extra, counter);
+ if (first < 0) {
+ /* We're already past request deadlines before we even get a
+ chance to send early replies */
+ LCONSOLE_WARN("%s: This server is not able to keep up with "
+ "request traffic (cpu-bound).\n", svc->srv_name);
+ CWARN("earlyQ=%d reqQ=%d recA=%d, svcEst=%d, "
+ "delay="CFS_DURATION_T"(jiff)\n",
+ counter, svc->srv_n_queued_reqs, svc->srv_n_active_reqs,
+ at_get(&svc->srv_at_estimate), delay);
+ }
+
+ /* ptlrpc_server_free_request may delete an entry out of the work
+ list */
+ spin_lock(&svc->srv_at_lock);
+ while (!list_empty(&work_list)) {
+ rq = list_entry(work_list.next, struct ptlrpc_request,
+ rq_timed_list);
+ list_del_init(&rq->rq_timed_list);
+ /* if the entry is still in the worklist, it hasn't been
+ deleted, and is safe to take a ref to keep the req around */
+ atomic_inc(&rq->rq_refcount);
+ spin_unlock(&svc->srv_at_lock);
+
+ if (ptlrpc_at_send_early_reply(rq, at_extra) == 0)
+ ptlrpc_at_add_timed(rq);
+
+ ptlrpc_server_req_decref(rq);
+ spin_lock(&svc->srv_at_lock);
+ }
+ spin_unlock(&svc->srv_at_lock);
+
+ RETURN(0);
+}
+
+/* Handle freshly incoming reqs, add to timed early reply list,
+ pass on to regular request queue */
+static int
+ptlrpc_server_handle_req_in(struct ptlrpc_service *svc)
+{
+ struct ptlrpc_request *req;
+ __u32 deadline;
+ int rc;
+ ENTRY;
+
+ LASSERT(svc);
+
+ spin_lock(&svc->srv_lock);
+ if (list_empty(&svc->srv_req_in_queue)) {
+ spin_unlock(&svc->srv_lock);
+ RETURN(0);
+ }
+
+ req = list_entry(svc->srv_req_in_queue.next,
+ struct ptlrpc_request, rq_list);
+ list_del_init (&req->rq_list);
+ /* Consider this still a "queued" request as far as stats are
+ concerned */
+ spin_unlock(&svc->srv_lock);
+
+ /* go through security check/transform */
+ rc = sptlrpc_svc_unwrap_request(req);
+ switch (rc) {
+ case SECSVC_OK:
+ break;
+ case SECSVC_COMPLETE:
+ target_send_reply(req, 0, OBD_FAIL_MDS_ALL_REPLY_NET);
+ goto err_req;
+ case SECSVC_DROP:
+ goto err_req;
+ default:
+ LBUG();
+ }
+
+ /* Clear request swab mask; this is a new request */
+ req->rq_req_swab_mask = 0;
+
+ rc = lustre_unpack_msg(req->rq_reqmsg, req->rq_reqlen);
+ if (rc != 0) {
+ CERROR("error unpacking request: ptl %d from %s x"LPU64"\n",
+ svc->srv_req_portal, libcfs_id2str(req->rq_peer),
+ req->rq_xid);
+ goto err_req;
+ }
+
+ rc = lustre_unpack_req_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF);
+ if (rc) {
+ CERROR ("error unpacking ptlrpc body: ptl %d from %s x"
+ LPU64"\n", svc->srv_req_portal,
+ libcfs_id2str(req->rq_peer), req->rq_xid);
+ goto err_req;
+ }
+
+ rc = -EINVAL;
+ if (lustre_msg_get_type(req->rq_reqmsg) != PTL_RPC_MSG_REQUEST) {
+ CERROR("wrong packet type received (type=%u) from %s\n",
+ lustre_msg_get_type(req->rq_reqmsg),
+ libcfs_id2str(req->rq_peer));
+ goto err_req;
+ }
+
+ CDEBUG(D_NET, "got req "LPD64"\n", req->rq_xid);
+
+ req->rq_export = class_conn2export(
+ lustre_msg_get_handle(req->rq_reqmsg));
+ if (req->rq_export) {
+ rc = ptlrpc_check_req(req);
+ if (rc == 0) {
+ rc = sptlrpc_target_export_check(req->rq_export, req);
+ if (rc)
+ DEBUG_REQ(D_ERROR, req, "DROPPING req with "
+ "illegal security flavor,");
+ }
+
+ class_export_put(req->rq_export);
+ req->rq_export = NULL;
+ if (rc)
+ goto err_req;
+ }
+
+ /* req_in handling should/must be fast */
+ if (cfs_time_current_sec() - req->rq_arrival_time.tv_sec > 5)
+ DEBUG_REQ(D_WARNING, req, "Slow req_in handling %lus",
+ cfs_time_current_sec() - req->rq_arrival_time.tv_sec);
+
+ /* Set rpc server deadline and add it to the timed list */
+ deadline = (lustre_msghdr_get_flags(req->rq_reqmsg) &
+ MSGHDR_AT_SUPPORT) ?
+ /* The max time the client expects us to take */
+ lustre_msg_get_timeout(req->rq_reqmsg) : obd_timeout;
+ req->rq_deadline = req->rq_arrival_time.tv_sec + deadline;
+ if (unlikely(deadline == 0)) {
+ DEBUG_REQ(D_ERROR, req, "Dropping request with 0 timeout");
+ goto err_req;
+ }
+
+ ptlrpc_at_add_timed(req);
+
+ /* Move it over to the request processing queue */
+ spin_lock(&svc->srv_lock);
+ list_add_tail(&req->rq_list, &svc->srv_request_queue);
+ cfs_waitq_signal(&svc->srv_waitq);
+ spin_unlock(&svc->srv_lock);
+ RETURN(1);
+
+err_req:
+ spin_lock(&svc->srv_lock);
+ svc->srv_n_queued_reqs--;
+ svc->srv_n_active_reqs++;
+ spin_unlock(&svc->srv_lock);
+ ptlrpc_server_free_request(req);
+
+ RETURN(1);
+}
+
static int
ptlrpc_server_handle_request(struct ptlrpc_service *svc,
struct ptlrpc_thread *thread)
struct timeval work_start;
struct timeval work_end;
long timediff;
- int rc, reply;
+ int rc;
ENTRY;
LASSERT(svc);
spin_lock(&svc->srv_lock);
if (unlikely(list_empty (&svc->srv_request_queue) ||
- (svc->srv_n_difficult_replies != 0 &&
- svc->srv_n_active_reqs >= (svc->srv_threads_running - 1)))) {
- /* If all the other threads are handling requests, I must
- * remain free to handle any 'difficult' reply that might
- * block them */
+ (
+#ifndef __KERNEL__
+ /* !@%$# liblustre only has 1 thread */
+ svc->srv_n_difficult_replies != 0 &&
+#endif
+ svc->srv_n_active_reqs >= (svc->srv_threads_running - 1)))) {
+ /* Don't handle regular requests in the last thread, in order
+ * to handle difficult replies (which might block other threads)
+ * as well as handle any incoming reqs, early replies, etc.
+ * That means we always need at least 2 service threads. */
spin_unlock(&svc->srv_lock);
RETURN(0);
}
svc->srv_n_queued_reqs);
lprocfs_counter_add(svc->srv_stats, PTLRPC_REQACTIVE_CNTR,
svc->srv_n_active_reqs);
- }
-
- /* go through security check/transform */
- rc = sptlrpc_svc_unwrap_request(request);
- switch (rc) {
- case SECSVC_OK:
- break;
- case SECSVC_COMPLETE:
- target_send_reply(request, 0, OBD_FAIL_MDS_ALL_REPLY_NET);
- goto out_stat;
- case SECSVC_DROP:
- goto out_req;
- default:
- LBUG();
- }
-
- /* Clear request swab mask; this is a new request */
- request->rq_req_swab_mask = 0;
-
- rc = lustre_unpack_msg(request->rq_reqmsg, request->rq_reqlen);
- if (rc != 0) {
- CERROR ("error unpacking request: ptl %d from %s"
- " xid "LPU64"\n", svc->srv_req_portal,
- libcfs_id2str(request->rq_peer), request->rq_xid);
- goto out_req;
- }
-
- rc = lustre_unpack_req_ptlrpc_body(request, MSG_PTLRPC_BODY_OFF);
- if (rc) {
- CERROR ("error unpacking ptlrpc body: ptl %d from %s"
- " xid "LPU64"\n", svc->srv_req_portal,
- libcfs_id2str(request->rq_peer), request->rq_xid);
- goto out_req;
- }
-
- rc = -EINVAL;
- if (lustre_msg_get_type(request->rq_reqmsg) != PTL_RPC_MSG_REQUEST) {
- CERROR("wrong packet type received (type=%u) from %s\n",
- lustre_msg_get_type(request->rq_reqmsg),
- libcfs_id2str(request->rq_peer));
- goto out_req;
+ lprocfs_counter_add(svc->srv_stats, PTLRPC_TIMEOUT,
+ at_get(&svc->srv_at_estimate));
}
rc = lu_context_init(&request->rq_session, LCT_SESSION);
lustre_msg_get_handle(request->rq_reqmsg));
if (likely(request->rq_export)) {
- if (unlikely(lustre_msg_get_conn_cnt(request->rq_reqmsg) <
- request->rq_export->exp_conn_cnt)) {
- DEBUG_REQ(D_ERROR, request,
- "DROPPING req from old connection %d < %d",
- lustre_msg_get_conn_cnt(request->rq_reqmsg),
- request->rq_export->exp_conn_cnt);
+ if (unlikely(ptlrpc_check_req(request)))
goto put_conn;
- }
- if (unlikely(request->rq_export->exp_obd &&
- request->rq_export->exp_obd->obd_fail)) {
- /* Failing over, don't handle any more reqs, send
- error response instead. */
- CDEBUG(D_RPCTRACE,"Dropping req %p for failed obd %s\n",
- request, request->rq_export->exp_obd->obd_name);
- request->rq_status = -ENODEV;
- ptlrpc_error(request);
- goto put_conn;
- }
-
- rc = sptlrpc_target_export_check(request->rq_export, request);
- if (unlikely(rc)) {
- DEBUG_REQ(D_ERROR, request,
- "DROPPING req with illegal security flavor,");
- goto put_conn;
- }
-
- ptlrpc_update_export_timer(request->rq_export, timediff/500000);
+ ptlrpc_update_export_timer(request->rq_export, timediff >> 19);
export = class_export_rpc_get(request->rq_export);
}
- /* Discard requests queued for longer than my timeout. If the
- * client's timeout is similar to mine, she'll be timing out this
- * REQ anyway (bug 1502) */
- if (unlikely(timediff / 1000000 > (long)obd_timeout)) {
- CERROR("Dropping timed-out opc %d request from %s"
- ": %ld seconds old\n",
- lustre_msg_get_opc(request->rq_reqmsg),
- libcfs_id2str(request->rq_peer),
- timediff / 1000000);
+ /* Discard requests queued for longer than the deadline.
+ The deadline is increased if we send an early reply. */
+ if (cfs_time_current_sec() > request->rq_deadline) {
+ DEBUG_REQ(D_ERROR, request, "Dropping timed-out request from %s"
+ ": deadline %ld%+lds ago\n",
+ libcfs_id2str(request->rq_peer),
+ request->rq_deadline -
+ request->rq_arrival_time.tv_sec,
+ cfs_time_current_sec() - request->rq_deadline);
goto put_rpc_export;
}
request->rq_phase = RQ_PHASE_INTERPRET;
CDEBUG(D_RPCTRACE, "Handling RPC pname:cluuid+ref:pid:xid:nid:opc "
- "%s:%s+%d:%d:"LPU64":%s:%d\n", cfs_curproc_comm(),
+ "%s:%s+%d:%d:x"LPU64":%s:%d\n", cfs_curproc_comm(),
(request->rq_export ?
(char *)request->rq_export->exp_client_uuid.uuid : "0"),
(request->rq_export ?
libcfs_id2str(request->rq_peer),
lustre_msg_get_opc(request->rq_reqmsg));
+ OBD_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_PAUSE_REQ, obd_fail_val);
+
rc = svc->srv_handler(request);
request->rq_phase = RQ_PHASE_COMPLETE;
CDEBUG(D_RPCTRACE, "Handled RPC pname:cluuid+ref:pid:xid:nid:opc "
- "%s:%s+%d:%d:"LPU64":%s:%d\n", cfs_curproc_comm(),
+ "%s:%s+%d:%d:x"LPU64":%s:%d\n", cfs_curproc_comm(),
(request->rq_export ?
(char *)request->rq_export->exp_client_uuid.uuid : "0"),
(request->rq_export ?
lu_context_exit(&request->rq_session);
lu_context_fini(&request->rq_session);
-out_stat:
- reply = request->rq_reply_state && request->rq_repmsg; /* bug 11169 */
- do_gettimeofday(&work_end);
+ if (unlikely(cfs_time_current_sec() > request->rq_deadline)) {
+ DEBUG_REQ(D_WARNING, request, "Request x"LPU64" took longer "
+ "than estimated (%ld%+lds); client may timeout.",
+ request->rq_xid, request->rq_deadline -
+ request->rq_arrival_time.tv_sec,
+ cfs_time_current_sec() - request->rq_deadline);
+ }
+ do_gettimeofday(&work_end);
timediff = cfs_timeval_sub(&work_end, &work_start, NULL);
-
- if (unlikely(timediff / 1000000 > (long)obd_timeout))
- CERROR("request "LPU64" opc %u from %s processed in %lds "
- "trans "LPU64" rc %d/%d\n",
- request->rq_xid,
- request->rq_reqmsg ?
- lustre_msg_get_opc(request->rq_reqmsg) : 0,
- libcfs_id2str(request->rq_peer),
- cfs_timeval_sub(&work_end, &request->rq_arrival_time,
- NULL) / 1000000,
- reply ? lustre_msg_get_transno(request->rq_repmsg) :
- request->rq_transno, request->rq_status,
- reply ? lustre_msg_get_status(request->rq_repmsg) : -999);
- else
- CDEBUG(D_RPCTRACE,"request "LPU64" opc %u from %s processed in "
- "%ldus (%ldus total) trans "LPU64" rc %d/%d\n",
- request->rq_xid,
- request->rq_reqmsg ?
- lustre_msg_get_opc(request->rq_reqmsg) : 0,
- libcfs_id2str(request->rq_peer), timediff,
- cfs_timeval_sub(&work_end, &request->rq_arrival_time,
- NULL),
- request->rq_transno, request->rq_status,
- reply ? lustre_msg_get_status(request->rq_repmsg) : -999);
-
+ CDEBUG(D_RPCTRACE, "request x"LPU64" opc %u from %s processed in "
+ "%ldus (%ldus total) trans "LPU64" rc %d/%d\n",
+ request->rq_xid, lustre_msg_get_opc(request->rq_reqmsg),
+ libcfs_id2str(request->rq_peer), timediff,
+ cfs_timeval_sub(&work_end, &request->rq_arrival_time, NULL),
+ request->rq_repmsg ? lustre_msg_get_transno(request->rq_repmsg) :
+ request->rq_transno, request->rq_status,
+ request->rq_repmsg ? lustre_msg_get_status(request->rq_repmsg):
+ -999);
if (likely(svc->srv_stats != NULL && request->rq_reqmsg != NULL)) {
__u32 op = lustre_msg_get_opc(request->rq_reqmsg);
int opc = opcode_offset(op);
timediff);
}
}
+ if (unlikely(request->rq_early_count)) {
+ DEBUG_REQ(D_ADAPTTO, request,
+ "sent %d early replies before finishing in %lds",
+ request->rq_early_count,
+ work_end.tv_sec - request->rq_arrival_time.tv_sec);
+ }
out_req:
ptlrpc_server_free_request(request);
svc->srv_threads_running++;
do {
- rc = ptlrpc_server_handle_reply(svc);
+ rc = ptlrpc_server_handle_req_in(svc);
+ rc |= ptlrpc_server_handle_reply(svc);
+ rc |= ptlrpc_at_check_timed(svc);
rc |= ptlrpc_server_handle_request(svc, NULL);
rc |= (ptlrpc_server_post_idle_rqbds(svc) > 0);
did_something |= rc;
struct group_info *ginfo = NULL;
#endif
struct lu_env env;
- int rc = 0;
+ int counter = 0, rc = 0;
ENTRY;
ptlrpc_daemonize(data->name);
*/
cfs_waitq_signal(&thread->t_ctl_waitq);
- watchdog = lc_watchdog_add(svc->srv_watchdog_timeout, NULL, NULL);
+ watchdog = lc_watchdog_add(max_t(int, obd_timeout, AT_OFF ? 0 :
+ at_get(&svc->srv_at_estimate)) *
+ svc->srv_watchdog_factor, NULL, NULL);
spin_lock(&svc->srv_lock);
svc->srv_threads_running++;
svc->srv_n_difficult_replies == 0) ||
(!list_empty(&svc->srv_idle_rqbds) &&
svc->srv_rqbd_timeout == 0) ||
- !list_empty (&svc->srv_reply_queue) ||
- (!list_empty (&svc->srv_request_queue) &&
- (svc->srv_n_difficult_replies == 0 ||
- svc->srv_n_active_reqs <
- (svc->srv_threads_running - 1))),
+ !list_empty(&svc->srv_req_in_queue) ||
+ !list_empty(&svc->srv_reply_queue) ||
+ (!list_empty(&svc->srv_request_queue) &&
+ (svc->srv_n_active_reqs <
+ (svc->srv_threads_running - 1))) ||
+ svc->srv_at_check,
&lwi);
- lc_watchdog_touch(watchdog);
+ lc_watchdog_touch_ms(watchdog, max_t(int, obd_timeout,
+ AT_OFF ? 0 :
+ at_get(&svc->srv_at_estimate)) *
+ svc->srv_watchdog_factor);
ptlrpc_check_rqbd_pool(svc);
ptlrpc_start_thread(dev, svc);
}
- if (!list_empty (&svc->srv_reply_queue))
- ptlrpc_server_handle_reply (svc);
+ if (!list_empty(&svc->srv_reply_queue))
+ ptlrpc_server_handle_reply(svc);
- /* only handle requests if there are no difficult replies
- * outstanding, or I'm not the last thread handling
- * requests */
+ if (!list_empty(&svc->srv_req_in_queue)) {
+ /* Process all incoming reqs before handling any */
+ ptlrpc_server_handle_req_in(svc);
+ /* but limit ourselves in case of flood */
+ if (counter++ < 1000)
+ continue;
+ counter = 0;
+ }
+
+ if (svc->srv_at_check)
+ ptlrpc_at_check_timed(svc);
+
+ /* don't handle requests in the last thread */
if (!list_empty (&svc->srv_request_queue) &&
- (svc->srv_n_difficult_replies == 0 ||
- svc->srv_n_active_reqs < (svc->srv_threads_running - 1))) {
+ (svc->srv_n_active_reqs < (svc->srv_threads_running - 1))) {
lu_context_enter(&env.le_ctx);
ptlrpc_server_handle_request(svc, thread);
lu_context_exit(&env.le_ctx);
int i, rc = 0;
ENTRY;
- LASSERT(svc->srv_threads_min > 0);
+ /* We require 2 threads min - see note in
+ ptlrpc_server_handle_request */
+ LASSERT(svc->srv_threads_min >= 2);
for (i = 0; i < svc->srv_threads_min; i++) {
rc = ptlrpc_start_thread(dev, svc);
/* We have enough threads, don't start more. b=15759 */
struct list_head *tmp;
struct ptlrpc_reply_state *rs, *t;
+ cfs_timer_disarm(&service->srv_at_timer);
+
ptlrpc_stop_all_threads(service);
LASSERT(list_empty(&service->srv_threads));
/* Network access will complete in finite time but the HUGE
* timeout lets us CWARN for visibility of sluggish NALs */
- lwi = LWI_TIMEOUT(cfs_time_seconds(300), NULL, NULL);
+ lwi = LWI_TIMEOUT(cfs_time_seconds(LONG_UNLINK), NULL, NULL);
rc = l_wait_event(service->srv_waitq,
service->srv_nrqbd_receiving == 0,
&lwi);
/* purge the request queue. NB No new replies (rqbds all unlinked)
* and no service threads, so I'm the only thread noodling the
* request queue now */
+ while (!list_empty(&service->srv_req_in_queue)) {
+ struct ptlrpc_request *req =
+ list_entry(service->srv_req_in_queue.next,
+ struct ptlrpc_request,
+ rq_list);
+
+ list_del(&req->rq_list);
+ service->srv_n_queued_reqs--;
+ service->srv_n_active_reqs++;
+ ptlrpc_server_free_request(req);
+ }
while (!list_empty(&service->srv_request_queue)) {
struct ptlrpc_request *req =
list_entry(service->srv_request_queue.next,
OBD_FREE(rs, service->srv_max_reply_size);
}
+ /* In case somebody rearmed this in the meantime */
+ cfs_timer_disarm(&service->srv_at_timer);
+
OBD_FREE_PTR(service);
return 0;
}
{
struct ptlrpc_request *request;
struct timeval right_now;
- long timediff, cutoff;
- int rc = 0;
+ long timediff;
if (svc == NULL)
return 0;
- spin_lock(&svc->srv_lock);
+ do_gettimeofday(&right_now);
- if (list_empty(&svc->srv_request_queue))
- goto out;
+ spin_lock(&svc->srv_lock);
+ if (list_empty(&svc->srv_request_queue)) {
+ spin_unlock(&svc->srv_lock);
+ return 0;
+ }
+ /* How long has the next entry been waiting? */
request = list_entry(svc->srv_request_queue.next,
struct ptlrpc_request, rq_list);
-
- do_gettimeofday(&right_now);
timediff = cfs_timeval_sub(&right_now, &request->rq_arrival_time, NULL);
+ spin_unlock(&svc->srv_lock);
- cutoff = obd_health_check_timeout;
-
- if (timediff / 1000000 > cutoff) {
- rc = -1;
- goto out;
+ if ((timediff / ONE_MILLION) > (AT_OFF ? obd_timeout * 3/2 :
+ at_max)) {
+ CERROR("%s: unhealthy - request has been waiting %lds\n",
+ svc->srv_name, timediff / ONE_MILLION);
+ return (-1);
}
- out:
- spin_unlock(&svc->srv_lock);
- return rc;
+ return 0;
}
(long long)LUSTRE_MSG_MAGIC_V2);
LASSERTF(PTLRPC_MSG_VERSION == 0x00000003," found %lld\n",
(long long)PTLRPC_MSG_VERSION);
+ LASSERTF(MSGHDR_AT_SUPPORT == 1, " found %lld\n",
+ (long long)MSGHDR_AT_SUPPORT);
LASSERTF(PTL_RPC_MSG_REQUEST == 4711, " found %lld\n",
(long long)PTL_RPC_MSG_REQUEST);
LASSERTF(PTL_RPC_MSG_ERR == 4712, " found %lld\n",
(long long)(int)offsetof(struct lustre_msg_v2, lm_repsize));
LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize) == 4, " found %lld\n",
(long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize));
- LASSERTF((int)offsetof(struct lustre_msg_v2, lm_timeout) == 16, " found %lld\n",
- (long long)(int)offsetof(struct lustre_msg_v2, lm_timeout));
- LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_timeout) == 4, " found %lld\n",
- (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_timeout));
- LASSERTF((int)offsetof(struct lustre_msg_v2, lm_padding_1) == 20, " found %lld\n",
- (long long)(int)offsetof(struct lustre_msg_v2, lm_padding_1));
- LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_1) == 4, " found %lld\n",
- (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_1));
+ LASSERTF((int)offsetof(struct lustre_msg_v2, lm_cksum) == 16, " found %lld\n",
+ (long long)(int)offsetof(struct lustre_msg_v2, lm_cksum));
+ LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum) == 4, " found %lld\n",
+ (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum));
+ LASSERTF((int)offsetof(struct lustre_msg_v2, lm_flags) == 20, " found %lld\n",
+ (long long)(int)offsetof(struct lustre_msg_v2, lm_flags));
+ LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_flags) == 4, " found %lld\n",
+ (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_flags));
LASSERTF((int)offsetof(struct lustre_msg_v2, lm_padding_2) == 24, " found %lld\n",
(long long)(int)offsetof(struct lustre_msg_v2, lm_padding_2));
LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_2) == 4, " found %lld\n",
(long long)(int)offsetof(struct ptlrpc_body, pb_conn_cnt));
LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_conn_cnt) == 4, " found %lld\n",
(long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_conn_cnt));
- LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_1) == 68, " found %lld\n",
- (long long)(int)offsetof(struct ptlrpc_body, pb_padding_1));
- LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_padding_1) == 4, " found %lld\n",
- (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_padding_1));
- LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_2) == 72, " found %lld\n",
- (long long)(int)offsetof(struct ptlrpc_body, pb_padding_2));
- LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_padding_2) == 4, " found %lld\n",
- (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_padding_2));
+ LASSERTF((int)offsetof(struct ptlrpc_body, pb_timeout) == 68, " found %lld\n",
+ (long long)(int)offsetof(struct ptlrpc_body, pb_timeout));
+ LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_timeout) == 4, " found %lld\n",
+ (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_timeout));
+ LASSERTF((int)offsetof(struct ptlrpc_body, pb_service_time) == 72, " found %lld\n",
+ (long long)(int)offsetof(struct ptlrpc_body, pb_service_time));
+ LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_service_time) == 4, " found %lld\n",
+ (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_service_time));
LASSERTF((int)offsetof(struct ptlrpc_body, pb_slv) == 80, " found %lld\n",
(long long)(int)offsetof(struct ptlrpc_body, pb_slv));
LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_slv) == 8, " found %lld\n",
run_test 16 "timeout bulk put, don't evict client (2732)"
test_17() {
+ local at_max_saved=0
+
+ # With adaptive timeouts, bulk_get won't expire until adaptive_timeout_max
+ if at_is_valid && at_is_enabled; then
+ at_max_saved=$(at_max_get ost1)
+ at_max_set $TIMEOUT ost1
+ fi
+
# OBD_FAIL_PTLRPC_BULK_GET_NET 0x0503 | OBD_FAIL_ONCE
# OST bulk will time out here, client retries
do_facet ost1 lctl set_param fail_loc=0x80000503
do_facet client cp /etc/termcap $DIR/$tfile
sync
- sleep $TIMEOUT
+ # with AT, client will wait adaptive_max*factor+net_latency before
+ # expiring the req, hopefully timeout*2 is enough
+ sleep $(($TIMEOUT*2))
+
do_facet ost1 lctl set_param fail_loc=0
do_facet client "df $DIR"
# expect cmp to succeed, client resent bulk
do_facet client "cmp /etc/termcap $DIR/$tfile" || return 3
do_facet client "rm $DIR/$tfile" || return 4
+ [ $at_max_saved -ne 0 ] && $(at_max_set $at_max_saved ost1)
return 0
}
run_test 17 "timeout bulk get, don't evict client (2732)"
echo starting with $OST_NEXP1 OST exports
# OBD_FAIL_PTLRPC_DROP_RPC 0x505
do_facet client lctl set_param fail_loc=0x505
- # evictor takes up to 2.25x to evict. But if there's a
- # race to start the evictor from various obds, the loser
- # might have to wait for the next ping.
- echo Waiting for $(($TIMEOUT * 4)) secs
- sleep $(($TIMEOUT * 4))
+ # evictor takes PING_EVICT_TIMEOUT + 3 * PING_INTERVAL to evict.
+ # But if there's a race to start the evictor from various obds,
+ # the loser might have to wait for the next ping.
+ echo Waiting for $(($TIMEOUT * 8)) secs
+ sleep $(($TIMEOUT * 8))
OST_EXP="`do_facet ost1 lctl get_param -n $OST_FILE`"
OST_NEXP2=`echo $OST_EXP | cut -d' ' -f2`
echo ending with $OST_NEXP2 OST exports
sleep 1 # wait connections being established
MDS_FILE=mdt.${mds1_svc}.num_exports
MDS_NEXP1="`do_facet $SINGLEMDS lctl get_param -n $MDS_FILE | cut -d' ' -f2`"
- OST_FILE=obdfilter.${ost1_svc}.num_exports
+ OST_FILE=obdfilter.${ost1_svc}.num_exports
OST_NEXP1="`do_facet ost1 lctl get_param -n $OST_FILE | cut -d' ' -f2`"
- echo starting with $OST_NEXP1 OST and $MDS_NEXP1 MDS exports
- zconf_umount `hostname` $MOUNT2 -f
- # evictor takes up to 2.25x to evict. But if there's a
- # race to start the evictor from various obds, the loser
- # might have to wait for the next ping.
- echo Waiting for $(($TIMEOUT * 4)) secs
- sleep $(($TIMEOUT * 4))
+ echo starting with $OST_NEXP1 OST and $MDS_NEXP1 MDS exports
+ zconf_umount `hostname` $MOUNT2 -f
+ # evictor takes PING_EVICT_TIMEOUT + 3 * PING_INTERVAL to evict.
+ # But if there's a race to start the evictor from various obds,
+ # the loser might have to wait for the next ping.
+ echo Waiting for $(($TIMEOUT * 3)) secs
+ sleep $(($TIMEOUT * 3))
OST_NEXP2="`do_facet ost1 lctl get_param -n $OST_FILE | cut -d' ' -f2`"
MDS_NEXP2="`do_facet $SINGLEMDS lctl get_param -n $MDS_FILE | cut -d' ' -f2`"
- echo ending with $OST_NEXP2 OST and $MDS_NEXP2 MDS exports
+ echo ending with $OST_NEXP2 OST and $MDS_NEXP2 MDS exports
[ $OST_NEXP1 -le $OST_NEXP2 ] && error "client not evicted from OST"
[ $MDS_NEXP1 -le $MDS_NEXP2 ] && error "client not evicted from MDS"
return 0
run_test 43 "mds osc import failure during recovery; don't LBUG"
test_44a() { # was test_44
+ local at_max_saved=0
+
mdcdev=`lctl get_param -n devices | awk '/MDT0000-mdc-/ {print $1}'`
[ "$mdcdev" ] || exit 2
+
+ # adaptive timeouts slow this way down
+ if at_is_valid && at_is_enabled; then
+ at_max_saved=$(at_max_get mds)
+ at_max_set 40 mds
+ fi
+
for i in `seq 1 10`; do
+ echo "$i of 10 ($(date +%s))"
+ do_facet mds "grep service $LPROC/mdt/MDS/mds/timeouts"
#define OBD_FAIL_TGT_CONN_RACE 0x701
do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000701"
$LCTL --device $mdcdev recover
df $MOUNT
done
do_facet $SINGLEMDS "lctl set_param fail_loc=0"
+ [ $at_max_saved -ne 0 ] && at_max_set $at_max_saved mds
return 0
}
run_test 44a "race in target handle connect"
mdcdev=`lctl get_param -n devices | awk '/MDT0000-mdc-/ {print $1}'`
[ "$mdcdev" ] || exit 2
for i in `seq 1 10`; do
+ echo "$i of 10 ($(date +%s))"
+ do_facet mds "grep service $LPROC/mdt/MDS/mds/timeouts"
#define OBD_FAIL_TGT_DELAY_RECONNECT 0x704
do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000704"
$LCTL --device $mdcdev recover
}
run_test 61c "test race mds llog sync vs llog cleanup"
+test_62() { # Bug 15756 - don't mis-drop resent replay
+ replay_barrier $SINGLEMDS
+ createmany -o $DIR/$tdir/$tfile- 25
+#define OBD_FAIL_TGT_REPLAY_DROP 0x707
+ do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000707"
+ facet_failover $SINGLEMDS
+ df $MOUNT || return 1
+ do_facet $SINGLEMDS "lctl set_param fail_loc=0"
+ unlinkmany $DIR/$tdir/$tfile- 25 || return 2
+ return 0
+}
+run_test 62 "don't mis-drop resent replay"
+
+#Adaptive Timeouts (bug 3055)
+AT_MAX_SET=0
+
+at_start()
+{
+ if ! at_is_valid; then
+ skip "AT env is invalid"
+ return 1
+ fi
+
+ if ! at_is_enabled; then
+ echo "AT is disabled, enable it by force temporarily"
+ at_max_set 600 mds ost client
+ AT_MAX_SET=1
+ fi
+
+ if [ -z "$ATOLDBASE" ]; then
+ local at_history=$(do_facet mds "find /sys/ -name at_history")
+ [ -z "$at_history" ] && skip "missing /sys/.../at_history " && return 1
+ ATOLDBASE=$(do_facet mds "cat $at_history")
+ # speed up the timebase so we can check decreasing AT
+ do_facet mds "echo 8 >> $at_history"
+ do_facet ost1 "echo 8 >> $at_history"
+ fi
+}
+
+test_65a() #bug 3055
+{
+ at_start || return 0
+ $LCTL dk > /dev/null
+ debugsave
+ sysctl -w lnet.debug="+other"
+ # slow down a request
+ do_facet mds sysctl -w lustre.fail_val=30000
+#define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a
+ do_facet mds sysctl -w lustre.fail_loc=0x8000050a
+ createmany -o $DIR/$tfile 10 > /dev/null
+ unlinkmany $DIR/$tfile 10 > /dev/null
+ # check for log message
+ $LCTL dk | grep "Early reply #" || error "No early reply"
+ debugrestore
+ # client should show 30s estimates
+ grep portal $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts
+ sleep 9
+ grep portal $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts
+}
+run_test 65a "AT: verify early replies"
+
+test_65b() #bug 3055
+{
+ at_start || return 0
+ # turn on D_ADAPTTO
+ debugsave
+ sysctl -w lnet.debug="other trace"
+ $LCTL dk > /dev/null
+ # slow down bulk i/o
+ do_facet ost1 sysctl -w lustre.fail_val=30
+#define OBD_FAIL_OST_BRW_PAUSE_PACK 0x224
+ do_facet ost1 sysctl -w lustre.fail_loc=0x224
+
+ rm -f $DIR/$tfile
+ lfs setstripe $DIR/$tfile --index=0 --count=1
+ # force some real bulk transfer
+ multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c
+
+ do_facet ost1 sysctl -w lustre.fail_loc=0
+ # check for log message
+ $LCTL dk | grep "Early reply #" || error "No early reply"
+ debugrestore
+ # client should show 30s estimates
+ grep portal $LPROC/osc/${FSNAME}-OST0000-osc-*/timeouts
+}
+run_test 65b "AT: verify early replies on packed reply / bulk"
+
+test_66a() #bug 3055
+{
+ at_start || return 0
+ grep "portal 12" $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts
+ # adjust 5s at a time so no early reply is sent (within deadline)
+ do_facet mds "sysctl -w lustre.fail_val=5000"
+#define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a
+ do_facet mds "sysctl -w lustre.fail_loc=0x8000050a"
+ createmany -o $DIR/$tfile 20 > /dev/null
+ unlinkmany $DIR/$tfile 20 > /dev/null
+ grep "portal 12" $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts
+ do_facet mds "sysctl -w lustre.fail_val=10000"
+ do_facet mds "sysctl -w lustre.fail_loc=0x8000050a"
+ createmany -o $DIR/$tfile 20 > /dev/null
+ unlinkmany $DIR/$tfile 20 > /dev/null
+ grep "portal 12" $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts
+ do_facet mds "sysctl -w lustre.fail_loc=0"
+ sleep 9
+ createmany -o $DIR/$tfile 20 > /dev/null
+ unlinkmany $DIR/$tfile 20 > /dev/null
+ grep portal $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts | grep "portal 12"
+ CUR=$(awk '/portal 12/ {print $5}' $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts)
+ WORST=$(awk '/portal 12/ {print $7}' $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts)
+ echo "Current MDT timeout $CUR, worst $WORST"
+ [ $CUR -lt $WORST ] || error "Current $CUR should be less than worst $WORST"
+}
+run_test 66a "AT: verify MDT service time adjusts with no early replies"
+
+test_66b() #bug 3055
+{
+ at_start || return 0
+ ORIG=$(awk '/network/ {print $4}' $LPROC/mdc/lustre-*/timeouts)
+ sysctl -w lustre.fail_val=$(($ORIG + 5))
+#define OBD_FAIL_PTLRPC_PAUSE_REP 0x50c
+ sysctl -w lustre.fail_loc=0x50c
+ ls $DIR/$tfile > /dev/null 2>&1
+ sysctl -w lustre.fail_loc=0
+ CUR=$(awk '/network/ {print $4}' $LPROC/mdc/${FSNAME}-*/timeouts)
+ WORST=$(awk '/network/ {print $6}' $LPROC/mdc/${FSNAME}-*/timeouts)
+ echo "network timeout orig $ORIG, cur $CUR, worst $WORST"
+ [ $WORST -gt $ORIG ] || error "Worst $WORST should be worse than orig $ORIG"
+}
+run_test 66b "AT: verify net latency adjusts"
+
+test_67a() #bug 3055
+{
+ at_start || return 0
+ CONN1=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats)
+ # sleeping threads may drive values above this
+ do_facet ost1 "sysctl -w lustre.fail_val=400"
+#define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a
+ do_facet ost1 "sysctl -w lustre.fail_loc=0x50a"
+ createmany -o $DIR/$tfile 20 > /dev/null
+ unlinkmany $DIR/$tfile 20 > /dev/null
+ do_facet ost1 "sysctl -w lustre.fail_loc=0"
+ CONN2=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats)
+ ATTEMPTS=$(($CONN2 - $CONN1))
+ echo "$ATTEMPTS osc reconnect attempts on gradual slow"
+ [ $ATTEMPTS -gt 0 ] && error_ignore 13721 "AT should have prevented reconnect"
+ return 0
+}
+run_test 67a "AT: verify slow request processing doesn't induce reconnects"
+
+test_67b() #bug 3055
+{
+ at_start || return 0
+ CONN1=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats)
+#define OBD_FAIL_OST_PAUSE_CREATE 0x223
+ do_facet ost1 "sysctl -w lustre.fail_val=20000"
+ do_facet ost1 "sysctl -w lustre.fail_loc=0x80000223"
+ cp /etc/profile $DIR/$tfile || error "cp failed"
+ client_reconnect
+ cat $LPROC/ost/OSS/ost_create/timeouts
+ log "phase 2"
+ CONN2=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats)
+ ATTEMPTS=$(($CONN2 - $CONN1))
+ echo "$ATTEMPTS osc reconnect attempts on instant slow"
+ # do it again; should not timeout
+ do_facet ost1 "sysctl -w lustre.fail_loc=0x80000223"
+ cp /etc/profile $DIR/$tfile || error "cp failed"
+ do_facet ost1 "sysctl -w lustre.fail_loc=0"
+ client_reconnect
+ cat $LPROC/ost/OSS/ost_create/timeouts
+ CONN3=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats)
+ ATTEMPTS=$(($CONN3 - $CONN2))
+ echo "$ATTEMPTS osc reconnect attempts on 2nd slow"
+ [ $ATTEMPTS -gt 0 ] && error "AT should have prevented reconnect"
+ return 0
+}
+run_test 67b "AT: verify instant slowdown doesn't induce reconnects"
+
+test_68 () #bug 13813
+{
+ at_start || return 0
+ local ldlm_enqueue_min=$(find /sys -name ldlm_enqueue_min)
+ [ -z "$ldlm_enqueue_min" ] && skip "missing /sys/.../ldlm_enqueue_min" && return 0
+ local ENQ_MIN=$(cat $ldlm_enqueue_min)
+ echo $TIMEOUT >> $ldlm_enqueue_min
+ rm -f $DIR/${tfile}_[1-2]
+ lfs setstripe $DIR/$tfile --index=0 --count=1
+#define OBD_FAIL_LDLM_PAUSE_CANCEL 0x312
+ sysctl -w lustre.fail_val=$(($TIMEOUT - 1))
+ sysctl -w lustre.fail_loc=0x80000312
+ cp /etc/profile $DIR/${tfile}_1 || error "1st cp failed $?"
+ sysctl -w lustre.fail_val=$((TIMEOUT * 3 / 2))
+ sysctl -w lustre.fail_loc=0x80000312
+ cp /etc/profile $DIR/${tfile}_2 || error "2nd cp failed $?"
+ sysctl -w lustre.fail_loc=0
+ echo $ENQ_MIN >> $ldlm_enqueue_min
+ return 0
+}
+run_test 68 "AT: verify slowing locks"
+
+if [ -n "$ATOLDBASE" ]; then
+ at_history=$(do_facet mds "find /sys/ -name at_history")
+ do_facet mds "echo $ATOLDBASE >> $at_history" || true
+ do_facet ost1 "echo $ATOLDBASE >> $at_history" || true
+fi
+
+if [ $AT_MAX_SET -ne 0 ]; then
+ echo "restore AT status to be disabled"
+ at_max_set 0 mds ost client
+fi
+
+# end of AT tests includes above lines
+
+
# start multi-client tests
test_70a () {
[ -z "$CLIENTS" ] && \
start_dbench()
{
NPROC=`cat /proc/cpuinfo 2>/dev/null | grep ^processor | wc -l`
- [ $NPROC -lt 2 ] && NPROC=2
+ [ $NPROC -gt 2 ] && NPROC=2
sh rundbench $NPROC 1>/dev/null &
DBENCH_PID=$!
sleep 2
run_test 5 "lsvcgssd dead, operations lead to recovery"
test_6() {
+ local nfile=10
+
mkdir $DIR/d6 || error "mkdir $DIR/d6 failed"
- cp -a /etc/* $DIR/d6/ || error "cp failed"
+ for ((i=0; i<$nfile; i++)); do
+ dd if=/dev/zero of=$DIR/d6/file$i bs=8k count=1 || error "dd file$i failed"
+ done
ls -l $DIR/d6/* > /dev/null || error "ls failed"
rm -rf $DIR2/d6/* || error "rm failed"
+ rmdir $DIR2/d6/ || error "rmdir failed"
}
run_test 6 "test basic DLM callback works"
}
run_test 7 "exercise enlarge_reqbuf()"
-test_8() {
+test_8()
+{
+ debugsave
+ sysctl -w lnet.debug="other"
+ $LCTL dk > /dev/null
+
+ # sleep sometime in ctx handle
+ do_facet mds sysctl -w lustre.fail_val=60
+#define OBD_FAIL_SEC_CTX_HDL_PAUSE 0x1204
+ do_facet mds sysctl -w lustre.fail_loc=0x1204
+
+ $RUNAS $LFS flushctx || error "can't flush ctx"
+
+ $RUNAS df $DIR &
+ DFPID=$!
+ echo "waiting df (pid $DFPID) to finish..."
+ sleep 2 # give df a chance to really trigger context init rpc
+ do_facet mds sysctl -w lustre.fail_loc=0
+ wait $DFPID || error "df should have succeeded"
+
+ $LCTL dk | grep "Early reply #" || error "No early reply"
+ debugrestore
+}
+run_test 8 "Early reply sent for slow gss context negotiation"
+
+#
+# following tests will manipulate flavors and may end with any flavor set,
+# so each test should not assume any start flavor.
+#
+
+test_50() {
local sample=$TMP/sanity-gss-8
local tdir=$MOUNT/dir8
local iosize="256K"
rm -rf $tdir
rm -f $sample
}
-run_test 8 "verify bulk hash algorithms works"
+run_test 50 "verify bulk hash algorithms works"
-test_9() {
+test_51() {
local s1=$TMP/sanity-gss-9.1
local s2=$TMP/sanity-gss-9.2
local s3=$TMP/sanity-gss-9.3
rm -rf $tdir
rm -f $sample
}
-run_test 9 "bulk data alignment test under encryption mode"
+run_test 51 "bulk data alignment test under encryption mode"
test_90() {
if [ "$SLOW" = "no" ]; then
lctl set_param fail_loc=0x40c
remount_client $MOUNT
lctl set_param fail_loc=0
+ sleep 2 # wait async osc connect to finish
for VALUE in `lctl get_param osc.*osc-[^mM]*.checksum_type`; do
PARAM=`echo ${VALUE[0]} | cut -d "=" -f1`
algo=`lctl get_param -n $PARAM | sed 's/.*\[\(.*\)\].*/\1/g'`
export KRB5DIR=${KRB5DIR:-"/usr/kerberos"}
export DIR2
export SAVE_PWD=${SAVE_PWD:-$LUSTRE/tests}
+ export AT_MAX_PATH
if [ "$ACCEPTOR_PORT" ]; then
export PORT_OPT="--port $ACCEPTOR_PORT"
}
##################################
+# Adaptive Timeouts funcs
+
+# Locate the adaptive-timeouts "at_max" tunable on the mds and cache its
+# path in AT_MAX_PATH.  Returns 0 when the tunable exists, 1 when the
+# running kernel has no AT support (callers should then skip AT checks).
+at_is_valid() {
+ if [ -z "$AT_MAX_PATH" ]; then
+ AT_MAX_PATH=$(do_facet mds "find /sys/ -name at_max")
+ [ -z "$AT_MAX_PATH" ] && echo "missing /sys/.../at_max " && return 1
+ fi
+ return 0
+}
+
+# Return 0 (true) when adaptive timeouts are enabled (at_max != 0),
+# 1 (false) when they are disabled.
+at_is_enabled() {
+ at_is_valid || error "invalid call"
+
+ # only check mds, we assume at_max is the same on all nodes
+ local mds_at_max=$(do_facet mds "cat $AT_MAX_PATH")
+ [ $mds_at_max -ne 0 ]
+}
+
+# Print the current at_max value on the given facet ($1).
+at_max_get() {
+ at_is_valid || error "invalid call"
+
+ do_facet $1 "cat $AT_MAX_PATH"
+}
+
+# Set at_max to $1 on each facet named by the remaining arguments.
+# The pseudo-facets "ost" and "mds" expand to every OST / MDS node.
+at_max_set() {
+ local at_max=$1
+ shift
+
+ at_is_valid || error "invalid call"
+
+ # quote "$@" so facet names survive word-splitting; use POSIX "=" in
+ # test(1) instead of the bash-only "==", and quote "$facet" so an
+ # empty value cannot break the comparison
+ for facet in "$@"; do
+ if [ "$facet" = "ost" ]; then
+ for i in `seq $OSTCOUNT`; do
+ do_facet ost$i "echo $at_max > $AT_MAX_PATH"
+ done
+ elif [ "$facet" = "mds" ]; then
+ for i in `seq $MDSCOUNT`; do
+ do_facet mds$i "echo $at_max > $AT_MAX_PATH"
+ done
+ else
+ do_facet $facet "echo $at_max > $AT_MAX_PATH"
+ fi
+ done
+}
+
+##################################
# OBD_FAIL funcs
drop_request() {
lgssd
lsvcgssd
l_idmap
+lgss_keyring
.*.cmd
.*.d
CHECK_MEMBER(lustre_msg_v2, lm_secflvr);
CHECK_MEMBER(lustre_msg_v2, lm_magic);
CHECK_MEMBER(lustre_msg_v2, lm_repsize);
- CHECK_MEMBER(lustre_msg_v2, lm_timeout);
- CHECK_MEMBER(lustre_msg_v2, lm_padding_1);
+ CHECK_MEMBER(lustre_msg_v2, lm_cksum);
+ CHECK_MEMBER(lustre_msg_v2, lm_flags);
CHECK_MEMBER(lustre_msg_v2, lm_padding_2);
CHECK_MEMBER(lustre_msg_v2, lm_padding_3);
CHECK_MEMBER(lustre_msg_v2, lm_buflens[0]);
CHECK_MEMBER(ptlrpc_body, pb_flags);
CHECK_MEMBER(ptlrpc_body, pb_op_flags);
CHECK_MEMBER(ptlrpc_body, pb_conn_cnt);
- CHECK_MEMBER(ptlrpc_body, pb_padding_1);
- CHECK_MEMBER(ptlrpc_body, pb_padding_2);
+ CHECK_MEMBER(ptlrpc_body, pb_timeout);
+ CHECK_MEMBER(ptlrpc_body, pb_service_time);
CHECK_MEMBER(ptlrpc_body, pb_slv);
CHECK_MEMBER(ptlrpc_body, pb_limit);
}
COMMENT("Constants...");
CHECK_DEFINE(LUSTRE_MSG_MAGIC_V2);
CHECK_DEFINE(PTLRPC_MSG_VERSION);
+ CHECK_VALUE(MSGHDR_AT_SUPPORT);
CHECK_VALUE(PTL_RPC_MSG_REQUEST);
CHECK_VALUE(PTL_RPC_MSG_ERR);
(long long)LUSTRE_MSG_MAGIC_V2);
LASSERTF(PTLRPC_MSG_VERSION == 0x00000003," found %lld\n",
(long long)PTLRPC_MSG_VERSION);
+ LASSERTF(MSGHDR_AT_SUPPORT == 1, " found %lld\n",
+ (long long)MSGHDR_AT_SUPPORT);
LASSERTF(PTL_RPC_MSG_REQUEST == 4711, " found %lld\n",
(long long)PTL_RPC_MSG_REQUEST);
LASSERTF(PTL_RPC_MSG_ERR == 4712, " found %lld\n",
(long long)(int)offsetof(struct lustre_msg_v2, lm_repsize));
LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize) == 4, " found %lld\n",
(long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize));
- LASSERTF((int)offsetof(struct lustre_msg_v2, lm_timeout) == 16, " found %lld\n",
- (long long)(int)offsetof(struct lustre_msg_v2, lm_timeout));
- LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_timeout) == 4, " found %lld\n",
- (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_timeout));
- LASSERTF((int)offsetof(struct lustre_msg_v2, lm_padding_1) == 20, " found %lld\n",
- (long long)(int)offsetof(struct lustre_msg_v2, lm_padding_1));
- LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_1) == 4, " found %lld\n",
- (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_1));
+ LASSERTF((int)offsetof(struct lustre_msg_v2, lm_cksum) == 16, " found %lld\n",
+ (long long)(int)offsetof(struct lustre_msg_v2, lm_cksum));
+ LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum) == 4, " found %lld\n",
+ (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum));
+ LASSERTF((int)offsetof(struct lustre_msg_v2, lm_flags) == 20, " found %lld\n",
+ (long long)(int)offsetof(struct lustre_msg_v2, lm_flags));
+ LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_flags) == 4, " found %lld\n",
+ (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_flags));
LASSERTF((int)offsetof(struct lustre_msg_v2, lm_padding_2) == 24, " found %lld\n",
(long long)(int)offsetof(struct lustre_msg_v2, lm_padding_2));
LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_2) == 4, " found %lld\n",
(long long)(int)offsetof(struct ptlrpc_body, pb_conn_cnt));
LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_conn_cnt) == 4, " found %lld\n",
(long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_conn_cnt));
- LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_1) == 68, " found %lld\n",
- (long long)(int)offsetof(struct ptlrpc_body, pb_padding_1));
- LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_padding_1) == 4, " found %lld\n",
- (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_padding_1));
- LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_2) == 72, " found %lld\n",
- (long long)(int)offsetof(struct ptlrpc_body, pb_padding_2));
- LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_padding_2) == 4, " found %lld\n",
- (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_padding_2));
+ LASSERTF((int)offsetof(struct ptlrpc_body, pb_timeout) == 68, " found %lld\n",
+ (long long)(int)offsetof(struct ptlrpc_body, pb_timeout));
+ LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_timeout) == 4, " found %lld\n",
+ (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_timeout));
+ LASSERTF((int)offsetof(struct ptlrpc_body, pb_service_time) == 72, " found %lld\n",
+ (long long)(int)offsetof(struct ptlrpc_body, pb_service_time));
+ LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_service_time) == 4, " found %lld\n",
+ (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_service_time));
LASSERTF((int)offsetof(struct ptlrpc_body, pb_slv) == 80, " found %lld\n",
(long long)(int)offsetof(struct ptlrpc_body, pb_slv));
LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_slv) == 8, " found %lld\n",