From de3c3fdf2597ba110a5247e271470b5dd626df75 Mon Sep 17 00:00:00 2001 From: ericm Date: Mon, 7 Jul 2008 19:12:54 +0000 Subject: [PATCH] branch: HEAD port Adaptive Timeouts from b1_6 to HEAD. b=14071 r=nathan r=adilger --- lustre/ChangeLog | 9 +- lustre/cmm/mdc_device.c | 3 +- lustre/fid/fid_request.c | 1 + lustre/fld/fld_request.c | 1 + lustre/include/linux/lustre_fsfilt.h | 26 +- lustre/include/lprocfs_status.h | 41 ++ lustre/include/lustre/lustre_idl.h | 35 +- lustre/include/lustre_dlm.h | 2 + lustre/include/lustre_export.h | 1 + lustre/include/lustre_import.h | 46 +- lustre/include/lustre_lib.h | 9 +- lustre/include/lustre_net.h | 103 +++-- lustre/include/lustre_sec.h | 3 + lustre/include/obd.h | 3 +- lustre/include/obd_support.h | 101 +++-- lustre/ldlm/ldlm_internal.h | 2 + lustre/ldlm/ldlm_lib.c | 238 ++++++++--- lustre/ldlm/ldlm_lockd.c | 59 ++- lustre/ldlm/ldlm_request.c | 44 +- lustre/ldlm/ldlm_resource.c | 2 + lustre/liblustre/llite_lib.c | 5 +- lustre/liblustre/super.c | 4 +- lustre/llite/llite_lib.c | 5 +- lustre/lvfs/lvfs_lib.c | 18 + lustre/mdc/lproc_mdc.c | 3 +- lustre/mdc/mdc_reint.c | 3 +- lustre/mdc/mdc_request.c | 6 +- lustre/mds/mds_lov.c | 5 +- lustre/mdt/mdt_handler.c | 209 +++++----- lustre/mdt/mdt_internal.h | 3 +- lustre/mgc/mgc_request.c | 2 +- lustre/mgs/mgs_handler.c | 10 +- lustre/mgs/mgs_internal.h | 3 - lustre/obdclass/class_obd.c | 4 +- lustre/obdclass/genops.c | 21 +- lustre/obdclass/linux/linux-module.c | 23 - lustre/obdclass/lprocfs_status.c | 134 ++++-- lustre/obdclass/obd_config.c | 1 - lustre/obdclass/obd_mount.c | 15 +- lustre/obdfilter/filter.c | 17 +- lustre/obdfilter/filter_internal.h | 3 +- lustre/obdfilter/filter_io.c | 8 +- lustre/obdfilter/filter_io_26.c | 8 +- lustre/osc/lproc_osc.c | 1 + lustre/osc/osc_create.c | 7 +- lustre/osc/osc_request.c | 10 +- lustre/ost/lproc_ost.c | 33 -- lustre/ost/ost_handler.c | 120 +++--- lustre/ost/ost_internal.h | 8 +- lustre/ptlrpc/client.c | 316 +++++++++++--- lustre/ptlrpc/events.c | 
71 +++- lustre/ptlrpc/gss/gss_bulk.c | 6 +- lustre/ptlrpc/gss/gss_cli_upcall.c | 3 +- lustre/ptlrpc/gss/gss_keyring.c | 10 +- lustre/ptlrpc/gss/gss_svc_upcall.c | 2 +- lustre/ptlrpc/gss/sec_gss.c | 248 +++++++---- lustre/ptlrpc/import.c | 268 ++++++++++-- lustre/ptlrpc/lproc_ptlrpc.c | 71 +++- lustre/ptlrpc/niobuf.c | 102 ++++- lustre/ptlrpc/pack_generic.c | 221 +++++++++- lustre/ptlrpc/pinger.c | 14 +- lustre/ptlrpc/ptlrpc_module.c | 3 + lustre/ptlrpc/recov_thread.c | 3 +- lustre/ptlrpc/sec.c | 208 +++++++-- lustre/ptlrpc/sec_bulk.c | 69 ++- lustre/ptlrpc/sec_null.c | 70 +++- lustre/ptlrpc/sec_plain.c | 72 +++- lustre/ptlrpc/service.c | 786 +++++++++++++++++++++++++++-------- lustre/ptlrpc/wiretest.c | 34 +- lustre/tests/recovery-small.sh | 42 +- lustre/tests/replay-single.sh | 228 ++++++++++ lustre/tests/sanity-gss.sh | 47 ++- lustre/tests/sanity.sh | 1 + lustre/tests/test-framework.sh | 51 +++ lustre/utils/gss/.cvsignore | 1 + lustre/utils/wirecheck.c | 9 +- lustre/utils/wiretest.c | 34 +- 77 files changed, 3383 insertions(+), 1025 deletions(-) diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 55c27af..e024b38 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -599,10 +599,11 @@ Bugzilla : 12836 Description: lfs find on -1 stripe looping in lsm_lmm_verify_common() Details : Avoid lov_verify_lmm_common() on directory with -1 stripe count. -Severity : major -Bugzilla : 12932 -Description: obd_health_check_timeout too short -Details : set obd_health_check_timeout as 1.5x of obd_timeout +Severity : enhancement +Bugzilla : 3055 +Description: Adaptive timeouts +Details : RPC timeouts adapt to changing server load and network + conditions to reduce resend attempts and improve recovery time. 
Severity : normal Bugzilla : 12192 diff --git a/lustre/cmm/mdc_device.c b/lustre/cmm/mdc_device.c index 3328740..146720c 100644 --- a/lustre/cmm/mdc_device.c +++ b/lustre/cmm/mdc_device.c @@ -137,7 +137,8 @@ static int mdc_obd_add(const struct lu_env *env, OBD_CONNECT_OSS_CAPA | OBD_CONNECT_IBITS | OBD_CONNECT_MDS_MDS | - OBD_CONNECT_FID; + OBD_CONNECT_FID | + OBD_CONNECT_AT; rc = obd_connect(env, conn, mdc, &mdc->obd_uuid, ocd, NULL); OBD_FREE_PTR(ocd); if (rc) { diff --git a/lustre/fid/fid_request.c b/lustre/fid/fid_request.c index 3910422..d4ecfd9 100644 --- a/lustre/fid/fid_request.c +++ b/lustre/fid/fid_request.c @@ -85,6 +85,7 @@ static int seq_client_rpc(struct lu_client_seq *seq, struct lu_range *input, req->rq_request_portal = (opc == SEQ_ALLOC_SUPER) ? SEQ_CONTROLLER_PORTAL : SEQ_DATA_PORTAL; } + ptlrpc_at_set_req_timeout(req); mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL); rc = ptlrpc_queue_wait(req); diff --git a/lustre/fld/fld_request.c b/lustre/fld/fld_request.c index 23bbcf6..f0d5191 100644 --- a/lustre/fld/fld_request.c +++ b/lustre/fld/fld_request.c @@ -456,6 +456,7 @@ static int fld_client_rpc(struct obd_export *exp, ptlrpc_request_set_replen(req); req->rq_request_portal = FLD_REQUEST_PORTAL; + ptlrpc_at_set_req_timeout(req); if (fld_op != FLD_LOOKUP) mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL); diff --git a/lustre/include/linux/lustre_fsfilt.h b/lustre/include/linux/lustre_fsfilt.h index 93c47ee..a8710eb 100644 --- a/lustre/include/linux/lustre_fsfilt.h +++ b/lustre/include/linux/lustre_fsfilt.h @@ -164,14 +164,14 @@ static inline lvfs_sbdev_type fsfilt_journal_sbdev(struct obd_device *obd, #define FSFILT_OP_JOIN 11 #define FSFILT_OP_NOOP 15 -#define __fsfilt_check_slow(obd, start, timeout, msg) \ +#define __fsfilt_check_slow(obd, start, msg) \ do { \ if (time_before(jiffies, start + 15 * HZ)) \ break; \ else if (time_before(jiffies, start + 30 * HZ)) \ CDEBUG(D_VFSTRACE, "%s: slow %s %lus\n", obd->obd_name, \ msg, 
(jiffies-start) / HZ); \ - else if (time_before(jiffies, start + timeout / 2 * HZ)) \ + else if (time_before(jiffies, start + DISK_TIMEOUT * HZ)) \ CWARN("%s: slow %s %lus\n", obd->obd_name, msg, \ (jiffies - start) / HZ); \ else \ @@ -179,10 +179,10 @@ do { \ (jiffies - start) / HZ); \ } while (0) -#define fsfilt_check_slow(obd, start, timeout, msg) \ -do { \ - __fsfilt_check_slow(obd, start, timeout, msg); \ - start = jiffies; \ +#define fsfilt_check_slow(obd, start, msg) \ +do { \ + __fsfilt_check_slow(obd, start, msg); \ + start = jiffies; \ } while (0) static inline void *fsfilt_start_log(struct obd_device *obd, @@ -208,7 +208,7 @@ static inline void *fsfilt_start_log(struct obd_device *obd, LBUG(); } } - fsfilt_check_slow(obd, now, obd_timeout, "journal start"); + fsfilt_check_slow(obd, now, "journal start"); return handle; } @@ -243,7 +243,7 @@ static inline void *fsfilt_brw_start_log(struct obd_device *obd, int objcount, LBUG(); } } - fsfilt_check_slow(obd, now, obd_timeout, "journal start"); + fsfilt_check_slow(obd, now, "journal start"); return handle; } @@ -263,7 +263,7 @@ static inline int fsfilt_extend(struct obd_device *obd, struct inode *inode, int rc = obd->obd_fsops->fs_extend(inode, nblocks, handle); CDEBUG(D_INFO, "extending handle %p with %u blocks\n", handle, nblocks); - fsfilt_check_slow(obd, now, obd_timeout, "journal extend"); + fsfilt_check_slow(obd, now, "journal extend"); return rc; } @@ -275,7 +275,7 @@ static inline int fsfilt_commit(struct obd_device *obd, struct inode *inode, int rc = obd->obd_fsops->fs_commit(inode, handle, force_sync); CDEBUG(D_INFO, "committing handle %p\n", handle); - fsfilt_check_slow(obd, now, obd_timeout, "journal start"); + fsfilt_check_slow(obd, now, "journal start"); return rc; } @@ -288,7 +288,7 @@ static inline int fsfilt_commit_async(struct obd_device *obd, int rc = obd->obd_fsops->fs_commit_async(inode, handle, wait_handle); CDEBUG(D_INFO, "committing handle %p (async)\n", *wait_handle); - 
fsfilt_check_slow(obd, now, obd_timeout, "journal start"); + fsfilt_check_slow(obd, now, "journal start"); return rc; } @@ -299,7 +299,7 @@ static inline int fsfilt_commit_wait(struct obd_device *obd, unsigned long now = jiffies; int rc = obd->obd_fsops->fs_commit_wait(inode, handle); CDEBUG(D_INFO, "waiting for completion %p\n", handle); - fsfilt_check_slow(obd, now, obd_timeout, "journal start"); + fsfilt_check_slow(obd, now, "journal start"); return rc; } @@ -309,7 +309,7 @@ static inline int fsfilt_setattr(struct obd_device *obd, struct dentry *dentry, unsigned long now = jiffies; int rc; rc = obd->obd_fsops->fs_setattr(dentry, handle, iattr, do_trunc); - fsfilt_check_slow(obd, now, obd_timeout, "setattr"); + fsfilt_check_slow(obd, now, "setattr"); return rc; } diff --git a/lustre/include/lprocfs_status.h b/lustre/include/lprocfs_status.h index f6f0b40..4331fd6 100644 --- a/lustre/include/lprocfs_status.h +++ b/lustre/include/lprocfs_status.h @@ -283,6 +283,23 @@ struct obd_device; struct file; struct obd_histogram; +/* Days / hours / mins / seconds format */ +struct dhms { + int d,h,m,s; +}; +static inline void s2dhms(struct dhms *ts, time_t secs) +{ + ts->d = secs / 86400; + secs = secs % 86400; + ts->h = secs / 3600; + secs = secs % 3600; + ts->m = secs / 60; + ts->s = secs % 60; +} +#define DHMS_FMT "%dd%dh%02dm%02ds" +#define DHMS_VARS(x) (x)->d, (x)->h, (x)->m, (x)->s + + #ifdef LPROCFS static inline int lprocfs_stats_lock(struct lprocfs_stats *stats, int type) @@ -436,6 +453,13 @@ extern int lprocfs_rd_num_exports(char *page, char **start, off_t off, int count, int *eof, void *data); extern int lprocfs_rd_numrefs(char *page, char **start, off_t off, int count, int *eof, void *data); +struct adaptive_timeout; +extern int lprocfs_at_hist_helper(char *page, int count, int rc, + struct adaptive_timeout *at); +extern int lprocfs_rd_timeouts(char *page, char **start, off_t off, + int count, int *eof, void *data); +extern int lprocfs_wr_timeouts(struct file 
*file, const char *buffer, + unsigned long count, void *data); extern int lprocfs_wr_evict_client(struct file *file, const char *buffer, unsigned long count, void *data); extern int lprocfs_wr_ping(struct file *file, const char *buffer, @@ -543,6 +567,10 @@ struct file_operations name##_fops = { \ #define LPROC_SEQ_FOPS_RO(name) __LPROC_SEQ_FOPS(name, NULL) #define LPROC_SEQ_FOPS(name) __LPROC_SEQ_FOPS(name, name##_seq_write) +/* lproc_ptlrpc.c */ +struct ptlrpc_request; +extern void target_print_req(void *seq_file, struct ptlrpc_request *req); + /* lprocfs_status.c: read recovery max time bz13079 */ int lprocfs_obd_rd_recovery_maxtime(char *page, char **start, off_t off, int count, int *eof, void *data); @@ -650,6 +678,16 @@ static inline int lprocfs_rd_num_exports(char *page, char **start, off_t off, static inline int lprocfs_rd_numrefs(char *page, char **start, off_t off, int count, int *eof, void *data) { return 0; } +struct adaptive_timeout; +static inline int lprocfs_at_hist_helper(char *page, int count, int rc, + struct adaptive_timeout *at) +{ return 0; } +static inline int lprocfs_rd_timeouts(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ return 0; } +static inline int lprocfs_wr_timeouts(struct file *file, const char *buffer, + unsigned long count, void *data) +{ return 0; } static inline int lprocfs_wr_evict_client(struct file *file, const char *buffer, unsigned long count, void *data) { return 0; } @@ -706,6 +744,9 @@ __u64 lprocfs_stats_collector(struct lprocfs_stats *stats, int idx, #define LPROC_SEQ_FOPS_RO(name) #define LPROC_SEQ_FOPS(name) +/* lproc_ptlrpc.c */ +#define target_print_req NULL + #endif /* LPROCFS */ #endif /* LPROCFS_SNMP_H */ diff --git a/lustre/include/lustre/lustre_idl.h b/lustre/include/lustre/lustre_idl.h index 016d8e8..82270b7 100644 --- a/lustre/include/lustre/lustre_idl.h +++ b/lustre/include/lustre/lustre_idl.h @@ -449,6 +449,9 @@ static inline void lustre_handle_copy(struct lustre_handle *tgt, 
tgt->cookie = src->cookie; } +/* flags for lm_flags */ +#define MSGHDR_AT_SUPPORT 0x1 + #define lustre_msg lustre_msg_v2 /* we depend on this structure to be 8-byte aligned */ /* this type is only endian-adjusted in lustre_unpack_msg() */ @@ -457,8 +460,8 @@ struct lustre_msg_v2 { __u32 lm_secflvr; __u32 lm_magic; __u32 lm_repsize; - __u32 lm_timeout; - __u32 lm_padding_1; + __u32 lm_cksum; + __u32 lm_flags; __u32 lm_padding_2; __u32 lm_padding_3; __u32 lm_buflens[0]; @@ -478,8 +481,8 @@ struct ptlrpc_body { __u32 pb_flags; __u32 pb_op_flags; __u32 pb_conn_cnt; - __u32 pb_padding_1; - __u32 pb_padding_2; + __u32 pb_timeout; /* for req, the deadline, for rep, the service est */ + __u32 pb_service_time; /* for rep, actual service time */ __u32 pb_limit; __u64 pb_slv; }; @@ -511,12 +514,16 @@ extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb); #define MSG_OP_FLAG_SHIFT 16 /* Flags that apply to all requests are in the bottom 16 bits */ -#define MSG_GEN_FLAG_MASK 0x0000ffff -#define MSG_LAST_REPLAY 1 -#define MSG_RESENT 2 -#define MSG_REPLAY 4 -#define MSG_REQ_REPLAY_DONE 8 -#define MSG_LOCK_REPLAY_DONE 16 +#define MSG_GEN_FLAG_MASK 0x0000ffff +#define MSG_LAST_REPLAY 0x0001 +#define MSG_RESENT 0x0002 +#define MSG_REPLAY 0x0004 +/* #define MSG_AT_SUPPORT 0x0008 + * This was used in early prototypes of adaptive timeouts, and while there + * shouldn't be any users of that code there also isn't a need for using this + * bits. Defer usage until at least 1.10 to avoid potential conflict. 
*/ +#define MSG_REQ_REPLAY_DONE 0x0010 +#define MSG_LOCK_REPLAY_DONE 0x0020 /* * Flags for all connect opcodes (MDS_CONNECT, OST_CONNECT) @@ -581,16 +588,16 @@ extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb); OBD_CONNECT_MDS_CAPA | OBD_CONNECT_OSS_CAPA | \ OBD_CONNECT_MDS_MDS | OBD_CONNECT_CANCELSET | \ OBD_CONNECT_FID | \ - LRU_RESIZE_CONNECT_FLAG) + LRU_RESIZE_CONNECT_FLAG | OBD_CONNECT_AT) #define OST_CONNECT_SUPPORTED (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \ OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \ OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_INDEX | \ OBD_CONNECT_BRW_SIZE | OBD_CONNECT_QUOTA64 | \ OBD_CONNECT_OSS_CAPA | OBD_CONNECT_CANCELSET | \ - OBD_CONNECT_CKSUM | \ - LRU_RESIZE_CONNECT_FLAG) + OBD_CONNECT_CKSUM | LRU_RESIZE_CONNECT_FLAG | \ + OBD_CONNECT_AT) #define ECHO_CONNECT_SUPPORTED (0) -#define MGS_CONNECT_SUPPORTED (OBD_CONNECT_VERSION) +#define MGS_CONNECT_SUPPORTED (OBD_CONNECT_VERSION | OBD_CONNECT_AT) #define MAX_QUOTA_COUNT32 (0xffffffffULL) diff --git a/lustre/include/lustre_dlm.h b/lustre/include/lustre_dlm.h index e6cac14..06b5a7b 100644 --- a/lustre/include/lustre_dlm.h +++ b/lustre/include/lustre_dlm.h @@ -429,6 +429,8 @@ struct ldlm_namespace { * Backward link to obd, required for ldlm pool to store new SLV. 
*/ struct obd_device *ns_obd; + + struct adaptive_timeout ns_at_estimate;/* estimated lock callback time*/ }; static inline int ns_is_client(struct ldlm_namespace *ns) diff --git a/lustre/include/lustre_export.h b/lustre/include/lustre_export.h index 4fdbca3..917e0da 100644 --- a/lustre/include/lustre_export.h +++ b/lustre/include/lustre_export.h @@ -95,6 +95,7 @@ struct obd_export { struct ldlm_export_data exp_ldlm_data; struct list_head exp_outstanding_replies; time_t exp_last_request_time; + struct list_head exp_req_replay_queue; spinlock_t exp_lock; /* protects flags int below */ /* ^ protects exp_outstanding_replies too */ __u64 exp_connect_flags; diff --git a/lustre/include/lustre_import.h b/lustre/include/lustre_import.h index 7a5c761..c3ab5fd 100644 --- a/lustre/include/lustre_import.h +++ b/lustre/include/lustre_import.h @@ -8,6 +8,22 @@ #include #include + +/* Adaptive Timeout stuff */ +#define D_ADAPTTO D_OTHER +#define AT_BINS 4 /* "bin" means "N seconds of history" */ +#define AT_FLG_NOHIST 0x1 /* use last reported value only */ + +struct adaptive_timeout { + time_t at_binstart; /* bin start time */ + unsigned int at_hist[AT_BINS]; /* timeout history bins */ + unsigned int at_flags; + unsigned int at_current; /* current timeout value */ + unsigned int at_worst_ever; /* worst-ever timeout value */ + time_t at_worst_time; /* worst-ever timeout timestamp */ + spinlock_t at_lock; +}; + enum lustre_imp_state { LUSTRE_IMP_CLOSED = 1, LUSTRE_IMP_NEW = 2, @@ -48,6 +64,13 @@ struct obd_import_conn { __u64 oic_last_attempt; /* jiffies, 64-bit */ }; +#define IMP_AT_MAX_PORTALS 8 +struct imp_at { + int iat_portal[IMP_AT_MAX_PORTALS]; + struct adaptive_timeout iat_net_latency; + struct adaptive_timeout iat_service_estimate[IMP_AT_MAX_PORTALS]; +}; + struct obd_import { struct portals_handle imp_handle; atomic_t imp_refcount; @@ -111,8 +134,12 @@ struct obd_import { int imp_connect_error; __u32 imp_msg_magic; + __u32 imp_msghdr_flags; /* adjusted based on server 
capability */ - struct ptlrpc_request_pool *imp_rq_pool; /* emergency request pool */ + struct ptlrpc_request_pool *imp_rq_pool; /* emergency request pool */ + + struct imp_at imp_at; /* adaptive timeout data */ + time_t imp_last_reply_time; /* for health check */ }; typedef void (*obd_import_callback)(struct obd_import *imp, void *closure, @@ -131,6 +158,23 @@ void class_unobserve_import(struct obd_import *imp, obd_import_callback cb, void class_notify_import_observers(struct obd_import *imp, int event, void *event_arg); +/* import.c */ +static inline void at_init(struct adaptive_timeout *at, int val, int flags) { + memset(at, 0, sizeof(*at)); + at->at_current = val; + at->at_worst_ever = val; + at->at_worst_time = cfs_time_current_sec(); + at->at_flags = flags; + spin_lock_init(&at->at_lock); +} +static inline int at_get(struct adaptive_timeout *at) { + return at->at_current; +} +int at_add(struct adaptive_timeout *at, unsigned int val); +int import_at_get_index(struct obd_import *imp, int portal); +extern unsigned int at_max; +#define AT_OFF (at_max == 0) + /* genops.c */ struct obd_export; extern struct obd_import *class_exp2cliimp(struct obd_export *); diff --git a/lustre/include/lustre_lib.h b/lustre/include/lustre_lib.h index 98bcf81..abce43e 100644 --- a/lustre/include/lustre_lib.h +++ b/lustre/include/lustre_lib.h @@ -73,18 +73,15 @@ int target_handle_dqacq_callback(struct ptlrpc_request *req); #define target_handle_qc_callback(req) (0) #endif -void target_cancel_recovery_timer(struct obd_device *obd); - -#define OBD_RECOVERY_TIMEOUT (obd_timeout * 5 / 2) /* *waves hands* */ #define OBD_RECOVERY_MAX_TIME (obd_timeout * 18) /* b13079 */ -void target_start_recovery_timer(struct obd_device *obd); + +void target_cancel_recovery_timer(struct obd_device *obd); int target_start_recovery_thread(struct obd_device *obd, - svc_handler_t handler); + svc_handler_t handler); void target_stop_recovery_thread(struct obd_device *obd); void target_cleanup_recovery(struct 
obd_device *obd); int target_queue_recovery_request(struct ptlrpc_request *req, struct obd_device *obd); -int target_queue_final_reply(struct ptlrpc_request *req, int rc); void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id); /* client.c */ diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h index d3bb4c6..0ca4cc5 100644 --- a/lustre/include/lustre_net.h +++ b/lustre/include/lustre_net.h @@ -293,11 +293,12 @@ struct lu_env; struct ptlrpc_request { int rq_type; /* one of PTL_RPC_MSG_* */ struct list_head rq_list; + struct list_head rq_timed_list; /* server-side early replies */ struct list_head rq_history_list; /* server-side history */ __u64 rq_history_seq; /* history sequence # */ int rq_status; spinlock_t rq_lock; - /* client-side flags */ + /* client-side flags are serialized by rq_lock */ unsigned long rq_intr:1, rq_replied:1, rq_err:1, rq_timedout:1, rq_resend:1, rq_restart:1, /* @@ -313,9 +314,15 @@ struct ptlrpc_request { /* this is the last request in the sequence. 
*/ rq_sequence:1, rq_no_resend:1, rq_waiting:1, rq_receiving_reply:1, - rq_no_delay:1, rq_net_err:1, rq_wait_ctx:1; + rq_no_delay:1, rq_net_err:1, rq_wait_ctx:1, + rq_early:1, rq_must_unlink:1, + /* server-side flags */ + rq_packed_final:1, /* packed final reply */ + rq_sent_final:1; /* stop sending early replies */ + enum rq_phase rq_phase; /* one of RQ_PHASE_* */ - atomic_t rq_refcount; /* client-side refcount for SENT race */ + atomic_t rq_refcount; /* client-side refcount for SENT race, + server-side refcounf for multiple replies */ struct ptlrpc_thread *rq_svc_thread; /* initial thread servicing req */ @@ -327,7 +334,6 @@ struct ptlrpc_request { int rq_reqlen; struct lustre_msg *rq_reqmsg; - int rq_timeout; /* time to wait for reply (seconds) */ int rq_replen; struct lustre_msg *rq_repmsg; __u64 rq_transno; @@ -364,12 +370,16 @@ struct ptlrpc_request { /* (server side), pointed directly into req buffer */ struct ptlrpc_user_desc *rq_user_desc; + /* early replies go to offset 0, regular replies go after that */ + unsigned int rq_reply_off; + /* various buffer pointers */ struct lustre_msg *rq_reqbuf; /* req wrapper */ int rq_reqbuf_len; /* req wrapper buf len */ int rq_reqdata_len; /* req wrapper msg len */ - struct lustre_msg *rq_repbuf; /* rep wrapper */ - int rq_repbuf_len; /* rep wrapper buf len */ + char *rq_repbuf; /* rep buffer */ + int rq_repbuf_len; /* rep buffer len */ + struct lustre_msg *rq_repdata; /* rep wrapper msg */ int rq_repdata_len; /* rep wrapper msg len */ struct lustre_msg *rq_clrbuf; /* only in priv mode */ int rq_clrbuf_len; /* only in priv mode */ @@ -381,6 +391,8 @@ struct ptlrpc_request { int rq_import_generation; enum lustre_imp_state rq_send_state; + int rq_early_count; /* how many early replies (for stats) */ + /* client+server request */ lnet_handle_md_t rq_req_md_h; struct ptlrpc_cb_id rq_req_cbid; @@ -407,10 +419,17 @@ struct ptlrpc_request { void (*rq_commit_cb)(struct ptlrpc_request *); void *rq_cb_data; - struct 
ptlrpc_bulk_desc *rq_bulk; /* client side bulk */ - time_t rq_sent; /* when request sent, seconds, - * or time when request should - * be sent */ + struct ptlrpc_bulk_desc *rq_bulk;/* client side bulk */ + + /* client outgoing req */ + time_t rq_sent; /* when request/reply sent (secs), or + * time when request should be sent */ + + volatile time_t rq_deadline; /* when request must finish. volatile + so that servers' early reply updates to the deadline aren't + kept in per-cpu cache */ + int rq_timeout; /* service time estimate (secs) */ + /* Multi-rpc bits */ struct list_head rq_set_chain; struct ptlrpc_request_set *rq_set; @@ -604,13 +623,22 @@ struct ptlrpc_service { int srv_n_difficult_replies; /* # 'difficult' replies */ int srv_n_active_reqs; /* # reqs being served */ cfs_duration_t srv_rqbd_timeout; /* timeout before re-posting reqs, in tick */ - int srv_watchdog_timeout; /* soft watchdog timeout, in ms */ + int srv_watchdog_factor; /* soft watchdog timeout mutiplier */ unsigned srv_cpu_affinity:1; /* bind threads to CPUs */ + unsigned srv_at_check:1; /* check early replies */ + cfs_time_t srv_at_checktime; /* debug */ __u32 srv_req_portal; __u32 srv_rep_portal; - int srv_n_queued_reqs; /* # reqs waiting to be served */ + /* AT stuff */ + struct adaptive_timeout srv_at_estimate;/* estimated rpc service time */ + spinlock_t srv_at_lock; + struct list_head srv_at_list; /* reqs waiting for replies */ + cfs_timer_t srv_at_timer; /* early reply timer */ + + int srv_n_queued_reqs; /* # reqs in either of the queues below */ + struct list_head srv_req_in_queue; /* incoming reqs */ struct list_head srv_request_queue; /* reqs waiting for service */ struct list_head srv_request_history; /* request history */ @@ -707,11 +735,14 @@ static inline int ptlrpc_bulk_active (struct ptlrpc_bulk_desc *desc) return (rc); } -int ptlrpc_send_reply(struct ptlrpc_request *req, int); +#define PTLRPC_REPLY_MAYBE_DIFFICULT 0x01 +#define PTLRPC_REPLY_EARLY 0x02 +int 
ptlrpc_send_reply(struct ptlrpc_request *req, int flags); int ptlrpc_reply(struct ptlrpc_request *req); int ptlrpc_send_error(struct ptlrpc_request *req, int difficult); int ptlrpc_error(struct ptlrpc_request *req); void ptlrpc_resend_req(struct ptlrpc_request *request); +int ptlrpc_at_get_net_latency(struct ptlrpc_request *req); int ptl_send_rpc(struct ptlrpc_request *request, int noreply); int ptlrpc_register_rqbd (struct ptlrpc_request_buffer_desc *rqbd); @@ -722,23 +753,12 @@ void ptlrpc_cleanup_client(struct obd_import *imp); struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid); static inline int -ptlrpc_client_receiving_reply (struct ptlrpc_request *req) +ptlrpc_client_recv_or_unlink (struct ptlrpc_request *req) { int rc; spin_lock(&req->rq_lock); - rc = req->rq_receiving_reply; - spin_unlock(&req->rq_lock); - return (rc); -} - -static inline int -ptlrpc_client_replied (struct ptlrpc_request *req) -{ - int rc; - - spin_lock(&req->rq_lock); - rc = req->rq_replied; + rc = req->rq_receiving_reply || req->rq_must_unlink; spin_unlock(&req->rq_lock); return (rc); } @@ -776,6 +796,7 @@ void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool); void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq); struct ptlrpc_request_pool *ptlrpc_init_rq_pool(int, int, void (*populate_pool)(struct ptlrpc_request_pool *, int)); +void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req); struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp, const struct req_format *format); struct ptlrpc_request *ptlrpc_request_alloc_pool(struct obd_import *imp, @@ -821,7 +842,7 @@ struct ptlrpc_service_conf { int psc_max_reply_size; int psc_req_portal; int psc_rep_portal; - int psc_watchdog_timeout; /* in ms */ + int psc_watchdog_factor; int psc_min_threads; int psc_max_threads; __u32 psc_ctx_tags; @@ -841,7 +862,7 @@ struct ptlrpc_service *ptlrpc_init_svc_conf(struct ptlrpc_service_conf *c, struct ptlrpc_service *ptlrpc_init_svc(int nbufs, 
int bufsize, int max_req_size, int max_reply_size, int req_portal, int rep_portal, - int watchdog_timeout, /* in ms */ + int watchdog_factor, svc_handler_t, char *name, cfs_proc_dir_entry_t *proc_entry, svcreq_printfn_t, @@ -881,7 +902,10 @@ int lustre_pack_request(struct ptlrpc_request *, __u32 magic, int count, int lustre_pack_reply(struct ptlrpc_request *, int count, int *lens, char **bufs); int lustre_pack_reply_v2(struct ptlrpc_request *req, int count, - int *lens, char **bufs); + int *lens, char **bufs, int flags); +#define LPRFL_EARLY_REPLY 1 +int lustre_pack_reply_flags(struct ptlrpc_request *, int count, int *lens, + char **bufs, int flags); int lustre_shrink_msg(struct lustre_msg *msg, int segment, unsigned int newlen, int move_data); void lustre_free_reply_state(struct ptlrpc_reply_state *rs); @@ -889,6 +913,7 @@ int lustre_msg_hdr_size(__u32 magic, int count); int lustre_msg_size(__u32 magic, int count, int *lengths); int lustre_msg_size_v2(int count, int *lengths); int lustre_packed_msg_size(struct lustre_msg *msg); +int lustre_msg_early_size(void); int lustre_unpack_msg(struct lustre_msg *m, int len); void *lustre_msg_buf_v2(struct lustre_msg_v2 *m, int n, int min_size); void *lustre_msg_buf(struct lustre_msg *m, int n, int minlen); @@ -901,6 +926,8 @@ void *lustre_swab_reqbuf(struct ptlrpc_request *req, int n, int minlen, void *swabber); void *lustre_swab_repbuf(struct ptlrpc_request *req, int n, int minlen, void *swabber); +__u32 lustre_msghdr_get_flags(struct lustre_msg *msg); +void lustre_msghdr_set_flags(struct lustre_msg *msg, __u32 flags); __u32 lustre_msg_get_flags(struct lustre_msg *msg); void lustre_msg_add_flags(struct lustre_msg *msg, int flags); void lustre_msg_set_flags(struct lustre_msg *msg, int flags); @@ -922,7 +949,12 @@ void lustre_msg_set_slv(struct lustre_msg *msg, __u64 slv); void lustre_msg_set_limit(struct lustre_msg *msg, __u64 limit); int lustre_msg_get_status(struct lustre_msg *msg); __u32 lustre_msg_get_conn_cnt(struct 
lustre_msg *msg); +int lustre_msg_is_v1(struct lustre_msg *msg); __u32 lustre_msg_get_magic(struct lustre_msg *msg); +__u32 lustre_msg_get_timeout(struct lustre_msg *msg); +__u32 lustre_msg_get_service_time(struct lustre_msg *msg); +__u32 lustre_msg_get_cksum(struct lustre_msg *msg); +__u32 lustre_msg_calc_cksum(struct lustre_msg *msg); void lustre_msg_set_handle(struct lustre_msg *msg,struct lustre_handle *handle); void lustre_msg_set_type(struct lustre_msg *msg, __u32 type); void lustre_msg_set_opc(struct lustre_msg *msg, __u32 opc); @@ -933,6 +965,9 @@ void lustre_msg_set_status(struct lustre_msg *msg, __u32 status); void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt); void ptlrpc_req_set_repsize(struct ptlrpc_request *req, int count, int *sizes); void ptlrpc_request_set_replen(struct ptlrpc_request *req); +void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout); +void lustre_msg_set_service_time(struct lustre_msg *msg, __u32 service_time); +void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum); static inline void lustre_shrink_reply(struct ptlrpc_request *req, int segment, @@ -959,6 +994,16 @@ ptlrpc_rs_decref(struct ptlrpc_reply_state *rs) lustre_free_reply_state(rs); } +/* Should only be called once per req */ +static inline void ptlrpc_req_drop_rs(struct ptlrpc_request *req) +{ + if (req->rq_reply_state == NULL) + return; /* shouldn't occur */ + ptlrpc_rs_decref(req->rq_reply_state); + req->rq_reply_state = NULL; + req->rq_repmsg = NULL; +} + static inline __u32 lustre_request_magic(struct ptlrpc_request *req) { return lustre_msg_get_magic(req->rq_reqmsg); diff --git a/lustre/include/lustre_sec.h b/lustre/include/lustre_sec.h index 033a0cf..7c4be3c 100644 --- a/lustre/include/lustre_sec.h +++ b/lustre/include/lustre_sec.h @@ -656,6 +656,9 @@ int sptlrpc_cli_alloc_repbuf(struct ptlrpc_request *req, int msgsize); void sptlrpc_cli_free_repbuf(struct ptlrpc_request *req); int sptlrpc_cli_enlarge_reqbuf(struct ptlrpc_request 
*req, int segment, int newsize); +int sptlrpc_cli_unwrap_early_reply(struct ptlrpc_request *req); +int sptlrpc_cli_finish_early_reply(struct ptlrpc_request *req); + void sptlrpc_request_out_callback(struct ptlrpc_request *req); /* diff --git a/lustre/include/obd.h b/lustre/include/obd.h index 5836004..e5804fe 100644 --- a/lustre/include/obd.h +++ b/lustre/include/obd.h @@ -941,8 +941,9 @@ struct obd_device { spinlock_t obd_uncommitted_replies_lock; cfs_timer_t obd_recovery_timer; time_t obd_recovery_start; /* seconds */ - time_t obd_recovery_end; /* seconds */ + time_t obd_recovery_end; /* seconds, for lprocfs_status */ time_t obd_recovery_max_time; /* seconds, bz13079 */ + int obd_recovery_timeout; /* new recovery stuff from CMD2 */ struct target_recovery_data obd_recovery_data; diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 6765019..fd4d87f 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -56,11 +56,10 @@ extern unsigned int obd_fail_val; extern unsigned int obd_debug_peer_on_timeout; extern unsigned int obd_dump_on_timeout; extern unsigned int obd_dump_on_eviction; +/* obd_timeout should only be used for recovery, not for + networking / disk / timings affected by load (use Adaptive Timeouts) */ extern unsigned int obd_timeout; /* seconds */ -#define PING_INTERVAL max(obd_timeout / 4, 1U) -#define RECONNECT_INTERVAL max(obd_timeout / 10, 10U) -extern unsigned int ldlm_timeout; -extern unsigned int obd_health_check_timeout; +extern unsigned int ldlm_timeout; /* seconds */ extern unsigned int obd_sync_filter; extern unsigned int obd_max_dirty_pages; extern atomic_t obd_dirty_pages; @@ -69,17 +68,47 @@ extern int obd_race_state; extern unsigned int obd_alloc_fail_rate; int __obd_fail_check_set(__u32 id, __u32 value, int set); +int __obd_fail_timeout_set(__u32 id, __u32 value, int ms, int set); /* lvfs.c */ int obd_alloc_fail(const void *ptr, const char *name, const char *type, size_t size, const char 
*file, int line); /* Timeout definitions */ -#define LDLM_TIMEOUT_DEFAULT 20 #define OBD_TIMEOUT_DEFAULT 100 -#define HEALTH_CHECK_COEF 3 / 2 -#define HEALTH_CHECK_TIMEOUT_DEFAULT (OBD_TIMEOUT_DEFAULT * HEALTH_CHECK_COEF) -#define HEALTH_CHECK_TIMEOUT (obd_timeout * HEALTH_CHECK_COEF) +#define LDLM_TIMEOUT_DEFAULT 20 +/* Time to wait for all clients to reconnect during recovery */ +/* Should be very conservative; must catch the first reconnect after reboot */ +#define OBD_RECOVERY_FACTOR (3) /* times obd_timeout */ +/* Change recovery-small 26b time if you change this */ +#define PING_INTERVAL max(obd_timeout / 4, 1U) +/* Client may skip 1 ping; we must wait at least 2.5. But for multiple + * failover targets the client only pings one server at a time, and pings + * can be lost on a loaded network. Since eviction has serious consequences, + * and there's no urgent need to evict a client just because it's idle, we + * should be very conservative here. */ +#define PING_EVICT_TIMEOUT (PING_INTERVAL * 6) +#define DISK_TIMEOUT 50 /* Beyond this we warn about disk speed */ +#define CONNECTION_SWITCH_MIN 5U /* Connection switching rate limiter */ + /* Max connect interval for nonresponsive servers; ~50s to avoid building up + connect requests in the LND queues, but within obd_timeout so we don't + miss the recovery window */ +#define CONNECTION_SWITCH_MAX min(50U, max(CONNECTION_SWITCH_MIN,obd_timeout)) +#define CONNECTION_SWITCH_INC 5 /* Connection timeout backoff */ +#ifndef CRAY_XT3 +/* In general this should be low to have quick detection of a system + running on a backup server. (If it's too low, import_select_connection + will increase the timeout anyhow.) */ +#define INITIAL_CONNECT_TIMEOUT max(CONNECTION_SWITCH_MIN,obd_timeout/20) +#else +/* ...but for very large systems (e.g. CRAY) we need to keep the initial + connect t.o. 
high (bz 10803), because they will nearly ALWAYS be doing the + connects for the first time (clients "reboot" after every process, so no + chance to generate adaptive timeout data. */ +#define INITIAL_CONNECT_TIMEOUT max(CONNECTION_SWITCH_MIN,obd_timeout/2) +#endif +#define LONG_UNLINK 300 /* Unlink should happen before now */ + #define OBD_FAIL_MDS 0x100 #define OBD_FAIL_MDS_HANDLE_UNPACK 0x101 @@ -165,7 +194,7 @@ int obd_alloc_fail(const void *ptr, const char *name, const char *type, #define OBD_FAIL_OST_BRW_READ_BULK 0x20f #define OBD_FAIL_OST_SYNC_NET 0x210 #define OBD_FAIL_OST_ALL_REPLY_NET 0x211 -#define OBD_FAIL_OST_ALL_REQUESTS_NET 0x212 +#define OBD_FAIL_OST_ALL_REQUEST_NET 0x212 #define OBD_FAIL_OST_LDLM_REPLY_NET 0x213 #define OBD_FAIL_OST_BRW_PAUSE_BULK 0x214 #define OBD_FAIL_OST_ENOSPC 0x215 @@ -182,6 +211,8 @@ int obd_alloc_fail(const void *ptr, const char *name, const char *type, #define OBD_FAIL_OST_BRW_WRITE_BULK2 0x220 #define OBD_FAIL_OST_LLOG_RECOVERY_TIMEOUT 0x221 #define OBD_FAIL_OST_CANCEL_COOKIE_TIMEOUT 0x222 +#define OBD_FAIL_OST_PAUSE_CREATE 0x223 +#define OBD_FAIL_OST_BRW_PAUSE_PACK 0x224 #define OBD_FAIL_OST_CONNECT_NET2 0x225 #define OBD_FAIL_LDLM 0x300 @@ -202,9 +233,7 @@ int obd_alloc_fail(const void *ptr, const char *name, const char *type, #define OBD_FAIL_LDLM_GLIMPSE 0x30f #define OBD_FAIL_LDLM_CANCEL_RACE 0x310 #define OBD_FAIL_LDLM_CANCEL_EVICT_RACE 0x311 -/* #define OBD_FAIL_LDLM_PAUSE_CANCEL 0x312 -*/ #define OBD_FAIL_LDLM_CLOSE_THREAD 0x313 #define OBD_FAIL_LDLM_CANCEL_BL_CB_RACE 0x314 @@ -231,6 +260,9 @@ int obd_alloc_fail(const void *ptr, const char *name, const char *type, #define OBD_FAIL_PTLRPC_DROP_RPC 0x505 #define OBD_FAIL_PTLRPC_DELAY_SEND 0x506 #define OBD_FAIL_PTLRPC_DELAY_RECOV 0x507 +#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB 0x508 +#define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a +#define OBD_FAIL_PTLRPC_PAUSE_REP 0x50c #define OBD_FAIL_OBD_PING_NET 0x600 #define OBD_FAIL_OBD_LOG_CANCEL_NET 0x601 @@ -245,23 +277,25 @@ int 
obd_alloc_fail(const void *ptr, const char *name, const char *type, #define OBD_FAIL_TGT_DELAY_RECONNECT 0x704 #define OBD_FAIL_TGT_DELAY_PRECREATE 0x705 #define OBD_FAIL_TGT_TOOMANY_THREADS 0x706 +#define OBD_FAIL_TGT_REPLAY_DROP 0x707 #define OBD_FAIL_MDC_REVALIDATE_PAUSE 0x800 #define OBD_FAIL_MDC_ENQUEUE_PAUSE 0x801 +#define OBD_FAIL_MDC_OLD_EXT_FLAGS 0x802 #define OBD_FAIL_MDC_GETATTR_ENQUEUE 0x803 #define OBD_FAIL_MGS 0x900 #define OBD_FAIL_MGS_ALL_REQUEST_NET 0x901 #define OBD_FAIL_MGS_ALL_REPLY_NET 0x902 -#define OBD_FAIL_MGC_PROCESS_LOG 0x903 -#define OBD_FAIL_MGS_SLOW_REQUEST_NET 0x904 -#define OBD_FAIL_MGS_SLOW_TARGET_REG 0x905 +#define OBD_FAIL_MGC_PAUSE_PROCESS_LOG 0x903 +#define OBD_FAIL_MGS_PAUSE_REQ 0x904 +#define OBD_FAIL_MGS_PAUSE_TARGET_REG 0x905 -#define OBD_FAIL_QUOTA_QD_COUNT_32BIT 0xa00 +#define OBD_FAIL_QUOTA_QD_COUNT_32BIT 0xA00 -#define OBD_FAIL_LPROC_REMOVE 0xb00 +#define OBD_FAIL_LPROC_REMOVE 0xB00 -#define OBD_FAIL_GENERAL_ALLOC 0xc00 +#define OBD_FAIL_GENERAL_ALLOC 0xC00 #define OBD_FAIL_SEQ 0x1000 #define OBD_FAIL_SEQ_QUERY_NET 0x1001 @@ -273,6 +307,7 @@ int obd_alloc_fail(const void *ptr, const char *name, const char *type, #define OBD_FAIL_SEC_CTX_INIT_NET 0x1201 #define OBD_FAIL_SEC_CTX_INIT_CONT_NET 0x1202 #define OBD_FAIL_SEC_CTX_FINI_NET 0x1203 +#define OBD_FAIL_SEC_CTX_HDL_PAUSE 0x1204 /* Failure injection control */ #define OBD_FAIL_MASK_SYS 0x0000FF00 @@ -321,28 +356,28 @@ static inline int obd_fail_check_set(__u32 id, __u32 value, int set) obd_fail_check_set(id, value, OBD_FAIL_LOC_RESET) -static inline int obd_fail_timeout_set(__u32 id, __u32 value, int secs, int set) +static inline int obd_fail_timeout_set(__u32 id, __u32 value, int ms, int set) { - int ret = 0; - if (unlikely(OBD_FAIL_PRECHECK(id) && - (ret = __obd_fail_check_set(id, value, set)))) { - CERROR("obd_fail_timeout id %x sleeping for %d secs\n", - id, secs); - set_current_state(TASK_UNINTERRUPTIBLE); - cfs_schedule_timeout(CFS_TASK_UNINT, 
cfs_time_seconds(secs)); - set_current_state(TASK_RUNNING); - CERROR("obd_fail_timeout id %x awake\n", id); - } - return ret; + if (unlikely(OBD_FAIL_PRECHECK(id))) + return __obd_fail_timeout_set(id, value, ms, set); + else + return 0; } -/* If id hit obd_fail_loc, sleep secs */ +/* If id hit obd_fail_loc, sleep for seconds or milliseconds */ #define OBD_FAIL_TIMEOUT(id, secs) \ - obd_fail_timeout_set(id, 0, secs, OBD_FAIL_LOC_NOSET) + obd_fail_timeout_set(id, 0, secs * 1000, OBD_FAIL_LOC_NOSET) + +#define OBD_FAIL_TIMEOUT_MS(id, ms) \ + obd_fail_timeout_set(id, 0, ms, OBD_FAIL_LOC_NOSET) -/* If id hit obd_fail_loc, obd_fail_loc |= value and sleep secs */ +/* If id hit obd_fail_loc, obd_fail_loc |= value and + * sleep seconds or milliseconds */ #define OBD_FAIL_TIMEOUT_ORSET(id, value, secs) \ - obd_fail_timeout_set(id, value, secs, OBD_FAIL_LOC_ORSET) + obd_fail_timeout_set(id, value, secs * 1000, OBD_FAIL_LOC_ORSET) + +#define OBD_FAIL_TIMEOUT_MS_ORSET(id, value, ms) \ + obd_fail_timeout_set(id, value, ms, OBD_FAIL_LOC_ORSET) #ifdef __KERNEL__ static inline void obd_fail_write(int id, struct super_block *sb) diff --git a/lustre/ldlm/ldlm_internal.h b/lustre/ldlm/ldlm_internal.h index 4cf0798..21ab977 100644 --- a/lustre/ldlm/ldlm_internal.h +++ b/lustre/ldlm/ldlm_internal.h @@ -49,6 +49,8 @@ int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels, int count, int max, int cancel_flags, int flags); int ldlm_cancel_lru_estimate(struct ldlm_namespace *ns, int count, int max, int flags); +extern int ldlm_enqueue_min; +int ldlm_get_enq_timeout(struct ldlm_lock *lock); /* ldlm_resource.c */ int ldlm_resource_putref_locked(struct ldlm_resource *res); diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 2bb5e64..2c2afb2 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -576,6 +576,10 @@ void target_client_add_cb(struct obd_device *obd, __u64 transno, void *cb_data, spin_unlock(&exp->exp_lock); } 
EXPORT_SYMBOL(target_client_add_cb); +static void +target_start_and_reset_recovery_timer(struct obd_device *obd, + struct ptlrpc_request *req, + int new_client); int target_handle_connect(struct ptlrpc_request *req) { @@ -766,26 +770,33 @@ int target_handle_connect(struct ptlrpc_request *req) (time_t)cfs_time_current_sec()); } - /* We want to handle EALREADY but *not* -EALREADY from - * target_handle_reconnect(), return reconnection state in a flag */ - if (rc == EALREADY) { - lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECONNECT); - rc = 0; - } else if (rc) { + if (rc < 0) { GOTO(out, rc); } - /* Tell the client if we're in recovery. */ - /* If this is the first client, start the recovery timer */ + CWARN("%s: connection from %s@%s %st"LPU64" exp %p cur %ld last %ld\n", target->obd_name, cluuid.uuid, libcfs_nid2str(req->rq_peer.nid), target->obd_recovering ? "recovering/" : "", data->ocd_transno, export, (long)cfs_time_current_sec(), export ? (long)export->exp_last_request_time : 0); - + /* Tell the client if we're in recovery. 
*/ if (target->obd_recovering) { lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECOVERING); - target_start_recovery_timer(target); + /* If this is the first time a client connects, + reset the recovery timer */ + if (rc == 0) + target_start_and_reset_recovery_timer(target, req, + !export); + } + + /* We want to handle EALREADY but *not* -EALREADY from + * target_handle_reconnect(), return reconnection state in a flag */ + if (rc == EALREADY) { + lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECONNECT); + rc = 0; + } else { + LASSERT(rc == 0); } /* Tell the client if we support replayable requests */ @@ -957,6 +968,12 @@ dont_check_exports: revimp->imp_state = LUSTRE_IMP_FULL; revimp->imp_msg_magic = req->rq_reqmsg->lm_magic; + if ((export->exp_connect_flags & OBD_CONNECT_AT) && + (revimp->imp_msg_magic != LUSTRE_MSG_MAGIC_V1)) + revimp->imp_msghdr_flags |= MSGHDR_AT_SUPPORT; + else + revimp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT; + rc = sptlrpc_import_sec_adapt(revimp, req->rq_svc_ctx, req->rq_flvr.sf_rpc); if (rc) { @@ -1046,6 +1063,7 @@ struct ptlrpc_request *ptlrpc_clone_req( struct ptlrpc_request *orig_req) class_export_get(copy_req->rq_export); CFS_INIT_LIST_HEAD(©_req->rq_list); + CFS_INIT_LIST_HEAD(©_req->rq_replay_list); sptlrpc_svc_ctx_addref(copy_req); if (copy_req->rq_reply_state) { @@ -1059,13 +1077,11 @@ struct ptlrpc_request *ptlrpc_clone_req( struct ptlrpc_request *orig_req) return copy_req; } -void ptlrpc_free_clone( struct ptlrpc_request *req) +void ptlrpc_free_clone(struct ptlrpc_request *req) { - if (req->rq_reply_state) { - ptlrpc_rs_decref(req->rq_reply_state); - req->rq_reply_state = NULL; - } + LASSERT(list_empty(&req->rq_replay_list)); + ptlrpc_req_drop_rs(req); sptlrpc_svc_ctx_decref(req); class_export_put(req->rq_export); list_del(&req->rq_list); @@ -1078,6 +1094,48 @@ void ptlrpc_free_clone( struct ptlrpc_request *req) OBD_FREE_PTR(req); } +static int target_exp_enqueue_req_replay(struct ptlrpc_request *req) +{ + __u64 
transno = lustre_msg_get_transno(req->rq_reqmsg); + struct obd_export *exp = req->rq_export; + struct ptlrpc_request *reqiter; + int dup = 0; + + LASSERT(exp); + + spin_lock(&exp->exp_lock); + list_for_each_entry(reqiter, &exp->exp_req_replay_queue, + rq_replay_list) { + if (lustre_msg_get_transno(reqiter->rq_reqmsg) == transno) { + dup = 1; + break; + } + } + + if (dup) { + /* we expect it with RESENT and REPLAY flags */ + if ((lustre_msg_get_flags(req->rq_reqmsg) & + (MSG_RESENT | MSG_REPLAY)) != (MSG_RESENT | MSG_REPLAY)) + CERROR("invalid flags %x of resent replay\n", + lustre_msg_get_flags(req->rq_reqmsg)); + } else { + list_add_tail(&req->rq_replay_list, &exp->exp_req_replay_queue); + } + + spin_unlock(&exp->exp_lock); + return dup; +} + +static void target_exp_dequeue_req_replay(struct ptlrpc_request *req) +{ + LASSERT(!list_empty(&req->rq_replay_list)); + LASSERT(req->rq_export); + + spin_lock(&req->rq_export->exp_lock); + list_del_init(&req->rq_replay_list); + spin_unlock(&req->rq_export->exp_lock); +} + #ifdef __KERNEL__ static void target_finish_recovery(struct obd_device *obd) { @@ -1109,6 +1167,7 @@ static void abort_req_replay_queue(struct obd_device *obd) DEBUG_REQ(D_ERROR, req, "failed abort_req_reply; skipping"); } + target_exp_dequeue_req_replay(req); ptlrpc_free_clone(req); } } @@ -1157,6 +1216,7 @@ void target_cleanup_recovery(struct obd_device *obd) list_for_each_entry_safe(req, n, &obd->obd_req_replay_queue, rq_list) { LASSERT (req->rq_reply_state == 0); + target_exp_dequeue_req_replay(req); ptlrpc_free_clone(req); } list_for_each_entry_safe(req, n, &obd->obd_lock_replay_queue, rq_list){ @@ -1174,7 +1234,11 @@ void target_cleanup_recovery(struct obd_device *obd) static void target_recovery_expired(unsigned long castmeharder) { struct obd_device *obd = (struct obd_device *)castmeharder; - CERROR("%s: recovery timed out, aborting\n", obd->obd_name); + LCONSOLE_WARN("%s: recovery timed out; %d clients never reconnected " + "after %lds (%d clients 
did)\n", + obd->obd_name, obd->obd_recoverable_clients, + cfs_time_current_sec()- obd->obd_recovery_start, + obd->obd_connected_clients); spin_lock_bh(&obd->obd_processing_task_lock); if (obd->obd_recovering) obd->obd_abort_recovery = 1; @@ -1189,43 +1253,96 @@ void target_cancel_recovery_timer(struct obd_device *obd) CDEBUG(D_HA, "%s: cancel recovery timer\n", obd->obd_name); cfs_timer_disarm(&obd->obd_recovery_timer); } - -static void reset_recovery_timer(struct obd_device *obd) + +/* extend = 1 means require at least "duration" seconds left in the timer, + extend = 0 means set the total duration (start_recovery_timer) */ +static void reset_recovery_timer(struct obd_device *obd, int duration, + int extend) { - time_t timeout_shift = OBD_RECOVERY_TIMEOUT; + cfs_time_t now = cfs_time_current_sec(); + cfs_duration_t left; + spin_lock_bh(&obd->obd_processing_task_lock); - if (!obd->obd_recovering) { + if (!obd->obd_recovering || obd->obd_abort_recovery) { spin_unlock_bh(&obd->obd_processing_task_lock); return; } - if (cfs_time_current_sec() + OBD_RECOVERY_TIMEOUT > - obd->obd_recovery_start + obd->obd_recovery_max_time) - timeout_shift = obd->obd_recovery_start + - obd->obd_recovery_max_time - cfs_time_current_sec(); - cfs_timer_arm(&obd->obd_recovery_timer, cfs_time_shift(timeout_shift)); + + left = cfs_time_sub(obd->obd_recovery_end, now); + + if (extend && (duration > left)) + obd->obd_recovery_timeout += duration - left; + else if (!extend && (duration > obd->obd_recovery_timeout)) + /* Track the client's largest expected replay time */ + obd->obd_recovery_timeout = duration; +#ifdef CRAY_XT3 + /* + * If total recovery time already exceed the + * obd_recovery_max_time, then CRAY XT3 will + * abort the recovery + */ + if(obd->obd_recovery_timeout > obd->obd_recovery_max_time) + obd->obd_recovery_timeout = obd->obd_recovery_max_time; +#endif + obd->obd_recovery_end = obd->obd_recovery_start + + obd->obd_recovery_timeout; + if 
(!cfs_timer_is_armed(&obd->obd_recovery_timer) || + cfs_time_before(now, obd->obd_recovery_end)) { + left = cfs_time_sub(obd->obd_recovery_end, now); + cfs_timer_arm(&obd->obd_recovery_timer, cfs_time_shift(left)); + } spin_unlock_bh(&obd->obd_processing_task_lock); - CDEBUG(D_HA, "%s: timer will expire in %u seconds\n", obd->obd_name, - (unsigned int)timeout_shift); - /* Only used for lprocfs_status */ - obd->obd_recovery_end = cfs_time_current_sec() + timeout_shift; + CDEBUG(D_HA, "%s: recovery timer will expire in %u seconds\n", + obd->obd_name, (unsigned)left); } +static void resume_recovery_timer(struct obd_device *obd) +{ + LASSERT(!cfs_timer_is_armed(&obd->obd_recovery_timer)); + + /* to be safe, make it at least OBD_RECOVERY_FACTOR * obd_timeout */ + reset_recovery_timer(obd, OBD_RECOVERY_FACTOR * obd_timeout, 1); +} -/* Only start it the first time called */ -void target_start_recovery_timer(struct obd_device *obd) +static void check_and_start_recovery_timer(struct obd_device *obd) { spin_lock_bh(&obd->obd_processing_task_lock); - if (obd->obd_recovery_handler - || timer_pending((struct timer_list *)&obd->obd_recovery_timer)) { + if (cfs_timer_is_armed(&obd->obd_recovery_timer)) { spin_unlock_bh(&obd->obd_processing_task_lock); return; } - CWARN("%s: starting recovery timer (%us)\n", obd->obd_name, - OBD_RECOVERY_TIMEOUT); - cfs_timer_init(&obd->obd_recovery_timer, target_recovery_expired, obd); + CWARN("%s: starting recovery timer\n", obd->obd_name); + obd->obd_recovery_start = cfs_time_current_sec(); + /* minimum */ + obd->obd_recovery_timeout = OBD_RECOVERY_FACTOR * obd_timeout; spin_unlock_bh(&obd->obd_processing_task_lock); - reset_recovery_timer(obd); + reset_recovery_timer(obd, obd->obd_recovery_timeout, 0); +} + +/* Reset the timer with each new client connection */ +/* + * This timer is actually reconnect_timer, which is for making sure + * the total recovery window is at least as big as my reconnect + * attempt timing. 
So the initial recovery time_out will be set to + * OBD_RECOVERY_FACTOR * obd_timeout. If the timeout coming + * from client is bigger than this, then the recovery time_out will + * be extended to make sure the client could be reconnected, in the + * process, the timeout from the new client should be ignored. + */ + +static void +target_start_and_reset_recovery_timer(struct obd_device *obd, + struct ptlrpc_request *req, + int new_client) +{ + int req_timeout = OBD_RECOVERY_FACTOR * + lustre_msg_get_timeout(req->rq_reqmsg); + + check_and_start_recovery_timer(obd); + + if (req_timeout > obd->obd_recovery_timeout && !new_client) + reset_recovery_timer(obd, req_timeout, 0); } #ifdef __KERNEL__ @@ -1317,6 +1434,7 @@ static struct ptlrpc_request *target_next_replay_req(struct obd_device *obd) } else if (!list_empty(&obd->obd_req_replay_queue)) { req = list_entry(obd->obd_req_replay_queue.next, struct ptlrpc_request, rq_list); + target_exp_dequeue_req_replay(req); list_del_init(&req->rq_list); obd->obd_requests_queued_for_recovery--; } else { @@ -1441,7 +1559,9 @@ static int handle_recovery_req(struct ptlrpc_thread *thread, /* don't reset timer for final stage */ if (!req_replay_done(req->rq_export) || !lock_replay_done(req->rq_export)) - reset_recovery_timer(class_exp2obd(req->rq_export)); + reset_recovery_timer(class_exp2obd(req->rq_export), + OBD_RECOVERY_FACTOR * (AT_OFF ?
obd_timeout : at_get(&req->rq_rqbd->rqbd_service->srv_at_estimate)), 1); ptlrpc_free_clone(req); RETURN(0); } @@ -1504,6 +1624,7 @@ static int target_recovery_thread(void *arg) CDEBUG(D_INFO, "1: request replay stage - %d clients from t"LPU64"\n", atomic_read(&obd->obd_req_replay_clients), obd->obd_next_recovery_transno); + resume_recovery_timer(obd); while ((req = target_next_replay_req(obd))) { LASSERT(trd->trd_processing_task == current->pid); DEBUG_REQ(D_HA, req, "processing t"LPD64" from %s", @@ -1528,9 +1649,11 @@ static int target_recovery_thread(void *arg) class_disconnect_stale_exports(obd, req_replay_done); abort_req_replay_queue(obd); } + /* The second stage: replay locks */ CDEBUG(D_INFO, "2: lock replay stage - %d clients\n", atomic_read(&obd->obd_lock_replay_clients)); + resume_recovery_timer(obd); while ((req = target_next_replay_lock(obd))) { LASSERT(trd->trd_processing_task == current->pid); DEBUG_REQ(D_HA|D_WARNING, req, "processing lock from %s: ", @@ -1636,12 +1759,13 @@ void target_recovery_init(struct obd_device *obd, svc_handler_t handler) "last_transno "LPU64"\n", obd->obd_name, obd->obd_max_recoverable_clients, obd->obd_last_committed); obd->obd_next_recovery_transno = obd->obd_last_committed + 1; - target_start_recovery_thread(obd, handler); - obd->obd_recovery_start = cfs_time_current_sec(); - /* Only used for lprocfs_status */ - obd->obd_recovery_end = obd->obd_recovery_start + OBD_RECOVERY_TIMEOUT; + obd->obd_recovery_start = 0; + obd->obd_recovery_end = 0; + obd->obd_recovery_timeout = OBD_RECOVERY_FACTOR * obd_timeout; /* bz13079: this should be set to desired value for ost but not for mds */ obd->obd_recovery_max_time = OBD_RECOVERY_MAX_TIME; + cfs_timer_init(&obd->obd_recovery_timer, target_recovery_expired, obd); + target_start_recovery_thread(obd, handler); } EXPORT_SYMBOL(target_recovery_init); @@ -1779,13 +1903,8 @@ int target_queue_recovery_request(struct ptlrpc_request *req, } spin_unlock_bh(&obd->obd_processing_task_lock); -
/* A resent, replayed request that is still on the queue; just drop it. - The queued request will handle this. */ - if ((lustre_msg_get_flags(req->rq_reqmsg) & (MSG_RESENT|MSG_REPLAY)) == - (MSG_RESENT | MSG_REPLAY)) { - DEBUG_REQ(D_ERROR, req, "dropping resent queued req"); + if (OBD_FAIL_CHECK(OBD_FAIL_TGT_REPLAY_DROP)) RETURN(0); - } req = ptlrpc_clone_req(req); if (req == NULL) @@ -1800,6 +1919,13 @@ int target_queue_recovery_request(struct ptlrpc_request *req, } LASSERT(req->rq_export->exp_req_replay_needed); + if (target_exp_enqueue_req_replay(req)) { + spin_unlock_bh(&obd->obd_processing_task_lock); + DEBUG_REQ(D_ERROR, req, "dropping resent queued req"); + ptlrpc_free_clone(req); + RETURN(0); + } + /* XXX O(n^2) */ list_for_each(tmp, &obd->obd_req_replay_queue) { struct ptlrpc_request *reqiter = @@ -1810,6 +1936,16 @@ int target_queue_recovery_request(struct ptlrpc_request *req, inserted = 1; break; } + + if (unlikely(lustre_msg_get_transno(reqiter->rq_reqmsg) == + transno)) { + DEBUG_REQ(D_ERROR, req, "dropping replay: transno " + "has been claimed by another client"); + spin_unlock_bh(&obd->obd_processing_task_lock); + target_exp_dequeue_req_replay(req); + ptlrpc_free_clone(req); + RETURN(0); + } } if (!inserted) @@ -1819,7 +1955,6 @@ int target_queue_recovery_request(struct ptlrpc_request *req, wake_up(&obd->obd_next_transno_waitq); spin_unlock_bh(&obd->obd_processing_task_lock); RETURN(0); - } struct obd_device * target_req2obd(struct ptlrpc_request *req) @@ -1880,7 +2015,7 @@ int target_send_reply_msg(struct ptlrpc_request *req, int rc, int fail_id) DEBUG_REQ(D_NET, req, "sending reply"); } - return (ptlrpc_send_reply(req, 1)); + return (ptlrpc_send_reply(req, PTLRPC_REPLY_MAYBE_DIFFICULT)); } void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id) @@ -1992,7 +2127,8 @@ void target_committed_to_req(struct ptlrpc_request *req) lustre_msg_set_last_committed(req->rq_repmsg, obd->obd_last_committed); else - DEBUG_REQ(D_IOCTL, req, "not 
sending last_committed update"); + DEBUG_REQ(D_IOCTL, req, "not sending last_committed update (%d/" + "%d)", obd->obd_no_transno, req->rq_repmsg == NULL); CDEBUG(D_INFO, "last_committed "LPU64", transno "LPU64", xid "LPU64"\n", obd->obd_last_committed, req->rq_transno, req->rq_xid); diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index 7b3b159..39622786 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -60,10 +60,10 @@ inline cfs_time_t round_timeout(cfs_time_t timeout) return cfs_time_seconds((int)cfs_duration_sec(cfs_time_sub(timeout, 0)) + 1); } -/* timeout for initial callback (AST) reply */ -static inline unsigned int ldlm_get_rq_timeout(unsigned int ldlm_timeout, - unsigned int obd_timeout) +/* timeout for initial callback (AST) reply (bz10399) */ +static inline unsigned int ldlm_get_rq_timeout(void) { + /* Non-AT value */ unsigned int timeout = min(ldlm_timeout, obd_timeout / 3); return timeout < 1 ? 1 : timeout; @@ -263,11 +263,11 @@ repeat: goto repeat; } - LDLM_ERROR(lock, "lock callback timer expired: evicting client " - "%s@%s nid %s\n", - lock->l_export->exp_client_uuid.uuid, - lock->l_export->exp_connection->c_remote_uuid.uuid, - libcfs_nid2str(lock->l_export->exp_connection->c_peer.nid)); + LDLM_ERROR(lock, "lock callback timer expired after %lds: " + "evicting client at %s ", + cfs_time_current_sec()- lock->l_enqueued_time.tv_sec, + libcfs_nid2str( + lock->l_export->exp_connection->c_peer.nid)); last = lock; @@ -307,21 +307,25 @@ repeat: */ static int __ldlm_add_waiting_lock(struct ldlm_lock *lock) { + int timeout; cfs_time_t timeout_rounded; if (!list_empty(&lock->l_pending_chain)) return 0; - lock->l_callback_timeout =cfs_time_add(cfs_time_current(), - cfs_time_seconds(obd_timeout)/2); + timeout = ldlm_get_enq_timeout(lock); + + lock->l_callback_timeout = cfs_time_shift(timeout); timeout_rounded = round_timeout(lock->l_callback_timeout); - if (cfs_time_before(timeout_rounded, 
cfs_timer_deadline(&waiting_locks_timer)) || + if (cfs_time_before(timeout_rounded, + cfs_timer_deadline(&waiting_locks_timer)) || !cfs_timer_is_armed(&waiting_locks_timer)) { cfs_timer_arm(&waiting_locks_timer, timeout_rounded); - } + /* if the new lock has a shorter timeout than something earlier on + the list, we'll wait the longer amount of time; no big deal. */ list_add_tail(&lock->l_pending_chain, &waiting_locks_list); /* FIFO */ return 1; } @@ -649,7 +653,9 @@ int ldlm_server_blocking_ast(struct ldlm_lock *lock, } req->rq_send_state = LUSTRE_IMP_FULL; - req->rq_timeout = ldlm_get_rq_timeout(ldlm_timeout, obd_timeout); + /* ptlrpc_prep_req already set timeout */ + if (AT_OFF) + req->rq_timeout = ldlm_get_rq_timeout(); if (lock->l_export && lock->l_export->exp_ldlm_stats) lprocfs_counter_incr(lock->l_export->exp_ldlm_stats, @@ -678,7 +684,8 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data) total_enqueue_wait = cfs_timeval_sub(&granted_time, &lock->l_enqueued_time, NULL); - if (total_enqueue_wait / 1000000 > obd_timeout) + if (total_enqueue_wait / ONE_MILLION > obd_timeout) + /* non-fatal with AT - change to LDLM_DEBUG? 
*/ LDLM_ERROR(lock, "enqueue wait took %luus from "CFS_TIME_T, total_enqueue_wait, lock->l_enqueued_time.tv_sec); @@ -720,9 +727,17 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data) LDLM_DEBUG(lock, "server preparing completion AST (after %ldus wait)", total_enqueue_wait); + /* Server-side enqueue wait time estimate, used in + __ldlm_add_waiting_lock to set future enqueue timers */ + at_add(&lock->l_resource->lr_namespace->ns_at_estimate, + total_enqueue_wait / ONE_MILLION); + ptlrpc_request_set_replen(req); + req->rq_send_state = LUSTRE_IMP_FULL; - req->rq_timeout = ldlm_get_rq_timeout(ldlm_timeout, obd_timeout); + /* ptlrpc_prep_req already set timeout */ + if (AT_OFF) + req->rq_timeout = ldlm_get_rq_timeout(); /* We only send real blocking ASTs after the lock is granted */ lock_res_and_lock(lock); @@ -786,7 +801,9 @@ int ldlm_server_glimpse_ast(struct ldlm_lock *lock, void *data) req->rq_send_state = LUSTRE_IMP_FULL; - req->rq_timeout = ldlm_get_rq_timeout(ldlm_timeout, obd_timeout); + /* ptlrpc_prep_req already set timeout */ + if (AT_OFF) + req->rq_timeout = ldlm_get_rq_timeout(); if (lock->l_export && lock->l_export->exp_ldlm_stats) lprocfs_counter_incr(lock->l_export->exp_ldlm_stats, @@ -1084,7 +1101,7 @@ existing_lock: EXIT; out: req->rq_status = err; - if (req->rq_reply_state == NULL) { + if (!req->rq_packed_final) { err = lustre_pack_reply(req, 1, NULL, NULL); if (rc == 0) rc = err; @@ -1216,7 +1233,7 @@ int ldlm_handle_convert(struct ptlrpc_request *req) return rc; } -/* Cancel all the locks, which handles are packed into ldlm_request */ +/* Cancel all the locks whose handles are packed into ldlm_request */ int ldlm_request_cancel(struct ptlrpc_request *req, const struct ldlm_request *dlm_req, int first) { @@ -1471,7 +1488,7 @@ static int ldlm_callback_reply(struct ptlrpc_request *req, int rc) return 0; req->rq_status = rc; - if (req->rq_reply_state == NULL) { + if (!req->rq_packed_final) { rc = lustre_pack_reply(req, 1, 
NULL, NULL); if (rc) return rc; @@ -2045,7 +2062,7 @@ static int ldlm_setup(void) ldlm_state->ldlm_cb_service = ptlrpc_init_svc(LDLM_NBUFS, LDLM_BUFSIZE, LDLM_MAXREQSIZE, LDLM_MAXREPSIZE, LDLM_CB_REQUEST_PORTAL, - LDLM_CB_REPLY_PORTAL, ldlm_timeout * 900, + LDLM_CB_REPLY_PORTAL, 1800, ldlm_callback_handler, "ldlm_cbd", ldlm_svc_proc_dir, NULL, ldlm_min_threads, ldlm_max_threads, @@ -2060,7 +2077,7 @@ static int ldlm_setup(void) ldlm_state->ldlm_cancel_service = ptlrpc_init_svc(LDLM_NBUFS, LDLM_BUFSIZE, LDLM_MAXREQSIZE, LDLM_MAXREPSIZE, LDLM_CANCEL_REQUEST_PORTAL, - LDLM_CANCEL_REPLY_PORTAL, ldlm_timeout * 6000, + LDLM_CANCEL_REPLY_PORTAL, 6000, ldlm_cancel_handler, "ldlm_canceld", ldlm_svc_proc_dir, NULL, ldlm_min_threads, ldlm_max_threads, diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c index f2049ec..4e68bee 100644 --- a/lustre/ldlm/ldlm_request.c +++ b/lustre/ldlm/ldlm_request.c @@ -34,6 +34,10 @@ #include "ldlm_internal.h" +int ldlm_enqueue_min = OBD_TIMEOUT_DEFAULT; +CFS_MODULE_PARM(ldlm_enqueue_min, "i", int, 0644, + "lock enqueue timeout minimum"); + static void interrupted_completion_wait(void *data) { } @@ -65,7 +69,8 @@ int ldlm_expired_completion_wait(void *data) CFS_DURATION_T"s ago); not entering recovery in " "server code, just going back to sleep", lock->l_enqueued_time.tv_sec, - cfs_time_current_sec() - lock->l_enqueued_time.tv_sec); + cfs_time_current_sec() - + lock->l_enqueued_time.tv_sec); if (cfs_time_after(cfs_time_current(), next_dump)) { last_dump = next_dump; next_dump = cfs_time_shift(300); @@ -89,6 +94,20 @@ int ldlm_expired_completion_wait(void *data) RETURN(0); } +/* We use the same basis for both server side and client side functions + from a single node. */ +int ldlm_get_enq_timeout(struct ldlm_lock *lock) +{ + int timeout = at_get(&lock->l_resource->lr_namespace->ns_at_estimate); + if (AT_OFF) + return obd_timeout / 2; + /* Since these are non-updating timeouts, we should be conservative. 
+ It would be nice to have some kind of "early reply" mechanism for + lock callbacks too... */ + timeout = timeout + (timeout >> 1); /* 150% */ + return max(timeout, ldlm_enqueue_min); +} + static int is_granted_or_cancelled(struct ldlm_lock *lock) { int ret = 0; @@ -110,6 +129,7 @@ int ldlm_completion_ast(struct ldlm_lock *lock, int flags, void *data) struct obd_device *obd; struct obd_import *imp = NULL; struct l_wait_info lwi; + __u32 timeout; int rc = 0; ENTRY; @@ -134,8 +154,14 @@ noreproc: obd = class_exp2obd(lock->l_conn_export); /* if this is a local lock, then there is no import */ - if (obd != NULL) + if (obd != NULL) { imp = obd->u.cli.cl_import; + } + + /* Wait a long time for enqueue - server may have to callback a + lock from another client. Server will evict the other client if it + doesn't respond reasonably, and then give us the lock. */ + timeout = ldlm_get_enq_timeout(lock) * 2; lwd.lwd_lock = lock; @@ -143,7 +169,7 @@ noreproc: LDLM_DEBUG(lock, "waiting indefinitely because of NO_TIMEOUT"); lwi = LWI_INTR(interrupted_completion_wait, &lwd); } else { - lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(obd_timeout), + lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(timeout), ldlm_expired_completion_wait, interrupted_completion_wait, &lwd); } @@ -168,7 +194,13 @@ noreproc: RETURN(rc); } - LDLM_DEBUG(lock, "client-side enqueue waking up: granted"); + LDLM_DEBUG(lock, "client-side enqueue waking up: granted after %lds", + cfs_time_current_sec() - lock->l_enqueued_time.tv_sec); + + /* Update our time estimate */ + at_add(&lock->l_resource->lr_namespace->ns_at_estimate, + cfs_time_current_sec() - lock->l_enqueued_time.tv_sec); + RETURN(0); } @@ -921,6 +953,8 @@ int ldlm_cli_cancel_req(struct obd_export *exp, struct list_head *cancels, LASSERT(exp != NULL); LASSERT(count > 0); + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_PAUSE_CANCEL, obd_fail_val); + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_RACE)) RETURN(count); @@ -955,9 +989,9 @@ int ldlm_cli_cancel_req(struct obd_export *exp, 
struct list_head *cancels, req->rq_no_resend = 1; req->rq_no_delay = 1; - /* XXX FIXME bug 249 */ req->rq_request_portal = LDLM_CANCEL_REQUEST_PORTAL; req->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL; + ptlrpc_at_set_req_timeout(req); ldlm_cancel_pack(req, cancels, count); diff --git a/lustre/ldlm/ldlm_resource.c b/lustre/ldlm/ldlm_resource.c index 84b72fd..c3a3aa1 100644 --- a/lustre/ldlm/ldlm_resource.c +++ b/lustre/ldlm/ldlm_resource.c @@ -367,6 +367,8 @@ struct ldlm_namespace *ldlm_namespace_new(struct obd_device *obd, char *name, GOTO(out_proc, rc); } + at_init(&ns->ns_at_estimate, ldlm_enqueue_min, 0); + ldlm_namespace_register(ns, client); RETURN(ns); out_proc: diff --git a/lustre/liblustre/llite_lib.c b/lustre/liblustre/llite_lib.c index 9aa5498..f023382 100644 --- a/lustre/liblustre/llite_lib.c +++ b/lustre/liblustre/llite_lib.c @@ -157,7 +157,8 @@ int liblustre_process_log(struct config_llog_instance *cfg, if (ocd == NULL) GOTO(out_cleanup, rc = -ENOMEM); - ocd->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_FID; + ocd->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_FID | + OBD_CONNECT_AT; #ifdef LIBLUSTRE_POSIX_ACL ocd->ocd_connect_flags |= OBD_CONNECT_ACL; #endif @@ -291,7 +292,7 @@ int _sysio_lustre_init(void) obd_timeout); } - /* debug peer on timeout? */ + /* debug peer on timeout? 
*/ envstr = getenv("LIBLUSTRE_DEBUG_PEER_ON_TIMEOUT"); if (envstr != NULL) { obd_debug_peer_on_timeout = diff --git a/lustre/liblustre/super.c b/lustre/liblustre/super.c index fae94f2..5789360 100644 --- a/lustre/liblustre/super.c +++ b/lustre/liblustre/super.c @@ -2076,7 +2076,7 @@ llu_fsswop_mount(const char *source, sizeof(async), &async, NULL); ocd.ocd_connect_flags = OBD_CONNECT_IBITS | OBD_CONNECT_VERSION | - OBD_CONNECT_FID; + OBD_CONNECT_FID | OBD_CONNECT_AT; #ifdef LIBLUSTRE_POSIX_ACL ocd.ocd_connect_flags |= OBD_CONNECT_ACL; #endif @@ -2113,7 +2113,7 @@ llu_fsswop_mount(const char *source, ocd.ocd_connect_flags = OBD_CONNECT_SRVLOCK | OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | OBD_CONNECT_TRUNCLOCK | - OBD_CONNECT_FID; + OBD_CONNECT_FID | OBD_CONNECT_AT; ocd.ocd_version = LUSTRE_VERSION_CODE; err = obd_connect(NULL, &dt_conn, obd, &sbi->ll_sb_uuid, &ocd, NULL); if (err) { diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index 6c346db..941a42a 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -206,7 +206,7 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt) OBD_CONNECT_JOIN | OBD_CONNECT_ATTRFID | OBD_CONNECT_VERSION | OBD_CONNECT_MDS_CAPA | OBD_CONNECT_OSS_CAPA | OBD_CONNECT_CANCELSET| - OBD_CONNECT_FID; + OBD_CONNECT_FID | OBD_CONNECT_AT; #ifdef HAVE_LRU_RESIZE_SUPPORT if (sbi->ll_flags & LL_SBI_LRU_RESIZE) @@ -371,7 +371,8 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt) data->ocd_connect_flags = OBD_CONNECT_GRANT | OBD_CONNECT_VERSION | OBD_CONNECT_REQPORTAL | OBD_CONNECT_BRW_SIZE | OBD_CONNECT_CANCELSET | OBD_CONNECT_FID | - OBD_CONNECT_SRVLOCK | OBD_CONNECT_TRUNCLOCK; + OBD_CONNECT_SRVLOCK | OBD_CONNECT_TRUNCLOCK| + OBD_CONNECT_AT; if (sbi->ll_flags & LL_SBI_OSS_CAPA) data->ocd_connect_flags |= OBD_CONNECT_OSS_CAPA; diff --git a/lustre/lvfs/lvfs_lib.c b/lustre/lvfs/lvfs_lib.c index 5725e25..ebe6c97 100644 --- a/lustre/lvfs/lvfs_lib.c +++ 
b/lustre/lvfs/lvfs_lib.c @@ -124,6 +124,24 @@ int __obd_fail_check_set(__u32 id, __u32 value, int set) } EXPORT_SYMBOL(__obd_fail_check_set); +int __obd_fail_timeout_set(__u32 id, __u32 value, int ms, int set) +{ + int ret = 0; + + ret = __obd_fail_check_set(id, value, set); + if (ret) { + CERROR("obd_fail_timeout id %x sleeping for %dms\n", + id, ms); + set_current_state(TASK_UNINTERRUPTIBLE); + cfs_schedule_timeout(CFS_TASK_UNINT, + cfs_time_seconds(ms) / 1000); + set_current_state(TASK_RUNNING); + CERROR("obd_fail_timeout id %x awake\n", id); + } + return ret; +} +EXPORT_SYMBOL(__obd_fail_timeout_set); + #ifdef LPROCFS void lprocfs_counter_add(struct lprocfs_stats *stats, int idx, long amount) diff --git a/lustre/mdc/lproc_mdc.c b/lustre/mdc/lproc_mdc.c index bea58ac..dafe84e 100644 --- a/lustre/mdc/lproc_mdc.c +++ b/lustre/mdc/lproc_mdc.c @@ -74,11 +74,12 @@ static struct lprocfs_vars lprocfs_mdc_obd_vars[] = { { "kbytesavail", lprocfs_rd_kbytesavail, 0, 0 }, { "filestotal", lprocfs_rd_filestotal, 0, 0 }, { "filesfree", lprocfs_rd_filesfree, 0, 0 }, - //{ "filegroups", lprocfs_rd_filegroups, 0, 0 }, + /*{ "filegroups", lprocfs_rd_filegroups, 0, 0 },*/ { "mds_server_uuid", lprocfs_rd_server_uuid, 0, 0 }, { "mds_conn_uuid", lprocfs_rd_conn_uuid, 0, 0 }, { "max_rpcs_in_flight", mdc_rd_max_rpcs_in_flight, mdc_wr_max_rpcs_in_flight, 0 }, + { "timeouts", lprocfs_rd_timeouts, 0, 0 }, { 0 } }; diff --git a/lustre/mdc/mdc_reint.c b/lustre/mdc/mdc_reint.c index 5307cb11..399ddd4 100644 --- a/lustre/mdc/mdc_reint.c +++ b/lustre/mdc/mdc_reint.c @@ -143,7 +143,8 @@ int mdc_setattr(struct obd_export *exp, struct md_op_data *op_data, } if (op_data->op_attr.ia_valid & ATTR_FROM_OPEN) { - req->rq_request_portal = MDS_SETATTR_PORTAL; //XXX FIXME bug 249 + req->rq_request_portal = MDS_SETATTR_PORTAL; + ptlrpc_at_set_req_timeout(req); rpc_lock = obd->u.cli.cl_setattr_lock; } else { rpc_lock = obd->u.cli.cl_rpc_lock; diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c 
index 14ddcc9..2e3e349 100644 --- a/lustre/mdc/mdc_request.c +++ b/lustre/mdc/mdc_request.c @@ -810,8 +810,8 @@ int mdc_close(struct obd_export *exp, struct md_op_data *op_data, /* To avoid a livelock (bug 7034), we need to send CLOSE RPCs to a * portal whose threads are not taking any DLM locks and are therefore * always progressing */ - /* XXX FIXME bug 249 */ req->rq_request_portal = MDS_READPAGE_PORTAL; + ptlrpc_at_set_req_timeout(req); /* Ensure that this close's handle is fixed up during replay. */ if (likely(mod != NULL)) @@ -942,6 +942,7 @@ int mdc_sendpage(struct obd_export *exp, const struct lu_fid *fid, } req->rq_request_portal = MDS_READPAGE_PORTAL; + ptlrpc_at_set_req_timeout(req); desc = ptlrpc_prep_bulk_imp(req, 1, BULK_GET_SOURCE, MDS_BULK_PORTAL); if (desc == NULL) @@ -983,8 +984,9 @@ int mdc_readpage(struct obd_export *exp, const struct lu_fid *fid, RETURN(rc); } - /* XXX FIXME bug 249 */ req->rq_request_portal = MDS_READPAGE_PORTAL; + ptlrpc_at_set_req_timeout(req); + desc = ptlrpc_prep_bulk_imp(req, 1, BULK_PUT_SINK, MDS_BULK_PORTAL); if (desc == NULL) { ptlrpc_request_free(req); diff --git a/lustre/mds/mds_lov.c b/lustre/mds/mds_lov.c index e5a074b..506b648 100644 --- a/lustre/mds/mds_lov.c +++ b/lustre/mds/mds_lov.c @@ -476,9 +476,10 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name) OBD_ALLOC(data, sizeof(*data)); if (data == NULL) RETURN(-ENOMEM); - data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_INDEX | + data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_INDEX | OBD_CONNECT_REQPORTAL | OBD_CONNECT_QUOTA64 | - OBD_CONNECT_OSS_CAPA | OBD_CONNECT_FID; + OBD_CONNECT_OSS_CAPA | OBD_CONNECT_FID | + OBD_CONNECT_AT; #ifdef HAVE_LRU_RESIZE_SUPPORT data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE; #endif diff --git a/lustre/mdt/mdt_handler.c b/lustre/mdt/mdt_handler.c index 686addd..8bc9c12 100644 --- a/lustre/mdt/mdt_handler.c +++ b/lustre/mdt/mdt_handler.c @@ -319,15 +319,19 @@ static int mdt_getstatus(struct 
mdt_thread_info *info) static int mdt_statfs(struct mdt_thread_info *info) { - struct md_device *next = info->mti_mdt->mdt_child; - struct obd_statfs *osfs; - int rc; + struct md_device *next = info->mti_mdt->mdt_child; + struct ptlrpc_service *svc; + struct obd_statfs *osfs; + int rc; ENTRY; + svc = info->mti_pill->rc_req->rq_rqbd->rqbd_service; + /* This will trigger a watchdog timeout */ OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_STATFS_LCW_SLEEP, - (MDT_SERVICE_WATCHDOG_TIMEOUT / 1000) + 1); + (MDT_SERVICE_WATCHDOG_FACTOR * + at_get(&svc->srv_at_estimate) / 1000) + 1); rc = mdt_check_ucred(info); if (rc) @@ -1123,6 +1127,7 @@ static int mdt_sendpage(struct mdt_thread_info *info, struct l_wait_info *lwi = &info->mti_u.rdpg.mti_wait_info; int tmpcount; int tmpsize; + int timeout; int i; int rc; ENTRY; @@ -1146,7 +1151,11 @@ static int mdt_sendpage(struct mdt_thread_info *info, if (OBD_FAIL_CHECK(OBD_FAIL_MDS_SENDPAGE)) GOTO(abort_bulk, rc = 0); - *lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, NULL, NULL); + timeout = (int) req->rq_deadline - cfs_time_current_sec(); + if (timeout < 0) + CERROR("Req deadline already passed %lu (now: %lu)\n", + req->rq_deadline, cfs_time_current_sec()); + *lwi = LWI_TIMEOUT(max(timeout, 1) * HZ, NULL, NULL); rc = l_wait_event(desc->bd_waitq, !ptlrpc_bulk_active(desc), lwi); LASSERT (rc == 0 || rc == -ETIMEDOUT); @@ -1707,6 +1716,8 @@ static int mdt_sec_ctx_handle(struct mdt_thread_info *info) sptlrpc_svc_ctx_invalidate(req); } + OBD_FAIL_TIMEOUT(OBD_FAIL_SEC_CTX_HDL_PAUSE, obd_fail_val); + return rc; } @@ -3337,21 +3348,21 @@ static int mdt_start_ptlrpc_service(struct mdt_device *m) procfs_entry = m->mdt_md_dev.md_lu_dev.ld_obd->obd_proc_entry; conf = (typeof(conf)) { - .psc_nbufs = MDS_NBUFS, - .psc_bufsize = MDS_BUFSIZE, - .psc_max_req_size = MDS_MAXREQSIZE, - .psc_max_reply_size = MDS_MAXREPSIZE, - .psc_req_portal = MDS_REQUEST_PORTAL, - .psc_rep_portal = MDC_REPLY_PORTAL, - .psc_watchdog_timeout = MDT_SERVICE_WATCHDOG_TIMEOUT, + .psc_nbufs = 
MDS_NBUFS, + .psc_bufsize = MDS_BUFSIZE, + .psc_max_req_size = MDS_MAXREQSIZE, + .psc_max_reply_size = MDS_MAXREPSIZE, + .psc_req_portal = MDS_REQUEST_PORTAL, + .psc_rep_portal = MDC_REPLY_PORTAL, + .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR, /* * We'd like to have a mechanism to set this on a per-device * basis, but alas... */ - .psc_min_threads = min(max(mdt_num_threads, MDT_MIN_THREADS), - MDT_MAX_THREADS), - .psc_max_threads = MDT_MAX_THREADS, - .psc_ctx_tags = LCT_MD_THREAD + .psc_min_threads = min(max(mdt_num_threads, MDT_MIN_THREADS), + MDT_MAX_THREADS), + .psc_max_threads = MDT_MAX_THREADS, + .psc_ctx_tags = LCT_MD_THREAD }; m->mdt_ldlm_client = &m->mdt_md_dev.md_lu_dev.ld_obd->obd_ldlm_client; @@ -3360,7 +3371,8 @@ static int mdt_start_ptlrpc_service(struct mdt_device *m) m->mdt_regular_service = ptlrpc_init_svc_conf(&conf, mdt_regular_handle, LUSTRE_MDT_NAME, - procfs_entry, NULL, LUSTRE_MDT_NAME); + procfs_entry, target_print_req, + LUSTRE_MDT_NAME); if (m->mdt_regular_service == NULL) RETURN(-ENOMEM); @@ -3373,22 +3385,22 @@ static int mdt_start_ptlrpc_service(struct mdt_device *m) * ideally. 
*/ conf = (typeof(conf)) { - .psc_nbufs = MDS_NBUFS, - .psc_bufsize = MDS_BUFSIZE, - .psc_max_req_size = MDS_MAXREQSIZE, - .psc_max_reply_size = MDS_MAXREPSIZE, - .psc_req_portal = MDS_READPAGE_PORTAL, - .psc_rep_portal = MDC_REPLY_PORTAL, - .psc_watchdog_timeout = MDT_SERVICE_WATCHDOG_TIMEOUT, - .psc_min_threads = min(max(mdt_num_threads, MDT_MIN_THREADS), - MDT_MAX_THREADS), - .psc_max_threads = MDT_MAX_THREADS, - .psc_ctx_tags = LCT_MD_THREAD + .psc_nbufs = MDS_NBUFS, + .psc_bufsize = MDS_BUFSIZE, + .psc_max_req_size = MDS_MAXREQSIZE, + .psc_max_reply_size = MDS_MAXREPSIZE, + .psc_req_portal = MDS_READPAGE_PORTAL, + .psc_rep_portal = MDC_REPLY_PORTAL, + .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR, + .psc_min_threads = min(max(mdt_num_threads, MDT_MIN_THREADS), + MDT_MAX_THREADS), + .psc_max_threads = MDT_MAX_THREADS, + .psc_ctx_tags = LCT_MD_THREAD }; m->mdt_readpage_service = ptlrpc_init_svc_conf(&conf, mdt_readpage_handle, LUSTRE_MDT_NAME "_readpage", - procfs_entry, NULL, "mdt_rdpg"); + procfs_entry, target_print_req,"mdt_rdpg"); if (m->mdt_readpage_service == NULL) { CERROR("failed to start readpage service\n"); @@ -3401,23 +3413,23 @@ static int mdt_start_ptlrpc_service(struct mdt_device *m) * setattr service configuration. 
*/ conf = (typeof(conf)) { - .psc_nbufs = MDS_NBUFS, - .psc_bufsize = MDS_BUFSIZE, - .psc_max_req_size = MDS_MAXREQSIZE, - .psc_max_reply_size = MDS_MAXREPSIZE, - .psc_req_portal = MDS_SETATTR_PORTAL, - .psc_rep_portal = MDC_REPLY_PORTAL, - .psc_watchdog_timeout = MDT_SERVICE_WATCHDOG_TIMEOUT, + .psc_nbufs = MDS_NBUFS, + .psc_bufsize = MDS_BUFSIZE, + .psc_max_req_size = MDS_MAXREQSIZE, + .psc_max_reply_size = MDS_MAXREPSIZE, + .psc_req_portal = MDS_SETATTR_PORTAL, + .psc_rep_portal = MDC_REPLY_PORTAL, + .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR, .psc_min_threads = min(max(mdt_num_threads, MDT_MIN_THREADS), - MDT_MAX_THREADS), - .psc_max_threads = MDT_MAX_THREADS, - .psc_ctx_tags = LCT_MD_THREAD + MDT_MAX_THREADS), + .psc_max_threads = MDT_MAX_THREADS, + .psc_ctx_tags = LCT_MD_THREAD }; m->mdt_setattr_service = ptlrpc_init_svc_conf(&conf, mdt_regular_handle, LUSTRE_MDT_NAME "_setattr", - procfs_entry, NULL, "mdt_attr"); + procfs_entry, target_print_req,"mdt_attr"); if (!m->mdt_setattr_service) { CERROR("failed to start setattr service\n"); @@ -3432,22 +3444,22 @@ static int mdt_start_ptlrpc_service(struct mdt_device *m) * sequence controller service configuration */ conf = (typeof(conf)) { - .psc_nbufs = MDS_NBUFS, - .psc_bufsize = MDS_BUFSIZE, - .psc_max_req_size = SEQ_MAXREQSIZE, - .psc_max_reply_size = SEQ_MAXREPSIZE, - .psc_req_portal = SEQ_CONTROLLER_PORTAL, - .psc_rep_portal = MDC_REPLY_PORTAL, - .psc_watchdog_timeout = MDT_SERVICE_WATCHDOG_TIMEOUT, - .psc_min_threads = SEQ_NUM_THREADS, - .psc_max_threads = SEQ_NUM_THREADS, - .psc_ctx_tags = LCT_MD_THREAD|LCT_DT_THREAD + .psc_nbufs = MDS_NBUFS, + .psc_bufsize = MDS_BUFSIZE, + .psc_max_req_size = SEQ_MAXREQSIZE, + .psc_max_reply_size = SEQ_MAXREPSIZE, + .psc_req_portal = SEQ_CONTROLLER_PORTAL, + .psc_rep_portal = MDC_REPLY_PORTAL, + .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR, + .psc_min_threads = SEQ_NUM_THREADS, + .psc_max_threads = SEQ_NUM_THREADS, + .psc_ctx_tags = 
LCT_MD_THREAD|LCT_DT_THREAD }; m->mdt_mdsc_service = ptlrpc_init_svc_conf(&conf, mdt_mdsc_handle, LUSTRE_MDT_NAME"_mdsc", - procfs_entry, NULL, "mdt_mdsc"); + procfs_entry, target_print_req,"mdt_mdsc"); if (!m->mdt_mdsc_service) { CERROR("failed to start seq controller service\n"); GOTO(err_mdt_svc, rc = -ENOMEM); @@ -3461,22 +3473,22 @@ static int mdt_start_ptlrpc_service(struct mdt_device *m) * metadata sequence server service configuration */ conf = (typeof(conf)) { - .psc_nbufs = MDS_NBUFS, - .psc_bufsize = MDS_BUFSIZE, - .psc_max_req_size = SEQ_MAXREQSIZE, - .psc_max_reply_size = SEQ_MAXREPSIZE, - .psc_req_portal = SEQ_METADATA_PORTAL, - .psc_rep_portal = MDC_REPLY_PORTAL, - .psc_watchdog_timeout = MDT_SERVICE_WATCHDOG_TIMEOUT, - .psc_min_threads = SEQ_NUM_THREADS, - .psc_max_threads = SEQ_NUM_THREADS, - .psc_ctx_tags = LCT_MD_THREAD|LCT_DT_THREAD + .psc_nbufs = MDS_NBUFS, + .psc_bufsize = MDS_BUFSIZE, + .psc_max_req_size = SEQ_MAXREQSIZE, + .psc_max_reply_size = SEQ_MAXREPSIZE, + .psc_req_portal = SEQ_METADATA_PORTAL, + .psc_rep_portal = MDC_REPLY_PORTAL, + .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR, + .psc_min_threads = SEQ_NUM_THREADS, + .psc_max_threads = SEQ_NUM_THREADS, + .psc_ctx_tags = LCT_MD_THREAD|LCT_DT_THREAD }; m->mdt_mdss_service = ptlrpc_init_svc_conf(&conf, mdt_mdss_handle, LUSTRE_MDT_NAME"_mdss", - procfs_entry, NULL, "mdt_mdss"); + procfs_entry, target_print_req,"mdt_mdss"); if (!m->mdt_mdss_service) { CERROR("failed to start metadata seq server service\n"); GOTO(err_mdt_svc, rc = -ENOMEM); @@ -3493,22 +3505,22 @@ static int mdt_start_ptlrpc_service(struct mdt_device *m) * controller which manages space. 
*/ conf = (typeof(conf)) { - .psc_nbufs = MDS_NBUFS, - .psc_bufsize = MDS_BUFSIZE, - .psc_max_req_size = SEQ_MAXREQSIZE, - .psc_max_reply_size = SEQ_MAXREPSIZE, - .psc_req_portal = SEQ_DATA_PORTAL, - .psc_rep_portal = OSC_REPLY_PORTAL, - .psc_watchdog_timeout = MDT_SERVICE_WATCHDOG_TIMEOUT, - .psc_min_threads = SEQ_NUM_THREADS, - .psc_max_threads = SEQ_NUM_THREADS, - .psc_ctx_tags = LCT_MD_THREAD|LCT_DT_THREAD + .psc_nbufs = MDS_NBUFS, + .psc_bufsize = MDS_BUFSIZE, + .psc_max_req_size = SEQ_MAXREQSIZE, + .psc_max_reply_size = SEQ_MAXREPSIZE, + .psc_req_portal = SEQ_DATA_PORTAL, + .psc_rep_portal = OSC_REPLY_PORTAL, + .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR, + .psc_min_threads = SEQ_NUM_THREADS, + .psc_max_threads = SEQ_NUM_THREADS, + .psc_ctx_tags = LCT_MD_THREAD|LCT_DT_THREAD }; m->mdt_dtss_service = ptlrpc_init_svc_conf(&conf, mdt_dtss_handle, LUSTRE_MDT_NAME"_dtss", - procfs_entry, NULL, "mdt_dtss"); + procfs_entry, target_print_req,"mdt_dtss"); if (!m->mdt_dtss_service) { CERROR("failed to start data seq server service\n"); GOTO(err_mdt_svc, rc = -ENOMEM); @@ -3520,22 +3532,22 @@ static int mdt_start_ptlrpc_service(struct mdt_device *m) /* FLD service start */ conf = (typeof(conf)) { - .psc_nbufs = MDS_NBUFS, - .psc_bufsize = MDS_BUFSIZE, - .psc_max_req_size = FLD_MAXREQSIZE, - .psc_max_reply_size = FLD_MAXREPSIZE, - .psc_req_portal = FLD_REQUEST_PORTAL, - .psc_rep_portal = MDC_REPLY_PORTAL, - .psc_watchdog_timeout = MDT_SERVICE_WATCHDOG_TIMEOUT, - .psc_min_threads = FLD_NUM_THREADS, - .psc_max_threads = FLD_NUM_THREADS, - .psc_ctx_tags = LCT_DT_THREAD|LCT_MD_THREAD + .psc_nbufs = MDS_NBUFS, + .psc_bufsize = MDS_BUFSIZE, + .psc_max_req_size = FLD_MAXREQSIZE, + .psc_max_reply_size = FLD_MAXREPSIZE, + .psc_req_portal = FLD_REQUEST_PORTAL, + .psc_rep_portal = MDC_REPLY_PORTAL, + .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR, + .psc_min_threads = FLD_NUM_THREADS, + .psc_max_threads = FLD_NUM_THREADS, + .psc_ctx_tags = LCT_DT_THREAD|LCT_MD_THREAD }; 
m->mdt_fld_service = ptlrpc_init_svc_conf(&conf, mdt_fld_handle, LUSTRE_MDT_NAME"_fld", - procfs_entry, NULL, "mdt_fld"); + procfs_entry, target_print_req, "mdt_fld"); if (!m->mdt_fld_service) { CERROR("failed to start fld service\n"); GOTO(err_mdt_svc, rc = -ENOMEM); @@ -3550,21 +3562,22 @@ static int mdt_start_ptlrpc_service(struct mdt_device *m) * mds-mds requests be not blocked during recovery. */ conf = (typeof(conf)) { - .psc_nbufs = MDS_NBUFS, - .psc_bufsize = MDS_BUFSIZE, - .psc_max_req_size = MDS_MAXREQSIZE, - .psc_max_reply_size = MDS_MAXREPSIZE, - .psc_req_portal = MDS_MDS_PORTAL, - .psc_rep_portal = MDC_REPLY_PORTAL, - .psc_watchdog_timeout = MDT_SERVICE_WATCHDOG_TIMEOUT, - .psc_min_threads = min(max(mdt_num_threads, MDT_MIN_THREADS), - MDT_MAX_THREADS), - .psc_max_threads = MDT_MAX_THREADS, - .psc_ctx_tags = LCT_MD_THREAD + .psc_nbufs = MDS_NBUFS, + .psc_bufsize = MDS_BUFSIZE, + .psc_max_req_size = MDS_MAXREQSIZE, + .psc_max_reply_size = MDS_MAXREPSIZE, + .psc_req_portal = MDS_MDS_PORTAL, + .psc_rep_portal = MDC_REPLY_PORTAL, + .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR, + .psc_min_threads = min(max(mdt_num_threads, MDT_MIN_THREADS), + MDT_MAX_THREADS), + .psc_max_threads = MDT_MAX_THREADS, + .psc_ctx_tags = LCT_MD_THREAD }; - m->mdt_xmds_service = ptlrpc_init_svc_conf(&conf, mdt_xmds_handle, - LUSTRE_MDT_NAME "_mds", - procfs_entry, NULL, "mdt_xmds"); + m->mdt_xmds_service = + ptlrpc_init_svc_conf(&conf, mdt_xmds_handle, + LUSTRE_MDT_NAME "_mds", + procfs_entry, target_print_req,"mdt_xmds"); if (m->mdt_xmds_service == NULL) { CERROR("failed to start readpage service\n"); diff --git a/lustre/mdt/mdt_internal.h b/lustre/mdt/mdt_internal.h index b0d0cf3..2d07448b 100644 --- a/lustre/mdt/mdt_internal.h +++ b/lustre/mdt/mdt_internal.h @@ -163,8 +163,7 @@ struct mdt_device { struct lprocfs_stats *mdt_stats; }; -/*XXX copied from mds_internal.h */ -#define MDT_SERVICE_WATCHDOG_TIMEOUT (obd_timeout * 1000) +#define MDT_SERVICE_WATCHDOG_FACTOR (2000) 
#define MDT_ROCOMPAT_SUPP (OBD_ROCOMPAT_LOVOBJID) #define MDT_INCOMPAT_SUPP (OBD_INCOMPAT_MDT | OBD_INCOMPAT_COMMON_LR) diff --git a/lustre/mgc/mgc_request.c b/lustre/mgc/mgc_request.c index 1f0411f..e7e05b8 100644 --- a/lustre/mgc/mgc_request.c +++ b/lustre/mgc/mgc_request.c @@ -1081,7 +1081,7 @@ static int mgc_process_log(struct obd_device *mgc, if (cld->cld_stopping) RETURN(0); - OBD_FAIL_TIMEOUT(OBD_FAIL_MGC_PROCESS_LOG, 20); + OBD_FAIL_TIMEOUT(OBD_FAIL_MGC_PAUSE_PROCESS_LOG, 20); lsi = s2lsi(cld->cld_cfg.cfg_sb); diff --git a/lustre/mgs/mgs_handler.c b/lustre/mgs/mgs_handler.c index 3c3c186..79d6f0f 100644 --- a/lustre/mgs/mgs_handler.c +++ b/lustre/mgs/mgs_handler.c @@ -205,9 +205,9 @@ static int mgs_setup(struct obd_device *obd, struct lustre_cfg *lcfg) mgs->mgs_service = ptlrpc_init_svc(MGS_NBUFS, MGS_BUFSIZE, MGS_MAXREQSIZE, MGS_MAXREPSIZE, MGS_REQUEST_PORTAL, - MGC_REPLY_PORTAL, MGS_SERVICE_WATCHDOG_TIMEOUT, + MGC_REPLY_PORTAL, 2000, mgs_handle, LUSTRE_MGS_NAME, - obd->obd_proc_entry, NULL, + obd->obd_proc_entry, target_print_req, MGS_THREADS_AUTO_MIN, MGS_THREADS_AUTO_MAX, "ll_mgs", LCT_MD_THREAD); @@ -397,7 +397,7 @@ static int mgs_handle_target_reg(struct ptlrpc_request *req) obd->obd_name, lockrc); } - OBD_FAIL_TIMEOUT(OBD_FAIL_MGS_SLOW_TARGET_REG, 10); + OBD_FAIL_TIMEOUT(OBD_FAIL_MGS_PAUSE_TARGET_REG, 10); /* Log writing contention is handled by the fsdb_sem */ @@ -546,7 +546,9 @@ int mgs_handle(struct ptlrpc_request *req) ENTRY; req_capsule_init(&req->rq_pill, req, RCL_SERVER); - OBD_FAIL_TIMEOUT(OBD_FAIL_MGS_SLOW_REQUEST_NET, 2); + OBD_FAIL_TIMEOUT_MS(OBD_FAIL_MGS_PAUSE_REQ, obd_fail_val); + if (OBD_FAIL_CHECK(OBD_FAIL_MGS_ALL_REQUEST_NET)) + RETURN(0); LASSERT(current->journal_info == NULL); opc = lustre_msg_get_opc(req->rq_reqmsg); diff --git a/lustre/mgs/mgs_internal.h b/lustre/mgs/mgs_internal.h index e0833f9..1c31c24 100644 --- a/lustre/mgs/mgs_internal.h +++ b/lustre/mgs/mgs_internal.h @@ -15,9 +15,6 @@ #include #include -/* in ms */ -#define 
MGS_SERVICE_WATCHDOG_TIMEOUT (obd_timeout * 1000) - /* mgs_llog.c */ int class_dentry_readdir(struct obd_device *obd, struct dentry *dir, struct vfsmount *inmnt, diff --git a/lustre/obdclass/class_obd.c b/lustre/obdclass/class_obd.c index 9de3728..62ac5d3 100644 --- a/lustre/obdclass/class_obd.c +++ b/lustre/obdclass/class_obd.c @@ -65,9 +65,8 @@ __u64 obd_pages; unsigned int obd_debug_peer_on_timeout; unsigned int obd_dump_on_timeout; unsigned int obd_dump_on_eviction; -unsigned int obd_timeout = OBD_TIMEOUT_DEFAULT; /* seconds */ +unsigned int obd_timeout = OBD_TIMEOUT_DEFAULT; /* seconds */ unsigned int ldlm_timeout = LDLM_TIMEOUT_DEFAULT; /* seconds */ -unsigned int obd_health_check_timeout = HEALTH_CHECK_TIMEOUT_DEFAULT; /* seconds */ unsigned int obd_max_dirty_pages = 256; atomic_t obd_dirty_pages; @@ -387,7 +386,6 @@ EXPORT_SYMBOL(obd_dump_on_timeout); EXPORT_SYMBOL(obd_dump_on_eviction); EXPORT_SYMBOL(obd_timeout); EXPORT_SYMBOL(ldlm_timeout); -EXPORT_SYMBOL(obd_health_check_timeout); EXPORT_SYMBOL(obd_max_dirty_pages); EXPORT_SYMBOL(obd_dirty_pages); EXPORT_SYMBOL(ptlrpc_put_connection_superhack); diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c index 9e4a7f5..a630808 100644 --- a/lustre/obdclass/genops.c +++ b/lustre/obdclass/genops.c @@ -680,6 +680,7 @@ void class_export_destroy(struct obd_export *exp) ptlrpc_put_connection_superhack(exp->exp_connection); LASSERT(list_empty(&exp->exp_outstanding_replies)); + LASSERT(list_empty(&exp->exp_req_replay_queue)); obd_destroy_export(exp); OBD_FREE_RCU(exp, sizeof(*exp), &exp->exp_handle); @@ -705,6 +706,7 @@ struct obd_export *class_new_export(struct obd_device *obd, atomic_set(&export->exp_rpc_count, 0); export->exp_obd = obd; CFS_INIT_LIST_HEAD(&export->exp_outstanding_replies); + CFS_INIT_LIST_HEAD(&export->exp_req_replay_queue); /* XXX this should be in LDLM init */ CFS_INIT_LIST_HEAD(&export->exp_ldlm_data.led_held_locks); spin_lock_init(&export->exp_ldlm_data.led_lock); @@ -837,6 +839,18 @@ 
void class_import_destroy(struct obd_import *import) EXIT; } +static void init_imp_at(struct imp_at *at) { + int i; + at_init(&at->iat_net_latency, 0, 0); + for (i = 0; i < IMP_AT_MAX_PORTALS; i++) { + /* max service estimates are tracked on the server side, so + don't use the AT history here, just use the last reported + val. (But keep hist for proc histogram, worst_ever) */ + at_init(&at->iat_service_estimate[i], INITIAL_CONNECT_TIMEOUT, + AT_FLG_NOHIST); + } +} + struct obd_import *class_new_import(struct obd_device *obd) { struct obd_import *imp; @@ -863,6 +877,7 @@ struct obd_import *class_new_import(struct obd_device *obd) CFS_INIT_LIST_HEAD(&imp->imp_conn_list); CFS_INIT_LIST_HEAD(&imp->imp_handle.h_link); class_handle_hash(&imp->imp_handle, import_handle_addref); + init_imp_at(&imp->imp_at); /* the default magic is V2, will be used in connect RPC, and * then adjusted according to the flags in request/reply. */ @@ -1023,10 +1038,12 @@ static void class_disconnect_export_list(struct list_head *list, int flags) fake_exp->exp_flags = flags; spin_unlock(&fake_exp->exp_lock); + CDEBUG(D_HA, "%s: disconnecting export at %s (%p), " + "last request at %ld\n", + exp->exp_obd->obd_name, obd_export_nid2str(exp), + exp, exp->exp_last_request_time); rc = obd_disconnect(fake_exp); class_export_put(exp); - CDEBUG(D_HA, "disconnecting export %s (%p): rc %d\n", - exp->exp_client_uuid.uuid, exp, rc); } EXIT; } diff --git a/lustre/obdclass/linux/linux-module.c b/lustre/obdclass/linux/linux-module.c index 2215132..df8a15d 100644 --- a/lustre/obdclass/linux/linux-module.c +++ b/lustre/obdclass/linux/linux-module.c @@ -286,27 +286,6 @@ static int obd_proc_read_health(char *page, char **start, off_t off, return rc; } -static int obd_proc_rd_health_timeout(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - *eof = 1; - return snprintf(page, count, "%d\n", obd_health_check_timeout); -} - -static int obd_proc_wr_health_timeout(struct file *file, const char 
*buffer, - unsigned long count, void *data) -{ - int val, rc; - - rc = lprocfs_write_helper(buffer, count, &val); - if (rc) - return rc; - - obd_health_check_timeout = val; - - return count; -} - /* Root for /proc/fs/lustre */ struct proc_dir_entry *proc_lustre_root = NULL; @@ -314,8 +293,6 @@ struct lprocfs_vars lprocfs_base[] = { { "version", obd_proc_read_version, NULL, NULL }, { "pinger", obd_proc_read_pinger, NULL, NULL }, { "health_check", obd_proc_read_health, NULL, NULL }, - { "health_check_timeout", obd_proc_rd_health_timeout, - obd_proc_wr_health_timeout, NULL }, { 0 } }; #else diff --git a/lustre/obdclass/lprocfs_status.c b/lustre/obdclass/lprocfs_status.c index c9fa3db..d34eba0 100644 --- a/lustre/obdclass/lprocfs_status.c +++ b/lustre/obdclass/lprocfs_status.c @@ -600,6 +600,70 @@ int lprocfs_rd_conn_uuid(char *page, char **start, off_t off, int count, return rc; } +int lprocfs_at_hist_helper(char *page, int count, int rc, + struct adaptive_timeout *at) +{ + int i; + for (i = 0; i < AT_BINS; i++) + rc += snprintf(page + rc, count - rc, "%3u ", at->at_hist[i]); + rc += snprintf(page + rc, count - rc, "\n"); + return rc; +} + +/* See also ptlrpc_lprocfs_rd_timeouts */ +int lprocfs_rd_timeouts(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + struct obd_import *imp; + unsigned int cur, worst; + time_t now, worstt; + struct dhms ts; + int i, rc = 0; + + LASSERT(obd != NULL); + LPROCFS_CLIMP_CHECK(obd); + imp = obd->u.cli.cl_import; + *eof = 1; + + now = cfs_time_current_sec(); + + /* Some network health info for kicks */ + s2dhms(&ts, now - imp->imp_last_reply_time); + rc += snprintf(page + rc, count - rc, + "%-10s : %ld, "DHMS_FMT" ago\n", + "last reply", imp->imp_last_reply_time, DHMS_VARS(&ts)); + + + cur = at_get(&imp->imp_at.iat_net_latency); + worst = imp->imp_at.iat_net_latency.at_worst_ever; + worstt = imp->imp_at.iat_net_latency.at_worst_time; + s2dhms(&ts, now - 
worstt); + rc += snprintf(page + rc, count - rc, + "%-10s : cur %3u worst %3u (at %ld, "DHMS_FMT" ago) ", + "network", cur, worst, worstt, DHMS_VARS(&ts)); + rc = lprocfs_at_hist_helper(page, count, rc, + &imp->imp_at.iat_net_latency); + + for(i = 0; i < IMP_AT_MAX_PORTALS; i++) { + if (imp->imp_at.iat_portal[i] == 0) + break; + cur = at_get(&imp->imp_at.iat_service_estimate[i]); + worst = imp->imp_at.iat_service_estimate[i].at_worst_ever; + worstt = imp->imp_at.iat_service_estimate[i].at_worst_time; + s2dhms(&ts, now - worstt); + rc += snprintf(page + rc, count - rc, + "portal %-2d : cur %3u worst %3u (at %ld, " + DHMS_FMT" ago) ", imp->imp_at.iat_portal[i], + cur, worst, worstt, DHMS_VARS(&ts)); + rc = lprocfs_at_hist_helper(page, count, rc, + &imp->imp_at.iat_service_estimate[i]); + } + + LPROCFS_CLIMP_EXIT(obd); + return rc; +} + static const char *obd_connect_names[] = { "read_only", "lov_index", @@ -1692,7 +1756,6 @@ int lprocfs_obd_rd_recovery_status(char *page, char **start, off_t off, if (lprocfs_obd_snprintf(&page, size, &len, "status: ") <= 0) goto out; - if (obd->obd_max_recoverable_clients == 0) { if (lprocfs_obd_snprintf(&page, size, &len, "INACTIVE\n") <= 0) goto out; @@ -1704,59 +1767,55 @@ int lprocfs_obd_rd_recovery_status(char *page, char **start, off_t off, if (obd->obd_recovering == 0) { if (lprocfs_obd_snprintf(&page, size, &len, "COMPLETE\n") <= 0) goto out; - - if (lprocfs_obd_snprintf(&page, size, &len, "recovery_start: %lu\n", - obd->obd_recovery_start) <= 0) + if (lprocfs_obd_snprintf(&page, size, &len, + "recovery_start: %lu\n", + obd->obd_recovery_start) <= 0) goto out; - - if (lprocfs_obd_snprintf(&page, size, &len, "recovery_end: %lu\n", - obd->obd_recovery_end) <= 0) + if (lprocfs_obd_snprintf(&page, size, &len, + "recovery_duration: %lu\n", + obd->obd_recovery_end - + obd->obd_recovery_start) <= 0) goto out; - - /* Number of clients have have completed recovery */ - if (lprocfs_obd_snprintf(&page, size, &len, "recovered_clients: 
%d\n", - obd->obd_max_recoverable_clients - obd->obd_recoverable_clients) <= 0) + /* Number of clients that have completed recovery */ + if (lprocfs_obd_snprintf(&page, size, &len, + "completed_clients: %d/%d\n", + obd->obd_max_recoverable_clients - + obd->obd_recoverable_clients, + obd->obd_max_recoverable_clients) <= 0) goto out; - - if (lprocfs_obd_snprintf(&page, size, &len, "unrecovered_clients: %d\n", - obd->obd_recoverable_clients) <= 0) + if (lprocfs_obd_snprintf(&page, size, &len, + "replayed_requests: %d\n", + obd->obd_replayed_requests) <= 0) goto out; - - if (lprocfs_obd_snprintf(&page, size, &len, "last_transno: "LPD64"\n", - obd->obd_next_recovery_transno - 1) <= 0) + if (lprocfs_obd_snprintf(&page, size, &len, + "last_transno: "LPD64"\n", + obd->obd_next_recovery_transno - 1)<=0) goto out; - - lprocfs_obd_snprintf(&page, size, &len, "replayed_requests: %d\n", obd->obd_replayed_requests); goto fclose; } if (lprocfs_obd_snprintf(&page, size, &len, "RECOVERING\n") <= 0) goto out; - if (lprocfs_obd_snprintf(&page, size, &len, "recovery_start: %lu\n", - obd->obd_recovery_start) <= 0) + obd->obd_recovery_start) <= 0) goto out; - - if (lprocfs_obd_snprintf(&page, size, &len, "time remaining: %lu\n", - cfs_time_current_sec() >= obd->obd_recovery_end ? 0 : - obd->obd_recovery_end - cfs_time_current_sec()) <= 0) + if (lprocfs_obd_snprintf(&page, size, &len, "time_remaining: %lu\n", + cfs_time_current_sec() >= obd->obd_recovery_end ? 
0 : + obd->obd_recovery_end - cfs_time_current_sec()) <= 0) goto out; - - if(lprocfs_obd_snprintf(&page, size, &len, "connected_clients: %d/%d\n", - obd->obd_connected_clients, - obd->obd_max_recoverable_clients) <= 0) + if (lprocfs_obd_snprintf(&page, size, &len,"connected_clients: %d/%d\n", + obd->obd_connected_clients, + obd->obd_max_recoverable_clients) <= 0) goto out; - - /* Number of clients have have completed recovery */ - if (lprocfs_obd_snprintf(&page, size, &len, "completed_clients: %d/%d\n", - obd->obd_max_recoverable_clients - obd->obd_recoverable_clients, + /* Number of clients that have completed recovery */ + if (lprocfs_obd_snprintf(&page, size, &len,"completed_clients: %d/%d\n", + obd->obd_max_recoverable_clients - + obd->obd_recoverable_clients, obd->obd_max_recoverable_clients) <= 0) goto out; - - if (lprocfs_obd_snprintf(&page, size, &len, "replayed_requests: %d/??\n", + if (lprocfs_obd_snprintf(&page, size, &len,"replayed_requests: %d/??\n", obd->obd_replayed_requests) <= 0) goto out; - if (lprocfs_obd_snprintf(&page, size, &len, "queued_requests: %d\n", obd->obd_requests_queued_for_recovery) <= 0) goto out; @@ -1831,7 +1890,8 @@ EXPORT_SYMBOL(lprocfs_rd_server_uuid); EXPORT_SYMBOL(lprocfs_rd_conn_uuid); EXPORT_SYMBOL(lprocfs_rd_num_exports); EXPORT_SYMBOL(lprocfs_rd_numrefs); - +EXPORT_SYMBOL(lprocfs_at_hist_helper); +EXPORT_SYMBOL(lprocfs_rd_timeouts); EXPORT_SYMBOL(lprocfs_rd_blksize); EXPORT_SYMBOL(lprocfs_rd_kbytestotal); EXPORT_SYMBOL(lprocfs_rd_kbytesfree); diff --git a/lustre/obdclass/obd_config.c b/lustre/obdclass/obd_config.c index fbfb086..f0bdd83 100644 --- a/lustre/obdclass/obd_config.c +++ b/lustre/obdclass/obd_config.c @@ -756,7 +756,6 @@ int class_process_config(struct lustre_cfg *lcfg) CDEBUG(D_IOCTL, "changing lustre timeout from %d to %d\n", obd_timeout, lcfg->lcfg_num); obd_timeout = max(lcfg->lcfg_num, 1U); - obd_health_check_timeout = HEALTH_CHECK_TIMEOUT; GOTO(out, err = 0); } case LCFG_SET_UPCALL: { diff --git 
a/lustre/obdclass/obd_mount.c b/lustre/obdclass/obd_mount.c index 58dc3bf..32d9654 100644 --- a/lustre/obdclass/obd_mount.c +++ b/lustre/obdclass/obd_mount.c @@ -543,7 +543,7 @@ DECLARE_MUTEX(mgc_start_lock); static int lustre_start_mgc(struct super_block *sb) { struct lustre_handle mgc_conn = {0, }; - struct obd_connect_data ocd = { 0 }; + struct obd_connect_data *data = NULL; struct lustre_sb_info *lsi = s2lsi(sb); struct obd_device *obd; struct obd_export *exp; @@ -723,11 +723,14 @@ static int lustre_start_mgc(struct super_block *sb) /* nonfatal */ CWARN("can't set %s %d\n", KEY_INIT_RECOV_BACKUP, rc); /* We connect to the MGS at setup, and don't disconnect until cleanup */ - - ocd.ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_FID; - ocd.ocd_version = LUSTRE_VERSION_CODE; - - rc = obd_connect(NULL, &mgc_conn, obd, &(obd->obd_uuid), &ocd, NULL); + OBD_ALLOC_PTR(data); + if (data == NULL) + GOTO(out, rc = -ENOMEM); + data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_FID | + OBD_CONNECT_AT; + data->ocd_version = LUSTRE_VERSION_CODE; + rc = obd_connect(NULL, &mgc_conn, obd, &(obd->obd_uuid), data, NULL); + OBD_FREE_PTR(data); if (rc) { CERROR("connect failed %d\n", rc); GOTO(out, rc); diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c index 69dccff..f740f46 100644 --- a/lustre/obdfilter/filter.c +++ b/lustre/obdfilter/filter.c @@ -1417,7 +1417,7 @@ struct dentry *filter_parent_lock(struct obd_device *obd, obd_gr group, return ERR_PTR(-ENOENT); rc = filter_lock_dentry(obd, dparent); - fsfilt_check_slow(obd, now, obd_timeout, "parent lock"); + fsfilt_check_slow(obd, now, "parent lock"); return rc ? 
ERR_PTR(rc) : dparent; } @@ -2040,18 +2040,17 @@ int filter_common_setup(struct obd_device *obd, struct lustre_cfg* lcfg, if (obd->obd_recovering) { LCONSOLE_WARN("OST %s now serving %s (%s%s%s), but will be in " - "recovery until %d %s reconnect, or if no clients" - " reconnect for %d:%.02d; during that time new " - "clients will not be allowed to connect. " + "recovery for at least %d:%.02d, or until %d " + "client%s reconnect. During this time new clients" + " will not be allowed to connect. " "Recovery progress can be monitored by watching " "/proc/fs/lustre/obdfilter/%s/recovery_status.\n", obd->obd_name, lustre_cfg_string(lcfg, 1), label ?: "", label ? "/" : "", str, + obd->obd_recovery_timeout / 60, + obd->obd_recovery_timeout % 60, obd->obd_max_recoverable_clients, - (obd->obd_max_recoverable_clients == 1) - ? "client" : "clients", - (int)(OBD_RECOVERY_TIMEOUT) / 60, - (int)(OBD_RECOVERY_TIMEOUT) % 60, + (obd->obd_max_recoverable_clients == 1) ? "":"s", obd->obd_name); } else { LCONSOLE_INFO("OST %s now serving %s (%s%s%s) with recovery " @@ -3553,7 +3552,7 @@ static int filter_precreate(struct obd_device *obd, struct obdo *oa, struct filter_obd *filter; struct obd_statfs *osfs; int err = 0, rc = 0, recreate_obj = 0, i; - unsigned long enough_time = jiffies + min(obd_timeout * HZ / 4, 10U*HZ); + cfs_time_t enough_time = cfs_time_shift(DISK_TIMEOUT/2); obd_id next_id; void *handle = NULL; ENTRY; diff --git a/lustre/obdfilter/filter_internal.h b/lustre/obdfilter/filter_internal.h index 6f2ac82..135a3d2 100644 --- a/lustre/obdfilter/filter_internal.h +++ b/lustre/obdfilter/filter_internal.h @@ -29,8 +29,6 @@ #define FILTER_GRANT_CHUNK (2ULL * PTLRPC_MAX_BRW_SIZE) #define GRANT_FOR_LLOG(obd) 16 -#define FILTER_RECOVERY_TIMEOUT (obd_timeout * 5 * HZ / 2) /* *waves hands* */ - extern struct file_operations filter_per_export_stats_fops; extern struct file_operations filter_per_nid_stats_fops; @@ -60,6 +58,7 @@ struct filter_mod_data { #else #define 
FILTER_FMD_MAX_NUM_DEFAULT 32 #endif +/* Client cache seconds */ #define FILTER_FMD_MAX_AGE_DEFAULT ((obd_timeout + 10) * HZ) struct filter_mod_data *filter_fmd_find(struct obd_export *exp, diff --git a/lustre/obdfilter/filter_io.c b/lustre/obdfilter/filter_io.c index 380e4f6..1fbf7a7 100644 --- a/lustre/obdfilter/filter_io.c +++ b/lustre/obdfilter/filter_io.c @@ -310,7 +310,7 @@ static int filter_preprw_read(int cmd, struct obd_export *exp, struct obdo *oa, inode = dentry->d_inode; obdo_to_inode(inode, oa, OBD_MD_FLATIME); - fsfilt_check_slow(obd, now, obd_timeout, "preprw_read setup"); + fsfilt_check_slow(obd, now, "preprw_read setup"); for (i = 0, lnb = res, rnb = nb; i < obj->ioo_bufcnt; i++, rnb++, lnb++) { @@ -343,7 +343,7 @@ static int filter_preprw_read(int cmd, struct obd_export *exp, struct obdo *oa, filter_iobuf_add_page(obd, iobuf, inode, lnb->page); } - fsfilt_check_slow(obd, now, obd_timeout, "start_page_read"); + fsfilt_check_slow(obd, now, "start_page_read"); rc = filter_direct_io(OBD_BRW_READ, dentry, iobuf, exp, NULL, NULL, NULL); @@ -548,7 +548,7 @@ static int filter_preprw_write(int cmd, struct obd_export *exp, struct obdo *oa, fso.fso_dentry = dentry; fso.fso_bufcnt = obj->ioo_bufcnt; - fsfilt_check_slow(exp->exp_obd, now, obd_timeout, "preprw_write setup"); + fsfilt_check_slow(exp->exp_obd, now, "preprw_write setup"); /* Don't update inode timestamps if this write is older than a * setattr which modifies the timestamps. 
b=10150 */ @@ -648,7 +648,7 @@ static int filter_preprw_write(int cmd, struct obd_export *exp, struct obdo *oa, rc = filter_direct_io(OBD_BRW_READ, dentry, iobuf, exp, NULL, NULL, NULL); - fsfilt_check_slow(exp->exp_obd, now, obd_timeout, "start_page_write"); + fsfilt_check_slow(exp->exp_obd, now, "start_page_write"); if (exp->exp_nid_stats && exp->exp_nid_stats->nid_stats) lprocfs_counter_add(exp->exp_nid_stats->nid_stats, diff --git a/lustre/obdfilter/filter_io_26.c b/lustre/obdfilter/filter_io_26.c index c06c60b..5e9f45b 100644 --- a/lustre/obdfilter/filter_io_26.c +++ b/lustre/obdfilter/filter_io_26.c @@ -708,7 +708,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, DQUOT_INIT(inode); LOCK_INODE_MUTEX(inode); - fsfilt_check_slow(obd, now, obd_timeout, "i_mutex"); + fsfilt_check_slow(obd, now, "i_mutex"); oti->oti_handle = fsfilt_brw_start(obd, objcount, &fso, niocount, res, oti); if (IS_ERR(oti->oti_handle)) { @@ -721,7 +721,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, } /* have to call fsfilt_commit() from this point on */ - fsfilt_check_slow(obd, now, obd_timeout, "brw_start"); + fsfilt_check_slow(obd, now, "brw_start"); i = OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME; @@ -773,7 +773,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, lquota_getflag(filter_quota_interface_ref, obd, oa); - fsfilt_check_slow(obd, now, obd_timeout, "direct_io"); + fsfilt_check_slow(obd, now, "direct_io"); err = fsfilt_commit_wait(obd, inode, wait_handle); if (err) { @@ -786,7 +786,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, "oti_transno "LPU64" last_committed "LPU64"\n", oti->oti_transno, obd->obd_last_committed); - fsfilt_check_slow(obd, now, obd_timeout, "commitrw commit"); + fsfilt_check_slow(obd, now, "commitrw commit"); cleanup: filter_grant_commit(exp, niocount, res); diff --git a/lustre/osc/lproc_osc.c b/lustre/osc/lproc_osc.c index 8712ef6..1d7c0a4 100644 --- 
a/lustre/osc/lproc_osc.c +++ b/lustre/osc/lproc_osc.c @@ -412,6 +412,7 @@ static struct lprocfs_vars lprocfs_osc_obd_vars[] = { { "checksums", osc_rd_checksum, osc_wr_checksum, 0 }, { "checksum_type", osc_rd_checksum_type, osc_wd_checksum_type, 0 }, { "resend_count", osc_rd_resend_count, osc_wr_resend_count, 0}, + { "timeouts", lprocfs_rd_timeouts, 0, 0 }, { 0 } }; diff --git a/lustre/osc/osc_create.c b/lustre/osc/osc_create.c index 99dd705..81eef48 100644 --- a/lustre/osc/osc_create.c +++ b/lustre/osc/osc_create.c @@ -163,7 +163,8 @@ static int oscc_internal_create(struct osc_creator *oscc) RETURN(-ENOMEM); } - request->rq_request_portal = OST_CREATE_PORTAL; //XXX FIXME bug 249 + request->rq_request_portal = OST_CREATE_PORTAL; + ptlrpc_at_set_req_timeout(request); body = lustre_msg_buf(request->rq_reqmsg, REQ_REC_OFF, sizeof(*body)); spin_lock(&oscc->oscc_lock); @@ -378,8 +379,8 @@ int osc_create(struct obd_export *exp, struct obdo *oa, CDEBUG(D_HA,"%s: oscc recovery in progress, waiting\n", oscc->oscc_obd->obd_name); - lwi = LWI_TIMEOUT(cfs_timeout_cap(cfs_time_seconds(obd_timeout/4)), - NULL, NULL); + lwi = LWI_TIMEOUT(cfs_timeout_cap(cfs_time_seconds( + obd_timeout / 4)), NULL, NULL); rc = l_wait_event(oscc->oscc_waitq, !oscc_recovering(oscc), &lwi); LASSERT(rc == 0 || rc == -ETIMEDOUT); diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index f82849b..17c7e1d 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -537,6 +537,7 @@ static int osc_punch(struct obd_export *exp, struct obd_info *oinfo, RETURN(rc); } req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */ + ptlrpc_at_set_req_timeout(req); osc_pack_req_body(req, oinfo); /* overload the size and blocks fields in the oa with start/end */ @@ -703,6 +704,7 @@ static int osc_destroy(struct obd_export *exp, struct obdo *oa, req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */ req->rq_interpret_reply = osc_destroy_interpret; + ptlrpc_at_set_req_timeout(req); if (oti != NULL 
&& oa->o_valid & OBD_MD_FLCOOKIE) memcpy(obdo_logcookie(oa), oti->oti_logcookies, @@ -1062,6 +1064,7 @@ static int osc_brw_prep_request(int cmd, struct client_obd *cli,struct obdo *oa, RETURN(rc); } req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */ + ptlrpc_at_set_req_timeout(req); if (opc == OST_WRITE) desc = ptlrpc_prep_bulk_imp(req, page_count, @@ -3326,7 +3329,9 @@ static int osc_statfs_async(struct obd_device *obd, struct obd_info *oinfo, RETURN(rc); } ptlrpc_request_set_replen(req); - req->rq_request_portal = OST_CREATE_PORTAL; //XXX FIXME bug 249 + req->rq_request_portal = OST_CREATE_PORTAL; + ptlrpc_at_set_req_timeout(req); + if (oinfo->oi_flags & OBD_STATFS_NODELAY) { /* procfs requests not want stat in wait for avoid deadlock */ req->rq_no_resend = 1; @@ -3379,7 +3384,8 @@ static int osc_statfs(struct obd_device *obd, struct obd_statfs *osfs, RETURN(rc); } ptlrpc_request_set_replen(req); - req->rq_request_portal = OST_CREATE_PORTAL; //XXX FIXME bug 249 + req->rq_request_portal = OST_CREATE_PORTAL; + ptlrpc_at_set_req_timeout(req); if (flags & OBD_STATFS_NODELAY) { /* procfs requests not want stat in wait for avoid deadlock */ diff --git a/lustre/ost/lproc_ost.c b/lustre/ost/lproc_ost.c index e07405a9..909abf8 100644 --- a/lustre/ost/lproc_ost.c +++ b/lustre/ost/lproc_ost.c @@ -46,37 +46,4 @@ void lprocfs_ost_init_vars(struct lprocfs_static_vars *lvars) lvars->obd_vars = lprocfs_ost_obd_vars; } -void -ost_print_req(void *seq_file, struct ptlrpc_request *req) -{ - /* Called holding srv_lock with irqs disabled. - * Print specific req contents and a newline. - * CAVEAT EMPTOR: check request message length before printing!!! - * You might have received any old crap so you must be just as - * careful here as the service's request parser!!! 
*/ - struct seq_file *sf = seq_file; - - switch (req->rq_phase) { - case RQ_PHASE_NEW: - /* still awaiting a service thread's attention, or rejected - * because the generic request message didn't unpack */ - seq_printf(sf, "\n"); - break; - - case RQ_PHASE_INTERPRET: - /* being handled, so basic msg swabbed, and opc is valid - * but racing with ost_handle() */ - seq_printf(sf, "opc %d\n", lustre_msg_get_opc(req->rq_reqmsg)); - break; - - case RQ_PHASE_COMPLETE: - /* been handled by ost_handle() reply state possibly still - * volatile */ - seq_printf(sf, "opc %d\n", lustre_msg_get_opc(req->rq_reqmsg)); - break; - - default: - LBUG(); - } -} #endif /* LPROCFS */ diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c index 65b361d..45c5102 100644 --- a/lustre/ost/ost_handler.c +++ b/lustre/ost/ost_handler.c @@ -769,13 +769,14 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti) * If getting the lock took more time than * client was willing to wait, drop it. 
b=11330 */ - if (cfs_time_current_sec() > req->rq_arrival_time.tv_sec + obd_timeout || + if (cfs_time_current_sec() > req->rq_deadline || OBD_FAIL_CHECK(OBD_FAIL_OST_DROP_REQ)) { no_reply = 1; CERROR("Dropping timed-out read from %s because locking" - "object "LPX64" took %ld seconds.\n", + "object "LPX64" took %ld seconds (limit was %ld).\n", libcfs_id2str(req->rq_peer), ioo->ioo_id, - cfs_time_current_sec() - req->rq_arrival_time.tv_sec); + cfs_time_current_sec() - req->rq_arrival_time.tv_sec, + req->rq_deadline - req->rq_arrival_time.tv_sec); GOTO(out_lock, rc = -ETIMEDOUT); } @@ -850,14 +851,30 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti) } if (rc == 0) { - lwi = LWI_TIMEOUT_INTERVAL(obd_timeout * HZ / 4, HZ, - ost_bulk_timeout, desc); - rc = l_wait_event(desc->bd_waitq, - !ptlrpc_bulk_active(desc) || - exp->exp_failed, &lwi); - LASSERT(rc == 0 || rc == -ETIMEDOUT); + time_t start = cfs_time_current_sec(); + do { + long timeoutl = req->rq_deadline - + cfs_time_current_sec(); + cfs_duration_t timeout = (timeoutl <= 0 || rc) ? 
+ CFS_TICK : cfs_time_seconds(timeoutl); + lwi = LWI_TIMEOUT_INTERVAL(timeout, + cfs_time_seconds(1), + ost_bulk_timeout, + desc); + rc = l_wait_event(desc->bd_waitq, + !ptlrpc_bulk_active(desc) || + exp->exp_failed, &lwi); + LASSERT(rc == 0 || rc == -ETIMEDOUT); + /* Wait again if we changed deadline */ + } while ((rc == -ETIMEDOUT) && + (req->rq_deadline > cfs_time_current_sec())); + if (rc == -ETIMEDOUT) { - DEBUG_REQ(D_ERROR, req, "timeout on bulk PUT"); + DEBUG_REQ(D_ERROR, req, + "timeout on bulk PUT after %ld%+lds", + req->rq_deadline - start, + cfs_time_current_sec() - + req->rq_deadline); ptlrpc_abort_bulk(desc); } else if (exp->exp_failed) { DEBUG_REQ(D_ERROR, req, "Eviction on bulk PUT"); @@ -907,11 +924,8 @@ out: req->rq_status = rc; ptlrpc_error(req); } else { - if (req->rq_reply_state != NULL) { - /* reply out callback would free */ - ptlrpc_rs_decref(req->rq_reply_state); - req->rq_reply_state = NULL; - } + /* reply out callback would free */ + ptlrpc_req_drop_rs(req); CWARN("%s: ignoring bulk IO comm error with %s@%s id %s - " "client will retry\n", exp->exp_obd->obd_name, @@ -1023,6 +1037,7 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) rc = lustre_pack_reply(req, 3, size, NULL); if (rc != 0) GOTO(out, rc); + OBD_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_PACK, obd_fail_val); rcs = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF + 1, niocount * sizeof(*rcs)); @@ -1056,13 +1071,14 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) * If getting the lock took more time than * client was willing to wait, drop it. 
b=11330 */ - if (cfs_time_current_sec() > req->rq_arrival_time.tv_sec + obd_timeout || + if (cfs_time_current_sec() > req->rq_deadline || OBD_FAIL_CHECK(OBD_FAIL_OST_DROP_REQ)) { no_reply = 1; - CERROR("Dropping timed-out write from %s because locking" - "object "LPX64" took %ld seconds.\n", + CERROR("Dropping timed-out write from %s because locking " + "object "LPX64" took %ld seconds (limit was %ld).\n", libcfs_id2str(req->rq_peer), ioo->ioo_id, - cfs_time_current_sec() - req->rq_arrival_time.tv_sec); + cfs_time_current_sec() - req->rq_arrival_time.tv_sec, + req->rq_deadline - req->rq_arrival_time.tv_sec); GOTO(out_lock, rc = -ETIMEDOUT); } @@ -1102,13 +1118,28 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) else rc = ptlrpc_start_bulk_transfer (desc); if (rc == 0) { - lwi = LWI_TIMEOUT_INTERVAL(obd_timeout * HZ / 2, HZ, - ost_bulk_timeout, desc); - rc = l_wait_event(desc->bd_waitq, !ptlrpc_bulk_active(desc) || - desc->bd_export->exp_failed, &lwi); - LASSERT(rc == 0 || rc == -ETIMEDOUT); + time_t start = cfs_time_current_sec(); + do { + long timeoutl = req->rq_deadline - + cfs_time_current_sec(); + cfs_duration_t timeout = (timeoutl <= 0 || rc) ? 
+ CFS_TICK : cfs_time_seconds(timeoutl); + lwi = LWI_TIMEOUT_INTERVAL(timeout, cfs_time_seconds(1), + ost_bulk_timeout, desc); + rc = l_wait_event(desc->bd_waitq, + !ptlrpc_bulk_active(desc) || + desc->bd_export->exp_failed, &lwi); + LASSERT(rc == 0 || rc == -ETIMEDOUT); + /* Wait again if we changed deadline */ + } while ((rc == -ETIMEDOUT) && + (req->rq_deadline > cfs_time_current_sec())); + if (rc == -ETIMEDOUT) { - DEBUG_REQ(D_ERROR, req, "timeout on bulk GET"); + DEBUG_REQ(D_ERROR, req, + "timeout on bulk GET after %ld%+lds", + req->rq_deadline - start, + cfs_time_current_sec() - + req->rq_deadline); ptlrpc_abort_bulk(desc); } else if (desc->bd_export->exp_failed) { DEBUG_REQ(D_ERROR, req, "Eviction on bulk GET"); @@ -1234,11 +1265,8 @@ out: req->rq_status = rc; ptlrpc_error(req); } else { - if (req->rq_reply_state != NULL) { - /* reply out callback would free */ - ptlrpc_rs_decref(req->rq_reply_state); - req->rq_reply_state = NULL; - } + /* reply out callback would free */ + ptlrpc_req_drop_rs(req); CWARN("%s: ignoring bulk IO comm error with %s@%s id %s - " "client will retry\n", exp->exp_obd->obd_name, @@ -1890,12 +1918,11 @@ static int ost_setup(struct obd_device *obd, struct lustre_cfg* lcfg) ost->ost_service = ptlrpc_init_svc(OST_NBUFS, OST_BUFSIZE, OST_MAXREQSIZE, OST_MAXREPSIZE, OST_REQUEST_PORTAL, - OSC_REPLY_PORTAL, - OST_WATCHDOG_TIMEOUT, ost_handle, - LUSTRE_OSS_NAME, obd->obd_proc_entry, - ost_print_req, oss_min_threads, - oss_max_threads, "ll_ost", - LCT_DT_THREAD); + OSC_REPLY_PORTAL, OSS_SERVICE_WATCHDOG_FACTOR, + ost_handle, LUSTRE_OSS_NAME, + obd->obd_proc_entry, target_print_req, + oss_min_threads, oss_max_threads, + "ll_ost", LCT_DT_THREAD); if (ost->ost_service == NULL) { CERROR("failed to start service\n"); GOTO(out_lprocfs, rc = -ENOMEM); @@ -1908,23 +1935,22 @@ static int ost_setup(struct obd_device *obd, struct lustre_cfg* lcfg) if (oss_num_create_threads) { if (oss_num_create_threads > OSS_MAX_CREATE_THREADS) oss_num_create_threads = 
OSS_MAX_CREATE_THREADS; - if (oss_num_create_threads < OSS_DEF_CREATE_THREADS) - oss_num_create_threads = OSS_DEF_CREATE_THREADS; + if (oss_num_create_threads < OSS_MIN_CREATE_THREADS) + oss_num_create_threads = OSS_MIN_CREATE_THREADS; oss_min_create_threads = oss_max_create_threads = oss_num_create_threads; } else { - oss_min_create_threads = OSS_DEF_CREATE_THREADS; + oss_min_create_threads = OSS_MIN_CREATE_THREADS; oss_max_create_threads = OSS_MAX_CREATE_THREADS; } ost->ost_create_service = ptlrpc_init_svc(OST_NBUFS, OST_BUFSIZE, OST_MAXREQSIZE, OST_MAXREPSIZE, OST_CREATE_PORTAL, - OSC_REPLY_PORTAL, - OST_WATCHDOG_TIMEOUT, ost_handle, "ost_create", - obd->obd_proc_entry, ost_print_req, - oss_min_create_threads, - oss_max_create_threads, + OSC_REPLY_PORTAL, OSS_SERVICE_WATCHDOG_FACTOR, + ost_handle, "ost_create", + obd->obd_proc_entry, target_print_req, + oss_min_create_threads, oss_max_create_threads, "ll_ost_creat", LCT_DT_THREAD); if (ost->ost_create_service == NULL) { CERROR("failed to start OST create service\n"); @@ -1938,9 +1964,9 @@ static int ost_setup(struct obd_device *obd, struct lustre_cfg* lcfg) ost->ost_io_service = ptlrpc_init_svc(OST_NBUFS, OST_BUFSIZE, OST_MAXREQSIZE, OST_MAXREPSIZE, OST_IO_PORTAL, - OSC_REPLY_PORTAL, - OST_WATCHDOG_TIMEOUT, ost_handle, "ost_io", - obd->obd_proc_entry, ost_print_req, + OSC_REPLY_PORTAL, OSS_SERVICE_WATCHDOG_FACTOR, + ost_handle, "ost_io", + obd->obd_proc_entry, target_print_req, oss_min_threads, oss_max_threads, "ll_ost_io", LCT_DT_THREAD); if (ost->ost_io_service == NULL) { diff --git a/lustre/ost/ost_internal.h b/lustre/ost/ost_internal.h index 64df2d2..55adc5f 100644 --- a/lustre/ost/ost_internal.h +++ b/lustre/ost/ost_internal.h @@ -5,11 +5,7 @@ #ifndef OST_INTERNAL_H #define OST_INTERNAL_H -#ifdef LPROCFS -extern void ost_print_req(void *seq_file, struct ptlrpc_request *req); -#else -# define ost_print_req NULL -#endif +#define OSS_SERVICE_WATCHDOG_FACTOR 2000 /* * tunables for per-thread page pool (bug 
5137) @@ -37,7 +33,7 @@ struct ost_thread_local_cache { struct ost_thread_local_cache *ost_tls(struct ptlrpc_request *r); -#define OSS_DEF_CREATE_THREADS 1UL +#define OSS_MIN_CREATE_THREADS 2UL #define OSS_MAX_CREATE_THREADS 16UL /* Quota stuff */ diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index bc9c2c2..8fa9fff 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -193,6 +193,160 @@ void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc) EXIT; } +/* Set server timelimit for this req */ +void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req) +{ + __u32 serv_est; + int idx; + struct imp_at *at; + + LASSERT(req->rq_import); + + if (AT_OFF) { + /* non-AT settings */ + req->rq_timeout = req->rq_import->imp_server_timeout ? + obd_timeout / 2 : obd_timeout; + lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout); + return; + } + + at = &req->rq_import->imp_at; + idx = import_at_get_index(req->rq_import, + req->rq_request_portal); + serv_est = at_get(&at->iat_service_estimate[idx]); + /* add an arbitrary minimum: 125% +5 sec */ + req->rq_timeout = serv_est + (serv_est >> 2) + 5; + /* We could get even fancier here, using history to predict increased + loading... 
*/ + + /* Let the server know what this RPC timeout is by putting it in the + reqmsg*/ + lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout); +} + +/* Adjust max service estimate based on server value */ +static void ptlrpc_at_adj_service(struct ptlrpc_request *req) +{ + int idx; + unsigned int serv_est, oldse; + struct imp_at *at = &req->rq_import->imp_at; + + LASSERT(req->rq_import); + + /* service estimate is returned in the repmsg timeout field, + may be 0 on err */ + serv_est = lustre_msg_get_timeout(req->rq_repmsg); + + idx = import_at_get_index(req->rq_import, req->rq_request_portal); + /* max service estimates are tracked on the server side, + so just keep minimal history here */ + oldse = at_add(&at->iat_service_estimate[idx], serv_est); + if (oldse != 0) + CDEBUG(D_ADAPTTO, "The RPC service estimate for %s ptl %d " + "has changed from %d to %d\n", + req->rq_import->imp_obd->obd_name,req->rq_request_portal, + oldse, at_get(&at->iat_service_estimate[idx])); +} + +/* Expected network latency per remote node (secs) */ +int ptlrpc_at_get_net_latency(struct ptlrpc_request *req) +{ + return AT_OFF ? 
0 : at_get(&req->rq_import->imp_at.iat_net_latency); +} + +/* Adjust expected network latency */ +static void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req) +{ + unsigned int st, nl, oldnl; + struct imp_at *at = &req->rq_import->imp_at; + time_t now = cfs_time_current_sec(); + + LASSERT(req->rq_import); + + st = lustre_msg_get_service_time(req->rq_repmsg); + + /* Network latency is total time less server processing time */ + nl = max_t(int, now - req->rq_sent - st, 0) + 1/*st rounding*/; + if (st > now - req->rq_sent + 2 /* rounding */) + CERROR("Reported service time %u > total measured time %ld\n", + st, now - req->rq_sent); + + oldnl = at_add(&at->iat_net_latency, nl); + if (oldnl != 0) + CDEBUG(D_ADAPTTO, "The network latency for %s (nid %s) " + "has changed from %d to %d\n", + req->rq_import->imp_obd->obd_name, + obd_uuid2str( + &req->rq_import->imp_connection->c_remote_uuid), + oldnl, at_get(&at->iat_net_latency)); +} + +static int unpack_reply(struct ptlrpc_request *req) +{ + int rc; + + /* Clear reply swab mask; we may have already swabbed an early reply */ + req->rq_rep_swab_mask = 0; + + rc = lustre_unpack_msg(req->rq_repmsg, req->rq_replen); + if (rc) { + DEBUG_REQ(D_ERROR, req, "unpack_rep failed: %d", rc); + return(-EPROTO); + } + + rc = lustre_unpack_rep_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF); + if (rc) { + DEBUG_REQ(D_ERROR, req, "unpack ptlrpc body failed: %d", rc); + return(-EPROTO); + } + return 0; +} + +/* + * Handle an early reply message, called with the rq_lock held. 
+ * If anything goes wrong just ignore it - same as if it never happened + */ +static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req) { + time_t olddl; + int rc; + ENTRY; + + req->rq_early = 0; + spin_unlock(&req->rq_lock); + + rc = sptlrpc_cli_unwrap_early_reply(req); + if (rc) + GOTO(out, rc); + + rc = unpack_reply(req); + if (rc) + GOTO(out_cleanup, rc); + + /* Expecting to increase the service time estimate here */ + ptlrpc_at_adj_service(req); + ptlrpc_at_adj_net_latency(req); + + /* Adjust the local timeout for this req */ + ptlrpc_at_set_req_timeout(req); + + olddl = req->rq_deadline; + /* server assumes it now has rq_timeout from when it sent the + early reply, so client should give it at least that long. */ + req->rq_deadline = cfs_time_current_sec() + req->rq_timeout + + ptlrpc_at_get_net_latency(req); + + DEBUG_REQ(D_ADAPTTO, req, + "Early reply #%d, new deadline in %lds (%+lds)", + req->rq_early_count, req->rq_deadline - + cfs_time_current_sec(), req->rq_deadline - olddl); + +out_cleanup: + sptlrpc_cli_finish_early_reply(req); +out: + spin_lock(&req->rq_lock); + RETURN(rc); +} + void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool) { struct list_head *l, *tmp; @@ -348,11 +502,6 @@ static int __ptlrpc_request_bufs_pack(struct ptlrpc_request *request, } lustre_msg_add_version(request->rq_reqmsg, version); - - if (imp->imp_server_timeout) - request->rq_timeout = obd_timeout / 2; - else - request->rq_timeout = obd_timeout; request->rq_send_state = LUSTRE_IMP_FULL; request->rq_type = PTL_RPC_MSG_REQUEST; request->rq_export = NULL; @@ -365,12 +514,14 @@ static int __ptlrpc_request_bufs_pack(struct ptlrpc_request *request, request->rq_phase = RQ_PHASE_NEW; - /* XXX FIXME bug 249 */ request->rq_request_portal = imp->imp_client->cli_request_portal; request->rq_reply_portal = imp->imp_client->cli_reply_portal; + ptlrpc_at_set_req_timeout(request); + spin_lock_init(&request->rq_lock); CFS_INIT_LIST_HEAD(&request->rq_list); + 
CFS_INIT_LIST_HEAD(&request->rq_timed_list); CFS_INIT_LIST_HEAD(&request->rq_replay_list); CFS_INIT_LIST_HEAD(&request->rq_mod_list); CFS_INIT_LIST_HEAD(&request->rq_ctx_chain); @@ -381,7 +532,6 @@ static int __ptlrpc_request_bufs_pack(struct ptlrpc_request *request, atomic_set(&request->rq_refcount, 1); lustre_msg_set_opc(request->rq_reqmsg, opcode); - lustre_msg_set_flags(request->rq_reqmsg, 0); RETURN(0); out_ctx: @@ -721,6 +871,12 @@ static int ptlrpc_check_reply(struct ptlrpc_request *req) if (req->rq_restart) GOTO(out, rc = 1); + + if (req->rq_early) { + ptlrpc_at_recv_early_reply(req); + GOTO(out, rc = 0); /* keep waiting */ + } + EXIT; out: spin_unlock(&req->rq_lock); @@ -775,11 +931,6 @@ static int after_reply(struct ptlrpc_request *req) * including buflens, status etc is in the sender's byte order. */ - /* - * Clear reply swab mask; this is a new reply in sender's byte order. - */ - req->rq_rep_swab_mask = 0; - rc = sptlrpc_cli_unwrap_reply(req); if (rc) { DEBUG_REQ(D_ERROR, req, "unwrap reply failed (%d):", rc); @@ -792,17 +943,9 @@ static int after_reply(struct ptlrpc_request *req) if (req->rq_resend) RETURN(0); - rc = lustre_unpack_msg(req->rq_repmsg, req->rq_replen); - if (rc) { - DEBUG_REQ(D_ERROR, req, "unpack_rep failed: %d", rc); - RETURN(-EPROTO); - } - - rc = lustre_unpack_rep_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF); - if (rc) { - DEBUG_REQ(D_ERROR, req, "unpack ptlrpc body failed: %d", rc); - RETURN(-EPROTO); - } + rc = unpack_reply(req); + if (rc) + RETURN(rc); do_gettimeofday(&work_start); timediff = cfs_timeval_sub(&work_start, &req->rq_arrival_time, NULL); @@ -817,6 +960,10 @@ static int after_reply(struct ptlrpc_request *req) RETURN(-EPROTO); } + OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_PAUSE_REP, obd_fail_val); + ptlrpc_at_adj_service(req); + ptlrpc_at_adj_net_latency(req); + rc = ptlrpc_check_status(req); imp->imp_connect_error = rc; @@ -1114,13 +1261,27 @@ check_ctx: force_timer_recalc = 1; } + spin_lock(&req->rq_lock); + + if (req->rq_early) { + 
ptlrpc_at_recv_early_reply(req); + spin_unlock(&req->rq_lock); + continue; + } + /* Still waiting for a reply? */ - if (ptlrpc_client_receiving_reply(req)) + if (req->rq_receiving_reply) { + spin_unlock(&req->rq_lock); continue; + } /* Did we actually receive a reply? */ - if (!ptlrpc_client_replied(req)) + if (!req->rq_replied) { + spin_unlock(&req->rq_lock); continue; + } + + spin_unlock(&req->rq_lock); spin_lock(&imp->imp_lock); list_del_init(&req->rq_list); @@ -1198,16 +1359,27 @@ check_ctx: RETURN(set->set_remaining == 0 || force_timer_recalc); } +/* Return 1 if we should give up, else 0 */ int ptlrpc_expire_one_request(struct ptlrpc_request *req) { struct obd_import *imp = req->rq_import; int rc = 0; ENTRY; - DEBUG_REQ(D_ERROR|D_NETERROR, req, "%s (sent at %lu, "CFS_DURATION_T"s ago)", + DEBUG_REQ(D_ERROR|D_NETERROR, req, + "%s (sent at %lu, "CFS_DURATION_T"s ago)", req->rq_net_err ? "network error" : "timeout", (long)req->rq_sent, cfs_time_current_sec() - req->rq_sent); + if (imp) { + LCONSOLE_WARN("Request x"LPU64" sent from %s to NID %s %lus ago" + " has timed out (limit %lus).\n", req->rq_xid, + req->rq_import->imp_obd->obd_name, + libcfs_nid2str(imp->imp_connection->c_peer.nid), + cfs_time_current_sec() - req->rq_sent, + req->rq_deadline - req->rq_sent); + } + if (imp != NULL && obd_debug_peer_on_timeout) LNetCtl(IOC_LIBCFS_DEBUG_PEER, &imp->imp_connection->c_peer); @@ -1238,6 +1410,9 @@ int ptlrpc_expire_one_request(struct ptlrpc_request *req) if (req->rq_ctx_init || req->rq_ctx_fini || req->rq_send_state != LUSTRE_IMP_FULL || imp->imp_obd->obd_no_recov) { + DEBUG_REQ(D_RPCTRACE, req, "err -110, sent_state=%s (now=%s)", + ptlrpc_import_state_name(req->rq_send_state), + ptlrpc_import_state_name(imp->imp_state)); spin_lock(&req->rq_lock); req->rq_status = -ETIMEDOUT; req->rq_err = 1; @@ -1245,7 +1420,8 @@ int ptlrpc_expire_one_request(struct ptlrpc_request *req) RETURN(1); } - /* if request can't be resend we can't wait answer after timeout */ + /* if a 
request can't be resent we can't wait for an answer after + the timeout */ if (req->rq_no_resend) { DEBUG_REQ(D_RPCTRACE, req, "TIMEOUT-NORESEND:"); rc = 1; @@ -1271,13 +1447,13 @@ int ptlrpc_expired_set(void *data) list_entry(tmp, struct ptlrpc_request, rq_set_chain); /* request in-flight? */ - if (!((req->rq_phase == RQ_PHASE_RPC && !req->rq_waiting && + if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting && !req->rq_resend) || (req->rq_phase == RQ_PHASE_BULK))) continue; if (req->rq_timedout || /* already dealt with */ - req->rq_sent + req->rq_timeout > now) /* not expired */ + req->rq_deadline > now) /* not expired */ continue; /* deal with this guy */ @@ -1317,13 +1493,14 @@ void ptlrpc_interrupted_set(void *data) } } +/* get the smallest timeout in the set; this does NOT set a timeout. */ int ptlrpc_set_next_timeout(struct ptlrpc_request_set *set) { struct list_head *tmp; time_t now = cfs_time_current_sec(); - time_t deadline; int timeout = 0; struct ptlrpc_request *req; + int deadline; ENTRY; SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */ @@ -1543,14 +1720,15 @@ void ptlrpc_unregister_reply (struct ptlrpc_request *request) struct l_wait_info lwi; LASSERT(!in_interrupt ()); /* might sleep */ - - if (!ptlrpc_client_receiving_reply(request)) + if (!ptlrpc_client_recv_or_unlink(request)) + /* Nothing left to do */ return; LNetMDUnlink (request->rq_reply_md_h); /* We have to l_wait_event() whatever the result, to give liblustre - * a chance to run reply_in_callback() */ + * a chance to run reply_in_callback(), and to make sure we've + * unlinked before returning a req to the pool */ if (request->rq_set != NULL) wq = &request->rq_set->set_waitq; @@ -1560,13 +1738,16 @@ void ptlrpc_unregister_reply (struct ptlrpc_request *request) for (;;) { /* Network access will complete in finite time but the HUGE * timeout lets us CWARN for visibility of sluggish NALs */ - lwi = LWI_TIMEOUT(cfs_time_seconds(300), NULL, NULL); - rc = l_wait_event (*wq, 
!ptlrpc_client_receiving_reply(request), &lwi); + lwi = LWI_TIMEOUT(cfs_time_seconds(LONG_UNLINK), NULL, NULL); + rc = l_wait_event (*wq, !ptlrpc_client_recv_or_unlink(request), + &lwi); if (rc == 0) return; LASSERT (rc == -ETIMEDOUT); - DEBUG_REQ(D_WARNING, request, "Unexpectedly long timeout"); + DEBUG_REQ(D_WARNING, request, "Unexpectedly long timeout " + "rvcng=%d unlnk=%d", request->rq_receiving_reply, + request->rq_must_unlink); } } @@ -1587,6 +1768,7 @@ void ptlrpc_free_committed(struct obd_import *imp) imp->imp_generation == imp->imp_last_generation_checked) { CDEBUG(D_RPCTRACE, "%s: skip recheck: last_committed "LPU64"\n", imp->imp_obd->obd_name, imp->imp_peer_committed_transno); + EXIT; return; } @@ -1686,6 +1868,10 @@ static int expired_request(void *data) if (ptlrpc_check_suspend()) RETURN(1); + /* deadline may have changed with an early reply */ + if (req->rq_deadline > cfs_time_current_sec()) + RETURN(1); + RETURN(ptlrpc_expire_one_request(req)); } @@ -1757,7 +1943,8 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req) int brc; struct l_wait_info lwi; struct obd_import *imp = req->rq_import; - cfs_duration_t timeout = 0; + cfs_duration_t timeout = CFS_TICK; + long timeoutl; ENTRY; LASSERT(req->rq_set == NULL); @@ -1805,6 +1992,7 @@ restart: list_del_init(&req->rq_list); if (req->rq_err) { + /* rq_status was set locally */ rc = -EIO; } else if (req->rq_intr) { @@ -1868,19 +2056,20 @@ restart: } rc = ptl_send_rpc(req, 0); - if (rc) { + if (rc) DEBUG_REQ(D_HA, req, "send failed (%d); recovering", rc); - timeout = CFS_TICK; - } else { - timeout = cfs_timeout_cap(cfs_time_seconds(req->rq_timeout)); - DEBUG_REQ(D_NET, req, - "-- sleeping for "CFS_DURATION_T" jiffies", timeout); - } + repeat: + timeoutl = req->rq_deadline - cfs_time_current_sec(); + timeout = (timeoutl <= 0 || rc) ? 
CFS_TICK : + cfs_time_seconds(timeoutl); + DEBUG_REQ(D_NET, req, + "-- sleeping for "CFS_DURATION_T" ticks", timeout); lwi = LWI_TIMEOUT_INTR(timeout, expired_request, interrupted_request, req); rc = l_wait_event(req->rq_reply_waitq, ptlrpc_check_reply(req), &lwi); - if (rc == -ETIMEDOUT && ptlrpc_check_and_wait_suspend(req)) + if (rc == -ETIMEDOUT && ((req->rq_deadline > cfs_time_current_sec()) || + ptlrpc_check_and_wait_suspend(req))) goto repeat; after_send: @@ -1900,16 +2089,11 @@ after_send: * req->rq_receiving_reply is clear and returns. */ ptlrpc_unregister_reply (req); - if (req->rq_err) - GOTO(out, rc = -EIO); - /* Resend if we need to, unless we were interrupted. */ - if (req->rq_resend && !req->rq_intr) { - /* ...unless we were specifically told otherwise. */ - if (req->rq_no_resend) - GOTO(out, rc = -ETIMEDOUT); - spin_lock(&imp->imp_lock); - goto restart; + if (req->rq_err) { + DEBUG_REQ(D_RPCTRACE, req, "err rc=%d status=%d", + rc, req->rq_status); + GOTO(out, rc = -EIO); } if (req->rq_intr) { @@ -1920,6 +2104,15 @@ after_send: GOTO(out, rc = -EINTR); } + /* Resend if we need to */ + if (req->rq_resend) { + /* ...unless we were specifically told otherwise. */ + if (req->rq_no_resend) + GOTO(out, rc = -ETIMEDOUT); + spin_lock(&imp->imp_lock); + goto restart; + } + if (req->rq_timedout) { /* non-recoverable timeout */ GOTO(out, rc = -ETIMEDOUT); } @@ -2045,12 +2238,9 @@ int ptlrpc_replay_req(struct ptlrpc_request *req) ENTRY; LASSERT(req->rq_import->imp_state == LUSTRE_IMP_REPLAY); - /* Not handling automatic bulk replay yet (or ever?) 
*/ LASSERT(req->rq_bulk == NULL); - DEBUG_REQ(D_HA, req, "REPLAY"); - LASSERT (sizeof (*aa) <= sizeof (req->rq_async_args)); aa = (struct ptlrpc_replay_async_args *)&req->rq_async_args; memset(aa, 0, sizeof *aa); @@ -2059,10 +2249,15 @@ int ptlrpc_replay_req(struct ptlrpc_request *req) aa->praa_old_state = req->rq_send_state; req->rq_send_state = LUSTRE_IMP_REPLAY; req->rq_phase = RQ_PHASE_NEW; - aa->praa_old_status = lustre_msg_get_status(req->rq_repmsg); + if (req->rq_repmsg) + aa->praa_old_status = lustre_msg_get_status(req->rq_repmsg); req->rq_status = 0; - req->rq_interpret_reply = ptlrpc_replay_interpret; + /* Readjust the timeout for current conditions */ + ptlrpc_at_set_req_timeout(req); + + DEBUG_REQ(D_HA, req, "REPLAY"); + atomic_inc(&req->rq_import->imp_replay_inflight); ptlrpc_request_addref(req); /* ptlrpcd needs a ref */ @@ -2094,6 +2289,7 @@ void ptlrpc_abort_inflight(struct obd_import *imp) spin_lock (&req->rq_lock); if (req->rq_import_generation < imp->imp_generation) { req->rq_err = 1; + req->rq_status = -EINTR; ptlrpc_wake_client_req(req); } spin_unlock (&req->rq_lock); @@ -2108,6 +2304,7 @@ void ptlrpc_abort_inflight(struct obd_import *imp) spin_lock (&req->rq_lock); if (req->rq_import_generation < imp->imp_generation) { req->rq_err = 1; + req->rq_status = -EINTR; ptlrpc_wake_client_req(req); } spin_unlock (&req->rq_lock); @@ -2144,3 +2341,4 @@ __u64 ptlrpc_sample_next_xid(void) return tmp; } EXPORT_SYMBOL(ptlrpc_sample_next_xid); + diff --git a/lustre/ptlrpc/events.c b/lustre/ptlrpc/events.c index d24c92d..72bbce3 100644 --- a/lustre/ptlrpc/events.c +++ b/lustre/ptlrpc/events.c @@ -83,30 +83,74 @@ void reply_in_callback(lnet_event_t *ev) struct ptlrpc_request *req = cbid->cbid_arg; ENTRY; - LASSERT (ev->type == LNET_EVENT_PUT || - ev->type == LNET_EVENT_UNLINK); - LASSERT (ev->unlinked); - LASSERT (ev->md.start == req->rq_repbuf); - LASSERT (ev->offset == 0); - LASSERT (ev->mlength <= req->rq_repbuf_len); - DEBUG_REQ((ev->status == 0) ? 
D_NET : D_ERROR, req, "type %d, status %d", ev->type, ev->status); + LASSERT (ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_UNLINK); + LASSERT (ev->md.start == req->rq_repbuf); + LASSERT (ev->mlength <= req->rq_repbuf_len); + /* We've set LNET_MD_MANAGE_REMOTE for all outgoing requests + for adaptive timeouts' early reply. */ + LASSERT((ev->md.options & LNET_MD_MANAGE_REMOTE) != 0); + spin_lock(&req->rq_lock); - LASSERT (req->rq_receiving_reply); req->rq_receiving_reply = 0; + req->rq_early = 0; + + if (ev->status) + goto out_wake; + if (ev->type == LNET_EVENT_UNLINK) { + req->rq_must_unlink = 0; + DEBUG_REQ(D_RPCTRACE, req, "unlink"); + goto out_wake; + } - if (ev->type == LNET_EVENT_PUT && ev->status == 0) { + if ((ev->offset == 0) && + ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT))) { + /* Early reply */ + DEBUG_REQ(D_ADAPTTO, req, + "Early reply received: mlen=%u offset=%d replen=%d " + "replied=%d unlinked=%d", ev->mlength, ev->offset, + req->rq_replen, req->rq_replied, ev->unlinked); + + req->rq_early_count++; /* number received, client side */ + if (req->rq_replied) { + /* If we already got the real reply, then we need to + * check if lnet_finalize() unlinked the md. In that + * case, there will be no further callback of type + * LNET_EVENT_UNLINK. 
+ */ + if (ev->unlinked) + req->rq_must_unlink = 0; + else + DEBUG_REQ(D_RPCTRACE, req, "unlinked in reply"); + goto out_wake; + } + req->rq_early = 1; + req->rq_reply_off = ev->offset; + req->rq_nob_received = ev->mlength; + /* And we're still receiving */ + req->rq_receiving_reply = 1; + } else { + /* Real reply */ req->rq_replied = 1; + req->rq_reply_off = ev->offset; req->rq_nob_received = ev->mlength; + /* LNetMDUnlink can't be called under the LNET_LOCK, + so we must unlink in ptlrpc_unregister_reply */ + DEBUG_REQ(D_INFO, req, + "reply in flags=%x mlen=%u offset=%d replen=%d", + lustre_msg_get_flags(req->rq_reqmsg), + ev->mlength, ev->offset, req->rq_replen); } + req->rq_import->imp_last_reply_time = cfs_time_current_sec(); + +out_wake: /* NB don't unlock till after wakeup; req can disappear under us * since we don't have our own ref */ ptlrpc_wake_client_req(req); - spin_unlock(&req->rq_lock); EXIT; } @@ -212,6 +256,11 @@ void request_in_callback(lnet_event_t *ev) #ifdef CRAY_XT3 req->rq_uid = ev->uid; #endif + spin_lock_init(&req->rq_lock); + CFS_INIT_LIST_HEAD(&req->rq_timed_list); + atomic_set(&req->rq_refcount, 1); + if (ev->type == LNET_EVENT_PUT) + DEBUG_REQ(D_RPCTRACE, req, "incoming req"); CDEBUG(D_RPCTRACE, "peer: %s\n", libcfs_id2str(req->rq_peer)); @@ -239,7 +288,7 @@ void request_in_callback(lnet_event_t *ev) rqbd->rqbd_refcount++; } - list_add_tail(&req->rq_list, &service->srv_request_queue); + list_add_tail(&req->rq_list, &service->srv_req_in_queue); service->srv_n_queued_reqs++; /* NB everything can disappear under us once the request diff --git a/lustre/ptlrpc/gss/gss_bulk.c b/lustre/ptlrpc/gss/gss_bulk.c index 7766a06..88d0599 100644 --- a/lustre/ptlrpc/gss/gss_bulk.c +++ b/lustre/ptlrpc/gss/gss_bulk.c @@ -509,7 +509,7 @@ int gss_cli_ctx_unwrap_bulk(struct ptlrpc_cli_ctx *ctx, switch (RPC_FLVR_SVC(req->rq_flvr.sf_rpc)) { case SPTLRPC_SVC_NULL: - vmsg = req->rq_repbuf; + vmsg = req->rq_repdata; voff = vmsg->lm_bufcount - 1; LASSERT(vmsg && 
vmsg->lm_bufcount >= 3); @@ -519,7 +519,7 @@ int gss_cli_ctx_unwrap_bulk(struct ptlrpc_cli_ctx *ctx, break; case SPTLRPC_SVC_AUTH: case SPTLRPC_SVC_INTG: - vmsg = req->rq_repbuf; + vmsg = req->rq_repdata; voff = vmsg->lm_bufcount - 2; LASSERT(vmsg && vmsg->lm_bufcount >= 4); @@ -528,7 +528,7 @@ int gss_cli_ctx_unwrap_bulk(struct ptlrpc_cli_ctx *ctx, LASSERT(rmsg && rmsg->lm_bufcount >= 4); break; case SPTLRPC_SVC_PRIV: - vmsg = req->rq_repbuf; + vmsg = req->rq_repdata; voff = vmsg->lm_bufcount - 1; LASSERT(vmsg && vmsg->lm_bufcount >= 2); diff --git a/lustre/ptlrpc/gss/gss_cli_upcall.c b/lustre/ptlrpc/gss/gss_cli_upcall.c index ccc000f..b37502a 100644 --- a/lustre/ptlrpc/gss/gss_cli_upcall.c +++ b/lustre/ptlrpc/gss/gss_cli_upcall.c @@ -311,7 +311,8 @@ int gss_do_ctx_init_rpc(__user char *buffer, unsigned long count) goto out_copy; } - lsize = ctx_init_parse_reply(req->rq_repbuf, + LASSERT(req->rq_repdata); + lsize = ctx_init_parse_reply(req->rq_repdata, param.reply_buf, param.reply_buf_size); if (lsize < 0) { param.status = (int) lsize; diff --git a/lustre/ptlrpc/gss/gss_keyring.c b/lustre/ptlrpc/gss/gss_keyring.c index f2014e5..23a684ef 100644 --- a/lustre/ptlrpc/gss/gss_keyring.c +++ b/lustre/ptlrpc/gss/gss_keyring.c @@ -65,8 +65,12 @@ static int sec_install_rctx_kr(struct ptlrpc_sec *sec, /* * the timeout is only for the case that upcall child process die abnormally. - * in any other cases it should finally update kernel key. so we set this - * timeout value excessive long. + * in any other cases it should finally update kernel key. + * + * FIXME we'd better to incorporate the client & server side upcall timeouts + * into the framework of Adaptive Timeouts, but we need to figure out how to + * make sure that kernel knows the upcall processes is in-progress or died + * unexpectedly. 
*/ #define KEYRING_UPCALL_TIMEOUT (obd_timeout + obd_timeout) @@ -833,7 +837,7 @@ void flush_user_ctx_cache_kr(struct ptlrpc_sec *sec, for (;;) { key = request_key(&gss_key_type, desc, NULL); if (IS_ERR(key)) { - CWARN("No more key found for current user\n"); + CDEBUG(D_SEC, "No more key found for current user\n"); break; } diff --git a/lustre/ptlrpc/gss/gss_svc_upcall.c b/lustre/ptlrpc/gss/gss_svc_upcall.c index f76c22d..e1b1d7f 100644 --- a/lustre/ptlrpc/gss/gss_svc_upcall.c +++ b/lustre/ptlrpc/gss/gss_svc_upcall.c @@ -1330,7 +1330,7 @@ cache_check: grctx->src_init = 1; grctx->src_reserve_len = size_round4(rsip->out_token.len); - rc = lustre_pack_reply_v2(req, 1, &replen, NULL); + rc = lustre_pack_reply_v2(req, 1, &replen, NULL, 0); if (rc) { CERROR("failed to pack reply: %d\n", rc); GOTO(out, rc = SECSVC_DROP); diff --git a/lustre/ptlrpc/gss/sec_gss.c b/lustre/ptlrpc/gss/sec_gss.c index f33cddb..d4aef70 100644 --- a/lustre/ptlrpc/gss/sec_gss.c +++ b/lustre/ptlrpc/gss/sec_gss.c @@ -65,6 +65,7 @@ #include #include #include +#include #include #include #include @@ -76,6 +77,13 @@ #include +/* + * early reply have fixed size, respectively in privacy and integrity mode. + * so we calculate them only once. + */ +static int gss_at_reply_off_integ; +static int gss_at_reply_off_priv; + static inline int msg_last_segidx(struct lustre_msg *msg) { @@ -144,21 +152,23 @@ netobj_t *gss_swab_netobj(struct lustre_msg *msg, int segment) /* * payload should be obtained from mechanism. but currently since we * only support kerberos, we could simply use fixed value. - * krb5 header: 16 - * krb5 checksum: 20 + * krb5 "meta" data: + * - krb5 header: 16 + * - krb5 checksum: 20 + * + * for privacy mode, payload also include the cipher text which has the same + * size as plain text, plus possible confounder, padding both at maximum cipher + * block size. 
*/ #define GSS_KRB5_INTEG_MAX_PAYLOAD (40) static inline -int gss_estimate_payload(struct gss_ctx *mechctx, int msgsize, int privacy) +int gss_mech_payload(struct gss_ctx *mechctx, int msgsize, int privacy) { - if (privacy) { - /* we suppose max cipher block size is 16 bytes. here we - * add 16 for confounder and 16 for padding. */ - return GSS_KRB5_INTEG_MAX_PAYLOAD + msgsize + 16 + 16 + 16; - } else { + if (privacy) + return GSS_KRB5_INTEG_MAX_PAYLOAD + 16 + 16 + 16 + msgsize; + else return GSS_KRB5_INTEG_MAX_PAYLOAD; - } } /* @@ -575,11 +585,10 @@ exit: * cred APIs * ***************************************/ -static inline -int gss_cli_payload(struct ptlrpc_cli_ctx *ctx, - int msgsize, int privacy) +static inline int gss_cli_payload(struct ptlrpc_cli_ctx *ctx, + int msgsize, int privacy) { - return gss_estimate_payload(NULL, msgsize, privacy); + return gss_mech_payload(NULL, msgsize, privacy); } int gss_cli_ctx_match(struct ptlrpc_cli_ctx *ctx, struct vfs_cred *vcred) @@ -731,20 +740,23 @@ int gss_cli_ctx_verify(struct ptlrpc_cli_ctx *ctx, { struct gss_cli_ctx *gctx; struct gss_header *ghdr, *reqhdr; - struct lustre_msg *msg = req->rq_repbuf; + struct lustre_msg *msg = req->rq_repdata; __u32 major; - int rc = 0; + int pack_bulk, early = 0, rc = 0; ENTRY; LASSERT(req->rq_cli_ctx == ctx); LASSERT(msg); - req->rq_repdata_len = req->rq_nob_received; gctx = container_of(ctx, struct gss_cli_ctx, gc_base); + if ((char *) msg < req->rq_repbuf || + (char *) msg >= req->rq_repbuf + req->rq_repbuf_len) + early = 1; + /* special case for context negotiation, rq_repmsg/rq_replen actually - * are not used currently. */ - if (req->rq_ctx_init) { + * are not used currently. 
but early reply always be treated normally */ + if (req->rq_ctx_init && !early) { req->rq_repmsg = lustre_msg_buf(msg, 1, 0); req->rq_replen = msg->lm_buflens[1]; RETURN(0); @@ -773,8 +785,9 @@ int gss_cli_ctx_verify(struct ptlrpc_cli_ctx *ctx, switch (ghdr->gh_proc) { case PTLRPC_GSS_PROC_DATA: - if (!equi(req->rq_pack_bulk == 1, - ghdr->gh_flags & LUSTRE_GSS_PACK_BULK)) { + pack_bulk = ghdr->gh_flags & LUSTRE_GSS_PACK_BULK; + + if (!early && !equi(req->rq_pack_bulk == 1, pack_bulk)) { CERROR("%s bulk flag in reply\n", req->rq_pack_bulk ? "missing" : "unexpected"); RETURN(-EPROTO); @@ -799,11 +812,20 @@ int gss_cli_ctx_verify(struct ptlrpc_cli_ctx *ctx, if (major != GSS_S_COMPLETE) RETURN(-EPERM); - req->rq_repmsg = lustre_msg_buf(msg, 1, 0); - req->rq_replen = msg->lm_buflens[1]; + if (early && reqhdr->gh_svc == SPTLRPC_SVC_NULL) { + __u32 cksum; - if (req->rq_pack_bulk) { - /* FIXME */ + cksum = crc32_le(!(__u32) 0, + lustre_msg_buf(msg, 1, 0), + lustre_msg_buflen(msg, 1)); + if (cksum != msg->lm_cksum) { + CWARN("early reply checksum mismatch: " + "%08x != %08x\n", cksum, msg->lm_cksum); + RETURN(-EPROTO); + } + } + + if (pack_bulk) { /* bulk checksum is right after the lustre msg */ if (msg->lm_bufcount < 3) { CERROR("Invalid reply bufcount %u\n", @@ -812,10 +834,22 @@ int gss_cli_ctx_verify(struct ptlrpc_cli_ctx *ctx, } rc = bulk_sec_desc_unpack(msg, 2); + if (rc) { + CERROR("unpack bulk desc: %d\n", rc); + RETURN(rc); + } } + + req->rq_repmsg = lustre_msg_buf(msg, 1, 0); + req->rq_replen = msg->lm_buflens[1]; break; case PTLRPC_GSS_PROC_ERR: - rc = gss_cli_ctx_handle_err_notify(ctx, req, ghdr); + if (early) { + CERROR("server return error with early reply\n"); + rc = -EPROTO; + } else { + rc = gss_cli_ctx_handle_err_notify(ctx, req, ghdr); + } break; default: CERROR("unknown gss proc %d\n", ghdr->gh_proc); @@ -947,16 +981,22 @@ int gss_cli_ctx_unseal(struct ptlrpc_cli_ctx *ctx, { struct gss_cli_ctx *gctx; struct gss_header *ghdr; - int msglen, rc; + struct 
lustre_msg *msg = req->rq_repdata; + int msglen, pack_bulk, early = 0, rc; __u32 major; ENTRY; - LASSERT(req->rq_repbuf); LASSERT(req->rq_cli_ctx == ctx); + LASSERT(req->rq_ctx_init == 0); + LASSERT(msg); gctx = container_of(ctx, struct gss_cli_ctx, gc_base); - ghdr = gss_swab_header(req->rq_repbuf, 0); + if ((char *) msg < req->rq_repbuf || + (char *) msg >= req->rq_repbuf + req->rq_repbuf_len) + early = 1; + + ghdr = gss_swab_header(msg, 0); if (ghdr == NULL) { CERROR("can't decode gss header\n"); RETURN(-EPROTO); @@ -971,49 +1011,52 @@ int gss_cli_ctx_unseal(struct ptlrpc_cli_ctx *ctx, switch (ghdr->gh_proc) { case PTLRPC_GSS_PROC_DATA: - if (!equi(req->rq_pack_bulk == 1, - ghdr->gh_flags & LUSTRE_GSS_PACK_BULK)) { + pack_bulk = ghdr->gh_flags & LUSTRE_GSS_PACK_BULK; + + if (!early && !equi(req->rq_pack_bulk == 1, pack_bulk)) { CERROR("%s bulk flag in reply\n", req->rq_pack_bulk ? "missing" : "unexpected"); RETURN(-EPROTO); } - if (lustre_msg_swabbed(req->rq_repbuf)) + if (lustre_msg_swabbed(msg)) gss_header_swabber(ghdr); - major = gss_unseal_msg(gctx->gc_mechctx, req->rq_repbuf, - &msglen, req->rq_repbuf_len); + /* use rq_repdata_len as buffer size, which assume unseal + * doesn't need extra memory space. 
for precise control, we'd + * better calculate out actual buffer size as + * (repbuf_len - offset - repdata_len) */ + major = gss_unseal_msg(gctx->gc_mechctx, msg, + &msglen, req->rq_repdata_len); if (major != GSS_S_COMPLETE) { rc = -EPERM; break; } - if (lustre_unpack_msg(req->rq_repbuf, msglen)) { + if (lustre_unpack_msg(msg, msglen)) { CERROR("Failed to unpack after decryption\n"); RETURN(-EPROTO); } - req->rq_repdata_len = msglen; - if (req->rq_repbuf->lm_bufcount < 1) { + if (msg->lm_bufcount < 1) { CERROR("Invalid reply buffer: empty\n"); RETURN(-EPROTO); } - if (req->rq_pack_bulk) { - if (req->rq_repbuf->lm_bufcount < 2) { - CERROR("Too few request buffer segments %d\n", - req->rq_repbuf->lm_bufcount); + if (pack_bulk) { + if (msg->lm_bufcount < 2) { + CERROR("bufcount %u: missing bulk sec desc\n", + msg->lm_bufcount); RETURN(-EPROTO); } /* bulk checksum is the last segment */ - if (bulk_sec_desc_unpack(req->rq_repbuf, - req->rq_repbuf->lm_bufcount-1)) + if (bulk_sec_desc_unpack(msg, msg->lm_bufcount-1)) RETURN(-EPROTO); } - req->rq_repmsg = lustre_msg_buf(req->rq_repbuf, 0, 0); - req->rq_replen = req->rq_repbuf->lm_buflens[0]; + req->rq_repmsg = lustre_msg_buf(msg, 0, 0); + req->rq_replen = msg->lm_buflens[0]; rc = 0; break; @@ -1438,8 +1481,9 @@ int gss_alloc_repbuf_intg(struct ptlrpc_sec *sec, struct ptlrpc_request *req, int svc, int msgsize) { - int txtsize; - int buflens[4], bufcnt = 2; + int txtsize; + int buflens[4], bufcnt = 2; + int alloc_size; /* * on-wire data layout: @@ -1476,7 +1520,12 @@ int gss_alloc_repbuf_intg(struct ptlrpc_sec *sec, else if (svc != SPTLRPC_SVC_NULL) buflens[bufcnt++] = gss_cli_payload(req->rq_cli_ctx, txtsize,0); - return do_alloc_repbuf(req, lustre_msg_size_v2(bufcnt, buflens)); + alloc_size = lustre_msg_size_v2(bufcnt, buflens); + + /* add space for early reply */ + alloc_size += gss_at_reply_off_integ; + + return do_alloc_repbuf(req, alloc_size); } static @@ -1484,8 +1533,9 @@ int gss_alloc_repbuf_priv(struct ptlrpc_sec 
*sec, struct ptlrpc_request *req, int msgsize) { - int txtsize; - int buflens[3], bufcnt; + int txtsize; + int buflens[3], bufcnt; + int alloc_size; /* Inner (clear) buffers * - lustre message @@ -1514,7 +1564,12 @@ int gss_alloc_repbuf_priv(struct ptlrpc_sec *sec, buflens[1] = gss_cli_payload(req->rq_cli_ctx, buflens[0], 0); buflens[2] = gss_cli_payload(req->rq_cli_ctx, txtsize, 1); - return do_alloc_repbuf(req, lustre_msg_size_v2(bufcnt, buflens)); + alloc_size = lustre_msg_size_v2(bufcnt, buflens); + + /* add space for early reply */ + alloc_size += gss_at_reply_off_priv; + + return do_alloc_repbuf(req, alloc_size); } int gss_alloc_repbuf(struct ptlrpc_sec *sec, @@ -1853,6 +1908,17 @@ int gss_svc_sign(struct ptlrpc_request *req, RETURN(rc); rs->rs_repdata_len = rc; + + if (likely(req->rq_packed_final)) { + req->rq_reply_off = gss_at_reply_off_integ; + } else { + if (svc == SPTLRPC_SVC_NULL) + rs->rs_repbuf->lm_cksum = crc32_le(!(__u32) 0, + lustre_msg_buf(rs->rs_repbuf, 1, 0), + lustre_msg_buflen(rs->rs_repbuf, 1)); + req->rq_reply_off = 0; + } + RETURN(0); } @@ -1871,7 +1937,7 @@ int gss_pack_err_notify(struct ptlrpc_request *req, __u32 major, __u32 minor) grctx->src_err_notify = 1; grctx->src_reserve_len = 0; - rc = lustre_pack_reply_v2(req, 1, &replen, NULL); + rc = lustre_pack_reply_v2(req, 1, &replen, NULL, 0); if (rc) { CERROR("could not pack reply, err %d\n", rc); RETURN(rc); @@ -2366,19 +2432,23 @@ void gss_svc_invalidate_ctx(struct ptlrpc_svc_ctx *svc_ctx) } static inline -int gss_svc_payload(struct gss_svc_reqctx *grctx, int msgsize, int privacy) +int gss_svc_payload(struct gss_svc_reqctx *grctx, int early, + int msgsize, int privacy) { - if (gss_svc_reqctx_is_special(grctx)) + /* we should treat early reply normally, but which is actually sharing + * the same ctx with original request, so in this case we should + * ignore the special ctx's special flags */ + if (early == 0 && gss_svc_reqctx_is_special(grctx)) return grctx->src_reserve_len; - return 
gss_estimate_payload(NULL, msgsize, privacy); + return gss_mech_payload(NULL, msgsize, privacy); } int gss_svc_alloc_rs(struct ptlrpc_request *req, int msglen) { struct gss_svc_reqctx *grctx; struct ptlrpc_reply_state *rs; - int privacy, svc, bsd_off = 0; + int early, privacy, svc, bsd_off = 0; int ibuflens[2], ibufcnt = 0; int buflens[4], bufcnt; int txtsize, wmsg_size, rs_size; @@ -2392,9 +2462,10 @@ int gss_svc_alloc_rs(struct ptlrpc_request *req, int msglen) } svc = RPC_FLVR_SVC(req->rq_flvr.sf_rpc); + early = (req->rq_packed_final == 0); grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); - if (gss_svc_reqctx_is_special(grctx)) + if (!early && gss_svc_reqctx_is_special(grctx)) privacy = 0; else privacy = (svc == SPTLRPC_SVC_PRIV); @@ -2419,8 +2490,8 @@ int gss_svc_alloc_rs(struct ptlrpc_request *req, int msglen) /* wrapper buffer */ bufcnt = 3; buflens[0] = PTLRPC_GSS_HEADER_SIZE; - buflens[1] = gss_svc_payload(grctx, buflens[0], 0); - buflens[2] = gss_svc_payload(grctx, txtsize, 1); + buflens[1] = gss_svc_payload(grctx, early, buflens[0], 0); + buflens[2] = gss_svc_payload(grctx, early, txtsize, 1); } else { bufcnt = 2; buflens[0] = PTLRPC_GSS_HEADER_SIZE; @@ -2442,9 +2513,10 @@ int gss_svc_alloc_rs(struct ptlrpc_request *req, int msglen) bufcnt++; } - if (gss_svc_reqctx_is_special(grctx) || + if ((!early && gss_svc_reqctx_is_special(grctx)) || svc != SPTLRPC_SVC_NULL) - buflens[bufcnt++] = gss_svc_payload(grctx, txtsize, 0); + buflens[bufcnt++] = gss_svc_payload(grctx, early, + txtsize, 0); } wmsg_size = lustre_msg_size_v2(bufcnt, buflens); @@ -2518,7 +2590,7 @@ int gss_svc_seal(struct ptlrpc_request *req, msgobj.data = (__u8 *) rs->rs_repbuf; /* allocate temporary cipher buffer */ - cipher_buflen = gss_estimate_payload(gctx->gsc_mechctx, msglen, 1); + cipher_buflen = gss_mech_payload(gctx->gsc_mechctx, msglen, 1); OBD_ALLOC(cipher_buf, cipher_buflen); if (!cipher_buf) RETURN(-ENOMEM); @@ -2536,12 +2608,14 @@ int gss_svc_seal(struct ptlrpc_request *req, /* we are 
about to override data at rs->rs_repbuf, nullify pointers * to which to catch further illegal usage. */ - grctx->src_repbsd = NULL; - grctx->src_repbsd_size = 0; + if (req->rq_pack_bulk) { + grctx->src_repbsd = NULL; + grctx->src_repbsd_size = 0; + } /* now the real wire data */ buflens[0] = PTLRPC_GSS_HEADER_SIZE; - buflens[1] = gss_estimate_payload(gctx->gsc_mechctx, buflens[0], 0); + buflens[1] = gss_mech_payload(gctx->gsc_mechctx, buflens[0], 0); buflens[2] = cipher_obj.len; LASSERT(lustre_msg_size_v2(3, buflens) <= rs->rs_repbuf_len); @@ -2579,6 +2653,12 @@ int gss_svc_seal(struct ptlrpc_request *req, rs->rs_repdata_len = lustre_shrink_msg(rs->rs_repbuf, 2, cipher_obj.len, 0); + /* reply offset */ + if (likely(req->rq_packed_final)) + req->rq_reply_off = gss_at_reply_off_priv; + else + req->rq_reply_off = 0; + /* to catch upper layer's further access */ rs->rs_msg = NULL; req->rq_repmsg = NULL; @@ -2594,15 +2674,22 @@ int gss_svc_authorize(struct ptlrpc_request *req) { struct ptlrpc_reply_state *rs = req->rq_reply_state; struct gss_svc_reqctx *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); - struct gss_wire_ctx *gw; - int rc; + struct gss_wire_ctx *gw = &grctx->src_wirectx; + int early, rc; ENTRY; - if (gss_svc_reqctx_is_special(grctx)) + early = (req->rq_packed_final == 0); + + if (!early && gss_svc_reqctx_is_special(grctx)) { + LASSERT(rs->rs_repdata_len != 0); + + req->rq_reply_off = gss_at_reply_off_integ; RETURN(0); + } - gw = &grctx->src_wirectx; - if (gw->gw_proc != PTLRPC_GSS_PROC_DATA && + /* early reply could happen in many cases */ + if (!early && + gw->gw_proc != PTLRPC_GSS_PROC_DATA && gw->gw_proc != PTLRPC_GSS_PROC_DESTROY) { CERROR("proc %d not support\n", gw->gw_proc); RETURN(-EINVAL); @@ -2636,10 +2723,6 @@ void gss_svc_free_rs(struct ptlrpc_reply_state *rs) LASSERT(rs->rs_svc_ctx); grctx = container_of(rs->rs_svc_ctx, struct gss_svc_reqctx, src_base); - /* paranoid, maybe not necessary */ - grctx->src_reqbsd = NULL; - grctx->src_repbsd = NULL; 
- gss_svc_reqctx_decref(grctx); rs->rs_svc_ctx = NULL; @@ -2706,6 +2789,23 @@ err_out: return -ENOMEM; } +static void gss_init_at_reply_offset(void) +{ + int buflens[3], clearsize; + + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + buflens[1] = lustre_msg_early_size(); + buflens[2] = gss_cli_payload(NULL, buflens[1], 0); + gss_at_reply_off_integ = lustre_msg_size_v2(3, buflens); + + buflens[0] = lustre_msg_early_size(); + clearsize = lustre_msg_size_v2(1, buflens); + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + buflens[1] = gss_cli_payload(NULL, clearsize, 0); + buflens[2] = gss_cli_payload(NULL, clearsize, 1); + gss_at_reply_off_priv = lustre_msg_size_v2(3, buflens); +} + int __init sptlrpc_gss_init(void) { int rc; @@ -2739,6 +2839,8 @@ int __init sptlrpc_gss_init(void) goto out_keyring; #endif + gss_init_at_reply_offset(); + return 0; #ifdef HAVE_GSS_PIPEFS diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index 345b2b6..7214a05 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -199,6 +199,8 @@ void ptlrpc_deactivate_import(struct obd_import *imp) */ void ptlrpc_invalidate_import(struct obd_import *imp) { + struct list_head *tmp, *n; + struct ptlrpc_request *req; struct l_wait_info lwi; int rc; @@ -216,19 +218,19 @@ void ptlrpc_invalidate_import(struct obd_import *imp) LASSERT(imp->imp_invalid); - /* wait for all requests to error out and call completion callbacks */ - lwi = LWI_TIMEOUT_INTERVAL(cfs_timeout_cap(cfs_time_seconds(obd_timeout)), - HZ, NULL, NULL); + /* wait for all requests to error out and call completion callbacks. + Cap it at obd_timeout -- these should all have been locally + cancelled by ptlrpc_abort_inflight. 
*/ + lwi = LWI_TIMEOUT_INTERVAL( + cfs_timeout_cap(cfs_time_seconds(obd_timeout)), + cfs_time_seconds(1), NULL, NULL); rc = l_wait_event(imp->imp_recovery_waitq, (atomic_read(&imp->imp_inflight) == 0), &lwi); if (rc) { - struct list_head *tmp, *n; - struct ptlrpc_request *req; - CERROR("%s: rc = %d waiting for callback (%d != 0)\n", - obd2cli_tgt(imp->imp_obd), rc, - atomic_read(&imp->imp_inflight)); + obd2cli_tgt(imp->imp_obd), rc, + atomic_read(&imp->imp_inflight)); spin_lock(&imp->imp_lock); list_for_each_safe(tmp, n, &imp->imp_sending_list) { req = list_entry(tmp, struct ptlrpc_request, rq_list); @@ -325,6 +327,7 @@ static int import_select_connection(struct obd_import *imp) { struct obd_import_conn *imp_conn = NULL, *conn; struct obd_export *dlmexp; + int tried_all = 1; ENTRY; spin_lock(&imp->imp_lock); @@ -341,36 +344,60 @@ static int import_select_connection(struct obd_import *imp) imp->imp_obd->obd_name, libcfs_nid2str(conn->oic_conn->c_peer.nid), conn->oic_last_attempt); - /* Throttle the reconnect rate to once per RECONNECT_INTERVAL */ - if (cfs_time_before_64(conn->oic_last_attempt + - RECONNECT_INTERVAL * HZ, - cfs_time_current_64())) { - /* If we have never tried this connection since the - the last successful attempt, go with this one */ - if (cfs_time_beforeq_64(conn->oic_last_attempt, - imp->imp_last_success_conn)) { - imp_conn = conn; - break; - } + /* Don't thrash connections */ + if (cfs_time_before_64(cfs_time_current_64(), + conn->oic_last_attempt + + cfs_time_seconds(CONNECTION_SWITCH_MIN))) { + continue; + } - /* Both of these connections have already been tried - since the last successful connection; just choose the - least recently used */ - if (!imp_conn) - imp_conn = conn; - else if (cfs_time_before_64(conn->oic_last_attempt, - imp_conn->oic_last_attempt)) - imp_conn = conn; + /* If we have not tried this connection since the + the last successful attempt, go with this one */ + if ((conn->oic_last_attempt == 0) || + 
cfs_time_beforeq_64(conn->oic_last_attempt, + imp->imp_last_success_conn)) { + imp_conn = conn; + tried_all = 0; + break; } + + /* If all of the connections have already been tried + since the last successful connection; just choose the + least recently used */ + if (!imp_conn) + imp_conn = conn; + else if (cfs_time_before_64(conn->oic_last_attempt, + imp_conn->oic_last_attempt)) + imp_conn = conn; } /* if not found, simply choose the current one */ if (!imp_conn) { LASSERT(imp->imp_conn_current); imp_conn = imp->imp_conn_current; + tried_all = 0; } LASSERT(imp_conn->oic_conn); + /* If we've tried everything, and we're back to the beginning of the + list, increase our timeout and try again. It will be reset when + we do finally connect. (FIXME: really we should wait for all network + state associated with the last connection attempt to drain before + trying to reconnect on it.) */ + if (tried_all && (imp->imp_conn_list.next == &imp_conn->oic_item) && + !imp->imp_recon_bk /* not retrying */) { + if (at_get(&imp->imp_at.iat_net_latency) < + CONNECTION_SWITCH_MAX) { + at_add(&imp->imp_at.iat_net_latency, + at_get(&imp->imp_at.iat_net_latency) + + CONNECTION_SWITCH_INC); + } + LASSERT(imp_conn->oic_last_attempt); + CWARN("%s: tried all connections, increasing latency to %ds\n", + imp->imp_obd->obd_name, + at_get(&imp->imp_at.iat_net_latency)); + } + imp_conn->oic_last_attempt = cfs_time_current_64(); /* switch connection, don't mind if it's same as the current one */ @@ -509,6 +536,8 @@ int ptlrpc_connect_import(struct obd_import *imp, char *new_uuid) /* Reset connect flags to the originally requested flags, in case * the server is updated on-the-fly we will get the new features. 
*/ imp->imp_connect_data.ocd_connect_flags = imp->imp_connect_flags_orig; + imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT; + rc = obd_reconnect(NULL, imp->imp_obd->obd_self_export, obd, &obd->obd_uuid, &imp->imp_connect_data); if (rc) @@ -548,15 +577,12 @@ int ptlrpc_connect_import(struct obd_import *imp, char *new_uuid) spin_lock(&imp->imp_lock); imp->imp_replayable = 1; spin_unlock(&imp->imp_lock); - /* On an initial connect, we don't know which one of a - failover server pair is up. Don't wait long. */ -#ifdef CRAY_XT3 - request->rq_timeout = max((int)(obd_timeout / 2), 5); -#else - request->rq_timeout = max((int)(obd_timeout / 20), 5); -#endif lustre_msg_add_op_flags(request->rq_reqmsg, MSG_CONNECT_INITIAL); + if (AT_OFF) + /* AT will use INITIAL_CONNECT_TIMEOUT the first + time, adaptive after that. */ + request->rq_timeout = INITIAL_CONNECT_TIMEOUT; } if (set_transno) @@ -710,6 +736,8 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request, } if (imp->imp_invalid) { + CDEBUG(D_HA, "%s: reconnected but import is invalid; " + "marking evicted\n", imp->imp_obd->obd_name); IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED); } else if (MSG_CONNECT_RECOVERING & msg_flags) { CDEBUG(D_HA, "%s: reconnected to %s during replay\n", @@ -731,6 +759,8 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request, imp->imp_last_replay_transno = 0; IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY); } else { + DEBUG_REQ(D_HA, request, "%s: evicting (reconnect/recover flags" + " not set: %x)", imp->imp_obd->obd_name, msg_flags); imp->imp_remote_handle = *lustre_msg_get_handle(request->rq_repmsg); IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED); @@ -885,6 +915,20 @@ finish: imp->imp_obd->obd_namespace->ns_orig_connect_flags = ocd->ocd_connect_flags; + if ((ocd->ocd_connect_flags & OBD_CONNECT_AT) && + (imp->imp_msg_magic == LUSTRE_MSG_MAGIC_V2)) + /* We need a per-message support flag, because + a. 
we don't know if the incoming connect reply + supports AT or not (in reply_in_callback) + until we unpack it. + b. failovered server means export and flags are gone + (in ptlrpc_send_reply). + Can only be set when we know AT is supported at + both ends */ + imp->imp_msghdr_flags |= MSGHDR_AT_SUPPORT; + else + imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT; + LASSERT((cli->cl_max_pages_per_rpc <= PTLRPC_MAX_BRW_PAGES) && (cli->cl_max_pages_per_rpc > 0)); } @@ -1150,12 +1194,19 @@ int ptlrpc_disconnect_import(struct obd_import *imp, int noclose) if (ptlrpc_import_in_recovery(imp)) { struct l_wait_info lwi; cfs_duration_t timeout; - if (imp->imp_server_timeout) - timeout = cfs_time_seconds(obd_timeout / 2); - else - timeout = cfs_time_seconds(obd_timeout); - - timeout = MAX(timeout * HZ, 1); + + + if (AT_OFF) { + if (imp->imp_server_timeout) + timeout = cfs_time_seconds(obd_timeout / 2); + else + timeout = cfs_time_seconds(obd_timeout); + } else { + int idx = import_at_get_index(imp, + imp->imp_client->cli_request_portal); + timeout = cfs_time_seconds( + at_get(&imp->imp_at.iat_service_estimate[idx])); + } lwi = LWI_TIMEOUT_INTR(cfs_timeout_cap(timeout), back_to_sleep, LWI_ON_SIGNAL_NOOP, NULL); @@ -1177,11 +1228,19 @@ int ptlrpc_disconnect_import(struct obd_import *imp, int noclose) * it fails. We can get through the above with a down server * if the client doesn't know the server is gone yet. */ req->rq_no_resend = 1; -#ifdef CRAY_XT3 - req->rq_timeout = obd_timeout / 3; + +#ifndef CRAY_XT3 + /* We want client umounts to happen quickly, no matter the + server state... */ + req->rq_timeout = min_t(int, req->rq_timeout, + INITIAL_CONNECT_TIMEOUT); #else - req->rq_timeout = 5; + /* ... but we always want liblustre clients to nicely + disconnect, so only use the adaptive value. 
*/ + if (AT_OFF) + req->rq_timeout = obd_timeout / 3; #endif + IMPORT_SET_STATE(imp, LUSTRE_IMP_CONNECTING); req->rq_send_state = LUSTRE_IMP_CONNECTING; ptlrpc_request_set_replen(req); @@ -1198,9 +1257,132 @@ out: IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CLOSED); memset(&imp->imp_remote_handle, 0, sizeof(imp->imp_remote_handle)); imp->imp_conn_cnt = 0; + /* Try all connections in the future - bz 12758 */ imp->imp_last_recon = 0; spin_unlock(&imp->imp_lock); RETURN(rc); } + +/* Adaptive Timeout utils */ +extern unsigned int at_min, at_max, at_history; + +/* Bin into timeslices using AT_BINS bins. + This gives us a max of the last binlimit*AT_BINS secs without the storage, + but still smoothing out a return to normalcy from a slow response. + (E.g. remember the maximum latency in each minute of the last 4 minutes.) */ +int at_add(struct adaptive_timeout *at, unsigned int val) +{ + unsigned int old = at->at_current; + time_t now = cfs_time_current_sec(); + time_t binlimit = max_t(time_t, at_history / AT_BINS, 1); + + LASSERT(at); +#if 0 + CDEBUG(D_INFO, "add %u to %p time=%lu v=%u (%u %u %u %u)\n", + val, at, now - at->at_binstart, at->at_current, + at->at_hist[0], at->at_hist[1], at->at_hist[2], at->at_hist[3]); +#endif + if (val == 0) + /* 0's don't count, because we never want our timeout to + drop to 0, and because 0 could mean an error */ + return 0; + + spin_lock(&at->at_lock); + + if (unlikely(at->at_binstart == 0)) { + /* Special case to remove default from history */ + at->at_current = val; + at->at_worst_ever = val; + at->at_worst_time = now; + at->at_hist[0] = val; + at->at_binstart = now; + } else if (now - at->at_binstart < binlimit ) { + /* in bin 0 */ + at->at_hist[0] = max(val, at->at_hist[0]); + at->at_current = max(val, at->at_current); + } else { + int i, shift; + unsigned int maxv = val; + /* move bins over */ + shift = (now - at->at_binstart) / binlimit; + LASSERT(shift > 0); + for(i = AT_BINS - 1; i >= 0; i--) { + if (i >= shift) { + 
at->at_hist[i] = at->at_hist[i - shift]; + maxv = max(maxv, at->at_hist[i]); + } else { + at->at_hist[i] = 0; + } + } + at->at_hist[0] = val; + at->at_current = maxv; + at->at_binstart += shift * binlimit; + } + + if (at->at_current > at->at_worst_ever) { + at->at_worst_ever = at->at_current; + at->at_worst_time = now; + } + + if (at->at_flags & AT_FLG_NOHIST) + /* Only keep last reported val; keeping the rest of the history + for proc only */ + at->at_current = val; + + if (at_max > 0) + at->at_current = min(at->at_current, at_max); + at->at_current = max(at->at_current, at_min); + +#if 0 + if (at->at_current != old) + CDEBUG(D_ADAPTTO, "AT %p change: old=%u new=%u delta=%d " + "(val=%u) hist %u %u %u %u\n", at, + old, at->at_current, at->at_current - old, val, + at->at_hist[0], at->at_hist[1], at->at_hist[2], + at->at_hist[3]); +#endif + + /* if we changed, report the old value */ + old = (at->at_current != old) ? old : 0; + + spin_unlock(&at->at_lock); + return old; +} + +/* Find the imp_at index for a given portal; assign if space available */ +int import_at_get_index(struct obd_import *imp, int portal) +{ + struct imp_at *at = &imp->imp_at; + int i; + + for (i = 0; i < IMP_AT_MAX_PORTALS; i++) { + if (at->iat_portal[i] == portal) + return i; + if (at->iat_portal[i] == 0) + /* unused */ + break; + } + + /* Not found in list, add it under a lock */ + spin_lock(&imp->imp_lock); + + /* Check unused under lock */ + for (; i < IMP_AT_MAX_PORTALS; i++) { + if (at->iat_portal[i] == portal) + goto out; + if (at->iat_portal[i] == 0) + /* unused */ + break; + } + + /* Not enough portals? 
*/ + LASSERT(i < IMP_AT_MAX_PORTALS); + + at->iat_portal[i] = portal; +out: + spin_unlock(&imp->imp_lock); + return i; +} + diff --git a/lustre/ptlrpc/lproc_ptlrpc.c b/lustre/ptlrpc/lproc_ptlrpc.c index 29c4cfc..20a5673 100644 --- a/lustre/ptlrpc/lproc_ptlrpc.c +++ b/lustre/ptlrpc/lproc_ptlrpc.c @@ -183,6 +183,8 @@ void ptlrpc_lprocfs_register(struct proc_dir_entry *root, char *dir, svc_counter_config, "req_qdepth", "reqs"); lprocfs_counter_init(svc_stats, PTLRPC_REQACTIVE_CNTR, svc_counter_config, "req_active", "reqs"); + lprocfs_counter_init(svc_stats, PTLRPC_TIMEOUT, + svc_counter_config, "req_timeout", "sec"); lprocfs_counter_init(svc_stats, PTLRPC_REQBUF_AVAIL_CNTR, svc_counter_config, "reqbuf_avail", "bufs"); for (i = 0; i < EXTRA_LAST_OPC; i++) { @@ -359,6 +361,36 @@ ptlrpc_lprocfs_svc_req_history_next(struct seq_file *s, return srhi; } +/* common ost/mdt srv_request_history_print_fn */ +void target_print_req(void *seq_file, struct ptlrpc_request *req) +{ + /* Called holding srv_lock with irqs disabled. + * Print specific req contents and a newline. + * CAVEAT EMPTOR: check request message length before printing!!! + * You might have received any old crap so you must be just as + * careful here as the service's request parser!!! 
*/ + struct seq_file *sf = seq_file; + + switch (req->rq_phase) { + case RQ_PHASE_NEW: + /* still awaiting a service thread's attention, or rejected + * because the generic request message didn't unpack */ + seq_printf(sf, "\n"); + break; + case RQ_PHASE_INTERPRET: + /* being handled, so basic msg swabbed, and opc is valid + * but racing with mds_handle() */ + case RQ_PHASE_COMPLETE: + /* been handled by mds_handle() reply state possibly still + * volatile */ + seq_printf(sf, "opc %d\n", lustre_msg_get_opc(req->rq_reqmsg)); + break; + default: + DEBUG_REQ(D_ERROR, req, "bad phase %d", req->rq_phase); + } +} +EXPORT_SYMBOL(target_print_req); + static int ptlrpc_lprocfs_svc_req_history_show(struct seq_file *s, void *iter) { struct ptlrpc_service *svc = s->private; @@ -379,11 +411,13 @@ static int ptlrpc_lprocfs_svc_req_history_show(struct seq_file *s, void *iter) * must be just as careful as the service's request * parser. Currently I only print stuff here I know is OK * to look at coz it was set up in request_in_callback()!!! 
*/ - seq_printf(s, LPD64":%s:%s:"LPD64":%d:%s ", + seq_printf(s, LPD64":%s:%s:x"LPD64":%d:%s:%ld:%lds(%+lds) ", req->rq_history_seq, libcfs_nid2str(req->rq_self), libcfs_id2str(req->rq_peer), req->rq_xid, - req->rq_reqlen,ptlrpc_rqphase2str(req)); - + req->rq_reqlen, ptlrpc_rqphase2str(req), + req->rq_arrival_time.tv_sec, + req->rq_sent - req->rq_arrival_time.tv_sec, + req->rq_sent - req->rq_deadline); if (svc->srv_request_history_print_fn == NULL) seq_printf(s, "\n"); else @@ -420,6 +454,34 @@ ptlrpc_lprocfs_svc_req_history_open(struct inode *inode, struct file *file) return 0; } +/* See also lprocfs_rd_timeouts */ +static int ptlrpc_lprocfs_rd_timeouts(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct ptlrpc_service *svc = data; + unsigned int cur, worst; + time_t worstt; + struct dhms ts; + int rc = 0; + + *eof = 1; + cur = at_get(&svc->srv_at_estimate); + worst = svc->srv_at_estimate.at_worst_ever; + worstt = svc->srv_at_estimate.at_worst_time; + s2dhms(&ts, cfs_time_current_sec() - worstt); + if (AT_OFF) + rc += snprintf(page + rc, count - rc, + "adaptive timeouts off, using obd_timeout %u\n", + obd_timeout); + rc += snprintf(page + rc, count - rc, + "%10s : cur %3u worst %3u (at %ld, "DHMS_FMT" ago) ", + "service", cur, worst, worstt, + DHMS_VARS(&ts)); + rc = lprocfs_at_hist_helper(page, count, rc, + &svc->srv_at_estimate); + return rc; +} + void ptlrpc_lprocfs_register_service(struct proc_dir_entry *entry, struct ptlrpc_service *svc) { @@ -432,6 +494,9 @@ void ptlrpc_lprocfs_register_service(struct proc_dir_entry *entry, .write_fptr = ptlrpc_lprocfs_write_req_history_max, .read_fptr = ptlrpc_lprocfs_read_req_history_max, .data = svc}, + {.name = "timeouts", + .read_fptr = ptlrpc_lprocfs_rd_timeouts, + .data = svc}, {NULL} }; static struct file_operations req_history_fops = { diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c index 4f71dcb..d7256c2 100644 --- a/lustre/ptlrpc/niobuf.c +++ b/lustre/ptlrpc/niobuf.c @@ 
-35,7 +35,8 @@ static int ptl_send_buf (lnet_handle_md_t *mdh, void *base, int len, lnet_ack_req_t ack, struct ptlrpc_cb_id *cbid, - struct ptlrpc_connection *conn, int portal, __u64 xid) + struct ptlrpc_connection *conn, int portal, __u64 xid, + unsigned int offset) { int rc; lnet_md_t md; @@ -64,11 +65,11 @@ static int ptl_send_buf (lnet_handle_md_t *mdh, void *base, int len, RETURN (-ENOMEM); } - CDEBUG(D_NET, "Sending %d bytes to portal %d, xid "LPD64"\n", - len, portal, xid); + CDEBUG(D_NET, "Sending %d bytes to portal %d, xid "LPD64", offset %u\n", + len, portal, xid, offset); rc = LNetPut (conn->c_self, *mdh, ack, - conn->c_peer, portal, xid, 0, 0); + conn->c_peer, portal, xid, offset, 0); if (unlikely(rc != 0)) { int rc2; /* We're going to get an UNLINK event when I unlink below, @@ -302,7 +303,47 @@ void ptlrpc_unregister_bulk (struct ptlrpc_request *req) } } -int ptlrpc_send_reply (struct ptlrpc_request *req, int may_be_difficult) +static void ptlrpc_at_set_reply(struct ptlrpc_request *req, int flags) +{ + struct ptlrpc_service *svc = req->rq_rqbd->rqbd_service; + int service_time = max_t(int, cfs_time_current_sec() - + req->rq_arrival_time.tv_sec, 1); + + if (!(flags & PTLRPC_REPLY_EARLY) && + (req->rq_type != PTL_RPC_MSG_ERR)) { + /* early replies and errors don't count toward our service + time estimate */ + int oldse = at_add(&svc->srv_at_estimate, service_time); + if (oldse != 0) + DEBUG_REQ(D_ADAPTTO, req, + "svc %s changed estimate from %d to %d", + svc->srv_name, oldse, + at_get(&svc->srv_at_estimate)); + } + /* Report actual service time for client latency calc */ + lustre_msg_set_service_time(req->rq_repmsg, service_time); + /* Report service time estimate for future client reqs, but report 0 + * (to be ignored by client) if it's a error reply during recovery. 
+ * (bz15815) */ + if (req->rq_type == PTL_RPC_MSG_ERR && + (req->rq_export == NULL || req->rq_export->exp_obd->obd_recovering)) + lustre_msg_set_timeout(req->rq_repmsg, 0); + else + lustre_msg_set_timeout(req->rq_repmsg, + at_get(&svc->srv_at_estimate)); + + if (req->rq_reqmsg && + !(lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) { + CDEBUG(D_ADAPTTO, "No early reply support: flags=%#x " + "req_flags=%#x magic=%d:%x/%x len=%d\n", + flags, lustre_msg_get_flags(req->rq_reqmsg), + lustre_msg_is_v1(req->rq_reqmsg), + lustre_msg_get_magic(req->rq_reqmsg), + lustre_msg_get_magic(req->rq_repmsg), req->rq_replen); + } +} + +int ptlrpc_send_reply (struct ptlrpc_request *req, int flags) { struct ptlrpc_service *svc = req->rq_rqbd->rqbd_service; struct ptlrpc_reply_state *rs = req->rq_reply_state; @@ -319,12 +360,14 @@ int ptlrpc_send_reply (struct ptlrpc_request *req, int may_be_difficult) LASSERT (req->rq_no_reply == 0); LASSERT (req->rq_reqbuf != NULL); LASSERT (rs != NULL); - LASSERT (may_be_difficult || !rs->rs_difficult); + LASSERT ((flags & PTLRPC_REPLY_MAYBE_DIFFICULT) || !rs->rs_difficult); LASSERT (req->rq_repmsg != NULL); LASSERT (req->rq_repmsg == rs->rs_msg); LASSERT (rs->rs_cb_id.cbid_fn == reply_out_callback); LASSERT (rs->rs_cb_id.cbid_arg == rs); + /* There may be no rq_export during failover */ + if (unlikely(req->rq_export && req->rq_export->exp_obd && req->rq_export->exp_obd->obd_fail)) { /* Failed obd's only send ENODEV */ @@ -344,6 +387,8 @@ int ptlrpc_send_reply (struct ptlrpc_request *req, int may_be_difficult) target_pack_pool_reply(req); + ptlrpc_at_set_reply(req, flags); + if (req->rq_export == NULL || req->rq_export->exp_connection == NULL) conn = ptlrpc_get_connection(req->rq_peer, req->rq_self, NULL); else @@ -360,14 +405,16 @@ int ptlrpc_send_reply (struct ptlrpc_request *req, int may_be_difficult) if (unlikely(rc)) goto out; + req->rq_sent = cfs_time_current_sec(); + rc = ptl_send_buf (&rs->rs_md_h, rs->rs_repbuf, 
rs->rs_repdata_len, rs->rs_difficult ? LNET_ACK_REQ : LNET_NOACK_REQ, - &rs->rs_cb_id, conn, - svc->srv_rep_portal, req->rq_xid); + &rs->rs_cb_id, conn, svc->srv_rep_portal, + req->rq_xid, req->rq_reply_off); out: if (unlikely(rc != 0)) { atomic_dec (&svc->srv_outstanding_replies); - ptlrpc_rs_decref(rs); + ptlrpc_req_drop_rs(req); } ptlrpc_put_connection(conn); return rc; @@ -441,6 +488,8 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) lustre_msg_set_type(request->rq_reqmsg, PTL_RPC_MSG_REQUEST); lustre_msg_set_conn_cnt(request->rq_reqmsg, request->rq_import->imp_conn_cnt); + lustre_msghdr_set_flags(request->rq_reqmsg, + request->rq_import->imp_msghdr_flags); rc = sptlrpc_cli_wrap_request(request); if (rc) @@ -456,10 +505,15 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) if (!noreply) { LASSERT (request->rq_replen != 0); if (request->rq_repbuf == NULL) { + LASSERT(request->rq_repdata == NULL); + LASSERT(request->rq_repmsg == NULL); rc = sptlrpc_cli_alloc_repbuf(request, request->rq_replen); if (rc) GOTO(cleanup_bulk, rc); + } else { + request->rq_repdata = NULL; + request->rq_repmsg = NULL; } rc = LNetMEAttach(request->rq_reply_portal,/*XXX FIXME bug 249*/ @@ -475,6 +529,8 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) spin_lock(&request->rq_lock); /* If the MD attach succeeds, there _will_ be a reply_in callback */ request->rq_receiving_reply = !noreply; + /* We are responsible for unlinking the reply buffer */ + request->rq_must_unlink = !noreply; /* Clear any flags that may be present from previous sends. 
*/ request->rq_replied = 0; request->rq_err = 0; @@ -487,13 +543,18 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) if (!noreply) { reply_md.start = request->rq_repbuf; reply_md.length = request->rq_repbuf_len; - reply_md.threshold = 1; - reply_md.options = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT; + /* Allow multiple early replies */ + reply_md.threshold = LNET_MD_THRESH_INF; + /* Manage remote for early replies */ + reply_md.options = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT | + LNET_MD_MANAGE_REMOTE; reply_md.user_ptr = &request->rq_reply_cbid; reply_md.eq_handle = ptlrpc_eq_h; - rc = LNetMDAttach(reply_me_h, reply_md, LNET_UNLINK, - &request->rq_reply_md_h); + /* We must see the unlink callback to unset rq_must_unlink, + so we can't auto-unlink */ + rc = LNetMDAttach(reply_me_h, reply_md, LNET_RETAIN, + &request->rq_reply_md_h); if (rc != 0) { CERROR("LNetMDAttach failed: %d\n", rc); LASSERT (rc == -ENOMEM); @@ -518,15 +579,23 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_SEND, request->rq_timeout + 5); - request->rq_sent = cfs_time_current_sec(); do_gettimeofday(&request->rq_arrival_time); + request->rq_sent = cfs_time_current_sec(); + /* We give the server rq_timeout secs to process the req, and + add the network latency for our local timeout. 
*/ + request->rq_deadline = request->rq_sent + request->rq_timeout + + ptlrpc_at_get_net_latency(request); + ptlrpc_pinger_sending_on_import(request->rq_import); + + DEBUG_REQ(D_INFO, request, "send flg=%x", + lustre_msg_get_flags(request->rq_reqmsg)); rc = ptl_send_buf(&request->rq_req_md_h, request->rq_reqbuf, request->rq_reqdata_len, LNET_NOACK_REQ, &request->rq_req_cbid, connection, request->rq_request_portal, - request->rq_xid); + request->rq_xid, 0); if (rc == 0) { ptlrpc_lprocfs_rpc_sent(request); RETURN(rc); @@ -535,8 +604,7 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) ptlrpc_req_finished(request); if (noreply) RETURN(rc); - else - GOTO(cleanup_me, rc); + cleanup_me: /* MEUnlink is safe; the PUT didn't even get off the ground, and * nobody apart from the PUT's target has the right nid+XID to diff --git a/lustre/ptlrpc/pack_generic.c b/lustre/ptlrpc/pack_generic.c index 19f66f5..4e8a146 100644 --- a/lustre/ptlrpc/pack_generic.c +++ b/lustre/ptlrpc/pack_generic.c @@ -38,6 +38,7 @@ #include #include #include +#include static inline int lustre_msg_hdr_size_v2(int count) { @@ -84,6 +85,15 @@ int lustre_msg_check_version(struct lustre_msg *msg, __u32 version) } } +/* early reply size */ +int lustre_msg_early_size() { + static int size = 0; + if (!size) + size = lustre_msg_size(LUSTRE_MSG_MAGIC_V2, 1, NULL); + return size; +} +EXPORT_SYMBOL(lustre_msg_early_size); + int lustre_msg_size_v2(int count, int *lengths) { int size; @@ -272,7 +282,7 @@ void lustre_put_emerg_rs(struct ptlrpc_reply_state *rs) } int lustre_pack_reply_v2(struct ptlrpc_request *req, int count, - int *lens, char **bufs) + int *lens, char **bufs, int flags) { struct ptlrpc_reply_state *rs; int msg_len, rc; @@ -280,6 +290,9 @@ int lustre_pack_reply_v2(struct ptlrpc_request *req, int count, LASSERT(req->rq_reply_state == NULL); + if ((flags & LPRFL_EARLY_REPLY) == 0) + req->rq_packed_final = 1; + msg_len = lustre_msg_size_v2(count, lens); rc = sptlrpc_svc_alloc_rs(req, 
msg_len); if (rc) @@ -296,6 +309,7 @@ int lustre_pack_reply_v2(struct ptlrpc_request *req, int count, req->rq_replen = msg_len; req->rq_reply_state = rs; req->rq_repmsg = rs->rs_msg; + lustre_init_msg_v2(rs->rs_msg, count, lens, bufs); lustre_msg_add_version(rs->rs_msg, PTLRPC_MSG_VERSION); lustre_set_rep_swabbed(req, MSG_PTLRPC_BODY_OFF); @@ -306,8 +320,8 @@ int lustre_pack_reply_v2(struct ptlrpc_request *req, int count, } EXPORT_SYMBOL(lustre_pack_reply_v2); -int lustre_pack_reply(struct ptlrpc_request *req, int count, int *lens, - char **bufs) +int lustre_pack_reply_flags(struct ptlrpc_request *req, int count, int *lens, + char **bufs, int flags) { int rc = 0; int size[] = { sizeof(struct ptlrpc_body) }; @@ -323,7 +337,7 @@ int lustre_pack_reply(struct ptlrpc_request *req, int count, int *lens, switch (req->rq_reqmsg->lm_magic) { case LUSTRE_MSG_MAGIC_V2: case LUSTRE_MSG_MAGIC_V2_SWABBED: - rc = lustre_pack_reply_v2(req, count, lens, bufs); + rc = lustre_pack_reply_v2(req, count, lens, bufs, flags); break; default: LASSERTF(0, "incorrect message magic: %08x\n", @@ -336,6 +350,12 @@ int lustre_pack_reply(struct ptlrpc_request *req, int count, int *lens, return rc; } +int lustre_pack_reply(struct ptlrpc_request *req, int count, int *lens, + char **bufs) +{ + return lustre_pack_reply_flags(req, count, lens, bufs, 0); +} + void *lustre_msg_buf_v2(struct lustre_msg_v2 *m, int n, int min_size) { int i, offset, buflen, bufcount; @@ -468,8 +488,8 @@ static int lustre_unpack_msg_v2(struct lustre_msg_v2 *m, int len) __swab32s(&m->lm_bufcount); __swab32s(&m->lm_secflvr); __swab32s(&m->lm_repsize); - __swab32s(&m->lm_timeout); - CLASSERT(offsetof(typeof(*m), lm_padding_1) != 0); + __swab32s(&m->lm_cksum); + __swab32s(&m->lm_flags); CLASSERT(offsetof(typeof(*m), lm_padding_2) != 0); CLASSERT(offsetof(typeof(*m), lm_padding_3) != 0); } @@ -729,6 +749,35 @@ void *lustre_swab_repbuf(struct ptlrpc_request *req, int index, int min_size, return lustre_swab_buf(req->rq_repmsg, 
index, min_size, swabber); } +__u32 lustre_msghdr_get_flags(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + case LUSTRE_MSG_MAGIC_V1_SWABBED: + return 0; + case LUSTRE_MSG_MAGIC_V2: + case LUSTRE_MSG_MAGIC_V2_SWABBED: + /* already in host endian */ + return msg->lm_flags; + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +void lustre_msghdr_set_flags(struct lustre_msg *msg, __u32 flags) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + return; + case LUSTRE_MSG_MAGIC_V2: + msg->lm_flags = flags; + return; + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + __u32 lustre_msg_get_flags(struct lustre_msg *msg) { switch (msg->lm_magic) { @@ -1132,6 +1181,17 @@ __u32 lustre_msg_get_conn_cnt(struct lustre_msg *msg) } } +int lustre_msg_is_v1(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + case LUSTRE_MSG_MAGIC_V1_SWABBED: + return 1; + default: + return 0; + } +} + __u32 lustre_msg_get_magic(struct lustre_msg *msg) { switch (msg->lm_magic) { @@ -1144,6 +1204,88 @@ __u32 lustre_msg_get_magic(struct lustre_msg *msg) } } +__u32 lustre_msg_get_timeout(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + case LUSTRE_MSG_MAGIC_V1_SWABBED: + return 0; + case LUSTRE_MSG_MAGIC_V2: + case LUSTRE_MSG_MAGIC_V2_SWABBED: { + struct ptlrpc_body *pb; + + pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb)); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + + } + return pb->pb_timeout; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +__u32 lustre_msg_get_service_time(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + case LUSTRE_MSG_MAGIC_V1_SWABBED: + return 0; + case LUSTRE_MSG_MAGIC_V2: + case LUSTRE_MSG_MAGIC_V2_SWABBED: { + struct ptlrpc_body *pb; + + pb = lustre_msg_buf_v2(msg, 
MSG_PTLRPC_BODY_OFF, sizeof(*pb)); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + + } + return pb->pb_service_time; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +__u32 lustre_msg_get_cksum(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + case LUSTRE_MSG_MAGIC_V1_SWABBED: + return 0; + case LUSTRE_MSG_MAGIC_V2: + case LUSTRE_MSG_MAGIC_V2_SWABBED: + return msg->lm_cksum; + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +__u32 lustre_msg_calc_cksum(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + case LUSTRE_MSG_MAGIC_V1_SWABBED: + return 0; + case LUSTRE_MSG_MAGIC_V2: + case LUSTRE_MSG_MAGIC_V2_SWABBED: { + struct ptlrpc_body *pb; + pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb)); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + return crc32_le(~(__u32)0, (char *)pb, sizeof(*pb)); + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + void lustre_msg_set_handle(struct lustre_msg *msg, struct lustre_handle *handle) { switch (msg->lm_magic) { @@ -1272,6 +1414,56 @@ void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt) } } +void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + return; + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb; + + pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb)); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_timeout = timeout; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void lustre_msg_set_service_time(struct lustre_msg *msg, __u32 service_time) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + return; + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb; + + pb = lustre_msg_buf_v2(msg, 
MSG_PTLRPC_BODY_OFF, sizeof(*pb)); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_service_time = service_time; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + return; + case LUSTRE_MSG_MAGIC_V2: + msg->lm_cksum = cksum; + return; + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + + void ptlrpc_request_set_replen(struct ptlrpc_request *req) { int count = req_capsule_filled_sizes(&req->rq_pill, RCL_SERVER); @@ -1305,8 +1497,8 @@ void lustre_swab_ptlrpc_body(struct ptlrpc_body *b) __swab32s (&b->pb_flags); __swab32s (&b->pb_op_flags); __swab32s (&b->pb_conn_cnt); - CLASSERT(offsetof(typeof(*b), pb_padding_1) != 0); - CLASSERT(offsetof(typeof(*b), pb_padding_2) != 0); + __swab32s (&b->pb_timeout); + __swab32s (&b->pb_service_time); __swab32s (&b->pb_limit); __swab64s (&b->pb_slv); } @@ -1978,8 +2170,9 @@ void _debug_req(struct ptlrpc_request *req, __u32 mask, va_start(args, fmt); libcfs_debug_vmsg2(data->msg_cdls, data->msg_subsys, mask, data->msg_file, data->msg_fn, data->msg_line, fmt, args, - " req@%p x"LPD64"/t"LPD64"("LPD64") o%d->%s@%s:%d lens" - " %d/%d ref %d fl "REQ_FLAGS_FMT"/%x/%x rc %d/%d\n", + " req@%p x"LPD64"/t"LPD64"("LPD64") o%d->%s@%s:%d/%d" + " lens %d/%d e %d to %d dl %ld ref %d " + "fl "REQ_FLAGS_FMT"/%x/%x rc %d/%d\n", req, req->rq_xid, req->rq_transno, req->rq_reqmsg ? lustre_msg_get_transno(req->rq_reqmsg) : 0, req->rq_reqmsg ? lustre_msg_get_opc(req->rq_reqmsg) : -1, @@ -1990,10 +2183,10 @@ void _debug_req(struct ptlrpc_request *req, __u32 mask, (char *)req->rq_import->imp_connection->c_remote_uuid.uuid : req->rq_export ? (char *)req->rq_export->exp_connection->c_remote_uuid.uuid : "", - (req->rq_import && req->rq_import->imp_client) ? 
- req->rq_import->imp_client->cli_request_portal : -1, - req->rq_reqlen, req->rq_replen, atomic_read(&req->rq_refcount), - DEBUG_REQ_FLAGS(req), + req->rq_request_portal, req->rq_reply_portal, + req->rq_reqlen, req->rq_replen, + req->rq_early_count, req->rq_timeout, req->rq_deadline, + atomic_read(&req->rq_refcount), DEBUG_REQ_FLAGS(req), req->rq_reqmsg && req_ptlrpc_body_swabbed(req) ? lustre_msg_get_flags(req->rq_reqmsg) : -1, req->rq_repmsg && rep_ptlrpc_body_swabbed(req) ? diff --git a/lustre/ptlrpc/pinger.c b/lustre/ptlrpc/pinger.c index 60e0555..318ed0e 100644 --- a/lustre/ptlrpc/pinger.c +++ b/lustre/ptlrpc/pinger.c @@ -57,7 +57,6 @@ int ptlrpc_ping(struct obd_import *imp) imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd)); req->rq_no_resend = req->rq_no_delay = 1; ptlrpc_request_set_replen(req); - req->rq_timeout = PING_INTERVAL; ptlrpcd_add_req(req); RETURN(0); @@ -66,9 +65,14 @@ int ptlrpc_ping(struct obd_import *imp) void ptlrpc_update_next_ping(struct obd_import *imp) { #ifdef ENABLE_PINGER - imp->imp_next_ping = cfs_time_shift( - (imp->imp_state == LUSTRE_IMP_DISCON ? - RECONNECT_INTERVAL : PING_INTERVAL)); + int time = PING_INTERVAL; + if (imp->imp_state == LUSTRE_IMP_DISCON) { + int dtime = max_t(int, CONNECTION_SWITCH_MIN, + AT_OFF ? 
0 : + at_get(&imp->imp_at.iat_net_latency)); + time = min(time, dtime); + } + imp->imp_next_ping = cfs_time_shift(time); #endif /* ENABLE_PINGER */ } @@ -436,7 +440,7 @@ static int ping_evictor_main(void *arg) obd = pet_exp->exp_obd; spin_unlock(&pet_lock); - expire_time = cfs_time_current_sec() - (3 * obd_timeout / 2); + expire_time = cfs_time_current_sec() - PING_EVICT_TIMEOUT; CDEBUG(D_HA, "evicting all exports of obd %s older than %ld\n", obd->obd_name, expire_time); diff --git a/lustre/ptlrpc/ptlrpc_module.c b/lustre/ptlrpc/ptlrpc_module.c index 21c97cb..2449ac9 100644 --- a/lustre/ptlrpc/ptlrpc_module.c +++ b/lustre/ptlrpc/ptlrpc_module.c @@ -152,6 +152,7 @@ EXPORT_SYMBOL(ptlrpc_add_rqs_to_pool); EXPORT_SYMBOL(ptlrpc_init_rq_pool); EXPORT_SYMBOL(ptlrpc_free_rq_pool); EXPORT_SYMBOL(ptlrpc_prep_req_pool); +EXPORT_SYMBOL(ptlrpc_at_set_req_timeout); EXPORT_SYMBOL(ptlrpc_request_alloc); EXPORT_SYMBOL(ptlrpc_request_alloc_pool); EXPORT_SYMBOL(ptlrpc_request_free); @@ -202,6 +203,7 @@ EXPORT_SYMBOL(lustre_msg_swabbed); EXPORT_SYMBOL(lustre_msg_check_version); EXPORT_SYMBOL(lustre_pack_request); EXPORT_SYMBOL(lustre_pack_reply); +EXPORT_SYMBOL(lustre_pack_reply_flags); EXPORT_SYMBOL(lustre_shrink_msg); EXPORT_SYMBOL(lustre_free_reply_state); EXPORT_SYMBOL(lustre_msg_size); @@ -270,6 +272,7 @@ EXPORT_SYMBOL(lustre_msg_get_limit); EXPORT_SYMBOL(lustre_msg_set_slv); EXPORT_SYMBOL(lustre_msg_set_limit); EXPORT_SYMBOL(lustre_msg_get_conn_cnt); +EXPORT_SYMBOL(lustre_msg_is_v1); EXPORT_SYMBOL(lustre_msg_get_magic); EXPORT_SYMBOL(lustre_msg_set_handle); EXPORT_SYMBOL(lustre_msg_set_type); diff --git a/lustre/ptlrpc/recov_thread.c b/lustre/ptlrpc/recov_thread.c index cb06039..eac5128 100644 --- a/lustre/ptlrpc/recov_thread.c +++ b/lustre/ptlrpc/recov_thread.c @@ -426,9 +426,10 @@ static int log_commit_thread(void *arg) break; } - /* XXX FIXME bug 249, 5515 */ + /* bug 5515 */ request->rq_request_portal = LDLM_CANCEL_REQUEST_PORTAL; request->rq_reply_portal = 
LDLM_CANCEL_REPLY_PORTAL; + ptlrpc_at_set_req_timeout(request); ptlrpc_request_set_replen(request); mutex_down(&llcd->llcd_ctxt->loc_sem); diff --git a/lustre/ptlrpc/sec.c b/lustre/ptlrpc/sec.c index 315d439..a92a5c4 100644 --- a/lustre/ptlrpc/sec.c +++ b/lustre/ptlrpc/sec.c @@ -110,30 +110,30 @@ struct ptlrpc_sec_policy * sptlrpc_rpcflavor2policy(__u16 flavor) if (number >= SPTLRPC_POLICY_MAX) return NULL; -again: - read_lock(&policy_lock); - policy = policies[number]; - if (policy && !try_module_get(policy->sp_owner)) - policy = NULL; - if (policy == NULL) - flag = atomic_read(&loaded); - read_unlock(&policy_lock); - - /* if failure, try to load gss module, once */ - if (unlikely(policy == NULL) && flag == 0 && - number == SPTLRPC_POLICY_GSS) { + while (1) { + read_lock(&policy_lock); + policy = policies[number]; + if (policy && !try_module_get(policy->sp_owner)) + policy = NULL; + if (policy == NULL) + flag = atomic_read(&loaded); + read_unlock(&policy_lock); + + if (policy != NULL || flag != 0 || + number != SPTLRPC_POLICY_GSS) + break; + + /* try to load gss module, once */ mutex_down(&load_mutex); if (atomic_read(&loaded) == 0) { - if (request_module("ptlrpc_gss") != 0) - CERROR("Unable to load module ptlrpc_gss\n"); - else + if (request_module("ptlrpc_gss") == 0) CWARN("module ptlrpc_gss loaded on demand\n"); + else + CERROR("Unable to load module ptlrpc_gss\n"); atomic_set(&loaded, 1); } mutex_up(&load_mutex); - - goto again; } return policy; @@ -147,6 +147,8 @@ __u16 sptlrpc_name2rpcflavor(const char *name) return SPTLRPC_FLVR_PLAIN; if (!strcmp(name, "krb5n")) return SPTLRPC_FLVR_KRB5N; + if (!strcmp(name, "krb5a")) + return SPTLRPC_FLVR_KRB5A; if (!strcmp(name, "krb5i")) return SPTLRPC_FLVR_KRB5I; if (!strcmp(name, "krb5p")) @@ -844,10 +846,7 @@ int sptlrpc_cli_wrap_request(struct ptlrpc_request *req) RETURN(rc); } -/* - * rq_nob_received is the actual received data length - */ -int sptlrpc_cli_unwrap_reply(struct ptlrpc_request *req) +static int 
do_cli_unwrap_reply(struct ptlrpc_request *req) { struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; int rc; @@ -856,39 +855,34 @@ int sptlrpc_cli_unwrap_reply(struct ptlrpc_request *req) LASSERT(ctx); LASSERT(ctx->cc_sec); - LASSERT(ctx->cc_ops); LASSERT(req->rq_repbuf); + LASSERT(req->rq_repdata); + LASSERT(req->rq_repmsg == NULL); - req->rq_repdata_len = req->rq_nob_received; - - if (req->rq_nob_received < sizeof(struct lustre_msg)) { + if (req->rq_repdata_len < sizeof(struct lustre_msg)) { CERROR("replied data length %d too small\n", - req->rq_nob_received); + req->rq_repdata_len); RETURN(-EPROTO); } + /* v2 message, check request/reply policy match */ + rpc_flvr = WIRE_FLVR_RPC(req->rq_repdata->lm_secflvr); - /* - * v2 message, check request/reply policy match - */ - rpc_flvr = WIRE_FLVR_RPC(req->rq_repbuf->lm_secflvr); - - if (req->rq_repbuf->lm_magic == LUSTRE_MSG_MAGIC_V2_SWABBED) + if (req->rq_repdata->lm_magic == LUSTRE_MSG_MAGIC_V2_SWABBED) __swab16s(&rpc_flvr); if (RPC_FLVR_POLICY(rpc_flvr) != - RPC_FLVR_POLICY(req->rq_flvr.sf_rpc)) { + RPC_FLVR_POLICY(req->rq_flvr.sf_rpc)) { CERROR("request policy was %u while reply with %u\n", - RPC_FLVR_POLICY(req->rq_flvr.sf_rpc), - RPC_FLVR_POLICY(rpc_flvr)); + RPC_FLVR_POLICY(req->rq_flvr.sf_rpc), + RPC_FLVR_POLICY(rpc_flvr)); RETURN(-EPROTO); } /* do nothing if it's null policy; otherwise unpack the - * wrapper message - */ + * wrapper message */ if (RPC_FLVR_POLICY(rpc_flvr) != SPTLRPC_POLICY_NULL && - lustre_unpack_msg(req->rq_repbuf, req->rq_nob_received)) + lustre_unpack_msg(req->rq_repdata, req->rq_repdata_len)) RETURN(-EPROTO); switch (RPC_FLVR_SVC(req->rq_flvr.sf_rpc)) { @@ -910,6 +904,144 @@ int sptlrpc_cli_unwrap_reply(struct ptlrpc_request *req) RETURN(rc); } +/* + * upon this be called, the reply buffer should have been un-posted, + * so nothing is going to change. 
+ */ +int sptlrpc_cli_unwrap_reply(struct ptlrpc_request *req) +{ + LASSERT(req->rq_repbuf); + LASSERT(req->rq_repdata == NULL); + LASSERT(req->rq_repmsg == NULL); + LASSERT(req->rq_reply_off + req->rq_nob_received <= req->rq_repbuf_len); + + if (req->rq_reply_off == 0) { + CERROR("real reply with offset 0\n"); + return -EPROTO; + } + + if (req->rq_reply_off % 8 != 0) { + CERROR("reply at odd offset %u\n", req->rq_reply_off); + return -EPROTO; + } + + req->rq_repdata = (struct lustre_msg *) + (req->rq_repbuf + req->rq_reply_off); + req->rq_repdata_len = req->rq_nob_received; + + return do_cli_unwrap_reply(req); +} + +/* + * Upon called, the receive buffer might be still posted, so the reply data + * might be changed at any time, no matter we're holding rq_lock or not. we + * expect the rq_reply_off be 0, rq_nob_received is the early reply size. + * + * we allocate a separate buffer to hold early reply data, pointed by + * rq_repdata, rq_repdata_len is the early reply size, and round up to power2 + * is the actual buffer size. + * + * caller _must_ call sptlrpc_cli_finish_early_reply() after this, before + * process another early reply or real reply, to restore ptlrpc_request + * to normal status. 
+ */ +int sptlrpc_cli_unwrap_early_reply(struct ptlrpc_request *req) +{ + struct lustre_msg *early_buf; + int early_bufsz, early_size; + int rc; + ENTRY; + + LASSERT(req->rq_repbuf); + LASSERT(req->rq_repdata == NULL); + LASSERT(req->rq_repmsg == NULL); + + early_size = req->rq_nob_received; + if (early_size < sizeof(struct lustre_msg)) { + CERROR("early reply length %d too small\n", early_size); + RETURN(-EPROTO); + } + + early_bufsz = size_roundup_power2(early_size); + OBD_ALLOC(early_buf, early_bufsz); + if (early_buf == NULL) + RETURN(-ENOMEM); + + /* copy data out, do it inside spinlock */ + spin_lock(&req->rq_lock); + + if (req->rq_replied) { + spin_unlock(&req->rq_lock); + GOTO(err_free, rc = -EALREADY); + } + + if (req->rq_reply_off != 0) { + CERROR("early reply with offset %u\n", req->rq_reply_off); + GOTO(err_free, rc = -EPROTO); + } + + if (req->rq_nob_received != early_size) { + /* even another early arrived the size should be the same */ + CWARN("data size has changed from %u to %u\n", + early_size, req->rq_nob_received); + spin_unlock(&req->rq_lock); + GOTO(err_free, rc = -EINVAL); + } + + if (req->rq_nob_received < sizeof(struct lustre_msg)) { + CERROR("early reply length %d too small\n", + req->rq_nob_received); + spin_unlock(&req->rq_lock); + GOTO(err_free, rc = -EALREADY); + } + + memcpy(early_buf, req->rq_repbuf, early_size); + spin_unlock(&req->rq_lock); + + req->rq_repdata = early_buf; + req->rq_repdata_len = early_size; + + rc = do_cli_unwrap_reply(req); + + /* treate resend as an error case. in fact server should never ask + * resend via early reply. 
*/ + if (req->rq_resend) { + req->rq_resend = 0; + rc = -EPROTO; + } + + if (rc) { + LASSERT(req->rq_repmsg == NULL); + req->rq_repdata = NULL; + req->rq_repdata_len = 0; + GOTO(err_free, rc); + } + + LASSERT(req->rq_repmsg); + RETURN(0); + +err_free: + OBD_FREE(early_buf, early_bufsz); + RETURN(rc); +} + +int sptlrpc_cli_finish_early_reply(struct ptlrpc_request *req) +{ + int early_bufsz; + + LASSERT(req->rq_repdata); + LASSERT(req->rq_repdata_len); + LASSERT(req->rq_repmsg); + + early_bufsz = size_roundup_power2(req->rq_repdata_len); + OBD_FREE(req->rq_repdata, early_bufsz); + + req->rq_repdata = NULL; + req->rq_repdata_len = 0; + req->rq_repmsg = NULL; + return 0; +} + /************************************************** * sec ID * **************************************************/ diff --git a/lustre/ptlrpc/sec_bulk.c b/lustre/ptlrpc/sec_bulk.c index a161e83..f6861bc 100644 --- a/lustre/ptlrpc/sec_bulk.c +++ b/lustre/ptlrpc/sec_bulk.c @@ -56,7 +56,7 @@ #define IDLE_IDX_MAX (100) #define IDLE_IDX_WEIGHT (3) -#define CACHE_QUIESCENCE_PERIOD (20) +#define CACHE_QUIESCENT_PERIOD (20) static struct ptlrpc_enc_page_pool { /* @@ -96,13 +96,14 @@ static struct ptlrpc_enc_page_pool { /* * statistics */ + unsigned long epp_st_max_pages; /* # of pages ever reached */ unsigned int epp_st_grows; /* # of grows */ unsigned int epp_st_grow_fails; /* # of add pages failures */ unsigned int epp_st_shrinks; /* # of shrinks */ unsigned long epp_st_access; /* # of access */ unsigned long epp_st_missings; /* # of cache missing */ unsigned long epp_st_lowfree; /* lowest free pages reached */ - unsigned long epp_st_max_wqlen; /* highest waitqueue length */ + unsigned int epp_st_max_wqlen; /* highest waitqueue length */ cfs_time_t epp_st_max_wait; /* in jeffies */ /* * pointers to pools @@ -137,13 +138,14 @@ int sptlrpc_proc_read_enc_pool(char *page, char **start, off_t off, int count, "idle index: %lu/100\n" "last shrink: %lds\n" "last access: %lds\n" + "max pages reached: %lu\n" 
"grows: %u\n" "grows failure: %u\n" "shrinks: %u\n" "cache access: %lu\n" "cache missing: %lu\n" "low free mark: %lu\n" - "max waitqueue depth: %lu\n" + "max waitqueue depth: %u\n" "max wait time: "CFS_TIME_T"/%u\n" , num_physpages, @@ -155,6 +157,7 @@ int sptlrpc_proc_read_enc_pool(char *page, char **start, off_t off, int count, page_pools.epp_idle_idx, cfs_time_current_sec() - page_pools.epp_last_shrink, cfs_time_current_sec() - page_pools.epp_last_access, + page_pools.epp_st_max_pages, page_pools.epp_st_grows, page_pools.epp_st_grow_fails, page_pools.epp_st_shrinks, @@ -172,28 +175,45 @@ int sptlrpc_proc_read_enc_pool(char *page, char **start, off_t off, int count, static void enc_pools_release_free_pages(long npages) { int p_idx, g_idx; + int p_idx_max1, p_idx_max2; + LASSERT(npages > 0); LASSERT(npages <= page_pools.epp_free_pages); + LASSERT(page_pools.epp_free_pages <= page_pools.epp_total_pages); - p_idx = (page_pools.epp_free_pages - 1) / PAGES_PER_POOL; - g_idx = (page_pools.epp_free_pages - 1) % PAGES_PER_POOL; - LASSERT(page_pools.epp_pools[p_idx]); + /* max pool index before the release */ + p_idx_max2 = (page_pools.epp_total_pages - 1) / PAGES_PER_POOL; page_pools.epp_free_pages -= npages; page_pools.epp_total_pages -= npages; - while (npages-- > 0) { + /* max pool index after the release */ + p_idx_max1 = page_pools.epp_total_pages == 0 ? 
0 : + ((page_pools.epp_total_pages - 1) / PAGES_PER_POOL); + + p_idx = page_pools.epp_free_pages / PAGES_PER_POOL; + g_idx = page_pools.epp_free_pages % PAGES_PER_POOL; + LASSERT(page_pools.epp_pools[p_idx]); + + while (npages--) { + LASSERT(page_pools.epp_pools[p_idx]); LASSERT(page_pools.epp_pools[p_idx][g_idx] != NULL); cfs_free_page(page_pools.epp_pools[p_idx][g_idx]); page_pools.epp_pools[p_idx][g_idx] = NULL; - if (g_idx-- == 0) { - p_idx--; - g_idx = PAGES_PER_POOL - 1; - - LASSERT(page_pools.epp_pools[p_idx]); + if (++g_idx == PAGES_PER_POOL) { + p_idx++; + g_idx = 0; } + }; + + /* free unused pools */ + while (p_idx_max1 < p_idx_max2) { + LASSERT(page_pools.epp_pools[p_idx_max2]); + OBD_FREE(page_pools.epp_pools[p_idx_max2], CFS_PAGE_SIZE); + page_pools.epp_pools[p_idx_max2] = NULL; + p_idx_max2--; } } @@ -206,10 +226,10 @@ static int enc_pools_shrink(int nr_to_scan, unsigned int gfp_mask) spin_lock(&page_pools.epp_lock); - if (nr_to_scan) { - if (nr_to_scan > page_pools.epp_free_pages) - nr_to_scan = page_pools.epp_free_pages; + if (nr_to_scan > page_pools.epp_free_pages) + nr_to_scan = page_pools.epp_free_pages; + if (nr_to_scan > 0) { enc_pools_release_free_pages(nr_to_scan); CDEBUG(D_SEC, "released %d pages, %ld left\n", nr_to_scan, page_pools.epp_free_pages); @@ -230,7 +250,7 @@ static int enc_pools_shrink(int nr_to_scan, unsigned int gfp_mask) * if no pool access for a long time, we consider it's fully idle */ if (cfs_time_current_sec() - page_pools.epp_last_access > - CACHE_QUIESCENCE_PERIOD) + CACHE_QUIESCENT_PERIOD) page_pools.epp_idle_idx = IDLE_IDX_MAX; LASSERT(page_pools.epp_idle_idx <= IDLE_IDX_MAX); @@ -297,7 +317,7 @@ static void enc_pools_insert(cfs_page_t ***pools, int npools, int npages) * (1) fill all the free slots of current pools. */ /* free slots are those left by rent pages, and the extra ones with - * index >= eep_total_pages, locate at the tail of last pool. */ + * index >= total_pages, locate at the tail of last pool. 
*/ freeslot = page_pools.epp_total_pages % PAGES_PER_POOL; if (freeslot != 0) freeslot = PAGES_PER_POOL - freeslot; @@ -352,6 +372,9 @@ static void enc_pools_insert(cfs_page_t ***pools, int npools, int npages) page_pools.epp_free_pages += npages; page_pools.epp_st_lowfree = page_pools.epp_free_pages; + if (page_pools.epp_total_pages > page_pools.epp_st_max_pages) + page_pools.epp_st_max_pages = page_pools.epp_total_pages; + CDEBUG(D_SEC, "add %d pages to total %lu\n", npages, page_pools.epp_total_pages); @@ -687,6 +710,7 @@ int sptlrpc_enc_pool_init(void) page_pools.epp_total_pages = 0; page_pools.epp_free_pages = 0; + page_pools.epp_st_max_pages = 0; page_pools.epp_st_grows = 0; page_pools.epp_st_grow_fails = 0; page_pools.epp_st_shrinks = 0; @@ -724,6 +748,17 @@ void sptlrpc_enc_pool_fini(void) LASSERT(cleaned == page_pools.epp_total_pages); enc_pools_free(); + + if (page_pools.epp_st_access > 0) { + CWARN("max pages %lu, grows %u, grow fails %u, shrinks %u, " + "access %lu, missing %lu, max qlen %u, max wait " + CFS_TIME_T"/%d\n", + page_pools.epp_st_max_pages, page_pools.epp_st_grows, + page_pools.epp_st_grow_fails, + page_pools.epp_st_shrinks, page_pools.epp_st_access, + page_pools.epp_st_missings, page_pools.epp_st_max_wqlen, + page_pools.epp_st_max_wait, HZ); + } } #else /* !__KERNEL__ */ diff --git a/lustre/ptlrpc/sec_null.c b/lustre/ptlrpc/sec_null.c index 59e2400..586b1f7 100644 --- a/lustre/ptlrpc/sec_null.c +++ b/lustre/ptlrpc/sec_null.c @@ -30,6 +30,7 @@ #endif #include +#include #include #include #include @@ -62,8 +63,7 @@ enum lustre_sec_part null_decode_sec_part(struct lustre_msg *msg) } } -static -int null_ctx_refresh(struct ptlrpc_cli_ctx *ctx) +static int null_ctx_refresh(struct ptlrpc_cli_ctx *ctx) { /* should never reach here */ LBUG(); @@ -87,22 +87,34 @@ int null_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) static int null_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) { - req->rq_repmsg = 
req->rq_repbuf; + __u32 cksums, cksumc; + + LASSERT(req->rq_repdata); + + /* real reply rq_repdata point inside of rq_reqbuf; early reply + * rq_repdata point to a separate allocated space */ + if ((char *) req->rq_repdata < req->rq_repbuf || + (char *) req->rq_repdata >= req->rq_repbuf + req->rq_repbuf_len) { + cksums = req->rq_repdata->lm_cksum; + req->rq_repdata->lm_cksum = 0; + + if (req->rq_repdata->lm_magic == LUSTRE_MSG_MAGIC_V2_SWABBED) + __swab32s(&cksums); + + cksumc = crc32_le(!(__u32) 0, (char *) req->rq_repdata, + req->rq_repdata_len); + if (cksumc != cksums) { + CWARN("early reply checksum mismatch: %08x != %08x\n", + cksumc, cksums); + return -EINVAL; + } + } + + req->rq_repmsg = req->rq_repdata; req->rq_replen = req->rq_repdata_len; return 0; } -static struct ptlrpc_ctx_ops null_ctx_ops = { - .refresh = null_ctx_refresh, - .sign = null_ctx_sign, - .verify = null_ctx_verify, -}; - -static struct ptlrpc_svc_ctx null_svc_ctx = { - .sc_refcount = ATOMIC_INIT(1), - .sc_policy = &null_policy, -}; - static struct ptlrpc_sec *null_create_sec(struct obd_import *imp, struct ptlrpc_svc_ctx *svc_ctx, @@ -196,6 +208,9 @@ int null_alloc_repbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req, int msgsize) { + /* add space for early replied */ + msgsize += lustre_msg_early_size(); + msgsize = size_roundup_power2(msgsize); OBD_ALLOC(req->rq_repbuf, msgsize); @@ -210,6 +225,8 @@ static void null_free_repbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req) { + LASSERT(req->rq_repbuf); + OBD_FREE(req->rq_repbuf, req->rq_repbuf_len); req->rq_repbuf = NULL; req->rq_repbuf_len = 0; @@ -260,6 +277,11 @@ int null_enlarge_reqbuf(struct ptlrpc_sec *sec, return 0; } +static struct ptlrpc_svc_ctx null_svc_ctx = { + .sc_refcount = ATOMIC_INIT(1), + .sc_policy = &null_policy, +}; + static int null_accept(struct ptlrpc_request *req) { @@ -329,11 +351,28 @@ int null_authorize(struct ptlrpc_request *req) struct ptlrpc_reply_state *rs = req->rq_reply_state; LASSERT(rs); + 
rs->rs_repbuf->lm_secflvr = SPTLRPC_FLVR_NULL; rs->rs_repdata_len = req->rq_replen; + + if (likely(req->rq_packed_final)) { + req->rq_reply_off = lustre_msg_early_size(); + } else { + rs->rs_repbuf->lm_cksum = + crc32_le(!(__u32) 0, (char *) rs->rs_repbuf, + rs->rs_repdata_len); + req->rq_reply_off = 0; + } + return 0; } +static struct ptlrpc_ctx_ops null_ctx_ops = { + .refresh = null_ctx_refresh, + .sign = null_ctx_sign, + .verify = null_ctx_verify, +}; + static struct ptlrpc_sec_cops null_sec_cops = { .create_sec = null_create_sec, .destroy_sec = null_destroy_sec, @@ -361,8 +400,7 @@ static struct ptlrpc_sec_policy null_policy = { .sp_sops = &null_sec_sops, }; -static -void null_init_internal(void) +static void null_init_internal(void) { static HLIST_HEAD(__list); diff --git a/lustre/ptlrpc/sec_plain.c b/lustre/ptlrpc/sec_plain.c index 2b69ced..6763f1d 100644 --- a/lustre/ptlrpc/sec_plain.c +++ b/lustre/ptlrpc/sec_plain.c @@ -30,6 +30,7 @@ #endif #include +#include #include #include #include @@ -49,6 +50,8 @@ static struct ptlrpc_sec_policy plain_policy; static struct ptlrpc_ctx_ops plain_ctx_ops; static struct ptlrpc_svc_ctx plain_svc_ctx; +static unsigned int plain_at_offset; + /* * flavor flags (maximum 8 flags) */ @@ -129,7 +132,9 @@ int plain_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) static int plain_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) { - struct lustre_msg *msg = req->rq_repbuf; + struct lustre_msg *msg = req->rq_repdata; + int early = 0; + __u32 cksum; ENTRY; if (msg->lm_bufcount != PLAIN_PACK_SEGMENTS) { @@ -137,28 +142,46 @@ int plain_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) RETURN(-EPROTO); } + /* find out if it's an early reply */ + if ((char *) msg < req->rq_repbuf || + (char *) msg >= req->rq_repbuf + req->rq_repbuf_len) + early = 1; + /* expect no user desc in reply */ if (PLAIN_WFLVR_HAS_USER(msg->lm_secflvr)) { CERROR("Unexpected udesc flag in reply\n"); 
RETURN(-EPROTO); } - /* whether we sent with bulk or not, we expect the same in reply */ - if (!equi(req->rq_pack_bulk == 1, - PLAIN_WFLVR_HAS_BULK(msg->lm_secflvr))) { - CERROR("%s bulk checksum in reply\n", - req->rq_pack_bulk ? "Missing" : "Unexpected"); - RETURN(-EPROTO); - } + if (unlikely(early)) { + cksum = crc32_le(!(__u32) 0, + lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0), + lustre_msg_buflen(msg, PLAIN_PACK_MSG_OFF)); + if (cksum != msg->lm_cksum) { + CWARN("early reply checksum mismatch: %08x != %08x\n", + cpu_to_le32(cksum), msg->lm_cksum); + RETURN(-EINVAL); + } + } else { + /* whether we sent with bulk or not, we expect the same + * in reply, except for early reply */ + if (!early && + !equi(req->rq_pack_bulk == 1, + PLAIN_WFLVR_HAS_BULK(msg->lm_secflvr))) { + CERROR("%s bulk checksum in reply\n", + req->rq_pack_bulk ? "Missing" : "Unexpected"); + RETURN(-EPROTO); + } - if (req->rq_pack_bulk && - bulk_sec_desc_unpack(msg, PLAIN_PACK_BULK_OFF)) { - CERROR("Mal-formed bulk checksum reply\n"); - RETURN(-EINVAL); + if (PLAIN_WFLVR_HAS_BULK(msg->lm_secflvr) && + bulk_sec_desc_unpack(msg, PLAIN_PACK_BULK_OFF)) { + CERROR("Mal-formed bulk checksum reply\n"); + RETURN(-EINVAL); + } } req->rq_repmsg = lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0); - req->rq_replen = msg->lm_buflens[PLAIN_PACK_MSG_OFF]; + req->rq_replen = lustre_msg_buflen(msg, PLAIN_PACK_MSG_OFF); RETURN(0); } @@ -183,11 +206,11 @@ int plain_cli_unwrap_bulk(struct ptlrpc_cli_ctx *ctx, { LASSERT(req->rq_pack_bulk); LASSERT(req->rq_reqbuf->lm_bufcount == PLAIN_PACK_SEGMENTS); - LASSERT(req->rq_repbuf->lm_bufcount == PLAIN_PACK_SEGMENTS); + LASSERT(req->rq_repdata->lm_bufcount == PLAIN_PACK_SEGMENTS); return bulk_csum_cli_reply(desc, req->rq_bulk_read, req->rq_reqbuf, PLAIN_PACK_BULK_OFF, - req->rq_repbuf, PLAIN_PACK_BULK_OFF); + req->rq_repdata, PLAIN_PACK_BULK_OFF); } /**************************************** @@ -445,13 +468,16 @@ int plain_alloc_repbuf(struct ptlrpc_sec *sec, if (req->rq_pack_bulk) 
{ LASSERT(req->rq_bulk_read || req->rq_bulk_write); - buflens[PLAIN_PACK_BULK_OFF] = bulk_sec_desc_size( req->rq_flvr.sf_bulk_hash, 0, req->rq_bulk_read); } alloc_len = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens); + + /* add space for early reply */ + alloc_len += plain_at_offset; + alloc_len = size_roundup_power2(alloc_len); OBD_ALLOC(req->rq_repbuf, alloc_len); @@ -672,6 +698,16 @@ int plain_authorize(struct ptlrpc_request *req) msg->lm_secflvr |= PLAIN_WFLVR_FLAG_BULK; rs->rs_repdata_len = len; + + if (likely(req->rq_packed_final)) { + req->rq_reply_off = plain_at_offset; + } else { + msg->lm_cksum = crc32_le(!(__u32) 0, + lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0), + lustre_msg_buflen(msg, PLAIN_PACK_MSG_OFF)); + req->rq_reply_off = 0; + } + RETURN(0); } @@ -761,8 +797,12 @@ static struct ptlrpc_sec_policy plain_policy = { int sptlrpc_plain_init(void) { + int buflens[PLAIN_PACK_SEGMENTS] = { 0, }; int rc; + buflens[PLAIN_PACK_MSG_OFF] = lustre_msg_early_size(); + plain_at_offset = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens); + rc = sptlrpc_register_policy(&plain_policy); if (rc) CERROR("failed to register: %d\n", rc); diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c index 95fcefcb..1a5730e 100644 --- a/lustre/ptlrpc/service.c +++ b/lustre/ptlrpc/service.c @@ -34,9 +34,28 @@ #include #include "ptlrpc_internal.h" +/* The following are visible and mutable through /sys/module/ptlrpc */ int test_req_buffer_pressure = 0; CFS_MODULE_PARM(test_req_buffer_pressure, "i", int, 0444, "set non-zero to put pressure on request buffer pools"); +unsigned int at_min = 0; +CFS_MODULE_PARM(at_min, "i", int, 0644, + "Adaptive timeout minimum (sec)"); +unsigned int at_max = 600; +EXPORT_SYMBOL(at_max); +CFS_MODULE_PARM(at_max, "i", int, 0644, + "Adaptive timeout maximum (sec)"); +unsigned int at_history = 600; +CFS_MODULE_PARM(at_history, "i", int, 0644, + "Adaptive timeouts remember the slowest event that took place " + "within this period (sec)"); +static 
int at_early_margin = 5; +CFS_MODULE_PARM(at_early_margin, "i", int, 0644, + "How soon before an RPC deadline to send an early reply"); +static int at_extra = 30; +CFS_MODULE_PARM(at_extra, "i", int, 0644, + "How much extra time to give with each early reply"); + /* forward ref */ static int ptlrpc_server_post_idle_rqbds (struct ptlrpc_service *svc); @@ -260,17 +279,28 @@ struct ptlrpc_service *ptlrpc_init_svc_conf(struct ptlrpc_service_conf *c, return ptlrpc_init_svc(c->psc_nbufs, c->psc_bufsize, c->psc_max_req_size, c->psc_max_reply_size, c->psc_req_portal, c->psc_rep_portal, - c->psc_watchdog_timeout, + c->psc_watchdog_factor, h, name, proc_entry, prntfn, c->psc_min_threads, c->psc_max_threads, threadname, c->psc_ctx_tags); } EXPORT_SYMBOL(ptlrpc_init_svc_conf); +static void ptlrpc_at_timer(unsigned long castmeharder) +{ + struct ptlrpc_service *svc = (struct ptlrpc_service *)castmeharder; + CDEBUG(D_INFO, "at timer %s hit at %ld%s\n", + svc->srv_name, cfs_time_current_sec(), + list_empty(&svc->srv_at_list) ? 
", empty" : ""); + svc->srv_at_check = 1; + svc->srv_at_checktime = cfs_time_current(); + cfs_waitq_signal(&svc->srv_waitq); +} + /* @threadname should be 11 characters or less - 3 will be added on */ struct ptlrpc_service * ptlrpc_init_svc(int nbufs, int bufsize, int max_req_size, int max_reply_size, - int req_portal, int rep_portal, int watchdog_timeout, + int req_portal, int rep_portal, int watchdog_factor, svc_handler_t handler, char *name, cfs_proc_dir_entry_t *proc_entry, svcreq_printfn_t svcreq_printfn, @@ -301,7 +331,7 @@ ptlrpc_init_svc(int nbufs, int bufsize, int max_req_size, int max_reply_size, service->srv_buf_size = bufsize; service->srv_rep_portal = rep_portal; service->srv_req_portal = req_portal; - service->srv_watchdog_timeout = watchdog_timeout; + service->srv_watchdog_factor = watchdog_factor; service->srv_handler = handler; service->srv_request_history_print_fn = svcreq_printfn; service->srv_request_seq = 1; /* valid seq #s start at 1 */ @@ -324,6 +354,14 @@ ptlrpc_init_svc(int nbufs, int bufsize, int max_req_size, int max_reply_size, CFS_INIT_LIST_HEAD(&service->srv_free_rs_list); cfs_waitq_init(&service->srv_free_rs_waitq); + spin_lock_init(&service->srv_at_lock); + CFS_INIT_LIST_HEAD(&service->srv_req_in_queue); + CFS_INIT_LIST_HEAD(&service->srv_at_list); + cfs_timer_init(&service->srv_at_timer, ptlrpc_at_timer, service); + /* At SOW, service time should be quick; 10s seems generous. If client + timeout is less than this, we'll be sending an early reply. 
*/ + at_init(&service->srv_at_estimate, 10, 0); + spin_lock (&ptlrpc_all_services_lock); list_add (&service->srv_list, &ptlrpc_all_services); spin_unlock (&ptlrpc_all_services_lock); @@ -354,27 +392,39 @@ failed: return NULL; } -static void __ptlrpc_server_free_request(struct ptlrpc_request *req) +static void ptlrpc_server_req_decref(struct ptlrpc_request *req) { struct ptlrpc_request_buffer_desc *rqbd = req->rq_rqbd; - list_del(&req->rq_list); - - if (req->rq_reply_state != NULL) { - ptlrpc_rs_decref(req->rq_reply_state); - req->rq_reply_state = NULL; - } + if (!atomic_dec_and_test(&req->rq_refcount)) + return; sptlrpc_svc_ctx_decref(req); + LASSERT(list_empty(&req->rq_timed_list)); if (req != &rqbd->rqbd_req) { /* NB request buffers use an embedded * req if the incoming req unlinked the * MD; this isn't one of them! */ OBD_FREE(req, sizeof(*req)); + } else { + struct ptlrpc_service *svc = rqbd->rqbd_service; + /* schedule request buffer for re-use. + * NB I can only do this after I've disposed of their + * reqs; particularly the embedded req */ + spin_lock(&svc->srv_lock); + list_add_tail(&rqbd->rqbd_list, &svc->srv_idle_rqbds); + spin_unlock(&svc->srv_lock); } } +static void __ptlrpc_server_free_request(struct ptlrpc_request *req) +{ + list_del(&req->rq_list); + ptlrpc_req_drop_rs(req); + ptlrpc_server_req_decref(req); +} + static void ptlrpc_server_free_request(struct ptlrpc_request *req) { @@ -384,6 +434,13 @@ ptlrpc_server_free_request(struct ptlrpc_request *req) struct list_head *tmp; struct list_head *nxt; + if (req->rq_phase != RQ_PHASE_NEW) /* incorrect message magic */ + DEBUG_REQ(D_INFO, req, "free req"); + spin_lock(&svc->srv_at_lock); + req->rq_sent_final = 1; + list_del_init(&req->rq_timed_list); + spin_unlock(&svc->srv_at_lock); + spin_lock(&svc->srv_lock); svc->srv_n_active_reqs--; @@ -429,11 +486,6 @@ ptlrpc_server_free_request(struct ptlrpc_request *req) } spin_lock(&svc->srv_lock); - - /* schedule request buffer for re-use. 
- * NB I can only do this after I've disposed of their - * reqs; particularly the embedded req */ - list_add_tail(&rqbd->rqbd_list, &svc->srv_idle_rqbds); } } else if (req->rq_reply_state && req->rq_reply_state->rs_prealloc) { /* If we are low on memory, we are not interested in @@ -499,21 +551,21 @@ static void ptlrpc_update_export_timer(struct obd_export *exp, long extra_delay) /* Note - racing to start/reset the obd_eviction timer is safe */ if (exp->exp_obd->obd_eviction_timer == 0) { /* Check if the oldest entry is expired. */ - if (cfs_time_current_sec() > (oldest_time + - (3 * obd_timeout / 2) + extra_delay)) { + if (cfs_time_current_sec() > (oldest_time + PING_EVICT_TIMEOUT + + extra_delay)) { /* We need a second timer, in case the net was down and * it just came back. Since the pinger may skip every * other PING_INTERVAL (see note in ptlrpc_pinger_main), * we better wait for 3. */ - exp->exp_obd->obd_eviction_timer = cfs_time_current_sec() + - 3 * PING_INTERVAL; + exp->exp_obd->obd_eviction_timer = + cfs_time_current_sec() + 3 * PING_INTERVAL; CDEBUG(D_HA, "%s: Think about evicting %s from "CFS_TIME_T"\n", exp->exp_obd->obd_name, obd_export_nid2str(exp), oldest_time); } } else { - if (cfs_time_current_sec() > (exp->exp_obd->obd_eviction_timer + - extra_delay)) { + if (cfs_time_current_sec() > + (exp->exp_obd->obd_eviction_timer + extra_delay)) { /* The evictor won't evict anyone who we've heard from * recently, so we don't have to check before we start * it. 
*/ @@ -545,6 +597,434 @@ void lu_context_exit(struct lu_context *ctx) #endif +static int ptlrpc_check_req(struct ptlrpc_request *req) +{ + if (unlikely(lustre_msg_get_conn_cnt(req->rq_reqmsg) < + req->rq_export->exp_conn_cnt)) { + DEBUG_REQ(D_ERROR, req, + "DROPPING req from old connection %d < %d", + lustre_msg_get_conn_cnt(req->rq_reqmsg), + req->rq_export->exp_conn_cnt); + return -EEXIST; + } + if (unlikely(req->rq_export->exp_obd && + req->rq_export->exp_obd->obd_fail)) { + /* Failing over, don't handle any more reqs, send + error response instead. */ + CDEBUG(D_RPCTRACE, "Dropping req %p for failed obd %s\n", + req, req->rq_export->exp_obd->obd_name); + req->rq_status = -ENODEV; + ptlrpc_error(req); + return -ENODEV; + } + + return 0; +} + +static void ptlrpc_at_set_timer(struct ptlrpc_service *svc) +{ + struct ptlrpc_request *rq; + __s32 next; + + spin_lock(&svc->srv_at_lock); + if (list_empty(&svc->srv_at_list)) { + cfs_timer_disarm(&svc->srv_at_timer); + spin_unlock(&svc->srv_at_lock); + return; + } + + /* Set timer for closest deadline */ + rq = list_entry(svc->srv_at_list.next, struct ptlrpc_request, + rq_timed_list); + next = (__s32)(rq->rq_deadline - cfs_time_current_sec() - + at_early_margin); + if (next <= 0) + ptlrpc_at_timer((unsigned long)svc); + else + cfs_timer_arm(&svc->srv_at_timer, cfs_time_shift(next)); + spin_unlock(&svc->srv_at_lock); + CDEBUG(D_INFO, "armed %s at %+lds\n", svc->srv_name, next); +} + +/* Add rpc to early reply check list */ +static int ptlrpc_at_add_timed(struct ptlrpc_request *req) +{ + struct ptlrpc_service *svc = req->rq_rqbd->rqbd_service; + struct ptlrpc_request *rq; + int found = 0; + + if (AT_OFF) + return(0); + + if (req->rq_no_reply) + return 0; + + if ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) == 0) + return(-ENOSYS); + + DEBUG_REQ(D_ADAPTTO, req, "add timed %lds", + req->rq_deadline - cfs_time_current_sec()); + + spin_lock(&svc->srv_at_lock); + + if (unlikely(req->rq_sent_final)) { + 
spin_unlock(&svc->srv_at_lock); + return 0; + } + + LASSERT(list_empty(&req->rq_timed_list)); + /* Add to sorted list. Presumably latest rpcs will have the latest + deadlines, so search backward. */ + list_for_each_entry_reverse(rq, &svc->srv_at_list, rq_timed_list) { + if (req->rq_deadline > rq->rq_deadline) { + list_add(&req->rq_timed_list, &rq->rq_timed_list); + found++; + break; + } + } + if (!found) + /* Add to front if shortest deadline or list empty */ + list_add(&req->rq_timed_list, &svc->srv_at_list); + + /* Check if we're the head of the list */ + found = (svc->srv_at_list.next == &req->rq_timed_list); + + spin_unlock(&svc->srv_at_lock); + + if (found) + ptlrpc_at_set_timer(svc); + + return 0; +} + +static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req, + int extra_time) +{ + struct ptlrpc_service *svc = req->rq_rqbd->rqbd_service; + struct ptlrpc_request *reqcopy; + struct lustre_msg *reqmsg; + long olddl = req->rq_deadline - cfs_time_current_sec(); + time_t newdl; + int rc; + ENTRY; + + /* deadline is when the client expects us to reply, margin is the + difference between clients' and servers' expectations */ + DEBUG_REQ(D_ADAPTTO, req, + "%ssending early reply (deadline %+lds, margin %+lds) for " + "%d+%d", AT_OFF ? "AT off - not " : "", + olddl, olddl - at_get(&svc->srv_at_estimate), + at_get(&svc->srv_at_estimate), extra_time); + + if (AT_OFF) + RETURN(0); + + if (olddl < 0) { + CDEBUG(D_WARNING, "x"LPU64": Already past deadline (%+lds), not" + " sending early reply. Increase at_early_margin (%d)?\n", + req->rq_xid, olddl, at_early_margin); + /* Return an error so we're not re-added to the timed list. 
*/ + RETURN(-ETIMEDOUT); + } + + if ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) == 0){ + CDEBUG(D_INFO, "Wanted to ask client for more time, but no AT " + "support\n"); + RETURN(-ENOSYS); + } + + if (extra_time) { + /* Fake our processing time into the future to ask the + clients for some extra amount of time */ + extra_time += cfs_time_current_sec() - + req->rq_arrival_time.tv_sec; + at_add(&svc->srv_at_estimate, extra_time); + } + + newdl = req->rq_arrival_time.tv_sec + at_get(&svc->srv_at_estimate); + if (req->rq_deadline >= newdl) { + /* We're not adding any time, no need to send an early reply + (e.g. maybe at adaptive_max) */ + CDEBUG(D_ADAPTTO, "x"LPU64": Couldn't add any time (%ld/%ld), " + "not sending early reply\n", req->rq_xid, olddl, + newdl - cfs_time_current_sec()); + RETURN(-ETIMEDOUT); + } + + OBD_ALLOC(reqcopy, sizeof *reqcopy); + if (reqcopy == NULL) + RETURN(-ENOMEM); + OBD_ALLOC(reqmsg, req->rq_reqlen); + if (!reqmsg) { + OBD_FREE(reqcopy, sizeof *reqcopy); + RETURN(-ENOMEM); + } + + *reqcopy = *req; + reqcopy->rq_reply_state = NULL; + reqcopy->rq_rep_swab_mask = 0; + reqcopy->rq_pack_bulk = 0; + reqcopy->rq_pack_udesc = 0; + reqcopy->rq_packed_final = 0; + sptlrpc_svc_ctx_addref(reqcopy); + /* We only need the reqmsg for the magic */ + reqcopy->rq_reqmsg = reqmsg; + memcpy(reqmsg, req->rq_reqmsg, req->rq_reqlen); + + if (req->rq_sent_final) { + CDEBUG(D_ADAPTTO, "x"LPU64": normal reply already sent out, " + "abort sending early reply\n", req->rq_xid); + GOTO(out, rc = 0); + } + + /* Connection ref */ + reqcopy->rq_export = class_conn2export( + lustre_msg_get_handle(reqcopy->rq_reqmsg)); + if (reqcopy->rq_export == NULL) + GOTO(out, rc = -ENODEV); + + /* RPC ref */ + class_export_rpc_get(reqcopy->rq_export); + if (reqcopy->rq_export->exp_obd && + reqcopy->rq_export->exp_obd->obd_fail) + GOTO(out_put, rc = -ENODEV); + + rc = lustre_pack_reply_flags(reqcopy, 1, NULL, NULL, LPRFL_EARLY_REPLY); + if (rc) + GOTO(out_put, rc); + + 
rc = ptlrpc_send_reply(reqcopy, PTLRPC_REPLY_EARLY); + + if (!rc) { + /* Adjust our own deadline to what we told the client */ + req->rq_deadline = newdl; + req->rq_early_count++; /* number sent, server side */ + } else { + DEBUG_REQ(D_ERROR, req, "Early reply send failed %d", rc); + } + + /* Free the (early) reply state from lustre_pack_reply. + (ptlrpc_send_reply takes it's own rs ref, so this is safe here) */ + ptlrpc_req_drop_rs(reqcopy); + +out_put: + class_export_rpc_put(reqcopy->rq_export); + class_export_put(reqcopy->rq_export); +out: + sptlrpc_svc_ctx_decref(reqcopy); + OBD_FREE(reqmsg, req->rq_reqlen); + OBD_FREE(reqcopy, sizeof *reqcopy); + RETURN(rc); +} + +/* Send early replies to everybody expiring within at_early_margin + asking for at_extra time */ +static int ptlrpc_at_check_timed(struct ptlrpc_service *svc) +{ + struct ptlrpc_request *rq, *n; + struct list_head work_list; + time_t now = cfs_time_current_sec(); + cfs_duration_t delay; + int first, counter = 0; + ENTRY; + + spin_lock(&svc->srv_at_lock); + if (svc->srv_at_check == 0) { + spin_unlock(&svc->srv_at_lock); + RETURN(0); + } + delay = cfs_time_sub(cfs_time_current(), svc->srv_at_checktime); + svc->srv_at_check = 0; + + if (list_empty(&svc->srv_at_list)) { + spin_unlock(&svc->srv_at_lock); + RETURN(0); + } + + /* The timer went off, but maybe the nearest rpc already completed. */ + rq = list_entry(svc->srv_at_list.next, struct ptlrpc_request, + rq_timed_list); + first = (int)(rq->rq_deadline - now); + if (first > at_early_margin) { + /* We've still got plenty of time. Reset the timer. */ + spin_unlock(&svc->srv_at_lock); + ptlrpc_at_set_timer(svc); + RETURN(0); + } + + /* We're close to a timeout, and we don't know how much longer the + server will take. Send early replies to everyone expiring soon. 
*/ + CFS_INIT_LIST_HEAD(&work_list); + list_for_each_entry_safe(rq, n, &svc->srv_at_list, rq_timed_list) { + if (rq->rq_deadline <= now + at_early_margin) { + list_move_tail(&rq->rq_timed_list, &work_list); + counter++; + } else { + break; + } + } + + spin_unlock(&svc->srv_at_lock); + + /* we have a new earliest deadline, restart the timer */ + ptlrpc_at_set_timer(svc); + + CDEBUG(D_ADAPTTO, "timeout in %+ds, asking for %d secs on %d early " + "replies\n", first, at_extra, counter); + if (first < 0) { + /* We're already past request deadlines before we even get a + chance to send early replies */ + LCONSOLE_WARN("%s: This server is not able to keep up with " + "request traffic (cpu-bound).\n", svc->srv_name); + CWARN("earlyQ=%d reqQ=%d recA=%d, svcEst=%d, " + "delay="CFS_DURATION_T"(jiff)\n", + counter, svc->srv_n_queued_reqs, svc->srv_n_active_reqs, + at_get(&svc->srv_at_estimate), delay); + } + + /* ptlrpc_server_free_request may delete an entry out of the work + list */ + spin_lock(&svc->srv_at_lock); + while (!list_empty(&work_list)) { + rq = list_entry(work_list.next, struct ptlrpc_request, + rq_timed_list); + list_del_init(&rq->rq_timed_list); + /* if the entry is still in the worklist, it hasn't been + deleted, and is safe to take a ref to keep the req around */ + atomic_inc(&rq->rq_refcount); + spin_unlock(&svc->srv_at_lock); + + if (ptlrpc_at_send_early_reply(rq, at_extra) == 0) + ptlrpc_at_add_timed(rq); + + ptlrpc_server_req_decref(rq); + spin_lock(&svc->srv_at_lock); + } + spin_unlock(&svc->srv_at_lock); + + RETURN(0); +} + +/* Handle freshly incoming reqs, add to timed early reply list, + pass on to regular request queue */ +static int +ptlrpc_server_handle_req_in(struct ptlrpc_service *svc) +{ + struct ptlrpc_request *req; + __u32 deadline; + int rc; + ENTRY; + + LASSERT(svc); + + spin_lock(&svc->srv_lock); + if (list_empty(&svc->srv_req_in_queue)) { + spin_unlock(&svc->srv_lock); + RETURN(0); + } + + req = list_entry(svc->srv_req_in_queue.next, + 
struct ptlrpc_request, rq_list); + list_del_init (&req->rq_list); + /* Consider this still a "queued" request as far as stats are + concerned */ + spin_unlock(&svc->srv_lock); + + /* go through security check/transform */ + rc = sptlrpc_svc_unwrap_request(req); + switch (rc) { + case SECSVC_OK: + break; + case SECSVC_COMPLETE: + target_send_reply(req, 0, OBD_FAIL_MDS_ALL_REPLY_NET); + goto err_req; + case SECSVC_DROP: + goto err_req; + default: + LBUG(); + } + + /* Clear request swab mask; this is a new request */ + req->rq_req_swab_mask = 0; + + rc = lustre_unpack_msg(req->rq_reqmsg, req->rq_reqlen); + if (rc != 0) { + CERROR("error unpacking request: ptl %d from %s x"LPU64"\n", + svc->srv_req_portal, libcfs_id2str(req->rq_peer), + req->rq_xid); + goto err_req; + } + + rc = lustre_unpack_req_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF); + if (rc) { + CERROR ("error unpacking ptlrpc body: ptl %d from %s x" + LPU64"\n", svc->srv_req_portal, + libcfs_id2str(req->rq_peer), req->rq_xid); + goto err_req; + } + + rc = -EINVAL; + if (lustre_msg_get_type(req->rq_reqmsg) != PTL_RPC_MSG_REQUEST) { + CERROR("wrong packet type received (type=%u) from %s\n", + lustre_msg_get_type(req->rq_reqmsg), + libcfs_id2str(req->rq_peer)); + goto err_req; + } + + CDEBUG(D_NET, "got req "LPD64"\n", req->rq_xid); + + req->rq_export = class_conn2export( + lustre_msg_get_handle(req->rq_reqmsg)); + if (req->rq_export) { + rc = ptlrpc_check_req(req); + if (rc == 0) { + rc = sptlrpc_target_export_check(req->rq_export, req); + if (rc) + DEBUG_REQ(D_ERROR, req, "DROPPING req with " + "illegal security flavor,"); + } + + class_export_put(req->rq_export); + req->rq_export = NULL; + if (rc) + goto err_req; + } + + /* req_in handling should/must be fast */ + if (cfs_time_current_sec() - req->rq_arrival_time.tv_sec > 5) + DEBUG_REQ(D_WARNING, req, "Slow req_in handling %lus", + cfs_time_current_sec() - req->rq_arrival_time.tv_sec); + + /* Set rpc server deadline and add it to the timed list */ + deadline = 
(lustre_msghdr_get_flags(req->rq_reqmsg) & + MSGHDR_AT_SUPPORT) ? + /* The max time the client expects us to take */ + lustre_msg_get_timeout(req->rq_reqmsg) : obd_timeout; + req->rq_deadline = req->rq_arrival_time.tv_sec + deadline; + if (unlikely(deadline == 0)) { + DEBUG_REQ(D_ERROR, req, "Dropping request with 0 timeout"); + goto err_req; + } + + ptlrpc_at_add_timed(req); + + /* Move it over to the request processing queue */ + spin_lock(&svc->srv_lock); + list_add_tail(&req->rq_list, &svc->srv_request_queue); + cfs_waitq_signal(&svc->srv_waitq); + spin_unlock(&svc->srv_lock); + RETURN(1); + +err_req: + spin_lock(&svc->srv_lock); + svc->srv_n_queued_reqs--; + svc->srv_n_active_reqs++; + spin_unlock(&svc->srv_lock); + ptlrpc_server_free_request(req); + + RETURN(1); +} + static int ptlrpc_server_handle_request(struct ptlrpc_service *svc, struct ptlrpc_thread *thread) @@ -554,18 +1034,23 @@ ptlrpc_server_handle_request(struct ptlrpc_service *svc, struct timeval work_start; struct timeval work_end; long timediff; - int rc, reply; + int rc; ENTRY; LASSERT(svc); spin_lock(&svc->srv_lock); if (unlikely(list_empty (&svc->srv_request_queue) || - (svc->srv_n_difficult_replies != 0 && - svc->srv_n_active_reqs >= (svc->srv_threads_running - 1)))) { - /* If all the other threads are handling requests, I must - * remain free to handle any 'difficult' reply that might - * block them */ + ( +#ifndef __KERNEL__ + /* !@%$# liblustre only has 1 thread */ + svc->srv_n_difficult_replies != 0 && +#endif + svc->srv_n_active_reqs >= (svc->srv_threads_running - 1)))) { + /* Don't handle regular requests in the last thread, in order * re + * to handle difficult replies (which might block other threads) + * as well as handle any incoming reqs, early replies, etc. + * That means we always need at least 2 service threads. 
*/ spin_unlock(&svc->srv_lock); RETURN(0); } @@ -587,47 +1072,8 @@ ptlrpc_server_handle_request(struct ptlrpc_service *svc, svc->srv_n_queued_reqs); lprocfs_counter_add(svc->srv_stats, PTLRPC_REQACTIVE_CNTR, svc->srv_n_active_reqs); - } - - /* go through security check/transform */ - rc = sptlrpc_svc_unwrap_request(request); - switch (rc) { - case SECSVC_OK: - break; - case SECSVC_COMPLETE: - target_send_reply(request, 0, OBD_FAIL_MDS_ALL_REPLY_NET); - goto out_stat; - case SECSVC_DROP: - goto out_req; - default: - LBUG(); - } - - /* Clear request swab mask; this is a new request */ - request->rq_req_swab_mask = 0; - - rc = lustre_unpack_msg(request->rq_reqmsg, request->rq_reqlen); - if (rc != 0) { - CERROR ("error unpacking request: ptl %d from %s" - " xid "LPU64"\n", svc->srv_req_portal, - libcfs_id2str(request->rq_peer), request->rq_xid); - goto out_req; - } - - rc = lustre_unpack_req_ptlrpc_body(request, MSG_PTLRPC_BODY_OFF); - if (rc) { - CERROR ("error unpacking ptlrpc body: ptl %d from %s" - " xid "LPU64"\n", svc->srv_req_portal, - libcfs_id2str(request->rq_peer), request->rq_xid); - goto out_req; - } - - rc = -EINVAL; - if (lustre_msg_get_type(request->rq_reqmsg) != PTL_RPC_MSG_REQUEST) { - CERROR("wrong packet type received (type=%u) from %s\n", - lustre_msg_get_type(request->rq_reqmsg), - libcfs_id2str(request->rq_peer)); - goto out_req; + lprocfs_counter_add(svc->srv_stats, PTLRPC_TIMEOUT, + at_get(&svc->srv_at_estimate)); } rc = lu_context_init(&request->rq_session, LCT_SESSION); @@ -648,52 +1094,28 @@ ptlrpc_server_handle_request(struct ptlrpc_service *svc, lustre_msg_get_handle(request->rq_reqmsg)); if (likely(request->rq_export)) { - if (unlikely(lustre_msg_get_conn_cnt(request->rq_reqmsg) < - request->rq_export->exp_conn_cnt)) { - DEBUG_REQ(D_ERROR, request, - "DROPPING req from old connection %d < %d", - lustre_msg_get_conn_cnt(request->rq_reqmsg), - request->rq_export->exp_conn_cnt); + if (unlikely(ptlrpc_check_req(request))) goto put_conn; - } - 
if (unlikely(request->rq_export->exp_obd && - request->rq_export->exp_obd->obd_fail)) { - /* Failing over, don't handle any more reqs, send - error response instead. */ - CDEBUG(D_RPCTRACE,"Dropping req %p for failed obd %s\n", - request, request->rq_export->exp_obd->obd_name); - request->rq_status = -ENODEV; - ptlrpc_error(request); - goto put_conn; - } - - rc = sptlrpc_target_export_check(request->rq_export, request); - if (unlikely(rc)) { - DEBUG_REQ(D_ERROR, request, - "DROPPING req with illegal security flavor,"); - goto put_conn; - } - - ptlrpc_update_export_timer(request->rq_export, timediff/500000); + ptlrpc_update_export_timer(request->rq_export, timediff >> 19); export = class_export_rpc_get(request->rq_export); } - /* Discard requests queued for longer than my timeout. If the - * client's timeout is similar to mine, she'll be timing out this - * REQ anyway (bug 1502) */ - if (unlikely(timediff / 1000000 > (long)obd_timeout)) { - CERROR("Dropping timed-out opc %d request from %s" - ": %ld seconds old\n", - lustre_msg_get_opc(request->rq_reqmsg), - libcfs_id2str(request->rq_peer), - timediff / 1000000); + /* Discard requests queued for longer than the deadline. + The deadline is increased if we send an early reply. */ + if (cfs_time_current_sec() > request->rq_deadline) { + DEBUG_REQ(D_ERROR, request, "Dropping timed-out request from %s" + ": deadline %ld%+lds ago\n", + libcfs_id2str(request->rq_peer), + request->rq_deadline - + request->rq_arrival_time.tv_sec, + cfs_time_current_sec() - request->rq_deadline); goto put_rpc_export; } request->rq_phase = RQ_PHASE_INTERPRET; CDEBUG(D_RPCTRACE, "Handling RPC pname:cluuid+ref:pid:xid:nid:opc " - "%s:%s+%d:%d:"LPU64":%s:%d\n", cfs_curproc_comm(), + "%s:%s+%d:%d:x"LPU64":%s:%d\n", cfs_curproc_comm(), (request->rq_export ? (char *)request->rq_export->exp_client_uuid.uuid : "0"), (request->rq_export ? 
@@ -702,12 +1124,14 @@ ptlrpc_server_handle_request(struct ptlrpc_service *svc, libcfs_id2str(request->rq_peer), lustre_msg_get_opc(request->rq_reqmsg)); + OBD_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_PAUSE_REQ, obd_fail_val); + rc = svc->srv_handler(request); request->rq_phase = RQ_PHASE_COMPLETE; CDEBUG(D_RPCTRACE, "Handled RPC pname:cluuid+ref:pid:xid:nid:opc " - "%s:%s+%d:%d:"LPU64":%s:%d\n", cfs_curproc_comm(), + "%s:%s+%d:%d:x"LPU64":%s:%d\n", cfs_curproc_comm(), (request->rq_export ? (char *)request->rq_export->exp_client_uuid.uuid : "0"), (request->rq_export ? @@ -725,37 +1149,26 @@ put_conn: lu_context_exit(&request->rq_session); lu_context_fini(&request->rq_session); -out_stat: - reply = request->rq_reply_state && request->rq_repmsg; /* bug 11169 */ - do_gettimeofday(&work_end); + if (unlikely(cfs_time_current_sec() > request->rq_deadline)) { + DEBUG_REQ(D_WARNING, request, "Request x"LPU64" took longer " + "than estimated (%ld%+lds); client may timeout.", + request->rq_xid, request->rq_deadline - + request->rq_arrival_time.tv_sec, + cfs_time_current_sec() - request->rq_deadline); + } + do_gettimeofday(&work_end); timediff = cfs_timeval_sub(&work_end, &work_start, NULL); - - if (unlikely(timediff / 1000000 > (long)obd_timeout)) - CERROR("request "LPU64" opc %u from %s processed in %lds " - "trans "LPU64" rc %d/%d\n", - request->rq_xid, - request->rq_reqmsg ? - lustre_msg_get_opc(request->rq_reqmsg) : 0, - libcfs_id2str(request->rq_peer), - cfs_timeval_sub(&work_end, &request->rq_arrival_time, - NULL) / 1000000, - reply ? lustre_msg_get_transno(request->rq_repmsg) : - request->rq_transno, request->rq_status, - reply ? lustre_msg_get_status(request->rq_repmsg) : -999); - else - CDEBUG(D_RPCTRACE,"request "LPU64" opc %u from %s processed in " - "%ldus (%ldus total) trans "LPU64" rc %d/%d\n", - request->rq_xid, - request->rq_reqmsg ? 
- lustre_msg_get_opc(request->rq_reqmsg) : 0, - libcfs_id2str(request->rq_peer), timediff, - cfs_timeval_sub(&work_end, &request->rq_arrival_time, - NULL), - request->rq_transno, request->rq_status, - reply ? lustre_msg_get_status(request->rq_repmsg) : -999); - + CDEBUG(D_RPCTRACE, "request x"LPU64" opc %u from %s processed in " + "%ldus (%ldus total) trans "LPU64" rc %d/%d\n", + request->rq_xid, lustre_msg_get_opc(request->rq_reqmsg), + libcfs_id2str(request->rq_peer), timediff, + cfs_timeval_sub(&work_end, &request->rq_arrival_time, NULL), + request->rq_repmsg ? lustre_msg_get_transno(request->rq_repmsg) : + request->rq_transno, request->rq_status, + request->rq_repmsg ? lustre_msg_get_status(request->rq_repmsg): + -999); if (likely(svc->srv_stats != NULL && request->rq_reqmsg != NULL)) { __u32 op = lustre_msg_get_opc(request->rq_reqmsg); int opc = opcode_offset(op); @@ -766,6 +1179,12 @@ out_stat: timediff); } } + if (unlikely(request->rq_early_count)) { + DEBUG_REQ(D_ADAPTTO, request, + "sent %d early replies before finishing in %lds", + request->rq_early_count, + work_end.tv_sec - request->rq_arrival_time.tv_sec); + } out_req: ptlrpc_server_free_request(request); @@ -894,7 +1313,9 @@ liblustre_check_services (void *arg) svc->srv_threads_running++; do { - rc = ptlrpc_server_handle_reply(svc); + rc = ptlrpc_server_handle_req_in(svc); + rc |= ptlrpc_server_handle_reply(svc); + rc |= ptlrpc_at_check_timed(svc); rc |= ptlrpc_server_handle_request(svc, NULL); rc |= (ptlrpc_server_post_idle_rqbds(svc) > 0); did_something |= rc; @@ -964,7 +1385,7 @@ static int ptlrpc_main(void *arg) struct group_info *ginfo = NULL; #endif struct lu_env env; - int rc = 0; + int counter = 0, rc = 0; ENTRY; ptlrpc_daemonize(data->name); @@ -1025,7 +1446,9 @@ static int ptlrpc_main(void *arg) */ cfs_waitq_signal(&thread->t_ctl_waitq); - watchdog = lc_watchdog_add(svc->srv_watchdog_timeout, NULL, NULL); + watchdog = lc_watchdog_add(max_t(int, obd_timeout, AT_OFF ? 
0 : + at_get(&svc->srv_at_estimate)) * + svc->srv_watchdog_factor, NULL, NULL); spin_lock(&svc->srv_lock); svc->srv_threads_running++; @@ -1053,14 +1476,18 @@ static int ptlrpc_main(void *arg) svc->srv_n_difficult_replies == 0) || (!list_empty(&svc->srv_idle_rqbds) && svc->srv_rqbd_timeout == 0) || - !list_empty (&svc->srv_reply_queue) || - (!list_empty (&svc->srv_request_queue) && - (svc->srv_n_difficult_replies == 0 || - svc->srv_n_active_reqs < - (svc->srv_threads_running - 1))), + !list_empty(&svc->srv_req_in_queue) || + !list_empty(&svc->srv_reply_queue) || + (!list_empty(&svc->srv_request_queue) && + (svc->srv_n_active_reqs < + (svc->srv_threads_running - 1))) || + svc->srv_at_check, &lwi); - lc_watchdog_touch(watchdog); + lc_watchdog_touch_ms(watchdog, max_t(int, obd_timeout, + AT_OFF ? 0 : + at_get(&svc->srv_at_estimate)) * + svc->srv_watchdog_factor); ptlrpc_check_rqbd_pool(svc); @@ -1070,15 +1497,24 @@ static int ptlrpc_main(void *arg) ptlrpc_start_thread(dev, svc); } - if (!list_empty (&svc->srv_reply_queue)) - ptlrpc_server_handle_reply (svc); + if (!list_empty(&svc->srv_reply_queue)) + ptlrpc_server_handle_reply(svc); - /* only handle requests if there are no difficult replies - * outstanding, or I'm not the last thread handling - * requests */ + if (!list_empty(&svc->srv_req_in_queue)) { + /* Process all incoming reqs before handling any */ + ptlrpc_server_handle_req_in(svc); + /* but limit ourselves in case of flood */ + if (counter++ < 1000) + continue; + counter = 0; + } + + if (svc->srv_at_check) + ptlrpc_at_check_timed(svc); + + /* don't handle requests in the last thread */ if (!list_empty (&svc->srv_request_queue) && - (svc->srv_n_difficult_replies == 0 || - svc->srv_n_active_reqs < (svc->srv_threads_running - 1))) { + (svc->srv_n_active_reqs < (svc->srv_threads_running - 1))) { lu_context_enter(&env.le_ctx); ptlrpc_server_handle_request(svc, thread); lu_context_exit(&env.le_ctx); @@ -1161,7 +1597,9 @@ int ptlrpc_start_threads(struct obd_device 
*dev, struct ptlrpc_service *svc) int i, rc = 0; ENTRY; - LASSERT(svc->srv_threads_min > 0); + /* We require 2 threads min - see note in + ptlrpc_server_handle_request */ + LASSERT(svc->srv_threads_min >= 2); for (i = 0; i < svc->srv_threads_min; i++) { rc = ptlrpc_start_thread(dev, svc); /* We have enough threads, don't start more. b=15759 */ @@ -1247,6 +1685,8 @@ int ptlrpc_unregister_service(struct ptlrpc_service *service) struct list_head *tmp; struct ptlrpc_reply_state *rs, *t; + cfs_timer_disarm(&service->srv_at_timer); + ptlrpc_stop_all_threads(service); LASSERT(list_empty(&service->srv_threads)); @@ -1288,7 +1728,7 @@ int ptlrpc_unregister_service(struct ptlrpc_service *service) /* Network access will complete in finite time but the HUGE * timeout lets us CWARN for visibility of sluggish NALs */ - lwi = LWI_TIMEOUT(cfs_time_seconds(300), NULL, NULL); + lwi = LWI_TIMEOUT(cfs_time_seconds(LONG_UNLINK), NULL, NULL); rc = l_wait_event(service->srv_waitq, service->srv_nrqbd_receiving == 0, &lwi); @@ -1310,6 +1750,17 @@ int ptlrpc_unregister_service(struct ptlrpc_service *service) /* purge the request queue. 
NB No new replies (rqbds all unlinked) * and no service threads, so I'm the only thread noodling the * request queue now */ + while (!list_empty(&service->srv_req_in_queue)) { + struct ptlrpc_request *req = + list_entry(service->srv_req_in_queue.next, + struct ptlrpc_request, + rq_list); + + list_del(&req->rq_list); + service->srv_n_queued_reqs--; + service->srv_n_active_reqs++; + ptlrpc_server_free_request(req); + } while (!list_empty(&service->srv_request_queue)) { struct ptlrpc_request *req = list_entry(service->srv_request_queue.next, @@ -1359,6 +1810,9 @@ int ptlrpc_unregister_service(struct ptlrpc_service *service) OBD_FREE(rs, service->srv_max_reply_size); } + /* In case somebody rearmed this in the meantime */ + cfs_timer_disarm(&service->srv_at_timer); + OBD_FREE_PTR(service); return 0; } @@ -1372,31 +1826,31 @@ int ptlrpc_service_health_check(struct ptlrpc_service *svc) { struct ptlrpc_request *request; struct timeval right_now; - long timediff, cutoff; - int rc = 0; + long timediff; if (svc == NULL) return 0; - spin_lock(&svc->srv_lock); + do_gettimeofday(&right_now); - if (list_empty(&svc->srv_request_queue)) - goto out; + spin_lock(&svc->srv_lock); + if (list_empty(&svc->srv_request_queue)) { + spin_unlock(&svc->srv_lock); + return 0; + } + /* How long has the next entry been waiting? */ request = list_entry(svc->srv_request_queue.next, struct ptlrpc_request, rq_list); - - do_gettimeofday(&right_now); timediff = cfs_timeval_sub(&right_now, &request->rq_arrival_time, NULL); + spin_unlock(&svc->srv_lock); - cutoff = obd_health_check_timeout; - - if (timediff / 1000000 > cutoff) { - rc = -1; - goto out; + if ((timediff / ONE_MILLION) > (AT_OFF ? 
obd_timeout * 3/2 : + at_max)) { + CERROR("%s: unhealthy - request has been waiting %lds\n", + svc->srv_name, timediff / ONE_MILLION); + return (-1); } - out: - spin_unlock(&svc->srv_lock); - return rc; + return 0; } diff --git a/lustre/ptlrpc/wiretest.c b/lustre/ptlrpc/wiretest.c index bb2db8c..6ebc60f 100644 --- a/lustre/ptlrpc/wiretest.c +++ b/lustre/ptlrpc/wiretest.c @@ -39,6 +39,8 @@ void lustre_assert_wire_constants(void) (long long)LUSTRE_MSG_MAGIC_V2); LASSERTF(PTLRPC_MSG_VERSION == 0x00000003," found %lld\n", (long long)PTLRPC_MSG_VERSION); + LASSERTF(MSGHDR_AT_SUPPORT == 1, " found %lld\n", + (long long)MSGHDR_AT_SUPPORT); LASSERTF(PTL_RPC_MSG_REQUEST == 4711, " found %lld\n", (long long)PTL_RPC_MSG_REQUEST); LASSERTF(PTL_RPC_MSG_ERR == 4712, " found %lld\n", @@ -264,14 +266,14 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct lustre_msg_v2, lm_repsize)); LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize) == 4, " found %lld\n", (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize)); - LASSERTF((int)offsetof(struct lustre_msg_v2, lm_timeout) == 16, " found %lld\n", - (long long)(int)offsetof(struct lustre_msg_v2, lm_timeout)); - LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_timeout) == 4, " found %lld\n", - (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_timeout)); - LASSERTF((int)offsetof(struct lustre_msg_v2, lm_padding_1) == 20, " found %lld\n", - (long long)(int)offsetof(struct lustre_msg_v2, lm_padding_1)); - LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_1) == 4, " found %lld\n", - (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_1)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_cksum) == 16, " found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_cksum)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum) == 4, " found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum)); + LASSERTF((int)offsetof(struct 
lustre_msg_v2, lm_flags) == 20, " found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_flags)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_flags) == 4, " found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_flags)); LASSERTF((int)offsetof(struct lustre_msg_v2, lm_padding_2) == 24, " found %lld\n", (long long)(int)offsetof(struct lustre_msg_v2, lm_padding_2)); LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_2) == 4, " found %lld\n", @@ -336,14 +338,14 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct ptlrpc_body, pb_conn_cnt)); LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_conn_cnt) == 4, " found %lld\n", (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_conn_cnt)); - LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_1) == 68, " found %lld\n", - (long long)(int)offsetof(struct ptlrpc_body, pb_padding_1)); - LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_padding_1) == 4, " found %lld\n", - (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_padding_1)); - LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_2) == 72, " found %lld\n", - (long long)(int)offsetof(struct ptlrpc_body, pb_padding_2)); - LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_padding_2) == 4, " found %lld\n", - (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_padding_2)); + LASSERTF((int)offsetof(struct ptlrpc_body, pb_timeout) == 68, " found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body, pb_timeout)); + LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_timeout) == 4, " found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_timeout)); + LASSERTF((int)offsetof(struct ptlrpc_body, pb_service_time) == 72, " found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body, pb_service_time)); + LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_service_time) == 4, " found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_service_time)); 
LASSERTF((int)offsetof(struct ptlrpc_body, pb_slv) == 80, " found %lld\n", (long long)(int)offsetof(struct ptlrpc_body, pb_slv)); LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_slv) == 8, " found %lld\n", diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index 7b7a978..b5f812d 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -194,6 +194,14 @@ test_16() { run_test 16 "timeout bulk put, don't evict client (2732)" test_17() { + local at_max_saved=0 + + # With adaptive timeouts, bulk_get won't expire until adaptive_timeout_max + if at_is_valid && at_is_enabled; then + at_max_saved=$(at_max_get ost1) + at_max_set $TIMEOUT ost1 + fi + # OBD_FAIL_PTLRPC_BULK_GET_NET 0x0503 | OBD_FAIL_ONCE # OST bulk will time out here, client retries do_facet ost1 lctl set_param fail_loc=0x80000503 @@ -201,12 +209,16 @@ test_17() { do_facet client cp /etc/termcap $DIR/$tfile sync - sleep $TIMEOUT + # with AT, client will wait adaptive_max*factor+net_latency before + # expiring the req, hopefully timeout*2 is enough + sleep $(($TIMEOUT*2)) + do_facet ost1 lctl set_param fail_loc=0 do_facet client "df $DIR" # expect cmp to succeed, client resent bulk do_facet client "cmp /etc/termcap $DIR/$tfile" || return 3 do_facet client "rm $DIR/$tfile" || return 4 + [ $at_max_saved -ne 0 ] && $(at_max_set $at_max_saved ost1) return 0 } run_test 17 "timeout bulk get, don't evict client (2732)" @@ -599,11 +611,11 @@ test_26a() { # was test_26 bug 5921 - evict dead exports by pinger echo starting with $OST_NEXP1 OST exports # OBD_FAIL_PTLRPC_DROP_RPC 0x505 do_facet client lctl set_param fail_loc=0x505 - # evictor takes up to 2.25x to evict. But if there's a - # race to start the evictor from various obds, the loser - # might have to wait for the next ping. - echo Waiting for $(($TIMEOUT * 4)) secs - sleep $(($TIMEOUT * 4)) + # evictor takes PING_EVICT_TIMEOUT + 3 * PING_INTERVAL to evict. 
+ # But if there's a race to start the evictor from various obds, + # the loser might have to wait for the next ping. + echo Waiting for $(($TIMEOUT * 8)) secs + sleep $(($TIMEOUT * 8)) OST_EXP="`do_facet ost1 lctl get_param -n $OST_FILE`" OST_NEXP2=`echo $OST_EXP | cut -d' ' -f2` echo ending with $OST_NEXP2 OST exports @@ -619,18 +631,18 @@ test_26b() { # bug 10140 - evict dead exports by pinger sleep 1 # wait connections being established MDS_FILE=mdt.${mds1_svc}.num_exports MDS_NEXP1="`do_facet $SINGLEMDS lctl get_param -n $MDS_FILE | cut -d' ' -f2`" - OST_FILE=obdfilter.${ost1_svc}.num_exports + OST_FILE=obdfilter.${ost1_svc}.num_exports OST_NEXP1="`do_facet ost1 lctl get_param -n $OST_FILE | cut -d' ' -f2`" - echo starting with $OST_NEXP1 OST and $MDS_NEXP1 MDS exports - zconf_umount `hostname` $MOUNT2 -f - # evictor takes up to 2.25x to evict. But if there's a - # race to start the evictor from various obds, the loser - # might have to wait for the next ping. - echo Waiting for $(($TIMEOUT * 4)) secs - sleep $(($TIMEOUT * 4)) + echo starting with $OST_NEXP1 OST and $MDS_NEXP1 MDS exports + zconf_umount `hostname` $MOUNT2 -f + # evictor takes PING_EVICT_TIMEOUT + 3 * PING_INTERVAL to evict. + # But if there's a race to start the evictor from various obds, + # the loser might have to wait for the next ping. 
+ echo Waiting for $(($TIMEOUT * 3)) secs + sleep $(($TIMEOUT * 3)) OST_NEXP2="`do_facet ost1 lctl get_param -n $OST_FILE | cut -d' ' -f2`" MDS_NEXP2="`do_facet $SINGLEMDS lctl get_param -n $MDS_FILE | cut -d' ' -f2`" - echo ending with $OST_NEXP2 OST and $MDS_NEXP2 MDS exports + echo ending with $OST_NEXP2 OST and $MDS_NEXP2 MDS exports [ $OST_NEXP1 -le $OST_NEXP2 ] && error "client not evicted from OST" [ $MDS_NEXP1 -le $MDS_NEXP2 ] && error "client not evicted from MDS" return 0 diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index da1c31e..597905c 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -920,15 +920,27 @@ test_43() { # bug 2530 run_test 43 "mds osc import failure during recovery; don't LBUG" test_44a() { # was test_44 + local at_max_saved=0 + mdcdev=`lctl get_param -n devices | awk '/MDT0000-mdc-/ {print $1}'` [ "$mdcdev" ] || exit 2 + + # adaptive timeouts slow this way down + if at_is_valid && at_is_enabled; then + at_max_saved=$(at_max_get mds) + at_max_set 40 mds + fi + for i in `seq 1 10`; do + echo "$i of 10 ($(date +%s))" + do_facet mds "grep service $LPROC/mdt/MDS/mds/timeouts" #define OBD_FAIL_TGT_CONN_RACE 0x701 do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000701" $LCTL --device $mdcdev recover df $MOUNT done do_facet $SINGLEMDS "lctl set_param fail_loc=0" + [ $at_max_saved -ne 0 ] && at_max_set $at_max_saved mds return 0 } run_test 44a "race in target handle connect" @@ -937,6 +949,8 @@ test_44b() { mdcdev=`lctl get_param -n devices | awk '/MDT0000-mdc-/ {print $1}'` [ "$mdcdev" ] || exit 2 for i in `seq 1 10`; do + echo "$i of 10 ($(date +%s))" + do_facet mds "grep service $LPROC/mdt/MDS/mds/timeouts" #define OBD_FAIL_TGT_DELAY_RECONNECT 0x704 do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000704" $LCTL --device $mdcdev recover @@ -1415,6 +1429,220 @@ test_61c() { } run_test 61c "test race mds llog sync vs llog cleanup" +test_62() { # Bug 15756 - don't mis-drop resent replay 
+ replay_barrier $SINGLEMDS + createmany -o $DIR/$tdir/$tfile- 25 +#define OBD_FAIL_TGT_REPLAY_DROP 0x706 + do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000707" + facet_failover $SINGLEMDS + df $MOUNT || return 1 + do_facet $SINGLEMDS "lctl set_param fail_loc=0" + unlinkmany $DIR/$tdir/$tfile- 25 || return 2 + return 0 +} +run_test 62 "don't mis-drop resent replay" + +#Adaptive Timeouts (bug 3055) +AT_MAX_SET=0 + +at_start() +{ + if ! at_is_valid; then + skip "AT env is invalid" + return 1 + fi + + if ! at_is_enabled; then + echo "AT is disabled, enable it by force temporarily" + at_max_set 600 mds ost client + AT_MAX_SET=1 + fi + + if [ -z "$ATOLDBASE" ]; then + local at_history=$(do_facet mds "find /sys/ -name at_history") + [ -z "$at_history" ] && skip "missing /sys/.../at_history " && return 1 + ATOLDBASE=$(do_facet mds "cat $at_history") + # speed up the timebase so we can check decreasing AT + do_facet mds "echo 8 >> $at_history" + do_facet ost1 "echo 8 >> $at_history" + fi +} + +test_65a() #bug 3055 +{ + at_start || return 0 + $LCTL dk > /dev/null + debugsave + sysctl -w lnet.debug="+other" + # slow down a request + do_facet mds sysctl -w lustre.fail_val=30000 +#define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a + do_facet mds sysctl -w lustre.fail_loc=0x8000050a + createmany -o $DIR/$tfile 10 > /dev/null + unlinkmany $DIR/$tfile 10 > /dev/null + # check for log message + $LCTL dk | grep "Early reply #" || error "No early reply" + debugrestore + # client should show 30s estimates + grep portal $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts + sleep 9 + grep portal $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts +} +run_test 65a "AT: verify early replies" + +test_65b() #bug 3055 +{ + at_start || return 0 + # turn on D_ADAPTTO + debugsave + sysctl -w lnet.debug="other trace" + $LCTL dk > /dev/null + # slow down bulk i/o + do_facet ost1 sysctl -w lustre.fail_val=30 +#define OBD_FAIL_OST_BRW_PAUSE_PACK 0x224 + do_facet ost1 sysctl -w lustre.fail_loc=0x224 + + rm -f $DIR/$tfile 
+ lfs setstripe $DIR/$tfile --index=0 --count=1 + # force some real bulk transfer + multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c + + do_facet ost1 sysctl -w lustre.fail_loc=0 + # check for log message + $LCTL dk | grep "Early reply #" || error "No early reply" + debugrestore + # client should show 30s estimates + grep portal $LPROC/osc/${FSNAME}-OST0000-osc-*/timeouts +} +run_test 65b "AT: verify early replies on packed reply / bulk" + +test_66a() #bug 3055 +{ + at_start || return 0 + grep "portal 12" $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts + # adjust 5s at a time so no early reply is sent (within deadline) + do_facet mds "sysctl -w lustre.fail_val=5000" +#define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a + do_facet mds "sysctl -w lustre.fail_loc=0x8000050a" + createmany -o $DIR/$tfile 20 > /dev/null + unlinkmany $DIR/$tfile 20 > /dev/null + grep "portal 12" $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts + do_facet mds "sysctl -w lustre.fail_val=10000" + do_facet mds "sysctl -w lustre.fail_loc=0x8000050a" + createmany -o $DIR/$tfile 20 > /dev/null + unlinkmany $DIR/$tfile 20 > /dev/null + grep "portal 12" $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts + do_facet mds "sysctl -w lustre.fail_loc=0" + sleep 9 + createmany -o $DIR/$tfile 20 > /dev/null + unlinkmany $DIR/$tfile 20 > /dev/null + grep portal $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts | grep "portal 12" + CUR=$(awk '/portal 12/ {print $5}' $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts) + WORST=$(awk '/portal 12/ {print $7}' $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts) + echo "Current MDT timeout $CUR, worst $WORST" + [ $CUR -lt $WORST ] || error "Current $CUR should be less than worst $WORST" +} +run_test 66a "AT: verify MDT service time adjusts with no early replies" + +test_66b() #bug 3055 +{ + at_start || return 0 + ORIG=$(awk '/network/ {print $4}' $LPROC/mdc/lustre-*/timeouts) + sysctl -w lustre.fail_val=$(($ORIG + 5)) +#define OBD_FAIL_PTLRPC_PAUSE_REP 0x50c + sysctl -w lustre.fail_loc=0x50c + ls $DIR/$tfile > 
/dev/null 2>&1 + sysctl -w lustre.fail_loc=0 + CUR=$(awk '/network/ {print $4}' $LPROC/mdc/${FSNAME}-*/timeouts) + WORST=$(awk '/network/ {print $6}' $LPROC/mdc/${FSNAME}-*/timeouts) + echo "network timeout orig $ORIG, cur $CUR, worst $WORST" + [ $WORST -gt $ORIG ] || error "Worst $WORST should be worse than orig $ORIG" +} +run_test 66b "AT: verify net latency adjusts" + +test_67a() #bug 3055 +{ + at_start || return 0 + CONN1=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats) + # sleeping threads may drive values above this + do_facet ost1 "sysctl -w lustre.fail_val=400" +#define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a + do_facet ost1 "sysctl -w lustre.fail_loc=0x50a" + createmany -o $DIR/$tfile 20 > /dev/null + unlinkmany $DIR/$tfile 20 > /dev/null + do_facet ost1 "sysctl -w lustre.fail_loc=0" + CONN2=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats) + ATTEMPTS=$(($CONN2 - $CONN1)) + echo "$ATTEMPTS osc reconnect attemps on gradual slow" + [ $ATTEMPTS -gt 0 ] && error_ignore 13721 "AT should have prevented reconnect" + return 0 +} +run_test 67a "AT: verify slow request processing doesn't induce reconnects" + +test_67b() #bug 3055 +{ + at_start || return 0 + CONN1=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats) +#define OBD_FAIL_OST_PAUSE_CREATE 0x223 + do_facet ost1 "sysctl -w lustre.fail_val=20000" + do_facet ost1 "sysctl -w lustre.fail_loc=0x80000223" + cp /etc/profile $DIR/$tfile || error "cp failed" + client_reconnect + cat $LPROC/ost/OSS/ost_create/timeouts + log "phase 2" + CONN2=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats) + ATTEMPTS=$(($CONN2 - $CONN1)) + echo "$ATTEMPTS osc reconnect attemps on instant slow" + # do it again; should not timeout + do_facet ost1 "sysctl -w lustre.fail_loc=0x80000223" + cp /etc/profile $DIR/$tfile || error "cp failed" + do_facet ost1 "sysctl -w lustre.fail_loc=0" + client_reconnect + cat $LPROC/ost/OSS/ost_create/timeouts + CONN3=$(awk '/_connect/ 
{total+=$2} END {print total}' $LPROC/osc/*/stats) + ATTEMPTS=$(($CONN3 - $CONN2)) + echo "$ATTEMPTS osc reconnect attemps on 2nd slow" + [ $ATTEMPTS -gt 0 ] && error "AT should have prevented reconnect" + return 0 +} +run_test 67b "AT: verify instant slowdown doesn't induce reconnects" + +test_68 () #bug 13813 +{ + at_start || return 0 + local ldlm_enqueue_min=$(find /sys -name ldlm_enqueue_min) + [ -z "$ldlm_enqueue_min" ] && skip "missing /sys/.../ldlm_enqueue_min" && return 0 + local ENQ_MIN=$(cat $ldlm_enqueue_min) + echo $TIMEOUT >> $ldlm_enqueue_min + rm -f $DIR/${tfile}_[1-2] + lfs setstripe $DIR/$tfile --index=0 --count=1 +#define OBD_FAIL_LDLM_PAUSE_CANCEL 0x312 + sysctl -w lustre.fail_val=$(($TIMEOUT - 1)) + sysctl -w lustre.fail_loc=0x80000312 + cp /etc/profile $DIR/${tfile}_1 || error "1st cp failed $?" + sysctl -w lustre.fail_val=$((TIMEOUT * 3 / 2)) + sysctl -w lustre.fail_loc=0x80000312 + cp /etc/profile $DIR/${tfile}_2 || error "2nd cp failed $?" + sysctl -w lustre.fail_loc=0 + echo $ENQ_MIN >> $ldlm_enqueue_min + return 0 +} +run_test 68 "AT: verify slowing locks" + +if [ -n "$ATOLDBASE" ]; then + at_history=$(do_facet mds "find /sys/ -name at_history") + do_facet mds "echo $ATOLDBASE >> $at_history" || true + do_facet ost1 "echo $ATOLDBASE >> $at_history" || true +fi + +if [ $AT_MAX_SET -ne 0 ]; then + echo "restore AT status to be disabled" + at_max_set 0 mds ost client +fi + +# end of AT tests includes above lines + + # start multi-client tests test_70a () { [ -z "$CLIENTS" ] && \ diff --git a/lustre/tests/sanity-gss.sh b/lustre/tests/sanity-gss.sh index f4346df..f1a9c26 100644 --- a/lustre/tests/sanity-gss.sh +++ b/lustre/tests/sanity-gss.sh @@ -332,7 +332,7 @@ set_flavor_all() start_dbench() { NPROC=`cat /proc/cpuinfo 2>/dev/null | grep ^processor | wc -l` - [ $NPROC -lt 2 ] && NPROC=2 + [ $NPROC -gt 2 ] && NPROC=2 sh rundbench $NPROC 1>/dev/null & DBENCH_PID=$! 
sleep 2 @@ -589,10 +589,15 @@ test_5() { run_test 5 "lsvcgssd dead, operations lead to recovery" test_6() { + local nfile=10 + mkdir $DIR/d6 || error "mkdir $DIR/d6 failed" - cp -a /etc/* $DIR/d6/ || error "cp failed" + for ((i=0; i<$nfile; i++)); do + dd if=/dev/zero of=$DIR/d6/file$i bs=8k count=1 || error "dd file$i failed" + done ls -l $DIR/d6/* > /dev/null || error "ls failed" rm -rf $DIR2/d6/* || error "rm failed" + rmdir $DIR2/d6/ || error "rmdir failed" } run_test 6 "test basic DLM callback works" @@ -629,7 +634,37 @@ test_7() { } run_test 7 "exercise enlarge_reqbuf()" -test_8() { +test_8() +{ + debugsave + sysctl -w lnet.debug="other" + $LCTL dk > /dev/null + + # sleep sometime in ctx handle + do_facet mds sysctl -w lustre.fail_val=60 +#define OBD_FAIL_SEC_CTX_HDL_PAUSE 0x1204 + do_facet mds sysctl -w lustre.fail_loc=0x1204 + + $RUNAS $LFS flushctx || error "can't flush ctx" + + $RUNAS df $DIR & + DFPID=$! + echo "waiting df (pid $TOUCHPID) to finish..." + sleep 2 # give df a chance to really trigger context init rpc + do_facet mds sysctl -w lustre.fail_loc=0 + wait $DFPID || error "df should have succeeded" + + $LCTL dk | grep "Early reply #" || error "No early reply" + debugrestore +} +run_test 8 "Early reply sent for slow gss context negotiation" + +# +# following tests will manipulate flavors and may end with any flavor set, +# so each test should not assume any start flavor. 
+# + +test_50() { local sample=$TMP/sanity-gss-8 local tdir=$MOUNT/dir8 local iosize="256K" @@ -657,9 +692,9 @@ test_8() { rm -rf $tdir rm -f $sample } -run_test 8 "verify bulk hash algorithms works" +run_test 50 "verify bulk hash algorithms works" -test_9() { +test_51() { local s1=$TMP/sanity-gss-9.1 local s2=$TMP/sanity-gss-9.2 local s3=$TMP/sanity-gss-9.3 @@ -719,7 +754,7 @@ test_9() { rm -rf $tdir rm -f $sample } -run_test 9 "bulk data alignment test under encryption mode" +run_test 51 "bulk data alignment test under encryption mode" test_90() { if [ "$SLOW" = "no" ]; then diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index f2fc49d..e3e4f14 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -3447,6 +3447,7 @@ test_77j() { # bug 13805 lctl set_param fail_loc=0x40c remount_client $MOUNT lctl set_param fail_loc=0 + sleep 2 # wait async osc connect to finish for VALUE in `lctl get_param osc.*osc-[^mM]*.checksum_type`; do PARAM=`echo ${VALUE[0]} | cut -d "=" -f1` algo=`lctl get_param -n $PARAM | sed 's/.*\[\(.*\)\].*/\1/g'` diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index 22398ef..eab0e0a 100644 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -124,6 +124,7 @@ init_test_env() { export KRB5DIR=${KRB5DIR:-"/usr/kerberos"} export DIR2 export SAVE_PWD=${SAVE_PWD:-$LUSTRE/tests} + export AT_MAX_PATH if [ "$ACCEPTOR_PORT" ]; then export PORT_OPT="--port $ACCEPTOR_PORT" @@ -1246,6 +1247,56 @@ absolute_path() { } ################################## +# Adaptive Timeouts funcs + +at_is_valid() { + if [ -z "$AT_MAX_PATH" ]; then + AT_MAX_PATH=$(do_facet mds "find /sys/ -name at_max") + [ -z "$AT_MAX_PATH" ] && echo "missing /sys/.../at_max " && return 1 + fi + return 0 +} + +at_is_enabled() { + at_is_valid || error "invalid call" + + # only check mds, we assume at_max is the same on all nodes + local at_max=$(do_facet mds "cat $AT_MAX_PATH") + if [ $at_max -eq 0 ]; then + return 1 + 
else + return 0 + fi +} + +at_max_get() { + at_is_valid || error "invalid call" + + do_facet $1 "cat $AT_MAX_PATH" +} + +at_max_set() { + local at_max=$1 + shift + + at_is_valid || error "invalid call" + + for facet in $@; do + if [ $facet == "ost" ]; then + for i in `seq $OSTCOUNT`; do + do_facet ost$i "echo $at_max > $AT_MAX_PATH" + done + elif [ $facet == "mds" ]; then + for i in `seq $MDSCOUNT`; do + do_facet mds$i "echo $at_max > $AT_MAX_PATH" + done + else + do_facet $facet "echo $at_max > $AT_MAX_PATH" + fi + done +} + +################################## # OBD_FAIL funcs drop_request() { diff --git a/lustre/utils/gss/.cvsignore b/lustre/utils/gss/.cvsignore index 15f680f..6359b75 100644 --- a/lustre/utils/gss/.cvsignore +++ b/lustre/utils/gss/.cvsignore @@ -7,5 +7,6 @@ TAGS lgssd lsvcgssd l_idmap +lgss_keyring .*.cmd .*.d diff --git a/lustre/utils/wirecheck.c b/lustre/utils/wirecheck.c index d5d2d09..e8fe257 100644 --- a/lustre/utils/wirecheck.c +++ b/lustre/utils/wirecheck.c @@ -110,8 +110,8 @@ check_lustre_msg_v2(void) CHECK_MEMBER(lustre_msg_v2, lm_secflvr); CHECK_MEMBER(lustre_msg_v2, lm_magic); CHECK_MEMBER(lustre_msg_v2, lm_repsize); - CHECK_MEMBER(lustre_msg_v2, lm_timeout); - CHECK_MEMBER(lustre_msg_v2, lm_padding_1); + CHECK_MEMBER(lustre_msg_v2, lm_cksum); + CHECK_MEMBER(lustre_msg_v2, lm_flags); CHECK_MEMBER(lustre_msg_v2, lm_padding_2); CHECK_MEMBER(lustre_msg_v2, lm_padding_3); CHECK_MEMBER(lustre_msg_v2, lm_buflens[0]); @@ -134,8 +134,8 @@ check_ptlrpc_body(void) CHECK_MEMBER(ptlrpc_body, pb_flags); CHECK_MEMBER(ptlrpc_body, pb_op_flags); CHECK_MEMBER(ptlrpc_body, pb_conn_cnt); - CHECK_MEMBER(ptlrpc_body, pb_padding_1); - CHECK_MEMBER(ptlrpc_body, pb_padding_2); + CHECK_MEMBER(ptlrpc_body, pb_timeout); + CHECK_MEMBER(ptlrpc_body, pb_service_time); CHECK_MEMBER(ptlrpc_body, pb_slv); CHECK_MEMBER(ptlrpc_body, pb_limit); } @@ -1102,6 +1102,7 @@ main(int argc, char **argv) COMMENT("Constants..."); CHECK_DEFINE(LUSTRE_MSG_MAGIC_V2); 
CHECK_DEFINE(PTLRPC_MSG_VERSION); + CHECK_VALUE(MSGHDR_AT_SUPPORT); CHECK_VALUE(PTL_RPC_MSG_REQUEST); CHECK_VALUE(PTL_RPC_MSG_ERR); diff --git a/lustre/utils/wiretest.c b/lustre/utils/wiretest.c index a8af0a7..449f52e 100644 --- a/lustre/utils/wiretest.c +++ b/lustre/utils/wiretest.c @@ -37,6 +37,8 @@ void lustre_assert_wire_constants(void) (long long)LUSTRE_MSG_MAGIC_V2); LASSERTF(PTLRPC_MSG_VERSION == 0x00000003," found %lld\n", (long long)PTLRPC_MSG_VERSION); + LASSERTF(MSGHDR_AT_SUPPORT == 1, " found %lld\n", + (long long)MSGHDR_AT_SUPPORT); LASSERTF(PTL_RPC_MSG_REQUEST == 4711, " found %lld\n", (long long)PTL_RPC_MSG_REQUEST); LASSERTF(PTL_RPC_MSG_ERR == 4712, " found %lld\n", @@ -262,14 +264,14 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct lustre_msg_v2, lm_repsize)); LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize) == 4, " found %lld\n", (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize)); - LASSERTF((int)offsetof(struct lustre_msg_v2, lm_timeout) == 16, " found %lld\n", - (long long)(int)offsetof(struct lustre_msg_v2, lm_timeout)); - LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_timeout) == 4, " found %lld\n", - (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_timeout)); - LASSERTF((int)offsetof(struct lustre_msg_v2, lm_padding_1) == 20, " found %lld\n", - (long long)(int)offsetof(struct lustre_msg_v2, lm_padding_1)); - LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_1) == 4, " found %lld\n", - (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_1)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_cksum) == 16, " found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_cksum)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum) == 4, " found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_flags) == 20, " found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, 
lm_flags)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_flags) == 4, " found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_flags)); LASSERTF((int)offsetof(struct lustre_msg_v2, lm_padding_2) == 24, " found %lld\n", (long long)(int)offsetof(struct lustre_msg_v2, lm_padding_2)); LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_2) == 4, " found %lld\n", @@ -334,14 +336,14 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct ptlrpc_body, pb_conn_cnt)); LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_conn_cnt) == 4, " found %lld\n", (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_conn_cnt)); - LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_1) == 68, " found %lld\n", - (long long)(int)offsetof(struct ptlrpc_body, pb_padding_1)); - LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_padding_1) == 4, " found %lld\n", - (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_padding_1)); - LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_2) == 72, " found %lld\n", - (long long)(int)offsetof(struct ptlrpc_body, pb_padding_2)); - LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_padding_2) == 4, " found %lld\n", - (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_padding_2)); + LASSERTF((int)offsetof(struct ptlrpc_body, pb_timeout) == 68, " found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body, pb_timeout)); + LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_timeout) == 4, " found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_timeout)); + LASSERTF((int)offsetof(struct ptlrpc_body, pb_service_time) == 72, " found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body, pb_service_time)); + LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_service_time) == 4, " found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_service_time)); LASSERTF((int)offsetof(struct ptlrpc_body, pb_slv) == 80, " found %lld\n", (long long)(int)offsetof(struct ptlrpc_body, 
pb_slv)); LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_slv) == 8, " found %lld\n", -- 1.8.3.1