From c2851054f0e5b1c2976a865ee66a8f941d7a31a5 Mon Sep 17 00:00:00 2001 From: zhanghc Date: Tue, 13 Jan 2009 15:00:50 +0000 Subject: [PATCH] b=17887 1, print the actual time to wait for lock cancellation in "waiting_locks_callback" 2, compare resource in "ost_rw_hpreq_lock_match" 3, replace "LDLM_ERROR" with "LDLM_WARN" if a lock is completed after waiting more than obd_timeout i=vitaly.fertman i=johann --- lustre/include/lustre_dlm.h | 9 ++++++--- lustre/ldlm/ldlm_lock.c | 2 +- lustre/ldlm/ldlm_lockd.c | 24 ++++++++++++------------ lustre/ldlm/ldlm_request.c | 13 ++++++------- lustre/ost/ost_handler.c | 6 +++++- 5 files changed, 30 insertions(+), 24 deletions(-) diff --git a/lustre/include/lustre_dlm.h b/lustre/include/lustre_dlm.h index 94f7e3b..b8df94d 100644 --- a/lustre/include/lustre_dlm.h +++ b/lustre/include/lustre_dlm.h @@ -532,8 +532,8 @@ struct ldlm_lock { * it's no longer in use. If the lock is not granted, a process sleeps * on this waitq to learn when it becomes granted. */ cfs_waitq_t l_waitq; - struct timeval l_enqueued_time; + cfs_time_t l_last_activity; /* seconds */ cfs_time_t l_last_used; /* jiffies */ struct ldlm_extent l_req_extent; @@ -642,13 +642,16 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, __u32 mask, ...) __attribute__ ((format (printf, 4, 5))); -#define LDLM_ERROR(lock, fmt, a...) do { \ +#define LDLM_DEBUG_LIMIT(mask, lock, fmt, a...) do { \ static cfs_debug_limit_state_t _ldlm_cdls; \ - ldlm_lock_debug(&_ldlm_cdls, D_ERROR, lock, \ + ldlm_lock_debug(&_ldlm_cdls, mask, lock, \ __FILE__, __FUNCTION__, __LINE__, \ "### " fmt , ##a); \ } while (0) +#define LDLM_ERROR(lock, fmt, a...) LDLM_DEBUG_LIMIT(D_ERROR, lock, fmt, ## a) +#define LDLM_WARN(lock, fmt, a...) LDLM_DEBUG_LIMIT(D_WARNING, lock, fmt, ## a) + #define LDLM_DEBUG(lock, fmt, a...) do { \ ldlm_lock_debug(NULL, D_DLMTRACE, lock, \ __FILE__, __FUNCTION__, __LINE__, \ diff --git a/lustre/ldlm/ldlm_lock.c b/lustre/ldlm/ldlm_lock.c index 972d457..f37f834 100644 --- a/lustre/ldlm/ldlm_lock.c +++ b/lustre/ldlm/ldlm_lock.c @@ -1177,7 +1177,7 @@ ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns, struct ldlm_interval *node = NULL; ENTRY; - do_gettimeofday(&lock->l_enqueued_time); + lock->l_last_activity = cfs_time_current_sec(); /* policies are not executed on the client or during replay */ if ((*flags & (LDLM_FL_HAS_INTENT|LDLM_FL_REPLAY)) == LDLM_FL_HAS_INTENT && !local && ns->ns_policy) { diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index 75fac78..7fd5fa2 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -300,7 +300,7 @@ static void waiting_locks_callback(unsigned long unused) lock->l_resource->lr_namespace->ns_timeouts++; LDLM_ERROR(lock, "lock callback timer expired after %lds: " "evicting client at %s ", - cfs_time_current_sec()- lock->l_enqueued_time.tv_sec, + cfs_time_current_sec()- lock->l_last_activity, libcfs_nid2str( lock->l_export->exp_connection->c_peer.nid)); if (lock == last) { @@ -755,6 +755,8 @@ int ldlm_server_blocking_ast(struct ldlm_lock *lock, LDLM_DEBUG(lock, "server preparing blocking AST"); + lock->l_last_activity = cfs_time_current_sec(); + ptlrpc_req_set_repsize(req, 1, NULL); if (instant_cancel) { unlock_res(lock->l_resource); @@ -786,7 +788,6 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data) struct ldlm_cb_set_arg *arg = data; struct ldlm_request *body; struct ptlrpc_request *req; - struct timeval granted_time; long total_enqueue_wait; __u32 size[3] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body), [DLM_LOCKREQ_OFF] = sizeof(*body) }; @@ -796,14 +797,13 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data) LASSERT(lock != NULL); LASSERT(data != NULL); - do_gettimeofday(&granted_time); - total_enqueue_wait = cfs_timeval_sub(&granted_time, - &lock->l_enqueued_time, NULL); + total_enqueue_wait = cfs_time_sub(cfs_time_current_sec(), + lock->l_last_activity); - if (total_enqueue_wait / ONE_MILLION > obd_timeout) + if (total_enqueue_wait > obd_timeout) /* non-fatal with AT - change to LDLM_DEBUG? */ - LDLM_ERROR(lock, "enqueue wait took %luus from %lu", - total_enqueue_wait, lock->l_enqueued_time.tv_sec); + LDLM_WARN(lock, "enqueue wait took %lus from %lu", + total_enqueue_wait, lock->l_last_activity); lock_res_and_lock(lock); if (lock->l_resource->lr_lvb_len) { @@ -839,13 +839,13 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data) unlock_res_and_lock(lock); } - LDLM_DEBUG(lock, "server preparing completion AST (after %ldus wait)", + LDLM_DEBUG(lock, "server preparing completion AST (after %lds wait)", total_enqueue_wait); /* Server-side enqueue wait time estimate, used in __ldlm_add_waiting_lock to set future enqueue timers */ at_add(&lock->l_resource->lr_namespace->ns_at_estimate, - total_enqueue_wait / ONE_MILLION); + total_enqueue_wait); ptlrpc_req_set_repsize(req, 1, NULL); @@ -1086,7 +1086,7 @@ int ldlm_handle_enqueue(struct ptlrpc_request *req, if (!lock) GOTO(out, rc = -ENOMEM); - do_gettimeofday(&lock->l_enqueued_time); + lock->l_last_activity = cfs_time_current_sec(); lock->l_remote_handle = dlm_req->lock_handle[0]; LDLM_DEBUG(lock, "server-side enqueue handler, new lock created"); @@ -1291,7 +1291,7 @@ int ldlm_handle_convert(struct ptlrpc_request *req) LDLM_DEBUG(lock, "server-side convert handler START"); - do_gettimeofday(&lock->l_enqueued_time); + lock->l_last_activity = cfs_time_current_sec(); res = ldlm_lock_convert(lock, dlm_req->lock_desc.l_req_mode, &dlm_rep->lock_flags); if (res) { diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c index e145f03..29286cf 100644 --- a/lustre/ldlm/ldlm_request.c +++ b/lustre/ldlm/ldlm_request.c @@ -76,9 +76,8 @@ int ldlm_expired_completion_wait(void *data) LDLM_ERROR(lock, "lock timed out (enqueued at %lu, %lus ago); " "not entering recovery in server code, just going " - "back to sleep", lock->l_enqueued_time.tv_sec, - cfs_time_current_sec() - - lock->l_enqueued_time.tv_sec); + "back to sleep", lock->l_last_activity, + cfs_time_current_sec() - lock->l_last_activity); if (cfs_time_after(cfs_time_current(), next_dump)) { last_dump = next_dump; next_dump = cfs_time_shift(300); @@ -94,8 +93,8 @@ int ldlm_expired_completion_wait(void *data) imp = obd->u.cli.cl_import; ptlrpc_fail_import(imp, lwd->lwd_conn_cnt); LDLM_ERROR(lock, "lock timed out (enqueued at %lu, %lus ago), entering " - "recovery for %s@%s", lock->l_enqueued_time.tv_sec, - CURRENT_SECONDS - lock->l_enqueued_time.tv_sec, + "recovery for %s@%s", lock->l_last_activity, + cfs_time_current_sec() - lock->l_last_activity, obd2cli_tgt(obd), imp->imp_connection->c_remote_uuid.uuid); RETURN(0); @@ -211,11 +210,11 @@ noreproc: } LDLM_DEBUG(lock, "client-side enqueue waking up: granted after %lds", - cfs_time_current_sec() - lock->l_enqueued_time.tv_sec); + cfs_time_current_sec() - lock->l_last_activity); /* Update our time estimate */ at_add(&lock->l_resource->lr_namespace->ns_at_estimate, - cfs_time_current_sec() - lock->l_enqueued_time.tv_sec); + cfs_time_current_sec() - lock->l_last_activity); RETURN(0); } diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c index f84d696..17ad02e 100644 --- a/lustre/ost/ost_handler.c +++ b/lustre/ost/ost_handler.c @@ -1510,7 +1510,7 @@ int ost_msg_check_version(struct lustre_msg *msg) } static int ost_rw_hpreq_lock_match(struct ptlrpc_request *req, - struct ldlm_lock *lock) + struct ldlm_lock *lock) { struct niobuf_remote *nb; struct obd_ioobj *ioo; @@ -1547,6 +1547,10 @@ static int ost_rw_hpreq_lock_match(struct ptlrpc_request *req, end = (nb[ioo->ioo_bufcnt - 1].offset + nb[ioo->ioo_bufcnt - 1].len - 1) | ~CFS_PAGE_MASK; + LASSERT(lock->l_resource != NULL); + if (lock->l_resource->lr_name.name[0] != ioo->ioo_id) + RETURN(0); + if (!(lock->l_granted_mode & mode)) RETURN(0); -- 1.8.3.1