From e6ba48a08c9fe831b3f60985a72bb31b9806bb34 Mon Sep 17 00:00:00 2001 From: zhanghc Date: Tue, 3 Feb 2009 14:16:52 +0000 Subject: [PATCH] b=17887 1, print the actual time to wait for lock cancellation in "waiting_locks_callback" 2, compare resource in "ost_rw_hpreq_lock_match" 3, replace "LDLM_ERROR" with "LDLM_WARN" if a lock is completed after waiting more than odb_timeout i=vitaly.fertman i=johann --- lustre/include/lustre_dlm.h | 13 ++++++++++--- lustre/ldlm/ldlm_lock.c | 2 +- lustre/ldlm/ldlm_lockd.c | 22 ++++++++++------------ lustre/ldlm/ldlm_request.c | 11 +++++------ lustre/ost/ost_handler.c | 5 +++++ 5 files changed, 31 insertions(+), 22 deletions(-) diff --git a/lustre/include/lustre_dlm.h b/lustre/include/lustre_dlm.h index 133b8fa..9edf487 100644 --- a/lustre/include/lustre_dlm.h +++ b/lustre/include/lustre_dlm.h @@ -639,7 +639,11 @@ struct ldlm_lock { */ cfs_waitq_t l_waitq; - struct timeval l_enqueued_time; + /** + * Seconds. it will be updated if there is any activity related to + * the lock, e.g. enqueue the lock or send block AST. + */ + cfs_time_t l_last_activity; /** * Jiffies. Should be converted to time if needed. @@ -790,13 +794,16 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, __u32 mask, ...) __attribute__ ((format (printf, 4, 5))); -#define LDLM_ERROR(lock, fmt, a...) do { \ +#define LDLM_DEBUG_LIMIT(mask, lock, fmt, a...) do { \ static cfs_debug_limit_state_t _ldlm_cdls; \ - ldlm_lock_debug(&_ldlm_cdls, D_ERROR, lock, \ + ldlm_lock_debug(&_ldlm_cdls, mask, lock, \ __FILE__, __FUNCTION__, __LINE__, \ "### " fmt , ##a); \ } while (0) +#define LDLM_ERROR(lock, fmt, a...) LDLM_DEBUG_LIMIT(D_ERROR, lock, fmt, ## a) +#define LDLM_WARN(lock, fmt, a...) LDLM_DEBUG_LIMIT(D_WARNING, lock, fmt, ## a) + #define LDLM_DEBUG(lock, fmt, a...) do { \ ldlm_lock_debug(NULL, D_DLMTRACE, lock, \ __FILE__, __FUNCTION__, __LINE__, \ diff --git a/lustre/ldlm/ldlm_lock.c b/lustre/ldlm/ldlm_lock.c index 0a26b56..b3b36a8 100644 --- a/lustre/ldlm/ldlm_lock.c +++ b/lustre/ldlm/ldlm_lock.c @@ -1216,7 +1216,7 @@ ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns, struct ldlm_interval *node = NULL; ENTRY; - do_gettimeofday(&lock->l_enqueued_time); + lock->l_last_activity = cfs_time_current_sec(); /* policies are not executed on the client or during replay */ if ((*flags & (LDLM_FL_HAS_INTENT|LDLM_FL_REPLAY)) == LDLM_FL_HAS_INTENT && !local && ns->ns_policy) { diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index ce66f5c..b56a42c 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -336,7 +336,7 @@ repeat: lock->l_resource->lr_namespace->ns_timeouts++; LDLM_ERROR(lock, "lock callback timer expired after %lds: " "evicting client at %s ", - cfs_time_current_sec()- lock->l_enqueued_time.tv_sec, + cfs_time_current_sec()- lock->l_last_activity, libcfs_nid2str( lock->l_export->exp_connection->c_peer.nid)); @@ -795,7 +795,6 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data) struct ldlm_cb_set_arg *arg = data; struct ldlm_request *body; struct ptlrpc_request *req; - struct timeval granted_time; long total_enqueue_wait; int instant_cancel = 0; int rc = 0; @@ -804,14 +803,13 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data) LASSERT(lock != NULL); LASSERT(data != NULL); - do_gettimeofday(&granted_time); - total_enqueue_wait = cfs_timeval_sub(&granted_time, - &lock->l_enqueued_time, NULL); + total_enqueue_wait = cfs_time_sub(cfs_time_current_sec(), + lock->l_last_activity); - if (total_enqueue_wait / ONE_MILLION > obd_timeout) + if (total_enqueue_wait > obd_timeout) /* non-fatal with AT - change to LDLM_DEBUG? */ - LDLM_ERROR(lock, "enqueue wait took %luus from "CFS_TIME_T, - total_enqueue_wait, lock->l_enqueued_time.tv_sec); + LDLM_WARN(lock, "enqueue wait took %lus from "CFS_TIME_T, + total_enqueue_wait, lock->l_last_activity); req = ptlrpc_request_alloc(lock->l_export->exp_imp_reverse, &RQF_LDLM_CP_CALLBACK); @@ -848,13 +846,13 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data) unlock_res_and_lock(lock); } - LDLM_DEBUG(lock, "server preparing completion AST (after %ldus wait)", + LDLM_DEBUG(lock, "server preparing completion AST (after %lds wait)", total_enqueue_wait); /* Server-side enqueue wait time estimate, used in __ldlm_add_waiting_lock to set future enqueue timers */ at_add(&lock->l_resource->lr_namespace->ns_at_estimate, - total_enqueue_wait / ONE_MILLION); + total_enqueue_wait); ptlrpc_request_set_replen(req); @@ -1092,7 +1090,7 @@ int ldlm_handle_enqueue0(struct ldlm_namespace *ns, if (!lock) GOTO(out, rc = -ENOMEM); - do_gettimeofday(&lock->l_enqueued_time); + lock->l_last_activity = cfs_time_current_sec(); lock->l_remote_handle = dlm_req->lock_handle[0]; LDLM_DEBUG(lock, "server-side enqueue handler, new lock created"); @@ -1305,7 +1303,7 @@ int ldlm_handle_convert0(struct ptlrpc_request *req, LDLM_DEBUG(lock, "server-side convert handler START"); - do_gettimeofday(&lock->l_enqueued_time); + lock->l_last_activity = cfs_time_current_sec(); res = ldlm_lock_convert(lock, dlm_req->lock_desc.l_req_mode, &dlm_rep->lock_flags); if (res) { diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c index 6e11110..b6fb1d1 100644 --- a/lustre/ldlm/ldlm_request.c +++ b/lustre/ldlm/ldlm_request.c @@ -80,9 +80,9 @@ int ldlm_expired_completion_wait(void *data) LDLM_ERROR(lock, "lock timed out (enqueued at "CFS_TIME_T", " CFS_DURATION_T"s ago); not entering recovery in " "server code, just going back to sleep", - lock->l_enqueued_time.tv_sec, + lock->l_last_activity, cfs_time_sub(cfs_time_current_sec(), - lock->l_enqueued_time.tv_sec)); + lock->l_last_activity)); if (cfs_time_after(cfs_time_current(), next_dump)) { last_dump = next_dump; next_dump = cfs_time_shift(300); @@ -99,9 +99,8 @@ int ldlm_expired_completion_wait(void *data) ptlrpc_fail_import(imp, lwd->lwd_conn_cnt); LDLM_ERROR(lock, "lock timed out (enqueued at "CFS_TIME_T", " CFS_DURATION_T"s ago), entering recovery for %s@%s", - lock->l_enqueued_time.tv_sec, - cfs_time_sub(cfs_time_current_sec(), - lock->l_enqueued_time.tv_sec), + lock->l_last_activity, + cfs_time_sub(cfs_time_current_sec(), lock->l_last_activity), obd2cli_tgt(obd), imp->imp_connection->c_remote_uuid.uuid); RETURN(0); @@ -136,7 +135,7 @@ static int ldlm_completion_tail(struct ldlm_lock *lock) result = -EIO; } else { delay = cfs_time_sub(cfs_time_current_sec(), - lock->l_enqueued_time.tv_sec); + lock->l_last_activity); LDLM_DEBUG(lock, "client-side enqueue: granted after " CFS_DURATION_T"s", delay); diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c index 1ae7308..cb520bb 100644 --- a/lustre/ost/ost_handler.c +++ b/lustre/ost/ost_handler.c @@ -1605,6 +1605,11 @@ static int ost_rw_hpreq_lock_match(struct ptlrpc_request *req, end = (nb[ioo->ioo_bufcnt - 1].offset + nb[ioo->ioo_bufcnt - 1].len - 1) | ~CFS_PAGE_MASK; + LASSERT(lock->l_resource != NULL); + if (!osc_res_name_eq(ioo->ioo_id, ioo->ioo_gr, + &lock->l_resource->lr_name)) + RETURN(0); + if (!(lock->l_granted_mode & mode)) RETURN(0); -- 1.8.3.1