From b6a137e6e11872152ab3569ebea1fe92ccd29303 Mon Sep 17 00:00:00 2001 From: nathan Date: Tue, 9 Jun 2009 20:43:41 +0000 Subject: [PATCH] b=18618 i=adilger i=johann don't increase ldlm timeout if previous client was evicted --- lustre/ChangeLog | 8 ++++++++ lustre/ldlm/ldlm_lockd.c | 19 ++++++++++++------- lustre/ldlm/ldlm_request.c | 2 +- 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 29fa414..e44e759 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -15,6 +15,14 @@ tbd Sun Microsystems, Inc. more information, please refer to bugzilla 17630. Severity : normal +Frequency : rare, if clients are evicted due to failure to return a lock +Bugzilla : 18618 +Description: don't increase ldlm timeout if previous client was evicted +Details : if a client doesn't respond to a BAST within the adaptive ldlm + enqueue timeout, don't adjust the adaptive estimate when the lock + is next granted. + +Severity : normal Bugzilla : 18674 Description: abort bulk too early if client is reconnected diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index 3c06d8f..95e37fb 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -802,11 +802,6 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data) total_enqueue_wait = cfs_time_sub(cfs_time_current_sec(), lock->l_last_activity); - if (total_enqueue_wait > obd_timeout) - /* non-fatal with AT - change to LDLM_DEBUG? */ - LDLM_WARN(lock, "enqueue wait took %lus from %lu", - total_enqueue_wait, lock->l_last_activity); - lock_res_and_lock(lock); if (lock->l_resource->lr_lvb_len) { size[DLM_REQ_REC_OFF] = lock->l_resource->lr_lvb_len; @@ -846,8 +841,18 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data) /* Server-side enqueue wait time estimate, used in __ldlm_add_waiting_lock to set future enqueue timers */ - at_add(&lock->l_resource->lr_namespace->ns_at_estimate, - total_enqueue_wait); + if (total_enqueue_wait < ldlm_get_enq_timeout(lock)) + at_add(&lock->l_resource->lr_namespace->ns_at_estimate, + total_enqueue_wait); + else + /* bz18618. Don't add lock enqueue time we spend waiting for a + previous callback to fail. Locks waiting legitimately will + get extended by ldlm_refresh_waiting_lock regardless of the + estimate, so it's okay to underestimate here. */ + LDLM_DEBUG(lock, "lock completed after %lus; estimate was %ds. " + "It is likely that a previous callback timed out.", + total_enqueue_wait, + at_get(&lock->l_resource->lr_namespace->ns_at_estimate)); ptlrpc_req_set_repsize(req, 1, NULL); diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c index cd85edd..3621d9d 100644 --- a/lustre/ldlm/ldlm_request.c +++ b/lustre/ldlm/ldlm_request.c @@ -110,7 +110,7 @@ int ldlm_get_enq_timeout(struct ldlm_lock *lock) /* Since these are non-updating timeouts, we should be conservative. It would be nice to have some kind of "early reply" mechanism for lock callbacks too... */ - timeout = timeout + (timeout >> 1); /* 150% */ + timeout = min_t(int, at_max, timeout + (timeout >> 1)); /* 150% */ return max(timeout, ldlm_enqueue_min); } EXPORT_SYMBOL(ldlm_get_enq_timeout); -- 1.8.3.1