The lock callback timeout is currently calculated as an average per
namespace, which does not reflect the behavior of individual clients.
Calculate it on a per-export basis instead.
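
For illustration only, a minimal user-space sketch of the per-export
idea.  It assumes a simplified estimator that just remembers the worst
observed blocking-callback round trip (the kernel's struct
adaptive_timeout keeps a time-binned history instead) and a
hypothetical ENQUEUE_MIN floor standing in for ldlm_enqueue_min; the
real ldlm_bl_timeout() below additionally falls back to obd_timeout / 2
when adaptive timeouts are disabled.

/* Per-export blocking-callback timeout, simplified (hypothetical names). */
#include <stdio.h>

#define ENQUEUE_MIN 100U	/* floor in seconds, like ldlm_enqueue_min */

struct bl_estimator {
	unsigned int worst;	/* worst callback round trip seen, in seconds */
};

/* Feed one measured blocking-callback round trip into the estimate. */
static void bl_measure(struct bl_estimator *at, unsigned int seconds)
{
	if (seconds > at->worst)
		at->worst = seconds;
}

/* timeout = max(estimate + 50% margin, floor), as in ldlm_bl_timeout(). */
static unsigned int bl_timeout(const struct bl_estimator *at)
{
	unsigned int t = at->worst + (at->worst >> 1);

	return t > ENQUEUE_MIN ? t : ENQUEUE_MIN;
}

int main(void)
{
	struct bl_estimator fast = { 0 };	/* responsive client */
	struct bl_estimator slow = { 0 };	/* client with a long dirty flush */

	bl_measure(&fast, 4);
	bl_measure(&slow, 180);

	/* With per-export estimates the slow client no longer inflates
	 * the timeout applied to the fast one, as a shared per-namespace
	 * value would. */
	printf("fast export timeout: %us\n", bl_timeout(&fast));
	printf("slow export timeout: %us\n", bl_timeout(&slow));
	return 0;
}

In the patch itself the estimate lives in exp_bl_lock_at, is fed from
ldlm_request_cancel() when a blocked lock is finally cancelled
(LATF_STATS), and is consumed by ldlm_bl_timeout().
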
Signed-off-by: Vitaly Fertman <vitaly_fertman@xyratex.com>
Change-Id: I12e3fc5f8d261cce252fcf13f22193273dc054ee
Tested-by: Elena Gryaznova <Elena_Gryaznova@xyratex.com>
Reviewed-by: Andriy Skulysh <Andriy_Skulysh@xyratex.com>
Reviewed-by: Alexey Lyashkov <Alexey_Lyashkov@xyratex.com>
Xyratex-bug-id: MRP-417
Reviewed-on: http://review.whamcloud.com/9336
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: James Simmons <uja.ornl@gmail.com>
const struct ldlm_request *dlm_req);
int ldlm_handle_cancel(struct ptlrpc_request *req);
int ldlm_request_cancel(struct ptlrpc_request *req,
- const struct ldlm_request *dlm_req, int first);
+ const struct ldlm_request *dlm_req,
+ int first, enum lustre_at_flags flags);
/** @} ldlm_handlers */
void ldlm_revoke_export_locks(struct obd_export *exp);
+unsigned int ldlm_bl_timeout(struct ldlm_lock *lock);
#endif
int ldlm_del_waiting_lock(struct ldlm_lock *lock);
int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout);
} u;
struct nodemap *exp_nodemap;
+ struct adaptive_timeout exp_bl_lock_at;
};
#define exp_target_data u.eu_target_data
spinlock_t at_lock;
};
+enum lustre_at_flags {
+ LATF_SKIP = 0x0,
+ LATF_STATS = 0x1,
+};
+
struct ptlrpc_at_array {
struct list_head *paa_reqs_array; /** array to hold requests */
__u32 paa_size; /** the size of array */
int ldlm_cancel_lru_local(struct ldlm_namespace *ns,
struct list_head *cancels, int count, int max,
ldlm_cancel_flags_t cancel_flags, int flags);
-extern int ldlm_enqueue_min;
-int ldlm_get_enq_timeout(struct ldlm_lock *lock);
-
+extern unsigned int ldlm_enqueue_min;
/* ldlm_resource.c */
int ldlm_resource_putref_locked(struct ldlm_resource *res);
void ldlm_resource_insert_lock_after(struct ldlm_lock *original,
struct ldlm_interval *node = NULL;
ENTRY;
- lock->l_last_activity = cfs_time_current_sec();
/* policies are not executed on the client or during replay */
if ((*flags & (LDLM_FL_HAS_INTENT|LDLM_FL_REPLAY)) == LDLM_FL_HAS_INTENT
&& !local && ns->ns_policy) {
spin_unlock_bh(&waiting_locks_spinlock);
LDLM_DEBUG(lock, "prolong the busy lock");
ldlm_refresh_waiting_lock(lock,
- ldlm_get_enq_timeout(lock));
+ ldlm_bl_timeout(lock) >> 1);
spin_lock_bh(&waiting_locks_spinlock);
if (!cont) {
static int ldlm_add_waiting_lock(struct ldlm_lock *lock)
{
int ret;
- int timeout = ldlm_get_enq_timeout(lock);
+ int timeout = ldlm_bl_timeout(lock);
/* NB: must be called with hold of lock_res_and_lock() */
LASSERT(ldlm_is_res_locked(lock));
if (ldlm_is_destroyed(lock)) {
static cfs_time_t next;
spin_unlock_bh(&waiting_locks_spinlock);
- LDLM_ERROR(lock, "not waiting on destroyed lock (bug 5653)");
- if (cfs_time_after(cfs_time_current(), next)) {
- next = cfs_time_shift(14400);
- libcfs_debug_dumpstack(NULL);
- }
- return 0;
- }
+ LDLM_ERROR(lock, "not waiting on destroyed lock (bug 5653)");
+ if (cfs_time_after(cfs_time_current(), next)) {
+ next = cfs_time_shift(14400);
+ libcfs_debug_dumpstack(NULL);
+ }
+ return 0;
+ }
- ret = __ldlm_add_waiting_lock(lock, timeout);
- if (ret) {
- /* grab ref on the lock if it has been added to the
- * waiting list */
- LDLM_LOCK_GET(lock);
- }
+ lock->l_last_activity = cfs_time_current_sec();
+ ret = __ldlm_add_waiting_lock(lock, timeout);
+ if (ret) {
+ /* grab ref on the lock if it has been added to the
+ * waiting list */
+ LDLM_LOCK_GET(lock);
+ }
spin_unlock_bh(&waiting_locks_spinlock);
if (ret) {
#ifdef HAVE_SERVER_SUPPORT
/**
+ * Calculate the per-export Blocking timeout (covering BL AST, data flush,
+ * lock cancel, and their replies). Used for lock callback timeout and AST
+ * re-send period.
+ *
+ * \param[in] lock lock which is getting the blocking callback
+ *
+ * \retval timeout in seconds to wait for the client reply
+ */
+unsigned int ldlm_bl_timeout(struct ldlm_lock *lock)
+{
+ unsigned int timeout;
+
+ if (AT_OFF)
+ return obd_timeout / 2;
+
+ /* Since these are non-updating timeouts, we should be conservative.
+ * Take more than the usual estimate: 150%.
+ * It would be nice to have some kind of "early reply" mechanism for
+ * lock callbacks too... */
+ timeout = at_get(&lock->l_export->exp_bl_lock_at);
+ return max(timeout + (timeout >> 1), ldlm_enqueue_min);
+}
+EXPORT_SYMBOL(ldlm_bl_timeout);
+
+/**
* Perform lock cleanup if AST sending failed.
*/
static void ldlm_failed_ast(struct ldlm_lock *lock, int rc,
}
} else {
- LDLM_ERROR(lock, "client (nid %s) returned %d: rc=%d "
+ LDLM_ERROR(lock, "client (nid %s) returned %d: rc = %d "
"from %s AST", libcfs_nid2str(peer.nid),
(req->rq_repmsg != NULL) ?
lustre_msg_get_status(req->rq_repmsg) : 0,
struct ldlm_cb_async_args *ca = data;
struct ldlm_lock *lock = ca->ca_lock;
- ldlm_refresh_waiting_lock(lock, ldlm_get_enq_timeout(lock));
+ ldlm_refresh_waiting_lock(lock, ldlm_bl_timeout(lock));
}
static inline int ldlm_ast_fini(struct ptlrpc_request *req,
unlock_res_and_lock(lock);
/* Do not resend after lock callback timeout */
- req->rq_delay_limit = ldlm_get_enq_timeout(lock);
+ req->rq_delay_limit = ldlm_bl_timeout(lock);
req->rq_resend_cb = ldlm_update_resend;
}
struct ldlm_request *body;
struct ptlrpc_request *req;
struct ldlm_cb_async_args *ca;
- long total_enqueue_wait;
int instant_cancel = 0;
int rc = 0;
int lvb_len;
LASSERT(lock != NULL);
LASSERT(data != NULL);
- total_enqueue_wait = cfs_time_sub(cfs_time_current_sec(),
- lock->l_last_activity);
-
if (OBD_FAIL_PRECHECK(OBD_FAIL_OST_LDLM_REPLY_NET)) {
LDLM_DEBUG(lock, "dropping CP AST");
RETURN(0);
}
}
- LDLM_DEBUG(lock, "server preparing completion AST (after %lds wait)",
- total_enqueue_wait);
-
lock->l_last_activity = cfs_time_current_sec();
- /* Server-side enqueue wait time estimate, used in
- __ldlm_add_waiting_lock to set future enqueue timers */
- if (total_enqueue_wait < ldlm_get_enq_timeout(lock))
- at_measured(ldlm_lock_to_ns_at(lock),
- total_enqueue_wait);
- else
- /* bz18618. Don't add lock enqueue time we spend waiting for a
- previous callback to fail. Locks waiting legitimately will
- get extended by ldlm_refresh_waiting_lock regardless of the
- estimate, so it's okay to underestimate here. */
- LDLM_DEBUG(lock, "lock completed after %lus; estimate was %ds. "
- "It is likely that a previous callback timed out.",
- total_enqueue_wait,
- at_get(ldlm_lock_to_ns_at(lock)));
+ LDLM_DEBUG(lock, "server preparing completion AST");
ptlrpc_request_set_replen(req);
/* start the lock-timeout clock */
ldlm_add_waiting_lock(lock);
/* Do not resend after lock callback timeout */
- req->rq_delay_limit = ldlm_get_enq_timeout(lock);
+ req->rq_delay_limit = ldlm_bl_timeout(lock);
req->rq_resend_cb = ldlm_update_resend;
}
}
LDLM_DEBUG_NOLOCK("server-side enqueue handler START");
- ldlm_request_cancel(req, dlm_req, LDLM_ENQUEUE_CANCEL_OFF);
+ ldlm_request_cancel(req, dlm_req, LDLM_ENQUEUE_CANCEL_OFF, LATF_SKIP);
flags = ldlm_flags_from_wire(dlm_req->lock_flags);
LASSERT(req->rq_export);
GOTO(out, rc);
}
- lock->l_last_activity = cfs_time_current_sec();
lock->l_remote_handle = dlm_req->lock_handle[0];
LDLM_DEBUG(lock, "server-side enqueue handler, new lock created");
LDLM_DEBUG(lock, "server-side convert handler START");
- lock->l_last_activity = cfs_time_current_sec();
res = ldlm_lock_convert(lock, dlm_req->lock_desc.l_req_mode,
&dlm_rep->lock_flags);
if (res) {
* requests.
*/
int ldlm_request_cancel(struct ptlrpc_request *req,
- const struct ldlm_request *dlm_req, int first)
+ const struct ldlm_request *dlm_req,
+ int first, enum lustre_at_flags flags)
{
struct ldlm_resource *res, *pres = NULL;
struct ldlm_lock *lock;
}
pres = res;
}
+
+ if ((flags & LATF_STATS) && ldlm_is_ast_sent(lock)) {
+ long delay = cfs_time_sub(cfs_time_current_sec(),
+ lock->l_last_activity);
+ LDLM_DEBUG(lock, "server cancels blocked lock after "
+ CFS_DURATION_T"s", delay);
+ at_measured(&lock->l_export->exp_bl_lock_at, delay);
+ }
ldlm_lock_cancel(lock);
LDLM_LOCK_PUT(lock);
}
if (rc)
RETURN(rc);
- if (!ldlm_request_cancel(req, dlm_req, 0))
+ if (!ldlm_request_cancel(req, dlm_req, 0, LATF_STATS))
req->rq_status = LUSTRE_ESTALE;
RETURN(ptlrpc_reply(req));
#include "ldlm_internal.h"
-int ldlm_enqueue_min = OBD_TIMEOUT_DEFAULT;
-CFS_MODULE_PARM(ldlm_enqueue_min, "i", int, 0644,
+unsigned int ldlm_enqueue_min = OBD_TIMEOUT_DEFAULT;
+CFS_MODULE_PARM(ldlm_enqueue_min, "i", uint, 0644,
"lock enqueue timeout minimum");
/* in client side, whether the cached locks will be canceled before replay */
}
EXPORT_SYMBOL(ldlm_expired_completion_wait);
+/**
+ * Calculate the Completion timeout (covering enqueue, BL AST, data flush,
+ * lock cancel, and their replies). Used for lock completion timeout on the
+ * client side.
+ *
+ * \param[in] lock lock which is waiting the completion callback
+ *
+ * \retval timeout in seconds to wait for the server reply
+ */
+
/* We use the same basis for both server side and client side functions
from a single node. */
-int ldlm_get_enq_timeout(struct ldlm_lock *lock)
+static unsigned int ldlm_cp_timeout(struct ldlm_lock *lock)
{
- int timeout = at_get(ldlm_lock_to_ns_at(lock));
- if (AT_OFF)
- return obd_timeout / 2;
- /* Since these are non-updating timeouts, we should be conservative.
- It would be nice to have some kind of "early reply" mechanism for
- lock callbacks too... */
- timeout = min_t(int, at_max, timeout + (timeout >> 1)); /* 150% */
- return max(timeout, ldlm_enqueue_min);
+ unsigned int timeout;
+
+ if (AT_OFF)
+ return obd_timeout;
+
+ /* Wait a long time for enqueue - server may have to callback a
+ * lock from another client. Server will evict the other client if it
+ * doesn't respond reasonably, and then give us the lock. */
+ timeout = at_get(ldlm_lock_to_ns_at(lock));
+ return max(3 * timeout, ldlm_enqueue_min);
}
-EXPORT_SYMBOL(ldlm_get_enq_timeout);
/**
* Helper function for ldlm_completion_ast(), updating timings when lock is
* actually granted.
*/
-static int ldlm_completion_tail(struct ldlm_lock *lock)
+static int ldlm_completion_tail(struct ldlm_lock *lock, void *data)
{
long delay;
- int result;
+ int result = 0;
if (ldlm_is_destroyed(lock) || ldlm_is_failed(lock)) {
LDLM_DEBUG(lock, "client-side enqueue: destroyed");
result = -EIO;
+ } else if (data == NULL) {
+ LDLM_DEBUG(lock, "client-side enqueue: granted");
} else {
+ /* Feed the AT estimate only with CP RPC waits, not immediately granted locks */
delay = cfs_time_sub(cfs_time_current_sec(),
lock->l_last_activity);
LDLM_DEBUG(lock, "client-side enqueue: granted after "
CFS_DURATION_T"s", delay);
/* Update our time estimate */
- at_measured(ldlm_lock_to_ns_at(lock),
- delay);
- result = 0;
+ at_measured(ldlm_lock_to_ns_at(lock), delay);
}
return result;
}
if (!(flags & LDLM_FL_BLOCKED_MASK)) {
wake_up(&lock->l_waitq);
- RETURN(ldlm_completion_tail(lock));
+ RETURN(ldlm_completion_tail(lock, data));
}
LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, "
imp = obd->u.cli.cl_import;
}
- /* Wait a long time for enqueue - server may have to callback a
- lock from another client. Server will evict the other client if it
- doesn't respond reasonably, and then give us the lock. */
- timeout = ldlm_get_enq_timeout(lock) * 2;
+ timeout = ldlm_cp_timeout(lock);
- lwd.lwd_lock = lock;
+ lwd.lwd_lock = lock;
+ lock->l_last_activity = cfs_time_current_sec();
if (ldlm_is_no_timeout(lock)) {
LDLM_DEBUG(lock, "waiting indefinitely because of NO_TIMEOUT");
LDLM_DEBUG(lock, "client-side enqueue waking up: failed (%d)",
rc);
RETURN(rc);
- }
+ }
- RETURN(ldlm_completion_tail(lock));
+ RETURN(ldlm_completion_tail(lock, data));
}
EXPORT_SYMBOL(ldlm_completion_ast);
lock->l_export = NULL;
lock->l_blocking_ast = einfo->ei_cb_bl;
lock->l_flags |= (*flags & (LDLM_FL_NO_LRU | LDLM_FL_EXCL));
+ lock->l_last_activity = cfs_time_current_sec();
/* lock not sent to server yet */
info = tsi2mdt_info(tsi);
if (info->mti_dlm_req != NULL)
- ldlm_request_cancel(req, info->mti_dlm_req, 0);
+ ldlm_request_cancel(req, info->mti_dlm_req, 0, LATF_SKIP);
if (req_capsule_get_size(info->mti_pill, &RMF_CAPA1, RCL_CLIENT))
mdt_set_capainfo(info, 0, &info->mti_body->mbo_fid1,
DEBUG_REQ(D_INODE, req, "setattr "DFID" %x", PFID(rr->rr_fid1),
(unsigned int)ma->ma_attr.la_valid);
- if (info->mti_dlm_req)
- ldlm_request_cancel(req, info->mti_dlm_req, 0);
+ if (info->mti_dlm_req)
+ ldlm_request_cancel(req, info->mti_dlm_req, 0, LATF_SKIP);
repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
mo = mdt_object_find(info->mti_env, info->mti_mdt, rr->rr_fid1);
if (OBD_FAIL_CHECK(OBD_FAIL_MDS_REINT_CREATE))
RETURN(err_serious(-ESTALE));
- if (info->mti_dlm_req)
- ldlm_request_cancel(mdt_info_req(info), info->mti_dlm_req, 0);
+ if (info->mti_dlm_req)
+ ldlm_request_cancel(mdt_info_req(info),
+ info->mti_dlm_req, 0, LATF_SKIP);
if (!lu_name_is_valid(&info->mti_rr.rr_name))
RETURN(-EPROTO);
DEBUG_REQ(D_INODE, req, "unlink "DFID"/"DNAME"", PFID(rr->rr_fid1),
PNAME(&rr->rr_name));
- if (info->mti_dlm_req)
- ldlm_request_cancel(req, info->mti_dlm_req, 0);
+ if (info->mti_dlm_req)
+ ldlm_request_cancel(req, info->mti_dlm_req, 0, LATF_SKIP);
if (OBD_FAIL_CHECK(OBD_FAIL_MDS_REINT_UNLINK))
RETURN(err_serious(-ENOENT));
if (OBD_FAIL_CHECK(OBD_FAIL_MDS_REINT_LINK))
RETURN(err_serious(-ENOENT));
- if (info->mti_dlm_req)
- ldlm_request_cancel(req, info->mti_dlm_req, 0);
+ if (info->mti_dlm_req)
+ ldlm_request_cancel(req, info->mti_dlm_req, 0, LATF_SKIP);
/* Invalid case so return error immediately instead of
* processing it */
ENTRY;
if (info->mti_dlm_req)
- ldlm_request_cancel(req, info->mti_dlm_req, 0);
+ ldlm_request_cancel(req, info->mti_dlm_req, 0, LATF_SKIP);
if (!fid_is_md_operative(rr->rr_fid1) ||
!fid_is_md_operative(rr->rr_fid2))
CDEBUG(D_INODE, "setxattr for "DFID"\n", PFID(rr->rr_fid1));
if (info->mti_dlm_req)
- ldlm_request_cancel(req, info->mti_dlm_req, 0);
+ ldlm_request_cancel(req, info->mti_dlm_req, 0, LATF_SKIP);
if (OBD_FAIL_CHECK(OBD_FAIL_MDS_SETXATTR))
RETURN(err_serious(-ENOMEM));
}
}
+ at_init(&export->exp_bl_lock_at, obd_timeout, 0);
spin_lock(&obd->obd_dev_lock);
if (obd->obd_stopping) {
cfs_hash_del(hash, cluuid, &export->exp_uuid_hash);
dlm = req_capsule_client_get(tsi->tsi_pill, &RMF_DLM_REQ);
if (dlm == NULL)
RETURN(-EFAULT);
- ldlm_request_cancel(tgt_ses_req(tsi), dlm, 0);
+ ldlm_request_cancel(tgt_ses_req(tsi), dlm, 0, LATF_SKIP);
}
*fid = body->oa.o_oi.oi_fid;
*
* \retval amount of time to extend the timeout with
*/
-static inline int prolong_timeout(struct ptlrpc_request *req)
+static inline int prolong_timeout(struct ptlrpc_request *req,
+ struct ldlm_lock *lock)
{
struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt;
if (AT_OFF)
return obd_timeout / 2;
- return max(at_est2timeout(at_get(&svcpt->scp_at_estimate)),
- ldlm_timeout);
+ /* We are in the middle of the process - BL AST is sent, CANCEL
+ is ahead. Take the IO AT estimate plus half of the BL callback timeout. */
+ return at_est2timeout(at_get(&svcpt->scp_at_estimate)) +
+ (ldlm_bl_timeout(lock) >> 1);
}
/**
*/
static int ofd_prolong_one_lock(struct tgt_session_info *tsi,
struct ldlm_lock *lock,
- struct ldlm_extent *extent, int timeout)
+ struct ldlm_extent *extent)
{
+ int timeout = prolong_timeout(tgt_ses_req(tsi), lock);
if (lock->l_flags & LDLM_FL_DESTROYED) /* lock already cancelled */
return 0;
.end = end
};
struct ldlm_lock *lock;
- int timeout = prolong_timeout(tgt_ses_req(tsi));
int lock_count = 0;
ENTRY;
/* bingo */
LASSERT(lock->l_export == exp);
lock_count = ofd_prolong_one_lock(tsi, lock,
- &extent, timeout);
+ &extent);
LDLM_LOCK_PUT(lock);
RETURN(lock_count);
}
&extent))
continue;
- lock_count += ofd_prolong_one_lock(tsi, lock, &extent, timeout);
+ lock_count += ofd_prolong_one_lock(tsi, lock, &extent);
}
spin_unlock_bh(&exp->exp_bl_list_lock);
svc = req->rq_rqbd->rqbd_svcpt;
timeout = at_est2timeout(at_get(&svc->scp_at_estimate));
- timeout = max(timeout, ldlm_timeout);
+ timeout += (ldlm_bl_timeout(lock) >> 1);
/* lock is being cancelled, prolong timeout */
ldlm_refresh_waiting_lock(lock, timeout);