unsigned ns_contention_time;
/* limit size of nolock requests, in bytes */
unsigned ns_max_nolock_size;
+
+ struct adaptive_timeout ns_at_estimate;/* estimated lock callback time*/
};
static inline int ns_connect_lru_resize(struct ldlm_namespace *ns)
spin_lock_init(&at->at_lock);
}
static inline int at_get(struct adaptive_timeout *at) {
- extern unsigned int adaptive_timeout_max;
- if (adaptive_timeout_max)
- return min(at->at_current, adaptive_timeout_max);
return at->at_current;
}
int at_add(struct adaptive_timeout *at, unsigned int val);
int import_at_get_index(struct obd_import *imp, int portal);
-int import_at_get_ldlm(struct obd_import *imp);
-#define AT_OFF (adaptive_timeout_max == 0)
+extern unsigned int at_max;
+#define AT_OFF (at_max == 0)
/* genops.c */
struct obd_export;
__u32 srv_rep_portal;
/* AT stuff */
- struct adaptive_timeout srv_at_estimate;/* estimated service time */
+ struct adaptive_timeout srv_at_estimate;/* estimated rpc service time */
spinlock_t srv_at_lock;
struct list_head srv_at_list; /* reqs waiting for replies */
cfs_timer_t srv_at_timer; /* early reply timer */
networking / disk / timings affected by load (use Adaptive Timeouts) */
extern unsigned int obd_timeout; /* seconds */
extern unsigned int ldlm_timeout; /* seconds */
-extern unsigned int adaptive_timeout_max; /* seconds */
-extern unsigned int adaptive_timeout_history; /* seconds */
extern unsigned int obd_sync_filter;
extern unsigned int obd_max_dirty_pages;
extern atomic_t obd_dirty_pages;
#define OBD_FAIL_LDLM_GLIMPSE 0x30f
#define OBD_FAIL_LDLM_CANCEL_RACE 0x310
#define OBD_FAIL_LDLM_CANCEL_EVICT_RACE 0x311
+#define OBD_FAIL_LDLM_PAUSE_CANCEL 0x312
#define OBD_FAIL_OSC 0x400
#define OBD_FAIL_OSC_BRW_READ_BULK 0x401
int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, ldlm_sync_t sync);
int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels,
int count, int max, int flags);
+extern int ldlm_enqueue_min;
+int ldlm_get_enq_timeout(struct ldlm_lock *lock);
/* ldlm_resource.c */
int ldlm_resource_putref_locked(struct ldlm_resource *res);
*/
static int __ldlm_add_waiting_lock(struct ldlm_lock *lock)
{
- int timeout = obd_timeout / 2; /* Non-AT value */
+ int timeout;
cfs_time_t timeout_rounded;
if (!list_empty(&lock->l_pending_chain))
return 0;
- if (lock->l_export && !AT_OFF)
- timeout = import_at_get_ldlm(lock->l_export->exp_imp_reverse);
+ timeout = ldlm_get_enq_timeout(lock);
lock->l_callback_timeout = cfs_time_shift(timeout);
!cfs_timer_is_armed(&waiting_locks_timer)) {
cfs_timer_arm(&waiting_locks_timer, timeout_rounded);
}
+ /* if the new lock has a shorter timeout than something earlier on
+ the list, we'll wait the longer amount of time; no big deal. */
list_add_tail(&lock->l_pending_chain, &waiting_locks_list); /* FIFO */
return 1;
}
LDLM_DEBUG(lock, "server preparing completion AST (after %ldus wait)",
total_enqueue_wait);
+ /* Server-side enqueue wait time estimate, used in
+ __ldlm_add_waiting_lock to set future enqueue timers */
+ at_add(&lock->l_resource->lr_namespace->ns_at_estimate,
+ total_enqueue_wait / ONE_MILLION);
+
ptlrpc_req_set_repsize(req, 1, NULL);
req->rq_send_state = LUSTRE_IMP_FULL;
RETURN(0);
}
-/* Cancel all the locks, which handles are packed into ldlm_request */
+/* Cancel all the locks whos handles are packed into ldlm_request */
int ldlm_request_cancel(struct ptlrpc_request *req,
struct ldlm_request *dlm_req, int first)
{
#include "ldlm_internal.h"
+int ldlm_enqueue_min = OBD_TIMEOUT_DEFAULT;
+CFS_MODULE_PARM(ldlm_enqueue_min, "i", int, 0644,
+ "lock enqueue timeout minimum");
+
static void interrupted_completion_wait(void *data)
{
}
RETURN(0);
}
+/* We use the same basis for both server side and client side functions
+ from a single node. */
+int ldlm_get_enq_timeout(struct ldlm_lock *lock)
+{
+ int timeout = at_get(&lock->l_resource->lr_namespace->ns_at_estimate);
+ if (AT_OFF)
+ return obd_timeout / 2;
+ /* Since these are non-updating timeouts, we should be conservative.
+ It would be nice to have some kind of "early reply" mechanism for
+ lock callbacks too... */
+ timeout = timeout + (timeout >> 1); /* 150% */
+ return max(timeout, ldlm_enqueue_min);
+}
+
int ldlm_completion_ast(struct ldlm_lock *lock, int flags, void *data)
{
/* XXX ALLOCATE - 160 bytes */
struct obd_device *obd;
struct obd_import *imp = NULL;
struct l_wait_info lwi;
- __u32 timeout = obd_timeout; /* Non-AT value */
+ __u32 timeout;
int rc = 0;
ENTRY;
/* if this is a local lock, then there is no import */
if (obd != NULL) {
imp = obd->u.cli.cl_import;
- timeout = import_at_get_ldlm(imp);
}
+ /* Wait a long time for enqueue - server may have to callback a
+ lock from another client. Server will evict the other client if it
+ doesn't respond reasonably, and then give us the lock. */
+ timeout = ldlm_get_enq_timeout(lock) * 2;
+
lwd.lwd_lock = lock;
if (lock->l_flags & LDLM_FL_NO_TIMEOUT) {
RETURN(rc);
}
- LDLM_DEBUG(lock, "client-side enqueue waking up: granted");
+ LDLM_DEBUG(lock, "client-side enqueue waking up: granted after %lds",
+ cfs_time_current_sec() - lock->l_enqueued_time.tv_sec);
+
+ /* Update our time estimate */
+ at_add(&lock->l_resource->lr_namespace->ns_at_estimate,
+ cfs_time_current_sec() - lock->l_enqueued_time.tv_sec);
+
RETURN(0);
}
LASSERT(exp != NULL);
LASSERT(count > 0);
+ OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_PAUSE_CANCEL, obd_fail_val);
+
if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_RACE))
RETURN(count);
GOTO(out_proc, rc);
}
+ at_init(&ns->ns_at_estimate, ldlm_enqueue_min, 0);
+
mutex_down(ldlm_namespace_lock(client));
list_add(&ns->ns_list_chain, ldlm_namespace_list(client));
atomic_inc(ldlm_namespace_nr(client));
unsigned int obd_dump_on_eviction;
unsigned int obd_timeout = OBD_TIMEOUT_DEFAULT; /* seconds */
unsigned int ldlm_timeout = LDLM_TIMEOUT_DEFAULT; /* seconds */
-unsigned int adaptive_timeout_max = 600; /* seconds */
-/* We remember the slowest event that took place within history */
-unsigned int adaptive_timeout_history = 600; /* seconds */
unsigned int obd_max_dirty_pages = 256;
atomic_t obd_dirty_pages;
EXPORT_SYMBOL(obd_dump_on_eviction);
EXPORT_SYMBOL(obd_timeout);
EXPORT_SYMBOL(ldlm_timeout);
-EXPORT_SYMBOL(adaptive_timeout_max);
-EXPORT_SYMBOL(adaptive_timeout_history);
EXPORT_SYMBOL(obd_max_dirty_pages);
EXPORT_SYMBOL(obd_dirty_pages);
EXPORT_SYMBOL(ptlrpc_put_connection_superhack);
OBD_DUMP_ON_EVICTION, /* dump kernel debug log upon eviction */
OBD_DEBUG_PEER_ON_TIMEOUT, /* dump peer debug when RPC times out */
OBD_ALLOC_FAIL_RATE, /* memory allocation random failure rate */
- ADAPTIVE_MAX, /* Adaptive timeout upper limit */
- ADAPTIVE_HISTORY, /* Adaptive timeout timebase */
};
int LL_PROC_PROTO(proc_fail_loc)
.proc_handler = &proc_alloc_fail_rate
},
#endif
- {
- .ctl_name = ADAPTIVE_MAX,
- .procname = "adaptive_max",
- .data = &adaptive_timeout_max,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec
- },
- {
- .ctl_name = ADAPTIVE_HISTORY,
- .procname = "adaptive_history",
- .data = &adaptive_timeout_history,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec
- },
{ 0 }
};
imp = obd->u.cli.cl_import;
*eof = 1;
- if (AT_OFF)
- rc += snprintf(page + rc, count - rc,
- "adaptive timeouts off, using obd_timeout %u\n",
- obd_timeout);
-
now = cfs_time_current_sec();
/* Some network health info for kicks */
/* Adaptive Timeout utils */
+extern unsigned int at_min, at_max, at_history;
/* Bin into timeslices using AT_BINS bins.
This gives us a max of the last binlimit*AT_BINS secs without the storage,
{
unsigned int old = at->at_current;
time_t now = cfs_time_current_sec();
- time_t binlimit = max_t(time_t, adaptive_timeout_history / AT_BINS, 1);
+ time_t binlimit = max_t(time_t, at_history / AT_BINS, 1);
LASSERT(at);
#if 0
for proc only */
at->at_current = val;
+ if (at_max > 0)
+ at->at_current = min(at->at_current, at_max);
+ at->at_current = max(at->at_current, at_min);
+
#if 0
if (at->at_current != old)
- CDEBUG(D_ADAPTTO, "AT change: old=%u new=%u delta=%d (val=%u) "
- "hist %u %u %u %u\n",
+ CDEBUG(D_ADAPTTO, "AT %p change: old=%u new=%u delta=%d "
+ "(val=%u) hist %u %u %u %u\n", at,
old, at->at_current, at->at_current - old, val,
at->at_hist[0], at->at_hist[1], at->at_hist[2],
at->at_hist[3]);
return i;
}
-/* Get total expected lock callback time (net + service).
- Since any early reply will only affect the RPC wait time, and not
- any local lock timer we set based on the return value here,
- we should be conservative. */
-int import_at_get_ldlm(struct obd_import *imp)
-{
- int idx, tot;
-
- if (!imp || !imp->imp_client || AT_OFF)
- return obd_timeout;
-
- idx = import_at_get_index(imp, imp->imp_client->cli_request_portal);
- tot = at_get(&imp->imp_at.iat_net_latency) +
- at_get(&imp->imp_at.iat_service_estimate[idx]);
-
- /* add an arbitrary minimum: 150% + 10 sec */
- tot += (tot >> 1) + 10;
- return tot;
-}
-
lprocfs_counter_init(svc_stats, PTLRPC_REQACTIVE_CNTR,
svc_counter_config, "req_active", "reqs");
lprocfs_counter_init(svc_stats, PTLRPC_TIMEOUT,
- svc_counter_config, "adaptive_timeout", "sec");
+ svc_counter_config, "req_timeout", "sec");
lprocfs_counter_init(svc_stats, PTLRPC_REQBUF_AVAIL_CNTR,
svc_counter_config, "reqbuf_avail", "bufs");
for (i = 0; i < LUSTRE_MAX_OPCODES; i++) {
#include <lnet/types.h>
#include "ptlrpc_internal.h"
+/* The following are visible and mutable through /sys/module/ptlrpc */
int test_req_buffer_pressure = 0;
CFS_MODULE_PARM(test_req_buffer_pressure, "i", int, 0444,
"set non-zero to put pressure on request buffer pools");
-
+unsigned int at_min = 0;
+CFS_MODULE_PARM(at_min, "i", int, 0644,
+ "Adaptive timeout minimum (sec)");
+unsigned int at_max = 600;
+EXPORT_SYMBOL(at_max);
+CFS_MODULE_PARM(at_max, "i", int, 0644,
+ "Adaptive timeout maximum (sec)");
+unsigned int at_history = 600;
+CFS_MODULE_PARM(at_history, "i", int, 0644,
+ "Adaptive timeouts remember the slowest event that took place "
+ "within this period (sec)");
static int at_early_margin = 3;
CFS_MODULE_PARM(at_early_margin, "i", int, 0644,
- "How far before the deadline we send an early reply");
-
-static int at_extra = 10;
+ "How soon before an RPC deadline to send an early reply");
+static int at_extra = 30;
CFS_MODULE_PARM(at_extra, "i", int, 0644,
- "How much extra time we give with an early reply");
+ "How much extra time to give with each early reply");
+
/* forward ref */
static int ptlrpc_server_post_idle_rqbds (struct ptlrpc_service *svc);
spin_unlock(&svc->srv_lock);
if ((timediff / ONE_MILLION) > (AT_OFF ? obd_timeout * 3/2 :
- adaptive_timeout_max)) {
+ at_max)) {
CERROR("%s: unhealthy - request has been waiting %lds\n",
svc->srv_name, timediff / ONE_MILLION);
return (-1);
test_17() {
# With adaptive timeouts, bulk_get won't expire until adaptive_timeout_max
- OST_AT_MAX=$(do_facet ost1 sysctl -n lustre.adaptive_max)
- do_facet ost1 sysctl -w lustre.adaptive_max=$TIMEOUT
+ OST_AT_MAX=$(do_facet ost1 "cat /sys/module/ptlrpc/at_max")
+ do_facet ost1 "echo $TIMEOUT >> /sys/module/ptlrpc/at_max"
# OBD_FAIL_PTLRPC_BULK_GET_NET 0x0503 | OBD_FAIL_ONCE
# OST bulk will time out here, client retries
# expect cmp to succeed, client resent bulk
do_facet client "cmp $SAMPLE_FILE $DIR/$tfile" || return 3
do_facet client "rm $DIR/$tfile" || return 4
- do_facet ost1 sysctl -w lustre.adaptive_max=$OST_AT_MAX
+ do_facet ost1 "echo $OST_AT_MAX >> /sys/module/ptlrpc/at_max"
return 0
}
run_test 17 "timeout bulk get, don't evict client (2732)"
mdcdev=`awk '/-mdc-/ {print $1}' $LPROC/devices`
[ "$mdcdev" ] || exit 2
# adaptive timeouts slow this way down
- MDS_AT_MAX=$(do_facet mds "sysctl -n lustre.adaptive_max")
- do_facet mds "sysctl -w lustre.adaptive_max=40"
+ MDS_AT_MAX=$(do_facet mds "cat /sys/module/ptlrpc/at_max")
+ do_facet mds "echo 40 >> /sys/module/ptlrpc/at_max"
for i in `seq 1 10`; do
echo "$i of 10 ($(date +%s))"
do_facet mds "grep service $LPROC/mdt/MDS/mds/timeouts"
df $MOUNT
done
do_facet mds "sysctl -w lustre.fail_loc=0"
- do_facet mds "sysctl -w lustre.adaptive_max=$MDS_AT_MAX"
+ do_facet mds "echo $MDS_AT_MAX >> /sys/module/ptlrpc/at_max"
return 0
}
run_test 44 "race in target handle connect"
at_start() #bug 3055
{
if [ -z "$ATOLDBASE" ]; then
- ATOLDBASE=$(do_facet mds "sysctl -n lustre.adaptive_history")
+ ATOLDBASE=$(do_facet mds "cat /sys/module/ptlrpc/at_history")
# speed up the timebase so we can check decreasing AT
- do_facet mds "sysctl -w lustre.adaptive_history=8"
- do_facet ost1 "sysctl -w lustre.adaptive_history=8"
+ do_facet mds "echo 8 >> /sys/module/ptlrpc/at_history"
+ do_facet ost1 "echo 8 >> /sys/module/ptlrpc/at_history"
fi
}
}
run_test 67b "AT: verify instant slowdown doesn't induce reconnects"
+test_68 () #bug 13813
+{
+ at_start
+ local ENQ_MIN=$(cat /sys/module/ptlrpc/ldlm_enqueue_min)
+ echo $TIMEOUT >> /sys/module/ptlrpc/ldlm_enqueue_min
+ rm -f $DIR/${tfile}_[1-2]
+ lfs setstripe $DIR/$tfile --index=0 --count=1
+#define OBD_FAIL_LDLM_PAUSE_CANCEL 0x312
+ sysctl -w lustre.fail_val=$(($TIMEOUT - 1))
+ sysctl -w lustre.fail_loc=0x80000312
+ cp /etc/profile $DIR/${tfile}_1 || error "1st cp failed $?"
+ sysctl -w lustre.fail_val=$((TIMEOUT * 3 / 2))
+ sysctl -w lustre.fail_loc=0x80000312
+ cp /etc/profile $DIR/${tfile}_2 || error "2nd cp failed $?"
+ sysctl -w lustre.fail_loc=0
+ echo $ENQ_MIN >> /sys/module/ptlrpc/ldlm_enqueue_min
+ return 0
+}
+run_test 68 "AT: verify slowing locks"
+
if [ -n "$ATOLDBASE" ]; then
- do_facet mds "sysctl -w lustre.adaptive_history=$ATOLDBASE"
- do_facet ost1 "sysctl -w lustre.adaptive_history=$ATOLDBASE"
+ do_facet mds "echo $ATOLDBASE >> /sys/module/ptlrpc/at_history"
+ do_facet ost1 "echo $ATOLDBASE >> /sys/module/ptlrpc/at_history"
fi
# end of AT tests includes above lines