/* Yields whichever of conn->gnc_last_rx and conn->gnc_last_rx_cq is later
 * according to time_after(). The argument is parenthesized so expressions
 * such as &conns[i] or p + 1 expand correctly; it is evaluated more than
 * once, so it must not have side effects.
 */
#define GNILND_LASTRX(conn) (time_after((conn)->gnc_last_rx, (conn)->gnc_last_rx_cq) \
? (conn)->gnc_last_rx : (conn)->gnc_last_rx_cq)
+/* Sentinel for the fmablk registration failure timeout tunable: when the
+ * tunable equals this value, the timeout check on failed fmablk
+ * registration (and the LBUG it can trigger) is skipped.
+ */
+#define GNILND_REGFAILTO_DISABLE -1
+
/************************************************************************
* Enum, flag and tag data
*/
int *kgn_fast_reconn; /* fast reconnection on conn timeout */
int *kgn_efault_lbug; /* LBUG on receiving an EFAULT */
int *kgn_max_purgatory; /* # conns/peer to keep in purgatory */
+ int *kgn_reg_fail_timeout; /* registration failure timeout */
int *kgn_thread_affinity; /* bind scheduler threads to cpus */
int *kgn_thread_safe; /* use thread safe kgni API */
#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
{
gni_return_t rrc;
__u32 flags = GNI_MEM_READWRITE;
+ static unsigned long reg_to;
+ int rfto = *kgnilnd_tunables.kgn_reg_fail_timeout;
if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
flags |= GNI_MEM_PHYS_CONT;
fma_blk->gnm_blk_size, device->gnd_rcv_fma_cqh,
flags, &fma_blk->gnm_hndl);
if (rrc != GNI_RC_SUCCESS) {
- /* XXX Nic: need a way to silence this for runtime stuff that is ok to fail
- * -- like when under MDD or GART pressure on big systems
- */
+ if (rfto != GNILND_REGFAILTO_DISABLE) {
+ if (reg_to == 0) {
+ reg_to = jiffies + cfs_time_seconds(rfto);
+ } else if (time_after(jiffies, reg_to)) {
+ CERROR("FATAL:fmablk registration has failed "
+ "for %ld seconds.\n",
+ cfs_duration_sec(jiffies - reg_to) +
+ rfto);
+ LBUG();
+ }
+ }
+
CNETERR("register fmablk failed 0x%p mbox_size %d flags %u\n",
fma_blk, fma_blk->gnm_mbox_size, flags);
RETURN(-ENOMEM);
}
+ reg_to = 0;
+
/* PHYS_CONT memory isn't really mapped, at least not in GART -
* but all mappings chew up a MDD
*/
CFS_MODULE_PARM(thread_safe, "i", int, 0444,
"Use kgni thread safe API if available");
+static int reg_fail_timeout = GNILND_REGFAILTO_DISABLE; /* seconds that fmablk registration
+ * may keep failing before the node LBUGs; the default,
+ * GNILND_REGFAILTO_DISABLE, turns the check off */
+CFS_MODULE_PARM(reg_fail_timeout, "i", int, 0644,
+ "fmablk registration timeout LBUG");
+
kgn_tunables_t kgnilnd_tunables = {
.kgn_min_reconnect_interval = &min_reconnect_interval,
.kgn_max_reconnect_interval = &max_reconnect_interval,
.kgn_efault_lbug = &efault_lbug,
.kgn_thread_affinity = &thread_affinity,
.kgn_thread_safe = &thread_safe,
+ .kgn_reg_fail_timeout = ®_fail_timeout,
.kgn_max_purgatory = &max_conn_purg
};
},
{
INIT_CTL_NAME
+ .procname = "reg_fail_timeout", /* entry backed by the int reg_fail_timeout */
+ .data = &reg_fail_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ INIT_CTL_NAME
.procname = "max_conn_purg"
.data = &max_conn_purg,
.maxlen = sizeof(int),