X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lnet%2Fklnds%2Fgnilnd%2Fgnilnd_conn.c;h=066fe1eb1fa10374db8682ee024735798aeae360;hb=2c7da05ca58b4146fa47cfcbc86de51099cf452a;hp=4ca454227fb53d01e0a3b737590cd891e2bc5f8a;hpb=c06faad10dc402499e324d2c866fa43ff214f81d;p=fs%2Flustre-release.git diff --git a/lnet/klnds/gnilnd/gnilnd_conn.c b/lnet/klnds/gnilnd/gnilnd_conn.c index 4ca4542..066fe1e 100644 --- a/lnet/klnds/gnilnd/gnilnd_conn.c +++ b/lnet/klnds/gnilnd/gnilnd_conn.c @@ -38,6 +38,8 @@ kgnilnd_map_fmablk(kgn_device_t *device, kgn_fma_memblock_t *fma_blk) { gni_return_t rrc; __u32 flags = GNI_MEM_READWRITE; + static unsigned long reg_to; + int rfto = *kgnilnd_tunables.kgn_reg_fail_timeout; if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) { flags |= GNI_MEM_PHYS_CONT; @@ -52,14 +54,25 @@ kgnilnd_map_fmablk(kgn_device_t *device, kgn_fma_memblock_t *fma_blk) fma_blk->gnm_blk_size, device->gnd_rcv_fma_cqh, flags, &fma_blk->gnm_hndl); if (rrc != GNI_RC_SUCCESS) { - /* XXX Nic: need a way to silence this for runtime stuff that is ok to fail - * -- like when under MDD or GART pressure on big systems - */ + if (rfto != GNILND_REGFAILTO_DISABLE) { + if (reg_to == 0) { + reg_to = jiffies + cfs_time_seconds(rfto); + } else if (time_after(jiffies, reg_to)) { + CERROR("FATAL:fmablk registration has failed " + "for %ld seconds.\n", + cfs_duration_sec(jiffies - reg_to) + + rfto); + LBUG(); + } + } + CNETERR("register fmablk failed 0x%p mbox_size %d flags %u\n", fma_blk, fma_blk->gnm_mbox_size, flags); RETURN(-ENOMEM); } + reg_to = 0; + /* PHYS_CONT memory isn't really mapped, at least not in GART - * but all mappings chew up a MDD */ @@ -166,7 +179,7 @@ kgnilnd_alloc_fmablk(kgn_device_t *device, int use_phys) num_mbox, fma_blk->gnm_blk_size, fma_blk->gnm_mbox_size, *kgnilnd_tunables.kgn_mbox_per_block); - LIBCFS_ALLOC(fma_blk->gnm_block, fma_blk->gnm_blk_size); + fma_blk->gnm_block = kgnilnd_vzalloc(fma_blk->gnm_blk_size); if (fma_blk->gnm_block == NULL) { CNETERR("could not allocate virtual SMSG mailbox memory, %d bytes\n", fma_blk->gnm_blk_size); rc = -ENOMEM; @@ -204,7 +217,7 @@ kgnilnd_alloc_fmablk(kgn_device_t *device, int use_phys) fma_blk->gnm_avail_mboxs = fma_blk->gnm_num_mboxs = num_mbox; CDEBUG(D_MALLOC, "alloc fmablk 0x%p num %d msg_maxsize %d credits %d " - "mbox_size %d MDD "LPX64"."LPX64"\n", + "mbox_size %d MDD %#llx.%#llx\n", fma_blk, num_mbox, smsg_attr.msg_maxsize, smsg_attr.mbox_maxcredit, fma_blk->gnm_mbox_size, fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2); @@ -402,7 +415,7 @@ kgnilnd_find_free_mbox(kgn_conn_t *conn) CDEBUG(D_NET, "conn %p smsg %p fmablk %p " "allocating SMSG mbox %d buf %p " - "offset %u hndl "LPX64"."LPX64"\n", + "offset %u hndl %#llx.%#llx\n", conn, smsg_attr, fma_blk, id, smsg_attr->msg_buffer, smsg_attr->mbox_offset, fma_blk->gnm_hndl.qword1, @@ -492,14 +505,14 @@ kgnilnd_release_mbox(kgn_conn_t *conn, int purgatory_hold) * > 0 - hold it for now */ if (purgatory_hold == 0) { CDEBUG(D_NET, "conn %p smsg %p fmablk %p freeing SMSG mbox %d " - "hndl "LPX64"."LPX64"\n", + "hndl %#llx.%#llx\n", conn, smsg_attr, fma_blk, id, fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2); fma_blk->gnm_avail_mboxs++; } else if (purgatory_hold > 0) { CDEBUG(D_NET, "conn %p smsg %p fmablk %p holding SMSG mbox %d " - "hndl "LPX64"."LPX64"\n", + "hndl %#llx.%#llx\n", conn, smsg_attr, fma_blk, id, fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2); @@ -508,7 +521,7 @@ kgnilnd_release_mbox(kgn_conn_t *conn, int purgatory_hold) conn->gnc_timeout); } else { CDEBUG(D_NET, "conn %p smsg %p fmablk %p release SMSG mbox %d " - "hndl "LPX64"."LPX64"\n", + "hndl %#llx.%#llx\n", conn, smsg_attr, fma_blk, id, fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2); @@ -928,7 +941,7 @@ kgnilnd_unpack_connreq(kgn_dgram_t *dgram) } if (connreq->gncr_peerstamp == 0 || connreq->gncr_connstamp == 0) { - CERROR("Recived bad timestamps peer "LPU64" conn "LPU64"\n", + CERROR("Recived bad timestamps peer %llu conn %llu\n", connreq->gncr_peerstamp, connreq->gncr_connstamp); return -EPROTO; } @@ -1426,13 +1439,13 @@ kgnilnd_probe_for_dgram(kgn_device_t *dev, kgn_dgram_t **dgramp) RETURN(0); } - CDEBUG(D_NET, "ready "LPX64" on device 0x%p\n", + CDEBUG(D_NET, "ready %#llx on device 0x%p\n", readyid, dev); dgram = (kgn_dgram_t *)readyid; LASSERTF(dgram->gndg_magic == GNILND_DGRAM_MAGIC, - "dgram 0x%p from id "LPX64" with bad magic %x\n", + "dgram 0x%p from id %#llx with bad magic %x\n", dgram, readyid, dgram->gndg_magic); LASSERTF(dgram->gndg_state == GNILND_DGRAM_POSTED || @@ -1466,7 +1479,7 @@ kgnilnd_probe_for_dgram(kgn_device_t *dev, kgn_dgram_t **dgramp) spin_unlock(&dev->gnd_dgram_lock); LASSERTF(grc != GNI_RC_NO_MATCH, "kgni lied! probe_by_id told us that" - " id "LPU64" was ready\n", readyid); + " id %llu was ready\n", readyid); CDEBUG(D_NET, "grc %d dgram 0x%p type %s post_state %d " "remote_addr %u remote_id %u\n", grc, dgram, @@ -1682,7 +1695,7 @@ kgnilnd_wait_for_canceled_dgrams(kgn_device_t *dev) if (grc != GNI_RC_SUCCESS) continue; - CDEBUG(D_NET, "ready "LPX64" on device %d->0x%p\n", + CDEBUG(D_NET, "ready %#llx on device %d->0x%p\n", readyid, dev->gnd_id, dev); rc = kgnilnd_probe_for_dgram(dev, &dgram); @@ -2513,8 +2526,9 @@ kgnilnd_dgram_mover(void *arg) /* last second chance for others to poke us */ did_something += xchg(&dev->gnd_dgram_ready, GNILND_DGRAM_IDLE); - /* check flag variables before comittingi even if we did something; - * if we are after the deadline call schedule */ + /* check flag variables before committing even if we + * did something; if we are after the deadline call + * schedule */ if ((!did_something || time_after(jiffies, deadline)) && !kgnilnd_data.kgn_shutdown && !kgnilnd_data.kgn_quiesce_trigger) {