X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lnet%2Fklnds%2Fgnilnd%2Fgnilnd_conn.c;h=066fe1eb1fa10374db8682ee024735798aeae360;hb=2c7da05ca58b4146fa47cfcbc86de51099cf452a;hp=80cfbdcbb0a35a1eaeddb1239b7ecc166f155fd0;hpb=e8bf4e3eadf1cec9a0c9dca609a0b023fc5a397d;p=fs%2Flustre-release.git
diff --git a/lnet/klnds/gnilnd/gnilnd_conn.c b/lnet/klnds/gnilnd/gnilnd_conn.c
index 80cfbdc..066fe1e 100644
--- a/lnet/klnds/gnilnd/gnilnd_conn.c
+++ b/lnet/klnds/gnilnd/gnilnd_conn.c
@@ -1,6 +1,8 @@
 /*
  * Copyright (C) 2012 Cray, Inc.
  *
+ * Copyright (c) 2014, Intel Corporation.
+ *
  * Author: Nic Henke
  * Author: James Shimek
  *
@@ -36,11 +38,15 @@ kgnilnd_map_fmablk(kgn_device_t *device, kgn_fma_memblock_t *fma_blk)
 {
 	gni_return_t            rrc;
 	__u32                   flags = GNI_MEM_READWRITE;
+	static unsigned long    reg_to;
+	int                     rfto = *kgnilnd_tunables.kgn_reg_fail_timeout;
 
 	if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
 		flags |= GNI_MEM_PHYS_CONT;
 	}
 
+	fma_blk->gnm_hold_timeout = 0;
+
 	/* make sure we are mapping a clean block */
 	LASSERTF(fma_blk->gnm_hndl.qword1 == 0UL, "fma_blk %p dirty\n", fma_blk);
 
@@ -48,14 +54,25 @@ kgnilnd_map_fmablk(kgn_device_t *device, kgn_fma_memblock_t *fma_blk)
 				   fma_blk->gnm_blk_size, device->gnd_rcv_fma_cqh,
 				   flags, &fma_blk->gnm_hndl);
 
 	if (rrc != GNI_RC_SUCCESS) {
-		/* XXX Nic: need a way to silence this for runtime stuff that is ok to fail
-		 * -- like when under MDD or GART pressure on big systems
-		 */
+		if (rfto != GNILND_REGFAILTO_DISABLE) {
+			if (reg_to == 0) {
+				reg_to = jiffies + cfs_time_seconds(rfto);
+			} else if (time_after(jiffies, reg_to)) {
+				CERROR("FATAL:fmablk registration has failed "
+				       "for %ld seconds.\n",
+				       cfs_duration_sec(jiffies - reg_to) +
+				       rfto);
+				LBUG();
+			}
+		}
+
 		CNETERR("register fmablk failed 0x%p mbox_size %d flags %u\n",
 			fma_blk, fma_blk->gnm_mbox_size, flags);
 		RETURN(-ENOMEM);
 	}
 
+	reg_to = 0;
+
 	/* PHYS_CONT memory isn't really mapped, at least not in GART -
 	 * but all mappings chew up a MDD */
@@ -79,9 +96,22 @@ kgnilnd_alloc_fmablk(kgn_device_t *device, int use_phys)
 	gni_smsg_attr_t         smsg_attr;
 	unsigned long           fmablk_vers;
 
-	/* we'll use fmablk_vers and the gnd_fmablk_sem to gate access
+#if defined(CONFIG_CRAY_XT) && !defined(CONFIG_CRAY_COMPUTE)
+	/* We allocate large blocks of memory here potentially leading
+	 * to memory exhaustion during massive reconnects during a network
+	 * outage. Limit the amount of fma blocks to use by always keeping
+	 * a percent of pages free initially set to 25% of total memory. */
+	if (global_page_state(NR_FREE_PAGES) < kgnilnd_data.free_pages_limit) {
+		LCONSOLE_INFO("Exceeding free page limit of %ld. "
+			      "Free pages available %ld\n",
+			      kgnilnd_data.free_pages_limit,
+			      global_page_state(NR_FREE_PAGES));
+		return -ENOMEM;
+	}
+#endif
+	/* we'll use fmablk_vers and the gnd_fmablk_mutex to gate access
 	 * to this allocation code. Everyone will sample the version
-	 * before and after getting the semaphore. If it has changed,
+	 * before and after getting the mutex. If it has changed,
 	 * we'll bail out to check the lists again - this indicates that
 	 * some sort of change was made to the lists and it is possible
 	 * that there is a mailbox for us to find now. This should prevent
@@ -89,12 +119,12 @@ kgnilnd_alloc_fmablk(kgn_device_t *device, int use_phys)
 	 * that need a yet-to-be-allocated mailbox for a connection.
 	 */
 	fmablk_vers = atomic_read(&device->gnd_fmablk_vers);
-	down(&device->gnd_fmablk_sem);
+	mutex_lock(&device->gnd_fmablk_mutex);
 
 	if (fmablk_vers != atomic_read(&device->gnd_fmablk_vers)) {
 		/* version changed while we were waiting for semaphore,
 		 * we'll recheck the lists assuming something nice happened */
-		up(&device->gnd_fmablk_sem);
+		mutex_unlock(&device->gnd_fmablk_mutex);
 		return 0;
 	}
 
@@ -149,7 +179,7 @@ kgnilnd_alloc_fmablk(kgn_device_t *device, int use_phys)
 		       num_mbox, fma_blk->gnm_blk_size, fma_blk->gnm_mbox_size,
 		       *kgnilnd_tunables.kgn_mbox_per_block);
 
-		LIBCFS_ALLOC(fma_blk->gnm_block, fma_blk->gnm_blk_size);
+		fma_blk->gnm_block = kgnilnd_vzalloc(fma_blk->gnm_blk_size);
 		if (fma_blk->gnm_block == NULL) {
 			CNETERR("could not allocate virtual SMSG mailbox memory, %d bytes\n", fma_blk->gnm_blk_size);
 			rc = -ENOMEM;
@@ -187,7 +217,7 @@ kgnilnd_alloc_fmablk(kgn_device_t *device, int use_phys)
 	fma_blk->gnm_avail_mboxs = fma_blk->gnm_num_mboxs = num_mbox;
 
 	CDEBUG(D_MALLOC, "alloc fmablk 0x%p num %d msg_maxsize %d credits %d "
-		"mbox_size %d MDD "LPX64"."LPX64"\n",
+		"mbox_size %d MDD %#llx.%#llx\n",
 		fma_blk, num_mbox, smsg_attr.msg_maxsize, smsg_attr.mbox_maxcredit,
 		fma_blk->gnm_mbox_size, fma_blk->gnm_hndl.qword1,
 		fma_blk->gnm_hndl.qword2);
@@ -203,7 +233,7 @@ kgnilnd_alloc_fmablk(kgn_device_t *device, int use_phys)
 
 	spin_unlock(&device->gnd_fmablk_lock);
 
-	up(&device->gnd_fmablk_sem);
+	mutex_unlock(&device->gnd_fmablk_mutex);
 
 	return 0;
 
@@ -220,7 +250,7 @@ free_blk:
 free_desc:
 	LIBCFS_FREE(fma_blk, sizeof(kgn_fma_memblock_t));
 out:
-	up(&device->gnd_fmablk_sem);
+	mutex_unlock(&device->gnd_fmablk_mutex);
 	return rc;
 }
 
@@ -230,8 +260,11 @@ kgnilnd_unmap_fmablk(kgn_device_t *dev, kgn_fma_memblock_t *fma_blk)
 	gni_return_t            rrc;
 
 	/* if some held, set hold_timeout from conn timeouts used in this block
-	 * but not during shutdown, then just nuke and pave */
-	if (fma_blk->gnm_held_mboxs && (!kgnilnd_data.kgn_shutdown)) {
+	 * but not during shutdown, then just nuke and pave
+	 * During a stack reset, we need to deregister with a hold timeout
+	 * set so we don't use the same mdd after reset is complete */
+	if ((fma_blk->gnm_held_mboxs && !kgnilnd_data.kgn_shutdown) ||
+	    kgnilnd_data.kgn_in_reset) {
 		fma_blk->gnm_hold_timeout = GNILND_TIMEOUT2DEADMAN;
 	}
 
@@ -253,7 +286,9 @@ kgnilnd_unmap_fmablk(kgn_device_t *dev, kgn_fma_memblock_t *fma_blk)
 		"tried to double unmap or something bad, fma_blk %p (rrc %d)\n",
 		fma_blk, rrc);
 
-	if (fma_blk->gnm_hold_timeout) {
+	if (fma_blk->gnm_hold_timeout &&
+	    !(kgnilnd_data.kgn_in_reset &&
+	      fma_blk->gnm_state == GNILND_FMABLK_PHYS)) {
 		atomic_inc(&dev->gnd_n_mdd_held);
 	} else {
 		atomic_dec(&dev->gnd_n_mdd);
@@ -380,7 +415,7 @@ kgnilnd_find_free_mbox(kgn_conn_t *conn)
 
 	CDEBUG(D_NET, "conn %p smsg %p fmablk %p "
 		"allocating SMSG mbox %d buf %p "
-		"offset %u hndl "LPX64"."LPX64"\n",
+		"offset %u hndl %#llx.%#llx\n",
 		conn, smsg_attr, fma_blk, id,
 		smsg_attr->msg_buffer, smsg_attr->mbox_offset,
 		fma_blk->gnm_hndl.qword1,
@@ -470,14 +505,14 @@ kgnilnd_release_mbox(kgn_conn_t *conn, int purgatory_hold)
 	 * > 0 - hold it for now */
 	if (purgatory_hold == 0) {
 		CDEBUG(D_NET, "conn %p smsg %p fmablk %p freeing SMSG mbox %d "
-			"hndl "LPX64"."LPX64"\n",
+			"hndl %#llx.%#llx\n",
 			conn, smsg_attr, fma_blk, id,
 			fma_blk->gnm_hndl.qword1,
 			fma_blk->gnm_hndl.qword2);
 		fma_blk->gnm_avail_mboxs++;
 	} else if (purgatory_hold > 0) {
 		CDEBUG(D_NET, "conn %p smsg %p fmablk %p holding SMSG mbox %d "
-			"hndl "LPX64"."LPX64"\n",
+			"hndl %#llx.%#llx\n",
 			conn, smsg_attr, fma_blk, id,
 			fma_blk->gnm_hndl.qword1,
 			fma_blk->gnm_hndl.qword2);
@@ -486,7 +521,7 @@ kgnilnd_release_mbox(kgn_conn_t *conn, int purgatory_hold)
 			conn->gnc_timeout);
 	} else {
 		CDEBUG(D_NET, "conn %p smsg %p fmablk %p release SMSG mbox %d "
-			"hndl "LPX64"."LPX64"\n",
+			"hndl %#llx.%#llx\n",
 			conn, smsg_attr, fma_blk, id,
 			fma_blk->gnm_hndl.qword1,
 			fma_blk->gnm_hndl.qword2);
@@ -584,8 +619,8 @@ kgnilnd_map_phys_fmablk(kgn_device_t *device)
 	int                     rc = 0;
 	kgn_fma_memblock_t     *fma_blk;
 
-	/* use sem to gate access to single thread, just in case */
-	down(&device->gnd_fmablk_sem);
+	/* use mutex to gate access to single thread, just in case */
+	mutex_lock(&device->gnd_fmablk_mutex);
 
 	spin_lock(&device->gnd_fmablk_lock);
 
@@ -598,7 +633,7 @@ kgnilnd_map_phys_fmablk(kgn_device_t *device)
 	}
 
 	spin_unlock(&device->gnd_fmablk_lock);
-	up(&device->gnd_fmablk_sem);
+	mutex_unlock(&device->gnd_fmablk_mutex);
 
 	RETURN(rc);
 }
@@ -609,8 +644,8 @@ kgnilnd_unmap_fma_blocks(kgn_device_t *device)
 
 	kgn_fma_memblock_t      *fma_blk;
 
-	/* use sem to gate access to single thread, just in case */
-	down(&device->gnd_fmablk_sem);
+	/* use mutex to gate access to single thread, just in case */
+	mutex_lock(&device->gnd_fmablk_mutex);
 
 	spin_lock(&device->gnd_fmablk_lock);
 
@@ -619,7 +654,7 @@ kgnilnd_unmap_fma_blocks(kgn_device_t *device)
 	}
 
 	spin_unlock(&device->gnd_fmablk_lock);
-	up(&device->gnd_fmablk_sem);
+	mutex_unlock(&device->gnd_fmablk_mutex);
 }
 
 void
@@ -628,8 +663,8 @@ kgnilnd_free_phys_fmablk(kgn_device_t *device)
 
 	kgn_fma_memblock_t      *fma_blk, *fma_blkN;
 
-	/* use sem to gate access to single thread, just in case */
-	down(&device->gnd_fmablk_sem);
+	/* use mutex to gate access to single thread, just in case */
+	mutex_lock(&device->gnd_fmablk_mutex);
 
	spin_lock(&device->gnd_fmablk_lock);
 
@@ -639,7 +674,7 @@ kgnilnd_free_phys_fmablk(kgn_device_t *device)
 	}
 
 	spin_unlock(&device->gnd_fmablk_lock);
-	up(&device->gnd_fmablk_sem);
+	mutex_unlock(&device->gnd_fmablk_mutex);
 }
 
 /* kgnilnd dgram nid->struct managment */
@@ -906,7 +941,7 @@ kgnilnd_unpack_connreq(kgn_dgram_t *dgram)
 	}
 
 	if (connreq->gncr_peerstamp == 0 || connreq->gncr_connstamp == 0) {
-		CERROR("Recived bad timestamps peer "LPU64" conn "LPU64"\n",
+		CERROR("Received bad timestamps peer %llu conn %llu\n",
 		       connreq->gncr_peerstamp, connreq->gncr_connstamp);
 		return -EPROTO;
 	}
@@ -1404,13 +1439,13 @@ kgnilnd_probe_for_dgram(kgn_device_t *dev, kgn_dgram_t **dgramp)
 		RETURN(0);
 	}
 
-	CDEBUG(D_NET, "ready "LPX64" on device 0x%p\n",
+	CDEBUG(D_NET, "ready %#llx on device 0x%p\n",
 		readyid, dev);
 
 	dgram = (kgn_dgram_t *)readyid;
 
 	LASSERTF(dgram->gndg_magic == GNILND_DGRAM_MAGIC,
-		 "dgram 0x%p from id "LPX64" with bad magic %x\n",
+		 "dgram 0x%p from id %#llx with bad magic %x\n",
 		 dgram, readyid, dgram->gndg_magic);
 
 	LASSERTF(dgram->gndg_state == GNILND_DGRAM_POSTED ||
@@ -1444,7 +1479,7 @@ kgnilnd_probe_for_dgram(kgn_device_t *dev, kgn_dgram_t **dgramp)
 	spin_unlock(&dev->gnd_dgram_lock);
 
probe_by_id told us that" - " id "LPU64" was ready\n", readyid); + " id %llu was ready\n", readyid); CDEBUG(D_NET, "grc %d dgram 0x%p type %s post_state %d " "remote_addr %u remote_id %u\n", grc, dgram, @@ -1660,7 +1695,7 @@ kgnilnd_wait_for_canceled_dgrams(kgn_device_t *dev) if (grc != GNI_RC_SUCCESS) continue; - CDEBUG(D_NET, "ready "LPX64" on device %d->0x%p\n", + CDEBUG(D_NET, "ready %#llx on device %d->0x%p\n", readyid, dev->gnd_id, dev); rc = kgnilnd_probe_for_dgram(dev, &dgram); @@ -1815,8 +1850,8 @@ kgnilnd_finish_connect(kgn_dgram_t *dgram) } if (peer->gnp_down == GNILND_RCA_NODE_DOWN) { - CNETERR("Received connection request from %s that RCA thinks is" - " down.\n", libcfs_nid2str(her_nid)); + CNETERR("Received connection request from down nid %s\n", + libcfs_nid2str(her_nid)); peer->gnp_down = GNILND_RCA_NODE_UP; } @@ -2168,7 +2203,7 @@ inform_peer: /* now that we are outside the lock, tell Mommy */ if (peer != NULL) { - kgnilnd_peer_notify(peer, rc); + kgnilnd_peer_notify(peer, rc, 0); kgnilnd_peer_decref(peer); } } @@ -2491,8 +2526,9 @@ kgnilnd_dgram_mover(void *arg) /* last second chance for others to poke us */ did_something += xchg(&dev->gnd_dgram_ready, GNILND_DGRAM_IDLE); - /* check flag variables before comittingi even if we did something; - * if we are after the deadline call schedule */ + /* check flag variables before committing even if we + * did something; if we are after the deadline call + * schedule */ if ((!did_something || time_after(jiffies, deadline)) && !kgnilnd_data.kgn_shutdown && !kgnilnd_data.kgn_quiesce_trigger) {