X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lnet%2Fklnds%2Fgnilnd%2Fgnilnd_conn.c;h=066fe1eb1fa10374db8682ee024735798aeae360;hb=2c7da05ca58b4146fa47cfcbc86de51099cf452a;hp=80cfbdcbb0a35a1eaeddb1239b7ecc166f155fd0;hpb=e8bf4e3eadf1cec9a0c9dca609a0b023fc5a397d;p=fs%2Flustre-release.git
diff --git a/lnet/klnds/gnilnd/gnilnd_conn.c b/lnet/klnds/gnilnd/gnilnd_conn.c
index 80cfbdc..066fe1e 100644
--- a/lnet/klnds/gnilnd/gnilnd_conn.c
+++ b/lnet/klnds/gnilnd/gnilnd_conn.c
@@ -1,6 +1,8 @@
 /*
  * Copyright (C) 2012 Cray, Inc.
  *
+ * Copyright (c) 2014, Intel Corporation.
+ *
  * Author: Nic Henke
  * Author: James Shimek
  *
@@ -36,11 +38,15 @@ kgnilnd_map_fmablk(kgn_device_t *device, kgn_fma_memblock_t *fma_blk)
 {
 	gni_return_t            rrc;
 	__u32                   flags = GNI_MEM_READWRITE;
+	static unsigned long    reg_to;
+	int                     rfto = *kgnilnd_tunables.kgn_reg_fail_timeout;
 
 	if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
 		flags |= GNI_MEM_PHYS_CONT;
 	}
 
+	fma_blk->gnm_hold_timeout = 0;
+
 	/* make sure we are mapping a clean block */
 	LASSERTF(fma_blk->gnm_hndl.qword1 == 0UL, "fma_blk %p dirty\n", fma_blk);
 
@@ -48,14 +54,25 @@ kgnilnd_map_fmablk(kgn_device_t *device, kgn_fma_memblock_t *fma_blk)
 				   fma_blk->gnm_blk_size, device->gnd_rcv_fma_cqh,
 				   flags, &fma_blk->gnm_hndl);
 
 	if (rrc != GNI_RC_SUCCESS) {
-		/* XXX Nic: need a way to silence this for runtime stuff that is ok to fail
-		 * -- like when under MDD or GART pressure on big systems
-		 */
+		if (rfto != GNILND_REGFAILTO_DISABLE) {
+			if (reg_to == 0) {
+				reg_to = jiffies + cfs_time_seconds(rfto);
+			} else if (time_after(jiffies, reg_to)) {
+				CERROR("FATAL:fmablk registration has failed "
+				       "for %ld seconds.\n",
+				       cfs_duration_sec(jiffies - reg_to) +
+				       rfto);
+				LBUG();
+			}
+		}
+
 		CNETERR("register fmablk failed 0x%p mbox_size %d flags %u\n",
 			fma_blk, fma_blk->gnm_mbox_size, flags);
 		RETURN(-ENOMEM);
 	}
 
+	reg_to = 0;
+
 	/* PHYS_CONT memory isn't really mapped, at least not in GART -
 	 * but all mappings chew up a MDD */
@@ -79,9 +96,22 @@ kgnilnd_alloc_fmablk(kgn_device_t *device, int use_phys)
 	gni_smsg_attr_t         smsg_attr;
 	unsigned long           fmablk_vers;
 
-	/* we'll use fmablk_vers and the gnd_fmablk_sem to gate access
+#if defined(CONFIG_CRAY_XT) && !defined(CONFIG_CRAY_COMPUTE)
+	/* We allocate large blocks of memory here potentially leading
+	 * to memory exhaustion during massive reconnects during a network
+	 * outage. Limit the amount of fma blocks to use by always keeping
+	 * a percent of pages free initially set to 25% of total memory. */
+	if (global_page_state(NR_FREE_PAGES) < kgnilnd_data.free_pages_limit) {
+		LCONSOLE_INFO("Exceeding free page limit of %ld. "
+			      "Free pages available %ld\n",
+			      kgnilnd_data.free_pages_limit,
+			      global_page_state(NR_FREE_PAGES));
+		return -ENOMEM;
+	}
+#endif
+	/* we'll use fmablk_vers and the gnd_fmablk_mutex to gate access
 	 * to this allocation code. Everyone will sample the version
-	 * before and after getting the semaphore. If it has changed,
+	 * before and after getting the mutex. If it has changed,
 	 * we'll bail out to check the lists again - this indicates that
 	 * some sort of change was made to the lists and it is possible
 	 * that there is a mailbox for us to find now. This should prevent
@@ -89,12 +119,12 @@ kgnilnd_alloc_fmablk(kgn_device_t *device, int use_phys)
 	 * that need a yet-to-be-allocated mailbox for a connection.
 	 */
 	fmablk_vers = atomic_read(&device->gnd_fmablk_vers);
-	down(&device->gnd_fmablk_sem);
+	mutex_lock(&device->gnd_fmablk_mutex);
 
 	if (fmablk_vers != atomic_read(&device->gnd_fmablk_vers)) {
 		/* version changed while we were waiting for semaphore,
 		 * we'll recheck the lists assuming something nice happened */
-		up(&device->gnd_fmablk_sem);
+		mutex_unlock(&device->gnd_fmablk_mutex);
 		return 0;
 	}
 
@@ -149,7 +179,7 @@ kgnilnd_alloc_fmablk(kgn_device_t *device, int use_phys)
 		       num_mbox, fma_blk->gnm_blk_size, fma_blk->gnm_mbox_size,
 		       *kgnilnd_tunables.kgn_mbox_per_block);
 
-		LIBCFS_ALLOC(fma_blk->gnm_block, fma_blk->gnm_blk_size);
+		fma_blk->gnm_block = kgnilnd_vzalloc(fma_blk->gnm_blk_size);
 		if (fma_blk->gnm_block == NULL) {
 			CNETERR("could not allocate virtual SMSG mailbox memory, %d bytes\n", fma_blk->gnm_blk_size);
 			rc = -ENOMEM;
@@ -187,7 +217,7 @@ kgnilnd_alloc_fmablk(kgn_device_t *device, int use_phys)
 	fma_blk->gnm_avail_mboxs = fma_blk->gnm_num_mboxs = num_mbox;
 
 	CDEBUG(D_MALLOC, "alloc fmablk 0x%p num %d msg_maxsize %d credits %d "
-		"mbox_size %d MDD "LPX64"."LPX64"\n",
+		"mbox_size %d MDD %#llx.%#llx\n",
 		fma_blk, num_mbox, smsg_attr.msg_maxsize, smsg_attr.mbox_maxcredit,
 		fma_blk->gnm_mbox_size, fma_blk->gnm_hndl.qword1,
 		fma_blk->gnm_hndl.qword2);
@@ -203,7 +233,7 @@ kgnilnd_alloc_fmablk(kgn_device_t *device, int use_phys)
 
 	spin_unlock(&device->gnd_fmablk_lock);
 
-	up(&device->gnd_fmablk_sem);
+	mutex_unlock(&device->gnd_fmablk_mutex);
 
 	return 0;
 
@@ -220,7 +250,7 @@ free_blk:
 free_desc:
 	LIBCFS_FREE(fma_blk, sizeof(kgn_fma_memblock_t));
 out:
-	up(&device->gnd_fmablk_sem);
+	mutex_unlock(&device->gnd_fmablk_mutex);
 	return rc;
 }
 
@@ -230,8 +260,11 @@ kgnilnd_unmap_fmablk(kgn_device_t *dev, kgn_fma_memblock_t *fma_blk)
 	gni_return_t            rrc;
 
 	/* if some held, set hold_timeout from conn timeouts used in this block
-	 * but not during shutdown, then just nuke and pave */
-	if (fma_blk->gnm_held_mboxs && (!kgnilnd_data.kgn_shutdown)) {
+	 * but not during shutdown, then just nuke and pave
+	 * During a stack reset, we need to deregister with a hold timeout
+	 * set so we don't use the same mdd after reset is complete */
+	if ((fma_blk->gnm_held_mboxs && !kgnilnd_data.kgn_shutdown) ||
+	    kgnilnd_data.kgn_in_reset) {
 		fma_blk->gnm_hold_timeout = GNILND_TIMEOUT2DEADMAN;
 	}
 
@@ -253,7 +286,9 @@ kgnilnd_unmap_fmablk(kgn_device_t *dev, kgn_fma_memblock_t *fma_blk)
 		"tried to double unmap or something bad, fma_blk %p (rrc %d)\n",
 		fma_blk, rrc);
 
-	if (fma_blk->gnm_hold_timeout) {
+	if (fma_blk->gnm_hold_timeout &&
+	    !(kgnilnd_data.kgn_in_reset &&
+	      fma_blk->gnm_state == GNILND_FMABLK_PHYS)) {
 		atomic_inc(&dev->gnd_n_mdd_held);
 	} else {
 		atomic_dec(&dev->gnd_n_mdd);
@@ -380,7 +415,7 @@ kgnilnd_find_free_mbox(kgn_conn_t *conn)
 
 	CDEBUG(D_NET, "conn %p smsg %p fmablk %p "
 		"allocating SMSG mbox %d buf %p "
-		"offset %u hndl "LPX64"."LPX64"\n",
+		"offset %u hndl %#llx.%#llx\n",
 		conn, smsg_attr, fma_blk, id,
 		smsg_attr->msg_buffer, smsg_attr->mbox_offset,
 		fma_blk->gnm_hndl.qword1,
@@ -470,14 +505,14 @@ kgnilnd_release_mbox(kgn_conn_t *conn, int purgatory_hold)
 	 * > 0 - hold it for now */
 	if (purgatory_hold == 0) {
 		CDEBUG(D_NET, "conn %p smsg %p fmablk %p freeing SMSG mbox %d "
-			"hndl "LPX64"."LPX64"\n",
+			"hndl %#llx.%#llx\n",
 			conn, smsg_attr, fma_blk, id,
 			fma_blk->gnm_hndl.qword1,
 			fma_blk->gnm_hndl.qword2);
 		fma_blk->gnm_avail_mboxs++;
 	} else if (purgatory_hold > 0) {
 		CDEBUG(D_NET, "conn %p smsg %p fmablk %p holding SMSG mbox %d "
-			"hndl "LPX64"."LPX64"\n",
+			"hndl %#llx.%#llx\n",
 			conn, smsg_attr, fma_blk, id,
 			fma_blk->gnm_hndl.qword1,
 			fma_blk->gnm_hndl.qword2);
@@ -486,7 +521,7 @@ kgnilnd_release_mbox(kgn_conn_t *conn, int purgatory_hold)
 			conn->gnc_timeout);
 	} else {
 		CDEBUG(D_NET, "conn %p smsg %p fmablk %p release SMSG mbox %d "
-			"hndl "LPX64"."LPX64"\n",
+			"hndl %#llx.%#llx\n",
 			conn, smsg_attr, fma_blk, id,
 			fma_blk->gnm_hndl.qword1,
 			fma_blk->gnm_hndl.qword2);
@@ -584,8 +619,8 @@ kgnilnd_map_phys_fmablk(kgn_device_t *device)
 	int                     rc = 0;
 	kgn_fma_memblock_t     *fma_blk;
 
-	/* use sem to gate access to single thread, just in case */
-	down(&device->gnd_fmablk_sem);
+	/* use mutex to gate access to single thread, just in case */
+	mutex_lock(&device->gnd_fmablk_mutex);
 
 	spin_lock(&device->gnd_fmablk_lock);
 
@@ -598,7 +633,7 @@ kgnilnd_map_phys_fmablk(kgn_device_t *device)
 	}
 
 	spin_unlock(&device->gnd_fmablk_lock);
-	up(&device->gnd_fmablk_sem);
+	mutex_unlock(&device->gnd_fmablk_mutex);
 
 	RETURN(rc);
 }
@@ -609,8 +644,8 @@ kgnilnd_unmap_fma_blocks(kgn_device_t *device)
 
 	kgn_fma_memblock_t      *fma_blk;
 
-	/* use sem to gate access to single thread, just in case */
-	down(&device->gnd_fmablk_sem);
+	/* use mutex to gate access to single thread, just in case */
+	mutex_lock(&device->gnd_fmablk_mutex);
 
 	spin_lock(&device->gnd_fmablk_lock);
 
@@ -619,7 +654,7 @@ kgnilnd_unmap_fma_blocks(kgn_device_t *device)
 	}
 
 	spin_unlock(&device->gnd_fmablk_lock);
-	up(&device->gnd_fmablk_sem);
+	mutex_unlock(&device->gnd_fmablk_mutex);
 }
 
 void
@@ -628,8 +663,8 @@ kgnilnd_free_phys_fmablk(kgn_device_t *device)
 
 	kgn_fma_memblock_t      *fma_blk, *fma_blkN;
 
-	/* use sem to gate access to single thread, just in case */
-	down(&device->gnd_fmablk_sem);
+	/* use mutex to gate access to single thread, just in case */
+	mutex_lock(&device->gnd_fmablk_mutex);
 
	spin_lock(&device->gnd_fmablk_lock);
 
@@ -639,7 +674,7 @@ kgnilnd_free_phys_fmablk(kgn_device_t *device)
 	}
 
 	spin_unlock(&device->gnd_fmablk_lock);
-	up(&device->gnd_fmablk_sem);
+	mutex_unlock(&device->gnd_fmablk_mutex);
 }
 
 /* kgnilnd dgram nid->struct managment */
@@ -906,7 +941,7 @@ kgnilnd_unpack_connreq(kgn_dgram_t *dgram)
 	}
 
 	if (connreq->gncr_peerstamp == 0 || connreq->gncr_connstamp == 0) {
-		CERROR("Recived bad timestamps peer "LPU64" conn "LPU64"\n",
+		CERROR("Received bad timestamps peer %llu conn %llu\n",
 		       connreq->gncr_peerstamp, connreq->gncr_connstamp);
 		return -EPROTO;
 	}
@@ -1404,13 +1439,13 @@ kgnilnd_probe_for_dgram(kgn_device_t *dev, kgn_dgram_t **dgramp)
 		RETURN(0);
 	}
 
-	CDEBUG(D_NET, "ready "LPX64" on device 0x%p\n",
+	CDEBUG(D_NET, "ready %#llx on device 0x%p\n",
 		readyid, dev);
 
 	dgram = (kgn_dgram_t *)readyid;
 
 	LASSERTF(dgram->gndg_magic == GNILND_DGRAM_MAGIC,
-		 "dgram 0x%p from id "LPX64" with bad magic %x\n",
+		 "dgram 0x%p from id %#llx with bad magic %x\n",
 		 dgram, readyid, dgram->gndg_magic);
 
 	LASSERTF(dgram->gndg_state == GNILND_DGRAM_POSTED ||
@@ -1444,7 +1479,7 @@ kgnilnd_probe_for_dgram(kgn_device_t *dev, kgn_dgram_t **dgramp)
 	spin_unlock(&dev->gnd_dgram_lock);
 
probe_by_id told us that" - " id "LPU64" was ready\n", readyid); + " id %llu was ready\n", readyid); CDEBUG(D_NET, "grc %d dgram 0x%p type %s post_state %d " "remote_addr %u remote_id %u\n", grc, dgram, @@ -1660,7 +1695,7 @@ kgnilnd_wait_for_canceled_dgrams(kgn_device_t *dev) if (grc != GNI_RC_SUCCESS) continue; - CDEBUG(D_NET, "ready "LPX64" on device %d->0x%p\n", + CDEBUG(D_NET, "ready %#llx on device %d->0x%p\n", readyid, dev->gnd_id, dev); rc = kgnilnd_probe_for_dgram(dev, &dgram); @@ -1815,8 +1850,8 @@ kgnilnd_finish_connect(kgn_dgram_t *dgram) } if (peer->gnp_down == GNILND_RCA_NODE_DOWN) { - CNETERR("Received connection request from %s that RCA thinks is" - " down.\n", libcfs_nid2str(her_nid)); + CNETERR("Received connection request from down nid %s\n", + libcfs_nid2str(her_nid)); peer->gnp_down = GNILND_RCA_NODE_UP; } @@ -2168,7 +2203,7 @@ inform_peer: /* now that we are outside the lock, tell Mommy */ if (peer != NULL) { - kgnilnd_peer_notify(peer, rc); + kgnilnd_peer_notify(peer, rc, 0); kgnilnd_peer_decref(peer); } } @@ -2491,8 +2526,9 @@ kgnilnd_dgram_mover(void *arg) /* last second chance for others to poke us */ did_something += xchg(&dev->gnd_dgram_ready, GNILND_DGRAM_IDLE); - /* check flag variables before comittingi even if we did something; - * if we are after the deadline call schedule */ + /* check flag variables before committing even if we + * did something; if we are after the deadline call + * schedule */ if ((!did_something || time_after(jiffies, deadline)) && !kgnilnd_data.kgn_shutdown && !kgnilnd_data.kgn_quiesce_trigger) {