It is possible for multiple threads to concurrently process a
transaction error that causes the connection to close.
If the close path stalls after changing the conn state to CLOSING,
the tx_done path will put a tx in purgatory since the conn state is
not ESTABLISHED anymore. When the close path continues, it expects to
find that conn->gnc_mdd_list is empty and asserts if it isn't.
Change the error code for the tx_done path so that we don't put this
tx in purgatory since we are closing the connection anyway.
Signed-off-by: Chris Horn <hornc@cray.com>
Change-Id: I4458522f16508eb53d380f62320c65c7bf84657a
Reviewed-on: http://review.whamcloud.com/15439
Tested-by: Jenkins
Reviewed-by: James Shimek <jshimek@cray.com>
Reviewed-by: James Simmons <uja.ornl@yahoo.com>
Reviewed-by: Doug Oucharek <doug.s.oucharek@intel.com>
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
conn->gnc_state = GNILND_CONN_CLOSING;
}
conn->gnc_state = GNILND_CONN_CLOSING;
}
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_RDMA_CQ_ERROR)) {
+ msleep_interruptible(MSEC_PER_SEC);
+ }
+
/* leave on peer->gnp_conns to make sure we don't let the reaper
* or others try to unlink this peer until the conn is fully
* processed for closing */
/* leave on peer->gnp_conns to make sure we don't let the reaper
* or others try to unlink this peer until the conn is fully
* processed for closing */
/* Max number of connections to keep in purgatory per peer */
#define GNILND_PURGATORY_MAX 5
/* Max number of connections to keep in purgatory per peer */
#define GNILND_PURGATORY_MAX 5
+/* Closing, don't put in purgatory */
+#define GNILND_NOPURG 222
/* payload size to add to the base mailbox size
* This is subtracting 2 from the concurrent_sends as 4 messages are included in the size
/* payload size to add to the base mailbox size
* This is subtracting 2 from the concurrent_sends as 4 messages are included in the size
#define CFS_FAIL_GNI_SCHED_DEADLINE 0xf052
#define CFS_FAIL_GNI_DGRAM_DEADLINE 0xf053
#define CFS_FAIL_GNI_DGRAM_DROP_TX 0xf054
#define CFS_FAIL_GNI_SCHED_DEADLINE 0xf052
#define CFS_FAIL_GNI_DGRAM_DEADLINE 0xf053
#define CFS_FAIL_GNI_DGRAM_DROP_TX 0xf054
+#define CFS_FAIL_GNI_RDMA_CQ_ERROR 0xf055
/* helper macros */
extern void
/* helper macros */
extern void
* verified peer notification - the theory is that
* a TX error can be communicated in all other cases */
if (tx->tx_conn->gnc_state != GNILND_CONN_ESTABLISHED &&
* verified peer notification - the theory is that
* a TX error can be communicated in all other cases */
if (tx->tx_conn->gnc_state != GNILND_CONN_ESTABLISHED &&
+ error != -GNILND_NOPURG &&
kgnilnd_check_purgatory_conn(tx->tx_conn)) {
kgnilnd_add_purgatory_tx(tx);
kgnilnd_check_purgatory_conn(tx->tx_conn)) {
kgnilnd_add_purgatory_tx(tx);
spin_unlock(&conn->gnc_list_lock);
kgnilnd_conn_mutex_unlock(&conn->gnc_rdma_mutex);
spin_unlock(&conn->gnc_list_lock);
kgnilnd_conn_mutex_unlock(&conn->gnc_rdma_mutex);
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_RDMA_CQ_ERROR)) {
+ event_data = 1LL << 48;
+ rc = 1;
+ }
+
if (likely(desc->status == GNI_RC_SUCCESS) && rc == 0) {
atomic_inc(&dev->gnd_rdma_ntx);
atomic64_add(tx->tx_nob, &dev->gnd_rdma_txbytes);
if (likely(desc->status == GNI_RC_SUCCESS) && rc == 0) {
atomic_inc(&dev->gnd_rdma_ntx);
atomic64_add(tx->tx_nob, &dev->gnd_rdma_txbytes);
-EFAULT,
rcookie,
tx->tx_msg.gnm_srcnid);
-EFAULT,
rcookie,
tx->tx_msg.gnm_srcnid);
- kgnilnd_tx_done(tx, -EFAULT);
+ kgnilnd_tx_done(tx, -GNILND_NOPURG);
kgnilnd_close_conn(conn, -ECOMM);
}
kgnilnd_close_conn(conn, -ECOMM);
}