Whamcloud - gitweb
LU-6261 gnilnd: kgnilnd_check_rdma_cq race in error path.
[fs/lustre-release.git] / lnet / klnds / gnilnd / gnilnd_cb.c
index 7175f0d..b8f56f5 100644 (file)
@@ -1164,6 +1164,7 @@ kgnilnd_unmap_buffer(kgn_tx_t *tx, int error)
                 * verified peer notification  - the theory is that
                 * a TX error can be communicated in all other cases */
                if (tx->tx_conn->gnc_state != GNILND_CONN_ESTABLISHED &&
+                   error != -GNILND_NOPURG &&
                    kgnilnd_check_purgatory_conn(tx->tx_conn)) {
                        kgnilnd_add_purgatory_tx(tx);
 
@@ -1471,7 +1472,6 @@ kgnilnd_sendmsg_nolock(kgn_tx_t *tx, void *immediate, unsigned int immediatenob,
         */
        msg->gnm_connstamp = conn->gnc_my_connstamp;
        msg->gnm_payload_len = immediatenob;
-       kgnilnd_conn_mutex_lock(&conn->gnc_smsg_mutex);
        msg->gnm_seq = atomic_read(&conn->gnc_tx_seq);
 
        /* always init here - kgn_checksum is a /sys module tunable
@@ -1586,6 +1586,7 @@ kgnilnd_sendmsg(kgn_tx_t *tx, void *immediate, unsigned int immediatenob,
 
        timestamp = jiffies;
        kgnilnd_gl_mutex_lock(&dev->gnd_cq_mutex);
+       kgnilnd_conn_mutex_lock(&tx->tx_conn->gnc_smsg_mutex);
        /* delay in jiffies - we are really concerned only with things that
         * result in a schedule() or really holding this off for long times .
         * NB - mutex_lock could spin for 2 jiffies before going to sleep to wait */
@@ -1630,7 +1631,8 @@ kgnilnd_sendmsg_trylock(kgn_tx_t *tx, void *immediate, unsigned int immediatenob
                rc = 0;
        } else {
                atomic_inc(&conn->gnc_device->gnd_fast_try);
-               rc = kgnilnd_gl_mutex_trylock(&conn->gnc_device->gnd_cq_mutex);
+               rc = kgnilnd_trylock(&conn->gnc_device->gnd_cq_mutex,
+                                    &conn->gnc_smsg_mutex);
        }
        if (!rc) {
                rc = -EAGAIN;
@@ -3245,6 +3247,11 @@ kgnilnd_check_rdma_cq(kgn_device_t *dev)
                spin_unlock(&conn->gnc_list_lock);
                kgnilnd_conn_mutex_unlock(&conn->gnc_rdma_mutex);
 
+               if (CFS_FAIL_CHECK(CFS_FAIL_GNI_RDMA_CQ_ERROR)) {
+                       event_data = 1LL << 48;
+                       rc = 1;
+               }
+
                if (likely(desc->status == GNI_RC_SUCCESS) && rc == 0) {
                        atomic_inc(&dev->gnd_rdma_ntx);
                        atomic64_add(tx->tx_nob, &dev->gnd_rdma_txbytes);
@@ -3299,7 +3306,7 @@ kgnilnd_check_rdma_cq(kgn_device_t *dev)
                                         -EFAULT,
                                         rcookie,
                                         tx->tx_msg.gnm_srcnid);
-                       kgnilnd_tx_done(tx, -EFAULT);
+                       kgnilnd_tx_done(tx, -GNILND_NOPURG);
                        kgnilnd_close_conn(conn, -ECOMM);
                }