From 972d3eb8c5e154edded9b29a72e72c5bb9458770 Mon Sep 17 00:00:00 2001 From: Vitaly Fertman Date: Thu, 16 Dec 2010 03:19:37 +0300 Subject: [PATCH] b=24218 fix race on ksock_tx_t::tx_resid o=Liang i=isaac If a connection is closed before ksocknal_transmit() returns to ksocknal_process_transmit(), then nobody holds a refcount on conn::ksnc_sock and all pending ZC requests will be finalized by ksocknal_connsock_decref()->ksocknal_finalize_zcreq(). ksocknal_finalize_zcreq() marks each not-acked ZC request as failed by setting tx::tx_resid = -1. This is a race because ksocknal_process_transmit() checks tx::tx_resid right after calling ksocknal_transmit(), so it can see tx->tx_resid != 0 with rc == 0 and then hit the later LASSERT(rc < 0). Fix it by recording the abort in a separate flag, tx::tx_zc_aborted, instead of overloading tx::tx_resid; ksocknal_tx_done() then reports -EIO if either tx_resid != 0 or tx_zc_aborted is set. --- lnet/klnds/socklnd/socklnd.c | 3 +-- lnet/klnds/socklnd/socklnd.h | 7 ++++--- lnet/klnds/socklnd/socklnd_cb.c | 3 ++- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/lnet/klnds/socklnd/socklnd.c b/lnet/klnds/socklnd/socklnd.c index e516349..96a9bc3 100644 --- a/lnet/klnds/socklnd/socklnd.c +++ b/lnet/klnds/socklnd/socklnd.c @@ -1551,8 +1551,7 @@ ksocknal_finalize_zcreq(ksock_conn_t *conn) LASSERT (tx->tx_msg.ksm_zc_cookies[0] != 0); tx->tx_msg.ksm_zc_cookies[0] = 0; - if (tx->tx_resid == 0) - tx->tx_resid = -1; /* mark it as not-acked */ + tx->tx_zc_aborted = 1; /* mark it as not-acked */ cfs_list_del(&tx->tx_zc_list); cfs_list_add(&tx->tx_zc_list, &zlist); } diff --git a/lnet/klnds/socklnd/socklnd.h b/lnet/klnds/socklnd/socklnd.h index ae6045f..724e216 100644 --- a/lnet/klnds/socklnd/socklnd.h +++ b/lnet/klnds/socklnd/socklnd.h @@ -218,9 +218,10 @@ typedef struct /* transmit packet */ int tx_niov; /* # packet iovec frags */ struct iovec *tx_iov; /* packet iovec frags */ int tx_nkiov; /* # packet page frags */ - unsigned int tx_zc_capable:1; /* payload is large enough for ZC */ - unsigned int tx_zc_checked:1; /* Have I checked if I should ZC? 
*/ - unsigned int tx_nonblk:1; /* it's a non-blocking ACK */ + unsigned short tx_zc_aborted; /* aborted ZC request */ + unsigned short tx_zc_capable:1; /* payload is large enough for ZC */ + unsigned short tx_zc_checked:1; /* Have I checked if I should ZC? */ + unsigned short tx_nonblk:1; /* it's a non-blocking ACK */ lnet_kiov_t *tx_kiov; /* packet page frags */ struct ksock_conn *tx_conn; /* owning conn */ lnet_msg_t *tx_lnetmsg; /* lnet message for lnet_finalize() */ diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c index 0a74d3e..27b816b 100644 --- a/lnet/klnds/socklnd/socklnd_cb.c +++ b/lnet/klnds/socklnd/socklnd_cb.c @@ -54,6 +54,7 @@ ksocknal_alloc_tx(int type, int size) return NULL; cfs_atomic_set(&tx->tx_refcount, 1); + tx->tx_zc_aborted = 0; tx->tx_zc_capable = 0; tx->tx_zc_checked = 0; tx->tx_desc_size = size; @@ -387,7 +388,7 @@ void ksocknal_tx_done (lnet_ni_t *ni, ksock_tx_t *tx) { lnet_msg_t *lnetmsg = tx->tx_lnetmsg; - int rc = (tx->tx_resid == 0) ? 0 : -EIO; + int rc = (tx->tx_resid == 0 && !tx->tx_zc_aborted) ? 0 : -EIO; ENTRY; LASSERT(ni != NULL || tx->tx_conn != NULL); -- 1.8.3.1