From: Olaf Weber Date: Fri, 27 Jan 2017 15:13:53 +0000 (+0100) Subject: LU-9119 socklnd: propagate errors on send failure X-Git-Tag: 2.9.57~4 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=61aa09e1ed8463ccda1f5d83d2c5aff8080a6116 LU-9119 socklnd: propagate errors on send failure When an attempt to send a message fails, for example because no connection could be established with the remote address, socklnd drops the message. For a PUT or REPLY message with non-zero payload, ksocknal_tx_done() calls lnet_finalize() with -EIO as the error code. But for an ACK or GET message there is no payload, and lnet_finalize() is called with 0 (no error) as the error code. This leaves upper layers to rely on other means to determine that sending the message did actually fail, and that (for example) no REPLY will ever answer a failed GET. Add an error code parameter to ksocknal_tx_done(). In ksocknal_txlist_done() change the 0/1 'error' indicator to be an actual error code that is passed on to ksocknal_tx_done(). Update the callers of ksocknal_txlist_done() to pass in the error code if they have encountered an error. 
Test-Parameters: trivial Signed-off-by: Olaf Weber Change-Id: I66b897a31e537e70dcc2622ffdfcc6e96fa93193 Reviewed-on: https://review.whamcloud.com/26691 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Doug Oucharek Reviewed-by: Oleg Drokin --- diff --git a/lnet/klnds/socklnd/socklnd.c b/lnet/klnds/socklnd/socklnd.c index 99f9781..d0e073c 100644 --- a/lnet/klnds/socklnd/socklnd.c +++ b/lnet/klnds/socklnd/socklnd.c @@ -618,7 +618,7 @@ ksocknal_del_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ip) write_unlock_bh(&ksocknal_data.ksnd_global_lock); - ksocknal_txlist_done(ni, &zombies, 1); + ksocknal_txlist_done(ni, &zombies, -ENETDOWN); return rc; } @@ -1030,6 +1030,7 @@ ksocknal_create_conn(struct lnet_ni *ni, ksock_route_t *route, ksock_tx_t *tx; ksock_tx_t *txtmp; int rc; + int rc2; int active; char *warn = NULL; @@ -1384,7 +1385,13 @@ failed_2: write_unlock_bh(global_lock); } - ksocknal_txlist_done(ni, &zombies, 1); + /* + * If we get here without an error code, just use -EALREADY. + * Depending on how we got here, the error may be positive + * or negative. Normalize the value for ksocknal_txlist_done(). + */ + rc2 = (rc == 0 ? -EALREADY : (rc > 0 ? 
-rc : rc)); + ksocknal_txlist_done(ni, &zombies, rc2); ksocknal_peer_decref(peer_ni); failed_1: diff --git a/lnet/klnds/socklnd/socklnd.h b/lnet/klnds/socklnd/socklnd.h index e3634c0..544690b 100644 --- a/lnet/klnds/socklnd/socklnd.h +++ b/lnet/klnds/socklnd/socklnd.h @@ -564,14 +564,14 @@ ksocknal_tx_addref (ksock_tx_t *tx) } extern void ksocknal_tx_prep (ksock_conn_t *, ksock_tx_t *tx); -extern void ksocknal_tx_done(struct lnet_ni *ni, ksock_tx_t *tx); +extern void ksocknal_tx_done(struct lnet_ni *ni, ksock_tx_t *tx, int error); static inline void ksocknal_tx_decref (ksock_tx_t *tx) { LASSERT (atomic_read(&tx->tx_refcount) > 0); if (atomic_dec_and_test(&tx->tx_refcount)) - ksocknal_tx_done(NULL, tx); + ksocknal_tx_done(NULL, tx, 0); } static inline void diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c index 06bfe3e..2da283a 100644 --- a/lnet/klnds/socklnd/socklnd_cb.c +++ b/lnet/klnds/socklnd/socklnd_cb.c @@ -389,25 +389,24 @@ ksocknal_receive (ksock_conn_t *conn) } void -ksocknal_tx_done(struct lnet_ni *ni, ksock_tx_t *tx) +ksocknal_tx_done(struct lnet_ni *ni, ksock_tx_t *tx, int rc) { struct lnet_msg *lnetmsg = tx->tx_lnetmsg; - int rc = (tx->tx_resid == 0 && !tx->tx_zc_aborted) ? 
0 : -EIO; ENTRY; - LASSERT(ni != NULL || tx->tx_conn != NULL); + LASSERT(ni != NULL || tx->tx_conn != NULL); - if (tx->tx_conn != NULL) - ksocknal_conn_decref(tx->tx_conn); + if (!rc && (tx->tx_resid != 0 || tx->tx_zc_aborted)) + rc = -EIO; - if (ni == NULL && tx->tx_conn != NULL) - ni = tx->tx_conn->ksnc_peer->ksnp_ni; + if (tx->tx_conn != NULL) + ksocknal_conn_decref(tx->tx_conn); ksocknal_free_tx(tx); if (lnetmsg != NULL) /* KSOCK_MSG_NOOP go without lnetmsg */ lnet_finalize(lnetmsg, rc); - EXIT; + EXIT; } void @@ -418,21 +417,21 @@ ksocknal_txlist_done(struct lnet_ni *ni, struct list_head *txlist, int error) while (!list_empty(txlist)) { tx = list_entry(txlist->next, ksock_tx_t, tx_list); - if (error && tx->tx_lnetmsg != NULL) { - CNETERR("Deleting packet type %d len %d %s->%s\n", - le32_to_cpu (tx->tx_lnetmsg->msg_hdr.type), - le32_to_cpu (tx->tx_lnetmsg->msg_hdr.payload_length), - libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.src_nid)), - libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.dest_nid))); - } else if (error) { - CNETERR("Deleting noop packet\n"); - } + if (error && tx->tx_lnetmsg != NULL) { + CNETERR("Deleting packet type %d len %d %s->%s\n", + le32_to_cpu(tx->tx_lnetmsg->msg_hdr.type), + le32_to_cpu(tx->tx_lnetmsg->msg_hdr.payload_length), + libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.src_nid)), + libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.dest_nid))); + } else if (error) { + CNETERR("Deleting noop packet\n"); + } list_del(&tx->tx_list); - LASSERT (atomic_read(&tx->tx_refcount) == 1); - ksocknal_tx_done (ni, tx); - } + LASSERT(atomic_read(&tx->tx_refcount) == 1); + ksocknal_tx_done(ni, tx, error); + } } static void @@ -2000,9 +1999,9 @@ ksocknal_connect (ksock_route_t *route) write_unlock_bh(&ksocknal_data.ksnd_global_lock); - ksocknal_peer_failed(peer_ni); - ksocknal_txlist_done(peer_ni->ksnp_ni, &zombies, 1); - return 0; + ksocknal_peer_failed(peer_ni); + ksocknal_txlist_done(peer_ni->ksnp_ni, &zombies, rc); + return 0; } 
/* @@ -2332,26 +2331,26 @@ ksocknal_find_timed_out_conn (ksock_peer_ni_t *peer_ni) static inline void ksocknal_flush_stale_txs(ksock_peer_ni_t *peer_ni) { - ksock_tx_t *tx; - struct list_head stale_txs = LIST_HEAD_INIT(stale_txs); + ksock_tx_t *tx; + struct list_head stale_txs = LIST_HEAD_INIT(stale_txs); write_lock_bh(&ksocknal_data.ksnd_global_lock); while (!list_empty(&peer_ni->ksnp_tx_queue)) { tx = list_entry(peer_ni->ksnp_tx_queue.next, - ksock_tx_t, tx_list); + ksock_tx_t, tx_list); - if (!cfs_time_aftereq(cfs_time_current(), - tx->tx_deadline)) - break; + if (!cfs_time_aftereq(cfs_time_current(), + tx->tx_deadline)) + break; list_del(&tx->tx_list); list_add_tail(&tx->tx_list, &stale_txs); - } + } write_unlock_bh(&ksocknal_data.ksnd_global_lock); - ksocknal_txlist_done(peer_ni->ksnp_ni, &stale_txs, 1); + ksocknal_txlist_done(peer_ni->ksnp_ni, &stale_txs, -ETIMEDOUT); } static int