Whamcloud - gitweb
LU-15076 socklnd: lock ksnc_tx_queue list processing 79/45179/2
author Artem Blagodarenko <artem.blagodarenko@gmail.com>
Sat, 9 Oct 2021 04:35:19 +0000 (00:35 -0400)
committer Oleg Drokin <green@whamcloud.com>
Wed, 27 Oct 2021 00:35:10 +0000 (00:35 +0000)
A GPF (general protection fault) occurred in ksocknal_find_timed_out_conn()
while processing the ksnc_tx_queue list.

Hold the connection scheduler's kss_lock while checking ksnc_tx_queue.

Change-Id: I1f76683e5798c5015f11e3fa285db9613b1af906
Signed-off-by: Artem Blagodarenko <artem.blagodarenko@hpe.com>
HPE-bug-id: LUS-10248
Fixes: 25c1cb2c4d ("LU-9120 lnet: handle socklnd tx failure")
Reviewed-by: Chris Horn <hornc@cray.com>
Reviewed-by: Alexander Boyko <alexander.boyko@hpe.com>
Reviewed-on: https://review.whamcloud.com/45179
Tested-by: jenkins <devops@whamcloud.com>
Reviewed-by: Chris Horn <chris.horn@hpe.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/klnds/socklnd/socklnd_cb.c

index fcdbd16..614b864 100644 (file)
@@ -2309,12 +2309,14 @@ ksocknal_find_timed_out_conn(struct ksock_peer_ni *peer_ni)
         /* We're called with a shared lock on ksnd_global_lock */
        struct ksock_conn *conn;
        struct ksock_tx *tx;
+       struct ksock_sched *sched;
 
        list_for_each_entry(conn, &peer_ni->ksnp_conns, ksnc_list) {
                int error;
 
                 /* Don't need the {get,put}connsock dance to deref ksnc_sock */
                 LASSERT (!conn->ksnc_closing);
+               sched = conn->ksnc_scheduler;
 
                error = conn->ksnc_sock->sk->sk_err;
                 if (error != 0) {
@@ -2355,6 +2357,7 @@ ksocknal_find_timed_out_conn(struct ksock_peer_ni *peer_ni)
                        return conn;
                }
 
+               spin_lock_bh(&sched->kss_lock);
                if ((!list_empty(&conn->ksnc_tx_queue) ||
                     conn->ksnc_sock->sk->sk_wmem_queued != 0) &&
                    ktime_get_seconds() >= conn->ksnc_tx_deadline) {
@@ -2369,8 +2372,10 @@ ksocknal_find_timed_out_conn(struct ksock_peer_ni *peer_ni)
                        CNETERR("Timeout sending data to %s (%pISp) the network or that node may be down.\n",
                                libcfs_idstr(&peer_ni->ksnp_id),
                                &conn->ksnc_peeraddr);
+                               spin_unlock_bh(&sched->kss_lock);
                                return conn;
                }
+               spin_unlock_bh(&sched->kss_lock);
        }
 
        return (NULL);