Whamcloud - gitweb
LU-14742 socklnd: detect link state to set fatal error on ni
authorSerguei Smirnov <ssmirnov@whamcloud.com>
Tue, 8 Jun 2021 21:11:41 +0000 (14:11 -0700)
committerAndreas Dilger <adilger@whamcloud.com>
Tue, 20 Jul 2021 20:04:25 +0000 (20:04 +0000)
To help avoid selecting lnet ni which corresponds to a downed
ethernet link for sending, add a mechanism for detecting link
events in socklnd. On link up/down events, find corresponding
ni and toggle ni_fatal_error_on flag, similar to o2iblnd way.

Lustre-change: https://review.whamcloud.com/43952
Lustre-commit: fc2df80e96dc5db9f3fb710893ccf6f442664471

Test-Parameters: trivial
Signed-off-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Change-Id: Ie9f4f02fcb8b988c77bf63f751d5a621e79e9f58
Reviewed-by: Amir Shehata <ashehata@whamcloud.com>
Reviewed-by: James Simmons <jsimmons@infradead.org>
Reviewed-by: Chris Horn <chris.horn@hpe.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Reviewed-on: https://review.whamcloud.com/44329
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
lnet/klnds/socklnd/socklnd.c
lnet/klnds/socklnd/socklnd.h

index 2edcda9..fec7d92 100644 (file)
@@ -1784,6 +1784,78 @@ ksocknal_free_buffers (void)
        }
 }
 
+static int ksocknal_get_link_status(struct net_device *dev)
+{
+       int ret = -1;
+
+       LASSERT(dev);
+
+       if (!netif_running(dev))
+               ret = 0;
+       /* Some devices may not be providing link settings */
+       else if (dev->ethtool_ops->get_link)
+               ret = dev->ethtool_ops->get_link(dev);
+
+       return ret;
+}
+
+static int
+ksocknal_handle_link_state_change(struct net_device *dev,
+                                 unsigned char operstate)
+{
+       struct lnet_ni *ni;
+       struct ksock_net *net;
+       struct ksock_net *cnxt;
+       int ifindex;
+       unsigned char link_down = !(operstate == IF_OPER_UP);
+
+       ifindex = dev->ifindex;
+
+       if (!ksocknal_data.ksnd_nnets)
+               goto out;
+
+       list_for_each_entry_safe(net, cnxt, &ksocknal_data.ksnd_nets,
+                                ksnn_list) {
+               if (net->ksnn_interface.ksni_index != ifindex)
+                       continue;
+               ni = net->ksnn_ni;
+               if (link_down)
+                       atomic_set(&ni->ni_fatal_error_on, link_down);
+               else
+                       atomic_set(&ni->ni_fatal_error_on,
+                                  (ksocknal_get_link_status(dev) == 0));
+       }
+out:
+       return 0;
+}
+
+
+/************************************
+ * Net device notifier event handler
+ ************************************/
+static int ksocknal_device_event(struct notifier_block *unused,
+                                unsigned long event, void *ptr)
+{
+       struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+       unsigned char operstate;
+
+       operstate = dev->operstate;
+
+       switch (event) {
+       case NETDEV_UP:
+       case NETDEV_DOWN:
+       case NETDEV_CHANGE:
+               ksocknal_handle_link_state_change(dev, operstate);
+               break;
+       }
+
+       return NOTIFY_OK;
+}
+
+static struct notifier_block ksocknal_notifier_block = {
+       .notifier_call = ksocknal_device_event,
+};
+
 static void
 ksocknal_base_shutdown(void)
 {
@@ -1795,6 +1867,9 @@ ksocknal_base_shutdown(void)
               libcfs_kmem_read());
        LASSERT (ksocknal_data.ksnd_nnets == 0);
 
+       if (ksocknal_data.ksnd_init == SOCKNAL_INIT_ALL)
+               unregister_netdevice_notifier(&ksocknal_notifier_block);
+
        switch (ksocknal_data.ksnd_init) {
        default:
                LASSERT(0);
@@ -1961,6 +2036,8 @@ ksocknal_base_startup(void)
                 goto failed;
         }
 
+       register_netdevice_notifier(&ksocknal_notifier_block);
+
         /* flag everything initialised */
         ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL;
 
@@ -2236,6 +2313,7 @@ ksocknal_startup(struct lnet_ni *ni)
        LASSERT(ksi);
        ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ksi->ksni_ipaddr);
        list_add(&net->ksnn_list, &ksocknal_data.ksnd_nets);
+       net->ksnn_ni = ni;
        ksocknal_data.ksnd_nnets++;
 
         return 0;
index 6160b3a..eae4981 100644 (file)
@@ -168,6 +168,7 @@ struct ksock_net {
        struct list_head  ksnn_list;            /* chain on global list */
        atomic_t          ksnn_npeers;          /* # peers */
        struct ksock_interface ksnn_interface;  /* IP interface */
+       struct lnet_ni    *ksnn_ni;
 };
 /* When the ksock_net is shut down, this (negative) bias is added to
  * ksnn_npeers, which prevents new peers from being added.
@@ -457,6 +458,11 @@ extern const struct ksock_proto ksocknal_protocol_v3x;
 #define CPU_MASK_NONE   0UL
 #endif
 
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 11, 0)
+#undef netdev_notifier_info_to_dev
+#define netdev_notifier_info_to_dev(ndev) ndev
+#endif
+
 static inline __u32 ksocknal_csum(__u32 crc, unsigned char const *p, size_t len)
 {
 #if 1