Whamcloud - gitweb
LU-12945 lnet: Disable zero copy when running on VM 00/37300/9
author Shaun Tancheff <shaun.tancheff@hpe.com>
Wed, 19 Feb 2020 16:17:06 +0000 (10:17 -0600)
committer Oleg Drokin <green@whamcloud.com>
Sun, 1 Mar 2020 05:36:45 +0000 (05:36 +0000)
When running on a hypervisor platform, zero copy buffers
may still be referenced when the write queue size is zero.

So when running on a hypervisor, push the zero copy size limit
above the max payload size of 16M.

Use the hypervisor test added to linux v4.14-119-g79cc74155218
and provide a replacement for earlier kernels.

kernel-commit: 79cc74155218316b9a5d28577c7077b2adba8e58

Cray-bug-id: LUS-8072
Signed-off-by: Shaun Tancheff <shaun.tancheff@hpe.com>
Change-Id: I5582a6aa8da6f48deafaf13d60cf67a09d7a7231
Reviewed-on: https://review.whamcloud.com/37300
Tested-by: jenkins <devops@whamcloud.com>
Reviewed-by: Alexey Lyashkov <alexey.lyashkov@hpe.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Petros Koutoupis <petros.koutoupis@hpe.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/autoconf/lustre-lnet.m4
lnet/klnds/socklnd/socklnd_modparams.c

index ccb61b1..ebd3400 100644 (file)
@@ -638,6 +638,27 @@ LB_CHECK_EXPORT([kmap_to_page], [mm/highmem.c],
 ]) # LN_EXPORT_KMAP_TO_PAG
 
 #
+# LN_HAVE_HYPERVISOR_IS_TYPE
+#
+# 4.14 commit 79cc74155218316b9a5d28577c7077b2adba8e58
+# x86/paravirt: Provide a way to check for hypervisors
+#
+AC_DEFUN([LN_HAVE_HYPERVISOR_IS_TYPE], [
+tmp_flags="$EXTRA_KCFLAGS" # save the current flags so they can be restored below
+EXTRA_KCFLAGS="-Werror" # the probe is compiled with only -Werror in EXTRA_KCFLAGS
+LB_CHECK_COMPILE([if hypervisor_is_type function is available],
+hypervisor_is_type_exists, [
+       #include <asm/hypervisor.h>
+],[
+       (void)hypervisor_is_type(X86_HYPER_NATIVE);
+],[
+       AC_DEFINE(HAVE_HYPERVISOR_IS_TYPE, 1,
+               [hypervisor_is_type function exists])
+])
+EXTRA_KCFLAGS="$tmp_flags" # restore the flags saved before the probe
+]) # LN_HAVE_HYPERVISOR_IS_TYPE
+
+#
 # LN_HAVE_ORACLE_OFED_EXTENSIONS
 #
 # Oracle UEK 5
@@ -746,6 +767,7 @@ LN_CONFIG_SK_DATA_READY
 # 4.x
 LN_CONFIG_SOCK_CREATE_KERN
 # 4.14
+LN_HAVE_HYPERVISOR_IS_TYPE
 LN_HAVE_ORACLE_OFED_EXTENSIONS
 # 4.17
 LN_CONFIG_SOCK_GETNAME
index 046436f..737cb88 100644 (file)
 
 #include "socklnd.h"
 
+#include <linux/kvm_host.h>
+#if defined(__x86_64__) || defined(__i386__)
+#include <asm/hypervisor.h>
+#endif
+
 static int sock_timeout = 50;
 module_param(sock_timeout, int, 0644);
 MODULE_PARM_DESC(sock_timeout, "dead socket timeout (seconds)");
@@ -153,39 +158,49 @@ module_param(protocol, int, 0644);
 MODULE_PARM_DESC(protocol, "protocol version");
 #endif
 
+static inline bool is_native_host(void) /* report whether this kernel is running natively rather than on a hypervisor */
+{
+#ifdef HAVE_HYPERVISOR_IS_TYPE
+       return hypervisor_is_type(X86_HYPER_NATIVE); /* kernel-provided check (added in linux v4.14), selected by configure */
+#elif defined(__x86_64__) || defined(__i386__)
+       return x86_hyper == NULL; /* replacement for earlier x86 kernels; presumably NULL when no hypervisor was detected - TODO confirm */
+#else
+       return true; /* no check is made on other architectures; they are always reported as native */
+#endif
+}
+
 struct ksock_tunables ksocknal_tunables;
 
 int ksocknal_tunables_init(void)
 {
-
-        /* initialize ksocknal_tunables structure */
-        ksocknal_tunables.ksnd_timeout            = &sock_timeout;
+       /* initialize ksocknal_tunables structure */
+       ksocknal_tunables.ksnd_timeout            = &sock_timeout;
        ksocknal_tunables.ksnd_nscheds            = &nscheds;
-        ksocknal_tunables.ksnd_nconnds            = &nconnds;
-        ksocknal_tunables.ksnd_nconnds_max        = &nconnds_max;
-        ksocknal_tunables.ksnd_min_reconnectms    = &min_reconnectms;
-        ksocknal_tunables.ksnd_max_reconnectms    = &max_reconnectms;
-        ksocknal_tunables.ksnd_eager_ack          = &eager_ack;
-        ksocknal_tunables.ksnd_typed_conns        = &typed_conns;
-        ksocknal_tunables.ksnd_min_bulk           = &min_bulk;
-        ksocknal_tunables.ksnd_tx_buffer_size     = &tx_buffer_size;
-        ksocknal_tunables.ksnd_rx_buffer_size     = &rx_buffer_size;
-        ksocknal_tunables.ksnd_nagle              = &nagle;
-        ksocknal_tunables.ksnd_round_robin        = &round_robin;
-        ksocknal_tunables.ksnd_keepalive          = &keepalive;
-        ksocknal_tunables.ksnd_keepalive_idle     = &keepalive_idle;
-        ksocknal_tunables.ksnd_keepalive_count    = &keepalive_count;
-        ksocknal_tunables.ksnd_keepalive_intvl    = &keepalive_intvl;
-        ksocknal_tunables.ksnd_credits            = &credits;
-        ksocknal_tunables.ksnd_peertxcredits      = &peer_credits;
-        ksocknal_tunables.ksnd_peerrtrcredits     = &peer_buffer_credits;
-        ksocknal_tunables.ksnd_peertimeout        = &peer_timeout;
-        ksocknal_tunables.ksnd_enable_csum        = &enable_csum;
-        ksocknal_tunables.ksnd_inject_csum_error  = &inject_csum_error;
-        ksocknal_tunables.ksnd_nonblk_zcack       = &nonblk_zcack;
-        ksocknal_tunables.ksnd_zc_min_payload     = &zc_min_payload;
-        ksocknal_tunables.ksnd_zc_recv            = &zc_recv;
-        ksocknal_tunables.ksnd_zc_recv_min_nfrags = &zc_recv_min_nfrags;
+       ksocknal_tunables.ksnd_nconnds            = &nconnds;
+       ksocknal_tunables.ksnd_nconnds_max        = &nconnds_max;
+       ksocknal_tunables.ksnd_min_reconnectms    = &min_reconnectms;
+       ksocknal_tunables.ksnd_max_reconnectms    = &max_reconnectms;
+       ksocknal_tunables.ksnd_eager_ack          = &eager_ack;
+       ksocknal_tunables.ksnd_typed_conns        = &typed_conns;
+       ksocknal_tunables.ksnd_min_bulk           = &min_bulk;
+       ksocknal_tunables.ksnd_tx_buffer_size     = &tx_buffer_size;
+       ksocknal_tunables.ksnd_rx_buffer_size     = &rx_buffer_size;
+       ksocknal_tunables.ksnd_nagle              = &nagle;
+       ksocknal_tunables.ksnd_round_robin        = &round_robin;
+       ksocknal_tunables.ksnd_keepalive          = &keepalive;
+       ksocknal_tunables.ksnd_keepalive_idle     = &keepalive_idle;
+       ksocknal_tunables.ksnd_keepalive_count    = &keepalive_count;
+       ksocknal_tunables.ksnd_keepalive_intvl    = &keepalive_intvl;
+       ksocknal_tunables.ksnd_credits            = &credits;
+       ksocknal_tunables.ksnd_peertxcredits      = &peer_credits;
+       ksocknal_tunables.ksnd_peerrtrcredits     = &peer_buffer_credits;
+       ksocknal_tunables.ksnd_peertimeout        = &peer_timeout;
+       ksocknal_tunables.ksnd_enable_csum        = &enable_csum;
+       ksocknal_tunables.ksnd_inject_csum_error  = &inject_csum_error;
+       ksocknal_tunables.ksnd_nonblk_zcack       = &nonblk_zcack;
+       ksocknal_tunables.ksnd_zc_min_payload     = &zc_min_payload;
+       ksocknal_tunables.ksnd_zc_recv            = &zc_recv;
+       ksocknal_tunables.ksnd_zc_recv_min_nfrags = &zc_recv_min_nfrags;
 
        if (enable_irq_affinity) {
                CWARN("irq_affinity is removed from socklnd because modern "
@@ -193,19 +208,25 @@ int ksocknal_tunables_init(void)
                      "# NICs, although you still can set irq_affinity by "
                      "another way, please check manual for details.\n");
        }
-        ksocknal_tunables.ksnd_irq_affinity       = &enable_irq_affinity;
+       ksocknal_tunables.ksnd_irq_affinity       = &enable_irq_affinity;
 
 #ifdef SOCKNAL_BACKOFF
-        ksocknal_tunables.ksnd_backoff_init       = &backoff_init;
-        ksocknal_tunables.ksnd_backoff_max        = &backoff_max;
+       ksocknal_tunables.ksnd_backoff_init       = &backoff_init;
+       ksocknal_tunables.ksnd_backoff_max        = &backoff_max;
 #endif
 
 #if SOCKNAL_VERSION_DEBUG
-        ksocknal_tunables.ksnd_protocol           = &protocol;
+       ksocknal_tunables.ksnd_protocol           = &protocol;
 #endif
 
-        if (*ksocknal_tunables.ksnd_zc_min_payload < (2 << 10))
-                *ksocknal_tunables.ksnd_zc_min_payload = (2 << 10);
+       if (*ksocknal_tunables.ksnd_zc_min_payload < (2 << 10))
+               *ksocknal_tunables.ksnd_zc_min_payload = (2 << 10);
+
+       /* When on a hypervisor set the minimum zero copy size
+        * above the maximum payload size
+        */
+       if (!is_native_host())
+               *ksocknal_tunables.ksnd_zc_min_payload = (16 << 20) + 1;
 
        return 0;
 };