From 0c45e49457a3f61ca661f4f7b0ad749cceaf7709 Mon Sep 17 00:00:00 2001
From: Shaun Tancheff
Date: Wed, 19 Feb 2020 10:17:06 -0600
Subject: [PATCH] LU-12945 lnet: Disable zero copy when running on VM

When running on a hypervisor platform zero copy buffers may
still be referenced when write queue size is zero

So when running on a hypervisor push the zero copy size limit
above max payload size of 16M.

Use the hypervisor test added to linux v4.14-119-g79cc74155218
and provide a replacement for earlier kernels.

kernel-commit: 79cc74155218316b9a5d28577c7077b2adba8e58

Cray-bug-id: LUS-8072
Signed-off-by: Shaun Tancheff
Change-Id: I5582a6aa8da6f48deafaf13d60cf67a09d7a7231
Reviewed-on: https://review.whamcloud.com/37300
Tested-by: jenkins
Reviewed-by: Alexey Lyashkov
Tested-by: Maloo
Reviewed-by: Petros Koutoupis
Reviewed-by: Oleg Drokin
---
NOTE(review): this patch was rebuilt from a whitespace-collapsed copy in
which all angle-bracketed text had been stripped.
 * Author/reviewer e-mail addresses are missing and must be restored
   before "git am" will accept the header.
 * The two x86 hypervisor includes were restored to <asm/hypervisor.h>
   (the header that declares hypervisor_is_type(), X86_HYPER_NATIVE and
   x86_hyper).  The header name of the FIRST "#include" added to
   socklnd_modparams.c could not be recovered and is still empty; fill
   it in before applying.
 * Indentation and column alignment of context/removed lines are
   approximate; apply with --ignore-whitespace.

 lnet/autoconf/lustre-lnet.m4           | 22 +++++++++
 lnet/klnds/socklnd/socklnd_modparams.c | 89 +++++++++++++++++++++-------------
 2 files changed, 77 insertions(+), 34 deletions(-)

diff --git a/lnet/autoconf/lustre-lnet.m4 b/lnet/autoconf/lustre-lnet.m4
index ccb61b1..ebd3400 100644
--- a/lnet/autoconf/lustre-lnet.m4
+++ b/lnet/autoconf/lustre-lnet.m4
@@ -638,6 +638,27 @@ LB_CHECK_EXPORT([kmap_to_page], [mm/highmem.c],
 ]) # LN_EXPORT_KMAP_TO_PAG
 
 #
+# LN_HAVE_HYPERVISOR_IS_TYPE
+#
+# 4.14 commit 79cc74155218316b9a5d28577c7077b2adba8e58
+# x86/paravirt: Provide a way to check for hypervisors
+#
+AC_DEFUN([LN_HAVE_HYPERVISOR_IS_TYPE], [
+tmp_flags="$EXTRA_KCFLAGS"
+EXTRA_KCFLAGS="-Werror"
+LB_CHECK_COMPILE([if hypervisor_is_type function is available],
+hypervisor_is_type_exists, [
+	#include <asm/hypervisor.h>
+],[
+	(void)hypervisor_is_type(X86_HYPER_NATIVE);
+],[
+	AC_DEFINE(HAVE_HYPERVISOR_IS_TYPE, 1,
+		[hypervisor_is_type function exists])
+])
+EXTRA_KCFLAGS="$tmp_flags"
+]) # LN_HAVE_HYPERVISOR_IS_TYPE
+
+#
 # LN_HAVE_ORACLE_OFED_EXTENSIONS
 #
 # Oracle UEK 5
@@ -746,6 +767,7 @@ LN_CONFIG_SK_DATA_READY
 # 4.x
 LN_CONFIG_SOCK_CREATE_KERN
 # 4.14
+LN_HAVE_HYPERVISOR_IS_TYPE
 LN_HAVE_ORACLE_OFED_EXTENSIONS
 # 4.17
 LN_CONFIG_SOCK_GETNAME
diff --git a/lnet/klnds/socklnd/socklnd_modparams.c b/lnet/klnds/socklnd/socklnd_modparams.c
index 046436f..737cb88 100644
--- a/lnet/klnds/socklnd/socklnd_modparams.c
+++ b/lnet/klnds/socklnd/socklnd_modparams.c
@@ -21,6 +21,11 @@
 
 #include "socklnd.h"
 
+#include
+#if defined(__x86_64__) || defined(__i386__)
+#include <asm/hypervisor.h>
+#endif
+
 static int sock_timeout = 50;
 module_param(sock_timeout, int, 0644);
 MODULE_PARM_DESC(sock_timeout, "dead socket timeout (seconds)");
@@ -153,39 +158,49 @@ module_param(protocol, int, 0644);
 MODULE_PARM_DESC(protocol, "protocol version");
 #endif
 
+static inline bool is_native_host(void)
+{
+#ifdef HAVE_HYPERVISOR_IS_TYPE
+	return hypervisor_is_type(X86_HYPER_NATIVE);
+#elif defined(__x86_64__) || defined(__i386__)
+	return x86_hyper == NULL;
+#else
+	return true;
+#endif
+}
+
 struct ksock_tunables ksocknal_tunables;
 
 int ksocknal_tunables_init(void)
 {
-
-        /* initialize ksocknal_tunables structure */
-        ksocknal_tunables.ksnd_timeout = &sock_timeout;
+	/* initialize ksocknal_tunables structure */
+	ksocknal_tunables.ksnd_timeout = &sock_timeout;
 	ksocknal_tunables.ksnd_nscheds = &nscheds;
-        ksocknal_tunables.ksnd_nconnds = &nconnds;
-        ksocknal_tunables.ksnd_nconnds_max = &nconnds_max;
-        ksocknal_tunables.ksnd_min_reconnectms = &min_reconnectms;
-        ksocknal_tunables.ksnd_max_reconnectms = &max_reconnectms;
-        ksocknal_tunables.ksnd_eager_ack = &eager_ack;
-        ksocknal_tunables.ksnd_typed_conns = &typed_conns;
-        ksocknal_tunables.ksnd_min_bulk = &min_bulk;
-        ksocknal_tunables.ksnd_tx_buffer_size = &tx_buffer_size;
-        ksocknal_tunables.ksnd_rx_buffer_size = &rx_buffer_size;
-        ksocknal_tunables.ksnd_nagle = &nagle;
-        ksocknal_tunables.ksnd_round_robin = &round_robin;
-        ksocknal_tunables.ksnd_keepalive = &keepalive;
-        ksocknal_tunables.ksnd_keepalive_idle = &keepalive_idle;
-        ksocknal_tunables.ksnd_keepalive_count = &keepalive_count;
-        ksocknal_tunables.ksnd_keepalive_intvl = &keepalive_intvl;
-        ksocknal_tunables.ksnd_credits = &credits;
-        ksocknal_tunables.ksnd_peertxcredits = &peer_credits;
-        ksocknal_tunables.ksnd_peerrtrcredits = &peer_buffer_credits;
-        ksocknal_tunables.ksnd_peertimeout = &peer_timeout;
-        ksocknal_tunables.ksnd_enable_csum = &enable_csum;
-        ksocknal_tunables.ksnd_inject_csum_error = &inject_csum_error;
-        ksocknal_tunables.ksnd_nonblk_zcack = &nonblk_zcack;
-        ksocknal_tunables.ksnd_zc_min_payload = &zc_min_payload;
-        ksocknal_tunables.ksnd_zc_recv = &zc_recv;
-        ksocknal_tunables.ksnd_zc_recv_min_nfrags = &zc_recv_min_nfrags;
+	ksocknal_tunables.ksnd_nconnds = &nconnds;
+	ksocknal_tunables.ksnd_nconnds_max = &nconnds_max;
+	ksocknal_tunables.ksnd_min_reconnectms = &min_reconnectms;
+	ksocknal_tunables.ksnd_max_reconnectms = &max_reconnectms;
+	ksocknal_tunables.ksnd_eager_ack = &eager_ack;
+	ksocknal_tunables.ksnd_typed_conns = &typed_conns;
+	ksocknal_tunables.ksnd_min_bulk = &min_bulk;
+	ksocknal_tunables.ksnd_tx_buffer_size = &tx_buffer_size;
+	ksocknal_tunables.ksnd_rx_buffer_size = &rx_buffer_size;
+	ksocknal_tunables.ksnd_nagle = &nagle;
+	ksocknal_tunables.ksnd_round_robin = &round_robin;
+	ksocknal_tunables.ksnd_keepalive = &keepalive;
+	ksocknal_tunables.ksnd_keepalive_idle = &keepalive_idle;
+	ksocknal_tunables.ksnd_keepalive_count = &keepalive_count;
+	ksocknal_tunables.ksnd_keepalive_intvl = &keepalive_intvl;
+	ksocknal_tunables.ksnd_credits = &credits;
+	ksocknal_tunables.ksnd_peertxcredits = &peer_credits;
+	ksocknal_tunables.ksnd_peerrtrcredits = &peer_buffer_credits;
+	ksocknal_tunables.ksnd_peertimeout = &peer_timeout;
+	ksocknal_tunables.ksnd_enable_csum = &enable_csum;
+	ksocknal_tunables.ksnd_inject_csum_error = &inject_csum_error;
+	ksocknal_tunables.ksnd_nonblk_zcack = &nonblk_zcack;
+	ksocknal_tunables.ksnd_zc_min_payload = &zc_min_payload;
+	ksocknal_tunables.ksnd_zc_recv = &zc_recv;
+	ksocknal_tunables.ksnd_zc_recv_min_nfrags = &zc_recv_min_nfrags;
 
 	if (enable_irq_affinity) {
 		CWARN("irq_affinity is removed from socklnd because modern "
@@ -193,19 +208,25 @@ int ksocknal_tunables_init(void)
 		      "# NICs, although you still can set irq_affinity by "
 		      "another way, please check manual for details.\n");
 	}
-        ksocknal_tunables.ksnd_irq_affinity = &enable_irq_affinity;
+	ksocknal_tunables.ksnd_irq_affinity = &enable_irq_affinity;
 
 #ifdef SOCKNAL_BACKOFF
-        ksocknal_tunables.ksnd_backoff_init = &backoff_init;
-        ksocknal_tunables.ksnd_backoff_max = &backoff_max;
+	ksocknal_tunables.ksnd_backoff_init = &backoff_init;
+	ksocknal_tunables.ksnd_backoff_max = &backoff_max;
 #endif
 
 #if SOCKNAL_VERSION_DEBUG
-        ksocknal_tunables.ksnd_protocol = &protocol;
+	ksocknal_tunables.ksnd_protocol = &protocol;
 #endif
 
-        if (*ksocknal_tunables.ksnd_zc_min_payload < (2 << 10))
-                *ksocknal_tunables.ksnd_zc_min_payload = (2 << 10);
+	if (*ksocknal_tunables.ksnd_zc_min_payload < (2 << 10))
+		*ksocknal_tunables.ksnd_zc_min_payload = (2 << 10);
+
+	/* When on a hypervisor set the minimum zero copy size
+	 * above the maximum payload size
+	 */
+	if (!is_native_host())
+		*ksocknal_tunables.ksnd_zc_min_payload = (16 << 20) + 1;
 
 	return 0;
 };
-- 
1.8.3.1