From: Serguei Smirnov Date: Wed, 23 Jun 2021 22:51:21 +0000 (-0700) Subject: LU-14662 lnet: set eth routes needed for multi rail X-Git-Tag: 2.14.56~73 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=c9bfe57bd2495671fa66eb7e52184f76e1f4a6eb LU-14662 lnet: set eth routes needed for multi rail When ksocklnd is initialized or new ethernet interfaces are added via lnetctl, set the routing rules using a common shell script ksocklnd-config. This ensures control over source interface when sending traffic. For example, for eth0 with ip 192.168.122.142/24: the output of "ip route show table eth0" should be 192.168.122.0/24 dev eth0 proto kernel scope link src 192.168.122.142 This step can be omitted by specifying options ksocklnd skip_mr_route_setup=1 in the conf file, or by using switch --skip-mr-route-setup when adding NI with lnetctl. Note that the module parameter takes priority over the lnetctl switch: if skip-mr-route-setup is not specified when adding NI with lnetctl, the route still won't get created if the conf file has skip_mr_route_setup=1. The route also won't be created if any route already exists for the given interface, assuming advanced users who manage routing on their own will want to continue doing so. 
Test-Parameters: trivial
Signed-off-by: Serguei Smirnov
Change-Id: Ia14e637bd29d4bbce5dd93daad9992336b2e6b15
Reviewed-on: https://review.whamcloud.com/44065
Tested-by: jenkins
Tested-by: Maloo
Reviewed-by: Amir Shehata
Reviewed-by: Cyril Bordage
Reviewed-by: Oleg Drokin
---

diff --git a/lnet/klnds/socklnd/socklnd_modparams.c b/lnet/klnds/socklnd/socklnd_modparams.c
index cfc758a..163eaf6 100644
--- a/lnet/klnds/socklnd/socklnd_modparams.c
+++ b/lnet/klnds/socklnd/socklnd_modparams.c
@@ -152,6 +152,11 @@ static unsigned int conns_per_peer = DEFAULT_CONNS_PER_PEER;
 module_param(conns_per_peer, uint, 0644);
 MODULE_PARM_DESC(conns_per_peer, "number of connections per peer");
 
+/* By default skip_mr_route_setup is 0 (do not skip) */
+static unsigned int skip_mr_route_setup;
+module_param(skip_mr_route_setup, uint, 0444);
+MODULE_PARM_DESC(skip_mr_route_setup, "skip automatic setup of linux routes for MR");
+
 #ifdef SOCKNAL_BACKOFF
 static int backoff_init = 3;
 module_param(backoff_init, int, 0644);
diff --git a/lnet/utils/lnetconfig/liblnetconfig.c b/lnet/utils/lnetconfig/liblnetconfig.c
index c74d53a..e9541f4 100644
--- a/lnet/utils/lnetconfig/liblnetconfig.c
+++ b/lnet/utils/lnetconfig/liblnetconfig.c
@@ -3636,6 +3636,163 @@ int lustre_lnet_calc_service_id(__u64 *service_id)
 	return LUSTRE_CFG_RC_NO_ERR;
 }
 
+/*
+ * Collect the interface names of all local socklnd (tcp) NIs and pass them,
+ * as one comma-separated argument, to the ksocklnd-config script.
+ *
+ * Returns LUSTRE_CFG_RC_NO_ERR on success and a negative value on failure;
+ * the outcome is also reported through err_rc via cYAML_build_error().
+ */
+int lustre_lnet_setup_mrrouting(struct cYAML **err_rc)
+{
+	char *buf;
+	int rc = LUSTRE_CFG_RC_OUT_OF_MEM, i;
+	int l_errno = 0;
+	char err_str[LNET_MAX_STR_LEN] = "\"out of memory\"";
+	struct lnet_ioctl_config_ni *ni_data;
+	struct lnet_ioctl_config_lnd_tunables *lnd;
+	struct lnet_ioctl_element_stats *stats;
+	size_t buf_size = sizeof(*ni_data) + sizeof(*lnd) + sizeof(*stats);
+	char ifstr_buf[LNET_INTERFACES_NUM*LNET_MAX_STR_LEN];
+	size_t ifstr_len = 0;
+	char *tmp_ptr, *tmp_ptr2;
+	int if_cnt = 0, prc;
+	char syscmdbuf[LNET_MAX_STR_LEN];
+	char cmdpath[LNET_MAX_STR_LEN];
+	bool use_custom = false;
+
+	buf = calloc(1, buf_size);
+	if (buf == NULL)
+		goto out;
+
+	ni_data = (struct lnet_ioctl_config_ni *)buf;
+
+	ifstr_buf[0] = 0;
+
+	for (i = 0;; i++) {
+		__u32 rc_net;
+
+		memset(buf, 0, buf_size);
+
+		LIBCFS_IOC_INIT_V2(*ni_data, lic_cfg_hdr);
+		/* set the ioc_len to the proper value since INIT assumes
+		 * size of data
+		 */
+		ni_data->lic_cfg_hdr.ioc_len = buf_size;
+		ni_data->lic_idx = i;
+
+		rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_GET_LOCAL_NI, ni_data);
+		if (rc != 0) {
+			l_errno = errno;
+			break;
+		}
+
+		rc_net = LNET_NIDNET(ni_data->lic_nid);
+
+		/* only need to setup routing for tcp */
+		if (LNET_NETTYP(rc_net) != SOCKLND)
+			continue;
+
+		/* nothing to add if the NI reports no interface name */
+		if (strlen(ni_data->lic_ni_intf) == 0)
+			continue;
+
+		/* bounded append: this loop does not limit the number of
+		 * NIs, so never write past the end of ifstr_buf
+		 */
+		prc = snprintf(ifstr_buf + ifstr_len,
+			       sizeof(ifstr_buf) - ifstr_len, "%s%s",
+			       if_cnt > 0 ? "," : "", ni_data->lic_ni_intf);
+		if (prc < 0 || (size_t)prc >= sizeof(ifstr_buf) - ifstr_len) {
+			snprintf(err_str, sizeof(err_str),
+				 "\"ksocklnd-config: argument too long\"");
+			rc = -E2BIG;
+			goto out;
+		}
+		ifstr_len += prc;
+		if_cnt++;
+	}
+
+	/* any errno other than ENOENT means the NI walk itself failed */
+	if (l_errno != ENOENT) {
+		snprintf(err_str,
+			 sizeof(err_str),
+			 "\"cannot get networks: %s\"",
+			 strerror(l_errno));
+		rc = -l_errno;
+		goto out;
+	}
+
+	rc = LUSTRE_CFG_RC_NO_ERR;
+	snprintf(err_str, sizeof(err_str), "\"success\"");
+
+	if (if_cnt == 0)
+		goto out;
+
+	/* KSOCKLND_CONFIG may override the script location, but only
+	 * with a path whose last component is ksocklnd-config
+	 */
+	tmp_ptr = getenv("KSOCKLND_CONFIG");
+	if (tmp_ptr) {
+		tmp_ptr2 = strrchr(tmp_ptr, '/');
+		if (tmp_ptr2 && !strcmp(tmp_ptr2, "/ksocklnd-config")) {
+			snprintf(cmdpath, sizeof(cmdpath), "%s", tmp_ptr);
+			use_custom = true;
+		}
+	}
+
+	if (!use_custom)
+		snprintf(cmdpath, sizeof(cmdpath),
+			 "/usr/sbin/ksocklnd-config");
+
+	/* NOTE(review): the interface names are passed to the shell
+	 * unquoted; confirm they cannot carry shell metacharacters
+	 */
+	prc = snprintf(syscmdbuf, sizeof(syscmdbuf), "%s %s",
+		       cmdpath, ifstr_buf);
+	if (prc < 0) {
+		l_errno = errno;
+		snprintf(err_str,
+			 sizeof(err_str),
+			 "\"snprintf failed : %s\"",
+			 strerror(l_errno));
+		rc = -l_errno;
+		goto out;
+	}
+
+	/* never execute a truncated command line, and report it as an error */
+	if ((size_t)prc >= sizeof(syscmdbuf)) {
+		snprintf(err_str, sizeof(err_str),
+			 "\"ksocklnd-config: argument too long\"");
+		rc = -E2BIG;
+		goto out;
+	}
+
+	/* system() sets errno only when it returns -1; any other
+	 * non-zero value is the wait status of the script
+	 */
+	prc = system(syscmdbuf);
+	if (prc == -1) {
+		l_errno = errno;
+		snprintf(err_str,
+			 sizeof(err_str),
+			 "\"failed to execute ksocklnd-config : %s\"",
+			 strerror(l_errno));
+		rc = -l_errno;
+	} else if (prc != 0) {
+		snprintf(err_str, sizeof(err_str),
+			 "\"ksocklnd-config failed: wait status %d\"", prc);
+		rc = -EIO;
+	}
+out:
+	free(buf);
+
+	cYAML_build_error(rc, -1, MANAGE_CMD, "setup-mrrouting", err_str,
+			  err_rc);
+
+	return rc;
+}
+
 int show_recovery_queue(enum lnet_health_type type, char *name, int seq_no,
 			struct cYAML **show_rc, struct cYAML **err_rc)
 {
diff --git a/lnet/utils/lnetconfig/liblnetconfig.h b/lnet/utils/lnetconfig/liblnetconfig.h
index 74ecf32..67403ec 100644
--- a/lnet/utils/lnetconfig/liblnetconfig.h
+++ b/lnet/utils/lnetconfig/liblnetconfig.h
@@ -530,5 +530,16 @@ int lustre_lnet_show_max_intf(int seq_no, struct cYAML **show_rc,
 int lustre_lnet_calc_service_id(__u64 *service_id);
 
 /*
+ * lustre_lnet_setup_mrrouting
+ *   configure linux routing tables for tcp interfaces
+ *
+ *   err_rc - [OUT] struct cYAML tree describing the error. Freed by
+ *   caller
+ *
+ *   Returns LUSTRE_CFG_RC_NO_ERR on success, a negative value on failure.
+ */
+int lustre_lnet_setup_mrrouting(struct cYAML **err_rc);
+
+/*
  * lustre_lnet_config_discovery
  *   Enable or disable peer discovery. Peer discovery is enabled by default.
diff --git a/lnet/utils/lnetctl.c b/lnet/utils/lnetctl.c
index c935e8f..23888fb 100644
--- a/lnet/utils/lnetctl.c
+++ b/lnet/utils/lnetctl.c
@@ -90,6 +90,7 @@ static int jt_calc_service_id(int argc, char **argv);
 static int jt_set_response_tracking(int argc, char **argv);
 static int jt_set_recovery_limit(int argc, char **argv);
 static int jt_udsp(int argc, char **argv);
+static int jt_setup_mrrouting(int argc, char **argv);
 
 command_t cmd_list[] = {
 	{"lnet", jt_lnet, 0, "lnet {configure | unconfigure} [--all]"},
@@ -112,6 +113,8 @@ command_t cmd_list[] = {
 	{"discover", jt_discover, 0, "discover nid[,nid,...]"},
 	{"service-id", jt_calc_service_id, 0, "Calculate IB Lustre service ID\n"},
 	{"udsp", jt_udsp, 0, "udsp {add | del | help}"},
+	{"setup-mrrouting", jt_setup_mrrouting, 0,
+	 "setup linux routing tables\n"},
 	{"help", Parser_help, 0, "help"},
 	{"exit", Parser_quit, 0, "quit"},
 	{"quit", Parser_quit, 0, "quit"},
@@ -156,7 +159,8 @@ command_t net_cmds[] = {
 	 "\t--peer-buffer-credits: the number of buffer credits per peer\n"
 	 "\t--credits: Network Interface credits\n"
 	 "\t--cpt: CPU Partitions configured net uses (e.g. [0,1]\n"
-	 "\t--conns-per-peer: number of connections per peer\n"},
+	 "\t--conns-per-peer: number of connections per peer\n"
+	 "\t--skip-mr-route-setup: do not add linux route for the ni\n"},
 	{"del", jt_del_ni, 0, "delete a network\n"
 	 "\t--net: net name (e.g. tcp0)\n"
 	 "\t--if: physical interface (e.g. eth0)\n"},
@@ -301,6 +305,22 @@ static int jt_calc_service_id(int argc, char **argv)
 	return rc;
 }
 
+/* "lnetctl setup-mrrouting": set up linux routes, print errors to stderr */
+static int jt_setup_mrrouting(int argc, char **argv)
+{
+	int rc;
+	struct cYAML *err_rc = NULL;
+
+	rc = lustre_lnet_setup_mrrouting(&err_rc);
+
+	if (rc != LUSTRE_CFG_RC_NO_ERR)
+		cYAML_print_tree2file(stderr, err_rc);
+
+	cYAML_free_tree(err_rc);
+
+	return rc;
+}
+
 static inline void print_help(const command_t cmds[], const char *cmd_type,
 			      const char *pc_name)
 {
@@ -957,16 +977,19 @@ static int jt_add_ni(int argc, char **argv)
 	struct cfs_expr_list *global_cpts = NULL;
 	struct lnet_ioctl_config_lnd_tunables tunables;
 	bool found = false;
+	bool skip_mr_route_setup = false;
 
 	memset(&tunables, 0, sizeof(tunables));
 	lustre_lnet_init_nw_descr(&nw_descr);
 
-	const char *const short_options = "b:c:i:m:n:p:r:s:t:";
+	const char *const short_options = "b:c:i:km:n:p:r:s:t:";
 	static const struct option long_options[] = {
 	{ .name = "peer-buffer-credits",
 	  .has_arg = required_argument, .val = 'b' },
 	{ .name = "peer-credits",  .has_arg = required_argument, .val = 'c' },
 	{ .name = "if",		   .has_arg = required_argument, .val = 'i' },
+	{ .name = "skip-mr-route-setup",
+	  .has_arg = no_argument, .val = 'k' },
 	{ .name = "conns-per-peer",
 	  .has_arg = required_argument, .val = 'm' },
 	{ .name = "net",	   .has_arg = required_argument, .val = 'n' },
@@ -1008,6 +1031,9 @@ static int jt_add_ni(int argc, char **argv)
 				goto failed;
 			}
 			break;
+		case 'k':
+			skip_mr_route_setup = true;
+			break;
 		case 'm':
 			rc = parse_long(optarg, &cpp);
 			if (rc != 0) {
@@ -1073,6 +1099,16 @@ failed:
 
 	cYAML_free_tree(err_rc);
 
+	if (rc == LUSTRE_CFG_RC_NO_ERR && !skip_mr_route_setup) {
+		err_rc = NULL;
+		rc = lustre_lnet_setup_mrrouting(&err_rc);
+
+		if (rc != LUSTRE_CFG_RC_NO_ERR)
+			cYAML_print_tree2file(stderr, err_rc);
+
+		cYAML_free_tree(err_rc);
+	}
+
 	return rc;
 }
 
diff --git a/lustre/conf/99-lustre.rules b/lustre/conf/99-lustre.rules
index c5632f6..22616e9 100644
--- a/lustre/conf/99-lustre.rules
+++ b/lustre/conf/99-lustre.rules
@@ -2,3 +2,5 @@ KERNEL=="obd", MODE="0666"
 
 # set sysfs values on client
 SUBSYSTEM=="lustre", ACTION=="change", ENV{PARAM}=="?*", RUN+="/usr/sbin/lctl set_param '$env{PARAM}=$env{SETTING}'"
+# setup linux routes for mr on lustre load
+SUBSYSTEM=="module", ACTION=="add", DEVPATH=="/module/lustre", RUN+="/usr/sbin/lnetctl setup-mrrouting"
diff --git a/lustre/scripts/Makefile.am b/lustre/scripts/Makefile.am
index bdadab4..cb30006 100644
--- a/lustre/scripts/Makefile.am
+++ b/lustre/scripts/Makefile.am
@@ -36,7 +36,7 @@ genscripts = lc_modprobe lc_net lc_hb lc_cluman lc_md lc_lvm lustre_start lnet
 
 SUBDIRS = systemd
 
-sbin_SCRIPTS = lustre_rmmod ko2iblnd-probe
+sbin_SCRIPTS = lustre_rmmod ko2iblnd-probe ksocklnd-config
 
 if RHEL
 initdir = $(sysconfdir)/init.d
@@ -92,7 +92,7 @@ EXTRA_DIST = license-status lustre_rmmod ldev lc_mon lhbadm \
 		lc_servip lustre_routes_config lustre_routes_conversion \
 		$(addsuffix .in,$(genscripts)) lfs_migrate lustre_req_history \
 		lustre lsvcgss lc_common haconfig Lustre.ha_v2 dkms.mkconf \
-		zfsobj2fid ko2iblnd-probe statechange-lustre.sh \
+		zfsobj2fid ko2iblnd-probe ksocklnd-config statechange-lustre.sh \
 		vdev_attach-lustre.sh vdev_remove-lustre.sh vdev_clear-lustre.sh \
 		bash-completion/lustre bash-completion/lctl bash-completion/lfs
 
diff --git a/lustre/scripts/ksocklnd-config b/lustre/scripts/ksocklnd-config
new file mode 100755
index 0000000..c06e822
--- /dev/null
+++ b/lustre/scripts/ksocklnd-config
@@ -0,0 +1,176 @@
+#!/bin/bash
+#
+# Usage: ksocklnd-config <ifname>[,<ifname>...]
+#
+# For each listed interface that has an ipv4 address and no route yet in
+# the routing table named after it, add a route and a source rule to that
+# table. Requires bash: arrays, [[ ]] and arithmetic for-loops are used,
+# which a plain POSIX /bin/sh (e.g. dash) rejects.
+#
+# stderr of the helper commands is sent to /dev/null instead of being
+# closed with "2>&-": a closed fd 2 would be handed out again to the next
+# file or socket the command opens.
+
+me="${0##*/}"
+
+# convert number of mask bits to x.x.x.x mask format
+cidr2mask() {
+	local i mask=""
+	local full_octets=$(($1/8))
+	local partial_octet=$(($1%8))
+
+	for ((i=0;i<4;i+=1)); do
+		if [ $i -lt $full_octets ]; then
+			mask+=255
+		elif [ $i -eq $full_octets ]; then
+			mask+=$((256 - 2**(8-$partial_octet)))
+		else
+			mask+=0
+		fi
+		test $i -lt 3 && mask+=.
+	done
+
+	echo $mask
+}
+
+# apply netmask (second argument) to ip address (first argument)
+netcalc() {
+	local ipa=$(echo ${1} | awk -F. '{ print $1 }')
+	local ipb=$(echo ${1} | awk -F. '{ print $2 }')
+	local ipc=$(echo ${1} | awk -F. '{ print $3 }')
+	local ipd=$(echo ${1} | awk -F. '{ print $4 }')
+	local mka=$(echo ${2} | awk -F. '{ print $1 }')
+	local mkb=$(echo ${2} | awk -F. '{ print $2 }')
+	local mkc=$(echo ${2} | awk -F. '{ print $3 }')
+	local mkd=$(echo ${2} | awk -F. '{ print $4 }')
+	local nta="$(( $ipa & $mka ))"
+	local ntb="$(( $ipb & $mkb ))"
+	local ntc="$(( $ipc & $mkc ))"
+	local ntd="$(( $ipd & $mkd ))"
+	echo "$nta.$ntb.$ntc.$ntd"
+}
+
+# Check if the user wants to skip setting the routes
+checkskipcmd=$(cat /sys/module/ksocklnd/parameters/skip_mr_route_setup 2>/dev/null)
+if [ "$checkskipcmd" == "1" ]; then
+	exit 0
+fi
+
+# Extract comma-separated interfaces from the argument
+j=0
+declare -a interfaces
+for i in $(echo $1 | sed "s/,/ /g")
+do
+	# verify that the interface exists
+	#echo "$i"
+	addr=$(/sbin/ip -o -4 addr list $i 2>/dev/null | awk '{print $4}' | cut -d/ -f1)
+	linelen=$(echo -n $addr | wc -m)
+	if [[ $linelen -eq 0 ]]; then
+		# there's a problem with this interface, skip it
+		#echo 'bad!'
+		continue
+	fi
+	# check if route is already set up for this interface
+	intfroute=$(/sbin/ip route show table $i 2>/dev/null)
+	if [[ ! -z $intfroute ]]; then
+		# route exists so skip this interface
+		logcmd=(logger "${me}: skip setting up route for ${i}: don\'t overwrite existing route")
+		eval "${logcmd[@]}"
+		continue
+	fi
+	interfaces[$j]=$i
+	j=$((j+1))
+done
+
+# this array will contain the interfaces
+# already listed in rt_tables
+interfaces_listed=()
+
+# flush cache for every interface
+for i in "${interfaces[@]}"
+do
+	# build command
+	redirect="2>/dev/null"
+	flushcmd=(/sbin/ip route flush table ${i} ${redirect} )
+	# execute command
+	eval "${flushcmd[@]}"
+	logcmd=(logger "${me}: ${flushcmd[@]}")
+	eval "${logcmd[@]}"
+done
+
+filename='/etc/iproute2/rt_tables'
+n=1
+max_table_num=0
+while read line; do
+	# reading each line
+	# trim leading and trailing spaces
+	line=`echo $line | sed -e 's/^[[:space:]]*//'`
+	linelen=$(echo -n $line | wc -m)
+	# don't check empty lines
+	if [ $linelen -lt 1 ]; then
+		continue
+	fi
+	# don't check comments
+	if [[ ${line:0:1} == "#" ]]; then
+		continue
+	fi
+	# split using space as separator
+	splitline=( $line )
+	# check the table number and update the max
+	if [ $max_table_num -lt ${splitline[0]} ]; then
+		max_table_num=${splitline[0]}
+	fi
+	# check if any of the interfaces are listed
+	for i in "${interfaces[@]}"
+	do
+		if [[ " ${splitline[@]} " =~ " ${i} " ]]; then
+			if [[ " ${interfaces[@]} " =~ " ${i} " ]]; then
+				interfaces_listed+=($i)
+			fi
+		fi
+	done
+	#echo "Line No. $n : $line: $max_table_num"
+	n=$((n+1))
+done < $filename
+
+# add entries for unlisted interfaces
+for i in "${interfaces[@]}"
+do
+	if [[ ! " ${interfaces_listed[@]} " =~ " ${i} " ]]; then
+		max_table_num=$((max_table_num+1))
+		echo "$max_table_num $i" >> $filename
+	fi
+done
+
+# add the routing entries and rules
+for i in "${interfaces[@]}"
+do
+	# extract ipv4 address and netmask in cidr format
+	addr=($(/sbin/ip -o -4 addr list $i 2>/dev/null | awk '{print $4}' | cut -d/ -f1))
+	cidrmask=($(/sbin/ip -o -4 addr list $i 2>/dev/null | awk '{print $4}' | cut -d/ -f2))
+	# convert cidr mask to mask in dot format
+	dotmask=$(cidr2mask ${cidrmask[0]})
+	# apply mask to ip addr
+	net=$(netcalc ${addr[0]} $dotmask)
+	# build and execute route commands
+	routecmd=(/sbin/ip route add ${net}/${cidrmask[0]} dev ${i} proto kernel scope link src ${addr[0]} table ${i})
+	ruledelcmd=(/sbin/ip rule del from ${addr[0]} table ${i} '&>/dev/null')
+	ruleaddcmd=(/sbin/ip rule add from ${addr[0]} table ${i})
+	eval ${routecmd[@]}
+	eval ${ruledelcmd[@]}
+	eval ${ruleaddcmd[@]}
+	logcmd1=(logger "${me}: ${routecmd[@]}")
+	logcmd2=(logger "${me}: ${ruledelcmd[@]}")
+	logcmd3=(logger "${me}: ${ruleaddcmd[@]}")
+	eval "${logcmd1[@]}"
+	eval "${logcmd2[@]}"
+	eval "${logcmd3[@]}"
+done
+
+# flush arp tables
+for i in "${interfaces[@]}"
+do
+	flushcmd=(/sbin/ip neigh flush dev ${i})
+	eval ${flushcmd[@]}
+done
+
diff --git a/lustre/tests/sanity-lnet.sh b/lustre/tests/sanity-lnet.sh
index 03e4a30..f64fcec 100755
--- a/lustre/tests/sanity-lnet.sh
+++ b/lustre/tests/sanity-lnet.sh
@@ -2088,6 +2088,7 @@ test_230() {
 
 	lnid="$(lctl list_nids | head -n 1)"
 	do_lnetctl ping "$lnid" || error "failed to ping myself"
+
 	# "lctl --net tcp conn_list" prints the list of active
 	# connections. Since we're pinging ourselves, there should be
 	# 2 Control connections plus 2*conns_per_peer connections
@@ -2121,6 +2122,16 @@ test_230() {
 }
 run_test 230 "Test setting conns-per-peer"
 
+### Test that linux route is added for each ni
+test_250() {
+	have_interface "eth0" || skip "Need eth0 interface with ipv4 configured"
+	reinit_dlc || return $?
+	add_net "tcp" "eth0" || return $?
+	ip route show table eth0 | grep -q "eth0" ||
+		error "no linux route found in table eth0"
+}
+run_test 250 "test that linux routes are added"
+
 test_300() {
 	# LU-13274
 	local header
diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh
index 5b472e4..bd4e5db 100755
--- a/lustre/tests/test-framework.sh
+++ b/lustre/tests/test-framework.sh
@@ -299,5 +299,8 @@ init_test_env() {
 	[ ! -f "$LCTL" ] && export LCTL=$(which lctl)
 	export LFS=${LFS:-"$LUSTRE/utils/lfs"}
 	[ ! -f "$LFS" ] && export LFS=$(which lfs)
+	export KSOCKLND_CONFIG=${KSOCKLND_CONFIG:-"$LUSTRE/scripts/ksocklnd-config"}
+	[ ! -f "$KSOCKLND_CONFIG" ] &&
+		export KSOCKLND_CONFIG=$(which ksocklnd-config 2> /dev/null)
 
 	export PERM_CMD=${PERM_CMD:-"$LCTL conf_param"}