3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
11 # bug number for skipped test:
12 ALWAYS_EXCEPT="$SANITY_LNET_EXCEPT "
13 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
15 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
17 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
19 . $LUSTRE/tests/test-framework.sh
23 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
28 [[ -z $LNETCTL ]] && skip "Need lnetctl"
32 if is_mounted $MOUNT || is_mounted $MOUNT2; then
33 cleanupall || error "Failed cleanup prior to test execution"
38 echo "Cleaning up LNet"
39 lsmod | grep -q lnet &&
40 $LNETCTL lnet unconfigure 2>/dev/null
45 if module_loaded lnet ; then
46 cleanup_lnet || error "Failed to unload modules before test execution"
52 # Cleanup any tmp files created by the sub tests
53 rm -f $TMP/sanity-lnet-*.yaml $LNET_PARAMS_FILE
56 if $restore_mounts; then
57 setupall || error "Failed to setup Lustre after test execution"
58 elif $restore_modules; then
60 error "Couldn't load modules after test execution"
69 echo "ip netns exec $TESTNS $*"
70 ip netns exec $TESTNS "$@"
78 netns_arg="netns $netns"
80 ip link add 'test1pl' type veth peer name $FAKE_IF $netns_arg
81 ip link set 'test1pl' up
82 if [[ -n $netns ]]; then
83 do_ns ip addr add "${FAKE_IP}/31" dev $FAKE_IF
84 do_ns ip link set $FAKE_IF up
86 ip addr add "${FAKE_IP}/31" dev $FAKE_IF
87 ip link set $FAKE_IF up
92 ip link show test1pl >& /dev/null && ip link del test1pl || return 0
103 (ip netns list | grep -q $TESTNS) && ip netns del $TESTNS
108 echo "Loading LNet and configuring DLC"
109 load_lnet || return $?
110 do_lnetctl lnet configure
113 GLOBAL_YAML_FILE=$TMP/sanity-lnet-global.yaml
114 define_global_yaml() {
115 $LNETCTL export --backup >${GLOBAL_YAML_FILE} ||
116 error "Failed to export global yaml $?"
120 if lsmod | grep -q lnet; then
121 do_lnetctl lnet unconfigure ||
122 error "lnetctl lnet unconfigure failed $?"
123 do_lnetctl lnet configure ||
124 error "lnetctl lnet configure failed $?"
126 configure_dlc || error "configure_dlc failed $?"
131 append_global_yaml() {
132 [[ ! -e ${GLOBAL_YAML_FILE} ]] &&
133 error "Missing global yaml at ${GLOBAL_YAML_FILE}"
135 cat ${GLOBAL_YAML_FILE} >> $TMP/sanity-lnet-$testnum-expected.yaml
138 create_base_yaml_file() {
142 compare_yaml_files() {
143 local expected="$TMP/sanity-lnet-$testnum-expected.yaml"
144 local actual="$TMP/sanity-lnet-$testnum-actual.yaml"
146 ! [[ -e $expected ]] && echo "$expected not found" && return 1
147 ! [[ -e $actual ]] && echo "$actual not found" && return 1
148 diff -upN ${actual} ${expected} || rc=$?
158 local net="${nid//*@/}"
159 local addr="${nid//@*/}"
161 local num_re='^[0-9]+$' # anchored: the whole address must be a decimal number
162 local ip_re='^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$' # anchored: the whole address must be a dotted quad
164 if [[ $net =~ (gni|kfi)[0-9]* ]]; then
165 [[ $addr =~ ${num_re} ]] && return 0
167 [[ $addr =~ ${ip_re} ]] && return 0
172 local yfile=$TMP/sanity-lnet-$testnum-actual.yaml
173 local primary_nids=$(awk '/- primary nid:/{print $NF}' $yfile | xargs echo)
174 local secondary_nids=$(awk '/- nid:/{print $NF}' $yfile | xargs echo)
175 local gateway_nids=$(awk '/gateway:/{print $NF}' $yfile | xargs echo)
178 for nid in $primary_nids $secondary_nids; do
179 validate_nid "$nid" || error "Bad NID \"${nid}\""
184 validate_peer_nids() {
186 local nids_per_peer="$2"
188 local expect_p="$num_peers"
189 # The primary nid also shows up in the list of secondary nids
190 local expect_s="$(($num_peers + $(($nids_per_peer*$num_peers))))"
192 local actual_p=$(grep -c -- '- primary nid:' $TMP/sanity-lnet-$testnum-actual.yaml)
193 local actual_s=$(grep -c -- '- nid:' $TMP/sanity-lnet-$testnum-actual.yaml)
194 if [[ $expect_p -ne $actual_p ]]; then
196 error "Expected $expect_p but found $actual_p primary nids"
197 elif [[ $expect_s -ne $actual_s ]]; then
199 error "Expected $expect_s but found $actual_s secondary nids"
204 validate_gateway_nids() {
205 local expect_gw=$(grep -c -- 'gateway:' $TMP/sanity-lnet-$testnum-expected.yaml)
206 local actual_gw=$(grep -c -- 'gateway:' $TMP/sanity-lnet-$testnum-actual.yaml)
207 if [[ $expect_gw -ne $actual_gw ]]; then
209 error "Expected $expect_gw gateways but found $actual_gw gateways"
212 local expect_gwnids=$(awk '/gateway:/{print $NF}' $TMP/sanity-lnet-$testnum-expected.yaml |
215 for nid in ${expect_gwnids}; do
216 if ! grep -q "gateway: ${nid}" $TMP/sanity-lnet-$testnum-actual.yaml; then
217 error "${nid} not configured as gateway"
225 setup_netns || error "setup_netns failed with $?"
227 # Determine the local interface(s) used for LNet
228 load_lnet "config_on_load=1" || error "Failed to load modules"
233 INTERFACES=( $(lnet_if_list) )
235 cleanup_lnet || error "Failed to cleanup LNet"
237 stack_trap 'cleanup_testsuite' EXIT
240 configure_dlc || error "Failed to configure DLC rc = $?"
242 reinit_dlc || return $?
243 do_lnetctl import < ${GLOBAL_YAML_FILE} || error "Import failed $?"
244 $LNETCTL export --backup > $TMP/sanity-lnet-$testnum-actual.yaml
245 create_base_yaml_file
246 compare_yaml_files || error "Configuration changed after import"
248 run_test 0 "Export empty config, import the config, compare"
251 local prim_nid="${1:+--prim_nid $1}"
252 local nid="${2:+--nid $2}"
254 local actual="$TMP/sanity-lnet-$testnum-actual.yaml"
256 do_lnetctl peer add ${prim_nid} ${nid} || error "peer add failed $?"
257 $LNETCTL export --backup > $actual || error "export failed $?"
263 reinit_dlc || return $?
264 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
266 - primary nid: 1.1.1.1@tcp
272 compare_peer_add "1.1.1.1@tcp"
274 run_test 1 "Add peer with single nid (tcp)"
277 reinit_dlc || return $?
278 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
280 - primary nid: 2.2.2.2@o2ib
286 compare_peer_add "2.2.2.2@o2ib"
288 run_test 2 "Add peer with single nid (o2ib)"
291 reinit_dlc || return $?
292 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
294 - primary nid: 3.3.3.3@tcp
301 compare_peer_add "3.3.3.3@tcp" "3.3.3.3@o2ib"
303 run_test 3 "Add peer with tcp primary o2ib secondary"
306 reinit_dlc || return $?
307 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
309 - primary nid: 4.4.4.4@tcp
318 echo "Add peer with nidrange (tcp)"
319 compare_peer_add "4.4.4.4@tcp" "4.4.4.[1-3]@tcp"
321 echo "Add peer with nidrange that overlaps primary nid (tcp)"
322 compare_peer_add "4.4.4.4@tcp" "4.4.4.[1-4]@tcp"
324 run_test 4 "Add peer with nidrange (tcp)"
327 reinit_dlc || return $?
328 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
330 - primary nid: 5.5.5.5@o2ib
340 echo "Add peer with nidrange (o2ib)"
341 compare_peer_add "5.5.5.5@o2ib" "5.5.5.[1-4]@o2ib"
343 echo "Add peer with nidrange that overlaps primary nid (o2ib)"
344 compare_peer_add "5.5.5.5@o2ib" "5.5.5.[1-5]@o2ib" # [1-5] includes the primary NID 5.5.5.5
346 run_test 5 "Add peer with nidrange (o2ib)"
349 reinit_dlc || return $?
350 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
352 - primary nid: 6.6.6.6@tcp
375 local nid_expr="6.6.[6-7].[0-4/2]@tcp"
376 nid_expr+=",6.6.[1-4/2].[0-6/3]@o2ib"
377 nid_expr+=",[6-12/4]@gni"
378 nid_expr+=",[6-12/4]@kfi"
380 compare_peer_add "6.6.6.6@tcp" "${nid_expr}"
382 run_test 6 "Add peer with multiple nidranges"
385 local prim_nid="${1:+--prim_nid $1}"
386 local nid="${2:+--nid $2}"
388 local actual="$TMP/sanity-lnet-$testnum-actual.yaml"
390 do_lnetctl peer del ${prim_nid} ${nid} || error "peer del failed $?"
391 $LNETCTL export --backup > $actual || error "export failed $?"
397 reinit_dlc || return $?
398 create_base_yaml_file
400 echo "Delete peer with single nid (tcp)"
401 do_lnetctl peer add --prim_nid 7.7.7.7@tcp || error "Peer add failed $?"
402 compare_peer_del "7.7.7.7@tcp"
404 echo "Delete peer with single nid (o2ib)"
405 do_lnetctl peer add --prim_nid 7.7.7.7@o2ib || error "Peer add failed $?"
406 compare_peer_del "7.7.7.7@o2ib"
408 echo "Delete peer that has multiple nids (tcp)"
409 do_lnetctl peer add --prim_nid 7.7.7.7@tcp --nid 7.7.7.[8-12]@tcp ||
410 error "Peer add failed $?"
411 compare_peer_del "7.7.7.7@tcp"
413 echo "Delete peer that has multiple nids (o2ib)"
414 do_lnetctl peer add --prim_nid 7.7.7.7@o2ib --nid 7.7.7.[8-12]@o2ib ||
415 error "Peer add failed $?"
416 compare_peer_del "7.7.7.7@o2ib"
418 echo "Delete peer that has both tcp and o2ib nids"
419 do_lnetctl peer add --prim_nid 7.7.7.7@tcp \
420 --nid 7.7.7.[9-12]@tcp,7.7.7.[13-15]@o2ib ||
421 error "Peer add failed $?"
422 compare_peer_del "7.7.7.7@tcp"
424 echo "Delete peer with single nid (gni)"
425 do_lnetctl peer add --prim_nid 7@gni || error "Peer add failed $?"
426 compare_peer_del "7@gni"
428 echo "Delete peer that has multiple nids (gni)"
429 do_lnetctl peer add --prim_nid 7@gni --nid [8-12]@gni ||
430 error "Peer add failed $?"
431 compare_peer_del "7@gni"
433 echo "Delete peer with single nid (kfi)"
434 do_lnetctl peer add --prim_nid 7@kfi || error "Peer add failed $?"
435 compare_peer_del "7@kfi"
437 echo "Delete peer that has multiple nids (kfi)"
438 do_lnetctl peer add --prim_nid 7@kfi --nid [8-12]@kfi ||
439 error "Peer add failed $?"
440 compare_peer_del "7@kfi"
442 echo "Delete peer that has tcp, o2ib, gni and kfi nids"
443 do_lnetctl peer add --prim_nid 7@gni \
444 --nid [8-12]@gni,7.7.7.[1-4]@tcp,7.7.7.[5-9]@o2ib,[1-5]@kfi ||
445 error "Peer add failed $?"
446 compare_peer_del "7@gni"
448 run_test 7 "Various peer delete tests"
451 reinit_dlc || return $?
453 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
455 - primary nid: 8.8.8.8@tcp
467 do_lnetctl peer add --prim_nid 8.8.8.8@tcp --nid 8.8.8.[10-15]@tcp ||
468 error "Peer add failed $?"
469 compare_peer_del "8.8.8.8@tcp" "8.8.8.13@tcp"
471 run_test 8 "Delete single secondary nid from peer (tcp)"
474 reinit_dlc || return $?
476 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
478 - primary nid: 9.9.9.9@tcp
485 do_lnetctl peer add --prim_nid 9.9.9.9@tcp \
486 --nid 9.9.9.[11-16]@tcp || error "Peer add failed $?"
487 compare_peer_del "9.9.9.9@tcp" "9.9.9.[11-16]@tcp"
489 run_test 9 "Delete all secondary nids from peer (tcp)"
492 reinit_dlc || return $?
494 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
496 - primary nid: 10.10.10.10@tcp
499 - nid: 10.10.10.10@tcp
500 - nid: 10.10.10.12@tcp
501 - nid: 10.10.10.13@tcp
502 - nid: 10.10.10.15@tcp
503 - nid: 10.10.10.16@tcp
506 do_lnetctl peer add --prim_nid 10.10.10.10@tcp \
507 --nid 10.10.10.[12-16]@tcp || error "Peer add failed $?"
508 compare_peer_del "10.10.10.10@tcp" "10.10.10.14@tcp"
510 run_test 10 "Delete single secondary nid from peer (o2ib)"
513 reinit_dlc || return $?
515 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
517 - primary nid: 11.11.11.11@tcp
520 - nid: 11.11.11.11@tcp
523 do_lnetctl peer add --prim_nid 11.11.11.11@tcp \
524 --nid 11.11.11.[13-17]@tcp || error "Peer add failed $?"
525 compare_peer_del "11.11.11.11@tcp" "11.11.11.[13-17]@tcp"
527 run_test 11 "Delete all secondary nids from peer (o2ib)"
530 reinit_dlc || return $?
532 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
534 - primary nid: 12.12.12.12@o2ib
537 - nid: 12.12.12.12@o2ib
538 - nid: 13.13.13.13@o2ib
539 - nid: 14.13.13.13@o2ib
540 - nid: 14.15.13.13@o2ib
542 - nid: 15.17.1.10@tcp
543 - nid: 15.17.1.20@tcp
546 do_lnetctl peer add --prim_nid 12.12.12.12@o2ib \
547 --nid [13-14/1].[13-15/2].13.13@o2ib,[15-16/3].[17-19/4].[1].[5-20/5]@tcp ||
548 error "Peer add failed $?"
549 compare_peer_del "12.12.12.12@o2ib" "13.15.13.13@o2ib,15.17.1.15@tcp"
551 run_test 12 "Delete a secondary nid from peer (tcp and o2ib)"
554 reinit_dlc || return $?
556 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
558 - primary nid: 13.13.13.13@o2ib
561 - nid: 13.13.13.13@o2ib
564 do_lnetctl peer add --prim_nid 13.13.13.13@o2ib \
565 --nid [14-15].[1-2/1].[1].[100-254/10]@tcp,14.14.[254].14@o2ib ||
566 error "Peer add failed $?"
567 compare_peer_del "13.13.13.13@o2ib" \
568 "[14-15].[1-2/1].[1].[100-254/10]@tcp,14.14.[254].14@o2ib"
570 run_test 13 "Delete all secondary nids from peer (tcp and o2ib)"
576 if [[ $net =~ gni* ]] || [[ $net =~ kfi* ]]; then
579 echo "${num}.${num}.${num}.${num}@${net}"
583 create_mr_peer_yaml() {
585 local secondary_nids="$2"
588 echo "Generating peer yaml for $num_peers peers with $secondary_nids secondary nids"
589 echo "peer:" >> $TMP/sanity-lnet-$testnum-expected.yaml
591 local total_nids=$((num_peers + $((num_peers * secondary_nids))))
594 while [[ $created -lt $num_peers ]]; do
595 local primary=$(create_nid ${nidnum} ${net})
596 cat <<EOF >> $TMP/sanity-lnet-$testnum-expected.yaml
597 - primary nid: $primary
603 local start=$((nidnum + 1))
604 local end=$((nidnum + $secondary_nids))
605 for j in $(seq ${start} ${end}); do
606 local nid=$(create_nid $j ${net})
607 echo " - nid: $nid" >> $TMP/sanity-lnet-$testnum-expected.yaml
615 reinit_dlc || return $?
617 echo "Create single peer, single nid, using import"
618 create_mr_peer_yaml 1 0 tcp
619 do_lnetctl import < $TMP/sanity-lnet-$testnum-expected.yaml ||
620 error "Import failed $?"
622 $LNETCTL export --backup > $TMP/sanity-lnet-$testnum-actual.yaml
625 echo "Delete single peer using import --del"
626 do_lnetctl import --del < $TMP/sanity-lnet-$testnum-expected.yaml ||
627 error "Import failed $?"
628 rm -f $TMP/sanity-lnet-$testnum-expected.yaml
629 create_base_yaml_file
630 $LNETCTL export --backup > $TMP/sanity-lnet-$testnum-actual.yaml
633 run_test 14 "import peer create/delete with single nid"
636 reinit_dlc || return $?
638 echo "Create multiple peers, single nid per peer, using import"
639 create_mr_peer_yaml 5 0 o2ib
640 # The ordering of nids for this use-case is non-deterministic, so we
641 # can't just diff the expected/actual output.
642 do_lnetctl import < $TMP/sanity-lnet-$testnum-expected.yaml ||
643 error "Import failed $?"
644 $LNETCTL export --backup > $TMP/sanity-lnet-$testnum-actual.yaml
645 validate_peer_nids 5 0
647 echo "Delete multiple peers, single nid per peer, using import --del"
648 do_lnetctl import --del < $TMP/sanity-lnet-$testnum-expected.yaml ||
649 error "Import failed $?"
650 rm -f $TMP/sanity-lnet-$testnum-expected.yaml
651 create_base_yaml_file
652 $LNETCTL export --backup > $TMP/sanity-lnet-$testnum-actual.yaml
655 run_test 15 "import multi peer create/delete with single nid per peer"
658 reinit_dlc || return $?
660 echo "Create single peer, multiple nids, using import"
661 create_mr_peer_yaml 1 5 tcp
662 do_lnetctl import < $TMP/sanity-lnet-$testnum-expected.yaml ||
663 error "Import failed $?"
664 $LNETCTL export --backup > $TMP/sanity-lnet-$testnum-actual.yaml
665 validate_peer_nids 1 5
667 echo "Delete single peer, multiple nids, using import --del"
668 do_lnetctl import --del < $TMP/sanity-lnet-$testnum-expected.yaml ||
669 error "Import failed $?"
670 rm -f $TMP/sanity-lnet-$testnum-expected.yaml
671 create_base_yaml_file
672 $LNETCTL export --backup > $TMP/sanity-lnet-$testnum-actual.yaml
675 run_test 16 "import peer create/delete with multiple nids"
678 reinit_dlc || return $?
680 echo "Create multiple peers, multiple nids per peer, using import"
681 create_mr_peer_yaml 5 7 o2ib
682 do_lnetctl import < $TMP/sanity-lnet-$testnum-expected.yaml ||
683 error "Import failed $?"
684 $LNETCTL export --backup > $TMP/sanity-lnet-$testnum-actual.yaml
685 validate_peer_nids 5 7
687 echo "Delete multiple peers, multiple nids per peer, using import --del"
688 do_lnetctl import --del < $TMP/sanity-lnet-$testnum-expected.yaml ||
689 error "Import failed $?"
690 rm -f $TMP/sanity-lnet-$testnum-expected.yaml
691 create_base_yaml_file
692 $LNETCTL export --backup > $TMP/sanity-lnet-$testnum-actual.yaml
695 run_test 17 "import multi peer create/delete with multiple nids"
698 reinit_dlc || return $?
700 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
702 - primary nid: 1.1.1.1@tcp
711 echo "Import peer with 5 nids"
712 cat $TMP/sanity-lnet-$testnum-expected.yaml
713 do_lnetctl import < $TMP/sanity-lnet-$testnum-expected.yaml ||
714 error "Import failed $?"
715 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
717 - primary nid: 1.1.1.1@tcp
724 echo "Delete three of the nids"
725 cat $TMP/sanity-lnet-$testnum-expected.yaml
726 do_lnetctl import --del < $TMP/sanity-lnet-$testnum-expected.yaml
727 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
729 - primary nid: 1.1.1.1@tcp
735 echo "Check peer has expected nids remaining"
736 $LNETCTL export --backup > $TMP/sanity-lnet-$testnum-actual.yaml
740 run_test 18a "Delete a subset of nids from a single peer using import --del"
743 reinit_dlc || return $?
745 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
747 - primary nid: 1.1.1.1@tcp
755 - primary nid: 6.6.6.6@o2ib
764 echo "Import two peers with 5 nids each"
765 cat $TMP/sanity-lnet-$testnum-expected.yaml
766 do_lnetctl import < $TMP/sanity-lnet-$testnum-expected.yaml ||
767 error "Import failed $?"
768 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
770 - primary nid: 1.1.1.1@tcp
776 - primary nid: 6.6.6.6@o2ib
783 echo "Delete three of the nids from each peer"
784 cat $TMP/sanity-lnet-$testnum-expected.yaml
785 do_lnetctl import --del < $TMP/sanity-lnet-$testnum-expected.yaml
786 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
788 - primary nid: 6.6.6.6@o2ib
793 - primary nid: 1.1.1.1@tcp
800 echo "Check peers have expected nids remaining"
801 $LNETCTL export --backup > $TMP/sanity-lnet-$testnum-actual.yaml
803 validate_peer_nids 2 1
805 run_test 18b "Delete multiple nids from multiple peers using import --del"
808 reinit_dlc || return $?
809 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
811 - primary nid: 19@gni
817 compare_peer_add "19@gni"
819 run_test 19 "Add peer with single nid (gni)"
822 reinit_dlc || return $?
823 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
825 - primary nid: 20@gni
829 - nid: 20.20.20.20@tcp
830 - nid: 20.20.20.20@o2ib
833 compare_peer_add "20@gni" "20.20.20.20@tcp,20.20.20.20@o2ib"
835 run_test 20 "Add peer with gni primary and tcp, o2ib secondary"
838 reinit_dlc || return $?
839 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
841 - primary nid: 21@gni
851 echo "Add peer with nidrange (gni)"
852 compare_peer_add "21@gni" "[22-25]@gni" || error "compare_peer_add failed $?" # report why the first add/compare failed
853 echo "Add peer with nidrange that overlaps primary nid (gni)"
854 compare_peer_add "21@gni" "[21-25]@gni"
856 run_test 21 "Add peer with nidrange (gni)"
859 reinit_dlc || return $?
860 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
862 - primary nid: 22@gni
873 do_lnetctl peer add --prim_nid 22@gni --nid [24-29]@gni ||
874 error "Peer add failed $?"
875 compare_peer_del "22@gni" "26@gni"
877 run_test 22 "Delete single secondary nid from peer (gni)"
880 reinit_dlc || return $?
881 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
883 - primary nid: 23@gni
890 do_lnetctl peer add --prim_nid 23@gni --nid [25-29]@gni ||
891 error "Peer add failed $?"
892 compare_peer_del "23@gni" "[25-29]@gni"
894 run_test 23 "Delete all secondary nids from peer (gni)"
897 reinit_dlc || return $?
898 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
900 - primary nid: 24@gni
905 - nid: 13.13.13.13@o2ib
906 - nid: 14.13.13.13@o2ib
907 - nid: 14.15.13.13@o2ib
909 - nid: 15.17.1.10@tcp
910 - nid: 15.17.1.20@tcp
913 do_lnetctl peer add --prim_nid 24@gni \
914 --nid [13-14/1].[13-15/2].13.13@o2ib,[15-16/3].[17-19/4].[1].[5-20/5]@tcp,[5-12/6]@gni ||
915 error "Peer add failed $?"
916 compare_peer_del "24@gni" "5@gni,13.15.13.13@o2ib,15.17.1.15@tcp"
918 run_test 24 "Delete a secondary nid from peer (tcp, o2ib and gni)"
921 reinit_dlc || return $?
922 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
924 - primary nid: 25@gni
930 do_lnetctl peer add --prim_nid 25@gni \
931 --nid [26-27].[4-10/3].26.26@tcp,26.26.26.26@o2ib,[30-35]@gni ||
932 error "Peer add failed $?"
933 compare_peer_del "25@gni" \
934 "[26-27].[4-10/3].26.26@tcp,26.26.26.26@o2ib,[30-35]@gni"
936 run_test 25 "Delete all secondary nids from peer (tcp, gni and o2ib)"
939 reinit_dlc || return $?
941 do_lnetctl peer add --prim_nid 1.1.1.1@tcp --lock_prim ||
942 error "Peer add with --lock_prim option failed $?"
943 local peer_state=$($LNETCTL peer show -v 4 --nid 1.1.1.1@tcp |
944 awk '/peer state/ {print $NF}')
945 # This relies on the following peer state definition:
946 # #define LNET_PEER_LOCK_PRIMARY BIT(20)
947 if ((!("$peer_state" & (1 << 20)))); then
948 error "Peer state does not have 'locked' bit set: $peer_state"
950 do_lnetctl peer del --prim_nid 1.1.1.1@tcp ||
951 error "Peer del failed $?"
952 $LNETCTL peer show --nid 1.1.1.1@tcp | grep -q 1.1.1.1@tcp ||
953 error "1.1.1.1@tcp is not listed"
954 do_lnetctl peer del --prim_nid 1.1.1.1@tcp --force ||
955 error "Peer del --force failed $?"
956 do_lnetctl peer show --nid 1.1.1.1@tcp &&
957 error "failed to delete 1.1.1.1@tcp"
961 run_test 26 "Delete peer with primary nid locked"
964 reinit_dlc || return $?
966 echo "Invalid prim_nid - peer add"
967 do_lnetctl peer add --prim_nid foobar &&
968 error "Command should have failed"
970 echo "Invalid prim_nid - peer del"
971 do_lnetctl peer del --prim_nid foobar &&
972 error "Command should have failed"
974 echo "Delete non-existing peer"
975 do_lnetctl peer del --prim_nid 1.1.1.1@o2ib &&
976 error "Command should have failed"
978 echo "Don't provide mandatory argument for peer del"
979 do_lnetctl peer del --nid 1.1.1.1@tcp &&
980 error "Command should have failed"
982 echo "Don't provide mandatory argument for peer add"
983 do_lnetctl peer add --nid 1.1.1.1@tcp &&
984 error "Command should have failed"
986 echo "Don't provide mandatory arguments peer add"
987 do_lnetctl peer add &&
988 error "Command should have failed"
990 echo "Invalid secondary nids"
991 do_lnetctl peer add --prim_nid 1.1.1.1@tcp --nid foobar &&
992 error "Command should have failed"
994 echo "Exceed max nids per peer"
995 do_lnetctl peer add --prim_nid 1.1.1.1@tcp --nid 1.1.1.[2-255]@tcp &&
996 error "Command should have failed"
998 echo "Invalid net type"
999 do_lnetctl peer add --prim_nid 1@foo &&
1000 error "Command should have failed"
1002 echo "Invalid nid format"
1003 local invalid_nids="1@tcp 1@o2ib 1.1.1.1@gni"
1006 for nid in ${invalid_nids}; do
1007 echo "Check invalid primary nid - '$nid'"
1008 do_lnetctl peer add --prim_nid $nid &&
1009 error "Command should have failed"
1012 local invalid_strs="[2-1]@gni [a-f/x]@gni 256.256.256.256@tcp" # space-separated list of malformed nid strings
1013 invalid_strs+=" 1.1.1.1.[2-5/f]@tcp 1.]2[.3.4@o2ib"
1014 invalid_strs+=" 1.[2-4,[5-6],7-8].1.1@tcp foobar" # leading space keeps this entry separate from the previous one
1017 for nidstr in ${invalid_strs}; do
1018 echo "Check invalid nidstring - '$nidstr'"
1019 do_lnetctl peer add --prim_nid 1.1.1.1@tcp --nid $nidstr &&
1020 error "Command should have failed"
1023 echo "Add non-local gateway"
1024 do_lnetctl route add --net tcp --gateway 1@gni &&
1025 error "Command should have failed"
1029 run_test 99a "Check various invalid inputs to lnetctl peer"
1032 reinit_dlc || return $?
1034 create_base_yaml_file
1036 cat <<EOF > $TMP/sanity-lnet-$testnum-invalid.yaml
1038 - primary nid: 99.99.99.99@tcp
1041 - nid: 99.99.99.99@tcp
1043 do_lnetctl import < $TMP/sanity-lnet-$testnum-invalid.yaml &&
1044 error "import should have failed"
1045 $LNETCTL export --backup > $TMP/sanity-lnet-$testnum-actual.yaml
1048 run_test 99b "Invalid value for Multi-Rail in yaml import"
1052 local ip=$(ip addr show dev $if | awk '/ inet /{print $2}')
1060 do_lnetctl net add --net ${net} --if ${if} ||
1061 error "Failed to add net ${net} on if ${if}"
1064 compare_route_add() {
1068 local actual="$TMP/sanity-lnet-$testnum-actual.yaml"
1070 do_lnetctl route add --net ${rnet} --gateway ${gw} ||
1071 error "route add failed $?"
1072 $LNETCTL export --backup > $actual ||
1073 error "export failed $?"
1074 validate_gateway_nids
1078 append_net_tunables() {
1081 $LNETCTL net show -v --net ${net} | grep -v 'dev cpt' |
1082 awk '/^\s+tunables:$/,/^\s+CPT:/' >> $TMP/sanity-lnet-$testnum-expected.yaml
1085 IF0_IP=$(ip -o -4 a s ${INTERFACES[0]} |
1086 awk '{print $4}' | sed 's/\/.*//')
1087 IF0_NET=$(awk -F. '{print $1"."$2"."$3}'<<<"${IF0_IP}")
1088 IF0_HOSTNUM=$(awk -F. '{print $4}'<<<"${IF0_IP}")
1089 if (((IF0_HOSTNUM + 5) > 254)); then
1092 GW_HOSTNUM=$((IF0_HOSTNUM + 1))
1094 GW_NID="${IF0_NET}.${GW_HOSTNUM}@${NETTYPE}"
1096 [[ ${NETTYPE} == tcp* ]] ||
1097 skip "Need tcp NETTYPE"
1098 reinit_dlc || return $?
1099 add_net "${NETTYPE}" "${INTERFACES[0]}"
1100 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
1102 - net type: ${NETTYPE}
1107 append_net_tunables tcp
1108 cat <<EOF >> $TMP/sanity-lnet-$testnum-expected.yaml
1114 health_sensitivity: 1
1116 - primary nid: ${GW_NID}
1122 compare_route_add "tcp7" "${GW_NID}"
1124 run_test 100 "Add route with single gw (tcp)"
1127 [[ ${NETTYPE} == tcp* ]] ||
1128 skip "Need tcp NETTYPE"
1129 reinit_dlc || return $?
1130 add_net "${NETTYPE}" "${INTERFACES[0]}"
1131 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
1133 - net type: ${NETTYPE}
1138 append_net_tunables tcp
1140 echo "route:" >> $TMP/sanity-lnet-$testnum-expected.yaml
1141 for i in $(seq $GW_HOSTNUM $((GW_HOSTNUM + 4))); do
1142 cat <<EOF >> $TMP/sanity-lnet-$testnum-expected.yaml
1144 gateway: ${IF0_NET}.${i}@tcp
1147 health_sensitivity: 1
1151 echo "peer:" >> $TMP/sanity-lnet-$testnum-expected.yaml
1152 for i in $(seq $GW_HOSTNUM $((GW_HOSTNUM + 4))); do
1153 cat <<EOF >> $TMP/sanity-lnet-$testnum-expected.yaml
1154 - primary nid: ${IF0_NET}.${i}@tcp
1157 - nid: ${IF0_NET}.${i}@tcp
1162 local gw="${IF0_NET}.[$GW_HOSTNUM-$((GW_HOSTNUM + 4))]@tcp"
1164 compare_route_add "tcp8" "${gw}"
1166 run_test 101 "Add route with multiple gw (tcp)"
1168 compare_route_del() {
1172 local actual="$TMP/sanity-lnet-$testnum-actual.yaml"
1174 do_lnetctl route del --net ${rnet} --gateway ${gw} ||
1175 error "route del failed $?"
1176 $LNETCTL export --backup > $actual ||
1177 error "export failed $?"
1178 validate_gateway_nids
1184 if [[ ${net} =~ (tcp|o2ib)[0-9]* ]]; then
1187 echo "$((${testnum} % 255))@${net}"
1192 reinit_dlc || return $?
1193 add_net "${NETTYPE}" "${INTERFACES[0]}"
1194 $LNETCTL export --backup > $TMP/sanity-lnet-$testnum-expected.yaml
1196 local gwnid=$(generate_gw_nid ${NETTYPE})
1198 do_lnetctl route add --net ${NETTYPE}2 --gateway ${gwnid} ||
1199 error "route add failed $?"
1200 compare_route_del "${NETTYPE}2" "${gwnid}"
1202 run_test 102 "Delete route with single gw"
1204 IP_NID_EXPR='103.103.103.[103-120/4]'
1205 NUM_NID_EXPR='[103-120/4]'
1207 reinit_dlc || return $?
1208 add_net "${NETTYPE}" "${INTERFACES[0]}"
1209 $LNETCTL export --backup > $TMP/sanity-lnet-$testnum-expected.yaml
1213 if [[ $NETTYPE =~ (tcp|o2ib)[0-9]* ]]; then
1214 nid_expr="${IF0_NET}.[$GW_HOSTNUM-$((GW_HOSTNUM+5))/2]"
1216 nid_expr="${NUM_NID_EXPR}"
1219 do_lnetctl route add --net ${NETTYPE}103 \
1220 --gateway ${nid_expr}@${NETTYPE} ||
1221 error "route add failed $?"
1222 compare_route_del "${NETTYPE}103" "${nid_expr}@${NETTYPE}"
1224 run_test 103 "Delete route with multiple gw"
1227 local tyaml="$TMP/sanity-lnet-$testnum-expected.yaml"
1229 reinit_dlc || return $?
1231 # Default value is '3'
1232 local val=$($LNETCTL global show | awk '/response_tracking/{print $NF}')
1234 error "Expect 3 found $val"
1236 echo "Set < 0; Should fail"
1237 do_lnetctl set response_tracking -1 &&
1238 error "should have failed $?"
1240 reinit_dlc || return $?
1243 response_tracking: -10
1245 do_lnetctl import < $tyaml &&
1246 error "should have failed $?"
1248 echo "Check valid values; Should succeed"
1250 for ((i = 0; i < 4; i++)); do
1251 reinit_dlc || return $?
1252 do_lnetctl set response_tracking $i ||
1253 error "should have succeeded $?"
1254 $LNETCTL global show | grep -q "response_tracking: $i" ||
1255 error "Failed to set response_tracking to $i"
1256 reinit_dlc || return $?
1259 response_tracking: $i
1261 do_lnetctl import < $tyaml ||
1262 error "should have succeeded $?"
1263 $LNETCTL global show | grep -q "response_tracking: $i" ||
1264 error "Failed to set response_tracking to $i"
1267 reinit_dlc || return $?
1268 echo "Set > 3; Should fail"
1269 do_lnetctl set response_tracking 4 &&
1270 error "should have failed $?"
1272 reinit_dlc || return $?
1275 response_tracking: 10
1277 do_lnetctl import < $tyaml &&
1278 error "should have failed $?"
1281 run_test 104 "Set/check response_tracking param"
1284 reinit_dlc || return $?
1285 add_net "${NETTYPE}" "${INTERFACES[0]}"
1287 local gwnid=$(generate_gw_nid ${NETTYPE})
1289 do_lnetctl route add --net ${NETTYPE}105 --gateway ${gwnid} ||
1290 error "route add failed $?"
1291 do_lnetctl peer add --prim ${gwnid} &&
1292 error "peer add should fail"
1296 run_test 105 "Adding duplicate GW peer should fail"
1299 reinit_dlc || return $?
1300 add_net "${NETTYPE}" "${INTERFACES[0]}"
1302 local gwnid=$(generate_gw_nid ${NETTYPE})
1304 do_lnetctl route add --net ${NETTYPE}106 --gateway ${gwnid} ||
1305 error "route add failed $?"
1306 do_lnetctl peer del --prim ${gwnid} &&
1307 error "peer del should fail"
1311 run_test 106 "Deleting GW peer should fail"
1314 [[ ${NETTYPE} == tcp* ]] ||
1315 skip "Need tcp NETTYPE"
1316 cleanup_lnet || exit 1
1317 load_lnet "networks=\"\""
1318 do_ns $LNETCTL lnet configure --all || exit 1
1319 $LNETCTL net show --net tcp | grep -q "nid: ${FAKE_IP}@tcp$"
1321 run_test 200 "load lnet w/o module option, configure in a non-default namespace"
1324 [[ ${NETTYPE} == tcp* ]] ||
1325 skip "Need tcp NETTYPE"
1326 cleanup_lnet || exit 1
1327 load_lnet "networks=tcp($FAKE_IF)"
1328 do_ns $LNETCTL lnet configure --all || exit 1
1329 $LNETCTL net show --net tcp | grep -q "nid: ${FAKE_IP}@tcp$"
1331 run_test 201 "load lnet using networks module options in a non-default namespace"
1334 [[ ${NETTYPE} == tcp* ]] ||
1335 skip "Need tcp NETTYPE"
1336 cleanup_lnet || exit 1
1337 load_lnet "networks=\"\" ip2nets=\"tcp0($FAKE_IF) ${FAKE_IP}\""
1338 do_ns $LNETCTL lnet configure --all || exit 1
1339 $LNETCTL net show | grep -q "nid: ${FAKE_IP}@tcp$"
1341 run_test 202 "load lnet using ip2nets in a non-default namespace"
1344 ### Add the interfaces in the target namespace
1347 [[ ${NETTYPE} == tcp* ]] ||
1348 skip "Need tcp NETTYPE"
1349 cleanup_lnet || exit 1
1351 do_lnetctl lnet configure || exit 1
1352 do_ns $LNETCTL net add --net tcp0 --if $FAKE_IF
1354 run_test 203 "add a network using an interface in the non-default namespace"
1356 LNET_PARAMS_FILE="$TMP/$TESTSUITE.parameters"
1357 function save_lnet_params() {
1358 $LNETCTL global show | grep -v '^global:$' | # drop the "global:" header line; plain grep suffices (egrep is obsolescent)
1359 sed 's/://' > $LNET_PARAMS_FILE
1362 function restore_lnet_params() {
1364 while read -r param value; do # -r: take each saved "param value" pair literally, without backslash processing
1365 [[ $param == max_intf ]] && continue
1366 [[ $param == lnd_timeout ]] && continue
1367 $LNETCTL set ${param} ${value} ||
1368 error "Failed to restore ${param} to ${value}"
1369 done < $LNET_PARAMS_FILE
1372 function lnet_health_pre() {
1375 # Lower transaction timeout to speed up test execution
1376 $LNETCTL set transaction_timeout 10 ||
1377 error "Failed to set transaction_timeout $?"
1379 RETRY_PARAM=$($LNETCTL global show | awk '/retry_count/{print $NF}')
1380 RSND_PRE=$($LNETCTL stats show | awk '/resend_count/{print $NF}')
1381 LO_HVAL_PRE=$($LNETCTL net show -v 2 | awk '/health value/{print $NF}' |
1382 xargs echo | sed 's/ /+/g' | bc -l)
1384 RMT_HVAL_PRE=$($LNETCTL peer show --nid ${RNIDS[0]} -v 2 2>/dev/null |
1385 awk '/health value/{print $NF}' | xargs echo |
1386 sed 's/ /+/g' | bc -l)
1388 # Might not have any peers so initialize to zero.
1389 RMT_HVAL_PRE=${RMT_HVAL_PRE:-0}
# Counterpart of lnet_health_pre: sample the same counters again after
# the simulated failure and store them in RSND_POST, LO_HVAL_POST and
# RMT_HVAL_POST (the latter defaults to 0 when no peer data is printed).
# It then prints the pre/post values and resets the health of all peers
# and all local NIs to 1000 so that the next test starts clean.
# NOTE(review): the line that starts the "echo ... &&" chain is not
# visible here, so the condition under which it prints is unconfirmed.
1394 function lnet_health_post() {
1395 RSND_POST=$($LNETCTL stats show | awk '/resend_count/{print $NF}')
1396 LO_HVAL_POST=$($LNETCTL net show -v 2 |
1397 awk '/health value/{print $NF}' |
1398 xargs echo | sed 's/ /+/g' | bc -l)
1400 RMT_HVAL_POST=$($LNETCTL peer show --nid ${RNIDS[0]} -v 2 2>/dev/null |
1401 awk '/health value/{print $NF}' | xargs echo |
1402 sed 's/ /+/g' | bc -l)
1404 # Might not have any peers so initialize to zero.
1405 RMT_HVAL_POST=${RMT_HVAL_POST:-0}
1408 echo "Pre resends: $RSND_PRE" &&
1409 echo "Post resends: $RSND_POST" &&
1410 echo "Resends delta: $((RSND_POST - RSND_PRE))" &&
1411 echo "Pre local health: $LO_HVAL_PRE" &&
1412 echo "Post local health: $LO_HVAL_POST" &&
1413 echo "Pre remote health: $RMT_HVAL_PRE" &&
1414 echo "Post remote health: $RMT_HVAL_POST"
# Put every peer NI and local NI back to health 1000.
1418 do_lnetctl peer set --health 1000 --all
1419 do_lnetctl net set --health 1000 --all
# Assert that the resend counter did not move: calls error() when
# RSND_POST (set by lnet_health_post) differs from RSND_PRE (set by
# lnet_health_pre).
1424 function check_no_resends() {
1425 echo "Check that no resends took place"
1426 [[ $RSND_POST -ne $RSND_PRE ]] &&
1427 error "Found resends: $RSND_POST != $RSND_PRE"
# Assert that exactly RETRY_PARAM resends happened: the difference
# RSND_POST - RSND_PRE must equal the retry_count captured by
# lnet_health_pre, otherwise error() is called.
1432 function check_resends() {
1433 local delta=$((RSND_POST - RSND_PRE))
1435 echo "Check that $RETRY_PARAM resends took place"
1436 [[ $delta -ne $RETRY_PARAM ]] &&
1437 error "Expected $RETRY_PARAM resends found $delta"
# Assert that the summed local NI health is the same before and after:
# calls error() when LO_HVAL_POST differs from LO_HVAL_PRE.
1442 function check_no_local_health() {
1443 echo "Check that local NI health is unchanged"
1444 [[ $LO_HVAL_POST -ne $LO_HVAL_PRE ]] &&
1445 error "Local health changed: $LO_HVAL_POST != $LO_HVAL_PRE"
# Assert that the summed local NI health moved: calls error() when
# LO_HVAL_POST still equals LO_HVAL_PRE. Only "changed" is checked, not
# the direction or size of the change.
1450 function check_local_health() {
1451 echo "Check that local NI health has been changed"
1452 [[ $LO_HVAL_POST -eq $LO_HVAL_PRE ]] &&
1453 error "Local health unchanged: $LO_HVAL_POST == $LO_HVAL_PRE"
# Assert that the summed health of the remote peer is the same before
# and after: calls error() when RMT_HVAL_POST differs from RMT_HVAL_PRE.
1458 function check_no_remote_health() {
1459 echo "Check that remote NI health is unchanged"
1460 [[ $RMT_HVAL_POST -ne $RMT_HVAL_PRE ]] &&
1461 error "Remote health changed: $RMT_HVAL_POST != $RMT_HVAL_PRE"
# Assert that the summed health of the remote peer moved: calls error()
# when RMT_HVAL_POST still equals RMT_HVAL_PRE. Only "changed" is
# checked, not the direction or size of the change.
1466 function check_remote_health() {
1467 echo "Check that remote NI health has been changed"
1468 [[ $RMT_HVAL_POST -eq $RMT_HVAL_PRE ]] &&
1469 error "Remote health unchanged: $RMT_HVAL_POST == $RMT_HVAL_PRE"
# Prepare the local host and one remote node for the LNet health tests.
#
# Argument: a boolean command ("true"/"false") read as ${need_mr};
#   presumably taken from $1 - the assignment is not visible here. It
#   says whether the test needs a Multi-Rail (true) or single-rail
#   (false) configuration.
# Globals written: LNIDS (local NIDs), RNODE (first remote node),
#   RNIDS (NIDs of RNODE), NET_DEL_ARGS (arguments that undo the net
#   added on RNODE; consumed by cleanup_health_test).
#
# Steps shown below: skip on kfi and when there is no remote node,
# reload LNet with config_on_load=1 locally (and on RNODE if it reports
# no NIDs), require a non-routed setup, discover ${RNIDS[0]}, and - for
# a Multi-Rail test - add a second net on either side when fewer than
# two NIDs exist.
#
# NOTE(review): both 'error "... rc=$?"' calls sit inside
# 'if [[ $rc -ne 0 ]]', so $? there is the status of that test (0), not
# the failed command's status; "$rc" looks like what was intended.
# NOTE(review): the second NID is recorded by appending "1" to the
# first NID string rather than by re-reading the NID list.
1479 setup_health_test() {
1483 [[ ${NETTYPE} == kfi* ]] && skip "kfi doesn't support drop rules"
1485 local rnodes=$(remote_nodes_list)
1486 [[ -z $rnodes ]] && skip "Need at least 1 remote node"
1488 cleanup_lnet || error "Failed to cleanup before test execution"
1490 # Loading modules should configure LNet with the appropriate
1491 # test-framework configuration
1492 load_lnet "config_on_load=1" || error "Failed to load modules"
1494 LNIDS=( $($LCTL list_nids | xargs echo) )
# Only the first node of the remote node list is used.
1496 RNODE=$(awk '{print $1}' <<<$rnodes)
1497 RNIDS=( $(do_node $RNODE $LCTL list_nids | xargs echo) )
# No NIDs reported by the remote node: load LNet there and ask again.
1499 if [[ -z ${RNIDS[@]} ]]; then
1500 do_rpc_nodes $RNODE load_lnet "config_on_load=1"
1502 RNIDS=( $(do_node $RNODE $LCTL list_nids | xargs echo) )
1505 [[ ${#LNIDS[@]} -lt 1 ]] &&
1506 error "No NIDs configured for local host $HOSTNAME"
1507 [[ ${#RNIDS[@]} -lt 1 ]] &&
1508 error "No NIDs configured for remote host $RNODE"
1510 # Ensure all peer NIs are local (i.e. non-routed config)
1511 local rnid rnet lnid lnet
1513 for rnid in ${RNIDS[@]}; do
1515 for lnid in ${LNIDS[@]}; do
1517 [[ ${lnet} == ${rnet} ]] &&
1520 [[ ${lnet} != ${rnet} ]] &&
1521 skip "Need non-routed configuration"
1524 do_lnetctl discover ${RNIDS[0]} ||
1525 error "Unable to discover ${RNIDS[0]}"
1527 local mr=$($LNETCTL peer show --nid ${RNIDS[0]} |
1528 awk '/Multi-Rail/{print $NF}')
1530 if ${need_mr} && [[ $mr == False ]]; then
1531 cleanup_health_test || return $?
1535 if ( ! ${need_mr} && [[ ${#RNIDS[@]} -gt 1 ]] ) ||
1536 ( ! ${need_mr} && [[ ${#LNIDS[@]} -gt 1 ]] ); then
1537 cleanup_health_test || return $?
1541 if ${need_mr} && [[ ${#RNIDS[@]} -lt 2 ]]; then
1542 # Add a second, reachable NID to rnode.
1543 local net=${RNIDS[0]}
1547 local if=$(do_rpc_nodes --quiet $RNODE lnet_if_list)
1549 error "Failed to determine interface for $RNODE"
1551 do_rpc_nodes $RNODE "$LNETCTL lnet configure"
1552 do_rpc_nodes $RNODE "$LNETCTL net add --net $net --if $if" ||
1554 if [[ $rc -ne 0 ]]; then
1555 error "Failed to add interface to $RNODE rc=$?"
1557 RNIDS[1]="${RNIDS[0]}1"
1558 NET_DEL_ARGS="--net $net --if $if"
1562 if ${need_mr} && [[ ${#LNIDS[@]} -lt 2 ]]; then
1563 local net=${LNIDS[0]}
1566 do_lnetctl lnet configure &&
1567 do_lnetctl net add --net $net --if ${INTERFACES[0]} ||
1569 if [[ $rc -ne 0 ]]; then
1570 error "Failed to add interface rc=$?"
1572 LNIDS[1]="${LNIDS[0]}1"
# Show the resulting peer NIDs and health values, then enable LNet
# "net" debug messages for the test that follows.
1578 $LNETCTL peer show -v 2 | egrep -e nid -e health
1580 $LCTL set_param debug=+net
# Undo what setup_health_test did: when NET_DEL_ARGS is set, delete the
# extra net on $RNODE with "$LNETCTL net del", then unload the modules
# locally (unload_modules) and on $RNODE (unload_modules_local).
# Failures are collected and end in error "Failed cleanup".
# NOTE(review): the lines that record and test the failure status are
# not visible here; the exact aggregation is unconfirmed.
1586 cleanup_health_test() {
1589 if [[ -n $NET_DEL_ARGS ]]; then
1590 do_rpc_nodes $RNODE \
1591 "$LNETCTL net del $NET_DEL_ARGS" ||
1596 unload_modules || rc=$?
1599 do_rpc_nodes $RNODE unload_modules_local ||
1605 error "Failed cleanup"
# Install LNet drop rules that make every GET fail with a given status.
# $1 - health status name handed to "net_drop_add -e"
#      (e.g. local_error, remote_dropped, network_timeout).
# A rule with rate 1 (-r 1) is added for every pair of a local NID as
# source and each remote or local NID as destination; a failing
# net_drop_add is reported through error().
1610 add_health_test_drop_rules() {
1611 local args="-m GET -r 1 -e ${1}"
1614 for src in "${LNIDS[@]}"; do
1615 for dst in "${RNIDS[@]}" "${LNIDS[@]}"; do
1616 $LCTL net_drop_add -s $src -d $dst ${args} ||
1617 error "Failed to add drop rule $src $dst $args"
# Run one simulated-failure ping and leave pre/post counters behind for
# the check_* helpers.
# Uses ${hstatus}, the health status to simulate (presumably taken from
# $1 - the assignment is not visible here).
# Sequence shown below: take the baseline (lnet_health_pre), add drop
# rules for the status, ping ${RNIDS[0]} and treat a successful ping as
# a test error, then remove all drop rules.
# NOTE(review): the post-sample (lnet_health_post) is presumably taken
# on a line that is not visible here.
1622 do_lnet_health_ping_test() {
1625 echo "Simulate $hstatus"
1627 lnet_health_pre || return $?
1629 add_health_test_drop_rules ${hstatus}
1630 do_lnetctl ping ${RNIDS[0]} &&
1631 error "Should have failed"
1635 $LCTL net_drop_del -a
1640 # See lnet/lnet/lib-msg.c:lnet_health_check()
1641 LNET_LOCAL_RESEND_STATUSES="local_interrupt local_dropped local_aborted"
1642 LNET_LOCAL_RESEND_STATUSES+=" local_no_route local_timeout"
1643 LNET_LOCAL_NO_RESEND_STATUSES="local_error"
1645 setup_health_test false || return $?
1648 for hstatus in ${LNET_LOCAL_RESEND_STATUSES} \
1649 ${LNET_LOCAL_NO_RESEND_STATUSES}; do
1650 do_lnet_health_ping_test "${hstatus}" || return $?
1651 check_no_resends || return $?
1652 check_no_local_health || return $?
1655 cleanup_health_test || return $?
1659 run_test 204 "Check no health or resends for single-rail local failures"
1662 setup_health_test true || return $?
1665 for hstatus in ${LNET_LOCAL_RESEND_STATUSES}; do
1666 do_lnet_health_ping_test "${hstatus}" || return $?
1667 check_resends || return $?
1668 check_local_health || return $?
1671 for hstatus in ${LNET_LOCAL_NO_RESEND_STATUSES}; do
1672 do_lnet_health_ping_test "${hstatus}" || return $?
1673 check_no_resends || return $?
1674 check_local_health || return $?
1677 cleanup_health_test || return $?
1681 run_test 205 "Check health and resends for multi-rail local failures"
1683 # See lnet/lnet/lib-msg.c:lnet_health_check()
1684 LNET_REMOTE_RESEND_STATUSES="remote_dropped"
1685 LNET_REMOTE_NO_RESEND_STATUSES="remote_error remote_timeout"
1687 setup_health_test false || return $?
1690 for hstatus in ${LNET_REMOTE_RESEND_STATUSES} \
1691 ${LNET_REMOTE_NO_RESEND_STATUSES}; do
1692 do_lnet_health_ping_test "${hstatus}" || return $?
1693 check_no_resends || return $?
1694 check_no_local_health || return $?
1695 check_no_remote_health || return $?
1698 cleanup_health_test || return $?
1702 run_test 206 "Check no health or resends for single-rail remote failures"
1705 setup_health_test true || return $?
1708 for hstatus in ${LNET_REMOTE_RESEND_STATUSES}; do
1709 do_lnet_health_ping_test "${hstatus}" || return $?
1710 check_resends || return $?
1711 check_no_local_health || return $?
1712 check_remote_health || return $?
1713 do_lnetctl peer set --health 1000 --all ||
1714 error "Unable to reset health rc=$?"
1716 for hstatus in ${LNET_REMOTE_NO_RESEND_STATUSES}; do
1717 do_lnet_health_ping_test "${hstatus}" || return $?
1718 check_no_resends || return $?
1719 check_no_local_health || return $?
1720 check_remote_health || return $?
1721 do_lnetctl peer set --health 1000 --all ||
1722 error "Unable to reset health rc=$?"
1725 cleanup_health_test || return $?
1729 run_test 207 "Check health and resends for multi-rail remote errors"
# Load LNet with a kernel ip2nets string and verify the resulting NIDs.
# Uses ${ip2nets_str}, ${p_nid} and ${s_nid}; callers pass the ip2nets
# string, the expected first NID and optionally an expected second NID
# (presumably $1, $2, $3 - the assignments are not visible here).
# Checks, each failing through error():
#   - the number of NIDs from "$LCTL list_nids" equals num_expected
#     (starts at 1; presumably raised when s_nid is given - not visible)
#   - nids[0] equals p_nid
#   - when s_nid is non-empty, nids[1] equals s_nid
# Finally brings the network down again with "$LCTL net down".
1731 test_208_load_and_check_lnet() {
1735 local num_expected=1
1737 load_lnet "networks=\"\" ip2nets=\"${ip2nets_str}\""
1740 error "Failed to load LNet with ip2nets \"${ip2nets_str}\""
1746 nids=( $($LCTL list_nids) )
1748 [[ ${#nids[@]} -ne ${num_expected} ]] &&
1749 error "Expect ${num_expected} NIDs found ${#nids[@]}"
1751 [[ ${nids[0]} == ${p_nid} ]] ||
1752 error "Expect NID \"${p_nid}\" found \"${nids[0]}\""
1754 [[ -n $s_nid ]] && [[ ${nids[1]} != ${s_nid} ]] &&
1755 error "Expect second NID \"${s_nid}\" found \"${nids[1]}\""
1757 $LCTL net down &>/dev/null
1762 [[ ${NETTYPE} == tcp* ]] ||
1763 skip "Need tcp NETTYPE"
1765 cleanup_netns || error "Failed to cleanup netns before test execution"
1766 cleanup_lnet || error "Failed to unload modules before test execution"
1767 setup_fakeif || error "Failed to add fake IF"
1769 have_interface "$FAKE_IF" ||
1770 error "Expect $FAKE_IF configured but not found"
1772 local if0_ip=$(ip --oneline addr show dev ${INTERFACES[0]} |
1773 awk '/inet /{print $4}' |
1775 if0_ip=($(echo "${if0_ip[@]}" | tr ' ' '\n' | uniq | tr '\n' ' '))
1776 local ip2nets_str="tcp(${INTERFACES[0]}) $if0_ip"
1778 echo "Configure single NID \"$ip2nets_str\""
1779 test_208_load_and_check_lnet "${ip2nets_str}" "${if0_ip}@tcp"
1781 ip2nets_str="tcp(${INTERFACES[0]}) $if0_ip; tcp1($FAKE_IF) $FAKE_IP"
1782 echo "Configure two NIDs; two NETs \"$ip2nets_str\""
1783 test_208_load_and_check_lnet "${ip2nets_str}" "${if0_ip}@tcp" \
1786 ip2nets_str="tcp(${INTERFACES[0]}) $if0_ip; tcp($FAKE_IF) $FAKE_IP"
1787 echo "Configure two NIDs; one NET \"$ip2nets_str\""
1788 test_208_load_and_check_lnet "${ip2nets_str}" "${if0_ip}@tcp" \
1790 local addr1=( ${if0_ip//./ } )
1791 local addr2=( ${FAKE_IP//./ } )
1792 local range="[${addr1[0]},${addr2[0]}]"
1795 for i in $(seq 1 3); do
1796 range+=".[${addr1[$i]},${addr2[$i]}]"
1798 ip2nets_str="tcp(${INTERFACES[0]},${FAKE_IF}) ${range}"
1800 echo "Configured two NIDs; one NET alt syntax \"$ip2nets_str\""
1801 test_208_load_and_check_lnet "${ip2nets_str}" "${if0_ip}@tcp" \
1806 echo "alt syntax with missing IF \"$ip2nets_str\""
1807 load_lnet "networks=\"\" ip2nets=\"${ip2nets_str}\""
1809 echo "$LCTL net up should fail"
1811 error "LNet bring up should have failed"
1815 run_test 208 "Test various kernel ip2nets configurations"
1818 setup_health_test false || return $?
1820 echo "Simulate network_timeout w/SR config"
1823 add_health_test_drop_rules network_timeout
1825 do_lnetctl discover ${RNIDS[0]} &&
1826 error "Should have failed"
1830 check_no_resends || return $?
1831 check_no_local_health || return $?
1832 check_no_remote_health || return $?
1834 cleanup_health_test || return $?
1836 setup_health_test true || return $?
1838 echo "Simulate network_timeout w/MR config"
1842 add_health_test_drop_rules network_timeout
1844 do_lnetctl discover ${RNIDS[0]} &&
1845 error "Should have failed"
1849 check_no_resends || return $?
1850 check_local_health || return $?
1851 check_remote_health || return $?
1853 cleanup_health_test || return $?
1857 run_test 209 "Check health, but not resends, for network timeout"
# Verify the contents of an LNet recovery queue.
# $1 - option passed to "$LNETCTL debug recovery"; callers use "-l" and
#      "-p" (presumably the local and peer queues - confirm against
#      lnetctl).
# Uses $expect, the expected number of queued NIDs (callers pass it as
# the second argument; the assignment is not visible here).
# Fails through error() when the number of "nid-" lines in the queue
# dump differs from $expect. With a non-zero expectation it then looks
# for one of this host's NIDs as entry "nid-0" and errors out when none
# is found.
1859 check_nid_in_recovq() {
1860 local recovq=$($LNETCTL debug recovery $1)
1862 local nids=$($LCTL list_nids | xargs echo)
1866 echo "Check \"$1\" recovery queue"
1868 if [[ $(grep -c 'nid-'<<<$recovq) -ne $expect ]]; then
1869 error "Expect $expect NIDs found: \"$recovq\""
# Nothing further to match when the queue is expected to be empty.
1872 [[ $expect -eq 0 ]] && return 0
1874 for nid in ${nids}; do
1875 grep -q "nid-0: $nid"<<<$recovq &&
1880 error "Didn't find local NIDs in recovery queue: \"$recovq\""
1886 # First enqueue happens at time 0.
1887 # 2nd at 0 + 2^0 = 1
1888 # 3rd at 1 + 2^1 = 3
1889 # 4th at 3 + 2^2 = 7
1890 # 5th at 7 + 2^3 = 15
1891 # e.g. after 10 seconds we would expect to have seen the 4th enqueue,
1892 # (3 pings sent, 4th about to happen) and the 5th enqueue is yet to happen.
1894 # If the recovery limit is 10 seconds, then when the 5th enqueue happens
1895 # we expect the peer NI to have aged out, so it will not actually be queued.
1897 # If max_recovery_ping_interval is set to 4 then:
1898 # First enqueue happens at time 0.
1899 # 2nd at 0 + min(2^0, 4) = 1
1900 # 3rd at 1 + min(2^1, 4) = 3
1901 # 4th at 3 + min(2^2, 4) = 7
1902 # 5th at 7 + min(2^3, 4) = 11
1903 # 6th at 11 + min(2^4, 4) = 15
1904 # 7th at 15 + min(2^5, 4) = 19
1905 # e.g. after 4 seconds we would expect to have seen the 3rd enqueue,
1906 # (2 pings sent, 3rd about to happen), and the 4th enqueue is yet to happen
1907 # e.g. after 13 seconds we would expect to have seen the 5th enqueue,
1908 # (4 pings sent, 5th about to happen), and the 6th enqueue is yet to happen
# Verify the recovery ping counters of local NIs or peer NIs.
# Uses $queue ("ni" -> "$LNETCTL net show -v 2", "peer_ni" ->
# "$LNETCTL peer show -v 2"; anything else is an error) and $expect,
# the expected ping count. Callers pass them as first and second
# argument; the assignments are not visible here.
# Every "ping_count" value found must be either 0 or $expect, and a
# non-zero $expect may match at most one interface; violations are
# reported through error().
# NOTE(review): egrep is deprecated in current GNU grep ("grep -E").
1909 check_ping_count() {
1913 echo "Check ping counts:"
1915 if [[ $queue == "ni" ]]; then
1916 $LNETCTL net show -v 2 | egrep 'nid|health value|ping'
1917 ping_count=( $($LNETCTL net show -v 2 |
1918 awk '/ping_count/{print $NF}') )
1919 elif [[ $queue == "peer_ni" ]]; then
1920 $LNETCTL peer show -v 2 | egrep 'nid|health value|ping'
1921 ping_count=( $($LNETCTL peer show -v 2 |
1922 awk '/ping_count/{print $NF}') )
1924 error "Unrecognized queue \"$queue\""
1930 for count in "${ping_count[@]}"; do
1931 if [[ $count -eq $expect ]]; then
1932 if [[ $expect -ne 0 ]] && $found ; then
1933 error "Found more than one interface matching \"$expect\" ping count"
1936 echo "Expect ping count \"$expect\" found \"$count\""
1939 elif [[ $count -ne 0 ]]; then
1940 error "Found interface with ping count \"$count\" but expect \"$expect\""
1949 [[ ${NETTYPE} == kfi* ]] && skip "kfi doesn't support drop rules"
1951 reinit_dlc || return $?
1952 add_net "${NETTYPE}" "${INTERFACES[0]}" || return $?
1953 add_net "${NETTYPE}1" "${INTERFACES[0]}" || return $?
1955 local prim_nid=$($LCTL list_nids | head -n 1)
1957 do_lnetctl discover $prim_nid ||
1958 error "failed to discover myself"
1960 local default=$($LNETCTL global show |
1961 awk '/recovery_limit/{print $NF}')
1962 # Set recovery limit to 10 seconds.
1963 do_lnetctl set recovery_limit 10 ||
1964 error "failed to set recovery_limit"
1966 $LCTL set_param debug=+net
1967 # Use local_error so LNet doesn't attempt to resend the discovery ping
1968 $LCTL net_drop_add -s *@${NETTYPE} -d *@${NETTYPE} -m GET -r 1 -e local_error
1969 $LCTL net_drop_add -s *@${NETTYPE}1 -d *@${NETTYPE}1 -m GET -r 1 -e local_error
1970 do_lnetctl discover $prim_nid &&
1971 error "Expected discovery to fail"
1973 # See comment for check_ping_count()
1975 check_nid_in_recovq "-l" "1"
1976 check_ping_count "ni" "2"
1980 check_nid_in_recovq "-l" "1"
1981 check_ping_count "ni" "3"
1983 $LCTL net_drop_del -a
1985 reinit_dlc || return $?
1986 add_net "${NETTYPE}" "${INTERFACES[0]}" || return $?
1987 add_net "${NETTYPE}1" "${INTERFACES[0]}" || return $?
1989 local prim_nid=$($LCTL list_nids | head -n 1)
1991 do_lnetctl discover $prim_nid ||
1992 error "failed to discover myself"
1994 do_lnetctl set recovery_limit $default ||
1995 error "failed to set recovery_limit"
1997 default=$($LNETCTL global show |
1998 awk '/max_recovery_ping_interval/{print $NF}')
1999 do_lnetctl set max_recovery_ping_interval 4 ||
2000 error "failed to set max_recovery_ping_interval"
2002 $LCTL set_param debug=+net
2003 # Use local_error so LNet doesn't attempt to resend the discovery ping
2004 $LCTL net_drop_add -s *@${NETTYPE} -d *@${NETTYPE} -m GET -r 1 -e local_error
2005 $LCTL net_drop_add -s *@${NETTYPE}1 -d *@${NETTYPE}1 -m GET -r 1 -e local_error
2006 do_lnetctl discover $prim_nid &&
2007 error "Expected discovery to fail"
2009 # See comment for check_ping_count()
2011 check_nid_in_recovq "-l" "1"
2012 check_ping_count "ni" "2"
2015 check_nid_in_recovq "-l" "1"
2016 check_ping_count "ni" "4"
2018 $LCTL net_drop_del -a
2020 do_lnetctl set max_recovery_ping_interval $default ||
2021 error "failed to set max_recovery_ping_interval"
2025 run_test 210 "Local NI recovery checks"
2028 [[ ${NETTYPE} == kfi* ]] && skip "kfi doesn't support drop rules"
2030 reinit_dlc || return $?
2031 add_net "${NETTYPE}" "${INTERFACES[0]}" || return $?
2032 add_net "${NETTYPE}1" "${INTERFACES[0]}" || return $?
2034 local prim_nid=$($LCTL list_nids | head -n 1)
2036 do_lnetctl discover $prim_nid ||
2037 error "failed to discover myself"
2039 local default=$($LNETCTL global show |
2040 awk '/recovery_limit/{print $NF}')
2041 # Set recovery limit to 10 seconds.
2042 do_lnetctl set recovery_limit 10 ||
2043 error "failed to set recovery_limit"
2045 $LCTL net_drop_add -s *@${NETTYPE} -d *@${NETTYPE} -m GET -r 1 -e remote_error
2046 $LCTL net_drop_add -s *@${NETTYPE}1 -d *@${NETTYPE}1 -m GET -r 1 -e remote_error
2048 # Set health to 0 on one interface. This forces it onto the recovery queue.
2050 $LNETCTL peer set --nid $prim_nid --health 0
2052 # After 5 seconds, we expect the peer NI to still be in recovery
2054 check_nid_in_recovq "-p" 1
2055 check_ping_count "peer_ni" "2"
2057 # After 15 seconds, the peer NI should have been fully processed out of
2058 # the recovery queue. We'll allow a total of 17 seconds to account for
2059 # differences in sleeping for whole seconds vs. the more accurate time
2060 # keeping that is done in the recovery code.
2062 check_nid_in_recovq "-p" 0
2063 check_ping_count "peer_ni" "4"
2065 $LCTL net_drop_del -a
2067 # Set health to force it back onto the recovery queue. Set to 500 means
2068 # in 5 seconds it should be back at maximum value. We'll wait a couple
2069 # more seconds than that to be safe.
2070 # NB: we reset the recovery limit to 0 (indefinite) so the peer NI is
2072 do_lnetctl set recovery_limit 0 ||
2073 error "failed to set recovery_limit"
2075 $LNETCTL peer set --nid $prim_nid --health 500
2077 check_nid_in_recovq "-p" 1
2078 check_ping_count "peer_ni" "2"
2082 check_nid_in_recovq "-p" 0
2083 check_ping_count "peer_ni" "0"
2085 reinit_dlc || return $?
2086 add_net "${NETTYPE}" "${INTERFACES[0]}" || return $?
2087 add_net "${NETTYPE}1" "${INTERFACES[0]}" || return $?
2089 local prim_nid=$($LCTL list_nids | head -n 1)
2091 do_lnetctl discover $prim_nid ||
2092 error "failed to discover myself"
2094 do_lnetctl set recovery_limit $default ||
2095 error "failed to set recovery_limit"
2097 default=$($LNETCTL global show |
2098 awk '/max_recovery_ping_interval/{print $NF}')
2099 do_lnetctl set max_recovery_ping_interval 4 ||
2100 error "failed to set max_recovery_ping_interval"
2102 $LCTL net_drop_add -s *@${NETTYPE} -d *@${NETTYPE} -m GET -r 1 -e remote_error
2103 $LCTL net_drop_add -s *@${NETTYPE}1 -d *@${NETTYPE}1 -m GET -r 1 -e remote_error
2105 # Set health to 0 on one interface. This forces it onto the recovery queue.
2107 $LNETCTL peer set --nid $prim_nid --health 0
2109 # See comment for check_ping_count()
2111 check_nid_in_recovq "-p" "1"
2112 check_ping_count "peer_ni" "2"
2115 check_nid_in_recovq "-p" "1"
2116 check_ping_count "peer_ni" "4"
2118 $LCTL net_drop_del -a
2120 do_lnetctl set max_recovery_ping_interval $default ||
2121 error "failed to set max_recovery_ping_interval"
2125 run_test 211 "Remote NI recovery checks"
2128 [[ ${NETTYPE} == kfi* ]] && skip "kfi doesn't support drop rules"
2130 local rnodes=$(remote_nodes_list)
2131 [[ -z $rnodes ]] && skip "Need at least 1 remote node"
2133 cleanup_lnet || error "Failed to cleanup before test execution"
2135 # Loading modules should configure LNet with the appropriate
2136 # test-framework configuration
2137 load_lnet "config_on_load=1" || error "Failed to load modules"
2139 local my_nid=$($LCTL list_nids | head -n 1)
2141 error "Failed to get primary NID for local host $HOSTNAME"
2143 local rnode=$(awk '{print $1}' <<<$rnodes)
2144 local rnodenids=$(do_node $rnode $LCTL list_nids | xargs echo)
2147 if [[ -z $rnodenids ]]; then
2148 do_rpc_nodes $rnode load_lnet "config_on_load=1"
2150 rnodenids=$(do_node $rnode $LCTL list_nids | xargs echo)
2153 local rnodepnid=$(awk '{print $1}' <<< $rnodenids)
2155 [[ -z $rnodepnid ]] &&
2156 error "Failed to get primary NID for remote host $rnode"
2158 log "Initial discovery"
2159 do_lnetctl discover --force $rnodepnid ||
2160 error "Failed to discover $rnodepnid"
2162 do_node $rnode "$LNETCTL discover --force $my_nid" ||
2163 error "$rnode failed to discover $my_nid"
2165 log "Fail local discover ping to set LNET_PEER_REDISCOVER flag"
2166 $LCTL net_drop_add -s "*@$NETTYPE" -d "*@$NETTYPE" -r 1 -e local_error
2167 do_lnetctl discover --force $rnodepnid &&
2168 error "Discovery should have failed"
2169 $LCTL net_drop_del -a
2172 for nid in $rnodenids; do
2173 # We need GET (PING) delay just long enough so we can trigger
2174 # discovery on the remote peer
2175 $LCTL net_delay_add -s "*@$NETTYPE" -d $nid -r 1 -m GET -l 3
2176 $LCTL net_drop_add -s "*@$NETTYPE" -d $nid -r 1 -m GET -e local_error
2177 # We need PUT (PUSH) delay just long enough so we can process
2179 $LCTL net_delay_add -s "*@$NETTYPE" -d $nid -r 1 -m PUT -l 6
2182 log "Force $HOSTNAME to discover $rnodepnid (in background)"
2183 # We want to get a PING sent that we know will eventually fail.
2184 # The delay rules we added will ensure the ping is not sent until
2185 # the PUSH is also in flight (see below), and the drop rule ensures that
2186 # when the PING is eventually sent it will error out
2187 do_lnetctl discover --force $rnodepnid &
2190 # We want a discovery PUSH from rnode to put rnode back on our
2191 # discovery queue. This should cause us to try and send a PUSH to rnode
2192 # while the PING is still outstanding.
2193 log "Force $rnode to discover $my_nid"
2194 do_node $rnode $LNETCTL discover --force $my_nid
2196 # At this point we'll have both PING_SENT and PUSH_SENT set for the
2197 # rnode peer. Wait for the PING to error out which should terminate the
2198 # discovery process that we backgrounded.
2199 log "Wait for $pid1"
2201 log "Finished wait on $pid1"
2203 # The PING send failure clears the PING_SENT flag and puts the peer back
2204 # on the discovery queue. When discovery thread processes the peer it
2205 # will mistakenly clear the PUSH_SENT flag (and set PUSH_FAILED).
2206 # Discovery will then complete for this peer even though we have an
2208 # When PUSH is actually unlinked it will be forced back onto the
2209 # discovery queue, but we no longer have a ref on the peer. When
2210 # discovery completes again, we'll trip the ASSERT in
2211 # lnet_destroy_peer_locked()
2213 # Delete the delay rules to send the PUSH
2214 $LCTL net_delay_del -a
2215 # Delete the drop rules
2216 $LCTL net_drop_del -a
2219 error "Failed to unload modules"
2221 do_rpc_nodes $rnode unload_modules_local ||
2222 error "Failed to unload modules on $rnode"
2227 run_test 212 "Check discovery refcount loss bug (LU-14627)"
2230 [[ ${NETTYPE} == tcp* ]] || skip "Need tcp NETTYPE"
2232 cleanup_netns || error "Failed to cleanup netns before test execution"
2233 cleanup_lnet || error "Failed to unload modules before test execution"
2235 setup_fakeif || error "Failed to add fake IF"
2236 have_interface "$FAKE_IF" ||
2237 error "Expect $FAKE_IF configured but not found"
2239 reinit_dlc || return $?
2241 add_net "tcp" "${INTERFACES[0]}" || return $?
2242 add_net "tcp" "$FAKE_IF" || return $?
2244 local nid1=$(lctl list_nids | head -n 1)
2245 local nid2=$(lctl list_nids | tail --lines 1)
2247 [[ $(lctl which_nid $nid1 $nid2) == $nid1 ]] ||
2248 error "Expect nid1 \"$nid1\" to be preferred"
2250 [[ $(lctl which_nid $nid2 $nid1) == $nid2 ]] ||
2251 error "Expect nid2 \"$nid2\" to be preferred"
2255 run_test 213 "Check LNetDist calculation for multiple local NIDs"
# Assert the "status" field of one local NI in "$LNETCTL net show".
# Uses $nid and $expect; callers pass the NID and the expected status
# ("up"/"down") as first and second argument (the assignments are not
# visible here).
# NOTE(review): the filter that narrows the output to $nid sits on a
# line that is not visible here.
# Prints what was found and calls error() on a mismatch.
2257 function check_ni_status() {
2261 local status=$($LNETCTL net show |
2263 awk '/status/{print $NF}')
2265 echo "NI ${nid} expect status \"${expect}\" found \"${status}\""
2266 if [[ $status != $expect ]]; then
2267 error "Error: Expect NI status \"$expect\" for NID \"$nid\" but found \"$status\""
2274 [[ ${NETTYPE} == tcp* ]] || skip "Need tcp NETTYPE"
2276 cleanup_netns || error "Failed to cleanup netns before test execution"
2277 cleanup_lnet || error "Failed to unload modules before test execution"
2279 setup_fakeif || error "Failed to add fake IF"
2280 have_interface "$FAKE_IF" ||
2281 error "Expect $FAKE_IF configured but not found"
2283 reinit_dlc || return $?
2285 add_net "tcp" "${INTERFACES[0]}" || return $?
2286 add_net "tcp" "$FAKE_IF" || return $?
2288 local nid1=$(lctl list_nids | head -n 1)
2289 local nid2=$(lctl list_nids | tail --lines 1)
2291 check_ni_status "0@lo" up
2292 check_ni_status "$nid1" up
2293 check_ni_status "$nid2" up
2295 do_lnetctl ping --source $nid2 $nid1 ||
2296 error "$LNETCTL ping --source $nid2 $nid1 failed"
2298 echo "Set $FAKE_IF down"
2299 echo "ip link set dev $FAKE_IF down"
2300 ip link set dev $FAKE_IF down
2301 check_ni_status "0@lo" up
2302 check_ni_status "$nid1" up
2303 check_ni_status "$nid2" down
2305 run_test 214 "Check local NI status when link is downed"
2311 $LNETCTL net show -v 2 |
2312 egrep -e nid -e $stat |
2314 awk '/'$stat':/{print $NF}'
2319 for nidvar in nid1 nid2; do
2320 for stat in send_count recv_count; do
2321 s=$(get_ni_stat ${!nidvar} $stat)
2322 eval ${nidvar}_pre_${stat}=$s
2329 for nidvar in nid1 nid2; do
2330 for stat in send_count recv_count; do
2331 s=$(get_ni_stat ${!nidvar} $stat)
2332 eval ${nidvar}_post_${stat}=$s
2342 eval pre=\${${nidvar}_pre_${stat}}
2343 eval post=\${${nidvar}_post_${stat}}
2345 echo "${!nidvar} pre ${stat} $pre post ${stat} $post"
2347 [[ $pre -ne $post ]]
2351 cleanup_netns || error "Failed to cleanup netns before test execution"
2352 cleanup_lnet || error "Failed to unload modules before test execution"
2354 reinit_dlc || return $?
2356 add_net "${NETTYPE}1" "${INTERFACES[0]}" || return $?
2357 add_net "${NETTYPE}2" "${INTERFACES[0]}" || return $?
2359 local nid1=$($LCTL list_nids | head -n 1)
2360 local nid2=$($LCTL list_nids | tail --lines 1)
2362 do_lnetctl peer add --prim $nid1 --nid $nid2 ||
2363 error "Failed to add peer"
2367 for nidvarA in nid1 nid2; do
2370 for nidvarB in nid1 nid2; do
2371 [[ $nidvarA == $nidvarB ]] && continue
2375 echo "$LNETCTL ping $dst x $npings"
2376 for i in $(seq 1 $npings); do
2377 $LNETCTL ping $dst &>/dev/null ||
2378 error "$LNETCTL ping $dst failed"
2383 # No source specified, sends to either NID should cause
2384 # counts to increase across both NIs
2385 for nidvar in nid1 nid2; do
2386 for stat in send_count recv_count; do
2387 ni_stat_changed $nidvar $stat ||
2388 error "$stat unchanged for ${!nidvar}"
2394 echo "$LNETCTL ping --source $src $dst x $npings"
2395 for i in $(seq 1 $npings); do
2396 $LNETCTL ping --source $src $dst &>/dev/null ||
2397 error "$LNETCTL ping --source $src $dst failed"
2402 # src nid == dest nid means stats for the _other_ NI
2403 # should be unchanged
2404 for nidvar in nid1 nid2; do
2405 for stat in send_count recv_count; do
2406 if [[ ${!nidvar} == $src ]]; then
2407 ni_stat_changed $nidvar $stat ||
2408 error "$stat unchanged for ${!nidvar}"
2410 ni_stat_changed $nidvar $stat &&
2411 error "$stat changed for ${!nidvar}"
2416 # Double number of pings for next iteration because the net
2417 # sequence numbers will have diverged
2418 npings=$(($npings * 2))
2421 # Ping from nid1 to nid2 should fail
2422 do_lnetctl ping --source $nid1 $nid2 &&
2423 error "ping from $nid1 to $nid2 should fail"
2425 # Ping from nid2 to nid1 should fail
2426 do_lnetctl ping --source $nid2 $nid1 &&
2427 error "ping from $nid2 to $nid1 should fail"
2431 run_test 215 "Test lnetctl ping --source option"
2434 [[ ${NETTYPE} == kfi* ]] && skip "kfi doesn't support drop rules"
2438 reinit_dlc || return $?
2440 add_net "${NETTYPE}" "${INTERFACES[0]}" || return $?
2441 add_net "${NETTYPE}1" "${INTERFACES[0]}" || return $?
2443 local nids=( $($LCTL list_nids | xargs echo) )
2445 do_lnetctl discover ${nids[0]} ||
2446 error "Initial discovery failed"
2448 do_lnetctl ping --source ${nids[0]} ${nids[0]} ||
2449 error "Initial ping failed $?"
2451 do_lnetctl ping --source ${nids[1]} ${nids[1]} ||
2452 error "Initial ping failed $?"
2455 for src in "${nids[@]}"; do
2456 for dst in "${nids[@]}"; do
2457 $LCTL net_drop_add -r 1 -s $src -d $dst -e network_timeout
2461 do_lnetctl ping ${nids[0]} || rc=$?
2463 $LCTL net_drop_del -a
2466 error "expected ping to fail"
2468 check_nid_in_recovq "-p" 0
2469 check_nid_in_recovq "-l" 1
2473 run_test 216 "Failed send to peer NI owned by local host should not trigger peer NI recovery"
2476 reinit_dlc || return $?
2478 [[ $($LNETCTL net show | grep -c nid) -ne 1 ]] &&
2479 error "Unexpected number of NIs after initalizing DLC"
2481 do_lnetctl discover 0@lo ||
2482 error "Failed to discover 0@lo"
2486 run_test 217 "Don't leak memory when discovering peer with nnis <= 1"
2489 [[ ${NETTYPE} == kfi* ]] && skip "kfi doesn't support drop rules"
2491 reinit_dlc || return $?
2493 [[ ${#INTERFACES[@]} -lt 2 ]] &&
2494 skip "Need two LNet interfaces"
2496 add_net "${NETTYPE}" "${INTERFACES[0]}" || return $?
2498 local nid1=$($LCTL list_nids | head -n 1)
2500 do_lnetctl ping $nid1 ||
2503 add_net "${NETTYPE}" "${INTERFACES[1]}" || return $?
2505 local nid2=$($LCTL list_nids | tail --lines 1)
2507 do_lnetctl ping $nid2 ||
2510 $LCTL net_drop_add -s $nid1 -d $nid1 -e local_error -r 1
2512 do_lnetctl ping --source $nid1 $nid1 &&
2513 error "ping should have failed"
2515 local health_recovered
2518 for i in $(seq 1 5); do
2519 health_recovered=$($LNETCTL net show -v 2 |
2520 grep -c 'health value: 1000')
2522 if [[ $health_recovered -ne 2 ]]; then
2523 echo "Wait 1 second for health to recover"
2530 health_recovered=$($LNETCTL net show -v 2 |
2531 grep -c 'health value: 1000')
2533 $LCTL net_drop_del -a
2535 [[ $health_recovered -ne 2 ]] &&
2536 do_lnetctl net show -v 2 | egrep -e nid -e health &&
2537 error "Health hasn't recovered"
2541 run_test 218 "Local recovery pings should exercise all available paths"
2544 reinit_dlc || return $?
2545 add_net "${NETTYPE}" "${INTERFACES[0]}" || return $?
2546 add_net "${NETTYPE}1" "${INTERFACES[0]}" || return $?
2548 local nid1=$(lctl list_nids | head -n 1)
2549 local nid2=$(lctl list_nids | tail --lines 1)
2551 do_lnetctl ping $nid1 ||
2552 error "Ping failed $?"
2553 do_lnetctl ping $nid2 ||
2554 error "Ping failed $?"
2556 do_lnetctl discover $nid2 ||
2557 error "Discovery failed"
2559 $LNETCTL peer show --nid $nid1 | grep -q $nid2 ||
2560 error "$nid2 is not listed under $nid1"
2562 run_test 219 "Consolidate peer entries"
2570 do_rpc_nodes $node "$LNETCTL net add --net $net --if $if $opts" ||
2571 error "add $net on interface $if on node $node failed rc=$?"
2579 do_node $node "$LNETCTL route add --net $net --gateway $gw" ||
2580 error "route add to $net via $gw failed rc=$?"
2584 ROUTER_INTERFACES=()
# Pick the nodes and interfaces used by the routing tests.
# Needs at least two remote nodes (skips otherwise). The first becomes
# ROUTER, the second RPEER. LNet is reloaded on both with
# config_on_load=1 so that lnet_if_list can report their interfaces
# into ROUTER_INTERFACES and RPEER_INTERFACES; afterwards the modules
# are unloaded on all three nodes, including the local host.
# Fails through error() when the local host, the router or the remote
# peer ends up with no interface.
2587 init_router_test_vars() {
2588 local rnodes=$(remote_nodes_list)
2589 [[ -z $rnodes || $(wc -w <<<$rnodes) -lt 2 ]] &&
2590 skip "Need at least 2 remote nodes found \"$rnodes\""
2592 ROUTER=$(awk '{print $1}' <<<$rnodes)
2593 RPEER=$(awk '{print $2}' <<<$rnodes)
# From here on rnodes is the comma-separated pair ROUTER,RPEER.
2595 rnodes=$(comma_list $ROUTER $RPEER)
2596 local all_nodes=$(comma_list $rnodes $HOSTNAME)
2598 do_nodes $rnodes $LUSTRE_RMMOD ||
2599 error "failed to unload modules"
2601 do_rpc_nodes $rnodes "load_lnet config_on_load=1" ||
2602 error "Failed to load and configure LNet"
2604 ROUTER_INTERFACES=( $(do_rpc_nodes --quiet $ROUTER lnet_if_list) )
2606 RPEER_INTERFACES=( $(do_rpc_nodes --quiet $RPEER lnet_if_list) )
2608 do_nodes $all_nodes $LUSTRE_RMMOD ||
2609 error "Failed to unload modules"
2611 [[ ${#INTERFACES[@]} -eq 0 ]] &&
2612 error "No interfaces configured for local host $HOSTNAME"
2613 [[ ${#ROUTER_INTERFACES[@]} -eq 0 ]] &&
2614 error "No interfaces configured for router $ROUTER"
2615 [[ ${#RPEER_INTERFACES[@]} -eq 0 ]] &&
2616 error "No interfaces configured for remote peer $RPEER"
2624 LOCAL_NET=${NETTYPE}1
2625 REMOTE_NET=${NETTYPE}2
# Build the three-node routed topology used by the routing tests:
# local host on $LOCAL_NET, $RPEER on $REMOTE_NET, $ROUTER on both.
# $2 - extra options appended to the router's "net add" for $LOCAL_NET.
# Uses ${mod_opts}, module options for load_lnet (presumably seeded from
# $1 on a line that is not visible here); router-checker and router
# buffer settings are appended to it below.
# Skips when MDS1_VERSION is older than 2.15.0. Runs
# init_router_test_vars first if RPEER_INTERFACES is still empty.
# Globals written: ROUTER_NIDS, RPEER_NIDS, LNIDS.
# NOTE(review): the error handling after each "... ||" line below is
# not visible here.
2626 setup_router_test() {
2628 local rtr_net_opts="$2"
2630 (( $MDS1_VERSION >= $(version_code 2.15.0) )) ||
2631 skip "need at least 2.15.0 for load_lnet"
2633 if [[ ${#RPEER_INTERFACES[@]} -eq 0 ]]; then
2634 init_router_test_vars ||
2638 local all_nodes=$(comma_list $ROUTER $RPEER $HOSTNAME)
2640 do_nodes $all_nodes $LUSTRE_RMMOD ||
2641 error "failed to unload modules"
2643 mod_opts+=" alive_router_check_interval=5"
2644 mod_opts+=" router_ping_timeout=5"
2645 mod_opts+=" large_router_buffers=4"
2646 mod_opts+=" small_router_buffers=8"
2647 mod_opts+=" tiny_router_buffers=16"
2648 do_rpc_nodes $all_nodes load_lnet "${mod_opts}" ||
2649 error "Failed to load lnet"
2651 do_nodes $all_nodes "$LNETCTL lnet configure" ||
2652 error "Failed to initialize DLC"
2654 do_net_add $ROUTER $LOCAL_NET ${ROUTER_INTERFACES[0]} $rtr_net_opts ||
2657 do_net_add $ROUTER $REMOTE_NET ${ROUTER_INTERFACES[0]} ||
2660 do_net_add $RPEER $REMOTE_NET ${RPEER_INTERFACES[0]} ||
2663 add_net $LOCAL_NET ${INTERFACES[0]} ||
2666 ROUTER_NIDS=( $(do_node $ROUTER $LCTL list_nids 2>/dev/null |
2668 RPEER_NIDS=( $(do_node $RPEER $LCTL list_nids 2>/dev/null |
2670 LNIDS=( $($LCTL list_nids 2>/dev/null | xargs echo) )
# On $node, delete the route to $net via gateway $gw, but only when
# "lnetctl route show" for that net/gateway pair succeeds first.
# NOTE(review): the enclosing function header and the end of the quoted
# remote command are not visible here; presumably this is do_route_del.
2678 do_nodesv $node "if $LNETCTL route show --net $net --gateway $gw; then \
2679 $LNETCTL route del --net $net --gateway $gw; \
# Tear down the routing test configuration.
# Deletes the $REMOTE_NET route (gateway ROUTER_NIDS[0]) on the local host
# and the $LOCAL_NET route (gateway ROUTER_NIDS[1]) on $RPEER, then runs
# $LUSTRE_RMMOD on the local host, the router and the remote peer.
# Each step reports failure through error.
2685 cleanup_router_test() {
2686 local all_nodes=$(comma_list $HOSTNAME $ROUTER $RPEER)
2688 do_route_del $HOSTNAME $REMOTE_NET ${ROUTER_NIDS[0]} ||
2689 error "Failed to delete $REMOTE_NET route"
2691 do_route_del $RPEER $LOCAL_NET ${ROUTER_NIDS[1]} ||
2692 error "Failed to delete $LOCAL_NET route"
2694 do_nodes $all_nodes $LUSTRE_RMMOD ||
2695 error "failed to unload modules"
# Verify that the route state reported on $node matches $expected.
# Callers invoke this as: check_route_aliveness <node> <up|down>.
# The state is read twice: column 7 of "lctl show_route" and the last
# field of the "state" lines of "lnetctl route show -v". Both must equal
# $expected. The check is retried for up to alive_router_check_interval
# iterations (value read from the local /sys/module/lnet/parameters);
# after the loop any remaining mismatch is reported through error.
# NOTE(review): the assignments of node/expected and the per-iteration
# wait are on lines not shown here.
2700 check_route_aliveness() {
2705 local lnetctl_actual
2709 chk_intvl=$(cat /sys/module/lnet/parameters/alive_router_check_interval)
2711 lctl_actual=$(do_node $node $LCTL show_route | awk '{print $7}')
2712 lnetctl_actual=$(do_node $node $LNETCTL route show -v |
2713 awk '/state/{print $NF}')
2715 for ((i = 0; i < $chk_intvl; i++)); do
2716 if [[ $lctl_actual == $expected ]] &&
2717 [[ $lnetctl_actual == $expected ]]; then
2721 echo "wait 1s for route state change"
2724 lctl_actual=$(do_node $node $LCTL show_route | awk '{print $7}')
2725 lnetctl_actual=$(do_node $node $LNETCTL route show -v |
2726 awk '/state/{print $NF}')
2729 [[ $lctl_actual != $expected ]] &&
2730 error "Wanted \"$expected\" lctl found \"$lctl_actual\""
2732 [[ $lnetctl_actual != $expected ]] &&
2733 error "Wanted \"$expected\" lnetctl found \"$lnetctl_actual\""
# Verify the status of the router's NIs on $LOCAL_NET and $REMOTE_NET.
# Arguments: $1 - expected status of the router NI on $LOCAL_NET
#            $2 - expected status of the router NI on $REMOTE_NET
# The status is the last field of the "status" lines printed by
# "lnetctl net show --net <net>" on $ROUTER. The check is retried for up to
# alive_router_check_interval + router_ping_timeout iterations (both read
# from the local /sys/module/lnet/parameters); a mismatch that remains
# after the loop is reported through error.
2738 check_router_ni_status() {
2739 local expected_local="$1"
2740 local expected_remote="$2"
2748 chk_intvl=$(cat /sys/module/lnet/parameters/alive_router_check_interval)
2749 timeout=$(cat /sys/module/lnet/parameters/router_ping_timeout)
2751 actual_local=$(do_node $ROUTER "$LNETCTL net show --net $LOCAL_NET" |
2752 awk '/status/{print $NF}')
2753 actual_remote=$(do_node $ROUTER "$LNETCTL net show --net $REMOTE_NET" |
2754 awk '/status/{print $NF}')
2756 for ((i = 0; i < $((chk_intvl + timeout)); i++)); do
2757 if [[ $actual_local == $expected_local ]] &&
2758 [[ $actual_remote == $expected_remote ]]; then
2762 echo "wait 1s for NI state change"
2765 actual_local=$(do_node $ROUTER \
2766 "$LNETCTL net show --net $LOCAL_NET" |
2767 awk '/status/{print $NF}')
2768 actual_remote=$(do_node $ROUTER \
2769 "$LNETCTL net show --net $REMOTE_NET" |
2770 awk '/status/{print $NF}')
2773 [[ $actual_local == $expected_local ]] ||
2774 error "$LOCAL_NET should be $expected_local"
2776 [[ $actual_remote == $expected_remote ]] ||
2777 error "$REMOTE_NET should be $expected_remote"
# Basic routed-connectivity check shared by the router tests.
# Enables routing on $ROUTER, adds the $REMOTE_NET route on the local host
# (gateway ROUTER_NIDS[0]) and the $LOCAL_NET route on $RPEER (gateway
# ROUTER_NIDS[1]), expects both routes to be reported "up", then pings
# RPEER_NIDS[0] from the local host and LNIDS[0] from $RPEER.
2782 do_basic_rtr_test() {
2783 do_node $ROUTER "$LNETCTL set routing 1" ||
2784 error "Unable to enable routing on $ROUTER"
2786 do_route_add $HOSTNAME $REMOTE_NET ${ROUTER_NIDS[0]} ||
2789 do_route_add $RPEER $LOCAL_NET ${ROUTER_NIDS[1]} ||
2792 check_route_aliveness "$HOSTNAME" "up" ||
2795 check_route_aliveness "$RPEER" "up" ||
2798 do_lnetctl ping ${RPEER_NIDS[0]} ||
2799 error "Failed to ping ${RPEER_NIDS[0]}"
2801 do_node $RPEER "$LNETCTL ping ${LNIDS[0]}" ||
2802 error "$RPEER failed to ping ${LNIDS[0]}"
# Test 220: set up the router topology with default options, run the basic
# routing check, load lnet_selftest on the local host and $RPEER, run
# $LSTSH twice in "rw" mode (once with -s 4k, once without a size
# argument), then clean up.
2808 setup_router_test || return $?
2810 do_basic_rtr_test || return $?
2812 do_rpc_nodes $HOSTNAME,$RPEER load_module ../lnet/selftest/lnet_selftest ||
2813 error "Failed to load lnet-selftest module"
2815 $LSTSH -H -t $HOSTNAME -f $RPEER -m rw -s 4k ||
2818 $LSTSH -H -t $HOSTNAME -f $RPEER -m rw ||
2821 cleanup_router_test || return $?
2823 run_test 220 "Add routes w/default options - check aliveness"
# Test 221: same basic routing check as test 220, but with
# lnet_peer_discovery_disabled=1 passed to setup_router_test.
2826 setup_router_test lnet_peer_discovery_disabled=1 || return $?
2828 do_basic_rtr_test || return $?
2830 cleanup_router_test || return $?
2832 run_test 221 "Add routes w/DD disabled - check aliveness"
# Scenario used by the avoid_asym_router_failure=1 tests (222 and 223).
# Sequence of expectations:
#   1. routing enabled, no routes: both router NIs "down", pings in both
#      directions fail;
#   2. local route added: router NI status "up"/"down", local route still
#      "down", pings still fail;
#   3. symmetric route added on $RPEER: both NIs "up", both routes "up",
#      pings succeed in both directions;
#   4. LNet unconfigured on the local host: router NI status "down"/"up",
#      route on $RPEER "down", pings fail again.
2834 do_aarf_enabled_test() {
2835 do_node $ROUTER "$LNETCTL set routing 1" ||
2836 error "Unable to enable routing on $ROUTER"
2838 check_router_ni_status "down" "down"
2840 do_lnetctl ping ${RPEER_NIDS[0]} &&
2841 error "Ping should fail"
2843 do_node $RPEER "$LNETCTL ping ${LNIDS[0]}" &&
2844 error "$RPEER ping should fail"
2846 # Adding a route should cause the router's NI on LOCAL_NET to get up
2847 do_route_add $HOSTNAME $REMOTE_NET ${ROUTER_NIDS[0]} ||
2850 check_router_ni_status "up" "down" ||
2853 # But route should still be down because of avoid_asym_router_failure
2854 check_route_aliveness "$HOSTNAME" "down" ||
2857 do_lnetctl ping ${RPEER_NIDS[0]} &&
2858 error "Ping should fail"
2860 do_node $RPEER "$LNETCTL ping ${LNIDS[0]}" &&
2861 error "$RPEER ping should fail"
2863 # Adding the symmetric route should cause the remote NI to go up and
2865 do_route_add $RPEER $LOCAL_NET ${ROUTER_NIDS[1]} ||
2868 check_router_ni_status "up" "up" ||
2871 check_route_aliveness "$HOSTNAME" "up" ||
2874 check_route_aliveness "$RPEER" "up" ||
2877 do_lnetctl ping ${RPEER_NIDS[0]} ||
2878 error "Failed to ping ${RPEER_NIDS[0]}"
2880 do_node $RPEER "$LNETCTL ping ${LNIDS[0]}" ||
2881 error "$RPEER failed to ping ${LNIDS[0]}"
2883 # Stop LNet on local host
2884 do_lnetctl lnet unconfigure ||
2885 error "Failed to stop LNet rc=$?"
2887 check_router_ni_status "down" "up" ||
2890 check_route_aliveness "$RPEER" "down" ||
2893 do_lnetctl ping ${RPEER_NIDS[0]} &&
2894 error "Ping should fail"
2896 do_node $RPEER "$LNETCTL ping ${LNIDS[0]}" &&
2897 error "$RPEER ping should fail"
# Test 222: run the avoid_asym_router_failure-enabled scenario with
# avoid_asym_router_failure=1 passed to setup_router_test.
2903 setup_router_test avoid_asym_router_failure=1 || return $?
2905 do_aarf_enabled_test || return $?
2907 cleanup_router_test || return $?
2909 run_test 222 "Check avoid_asym_router_failure=1"
# Test 223: same scenario as test 222, with peer discovery additionally
# disabled (lnet_peer_discovery_disabled=1). Both options are passed to
# setup_router_test as a single quoted argument.
2912 local opts="avoid_asym_router_failure=1 lnet_peer_discovery_disabled=1"
2914 setup_router_test "$opts" || return $?
2916 do_aarf_enabled_test || return $?
2918 cleanup_router_test || return $?
2920 run_test 223 "Check avoid_asym_router_failure=1 w/DD disabled"
# Scenario used by the avoid_asym_router_failure=0 tests (224 and 225).
# Unlike the enabled scenario, the local route is expected to be "up" as
# soon as it is added (router NI status "up"/"down"), and the route on
# $RPEER is expected to stay "up" after LNet is unconfigured on the local
# host (router NI status "down"/"up").
2922 do_aarf_disabled_test() {
2923 do_node $ROUTER "$LNETCTL set routing 1" ||
2924 error "Unable to enable routing on $ROUTER"
2926 check_router_ni_status "down" "down"
2928 do_route_add $HOSTNAME $REMOTE_NET ${ROUTER_NIDS[0]} ||
2931 check_router_ni_status "up" "down" ||
2934 check_route_aliveness "$HOSTNAME" "up" ||
2937 do_route_add $RPEER $LOCAL_NET ${ROUTER_NIDS[1]} ||
2940 check_router_ni_status "up" "up" ||
2943 check_route_aliveness "$HOSTNAME" "up" ||
2946 check_route_aliveness "$RPEER" "up" ||
2949 do_lnetctl ping ${RPEER_NIDS[0]} ||
2950 error "Failed to ping ${RPEER_NIDS[0]}"
2952 do_node $RPEER "$LNETCTL ping ${LNIDS[0]}" ||
2953 error "$RPEER failed to ping ${LNIDS[0]}"
2955 # Stop LNet on local host
2956 do_lnetctl lnet unconfigure ||
2957 error "Failed to stop LNet rc=$?"
2959 check_router_ni_status "down" "up" ||
2962 check_route_aliveness "$RPEER" "up" ||
# Test 224: run the avoid_asym_router_failure-disabled scenario with
# avoid_asym_router_failure=0 passed to setup_router_test.
2969 setup_router_test avoid_asym_router_failure=0 ||
2972 do_aarf_disabled_test ||
2975 cleanup_router_test ||
2978 run_test 224 "Check avoid_asym_router_failure=0"
# Test 225: same scenario as test 224, with peer discovery additionally
# disabled (lnet_peer_discovery_disabled=1).
2981 local opts="avoid_asym_router_failure=0 lnet_peer_discovery_disabled=1"
2983 setup_router_test "$opts" || return $?
2985 do_aarf_disabled_test || return $?
2987 cleanup_router_test ||
2990 run_test 225 "Check avoid_asym_router_failure=0 w/DD disabled"
# Scenario used by the router peer health tests (226 and 227).
# Brings up routing in both directions, verifies connectivity, stops LNet
# on the local host, and then has $RPEER ping the (now down) local NID.
# The ping must fail. The router's drop_count on $LOCAL_NET is sampled
# before and after that ping and the difference must equal $expected.
# NOTE(review): $expected is assigned on a line not shown here - callers
# pass a single argument (1 or 0), so presumably it is $1; confirm.
2992 do_rtr_peer_health_test() {
2995 do_node $ROUTER "$LNETCTL set routing 1" ||
2996 error "Unable to enable routing on $ROUTER"
2998 do_route_add $HOSTNAME $REMOTE_NET ${ROUTER_NIDS[0]} ||
3001 do_route_add $RPEER $LOCAL_NET ${ROUTER_NIDS[1]} ||
3004 check_router_ni_status "up" "up" ||
3007 check_route_aliveness "$HOSTNAME" "up" ||
3010 check_route_aliveness "$RPEER" "up" ||
3013 do_lnetctl ping ${RPEER_NIDS[0]} ||
3014 error "Failed to ping ${RPEER_NIDS[0]}"
3016 do_node $RPEER "$LNETCTL ping ${LNIDS[0]}" ||
3017 error "$RPEER failed to ping ${LNIDS[0]}"
3019 # Stop LNet on local host
3020 do_lnetctl lnet unconfigure ||
3021 error "Failed to stop LNet rc=$?"
3023 check_router_ni_status "down" "up" ||
3026 check_route_aliveness "$RPEER" "up" ||
3029 # The NI used to send the message to the destination will be the
3030 # router's NI on LOCAL_NET, so that's the drop count that will be
3032 local d1=$(do_node $ROUTER $LNETCTL net show -v --net $LOCAL_NET | \
3033 awk '/drop_count:/{print $NF}')
3035 # Ping from RPEER to local host should be dropped by the router
3036 do_node $RPEER "$LCTL ping ${LNIDS[0]}" &&
3037 error "$RPEER expected ping to fail"
3039 local d2=$(do_node $ROUTER $LNETCTL net show -v --net $LOCAL_NET | \
3040 awk '/drop_count:/{print $NF}')
3042 [[ $((d2 - d1)) -ne $expected ]] &&
3043 error "Expected drop count change by $expected: $d1 -> $d2"
# Test 226: router peer health enabled. The router's $LOCAL_NET NI is
# created with --peer-timeout=10 (second argument of setup_router_test)
# and the health scenario is run with an expected drop-count delta of 1.
3049 setup_router_test avoid_asym_router_failure=0 --peer-timeout=10 ||
3052 do_rtr_peer_health_test 1 ||
3055 cleanup_router_test ||
3058 run_test 226 "Check router peer health enabled"
# Test 227: router peer health disabled. The router's $LOCAL_NET NI is
# created with --peer-timeout=0 and the health scenario is run with an
# expected drop-count delta of 0.
3061 setup_router_test avoid_asym_router_failure=0 --peer-timeout=0 ||
3064 do_rtr_peer_health_test 0 ||
3067 cleanup_router_test ||
3070 run_test 227 "Check router peer health disabled"
# Test 230: exercise "lnetctl net set --all --conns-per-peer" (tcp only).
#   - For each value from 4 through 15: re-initialise DLC, add a tcp net,
#     set conns-per-peer, confirm it shows up in "lnetctl net show -v 1",
#     ping the first local NID and wait for the tcp connection list to
#     contain 2 + 2 * conns_per_peer entries for that NID.
#   - Setting 128 must be rejected.
#   - Setting -1 must succeed while leaving conns_per_peer at the value
#     that was reported before the call.
3073 [[ ${NETTYPE} == tcp* ]] ||
3074 skip "Need tcp NETTYPE"
3076 echo "Check valid values; Should succeed"
3080 for ((i = 4; i < 16; i+=1)); do
3081 reinit_dlc || return $?
3082 add_net "tcp" "${INTERFACES[0]}" || return $?
3083 do_lnetctl net set --all --conns-per-peer $i ||
3084 error "should have succeeded $?"
3085 $LNETCTL net show -v 1 | grep -q "conns_per_peer: $i" ||
3086 error "failed to set conns-per-peer to $i"
3087 lnid="$(lctl list_nids | head -n 1)"
3088 do_lnetctl ping "$lnid" ||
3089 error "failed to ping myself"
3091 # "lctl --net tcp conn_list" prints the list of active
3092 # connections. Since we're pinging ourselves, there should be
3093 # 2 Control connections plus 2*conns_per_peer connections
3094 # created (one Bulk Input, one Bulk Output in each pair).
3095 # Here's the sample output for conns_per_peer set to 1:
3096 # 12345-1.1.1.1@tcp I[0]host01->host01:988 2626560/1061296 nonagle
3097 # 12345-1.1.1.1@tcp O[0]host01->host01:1022 2626560/1061488 nonagle
3098 # 12345-1.1.1.1@tcp C[0]host01->host01:988 2626560/1061296 nonagle
3099 # 12345-1.1.1.1@tcp C[0]host01->host01:1023 2626560/1061488 nonagle
3100 cmd="printf 'network tcp\nconn_list\n' | lctl | grep -c '$lnid'"
3102 # Expect 2+conns_per_peer*2 connections. Wait no longer
3104 wait_update $HOSTNAME "$cmd" "$((2+i*2))" 2 ||
3105 error "expected number of tcp connections $((2+i*2))"
3108 reinit_dlc || return $?
3109 add_net "tcp" "${INTERFACES[0]}" || return $?
3110 echo "Set > 127; Should fail"
3111 do_lnetctl net set --all --conns-per-peer 128 &&
3112 error "should have failed $?"
3114 reinit_dlc || return $?
3115 add_net "tcp" "${INTERFACES[0]}" || return $?
3117 local default=$($LNETCTL net show -v 1 |
3118 awk '/conns_per_peer/{print $NF}')
3120 echo "Set < 0; Should be ignored"
3121 do_lnetctl net set --all --conns-per-peer -1 ||
3122 error "should have succeeded $?"
3123 $LNETCTL net show -v 1 | grep -q "conns_per_peer: ${default}" ||
3124 error "Did not stay at default"
3126 run_test 230 "Test setting conns-per-peer"
# Test 231: DLC handling of the peer_timeout parameter.
#   1. Add net ${NETTYPE}231, export the config as the "expected" YAML and
#      rewrite peer_timeout to 0 in it.
#   2. Re-initialise, import that YAML, export the "actual" YAML and
#      compare (peer_timeout: 0 must survive an import).
#   3. Delete the net, re-add it with --peer-timeout=0, export and compare
#      again.
#   4. Re-initialise, set expected peer_timeout to 180, strip every
#      peer_timeout line from the actual YAML, import it and export again.
#      NOTE(review): the final comparison is on lines not shown here.
3129 reinit_dlc || return $?
3131 local net=${NETTYPE}231
3133 do_lnetctl net add --net $net --if ${INTERFACES[0]} ||
3134 error "Failed to add net"
3136 $LNETCTL export --backup > $TMP/sanity-lnet-$testnum-expected.yaml
3137 sed -i 's/peer_timeout: .*$/peer_timeout: 0/' \
3138 $TMP/sanity-lnet-$testnum-expected.yaml
3140 reinit_dlc || return $?
3142 do_lnetctl import $TMP/sanity-lnet-$testnum-expected.yaml ||
3143 error "Failed to import configuration"
3145 $LNETCTL export --backup > $TMP/sanity-lnet-$testnum-actual.yaml
3147 compare_yaml_files || error "Wrong config after import"
3149 do_lnetctl net del --net $net --if ${INTERFACES[0]} ||
3150 error "Failed to delete net $net"
3152 do_lnetctl net add --net $net --if ${INTERFACES[0]} --peer-timeout=0 ||
3153 error "Failed to add net with peer-timeout=0"
3155 $LNETCTL export --backup > $TMP/sanity-lnet-$testnum-actual.yaml
3157 compare_yaml_files || error "Wrong config after lnetctl net add"
3159 reinit_dlc || return $?
3161 # lnet/include/lnet/lib-lnet.h defines DEFAULT_PEER_TIMEOUT 180
3162 sed -i 's/peer_timeout: .*$/peer_timeout: 180/' \
3163 $TMP/sanity-lnet-$testnum-expected.yaml
3165 sed -i '/^.*peer_timeout:.*$/d' $TMP/sanity-lnet-$testnum-actual.yaml
3167 do_lnetctl import $TMP/sanity-lnet-$testnum-actual.yaml ||
3168 error "Failed to import config without peer_timeout"
3170 $LNETCTL export --backup > $TMP/sanity-lnet-$testnum-actual.yaml
3174 run_test 231 "Check DLC handling of peer_timeout parameter"
3176 ### Test that linux route is added for each ni
# Test 250 (tcp only): after adding a tcp net on INTERFACES[0], skip when
# the ksocklnd skip_mr_route_setup parameter is non-zero; otherwise the
# routing table named after the interface must mention that interface.
3180 [[ ${NETTYPE} == tcp* ]] ||
3181 skip "Need tcp NETTYPE"
3182 reinit_dlc || return $?
3183 add_net "tcp" "${INTERFACES[0]}" || return $?
3185 skip_param=$(cat /sys/module/ksocklnd/parameters/skip_mr_route_setup)
3186 [[ ${skip_param:-0} -ne 0 ]] &&
3187 skip "Need skip_mr_route_setup=0 found $skip_param"
3189 ip route show table ${INTERFACES[0]} | grep -q "${INTERFACES[0]}"
3191 run_test 250 "test that linux routes are added"
# Test 251 (kfi only): three kfi networks (kfi, kfi1, kfi10) must be
# configurable on the same interface, INTERFACES[0].
# The NETTYPE guard uses a glob match (== kfi*), like the tcp guards
# elsewhere in this file. The previous "=~ kfi*" was a regular-expression
# match, where "kfi*" means "kf" followed by zero or more "i" anywhere in
# the string, so it accepted values that are not kfi network types.
3194 [[ ${NETTYPE} == kfi* ]] ||
3195 skip "Need kfi NETTYPE"
3197 reinit_dlc || return $?
3198 add_net "kfi" "${INTERFACES[0]}" || return $?
3199 add_net "kfi1" "${INTERFACES[0]}" || return $?
3200 add_net "kfi10" "${INTERFACES[0]}" || return $?
3203 run_test 251 "Define multiple kfi networks on single interface"
# Test 252: a ping to a peer whose modules have been unloaded must fail
# before its own timeout expires.
# After setup_health_test, modules are unloaded on $RNODE (on failure the
# health test is cleaned up and an error raised). A ping to RNIDS[0] with
# --timeout 15 is then expected to fail, and the wall-clock time spent
# (in whole seconds, from date +%s) must be less than 15.
3206 setup_health_test false || return $?
3210 do_rpc_nodes $RNODE unload_modules_local || rc=$?
3212 if [[ $rc -ne 0 ]]; then
3213 cleanup_health_test || return $?
3215 error "Failed to unload modules on $RNODE rc=$rc"
3220 local ts1=$(date +%s)
3222 do_lnetctl ping --timeout 15 ${RNIDS[0]} &&
3223 error "Expected ping ${RNIDS[0]} to fail"
3225 local ts2=$(date +%s)
3227 local delta=$(echo "$ts2 - $ts1" | bc)
3229 [[ $delta -lt 15 ]] ||
3230 error "Ping took longer than expected to fail: $delta"
3234 run_test 252 "Ping to down peer should unlink quickly"
# Scenario shared by tests 253 and 254: a message that stays queued past
# its deadline must be dropped.
# Visible steps:
#   - remember the current transaction_timeout and set it to ${tto};
#   - add an LNet delay rule for GET messages (latency ${delay}, rate 1)
#     for every local-NID/remote-NID pair;
#   - read the max_ni_tx_credits values of peer RNIDS[0] into pcs and
#     require one value per remote NID;
#   - for each remote NID, start pcs[i] background pings using --source
#     and a timeout of delay+2 seconds, then start one more ping to
#     RNIDS[0] that is expected to queue on peer NI tx credits;
#   - remove all delay rules, sum the dropped GET statistics of RNIDS[0]
#     and require the sum to be exactly 1;
#   - restore the saved transaction_timeout.
# NOTE(review): the definitions of tto and delay, and any waits between
# the steps, are on lines not shown here.
3236 do_expired_message_drop_test() {
3237 local rnid lnid old_tto
3239 old_tto=$($LNETCTL global show |
3240 awk '/transaction_timeout:/{print $NF}')
3242 [[ -z $old_tto ]] &&
3243 error "Cannot determine LNet transaction timeout"
3247 do_lnetctl set transaction_timeout "${tto}" ||
3248 error "Failed to set transaction_timeout"
3250 # We want to consume all peer credits for at least transaction_timeout
3256 for lnid in "${LNIDS[@]}"; do
3257 for rnid in "${RNIDS[@]}"; do
3258 $LCTL net_delay_add -s "${lnid}" -d "${rnid}" \
3259 -l "${delay}" -r 1 -m GET
3265 pcs=( $($LNETCTL peer show -v --nid "${RNIDS[0]}" |
3266 awk '/max_ni_tx_credits:/{print $NF}' |
3269 [[ ${#RNIDS[@]} -ne ${#pcs[@]} ]] &&
3270 error "Expect ${#RNIDS[@]} peer credit values found ${#pcs[@]}"
3272 local rnet lnid lnet i j
3274 # Need to use --source for multi-rail configs to ensure we consume
3275 # all available peer credits
3276 for ((i = 0; i < ${#RNIDS[@]}; i++)); do
3277 local ping_args="--timeout $((delay+2))"
3279 rnet=${RNIDS[i]##*@}
3280 for lnid in ${LNIDS[@]}; do
3282 [[ $rnet == $lnet ]] && break
3285 ping_args+=" --source ${lnid} ${RNIDS[i]}"
3286 for j in $(seq 1 "${pcs[i]}"); do
3287 $LNETCTL ping ${ping_args} 1>/dev/null &
3290 echo "Issued ${pcs[i]} pings to ${RNIDS[i]} from $lnid"
3293 # This ping should be queued on peer NI tx credit
3294 $LNETCTL ping --timeout $((delay+2)) "${RNIDS[0]}" &
3298 $LCTL net_delay_del -a
3302 # Messages sent from the delay list do not go through
3303 # lnet_post_send_locked(), thus we should only have a single drop
3306 dropped=$($LNETCTL peer show -v 2 --nid "${RNIDS[0]}" |
3307 grep -A 2 dropped_stats |
3308 awk '/get:/{print $2}' |
3310 sed 's/ /\+/g' | bc)
3312 [[ $dropped -ne 1 ]] &&
3313 error "Expect 1 dropped GET but found $dropped"
3315 do_lnetctl set transaction_timeout "${old_tto}"
# Test 253: expired-message drop scenario after "setup_health_test false"
# (the single-rail variant, per the test description).
3321 setup_health_test false || return $?
3323 do_expired_message_drop_test || return $?
3327 run_test 253 "Message delayed beyond deadline should be dropped (single-rail)"
# Test 254: expired-message drop scenario after "setup_health_test true"
# (the multi-rail variant, per the test description).
3330 setup_health_test true || return $?
3332 do_expired_message_drop_test || return $?
3336 run_test 254 "Message delayed beyond deadline should be dropped (multi-rail)"
# Test 255 (tcp only): load LNet with a "routes" module parameter that
# uses a bracketed host range, ${IF0_NET}.[GW_HOSTNUM-(GW_HOSTNUM+4)],
# for the o2ib network, together with a "networks" parameter for
# INTERFACES[0]. An expected YAML file is then built with one route entry
# and one peer entry for each of the five gateway addresses, the running
# configuration is exported as the actual YAML, and validate_gateway_nids
# is called.
# NOTE(review): the here-document bodies below are only partly visible.
3339 [[ ${NETTYPE} == tcp* ]] || skip "Need tcp NETTYPE"
3341 reinit_dlc || return $?
3343 cleanup_lnet || return $?
3345 local routes_str="o2ib ${IF0_NET}.[$GW_HOSTNUM-$((GW_HOSTNUM+4))]"
3346 local network_str="${NETTYPE}(${INTERFACES[0]})"
3348 load_lnet "networks=\"${network_str}\" routes=\"${routes_str}\"" ||
3349 error "Failed to load LNet"
3352 error "Failed to load LNet with networks=\"${network_str}\" routes=\"${routes_str}\""
3354 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
3356 - net type: ${NETTYPE}
3361 append_net_tunables tcp
3363 echo "route:" >> $TMP/sanity-lnet-$testnum-expected.yaml
3364 for i in $(seq $GW_HOSTNUM $((GW_HOSTNUM + 4))); do
3365 cat <<EOF >> $TMP/sanity-lnet-$testnum-expected.yaml
3367 gateway: ${IF0_NET}.${i}@${NETTYPE}
3370 health_sensitivity: 1
3374 echo "peer:" >> $TMP/sanity-lnet-$testnum-expected.yaml
3375 for i in $(seq $GW_HOSTNUM $((GW_HOSTNUM + 4))); do
3376 cat <<EOF >> $TMP/sanity-lnet-$testnum-expected.yaml
3377 - primary nid: ${IF0_NET}.${i}@${NETTYPE}
3380 - nid: ${IF0_NET}.${i}@${NETTYPE}
3386 $LNETCTL export --backup > $TMP/sanity-lnet-$testnum-actual.yaml ||
3387 error "export failed $?"
3389 validate_gateway_nids
3391 run_test 255 "Use lnet routes param with pdsh syntax"
# Test 300: every LNet UAPI header must compile on its own.
# Each *.h file under $prefix is force-included (-include) into an empty
# C translation unit (/dev/null) built with -Wall -Werror -std=c99.
# $prefix is /usr/include/linux/lnet when that directory exists; otherwise
# the in-tree lnet/include/uapi/linux/lnet directory is used and its
# parent is added to the include path. The test is skipped when $CC
# cannot be found.
# NOTE(review): a cleanup_lnet failure here calls "exit 1" rather than
# error/return as the other tests do - confirm this is intended.
3396 local out=$TMP/$tfile
3397 local prefix=/usr/include/linux/lnet
3399 # We use a hard coded prefix so that this test will not fail
3402 if ! which $CC > /dev/null 2>&1; then
3403 skip_env "$CC is not installed"
3406 cleanup_lnet || exit 1
3409 local cc_args="-Wall -Werror -std=c99 -c -x c /dev/null -o $out"
3410 if ! [[ -d $prefix ]]; then
3411 # Assume we're running in tree and fixup the include path.
3412 prefix=$LUSTRE/../lnet/include/uapi/linux/lnet
3413 cc_args+=" -I $LUSTRE/../lnet/include/uapi"
3416 for header in $prefix/*.h; do
3417 if ! [[ -f "$header" ]]; then
3421 echo "$CC $cc_args -include $header"
3422 $CC $cc_args -include $header ||
3423 error "cannot compile '$header'"
3427 run_test 300 "packaged LNet UAPI headers can be compiled"
3429 # LU-16081 lnet: Memory leak on adding existing interface
# Test 301: adding ${NETTYPE} on INTERFACES[0] must succeed the first
# time and fail when repeated for the same net/interface; the net is then
# deleted again.
3432 reinit_dlc || return $?
3433 do_lnetctl net add --net ${NETTYPE} --if ${INTERFACES[0]} ||
3434 error "Failed to add net"
3435 do_lnetctl net add --net ${NETTYPE} --if ${INTERFACES[0]} &&
3436 error "add net should have failed"
3437 do_lnetctl net del --net ${NETTYPE} --if ${INTERFACES[0]} ||
3438 error "Failed to del net"
3441 run_test 301 "Check for dynamic adds of same/wrong interface (memory leak)"
# Test 302 (NETTYPE must contain tcp or o2ib): after adding the net on
# INTERFACES[0], ping the local NID reported by "lctl list_nids" and then
# run "lnetctl debug peer --nid" against it. Both commands must succeed.
3444 ! [[ $NETTYPE =~ (tcp|o2ib) ]] && skip "Need tcp or o2ib NETTYPE"
3445 reinit_dlc || return $?
3447 add_net "${NETTYPE}" "${INTERFACES[0]}" || return $?
3449 local nid=$($LCTL list_nids)
3451 do_lnetctl ping ${nid} ||
3452 error "pinging self failed $?"
3453 do_lnetctl debug peer --nid ${nid} ||
3454 error "failed to dump peer debug info $?"
3456 run_test 302 "Check that peer debug info can be dumped"
# Test 303 (tcp only): peer NI health must drop after a link goes down.
#   - Create the fake interface $FAKE_IF and add it as net ${NETTYPE}99.
#   - The last local NID (the new one) must be known on $RNODE, listed
#     under a peer whose primary NID is the first local NID.
#   - Bring $FAKE_IF down and expect the NI status to become "down".
#   - On $RNODE, the health value reported for that NID must be below 1000.
3459 [[ ${NETTYPE} == tcp* ]] || skip "Need tcp NETTYPE"
3461 setup_health_test true || return $?
3463 cleanup_netns || error "Failed to cleanup netns before test execution"
3464 setup_fakeif || error "Failed to add fake IF"
3465 have_interface "$FAKE_IF" ||
3466 error "Expect $FAKE_IF configured but not found"
3468 add_net "${NETTYPE}99" "$FAKE_IF" || return $?
3470 local nid=$($LCTL list_nids | tail --lines 1)
3472 # Our updated config should be pushed to RNODE
3473 local found=$(do_node $RNODE "$LNETCTL peer show --nid $nid")
3475 [[ -z $found ]] && error "Peer not updated on $RNODE"
3477 local prim=$($LCTL list_nids | head -n 1)
3479 if ! grep -q -- "- primary nid: $prim"<<<"${found}"; then
3481 error "Wrong primary nid"
3484 echo "Set $FAKE_IF down"
3485 echo "ip link set dev $FAKE_IF down"
3486 ip link set dev $FAKE_IF down
3487 check_ni_status "$nid" down
3489 local hval=$(do_node $RNODE "$LNETCTL peer show --nid $nid -v 2 | \
3490 grep -e '- nid:' -e 'health value:'")
3492 hval=$(grep -A 1 $nid<<<"$hval" | tail -n 1 | awk '{print $NF}')
3493 (( hval < 1000 )) ||
3494 error "Expect $hval < 1000"
3498 run_test 303 "Check peer NI health after link down"
# Return 0 only when exactly one line of "lnetctl net show -v" output
# matches "<para>: <value>" (counted with grep -c); otherwise return 1.
# NOTE(review): the function header, the assignments of para/value and
# part of the pipeline are not visible here; presumably this is
# check_parameter, which test 310 calls with a parameter name and value.
3505 echo "check parameter ${para} value ${value}"
3507 return $(( $(do_lnetctl net show -v | \
3509 grep -c "^ \+${para}: ${value}$") != 1 ))
# Reload LNet with a statically configured LND module: clean up LNet,
# load libcfs and lnet, load ${module} with the option string ${setting},
# then run "lnetctl lnet configure --all".
# NOTE(review): the function header and the assignments of module/setting
# are not visible here; presumably this is static_config, which test 310
# calls with a module path and a "name=value" setting.
3516 cleanup_lnet || error "Failed to cleanup LNet"
3518 load_module ../libcfs/libcfs/libcfs ||
3519 error "Failed to load module libcfs rc = $?"
3521 load_module ../lnet/lnet/lnet ||
3522 error "Failed to load module lnet rc = $?"
3524 echo "loading ${module} ${setting} type ${NETTYPE}"
3525 load_module "${module}" "${setting}" ||
3526 error "Failed to load module ${module} rc = $?"
3528 do_lnetctl lnet configure --all || error "lnet configure failed rc = $?"
# Test 310: load the LND module matching NETTYPE (ksocklnd for tcp*,
# ko2iblnd for o2ib*, kgnilnd for gni*) through static_config with a
# timeout-related module option, skip for any other NETTYPE, and then
# check that the "timeout" parameter reported for the net equals $value.
# NOTE(review): the assignment of value and the o2ib/gni option strings
# are on lines not shown here.
3536 if [[ ${NETTYPE} == tcp* ]];then
3537 static_config "../lnet/klnds/socklnd/ksocklnd" \
3538 "sock_timeout=${value}"
3539 elif [[ ${NETTYPE} == o2ib* ]]; then
3540 static_config "../lnet/klnds/o2iblnd/ko2iblnd" \
3542 elif [[ ${NETTYPE} == gni* ]]; then
3543 static_config "../lnet/klnds/gnilnd/kgnilnd" \
3546 skip "NETTYPE ${NETTYPE} not supported"
3549 check_parameter "timeout" $value
3553 run_test 310 "Set timeout and verify"
# Check the UDSP priorities reported by "lnetctl <type> show -v 5".
# Arguments: $1 - target_net: when non-empty, compared against the net
#                 part (text after '@') of each NID
#            $2 - target_nid: when non-empty, compared against each NID
#            $3 - expect_net: expected "net priority" value
#            $4 - expect_nid: expected "nid priority" value
# The wrappers below pass "peer" or "net" as a fifth argument, which is
# used as ${type}. The NID list and both priority lists are collected
# from the show output and must have the same length. For each checked
# NID, the net and nid priorities are compared numerically against the
# expected values and a mismatch is reported through error.
# NOTE(review): the action taken when a NID does not match the filter is
# on lines not shown here - presumably the entry is skipped; confirm.
3557 local target_net="${1}"
3558 local target_nid="${2}"
3559 local expect_net="${3}"
3560 local expect_nid="${4}"
3564 declare -a net_prios
3565 declare -a nid_prios
3567 nids=( $($LNETCTL ${type} show -v 5 | awk '/- nid:/{print $NF}' |
3570 net_prios=( $($LNETCTL ${type} show -v 5 |
3571 awk '/net priority:/{print $NF}' | xargs echo) )
3573 nid_prios=( $($LNETCTL ${type} show -v 5 |
3574 awk '/nid priority:/{print $NF}' | xargs echo) )
3576 (( ${#nids[@]} != ${#net_prios[@]} )) &&
3577 error "Wrong # net prios ${#nids[@]} != ${#net_prios[@]}"
3579 (( ${#nids[@]} != ${#nid_prios[@]} )) &&
3580 error "Wrong # nid prios ${#nids[@]} != ${#nid_prios[@]}"
3584 for ((i = 0; i < ${#nids[@]}; i++)); do
3585 [[ -n ${target_net} ]] &&
3586 [[ ${nids[i]##*@} != "${target_net}" ]] &&
3588 [[ -n ${target_nid} ]] &&
3589 [[ ${nids[i]} != "${target_nid}" ]] &&
3592 echo "${nids[i]}: net_prio ${net_prios[i]} expect ${expect_net}"
3593 (( net_prios[i] != expect_net )) &&
3594 error "Wrong net priority \"${net_prios[i]}\" expect ${expect_net}"
3596 echo "${nids[i]}: nid_prio ${nid_prios[i]} expect ${expect_nid}"
3597 (( nid_prios[i] != expect_nid )) &&
3598 error "Wrong nid priority \"${nid_prios[i]}\" expect ${expect_nid}"
# Wrapper around check_udsp_prio that forwards its four arguments and
# selects the "peer" view.
3604 check_peer_udsp_prio() {
3605 check_udsp_prio "${1}" "${2}" "${3}" "${4}" "peer"
# Wrapper around check_udsp_prio that forwards its four arguments and
# selects the "net" view.
3608 check_net_udsp_prio() {
3609 check_udsp_prio "${1}" "${2}" "${3}" "${4}" "net"
# Test 400: a UDSP rule whose source net is given as bare "tcp" (no net
# number) must be accepted by "udsp add" and removable by index 0.
3613 reinit_dlc || return $?
3615 do_lnetctl udsp add --src tcp --priority 0 ||
3616 error "Failed to add udsp rule"
3617 do_lnetctl udsp del --idx 0 ||
3618 error "Failed to del udsp rule"
3621 run_test 400 "Check for udsp add/delete net rule without net num"
# Test 401: add ${NETTYPE} on INTERFACES[0], add a UDSP rule giving that
# destination net priority 1, discover the first local NID, then check
# that peer NIDs on ${NETTYPE} report net priority 1 and nid priority -1.
3624 reinit_dlc || return $?
3626 do_lnetctl net add --net ${NETTYPE} --if ${INTERFACES[0]} ||
3627 error "Failed to add net"
3629 do_lnetctl udsp add --dst ${NETTYPE} --prio 1 ||
3630 error "Failed to add peer net priority rule"
3632 do_lnetctl discover $($LCTL list_nids | head -n 1) ||
3633 error "Failed to discover peer"
3635 check_peer_udsp_prio "${NETTYPE}" "" "1" "-1"
3639 run_test 401 "Discover peer after adding peer net UDSP rule"
# Test 402: with a UDSP destination rule for net "kfi" in place, adding a
# peer whose primary NID is 402@kfi must succeed (per the test
# description, this guards against a panic).
3642 reinit_dlc || return $?
3644 do_lnetctl udsp add --dst kfi --priority 0 ||
3645 error "Failed to add UDSP rule"
3647 do_lnetctl peer add --prim 402@kfi ||
3648 error "Failed to add peer"
3652 run_test 402 "Destination net rule should not panic"