3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
11 # bug number for skipped test:
12 ALWAYS_EXCEPT="$SANITY_LNET_EXCEPT "
13 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
15 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
17 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
19 . $LUSTRE/tests/test-framework.sh
23 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
28 [[ -z $LNETCTL ]] && skip "Need lnetctl"
32 if is_mounted $MOUNT || is_mounted $MOUNT2; then
33 cleanupall || error "Failed cleanup prior to test execution"
38 echo "Cleaning up LNet"
39 lsmod | grep -q lnet &&
40 $LNETCTL lnet unconfigure 2>/dev/null
45 if module_loaded lnet ; then
46 cleanup_lnet || error "Failed to unload modules before test execution"
52 # Cleanup any tmp files created by the sub tests
53 rm -f $TMP/sanity-lnet-*.yaml $LNET_PARAMS_FILE
56 if $restore_mounts; then
57 setupall || error "Failed to setup Lustre after test execution"
58 elif $restore_modules; then
60 error "Couldn't load modules after test execution"
69 echo "ip netns exec $TESTNS $*"
70 ip netns exec $TESTNS "$@"
78 netns_arg="netns $netns"
80 ip link add 'test1pl' type veth peer name $FAKE_IF $netns_arg
81 ip link set 'test1pl' up
82 if [[ -n $netns ]]; then
83 do_ns ip addr add "${FAKE_IP}/31" dev $FAKE_IF
84 do_ns ip link set $FAKE_IF up
86 ip addr add "${FAKE_IP}/31" dev $FAKE_IF
87 ip link set $FAKE_IF up
92 ip link show test1pl >& /dev/null && ip link del test1pl || return 0
103 (ip netns list | grep -q $TESTNS) && ip netns del $TESTNS
108 echo "Loading LNet and configuring DLC"
109 load_lnet || return $?
110 do_lnetctl lnet configure
113 GLOBAL_YAML_FILE=$TMP/sanity-lnet-global.yaml
114 define_global_yaml() {
115 $LNETCTL export --backup >${GLOBAL_YAML_FILE} ||
116 error "Failed to export global yaml $?"
120 if lsmod | grep -q lnet; then
121 do_lnetctl lnet unconfigure ||
122 error "lnetctl lnet unconfigure failed $?"
123 do_lnetctl lnet configure ||
124 error "lnetctl lnet configure failed $?"
126 configure_dlc || error "configure_dlc failed $?"
131 append_global_yaml() {
132 [[ ! -e ${GLOBAL_YAML_FILE} ]] &&
133 error "Missing global yaml at ${GLOBAL_YAML_FILE}"
135 cat ${GLOBAL_YAML_FILE} >> $TMP/sanity-lnet-$testnum-expected.yaml
138 create_base_yaml_file() {
142 compare_yaml_files() {
143 local expected="$TMP/sanity-lnet-$testnum-expected.yaml"
144 local actual="$TMP/sanity-lnet-$testnum-actual.yaml"
146 ! [[ -e $expected ]] && echo "$expected not found" && return 1
147 ! [[ -e $actual ]] && echo "$actual not found" && return 1
# Run the availability probe as a command: `[ word ]` only tests that the
# literal word is non-empty, which is always true and hides the diff fallback.
148 if verify_yaml_available; then
149 verify_compare_yaml $actual $expected || rc=$?
151 diff -upN ${actual} ${expected} || rc=$?
162 local net="${nid//*@/}"
163 local addr="${nid//@*/}"
165 local num_re='[0-9]+'
166 local ip_re="[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}"
168 if [[ $net =~ (gni|kfi)[0-9]* ]]; then
169 [[ $addr =~ ${num_re} ]] && return 0
171 [[ $addr =~ ${ip_re} ]] && return 0
176 local yfile=$TMP/sanity-lnet-$testnum-actual.yaml
177 local primary_nids=$(awk '/- primary nid:/{print $NF}' $yfile | xargs echo)
178 local secondary_nids=$(awk '/- nid:/{print $NF}' $yfile | xargs echo)
179 local gateway_nids=$(awk '/gateway:/{print $NF}' $yfile | xargs echo)
182 for nid in $primary_nids $secondary_nids; do
183 validate_nid "$nid" || error "Bad NID \"${nid}\""
188 validate_peer_nids() {
190 local nids_per_peer="$2"
192 local expect_p="$num_peers"
193 # The primary nid also shows up in the list of secondary nids
194 local expect_s="$(($num_peers + $(($nids_per_peer*$num_peers))))"
196 local actual_p=$(grep -c -- '- primary nid:' $TMP/sanity-lnet-$testnum-actual.yaml)
197 local actual_s=$(grep -c -- '- nid:' $TMP/sanity-lnet-$testnum-actual.yaml)
198 if [[ $expect_p -ne $actual_p ]]; then
200 error "Expected $expect_p but found $actual_p primary nids"
201 elif [[ $expect_s -ne $actual_s ]]; then
203 error "Expected $expect_s but found $actual_s secondary nids"
208 validate_gateway_nids() {
209 local expect_gw=$(grep -c -- 'gateway:' $TMP/sanity-lnet-$testnum-expected.yaml)
210 local actual_gw=$(grep -c -- 'gateway:' $TMP/sanity-lnet-$testnum-actual.yaml)
211 if [[ $expect_gw -ne $actual_gw ]]; then
213 error "Expected $expect_gw gateways but found $actual_gw gateways"
216 local expect_gwnids=$(awk '/gateway:/{print $NF}' $TMP/sanity-lnet-$testnum-expected.yaml |
219 for nid in ${expect_gwnids}; do
220 if ! grep -q "gateway: ${nid}" $TMP/sanity-lnet-$testnum-actual.yaml; then
221 error "${nid} not configured as gateway"
229 setup_netns || error "setup_netns failed with $?"
231 # Determine the local interface(s) used for LNet
232 load_lnet "config_on_load=1" || error "Failed to load modules"
237 INTERFACES=( $(lnet_if_list) )
239 cleanup_lnet || error "Failed to cleanup LNet"
241 stack_trap 'cleanup_testsuite' EXIT
244 configure_dlc || error "Failed to configure DLC rc = $?"
246 reinit_dlc || return $?
247 do_lnetctl import < ${GLOBAL_YAML_FILE} || error "Import failed $?"
248 $LNETCTL export --backup > $TMP/sanity-lnet-$testnum-actual.yaml
249 create_base_yaml_file
250 compare_yaml_files || error "Configuration changed after import"
252 run_test 0 "Export empty config, import the config, compare"
255 local prim_nid="${1:+--prim_nid $1}"
256 local nid="${2:+--nid $2}"
258 local actual="$TMP/sanity-lnet-$testnum-actual.yaml"
260 do_lnetctl peer add ${prim_nid} ${nid} || error "peer add failed $?"
261 $LNETCTL export --backup > $actual || error "export failed $?"
267 reinit_dlc || return $?
268 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
270 - primary nid: 1.1.1.1@tcp
276 compare_peer_add "1.1.1.1@tcp"
278 run_test 1 "Add peer with single nid (tcp)"
281 reinit_dlc || return $?
282 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
284 - primary nid: 2.2.2.2@o2ib
290 compare_peer_add "2.2.2.2@o2ib"
292 run_test 2 "Add peer with single nid (o2ib)"
295 reinit_dlc || return $?
296 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
298 - primary nid: 3.3.3.3@tcp
305 compare_peer_add "3.3.3.3@tcp" "3.3.3.3@o2ib"
307 run_test 3 "Add peer with tcp primary o2ib secondary"
310 reinit_dlc || return $?
311 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
313 - primary nid: 4.4.4.4@tcp
322 echo "Add peer with nidrange (tcp)"
323 compare_peer_add "4.4.4.4@tcp" "4.4.4.[1-3]@tcp"
325 echo "Add peer with nidrange that overlaps primary nid (tcp)"
326 compare_peer_add "4.4.4.4@tcp" "4.4.4.[1-4]@tcp"
328 run_test 4 "Add peer with nidrange (tcp)"
331 reinit_dlc || return $?
332 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
334 - primary nid: 5.5.5.5@o2ib
344 echo "Add peer with nidrange (o2ib)"
345 compare_peer_add "5.5.5.5@o2ib" "5.5.5.[1-4]@o2ib"
347 echo "Add peer with nidranage that overlaps primary nid (o2ib)"
348 compare_peer_add "5.5.5.5@o2ib" "5.5.5.[1-4]@o2ib"
350 run_test 5 "Add peer with nidrange (o2ib)"
353 reinit_dlc || return $?
354 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
356 - primary nid: 6.6.6.6@tcp
379 local nid_expr="6.6.[6-7].[0-4/2]@tcp"
380 nid_expr+=",6.6.[1-4/2].[0-6/3]@o2ib"
381 nid_expr+=",[6-12/4]@gni"
382 nid_expr+=",[6-12/4]@kfi"
384 compare_peer_add "6.6.6.6@tcp" "${nid_expr}"
386 run_test 6 "Add peer with multiple nidranges"
389 local prim_nid="${1:+--prim_nid $1}"
390 local nid="${2:+--nid $2}"
392 local actual="$TMP/sanity-lnet-$testnum-actual.yaml"
394 do_lnetctl peer del ${prim_nid} ${nid} || error "peer del failed $?"
395 $LNETCTL export --backup > $actual || error "export failed $?"
401 reinit_dlc || return $?
402 create_base_yaml_file
404 echo "Delete peer with single nid (tcp)"
405 do_lnetctl peer add --prim_nid 7.7.7.7@tcp || error "Peer add failed $?"
406 compare_peer_del "7.7.7.7@tcp"
408 echo "Delete peer with single nid (o2ib)"
409 do_lnetctl peer add --prim_nid 7.7.7.7@o2ib || error "Peer add failed $?"
410 compare_peer_del "7.7.7.7@o2ib"
412 echo "Delete peer that has multiple nids (tcp)"
413 do_lnetctl peer add --prim_nid 7.7.7.7@tcp --nid 7.7.7.[8-12]@tcp ||
414 error "Peer add failed $?"
415 compare_peer_del "7.7.7.7@tcp"
417 echo "Delete peer that has multiple nids (o2ib)"
418 do_lnetctl peer add --prim_nid 7.7.7.7@o2ib --nid 7.7.7.[8-12]@o2ib ||
419 error "Peer add failed $?"
420 compare_peer_del "7.7.7.7@o2ib"
422 echo "Delete peer that has both tcp and o2ib nids"
423 do_lnetctl peer add --prim_nid 7.7.7.7@tcp \
424 --nid 7.7.7.[9-12]@tcp,7.7.7.[13-15]@o2ib ||
425 error "Peer add failed $?"
426 compare_peer_del "7.7.7.7@tcp"
428 echo "Delete peer with single nid (gni)"
429 do_lnetctl peer add --prim_nid 7@gni || error "Peer add failed $?"
430 compare_peer_del "7@gni"
432 echo "Delete peer that has multiple nids (gni)"
433 do_lnetctl peer add --prim_nid 7@gni --nid [8-12]@gni ||
434 error "Peer add failed $?"
435 compare_peer_del "7@gni"
437 echo "Delete peer with single nid (kfi)"
438 do_lnetctl peer add --prim_nid 7@kfi || error "Peer add failed $?"
439 compare_peer_del "7@kfi"
441 echo "Delete peer that has multiple nids (kfi)"
442 do_lnetctl peer add --prim_nid 7@kfi --nid [8-12]@kfi ||
443 error "Peer add failed $?"
444 compare_peer_del "7@kfi"
446 echo "Delete peer that has tcp, o2ib, gni and kfi nids"
447 do_lnetctl peer add --prim_nid 7@gni \
448 --nid [8-12]@gni,7.7.7.[1-4]@tcp,7.7.7.[5-9]@o2ib,[1-5]@kfi ||
449 error "Peer add failed $?"
450 compare_peer_del "7@gni"
452 run_test 7 "Various peer delete tests"
455 reinit_dlc || return $?
457 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
459 - primary nid: 8.8.8.8@tcp
471 do_lnetctl peer add --prim_nid 8.8.8.8@tcp --nid 8.8.8.[10-15]@tcp ||
472 error "Peer add failed $?"
473 compare_peer_del "8.8.8.8@tcp" "8.8.8.13@tcp"
475 run_test 8 "Delete single secondary nid from peer (tcp)"
478 reinit_dlc || return $?
480 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
482 - primary nid: 9.9.9.9@tcp
489 do_lnetctl peer add --prim_nid 9.9.9.9@tcp \
490 --nid 9.9.9.[11-16]@tcp || error "Peer add failed $?"
491 compare_peer_del "9.9.9.9@tcp" "9.9.9.[11-16]@tcp"
493 run_test 9 "Delete all secondary nids from peer (tcp)"
496 reinit_dlc || return $?
498 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
500 - primary nid: 10.10.10.10@tcp
503 - nid: 10.10.10.10@tcp
504 - nid: 10.10.10.12@tcp
505 - nid: 10.10.10.13@tcp
506 - nid: 10.10.10.15@tcp
507 - nid: 10.10.10.16@tcp
510 do_lnetctl peer add --prim_nid 10.10.10.10@tcp \
511 --nid 10.10.10.[12-16]@tcp || error "Peer add failed $?"
512 compare_peer_del "10.10.10.10@tcp" "10.10.10.14@tcp"
514 run_test 10 "Delete single secondary nid from peer (o2ib)"
517 reinit_dlc || return $?
519 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
521 - primary nid: 11.11.11.11@tcp
524 - nid: 11.11.11.11@tcp
527 do_lnetctl peer add --prim_nid 11.11.11.11@tcp \
528 --nid 11.11.11.[13-17]@tcp || error "Peer add failed $?"
529 compare_peer_del "11.11.11.11@tcp" "11.11.11.[13-17]@tcp"
531 run_test 11 "Delete all secondary nids from peer (o2ib)"
534 reinit_dlc || return $?
536 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
538 - primary nid: 12.12.12.12@o2ib
541 - nid: 12.12.12.12@o2ib
542 - nid: 13.13.13.13@o2ib
543 - nid: 14.13.13.13@o2ib
544 - nid: 14.15.13.13@o2ib
546 - nid: 15.17.1.10@tcp
547 - nid: 15.17.1.20@tcp
550 do_lnetctl peer add --prim_nid 12.12.12.12@o2ib \
551 --nid [13-14/1].[13-15/2].13.13@o2ib,[15-16/3].[17-19/4].[1].[5-20/5]@tcp ||
552 error "Peer add failed $?"
553 compare_peer_del "12.12.12.12@o2ib" "13.15.13.13@o2ib,15.17.1.15@tcp"
555 run_test 12 "Delete a secondary nid from peer (tcp and o2ib)"
558 reinit_dlc || return $?
560 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
562 - primary nid: 13.13.13.13@o2ib
565 - nid: 13.13.13.13@o2ib
568 do_lnetctl peer add --prim_nid 13.13.13.13@o2ib \
569 --nid [14-15].[1-2/1].[1].[100-254/10]@tcp,14.14.[254].14@o2ib ||
570 error "Peer add failed $?"
571 compare_peer_del "13.13.13.13@o2ib" \
572 "[14-15].[1-2/1].[1].[100-254/10]@tcp,14.14.[254].14@o2ib"
574 run_test 13 "Delete all secondary nids from peer (tcp and o2ib)"
# Glob-match the net type prefix. With =~ the pattern "gni*" is a regex
# ("gn" followed by zero or more "i") and matches any string containing "gn".
580 if [[ $net == gni* || $net == kfi* ]]; then
583 echo "${num}.${num}.${num}.${num}@${net}"
587 create_mr_peer_yaml() {
589 local secondary_nids="$2"
592 echo "Generating peer yaml for $num_peers peers with $secondary_nids secondary nids"
593 echo "peer:" >> $TMP/sanity-lnet-$testnum-expected.yaml
595 local total_nids=$((num_peers + $((num_peers * secondary_nids))))
598 while [[ $created -lt $num_peers ]]; do
599 local primary=$(create_nid ${nidnum} ${net})
600 cat <<EOF >> $TMP/sanity-lnet-$testnum-expected.yaml
601 - primary nid: $primary
607 local start=$((nidnum + 1))
608 local end=$((nidnum + $secondary_nids))
609 for j in $(seq ${start} ${end}); do
610 local nid=$(create_nid $j ${net})
611 echo " - nid: $nid" >> $TMP/sanity-lnet-$testnum-expected.yaml
619 reinit_dlc || return $?
621 echo "Create single peer, single nid, using import"
622 create_mr_peer_yaml 1 0 tcp
623 do_lnetctl import < $TMP/sanity-lnet-$testnum-expected.yaml ||
624 error "Import failed $?"
626 $LNETCTL export --backup > $TMP/sanity-lnet-$testnum-actual.yaml
629 echo "Delete single peer using import --del"
630 do_lnetctl import --del < $TMP/sanity-lnet-$testnum-expected.yaml ||
631 error "Import failed $?"
632 rm -f $TMP/sanity-lnet-$testnum-expected.yaml
633 create_base_yaml_file
634 $LNETCTL export --backup > $TMP/sanity-lnet-$testnum-actual.yaml
637 run_test 14 "import peer create/delete with single nid"
640 reinit_dlc || return $?
642 echo "Create multiple peers, single nid per peer, using import"
643 create_mr_peer_yaml 5 0 o2ib
644 # The ordering of nids for this use-case is non-deterministic, so we
645 # can't just diff the expected/actual output.
646 do_lnetctl import < $TMP/sanity-lnet-$testnum-expected.yaml ||
647 error "Import failed $?"
648 $LNETCTL export --backup > $TMP/sanity-lnet-$testnum-actual.yaml
649 validate_peer_nids 5 0
651 echo "Delete multiple peers, single nid per peer, using import --del"
652 do_lnetctl import --del < $TMP/sanity-lnet-$testnum-expected.yaml ||
653 error "Import failed $?"
654 rm -f $TMP/sanity-lnet-$testnum-expected.yaml
655 create_base_yaml_file
656 $LNETCTL export --backup > $TMP/sanity-lnet-$testnum-actual.yaml
659 run_test 15 "import multi peer create/delete with single nid per peer"
662 reinit_dlc || return $?
664 echo "Create single peer, multiple nids, using import"
665 create_mr_peer_yaml 1 5 tcp
666 do_lnetctl import < $TMP/sanity-lnet-$testnum-expected.yaml ||
667 error "Import failed $?"
668 $LNETCTL export --backup > $TMP/sanity-lnet-$testnum-actual.yaml
669 validate_peer_nids 1 5
671 echo "Delete single peer, multiple nids, using import --del"
672 do_lnetctl import --del < $TMP/sanity-lnet-$testnum-expected.yaml ||
673 error "Import failed $?"
674 rm -f $TMP/sanity-lnet-$testnum-expected.yaml
675 create_base_yaml_file
676 $LNETCTL export --backup > $TMP/sanity-lnet-$testnum-actual.yaml
679 run_test 16 "import peer create/delete with multiple nids"
682 reinit_dlc || return $?
684 echo "Create multiple peers, multiple nids per peer, using import"
685 create_mr_peer_yaml 5 7 o2ib
686 do_lnetctl import < $TMP/sanity-lnet-$testnum-expected.yaml ||
687 error "Import failed $?"
688 $LNETCTL export --backup > $TMP/sanity-lnet-$testnum-actual.yaml
689 validate_peer_nids 5 7
691 echo "Delete multiple peers, multiple nids per peer, using import --del"
692 do_lnetctl import --del < $TMP/sanity-lnet-$testnum-expected.yaml ||
693 error "Import failed $?"
694 rm -f $TMP/sanity-lnet-$testnum-expected.yaml
695 create_base_yaml_file
696 $LNETCTL export --backup > $TMP/sanity-lnet-$testnum-actual.yaml
699 run_test 17 "import multi peer create/delete with multiple nids"
702 reinit_dlc || return $?
704 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
706 - primary nid: 1.1.1.1@tcp
715 echo "Import peer with 5 nids"
716 cat $TMP/sanity-lnet-$testnum-expected.yaml
717 do_lnetctl import < $TMP/sanity-lnet-$testnum-expected.yaml ||
718 error "Import failed $?"
719 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
721 - primary nid: 1.1.1.1@tcp
728 echo "Delete three of the nids"
729 cat $TMP/sanity-lnet-$testnum-expected.yaml
730 do_lnetctl import --del < $TMP/sanity-lnet-$testnum-expected.yaml
731 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
733 - primary nid: 1.1.1.1@tcp
739 echo "Check peer has expected nids remaining"
740 $LNETCTL export --backup > $TMP/sanity-lnet-$testnum-actual.yaml
744 run_test 18a "Delete a subset of nids from a single peer using import --del"
747 reinit_dlc || return $?
749 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
751 - primary nid: 1.1.1.1@tcp
759 - primary nid: 6.6.6.6@o2ib
768 echo "Import two peers with 5 nids each"
769 cat $TMP/sanity-lnet-$testnum-expected.yaml
770 do_lnetctl import < $TMP/sanity-lnet-$testnum-expected.yaml ||
771 error "Import failed $?"
772 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
774 - primary nid: 1.1.1.1@tcp
780 - primary nid: 6.6.6.6@o2ib
787 echo "Delete three of the nids from each peer"
788 cat $TMP/sanity-lnet-$testnum-expected.yaml
789 do_lnetctl import --del < $TMP/sanity-lnet-$testnum-expected.yaml
790 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
792 - primary nid: 6.6.6.6@o2ib
797 - primary nid: 1.1.1.1@tcp
804 echo "Check peers have expected nids remaining"
805 $LNETCTL export --backup > $TMP/sanity-lnet-$testnum-actual.yaml
807 validate_peer_nids 2 1
809 run_test 18b "Delete multiple nids from multiple peers using import --del"
812 reinit_dlc || return $?
813 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
815 - primary nid: 19@gni
821 compare_peer_add "19@gni"
823 run_test 19 "Add peer with single nid (gni)"
826 reinit_dlc || return $?
827 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
829 - primary nid: 20@gni
833 - nid: 20.20.20.20@tcp
834 - nid: 20.20.20.20@o2ib
837 compare_peer_add "20@gni" "20.20.20.20@tcp,20.20.20.20@o2ib"
839 run_test 20 "Add peer with gni primary and tcp, o2ib secondary"
842 reinit_dlc || return $?
843 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
845 - primary nid: 21@gni
855 echo "Add peer with nidrange (gni)"
856 compare_peer_add "21@gni" "[22-25]@gni" || error
857 echo "Add peer with nidrange that overlaps primary nid (gni)"
858 compare_peer_add "21@gni" "[21-25]@gni"
860 run_test 21 "Add peer with nidrange (gni)"
863 reinit_dlc || return $?
864 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
866 - primary nid: 22@gni
877 do_lnetctl peer add --prim_nid 22@gni --nid [24-29]@gni ||
878 error "Peer add failed $?"
879 compare_peer_del "22@gni" "26@gni"
881 run_test 22 "Delete single secondary nid from peer (gni)"
884 reinit_dlc || return $?
885 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
887 - primary nid: 23@gni
894 do_lnetctl peer add --prim_nid 23@gni --nid [25-29]@gni ||
895 error "Peer add failed $?"
896 compare_peer_del "23@gni" "[25-29]@gni"
898 run_test 23 "Delete all secondary nids from peer (gni)"
901 reinit_dlc || return $?
902 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
904 - primary nid: 24@gni
909 - nid: 13.13.13.13@o2ib
910 - nid: 14.13.13.13@o2ib
911 - nid: 14.15.13.13@o2ib
913 - nid: 15.17.1.10@tcp
914 - nid: 15.17.1.20@tcp
917 do_lnetctl peer add --prim_nid 24@gni \
918 --nid [13-14/1].[13-15/2].13.13@o2ib,[15-16/3].[17-19/4].[1].[5-20/5]@tcp,[5-12/6]@gni ||
919 error "Peer add failed $?"
920 compare_peer_del "24@gni" "5@gni,13.15.13.13@o2ib,15.17.1.15@tcp"
922 run_test 24 "Delete a secondary nid from peer (tcp, o2ib and gni)"
925 reinit_dlc || return $?
926 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
928 - primary nid: 25@gni
934 do_lnetctl peer add --prim_nid 25@gni \
935 --nid [26-27].[4-10/3].26.26@tcp,26.26.26.26@o2ib,[30-35]@gni ||
936 error "Peer add failed $?"
937 compare_peer_del "25@gni" \
938 "[26-27].[4-10/3].26.26@tcp,26.26.26.26@o2ib,[30-35]@gni"
940 run_test 25 "Delete all secondary nids from peer (tcp, gni and o2ib)"
943 reinit_dlc || return $?
945 do_lnetctl peer add --prim_nid 1.1.1.1@tcp --lock_prim ||
946 error "Peer add with --lock_prim option failed $?"
947 local peer_state=$($LNETCTL peer show -v 4 --nid 1.1.1.1@tcp |
948 awk '/peer state/ {print $NF}')
949 # This relies on the following peer state definition:
950 # #define LNET_PEER_LOCK_PRIMARY BIT(20)
951 if ((!("$peer_state" & (1 << 20)))); then
952 error "Peer state does not have 'locked' bit set: $peer_state"
954 do_lnetctl peer del --prim_nid 1.1.1.1@tcp ||
955 error "Peer del failed $?"
956 $LNETCTL peer show --nid 1.1.1.1@tcp | grep -q 1.1.1.1@tcp ||
957 error "1.1.1.1@tcp is not listed"
958 do_lnetctl peer del --prim_nid 1.1.1.1@tcp --force ||
959 error "Peer del --force failed $?"
960 do_lnetctl peer show --nid 1.1.1.1@tcp &&
961 error "failed to delete 1.1.1.1@tcp"
965 run_test 26 "Delete peer with primary nid locked"
968 reinit_dlc || return $?
970 echo "Invalid prim_nid - peer add"
971 do_lnetctl peer add --prim_nid foobar &&
972 error "Command should have failed"
974 echo "Invalid prim_nid - peer del"
975 do_lnetctl peer del --prim_nid foobar &&
976 error "Command should have failed"
978 echo "Delete non-existing peer"
979 do_lnetctl peer del --prim_nid 1.1.1.1@o2ib &&
980 error "Command should have failed"
982 echo "Don't provide mandatory argument for peer del"
983 do_lnetctl peer del --nid 1.1.1.1@tcp &&
984 error "Command should have failed"
986 echo "Don't provide mandatory argument for peer add"
987 do_lnetctl peer add --nid 1.1.1.1@tcp &&
988 error "Command should have failed"
990 echo "Don't provide mandatory arguments peer add"
991 do_lnetctl peer add &&
992 error "Command should have failed"
994 echo "Invalid secondary nids"
995 do_lnetctl peer add --prim_nid 1.1.1.1@tcp --nid foobar &&
996 error "Command should have failed"
998 echo "Exceed max nids per peer"
999 do_lnetctl peer add --prim_nid 1.1.1.1@tcp --nid 1.1.1.[2-255]@tcp &&
1000 error "Command should have failed"
1002 echo "Invalid net type"
1003 do_lnetctl peer add --prim_nid 1@foo &&
1004 error "Command should have failed"
1006 echo "Invalid nid format"
1007 local invalid_nids="1@tcp 1@o2ib 1.1.1.1@gni"
1010 for nid in ${invalid_nids}; do
1011 echo "Check invalid primary nid - '$nid'"
1012 do_lnetctl peer add --prim_nid $nid &&
1013 error "Command should have failed"
# Space-separated list of malformed NID strings; each word is tried on its own
# as a --nid argument. Every appended chunk must start with a space so the
# entries are not glued to the last entry of the previous chunk.
1016 local invalid_strs="[2-1]@gni [a-f/x]@gni 256.256.256.256@tcp"
1017 invalid_strs+=" 1.1.1.1.[2-5/f]@tcp 1.]2[.3.4@o2ib"
1018 invalid_strs+=" 1.[2-4,[5-6],7-8].1.1@tcp foobar"
1021 for nidstr in ${invalid_strs}; do
1022 echo "Check invalid nidstring - '$nidstr'"
1023 do_lnetctl peer add --prim_nid 1.1.1.1@tcp --nid $nidstr &&
1024 error "Command should have failed"
1027 echo "Add non-local gateway"
1028 do_lnetctl route add --net tcp --gateway 1@gni &&
1029 error "Command should have failed"
1033 run_test 99a "Check various invalid inputs to lnetctl peer"
1036 reinit_dlc || return $?
1038 create_base_yaml_file
1040 cat <<EOF > $TMP/sanity-lnet-$testnum-invalid.yaml
1042 - primary nid: 99.99.99.99@tcp
1045 - nid: 99.99.99.99@tcp
1047 do_lnetctl import < $TMP/sanity-lnet-$testnum-invalid.yaml &&
1048 error "import should have failed"
1049 $LNETCTL export --backup > $TMP/sanity-lnet-$testnum-actual.yaml
1052 run_test 99b "Invalid value for Multi-Rail in yaml import"
1056 local ip=$(ip addr show dev $if | awk '/ inet /{print $2}')
1064 do_lnetctl net add --net ${net} --if ${if} ||
1065 error "Failed to add net ${net} on if ${if}"
1068 compare_route_add() {
1072 local actual="$TMP/sanity-lnet-$testnum-actual.yaml"
1074 do_lnetctl route add --net ${rnet} --gateway ${gw} ||
1075 error "route add failed $?"
1076 $LNETCTL export --backup > $actual ||
1077 error "export failed $?"
1078 validate_gateway_nids
1082 append_net_tunables() {
1085 $LNETCTL net show -v --net ${net} | grep -v 'dev cpt' |
1086 awk '/^\s+tunables:$/,/^\s+CPT:/' >> $TMP/sanity-lnet-$testnum-expected.yaml
# IPv4 address of the first entry in INTERFACES: take field 4 of the
# `ip -o -4 a s` output and strip everything from the first "/" onward.
1089 IF0_IP=$(ip -o -4 a s ${INTERFACES[0]} |
1090 awk '{print $4}' | sed 's/\/.*//')
# First three dot-separated octets of that address.
1091 IF0_NET=$(awk -F. '{print $1"."$2"."$3}'<<<"${IF0_IP}")
# Fourth octet of that address.
1092 IF0_HOSTNUM=$(awk -F. '{print $4}'<<<"${IF0_IP}")
1093 if (((IF0_HOSTNUM + 5) > 254)); then
1096 GW_HOSTNUM=$((IF0_HOSTNUM + 1))
1098 GW_NID="${IF0_NET}.${GW_HOSTNUM}@${NETTYPE}"
1100 [[ ${NETTYPE} == tcp* ]] ||
1101 skip "Need tcp NETTYPE"
1102 reinit_dlc || return $?
1103 add_net "${NETTYPE}" "${INTERFACES[0]}"
1104 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
1106 - net type: ${NETTYPE}
1111 append_net_tunables tcp
1112 cat <<EOF >> $TMP/sanity-lnet-$testnum-expected.yaml
1118 health_sensitivity: 1
1120 - primary nid: ${GW_NID}
1126 compare_route_add "tcp7" "${GW_NID}"
1128 run_test 100 "Add route with single gw (tcp)"
1131 [[ ${NETTYPE} == tcp* ]] ||
1132 skip "Need tcp NETTYPE"
1133 reinit_dlc || return $?
1134 add_net "${NETTYPE}" "${INTERFACES[0]}"
1135 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
1137 - net type: ${NETTYPE}
1142 append_net_tunables tcp
1144 echo "route:" >> $TMP/sanity-lnet-$testnum-expected.yaml
1145 for i in $(seq $GW_HOSTNUM $((GW_HOSTNUM + 4))); do
1146 cat <<EOF >> $TMP/sanity-lnet-$testnum-expected.yaml
1148 gateway: ${IF0_NET}.${i}@tcp
1151 health_sensitivity: 1
1155 echo "peer:" >> $TMP/sanity-lnet-$testnum-expected.yaml
1156 for i in $(seq $GW_HOSTNUM $((GW_HOSTNUM + 4))); do
1157 cat <<EOF >> $TMP/sanity-lnet-$testnum-expected.yaml
1158 - primary nid: ${IF0_NET}.${i}@tcp
1161 - nid: ${IF0_NET}.${i}@tcp
1166 local gw="${IF0_NET}.[$GW_HOSTNUM-$((GW_HOSTNUM + 4))]@tcp"
1168 compare_route_add "tcp8" "${gw}"
1170 run_test 101 "Add route with multiple gw (tcp)"
1172 compare_route_del() {
1176 local actual="$TMP/sanity-lnet-$testnum-actual.yaml"
1178 do_lnetctl route del --net ${rnet} --gateway ${gw} ||
1179 error "route del failed $?"
1180 $LNETCTL export --backup > $actual ||
1181 error "export failed $?"
1182 validate_gateway_nids
1188 if [[ ${net} =~ (tcp|o2ib)[0-9]* ]]; then
1191 echo "$((${testnum} % 255))@${net}"
1196 reinit_dlc || return $?
1197 add_net "${NETTYPE}" "${INTERFACES[0]}"
1198 $LNETCTL export --backup > $TMP/sanity-lnet-$testnum-expected.yaml
1200 local gwnid=$(generate_gw_nid ${NETTYPE})
1202 do_lnetctl route add --net ${NETTYPE}2 --gateway ${gwnid} ||
1203 error "route add failed $?"
1204 compare_route_del "${NETTYPE}2" "${gwnid}"
1206 run_test 102 "Delete route with single gw"
# NID range expressions for the multi-gateway route delete test.
# NUM_NID_EXPR appears to be the gateway expression used when NETTYPE is not
# tcp/o2ib. NOTE(review): IP_NID_EXPR is not referenced in the visible part of
# this file -- confirm whether it is still needed.
1208 IP_NID_EXPR='103.103.103.[103-120/4]'
1209 NUM_NID_EXPR='[103-120/4]'
1211 reinit_dlc || return $?
1212 add_net "${NETTYPE}" "${INTERFACES[0]}"
1213 $LNETCTL export --backup > $TMP/sanity-lnet-$testnum-expected.yaml
1217 if [[ $NETTYPE =~ (tcp|o2ib)[0-9]* ]]; then
1218 nid_expr="${IF0_NET}.[$GW_HOSTNUM-$((GW_HOSTNUM+5))/2]"
1220 nid_expr="${NUM_NID_EXPR}"
1223 do_lnetctl route add --net ${NETTYPE}103 \
1224 --gateway ${nid_expr}@${NETTYPE} ||
1225 error "route add failed $?"
1226 compare_route_del "${NETTYPE}103" "${nid_expr}@${NETTYPE}"
1228 run_test 103 "Delete route with multiple gw"
1231 local tyaml="$TMP/sanity-lnet-$testnum-expected.yaml"
1233 reinit_dlc || return $?
1235 # Default value is '3'
1236 local val=$($LNETCTL global show | awk '/response_tracking/{print $NF}')
1238 error "Expect 3 found $val"
1240 echo "Set < 0; Should fail"
1241 do_lnetctl set response_tracking -1 &&
1242 error "should have failed $?"
1244 reinit_dlc || return $?
1247 response_tracking: -10
1249 do_lnetctl import < $tyaml &&
1250 error "should have failed $?"
1252 echo "Check valid values; Should succeed"
1254 for ((i = 0; i < 4; i++)); do
1255 reinit_dlc || return $?
1256 do_lnetctl set response_tracking $i ||
1257 error "should have succeeded $?"
1258 $LNETCTL global show | grep -q "response_tracking: $i" ||
1259 error "Failed to set response_tracking to $i"
1260 reinit_dlc || return $?
1263 response_tracking: $i
1265 do_lnetctl import < $tyaml ||
1266 error "should have succeeded $?"
1267 $LNETCTL global show | grep -q "response_tracking: $i" ||
1268 error "Failed to set response_tracking to $i"
1271 reinit_dlc || return $?
1272 echo "Set > 3; Should fail"
1273 do_lnetctl set response_tracking 4 &&
1274 error "should have failed $?"
1276 reinit_dlc || return $?
1279 response_tracking: 10
1281 do_lnetctl import < $tyaml &&
1282 error "should have failed $?"
1285 run_test 104 "Set/check response_tracking param"
1288 reinit_dlc || return $?
1289 add_net "${NETTYPE}" "${INTERFACES[0]}"
1291 local gwnid=$(generate_gw_nid ${NETTYPE})
1293 do_lnetctl route add --net ${NETTYPE}105 --gateway ${gwnid} ||
1294 error "route add failed $?"
1295 do_lnetctl peer add --prim ${gwnid} &&
1296 error "peer add should fail"
1300 run_test 105 "Adding duplicate GW peer should fail"
1303 reinit_dlc || return $?
1304 add_net "${NETTYPE}" "${INTERFACES[0]}"
1306 local gwnid=$(generate_gw_nid ${NETTYPE})
1308 do_lnetctl route add --net ${NETTYPE}106 --gateway ${gwnid} ||
1309 error "route add failed $?"
1310 do_lnetctl peer del --prim ${gwnid} &&
1311 error "peer del should fail"
1315 run_test 106 "Deleting GW peer should fail"
1318 [[ ${NETTYPE} == tcp* ]] ||
1319 skip "Need tcp NETTYPE"
1320 cleanup_lnet || exit 1
1321 load_lnet "networks=\"\""
1322 do_ns $LNETCTL lnet configure --all || exit 1
1323 $LNETCTL net show --net tcp | grep -q "nid: ${FAKE_IP}@tcp$"
1325 run_test 200 "load lnet w/o module option, configure in a non-default namespace"
1328 [[ ${NETTYPE} == tcp* ]] ||
1329 skip "Need tcp NETTYPE"
1330 cleanup_lnet || exit 1
1331 load_lnet "networks=tcp($FAKE_IF)"
1332 do_ns $LNETCTL lnet configure --all || exit 1
1333 $LNETCTL net show --net tcp | grep -q "nid: ${FAKE_IP}@tcp$"
1335 run_test 201 "load lnet using networks module options in a non-default namespace"
1338 [[ ${NETTYPE} == tcp* ]] ||
1339 skip "Need tcp NETTYPE"
1340 cleanup_lnet || exit 1
1341 load_lnet "networks=\"\" ip2nets=\"tcp0($FAKE_IF) ${FAKE_IP}\""
1342 do_ns $LNETCTL lnet configure --all || exit 1
1343 $LNETCTL net show | grep -q "nid: ${FAKE_IP}@tcp$"
1345 run_test 202 "load lnet using ip2nets in a non-default namespace"
1348 ### Add the interfaces in the target namespace
1351 [[ ${NETTYPE} == tcp* ]] ||
1352 skip "Need tcp NETTYPE"
1353 cleanup_lnet || exit 1
1355 do_lnetctl lnet configure || exit 1
1356 do_ns $LNETCTL net add --net tcp0 --if $FAKE_IF
1358 run_test 203 "add a network using an interface in the non-default namespace"
1360 LNET_PARAMS_FILE="$TMP/$TESTSUITE.parameters"
1361 function save_lnet_params() {
1362 $LNETCTL global show | egrep -v '^global:$' |
1363 sed 's/://' > $LNET_PARAMS_FILE
1366 function restore_lnet_params() {
1368 while read param value; do
1369 [[ $param == max_intf ]] && continue
1370 [[ $param == lnd_timeout ]] && continue
1371 $LNETCTL set ${param} ${value} ||
1372 error "Failed to restore ${param} to ${value}"
1373 done < $LNET_PARAMS_FILE
1376 function lnet_health_pre() {
1379 # Lower transaction timeout to speed up test execution
1380 $LNETCTL set transaction_timeout 10 ||
1381 error "Failed to set transaction_timeout $?"
1383 RETRY_PARAM=$($LNETCTL global show | awk '/retry_count/{print $NF}')
1384 RSND_PRE=$($LNETCTL stats show | awk '/resend_count/{print $NF}')
1385 LO_HVAL_PRE=$($LNETCTL net show -v 2 | awk '/health value/{print $NF}' |
1386 xargs echo | sed 's/ /+/g' | bc -l)
1388 RMT_HVAL_PRE=$($LNETCTL peer show --nid ${RNIDS[0]} -v 2 2>/dev/null |
1389 awk '/health value/{print $NF}' | xargs echo |
1390 sed 's/ /+/g' | bc -l)
1392 # Might not have any peers so initialize to zero.
1393 RMT_HVAL_PRE=${RMT_HVAL_PRE:-0}
# Capture the "after" snapshot (RSND_POST, LO_HVAL_POST, RMT_HVAL_POST) with
# the same pipelines lnet_health_pre() uses, print the pre/post values, and
# set the health of all peers and all nets to 1000.
# NOTE(review): some lines of this function are elided from the listing
# (including whatever precedes the echo chain); they are not described here.
1398 function lnet_health_post() {
1399 RSND_POST=$($LNETCTL stats show | awk '/resend_count/{print $NF}')
1400 LO_HVAL_POST=$($LNETCTL net show -v 2 |
1401 awk '/health value/{print $NF}' |
1402 xargs echo | sed 's/ /+/g' | bc -l)
1404 RMT_HVAL_POST=$($LNETCTL peer show --nid ${RNIDS[0]} -v 2 2>/dev/null |
1405 awk '/health value/{print $NF}' | xargs echo |
1406 sed 's/ /+/g' | bc -l)
1408 # Might not have any peers so initialize to zero.
1409 RMT_HVAL_POST=${RMT_HVAL_POST:-0}
# Report the values gathered before and after the simulated failure.
1412 echo "Pre resends: $RSND_PRE" &&
1413 echo "Post resends: $RSND_POST" &&
1414 echo "Resends delta: $((RSND_POST - RSND_PRE))" &&
1415 echo "Pre local health: $LO_HVAL_PRE" &&
1416 echo "Post local health: $LO_HVAL_POST" &&
1417 echo "Pre remote health: $RMT_HVAL_PRE" &&
1418 echo "Post remote health: $RMT_HVAL_POST"
# Put every peer NI and local NI health value back to 1000.
1422 do_lnetctl peer set --health 1000 --all
1423 do_lnetctl net set --health 1000 --all
# Report an error if the resend counter differs between the snapshots taken
# by lnet_health_pre() and lnet_health_post().
#
# Globals: RSND_PRE, RSND_POST (read)
# Returns: 0
function check_no_resends() {
	echo "Check that no resends took place"

	if [[ $RSND_POST -ne $RSND_PRE ]]; then
		error "Found resends: $RSND_POST != $RSND_PRE"
	fi

	return 0
}
# Report an error unless the resend counter grew by exactly $RETRY_PARAM
# between lnet_health_pre() and lnet_health_post().
#
# Globals: RSND_PRE, RSND_POST, RETRY_PARAM (read)
# Returns: 0
function check_resends() {
	local nr_resends=$((RSND_POST - RSND_PRE))

	echo "Check that $RETRY_PARAM resends took place"

	if [[ $nr_resends -ne $RETRY_PARAM ]]; then
		error "Expected $RETRY_PARAM resends found $nr_resends"
	fi

	return 0
}
# Report an error if the summed local NI health value differs between the
# snapshots taken by lnet_health_pre() and lnet_health_post().
#
# Globals: LO_HVAL_PRE, LO_HVAL_POST (read)
# Returns: 0
function check_no_local_health() {
	echo "Check that local NI health is unchanged"

	if [[ $LO_HVAL_POST -ne $LO_HVAL_PRE ]]; then
		error "Local health changed: $LO_HVAL_POST != $LO_HVAL_PRE"
	fi

	return 0
}
# Report an error if the summed local NI health value is the same in the
# snapshots taken by lnet_health_pre() and lnet_health_post().
#
# Globals: LO_HVAL_PRE, LO_HVAL_POST (read)
# Returns: 0
function check_local_health() {
	echo "Check that local NI health has been changed"

	if [[ $LO_HVAL_POST -eq $LO_HVAL_PRE ]]; then
		error "Local health unchanged: $LO_HVAL_POST == $LO_HVAL_PRE"
	fi

	return 0
}
# Report an error if the summed remote NI health value differs between the
# snapshots taken by lnet_health_pre() and lnet_health_post().
#
# Globals: RMT_HVAL_PRE, RMT_HVAL_POST (read)
# Returns: 0
function check_no_remote_health() {
	echo "Check that remote NI health is unchanged"

	if [[ $RMT_HVAL_POST -ne $RMT_HVAL_PRE ]]; then
		error "Remote health changed: $RMT_HVAL_POST != $RMT_HVAL_PRE"
	fi

	return 0
}
# Report an error if the summed remote NI health value is the same in the
# snapshots taken by lnet_health_pre() and lnet_health_post().
#
# Globals: RMT_HVAL_PRE, RMT_HVAL_POST (read)
# Returns: 0
function check_remote_health() {
	echo "Check that remote NI health has been changed"

	if [[ $RMT_HVAL_POST -eq $RMT_HVAL_PRE ]]; then
		error "Remote health unchanged: $RMT_HVAL_POST == $RMT_HVAL_PRE"
	fi

	return 0
}
# Prepare the local host and the first remote node for an LNet health test.
#
# Visible steps: skip for kfi NETTYPE or when there is no remote node; reload
# LNet with config_on_load=1; fill the globals LNIDS, RNODE and RNIDS; require
# a non-routed configuration; discover ${RNIDS[0]}; and, depending on
# ${need_mr}, either bail out via cleanup_health_test or add a second NID on
# the remote node and/or the local host. Ends by printing peer health and
# enabling "net" debug.
#
# NOTE(review): ${need_mr} and $rc are assigned on lines elided from this
# listing; callers pass "true"/"false" as $1, so need_mr is presumably $1 —
# TODO confirm.
1483 setup_health_test() {
1487 [[ ${NETTYPE} == kfi* ]] && skip "kfi doesn't support drop rules"
1489 local rnodes=$(remote_nodes_list)
1490 [[ -z $rnodes ]] && skip "Need at least 1 remote node"
1492 cleanup_lnet || error "Failed to cleanup before test execution"
1494 # Loading modules should configure LNet with the appropriate
1495 # test-framework configuration
1496 load_lnet "config_on_load=1" || error "Failed to load modules"
1498 LNIDS=( $($LCTL list_nids | xargs echo) )
# The first word of the remote node list is the node used for the test.
1500 RNODE=$(awk '{print $1}' <<<$rnodes)
1501 RNIDS=( $(do_node $RNODE $LCTL list_nids | xargs echo) )
# No NIDs reported by the remote node: load LNet there and ask again.
1503 if [[ -z ${RNIDS[@]} ]]; then
1504 do_rpc_nodes $RNODE load_lnet "config_on_load=1"
1506 RNIDS=( $(do_node $RNODE $LCTL list_nids | xargs echo) )
1509 [[ ${#LNIDS[@]} -lt 1 ]] &&
1510 error "No NIDs configured for local host $HOSTNAME"
1511 [[ ${#RNIDS[@]} -lt 1 ]] &&
1512 error "No NIDs configured for remote host $RNODE"
1514 # Ensure all peer NIs are local (i.e. non-routed config)
1515 local rnid rnet lnid lnet
1517 for rnid in ${RNIDS[@]}; do
1519 for lnid in ${LNIDS[@]}; do
1521 [[ ${lnet} == ${rnet} ]] &&
1524 [[ ${lnet} != ${rnet} ]] &&
1525 skip "Need non-routed configuration"
1528 do_lnetctl discover ${RNIDS[0]} ||
1529 error "Unable to discover ${RNIDS[0]}"
# Last field of the Multi-Rail line of "peer show" for the remote NID.
1531 local mr=$($LNETCTL peer show --nid ${RNIDS[0]} |
1532 awk '/Multi-Rail/{print $NF}')
1534 if ${need_mr} && [[ $mr == False ]]; then
1535 cleanup_health_test || return $?
# Not need_mr, yet either side has more than one NID.
1539 if ( ! ${need_mr} && [[ ${#RNIDS[@]} -gt 1 ]] ) ||
1540 ( ! ${need_mr} && [[ ${#LNIDS[@]} -gt 1 ]] ); then
1541 cleanup_health_test || return $?
1545 if ${need_mr} && [[ ${#RNIDS[@]} -lt 2 ]]; then
1546 # Add a second, reachable NID to rnode.
1547 local net=${RNIDS[0]}
1551 local if=$(do_rpc_nodes --quiet $RNODE lnet_if_list)
1553 error "Failed to determine interface for $RNODE"
1555 do_rpc_nodes $RNODE "$LNETCTL lnet configure"
1556 do_rpc_nodes $RNODE "$LNETCTL net add --net $net --if $if" ||
# NOTE(review): inside this "if", "$?" is the status of the [[ ]] test, so the
# message below always prints rc=0; "$rc" looks like the intended value.
1558 if [[ $rc -ne 0 ]]; then
1559 error "Failed to add interface to $RNODE rc=$?"
1561 RNIDS[1]="${RNIDS[0]}1"
# Remembered so cleanup_health_test can delete the added net again.
1562 NET_DEL_ARGS="--net $net --if $if"
1566 if ${need_mr} && [[ ${#LNIDS[@]} -lt 2 ]]; then
1567 local net=${LNIDS[0]}
1570 do_lnetctl lnet configure &&
1571 do_lnetctl net add --net $net --if ${INTERFACES[0]} ||
# NOTE(review): same "$?" issue as above; this prints rc=0 as well.
1573 if [[ $rc -ne 0 ]]; then
1574 error "Failed to add interface rc=$?"
1576 LNIDS[1]="${LNIDS[0]}1"
1582 $LNETCTL peer show -v 2 | egrep -e nid -e health
1584 $LCTL set_param debug=+net
# Undo setup_health_test: when NET_DEL_ARGS is set, delete that net on $RNODE;
# unload the modules locally and on $RNODE; call error() with "Failed cleanup".
# NOTE(review): most of this function (the conditions around the unloads and
# around the final error call, and the handling of $rc) is elided from this
# listing, so the exact control flow is not documented here.
1590 cleanup_health_test() {
1593 if [[ -n $NET_DEL_ARGS ]]; then
1594 do_rpc_nodes $RNODE \
1595 "$LNETCTL net del $NET_DEL_ARGS" ||
1600 unload_modules || rc=$?
1603 do_rpc_nodes $RNODE unload_modules_local ||
1609 error "Failed cleanup"
# Add one LNet drop rule per (source, destination) NID pair.
#
# Sources are the NIDs in LNIDS; destinations are the NIDs in RNIDS followed
# by those in LNIDS. Every rule is added with "-m GET -r 1 -e <status>".
# error() is called for the first rule that cannot be added.
#
# Arguments: $1 - status name passed to "net_drop_add -e"
# Globals:   LCTL, LNIDS, RNIDS (read)
add_health_test_drop_rules() {
	# An array keeps every option a separate word without depending on
	# word splitting of a flat string.
	local -a args=(-m GET -r 1 -e "${1}")
	local src dst

	for src in "${LNIDS[@]}"; do
		for dst in "${RNIDS[@]}" "${LNIDS[@]}"; do
			# NIDs are quoted so they are never split or globbed.
			$LCTL net_drop_add -s "$src" -d "$dst" "${args[@]}" ||
				error "Failed to add drop rule $src $dst ${args[*]}"
		done
	done
}
# Simulate one health status: take the "pre" snapshot, add drop rules for
# ${hstatus}, ping ${RNIDS[0]} (the ping is expected to fail), then remove all
# drop rules.
# NOTE(review): the assignment of hstatus (callers pass the status as $1) and
# the steps between the ping and the rule removal are elided from this
# listing — TODO confirm against the full file.
1626 do_lnet_health_ping_test() {
1629 echo "Simulate $hstatus"
1631 lnet_health_pre || return $?
1633 add_health_test_drop_rules ${hstatus}
1634 do_lnetctl ping ${RNIDS[0]} &&
1635 error "Should have failed"
1639 $LCTL net_drop_del -a
1644 # See lnet/lnet/lib-msg.c:lnet_health_check()
# Local failure statuses. Test 205 below expects resends for the first group
# and no resends for the second.
1645 LNET_LOCAL_RESEND_STATUSES="local_interrupt local_dropped local_aborted"
1646 LNET_LOCAL_RESEND_STATUSES+=" local_no_route local_timeout"
1647 LNET_LOCAL_NO_RESEND_STATUSES="local_error"
# Test 204 body (function header elided from this listing): with
# setup_health_test called with "false", simulate every local failure status
# and require that neither the resend counter nor local health changes.
1649 setup_health_test false || return $?
1652 for hstatus in ${LNET_LOCAL_RESEND_STATUSES} \
1653 ${LNET_LOCAL_NO_RESEND_STATUSES}; do
1654 do_lnet_health_ping_test "${hstatus}" || return $?
1655 check_no_resends || return $?
1656 check_no_local_health || return $?
1659 cleanup_health_test || return $?
1663 run_test 204 "Check no health or resends for single-rail local failures"
# Test 205 body (function header elided from this listing): with
# setup_health_test called with "true", the "resend" statuses must produce
# resends and a local health change; the "no resend" statuses must change
# local health without any resend.
1666 setup_health_test true || return $?
1669 for hstatus in ${LNET_LOCAL_RESEND_STATUSES}; do
1670 do_lnet_health_ping_test "${hstatus}" || return $?
1671 check_resends || return $?
1672 check_local_health || return $?
1675 for hstatus in ${LNET_LOCAL_NO_RESEND_STATUSES}; do
1676 do_lnet_health_ping_test "${hstatus}" || return $?
1677 check_no_resends || return $?
1678 check_local_health || return $?
1681 cleanup_health_test || return $?
1685 run_test 205 "Check health and resends for multi-rail local failures"
1687 # See lnet/lnet/lib-msg.c:lnet_health_check()
# Remote failure statuses. Test 207 below expects resends for the first group
# and no resends for the second.
1688 LNET_REMOTE_RESEND_STATUSES="remote_dropped"
1689 LNET_REMOTE_NO_RESEND_STATUSES="remote_error remote_timeout"
# Test 206 body (function header elided from this listing): with
# setup_health_test called with "false", simulate every remote failure status
# and require no resends and no local or remote health change.
1691 setup_health_test false || return $?
1694 for hstatus in ${LNET_REMOTE_RESEND_STATUSES} \
1695 ${LNET_REMOTE_NO_RESEND_STATUSES}; do
1696 do_lnet_health_ping_test "${hstatus}" || return $?
1697 check_no_resends || return $?
1698 check_no_local_health || return $?
1699 check_no_remote_health || return $?
1702 cleanup_health_test || return $?
1706 run_test 206 "Check no health or resends for single-rail remote failures"
# Test 207 body (function header elided from this listing): with
# setup_health_test called with "true", remote failures must change remote
# health but not local health; only the "resend" statuses may produce
# resends. Peer health is set back to 1000 after every status.
1709 setup_health_test true || return $?
1712 for hstatus in ${LNET_REMOTE_RESEND_STATUSES}; do
1713 do_lnet_health_ping_test "${hstatus}" || return $?
1714 check_resends || return $?
1715 check_no_local_health || return $?
1716 check_remote_health || return $?
1717 do_lnetctl peer set --health 1000 --all ||
1718 error "Unable to reset health rc=$?"
1720 for hstatus in ${LNET_REMOTE_NO_RESEND_STATUSES}; do
1721 do_lnet_health_ping_test "${hstatus}" || return $?
1722 check_no_resends || return $?
1723 check_no_local_health || return $?
1724 check_remote_health || return $?
1725 do_lnetctl peer set --health 1000 --all ||
1726 error "Unable to reset health rc=$?"
1729 cleanup_health_test || return $?
1733 run_test 207 "Check health and resends for multi-rail remote errors"
# Load LNet with an empty "networks" option and the given ip2nets string, then
# compare the NIDs printed by "lctl list_nids" with the expected ones and
# bring the network down again.
# NOTE(review): the assignments of ip2nets_str, p_nid and s_nid, and any
# change to num_expected, are elided from this listing. Callers pass the
# ip2nets string first and the expected primary NID second — TODO confirm the
# remaining argument mapping.
1735 test_208_load_and_check_lnet() {
1739 local num_expected=1
1741 load_lnet "networks=\"\" ip2nets=\"${ip2nets_str}\""
1744 error "Failed to load LNet with ip2nets \"${ip2nets_str}\""
1750 nids=( $($LCTL list_nids) )
1752 [[ ${#nids[@]} -ne ${num_expected} ]] &&
1753 error "Expect ${num_expected} NIDs found ${#nids[@]}"
1755 [[ ${nids[0]} == ${p_nid} ]] ||
1756 error "Expect NID \"${p_nid}\" found \"${nids[0]}\""
# The second NID is only compared when an expected value was supplied.
1758 [[ -n $s_nid ]] && [[ ${nids[1]} != ${s_nid} ]] &&
1759 error "Expect second NID \"${s_nid}\" found \"${nids[1]}\""
1761 $LCTL net down &>/dev/null
# Test 208 body (function header elided from this listing): tcp only. Creates
# the fake interface, then loads LNet with several ip2nets strings built from
# the address of ${INTERFACES[0]} and $FAKE_IP (single NID; two NIDs on two
# nets; two NIDs on one net; the bracketed range syntax) and checks the
# resulting NIDs. The last case expects the bring-up to fail.
# NOTE(review): several continuation lines are elided from this listing, so
# the trailing arguments of some calls are not visible.
1766 [[ ${NETTYPE} == tcp* ]] ||
1767 skip "Need tcp NETTYPE"
1769 cleanup_netns || error "Failed to cleanup netns before test execution"
1770 cleanup_lnet || error "Failed to unload modules before test execution"
1771 setup_fakeif || error "Failed to add fake IF"
1773 have_interface "$FAKE_IF" ||
1774 error "Expect $FAKE_IF configured but not found"
1776 local if0_ip=$(ip --oneline addr show dev ${INTERFACES[0]} |
1777 awk '/inet /{print $4}' |
1779 if0_ip=($(echo "${if0_ip[@]}" | tr ' ' '\n' | uniq | tr '\n' ' '))
1780 local ip2nets_str="tcp(${INTERFACES[0]}) $if0_ip"
1782 echo "Configure single NID \"$ip2nets_str\""
1783 test_208_load_and_check_lnet "${ip2nets_str}" "${if0_ip}@tcp"
1785 ip2nets_str="tcp(${INTERFACES[0]}) $if0_ip; tcp1($FAKE_IF) $FAKE_IP"
1786 echo "Configure two NIDs; two NETs \"$ip2nets_str\""
1787 test_208_load_and_check_lnet "${ip2nets_str}" "${if0_ip}@tcp" \
1790 ip2nets_str="tcp(${INTERFACES[0]}) $if0_ip; tcp($FAKE_IF) $FAKE_IP"
1791 echo "Configure two NIDs; one NET \"$ip2nets_str\""
1792 test_208_load_and_check_lnet "${ip2nets_str}" "${if0_ip}@tcp" \
# Split both addresses on '.' and build one "[a,b]" alternative per octet.
1794 local addr1=( ${if0_ip//./ } )
1795 local addr2=( ${FAKE_IP//./ } )
1796 local range="[${addr1[0]},${addr2[0]}]"
1799 for i in $(seq 1 3); do
1800 range+=".[${addr1[$i]},${addr2[$i]}]"
1802 ip2nets_str="tcp(${INTERFACES[0]},${FAKE_IF}) ${range}"
1804 echo "Configured two NIDs; one NET alt syntax \"$ip2nets_str\""
1805 test_208_load_and_check_lnet "${ip2nets_str}" "${if0_ip}@tcp" \
1810 echo "alt syntax with missing IF \"$ip2nets_str\""
1811 load_lnet "networks=\"\" ip2nets=\"${ip2nets_str}\""
1813 echo "$LCTL net up should fail"
1815 error "LNet bring up should have failed"
1819 run_test 208 "Test various kernel ip2nets configurations"
# Test 209 body (function header elided from this listing): simulate
# network_timeout during discovery of ${RNIDS[0]}. After setup_health_test
# "false" nothing may change (no resends, no local or remote health change);
# after setup_health_test "true" local and remote health must change, still
# without resends.
# NOTE(review): the lines between each failed discovery and the checks are
# elided from this listing.
1822 setup_health_test false || return $?
1824 echo "Simulate network_timeout w/SR config"
1827 add_health_test_drop_rules network_timeout
1829 do_lnetctl discover ${RNIDS[0]} &&
1830 error "Should have failed"
1834 check_no_resends || return $?
1835 check_no_local_health || return $?
1836 check_no_remote_health || return $?
1838 cleanup_health_test || return $?
1840 setup_health_test true || return $?
1842 echo "Simulate network_timeout w/MR config"
1846 add_health_test_drop_rules network_timeout
1848 do_lnetctl discover ${RNIDS[0]} &&
1849 error "Should have failed"
1853 check_no_resends || return $?
1854 check_local_health || return $?
1855 check_remote_health || return $?
1857 cleanup_health_test || return $?
1861 run_test 209 "Check health, but not resends, for network timeout"
# Check the output of "lnetctl debug recovery $1".
# Arguments: $1 - option handed to "lnetctl debug recovery" (callers in this
#                 file use "-l" and "-p")
#            $2 - presumably the expected number of "nid-" entries ($expect);
#                 the assignment is elided from this listing — TODO confirm
# Calls error() when the number of "nid-" lines differs from $expect. When
# $expect is non-zero, the local NIDs are then looked up in "nid-0:" entries.
1863 check_nid_in_recovq() {
1864 local recovq=$($LNETCTL debug recovery $1)
1866 local nids=$($LCTL list_nids | xargs echo)
1870 echo "Check \"$1\" recovery queue"
1872 if [[ $(grep -c 'nid-'<<<$recovq) -ne $expect ]]; then
1873 error "Expect $expect NIDs found: \"$recovq\""
# Nothing is expected in the queue, so there is no NID to look for.
1876 [[ $expect -eq 0 ]] && return 0
1878 for nid in ${nids}; do
1879 grep -q "nid-0: $nid"<<<$recovq &&
1884 error "Didn't find local NIDs in recovery queue: \"$recovq\""
1890 # First enqueue happens at time 0.
1891 # 2nd at 0 + 2^0 = 1
1892 # 3rd at 1 + 2^1 = 3
1893 # 4th at 3 + 2^2 = 7
1894 # 5th at 7 + 2^3 = 15
1895 # e.g. after 10 seconds we would expect to have seen the 4th enqueue,
1896 # (3 pings sent, 4th about to happen) and the 5th enqueue is yet to
1898 # If the recovery limit is 10 seconds, then when the 5th enqueue happens
1899 # we expect the peer NI to have aged out, so it will not actually be
1901 # If max_recovery_ping_interval is set to 4 then:
1902 # First enqueue happens at time 0.
1903 # 2nd at 0 + min(2^0, 4) = 1
1904 # 3rd at 1 + min(2^1, 4) = 3
1905 # 4th at 3 + min(2^2, 4) = 7
1906 # 5th at 7 + min(2^3, 4) = 11
1907 # 6th at 11 + min(2^4, 4) = 15
1908 # 7th at 15 + min(2^5, 4) = 19
1909 # e.g. after 4 seconds we would expect to have seen the 3rd enqueue,
1910 # (2 pings sent, 3rd about to happen), and the 4th enqueue is yet to happen
1911 # e.g. after 13 seconds we would expect to have seen the 5th enqueue,
1912 # (4 pings sent, 5th about to happen), and the 6th enqueue is yet to happen
# Compare the ping_count values reported by lnetctl with an expected count.
# $queue selects the source: "ni" reads "net show -v 2", "peer_ni" reads
# "peer show -v 2"; any other value is an error.
# A count equal to $expect is accepted, but a second match is an error unless
# $expect is 0; any other non-zero count is an error.
# NOTE(review): the assignments of queue, expect and found are elided from
# this listing; callers pass the queue name first and the count second —
# TODO confirm.
1913 check_ping_count() {
1917 echo "Check ping counts:"
1919 if [[ $queue == "ni" ]]; then
1920 $LNETCTL net show -v 2 | egrep 'nid|health value|ping'
1921 ping_count=( $($LNETCTL net show -v 2 |
1922 awk '/ping_count/{print $NF}') )
1923 elif [[ $queue == "peer_ni" ]]; then
1924 $LNETCTL peer show -v 2 | egrep 'nid|health value|ping'
1925 ping_count=( $($LNETCTL peer show -v 2 |
1926 awk '/ping_count/{print $NF}') )
1928 error "Unrecognized queue \"$queue\""
1934 for count in "${ping_count[@]}"; do
1935 if [[ $count -eq $expect ]]; then
1936 if [[ $expect -ne 0 ]] && $found ; then
1937 error "Found more than one interface matching \"$expect\" ping count"
1940 echo "Expect ping count \"$expect\" found \"$count\""
1943 elif [[ $count -ne 0 ]]; then
1944 error "Found interface with ping count \"$count\" but expect \"$expect\""
# Test 210 body (function header elided from this listing): local NI recovery.
# Two nets are configured on ${INTERFACES[0]}, discovery of the primary NID is
# made to fail with local_error drop rules, and the local recovery queue and
# NI ping counts are checked twice. The scenario runs once with recovery_limit
# set to 10 and once with max_recovery_ping_interval set to 4; each parameter
# is set back to the value read before it was changed.
# NOTE(review): lines between the checks are elided from this listing.
# NOTE(review): the "*@${NETTYPE}" patterns given to net_drop_add are
# unquoted, so the shell would expand them if a matching file name exists in
# the current directory; test 212 quotes the same kind of pattern.
1953 [[ ${NETTYPE} == kfi* ]] && skip "kfi doesn't support drop rules"
1955 reinit_dlc || return $?
1956 add_net "${NETTYPE}" "${INTERFACES[0]}" || return $?
1957 add_net "${NETTYPE}1" "${INTERFACES[0]}" || return $?
1959 local prim_nid=$($LCTL list_nids | head -n 1)
1961 do_lnetctl discover $prim_nid ||
1962 error "failed to discover myself"
# Remember the current recovery_limit so it can be put back later.
1964 local default=$($LNETCTL global show |
1965 awk '/recovery_limit/{print $NF}')
1966 # Set recovery limit to 10 seconds.
1967 do_lnetctl set recovery_limit 10 ||
1968 error "failed to set recovery_limit"
1970 $LCTL set_param debug=+net
1971 # Use local_error so LNet doesn't attempt to resend the discovery ping
1972 $LCTL net_drop_add -s *@${NETTYPE} -d *@${NETTYPE} -m GET -r 1 -e local_error
1973 $LCTL net_drop_add -s *@${NETTYPE}1 -d *@${NETTYPE}1 -m GET -r 1 -e local_error
1974 do_lnetctl discover $prim_nid &&
1975 error "Expected discovery to fail"
1977 # See comment for check_ping_count()
1979 check_nid_in_recovq "-l" "1"
1980 check_ping_count "ni" "2"
1984 check_nid_in_recovq "-l" "1"
1985 check_ping_count "ni" "3"
1987 $LCTL net_drop_del -a
1989 reinit_dlc || return $?
1990 add_net "${NETTYPE}" "${INTERFACES[0]}" || return $?
1991 add_net "${NETTYPE}1" "${INTERFACES[0]}" || return $?
1993 local prim_nid=$($LCTL list_nids | head -n 1)
1995 do_lnetctl discover $prim_nid ||
1996 error "failed to discover myself"
1998 do_lnetctl set recovery_limit $default ||
1999 error "failed to set recovery_limit"
# "default" is reused for the current max_recovery_ping_interval.
2001 default=$($LNETCTL global show |
2002 awk '/max_recovery_ping_interval/{print $NF}')
2003 do_lnetctl set max_recovery_ping_interval 4 ||
2004 error "failed to set max_recovery_ping_interval"
2006 $LCTL set_param debug=+net
2007 # Use local_error so LNet doesn't attempt to resend the discovery ping
2008 $LCTL net_drop_add -s *@${NETTYPE} -d *@${NETTYPE} -m GET -r 1 -e local_error
2009 $LCTL net_drop_add -s *@${NETTYPE}1 -d *@${NETTYPE}1 -m GET -r 1 -e local_error
2010 do_lnetctl discover $prim_nid &&
2011 error "Expected discovery to fail"
2013 # See comment for check_ping_count()
2015 check_nid_in_recovq "-l" "1"
2016 check_ping_count "ni" "2"
2019 check_nid_in_recovq "-l" "1"
2020 check_ping_count "ni" "4"
2022 $LCTL net_drop_del -a
2024 do_lnetctl set max_recovery_ping_interval $default ||
2025 error "failed to set max_recovery_ping_interval"
2029 run_test 210 "Local NI recovery checks"
# Test 211 body (function header elided from this listing): remote (peer) NI
# recovery. Two nets are configured on ${INTERFACES[0]}, remote_error drop
# rules are added, and the health of the primary NID's peer entry is lowered
# so that it lands on the peer recovery queue. The queue and the peer NI ping
# counts are then checked, first with recovery_limit 10, then with
# recovery_limit 0, and finally with max_recovery_ping_interval 4.
# NOTE(review): lines between the checks are elided from this listing.
# NOTE(review): the "*@${NETTYPE}" patterns given to net_drop_add are
# unquoted, so the shell would expand them if a matching file name exists in
# the current directory; test 212 quotes the same kind of pattern.
2032 [[ ${NETTYPE} == kfi* ]] && skip "kfi doesn't support drop rules"
2034 reinit_dlc || return $?
2035 add_net "${NETTYPE}" "${INTERFACES[0]}" || return $?
2036 add_net "${NETTYPE}1" "${INTERFACES[0]}" || return $?
2038 local prim_nid=$($LCTL list_nids | head -n 1)
2040 do_lnetctl discover $prim_nid ||
2041 error "failed to discover myself"
# Remember the current recovery_limit so it can be put back later.
2043 local default=$($LNETCTL global show |
2044 awk '/recovery_limit/{print $NF}')
2045 # Set recovery limit to 10 seconds.
2046 do_lnetctl set recovery_limit 10 ||
2047 error "failed to set recovery_limit"
2049 $LCTL net_drop_add -s *@${NETTYPE} -d *@${NETTYPE} -m GET -r 1 -e remote_error
2050 $LCTL net_drop_add -s *@${NETTYPE}1 -d *@${NETTYPE}1 -m GET -r 1 -e remote_error
2052 # Set health to 0 on one interface. This forces it onto the recovery
2054 $LNETCTL peer set --nid $prim_nid --health 0
2056 # After 5 seconds, we expect the peer NI to still be in recovery
2058 check_nid_in_recovq "-p" 1
2059 check_ping_count "peer_ni" "2"
2061 # After 15 seconds, the peer NI should have been fully processed out of
2062 # the recovery queue. We'll allow a total of 17 seconds to account for
2063 # differences in sleeping for whole seconds vs. the more accurate time
2064 # keeping that is done in the recovery code.
2066 check_nid_in_recovq "-p" 0
2067 check_ping_count "peer_ni" "4"
2069 $LCTL net_drop_del -a
2071 # Set health to force it back onto the recovery queue. Set to 500 means
2072 # in 5 seconds it should be back at maximum value. We'll wait a couple
2073 # more seconds than that to be safe.
2074 # NB: we reset the recovery limit to 0 (indefinite) so the peer NI is
2076 do_lnetctl set recovery_limit 0 ||
2077 error "failed to set recovery_limit"
2079 $LNETCTL peer set --nid $prim_nid --health 500
2081 check_nid_in_recovq "-p" 1
2082 check_ping_count "peer_ni" "2"
2086 check_nid_in_recovq "-p" 0
2087 check_ping_count "peer_ni" "0"
2089 reinit_dlc || return $?
2090 add_net "${NETTYPE}" "${INTERFACES[0]}" || return $?
2091 add_net "${NETTYPE}1" "${INTERFACES[0]}" || return $?
2093 local prim_nid=$($LCTL list_nids | head -n 1)
2095 do_lnetctl discover $prim_nid ||
2096 error "failed to discover myself"
2098 do_lnetctl set recovery_limit $default ||
2099 error "failed to set recovery_limit"
# "default" is reused for the current max_recovery_ping_interval.
2101 default=$($LNETCTL global show |
2102 awk '/max_recovery_ping_interval/{print $NF}')
2103 do_lnetctl set max_recovery_ping_interval 4 ||
2104 error "failed to set max_recovery_ping_interval"
2106 $LCTL net_drop_add -s *@${NETTYPE} -d *@${NETTYPE} -m GET -r 1 -e remote_error
2107 $LCTL net_drop_add -s *@${NETTYPE}1 -d *@${NETTYPE}1 -m GET -r 1 -e remote_error
2109 # Set health to 0 on one interface. This forces it onto the recovery
2111 $LNETCTL peer set --nid $prim_nid --health 0
2113 # See comment for check_ping_count()
2115 check_nid_in_recovq "-p" "1"
2116 check_ping_count "peer_ni" "2"
2119 check_nid_in_recovq "-p" "1"
2120 check_ping_count "peer_ni" "4"
2122 $LCTL net_drop_del -a
2124 do_lnetctl set max_recovery_ping_interval $default ||
2125 error "failed to set max_recovery_ping_interval"
2129 run_test 211 "Remote NI recovery checks"
# Test 212 body (function header elided from this listing): reproduce the
# discovery sequence described in the comments below. The local host and the
# first remote node discover each other, a failed local discovery is forced,
# then delay and drop rules are installed per remote NID so that a
# backgrounded local discovery overlaps with a discovery started from the
# remote node. Finally all rules are removed and the modules are unloaded on
# both nodes.
# NOTE(review): several lines (including the capture of $pid1 and the wait on
# it) are elided from this listing.
2132 [[ ${NETTYPE} == kfi* ]] && skip "kfi doesn't support drop rules"
2134 local rnodes=$(remote_nodes_list)
2135 [[ -z $rnodes ]] && skip "Need at least 1 remote node"
2137 cleanup_lnet || error "Failed to cleanup before test execution"
2139 # Loading modules should configure LNet with the appropriate
2140 # test-framework configuration
2141 load_lnet "config_on_load=1" || error "Failed to load modules"
2143 local my_nid=$($LCTL list_nids | head -n 1)
2145 error "Failed to get primary NID for local host $HOSTNAME"
2147 local rnode=$(awk '{print $1}' <<<$rnodes)
2148 local rnodenids=$(do_node $rnode $LCTL list_nids | xargs echo)
# No NIDs reported by the remote node: load LNet there and ask again.
2151 if [[ -z $rnodenids ]]; then
2152 do_rpc_nodes $rnode load_lnet "config_on_load=1"
2154 rnodenids=$(do_node $rnode $LCTL list_nids | xargs echo)
# The first NID in the list is used as the remote node's primary NID.
2157 local rnodepnid=$(awk '{print $1}' <<< $rnodenids)
2159 [[ -z $rnodepnid ]] &&
2160 error "Failed to get primary NID for remote host $rnode"
2162 log "Initial discovery"
2163 do_lnetctl discover --force $rnodepnid ||
2164 error "Failed to discover $rnodepnid"
2166 do_node $rnode "$LNETCTL discover --force $my_nid" ||
2167 error "$rnode failed to discover $my_nid"
2169 log "Fail local discover ping to set LNET_PEER_REDISCOVER flag"
2170 $LCTL net_drop_add -s "*@$NETTYPE" -d "*@$NETTYPE" -r 1 -e local_error
2171 do_lnetctl discover --force $rnodepnid &&
2172 error "Discovery should have failed"
2173 $LCTL net_drop_del -a
2176 for nid in $rnodenids; do
2177 # We need GET (PING) delay just long enough so we can trigger
2178 # discovery on the remote peer
2179 $LCTL net_delay_add -s "*@$NETTYPE" -d $nid -r 1 -m GET -l 3
2180 $LCTL net_drop_add -s "*@$NETTYPE" -d $nid -r 1 -m GET -e local_error
2181 # We need PUT (PUSH) delay just long enough so we can process
2183 $LCTL net_delay_add -s "*@$NETTYPE" -d $nid -r 1 -m PUT -l 6
2186 log "Force $HOSTNAME to discover $rnodepnid (in background)"
2187 # We want to get a PING sent that we know will eventually fail.
2188 # The delay rules we added will ensure the ping is not sent until
2189 # the PUSH is also in flight (see below), and the drop rule ensures that
2190 # when the PING is eventually sent it will error out
2191 do_lnetctl discover --force $rnodepnid &
2194 # We want a discovery PUSH from rnode to put rnode back on our
2195 # discovery queue. This should cause us to try and send a PUSH to rnode
2196 # while the PING is still outstanding.
2197 log "Force $rnode to discover $my_nid"
2198 do_node $rnode $LNETCTL discover --force $my_nid
2200 # At this point we'll have both PING_SENT and PUSH_SENT set for the
2201 # rnode peer. Wait for the PING to error out which should terminate the
2202 # discovery process that we backgrounded.
2203 log "Wait for $pid1"
2205 log "Finished wait on $pid1"
2207 # The PING send failure clears the PING_SENT flag and puts the peer back
2208 # on the discovery queue. When discovery thread processes the peer it
2209 # will mistakenly clear the PUSH_SENT flag (and set PUSH_FAILED).
2210 # Discovery will then complete for this peer even though we have an
2212 # When PUSH is actually unlinked it will be forced back onto the
2213 # discovery queue, but we no longer have a ref on the peer. When
2214 # discovery completes again, we'll trip the ASSERT in
2215 # lnet_destroy_peer_locked()
2217 # Delete the delay rules to send the PUSH
2218 $LCTL net_delay_del -a
2219 # Delete the drop rules
2220 $LCTL net_drop_del -a
2223 error "Failed to unload modules"
2225 do_rpc_nodes $rnode unload_modules_local ||
2226 error "Failed to unload modules on $rnode"
2231 run_test 212 "Check discovery refcount loss bug (LU-14627)"
# Test 213 body (function header elided from this listing): tcp only. Puts
# ${INTERFACES[0]} and the fake interface on the same "tcp" net and checks
# that "lctl which_nid" prints the first NID it is given, for both orderings
# of the two local NIDs.
2234 [[ ${NETTYPE} == tcp* ]] || skip "Need tcp NETTYPE"
2236 cleanup_netns || error "Failed to cleanup netns before test execution"
2237 cleanup_lnet || error "Failed to unload modules before test execution"
2239 setup_fakeif || error "Failed to add fake IF"
2240 have_interface "$FAKE_IF" ||
2241 error "Expect $FAKE_IF configured but not found"
2243 reinit_dlc || return $?
2245 add_net "tcp" "${INTERFACES[0]}" || return $?
2246 add_net "tcp" "$FAKE_IF" || return $?
# First and last line of list_nids are taken as the two local NIDs.
2248 local nid1=$(lctl list_nids | head -n 1)
2249 local nid2=$(lctl list_nids | tail --lines 1)
2251 [[ $(lctl which_nid $nid1 $nid2) == $nid1 ]] ||
2252 error "Expect nid1 \"$nid1\" to be preferred"
2254 [[ $(lctl which_nid $nid2 $nid1) == $nid2 ]] ||
2255 error "Expect nid2 \"$nid2\" to be preferred"
2259 run_test 213 "Check LNetDist calculation for multiple local NIDs"
# Compare the status that "lnetctl net show" reports for a NID with the
# expected status, print both, and call error() when they differ.
# NOTE(review): the assignments of nid and expect and the middle of the
# pipeline are elided from this listing; callers pass the NID first and the
# expected status second — TODO confirm.
2261 function check_ni_status() {
2265 local status=$($LNETCTL net show |
2267 awk '/status/{print $NF}')
2269 echo "NI ${nid} expect status \"${expect}\" found \"${status}\""
2270 if [[ $status != $expect ]]; then
2271 error "Error: Expect NI status \"$expect\" for NID \"$nid\" but found \"$status\""
# Test 214 body (function header elided from this listing): tcp only. Puts
# ${INTERFACES[0]} and the fake interface on the "tcp" net, expects 0@lo and
# both NIDs to report "up", pings nid1 from nid2, then sets the fake
# interface's link down and expects only nid2 to report "down".
# NOTE(review): lines after the last status check are elided from this
# listing.
2278 [[ ${NETTYPE} == tcp* ]] || skip "Need tcp NETTYPE"
2280 cleanup_netns || error "Failed to cleanup netns before test execution"
2281 cleanup_lnet || error "Failed to unload modules before test execution"
2283 setup_fakeif || error "Failed to add fake IF"
2284 have_interface "$FAKE_IF" ||
2285 error "Expect $FAKE_IF configured but not found"
2287 reinit_dlc || return $?
2289 add_net "tcp" "${INTERFACES[0]}" || return $?
2290 add_net "tcp" "$FAKE_IF" || return $?
# First and last line of list_nids are taken as the two local NIDs.
2292 local nid1=$(lctl list_nids | head -n 1)
2293 local nid2=$(lctl list_nids | tail --lines 1)
2295 check_ni_status "0@lo" up
2296 check_ni_status "$nid1" up
2297 check_ni_status "$nid2" up
2299 do_lnetctl ping --source $nid2 $nid1 ||
2300 error "$LNETCTL ping --source $nid2 $nid1 failed"
2302 echo "Set $FAKE_IF down"
2303 echo "ip link set dev $FAKE_IF down"
2304 ip link set dev $FAKE_IF down
2305 check_ni_status "0@lo" up
2306 check_ni_status "$nid1" up
2307 check_ni_status "$nid2" down
2309 run_test 214 "Check local NI status when link is downed"
# Visible part of a statistic lookup helper (header elided from this listing;
# presumably get_ni_stat, which later lines call with a NID and a statistic
# name — TODO confirm). Prints the last field of the "$stat:" line found in
# "lnetctl net show -v 2" output.
# NOTE(review): $stat is spliced into both the egrep pattern and the awk
# program text; a stage between egrep and awk is elided from this listing.
2315 $LNETCTL net show -v 2 |
2316 egrep -e nid -e $stat |
2318 awk '/'$stat':/{print $NF}'
2323 for nidvar in nid1 nid2; do
2324 for stat in send_count recv_count; do
2325 s=$(get_ni_stat ${!nidvar} $stat)
2326 eval ${nidvar}_pre_${stat}=$s
2333 for nidvar in nid1 nid2; do
2334 for stat in send_count recv_count; do
2335 s=$(get_ni_stat ${!nidvar} $stat)
2336 eval ${nidvar}_post_${stat}=$s
# Visible part of a comparison helper (header elided from this listing;
# presumably ni_stat_changed, which later lines call with a NID variable name
# and a statistic name — TODO confirm). Reads the variables named
# "<nidvar>_pre_<stat>" and "<nidvar>_post_<stat>" through eval, prints both,
# and evaluates to true when the two values differ.
2346 eval pre=\${${nidvar}_pre_${stat}}
2347 eval post=\${${nidvar}_post_${stat}}
2349 echo "${!nidvar} pre ${stat} $pre post ${stat} $post"
2351 [[ $pre -ne $post ]]
# Test 215 body (function header elided from this listing): exercise
# "lnetctl ping --source". Two nets are added on ${INTERFACES[0]} and the
# second NID is added to the peer whose primary NID is the first. Pings sent
# without --source must change send_count and recv_count on both NIs; pings
# sent with --source must change them only on the source NI. At the end,
# pings with --source across the two NIDs are expected to fail.
# NOTE(review): the lines that set src, dst and npings, take the statistic
# snapshots, and whatever precedes the final cross-NID pings are elided from
# this listing.
2355 cleanup_netns || error "Failed to cleanup netns before test execution"
2356 cleanup_lnet || error "Failed to unload modules before test execution"
2358 reinit_dlc || return $?
2360 add_net "${NETTYPE}1" "${INTERFACES[0]}" || return $?
2361 add_net "${NETTYPE}2" "${INTERFACES[0]}" || return $?
2363 local nid1=$($LCTL list_nids | head -n 1)
2364 local nid2=$($LCTL list_nids | tail --lines 1)
2366 do_lnetctl peer add --prim $nid1 --nid $nid2 ||
2367 error "Failed to add peer"
2371 for nidvarA in nid1 nid2; do
2374 for nidvarB in nid1 nid2; do
2375 [[ $nidvarA == $nidvarB ]] && continue
2379 echo "$LNETCTL ping $dst x $npings"
2380 for i in $(seq 1 $npings); do
2381 $LNETCTL ping $dst &>/dev/null ||
2382 error "$LNETCTL ping $dst failed"
2387 # No source specified, sends to either NID should cause
2388 # counts to increase across both NIs
2389 for nidvar in nid1 nid2; do
2390 for stat in send_count recv_count; do
2391 ni_stat_changed $nidvar $stat ||
2392 error "$stat unchanged for ${!nidvar}"
2398 echo "$LNETCTL ping --source $src $dst x $npings"
2399 for i in $(seq 1 $npings); do
2400 $LNETCTL ping --source $src $dst &>/dev/null ||
2401 error "$LNETCTL ping --source $src $dst failed"
2406 # src nid == dest nid means stats for the _other_ NI
2407 # should be unchanged
2408 for nidvar in nid1 nid2; do
2409 for stat in send_count recv_count; do
2410 if [[ ${!nidvar} == $src ]]; then
2411 ni_stat_changed $nidvar $stat ||
2412 error "$stat unchanged for ${!nidvar}"
2414 ni_stat_changed $nidvar $stat &&
2415 error "$stat changed for ${!nidvar}"
2420 # Double number of pings for next iteration because the net
2421 # sequence numbers will have diverged
2422 npings=$(($npings * 2))
2425 # Ping from nid1 to nid2 should fail
2426 do_lnetctl ping --source $nid1 $nid2 &&
2427 error "ping from $nid1 to $nid2 should fail"
2429 # Ping from nid2 to nid1 should fail
2430 do_lnetctl ping --source $nid2 $nid1 &&
2431 error "ping from $nid2 to $nid1 should fail"
2435 run_test 215 "Test lnetctl ping --source option"
# Test 216 body (function header elided from this listing): two nets are
# added on ${INTERFACES[0]} and both local NIDs are pinged successfully. Then
# network_timeout drop rules are added for every pair of local NIDs and a
# ping of the first NID is expected to fail. Afterwards the peer recovery
# queue must be empty and the local recovery queue must hold one NID.
# NOTE(review): the initialisation of $rc and the condition guarding
# "expected ping to fail" are elided from this listing.
2438 [[ ${NETTYPE} == kfi* ]] && skip "kfi doesn't support drop rules"
2442 reinit_dlc || return $?
2444 add_net "${NETTYPE}" "${INTERFACES[0]}" || return $?
2445 add_net "${NETTYPE}1" "${INTERFACES[0]}" || return $?
2447 local nids=( $($LCTL list_nids | xargs echo) )
2449 do_lnetctl discover ${nids[0]} ||
2450 error "Initial discovery failed"
2452 do_lnetctl ping --source ${nids[0]} ${nids[0]} ||
2453 error "Initial ping failed $?"
2455 do_lnetctl ping --source ${nids[1]} ${nids[1]} ||
2456 error "Initial ping failed $?"
2459 for src in "${nids[@]}"; do
2460 for dst in "${nids[@]}"; do
2461 $LCTL net_drop_add -r 1 -s $src -d $dst -e network_timeout
# Keep the ping's exit status in $rc; the rules are removed before it is
# evaluated.
2465 do_lnetctl ping ${nids[0]} || rc=$?
2467 $LCTL net_drop_del -a
2470 error "expected ping to fail"
2472 check_nid_in_recovq "-p" 0
2473 check_nid_in_recovq "-l" 1
2477 run_test 216 "Failed send to peer NI owned by local host should not trigger peer NI recovery"
# Test 217 body (function header elided from this listing): after reinit_dlc
# exactly one "nid" line is expected in "lnetctl net show"; 0@lo is then
# discovered.
# NOTE(review): lines after the discovery are elided from this listing.
2480 reinit_dlc || return $?
2482 [[ $($LNETCTL net show | grep -c nid) -ne 1 ]] &&
2483 error "Unexpected number of NIs after initalizing DLC"
2485 do_lnetctl discover 0@lo ||
2486 error "Failed to discover 0@lo"
2490 run_test 217 "Don't leak memory when discovering peer with nnis <= 1"
# Test 218 body (function header elided from this listing): needs two LNet
# interfaces. Both are added to the same net and pinged, a local_error drop
# rule is added for nid1 -> nid1, and a ping from nid1 to itself is expected
# to fail. The test then polls (up to 5 iterations) for two NIs reporting
# "health value: 1000" and fails if that is not reached.
# NOTE(review): the error branches of the two initial pings and the body of
# the wait loop are partly elided from this listing.
2493 [[ ${NETTYPE} == kfi* ]] && skip "kfi doesn't support drop rules"
2495 reinit_dlc || return $?
2497 [[ ${#INTERFACES[@]} -lt 2 ]] &&
2498 skip "Need two LNet interfaces"
2500 add_net "${NETTYPE}" "${INTERFACES[0]}" || return $?
2502 local nid1=$($LCTL list_nids | head -n 1)
2504 do_lnetctl ping $nid1 ||
2507 add_net "${NETTYPE}" "${INTERFACES[1]}" || return $?
2509 local nid2=$($LCTL list_nids | tail --lines 1)
2511 do_lnetctl ping $nid2 ||
2514 $LCTL net_drop_add -s $nid1 -d $nid1 -e local_error -r 1
2516 do_lnetctl ping --source $nid1 $nid1 &&
2517 error "ping should have failed"
2519 local health_recovered
2522 for i in $(seq 1 5); do
2523 health_recovered=$($LNETCTL net show -v 2 |
2524 grep -c 'health value: 1000')
2526 if [[ $health_recovered -ne 2 ]]; then
2527 echo "Wait 1 second for health to recover"
2534 health_recovered=$($LNETCTL net show -v 2 |
2535 grep -c 'health value: 1000')
2537 $LCTL net_drop_del -a
# NOTE(review): error() is only reached when the egrep in the middle of this
# && chain matches at least one line.
2539 [[ $health_recovered -ne 2 ]] &&
2540 do_lnetctl net show -v 2 | egrep -e nid -e health &&
2541 error "Health hasn't recovered"
2545 run_test 218 "Local recovery pings should exercise all available paths"
# Test 219 body (function header elided from this listing): two nets are
# added on ${INTERFACES[0]}, both local NIDs are pinged and nid2 is
# discovered; nid2 must then appear in "lnetctl peer show --nid $nid1".
2548 reinit_dlc || return $?
2549 add_net "${NETTYPE}" "${INTERFACES[0]}" || return $?
2550 add_net "${NETTYPE}1" "${INTERFACES[0]}" || return $?
# First and last line of list_nids are taken as the two local NIDs.
2552 local nid1=$(lctl list_nids | head -n 1)
2553 local nid2=$(lctl list_nids | tail --lines 1)
2555 do_lnetctl ping $nid1 ||
2556 error "Ping failed $?"
2557 do_lnetctl ping $nid2 ||
2558 error "Ping failed $?"
2560 do_lnetctl discover $nid2 ||
2561 error "Discovery failed"
2563 $LNETCTL peer show --nid $nid1 | grep -q $nid2 ||
2564 error "$nid2 is not listed under $nid1"
2566 run_test 219 "Consolidate peer entries"
# Visible part of a helper (header elided from this listing; presumably
# do_net_add, which later lines call as "do_net_add <node> <net> <if>
# [opts]" — TODO confirm). Runs "lnetctl net add" for $net/$if with $opts on
# $node and calls error() on failure.
2574 do_rpc_nodes $node "$LNETCTL net add --net $net --if $if $opts" ||
2575 error "add $net on interface $if on node $node failed rc=$?"
2583 do_node $node "$LNETCTL route add --net $net --gateway $gw" ||
2584 error "route add to $net via $gw failed rc=$?"
2588 ROUTER_INTERFACES=()
# Pick the router and remote peer for the routing tests and record their LNet
# interfaces.
# Sets the globals ROUTER and RPEER (first and second remote node) and
# ROUTER_INTERFACES and RPEER_INTERFACES (output of lnet_if_list on each
# node). Skips when fewer than two remote nodes are available and errors out
# when any of the three hosts has no interface.
# NOTE(review): some lines of this function are elided from this listing.
2591 init_router_test_vars() {
2592 local rnodes=$(remote_nodes_list)
2593 [[ -z $rnodes || $(wc -w <<<$rnodes) -lt 2 ]] &&
2594 skip "Need at least 2 remote nodes found \"$rnodes\""
2596 ROUTER=$(awk '{print $1}' <<<$rnodes)
2597 RPEER=$(awk '{print $2}' <<<$rnodes)
# From here on only the two selected nodes are used.
2599 rnodes=$(comma_list $ROUTER $RPEER)
2600 local all_nodes=$(comma_list $rnodes $HOSTNAME)
2602 do_nodes $rnodes $LUSTRE_RMMOD ||
2603 error "failed to unload modules"
2605 do_rpc_nodes $rnodes "load_lnet config_on_load=1" ||
2606 error "Failed to load and configure LNet"
2608 ROUTER_INTERFACES=( $(do_rpc_nodes --quiet $ROUTER lnet_if_list) )
2610 RPEER_INTERFACES=( $(do_rpc_nodes --quiet $RPEER lnet_if_list) )
2612 do_nodes $all_nodes $LUSTRE_RMMOD ||
2613 error "Failed to unload modules"
2615 [[ ${#INTERFACES[@]} -eq 0 ]] &&
2616 error "No interfaces configured for local host $HOSTNAME"
2617 [[ ${#ROUTER_INTERFACES[@]} -eq 0 ]] &&
2618 error "No interfaces configured for router $ROUTER"
2619 [[ ${#RPEER_INTERFACES[@]} -eq 0 ]] &&
2620 error "No interfaces configured for remote peer $RPEER"
# Net names used by the routing tests: the local host is put on LOCAL_NET, the
# remote peer on REMOTE_NET, and the router on both (see setup_router_test).
2628 LOCAL_NET=${NETTYPE}1
2629 REMOTE_NET=${NETTYPE}2
# Build the routed test topology.
# Reloads LNet on the router, the remote peer and the local host with the
# module options assembled below, initialises DLC on all three, adds the nets
# to each node, and records the resulting NIDs in ROUTER_NIDS, RPEER_NIDS and
# LNIDS.
# Arguments: $2 - extra options for the router's LOCAL_NET "net add"
# NOTE(review): the initial value of mod_opts (and whatever uses $1), the
# error branches of the net additions and the end of the function are elided
# from this listing.
2630 setup_router_test() {
2632 local rtr_net_opts="$2"
2634 (( $MDS1_VERSION >= $(version_code 2.15.0) )) ||
2635 skip "need at least 2.15.0 for load_lnet"
# RPEER_INTERFACES is still empty: the node variables have to be initialised.
2637 if [[ ${#RPEER_INTERFACES[@]} -eq 0 ]]; then
2638 init_router_test_vars ||
2642 local all_nodes=$(comma_list $ROUTER $RPEER $HOSTNAME)
2644 do_nodes $all_nodes $LUSTRE_RMMOD ||
2645 error "failed to unload modules"
2647 mod_opts+=" alive_router_check_interval=5"
2648 mod_opts+=" router_ping_timeout=5"
2649 mod_opts+=" large_router_buffers=4"
2650 mod_opts+=" small_router_buffers=8"
2651 mod_opts+=" tiny_router_buffers=16"
2652 do_rpc_nodes $all_nodes load_lnet "${mod_opts}" ||
2653 error "Failed to load lnet"
2655 do_nodes $all_nodes "$LNETCTL lnet configure" ||
2656 error "Failed to initialize DLC"
2658 do_net_add $ROUTER $LOCAL_NET ${ROUTER_INTERFACES[0]} $rtr_net_opts ||
2661 do_net_add $ROUTER $REMOTE_NET ${ROUTER_INTERFACES[0]} ||
2664 do_net_add $RPEER $REMOTE_NET ${RPEER_INTERFACES[0]} ||
2667 add_net $LOCAL_NET ${INTERFACES[0]} ||
2670 ROUTER_NIDS=( $(do_node $ROUTER $LCTL list_nids 2>/dev/null |
2672 RPEER_NIDS=( $(do_node $RPEER $LCTL list_nids 2>/dev/null |
2674 LNIDS=( $($LCTL list_nids 2>/dev/null | xargs echo) )
2682 do_nodesv $node "if $LNETCTL route show --net $net --gateway $gw; then \
2683 $LNETCTL route del --net $net --gateway $gw; \
# Tear down the routed topology: delete the $REMOTE_NET route on the local
# host and the $LOCAL_NET route on $RPEER, then unload the modules on the
# local host, $ROUTER and $RPEER. Any failure is reported through error().
# Globals read: HOSTNAME, ROUTER, RPEER, LOCAL_NET, REMOTE_NET, ROUTER_NIDS
2689 cleanup_router_test() {
2690 local all_nodes=$(comma_list $HOSTNAME $ROUTER $RPEER)
2692 do_route_del $HOSTNAME $REMOTE_NET ${ROUTER_NIDS[0]} ||
2693 error "Failed to delete $REMOTE_NET route"
2695 do_route_del $RPEER $LOCAL_NET ${ROUTER_NIDS[1]} ||
2696 error "Failed to delete $LOCAL_NET route"
2698 do_nodes $all_nodes $LUSTRE_RMMOD ||
2699 error "failed to unload modules"
# Verify that the route state on a node matches an expected value, as
# reported both by "lctl show_route" (7th field) and by
# "lnetctl route show -v" (last field of the "state" line).
# Callers pass the node name and the expected state ("up" or "down").
# NOTE(review): $node and $expected are presumably taken from $1 and $2 on
# lines not shown here - confirm.
# The states are re-read for up to alive_router_check_interval iterations
# before a mismatch is reported through error().
2704 check_route_aliveness() {
2709 local lnetctl_actual
# The interval is read from the local host's module parameters, not $node.
2713 chk_intvl=$(cat /sys/module/lnet/parameters/alive_router_check_interval)
2715 lctl_actual=$(do_node $node $LCTL show_route | awk '{print $7}')
2716 lnetctl_actual=$(do_node $node $LNETCTL route show -v |
2717 awk '/state/{print $NF}')
2719 for ((i = 0; i < $chk_intvl; i++)); do
2720 if [[ $lctl_actual == $expected ]] &&
2721 [[ $lnetctl_actual == $expected ]]; then
2725 echo "wait 1s for route state change"
2728 lctl_actual=$(do_node $node $LCTL show_route | awk '{print $7}')
2729 lnetctl_actual=$(do_node $node $LNETCTL route show -v |
2730 awk '/state/{print $NF}')
# Final verdict after polling: both tools must agree with the expectation.
2733 [[ $lctl_actual != $expected ]] &&
2734 error "Wanted \"$expected\" lctl found \"$lctl_actual\""
2736 [[ $lnetctl_actual != $expected ]] &&
2737 error "Wanted \"$expected\" lnetctl found \"$lnetctl_actual\""
# Verify the status of the router's NIs on $LOCAL_NET and $REMOTE_NET, as
# reported by "lnetctl net show" run on $ROUTER.
# Arguments:
#   $1 - expected status of the $LOCAL_NET NI
#   $2 - expected status of the $REMOTE_NET NI
# The statuses are re-read for up to alive_router_check_interval +
# router_ping_timeout iterations; a final mismatch calls error().
2742 check_router_ni_status() {
2743 local expected_local="$1"
2744 local expected_remote="$2"
# Both limits come from the local host's LNet module parameters.
2752 chk_intvl=$(cat /sys/module/lnet/parameters/alive_router_check_interval)
2753 timeout=$(cat /sys/module/lnet/parameters/router_ping_timeout)
2755 actual_local=$(do_node $ROUTER "$LNETCTL net show --net $LOCAL_NET" |
2756 awk '/status/{print $NF}')
2757 actual_remote=$(do_node $ROUTER "$LNETCTL net show --net $REMOTE_NET" |
2758 awk '/status/{print $NF}')
2760 for ((i = 0; i < $((chk_intvl + timeout)); i++)); do
2761 if [[ $actual_local == $expected_local ]] &&
2762 [[ $actual_remote == $expected_remote ]]; then
2766 echo "wait 1s for NI state change"
2769 actual_local=$(do_node $ROUTER \
2770 "$LNETCTL net show --net $LOCAL_NET" |
2771 awk '/status/{print $NF}')
2772 actual_remote=$(do_node $ROUTER \
2773 "$LNETCTL net show --net $REMOTE_NET" |
2774 awk '/status/{print $NF}')
# Final verdict after polling.
2777 [[ $actual_local == $expected_local ]] ||
2778 error "$LOCAL_NET should be $expected_local"
2780 [[ $actual_remote == $expected_remote ]] ||
2781 error "$REMOTE_NET should be $expected_remote"
# Basic routing check shared by the router tests: enable routing on
# $ROUTER, add the $REMOTE_NET route on the local host and the $LOCAL_NET
# route on $RPEER, expect both routes to be "up", then ping across the
# router in both directions.
# Globals read: ROUTER, RPEER, HOSTNAME, LOCAL_NET, REMOTE_NET,
#               ROUTER_NIDS, RPEER_NIDS, LNIDS
2786 do_basic_rtr_test() {
2787 do_node $ROUTER "$LNETCTL set routing 1" ||
2788 error "Unable to enable routing on $ROUTER"
2790 do_route_add $HOSTNAME $REMOTE_NET ${ROUTER_NIDS[0]} ||
2793 do_route_add $RPEER $LOCAL_NET ${ROUTER_NIDS[1]} ||
2796 check_route_aliveness "$HOSTNAME" "up" ||
2799 check_route_aliveness "$RPEER" "up" ||
2802 do_lnetctl ping ${RPEER_NIDS[0]} ||
2803 error "Failed to ping ${RPEER_NIDS[0]}"
2805 do_node $RPEER "$LNETCTL ping ${LNIDS[0]}" ||
2806 error "$RPEER failed to ping ${LNIDS[0]}"
2812 setup_router_test || return $?
2814 do_basic_rtr_test || return $?
2816 do_rpc_nodes $HOSTNAME,$RPEER load_module ../lnet/selftest/lnet_selftest ||
2817 error "Failed to load lnet-selftest module"
2819 $LSTSH -H -t $HOSTNAME -f $RPEER -m rw -s 4k ||
2822 $LSTSH -H -t $HOSTNAME -f $RPEER -m rw ||
2825 cleanup_router_test || return $?
2827 run_test 220 "Add routes w/default options - check aliveness"
2830 setup_router_test lnet_peer_discovery_disabled=1 || return $?
2832 do_basic_rtr_test || return $?
2834 cleanup_router_test || return $?
2836 run_test 221 "Add routes w/DD disabled - check aliveness"
# Scenario for avoid_asym_router_failure=1 (see the callers, which pass that
# option to setup_router_test). Walks through four phases and checks router
# NI status, route aliveness and ping results after each one:
#   1. routing enabled, no routes: both router NIs down, pings fail
#   2. route added on the local host only: $LOCAL_NET NI up, route still
#      down, pings fail
#   3. symmetric route added on $RPEER: both NIs and both routes up, pings
#      succeed
#   4. LNet stopped on the local host: $LOCAL_NET NI down, $RPEER's route
#      down, pings fail
2838 do_aarf_enabled_test() {
2839 do_node $ROUTER "$LNETCTL set routing 1" ||
2840 error "Unable to enable routing on $ROUTER"
# Phase 1: no routes configured yet.
2842 check_router_ni_status "down" "down"
2844 do_lnetctl ping ${RPEER_NIDS[0]} &&
2845 error "Ping should fail"
2847 do_node $RPEER "$LNETCTL ping ${LNIDS[0]}" &&
2848 error "$RPEER ping should fail"
2850 # Adding a route should cause the router's NI on LOCAL_NET to come up
2851 do_route_add $HOSTNAME $REMOTE_NET ${ROUTER_NIDS[0]} ||
2854 check_router_ni_status "up" "down" ||
2857 # But route should still be down because of avoid_asym_router_failure
2858 check_route_aliveness "$HOSTNAME" "down" ||
2861 do_lnetctl ping ${RPEER_NIDS[0]} &&
2862 error "Ping should fail"
2864 do_node $RPEER "$LNETCTL ping ${LNIDS[0]}" &&
2865 error "$RPEER ping should fail"
2867 # Adding the symmetric route should cause the remote NI to go up and
2869 do_route_add $RPEER $LOCAL_NET ${ROUTER_NIDS[1]} ||
2872 check_router_ni_status "up" "up" ||
2875 check_route_aliveness "$HOSTNAME" "up" ||
2878 check_route_aliveness "$RPEER" "up" ||
2881 do_lnetctl ping ${RPEER_NIDS[0]} ||
2882 error "Failed to ping ${RPEER_NIDS[0]}"
2884 do_node $RPEER "$LNETCTL ping ${LNIDS[0]}" ||
2885 error "$RPEER failed to ping ${LNIDS[0]}"
2887 # Stop LNet on local host
2888 do_lnetctl lnet unconfigure ||
2889 error "Failed to stop LNet rc=$?"
2891 check_router_ni_status "down" "up" ||
2894 check_route_aliveness "$RPEER" "down" ||
2897 do_lnetctl ping ${RPEER_NIDS[0]} &&
2898 error "Ping should fail"
2900 do_node $RPEER "$LNETCTL ping ${LNIDS[0]}" &&
2901 error "$RPEER ping should fail"
2907 setup_router_test avoid_asym_router_failure=1 || return $?
2909 do_aarf_enabled_test || return $?
2911 cleanup_router_test || return $?
2913 run_test 222 "Check avoid_asym_router_failure=1"
2916 local opts="avoid_asym_router_failure=1 lnet_peer_discovery_disabled=1"
2918 setup_router_test "$opts" || return $?
2920 do_aarf_enabled_test || return $?
2922 cleanup_router_test || return $?
2924 run_test 223 "Check avoid_asym_router_failure=1 w/DD disabled"
# Scenario for avoid_asym_router_failure=0 (see the callers, which pass that
# option to setup_router_test). Unlike the enabled case, a route is expected
# to be "up" as soon as it is added, even while the router's NI on the other
# network is still down, and $RPEER's route is expected to stay "up" after
# LNet is stopped on the local host.
2926 do_aarf_disabled_test() {
2927 do_node $ROUTER "$LNETCTL set routing 1" ||
2928 error "Unable to enable routing on $ROUTER"
# No routes configured yet: both router NIs are expected to be down.
2930 check_router_ni_status "down" "down"
2932 do_route_add $HOSTNAME $REMOTE_NET ${ROUTER_NIDS[0]} ||
2935 check_router_ni_status "up" "down" ||
2938 check_route_aliveness "$HOSTNAME" "up" ||
2941 do_route_add $RPEER $LOCAL_NET ${ROUTER_NIDS[1]} ||
2944 check_router_ni_status "up" "up" ||
2947 check_route_aliveness "$HOSTNAME" "up" ||
2950 check_route_aliveness "$RPEER" "up" ||
2953 do_lnetctl ping ${RPEER_NIDS[0]} ||
2954 error "Failed to ping ${RPEER_NIDS[0]}"
2956 do_node $RPEER "$LNETCTL ping ${LNIDS[0]}" ||
2957 error "$RPEER failed to ping ${LNIDS[0]}"
2959 # Stop LNet on local host
2960 do_lnetctl lnet unconfigure ||
2961 error "Failed to stop LNet rc=$?"
2963 check_router_ni_status "down" "up" ||
2966 check_route_aliveness "$RPEER" "up" ||
2973 setup_router_test avoid_asym_router_failure=0 ||
2976 do_aarf_disabled_test ||
2979 cleanup_router_test ||
2982 run_test 224 "Check avoid_asym_router_failure=0"
2985 local opts="avoid_asym_router_failure=0 lnet_peer_discovery_disabled=1"
2987 setup_router_test "$opts" || return $?
2989 do_aarf_disabled_test || return $?
2991 cleanup_router_test ||
2994 run_test 225 "Check avoid_asym_router_failure=0 w/DD disabled"
# Router peer-health scenario: bring up symmetric routes, verify traffic in
# both directions, stop LNet on the local host, then ping the local host
# from $RPEER (expected to fail) and compare the router's $LOCAL_NET
# drop_count before and after against the expected increase.
# NOTE(review): $expected is presumably taken from $1 on a line not shown
# here (callers pass 1 or 0) - confirm.
2996 do_rtr_peer_health_test() {
2999 do_node $ROUTER "$LNETCTL set routing 1" ||
3000 error "Unable to enable routing on $ROUTER"
3002 do_route_add $HOSTNAME $REMOTE_NET ${ROUTER_NIDS[0]} ||
3005 do_route_add $RPEER $LOCAL_NET ${ROUTER_NIDS[1]} ||
3008 check_router_ni_status "up" "up" ||
3011 check_route_aliveness "$HOSTNAME" "up" ||
3014 check_route_aliveness "$RPEER" "up" ||
3017 do_lnetctl ping ${RPEER_NIDS[0]} ||
3018 error "Failed to ping ${RPEER_NIDS[0]}"
3020 do_node $RPEER "$LNETCTL ping ${LNIDS[0]}" ||
3021 error "$RPEER failed to ping ${LNIDS[0]}"
3023 # Stop LNet on local host
3024 do_lnetctl lnet unconfigure ||
3025 error "Failed to stop LNet rc=$?"
3027 check_router_ni_status "down" "up" ||
3030 check_route_aliveness "$RPEER" "up" ||
3033 # The NI used to send the message to the destination will be the
3034 # router's NI on LOCAL_NET, so that's the drop count that will be
3036 local d1=$(do_node $ROUTER $LNETCTL net show -v --net $LOCAL_NET | \
3037 awk '/drop_count:/{print $NF}')
3039 # Ping from RPEER to local host should be dropped by the router
3040 do_node $RPEER "$LCTL ping ${LNIDS[0]}" &&
3041 error "$RPEER expected ping to fail"
3043 local d2=$(do_node $ROUTER $LNETCTL net show -v --net $LOCAL_NET | \
3044 awk '/drop_count:/{print $NF}')
# The drop counter must have moved by exactly $expected.
3046 [[ $((d2 - d1)) -ne $expected ]] &&
3047 error "Expected drop count change by $expected: $d1 -> $d2"
3053 setup_router_test avoid_asym_router_failure=0 --peer-timeout=10 ||
3056 do_rtr_peer_health_test 1 ||
3059 cleanup_router_test ||
3062 run_test 226 "Check router peer health enabled"
3065 setup_router_test avoid_asym_router_failure=0 --peer-timeout=0 ||
3068 do_rtr_peer_health_test 0 ||
3071 cleanup_router_test ||
3074 run_test 227 "Check router peer health disabled"
3077 [[ ${NETTYPE} == tcp* ]] ||
3078 skip "Need tcp NETTYPE"
3080 echo "Check valid values; Should succeed"
3084 for ((i = 4; i < 16; i+=1)); do
3085 reinit_dlc || return $?
3086 add_net "tcp" "${INTERFACES[0]}" || return $?
3087 do_lnetctl net set --all --conns-per-peer $i ||
3088 error "should have succeeded $?"
3089 $LNETCTL net show -v 1 | grep -q "conns_per_peer: $i" ||
3090 error "failed to set conns-per-peer to $i"
3091 lnid="$(lctl list_nids | head -n 1)"
3092 do_lnetctl ping "$lnid" ||
3093 error "failed to ping myself"
3095 # "lctl --net tcp conn_list" prints the list of active
3096 # connections. Since we're pinging ourselves, there should be
3097 # 2 Control connections plus 2*conns_per_peer connections
3098 # created (one Bulk Input, one Bulk Output in each pair).
3099 # Here's the sample output for conns_per_peer set to 1:
3100 # 12345-1.1.1.1@tcp I[0]host01->host01:988 2626560/1061296 nonagle
3101 # 12345-1.1.1.1@tcp O[0]host01->host01:1022 2626560/1061488 nonagle
3102 # 12345-1.1.1.1@tcp C[0]host01->host01:988 2626560/1061296 nonagle
3103 # 12345-1.1.1.1@tcp C[0]host01->host01:1023 2626560/1061488 nonagle
3104 cmd="printf 'network tcp\nconn_list\n' | lctl | grep -c '$lnid'"
3106 # Expect 2+conns_per_peer*2 connections. Wait no longer
3108 wait_update $HOSTNAME "$cmd" "$((2+i*2))" 2 ||
3109 error "expected number of tcp connections $((2+i*2))"
3112 reinit_dlc || return $?
3113 add_net "tcp" "${INTERFACES[0]}" || return $?
3114 echo "Set > 127; Should fail"
3115 do_lnetctl net set --all --conns-per-peer 128 &&
3116 error "should have failed $?"
3118 reinit_dlc || return $?
3119 add_net "tcp" "${INTERFACES[0]}" || return $?
3121 local default=$($LNETCTL net show -v 1 |
3122 awk '/conns_per_peer/{print $NF}')
3124 echo "Set < 0; Should be ignored"
3125 do_lnetctl net set --all --conns-per-peer -1 ||
3126 error "should have succeeded $?"
3127 $LNETCTL net show -v 1 | grep -q "conns_per_peer: ${default}" ||
3128 error "Did not stay at default"
3130 run_test 230 "Test setting conns-per-peer"
3133 reinit_dlc || return $?
3135 local net=${NETTYPE}231
3137 do_lnetctl net add --net $net --if ${INTERFACES[0]} ||
3138 error "Failed to add net"
3140 $LNETCTL export --backup > $TMP/sanity-lnet-$testnum-expected.yaml
3141 sed -i 's/peer_timeout: .*$/peer_timeout: 0/' \
3142 $TMP/sanity-lnet-$testnum-expected.yaml
3144 reinit_dlc || return $?
3146 do_lnetctl import $TMP/sanity-lnet-$testnum-expected.yaml ||
3147 error "Failed to import configuration"
3149 $LNETCTL export --backup > $TMP/sanity-lnet-$testnum-actual.yaml
3151 compare_yaml_files || error "Wrong config after import"
3153 do_lnetctl net del --net $net --if ${INTERFACES[0]} ||
3154 error "Failed to delete net $net"
3156 do_lnetctl net add --net $net --if ${INTERFACES[0]} --peer-timeout=0 ||
3157 error "Failed to add net with peer-timeout=0"
3159 $LNETCTL export --backup > $TMP/sanity-lnet-$testnum-actual.yaml
3161 compare_yaml_files || error "Wrong config after lnetctl net add"
3163 reinit_dlc || return $?
3165 # lnet/include/lnet/lib-lnet.h defines DEFAULT_PEER_TIMEOUT 180
3166 sed -i 's/peer_timeout: .*$/peer_timeout: 180/' \
3167 $TMP/sanity-lnet-$testnum-expected.yaml
3169 sed -i '/^.*peer_timeout:.*$/d' $TMP/sanity-lnet-$testnum-actual.yaml
3171 do_lnetctl import $TMP/sanity-lnet-$testnum-actual.yaml ||
3172 error "Failed to import config without peer_timeout"
3174 $LNETCTL export --backup > $TMP/sanity-lnet-$testnum-actual.yaml
3178 run_test 231 "Check DLC handling of peer_timeout parameter"
3180 ### Test that a Linux route is added for each NI
3184 [[ ${NETTYPE} == tcp* ]] ||
3185 skip "Need tcp NETTYPE"
3186 reinit_dlc || return $?
3187 add_net "tcp" "${INTERFACES[0]}" || return $?
3189 skip_param=$(cat /sys/module/ksocklnd/parameters/skip_mr_route_setup)
3190 [[ ${skip_param:-0} -ne 0 ]] &&
3191 skip "Need skip_mr_route_setup=0 found $skip_param"
3193 ip route show table ${INTERFACES[0]} | grep -q "${INTERFACES[0]}"
3195 run_test 250 "test that linux routes are added"
# Only run on kfi networks. The prefix is matched with a glob: as a regex,
# "kfi*" means "kf" followed by any number of "i" and would accept every
# NETTYPE that merely contains "kf".
3198 [[ ${NETTYPE} == kfi* ]] ||
3199 skip "Need kfi NETTYPE"
3201 reinit_dlc || return $?
3202 add_net "kfi" "${INTERFACES[0]}" || return $?
3203 add_net "kfi1" "${INTERFACES[0]}" || return $?
3204 add_net "kfi10" "${INTERFACES[0]}" || return $?
3207 run_test 251 "Define multiple kfi networks on single interface"
3210 setup_health_test false || return $?
3214 do_rpc_nodes $RNODE unload_modules_local || rc=$?
3216 if [[ $rc -ne 0 ]]; then
3217 cleanup_health_test || return $?
3219 error "Failed to unload modules on $RNODE rc=$rc"
# Time a ping to the peer whose modules were unloaded: it must fail, and it
# must fail before the 15 second ping timeout expires.
3224 local ts1=$(date +%s)
3226 do_lnetctl ping --timeout 15 ${RNIDS[0]} &&
3227 error "Expected ping ${RNIDS[0]} to fail"
3229 local ts2=$(date +%s)
# Both timestamps are whole seconds, so shell arithmetic is sufficient and
# avoids spawning bc for a single subtraction.
3231 local delta=$((ts2 - ts1))
3233 [[ $delta -lt 15 ]] ||
3234 error "Ping took longer than expected to fail: $delta"
3238 run_test 252 "Ping to down peer should unlink quickly"
# Check that a GET delayed past its deadline is dropped: lower the LNet
# transaction timeout, install net_delay rules for GET between every local
# and remote NID, issue enough background pings to use up the peer's
# max_ni_tx_credits plus one extra ping, remove the delay rules and expect
# exactly one dropped GET in the peer statistics. The original
# transaction_timeout is restored at the end.
# Globals read: LNIDS, RNIDS
# NOTE(review): $tto and $delay are set on lines not shown here - confirm
# their values before relying on the timings below.
3240 do_expired_message_drop_test() {
3241 local rnid lnid old_tto
# Remember the current timeout so it can be restored at the end.
3243 old_tto=$($LNETCTL global show |
3244 awk '/transaction_timeout:/{print $NF}')
3246 [[ -z $old_tto ]] &&
3247 error "Cannot determine LNet transaction timeout"
3251 do_lnetctl set transaction_timeout "${tto}" ||
3252 error "Failed to set transaction_timeout"
3254 # We want to consume all peer credits for at least transaction_timeout
3260 for lnid in "${LNIDS[@]}"; do
3261 for rnid in "${RNIDS[@]}"; do
3262 $LCTL net_delay_add -s "${lnid}" -d "${rnid}" \
3263 -l "${delay}" -r 1 -m GET
3269 pcs=( $($LNETCTL peer show -v --nid "${RNIDS[0]}" |
3270 awk '/max_ni_tx_credits:/{print $NF}' |
3273 [[ ${#RNIDS[@]} -ne ${#pcs[@]} ]] &&
3274 error "Expect ${#RNIDS[@]} peer credit values found ${#pcs[@]}"
3276 local rnet lnid lnet i j
3278 # Need to use --source for multi-rail configs to ensure we consume
3279 # all available peer credits
3280 for ((i = 0; i < ${#RNIDS[@]}; i++)); do
3281 local ping_args="--timeout $((delay+2))"
3283 rnet=${RNIDS[i]##*@}
3284 for lnid in ${LNIDS[@]}; do
3286 [[ $rnet == $lnet ]] && break
3289 ping_args+=" --source ${lnid} ${RNIDS[i]}"
3290 for j in $(seq 1 "${pcs[i]}"); do
3291 $LNETCTL ping ${ping_args} 1>/dev/null &
3294 echo "Issued ${pcs[i]} pings to ${RNIDS[i]} from $lnid"
3297 # This ping should be queued on peer NI tx credit
3298 $LNETCTL ping --timeout $((delay+2)) "${RNIDS[0]}" &
3302 $LCTL net_delay_del -a
3306 # Messages sent from the delay list do not go through
3307 # lnet_post_send_locked(), thus we should only have a single drop
3310 dropped=$($LNETCTL peer show -v 2 --nid "${RNIDS[0]}" |
3311 grep -A 2 dropped_stats |
3312 awk '/get:/{print $2}' |
3314 sed 's/ /\+/g' | bc)
3316 [[ $dropped -ne 1 ]] &&
3317 error "Expect 1 dropped GET but found $dropped"
# Put the transaction timeout back to the value saved at the start.
3319 do_lnetctl set transaction_timeout "${old_tto}"
3325 setup_health_test false || return $?
3327 do_expired_message_drop_test || return $?
3331 run_test 253 "Message delayed beyond deadline should be dropped (single-rail)"
3334 setup_health_test true || return $?
3336 do_expired_message_drop_test || return $?
3340 run_test 254 "Message delayed beyond deadline should be dropped (multi-rail)"
3343 [[ ${NETTYPE} == tcp* ]] || skip "Need tcp NETTYPE"
3345 reinit_dlc || return $?
3347 cleanup_lnet || return $?
3349 local routes_str="o2ib ${IF0_NET}.[$GW_HOSTNUM-$((GW_HOSTNUM+4))]"
3350 local network_str="${NETTYPE}(${INTERFACES[0]})"
3352 load_lnet "networks=\"${network_str}\" routes=\"${routes_str}\"" ||
3353 error "Failed to load LNet"
3356 error "Failed to load LNet with networks=\"${network_str}\" routes=\"${routes_str}\""
3358 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
3360 - net type: ${NETTYPE}
3365 append_net_tunables tcp
3367 echo "route:" >> $TMP/sanity-lnet-$testnum-expected.yaml
3368 for i in $(seq $GW_HOSTNUM $((GW_HOSTNUM + 4))); do
3369 cat <<EOF >> $TMP/sanity-lnet-$testnum-expected.yaml
3371 gateway: ${IF0_NET}.${i}@${NETTYPE}
3374 health_sensitivity: 1
3378 echo "peer:" >> $TMP/sanity-lnet-$testnum-expected.yaml
3379 for i in $(seq $GW_HOSTNUM $((GW_HOSTNUM + 4))); do
3380 cat <<EOF >> $TMP/sanity-lnet-$testnum-expected.yaml
3381 - primary nid: ${IF0_NET}.${i}@${NETTYPE}
3384 - nid: ${IF0_NET}.${i}@${NETTYPE}
3390 $LNETCTL export --backup > $TMP/sanity-lnet-$testnum-actual.yaml ||
3391 error "export failed $?"
3393 validate_gateway_nids
3395 run_test 255 "Use lnet routes param with pdsh syntax"
3400 local out=$TMP/$tfile
3401 local prefix=/usr/include/linux/lnet
3403 # We use a hard coded prefix so that this test will not fail
3406 if ! which $CC > /dev/null 2>&1; then
3407 skip_env "$CC is not installed"
# Report a failed unload through error() like the other tests do, instead of
# a bare exit that skips the framework's failure reporting.
3410 cleanup_lnet || error "Failed to unload modules before test execution"
3413 local cc_args="-Wall -Werror -std=c99 -c -x c /dev/null -o $out"
3414 if ! [[ -d $prefix ]]; then
3415 # Assume we're running in tree and fixup the include path.
3416 prefix=$LUSTRE/../lnet/include/uapi/linux/lnet
3417 cc_args+=" -I $LUSTRE/../lnet/include/uapi"
3420 for header in $prefix/*.h; do
3421 if ! [[ -f "$header" ]]; then
3425 echo "$CC $cc_args -include $header"
3426 $CC $cc_args -include $header ||
3427 error "cannot compile '$header'"
3431 run_test 300 "packaged LNet UAPI headers can be compiled"
3433 # LU-16081 lnet: Memory leak on adding existing interface
3436 reinit_dlc || return $?
3437 do_lnetctl net add --net ${NETTYPE} --if ${INTERFACES[0]} ||
3438 error "Failed to add net"
3439 do_lnetctl net add --net ${NETTYPE} --if ${INTERFACES[0]} &&
3440 error "add net should have failed"
3441 do_lnetctl net del --net ${NETTYPE} --if ${INTERFACES[0]} ||
3442 error "Failed to del net"
3445 run_test 301 "Check for dynamic adds of same/wrong interface (memory leak)"
3448 ! [[ $NETTYPE =~ (tcp|o2ib) ]] && skip "Need tcp or o2ib NETTYPE"
3449 reinit_dlc || return $?
3451 add_net "${NETTYPE}" "${INTERFACES[0]}" || return $?
3453 local nid=$($LCTL list_nids)
3455 do_lnetctl ping ${nid} ||
3456 error "pinging self failed $?"
3457 do_lnetctl debug peer --nid ${nid} ||
3458 error "failed to dump peer debug info $?"
3460 run_test 302 "Check that peer debug info can be dumped"
3463 [[ ${NETTYPE} == tcp* ]] || skip "Need tcp NETTYPE"
3465 setup_health_test true || return $?
3467 cleanup_netns || error "Failed to cleanup netns before test execution"
3468 setup_fakeif || error "Failed to add fake IF"
3469 have_interface "$FAKE_IF" ||
3470 error "Expect $FAKE_IF configured but not found"
3472 add_net "${NETTYPE}99" "$FAKE_IF" || return $?
3474 local nid=$($LCTL list_nids | tail --lines 1)
3476 # Our updated config should be pushed to RNODE
3477 local found=$(do_node $RNODE "$LNETCTL peer show --nid $nid")
3479 [[ -z $found ]] && error "Peer not updated on $RNODE"
3481 local prim=$($LCTL list_nids | head -n 1)
3483 if ! grep -q -- "- primary nid: $prim"<<<"${found}"; then
3485 error "Wrong primary nid"
3488 echo "Set $FAKE_IF down"
3489 echo "ip link set dev $FAKE_IF down"
3490 ip link set dev $FAKE_IF down
3491 check_ni_status "$nid" down
3493 local hval=$(do_node $RNODE "$LNETCTL peer show --nid $nid -v 2 | \
3494 grep -e '- nid:' -e 'health value:'")
3496 hval=$(grep -A 1 $nid<<<"$hval" | tail -n 1 | awk '{print $NF}')
3497 (( hval < 1000 )) ||
3498 error "Expect $hval < 1000"
3502 run_test 303 "Check peer NI health after link down"
3505 [[ ${NETTYPE} == tcp* ]] || skip "Need tcp NETTYPE"
3507 cleanup_netns || error "Failed to cleanup netns before test execution"
3508 cleanup_lnet || error "Failed to unload modules before test execution"
3510 setup_fakeif || error "Failed to add fake IF"
3511 have_interface "$FAKE_IF" ||
3512 error "Expect $FAKE_IF configured but not found"
3514 reinit_dlc || return $?
3516 add_net "tcp" "${INTERFACES[0]}" || return $?
3517 add_net "tcp" "$FAKE_IF" || return $?
# First and last local NIDs. Use $LCTL, as the neighbouring tests do, so the
# framework-selected lctl binary is run rather than whatever is first in PATH.
3519 local nid1=$($LCTL list_nids | head -n 1)
3520 local nid2=$($LCTL list_nids | tail --lines 1)
3522 check_ni_status "$nid1" up
3523 check_ni_status "$nid2" up
3525 do_lnetctl peer add --prim_nid ${nid2} --lock_prim ||
3526 error "peer add failed $?"
3527 local locked_peer_state=($(do_lnetctl peer show -v 4 --nid ${nid2} |
3528 awk '/peer state/{print $NF}'))
3530 # Expect peer state bits:
3531 # LNET_PEER_MULTI_RAIL(0) | LNET_PEER_CONFIGURED(3) |
3532 # LNET_PEER_LOCK_PRIMARY(20)
3533 (( $locked_peer_state != "1048585")) &&
3534 error "Wrong peer state \"$locked_peer_state\" expected 1048585"
3536 # Clear LNET_PEER_CONFIGURED bit and verify
# The command being checked is "peer set", so the failure message names it.
3537 do_lnetctl peer set --nid ${nid2} --state 1048577 ||
3538 error "peer set failed $?"
3539 locked_peer_state=($(do_lnetctl peer show -v 4 --nid ${nid2} |
3540 awk '/peer state/{print $NF}'))
3541 (( $locked_peer_state != "1048577")) &&
3542 error "Wrong peer state \"$locked_peer_state\" expected 1048577"
3543 do_lnetctl discover ${nid1} ||
3544 error "Failed to discover peer"
3546 # Expect nid2 and nid1 peer entries to be consolidated,
3547 # nid2 to stay primary
# First write of the expected YAML in this test: truncate rather than append
# so a stale file left by an earlier run cannot corrupt the comparison.
3548 cat <<EOF > $TMP/sanity-lnet-$testnum-expected.yaml
3550 - primary nid: ${nid2}
3558 $LNETCTL peer show > $TMP/sanity-lnet-$testnum-actual.yaml
3559 compare_yaml_files ||
3560 error "Unexpected peer configuration"
3562 locked_peer_state=($(do_lnetctl peer show -v 4 --nid ${nid2} |
3563 awk '/peer state/{print $NF}'))
3564 # Expect peer state bits to be added:
3565 # LNET_PEER_DISCOVERED(4) | LNET_PEER_NIDS_UPTODATE(8)
3566 (( $locked_peer_state != "1048849")) &&
3567 error "Wrong peer state \"$locked_peer_state\" expected 1048849"
3570 run_test 304 "Check locked primary peer nid consolidation"
3576 echo "check parameter ${para} value ${value}"
3578 return $(( $(do_lnetctl net show -v | \
3580 grep -c "^ \+${para}: ${value}$") != 1 ))
3587 cleanup_lnet || error "Failed to cleanup LNet"
3589 load_module ../libcfs/libcfs/libcfs ||
3590 error "Failed to load module libcfs rc = $?"
3592 load_module ../lnet/lnet/lnet ||
3593 error "Failed to load module lnet rc = $?"
3595 echo "loading ${module} ${setting} type ${NETTYPE}"
3596 load_module "${module}" "${setting}" ||
3597 error "Failed to load module ${module} rc = $?"
3599 do_lnetctl lnet configure --all || error "lnet configure failed rc = $?"
3607 if [[ ${NETTYPE} == tcp* ]];then
3608 static_config "../lnet/klnds/socklnd/ksocklnd" \
3609 "sock_timeout=${value}"
3610 elif [[ ${NETTYPE} == o2ib* ]]; then
3611 static_config "../lnet/klnds/o2iblnd/ko2iblnd" \
3613 elif [[ ${NETTYPE} == gni* ]]; then
3614 static_config "../lnet/klnds/gnilnd/kgnilnd" \
3617 skip "NETTYPE ${NETTYPE} not supported"
3620 check_parameter "timeout" $value
3624 run_test 310 "Set timeout and verify"
3627 [[ $NETTYPE == kfi* ]] ||
3628 skip "Need kfi network type"
3630 setupall || error "setupall failed"
3632 mkdir -p $DIR/$tdir || error "mkdir failed"
3633 dd if=/dev/zero of=$DIR/$tdir/$tfile bs=1M count=1 oflag=direct ||
3634 error "dd write failed"
3636 local list=$(comma_list $(osts_nodes))
3638 #define CFS_KFI_FAIL_WAIT_SEND_COMP1 0xF115
3639 do_nodes $list $LCTL set_param fail_loc=0x8000F115
3640 dd if=$DIR/$tdir/$tfile of=/dev/null bs=1M count=1 ||
3641 error "dd read failed"
3643 rm -f $DIR/$tdir/$tfile
3646 cleanupall || error "Failed cleanup"
3648 run_test 311 "Fail bulk put in send wait completion"
3651 [[ $NETTYPE == kfi* ]] ||
3652 skip "Need kfi network type"
3654 setupall || error "setupall failed"
3656 mkdir -p $DIR/$tdir || error "mkdir failed"
3658 local list=$(comma_list $(osts_nodes))
3660 #define CFS_KFI_FAIL_WAIT_SEND_COMP3 0xF117
3661 do_nodes $list $LCTL set_param fail_loc=0x8000F117
3662 dd if=/dev/zero of=$DIR/$tdir/$tfile bs=1M count=1 oflag=direct ||
3663 error "dd write failed"
3665 local tfile2="$DIR/$tdir/testfile2"
3667 do_nodes $list $LCTL set_param fail_loc=0x8000F117
3668 dd if=$DIR/$tdir/$tfile of=$tfile2 bs=1M count=1 oflag=direct ||
3669 error "dd read failed"
3671 rm -f $DIR/$tdir/$tfile
3675 cleanupall || error "Failed cleanup"
3677 run_test 312 "TAG_RX_OK is possible after TX_FAIL"
3680 local target_net="${1}"
3681 local target_nid="${2}"
3682 local expect_net="${3}"
3683 local expect_nid="${4}"
3687 declare -a net_prios
3688 declare -a nid_prios
3690 nids=( $($LNETCTL ${type} show -v 5 | awk '/- nid:/{print $NF}' |
3693 net_prios=( $($LNETCTL ${type} show -v 5 |
3694 awk '/net priority:/{print $NF}' | xargs echo) )
3696 nid_prios=( $($LNETCTL ${type} show -v 5 |
3697 awk '/nid priority:/{print $NF}' | xargs echo) )
3699 (( ${#nids[@]} != ${#net_prios[@]} )) &&
3700 error "Wrong # net prios ${#nids[@]} != ${#net_prios[@]}"
3702 (( ${#nids[@]} != ${#nid_prios[@]} )) &&
3703 error "Wrong # nid prios ${#nids[@]} != ${#nid_prios[@]}"
3707 for ((i = 0; i < ${#nids[@]}; i++)); do
3708 [[ -n ${target_net} ]] &&
3709 [[ ${nids[i]##*@} != "${target_net}" ]] &&
3711 [[ -n ${target_nid} ]] &&
3712 [[ ${nids[i]} != "${target_nid}" ]] &&
3715 echo "${nids[i]}: net_prio ${net_prios[i]} expect ${expect_net}"
3716 (( net_prios[i] != expect_net )) &&
3717 error "Wrong net priority \"${net_prios[i]}\" expect ${expect_net}"
3719 echo "${nids[i]}: nid_prio ${nid_prios[i]} expect ${expect_nid}"
3720 (( nid_prios[i] != expect_nid )) &&
3721 error "Wrong nid priority \"${nid_prios[i]}\" expect ${expect_nid}"
# Check UDSP priorities on peers: forwards its four arguments (target net,
# target NID, expected net priority, expected NID priority) to
# check_udsp_prio together with the fixed fifth argument "peer".
3727 check_peer_udsp_prio() {
3728 check_udsp_prio "${1}" "${2}" "${3}" "${4}" "peer"
# Check UDSP priorities on local nets: forwards its four arguments (target
# net, target NID, expected net priority, expected NID priority) to
# check_udsp_prio together with the fixed fifth argument "net".
3731 check_net_udsp_prio() {
3732 check_udsp_prio "${1}" "${2}" "${3}" "${4}" "net"
3736 reinit_dlc || return $?
3738 do_lnetctl udsp add --src tcp --priority 0 ||
3739 error "Failed to add udsp rule"
3740 do_lnetctl udsp del --idx 0 ||
3741 error "Failed to del udsp rule"
3744 run_test 400 "Check for udsp add/delete net rule without net num"
3747 reinit_dlc || return $?
3749 do_lnetctl net add --net ${NETTYPE} --if ${INTERFACES[0]} ||
3750 error "Failed to add net"
3752 do_lnetctl udsp add --dst ${NETTYPE} --prio 1 ||
3753 error "Failed to add peer net priority rule"
3755 do_lnetctl discover $($LCTL list_nids | head -n 1) ||
3756 error "Failed to discover peer"
3758 check_peer_udsp_prio "${NETTYPE}" "" "1" "-1"
3762 run_test 401 "Discover peer after adding peer net UDSP rule"
3765 reinit_dlc || return $?
3767 do_lnetctl udsp add --dst kfi --priority 0 ||
3768 error "Failed to add UDSP rule"
3770 do_lnetctl peer add --prim 402@kfi ||
3771 error "Failed to add peer"
3775 run_test 402 "Destination net rule should not panic"
3777 complete_test $SECONDS