# NOTE(review): this file appears to be a line-numbered, partially sampled
# excerpt of Lustre's recovery-small.sh test script. The embedded numbers
# (6, 8, 9, ...) have gaps, so function headers/braces and many interior
# lines are missing; comments below annotate only what is visible.
# Tests 24, 27 and 52 are skipped by default (plus any caller-supplied list).
6 ALWAYS_EXCEPT="24 27 52 $RECOVERY_SMALL_EXCEPT"
# Default Portals/LNet debug mask: -1 presumably means "all flags" — confirm
# against test-framework.sh.
8 PTLDEBUG=${PTLDEBUG:--1}
# Locate the Lustre tree relative to this script unless overridden.
9 LUSTRE=${LUSTRE:-`dirname $0`/..}
10 . $LUSTRE/tests/test-framework.sh
# CONFIG defaults to the local single-node test configuration.
12 . ${CONFIG:=$LUSTRE/tests/cfg/local.sh}
16 # Allow us to override the setup if we already have a mounted system by
17 # setting SETUP=" " and CLEANUP=" "
18 SETUP=${SETUP:-"setup"}
19 CLEANUP=${CLEANUP:-"cleanup"}
# Hard-fail with a distinct exit code if the pre-run cleanup cannot complete.
27 cleanupall || { echo "FAILed to clean up"; exit 20; }
# NOTE(review): the bodies of the following conditionals are missing from
# this excerpt (line-number gaps).
30 if [ ! -z "$EVAL" ]; then
35 if [ "$ONLY" == "cleanup" ]; then
# Silence LNet debugging before a cleanup-only run; ignore failure.
36 sysctl -w lnet.debug=0 || true
# ONLY=setup lets a user bring the system up and stop before running tests.
43 [ "$ONLY" == "setup" ] && exit
# --- test_1 fragment (function header/brace missing from excerpt) ---
# drop_request / drop_reply / drop_reint_reply are test-framework helpers;
# presumably they run the quoted command while forcing the MDS to drop the
# request or its reply, exercising client resend/replay — confirm in
# test-framework.sh.
46 drop_request "mcreate $MOUNT/1" || return 1
47 drop_reint_reply "mcreate $MOUNT/2" || return 2
49 run_test 1 "mcreate: drop req, drop rep"
# --- test_2 fragment: chmod survives dropped request and dropped reply ---
52 drop_request "tchmod 111 $MOUNT/2" || return 1
53 drop_reint_reply "tchmod 666 $MOUNT/2" || return 2
55 run_test 2 "chmod: drop req, drop rep"
# --- test_3 fragment: stat survives dropped request and dropped reply ---
58 drop_request "statone $MOUNT/2" || return 1
59 drop_reply "statone $MOUNT/2" || return 2
61 run_test 3 "stat: drop req, drop rep"
# --- test_4 fragment: open/read path with dropped request and reply ---
64 do_facet client "cp /etc/inittab $MOUNT/inittab" || return 1
65 drop_request "cat $MOUNT/inittab > /dev/null" || return 2
66 drop_reply "cat $MOUNT/inittab > /dev/null" || return 3
68 run_test 4 "open: drop req, drop rep"
# --- test_5 fragment: rename under dropped req/rep, then verify the file ---
71 drop_request "mv $MOUNT/inittab $MOUNT/renamed" || return 1
72 drop_reint_reply "mv $MOUNT/renamed $MOUNT/renamed-again" || return 2
73 do_facet client "checkstat -v $MOUNT/renamed-again" || return 3
75 run_test 5 "rename: drop req, drop rep"
# --- test_6 fragment: hard link under dropped req/rep (reuses test_5 file) ---
78 drop_request "mlink $MOUNT/renamed-again $MOUNT/link1" || return 1
79 drop_reint_reply "mlink $MOUNT/renamed-again $MOUNT/link2" || return 2
81 run_test 6 "link: drop req, drop rep"
# --- test_7 fragment: unlink under dropped req/rep (cleans up test_6 links) ---
84 drop_request "munlink $MOUNT/link1" || return 1
85 drop_reint_reply "munlink $MOUNT/link2" || return 2
87 run_test 7 "unlink: drop req, drop rep"
# --- test_8 fragment ---
91 drop_reint_reply "touch $MOUNT/renamed" || return 1
93 run_test 8 "touch: drop rep (bug 1423)"
# --- test_9 fragment: pause bulk I/O on the OST, then keep writing ---
97 pause_bulk "cp /etc/profile $MOUNT" || return 1
98 do_facet client "cp /etc/termcap $MOUNT" || return 2
99 do_facet client "sync"
100 do_facet client "rm $MOUNT/termcap $MOUNT/profile" || return 3
102 run_test 9 "pause bulk on OST (bug 1420)"
# --- test_10 fragment: server finishes a request after evicting the client ---
106 do_facet client mcreate $MOUNT/$tfile || return 1
# Dropping the blocking callback should get this client evicted by the MDS.
107 drop_bl_callback "chmod 0777 $MOUNT/$tfile" || echo "evicted as expected"
108 # wait for the mds to evict the client
109 #echo "sleep $(($TIMEOUT*2))"
110 #sleep $(($TIMEOUT*2))
# touch may fail if the eviction already severed the connection; tolerated.
111 do_facet client touch $MOUNT/$tfile || echo "touch failed, evicted"
# The chmod from the dropped callback must still have taken effect on the MDS.
112 do_facet client checkstat -v -p 0777 $MOUNT/$tfile || return 3
113 do_facet client "munlink $MOUNT/$tfile"
115 run_test 10 "finish request on server after client eviction (bug 1521)"
118 # wake up a thread waiting for completion after eviction
# --- test_11 fragment (interior lines missing from excerpt) ---
120 do_facet client multiop $MOUNT/$tfile Ow || return 1
121 do_facet client multiop $MOUNT/$tfile or || return 2
125 do_facet client multiop $MOUNT/$tfile or || return 3
126 drop_bl_callback multiop $MOUNT/$tfile Ow || echo "evicted as expected"
128 do_facet client munlink $MOUNT/$tfile || return 4
130 run_test 11 "wake up a thread waiting for completion after eviction (b=2460)"
# --- test_12 fragment: timed-out resend in ptlrpcd ---
134 $LCTL mark multiop $MOUNT/$tfile OS_c
# fail_loc 0x115 is OBD_FAIL_MDS_CLOSE_NET per the comment at original
# line 139; set on the MDS, then cleared asynchronously.
135 do_facet mds "sysctl -w lustre.fail_loc=0x115"
136 clear_failloc mds $((TIMEOUT * 2)) &
137 multiop $MOUNT/$tfile OS_c &
139 #define OBD_FAIL_MDS_CLOSE_NET 0x115
# NOTE(review): $PID is presumably captured on a missing line (e.g. PID=$!).
142 echo "waiting for multiop $PID"
143 wait $PID || return 2
144 do_facet client munlink $MOUNT/$tfile || return 3
146 run_test 12 "recover from timed out resend in ptlrpcd (b=2494)"
148 # Bug 113, check that readdir lost recv timeout works.
# --- test_13 fragment: restart mdc_readpage after a dropped-once readpage ---
150 mkdir $MOUNT/readdir || return 1
# NOTE(review): bare "return" here propagates touch's status; other checks
# in this file use explicit numeric codes — possibly an oversight upstream.
151 touch $MOUNT/readdir/newentry || return
152 # OBD_FAIL_MDS_READPAGE_NET|OBD_FAIL_ONCE
# The 0x80000000 bit is presumably OBD_FAIL_ONCE (fire once, then clear).
153 do_facet mds "sysctl -w lustre.fail_loc=0x80000104"
154 ls $MOUNT/readdir || return 3
155 do_facet mds "sysctl -w lustre.fail_loc=0"
156 rm -rf $MOUNT/readdir || return 4
158 run_test 13 "mdc_readpage restart test (bug 1138)"
160 # Bug 113, check that readdir lost send timeout works.
# --- test_14 fragment: resend mdc_readpage after a dropped-once sendpage ---
163 touch $MOUNT/readdir/newentry
164 # OBD_FAIL_MDS_SENDPAGE|OBD_FAIL_ONCE
165 do_facet mds "sysctl -w lustre.fail_loc=0x80000106"
166 ls $MOUNT/readdir || return 1
167 do_facet mds "sysctl -w lustre.fail_loc=0"
169 run_test 14 "mdc_readpage resend test (bug 1138)"
# --- test_15 fragment: forced open failure must be reported to the caller ---
172 do_facet mds "sysctl -w lustre.fail_loc=0x80000128"
# touch must FAIL here; success means the injected -ENOMEM was swallowed.
173 touch $DIR/$tfile && return 1
176 run_test 15 "failed open (-ENOMEM)"
# --- test_16 fragment: save, zero, and later restore client read-ahead so
# that bulk transfers are driven by explicit reads (lines between the two
# loops are missing from this excerpt) ---
178 READ_AHEAD=`cat $LPROC/llite/*/max_read_ahead_mb | head -n 1`
180 for f in $LPROC/llite/*/max_read_ahead_mb; do
# Restore the saved read-ahead value on every llite instance.
186 for f in $LPROC/llite/*/max_read_ahead_mb; do
187 echo $READ_AHEAD > $f
192 do_facet client cp /etc/termcap $MOUNT
196 #define OBD_FAIL_PTLRPC_BULK_PUT_NET 0x504 | OBD_FAIL_ONCE
197 do_facet ost1 sysctl -w lustre.fail_loc=0x80000504
199 # OST bulk will time out here, client resends
200 do_facet client "cmp /etc/termcap $MOUNT/termcap" || return 1
# NOTE(review): this clears fail_loc on the node running the script, while
# it was set via do_facet ost1 above — verify both are the same node in the
# local config, otherwise the OST's fail_loc is cleared only by ONCE.
201 sysctl -w lustre.fail_loc=0
202 # give recovery a chance to finish (shouldn't take long)
204 do_facet client "cmp /etc/termcap $MOUNT/termcap" || return 2
207 run_test 16 "timeout bulk put, don't evict client (2732)"
# --- test_17 fragment: bulk GET timeout; client is evicted but resends ---
210 # OBD_FAIL_PTLRPC_BULK_GET_NET 0x0503 | OBD_FAIL_ONCE
211 # OST bulk will time out here, client retries
212 sysctl -w lustre.fail_loc=0x80000503
213 # need to ensure we send an RPC
214 do_facet client cp /etc/termcap $DIR/$tfile
218 sysctl -w lustre.fail_loc=0
# df forces a statfs RPC, presumably to trigger/complete reconnection.
219 do_facet client "df $DIR"
220 # expect cmp to succeed, client resent bulk
221 do_facet client "cmp /etc/termcap $DIR/$tfile" || return 3
222 do_facet client "rm $DIR/$tfile" || return 4
225 run_test 17 "timeout bulk get, evict client (2732)"
# --- test_18a fragment: deactivating an OSC must drop cached pages ---
# Requires two OSTs; skip (return 0) on single-OST setups.
228 [ -z ${ost2_svc} ] && echo Skipping, needs 2 osts && return 0
230 do_facet client mkdir -p $MOUNT/$tdir
231 f=$MOUNT/$tdir/$tfile
# Precondition: no dirty/cached pages before the test starts.
234 pgcache_empty || return 1
# Old setstripe arg order: <file> <stripe-size> <start-ost> <stripe-count>;
# stripe placed on OST index 1 so the file lives on ost2.
237 lfs setstripe $f $((128 * 1024)) 1 1
239 do_facet client cp /etc/termcap $f
# Find the local OSC device number for ost2 from /proc; MDT rows excluded.
241 local osc2dev=`grep ${ost2_svc}-osc- $LPROC/devices | egrep -v 'MDT' | awk '{print $1}'`
242 $LCTL --device $osc2dev deactivate || return 3
243 # my understanding is that there should be nothing in the page
244 # cache after the client reconnects?
246 pgcache_empty || rc=2
247 $LCTL --device $osc2dev activate
251 run_test 18a "manual ost invalidate clears page cache immediately"
# --- test_18b fragment: eviction + reconnect must clear the page cache ---
254 do_facet client mkdir -p $MOUNT/$tdir
255 f=$MOUNT/$tdir/$tfile
256 f2=$MOUNT/$tdir/${tfile}-2
259 pgcache_empty || return 1
261 # shouldn't have to set stripe size of count==1
262 lfs setstripe $f $((128 * 1024)) 0 1
263 lfs setstripe $f2 $((128 * 1024)) 0 1
265 do_facet client cp /etc/termcap $f
# NOTE(review): the eviction trigger itself is on lines missing from this
# excerpt (266-267).
268 # allow recovery to complete
269 sleep $((TIMEOUT + 2))
270 # my understanding is that there should be nothing in the page
271 # cache after the client reconnects?
273 pgcache_empty || rc=2
277 run_test 18b "eviction and reconnect clears page cache (2766)"
# --- test_19a fragment: MDS expired_lock_main evicts a client that never
# answers a lock-cancel AST ($f is presumably set on a missing line) ---
281 do_facet client mcreate $f || return 1
282 drop_ldlm_cancel "chmod 0777 $f" || echo "evicted as expected"
# checkstat may fail if the client was evicted; tolerated, just logged.
284 do_facet client checkstat -v -p 0777 $f || echo evicted
285 # let the client reconnect
287 do_facet client "munlink $f"
289 run_test 19a "test expired_lock_main on mds (2867)"
# --- test_19b fragment: same scenario against an OST extent lock ---
293 do_facet client multiop $f Ow || return 1
294 do_facet client multiop $f or || return 2
298 do_facet client multiop $f or || return 3
299 drop_ldlm_cancel multiop $f Ow || echo "client evicted, as expected"
301 do_facet client munlink $f || return 4
303 run_test 19b "test expired_lock_main on ost (2867)"
305 test_20a() { # bug 2983 - ldlm_handle_enqueue cleanup
# Background writer holds the file open ($MULTI_PID presumably captured on
# a missing line).
307 multiop $DIR/$tdir/${tfile} O_wc &
311 #define OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR 0x308
312 do_facet ost1 sysctl -w lustre.fail_loc=0x80000308
313 kill -USR1 $MULTI_PID
# The enqueue is expected to fail; rc==0 means the error was lost.
316 [ $rc -eq 0 ] && error "multiop didn't fail enqueue: rc $rc" || true
318 run_test 20a "ldlm_handle_enqueue error (should return error)"
320 test_20b() { # bug 2986 - ldlm_handle_enqueue error during open
322 touch $DIR/$tdir/${tfile}
324 #define OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR 0x308
325 do_facet ost1 sysctl -w lustre.fail_loc=0x80000308
# dd must fail: a successful write means the open enqueue error was dropped.
326 dd if=/etc/hosts of=$DIR/$tdir/$tfile && \
327 error "didn't fail open enqueue" || true
329 run_test 20b "ldlm_handle_enqueue error (should return error)"
# --- test_21a fragment: close and open in flight; drop the close request.
# Pattern shared by all 21x tests: a background multiop holds -1/f open,
# a fail_loc stalls/drops an MDS op, kill -USR1 tells multiop to close,
# then both background pids are reaped and both files verified.
# ($close_pid/$open_pid captures are on lines missing from this excerpt.) ---
332 mkdir -p $DIR/$tdir-1
333 mkdir -p $DIR/$tdir-2
334 multiop $DIR/$tdir-1/f O_c &
# 0x129: presumably an MDS open-related fail_loc (ONCE bit set) — confirm
# against lustre/include obd_support.h.
337 do_facet mds "sysctl -w lustre.fail_loc=0x80000129"
338 multiop $DIR/$tdir-2/f Oc &
341 do_facet mds "sysctl -w lustre.fail_loc=0"
# 0x115 = OBD_FAIL_MDS_CLOSE_NET (see original line 139): drop the close.
343 do_facet mds "sysctl -w lustre.fail_loc=0x80000115"
344 kill -USR1 $close_pid
346 wait $close_pid || return 1
347 wait $open_pid || return 2
348 do_facet mds "sysctl -w lustre.fail_loc=0"
350 $CHECKSTAT -t file $DIR/$tdir-1/f || return 3
351 $CHECKSTAT -t file $DIR/$tdir-2/f || return 4
355 run_test 21a "drop close request while close and open are both in flight"
# --- test_21b fragment: drop the open request instead (fail_loc 0x107) ---
358 mkdir -p $DIR/$tdir-1
359 mkdir -p $DIR/$tdir-2
360 multiop $DIR/$tdir-1/f O_c &
363 do_facet mds "sysctl -w lustre.fail_loc=0x80000107"
364 mcreate $DIR/$tdir-2/f &
367 do_facet mds "sysctl -w lustre.fail_loc=0"
369 kill -USR1 $close_pid
371 wait $close_pid || return 1
372 wait $open_pid || return 3
374 $CHECKSTAT -t file $DIR/$tdir-1/f || return 4
375 $CHECKSTAT -t file $DIR/$tdir-2/f || return 5
378 run_test 21b "drop open request while close and open are both in flight"
# --- test_21c fragment: drop both the open request and the close request ---
381 mkdir -p $DIR/$tdir-1
382 mkdir -p $DIR/$tdir-2
383 multiop $DIR/$tdir-1/f O_c &
386 do_facet mds "sysctl -w lustre.fail_loc=0x80000107"
387 mcreate $DIR/$tdir-2/f &
390 do_facet mds "sysctl -w lustre.fail_loc=0"
392 do_facet mds "sysctl -w lustre.fail_loc=0x80000115"
393 kill -USR1 $close_pid
395 wait $close_pid || return 1
396 wait $open_pid || return 2
398 do_facet mds "sysctl -w lustre.fail_loc=0"
400 $CHECKSTAT -t file $DIR/$tdir-1/f || return 2
401 $CHECKSTAT -t file $DIR/$tdir-2/f || return 3
404 run_test 21c "drop both request while close and open are both in flight"
# --- test_21d fragment: drop the close REPLY (fail_loc 0x122) ---
407 mkdir -p $DIR/$tdir-1
408 mkdir -p $DIR/$tdir-2
409 multiop $DIR/$tdir-1/f O_c &
412 do_facet mds "sysctl -w lustre.fail_loc=0x80000129"
413 multiop $DIR/$tdir-2/f Oc &
415 do_facet mds "sysctl -w lustre.fail_loc=0"
417 do_facet mds "sysctl -w lustre.fail_loc=0x80000122"
420 wait $pid || return 1
421 do_facet mds "sysctl -w lustre.fail_loc=0"
423 $CHECKSTAT -t file $DIR/$tdir-1/f || return 2
424 $CHECKSTAT -t file $DIR/$tdir-2/f || return 3
428 run_test 21d "drop close reply while close and open are both in flight"
# --- test_21e fragment: drop the open REPLY (fail_loc 0x119) while a close
# is in flight; same harness shape as 21a-21d (pid captures on missing lines) ---
431 mkdir -p $DIR/$tdir-1
432 mkdir -p $DIR/$tdir-2
433 multiop $DIR/$tdir-1/f O_c &
436 do_facet mds "sysctl -w lustre.fail_loc=0x80000119"
437 touch $DIR/$tdir-2/f &
439 do_facet mds "sysctl -w lustre.fail_loc=0"
443 wait $pid || return 1
446 $CHECKSTAT -t file $DIR/$tdir-1/f || return 2
447 $CHECKSTAT -t file $DIR/$tdir-2/f || return 3
450 run_test 21e "drop open reply while close and open are both in flight"
# --- test_21f fragment: drop both replies (open 0x119, close 0x122) ---
453 mkdir -p $DIR/$tdir-1
454 mkdir -p $DIR/$tdir-2
455 multiop $DIR/$tdir-1/f O_c &
458 do_facet mds "sysctl -w lustre.fail_loc=0x80000119"
459 touch $DIR/$tdir-2/f &
461 do_facet mds "sysctl -w lustre.fail_loc=0"
463 do_facet mds "sysctl -w lustre.fail_loc=0x80000122"
466 wait $pid || return 1
467 do_facet mds "sysctl -w lustre.fail_loc=0"
469 $CHECKSTAT -t file $DIR/$tdir-1/f || return 2
470 $CHECKSTAT -t file $DIR/$tdir-2/f || return 3
473 run_test 21f "drop both reply while close and open are both in flight"
# --- test_21g fragment: drop open reply (0x119) + close request (0x115) ---
476 mkdir -p $DIR/$tdir-1
477 mkdir -p $DIR/$tdir-2
478 multiop $DIR/$tdir-1/f O_c &
481 do_facet mds "sysctl -w lustre.fail_loc=0x80000119"
482 touch $DIR/$tdir-2/f &
484 do_facet mds "sysctl -w lustre.fail_loc=0"
486 do_facet mds "sysctl -w lustre.fail_loc=0x80000115"
489 wait $pid || return 1
490 do_facet mds "sysctl -w lustre.fail_loc=0"
492 $CHECKSTAT -t file $DIR/$tdir-1/f || return 2
493 $CHECKSTAT -t file $DIR/$tdir-2/f || return 3
496 run_test 21g "drop open reply and close request while close and open are both in flight"
# --- test_21h fragment: drop open request (0x107) + close reply (0x122) ---
499 mkdir -p $DIR/$tdir-1
500 mkdir -p $DIR/$tdir-2
501 multiop $DIR/$tdir-1/f O_c &
504 do_facet mds "sysctl -w lustre.fail_loc=0x80000107"
505 touch $DIR/$tdir-2/f &
508 do_facet mds "sysctl -w lustre.fail_loc=0"
510 do_facet mds "sysctl -w lustre.fail_loc=0x80000122"
513 wait $pid || return 1
514 do_facet mds "sysctl -w lustre.fail_loc=0"
516 wait $touch_pid || return 2
518 $CHECKSTAT -t file $DIR/$tdir-1/f || return 3
519 $CHECKSTAT -t file $DIR/$tdir-2/f || return 4
522 run_test 21h "drop open request and close reply while close and open are both in flight"
524 # bug 3462 - multiple MDC requests
# --- test_22 fragment: drop a close request (0x115) while doing a mknod on
# a second file; $f1/$f2/$close_pid are set on lines missing from this excerpt ---
529 do_facet mds "sysctl -w lustre.fail_loc=0x80000115"
534 multiop $f1 msu || return 1
537 do_facet mds "sysctl -w lustre.fail_loc=0"
539 wait $close_pid || return 2
540 rm -rf $f2 || return 4
542 run_test 22 "drop close request and do mknod"
# --- test_23 fragment: close after MDS crash must not hang the client ---
545 multiop $DIR/$tfile O_c &
547 # give a chance for open
# USR1 makes multiop issue the close; drop_request drops it on the MDS.
551 drop_request "kill -USR1 $pid"
554 wait $pid || return 1
557 run_test 23 "client hang when close a file after mds crash"
559 test_24() { # bug 2248 - eviction fails writeback but app doesn't see it
# Background writer; the eviction trigger is on lines missing from this
# excerpt (563-566).
562 multiop $DIR/$tdir/$tfile Owy_wyc &
567 kill -USR1 $MULTI_PID
570 sysctl -w lustre.fail_loc=0x0
# fsync must report the writeback failure caused by eviction; rc==0 is a bug.
572 [ $rc -eq 0 ] && error "multiop didn't fail fsync: rc $rc" || true
574 run_test 24 "fsync error (should return error)"
576 test_26() { # bug 5921 - evict dead exports by pinger
577 # this test can only run from a client on a separate node.
# Skip when OST or MDS modules are loaded locally (single-node setup).
578 [ "`lsmod | grep obdfilter`" ] && \
579 echo "skipping test 26 (local OST)" && return
580 [ "`lsmod | grep mds`" ] && \
581 echo "skipping test 26 (local MDS)" && return
582 OST_FILE=$LPROC/obdfilter/${ost1_svc}/num_exports
583 OST_EXP="`do_facet ost1 cat $OST_FILE`"
# num_exports output is presumably "<label> <count>"; field 2 is the count.
584 OST_NEXP1=`echo $OST_EXP | cut -d' ' -f2`
585 echo starting with $OST_NEXP1 OST exports
586 # OBD_FAIL_PTLRPC_DROP_RPC 0x505
# No ONCE bit here: drop every RPC so the client looks dead to the pinger.
587 do_facet client sysctl -w lustre.fail_loc=0x505
588 # evictor takes up to 2.25x to evict. But if there's a
589 # race to start the evictor from various obds, the loser
590 # might have to wait for the next ping.
591 echo Waiting for $(($TIMEOUT * 4)) secs
592 sleep $(($TIMEOUT * 4))
593 OST_EXP="`do_facet ost1 cat $OST_FILE`"
594 OST_NEXP2=`echo $OST_EXP | cut -d' ' -f2`
595 echo ending with $OST_NEXP2 OST exports
596 do_facet client sysctl -w lustre.fail_loc=0x0
# Export count must have dropped, proving the dead client was evicted.
597 [ $OST_NEXP1 -le $OST_NEXP2 ] && error "client not evicted"
600 run_test 26 "evict dead exports"
602 test_26b() { # bug 10140 - evict dead exports by pinger
# Mount a second client instance, record export counts, then force-unmount
# it (-f) so it disappears without telling the servers.
603 zconf_mount `hostname` $MOUNT2
604 MDS_FILE=$LPROC/mdt/${mds_svc}/num_exports
605 MDS_NEXP1="`do_facet mds cat $MDS_FILE | cut -d' ' -f2`"
606 OST_FILE=$LPROC/obdfilter/${ost1_svc}/num_exports
607 OST_NEXP1="`do_facet ost1 cat $OST_FILE | cut -d' ' -f2`"
608 echo starting with $OST_NEXP1 OST and $MDS_NEXP1 MDS exports
609 zconf_umount `hostname` $MOUNT2 -f
610 # evictor takes up to 2.25x to evict. But if there's a
611 # race to start the evictor from various obds, the loser
612 # might have to wait for the next ping.
613 echo Waiting for $(($TIMEOUT * 4)) secs
614 sleep $(($TIMEOUT * 4))
615 OST_NEXP2="`do_facet ost1 cat $OST_FILE | cut -d' ' -f2`"
616 MDS_NEXP2="`do_facet mds cat $MDS_FILE | cut -d' ' -f2`"
617 echo ending with $OST_NEXP2 OST and $MDS_NEXP2 MDS exports
# Both servers must have evicted the vanished client's export.
618 [ $OST_NEXP1 -le $OST_NEXP2 ] && error "client not evicted from OST"
619 [ $MDS_NEXP1 -le $MDS_NEXP2 ] && error "client not evicted from MDS"
622 run_test 26b "evict dead exports"
# --- test_27 fragment: requires a LOCAL MDS (inverse of test 26's check) ---
625 [ "`lsmod | grep mds`" ] || \
626 { echo "skipping test 27 (non-local MDS)" && return 0; }
# Background load: create/write/delete files forever until signalled
# ($CLIENT_PID presumably captured on a missing line).
628 writemany -q -a $DIR/$tdir/$tfile 0 5 &
633 #define OBD_FAIL_OSC_SHUTDOWN 0x407
634 sysctl -w lustre.fail_loc=0x80000407
635 # need to wait for reconnect
636 echo -n waiting for fail_loc
# -2147482617 == 0x80000407 as a signed 32-bit int; spin until the ONCE
# fail_loc fires and clears itself.
637 while [ `sysctl -n lustre.fail_loc` -eq -2147482617 ]; do
# USR1 tells writemany to stop gracefully.
643 kill -USR1 $CLIENT_PID
647 run_test 27 "fail LOV while using OSC's"
649 test_28() { # bug 6086 - error adding new clients
650 do_facet client mcreate $MOUNT/$tfile || return 1
# Get this client evicted by dropping the blocking callback.
651 drop_bl_callback "chmod 0777 $MOUNT/$tfile" ||echo "evicted as expected"
652 #define OBD_FAIL_MDS_ADD_CLIENT 0x12f
653 do_facet mds sysctl -w lustre.fail_loc=0x8000012f
654 # fail once (evicted), reconnect fail (fail_loc), ok
# Retry df up to three times: first attempt fails on eviction, second on
# the injected add-client error, third should succeed.
655 df || (sleep 1; df) || (sleep 1; df) || error "reconnect failed"
657 fail mds # verify MDS last_rcvd can be loaded
659 run_test 28 "handle error adding new clients (bug 6086)"
# --- test_50 fragment: MDS failover while writemany load runs (the fail/
# failover calls are on lines missing from this excerpt) ---
663 # put a load of file creates/writes/deletes
664 writemany -q $DIR/$tdir/$tfile 0 5 &
666 echo writemany pid $CLIENT_PID
670 # wait for client to reconnect to MDS
675 # client process should see no problems even though MDS went down
677 kill -USR1 $CLIENT_PID
680 echo writemany returned $rc
681 #these may fail because of eviction due to slow AST response.
684 run_test 50 "failover MDS under load"
# --- test_51 fragment: repeated MDS failover at points DURING recovery ---
688 # put a load of file creates/writes/deletes
689 writemany -q $DIR/$tdir/$tfile 0 5 &
694 # failover at various points during recovery
# Failover delays: 1s, 5s, 10s, then every 5s from TIMEOUT to TIMEOUT+10.
695 SEQ="1 5 10 $(seq $TIMEOUT 5 $(($TIMEOUT+10)))"
696 echo will failover at $SEQ
699 echo failover in $i sec
703 # client process should see no problems even though MDS went down
704 # and recovery was interrupted
706 kill -USR1 $CLIENT_PID
709 echo writemany returned $rc
712 run_test 51 "failover MDS during recovery"
# --- test_52 fragment: OST failover under load; writemany runs for a fixed
# 300s instead of until signalled ---
715 do_facet client "writemany -q -a $DIR/$tdir/$tfile 300 5" &
717 echo writemany pid $CLIENT_PID
722 wait $CLIENT_PID || rc=$?
723 # active client process should see an EIO for down OST
724 [ $rc -eq 5 ] && { echo "writemany correctly failed $rc" && return 0; }
725 # but timing or failover setup may allow success
726 [ $rc -eq 0 ] && { echo "writemany succeeded" && return 0; }
# Any other exit status is unexpected; it is propagated below.
727 echo "writemany returned $rc"
735 [ $rc -ne 0 ] && { return $rc; }
736 # wait for client to reconnect to OST
740 [ $rc -ne 0 ] && { return $rc; }
747 run_test 52 "failover OST under load"