# Suite prologue: choose the tests to exclude, load the shared framework and
# configuration, bring the filesystem up and clear leftovers of earlier runs.
#
# ALWAYS_EXCEPT names test 52 plus whatever the caller supplies in
# $RECOVERY_SMALL_EXCEPT.
# NOTE(review): presumably consumed by test-framework.sh to skip tests -- confirm.
6 ALWAYS_EXCEPT="52 $RECOVERY_SMALL_EXCEPT"
8 # also long tests: 19, 21a, 21e, 21f, 23, 27
# Only when SLOW is exactly "no" are the tests below added to EXCEPT_SLOW.
10 [ "$SLOW" = "no" ] && EXCEPT_SLOW="17 26b 50 51 57"
12 #PTLDEBUG=${PTLDEBUG:--1}
# LUSTRE defaults to the parent of the directory containing this script; the
# shared test framework is sourced from there.
13 LUSTRE=${LUSTRE:-`dirname $0`/..}
14 . $LUSTRE/tests/test-framework.sh
# Source the configuration file. When CONFIG is unset or empty it is assigned
# $LUSTRE/tests/cfg/$NAME.sh first (the := expansion assigns as well as expands).
16 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
20 # Allow us to override the setup if we already have a mounted system by
21 # setting SETUP=" " and CLEANUP=" "
23 CLEANUP=${CLEANUP:-""}
25 cleanup_and_setup_lustre
# Remove entries whose names start with "d" or "f" followed by a digit, both
# in the per-suite directory and directly under $DIR.
26 rm -rf $DIR/${TESTSUITE}/[df][0-9]* # bug 13798 new t-f tdir stuff
27 rm -rf $DIR/[df][0-9]*
30 drop_request "mcreate $MOUNT/1" || return 1
31 drop_reint_reply "mcreate $MOUNT/2" || return 2
33 run_test 1 "mcreate: drop req, drop rep"
36 drop_request "tchmod 111 $MOUNT/2" || return 1
37 drop_reint_reply "tchmod 666 $MOUNT/2" || return 2
39 run_test 2 "chmod: drop req, drop rep"
42 drop_request "statone $MOUNT/2" || return 1
43 drop_reply "statone $MOUNT/2" || return 2
45 run_test 3 "stat: drop req, drop rep"
# Create the shared sample file: 4 blocks of 1M of random data written to
# $TMP/recovery-small.junk. Later tests copy and compare against $SAMPLE_FILE.
47 SAMPLE_NAME=recovery-small.junk
48 SAMPLE_FILE=$TMP/$SAMPLE_NAME
49 # make this big, else test 9 doesn't wait for bulk -- bz 5595
50 dd if=/dev/urandom of=$SAMPLE_FILE bs=1M count=4
53 do_facet client "cp $SAMPLE_FILE $MOUNT/$SAMPLE_NAME" || return 1
54 drop_request "cat $MOUNT/$SAMPLE_NAME > /dev/null" || return 2
55 drop_reply "cat $MOUNT/$SAMPLE_NAME > /dev/null" || return 3
57 run_test 4 "open: drop req, drop rep"
60 drop_request "mv $MOUNT/$SAMPLE_NAME $MOUNT/renamed" || return 1
61 drop_reint_reply "mv $MOUNT/renamed $MOUNT/renamed-again" || return 2
62 do_facet client "checkstat -v $MOUNT/renamed-again" || return 3
64 run_test 5 "rename: drop req, drop rep"
66 [ ! -e $MOUNT/renamed-again ] && cp $SAMPLE_FILE $MOUNT/renamed-again
68 drop_request "mlink $MOUNT/renamed-again $MOUNT/link1" || return 1
69 drop_reint_reply "mlink $MOUNT/renamed-again $MOUNT/link2" || return 2
71 run_test 6 "link: drop req, drop rep"
73 [ ! -e $MOUNT/link1 ] && mlink $MOUNT/renamed-again $MOUNT/link1
74 [ ! -e $MOUNT/link2 ] && mlink $MOUNT/renamed-again $MOUNT/link2
76 drop_request "munlink $MOUNT/link1" || return 1
77 drop_reint_reply "munlink $MOUNT/link2" || return 2
79 run_test 7 "unlink: drop req, drop rep"
83 drop_reint_reply "touch $MOUNT/$tfile" || return 1
85 run_test 8 "touch: drop rep (bug 1423)"
# Re-point SAMPLE_FILE at $TMP/recovery-small.junk (the same path the earlier
# SAMPLE_NAME-based assignment produces) and overwrite it with a fresh 4 x 1M
# of random data.
87 SAMPLE_FILE=$TMP/recovery-small.junk
88 dd if=/dev/urandom of=$SAMPLE_FILE bs=1M count=4
92 pause_bulk "cp /etc/profile $MOUNT/$tfile" || return 1
93 do_facet client "cp $SAMPLE_FILE $MOUNT/${tfile}.2" || return 2
94 do_facet client "sync"
95 do_facet client "rm $MOUNT/$tfile $MOUNT/${tfile}.2" || return 3
97 run_test 9 "pause bulk on OST (bug 1420)"
101 do_facet client mcreate $MOUNT/$tfile || return 1
102 drop_bl_callback "chmod 0777 $MOUNT/$tfile" || echo "evicted as expected"
103 # wait for the mds to evict the client
104 #echo "sleep $(($TIMEOUT*2))"
105 #sleep $(($TIMEOUT*2))
106 do_facet client touch $MOUNT/$tfile || echo "touch failed, evicted"
107 do_facet client checkstat -v -p 0777 $MOUNT/$tfile || return 3
108 do_facet client "munlink $MOUNT/$tfile"
110 run_test 10 "finish request on server after client eviction (bug 1521)"
113 # wake up a thread waiting for completion after eviction
115 do_facet client multiop $MOUNT/$tfile Ow || return 1
116 do_facet client multiop $MOUNT/$tfile or || return 2
120 do_facet client multiop $MOUNT/$tfile or || return 3
121 drop_bl_callback multiop $MOUNT/$tfile Ow || echo "evicted as expected"
123 do_facet client munlink $MOUNT/$tfile || return 4
125 run_test 11 "wake up a thread waiting for completion after eviction (b=2460)"
129 $LCTL mark multiop $MOUNT/$tfile OS_c
130 do_facet mds "sysctl -w lustre.fail_loc=0x115"
131 clear_failloc mds $((TIMEOUT * 2)) &
132 multiop $MOUNT/$tfile OS_c &
134 #define OBD_FAIL_MDS_CLOSE_NET 0x115
137 echo "waiting for multiop $PID"
138 wait $PID || return 2
139 do_facet client munlink $MOUNT/$tfile || return 3
141 run_test 12 "recover from timed out resend in ptlrpcd (b=2494)"
143 # Bug 1138, check that readdir lost recv timeout works.
# Body of the readdir restart test: build a directory with one entry, arm the
# fail_loc on the mds facet, list the directory, then disarm and clean up.
# Each failing step returns its own code (1-4) so the failing step can be
# identified from the return value.
145 mkdir $MOUNT/readdir || return 1
# Explicit code 2: a bare `return` passes on touch's own exit status
# (typically 1), which cannot be told apart from the mkdir failure above.
146 touch $MOUNT/readdir/newentry || return 2
147 # OBD_FAIL_MDS_READPAGE_NET|OBD_FAIL_ONCE
148 do_facet mds "sysctl -w lustre.fail_loc=0x80000104"
# The listing runs while the fail_loc set above is in effect.
149 ls $MOUNT/readdir || return 3
150 do_facet mds "sysctl -w lustre.fail_loc=0"
151 rm -rf $MOUNT/readdir || return 4
153 run_test 13 "mdc_readpage restart test (bug 1138)"
155 # Bug 1138, check that readdir lost send timeout works.
158 touch $MOUNT/readdir/newentry
159 # OBD_FAIL_MDS_SENDPAGE|OBD_FAIL_ONCE
160 do_facet mds "sysctl -w lustre.fail_loc=0x80000106"
161 ls $MOUNT/readdir || return 1
162 do_facet mds "sysctl -w lustre.fail_loc=0"
164 run_test 14 "mdc_readpage resend test (bug 1138)"
167 do_facet mds "sysctl -w lustre.fail_loc=0x80000128"
168 touch $DIR/$tfile && return 1
171 run_test 15 "failed open (-ENOMEM)"
173 READ_AHEAD=`cat $LPROC/llite/*/max_read_ahead_mb | head -n 1`
175 for f in $LPROC/llite/*/max_read_ahead_mb; do
181 for f in $LPROC/llite/*/max_read_ahead_mb; do
182 echo $READ_AHEAD > $f
187 do_facet client cp $SAMPLE_FILE $MOUNT
191 #define OBD_FAIL_PTLRPC_BULK_PUT_NET 0x504 | OBD_FAIL_ONCE
192 do_facet ost1 sysctl -w lustre.fail_loc=0x80000504
194 # OST bulk will time out here, client resends
195 do_facet client "cmp $SAMPLE_FILE $MOUNT/${SAMPLE_FILE##*/}" || return 1
196 do_facet ost1 sysctl -w lustre.fail_loc=0
197 # give recovery a chance to finish (shouldn't take long)
199 do_facet client "cmp $SAMPLE_FILE $MOUNT/${SAMPLE_FILE##*/}" || return 2
202 run_test 16 "timeout bulk put, don't evict client (2732)"
205 # With adaptive timeouts, bulk_get won't expire until adaptive_timeout_max
206 local at_max=$(do_facet ost1 "find /sys/ -name at_max")
207 [ -z "$at_max" ] && skip "missing /sys/.../at_max" && return 0
208 OST_AT_MAX=$(do_facet ost1 "cat $at_max")
209 do_facet ost1 "echo $TIMEOUT >> $at_max"
211 # OBD_FAIL_PTLRPC_BULK_GET_NET 0x0503 | OBD_FAIL_ONCE
212 # OST bulk will time out here, client retries
213 do_facet ost1 sysctl -w lustre.fail_loc=0x80000503
214 # need to ensure we send an RPC
215 do_facet client cp $SAMPLE_FILE $DIR/$tfile
218 # with AT, client will wait adaptive_max*factor+net_latency before
219 # expiring the req, hopefully timeout*2 is enough
220 sleep $(($TIMEOUT*2))
222 do_facet ost1 sysctl -w lustre.fail_loc=0
223 do_facet client "df $DIR"
224 # expect cmp to succeed, client resent bulk
225 do_facet client "cmp $SAMPLE_FILE $DIR/$tfile" || return 3
226 do_facet client "rm $DIR/$tfile" || return 4
227 do_facet ost1 "echo $OST_AT_MAX >> $at_max"
230 run_test 17 "timeout bulk get, don't evict client (2732)"
233 [ -z ${ost2_svc} ] && skip "needs 2 osts" && return 0
235 do_facet client mkdir -p $MOUNT/$tdir
236 f=$MOUNT/$tdir/$tfile
239 pgcache_empty || return 1
242 lfs setstripe $f -s $((128 * 1024)) -i 1 -c 1
244 do_facet client cp $SAMPLE_FILE $f
246 local osc2dev=`grep ${ost2_svc}-osc- $LPROC/devices | awk '{print $1}'`
247 $LCTL --device $osc2dev deactivate || return 3
248 # my understanding is that there should be nothing in the page
249 # cache after the client reconnects?
251 pgcache_empty || rc=2
252 $LCTL --device $osc2dev activate
256 run_test 18a "manual ost invalidate clears page cache immediately"
259 do_facet client mkdir -p $MOUNT/$tdir
260 f=$MOUNT/$tdir/$tfile
261 f2=$MOUNT/$tdir/${tfile}-2
264 pgcache_empty || return 1
266 # shouldn't have to set stripe size if count==1
267 lfs setstripe $f -s $((128 * 1024)) -i 0 -c 1
268 lfs setstripe $f2 -s $((128 * 1024)) -i 0 -c 1
270 do_facet client cp $SAMPLE_FILE $f
274 df $MOUNT > /dev/null 2>&1
276 # my understanding is that there should be nothing in the page
277 # cache after the client reconnects?
279 pgcache_empty || rc=2
283 run_test 18b "eviction and reconnect clears page cache (2766)"
287 do_facet client mcreate $f || return 1
288 drop_ldlm_cancel "chmod 0777 $f" || echo "evicted as expected"
290 do_facet client checkstat -v -p 0777 $f || echo evicted
291 # let the client reconnect
293 do_facet client "munlink $f"
295 run_test 19a "test expired_lock_main on mds (2867)"
299 do_facet client multiop $f Ow || return 1
300 do_facet client multiop $f or || return 2
304 do_facet client multiop $f or || return 3
305 drop_ldlm_cancel multiop $f Ow || echo "client evicted, as expected"
307 do_facet client munlink $f || return 4
309 run_test 19b "test expired_lock_main on ost (2867)"
311 test_20a() { # bug 2983 - ldlm_handle_enqueue cleanup
313 multiop $DIR/$tdir/${tfile} O_wc &
317 #define OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR 0x308
318 do_facet ost1 sysctl -w lustre.fail_loc=0x80000308
319 kill -USR1 $MULTI_PID
322 [ $rc -eq 0 ] && error "multiop didn't fail enqueue: rc $rc" || true
324 run_test 20a "ldlm_handle_enqueue error (should return error)"
326 test_20b() { # bug 2986 - ldlm_handle_enqueue error during open
328 touch $DIR/$tdir/${tfile}
330 #define OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR 0x308
331 do_facet ost1 sysctl -w lustre.fail_loc=0x80000308
332 dd if=/etc/hosts of=$DIR/$tdir/$tfile && \
333 error "didn't fail open enqueue" || true
335 run_test 20b "ldlm_handle_enqueue error (should return error)"
338 mkdir -p $DIR/$tdir-1
339 mkdir -p $DIR/$tdir-2
340 multiop $DIR/$tdir-1/f O_c &
343 do_facet mds "sysctl -w lustre.fail_loc=0x80000129"
344 multiop $DIR/$tdir-2/f Oc &
347 do_facet mds "sysctl -w lustre.fail_loc=0"
349 do_facet mds "sysctl -w lustre.fail_loc=0x80000115"
350 kill -USR1 $close_pid
352 wait $close_pid || return 1
353 wait $open_pid || return 2
354 do_facet mds "sysctl -w lustre.fail_loc=0"
356 $CHECKSTAT -t file $DIR/$tdir-1/f || return 3
357 $CHECKSTAT -t file $DIR/$tdir-2/f || return 4
361 run_test 21a "drop close request while close and open are both in flight"
364 mkdir -p $DIR/$tdir-1
365 mkdir -p $DIR/$tdir-2
366 multiop $DIR/$tdir-1/f O_c &
369 do_facet mds "sysctl -w lustre.fail_loc=0x80000107"
370 mcreate $DIR/$tdir-2/f &
373 do_facet mds "sysctl -w lustre.fail_loc=0"
375 kill -USR1 $close_pid
377 wait $close_pid || return 1
378 wait $open_pid || return 3
380 $CHECKSTAT -t file $DIR/$tdir-1/f || return 4
381 $CHECKSTAT -t file $DIR/$tdir-2/f || return 5
384 run_test 21b "drop open request while close and open are both in flight"
387 mkdir -p $DIR/$tdir-1
388 mkdir -p $DIR/$tdir-2
389 multiop $DIR/$tdir-1/f O_c &
392 do_facet mds "sysctl -w lustre.fail_loc=0x80000107"
393 mcreate $DIR/$tdir-2/f &
396 do_facet mds "sysctl -w lustre.fail_loc=0"
398 do_facet mds "sysctl -w lustre.fail_loc=0x80000115"
399 kill -USR1 $close_pid
401 wait $close_pid || return 1
402 wait $open_pid || return 2
404 do_facet mds "sysctl -w lustre.fail_loc=0"
406 $CHECKSTAT -t file $DIR/$tdir-1/f || return 2
407 $CHECKSTAT -t file $DIR/$tdir-2/f || return 3
410 run_test 21c "drop both request while close and open are both in flight"
413 mkdir -p $DIR/$tdir-1
414 mkdir -p $DIR/$tdir-2
415 multiop $DIR/$tdir-1/f O_c &
418 do_facet mds "sysctl -w lustre.fail_loc=0x80000129"
419 multiop $DIR/$tdir-2/f Oc &
421 do_facet mds "sysctl -w lustre.fail_loc=0"
423 do_facet mds "sysctl -w lustre.fail_loc=0x80000122"
426 wait $pid || return 1
427 do_facet mds "sysctl -w lustre.fail_loc=0"
429 $CHECKSTAT -t file $DIR/$tdir-1/f || return 2
430 $CHECKSTAT -t file $DIR/$tdir-2/f || return 3
434 run_test 21d "drop close reply while close and open are both in flight"
437 mkdir -p $DIR/$tdir-1
438 mkdir -p $DIR/$tdir-2
439 multiop $DIR/$tdir-1/f O_c &
442 do_facet mds "sysctl -w lustre.fail_loc=0x80000119"
443 touch $DIR/$tdir-2/f &
445 do_facet mds "sysctl -w lustre.fail_loc=0"
449 wait $pid || return 1
452 $CHECKSTAT -t file $DIR/$tdir-1/f || return 2
453 $CHECKSTAT -t file $DIR/$tdir-2/f || return 3
456 run_test 21e "drop open reply while close and open are both in flight"
459 mkdir -p $DIR/$tdir-1
460 mkdir -p $DIR/$tdir-2
461 multiop $DIR/$tdir-1/f O_c &
464 do_facet mds "sysctl -w lustre.fail_loc=0x80000119"
465 touch $DIR/$tdir-2/f &
467 do_facet mds "sysctl -w lustre.fail_loc=0"
469 do_facet mds "sysctl -w lustre.fail_loc=0x80000122"
472 wait $pid || return 1
473 do_facet mds "sysctl -w lustre.fail_loc=0"
475 $CHECKSTAT -t file $DIR/$tdir-1/f || return 2
476 $CHECKSTAT -t file $DIR/$tdir-2/f || return 3
479 run_test 21f "drop both reply while close and open are both in flight"
482 mkdir -p $DIR/$tdir-1
483 mkdir -p $DIR/$tdir-2
484 multiop $DIR/$tdir-1/f O_c &
487 do_facet mds "sysctl -w lustre.fail_loc=0x80000119"
488 touch $DIR/$tdir-2/f &
490 do_facet mds "sysctl -w lustre.fail_loc=0"
492 do_facet mds "sysctl -w lustre.fail_loc=0x80000115"
495 wait $pid || return 1
496 do_facet mds "sysctl -w lustre.fail_loc=0"
498 $CHECKSTAT -t file $DIR/$tdir-1/f || return 2
499 $CHECKSTAT -t file $DIR/$tdir-2/f || return 3
502 run_test 21g "drop open reply and close request while close and open are both in flight"
505 mkdir -p $DIR/$tdir-1
506 mkdir -p $DIR/$tdir-2
507 multiop $DIR/$tdir-1/f O_c &
510 do_facet mds "sysctl -w lustre.fail_loc=0x80000107"
511 touch $DIR/$tdir-2/f &
514 do_facet mds "sysctl -w lustre.fail_loc=0"
516 do_facet mds "sysctl -w lustre.fail_loc=0x80000122"
519 wait $pid || return 1
520 do_facet mds "sysctl -w lustre.fail_loc=0"
522 wait $touch_pid || return 2
524 $CHECKSTAT -t file $DIR/$tdir-1/f || return 3
525 $CHECKSTAT -t file $DIR/$tdir-2/f || return 4
528 run_test 21h "drop open request and close reply while close and open are both in flight"
530 # bug 3462 - multiple MDC requests
535 do_facet mds "sysctl -w lustre.fail_loc=0x80000115"
540 multiop $f1 msu || return 1
543 do_facet mds "sysctl -w lustre.fail_loc=0"
545 wait $close_pid || return 2
546 rm -rf $f2 || return 4
548 run_test 22 "drop close request and do mknod"
551 multiop $DIR/$tfile O_c &
553 # give a chance for open
557 drop_request "kill -USR1 $pid"
560 wait $pid || return 1
563 run_test 23 "client hang when close a file after mds crash"
565 test_24() { # bug 2248 - eviction fails writeback but app doesn't see it
568 multiop $DIR/$tdir/$tfile Owy_wyc &
573 kill -USR1 $MULTI_PID
576 sysctl -w lustre.fail_loc=0x0
578 [ $rc -eq 0 ] && error_ignore 5494 "multiop didn't fail fsync: rc $rc" || true
580 run_test 24 "fsync error (should return error)"
# Check that an OST drops a client's export once the client stops sending RPCs.
# Reads the export count on ost1, sets fail_loc 0x505 on the client, waits
# 4 * $TIMEOUT seconds, reads the count again and reports an error unless it
# went down.
582 test_26() { # bug 5921 - evict dead exports by pinger
583 # this test can only run from a client on a separate node.
# `a || b && c` parses as `(a || b) && c`, so the former
# `remote_ost || skip ... && return` returned even when the OST was remote and
# the test never ran. Group the skip path so it is taken only when the check
# fails.
584 remote_ost || { skip "local OST" && return 0; }
585 remote_mds || { skip "local MDS" && return 0; }
586 OST_FILE=$LPROC/obdfilter/${ost1_svc}/num_exports
# The export count is taken from the second space-separated field of the file.
587 OST_EXP="`do_facet ost1 cat $OST_FILE`"
588 OST_NEXP1=`echo $OST_EXP | cut -d' ' -f2`
589 echo starting with $OST_NEXP1 OST exports
590 # OBD_FAIL_PTLRPC_DROP_RPC 0x505
591 do_facet client sysctl -w lustre.fail_loc=0x505
592 # evictor takes up to 2.25x to evict. But if there's a
593 # race to start the evictor from various obds, the loser
594 # might have to wait for the next ping.
595 echo Waiting for $(($TIMEOUT * 4)) secs
596 sleep $(($TIMEOUT * 4))
597 OST_EXP="`do_facet ost1 cat $OST_FILE`"
598 OST_NEXP2=`echo $OST_EXP | cut -d' ' -f2`
599 echo ending with $OST_NEXP2 OST exports
600 do_facet client sysctl -w lustre.fail_loc=0x0
# An export count that did not decrease means the client was not evicted.
601 [ $OST_NEXP1 -le $OST_NEXP2 ] && error "client not evicted"
604 run_test 26 "evict dead exports"
# Check that the MDS and ost1 both drop the exports of a client that was
# force-unmounted. Mounts a second client at $MOUNT2, records the export counts
# on both servers, force-unmounts it, waits 8 * $TIMEOUT seconds and reports an
# error for each server whose count did not decrease.
606 test_26b() { # bug 10140 - evict dead exports by pinger
608 zconf_mount `hostname` $MOUNT2 || error "Failed to mount $MOUNT2"
# Export counts are the second space-separated field of each num_exports file.
609 MDS_FILE=$LPROC/mds/${mds_svc}/num_exports
610 MDS_NEXP1="`do_facet mds cat $MDS_FILE | cut -d' ' -f2`"
611 OST_FILE=$LPROC/obdfilter/${ost1_svc}/num_exports
612 OST_NEXP1="`do_facet ost1 cat $OST_FILE | cut -d' ' -f2`"
613 echo starting with $OST_NEXP1 OST and $MDS_NEXP1 MDS exports
614 #force umount a client; exports should get evicted
615 zconf_umount `hostname` $MOUNT2 -f
616 # evictor takes PING_EVICT_TIMEOUT + 3 * PING_INTERVAL to evict.
617 # But if there's a race to start the evictor from various obds,
618 # the loser might have to wait for the next ping.
619 echo Waiting for $(($TIMEOUT * 8)) secs
620 sleep $(($TIMEOUT * 8))
621 OST_NEXP2="`do_facet ost1 cat $OST_FILE | cut -d' ' -f2`"
622 MDS_NEXP2="`do_facet mds cat $MDS_FILE | cut -d' ' -f2`"
623 echo ending with $OST_NEXP2 OST and $MDS_NEXP2 MDS exports
# The counts must have gone down on both servers.
624 [ $OST_NEXP1 -le $OST_NEXP2 ] && error "client not evicted from OST"
625 [ $MDS_NEXP1 -le $MDS_NEXP2 ] && error "client not evicted from MDS"
628 run_test 26b "evict dead exports"
631 remote_mds && { skip "remote MDS" && return 0; }
633 writemany -q -a $DIR/$tdir/$tfile 0 5 &
638 #define OBD_FAIL_OSC_SHUTDOWN 0x407
639 sysctl -w lustre.fail_loc=0x80000407
640 # need to wait for reconnect
641 echo -n waiting for fail_loc
642 while [ `sysctl -n lustre.fail_loc` -eq -2147482617 ]; do
648 kill -USR1 $CLIENT_PID
652 run_test 27 "fail LOV while using OSC's"
# Exercise reconnection when the MDS fails while adding a client: create a
# file, run chmod through drop_bl_callback (a failure there is reported as the
# expected eviction), arm fail_loc 0x8000012f on the mds facet, then require
# that df succeeds within three attempts.
654 test_28() { # bug 6086 - error adding new clients
655 do_facet client mcreate $MOUNT/$tfile || return 1
656 drop_bl_callback "chmod 0777 $MOUNT/$tfile" ||echo "evicted as expected"
657 #define OBD_FAIL_MDS_ADD_CLIENT 0x12f
658 do_facet mds sysctl -w lustre.fail_loc=0x8000012f
659 # fail once (evicted), reconnect fail (fail_loc), ok
# Up to three df attempts, one second apart; an error is reported only if all
# three fail.
660 df || (sleep 1; df) || (sleep 1; df) || error "reconnect failed"
662 fail mds # verify MDS last_rcvd can be loaded
664 run_test 28 "handle error adding new clients (bug 6086)"
669 sysctl -w lnet.debug="-dlmtrace -ha"
670 # put a load of file creates/writes/deletes
671 writemany -q $DIR/$tdir/$tfile 0 5 &
673 echo writemany pid $CLIENT_PID
676 $LCTL mark "$TESTNAME fail mds 1"
678 # wait for client to reconnect to MDS
680 $LCTL mark "$TESTNAME fail mds 2"
683 $LCTL mark "$TESTNAME fail mds 3"
685 # client process should see no problems even though MDS went down
687 kill -USR1 $CLIENT_PID
690 echo writemany returned $rc
691 #these may fail because of eviction due to slow AST response.
693 [ $rc -eq 0 ] || error_ignore 13652 "writemany returned rc $rc" || true
695 run_test 50 "failover MDS under load"
699 # put a load of file creates/writes/deletes
700 writemany -q $DIR/$tdir/$tfile 0 5 &
705 # failover at various points during recovery
706 SEQ="1 5 10 $(seq $TIMEOUT 5 $(($TIMEOUT+10)))"
707 echo will failover at $SEQ
710 echo failover in $i sec
712 $LCTL mark "$TESTNAME fail mds $i"
715 # client process should see no problems even though MDS went down
716 # and recovery was interrupted
718 kill -USR1 $CLIENT_PID
721 echo writemany returned $rc
722 [ $rc -eq 0 ] || error_ignore 13652 "writemany returned rc $rc" || true
724 run_test 51 "failover MDS during recovery"
727 do_facet client "writemany -q -a $DIR/$tdir/$tfile 300 5" &
729 echo writemany pid $CLIENT_PID
732 $LCTL mark "$TESTNAME fail ost $1"
735 wait $CLIENT_PID || rc=$?
736 # active client process should see an EIO for down OST
737 [ $rc -eq 5 ] && { echo "writemany correctly failed $rc" && return 0; }
738 # but timing or failover setup may allow success
739 [ $rc -eq 0 ] && { echo "writemany succeeded" && return 0; }
740 echo "writemany returned $rc"
748 [ $rc -ne 0 ] && { return $rc; }
749 # wait for client to reconnect to OST
753 [ $rc -ne 0 ] && { return $rc; }
760 run_test 52 "failover OST under load"
762 # test of open reconstruct
765 drop_ldlm_reply "openfile -f O_RDWR:O_CREAT -m 0755 $DIR/$tfile" ||\
768 run_test 53 "touch: drop rep"
771 zconf_mount `hostname` $MOUNT2
775 cat $DIR2/$tfile.missing # save transno = 0, rc != 0 into last_rcvd
778 ERROR=`dmesg | egrep "(test 54|went back in time)" | tail -n1 | grep "went back in time"`
779 [ x"$ERROR" == x ] || error "back in time occured"
781 run_test 54 "back in time"
783 # bug 11330 - liblustre application death during I/O locks up OST
785 remote_ost && { skip "remote OST" && return 0; }
789 # first dd should be finished quickly
790 dd if=/dev/zero of=$DIR/$tdir/$tfile-1 bs=32M count=4 &
793 echo "step1: testing ......"
795 if [ -z `ps x | awk '$1 == '$DDPID' { print $5 }'` ]; then break; fi
797 if [ $count -gt 64 ]; then
798 error "dd should be finished!"
802 echo "(dd_pid=$DDPID, time=$count)successful"
804 #define OBD_FAIL_OST_DROP_REQ 0x21d
805 do_facet ost sysctl -w lustre.fail_loc=0x0000021d
806 # second dd will never finish
807 dd if=/dev/zero of=$DIR/$tdir/$tfile-2 bs=32M count=4 &
810 echo "step2: testing ......"
811 while [ $count -le 64 ]; do
812 dd_name="`ps x | awk '$1 == '$DDPID' { print $5 }'`"
813 if [ -z $dd_name ]; then
815 echo "debug: (dd_name=$dd_name, dd_pid=$DDPID, time=$count)"
816 error "dd shouldn't be finished!"
821 echo "(dd_pid=$DDPID, time=$count)successful"
823 #Recover fail_loc and dd will finish soon
824 do_facet ost sysctl -w lustre.fail_loc=0
826 echo "step3: testing ......"
828 if [ -z `ps x | awk '$1 == '$DDPID' { print $5 }'` ]; then break; fi
830 if [ $count -gt 500 ]; then
831 error "dd should be finished!"
835 echo "(dd_pid=$DDPID, time=$count)successful"
839 run_test 55 "ost_brw_read/write drops timed-out read/write request"
841 test_56() { # b=11277
842 #define OBD_FAIL_MDS_RESEND 0x136
844 do_facet mds sysctl -w lustre.fail_loc=0x80000136
846 do_facet mds sysctl -w lustre.fail_loc=0
849 run_test 56 "do not allow reconnect to busy exports"
852 # no oscs means no client or mdt
853 while [ -e $LPROC/osc ]; do
854 for f in `find $LPROC -type f`; do
855 cat $f > /dev/null 2>&1
860 test_57() { # bug 10866
864 #define OBD_FAIL_LPROC_REMOVE 0xB00
865 sysctl -w lustre.fail_loc=0x80000B00
866 zconf_umount `hostname` $DIR
867 sysctl -w lustre.fail_loc=0x80000B00
870 sysctl -w lustre.fail_loc=0
872 do_facet client "df $DIR"
874 run_test 57 "read procfs entries causes kernel crash"
876 test_58() { # bug 11546
877 #define OBD_FAIL_MDC_ENQUEUE_PAUSE 0x801
880 sysctl -w lustre.fail_loc=0x80000801
881 cp $MOUNT/$tfile /dev/null &
884 sysctl -w lustre.fail_loc=0
885 drop_bl_callback rm -f $MOUNT/$tfile
887 do_facet client "df $DIR"
889 run_test 58 "Eviction in the middle of open RPC reply processing"
# Write one block through a second mount while fail_loc 0x311 is set on the
# local node, force-unmount that mount, then read the file back through $DIR
# and require that the number of blocks read equals the number written.
# NOTE(review): the meaning of fail_loc 0x311 is not shown here -- confirm
# against the OBD_FAIL_* definitions.
891 test_59() { # bug 10589
892 zconf_mount `hostname` $MOUNT2 || error "Failed to mount $MOUNT2"
893 sysctl -w lustre.fail_loc=0x311
# awk splits on "+" and prints the first field of dd's "... out" line.
894 writes=`dd if=/dev/zero of=$DIR2/$tfile count=1 2>&1 | awk 'BEGIN { FS="+" } /out/ {print $1}'`
895 sysctl -w lustre.fail_loc=0
897 zconf_umount `hostname` $DIR2 -f
# Same parsing applied to dd's "... in" line for the read side.
898 reads=`dd if=$DIR/$tfile of=/dev/null 2>&1 | awk 'BEGIN { FS="+" } /in/ {print $1}'`
899 [ $reads -eq $writes ] || error "read" $reads "blocks, must be" $writes
901 run_test 59 "Read cancel race on client eviction"
# Suite epilogue: announce completion, tear the filesystem down and print the
# suite log when it exists.
903 equals_msg `basename $0`: test complete, cleaning up
904 check_and_cleanup_lustre
# The trailing `|| true` keeps the final status zero when the log file is
# absent (or when cat fails).
905 [ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG || true