6 ALWAYS_EXCEPT="52 $RECOVERY_SMALL_EXCEPT"
8 #PTLDEBUG=${PTLDEBUG:--1}
9 LUSTRE=${LUSTRE:-`dirname $0`/..}
10 . $LUSTRE/tests/test-framework.sh
12 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
16 # Allow us to override the setup if we already have a mounted system by
17 # setting SETUP=" " and CLEANUP=" "
19 CLEANUP=${CLEANUP:-""}
21 cleanup_and_setup_lustre
22 rm -rf $DIR/${TESTSUITE}/[df][0-9]* # bug 13798 new t-f tdir staff
23 rm -rf $DIR/[df][0-9]*
26 drop_request "mcreate $MOUNT/1" || return 1
27 drop_reint_reply "mcreate $MOUNT/2" || return 2
29 run_test 1 "mcreate: drop req, drop rep"
32 drop_request "tchmod 111 $MOUNT/2" || return 1
33 drop_reint_reply "tchmod 666 $MOUNT/2" || return 2
35 run_test 2 "chmod: drop req, drop rep"
38 drop_request "statone $MOUNT/2" || return 1
39 drop_reply "statone $MOUNT/2" || return 2
41 run_test 3 "stat: drop req, drop rep"
43 SAMPLE_NAME=recovery-small.junk
44 SAMPLE_FILE=$TMP/$SAMPLE_NAME
45 # make this big, else test 9 doesn't wait for bulk -- bz 5595
46 dd if=/dev/urandom of=$SAMPLE_FILE bs=1M count=4
49 do_facet client "cp $SAMPLE_FILE $MOUNT/$SAMPLE_NAME" || return 1
50 drop_request "cat $MOUNT/$SAMPLE_NAME > /dev/null" || return 2
51 drop_reply "cat $MOUNT/$SAMPLE_NAME > /dev/null" || return 3
53 run_test 4 "open: drop req, drop rep"
56 drop_request "mv $MOUNT/$SAMPLE_NAME $MOUNT/renamed" || return 1
57 drop_reint_reply "mv $MOUNT/renamed $MOUNT/renamed-again" || return 2
58 do_facet client "checkstat -v $MOUNT/renamed-again" || return 3
60 run_test 5 "rename: drop req, drop rep"
62 [ ! -e $MOUNT/renamed-again ] && cp $SAMPLE_FILE $MOUNT/renamed-again
64 drop_request "mlink $MOUNT/renamed-again $MOUNT/link1" || return 1
65 drop_reint_reply "mlink $MOUNT/renamed-again $MOUNT/link2" || return 2
67 run_test 6 "link: drop req, drop rep"
69 [ ! -e $MOUNT/link1 ] && mlink $MOUNT/renamed-again $MOUNT/link1
70 [ ! -e $MOUNT/link2 ] && mlink $MOUNT/renamed-again $MOUNT/link2
72 drop_request "munlink $MOUNT/link1" || return 1
73 drop_reint_reply "munlink $MOUNT/link2" || return 2
75 run_test 7 "unlink: drop req, drop rep"
79 drop_reint_reply "touch $MOUNT/$tfile" || return 1
81 run_test 8 "touch: drop rep (bug 1423)"
83 SAMPLE_FILE=$TMP/recovery-small.junk
84 dd if=/dev/urandom of=$SAMPLE_FILE bs=1M count=4
88 pause_bulk "cp /etc/profile $MOUNT/$tfile" || return 1
89 do_facet client "cp $SAMPLE_FILE $MOUNT/${tfile}.2" || return 2
90 do_facet client "sync"
91 do_facet client "rm $MOUNT/$tfile $MOUNT/${tfile}.2" || return 3
93 run_test 9 "pause bulk on OST (bug 1420)"
97 do_facet client mcreate $MOUNT/$tfile || return 1
98 drop_bl_callback "chmod 0777 $MOUNT/$tfile" || echo "evicted as expected"
99 # wait for the mds to evict the client
100 #echo "sleep $(($TIMEOUT*2))"
101 #sleep $(($TIMEOUT*2))
102 do_facet client touch $MOUNT/$tfile || echo "touch failed, evicted"
103 do_facet client checkstat -v -p 0777 $MOUNT/$tfile || return 3
104 do_facet client "munlink $MOUNT/$tfile"
106 run_test 10 "finish request on server after client eviction (bug 1521)"
109 # wake up a thread waiting for completion after eviction
111 do_facet client multiop $MOUNT/$tfile Ow || return 1
112 do_facet client multiop $MOUNT/$tfile or || return 2
116 do_facet client multiop $MOUNT/$tfile or || return 3
117 drop_bl_callback multiop $MOUNT/$tfile Ow || echo "evicted as expected"
119 do_facet client munlink $MOUNT/$tfile || return 4
121 run_test 11 "wake up a thread waiting for completion after eviction (b=2460)"
125 $LCTL mark multiop $MOUNT/$tfile OS_c
126 do_facet mds "sysctl -w lustre.fail_loc=0x115"
127 clear_failloc mds $((TIMEOUT * 2)) &
128 multiop $MOUNT/$tfile OS_c &
130 #define OBD_FAIL_MDS_CLOSE_NET 0x115
133 echo "waiting for multiop $PID"
134 wait $PID || return 2
135 do_facet client munlink $MOUNT/$tfile || return 3
137 run_test 12 "recover from timed out resend in ptlrpcd (b=2494)"
139 # Bug 1138, check that readdir lost recv timeout works.
141 mkdir $MOUNT/readdir || return 1
142 touch $MOUNT/readdir/newentry || return
143 # OBD_FAIL_MDS_READPAGE_NET|OBD_FAIL_ONCE
144 do_facet mds "sysctl -w lustre.fail_loc=0x80000104"
145 ls $MOUNT/readdir || return 3
146 do_facet mds "sysctl -w lustre.fail_loc=0"
147 rm -rf $MOUNT/readdir || return 4
149 run_test 13 "mdc_readpage restart test (bug 1138)"
151 # Bug 1138, check that readdir lost send timeout works.
154 touch $MOUNT/readdir/newentry
155 # OBD_FAIL_MDS_SENDPAGE|OBD_FAIL_ONCE
156 do_facet mds "sysctl -w lustre.fail_loc=0x80000106"
157 ls $MOUNT/readdir || return 1
158 do_facet mds "sysctl -w lustre.fail_loc=0"
160 run_test 14 "mdc_readpage resend test (bug 1138)"
163 do_facet mds "sysctl -w lustre.fail_loc=0x80000128"
164 touch $DIR/$tfile && return 1
167 run_test 15 "failed open (-ENOMEM)"
169 READ_AHEAD=`cat $LPROC/llite/*/max_read_ahead_mb | head -n 1`
171 for f in $LPROC/llite/*/max_read_ahead_mb; do
177 for f in $LPROC/llite/*/max_read_ahead_mb; do
178 echo $READ_AHEAD > $f
183 do_facet client cp $SAMPLE_FILE $MOUNT
187 #define OBD_FAIL_PTLRPC_BULK_PUT_NET 0x504 | OBD_FAIL_ONCE
188 do_facet ost1 sysctl -w lustre.fail_loc=0x80000504
190 # OST bulk will time out here, client resends
191 do_facet client "cmp $SAMPLE_FILE $MOUNT/${SAMPLE_FILE##*/}" || return 1
192 do_facet ost1 sysctl -w lustre.fail_loc=0
193 # give recovery a chance to finish (shouldn't take long)
195 do_facet client "cmp $SAMPLE_FILE $MOUNT/${SAMPLE_FILE##*/}" || return 2
198 run_test 16 "timeout bulk put, don't evict client (2732)"
201 # With adaptive timeouts, bulk_get won't expire until adaptive_timeout_max
202 local at_max=$(do_facet ost1 "find /sys/ -name at_max")
203 [ -z "$at_max" ] && skip "missing /sys/.../at_max" && return 0
204 OST_AT_MAX=$(do_facet ost1 "cat $at_max")
205 do_facet ost1 "echo $TIMEOUT >> $at_max"
207 # OBD_FAIL_PTLRPC_BULK_GET_NET 0x0503 | OBD_FAIL_ONCE
208 # OST bulk will time out here, client retries
209 do_facet ost1 sysctl -w lustre.fail_loc=0x80000503
210 # need to ensure we send an RPC
211 do_facet client cp $SAMPLE_FILE $DIR/$tfile
214 # with AT, client will wait adaptive_max*factor+net_latency before
215 # expiring the req, hopefully timeout*2 is enough
216 sleep $(($TIMEOUT*2))
218 do_facet ost1 sysctl -w lustre.fail_loc=0
219 do_facet client "df $DIR"
220 # expect cmp to succeed, client resent bulk
221 do_facet client "cmp $SAMPLE_FILE $DIR/$tfile" || return 3
222 do_facet client "rm $DIR/$tfile" || return 4
223 do_facet ost1 "echo $OST_AT_MAX >> $at_max"
226 run_test 17 "timeout bulk get, don't evict client (2732)"
229 [ -z ${ost2_svc} ] && skip "needs 2 osts" && return 0
231 do_facet client mkdir -p $MOUNT/$tdir
232 f=$MOUNT/$tdir/$tfile
235 pgcache_empty || return 1
238 lfs setstripe $f $((128 * 1024)) 1 1
240 do_facet client cp $SAMPLE_FILE $f
242 local osc2dev=`grep ${ost2_svc}-osc- $LPROC/devices | awk '{print $1}'`
243 $LCTL --device $osc2dev deactivate || return 3
244 # my understanding is that there should be nothing in the page
245 # cache after the client reconnects?
247 pgcache_empty || rc=2
248 $LCTL --device $osc2dev activate
252 run_test 18a "manual ost invalidate clears page cache immediately"
255 do_facet client mkdir -p $MOUNT/$tdir
256 f=$MOUNT/$tdir/$tfile
257 f2=$MOUNT/$tdir/${tfile}-2
260 pgcache_empty || return 1
262 # shouldn't have to set stripe size if count==1
263 lfs setstripe $f $((128 * 1024)) 0 1
264 lfs setstripe $f2 $((128 * 1024)) 0 1
266 do_facet client cp $SAMPLE_FILE $f
270 df $MOUNT > /dev/null 2>&1
272 # my understanding is that there should be nothing in the page
273 # cache after the client reconnects?
275 pgcache_empty || rc=2
279 run_test 18b "eviction and reconnect clears page cache (2766)"
283 do_facet client mcreate $f || return 1
284 drop_ldlm_cancel "chmod 0777 $f" || echo "evicted as expected"
286 do_facet client checkstat -v -p 0777 $f || echo evicted
287 # let the client reconnect
289 do_facet client "munlink $f"
291 run_test 19a "test expired_lock_main on mds (2867)"
295 do_facet client multiop $f Ow || return 1
296 do_facet client multiop $f or || return 2
300 do_facet client multiop $f or || return 3
301 drop_ldlm_cancel multiop $f Ow || echo "client evicted, as expected"
303 do_facet client munlink $f || return 4
305 run_test 19b "test expired_lock_main on ost (2867)"
307 test_20a() { # bug 2983 - ldlm_handle_enqueue cleanup
309 multiop $DIR/$tdir/${tfile} O_wc &
313 #define OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR 0x308
314 do_facet ost1 sysctl -w lustre.fail_loc=0x80000308
315 kill -USR1 $MULTI_PID
318 [ $rc -eq 0 ] && error "multiop didn't fail enqueue: rc $rc" || true
320 run_test 20a "ldlm_handle_enqueue error (should return error)"
322 test_20b() { # bug 2986 - ldlm_handle_enqueue error during open
324 touch $DIR/$tdir/${tfile}
326 #define OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR 0x308
327 do_facet ost1 sysctl -w lustre.fail_loc=0x80000308
328 dd if=/etc/hosts of=$DIR/$tdir/$tfile && \
329 error "didn't fail open enqueue" || true
331 run_test 20b "ldlm_handle_enqueue error (should return error)"
334 mkdir -p $DIR/$tdir-1
335 mkdir -p $DIR/$tdir-2
336 multiop $DIR/$tdir-1/f O_c &
339 do_facet mds "sysctl -w lustre.fail_loc=0x80000129"
340 multiop $DIR/$tdir-2/f Oc &
343 do_facet mds "sysctl -w lustre.fail_loc=0"
345 do_facet mds "sysctl -w lustre.fail_loc=0x80000115"
346 kill -USR1 $close_pid
348 wait $close_pid || return 1
349 wait $open_pid || return 2
350 do_facet mds "sysctl -w lustre.fail_loc=0"
352 $CHECKSTAT -t file $DIR/$tdir-1/f || return 3
353 $CHECKSTAT -t file $DIR/$tdir-2/f || return 4
357 run_test 21a "drop close request while close and open are both in flight"
360 mkdir -p $DIR/$tdir-1
361 mkdir -p $DIR/$tdir-2
362 multiop $DIR/$tdir-1/f O_c &
365 do_facet mds "sysctl -w lustre.fail_loc=0x80000107"
366 mcreate $DIR/$tdir-2/f &
369 do_facet mds "sysctl -w lustre.fail_loc=0"
371 kill -USR1 $close_pid
373 wait $close_pid || return 1
374 wait $open_pid || return 3
376 $CHECKSTAT -t file $DIR/$tdir-1/f || return 4
377 $CHECKSTAT -t file $DIR/$tdir-2/f || return 5
380 run_test 21b "drop open request while close and open are both in flight"
383 mkdir -p $DIR/$tdir-1
384 mkdir -p $DIR/$tdir-2
385 multiop $DIR/$tdir-1/f O_c &
388 do_facet mds "sysctl -w lustre.fail_loc=0x80000107"
389 mcreate $DIR/$tdir-2/f &
392 do_facet mds "sysctl -w lustre.fail_loc=0"
394 do_facet mds "sysctl -w lustre.fail_loc=0x80000115"
395 kill -USR1 $close_pid
397 wait $close_pid || return 1
398 wait $open_pid || return 2
400 do_facet mds "sysctl -w lustre.fail_loc=0"
402 $CHECKSTAT -t file $DIR/$tdir-1/f || return 2
403 $CHECKSTAT -t file $DIR/$tdir-2/f || return 3
406 run_test 21c "drop both request while close and open are both in flight"
409 mkdir -p $DIR/$tdir-1
410 mkdir -p $DIR/$tdir-2
411 multiop $DIR/$tdir-1/f O_c &
414 do_facet mds "sysctl -w lustre.fail_loc=0x80000129"
415 multiop $DIR/$tdir-2/f Oc &
417 do_facet mds "sysctl -w lustre.fail_loc=0"
419 do_facet mds "sysctl -w lustre.fail_loc=0x80000122"
422 wait $pid || return 1
423 do_facet mds "sysctl -w lustre.fail_loc=0"
425 $CHECKSTAT -t file $DIR/$tdir-1/f || return 2
426 $CHECKSTAT -t file $DIR/$tdir-2/f || return 3
430 run_test 21d "drop close reply while close and open are both in flight"
433 mkdir -p $DIR/$tdir-1
434 mkdir -p $DIR/$tdir-2
435 multiop $DIR/$tdir-1/f O_c &
438 do_facet mds "sysctl -w lustre.fail_loc=0x80000119"
439 touch $DIR/$tdir-2/f &
441 do_facet mds "sysctl -w lustre.fail_loc=0"
445 wait $pid || return 1
448 $CHECKSTAT -t file $DIR/$tdir-1/f || return 2
449 $CHECKSTAT -t file $DIR/$tdir-2/f || return 3
452 run_test 21e "drop open reply while close and open are both in flight"
455 mkdir -p $DIR/$tdir-1
456 mkdir -p $DIR/$tdir-2
457 multiop $DIR/$tdir-1/f O_c &
460 do_facet mds "sysctl -w lustre.fail_loc=0x80000119"
461 touch $DIR/$tdir-2/f &
463 do_facet mds "sysctl -w lustre.fail_loc=0"
465 do_facet mds "sysctl -w lustre.fail_loc=0x80000122"
468 wait $pid || return 1
469 do_facet mds "sysctl -w lustre.fail_loc=0"
471 $CHECKSTAT -t file $DIR/$tdir-1/f || return 2
472 $CHECKSTAT -t file $DIR/$tdir-2/f || return 3
475 run_test 21f "drop both reply while close and open are both in flight"
478 mkdir -p $DIR/$tdir-1
479 mkdir -p $DIR/$tdir-2
480 multiop $DIR/$tdir-1/f O_c &
483 do_facet mds "sysctl -w lustre.fail_loc=0x80000119"
484 touch $DIR/$tdir-2/f &
486 do_facet mds "sysctl -w lustre.fail_loc=0"
488 do_facet mds "sysctl -w lustre.fail_loc=0x80000115"
491 wait $pid || return 1
492 do_facet mds "sysctl -w lustre.fail_loc=0"
494 $CHECKSTAT -t file $DIR/$tdir-1/f || return 2
495 $CHECKSTAT -t file $DIR/$tdir-2/f || return 3
498 run_test 21g "drop open reply and close request while close and open are both in flight"
501 mkdir -p $DIR/$tdir-1
502 mkdir -p $DIR/$tdir-2
503 multiop $DIR/$tdir-1/f O_c &
506 do_facet mds "sysctl -w lustre.fail_loc=0x80000107"
507 touch $DIR/$tdir-2/f &
510 do_facet mds "sysctl -w lustre.fail_loc=0"
512 do_facet mds "sysctl -w lustre.fail_loc=0x80000122"
515 wait $pid || return 1
516 do_facet mds "sysctl -w lustre.fail_loc=0"
518 wait $touch_pid || return 2
520 $CHECKSTAT -t file $DIR/$tdir-1/f || return 3
521 $CHECKSTAT -t file $DIR/$tdir-2/f || return 4
524 run_test 21h "drop open request and close reply while close and open are both in flight"
526 # bug 3462 - multiple MDC requests
531 do_facet mds "sysctl -w lustre.fail_loc=0x80000115"
536 multiop $f1 msu || return 1
539 do_facet mds "sysctl -w lustre.fail_loc=0"
541 wait $close_pid || return 2
542 rm -rf $f2 || return 4
544 run_test 22 "drop close request and do mknod"
547 multiop $DIR/$tfile O_c &
549 # give a chance for open
553 drop_request "kill -USR1 $pid"
556 wait $pid || return 1
559 run_test 23 "client hang when close a file after mds crash"
561 test_24() { # bug 2248 - eviction fails writeback but app doesn't see it
564 multiop $DIR/$tdir/$tfile Owy_wyc &
569 kill -USR1 $MULTI_PID
572 sysctl -w lustre.fail_loc=0x0
574 [ $rc -eq 0 ] && error_ignore 5494 "multiop didn't fail fsync: rc $rc" || true
576 run_test 24 "fsync error (should return error)"
# test_26 (bug 5921): the pinger must evict the exports of a dead client.
#
# Reads field 2 of ost1's num_exports, makes this client drop its RPCs
# (fail_loc 0x505) for TIMEOUT * 4 seconds, restores fail_loc, re-reads the
# counter and reports an error unless it went down.
#
# Globals read:    LPROC, ost1_svc, TIMEOUT
# Globals written: OST_FILE, OST_EXP, OST_NEXP1, OST_NEXP2
# Returns: 0 when skipped or when the check ran; 1 when the export count
#          could not be read.
test_26() { # bug 5921 - evict dead exports by pinger
	# this test can only run from a client on a separate node.
	# The guards must be braced: "a || b && c" groups as "(a || b) && c",
	# so the unbraced form returned immediately whenever the OST was
	# remote and the test never actually ran.
	remote_ost || { skip "local OST" && return 0; }
	remote_mds || { skip "local MDS" && return 0; }

	OST_FILE=$LPROC/obdfilter/${ost1_svc}/num_exports
	OST_EXP="$(do_facet ost1 cat "$OST_FILE")"
	# $OST_EXP is left unquoted on purpose so runs of whitespace collapse
	# before cut picks the second space-separated field.
	OST_NEXP1=$(echo $OST_EXP | cut -d' ' -f2)
	echo "starting with $OST_NEXP1 OST exports"

	# OBD_FAIL_PTLRPC_DROP_RPC 0x505
	do_facet client sysctl -w lustre.fail_loc=0x505
	# evictor takes up to 2.25x to evict.  But if there's a
	# race to start the evictor from various obds, the loser
	# might have to wait for the next ping.
	echo "Waiting for $((TIMEOUT * 4)) secs"
	sleep $((TIMEOUT * 4))

	OST_EXP="$(do_facet ost1 cat "$OST_FILE")"
	OST_NEXP2=$(echo $OST_EXP | cut -d' ' -f2)
	echo "ending with $OST_NEXP2 OST exports"
	# Always restore fail_loc before judging the result.
	do_facet client sysctl -w lustre.fail_loc=0x0

	# An empty counter would make the numeric test below fail with a
	# shell error and silently skip the eviction check.
	if [ -z "$OST_NEXP1" ] || [ -z "$OST_NEXP2" ]; then
		error "cannot read export count from $OST_FILE"
		return 1
	fi

	if [ "$OST_NEXP1" -le "$OST_NEXP2" ]; then
		error "client not evicted"
	fi
	return 0
}
600 run_test 26 "evict dead exports"
# test_26b (bug 10140): after a client is forcibly unmounted, both the MDS
# and ost1 are expected to drop its export.
#
# Steps visible here: mount a second client at $MOUNT2, record field 2 of
# num_exports on the MDS and on ost1, force-unmount $MOUNT2, sleep for
# TIMEOUT * 8 seconds, re-read both counters and call error() for each
# counter that did not decrease.
# Globals written: MDS_FILE, OST_FILE, MDS_NEXP1, MDS_NEXP2, OST_NEXP1,
# OST_NEXP2.
# NOTE(review): the counters are compared unquoted; if a do_facet read comes
# back empty, `[` fails with a shell error and error() is not called --
# confirm whether that silent pass is acceptable.
602 test_26b() { # bug 10140 - evict dead exports by pinger
604 zconf_mount `hostname` $MOUNT2 || error "Failed to mount $MOUNT2"
# Baseline export counts, taken while the second mount is still connected.
605 MDS_FILE=$LPROC/mds/${mds_svc}/num_exports
606 MDS_NEXP1="`do_facet mds cat $MDS_FILE | cut -d' ' -f2`"
607 OST_FILE=$LPROC/obdfilter/${ost1_svc}/num_exports
608 OST_NEXP1="`do_facet ost1 cat $OST_FILE | cut -d' ' -f2`"
609 echo starting with $OST_NEXP1 OST and $MDS_NEXP1 MDS exports
610 #force umount a client; exports should get evicted
611 zconf_umount `hostname` $MOUNT2 -f
612 # evictor takes PING_EVICT_TIMEOUT + 3 * PING_INTERVAL to evict.
613 # But if there's a race to start the evictor from various obds,
614 # the loser might have to wait for the next ping.
615 echo Waiting for $(($TIMEOUT * 8)) secs
616 sleep $(($TIMEOUT * 8))
# Re-read the counters; each must now be strictly lower than its baseline.
617 OST_NEXP2="`do_facet ost1 cat $OST_FILE | cut -d' ' -f2`"
618 MDS_NEXP2="`do_facet mds cat $MDS_FILE | cut -d' ' -f2`"
619 echo ending with $OST_NEXP2 OST and $MDS_NEXP2 MDS exports
620 [ $OST_NEXP1 -le $OST_NEXP2 ] && error "client not evicted from OST"
621 [ $MDS_NEXP1 -le $MDS_NEXP2 ] && error "client not evicted from MDS"
624 run_test 26b "evict dead exports"
627 remote_mds && { skip "remote MDS" && return 0; }
629 writemany -q -a $DIR/$tdir/$tfile 0 5 &
634 #define OBD_FAIL_OSC_SHUTDOWN 0x407
635 sysctl -w lustre.fail_loc=0x80000407
636 # need to wait for reconnect
637 echo -n waiting for fail_loc
638 while [ `sysctl -n lustre.fail_loc` -eq -2147482617 ]; do
644 kill -USR1 $CLIENT_PID
648 run_test 27 "fail LOV while using OSC's"
650 test_28() { # bug 6086 - error adding new clients
651 do_facet client mcreate $MOUNT/$tfile || return 1
652 drop_bl_callback "chmod 0777 $MOUNT/$tfile" ||echo "evicted as expected"
653 #define OBD_FAIL_MDS_ADD_CLIENT 0x12f
654 do_facet mds sysctl -w lustre.fail_loc=0x8000012f
655 # fail once (evicted), reconnect fail (fail_loc), ok
656 df || (sleep 1; df) || (sleep 1; df) || error "reconnect failed"
658 fail mds # verify MDS last_rcvd can be loaded
660 run_test 28 "handle error adding new clients (bug 6086)"
665 sysctl -w lnet.debug="-dlmtrace -ha"
666 # put a load of file creates/writes/deletes
667 writemany -q $DIR/$tdir/$tfile 0 5 &
669 echo writemany pid $CLIENT_PID
672 $LCTL mark "$TESTNAME fail mds 1"
674 # wait for client to reconnect to MDS
676 $LCTL mark "$TESTNAME fail mds 2"
679 $LCTL mark "$TESTNAME fail mds 3"
681 # client process should see no problems even though MDS went down
683 kill -USR1 $CLIENT_PID
686 echo writemany returned $rc
687 #these may fail because of eviction due to slow AST response.
689 [ $rc -eq 0 ] || error_ignore 13652 "writemany returned rc $rc" || true
691 run_test 50 "failover MDS under load"
695 # put a load of file creates/writes/deletes
696 writemany -q $DIR/$tdir/$tfile 0 5 &
701 # failover at various points during recovery
702 SEQ="1 5 10 $(seq $TIMEOUT 5 $(($TIMEOUT+10)))"
703 echo will failover at $SEQ
706 echo failover in $i sec
708 $LCTL mark "$TESTNAME fail mds $i"
711 # client process should see no problems even though MDS went down
712 # and recovery was interrupted
714 kill -USR1 $CLIENT_PID
717 echo writemany returned $rc
718 [ $rc -eq 0 ] || error_ignore 13652 "writemany returned rc $rc" || true
720 run_test 51 "failover MDS during recovery"
723 do_facet client "writemany -q -a $DIR/$tdir/$tfile 300 5" &
725 echo writemany pid $CLIENT_PID
728 $LCTL mark "$TESTNAME fail ost $1"
731 wait $CLIENT_PID || rc=$?
732 # active client process should see an EIO for down OST
733 [ $rc -eq 5 ] && { echo "writemany correctly failed $rc" && return 0; }
734 # but timing or failover setup may allow success
735 [ $rc -eq 0 ] && { echo "writemany succeeded" && return 0; }
736 echo "writemany returned $rc"
744 [ $rc -ne 0 ] && { return $rc; }
745 # wait for client to reconnect to OST
749 [ $rc -ne 0 ] && { return $rc; }
756 run_test 52 "failover OST under load"
758 # test of open reconstruct
761 drop_ldlm_reply "openfile -f O_RDWR:O_CREAT -m 0755 $DIR/$tfile" ||\
764 run_test 53 "touch: drop rep"
767 zconf_mount `hostname` $MOUNT2
771 cat $DIR2/$tfile.missing # save transno = 0, rc != 0 into last_rcvd
774 ERROR=`dmesg | egrep "(test 54|went back in time)" | tail -n1 | grep "went back in time"`
775 [ x"$ERROR" == x ] || error "back in time occured"
777 run_test 54 "back in time"
779 # bug 11330 - liblustre application death during I/O locks up OST
781 remote_ost && { skip "remote OST" && return 0; }
785 # first dd should be finished quickly
786 dd if=/dev/zero of=$DIR/$tdir/$tfile-1 bs=32M count=4 &
789 echo "step1: testing ......"
791 if [ -z `ps x | awk '$1 == '$DDPID' { print $5 }'` ]; then break; fi
793 if [ $count -gt 64 ]; then
794 error "dd should be finished!"
798 echo "(dd_pid=$DDPID, time=$count)successful"
800 #define OBD_FAIL_OST_DROP_REQ 0x21d
801 do_facet ost sysctl -w lustre.fail_loc=0x0000021d
802 # second dd will never finish
803 dd if=/dev/zero of=$DIR/$tdir/$tfile-2 bs=32M count=4 &
806 echo "step2: testing ......"
807 while [ $count -le 64 ]; do
808 dd_name="`ps x | awk '$1 == '$DDPID' { print $5 }'`"
809 if [ -z $dd_name ]; then
811 echo "debug: (dd_name=$dd_name, dd_pid=$DDPID, time=$count)"
812 error "dd shouldn't be finished!"
817 echo "(dd_pid=$DDPID, time=$count)successful"
819 #Recover fail_loc and dd will finish soon
820 do_facet ost sysctl -w lustre.fail_loc=0
822 echo "step3: testing ......"
824 if [ -z `ps x | awk '$1 == '$DDPID' { print $5 }'` ]; then break; fi
826 if [ $count -gt 500 ]; then
827 error "dd should be finished!"
831 echo "(dd_pid=$DDPID, time=$count)successful"
835 run_test 55 "ost_brw_read/write drops timed-out read/write request"
837 test_56() { # b=11277
838 #define OBD_FAIL_MDS_RESEND 0x136
840 do_facet mds sysctl -w lustre.fail_loc=0x80000136
842 do_facet mds sysctl -w lustre.fail_loc=0
845 run_test 56 "do not allow reconnect to busy exports"
848 # no oscs means no client or mdt
849 while [ -e $LPROC/osc ]; do
850 for f in `find $LPROC -type f`; do
851 cat $f > /dev/null 2>&1
856 test_57() { # bug 10866
860 #define OBD_FAIL_LPROC_REMOVE 0xB00
861 sysctl -w lustre.fail_loc=0x80000B00
862 zconf_umount `hostname` $DIR
863 sysctl -w lustre.fail_loc=0x80000B00
866 sysctl -w lustre.fail_loc=0
868 do_facet client "df $DIR"
870 run_test 57 "read procfs entries causes kernel crash"
872 test_58() { # bug 11546
873 #define OBD_FAIL_MDC_ENQUEUE_PAUSE 0x801
876 sysctl -w lustre.fail_loc=0x80000801
877 cp $MOUNT/$tfile /dev/null &
880 sysctl -w lustre.fail_loc=0
881 drop_bl_callback rm -f $MOUNT/$tfile
883 do_facet client "df $DIR"
885 run_test 58 "Eviction in the middle of open RPC reply processing"
# test_59 (bug 10589): read/cancel race on client eviction.
#
# Steps visible here: mount a second client at $MOUNT2, set
# lustre.fail_loc=0x311 on the local node, write one dd block to
# $DIR2/$tfile, clear fail_loc, force-unmount $DIR2, read the file back
# through $DIR and call error() unless the two record counts match.
# NOTE(review): the mount uses $MOUNT2 but the unmount uses $DIR2 --
# presumably the same path; confirm.
887 test_59() { # bug 10589
888 zconf_mount `hostname` $MOUNT2 || error "Failed to mount $MOUNT2"
889 sysctl -w lustre.fail_loc=0x311
# writes = text before the first '+' on dd's summary line containing "out".
890 writes=`dd if=/dev/zero of=$DIR2/$tfile count=1 2>&1 | awk 'BEGIN { FS="+" } /out/ {print $1}'`
891 sysctl -w lustre.fail_loc=0
893 zconf_umount `hostname` $DIR2 -f
# reads = text before the first '+' on dd's summary line containing "in".
894 reads=`dd if=$DIR/$tfile of=/dev/null 2>&1 | awk 'BEGIN { FS="+" } /in/ {print $1}'`
895 [ $reads -eq $writes ] || error "read" $reads "blocks, must be" $writes
897 run_test 59 "Read cancel race on client eviction"
899 equals_msg `basename $0`: test complete, cleaning up
900 check_and_cleanup_lustre
901 [ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG || true