3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
11 # Bug numbers for the tests excluded through ALWAYS_EXCEPT
12 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT"
14 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
15 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
# Locate the Lustre tree (default: the parent of this script's directory),
# then source the test framework and the configuration file named by CONFIG.
17 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
18 . $LUSTRE/tests/test-framework.sh
20 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
23 require_dsh_mds || exit 0
# Skip when check_versions fails.
# NOTE(review): the remaining lines of this conditional are not visible in
# this listing.
27 if ! check_versions; then
28 skip "It is NOT necessary to test lfsck under interoperation mode"
32 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.3.60) ]] &&
33 skip "Need MDS version at least 2.3.60" && exit 0
# Save the configured MDS/OST sizes and the OST count.
37 SAVED_MDSSIZE=${MDSSIZE}
38 SAVED_OSTSIZE=${OSTSIZE}
39 SAVED_OSTCOUNT=${OSTCOUNT}
40 # Use small MDS and OST sizes to speed up formatting.
41 # Do not make MDSSIZE/OSTSIZE too small: they affect the default journal size.
44 # Many OSTs are not needed; cap the count to reduce format/start/stop overhead.
45 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
47 # build up a clean test environment.
# Extend ALWAYS_EXCEPT with the tests gated on server version (MDS or ost1)
# and on the MDS backend filesystem type.
51 [[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.4.90) ]] &&
52 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2c"
54 [[ $(lustre_version_code ost1) -lt $(version_code 2.5.55) ]] &&
55 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 11 12 13 14 15 16 17 18 19 20 21"
57 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.50) ]] &&
58 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2d 2e 3 22 23 24 25 26 27 28 29 30 31"
60 # DNE does not support striped directory on zfs-based backend yet.
61 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
62 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 31"
# Device names and pre-built command strings shared by the tests below.
# The START_*/STOP_*/SHOW_* strings are expanded unquoted as commands, so
# callers can append extra options (e.g. "$START_NAMESPACE -r").
66 MDT_DEV="${FSNAME}-MDT0000"
67 OST_DEV="${FSNAME}-OST0000"
68 MDT_DEVNAME=$(mdsdevname ${SINGLEMDS//mds/})
69 START_NAMESPACE="do_facet $SINGLEMDS \
70 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
71 START_LAYOUT="do_facet $SINGLEMDS \
72 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
73 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
74 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
75 SHOW_NAMESPACE="do_facet $SINGLEMDS \
76 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
77 SHOW_LAYOUT="do_facet $SINGLEMDS \
78 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
79 SHOW_LAYOUT_ON_OST="do_facet ost1 \
80 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# MDT mount options: the second variant adds "noscrub".
81 MOUNT_OPTS_SCRUB="-o user_xattr"
82 MOUNT_OPTS_NOSCRUB="-o user_xattr,noscrub"
# Populate $DIR/$tdir for the tests: copy the test scripts in, create $ndirs
# "d*" directories and "f*" files, create $nfiles files inside each "d*"
# directory, and create $ndirs "e*" directories.
# When $igif is non-empty, fail_loc 0x1504 (OBD_FAIL_FID_IGIF) is set on the
# MDS before the files are created and reset to 0 after touching "dummy".
# NOTE(review): the enclosing function header, the assignments of
# ndirs/nfiles/igif and the closing fi/done lines are not visible in this
# listing, so the exact nesting of the steps below cannot be confirmed here.
91 echo "preparing... $nfiles * $ndirs files will be created $(date)."
92 if [ ! -z $igif ]; then
93 #define OBD_FAIL_FID_IGIF 0x1504
94 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
97 cp $LUSTRE/tests/*.sh $DIR/$tdir/
98 if [ $ndirs -gt 0 ]; then
99 createmany -d $DIR/$tdir/d $ndirs
100 createmany -m $DIR/$tdir/f $ndirs
101 if [ $nfiles -gt 0 ]; then
102 for ((i = 0; i < $ndirs; i++)); do
103 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
104 /dev/null || error "createmany $nfiles"
107 createmany -d $DIR/$tdir/e $ndirs
110 if [ ! -z $igif ]; then
111 touch $DIR/$tdir/dummy
112 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
115 echo "prepared $(date)."
# Check MDT0 with a read-only e2fsck ("-n") and restart it afterwards.
# Returns immediately when the MDS backend is not ldiskfs. Otherwise it stops
# $SINGLEMDS, runs run_e2fsck on the MDT device, reports error (2) for a
# detected inconsistency, and starts the MDS again with $MOUNT_OPTS_NOSCRUB.
# NOTE(review): the tail of the first run_e2fsck pipeline, the condition
# guarding error (2) and the closing brace are not visible in this listing.
118 run_e2fsck_on_mdt0() {
119 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] && return
121 stop $SINGLEMDS > /dev/null || error "(0) Fail to the stop MDT0"
122 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n" |
124 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n"
125 error "(2) Detected inconsistency on MDT0"
127 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
128 error "(3) Fail to start MDT0"
# Body of the test registered below as "0" (function header not visible in
# this listing). It drives the namespace LFSCK by hand:
#   1. start with reset (-r) while fail_loc 0x1600 / fail_val=3 is set and
#      expect status "scanning-phase1";
#   2. stop it and expect "stopped"; start again and expect "scanning-phase1";
#   3. clear fail_loc/fail_val and wait for "completed";
#   4. expect updated_phase1 to be 0;
#   5. reset and rerun, then expect success_count to have grown by exactly 1;
#   6. stopall must succeed (LU-3649).
134 #define OBD_FAIL_LFSCK_DELAY1 0x1600
135 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
136 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
138 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
140 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
141 [ "$STATUS" == "scanning-phase1" ] ||
142 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
144 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
146 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
147 [ "$STATUS" == "stopped" ] ||
148 error "(6) Expect 'stopped', but got '$STATUS'"
150 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
152 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
153 [ "$STATUS" == "scanning-phase1" ] ||
154 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
156 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
157 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
158 mdd.${MDT_DEV}.lfsck_namespace |
159 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
161 error "(9) unexpected status"
164 local repaired=$($SHOW_NAMESPACE |
165 awk '/^updated_phase1/ { print $2 }')
166 [ $repaired -eq 0 ] ||
167 error "(10) Expect nothing to be repaired, but got: $repaired"
# Record success_count before the second, reset run so the increment can be
# checked afterwards.
169 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
170 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
171 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
172 mdd.${MDT_DEV}.lfsck_namespace |
173 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
175 error "(12) unexpected status"
178 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
179 [ $((scanned1 + 1)) -eq $scanned2 ] ||
180 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
182 echo "stopall, should NOT crash LU-3649"
183 stopall || error "(14) Fail to stopall"
185 run_test 0 "Control LFSCK manually"
# Body of the test registered below as "1a" (function header not visible in
# this listing). Skipped unless the MDS backend is ldiskfs.
# "dummy" is created while fail_loc 0x1501 (OBD_FAIL_FID_INDIR) is set; the
# namespace LFSCK must then complete and report exactly one repair. Finally
# the client is mounted and "ls" must succeed with fail_loc 0x1505
# (OBD_FAIL_FID_LOOKUP) set.
188 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
189 skip "OI Scrub not implemented for ZFS" && return
193 #define OBD_FAIL_FID_INDIR 0x1501
194 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
195 touch $DIR/$tdir/dummy
197 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
199 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
200 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
201 mdd.${MDT_DEV}.lfsck_namespace |
202 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
204 error "(4) unexpected status"
# Prefer the dirent_repaired counter; fall back to updated_phase1 when the
# server does not report it.
207 local repaired=$($SHOW_NAMESPACE |
208 awk '/^dirent_repaired/ { print $2 }')
209 # for interop with old server
210 [ -z "$repaired" ] &&
211 repaired=$($SHOW_NAMESPACE |
212 awk '/^updated_phase1/ { print $2 }')
214 [ $repaired -eq 1 ] ||
215 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
219 mount_client $MOUNT || error "(6) Fail to start client!"
221 #define OBD_FAIL_FID_LOOKUP 0x1505
222 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
223 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
225 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
227 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
# Body of the test registered below as "1b" (function header not visible in
# this listing). Skipped unless the MDS backend is ldiskfs.
# "dummy" is created while fail_loc 0x1502 (OBD_FAIL_FID_INLMA) is set; the
# namespace LFSCK then runs with fail_loc 0x1506 (OBD_FAIL_FID_NOLMA) set and
# must report exactly one repair. Finally the client is mounted and "stat"
# on dummy must succeed with fail_loc 0x1505 (OBD_FAIL_FID_LOOKUP) set.
231 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
232 skip "OI Scrub not implemented for ZFS" && return
236 #define OBD_FAIL_FID_INLMA 0x1502
237 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
238 touch $DIR/$tdir/dummy
240 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
242 #define OBD_FAIL_FID_NOLMA 0x1506
243 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
244 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
245 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
246 mdd.${MDT_DEV}.lfsck_namespace |
247 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
249 error "(4) unexpected status"
# Prefer the dirent_repaired counter; fall back to updated_phase1 when the
# server does not report it.
252 local repaired=$($SHOW_NAMESPACE |
253 awk '/^dirent_repaired/ { print $2 }')
254 # for interop with old server
255 [ -z "$repaired" ] &&
256 repaired=$($SHOW_NAMESPACE |
257 awk '/^updated_phase1/ { print $2 }')
259 [ $repaired -eq 1 ] ||
260 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
262 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
265 mount_client $MOUNT || error "(6) Fail to start client!"
267 #define OBD_FAIL_FID_LOOKUP 0x1505
268 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
269 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
271 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
273 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
# Body of the test registered below as "2a" (function header not visible in
# this listing).
# "dummy" is created while fail_loc 0x1603 (OBD_FAIL_LFSCK_LINKEA_CRASH) is
# set; the namespace LFSCK must complete and report exactly one linkEA
# repair. Afterwards the file must show "Links: 1" and its FID must resolve
# back to the same path through path2fid/fid2path.
278 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
279 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
280 touch $DIR/$tdir/dummy
282 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
284 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
285 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
286 mdd.${MDT_DEV}.lfsck_namespace |
287 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
289 error "(4) unexpected status"
# Prefer the linkea_repaired counter; fall back to updated_phase2 when the
# server does not report it.
292 local repaired=$($SHOW_NAMESPACE |
293 awk '/^linkea_repaired/ { print $2 }')
294 # for interop with old server
295 [ -z "$repaired" ] &&
296 repaired=$($SHOW_NAMESPACE |
297 awk '/^updated_phase2/ { print $2 }')
299 [ $repaired -eq 1 ] ||
300 error "(5) Fail to repair crashed linkEA: $repaired"
304 mount_client $MOUNT || error "(6) Fail to start client!"
306 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
307 error "(7) Fail to stat $DIR/$tdir/dummy"
# Round trip: path -> FID -> path must give back the original name.
309 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
310 local dummyname=$($LFS fid2path $DIR $dummyfid)
311 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
312 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
314 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
# Body of the test registered below as "2b" (function header not visible in
# this listing).
# "dummy" is created while fail_loc 0x1604 (OBD_FAIL_LFSCK_LINKEA_MORE) is
# set; the namespace LFSCK must complete with updated_phase2 equal to 1.
# Afterwards the file must show "Links: 1" and its FID must resolve back to
# the same path through path2fid/fid2path.
320 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
321 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
322 touch $DIR/$tdir/dummy
324 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
326 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
327 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
328 mdd.${MDT_DEV}.lfsck_namespace |
329 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
331 error "(4) unexpected status"
334 local repaired=$($SHOW_NAMESPACE |
335 awk '/^updated_phase2/ { print $2 }')
336 [ $repaired -eq 1 ] ||
337 error "(5) Fail to repair crashed linkEA: $repaired"
341 mount_client $MOUNT || error "(6) Fail to start client!"
343 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
344 error "(7) Fail to stat $DIR/$tdir/dummy"
# Round trip: path -> FID -> path must give back the original name.
346 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
347 local dummyname=$($LFS fid2path $DIR $dummyfid)
348 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
349 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
351 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
# Body of the test registered below as "2c" (function header not visible in
# this listing).
# "dummy" is created while fail_loc 0x1605 (OBD_FAIL_LFSCK_LINKEA_MORE2) is
# set; the namespace LFSCK must complete with updated_phase2 equal to 1.
# Afterwards the file must show "Links: 1" and its FID must resolve back to
# the same path through path2fid/fid2path.
357 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
358 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
359 touch $DIR/$tdir/dummy
361 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
363 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
364 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
365 mdd.${MDT_DEV}.lfsck_namespace |
366 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
368 error "(4) unexpected status"
371 local repaired=$($SHOW_NAMESPACE |
372 awk '/^updated_phase2/ { print $2 }')
373 [ $repaired -eq 1 ] ||
374 error "(5) Fail to repair crashed linkEA: $repaired"
378 mount_client $MOUNT || error "(6) Fail to start client!"
380 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
381 error "(7) Fail to stat $DIR/$tdir/dummy"
# Round trip: path -> FID -> path must give back the original name.
383 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
384 local dummyname=$($LFS fid2path $DIR $dummyfid)
385 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
386 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
388 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
# Body of the test registered below as "2d" (function header not visible in
# this listing).
# "dummy" is created while fail_loc 0x161d (OBD_FAIL_LFSCK_NO_LINKEA) is set;
# the namespace LFSCK must complete with linkea_repaired equal to 1.
# Afterwards the file must show "Links: 1" and its FID must resolve back to
# the same path through path2fid/fid2path.
394 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
395 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
396 touch $DIR/$tdir/dummy
398 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
400 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
401 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
402 mdd.${MDT_DEV}.lfsck_namespace |
403 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
405 error "(4) unexpected status"
408 local repaired=$($SHOW_NAMESPACE |
409 awk '/^linkea_repaired/ { print $2 }')
410 [ $repaired -eq 1 ] ||
411 error "(5) Fail to repair crashed linkEA: $repaired"
415 mount_client $MOUNT || error "(6) Fail to start client!"
417 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
418 error "(7) Fail to stat $DIR/$tdir/dummy"
# Round trip: path -> FID -> path must give back the original name.
420 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
421 local dummyname=$($LFS fid2path $DIR $dummyfid)
422 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
423 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
425 run_test 2d "LFSCK can recover the missing linkEA entry"
# Body of the test registered below as "2e" (function header not visible in
# this listing). Skipped when fewer than two MDSes are configured.
# d0 is created on MDT1, then d0/d1 is created on MDT0 while fail_loc 0x1603
# (OBD_FAIL_LFSCK_LINKEA_CRASH) is set. The namespace LFSCK is started with
# "-r -A" and must complete with linkea_repaired equal to 1; the FID of
# d0/d1 must then resolve back to the same path.
429 [ $MDSCOUNT -lt 2 ] &&
430 skip "We need at least 2 MDSes for this test" && return
434 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
436 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
437 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
438 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
439 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
441 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
442 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
443 mdd.${MDT_DEV}.lfsck_namespace |
444 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
446 error "(4) unexpected status"
449 local repaired=$($SHOW_NAMESPACE |
450 awk '/^linkea_repaired/ { print $2 }')
451 [ $repaired -eq 1 ] ||
452 error "(5) Fail to repair crashed linkEA: $repaired"
# Round trip: path -> FID -> path must give back the original name.
454 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
455 local name=$($LFS fid2path $DIR $fid)
456 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
457 error "(6) Fail to repair linkEA: $fid $name"
459 run_test 2e "namespace LFSCK can verify remote object linkEA"
# Body of the test registered below as "3" (function header not visible in
# this listing).
# Creates hard links to d0/f0 and d0/f1 under "dummy", then two more hard
# links under "edir": w0 while fail_loc 0x1603 (OBD_FAIL_LFSCK_LINKEA_CRASH)
# is set and w1 while fail_loc 0x1604 (OBD_FAIL_LFSCK_LINKEA_MORE) is set.
# After the namespace LFSCK completes, checked_phase2 must be at least 4 and
# multiple_linked_repaired at least 2.
# NOTE(review): d0/f0 and d0/f1 must already exist; the step creating them is
# not visible in this listing.
465 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
466 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
467 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
469 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
470 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
471 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
473 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
474 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
475 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
477 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
478 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
479 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
481 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
483 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
484 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
485 mdd.${MDT_DEV}.lfsck_namespace |
486 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
488 error "(10) unexpected status"
491 local checked=$($SHOW_NAMESPACE |
492 awk '/^checked_phase2/ { print $2 }')
493 [ $checked -ge 4 ] ||
494 error "(11) Fail to check multiple-linked object: $checked"
496 local repaired=$($SHOW_NAMESPACE |
497 awk '/^multiple_linked_repaired/ { print $2 }')
498 [ $repaired -ge 2 ] ||
499 error "(12) Fail to repair multiple-linked object: $repaired"
501 run_test 3 "LFSCK can verify multiple-linked objects"
# Body of the test registered below as "4" (function header not visible in
# this listing). Skipped unless the MDS backend is ldiskfs.
# Sequence: unmount the client, stop the MDS, run mds_backup_restore, restart
# the MDS with "noscrub", then start the namespace LFSCK with fail_loc 0x1601
# (OBD_FAIL_LFSCK_DELAY2) / fail_val=1 set. The flags must become
# "inconsistent" while the status is "scanning-phase1"; once the fail_loc is
# cleared the run must complete with empty flags and at least 9 repairs.
# Finally "ls" must succeed with fail_loc 0x1505 (OBD_FAIL_FID_LOOKUP) set.
505 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
506 skip "OI Scrub not implemented for ZFS" && return
509 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
510 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
512 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
513 echo "start $SINGLEMDS with disabling OI scrub"
514 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
515 error "(2) Fail to start MDS!"
517 #define OBD_FAIL_LFSCK_DELAY2 0x1601
518 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
519 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
520 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
521 mdd.${MDT_DEV}.lfsck_namespace |
522 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
524 error "(5) unexpected status"
527 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
528 [ "$STATUS" == "scanning-phase1" ] ||
529 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
531 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
532 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
533 mdd.${MDT_DEV}.lfsck_namespace |
534 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
536 error "(7) unexpected status"
# NOTE(review): FLAGS is assigned without "local" here, unlike STATUS and
# repaired, so it is not scoped to this test.
539 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
540 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
# Prefer the dirent_repaired counter; fall back to updated_phase1 when the
# server does not report it.
542 local repaired=$($SHOW_NAMESPACE |
543 awk '/^dirent_repaired/ { print $2 }')
544 # for interop with old server
545 [ -z "$repaired" ] &&
546 repaired=$($SHOW_NAMESPACE |
547 awk '/^updated_phase1/ { print $2 }')
549 [ $repaired -ge 9 ] ||
550 error "(9) Fail to re-generate FID-in-dirent: $repaired"
554 mount_client $MOUNT || error "(10) Fail to start client!"
556 #define OBD_FAIL_FID_LOOKUP 0x1505
557 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
558 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
559 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
561 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
# Body of the test registered below as "5" (function header not visible in
# this listing). Skipped unless the MDS backend is ldiskfs.
# Same flow as the backup/restore test registered as "4", with these
# differences: mds_backup_restore gets an extra argument "1", the expected
# flags while scanning are "inconsistent,upgrade", and at least 2 repairs are
# required. Afterwards both "stat dummy" and "ls" must succeed with fail_loc
# 0x1505 (OBD_FAIL_FID_LOOKUP) set, and the FID of dummy must resolve back to
# its path.
565 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
566 skip "OI Scrub not implemented for ZFS" && return
569 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
570 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
572 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
573 echo "start $SINGLEMDS with disabling OI scrub"
574 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
575 error "(2) Fail to start MDS!"
577 #define OBD_FAIL_LFSCK_DELAY2 0x1601
578 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
579 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
580 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
581 mdd.${MDT_DEV}.lfsck_namespace |
582 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
584 error "(5) unexpected status"
587 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
588 [ "$STATUS" == "scanning-phase1" ] ||
589 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
591 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
592 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
593 mdd.${MDT_DEV}.lfsck_namespace |
594 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
596 error "(7) unexpected status"
# NOTE(review): FLAGS is assigned without "local" here, unlike STATUS and
# repaired, so it is not scoped to this test.
599 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
600 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
# Prefer the dirent_repaired counter; fall back to updated_phase1 when the
# server does not report it.
602 local repaired=$($SHOW_NAMESPACE |
603 awk '/^dirent_repaired/ { print $2 }')
604 # for interop with old server
605 [ -z "$repaired" ] &&
606 repaired=$($SHOW_NAMESPACE |
607 awk '/^updated_phase1/ { print $2 }')
609 [ $repaired -ge 2 ] ||
610 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
614 mount_client $MOUNT || error "(10) Fail to start client!"
616 #define OBD_FAIL_FID_LOOKUP 0x1505
617 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
618 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
620 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
622 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Round trip: path -> FID -> path must give back the original name.
623 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
624 local dummyname=$($LFS fid2path $DIR $dummyfid)
625 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
626 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
628 run_test 5 "LFSCK can handle IGIF object upgrading"
# Body of the test registered below as "6a" (function header not visible in
# this listing).
# Starts the namespace LFSCK with fail_loc 0x1600 (OBD_FAIL_LFSCK_DELAY1) set,
# then sets fail_loc 0x80001608 (OBD_FAIL_LFSCK_FATAL1) and waits for status
# "failed". POS0 is read from last_checkpoint_position. After a restart
# without reset, POS1 is read from latest_start_position and POS0 must be
# smaller than POS1. The run must then complete once fail_loc is cleared.
# NOTE(review): the sleep mentioned in the comment below and the tails of the
# two position pipelines are not visible in this listing.
633 #define OBD_FAIL_LFSCK_DELAY1 0x1600
634 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
635 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
637 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
638 [ "$STATUS" == "scanning-phase1" ] ||
639 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
641 # Sleep 3 sec to guarantee at least one object processed by LFSCK
643 # Fail the LFSCK to guarantee there is at least one checkpoint
644 #define OBD_FAIL_LFSCK_FATAL1 0x1608
645 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
646 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
647 mdd.${MDT_DEV}.lfsck_namespace |
648 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
650 error "(4) unexpected status"
653 local POS0=$($SHOW_NAMESPACE |
654 awk '/^last_checkpoint_position/ { print $2 }' |
657 #define OBD_FAIL_LFSCK_DELAY1 0x1600
658 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
659 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
661 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
662 [ "$STATUS" == "scanning-phase1" ] ||
663 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
665 local POS1=$($SHOW_NAMESPACE |
666 awk '/^latest_start_position/ { print $2 }' |
668 [[ $POS0 -lt $POS1 ]] ||
669 error "(7) Expect larger than: $POS0, but got $POS1"
671 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
672 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
673 mdd.${MDT_DEV}.lfsck_namespace |
674 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
676 error "(8) unexpected status"
679 run_test 6a "LFSCK resumes from last checkpoint (1)"
# Body of the test registered below as "6b" (function header not visible in
# this listing).
# Same idea as the test registered as "6a", using fail_loc 0x1601
# (OBD_FAIL_LFSCK_DELAY2) and 0x80001609 (OBD_FAIL_LFSCK_FATAL2). Two fields
# are read from each position line: field 2 (O_POS*) and field 4 (D_POS*).
# When either D_POS value is "N/A" the O_POS values are compared; the D_POS
# comparison (7.2) follows.
# NOTE(review): the line separating the two comparison branches, the sleep
# mentioned below and the tails of the O_POS pipelines are not visible in
# this listing.
684 #define OBD_FAIL_LFSCK_DELAY2 0x1601
685 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
686 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
688 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
689 [ "$STATUS" == "scanning-phase1" ] ||
690 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
692 # Sleep 5 sec to guarantee that we are in the directory scanning
694 # Fail the LFSCK to guarantee there is at least one checkpoint
695 #define OBD_FAIL_LFSCK_FATAL2 0x1609
696 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
697 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
698 mdd.${MDT_DEV}.lfsck_namespace |
699 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
701 error "(4) unexpected status"
704 local O_POS0=$($SHOW_NAMESPACE |
705 awk '/^last_checkpoint_position/ { print $2 }' |
708 local D_POS0=$($SHOW_NAMESPACE |
709 awk '/^last_checkpoint_position/ { print $4 }')
711 #define OBD_FAIL_LFSCK_DELAY2 0x1601
712 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
713 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
715 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
716 [ "$STATUS" == "scanning-phase1" ] ||
717 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
719 local O_POS1=$($SHOW_NAMESPACE |
720 awk '/^latest_start_position/ { print $2 }' |
722 local D_POS1=$($SHOW_NAMESPACE |
723 awk '/^latest_start_position/ { print $4 }')
725 if [ "$D_POS0" == "N/A" -o "$D_POS1" == "N/A" ]; then
726 [[ $O_POS0 -lt $O_POS1 ]] ||
727 error "(7.1) $O_POS1 is not larger than $O_POS0"
729 [[ $D_POS0 -lt $D_POS1 ]] ||
730 error "(7.2) $D_POS1 is not larger than $D_POS0"
733 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
734 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
735 mdd.${MDT_DEV}.lfsck_namespace |
736 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
738 error "(8) unexpected status"
741 run_test 6b "LFSCK resumes from last checkpoint (2)"
# Body of the test registered below as "7a" (function header not visible in
# this listing).
# Starts the namespace LFSCK with fail_loc 0x1601 (OBD_FAIL_LFSCK_DELAY2) set
# and expects "scanning-phase1". The MDS is then stopped without stopping
# LFSCK, fail_loc is cleared and the MDS is started with $MOUNT_OPTS_SCRUB.
# The status must reach "completed" without an explicit lfsck_start.
748 #define OBD_FAIL_LFSCK_DELAY2 0x1601
749 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
750 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
752 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
753 [ "$STATUS" == "scanning-phase1" ] ||
754 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
756 # Sleep 3 sec to guarantee at least one object processed by LFSCK
758 echo "stop $SINGLEMDS"
759 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop MDS!"
761 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
762 echo "start $SINGLEMDS"
763 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
764 error "(5) Fail to start MDS!"
766 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
767 mdd.${MDT_DEV}.lfsck_namespace |
768 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
770 error "(6) unexpected status"
773 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
# Body of the test registered below as "7b" (function header not visible in
# this listing).
# Creates dummy0..dummy19 while fail_loc 0x1604 (OBD_FAIL_LFSCK_LINKEA_MORE)
# is set, then starts the namespace LFSCK with fail_loc 0x1602
# (OBD_FAIL_LFSCK_DELAY3) set and waits for "scanning-phase2". The MDS is
# stopped without stopping LFSCK, fail_loc is cleared and the MDS is started
# with $MOUNT_OPTS_SCRUB. The status must reach "completed" without an
# explicit lfsck_start.
779 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
780 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
781 for ((i = 0; i < 20; i++)); do
782 touch $DIR/$tdir/dummy${i}
785 #define OBD_FAIL_LFSCK_DELAY3 0x1602
786 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
787 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
788 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
789 mdd.${MDT_DEV}.lfsck_namespace |
790 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
792 error "(4) unexpected status"
796 echo "stop $SINGLEMDS"
797 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop MDS!"
799 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
800 echo "start $SINGLEMDS"
801 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
802 error "(6) Fail to start MDS!"
804 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
805 mdd.${MDT_DEV}.lfsck_namespace |
806 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
808 error "(7) unexpected status"
811 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
# Body of the test registered below as "8" (function header not visible in
# this listing). After formatall it walks the namespace LFSCK through the
# statuses asserted below, in this order:
#   init -> scanning-phase1 -> stopped -> scanning-phase1 -> failed ->
#   scanning-phase1 -> crashed (after an MDS restart) -> scanning-phase1 ->
#   paused (after an MDS restart) -> scanning-phase2 -> completed
# The flags must be "scanned-once,inconsistent" during scanning-phase2 and
# empty once completed.
# NOTE(review): the setup between formatall and the first status check, the
# initialisation/increment of $timer and the loop terminators are not visible
# in this listing.
816 formatall > /dev/null
822 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
823 [ "$STATUS" == "init" ] ||
824 error "(2) Expect 'init', but got '$STATUS'"
826 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
827 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
828 mkdir $DIR/$tdir/crashed
830 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
831 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
832 for ((i = 0; i < 5; i++)); do
833 touch $DIR/$tdir/dummy${i}
836 umount_client $MOUNT || error "(3) Fail to stop client!"
838 #define OBD_FAIL_LFSCK_DELAY2 0x1601
839 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
840 $START_NAMESPACE || error "(4) Fail to start LFSCK for namespace!"
842 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
843 [ "$STATUS" == "scanning-phase1" ] ||
844 error "(5) Expect 'scanning-phase1', but got '$STATUS'"
846 $STOP_LFSCK || error "(6) Fail to stop LFSCK!"
848 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
849 [ "$STATUS" == "stopped" ] ||
850 error "(7) Expect 'stopped', but got '$STATUS'"
852 $START_NAMESPACE || error "(8) Fail to start LFSCK for namespace!"
854 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
855 [ "$STATUS" == "scanning-phase1" ] ||
856 error "(9) Expect 'scanning-phase1', but got '$STATUS'"
858 #define OBD_FAIL_LFSCK_FATAL2 0x1609
859 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
860 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
861 mdd.${MDT_DEV}.lfsck_namespace |
862 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
864 error "(10) unexpected status"
867 #define OBD_FAIL_LFSCK_DELAY1 0x1600
868 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
869 $START_NAMESPACE || error "(11) Fail to start LFSCK for namespace!"
871 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
872 [ "$STATUS" == "scanning-phase1" ] ||
873 error "(12) Expect 'scanning-phase1', but got '$STATUS'"
875 #define OBD_FAIL_LFSCK_CRASH 0x160a
876 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
879 echo "stop $SINGLEMDS"
880 stop $SINGLEMDS > /dev/null || error "(13) Fail to stop MDS!"
882 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
883 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
885 echo "start $SINGLEMDS"
886 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
887 error "(14) Fail to start MDS!"
# Poll the MDT recovery_status until it is no longer "RECOVERING", bounded by
# max_recovery_time.
889 local timeout=$(max_recovery_time)
892 while [ $timer -lt $timeout ]; do
893 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
894 mdt.${MDT_DEV}.recovery_status |
895 awk '/^status/ { print \\\$2 }'")
896 [ "$STATUS" != "RECOVERING" ] && break;
901 [ $timer != $timeout ] ||
902 error "(14.1) recovery timeout"
904 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
905 [ "$STATUS" == "crashed" ] ||
906 error "(15) Expect 'crashed', but got '$STATUS'"
908 #define OBD_FAIL_LFSCK_DELAY2 0x1601
909 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
910 $START_NAMESPACE || error "(16) Fail to start LFSCK for namespace!"
912 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
913 [ "$STATUS" == "scanning-phase1" ] ||
914 error "(17) Expect 'scanning-phase1', but got '$STATUS'"
916 echo "stop $SINGLEMDS"
917 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop MDS!"
919 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
920 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
922 echo "start $SINGLEMDS"
923 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
924 error "(19) Fail to start MDS!"
# Second recovery wait, same polling scheme as above.
927 while [ $timer -lt $timeout ]; do
928 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
929 mdt.${MDT_DEV}.recovery_status |
930 awk '/^status/ { print \\\$2 }'")
931 [ "$STATUS" != "RECOVERING" ] && break;
936 [ $timer != $timeout ] ||
937 error "(19.1) recovery timeout"
939 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
940 [ "$STATUS" == "paused" ] ||
941 error "(20) Expect 'paused', but got '$STATUS'"
943 #define OBD_FAIL_LFSCK_DELAY3 0x1602
944 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
946 $START_NAMESPACE || error "(21) Fail to start LFSCK for namespace!"
947 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
948 mdd.${MDT_DEV}.lfsck_namespace |
949 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
951 error "(22) unexpected status"
954 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
955 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
956 error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
958 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
959 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
960 mdd.${MDT_DEV}.lfsck_namespace |
961 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
963 error "(24) unexpected status"
966 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
967 [ -z "$FLAGS" ] || error "(25) Expect empty flags, but got '$FLAGS'"
969 run_test 8 "LFSCK state machine"
# Body of the test registered below as "9a" (function header not visible in
# this listing).
# Skipped when /proc/cpuinfo has no "processor ... : 1" entry (the skip
# message calls this a UP system) or when $server_version is below 2.7.50.
# Creates 5000 files, starts the layout LFSCK with speed limit BASE_SPEED1
# and checks average_speed_phase1 against an upper bound with a 20% margin.
# The limit is then raised to BASE_SPEED2 through lfsck_speed_limit and the
# average is checked against lower (80%) and upper (120%) bounds. Finally the
# limit is set to 0 and the run must reach "completed".
# NOTE(review): the assignments of server_version, RUN_TIME1, RUN_TIME2 and
# TIME_DIFF, the sleeps between measurements and several closing lines are
# not visible in this listing.
972 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
973 skip "Testing on UP system, the speed may be inaccurate."
977 [[ $server_version -ge $(version_code 2.7.50) ]] ||
978 { skip "Need MDS version >= 2.7.50"; return; }
981 $LFS mkdir -i 0 $DIR/$tdir/lfsck || error "(1) Fail to mkdir lfsck"
982 $LFS setstripe -c 1 -i -1 $DIR/$tdir/lfsck
983 createmany -o $DIR/$tdir/lfsck/f 5000
985 local BASE_SPEED1=100
987 $START_LAYOUT -r -s $BASE_SPEED1 || error "(2) Fail to start LFSCK!"
990 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
991 [ "$STATUS" == "scanning-phase1" ] ||
992 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
994 local SPEED=$($SHOW_LAYOUT |
995 awk '/^average_speed_phase1/ { print $2 }')
997 # There may be time error, normally it should be less than 2 seconds.
998 # We allow another 20% schedule error.
1000 # MAX_MARGIN = 1.2 = 12 / 10
1001 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1002 RUN_TIME1 * 12 / 10))
1003 [ $SPEED -lt $MAX_SPEED ] ||
1004 error "(4) Got speed $SPEED, expected less than $MAX_SPEED"
1006 # adjust speed limit
1007 local BASE_SPEED2=300
1009 do_facet $SINGLEMDS \
1010 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1013 SPEED=$($SHOW_LAYOUT | awk '/^average_speed_phase1/ { print $2 }')
1014 # MIN_MARGIN = 0.8 = 8 / 10
1015 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1016 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1017 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
# A speed below the lower bound is reported through error_ignore (LU-5624)
# when the MDS backend is not ldiskfs.
1018 [ $SPEED -gt $MIN_SPEED ] || {
1019 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1020 error_ignore LU-5624 \
1021 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
1024 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
1028 # MAX_MARGIN = 1.2 = 12 / 10
1029 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1030 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1031 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
1032 [ $SPEED -lt $MAX_SPEED ] ||
1033 error "(6) Got speed $SPEED, expected less than $MAX_SPEED"
1035 do_facet $SINGLEMDS \
1036 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1038 wait_update_facet $SINGLEMDS \
1039 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout |
1040 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
1041 error "(7) Failed to get expected 'completed'"
1043 run_test 9a "LFSCK speed control (1)"
# Test 9b, "LFSCK speed control (2)": checks that the namespace LFSCK
# phase-2 scan stays within the configured lfsck_speed_limit, both for the
# initial limit (BASE_SPEED1) and after raising it (BASE_SPEED2).
# NOTE(review): RUN_TIME1, RUN_TIME2, TIME_DIFF, server_version and
# SHOW_NAMESPACE are not assigned in the lines visible here -- confirm where
# they are set.
#
# Skip when /proc/cpuinfo has no "processor ...: 1" entry (single CPU).
1046 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1047 skip "Testing on UP system, the speed may be inaccurate."
1051 [[ $server_version -ge $(version_code 2.7.50) ]] ||
1052 { skip "Need MDS version >= 2.7.50"; return; }
# Populate $tdir with createmany while fail_loc 0x1604 is set on the MDS.
1056 echo "Preparing another 50 * 50 files (with error) at $(date)."
1057 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1058 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1059 createmany -d $DIR/$tdir/d 50
1060 createmany -m $DIR/$tdir/f 50
1061 for ((i = 0; i < 50; i++)); do
1062 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
# First run: reset (-r) with fail_loc 0x160c set and wait for the namespace
# LFSCK status to become 'stopped' (wait_update_facet limit: 10).
1065 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
1066 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1067 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
1068 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1069 mdd.${MDT_DEV}.lfsck_namespace |
1070 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1072 error "(5) unexpected status"
1075 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1076 echo "Prepared at $(date)."
# Second run: start again (no -r) with -s $BASE_SPEED1; the status sampled
# afterwards must be 'scanning-phase2'.
1078 local BASE_SPEED1=50
1080 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
1083 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1084 [ "$STATUS" == "scanning-phase2" ] ||
1085 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
1087 local SPEED=$($SHOW_NAMESPACE |
1088 awk '/^average_speed_phase2/ { print $2 }')
1089 # There may be time error, normally it should be less than 2 seconds.
1090 # We allow another 20% schedule error.
1092 # MAX_MARGIN = 1.2 = 12 / 10
1093 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1094 RUN_TIME1 * 12 / 10))
1095 [ $SPEED -lt $MAX_SPEED ] ||
1096 error "(8) Got speed $SPEED, expected less than $MAX_SPEED"
1098 # adjust speed limit
1099 local BASE_SPEED2=150
1101 do_facet $SINGLEMDS \
1102 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
# Re-sample the average speed; it must now lie between a lower bound (both
# periods shortened by TIME_DIFF, scaled by 0.8) and an upper bound (both
# periods lengthened by TIME_DIFF, scaled by 1.2). Integer arithmetic only.
1105 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
1106 # MIN_MARGIN = 0.8 = 8 / 10
1107 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1108 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1109 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
1110 [ $SPEED -gt $MIN_SPEED ] || {
# On a non-ldiskfs MDS backend the shortfall is reported via error_ignore
# (LU-5624) instead of a plain error.
1111 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1112 error_ignore LU-5624 \
1113 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1116 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
1120 # MAX_MARGIN = 1.2 = 12 / 10
1121 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1122 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1123 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
1124 [ $SPEED -lt $MAX_SPEED ] ||
1125 error "(10) Got speed $SPEED, expected less than $MAX_SPEED"
# Set the speed limit to 0 (presumably "no limit" -- confirm) and wait for
# the scan to report 'completed'.
1127 do_facet $SINGLEMDS \
1128 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1129 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1130 mdd.${MDT_DEV}.lfsck_namespace |
1131 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1133 error "(11) unexpected status"
1136 run_test 9b "LFSCK speed control (2)"
# Test 10, "System is available during LFSCK scanning": starts a throttled
# namespace LFSCK and verifies that ordinary namespace operations (ls, touch,
# mkdir, unlink, rm, mv, ln, ln -s) succeed while the scan is still in
# 'scanning-phase1'.
# NOTE(review): SHOW_NAMESPACE is not assigned in the lines visible here.
#
# Only runs when the MDS backend is ldiskfs.
1140 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
1141 skip "lookup(..)/linkea on ZFS issue" && return
1145 echo "Preparing more files with error at $(date)."
# Even-numbered d<i>/f<i> entries are created with fail_loc 0x1603 set ...
1146 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1147 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
1149 for ((i = 0; i < 1000; i = $((i+2)))); do
1150 mkdir -p $DIR/$tdir/d${i}
1151 touch $DIR/$tdir/f${i}
1152 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
# ... and odd-numbered ones with fail_loc 0x1604 set.
1155 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1156 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1158 for ((i = 1; i < 1000; i = $((i+2)))); do
1159 mkdir -p $DIR/$tdir/d${i}
1160 touch $DIR/$tdir/f${i}
1161 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1164 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1165 echo "Prepared at $(date)."
1167 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
# Remount the client before starting the scan.
1169 umount_client $MOUNT
1170 mount_client $MOUNT || error "(3) Fail to start client!"
# Start the scan from scratch (-r) with -s 100 so it is still running while
# the operations below are issued.
1172 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1175 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1176 [ "$STATUS" == "scanning-phase1" ] ||
1177 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Namespace operations that must all succeed during the scan.
1179 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1181 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1183 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1185 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1187 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1189 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1191 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1193 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1194 error "(14) Fail to softlink!"
# The scan must still be in phase 1 after the operations.
1196 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1197 [ "$STATUS" == "scanning-phase1" ] ||
1198 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
# Set the speed limit to 0 (presumably "no limit" -- confirm) and wait for
# 'completed'.
1200 do_facet $SINGLEMDS \
1201 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1202 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1203 mdd.${MDT_DEV}.lfsck_namespace |
1204 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1206 error "(16) unexpected status"
1209 run_test 10 "System is available during LFSCK scanning"
# Remove the LAST_ID file (and the d0/0 entry) under O/<idx> on the backing
# filesystem of OST <ost>, by mounting it locally, deleting the files on the
# OST node via do_facet, and unmounting it again.
# Returns: 1 if the local mount fails, 2 if the unmount fails.
# NOTE(review): the assignments of 'ost' and 'idx' are not visible here; the
# visible caller passes two arguments ("ost_remove_lastid 1 0"), presumably
# $1 = OST number and $2 = sequence/index directory -- confirm.
1212 ost_remove_lastid() {
1215 local rcmd="do_facet ost${ost}"
1217 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1219 # step 1: local mount
1220 mount_fstype ost${ost} || return 1
1221 # step 2: remove the specified LAST_ID
1222 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
1224 unmount_fstype ost${ost} || return 2
# Test 11a, "LFSCK can rebuild lost last_id": deletes LAST_ID on ost1, then
# runs the layout LFSCK on that OST and checks that the 'crashed_lastid' flag
# shows up during the scan and is gone once the scan has completed.
# NOTE(review): SHOW_LAYOUT_ON_OST is not assigned in the lines visible here.
1228 check_mount_and_prep
# Single-stripe files placed on OST index 0.
1229 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1230 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
1235 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
1237 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1238 error "(2) Fail to start ost1"
# fail_loc 0x160e with fail_val=3 is set before the scan so that the
# 'crashed_lastid' flag can be observed below (presumably by delaying the
# LFSCK -- confirm).
1240 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1241 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1243 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1244 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
1246 wait_update_facet ost1 "$LCTL get_param -n \
1247 obdfilter.${OST_DEV}.lfsck_layout |
1248 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1250 error "(5) unexpected status"
# Clear the failure injection and let the scan finish.
1253 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1255 wait_update_facet ost1 "$LCTL get_param -n \
1256 obdfilter.${OST_DEV}.lfsck_layout |
1257 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1259 error "(6) unexpected status"
# After completion the 'flags' field must be empty.
1262 echo "the LAST_ID(s) should have been rebuilt"
1263 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1264 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
1266 run_test 11a "LFSCK can rebuild lost last_id"
# Test 11b, "LFSCK can rebuild crashed last_id": creates objects on ost1
# while the on-disk LAST_ID update is skipped (fail_loc 0x160d), confirms
# after a restart that the reported last_id is smaller than before, then runs
# the layout LFSCK on the OST and checks that last_id is back to the original
# value after another restart.
1269 check_mount_and_prep
1270 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1272 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1273 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1274 do_facet ost1 $LCTL set_param fail_loc=0x160d
# Create 32 more files than precreated_ost_obj_count reports.
1276 local count=$(precreated_ost_obj_count 0 0)
1278 createmany -o $DIR/$tdir/f $((count + 32))
# lastid1: the last_id value ost1 reports for the sequence read from the
# MDT's osp prealloc_last_seq, taken before the restart.
1280 local proc_path="${FSNAME}-OST0000-osc-MDT0000"
1281 local seq=$(do_facet mds1 $LCTL get_param -n \
1282 osp.${proc_path}.prealloc_last_seq)
1283 local lastid1=$(do_facet ost1 "lctl get_param -n \
1284 obdfilter.${ost1_svc}.last_id" | grep $seq |
1285 awk -F: '{ print $2 }')
1287 umount_client $MOUNT
1288 stop ost1 || error "(1) Fail to stop ost1"
1290 #define OBD_FAIL_OST_ENOSPC 0x215
1291 do_facet ost1 $LCTL set_param fail_loc=0x215
1293 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1294 error "(2) Fail to start ost1"
# Poll (up to 60 iterations) until ost1 reports a last_id for $seq again.
1296 for ((i = 0; i < 60; i++)); do
1297 lastid2=$(do_facet ost1 "lctl get_param -n \
1298 obdfilter.${ost1_svc}.last_id" | grep $seq |
1299 awk -F: '{ print $2 }')
1300 [ ! -z $lastid2 ] && break;
1304 echo "the on-disk LAST_ID should be smaller than the expected one"
1305 [ $lastid1 -gt $lastid2 ] ||
1306 error "(4) expect lastid1 [ $lastid1 ] > lastid2 [ $lastid2 ]"
1308 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1309 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1311 wait_update_facet ost1 "$LCTL get_param -n \
1312 obdfilter.${OST_DEV}.lfsck_layout |
1313 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1315 error "(6) unexpected status"
# Restart ost1 once more so that last_id is read again after the repair.
1318 stop ost1 || error "(7) Fail to stop ost1"
1320 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1321 error "(8) Fail to start ost1"
1323 echo "the on-disk LAST_ID should have been rebuilt"
1324 wait_update_facet ost1 "$LCTL get_param -n \
1325 obdfilter.${ost1_svc}.last_id | grep $seq |
1326 awk -F: '{ print \\\$2 }'" "$lastid1" 60 || {
1327 do_facet ost1 $LCTL get_param -n \
1328 obdfilter.${ost1_svc}.last_id
1329 error "(9) expect lastid1 $seq:$lastid1"
1332 do_facet ost1 $LCTL set_param fail_loc=0
1333 stopall || error "(10) Fail to stopall"
1335 run_test 11b "LFSCK can rebuild crashed last_id"
# Test 12, "single command to trigger LFSCK on all devices": uses a single
# lfsck_start / lfsck_stop invocation on mds1 with -A and checks the status
# reported by every MDT (and, for the layout scan, every OST). Done first for
# the namespace LFSCK, then for the layout LFSCK.
# Requires at least 2 MDSes.
1338 [ $MDSCOUNT -lt 2 ] &&
1339 skip "We need at least 2 MDSes for test_12" && return
1341 check_mount_and_prep
# One sub-directory per MDT (index k-1), each holding 100 files.
1342 for k in $(seq $MDSCOUNT); do
1343 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1344 createmany -o $DIR/$tdir/${k}/f 100 ||
1345 error "(0) Fail to create 100 files."
# --- namespace LFSCK: start throttled (-s 1), stop, restart with -s 0 ---
1348 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1349 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1350 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1352 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1353 for k in $(seq $MDSCOUNT); do
1354 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1355 mdd.$(facet_svc mds${k}).lfsck_namespace |
1356 awk '/^status/ { print $2 }')
1357 [ "$STATUS" == "scanning-phase1" ] ||
1358 error "(3) MDS${k} Expect 'scanning-phase1', but got '$STATUS'"
1361 echo "Stop namespace LFSCK on all targets by single lctl command."
1362 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1363 error "(4) Fail to stop LFSCK on all devices!"
1365 echo "All the LFSCK targets should be in 'stopped' status."
1366 for k in $(seq $MDSCOUNT); do
1367 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1368 mdd.$(facet_svc mds${k}).lfsck_namespace |
1369 awk '/^status/ { print $2 }')
1370 [ "$STATUS" == "stopped" ] ||
1371 error "(5) MDS${k} Expect 'stopped', but got '$STATUS'"
1374 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1375 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1376 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1378 echo "All the LFSCK targets should be in 'completed' status."
1379 for k in $(seq $MDSCOUNT); do
1380 wait_update_facet mds${k} "$LCTL get_param -n \
1381 mdd.$(facet_svc mds${k}).lfsck_namespace |
1382 awk '/^status/ { print \\\$2 }'" "completed" 8 ||
1383 error "(7) MDS${k} is not the expected 'completed'"
# --- layout LFSCK: same sequence, with full debug logging enabled ---
1386 start_full_debug_logging
1388 echo "Start layout LFSCK on all targets by single command (-s 1)."
1389 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1390 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1392 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1393 for k in $(seq $MDSCOUNT); do
1394 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1395 mdd.$(facet_svc mds${k}).lfsck_layout |
1396 awk '/^status/ { print $2 }')
1397 [ "$STATUS" == "scanning-phase1" ] ||
1398 error "(9) MDS${k} Expect 'scanning-phase1', but got '$STATUS'"
1401 echo "Stop layout LFSCK on all targets by single lctl command."
1402 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1403 error "(10) Fail to stop LFSCK on all devices!"
1405 echo "All the LFSCK targets should be in 'stopped' status."
1406 for k in $(seq $MDSCOUNT); do
1407 local STATUS=$(do_facet mds${k} $LCTL get_param -n \
1408 mdd.$(facet_svc mds${k}).lfsck_layout |
1409 awk '/^status/ { print $2 }')
1410 [ "$STATUS" == "stopped" ] ||
1411 error "(11) MDS${k} Expect 'stopped', but got '$STATUS'"
# For the layout scan the OSTs are checked as well (obdfilter.* parameter).
1414 for k in $(seq $OSTCOUNT); do
1415 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1416 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1417 awk '/^status/ { print $2 }')
1418 [ "$STATUS" == "stopped" ] ||
1419 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1422 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1423 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1424 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1426 echo "All the LFSCK targets should be in 'completed' status."
1427 for k in $(seq $MDSCOUNT); do
1428 # The LFSCK status query interval is 30 seconds. For the case
1429 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1430 # time to guarantee the status sync up.
1431 wait_update_facet mds${k} "$LCTL get_param -n \
1432 mdd.$(facet_svc mds${k}).lfsck_layout |
1433 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
1434 error "(14) MDS${k} is not the expected 'completed'"
1437 stop_full_debug_logging
1439 run_test 12 "single command to trigger LFSCK on all devices"
# Test 13, "LFSCK can repair crashed lmm_oi": creates 32 files while fail_loc
# 0x160f is set on the MDS, runs the layout LFSCK and expects the
# 'repaired_others' counter to be exactly 32.
# NOTE(review): SHOW_LAYOUT is not assigned in the lines visible here.
1443 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1444 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1445 echo "MDT-object FID."
1448 check_mount_and_prep
1450 echo "Inject failure stub to simulate bad lmm_oi"
1451 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1452 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
1453 createmany -o $DIR/$tdir/f 32
1454 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1456 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1457 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1459 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1460 mdd.${MDT_DEV}.lfsck_layout |
1461 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1463 error "(2) unexpected status"
# One repair is expected per file created under the failure injection.
1466 local repaired=$($SHOW_LAYOUT |
1467 awk '/^repaired_others/ { print $2 }')
1468 [ $repaired -eq 32 ] ||
1469 error "(3) Fail to repair crashed lmm_oi: $repaired"
1471 run_test 13 "LFSCK can repair crashed lmm_oi"
# Test 14, "LFSCK can repair MDT-object with dangling reference": creates
# files while fail_loc 0x1610 is set on ost1, then runs the layout LFSCK
# twice -- first without -c (the 'guard' file must still fail to stat), then
# with -c (the 'guard' file must become stat-able with size 0).
# NOTE(review): SHOW_LAYOUT is not assigned in the lines visible here.
1475 echo "The OST-object referenced by the MDT-object should be there;"
1476 echo "otherwise, the LFSCK should re-create the missing OST-object."
1479 check_mount_and_prep
1480 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1482 echo "Inject failure stub to simulate dangling referenced MDT-object"
1483 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1484 do_facet ost1 $LCTL set_param fail_loc=0x1610
1485 local count=$(precreated_ost_obj_count 0 0)
# Create 31 more files than precreated_ost_obj_count reports, plus 'guard',
# all while the failure injection is active.
1487 createmany -o $DIR/$tdir/f $((count + 31))
1488 touch $DIR/$tdir/guard
1489 do_facet ost1 $LCTL set_param fail_loc=0
1491 start_full_debug_logging
1493 # exhaust other pre-created dangling cases
1494 count=$(precreated_ost_obj_count 0 0)
1495 createmany -o $DIR/$tdir/a $count ||
1496 error "(0) Fail to create $count files."
1498 echo "'ls' should fail because of dangling referenced MDT-object"
1499 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
# First scan: started without -c.
1501 echo "Trigger layout LFSCK to find out dangling reference"
1502 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1504 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1505 mdd.${MDT_DEV}.lfsck_layout |
1506 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1508 error "(3) unexpected status"
1511 local repaired=$($SHOW_LAYOUT |
1512 awk '/^repaired_dangling/ { print $2 }')
1513 [ $repaired -ge 32 ] ||
1514 error "(4) Fail to repair dangling reference: $repaired"
1516 echo "'stat' should fail because of not repair dangling by default"
1517 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
# Second scan: started with -c, after which 'guard' must be accessible.
1519 echo "Trigger layout LFSCK to repair dangling reference"
1520 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1522 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1523 mdd.${MDT_DEV}.lfsck_layout |
1524 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1526 error "(7) unexpected status"
1529 # There may be some async LFSCK updates in processing, wait for
1530 # a while until the target reparation has been done. LU-4970.
1532 echo "'stat' should success after layout LFSCK repairing"
1533 wait_update_facet client "stat $DIR/$tdir/guard |
1534 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1535 stat $DIR/$tdir/guard
1537 error "(8) unexpected size"
1540 repaired=$($SHOW_LAYOUT |
1541 awk '/^repaired_dangling/ { print $2 }')
1542 [ $repaired -ge 32 ] ||
1543 error "(9) Fail to repair dangling reference: $repaired"
1545 stop_full_debug_logging
1547 run_test 14 "LFSCK can repair MDT-object with dangling reference"
# Test 15a, "LFSCK can repair unmatched MDT-object/OST-object pairs (1)":
# writes one file while fail_loc 0x1611 is set on ost1, runs the layout LFSCK
# and expects 'repaired_unmatched_pair' to be exactly 1.
# NOTE(review): SHOW_LAYOUT is not assigned in the lines visible here.
1551 echo "If the OST-object referenced by the MDT-object back points"
1552 echo "to some non-exist MDT-object, then the LFSCK should repair"
1553 echo "the OST-object to back point to the right MDT-object."
1556 check_mount_and_prep
1557 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1559 echo "Inject failure stub to make the OST-object to back point to"
1560 echo "non-exist MDT-object."
1561 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
1563 do_facet ost1 $LCTL set_param fail_loc=0x1611
# Write 1 MiB and drop the client's osc locks while the injection is active.
1564 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1565 cancel_lru_locks osc
1566 do_facet ost1 $LCTL set_param fail_loc=0
1568 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1569 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1571 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1572 mdd.${MDT_DEV}.lfsck_layout |
1573 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1575 error "(2) unexpected status"
# Exactly one file was written under the injection, so one repair is expected.
1578 local repaired=$($SHOW_LAYOUT |
1579 awk '/^repaired_unmatched_pair/ { print $2 }')
1580 [ $repaired -eq 1 ] ||
1581 error "(3) Fail to repair unmatched pair: $repaired"
1583 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# Test 15b, "LFSCK can repair unmatched MDT-object/OST-object pairs (2)":
# same shape as the 0x1611 variant, but a 'guard' file is written first
# without failure injection and f0 is then written under fail_loc 0x1612.
# The layout LFSCK is expected to report exactly 1 'repaired_unmatched_pair'.
# NOTE(review): SHOW_LAYOUT is not assigned in the lines visible here.
1587 echo "If the OST-object referenced by the MDT-object back points"
1588 echo "to other MDT-object that doesn't recognize the OST-object,"
1589 echo "then the LFSCK should repair it to back point to the right"
1590 echo "MDT-object (the first one)."
1593 check_mount_and_prep
1594 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# 'guard' is written before the failure injection is enabled.
1595 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1596 cancel_lru_locks osc
1598 echo "Inject failure stub to make the OST-object to back point to"
1599 echo "other MDT-object"
1601 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1602 do_facet ost1 $LCTL set_param fail_loc=0x1612
1603 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1604 cancel_lru_locks osc
1605 do_facet ost1 $LCTL set_param fail_loc=0
1607 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1608 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1610 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1611 mdd.${MDT_DEV}.lfsck_layout |
1612 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1614 error "(2) unexpected status"
1617 local repaired=$($SHOW_LAYOUT |
1618 awk '/^repaired_unmatched_pair/ { print $2 }')
1619 [ $repaired -eq 1 ] ||
1620 error "(3) Fail to repair unmatched pair: $repaired"
1622 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# Test 15c, "LFSCK can repair unmatched MDT-object/OST-object pairs (3)":
# races a layout LFSCK (-A) against a directory migration that is delayed by
# fail_loc 0x1803 on mds2, then checks the repair counters.
# Skipped with fewer than 2 MDSes, and on MDS versions >= 2.7.55 (LU-6437).
# NOTE(review): LTIME and SHOW_LAYOUT are not assigned in the lines visible
# here.
1625 [ $MDSCOUNT -lt 2 ] &&
1626 skip "We need at least 2 MDSes for this test" && return
1628 [ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.55) ] &&
1629 skip "Skip the test after 2.7.55 see LU-6437" && return
1632 echo "According to current metadata migration implementation,"
1633 echo "before the old MDT-object is removed, both the new MDT-object"
1634 echo "and old MDT-object will reference the same LOV layout. Then if"
1635 echo "the layout LFSCK finds the new MDT-object by race, it will"
1636 echo "regard related OST-object(s) as multiple referenced case, and"
1637 echo "will try to create new OST-object(s) for the new MDT-object."
1638 echo "To avoid such trouble, the layout LFSCK needs to lock the old"
1639 echo "MDT-object before confirm the multiple referenced case."
1642 check_mount_and_prep
# Directory a1 lives on MDT index 1; its file f1 is single-striped on OST 0.
1643 $LFS mkdir -i 1 $DIR/$tdir/a1
1644 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1645 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=1
1646 cancel_lru_locks osc
1648 echo "Inject failure stub on MDT1 to delay the migration"
1650 #define OBD_FAIL_MIGRATE_DELAY 0x1803
1651 do_facet mds2 $LCTL set_param fail_val=5 fail_loc=0x1803
1652 echo "Migrate $DIR/$tdir/a1 from MDT1 to MDT0 with delay"
# The migration runs in the background so the LFSCK below can overlap it.
1653 $LFS migrate -m 0 $DIR/$tdir/a1 &
1656 echo "Trigger layout LFSCK to race with the migration"
1657 $START_LAYOUT -A -r || error "(1) Fail to start layout LFSCK!"
1659 for k in $(seq $MDSCOUNT); do
1660 # The LFSCK status query interval is 30 seconds. For the case
1661 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1662 # time to guarantee the status sync up.
1663 wait_update_facet mds${k} "$LCTL get_param -n \
1664 mdd.$(facet_svc mds${k}).lfsck_layout |
1665 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
1666 error "(2) MDS${k} is not the expected 'completed'"
1669 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
# Expected counters: one unmatched pair repaired, no multiple-reference repair.
1670 local repaired=$($SHOW_LAYOUT |
1671 awk '/^repaired_unmatched_pair/ { print $2 }')
1672 [ $repaired -eq 1 ] ||
1673 error "(3) Fail to repair unmatched pair: $repaired"
1675 repaired=$($SHOW_LAYOUT |
1676 awk '/^repaired_multiple_referenced/ { print $2 }')
1677 [ $repaired -eq 0 ] ||
1678 error "(4) Unexpectedly repaird multiple references: $repaired"
1680 run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)"
# Test 16, "LFSCK can repair inconsistent MDT-object/OST-object owner":
# chowns a file while fail_loc 0x1613 is set on the MDS, runs the layout
# LFSCK and expects 'repaired_inconsistent_owner' to be exactly 1.
# NOTE(review): SHOW_LAYOUT is not assigned in the lines visible here.
1684 echo "If the OST-object's owner information does not match the owner"
1685 echo "information stored in the MDT-object, then the LFSCK trust the"
1686 echo "MDT-object and update the OST-object's owner information."
1689 check_mount_and_prep
1690 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1691 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1692 cancel_lru_locks osc
1694 echo "Inject failure stub to skip OST-object owner changing"
1695 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
1696 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
# Change owner and group to 1 while the injection is active.
1697 chown 1.1 $DIR/$tdir/f0
1698 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1700 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
1703 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1705 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1706 mdd.${MDT_DEV}.lfsck_layout |
1707 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1709 error "(2) unexpected status"
1712 local repaired=$($SHOW_LAYOUT |
1713 awk '/^repaired_inconsistent_owner/ { print $2 }')
1714 [ $repaired -eq 1 ] ||
1715 error "(3) Fail to repair inconsistent owner: $repaired"
1717 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# Test 17, "LFSCK can repair multiple references": with fail_loc 0x1614 set
# on the MDS, creates 'guard' (1 MiB) and then f0; f0 is expected to show
# guard's size before the repair. After the layout LFSCK reports one
# 'repaired_multiple_referenced', rewriting f0 with 2 MiB must leave guard's
# size at 1 MiB.
# NOTE(review): SHOW_LAYOUT is not assigned in the lines visible here.
1721 echo "If more than one MDT-objects reference the same OST-object,"
1722 echo "and the OST-object only recognizes one MDT-object, then the"
1723 echo "LFSCK should create new OST-objects for such non-recognized"
1727 check_mount_and_prep
1728 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1730 echo "Inject failure stub to make two MDT-objects to refernce"
1731 echo "the OST-object"
1733 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
1734 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
1736 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1737 cancel_lru_locks osc
# Creates a single file, f0, while the injection is still active.
1739 createmany -o $DIR/$tdir/f 1
1741 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1743 cancel_lru_locks mdc
1744 cancel_lru_locks osc
# Field 5 of 'ls -l' is the file size; f0 must report guard's 1 MiB.
1746 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
1747 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
1748 [ $size -eq 1048576 ] ||
1749 error "(1) f0 (wrong) size should be 1048576, but got $size"
1751 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
1754 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1756 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1757 mdd.${MDT_DEV}.lfsck_layout |
1758 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1760 error "(3) unexpected status"
1763 local repaired=$($SHOW_LAYOUT |
1764 awk '/^repaired_multiple_referenced/ { print $2 }')
1765 [ $repaired -eq 1 ] ||
1766 error "(4) Fail to repair multiple references: $repaired"
# After the repair, writing 2 MiB to f0 must not change guard's size.
1768 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
1769 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
1770 error "(5) Fail to write f0."
1771 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1772 [ $size -eq 1048576 ] ||
1773 error "(6) guard size should be 1048576, but got $size"
1775 run_test 17 "LFSCK can repair multiple references"
# Runs locally (not through do_facet) before the tests that follow; output is
# discarded. Presumably adds 'cache' to the local debug mask -- confirm.
1777 $LCTL set_param debug=+cache > /dev/null
# Test 18a, "Find out orphan OST-object and repair it (1)": makes files lose
# their layout EA via fail_loc 0x1615 (triggered by chown), confirms that the
# reported size is then wrong, runs the layout LFSCK with -o and checks the
# 'repaired_orphan' counters and that the file sizes are back to the saved
# value. The a2/f2 part only runs with at least 2 MDSes.
# NOTE(review): LTIME is not assigned in the lines visible here.
1781 echo "The target MDT-object is there, but related stripe information"
1782 echo "is lost or partly lost. The LFSCK should regenerate the missing"
1783 echo "layout EA entries."
1786 check_mount_and_prep
# a1 on MDT index 0, f1 single-striped on OST 0, 2 MiB written.
1787 $LFS mkdir -i 0 $DIR/$tdir/a1
1788 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
1789 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# With 'ls -il' the size is field 6 (the inode number comes first).
1791 local saved_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1793 $LFS path2fid $DIR/$tdir/a1/f1
1794 $LFS getstripe $DIR/$tdir/a1/f1
# a2 on MDT index 1, f2 with stripe count 2 starting at OST 1, 2 MiB written.
1796 if [ $MDSCOUNT -ge 2 ]; then
1797 $LFS mkdir -i 1 $DIR/$tdir/a2
1798 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
1799 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1800 $LFS path2fid $DIR/$tdir/a2/f2
1801 $LFS getstripe $DIR/$tdir/a2/f2
1804 cancel_lru_locks osc
1806 echo "Inject failure, to make the MDT-object lost its layout EA"
1807 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
1808 do_facet mds1 $LCTL set_param fail_loc=0x1615
1809 chown 1.1 $DIR/$tdir/a1/f1
1811 if [ $MDSCOUNT -ge 2 ]; then
1812 do_facet mds2 $LCTL set_param fail_loc=0x1615
1813 chown 1.1 $DIR/$tdir/a2/f2
1819 do_facet mds1 $LCTL set_param fail_loc=0
1820 if [ $MDSCOUNT -ge 2 ]; then
1821 do_facet mds2 $LCTL set_param fail_loc=0
1824 cancel_lru_locks mdc
1825 cancel_lru_locks osc
1827 echo "The file size should be incorrect since layout EA is lost"
1828 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1829 [ "$cur_size" != "$saved_size" ] ||
1830 error "(1) Expect incorrect file1 size"
1832 if [ $MDSCOUNT -ge 2 ]; then
1833 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1834 [ "$cur_size" != "$saved_size" ] ||
1835 error "(2) Expect incorrect file2 size"
1838 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1839 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
1841 for k in $(seq $MDSCOUNT); do
1842 # The LFSCK status query interval is 30 seconds. For the case
1843 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1844 # time to guarantee the status sync up.
1845 wait_update_facet mds${k} "$LCTL get_param -n \
1846 mdd.$(facet_svc mds${k}).lfsck_layout |
1847 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
1848 error "(4) MDS${k} is not the expected 'completed'"
# The OSTs are sampled once (no waiting) after all MDTs have completed.
1851 for k in $(seq $OSTCOUNT); do
1852 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
1853 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1854 awk '/^status/ { print $2 }')
1855 [ "$cur_status" == "completed" ] ||
1856 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repairs: 1 on mds1 (f1 has one stripe), 2 on mds2 (f2 has two).
1859 local repaired=$(do_facet mds1 $LCTL get_param -n \
1860 mdd.$(facet_svc mds1).lfsck_layout |
1861 awk '/^repaired_orphan/ { print $2 }')
1862 [ $repaired -eq 1 ] ||
1863 error "(6.1) Expect 1 fixed on mds1, but got: $repaired"
1865 if [ $MDSCOUNT -ge 2 ]; then
1866 repaired=$(do_facet mds2 $LCTL get_param -n \
1867 mdd.$(facet_svc mds2).lfsck_layout |
1868 awk '/^repaired_orphan/ { print $2 }')
1869 [ $repaired -eq 2 ] ||
1870 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
1873 $LFS path2fid $DIR/$tdir/a1/f1
1874 $LFS getstripe $DIR/$tdir/a1/f1
1876 if [ $MDSCOUNT -ge 2 ]; then
1877 $LFS path2fid $DIR/$tdir/a2/f2
1878 $LFS getstripe $DIR/$tdir/a2/f2
1881 echo "The file size should be correct after layout LFSCK scanning"
1882 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1883 [ "$cur_size" == "$saved_size" ] ||
1884 error "(7) Expect file1 size $saved_size, but got $cur_size"
1886 if [ $MDSCOUNT -ge 2 ]; then
1887 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1888 [ "$cur_size" == "$saved_size" ] ||
1889 error "(8) Expect file2 size $saved_size, but got $cur_size"
1892 run_test 18a "Find out orphan OST-object and repair it (1)"
# Test 18b, "Find out orphan OST-object and repair it (2)": removes files
# while fail_loc 0x1616 is set on the MDS(es), runs the layout LFSCK with -o,
# checks the 'repaired_orphan' counters, then moves the entries named
# <fid>-R-0 from .lustre/lost+found/MDTxxxx back to their original paths and
# checks that the sizes match the saved value. The a2/f2 part only runs with
# at least 2 MDSes.
# NOTE(review): LTIME is not assigned in the lines visible here.
1896 echo "The target MDT-object is lost. The LFSCK should re-create the"
1897 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
1898 echo "can move it back to normal namespace manually."
1901 check_mount_and_prep
1902 $LFS mkdir -i 0 $DIR/$tdir/a1
1903 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
1904 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# With 'ls -il' the size is field 6 (the inode number comes first).
1905 local saved_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
# The FID is recorded now because it names the lost+found entry used later.
1906 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
1908 $LFS getstripe $DIR/$tdir/a1/f1
1910 if [ $MDSCOUNT -ge 2 ]; then
1911 $LFS mkdir -i 1 $DIR/$tdir/a2
1912 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
1913 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1914 fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
1916 $LFS getstripe $DIR/$tdir/a2/f2
1919 cancel_lru_locks osc
1921 echo "Inject failure, to simulate the case of missing the MDT-object"
1922 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
1923 do_facet mds1 $LCTL set_param fail_loc=0x1616
1924 rm -f $DIR/$tdir/a1/f1
1926 if [ $MDSCOUNT -ge 2 ]; then
1927 do_facet mds2 $LCTL set_param fail_loc=0x1616
1928 rm -f $DIR/$tdir/a2/f2
1934 do_facet mds1 $LCTL set_param fail_loc=0
1935 if [ $MDSCOUNT -ge 2 ]; then
1936 do_facet mds2 $LCTL set_param fail_loc=0
1939 cancel_lru_locks mdc
1940 cancel_lru_locks osc
1942 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1943 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
1945 for k in $(seq $MDSCOUNT); do
1946 # The LFSCK status query interval is 30 seconds. For the case
1947 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1948 # time to guarantee the status sync up.
1949 wait_update_facet mds${k} "$LCTL get_param -n \
1950 mdd.$(facet_svc mds${k}).lfsck_layout |
1951 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
1952 error "(2) MDS${k} is not the expected 'completed'"
# The OSTs are sampled once (no waiting) after all MDTs have completed.
1955 for k in $(seq $OSTCOUNT); do
1956 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
1957 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1958 awk '/^status/ { print $2 }')
1959 [ "$cur_status" == "completed" ] ||
1960 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repairs: 1 on mds1 (f1 has one stripe), 2 on mds2 (f2 has two).
1963 local repaired=$(do_facet mds1 $LCTL get_param -n \
1964 mdd.$(facet_svc mds1).lfsck_layout |
1965 awk '/^repaired_orphan/ { print $2 }')
1966 [ $repaired -eq 1 ] ||
1967 error "(4.1) Expect 1 fixed on mds1, but got: $repaired"
1969 if [ $MDSCOUNT -ge 2 ]; then
1970 repaired=$(do_facet mds2 $LCTL get_param -n \
1971 mdd.$(facet_svc mds2).lfsck_layout |
1972 awk '/^repaired_orphan/ { print $2 }')
1973 [ $repaired -eq 2 ] ||
1974 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
# Move the re-created objects back under their original names.
1977 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
1978 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
1979 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
1981 if [ $MDSCOUNT -ge 2 ]; then
1982 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
1983 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
1986 $LFS path2fid $DIR/$tdir/a1/f1
1987 $LFS getstripe $DIR/$tdir/a1/f1
1989 if [ $MDSCOUNT -ge 2 ]; then
1990 $LFS path2fid $DIR/$tdir/a2/f2
1991 $LFS getstripe $DIR/$tdir/a2/f2
1994 echo "The file size should be correct after layout LFSCK scanning"
1995 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1996 [ "$cur_size" == "$saved_size" ] ||
1997 error "(7) Expect file1 size $saved_size, but got $cur_size"
1999 if [ $MDSCOUNT -ge 2 ]; then
2000 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2001 [ "$cur_size" == "$saved_size" ] ||
2002 error "(8) Expect file2 size $saved_size, but got $cur_size"
2005 run_test 18b "Find out orphan OST-object and repair it (2)"
# Test 18c: the MDT-object is lost and its OST-object carries no parent FID.
# Layout LFSCK is expected to re-create the MDT-object, with a new FID, under
# .lustre/lost+found/MDTxxxx.
2009 echo "The target MDT-object is lost, and the OST-object FID is missing."
2010 echo "The LFSCK should re-create the MDT-object with new FID under the "
2011 echo "directory .lustre/lost+found/MDTxxxx."
# a1 is created on MDT index 0; its files get one 1M stripe on OST index 0.
2014 check_mount_and_prep
2015 $LFS mkdir -i 0 $DIR/$tdir/a1
2016 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2018 echo "Inject failure, to simulate the case of missing parent FID"
2019 #define OBD_FAIL_LFSCK_NOPFID 0x1617
2020 do_facet ost1 $LCTL set_param fail_loc=0x1617
# f1 is written while the fail_loc above is armed on ost1.
2022 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2023 $LFS getstripe $DIR/$tdir/a1/f1
# With at least two MDTs, build the same layout under a2 on MDT index 1.
2025 if [ $MDSCOUNT -ge 2 ]; then
2026 $LFS mkdir -i 1 $DIR/$tdir/a2
2027 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a2
2028 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2029 $LFS getstripe $DIR/$tdir/a2/f2
2032 cancel_lru_locks osc
2034 echo "Inject failure, to simulate the case of missing the MDT-object"
2035 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2036 do_facet mds1 $LCTL set_param fail_loc=0x1616
2037 rm -f $DIR/$tdir/a1/f1
2039 if [ $MDSCOUNT -ge 2 ]; then
2040 do_facet mds2 $LCTL set_param fail_loc=0x1616
2041 rm -f $DIR/$tdir/a2/f2
# Disarm the MDT-side fail_loc before LFSCK is started.
2047 do_facet mds1 $LCTL set_param fail_loc=0
2048 if [ $MDSCOUNT -ge 2 ]; then
2049 do_facet mds2 $LCTL set_param fail_loc=0
2052 cancel_lru_locks mdc
2053 cancel_lru_locks osc
2055 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2056 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# Wait for every MDT to report the layout LFSCK status 'completed'.
2058 for k in $(seq $MDSCOUNT); do
2059 # The LFSCK status query internal is 30 seconds. For the case
2060 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2061 # time to guarantee the status sync up.
2062 wait_update_facet mds${k} "$LCTL get_param -n \
2063 mdd.$(facet_svc mds${k}).lfsck_layout |
2064 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2065 error "(2) MDS${k} is not the expected 'completed'"
# Each OST status is sampled once (no waiting) and must already be 'completed'.
2068 for k in $(seq $OSTCOUNT); do
2069 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2070 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2071 awk '/^status/ { print $2 }')
2072 [ "$cur_status" == "completed" ] ||
2073 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
2076 if [ $MDSCOUNT -ge 2 ]; then
# mds1 must report exactly $expected repaired orphans; mds2, when present,
# must report none.
2082 local repaired=$(do_facet mds1 $LCTL get_param -n \
2083 mdd.$(facet_svc mds1).lfsck_layout |
2084 awk '/^repaired_orphan/ { print $2 }')
2085 [ $repaired -eq $expected ] ||
2086 error "(4) Expect $expected fixed on mds1, but got: $repaired"
2088 if [ $MDSCOUNT -ge 2 ]; then
2089 repaired=$(do_facet mds2 $LCTL get_param -n \
2090 mdd.$(facet_svc mds2).lfsck_layout |
2091 awk '/^repaired_orphan/ { print $2 }')
2092 [ $repaired -eq 0 ] ||
2093 error "(5) Expect 0 fixed on mds2, but got: $repaired"
2096 ls -ail $MOUNT/.lustre/lost+found/
2098 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
# The -name patterns here and below are quoted so that find receives them
# verbatim; unquoted, the shell could expand them against the current
# directory before find ever runs.
2099 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
2100 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name "*-N-*")
2102 error "(6) .lustre/lost+found/MDT0001/ should be empty"
2105 echo "There should be some stub under .lustre/lost+found/MDT0000/"
2106 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2107 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
2109 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-N-*")
2110 [ ! -z "$cname" ] ||
2111 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2113 run_test 18c "Find out orphan OST-object and repair it (3)"
# Test 18d: repairing a dangling reference put a newly created, never
# modified OST-object into the layout EA slot. When LFSCK later finds the
# orphan OST-object it is expected to put the orphan back in that slot, so
# f2 ends up with its original size and data.
2117 echo "The target MDT-object layout EA slot is occpuied by some new"
2118 echo "created OST-object when repair dangling reference case. Such"
2119 echo "conflict OST-object has never been modified. Then when found"
2120 echo "the orphan OST-object, LFSCK will replace it with the orphan"
2124 check_mount_and_prep
2126 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2127 echo "guard" > $DIR/$tdir/a1/f1
2128 echo "foo" > $DIR/$tdir/a1/f2
# Remember f2's size (field 6 of 'ls -il') for the before/after comparisons.
2129 local saved_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2130 $LFS path2fid $DIR/$tdir/a1/f1
2131 $LFS getstripe $DIR/$tdir/a1/f1
2132 $LFS path2fid $DIR/$tdir/a1/f2
2133 $LFS getstripe $DIR/$tdir/a1/f2
2134 cancel_lru_locks osc
2136 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2137 echo "to reference the same OST-object (which is f1's OST-obejct)."
2138 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2139 echo "dangling reference case, but f2's old OST-object is there."
# The chown runs while the fail_loc is armed (presumably the attribute change
# is what triggers the injected stripe switch -- confirm). It uses the
# standard OWNER:GROUP separator; the '.' form is a deprecated extension.
2142 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2143 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2144 chown 1:1 $DIR/$tdir/a1/f2
2145 rm -f $DIR/$tdir/a1/f1
2148 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2150 echo "stopall to cleanup object cache"
2153 setupall > /dev/null
2155 echo "The file size should be incorrect since dangling referenced"
2156 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2157 [ "$cur_size" != "$saved_size" ] ||
2158 error "(1) Expect incorrect file2 size"
# The delay fail_loc is armed before LFSCK starts and cleared again once
# mds1 has been seen in 'scanning-phase2'.
2160 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2161 do_facet $SINGLEMDS $LCTL set_param fail_val=5 fail_loc=0x1602
2163 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2164 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
2166 wait_update_facet mds1 "$LCTL get_param -n \
2167 mdd.$(facet_svc mds1).lfsck_layout |
2168 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2169 error "(3.0) MDS1 is not the expected 'scanning-phase2'"
2171 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
# Wait for every MDT to report the layout LFSCK status 'completed'.
2173 for k in $(seq $MDSCOUNT); do
2174 # The LFSCK status query internal is 30 seconds. For the case
2175 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2176 # time to guarantee the status sync up.
2177 wait_update_facet mds${k} "$LCTL get_param -n \
2178 mdd.$(facet_svc mds${k}).lfsck_layout |
2179 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2180 error "(3) MDS${k} is not the expected 'completed'"
# Each OST status is sampled once and must already be 'completed'.
2183 for k in $(seq $OSTCOUNT); do
2184 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2185 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2186 awk '/^status/ { print $2 }')
2187 [ "$cur_status" == "completed" ] ||
2188 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
# Exactly one orphan must have been repaired on $SINGLEMDS.
2191 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2192 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2193 awk '/^repaired_orphan/ { print $2 }')
2194 [ $repaired -eq 1 ] ||
2195 error "(5) Expect 1 orphan has been fixed, but got: $repaired"
2197 echo "The file size should be correct after layout LFSCK scanning"
2198 cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2199 [ "$cur_size" == "$saved_size" ] ||
2200 error "(6) Expect file2 size $saved_size, but got $cur_size"
2202 echo "The LFSCK should find back the original data."
2203 cat $DIR/$tdir/a1/f2
2204 $LFS path2fid $DIR/$tdir/a1/f2
2205 $LFS getstripe $DIR/$tdir/a1/f2
2207 run_test 18d "Find out orphan OST-object and repair it (4)"
# Test 18e: like 18d, but the OST-object that took over the layout EA slot is
# modified while LFSCK runs. To keep the new data, LFSCK is expected to leave
# f2 alone and create a separate stub file (name matching *-C-*) under
# .lustre/lost+found/MDT0000/ that references the old orphan OST-object.
2211 echo "The target MDT-object layout EA slot is occpuied by some new"
2212 echo "created OST-object when repair dangling reference case. Such"
2213 echo "conflict OST-object has been modified by others. To keep the"
2214 echo "new data, the LFSCK will create a new file to refernece this"
2215 echo "old orphan OST-object."
2218 check_mount_and_prep
2220 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2221 echo "guard" > $DIR/$tdir/a1/f1
2222 echo "foo" > $DIR/$tdir/a1/f2
# Remember f2's size (field 6 of 'ls -il') for the later comparisons.
2223 local saved_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2224 $LFS path2fid $DIR/$tdir/a1/f1
2225 $LFS getstripe $DIR/$tdir/a1/f1
2226 $LFS path2fid $DIR/$tdir/a1/f2
2227 $LFS getstripe $DIR/$tdir/a1/f2
2228 cancel_lru_locks osc
2230 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2231 echo "to reference the same OST-object (which is f1's OST-obejct)."
2232 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2233 echo "dangling reference case, but f2's old OST-object is there."
# The chown runs while the fail_loc is armed (presumably the attribute change
# is what triggers the injected stripe switch -- confirm). It uses the
# standard OWNER:GROUP separator; the '.' form is a deprecated extension.
2236 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2237 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2238 chown 1:1 $DIR/$tdir/a1/f2
2239 rm -f $DIR/$tdir/a1/f1
2242 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2244 echo "stopall to cleanup object cache"
2247 setupall > /dev/null
2249 echo "The file size should be incorrect since dangling referenced"
2250 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2251 [ "$cur_size" != "$saved_size" ] ||
2252 error "(1) Expect incorrect file2 size"
# The delay fail_loc stays armed until f2 has been appended to below, i.e.
# the write happens after mds1 has been seen in 'scanning-phase2'.
2254 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2255 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
2257 start_full_debug_logging
2259 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2260 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
2262 wait_update_facet mds1 "$LCTL get_param -n \
2263 mdd.$(facet_svc mds1).lfsck_layout |
2264 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2265 error "(3) MDS1 is not the expected 'scanning-phase2'"
2267 # to guarantee all updates are synced.
2271 echo "Write new data to f2 to modify the new created OST-object."
2272 echo "dummy" >> $DIR/$tdir/a1/f2
2274 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
# Wait for every MDT to report the layout LFSCK status 'completed'.
2276 for k in $(seq $MDSCOUNT); do
2277 # The LFSCK status query internal is 30 seconds. For the case
2278 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2279 # time to guarantee the status sync up.
2280 wait_update_facet mds${k} "$LCTL get_param -n \
2281 mdd.$(facet_svc mds${k}).lfsck_layout |
2282 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2283 error "(4) MDS${k} is not the expected 'completed'"
# Each OST status is sampled once and must already be 'completed'.
2286 for k in $(seq $OSTCOUNT); do
2287 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2288 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2289 awk '/^status/ { print $2 }')
2290 [ "$cur_status" == "completed" ] ||
2291 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2294 stop_full_debug_logging
# Exactly one orphan must have been repaired on $SINGLEMDS.
2296 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2297 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2298 awk '/^repaired_orphan/ { print $2 }')
2299 [ $repaired -eq 1 ] ||
2300 error "(6) Expect 1 orphan has been fixed, but got: $repaired"
2302 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2303 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2304 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# The -name pattern is quoted so that find receives it verbatim; unquoted,
# the shell could expand it against the current directory first.
2306 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-C-*")
2307 [ ! -z "$cname" ] ||
2308 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2310 echo "The stub file should keep the original f2 data"
2311 cur_size=$(ls -il $cname | awk '{ print $6 }')
2312 [ "$cur_size" == "$saved_size" ] ||
2313 error "(9) Expect file2 size $saved_size, but got $cur_size"
2316 $LFS path2fid $cname
2317 $LFS getstripe $cname
2319 echo "The f2 should contains new data."
2320 cat $DIR/$tdir/a1/f2
2321 $LFS path2fid $DIR/$tdir/a1/f2
2322 $LFS getstripe $DIR/$tdir/a1/f2
2324 run_test 18e "Find out orphan OST-object and repair it (5)"
# Test 18f: MDT-objects are lost and one OST fails to verify objects during
# the first-stage scan. LFSCK is expected to skip orphan handling for that
# OST only (status 'partial') and to finish the job on a second, clean run.
# Requires at least two OSTs.
2327 [ $OSTCOUNT -lt 2 ] &&
2328 skip "The test needs at least 2 OSTs" && return
2331 echo "The target MDT-object is lost. The LFSCK should re-create the"
2332 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2333 echo "to verify some OST-object(s) during the first stage-scanning,"
2334 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2335 echo "should not be affected."
# On MDT index 0: a1 holds single-stripe files, a2 holds a two-stripe file;
# both layouts start at OST index 0.
2338 check_mount_and_prep
2339 $LFS mkdir -i 0 $DIR/$tdir/a1
2340 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2341 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2342 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2343 $LFS mkdir -i 0 $DIR/$tdir/a2
2344 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
2345 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2346 $LFS getstripe $DIR/$tdir/a1/f1
2347 $LFS getstripe $DIR/$tdir/a2/f2
# With at least two MDTs, mirror that setup as a3/a4 on MDT index 1.
2349 if [ $MDSCOUNT -ge 2 ]; then
2350 $LFS mkdir -i 1 $DIR/$tdir/a3
2351 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a3
2352 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2353 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2354 $LFS mkdir -i 1 $DIR/$tdir/a4
2355 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
2356 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2357 $LFS getstripe $DIR/$tdir/a3/f3
2358 $LFS getstripe $DIR/$tdir/a4/f4
2361 cancel_lru_locks osc
2363 echo "Inject failure, to simulate the case of missing the MDT-object"
2364 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2365 do_facet mds1 $LCTL set_param fail_loc=0x1616
2366 rm -f $DIR/$tdir/a1/f1
2367 rm -f $DIR/$tdir/a2/f2
2369 if [ $MDSCOUNT -ge 2 ]; then
2370 do_facet mds2 $LCTL set_param fail_loc=0x1616
2371 rm -f $DIR/$tdir/a3/f3
2372 rm -f $DIR/$tdir/a4/f4
# Disarm the MDT-side fail_loc before LFSCK is started.
2378 do_facet mds1 $LCTL set_param fail_loc=0
2379 if [ $MDSCOUNT -ge 2 ]; then
2380 do_facet mds2 $LCTL set_param fail_loc=0
2383 cancel_lru_locks mdc
2384 cancel_lru_locks osc
2386 echo "Inject failure, to simulate the OST0 fail to handle"
2387 echo "MDT0 LFSCK request during the first-stage scanning."
2388 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
2389 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
2391 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2392 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# First run: every MDT and ost1 must end up 'partial'; ost2 must be
# 'completed'.
2394 for k in $(seq $MDSCOUNT); do
2395 # The LFSCK status query internal is 30 seconds. For the case
2396 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2397 # time to guarantee the status sync up.
2398 wait_update_facet mds${k} "$LCTL get_param -n \
2399 mdd.$(facet_svc mds${k}).lfsck_layout |
2400 awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
2401 error "(2) MDS${k} is not the expected 'partial'"
2404 wait_update_facet ost1 "$LCTL get_param -n \
2405 obdfilter.$(facet_svc ost1).lfsck_layout |
2406 awk '/^status/ { print \\\$2 }'" "partial" $LTIME || {
2407 error "(3) OST1 is not the expected 'partial'"
2410 wait_update_facet ost2 "$LCTL get_param -n \
2411 obdfilter.$(facet_svc ost2).lfsck_layout |
2412 awk '/^status/ { print \\\$2 }'" "completed" $LTIME || {
2413 error "(4) OST2 is not the expected 'completed'"
2416 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
# After the partial run each MDT must report exactly one repaired orphan.
2418 local repaired=$(do_facet mds1 $LCTL get_param -n \
2419 mdd.$(facet_svc mds1).lfsck_layout |
2420 awk '/^repaired_orphan/ { print $2 }')
2421 [ $repaired -eq 1 ] ||
2422 error "(5) Expect 1 fixed on mds1, but got: $repaired"
2424 if [ $MDSCOUNT -ge 2 ]; then
2425 repaired=$(do_facet mds2 $LCTL get_param -n \
2426 mdd.$(facet_svc mds2).lfsck_layout |
2427 awk '/^repaired_orphan/ { print $2 }')
2428 [ $repaired -eq 1 ] ||
2429 error "(6) Expect 1 fixed on mds2, but got: $repaired"
2432 echo "Trigger layout LFSCK on all devices again to cleanup"
2433 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
# Second run, with no failure injected: everything must be 'completed'.
2435 for k in $(seq $MDSCOUNT); do
2436 # The LFSCK status query internal is 30 seconds. For the case
2437 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2438 # time to guarantee the status sync up.
2439 wait_update_facet mds${k} "$LCTL get_param -n \
2440 mdd.$(facet_svc mds${k}).lfsck_layout |
2441 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2442 error "(8) MDS${k} is not the expected 'completed'"
# cur_status is declared local, as in the sibling tests, so that it does not
# leak into the global scope of the test script.
2445 for k in $(seq $OSTCOUNT); do
2446 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2447 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2448 awk '/^status/ { print $2 }')
2449 [ "$cur_status" == "completed" ] ||
2450 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
# After the second run each MDT must report exactly two repaired orphans.
# 'repaired' is already local here, so it is only re-assigned.
2454 repaired=$(do_facet mds1 $LCTL get_param -n \
2455 mdd.$(facet_svc mds1).lfsck_layout |
2456 awk '/^repaired_orphan/ { print $2 }')
2457 [ $repaired -eq 2 ] ||
2458 error "(10) Expect 2 fixed on mds1, but got: $repaired"
2460 if [ $MDSCOUNT -ge 2 ]; then
2461 repaired=$(do_facet mds2 $LCTL get_param -n \
2462 mdd.$(facet_svc mds2).lfsck_layout |
2463 awk '/^repaired_orphan/ { print $2 }')
2464 [ $repaired -eq 2 ] ||
2465 error "(11) Expect 2 fixed on mds2, but got: $repaired"
2468 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# Adjust the local debug setting before the test 19 group; output is
# discarded. NOTE(review): '-cache' presumably removes the 'cache' flag from
# the debug mask -- confirm against lctl documentation.
2470 $LCTL set_param debug=-cache > /dev/null
# Test 19a: with lfsck_verify_pfid enabled on OST0000, a read that carries a
# wrong parent FID must be rejected.
2473 check_mount_and_prep
2474 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2476 echo "foo" > $DIR/$tdir/a0
2477 echo "guard" > $DIR/$tdir/a1
2478 cancel_lru_locks osc
2480 echo "Inject failure, then client will offer wrong parent FID when read"
2481 do_facet ost1 $LCTL set_param -n \
2482 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
# This fail_loc is set with a plain $LCTL (no do_facet), i.e. on the node
# running the script, matching the "client will offer wrong parent FID" note.
2483 #define OBD_FAIL_LFSCK_INVALID_PFID 0x1619
2484 $LCTL set_param fail_loc=0x1619
2486 echo "Read RPC with wrong parent FID should be denied"
# A successful cat is the failure case here.
2487 cat $DIR/$tdir/a0 && error "(3) Read should be denied!"
2488 $LCTL set_param fail_loc=0
2490 run_test 19a "OST-object inconsistency self detect"
# Test 19b: an OST-object is created with a parent FID that points to a
# non-existent MDT-object. Once lfsck_verify_pfid is enabled on OST0000, a
# read with the right parent FID must succeed and bump the 'repaired' counter
# from 0 to 1.
2493 check_mount_and_prep
2494 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2496 echo "Inject failure stub to make the OST-object to back point to"
2497 echo "non-exist MDT-object"
# f0 is written while the fail_loc is armed on ost1, then the fail_loc is
# cleared again.
2499 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
2500 do_facet ost1 $LCTL set_param fail_loc=0x1611
2501 echo "foo" > $DIR/$tdir/f0
2502 cancel_lru_locks osc
2503 do_facet ost1 $LCTL set_param fail_loc=0
2505 echo "Nothing should be fixed since self detect and repair is disabled"
# The counter is the second field of the 'repaired' line in lfsck_verify_pfid.
2506 local repaired=$(do_facet ost1 $LCTL get_param -n \
2507 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2508 awk '/^repaired/ { print $2 }')
2509 [ $repaired -eq 0 ] ||
2510 error "(1) Expected 0 repaired, but got $repaired"
2512 echo "Read RPC with right parent FID should be accepted,"
2513 echo "and cause parent FID on OST to be fixed"
# Enable parent-FID verification, then read f0 to trigger the self repair.
2515 do_facet ost1 $LCTL set_param -n \
2516 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2517 cat $DIR/$tdir/f0 || error "(2) Read should not be denied!"
2519 repaired=$(do_facet ost1 $LCTL get_param -n \
2520 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2521 awk '/^repaired/ { print $2 }')
2522 [ $repaired -eq 1 ] ||
2523 error "(3) Expected 1 repaired, but got $repaired"
2525 run_test 19b "OST-object inconsistency self repair"
# Test 20: MDT-objects are lost together with some of their OST-objects.
# Layout LFSCK must re-create each file under .lustre/lost+found/MDT0000/ as
# ${fid}-R-0, marking missing stripes as a LOV EA hole, and the client must be
# able to operate on such files without crashing. Requires at least two OSTs.
2528 [ $OSTCOUNT -lt 2 ] &&
2529 skip "The test needs at least 2 OSTs" && return
2532 echo "The target MDT-object and some of its OST-object are lost."
2533 echo "The LFSCK should find out the left OST-objects and re-create"
2534 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
2535 echo "with the partial OST-objects (LOV EA hole)."
2537 echo "New client can access the file with LOV EA hole via normal"
2538 echo "system tools or commands without crash the system."
2540 echo "For old client, even though it cannot access the file with"
2541 echo "LOV EA hole, it should not cause the system crash."
2544 check_mount_and_prep
2545 $LFS mkdir -i 0 $DIR/$tdir/a1
# Stripe a1 over three OSTs when more than two exist, otherwise over two.
2546 if [ $OSTCOUNT -gt 2 ]; then
2547 $LFS setstripe -c 3 -i 0 -S 1M $DIR/$tdir/a1
2550 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a1
2554 # 256 blocks on the stripe0.
2555 # 1 block on the stripe1 for 2 OSTs case.
2556 # 256 blocks on the stripe1 for other cases.
2557 # 1 block on the stripe2 if OSTs > 2
# Each file is written as $bcount blocks of 4096 bytes.
2558 dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
2559 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
2560 dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount
# The FIDs are kept because the recovered files are named ${fid}-R-0 below.
2562 local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
2563 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2564 local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)
2567 $LFS getstripe $DIR/$tdir/a1/f0
2569 $LFS getstripe $DIR/$tdir/a1/f1
2571 $LFS getstripe $DIR/$tdir/a1/f2
# f3 only exists in the three-stripe configuration.
2573 if [ $OSTCOUNT -gt 2 ]; then
2574 dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
2575 fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
2577 $LFS getstripe $DIR/$tdir/a1/f3
2580 cancel_lru_locks osc
2582 echo "Inject failure..."
2583 echo "To simulate f0 lost MDT-object"
2584 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2585 do_facet mds1 $LCTL set_param fail_loc=0x1616
2586 rm -f $DIR/$tdir/a1/f0
2588 echo "To simulate f1 lost MDT-object and OST-object0"
2589 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
2590 do_facet mds1 $LCTL set_param fail_loc=0x161a
2591 rm -f $DIR/$tdir/a1/f1
# For f2 and f3 only fail_val changes; per the messages it selects which
# OST-object (stripe index) is lost together with the MDT-object.
2593 echo "To simulate f2 lost MDT-object and OST-object1"
2594 do_facet mds1 $LCTL set_param fail_val=1
2595 rm -f $DIR/$tdir/a1/f2
2597 if [ $OSTCOUNT -gt 2 ]; then
2598 echo "To simulate f3 lost MDT-object and OST-object2"
2599 do_facet mds1 $LCTL set_param fail_val=2
2600 rm -f $DIR/$tdir/a1/f3
# The client stays unmounted while LFSCK runs; it is remounted at step (5.0).
2603 umount_client $MOUNT
2606 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
2608 echo "Inject failure to slow down the LFSCK on OST0"
2609 #define OBD_FAIL_LFSCK_DELAY5 0x161b
2610 do_facet ost1 $LCTL set_param fail_loc=0x161b
2612 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2613 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2616 do_facet ost1 $LCTL set_param fail_loc=0
# Wait for every MDT to report the layout LFSCK status 'completed'.
2618 for k in $(seq $MDSCOUNT); do
2619 # The LFSCK status query internal is 30 seconds. For the case
2620 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2621 # time to guarantee the status sync up.
2622 wait_update_facet mds${k} "$LCTL get_param -n \
2623 mdd.$(facet_svc mds${k}).lfsck_layout |
2624 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
2625 error "(2) MDS${k} is not the expected 'completed'"
# Each OST status is sampled once and must already be 'completed'.
2628 for k in $(seq $OSTCOUNT); do
2629 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2630 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2631 awk '/^status/ { print $2 }')
2632 [ "$cur_status" == "completed" ] ||
2633 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan count on mds1: 9 with more than two OSTs, else 4.
2636 local repaired=$(do_facet mds1 $LCTL get_param -n \
2637 mdd.$(facet_svc mds1).lfsck_layout |
2638 awk '/^repaired_orphan/ { print $2 }')
2639 if [ $OSTCOUNT -gt 2 ]; then
2640 [ $repaired -eq 9 ] ||
2641 error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
2643 [ $repaired -eq 4 ] ||
2644 error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
2647 mount_client $MOUNT || error "(5.0) Fail to start client!"
# Bit tested against the pattern printed by 'lfs getstripe -L'; the output is
# prefixed with 0x below so that shell arithmetic parses it as hexadecimal.
2649 LOV_PATTERN_F_HOLE=0x40000000
# --- Step 5: old f0 (nothing but the MDT-object lost): no hole expected,
# full stripe count, full size, and all normal operations must work.
2652 # ${fid0}-R-0 is the old f0
2654 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
2655 echo "Check $name, which is the old f0"
2657 $LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"
2659 local pattern=0x$($LFS getstripe -L $name)
2660 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2661 error "(5.2) NOT expect pattern flag hole, but got $pattern"
2663 local stripes=$($LFS getstripe -c $name)
2664 if [ $OSTCOUNT -gt 2 ]; then
2665 [ $stripes -eq 3 ] ||
2666 error "(5.3.1) expect the stripe count is 3, but got $stripes"
2668 [ $stripes -eq 2 ] ||
2669 error "(5.3.2) expect the stripe count is 2, but got $stripes"
2672 local size=$(stat $name | awk '/Size:/ { print $2 }')
2673 [ $size -eq $((4096 * $bcount)) ] ||
2674 error "(5.4) expect the size $((4096 * $bcount)), but got $size"
2676 cat $name > /dev/null || error "(5.5) cannot read $name"
2678 echo "dummy" >> $name || error "(5.6) cannot write $name"
2680 chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"
2682 touch $name || error "(5.8) cannot touch $name"
2684 rm -f $name || error "(5.9) cannot unlink $name"
# --- Step 6: old f1 (stripe0 lost): the hole flag must be set; reads and
# writes that touch the hole must fail while the other stripes stay usable.
2687 # ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
2689 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2690 if [ $OSTCOUNT -gt 2 ]; then
2691 echo "Check $name, it contains the old f1's stripe1 and stripe2"
2693 echo "Check $name, it contains the old f1's stripe1"
2696 $LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"
2698 pattern=0x$($LFS getstripe -L $name)
2699 [[ $((pattern & LOV_PATTERN_F_HOLE)) -ne 0 ]] ||
2700 error "(6.2) expect pattern flag hole, but got $pattern"
2702 stripes=$($LFS getstripe -c $name)
2703 if [ $OSTCOUNT -gt 2 ]; then
2704 [ $stripes -eq 3 ] ||
2705 error "(6.3.1) expect the stripe count is 3, but got $stripes"
2707 [ $stripes -eq 2 ] ||
2708 error "(6.3.2) expect the stripe count is 2, but got $stripes"
2711 size=$(stat $name | awk '/Size:/ { print $2 }')
2712 [ $size -eq $((4096 * $bcount)) ] ||
2713 error "(6.4) expect the size $((4096 * $bcount)), but got $size"
2715 cat $name > /dev/null && error "(6.5) normal read $name should fail"
# Copy the file block by block, continuing past errors, and count how many
# 4096-byte reads reported "Input/output error".
2717 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
2718 bs=4096 2>&1 | grep "Input/output error" | wc -l)
2721 [ $failures -eq 256 ] ||
2722 error "(6.6) expect 256 IO failures, but get $failures"
2724 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
2725 [ $size -eq $((4096 * $bcount)) ] ||
2726 error "(6.7) expect the size $((4096 * $bcount)), but got $size"
# Block 0 lies in the lost stripe0; block 300 (seek=300) lies in a stripe
# that survived.
2728 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
2729 error "(6.8) write to the LOV EA hole should fail"
2731 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
2732 error "(6.9) write to normal stripe should NOT fail"
2734 echo "foo" >> $name && error "(6.10) append write $name should fail"
2736 chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"
2738 touch $name || error "(6.12) cannot touch $name"
2740 rm -f $name || error "(6.13) cannot unlink $name"
# --- Step 7: old f2 (stripe1 lost). With more than two OSTs the hole sits
# between stripe0 and stripe2 (x.1 checks); with two OSTs only stripe0 is
# left, so a plain single-stripe file is expected (x.2 checks).
2743 # ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
2745 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
2746 if [ $OSTCOUNT -gt 2 ]; then
2747 echo "Check $name, it contains the old f2's stripe0 and stripe2"
2749 echo "Check $name, it contains the old f2's stripe0"
2752 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
2754 pattern=0x$($LFS getstripe -L $name)
2755 stripes=$($LFS getstripe -c $name)
2756 size=$(stat $name | awk '/Size:/ { print $2 }')
2757 if [ $OSTCOUNT -gt 2 ]; then
2758 [[ $((pattern & LOV_PATTERN_F_HOLE)) -ne 0 ]] ||
2759 error "(7.2.1) expect pattern flag hole, but got $pattern"
2761 [ $stripes -eq 3 ] ||
2762 error "(7.3.1) expect the stripe count is 3, but got $stripes"
2764 [ $size -eq $((4096 * $bcount)) ] ||
2765 error "(7.4.1) expect size $((4096 * $bcount)), but got $size"
2767 cat $name > /dev/null &&
2768 error "(7.5.1) normal read $name should fail"
2770 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
2771 bs=4096 2>&1 | grep "Input/output error" | wc -l)
2773 [ $failures -eq 256 ] ||
2774 error "(7.6) expect 256 IO failures, but get $failures"
2776 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
2777 [ $size -eq $((4096 * $bcount)) ] ||
2778 error "(7.7) expect the size $((4096 * $bcount)), but got $size"
# Here block 300 falls into the hole and block 0 into a surviving stripe,
# the reverse of step 6.
2780 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
2781 seek=300 && error "(7.8.0) write to the LOV EA hole should fail"
2783 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
2784 error "(7.8.1) write to normal stripe should NOT fail"
2786 echo "foo" >> $name &&
2787 error "(7.8.3) append write $name should fail"
2789 chown $RUNAS_ID:$RUNAS_GID $name ||
2790 error "(7.9.1) cannot chown on $name"
2792 touch $name || error "(7.10.1) cannot touch $name"
2794 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2795 error "(7.2.2) NOT expect pattern flag hole, but got $pattern"
2797 [ $stripes -eq 1 ] ||
2798 error "(7.3.2) expect the stripe count is 1, but got $stripes"
2801 [ $size -eq $((4096 * (256 + 0))) ] ||
2802 error "(7.4.2) expect the size $((4096 * 256)), but got $size"
2804 cat $name > /dev/null || error "(7.5.2) cannot read $name"
2806 echo "dummy" >> $name || error "(7.8.2) cannot write $name"
2808 chown $RUNAS_ID:$RUNAS_GID $name ||
2809 error "(7.9.2) cannot chown on $name"
2811 touch $name || error "(7.10.2) cannot touch $name"
2814 rm -f $name || error "(7.11) cannot unlink $name"
# Step 8 concerns f3, which only exists with more than two OSTs.
2816 [ $OSTCOUNT -le 2 ] && return
# --- Step 8: old f3 (last stripe lost): no hole is expected, just a shorter
# two-stripe file that supports all normal operations.
2819 # ${fid3}-R-0 should contains the old f3's stripe0 and stripe1
2821 name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
2822 echo "Check $name, which contains the old f3's stripe0 and stripe1"
2824 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
2826 pattern=0x$($LFS getstripe -L $name)
2827 [[ $((pattern & LOV_PATTERN_F_HOLE)) -eq 0 ]] ||
2828 error "(8.2) NOT expect pattern flag hole, but got $pattern"
2830 stripes=$($LFS getstripe -c $name)
2831 # LFSCK does not know the old f3 had 3 stripes.
2832 # It only tries to find as much as possible.
2833 # The stripe count depends on the last stripe's offset.
2834 [ $stripes -eq 2 ] ||
2835 error "(8.3) expect the stripe count is 2, but got $stripes"
2837 size=$(stat $name | awk '/Size:/ { print $2 }')
2839 [ $size -eq $((4096 * (256 + 256 + 0))) ] ||
2840 error "(8.4) expect the size $((4096 * 512)), but got $size"
2842 cat $name > /dev/null || error "(8.5) cannot read $name"
2844 echo "dummy" >> $name || error "(8.6) cannot write $name"
2846 chown $RUNAS_ID:$RUNAS_GID $name ||
2847 error "(8.7) cannot chown on $name"
2849 touch $name || error "(8.8) cannot touch $name"
2851 rm -f $name || error "(8.9) cannot unlink $name"
2853 run_test 20 "Handle the orphan with dummy LOV EA slot properly"
# Test 21: starting LFSCK without a '-t' type must start both the namespace
# and the layout component, and a plain lfsck_stop must stop them. Skipped
# when the MDS is older than 2.5.59.
2856 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.5.59) ]] &&
2857 skip "ignore the test if MDS is older than 2.5.59" && return
2859 check_mount_and_prep
2860 createmany -o $DIR/$tdir/f 100 || error "(0) Fail to create 100 files"
# NOTE(review): '-s 1' presumably throttles the scan so that both components
# are still in 'scanning-phase1' when sampled below -- confirm.
2862 echo "Start all LFSCK components by default (-s 1)"
2863 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -s 1 -r ||
2864 error "Fail to start LFSCK"
# Both status values are sampled once, without waiting.
2866 echo "namespace LFSCK should be in 'scanning-phase1' status"
2867 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
2868 [ "$STATUS" == "scanning-phase1" ] ||
2869 error "Expect namespace 'scanning-phase1', but got '$STATUS'"
2871 echo "layout LFSCK should be in 'scanning-phase1' status"
2872 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
2873 [ "$STATUS" == "scanning-phase1" ] ||
2874 error "Expect layout 'scanning-phase1', but got '$STATUS'"
2876 echo "Stop all LFSCK components by default"
2877 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 ||
2878 error "Fail to stop LFSCK"
2880 run_test 21 "run all LFSCK components by default"
# Test 22a: a child directory's ".." entry points at a parent that no longer
# exists, while another parent still references the child by name. Namespace
# LFSCK must repair the ".." entry (one unmatched pair). Requires at least
# two MDTs.
2883 [ $MDSCOUNT -lt 2 ] &&
2884 skip "We need at least 2 MDSes for this test" && return
# The inner double quotes are escaped so that ".." is printed with its
# quotes; unescaped they close and reopen the string and are dropped.
2887 echo "The parent_A references the child directory via some name entry,"
2888 echo "but the child directory back references another parent_B via its"
2889 echo "\"..\" name entry. The parent_B does not exist. Then the namespace"
2890 echo "LFSCK will repair the child directory's \"..\" name entry."
2893 check_mount_and_prep
# guard and foo are created on MDT index 1.
2895 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
2896 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
2898 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
2899 echo "The dummy's dotdot name entry references the guard."
# dummy is created on MDT index 0 while the fail_loc is armed.
2900 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
2901 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
2902 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
2903 error "(3) Fail to mkdir on MDT0"
2904 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Removing guard leaves dummy's ".." pointing at a non-existent parent.
2906 rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"
2908 echo "Trigger namespace LFSCK to repair unmatched pairs"
2909 $START_NAMESPACE -A -r ||
2910 error "(5) Fail to start LFSCK for namespace"
2912 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2913 mdd.${MDT_DEV}.lfsck_namespace |
2914 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2916 error "(6) unexpected status"
# Exactly one unmatched pair must have been repaired.
2919 local repaired=$($SHOW_NAMESPACE |
2920 awk '/^unmatched_pairs_repaired/ { print $2 }')
2921 [ $repaired -eq 1 ] ||
2922 error "(7) Fail to repair unmatched pairs: $repaired"
2924 echo "'ls' should success after namespace LFSCK repairing"
2925 ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
2926 error "(8) ls should success."
2928 run_test 22a "LFSCK can repair unmatched pairs (1)"
# Test 22b: a child directory's ".." entry and linkEA point at an existing
# parent that has no name entry for the child. Namespace LFSCK must repair
# both, after which fid2path resolves the child's FID to its real path.
# Requires at least two MDTs.
2931 [ $MDSCOUNT -lt 2 ] &&
2932 skip "We need at least 2 MDSes for this test" && return
# The inner double quotes are escaped so that ".." is printed with its
# quotes; unescaped they close and reopen the string and are dropped.
2935 echo "The parent_A references the child directory via the name entry_B,"
2936 echo "but the child directory back references another parent_C via its"
2937 echo "\"..\" name entry. The parent_C exists, but there is no the name"
2938 echo "entry_B under the parent_C. Then the namespace LFSCK will repair"
2939 echo "the child directory's \"..\" name entry and its linkEA."
2942 check_mount_and_prep
# guard and foo are created on MDT index 1.
2944 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
2945 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
2947 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
2948 echo "and bad linkEA. The dummy's dotdot name entry references the"
2949 echo "guard. The dummy's linkEA references n non-exist name entry."
# dummy is created on MDT index 0 while the fail_loc is armed.
2950 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
2951 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
2952 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
2953 error "(3) Fail to mkdir on MDT0"
2954 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Before the repair, fid2path must not map the FID back to the real path.
2956 local dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
2957 echo "fid2path should NOT work on the dummy's FID $dummyfid"
2958 local dummyname=$($LFS fid2path $DIR $dummyfid)
2959 [ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
2960 error "(4) fid2path works unexpectedly."
2962 echo "Trigger namespace LFSCK to repair unmatched pairs"
2963 $START_NAMESPACE -A -r ||
2964 error "(5) Fail to start LFSCK for namespace"
2966 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2967 mdd.${MDT_DEV}.lfsck_namespace |
2968 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2970 error "(6) unexpected status"
# Exactly one unmatched pair must have been repaired.
2973 local repaired=$($SHOW_NAMESPACE |
2974 awk '/^unmatched_pairs_repaired/ { print $2 }')
2975 [ $repaired -eq 1 ] ||
2976 error "(7) Fail to repair unmatched pairs: $repaired"
# dummyname is already local here, so it is only re-assigned.
2978 echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
2979 dummyname=$($LFS fid2path $DIR $dummyfid)
2980 [ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
2981 error "(8) fid2path does not work"
2983 run_test 22b "LFSCK can repair unmatched pairs (2)"
# Body of the test registered below via run_test as "23a".
# Scenario (per its own messages): a name entry exists but the MDT-object it
# names does not. Namespace LFSCK is run twice: first without -C (the entry
# is counted as repaired but 'ls' must still fail), then with -C (after
# which 'ls' must succeed).
# NOTE(review): line-numbered excerpt; the enclosing function and the '}'
# that closes each '|| {' group are not visible here.
# Skip unless at least 2 MDSes are configured.
2986 [ $MDSCOUNT -lt 2 ] &&
2987 skip "We need at least 2 MDSes for this test" && return
2990 echo "The name entry is there, but the MDT-object for such name "
2991 echo "entry does not exist. The namespace LFSCK should find out "
2992 echo "and repair the inconsistency as required."
2995 check_mount_and_prep
# d0 is created on MDT index 0, its child d1 on MDT index 1.
2997 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
2998 $LFS mkdir -i 1 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT1"
3000 echo "Inject failure stub on MDT1 to simulate dangling name entry"
3001 #define OBD_FAIL_LFSCK_DANGLING2 0x1620
# The fail_loc is set on mds2 only for the duration of the rmdir.
3002 do_facet mds2 $LCTL set_param fail_loc=0x1620
3003 rmdir $DIR/$tdir/d0/d1 || error "(3) Fail to rmdir d1"
3004 do_facet mds2 $LCTL set_param fail_loc=0
3006 echo "'ls' should fail because of dangling name entry"
3007 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(4) ls should fail."
# First pass: started with -A -r, without -C.
3009 echo "Trigger namespace LFSCK to find out dangling name entry"
3010 $START_NAMESPACE -A -r ||
3011 error "(5) Fail to start LFSCK for namespace"
3013 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3014 mdd.${MDT_DEV}.lfsck_namespace |
3015 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3017 error "(6) unexpected status"
3020 local repaired=$($SHOW_NAMESPACE |
3021 awk '/^dangling_repaired/ { print $2 }')
3022 [ $repaired -eq 1 ] ||
3023 error "(7) Fail to repair dangling name entry: $repaired"
# The counter is 1, yet the path must still be unusable at this point.
3025 echo "'ls' should fail because not re-create MDT-object by default"
3026 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(8) ls should fail."
# Second pass: same options plus -C (presumably enables re-creating the
# missing MDT-object, judging by the message above -- confirm).
3028 echo "Trigger namespace LFSCK again to repair dangling name entry"
3029 $START_NAMESPACE -A -r -C ||
3030 error "(9) Fail to start LFSCK for namespace"
3032 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3033 mdd.${MDT_DEV}.lfsck_namespace |
3034 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3036 error "(10) unexpected status"
3039 repaired=$($SHOW_NAMESPACE |
3040 awk '/^dangling_repaired/ { print $2 }')
3041 [ $repaired -eq 1 ] ||
3042 error "(11) Fail to repair dangling name entry: $repaired"
3044 echo "'ls' should success after namespace LFSCK repairing"
3045 ls -ail $DIR/$tdir/d0/d1 > /dev/null || error "(12) ls should success."
3047 run_test 23a "LFSCK can repair dangling name entry (1)"
# Body of the test registered below via run_test as "23b".
# Scenario (per its own messages): one hard link of an object is made to
# reference a non-existent object; LFSCK first re-creates that object and
# later replaces it with the real one. The test passes when both counters
# read 1 and the link's content equals that of the original file.
# NOTE(review): line-numbered excerpt; the enclosing function and the '}'
# that closes each '|| {' group are not visible here.
3051 echo "The objectA has multiple hard links, one of them corresponding"
3052 echo "to the name entry_B. But there is something wrong for the name"
3053 echo "entry_B and cause entry_B to references non-exist object_C."
3054 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3055 echo "as dangling, and re-create the lost object_C. When the LFSCK"
3056 echo "comes to the second-stage scanning, it will find that the"
3057 echo "former re-creating object_C is not proper, and will try to"
3058 echo "replace the object_C with the real object_A."
3061 check_mount_and_prep
# f0 holds "dummy" and f1 holds "dead"; both live under d0 (MDT index 0).
3063 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3064 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3065 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
3067 echo "Inject failure stub on MDT0 to simulate dangling name entry"
3068 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
# The fail_loc is active only while the hard link foo -> f0 is created.
3069 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1621
3070 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
3071 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3073 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
3075 echo "'ls' should fail because of dangling name entry"
3076 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3077 error "(6) ls should fail."
# Single run with -r -C (no -A in this test).
3079 echo "Trigger namespace LFSCK to find out dangling name entry"
3080 $START_NAMESPACE -r -C ||
3081 error "(7) Fail to start LFSCK for namespace"
3083 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3084 mdd.${MDT_DEV}.lfsck_namespace |
3085 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3087 error "(8) unexpected status"
# Both the dangling and the multiple-linked counters must be exactly 1.
3090 local repaired=$($SHOW_NAMESPACE |
3091 awk '/^dangling_repaired/ { print $2 }')
3092 [ $repaired -eq 1 ] ||
3093 error "(9) Fail to repair dangling name entry: $repaired"
3095 repaired=$($SHOW_NAMESPACE |
3096 awk '/^multiple_linked_repaired/ { print $2 }')
3097 [ $repaired -eq 1 ] ||
3098 error "(10) Fail to drop the former created object: $repaired"
# foo must now read back the content originally written to f0.
3100 local data=$(cat $DIR/$tdir/d0/foo)
3101 [ "$data" == "dummy" ] ||
3102 error "(11) The $DIR/$tdir/d0/foo is not recovered: $data"
3104 run_test 23b "LFSCK can repair dangling name entry (2)"
# Body of the test registered below via run_test as "23c".
# Scenario (per its own messages): as in 23b, one hard link references a
# non-existent object and LFSCK re-creates it; here the re-created object is
# modified before LFSCK finishes, so LFSCK must NOT replace it with the
# original object.
# NOTE(review): line-numbered excerpt; the enclosing function and the '}'
# that closes each '|| {' group are not visible here and are left as found.
3108 echo "The objectA has multiple hard links, one of them corresponding"
3109 echo "to the name entry_B. But there is something wrong for the name"
3110 echo "entry_B and cause entry_B to references non-exist object_C."
3111 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3112 echo "as dangling, and re-create the lost object_C. And then others"
3113 echo "modified the re-created object_C. When the LFSCK comes to the"
3114 echo "second-stage scanning, it will find that the former re-creating"
3115 echo "object_C maybe wrong and try to replace the object_C with the"
3116 echo "real object_A. But because object_C has been modified, so the"
3117 echo "LFSCK cannot replace it."
3120 check_mount_and_prep
# f0 holds "dummy" and f1 holds "dead"; both live under d0 (MDT index 0).
3122 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3123 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3124 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
3126 echo "Inject failure stub on MDT0 to simulate dangling name entry"
3127 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
# The fail_loc is active only while the hard link foo -> f0 is created.
3128 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1621
3129 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
3130 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3132 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
3134 echo "'ls' should fail because of dangling name entry"
3135 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3136 error "(6) ls should fail."
# fail_val=10 with fail_loc 0x1602 is set before LFSCK starts and cleared
# only after foo has been modified below.
3138 #define OBD_FAIL_LFSCK_DELAY3 0x1602
3139 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
3141 echo "Trigger namespace LFSCK to find out dangling name entry"
3142 $START_NAMESPACE -r -C ||
3143 error "(7) Fail to start LFSCK for namespace"
# Wait until the client sees foo with size 0. On timeout, show the state of
# the file being polled; the previous diagnostic stat'ed $DIR/$tdir/guard,
# a path this test never creates.
3145 wait_update_facet client "stat $DIR/$tdir/d0/foo |
3146 awk '/Size/ { print \\\$2 }'" "0" 32 || {
3147 stat $DIR/$tdir/d0/foo
3149 error "(8) unexpected size"
# Modify foo while LFSCK is still running.
3152 echo "data" >> $DIR/$tdir/d0/foo || error "(9) Fail to write"
3153 cancel_lru_locks osc
# Clear the injected delay, then wait for LFSCK to report "completed".
3155 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3156 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3157 mdd.${MDT_DEV}.lfsck_namespace |
3158 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3160 error "(10) unexpected status"
3163 local repaired=$($SHOW_NAMESPACE |
3164 awk '/^dangling_repaired/ { print $2 }')
3165 [ $repaired -eq 1 ] ||
3166 error "(11) Fail to repair dangling name entry: $repaired"
# foo must NOT read back f0's content ("dummy").
3168 local data=$(cat $DIR/$tdir/d0/foo)
3169 [ "$data" != "dummy" ] ||
3170 error "(12) The $DIR/$tdir/d0/foo should not be recovered"
3172 run_test 23c "LFSCK can repair dangling name entry (3)"
# Body of the test registered below via run_test as "24".
# Scenario (per its own messages): two MDT-objects carry the same linkEA
# entry but the name entry references only one of them; LFSCK must drop the
# stale linkEA entry and place the unreferenced object under
# .lustre/lost+found/MDT0000/.
# NOTE(review): line-numbered excerpt; the enclosing function and the '}'
# that closes each '|| {' group are not visible here.
# Skip unless at least 2 MDSes are configured.
3175 [ $MDSCOUNT -lt 2 ] &&
3176 skip "We need at least 2 MDSes for this test" && return
3179 echo "Two MDT-objects back reference the same name entry via their"
3180 echo "each own linkEA entry, but the name entry only references one"
3181 echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
3182 echo "for the MDT-object that is not recognized. If such MDT-object"
3183 echo "has no other linkEA entry after the removing, then the LFSCK"
3184 echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."
3187 check_mount_and_prep
# d0 is on MDT index 1; guard and dummy are plain mkdir children of d0.
# Their FIDs are printed for the log.
3189 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3191 mkdir $DIR/$tdir/d0/guard || error "(1) Fail to mkdir guard"
3192 $LFS path2fid $DIR/$tdir/d0/guard
3194 mkdir $DIR/$tdir/d0/dummy || error "(2) Fail to mkdir dummy"
3195 $LFS path2fid $DIR/$tdir/d0/dummy
# pfid is the parent FID used later to build the expected orphan name.
# NOTE(review): no else/fi is visible for this conditional in this excerpt;
# presumably the two assignments are alternative branches -- confirm against
# the full file.
3198 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
3199 pfid=$($LFS path2fid $DIR/$tdir/d0/guard)
3201 pfid=$($LFS path2fid $DIR/$tdir/d0/dummy)
3204 touch $DIR/$tdir/d0/guard/foo ||
3205 error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"
3207 echo "Inject failure stub on MDT0 to simulate the case that"
3208 echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
3209 echo "that references $DIR/$tdir/d0/guard/foo."
3210 echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
3211 echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
3212 echo "there with the same linkEA entry as another MDT-object"
3213 echo "$DIR/$tdir/d0/guard/foo has"
3215 #define OBD_FAIL_LFSCK_MUL_REF 0x1622
# The fail_loc stays set across both the mkdir and the rmdir of dummy/foo;
# the child FID is captured in between, while the path still resolves.
3216 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
3217 $LFS mkdir -i 0 $DIR/$tdir/d0/dummy/foo ||
3218 error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
3219 $LFS path2fid $DIR/$tdir/d0/dummy/foo
3220 local cfid=$($LFS path2fid $DIR/$tdir/d0/dummy/foo)
3221 rmdir $DIR/$tdir/d0/dummy/foo ||
3222 error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
3223 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3225 echo "stat $DIR/$tdir/d0/dummy/foo should fail"
3226 stat $DIR/$tdir/d0/dummy/foo > /dev/null 2>&1 &&
3227 error "(6) stat successfully unexpectedly"
3229 echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
3230 $START_NAMESPACE -A -r ||
3231 error "(7) Fail to start LFSCK for namespace"
3233 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3234 mdd.${MDT_DEV}.lfsck_namespace |
3235 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3237 error "(8) unexpected status"
3240 local repaired=$($SHOW_NAMESPACE |
3241 awk '/^multiple_referenced_repaired/ { print $2 }')
3242 [ $repaired -eq 1 ] ||
3243 error "(9) Fail to repair multiple referenced name entry: $repaired"
3245 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
3246 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
3247 error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# The orphan is looked up by the name "<child FID>-<pfid>-D-0".
3249 local cname="$cfid-$pfid-D-0"
3250 ls -ail $MOUNT/.lustre/lost+found/MDT0000/$cname ||
3251 error "(11) .lustre/lost+found/MDT0000/ should not be empty"
3253 run_test 24 "LFSCK can repair multiple-referenced name entry"
# Body of the test registered below via run_test as "25".
# Scenario (per its own messages): the file type recorded in a name entry
# disagrees with the referenced object's type; LFSCK must correct the entry.
# NOTE(review): line-numbered excerpt; the enclosing function and the '}'
# that closes the '|| {' group are not visible here.
# Skip unless the backend of $SINGLEMDS is ldiskfs.
3256 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
3257 skip "Only support to inject failure on ldiskfs" && return
3260 echo "The file type in the name entry does not match the file type"
3261 echo "claimed by the referenced object. Then the LFSCK will update"
3262 echo "the file type in the name entry."
3265 check_mount_and_prep
3267 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3269 echo "Inject failure stub on MDT0 to simulate the case that"
3270 echo "the file type stored in the name entry is wrong."
3272 #define OBD_FAIL_LFSCK_BAD_TYPE 0x1623
# The fail_loc is active only while d0/foo is created.
3273 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
3274 touch $DIR/$tdir/d0/foo || error "(2) Fail to touch $DIR/$tdir/d0/foo"
3275 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3277 echo "Trigger namespace LFSCK to repair bad file type in the name entry"
3278 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace"
3280 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3281 mdd.${MDT_DEV}.lfsck_namespace |
3282 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3284 error "(4) unexpected status"
# Exactly one bad-file-type repair is expected.
3287 local repaired=$($SHOW_NAMESPACE |
3288 awk '/^bad_file_type_repaired/ { print $2 }')
3289 [ $repaired -eq 1 ] ||
3290 error "(5) Fail to repair bad file type in name entry: $repaired"
# Final check: listing the directory must succeed.
3292 ls -ail $DIR/$tdir/d0 || error "(6) Fail to 'ls' the $DIR/$tdir/d0"
3294 run_test 25 "LFSCK can repair bad file type in the name entry"
# Body of the test registered below via run_test as "26a".
# Scenario (per its own messages): the local name entry of an object is
# lost while the object and its linkEA remain; LFSCK must re-insert the name
# entry, and the restored path must map to the same FID as before.
# NOTE(review): line-numbered excerpt; the enclosing function and the '}'
# that closes the '|| {' group are not visible here and are left as found.
3298 echo "The local name entry back referenced by the MDT-object is lost."
3299 echo "The namespace LFSCK will add the missing local name entry back"
3300 echo "to the normal namespace."
3303 check_mount_and_prep
# Remember foo's FID so it can be compared after the repair.
3305 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3306 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3307 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
3309 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
3310 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3312 echo "Inject failure stub on MDT0 to simulate the case that"
3313 echo "foo's name entry will be removed, but the foo's object"
3314 echo "and its linkEA are kept in the system."
3316 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fail_loc is active only while foo is unlinked.
3317 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3318 rm -f $DIR/$tdir/d0/foo || error "(4) Fail to unlink $DIR/$tdir/d0/foo"
3319 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# 'error' was missing here, so an unexpectedly successful 'ls' only tried
# to execute the message string as a command and the test kept going.
3321 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 && error "(5) 'ls' should fail"
3323 echo "Trigger namespace LFSCK to repair the missing remote name entry"
3324 $START_NAMESPACE -r -A ||
3325 error "(6) Fail to start LFSCK for namespace"
3327 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3328 mdd.${MDT_DEV}.lfsck_namespace |
3329 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3331 error "(7) unexpected status"
# Exactly one lost directory entry must be reported as repaired.
3334 local repaired=$($SHOW_NAMESPACE |
3335 awk '/^lost_dirent_repaired/ { print $2 }')
3336 [ $repaired -eq 1 ] ||
3337 error "(8) Fail to repair lost dirent: $repaired"
3339 ls -ail $DIR/$tdir/d0/foo ||
3340 error "(9) Fail to 'ls' $DIR/$tdir/d0/foo"
# The restored name must map to the FID recorded before the injection.
3342 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
3343 [ "$foofid" == "$foofid2" ] ||
3344 error "(10) foo's FID changed: $foofid, $foofid2"
3346 run_test 26a "LFSCK can add the missing local name entry back to the namespace"
# Body of the test registered below via run_test as "26b".
# Scenario (per its own messages): the remote name entry of a directory is
# lost while the object and its linkEA remain; LFSCK must re-insert the name
# entry, and the restored path must map to the same FID as before.
# NOTE(review): line-numbered excerpt; the enclosing function and the '}'
# that closes the '|| {' group are not visible here and are left as found.
# Skip unless at least 2 MDSes are configured.
3349 [ $MDSCOUNT -lt 2 ] &&
3350 skip "We need at least 2 MDSes for this test" && return
3353 echo "The remote name entry back referenced by the MDT-object is lost."
3354 echo "The namespace LFSCK will add the missing remote name entry back"
3355 echo "to the normal namespace."
3358 check_mount_and_prep
# Parent d0 is on MDT index 1, child foo on MDT index 0; remember foo's FID.
3360 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3361 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
3362 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
3364 echo "Inject failure stub on MDT0 to simulate the case that"
3365 echo "foo's name entry will be removed, but the foo's object"
3366 echo "and its linkEA are kept in the system."
3368 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fail_loc is active only while foo is removed.
3369 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3370 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
3371 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# 'error' was missing here, so an unexpectedly successful 'ls' only tried
# to execute the message string as a command and the test kept going.
3373 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 && error "(4) 'ls' should fail"
3375 echo "Trigger namespace LFSCK to repair the missing remote name entry"
3376 $START_NAMESPACE -r -A ||
3377 error "(5) Fail to start LFSCK for namespace"
3379 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3380 mdd.${MDT_DEV}.lfsck_namespace |
3381 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3383 error "(6) unexpected status"
# Exactly one lost directory entry must be reported as repaired.
3386 local repaired=$($SHOW_NAMESPACE |
3387 awk '/^lost_dirent_repaired/ { print $2 }')
3388 [ $repaired -eq 1 ] ||
3389 error "(7) Fail to repair lost dirent: $repaired"
3391 ls -ail $DIR/$tdir/d0/foo ||
3392 error "(8) Fail to 'ls' $DIR/$tdir/d0/foo"
# The restored name must map to the FID recorded before the injection.
3394 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
3395 [ "$foofid" == "$foofid2" ] ||
3396 error "(9) foo's FID changed: $foofid, $foofid2"
3398 run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
# Body of the test registered below via run_test as "27a".
# Scenario (per its own messages): the local parent referenced by an
# object's linkEA is lost; LFSCK must re-create that parent as an orphan,
# which is then looked for under .lustre/lost+found/MDT0000/.
# NOTE(review): line-numbered excerpt; the enclosing function and the '}'
# that closes the '|| {' group are not visible here and are left as found.
# NOTE(review): the error tags (5) and (6) are each used twice below.
3402 echo "The local parent referenced by the MDT-object linkEA is lost."
3403 echo "The namespace LFSCK will re-create the lost parent as orphan."
3406 check_mount_and_prep
3408 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3409 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3410 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
3411 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3413 echo "Inject failure stub on MDT0 to simulate the case that"
3414 echo "foo's name entry will be removed, but the foo's object"
3415 echo "and its linkEA are kept in the system. And then remove"
3416 echo "another hard link and the parent directory."
3418 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# Both names (foo and dummy) are unlinked while the fail_loc is set.
3419 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3420 rm -f $DIR/$tdir/d0/foo ||
3421 error "(4) Fail to unlink $DIR/$tdir/d0/foo"
3422 rm -f $DIR/$tdir/d0/dummy ||
3423 error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
3424 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# The parent is removed after the fail_loc has been cleared.
3426 rm -rf $DIR/$tdir/d0 || error "(5) Fail to unlink the dir d0"
# 'error' was missing here, so an unexpectedly successful 'ls' only tried
# to execute the message string as a command and the test kept going.
3427 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(6) 'ls' should fail"
3429 echo "Trigger namespace LFSCK to repair the lost parent"
3430 $START_NAMESPACE -r -A ||
3431 error "(6) Fail to start LFSCK for namespace"
3433 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3434 mdd.${MDT_DEV}.lfsck_namespace |
3435 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3437 error "(7) unexpected status"
3440 local repaired=$($SHOW_NAMESPACE |
3441 awk '/^lost_dirent_repaired/ { print $2 }')
3442 [ $repaired -eq 1 ] ||
3443 error "(8) Fail to repair lost dirent: $repaired"
3445 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
3446 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
3447 error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
3449 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# The pattern is quoted so that find receives it verbatim; unquoted, the
# shell could expand it against the current directory first.
3451 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-P-*")
3452 [ ! -z "$cname" ] ||
3453 error "(10) .lustre/lost+found/MDT0000/ should not be empty"
3455 run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
# Body of the test registered below via run_test as "27b".
# Scenario (per its own messages): the remote parent referenced by an
# object's linkEA is lost; LFSCK must re-create that parent as an orphan,
# which is then looked for under .lustre/lost+found/MDT0001/.
# NOTE(review): line-numbered excerpt; the enclosing function and the '}'
# that closes the '|| {' group are not visible here and are left as found.
# Skip unless at least 2 MDSes are configured.
3458 [ $MDSCOUNT -lt 2 ] &&
3459 skip "We need at least 2 MDSes for this test" && return
3462 echo "The remote parent referenced by the MDT-object linkEA is lost."
3463 echo "The namespace LFSCK will re-create the lost parent as orphan."
3466 check_mount_and_prep
# Parent d0 is on MDT index 1, child foo on MDT index 0; the parent's FID
# is printed for the log.
3468 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3469 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
3471 $LFS path2fid $DIR/$tdir/d0
3473 echo "Inject failure stub on MDT0 to simulate the case that"
3474 echo "foo's name entry will be removed, but the foo's object"
3475 echo "and its linkEA are kept in the system. And then remove"
3476 echo "the parent directory."
3478 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fail_loc is active only while foo is removed.
3479 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3480 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
3481 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3483 rmdir $DIR/$tdir/d0 || error "(4) Fail to unlink the dir d0"
# 'error' was missing here, so an unexpectedly successful 'ls' only tried
# to execute the message string as a command and the test kept going.
3484 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(5) 'ls' should fail"
3486 echo "Trigger namespace LFSCK to repair the missing remote name entry"
3487 $START_NAMESPACE -r -A ||
3488 error "(6) Fail to start LFSCK for namespace"
3490 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3491 mdd.${MDT_DEV}.lfsck_namespace |
3492 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3494 error "(7) unexpected status"
3497 local repaired=$($SHOW_NAMESPACE |
3498 awk '/^lost_dirent_repaired/ { print $2 }')
3499 [ $repaired -eq 1 ] ||
3500 error "(8) Fail to repair lost dirent: $repaired"
3502 ls -ail $MOUNT/.lustre/lost+found/
3504 echo "There should be an orphan under .lustre/lost+found/MDT0001/"
3505 [ -d $MOUNT/.lustre/lost+found/MDT0001 ] ||
3506 error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"
3508 ls -ail $MOUNT/.lustre/lost+found/MDT0001/
# The pattern is quoted so that find receives it verbatim; unquoted, the
# shell could expand it against the current directory first.
3510 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name "*-P-*")
3511 [ ! -z "$cname" ] ||
3512 error "(10) .lustre/lost+found/MDT0001/ should not be empty"
3514 run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
# Body of the test registered below via run_test as "28".
# Scenario (per its own messages): name entries are lost on two MDTs, and
# one MDT fails to answer verification requests during the first scan.
# The first LFSCK run must end "partial" on mds1 and "completed" on mds2,
# with a repair counted only on mds2; a second run must complete on every
# MDS with the remaining repair counted on mds1.
# NOTE(review): line-numbered excerpt; the enclosing function, the '}' that
# closes each '|| {' group and the 'done' of the for loop are not visible.
# Skip unless at least 2 MDSes are configured.
3517 [ $MDSCOUNT -lt 2 ] &&
3518 skip "The test needs at least 2 MDTs" && return
3521 echo "The target name entry is lost. The LFSCK should insert the"
3522 echo "orphan MDT-object under .lustre/lost+found/MDTxxxx. But if"
3523 echo "the MDT (on which the orphan MDT-object resides) has ever"
3524 echo "failed to respond some name entry verification during the"
3525 echo "first stage-scanning, then the LFSCK should skip to handle"
3526 echo "orphan MDT-object on this MDT. But other MDTs should not"
# Layout: d1 on MDT index 0 with children a1/a2 on index 1, and d2 on MDT
# index 1 with children a1/a2 on index 0. These mkdirs are not checked.
3530 check_mount_and_prep
3531 $LFS mkdir -i 0 $DIR/$tdir/d1
3532 $LFS mkdir -i 1 $DIR/$tdir/d1/a1
3533 $LFS mkdir -i 1 $DIR/$tdir/d1/a2
3535 $LFS mkdir -i 1 $DIR/$tdir/d2
3536 $LFS mkdir -i 0 $DIR/$tdir/d2/a1
3537 $LFS mkdir -i 0 $DIR/$tdir/d2/a2
3539 echo "Inject failure stub on MDT0 to simulate the case that"
3540 echo "d1/a1's name entry will be removed, but the d1/a1's object"
3541 echo "and its linkEA are kept in the system. And the case that"
3542 echo "d2/a2's name entry will be removed, but the d2/a2's object"
3543 echo "and its linkEA are kept in the system."
3545 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fail_loc is set on both mds1 and mds2 while d1/a1 and d2/a2 are
# removed, then cleared on both.
3546 do_facet mds1 $LCTL set_param fail_loc=0x1624
3547 do_facet mds2 $LCTL set_param fail_loc=0x1624
3548 rmdir $DIR/$tdir/d1/a1 || error "(1) Fail to rmdir $DIR/$tdir/d1/a1"
3549 rmdir $DIR/$tdir/d2/a2 || error "(2) Fail to rmdir $DIR/$tdir/d2/a2"
3550 do_facet mds1 $LCTL set_param fail_loc=0
3551 do_facet mds2 $LCTL set_param fail_loc=0
3553 cancel_lru_locks mdc
3554 cancel_lru_locks osc
3556 echo "Inject failure, to simulate the MDT0 fail to handle"
3557 echo "MDT1 LFSCK request during the first-stage scanning."
3558 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
# This fail_loc is set on mds2 and stays set until after the first run.
3559 do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0
3561 echo "Trigger namespace LFSCK on all devices to find out orphan object"
3562 $START_NAMESPACE -r -A ||
3563 error "(3) Fail to start LFSCK for namespace"
# First run: mds1 must end as "partial", mds2 as "completed".
3565 wait_update_facet mds1 "$LCTL get_param -n \
3566 mdd.$(facet_svc mds1).lfsck_namespace |
3567 awk '/^status/ { print \\\$2 }'" "partial" 32 || {
3568 error "(4) mds1 is not the expected 'partial'"
3571 wait_update_facet mds2 "$LCTL get_param -n \
3572 mdd.$(facet_svc mds2).lfsck_namespace |
3573 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3574 error "(5) mds2 is not the expected 'completed'"
3577 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
# After the first run: no lost dirent repaired on mds1, exactly one on mds2.
3579 local repaired=$(do_facet mds1 $LCTL get_param -n \
3580 mdd.$(facet_svc mds1).lfsck_namespace |
3581 awk '/^lost_dirent_repaired/ { print $2 }')
3582 [ $repaired -eq 0 ] ||
3583 error "(6) Expect 0 fixed on mds1, but got: $repaired"
3585 repaired=$(do_facet mds2 $LCTL get_param -n \
3586 mdd.$(facet_svc mds2).lfsck_namespace |
3587 awk '/^lost_dirent_repaired/ { print $2 }')
3588 [ $repaired -eq 1 ] ||
3589 error "(7) Expect 1 fixed on mds2, but got: $repaired"
# Second run, with no fail_loc set: every MDS must reach "completed".
3591 echo "Trigger namespace LFSCK on all devices again to cleanup"
3592 $START_NAMESPACE -r -A ||
3593 error "(8) Fail to start LFSCK for namespace"
3595 for k in $(seq $MDSCOUNT); do
3596 # The LFSCK status query internal is 30 seconds. For the case
3597 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3598 # time to guarantee the status sync up.
3599 wait_update_facet mds${k} "$LCTL get_param -n \
3600 mdd.$(facet_svc mds${k}).lfsck_namespace |
3601 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3602 error "(9) MDS${k} is not the expected 'completed'"
# After the second run the counts are reversed: one on mds1, none on mds2.
3605 local repaired=$(do_facet mds1 $LCTL get_param -n \
3606 mdd.$(facet_svc mds1).lfsck_namespace |
3607 awk '/^lost_dirent_repaired/ { print $2 }')
3608 [ $repaired -eq 1 ] ||
3609 error "(10) Expect 1 fixed on mds1, but got: $repaired"
3611 repaired=$(do_facet mds2 $LCTL get_param -n \
3612 mdd.$(facet_svc mds2).lfsck_namespace |
3613 awk '/^lost_dirent_repaired/ { print $2 }')
3614 [ $repaired -eq 0 ] ||
3615 error "(11) Expect 0 fixed on mds2, but got: $repaired"
3617 run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
# Body of the test registered below via run_test as "29a".
# Scenario (per its own messages): an object's nlink is larger than its
# number of name entries. The test checks nlink reads 3 after the injection
# and 2 after the LFSCK repair.
# NOTE(review): line-numbered excerpt; the enclosing function and the '}'
# that closes the '|| {' group are not visible here.
3621 echo "The object's nlink attribute is larger than the object's known"
3622 echo "name entries count. The LFSCK will repair the object's nlink"
3623 echo "attribute to match the known name entries count"
3626 check_mount_and_prep
3628 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3629 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3631 echo "Inject failure stub on MDT0 to simulate the case that foo's"
3632 echo "nlink attribute is larger than its name entries count."
3634 #define OBD_FAIL_LFSCK_MORE_NLINK 0x1625
# The fail_loc is active only while the hard link h1 is created.
3635 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
3636 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
3637 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3638 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Confirm the injection took effect: two names exist but nlink must read 3.
3640 cancel_lru_locks mdc
3641 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
3642 [ $count -eq 3 ] || error "(4) Cannot inject error: $count"
3644 echo "Trigger namespace LFSCK to repair the nlink count"
3645 $START_NAMESPACE -r -A ||
3646 error "(5) Fail to start LFSCK for namespace"
3648 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3649 mdd.${MDT_DEV}.lfsck_namespace |
3650 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3652 error "(6) unexpected status"
3655 local repaired=$($SHOW_NAMESPACE |
3656 awk '/^nlinks_repaired/ { print $2 }')
3657 [ $repaired -eq 1 ] ||
3658 error "(7) Fail to repair nlink count: $repaired"
# Re-read nlink after the repair; it must now be 2.
3660 cancel_lru_locks mdc
3661 count=$(stat --format=%h $DIR/$tdir/d0/foo)
3662 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
3664 run_test 29a "LFSCK can repair bad nlink count (1)"
# Body of the test registered below via run_test as "29b".
# Scenario (per its own messages): an object's nlink is smaller than its
# number of name entries. The test checks nlink reads 1 after the injection
# and 2 after the LFSCK repair.
# NOTE(review): line-numbered excerpt; the enclosing function and the '}'
# that closes the '|| {' group are not visible here.
3668 echo "The object's nlink attribute is smaller than the object's known"
3669 echo "name entries count. The LFSCK will repair the object's nlink"
3670 echo "attribute to match the known name entries count"
3673 check_mount_and_prep
3675 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3676 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3678 echo "Inject failure stub on MDT0 to simulate the case that foo's"
3679 echo "nlink attribute is smaller than its name entries count."
3681 #define OBD_FAIL_LFSCK_LESS_NLINK 0x1626
# The fail_loc is active only while the hard link h1 is created.
3682 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
3683 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
3684 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3685 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Confirm the injection took effect: two names exist but nlink must read 1.
3687 cancel_lru_locks mdc
3688 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
3689 [ $count -eq 1 ] || error "(4) Cannot inject error: $count"
3691 echo "Trigger namespace LFSCK to repair the nlink count"
3692 $START_NAMESPACE -r -A ||
3693 error "(5) Fail to start LFSCK for namespace"
3695 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3696 mdd.${MDT_DEV}.lfsck_namespace |
3697 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3699 error "(6) unexpected status"
3702 local repaired=$($SHOW_NAMESPACE |
3703 awk '/^nlinks_repaired/ { print $2 }')
3704 [ $repaired -eq 1 ] ||
3705 error "(7) Fail to repair nlink count: $repaired"
# Re-read nlink after the repair; it must now be 2.
3707 cancel_lru_locks mdc
3708 count=$(stat --format=%h $DIR/$tdir/d0/foo)
3709 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
3711 run_test 29b "LFSCK can repair bad nlink count (2)"
# Body of the test registered below via run_test as "29c".
# Scenario (per its own messages): an object has more hard links than its
# linkEA records; LFSCK must skip nlink verification for it. The test
# checks that nlink (3) and the number of fid2path results (2) are the same
# before and after LFSCK, and that nlinks_repaired stays 0.
# NOTE(review): line-numbered excerpt; the enclosing function and the '}'
# that closes the '|| {' group are not visible here and are left as found.
3715 echo "There are too many hard links to the object, and exceeds the"
3716 echo "object's linkEA limitation, as to NOT all the known name entries"
3717 echo "will be recorded in the linkEA. Under such case, LFSCK should"
3718 echo "skip the nlink verification for this object."
3721 check_mount_and_prep
# foo and its first hard link h1 are created before the injection.
3723 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3724 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3725 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
3726 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3728 echo "Inject failure stub on MDT0 to simulate the case that"
3729 echo "foo's hard links exceed the object's linkEA limitation."
3731 #define OBD_FAIL_LFSCK_LINKEA_OVERFLOW 0x1627
# The second hard link h2 is created with the fail_loc set. The fail_loc is
# not cleared until LFSCK has completed (see below).
3732 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1627
3733 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h2 ||
3734 error "(4) Fail to hard link to $DIR/$tdir/d0/foo"
3736 cancel_lru_locks mdc
3738 local count1=$(stat --format=%h $DIR/$tdir/d0/foo)
3739 [ $count1 -eq 3 ] || error "(5) Stat failure: $count1"
# fid2path is expected to list only 2 paths for the 3 links.
3741 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
3742 $LFS fid2path $DIR $foofid
3743 local count2=$($LFS fid2path $DIR $foofid | wc -l)
# 'error' was missing here, so a failed injection only tried to execute the
# message string as a command and the test kept going.
3744 [ $count2 -eq 2 ] || error "(6) Fail to inject error: $count2"
3746 echo "Trigger namespace LFSCK to repair the nlink count"
3747 $START_NAMESPACE -r -A ||
3748 error "(7) Fail to start LFSCK for namespace"
3750 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3751 mdd.${MDT_DEV}.lfsck_namespace |
3752 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3754 error "(8) unexpected status"
# Clear the fail_loc, then verify that LFSCK repaired nothing.
3757 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3758 local repaired=$($SHOW_NAMESPACE |
3759 awk '/^nlinks_repaired/ { print $2 }')
3760 [ $repaired -eq 0 ] ||
3761 error "(9) Repair nlink count unexpcetedly: $repaired"
3763 cancel_lru_locks mdc
# Both figures must be unchanged from before the LFSCK run.
3765 count1=$(stat --format=%h $DIR/$tdir/d0/foo)
3766 [ $count1 -eq 3 ] || error "(10) Stat failure: $count1"
3768 count2=$($LFS fid2path $DIR $foofid | wc -l)
3769 [ $count2 -eq 2 ] ||
3770 error "(11) Repaired something unexpectedly: $count2"
3772 run_test 29c "Not verify nlink attr if hark links exceed linkEA limitation"
3775 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
3776 skip "Only support backend /lost+found for ldiskfs" && return
3779 echo "The namespace LFSCK will move the orphans from backend"
3780 echo "/lost+found directory to normal client visible namespace"
3781 echo "or to global visible ./lustre/lost+found/MDTxxxx/ directory"
3784 check_mount_and_prep
3786 $LFS mkdir -i 0 $DIR/$tdir/foo || error "(1) Fail to mkdir foo"
3787 touch $DIR/$tdir/foo/f0 || error "(2) Fail to touch f1"
3789 echo "Inject failure stub on MDT0 to simulate the case that"
3790 echo "directory d0 has no linkEA entry, then the LFSCK will"
3791 echo "move it into .lustre/lost+found/MDTxxxx/ later."
3793 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
3794 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
3795 mkdir $DIR/$tdir/foo/d0 || error "(3) Fail to mkdir d0"
3796 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3798 touch $DIR/$tdir/foo/d0/f1 || error "(4) Fail to touch f1"
3799 mkdir $DIR/$tdir/foo/d0/d1 || error "(5) Fail to mkdir d1"
3801 echo "Inject failure stub on MDT0 to simulate the case that the"
3802 echo "object's name entry will be removed, but not destroy the"
3803 echo "object. Then backend e2fsck will handle it as orphan and"
3804 echo "add them into the backend /lost+found directory."
3806 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
3807 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3808 rmdir $DIR/$tdir/foo/d0/d1 || error "(6) Fail to rmdir d1"
3809 rm -f $DIR/$tdir/foo/d0/f1 || error "(7) Fail to unlink f1"
3810 rmdir $DIR/$tdir/foo/d0 || error "(8) Fail to rmdir d0"
3811 rm -f $DIR/$tdir/foo/f0 || error "(9) Fail to unlink f0"
3812 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3814 umount_client $MOUNT || error "(10) Fail to stop client!"
3816 stop $SINGLEMDS || error "(11) Fail to stop MDT0"
3819 run_e2fsck $(facet_host $SINGLEMDS) $MDT_DEVNAME "-y" ||
3820 error "(12) Fail to run e2fsck"
3822 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
3823 error "(13) Fail to start MDT0"
3825 echo "Trigger namespace LFSCK to recover backend orphans"
3826 $START_NAMESPACE -r -A ||
3827 error "(14) Fail to start LFSCK for namespace"
3829 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3830 mdd.${MDT_DEV}.lfsck_namespace |
3831 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3833 error "(15) unexpected status"
3836 local repaired=$($SHOW_NAMESPACE |
3837 awk '/^local_lost_found_moved/ { print $2 }')
3838 [ $repaired -ge 4 ] ||
3839 error "(16) Fail to recover backend orphans: $repaired"
3841 mount_client $MOUNT || error "(17) Fail to start client!"
3843 stat $DIR/$tdir/foo/f0 || error "(18) f0 is not recovered"
3845 ls -ail $MOUNT/.lustre/lost+found/
3847 echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
3848 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
3849 error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
3851 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
3853 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-*-D-*")
3854 [ ! -z "$cname" ] || error "(20) d0 is not recovered"
3856 stat ${cname}/d1 || error "(21) d1 is not recovered"
3857 stat ${cname}/f1 || error "(22) f1 is not recovered"
3859 run_test 30 "LFSCK can recover the orphans from backend /lost+found"
3862 [ $MDSCOUNT -lt 2 ] &&
3863 skip "The test needs at least 2 MDTs" && return
3866 echo "For the name entry under a striped directory, if the name"
3867 echo "hash does not match the shard, then the LFSCK will repair"
3868 echo "the bad name entry"
3871 check_mount_and_prep
3873 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
3874 error "(1) Fail to create striped directory"
3876 echo "Inject failure stub on client to simulate the case that"
3877 echo "some name entry should be inserted into other non-first"
3878 echo "shard, but inserted into the first shard by wrong"
3880 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
3881 $LCTL set_param fail_loc=0x1628 fail_val=0
3882 createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
3883 error "(2) Fail to create file under striped directory"
3884 $LCTL set_param fail_loc=0 fail_val=0
3886 echo "Trigger namespace LFSCK to repair bad name hash"
3887 $START_NAMESPACE -r -A ||
3888 error "(3) Fail to start LFSCK for namespace"
3890 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3891 mdd.${MDT_DEV}.lfsck_namespace |
3892 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3894 error "(4) unexpected status"
3897 local repaired=$($SHOW_NAMESPACE |
3898 awk '/^name_hash_repaired/ { print $2 }')
3899 [ $repaired -ge 1 ] ||
3900 error "(5) Fail to repair bad name hash: $repaired"
3902 umount_client $MOUNT || error "(6) umount failed"
3903 mount_client $MOUNT || error "(7) mount failed"
3905 for ((i = 0; i < $MDSCOUNT; i++)); do
3906 stat $DIR/$tdir/striped_dir/d$i ||
3907 error "(8) Fail to stat d$i after LFSCK"
3908 rmdir $DIR/$tdir/striped_dir/d$i ||
3909 error "(9) Fail to unlink d$i after LFSCK"
3912 rmdir $DIR/$tdir/striped_dir ||
3913 error "(10) Fail to remove the striped directory after LFSCK"
3915 run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
3918 [ $MDSCOUNT -lt 2 ] &&
3919 skip "The test needs at least 2 MDTs" && return
3922 echo "For the name entry under a striped directory, if the name"
3923 echo "hash does not match the shard, then the LFSCK will repair"
3924 echo "the bad name entry"
3927 check_mount_and_prep
3929 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
3930 error "(1) Fail to create striped directory"
3932 echo "Inject failure stub on client to simulate the case that"
3933 echo "some name entry should be inserted into other non-second"
3934 echo "shard, but inserted into the secod shard by wrong"
3936 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
3937 $LCTL set_param fail_loc=0x1628 fail_val=1
3938 createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
3939 error "(2) Fail to create file under striped directory"
3940 $LCTL set_param fail_loc=0 fail_val=0
3942 echo "Trigger namespace LFSCK to repair bad name hash"
3943 $START_NAMESPACE -r -A ||
3944 error "(3) Fail to start LFSCK for namespace"
3946 wait_update_facet mds2 "$LCTL get_param -n \
3947 mdd.$(facet_svc mds2).lfsck_namespace |
3948 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3949 error "(4) unexpected status"
3951 local repaired=$(do_facet mds2 $LCTL get_param -n \
3952 mdd.$(facet_svc mds2).lfsck_namespace |
3953 awk '/^name_hash_repaired/ { print $2 }')
3954 [ $repaired -ge 1 ] ||
3955 error "(5) Fail to repair bad name hash: $repaired"
3957 umount_client $MOUNT || error "(6) umount failed"
3958 mount_client $MOUNT || error "(7) mount failed"
3960 for ((i = 0; i < $MDSCOUNT; i++)); do
3961 stat $DIR/$tdir/striped_dir/d$i ||
3962 error "(8) Fail to stat d$i after LFSCK"
3963 rmdir $DIR/$tdir/striped_dir/d$i ||
3964 error "(9) Fail to unlink d$i after LFSCK"
3967 rmdir $DIR/$tdir/striped_dir ||
3968 error "(10) Fail to remove the striped directory after LFSCK"
3970 run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
3973 [ $MDSCOUNT -lt 2 ] &&
3974 skip "The test needs at least 2 MDTs" && return
3977 echo "For some reason, the master MDT-object of the striped directory"
3978 echo "may lost its master LMV EA. If nobody created files under the"
3979 echo "master directly after the master LMV EA lost, then the LFSCK"
3980 echo "should re-generate the master LMV EA."
3983 check_mount_and_prep
3985 echo "Inject failure stub on MDT0 to simulate the case that the"
3986 echo "master MDT-object of the striped directory lost the LMV EA."
3988 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
3989 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
3990 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
3991 error "(1) Fail to create striped directory"
3992 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3994 echo "Trigger namespace LFSCK to re-generate master LMV EA"
3995 $START_NAMESPACE -r -A ||
3996 error "(2) Fail to start LFSCK for namespace"
3998 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3999 mdd.${MDT_DEV}.lfsck_namespace |
4000 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4002 error "(3) unexpected status"
4005 local repaired=$($SHOW_NAMESPACE |
4006 awk '/^striped_dirs_repaired/ { print $2 }')
4007 [ $repaired -eq 1 ] ||
4008 error "(4) Fail to re-generate master LMV EA: $repaired"
4010 umount_client $MOUNT || error "(5) umount failed"
4011 mount_client $MOUNT || error "(6) mount failed"
4013 local empty=$(ls $DIR/$tdir/striped_dir/)
4014 [ -z "$empty" ] || error "(7) The master LMV EA is not repaired: $empty"
4016 rmdir $DIR/$tdir/striped_dir ||
4017 error "(8) Fail to remove the striped directory after LFSCK"
4019 run_test 31c "Re-generate the lost master LMV EA for striped directory"
4022 [ $MDSCOUNT -lt 2 ] &&
4023 skip "The test needs at least 2 MDTs" && return
4026 echo "For some reason, the master MDT-object of the striped directory"
4027 echo "may lost its master LMV EA. If somebody created files under the"
4028 echo "master directly after the master LMV EA lost, then the LFSCK"
4029 echo "should NOT re-generate the master LMV EA, instead, it should"
4030 echo "change the broken striped dirctory as read-only to prevent"
4031 echo "further damage"
4034 check_mount_and_prep
4036 echo "Inject failure stub on MDT0 to simulate the case that the"
4037 echo "master MDT-object of the striped directory lost the LMV EA."
4039 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
4040 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
4041 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4042 error "(1) Fail to create striped directory"
4043 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0
4045 umount_client $MOUNT || error "(2) umount failed"
4046 mount_client $MOUNT || error "(3) mount failed"
4048 touch $DIR/$tdir/striped_dir/dummy ||
4049 error "(4) Fail to touch under broken striped directory"
4051 echo "Trigger namespace LFSCK to find out the inconsistency"
4052 $START_NAMESPACE -r -A ||
4053 error "(5) Fail to start LFSCK for namespace"
4055 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4056 mdd.${MDT_DEV}.lfsck_namespace |
4057 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4059 error "(6) unexpected status"
4062 local repaired=$($SHOW_NAMESPACE |
4063 awk '/^striped_dirs_repaired/ { print $2 }')
4064 [ $repaired -eq 0 ] ||
4065 error "(7) Re-generate master LMV EA unexpected: $repaired"
4067 stat $DIR/$tdir/striped_dir/dummy ||
4068 error "(8) Fail to stat $DIR/$tdir/striped_dir/dummy"
4070 touch $DIR/$tdir/striped_dir/foo &&
4071 error "(9) The broken striped directory should be read-only"
4073 chattr -i $DIR/$tdir/striped_dir ||
4074 error "(10) Fail to chattr on the broken striped directory"
4076 rmdir $DIR/$tdir/striped_dir ||
4077 error "(11) Fail to remove the striped directory after LFSCK"
4079 run_test 31d "Set broken striped directory (modified after broken) as read-only"
4082 [ $MDSCOUNT -lt 2 ] &&
4083 skip "The test needs at least 2 MDTs" && return
4086 echo "For some reason, the slave MDT-object of the striped directory"
4087 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
4088 echo "slave LMV EA."
4091 check_mount_and_prep
4093 echo "Inject failure stub on MDT0 to simulate the case that the"
4094 echo "slave MDT-object (that resides on the same MDT as the master"
4095 echo "MDT-object resides on) lost the LMV EA."
4097 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
4098 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=0
4099 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4100 error "(1) Fail to create striped directory"
4101 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4103 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
4104 $START_NAMESPACE -r -A ||
4105 error "(2) Fail to start LFSCK for namespace"
4107 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4108 mdd.${MDT_DEV}.lfsck_namespace |
4109 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4111 error "(3) unexpected status"
4114 local repaired=$($SHOW_NAMESPACE |
4115 awk '/^striped_shards_repaired/ { print $2 }')
4116 [ $repaired -eq 1 ] ||
4117 error "(4) Fail to re-generate slave LMV EA: $repaired"
4119 rmdir $DIR/$tdir/striped_dir ||
4120 error "(5) Fail to remove the striped directory after LFSCK"
4122 run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
4125 [ $MDSCOUNT -lt 2 ] &&
4126 skip "The test needs at least 2 MDTs" && return
4129 echo "For some reason, the slave MDT-object of the striped directory"
4130 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
4131 echo "slave LMV EA."
4134 check_mount_and_prep
4136 echo "Inject failure stub on MDT0 to simulate the case that the"
4137 echo "slave MDT-object (that resides on differnt MDT as the master"
4138 echo "MDT-object resides on) lost the LMV EA."
4140 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
4141 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=1
4142 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4143 error "(1) Fail to create striped directory"
4144 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4146 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
4147 $START_NAMESPACE -r -A ||
4148 error "(2) Fail to start LFSCK for namespace"
4150 wait_update_facet mds2 "$LCTL get_param -n \
4151 mdd.$(facet_svc mds2).lfsck_namespace |
4152 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
4153 error "(3) unexpected status"
4155 local repaired=$(do_facet mds2 $LCTL get_param -n \
4156 mdd.$(facet_svc mds2).lfsck_namespace |
4157 awk '/^striped_shards_repaired/ { print $2 }')
4158 [ $repaired -eq 1 ] ||
4159 error "(4) Fail to re-generate slave LMV EA: $repaired"
4161 rmdir $DIR/$tdir/striped_dir ||
4162 error "(5) Fail to remove the striped directory after LFSCK"
4164 run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
4167 [ $MDSCOUNT -lt 2 ] &&
4168 skip "The test needs at least 2 MDTs" && return
4171 echo "For some reason, the stripe index in the slave LMV EA is"
4172 echo "corrupted. The LFSCK should repair the slave LMV EA."
4175 check_mount_and_prep
4177 echo "Inject failure stub on MDT0 to simulate the case that the"
4178 echo "slave LMV EA on the first shard of the striped directory"
4179 echo "claims the same index as the second shard claims"
4181 #define OBD_FAIL_LFSCK_BAD_SLAVE_LMV 0x162b
4182 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162b fail_val=0
4183 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4184 error "(1) Fail to create striped directory"
4185 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4187 echo "Trigger namespace LFSCK to repair the slave LMV EA"
4188 $START_NAMESPACE -r -A ||
4189 error "(2) Fail to start LFSCK for namespace"
4191 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4192 mdd.${MDT_DEV}.lfsck_namespace |
4193 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4195 error "(3) unexpected status"
4198 local repaired=$($SHOW_NAMESPACE |
4199 awk '/^striped_shards_repaired/ { print $2 }')
4200 [ $repaired -eq 1 ] ||
4201 error "(4) Fail to repair slave LMV EA: $repaired"
4203 umount_client $MOUNT || error "(5) umount failed"
4204 mount_client $MOUNT || error "(6) mount failed"
4206 touch $DIR/$tdir/striped_dir/foo ||
4207 error "(7) Fail to touch file after the LFSCK"
4209 rm -f $DIR/$tdir/striped_dir/foo ||
4210 error "(8) Fail to unlink file after the LFSCK"
4212 rmdir $DIR/$tdir/striped_dir ||
4213 error "(9) Fail to remove the striped directory after LFSCK"
4215 run_test 31g "Repair the corrupted slave LMV EA"
4218 [ $MDSCOUNT -lt 2 ] &&
4219 skip "The test needs at least 2 MDTs" && return
4222 echo "For some reason, the shard's name entry in the striped"
4223 echo "directory may be corrupted. The LFSCK should repair the"
4224 echo "bad shard's name entry."
4227 check_mount_and_prep
4229 echo "Inject failure stub on MDT0 to simulate the case that the"
4230 echo "first shard's name entry in the striped directory claims"
4231 echo "the same index as the second shard's name entry claims."
4233 #define OBD_FAIL_LFSCK_BAD_SLAVE_NAME 0x162c
4234 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162c fail_val=0
4235 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4236 error "(1) Fail to create striped directory"
4237 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4239 echo "Trigger namespace LFSCK to repair the shard's name entry"
4240 $START_NAMESPACE -r -A ||
4241 error "(2) Fail to start LFSCK for namespace"
4243 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4244 mdd.${MDT_DEV}.lfsck_namespace |
4245 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4247 error "(3) unexpected status"
4250 local repaired=$($SHOW_NAMESPACE |
4251 awk '/^dirent_repaired/ { print $2 }')
4252 [ $repaired -eq 1 ] ||
4253 error "(4) Fail to repair shard's name entry: $repaired"
4255 umount_client $MOUNT || error "(5) umount failed"
4256 mount_client $MOUNT || error "(6) mount failed"
4258 touch $DIR/$tdir/striped_dir/foo ||
4259 error "(7) Fail to touch file after the LFSCK"
4261 rm -f $DIR/$tdir/striped_dir/foo ||
4262 error "(8) Fail to unlink file after the LFSCK"
4264 rmdir $DIR/$tdir/striped_dir ||
4265 error "(9) Fail to remove the striped directory after LFSCK"
4267 run_test 31h "Repair the corrupted shard's name entry"
4269 # restore MDS/OST size
4270 MDSSIZE=${SAVED_MDSSIZE}
4271 OSTSIZE=${SAVED_OSTSIZE}
4272 OSTCOUNT=${SAVED_OSTCOUNT}
4274 # cleanup the system at last