3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
11 LUSTRE=${LUSTRE:-$(dirname $0)/..}
12 . $LUSTRE/tests/test-framework.sh
16 # bug number for skipped test:
17 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT "
18 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
20 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
23 require_dsh_mds || exit 0
27 if ! check_versions; then
28 skip "It is NOT necessary to test lfsck under interoperation mode"
32 (( $MDS1_VERSION >= $(version_code 2.3.60) )) ||
33 skip "Need MDS version at least 2.3.60"
37 SAVED_MDSSIZE=${MDSSIZE}
38 SAVED_OSTSIZE=${OSTSIZE}
39 SAVED_OSTCOUNT=${OSTCOUNT}
40 # use small MDS and OST sizes to speed up formatting
41 # do not make MDSSIZE/OSTSIZE too small, since they affect the default journal size
43 [ "$mds1_FSTYPE" == zfs ] && MDSSIZE=300000
45 [ "$ost1_FSTYPE" == zfs ] && OSTSIZE=300000
47 # no need for too many OSTs; capping the count reduces the format/start/stop overhead
49 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
51 # build up a clean test environment.
52 REFORMAT="yes" check_and_setup_lustre
# Names of the first MDT and first OST targets of this filesystem, used as
# the -M argument and in the parameter paths of the command strings below.
54 MDT_DEV="${FSNAME}-MDT0000"
55 OST_DEV="${FSNAME}-OST0000"
# Device name reported by mdsdevname for the index obtained by stripping
# the "mds" text from $SINGLEMDS.
56 MDT_DEVNAME=$(mdsdevname ${SINGLEMDS//mds/})
# Command strings that start LFSCK. The tests expand them unquoted and may
# append extra options (for example -r or -A) after the variable.
# START_NAMESPACE and START_LAYOUT run on $SINGLEMDS against ${MDT_DEV};
# START_LAYOUT_ON_OST runs on ost1 against ${OST_DEV}.
57 START_NAMESPACE="do_facet $SINGLEMDS \
58 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
59 START_LAYOUT="do_facet $SINGLEMDS \
60 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
61 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
# Command string that stops LFSCK on ${MDT_DEV} via $SINGLEMDS.
62 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
# Command strings that print the LFSCK state parameters; the tests pipe the
# output through awk to pick out single fields such as status or flags.
63 SHOW_NAMESPACE="do_facet $SINGLEMDS \
64 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
65 SHOW_LAYOUT="do_facet $SINGLEMDS \
66 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
67 SHOW_LAYOUT_ON_OST="do_facet ost1 \
68 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# MDS mount option sets, all built on $MDS_MOUNT_OPTS plus user_xattr:
# plain, with noscrub (the tests use it to start the MDS with OI scrub
# disabled), and with skip_lfsck (used to start the MDS without resuming
# LFSCK).
69 MOUNT_OPTS_SCRUB="$MDS_MOUNT_OPTS -o user_xattr"
70 MOUNT_OPTS_NOSCRUB="$MDS_MOUNT_OPTS -o user_xattr,noscrub"
71 MOUNT_OPTS_SKIP_LFSCK="$MDS_MOUNT_OPTS -o user_xattr,skip_lfsck"
80 echo "preparing... $nfiles * $ndirs files will be created $(date)."
81 if [ ! -z $igif ]; then
82 #define OBD_FAIL_FID_IGIF 0x1504
83 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
86 cp $LUSTRE/tests/*.sh $DIR/$tdir/
87 if [ $ndirs -gt 0 ]; then
88 createmany -d $DIR/$tdir/d $ndirs
89 createmany -m $DIR/$tdir/f $ndirs
90 if [ $nfiles -gt 0 ]; then
91 for ((i = 0; i < $ndirs; i++)); do
92 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
93 /dev/null || error "createmany $nfiles"
96 createmany -d $DIR/$tdir/e $ndirs
99 if [ ! -z $igif ]; then
100 touch $DIR/$tdir/dummy
101 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
104 echo "prepared $(date)."
# Run e2fsck against the MDT0 device and report an error if it detects an
# inconsistency. Returns 0 immediately when mds1 is not ldiskfs-backed.
# MDT0 is stopped before the check and restarted afterwards with
# $MOUNT_OPTS_NOSCRUB.
# NOTE(review): the "-n" argument is handed to run_e2fsck; presumably it is
# the e2fsck no-modify option - confirm against run_e2fsck.
107 run_e2fsck_on_mdt0() {
108 [ $mds1_FSTYPE == ldiskfs ] || return 0
# The target has to be stopped before its device is checked.
110 stop $SINGLEMDS > /dev/null || error "(0) Fail to the stop MDT0"
111 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n" |
113 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n"
114 error "(2) Detected inconsistency on MDT0"
# Bring MDT0 back using the noscrub mount options.
116 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
117 error "(3) Fail to start MDT0"
# Query LFSCK component $com from mds1 with lfsck_query -w and require that
# the value on the ${com}_mdts_${status} line equals $MDSCOUNT; otherwise
# print the full query output and fail with error code label $err.
# NOTE(review): com, status and err are not assigned in the lines visible
# here; presumably they come from the positional arguments, as in the
# caller "wait_all_targets_blocked namespace completed 4" - confirm.
# NOTE(review): -w presumably makes lfsck_query wait for LFSCK to finish
# before reporting - confirm against lctl documentation.
120 wait_all_targets_blocked() {
125 local count=$(do_facet mds1 \
126 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 -w |
127 awk '/^${com}_mdts_${status}/ { print \\\$2 }'")
# On mismatch, dump the unfiltered query output for debugging before failing.
128 [[ $count -eq $MDSCOUNT ]] || {
129 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
130 error "($err) only $count of $MDSCOUNT MDTs are in ${status}"
139 wait_update_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 |
140 awk '/^${com}_mdts_${status}/ { print \\\$2 }'" \
141 "$MDSCOUNT" $LTIME || {
142 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
143 error "($err) some MDTs are not in ${status}"
150 #define OBD_FAIL_LFSCK_DELAY1 0x1600
151 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
152 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
154 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
156 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
157 [ "$STATUS" == "scanning-phase1" ] ||
158 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
160 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
162 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
163 [ "$STATUS" == "stopped" ] ||
164 error "(6) Expect 'stopped', but got '$STATUS'"
166 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
168 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
169 [ "$STATUS" == "scanning-phase1" ] ||
170 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
172 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
173 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
174 mdd.${MDT_DEV}.lfsck_namespace |
175 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
177 error "(9) unexpected status"
180 local repaired=$($SHOW_NAMESPACE |
181 awk '/^updated_phase1/ { print $2 }')
182 [ $repaired -eq 0 ] ||
183 error "(10) Expect nothing to be repaired, but got: $repaired"
185 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
186 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
187 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
188 mdd.${MDT_DEV}.lfsck_namespace |
189 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
191 error "(12) unexpected status"
194 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
195 [ $((scanned1 + 1)) -eq $scanned2 ] ||
196 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
198 echo "stopall, should NOT crash LU-3649"
199 stopall || error "(14) Fail to stopall"
201 run_test 0 "Control LFSCK manually"
206 #define OBD_FAIL_FID_INDIR 0x1501
207 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
208 touch $DIR/$tdir/dummy
210 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
212 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
213 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
214 mdd.${MDT_DEV}.lfsck_namespace |
215 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
217 error "(4) unexpected status"
220 local repaired=$($SHOW_NAMESPACE |
221 awk '/^dirent_repaired/ { print $2 }')
222 # for interop with old server
223 [ -z "$repaired" ] &&
224 repaired=$($SHOW_NAMESPACE |
225 awk '/^updated_phase1/ { print $2 }')
227 [ $repaired -eq 1 ] ||
228 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
232 mount_client $MOUNT || error "(6) Fail to start client!"
234 #define OBD_FAIL_FID_LOOKUP 0x1505
235 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
236 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
238 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
240 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
244 [ "$mds1_FSTYPE" != ldiskfs ] &&
245 skip "OI Scrub not implemented for ZFS"
249 #define OBD_FAIL_FID_INLMA 0x1502
250 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
251 touch $DIR/$tdir/dummy
253 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
255 #define OBD_FAIL_FID_NOLMA 0x1506
256 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
257 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
258 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
259 mdd.${MDT_DEV}.lfsck_namespace |
260 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
262 error "(4) unexpected status"
265 local repaired=$($SHOW_NAMESPACE |
266 awk '/^dirent_repaired/ { print $2 }')
267 # for interop with old server
268 [ -z "$repaired" ] &&
269 repaired=$($SHOW_NAMESPACE |
270 awk '/^updated_phase1/ { print $2 }')
272 [ $repaired -eq 1 ] ||
273 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
275 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
278 mount_client $MOUNT || error "(6) Fail to start client!"
280 #define OBD_FAIL_FID_LOOKUP 0x1505
281 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
282 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
284 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
286 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
291 #define OBD_FAIL_FID_IGIF 0x1504
292 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
293 touch $DIR/$tdir/dummy
295 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
297 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
298 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
299 mdd.${MDT_DEV}.lfsck_namespace |
300 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
302 error "(4) unexpected status"
305 local repaired=$($SHOW_NAMESPACE |
306 awk '/^dirent_repaired/ { print $2 }')
307 # for interop with old server
308 [ -z "$repaired" ] &&
309 repaired=$($SHOW_NAMESPACE |
310 awk '/^updated_phase1/ { print $2 }')
312 [ $repaired -eq 1 ] ||
313 error "(5) Fail to repair lost FID-in-dirent: $repaired"
317 mount_client $MOUNT || error "(6) Fail to start client!"
319 #define OBD_FAIL_FID_LOOKUP 0x1505
320 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
321 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
323 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
325 run_test 1c "LFSCK can find out and repair lost FID-in-dirent"
328 [ $MDS1_VERSION -lt $(version_code 2.13.57) ] &&
329 skip "MDS older than 2.13.57"
330 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
334 touch $DIR/$tdir/$tfile
335 mkdir $DIR/$tdir/subdir
336 $LFS mkdir -i 1 $DIR/$tdir/remotedir
337 $LFS path2fid $DIR/$tdir
338 ll_decode_linkea $DIR/$tdir/$tfile
339 ll_decode_linkea $DIR/$tdir/subdir
340 ll_decode_linkea $DIR/$tdir/remotedir
342 local mntpt=$(facet_mntpt mds1)
344 # unlink OI files to remove the stale entry
345 local saved_opts=$MDS_MOUNT_OPTS
348 mount_fstype mds1 $mntpt
349 # increase $tdir FID oid in LMA
350 do_facet mds1 "getfattr -d -m trusted.lma -e hex \
351 --absolute-names $mntpt/ROOT/$tdir | \
352 sed -E 's/0(.{8})$/1\1/' | setfattr --restore=-"
353 unmount_fstype mds1 $mntpt
356 # The FID oid in LMA was increased above, so it is not in the OI table.
357 # Run scrub first to generate the mapping in OI, so that the following
358 # namespace check can fix linkea correctly. This is not normally necessary.
359 do_facet mds1 $LCTL lfsck_start -M ${MDT_DEV} -t scrub ||
360 error "failed to start LFSCK for scrub!"
361 wait_update_facet mds1 "$LCTL get_param -n \
362 osd-*.$(facet_svc mds1).oi_scrub |
363 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
364 error "unexpected status"
366 $START_NAMESPACE -r -A || error "fail to start LFSCK for namespace!"
367 wait_update_facet mds1 "$LCTL get_param -n \
368 mdd.${MDT_DEV}.lfsck_namespace |
369 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
371 error "unexpected status"
373 $LFS path2fid $DIR/$tdir
374 ll_decode_linkea $DIR/$tdir/$tfile
375 ll_decode_linkea $DIR/$tdir/subdir
376 ll_decode_linkea $DIR/$tdir/remotedir
381 fid=$($LFS path2fid $DIR/$tdir)
382 for f in $tfile subdir remotedir; do
383 pfid=$(ll_decode_linkea $DIR/$tdir/$f |
384 awk '/pfid/ { print $3 }')
386 [ "$pfid" == "$fid" ] || error "$fid in LMA != $pfid in linkea"
389 run_test 1d "LFSCK can fix mismatch of FID in LMA and FID in child linkea"
394 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
395 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
396 touch $DIR/$tdir/dummy
398 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
400 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
401 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
402 mdd.${MDT_DEV}.lfsck_namespace |
403 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
405 error "(4) unexpected status"
408 local repaired=$($SHOW_NAMESPACE |
409 awk '/^linkea_repaired/ { print $2 }')
410 # for interop with old server
411 [ -z "$repaired" ] &&
412 repaired=$($SHOW_NAMESPACE |
413 awk '/^updated_phase2/ { print $2 }')
415 [ $repaired -eq 1 ] ||
416 error "(5) Fail to repair crashed linkEA: $repaired"
420 mount_client $MOUNT || error "(6) Fail to start client!"
422 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
423 error "(7) Fail to stat $DIR/$tdir/dummy"
425 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
426 local dummyname=$($LFS fid2path $DIR $dummyfid)
427 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
428 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
430 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
436 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
437 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
438 touch $DIR/$tdir/dummy
440 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
442 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
443 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
444 mdd.${MDT_DEV}.lfsck_namespace |
445 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
447 error "(4) unexpected status"
450 local repaired=$($SHOW_NAMESPACE |
451 awk '/^updated_phase2/ { print $2 }')
452 [ $repaired -eq 1 ] ||
453 error "(5) Fail to repair crashed linkEA: $repaired"
457 mount_client $MOUNT || error "(6) Fail to start client!"
459 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
460 error "(7) Fail to stat $DIR/$tdir/dummy"
462 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
463 local dummyname=$($LFS fid2path $DIR $dummyfid)
464 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
465 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
467 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
471 (( $MDS1_VERSION > $(version_code 2.4.90) )) ||
472 skip "MDS older than 2.4.90"
476 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
477 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
478 touch $DIR/$tdir/dummy
480 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
482 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
483 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
484 mdd.${MDT_DEV}.lfsck_namespace |
485 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
487 error "(4) unexpected status"
490 local repaired=$($SHOW_NAMESPACE |
491 awk '/^updated_phase2/ { print $2 }')
492 [ $repaired -eq 1 ] ||
493 error "(5) Fail to repair crashed linkEA: $repaired"
497 mount_client $MOUNT || error "(6) Fail to start client!"
499 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
500 error "(7) Fail to stat $DIR/$tdir/dummy"
502 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
503 local dummyname=$($LFS fid2path $DIR $dummyfid)
504 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
505 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
507 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
511 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
512 skip "MDS older than 2.6.50, LU-4788"
516 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
517 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
518 touch $DIR/$tdir/dummy
520 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
522 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
523 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
524 mdd.${MDT_DEV}.lfsck_namespace |
525 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
527 error "(4) unexpected status"
530 local repaired=$($SHOW_NAMESPACE |
531 awk '/^linkea_repaired/ { print $2 }')
532 [ $repaired -eq 1 ] ||
533 error "(5) Fail to repair crashed linkEA: $repaired"
537 mount_client $MOUNT || error "(6) Fail to start client!"
539 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
540 error "(7) Fail to stat $DIR/$tdir/dummy"
542 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
543 local dummyname=$($LFS fid2path $DIR $dummyfid)
544 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
545 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
547 run_test 2d "LFSCK can recover the missing linkEA entry"
551 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
552 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
553 skip "MDS older than 2.6.50, LU-5511"
557 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
559 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
560 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
561 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
562 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
564 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
566 wait_all_targets_blocked namespace completed 4
568 local repaired=$($SHOW_NAMESPACE |
569 awk '/^linkea_repaired/ { print $2 }')
570 [ $repaired -eq 1 ] ||
571 error "(5) Fail to repair crashed linkEA: $repaired"
573 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
574 local name=$($LFS fid2path $DIR $fid)
575 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
576 error "(6) Fail to repair linkEA: $fid $name"
578 run_test 2e "namespace LFSCK can verify remote object linkEA"
582 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
583 skip "MDS older than 2.6.50, LU-4788"
587 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
588 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
589 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
591 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
592 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
593 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
595 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
596 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
597 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
599 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
600 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
601 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
603 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
605 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
606 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
607 mdd.${MDT_DEV}.lfsck_namespace |
608 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
610 error "(10) unexpected status"
613 local checked=$($SHOW_NAMESPACE |
614 awk '/^checked_phase2/ { print $2 }')
615 [ $checked -ge 4 ] ||
616 error "(11) Fail to check multiple-linked object: $checked"
618 local repaired=$($SHOW_NAMESPACE |
619 awk '/^multiple_linked_repaired/ { print $2 }')
620 [ $repaired -ge 2 ] ||
621 error "(12) Fail to repair multiple-linked object: $repaired"
623 run_test 3 "LFSCK can verify multiple-linked objects"
627 [ "$mds1_FSTYPE" != ldiskfs ] &&
628 skip "OI Scrub not implemented for ZFS"
631 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
632 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
634 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
635 echo "start $SINGLEMDS with disabling OI scrub"
636 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
637 error "(2) Fail to start MDS!"
639 #define OBD_FAIL_LFSCK_DELAY2 0x1601
640 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
641 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
642 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
643 mdd.${MDT_DEV}.lfsck_namespace |
644 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
646 error "(5) unexpected status"
649 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
650 [ "$STATUS" == "scanning-phase1" ] ||
651 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
653 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
654 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
655 mdd.${MDT_DEV}.lfsck_namespace |
656 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
658 error "(7) unexpected status"
661 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
662 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
664 local repaired=$($SHOW_NAMESPACE |
665 awk '/^dirent_repaired/ { print $2 }')
666 # for interop with old server
667 [ -z "$repaired" ] &&
668 repaired=$($SHOW_NAMESPACE |
669 awk '/^updated_phase1/ { print $2 }')
671 [ $repaired -ge 9 ] ||
672 error "(9) Fail to re-generate FID-in-dirent: $repaired"
676 mount_client $MOUNT || error "(10) Fail to start client!"
678 #define OBD_FAIL_FID_LOOKUP 0x1505
679 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
680 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
681 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
683 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
687 [ "$mds1_FSTYPE" != ldiskfs ] &&
688 skip "OI Scrub not implemented for ZFS"
691 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
692 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
694 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
695 echo "start $SINGLEMDS with disabling OI scrub"
696 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
697 error "(2) Fail to start MDS!"
699 #define OBD_FAIL_LFSCK_DELAY2 0x1601
700 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
701 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
702 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
703 mdd.${MDT_DEV}.lfsck_namespace |
704 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
706 error "(5) unexpected status"
709 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
710 [ "$STATUS" == "scanning-phase1" ] ||
711 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
713 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
714 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
715 mdd.${MDT_DEV}.lfsck_namespace |
716 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
718 error "(7) unexpected status"
721 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
722 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
724 local repaired=$($SHOW_NAMESPACE |
725 awk '/^dirent_repaired/ { print $2 }')
726 # for interop with old server
727 [ -z "$repaired" ] &&
728 repaired=$($SHOW_NAMESPACE |
729 awk '/^updated_phase1/ { print $2 }')
731 [ $repaired -ge 2 ] ||
732 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
736 mount_client $MOUNT || error "(10) Fail to start client!"
738 #define OBD_FAIL_FID_LOOKUP 0x1505
739 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
740 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
742 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
744 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
745 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
746 local dummyname=$($LFS fid2path $DIR $dummyfid)
747 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
748 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
750 run_test 5 "LFSCK can handle IGIF object upgrading"
755 #define OBD_FAIL_LFSCK_DELAY1 0x1600
756 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
757 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
759 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
760 [ "$STATUS" == "scanning-phase1" ] ||
761 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
763 # Sleep 3 sec to guarantee at least one object is processed by LFSCK
765 # Fail the LFSCK to guarantee there is at least one checkpoint
766 #define OBD_FAIL_LFSCK_FATAL1 0x1608
767 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
768 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
769 mdd.${MDT_DEV}.lfsck_namespace |
770 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
772 error "(4) unexpected status"
775 local POS0=$($SHOW_NAMESPACE |
776 awk '/^last_checkpoint_position/ { print $2 }' |
779 #define OBD_FAIL_LFSCK_DELAY1 0x1600
780 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
781 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
783 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
784 [ "$STATUS" == "scanning-phase1" ] ||
785 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
787 local POS1=$($SHOW_NAMESPACE |
788 awk '/^latest_start_position/ { print $2 }' |
790 [[ $POS0 -lt $POS1 ]] ||
791 error "(7) Expect larger than: $POS0, but got $POS1"
793 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
794 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
795 mdd.${MDT_DEV}.lfsck_namespace |
796 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
798 error "(8) unexpected status"
801 run_test 6a "LFSCK resumes from last checkpoint (1)"
806 #define OBD_FAIL_LFSCK_DELAY2 0x1601
807 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
808 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
810 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
811 [ "$STATUS" == "scanning-phase1" ] ||
812 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
814 # Sleep 5 sec to guarantee that we are in the directory scanning
816 # Fail the LFSCK to guarantee there is at least one checkpoint
817 #define OBD_FAIL_LFSCK_FATAL2 0x1609
818 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
819 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
820 mdd.${MDT_DEV}.lfsck_namespace |
821 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
823 error "(4) unexpected status"
826 local O_POS0=$($SHOW_NAMESPACE |
827 awk '/^last_checkpoint_position/ { print $2 }' |
830 local D_POS0=$($SHOW_NAMESPACE |
831 awk '/^last_checkpoint_position/ { print $4 }')
833 #define OBD_FAIL_LFSCK_DELAY2 0x1601
834 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
835 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
837 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
838 [ "$STATUS" == "scanning-phase1" ] ||
839 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
841 local O_POS1=$($SHOW_NAMESPACE |
842 awk '/^latest_start_position/ { print $2 }' |
844 local D_POS1=$($SHOW_NAMESPACE |
845 awk '/^latest_start_position/ { print $4 }')
847 echo "Additional debug for 6b"
849 if [ "$D_POS0" == "N/A" -o "$D_POS0" == "0x0" \
850 -o "$D_POS1" == "0x0" -o "$D_POS1" == "N/A" ]; then
851 [[ $O_POS0 -lt $O_POS1 ]] ||
852 error "(7.1) $O_POS1 is not larger than $O_POS0"
854 [[ $D_POS0 -lt $D_POS1 ]] ||
855 error "(7.2) $D_POS1 is not larger than $D_POS0"
858 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
859 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
860 mdd.${MDT_DEV}.lfsck_namespace |
861 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
863 error "(8) unexpected status"
866 run_test 6b "LFSCK resumes from last checkpoint (2)"
873 #define OBD_FAIL_LFSCK_DELAY2 0x1601
874 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
875 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
877 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
878 [ "$STATUS" == "scanning-phase1" ] ||
879 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
881 # Sleep 3 sec to guarantee at least one object is processed by LFSCK
883 echo "stop $SINGLEMDS"
884 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop MDS!"
886 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
887 echo "start $SINGLEMDS"
888 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
889 error "(5) Fail to start MDS!"
891 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
892 mdd.${MDT_DEV}.lfsck_namespace |
893 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
895 error "(6) unexpected status"
898 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
904 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
905 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
906 for ((i = 0; i < 20; i++)); do
907 touch $DIR/$tdir/dummy${i}
910 #define OBD_FAIL_LFSCK_DELAY3 0x1602
911 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
912 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
913 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
914 mdd.${MDT_DEV}.lfsck_namespace |
915 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
917 error "(4) unexpected status"
921 echo "stop $SINGLEMDS"
922 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop MDS!"
924 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
925 echo "start $SINGLEMDS"
926 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
927 error "(6) Fail to start MDS!"
929 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
930 mdd.${MDT_DEV}.lfsck_namespace |
931 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
933 error "(7) unexpected status"
936 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
941 formatall > /dev/null
947 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
948 [ "$STATUS" == "init" ] ||
949 error "(2) Expect 'init', but got '$STATUS'"
951 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
952 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
953 mkdir $DIR/$tdir/crashed
955 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
956 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
957 for ((i = 0; i < 5; i++)); do
958 touch $DIR/$tdir/dummy${i}
961 umount_client $MOUNT || error "(3) Fail to stop client!"
963 #define OBD_FAIL_LFSCK_DELAY2 0x1601
964 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
965 $START_NAMESPACE || error "(4) Fail to start LFSCK for namespace!"
967 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
968 [ "$STATUS" == "scanning-phase1" ] ||
969 error "(5) Expect 'scanning-phase1', but got '$STATUS'"
971 $STOP_LFSCK || error "(6) Fail to stop LFSCK!"
973 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
974 [ "$STATUS" == "stopped" ] ||
975 error "(7) Expect 'stopped', but got '$STATUS'"
977 $START_NAMESPACE || error "(8) Fail to start LFSCK for namespace!"
979 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
980 [ "$STATUS" == "scanning-phase1" ] ||
981 error "(9) Expect 'scanning-phase1', but got '$STATUS'"
983 #define OBD_FAIL_LFSCK_FATAL2 0x1609
984 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
985 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
986 mdd.${MDT_DEV}.lfsck_namespace |
987 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
989 error "(10) unexpected status"
992 #define OBD_FAIL_LFSCK_DELAY1 0x1600
993 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
994 $START_NAMESPACE || error "(11) Fail to start LFSCK for namespace!"
996 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
997 [ "$STATUS" == "scanning-phase1" ] ||
998 error "(12) Expect 'scanning-phase1', but got '$STATUS'"
1000 #define OBD_FAIL_LFSCK_CRASH 0x160a
1001 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
1004 echo "stop $SINGLEMDS"
1005 stop $SINGLEMDS > /dev/null || error "(13) Fail to stop MDS!"
1007 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
1008 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
1010 echo "start $SINGLEMDS"
1011 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
1012 error "(14) Fail to start MDS!"
1014 local timeout=$(max_recovery_time)
1017 while [ $timer -lt $timeout ]; do
1018 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1019 mdt.${MDT_DEV}.recovery_status |
1020 awk '/^status/ { print \\\$2 }'")
1021 [ "$STATUS" != "RECOVERING" ] && break;
1023 timer=$((timer + 1))
1026 [ $timer != $timeout ] ||
1027 error "(14.1) recovery timeout"
1029 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1030 [ "$STATUS" == "crashed" ] ||
1031 error "(15) Expect 'crashed', but got '$STATUS'"
1033 #define OBD_FAIL_LFSCK_DELAY2 0x1601
1034 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
1035 $START_NAMESPACE || error "(16) Fail to start LFSCK for namespace!"
1037 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1038 [ "$STATUS" == "scanning-phase1" ] ||
1039 error "(17) Expect 'scanning-phase1', but got '$STATUS'"
1041 echo "stop $SINGLEMDS"
1042 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop MDS!"
1044 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
1045 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
1047 echo "start $SINGLEMDS"
1048 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
1049 error "(19) Fail to start MDS!"
1052 while [ $timer -lt $timeout ]; do
1053 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1054 mdt.${MDT_DEV}.recovery_status |
1055 awk '/^status/ { print \\\$2 }'")
1056 [ "$STATUS" != "RECOVERING" ] && break;
1058 timer=$((timer + 1))
1061 [ $timer != $timeout ] ||
1062 error "(19.1) recovery timeout"
1064 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1065 [ "$STATUS" == "paused" ] ||
1066 error "(20) Expect 'paused', but got '$STATUS'"
1068 echo "stop $SINGLEMDS"
1069 stop $SINGLEMDS > /dev/null || error "(20.1) Fail to stop MDS!"
1071 echo "start $SINGLEMDS without resume LFSCK"
1072 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SKIP_LFSCK > /dev/null ||
1073 error "(20.2) Fail to start MDS!"
1076 while [ $timer -lt $timeout ]; do
1077 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1078 mdt.${MDT_DEV}.recovery_status |
1079 awk '/^status/ { print \\\$2 }'")
1080 [ "$STATUS" != "RECOVERING" ] && break;
1082 timer=$((timer + 1))
1085 [ $timer != $timeout ] ||
1086 error "(20.3) recovery timeout"
1088 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1089 [ "$STATUS" == "paused" ] ||
1090 error "(20.4) Expect 'paused', but got '$STATUS'"
1092 #define OBD_FAIL_LFSCK_DELAY3 0x1602
1093 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
1095 $START_NAMESPACE || error "(21) Fail to start LFSCK for namespace!"
1096 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1097 mdd.${MDT_DEV}.lfsck_namespace |
1098 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
1100 error "(22) unexpected status"
1103 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1104 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
1105 error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
1107 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1108 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1109 mdd.${MDT_DEV}.lfsck_namespace |
1110 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1112 error "(24) unexpected status"
1115 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1116 [ -z "$FLAGS" ] || error "(25) Expect empty flags, but got '$FLAGS'"
1118 run_test 8 "LFSCK state machine"
# Body of the test registered below as 9a: speed control of the layout LFSCK.
# Visible steps: skip when /proc/cpuinfo has no line matching
# "processor.*: 1"; create 5000 files under $DIR/$tdir/lfsck; start the layout
# LFSCK with "-s $BASE_SPEED1"; compare average_speed_phase1 with a computed
# upper bound; change lfsck_speed_limit to $BASE_SPEED2 and compare the speed
# with a lower and an upper bound; finally set the limit to 0 on all MDT and
# OST nodes and wait for status 'completed'.
# NOTE(review): RUN_TIME1, RUN_TIME2 and TIME_DIFF are read below but are not
# assigned on any line visible in this block -- confirm where they are set.
1121 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1122 skip "Testing on UP system, the speed may be inaccurate."
1126 check_mount_and_prep
1127 $LFS mkdir -i 0 $DIR/$tdir/lfsck || error "(1) Fail to mkdir lfsck"
1128 $LFS setstripe -c 1 -i -1 $DIR/$tdir/lfsck
1129 createmany -o $DIR/$tdir/lfsck/f 5000
# First speed limit, handed to lfsck_start through "-s".
1131 local BASE_SPEED1=100
1133 $START_LAYOUT -r -s $BASE_SPEED1 || error "(2) Fail to start LFSCK!"
# The scan must still be in phase 1 when the speed is sampled.
1136 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
1137 [ "$STATUS" == "scanning-phase1" ] ||
1138 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
1140 local SPEED=$($SHOW_LAYOUT |
1141 awk '/^average_speed_phase1/ { print $2 }')
1143 # There may be time error, normally it should be less than 2 seconds.
1144 # We allow another 20% schedule error.
1146 # MAX_MARGIN = 1.3 = 13 / 10
1147 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1148 RUN_TIME1 * 13 / 10))
1149 [ $SPEED -lt $MAX_SPEED ] || {
1151 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1152 error "(4) Speed $SPEED, expected < $MAX_SPEED"
1155 # adjust speed limit
1156 local BASE_SPEED2=300
1158 do_facet $SINGLEMDS \
1159 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1162 SPEED=$($SHOW_LAYOUT | awk '/^average_speed_phase1/ { print $2 }')
1163 # MIN_MARGIN = 0.7 = 7 / 10
# Lower bound: both limits weighted by their (shortened) run times, divided
# by the total run time, then scaled by 7 / 10 in integer arithmetic.
1164 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1165 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1166 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
# On a non-ldiskfs MDT the shortfall is reported through error_ignore LU-5624.
# NOTE(review): the command that receives the "(5.2)" message is not visible
# in this block -- confirm it is a hard error for ldiskfs.
1167 [ $SPEED -gt $MIN_SPEED ] || {
1168 if [ $mds1_FSTYPE != ldiskfs ]; then
1169 error_ignore LU-5624 \
1170 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
1173 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
1177 # MAX_MARGIN = 1.3 = 13 / 10
1178 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1179 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1180 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1181 [ $SPEED -lt $MAX_SPEED ] || {
1183 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1184 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1185 error "(6) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit to 0 on every MDT and OST node (presumably "no
# limit" -- confirm) before waiting for the scan to complete.
1188 do_nodes $(comma_list $(mdts_nodes)) \
1189 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1190 do_nodes $(comma_list $(osts_nodes)) \
1191 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1193 wait_update_facet $SINGLEMDS \
1194 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout |
1195 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
1196 error "(7) Failed to get expected 'completed'"
1198 run_test 9a "LFSCK speed control (1)"
# Body of the test registered below as 9b: speed control of the namespace
# LFSCK during phase 2.
# Visible steps: skip when /proc/cpuinfo has no line matching
# "processor.*: 1"; create 50 directories, 50 files and 50 files in each
# directory while fail_loc 0x1604 is set; run a namespace LFSCK with fail_loc
# 0x160c and wait for status 'stopped'; restart it with "-s $BASE_SPEED1" and
# expect 'scanning-phase2'; check average_speed_phase2 against bounds before
# and after changing the limit to $BASE_SPEED2; finally set the limit to 0
# and wait for 'completed'.
# NOTE(review): RUN_TIME1, RUN_TIME2 and TIME_DIFF are read below but are not
# assigned on any line visible in this block -- confirm where they are set.
1201 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1202 skip "Testing on UP system, the speed may be inaccurate."
1208 echo "Preparing another 50 * 50 files (with error) at $(date)."
1209 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1210 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1211 createmany -d $DIR/$tdir/d 50
1212 createmany -m $DIR/$tdir/f 50
1213 for ((i = 0; i < 50; i++)); do
1214 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
1217 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
1218 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1219 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
# With 0x160c set, the first run is expected to end in status 'stopped'.
1220 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1221 mdd.${MDT_DEV}.lfsck_namespace |
1222 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1224 error "(5) unexpected status"
1227 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1228 echo "Prepared at $(date)."
# First speed limit, handed to lfsck_start through "-s" (no "-r" this time).
1230 local BASE_SPEED1=50
1232 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
1235 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1236 [ "$STATUS" == "scanning-phase2" ] ||
1237 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
1239 local SPEED=$($SHOW_NAMESPACE |
1240 awk '/^average_speed_phase2/ { print $2 }')
1241 # There may be time error, normally it should be less than 2 seconds.
1242 # We allow another 20% schedule error.
1244 # MAX_MARGIN = 1.3 = 13 / 10
1245 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1246 RUN_TIME1 * 13 / 10))
1247 [ $SPEED -lt $MAX_SPEED ] || {
1249 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1250 error "(8) Speed $SPEED, expected < $MAX_SPEED"
1253 # adjust speed limit
1254 local BASE_SPEED2=150
1256 do_facet $SINGLEMDS \
1257 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1260 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
1261 # MIN_MARGIN = 0.7 = 7 / 10
1262 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1263 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1264 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
# On a non-ldiskfs MDT the shortfall is reported through error_ignore LU-5624.
# NOTE(review): the command that receives the "(9.2)" message is not visible
# in this block -- confirm it is a hard error for ldiskfs.
1265 [ $SPEED -gt $MIN_SPEED ] || {
1266 if [ $mds1_FSTYPE != ldiskfs ]; then
1267 error_ignore LU-5624 \
1268 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1271 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
1275 # MAX_MARGIN = 1.3 = 13 / 10
1276 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1277 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1278 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1279 [ $SPEED -lt $MAX_SPEED ] || {
1281 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1282 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1283 error "(10) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit to 0 on every MDT and OST node (presumably "no
# limit" -- confirm) before waiting for the scan to complete.
1286 do_nodes $(comma_list $(mdts_nodes)) \
1287 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1288 do_nodes $(comma_list $(osts_nodes)) \
1289 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1290 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1291 mdd.${MDT_DEV}.lfsck_namespace |
1292 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1294 error "(11) unexpected status"
1297 run_test 9b "LFSCK speed control (2)"
# Body of the test registered below as 10: the filesystem must stay usable
# while a namespace LFSCK is scanning.
# Visible steps: skip unless the MDT backend is ldiskfs; create entries
# d<i>/f<i> for i in 0..999 (even i under fail_loc 0x1603, odd i under
# 0x1604); add a hard link; remount the client; start the namespace LFSCK
# with "-r -s 100"; run ls/touch/mkdir/unlink/rm/mv/ln/ln -s while the status
# is 'scanning-phase1'; then set the speed limit to 0 and wait for
# 'completed'.
1301 [[ $mds1_FSTYPE == ldiskfs ]] || skip "lookup(..)/linkea on ZFS issue"
1305 echo "Preparing more files with error at $(date)."
1306 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1307 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
# Even indexes: 0, 2, ..., 998.
1309 for ((i = 0; i < 1000; i = $((i+2)))); do
1310 mkdir -p $DIR/$tdir/d${i}
1311 touch $DIR/$tdir/f${i}
1312 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1315 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1316 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
# Odd indexes: 1, 3, ..., 999.
1318 for ((i = 1; i < 1000; i = $((i+2)))); do
1319 mkdir -p $DIR/$tdir/d${i}
1320 touch $DIR/$tdir/f${i}
1321 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1324 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1325 echo "Prepared at $(date)."
1327 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
# Remount the client before starting the scan.
1329 umount_client $MOUNT
1330 mount_client $MOUNT || error "(3) Fail to start client!"
1332 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1335 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1336 [ "$STATUS" == "scanning-phase1" ] ||
1337 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Client operations issued while the scan is (expected to be) running.
1339 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1341 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1343 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1345 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1347 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1349 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1351 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1353 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1354 error "(14) Fail to softlink!"
# The scan must still be in phase 1 after the operations above.
1356 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1357 [ "$STATUS" == "scanning-phase1" ] ||
1358 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
# Set lfsck_speed_limit to 0 on every MDT and OST node (presumably "no
# limit" -- confirm) and wait for the scan to complete.
1360 do_nodes $(comma_list $(mdts_nodes)) \
1361 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1362 do_nodes $(comma_list $(osts_nodes)) \
1363 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1364 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1365 mdd.${MDT_DEV}.lfsck_namespace |
1366 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1368 error "(16) unexpected status"
1371 run_test 10 "System is available during LFSCK scanning"
# Remove LAST_ID (and the d0/0 entry) below O/<idx> on the locally mounted
# backing filesystem of OST <ost>.
# Returns 1 when mount_fstype fails and 2 when unmount_fstype fails.
# NOTE(review): $ost and $idx are read here but their assignments are not
# visible in this block; callers pass two positional arguments
# (e.g. "ost_remove_lastid 1 0") -- confirm they map to ost and idx.
1374 ost_remove_lastid() {
# Command prefix used to run the removal on the OST facet.
1377 local rcmd="do_facet ost${ost}"
1379 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1381 # step 1: local mount
1382 mount_fstype ost${ost} || return 1
1383 # step 2: remove the specified LAST_ID
# Brace expansion yields two paths: .../LAST_ID and .../d0/0.
1384 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
1386 unmount_fstype ost${ost} || return 2
# Body of the test registered below as 11a: the layout LFSCK on an OST must
# rebuild a LAST_ID file that was removed.
# Visible steps: require MDS version > 2.5.55; create 64 single-stripe files
# on OST index 0; remove LAST_ID with ost_remove_lastid; start ost1 with
# $MOUNT_OPTS_NOSCRUB; set fail_val=3 fail_loc=0x160e; start the layout LFSCK
# on the OST and wait for flags 'crashed_lastid'; clear the fail settings;
# wait for 'completed' and require empty flags.
# NOTE(review): stopping ost1 before LAST_ID is removed is presumably done on
# lines not visible here (error step (3) is also not visible) -- confirm.
1390 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1391 skip "MDS older than 2.5.55, LU-1267"
1393 check_mount_and_prep
1394 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1395 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
1400 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
1402 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1403 error "(2) Fail to start ost1"
1405 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1406 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1408 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1409 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
# While the fail_loc is set, the flags line is expected to report
# 'crashed_lastid'.
1411 wait_update_facet ost1 "$LCTL get_param -n \
1412 obdfilter.${OST_DEV}.lfsck_layout |
1413 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1415 error "(5) unexpected status"
1418 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1420 wait_update_facet ost1 "$LCTL get_param -n \
1421 obdfilter.${OST_DEV}.lfsck_layout |
1422 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1424 error "(6) unexpected status"
1427 echo "the LAST_ID(s) should have been rebuilt"
# After completion no flag may remain set.
1428 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1429 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
1431 run_test 11a "LFSCK can rebuild lost last_id"
# Body of the test registered below as 11b: the layout LFSCK on an OST must
# rebuild an on-disk LAST_ID that lags behind the IDs actually used.
# Visible steps: require MDS version > 2.5.55; set fail_loc=0x160d on ost1;
# create $count + 32 files; record prealloc_last_seq / prealloc_last_id from
# the MDT's osp device; restart ost1 with fail_loc=0x215; check the OST's
# last_id is smaller than the recorded id; run the layout LFSCK on the OST;
# restart ost1 again and require last_id >= the recorded id.
1434 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1435 skip "MDS older than 2.5.55, LU-1267"
1437 check_mount_and_prep
1438 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1440 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1441 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1442 do_facet ost1 $LCTL set_param fail_loc=0x160d
# NOTE(review): presumably the number of objects already precreated on OST
# index 0, so that the createmany below goes past them -- confirm against
# precreated_ost_obj_count.
1444 local count=$(precreated_ost_obj_count 0 0)
1446 createmany -o $DIR/$tdir/f $((count + 32))
# Sequence and last ID as seen by the MDT for OST0000.
1448 local proc_path="${FSNAME}-OST0000-osc-MDT0000"
1449 local seq=$(do_facet mds1 $LCTL get_param -n \
1450 osp.${proc_path}.prealloc_last_seq)
1451 local id_used=$(do_facet mds1 $LCTL get_param -n \
1452 osp.${proc_path}.prealloc_last_id)
1454 umount_client $MOUNT
1455 stop ost1 || error "(1) Fail to stop ost1"
1457 #define OBD_FAIL_OST_ENOSPC 0x215
1458 do_facet ost1 $LCTL set_param fail_loc=0x215
1460 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1461 error "(2) Fail to start ost1"
# Poll (at most 60 iterations) until the OST reports a last_id entry whose
# line matches $seq; the value after the ':' is kept in id_ost1.
1463 for ((i = 0; i < 60; i++)); do
1464 id_ost1=$(do_facet ost1 \
1465 "$LCTL get_param -n obdfilter.$ost1_svc.last_id" |
1466 awk -F: "/$seq/ { print \$2 }")
1467 [ -n "$id_ost1" ] && break
1471 echo "the on-disk LAST_ID should be smaller than the expected one"
1472 [ $id_used -gt $id_ost1 ] ||
1473 error "(4) expect id_used '$id_used' > id_ost1 '$id_ost1'"
1475 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1476 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1478 wait_update_facet ost1 \
1479 "$LCTL get_param -n obdfilter.$ost1_svc.lfsck_layout |
1480 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1482 error "(6) unexpected status"
# Restart ost1 so that last_id is read again after the LFSCK run.
1485 stop ost1 || error "(7) Fail to stop ost1"
1487 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1488 error "(8) Fail to start ost1"
1490 echo "the on-disk LAST_ID should have been rebuilt"
1491 # last_id may be larger than $id_used if objects were created/skipped
1492 wait_update_facet_cond ost1 \
1493 "$LCTL get_param -n obdfilter.$ost1_svc.last_id |
1494 awk -F: '/$seq/ { print \\\$2 }'" "-ge" "$id_used" 60 || {
1495 do_facet ost1 $LCTL get_param obdfilter.$ost1_svc.last_id
1496 error "(9) expect last_id >= id_used $seq:$id_used"
1499 do_facet ost1 $LCTL set_param fail_loc=0
1500 stopall || error "(10) Fail to stopall"
1502 run_test 11b "LFSCK can rebuild crashed last_id"
# Body of the test registered below as 12a: one lctl command ("-A") must
# start and stop LFSCK on all targets.
# Visible steps: require >= 2 MDTs and MDS version > 2.5.55; create one
# directory per MDT with 100 files each; for the namespace LFSCK and then
# for the layout LFSCK: start with "-A -s 1 -r", expect 'scanning-phase1',
# stop with "-A", expect 'stopped', restart with "-A -s 0 -r", expect
# 'completed'. For layout, every OST is also checked for 'stopped'.
# NOTE(review): the trailing number passed to wait_all_targets /
# wait_all_targets_blocked is presumably the step number used in that
# helper's error message -- confirm.
1505 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
1506 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1507 skip "MDS older than 2.5.55, LU-3950"
1509 check_mount_and_prep
# Directory ${k} is created on MDT index k - 1.
1510 for k in $(seq $MDSCOUNT); do
1511 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1512 createmany -o $DIR/$tdir/${k}/f 100 ||
1513 error "(0) Fail to create 100 files."
1516 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1517 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1518 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1520 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1521 wait_all_targets namespace scanning-phase1 3
1523 echo "Stop namespace LFSCK on all targets by single lctl command."
1524 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1525 error "(4) Fail to stop LFSCK on all devices!"
1527 echo "All the LFSCK targets should be in 'stopped' status."
1528 wait_all_targets_blocked namespace stopped 5
1530 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1531 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1532 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1534 echo "All the LFSCK targets should be in 'completed' status."
1535 wait_all_targets_blocked namespace completed 7
# The layout half of the test runs between start/stop_full_debug_logging.
1537 start_full_debug_logging
1539 echo "Start layout LFSCK on all targets by single command (-s 1)."
1540 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1541 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1543 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1544 wait_all_targets layout scanning-phase1 9
1546 echo "Stop layout LFSCK on all targets by single lctl command."
1547 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1548 error "(10) Fail to stop LFSCK on all devices!"
1550 echo "All the LFSCK targets should be in 'stopped' status."
1551 wait_all_targets_blocked layout stopped 11
# Each OST must report 'stopped' for its own lfsck_layout as well.
1553 for k in $(seq $OSTCOUNT); do
1554 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1555 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1556 awk '/^status/ { print $2 }')
1557 [ "$STATUS" == "stopped" ] ||
1558 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1561 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1562 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1563 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1565 echo "All the LFSCK targets should be in 'completed' status."
1566 wait_all_targets_blocked layout completed 14
1568 stop_full_debug_logging
1570 run_test 12a "single command to trigger LFSCK on all devices"
# Body of the test registered below as 12b: lfsck_start must work without
# "-M" when the device can be detected automatically.
# Visible steps: require MDS version > 2.5.55; start LFSCK with "-A -r" and
# no "-M"; wait until namespace and layout LFSCK report 'completed'; when
# mds1 hosts more than one device whose name (3rd column of "lctl dl")
# contains "mdt", a start without "-M"/"-A" is expected to fail.
1573 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1574 skip "MDS older than 2.5.55, LU-3950"
1576 check_mount_and_prep
1578 echo "Start LFSCK without '-M' specified."
1579 do_facet mds1 $LCTL lfsck_start -A -r ||
1580 error "(0) Fail to start LFSCK without '-M'"
1582 wait_all_targets_blocked namespace completed 1
1583 wait_all_targets_blocked layout completed 2
# Count the matching devices directly with grep -c instead of piping the
# matches through wc -l.
1585 local count=$(do_facet mds1 $LCTL dl |
1586 awk '{ print $3 }' | grep -c mdt)
1587 if [ $count -gt 1 ]; then
1589 echo "Start layout LFSCK on the node with multiple targets,"
1590 echo "but not specify '-M'/'-A' option. Should get failure."
# A successful start is the failure case here; "|| true" keeps the status
# of the expected lfsck_start failure from propagating.
1592 do_facet mds1 $LCTL lfsck_start -t layout -r &&
1593 error "(3) Start layout LFSCK should fail" || true
1596 run_test 12b "auto detect Lustre device"
# Body of the test registered below as 13: the layout LFSCK must repair a
# bad lmm_oi in the layout EA.
# Visible steps: require MDS version > 2.5.55; with fail_loc=0x160f set,
# create one plain file (createmany) and one PFL file ($DIR/$tdir/f1); run
# the layout LFSCK; expect repaired_others to be exactly 2.
1599 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1600 skip "MDS older than 2.5.55, LU-3593"
1603 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1604 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1605 echo "MDT-object FID."
1608 check_mount_and_prep
1610 echo "Inject failure stub to simulate bad lmm_oi"
1611 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1612 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
1613 createmany -o $DIR/$tdir/f 1
1614 $LFS setstripe -E 1M -S 1M -E -1 $DIR/$tdir/f1 ||
1615 error "(0) Fail to create PFL $DIR/$tdir/f1"
1616 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1618 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1619 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1621 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1622 mdd.${MDT_DEV}.lfsck_layout |
1623 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1625 error "(2) unexpected status"
# Two files were created while the fail_loc was set, hence 2 repairs.
1628 local repaired=$($SHOW_LAYOUT |
1629 awk '/^repaired_others/ { print $2 }')
1630 [ $repaired -eq 2 ] ||
1631 error "(3) Fail to repair crashed lmm_oi: $repaired"
1633 run_test 13 "LFSCK can repair crashed lmm_oi"
# Body of the test registered below as 14a: the layout LFSCK must detect
# dangling LOV EA references and, when started with "-c", repair them.
# Visible steps: require MDS version > 2.5.55; with fail_loc=0x1610 set on
# ost1 create $count + 16 plain files, 16 composite files and two guard
# files; verify 'ls' fails; run the layout LFSCK with "-r" and expect
# repaired_dangling >= 32 while 'stat' on the guards still fails; run it
# again with "-r -c" and wait until 'stat' reports Size 0 for both guards.
1636 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1637 skip "MDS older than 2.5.55, LU-3590"
1640 echo "The OST-object referenced by the MDT-object should be there;"
1641 echo "otherwise, the LFSCK should re-create the missing OST-object."
1642 echo "without '--delay-create-ostobj' option."
1645 check_mount_and_prep
1646 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1648 echo "Inject failure stub to simulate dangling referenced MDT-object"
1649 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1650 do_facet ost1 $LCTL set_param fail_loc=0x1610
# NOTE(review): presumably the number of objects already precreated on OST
# index 0 -- confirm against precreated_ost_obj_count.
1651 local count=$(precreated_ost_obj_count 0 0)
1653 createmany -o $DIR/$tdir/f $((count + 16)) ||
1654 error "(0.1) Fail to create $DIR/$tdir/fx"
1655 touch $DIR/$tdir/guard0
1657 for ((i = 0; i < 16; i++)); do
1658 $LFS setstripe -E 512K -S 256K -o 0 -E 2M \
1659 $DIR/$tdir/f_comp${i} ||
1660 error "(0.2) Fail to create $DIR/$tdir/f_comp${i}"
1662 touch $DIR/$tdir/guard1
1664 do_facet ost1 $LCTL set_param fail_loc=0
1666 start_full_debug_logging
1668 # exhaust other pre-created dangling cases
1669 count=$(precreated_ost_obj_count 0 0)
1670 createmany -o $DIR/$tdir/a $count ||
1671 error "(0.5) Fail to create $count files."
1673 echo "'ls' should fail because of dangling referenced MDT-object"
# A successful 'ls' is the failure case here.
1674 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
1676 echo "Trigger layout LFSCK to find out dangling reference"
1677 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1679 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1680 mdd.${MDT_DEV}.lfsck_layout |
1681 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1683 error "(3) unexpected status"
1686 local repaired=$($SHOW_LAYOUT |
1687 awk '/^repaired_dangling/ { print $2 }')
1688 [ $repaired -ge 32 ] ||
1689 error "(4) Fail to repair dangling reference: $repaired"
1691 echo "'stat' should fail because of not repair dangling by default"
1692 stat $DIR/$tdir/guard0 > /dev/null 2>&1 &&
1693 error "(5.1) stat should fail"
1694 stat $DIR/$tdir/guard1 > /dev/null 2>&1 &&
1695 error "(5.2) stat should fail"
1697 echo "Trigger layout LFSCK to repair dangling reference"
# Second run adds "-c" to the lfsck_start options.
1698 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1700 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1701 mdd.${MDT_DEV}.lfsck_layout |
1702 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1704 error "(7) unexpected status"
1707 # Some async LFSCK updates may still be in progress, so wait for
1708 # a while until the target has actually been repaired. LU-4970.
1710 echo "'stat' should success after layout LFSCK repairing"
1711 wait_update_facet client "stat $DIR/$tdir/guard0 |
1712 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1713 stat $DIR/$tdir/guard0
1715 error "(8.1) unexpected size"
1718 wait_update_facet client "stat $DIR/$tdir/guard1 |
1719 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1720 stat $DIR/$tdir/guard1
1722 error "(8.2) unexpected size"
1725 repaired=$($SHOW_LAYOUT |
1726 awk '/^repaired_dangling/ { print $2 }')
1727 [ $repaired -ge 32 ] ||
1728 error "(9) Fail to repair dangling reference: $repaired"
1730 stop_full_debug_logging
1732 echo "stopall to cleanup object cache"
1735 setupall > /dev/null
1737 run_test 14a "LFSCK can repair MDT-object with dangling LOV EA reference (1)"
# Body of the test registered below as 14b: same scenario as the dangling
# LOV EA reference test, but every lfsck_start here also passes "-o -d".
# Visible steps: require MDS version > 2.5.55; with fail_loc=0x1610 set on
# ost1 create $count + 31 files and a guard file; verify 'ls' fails; run the
# layout LFSCK with "-r -o -d" and expect repaired_dangling >= 32 while
# 'stat' on the guard still fails; run it again with "-r -o -c -d" and wait
# until 'stat' reports Size 0 for the guard.
1740 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1741 skip "MDS older than 2.5.55, LU-3590"
1744 echo "The OST-object referenced by the MDT-object should be there;"
1745 echo "otherwise, the LFSCK should re-create the missing OST-object."
1746 echo "with '--delay-create-ostobj' option."
1749 check_mount_and_prep
1750 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1752 echo "Inject failure stub to simulate dangling referenced MDT-object"
1753 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1754 do_facet ost1 $LCTL set_param fail_loc=0x1610
# NOTE(review): presumably the number of objects already precreated on OST
# index 0 -- confirm against precreated_ost_obj_count.
1755 local count=$(precreated_ost_obj_count 0 0)
1757 createmany -o $DIR/$tdir/f $((count + 31))
1758 touch $DIR/$tdir/guard
1759 do_facet ost1 $LCTL set_param fail_loc=0
1761 start_full_debug_logging
1763 # exhaust other pre-created dangling cases
1764 count=$(precreated_ost_obj_count 0 0)
1765 createmany -o $DIR/$tdir/a $count ||
1766 error "(0) Fail to create $count files."
1768 echo "'ls' should fail because of dangling referenced MDT-object"
# A successful 'ls' is the failure case here.
1769 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
1771 echo "Trigger layout LFSCK to find out dangling reference"
1772 $START_LAYOUT -r -o -d || error "(2) Fail to start LFSCK for layout!"
1774 wait_all_targets_blocked layout completed 3
1776 local repaired=$($SHOW_LAYOUT |
1777 awk '/^repaired_dangling/ { print $2 }')
1778 [ $repaired -ge 32 ] ||
1779 error "(4) Fail to repair dangling reference: $repaired"
1781 echo "'stat' should fail because of not repair dangling by default"
1782 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
1784 echo "Trigger layout LFSCK to repair dangling reference"
# Second run adds "-c" to the lfsck_start options.
1785 $START_LAYOUT -r -o -c -d || error "(6) Fail to start LFSCK for layout!"
1787 wait_all_targets_blocked layout completed 7
1789 # Some async LFSCK updates may still be in progress, so wait for
1790 # a while until the target has actually been repaired. LU-4970.
1792 echo "'stat' should success after layout LFSCK repairing"
1793 wait_update_facet client "stat $DIR/$tdir/guard |
1794 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1795 stat $DIR/$tdir/guard
1797 error "(8) unexpected size"
1800 repaired=$($SHOW_LAYOUT |
1801 awk '/^repaired_dangling/ { print $2 }')
1802 [ $repaired -ge 32 ] ||
1803 error "(9) Fail to repair dangling reference: $repaired"
1805 stop_full_debug_logging
1807 echo "stopall to cleanup object cache"
1810 setupall > /dev/null
1812 run_test 14b "LFSCK can repair MDT-object with dangling LOV EA reference (2)"
# Body of the test registered below as 15a: the layout LFSCK must repair
# OST-objects whose back pointer names a non-existent MDT-object.
# Visible steps: require MDS version > 2.5.55; with fail_loc=0x1611 set on
# all OST nodes write a plain file f0 and a PFL file f1; clear the fail_loc;
# run the layout LFSCK; expect repaired_unmatched_pair >= 3.
# NOTE(review): the target path of the setstripe command below is on a line
# not visible here; the error message suggests $DIR/$tdir/f1 -- confirm.
1815 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1816 skip "MDS older than 2.5.55, LU-3591"
1819 echo "If the OST-object referenced by the MDT-object back points"
1820 echo "to some non-exist MDT-object, then the LFSCK should repair"
1821 echo "the OST-object to back point to the right MDT-object."
1824 check_mount_and_prep
1825 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1827 echo "Inject failure stub to make the OST-object to back point to"
1828 echo "non-exist MDT-object."
1829 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
1831 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
1832 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1833 $LFS setstripe -E 1M -S 256K -c 1 -E -1 -S 512K -c $OSTCOUNT \
1835 error "(0) Fail to create PFL $DIR/$tdir/f1"
1836 # 'dd' first triggers a punch RPC on every OST-object.
1837 # So even if some OST-object is not written by 'dd',
1838 # as long as it is allocated (it may NOT be allocated in pfl_3b)
1839 # its layout information will be set as well.
1840 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4K count=257
1841 cancel_lru_locks osc
1842 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1844 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1845 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1847 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1848 mdd.${MDT_DEV}.lfsck_layout |
1849 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1851 error "(2) unexpected status"
# At least 3 unmatched pairs must have been repaired.
1854 local repaired=$($SHOW_LAYOUT |
1855 awk '/^repaired_unmatched_pair/ { print $2 }')
1856 [ $repaired -ge 3 ] ||
1857 error "(3) Fail to repair unmatched pair: $repaired"
1859 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# Body of the test registered below as 15b: the layout LFSCK must repair
# OST-objects whose back pointer names another MDT-object that does not
# recognize them.
# Visible steps: require MDS version > 2.5.55; write a guard file under
# $DIR/$tdir/0; with fail_loc=0x1612 set on all OST nodes write 0/f0 and a
# PFL file f1; clear the fail_loc; run the layout LFSCK; expect
# repaired_unmatched_pair to be exactly 4.
# NOTE(review): the initial value of 'stripes' and the target path of the
# setstripe command are on lines not visible here -- confirm both.
1862 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1863 skip "MDS older than 2.5.55, LU-3591"
1866 echo "If the OST-object referenced by the MDT-object back points"
1867 echo "to other MDT-object that doesn't recognize the OST-object,"
1868 echo "then the LFSCK should repair it to back point to the right"
1869 echo "MDT-object (the first one)."
1872 check_mount_and_prep
1873 mkdir -p $DIR/$tdir/0
1874 $LFS setstripe -c 1 -i 0 $DIR/$tdir/0
1875 dd if=/dev/zero of=$DIR/$tdir/0/guard bs=1M count=1
1876 cancel_lru_locks osc
1878 echo "Inject failure stub to make the OST-object to back point to"
1879 echo "other MDT-object"
# Use two stripes for the first PFL component when at least 2 OSTs exist.
1882 [ $OSTCOUNT -ge 2 ] && stripes=2
1884 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1885 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1612
1886 dd if=/dev/zero of=$DIR/$tdir/0/f0 bs=1M count=1
1887 $LFS setstripe -E 1M -S 256K -c $stripes -E 2M -S 512K -c 1 \
1889 error "(0) Fail to create PFL $DIR/$tdir/f1"
1890 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2
1891 cancel_lru_locks osc
1892 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1894 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1895 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1897 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1898 mdd.${MDT_DEV}.lfsck_layout |
1899 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1901 error "(2) unexpected status"
# Exactly 4 unmatched pairs must have been repaired.
1904 local repaired=$($SHOW_LAYOUT |
1905 awk '/^repaired_unmatched_pair/ { print $2 }')
1906 [ $repaired -eq 4 ] ||
1907 error "(3) Fail to repair unmatched pair: $repaired"
1909 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# Body of the test registered below as 15c: a layout LFSCK racing with a
# delayed directory migration must not treat the shared layout as a
# multiple-reference case.
# Visible steps: require >= 2 MDTs and an MDS version above 2.5.55 and
# below 2.7.55; create $DIR/$tdir/a1 on MDT index 1 with one file; set
# fail_val=5 fail_loc=0x1803 on mds2; start "lfs migrate -m 0" in the
# background; run the layout LFSCK with "-A -r"; expect
# repaired_unmatched_pair == 1 and repaired_multiple_referenced == 0.
1912 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
1913 (( $MDS1_VERSION < $(version_code 2.7.55) )) ||
1914 skip "MDS newer than 2.7.55, LU-6475"
1915 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1916 skip "MDS older than 2.5.55, LU-3591"
1919 echo "According to current metadata migration implementation,"
1920 echo "before the old MDT-object is removed, both the new MDT-object"
1921 echo "and old MDT-object will reference the same LOV layout. Then if"
1922 echo "the layout LFSCK finds the new MDT-object by race, it will"
1923 echo "regard related OST-object(s) as multiple referenced case, and"
1924 echo "will try to create new OST-object(s) for the new MDT-object."
1925 echo "To avoid such trouble, the layout LFSCK needs to lock the old"
1926 echo "MDT-object before confirm the multiple referenced case."
1929 check_mount_and_prep
1930 $LFS mkdir -i 1 $DIR/$tdir/a1
1931 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1932 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=1
1933 cancel_lru_locks osc
1935 echo "Inject failure stub on MDT1 to delay the migration"
1937 #define OBD_FAIL_MIGRATE_DELAY 0x1803
1938 do_facet mds2 $LCTL set_param fail_val=5 fail_loc=0x1803
1939 echo "Migrate $DIR/$tdir/a1 from MDT1 to MDT0 with delay"
# The migration runs in the background so that the LFSCK below can race
# with it.
1940 $LFS migrate -m 0 $DIR/$tdir/a1 &
1943 echo "Trigger layout LFSCK to race with the migration"
1944 $START_LAYOUT -A -r || error "(1) Fail to start layout LFSCK!"
1946 wait_all_targets_blocked layout completed 2
1948 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
1949 local repaired=$($SHOW_LAYOUT |
1950 awk '/^repaired_unmatched_pair/ { print $2 }')
1951 [ $repaired -eq 1 ] ||
1952 error "(3) Fail to repair unmatched pair: $repaired"
# No multiple-reference repair may have happened.
1954 repaired=$($SHOW_LAYOUT |
1955 awk '/^repaired_multiple_referenced/ { print $2 }')
1956 [ $repaired -eq 0 ] ||
1957 error "(4) Unexpectedly repaired multiple references: $repaired"
1959 run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)"
# Body of the test registered below as 16: the layout LFSCK must repair an
# OST-object whose owner differs from the owner stored in the MDT-object.
# Visible steps: require MDS version > 2.5.55; write $DIR/$tdir/f0; create
# 100 files as $RUNAS under $DIR/$tdir/d1; with fail_loc=0x1613 set on the
# MDS change the owner of f0 to uid 1 / gid 1; run the layout LFSCK; expect
# repaired_inconsistent_owner to be exactly 1.
# NOTE(review): the creation of $DIR/$tdir/d1 is on a line not visible here.
1962 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1963 skip "MDS older than 2.5.55, LU-3594"
1966 echo "If the OST-object's owner information does not match the owner"
1967 echo "information stored in the MDT-object, then the LFSCK trust the"
1968 echo "MDT-object and update the OST-object's owner information."
1971 check_mount_and_prep
1972 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1973 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1974 cancel_lru_locks osc
1976 # created but no setattr or write to the file.
1978 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/d1
1979 $RUNAS createmany -o $DIR/$tdir/d1/o 100 || error "create failed"
1981 echo "Inject failure stub to skip OST-object owner changing"
1982 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
1983 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
# Use the ':' owner/group separator (as on the chown above); the '.' form
# is obsolescent and not required by POSIX.
1984 chown 1:1 $DIR/$tdir/f0
1985 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1987 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
1990 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1992 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1993 mdd.${MDT_DEV}.lfsck_layout |
1994 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1996 error "(2) unexpected status"
# Only the single chown done under the fail_loc should need repair.
1999 local repaired=$($SHOW_LAYOUT |
2000 awk '/^repaired_inconsistent_owner/ { print $2 }')
2001 [ $repaired -eq 1 ] ||
2002 error "(3) Fail to repair inconsistent owner: $repaired"
2004 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# Body of the test registered below as 17: the layout LFSCK must give each
# MDT-object its own OST-object when several reference the same one.
# Visible steps: require MDS version > 2.5.55; with fail_val=0
# fail_loc=0x1614 set on the MDS write 'guard' (1 MiB), create f0 and a PFL
# file; f0 and f1 must then report guard's size (1048576); run the layout
# LFSCK and expect repaired_multiple_referenced == 2; afterwards writing
# 2 MiB to f0 and to f1 must leave guard's size at 1048576.
# NOTE(review): the target path of the PFL setstripe command is on a line
# not visible here; the error message suggests $DIR/$tdir/f1 -- confirm.
2007 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2008 skip "MDS older than 2.5.55, LU-3594"
2011 echo "If more than one MDT-objects reference the same OST-object,"
2012 echo "and the OST-object only recognizes one MDT-object, then the"
2013 echo "LFSCK should create new OST-objects for such non-recognized"
2017 check_mount_and_prep
2018 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2020 echo "Inject failure stub to make two MDT-objects to reference"
2021 echo "the OST-object"
2023 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
2024 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
2025 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
2026 cancel_lru_locks mdc
2027 cancel_lru_locks osc
2029 createmany -o $DIR/$tdir/f 1
2030 cancel_lru_locks mdc
2031 cancel_lru_locks osc
2033 $LFS setstripe -E 2M -S 256K -o 0 -E 4M -S 512K -o 0 \
2035 error "(0) Fail to create PFL $DIR/$tdir/f1"
2036 cancel_lru_locks mdc
2037 cancel_lru_locks osc
2038 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
2040 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
2041 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard use the same OST-objects"
# Field 5 of 'ls -l' is the file size in bytes.
2042 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
2043 [ $size -eq 1048576 ] ||
2044 error "(1.1) f0 (wrong) size should be 1048576, but got $size"
2046 size=$(ls -l $DIR/$tdir/f1 | awk '{ print $5 }')
2047 [ $size -eq 1048576 ] ||
2048 error "(1.2) f1 (wrong) size should be 1048576, but got $size"
2050 echo "Trigger layout LFSCK to find out multiple referenced MDT-objects"
2053 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
2055 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2056 mdd.${MDT_DEV}.lfsck_layout |
2057 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2059 error "(3) unexpected status"
# One repair is expected for each of f0 and f1.
2062 local repaired=$($SHOW_LAYOUT |
2063 awk '/^repaired_multiple_referenced/ { print $2 }')
2064 [ $repaired -eq 2 ] ||
2065 error "(4) Fail to repair multiple references: $repaired"
2067 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
# Growing f0 to 2 MiB must not change guard's size any more.
2068 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
2069 error "(5) Fail to write f0."
2070 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
2071 [ $size -eq 1048576 ] ||
2072 error "(6) guard size should be 1048576, but got $size"
2074 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard should use diff OST-objects"
2075 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2 ||
2076 error "(7) Fail to write f1."
2077 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
2078 [ $size -eq 1048576 ] ||
2079 error "(8) guard size should be 1048576, but got $size"
2081 run_test 17 "LFSCK can repair multiple references"
# Add "cache" to the debug setting on the node running this script; the
# command's stdout is discarded.
# NOTE(review): presumably intended to capture extra debug output for the
# following tests - confirm.
2083 $LCTL set_param debug=+cache > /dev/null
# Body of the test registered below as "18a": the layout EA of existing files
# is made to disappear via fail_loc 0x1615, and the layout LFSCK (run with
# -r -o) is expected to rebuild it so the files report their original sizes.
# The test is skipped unless the MDS version is newer than 2.5.55.
2086 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2087 skip "MDS older than 2.5.55, LU-3336"
# Log the scenario being exercised.
2090 echo "The target MDT-object is there, but related stripe information"
2091 echo "is lost or partly lost. The LFSCK should regenerate the missing"
2092 echo "layout EA entries."
# a1 is created with "lfs mkdir -i 0" and holds f1 (one stripe, 2 MiB).
2095 check_mount_and_prep
2096 $LFS mkdir -i 0 $DIR/$tdir/a1
2097 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2098 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# With "ls -il" the inode number comes first, so the size is field 6.
2100 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
# Print FID and layout into the test log.
2102 $LFS path2fid $DIR/$tdir/a1/f1
2103 $LFS getstripe $DIR/$tdir/a1/f1
# With two or more MDTs, also create a2 with "lfs mkdir -i 1" holding f2
# (two stripes, 2 MiB).
2105 if [ $MDSCOUNT -ge 2 ]; then
2106 $LFS mkdir -i 1 $DIR/$tdir/a2
2107 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2108 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2109 $LFS path2fid $DIR/$tdir/a2/f2
2110 $LFS getstripe $DIR/$tdir/a2/f2
# f3 is a two-component PFL file, also filled with 2 MiB.
2113 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2114 error "(0) Fail to create PFL $DIR/$tdir/f3"
2116 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2118 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2120 $LFS path2fid $DIR/$tdir/f3
2121 $LFS getstripe $DIR/$tdir/f3
2123 cancel_lru_locks osc
# With the fail_loc armed, each chown is the operation that triggers the
# injected loss of the layout EA (per the echo below).
2125 echo "Inject failure, to make the MDT-object lost its layout EA"
2126 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
2127 do_facet mds1 $LCTL set_param fail_loc=0x1615
2128 chown 1.1 $DIR/$tdir/a1/f1
2130 if [ $MDSCOUNT -ge 2 ]; then
2131 do_facet mds2 $LCTL set_param fail_loc=0x1615
2132 chown 1.1 $DIR/$tdir/a2/f2
2135 chown 1.1 $DIR/$tdir/f3
# Disarm the injection on every MDS that was armed.
2140 do_facet mds1 $LCTL set_param fail_loc=0
2141 if [ $MDSCOUNT -ge 2 ]; then
2142 do_facet mds2 $LCTL set_param fail_loc=0
2145 cancel_lru_locks mdc
2146 cancel_lru_locks osc
# Sanity check of the injection: the sizes must now differ from the saved
# ones.
2148 echo "The file size should be incorrect since layout EA is lost"
2149 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2150 [ "$cur_size" != "$saved_size1" ] ||
2151 error "(1) Expect incorrect file1 size"
# f2 is compared against saved_size1 because it was written with the same
# dd parameters as f1.
2153 if [ $MDSCOUNT -ge 2 ]; then
2154 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2155 [ "$cur_size" != "$saved_size1" ] ||
2156 error "(2) Expect incorrect file2 size"
2159 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2160 [ "$cur_size" != "$saved_size2" ] ||
2161 error "(1.2) Expect incorrect file3 size"
# Start the layout LFSCK with -r -o, then wait on every MDS for "completed".
2163 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2164 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
2166 for k in $(seq $MDSCOUNT); do
2167 # The LFSCK status query internal is 30 seconds. For the case
2168 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2169 # time to guarantee the status sync up.
2170 wait_update_facet mds${k} "$LCTL get_param -n \
2171 mdd.$(facet_svc mds${k}).lfsck_layout |
2172 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2173 error "(4) MDS${k} is not the expected 'completed'"
# Every OST must report "completed" as well; this is checked once, without
# waiting.
2176 for k in $(seq $OSTCOUNT); do
2177 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2178 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2179 awk '/^status/ { print $2 }')
2180 [ "$cur_status" == "completed" ] ||
2181 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# mds1 must report 3 repaired orphans and, when present, mds2 must report 2.
2184 local repaired=$(do_facet mds1 $LCTL get_param -n \
2185 mdd.$(facet_svc mds1).lfsck_layout |
2186 awk '/^repaired_orphan/ { print $2 }')
2187 [ $repaired -eq 3 ] ||
2188 error "(6.1) Expect 3 fixed on mds1, but got: $repaired"
2190 if [ $MDSCOUNT -ge 2 ]; then
2191 repaired=$(do_facet mds2 $LCTL get_param -n \
2192 mdd.$(facet_svc mds2).lfsck_layout |
2193 awk '/^repaired_orphan/ { print $2 }')
2194 [ $repaired -eq 2 ] ||
2195 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
# Print FIDs and layouts again after the repair.
2198 $LFS path2fid $DIR/$tdir/a1/f1
2199 $LFS getstripe $DIR/$tdir/a1/f1
2201 if [ $MDSCOUNT -ge 2 ]; then
2202 $LFS path2fid $DIR/$tdir/a2/f2
2203 $LFS getstripe $DIR/$tdir/a2/f2
2206 $LFS path2fid $DIR/$tdir/f3
2207 $LFS getstripe $DIR/$tdir/f3
# The sizes must match the values saved before the injection.
2209 echo "The file size should be correct after layout LFSCK scanning"
2210 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2211 [ "$cur_size" == "$saved_size1" ] ||
2212 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2214 if [ $MDSCOUNT -ge 2 ]; then
2215 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2216 [ "$cur_size" == "$saved_size1" ] ||
2217 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# NOTE(review): the message below says "file1" although the file checked is
# f3 - looks like a copy/paste slip in the message text.
2220 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2221 [ "$cur_size" == "$saved_size2" ] ||
2222 error "(9) Expect file1 size $saved_size2, but got $cur_size"
# Register the test with the framework.
2224 run_test 18a "Find out orphan OST-object and repair it (1)"
# Body of the test registered below as "18b": files are removed while
# fail_loc 0x1616 is armed, a dry-run layout LFSCK must only report the
# orphans, and a real run must recreate the files under
# .lustre/lost+found/MDTxxxx, from where they are moved back and size-checked.
# Skipped when FILESET is set or the MDS version is not newer than 2.5.55.
2227 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2228 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2229 skip "MDS older than 2.5.55, LU-3336"
# Log the scenario being exercised.
2232 echo "The target MDT-object is lost. The LFSCK should re-create the"
2233 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
2234 echo "can move it back to normal namespace manually."
# a1/f1: one stripe, 2 MiB. Its size (field 6 of "ls -il") and FID are saved.
2237 check_mount_and_prep
2238 $LFS mkdir -i 0 $DIR/$tdir/a1
2239 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2240 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2241 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2242 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2244 $LFS getstripe $DIR/$tdir/a1/f1
# With two or more MDTs: a2/f2 with two stripes, 2 MiB.
# NOTE(review): fid2 is assigned without "local" in the visible lines.
2246 if [ $MDSCOUNT -ge 2 ]; then
2247 $LFS mkdir -i 1 $DIR/$tdir/a2
2248 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2249 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2250 fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
2252 $LFS getstripe $DIR/$tdir/a2/f2
# f3: two-component PFL file, 2 MiB.
2255 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2256 error "(0) Fail to create PFL $DIR/$tdir/f3"
2258 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2260 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2261 local fid3=$($LFS path2fid $DIR/$tdir/f3)
2263 $LFS getstripe $DIR/$tdir/f3
2265 cancel_lru_locks osc
# Remove the files while the fail_loc is armed on the corresponding MDS.
# NOTE(review): f3 is expected under lost+found later, but no removal of f3
# is visible in this listing - confirm against the full file.
2267 echo "Inject failure, to simulate the case of missing the MDT-object"
2268 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2269 do_facet mds1 $LCTL set_param fail_loc=0x1616
2270 rm -f $DIR/$tdir/a1/f1
2272 if [ $MDSCOUNT -ge 2 ]; then
2273 do_facet mds2 $LCTL set_param fail_loc=0x1616
2274 rm -f $DIR/$tdir/a2/f2
# Disarm the injection.
2282 do_facet mds1 $LCTL set_param fail_loc=0
2283 if [ $MDSCOUNT -ge 2 ]; then
2284 do_facet mds2 $LCTL set_param fail_loc=0
2287 cancel_lru_locks mdc
2288 cancel_lru_locks osc
# First pass: dry run. The reported parameters must be exactly
# "dryrun,all_targets,orphan" and mds1 must count 3 inconsistent orphans.
2290 # dryrun mode only check orphans, not repaie
2291 echo "Trigger layout LFSCK --dryrun to find out orphan OST-object"
2292 $START_LAYOUT --dryrun -o -r ||
2293 error "Fail to start layout LFSCK in dryrun mode"
2294 wait_all_targets_blocked layout completed 2
2296 local PARAMS=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
2297 [ "$PARAMS" == "dryrun,all_targets,orphan" ] ||
2298 error "Expect 'dryrun,all_targets,orphan', got '$PARAMS'"
2300 local orphans=$(do_facet mds1 $LCTL get_param -n \
2301 mdd.$(facet_svc mds1).lfsck_layout |
2302 awk '/^inconsistent_orphan/ { print $2 }')
2303 [ $orphans -eq 3 ] ||
2304 error "Expect 3 found on mds1, but got: $orphans"
# After the dry run every entry under lost+found must still be empty.
2306 # orphan parents should not be created
2308 for subdir in $MOUNT/.lustre/lost+found/*; do
2309 [ ! "$(ls -A $subdir)" ] || error "$subdir not empty"
# Second pass: real repair, then wait on every MDS for "completed".
2312 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2313 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2315 for k in $(seq $MDSCOUNT); do
2316 # The LFSCK status query internal is 30 seconds. For the case
2317 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2318 # time to guarantee the status sync up.
2319 wait_update_facet mds${k} "$LCTL get_param -n \
2320 mdd.$(facet_svc mds${k}).lfsck_layout |
2321 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2322 error "(2) MDS${k} is not the expected 'completed'"
# Every OST must report "completed" too (single check, no waiting).
2325 for k in $(seq $OSTCOUNT); do
2326 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2327 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2328 awk '/^status/ { print $2 }')
2329 [ "$cur_status" == "completed" ] ||
2330 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# mds1 must report 3 repaired orphans and, when present, mds2 must report 2.
2333 local repaired=$(do_facet mds1 $LCTL get_param -n \
2334 mdd.$(facet_svc mds1).lfsck_layout |
2335 awk '/^repaired_orphan/ { print $2 }')
2336 [ $repaired -eq 3 ] ||
2337 error "(4.1) Expect 3 fixed on mds1, but got: $repaired"
2339 if [ $MDSCOUNT -ge 2 ]; then
2340 repaired=$(do_facet mds2 $LCTL get_param -n \
2341 mdd.$(facet_svc mds2).lfsck_layout |
2342 awk '/^repaired_orphan/ { print $2 }')
2343 [ $repaired -eq 2 ] ||
2344 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
# The recreated files are looked up as "<fid>-R-0" and moved back to their
# original paths.
2347 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2348 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2349 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2351 if [ $MDSCOUNT -ge 2 ]; then
2352 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
2353 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
# NOTE(review): this failure message reuses the "(5)" tag already used for
# fid1 above, which makes the two failures hard to tell apart.
2356 mv $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0 $DIR/$tdir/f3 ||
2357 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
# Print FIDs and layouts of the restored files.
2359 $LFS path2fid $DIR/$tdir/a1/f1
2360 $LFS getstripe $DIR/$tdir/a1/f1
2362 if [ $MDSCOUNT -ge 2 ]; then
2363 $LFS path2fid $DIR/$tdir/a2/f2
2364 $LFS getstripe $DIR/$tdir/a2/f2
2367 $LFS path2fid $DIR/$tdir/f3
2368 $LFS getstripe $DIR/$tdir/f3
# The sizes must match the values saved before the files were removed.
2370 echo "The file size should be correct after layout LFSCK scanning"
2371 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2372 [ "$cur_size" == "$saved_size1" ] ||
2373 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2375 if [ $MDSCOUNT -ge 2 ]; then
2376 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2377 [ "$cur_size" == "$saved_size1" ] ||
2378 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# NOTE(review): the message below says "file1" although the file checked is
# f3.
2381 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2382 [ "$cur_size" == "$saved_size2" ] ||
2383 error "(9) Expect file1 size $saved_size2, but got $cur_size"
# Register the test with the framework.
2385 run_test 18b "Find out orphan OST-object and repair it (2)"
# Body of the test registered below as "18c": files are written while
# fail_loc 0x1617 is armed on the OST nodes, then removed while fail_loc
# 0x1616 is armed on the MDS. The layout LFSCK must then create "*-N-*"
# entries under lost+found/MDT0000 and none under MDT0001.
# Skipped when FILESET is set or the MDS version is not newer than 2.5.55.
2388 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2389 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2390 skip "MDS older than 2.5.55, LU-3336"
# Log the scenario being exercised.
2393 echo "The target MDT-object is lost, and the OST-object FID is missing."
2394 echo "The LFSCK should re-create the MDT-object with new FID under the "
2395 echo "directory .lustre/lost+found/MDTxxxx."
2398 check_mount_and_prep
2399 $LFS mkdir -i 0 $DIR/$tdir/a1
2400 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
# Arm the fail_loc on all OST nodes before any data is written.
2402 echo "Inject failure, to simulate the case of missing parent FID"
2403 #define OBD_FAIL_LFSCK_NOPFID 0x1617
2404 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1617
# a1/f1: one stripe, 2 MiB.
2406 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2407 $LFS getstripe $DIR/$tdir/a1/f1
# With two or more MDTs: a2 is created with "lfs mkdir -i 1", but its files
# still use one stripe starting at OST index 0.
2409 if [ $MDSCOUNT -ge 2 ]; then
2410 $LFS mkdir -i 1 $DIR/$tdir/a2
2411 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a2
2412 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2413 $LFS getstripe $DIR/$tdir/a2/f2
# f3: two-component PFL file, 2 MiB.
2416 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2417 error "(0) Fail to create PFL $DIR/$tdir/f3"
2419 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2420 $LFS getstripe $DIR/$tdir/f3
# Disarm the OST-side injection.
2422 cancel_lru_locks osc
2423 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
# Remove the files while fail_loc 0x1616 is armed on the corresponding MDS.
2425 echo "Inject failure, to simulate the case of missing the MDT-object"
2426 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2427 do_facet mds1 $LCTL set_param fail_loc=0x1616
2428 rm -f $DIR/$tdir/a1/f1
2430 if [ $MDSCOUNT -ge 2 ]; then
2431 do_facet mds2 $LCTL set_param fail_loc=0x1616
2432 rm -f $DIR/$tdir/a2/f2
# Disarm the MDS-side injection.
2440 do_facet mds1 $LCTL set_param fail_loc=0
2441 if [ $MDSCOUNT -ge 2 ]; then
2442 do_facet mds2 $LCTL set_param fail_loc=0
2445 cancel_lru_locks mdc
2446 cancel_lru_locks osc
# Start the layout LFSCK with -r -o, then wait on every MDS for "completed".
2448 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2449 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2451 for k in $(seq $MDSCOUNT); do
2452 # The LFSCK status query internal is 30 seconds. For the case
2453 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2454 # time to guarantee the status sync up.
2455 wait_update_facet mds${k} "$LCTL get_param -n \
2456 mdd.$(facet_svc mds${k}).lfsck_layout |
2457 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2458 error "(2) MDS${k} is not the expected 'completed'"
# Every OST must report "completed" too (single check, no waiting).
2461 for k in $(seq $OSTCOUNT); do
2462 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2463 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2464 awk '/^status/ { print $2 }')
2465 [ "$cur_status" == "completed" ] ||
2466 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# NOTE(review): $expected is compared below, but its assignment is not
# visible in this listing; it presumably depends on MDSCOUNT - confirm.
2469 if [ $MDSCOUNT -ge 2 ]; then
2475 local repaired=$(do_facet mds1 $LCTL get_param -n \
2476 mdd.$(facet_svc mds1).lfsck_layout |
2477 awk '/^repaired_orphan/ { print $2 }')
2478 [ $repaired -eq $expected ] ||
2479 error "(4) Expect $expected fixed on mds1, but got: $repaired"
# mds2, when present, must report zero repaired orphans.
2481 if [ $MDSCOUNT -ge 2 ]; then
2482 repaired=$(do_facet mds2 $LCTL get_param -n \
2483 mdd.$(facet_svc mds2).lfsck_layout |
2484 awk '/^repaired_orphan/ { print $2 }')
2485 [ $repaired -eq 0 ] ||
2486 error "(5) Expect 0 fixed on mds2, but got: $repaired"
# List lost+found in the test log.
2489 ls -ail $MOUNT/.lustre/lost+found/
# NOTE(review): the -name pattern *-N-* is unquoted, so the shell may expand
# it against the current directory before find sees it; cname is also not
# declared "local" in the visible lines.
2491 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
2492 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
2493 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name *-N-*)
2495 error "(6) .lustre/lost+found/MDT0001/ should be empty"
# MDT0000 must exist and contain at least one "*-N-*" entry.
2498 echo "There should be some stub under .lustre/lost+found/MDT0000/"
2499 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2500 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
2502 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name *-N-*)
2503 [ ! -z "$cname" ] ||
2504 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
# Register the test with the framework.
2506 run_test 18c "Find out orphan OST-object and repair it (3)"
# Body of the test registered below as "18d": with fail_loc 0x1618 armed, f2
# and f4 are chown'ed and f1/f3 removed. The layout LFSCK must then report 2
# repaired orphans and 0 repaired dangling references, and f2/f4 must show
# their original sizes.
# The test is skipped unless the MDS version is newer than 2.5.55.
2509 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2510 skip "MDS older than 2.5.55, LU-3336"
# Log the scenario being exercised.
2513 echo "The target MDT-object layout EA is corrupted, but the right"
2514 echo "OST-object is still alive as orphan. The layout LFSCK will"
2515 echo "not create new OST-object to occupy such slot."
2518 check_mount_and_prep
# NOTE(review): the creation of $DIR/$tdir/a1 is not visible in this listing.
# f1/f3 hold "guard"; f2 and the PFL file f4 hold "foo".
2520 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2521 echo "guard" > $DIR/$tdir/a1/f1
2522 echo "foo" > $DIR/$tdir/a1/f2
2524 echo "guard" > $DIR/$tdir/a1/f3
2525 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2526 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2527 echo "foo" > $DIR/$tdir/a1/f4
# Save the sizes of f2 and f4 (field 6 of "ls -il") for the final comparison.
2529 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2530 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
# Print FIDs and layouts of all four files into the test log.
2531 $LFS path2fid $DIR/$tdir/a1/f1
2532 $LFS getstripe $DIR/$tdir/a1/f1
2533 $LFS path2fid $DIR/$tdir/a1/f2
2534 $LFS getstripe $DIR/$tdir/a1/f2
2535 $LFS path2fid $DIR/$tdir/a1/f3
2536 $LFS getstripe $DIR/$tdir/a1/f3
2537 $LFS path2fid $DIR/$tdir/a1/f4
2538 $LFS getstripe $DIR/$tdir/a1/f4
2539 cancel_lru_locks osc
# Explain the injection in the log.
2541 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2542 echo "to reference the same OST-object (which is f1's OST-obejct)."
2543 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2544 echo "dangling reference case, but f2's old OST-object is there."
2546 echo "The failure also makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2547 echo "to reference the same OST-object (which is f3's OST-obejct)."
2548 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2549 echo "dangling reference case, but f4's old OST-object is there."
# With the fail_loc armed: chown f2/f4, then remove f1/f3.
2552 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2553 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2554 chown 1.1 $DIR/$tdir/a1/f2
2555 chown 1.1 $DIR/$tdir/a1/f4
2556 rm -f $DIR/$tdir/a1/f1
2557 rm -f $DIR/$tdir/a1/f3
2560 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# NOTE(review): the log line announces "stopall", but only setupall is
# visible in this listing - confirm the stop step against the full file.
2562 echo "stopall to cleanup object cache"
2565 setupall > /dev/null
# Start the layout LFSCK with -r -o -c -d, then wait on every MDS for
# "completed".
2567 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2568 $START_LAYOUT -r -o -c -d || error "(2) Fail to start LFSCK for layout!"
2570 for k in $(seq $MDSCOUNT); do
2571 # The LFSCK status query internal is 30 seconds. For the case
2572 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2573 # time to guarantee the status sync up.
2574 wait_update_facet mds${k} "$LCTL get_param -n \
2575 mdd.$(facet_svc mds${k}).lfsck_layout |
2576 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2577 error "(3) MDS${k} is not the expected 'completed'"
# Every OST must report "completed" too (single check, no waiting).
2580 for k in $(seq $OSTCOUNT); do
2581 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2582 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2583 awk '/^status/ { print $2 }')
2584 [ "$cur_status" == "completed" ] ||
2585 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
# Expect exactly 2 repaired orphans ...
2588 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2589 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2590 awk '/^repaired_orphan/ { print $2 }')
2591 [ $repaired -eq 2 ] ||
2592 error "(5) Expect 2 orphans have been fixed, but got: $repaired"
# ... and no repaired dangling references.
2594 repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2595 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2596 awk '/^repaired_dangling/ { print $2 }')
2597 [ $repaired -eq 0 ] ||
2598 error "(6) Expect 0 dangling has been fixed, but got: $repaired"
# f2 and f4 must report the sizes saved before the injection.
2600 echo "The file size should be correct after layout LFSCK scanning"
2601 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2602 [ "$cur_size" == "$saved_size1" ] ||
2603 error "(7) Expect file2 size $saved_size1, but got $cur_size"
2605 cur_size=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2606 [ "$cur_size" == "$saved_size2" ] ||
2607 error "(8) Expect file4 size $saved_size2, but got $cur_size"
# Dump content, FID and layout of f2/f4 into the log. The content is only
# printed, not compared.
2609 echo "The LFSCK should find back the original data."
2610 cat $DIR/$tdir/a1/f2
2611 $LFS path2fid $DIR/$tdir/a1/f2
2612 $LFS getstripe $DIR/$tdir/a1/f2
2613 cat $DIR/$tdir/a1/f4
2614 $LFS path2fid $DIR/$tdir/a1/f4
2615 $LFS getstripe $DIR/$tdir/a1/f4
# Register the test with the framework.
2617 run_test 18d "Find out orphan OST-object and repair it (4)"
# Body of the test registered below as "18e": same corruption as test 18d,
# but f2/f4 are appended to while the LFSCK is in "scanning-phase2". The
# LFSCK must then report 2 repaired orphans and leave exactly two "*-C-*"
# entries under lost+found/MDT0000.
# Skipped when FILESET is set or the MDS version is not newer than 2.5.55.
2620 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2621 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2622 skip "MDS older than 2.5.55, LU-3336"
# Log the scenario being exercised.
2625 echo "The target MDT-object layout EA slot is occpuied by some new"
2626 echo "created OST-object when repair dangling reference case. Such"
2627 echo "conflict OST-object has been modified by others. To keep the"
2628 echo "new data, the LFSCK will create a new file to refernece this"
2629 echo "old orphan OST-object."
2632 check_mount_and_prep
# NOTE(review): the creation of $DIR/$tdir/a1 is not visible in this listing.
# f1/f3 hold "guard"; f2 and the PFL file f4 hold "foo".
2634 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2635 echo "guard" > $DIR/$tdir/a1/f1
2636 echo "foo" > $DIR/$tdir/a1/f2
2638 echo "guard" > $DIR/$tdir/a1/f3
2639 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2640 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2641 echo "foo" > $DIR/$tdir/a1/f4
# Save the original sizes of f2 and f4 (field 6 of "ls -il").
2643 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2644 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
# Print FIDs and layouts of all four files into the test log.
2646 $LFS path2fid $DIR/$tdir/a1/f1
2647 $LFS getstripe $DIR/$tdir/a1/f1
2648 $LFS path2fid $DIR/$tdir/a1/f2
2649 $LFS getstripe $DIR/$tdir/a1/f2
2650 $LFS path2fid $DIR/$tdir/a1/f3
2651 $LFS getstripe $DIR/$tdir/a1/f3
2652 $LFS path2fid $DIR/$tdir/a1/f4
2653 $LFS getstripe $DIR/$tdir/a1/f4
2654 cancel_lru_locks osc
# Explain the injection in the log.
2656 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2657 echo "to reference the same OST-object (which is f1's OST-obejct)."
2658 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2659 echo "dangling reference case, but f2's old OST-object is there."
2661 echo "Also the failure makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2662 echo "to reference the same OST-object (which is f3's OST-obejct)."
2663 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2664 echo "dangling reference case, but f4's old OST-object is there."
# With the fail_loc armed: chown f2/f4, then remove f1/f3.
2667 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2668 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2669 chown 1.1 $DIR/$tdir/a1/f2
2670 chown 1.1 $DIR/$tdir/a1/f4
2671 rm -f $DIR/$tdir/a1/f1
2672 rm -f $DIR/$tdir/a1/f3
2675 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# NOTE(review): the log line announces "stopall", but only setupall is
# visible in this listing - confirm the stop step against the full file.
2677 echo "stopall to cleanup object cache"
2680 setupall > /dev/null
# Arm fail_loc 0x1602 with fail_val=10 (a delay, going by the macro name) so
# the test has time to write to f2/f4 while the scan is in progress.
2682 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2683 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
2685 start_full_debug_logging
# Start the layout LFSCK and wait until mds1 reports "scanning-phase2".
2687 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2688 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
2690 wait_update_facet mds1 "$LCTL get_param -n \
2691 mdd.$(facet_svc mds1).lfsck_layout |
2692 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2693 error "(3) MDS1 is not the expected 'scanning-phase2'"
# NOTE(review): the statements that perform the sync mentioned in the next
# comment are not visible in this listing.
2695 # to guarantee all updates are synced.
# Append to f2/f4 while the scan is in phase 2.
2699 echo "Write new data to f2/f4 to modify the new created OST-object."
2700 echo "dummy" >> $DIR/$tdir/a1/f2 || error "write a1/f2 failed"
2701 echo "dummy" >> $DIR/$tdir/a1/f4 || error "write a1/f4 failed"
# Disarm the delay injection, then wait on every MDS for "completed".
2703 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
2705 for k in $(seq $MDSCOUNT); do
2706 # The LFSCK status query internal is 30 seconds. For the case
2707 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2708 # time to guarantee the status sync up.
2709 wait_update_facet mds${k} "$LCTL get_param -n \
2710 mdd.$(facet_svc mds${k}).lfsck_layout |
2711 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2712 error "(4) MDS${k} is not the expected 'completed'"
# Every OST must report "completed" too (single check, no waiting).
2715 for k in $(seq $OSTCOUNT); do
2716 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2717 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2718 awk '/^status/ { print $2 }')
2719 [ "$cur_status" == "completed" ] ||
2720 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2723 stop_full_debug_logging
# Expect exactly 2 repaired orphans.
2725 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2726 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2727 awk '/^repaired_orphan/ { print $2 }')
2728 [ $repaired -eq 2 ] ||
2729 error "(6) Expect 2 orphans have been fixed, but got: $repaired"
# lost+found/MDT0000 must exist and hold exactly two "*-C-*" entries.
2731 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2732 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2733 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
2735 local count=$(ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-* | wc -l)
2736 if [ $count -ne 2 ]; then
2737 ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-*
2738 error "(8) Expect 2 stubs under lost+found, but got $count"
# The first "*-C-*" entry must have one of the two saved sizes.
# NOTE(review): the -name pattern *-C-* is unquoted, so the shell may expand
# it before find sees it; cname is not declared "local" in the visible lines.
2741 echo "The stub file should keep the original f2 or f4 data"
2742 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name *-C-* | head -n 1)
2743 local cur_size=$(ls -il $cname | awk '{ print $6 }')
2744 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2745 error "(9) Got unexpected $cur_size"
2748 $LFS path2fid $cname
2749 $LFS getstripe $cname
# Same check for the last "*-C-*" entry.
2751 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name *-C-* | tail -n 1)
2752 cur_size=$(ls -il $cname | awk '{ print $6 }')
2753 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2754 error "(10) Got unexpected $cur_size"
2757 $LFS path2fid $cname
2758 $LFS getstripe $cname
# Dump content, FID and layout of f2/f4 into the log. The content is only
# printed, not compared.
2760 echo "The f2/f4 should contains new data."
2761 cat $DIR/$tdir/a1/f2
2762 $LFS path2fid $DIR/$tdir/a1/f2
2763 $LFS getstripe $DIR/$tdir/a1/f2
2764 cat $DIR/$tdir/a1/f4
2765 $LFS path2fid $DIR/$tdir/a1/f4
2766 $LFS getstripe $DIR/$tdir/a1/f4
# Register the test with the framework.
2768 run_test 18e "Find out orphan OST-object and repair it (5)"
# Body of the test registered below as "18f": files are removed while
# fail_loc 0x1616 is armed, then a first layout LFSCK runs with fail_loc
# 0x161c armed on mds1 and is expected to end as "partial" on the MDTs and
# ost1 with one orphan repaired per MDT. A second, undisturbed run must end
# as "completed" with two orphans repaired per MDT.
# Needs at least two OSTs.
2771 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
# Log the scenario being exercised.
2774 echo "The target MDT-object is lost. The LFSCK should re-create the"
2775 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2776 echo "to verify some OST-object(s) during the first stage-scanning,"
2777 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2778 echo "should not be affected."
# a1 (one stripe at OST index 0) holds guard and f1; a2 (two stripes starting
# at OST index 0) holds f2. Each file is 2 MiB.
2781 check_mount_and_prep
2782 $LFS mkdir -i 0 $DIR/$tdir/a1
2783 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2784 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2785 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2786 $LFS mkdir -i 0 $DIR/$tdir/a2
2787 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
2788 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2789 $LFS getstripe $DIR/$tdir/a1/f1
2790 $LFS getstripe $DIR/$tdir/a2/f2
# With two or more MDTs, mirror the setup in a3/a4, created with
# "lfs mkdir -i 1".
2792 if [ $MDSCOUNT -ge 2 ]; then
2793 $LFS mkdir -i 1 $DIR/$tdir/a3
2794 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a3
2795 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2796 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2797 $LFS mkdir -i 1 $DIR/$tdir/a4
2798 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
2799 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2800 $LFS getstripe $DIR/$tdir/a3/f3
2801 $LFS getstripe $DIR/$tdir/a4/f4
2804 cancel_lru_locks osc
# Remove f1/f2 (and f3/f4) while fail_loc 0x1616 is armed on the
# corresponding MDS.
2806 echo "Inject failure, to simulate the case of missing the MDT-object"
2807 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2808 do_facet mds1 $LCTL set_param fail_loc=0x1616
2809 rm -f $DIR/$tdir/a1/f1
2810 rm -f $DIR/$tdir/a2/f2
2812 if [ $MDSCOUNT -ge 2 ]; then
2813 do_facet mds2 $LCTL set_param fail_loc=0x1616
2814 rm -f $DIR/$tdir/a3/f3
2815 rm -f $DIR/$tdir/a4/f4
# Disarm the injection.
2821 do_facet mds1 $LCTL set_param fail_loc=0
2822 if [ $MDSCOUNT -ge 2 ]; then
2823 do_facet mds2 $LCTL set_param fail_loc=0
2826 cancel_lru_locks mdc
2827 cancel_lru_locks osc
# Arm fail_loc 0x161c with fail_val=0 on mds1 for the first LFSCK run.
2829 echo "Inject failure, to simulate the OST0 fail to handle"
2830 echo "MDT0 LFSCK request during the first-stage scanning."
2831 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
2832 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
# First run: every MDS is expected to end in "partial".
2834 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2835 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2837 for k in $(seq $MDSCOUNT); do
2838 # The LFSCK status query internal is 30 seconds. For the case
2839 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2840 # time to guarantee the status sync up.
2841 wait_update_facet mds${k} "$LCTL get_param -n \
2842 mdd.$(facet_svc mds${k}).lfsck_layout |
2843 awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
2844 error "(2) MDS${k} is not the expected 'partial'"
# ost1 is expected to end in "partial" as well ...
2847 wait_update_facet ost1 "$LCTL get_param -n \
2848 obdfilter.$(facet_svc ost1).lfsck_layout |
2849 awk '/^status/ { print \\\$2 }'" "partial" $LTIME || {
2850 error "(3) OST1 is not the expected 'partial'"
# ... while ost2 must reach "completed".
2853 wait_update_facet ost2 "$LCTL get_param -n \
2854 obdfilter.$(facet_svc ost2).lfsck_layout |
2855 awk '/^status/ { print \\\$2 }'" "completed" $LTIME || {
2856 error "(4) OST2 is not the expected 'completed'"
# Disarm the injection and check that each MDT repaired exactly one orphan.
# NOTE(review): the messages below contain the literal text "mds{1}" and
# "mds{2}"; nothing is expanded there.
2859 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
2861 local repaired=$(do_facet mds1 $LCTL get_param -n \
2862 mdd.$(facet_svc mds1).lfsck_layout |
2863 awk '/^repaired_orphan/ { print $2 }')
2864 [ $repaired -eq 1 ] ||
2865 error "(5) Expect 1 fixed on mds{1}, but got: $repaired"
2867 if [ $MDSCOUNT -ge 2 ]; then
2868 repaired=$(do_facet mds2 $LCTL get_param -n \
2869 mdd.$(facet_svc mds2).lfsck_layout |
2870 awk '/^repaired_orphan/ { print $2 }')
2871 [ $repaired -eq 1 ] ||
2872 error "(6) Expect 1 fixed on mds{2}, but got: $repaired"
# Second run without injection: every MDS must reach "completed".
2875 echo "Trigger layout LFSCK on all devices again to cleanup"
2876 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
2878 for k in $(seq $MDSCOUNT); do
2879 # The LFSCK status query internal is 30 seconds. For the case
2880 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2881 # time to guarantee the status sync up.
2882 wait_update_facet mds${k} "$LCTL get_param -n \
2883 mdd.$(facet_svc mds${k}).lfsck_layout |
2884 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2885 error "(8) MDS${k} is not the expected 'completed'"
# Every OST must report "completed" too (single check, no waiting).
# NOTE(review): no "local" declaration of cur_status is visible in this
# listing.
2888 for k in $(seq $OSTCOUNT); do
2889 cur_status=$(do_facet ost${k} $LCTL get_param -n \
2890 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2891 awk '/^status/ { print $2 }')
2892 [ "$cur_status" == "completed" ] ||
2893 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
# After the second run each MDT must report two repaired orphans.
# NOTE(review): "repaired" is declared "local" a second time here.
2897 local repaired=$(do_facet mds1 $LCTL get_param -n \
2898 mdd.$(facet_svc mds1).lfsck_layout |
2899 awk '/^repaired_orphan/ { print $2 }')
2900 [ $repaired -eq 2 ] ||
2901 error "(10) Expect 2 fixed on mds{1}, but got: $repaired"
2903 if [ $MDSCOUNT -ge 2 ]; then
2904 repaired=$(do_facet mds2 $LCTL get_param -n \
2905 mdd.$(facet_svc mds2).lfsck_layout |
2906 awk '/^repaired_orphan/ { print $2 }')
2907 [ $repaired -eq 2 ] ||
2908 error "(11) Expect 2 fixed on mds{2}, but got: $repaired"
# Register the test with the framework.
2911 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# Body of the test registered below as "18g": a file striped with "-c -1" is
# removed while fail_loc 0x162e is armed. The layout LFSCK must then report
# $OSTCOUNT repaired orphans, and the recreated "<fid>-R-0" entry is moved
# back into the namespace.
# Skipped when FILESET is set.
2914 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
# Log the scenario being exercised.
2917 echo "The target MDT-object is lost, but related OI mapping is there"
2918 echo "The LFSCK should recreate the lost MDT-object without affected"
2919 echo "by the stale OI mapping."
# a1/f1 uses stripe count -1 with 1 MiB stripes and receives $OSTCOUNT MiB
# of data; its FID is saved for the lookup under lost+found.
2922 check_mount_and_prep
2923 $LFS mkdir -i 0 $DIR/$tdir/a1
2924 $LFS setstripe -c -1 -i 0 -S 1M $DIR/$tdir/a1
2925 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=$OSTCOUNT
2926 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2928 $LFS getstripe $DIR/$tdir/a1/f1
2929 cancel_lru_locks osc
# Remove f1 while the fail_loc is armed on mds1, then disarm it.
2931 echo "Inject failure to simulate lost MDT-object but keep OI mapping"
2932 #define OBD_FAIL_LFSCK_LOST_MDTOBJ2 0x162e
2933 do_facet mds1 $LCTL set_param fail_loc=0x162e
2934 rm -f $DIR/$tdir/a1/f1
2936 do_facet mds1 $LCTL set_param fail_loc=0
2937 cancel_lru_locks mdc
2938 cancel_lru_locks osc
# Start the layout LFSCK with -r -o, then wait on every MDS for "completed".
2940 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2941 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2943 for k in $(seq $MDSCOUNT); do
2944 # The LFSCK status query internal is 30 seconds. For the case
2945 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2946 # time to guarantee the status sync up.
2947 wait_update_facet mds${k} "$LCTL get_param -n \
2948 mdd.$(facet_svc mds${k}).lfsck_layout |
2949 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2950 error "(2) MDS${k} is not the expected 'completed'"
# Every OST must report "completed" too (single check, no waiting).
2953 for k in $(seq $OSTCOUNT); do
2954 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2955 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2956 awk '/^status/ { print $2 }')
2957 [ "$cur_status" == "completed" ] ||
2958 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# The number of repaired orphans on mds1 must equal $OSTCOUNT.
2961 local repaired=$(do_facet mds1 $LCTL get_param -n \
2962 mdd.$(facet_svc mds1).lfsck_layout |
2963 awk '/^repaired_orphan/ { print $2 }')
2964 [ $repaired -eq $OSTCOUNT ] ||
2965 error "(4) Expect $OSTCOUNT fixed, but got: $repaired"
# Move the recreated file back to its original path and print its FID/layout.
2967 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2968 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2969 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2971 $LFS path2fid $DIR/$tdir/a1/f1
2972 $LFS getstripe $DIR/$tdir/a1/f1
# Register the test with the framework.
2974 run_test 18g "Find out orphan OST-object and repair it (7)"
# test_18h: corrupt the extent range of a PFL file, verify that writes fail,
# then run layout LFSCK and verify that the range is rebuilt from the orphan
# OST-objects without damaging the file data (scenario per the banner below).
# NOTE(review): this listing omits some short lines (function header, loop
# terminators); confirm against the full script.
2978 echo "The PFL extent crashed. During the first cycle LFSCK scanning,"
2979 echo "the layout LFSCK will keep the bad PFL file(s) there without"
2980 echo "scanning its OST-object(s). Then in the second stage scanning,"
2981 echo "the OST will return related OST-object(s) to the MDT as orphan."
2982 echo "And then the LFSCK on the MDT can rebuild the PFL extent with"
2983 echo "the 'orphan(s)' stripe information."
2986 check_mount_and_prep
# Two-component PFL file created with "-E 2M -S 1M -c 1 -E -1".
2988 $LFS setstripe -E 2M -S 1M -c 1 -E -1 $DIR/$tdir/f0 ||
2989 error "(0) Fail to create PFL $DIR/$tdir/f0"
# Write data at offset 0, then again starting at block 2 (bs=1M seek=2).
2991 cat $LUSTRE/tests/test-framework.sh > $DIR/$tdir/f0 ||
2992 error "(1.1) Fail to write $DIR/$tdir/f0"
2994 dd if=$LUSTRE/tests/test-framework.sh of=$DIR/$tdir/f0 bs=1M seek=2 ||
2995 error "(1.2) Fail to write $DIR/$tdir/f0"
# Reference copy used by the diff in step (6).
2997 cp $DIR/$tdir/f0 $DIR/$tdir/guard
2999 echo "Inject failure stub to simulate bad PFL extent range"
3000 #define OBD_FAIL_LFSCK_BAD_PFL_RANGE 0x162f
3001 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162f
# The chown runs while fail_loc 0x162f is armed on $SINGLEMDS.
3003 chown 1.1 $DIR/$tdir/f0
3005 cancel_lru_locks mdc
3006 cancel_lru_locks osc
3007 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Before the repair a write to the damaged file is expected to fail.
3009 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 &&
3010 error "(2) Write to bad PFL file should fail"
3012 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
3013 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
# Every MDS must report layout LFSCK status "completed"; $LTIME is passed as
# the last argument of wait_update_facet (presumably the wait limit).
3015 for k in $(seq $MDSCOUNT); do
3016 # The LFSCK status query interval is 30 seconds. For the case
3017 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3018 # time to guarantee the status sync up.
3019 wait_update_facet mds${k} "$LCTL get_param -n \
3020 mdd.$(facet_svc mds${k}).lfsck_layout |
3021 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
3022 error "(4.1) MDS${k} is not the expected 'completed'"
# Every OST is checked once for status "completed". cur_status is declared
# local so it does not leak into the global scope of the test script.
3025 for k in $(seq $OSTCOUNT); do
3026 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3027 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3028 awk '/^status/ { print $2 }')
3029 [ "$cur_status" == "completed" ] ||
3030 error "(4.2) OST${k} Expect 'completed', but got '$cur_status'"
# Exactly two repaired_orphan entries are expected on $SINGLEMDS.
3034 local repaired=$($SHOW_LAYOUT |
3035 awk '/^repaired_orphan/ { print $2 }')
3036 [ $repaired -eq 2 ] ||
3037 error "(5) Fail to repair crashed PFL range: $repaired"
3039 echo "Data in $DIR/$tdir/f0 should not be broken"
3040 diff $DIR/$tdir/f0 $DIR/$tdir/guard ||
3041 error "(6) Data in $DIR/$tdir/f0 is broken"
3043 echo "Write should succeed after LFSCK repairing the bad PFL range"
3044 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 ||
3045 error "(7) Write should succeed after LFSCK"
3047 run_test 18h "LFSCK can repair crashed PFL extent range"
# Top-level statement run between tests 18h and 19a: passes "debug=-cache" to
# lctl set_param on the local node and discards the command's stdout.
# NOTE(review): presumably removes the "cache" flag from the debug mask - confirm.
3049 $LCTL set_param debug=-cache > /dev/null
# test_19a: with parent-FID verification enabled on OST0000, reads that carry
# a wrong parent FID (injected on the client via fail_loc 0x1619) must fail.
# Skipped unless the MDS version is newer than 2.5.55.
3052 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
3053 skip "MDS older than 2.5.55, LU-3951"
3055 check_mount_and_prep
3056 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# lfsck_verify_pfid is set to 0 on all OST nodes while the files are created.
3058 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
3059 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
# a0: plain file, a1: two-component PFL file, a2: guard file.
3061 echo "foo1" > $DIR/$tdir/a0
3062 $LFS setstripe -E 512K -S 512K -o 0 -E -1 -S 1M $DIR/$tdir/a1 ||
3063 error "(0) Fail to create PFL $DIR/$tdir/a1"
3064 echo "foo2" > $DIR/$tdir/a1
3065 echo "guard" > $DIR/$tdir/a2
3066 cancel_lru_locks osc
3068 echo "Inject failure, then client will offer wrong parent FID when read"
# Switch lfsck_verify_pfid to 1 on the OST nodes before injecting the fault.
3069 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
3070 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
3072 #define OBD_FAIL_LFSCK_INVALID_PFID 0x1619
# fail_loc is set through a bare $LCTL call, i.e. on the node running the test.
3073 $LCTL set_param fail_loc=0x1619
3075 echo "Read RPC with wrong parent FID should be denied"
# A successful cat is the failure case for both files.
3076 cat $DIR/$tdir/a0 && error "(3.1) Read a0 should be denied!"
3077 cat $DIR/$tdir/a1 && error "(3.2) Read a1 should be denied!"
3078 $LCTL set_param fail_loc=0
3080 run_test 19a "OST-object inconsistency self detect"
# test_19b: create two files whose OST-objects point back to a non-existent
# MDT-object (fail_loc 0x1611 on the OST nodes), check that nothing is
# repaired while lfsck_verify_pfid is 0, then enable it and check that
# reading both files repairs both parent FIDs (repaired == 2).
# Skipped unless the MDS version is newer than 2.5.55.
3083 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
3084 skip "MDS older than 2.5.55, LU-3951"
3086 check_mount_and_prep
3087 $LFS setstripe -c 1 -i 0 $DIR/$tdir
3089 echo "Inject failure stub to make the OST-object to back point to"
3090 echo "non-exist MDT-object"
3092 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
3093 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
3095 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
3096 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
# f0 (plain) and f1 (PFL) are written while fail_loc 0x1611 is armed.
3097 echo "foo1" > $DIR/$tdir/f0
3098 $LFS setstripe -E 1M -S 1M -o 0 -E 4M -S 256K $DIR/$tdir/f1 ||
3099 error "(0) Fail to create PFL $DIR/$tdir/f1"
3100 echo "foo2" > $DIR/$tdir/f1
3101 cancel_lru_locks osc
3102 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
# Writing 0 to lfsck_verify_pfid on ost1 before reading the counter.
# NOTE(review): presumably this also resets the "repaired" statistic - confirm.
3104 do_facet ost1 $LCTL set_param -n \
3105 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
3106 echo "Nothing should be fixed since self detect and repair is disabled"
3107 local repaired=$(do_facet ost1 $LCTL get_param -n \
3108 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
3109 awk '/^repaired/ { print $2 }')
3110 [ $repaired -eq 0 ] ||
3111 error "(1) Expected 0 repaired, but got $repaired"
3113 echo "Read RPC with right parent FID should be accepted,"
3114 echo "and cause parent FID on OST to be fixed"
3116 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
3117 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
3119 cat $DIR/$tdir/f0 || error "(2.1) Read f0 should not be denied!"
3120 cat $DIR/$tdir/f1 || error "(2.2) Read f1 should not be denied!"
# Two files were read, so two repairs are expected; the error message now
# reports the same number that the condition checks.
3122 repaired=$(do_facet ost1 $LCTL get_param -n \
3123 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
3124 awk '/^repaired/ { print $2 }')
3125 [ $repaired -eq 2 ] ||
3126 error "(3) Expected 2 repaired, but got $repaired"
3128 run_test 19b "OST-object inconsistency self repair"
# Expected output of "$LFS getstripe -L" for a layout with / without a LOV EA
# hole; the pattern checks in tests 20a and 20b compare against these values.
3130 PATTERN_WITH_HOLE="40000001"
3131 PATTERN_WITHOUT_HOLE="raid0"
# test_20a: lose the MDT-object of several striped files together with one of
# their OST-objects, run layout LFSCK, and verify that each file is re-created
# under .lustre/lost+found/MDT0000 with a LOV EA hole where the OST-object is
# missing, and that normal tools handle such files (scenario per the banner).
# Needs at least 2 OSTs, no FILESET, and an MDS newer than 2.5.55.
# NOTE(review): this listing omits some short lines (function header,
# else/fi/done, the bcount assignments); confirm against the full script.
3134 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
3135 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
3136 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
3137 skip "MDS older than 2.5.55, LU-4887"
3140 echo "The target MDT-object and some of its OST-object are lost."
3141 echo "The LFSCK should find out the left OST-objects and re-create"
3142 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
3143 echo "with the partial OST-objects (LOV EA hole)."
3145 echo "New client can access the file with LOV EA hole via normal"
3146 echo "system tools or commands without crash the system."
3148 echo "For old client, even though it cannot access the file with"
3149 echo "LOV EA hole, it should not cause the system crash."
3152 check_mount_and_prep
3153 $LFS mkdir -i 0 $DIR/$tdir/a1
# Stripe count 3 when more than 2 OSTs are available, otherwise 2.
3154 if [ $OSTCOUNT -gt 2 ]; then
3155 $LFS setstripe -c 3 -i 0 -S 1M $DIR/$tdir/a1
3158 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a1
# $bcount (4096-byte blocks per file) is assigned on lines not shown here.
3162 # 256 blocks on the stripe0.
3163 # 1 block on the stripe1 for 2 OSTs case.
3164 # 256 blocks on the stripe1 for other cases.
3165 # 1 block on the stripe2 if OSTs > 2
3166 dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
3167 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
3168 dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount
# FIDs are saved because the recovered files are named ${fidN}-R-0.
3170 local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
3171 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
3172 local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)
3175 $LFS getstripe $DIR/$tdir/a1/f0
3177 $LFS getstripe $DIR/$tdir/a1/f1
3179 $LFS getstripe $DIR/$tdir/a1/f2
# A fourth file f3 is only used when more than 2 OSTs are available.
3181 if [ $OSTCOUNT -gt 2 ]; then
3182 dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
3183 fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
3185 $LFS getstripe $DIR/$tdir/a1/f3
3188 cancel_lru_locks osc
# Fault injection: each rm runs under a different fail_loc/fail_val pair.
3190 echo "Inject failure..."
3191 echo "To simulate f0 lost MDT-object"
3192 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
3193 do_facet mds1 $LCTL set_param fail_loc=0x1616
3194 rm -f $DIR/$tdir/a1/f0
3196 echo "To simulate f1 lost MDT-object and OST-object0"
3197 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
3198 do_facet mds1 $LCTL set_param fail_loc=0x161a
3199 rm -f $DIR/$tdir/a1/f1
3201 echo "To simulate f2 lost MDT-object and OST-object1"
3202 do_facet mds1 $LCTL set_param fail_val=1
3203 rm -f $DIR/$tdir/a1/f2
3205 if [ $OSTCOUNT -gt 2 ]; then
3206 echo "To simulate f3 lost MDT-object and OST-object2"
3207 do_facet mds1 $LCTL set_param fail_val=2
3208 rm -f $DIR/$tdir/a1/f3
# The client is unmounted while LFSCK runs and mounted again at step (5.0).
3211 umount_client $MOUNT
3214 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
3216 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3217 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# Every MDS must report status "completed"; 32 is passed as the last
# argument of wait_update_facet (presumably the wait limit in seconds).
3219 for k in $(seq $MDSCOUNT); do
3220 # The LFSCK status query interval is 30 seconds. For the case
3221 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3222 # time to guarantee the status sync up.
3223 wait_update_facet mds${k} "$LCTL get_param -n \
3224 mdd.$(facet_svc mds${k}).lfsck_layout |
3225 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3226 error "(2) MDS${k} is not the expected 'completed'"
# Every OST is checked once for status "completed".
3229 for k in $(seq $OSTCOUNT); do
3230 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3231 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3232 awk '/^status/ { print $2 }')
3233 [ "$cur_status" == "completed" ] ||
3234 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan count on mds1: 9 with more than 2 OSTs, else 4.
3237 local repaired=$(do_facet mds1 $LCTL get_param -n \
3238 mdd.$(facet_svc mds1).lfsck_layout |
3239 awk '/^repaired_orphan/ { print $2 }')
3240 if [ $OSTCOUNT -gt 2 ]; then
3241 [ $repaired -eq 9 ] ||
3242 error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
3244 [ $repaired -eq 4 ] ||
3245 error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
3248 mount_client $MOUNT || error "(5.0) Fail to start client!"
# NOTE(review): assigned without "local" and not referenced in the visible
# part of this test - confirm whether it is still needed.
3250 LOV_PATTERN_F_HOLE=0x40000000
# ---- Section 5: f0 lost only its MDT-object, so no hole is expected ----
3253 # ${fid0}-R-0 is the old f0
3255 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
3256 echo "Check $name, which is the old f0"
3258 $LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"
3260 local pattern=$($LFS getstripe -L $name)
3261 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3262 error "(5.2) NOT expect pattern flag hole, but got $pattern"
3264 local stripes=$($LFS getstripe -c $name)
3265 if [ $OSTCOUNT -gt 2 ]; then
3266 [ $stripes -eq 3 ] ||
3267 error "(5.3.1) expect the stripe count is 3, but got $stripes"
3269 [ $stripes -eq 2 ] ||
3270 error "(5.3.2) expect the stripe count is 2, but got $stripes"
3273 local size=$(stat $name | awk '/Size:/ { print $2 }')
3274 [ $size -eq $((4096 * $bcount)) ] ||
3275 error "(5.4) expect the size $((4096 * $bcount)), but got $size"
# Read, append, chown, touch and unlink must all succeed on the intact file.
3277 cat $name > /dev/null || error "(5.5) cannot read $name"
3279 echo "dummy" >> $name || error "(5.6) cannot write $name"
3281 chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"
3283 touch $name || error "(5.8) cannot touch $name"
3285 rm -f $name || error "(5.9) cannot unlink $name"
# ---- Section 6: f1 lost OST-object0, so stripe0 is a hole ----
3288 # ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
3290 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3291 if [ $OSTCOUNT -gt 2 ]; then
3292 echo "Check $name, it contains the old f1's stripe1 and stripe2"
3294 echo "Check $name, it contains the old f1's stripe1"
3297 $LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"
3299 pattern=$($LFS getstripe -L $name)
3300 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3301 error "(6.2) expect pattern flag hole, but got $pattern"
3303 stripes=$($LFS getstripe -c $name)
3304 if [ $OSTCOUNT -gt 2 ]; then
3305 [ $stripes -eq 3 ] ||
3306 error "(6.3.1) expect the stripe count is 3, but got $stripes"
3308 [ $stripes -eq 2 ] ||
3309 error "(6.3.2) expect the stripe count is 2, but got $stripes"
3312 size=$(stat $name | awk '/Size:/ { print $2 }')
3313 [ $size -eq $((4096 * $bcount)) ] ||
3314 error "(6.4) expect the size $((4096 * $bcount)), but got $size"
# A plain read must fail; dd with conv=sync,noerror continues past errors so
# the number of "Input/output error" lines can be counted.
3316 cat $name > /dev/null && error "(6.5) normal read $name should fail"
3318 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3319 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3322 [ $failures -eq 256 ] ||
3323 error "(6.6) expect 256 IO failures, but get $failures"
3325 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3326 [ $size -eq $((4096 * $bcount)) ] ||
3327 error "(6.7) expect the size $((4096 * $bcount)), but got $size"
# Writing block 0 (the hole) must fail, writing block 300 must succeed.
3329 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
3330 error "(6.8) write to the LOV EA hole should fail"
3332 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
3333 error "(6.9) write to normal stripe should NOT fail"
3335 echo "foo" >> $name && error "(6.10) append write $name should fail"
3337 chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"
3339 touch $name || error "(6.12) cannot touch $name"
3341 rm -f $name || error "(6.13) cannot unlink $name"
# ---- Section 7: f2 lost OST-object1, so stripe1 is a hole ----
3344 # ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
3346 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
3347 if [ $OSTCOUNT -gt 2 ]; then
3348 echo "Check $name, it contains the old f2's stripe0 and stripe2"
3350 echo "Check $name, it contains the old f2's stripe0"
3353 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
3355 pattern=$($LFS getstripe -L $name)
3356 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3357 error "(7.2) expect pattern flag hole, but got $pattern"
3359 stripes=$($LFS getstripe -c $name)
3360 size=$(stat $name | awk '/Size:/ { print $2 }')
# Checks labelled 7.x.1 / 7.6 apply to the 3-stripe case (OSTCOUNT > 2).
3361 if [ $OSTCOUNT -gt 2 ]; then
3362 [ $stripes -eq 3 ] ||
3363 error "(7.3.1) expect the stripe count is 3, but got $stripes"
3365 [ $size -eq $((4096 * $bcount)) ] ||
3366 error "(7.4.1) expect size $((4096 * $bcount)), but got $size"
3368 cat $name > /dev/null &&
3369 error "(7.5.1) normal read $name should fail"
3371 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3372 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3374 [ $failures -eq 256 ] ||
3375 error "(7.6) expect 256 IO failures, but get $failures"
3377 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3378 [ $size -eq $((4096 * $bcount)) ] ||
3379 error "(7.7) expect the size $((4096 * $bcount)), but got $size"
# Here block 300 is in the hole and block 0 is on an existing stripe.
3381 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3382 seek=300 && error "(7.8.0) write to the LOV EA hole should fail"
3384 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
3385 error "(7.8.1) write to normal stripe should NOT fail"
3387 echo "foo" >> $name &&
3388 error "(7.8.3) append write $name should fail"
3390 chown $RUNAS_ID:$RUNAS_GID $name ||
3391 error "(7.9.1) cannot chown on $name"
3393 touch $name || error "(7.10.1) cannot touch $name"
# Checks labelled 7.x.2 apply to the 2-stripe case (the "else" line is not
# shown in this listing).
3395 [ $stripes -eq 2 ] ||
3396 error "(7.3.2) expect the stripe count is 2, but got $stripes"
3399 [ $size -eq $((4096 * (256 + 0))) ] ||
3400 error "(7.4.2) expect the size $((4096 * 256)), but got $size"
3402 cat $name > /dev/null &&
3403 error "(7.5.2) normal read $name should fail"
3405 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3406 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3407 [ $failures -eq 256 ] ||
3408 error "(7.6.2) expect 256 IO failures, but get $failures"
3411 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3412 [ $size -eq $((4096 * $bcount)) ] ||
3413 error "(7.7.2) expect the size $((4096 * $bcount)), got $size"
3415 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3416 seek=256 && error "(7.8.2) write to the LOV EA hole should fail"
3418 chown $RUNAS_ID:$RUNAS_GID $name ||
3419 error "(7.9.2) cannot chown on $name"
3421 touch $name || error "(7.10.2) cannot touch $name"
3424 rm -f $name || error "(7.11) cannot unlink $name"
# f3 only exists with more than 2 OSTs; otherwise the test ends here.
3426 [ $OSTCOUNT -le 2 ] && return
# ---- Section 8: f3 lost OST-object2, so stripe2 is a hole ----
3429 # ${fid3}-R-0 should contain the old f3's stripe0 and stripe1
3431 name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
3432 echo "Check $name, which contains the old f3's stripe0 and stripe1"
3434 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
3436 pattern=$($LFS getstripe -L $name)
3437 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3438 error "(8.2) expect pattern flag hole, but got $pattern"
3440 stripes=$($LFS getstripe -c $name)
3441 [ $stripes -eq 3 ] ||
3442 error "(8.3) expect the stripe count is 3, but got $stripes"
3444 size=$(stat $name | awk '/Size:/ { print $2 }')
3446 [ $size -eq $((4096 * (256 + 256 + 0))) ] ||
3447 error "(8.4) expect the size $((4096 * 512)), but got $size"
3449 cat $name > /dev/null &&
3450 error "(8.5) normal read $name should fail"
3452 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3453 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3455 [ $failures -eq 256 ] ||
3456 error "(8.6) expect 256 IO failures, but get $failures"
3459 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3460 [ $size -eq $((4096 * $bcount)) ] ||
3461 error "(8.7) expect the size $((4096 * $bcount)), but got $size"
3463 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3464 seek=512 && error "(8.8) write to the LOV EA hole should fail"
3466 chown $RUNAS_ID:$RUNAS_GID $name ||
3467 error "(8.9) cannot chown on $name"
3469 touch $name || error "(8.10) cannot touch $name"
3471 rm -f $name || error "(8.11) cannot unlink $name"
3473 run_test 20a "Handle the orphan with dummy LOV EA slot properly"
# test_20b: PFL variant of test 20a. Three two-component PFL files lose their
# MDT-object (f0), plus the first (f1) or second (f2) OST-object of each
# component; layout LFSCK must re-create them under
# .lustre/lost+found/MDT0000 with LOV EA holes where objects are missing.
# Needs at least 2 OSTs, no FILESET, and an MDS newer than 2.5.55.
# NOTE(review): this listing omits some short lines (function header, loop
# terminators, the commands run under each fail_loc/fail_val setting);
# confirm against the full script.
3476 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
3477 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
3478 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
3479 skip "MDS older than 2.5.55, LU-4887"
3482 echo "The target MDT-object and some of its OST-object are lost."
3483 echo "The LFSCK should find out the left OST-objects and re-create"
3484 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
3485 echo "with the partial OST-objects (LOV EA hole)."
3487 echo "New client can access the file with LOV EA hole via normal"
3488 echo "system tools or commands without crash the system - PFL case."
3491 check_mount_and_prep
# Each file: component 1 up to 2M, component 2 up to EOF, both with 1M
# stripe size and stripe count 2.
3493 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f0 ||
3494 error "(0) Fail to create PFL file $DIR/$tdir/f0"
3495 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f1 ||
3496 error "(1) Fail to create PFL file $DIR/$tdir/f1"
3497 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f2 ||
3498 error "(2) Fail to create PFL file $DIR/$tdir/f2"
# 769 blocks of 4096 bytes are written to each file.
3500 local bcount=$((256 * 3 + 1))
3502 dd if=/dev/zero of=$DIR/$tdir/f0 bs=4096 count=$bcount
3503 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4096 count=$bcount
3504 dd if=/dev/zero of=$DIR/$tdir/f2 bs=4096 count=$bcount
# FIDs are saved because the recovered files are named ${fidN}-R-0.
3506 local fid0=$($LFS path2fid $DIR/$tdir/f0)
3507 local fid1=$($LFS path2fid $DIR/$tdir/f1)
3508 local fid2=$($LFS path2fid $DIR/$tdir/f2)
3511 $LFS getstripe $DIR/$tdir/f0
3513 $LFS getstripe $DIR/$tdir/f1
3515 $LFS getstripe $DIR/$tdir/f2
3517 cancel_lru_locks mdc
3518 cancel_lru_locks osc
3520 echo "Inject failure..."
3521 echo "To simulate f0 lost MDT-object"
3522 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
3523 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1616
3526 echo "To simulate the case of f1 lost MDT-object and "
3527 echo "the first OST-object in each PFL component"
3528 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
3529 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161a
3532 echo "To simulate the case of f2 lost MDT-object and "
3533 echo "the second OST-object in each PFL component"
3534 do_facet $SINGLEMDS $LCTL set_param fail_val=1
3539 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
3541 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3542 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
# Every MDS must report status "completed"; 32 is passed as the last
# argument of wait_update_facet (presumably the wait limit in seconds).
3544 for k in $(seq $MDSCOUNT); do
3545 # The LFSCK status query interval is 30 seconds. For the case
3546 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3547 # time to guarantee the status sync up.
3548 wait_update_facet mds${k} "$LCTL get_param -n \
3549 mdd.$(facet_svc mds${k}).lfsck_layout |
3550 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3551 error "(4) MDS${k} is not the expected 'completed'"
# Every OST is checked once for status "completed".
3554 for k in $(seq $OSTCOUNT); do
3555 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3556 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3557 awk '/^status/ { print $2 }')
3558 [ "$cur_status" == "completed" ] ||
3559 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# Exactly 8 repaired_orphan entries are expected on mds1.
3562 local repaired=$(do_facet mds1 $LCTL get_param -n \
3563 mdd.$(facet_svc mds1).lfsck_layout |
3564 awk '/^repaired_orphan/ { print $2 }')
3565 [ $repaired -eq 8 ] ||
3566 error "(6) Expect 8 fixed on mds1, but got: $repaired"
# ---- Section 7: f0 lost only its MDT-object, so no hole is expected ----
3569 # ${fid0}-R-0 is the old f0
3571 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
3572 echo "Check $name, which is the old f0"
3574 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
# -I1 / -I2 select the first / second PFL component.
3576 local pattern=$($LFS getstripe -L -I1 $name)
3577 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3578 error "(7.2.1) NOT expect pattern flag hole, but got $pattern"
3580 pattern=$($LFS getstripe -L -I2 $name)
3581 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3582 error "(7.2.2) NOT expect pattern flag hole, but got $pattern"
3584 local stripes=$($LFS getstripe -c -I1 $name)
3585 [ $stripes -eq 2 ] ||
3586 error "(7.3.1) expect 2 stripes, but got $stripes"
3588 stripes=$($LFS getstripe -c -I2 $name)
3589 [ $stripes -eq 2 ] ||
3590 error "(7.3.2) expect 2 stripes, but got $stripes"
# Component extents must be [0, 2097152) and [2097152, EOF).
3592 local e_start=$($LFS getstripe -I1 $name |
3593 awk '/lcme_extent.e_start:/ { print $2 }')
3594 [ $e_start -eq 0 ] ||
3595 error "(7.4.1) expect the COMP1 start at 0, got $e_start"
3597 local e_end=$($LFS getstripe -I1 $name |
3598 awk '/lcme_extent.e_end:/ { print $2 }')
3599 [ $e_end -eq 2097152 ] ||
3600 error "(7.4.2) expect the COMP1 end at 2097152, got $e_end"
3602 e_start=$($LFS getstripe -I2 $name |
3603 awk '/lcme_extent.e_start:/ { print $2 }')
3604 [ $e_start -eq 2097152 ] ||
3605 error "(7.5.1) expect the COMP2 start at 2097152, got $e_start"
3607 e_end=$($LFS getstripe -I2 $name |
3608 awk '/lcme_extent.e_end:/ { print $2 }')
3609 [ "$e_end" = "EOF" ] ||
3610 error "(7.5.2) expect the COMP2 end at (EOF), got $e_end"
3612 local size=$(stat $name | awk '/Size:/ { print $2 }')
3613 [ $size -eq $((4096 * $bcount)) ] ||
3614 error "(7.6) expect the size $((4096 * $bcount)), but got $size"
# Read, append, chown, touch and unlink must all succeed on the intact file.
3616 cat $name > /dev/null || error "(7.7) cannot read $name"
3618 echo "dummy" >> $name || error "(7.8) cannot write $name"
3620 chown $RUNAS_ID:$RUNAS_GID $name || error "(7.9) cannot chown on $name"
3622 touch $name || error "(7.10) cannot touch $name"
3624 rm -f $name || error "(7.11) cannot unlink $name"
# ---- Section 8: f1 lost the first OST-object of each component ----
3627 # ${fid1}-R-0 contains the old f1's second stripe in each COMP
3629 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3630 echo "Check $name, it contains f1's second OST-object in each COMP"
3632 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
3634 pattern=$($LFS getstripe -L -I1 $name)
3635 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3636 error "(8.2.1) expect pattern flag hole, but got $pattern"
3638 pattern=$($LFS getstripe -L -I2 $name)
3639 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3640 error "(8.2.2) expect pattern flag hole, but got $pattern"
# Label fixed to (8.3.1) so the two stripe-count checks can be told apart.
3642 stripes=$($LFS getstripe -c -I1 $name)
3643 [ $stripes -eq 2 ] ||
3644 error "(8.3.1) expect 2 stripes, but got $stripes"
3646 stripes=$($LFS getstripe -c -I2 $name)
3647 [ $stripes -eq 2 ] ||
3648 error "(8.3.2) expect 2 stripes, but got $stripes"
3650 e_start=$($LFS getstripe -I1 $name |
3651 awk '/lcme_extent.e_start:/ { print $2 }')
3652 [ $e_start -eq 0 ] ||
3653 error "(8.4.1) expect the COMP1 start at 0, got $e_start"
3655 e_end=$($LFS getstripe -I1 $name |
3656 awk '/lcme_extent.e_end:/ { print $2 }')
3657 [ $e_end -eq 2097152 ] ||
3658 error "(8.4.2) expect the COMP1 end at 2097152, got $e_end"
3660 e_start=$($LFS getstripe -I2 $name |
3661 awk '/lcme_extent.e_start:/ { print $2 }')
3662 [ $e_start -eq 2097152 ] ||
3663 error "(8.5.1) expect the COMP2 start at 2097152, got $e_start"
3665 e_end=$($LFS getstripe -I2 $name |
3666 awk '/lcme_extent.e_end:/ { print $2 }')
3667 [ "$e_end" = "EOF" ] ||
3668 error "(8.5.2) expect the COMP2 end at (EOF), got $e_end"
3670 size=$(stat $name | awk '/Size:/ { print $2 }')
3671 [ $size -eq $((4096 * $bcount)) ] ||
3672 error "(8.6) expect the size $((4096 * $bcount)), but got $size"
# A plain read must fail; dd with conv=sync,noerror continues past errors so
# the number of "Input/output error" lines can be counted.
3674 cat $name > /dev/null && error "(8.7) normal read $name should fail"
3676 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3677 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3679 # The first stripe in each COMP was lost
3680 [ $failures -eq 512 ] ||
3681 error "(8.8) expect 512 IO failures, but get $failures"
3683 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3684 [ $size -eq $((4096 * $bcount)) ] ||
3685 error "(8.9) expect the size $((4096 * $bcount)), but got $size"
# Writing block 0 (the hole) must fail, writing block 300 must succeed.
3687 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
3688 error "(8.10) write to the LOV EA hole should fail"
3690 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
3691 error "(8.11) write to normal stripe should NOT fail"
3693 echo "foo" >> $name && error "(8.12) append write $name should fail"
3695 chown $RUNAS_ID:$RUNAS_GID $name || error "(8.13) cannot chown on $name"
3697 touch $name || error "(8.14) cannot touch $name"
3699 rm -f $name || error "(8.15) cannot unlink $name"
# ---- Section 9: f2 lost the second OST-object of each component ----
3702 # ${fid2}-R-0 contains the old f2's first stripe in each COMP
3704 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
3705 echo "Check $name, it contains f2's first stripe in each COMP"
3707 $LFS getstripe -v $name || error "(9.1) cannot getstripe on $name"
3709 pattern=$($LFS getstripe -L -I1 $name)
3710 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3711 error "(9.2.1) expect pattern flag hole, but got $pattern"
3713 pattern=$($LFS getstripe -L -I2 $name)
3714 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3715 error "(9.2.2) expect pattern flag hole, but got $pattern"
# Label fixed to (9.3.1) so the two stripe-count checks can be told apart.
3717 stripes=$($LFS getstripe -c -I1 $name)
3718 [ $stripes -eq 2 ] ||
3719 error "(9.3.1) expect 2 stripes, but got $stripes"
3721 stripes=$($LFS getstripe -c -I2 $name)
3722 [ $stripes -eq 2 ] ||
3723 error "(9.3.2) expect 2 stripes, but got $stripes"
3725 e_start=$($LFS getstripe -I1 $name |
3726 awk '/lcme_extent.e_start:/ { print $2 }')
3727 [ $e_start -eq 0 ] ||
3728 error "(9.4.1) expect the COMP1 start at 0, got $e_start"
3730 e_end=$($LFS getstripe -I1 $name |
3731 awk '/lcme_extent.e_end:/ { print $2 }')
3732 [ $e_end -eq 2097152 ] ||
3733 error "(9.4.2) expect the COMP1 end at 2097152, got $e_end"
3735 e_start=$($LFS getstripe -I2 $name |
3736 awk '/lcme_extent.e_start:/ { print $2 }')
3737 [ $e_start -eq 2097152 ] ||
3738 error "(9.5.1) expect the COMP2 start at 2097152, got $e_start"
3740 e_end=$($LFS getstripe -I2 $name |
3741 awk '/lcme_extent.e_end:/ { print $2 }')
3742 [ "$e_end" = "EOF" ] ||
3743 error "(9.5.2) expect the COMP2 end at (EOF), got $e_end"
3745 size=$(stat $name | awk '/Size:/ { print $2 }')
3746 # The second stripe in COMP was lost, so we do not know there
3747 # have ever been some data before. 'stat' will regard it as
3748 # no data on the lost stripe.
3750 [ $size -eq $((4096 * $bcount)) ] ||
3751 error "(9.6) expect size $((4096 * $bcount)), but got $size"
3753 cat $name > /dev/null &&
3754 error "(9.7) normal read $name should fail"
# The condition requires 512 failures; the message now states the same
# number instead of the stale "256".
3756 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3757 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3758 [ $failures -eq 512 ] ||
3759 error "(9.8) expect 512 IO failures, but get $failures"
3761 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3762 # The second stripe in COMP was lost, so we do not know there
3763 # have ever been some data before. Since 'dd' skip failure,
3764 # it will regard the lost stripe contains data.
3766 [ $size -eq $((4096 * $bcount)) ] ||
3767 error "(9.9) expect the size $((4096 * $bcount)), but got $size"
# Here block 300 is in the hole and block 0 is on an existing stripe.
3769 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3770 seek=300 && error "(9.10) write to the LOV EA hole should fail"
3772 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
3773 error "(9.11) write to normal stripe should NOT fail"
3775 echo "foo" >> $name &&
3776 error "(9.12) append write $name should fail"
3778 chown $RUNAS_ID:$RUNAS_GID $name ||
3779 error "(9.13) cannot chown on $name"
3781 touch $name || error "(9.14) cannot touch $name"
# Label fixed from (7.15) to (9.15): this unlink belongs to section 9.
3783 rm -f $name || error "(9.15) cannot unlink $name"
3785 run_test 20b "Handle the orphan with dummy LOV EA slot properly - PFL case"
# test_21: start LFSCK on MDT0000 without a "-t" type option and verify that
# both the namespace and the layout component enter "scanning-phase1", then
# stop LFSCK again. Skipped unless the MDS version is newer than 2.5.59.
3788 (( $MDS1_VERSION > $(version_code 2.5.59) )) ||
3789 skip "MDS older than 2.5.59, LU-4887"
3791 check_mount_and_prep
# Populate the test directory with 100 files for LFSCK to scan.
3792 createmany -o $DIR/$tdir/f 100 || error "(0) Fail to create 100 files"
3794 echo "Start all LFSCK components by default (-s 1)"
# NOTE(review): "-s 1" presumably throttles the scan so that it is still in
# phase 1 when the status is sampled below - confirm.
3795 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -s 1 -r ||
3796 error "Fail to start LFSCK"
3798 echo "namespace LFSCK should be in 'scanning-phase1' status"
3799 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
3800 [ "$STATUS" == "scanning-phase1" ] ||
3801 error "Expect namespace 'scanning-phase1', but got '$STATUS'"
3803 echo "layout LFSCK should be in 'scanning-phase1' status"
3804 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
3805 [ "$STATUS" == "scanning-phase1" ] ||
3806 error "Expect layout 'scanning-phase1', but got '$STATUS'"
3808 echo "Stop all LFSCK components by default"
3809 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 ||
3810 error "Fail to stop LFSCK"
3812 run_test 21 "run all LFSCK components by default"
# test_22a: create a directory whose ".." entry points at a parent that is
# then removed, run namespace LFSCK on all targets, and verify that exactly
# one unmatched pair is repaired and the directory can be listed again.
# Needs at least 2 MDTs and an MDS newer than 2.6.50.
3815 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3816 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
3817 skip "MDS older than 2.6.50, LU-5511"
# The inner quotes around ".." are escaped so they are printed literally
# instead of terminating the double-quoted string.
3820 echo "The parent_A references the child directory via some name entry,"
3821 echo "but the child directory back references another parent_B via its"
3822 echo "\"..\" name entry. The parent_B does not exist. Then the namespace"
3823 echo "LFSCK will repair the child directory's \"..\" name entry."
3826 check_mount_and_prep
# guard and foo both live on MDT index 1.
3828 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
3829 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
3831 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
3832 echo "The dummy's dotdot name entry references the guard."
3833 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
3834 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
# dummy is created on MDT index 0 while fail_loc 0x161e is armed.
3835 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
3836 error "(3) Fail to mkdir on MDT0"
3837 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Removing guard leaves dummy's ".." pointing at a non-existent parent.
3839 rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"
3841 echo "Trigger namespace LFSCK to repair unmatched pairs"
3842 $START_NAMESPACE -A -r ||
3843 error "(5) Fail to start LFSCK for namespace"
3845 wait_all_targets_blocked namespace completed 6
3847 local repaired=$($SHOW_NAMESPACE |
3848 awk '/^unmatched_pairs_repaired/ { print $2 }')
3849 [ $repaired -eq 1 ] ||
3850 error "(7) Fail to repair unmatched pairs: $repaired"
3852 echo "'ls' should success after namespace LFSCK repairing"
3853 ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
3854 error "(8) ls should success."
3856 run_test 22a "LFSCK can repair unmatched pairs (1)"
# test_22b: create a directory whose ".." entry and linkEA point at the wrong
# (still existing) parent, verify that fid2path does not resolve it, run
# namespace LFSCK on all targets, and verify that exactly one unmatched pair
# is repaired and fid2path returns the real path afterwards.
# Needs at least 2 MDTs and an MDS newer than 2.6.50.
3859 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3860 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
3861 skip "MDS older than 2.6.50, LU-5511"
# The inner quotes around ".." are escaped so they are printed literally
# instead of terminating the double-quoted string.
3864 echo "The parent_A references the child directory via the name entry_B,"
3865 echo "but the child directory back references another parent_C via its"
3866 echo "\"..\" name entry. The parent_C exists, but there is no the name"
3867 echo "entry_B under the parent_C. Then the namespace LFSCK will repair"
3868 echo "the child directory's \"..\" name entry and its linkEA."
3871 check_mount_and_prep
# guard and foo both live on MDT index 1; guard is kept in this variant.
3873 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
3874 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
3876 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
3877 echo "and bad linkEA. The dummy's dotdot name entry references the"
3878 echo "guard. The dummy's linkEA references n non-exist name entry."
3879 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
3880 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
# dummy is created on MDT index 0 while fail_loc 0x161e is armed.
3881 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
3882 error "(3) Fail to mkdir on MDT0"
3883 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Before the repair, fid2path must not return the real path of dummy.
3885 local dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
3886 echo "fid2path should NOT work on the dummy's FID $dummyfid"
3887 local dummyname=$($LFS fid2path $DIR $dummyfid)
3888 [ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
3889 error "(4) fid2path works unexpectedly."
3891 echo "Trigger namespace LFSCK to repair unmatched pairs"
3892 $START_NAMESPACE -A -r ||
3893 error "(5) Fail to start LFSCK for namespace"
3895 wait_all_targets_blocked namespace completed 6
3897 local repaired=$($SHOW_NAMESPACE |
3898 awk '/^unmatched_pairs_repaired/ { print $2 }')
3899 [ $repaired -eq 1 ] ||
3900 error "(7) Fail to repair unmatched pairs: $repaired"
# After the repair, fid2path must return the real path; dummyname is already
# declared local above, so it is only re-assigned here.
3902 echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
3903 dummyname=$($LFS fid2path $DIR $dummyfid)
3904 [ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
3905 error "(8) fid2path does not work"
3907 run_test 22b "LFSCK can repair unmatched pairs (2)"
# Test 23a: a remote directory d0/d1 is removed while fail_loc 0x1620 is set
# on mds2, which is expected to leave a name entry that ls cannot resolve.
# A first namespace LFSCK run (without -C) must count one dangling entry but
# leave ls failing; a second run with -C must make ls succeed.
# Requires at least two MDTs and an MDS newer than 2.6.50.
3910 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3911 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
3912 skip "MDS older than 2.6.50, LU-5512"
3915 echo "The name entry is there, but the MDT-object for such name "
3916 echo "entry does not exist. The namespace LFSCK should find out "
3917 echo "and repair the inconsistency as required."
3920 check_mount_and_prep
# Parent d0 on MDT index 0, child d1 on MDT index 1.
3922 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3923 $LFS mkdir -i 1 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT1"
3925 echo "Inject failure stub on MDT1 to simulate dangling name entry"
3926 #define OBD_FAIL_LFSCK_DANGLING2 0x1620
# The fail_loc is set on the mds2 facet only for the duration of the rmdir.
3927 do_facet mds2 $LCTL set_param fail_loc=0x1620
3928 rmdir $DIR/$tdir/d0/d1 || error "(3) Fail to rmdir d1"
3929 do_facet mds2 $LCTL set_param fail_loc=0
3931 echo "'ls' should fail because of dangling name entry"
3932 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(4) ls should fail."
3934 echo "Trigger namespace LFSCK to find out dangling name entry"
3935 $START_NAMESPACE -A -r ||
3936 error "(5) Fail to start LFSCK for namespace"
# Presumably waits for all targets to reach status completed; the trailing
# number looks like the error id reported on failure. TODO confirm.
3938 wait_all_targets_blocked namespace completed 6
3940 local repaired=$($SHOW_NAMESPACE |
3941 awk '/^dangling_repaired/ { print $2 }')
3942 [ $repaired -eq 1 ] ||
3943 error "(7) Fail to repair dangling name entry: $repaired"
# Without -C the counter is bumped but the object is not re-created, so the
# path must still be unreadable at this point.
3945 echo "'ls' should fail because not re-create MDT-object by default"
3946 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(8) ls should fail."
3948 echo "Trigger namespace LFSCK again to repair dangling name entry"
3949 $START_NAMESPACE -A -r -C ||
3950 error "(9) Fail to start LFSCK for namespace"
3952 wait_all_targets_blocked namespace completed 10
3954 repaired=$($SHOW_NAMESPACE |
3955 awk '/^dangling_repaired/ { print $2 }')
3956 [ $repaired -eq 1 ] ||
3957 error "(11) Fail to repair dangling name entry: $repaired"
3959 echo "'ls' should success after namespace LFSCK repairing"
3960 ls -ail $DIR/$tdir/d0/d1 > /dev/null || error "(12) ls should success."
3962 run_test 23a "LFSCK can repair dangling name entry (1)"
# Test 23b: a hard link named foo is created for f0 while fail_loc 0x1621 is
# set with fail_val equal to the OID of f1; f1 is then removed so that foo
# cannot be listed. A namespace LFSCK run with -C on the single MDT must
# report dangling_repaired 1 and multiple_linked_repaired 1, and foo must
# end up holding the content written to f0.
# Requires an MDS newer than 2.6.50.
3965 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
3966 skip "MDS older than 2.6.50, LU-5512"
3969 echo "The objectA has multiple hard links, one of them corresponding"
3970 echo "to the name entry_B. But there is something wrong for the name"
3971 echo "entry_B and cause entry_B to references non-exist object_C."
3972 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3973 echo "as dangling, and re-create the lost object_C. When the LFSCK"
3974 echo "comes to the second-stage scanning, it will find that the"
3975 echo "former re-creating object_C is not proper, and will try to"
3976 echo "replace the object_C with the real object_A."
3979 check_mount_and_prep
# The bare path2fid calls below only print FIDs into the test log.
3981 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3982 $LFS path2fid $DIR/$tdir/d0
# Ten filler files, removed again right before f1 is deleted (see LU-7429
# comment further down).
3984 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
3986 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3987 $LFS path2fid $DIR/$tdir/d0/f0
3989 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
3990 $LFS path2fid $DIR/$tdir/d0/f1
# First colon-separated field of each FID, i.e. the sequence part.
3992 local SEQ0=$($LFS path2fid $DIR/$tdir/d0/f0 | awk -F':' '{print $1}')
3993 local SEQ1=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $1}')
3995 if [ "$SEQ0" != "$SEQ1" ]; then
3996 # To guarantee that the f0 and f1 are in the same FID seq
3997 rm -f $DIR/$tdir/d0/f0 ||
3998 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
3999 echo "dummy" > $DIR/$tdir/d0/f0 ||
4000 error "(3.2) Fail to touch on MDT0"
4001 $LFS path2fid $DIR/$tdir/d0/f0
# Second FID field of f1, normalised to decimal with printf before it is
# handed to fail_val (presumably path2fid prints it in hex - confirm).
4004 local OID=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $2}')
4005 OID=$(printf %d $OID)
4007 echo "Inject failure stub on MDT0 to simulate dangling name entry"
4008 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
4009 do_facet $SINGLEMDS $LCTL set_param fail_val=$OID fail_loc=0x1621
4010 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
4011 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
4013 # If there is creation after the dangling injection, it may re-use
4014 # the just released local object (inode) that is referenced by the
4015 # dangling name entry. It will fail the dangling injection.
4016 # So before deleting the target object for the dangling name entry,
4017 # remove some other objects to avoid the target object being reused
4018 # by some potential creations. LU-7429
4019 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
4021 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
4023 echo "'ls' should fail because of dangling name entry"
4024 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4025 error "(6) ls should fail."
4027 echo "Trigger namespace LFSCK to find out dangling name entry"
4028 $START_NAMESPACE -r -C ||
4029 error "(7) Fail to start LFSCK for namespace"
# Poll the status line of lfsck_namespace on the single MDS until it reads
# completed; 32 is passed as the last argument (presumably a timeout).
4031 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4032 mdd.${MDT_DEV}.lfsck_namespace |
4033 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4035 error "(8) unexpected status"
4038 local repaired=$($SHOW_NAMESPACE |
4039 awk '/^dangling_repaired/ { print $2 }')
4040 [ $repaired -eq 1 ] ||
4041 error "(9) Fail to repair dangling name entry: $repaired"
4043 repaired=$($SHOW_NAMESPACE |
4044 awk '/^multiple_linked_repaired/ { print $2 }')
4045 [ $repaired -eq 1 ] ||
4046 error "(10) Fail to drop the former created object: $repaired"
# foo must now read back the content that was written to f0.
4048 local data=$(cat $DIR/$tdir/d0/foo)
4049 [ "$data" == "dummy" ] ||
4050 error "(11) The $DIR/$tdir/d0/foo is not recovered: $data"
4052 run_test 23b "LFSCK can repair dangling name entry (2)"
# NOTE(review): the enclosing function header is not visible in this view.
# This looks like the cleanup helper for test 23c: it clears fail_val and
# fail_loc on the single MDS, waits for the namespace LFSCK status to read
# completed, and stops the full debug logging that test 23c starts.
# Clearing the fail_loc first presumably lets a delayed LFSCK proceed - confirm.
4055 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
4056 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4057 mdd.${MDT_DEV}.lfsck_namespace |
4058 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4060 error "(10) unexpected status"
4063 stop_full_debug_logging
# Test 23c: same injection as test 23b (hard link foo created under fail_loc
# 0x1621 with fail_val set to the OID of f1, then f1 removed), but the LFSCK
# is slowed with fail_loc 0x1602 and the test appends data to foo while the
# LFSCK is still running. The test passes when dangling_repaired is 1 and
# foo does NOT read back as the original f0 content.
# Requires an MDS newer than 2.6.50.
4067 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4068 skip "MDS older than 2.6.50, LU-5512"
4071 echo "The objectA has multiple hard links, one of them corresponding"
4072 echo "to the name entry_B. But there is something wrong for the name"
4073 echo "entry_B and cause entry_B to references non-exist object_C."
4074 echo "In the first-stage scanning, the LFSCK will think the entry_B"
4075 echo "as dangling, and re-create the lost object_C. And then others"
4076 echo "modified the re-created object_C. When the LFSCK comes to the"
4077 echo "second-stage scanning, it will find that the former re-creating"
4078 echo "object_C maybe wrong and try to replace the object_C with the"
4079 echo "real object_A. But because object_C has been modified, so the"
4080 echo "LFSCK cannot replace it."
4083 start_full_debug_logging
4085 check_mount_and_prep
4087 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
4088 parent_fid="$($LFS path2fid $DIR/$tdir/d0)"
4089 echo "parent_fid=$parent_fid"
# Ten filler files, removed again right before f1 is deleted (see LU-7429
# comment further down).
4091 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
4093 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
4094 f0_fid="$($LFS path2fid $DIR/$tdir/d0/f0)"
4095 echo "f0_fid=$f0_fid"
4097 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
4098 f1_fid="$($LFS path2fid $DIR/$tdir/d0/f1)"
4099 echo "f1_fid=$f1_fid"
# Compare only the sequence part of the two FIDs, i.e. the text before the
# first colon. The variables must be the f0_fid and f1_fid set above: the
# previous spelling referenced unset names, so both sides were empty and
# this guard could never fire. The injection below passes only an OID via
# fail_val, which is why both files need to share one sequence.
4101 if [ "${f0_fid%%:*}" != "${f1_fid%%:*}" ]; then
4102 # To guarantee that the f0 and f1 are in the same FID seq
4103 rm -f $DIR/$tdir/d0/f0 ||
4104 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
4105 echo "dummy" > $DIR/$tdir/d0/f0 ||
4106 error "(3.2) Fail to touch on MDT0"
4107 f0_fid="$($LFS path2fid $DIR/$tdir/d0/f0)"
4108 echo "f0_fid=$f0_fid (replaced)"
# Second colon-separated field of the f1 FID, passed unconverted as fail_val.
4111 local oid=$(awk -F':' '{ printf $2 }' <<< $f1_fid)
4113 echo "Inject failure stub on MDT0 to simulate dangling name entry"
4114 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
4115 do_facet $SINGLEMDS $LCTL set_param fail_val=$oid fail_loc=0x1621
4116 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
4117 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
4119 # If there is creation after the dangling injection, it may re-use
4120 # the just released local object (inode) that is referenced by the
4121 # dangling name entry. It will fail the dangling injection.
4122 # So before deleting the target object for the dangling name entry,
4123 # remove some other objects to avoid the target object being reused
4124 # by some potential creations. LU-7429
4125 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
4127 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
4129 echo "'ls' should fail because of dangling name entry"
4130 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4131 error "(6) ls should fail."
# Slow the LFSCK down so the write below can land while it is still running
# (fail_val=10 is presumably a delay in seconds - confirm).
4133 #define OBD_FAIL_LFSCK_DELAY3 0x1602
4134 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
4136 echo "Trigger namespace LFSCK to find out dangling name entry"
4137 $START_NAMESPACE -r -C ||
4138 error "(7) Fail to start LFSCK for namespace"
# Wait until foo is visible from the client with size 0, i.e. the object
# has been re-created but carries no data yet.
4140 wait_update_facet client "stat -c%s $DIR/$tdir/d0/foo" "0" $LTIME || {
4141 # While unexpected by the test, it is valid for LFSCK to repair
4142 # the link to the original object before any data is written.
4143 local size=$(stat -c %s $DIR/$tdir/d0/foo)
4145 if [ "$size" = "6" -a "$(<$DIR/$tdir/d0/foo)" = "dummy" ]; then
4146 log "LFSCK repaired file prematurely"
4151 stat $DIR/$tdir/d0/foo
4153 error "(8) unexpected size"
# Modify the re-created object so the LFSCK can no longer swap it for f0.
4156 echo "data" >> $DIR/$tdir/d0/foo || error "(9) Fail to write"
4157 cancel_lru_locks osc
4161 local repaired=$($SHOW_NAMESPACE |
4162 awk '/^dangling_repaired/ { print $2 }')
4163 [ $repaired -eq 1 ] ||
4164 error "(11) Fail to repair dangling name entry: $repaired"
4166 local data=$(cat $DIR/$tdir/d0/foo)
4167 [ "$data" != "dummy" ] ||
4168 error "(12) The $DIR/$tdir/d0/foo should not be recovered"
4170 run_test 23c "LFSCK can repair dangling name entry (3)"
# Test 24: a directory dummy/foo is created and removed on MDT0 while
# fail_loc 0x1622 is set, next to an existing guard/foo. After a namespace
# LFSCK on all targets the test expects multiple_referenced_repaired to be 1
# and an entry named <child fid>-<parent fid>-D-0 to exist under
# .lustre/lost+found/MDT0000.
# Requires at least two MDTs, no FILESET, and an MDS newer than 2.6.50.
4173 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4174 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4175 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4176 skip "MDS older than 2.6.50, LU-5513"
4179 echo "Two MDT-objects back reference the same name entry via their"
4180 echo "each own linkEA entry, but the name entry only references one"
4181 echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
4182 echo "for the MDT-object that is not recognized. If such MDT-object"
4183 echo "has no other linkEA entry after the removing, then the LFSCK"
4184 echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."
4187 check_mount_and_prep
4189 mkdir_on_mdt -i1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
# The bare path2fid calls only print the FIDs into the test log.
4191 mkdir $DIR/$tdir/d0/guard || error "(1) Fail to mkdir guard"
4192 $LFS path2fid $DIR/$tdir/d0/guard
4194 mkdir $DIR/$tdir/d0/dummy || error "(2) Fail to mkdir dummy"
4195 $LFS path2fid $DIR/$tdir/d0/dummy
# pfid is the parent FID used later to build the expected orphan name; it
# is taken from guard or from dummy depending on the MDT backend type.
# NOTE(review): pfid is not declared local here.
4198 if [ $mds1_FSTYPE != ldiskfs ]; then
4199 pfid=$($LFS path2fid $DIR/$tdir/d0/guard)
4201 pfid=$($LFS path2fid $DIR/$tdir/d0/dummy)
4204 touch $DIR/$tdir/d0/guard/foo ||
4205 error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"
4207 echo "Inject failure stub on MDT0 to simulate the case that"
4208 echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
4209 echo "that references $DIR/$tdir/d0/guard/foo."
4210 echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
4211 echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
4212 echo "there with the same linkEA entry as another MDT-object"
4213 echo "$DIR/$tdir/d0/guard/foo has"
4215 #define OBD_FAIL_LFSCK_MUL_REF 0x1622
# Both the mkdir and the rmdir of dummy/foo happen under the fail_loc; the
# child FID is captured in between for the orphan name check.
4216 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
4217 mkdir_on_mdt -i0 $DIR/$tdir/d0/dummy/foo ||
4218 error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
4219 $LFS path2fid $DIR/$tdir/d0/dummy/foo
4220 local cfid=$($LFS path2fid $DIR/$tdir/d0/dummy/foo)
4221 rmdir $DIR/$tdir/d0/dummy/foo ||
4222 error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
4223 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4225 echo "stat $DIR/$tdir/d0/dummy/foo should fail"
4226 stat $DIR/$tdir/d0/dummy/foo > /dev/null 2>&1 &&
4227 error "(6) stat successfully unexpectedly"
4229 echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
4230 $START_NAMESPACE -A -r ||
4231 error "(7) Fail to start LFSCK for namespace"
# Presumably waits for all targets to reach status completed; the trailing
# number looks like the error id reported on failure. TODO confirm.
4233 wait_all_targets_blocked namespace completed 8
4235 local repaired=$($SHOW_NAMESPACE |
4236 awk '/^multiple_referenced_repaired/ { print $2 }')
4237 [ $repaired -eq 1 ] ||
4238 error "(9) Fail to repair multiple referenced name entry: $repaired"
4240 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
4241 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4242 error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# Expected orphan entry name: child FID, parent FID, then the suffix D-0.
4244 local cname="$cfid-$pfid-D-0"
4245 ls -ail $MOUNT/.lustre/lost+found/MDT0000/$cname ||
4246 error "(11) .lustre/lost+found/MDT0000/ should not be empty"
4248 run_test 24 "LFSCK can repair multiple-referenced name entry"
# Test 25: a file is created on MDT0 while fail_loc 0x1623 is set, then a
# namespace LFSCK is run on the single MDT. The test passes when the LFSCK
# status reads completed, bad_file_type_repaired is 1 and the directory can
# be listed. Runs only on an ldiskfs MDT with an MDS newer than 2.6.50.
4251 [[ $mds1_FSTYPE == ldiskfs ]] || skip "only ldiskfs fixes dirent type"
4252 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4253 skip "MDS older than 2.6.50, LU-5515"
4256 echo "The file type in the name entry does not match the file type"
4257 echo "claimed by the referenced object. Then the LFSCK will update"
4258 echo "the file type in the name entry."
4261 check_mount_and_prep
4263 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4265 echo "Inject failure stub on MDT0 to simulate the case that"
4266 echo "the file type stored in the name entry is wrong."
4268 #define OBD_FAIL_LFSCK_BAD_TYPE 0x1623
# The fail_loc is active only while foo is created.
4269 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
4270 touch $DIR/$tdir/d0/foo || error "(2) Fail to touch $DIR/$tdir/d0/foo"
4271 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4273 echo "Trigger namespace LFSCK to repair bad file type in the name entry"
4274 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace"
# Poll the status line of lfsck_namespace on the single MDS until it reads
# completed; 32 is passed as the last argument (presumably a timeout).
4276 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4277 mdd.${MDT_DEV}.lfsck_namespace |
4278 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4280 error "(4) unexpected status"
4283 local repaired=$($SHOW_NAMESPACE |
4284 awk '/^bad_file_type_repaired/ { print $2 }')
4285 [ $repaired -eq 1 ] ||
4286 error "(5) Fail to repair bad file type in name entry: $repaired"
4288 ls -ail $DIR/$tdir/d0 || error "(6) Fail to 'ls' the $DIR/$tdir/d0"
4290 run_test 25 "LFSCK can repair bad file type in the name entry"
# Test 26a: a file foo with a second hard link is unlinked on MDT0 while
# fail_loc 0x1624 is set, after which foo can no longer be listed. After a
# namespace LFSCK on all targets the test expects lost_dirent_repaired to
# be 1 and foo to be listable again with the same FID as before.
# Requires an MDS newer than 2.6.50.
4293 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4294 skip "MDS older than 2.6.50, LU-5516"
4297 echo "The local name entry back referenced by the MDT-object is lost."
4298 echo "The namespace LFSCK will add the missing local name entry back"
4299 echo "to the normal namespace."
4302 check_mount_and_prep
4304 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4305 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
# Remember the FID so it can be compared after the repair.
4306 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
4308 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
4309 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4311 echo "Inject failure stub on MDT0 to simulate the case that"
4312 echo "foo's name entry will be removed, but the foo's object"
4313 echo "and its linkEA are kept in the system."
4315 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fail_loc is active only while foo is unlinked.
4316 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4317 rm -f $DIR/$tdir/d0/foo || error "(4) Fail to unlink $DIR/$tdir/d0/foo"
4318 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4320 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4321 error "(5) 'ls' should fail"
4323 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4324 $START_NAMESPACE -r -A ||
4325 error "(6) Fail to start LFSCK for namespace"
# Presumably waits for all targets to reach status completed; the trailing
# number looks like the error id reported on failure. TODO confirm.
4327 wait_all_targets_blocked namespace completed 7
4329 local repaired=$($SHOW_NAMESPACE |
4330 awk '/^lost_dirent_repaired/ { print $2 }')
4331 [ $repaired -eq 1 ] ||
4332 error "(8) Fail to repair lost dirent: $repaired"
4334 ls -ail $DIR/$tdir/d0/foo ||
4335 error "(9) Fail to 'ls' $DIR/$tdir/d0/foo"
# The restored name must point at the original object.
4337 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
4338 [ "$foofid" == "$foofid2" ] ||
4339 error "(10) foo's FID changed: $foofid, $foofid2"
4341 run_test 26a "LFSCK can add the missing local name entry back to the namespace"
# Test 26b: cross-MDT variant of test 26a. Directory foo lives on MDT index
# 0 under a parent on MDT index 1 and is removed with rmdir while fail_loc
# 0x1624 is set. After a namespace LFSCK on all targets the test expects
# lost_dirent_repaired to be 1 and foo to be listable with an unchanged FID.
# Requires at least two MDTs and an MDS newer than 2.6.50.
4344 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4345 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4346 skip "MDS older than 2.6.50, LU-5516"
4349 echo "The remote name entry back referenced by the MDT-object is lost."
4350 echo "The namespace LFSCK will add the missing remote name entry back"
4351 echo "to the normal namespace."
4354 check_mount_and_prep
4356 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4357 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
# Remember the FID so it can be compared after the repair.
4358 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
4360 echo "Inject failure stub on MDT0 to simulate the case that"
4361 echo "foo's name entry will be removed, but the foo's object"
4362 echo "and its linkEA are kept in the system."
4364 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fail_loc is active only while foo is removed.
4365 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4366 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
4367 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4369 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4370 error "(4) 'ls' should fail"
4372 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4373 $START_NAMESPACE -r -A ||
4374 error "(5) Fail to start LFSCK for namespace"
# Presumably waits for all targets to reach status completed; the trailing
# number looks like the error id reported on failure. TODO confirm.
4376 wait_all_targets_blocked namespace completed 6
4378 local repaired=$($SHOW_NAMESPACE |
4379 awk '/^lost_dirent_repaired/ { print $2 }')
4380 [ $repaired -eq 1 ] ||
4381 error "(7) Fail to repair lost dirent: $repaired"
4383 ls -ail $DIR/$tdir/d0/foo ||
4384 error "(8) Fail to 'ls' $DIR/$tdir/d0/foo"
# The restored name must point at the original object.
4386 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
4387 [ "$foofid" == "$foofid2" ] ||
4388 error "(9) foo's FID changed: $foofid, $foofid2"
4390 run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
# Test 27a: foo and its hard link dummy are unlinked on MDT0 while fail_loc
# 0x1624 is set, then the parent directory d0 is removed as well. After a
# namespace LFSCK on all targets the test expects lost_dirent_repaired to
# be 1 and at least one entry whose name contains -P- to exist under
# .lustre/lost+found/MDT0000.
# Requires no FILESET and an MDS newer than 2.6.50.
4393 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4394 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4395 skip "MDS older than 2.6.50, LU-5516"
4398 echo "The local parent referenced by the MDT-object linkEA is lost."
4399 echo "The namespace LFSCK will re-create the lost parent as orphan."
4402 check_mount_and_prep
4404 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4405 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4406 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
4407 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4409 echo "Inject failure stub on MDT0 to simulate the case that"
4410 echo "foo's name entry will be removed, but the foo's object"
4411 echo "and its linkEA are kept in the system. And then remove"
4412 echo "another hard link and the parent directory."
4414 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# Both unlinks happen under the fail_loc; the parent is removed afterwards
# with the fail_loc cleared.
4415 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4416 rm -f $DIR/$tdir/d0/foo ||
4417 error "(4) Fail to unlink $DIR/$tdir/d0/foo"
4418 rm -f $DIR/$tdir/d0/dummy ||
4419 error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
4420 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4422 rm -rf $DIR/$tdir/d0 || error "(5) Fail to unlink the dir d0"
4423 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(6) 'ls' should fail"
4425 echo "Trigger namespace LFSCK to repair the lost parent"
4426 $START_NAMESPACE -r -A ||
4427 error "(6) Fail to start LFSCK for namespace"
# Presumably waits for all targets to reach status completed; the trailing
# number looks like the error id reported on failure. TODO confirm.
4429 wait_all_targets_blocked namespace completed 7
4431 local repaired=$($SHOW_NAMESPACE |
4432 awk '/^lost_dirent_repaired/ { print $2 }')
4433 [ $repaired -eq 1 ] ||
4434 error "(8) Fail to repair lost dirent: $repaired"
4436 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
4437 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4438 error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4440 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# The -name pattern is quoted so that find receives it verbatim; unquoted,
# the shell would expand it first against any matching file in the current
# working directory. cname is kept local to this test.
4442 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-P-*")
4443 [ ! -z "$cname" ] ||
4444 error "(10) .lustre/lost+found/MDT0000/ should not be empty"
4446 run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
# Test 27b: cross-MDT variant of test 27a. Directory foo on MDT index 0 is
# removed under fail_loc 0x1624, then its parent d0 on MDT index 1 is
# removed. After a namespace LFSCK on all targets the test expects
# lost_dirent_repaired to be 1 and at least one entry whose name contains
# -P- to exist under .lustre/lost+found/MDT0001.
# Requires at least two MDTs, no FILESET, and an MDS newer than 2.6.50.
4449 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4450 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4451 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4452 skip "MDS older than 2.6.50, LU-5516"
4455 echo "The remote parent referenced by the MDT-object linkEA is lost."
4456 echo "The namespace LFSCK will re-create the lost parent as orphan."
4459 check_mount_and_prep
4461 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4462 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
# Printed into the test log only.
4464 $LFS path2fid $DIR/$tdir/d0
4466 echo "Inject failure stub on MDT0 to simulate the case that"
4467 echo "foo's name entry will be removed, but the foo's object"
4468 echo "and its linkEA are kept in the system. And then remove"
4469 echo "the parent directory."
4471 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fail_loc is active only while foo is removed.
4472 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4473 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
4474 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4476 rmdir $DIR/$tdir/d0 || error "(4) Fail to unlink the dir d0"
4477 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(5) 'ls' should fail"
4479 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4480 $START_NAMESPACE -r -A ||
4481 error "(6) Fail to start LFSCK for namespace"
# Presumably waits for all targets to reach status completed; the trailing
# number looks like the error id reported on failure. TODO confirm.
4483 wait_all_targets_blocked namespace completed 7
4485 local repaired=$($SHOW_NAMESPACE |
4486 awk '/^lost_dirent_repaired/ { print $2 }')
4487 [ $repaired -eq 1 ] ||
4488 error "(8) Fail to repair lost dirent: $repaired"
4490 ls -ail $MOUNT/.lustre/lost+found/
4492 echo "There should be an orphan under .lustre/lost+found/MDT0001/"
4493 [ -d $MOUNT/.lustre/lost+found/MDT0001 ] ||
4494 error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"
4496 ls -ail $MOUNT/.lustre/lost+found/MDT0001/
# The -name pattern is quoted so that find receives it verbatim; unquoted,
# the shell would expand it first against any matching file in the current
# working directory. cname is kept local to this test.
4498 local cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name "*-P-*")
4499 [ ! -z "$cname" ] ||
4500 error "(10) .lustre/lost+found/MDT0001/ should not be empty"
4502 run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
# Test 28: two cross-MDT directories (d1/a1 and d2/a2) are removed while
# fail_loc 0x1624 is set on both mds1 and mds2. A namespace LFSCK is then
# run on all targets with fail_loc 0x161c set on mds2. Expected after the
# first run: mds1 status partial with lost_dirent_repaired 0, mds2 status
# completed with lost_dirent_repaired 1. After a second run without the
# fail_loc: lost_dirent_repaired 1 on mds1 and 0 on mds2.
# Requires at least two MDTs and an MDS newer than 2.6.50.
4505 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4506 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4507 skip "MDS older than 2.6.50, LU-5506"
4510 echo "The target name entry is lost. The LFSCK should insert the"
4511 echo "orphan MDT-object under .lustre/lost+found/MDTxxxx. But if"
4512 echo "the MDT (on which the orphan MDT-object resides) has ever"
4513 echo "failed to respond some name entry verification during the"
4514 echo "first stage-scanning, then the LFSCK should skip to handle"
4515 echo "orphan MDT-object on this MDT. But other MDTs should not"
4519 check_mount_and_prep
# d1 on MDT index 0 with children on index 1; d2 on index 1 with children
# on index 0. These mkdir calls are not checked for failure.
4520 $LFS mkdir -i 0 $DIR/$tdir/d1
4521 $LFS mkdir -i 1 $DIR/$tdir/d1/a1
4522 $LFS mkdir -i 1 $DIR/$tdir/d1/a2
4524 $LFS mkdir -i 1 $DIR/$tdir/d2
4525 $LFS mkdir -i 0 $DIR/$tdir/d2/a1
4526 $LFS mkdir -i 0 $DIR/$tdir/d2/a2
4528 echo "Inject failure stub on MDT0 to simulate the case that"
4529 echo "d1/a1's name entry will be removed, but the d1/a1's object"
4530 echo "and its linkEA are kept in the system. And the case that"
4531 echo "d2/a2's name entry will be removed, but the d2/a2's object"
4532 echo "and its linkEA are kept in the system."
4534 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fail_loc is set on both MDS facets for the two rmdir calls.
4535 do_facet mds1 $LCTL set_param fail_loc=0x1624
4536 do_facet mds2 $LCTL set_param fail_loc=0x1624
4537 rmdir $DIR/$tdir/d1/a1 || error "(1) Fail to rmdir $DIR/$tdir/d1/a1"
4538 rmdir $DIR/$tdir/d2/a2 || error "(2) Fail to rmdir $DIR/$tdir/d2/a2"
4539 do_facet mds1 $LCTL set_param fail_loc=0
4540 do_facet mds2 $LCTL set_param fail_loc=0
4542 cancel_lru_locks mdc
4543 cancel_lru_locks osc
4545 echo "Inject failure, to simulate the MDT0 fail to handle"
4546 echo "MDT1 LFSCK request during the first-stage scanning."
4547 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
# This fail_loc stays set on mds2 for the whole first LFSCK run and is
# cleared only after both status checks below.
4548 do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0
4550 echo "Trigger namespace LFSCK on all devices to find out orphan object"
4551 $START_NAMESPACE -r -A ||
4552 error "(3) Fail to start LFSCK for namespace"
4554 wait_update_facet mds1 "$LCTL get_param -n \
4555 mdd.$(facet_svc mds1).lfsck_namespace |
4556 awk '/^status/ { print \\\$2 }'" "partial" 32 || {
4557 error "(4) mds1 is not the expected 'partial'"
4560 wait_update_facet mds2 "$LCTL get_param -n \
4561 mdd.$(facet_svc mds2).lfsck_namespace |
4562 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4563 error "(5) mds2 is not the expected 'completed'"
4566 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
# First run: nothing repaired on mds1, one lost dirent repaired on mds2.
4568 local repaired=$(do_facet mds1 $LCTL get_param -n \
4569 mdd.$(facet_svc mds1).lfsck_namespace |
4570 awk '/^lost_dirent_repaired/ { print $2 }')
4571 [ $repaired -eq 0 ] ||
4572 error "(6) Expect 0 fixed on mds1, but got: $repaired"
4574 repaired=$(do_facet mds2 $LCTL get_param -n \
4575 mdd.$(facet_svc mds2).lfsck_namespace |
4576 awk '/^lost_dirent_repaired/ { print $2 }')
4577 [ $repaired -eq 1 ] ||
4578 error "(7) Expect 1 fixed on mds2, but got: $repaired"
4580 echo "Trigger namespace LFSCK on all devices again to cleanup"
4581 $START_NAMESPACE -r -A ||
4582 error "(8) Fail to start LFSCK for namespace"
# Presumably waits for all targets to reach status completed; the trailing
# number looks like the error id reported on failure. TODO confirm.
4584 wait_all_targets_blocked namespace completed 9
# Second run: the remaining lost dirent is repaired on mds1, none on mds2.
4586 local repaired=$(do_facet mds1 $LCTL get_param -n \
4587 mdd.$(facet_svc mds1).lfsck_namespace |
4588 awk '/^lost_dirent_repaired/ { print $2 }')
4589 [ $repaired -eq 1 ] ||
4590 error "(10) Expect 1 fixed on mds1, but got: $repaired"
4592 repaired=$(do_facet mds2 $LCTL get_param -n \
4593 mdd.$(facet_svc mds2).lfsck_namespace |
4594 awk '/^lost_dirent_repaired/ { print $2 }')
4595 [ $repaired -eq 0 ] ||
4596 error "(11) Expect 0 fixed on mds2, but got: $repaired"
4598 run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
# Test 29a (currently disabled: its run_test line at the end is commented
# out). A hard link is created while fail_loc 0x1625 is set and the test
# expects stat to report a link count of 3. After a namespace LFSCK on all
# targets it expects nlinks_repaired to be 1 and the link count to be 2.
# Requires an MDS newer than 2.6.50.
4601 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4602 skip "MDS older than 2.6.50, LU-5517"
4605 echo "The object's nlink attribute is larger than the object's known"
4606 echo "name entries count. The LFSCK will repair the object's nlink"
4607 echo "attribute to match the known name entries count"
4610 check_mount_and_prep
4612 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4613 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4615 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4616 echo "nlink attribute is larger than its name entries count."
4618 #define OBD_FAIL_LFSCK_MORE_NLINK 0x1625
# The fail_loc is active only while the hard link is created.
4619 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
4620 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4621 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4622 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Drop cached client locks before stat, presumably so the link count is
# fetched from the server rather than from client cache - confirm.
4624 cancel_lru_locks mdc
4625 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
4626 [ $count -eq 3 ] || error "(4) Cannot inject error: $count"
4628 echo "Trigger namespace LFSCK to repair the nlink count"
4629 $START_NAMESPACE -r -A ||
4630 error "(5) Fail to start LFSCK for namespace"
4632 wait_all_targets_blocked namespace completed 6
4634 local repaired=$($SHOW_NAMESPACE |
4635 awk '/^nlinks_repaired/ { print $2 }')
4636 [ $repaired -eq 1 ] ||
4637 error "(7) Fail to repair nlink count: $repaired"
4639 cancel_lru_locks mdc
4640 count=$(stat --format=%h $DIR/$tdir/d0/foo)
4641 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
4643 # Disable 29a, we only allow nlink to be updated if the known linkEA
4644 # entries is larger than nlink count.
4646 #run_test 29a "LFSCK can repair bad nlink count (1)"
# Test 29b: a hard link is created while fail_loc 0x1626 is set and the test
# expects stat to report a link count of 1. After a namespace LFSCK on all
# targets it expects nlinks_repaired to be 1 and the link count to be 2.
# Requires an MDS newer than 2.6.50.
4649 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4650 skip "MDS older than 2.6.50, LU-5517"
4653 echo "The object's nlink attribute is smaller than the object's known"
4654 echo "name entries count. The LFSCK will repair the object's nlink"
4655 echo "attribute to match the known name entries count"
4658 check_mount_and_prep
4660 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4661 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4663 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4664 echo "nlink attribute is smaller than its name entries count."
4666 #define OBD_FAIL_LFSCK_LESS_NLINK 0x1626
# The fail_loc is active only while the hard link is created.
4667 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
4668 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4669 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4670 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Drop cached client locks before stat, presumably so the link count is
# fetched from the server rather than from client cache - confirm.
4672 cancel_lru_locks mdc
4673 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
4674 [ $count -eq 1 ] || error "(4) Cannot inject error: $count"
4676 echo "Trigger namespace LFSCK to repair the nlink count"
4677 $START_NAMESPACE -r -A ||
4678 error "(5) Fail to start LFSCK for namespace"
# Presumably waits for all targets to reach status completed; the trailing
# number looks like the error id reported on failure. TODO confirm.
4680 wait_all_targets_blocked namespace completed 6
4682 local repaired=$($SHOW_NAMESPACE |
4683 awk '/^nlinks_repaired/ { print $2 }')
4684 [ $repaired -eq 1 ] ||
4685 error "(7) Fail to repair nlink count: $repaired"
4687 cancel_lru_locks mdc
4688 count=$(stat --format=%h $DIR/$tdir/d0/foo)
4689 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
4691 run_test 29b "LFSCK can repair bad nlink count (2)"
# Test 29c: 150 hard links to guard/f0 are created in directory foo, then
# 100 of them are removed. With two or more MDTs the test expects lfs
# migrate of guard to fail both before and after the removal, and to
# succeed only after a namespace LFSCK on all targets. The LFSCK must report
# linkea_overflow_cleared 1 and nlinks_repaired 0.
# Requires an MDS newer than 2.6.50.
4695 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4696 skip "MDS older than 2.6.50, LU-5517"
4699 echo "The namespace LFSCK will create many hard links to the target"
4700 echo "file as to exceed the linkEA size limitation. Under such case"
4701 echo "the linkEA will be marked as overflow that will prevent the"
4702 echo "target file to be migrated. Then remove some hard links to"
4703 echo "make the left hard links to be held within the linkEA size"
4704 echo "limitation. But before the namespace LFSCK adding all the"
4705 echo "missed linkEA entries back, the overflow mark (timestamp)"
4706 echo "will not be cleared."
4709 check_mount_and_prep
# foo is placed on the highest MDT index; guard uses the default placement.
4711 mkdir -p $DIR/$tdir/guard || error "(0.1) Fail to mkdir"
4712 $LFS mkdir -i $((MDSCOUNT - 1)) $DIR/$tdir/foo ||
4713 error "(0.2) Fail to mkdir"
4714 touch $DIR/$tdir/guard/f0 || error "(1) Fail to create"
# Baseline FID; an unchanged FID later means the migrate did not happen.
4715 local oldfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4717 # define MAX_LINKEA_SIZE 4096
4718 # sizeof(link_ea_header) = 24
4719 # sizeof(link_ea_entry) = 18
4720 # nlink_min=$(((MAX_LINKEA_SIZE - sizeof(link_ea_header)) /
4721 # (sizeof(link_ea_entry) + name_length))
4722 # If the average name length is 12 bytes, then 150 hard links
4723 # is totally enough to overflow the linkEA
4724 echo "Create 150 hard links should succeed although the linkEA overflow"
4725 createmany -l $DIR/$tdir/guard/f0 $DIR/$tdir/foo/ttttttttttt 150 ||
4726 error "(2) Fail to hard link"
4728 cancel_lru_locks mdc
# The migrate checks in this test run only when there are two or more MDTs.
4729 if [ $MDSCOUNT -ge 2 ]; then
4730 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null &&
4731 error "(3.1) Migrate should fail"
4733 echo "The object with linkEA overflow should NOT be migrated"
4734 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4735 [ "$newfid" == "$oldfid" ] ||
4736 error "(3.2) Migrate should fail: $newfid != $oldfid"
4739 # Remove 100 hard links, then the linkEA should have space
4740 # to hold the missed linkEA entries.
4741 echo "Remove 100 hard links to save space for the missed linkEA entries"
4742 unlinkmany $DIR/$tdir/foo/ttttttttttt 100 || error "(4) Fail to unlink"
4744 if [ $MDSCOUNT -ge 2 ]; then
4745 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null &&
4746 error "(5.1) Migrate should fail"
4748 # The overflow timestamp is still there, so migration will fail.
4749 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4750 [ "$newfid" == "$oldfid" ] ||
4751 error "(5.2) Migrate should fail: $newfid != $oldfid"
4754 # sleep 3 seconds to guarantee that the overflow is recognized
4757 echo "Trigger namespace LFSCK to clear the overflow timestamp"
4758 $START_NAMESPACE -r -A ||
4759 error "(6) Fail to start LFSCK for namespace"
# Presumably waits for all targets to reach status completed; the trailing
# number looks like the error id reported on failure. TODO confirm.
4761 wait_all_targets_blocked namespace completed 7
4763 local repaired=$($SHOW_NAMESPACE |
4764 awk '/^linkea_overflow_cleared/ { print $2 }')
4765 [ $repaired -eq 1 ] ||
4766 error "(8) Fail to clear linkea overflow: $repaired"
# The link count itself was never wrong, so no nlink repair is expected.
4768 repaired=$($SHOW_NAMESPACE |
4769 awk '/^nlinks_repaired/ { print $2 }')
4770 [ $repaired -eq 0 ] ||
4771 error "(9) Unexpected nlink repaired: $repaired"
4773 if [ $MDSCOUNT -ge 2 ]; then
4774 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
4775 error "(10.1) Migrate failure"
4777 # Migration should succeed after clear the overflow timestamp.
4778 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4779 [ "$newfid" != "$oldfid" ] ||
4780 error "(10.2) Migrate should succeed"
4782 ls -l $DIR/$tdir/foo > /dev/null ||
4783 error "(11) 'ls' failed after migration"
4786 rm -f $DIR/$tdir/guard/f0 || error "(12) Fail to unlink f0"
4787 rm -rf $DIR/$tdir/foo || error "(13) Fail to rmdir foo"
4789 run_test 29c "verify linkEA size limitation"
# Test 30: LFSCK can recover the orphans from the backend /lost+found.
# Flow: create objects on MDT0 with fault injection so that their name
# entries are removed while the objects survive, let e2fsck collect them
# into the backend /lost+found, then run the namespace LFSCK and check
# that the objects become visible to the client again.
# Only runs on ldiskfs, without FILESET, and with an MDS newer than 2.6.50.
4792 [[ $mds1_FSTYPE == ldiskfs ]] || skip "only ldiskfs has lost+found"
4793 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4794 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4795 skip "MDS older than 2.6.50, LU-5518"
4798 echo "The namespace LFSCK will move the orphans from backend"
4799 echo "/lost+found directory to normal client visible namespace"
4800 echo "or to global visible ./lustre/lost+found/MDTxxxx/ directory"
4803 check_mount_and_prep
4805 $LFS mkdir -i 0 $DIR/$tdir/foo || error "(1) Fail to mkdir foo"
# The error text names f0, the file this command actually creates.
4806 touch $DIR/$tdir/foo/f0 || error "(2) Fail to touch f0"
4808 echo "Inject failure stub on MDT0 to simulate the case that"
4809 echo "directory d0 has no linkEA entry, then the LFSCK will"
4810 echo "move it into .lustre/lost+found/MDTxxxx/ later."
4812 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
4813 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
4814 mkdir $DIR/$tdir/foo/d0 || error "(3) Fail to mkdir d0"
4815 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Parent and child FIDs are needed later to build the name under which
# d0 is expected to show up in .lustre/lost+found/MDT0000/.
4817 local pfid=$($LFS path2fid $DIR/$tdir/foo)
4818 local cfid=$($LFS path2fid $DIR/$tdir/foo/d0)
4820 touch $DIR/$tdir/foo/d0/f1 || error "(4) Fail to touch f1"
4821 mkdir $DIR/$tdir/foo/d0/d1 || error "(5) Fail to mkdir d1"
4823 echo "Inject failure stub on MDT0 to simulate the case that the"
4824 echo "object's name entry will be removed, but not destroy the"
4825 echo "object. Then backend e2fsck will handle it as orphan and"
4826 echo "add them into the backend /lost+found directory."
4828 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4829 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4830 rmdir $DIR/$tdir/foo/d0/d1 || error "(6) Fail to rmdir d1"
4831 rm -f $DIR/$tdir/foo/d0/f1 || error "(7) Fail to unlink f1"
4832 rmdir $DIR/$tdir/foo/d0 || error "(8) Fail to rmdir d0"
4833 rm -f $DIR/$tdir/foo/f0 || error "(9) Fail to unlink f0"
4834 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# The MDT is stopped so that e2fsck can run on its device, then it is
# started again with the noscrub mount options.
4836 umount_client $MOUNT || error "(10) Fail to stop client!"
4838 stop $SINGLEMDS || error "(11) Fail to stop MDT0"
4841 run_e2fsck $(facet_host $SINGLEMDS) $MDT_DEVNAME "-y" ||
4842 error "(12) Fail to run e2fsck"
4844 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
4845 error "(13) Fail to start MDT0"
4847 echo "Trigger namespace LFSCK to recover backend orphans"
4848 $START_NAMESPACE -r -A ||
4849 error "(14) Fail to start LFSCK for namespace"
4851 wait_all_targets_blocked namespace completed 15
# Four objects lost their name entries above (d1, f1, d0, f0), so at
# least four moves out of the local lost+found are expected.
4853 local repaired=$($SHOW_NAMESPACE |
4854 awk '/^local_lost_found_moved/ { print $2 }')
4855 [ $repaired -ge 4 ] ||
4856 error "(16) Fail to recover backend orphans: $repaired"
4858 mount_client $MOUNT || error "(17) Fail to start client!"
4860 stat $DIR/$tdir/foo/f0 || error "(18) f0 is not recovered"
4862 ls -ail $MOUNT/.lustre/lost+found/
4864 echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
4865 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4866 error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4868 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
4870 local cname=$MOUNT/.lustre/lost+found/MDT0000/${cfid}-${pfid}-D-0
# cname is a constructed path and is never empty, so a non-empty test
# cannot fail; check that the recovered directory really exists.
4871 [ -d "$cname" ] || error "(20) d0 is not recovered"
4873 stat ${cname}/d1 || error "(21) d1 is not recovered"
4874 stat ${cname}/f1 || error "(22) f1 is not recovered"
4876 run_test 30 "LFSCK can recover the orphans from backend /lost+found"
# Test 31a: the namespace LFSCK repairs name entries whose name hash does
# not match the shard they were inserted into (entries forced into the
# first shard, fail_val=0).
# Needs at least 2 MDTs and an MDS newer than 2.6.50 (LU-5519).
4879 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4880 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4881 skip "MDS older than 2.6.50, LU-5519"
4884 echo "For the name entry under a striped directory, if the name"
4885 echo "hash does not match the shard, then the LFSCK will repair"
4886 echo "the bad name entry"
4889 check_mount_and_prep
4891 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4892 error "(1) Fail to create striped directory"
4894 echo "Inject failure stub on client to simulate the case that"
4895 echo "some name entry should be inserted into other non-first"
4896 echo "shard, but inserted into the first shard by wrong"
# The fail_loc is set with the local $LCTL (not via do_facet), i.e. on the
# node running this script, only while the sub-directories are created.
4898 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
4899 $LCTL set_param fail_loc=0x1628 fail_val=0
4900 createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
4901 error "(2) Fail to create file under striped directory"
4902 $LCTL set_param fail_loc=0 fail_val=0
4904 echo "Trigger namespace LFSCK to repair bad name hash"
4905 $START_NAMESPACE -r -A ||
4906 error "(3) Fail to start LFSCK for namespace"
4908 wait_all_targets_blocked namespace completed 4
# At least one name hash repair must be reported on $SINGLEMDS.
4910 local repaired=$($SHOW_NAMESPACE |
4911 awk '/^name_hash_repaired/ { print $2 }')
4912 [ $repaired -ge 1 ] ||
4913 error "(5) Fail to repair bad name hash: $repaired"
# Count the lines printed by 'lfs find -H badtype' for the striped directory.
4915 local rc=$($LFS find -H badtype $DIR/$tdir/striped_dir | wc -l)
4917 error "Fail to find flag bad type: $rc"
# Remount the client before verifying the entries.
# NOTE(review): presumably done to drop client-side cached state -- confirm.
4919 umount_client $MOUNT || error "(6) umount failed"
4920 mount_client $MOUNT || error "(7) mount failed"
# Every sub-directory created above must still be reachable and removable.
4922 for ((i = 0; i < $MDSCOUNT; i++)); do
4923 stat $DIR/$tdir/striped_dir/d$i ||
4924 error "(8) Fail to stat d$i after LFSCK"
4925 rmdir $DIR/$tdir/striped_dir/d$i ||
4926 error "(9) Fail to unlink d$i after LFSCK"
4929 rmdir $DIR/$tdir/striped_dir ||
4930 error "(10) Fail to remove the striped directory after LFSCK"
4932 run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
# Test 31b: same scenario as the bad-name-hash test with fail_val=0, but the
# entries are forced into the second shard (fail_val=1), MDSCOUNT * 5
# sub-directories are created, and the repair counter is read from mds2.
# Needs at least 2 MDTs and an MDS newer than 2.6.50 (LU-5519).
4935 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4936 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4937 skip "MDS older than 2.6.50, LU-5519"
4940 echo "For the name entry under a striped directory, if the name"
4941 echo "hash does not match the shard, then the LFSCK will repair"
4942 echo "the bad name entry"
4945 check_mount_and_prep
4947 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4948 error "(1) Fail to create striped directory"
4950 echo "Inject failure stub on client to simulate the case that"
4951 echo "some name entry should be inserted into other non-second"
4952 echo "shard, but inserted into the secod shard by wrong"
# The fail_loc is set with the local $LCTL (not via do_facet), only while
# the sub-directories are created.
4954 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
4955 $LCTL set_param fail_loc=0x1628 fail_val=1
4956 createmany -d $DIR/$tdir/striped_dir/d $((MDSCOUNT * 5)) ||
4957 error "(2) Fail to create file under striped directory"
4958 $LCTL set_param fail_loc=0 fail_val=0
4960 echo "Trigger namespace LFSCK to repair bad name hash"
4961 $START_NAMESPACE -r -A ||
4962 error "(3) Fail to start LFSCK for namespace"
4964 wait_all_targets_blocked namespace completed 4
# The repair counter is queried on mds2 here rather than on $SINGLEMDS.
4966 local repaired=$(do_facet mds2 $LCTL get_param -n \
4967 mdd.$(facet_svc mds2).lfsck_namespace |
4968 awk '/^name_hash_repaired/ { print $2 }')
4969 echo "repaired $repaired name entries with bad hash"
4970 [ $repaired -ge 1 ] ||
4971 error "(5) Fail to repair bad name hash: $repaired"
# Remount the client before verifying the entries.
4973 umount_client $MOUNT || error "(6) umount failed"
4974 mount_client $MOUNT || error "(7) mount failed"
# Every sub-directory created above must still be reachable and removable.
4976 for ((i = 0; i < $((MDSCOUNT * 5)); i++)); do
4977 stat $DIR/$tdir/striped_dir/d$i ||
4978 error "(8) Fail to stat d$i after LFSCK"
4979 rmdir $DIR/$tdir/striped_dir/d$i ||
4980 error "(9) Fail to unlink d$i after LFSCK"
4983 rmdir $DIR/$tdir/striped_dir ||
4984 error "(10) Fail to remove the striped directory after LFSCK"
4986 run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
# Test 31c: the namespace LFSCK re-generates a lost master LMV EA of a
# striped directory when nothing was created under it after the loss.
# Needs at least 2 MDTs and an MDS newer than 2.6.50 (LU-5519).
4989 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4990 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4991 skip "MDS older than 2.6.50, LU-5519"
4994 echo "For some reason, the master MDT-object of the striped directory"
4995 echo "may lost its master LMV EA. If nobody created files under the"
4996 echo "master directly after the master LMV EA lost, then the LFSCK"
4997 echo "should re-generate the master LMV EA."
5000 check_mount_and_prep
5002 echo "Inject failure stub on MDT0 to simulate the case that the"
5003 echo "master MDT-object of the striped directory lost the LMV EA."
# The fail_loc is active on $SINGLEMDS only while the striped directory
# is being created.
5005 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
5006 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
5007 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5008 error "(1) Fail to create striped directory"
5009 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
5011 echo "Trigger namespace LFSCK to re-generate master LMV EA"
5012 $START_NAMESPACE -r -A ||
5013 error "(2) Fail to start LFSCK for namespace"
5015 wait_all_targets_blocked namespace completed 3
# Exactly one striped directory must be reported as repaired.
5017 local repaired=$($SHOW_NAMESPACE |
5018 awk '/^striped_dirs_repaired/ { print $2 }')
5019 [ $repaired -eq 1 ] ||
5020 error "(4) Fail to re-generate master LMV EA: $repaired"
# 'lfs find -H lostlmv' must print exactly one line for the directory.
5022 local rc=$($LFS find -H lostlmv $DIR/$tdir/striped_dir | wc -l)
5023 [ $rc -eq 1 ] || error "Fail to find flag lost LMV: $rc"
# Remount the client, then check that the directory lists as empty and
# can be removed.
5025 umount_client $MOUNT || error "(5) umount failed"
5026 mount_client $MOUNT || error "(6) mount failed"
5028 local empty=$(ls $DIR/$tdir/striped_dir/)
5029 [ -z "$empty" ] || error "(7) The master LMV EA is not repaired: $empty"
5031 rmdir $DIR/$tdir/striped_dir ||
5032 error "(8) Fail to remove the striped directory after LFSCK"
5034 run_test 31c "Re-generate the lost master LMV EA for striped directory"
# Test 31d: when a striped directory lost its master LMV EA and was then
# modified, the namespace LFSCK must not re-generate the EA; the directory
# is expected to become read-only instead.
# Needs at least 2 MDTs and an MDS newer than 2.6.50 (LU-5519).
5037 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5038 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5039 skip "MDS older than 2.6.50, LU-5519"
5042 echo "For some reason, the master MDT-object of the striped directory"
5043 echo "may lost its master LMV EA. If somebody created files under the"
5044 echo "master directly after the master LMV EA lost, then the LFSCK"
5045 echo "should NOT re-generate the master LMV EA, instead, it should"
5046 echo "change the broken striped dirctory as read-only to prevent"
5047 echo "further damage"
5050 check_mount_and_prep
5052 echo "Inject failure stub on MDT0 to simulate the case that the"
5053 echo "master MDT-object of the striped directory lost the LMV EA."
5055 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
5056 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
5057 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5058 error "(1) Fail to create striped directory"
5059 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0
# Remount the client, then create a file under the broken directory; this
# is the "modified after broken" condition the test is about.
5061 umount_client $MOUNT || error "(2) umount failed"
5062 mount_client $MOUNT || error "(3) mount failed"
5064 touch $DIR/$tdir/striped_dir/dummy ||
5065 error "(4) Fail to touch under broken striped directory"
5067 echo "Trigger namespace LFSCK to find out the inconsistency"
5068 $START_NAMESPACE -r -A ||
5069 error "(5) Fail to start LFSCK for namespace"
5071 wait_all_targets_blocked namespace completed 6
# No striped directory may be reported as repaired in this scenario.
5073 local repaired=$($SHOW_NAMESPACE |
5074 awk '/^striped_dirs_repaired/ { print $2 }')
5075 [ $repaired -eq 0 ] ||
5076 error "(7) Re-generate master LMV EA unexpected: $repaired"
# The existing file stays readable, but new creations must be refused.
5078 stat $DIR/$tdir/striped_dir/dummy ||
5079 error "(8) Fail to stat $DIR/$tdir/striped_dir/dummy"
5081 touch $DIR/$tdir/striped_dir/foo &&
5082 error "(9) The broken striped directory should be read-only"
# Clear the immutable attribute before removing the directory.
5084 chattr -i $DIR/$tdir/striped_dir ||
5085 error "(10) Fail to chattr on the broken striped directory"
5087 rmdir $DIR/$tdir/striped_dir ||
5088 error "(11) Fail to remove the striped directory after LFSCK"
5090 run_test 31d "Set broken striped directory (modified after broken) as read-only"
# Test 31e: the namespace LFSCK re-generates a lost slave LMV EA when the
# affected slave object is on the same MDT as the master (fail_val=0).
# Needs at least 2 MDTs and an MDS newer than 2.6.50 (LU-5519).
5093 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5094 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5095 skip "MDS older than 2.6.50, LU-5519"
5098 echo "For some reason, the slave MDT-object of the striped directory"
5099 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
5100 echo "slave LMV EA."
5103 check_mount_and_prep
5105 echo "Inject failure stub on MDT0 to simulate the case that the"
5106 echo "slave MDT-object (that resides on the same MDT as the master"
5107 echo "MDT-object resides on) lost the LMV EA."
# The fail_loc is active on $SINGLEMDS only while the striped directory
# is being created.
5109 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
5110 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=0
5111 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5112 error "(1) Fail to create striped directory"
5113 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5115 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
5116 $START_NAMESPACE -r -A ||
5117 error "(2) Fail to start LFSCK for namespace"
5119 wait_all_targets_blocked namespace completed 3
# Exactly one shard must be reported as repaired on $SINGLEMDS.
5121 local repaired=$($SHOW_NAMESPACE |
5122 awk '/^striped_shards_repaired/ { print $2 }')
5123 [ $repaired -eq 1 ] ||
5124 error "(4) Fail to re-generate slave LMV EA: $repaired"
5126 rmdir $DIR/$tdir/striped_dir ||
5127 error "(5) Fail to remove the striped directory after LFSCK"
5129 run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
# Test 31f: the namespace LFSCK re-generates a lost slave LMV EA when the
# affected slave object is on a different MDT than the master (fail_val=1);
# the repair counter is read from mds2.
# Needs at least 2 MDTs and an MDS newer than 2.6.50 (LU-5519).
5132 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5133 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5134 skip "MDS older than 2.6.50, LU-5519"
5137 echo "For some reason, the slave MDT-object of the striped directory"
5138 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
5139 echo "slave LMV EA."
5142 check_mount_and_prep
5144 echo "Inject failure stub on MDT0 to simulate the case that the"
5145 echo "slave MDT-object (that resides on different MDT as the master"
5146 echo "MDT-object resides on) lost the LMV EA."
# The fail_loc is active on $SINGLEMDS only while the striped directory
# is being created.
5148 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
5149 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=1
5150 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5151 error "(1) Fail to create striped directory"
5152 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5154 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
5155 $START_NAMESPACE -r -A ||
5156 error "(2) Fail to start LFSCK for namespace"
5158 wait_all_targets_blocked namespace completed 3
# Exactly one shard must be reported as repaired on mds2.
5160 local repaired=$(do_facet mds2 $LCTL get_param -n \
5161 mdd.$(facet_svc mds2).lfsck_namespace |
5162 awk '/^striped_shards_repaired/ { print $2 }')
5163 [ $repaired -eq 1 ] ||
5164 error "(4) Fail to re-generate slave LMV EA: $repaired"
5166 rmdir $DIR/$tdir/striped_dir ||
5167 error "(5) Fail to remove the striped directory after LFSCK"
5169 run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
# Test 31g: the namespace LFSCK repairs a slave LMV EA whose stripe index
# is corrupted, and the striped directory is usable afterwards.
# Needs at least 2 MDTs and an MDS newer than 2.6.50 (LU-5519).
5172 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5173 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5174 skip "MDS older than 2.6.50, LU-5519"
5177 echo "For some reason, the stripe index in the slave LMV EA is"
5178 echo "corrupted. The LFSCK should repair the slave LMV EA."
5181 check_mount_and_prep
5183 echo "Inject failure stub on MDT0 to simulate the case that the"
5184 echo "slave LMV EA on the first shard of the striped directory"
5185 echo "claims the same index as the second shard claims"
# The fail_loc is active on $SINGLEMDS only while the striped directory
# is being created.
5187 #define OBD_FAIL_LFSCK_BAD_SLAVE_LMV 0x162b
5188 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162b fail_val=0
5189 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5190 error "(1) Fail to create striped directory"
5191 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5193 echo "Trigger namespace LFSCK to repair the slave LMV EA"
5194 $START_NAMESPACE -r -A ||
5195 error "(2) Fail to start LFSCK for namespace"
5197 wait_all_targets_blocked namespace completed 3
# Exactly one shard must be reported as repaired.
5199 local repaired=$($SHOW_NAMESPACE |
5200 awk '/^striped_shards_repaired/ { print $2 }')
5201 [ $repaired -eq 1 ] ||
5202 error "(4) Fail to repair slave LMV EA: $repaired"
# Remount the client, then verify create, unlink and rmdir all work on
# the repaired directory.
5204 umount_client $MOUNT || error "(5) umount failed"
5205 mount_client $MOUNT || error "(6) mount failed"
5207 touch $DIR/$tdir/striped_dir/foo ||
5208 error "(7) Fail to touch file after the LFSCK"
5210 rm -f $DIR/$tdir/striped_dir/foo ||
5211 error "(8) Fail to unlink file after the LFSCK"
5213 rmdir $DIR/$tdir/striped_dir ||
5214 error "(9) Fail to remove the striped directory after LFSCK"
5216 run_test 31g "Repair the corrupted slave LMV EA"
# Test 31h: the namespace LFSCK repairs a corrupted shard name entry of a
# striped directory, and the directory is usable afterwards.
# Needs at least 2 MDTs and an MDS newer than 2.6.50 (LU-5519).
5219 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5220 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5221 skip "MDS older than 2.6.50, LU-5519"
5224 echo "For some reason, the shard's name entry in the striped"
5225 echo "directory may be corrupted. The LFSCK should repair the"
5226 echo "bad shard's name entry."
5229 check_mount_and_prep
5231 echo "Inject failure stub on MDT0 to simulate the case that the"
5232 echo "first shard's name entry in the striped directory claims"
5233 echo "the same index as the second shard's name entry claims."
# The fail_loc is active on $SINGLEMDS only while the striped directory
# is being created.
5235 #define OBD_FAIL_LFSCK_BAD_SLAVE_NAME 0x162c
5236 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162c fail_val=0
5237 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5238 error "(1) Fail to create striped directory"
5239 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5241 echo "Trigger namespace LFSCK to repair the shard's name entry"
5242 $START_NAMESPACE -r -A ||
5243 error "(2) Fail to start LFSCK for namespace"
5245 wait_all_targets_blocked namespace completed 3
# Exactly one directory entry must be reported as repaired.
5247 local repaired=$($SHOW_NAMESPACE |
5248 awk '/^dirent_repaired/ { print $2 }')
5249 [ $repaired -eq 1 ] ||
5250 error "(4) Fail to repair shard's name entry: $repaired"
# Remount the client, then verify create, unlink and rmdir all work on
# the repaired directory.
5252 umount_client $MOUNT || error "(5) umount failed"
5253 mount_client $MOUNT || error "(6) mount failed"
5255 touch $DIR/$tdir/striped_dir/foo ||
5256 error "(7) Fail to touch file after the LFSCK"
5258 rm -f $DIR/$tdir/striped_dir/foo ||
5259 error "(8) Fail to unlink file after the LFSCK"
5261 rmdir $DIR/$tdir/striped_dir ||
5262 error "(9) Fail to remove the striped directory after LFSCK"
5264 run_test 31h "Repair the corrupted shard's name entry"
# Test 32a: the layout LFSCK can be stopped while OST1 is down.
5269 umount_client $MOUNT
# NOTE(review): per its name, this fail_loc presumably delays the LFSCK
# engine so the scan is still in phase 1 when checked below -- confirm.
5271 #define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d
5272 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
5273 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
5275 local STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
5276 [ "$STATUS" == "scanning-phase1" ] ||
5277 error "(2) Expect 'scanning-phase1', but got '$STATUS'"
# Take OST1 down while the scan is running, clear the fail_loc, and then
# require that stopping the LFSCK still succeeds.
5280 stop ost1 > /dev/null || error "(3) Fail to stop OST1!"
5282 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
5286 $STOP_LFSCK || error "(4) Fail to stop LFSCK!"
# Bring OST1 back for the tests that follow.
5288 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
5289 error "(5) Fail to start ost1"
5291 run_test 32a "stop LFSCK when some OST failed"
# Test 32b: the namespace LFSCK can be stopped while MDT2 is down.
# Needs at least 2 MDTs.
5295 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# Create a directory on MDT index 1 with two striped sub-directories.
# NOTE(review): presumably so the scan has cross-MDT objects -- confirm.
5298 $LFS mkdir -i 1 $DIR/$tdir/dp ||
5299 error "(1) Fail to create $DIR/$tdir/dp"
5300 $LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc1 ||
5301 error "(2) Fail to create $DIR/$tdir/dp/dc1"
5302 $LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc2 ||
5303 error "(3) Fail to create $DIR/$tdir/dp/dc2"
5304 umount_client $MOUNT
5306 #define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d
5307 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
5308 $START_NAMESPACE -r -A || error "(4) Fail to start LFSCK for namespace!"
# Wait (timeout argument 32) until $SINGLEMDS reports 'scanning-phase1'.
5310 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5311 mdd.${MDT_DEV}.lfsck_namespace |
5312 awk '/^status/ { print \\\$2 }'" "scanning-phase1" 32 || {
5314 error "(5) unexpected status"
# Take MDT2 down while the scan is running, clear the fail_loc, and then
# require that stopping the LFSCK still succeeds.
5318 stop mds2 > /dev/null || error "(6) Fail to stop MDT2!"
5320 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
5324 $STOP_LFSCK || error "(7) Fail to stop LFSCK!"
# Bring MDT2 back for the tests that follow.
5326 start mds2 $(mdsdevname 2) $MOUNT_OPTS_NOSCRUB > /dev/null ||
5327 error "(8) Fail to start MDT2"
5329 run_test 32b "stop LFSCK when some MDT failed"
# Test 33: check that the options given to lfsck_start are reflected in the
# 'param' line of the LFSCK status output.
# Layout LFSCK started with --dryrun -o -r must report
# 'dryrun,all_targets,orphan'.
5335 $START_LAYOUT --dryrun -o -r ||
5336 error "(1) Fail to start layout LFSCK"
5337 wait_all_targets_blocked layout completed 2
5339 local PARAMS=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
5340 [ "$PARAMS" == "dryrun,all_targets,orphan" ] ||
5341 error "(3) Expect 'dryrun,all_targets,orphan', got '$PARAMS'"
# Namespace LFSCK started with -e abort -A -r must report
# 'failout,all_targets'.
5343 $START_NAMESPACE -e abort -A -r ||
5344 error "(4) Fail to start namespace LFSCK"
5345 wait_all_targets_blocked namespace completed 5
5347 PARAMS=$($SHOW_NAMESPACE | awk '/^param/ { print $2 }')
5348 [ "$PARAMS" == "failout,all_targets" ] ||
5349 error "(6) Expect 'failout,all_targets', got '$PARAMS'"
5351 run_test 33 "check LFSCK paramters"
# Test 34: the namespace LFSCK can rebuild a lost agent object.
# Needs at least 2 MDTs and a ZFS backend on mds1.
5355 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5356 [ "$mds1_FSTYPE" != zfs ] && skip "Only valid for ZFS backend"
# The fail_loc is active on $SINGLEMDS only while the remote directory
# (MDT index 1) is being created.
5360 #define OBD_FAIL_LFSCK_NO_AGENTOBJ 0x1630
5361 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1630
5362 $LFS mkdir -i 1 $DIR/$tdir/dummy ||
5363 error "(1) Fail to create $DIR/$tdir/dummy"
5365 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# First pass: must complete and report exactly one repaired dirent.
5366 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
5367 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5368 mdd.${MDT_DEV}.lfsck_namespace |
5369 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5371 error "(3) unexpected status"
5374 local repaired=$($SHOW_NAMESPACE |
5375 awk '/^dirent_repaired/ { print $2 }')
5376 [ $repaired -eq 1 ] ||
5377 error "(4) Fail to repair the lost agent object: $repaired"
# Second pass: nothing should be left to repair.
5379 $START_NAMESPACE -r || error "(5) Fail to start LFSCK for namespace!"
5380 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5381 mdd.${MDT_DEV}.lfsck_namespace |
5382 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5384 error "(6) unexpected status"
5387 repaired=$($SHOW_NAMESPACE | awk '/^dirent_repaired/ { print $2 }')
5388 [ $repaired -eq 0 ] ||
5389 error "(7) Unexpected repairing: $repaired"
5391 run_test 34 "LFSCK can rebuild the lost agent object"
# Test 35: the namespace LFSCK can rebuild a lost agent entry.
# Needs at least 2 MDTs. The fault is injected on mds2 and all status and
# repair counters are read from mds2.
5395 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# The fail_loc is active on mds2 only while the remote directory
# (MDT index 1) is being created.
5399 #define OBD_FAIL_LFSCK_NO_AGENTENT 0x1631
5400 do_facet mds2 $LCTL set_param fail_loc=0x1631
5401 $LFS mkdir -i 1 $DIR/$tdir/dummy ||
5402 error "(1) Fail to create $DIR/$tdir/dummy"
5405 do_facet mds2 $LCTL set_param fail_loc=0
# First pass: must complete on mds2 and report exactly one repaired
# agent entry. The error text names MDS2 explicitly, since no loop
# variable is set in this test and mds2 is the facet being waited on.
5406 $START_NAMESPACE -A -r || error "(2) Fail to start LFSCK for namespace!"
5407 wait_update_facet mds2 "$LCTL get_param -n \
5408 mdd.$(facet_svc mds2).lfsck_namespace |
5409 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
5410 error "(3) MDS2 is not the expected 'completed'"
5412 local repaired=$(do_facet mds2 $LCTL get_param -n \
5413 mdd.$(facet_svc mds2).lfsck_namespace |
5414 awk '/^agent_entries_repaired/ { print $2 }')
5415 [ $repaired -eq 1 ] ||
5416 error "(4) Fail to repair the lost agent entry: $repaired"
5418 echo "stopall to cleanup object cache"
5421 setupall > /dev/null
# Second pass: nothing should be left to repair.
5423 $START_NAMESPACE -A -r || error "(5) Fail to start LFSCK for namespace!"
5424 wait_update_facet mds2 "$LCTL get_param -n \
5425 mdd.$(facet_svc mds2).lfsck_namespace |
5426 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
5427 error "(6) MDS2 is not the expected 'completed'"
5429 repaired=$(do_facet mds2 $LCTL get_param -n \
5430 mdd.$(facet_svc mds2).lfsck_namespace |
5431 awk '/^agent_entries_repaired/ { print $2 }')
5432 [ $repaired -eq 0 ] ||
5433 error "(7) Unexpected repairing: $repaired"
5435 run_test 35 "LFSCK can rebuild the lost agent entry"
# Test 36a: the layout LFSCK rebuilds the LOV EA of mirrored files that
# each lost one mirror.
# Flow: create three 3-mirror files, split one mirror off each with fault
# injection, then check that the layout LFSCK brings the mirror back.
# Needs at least 3 OSTs.
5438 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5441 echo "The target MDT-object's LOV EA corrupted as to lose one of the "
5442 echo "mirrors information. The layout LFSCK should rebuild the LOV EA "
5443 echo "with the PFID EA of related OST-object(s) belong to the mirror."
5446 check_mount_and_prep
# Print the grant parameters now, and register the same diagnostics plus
# 'lfs df' output with stack_trap.
5450 lctl get_param osc.*.*grant*
5451 stack_trap "lfs df $DIR; lfs df -i $DIR; lctl get_param osc.*.*grant*"
# f0, f1 and f2 share the same layout: three mirrors of two components
# each, placed on OST indexes 0, 1 and 2.
5453 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5454 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f0 ||
5455 error "(0) Fail to create mirror file $DIR/$tdir/f0"
5456 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5457 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f1 ||
5458 error "(1) Fail to create mirror file $DIR/$tdir/f1"
5459 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5460 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f2 ||
5461 error "(2) Fail to create mirror file $DIR/$tdir/f2"
# Write 4MB to each file, then resync the mirrors.
5463 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5464 error "(3) Fail to write $DIR/$tdir/f0"
5465 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=4 ||
5466 error "(4) Fail to write $DIR/$tdir/f1"
5467 dd if=/dev/zero of=$DIR/$tdir/f2 bs=1M count=4 ||
5468 error "(5) Fail to write $DIR/$tdir/f2"
5470 $LFS mirror resync $DIR/$tdir/f0 ||
5471 error "(6) Fail to resync $DIR/$tdir/f0"
5472 $LFS mirror resync $DIR/$tdir/f1 ||
5473 error "(7) Fail to resync $DIR/$tdir/f1"
5474 $LFS mirror resync $DIR/$tdir/f2 ||
5475 error "(8) Fail to resync $DIR/$tdir/f2"
5477 cancel_lru_locks mdc
5478 cancel_lru_locks osc
5480 $LFS getstripe $DIR/$tdir/f0 ||
5481 error "(9) Fail to getstripe for $DIR/$tdir/f0"
5482 $LFS getstripe $DIR/$tdir/f1 ||
5483 error "(10) Fail to getstripe for $DIR/$tdir/f1"
5484 $LFS getstripe $DIR/$tdir/f2 ||
5485 error "(11) Fail to getstripe for $DIR/$tdir/f2"
5487 echo "Inject failure, to simulate the case of missing one mirror in LOV"
5488 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5489 do_facet mds1 $LCTL set_param fail_loc=0x1616
# With the fail_loc set, split and destroy (-d) a different mirror id
# from each file.
5491 $LFS mirror split --mirror-id 1 -d $DIR/$tdir/f0 ||
5492 error "(12) Fail to split 1st mirror from $DIR/$tdir/f0"
5493 $LFS mirror split --mirror-id 2 -d $DIR/$tdir/f1 ||
5494 error "(13) Fail to split 2nd mirror from $DIR/$tdir/f1"
5495 $LFS mirror split --mirror-id 3 -d $DIR/$tdir/f2 ||
5496 error "(14) Fail to split 3rd mirror from $DIR/$tdir/f2"
5500 do_facet mds1 $LCTL set_param fail_loc=0
# The split-off mirror id must no longer appear in each file's layout.
5502 $LFS getstripe $DIR/$tdir/f0 | grep "lcme_mirror_id:.*1" &&
5503 error "(15) The 1st of mirror is not destroyed"
5504 $LFS getstripe $DIR/$tdir/f1 | grep "lcme_mirror_id:.*2" &&
5505 error "(16) The 2nd of mirror is not destroyed"
5506 $LFS getstripe $DIR/$tdir/f2 | grep "lcme_mirror_id:.*3" &&
5507 error "(17) The 3rd of mirror is not destroyed"
# Each file must now report two mirrors.
5511 mirrors=$($LFS getstripe -N $DIR/$tdir/f0)
5512 [ $mirrors -eq 2 ] || error "(18) $DIR/$tdir/f0 has $mirrors mirrors"
5513 mirrors=$($LFS getstripe -N $DIR/$tdir/f1)
5514 [ $mirrors -eq 2 ] || error "(19) $DIR/$tdir/f1 has $mirrors mirrors"
5515 mirrors=$($LFS getstripe -N $DIR/$tdir/f2)
5516 [ $mirrors -eq 2 ] || error "(20) $DIR/$tdir/f2 has $mirrors mirrors"
5518 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5519 $START_LAYOUT -r -o || error "(21) Fail to start LFSCK for layout!"
5521 for k in $(seq $MDSCOUNT); do
5522 # The LFSCK status query interval is 30 seconds. For the case
5523 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5524 # time to guarantee the status sync up.
5525 wait_update_facet mds${k} "$LCTL get_param -n \
5526 mdd.$(facet_svc mds${k}).lfsck_layout |
5527 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5528 error "(22) MDS${k} is not the expected 'completed'"
# Every OST must report 'completed' as well; each is checked once,
# without waiting.
5531 for k in $(seq $OSTCOUNT); do
5532 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5533 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5534 awk '/^status/ { print $2 }')
5535 [ "$cur_status" == "completed" ] ||
5536 error "(23) OST${k} Expect 'completed', but got '$cur_status'"
# Nine orphan repairs are expected on mds1.
# NOTE(review): presumably 3 OST-objects for each of the 3 destroyed
# mirrors -- confirm.
5539 local repaired=$(do_facet mds1 $LCTL get_param -n \
5540 mdd.$(facet_svc mds1).lfsck_layout |
5541 awk '/^repaired_orphan/ { print $2 }')
5542 [ $repaired -eq 9 ] ||
5543 error "(24) Expect 9 fixed on mds1, but got: $repaired"
# Each file must be back to three mirrors, including the split-off id.
5545 mirrors=$($LFS getstripe -N $DIR/$tdir/f0)
5546 [ $mirrors -eq 3 ] || error "(25) $DIR/$tdir/f0 has $mirrors mirrors"
5547 mirrors=$($LFS getstripe -N $DIR/$tdir/f1)
5548 [ $mirrors -eq 3 ] || error "(26) $DIR/$tdir/f1 has $mirrors mirrors"
5549 mirrors=$($LFS getstripe -N $DIR/$tdir/f2)
5550 [ $mirrors -eq 3 ] || error "(27) $DIR/$tdir/f2 has $mirrors mirrors"
5552 $LFS getstripe $DIR/$tdir/f0 | grep "lcme_mirror_id:.*1" || {
5553 $LFS getstripe $DIR/$tdir/f0
5554 error "(28) The 1st of mirror is not recovered"
5557 $LFS getstripe $DIR/$tdir/f1 | grep "lcme_mirror_id:.*2" || {
5558 $LFS getstripe $DIR/$tdir/f1
5559 error "(29) The 2nd of mirror is not recovered"
5562 $LFS getstripe $DIR/$tdir/f2 | grep "lcme_mirror_id:.*3" || {
5563 $LFS getstripe $DIR/$tdir/f2
5564 error "(30) The 3rd of mirror is not recovered"
5567 run_test 36a "rebuild LOV EA for mirrored file (1)"
# Test 36b: the layout LFSCK rebuilds the LOV EA of a mirrored file whose
# MDT-object was lost while its OST-objects survived. The file is expected
# to reappear under .lustre/lost+found/MDT0000/ with all three mirrors.
# Not run with FILESET set; needs at least 3 OSTs.
5570 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
5571 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5574 echo "The mirrored file lost its MDT-object, but relatd OST-objects "
5575 echo "are still there. The layout LFSCK should rebuild the LOV EA "
5576 echo "with the PFID EA of related OST-object(s) belong to the file. "
5579 check_mount_and_prep
# Three mirrors of two components each, placed on OST indexes 0, 1 and 2.
5581 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5582 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f0 ||
5583 error "(0) Fail to create mirror file $DIR/$tdir/f0"
# The FID is used later to build the expected lost+found name.
5585 local fid=$($LFS path2fid $DIR/$tdir/f0)
5587 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5588 error "(1) Fail to write $DIR/$tdir/f0"
5589 $LFS mirror resync $DIR/$tdir/f0 ||
5590 error "(2) Fail to resync $DIR/$tdir/f0"
5592 cancel_lru_locks mdc
5593 cancel_lru_locks osc
5595 $LFS getstripe $DIR/$tdir/f0 ||
5596 error "(3) Fail to getstripe for $DIR/$tdir/f0"
5598 echo "Inject failure, to simulate the case of missing the MDT-object"
# The file is removed while the fail_loc is set on mds1.
5599 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5600 do_facet mds1 $LCTL set_param fail_loc=0x1616
5601 rm -f $DIR/$tdir/f0 || error "(4) Fail to remove $DIR/$tdir/f0"
5605 do_facet mds1 $LCTL set_param fail_loc=0
5607 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5608 $START_LAYOUT -r -o || error "(5) Fail to start LFSCK for layout!"
5610 for k in $(seq $MDSCOUNT); do
5611 # The LFSCK status query interval is 30 seconds. For the case
5612 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5613 # time to guarantee the status sync up.
5614 wait_update_facet mds${k} "$LCTL get_param -n \
5615 mdd.$(facet_svc mds${k}).lfsck_layout |
5616 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5617 error "(6) MDS${k} is not the expected 'completed'"
# Every OST must report 'completed' as well; each is checked once,
# without waiting.
5620 for k in $(seq $OSTCOUNT); do
5621 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5622 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5623 awk '/^status/ { print $2 }')
5624 [ "$cur_status" == "completed" ] ||
5625 error "(7) OST${k} Expect 'completed', but got '$cur_status'"
# Nine orphan repairs are expected on mds1.
5628 local count=$(do_facet mds1 $LCTL get_param -n \
5629 mdd.$(facet_svc mds1).lfsck_layout |
5630 awk '/^repaired_orphan/ { print $2 }')
5631 [ $count -eq 9 ] || error "(8) Expect 9 fixed on mds1, but got: $count"
# The rebuilt file is looked up under lost+found by its original FID; it
# must have 3 mirrors and 6 components, with mirror ids 1, 2 and 3.
5633 local name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
5634 count=$($LFS getstripe --mirror-count $name)
5635 [ $count -eq 3 ] || error "(9) $DIR/$tdir/f0 has $count mirrors"
5637 count=$($LFS getstripe --component-count $name)
5638 [ $count -eq 6 ] || error "(10) $DIR/$tdir/f0 has $count components"
5640 $LFS getstripe $name | grep "lcme_mirror_id:.*1" || {
5641 $LFS getstripe $name
5642 error "(11) The 1st of mirror is not recovered"
5645 $LFS getstripe $name | grep "lcme_mirror_id:.*2" || {
5646 $LFS getstripe $name
5647 error "(12) The 2nd of mirror is not recovered"
5650 $LFS getstripe $name | grep "lcme_mirror_id:.*3" || {
5651 $LFS getstripe $name
5652 error "(13) The 3rd of mirror is not recovered"
5655 run_test 36b "rebuild LOV EA for mirrored file (2)"
# Test 36c (registered by the run_test line closing this block): a mirrored
# file is written and resynced, written again without a resync, and then
# loses its MDT-object. Layout LFSCK is expected to rebuild the LOV EA from
# the orphan OST-objects and to bring back the same lcme_flags values.
# Skipped when FILESET is set or when fewer than 3 OSTs are configured.
5658 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
5659 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5662 echo "The mirrored file has been modified, not resynced yet, then "
5663 echo "lost its MDT-object, but relatd OST-objects are still there. "
5664 echo "The layout LFSCK should rebuild the LOV EA and relatd status "
5665 echo "with the PFID EA of related OST-object(s) belong to the file. "
5668 check_mount_and_prep
# Create f0 with an explicit layout; the checks at the end of the test
# expect it to be recovered with 2 mirrors and 4 components.
5670 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5672 error "(0) Fail to create mirror file $DIR/$tdir/f0"
# Keep the FID: the recovered file is looked up by it under lost+found.
5674 local fid=$($LFS path2fid $DIR/$tdir/f0)
5676 # The 1st dd && resync ensure all related OST-objects have been written
5677 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5678 error "(1.1) Fail to write $DIR/$tdir/f0"
5679 $LFS mirror resync $DIR/$tdir/f0 ||
5680 error "(1.2) Fail to resync $DIR/$tdir/f0"
5681 # The 2nd dd makes one mirror stale
5682 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5683 error "(1.3) Fail to write $DIR/$tdir/f0"
5685 cancel_lru_locks mdc
5686 cancel_lru_locks osc
5688 $LFS getstripe $DIR/$tdir/f0 ||
5689 error "(2) Fail to getstripe for $DIR/$tdir/f0"
# Save the lcme_flags values found in the first and in the last 10 lines of
# the getstripe output; they are compared with the recovered file below.
5691 local saved_flags1=$($LFS getstripe $DIR/$tdir/f0 | head -n 10 |
5692 awk '/lcme_flags/ { print $2 }')
5693 local saved_flags2=$($LFS getstripe $DIR/$tdir/f0 | tail -n 10 |
5694 awk '/lcme_flags/ { print $2 }')
# Remove f0 while fail_loc 0x1616 is set on mds1, then clear fail_loc.
5696 echo "Inject failure, to simulate the case of missing the MDT-object"
5697 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5698 do_facet mds1 $LCTL set_param fail_loc=0x1616
5699 rm -f $DIR/$tdir/f0 || error "(3) Fail to remove $DIR/$tdir/f0"
5703 do_facet mds1 $LCTL set_param fail_loc=0
5705 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5706 $START_LAYOUT -r -o || error "(4) Fail to start LFSCK for layout!"
# Every MDT must report lfsck_layout status "completed"; wait_update_facet
# is given 32 as its last argument (presumably a timeout in seconds).
5708 for k in $(seq $MDSCOUNT); do
5709 # The LFSCK status query interval is 30 seconds. For the case
5710 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5711 # time to guarantee the status sync up.
5712 wait_update_facet mds${k} "$LCTL get_param -n \
5713 mdd.$(facet_svc mds${k}).lfsck_layout |
5714 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5715 error "(5) MDS${k} is not the expected 'completed'"
# Every OST is queried once and must already report "completed".
5718 for k in $(seq $OSTCOUNT); do
5719 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5720 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5721 awk '/^status/ { print $2 }')
5722 [ "$cur_status" == "completed" ] ||
5723 error "(6) OST${k} Expect 'completed', but got '$cur_status'"
# mds1 must report exactly 6 repaired orphans. The message now states the
# value that is actually tested (it previously said 9).
5726 local count=$(do_facet mds1 $LCTL get_param -n \
5727 mdd.$(facet_svc mds1).lfsck_layout |
5728 awk '/^repaired_orphan/ { print $2 }')
5729 [ $count -eq 6 ] || error "(7) Expect 6 fixed on mds1, but got: $count"
# The recovered file is expected under lost+found as <fid>-R-0.
5731 local name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
5732 count=$($LFS getstripe --mirror-count $name)
5733 [ $count -eq 2 ] || error "(8) $DIR/$tdir/f0 has $count mirrors"
5735 count=$($LFS getstripe --component-count $name)
5736 [ $count -eq 4 ] || error "(9) $DIR/$tdir/f0 has $count components"
# The lcme_flags of the recovered file must match the values saved earlier.
5738 local flags=$($LFS getstripe $name | head -n 10 |
5739 awk '/lcme_flags/ { print $2 }')
5740 [ "$flags" == "$saved_flags1" ] || {
5741 $LFS getstripe $name
5742 error "(10) expect flags $saved_flags1, got $flags"
5745 flags=$($LFS getstripe $name | tail -n 10 |
5746 awk '/lcme_flags/ { print $2 }')
5747 [ "$flags" == "$saved_flags2" ] || {
5748 $LFS getstripe $name
5749 error "(11) expect flags $saved_flags2, got $flags"
5752 run_test 36c "rebuild LOV EA for mirrored file (3)"
# Test 37, "LFSCK must skip a ORPHAN" (title taken from the run_test line
# closing this block).
5758 local t_dir="$DIR/$tdir/d0"
5759 check_mount_and_prep
# Create the directory on MDT index 0, then hand it to multiop_bg_pause
# (presumably this leaves a background multiop process attached to the
# directory -- confirm against the helper's definition).
5761 $LFS mkdir -i 0 $t_dir || error "(2) Fail to mkdir $t_dir on MDT0"
5762 multiop_bg_pause $t_dir D_c || { error "multiop failed: $?"; return 1; }
# Start namespace LFSCK with -r -A.
# NOTE(review): $PID is not assigned in the lines visible here, and the
# kill is placed after error, so it only runs if error returns -- confirm.
5766 $START_NAMESPACE -r -A || {
5767 error "(3) Fail to start LFSCK for namespace!"; kill -USR1 $PID; }
# The namespace LFSCK is expected to reach the "completed" status.
5769 wait_all_targets_blocked namespace completed 4
5774 run_test 37 "LFSCK must skip a ORPHAN"
# Test 38 (registered by the run_test line closing this block): a foreign
# file must look the same before and after namespace and layout LFSCK, and
# neither LFSCK run may report any repair.
# Skipped when the MDS version is not newer than 2.12.51.
5778 [[ "$MDS1_VERSION" -le $(version_code 2.12.51) ]] &&
5779 skip "Need MDS version newer than 2.12.51"
5781 test_mkdir $DIR/$tdir
# Two random UUIDs joined by '@' form the foreign value stored in the file.
5782 local uuid1=$(cat /proc/sys/kernel/random/uuid)
5783 local uuid2=$(cat /proc/sys/kernel/random/uuid)
5785 # create foreign file
5786 $LFS setstripe --foreign=none --flags 0xda05 \
5787 -x "${uuid1}@${uuid2}" $DIR/$tdir/$tfile ||
5788 error "$DIR/$tdir/$tfile: create failed"
# Pre-LFSCK checks: magic, length, type, flags and value as printed by
# "lfs getstripe". The expected length 73 presumably is 36 + 1 + 36, the
# size of "<uuid>@<uuid>" -- confirm.
5790 $LFS getstripe -v $DIR/$tdir/$tfile |
5791 grep "lfm_magic:.*0x0BD70BD0" ||
5792 error "$DIR/$tdir/$tfile: invalid LOV EA foreign magic"
5793 # lfm_length is LOV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
5794 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_length:.*73" ||
5795 error "$DIR/$tdir/$tfile: invalid LOV EA foreign size"
5796 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_type:.*none" ||
5797 error "$DIR/$tdir/$tfile: invalid LOV EA foreign type"
5798 $LFS getstripe -v $DIR/$tdir/$tfile |
5799 grep "lfm_flags:.*0x0000DA05" ||
5800 error "$DIR/$tdir/$tfile: invalid LOV EA foreign flags"
5801 $LFS getstripe $DIR/$tdir/$tfile |
5802 grep "lfm_value:.*${uuid1}@${uuid2}" ||
5803 error "$DIR/$tdir/$tfile: invalid LOV EA foreign value"
5805 # modify striping should fail
5806 $LFS setstripe -c 2 $DIR/$tdir/$tfile &&
5807 error "$DIR/$tdir/$tfile: setstripe should fail"
# Namespace LFSCK on all targets (-A) must complete without repairs.
5809 $START_NAMESPACE -r -A || error "Fail to start LFSCK for namespace"
5811 wait_all_targets_blocked namespace completed 1
5813 # check that "global" namespace_repaired == 0 !!!
5814 local repaired=$(do_facet mds1 \
5815 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
5816 awk '/^namespace_repaired/ { print \\\$2 }'")
5817 [ $repaired -eq 0 ] ||
5818 error "(2) Expect no namespace repair, but got: $repaired"
# Layout LFSCK on all targets (-A) must complete without repairs as well.
5820 $START_LAYOUT -A -r || error "Fail to start LFSCK for layout"
5822 wait_all_targets_blocked layout completed 2
5824 # check that "global" layout_repaired == 0 !!!
5825 local repaired=$(do_facet mds1 \
5826 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
5827 awk '/^layout_repaired/ { print \\\$2 }'")
5828 [ $repaired -eq 0 ] ||
5829 error "(2) Expect no layout repair, but got: $repaired"
# Post-LFSCK: repeat the same checks that were made before LFSCK.
5831 echo "post-lfsck checks of foreign file"
5833 $LFS getstripe -v $DIR/$tdir/$tfile |
5834 grep "lfm_magic:.*0x0BD70BD0" ||
5835 error "$DIR/$tdir/$tfile: invalid LOV EA foreign magic"
5836 # lfm_length is LOV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
5837 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_length:.*73" ||
5838 error "$DIR/$tdir/$tfile: invalid LOV EA foreign size"
5839 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_type:.*none" ||
5840 error "$DIR/$tdir/$tfile: invalid LOV EA foreign type"
5841 $LFS getstripe -v $DIR/$tdir/$tfile |
5842 grep "lfm_flags:.*0x0000DA05" ||
5843 error "$DIR/$tdir/$tfile: invalid LOV EA foreign flags"
5844 $LFS getstripe $DIR/$tdir/$tfile |
5845 grep "lfm_value:.*${uuid1}@${uuid2}" ||
5846 error "$DIR/$tdir/$tfile: invalid LOV EA foreign value"
5848 # modify striping should fail
5849 $LFS setstripe -c 2 $DIR/$tdir/$tfile &&
5850 error "$DIR/$tdir/$tfile: setstripe should fail"
# Reading and writing the foreign file must both fail. The read check now
# calls error; before, the bare message string after && would have been
# executed as a command instead of failing the test.
5853 cat $DIR/$tdir/$tfile && error "$DIR/$tdir/$tfile: read should fail"
5854 cat /etc/passwd > $DIR/$tdir/$tfile &&
5855 error "$DIR/$tdir/$tfile: write should fail"
5857 #remove foreign file
5858 rm $DIR/$tdir/$tfile ||
5859 error "$DIR/$tdir/$tfile: remove of foreign file has failed"
5861 run_test 38 "LFSCK does not break foreign file and reverse is also true"
# Test 39 (registered by the run_test line closing this block): a foreign
# directory must look the same before and after namespace and layout LFSCK,
# and neither LFSCK run may report any repair.
# Skipped when the MDS version is not newer than 2.12.51.
5865 [[ "$MDS1_VERSION" -le $(version_code 2.12.51) ]] &&
5866 skip "Need MDS version newer than 2.12.51"
5868 test_mkdir $DIR/$tdir
# Two random UUIDs joined by '@' form the foreign value stored in the dir.
5869 local uuid1=$(cat /proc/sys/kernel/random/uuid)
5870 local uuid2=$(cat /proc/sys/kernel/random/uuid)
5872 # create foreign dir
5873 $LFS mkdir --foreign=none --xattr="${uuid1}@${uuid2}" --flags=0xda05 \
5874 $DIR/$tdir/${tdir}2 ||
5875 error "$DIR/$tdir/${tdir}2: create failed"
# Pre-LFSCK checks: magic, length, type, flags and value as printed by
# "lfs getdirstripe".
5877 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
5878 grep "lfm_magic:.*0x0CD50CD0" ||
5879 error "$DIR/$tdir/${tdir}2: invalid LMV EA magic"
5880 # lfm_length is LMV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
5881 # - sizeof(lfm_type) - sizeof(lfm_flags)
5882 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_length:.*73" ||
5883 error "$DIR/$tdir/${tdir}2: invalid LMV EA size"
5884 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_type:.*none" ||
5885 error "$DIR/$tdir/${tdir}2: invalid LMV EA type"
5886 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
5887 grep "lfm_flags:.*0x0000DA05" ||
5888 error "$DIR/$tdir/${tdir}2: invalid LMV EA flags"
5889 $LFS getdirstripe $DIR/$tdir/${tdir}2 |
5890 grep "lfm_value.*${uuid1}@${uuid2}" ||
5891 error "$DIR/$tdir/${tdir}2: invalid LMV EA value"
# Creating a file inside the foreign dir must fail. The check now calls
# error; before, the bare message string after && would have been executed
# as a command instead of failing the test.
5893 # file create in dir should fail
5894 touch $DIR/$tdir/${tdir}2/$tfile &&
5895 error "$DIR/${tdir}2: file create should fail"
# chmod and chown on the foreign dir are expected to succeed.
5898 chmod 777 $DIR/$tdir/${tdir}2 ||
5899 error "$DIR/${tdir}2: chmod failed"
5902 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/${tdir}2 ||
5903 error "$DIR/${tdir}2: chown failed"
# Namespace LFSCK on all targets (-A) must complete without repairs.
5905 $START_NAMESPACE -r -A || error "Fail to start LFSCK for namespace"
5907 wait_all_targets_blocked namespace completed 1
5909 # check that "global" namespace_repaired == 0 !!!
5910 local repaired=$(do_facet mds1 \
5911 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
5912 awk '/^namespace_repaired/ { print \\\$2 }'")
5913 [ $repaired -eq 0 ] ||
5914 error "(2) Expect nothing to be repaired, but got: $repaired"
# Layout LFSCK on all targets (-A) must complete without repairs as well.
5916 $START_LAYOUT -A -r || error "Fail to start LFSCK for layout"
5918 wait_all_targets_blocked layout completed 2
5920 # check that "global" layout_repaired == 0 !!!
5921 local repaired=$(do_facet mds1 \
5922 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
5923 awk '/^layout_repaired/ { print \\\$2 }'")
5924 [ $repaired -eq 0 ] ||
5925 error "(2) Expect no layout repair, but got: $repaired"
# Post-LFSCK: repeat the same checks that were made before LFSCK.
5927 echo "post-lfsck checks of foreign dir"
5929 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
5930 grep "lfm_magic:.*0x0CD50CD0" ||
5931 error "$DIR/$tdir/${tdir}2: invalid LMV EA magic"
5932 # lfm_length is LMV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
5933 # - sizeof(lfm_type) - sizeof(lfm_flags)
5934 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_length:.*73" ||
5935 error "$DIR/$tdir/${tdir}2: invalid LMV EA size"
5936 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_type:.*none" ||
5937 error "$DIR/$tdir/${tdir}2: invalid LMV EA type"
5938 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
5939 grep "lfm_flags:.*0x0000DA05" ||
5940 error "$DIR/$tdir/${tdir}2: invalid LMV EA flags"
5941 $LFS getdirstripe $DIR/$tdir/${tdir}2 |
5942 grep "lfm_value.*${uuid1}@${uuid2}" ||
5943 error "$DIR/$tdir/${tdir}2: invalid LMV EA value"
# Same missing-error fix as in the pre-LFSCK file-create check.
5945 # file create in dir should fail
5946 touch $DIR/$tdir/${tdir}2/$tfile &&
5947 error "$DIR/${tdir}2: file create should fail"
5950 chmod 777 $DIR/$tdir/${tdir}2 ||
5951 error "$DIR/${tdir}2: chmod failed"
5954 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/${tdir}2 ||
5955 error "$DIR/${tdir}2: chown failed"
# Finally the foreign dir must be removable with rmdir.
5958 rmdir $DIR/$tdir/${tdir}2 ||
5959 error "$DIR/$tdir/${tdir}2: remove of foreign dir has failed"
5961 run_test 39 "LFSCK does not break foreign dir and reverse is also true"
# Test 40a, "LFSCK correctly fixes lmm_oi in composite layout" (title taken
# from the run_test line closing this block). Needs at least 2 MDTs.
5964 [[ $MDSCOUNT -ge 2 ]] || skip "needs >= 2 MDTs"
5966 check_mount_and_prep
# Create dir1 on MDT index 1 and give it a three-component layout
# (three -E options); f1 is then created inside it.
5967 $LFS mkdir -i 1 $DIR/$tdir/dir1
5968 $LFS setstripe -E 1M -c1 -S 1M -E 128M -c2 -S 4M -E eof $DIR/$tdir/dir1
5970 touch $DIR/$tdir/dir1/f1
# Record f1's layout before the migration and the LFSCK run.
5971 local layout1=$(get_layout_param $DIR/$tdir/dir1/f1)
5973 echo "Migrate $DIR/$tdir/dir1 from MDT1 to MDT0"
5974 $LFS migrate -m 0 $DIR/$tdir/dir1
# Start layout LFSCK (-r) on ${MDT_DEV} and wait for the "completed"
# status; 32 is the last argument given to wait_update_facet (presumably
# a timeout in seconds).
5976 echo "trigger LFSCK for layout"
5977 do_facet $SINGLEMDS $LCTL lfsck_start -M ${MDT_DEV} -t layout -r
5979 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5980 mdd.${MDT_DEV}.lfsck_layout |
5981 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5983 error "(2) unexpected status"
# The layout reported after LFSCK must equal the one recorded before.
5986 local layout2=$(get_layout_param $DIR/$tdir/dir1/f1)
5988 [[ "$layout1" == "$layout2" ]] || error "layout lost after lfsck"
5990 run_test 40a "LFSCK correctly fixes lmm_oi in composite layout"
# Test 41, "SEL support in LFSCK" (title taken from the run_test line
# closing this block).
# Save the current debug setting of $SINGLEMDS so it can be put back at the
# end, then add "lfsck" to it.
5994 local old_debug=$(do_facet $SINGLEMDS $LCTL get_param -n debug)
5996 do_facet $SINGLEMDS $LCTL set_param debug=+lfsck
# Create the test file with a two-component layout that uses -z.
5997 $LFS setstripe -E 1G -z 64M -E -1 -z 128M $DIR/$tfile
# Dump the debug log and discard the output (presumably to empty the log
# before LFSCK starts -- confirm).
5998 do_facet $SINGLEMDS $LCTL dk > /dev/null
6000 echo "trigger LFSCK for SEL layout"
6001 do_facet $SINGLEMDS $LCTL lfsck_start -M ${MDT_DEV} -A -t all -r -n on
# Wait for the lfsck_layout status on ${MDT_DEV} to read "completed".
6002 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
6003 mdd.${MDT_DEV}.lfsck_layout |
6004 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
6006 error "(2) unexpected status"
# Any debug-log line mentioning lfsck_layout_verify_header is treated as a
# failure.
6009 local errors=$(do_facet $SINGLEMDS $LCTL dk |
6010 grep "lfsck_layout_verify_header")
6012 [[ "x$errors" == "x" ]] || {
6014 error "lfsck failed"
# Put back the debug setting saved at the start.
6017 do_facet $SINGLEMDS "$LCTL set_param debug='$old_debug'"
6019 run_test 41 "SEL support in LFSCK"
6021 # restore the MDS/OST sizes and OST count that were saved at the top of
# this script, before they were overridden for the tests
6022 MDSSIZE=${SAVED_MDSSIZE}
6023 OSTSIZE=${SAVED_OSTSIZE}
6024 OSTCOUNT=${SAVED_OSTCOUNT}
6026 # finally, clean up the system: set it up again with REFORMAT="yes" now
# that the restored values are in effect, then run the final check/cleanup
6027 REFORMAT="yes" cleanup_and_setup_lustre
6030 check_and_cleanup_lustre