3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
# Locate the Lustre tree (default: parent of this script's directory) and
# load the shared test framework.
11 LUSTRE=${LUSTRE:-$(dirname $0)/..}
12 . $LUSTRE/tests/test-framework.sh
16 # bug number for skipped test:
17 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT "
18 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
20 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
# Leave with status 0 when require_dsh_mds fails.
23 require_dsh_mds || exit 0
# Skip when check_versions fails (interoperation mode).
# NOTE(review): the closing "fi" of this "if" is not visible in this excerpt.
27 if ! check_versions; then
28 skip "It is NOT necessary to test lfsck under interoperation mode"
# Skip unless the MDS1 version is at least 2.3.60.
32 (( $MDS1_VERSION >= $(version_code 2.3.60) )) ||
33 skip "Need MDS version at least 2.3.60"
# Remember the incoming values before they are overridden below
# (presumably restored later, outside this excerpt -- confirm).
37 SAVED_MDSSIZE=${MDSSIZE}
38 SAVED_OSTSIZE=${OSTSIZE}
39 SAVED_OSTCOUNT=${OSTCOUNT}
40 # use small MDS + OST sizes to speed up formatting
41 # do not make MDSSIZE/OSTSIZE too small: that affects the default journal size
43 [ "$mds1_FSTYPE" == zfs ] && MDSSIZE=300000
45 [ "$ost1_FSTYPE" == zfs ] && OSTSIZE=300000
47 # no need for many OSTs; cap the count to reduce format/start/stop overhead
49 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
51 # build up a clean test environment.
52 REFORMAT="yes" check_and_setup_lustre
# Target names and command strings shared by the tests. The command strings
# are expanded unquoted by the tests (e.g. `$START_NAMESPACE -r`) so that
# they word-split into a full command line; extra options are appended there.
54 MDT_DEV="${FSNAME}-MDT0000"
55 OST_DEV="${FSNAME}-OST0000"
56 MDT_DEVNAME=$(mdsdevname ${SINGLEMDS//mds/})
57 START_NAMESPACE="do_facet $SINGLEMDS \
58 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
59 START_LAYOUT="do_facet $SINGLEMDS \
60 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
61 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
62 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
63 SHOW_NAMESPACE="do_facet $SINGLEMDS \
64 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
65 SHOW_LAYOUT="do_facet $SINGLEMDS \
66 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
67 SHOW_LAYOUT_ON_OST="do_facet ost1 \
68 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# MDS mount option sets: all of them add user_xattr; NOSCRUB additionally
# passes noscrub and SKIP_LFSCK additionally passes skip_lfsck.
69 MOUNT_OPTS_SCRUB="$MDS_MOUNT_OPTS -o user_xattr"
70 MOUNT_OPTS_NOSCRUB="$MDS_MOUNT_OPTS -o user_xattr,noscrub"
71 MOUNT_OPTS_SKIP_LFSCK="$MDS_MOUNT_OPTS -o user_xattr,skip_lfsck"
# Body of the helper that populates $DIR/$tdir before a test.
# NOTE(review): the function header, the assignments of $ndirs, $nfiles and
# $igif (presumably taken from the arguments -- confirm) and the closing
# fi/done lines are not visible in this excerpt.
80 echo "preparing... $nfiles * $ndirs files will be created $(date)."
# When $igif is non-empty, inject fail_loc 0x1504 on $SINGLEMDS for the
# duration of the file creation below; it is cleared again further down.
81 if [ ! -z $igif ]; then
82 #define OBD_FAIL_FID_IGIF 0x1504
83 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
# Seed the directory with the test scripts themselves as sample content.
86 cp $LUSTRE/tests/*.sh $DIR/$tdir/
# With $ndirs > 0: create $ndirs "d*" and "f*" entries, then (with
# $nfiles > 0) $nfiles "f*" entries under each d${i}, then $ndirs "e*" entries.
87 if [ $ndirs -gt 0 ]; then
88 createmany -d $DIR/$tdir/d $ndirs
89 createmany -m $DIR/$tdir/f $ndirs
90 if [ $nfiles -gt 0 ]; then
91 for ((i = 0; i < $ndirs; i++)); do
92 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
93 /dev/null || error "createmany $nfiles"
96 createmany -d $DIR/$tdir/e $ndirs
# In the $igif case also create "dummy" while the fail_loc is still set,
# then reset fail_loc to 0.
99 if [ ! -z $igif ]; then
100 touch $DIR/$tdir/dummy
101 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
104 echo "prepared $(date)."
# Check MDT0 with e2fsck in "-n" mode and fail the test on inconsistency.
# Returns 0 immediately unless mds1 is ldiskfs-backed. Stops $SINGLEMDS
# first and restarts it afterwards with the noscrub mount options.
# NOTE(review): the consumer of the pipe on the first run_e2fsck line and the
# closing braces are not visible in this excerpt; presumably a filter on the
# e2fsck output gates the second run and error (2) -- confirm.
107 run_e2fsck_on_mdt0() {
108 [ $mds1_FSTYPE == ldiskfs ] || return 0
110 stop $SINGLEMDS > /dev/null || error "(0) Fail to the stop MDT0"
111 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n" |
113 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n"
114 error "(2) Detected inconsistency on MDT0"
116 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
117 error "(3) Fail to start MDT0"
# Run "lfsck_query -w" for component $com on ${FSNAME}-MDT0000 and require
# that the number printed on the "${com}_mdts_${status}" line equals
# $MDSCOUNT; otherwise dump the plain query output and raise error ($err).
# NOTE(review): $com, $status and $err are presumably locals taken from the
# three arguments (a visible caller passes "namespace completed 4"); their
# assignments and the closing braces are not visible in this excerpt.
120 wait_all_targets_blocked() {
# \\\$2 keeps awk's $2 literal through the local and the remote shell.
125 local count=$(do_facet mds1 \
126 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 -w |
127 awk '/^${com}_mdts_${status}/ { print \\\$2 }'")
128 [[ $count -eq $MDSCOUNT ]] || {
129 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
130 error "($err) only $count of $MDSCOUNT MDTs are in ${status}"
# NOTE(review): tail of a separate helper whose header and variable
# assignments are not visible in this excerpt.
# Polls the lfsck_query output (without -w) through wait_update_facet until
# the "${com}_mdts_${status}" count equals $MDSCOUNT, bounded by $LTIME
# (presumably seconds -- confirm); on failure dumps the query output and
# raises error ($err).
139 wait_update_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 |
140 awk '/^${com}_mdts_${status}/ { print \\\$2 }'" \
141 "$MDSCOUNT" $LTIME || {
142 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
143 error "($err) some MDTs are not in ${status}"
# Body of test 0, registered by the run_test call at the end of this block.
# NOTE(review): the function header, closing braces and other short lines
# are not visible in this excerpt.
# Flow: with fail_loc 0x1600 / fail_val 3 set, start namespace LFSCK with -r
# and expect 'scanning-phase1'; stop it and expect 'stopped'; start it again
# and expect 'scanning-phase1'; clear fail_loc and wait for 'completed';
# updated_phase1 must be 0; a second -r run must raise success_count by
# exactly one; finally stopall must succeed.
150 #define OBD_FAIL_LFSCK_DELAY1 0x1600
151 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
152 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
154 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
156 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
157 [ "$STATUS" == "scanning-phase1" ] ||
158 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
160 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
162 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
163 [ "$STATUS" == "stopped" ] ||
164 error "(6) Expect 'stopped', but got '$STATUS'"
# Started without -r this time.
166 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
168 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
169 [ "$STATUS" == "scanning-phase1" ] ||
170 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the fail_loc/fail_val and wait for the scan to reach 'completed'.
172 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
173 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
174 mdd.${MDT_DEV}.lfsck_namespace |
175 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
177 error "(9) unexpected status"
180 local repaired=$($SHOW_NAMESPACE |
181 awk '/^updated_phase1/ { print $2 }')
182 [ $repaired -eq 0 ] ||
183 error "(10) Expect nothing to be repaired, but got: $repaired"
# success_count is sampled before and after one more complete -r run.
185 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
186 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
187 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
188 mdd.${MDT_DEV}.lfsck_namespace |
189 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
191 error "(12) unexpected status"
194 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
195 [ $((scanned1 + 1)) -eq $scanned2 ] ||
196 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
198 echo "stopall, should NOT crash LU-3649"
199 stopall || error "(14) Fail to stopall"
201 run_test 0 "Control LFSCK manually"
# Body of test 1a, registered by the run_test call at the end of this block.
# NOTE(review): the function header, closing braces and other short lines
# are not visible in this excerpt.
# Flow: create "dummy" while fail_loc 0x1501 is set, run namespace LFSCK
# with -r to 'completed', require exactly one repair, then mount the client
# and list the directory while fail_loc 0x1505 is set.
206 #define OBD_FAIL_FID_INDIR 0x1501
207 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
208 touch $DIR/$tdir/dummy
210 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
212 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
213 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
214 mdd.${MDT_DEV}.lfsck_namespace |
215 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
217 error "(4) unexpected status"
# Prefer the dirent_repaired counter; fall back to updated_phase1 when the
# server does not report it.
220 local repaired=$($SHOW_NAMESPACE |
221 awk '/^dirent_repaired/ { print $2 }')
222 # for interop with old server
223 [ -z "$repaired" ] &&
224 repaired=$($SHOW_NAMESPACE |
225 awk '/^updated_phase1/ { print $2 }')
227 [ $repaired -eq 1 ] ||
228 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
232 mount_client $MOUNT || error "(6) Fail to start client!"
234 #define OBD_FAIL_FID_LOOKUP 0x1505
235 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
236 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
238 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
240 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
# Body of test 1b, registered by the run_test call at the end of this block.
# NOTE(review): the function header, closing braces and other short lines
# are not visible in this excerpt.
# Flow: skip unless mds1 is ldiskfs; create "dummy" while fail_loc 0x1502 is
# set; run namespace LFSCK with -r while fail_loc 0x1506 is set and wait for
# 'completed'; require exactly one repair; then mount the client and stat
# the file while fail_loc 0x1505 is set.
244 [ "$mds1_FSTYPE" != ldiskfs ] &&
245 skip "OI Scrub not implemented for ZFS"
249 #define OBD_FAIL_FID_INLMA 0x1502
250 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
251 touch $DIR/$tdir/dummy
253 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
255 #define OBD_FAIL_FID_NOLMA 0x1506
256 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
257 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
258 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
259 mdd.${MDT_DEV}.lfsck_namespace |
260 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
262 error "(4) unexpected status"
# Prefer the dirent_repaired counter; fall back to updated_phase1 when the
# server does not report it.
265 local repaired=$($SHOW_NAMESPACE |
266 awk '/^dirent_repaired/ { print $2 }')
267 # for interop with old server
268 [ -z "$repaired" ] &&
269 repaired=$($SHOW_NAMESPACE |
270 awk '/^updated_phase1/ { print $2 }')
272 [ $repaired -eq 1 ] ||
273 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
275 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
278 mount_client $MOUNT || error "(6) Fail to start client!"
280 #define OBD_FAIL_FID_LOOKUP 0x1505
281 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
282 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
284 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
286 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
# Body of test 1c, registered by the run_test call at the end of this block.
# NOTE(review): the function header, closing braces and other short lines
# are not visible in this excerpt.
# Flow: create "dummy" while fail_loc 0x1504 is set, run namespace LFSCK
# with -r to 'completed', require exactly one repair, then mount the client
# and list the directory while fail_loc 0x1505 is set.
291 #define OBD_FAIL_FID_IGIF 0x1504
292 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
293 touch $DIR/$tdir/dummy
295 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
297 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
298 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
299 mdd.${MDT_DEV}.lfsck_namespace |
300 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
302 error "(4) unexpected status"
# Prefer the dirent_repaired counter; fall back to updated_phase1 when the
# server does not report it.
305 local repaired=$($SHOW_NAMESPACE |
306 awk '/^dirent_repaired/ { print $2 }')
307 # for interop with old server
308 [ -z "$repaired" ] &&
309 repaired=$($SHOW_NAMESPACE |
310 awk '/^updated_phase1/ { print $2 }')
312 [ $repaired -eq 1 ] ||
313 error "(5) Fail to repair lost FID-in-dirent: $repaired"
317 mount_client $MOUNT || error "(6) Fail to start client!"
319 #define OBD_FAIL_FID_LOOKUP 0x1505
320 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
321 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
323 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
325 run_test 1c "LFSCK can find out and repair lost FID-in-dirent"
# Body of test 1d, registered by the run_test call at the end of this block.
# NOTE(review): the function header, closing braces/done and other short
# lines (presumably including the MDS stop/start around the direct mount --
# confirm) are not visible in this excerpt.
# Flow: skip on MDS older than 2.13.57 or with fewer than 2 MDTs; create a
# file, a subdirectory and a directory on MDT index 1 under $tdir and print
# their linkEA; edit the trusted.lma xattr of $tdir directly on the mds1
# backing filesystem; run an OI scrub, then namespace LFSCK with -r -A; and
# finally require that the pfid in each child's linkEA equals the FID that
# path2fid reports for $tdir.
328 [ $MDS1_VERSION -lt $(version_code 2.13.57) ] &&
329 skip "MDS older than 2.13.57"
330 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
334 touch $DIR/$tdir/$tfile
335 mkdir $DIR/$tdir/subdir
336 $LFS mkdir -i 1 $DIR/$tdir/remotedir
# Print the state before the corruption, for the test log.
337 $LFS path2fid $DIR/$tdir
338 ll_decode_linkea $DIR/$tdir/$tfile
339 ll_decode_linkea $DIR/$tdir/subdir
340 ll_decode_linkea $DIR/$tdir/remotedir
342 local mntpt=$(facet_mntpt mds1)
344 # unlink OI files to remove the stale entry
345 local saved_opts=$MDS_MOUNT_OPTS
348 mount_fstype mds1 $mntpt
# The sed expression turns a '0' that is followed by exactly eight more
# characters at the end of a line of the hex dump into '1'; the edited dump
# is fed back through "setfattr --restore=-".
349 # increase $tdir FID oid in LMA
350 do_facet mds1 "getfattr -d -m trusted.lma -e hex \
351 --absolute-names $mntpt/ROOT/$tdir | \
352 sed -E 's/0(.{8})$/1\1/' | setfattr --restore=-"
353 unmount_fstype mds1 $mntpt
356 # the FID oid in LMA was increased above, and it's not in OI table,
357 # run scrub first to generate mapping in OI, so the following namespace
358 # check can fix linkea correctly, this is not necessary normally.
359 do_facet mds1 $LCTL lfsck_start -M ${MDT_DEV} -t scrub ||
360 error "failed to start LFSCK for scrub!"
361 wait_update_facet mds1 "$LCTL get_param -n \
362 osd-*.$(facet_svc mds1).oi_scrub |
363 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
364 error "unexpected status"
366 $START_NAMESPACE -r -A || error "fail to start LFSCK for namespace!"
367 wait_update_facet mds1 "$LCTL get_param -n \
368 mdd.${MDT_DEV}.lfsck_namespace |
369 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
371 error "unexpected status"
# Print the state after the repair, for the test log.
373 $LFS path2fid $DIR/$tdir
374 ll_decode_linkea $DIR/$tdir/$tfile
375 ll_decode_linkea $DIR/$tdir/subdir
376 ll_decode_linkea $DIR/$tdir/remotedir
# NOTE(review): fid, pfid and f have no visible "local" declaration here;
# it may be on one of the missing lines.
381 fid=$($LFS path2fid $DIR/$tdir)
382 for f in $tfile subdir remotedir; do
383 pfid=$(ll_decode_linkea $DIR/$tdir/$f |
384 awk '/pfid/ { print $3 }')
386 [ "$pfid" == "$fid" ] || error "$fid in LMA != $pfid in linkea"
389 run_test 1d "LFSCK can fix mismatch of FID in LMA and FID in child linkea"
# Body of test 2a, registered by the run_test call at the end of this block.
# NOTE(review): the function header, closing braces and other short lines
# are not visible in this excerpt.
# Flow: create "dummy" while fail_loc 0x1603 is set, run namespace LFSCK
# with -r to 'completed', require exactly one linkEA repair, then mount the
# client and verify that stat reports "Links: 1" and that fid2path maps the
# file's FID back to its path.
394 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
395 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
396 touch $DIR/$tdir/dummy
398 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
400 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
401 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
402 mdd.${MDT_DEV}.lfsck_namespace |
403 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
405 error "(4) unexpected status"
# Prefer the linkea_repaired counter; fall back to updated_phase2 when the
# server does not report it.
408 local repaired=$($SHOW_NAMESPACE |
409 awk '/^linkea_repaired/ { print $2 }')
410 # for interop with old server
411 [ -z "$repaired" ] &&
412 repaired=$($SHOW_NAMESPACE |
413 awk '/^updated_phase2/ { print $2 }')
415 [ $repaired -eq 1 ] ||
416 error "(5) Fail to repair crashed linkEA: $repaired"
420 mount_client $MOUNT || error "(6) Fail to start client!"
422 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
423 error "(7) Fail to stat $DIR/$tdir/dummy"
# Round trip: path -> FID -> path must give the original path.
425 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
426 local dummyname=$($LFS fid2path $DIR $dummyfid)
427 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
428 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
430 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
# Body of test 2b, registered by the run_test call at the end of this block.
# NOTE(review): the function header, closing braces and other short lines
# are not visible in this excerpt.
# Flow: create "dummy" while fail_loc 0x1604 is set, run namespace LFSCK
# with -r to 'completed', require updated_phase2 == 1, then mount the client
# and verify "Links: 1" plus the path -> FID -> path round trip.
436 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
437 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
438 touch $DIR/$tdir/dummy
440 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
442 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
443 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
444 mdd.${MDT_DEV}.lfsck_namespace |
445 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
447 error "(4) unexpected status"
450 local repaired=$($SHOW_NAMESPACE |
451 awk '/^updated_phase2/ { print $2 }')
452 [ $repaired -eq 1 ] ||
453 error "(5) Fail to repair crashed linkEA: $repaired"
457 mount_client $MOUNT || error "(6) Fail to start client!"
459 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
460 error "(7) Fail to stat $DIR/$tdir/dummy"
462 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
463 local dummyname=$($LFS fid2path $DIR $dummyfid)
464 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
465 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
467 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
# Body of test 2c, registered by the run_test call at the end of this block.
# NOTE(review): the function header, closing braces and other short lines
# are not visible in this excerpt.
# Flow: skip unless the MDS version is greater than 2.4.90; create "dummy"
# while fail_loc 0x1605 is set, run namespace LFSCK with -r to 'completed',
# require updated_phase2 == 1, then mount the client and verify "Links: 1"
# plus the path -> FID -> path round trip.
471 (( $MDS1_VERSION > $(version_code 2.4.90) )) ||
472 skip "MDS older than 2.4.90"
476 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
477 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
478 touch $DIR/$tdir/dummy
480 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
482 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
483 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
484 mdd.${MDT_DEV}.lfsck_namespace |
485 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
487 error "(4) unexpected status"
490 local repaired=$($SHOW_NAMESPACE |
491 awk '/^updated_phase2/ { print $2 }')
492 [ $repaired -eq 1 ] ||
493 error "(5) Fail to repair crashed linkEA: $repaired"
497 mount_client $MOUNT || error "(6) Fail to start client!"
499 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
500 error "(7) Fail to stat $DIR/$tdir/dummy"
502 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
503 local dummyname=$($LFS fid2path $DIR $dummyfid)
504 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
505 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
507 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
# Body of test 2d, registered by the run_test call at the end of this block.
# NOTE(review): the function header, closing braces and other short lines
# are not visible in this excerpt.
# Flow: skip unless the MDS version is greater than 2.6.50; create "dummy"
# while fail_loc 0x161d is set, run namespace LFSCK with -r to 'completed',
# require linkea_repaired == 1, then mount the client and verify "Links: 1"
# plus the path -> FID -> path round trip.
511 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
512 skip "MDS older than 2.6.50, LU-4788"
516 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
517 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
518 touch $DIR/$tdir/dummy
520 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
522 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
523 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
524 mdd.${MDT_DEV}.lfsck_namespace |
525 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
527 error "(4) unexpected status"
530 local repaired=$($SHOW_NAMESPACE |
531 awk '/^linkea_repaired/ { print $2 }')
532 [ $repaired -eq 1 ] ||
533 error "(5) Fail to repair crashed linkEA: $repaired"
537 mount_client $MOUNT || error "(6) Fail to start client!"
539 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
540 error "(7) Fail to stat $DIR/$tdir/dummy"
542 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
543 local dummyname=$($LFS fid2path $DIR $dummyfid)
544 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
545 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
547 run_test 2d "LFSCK can recover the missing linkEA entry"
# Body of test 2e, registered by the run_test call at the end of this block.
# NOTE(review): the function header, closing brace and other short lines
# are not visible in this excerpt.
# Flow: skip with fewer than 2 MDTs or an MDS version not greater than
# 2.6.50; create d0 with "-i 1" and, while fail_loc 0x1603 is set, d0/d1
# with "-i 0"; run namespace LFSCK with -r -A; wait for the 'completed'
# state through wait_all_targets_blocked; require linkea_repaired == 1 and
# a correct path -> FID -> path round trip for d0/d1.
551 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
552 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
553 skip "MDS older than 2.6.50, LU-5511"
557 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
559 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
560 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
561 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
562 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
564 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
# The third argument (4) is the number shown in the helper's error message.
566 wait_all_targets_blocked namespace completed 4
568 local repaired=$($SHOW_NAMESPACE |
569 awk '/^linkea_repaired/ { print $2 }')
570 [ $repaired -eq 1 ] ||
571 error "(5) Fail to repair crashed linkEA: $repaired"
573 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
574 local name=$($LFS fid2path $DIR $fid)
575 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
576 error "(6) Fail to repair linkEA: $fid $name"
578 run_test 2e "namespace LFSCK can verify remote object linkEA"
# Body of test 3, registered by the run_test call at the end of this block.
# NOTE(review): the function header, closing braces and other short lines
# (presumably including the step that creates d0/f0 and d0/f1 -- confirm)
# are not visible in this excerpt.
# Flow: skip unless the MDS version is greater than 2.6.50; hard-link two
# existing files into a new "dummy" directory; create edir/f0 and edir/f1
# and hard-link them to w0 (while fail_loc 0x1603 is set) and w1 (while
# fail_loc 0x1604 is set); run namespace LFSCK with -r to 'completed';
# require checked_phase2 >= 4 and multiple_linked_repaired >= 2.
582 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
583 skip "MDS older than 2.6.50, LU-4788"
587 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
588 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
589 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
591 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
592 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
593 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
595 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
596 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
597 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
599 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
600 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
601 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
603 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
605 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
606 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
607 mdd.${MDT_DEV}.lfsck_namespace |
608 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
610 error "(10) unexpected status"
613 local checked=$($SHOW_NAMESPACE |
614 awk '/^checked_phase2/ { print $2 }')
615 [ $checked -ge 4 ] ||
616 error "(11) Fail to check multiple-linked object: $checked"
618 local repaired=$($SHOW_NAMESPACE |
619 awk '/^multiple_linked_repaired/ { print $2 }')
620 [ $repaired -ge 2 ] ||
621 error "(12) Fail to repair multiple-linked object: $repaired"
623 run_test 3 "LFSCK can verify multiple-linked objects"
# Body of test 4, registered by the run_test call at the end of this block.
# NOTE(review): the function header, closing braces and other short lines
# are not visible in this excerpt.
# Flow: skip unless mds1 is ldiskfs; unmount the client and stop the MDS;
# run mds_backup_restore; start the MDS with the noscrub options; with
# fail_loc 0x1601 / fail_val 1 set, start namespace LFSCK with -r and wait
# until its flags read 'inconsistent' while the status is 'scanning-phase1';
# clear fail_loc and wait for 'completed'; flags must then be empty and at
# least 9 repairs must be reported; finally mount the client and list the
# directory while fail_loc 0x1505 is set.
627 [ "$mds1_FSTYPE" != ldiskfs ] &&
628 skip "OI Scrub not implemented for ZFS"
631 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
632 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
634 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
635 echo "start $SINGLEMDS with disabling OI scrub"
636 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
637 error "(2) Fail to start MDS!"
639 #define OBD_FAIL_LFSCK_DELAY2 0x1601
640 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
641 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
642 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
643 mdd.${MDT_DEV}.lfsck_namespace |
644 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
646 error "(5) unexpected status"
649 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
650 [ "$STATUS" == "scanning-phase1" ] ||
651 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
653 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
654 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
655 mdd.${MDT_DEV}.lfsck_namespace |
656 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
658 error "(7) unexpected status"
# NOTE(review): FLAGS has no visible "local" declaration in this block.
661 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
662 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
# Prefer the dirent_repaired counter; fall back to updated_phase1 when the
# server does not report it.
664 local repaired=$($SHOW_NAMESPACE |
665 awk '/^dirent_repaired/ { print $2 }')
666 # for interop with old server
667 [ -z "$repaired" ] &&
668 repaired=$($SHOW_NAMESPACE |
669 awk '/^updated_phase1/ { print $2 }')
671 [ $repaired -ge 9 ] ||
672 error "(9) Fail to re-generate FID-in-dirent: $repaired"
676 mount_client $MOUNT || error "(10) Fail to start client!"
678 #define OBD_FAIL_FID_LOOKUP 0x1505
679 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
680 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
681 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
683 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
# Body of test 5, registered by the run_test call at the end of this block.
# NOTE(review): the function header, closing braces and other short lines
# are not visible in this excerpt.
# Flow: skip unless mds1 is ldiskfs; unmount the client and stop the MDS;
# run mds_backup_restore with an extra argument 1; start the MDS with the
# noscrub options; with fail_loc 0x1601 / fail_val 1 set, start namespace
# LFSCK with -r and wait until its flags read 'inconsistent,upgrade' while
# the status is 'scanning-phase1'; clear fail_loc and wait for 'completed';
# flags must then be empty and at least 2 repairs must be reported; finally
# mount the client, stat "dummy" and list the directory while fail_loc
# 0x1505 is set, and check the path -> FID -> path round trip for "dummy".
687 [ "$mds1_FSTYPE" != ldiskfs ] &&
688 skip "OI Scrub not implemented for ZFS"
691 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
692 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
694 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
695 echo "start $SINGLEMDS with disabling OI scrub"
696 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
697 error "(2) Fail to start MDS!"
699 #define OBD_FAIL_LFSCK_DELAY2 0x1601
700 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
701 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
702 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
703 mdd.${MDT_DEV}.lfsck_namespace |
704 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
706 error "(5) unexpected status"
709 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
710 [ "$STATUS" == "scanning-phase1" ] ||
711 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
713 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
714 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
715 mdd.${MDT_DEV}.lfsck_namespace |
716 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
718 error "(7) unexpected status"
# NOTE(review): FLAGS has no visible "local" declaration in this block.
721 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
722 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
# Prefer the dirent_repaired counter; fall back to updated_phase1 when the
# server does not report it.
724 local repaired=$($SHOW_NAMESPACE |
725 awk '/^dirent_repaired/ { print $2 }')
726 # for interop with old server
727 [ -z "$repaired" ] &&
728 repaired=$($SHOW_NAMESPACE |
729 awk '/^updated_phase1/ { print $2 }')
731 [ $repaired -ge 2 ] ||
732 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
736 mount_client $MOUNT || error "(10) Fail to start client!"
738 #define OBD_FAIL_FID_LOOKUP 0x1505
739 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
740 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
742 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
744 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
745 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
746 local dummyname=$($LFS fid2path $DIR $dummyfid)
747 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
748 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
750 run_test 5 "LFSCK can handle IGIF object upgrading"
# Body of test 6a, registered by the run_test call at the end of this block.
# NOTE(review): the function header, closing braces, the sleep announced by
# the comment below and the last stage of the two position pipelines (both
# end in "|") are not visible in this excerpt.
# Flow: with fail_loc 0x1600 / fail_val 1 set, start namespace LFSCK with -r
# and expect 'scanning-phase1'; set fail_loc 0x80001608 and wait for status
# 'failed'; record last_checkpoint_position as POS0; set fail_loc 0x1600
# again and start LFSCK without -r; expect 'scanning-phase1' and read
# latest_start_position as POS1; require POS0 < POS1; clear fail_loc and
# wait for 'completed'.
755 #define OBD_FAIL_LFSCK_DELAY1 0x1600
756 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
757 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
759 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
760 [ "$STATUS" == "scanning-phase1" ] ||
761 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
763 # Sleep 3 sec to guarantee at least one object processed by LFSCK
765 # Fail the LFSCK to guarantee there is at least one checkpoint
766 #define OBD_FAIL_LFSCK_FATAL1 0x1608
# NOTE(review): the value set here is 0x1608 with the 0x80000000 bit added;
# the meaning of that bit is not shown in this file -- confirm.
767 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
768 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
769 mdd.${MDT_DEV}.lfsck_namespace |
770 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
772 error "(4) unexpected status"
775 local POS0=$($SHOW_NAMESPACE |
776 awk '/^last_checkpoint_position/ { print $2 }' |
779 #define OBD_FAIL_LFSCK_DELAY1 0x1600
780 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
781 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
783 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
784 [ "$STATUS" == "scanning-phase1" ] ||
785 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
787 local POS1=$($SHOW_NAMESPACE |
788 awk '/^latest_start_position/ { print $2 }' |
790 [[ $POS0 -lt $POS1 ]] ||
791 error "(7) Expect larger than: $POS0, but got $POS1"
793 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
794 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
795 mdd.${MDT_DEV}.lfsck_namespace |
796 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
798 error "(8) unexpected status"
801 run_test 6a "LFSCK resumes from last checkpoint (1)"
# Body of test 6b, registered by the run_test call at the end of this block.
# NOTE(review): the function header, closing braces, the sleep announced by
# the comment below, the last stage of the two O_POS pipelines (both end in
# "|") and the else/fi of the final comparison are not visible in this
# excerpt.
# Flow: with fail_loc 0x1601 / fail_val 1 set, start namespace LFSCK with -r
# and expect 'scanning-phase1'; set fail_loc 0x80001609 and wait for status
# 'failed'; record fields 2 and 4 of last_checkpoint_position as O_POS0 and
# D_POS0; restart LFSCK without -r under the same fail_loc and record fields
# 2 and 4 of latest_start_position as O_POS1 and D_POS1; compare the
# positions; clear fail_loc and wait for 'completed'.
806 #define OBD_FAIL_LFSCK_DELAY2 0x1601
807 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
808 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
810 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
811 [ "$STATUS" == "scanning-phase1" ] ||
812 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
814 # Sleep 5 sec to guarantee that we are in the directory scanning
816 # Fail the LFSCK to guarantee there is at least one checkpoint
817 #define OBD_FAIL_LFSCK_FATAL2 0x1609
# NOTE(review): the value set here is 0x1609 with the 0x80000000 bit added;
# the meaning of that bit is not shown in this file -- confirm.
818 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
819 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
820 mdd.${MDT_DEV}.lfsck_namespace |
821 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
823 error "(4) unexpected status"
826 local O_POS0=$($SHOW_NAMESPACE |
827 awk '/^last_checkpoint_position/ { print $2 }' |
830 local D_POS0=$($SHOW_NAMESPACE |
831 awk '/^last_checkpoint_position/ { print $4 }')
833 #define OBD_FAIL_LFSCK_DELAY2 0x1601
834 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
835 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
837 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
838 [ "$STATUS" == "scanning-phase1" ] ||
839 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
841 local O_POS1=$($SHOW_NAMESPACE |
842 awk '/^latest_start_position/ { print $2 }' |
844 local D_POS1=$($SHOW_NAMESPACE |
845 awk '/^latest_start_position/ { print $4 }')
847 echo "Additional debug for 6b"
# When either D_POS value is "N/A" or "0x0" the O_POS values are compared;
# the D_POS comparison below presumably sits in the (missing) else branch.
849 if [ "$D_POS0" == "N/A" -o "$D_POS0" == "0x0" \
850 -o "$D_POS1" == "0x0" -o "$D_POS1" == "N/A" ]; then
851 [[ $O_POS0 -lt $O_POS1 ]] ||
852 error "(7.1) $O_POS1 is not larger than $O_POS0"
854 [[ $D_POS0 -lt $D_POS1 ]] ||
855 error "(7.2) $D_POS1 is not larger than $D_POS0"
858 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
859 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
860 mdd.${MDT_DEV}.lfsck_namespace |
861 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
863 error "(8) unexpected status"
866 run_test 6b "LFSCK resumes from last checkpoint (2)"
# Body of test 7a, registered by the run_test call at the end of this block.
# NOTE(review): the function header, closing braces, the sleep announced by
# the comment below and other short lines are not visible in this excerpt.
# Flow: with fail_loc 0x1601 / fail_val 1 set, start namespace LFSCK with -r
# and expect 'scanning-phase1'; stop the MDS while the scan is in that
# state; clear fail_loc; start the MDS again with MOUNT_OPTS_SCRUB and,
# without issuing another lfsck_start, wait for status 'completed'.
873 #define OBD_FAIL_LFSCK_DELAY2 0x1601
874 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
875 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
877 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
878 [ "$STATUS" == "scanning-phase1" ] ||
879 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
881 # Sleep 3 sec to guarantee at least one object processed by LFSCK
883 echo "stop $SINGLEMDS"
884 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop MDS!"
886 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
887 echo "start $SINGLEMDS"
888 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
889 error "(5) Fail to start MDS!"
891 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
892 mdd.${MDT_DEV}.lfsck_namespace |
893 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
895 error "(6) unexpected status"
898 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
# Body of test 7b, registered by the run_test call at the end of this block.
# NOTE(review): the function header, the loop's "done", closing braces and
# other short lines are not visible in this excerpt.
# Flow: create dummy0..dummy19 while fail_loc 0x1604 is set; with fail_loc
# 0x1602 / fail_val 1 set, start namespace LFSCK with -r and wait for status
# 'scanning-phase2'; stop the MDS; clear fail_loc; start the MDS again with
# MOUNT_OPTS_SCRUB and, without issuing another lfsck_start, wait for status
# 'completed'.
904 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
905 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
906 for ((i = 0; i < 20; i++)); do
907 touch $DIR/$tdir/dummy${i}
910 #define OBD_FAIL_LFSCK_DELAY3 0x1602
911 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
912 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
913 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
914 mdd.${MDT_DEV}.lfsck_namespace |
915 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
917 error "(4) unexpected status"
921 echo "stop $SINGLEMDS"
922 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop MDS!"
924 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
925 echo "start $SINGLEMDS"
926 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
927 error "(6) Fail to start MDS!"
929 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
930 mdd.${MDT_DEV}.lfsck_namespace |
931 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
933 error "(7) unexpected status"
936 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
# Body of test 8, registered by the run_test call at the end of this block.
# NOTE(review): the function header, the setup between formatall and the
# first status check, the "local timer=0"/timer resets, the sleeps inside
# the polling loops, every "done" and closing brace, and other short lines
# are not visible in this excerpt.
# Expected namespace LFSCK status sequence, as asserted below:
#   init -> scanning-phase1 -> stopped -> scanning-phase1 -> failed
#   -> scanning-phase1 -> crashed (after MDS restart) -> scanning-phase1
#   -> paused (after MDS restart) -> paused (after restart with skip_lfsck)
#   -> scanning-phase2 (flags 'scanned-once,inconsistent') -> completed
#   (flags empty).
941 formatall > /dev/null
947 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
948 [ "$STATUS" == "init" ] ||
949 error "(2) Expect 'init', but got '$STATUS'"
# Create one directory under fail_loc 0x1603 and five files under 0x1604 so
# the later scan has inconsistencies to report.
951 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
952 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
953 mkdir $DIR/$tdir/crashed
955 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
956 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
957 for ((i = 0; i < 5; i++)); do
958 touch $DIR/$tdir/dummy${i}
961 umount_client $MOUNT || error "(3) Fail to stop client!"
963 #define OBD_FAIL_LFSCK_DELAY2 0x1601
964 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
965 $START_NAMESPACE || error "(4) Fail to start LFSCK for namespace!"
967 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
968 [ "$STATUS" == "scanning-phase1" ] ||
969 error "(5) Expect 'scanning-phase1', but got '$STATUS'"
971 $STOP_LFSCK || error "(6) Fail to stop LFSCK!"
973 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
974 [ "$STATUS" == "stopped" ] ||
975 error "(7) Expect 'stopped', but got '$STATUS'"
977 $START_NAMESPACE || error "(8) Fail to start LFSCK for namespace!"
979 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
980 [ "$STATUS" == "scanning-phase1" ] ||
981 error "(9) Expect 'scanning-phase1', but got '$STATUS'"
983 #define OBD_FAIL_LFSCK_FATAL2 0x1609
# NOTE(review): the value set here is 0x1609 with the 0x80000000 bit added;
# the meaning of that bit is not shown in this file -- confirm.
984 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
985 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
986 mdd.${MDT_DEV}.lfsck_namespace |
987 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
989 error "(10) unexpected status"
992 #define OBD_FAIL_LFSCK_DELAY1 0x1600
993 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
994 $START_NAMESPACE || error "(11) Fail to start LFSCK for namespace!"
996 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
997 [ "$STATUS" == "scanning-phase1" ] ||
998 error "(12) Expect 'scanning-phase1', but got '$STATUS'"
# Stop the MDS with fail_loc 0x160a set, then restart it with fail_loc
# 0x160b set; the status is expected to read 'crashed' afterwards.
1000 #define OBD_FAIL_LFSCK_CRASH 0x160a
1001 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
1004 echo "stop $SINGLEMDS"
1005 stop $SINGLEMDS > /dev/null || error "(13) Fail to stop MDS!"
1007 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
1008 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
1010 echo "start $SINGLEMDS"
1011 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
1012 error "(14) Fail to start MDS!"
# Poll the MDT recovery_status until it is no longer RECOVERING, counting
# iterations in $timer up to $timeout; reaching $timeout is an error.
1014 local timeout=$(max_recovery_time)
1017 while [ $timer -lt $timeout ]; do
1018 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1019 mdt.${MDT_DEV}.recovery_status |
1020 awk '/^status/ { print \\\$2 }'")
1021 [ "$STATUS" != "RECOVERING" ] && break;
1023 timer=$((timer + 1))
1026 [ $timer != $timeout ] ||
1027 error "(14.1) recovery timeout"
1029 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1030 [ "$STATUS" == "crashed" ] ||
1031 error "(15) Expect 'crashed', but got '$STATUS'"
1033 #define OBD_FAIL_LFSCK_DELAY2 0x1601
1034 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
1035 $START_NAMESPACE || error "(16) Fail to start LFSCK for namespace!"
1037 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1038 [ "$STATUS" == "scanning-phase1" ] ||
1039 error "(17) Expect 'scanning-phase1', but got '$STATUS'"
# Stop the MDS during the scan and restart it with fail_loc 0x160b set; the
# status is expected to read 'paused' afterwards.
1041 echo "stop $SINGLEMDS"
1042 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop MDS!"
1044 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
1045 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
1047 echo "start $SINGLEMDS"
1048 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
1049 error "(19) Fail to start MDS!"
# Same recovery poll as above.
1052 while [ $timer -lt $timeout ]; do
1053 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1054 mdt.${MDT_DEV}.recovery_status |
1055 awk '/^status/ { print \\\$2 }'")
1056 [ "$STATUS" != "RECOVERING" ] && break;
1058 timer=$((timer + 1))
1061 [ $timer != $timeout ] ||
1062 error "(19.1) recovery timeout"
1064 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1065 [ "$STATUS" == "paused" ] ||
1066 error "(20) Expect 'paused', but got '$STATUS'"
# Restart once more, this time with the skip_lfsck mount option; the status
# must still read 'paused'.
1068 echo "stop $SINGLEMDS"
1069 stop $SINGLEMDS > /dev/null || error "(20.1) Fail to stop MDS!"
1071 echo "start $SINGLEMDS without resume LFSCK"
1072 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SKIP_LFSCK > /dev/null ||
1073 error "(20.2) Fail to start MDS!"
# Same recovery poll as above.
1076 while [ $timer -lt $timeout ]; do
1077 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1078 mdt.${MDT_DEV}.recovery_status |
1079 awk '/^status/ { print \\\$2 }'")
1080 [ "$STATUS" != "RECOVERING" ] && break;
1082 timer=$((timer + 1))
1085 [ $timer != $timeout ] ||
1086 error "(20.3) recovery timeout"
1088 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1089 [ "$STATUS" == "paused" ] ||
1090 error "(20.4) Expect 'paused', but got '$STATUS'"
1092 #define OBD_FAIL_LFSCK_DELAY3 0x1602
1093 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
1095 $START_NAMESPACE || error "(21) Fail to start LFSCK for namespace!"
1096 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1097 mdd.${MDT_DEV}.lfsck_namespace |
1098 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
1100 error "(22) unexpected status"
1103 # wait to process one inode at least (OBD_FAIL_LFSCK_DELAY3)
1106 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1107 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
1108 error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
# Clear fail_loc/fail_val and let the scan finish; the flags must end up
# empty.
1110 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1111 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1112 mdd.${MDT_DEV}.lfsck_namespace |
1113 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1115 error "(24) unexpected status"
1118 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1119 [ -z "$FLAGS" ] || error "(25) Expect empty flags, but got '$FLAGS'"
1121 run_test 8 "LFSCK state machine"
# Test 9a, "LFSCK speed control (1)": run a layout LFSCK under a speed limit
# and compare the reported average_speed_phase1 with bounds derived from the
# limit, both before and after the limit is raised.
# NOTE(review): RUN_TIME1, RUN_TIME2 and TIME_DIFF are used below but are not
# assigned in the statements shown here -- confirm where they are set.
# Skip when /proc/cpuinfo has no line matching "processor.*: 1".
1124 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1125 skip "Testing on UP system, the speed may be inaccurate."
# Build the data set: directory "lfsck" (lfs mkdir -i 0), setstripe -c 1 -i -1,
# then 5000 files created with createmany -o.
1129 check_mount_and_prep
1130 $LFS mkdir -i 0 $DIR/$tdir/lfsck || error "(1) Fail to mkdir lfsck"
1131 $LFS setstripe -c 1 -i -1 $DIR/$tdir/lfsck
1132 createmany -o $DIR/$tdir/lfsck/f 5000
# First speed limit, passed to lfsck_start through -s.
1134 local BASE_SPEED1=100
1136 $START_LAYOUT -r -s $BASE_SPEED1 || error "(2) Fail to start LFSCK!"
# The scan is expected to still be in phase 1 when sampled.
1139 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
1140 [ "$STATUS" == "scanning-phase1" ] ||
1141 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
# Sample the average phase-1 speed reported by lfsck_layout.
1143 local SPEED=$($SHOW_LAYOUT |
1144 awk '/^average_speed_phase1/ { print $2 }')
1146 # There may be time error, normally it should be less than 2 seconds.
1147 # We allow another 20% schedule error.
1149 # MAX_MARGIN = 1.3 = 13 / 10
# Upper bound: the limit scaled by (RUN_TIME1 + TIME_DIFF) / RUN_TIME1 and by
# 13/10, evaluated in integer arithmetic.
1150 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1151 RUN_TIME1 * 13 / 10))
1152 [ $SPEED -lt $MAX_SPEED ] || {
1154 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1155 error "(4) Speed $SPEED, expected < $MAX_SPEED"
1158 # adjust speed limit
1159 local BASE_SPEED2=300
# Change the limit of the running scan through the lfsck_speed_limit parameter.
1161 do_facet $SINGLEMDS \
1162 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1165 SPEED=$($SHOW_LAYOUT | awk '/^average_speed_phase1/ { print $2 }')
1166 # MIN_MARGIN = 0.7 = 7 / 10
# Lower bound: time-weighted mix of both limits, with TIME_DIFF subtracted
# from each interval, scaled by 7/10.
1167 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1168 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1169 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
# When mds1 is not ldiskfs the shortfall is reported via error_ignore LU-5624
# (message 5.1); message 5.2 belongs to the other branch.
1170 [ $SPEED -gt $MIN_SPEED ] || {
1171 if [ $mds1_FSTYPE != ldiskfs ]; then
1172 error_ignore LU-5624 \
1173 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
1176 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
1180 # MAX_MARGIN = 1.3 = 13 / 10
1181 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1182 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1183 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1184 [ $SPEED -lt $MAX_SPEED ] || {
1186 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1187 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1188 error "(6) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit to 0 on every MDT node and every OST node, then wait
# for the layout LFSCK to report "completed".
1191 do_nodes $(comma_list $(mdts_nodes)) \
1192 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1193 do_nodes $(comma_list $(osts_nodes)) \
1194 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1196 wait_update_facet $SINGLEMDS \
1197 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout |
1198 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
1199 error "(7) Failed to get expected 'completed'"
1201 run_test 9a "LFSCK speed control (1)"
# Test 9b, "LFSCK speed control (2)": same speed-limit checks as 9a, but for
# the namespace LFSCK and its average_speed_phase2 value.
# NOTE(review): RUN_TIME1, RUN_TIME2 and TIME_DIFF are used below but are not
# assigned in the statements shown here -- confirm where they are set.
# Skip when /proc/cpuinfo has no line matching "processor.*: 1".
1204 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1205 skip "Testing on UP system, the speed may be inaccurate."
# Create 50 directories, 50 files, and 50 files in each directory while
# fail_loc 0x1604 is set on the MDS.
1211 echo "Preparing another 50 * 50 files (with error) at $(date)."
1212 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1213 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1214 createmany -d $DIR/$tdir/d 50
1215 createmany -m $DIR/$tdir/f 50
1216 for ((i = 0; i < 50; i++)); do
1217 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
# With fail_loc 0x160c set, start a namespace LFSCK with -r and wait for the
# status "stopped".
1220 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
1221 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1222 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
1223 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1224 mdd.${MDT_DEV}.lfsck_namespace |
1225 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1227 error "(5) unexpected status"
1230 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1231 echo "Prepared at $(date)."
# First speed limit. The LFSCK is started again without -r and is expected to
# report "scanning-phase2".
1233 local BASE_SPEED1=50
1235 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
1238 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1239 [ "$STATUS" == "scanning-phase2" ] ||
1240 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
1242 local SPEED=$($SHOW_NAMESPACE |
1243 awk '/^average_speed_phase2/ { print $2 }')
1244 # There may be time error, normally it should be less than 2 seconds.
1245 # We allow another 20% schedule error.
1247 # MAX_MARGIN = 1.3 = 13 / 10
1248 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1249 RUN_TIME1 * 13 / 10))
1250 [ $SPEED -lt $MAX_SPEED ] || {
1252 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1253 error "(8) Speed $SPEED, expected < $MAX_SPEED"
1256 # adjust speed limit
1257 local BASE_SPEED2=150
# Change the limit of the running scan through the lfsck_speed_limit parameter.
1259 do_facet $SINGLEMDS \
1260 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1263 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
1264 # MIN_MARGIN = 0.7 = 7 / 10
1265 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1266 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1267 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
# When mds1 is not ldiskfs the shortfall is reported via error_ignore LU-5624
# (message 9.1); message 9.2 belongs to the other branch.
1268 [ $SPEED -gt $MIN_SPEED ] || {
1269 if [ $mds1_FSTYPE != ldiskfs ]; then
1270 error_ignore LU-5624 \
1271 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1274 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
1278 # MAX_MARGIN = 1.3 = 13 / 10
1279 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1280 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1281 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1282 [ $SPEED -lt $MAX_SPEED ] || {
1284 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1285 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1286 error "(10) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit to 0 on every MDT node and every OST node, then wait
# for the namespace LFSCK to report "completed".
1289 do_nodes $(comma_list $(mdts_nodes)) \
1290 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1291 do_nodes $(comma_list $(osts_nodes)) \
1292 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1293 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1294 mdd.${MDT_DEV}.lfsck_namespace |
1295 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1297 error "(11) unexpected status"
1300 run_test 9b "LFSCK speed control (2)"
# Test 10, "System is available during LFSCK scanning": start a rate-limited
# namespace LFSCK and run ordinary client operations while it is still in
# scanning-phase1.
# Runs only when mds1 is ldiskfs.
1304 [[ $mds1_FSTYPE == ldiskfs ]] || skip "lookup(..)/linkea on ZFS issue"
1308 echo "Preparing more files with error at $(date)."
# Even-numbered d<i>/f<i> entries are created with fail_loc 0x1603 set.
1309 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1310 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
1312 for ((i = 0; i < 1000; i = $((i+2)))); do
1313 mkdir -p $DIR/$tdir/d${i}
1314 touch $DIR/$tdir/f${i}
1315 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
# Odd-numbered d<i>/f<i> entries are created with fail_loc 0x1604 set.
1318 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1319 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1321 for ((i = 1; i < 1000; i = $((i+2)))); do
1322 mkdir -p $DIR/$tdir/d${i}
1323 touch $DIR/$tdir/f${i}
1324 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1327 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1328 echo "Prepared at $(date)."
# Add a hard link to f200 (created after fail_loc was cleared).
1330 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
# Remount the client before starting the scan.
1332 umount_client $MOUNT
1333 mount_client $MOUNT || error "(3) Fail to start client!"
# Start the namespace LFSCK with -r and a speed limit of 100 (-s 100).
1335 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1338 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1339 [ "$STATUS" == "scanning-phase1" ] ||
1340 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Client-side operations that must all succeed while the scan is running:
# recursive ls, create, mkdir, unlink, rm -rf, rename, hard link, symlink.
1342 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1344 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1346 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1348 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1350 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1352 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1354 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1356 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1357 error "(14) Fail to softlink!"
# The scan is expected to still be in phase 1 after those operations.
1359 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1360 [ "$STATUS" == "scanning-phase1" ] ||
1361 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
# Set lfsck_speed_limit to 0 on all MDT and OST nodes and wait for "completed".
1363 do_nodes $(comma_list $(mdts_nodes)) \
1364 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1365 do_nodes $(comma_list $(osts_nodes)) \
1366 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1367 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1368 mdd.${MDT_DEV}.lfsck_namespace |
1369 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1371 error "(16) unexpected status"
1374 run_test 10 "System is available during LFSCK scanning"
# Remove O/${idx}/LAST_ID and O/${idx}/d0/0 from the backing filesystem of
# ost${ost}, mounting it locally first and unmounting it afterwards.
# Returns 1 if mount_fstype fails and 2 if unmount_fstype fails.
# NOTE(review): ${ost} and ${idx} are not assigned in the statements shown
# here; presumably they come from $1 and $2 (a caller uses
# "ost_remove_lastid 1 0") -- confirm.
1377 ost_remove_lastid() {
# Command prefix that runs the following command on the OST facet.
1380 local rcmd="do_facet ost${ost}"
1382 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1384 # step 1: local mount
1385 mount_fstype ost${ost} || return 1
1386 # step 2: remove the specified LAST_ID
# The brace expansion names two paths: .../LAST_ID and .../d0/0.
1387 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
1389 unmount_fstype ost${ost} || return 2
# Test 11a, "LFSCK can rebuild lost last_id": remove LAST_ID on ost1, run a
# layout LFSCK on the OST and check that its flags end up empty.
# Skip unless the MDS version is newer than 2.5.55.
1393 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1394 skip "MDS older than 2.5.55, LU-1267"
# Create 64 files with setstripe -c 1 -i 0 applied to the test directory.
1396 check_mount_and_prep
1397 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1398 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
# Remove LAST_ID for "ost_remove_lastid 1 0", then start ost1 with the
# noscrub mount options.
1403 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
1405 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1406 error "(2) Fail to start ost1"
# Set fail_loc 0x160e with fail_val=3 on ost1 before starting the scan.
1408 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1409 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1411 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1412 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
# Wait until the OST-side lfsck_layout flags read "crashed_lastid".
1414 wait_update_facet ost1 "$LCTL get_param -n \
1415 obdfilter.${OST_DEV}.lfsck_layout |
1416 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1418 error "(5) unexpected status"
# Clear the fail_loc/fail_val settings and wait for the scan to complete.
1421 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1423 wait_update_facet ost1 "$LCTL get_param -n \
1424 obdfilter.${OST_DEV}.lfsck_layout |
1425 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1427 error "(6) unexpected status"
# After completion the flags field must be empty.
1430 echo "the LAST_ID(s) should have been rebuilt"
1431 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1432 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
1434 run_test 11a "LFSCK can rebuild lost last_id"
# Test 11b, "LFSCK can rebuild crashed last_id": create objects while ost1
# skips updating its on-disk LAST_ID, confirm the OST's last_id lags behind
# the MDS-side value, then run a layout LFSCK on the OST and check that
# last_id has caught up after a restart.
# Skip unless the MDS version is newer than 2.5.55.
1437 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1438 skip "MDS older than 2.5.55, LU-1267"
1440 check_mount_and_prep
1441 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1443 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1444 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1445 do_facet ost1 $LCTL set_param fail_loc=0x160d
# Create 32 files more than precreated_ost_obj_count reports.
1447 local count=$(precreated_ost_obj_count 0 0)
1449 createmany -o $DIR/$tdir/f $((count + 32))
# Record prealloc_last_seq and prealloc_last_id from the osp device on mds1.
1451 local proc_path="${FSNAME}-OST0000-osc-MDT0000"
1452 local seq=$(do_facet mds1 $LCTL get_param -n \
1453 osp.${proc_path}.prealloc_last_seq)
1454 local id_used=$(do_facet mds1 $LCTL get_param -n \
1455 osp.${proc_path}.prealloc_last_id)
1457 umount_client $MOUNT
1458 stop ost1 || error "(1) Fail to stop ost1"
# Set fail_loc 0x215 on ost1 before it is started again.
1460 #define OBD_FAIL_OST_ENOSPC 0x215
1461 do_facet ost1 $LCTL set_param fail_loc=0x215
1463 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1464 error "(2) Fail to start ost1"
# Poll up to 60 times for a last_id entry that matches $seq; the value after
# the ':' separator is stored in id_ost1.
1466 for ((i = 0; i < 60; i++)); do
1467 id_ost1=$(do_facet ost1 \
1468 "$LCTL get_param -n obdfilter.$ost1_svc.last_id" |
1469 awk -F: "/$seq/ { print \$2 }")
1470 [ -n "$id_ost1" ] && break
1474 echo "the on-disk LAST_ID should be smaller than the expected one"
1475 [ $id_used -gt $id_ost1 ] ||
1476 error "(4) expect id_used '$id_used' > id_ost1 '$id_ost1'"
1478 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1479 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1481 wait_update_facet ost1 \
1482 "$LCTL get_param -n obdfilter.$ost1_svc.lfsck_layout |
1483 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1485 error "(6) unexpected status"
# Restart ost1 and read last_id again.
1488 stop ost1 || error "(7) Fail to stop ost1"
1490 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1491 error "(8) Fail to start ost1"
1493 echo "the on-disk LAST_ID should have been rebuilt"
1494 # last_id may be larger than $id_used if objects were created/skipped
# On failure, print the raw last_id parameter before raising the error.
1495 wait_update_facet_cond ost1 \
1496 "$LCTL get_param -n obdfilter.$ost1_svc.last_id |
1497 awk -F: '/$seq/ { print \\\$2 }'" "-ge" "$id_used" 60 || {
1498 do_facet ost1 $LCTL get_param obdfilter.$ost1_svc.last_id
1499 error "(9) expect last_id >= id_used $seq:$id_used"
# Clear fail_loc and stop everything.
1502 do_facet ost1 $LCTL set_param fail_loc=0
1503 stopall || error "(10) Fail to stopall"
1505 run_test 11b "LFSCK can rebuild crashed last_id"
# Test 12a, "single command to trigger LFSCK on all devices": start, stop and
# restart namespace and layout LFSCK on all targets with lctl's -A option.
# NOTE(review): the trailing number passed to wait_all_targets and
# wait_all_targets_blocked looks like the step id for failure messages --
# confirm against those helpers.
# Requires at least 2 MDTs and an MDS newer than 2.5.55.
1508 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
1509 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1510 skip "MDS older than 2.5.55, LU-3950"
# One directory per MDT (lfs mkdir -i k-1), each holding 100 files.
1512 check_mount_and_prep
1513 for k in $(seq $MDSCOUNT); do
1514 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1515 createmany -o $DIR/$tdir/${k}/f 100 ||
1516 error "(0) Fail to create 100 files."
# Namespace LFSCK: start with -s 1, stop, then restart with -s 0.
1519 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1520 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1521 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1523 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1524 wait_all_targets namespace scanning-phase1 3
1526 echo "Stop namespace LFSCK on all targets by single lctl command."
1527 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1528 error "(4) Fail to stop LFSCK on all devices!"
1530 echo "All the LFSCK targets should be in 'stopped' status."
1531 wait_all_targets_blocked namespace stopped 5
1533 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1534 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1535 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1537 echo "All the LFSCK targets should be in 'completed' status."
1538 wait_all_targets_blocked namespace completed 7
# The layout part runs between start_full_debug_logging and
# stop_full_debug_logging.
1540 start_full_debug_logging
# Layout LFSCK: the same start / stop / restart sequence.
1542 echo "Start layout LFSCK on all targets by single command (-s 1)."
1543 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1544 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1546 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1547 wait_all_targets layout scanning-phase1 9
1549 echo "Stop layout LFSCK on all targets by single lctl command."
1550 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1551 error "(10) Fail to stop LFSCK on all devices!"
1553 echo "All the LFSCK targets should be in 'stopped' status."
1554 wait_all_targets_blocked layout stopped 11
# Check the lfsck_layout status on each OST directly.
1556 for k in $(seq $OSTCOUNT); do
1557 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1558 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1559 awk '/^status/ { print $2 }')
1560 [ "$STATUS" == "stopped" ] ||
1561 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1564 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1565 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1566 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1568 echo "All the LFSCK targets should be in 'completed' status."
1569 wait_all_targets_blocked layout completed 14
1571 stop_full_debug_logging
1573 run_test 12a "single command to trigger LFSCK on all devices"
# Test 12b, "auto detect Lustre device": lfsck_start must work with -A but no
# -M, and must fail without -M/-A when the node lists more than one "mdt"
# device.
# Skip unless the MDS version is newer than 2.5.55.
1576 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1577 skip "MDS older than 2.5.55, LU-3950"
1579 check_mount_and_prep
1581 echo "Start LFSCK without '-M' specified."
1582 do_facet mds1 $LCTL lfsck_start -A -r ||
1583 error "(0) Fail to start LFSCK without '-M'"
# Both namespace and layout LFSCK are expected to reach "completed".
1585 wait_all_targets_blocked namespace completed 1
1586 wait_all_targets_blocked layout completed 2
# Count the "lctl dl" lines on mds1 whose third column contains "mdt".
1588 local count=$(do_facet mds1 $LCTL dl |
1589 awk '{ print $3 }' | grep mdt | wc -l)
1590 if [ $count -gt 1 ]; then
1592 echo "Start layout LFSCK on the node with multipe targets,"
1593 echo "but not specify '-M'/'-A' option. Should get failure."
# error is raised only if the start succeeds; "|| true" keeps the expected
# failure from leaving a non-zero status.
1595 do_facet mds1 $LCTL lfsck_start -t layout -r &&
1596 error "(3) Start layout LFSCK should fail" || true
1599 run_test 12b "auto detect Lustre device"
# Test 13, "LFSCK can repair crashed lmm_oi": create files while fail_loc
# 0x160f is set, run a layout LFSCK and expect repaired_others to be 2.
# Skip unless the MDS version is newer than 2.5.55.
1602 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1603 skip "MDS older than 2.5.55, LU-3593"
1606 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1607 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1608 echo "MDT-object FID."
1611 check_mount_and_prep
1613 echo "Inject failure stub to simulate bad lmm_oi"
# Two files are created under the fail_loc: one through createmany and one
# composite-layout file f1 through lfs setstripe -E.
1614 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1615 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
1616 createmany -o $DIR/$tdir/f 1
1617 $LFS setstripe -E 1M -S 1M -E -1 $DIR/$tdir/f1 ||
1618 error "(0) Fail to create PFL $DIR/$tdir/f1"
1619 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1621 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1622 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1624 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1625 mdd.${MDT_DEV}.lfsck_layout |
1626 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1628 error "(2) unexpected status"
# Exactly two repairs are expected in the repaired_others counter.
1631 local repaired=$($SHOW_LAYOUT |
1632 awk '/^repaired_others/ { print $2 }')
1633 [ $repaired -eq 2 ] ||
1634 error "(3) Fail to repair crashed lmm_oi: $repaired"
1636 run_test 13 "LFSCK can repair crashed lmm_oi"
# Test 14a, "LFSCK can repair MDT-object with dangling LOV EA reference (1)":
# create files while ost1 has fail_loc 0x1610 set, check that a plain layout
# LFSCK counts the dangling references but leaves the files unusable, then
# rerun it with -c and check that the guard files become stat-able.
# Skip unless the MDS version is newer than 2.5.55.
1639 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1640 skip "MDS older than 2.5.55, LU-3590"
1643 echo "The OST-object referenced by the MDT-object should be there;"
1644 echo "otherwise, the LFSCK should re-create the missing OST-object."
1645 echo "without '--delay-create-ostobj' option."
1648 check_mount_and_prep
1649 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1651 echo "Inject failure stub to simulate dangling referenced MDT-object"
1652 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1653 do_facet ost1 $LCTL set_param fail_loc=0x1610
# Create 16 files more than precreated_ost_obj_count reports, then guard0.
1654 local count=$(precreated_ost_obj_count 0 0)
1656 createmany -o $DIR/$tdir/f $((count + 16)) ||
1657 error "(0.1) Fail to create $DIR/$tdir/fx"
1658 touch $DIR/$tdir/guard0
# Add 16 composite-layout files (setstripe -E), then guard1.
1660 for ((i = 0; i < 16; i++)); do
1661 $LFS setstripe -E 512K -S 256K -o 0 -E 2M \
1662 $DIR/$tdir/f_comp${i} ||
1663 error "(0.2) Fail to create $DIR/$tdir/f_comp${i}"
1665 touch $DIR/$tdir/guard1
1667 do_facet ost1 $LCTL set_param fail_loc=0
1669 start_full_debug_logging
1671 # exhaust other pre-created dangling cases
1672 count=$(precreated_ost_obj_count 0 0)
1673 createmany -o $DIR/$tdir/a $count ||
1674 error "(0.5) Fail to create $count files."
1676 echo "'ls' should fail because of dangling referenced MDT-object"
1677 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
# First pass: $START_LAYOUT -r only.
1679 echo "Trigger layout LFSCK to find out dangling reference"
1680 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1682 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1683 mdd.${MDT_DEV}.lfsck_layout |
1684 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1686 error "(3) unexpected status"
# At least 32 dangling references must be counted.
1689 local repaired=$($SHOW_LAYOUT |
1690 awk '/^repaired_dangling/ { print $2 }')
1691 [ $repaired -ge 32 ] ||
1692 error "(4) Fail to repair dangling reference: $repaired"
1694 echo "'stat' should fail because of not repair dangling by default"
1695 stat $DIR/$tdir/guard0 > /dev/null 2>&1 &&
1696 error "(5.1) stat should fail"
1697 stat $DIR/$tdir/guard1 > /dev/null 2>&1 &&
1698 error "(5.2) stat should fail"
# Second pass: the same command with -c added.
1700 echo "Trigger layout LFSCK to repair dangling reference"
1701 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1703 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1704 mdd.${MDT_DEV}.lfsck_layout |
1705 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1707 error "(7) unexpected status"
1710 # There may be some async LFSCK updates in processing, wait for
1711 # a while until the target reparation has been done. LU-4970.
# Poll stat on each guard file until the Size field reads 0; on failure,
# print the full stat output before raising the error.
1713 echo "'stat' should success after layout LFSCK repairing"
1714 wait_update_facet client "stat $DIR/$tdir/guard0 |
1715 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1716 stat $DIR/$tdir/guard0
1718 error "(8.1) unexpected size"
1721 wait_update_facet client "stat $DIR/$tdir/guard1 |
1722 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1723 stat $DIR/$tdir/guard1
1725 error "(8.2) unexpected size"
1728 repaired=$($SHOW_LAYOUT |
1729 awk '/^repaired_dangling/ { print $2 }')
1730 [ $repaired -ge 32 ] ||
1731 error "(9) Fail to repair dangling reference: $repaired"
1733 stop_full_debug_logging
1735 echo "stopall to cleanup object cache"
1738 setupall > /dev/null
1740 run_test 14a "LFSCK can repair MDT-object with dangling LOV EA reference (1)"
# Test 14b, "LFSCK can repair MDT-object with dangling LOV EA reference (2)":
# same scenario as 14a with a single guard file, starting the layout LFSCK
# with the extra options -o -d and waiting through wait_all_targets_blocked.
# Skip unless the MDS version is newer than 2.5.55.
1743 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1744 skip "MDS older than 2.5.55, LU-3590"
1747 echo "The OST-object referenced by the MDT-object should be there;"
1748 echo "otherwise, the LFSCK should re-create the missing OST-object."
1749 echo "with '--delay-create-ostobj' option."
1752 check_mount_and_prep
1753 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1755 echo "Inject failure stub to simulate dangling referenced MDT-object"
1756 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1757 do_facet ost1 $LCTL set_param fail_loc=0x1610
# Create 31 files more than precreated_ost_obj_count reports, plus the guard
# file, while the fail_loc is set.
1758 local count=$(precreated_ost_obj_count 0 0)
1760 createmany -o $DIR/$tdir/f $((count + 31))
1761 touch $DIR/$tdir/guard
1762 do_facet ost1 $LCTL set_param fail_loc=0
1764 start_full_debug_logging
1766 # exhaust other pre-created dangling cases
1767 count=$(precreated_ost_obj_count 0 0)
1768 createmany -o $DIR/$tdir/a $count ||
1769 error "(0) Fail to create $count files."
1771 echo "'ls' should fail because of dangling referenced MDT-object"
1772 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
# First pass: without -c.
1774 echo "Trigger layout LFSCK to find out dangling reference"
1775 $START_LAYOUT -r -o -d || error "(2) Fail to start LFSCK for layout!"
1777 wait_all_targets_blocked layout completed 3
# At least 32 dangling references must be counted.
1779 local repaired=$($SHOW_LAYOUT |
1780 awk '/^repaired_dangling/ { print $2 }')
1781 [ $repaired -ge 32 ] ||
1782 error "(4) Fail to repair dangling reference: $repaired"
1784 echo "'stat' should fail because of not repair dangling by default"
1785 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
# Second pass: the same command with -c added.
1787 echo "Trigger layout LFSCK to repair dangling reference"
1788 $START_LAYOUT -r -o -c -d || error "(6) Fail to start LFSCK for layout!"
1790 wait_all_targets_blocked layout completed 7
1792 # There may be some async LFSCK updates in processing, wait for
1793 # a while until the target reparation has been done. LU-4970.
# Poll stat on the guard file until the Size field reads 0; on failure, print
# the full stat output before raising the error.
1795 echo "'stat' should success after layout LFSCK repairing"
1796 wait_update_facet client "stat $DIR/$tdir/guard |
1797 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1798 stat $DIR/$tdir/guard
1800 error "(8) unexpected size"
1803 repaired=$($SHOW_LAYOUT |
1804 awk '/^repaired_dangling/ { print $2 }')
1805 [ $repaired -ge 32 ] ||
1806 error "(9) Fail to repair dangling reference: $repaired"
1808 stop_full_debug_logging
1810 echo "stopall to cleanup object cache"
1813 setupall > /dev/null
1815 run_test 14b "LFSCK can repair MDT-object with dangling LOV EA reference (2)"
# Test 15a, "LFSCK can repair unmatched MDT-object/OST-object pairs (1)":
# write files while every OST node has fail_loc 0x1611 set, run a layout
# LFSCK and expect at least 3 repaired_unmatched_pair entries.
# Skip unless the MDS version is newer than 2.5.55.
1818 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1819 skip "MDS older than 2.5.55, LU-3591"
1822 echo "If the OST-object referenced by the MDT-object back points"
1823 echo "to some non-exist MDT-object, then the LFSCK should repair"
1824 echo "the OST-object to back point to the right MDT-object."
1827 check_mount_and_prep
1828 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1830 echo "Inject failure stub to make the OST-object to back point to"
1831 echo "non-exist MDT-object."
1832 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
# The fail_loc is set on all OST nodes, not just ost1.
1834 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
1835 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
# NOTE(review): the target path of this setstripe command is not among the
# statements shown here; the error text names $DIR/$tdir/f1.
1836 $LFS setstripe -E 1M -S 256K -c 1 -E -1 -S 512K -c $OSTCOUNT \
1838 error "(0) Fail to create PFL $DIR/$tdir/f1"
1839 # 'dd' will trigger punch RPC firstly on every OST-objects.
1840 # So even though some OST-object will not be write by 'dd',
1841 # as long as it is allocated (may be NOT allocated in pfl_3b)
1842 # its layout information will be set also.
1843 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4K count=257
1844 cancel_lru_locks osc
1845 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1847 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1848 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1850 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1851 mdd.${MDT_DEV}.lfsck_layout |
1852 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1854 error "(2) unexpected status"
# The check is a lower bound (-ge 3), not an exact count.
1857 local repaired=$($SHOW_LAYOUT |
1858 awk '/^repaired_unmatched_pair/ { print $2 }')
1859 [ $repaired -ge 3 ] ||
1860 error "(3) Fail to repair unmatched pair: $repaired"
1862 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# Test 15b, "LFSCK can repair unmatched MDT-object/OST-object pairs (2)":
# write files while every OST node has fail_loc 0x1612 set, run a layout
# LFSCK and expect exactly 4 repaired_unmatched_pair entries.
# Skip unless the MDS version is newer than 2.5.55.
1865 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1866 skip "MDS older than 2.5.55, LU-3591"
1869 echo "If the OST-object referenced by the MDT-object back points"
1870 echo "to other MDT-object that doesn't recognize the OST-object,"
1871 echo "then the LFSCK should repair it to back point to the right"
1872 echo "MDT-object (the first one)."
# The guard file is written before the fail_loc is set.
1875 check_mount_and_prep
1876 mkdir -p $DIR/$tdir/0
1877 $LFS setstripe -c 1 -i 0 $DIR/$tdir/0
1878 dd if=/dev/zero of=$DIR/$tdir/0/guard bs=1M count=1
1879 cancel_lru_locks osc
1881 echo "Inject failure stub to make the OST-object to back point to"
1882 echo "other MDT-object"
# Use 2 stripes when at least 2 OSTs exist.
# NOTE(review): the initial value of "stripes" is not among the statements
# shown here -- confirm its default.
1885 [ $OSTCOUNT -ge 2 ] && stripes=2
1887 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1888 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1612
1889 dd if=/dev/zero of=$DIR/$tdir/0/f0 bs=1M count=1
# NOTE(review): the target path of this setstripe command is not among the
# statements shown here; the error text names $DIR/$tdir/f1.
1890 $LFS setstripe -E 1M -S 256K -c $stripes -E 2M -S 512K -c 1 \
1892 error "(0) Fail to create PFL $DIR/$tdir/f1"
1893 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2
1894 cancel_lru_locks osc
1895 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1897 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1898 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1900 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1901 mdd.${MDT_DEV}.lfsck_layout |
1902 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1904 error "(2) unexpected status"
# The check is an exact count (-eq 4).
1907 local repaired=$($SHOW_LAYOUT |
1908 awk '/^repaired_unmatched_pair/ { print $2 }')
1909 [ $repaired -eq 4 ] ||
1910 error "(3) Fail to repair unmatched pair: $repaired"
1912 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# Test 15c, "LFSCK can repair unmatched MDT-object/OST-object pairs (3)":
# race a layout LFSCK against a delayed directory migration and check that
# one unmatched pair and no multiple-referenced object is repaired.
# Requires at least 2 MDTs and an MDS version newer than 2.5.55 and older
# than 2.7.55.
1915 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
1916 (( $MDS1_VERSION < $(version_code 2.7.55) )) ||
1917 skip "MDS newer than 2.7.55, LU-6475"
1918 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1919 skip "MDS older than 2.5.55, LU-3591"
1922 echo "According to current metadata migration implementation,"
1923 echo "before the old MDT-object is removed, both the new MDT-object"
1924 echo "and old MDT-object will reference the same LOV layout. Then if"
1925 echo "the layout LFSCK finds the new MDT-object by race, it will"
1926 echo "regard related OST-object(s) as multiple referenced case, and"
1927 echo "will try to create new OST-object(s) for the new MDT-object."
1928 echo "To avoid such trouble, the layout LFSCK needs to lock the old"
1929 echo "MDT-object before confirm the multiple referenced case."
# Directory a1 is created with lfs mkdir -i 1 and gets a 1 MiB file.
1932 check_mount_and_prep
1933 $LFS mkdir -i 1 $DIR/$tdir/a1
1934 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1935 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=1
1936 cancel_lru_locks osc
1938 echo "Inject failure stub on MDT1 to delay the migration"
1940 #define OBD_FAIL_MIGRATE_DELAY 0x1803
1941 do_facet mds2 $LCTL set_param fail_val=5 fail_loc=0x1803
# The migration runs in the background so that the LFSCK can start meanwhile.
1942 echo "Migrate $DIR/$tdir/a1 from MDT1 to MDT0 with delay"
1943 $LFS migrate -m 0 $DIR/$tdir/a1 &
1946 echo "Trigger layout LFSCK to race with the migration"
1947 $START_LAYOUT -A -r || error "(1) Fail to start layout LFSCK!"
1949 wait_all_targets_blocked layout completed 2
1951 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
# Expected counters: repaired_unmatched_pair is 1 and
# repaired_multiple_referenced is 0.
1952 local repaired=$($SHOW_LAYOUT |
1953 awk '/^repaired_unmatched_pair/ { print $2 }')
1954 [ $repaired -eq 1 ] ||
1955 error "(3) Fail to repair unmatched pair: $repaired"
1957 repaired=$($SHOW_LAYOUT |
1958 awk '/^repaired_multiple_referenced/ { print $2 }')
1959 [ $repaired -eq 0 ] ||
1960 error "(4) Unexpectedly repaird multiple references: $repaired"
1962 run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)"
# Test 16, "LFSCK can repair inconsistent MDT-object/OST-object owner": chown
# a file while fail_loc 0x1613 is set on the MDS, run a layout LFSCK and
# expect exactly one repaired_inconsistent_owner entry.
# Skip unless the MDS version is newer than 2.5.55.
1965 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1966 skip "MDS older than 2.5.55, LU-3594"
1969 echo "If the OST-object's owner information does not match the owner"
1970 echo "information stored in the MDT-object, then the LFSCK trust the"
1971 echo "MDT-object and update the OST-object's owner information."
1974 check_mount_and_prep
1975 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1976 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1977 cancel_lru_locks osc
1979 # created but no setattr or write to the file.
# Create 100 files under d1 as the $RUNAS user, after d1 is chowned to
# $RUNAS_ID:$RUNAS_GID.
# NOTE(review): the creation of d1 is not among the statements shown here.
1981 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/d1
1982 $RUNAS createmany -o $DIR/$tdir/d1/o 100 || error "create failed"
1984 echo "Inject failure stub to skip OST-object owner changing"
# Only the chown of f0 happens while the fail_loc is set.
1985 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
1986 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
1987 chown 1.1 $DIR/$tdir/f0
1988 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1990 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
1993 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1995 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1996 mdd.${MDT_DEV}.lfsck_layout |
1997 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1999 error "(2) unexpected status"
# The check is an exact count (-eq 1).
2002 local repaired=$($SHOW_LAYOUT |
2003 awk '/^repaired_inconsistent_owner/ { print $2 }')
2004 [ $repaired -eq 1 ] ||
2005 error "(3) Fail to repair inconsistent owner: $repaired"
2007 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# Test 17, "LFSCK can repair multiple references": create files while
# fail_loc 0x1614 is set so that f0 and f1 report guard's size, run a layout
# LFSCK, then check that writing f0/f1 no longer changes guard's size.
# Skip unless the MDS version is newer than 2.5.55.
2010 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2011 skip "MDS older than 2.5.55, LU-3594"
2014 echo "If more than one MDT-objects reference the same OST-object,"
2015 echo "and the OST-object only recognizes one MDT-object, then the"
2016 echo "LFSCK should create new OST-objects for such non-recognized"
2020 check_mount_and_prep
2021 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2023 echo "Inject failure stub to make two MDT-objects to refernce"
2024 echo "the OST-object"
2026 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
2027 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
# guard gets 1 MiB of data; mdc and osc locks are cancelled after each step.
2028 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
2029 cancel_lru_locks mdc
2030 cancel_lru_locks osc
2032 createmany -o $DIR/$tdir/f 1
2033 cancel_lru_locks mdc
2034 cancel_lru_locks osc
# NOTE(review): the target path of this setstripe command is not among the
# statements shown here; the error text names $DIR/$tdir/f1.
2036 $LFS setstripe -E 2M -S 256K -o 0 -E 4M -S 512K -o 0 \
2038 error "(0) Fail to create PFL $DIR/$tdir/f1"
2039 cancel_lru_locks mdc
2040 cancel_lru_locks osc
2041 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# Before the repair, f0 and f1 are both expected to report 1048576 bytes,
# although only guard was written.
2043 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
2044 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard use the same OST-objects"
2045 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
2046 [ $size -eq 1048576 ] ||
2047 error "(1.1) f0 (wrong) size should be 1048576, but got $size"
2049 size=$(ls -l $DIR/$tdir/f1 | awk '{ print $5 }')
2050 [ $size -eq 1048576 ] ||
2051 error "(1.2) f1 (wrong) size should be 1048576, but got $size"
2053 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
2056 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
2058 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2059 mdd.${MDT_DEV}.lfsck_layout |
2060 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2062 error "(3) unexpected status"
# The check is an exact count (-eq 2).
2065 local repaired=$($SHOW_LAYOUT |
2066 awk '/^repaired_multiple_referenced/ { print $2 }')
2067 [ $repaired -eq 2 ] ||
2068 error "(4) Fail to repair multiple references: $repaired"
# After the repair, writing 2 MiB to f0 and then to f1 must leave guard at
# 1048576 bytes.
2070 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
2071 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
2072 error "(5) Fail to write f0."
2073 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
2074 [ $size -eq 1048576 ] ||
2075 error "(6) guard size should be 1048576, but got $size"
2077 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard should use diff OST-objects"
2078 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2 ||
2079 error "(7) Fail to write f1."
2080 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
2081 [ $size -eq 1048576 ] ||
2082 error "(8) guard size should be 1048576, but got $size"
2084 run_test 17 "LFSCK can repair multiple references"
# Add the 'cache' flag to the local debug setting before the test 18 series;
# the command's output is discarded.
2086 $LCTL set_param debug=+cache > /dev/null
# Test 18a: the MDT-objects still exist but their layout EA is lost or partly
# lost; the layout LFSCK must regenerate the missing layout EA entries.
# Skipped unless the MDS1 version is newer than 2.5.55 (LU-3336).
2089 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2090 skip "MDS older than 2.5.55, LU-3336"
2093 echo "The target MDT-object is there, but related stripe information"
2094 echo "is lost or partly lost. The LFSCK should regenerate the missing"
2095 echo "layout EA entries."
# a1/f1: 2 MiB file in a directory on MDT index 0, one stripe at OST index 0.
# Its size (field 6 of 'ls -il') is saved for the later comparisons.
2098 check_mount_and_prep
2099 $LFS mkdir -i 0 $DIR/$tdir/a1
2100 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2101 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2103 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2105 $LFS path2fid $DIR/$tdir/a1/f1
2106 $LFS getstripe $DIR/$tdir/a1/f1
# With at least two MDTs: a2/f2 is a 2 MiB file in a directory on MDT index 1,
# two stripes of 1 MiB starting at OST index 1.
2108 if [ $MDSCOUNT -ge 2 ]; then
2109 $LFS mkdir -i 1 $DIR/$tdir/a2
2110 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2111 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2112 $LFS path2fid $DIR/$tdir/a2/f2
2113 $LFS getstripe $DIR/$tdir/a2/f2
# f3: two-component PFL file, also filled with 2 MiB; its size is saved too.
2116 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2117 error "(0) Fail to create PFL $DIR/$tdir/f3"
2119 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2121 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2123 $LFS path2fid $DIR/$tdir/f3
2124 $LFS getstripe $DIR/$tdir/f3
2126 cancel_lru_locks osc
2128 echo "Inject failure, to make the MDT-object lost its layout EA"
2129 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
# chown the files while fail_loc 0x1615 is set on mds1 (and on mds2 when it
# exists); the chown is what triggers the injected layout EA loss.
2130 do_facet mds1 $LCTL set_param fail_loc=0x1615
2131 chown 1.1 $DIR/$tdir/a1/f1
2133 if [ $MDSCOUNT -ge 2 ]; then
2134 do_facet mds2 $LCTL set_param fail_loc=0x1615
2135 chown 1.1 $DIR/$tdir/a2/f2
2138 chown 1.1 $DIR/$tdir/f3
# Clear the fail_loc again and drop the cached mdc/osc locks.
2143 do_facet mds1 $LCTL set_param fail_loc=0
2144 if [ $MDSCOUNT -ge 2 ]; then
2145 do_facet mds2 $LCTL set_param fail_loc=0
2148 cancel_lru_locks mdc
2149 cancel_lru_locks osc
2151 echo "The file size should be incorrect since layout EA is lost"
2152 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2153 [ "$cur_size" != "$saved_size1" ] ||
2154 error "(1) Expect incorrect file1 size"
# f2 is compared with saved_size1 (f1's size): both files were written with
# the same dd parameters, so no separate size was saved for f2.
2156 if [ $MDSCOUNT -ge 2 ]; then
2157 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2158 [ "$cur_size" != "$saved_size1" ] ||
2159 error "(2) Expect incorrect file2 size"
2162 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2163 [ "$cur_size" != "$saved_size2" ] ||
2164 error "(1.2) Expect incorrect file3 size"
2166 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2167 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
2169 for k in $(seq $MDSCOUNT); do
2170 # The LFSCK status query internal is 30 seconds. For the case
2171 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2172 # time to guarantee the status sync up.
2173 wait_update_facet mds${k} "$LCTL get_param -n \
2174 mdd.$(facet_svc mds${k}).lfsck_layout |
2175 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2176 error "(4) MDS${k} is not the expected 'completed'"
# Every OST is sampled once (no waiting) and must already be 'completed'.
2179 for k in $(seq $OSTCOUNT); do
2180 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2181 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2182 awk '/^status/ { print $2 }')
2183 [ "$cur_status" == "completed" ] ||
2184 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# mds1 must report 3 repaired orphans and mds2, when present, 2.
2187 local repaired=$(do_facet mds1 $LCTL get_param -n \
2188 mdd.$(facet_svc mds1).lfsck_layout |
2189 awk '/^repaired_orphan/ { print $2 }')
2190 [ $repaired -eq 3 ] ||
2191 error "(6.1) Expect 3 fixed on mds1, but got: $repaired"
2193 if [ $MDSCOUNT -ge 2 ]; then
2194 repaired=$(do_facet mds2 $LCTL get_param -n \
2195 mdd.$(facet_svc mds2).lfsck_layout |
2196 awk '/^repaired_orphan/ { print $2 }')
2197 [ $repaired -eq 2 ] ||
2198 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
# Print FID and layout of every file again for the test log.
2201 $LFS path2fid $DIR/$tdir/a1/f1
2202 $LFS getstripe $DIR/$tdir/a1/f1
2204 if [ $MDSCOUNT -ge 2 ]; then
2205 $LFS path2fid $DIR/$tdir/a2/f2
2206 $LFS getstripe $DIR/$tdir/a2/f2
2209 $LFS path2fid $DIR/$tdir/f3
2210 $LFS getstripe $DIR/$tdir/f3
2212 echo "The file size should be correct after layout LFSCK scanning"
2213 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2214 [ "$cur_size" == "$saved_size1" ] ||
2215 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2217 if [ $MDSCOUNT -ge 2 ]; then
2218 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2219 [ "$cur_size" == "$saved_size1" ] ||
2220 error "(8) Expect file2 size $saved_size1, but got $cur_size"
2223 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2224 [ "$cur_size" == "$saved_size2" ] ||
2225 error "(9) Expect file3 size $saved_size2, but got $cur_size"
2227 run_test 18a "Find out orphan OST-object and repair it (1)"
# Test 18b: the MDT-objects themselves are lost. The layout LFSCK must
# re-create them under .lustre/lost+found/MDTxxxx, from where they can be
# moved back into the namespace by hand.
# Skipped when FILESET is set or the MDS1 version is not newer than 2.5.55.
2230 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2231 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2232 skip "MDS older than 2.5.55, LU-3336"
2235 echo "The target MDT-object is lost. The LFSCK should re-create the"
2236 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
2237 echo "can move it back to normal namespace manually."
# a1/f1: 2 MiB file in a directory on MDT index 0, one stripe at OST index 0.
# Size and FID are saved; the FID later names the entry in lost+found.
2240 check_mount_and_prep
2241 $LFS mkdir -i 0 $DIR/$tdir/a1
2242 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2243 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2244 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2245 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2247 $LFS getstripe $DIR/$tdir/a1/f1
# With at least two MDTs: a2/f2 in a directory on MDT index 1, two stripes.
2249 if [ $MDSCOUNT -ge 2 ]; then
2250 $LFS mkdir -i 1 $DIR/$tdir/a2
2251 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2252 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2253 local fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
2255 $LFS getstripe $DIR/$tdir/a2/f2
# f3: two-component PFL file, 2 MiB of data.
2258 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2259 error "(0) Fail to create PFL $DIR/$tdir/f3"
2261 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2263 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2264 local fid3=$($LFS path2fid $DIR/$tdir/f3)
2266 $LFS getstripe $DIR/$tdir/f3
2268 cancel_lru_locks osc
2270 echo "Inject failure, to simulate the case of missing the MDT-object"
2271 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
# Remove the files while fail_loc 0x1616 is set on the MDT(s).
2272 do_facet mds1 $LCTL set_param fail_loc=0x1616
2273 rm -f $DIR/$tdir/a1/f1
2275 if [ $MDSCOUNT -ge 2 ]; then
2276 do_facet mds2 $LCTL set_param fail_loc=0x1616
2277 rm -f $DIR/$tdir/a2/f2
# Clear the fail_loc again and drop the cached mdc/osc locks.
2285 do_facet mds1 $LCTL set_param fail_loc=0
2286 if [ $MDSCOUNT -ge 2 ]; then
2287 do_facet mds2 $LCTL set_param fail_loc=0
2290 cancel_lru_locks mdc
2291 cancel_lru_locks osc
2293 # dryrun mode only checks for orphans, it does not repair them
2294 echo "Trigger layout LFSCK --dryrun to find out orphan OST-object"
2295 $START_LAYOUT --dryrun -o -r ||
2296 error "Fail to start layout LFSCK in dryrun mode"
2297 wait_all_targets_blocked layout completed 2
# The 'param' line of the LFSCK status must list exactly these flags.
2299 local PARAMS=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
2300 [ "$PARAMS" == "dryrun,all_targets,orphan" ] ||
2301 error "Expect 'dryrun,all_targets,orphan', got '$PARAMS'"
# The dry run must count 3 inconsistent orphans on mds1 ...
2303 local orphans=$(do_facet mds1 $LCTL get_param -n \
2304 mdd.$(facet_svc mds1).lfsck_layout |
2305 awk '/^inconsistent_orphan/ { print $2 }')
2306 [ $orphans -eq 3 ] ||
2307 error "Expect 3 found on mds1, but got: $orphans"
# ... while every directory under lost+found stays empty.
2309 # orphan parents should not be created
2311 for subdir in $MOUNT/.lustre/lost+found/*; do
2312 [ ! "$(ls -A $subdir)" ] || error "$subdir not empty"
# Now run the LFSCK for real and wait for all MDTs to complete.
2315 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2316 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2318 for k in $(seq $MDSCOUNT); do
2319 # The LFSCK status query internal is 30 seconds. For the case
2320 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2321 # time to guarantee the status sync up.
2322 wait_update_facet mds${k} "$LCTL get_param -n \
2323 mdd.$(facet_svc mds${k}).lfsck_layout |
2324 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2325 error "(2) MDS${k} is not the expected 'completed'"
# Every OST is sampled once (no waiting) and must already be 'completed'.
2328 for k in $(seq $OSTCOUNT); do
2329 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2330 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2331 awk '/^status/ { print $2 }')
2332 [ "$cur_status" == "completed" ] ||
2333 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# mds1 must report 3 repaired orphans and mds2, when present, 2.
2336 local repaired=$(do_facet mds1 $LCTL get_param -n \
2337 mdd.$(facet_svc mds1).lfsck_layout |
2338 awk '/^repaired_orphan/ { print $2 }')
2339 [ $repaired -eq 3 ] ||
2340 error "(4.1) Expect 3 fixed on mds1, but got: $repaired"
2342 if [ $MDSCOUNT -ge 2 ]; then
2343 repaired=$(do_facet mds2 $LCTL get_param -n \
2344 mdd.$(facet_svc mds2).lfsck_layout |
2345 awk '/^repaired_orphan/ { print $2 }')
2346 [ $repaired -eq 2 ] ||
2347 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
# The re-created objects are named <saved FID>-R-0; move each one back to the
# path of the file it replaces.
2350 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2351 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2352 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2354 if [ $MDSCOUNT -ge 2 ]; then
2355 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
2356 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
2359 mv $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0 $DIR/$tdir/f3 ||
2360 error "(6.1) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
2362 $LFS path2fid $DIR/$tdir/a1/f1
2363 $LFS getstripe $DIR/$tdir/a1/f1
2365 if [ $MDSCOUNT -ge 2 ]; then
2366 $LFS path2fid $DIR/$tdir/a2/f2
2367 $LFS getstripe $DIR/$tdir/a2/f2
2370 $LFS path2fid $DIR/$tdir/f3
2371 $LFS getstripe $DIR/$tdir/f3
2373 echo "The file size should be correct after layout LFSCK scanning"
2374 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2375 [ "$cur_size" == "$saved_size1" ] ||
2376 error "(7) Expect file1 size $saved_size1, but got $cur_size"
# f2 is compared with saved_size1 (f1's size): both files were written with
# the same dd parameters.
2378 if [ $MDSCOUNT -ge 2 ]; then
2379 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2380 [ "$cur_size" == "$saved_size1" ] ||
2381 error "(8) Expect file2 size $saved_size1, but got $cur_size"
2384 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2385 [ "$cur_size" == "$saved_size2" ] ||
2386 error "(9) Expect file3 size $saved_size2, but got $cur_size"
2388 run_test 18b "Find out orphan OST-object and repair it (2)"
# Test 18c: the MDT-objects are lost and the OST-objects carry no parent FID.
# The layout LFSCK must re-create the MDT-objects with new FIDs under
# .lustre/lost+found/MDTxxxx.
# Skipped when FILESET is set or the MDS1 version is not newer than 2.5.55.
2391 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2392 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2393 skip "MDS older than 2.5.55, LU-3336"
2396 echo "The target MDT-object is lost, and the OST-object FID is missing."
2397 echo "The LFSCK should re-create the MDT-object with new FID under the "
2398 echo "directory .lustre/lost+found/MDTxxxx."
2401 check_mount_and_prep
2402 $LFS mkdir -i 0 $DIR/$tdir/a1
2403 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2405 echo "Inject failure, to simulate the case of missing parent FID"
2406 #define OBD_FAIL_LFSCK_NOPFID 0x1617
# All test files are written while fail_loc 0x1617 is set on every OST node.
2407 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1617
2409 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2410 $LFS getstripe $DIR/$tdir/a1/f1
# With at least two MDTs: a2 lives on MDT index 1 but its files still use a
# single stripe at OST index 0.
2412 if [ $MDSCOUNT -ge 2 ]; then
2413 $LFS mkdir -i 1 $DIR/$tdir/a2
2414 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a2
2415 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2416 $LFS getstripe $DIR/$tdir/a2/f2
2419 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2420 error "(0) Fail to create PFL $DIR/$tdir/f3"
2422 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2423 $LFS getstripe $DIR/$tdir/f3
# Drop the osc locks, then clear the fail_loc on the OST nodes.
2425 cancel_lru_locks osc
2426 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
2428 echo "Inject failure, to simulate the case of missing the MDT-object"
2429 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
# Remove the files while fail_loc 0x1616 is set on the MDT(s).
2430 do_facet mds1 $LCTL set_param fail_loc=0x1616
2431 rm -f $DIR/$tdir/a1/f1
2433 if [ $MDSCOUNT -ge 2 ]; then
2434 do_facet mds2 $LCTL set_param fail_loc=0x1616
2435 rm -f $DIR/$tdir/a2/f2
# Clear the fail_loc again and drop the cached mdc/osc locks.
2443 do_facet mds1 $LCTL set_param fail_loc=0
2444 if [ $MDSCOUNT -ge 2 ]; then
2445 do_facet mds2 $LCTL set_param fail_loc=0
2448 cancel_lru_locks mdc
2449 cancel_lru_locks osc
2451 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2452 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2454 for k in $(seq $MDSCOUNT); do
2455 # The LFSCK status query internal is 30 seconds. For the case
2456 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2457 # time to guarantee the status sync up.
2458 wait_update_facet mds${k} "$LCTL get_param -n \
2459 mdd.$(facet_svc mds${k}).lfsck_layout |
2460 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2461 error "(2) MDS${k} is not the expected 'completed'"
# Every OST is sampled once (no waiting) and must already be 'completed'.
2464 for k in $(seq $OSTCOUNT); do
2465 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2466 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2467 awk '/^status/ { print $2 }')
2468 [ "$cur_status" == "completed" ] ||
2469 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# NOTE(review): $expected is compared below, but its assignment is not among
# the lines shown here; presumably it depends on $MDSCOUNT - confirm.
2472 if [ $MDSCOUNT -ge 2 ]; then
2478 local repaired=$(do_facet mds1 $LCTL get_param -n \
2479 mdd.$(facet_svc mds1).lfsck_layout |
2480 awk '/^repaired_orphan/ { print $2 }')
2481 [ $repaired -eq $expected ] ||
2482 error "(4) Expect $expected fixed on mds1, but got: $repaired"
# mds2, when present, must not have repaired any orphan.
2484 if [ $MDSCOUNT -ge 2 ]; then
2485 repaired=$(do_facet mds2 $LCTL get_param -n \
2486 mdd.$(facet_svc mds2).lfsck_layout |
2487 awk '/^repaired_orphan/ { print $2 }')
2488 [ $repaired -eq 0 ] ||
2489 error "(5) Expect 0 fixed on mds2, but got: $repaired"
2492 ls -ail $MOUNT/.lustre/lost+found/
# The -name pattern is quoted so that find receives it verbatim; unquoted,
# the shell could expand it against files in the current directory first.
2494 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
2495 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
2496 local cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-N-*')
2498 error "(6) .lustre/lost+found/MDT0001/ should be empty"
2501 echo "There should be some stub under .lustre/lost+found/MDT0000/"
2502 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2503 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
2505 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-N-*')
2506 [ ! -z "$cname" ] ||
2507 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2509 run_test 18c "Find out orphan OST-object and repair it (3)"
# Test 18d: an MDT-object's layout EA is corrupted while the right OST-object
# survives as an orphan. The layout LFSCK must re-attach that orphan rather
# than create a new OST-object for the slot.
# Skipped unless the MDS1 version is newer than 2.5.55 (LU-3336).
2512 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2513 skip "MDS older than 2.5.55, LU-3336"
2516 echo "The target MDT-object layout EA is corrupted, but the right"
2517 echo "OST-object is still alive as orphan. The layout LFSCK will"
2518 echo "not create new OST-object to occupy such slot."
2521 check_mount_and_prep
# a1/f1 and a1/f3 hold "guard", a1/f2 holds "foo", and a1/f4 is a PFL file
# that also holds "foo". The sizes of f2 and f4 are saved for the final check.
2523 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2524 echo "guard" > $DIR/$tdir/a1/f1
2525 echo "foo" > $DIR/$tdir/a1/f2
2527 echo "guard" > $DIR/$tdir/a1/f3
2528 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2529 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2530 echo "foo" > $DIR/$tdir/a1/f4
2532 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2533 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
# Print FID and layout of all four files for the test log.
2534 $LFS path2fid $DIR/$tdir/a1/f1
2535 $LFS getstripe $DIR/$tdir/a1/f1
2536 $LFS path2fid $DIR/$tdir/a1/f2
2537 $LFS getstripe $DIR/$tdir/a1/f2
2538 $LFS path2fid $DIR/$tdir/a1/f3
2539 $LFS getstripe $DIR/$tdir/a1/f3
2540 $LFS path2fid $DIR/$tdir/a1/f4
2541 $LFS getstripe $DIR/$tdir/a1/f4
2542 cancel_lru_locks osc
2544 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2545 echo "to reference the same OST-object (which is f1's OST-obejct)."
2546 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2547 echo "dangling reference case, but f2's old OST-object is there."
2549 echo "The failure also makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2550 echo "to reference the same OST-object (which is f3's OST-obejct)."
2551 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2552 echo "dangling reference case, but f4's old OST-object is there."
2555 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
# With fail_loc 0x1618 set on $SINGLEMDS: chown f2 and f4, then remove f1
# and f3.
2556 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2557 chown 1.1 $DIR/$tdir/a1/f2
2558 chown 1.1 $DIR/$tdir/a1/f4
2559 rm -f $DIR/$tdir/a1/f1
2560 rm -f $DIR/$tdir/a1/f3
# Clear the injected failure.
2563 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2565 echo "stopall to cleanup object cache"
2568 setupall > /dev/null
# Start the layout LFSCK with -r -o -c -d and wait for every MDT to complete.
# NOTE(review): confirm the meaning of -c and -d against 'lctl lfsck_start'.
2570 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2571 $START_LAYOUT -r -o -c -d || error "(2) Fail to start LFSCK for layout!"
2573 for k in $(seq $MDSCOUNT); do
2574 # The LFSCK status query internal is 30 seconds. For the case
2575 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2576 # time to guarantee the status sync up.
2577 wait_update_facet mds${k} "$LCTL get_param -n \
2578 mdd.$(facet_svc mds${k}).lfsck_layout |
2579 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2580 error "(3) MDS${k} is not the expected 'completed'"
# Every OST is sampled once (no waiting) and must already be 'completed'.
2583 for k in $(seq $OSTCOUNT); do
2584 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2585 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2586 awk '/^status/ { print $2 }')
2587 [ "$cur_status" == "completed" ] ||
2588 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
# Two orphans must have been repaired and no dangling reference: the original
# OST-objects were re-attached instead of new ones being created.
2591 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2592 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2593 awk '/^repaired_orphan/ { print $2 }')
2594 [ $repaired -eq 2 ] ||
2595 error "(5) Expect 2 orphans have been fixed, but got: $repaired"
2597 repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2598 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2599 awk '/^repaired_dangling/ { print $2 }')
2600 [ $repaired -eq 0 ] ||
2601 error "(6) Expect 0 dangling has been fixed, but got: $repaired"
# f2 and f4 must report their original sizes again.
2603 echo "The file size should be correct after layout LFSCK scanning"
2604 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2605 [ "$cur_size" == "$saved_size1" ] ||
2606 error "(7) Expect file2 size $saved_size1, but got $cur_size"
2608 cur_size=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2609 [ "$cur_size" == "$saved_size2" ] ||
2610 error "(8) Expect file4 size $saved_size2, but got $cur_size"
# Dump content, FID and layout of f2 and f4 for the test log (not asserted).
2612 echo "The LFSCK should find back the original data."
2613 cat $DIR/$tdir/a1/f2
2614 $LFS path2fid $DIR/$tdir/a1/f2
2615 $LFS getstripe $DIR/$tdir/a1/f2
2616 cat $DIR/$tdir/a1/f4
2617 $LFS path2fid $DIR/$tdir/a1/f4
2618 $LFS getstripe $DIR/$tdir/a1/f4
2620 run_test 18d "Find out orphan OST-object and repair it (4)"
# Test 18e: while the layout LFSCK repairs a dangling reference, the newly
# created OST-object in the layout EA slot is modified by a client. To keep
# that new data the LFSCK must create a separate stub file (named *-C-*) in
# lost+found that references the old orphan OST-object.
# Skipped when FILESET is set or the MDS1 version is not newer than 2.5.55.
2623 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2624 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2625 skip "MDS older than 2.5.55, LU-3336"
2628 echo "The target MDT-object layout EA slot is occpuied by some new"
2629 echo "created OST-object when repair dangling reference case. Such"
2630 echo "conflict OST-object has been modified by others. To keep the"
2631 echo "new data, the LFSCK will create a new file to refernece this"
2632 echo "old orphan OST-object."
2635 check_mount_and_prep
# a1/f1 and a1/f3 hold "guard", a1/f2 holds "foo", and a1/f4 is a PFL file
# that also holds "foo". The sizes of f2 and f4 are saved for the stub check.
2637 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2638 echo "guard" > $DIR/$tdir/a1/f1
2639 echo "foo" > $DIR/$tdir/a1/f2
2641 echo "guard" > $DIR/$tdir/a1/f3
2642 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2643 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2644 echo "foo" > $DIR/$tdir/a1/f4
2646 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2647 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
# Print FID and layout of all four files for the test log.
2649 $LFS path2fid $DIR/$tdir/a1/f1
2650 $LFS getstripe $DIR/$tdir/a1/f1
2651 $LFS path2fid $DIR/$tdir/a1/f2
2652 $LFS getstripe $DIR/$tdir/a1/f2
2653 $LFS path2fid $DIR/$tdir/a1/f3
2654 $LFS getstripe $DIR/$tdir/a1/f3
2655 $LFS path2fid $DIR/$tdir/a1/f4
2656 $LFS getstripe $DIR/$tdir/a1/f4
2657 cancel_lru_locks osc
2659 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2660 echo "to reference the same OST-object (which is f1's OST-obejct)."
2661 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2662 echo "dangling reference case, but f2's old OST-object is there."
2664 echo "Also the failure makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2665 echo "to reference the same OST-object (which is f3's OST-obejct)."
2666 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2667 echo "dangling reference case, but f4's old OST-object is there."
2670 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
# With fail_loc 0x1618 set on $SINGLEMDS: chown f2 and f4, then remove f1
# and f3.
2671 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2672 chown 1.1 $DIR/$tdir/a1/f2
2673 chown 1.1 $DIR/$tdir/a1/f4
2674 rm -f $DIR/$tdir/a1/f1
2675 rm -f $DIR/$tdir/a1/f3
# Clear the injected failure.
2678 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2680 echo "stopall to cleanup object cache"
2683 setupall > /dev/null
2685 #define OBD_FAIL_LFSCK_DELAY3 0x1602
# fail_loc 0x1602 with fail_val=10 stays set until the test has written to
# f2/f4 below; it is what gives the test time to act during the scan.
2686 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
2688 start_full_debug_logging
2690 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2691 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
# Wait until mds1 reports 'scanning-phase2' before touching the files.
2693 wait_update_facet mds1 "$LCTL get_param -n \
2694 mdd.$(facet_svc mds1).lfsck_layout |
2695 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2696 error "(3) MDS1 is not the expected 'scanning-phase2'"
2698 # to guarantee all updates are synced.
2702 echo "Write new data to f2/f4 to modify the new created OST-object."
2703 echo "dummy" >> $DIR/$tdir/a1/f2 || error "write a1/f2 failed"
2704 echo "dummy" >> $DIR/$tdir/a1/f4 || error "write a1/f4 failed"
# Remove the delay and let the LFSCK run to completion on all MDTs.
2706 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
2708 for k in $(seq $MDSCOUNT); do
2709 # The LFSCK status query internal is 30 seconds. For the case
2710 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2711 # time to guarantee the status sync up.
2712 wait_update_facet mds${k} "$LCTL get_param -n \
2713 mdd.$(facet_svc mds${k}).lfsck_layout |
2714 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2715 error "(4) MDS${k} is not the expected 'completed'"
# Every OST is sampled once (no waiting) and must already be 'completed'.
2718 for k in $(seq $OSTCOUNT); do
2719 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2720 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2721 awk '/^status/ { print $2 }')
2722 [ "$cur_status" == "completed" ] ||
2723 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2726 stop_full_debug_logging
2728 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2729 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2730 awk '/^repaired_orphan/ { print $2 }')
2731 [ $repaired -eq 2 ] ||
2732 error "(6) Expect 2 orphans have been fixed, but got: $repaired"
2734 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2735 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2736 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# Exactly two *-C-* stubs are expected in lost+found/MDT0000.
2738 local count=$(ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-* | wc -l)
2739 if [ $count -ne 2 ]; then
2740 ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-*
2741 error "(8) Expect 2 stubs under lost+found, but got $count"
# Check the first and the last stub that find reports: each must have the
# saved size of either f2 or f4. The -name pattern is quoted so that find
# receives it verbatim instead of a shell expansion against the current
# directory.
2744 echo "The stub file should keep the original f2 or f4 data"
2745 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-C-*' | head -n 1)
2746 local cur_size=$(ls -il $cname | awk '{ print $6 }')
2747 [ "$cur_size" != "$saved_size1" ] && [ "$cur_size" != "$saved_size2" ] &&
2748 error "(9) Got unexpected $cur_size"
2751 $LFS path2fid $cname
2752 $LFS getstripe $cname
2754 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-C-*' | tail -n 1)
2755 cur_size=$(ls -il $cname | awk '{ print $6 }')
2756 [ "$cur_size" != "$saved_size1" ] && [ "$cur_size" != "$saved_size2" ] &&
2757 error "(10) Got unexpected $cur_size"
2760 $LFS path2fid $cname
2761 $LFS getstripe $cname
# Dump content, FID and layout of f2 and f4 for the test log (not asserted).
2763 echo "The f2/f4 should contains new data."
2764 cat $DIR/$tdir/a1/f2
2765 $LFS path2fid $DIR/$tdir/a1/f2
2766 $LFS getstripe $DIR/$tdir/a1/f2
2767 cat $DIR/$tdir/a1/f4
2768 $LFS path2fid $DIR/$tdir/a1/f4
2769 $LFS getstripe $DIR/$tdir/a1/f4
2771 run_test 18e "Find out orphan OST-object and repair it (5)"
# Test 18f: MDT-objects are lost and one OST fails to verify its objects
# during first-stage scanning. The layout LFSCK must skip orphan handling for
# that OST only; a second, undisturbed run then repairs the rest.
# Requires at least two OSTs.
2774 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
2777 echo "The target MDT-object is lost. The LFSCK should re-create the"
2778 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2779 echo "to verify some OST-object(s) during the first stage-scanning,"
2780 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2781 echo "should not be affected."
# On MDT index 0: a1/guard and a1/f1 use one stripe at OST index 0; a2/f2
# uses two 1 MiB stripes starting at OST index 0. Every file gets 2 MiB.
2784 check_mount_and_prep
2785 $LFS mkdir -i 0 $DIR/$tdir/a1
2786 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2787 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2788 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2789 $LFS mkdir -i 0 $DIR/$tdir/a2
2790 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
2791 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2792 $LFS getstripe $DIR/$tdir/a1/f1
2793 $LFS getstripe $DIR/$tdir/a2/f2
# With at least two MDTs the same set is created on MDT index 1 as a3 and a4.
2795 if [ $MDSCOUNT -ge 2 ]; then
2796 $LFS mkdir -i 1 $DIR/$tdir/a3
2797 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a3
2798 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2799 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2800 $LFS mkdir -i 1 $DIR/$tdir/a4
2801 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
2802 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2803 $LFS getstripe $DIR/$tdir/a3/f3
2804 $LFS getstripe $DIR/$tdir/a4/f4
2807 cancel_lru_locks osc
2809 echo "Inject failure, to simulate the case of missing the MDT-object"
2810 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
# Remove f1..f4 (not the guard files) while fail_loc 0x1616 is set on the
# MDT(s).
2811 do_facet mds1 $LCTL set_param fail_loc=0x1616
2812 rm -f $DIR/$tdir/a1/f1
2813 rm -f $DIR/$tdir/a2/f2
2815 if [ $MDSCOUNT -ge 2 ]; then
2816 do_facet mds2 $LCTL set_param fail_loc=0x1616
2817 rm -f $DIR/$tdir/a3/f3
2818 rm -f $DIR/$tdir/a4/f4
# Clear the fail_loc again and drop the cached mdc/osc locks.
2824 do_facet mds1 $LCTL set_param fail_loc=0
2825 if [ $MDSCOUNT -ge 2 ]; then
2826 do_facet mds2 $LCTL set_param fail_loc=0
2829 cancel_lru_locks mdc
2830 cancel_lru_locks osc
2832 echo "Inject failure, to simulate the OST0 fail to handle"
2833 echo "MDT0 LFSCK request during the first-stage scanning."
2834 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
2835 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
2837 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2838 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# First run: every MDT and ost1 must end as 'partial', ost2 as 'completed'.
2840 for k in $(seq $MDSCOUNT); do
2841 # The LFSCK status query internal is 30 seconds. For the case
2842 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2843 # time to guarantee the status sync up.
2844 wait_update_facet mds${k} "$LCTL get_param -n \
2845 mdd.$(facet_svc mds${k}).lfsck_layout |
2846 awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
2847 error "(2) MDS${k} is not the expected 'partial'"
2850 wait_update_facet ost1 "$LCTL get_param -n \
2851 obdfilter.$(facet_svc ost1).lfsck_layout |
2852 awk '/^status/ { print \\\$2 }'" "partial" $LTIME || {
2853 error "(3) OST1 is not the expected 'partial'"
2856 wait_update_facet ost2 "$LCTL get_param -n \
2857 obdfilter.$(facet_svc ost2).lfsck_layout |
2858 awk '/^status/ { print \\\$2 }'" "completed" $LTIME || {
2859 error "(4) OST2 is not the expected 'completed'"
# Clear the injected failure on mds1.
2862 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
# After the partial run each MDT must have repaired exactly one orphan.
2864 local repaired=$(do_facet mds1 $LCTL get_param -n \
2865 mdd.$(facet_svc mds1).lfsck_layout |
2866 awk '/^repaired_orphan/ { print $2 }')
2867 [ $repaired -eq 1 ] ||
2868 error "(5) Expect 1 fixed on mds{1}, but got: $repaired"
2870 if [ $MDSCOUNT -ge 2 ]; then
2871 repaired=$(do_facet mds2 $LCTL get_param -n \
2872 mdd.$(facet_svc mds2).lfsck_layout |
2873 awk '/^repaired_orphan/ { print $2 }')
2874 [ $repaired -eq 1 ] ||
2875 error "(6) Expect 1 fixed on mds{2}, but got: $repaired"
# Second run without fault injection: everything must reach 'completed'.
2878 echo "Trigger layout LFSCK on all devices again to cleanup"
2879 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
2881 for k in $(seq $MDSCOUNT); do
2882 # The LFSCK status query internal is 30 seconds. For the case
2883 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2884 # time to guarantee the status sync up.
2885 wait_update_facet mds${k} "$LCTL get_param -n \
2886 mdd.$(facet_svc mds${k}).lfsck_layout |
2887 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2888 error "(8) MDS${k} is not the expected 'completed'"
# Every OST is sampled once (no waiting) and must already be 'completed'.
# cur_status is declared local so that it does not leak out of the test.
2891 for k in $(seq $OSTCOUNT); do
2892 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2893 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2894 awk '/^status/ { print $2 }')
2895 [ "$cur_status" == "completed" ] ||
2896 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
# The second run must report two repaired orphans on each MDT.
2900 local repaired=$(do_facet mds1 $LCTL get_param -n \
2901 mdd.$(facet_svc mds1).lfsck_layout |
2902 awk '/^repaired_orphan/ { print $2 }')
2903 [ $repaired -eq 2 ] ||
2904 error "(10) Expect 2 fixed on mds{1}, but got: $repaired"
2906 if [ $MDSCOUNT -ge 2 ]; then
2907 repaired=$(do_facet mds2 $LCTL get_param -n \
2908 mdd.$(facet_svc mds2).lfsck_layout |
2909 awk '/^repaired_orphan/ { print $2 }')
2910 [ $repaired -eq 2 ] ||
2911 error "(11) Expect 2 fixed on mds{2}, but got: $repaired"
2914 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# Test 18g: an MDT-object is lost while its OI mapping is left behind. The
# layout LFSCK must re-create the MDT-object without being misled by the
# stale mapping.
# Skipped when FILESET is set.
2917 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2920 echo "The target MDT-object is lost, but related OI mapping is there"
2921 echo "The LFSCK should recreate the lost MDT-object without affected"
2922 echo "by the stale OI mapping."
# a1/f1 is created with 'setstripe -c -1' and 1 MiB stripe size, then filled
# with $OSTCOUNT MiB. Its FID is saved; it later names the lost+found entry.
2925 check_mount_and_prep
2926 $LFS mkdir -i 0 $DIR/$tdir/a1
2927 $LFS setstripe -c -1 -i 0 -S 1M $DIR/$tdir/a1
2928 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=$OSTCOUNT
2929 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2931 $LFS getstripe $DIR/$tdir/a1/f1
2932 cancel_lru_locks osc
2934 echo "Inject failure to simulate lost MDT-object but keep OI mapping"
2935 #define OBD_FAIL_LFSCK_LOST_MDTOBJ2 0x162e
# Remove f1 while fail_loc 0x162e is set on mds1, then clear the fail_loc and
# drop the cached mdc/osc locks.
2936 do_facet mds1 $LCTL set_param fail_loc=0x162e
2937 rm -f $DIR/$tdir/a1/f1
2939 do_facet mds1 $LCTL set_param fail_loc=0
2940 cancel_lru_locks mdc
2941 cancel_lru_locks osc
2943 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2944 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2946 for k in $(seq $MDSCOUNT); do
2947 # The LFSCK status query internal is 30 seconds. For the case
2948 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2949 # time to guarantee the status sync up.
2950 wait_update_facet mds${k} "$LCTL get_param -n \
2951 mdd.$(facet_svc mds${k}).lfsck_layout |
2952 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2953 error "(2) MDS${k} is not the expected 'completed'"
# Every OST is sampled once (no waiting) and must already be 'completed'.
2956 for k in $(seq $OSTCOUNT); do
2957 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2958 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2959 awk '/^status/ { print $2 }')
2960 [ "$cur_status" == "completed" ] ||
2961 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# The number of repaired orphans on mds1 must equal $OSTCOUNT.
2964 local repaired=$(do_facet mds1 $LCTL get_param -n \
2965 mdd.$(facet_svc mds1).lfsck_layout |
2966 awk '/^repaired_orphan/ { print $2 }')
2967 [ $repaired -eq $OSTCOUNT ] ||
2968 error "(4) Expect $OSTCOUNT fixed, but got: $repaired"
# The re-created object is named <saved FID>-R-0; move it back to a1/f1 and
# print its FID and layout for the test log.
2970 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2971 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2972 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2974 $LFS path2fid $DIR/$tdir/a1/f1
2975 $LFS getstripe $DIR/$tdir/a1/f1
2977 run_test 18g "Find out orphan OST-object and repair it (7)"
# test_18h: corrupt the extent range of a two-component PFL file (fail_loc
# 0x162f armed while chown is issued), verify that writing to it fails,
# then run layout LFSCK and check that two orphans are repaired, the data
# still matches the untouched copy and writes work again.
test_18h() {
	local k
	local cur_status
	local repaired

	echo "The PFL extent crashed. During the first cycle LFSCK scanning,"
	echo "the layout LFSCK will keep the bad PFL file(s) there without"
	echo "scanning its OST-object(s). Then in the second stage scanning,"
	echo "the OST will return related OST-object(s) to the MDT as orphan."
	echo "And then the LFSCK on the MDT can rebuild the PFL extent with"
	echo "the 'orphan(s)' stripe information."

	check_mount_and_prep

	$LFS setstripe -E 2M -S 1M -c 1 -E -1 $DIR/$tdir/f0 ||
		error "(0) Fail to create PFL $DIR/$tdir/f0"

	cat $LUSTRE/tests/test-framework.sh > $DIR/$tdir/f0 ||
		error "(1.1) Fail to write $DIR/$tdir/f0"

	# seek=2 with bs=1M starts at the 2M boundary, i.e. in the second
	# component created above.
	dd if=$LUSTRE/tests/test-framework.sh of=$DIR/$tdir/f0 bs=1M seek=2 ||
		error "(1.2) Fail to write $DIR/$tdir/f0"

	# Reference copy used for the data comparison after the repair.
	cp $DIR/$tdir/f0 $DIR/$tdir/guard

	echo "Inject failure stub to simulate bad PFL extent range"
	#define OBD_FAIL_LFSCK_BAD_PFL_RANGE	0x162f
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162f

	# Use the POSIX ':' separator; the '.' form is a deprecated extension.
	chown 1:1 $DIR/$tdir/f0

	cancel_lru_locks mdc
	cancel_lru_locks osc
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 &&
		error "(2) Write to bad PFL file should fail"

	echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
	$START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query internal is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
			error "(4.1) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		cur_status=$(do_facet ost${k} $LCTL get_param -n \
				obdfilter.$(facet_svc ost${k}).lfsck_layout |
				awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(4.2) OST${k} Expect 'completed', but got '$cur_status'"
	done

	repaired=$($SHOW_LAYOUT |
			awk '/^repaired_orphan/ { print $2 }')
	[ "$repaired" -eq 2 ] ||
		error "(5) Fail to repair crashed PFL range: $repaired"

	echo "Data in $DIR/$tdir/f0 should not be broken"
	diff $DIR/$tdir/f0 $DIR/$tdir/guard ||
		error "(6) Data in $DIR/$tdir/f0 is broken"

	echo "Write should succeed after LFSCK repairing the bad PFL range"
	dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 ||
		error "(7) Write should succeed after LFSCK"
}
run_test 18h "LFSCK can repair crashed PFL extent range"
# Remove the 'cache' flag from the debug mask before the following tests;
# the command's stdout is discarded.
$LCTL set_param debug=-cache >/dev/null
# test_19a: with parent-FID verification enabled on OST0000 and fail_loc
# 0x1619 armed on the node running this script, reads of both a plain
# file and a PFL file are expected to fail.
test_19a() {
	(( $MDS1_VERSION > $(version_code 2.5.55) )) ||
		skip "MDS older than 2.5.55, LU-3951"

	check_mount_and_prep
	$LFS setstripe -c 1 -i 0 $DIR/$tdir

	# Create the files while verification is switched off.
	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
		obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0

	echo "foo1" > $DIR/$tdir/a0
	$LFS setstripe -E 512K -S 512K -o 0 -E -1 -S 1M $DIR/$tdir/a1 ||
		error "(0) Fail to create PFL $DIR/$tdir/a1"
	echo "foo2" > $DIR/$tdir/a1
	echo "guard" > $DIR/$tdir/a2
	# Drop cached data so that the reads below go to the OST.
	cancel_lru_locks osc

	echo "Inject failure, then client will offer wrong parent FID when read"
	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
		obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1

	#define OBD_FAIL_LFSCK_INVALID_PFID	0x1619
	# No do_facet/do_nodes here: the fail_loc is set locally.
	$LCTL set_param fail_loc=0x1619

	echo "Read RPC with wrong parent FID should be denied"
	cat $DIR/$tdir/a0 && error "(3.1) Read a0 should be denied!"
	cat $DIR/$tdir/a1 && error "(3.2) Read a1 should be denied!"
	$LCTL set_param fail_loc=0
}
run_test 19a "OST-object inconsistency self detect"
# test_19b: create a plain file and a PFL file while fail_loc 0x1611 is
# armed on the OSS nodes, then check the 'repaired' counter exposed through
# lfsck_verify_pfid on OST0000: 0 while verification is off, 2 after both
# files have been read with verification on.
test_19b() {
	local repaired

	(( $MDS1_VERSION > $(version_code 2.5.55) )) ||
		skip "MDS older than 2.5.55, LU-3951"

	check_mount_and_prep
	$LFS setstripe -c 1 -i 0 $DIR/$tdir

	echo "Inject failure stub to make the OST-object to back point to"
	echo "non-exist MDT-object"

	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
		obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0

	#define OBD_FAIL_LFSCK_UNMATCHED_PAIR1	0x1611
	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
	echo "foo1" > $DIR/$tdir/f0
	$LFS setstripe -E 1M -S 1M -o 0 -E 4M -S 256K $DIR/$tdir/f1 ||
		error "(0) Fail to create PFL $DIR/$tdir/f1"
	echo "foo2" > $DIR/$tdir/f1
	cancel_lru_locks osc
	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0

	do_facet ost1 $LCTL set_param -n \
		obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
	echo "Nothing should be fixed since self detect and repair is disabled"
	repaired=$(do_facet ost1 $LCTL get_param -n \
			obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
			awk '/^repaired/ { print $2 }')
	[ "$repaired" -eq 0 ] ||
		error "(1) Expected 0 repaired, but got $repaired"

	echo "Read RPC with right parent FID should be accepted,"
	echo "and cause parent FID on OST to be fixed"

	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
		obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1

	cat $DIR/$tdir/f0 || error "(2.1) Read f0 should not be denied!"
	cat $DIR/$tdir/f1 || error "(2.2) Read f1 should not be denied!"

	repaired=$(do_facet ost1 $LCTL get_param -n \
			obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
			awk '/^repaired/ { print $2 }')
	# Two files were read (f0 and f1), hence two repairs; the message
	# now states the value that is actually checked.
	[ "$repaired" -eq 2 ] ||
		error "(3) Expected 2 repaired, but got $repaired"
}
run_test 19b "OST-object inconsistency self repair"
# Expected output of '$LFS getstripe -L' in the LOV EA hole tests below:
# the first for a layout that has a hole, the second for an intact one.
PATTERN_WITH_HOLE='40000001'
PATTERN_WITHOUT_HOLE='raid0'
# test_20a: files striped over 2 (or 3) OSTs lose their MDT-object and,
# for f1/f2/f3, one chosen OST-object as well. After layout LFSCK each
# file must reappear as .lustre/lost+found/MDT0000/<fid>-R-0; files that
# lost an OST-object carry the "hole" pattern, fail on normal reads and
# on writes into the hole, but still allow chown/touch/unlink.
test_20a() {
	[ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
	[ -n "$FILESET" ] && skip "Not functional for FILESET set"
	(( $MDS1_VERSION > $(version_code 2.5.55) )) ||
		skip "MDS older than 2.5.55, LU-4887"

	local k
	local bcount	# number of 4096-byte blocks written to / expected in a file
	local fid3

	echo "The target MDT-object and some of its OST-object are lost."
	echo "The LFSCK should find out the left OST-objects and re-create"
	echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
	echo "with the partial OST-objects (LOV EA hole)."

	echo "New client can access the file with LOV EA hole via normal"
	echo "system tools or commands without crash the system."

	echo "For old client, even though it cannot access the file with"
	echo "LOV EA hole, it should not cause the system crash."

	check_mount_and_prep
	$LFS mkdir -i 0 $DIR/$tdir/a1
	# With a 1M stripe size one full stripe is 256 blocks of 4096 bytes;
	# the last stripe only gets a single block.
	if [ $OSTCOUNT -gt 2 ]; then
		$LFS setstripe -c 3 -i 0 -S 1M $DIR/$tdir/a1
		bcount=$((256 * 2 + 1))
	else
		$LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a1
		bcount=$((256 + 1))
	fi

	# 256 blocks on the stripe0.
	# 1 block on the stripe1 for 2 OSTs case.
	# 256 blocks on the stripe1 for other cases.
	# 1 block on the stripe2 if OSTs > 2
	dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
	dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
	dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount

	local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
	local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
	local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)

	$LFS getstripe $DIR/$tdir/a1/f0
	$LFS getstripe $DIR/$tdir/a1/f1
	$LFS getstripe $DIR/$tdir/a1/f2

	if [ $OSTCOUNT -gt 2 ]; then
		dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
		fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
		$LFS getstripe $DIR/$tdir/a1/f3
	fi

	cancel_lru_locks osc

	echo "Inject failure..."
	echo "To simulate f0 lost MDT-object"
	#define OBD_FAIL_LFSCK_LOST_MDTOBJ	0x1616
	do_facet mds1 $LCTL set_param fail_loc=0x1616
	rm -f $DIR/$tdir/a1/f0

	echo "To simulate f1 lost MDT-object and OST-object0"
	#define OBD_FAIL_LFSCK_LOST_SPEOBJ	0x161a
	do_facet mds1 $LCTL set_param fail_loc=0x161a
	rm -f $DIR/$tdir/a1/f1

	# fail_val selects the index of the OST-object that gets lost.
	echo "To simulate f2 lost MDT-object and OST-object1"
	do_facet mds1 $LCTL set_param fail_val=1
	rm -f $DIR/$tdir/a1/f2

	if [ $OSTCOUNT -gt 2 ]; then
		echo "To simulate f3 lost MDT-object and OST-object2"
		do_facet mds1 $LCTL set_param fail_val=2
		rm -f $DIR/$tdir/a1/f3
	fi

	umount_client $MOUNT
	# NOTE(review): let the unlinks settle before the fail_loc is
	# cleared - confirm that this delay is needed/sufficient.
	sync
	sleep 2
	do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
	$START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query internal is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" 32 ||
			error "(2) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		local cur_status=$(do_facet ost${k} $LCTL get_param -n \
				obdfilter.$(facet_svc ost${k}).lfsck_layout |
				awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(3) OST${k} Expect 'completed', but got '$cur_status'"
	done

	local repaired=$(do_facet mds1 $LCTL get_param -n \
			 mdd.$(facet_svc mds1).lfsck_layout |
			 awk '/^repaired_orphan/ { print $2 }')
	# Surviving OST-objects: f0 keeps all of them, every other file
	# lost exactly one (3+2+2+2 with three stripes, 2+1+1 with two).
	if [ $OSTCOUNT -gt 2 ]; then
		[ $repaired -eq 9 ] ||
		error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
	else
		[ $repaired -eq 4 ] ||
		error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
	fi

	mount_client $MOUNT || error "(5.0) Fail to start client!"

	LOV_PATTERN_F_HOLE=0x40000000

	#
	# ${fid0}-R-0 is the old f0
	#
	local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
	echo "Check $name, which is the old f0"

	$LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"

	local pattern=$($LFS getstripe -L $name)
	[[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
		error "(5.2) NOT expect pattern flag hole, but got $pattern"

	local stripes=$($LFS getstripe -c $name)
	if [ $OSTCOUNT -gt 2 ]; then
		[ $stripes -eq 3 ] ||
		error "(5.3.1) expect the stripe count is 3, but got $stripes"
	else
		[ $stripes -eq 2 ] ||
		error "(5.3.2) expect the stripe count is 2, but got $stripes"
	fi

	local size=$(stat $name | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(5.4) expect the size $((4096 * $bcount)), but got $size"

	cat $name > /dev/null || error "(5.5) cannot read $name"

	echo "dummy" >> $name || error "(5.6) cannot write $name"

	chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"

	touch $name || error "(5.8) cannot touch $name"

	rm -f $name || error "(5.9) cannot unlink $name"

	#
	# ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
	if [ $OSTCOUNT -gt 2 ]; then
		echo "Check $name, it contains the old f1's stripe1 and stripe2"
	else
		echo "Check $name, it contains the old f1's stripe1"
	fi

	$LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"

	pattern=$($LFS getstripe -L $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(6.2) expect pattern flag hole, but got $pattern"

	stripes=$($LFS getstripe -c $name)
	if [ $OSTCOUNT -gt 2 ]; then
		[ $stripes -eq 3 ] ||
		error "(6.3.1) expect the stripe count is 3, but got $stripes"
	else
		[ $stripes -eq 2 ] ||
		error "(6.3.2) expect the stripe count is 2, but got $stripes"
	fi

	size=$(stat $name | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(6.4) expect the size $((4096 * $bcount)), but got $size"

	cat $name > /dev/null && error "(6.5) normal read $name should fail"

	# conv=noerror keeps dd going after a failed read and conv=sync pads
	# the failed block, so every block of the lost stripe is counted once.
	local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
			 bs=4096 2>&1 | grep "Input/output error" | wc -l)

	# stripe0 is dummy
	[ $failures -eq 256 ] ||
		error "(6.6) expect 256 IO failures, but get $failures"

	size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(6.7) expect the size $((4096 * $bcount)), but got $size"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
		error "(6.8) write to the LOV EA hole should fail"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
		error "(6.9) write to normal stripe should NOT fail"

	echo "foo" >> $name && error "(6.10) append write $name should fail"

	chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"

	touch $name || error "(6.12) cannot touch $name"

	rm -f $name || error "(6.13) cannot unlink $name"

	#
	# ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
	if [ $OSTCOUNT -gt 2 ]; then
		echo "Check $name, it contains the old f2's stripe0 and stripe2"
	else
		echo "Check $name, it contains the old f2's stripe0"
	fi

	$LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"

	pattern=$($LFS getstripe -L $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(7.2) expect pattern flag hole, but got $pattern"

	stripes=$($LFS getstripe -c $name)
	size=$(stat $name | awk '/Size:/ { print $2 }')
	if [ $OSTCOUNT -gt 2 ]; then
		[ $stripes -eq 3 ] ||
		error "(7.3.1) expect the stripe count is 3, but got $stripes"

		[ $size -eq $((4096 * $bcount)) ] ||
		error "(7.4.1) expect size $((4096 * $bcount)), but got $size"

		cat $name > /dev/null &&
			error "(7.5.1) normal read $name should fail"

		failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
			   bs=4096 2>&1 | grep "Input/output error" | wc -l)

		# stripe1 is dummy
		[ $failures -eq 256 ] ||
			error "(7.6) expect 256 IO failures, but get $failures"

		size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
		[ $size -eq $((4096 * $bcount)) ] ||
		error "(7.7) expect the size $((4096 * $bcount)), but got $size"

		dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
		seek=300 && error "(7.8.0) write to the LOV EA hole should fail"

		dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
		error "(7.8.1) write to normal stripe should NOT fail"

		echo "foo" >> $name &&
			error "(7.8.3) append write $name should fail"

		chown $RUNAS_ID:$RUNAS_GID $name ||
			error "(7.9.1) cannot chown on $name"

		touch $name || error "(7.10.1) cannot touch $name"
	else
		[ $stripes -eq 2 ] ||
		error "(7.3.2) expect the stripe count is 2, but got $stripes"

		# stripe1 is dummy, so only stripe0's 256 blocks are visible
		[ $size -eq $((4096 * (256 + 0))) ] ||
		error "(7.4.2) expect the size $((4096 * 256)), but got $size"

		cat $name > /dev/null &&
			error "(7.5.2) normal read $name should fail"

		failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
			   bs=4096 2>&1 | grep "Input/output error" | wc -l)
		[ $failures -eq 256 ] ||
		error "(7.6.2) expect 256 IO failures, but get $failures"

		# The dump holds the 256 readable blocks plus one padded
		# block for each of the 256 failed reads.
		bcount=$((256 * 2))
		size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
		[ $size -eq $((4096 * $bcount)) ] ||
		error "(7.7.2) expect the size $((4096 * $bcount)), got $size"

		dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
		seek=256 && error "(7.8.2) write to the LOV EA hole should fail"

		chown $RUNAS_ID:$RUNAS_GID $name ||
			error "(7.9.2) cannot chown on $name"

		touch $name || error "(7.10.2) cannot touch $name"
	fi

	rm -f $name || error "(7.11) cannot unlink $name"

	# f3 only exists when there are more than two OSTs.
	[ $OSTCOUNT -le 2 ] && return

	#
	# ${fid3}-R-0 should contains the old f3's stripe0 and stripe1
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
	echo "Check $name, which contains the old f3's stripe0 and stripe1"

	$LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"

	pattern=$($LFS getstripe -L $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(8.2) expect pattern flag hole, but got $pattern"

	stripes=$($LFS getstripe -c $name)
	[ $stripes -eq 3 ] ||
		error "(8.3) expect the stripe count is 3, but got $stripes"

	size=$(stat $name | awk '/Size:/ { print $2 }')
	# stripe2 is lost
	[ $size -eq $((4096 * (256 + 256 + 0))) ] ||
		error "(8.4) expect the size $((4096 * 512)), but got $size"

	cat $name > /dev/null &&
		error "(8.5) normal read $name should fail"

	failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
		   bs=4096 2>&1 | grep "Input/output error" | wc -l)

	# stripe2 is dummy
	[ $failures -eq 256 ] ||
		error "(8.6) expect 256 IO failures, but get $failures"

	# The dump holds the 512 readable blocks plus one padded block
	# for each of the 256 failed reads.
	bcount=$((256 * 3))
	size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(8.7) expect the size $((4096 * $bcount)), but got $size"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
	seek=512 && error "(8.8) write to the LOV EA hole should fail"

	chown $RUNAS_ID:$RUNAS_GID $name ||
		error "(8.9) cannot chown on $name"

	touch $name || error "(8.10) cannot touch $name"

	rm -f $name || error "(8.11) cannot unlink $name"
}
run_test 20a "Handle the orphan with dummy LOV EA slot properly"
# test_20b: PFL variant of test_20a. Three files with two components
# ([0, 2M) and [2M, EOF), two 1M stripes each) lose their MDT-object; f1
# also loses the first and f2 the second OST-object of every component.
# After layout LFSCK the files must reappear under
# .lustre/lost+found/MDT0000/ with the original component extents, and the
# damaged ones must carry the "hole" pattern in both components.
test_20b() {
	[ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
	[ -n "$FILESET" ] && skip "Not functional for FILESET set"
	(( $MDS1_VERSION > $(version_code 2.5.55) )) ||
		skip "MDS older than 2.5.55, LU-4887"

	local k

	echo "The target MDT-object and some of its OST-object are lost."
	echo "The LFSCK should find out the left OST-objects and re-create"
	echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
	echo "with the partial OST-objects (LOV EA hole)."

	echo "New client can access the file with LOV EA hole via normal"
	echo "system tools or commands without crash the system - PFL case."

	check_mount_and_prep

	$LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f0 ||
		error "(0) Fail to create PFL file $DIR/$tdir/f0"
	$LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f1 ||
		error "(1) Fail to create PFL file $DIR/$tdir/f1"
	$LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f2 ||
		error "(2) Fail to create PFL file $DIR/$tdir/f2"

	# 256 blocks of 4096 bytes fill one 1M stripe: three full stripes
	# plus a single block on the second stripe of the second component.
	local bcount=$((256 * 3 + 1))

	dd if=/dev/zero of=$DIR/$tdir/f0 bs=4096 count=$bcount
	dd if=/dev/zero of=$DIR/$tdir/f1 bs=4096 count=$bcount
	dd if=/dev/zero of=$DIR/$tdir/f2 bs=4096 count=$bcount

	local fid0=$($LFS path2fid $DIR/$tdir/f0)
	local fid1=$($LFS path2fid $DIR/$tdir/f1)
	local fid2=$($LFS path2fid $DIR/$tdir/f2)

	$LFS getstripe $DIR/$tdir/f0
	$LFS getstripe $DIR/$tdir/f1
	$LFS getstripe $DIR/$tdir/f2

	cancel_lru_locks mdc
	cancel_lru_locks osc

	# Each unlink below is what triggers the failure armed just before
	# it, following the same pattern as test_20a.
	echo "Inject failure..."
	echo "To simulate f0 lost MDT-object"
	#define OBD_FAIL_LFSCK_LOST_MDTOBJ	0x1616
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1616
	rm -f $DIR/$tdir/f0

	echo "To simulate the case of f1 lost MDT-object and "
	echo "the first OST-object in each PFL component"
	#define OBD_FAIL_LFSCK_LOST_SPEOBJ	0x161a
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161a
	rm -f $DIR/$tdir/f1

	echo "To simulate the case of f2 lost MDT-object and "
	echo "the second OST-object in each PFL component"
	do_facet $SINGLEMDS $LCTL set_param fail_val=1
	rm -f $DIR/$tdir/f2

	# NOTE(review): let the unlinks settle before the fail_loc is
	# cleared - confirm that this delay is needed/sufficient.
	sync
	sleep 2
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
	$START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query internal is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" 32 ||
			error "(4) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		local cur_status=$(do_facet ost${k} $LCTL get_param -n \
				obdfilter.$(facet_svc ost${k}).lfsck_layout |
				awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(5) OST${k} Expect 'completed', but got '$cur_status'"
	done

	# f0 keeps its 4 OST-objects, f1 and f2 keep 2 each.
	local repaired=$(do_facet mds1 $LCTL get_param -n \
			 mdd.$(facet_svc mds1).lfsck_layout |
			 awk '/^repaired_orphan/ { print $2 }')
	[ $repaired -eq 8 ] ||
		error "(6) Expect 8 fixed on mds1, but got: $repaired"

	#
	# ${fid0}-R-0 is the old f0
	#
	local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
	echo "Check $name, which is the old f0"

	$LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"

	local pattern=$($LFS getstripe -L -I1 $name)
	[[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
		error "(7.2.1) NOT expect pattern flag hole, but got $pattern"

	pattern=$($LFS getstripe -L -I2 $name)
	[[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
		error "(7.2.2) NOT expect pattern flag hole, but got $pattern"

	local stripes=$($LFS getstripe -c -I1 $name)
	[ $stripes -eq 2 ] ||
		error "(7.3.1) expect 2 stripes, but got $stripes"

	stripes=$($LFS getstripe -c -I2 $name)
	[ $stripes -eq 2 ] ||
		error "(7.3.2) expect 2 stripes, but got $stripes"

	local e_start=$($LFS getstripe -I1 $name |
			awk '/lcme_extent.e_start:/ { print $2 }')
	[ $e_start -eq 0 ] ||
		error "(7.4.1) expect the COMP1 start at 0, got $e_start"

	local e_end=$($LFS getstripe -I1 $name |
		      awk '/lcme_extent.e_end:/ { print $2 }')
	[ $e_end -eq 2097152 ] ||
		error "(7.4.2) expect the COMP1 end at 2097152, got $e_end"

	e_start=$($LFS getstripe -I2 $name |
		  awk '/lcme_extent.e_start:/ { print $2 }')
	[ $e_start -eq 2097152 ] ||
		error "(7.5.1) expect the COMP2 start at 2097152, got $e_start"

	e_end=$($LFS getstripe -I2 $name |
		awk '/lcme_extent.e_end:/ { print $2 }')
	[ "$e_end" = "EOF" ] ||
		error "(7.5.2) expect the COMP2 end at (EOF), got $e_end"

	local size=$(stat $name | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(7.6) expect the size $((4096 * $bcount)), but got $size"

	cat $name > /dev/null || error "(7.7) cannot read $name"

	echo "dummy" >> $name || error "(7.8) cannot write $name"

	chown $RUNAS_ID:$RUNAS_GID $name || error "(7.9) cannot chown on $name"

	touch $name || error "(7.10) cannot touch $name"

	rm -f $name || error "(7.11) cannot unlink $name"

	#
	# ${fid1}-R-0 contains the old f1's second stripe in each COMP
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
	echo "Check $name, it contains f1's second OST-object in each COMP"

	$LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"

	pattern=$($LFS getstripe -L -I1 $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(8.2.1) expect pattern flag hole, but got $pattern"

	pattern=$($LFS getstripe -L -I2 $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(8.2.2) expect pattern flag hole, but got $pattern"

	stripes=$($LFS getstripe -c -I1 $name)
	[ $stripes -eq 2 ] ||
		error "(8.3.1) expect 2 stripes, but got $stripes"

	stripes=$($LFS getstripe -c -I2 $name)
	[ $stripes -eq 2 ] ||
		error "(8.3.2) expect 2 stripes, but got $stripes"

	e_start=$($LFS getstripe -I1 $name |
		  awk '/lcme_extent.e_start:/ { print $2 }')
	[ $e_start -eq 0 ] ||
		error "(8.4.1) expect the COMP1 start at 0, got $e_start"

	e_end=$($LFS getstripe -I1 $name |
		awk '/lcme_extent.e_end:/ { print $2 }')
	[ $e_end -eq 2097152 ] ||
		error "(8.4.2) expect the COMP1 end at 2097152, got $e_end"

	e_start=$($LFS getstripe -I2 $name |
		  awk '/lcme_extent.e_start:/ { print $2 }')
	[ $e_start -eq 2097152 ] ||
		error "(8.5.1) expect the COMP2 start at 2097152, got $e_start"

	e_end=$($LFS getstripe -I2 $name |
		awk '/lcme_extent.e_end:/ { print $2 }')
	[ "$e_end" = "EOF" ] ||
		error "(8.5.2) expect the COMP2 end at (EOF), got $e_end"

	size=$(stat $name | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(8.6) expect the size $((4096 * $bcount)), but got $size"

	cat $name > /dev/null && error "(8.7) normal read $name should fail"

	# conv=noerror keeps dd going after a failed read and conv=sync pads
	# the failed block, so every block of a lost stripe is counted once.
	local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
			 bs=4096 2>&1 | grep "Input/output error" | wc -l)

	# The first stripe in each COMP was lost
	[ $failures -eq 512 ] ||
		error "(8.8) expect 512 IO failures, but get $failures"

	size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(8.9) expect the size $((4096 * $bcount)), but got $size"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
		error "(8.10) write to the LOV EA hole should fail"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
		error "(8.11) write to normal stripe should NOT fail"

	echo "foo" >> $name && error "(8.12) append write $name should fail"

	chown $RUNAS_ID:$RUNAS_GID $name || error "(8.13) cannot chown on $name"

	touch $name || error "(8.14) cannot touch $name"

	rm -f $name || error "(8.15) cannot unlink $name"

	#
	# ${fid2}-R-0 contains the old f2's first stripe in each COMP
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
	echo "Check $name, it contains f2's first stripe in each COMP"

	$LFS getstripe -v $name || error "(9.1) cannot getstripe on $name"

	pattern=$($LFS getstripe -L -I1 $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(9.2.1) expect pattern flag hole, but got $pattern"

	pattern=$($LFS getstripe -L -I2 $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(9.2.2) expect pattern flag hole, but got $pattern"

	stripes=$($LFS getstripe -c -I1 $name)
	[ $stripes -eq 2 ] ||
		error "(9.3.1) expect 2 stripes, but got $stripes"

	stripes=$($LFS getstripe -c -I2 $name)
	[ $stripes -eq 2 ] ||
		error "(9.3.2) expect 2 stripes, but got $stripes"

	e_start=$($LFS getstripe -I1 $name |
		  awk '/lcme_extent.e_start:/ { print $2 }')
	[ $e_start -eq 0 ] ||
		error "(9.4.1) expect the COMP1 start at 0, got $e_start"

	e_end=$($LFS getstripe -I1 $name |
		awk '/lcme_extent.e_end:/ { print $2 }')
	[ $e_end -eq 2097152 ] ||
		error "(9.4.2) expect the COMP1 end at 2097152, got $e_end"

	e_start=$($LFS getstripe -I2 $name |
		  awk '/lcme_extent.e_start:/ { print $2 }')
	[ $e_start -eq 2097152 ] ||
		error "(9.5.1) expect the COMP2 start at 2097152, got $e_start"

	e_end=$($LFS getstripe -I2 $name |
		awk '/lcme_extent.e_end:/ { print $2 }')
	[ "$e_end" = "EOF" ] ||
		error "(9.5.2) expect the COMP2 end at (EOF), got $e_end"

	size=$(stat $name | awk '/Size:/ { print $2 }')
	# The second stripe in COMP was lost, so we do not know there
	# have ever been some data before. 'stat' will regard it as
	# no data on the lost stripe.
	# The single trailing block lived on that lost stripe, which
	# leaves three full stripes.
	bcount=$((256 * 3))
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(9.6) expect size $((4096 * $bcount)), but got $size"

	cat $name > /dev/null &&
		error "(9.7) normal read $name should fail"

	failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
		   bs=4096 2>&1 | grep "Input/output error" | wc -l)
	[ $failures -eq 512 ] ||
		error "(9.8) expect 512 IO failures, but get $failures"

	size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
	# The second stripe in COMP was lost, so we do not know there
	# have ever been some data before. Since 'dd' skip failure,
	# it will regard the lost stripe contains data.
	# 512 readable blocks plus one padded block for each of the 512
	# failed reads.
	bcount=$((256 * 4))
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(9.9) expect the size $((4096 * $bcount)), but got $size"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
	seek=300 && error "(9.10) write to the LOV EA hole should fail"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
		error "(9.11) write to normal stripe should NOT fail"

	echo "foo" >> $name &&
		error "(9.12) append write $name should fail"

	chown $RUNAS_ID:$RUNAS_GID $name ||
		error "(9.13) cannot chown on $name"

	touch $name || error "(9.14) cannot touch $name"

	rm -f $name || error "(9.15) cannot unlink $name"
}
run_test 20b "Handle the orphan with dummy LOV EA slot properly - PFL case"
# test_21: start LFSCK on MDT0000 without a '-t' type option and check
# that both the namespace and the layout component report
# 'scanning-phase1'; then stop LFSCK again, also without naming a type.
test_21() {
	(( $MDS1_VERSION > $(version_code 2.5.59) )) ||
		skip "MDS older than 2.5.59, LU-4887"

	check_mount_and_prep
	createmany -o $DIR/$tdir/f 100 || error "(0) Fail to create 100 files"

	echo "Start all LFSCK components by default (-s 1)"
	do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -s 1 -r ||
		error "Fail to start LFSCK"

	echo "namespace LFSCK should be in 'scanning-phase1' status"
	local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
	[ "$STATUS" == "scanning-phase1" ] ||
		error "Expect namespace 'scanning-phase1', but got '$STATUS'"

	echo "layout LFSCK should be in 'scanning-phase1' status"
	STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
	[ "$STATUS" == "scanning-phase1" ] ||
		error "Expect layout 'scanning-phase1', but got '$STATUS'"

	echo "Stop all LFSCK components by default"
	do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 ||
		error "Fail to stop LFSCK"
}
run_test 21 "run all LFSCK components by default"
# test_22a: create foo/dummy on MDT0 with fail_loc 0x161e armed, then
# remove the 'guard' directory. Namespace LFSCK must report exactly one
# repaired unmatched pair and 'ls' on the child must work afterwards.
test_22a() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5511"

	# The inner quotes are escaped so that "..": is printed literally.
	echo "The parent_A references the child directory via some name entry,"
	echo "but the child directory back references another parent_B via its"
	echo "\"..\" name entry. The parent_B does not exist. Then the namespace"
	echo "LFSCK will repair the child directory's \"..\" name entry."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
	$LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"

	echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
	echo "The dummy's dotdot name entry references the guard."
	#define OBD_FAIL_LFSCK_BAD_PARENT	0x161e
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
	$LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
		error "(3) Fail to mkdir on MDT0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	# Removing the guard leaves the bad ".." without an existing target.
	rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"

	echo "Trigger namespace LFSCK to repair unmatched pairs"
	$START_NAMESPACE -A -r ||
		error "(5) Fail to start LFSCK for namespace"

	# NOTE(review): the trailing 6 looks like the step number used in
	# the helper's own failure message - confirm against its definition.
	wait_all_targets_blocked namespace completed 6

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^unmatched_pairs_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(7) Fail to repair unmatched pairs: $repaired"

	echo "'ls' should success after namespace LFSCK repairing"
	ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
		error "(8) ls should success."
}
run_test 22a "LFSCK can repair unmatched pairs (1)"
# test_22b: like test_22a, but the 'guard' directory is kept. fid2path on
# the child must not resolve to its real path before the repair and must
# resolve to it after namespace LFSCK reported one repaired unmatched pair.
test_22b() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5511"

	# The inner quotes are escaped so that "..": is printed literally.
	echo "The parent_A references the child directory via the name entry_B,"
	echo "but the child directory back references another parent_C via its"
	echo "\"..\" name entry. The parent_C exists, but there is no the name"
	echo "entry_B under the parent_C. Then the namespace LFSCK will repair"
	echo "the child directory's \"..\" name entry and its linkEA."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
	$LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"

	echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
	echo "and bad linkEA. The dummy's dotdot name entry references the"
	echo "guard. The dummy's linkEA references n non-exist name entry."
	#define OBD_FAIL_LFSCK_BAD_PARENT	0x161e
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
	$LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
		error "(3) Fail to mkdir on MDT0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	local dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
	echo "fid2path should NOT work on the dummy's FID $dummyfid"
	local dummyname=$($LFS fid2path $DIR $dummyfid)
	[ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
		error "(4) fid2path works unexpectedly."

	echo "Trigger namespace LFSCK to repair unmatched pairs"
	$START_NAMESPACE -A -r ||
		error "(5) Fail to start LFSCK for namespace"

	# NOTE(review): the trailing 6 looks like the step number used in
	# the helper's own failure message - confirm against its definition.
	wait_all_targets_blocked namespace completed 6

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^unmatched_pairs_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(7) Fail to repair unmatched pairs: $repaired"

	echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
	# dummyname is already local; assign without re-declaring it.
	dummyname=$($LFS fid2path $DIR $dummyfid)
	[ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
		error "(8) fid2path does not work"
}
run_test 22b "LFSCK can repair unmatched pairs (2)"
# Body of test 23a; the run_test 23a line closing this block registers it.
# Per the banner: a name entry whose MDT-object no longer exists (dangling)
# must be found by namespace LFSCK; the object is only expected back after
# the second run, which adds -C.
3913 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3914 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
3915 skip "MDS older than 2.6.50, LU-5512"
3918 echo "The name entry is there, but the MDT-object for such name "
3919 echo "entry does not exist. The namespace LFSCK should find out "
3920 echo "and repair the inconsistency as required."
3923 check_mount_and_prep
# d0 is created with MDT index 0 and its child d1 with MDT index 1.
3925 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3926 $LFS mkdir -i 1 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT1"
3928 echo "Inject failure stub on MDT1 to simulate dangling name entry"
3929 #define OBD_FAIL_LFSCK_DANGLING2 0x1620
# The fail_loc is set on facet mds2 only for the rmdir of d1, then cleared.
3930 do_facet mds2 $LCTL set_param fail_loc=0x1620
3931 rmdir $DIR/$tdir/d0/d1 || error "(3) Fail to rmdir d1"
3932 do_facet mds2 $LCTL set_param fail_loc=0
3934 echo "'ls' should fail because of dangling name entry"
# error fires only if ls unexpectedly succeeds.
3935 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(4) ls should fail."
3937 echo "Trigger namespace LFSCK to find out dangling name entry"
# First pass: started without -C.
3938 $START_NAMESPACE -A -r ||
3939 error "(5) Fail to start LFSCK for namespace"
3941 wait_all_targets_blocked namespace completed 6
# dangling_repaired is parsed from the lfsck_namespace output on $SINGLEMDS.
3943 local repaired=$($SHOW_NAMESPACE |
3944 awk '/^dangling_repaired/ { print $2 }')
3945 [ $repaired -eq 1 ] ||
3946 error "(7) Fail to repair dangling name entry: $repaired"
3948 echo "'ls' should fail because not re-create MDT-object by default"
3949 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(8) ls should fail."
3951 echo "Trigger namespace LFSCK again to repair dangling name entry"
# Second pass: same flags plus -C; afterwards ls on d1 is expected to work.
3952 $START_NAMESPACE -A -r -C ||
3953 error "(9) Fail to start LFSCK for namespace"
3955 wait_all_targets_blocked namespace completed 10
3957 repaired=$($SHOW_NAMESPACE |
3958 awk '/^dangling_repaired/ { print $2 }')
3959 [ $repaired -eq 1 ] ||
3960 error "(11) Fail to repair dangling name entry: $repaired"
3962 echo "'ls' should success after namespace LFSCK repairing"
3963 ls -ail $DIR/$tdir/d0/d1 > /dev/null || error "(12) ls should success."
3965 run_test 23a "LFSCK can repair dangling name entry (1)"
# Body of test 23b; the run_test 23b line closing this block registers it.
# Per the banner: hard link "foo" is injected so that it refers to an object
# that is then deleted; LFSCK (run with -C) first recreates that object and
# later replaces it with the real target, so foo must read back "dummy".
3968 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
3969 skip "MDS older than 2.6.50, LU-5512"
3972 echo "The objectA has multiple hard links, one of them corresponding"
3973 echo "to the name entry_B. But there is something wrong for the name"
3974 echo "entry_B and cause entry_B to references non-exist object_C."
3975 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3976 echo "as dangling, and re-create the lost object_C. When the LFSCK"
3977 echo "comes to the second-stage scanning, it will find that the"
3978 echo "former re-creating object_C is not proper, and will try to"
3979 echo "replace the object_C with the real object_A."
3982 check_mount_and_prep
3984 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
# The bare path2fid calls below only print the FID; nothing captures it.
3985 $LFS path2fid $DIR/$tdir/d0
# Ten filler files t*; they are unlinked again before f1 is removed.
3987 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
3989 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3990 $LFS path2fid $DIR/$tdir/d0/f0
3992 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
3993 $LFS path2fid $DIR/$tdir/d0/f1
# SEQ0/SEQ1 hold the first ':'-separated field of each file's FID.
3995 local SEQ0=$($LFS path2fid $DIR/$tdir/d0/f0 | awk -F':' '{print $1}')
3996 local SEQ1=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $1}')
# When the two fields differ, f0 is removed and created again.
3998 if [ "$SEQ0" != "$SEQ1" ]; then
3999 # To guarantee that the f0 and f1 are in the same FID seq
4000 rm -f $DIR/$tdir/d0/f0 ||
4001 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
4002 echo "dummy" > $DIR/$tdir/d0/f0 ||
4003 error "(3.2) Fail to touch on MDT0"
4004 $LFS path2fid $DIR/$tdir/d0/f0
# OID is the second FID field of f1, converted by printf %d to decimal.
4007 local OID=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $2}')
4008 OID=$(printf %d $OID)
4010 echo "Inject failure stub on MDT0 to simulate dangling name entry"
4011 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
# fail_val carries f1's OID while the hard link foo -> f0 is created;
# both fail_val and fail_loc are reset immediately afterwards.
4012 do_facet $SINGLEMDS $LCTL set_param fail_val=$OID fail_loc=0x1621
4013 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
4014 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
4016 # If there is creation after the dangling injection, it may re-use
4017 # the just released local object (inode) that is referenced by the
4018 # dangling name entry. It will fail the dangling injection.
4019 # So before deleting the target object for the dangling name entry,
4020 # remove some other objects to avoid the target object being reused
4021 # by some potential creations. LU-7429
4022 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
4024 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
4026 echo "'ls' should fail because of dangling name entry"
4027 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4028 error "(6) ls should fail."
4030 echo "Trigger namespace LFSCK to find out dangling name entry"
# Started with -r -C only (no -A here).
4031 $START_NAMESPACE -r -C ||
4032 error "(7) Fail to start LFSCK for namespace"
# Waits until the status field of lfsck_namespace reads "completed".
# NOTE(review): 32 is presumably the wait limit in seconds -- confirm
# against wait_update_facet.
4034 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4035 mdd.${MDT_DEV}.lfsck_namespace |
4036 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4038 error "(8) unexpected status"
# Both counters must be exactly 1: dangling_repaired and
# multiple_linked_repaired.
4041 local repaired=$($SHOW_NAMESPACE |
4042 awk '/^dangling_repaired/ { print $2 }')
4043 [ $repaired -eq 1 ] ||
4044 error "(9) Fail to repair dangling name entry: $repaired"
4046 repaired=$($SHOW_NAMESPACE |
4047 awk '/^multiple_linked_repaired/ { print $2 }')
4048 [ $repaired -eq 1 ] ||
4049 error "(10) Fail to drop the former created object: $repaired"
# foo must carry f0's content ("dummy") once the repair is done.
4051 local data=$(cat $DIR/$tdir/d0/foo)
4052 [ "$data" == "dummy" ] ||
4053 error "(11) The $DIR/$tdir/d0/foo is not recovered: $data"
4055 run_test 23b "LFSCK can repair dangling name entry (2)"
# Cleanup steps located just ahead of the test 23c body (the enclosing
# header line is not visible in this excerpt): reset fail_val/fail_loc on
# $SINGLEMDS, wait for the namespace LFSCK status to read "completed",
# then call stop_full_debug_logging.
4058 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
# NOTE(review): 32 is presumably the wait limit in seconds -- confirm
# against wait_update_facet.
4059 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4060 mdd.${MDT_DEV}.lfsck_namespace |
4061 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4063 error "(10) unexpected status"
4066 stop_full_debug_logging
# Body of test 23c; the run_test 23c line closing this block registers it.
# Per the banner: same injection as test 23b, but the object recreated by
# LFSCK is written to before the second-stage scan, so LFSCK must NOT
# replace it and foo must not read back "dummy".
4070 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4071 skip "MDS older than 2.6.50, LU-5512"
4074 echo "The objectA has multiple hard links, one of them corresponding"
4075 echo "to the name entry_B. But there is something wrong for the name"
4076 echo "entry_B and cause entry_B to references non-exist object_C."
4077 echo "In the first-stage scanning, the LFSCK will think the entry_B"
4078 echo "as dangling, and re-create the lost object_C. And then others"
4079 echo "modified the re-created object_C. When the LFSCK comes to the"
4080 echo "second-stage scanning, it will find that the former re-creating"
4081 echo "object_C maybe wrong and try to replace the object_C with the"
4082 echo "real object_A. But because object_C has been modified, so the"
4083 echo "LFSCK cannot replace it."
4086 start_full_debug_logging
4088 check_mount_and_prep
4090 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
# The FIDs of d0, f0 and f1 are captured and echoed for the test log.
4091 parent_fid="$($LFS path2fid $DIR/$tdir/d0)"
4092 echo "parent_fid=$parent_fid"
# Ten filler files t*; they are unlinked again before f1 is removed.
4094 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
4096 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
4097 f0_fid="$($LFS path2fid $DIR/$tdir/d0/f0)"
4098 echo "f0_fid=$f0_fid"
4100 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
4101 f1_fid="$($LFS path2fid $DIR/$tdir/d0/f1)"
4102 echo "f1_fid=$f1_fid"
# Compare only the part of each FID before its first ':' (the sequence).
# The FIDs live in f0_fid/f1_fid; the previous spelling fid_f0/fid_f1 named
# variables that are never set, so both sides were empty and this branch
# could never run. %%:* is used because ${var/pattern/} takes a glob, in
# which ":.*" does not mean "colon and everything after it".
4104 if [ "${f0_fid%%:*}" != "${f1_fid%%:*}" ]; then
4105 # To guarantee that the f0 and f1 are in the same FID seq
4106 rm -f $DIR/$tdir/d0/f0 ||
4107 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
4108 echo "dummy" > $DIR/$tdir/d0/f0 ||
4109 error "(3.2) Fail to touch on MDT0"
4110 f0_fid="$($LFS path2fid $DIR/$tdir/d0/f0)"
4111 echo "f0_fid=$f0_fid (replaced)"
# oid is the second ':'-separated field of f1's FID, passed on unchanged.
4114 local oid=$(awk -F':' '{ printf $2 }' <<< $f1_fid)
4116 echo "Inject failure stub on MDT0 to simulate dangling name entry"
4117 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
# fail_val carries f1's OID while the hard link foo -> f0 is created;
# both fail_val and fail_loc are reset immediately afterwards.
4118 do_facet $SINGLEMDS $LCTL set_param fail_val=$oid fail_loc=0x1621
4119 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
4120 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
4122 # If there is creation after the dangling injection, it may re-use
4123 # the just released local object (inode) that is referenced by the
4124 # dangling name entry. It will fail the dangling injection.
4125 # So before deleting the target object for the dangling name entry,
4126 # remove some other objects to avoid the target object being reused
4127 # by some potential creations. LU-7429
4128 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
4130 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
4132 echo "'ls' should fail because of dangling name entry"
4133 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4134 error "(6) ls should fail."
4136 #define OBD_FAIL_LFSCK_DELAY3 0x1602
# NOTE(review): presumably fail_val=10 is the injected delay that leaves a
# window to modify the recreated object -- confirm against the fail_loc.
4137 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
4139 echo "Trigger namespace LFSCK to find out dangling name entry"
4140 $START_NAMESPACE -r -C ||
4141 error "(7) Fail to start LFSCK for namespace"
# Wait for foo to be stat-able with size 0, i.e. a freshly recreated object.
4143 wait_update_facet client "stat -c%s $DIR/$tdir/d0/foo" "0" $LTIME || {
4144 # While unexpected by the test, it is valid for LFSCK to repair
4145 # the link to the original object before any data is written.
4146 local size=$(stat -c %s $DIR/$tdir/d0/foo)
# Size 6 with content "dummy" means foo already resolves to f0's data.
4148 if [ "$size" = "6" -a "$(<$DIR/$tdir/d0/foo)" = "dummy" ]; then
4149 log "LFSCK repaired file prematurely"
4154 stat $DIR/$tdir/d0/foo
4156 error "(8) unexpected size"
# Modify the recreated object, then drop the client's osc locks.
4159 echo "data" >> $DIR/$tdir/d0/foo || error "(9) Fail to write"
4160 cancel_lru_locks osc
4164 local repaired=$($SHOW_NAMESPACE |
4165 awk '/^dangling_repaired/ { print $2 }')
4166 [ $repaired -eq 1 ] ||
4167 error "(11) Fail to repair dangling name entry: $repaired"
# foo must NOT have been switched back to f0's content.
4169 local data=$(cat $DIR/$tdir/d0/foo)
4170 [ "$data" != "dummy" ] ||
4171 error "(12) The $DIR/$tdir/d0/foo should not be recovered"
4173 run_test 23c "LFSCK can repair dangling name entry (3)"
# Body of test 24; the run_test 24 line closing this block registers it.
# Per the banner: two MDT-objects carry a linkEA entry for the same name,
# but the name entry refers to only one of them; LFSCK must drop the stale
# linkEA and park the unreferenced object under .lustre/lost+found.
4176 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4177 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4178 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4179 skip "MDS older than 2.6.50, LU-5513"
4182 echo "Two MDT-objects back reference the same name entry via their"
4183 echo "each own linkEA entry, but the name entry only references one"
4184 echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
4185 echo "for the MDT-object that is not recognized. If such MDT-object"
4186 echo "has no other linkEA entry after the removing, then the LFSCK"
4187 echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."
4190 check_mount_and_prep
4192 mkdir_on_mdt -i1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
# The bare path2fid calls only print the FID for the log.
4194 mkdir $DIR/$tdir/d0/guard || error "(1) Fail to mkdir guard"
4195 $LFS path2fid $DIR/$tdir/d0/guard
4197 mkdir $DIR/$tdir/d0/dummy || error "(2) Fail to mkdir dummy"
4198 $LFS path2fid $DIR/$tdir/d0/dummy
# pfid is guard's FID when mds1 is not ldiskfs. NOTE(review): the second
# assignment (dummy's FID) presumably belongs to the else branch; the
# else/fi lines are not visible in this excerpt.
4201 if [ $mds1_FSTYPE != ldiskfs ]; then
4202 pfid=$($LFS path2fid $DIR/$tdir/d0/guard)
4204 pfid=$($LFS path2fid $DIR/$tdir/d0/dummy)
4207 touch $DIR/$tdir/d0/guard/foo ||
4208 error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"
4210 echo "Inject failure stub on MDT0 to simulate the case that"
4211 echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
4212 echo "that references $DIR/$tdir/d0/guard/foo."
4213 echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
4214 echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
4215 echo "there with the same linkEA entry as another MDT-object"
4216 echo "$DIR/$tdir/d0/guard/foo has"
4218 #define OBD_FAIL_LFSCK_MUL_REF 0x1622
# The fail_loc stays armed across both the mkdir and the rmdir of
# dummy/foo; cfid records the object's FID before its name is removed.
4219 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
4220 mkdir_on_mdt -i0 $DIR/$tdir/d0/dummy/foo ||
4221 error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
4222 $LFS path2fid $DIR/$tdir/d0/dummy/foo
4223 local cfid=$($LFS path2fid $DIR/$tdir/d0/dummy/foo)
4224 rmdir $DIR/$tdir/d0/dummy/foo ||
4225 error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
4226 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4228 echo "stat $DIR/$tdir/d0/dummy/foo should fail"
4229 stat $DIR/$tdir/d0/dummy/foo > /dev/null 2>&1 &&
4230 error "(6) stat successfully unexpectedly"
4232 echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
4233 $START_NAMESPACE -A -r ||
4234 error "(7) Fail to start LFSCK for namespace"
4236 wait_all_targets_blocked namespace completed 8
4238 local repaired=$($SHOW_NAMESPACE |
4239 awk '/^multiple_referenced_repaired/ { print $2 }')
4240 [ $repaired -eq 1 ] ||
4241 error "(9) Fail to repair multiple referenced name entry: $repaired"
4243 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
4244 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4245 error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# The orphan is looked up by the name "<cfid>-<pfid>-D-0".
4247 local cname="$cfid-$pfid-D-0"
4248 ls -ail $MOUNT/.lustre/lost+found/MDT0000/$cname ||
4249 error "(11) .lustre/lost+found/MDT0000/ should not be empty"
4251 run_test 24 "LFSCK can repair multiple-referenced name entry"
# Body of test 25; the run_test 25 line closing this block registers it.
# Per the banner: a name entry stores a file type that disagrees with the
# referenced object, and namespace LFSCK must correct the entry.
# Skipped unless mds1 uses ldiskfs.
4254 [[ $mds1_FSTYPE == ldiskfs ]] || skip "only ldiskfs fixes dirent type"
4255 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4256 skip "MDS older than 2.6.50, LU-5515"
4259 echo "The file type in the name entry does not match the file type"
4260 echo "claimed by the referenced object. Then the LFSCK will update"
4261 echo "the file type in the name entry."
4264 check_mount_and_prep
4266 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4268 echo "Inject failure stub on MDT0 to simulate the case that"
4269 echo "the file type stored in the name entry is wrong."
4271 #define OBD_FAIL_LFSCK_BAD_TYPE 0x1623
# The fail_loc is armed only while foo is created, then cleared.
4272 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
4273 touch $DIR/$tdir/d0/foo || error "(2) Fail to touch $DIR/$tdir/d0/foo"
4274 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4276 echo "Trigger namespace LFSCK to repair bad file type in the name entry"
# Started with -r only (no -A), so completion is polled on $SINGLEMDS.
4277 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace"
# NOTE(review): 32 is presumably the wait limit in seconds -- confirm
# against wait_update_facet.
4279 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4280 mdd.${MDT_DEV}.lfsck_namespace |
4281 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4283 error "(4) unexpected status"
4286 local repaired=$($SHOW_NAMESPACE |
4287 awk '/^bad_file_type_repaired/ { print $2 }')
4288 [ $repaired -eq 1 ] ||
4289 error "(5) Fail to repair bad file type in name entry: $repaired"
# Final sanity check: listing d0 must succeed after the repair.
4291 ls -ail $DIR/$tdir/d0 || error "(6) Fail to 'ls' the $DIR/$tdir/d0"
4293 run_test 25 "LFSCK can repair bad file type in the name entry"
# Body of test 26a; the run_test 26a line closing this block registers it.
# Per the banner: the name entry of foo is lost while the object and its
# linkEA remain; namespace LFSCK must put the entry back, and foo must
# come back with the same FID it had before.
4296 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4297 skip "MDS older than 2.6.50, LU-5516"
4300 echo "The local name entry back referenced by the MDT-object is lost."
4301 echo "The namespace LFSCK will add the missing local name entry back"
4302 echo "to the normal namespace."
4305 check_mount_and_prep
4307 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4308 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
# Remember foo's FID for the comparison at the end.
4309 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
# A second hard link (dummy) is created before foo's name is removed.
4311 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
4312 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4314 echo "Inject failure stub on MDT0 to simulate the case that"
4315 echo "foo's name entry will be removed, but the foo's object"
4316 echo "and its linkEA are kept in the system."
4318 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fail_loc is armed only around the rm of foo, then cleared.
4319 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4320 rm -f $DIR/$tdir/d0/foo || error "(4) Fail to unlink $DIR/$tdir/d0/foo"
4321 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4323 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4324 error "(5) 'ls' should fail"
4326 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4327 $START_NAMESPACE -r -A ||
4328 error "(6) Fail to start LFSCK for namespace"
4330 wait_all_targets_blocked namespace completed 7
4332 local repaired=$($SHOW_NAMESPACE |
4333 awk '/^lost_dirent_repaired/ { print $2 }')
4334 [ $repaired -eq 1 ] ||
4335 error "(8) Fail to repair lost dirent: $repaired"
4337 ls -ail $DIR/$tdir/d0/foo ||
4338 error "(9) Fail to 'ls' $DIR/$tdir/d0/foo"
# The restored entry must resolve to the original FID.
4340 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
4341 [ "$foofid" == "$foofid2" ] ||
4342 error "(10) foo's FID changed: $foofid, $foofid2"
4344 run_test 26a "LFSCK can add the missing local name entry back to the namespace"
# Body of test 26b; the run_test 26b line closing this block registers it.
# Cross-MDT variant of the lost-name-entry case: d0 is created with MDT
# index 1 and its child directory foo with MDT index 0. After the entry
# is removed under the fail_loc, LFSCK must restore it with the same FID.
4347 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4348 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4349 skip "MDS older than 2.6.50, LU-5516"
4352 echo "The remote name entry back referenced by the MDT-object is lost."
4353 echo "The namespace LFSCK will add the missing remote name entry back"
4354 echo "to the normal namespace."
4357 check_mount_and_prep
4359 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4360 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
# Remember foo's FID for the comparison at the end.
4361 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
4363 echo "Inject failure stub on MDT0 to simulate the case that"
4364 echo "foo's name entry will be removed, but the foo's object"
4365 echo "and its linkEA are kept in the system."
4367 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fail_loc is armed only around the rmdir of foo, then cleared.
4368 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4369 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
4370 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4372 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4373 error "(4) 'ls' should fail"
4375 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4376 $START_NAMESPACE -r -A ||
4377 error "(5) Fail to start LFSCK for namespace"
4379 wait_all_targets_blocked namespace completed 6
4381 local repaired=$($SHOW_NAMESPACE |
4382 awk '/^lost_dirent_repaired/ { print $2 }')
4383 [ $repaired -eq 1 ] ||
4384 error "(7) Fail to repair lost dirent: $repaired"
4386 ls -ail $DIR/$tdir/d0/foo ||
4387 error "(8) Fail to 'ls' $DIR/$tdir/d0/foo"
# The restored entry must resolve to the original FID.
4389 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
4390 [ "$foofid" == "$foofid2" ] ||
4391 error "(9) foo's FID changed: $foofid, $foofid2"
4393 run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
# Body of test 27a; the run_test 27a line closing this block registers it.
# Per the banner: foo's object and linkEA survive while its name entry,
# its other hard link and the parent directory d0 are all removed; LFSCK
# must then surface an orphan under .lustre/lost+found/MDT0000/.
4396 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4397 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4398 skip "MDS older than 2.6.50, LU-5516"
4401 echo "The local parent referenced by the MDT-object linkEA is lost."
4402 echo "The namespace LFSCK will re-create the lost parent as orphan."
4405 check_mount_and_prep
4407 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4408 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4409 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
4410 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4412 echo "Inject failure stub on MDT0 to simulate the case that"
4413 echo "foo's name entry will be removed, but the foo's object"
4414 echo "and its linkEA are kept in the system. And then remove"
4415 echo "another hard link and the parent directory."
4417 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# Both unlinks (foo and dummy) happen while the fail_loc is armed.
4418 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4419 rm -f $DIR/$tdir/d0/foo ||
4420 error "(4) Fail to unlink $DIR/$tdir/d0/foo"
4421 rm -f $DIR/$tdir/d0/dummy ||
4422 error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
4423 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# The parent directory itself is removed after the fail_loc is cleared.
4425 rm -rf $DIR/$tdir/d0 || error "(5) Fail to unlink the dir d0"
4426 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(6) 'ls' should fail"
4428 echo "Trigger namespace LFSCK to repair the lost parent"
4429 $START_NAMESPACE -r -A ||
4430 error "(6) Fail to start LFSCK for namespace"
4432 wait_all_targets_blocked namespace completed 7
4434 local repaired=$($SHOW_NAMESPACE |
4435 awk '/^lost_dirent_repaired/ { print $2 }')
4436 [ $repaired -eq 1 ] ||
4437 error "(8) Fail to repair lost dirent: $repaired"
4439 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
4440 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4441 error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4443 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# The -name pattern is quoted so find receives it verbatim; unquoted, the
# shell could expand *-P-* against the current directory first. cname is
# declared local so it does not leak into later tests.
4445 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-P-*')
4446 [ ! -z "$cname" ] ||
4447 error "(10) .lustre/lost+found/MDT0000/ should not be empty"
4449 run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
# Body of test 27b; the run_test 27b line closing this block registers it.
# Cross-MDT variant of the lost-parent case: d0 is created with MDT index 1
# and its child foo with MDT index 0. foo's name entry and then d0 are
# removed; LFSCK must surface an orphan under .lustre/lost+found/MDT0001/.
4452 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4453 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4454 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4455 skip "MDS older than 2.6.50, LU-5516"
4458 echo "The remote parent referenced by the MDT-object linkEA is lost."
4459 echo "The namespace LFSCK will re-create the lost parent as orphan."
4462 check_mount_and_prep
4464 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4465 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
# Prints d0's FID for the log only; nothing captures it.
4467 $LFS path2fid $DIR/$tdir/d0
4469 echo "Inject failure stub on MDT0 to simulate the case that"
4470 echo "foo's name entry will be removed, but the foo's object"
4471 echo "and its linkEA are kept in the system. And then remove"
4472 echo "the parent directory."
4474 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fail_loc is armed only around the rmdir of foo, then cleared.
4475 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4476 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
4477 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4479 rmdir $DIR/$tdir/d0 || error "(4) Fail to unlink the dir d0"
4480 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(5) 'ls' should fail"
4482 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4483 $START_NAMESPACE -r -A ||
4484 error "(6) Fail to start LFSCK for namespace"
4486 wait_all_targets_blocked namespace completed 7
4488 local repaired=$($SHOW_NAMESPACE |
4489 awk '/^lost_dirent_repaired/ { print $2 }')
4490 [ $repaired -eq 1 ] ||
4491 error "(8) Fail to repair lost dirent: $repaired"
4493 ls -ail $MOUNT/.lustre/lost+found/
4495 echo "There should be an orphan under .lustre/lost+found/MDT0001/"
4496 [ -d $MOUNT/.lustre/lost+found/MDT0001 ] ||
4497 error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"
4499 ls -ail $MOUNT/.lustre/lost+found/MDT0001/
# The -name pattern is quoted so find receives it verbatim; unquoted, the
# shell could expand *-P-* against the current directory first. cname is
# declared local so it does not leak into later tests.
4501 local cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-P-*')
4502 [ ! -z "$cname" ] ||
4503 error "(10) .lustre/lost+found/MDT0001/ should not be empty"
4505 run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
# Body of test 28; the run_test 28 line closing this block registers it.
# Two name entries are lost, one per directory tree (d1/a1 and d2/a2).
# The first LFSCK run is started with a fail_loc armed on mds2 and must
# end "partial" on mds1 with 0 repairs there and 1 on mds2; the second,
# clean run must end with 1 repair on mds1 and 0 on mds2.
4508 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4509 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4510 skip "MDS older than 2.6.50, LU-5506"
4513 echo "The target name entry is lost. The LFSCK should insert the"
4514 echo "orphan MDT-object under .lustre/lost+found/MDTxxxx. But if"
4515 echo "the MDT (on which the orphan MDT-object resides) has ever"
4516 echo "failed to respond some name entry verification during the"
4517 echo "first stage-scanning, then the LFSCK should skip to handle"
4518 echo "orphan MDT-object on this MDT. But other MDTs should not"
4522 check_mount_and_prep
# d1 uses MDT index 0 with children on index 1; d2 is the mirror image.
# None of these mkdir calls is checked for failure.
4523 $LFS mkdir -i 0 $DIR/$tdir/d1
4524 $LFS mkdir -i 1 $DIR/$tdir/d1/a1
4525 $LFS mkdir -i 1 $DIR/$tdir/d1/a2
4527 $LFS mkdir -i 1 $DIR/$tdir/d2
4528 $LFS mkdir -i 0 $DIR/$tdir/d2/a1
4529 $LFS mkdir -i 0 $DIR/$tdir/d2/a2
4531 echo "Inject failure stub on MDT0 to simulate the case that"
4532 echo "d1/a1's name entry will be removed, but the d1/a1's object"
4533 echo "and its linkEA are kept in the system. And the case that"
4534 echo "d2/a2's name entry will be removed, but the d2/a2's object"
4535 echo "and its linkEA are kept in the system."
4537 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fail_loc is armed on both mds1 and mds2 for the two rmdir calls.
4538 do_facet mds1 $LCTL set_param fail_loc=0x1624
4539 do_facet mds2 $LCTL set_param fail_loc=0x1624
4540 rmdir $DIR/$tdir/d1/a1 || error "(1) Fail to rmdir $DIR/$tdir/d1/a1"
4541 rmdir $DIR/$tdir/d2/a2 || error "(2) Fail to rmdir $DIR/$tdir/d2/a2"
4542 do_facet mds1 $LCTL set_param fail_loc=0
4543 do_facet mds2 $LCTL set_param fail_loc=0
4545 cancel_lru_locks mdc
4546 cancel_lru_locks osc
4548 echo "Inject failure, to simulate the MDT0 fail to handle"
4549 echo "MDT1 LFSCK request during the first-stage scanning."
4550 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
# This fail_loc is set on mds2 and stays armed until both status waits
# below have finished.
4551 do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0
4553 echo "Trigger namespace LFSCK on all devices to find out orphan object"
4554 $START_NAMESPACE -r -A ||
4555 error "(3) Fail to start LFSCK for namespace"
# Expected end states of the first run: mds1 "partial", mds2 "completed".
4557 wait_update_facet mds1 "$LCTL get_param -n \
4558 mdd.$(facet_svc mds1).lfsck_namespace |
4559 awk '/^status/ { print \\\$2 }'" "partial" 32 || {
4560 error "(4) mds1 is not the expected 'partial'"
4563 wait_update_facet mds2 "$LCTL get_param -n \
4564 mdd.$(facet_svc mds2).lfsck_namespace |
4565 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4566 error "(5) mds2 is not the expected 'completed'"
4569 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
# lost_dirent_repaired is read per MDS here rather than via $SHOW_NAMESPACE.
4571 local repaired=$(do_facet mds1 $LCTL get_param -n \
4572 mdd.$(facet_svc mds1).lfsck_namespace |
4573 awk '/^lost_dirent_repaired/ { print $2 }')
4574 [ $repaired -eq 0 ] ||
4575 error "(6) Expect 0 fixed on mds1, but got: $repaired"
4577 repaired=$(do_facet mds2 $LCTL get_param -n \
4578 mdd.$(facet_svc mds2).lfsck_namespace |
4579 awk '/^lost_dirent_repaired/ { print $2 }')
4580 [ $repaired -eq 1 ] ||
4581 error "(7) Expect 1 fixed on mds2, but got: $repaired"
4583 echo "Trigger namespace LFSCK on all devices again to cleanup"
# Second run, with no fail_loc armed; the expected counts swap.
4584 $START_NAMESPACE -r -A ||
4585 error "(8) Fail to start LFSCK for namespace"
4587 wait_all_targets_blocked namespace completed 9
4589 local repaired=$(do_facet mds1 $LCTL get_param -n \
4590 mdd.$(facet_svc mds1).lfsck_namespace |
4591 awk '/^lost_dirent_repaired/ { print $2 }')
4592 [ $repaired -eq 1 ] ||
4593 error "(10) Expect 1 fixed on mds1, but got: $repaired"
4595 repaired=$(do_facet mds2 $LCTL get_param -n \
4596 mdd.$(facet_svc mds2).lfsck_namespace |
4597 awk '/^lost_dirent_repaired/ { print $2 }')
4598 [ $repaired -eq 0 ] ||
4599 error "(11) Expect 0 fixed on mds2, but got: $repaired"
4601 run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
# Body of test 29a. Its run_test registration at the end of this block is
# commented out, so this body is not registered from here.
# Scenario per the banner: foo's nlink is made larger than its number of
# name entries (3 instead of 2) and LFSCK is expected to bring it back to 2.
4604 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4605 skip "MDS older than 2.6.50, LU-5517"
4608 echo "The object's nlink attribute is larger than the object's known"
4609 echo "name entries count. The LFSCK will repair the object's nlink"
4610 echo "attribute to match the known name entries count"
4613 check_mount_and_prep
4615 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4616 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4618 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4619 echo "nlink attribute is larger than its name entries count."
4621 #define OBD_FAIL_LFSCK_MORE_NLINK 0x1625
# The fail_loc is armed only while the hard link h1 is created.
4622 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
4623 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4624 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4625 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Locks are dropped before stat; the injected link count must read 3.
4627 cancel_lru_locks mdc
4628 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
4629 [ $count -eq 3 ] || error "(4) Cannot inject error: $count"
4631 echo "Trigger namespace LFSCK to repair the nlink count"
4632 $START_NAMESPACE -r -A ||
4633 error "(5) Fail to start LFSCK for namespace"
4635 wait_all_targets_blocked namespace completed 6
4637 local repaired=$($SHOW_NAMESPACE |
4638 awk '/^nlinks_repaired/ { print $2 }')
4639 [ $repaired -eq 1 ] ||
4640 error "(7) Fail to repair nlink count: $repaired"
# After the repair the link count must read 2 (foo and h1).
4642 cancel_lru_locks mdc
4643 count=$(stat --format=%h $DIR/$tdir/d0/foo)
4644 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
4646 # Disable 29a, we only allow nlink to be updated if the known linkEA
4647 # entries is larger than nlink count.
4649 #run_test 29a "LFSCK can repair bad nlink count (1)"
# Body of test 29b; the run_test 29b line closing this block registers it.
# Scenario per the banner: foo's nlink is made smaller than its number of
# name entries (1 instead of 2) and LFSCK must raise it back to 2.
4652 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4653 skip "MDS older than 2.6.50, LU-5517"
4656 echo "The object's nlink attribute is smaller than the object's known"
4657 echo "name entries count. The LFSCK will repair the object's nlink"
4658 echo "attribute to match the known name entries count"
4661 check_mount_and_prep
4663 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4664 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4666 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4667 echo "nlink attribute is smaller than its name entries count."
4669 #define OBD_FAIL_LFSCK_LESS_NLINK 0x1626
# The fail_loc is armed only while the hard link h1 is created.
4670 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
4671 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4672 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4673 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Locks are dropped before stat; the injected link count must read 1.
4675 cancel_lru_locks mdc
4676 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
4677 [ $count -eq 1 ] || error "(4) Cannot inject error: $count"
4679 echo "Trigger namespace LFSCK to repair the nlink count"
4680 $START_NAMESPACE -r -A ||
4681 error "(5) Fail to start LFSCK for namespace"
4683 wait_all_targets_blocked namespace completed 6
4685 local repaired=$($SHOW_NAMESPACE |
4686 awk '/^nlinks_repaired/ { print $2 }')
4687 [ $repaired -eq 1 ] ||
4688 error "(7) Fail to repair nlink count: $repaired"
# After the repair the link count must read 2 (foo and h1).
4690 cancel_lru_locks mdc
4691 count=$(stat --format=%h $DIR/$tdir/d0/foo)
4692 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
4694 run_test 29b "LFSCK can repair bad nlink count (2)"
# Body of test 29c; the run_test 29c line closing this block registers it.
# Flow: create 150 hard links to overflow the linkEA, check that migration
# is refused, remove 100 links, check that migration is still refused, run
# namespace LFSCK, then check that migration works. The migrate checks are
# only made when MDSCOUNT >= 2.
4698 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4699 skip "MDS older than 2.6.50, LU-5517"
4702 echo "The namespace LFSCK will create many hard links to the target"
4703 echo "file as to exceed the linkEA size limitation. Under such case"
4704 echo "the linkEA will be marked as overflow that will prevent the"
4705 echo "target file to be migrated. Then remove some hard links to"
4706 echo "make the left hard links to be held within the linkEA size"
4707 echo "limitation. But before the namespace LFSCK adding all the"
4708 echo "missed linkEA entries back, the overflow mark (timestamp)"
4709 echo "will not be cleared."
4712 check_mount_and_prep
# guard holds the target file f0; foo (created on the highest MDT index)
# holds the hard links to it.
4714 mkdir -p $DIR/$tdir/guard || error "(0.1) Fail to mkdir"
4715 $LFS mkdir -i $((MDSCOUNT - 1)) $DIR/$tdir/foo ||
4716 error "(0.2) Fail to mkdir"
4717 touch $DIR/$tdir/guard/f0 || error "(1) Fail to create"
# oldfid is the reference used by every later "was it migrated?" check.
4718 local oldfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4720 # define MAX_LINKEA_SIZE 4096
4721 # sizeof(link_ea_header) = 24
4722 # sizeof(link_ea_entry) = 18
4723 # nlink_min=$(((MAX_LINKEA_SIZE - sizeof(link_ea_header)) /
4724 # (sizeof(link_ea_entry) + name_length))
4725 # If the average name length is 12 bytes, then 150 hard links
4726 # is totally enough to overflow the linkEA
4727 echo "Create 150 hard links should succeed although the linkEA overflow"
4728 createmany -l $DIR/$tdir/guard/f0 $DIR/$tdir/foo/ttttttttttt 150 ||
4729 error "(2) Fail to hard link"
4731 cancel_lru_locks mdc
# Check 1: migrate must fail and f0's FID must be unchanged.
4732 if [ $MDSCOUNT -ge 2 ]; then
4733 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null &&
4734 error "(3.1) Migrate should fail"
4736 echo "The object with linkEA overflow should NOT be migrated"
4737 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4738 [ "$newfid" == "$oldfid" ] ||
4739 error "(3.2) Migrate should fail: $newfid != $oldfid"
4742 # Remove 100 hard links, then the linkEA should have space
4743 # to hold the missed linkEA entries.
4744 echo "Remove 100 hard links to save space for the missed linkEA entries"
4745 unlinkmany $DIR/$tdir/foo/ttttttttttt 100 || error "(4) Fail to unlink"
# Check 2: with 50 links left, migrate must still fail.
4747 if [ $MDSCOUNT -ge 2 ]; then
4748 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null &&
4749 error "(5.1) Migrate should fail"
4751 # The overflow timestamp is still there, so migration will fail.
4752 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4753 [ "$newfid" == "$oldfid" ] ||
4754 error "(5.2) Migrate should fail: $newfid != $oldfid"
4757 # sleep 3 seconds to guarantee that the overflow is recognized
4760 echo "Trigger namespace LFSCK to clear the overflow timestamp"
4761 $START_NAMESPACE -r -A ||
4762 error "(6) Fail to start LFSCK for namespace"
4764 wait_all_targets_blocked namespace completed 7
# Expect exactly one linkea_overflow_cleared and no nlinks_repaired.
4766 local repaired=$($SHOW_NAMESPACE |
4767 awk '/^linkea_overflow_cleared/ { print $2 }')
4768 [ $repaired -eq 1 ] ||
4769 error "(8) Fail to clear linkea overflow: $repaired"
4771 repaired=$($SHOW_NAMESPACE |
4772 awk '/^nlinks_repaired/ { print $2 }')
4773 [ $repaired -eq 0 ] ||
4774 error "(9) Unexpected nlink repaired: $repaired"
# Check 3: migrate must now succeed and f0's FID must differ from oldfid.
4776 if [ $MDSCOUNT -ge 2 ]; then
4777 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
4778 error "(10.1) Migrate failure"
4780 # Migration should succeed after clear the overflow timestamp.
4781 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4782 [ "$newfid" != "$oldfid" ] ||
4783 error "(10.2) Migrate should succeed"
4785 ls -l $DIR/$tdir/foo > /dev/null ||
4786 error "(11) 'ls' failed after migration"
# Final cleanup of the target file and the link directory.
4789 rm -f $DIR/$tdir/guard/f0 || error "(12) Fail to unlink f0"
4790 rm -rf $DIR/$tdir/foo || error "(13) Fail to rmdir foo"
4792 run_test 29c "verify linkEA size limitation"
# Test 30: recover orphans from the backend /lost+found via namespace LFSCK.
# Only runs on an ldiskfs MDT, without FILESET, on an MDS newer than 2.6.50
# (LU-5518). Objects are orphaned with fail_loc injections, e2fsck moves them
# to the backend /lost+found, and LFSCK must bring them back: f0 under its
# original parent, d0 (no linkEA) under .lustre/lost+found/MDT0000/.
4795 [[ $mds1_FSTYPE == ldiskfs ]] || skip "only ldiskfs has lost+found"
4796 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4797 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4798 skip "MDS older than 2.6.50, LU-5518"
4801 echo "The namespace LFSCK will move the orphans from backend"
4802 echo "/lost+found directory to normal client visible namespace"
4803 echo "or to global visible ./lustre/lost+found/MDTxxxx/ directory"
4806 check_mount_and_prep
4808 $LFS mkdir -i 0 $DIR/$tdir/foo || error "(1) Fail to mkdir foo"
4809 touch $DIR/$tdir/foo/f0 || error "(2) Fail to touch f0"
4811 echo "Inject failure stub on MDT0 to simulate the case that"
4812 echo "directory d0 has no linkEA entry, then the LFSCK will"
4813 echo "move it into .lustre/lost+found/MDTxxxx/ later."
# The fail_loc is active only while d0 is created, then cleared again.
4815 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
4816 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
4817 mkdir $DIR/$tdir/foo/d0 || error "(3) Fail to mkdir d0"
4818 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Parent and child FIDs are used later to build d0's lost+found name.
4820 local pfid=$($LFS path2fid $DIR/$tdir/foo)
4821 local cfid=$($LFS path2fid $DIR/$tdir/foo/d0)
4823 touch $DIR/$tdir/foo/d0/f1 || error "(4) Fail to touch f1"
4824 mkdir $DIR/$tdir/foo/d0/d1 || error "(5) Fail to mkdir d1"
4826 echo "Inject failure stub on MDT0 to simulate the case that the"
4827 echo "object's name entry will be removed, but not destroy the"
4828 echo "object. Then backend e2fsck will handle it as orphan and"
4829 echo "add them into the backend /lost+found directory."
# Remove the four name entries (d1, f1, d0, f0) with the fail_loc set.
4831 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4832 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4833 rmdir $DIR/$tdir/foo/d0/d1 || error "(6) Fail to rmdir d1"
4834 rm -f $DIR/$tdir/foo/d0/f1 || error "(7) Fail to unlink f1"
4835 rmdir $DIR/$tdir/foo/d0 || error "(8) Fail to rmdir d0"
4836 rm -f $DIR/$tdir/foo/f0 || error "(9) Fail to unlink f0"
4837 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Unmount the client and stop the MDT, run e2fsck on the MDT device, then
# restart the MDT with the noscrub mount options.
4839 umount_client $MOUNT || error "(10) Fail to stop client!"
4841 stop $SINGLEMDS || error "(11) Fail to stop MDT0"
4844 run_e2fsck $(facet_host $SINGLEMDS) $MDT_DEVNAME "-y" ||
4845 error "(12) Fail to run e2fsck"
4847 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
4848 error "(13) Fail to start MDT0"
4850 echo "Trigger namespace LFSCK to recover backend orphans"
4851 $START_NAMESPACE -r -A ||
4852 error "(14) Fail to start LFSCK for namespace"
4854 wait_all_targets_blocked namespace completed 15
# At least the four orphaned objects must have been moved out of the
# backend lost+found.
4856 local repaired=$($SHOW_NAMESPACE |
4857 awk '/^local_lost_found_moved/ { print $2 }')
4858 [ $repaired -ge 4 ] ||
4859 error "(16) Fail to recover backend orphans: $repaired"
4861 mount_client $MOUNT || error "(17) Fail to start client!"
4863 stat $DIR/$tdir/foo/f0 || error "(18) f0 is not recovered"
4865 ls -ail $MOUNT/.lustre/lost+found/
4867 echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
4868 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4869 error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4871 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# cname is a fixed, non-empty path, so an emptiness test can never fail;
# check that the recovered directory actually exists instead.
4873 local cname=$MOUNT/.lustre/lost+found/MDT0000/${cfid}-${pfid}-D-0
4874 [ -d "$cname" ] || error "(20) d0 is not recovered"
4876 stat ${cname}/d1 || error "(21) d1 is not recovered"
4877 stat ${cname}/f1 || error "(22) f1 is not recovered"
4879 run_test 30 "LFSCK can recover the orphans from backend /lost+found"
# Test 31a: repair name entries with a bad name hash in a striped directory.
# Needs at least 2 MDTs and an MDS newer than 2.6.50 (LU-5519). Entries are
# created with a client-side fail_loc (fail_val=0), the namespace LFSCK is
# run, and every entry must then be usable and removable.
4882 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4883 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4884 skip "MDS older than 2.6.50, LU-5519"
4887 echo "For the name entry under a striped directory, if the name"
4888 echo "hash does not match the shard, then the LFSCK will repair"
4889 echo "the bad name entry"
4892 check_mount_and_prep
# Striped directory with its first stripe on MDT index 0 and one stripe per
# MDT.
4894 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4895 error "(1) Fail to create striped directory"
4897 echo "Inject failure stub on client to simulate the case that"
4898 echo "some name entry should be inserted into other non-first"
4899 echo "shard, but inserted into the first shard by wrong"
# The fail_loc is set locally (plain $LCTL, no do_facet) and is active only
# while the MDSCOUNT sub-directories d0..d(MDSCOUNT-1) are created.
4901 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
4902 $LCTL set_param fail_loc=0x1628 fail_val=0
4903 createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
4904 error "(2) Fail to create file under striped directory"
4905 $LCTL set_param fail_loc=0 fail_val=0
4907 echo "Trigger namespace LFSCK to repair bad name hash"
4908 $START_NAMESPACE -r -A ||
4909 error "(3) Fail to start LFSCK for namespace"
4911 wait_all_targets_blocked namespace completed 4
# At least one bad name hash must be reported as repaired.
4913 local repaired=$($SHOW_NAMESPACE |
4914 awk '/^name_hash_repaired/ { print $2 }')
4915 [ $repaired -ge 1 ] ||
4916 error "(5) Fail to repair bad name hash: $repaired"
# Count the entries that "lfs find -H badtype" reports for striped_dir.
# NOTE(review): no test on $rc is visible ahead of the error() call below --
# confirm that the guard condition is present.
4918 local rc=$($LFS find -H badtype $DIR/$tdir/striped_dir | wc -l)
4920 error "Fail to find flag bad type: $rc"
# Remount the client before verifying the entries.
4922 umount_client $MOUNT || error "(6) umount failed"
4923 mount_client $MOUNT || error "(7) mount failed"
# Every created sub-directory must be stat-able and removable.
4925 for ((i = 0; i < $MDSCOUNT; i++)); do
4926 stat $DIR/$tdir/striped_dir/d$i ||
4927 error "(8) Fail to stat d$i after LFSCK"
4928 rmdir $DIR/$tdir/striped_dir/d$i ||
4929 error "(9) Fail to unlink d$i after LFSCK"
4932 rmdir $DIR/$tdir/striped_dir ||
4933 error "(10) Fail to remove the striped directory after LFSCK"
4935 run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
# Test 31b: same scenario as 31a but with fail_val=1 and MDSCOUNT * 5
# entries; the repair counter is read from mds2 instead of $SINGLEMDS.
# Needs at least 2 MDTs and an MDS newer than 2.6.50 (LU-5519).
4938 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4939 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4940 skip "MDS older than 2.6.50, LU-5519"
4943 echo "For the name entry under a striped directory, if the name"
4944 echo "hash does not match the shard, then the LFSCK will repair"
4945 echo "the bad name entry"
4948 check_mount_and_prep
4950 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4951 error "(1) Fail to create striped directory"
4953 echo "Inject failure stub on client to simulate the case that"
4954 echo "some name entry should be inserted into other non-second"
4955 echo "shard, but inserted into the secod shard by wrong"
# The fail_loc is set locally and is active only while the entries are
# created.
4957 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
4958 $LCTL set_param fail_loc=0x1628 fail_val=1
4959 createmany -d $DIR/$tdir/striped_dir/d $((MDSCOUNT * 5)) ||
4960 error "(2) Fail to create file under striped directory"
4961 $LCTL set_param fail_loc=0 fail_val=0
4963 echo "Trigger namespace LFSCK to repair bad name hash"
4964 $START_NAMESPACE -r -A ||
4965 error "(3) Fail to start LFSCK for namespace"
4967 wait_all_targets_blocked namespace completed 4
# Read name_hash_repaired from mds2's lfsck_namespace; at least one repair
# is required.
4969 local repaired=$(do_facet mds2 $LCTL get_param -n \
4970 mdd.$(facet_svc mds2).lfsck_namespace |
4971 awk '/^name_hash_repaired/ { print $2 }')
4972 echo "repaired $repaired name entries with bad hash"
4973 [ $repaired -ge 1 ] ||
4974 error "(5) Fail to repair bad name hash: $repaired"
# Remount the client before verifying the entries.
4976 umount_client $MOUNT || error "(6) umount failed"
4977 mount_client $MOUNT || error "(7) mount failed"
# Every created sub-directory must be stat-able and removable.
4979 for ((i = 0; i < $((MDSCOUNT * 5)); i++)); do
4980 stat $DIR/$tdir/striped_dir/d$i ||
4981 error "(8) Fail to stat d$i after LFSCK"
4982 rmdir $DIR/$tdir/striped_dir/d$i ||
4983 error "(9) Fail to unlink d$i after LFSCK"
4986 rmdir $DIR/$tdir/striped_dir ||
4987 error "(10) Fail to remove the striped directory after LFSCK"
4989 run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
# Test 31c: re-generate a lost master LMV EA of an untouched striped
# directory. Needs at least 2 MDTs and an MDS newer than 2.6.50 (LU-5519).
4992 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4993 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4994 skip "MDS older than 2.6.50, LU-5519"
4997 echo "For some reason, the master MDT-object of the striped directory"
4998 echo "may lost its master LMV EA. If nobody created files under the"
4999 echo "master directly after the master LMV EA lost, then the LFSCK"
5000 echo "should re-generate the master LMV EA."
5003 check_mount_and_prep
5005 echo "Inject failure stub on MDT0 to simulate the case that the"
5006 echo "master MDT-object of the striped directory lost the LMV EA."
# The fail_loc is active on $SINGLEMDS only while the striped directory is
# created.
5008 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
5009 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
5010 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5011 error "(1) Fail to create striped directory"
5012 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
5014 echo "Trigger namespace LFSCK to re-generate master LMV EA"
5015 $START_NAMESPACE -r -A ||
5016 error "(2) Fail to start LFSCK for namespace"
5018 wait_all_targets_blocked namespace completed 3
# Exactly one striped directory must be reported as repaired.
5020 local repaired=$($SHOW_NAMESPACE |
5021 awk '/^striped_dirs_repaired/ { print $2 }')
5022 [ $repaired -eq 1 ] ||
5023 error "(4) Fail to re-generate master LMV EA: $repaired"
# "lfs find -H lostlmv" must report exactly one entry for striped_dir.
5025 local rc=$($LFS find -H lostlmv $DIR/$tdir/striped_dir | wc -l)
5026 [ $rc -eq 1 ] || error "Fail to find flag lost LMV: $rc"
# Remount the client, then check the directory lists as empty and can be
# removed.
5028 umount_client $MOUNT || error "(5) umount failed"
5029 mount_client $MOUNT || error "(6) mount failed"
5031 local empty=$(ls $DIR/$tdir/striped_dir/)
5032 [ -z "$empty" ] || error "(7) The master LMV EA is not repaired: $empty"
5034 rmdir $DIR/$tdir/striped_dir ||
5035 error "(8) Fail to remove the striped directory after LFSCK"
5037 run_test 31c "Re-generate the lost master LMV EA for striped directory"
# Test 31d: a striped directory that lost its master LMV EA and was modified
# afterwards must NOT be repaired; LFSCK is expected to make it read-only.
# Needs at least 2 MDTs and an MDS newer than 2.6.50 (LU-5519).
5040 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5041 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5042 skip "MDS older than 2.6.50, LU-5519"
5045 echo "For some reason, the master MDT-object of the striped directory"
5046 echo "may lost its master LMV EA. If somebody created files under the"
5047 echo "master directly after the master LMV EA lost, then the LFSCK"
5048 echo "should NOT re-generate the master LMV EA, instead, it should"
5049 echo "change the broken striped dirctory as read-only to prevent"
5050 echo "further damage"
5053 check_mount_and_prep
5055 echo "Inject failure stub on MDT0 to simulate the case that the"
5056 echo "master MDT-object of the striped directory lost the LMV EA."
# The fail_loc is active on $SINGLEMDS only while the striped directory is
# created.
5058 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
5059 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
5060 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5061 error "(1) Fail to create striped directory"
5062 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0
# Remount the client, then create a file under the broken directory; this is
# the "modified after broken" condition the test is about.
5064 umount_client $MOUNT || error "(2) umount failed"
5065 mount_client $MOUNT || error "(3) mount failed"
5067 touch $DIR/$tdir/striped_dir/dummy ||
5068 error "(4) Fail to touch under broken striped directory"
5070 echo "Trigger namespace LFSCK to find out the inconsistency"
5071 $START_NAMESPACE -r -A ||
5072 error "(5) Fail to start LFSCK for namespace"
5074 wait_all_targets_blocked namespace completed 6
# No striped directory may be reported as repaired.
5076 local repaired=$($SHOW_NAMESPACE |
5077 awk '/^striped_dirs_repaired/ { print $2 }')
5078 [ $repaired -eq 0 ] ||
5079 error "(7) Re-generate master LMV EA unexpected: $repaired"
# The existing file stays reachable, but creating a new one must fail.
5081 stat $DIR/$tdir/striped_dir/dummy ||
5082 error "(8) Fail to stat $DIR/$tdir/striped_dir/dummy"
5084 touch $DIR/$tdir/striped_dir/foo &&
5085 error "(9) The broken striped directory should be read-only"
# Drop the immutable attribute with chattr -i so the directory can be
# removed.
5087 chattr -i $DIR/$tdir/striped_dir ||
5088 error "(10) Fail to chattr on the broken striped directory"
5090 rmdir $DIR/$tdir/striped_dir ||
5091 error "(11) Fail to remove the striped directory after LFSCK"
5093 run_test 31d "Set broken striped directory (modified after broken) as read-only"
# Test 31e: re-generate a lost slave LMV EA; fail_val=0 selects the shard on
# the same MDT as the master object (per the echo text below). Needs at
# least 2 MDTs and an MDS newer than 2.6.50 (LU-5519).
5096 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5097 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5098 skip "MDS older than 2.6.50, LU-5519"
5101 echo "For some reason, the slave MDT-object of the striped directory"
5102 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
5103 echo "slave LMV EA."
5106 check_mount_and_prep
5108 echo "Inject failure stub on MDT0 to simulate the case that the"
5109 echo "slave MDT-object (that resides on the same MDT as the master"
5110 echo "MDT-object resides on) lost the LMV EA."
# The fail_loc is active on $SINGLEMDS only while the striped directory is
# created.
5112 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
5113 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=0
5114 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5115 error "(1) Fail to create striped directory"
5116 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5118 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
5119 $START_NAMESPACE -r -A ||
5120 error "(2) Fail to start LFSCK for namespace"
5122 wait_all_targets_blocked namespace completed 3
# Exactly one shard must be reported as repaired on $SINGLEMDS.
5124 local repaired=$($SHOW_NAMESPACE |
5125 awk '/^striped_shards_repaired/ { print $2 }')
5126 [ $repaired -eq 1 ] ||
5127 error "(4) Fail to re-generate slave LMV EA: $repaired"
5129 rmdir $DIR/$tdir/striped_dir ||
5130 error "(5) Fail to remove the striped directory after LFSCK"
5132 run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
# Test 31f: re-generate a lost slave LMV EA; fail_val=1 selects a shard on a
# different MDT than the master object (per the echo text below), so the
# repair counter is read from mds2. Needs at least 2 MDTs and an MDS newer
# than 2.6.50 (LU-5519).
5135 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5136 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5137 skip "MDS older than 2.6.50, LU-5519"
5140 echo "For some reason, the slave MDT-object of the striped directory"
5141 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
5142 echo "slave LMV EA."
5145 check_mount_and_prep
5147 echo "Inject failure stub on MDT0 to simulate the case that the"
5148 echo "slave MDT-object (that resides on different MDT as the master"
5149 echo "MDT-object resides on) lost the LMV EA."
# The fail_loc is active on $SINGLEMDS only while the striped directory is
# created.
5151 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
5152 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=1
5153 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5154 error "(1) Fail to create striped directory"
5155 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5157 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
5158 $START_NAMESPACE -r -A ||
5159 error "(2) Fail to start LFSCK for namespace"
5161 wait_all_targets_blocked namespace completed 3
# Exactly one shard must be reported as repaired in mds2's lfsck_namespace.
5163 local repaired=$(do_facet mds2 $LCTL get_param -n \
5164 mdd.$(facet_svc mds2).lfsck_namespace |
5165 awk '/^striped_shards_repaired/ { print $2 }')
5166 [ $repaired -eq 1 ] ||
5167 error "(4) Fail to re-generate slave LMV EA: $repaired"
5169 rmdir $DIR/$tdir/striped_dir ||
5170 error "(5) Fail to remove the striped directory after LFSCK"
5172 run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
# Test 31g: repair a slave LMV EA whose stripe index is corrupted. Needs at
# least 2 MDTs and an MDS newer than 2.6.50 (LU-5519).
5175 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5176 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5177 skip "MDS older than 2.6.50, LU-5519"
5180 echo "For some reason, the stripe index in the slave LMV EA is"
5181 echo "corrupted. The LFSCK should repair the slave LMV EA."
5184 check_mount_and_prep
5186 echo "Inject failure stub on MDT0 to simulate the case that the"
5187 echo "slave LMV EA on the first shard of the striped directory"
5188 echo "claims the same index as the second shard claims"
# The fail_loc is active on $SINGLEMDS only while the striped directory is
# created.
5190 #define OBD_FAIL_LFSCK_BAD_SLAVE_LMV 0x162b
5191 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162b fail_val=0
5192 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5193 error "(1) Fail to create striped directory"
5194 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5196 echo "Trigger namespace LFSCK to repair the slave LMV EA"
5197 $START_NAMESPACE -r -A ||
5198 error "(2) Fail to start LFSCK for namespace"
5200 wait_all_targets_blocked namespace completed 3
# Exactly one shard must be reported as repaired.
5202 local repaired=$($SHOW_NAMESPACE |
5203 awk '/^striped_shards_repaired/ { print $2 }')
5204 [ $repaired -eq 1 ] ||
5205 error "(4) Fail to repair slave LMV EA: $repaired"
# Remount the client, then check the directory is fully usable: create,
# unlink, rmdir.
5207 umount_client $MOUNT || error "(5) umount failed"
5208 mount_client $MOUNT || error "(6) mount failed"
5210 touch $DIR/$tdir/striped_dir/foo ||
5211 error "(7) Fail to touch file after the LFSCK"
5213 rm -f $DIR/$tdir/striped_dir/foo ||
5214 error "(8) Fail to unlink file after the LFSCK"
5216 rmdir $DIR/$tdir/striped_dir ||
5217 error "(9) Fail to remove the striped directory after LFSCK"
5219 run_test 31g "Repair the corrupted slave LMV EA"
# Test 31h: repair a corrupted shard name entry of a striped directory.
# Needs at least 2 MDTs and an MDS newer than 2.6.50 (LU-5519).
5222 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5223 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5224 skip "MDS older than 2.6.50, LU-5519"
5227 echo "For some reason, the shard's name entry in the striped"
5228 echo "directory may be corrupted. The LFSCK should repair the"
5229 echo "bad shard's name entry."
5232 check_mount_and_prep
5234 echo "Inject failure stub on MDT0 to simulate the case that the"
5235 echo "first shard's name entry in the striped directory claims"
5236 echo "the same index as the second shard's name entry claims."
# The fail_loc is active on $SINGLEMDS only while the striped directory is
# created.
5238 #define OBD_FAIL_LFSCK_BAD_SLAVE_NAME 0x162c
5239 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162c fail_val=0
5240 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5241 error "(1) Fail to create striped directory"
5242 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5244 echo "Trigger namespace LFSCK to repair the shard's name entry"
5245 $START_NAMESPACE -r -A ||
5246 error "(2) Fail to start LFSCK for namespace"
5248 wait_all_targets_blocked namespace completed 3
# Exactly one directory entry must be reported as repaired.
5250 local repaired=$($SHOW_NAMESPACE |
5251 awk '/^dirent_repaired/ { print $2 }')
5252 [ $repaired -eq 1 ] ||
5253 error "(4) Fail to repair shard's name entry: $repaired"
# Remount the client, then check the directory is fully usable: create,
# unlink, rmdir.
5255 umount_client $MOUNT || error "(5) umount failed"
5256 mount_client $MOUNT || error "(6) mount failed"
5258 touch $DIR/$tdir/striped_dir/foo ||
5259 error "(7) Fail to touch file after the LFSCK"
5261 rm -f $DIR/$tdir/striped_dir/foo ||
5262 error "(8) Fail to unlink file after the LFSCK"
5264 rmdir $DIR/$tdir/striped_dir ||
5265 error "(9) Fail to remove the striped directory after LFSCK"
5267 run_test 31h "Repair the corrupted shard's name entry"
# Test 32a: the layout LFSCK must still be stoppable after an OST has been
# stopped while the scan is in its first phase.
5272 umount_client $MOUNT
# Set the engine-delay fail_loc (fail_val=3) before starting the layout
# LFSCK, then confirm the reported status is scanning-phase1.
5274 #define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d
5275 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
5276 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
5278 local STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
5279 [ "$STATUS" == "scanning-phase1" ] ||
5280 error "(2) Expect 'scanning-phase1', but got '$STATUS'"
# Stop ost1 while the scan is running, then clear the fail_loc.
5283 stop ost1 > /dev/null || error "(3) Fail to stop OST1!"
5285 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# The stop request must succeed even though ost1 is down.
5289 $STOP_LFSCK || error "(4) Fail to stop LFSCK!"
# Restart ost1 with the noscrub mount options.
5291 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
5292 error "(5) Fail to start ost1"
5294 run_test 32a "stop LFSCK when some OST failed"
# Test 32b: the namespace LFSCK must still be stoppable after an MDT has
# been stopped while the scan is in its first phase. Needs at least 2 MDTs.
5298 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
# dp/ is created on MDT index 1 and holds two striped directories (dc1, dc2)
# that start on MDT index 0 and span all MDTs.
5301 $LFS mkdir -i 1 $DIR/$tdir/dp ||
5302 error "(1) Fail to create $DIR/$tdir/dp"
5303 $LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc1 ||
5304 error "(2) Fail to create $DIR/$tdir/dp/dc1"
5305 $LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc2 ||
5306 error "(3) Fail to create $DIR/$tdir/dp/dc2"
5307 umount_client $MOUNT
# Set the engine-delay fail_loc (fail_val=3) before starting the namespace
# LFSCK.
5309 #define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d
5310 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
5311 $START_NAMESPACE -r -A || error "(4) Fail to start LFSCK for namespace!"
# Poll the status on $SINGLEMDS until it reads scanning-phase1 (32 is the
# last argument to wait_update_facet, presumably a timeout in seconds).
5313 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5314 mdd.${MDT_DEV}.lfsck_namespace |
5315 awk '/^status/ { print \\\$2 }'" "scanning-phase1" 32 || {
5317 error "(5) unexpected status"
# Stop mds2 while the scan is running, then clear the fail_loc.
5321 stop mds2 > /dev/null || error "(6) Fail to stop MDT2!"
5323 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# The stop request must succeed even though mds2 is down.
5327 $STOP_LFSCK || error "(7) Fail to stop LFSCK!"
# Restart mds2 with the noscrub mount options.
5329 start mds2 $(mdsdevname 2) $MOUNT_OPTS_NOSCRUB > /dev/null ||
5330 error "(8) Fail to start MDT2"
5332 run_test 32b "stop LFSCK when some MDT failed"
# Test 33: check that lfsck_start options show up in the "param" line of
# the LFSCK status output.
# Layout LFSCK started with --dryrun -o -r must report
# "dryrun,all_targets,orphan".
5338 $START_LAYOUT --dryrun -o -r ||
5339 error "(1) Fail to start layout LFSCK"
5340 wait_all_targets_blocked layout completed 2
5342 local PARAMS=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
5343 [ "$PARAMS" == "dryrun,all_targets,orphan" ] ||
5344 error "(3) Expect 'dryrun,all_targets,orphan', got '$PARAMS'"
# Namespace LFSCK started with -e abort -A -r must report
# "failout,all_targets".
5346 $START_NAMESPACE -e abort -A -r ||
5347 error "(4) Fail to start namespace LFSCK"
5348 wait_all_targets_blocked namespace completed 5
5350 PARAMS=$($SHOW_NAMESPACE | awk '/^param/ { print $2 }')
5351 [ "$PARAMS" == "failout,all_targets" ] ||
5352 error "(6) Expect 'failout,all_targets', got '$PARAMS'"
5354 run_test 33 "check LFSCK paramters"
# Test 34: rebuild a lost agent object. Needs at least 2 MDTs and a ZFS
# backend on mds1. The first LFSCK pass must repair one dirent; a second
# pass must find nothing left to repair.
5358 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5359 [ "$mds1_FSTYPE" != zfs ] && skip "Only valid for ZFS backend"
# The fail_loc is active on $SINGLEMDS only while dummy/ is created on MDT
# index 1.
5363 #define OBD_FAIL_LFSCK_NO_AGENTOBJ 0x1630
5364 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1630
5365 $LFS mkdir -i 1 $DIR/$tdir/dummy ||
5366 error "(1) Fail to create $DIR/$tdir/dummy"
5368 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# First pass: poll until the status reads completed, then expect exactly one
# repaired dirent.
5369 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
5370 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5371 mdd.${MDT_DEV}.lfsck_namespace |
5372 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5374 error "(3) unexpected status"
5377 local repaired=$($SHOW_NAMESPACE |
5378 awk '/^dirent_repaired/ { print $2 }')
5379 [ $repaired -eq 1 ] ||
5380 error "(4) Fail to repair the lost agent object: $repaired"
# Second pass: the same scan must now report zero repairs.
5382 $START_NAMESPACE -r || error "(5) Fail to start LFSCK for namespace!"
5383 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5384 mdd.${MDT_DEV}.lfsck_namespace |
5385 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5387 error "(6) unexpected status"
5390 repaired=$($SHOW_NAMESPACE | awk '/^dirent_repaired/ { print $2 }')
5391 [ $repaired -eq 0 ] ||
5392 error "(7) Unexpected repairing: $repaired"
5394 run_test 34 "LFSCK can rebuild the lost agent object"
# Test 35: rebuild a lost agent entry. Needs at least 2 MDTs. The first
# LFSCK pass must repair one agent entry on mds2; a second pass must find
# nothing left to repair.
5398 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
# The fail_loc is active on mds2 only while dummy/ is created on MDT index 1.
5402 #define OBD_FAIL_LFSCK_NO_AGENTENT 0x1631
5403 do_facet mds2 $LCTL set_param fail_loc=0x1631
5404 $LFS mkdir -i 1 $DIR/$tdir/dummy ||
5405 error "(1) Fail to create $DIR/$tdir/dummy"
5408 do_facet mds2 $LCTL set_param fail_loc=0
# First pass: poll mds2 until the status reads completed. The facet is fixed
# (mds2), so the failure message names it directly.
5409 $START_NAMESPACE -A -r || error "(2) Fail to start LFSCK for namespace!"
5410 wait_update_facet mds2 "$LCTL get_param -n \
5411 mdd.$(facet_svc mds2).lfsck_namespace |
5412 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
5413 error "(3) MDS2 is not the expected 'completed'"
# Exactly one agent entry must be reported as repaired on mds2.
5415 local repaired=$(do_facet mds2 $LCTL get_param -n \
5416 mdd.$(facet_svc mds2).lfsck_namespace |
5417 awk '/^agent_entries_repaired/ { print $2 }')
5418 [ $repaired -eq 1 ] ||
5419 error "(4) Fail to repair the lost agent entry: $repaired"
5421 echo "stopall to cleanup object cache"
5424 setupall > /dev/null
# Second pass: the same scan must now report zero repairs.
5426 $START_NAMESPACE -A -r || error "(5) Fail to start LFSCK for namespace!"
5427 wait_update_facet mds2 "$LCTL get_param -n \
5428 mdd.$(facet_svc mds2).lfsck_namespace |
5429 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
5430 error "(6) MDS2 is not the expected 'completed'"
5432 repaired=$(do_facet mds2 $LCTL get_param -n \
5433 mdd.$(facet_svc mds2).lfsck_namespace |
5434 awk '/^agent_entries_repaired/ { print $2 }')
5435 [ $repaired -eq 0 ] ||
5436 error "(7) Unexpected repairing: $repaired"
5438 run_test 35 "LFSCK can rebuild the lost agent entry"
# Test 36a: rebuild the LOV EA of mirrored files that each lost one mirror.
# Needs at least 3 OSTs. Three 3-mirror files are created; a different
# mirror is split off and destroyed from each while a fail_loc is set on
# mds1, and the layout LFSCK must bring every file back to three mirrors.
5441 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs"
5444 echo "The target MDT-object's LOV EA corrupted as to lose one of the "
5445 echo "mirrors information. The layout LFSCK should rebuild the LOV EA "
5446 echo "with the PFID EA of related OST-object(s) belong to the mirror."
5449 check_mount_and_prep
# Print the grant parameters now and register a dump of space usage, inode
# usage and grants via stack_trap (presumably run at test cleanup -- confirm
# against stack_trap).
5453 lctl get_param osc.*.*grant*
5454 stack_trap "lfs df $DIR; lfs df -i $DIR; lctl get_param osc.*.*grant*"
# Each file gets three mirrors of two components; every mirror uses OST
# indices 0, 1 and 2 in a different arrangement.
5456 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5457 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f0 ||
5458 error "(0) Fail to create mirror file $DIR/$tdir/f0"
5459 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5460 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f1 ||
5461 error "(1) Fail to create mirror file $DIR/$tdir/f1"
5462 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5463 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f2 ||
5464 error "(2) Fail to create mirror file $DIR/$tdir/f2"
# Write 4 MiB to each file, then resync its mirrors.
5466 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5467 error "(3) Fail to write $DIR/$tdir/f0"
5468 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=4 ||
5469 error "(4) Fail to write $DIR/$tdir/f1"
5470 dd if=/dev/zero of=$DIR/$tdir/f2 bs=1M count=4 ||
5471 error "(5) Fail to write $DIR/$tdir/f2"
5473 $LFS mirror resync $DIR/$tdir/f0 ||
5474 error "(6) Fail to resync $DIR/$tdir/f0"
5475 $LFS mirror resync $DIR/$tdir/f1 ||
5476 error "(7) Fail to resync $DIR/$tdir/f1"
5477 $LFS mirror resync $DIR/$tdir/f2 ||
5478 error "(8) Fail to resync $DIR/$tdir/f2"
5480 cancel_lru_locks mdc
5481 cancel_lru_locks osc
5483 $LFS getstripe $DIR/$tdir/f0 ||
5484 error "(9) Fail to getstripe for $DIR/$tdir/f0"
5485 $LFS getstripe $DIR/$tdir/f1 ||
5486 error "(10) Fail to getstripe for $DIR/$tdir/f1"
5487 $LFS getstripe $DIR/$tdir/f2 ||
5488 error "(11) Fail to getstripe for $DIR/$tdir/f2"
5490 echo "Inject failure, to simulate the case of missing one mirror in LOV"
5491 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5492 do_facet mds1 $LCTL set_param fail_loc=0x1616
# With the fail_loc set, split and destroy (-d) mirror 1 of f0, mirror 2 of
# f1 and mirror 3 of f2.
5494 $LFS mirror split --mirror-id 1 -d $DIR/$tdir/f0 ||
5495 error "(12) Fail to split 1st mirror from $DIR/$tdir/f0"
5496 $LFS mirror split --mirror-id 2 -d $DIR/$tdir/f1 ||
5497 error "(13) Fail to split 2nd mirror from $DIR/$tdir/f1"
5498 $LFS mirror split --mirror-id 3 -d $DIR/$tdir/f2 ||
5499 error "(14) Fail to split 3rd mirror from $DIR/$tdir/f2"
5503 do_facet mds1 $LCTL set_param fail_loc=0
# The removed mirror id must no longer appear in the layout ...
5505 $LFS getstripe $DIR/$tdir/f0 | grep "lcme_mirror_id:.*1" &&
5506 error "(15) The 1st of mirror is not destroyed"
5507 $LFS getstripe $DIR/$tdir/f1 | grep "lcme_mirror_id:.*2" &&
5508 error "(16) The 2nd of mirror is not destroyed"
5509 $LFS getstripe $DIR/$tdir/f2 | grep "lcme_mirror_id:.*3" &&
5510 error "(17) The 3rd of mirror is not destroyed"
# ... and each file must be down to two mirrors.
5514 mirrors=$($LFS getstripe -N $DIR/$tdir/f0)
5515 [ $mirrors -eq 2 ] || error "(18) $DIR/$tdir/f0 has $mirrors mirrors"
5516 mirrors=$($LFS getstripe -N $DIR/$tdir/f1)
5517 [ $mirrors -eq 2 ] || error "(19) $DIR/$tdir/f1 has $mirrors mirrors"
5518 mirrors=$($LFS getstripe -N $DIR/$tdir/f2)
5519 [ $mirrors -eq 2 ] || error "(20) $DIR/$tdir/f2 has $mirrors mirrors"
5521 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5522 $START_LAYOUT -r -o || error "(21) Fail to start LFSCK for layout!"
5524 for k in $(seq $MDSCOUNT); do
5525 # The LFSCK status query internal is 30 seconds. For the case
5526 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5527 # time to guarantee the status sync up.
5528 wait_update_facet mds${k} "$LCTL get_param -n \
5529 mdd.$(facet_svc mds${k}).lfsck_layout |
5530 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5531 error "(22) MDS${k} is not the expected 'completed'"
# Every OST must also report the layout LFSCK as completed.
5534 for k in $(seq $OSTCOUNT); do
5535 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5536 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5537 awk '/^status/ { print $2 }')
5538 [ "$cur_status" == "completed" ] ||
5539 error "(23) OST${k} Expect 'completed', but got '$cur_status'"
# Nine orphans are expected on mds1 (presumably 3 OST-objects per removed
# mirror across the 3 files -- confirm).
5542 local repaired=$(do_facet mds1 $LCTL get_param -n \
5543 mdd.$(facet_svc mds1).lfsck_layout |
5544 awk '/^repaired_orphan/ { print $2 }')
5545 [ $repaired -eq 9 ] ||
5546 error "(24) Expect 9 fixed on mds1, but got: $repaired"
# Each file must be back to three mirrors ...
5548 mirrors=$($LFS getstripe -N $DIR/$tdir/f0)
5549 [ $mirrors -eq 3 ] || error "(25) $DIR/$tdir/f0 has $mirrors mirrors"
5550 mirrors=$($LFS getstripe -N $DIR/$tdir/f1)
5551 [ $mirrors -eq 3 ] || error "(26) $DIR/$tdir/f1 has $mirrors mirrors"
5552 mirrors=$($LFS getstripe -N $DIR/$tdir/f2)
5553 [ $mirrors -eq 3 ] || error "(27) $DIR/$tdir/f2 has $mirrors mirrors"
# ... and the removed mirror id must be listed again; on failure the full
# layout is printed before reporting the error.
5555 $LFS getstripe $DIR/$tdir/f0 | grep "lcme_mirror_id:.*1" || {
5556 $LFS getstripe $DIR/$tdir/f0
5557 error "(28) The 1st of mirror is not recovered"
5560 $LFS getstripe $DIR/$tdir/f1 | grep "lcme_mirror_id:.*2" || {
5561 $LFS getstripe $DIR/$tdir/f1
5562 error "(29) The 2nd of mirror is not recovered"
5565 $LFS getstripe $DIR/$tdir/f2 | grep "lcme_mirror_id:.*3" || {
5566 $LFS getstripe $DIR/$tdir/f2
5567 error "(30) The 3rd of mirror is not recovered"
5570 run_test 36a "rebuild LOV EA for mirrored file (1)"
# Test 36b: rebuild the LOV EA of a mirrored file whose MDT-object was lost.
# Skipped when FILESET is set; needs at least 3 OSTs. The file is removed
# while a fail_loc is set on mds1, and the layout LFSCK must re-create it
# under .lustre/lost+found/MDT0000/ with all three mirrors.
5573 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
5574 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs"
5577 echo "The mirrored file lost its MDT-object, but relatd OST-objects "
5578 echo "are still there. The layout LFSCK should rebuild the LOV EA "
5579 echo "with the PFID EA of related OST-object(s) belong to the file. "
5582 check_mount_and_prep
# Three mirrors of two components each, spread over OST indices 0, 1 and 2.
5584 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5585 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f0 ||
5586 error "(0) Fail to create mirror file $DIR/$tdir/f0"
# The FID is used later to build the recovered file's lost+found name.
5588 local fid=$($LFS path2fid $DIR/$tdir/f0)
5590 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5591 error "(1) Fail to write $DIR/$tdir/f0"
5592 $LFS mirror resync $DIR/$tdir/f0 ||
5593 error "(2) Fail to resync $DIR/$tdir/f0"
5595 cancel_lru_locks mdc
5596 cancel_lru_locks osc
5598 $LFS getstripe $DIR/$tdir/f0 ||
5599 error "(3) Fail to getstripe for $DIR/$tdir/f0"
5601 echo "Inject failure, to simulate the case of missing the MDT-object"
# The fail_loc is active on mds1 only while f0 is removed.
5602 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5603 do_facet mds1 $LCTL set_param fail_loc=0x1616
5604 rm -f $DIR/$tdir/f0 || error "(4) Fail to remove $DIR/$tdir/f0"
5608 do_facet mds1 $LCTL set_param fail_loc=0
5610 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5611 $START_LAYOUT -r -o || error "(5) Fail to start LFSCK for layout!"
5613 for k in $(seq $MDSCOUNT); do
5614 # The LFSCK status query internal is 30 seconds. For the case
5615 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5616 # time to guarantee the status sync up.
5617 wait_update_facet mds${k} "$LCTL get_param -n \
5618 mdd.$(facet_svc mds${k}).lfsck_layout |
5619 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5620 error "(6) MDS${k} is not the expected 'completed'"
# Every OST must also report the layout LFSCK as completed.
5623 for k in $(seq $OSTCOUNT); do
5624 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5625 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5626 awk '/^status/ { print $2 }')
5627 [ "$cur_status" == "completed" ] ||
5628 error "(7) OST${k} Expect 'completed', but got '$cur_status'"
# Nine orphans are expected on mds1 (presumably 3 mirrors x 3 OST-objects --
# confirm).
5631 local count=$(do_facet mds1 $LCTL get_param -n \
5632 mdd.$(facet_svc mds1).lfsck_layout |
5633 awk '/^repaired_orphan/ { print $2 }')
5634 [ $count -eq 9 ] || error "(8) Expect 9 fixed on mds1, but got: $count"
# The recovered file is looked up as <fid>-R-0 under
# .lustre/lost+found/MDT0000/; it must have 3 mirrors and 6 components
# (matching the 3 x 2 layout created above).
5636 local name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
5637 count=$($LFS getstripe --mirror-count $name)
5638 [ $count -eq 3 ] || error "(9) $DIR/$tdir/f0 has $count mirrors"
5640 count=$($LFS getstripe --component-count $name)
5641 [ $count -eq 6 ] || error "(10) $DIR/$tdir/f0 has $count components"
# Each mirror id must be listed; on failure the full layout is printed
# before reporting the error.
5643 $LFS getstripe $name | grep "lcme_mirror_id:.*1" || {
5644 $LFS getstripe $name
5645 error "(11) The 1st of mirror is not recovered"
5648 $LFS getstripe $name | grep "lcme_mirror_id:.*2" || {
5649 $LFS getstripe $name
5650 error "(12) The 2nd of mirror is not recovered"
5653 $LFS getstripe $name | grep "lcme_mirror_id:.*3" || {
5654 $LFS getstripe $name
5655 error "(13) The 3rd of mirror is not recovered"
5658 run_test 36b "rebuild LOV EA for mirrored file (2)"
# test_36c: a mirrored file is modified but not resynced, then loses its
# MDT-object. Layout LFSCK is expected to rebuild the LOV EA, including the
# per-component flags, under .lustre/lost+found.
5661 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
5662 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5665 echo "The mirrored file has been modified, not resynced yet, then "
5666 echo "lost its MDT-object, but related OST-objects are still there. "
5667 echo "The layout LFSCK should rebuild the LOV EA and related status "
5668 echo "with the PFID EA of related OST-object(s) belong to the file. "
5671 check_mount_and_prep
5673 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5675 error "(0) Fail to create mirror file $DIR/$tdir/f0"
5677 local fid=$($LFS path2fid $DIR/$tdir/f0)
5679 # The 1st dd + resync ensures that all related OST-objects have been written
5680 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5681 error "(1.1) Fail to write $DIR/$tdir/f0"
5682 $LFS mirror resync $DIR/$tdir/f0 ||
5683 error "(1.2) Fail to resync $DIR/$tdir/f0"
5684 # The 2nd dd (no resync afterwards) leaves one mirror stale
5685 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5686 error "(1.3) Fail to write $DIR/$tdir/f0"
5688 cancel_lru_locks mdc
5689 cancel_lru_locks osc
5691 $LFS getstripe $DIR/$tdir/f0 ||
5692 error "(2) Fail to getstripe for $DIR/$tdir/f0"
# Remember the lcme_flags found in the first and in the last 10 lines of the
# getstripe output; they are compared against the rebuilt layout below.
5694 local saved_flags1=$($LFS getstripe $DIR/$tdir/f0 | head -n 10 |
5695 awk '/lcme_flags/ { print $2 }')
5696 local saved_flags2=$($LFS getstripe $DIR/$tdir/f0 | tail -n 10 |
5697 awk '/lcme_flags/ { print $2 }')
5699 echo "Inject failure, to simulate the case of missing the MDT-object"
5700 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5701 do_facet mds1 $LCTL set_param fail_loc=0x1616
5702 rm -f $DIR/$tdir/f0 || error "(3) Fail to remove $DIR/$tdir/f0"
5706 do_facet mds1 $LCTL set_param fail_loc=0
5708 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5709 $START_LAYOUT -r -o || error "(4) Fail to start LFSCK for layout!"
5711 for k in $(seq $MDSCOUNT); do
5712 # The LFSCK status query interval is 30 seconds. In case some
5713 # LFSCK_NOTIFY RPCs fail or get lost, wait long enough
5714 # to guarantee that the status has been synced up.
5715 wait_update_facet mds${k} "$LCTL get_param -n \
5716 mdd.$(facet_svc mds${k}).lfsck_layout |
5717 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5718 error "(5) MDS${k} is not the expected 'completed'"
5721 for k in $(seq $OSTCOUNT); do
5722 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5723 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5724 awk '/^status/ { print $2 }')
5725 [ "$cur_status" == "completed" ] ||
5726 error "(6) OST${k} Expect 'completed', but got '$cur_status'"
# The setstripe above places 6 OST-objects (2 + 1 per mirror, 2 mirrors),
# so 6 repaired orphans are expected; the message now states the same number.
5729 local count=$(do_facet mds1 $LCTL get_param -n \
5730 mdd.$(facet_svc mds1).lfsck_layout |
5731 awk '/^repaired_orphan/ { print $2 }')
5732 [ $count -eq 6 ] || error "(7) Expect 6 fixed on mds1, but got: $count"
5734 local name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
5735 count=$($LFS getstripe --mirror-count $name)
5736 [ $count -eq 2 ] || error "(8) $DIR/$tdir/f0 has $count mirrors"
5738 count=$($LFS getstripe --component-count $name)
5739 [ $count -eq 4 ] || error "(9) $DIR/$tdir/f0 has $count components"
5741 local flags=$($LFS getstripe $name | head -n 10 |
5742 awk '/lcme_flags/ { print $2 }')
5743 [ "$flags" == "$saved_flags1" ] || {
5744 $LFS getstripe $name
5745 error "(10) expect flags $saved_flags1, got $flags"
5748 flags=$($LFS getstripe $name | tail -n 10 |
5749 awk '/lcme_flags/ { print $2 }')
5750 [ "$flags" == "$saved_flags2" ] || {
5751 $LFS getstripe $name
5752 error "(11) expect flags $saved_flags2, got $flags"
5755 run_test 36c "rebuild LOV EA for mirrored file (3)"
# test_37: create a directory on MDT0, keep it busy with a paused background
# multiop, then run namespace LFSCK on all targets and wait for completion.
5761 local t_dir="$DIR/$tdir/d0"
5762 check_mount_and_prep
5764 $LFS mkdir -i 0 $t_dir || error "(2) Fail to mkdir $t_dir on MDT0"
5765 multiop_bg_pause $t_dir D_c || { error "multiop failed: $?"; return 1; }
# If LFSCK cannot be started, release the paused multiop BEFORE reporting the
# failure. NOTE(review): error is assumed to abort the test, in which case a
# kill placed after it would never run and the multiop would stay paused.
5769 $START_NAMESPACE -r -A || {
5770 kill -USR1 $PID; error "(3) Fail to start LFSCK for namespace!"; }
5772 wait_all_targets_blocked namespace completed 4
5777 run_test 37 "LFSCK must skip a ORPHAN"
# test_38: create a foreign file, verify its LOV EA fields, run namespace and
# layout LFSCK, then verify that nothing was repaired and that the foreign
# file still looks and behaves the same (no restripe, no read, no write).
5781 [[ "$MDS1_VERSION" -le $(version_code 2.12.51) ]] &&
5782 skip "Need MDS version newer than 2.12.51"
5784 test_mkdir $DIR/$tdir
5785 local uuid1=$(cat /proc/sys/kernel/random/uuid)
5786 local uuid2=$(cat /proc/sys/kernel/random/uuid)
5788 # create foreign file
5789 $LFS setstripe --foreign=none --flags 0xda05 \
5790 -x "${uuid1}@${uuid2}" $DIR/$tdir/$tfile ||
5791 error "$DIR/$tdir/$tfile: create failed"
5793 $LFS getstripe -v $DIR/$tdir/$tfile |
5794 grep "lfm_magic:.*0x0BD70BD0" ||
5795 error "$DIR/$tdir/$tfile: invalid LOV EA foreign magic"
5796 # lfm_length is LOV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
5797 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_length:.*73" ||
5798 error "$DIR/$tdir/$tfile: invalid LOV EA foreign size"
5799 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_type:.*none" ||
5800 error "$DIR/$tdir/$tfile: invalid LOV EA foreign type"
5801 $LFS getstripe -v $DIR/$tdir/$tfile |
5802 grep "lfm_flags:.*0x0000DA05" ||
5803 error "$DIR/$tdir/$tfile: invalid LOV EA foreign flags"
5804 $LFS getstripe $DIR/$tdir/$tfile |
5805 grep "lfm_value:.*${uuid1}@${uuid2}" ||
5806 error "$DIR/$tdir/$tfile: invalid LOV EA foreign value"
5808 # modify striping should fail
5809 $LFS setstripe -c 2 $DIR/$tdir/$tfile &&
5810 error "$DIR/$tdir/$tfile: setstripe should fail"
5812 $START_NAMESPACE -r -A || error "Fail to start LFSCK for namespace"
5814 wait_all_targets_blocked namespace completed 1
5816 # check that "global" namespace_repaired == 0 !!!
5817 local repaired=$(do_facet mds1 \
5818 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
5819 awk '/^namespace_repaired/ { print \\\$2 }'")
5820 [ $repaired -eq 0 ] ||
5821 error "(2) Expect no namespace repair, but got: $repaired"
5823 $START_LAYOUT -A -r || error "Fail to start LFSCK for layout"
5825 wait_all_targets_blocked layout completed 2
5827 # check that "global" layout_repaired == 0 !!!
5828 local repaired=$(do_facet mds1 \
5829 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
5830 awk '/^layout_repaired/ { print \\\$2 }'")
5831 [ $repaired -eq 0 ] ||
5832 error "(2) Expect no layout repair, but got: $repaired"
5834 echo "post-lfsck checks of foreign file"
# Same LOV EA checks as before the LFSCK runs: every field must be unchanged.
5836 $LFS getstripe -v $DIR/$tdir/$tfile |
5837 grep "lfm_magic:.*0x0BD70BD0" ||
5838 error "$DIR/$tdir/$tfile: invalid LOV EA foreign magic"
5839 # lfm_length is LOV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
5840 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_length:.*73" ||
5841 error "$DIR/$tdir/$tfile: invalid LOV EA foreign size"
5842 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_type:.*none" ||
5843 error "$DIR/$tdir/$tfile: invalid LOV EA foreign type"
5844 $LFS getstripe -v $DIR/$tdir/$tfile |
5845 grep "lfm_flags:.*0x0000DA05" ||
5846 error "$DIR/$tdir/$tfile: invalid LOV EA foreign flags"
5847 $LFS getstripe $DIR/$tdir/$tfile |
5848 grep "lfm_value:.*${uuid1}@${uuid2}" ||
5849 error "$DIR/$tdir/$tfile: invalid LOV EA foreign value"
5851 # modify striping should fail
5852 $LFS setstripe -c 2 $DIR/$tdir/$tfile &&
5853 error "$DIR/$tdir/$tfile: setstripe should fail"
# Reading from or writing to the foreign file is expected to fail. The read
# check needs the error keyword; without it the message string itself would
# be executed as a command when cat unexpectedly succeeds.
5856 cat $DIR/$tdir/$tfile && error "$DIR/$tdir/$tfile: read should fail"
5857 cat /etc/passwd > $DIR/$tdir/$tfile &&
5858 error "$DIR/$tdir/$tfile: write should fail"
5860 #remove foreign file
5861 rm $DIR/$tdir/$tfile ||
5862 error "$DIR/$tdir/$tfile: remove of foreign file has failed"
5864 run_test 38 "LFSCK does not break foreign file and reverse is also true"
# test_39: create a foreign directory, verify its LMV EA fields, run namespace
# and layout LFSCK, then verify that nothing was repaired and that the
# foreign directory still looks and behaves the same before it is removed.
5868 [[ "$MDS1_VERSION" -le $(version_code 2.12.51) ]] &&
5869 skip "Need MDS version newer than 2.12.51"
5871 test_mkdir $DIR/$tdir
5872 local uuid1=$(cat /proc/sys/kernel/random/uuid)
5873 local uuid2=$(cat /proc/sys/kernel/random/uuid)
5875 # create foreign dir
5876 $LFS mkdir --foreign=none --xattr="${uuid1}@${uuid2}" --flags=0xda05 \
5877 $DIR/$tdir/${tdir}2 ||
5878 error "$DIR/$tdir/${tdir}2: create failed"
5880 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
5881 grep "lfm_magic:.*0x0CD50CD0" ||
5882 error "$DIR/$tdir/${tdir}2: invalid LMV EA magic"
5883 # lfm_length is LMV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
5884 # - sizeof(lfm_type) - sizeof(lfm_flags)
5885 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_length:.*73" ||
5886 error "$DIR/$tdir/${tdir}2: invalid LMV EA size"
5887 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_type:.*none" ||
5888 error "$DIR/$tdir/${tdir}2: invalid LMV EA type"
5889 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
5890 grep "lfm_flags:.*0x0000DA05" ||
5891 error "$DIR/$tdir/${tdir}2: invalid LMV EA flags"
5892 $LFS getdirstripe $DIR/$tdir/${tdir}2 |
5893 grep "lfm_value.*${uuid1}@${uuid2}" ||
5894 error "$DIR/$tdir/${tdir}2: invalid LMV EA value"
5896 # file create in dir should fail
# (the error keyword is required here; without it the message string itself
# would be executed as a command when touch unexpectedly succeeds)
5897 touch $DIR/$tdir/${tdir}2/$tfile &&
5898 error "$DIR/$tdir/${tdir}2: file create should fail"
5901 chmod 777 $DIR/$tdir/${tdir}2 ||
5902 error "$DIR/$tdir/${tdir}2: chmod failed"
5905 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/${tdir}2 ||
5906 error "$DIR/$tdir/${tdir}2: chown failed"
5908 $START_NAMESPACE -r -A || error "Fail to start LFSCK for namespace"
5910 wait_all_targets_blocked namespace completed 1
5912 # check that "global" namespace_repaired == 0 !!!
5913 local repaired=$(do_facet mds1 \
5914 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
5915 awk '/^namespace_repaired/ { print \\\$2 }'")
5916 [ $repaired -eq 0 ] ||
5917 error "(2) Expect nothing to be repaired, but got: $repaired"
5919 $START_LAYOUT -A -r || error "Fail to start LFSCK for layout"
5921 wait_all_targets_blocked layout completed 2
5923 # check that "global" layout_repaired == 0 !!!
5924 local repaired=$(do_facet mds1 \
5925 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
5926 awk '/^layout_repaired/ { print \\\$2 }'")
5927 [ $repaired -eq 0 ] ||
5928 error "(2) Expect no layout repair, but got: $repaired"
5930 echo "post-lfsck checks of foreign dir"
# Same LMV EA checks as before the LFSCK runs: every field must be unchanged.
5932 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
5933 grep "lfm_magic:.*0x0CD50CD0" ||
5934 error "$DIR/$tdir/${tdir}2: invalid LMV EA magic"
5935 # lfm_length is LMV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
5936 # - sizeof(lfm_type) - sizeof(lfm_flags)
5937 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_length:.*73" ||
5938 error "$DIR/$tdir/${tdir}2: invalid LMV EA size"
5939 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_type:.*none" ||
5940 error "$DIR/$tdir/${tdir}2: invalid LMV EA type"
5941 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
5942 grep "lfm_flags:.*0x0000DA05" ||
5943 error "$DIR/$tdir/${tdir}2: invalid LMV EA flags"
5944 $LFS getdirstripe $DIR/$tdir/${tdir}2 |
5945 grep "lfm_value.*${uuid1}@${uuid2}" ||
5946 error "$DIR/$tdir/${tdir}2: invalid LMV EA value"
5948 # file create in dir should fail
5949 touch $DIR/$tdir/${tdir}2/$tfile &&
5950 error "$DIR/$tdir/${tdir}2: file create should fail"
5953 chmod 777 $DIR/$tdir/${tdir}2 ||
5954 error "$DIR/$tdir/${tdir}2: chmod failed"
5957 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/${tdir}2 ||
5958 error "$DIR/$tdir/${tdir}2: chown failed"
5961 rmdir $DIR/$tdir/${tdir}2 ||
5962 error "$DIR/$tdir/${tdir}2: remove of foreign dir has failed"
5964 run_test 39 "LFSCK does not break foreign dir and reverse is also true"
# test_40a: check that the layout of a file is the same before and after its
# parent directory is migrated to another MDT and layout LFSCK has been run.
5967 [[ $MDSCOUNT -ge 2 ]] || skip "needs >= 2 MDTs"
5969 check_mount_and_prep
# Create dir1 on MDT index 1 and set a three-component layout on it.
5970 $LFS mkdir -i 1 $DIR/$tdir/dir1
5971 $LFS setstripe -E 1M -c1 -S 1M -E 128M -c2 -S 4M -E eof $DIR/$tdir/dir1
# Create f1 and record its layout before the migration and the LFSCK run.
5973 touch $DIR/$tdir/dir1/f1
5974 local layout1=$(get_layout_param $DIR/$tdir/dir1/f1)
5976 echo "Migrate $DIR/$tdir/dir1 from MDT1 to MDT0"
5977 $LFS migrate -m 0 $DIR/$tdir/dir1
5979 echo "trigger LFSCK for layout"
5980 do_facet $SINGLEMDS $LCTL lfsck_start -M ${MDT_DEV} -t layout -r
# Poll the lfsck_layout status on $SINGLEMDS until it reads completed;
# anything else is reported as a failure.
5982 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5983 mdd.${MDT_DEV}.lfsck_layout |
5984 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5986 error "(2) unexpected status"
# The layout read back after LFSCK must equal the one recorded earlier.
5989 local layout2=$(get_layout_param $DIR/$tdir/dir1/f1)
5991 [[ "$layout1" == "$layout2" ]] || error "layout lost after lfsck"
5993 run_test 40a "LFSCK correctly fixes lmm_oi in composite layout"
# test_41: run LFSCK over a file created with a SEL layout and fail if the
# MDS debug log contains any lfsck_layout_verify_header message.
# Save the current debug mask so that it can be restored at the end.
5997 local old_debug=$(do_facet $SINGLEMDS $LCTL get_param -n debug)
5999 do_facet $SINGLEMDS $LCTL set_param debug=+lfsck
6000 $LFS setstripe -E 1G -z 64M -E -1 -z 128M $DIR/$tfile
# Dump the debug log to /dev/null; presumably this empties the buffer so that
# the later dump only holds messages from this LFSCK run (TODO confirm).
6001 do_facet $SINGLEMDS $LCTL dk > /dev/null
6003 echo "trigger LFSCK for SEL layout"
6004 do_facet $SINGLEMDS $LCTL lfsck_start -M ${MDT_DEV} -A -t all -r -n on
# Poll the lfsck_layout status on $SINGLEMDS until it reads completed.
6005 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
6006 mdd.${MDT_DEV}.lfsck_layout |
6007 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
6009 error "(2) unexpected status"
# Any lfsck_layout_verify_header line in the debug log counts as a failure.
6012 local errors=$(do_facet $SINGLEMDS $LCTL dk |
6013 grep "lfsck_layout_verify_header")
6015 [[ "x$errors" == "x" ]] || {
6017 error "lfsck failed"
# Put back the debug mask that was saved at the start.
6020 do_facet $SINGLEMDS "$LCTL set_param debug='$old_debug'"
6022 run_test 41 "SEL support in LFSCK"
6024 # restore the MDS/OST sizes and the OST count saved at the top of the script
6025 MDSSIZE=${SAVED_MDSSIZE}
6026 OSTSIZE=${SAVED_OSTSIZE}
6027 OSTCOUNT=${SAVED_OSTCOUNT}
6029 # finally clean up the system (REFORMAT is forced to yes for this step)
6030 REFORMAT="yes" cleanup_and_setup_lustre
6033 check_and_cleanup_lustre