# Script setup: load the test framework, apply the skip conditions, shrink
# the target sizes/count, call check_and_setup_lustre with REFORMAT=yes, and
# define the command strings and mount options used by the tests below.
3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
# LUSTRE defaults to the parent of the directory containing this script.
11 LUSTRE=${LUSTRE:-$(dirname $0)/..}
12 . $LUSTRE/tests/test-framework.sh
16 # bug number for skipped test:
17 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT "
18 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
# When SLOW is "no", EXCEPT_SLOW is reset to the empty string.
20 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
# Leave with status 0 when require_dsh_mds fails.
23 require_dsh_mds || exit 0
# Skip the whole script when check_versions fails.
27 if ! check_versions; then
28 skip "It is NOT necessary to test lfsck under interoperation mode"
# Skip unless MDS1_VERSION is at least version_code 2.3.60.
32 (( $MDS1_VERSION >= $(version_code 2.3.60) )) ||
33 skip "Need MDS version at least 2.3.60"
# Remember the incoming values before they are overridden below.
# NOTE(review): the code that restores them is not visible in this excerpt.
37 SAVED_MDSSIZE=${MDSSIZE}
38 SAVED_OSTSIZE=${OSTSIZE}
39 SAVED_OSTCOUNT=${OSTCOUNT}
40 # use small MDS + OST size to speed formatting time
41 # do not make MDSSIZE/OSTSIZE too small, since they affect the default journal size
# zfs-backed targets get MDSSIZE / OSTSIZE of 300000.
43 [ "$mds1_FSTYPE" == zfs ] && MDSSIZE=300000
45 [ "$ost1_FSTYPE" == zfs ] && OSTSIZE=300000
47 # no need for many OSTs; fewer OSTs reduce the format/start/stop overhead
# Cap OSTCOUNT at 4.
49 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
51 # build up a clean test environment.
52 REFORMAT="yes" check_and_setup_lustre
# Device names for the first MDT and the first OST of $FSNAME.
54 MDT_DEV="${FSNAME}-MDT0000"
55 OST_DEV="${FSNAME}-OST0000"
# ${SINGLEMDS//mds/} removes every "mds" substring from $SINGLEMDS before
# the result is handed to mdsdevname.
56 MDT_DEVNAME=$(mdsdevname ${SINGLEMDS//mds/})
# Command strings. The tests expand them unquoted (for example
# "$START_NAMESPACE -r"), so extra lctl options can be appended at the call.
57 START_NAMESPACE="do_facet $SINGLEMDS \
58 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
59 START_LAYOUT="do_facet $SINGLEMDS \
60 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
61 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
62 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
63 SHOW_NAMESPACE="do_facet $SINGLEMDS \
64 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
65 SHOW_LAYOUT="do_facet $SINGLEMDS \
66 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
67 SHOW_LAYOUT_ON_OST="do_facet ost1 \
68 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# MDS mount option sets: user_xattr only, plus noscrub, plus skip_lfsck.
69 MOUNT_OPTS_SCRUB="$MDS_MOUNT_OPTS -o user_xattr"
70 MOUNT_OPTS_NOSCRUB="$MDS_MOUNT_OPTS -o user_xattr,noscrub"
71 MOUNT_OPTS_SKIP_LFSCK="$MDS_MOUNT_OPTS -o user_xattr,skip_lfsck"
# NOTE(review): this is the interior of a helper whose header (and the
# assignments of nfiles, ndirs and igif) lies outside this excerpt; the
# closing keywords (fi/done) are not visible either, so the exact nesting
# of the conditionals below is not certain.
# Visible behaviour: copy the test scripts into $DIR/$tdir, run createmany
# for the d*, f*, d<i>/f* and e* names, and, when igif is non-empty, keep
# fail_loc=0x1504 set on $SINGLEMDS while the entries are created.
80 echo "preparing... $nfiles * $ndirs files will be created $(date)."
# igif non-empty: arm the failure injection before creating anything.
81 if [ ! -z $igif ]; then
82 #define OBD_FAIL_FID_IGIF 0x1504
83 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
86 cp $LUSTRE/tests/*.sh $DIR/$tdir/
87 if [ $ndirs -gt 0 ]; then
88 createmany -d $DIR/$tdir/d $ndirs
89 createmany -m $DIR/$tdir/f $ndirs
90 if [ $nfiles -gt 0 ]; then
# One createmany run per directory d0..d<ndirs-1>; its stdout is discarded
# and a failure is reported through error.
91 for ((i = 0; i < $ndirs; i++)); do
92 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
93 /dev/null || error "createmany $nfiles"
96 createmany -d $DIR/$tdir/e $ndirs
# igif non-empty: touch one more file while the injection is still armed,
# then reset fail_loc to 0.
99 if [ ! -z $igif ]; then
100 touch $DIR/$tdir/dummy
101 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
104 echo "prepared $(date)."
# Stop MDT0, run e2fsck with "-n" against its device, and start the MDT
# again with the noscrub mount options.
# Returns 0 immediately unless mds1 is ldiskfs-backed.
# Reports through error when stopping, checking or restarting fails.
# NOTE(review): the command that filters the first run_e2fsck's output (the
# right-hand side of the pipe) and the closing braces are not visible in
# this excerpt; the second run_e2fsck and error "(2)" presumably run only
# when that filter finds a problem — confirm against the full file.
107 run_e2fsck_on_mdt0() {
108 [ $mds1_FSTYPE == ldiskfs ] || return 0
110 stop $SINGLEMDS > /dev/null || error "(0) Fail to the stop MDT0"
111 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n" |
113 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n"
114 error "(2) Detected inconsistency on MDT0"
116 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
117 error "(3) Fail to start MDT0"
# Check, via "lctl lfsck_query -w" on mds1, that the ${com}_mdts_${status}
# counter for LFSCK component $com equals $MDSCOUNT. On mismatch the plain
# query output is printed and error is called with the tag ($err).
# NOTE(review): the assignments of com, status and err are not visible in
# this excerpt; presumably they are the three positional arguments (a
# caller in this file passes "namespace completed 4") — confirm.
120 wait_all_targets_blocked() {
# The awk field reference is written as \\\$2 so that a literal $2 survives
# the extra levels of shell evaluation — presumably one for the local
# double-quoted string and one for the command run through do_facet.
125 local count=$(do_facet mds1 \
126 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 -w |
127 awk '/^${com}_mdts_${status}/ { print \\\$2 }'")
128 [[ $count -eq $MDSCOUNT ]] || {
129 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
130 error "($err) only $count of $MDSCOUNT MDTs are in ${status}"
# NOTE(review): the lines below poll the same counter with wait_update_facet
# (limit $LTIME) and without -w; they presumably belong to a sibling helper
# whose header and variable assignments are not visible in this excerpt.
139 wait_update_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 |
140 awk '/^${com}_mdts_${status}/ { print \\\$2 }'" \
141 "$MDSCOUNT" $LTIME || {
142 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
143 error "($err) some MDTs are not in ${status}"
# Body of the test registered below as run_test 0 ("Control LFSCK manually").
# NOTE(review): the function header, closing braces and some statements are
# not visible in this excerpt.
# Flow: arm fail_loc 0x1600 with fail_val=3, start namespace LFSCK with -r
# and expect status scanning-phase1; stop it and expect stopped; start it
# again without -r and expect scanning-phase1; clear the injection and wait
# for completed; expect updated_phase1 to be 0; run once more with -r and
# expect success_count to have grown by exactly one; finally call stopall.
150 #define OBD_FAIL_LFSCK_DELAY1 0x1600
151 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
152 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
154 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
# The status is the second field of the line starting with "status".
156 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
157 [ "$STATUS" == "scanning-phase1" ] ||
158 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
160 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
162 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
163 [ "$STATUS" == "stopped" ] ||
164 error "(6) Expect 'stopped', but got '$STATUS'"
# No -r here, unlike the first start above.
166 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
168 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
169 [ "$STATUS" == "scanning-phase1" ] ||
170 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the injection, then poll until the status reads "completed"
# (32 is the limit given to wait_update_facet — presumably seconds).
172 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
173 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
174 mdd.${MDT_DEV}.lfsck_namespace |
175 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
177 error "(9) unexpected status"
180 local repaired=$($SHOW_NAMESPACE |
181 awk '/^updated_phase1/ { print $2 }')
182 [ $repaired -eq 0 ] ||
183 error "(10) Expect nothing to be repaired, but got: $repaired"
# success_count before and after one more full run must differ by one.
185 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
186 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
187 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
188 mdd.${MDT_DEV}.lfsck_namespace |
189 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
191 error "(12) unexpected status"
194 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
195 [ $((scanned1 + 1)) -eq $scanned2 ] ||
196 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
198 echo "stopall, should NOT crash LU-3649"
199 stopall || error "(14) Fail to stopall"
201 run_test 0 "Control LFSCK manually"
# Body of the test registered below as run_test 1a.
# NOTE(review): the function header, closing braces and some statements are
# not visible in this excerpt.
# Flow: create $DIR/$tdir/dummy while fail_loc 0x1501 is set, run namespace
# LFSCK with -r until completed, expect exactly one repair, then mount the
# client and list the directory while fail_loc 0x1505 is set.
206 #define OBD_FAIL_FID_INDIR 0x1501
207 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
208 touch $DIR/$tdir/dummy
210 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
212 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
213 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
214 mdd.${MDT_DEV}.lfsck_namespace |
215 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
217 error "(4) unexpected status"
# Prefer the dirent_repaired counter; when that line is absent, fall back
# to updated_phase1.
220 local repaired=$($SHOW_NAMESPACE |
221 awk '/^dirent_repaired/ { print $2 }')
222 # for interop with old server
223 [ -z "$repaired" ] &&
224 repaired=$($SHOW_NAMESPACE |
225 awk '/^updated_phase1/ { print $2 }')
227 [ $repaired -eq 1 ] ||
228 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
232 mount_client $MOUNT || error "(6) Fail to start client!"
# The listing must succeed while the lookup injection is armed.
234 #define OBD_FAIL_FID_LOOKUP 0x1505
235 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
236 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
238 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
240 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
# Body of the test registered below as run_test 1b.
# NOTE(review): the function header, closing braces and some statements are
# not visible in this excerpt.
# Flow: skip unless mds1 is ldiskfs; create $DIR/$tdir/dummy while fail_loc
# 0x1502 is set; run namespace LFSCK with -r while fail_loc 0x1506 is set
# and expect exactly one repair; then mount the client and stat the file
# while fail_loc 0x1505 is set.
244 [ "$mds1_FSTYPE" != ldiskfs ] &&
245 skip "OI Scrub not implemented for ZFS"
249 #define OBD_FAIL_FID_INLMA 0x1502
250 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
251 touch $DIR/$tdir/dummy
253 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# This injection stays armed for the whole LFSCK run and is cleared only
# after the repair count has been checked.
255 #define OBD_FAIL_FID_NOLMA 0x1506
256 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
257 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
258 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
259 mdd.${MDT_DEV}.lfsck_namespace |
260 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
262 error "(4) unexpected status"
# Prefer dirent_repaired; fall back to updated_phase1 when it is absent.
265 local repaired=$($SHOW_NAMESPACE |
266 awk '/^dirent_repaired/ { print $2 }')
267 # for interop with old server
268 [ -z "$repaired" ] &&
269 repaired=$($SHOW_NAMESPACE |
270 awk '/^updated_phase1/ { print $2 }')
272 [ $repaired -eq 1 ] ||
273 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
275 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
278 mount_client $MOUNT || error "(6) Fail to start client!"
280 #define OBD_FAIL_FID_LOOKUP 0x1505
281 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
282 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
284 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
286 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
# Body of the test registered below as run_test 1c.
# NOTE(review): the function header, closing braces and some statements are
# not visible in this excerpt.
# Flow: create $DIR/$tdir/dummy while fail_loc 0x1504 is set, run namespace
# LFSCK with -r until completed, expect exactly one repair, then mount the
# client and list the directory while fail_loc 0x1505 is set.
291 #define OBD_FAIL_FID_IGIF 0x1504
292 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
293 touch $DIR/$tdir/dummy
295 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
297 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
298 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
299 mdd.${MDT_DEV}.lfsck_namespace |
300 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
302 error "(4) unexpected status"
# Prefer dirent_repaired; fall back to updated_phase1 when it is absent.
305 local repaired=$($SHOW_NAMESPACE |
306 awk '/^dirent_repaired/ { print $2 }')
307 # for interop with old server
308 [ -z "$repaired" ] &&
309 repaired=$($SHOW_NAMESPACE |
310 awk '/^updated_phase1/ { print $2 }')
312 [ $repaired -eq 1 ] ||
313 error "(5) Fail to repair lost FID-in-dirent: $repaired"
317 mount_client $MOUNT || error "(6) Fail to start client!"
319 #define OBD_FAIL_FID_LOOKUP 0x1505
320 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
321 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
323 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
325 run_test 1c "LFSCK can find out and repair lost FID-in-dirent"
# Body of the test registered below as run_test 1d.
# NOTE(review): the function header, closing braces and some statements
# (including the MDS stop/start around the xattr edit) are not visible in
# this excerpt.
# Flow: skip on MDS older than 2.13.57 or with fewer than 2 MDTs; create a
# file, a directory and a directory made with "$LFS mkdir -i 1" under
# $DIR/$tdir; rewrite the trusted.lma xattr of $tdir on the mds1 backing
# filesystem; run the scrub and then namespace LFSCK; finally require that
# the pfid in each child's linkEA equals the FID of $DIR/$tdir.
328 [ $MDS1_VERSION -lt $(version_code 2.13.57) ] &&
329 skip "MDS older than 2.13.57"
330 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
334 touch $DIR/$tdir/$tfile
335 mkdir $DIR/$tdir/subdir
336 $LFS mkdir -i 1 $DIR/$tdir/remotedir
# Print the parent FID and the children's linkEA before the change
# (the output is not captured here).
337 $LFS path2fid $DIR/$tdir
338 ll_decode_linkea $DIR/$tdir/$tfile
339 ll_decode_linkea $DIR/$tdir/subdir
340 ll_decode_linkea $DIR/$tdir/remotedir
342 local mntpt=$(facet_mntpt mds1)
344 # unlink OI files to remove the stale entry
345 local saved_opts=$MDS_MOUNT_OPTS
348 mount_fstype mds1 $mntpt
# The sed expression replaces a "0" that is followed by exactly eight
# characters at the end of the dumped line with "1", and the edited dump is
# fed back through "setfattr --restore=-".
349 # increase $tdir FID oid in LMA
350 do_facet mds1 "getfattr -d -m trusted.lma -e hex \
351 --absolute-names $mntpt/ROOT/$tdir | \
352 sed -E 's/0(.{8})$/1\1/' | setfattr --restore=-"
353 unmount_fstype mds1 $mntpt
356 # the FID oid in LMA was increased above, and it's not in OI table,
357 # run scrub first to generate mapping in OI, so the following namespace
358 # check can fix linkea correctly, this is not necessary normally.
359 do_facet mds1 $LCTL lfsck_start -M ${MDT_DEV} -t scrub ||
360 error "failed to start LFSCK for scrub!"
361 wait_update_facet mds1 "$LCTL get_param -n \
362 osd-*.$(facet_svc mds1).oi_scrub |
363 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
364 error "unexpected status"
# Namespace LFSCK is started with both -r and -A here.
366 $START_NAMESPACE -r -A || error "fail to start LFSCK for namespace!"
367 wait_update_facet mds1 "$LCTL get_param -n \
368 mdd.${MDT_DEV}.lfsck_namespace |
369 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
371 error "unexpected status"
# Print the same information again after LFSCK has finished.
373 $LFS path2fid $DIR/$tdir
374 ll_decode_linkea $DIR/$tdir/$tfile
375 ll_decode_linkea $DIR/$tdir/subdir
376 ll_decode_linkea $DIR/$tdir/remotedir
# pfid is the third field of the "pfid" line printed by ll_decode_linkea.
# NOTE(review): no "local" declaration for fid, f or pfid is visible here.
381 fid=$($LFS path2fid $DIR/$tdir)
382 for f in $tfile subdir remotedir; do
383 pfid=$(ll_decode_linkea $DIR/$tdir/$f |
384 awk '/pfid/ { print $3 }')
386 [ "$pfid" == "$fid" ] || error "$fid in LMA != $pfid in linkea"
389 run_test 1d "LFSCK can fix mismatch of FID in LMA and FID in child linkea"
# Body of the test registered below as run_test 2a.
# NOTE(review): the function header, closing braces and some statements are
# not visible in this excerpt.
# Flow: create $DIR/$tdir/dummy while fail_loc 0x1603 is set, run namespace
# LFSCK with -r until completed, expect exactly one linkEA repair, then
# mount the client and check that stat reports "Links: 1" and that
# path2fid followed by fid2path returns the original path.
394 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
395 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
396 touch $DIR/$tdir/dummy
398 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
400 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
401 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
402 mdd.${MDT_DEV}.lfsck_namespace |
403 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
405 error "(4) unexpected status"
# Prefer linkea_repaired; fall back to updated_phase2 when it is absent.
408 local repaired=$($SHOW_NAMESPACE |
409 awk '/^linkea_repaired/ { print $2 }')
410 # for interop with old server
411 [ -z "$repaired" ] &&
412 repaired=$($SHOW_NAMESPACE |
413 awk '/^updated_phase2/ { print $2 }')
415 [ $repaired -eq 1 ] ||
416 error "(5) Fail to repair crashed linkEA: $repaired"
420 mount_client $MOUNT || error "(6) Fail to start client!"
422 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
423 error "(7) Fail to stat $DIR/$tdir/dummy"
# Round trip: path -> FID -> path must give back the same path.
425 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
426 local dummyname=$($LFS fid2path $DIR $dummyfid)
427 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
428 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
430 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
# Body of the test registered below as run_test 2b.
# NOTE(review): the function header, closing braces and some statements are
# not visible in this excerpt.
# Flow: create $DIR/$tdir/dummy while fail_loc 0x1604 is set, run namespace
# LFSCK with -r until completed, expect updated_phase2 to be 1, then mount
# the client and check that stat reports "Links: 1" and that path2fid
# followed by fid2path returns the original path.
436 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
437 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
438 touch $DIR/$tdir/dummy
440 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
442 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
443 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
444 mdd.${MDT_DEV}.lfsck_namespace |
445 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
447 error "(4) unexpected status"
450 local repaired=$($SHOW_NAMESPACE |
451 awk '/^updated_phase2/ { print $2 }')
452 [ $repaired -eq 1 ] ||
453 error "(5) Fail to repair crashed linkEA: $repaired"
457 mount_client $MOUNT || error "(6) Fail to start client!"
459 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
460 error "(7) Fail to stat $DIR/$tdir/dummy"
# Round trip: path -> FID -> path must give back the same path.
462 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
463 local dummyname=$($LFS fid2path $DIR $dummyfid)
464 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
465 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
467 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
# Body of the test registered below as run_test 2c.
# NOTE(review): the function header, closing braces and some statements are
# not visible in this excerpt.
# Flow: skip unless MDS1_VERSION is greater than 2.4.90; create
# $DIR/$tdir/dummy while fail_loc 0x1605 is set; run namespace LFSCK with
# -r until completed and expect updated_phase2 to be 1; then mount the
# client and check "Links: 1" and the path2fid/fid2path round trip.
471 (( $MDS1_VERSION > $(version_code 2.4.90) )) ||
472 skip "MDS older than 2.4.90"
476 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
477 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
478 touch $DIR/$tdir/dummy
480 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
482 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
483 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
484 mdd.${MDT_DEV}.lfsck_namespace |
485 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
487 error "(4) unexpected status"
490 local repaired=$($SHOW_NAMESPACE |
491 awk '/^updated_phase2/ { print $2 }')
492 [ $repaired -eq 1 ] ||
493 error "(5) Fail to repair crashed linkEA: $repaired"
497 mount_client $MOUNT || error "(6) Fail to start client!"
499 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
500 error "(7) Fail to stat $DIR/$tdir/dummy"
# Round trip: path -> FID -> path must give back the same path.
502 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
503 local dummyname=$($LFS fid2path $DIR $dummyfid)
504 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
505 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
507 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
# Body of the test registered below as run_test 2d.
# NOTE(review): the function header, closing braces and some statements are
# not visible in this excerpt.
# Flow: skip unless MDS1_VERSION is greater than 2.6.50; create
# $DIR/$tdir/dummy while fail_loc 0x161d is set; run namespace LFSCK with
# -r until completed and expect linkea_repaired to be 1; then mount the
# client and check "Links: 1" and the path2fid/fid2path round trip.
511 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
512 skip "MDS older than 2.6.50, LU-4788"
516 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
517 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
518 touch $DIR/$tdir/dummy
520 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
522 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
523 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
524 mdd.${MDT_DEV}.lfsck_namespace |
525 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
527 error "(4) unexpected status"
530 local repaired=$($SHOW_NAMESPACE |
531 awk '/^linkea_repaired/ { print $2 }')
532 [ $repaired -eq 1 ] ||
533 error "(5) Fail to repair crashed linkEA: $repaired"
537 mount_client $MOUNT || error "(6) Fail to start client!"
539 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
540 error "(7) Fail to stat $DIR/$tdir/dummy"
# Round trip: path -> FID -> path must give back the same path.
542 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
543 local dummyname=$($LFS fid2path $DIR $dummyfid)
544 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
545 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
547 run_test 2d "LFSCK can recover the missing linkEA entry"
# Body of the test registered below as run_test 2e.
# NOTE(review): the function header, closing braces and some statements are
# not visible in this excerpt.
# Flow: skip with fewer than 2 MDTs or MDS1_VERSION not greater than
# 2.6.50; create d0 with "$LFS mkdir -i 1" and, while fail_loc 0x1603 is
# set, d0/d1 with "$LFS mkdir -i 0"; run namespace LFSCK with -r -A; wait
# via wait_all_targets_blocked; expect linkea_repaired to be 1 and the
# path2fid/fid2path round trip on d0/d1 to return the original path.
551 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
552 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
553 skip "MDS older than 2.6.50, LU-5511"
557 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
# Only the creation of d1 happens with the injection armed.
559 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
560 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
561 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
562 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
564 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
# Arguments: component "namespace", status "completed", error tag 4.
566 wait_all_targets_blocked namespace completed 4
568 local repaired=$($SHOW_NAMESPACE |
569 awk '/^linkea_repaired/ { print $2 }')
570 [ $repaired -eq 1 ] ||
571 error "(5) Fail to repair crashed linkEA: $repaired"
573 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
574 local name=$($LFS fid2path $DIR $fid)
575 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
576 error "(6) Fail to repair linkEA: $fid $name"
578 run_test 2e "namespace LFSCK can verify remote object linkEA"
# Body of the test registered below as run_test 3.
# NOTE(review): the function header, closing braces and some statements
# (including whatever creates d0/f0 and d0/f1) are not visible in this
# excerpt.
# Flow: skip unless MDS1_VERSION is greater than 2.6.50; hard-link d0/f0
# and d0/f1 into a new directory; create edir/f0 and edir/f1 and hard-link
# them as w0 (under fail_loc 0x1603) and w1 (under fail_loc 0x1604); run
# namespace LFSCK with -r until completed; expect checked_phase2 >= 4 and
# multiple_linked_repaired >= 2.
582 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
583 skip "MDS older than 2.6.50, LU-4788"
587 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
588 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
589 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
591 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
592 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
593 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
# Each of the next two hard links is created under a different injection.
595 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
596 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
597 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
599 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
600 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
601 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
603 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
605 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
606 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
607 mdd.${MDT_DEV}.lfsck_namespace |
608 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
610 error "(10) unexpected status"
# These two checks are lower bounds (-ge), not exact counts.
613 local checked=$($SHOW_NAMESPACE |
614 awk '/^checked_phase2/ { print $2 }')
615 [ $checked -ge 4 ] ||
616 error "(11) Fail to check multiple-linked object: $checked"
618 local repaired=$($SHOW_NAMESPACE |
619 awk '/^multiple_linked_repaired/ { print $2 }')
620 [ $repaired -ge 2 ] ||
621 error "(12) Fail to repair multiple-linked object: $repaired"
623 run_test 3 "LFSCK can verify multiple-linked objects"
# Body of the test registered below as run_test 4.
# NOTE(review): the function header, closing braces and some statements are
# not visible in this excerpt.
# Flow: skip unless mds1 is ldiskfs; unmount the client and stop the MDS;
# call mds_backup_restore; start the MDS with the noscrub options; arm
# fail_loc 0x1601 with fail_val=1 and start namespace LFSCK with -r; wait
# for flags "inconsistent" while the status is scanning-phase1; clear the
# injection and wait for completed; expect empty flags and at least 9
# repairs; then mount the client and list the directory under fail_loc
# 0x1505.
627 [ "$mds1_FSTYPE" != ldiskfs ] &&
628 skip "OI Scrub not implemented for ZFS"
631 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
632 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
634 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
635 echo "start $SINGLEMDS with disabling OI scrub"
636 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
637 error "(2) Fail to start MDS!"
639 #define OBD_FAIL_LFSCK_DELAY2 0x1601
640 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
641 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
# This wait polls the "flags" line, not the "status" line.
642 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
643 mdd.${MDT_DEV}.lfsck_namespace |
644 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
646 error "(5) unexpected status"
649 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
650 [ "$STATUS" == "scanning-phase1" ] ||
651 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
653 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
654 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
655 mdd.${MDT_DEV}.lfsck_namespace |
656 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
658 error "(7) unexpected status"
# NOTE(review): FLAGS is assigned without "local" in the visible lines.
661 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
662 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
# Prefer dirent_repaired; fall back to updated_phase1 when it is absent.
664 local repaired=$($SHOW_NAMESPACE |
665 awk '/^dirent_repaired/ { print $2 }')
666 # for interop with old server
667 [ -z "$repaired" ] &&
668 repaired=$($SHOW_NAMESPACE |
669 awk '/^updated_phase1/ { print $2 }')
671 [ $repaired -ge 9 ] ||
672 error "(9) Fail to re-generate FID-in-dirent: $repaired"
676 mount_client $MOUNT || error "(10) Fail to start client!"
678 #define OBD_FAIL_FID_LOOKUP 0x1505
679 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
680 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
681 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
683 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
# Body of the test registered below as run_test 5.
# NOTE(review): the function header, closing braces and some statements are
# not visible in this excerpt.
# Flow: skip unless mds1 is ldiskfs; unmount the client and stop the MDS;
# call mds_backup_restore with an extra argument of 1; start the MDS with
# the noscrub options; arm fail_loc 0x1601 with fail_val=1 and start
# namespace LFSCK with -r; wait for flags "inconsistent,upgrade" while the
# status is scanning-phase1; clear the injection and wait for completed;
# expect empty flags and at least 2 repairs; then mount the client, stat
# and list under fail_loc 0x1505, and check the path2fid/fid2path round
# trip for $DIR/$tdir/dummy.
687 [ "$mds1_FSTYPE" != ldiskfs ] &&
688 skip "OI Scrub not implemented for ZFS"
691 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
692 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
694 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
695 echo "start $SINGLEMDS with disabling OI scrub"
696 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
697 error "(2) Fail to start MDS!"
699 #define OBD_FAIL_LFSCK_DELAY2 0x1601
700 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
701 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
# This wait polls the "flags" line, not the "status" line.
702 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
703 mdd.${MDT_DEV}.lfsck_namespace |
704 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
706 error "(5) unexpected status"
709 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
710 [ "$STATUS" == "scanning-phase1" ] ||
711 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
713 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
714 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
715 mdd.${MDT_DEV}.lfsck_namespace |
716 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
718 error "(7) unexpected status"
# NOTE(review): FLAGS is assigned without "local" in the visible lines.
721 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
722 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
# Prefer dirent_repaired; fall back to updated_phase1 when it is absent.
724 local repaired=$($SHOW_NAMESPACE |
725 awk '/^dirent_repaired/ { print $2 }')
726 # for interop with old server
727 [ -z "$repaired" ] &&
728 repaired=$($SHOW_NAMESPACE |
729 awk '/^updated_phase1/ { print $2 }')
731 [ $repaired -ge 2 ] ||
732 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
736 mount_client $MOUNT || error "(10) Fail to start client!"
# Both the stat and the listing run with the lookup injection armed.
738 #define OBD_FAIL_FID_LOOKUP 0x1505
739 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
740 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
742 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
744 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
745 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
746 local dummyname=$($LFS fid2path $DIR $dummyfid)
747 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
748 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
750 run_test 5 "LFSCK can handle IGIF object upgrading"
# Body of the test registered below as run_test 6a.
# NOTE(review): the function header, closing braces and some statements
# (the sleep mentioned in a comment below, and the tail of the two POS
# pipelines) are not visible in this excerpt.
# Flow: arm fail_loc 0x1600 with fail_val=1 and start namespace LFSCK with
# -r; expect scanning-phase1; set fail_loc 0x80001608 and wait for status
# "failed"; read last_checkpoint_position into POS0; re-arm 0x1600 and
# start LFSCK again without -r; read latest_start_position into POS1 and
# require POS0 < POS1; clear the injection and wait for completed.
755 #define OBD_FAIL_LFSCK_DELAY1 0x1600
756 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
757 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
759 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
760 [ "$STATUS" == "scanning-phase1" ] ||
761 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
763 # Sleep 3 sec to guarantee at least one object processed by LFSCK
765 # Fail the LFSCK to guarantee there is at least one checkpoint
# 0x80001608 is 0x1608 with the top bit set — presumably a flag that makes
# the injection fire only once; confirm against the fail_loc definitions.
766 #define OBD_FAIL_LFSCK_FATAL1 0x1608
767 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
768 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
769 mdd.${MDT_DEV}.lfsck_namespace |
770 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
772 error "(4) unexpected status"
775 local POS0=$($SHOW_NAMESPACE |
776 awk '/^last_checkpoint_position/ { print $2 }' |
779 #define OBD_FAIL_LFSCK_DELAY1 0x1600
780 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
781 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
783 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
784 [ "$STATUS" == "scanning-phase1" ] ||
785 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
787 local POS1=$($SHOW_NAMESPACE |
788 awk '/^latest_start_position/ { print $2 }' |
790 [[ $POS0 -lt $POS1 ]] ||
791 error "(7) Expect larger than: $POS0, but got $POS1"
793 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
794 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
795 mdd.${MDT_DEV}.lfsck_namespace |
796 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
798 error "(8) unexpected status"
801 run_test 6a "LFSCK resumes from last checkpoint (1)"
# Body of the test registered below as run_test 6b.
# NOTE(review): the function header, closing braces/keywords and some
# statements (the sleep mentioned in a comment below, the tail of the two
# O_POS pipelines, and the else of the final comparison) are not visible
# in this excerpt.
# Flow: arm fail_loc 0x1601 with fail_val=1 and start namespace LFSCK with
# -r; expect scanning-phase1; set fail_loc 0x80001609 and wait for status
# "failed"; read fields 2 and 4 of last_checkpoint_position; re-arm 0x1601
# and start LFSCK again without -r; read fields 2 and 4 of
# latest_start_position; compare the positions; clear the injection and
# wait for completed.
806 #define OBD_FAIL_LFSCK_DELAY2 0x1601
807 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
808 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
810 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
811 [ "$STATUS" == "scanning-phase1" ] ||
812 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
814 # Sleep 5 sec to guarantee that we are in the directory scanning
816 # Fail the LFSCK to guarantee there is at least one checkpoint
# 0x80001609 is 0x1609 with the top bit set — presumably a flag that makes
# the injection fire only once; confirm against the fail_loc definitions.
817 #define OBD_FAIL_LFSCK_FATAL2 0x1609
818 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
819 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
820 mdd.${MDT_DEV}.lfsck_namespace |
821 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
823 error "(4) unexpected status"
826 local O_POS0=$($SHOW_NAMESPACE |
827 awk '/^last_checkpoint_position/ { print $2 }' |
830 local D_POS0=$($SHOW_NAMESPACE |
831 awk '/^last_checkpoint_position/ { print $4 }')
833 #define OBD_FAIL_LFSCK_DELAY2 0x1601
834 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
835 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
837 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
838 [ "$STATUS" == "scanning-phase1" ] ||
839 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
841 local O_POS1=$($SHOW_NAMESPACE |
842 awk '/^latest_start_position/ { print $2 }' |
844 local D_POS1=$($SHOW_NAMESPACE |
845 awk '/^latest_start_position/ { print $4 }')
847 echo "Additional debug for 6b"
# When either D_POS value is "N/A" or "0x0", the O_POS values are compared.
# NOTE(review): the D_POS comparison below presumably sits in an else
# branch whose keyword is not visible here — confirm.
849 if [ "$D_POS0" == "N/A" -o "$D_POS0" == "0x0" \
850 -o "$D_POS1" == "0x0" -o "$D_POS1" == "N/A" ]; then
851 [[ $O_POS0 -lt $O_POS1 ]] ||
852 error "(7.1) $O_POS1 is not larger than $O_POS0"
854 [[ $D_POS0 -lt $D_POS1 ]] ||
855 error "(7.2) $D_POS1 is not larger than $D_POS0"
858 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
859 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
860 mdd.${MDT_DEV}.lfsck_namespace |
861 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
863 error "(8) unexpected status"
866 run_test 6b "LFSCK resumes from last checkpoint (2)"
# Body of the test registered below as run_test 7a.
# NOTE(review): the function header, closing braces and some statements
# (including the sleep mentioned in a comment below) are not visible in
# this excerpt.
# Flow: arm fail_loc 0x1601 with fail_val=1 and start namespace LFSCK with
# -r; expect scanning-phase1; stop the MDS while LFSCK is still scanning;
# clear the injection; start the MDS with the scrub-enabled options; wait
# for the namespace LFSCK status to read "completed" without issuing
# another lfsck_start.
873 #define OBD_FAIL_LFSCK_DELAY2 0x1601
874 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
875 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
877 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
878 [ "$STATUS" == "scanning-phase1" ] ||
879 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
881 # Sleep 3 sec to guarantee at least one object processed by LFSCK
883 echo "stop $SINGLEMDS"
884 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop MDS!"
# The injection is cleared while the MDS target is stopped, before restart.
886 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
887 echo "start $SINGLEMDS"
888 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
889 error "(5) Fail to start MDS!"
# The limit here is 30, unlike the 32 used by the other waits in this file.
891 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
892 mdd.${MDT_DEV}.lfsck_namespace |
893 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
895 error "(6) unexpected status"
898 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
# Body of the test registered below as run_test 7b.
# NOTE(review): the function header, closing braces/keywords and some
# statements are not visible in this excerpt.
# Flow: create dummy0..dummy19 under $DIR/$tdir while fail_loc 0x1604 is
# set; arm fail_loc 0x1602 with fail_val=1 and start namespace LFSCK with
# -r; wait for status scanning-phase2; stop the MDS; clear the injection;
# start the MDS with the scrub-enabled options; wait for the status to
# read "completed" without issuing another lfsck_start.
904 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
905 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
906 for ((i = 0; i < 20; i++)); do
907 touch $DIR/$tdir/dummy${i}
# Setting 0x1602 replaces the 0x1604 value written above.
910 #define OBD_FAIL_LFSCK_DELAY3 0x1602
911 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
912 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
913 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
914 mdd.${MDT_DEV}.lfsck_namespace |
915 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
917 error "(4) unexpected status"
921 echo "stop $SINGLEMDS"
922 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop MDS!"
# The injection is cleared while the MDS target is stopped, before restart.
924 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
925 echo "start $SINGLEMDS"
926 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
927 error "(6) Fail to start MDS!"
# The limit here is 30, unlike the 32 used by the other waits in this file.
929 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
930 mdd.${MDT_DEV}.lfsck_namespace |
931 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
933 error "(7) unexpected status"
936 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
947 formatall > /dev/null
953 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
954 [ "$STATUS" == "init" ] ||
955 namespace_error "(2) Expect 'init', but got '$STATUS'"
957 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
958 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
959 mkdir $DIR/$tdir/crashed
961 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
962 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
963 for ((i = 0; i < 5; i++)); do
964 touch $DIR/$tdir/dummy${i}
967 umount_client $MOUNT || error "(3) Fail to stop client!"
969 #define OBD_FAIL_LFSCK_DELAY2 0x1601
970 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
972 namespace_error "(4) Fail to start LFSCK for namespace!"
974 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
975 [ "$STATUS" == "scanning-phase1" ] ||
976 namespace_error "(5) Expect 'scanning-phase1', but got '$STATUS'"
978 $STOP_LFSCK || namespace_error "(6) Fail to stop LFSCK!"
980 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
981 [ "$STATUS" == "stopped" ] ||
982 namespace_error "(7) Expect 'stopped', but got '$STATUS'"
985 namespace_error "(8) Fail to start LFSCK for namespace!"
987 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
988 [ "$STATUS" == "scanning-phase1" ] ||
989 namespace_error "(9) Expect 'scanning-phase1', but got '$STATUS'"
991 #define OBD_FAIL_LFSCK_FATAL2 0x1609
992 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
993 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
994 mdd.${MDT_DEV}.lfsck_namespace |
995 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
997 namespace_error "(10) unexpected status"
1000 #define OBD_FAIL_LFSCK_DELAY1 0x1600
1001 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
1003 namespace_error "(11) Fail to start LFSCK for namespace!"
1005 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1006 [ "$STATUS" == "scanning-phase1" ] ||
1007 namespace_error "(12) Expect 'scanning-phase1', but got '$STATUS'"
1009 #define OBD_FAIL_LFSCK_CRASH 0x160a
1010 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
1013 echo "stop $SINGLEMDS"
1014 stop $SINGLEMDS > /dev/null || namespace_error "(13) Fail to stop MDS!"
1016 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
1017 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
1019 echo "start $SINGLEMDS"
1020 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
1021 namespace_error "(14) Fail to start MDS!"
1023 local timeout=$(max_recovery_time)
1026 while [ $timer -lt $timeout ]; do
1027 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1028 mdt.${MDT_DEV}.recovery_status |
1029 awk '/^status/ { print \\\$2 }'")
1030 [ "$STATUS" != "RECOVERING" ] && break;
1032 timer=$((timer + 1))
1035 [ $timer != $timeout ] || (
1036 do_facet $SINGLEMDS "$LCTL get_param -n \
1037 mdt.${MDT_DEV}.recovery_status"
1038 error "(14.1) recovery timeout"
1041 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1042 [ "$STATUS" == "crashed" ] ||
1043 namespace_error "(15) Expect 'crashed', but got '$STATUS'"
1045 #define OBD_FAIL_LFSCK_DELAY2 0x1601
1046 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
1048 namespace_error "(16) Fail to start LFSCK for namespace!"
1050 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1051 [ "$STATUS" == "scanning-phase1" ] ||
1052 namespace_error "(17) Expect 'scanning-phase1', but got '$STATUS'"
1054 echo "stop $SINGLEMDS"
1055 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop MDS!"
1057 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
1058 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
1060 echo "start $SINGLEMDS"
1061 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
1062 error "(19) Fail to start MDS!"
1065 while [ $timer -lt $timeout ]; do
1066 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1067 mdt.${MDT_DEV}.recovery_status |
1068 awk '/^status/ { print \\\$2 }'")
1069 [ "$STATUS" != "RECOVERING" ] && break;
1071 timer=$((timer + 1))
1074 [ $timer != $timeout ] || (
1075 do_facet $SINGLEMDS "$LCTL get_param -n \
1076 mdt.${MDT_DEV}.recovery_status"
1077 error "(19.1) recovery timeout"
1080 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1081 [ "$STATUS" == "paused" ] ||
1082 namespace_error "(20) Expect 'paused', but got '$STATUS'"
1084 echo "stop $SINGLEMDS"
1085 stop $SINGLEMDS > /dev/null || error "(20.1) Fail to stop MDS!"
1087 echo "start $SINGLEMDS without resume LFSCK"
1088 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SKIP_LFSCK > /dev/null ||
1089 error "(20.2) Fail to start MDS!"
1092 while [ $timer -lt $timeout ]; do
1093 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1094 mdt.${MDT_DEV}.recovery_status |
1095 awk '/^status/ { print \\\$2 }'")
1096 [ "$STATUS" != "RECOVERING" ] && break;
1098 timer=$((timer + 1))
1101 [ $timer != $timeout ] || (
1102 do_facet $SINGLEMDS "$LCTL get_param -n \
1103 mdt.${MDT_DEV}.recovery_status"
1104 error "(20.3) recovery timeout"
1107 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1108 [ "$STATUS" == "paused" ] ||
1109 namespace_error "(20.4) Expect 'paused', but got '$STATUS'"
1111 #define OBD_FAIL_LFSCK_DELAY3 0x1602
1112 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
1115 namespace_error "(21) Fail to start LFSCK for namespace!"
1116 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1117 mdd.${MDT_DEV}.lfsck_namespace |
1118 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
1120 namespace_error "(22) unexpected status"
1123 # wait to process one inode at least (OBD_FAIL_LFSCK_DELAY3)
1126 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1127 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
1128 namespace_error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
1130 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1131 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1132 mdd.${MDT_DEV}.lfsck_namespace |
1133 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1135 namespace_error "(24) unexpected status"
1138 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1140 namespace_error "(25) Expect empty flags, but got '$FLAGS'"
1142 run_test 8 "LFSCK state machine"
# Test 9a: layout LFSCK speed control.
# Starts a layout LFSCK with a speed limit, checks that the reported
# average_speed_phase1 stays under an upper bound, then changes the limit
# and checks the reported speed against a lower and an upper bound.
# NOTE(review): the function header/footer and a few short lines (e.g. where
# RUN_TIME1, RUN_TIME2 and TIME_DIFF are set) are not visible in this listing.
#
# Skip when /proc/cpuinfo has no "processor : 1" entry (uniprocessor system).
1145 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1146 skip "Testing on UP system, the speed may be inaccurate."
# Prepare 5000 files under a directory created on MDT index 0.
1150 check_mount_and_prep
1151 $LFS mkdir -i 0 $DIR/$tdir/lfsck || error "(1) Fail to mkdir lfsck"
1152 $LFS setstripe -c 1 -i -1 $DIR/$tdir/lfsck
1153 createmany -o $DIR/$tdir/lfsck/f 5000
1155 local BASE_SPEED1=100
# Start the layout LFSCK with BASE_SPEED1 passed via -s.
1157 $START_LAYOUT -r -s $BASE_SPEED1 || error "(2) Fail to start LFSCK!"
# With the limit in place the scan is expected to still be in phase 1.
1160 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
1161 [ "$STATUS" == "scanning-phase1" ] ||
1162 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
1164 local SPEED=$($SHOW_LAYOUT |
1165 awk '/^average_speed_phase1/ { print $2 }')
# NOTE(review): the comment below says 20%, but the margin applied is 13/10
# (see MAX_MARGIN) - confirm which tolerance is intended.
1167 # There may be time error, normally it should be less than 2 seconds.
1168 # We allow another 20% schedule error.
1170 # MAX_MARGIN = 1.3 = 13 / 10
1171 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1172 RUN_TIME1 * 13 / 10))
1173 [ $SPEED -lt $MAX_SPEED ] || {
1175 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1176 error "(4) Speed $SPEED, expected < $MAX_SPEED"
1179 # adjust speed limit
1180 local BASE_SPEED2=300
1182 do_facet $SINGLEMDS \
1183 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
# Re-read the average speed; the bounds below weight both speed limits by
# their run times.
1186 SPEED=$($SHOW_LAYOUT | awk '/^average_speed_phase1/ { print $2 }')
1187 # MIN_MARGIN = 0.7 = 7 / 10
1188 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1189 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1190 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
# On a non-ldiskfs MDT a too-low speed is reported through error_ignore
# (LU-5624) instead of the (5.2) message used otherwise.
1191 [ $SPEED -gt $MIN_SPEED ] || {
1192 if [ $mds1_FSTYPE != ldiskfs ]; then
1193 error_ignore LU-5624 \
1194 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
1197 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
1201 # MAX_MARGIN = 1.3 = 13 / 10
1202 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1203 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1204 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1205 [ $SPEED -lt $MAX_SPEED ] || {
1207 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1208 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1209 error "(6) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit to 0 on all MDT and OST nodes, then wait for the
# layout LFSCK to report "completed".
1212 do_nodes $(comma_list $(mdts_nodes)) \
1213 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1214 do_nodes $(comma_list $(osts_nodes)) \
1215 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1217 wait_update_facet $SINGLEMDS \
1218 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout |
1219 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
1220 error "(7) Failed to get expected 'completed'"
1222 run_test 9a "LFSCK speed control (1)"
# Test 9b: namespace LFSCK speed control during the second scanning phase.
# Creates files while fail_loc 0x1604 is set, runs one namespace LFSCK with
# fail_loc 0x160c until it reports "stopped", then restarts it with a speed
# limit and checks average_speed_phase2 against computed bounds.
# NOTE(review): the function header/footer and a few short lines (e.g. where
# RUN_TIME1, RUN_TIME2 and TIME_DIFF are set) are not visible in this listing.
#
# Skip when /proc/cpuinfo has no "processor : 1" entry (uniprocessor system).
1225 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1226 skip "Testing on UP system, the speed may be inaccurate."
1232 echo "Preparing another 50 * 50 files (with error) at $(date)."
1233 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1234 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
# 50 directories and 50 files at the top level, then 50 files in each
# directory d0..d49.
1235 createmany -d $DIR/$tdir/d 50
1236 createmany -m $DIR/$tdir/f 50
1237 for ((i = 0; i < 50; i++)); do
1238 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
1241 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
1242 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
# First run: with 0x160c set the namespace LFSCK is expected to end up in
# the "stopped" state.
1243 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
1244 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1245 mdd.${MDT_DEV}.lfsck_namespace |
1246 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1248 error "(5) unexpected status"
1251 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1252 echo "Prepared at $(date)."
1254 local BASE_SPEED1=50
# Second run (no -r): started with BASE_SPEED1 as the limit and expected to
# be found in scanning-phase2.
1256 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
1259 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1260 [ "$STATUS" == "scanning-phase2" ] ||
1261 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
1263 local SPEED=$($SHOW_NAMESPACE |
1264 awk '/^average_speed_phase2/ { print $2 }')
# NOTE(review): the comment below says 20%, but the margin applied is 13/10
# (see MAX_MARGIN) - confirm which tolerance is intended.
1265 # There may be time error, normally it should be less than 2 seconds.
1266 # We allow another 20% schedule error.
1268 # MAX_MARGIN = 1.3 = 13 / 10
1269 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1270 RUN_TIME1 * 13 / 10))
1271 [ $SPEED -lt $MAX_SPEED ] || {
1273 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1274 error "(8) Speed $SPEED, expected < $MAX_SPEED"
1277 # adjust speed limit
1278 local BASE_SPEED2=150
1280 do_facet $SINGLEMDS \
1281 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
# Re-read the average speed; the bounds below weight both speed limits by
# their run times.
1284 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
1285 # MIN_MARGIN = 0.7 = 7 / 10
1286 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1287 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1288 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
# On a non-ldiskfs MDT a too-low speed is reported through error_ignore
# (LU-5624) instead of the (9.2) message used otherwise.
1289 [ $SPEED -gt $MIN_SPEED ] || {
1290 if [ $mds1_FSTYPE != ldiskfs ]; then
1291 error_ignore LU-5624 \
1292 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1295 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
1299 # MAX_MARGIN = 1.3 = 13 / 10
1300 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1301 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1302 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1303 [ $SPEED -lt $MAX_SPEED ] || {
1305 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1306 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1307 error "(10) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit to 0 on all MDT and OST nodes, then wait for the
# namespace LFSCK to report "completed".
1310 do_nodes $(comma_list $(mdts_nodes)) \
1311 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1312 do_nodes $(comma_list $(osts_nodes)) \
1313 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1314 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1315 mdd.${MDT_DEV}.lfsck_namespace |
1316 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1318 error "(11) unexpected status"
1321 run_test 9b "LFSCK speed control (2)"
# Test 10: the filesystem stays usable while a namespace LFSCK is scanning.
# Creates files under two different fail_loc settings, starts a speed-limited
# namespace LFSCK, runs ordinary file operations while the status is
# scanning-phase1, then lifts the limit and waits for "completed".
# NOTE(review): the function header/footer and some short lines are not
# visible in this listing.
#
# Only runs when the MDT backend is ldiskfs.
1325 [[ $mds1_FSTYPE == ldiskfs ]] || skip "lookup(..)/linkea on ZFS issue"
1329 echo "Preparing more files with error at $(date)."
1330 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1331 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
# Even indices (0, 2, ..., 998) are created while fail_loc=0x1603 is set.
1333 for ((i = 0; i < 1000; i = $((i+2)))); do
1334 mkdir -p $DIR/$tdir/d${i}
1335 touch $DIR/$tdir/f${i}
1336 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1339 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1340 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
# Odd indices (1, 3, ..., 999) are created while fail_loc=0x1604 is set.
1342 for ((i = 1; i < 1000; i = $((i+2)))); do
1343 mkdir -p $DIR/$tdir/d${i}
1344 touch $DIR/$tdir/f${i}
1345 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1348 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1349 echo "Prepared at $(date)."
# Add a hard link to f200; f200 itself is unlinked later during the scan.
1351 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
# Remount the client before starting the scan.
1353 umount_client $MOUNT
1354 mount_client $MOUNT || error "(3) Fail to start client!"
# Start the namespace LFSCK with a speed limit of 100 so that it is still in
# phase 1 while the operations below run.
1356 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1359 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1360 [ "$STATUS" == "scanning-phase1" ] ||
1361 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Exercise list, create, mkdir, unlink, recursive remove, rename, hard link
# and symlink; each must succeed.
1363 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1365 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1367 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1369 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1371 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1373 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1375 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1377 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1378 error "(14) Fail to softlink!"
# The scan must still be in phase 1 after the operations above.
1380 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1381 [ "$STATUS" == "scanning-phase1" ] ||
1382 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
# Set lfsck_speed_limit to 0 on all MDT and OST nodes and wait for
# "completed".
1384 do_nodes $(comma_list $(mdts_nodes)) \
1385 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1386 do_nodes $(comma_list $(osts_nodes)) \
1387 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1388 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1389 mdd.${MDT_DEV}.lfsck_namespace |
1390 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1392 error "(16) unexpected status"
1395 run_test 10 "System is available during LFSCK scanning"
# Remove O/${idx}/LAST_ID and O/${idx}/d0/0 from the backing filesystem of
# ost${ost}, mounting the target locally for the duration of the removal.
# NOTE(review): the assignments of 'ost' and 'idx' and the closing brace are
# not visible in this listing; a caller in this file passes two positional
# arguments ("ost_remove_lastid 1 0") - presumably ost number and idx.
# Returns: 1 if the local mount fails, 2 if the unmount fails.
1398 ost_remove_lastid() {
1401 local rcmd="do_facet ost${ost}"
1403 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1405 # step 1: local mount
1406 mount_fstype ost${ost} || return 1
1407 # step 2: remove the specified LAST_ID
# rm -f: the exit status of the removal itself is not checked here.
1408 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
1410 unmount_fstype ost${ost} || return 2
# Test 11a: layout LFSCK on an OST rebuilds a removed LAST_ID file.
# Removes LAST_ID on ost1, restarts ost1, starts the layout LFSCK on the OST
# and expects the "crashed_lastid" flag to appear and then to be cleared once
# the scan has completed.
# NOTE(review): the function header/footer and some short lines (e.g. between
# file creation and ost_remove_lastid) are not visible in this listing.
1414 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1415 skip "MDS older than 2.5.55, LU-1267"
1417 check_mount_and_prep
1418 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1419 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
1424 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
# Bring ost1 back using the noscrub mount options.
1426 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1427 error "(2) Fail to start ost1"
# The DELAY4 fail_loc is set before the scan starts so that the
# "crashed_lastid" flag can be observed below.
1429 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1430 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1432 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1433 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
1435 wait_update_facet ost1 "$LCTL get_param -n \
1436 obdfilter.${OST_DEV}.lfsck_layout |
1437 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1439 error "(5) unexpected status"
# Clear the fail_loc and let the scan run to completion.
1442 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1444 wait_update_facet ost1 "$LCTL get_param -n \
1445 obdfilter.${OST_DEV}.lfsck_layout |
1446 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1448 error "(6) unexpected status"
# After completion the flags field must be empty again.
1451 echo "the LAST_ID(s) should have been rebuilt"
1452 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1453 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
1455 run_test 11a "LFSCK can rebuild lost last_id"
# Test 11b: layout LFSCK on an OST rebuilds an out-of-date on-disk LAST_ID.
# Creates objects while fail_loc 0x160d is set, restarts ost1, checks that
# the OST's last_id is smaller than the id the MDT has used, runs the layout
# LFSCK on the OST, restarts ost1 again and expects last_id >= id_used.
# NOTE(review): the function header/footer and some short lines (e.g. the end
# of the polling loop) are not visible in this listing.
1458 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1459 skip "MDS older than 2.5.55, LU-1267"
1461 check_mount_and_prep
1462 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1464 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1465 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1466 do_facet ost1 $LCTL set_param fail_loc=0x160d
# Create 32 more files than the count reported by precreated_ost_obj_count.
1468 local count=$(precreated_ost_obj_count 0 0)
1470 createmany -o $DIR/$tdir/f $((count + 32))
# Read prealloc_last_seq / prealloc_last_id from the osp device on mds1 that
# corresponds to OST0000.
1472 local proc_path="${FSNAME}-OST0000-osc-MDT0000"
1473 local seq=$(do_facet mds1 $LCTL get_param -n \
1474 osp.${proc_path}.prealloc_last_seq)
1475 local id_used=$(do_facet mds1 $LCTL get_param -n \
1476 osp.${proc_path}.prealloc_last_id)
1478 umount_client $MOUNT
1479 stop ost1 || error "(1) Fail to stop ost1"
1481 #define OBD_FAIL_OST_ENOSPC 0x215
1482 do_facet ost1 $LCTL set_param fail_loc=0x215
1484 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1485 error "(2) Fail to start ost1"
# Poll up to 60 times for the last_id entry whose line matches $seq; the
# value after the ':' separator is taken as the OST's last id.
1487 for ((i = 0; i < 60; i++)); do
1488 id_ost1=$(do_facet ost1 \
1489 "$LCTL get_param -n obdfilter.$ost1_svc.last_id" |
1490 awk -F: "/$seq/ { print \$2 }")
1491 [ -n "$id_ost1" ] && break
1495 echo "the on-disk LAST_ID should be smaller than the expected one"
1496 [ $id_used -gt $id_ost1 ] ||
1497 error "(4) expect id_used '$id_used' > id_ost1 '$id_ost1'"
1499 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1500 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1502 wait_update_facet ost1 \
1503 "$LCTL get_param -n obdfilter.$ost1_svc.lfsck_layout |
1504 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1506 error "(6) unexpected status"
# Restart ost1 before re-reading last_id.
1509 stop ost1 || error "(7) Fail to stop ost1"
1511 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1512 error "(8) Fail to start ost1"
1514 echo "the on-disk LAST_ID should have been rebuilt"
1515 # last_id may be larger than $id_used if objects were created/skipped
1516 wait_update_facet_cond ost1 \
1517 "$LCTL get_param -n obdfilter.$ost1_svc.last_id |
1518 awk -F: '/$seq/ { print \\\$2 }'" "-ge" "$id_used" 60 || {
1519 do_facet ost1 $LCTL get_param obdfilter.$ost1_svc.last_id
1520 error "(9) expect last_id >= id_used $seq:$id_used"
# Clear the fail_loc on ost1 and stop all targets.
1523 do_facet ost1 $LCTL set_param fail_loc=0
1524 stopall || error "(10) Fail to stopall"
1526 run_test 11b "LFSCK can rebuild crashed last_id"
# Test 12a: a single lctl command (-A) starts and stops LFSCK on all targets.
# Runs the same start / stop / restart sequence first for the namespace
# LFSCK and then for the layout LFSCK, checking the status of the targets
# after each step.
# NOTE(review): the function header/footer and the loop terminators are not
# visible in this listing.
1529 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
1530 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1531 skip "MDS older than 2.5.55, LU-3950"
1533 check_mount_and_prep
# One directory per MDT (index k-1), each holding 100 files.
1534 for k in $(seq $MDSCOUNT); do
1535 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1536 createmany -o $DIR/$tdir/${k}/f 100 ||
1537 error "(0) Fail to create 100 files."
# Namespace LFSCK: start with -s 1, stop, then restart with -s 0.
1540 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1541 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1542 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1544 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1545 wait_all_targets namespace scanning-phase1 3
1547 echo "Stop namespace LFSCK on all targets by single lctl command."
1548 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1549 error "(4) Fail to stop LFSCK on all devices!"
1551 echo "All the LFSCK targets should be in 'stopped' status."
1552 wait_all_targets_blocked namespace stopped 5
1554 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1555 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1556 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1558 echo "All the LFSCK targets should be in 'completed' status."
1559 wait_all_targets_blocked namespace completed 7
# Layout LFSCK: same sequence, run with full debug logging enabled.
1561 start_full_debug_logging
1563 echo "Start layout LFSCK on all targets by single command (-s 1)."
1564 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1565 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1567 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1568 wait_all_targets layout scanning-phase1 9
1570 echo "Stop layout LFSCK on all targets by single lctl command."
1571 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1572 error "(10) Fail to stop LFSCK on all devices!"
1574 echo "All the LFSCK targets should be in 'stopped' status."
1575 wait_all_targets_blocked layout stopped 11
# Every OST must also report "stopped" for its layout LFSCK.
1577 for k in $(seq $OSTCOUNT); do
1578 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1579 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1580 awk '/^status/ { print $2 }')
1581 [ "$STATUS" == "stopped" ] ||
1582 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1585 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1586 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1587 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1589 echo "All the LFSCK targets should be in 'completed' status."
1590 wait_all_targets_blocked layout completed 14
1592 stop_full_debug_logging
1594 run_test 12a "single command to trigger LFSCK on all devices"
# Test 12b: lctl lfsck_start without '-M'.
# Starts LFSCK with only '-A -r' and expects both the namespace and layout
# scans to complete; when more than one mdt device is listed on mds1, a start
# with neither '-M' nor '-A' is expected to fail.
# NOTE(review): the function header/footer and the closing 'fi' are not
# visible in this listing.
1597 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1598 skip "MDS older than 2.5.55, LU-3950"
1600 check_mount_and_prep
1602 echo "Start LFSCK without '-M' specified."
1603 do_facet mds1 $LCTL lfsck_start -A -r ||
1604 error "(0) Fail to start LFSCK without '-M'"
1606 wait_all_targets_blocked namespace completed 1
1607 wait_all_targets_blocked layout completed 2
# Count the 'lctl dl' entries on mds1 whose third column contains "mdt".
1609 local count=$(do_facet mds1 $LCTL dl |
1610 awk '{ print $3 }' | grep mdt | wc -l)
1611 if [ $count -gt 1 ]; then
1613 echo "Start layout LFSCK on the node with multipe targets,"
1614 echo "but not specify '-M'/'-A' option. Should get failure."
# Success of lfsck_start is the error case here; the trailing '|| true'
# makes the list return success when the start fails as expected.
1616 do_facet mds1 $LCTL lfsck_start -t layout -r &&
1617 error "(3) Start layout LFSCK should fail" || true
1620 run_test 12b "auto detect Lustre device"
# Test 13: layout LFSCK repairs a bad lmm_oi in the layout EA.
# Creates one plain file and one PFL file while fail_loc 0x160f is set, runs
# the layout LFSCK and expects repaired_others to be exactly 2.
# NOTE(review): the function header/footer are not visible in this listing.
1623 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1624 skip "MDS older than 2.5.55, LU-3593"
1627 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1628 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1629 echo "MDT-object FID."
1632 check_mount_and_prep
1634 echo "Inject failure stub to simulate bad lmm_oi"
1635 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1636 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
# Two files are created under the fail_loc: f0 (createmany) and the
# two-component PFL file f1.
1637 createmany -o $DIR/$tdir/f 1
1638 $LFS setstripe -E 1M -S 1M -E -1 $DIR/$tdir/f1 ||
1639 error "(0) Fail to create PFL $DIR/$tdir/f1"
1640 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1642 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1643 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1645 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1646 mdd.${MDT_DEV}.lfsck_layout |
1647 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1649 error "(2) unexpected status"
# One repair is expected for each of the two files created above.
1652 local repaired=$($SHOW_LAYOUT |
1653 awk '/^repaired_others/ { print $2 }')
1654 [ $repaired -eq 2 ] ||
1655 error "(3) Fail to repair crashed lmm_oi: $repaired"
1657 run_test 13 "LFSCK can repair crashed lmm_oi"
# Test 14a: layout LFSCK handles MDT-objects with dangling LOV EA references
# (without the delay-create option).
# Creates files while fail_loc 0x1610 is set on ost1, runs the layout LFSCK
# once with '-r' (stat of the guard files must still fail afterwards) and
# once with '-r -c' (stat must then succeed with size 0).
# NOTE(review): the function header/footer and some short lines (loop
# terminator, stopall, block closers) are not visible in this listing.
1660 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1661 skip "MDS older than 2.5.55, LU-3590"
1664 echo "The OST-object referenced by the MDT-object should be there;"
1665 echo "otherwise, the LFSCK should re-create the missing OST-object."
1666 echo "without '--delay-create-ostobj' option."
1669 check_mount_and_prep
1670 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1672 echo "Inject failure stub to simulate dangling referenced MDT-object"
1673 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1674 do_facet ost1 $LCTL set_param fail_loc=0x1610
# Create 16 more files than the count reported by precreated_ost_obj_count,
# then the first guard file.
1675 local count=$(precreated_ost_obj_count 0 0)
1677 createmany -o $DIR/$tdir/f $((count + 16)) ||
1678 error "(0.1) Fail to create $DIR/$tdir/fx"
1679 touch $DIR/$tdir/guard0
# 16 composite (PFL) files with their first component on OST index 0, then
# the second guard file.
1681 for ((i = 0; i < 16; i++)); do
1682 $LFS setstripe -E 512K -S 256K -o 0 -E 2M \
1683 $DIR/$tdir/f_comp${i} ||
1684 error "(0.2) Fail to create $DIR/$tdir/f_comp${i}"
1686 touch $DIR/$tdir/guard1
1688 do_facet ost1 $LCTL set_param fail_loc=0
1690 start_full_debug_logging
1692 # exhaust other pre-created dangling cases
1693 count=$(precreated_ost_obj_count 0 0)
1694 createmany -o $DIR/$tdir/a $count ||
1695 error "(0.5) Fail to create $count files."
1697 echo "'ls' should fail because of dangling referenced MDT-object"
1698 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
# First pass: '-r' only.
1700 echo "Trigger layout LFSCK to find out dangling reference"
1701 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1703 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1704 mdd.${MDT_DEV}.lfsck_layout |
1705 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1707 error "(3) unexpected status"
1710 local repaired=$($SHOW_LAYOUT |
1711 awk '/^repaired_dangling/ { print $2 }')
1712 [ $repaired -ge 32 ] ||
1713 error "(4) Fail to repair dangling reference: $repaired"
1715 echo "'stat' should fail because of not repair dangling by default"
1716 stat $DIR/$tdir/guard0 > /dev/null 2>&1 &&
1717 error "(5.1) stat should fail"
1718 stat $DIR/$tdir/guard1 > /dev/null 2>&1 &&
1719 error "(5.2) stat should fail"
# Second pass: '-r -c'; afterwards the guard files must be stat-able.
1721 echo "Trigger layout LFSCK to repair dangling reference"
1722 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1724 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1725 mdd.${MDT_DEV}.lfsck_layout |
1726 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1728 error "(7) unexpected status"
1731 # There may be some async LFSCK updates in processing, wait for
1732 # a while until the target reparation has been done. LU-4970.
1734 echo "'stat' should success after layout LFSCK repairing"
# Poll until stat reports Size 0 for each guard file.
1735 wait_update_facet client "stat $DIR/$tdir/guard0 |
1736 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1737 stat $DIR/$tdir/guard0
1739 error "(8.1) unexpected size"
1742 wait_update_facet client "stat $DIR/$tdir/guard1 |
1743 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1744 stat $DIR/$tdir/guard1
1746 error "(8.2) unexpected size"
1749 repaired=$($SHOW_LAYOUT |
1750 awk '/^repaired_dangling/ { print $2 }')
1751 [ $repaired -ge 32 ] ||
1752 error "(9) Fail to repair dangling reference: $repaired"
1754 stop_full_debug_logging
1756 echo "stopall to cleanup object cache"
1759 setupall > /dev/null
1761 run_test 14a "LFSCK can repair MDT-object with dangling LOV EA reference (1)"
# Test 14b: layout LFSCK handles MDT-objects with dangling LOV EA references
# (with the delay-create option, '-d').
# Creates files while fail_loc 0x1610 is set on ost1, runs the layout LFSCK
# once with '-r -o -d' (stat of the guard file must still fail afterwards)
# and once with '-r -o -c -d' (stat must then succeed with size 0).
# NOTE(review): the function header/footer and some short lines (stopall,
# block closers) are not visible in this listing.
1764 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1765 skip "MDS older than 2.5.55, LU-3590"
1768 echo "The OST-object referenced by the MDT-object should be there;"
1769 echo "otherwise, the LFSCK should re-create the missing OST-object."
1770 echo "with '--delay-create-ostobj' option."
1773 check_mount_and_prep
1774 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1776 echo "Inject failure stub to simulate dangling referenced MDT-object"
1777 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1778 do_facet ost1 $LCTL set_param fail_loc=0x1610
# Create 31 more files than the count reported by precreated_ost_obj_count,
# plus the guard file.
1779 local count=$(precreated_ost_obj_count 0 0)
1781 createmany -o $DIR/$tdir/f $((count + 31))
1782 touch $DIR/$tdir/guard
1783 do_facet ost1 $LCTL set_param fail_loc=0
1785 start_full_debug_logging
1787 # exhaust other pre-created dangling cases
1788 count=$(precreated_ost_obj_count 0 0)
1789 createmany -o $DIR/$tdir/a $count ||
1790 error "(0) Fail to create $count files."
1792 echo "'ls' should fail because of dangling referenced MDT-object"
1793 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
# First pass: without '-c'.
1795 echo "Trigger layout LFSCK to find out dangling reference"
1796 $START_LAYOUT -r -o -d || error "(2) Fail to start LFSCK for layout!"
1798 wait_all_targets_blocked layout completed 3
1800 local repaired=$($SHOW_LAYOUT |
1801 awk '/^repaired_dangling/ { print $2 }')
1802 [ $repaired -ge 32 ] ||
1803 error "(4) Fail to repair dangling reference: $repaired"
1805 echo "'stat' should fail because of not repair dangling by default"
1806 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
# Second pass: with '-c'; afterwards the guard file must be stat-able.
1808 echo "Trigger layout LFSCK to repair dangling reference"
1809 $START_LAYOUT -r -o -c -d || error "(6) Fail to start LFSCK for layout!"
1811 wait_all_targets_blocked layout completed 7
1813 # There may be some async LFSCK updates in processing, wait for
1814 # a while until the target reparation has been done. LU-4970.
1816 echo "'stat' should success after layout LFSCK repairing"
# Poll until stat reports Size 0 for the guard file.
1817 wait_update_facet client "stat $DIR/$tdir/guard |
1818 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1819 stat $DIR/$tdir/guard
1821 error "(8) unexpected size"
1824 repaired=$($SHOW_LAYOUT |
1825 awk '/^repaired_dangling/ { print $2 }')
1826 [ $repaired -ge 32 ] ||
1827 error "(9) Fail to repair dangling reference: $repaired"
1829 stop_full_debug_logging
1831 echo "stopall to cleanup object cache"
1834 setupall > /dev/null
1836 run_test 14b "LFSCK can repair MDT-object with dangling LOV EA reference (2)"
# Test 15a: layout LFSCK repairs OST-objects whose back pointer refers to a
# non-existent MDT-object.
# Writes a plain file and a PFL file while fail_loc 0x1611 is set on all OST
# nodes, runs the layout LFSCK and expects repaired_unmatched_pair >= 3.
# NOTE(review): the function header/footer and one continuation line of the
# setstripe command are not visible in this listing.
1839 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1840 skip "MDS older than 2.5.55, LU-3591"
1843 echo "If the OST-object referenced by the MDT-object back points"
1844 echo "to some non-exist MDT-object, then the LFSCK should repair"
1845 echo "the OST-object to back point to the right MDT-object."
1848 check_mount_and_prep
1849 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1851 echo "Inject failure stub to make the OST-object to back point to"
1852 echo "non-exist MDT-object."
1853 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
1855 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
1856 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1857 $LFS setstripe -E 1M -S 256K -c 1 -E -1 -S 512K -c $OSTCOUNT \
1859 error "(0) Fail to create PFL $DIR/$tdir/f1"
1860 # 'dd' will trigger punch RPC firstly on every OST-objects.
1861 # So even though some OST-object will not be write by 'dd',
1862 # as long as it is allocated (may be NOT allocated in pfl_3b)
1863 # its layout information will be set also.
# 257 * 4K is one block past 1M, i.e. past the end of the first component.
1864 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4K count=257
1865 cancel_lru_locks osc
1866 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1868 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1869 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1871 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1872 mdd.${MDT_DEV}.lfsck_layout |
1873 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1875 error "(2) unexpected status"
# At least three unmatched pairs must have been repaired.
1878 local repaired=$($SHOW_LAYOUT |
1879 awk '/^repaired_unmatched_pair/ { print $2 }')
1880 [ $repaired -ge 3 ] ||
1881 error "(3) Fail to repair unmatched pair: $repaired"
1883 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# Test 15b: layout LFSCK repairs OST-objects whose back pointer refers to
# another MDT-object that does not recognize them.
# Writes a guard file first, then a plain file and a PFL file while fail_loc
# 0x1612 is set on all OST nodes, runs the layout LFSCK and expects
# repaired_unmatched_pair to be exactly 4.
# NOTE(review): the function header/footer, the initial assignment of
# 'stripes' and one continuation line of the setstripe command are not
# visible in this listing.
1886 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1887 skip "MDS older than 2.5.55, LU-3591"
1890 echo "If the OST-object referenced by the MDT-object back points"
1891 echo "to other MDT-object that doesn't recognize the OST-object,"
1892 echo "then the LFSCK should repair it to back point to the right"
1893 echo "MDT-object (the first one)."
1896 check_mount_and_prep
1897 mkdir -p $DIR/$tdir/0
1898 $LFS setstripe -c 1 -i 0 $DIR/$tdir/0
# The guard file is written before the fail_loc is set.
1899 dd if=/dev/zero of=$DIR/$tdir/0/guard bs=1M count=1
1900 cancel_lru_locks osc
1902 echo "Inject failure stub to make the OST-object to back point to"
1903 echo "other MDT-object"
# Use two stripes for the first PFL component when at least two OSTs exist.
1906 [ $OSTCOUNT -ge 2 ] && stripes=2
1908 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1909 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1612
1910 dd if=/dev/zero of=$DIR/$tdir/0/f0 bs=1M count=1
1911 $LFS setstripe -E 1M -S 256K -c $stripes -E 2M -S 512K -c 1 \
1913 error "(0) Fail to create PFL $DIR/$tdir/f1"
1914 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2
1915 cancel_lru_locks osc
1916 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1918 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1919 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1921 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1922 mdd.${MDT_DEV}.lfsck_layout |
1923 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1925 error "(2) unexpected status"
1928 local repaired=$($SHOW_LAYOUT |
1929 awk '/^repaired_unmatched_pair/ { print $2 }')
1930 [ $repaired -eq 4 ] ||
1931 error "(3) Fail to repair unmatched pair: $repaired"
1933 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# Test 15c: layout LFSCK racing with a delayed directory migration.
# Starts a migration of a1 to MDT index 0 in the background while fail_loc
# 0x1803 is set on mds2, runs the layout LFSCK on all targets and expects
# one repaired unmatched pair and no repaired multiple references.
# NOTE(review): the function header/footer and some short lines after the
# background migrate are not visible in this listing.
#
# Runs only for MDS versions strictly between 2.5.55 and 2.7.55, with at
# least two MDTs.
1936 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
1937 (( $MDS1_VERSION < $(version_code 2.7.55) )) ||
1938 skip "MDS newer than 2.7.55, LU-6475"
1939 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1940 skip "MDS older than 2.5.55, LU-3591"
1943 echo "According to current metadata migration implementation,"
1944 echo "before the old MDT-object is removed, both the new MDT-object"
1945 echo "and old MDT-object will reference the same LOV layout. Then if"
1946 echo "the layout LFSCK finds the new MDT-object by race, it will"
1947 echo "regard related OST-object(s) as multiple referenced case, and"
1948 echo "will try to create new OST-object(s) for the new MDT-object."
1949 echo "To avoid such trouble, the layout LFSCK needs to lock the old"
1950 echo "MDT-object before confirm the multiple referenced case."
1953 check_mount_and_prep
# Directory a1 is created on MDT index 1 and holds one 1M file on OST 0.
1954 $LFS mkdir -i 1 $DIR/$tdir/a1
1955 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1956 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=1
1957 cancel_lru_locks osc
1959 echo "Inject failure stub on MDT1 to delay the migration"
1961 #define OBD_FAIL_MIGRATE_DELAY 0x1803
1962 do_facet mds2 $LCTL set_param fail_val=5 fail_loc=0x1803
1963 echo "Migrate $DIR/$tdir/a1 from MDT1 to MDT0 with delay"
# The migrate runs in the background so that the LFSCK below overlaps it.
1964 $LFS migrate -m 0 $DIR/$tdir/a1 &
1967 echo "Trigger layout LFSCK to race with the migration"
1968 $START_LAYOUT -A -r || error "(1) Fail to start layout LFSCK!"
1970 wait_all_targets_blocked layout completed 2
1972 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
1973 local repaired=$($SHOW_LAYOUT |
1974 awk '/^repaired_unmatched_pair/ { print $2 }')
1975 [ $repaired -eq 1 ] ||
1976 error "(3) Fail to repair unmatched pair: $repaired"
# The shared layout must not have been treated as a multiple reference.
1978 repaired=$($SHOW_LAYOUT |
1979 awk '/^repaired_multiple_referenced/ { print $2 }')
1980 [ $repaired -eq 0 ] ||
1981 error "(4) Unexpectedly repaird multiple references: $repaired"
1983 run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)"
# Test 16: layout LFSCK repairs an OST-object owner that differs from the
# owner recorded on the MDT-object.
# Changes the owner of f0 while fail_loc 0x1613 is set on the MDS, runs the
# layout LFSCK and expects repaired_inconsistent_owner to be exactly 1.
# NOTE(review): the function header/footer and some short lines (e.g. the
# creation of d1) are not visible in this listing.
1986 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1987 skip "MDS older than 2.5.55, LU-3594"
1990 echo "If the OST-object's owner information does not match the owner"
1991 echo "information stored in the MDT-object, then the LFSCK trust the"
1992 echo "MDT-object and update the OST-object's owner information."
1995 check_mount_and_prep
1996 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1997 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1998 cancel_lru_locks osc
2000 # created but no setattr or write to the file.
# 100 files are created as $RUNAS under d1; the repaired count checked at
# the end is exactly 1, so these are not expected to be counted as repairs.
2002 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/d1
2003 $RUNAS createmany -o $DIR/$tdir/d1/o 100 || error "create failed"
2005 echo "Inject failure stub to skip OST-object owner changing"
2006 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
2007 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
# The chown of f0 happens while the fail_loc is set.
2008 chown 1.1 $DIR/$tdir/f0
2009 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2011 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
2014 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
2016 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2017 mdd.${MDT_DEV}.lfsck_layout |
2018 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2020 error "(2) unexpected status"
2023 local repaired=$($SHOW_LAYOUT |
2024 awk '/^repaired_inconsistent_owner/ { print $2 }')
2025 [ $repaired -eq 1 ] ||
2026 error "(3) Fail to repair inconsistent owner: $repaired"
2028 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# Test 17: layout LFSCK repairs OST-objects referenced by more than one
# MDT-object.
# Creates guard, f0 and the PFL file f1 while fail_loc 0x1614 is set on the
# MDS, verifies that f0 and f1 report guard's size, runs the layout LFSCK
# (expects exactly 2 repairs) and then checks that writing to f0 and f1 no
# longer changes guard's size.
# NOTE(review): the function header/footer and one continuation line of the
# setstripe command are not visible in this listing.
2031 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2032 skip "MDS older than 2.5.55, LU-3594"
2035 echo "If more than one MDT-objects reference the same OST-object,"
2036 echo "and the OST-object only recognizes one MDT-object, then the"
2037 echo "LFSCK should create new OST-objects for such non-recognized"
2041 check_mount_and_prep
2042 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2044 echo "Inject failure stub to make two MDT-objects to refernce"
2045 echo "the OST-object"
2047 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
2048 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
# guard receives 1M of data; client locks are dropped after each creation
# step below.
2049 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
2050 cancel_lru_locks mdc
2051 cancel_lru_locks osc
2053 createmany -o $DIR/$tdir/f 1
2054 cancel_lru_locks mdc
2055 cancel_lru_locks osc
2057 $LFS setstripe -E 2M -S 256K -o 0 -E 4M -S 512K -o 0 \
2059 error "(0) Fail to create PFL $DIR/$tdir/f1"
2060 cancel_lru_locks mdc
2061 cancel_lru_locks osc
2062 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
2064 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
2065 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard use the same OST-objects"
# Nothing was written to f0 or f1, yet both are expected to show 1048576
# bytes, the amount written to guard.
2066 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
2067 [ $size -eq 1048576 ] ||
2068 error "(1.1) f0 (wrong) size should be 1048576, but got $size"
2070 size=$(ls -l $DIR/$tdir/f1 | awk '{ print $5 }')
2071 [ $size -eq 1048576 ] ||
2072 error "(1.2) f1 (wrong) size should be 1048576, but got $size"
2074 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
2077 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
2079 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2080 mdd.${MDT_DEV}.lfsck_layout |
2081 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2083 error "(3) unexpected status"
# One repair is expected for each of f0 and f1.
2086 local repaired=$($SHOW_LAYOUT |
2087 awk '/^repaired_multiple_referenced/ { print $2 }')
2088 [ $repaired -eq 2 ] ||
2089 error "(4) Fail to repair multiple references: $repaired"
# After the repair, writing 2M to f0 / f1 must leave guard at 1048576 bytes.
2091 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
2092 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
2093 error "(5) Fail to write f0."
2094 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
2095 [ $size -eq 1048576 ] ||
2096 error "(6) guard size should be 1048576, but got $size"
2098 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard should use diff OST-objects"
2099 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2 ||
2100 error "(7) Fail to write f1."
2101 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
2102 [ $size -eq 1048576 ] ||
2103 error "(8) guard size should be 1048576, but got $size"
2105 run_test 17 "LFSCK can repair multiple references"
# Add the "cache" flag to the debug parameter; lctl is invoked directly (not
# through do_facet) and its standard output is discarded.
2107 $LCTL set_param debug=+cache > /dev/null
# Body of the test registered below as 18a ("Find out orphan OST-object and
# repair it (1)"). It writes a1/f1, a PFL file f3 and (when MDSCOUNT >= 2)
# a2/f2, chowns them while fail_loc=0x1615 is set, checks that the reported
# sizes no longer match the saved ones, runs the layout LFSCK with "-r -o",
# and finally checks the repaired_orphan counters and that the sizes match
# the saved values again.
# NOTE(review): the embedded line numbers skip values, so some original
# lines (function header, fi/done, closing brace) are not shown here.
2110 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2111 skip "MDS older than 2.5.55, LU-3336"
2114 echo "The target MDT-object is there, but related stripe information"
2115 echo "is lost or partly lost. The LFSCK should regenerate the missing"
2116 echo "layout EA entries."
2119 check_mount_and_prep
2120 $LFS mkdir -i 0 $DIR/$tdir/a1
2121 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2122 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# "ls -il" prints the inode number first, so the size is column 6.
2124 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2126 $LFS path2fid $DIR/$tdir/a1/f1
2127 $LFS getstripe $DIR/$tdir/a1/f1
2129 if [ $MDSCOUNT -ge 2 ]; then
2130 $LFS mkdir -i 1 $DIR/$tdir/a2
2131 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2132 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2133 $LFS path2fid $DIR/$tdir/a2/f2
2134 $LFS getstripe $DIR/$tdir/a2/f2
2137 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2138 error "(0) Fail to create PFL $DIR/$tdir/f3"
2140 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2142 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2144 $LFS path2fid $DIR/$tdir/f3
2145 $LFS getstripe $DIR/$tdir/f3
2147 cancel_lru_locks osc
2149 echo "Inject failure, to make the MDT-object lost its layout EA"
2150 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
2151 do_facet mds1 $LCTL set_param fail_loc=0x1615
2152 chown 1.1 $DIR/$tdir/a1/f1
2154 if [ $MDSCOUNT -ge 2 ]; then
2155 do_facet mds2 $LCTL set_param fail_loc=0x1615
2156 chown 1.1 $DIR/$tdir/a2/f2
2159 chown 1.1 $DIR/$tdir/f3
# Clear the fault injection on every MDS that had it set.
2164 do_facet mds1 $LCTL set_param fail_loc=0
2165 if [ $MDSCOUNT -ge 2 ]; then
2166 do_facet mds2 $LCTL set_param fail_loc=0
2169 cancel_lru_locks mdc
2170 cancel_lru_locks osc
2172 echo "The file size should be incorrect since layout EA is lost"
2173 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2174 [ "$cur_size" != "$saved_size1" ] ||
2175 error "(1) Expect incorrect file1 size"
2177 if [ $MDSCOUNT -ge 2 ]; then
# f2 is compared against saved_size1: f1 and f2 were both written with
# bs=1M count=2.
2178 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2179 [ "$cur_size" != "$saved_size1" ] ||
2180 error "(2) Expect incorrect file2 size"
2183 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2184 [ "$cur_size" != "$saved_size2" ] ||
2185 error "(1.2) Expect incorrect file3 size"
2187 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2188 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
2190 for k in $(seq $MDSCOUNT); do
2191 # The LFSCK status query interval is 30 seconds. For the case
2192 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2193 # time to guarantee the status sync up.
2194 wait_update_facet mds${k} "$LCTL get_param -n \
2195 mdd.$(facet_svc mds${k}).lfsck_layout |
2196 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2197 error "(4) MDS${k} is not the expected 'completed'"
# The OSTs are read once, without polling, after every MDS reported
# "completed".
2200 for k in $(seq $OSTCOUNT); do
2201 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2202 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2203 awk '/^status/ { print $2 }')
2204 [ "$cur_status" == "completed" ] ||
2205 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2208 local repaired=$(do_facet mds1 $LCTL get_param -n \
2209 mdd.$(facet_svc mds1).lfsck_layout |
2210 awk '/^repaired_orphan/ { print $2 }')
2211 [ $repaired -eq 3 ] ||
2212 error "(6.1) Expect 3 fixed on mds1, but got: $repaired"
2214 if [ $MDSCOUNT -ge 2 ]; then
2215 repaired=$(do_facet mds2 $LCTL get_param -n \
2216 mdd.$(facet_svc mds2).lfsck_layout |
2217 awk '/^repaired_orphan/ { print $2 }')
2218 [ $repaired -eq 2 ] ||
2219 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
2222 $LFS path2fid $DIR/$tdir/a1/f1
2223 $LFS getstripe $DIR/$tdir/a1/f1
2225 if [ $MDSCOUNT -ge 2 ]; then
2226 $LFS path2fid $DIR/$tdir/a2/f2
2227 $LFS getstripe $DIR/$tdir/a2/f2
2230 $LFS path2fid $DIR/$tdir/f3
2231 $LFS getstripe $DIR/$tdir/f3
2233 echo "The file size should be correct after layout LFSCK scanning"
2234 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2235 [ "$cur_size" == "$saved_size1" ] ||
2236 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2238 if [ $MDSCOUNT -ge 2 ]; then
2239 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2240 [ "$cur_size" == "$saved_size1" ] ||
2241 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# This check is on f3, so the failure message names file3.
2244 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2245 [ "$cur_size" == "$saved_size2" ] ||
2246 error "(9) Expect file3 size $saved_size2, but got $cur_size"
2248 run_test 18a "Find out orphan OST-object and repair it (1)"
# Body of the test registered below as 18b ("Find out orphan OST-object and
# repair it (2)"). It writes a1/f1, a PFL file f3 and (when MDSCOUNT >= 2)
# a2/f2, records their FIDs and sizes, removes f1 (and f2) while
# fail_loc=0x1616 is set, runs a --dryrun layout LFSCK followed by a real
# one, then moves the "<fid>-R-0" entries out of .lustre/lost+found back
# into the test directory and checks that the sizes match the saved ones.
# NOTE(review): the embedded line numbers skip values, so some original
# lines (function header, fi/done, short statements) are not shown here.
2251 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2252 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2253 skip "MDS older than 2.5.55, LU-3336"
2256 echo "The target MDT-object is lost. The LFSCK should re-create the"
2257 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
2258 echo "can move it back to normal namespace manually."
2261 check_mount_and_prep
2262 $LFS mkdir -i 0 $DIR/$tdir/a1
2263 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2264 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# "ls -il" prints the inode number first, so the size is column 6.
2265 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
# The FID is saved because the recovered entry is later looked up by name
# as "${fid1}-R-0".
2266 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2268 $LFS getstripe $DIR/$tdir/a1/f1
2270 if [ $MDSCOUNT -ge 2 ]; then
2271 $LFS mkdir -i 1 $DIR/$tdir/a2
2272 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2273 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2274 fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
2276 $LFS getstripe $DIR/$tdir/a2/f2
2279 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2280 error "(0) Fail to create PFL $DIR/$tdir/f3"
2282 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2284 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2285 local fid3=$($LFS path2fid $DIR/$tdir/f3)
2287 $LFS getstripe $DIR/$tdir/f3
2289 cancel_lru_locks osc
2291 echo "Inject failure, to simulate the case of missing the MDT-object"
2292 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2293 do_facet mds1 $LCTL set_param fail_loc=0x1616
2294 rm -f $DIR/$tdir/a1/f1
2296 if [ $MDSCOUNT -ge 2 ]; then
2297 do_facet mds2 $LCTL set_param fail_loc=0x1616
2298 rm -f $DIR/$tdir/a2/f2
# Clear the fault injection on every MDS that had it set.
2306 do_facet mds1 $LCTL set_param fail_loc=0
2307 if [ $MDSCOUNT -ge 2 ]; then
2308 do_facet mds2 $LCTL set_param fail_loc=0
2311 cancel_lru_locks mdc
2312 cancel_lru_locks osc
2314 # dryrun mode only check orphans, not repair
2315 echo "Trigger layout LFSCK --dryrun to find out orphan OST-object"
2316 $START_LAYOUT --dryrun -o -r ||
2317 error "Fail to start layout LFSCK in dryrun mode"
2318 wait_all_targets_blocked layout completed 2
# The reported parameter list must be exactly this comma-separated string.
2320 local PARAMS=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
2321 [ "$PARAMS" == "dryrun,all_targets,orphan" ] ||
2322 error "Expect 'dryrun,all_targets,orphan', got '$PARAMS'"
2324 local orphans=$(do_facet mds1 $LCTL get_param -n \
2325 mdd.$(facet_svc mds1).lfsck_layout |
2326 awk '/^inconsistent_orphan/ { print $2 }')
2327 [ $orphans -eq 3 ] ||
2328 error "Expect 3 found on mds1, but got: $orphans"
2330 # orphan parents should not be created
# Every entry directly under lost+found must have an empty "ls -A" listing.
2332 for subdir in $MOUNT/.lustre/lost+found/*; do
2333 [ ! "$(ls -A $subdir)" ] || error "$subdir not empty"
2336 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2337 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2339 for k in $(seq $MDSCOUNT); do
2340 # The LFSCK status query interval is 30 seconds. For the case
2341 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2342 # time to guarantee the status sync up.
2343 wait_update_facet mds${k} "$LCTL get_param -n \
2344 mdd.$(facet_svc mds${k}).lfsck_layout |
2345 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2346 error "(2) MDS${k} is not the expected 'completed'"
2349 for k in $(seq $OSTCOUNT); do
2350 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2351 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2352 awk '/^status/ { print $2 }')
2353 [ "$cur_status" == "completed" ] ||
2354 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
2357 local repaired=$(do_facet mds1 $LCTL get_param -n \
2358 mdd.$(facet_svc mds1).lfsck_layout |
2359 awk '/^repaired_orphan/ { print $2 }')
2360 [ $repaired -eq 3 ] ||
2361 error "(4.1) Expect 3 fixed on mds1, but got: $repaired"
2363 if [ $MDSCOUNT -ge 2 ]; then
2364 repaired=$(do_facet mds2 $LCTL get_param -n \
2365 mdd.$(facet_svc mds2).lfsck_layout |
2366 awk '/^repaired_orphan/ { print $2 }')
2367 [ $repaired -eq 2 ] ||
2368 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
2371 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2372 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2373 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2375 if [ $MDSCOUNT -ge 2 ]; then
2376 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
2377 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
# Distinct label so an f3 failure is not confused with the f1 move above.
2380 mv $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0 $DIR/$tdir/f3 ||
2381 error "(5.2) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
2383 $LFS path2fid $DIR/$tdir/a1/f1
2384 $LFS getstripe $DIR/$tdir/a1/f1
2386 if [ $MDSCOUNT -ge 2 ]; then
2387 $LFS path2fid $DIR/$tdir/a2/f2
2388 $LFS getstripe $DIR/$tdir/a2/f2
2391 $LFS path2fid $DIR/$tdir/f3
2392 $LFS getstripe $DIR/$tdir/f3
2394 echo "The file size should be correct after layout LFSCK scanning"
2395 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2396 [ "$cur_size" == "$saved_size1" ] ||
2397 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2399 if [ $MDSCOUNT -ge 2 ]; then
2400 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2401 [ "$cur_size" == "$saved_size1" ] ||
2402 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# This check is on f3, so the failure message names file3.
2405 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2406 [ "$cur_size" == "$saved_size2" ] ||
2407 error "(9) Expect file3 size $saved_size2, but got $cur_size"
2409 run_test 18b "Find out orphan OST-object and repair it (2)"
# Body of the test registered below as 18c ("Find out orphan OST-object and
# repair it (3)"). Files are written while fail_loc=0x1617 is set on all OST
# nodes, then a1/f1 (and a2/f2 when MDSCOUNT >= 2) are removed while
# fail_loc=0x1616 is set on the MDS. After the layout LFSCK completes it
# checks the repaired_orphan counters and looks for "*-N-*" entries under
# .lustre/lost+found/MDT0000 (expected) and MDT0001 (not expected).
# NOTE(review): the embedded line numbers skip values, so some original
# lines (function header, fi/done, the assignment of $expected) are not
# shown here.
2412 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2413 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2414 skip "MDS older than 2.5.55, LU-3336"
2417 echo "The target MDT-object is lost, and the OST-object FID is missing."
2418 echo "The LFSCK should re-create the MDT-object with new FID under the "
2419 echo "directory .lustre/lost+found/MDTxxxx."
2422 check_mount_and_prep
2423 $LFS mkdir -i 0 $DIR/$tdir/a1
2424 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2426 echo "Inject failure, to simulate the case of missing parent FID"
2427 #define OBD_FAIL_LFSCK_NOPFID 0x1617
# This fault is set on the OST nodes and stays active for all the writes
# below.
2428 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1617
2430 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2431 $LFS getstripe $DIR/$tdir/a1/f1
2433 if [ $MDSCOUNT -ge 2 ]; then
2434 $LFS mkdir -i 1 $DIR/$tdir/a2
2435 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a2
2436 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2437 $LFS getstripe $DIR/$tdir/a2/f2
2440 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2441 error "(0) Fail to create PFL $DIR/$tdir/f3"
2443 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2444 $LFS getstripe $DIR/$tdir/f3
2446 cancel_lru_locks osc
2447 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
2449 echo "Inject failure, to simulate the case of missing the MDT-object"
2450 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2451 do_facet mds1 $LCTL set_param fail_loc=0x1616
2452 rm -f $DIR/$tdir/a1/f1
2454 if [ $MDSCOUNT -ge 2 ]; then
2455 do_facet mds2 $LCTL set_param fail_loc=0x1616
2456 rm -f $DIR/$tdir/a2/f2
# Clear the fault injection on every MDS that had it set.
2464 do_facet mds1 $LCTL set_param fail_loc=0
2465 if [ $MDSCOUNT -ge 2 ]; then
2466 do_facet mds2 $LCTL set_param fail_loc=0
2469 cancel_lru_locks mdc
2470 cancel_lru_locks osc
2472 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2473 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2475 for k in $(seq $MDSCOUNT); do
2476 # The LFSCK status query interval is 30 seconds. For the case
2477 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2478 # time to guarantee the status sync up.
2479 wait_update_facet mds${k} "$LCTL get_param -n \
2480 mdd.$(facet_svc mds${k}).lfsck_layout |
2481 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2482 error "(2) MDS${k} is not the expected 'completed'"
2485 for k in $(seq $OSTCOUNT); do
2486 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2487 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2488 awk '/^status/ { print $2 }')
2489 [ "$cur_status" == "completed" ] ||
2490 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
2493 if [ $MDSCOUNT -ge 2 ]; then
2499 local repaired=$(do_facet mds1 $LCTL get_param -n \
2500 mdd.$(facet_svc mds1).lfsck_layout |
2501 awk '/^repaired_orphan/ { print $2 }')
2502 [ $repaired -eq $expected ] ||
2503 error "(4) Expect $expected fixed on mds1, but got: $repaired"
2505 if [ $MDSCOUNT -ge 2 ]; then
2506 repaired=$(do_facet mds2 $LCTL get_param -n \
2507 mdd.$(facet_svc mds2).lfsck_layout |
2508 awk '/^repaired_orphan/ { print $2 }')
2509 [ $repaired -eq 0 ] ||
2510 error "(5) Expect 0 fixed on mds2, but got: $repaired"
2513 ls -ail $MOUNT/.lustre/lost+found/
2515 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
2516 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
# The -name pattern is quoted so find receives the glob itself; unquoted,
# the shell would expand it against the current directory first.
2517 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-N-*')
2519 error "(6) .lustre/lost+found/MDT0001/ should be empty"
2522 echo "There should be some stub under .lustre/lost+found/MDT0000/"
2523 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2524 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
2526 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-N-*')
2527 [ ! -z "$cname" ] ||
2528 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2530 run_test 18c "Find out orphan OST-object and repair it (3)"
# Body of the test registered below as 18d ("Find out orphan OST-object and
# repair it (4)"). It creates a1/f1..f4 (f4 as a PFL file), chowns f2/f4 and
# removes f1/f3 while fail_loc=0x1618 is set on $SINGLEMDS, calls setupall,
# then runs the layout LFSCK with "-r -o -c -d". It expects repaired_orphan
# to be 2, repaired_dangling to be 0, and the sizes of f2/f4 to equal the
# values saved before the fault injection.
# NOTE(review): the embedded line numbers skip values, so some original
# lines (function header, done, short statements) are not shown here.
2533 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2534 skip "MDS older than 2.5.55, LU-3336"
2537 echo "The target MDT-object layout EA is corrupted, but the right"
2538 echo "OST-object is still alive as orphan. The layout LFSCK will"
2539 echo "not create new OST-object to occupy such slot."
2542 check_mount_and_prep
2544 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2545 echo "guard" > $DIR/$tdir/a1/f1
2546 echo "foo" > $DIR/$tdir/a1/f2
2548 echo "guard" > $DIR/$tdir/a1/f3
2549 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2550 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2551 echo "foo" > $DIR/$tdir/a1/f4
# "ls -il" prints the inode number first, so the size is column 6.
2553 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2554 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2555 $LFS path2fid $DIR/$tdir/a1/f1
2556 $LFS getstripe $DIR/$tdir/a1/f1
2557 $LFS path2fid $DIR/$tdir/a1/f2
2558 $LFS getstripe $DIR/$tdir/a1/f2
2559 $LFS path2fid $DIR/$tdir/a1/f3
2560 $LFS getstripe $DIR/$tdir/a1/f3
2561 $LFS path2fid $DIR/$tdir/a1/f4
2562 $LFS getstripe $DIR/$tdir/a1/f4
2563 cancel_lru_locks osc
2565 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2566 echo "to reference the same OST-object (which is f1's OST-obejct)."
2567 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2568 echo "dangling reference case, but f2's old OST-object is there."
2570 echo "The failure also makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2571 echo "to reference the same OST-object (which is f3's OST-obejct)."
2572 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2573 echo "dangling reference case, but f4's old OST-object is there."
2576 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2577 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2578 chown 1.1 $DIR/$tdir/a1/f2
2579 chown 1.1 $DIR/$tdir/a1/f4
2580 rm -f $DIR/$tdir/a1/f1
2581 rm -f $DIR/$tdir/a1/f3
2584 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2586 echo "stopall to cleanup object cache"
2589 setupall > /dev/null
2591 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2592 $START_LAYOUT -r -o -c -d || error "(2) Fail to start LFSCK for layout!"
2594 for k in $(seq $MDSCOUNT); do
2595 # The LFSCK status query interval is 30 seconds. For the case
2596 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2597 # time to guarantee the status sync up.
2598 wait_update_facet mds${k} "$LCTL get_param -n \
2599 mdd.$(facet_svc mds${k}).lfsck_layout |
2600 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2601 error "(3) MDS${k} is not the expected 'completed'"
2604 for k in $(seq $OSTCOUNT); do
2605 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2606 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2607 awk '/^status/ { print $2 }')
2608 [ "$cur_status" == "completed" ] ||
2609 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
2612 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2613 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2614 awk '/^repaired_orphan/ { print $2 }')
2615 [ $repaired -eq 2 ] ||
2616 error "(5) Expect 2 orphans have been fixed, but got: $repaired"
# No dangling-reference repair is expected in this scenario.
2618 repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2619 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2620 awk '/^repaired_dangling/ { print $2 }')
2621 [ $repaired -eq 0 ] ||
2622 error "(6) Expect 0 dangling has been fixed, but got: $repaired"
2624 echo "The file size should be correct after layout LFSCK scanning"
2625 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2626 [ "$cur_size" == "$saved_size1" ] ||
2627 error "(7) Expect file2 size $saved_size1, but got $cur_size"
2629 cur_size=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2630 [ "$cur_size" == "$saved_size2" ] ||
2631 error "(8) Expect file4 size $saved_size2, but got $cur_size"
# The remaining commands only print file content, FID and layout to the
# log; nothing is asserted on their output.
2633 echo "The LFSCK should find back the original data."
2634 cat $DIR/$tdir/a1/f2
2635 $LFS path2fid $DIR/$tdir/a1/f2
2636 $LFS getstripe $DIR/$tdir/a1/f2
2637 cat $DIR/$tdir/a1/f4
2638 $LFS path2fid $DIR/$tdir/a1/f4
2639 $LFS getstripe $DIR/$tdir/a1/f4
2641 run_test 18d "Find out orphan OST-object and repair it (4)"
# Body of the test registered below as 18e ("Find out orphan OST-object and
# repair it (5)"). The setup matches 18d (f1..f4 under a1, fail_loc=0x1618
# during chown/rm, setupall). The layout LFSCK is then started with
# fail_val=10 fail_loc=0x1602 set; once mds1 reports "scanning-phase2" the
# test appends data to f2 and f4, clears the fault and waits for completion.
# It expects repaired_orphan to be 2 and exactly two "*-C-*" entries under
# .lustre/lost+found/MDT0000 whose sizes match one of the saved sizes.
# NOTE(review): the embedded line numbers skip values, so some original
# lines (function header, fi/done, short statements) are not shown here.
2644 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2645 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2646 skip "MDS older than 2.5.55, LU-3336"
2649 echo "The target MDT-object layout EA slot is occpuied by some new"
2650 echo "created OST-object when repair dangling reference case. Such"
2651 echo "conflict OST-object has been modified by others. To keep the"
2652 echo "new data, the LFSCK will create a new file to refernece this"
2653 echo "old orphan OST-object."
2656 check_mount_and_prep
2658 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2659 echo "guard" > $DIR/$tdir/a1/f1
2660 echo "foo" > $DIR/$tdir/a1/f2
2662 echo "guard" > $DIR/$tdir/a1/f3
2663 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2664 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2665 echo "foo" > $DIR/$tdir/a1/f4
# "ls -il" prints the inode number first, so the size is column 6.
2667 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2668 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2670 $LFS path2fid $DIR/$tdir/a1/f1
2671 $LFS getstripe $DIR/$tdir/a1/f1
2672 $LFS path2fid $DIR/$tdir/a1/f2
2673 $LFS getstripe $DIR/$tdir/a1/f2
2674 $LFS path2fid $DIR/$tdir/a1/f3
2675 $LFS getstripe $DIR/$tdir/a1/f3
2676 $LFS path2fid $DIR/$tdir/a1/f4
2677 $LFS getstripe $DIR/$tdir/a1/f4
2678 cancel_lru_locks osc
2680 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2681 echo "to reference the same OST-object (which is f1's OST-obejct)."
2682 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2683 echo "dangling reference case, but f2's old OST-object is there."
2685 echo "Also the failure makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2686 echo "to reference the same OST-object (which is f3's OST-obejct)."
2687 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2688 echo "dangling reference case, but f4's old OST-object is there."
2691 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2692 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2693 chown 1.1 $DIR/$tdir/a1/f2
2694 chown 1.1 $DIR/$tdir/a1/f4
2695 rm -f $DIR/$tdir/a1/f1
2696 rm -f $DIR/$tdir/a1/f3
2699 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2701 echo "stopall to cleanup object cache"
2704 setupall > /dev/null
2706 #define OBD_FAIL_LFSCK_DELAY3 0x1602
# NOTE(review): fail_val=10 is presumably the delay this fault applies -
# confirm against the OBD_FAIL_LFSCK_DELAY3 handler.
2707 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
2709 start_full_debug_logging
2711 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2712 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
# Wait for the second scanning phase before modifying f2/f4 below.
2714 wait_update_facet mds1 "$LCTL get_param -n \
2715 mdd.$(facet_svc mds1).lfsck_layout |
2716 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2717 error "(3) MDS1 is not the expected 'scanning-phase2'"
2719 # to guarantee all updates are synced.
2723 echo "Write new data to f2/f4 to modify the new created OST-object."
2724 echo "dummy" >> $DIR/$tdir/a1/f2 || error "write a1/f2 failed"
2725 echo "dummy" >> $DIR/$tdir/a1/f4 || error "write a1/f4 failed"
2727 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
2729 for k in $(seq $MDSCOUNT); do
2730 # The LFSCK status query interval is 30 seconds. For the case
2731 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2732 # time to guarantee the status sync up.
2733 wait_update_facet mds${k} "$LCTL get_param -n \
2734 mdd.$(facet_svc mds${k}).lfsck_layout |
2735 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2736 error "(4) MDS${k} is not the expected 'completed'"
2739 for k in $(seq $OSTCOUNT); do
2740 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2741 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2742 awk '/^status/ { print $2 }')
2743 [ "$cur_status" == "completed" ] ||
2744 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2747 stop_full_debug_logging
2749 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2750 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2751 awk '/^repaired_orphan/ { print $2 }')
2752 [ $repaired -eq 2 ] ||
2753 error "(6) Expect 2 orphans have been fixed, but got: $repaired"
2755 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2756 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2757 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# Here the glob is meant to be expanded by the shell (one ls line per match).
2759 local count=$(ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-* | wc -l)
2760 if [ $count -ne 2 ]; then
2761 ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-*
2762 error "(8) Expect 2 stubs under lost+found, but got $count"
2765 echo "The stub file should keep the original f2 or f4 data"
# The -name pattern is quoted so find receives the glob itself; unquoted,
# the shell would expand it against the current directory first.
2766 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-C-*' | head -n 1)
2767 local cur_size=$(ls -il $cname | awk '{ print $6 }')
# error fires only when the size matches neither saved size.
2768 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2769 error "(9) Got unexpected $cur_size"
2772 $LFS path2fid $cname
2773 $LFS getstripe $cname
2775 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-C-*' | tail -n 1)
2776 cur_size=$(ls -il $cname | awk '{ print $6 }')
2777 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2778 error "(10) Got unexpected $cur_size"
2781 $LFS path2fid $cname
2782 $LFS getstripe $cname
2784 echo "The f2/f4 should contains new data."
2785 cat $DIR/$tdir/a1/f2
2786 $LFS path2fid $DIR/$tdir/a1/f2
2787 $LFS getstripe $DIR/$tdir/a1/f2
2788 cat $DIR/$tdir/a1/f4
2789 $LFS path2fid $DIR/$tdir/a1/f4
2790 $LFS getstripe $DIR/$tdir/a1/f4
2792 run_test 18e "Find out orphan OST-object and repair it (5)"
# Body of the test registered below as 18f ("Skip the failed OST(s) when
# handle orphan OST-objects"). It is skipped with fewer than 2 OSTs. Files
# are removed while fail_loc=0x1616 is set, then the layout LFSCK is run
# with fail_loc=0x161c fail_val=0 set on mds1: every MDS and ost1 are
# expected to end as "partial", ost2 as "completed", with repaired_orphan 1
# per MDS. After the fault is cleared a second run must end "completed"
# everywhere, with repaired_orphan 2 per MDS.
# NOTE(review): the embedded line numbers skip values, so some original
# lines (function header, fi/done, short statements) are not shown here.
2795 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
2798 echo "The target MDT-object is lost. The LFSCK should re-create the"
2799 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2800 echo "to verify some OST-object(s) during the first stage-scanning,"
2801 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2802 echo "should not be affected."
2805 check_mount_and_prep
2806 $LFS mkdir -i 0 $DIR/$tdir/a1
2807 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2808 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2809 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2810 $LFS mkdir -i 0 $DIR/$tdir/a2
2811 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
2812 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2813 $LFS getstripe $DIR/$tdir/a1/f1
2814 $LFS getstripe $DIR/$tdir/a2/f2
# With a second MDT, the same layout is repeated in a3/a4 created with
# "mkdir -i 1".
2816 if [ $MDSCOUNT -ge 2 ]; then
2817 $LFS mkdir -i 1 $DIR/$tdir/a3
2818 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a3
2819 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2820 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2821 $LFS mkdir -i 1 $DIR/$tdir/a4
2822 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
2823 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2824 $LFS getstripe $DIR/$tdir/a3/f3
2825 $LFS getstripe $DIR/$tdir/a4/f4
2828 cancel_lru_locks osc
2830 echo "Inject failure, to simulate the case of missing the MDT-object"
2831 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2832 do_facet mds1 $LCTL set_param fail_loc=0x1616
2833 rm -f $DIR/$tdir/a1/f1
2834 rm -f $DIR/$tdir/a2/f2
2836 if [ $MDSCOUNT -ge 2 ]; then
2837 do_facet mds2 $LCTL set_param fail_loc=0x1616
2838 rm -f $DIR/$tdir/a3/f3
2839 rm -f $DIR/$tdir/a4/f4
# Clear the fault injection on every MDS that had it set.
2845 do_facet mds1 $LCTL set_param fail_loc=0
2846 if [ $MDSCOUNT -ge 2 ]; then
2847 do_facet mds2 $LCTL set_param fail_loc=0
2850 cancel_lru_locks mdc
2851 cancel_lru_locks osc
2853 echo "Inject failure, to simulate the OST0 fail to handle"
2854 echo "MDT0 LFSCK request during the first-stage scanning."
2855 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
2856 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
2858 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2859 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2861 for k in $(seq $MDSCOUNT); do
2862 # The LFSCK status query interval is 30 seconds. For the case
2863 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2864 # time to guarantee the status sync up.
2865 wait_update_facet mds${k} "$LCTL get_param -n \
2866 mdd.$(facet_svc mds${k}).lfsck_layout |
2867 awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
2868 error "(2) MDS${k} is not the expected 'partial'"
# ost1 is expected to end as "partial" while ost2 still completes.
2871 wait_update_facet ost1 "$LCTL get_param -n \
2872 obdfilter.$(facet_svc ost1).lfsck_layout |
2873 awk '/^status/ { print \\\$2 }'" "partial" $LTIME || {
2874 error "(3) OST1 is not the expected 'partial'"
2877 wait_update_facet ost2 "$LCTL get_param -n \
2878 obdfilter.$(facet_svc ost2).lfsck_layout |
2879 awk '/^status/ { print \\\$2 }'" "completed" $LTIME || {
2880 error "(4) OST2 is not the expected 'completed'"
2883 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
2885 local repaired=$(do_facet mds1 $LCTL get_param -n \
2886 mdd.$(facet_svc mds1).lfsck_layout |
2887 awk '/^repaired_orphan/ { print $2 }')
2888 [ $repaired -eq 1 ] ||
2889 error "(5) Expect 1 fixed on mds{1}, but got: $repaired"
2891 if [ $MDSCOUNT -ge 2 ]; then
2892 repaired=$(do_facet mds2 $LCTL get_param -n \
2893 mdd.$(facet_svc mds2).lfsck_layout |
2894 awk '/^repaired_orphan/ { print $2 }')
2895 [ $repaired -eq 1 ] ||
2896 error "(6) Expect 1 fixed on mds{2}, but got: $repaired"
# Second pass, with the fault cleared.
2899 echo "Trigger layout LFSCK on all devices again to cleanup"
2900 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
2902 for k in $(seq $MDSCOUNT); do
2903 # The LFSCK status query interval is 30 seconds. For the case
2904 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2905 # time to guarantee the status sync up.
2906 wait_update_facet mds${k} "$LCTL get_param -n \
2907 mdd.$(facet_svc mds${k}).lfsck_layout |
2908 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2909 error "(8) MDS${k} is not the expected 'completed'"
2912 for k in $(seq $OSTCOUNT); do
2913 cur_status=$(do_facet ost${k} $LCTL get_param -n \
2914 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2915 awk '/^status/ { print $2 }')
2916 [ "$cur_status" == "completed" ] ||
2917 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
2921 local repaired=$(do_facet mds1 $LCTL get_param -n \
2922 mdd.$(facet_svc mds1).lfsck_layout |
2923 awk '/^repaired_orphan/ { print $2 }')
2924 [ $repaired -eq 2 ] ||
2925 error "(10) Expect 2 fixed on mds{1}, but got: $repaired"
2927 if [ $MDSCOUNT -ge 2 ]; then
2928 repaired=$(do_facet mds2 $LCTL get_param -n \
2929 mdd.$(facet_svc mds2).lfsck_layout |
2930 awk '/^repaired_orphan/ { print $2 }')
2931 [ $repaired -eq 2 ] ||
2932 error "(11) Expect 2 fixed on mds{2}, but got: $repaired"
2935 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# Body of the test registered below as 18g ("Find out orphan OST-object and
# repair it (7)"). It writes a1/f1 with "setstripe -c -1" and one 1 MiB
# block per OST, removes it while fail_loc=0x162e is set on mds1, runs the
# layout LFSCK with "-r -o", expects repaired_orphan to equal $OSTCOUNT, and
# moves the recovered "${fid1}-R-0" entry from lost+found back to a1/f1.
# NOTE(review): the embedded line numbers skip values, so some original
# lines (function header, done, short statements) are not shown here.
2938 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2941 echo "The target MDT-object is lost, but related OI mapping is there"
2942 echo "The LFSCK should recreate the lost MDT-object without affected"
2943 echo "by the stale OI mapping."
2946 check_mount_and_prep
2947 $LFS mkdir -i 0 $DIR/$tdir/a1
2948 $LFS setstripe -c -1 -i 0 -S 1M $DIR/$tdir/a1
2949 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=$OSTCOUNT
# The FID is saved because the recovered entry is later looked up by name
# as "${fid1}-R-0".
2950 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2952 $LFS getstripe $DIR/$tdir/a1/f1
2953 cancel_lru_locks osc
2955 echo "Inject failure to simulate lost MDT-object but keep OI mapping"
2956 #define OBD_FAIL_LFSCK_LOST_MDTOBJ2 0x162e
2957 do_facet mds1 $LCTL set_param fail_loc=0x162e
2958 rm -f $DIR/$tdir/a1/f1
2960 do_facet mds1 $LCTL set_param fail_loc=0
2961 cancel_lru_locks mdc
2962 cancel_lru_locks osc
2964 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2965 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2967 for k in $(seq $MDSCOUNT); do
2968 # The LFSCK status query interval is 30 seconds. For the case
2969 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2970 # time to guarantee the status sync up.
2971 wait_update_facet mds${k} "$LCTL get_param -n \
2972 mdd.$(facet_svc mds${k}).lfsck_layout |
2973 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2974 error "(2) MDS${k} is not the expected 'completed'"
2977 for k in $(seq $OSTCOUNT); do
2978 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2979 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2980 awk '/^status/ { print $2 }')
2981 [ "$cur_status" == "completed" ] ||
2982 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# One repaired orphan is expected per OST.
2985 local repaired=$(do_facet mds1 $LCTL get_param -n \
2986 mdd.$(facet_svc mds1).lfsck_layout |
2987 awk '/^repaired_orphan/ { print $2 }')
2988 [ $repaired -eq $OSTCOUNT ] ||
2989 error "(4) Expect $OSTCOUNT fixed, but got: $repaired"
2991 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2992 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2993 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2995 $LFS path2fid $DIR/$tdir/a1/f1
2996 $LFS getstripe $DIR/$tdir/a1/f1
2998 run_test 18g "Find out orphan OST-object and repair it (7)"
# test 18h: a PFL file gets a bad extent range (injected with fail_loc 0x162f
# while chown is run on it).  Writing to the damaged file must fail; after a
# layout LFSCK run the repaired_orphan counter must be 2, the data must still
# match the guard copy, and writing must succeed again.
3002 echo "The PFL extent crashed. During the first cycle LFSCK scanning,"
3003 echo "the layout LFSCK will keep the bad PFL file(s) there without"
3004 echo "scanning its OST-object(s). Then in the second stage scanning,"
3005 echo "the OST will return related OST-object(s) to the MDT as orphan."
3006 echo "And then the LFSCK on the MDT can rebuild the PFL extent with"
3007 echo "the 'orphan(s)' stripe information."
3010 check_mount_and_prep
# Two-component PFL file: [0, 2M) and [2M, EOF).  Data is written at offset 0
# and again at a 2M seek, then the file is copied to "guard" for the later diff.
3012 $LFS setstripe -E 2M -S 1M -c 1 -E -1 $DIR/$tdir/f0 ||
3013 error "(0) Fail to create PFL $DIR/$tdir/f0"
3015 cat $LUSTRE/tests/test-framework.sh > $DIR/$tdir/f0 ||
3016 error "(1.1) Fail to write $DIR/$tdir/f0"
3018 dd if=$LUSTRE/tests/test-framework.sh of=$DIR/$tdir/f0 bs=1M seek=2 ||
3019 error "(1.2) Fail to write $DIR/$tdir/f0"
3021 cp $DIR/$tdir/f0 $DIR/$tdir/guard
3023 echo "Inject failure stub to simulate bad PFL extent range"
3024 #define OBD_FAIL_LFSCK_BAD_PFL_RANGE 0x162f
3025 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162f
# Use the ':' owner/group separator; the '.' form is a deprecated
# compatibility spelling in GNU chown.
3027 chown 1:1 $DIR/$tdir/f0
3029 cancel_lru_locks mdc
3030 cancel_lru_locks osc
3031 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3033 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 &&
3034 error "(2) Write to bad PFL file should fail"
3036 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
3037 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
3039 for k in $(seq $MDSCOUNT); do
3040 # The LFSCK status query interval is 30 seconds. For the case
3041 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3042 # time to guarantee the status sync up.
3043 wait_update_facet mds${k} "$LCTL get_param -n \
3044 mdd.$(facet_svc mds${k}).lfsck_layout |
3045 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
3046 error "(4.1) MDS${k} is not the expected 'completed'"
# Each OST is queried once and must already report 'completed'.  The status
# variable is declared local so it does not leak into the global scope.
3049 for k in $(seq $OSTCOUNT); do
3050 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3051 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3052 awk '/^status/ { print $2 }')
3053 [ "$cur_status" == "completed" ] ||
3054 error "(4.2) OST${k} Expect 'completed', but got '$cur_status'"
3058 local repaired=$($SHOW_LAYOUT |
3059 awk '/^repaired_orphan/ { print $2 }')
3060 [ $repaired -eq 2 ] ||
3061 error "(5) Fail to repair crashed PFL range: $repaired"
3063 echo "Data in $DIR/$tdir/f0 should not be broken"
3064 diff $DIR/$tdir/f0 $DIR/$tdir/guard ||
3065 error "(6) Data in $DIR/$tdir/f0 is broken"
3067 echo "Write should succeed after LFSCK repairing the bad PFL range"
3068 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 ||
3069 error "(7) Write should succeed after LFSCK"
3071 run_test 18h "LFSCK can repair crashed PFL extent range"
# Applies "-cache" to the local node's debug parameter before the following
# tests; the command's output is discarded.
# NOTE(review): presumably this removes the "cache" flag from the debug mask;
# confirm against the lctl documentation.
3073 $LCTL set_param debug=-cache > /dev/null
# test 19a: with lfsck_verify_pfid enabled on OST0000 and fail_loc 0x1619
# (OBD_FAIL_LFSCK_INVALID_PFID) set on the local node, reads of a plain file
# (a0) and of a PFL file (a1) are both expected to fail.
# Requires an MDS version newer than 2.5.55.
3076 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
3077 skip "MDS older than 2.5.55, LU-3951"
3079 check_mount_and_prep
3080 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# Parent-FID verification is switched off on the OST nodes while the test
# files are created.
3082 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
3083 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
3085 echo "foo1" > $DIR/$tdir/a0
3086 $LFS setstripe -E 512K -S 512K -o 0 -E -1 -S 1M $DIR/$tdir/a1 ||
3087 error "(0) Fail to create PFL $DIR/$tdir/a1"
3088 echo "foo2" > $DIR/$tdir/a1
3089 echo "guard" > $DIR/$tdir/a2
3090 cancel_lru_locks osc
3092 echo "Inject failure, then client will offer wrong parent FID when read"
3093 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
3094 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
# fail_loc is set with plain $LCTL (no do_facet/do_nodes wrapper), i.e. on
# the node running this script.
3096 #define OBD_FAIL_LFSCK_INVALID_PFID 0x1619
3097 $LCTL set_param fail_loc=0x1619
3099 echo "Read RPC with wrong parent FID should be denied"
3100 cat $DIR/$tdir/a0 && error "(3.1) Read a0 should be denied!"
3101 cat $DIR/$tdir/a1 && error "(3.2) Read a1 should be denied!"
3102 $LCTL set_param fail_loc=0
3104 run_test 19a "OST-object inconsistency self detect"
# test 19b: two files (plain f0 and PFL f1) are written while fail_loc 0x1611
# (OBD_FAIL_LFSCK_UNMATCHED_PAIR1) is set on the OST nodes.  With
# lfsck_verify_pfid disabled the "repaired" counter must stay 0; after it is
# enabled and both files are read, the counter must be 2.
# Requires an MDS version newer than 2.5.55.
3107 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
3108 skip "MDS older than 2.5.55, LU-3951"
3110 check_mount_and_prep
3111 $LFS setstripe -c 1 -i 0 $DIR/$tdir
3113 echo "Inject failure stub to make the OST-object to back point to"
3114 echo "non-exist MDT-object"
3116 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
3117 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
3119 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
3120 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
3121 echo "foo1" > $DIR/$tdir/f0
3122 $LFS setstripe -E 1M -S 1M -o 0 -E 4M -S 256K $DIR/$tdir/f1 ||
3123 error "(0) Fail to create PFL $DIR/$tdir/f1"
3124 echo "foo2" > $DIR/$tdir/f1
3125 cancel_lru_locks osc
3126 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
# Writing 0 to lfsck_verify_pfid precedes the first read of its "repaired"
# line, which must be 0.
3128 do_facet ost1 $LCTL set_param -n \
3129 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
3130 echo "Nothing should be fixed since self detect and repair is disabled"
3131 local repaired=$(do_facet ost1 $LCTL get_param -n \
3132 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
3133 awk '/^repaired/ { print $2 }')
3134 [ $repaired -eq 0 ] ||
3135 error "(1) Expected 0 repaired, but got $repaired"
3137 echo "Read RPC with right parent FID should be accepted,"
3138 echo "and cause parent FID on OST to be fixed"
3140 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
3141 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
3143 cat $DIR/$tdir/f0 || error "(2.1) Read f0 should not be denied!"
3144 cat $DIR/$tdir/f1 || error "(2.2) Read f1 should not be denied!"
# Both files were read, so two repairs are expected; the error text states
# the same number that the comparison checks.
3146 repaired=$(do_facet ost1 $LCTL get_param -n \
3147 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
3148 awk '/^repaired/ { print $2 }')
3149 [ $repaired -eq 2 ] ||
3150 error "(3) Expected 2 repaired, but got $repaired"
3152 run_test 19b "OST-object inconsistency self repair"
# Values compared against the output of "$LFS getstripe -L" in tests 20a/20b:
# the first is expected for a recovered file whose layout has a LOV EA hole,
# the second for a recovered file whose layout has no hole.
3154 PATTERN_WITH_HOLE="40000001"
3155 PATTERN_WITHOUT_HOLE="raid0"
# test 20a: files f0..f2 (and f3 when there are more than 2 OSTs) lose their
# MDT-object; f1/f2/f3 additionally lose one OST-object each (fail_loc 0x161a,
# with fail_val 1 and 2 for f2 and f3).  After a layout LFSCK run the files
# re-created under $MOUNT/.lustre/lost+found/MDT0000/<fid>-R-0 are checked for
# layout pattern, stripe count, size, read/write behaviour, chown, touch and
# unlink.  Needs at least 2 OSTs, no FILESET, and an MDS newer than 2.5.55.
3158 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
3159 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
3160 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
3161 skip "MDS older than 2.5.55, LU-4887"
3164 echo "The target MDT-object and some of its OST-object are lost."
3165 echo "The LFSCK should find out the left OST-objects and re-create"
3166 echo "the MDT-object under the directory .lustre/lost+found/MDTxxxx/"
3167 echo "with the partial OST-objects (LOV EA hole)."
3169 echo "New client can access the file with LOV EA hole via normal"
3170 echo "system tools or commands without crash the system."
3172 echo "For old client, even though it cannot access the file with"
3173 echo "LOV EA hole, it should not cause the system crash."
# Directory a1 gets 3 stripes when more than 2 OSTs exist, otherwise 2.
3176 check_mount_and_prep
3177 $LFS mkdir -i 0 $DIR/$tdir/a1
3178 if [ $OSTCOUNT -gt 2 ]; then
3179 $LFS setstripe -c 3 -i 0 -S 1M $DIR/$tdir/a1
3182 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a1
# NOTE(review): $bcount is expected to be set by this point; per the block
# counts listed below it should be 257 for 2 OSTs and 513 otherwise - confirm.
3186 # 256 blocks on the stripe0.
3187 # 1 block on the stripe1 for 2 OSTs case.
3188 # 256 blocks on the stripe1 for other cases.
3189 # 1 block on the stripe2 if OSTs > 2
3190 dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
3191 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
3192 dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount
3194 local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
3195 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
3196 local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)
3199 $LFS getstripe $DIR/$tdir/a1/f0
3201 $LFS getstripe $DIR/$tdir/a1/f1
3203 $LFS getstripe $DIR/$tdir/a1/f2
# fid3 is declared local like fid0..fid2 so it does not leak into the
# global scope of the test script.
3205 if [ $OSTCOUNT -gt 2 ]; then
3206 dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
3207 local fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
3209 $LFS getstripe $DIR/$tdir/a1/f3
3212 cancel_lru_locks osc
3214 echo "Inject failure..."
3215 echo "To simulate f0 lost MDT-object"
3216 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
3217 do_facet mds1 $LCTL set_param fail_loc=0x1616
3218 rm -f $DIR/$tdir/a1/f0
3220 echo "To simulate f1 lost MDT-object and OST-object0"
3221 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
3222 do_facet mds1 $LCTL set_param fail_loc=0x161a
3223 rm -f $DIR/$tdir/a1/f1
3225 echo "To simulate f2 lost MDT-object and OST-object1"
3226 do_facet mds1 $LCTL set_param fail_val=1
3227 rm -f $DIR/$tdir/a1/f2
3229 if [ $OSTCOUNT -gt 2 ]; then
3230 echo "To simulate f3 lost MDT-object and OST-object2"
3231 do_facet mds1 $LCTL set_param fail_val=2
3232 rm -f $DIR/$tdir/a1/f3
# The client is unmounted for the duration of the scan and remounted at (5.0).
3235 umount_client $MOUNT
3238 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
3240 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3241 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
3243 for k in $(seq $MDSCOUNT); do
3244 # The LFSCK status query interval is 30 seconds. For the case
3245 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3246 # time to guarantee the status sync up.
3247 wait_update_facet mds${k} "$LCTL get_param -n \
3248 mdd.$(facet_svc mds${k}).lfsck_layout |
3249 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3250 error "(2) MDS${k} is not the expected 'completed'"
3253 for k in $(seq $OSTCOUNT); do
3254 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3255 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3256 awk '/^status/ { print $2 }')
3257 [ "$cur_status" == "completed" ] ||
3258 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan count on mds1: 9 with more than 2 OSTs, else 4.
3261 local repaired=$(do_facet mds1 $LCTL get_param -n \
3262 mdd.$(facet_svc mds1).lfsck_layout |
3263 awk '/^repaired_orphan/ { print $2 }')
3264 if [ $OSTCOUNT -gt 2 ]; then
3265 [ $repaired -eq 9 ] ||
3266 error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
3268 [ $repaired -eq 4 ] ||
3269 error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
3272 mount_client $MOUNT || error "(5.0) Fail to start client!"
3274 LOV_PATTERN_F_HOLE=0x40000000
3277 # ${fid0}-R-0 is the old f0
3279 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
3280 echo "Check $name, which is the old f0"
3282 $LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"
3284 local pattern=$($LFS getstripe -L $name)
3285 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3286 error "(5.2) NOT expect pattern flag hole, but got $pattern"
3288 local stripes=$($LFS getstripe -c $name)
3289 if [ $OSTCOUNT -gt 2 ]; then
3290 [ $stripes -eq 3 ] ||
3291 error "(5.3.1) expect the stripe count is 3, but got $stripes"
3293 [ $stripes -eq 2 ] ||
3294 error "(5.3.2) expect the stripe count is 2, but got $stripes"
3297 local size=$(stat $name | awk '/Size:/ { print $2 }')
3298 [ $size -eq $((4096 * $bcount)) ] ||
3299 error "(5.4) expect the size $((4096 * $bcount)), but got $size"
3301 cat $name > /dev/null || error "(5.5) cannot read $name"
3303 echo "dummy" >> $name || error "(5.6) cannot write $name"
3305 chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"
3307 touch $name || error "(5.8) cannot touch $name"
3309 rm -f $name || error "(5.9) cannot unlink $name"
3312 # ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
3314 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3315 if [ $OSTCOUNT -gt 2 ]; then
3316 echo "Check $name, it contains the old f1's stripe1 and stripe2"
3318 echo "Check $name, it contains the old f1's stripe1"
3321 $LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"
3323 pattern=$($LFS getstripe -L $name)
3324 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3325 error "(6.2) expect pattern flag hole, but got $pattern"
3327 stripes=$($LFS getstripe -c $name)
3328 if [ $OSTCOUNT -gt 2 ]; then
3329 [ $stripes -eq 3 ] ||
3330 error "(6.3.1) expect the stripe count is 3, but got $stripes"
3332 [ $stripes -eq 2 ] ||
3333 error "(6.3.2) expect the stripe count is 2, but got $stripes"
3336 size=$(stat $name | awk '/Size:/ { print $2 }')
3337 [ $size -eq $((4096 * $bcount)) ] ||
3338 error "(6.4) expect the size $((4096 * $bcount)), but got $size"
3340 cat $name > /dev/null && error "(6.5) normal read $name should fail"
# dd continues past read errors (conv=sync,noerror); the number of
# "Input/output error" lines it prints is counted.
3342 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3343 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3346 [ $failures -eq 256 ] ||
3347 error "(6.6) expect 256 IO failures, but get $failures"
3349 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3350 [ $size -eq $((4096 * $bcount)) ] ||
3351 error "(6.7) expect the size $((4096 * $bcount)), but got $size"
3353 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
3354 error "(6.8) write to the LOV EA hole should fail"
3356 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
3357 error "(6.9) write to normal stripe should NOT fail"
3359 echo "foo" >> $name && error "(6.10) append write $name should fail"
3361 chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"
3363 touch $name || error "(6.12) cannot touch $name"
3365 rm -f $name || error "(6.13) cannot unlink $name"
3368 # ${fid2}-R-0 contains the old f2's stripe0 (and stripe2 if OSTs > 2)
3370 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
3371 if [ $OSTCOUNT -gt 2 ]; then
3372 echo "Check $name, it contains the old f2's stripe0 and stripe2"
3374 echo "Check $name, it contains the old f2's stripe0"
3377 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
3379 pattern=$($LFS getstripe -L $name)
3380 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3381 error "(7.2) expect pattern flag hole, but got $pattern"
3383 stripes=$($LFS getstripe -c $name)
3384 size=$(stat $name | awk '/Size:/ { print $2 }')
3385 if [ $OSTCOUNT -gt 2 ]; then
3386 [ $stripes -eq 3 ] ||
3387 error "(7.3.1) expect the stripe count is 3, but got $stripes"
3389 [ $size -eq $((4096 * $bcount)) ] ||
3390 error "(7.4.1) expect size $((4096 * $bcount)), but got $size"
3392 cat $name > /dev/null &&
3393 error "(7.5.1) normal read $name should fail"
3395 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3396 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3398 [ $failures -eq 256 ] ||
3399 error "(7.6) expect 256 IO failures, but get $failures"
3401 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3402 [ $size -eq $((4096 * $bcount)) ] ||
3403 error "(7.7) expect the size $((4096 * $bcount)), but got $size"
3405 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3406 seek=300 && error "(7.8.0) write to the LOV EA hole should fail"
3408 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
3409 error "(7.8.1) write to normal stripe should NOT fail"
3411 echo "foo" >> $name &&
3412 error "(7.8.3) append write $name should fail"
3414 chown $RUNAS_ID:$RUNAS_GID $name ||
3415 error "(7.9.1) cannot chown on $name"
3417 touch $name || error "(7.10.1) cannot touch $name"
3419 [ $stripes -eq 2 ] ||
3420 error "(7.3.2) expect the stripe count is 2, but got $stripes"
3423 [ $size -eq $((4096 * (256 + 0))) ] ||
3424 error "(7.4.2) expect the size $((4096 * 256)), but got $size"
3426 cat $name > /dev/null &&
3427 error "(7.5.2) normal read $name should fail"
3429 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3430 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3431 [ $failures -eq 256 ] ||
3432 error "(7.6.2) expect 256 IO failures, but get $failures"
3435 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3436 [ $size -eq $((4096 * $bcount)) ] ||
3437 error "(7.7.2) expect the size $((4096 * $bcount)), got $size"
3439 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3440 seek=256 && error "(7.8.2) write to the LOV EA hole should fail"
3442 chown $RUNAS_ID:$RUNAS_GID $name ||
3443 error "(7.9.2) cannot chown on $name"
3445 touch $name || error "(7.10.2) cannot touch $name"
3448 rm -f $name || error "(7.11) cannot unlink $name"
# The remaining checks concern f3, which only exists with more than 2 OSTs.
3450 [ $OSTCOUNT -le 2 ] && return
3453 # ${fid3}-R-0 should contain the old f3's stripe0 and stripe1
3455 name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
3456 echo "Check $name, which contains the old f3's stripe0 and stripe1"
3458 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
3460 pattern=$($LFS getstripe -L $name)
3461 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3462 error "(8.2) expect pattern flag hole, but got $pattern"
3464 stripes=$($LFS getstripe -c $name)
3465 [ $stripes -eq 3 ] ||
3466 error "(8.3) expect the stripe count is 3, but got $stripes"
3468 size=$(stat $name | awk '/Size:/ { print $2 }')
3470 [ $size -eq $((4096 * (256 + 256 + 0))) ] ||
3471 error "(8.4) expect the size $((4096 * 512)), but got $size"
3473 cat $name > /dev/null &&
3474 error "(8.5) normal read $name should fail"
3476 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3477 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3479 [ $failures -eq 256 ] ||
3480 error "(8.6) expect 256 IO failures, but get $failures"
3483 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3484 [ $size -eq $((4096 * $bcount)) ] ||
3485 error "(8.7) expect the size $((4096 * $bcount)), but got $size"
3487 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3488 seek=512 && error "(8.8) write to the LOV EA hole should fail"
3490 chown $RUNAS_ID:$RUNAS_GID $name ||
3491 error "(8.9) cannot chown on $name"
3493 touch $name || error "(8.10) cannot touch $name"
3495 rm -f $name || error "(8.11) cannot unlink $name"
3497 run_test 20a "Handle the orphan with dummy LOV EA slot properly"
# test 20b: PFL variant of test 20a.  Three two-component PFL files (2 stripes
# per component) are written; failures 0x1616 / 0x161a (with fail_val 1 for
# f2) are injected so that, per the echoed descriptions, f0 loses its
# MDT-object and f1/f2 additionally lose the first/second OST-object of each
# component.  After layout LFSCK (8 repaired orphans expected on mds1) each
# file under $MOUNT/.lustre/lost+found/MDT0000/<fid>-R-0 is checked per
# component for pattern, stripe count and extent, then for size and I/O.
# Needs at least 2 OSTs, no FILESET, and an MDS newer than 2.5.55.
3500 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
3501 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
3502 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
3503 skip "MDS older than 2.5.55, LU-4887"
3506 echo "The target MDT-object and some of its OST-object are lost."
3507 echo "The LFSCK should find out the left OST-objects and re-create"
3508 echo "the MDT-object under the directory .lustre/lost+found/MDTxxxx/"
3509 echo "with the partial OST-objects (LOV EA hole)."
3511 echo "New client can access the file with LOV EA hole via normal"
3512 echo "system tools or commands without crash the system - PFL case."
3515 check_mount_and_prep
3517 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f0 ||
3518 error "(0) Fail to create PFL file $DIR/$tdir/f0"
3519 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f1 ||
3520 error "(1) Fail to create PFL file $DIR/$tdir/f1"
3521 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f2 ||
3522 error "(2) Fail to create PFL file $DIR/$tdir/f2"
# 769 blocks of 4096 bytes, i.e. 3M plus one block, are written to each file.
3524 local bcount=$((256 * 3 + 1))
3526 dd if=/dev/zero of=$DIR/$tdir/f0 bs=4096 count=$bcount
3527 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4096 count=$bcount
3528 dd if=/dev/zero of=$DIR/$tdir/f2 bs=4096 count=$bcount
3530 local fid0=$($LFS path2fid $DIR/$tdir/f0)
3531 local fid1=$($LFS path2fid $DIR/$tdir/f1)
3532 local fid2=$($LFS path2fid $DIR/$tdir/f2)
3535 $LFS getstripe $DIR/$tdir/f0
3537 $LFS getstripe $DIR/$tdir/f1
3539 $LFS getstripe $DIR/$tdir/f2
3541 cancel_lru_locks mdc
3542 cancel_lru_locks osc
3544 echo "Inject failure..."
3545 echo "To simulate f0 lost MDT-object"
3546 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
3547 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1616
3550 echo "To simulate the case of f1 lost MDT-object and "
3551 echo "the first OST-object in each PFL component"
3552 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
3553 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161a
3556 echo "To simulate the case of f2 lost MDT-object and "
3557 echo "the second OST-object in each PFL component"
3558 do_facet $SINGLEMDS $LCTL set_param fail_val=1
3563 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
3565 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3566 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
3568 for k in $(seq $MDSCOUNT); do
3569 # The LFSCK status query interval is 30 seconds. For the case
3570 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3571 # time to guarantee the status sync up.
3572 wait_update_facet mds${k} "$LCTL get_param -n \
3573 mdd.$(facet_svc mds${k}).lfsck_layout |
3574 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3575 error "(4) MDS${k} is not the expected 'completed'"
3578 for k in $(seq $OSTCOUNT); do
3579 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3580 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3581 awk '/^status/ { print $2 }')
3582 [ "$cur_status" == "completed" ] ||
3583 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
3586 local repaired=$(do_facet mds1 $LCTL get_param -n \
3587 mdd.$(facet_svc mds1).lfsck_layout |
3588 awk '/^repaired_orphan/ { print $2 }')
3589 [ $repaired -eq 8 ] ||
3590 error "(6) Expect 8 fixed on mds1, but got: $repaired"
3593 # ${fid0}-R-0 is the old f0
3595 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
3596 echo "Check $name, which is the old f0"
3598 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
# -I1 / -I2 select the first and second layout component respectively.
3600 local pattern=$($LFS getstripe -L -I1 $name)
3601 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3602 error "(7.2.1) NOT expect pattern flag hole, but got $pattern"
3604 pattern=$($LFS getstripe -L -I2 $name)
3605 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3606 error "(7.2.2) NOT expect pattern flag hole, but got $pattern"
3608 local stripes=$($LFS getstripe -c -I1 $name)
3609 [ $stripes -eq 2 ] ||
3610 error "(7.3.1) expect 2 stripes, but got $stripes"
3612 stripes=$($LFS getstripe -c -I2 $name)
3613 [ $stripes -eq 2 ] ||
3614 error "(7.3.2) expect 2 stripes, but got $stripes"
3616 local e_start=$($LFS getstripe -I1 $name |
3617 awk '/lcme_extent.e_start:/ { print $2 }')
3618 [ $e_start -eq 0 ] ||
3619 error "(7.4.1) expect the COMP1 start at 0, got $e_start"
3621 local e_end=$($LFS getstripe -I1 $name |
3622 awk '/lcme_extent.e_end:/ { print $2 }')
3623 [ $e_end -eq 2097152 ] ||
3624 error "(7.4.2) expect the COMP1 end at 2097152, got $e_end"
3626 e_start=$($LFS getstripe -I2 $name |
3627 awk '/lcme_extent.e_start:/ { print $2 }')
3628 [ $e_start -eq 2097152 ] ||
3629 error "(7.5.1) expect the COMP2 start at 2097152, got $e_start"
3631 e_end=$($LFS getstripe -I2 $name |
3632 awk '/lcme_extent.e_end:/ { print $2 }')
3633 [ "$e_end" = "EOF" ] ||
3634 error "(7.5.2) expect the COMP2 end at (EOF), got $e_end"
3636 local size=$(stat $name | awk '/Size:/ { print $2 }')
3637 [ $size -eq $((4096 * $bcount)) ] ||
3638 error "(7.6) expect the size $((4096 * $bcount)), but got $size"
3640 cat $name > /dev/null || error "(7.7) cannot read $name"
3642 echo "dummy" >> $name || error "(7.8) cannot write $name"
3644 chown $RUNAS_ID:$RUNAS_GID $name || error "(7.9) cannot chown on $name"
3646 touch $name || error "(7.10) cannot touch $name"
3648 rm -f $name || error "(7.11) cannot unlink $name"
3651 # ${fid1}-R-0 contains the old f1's second stripe in each COMP
3653 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3654 echo "Check $name, it contains f1's second OST-object in each COMP"
3656 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
3658 pattern=$($LFS getstripe -L -I1 $name)
3659 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3660 error "(8.2.1) expect pattern flag hole, but got $pattern"
3662 pattern=$($LFS getstripe -L -I2 $name)
3663 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3664 error "(8.2.2) expect pattern flag hole, but got $pattern"
# Distinct tags (8.3.1)/(8.3.2) tell the COMP1 and COMP2 checks apart.
3666 stripes=$($LFS getstripe -c -I1 $name)
3667 [ $stripes -eq 2 ] ||
3668 error "(8.3.1) expect 2 stripes, but got $stripes"
3670 stripes=$($LFS getstripe -c -I2 $name)
3671 [ $stripes -eq 2 ] ||
3672 error "(8.3.2) expect 2 stripes, but got $stripes"
3674 e_start=$($LFS getstripe -I1 $name |
3675 awk '/lcme_extent.e_start:/ { print $2 }')
3676 [ $e_start -eq 0 ] ||
3677 error "(8.4.1) expect the COMP1 start at 0, got $e_start"
3679 e_end=$($LFS getstripe -I1 $name |
3680 awk '/lcme_extent.e_end:/ { print $2 }')
3681 [ $e_end -eq 2097152 ] ||
3682 error "(8.4.2) expect the COMP1 end at 2097152, got $e_end"
3684 e_start=$($LFS getstripe -I2 $name |
3685 awk '/lcme_extent.e_start:/ { print $2 }')
3686 [ $e_start -eq 2097152 ] ||
3687 error "(8.5.1) expect the COMP2 start at 2097152, got $e_start"
3689 e_end=$($LFS getstripe -I2 $name |
3690 awk '/lcme_extent.e_end:/ { print $2 }')
3691 [ "$e_end" = "EOF" ] ||
3692 error "(8.5.2) expect the COMP2 end at (EOF), got $e_end"
3694 size=$(stat $name | awk '/Size:/ { print $2 }')
3695 [ $size -eq $((4096 * $bcount)) ] ||
3696 error "(8.6) expect the size $((4096 * $bcount)), but got $size"
3698 cat $name > /dev/null && error "(8.7) normal read $name should fail"
3700 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3701 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3703 # The first stripe in each COMP was lost
3704 [ $failures -eq 512 ] ||
3705 error "(8.8) expect 512 IO failures, but get $failures"
3707 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3708 [ $size -eq $((4096 * $bcount)) ] ||
3709 error "(8.9) expect the size $((4096 * $bcount)), but got $size"
3711 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
3712 error "(8.10) write to the LOV EA hole should fail"
3714 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
3715 error "(8.11) write to normal stripe should NOT fail"
3717 echo "foo" >> $name && error "(8.12) append write $name should fail"
3719 chown $RUNAS_ID:$RUNAS_GID $name || error "(8.13) cannot chown on $name"
3721 touch $name || error "(8.14) cannot touch $name"
3723 rm -f $name || error "(8.15) cannot unlink $name"
3726 # ${fid2}-R-0 contains the old f2's first stripe in each COMP
3728 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
3729 echo "Check $name, it contains f2's first stripe in each COMP"
3731 $LFS getstripe -v $name || error "(9.1) cannot getstripe on $name"
3733 pattern=$($LFS getstripe -L -I1 $name)
3734 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3735 error "(9.2.1) expect pattern flag hole, but got $pattern"
3737 pattern=$($LFS getstripe -L -I2 $name)
3738 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3739 error "(9.2.2) expect pattern flag hole, but got $pattern"
# Distinct tags (9.3.1)/(9.3.2) tell the COMP1 and COMP2 checks apart.
3741 stripes=$($LFS getstripe -c -I1 $name)
3742 [ $stripes -eq 2 ] ||
3743 error "(9.3.1) expect 2 stripes, but got $stripes"
3745 stripes=$($LFS getstripe -c -I2 $name)
3746 [ $stripes -eq 2 ] ||
3747 error "(9.3.2) expect 2 stripes, but got $stripes"
3749 e_start=$($LFS getstripe -I1 $name |
3750 awk '/lcme_extent.e_start:/ { print $2 }')
3751 [ $e_start -eq 0 ] ||
3752 error "(9.4.1) expect the COMP1 start at 0, got $e_start"
3754 e_end=$($LFS getstripe -I1 $name |
3755 awk '/lcme_extent.e_end:/ { print $2 }')
3756 [ $e_end -eq 2097152 ] ||
3757 error "(9.4.2) expect the COMP1 end at 2097152, got $e_end"
3759 e_start=$($LFS getstripe -I2 $name |
3760 awk '/lcme_extent.e_start:/ { print $2 }')
3761 [ $e_start -eq 2097152 ] ||
3762 error "(9.5.1) expect the COMP2 start at 2097152, got $e_start"
3764 e_end=$($LFS getstripe -I2 $name |
3765 awk '/lcme_extent.e_end:/ { print $2 }')
3766 [ "$e_end" = "EOF" ] ||
3767 error "(9.5.2) expect the COMP2 end at (EOF), got $e_end"
3769 size=$(stat $name | awk '/Size:/ { print $2 }')
3770 # The second stripe in COMP was lost, so we do not know there
3771 # have ever been some data before. 'stat' will regard it as
3772 # no data on the lost stripe.
3774 [ $size -eq $((4096 * $bcount)) ] ||
3775 error "(9.6) expect size $((4096 * $bcount)), but got $size"
3777 cat $name > /dev/null &&
3778 error "(9.7) normal read $name should fail"
# The error text states the same count (512) that the comparison checks.
3780 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3781 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3782 [ $failures -eq 512 ] ||
3783 error "(9.8) expect 512 IO failures, but get $failures"
3785 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3786 # The second stripe in COMP was lost, so we do not know there
3787 # have ever been some data before. Since 'dd' skip failure,
3788 # it will regard the lost stripe contains data.
3790 [ $size -eq $((4096 * $bcount)) ] ||
3791 error "(9.9) expect the size $((4096 * $bcount)), but got $size"
3793 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3794 seek=300 && error "(9.10) write to the LOV EA hole should fail"
3796 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
3797 error "(9.11) write to normal stripe should NOT fail"
3799 echo "foo" >> $name &&
3800 error "(9.12) append write $name should fail"
3802 chown $RUNAS_ID:$RUNAS_GID $name ||
3803 error "(9.13) cannot chown on $name"
3805 touch $name || error "(9.14) cannot touch $name"
3807 rm -f $name || error "(9.15) cannot unlink $name"
3809 run_test 20b "Handle the orphan with dummy LOV EA slot properly - PFL case"
# test 21: starts LFSCK on ${FSNAME}-MDT0000 without a -t type argument and
# checks that both the namespace and the layout component then report status
# 'scanning-phase1'; LFSCK is stopped again afterwards.
# Requires an MDS version newer than 2.5.59.
3812 (( $MDS1_VERSION > $(version_code 2.5.59) )) ||
3813 skip "MDS older than 2.5.59, LU-4887"
3815 check_mount_and_prep
3816 createmany -o $DIR/$tdir/f 100 || error "(0) Fail to create 100 files"
# NOTE(review): "-s 1" presumably throttles the scan so that it is still in
# phase 1 when the status is sampled below - confirm against lctl lfsck_start.
3818 echo "Start all LFSCK components by default (-s 1)"
3819 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -s 1 -r ||
3820 error "Fail to start LFSCK"
3822 echo "namespace LFSCK should be in 'scanning-phase1' status"
3823 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
3824 [ "$STATUS" == "scanning-phase1" ] ||
3825 error "Expect namespace 'scanning-phase1', but got '$STATUS'"
3827 echo "layout LFSCK should be in 'scanning-phase1' status"
3828 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
3829 [ "$STATUS" == "scanning-phase1" ] ||
3830 error "Expect layout 'scanning-phase1', but got '$STATUS'"
3832 echo "Stop all LFSCK components by default"
3833 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 ||
3834 error "Fail to stop LFSCK"
3836 run_test 21 "run all LFSCK components by default"
# test 22a: directory foo/dummy is created on MDT0 while fail_loc 0x161e
# (OBD_FAIL_LFSCK_BAD_PARENT) is set, and the directory "guard" is removed
# afterwards.  Namespace LFSCK is expected to report exactly one repaired
# unmatched pair, after which listing foo/dummy must succeed.
# Needs at least 2 MDTs and an MDS version newer than 2.6.50.
3839 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3840 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
3841 skip "MDS older than 2.6.50, LU-5511"
# The inner double quotes around .. are escaped so that they are printed
# instead of terminating the echo string.
3844 echo "The parent_A references the child directory via some name entry,"
3845 echo "but the child directory back references another parent_B via its"
3846 echo "\"..\" name entry. The parent_B does not exist. Then the namespace"
3847 echo "LFSCK will repair the child directory's \"..\" name entry."
3850 check_mount_and_prep
3852 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
3853 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
3855 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
3856 echo "The dummy's dotdot name entry references the guard."
3857 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
3858 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
3859 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
3860 error "(3) Fail to mkdir on MDT0"
3861 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3863 rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"
3865 echo "Trigger namespace LFSCK to repair unmatched pairs"
3866 $START_NAMESPACE -A -r ||
3867 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): the trailing 6 is presumably the step tag used in the helper's
# own error message (the local tags skip from 5 to 7) - confirm in the helper.
3869 wait_all_targets_blocked namespace completed 6
3871 local repaired=$($SHOW_NAMESPACE |
3872 awk '/^unmatched_pairs_repaired/ { print $2 }')
3873 [ $repaired -eq 1 ] ||
3874 error "(7) Fail to repair unmatched pairs: $repaired"
3876 echo "'ls' should success after namespace LFSCK repairing"
3877 ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
3878 error "(8) ls should success."
3880 run_test 22a "LFSCK can repair unmatched pairs (1)"
# test 22b: directory foo/dummy is created on MDT0 while fail_loc 0x161e
# (OBD_FAIL_LFSCK_BAD_PARENT) is set.  Before the repair, fid2path on dummy's
# FID must not yield $DIR/$tdir/foo/dummy; after namespace LFSCK reports one
# repaired unmatched pair, fid2path must return exactly that path.
# Needs at least 2 MDTs and an MDS version newer than 2.6.50.
3883 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3884 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
3885 skip "MDS older than 2.6.50, LU-5511"
# The inner double quotes around .. are escaped so that they are printed
# instead of terminating the echo string.
3888 echo "The parent_A references the child directory via the name entry_B,"
3889 echo "but the child directory back references another parent_C via its"
3890 echo "\"..\" name entry. The parent_C exists, but there is no the name"
3891 echo "entry_B under the parent_C. Then the namespace LFSCK will repair"
3892 echo "the child directory's \"..\" name entry and its linkEA."
3895 check_mount_and_prep
3897 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
3898 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
3900 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
3901 echo "and bad linkEA. The dummy's dotdot name entry references the"
3902 echo "guard. The dummy's linkEA references a non-exist name entry."
3903 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
3904 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
3905 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
3906 error "(3) Fail to mkdir on MDT0"
3907 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3909 local dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
3910 echo "fid2path should NOT work on the dummy's FID $dummyfid"
3911 local dummyname=$($LFS fid2path $DIR $dummyfid)
3912 [ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
3913 error "(4) fid2path works unexpectedly."
3915 echo "Trigger namespace LFSCK to repair unmatched pairs"
3916 $START_NAMESPACE -A -r ||
3917 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): the trailing 6 is presumably the step tag used in the helper's
# own error message (the local tags skip from 5 to 7) - confirm in the helper.
3919 wait_all_targets_blocked namespace completed 6
3921 local repaired=$($SHOW_NAMESPACE |
3922 awk '/^unmatched_pairs_repaired/ { print $2 }')
3923 [ $repaired -eq 1 ] ||
3924 error "(7) Fail to repair unmatched pairs: $repaired"
# dummyname was already declared local above; it is only re-assigned here.
3926 echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
3927 dummyname=$($LFS fid2path $DIR $dummyfid)
3928 [ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
3929 error "(8) fid2path does not work"
3931 run_test 22b "LFSCK can repair unmatched pairs (2)"
# test_23a body (the function header and other short lines are not part of
# this listing).
# Scenario: d0/d1 is removed while fail_loc 0x1620 (OBD_FAIL_LFSCK_DANGLING2)
# is set on mds2, after which 'ls' on d0/d1 is expected to fail. A first
# namespace LFSCK run (-A -r) must count one dangling_repaired while 'ls'
# still fails; a second run with -C added must again count one, and then
# 'ls' must succeed.
# Skipped when fewer than 2 MDTs are configured or MDS1 is not newer
# than 2.6.50.
3934 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3935 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
3936 skip "MDS older than 2.6.50, LU-5512"
3939 echo "The name entry is there, but the MDT-object for such name "
3940 echo "entry does not exist. The namespace LFSCK should find out "
3941 echo "and repair the inconsistency as required."
3944 check_mount_and_prep
3946 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3947 $LFS mkdir -i 1 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT1"
3949 echo "Inject failure stub on MDT1 to simulate dangling name entry"
# The fail_loc is set on mds2 only for the duration of the rmdir.
3950 #define OBD_FAIL_LFSCK_DANGLING2 0x1620
3951 do_facet mds2 $LCTL set_param fail_loc=0x1620
3952 rmdir $DIR/$tdir/d0/d1 || error "(3) Fail to rmdir d1"
3953 do_facet mds2 $LCTL set_param fail_loc=0
3955 echo "'ls' should fail because of dangling name entry"
3956 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(4) ls should fail."
3958 echo "Trigger namespace LFSCK to find out dangling name entry"
3959 $START_NAMESPACE -A -r ||
3960 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): the trailing 6 is presumably the step number reported by
# wait_all_targets_blocked on failure (no "(6)" error appears here) - confirm.
3962 wait_all_targets_blocked namespace completed 6
3964 local repaired=$($SHOW_NAMESPACE |
3965 awk '/^dangling_repaired/ { print $2 }')
3966 [ $repaired -eq 1 ] ||
3967 error "(7) Fail to repair dangling name entry: $repaired"
3969 echo "'ls' should fail because not re-create MDT-object by default"
3970 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(8) ls should fail."
3972 echo "Trigger namespace LFSCK again to repair dangling name entry"
# Second pass adds -C; per the messages around it this is the run that is
# expected to make d0/d1 listable again.
3973 $START_NAMESPACE -A -r -C ||
3974 error "(9) Fail to start LFSCK for namespace"
3976 wait_all_targets_blocked namespace completed 10
3978 repaired=$($SHOW_NAMESPACE |
3979 awk '/^dangling_repaired/ { print $2 }')
3980 [ $repaired -eq 1 ] ||
3981 error "(11) Fail to repair dangling name entry: $repaired"
3983 echo "'ls' should success after namespace LFSCK repairing"
3984 ls -ail $DIR/$tdir/d0/d1 > /dev/null || error "(12) ls should success."
3986 run_test 23a "LFSCK can repair dangling name entry (1)"
# test_23b body (the function header and other short lines, including the
# closing 'fi' and '}' of the compound commands, are not part of this
# listing).
# Scenario: hard link "foo" to f0 is created while fail_loc 0x1621
# (OBD_FAIL_LFSCK_DANGLING3) is set with fail_val = f1's OID, then f1 is
# removed so that 'ls' on foo fails. Namespace LFSCK (-r -C) must then
# report dangling_repaired == 1 and multiple_linked_repaired == 1, and
# foo must read back f0's content ("dummy").
3989 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
3990 skip "MDS older than 2.6.50, LU-5512"
3993 echo "The objectA has multiple hard links, one of them corresponding"
3994 echo "to the name entry_B. But there is something wrong for the name"
3995 echo "entry_B and cause entry_B to references non-exist object_C."
3996 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3997 echo "as dangling, and re-create the lost object_C. When the LFSCK"
3998 echo "comes to the second-stage scanning, it will find that the"
3999 echo "former re-creating object_C is not proper, and will try to"
4000 echo "replace the object_C with the real object_A."
4003 check_mount_and_prep
4005 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
# The bare path2fid calls below only print the FID into the test output.
4006 $LFS path2fid $DIR/$tdir/d0
# Ten filler files t0..t9; they are unlinked again before f1 is removed
# (see the LU-7429 note further down).
4008 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
4010 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
4011 $LFS path2fid $DIR/$tdir/d0/f0
4013 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
4014 $LFS path2fid $DIR/$tdir/d0/f1
# First ':'-separated field of each printed FID (the sequence part).
4016 local SEQ0=$($LFS path2fid $DIR/$tdir/d0/f0 | awk -F':' '{print $1}')
4017 local SEQ1=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $1}')
4019 if [ "$SEQ0" != "$SEQ1" ]; then
4020 # To guarantee that the f0 and f1 are in the same FID seq
4021 rm -f $DIR/$tdir/d0/f0 ||
4022 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
4023 echo "dummy" > $DIR/$tdir/d0/f0 ||
4024 error "(3.2) Fail to touch on MDT0"
4025 $LFS path2fid $DIR/$tdir/d0/f0
# Second ':'-separated field of f1's FID, normalised to decimal by printf %d
# before being passed as fail_val.
4028 local OID=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $2}')
4029 OID=$(printf %d $OID)
4031 echo "Inject failure stub on MDT0 to simulate dangling name entry"
4032 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
4033 do_facet $SINGLEMDS $LCTL set_param fail_val=$OID fail_loc=0x1621
4034 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
4035 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
4037 # If there is creation after the dangling injection, it may re-use
4038 # the just released local object (inode) that is referenced by the
4039 # dangling name entry. It will fail the dangling injection.
4040 # So before deleting the target object for the dangling name entry,
4041 # remove some other objects to avoid the target object being reused
4042 # by some potential creations. LU-7429
4043 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
4045 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
4047 echo "'ls' should fail because of dangling name entry"
4048 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4049 error "(6) ls should fail."
4051 echo "Trigger namespace LFSCK to find out dangling name entry"
4052 $START_NAMESPACE -r -C ||
4053 error "(7) Fail to start LFSCK for namespace"
# Poll the status field of lfsck_namespace on $SINGLEMDS until it reads
# "completed". NOTE(review): 32 is presumably the timeout in seconds - confirm.
4055 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4056 mdd.${MDT_DEV}.lfsck_namespace |
4057 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4059 error "(8) unexpected status"
4062 local repaired=$($SHOW_NAMESPACE |
4063 awk '/^dangling_repaired/ { print $2 }')
4064 [ $repaired -eq 1 ] ||
4065 error "(9) Fail to repair dangling name entry: $repaired"
4067 repaired=$($SHOW_NAMESPACE |
4068 awk '/^multiple_linked_repaired/ { print $2 }')
4069 [ $repaired -eq 1 ] ||
4070 error "(10) Fail to drop the former created object: $repaired"
# foo must now resolve to f0's object, whose content is "dummy".
4072 local data=$(cat $DIR/$tdir/d0/foo)
4073 [ "$data" == "dummy" ] ||
4074 error "(11) The $DIR/$tdir/d0/foo is not recovered: $data"
4076 run_test 23b "LFSCK can repair dangling name entry (2)"
# NOTE(review): these lines appear to be the body of a cleanup helper for
# test 23c; its header and closing braces are not part of this listing.
# It clears fail_val/fail_loc on $SINGLEMDS, waits for the namespace LFSCK
# status to read "completed" (raising error (10) otherwise), then stops
# full debug logging.
4079 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
4080 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4081 mdd.${MDT_DEV}.lfsck_namespace |
4082 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4084 error "(10) unexpected status"
4087 stop_full_debug_logging
# test_23c body (the function header and other short lines, including the
# closing 'fi' and '}' of the compound commands, are not part of this
# listing).
# Scenario: hard link "foo" to f0 is created while fail_loc 0x1621
# (OBD_FAIL_LFSCK_DANGLING3) is set with fail_val = f1's OID, then f1 is
# removed so that 'ls' on foo fails. Namespace LFSCK (-r -C) is started with
# fail_loc 0x1602 (OBD_FAIL_LFSCK_DELAY3, fail_val=10) set; once foo is seen
# with size 0 the test appends data to it. The final checks expect
# dangling_repaired == 1 and foo's content to differ from "dummy".
4091 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4092 skip "MDS older than 2.6.50, LU-5512"
4095 echo "The objectA has multiple hard links, one of them corresponding"
4096 echo "to the name entry_B. But there is something wrong for the name"
4097 echo "entry_B and cause entry_B to references non-exist object_C."
4098 echo "In the first-stage scanning, the LFSCK will think the entry_B"
4099 echo "as dangling, and re-create the lost object_C. And then others"
4100 echo "modified the re-created object_C. When the LFSCK comes to the"
4101 echo "second-stage scanning, it will find that the former re-creating"
4102 echo "object_C maybe wrong and try to replace the object_C with the"
4103 echo "real object_A. But because object_C has been modified, so the"
4104 echo "LFSCK cannot replace it."
4107 start_full_debug_logging
4109 check_mount_and_prep
4111 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
4112 parent_fid="$($LFS path2fid $DIR/$tdir/d0)"
4113 echo "parent_fid=$parent_fid"
# Ten filler files t0..t9; they are unlinked again before f1 is removed
# (see the LU-7429 note further down).
4115 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
4117 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
4118 f0_fid="$($LFS path2fid $DIR/$tdir/d0/f0)"
4119 echo "f0_fid=$f0_fid"
4121 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
4122 f1_fid="$($LFS path2fid $DIR/$tdir/d0/f1)"
4123 echo "f1_fid=$f1_fid"
# Compare the text before the first ':' of each FID (the sequence part).
# This must read f0_fid/f1_fid, the variables assigned above, and strip with
# %%:* because ${var/pattern/} takes a glob, not a regular expression.
4125 if [ "${f0_fid%%:*}" != "${f1_fid%%:*}" ]; then
4126 # To guarantee that the f0 and f1 are in the same FID seq
4127 rm -f $DIR/$tdir/d0/f0 ||
4128 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
4129 echo "dummy" > $DIR/$tdir/d0/f0 ||
4130 error "(3.2) Fail to touch on MDT0"
4131 f0_fid="$($LFS path2fid $DIR/$tdir/d0/f0)"
4132 echo "f0_fid=$f0_fid (replaced)"
# Second ':'-separated field of f1's FID, passed as fail_val below.
4135 local oid=$(awk -F':' '{ printf $2 }' <<< $f1_fid)
4137 echo "Inject failure stub on MDT0 to simulate dangling name entry"
4138 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
4139 do_facet $SINGLEMDS $LCTL set_param fail_val=$oid fail_loc=0x1621
4140 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
4141 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
4143 # If there is creation after the dangling injection, it may re-use
4144 # the just released local object (inode) that is referenced by the
4145 # dangling name entry. It will fail the dangling injection.
4146 # So before deleting the target object for the dangling name entry,
4147 # remove some other objects to avoid the target object being reused
4148 # by some potential creations. LU-7429
4149 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
4151 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
4153 echo "'ls' should fail because of dangling name entry"
4154 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4155 error "(6) ls should fail."
4157 #define OBD_FAIL_LFSCK_DELAY3 0x1602
4158 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
4160 echo "Trigger namespace LFSCK to find out dangling name entry"
4161 $START_NAMESPACE -r -C ||
4162 error "(7) Fail to start LFSCK for namespace"
# Wait until foo can be stat'ed with size 0 on the client.
4164 wait_update_facet client "stat -c%s $DIR/$tdir/d0/foo" "0" $LTIME || {
4165 # While unexpected by the test, it is valid for LFSCK to repair
4166 # the link to the original object before any data is written.
4167 local size=$(stat -c %s $DIR/$tdir/d0/foo)
4169 if [ "$size" = "6" -a "$(<$DIR/$tdir/d0/foo)" = "dummy" ]; then
4170 log "LFSCK repaired file prematurely"
4175 stat $DIR/$tdir/d0/foo
4177 error "(8) unexpected size"
# Modify foo before the final checks, then drop the client's osc locks.
4180 echo "data" >> $DIR/$tdir/d0/foo || error "(9) Fail to write"
4181 cancel_lru_locks osc
4185 local repaired=$($SHOW_NAMESPACE |
4186 awk '/^dangling_repaired/ { print $2 }')
4187 [ $repaired -eq 1 ] ||
4188 error "(11) Fail to repair dangling name entry: $repaired"
4190 local data=$(cat $DIR/$tdir/d0/foo)
4191 [ "$data" != "dummy" ] ||
4192 error "(12) The $DIR/$tdir/d0/foo should not be recovered"
4194 run_test 23c "LFSCK can repair dangling name entry (3)"
# test_24 body (the function header and other short lines, including the
# 'else'/'fi' of the if below, are not part of this listing).
# Scenario: d0/dummy/foo is created and removed again while fail_loc 0x1622
# (OBD_FAIL_LFSCK_MUL_REF) is set on $SINGLEMDS. Namespace LFSCK (-A -r)
# must report multiple_referenced_repaired == 1 and an entry named
# "<cfid>-<pfid>-D-0" must be listable under .lustre/lost+found/MDT0000/.
# Skipped when fewer than 2 MDTs are configured, when FILESET is set, or
# when MDS1 is not newer than 2.6.50.
4197 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4198 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4199 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4200 skip "MDS older than 2.6.50, LU-5513"
4203 echo "Two MDT-objects back reference the same name entry via their"
4204 echo "each own linkEA entry, but the name entry only references one"
4205 echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
4206 echo "for the MDT-object that is not recognized. If such MDT-object"
4207 echo "has no other linkEA entry after the removing, then the LFSCK"
4208 echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."
4211 check_mount_and_prep
4213 mkdir_on_mdt -i1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4215 mkdir $DIR/$tdir/d0/guard || error "(1) Fail to mkdir guard"
# The bare path2fid calls only print the FID into the test output.
4216 $LFS path2fid $DIR/$tdir/d0/guard
4218 mkdir $DIR/$tdir/d0/dummy || error "(2) Fail to mkdir dummy"
4219 $LFS path2fid $DIR/$tdir/d0/dummy
# pfid is the parent FID used in the expected orphan name further down.
# NOTE(review): with the else line missing from this listing, presumably
# guard's FID is used for non-ldiskfs MDTs and dummy's for ldiskfs - confirm.
4222 if [ $mds1_FSTYPE != ldiskfs ]; then
4223 pfid=$($LFS path2fid $DIR/$tdir/d0/guard)
4225 pfid=$($LFS path2fid $DIR/$tdir/d0/dummy)
4228 touch $DIR/$tdir/d0/guard/foo ||
4229 error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"
4231 echo "Inject failure stub on MDT0 to simulate the case that"
4232 echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
4233 echo "that references $DIR/$tdir/d0/guard/foo."
4234 echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
4235 echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
4236 echo "there with the same linkEA entry as another MDT-object"
4237 echo "$DIR/$tdir/d0/guard/foo has"
# The fail_loc stays set across both the mkdir and the rmdir of dummy/foo.
4239 #define OBD_FAIL_LFSCK_MUL_REF 0x1622
4240 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
4241 mkdir_on_mdt -i0 $DIR/$tdir/d0/dummy/foo ||
4242 error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
4243 $LFS path2fid $DIR/$tdir/d0/dummy/foo
# cfid is captured before the rmdir; it is the child part of the expected
# orphan name.
4244 local cfid=$($LFS path2fid $DIR/$tdir/d0/dummy/foo)
4245 rmdir $DIR/$tdir/d0/dummy/foo ||
4246 error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
4247 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4249 echo "stat $DIR/$tdir/d0/dummy/foo should fail"
4250 stat $DIR/$tdir/d0/dummy/foo > /dev/null 2>&1 &&
4251 error "(6) stat successfully unexpectedly"
4253 echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
4254 $START_NAMESPACE -A -r ||
4255 error "(7) Fail to start LFSCK for namespace"
4257 wait_all_targets_blocked namespace completed 8
4259 local repaired=$($SHOW_NAMESPACE |
4260 awk '/^multiple_referenced_repaired/ { print $2 }')
4261 [ $repaired -eq 1 ] ||
4262 error "(9) Fail to repair multiple referenced name entry: $repaired"
4264 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
4265 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4266 error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4268 local cname="$cfid-$pfid-D-0"
4269 ls -ail $MOUNT/.lustre/lost+found/MDT0000/$cname ||
4270 error "(11) .lustre/lost+found/MDT0000/ should not be empty"
4272 run_test 24 "LFSCK can repair multiple-referenced name entry"
# test_25 body (the function header and other short lines, including the
# closing '}' of the compound command, are not part of this listing).
# Scenario: d0/foo is created while fail_loc 0x1623 (OBD_FAIL_LFSCK_BAD_TYPE)
# is set on $SINGLEMDS. Namespace LFSCK (-r) must reach status "completed"
# and report bad_file_type_repaired == 1, after which 'ls' on d0 must work.
# Skipped unless the MDS1 backend is ldiskfs and MDS1 is newer than 2.6.50.
4275 [[ $mds1_FSTYPE == ldiskfs ]] || skip "only ldiskfs fixes dirent type"
4276 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4277 skip "MDS older than 2.6.50, LU-5515"
4280 echo "The file type in the name entry does not match the file type"
4281 echo "claimed by the referenced object. Then the LFSCK will update"
4282 echo "the file type in the name entry."
4285 check_mount_and_prep
4287 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4289 echo "Inject failure stub on MDT0 to simulate the case that"
4290 echo "the file type stored in the name entry is wrong."
# The fail_loc is active only while foo is being created, then cleared.
4292 #define OBD_FAIL_LFSCK_BAD_TYPE 0x1623
4293 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
4294 touch $DIR/$tdir/d0/foo || error "(2) Fail to touch $DIR/$tdir/d0/foo"
4295 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4297 echo "Trigger namespace LFSCK to repair bad file type in the name entry"
4298 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace"
# Poll the status field of lfsck_namespace on $SINGLEMDS until it reads
# "completed". NOTE(review): 32 is presumably the timeout in seconds - confirm.
4300 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4301 mdd.${MDT_DEV}.lfsck_namespace |
4302 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4304 error "(4) unexpected status"
4307 local repaired=$($SHOW_NAMESPACE |
4308 awk '/^bad_file_type_repaired/ { print $2 }')
4309 [ $repaired -eq 1 ] ||
4310 error "(5) Fail to repair bad file type in name entry: $repaired"
4312 ls -ail $DIR/$tdir/d0 || error "(6) Fail to 'ls' the $DIR/$tdir/d0"
4314 run_test 25 "LFSCK can repair bad file type in the name entry"
# test_26a body (the function header and other short lines are not part of
# this listing).
# Scenario: d0/foo (which also has the hard link d0/dummy) is unlinked while
# fail_loc 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) is set on $SINGLEMDS, after
# which 'ls' on foo is expected to fail. Namespace LFSCK (-r -A) must report
# lost_dirent_repaired == 1, foo must be listable again, and its FID must
# equal the FID recorded before the injection.
4317 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4318 skip "MDS older than 2.6.50, LU-5516"
4321 echo "The local name entry back referenced by the MDT-object is lost."
4322 echo "The namespace LFSCK will add the missing local name entry back"
4323 echo "to the normal namespace."
4326 check_mount_and_prep
4328 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4329 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
# Recorded now so the post-repair FID can be compared against it.
4330 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
4332 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
4333 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4335 echo "Inject failure stub on MDT0 to simulate the case that"
4336 echo "foo's name entry will be removed, but the foo's object"
4337 echo "and its linkEA are kept in the system."
# The fail_loc is active only for the duration of the unlink.
4339 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4340 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4341 rm -f $DIR/$tdir/d0/foo || error "(4) Fail to unlink $DIR/$tdir/d0/foo"
4342 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4344 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4345 error "(5) 'ls' should fail"
4347 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4348 $START_NAMESPACE -r -A ||
4349 error "(6) Fail to start LFSCK for namespace"
# NOTE(review): the trailing 7 is presumably the step number reported by
# wait_all_targets_blocked on failure (no "(7)" error appears here) - confirm.
4351 wait_all_targets_blocked namespace completed 7
4353 local repaired=$($SHOW_NAMESPACE |
4354 awk '/^lost_dirent_repaired/ { print $2 }')
4355 [ $repaired -eq 1 ] ||
4356 error "(8) Fail to repair lost dirent: $repaired"
4358 ls -ail $DIR/$tdir/d0/foo ||
4359 error "(9) Fail to 'ls' $DIR/$tdir/d0/foo"
4361 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
4362 [ "$foofid" == "$foofid2" ] ||
4363 error "(10) foo's FID changed: $foofid, $foofid2"
4365 run_test 26a "LFSCK can add the missing local name entry back to the namespace"
# test_26b body (the function header and other short lines are not part of
# this listing).
# Scenario: directory d0 is created with -i 1 and d0/foo with -i 0; foo is
# removed with rmdir while fail_loc 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) is
# set on $SINGLEMDS, after which 'ls' on foo is expected to fail. Namespace
# LFSCK (-r -A) must report lost_dirent_repaired == 1, foo must be listable
# again, and its FID must equal the FID recorded before the injection.
# Skipped when fewer than 2 MDTs are configured or MDS1 is not newer
# than 2.6.50.
4368 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4369 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4370 skip "MDS older than 2.6.50, LU-5516"
4373 echo "The remote name entry back referenced by the MDT-object is lost."
4374 echo "The namespace LFSCK will add the missing remote name entry back"
4375 echo "to the normal namespace."
4378 check_mount_and_prep
4380 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4381 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
# Recorded now so the post-repair FID can be compared against it.
4382 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
4384 echo "Inject failure stub on MDT0 to simulate the case that"
4385 echo "foo's name entry will be removed, but the foo's object"
4386 echo "and its linkEA are kept in the system."
# The fail_loc is active only for the duration of the rmdir.
4388 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4389 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4390 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
4391 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4393 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4394 error "(4) 'ls' should fail"
4396 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4397 $START_NAMESPACE -r -A ||
4398 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): the trailing 6 is presumably the step number reported by
# wait_all_targets_blocked on failure (no "(6)" error appears here) - confirm.
4400 wait_all_targets_blocked namespace completed 6
4402 local repaired=$($SHOW_NAMESPACE |
4403 awk '/^lost_dirent_repaired/ { print $2 }')
4404 [ $repaired -eq 1 ] ||
4405 error "(7) Fail to repair lost dirent: $repaired"
4407 ls -ail $DIR/$tdir/d0/foo ||
4408 error "(8) Fail to 'ls' $DIR/$tdir/d0/foo"
4410 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
4411 [ "$foofid" == "$foofid2" ] ||
4412 error "(9) foo's FID changed: $foofid, $foofid2"
4414 run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
# test_27a body (the function header and other short lines are not part of
# this listing).
# Scenario: d0/foo and its hard link d0/dummy are unlinked while fail_loc
# 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) is set on $SINGLEMDS, then d0 itself
# is removed. Namespace LFSCK (-r -A) must report lost_dirent_repaired == 1
# and an entry whose name matches *-P-* must exist under
# .lustre/lost+found/MDT0000/.
# Skipped when FILESET is set or MDS1 is not newer than 2.6.50.
4417 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4418 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4419 skip "MDS older than 2.6.50, LU-5516"
4422 echo "The local parent referenced by the MDT-object linkEA is lost."
4423 echo "The namespace LFSCK will re-create the lost parent as orphan."
4426 check_mount_and_prep
4428 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4429 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4430 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
4431 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4433 echo "Inject failure stub on MDT0 to simulate the case that"
4434 echo "foo's name entry will be removed, but the foo's object"
4435 echo "and its linkEA are kept in the system. And then remove"
4436 echo "another hard link and the parent directory."
# The fail_loc stays set across both unlinks, then is cleared.
4438 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4439 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4440 rm -f $DIR/$tdir/d0/foo ||
4441 error "(4) Fail to unlink $DIR/$tdir/d0/foo"
4442 rm -f $DIR/$tdir/d0/dummy ||
4443 error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
4444 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4446 rm -rf $DIR/$tdir/d0 || error "(5) Fail to unlink the dir d0"
4447 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(6) 'ls' should fail"
4449 echo "Trigger namespace LFSCK to repair the lost parent"
4450 $START_NAMESPACE -r -A ||
4451 error "(6) Fail to start LFSCK for namespace"
# NOTE(review): the trailing 7 is presumably the step number reported by
# wait_all_targets_blocked on failure (no "(7)" error appears here) - confirm.
4453 wait_all_targets_blocked namespace completed 7
4455 local repaired=$($SHOW_NAMESPACE |
4456 awk '/^lost_dirent_repaired/ { print $2 }')
4457 [ $repaired -eq 1 ] ||
4458 error "(8) Fail to repair lost dirent: $repaired"
4460 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
4461 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4462 error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4464 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# The pattern is quoted so that find receives it verbatim; unquoted, the
# shell could expand it against files in the current directory first.
4466 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-P-*')
4467 [ ! -z "$cname" ] ||
4468 error "(10) .lustre/lost+found/MDT0000/ should not be empty"
4470 run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
# test_27b body (the function header and other short lines are not part of
# this listing).
# Scenario: directory d0 is created with -i 1 and d0/foo with -i 0; foo is
# removed with rmdir while fail_loc 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) is
# set on $SINGLEMDS, then d0 itself is removed. Namespace LFSCK (-r -A) must
# report lost_dirent_repaired == 1 and an entry whose name matches *-P-*
# must exist under .lustre/lost+found/MDT0001/.
# Skipped when fewer than 2 MDTs are configured, when FILESET is set, or
# when MDS1 is not newer than 2.6.50.
4473 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4474 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4475 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4476 skip "MDS older than 2.6.50, LU-5516"
4479 echo "The remote parent referenced by the MDT-object linkEA is lost."
4480 echo "The namespace LFSCK will re-create the lost parent as orphan."
4483 check_mount_and_prep
4485 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4486 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
# Prints d0's FID into the test output; the value is not captured.
4488 $LFS path2fid $DIR/$tdir/d0
4490 echo "Inject failure stub on MDT0 to simulate the case that"
4491 echo "foo's name entry will be removed, but the foo's object"
4492 echo "and its linkEA are kept in the system. And then remove"
4493 echo "the parent directory."
# The fail_loc is active only for the duration of the rmdir of foo.
4495 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4496 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4497 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
4498 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4500 rmdir $DIR/$tdir/d0 || error "(4) Fail to unlink the dir d0"
4501 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(5) 'ls' should fail"
4503 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4504 $START_NAMESPACE -r -A ||
4505 error "(6) Fail to start LFSCK for namespace"
# NOTE(review): the trailing 7 is presumably the step number reported by
# wait_all_targets_blocked on failure (no "(7)" error appears here) - confirm.
4507 wait_all_targets_blocked namespace completed 7
4509 local repaired=$($SHOW_NAMESPACE |
4510 awk '/^lost_dirent_repaired/ { print $2 }')
4511 [ $repaired -eq 1 ] ||
4512 error "(8) Fail to repair lost dirent: $repaired"
4514 ls -ail $MOUNT/.lustre/lost+found/
4516 echo "There should be an orphan under .lustre/lost+found/MDT0001/"
4517 [ -d $MOUNT/.lustre/lost+found/MDT0001 ] ||
4518 error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"
4520 ls -ail $MOUNT/.lustre/lost+found/MDT0001/
# The pattern is quoted so that find receives it verbatim; unquoted, the
# shell could expand it against files in the current directory first.
4522 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-P-*')
4523 [ ! -z "$cname" ] ||
4524 error "(10) .lustre/lost+found/MDT0001/ should not be empty"
4526 run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
# test_28 body (the function header and other short lines, including the
# closing '}' of the compound commands, are not part of this listing).
# Scenario: d1/a1 and d2/a2 are removed while fail_loc 0x1624
# (OBD_FAIL_LFSCK_NO_NAMEENTRY) is set on both mds1 and mds2. The first
# namespace LFSCK (-r -A) runs with fail_loc 0x161c
# (OBD_FAIL_LFSCK_BAD_NETWORK) set on mds2 and is expected to end "partial"
# on mds1 (0 lost_dirent_repaired) and "completed" on mds2 (1 repaired).
# A second run without the injection is expected to repair 1 on mds1 and
# 0 on mds2.
# Skipped when fewer than 2 MDTs are configured or MDS1 is not newer
# than 2.6.50.
4529 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4530 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4531 skip "MDS older than 2.6.50, LU-5506"
4534 echo "The target name entry is lost. The LFSCK should insert the"
4535 echo "orphan MDT-object under .lustre/lost+found/MDTxxxx. But if"
4536 echo "the MDT (on which the orphan MDT-object resides) has ever"
4537 echo "failed to respond some name entry verification during the"
4538 echo "first stage-scanning, then the LFSCK should skip to handle"
4539 echo "orphan MDT-object on this MDT. But other MDTs should not"
4543 check_mount_and_prep
# d1 uses -i 0 with children on -i 1; d2 is the mirror image. None of
# these mkdir calls is checked for failure.
4544 $LFS mkdir -i 0 $DIR/$tdir/d1
4545 $LFS mkdir -i 1 $DIR/$tdir/d1/a1
4546 $LFS mkdir -i 1 $DIR/$tdir/d1/a2
4548 $LFS mkdir -i 1 $DIR/$tdir/d2
4549 $LFS mkdir -i 0 $DIR/$tdir/d2/a1
4550 $LFS mkdir -i 0 $DIR/$tdir/d2/a2
4552 echo "Inject failure stub on MDT0 to simulate the case that"
4553 echo "d1/a1's name entry will be removed, but the d1/a1's object"
4554 echo "and its linkEA are kept in the system. And the case that"
4555 echo "d2/a2's name entry will be removed, but the d2/a2's object"
4556 echo "and its linkEA are kept in the system."
# The fail_loc is set on both MDS facets across the two rmdir calls.
4558 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4559 do_facet mds1 $LCTL set_param fail_loc=0x1624
4560 do_facet mds2 $LCTL set_param fail_loc=0x1624
4561 rmdir $DIR/$tdir/d1/a1 || error "(1) Fail to rmdir $DIR/$tdir/d1/a1"
4562 rmdir $DIR/$tdir/d2/a2 || error "(2) Fail to rmdir $DIR/$tdir/d2/a2"
4563 do_facet mds1 $LCTL set_param fail_loc=0
4564 do_facet mds2 $LCTL set_param fail_loc=0
4566 cancel_lru_locks mdc
4567 cancel_lru_locks osc
4569 echo "Inject failure, to simulate the MDT0 fail to handle"
4570 echo "MDT1 LFSCK request during the first-stage scanning."
# This injection stays active for the whole first LFSCK run and is only
# cleared after both status checks below.
4571 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
4572 do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0
4574 echo "Trigger namespace LFSCK on all devices to find out orphan object"
4575 $START_NAMESPACE -r -A ||
4576 error "(3) Fail to start LFSCK for namespace"
# Each wait polls the status field of that facet's own lfsck_namespace.
# NOTE(review): 32 is presumably the timeout in seconds - confirm.
4578 wait_update_facet mds1 "$LCTL get_param -n \
4579 mdd.$(facet_svc mds1).lfsck_namespace |
4580 awk '/^status/ { print \\\$2 }'" "partial" 32 || {
4581 error "(4) mds1 is not the expected 'partial'"
4584 wait_update_facet mds2 "$LCTL get_param -n \
4585 mdd.$(facet_svc mds2).lfsck_namespace |
4586 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4587 error "(5) mds2 is not the expected 'completed'"
4590 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
# Per-facet repair counters after the first (fault-injected) run.
4592 local repaired=$(do_facet mds1 $LCTL get_param -n \
4593 mdd.$(facet_svc mds1).lfsck_namespace |
4594 awk '/^lost_dirent_repaired/ { print $2 }')
4595 [ $repaired -eq 0 ] ||
4596 error "(6) Expect 0 fixed on mds1, but got: $repaired"
4598 repaired=$(do_facet mds2 $LCTL get_param -n \
4599 mdd.$(facet_svc mds2).lfsck_namespace |
4600 awk '/^lost_dirent_repaired/ { print $2 }')
4601 [ $repaired -eq 1 ] ||
4602 error "(7) Expect 1 fixed on mds2, but got: $repaired"
4604 echo "Trigger namespace LFSCK on all devices again to cleanup"
4605 $START_NAMESPACE -r -A ||
4606 error "(8) Fail to start LFSCK for namespace"
4608 wait_all_targets_blocked namespace completed 9
# Per-facet repair counters after the second (clean) run; the expectations
# are the reverse of the first run.
4610 local repaired=$(do_facet mds1 $LCTL get_param -n \
4611 mdd.$(facet_svc mds1).lfsck_namespace |
4612 awk '/^lost_dirent_repaired/ { print $2 }')
4613 [ $repaired -eq 1 ] ||
4614 error "(10) Expect 1 fixed on mds1, but got: $repaired"
4616 repaired=$(do_facet mds2 $LCTL get_param -n \
4617 mdd.$(facet_svc mds2).lfsck_namespace |
4618 awk '/^lost_dirent_repaired/ { print $2 }')
4619 [ $repaired -eq 0 ] ||
4620 error "(11) Expect 0 fixed on mds2, but got: $repaired"
4622 run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
# test_29a body (the function header and other short lines are not part of
# this listing). The run_test line for 29a at the end is commented out, so
# this test is not registered.
# Scenario: hard link d0/h1 to d0/foo is created while fail_loc 0x1625
# (OBD_FAIL_LFSCK_MORE_NLINK) is set on $SINGLEMDS, and stat is expected to
# show a link count of 3. Namespace LFSCK (-r -A) must report
# nlinks_repaired == 1 and the link count must then read 2.
4625 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4626 skip "MDS older than 2.6.50, LU-5517"
4629 echo "The object's nlink attribute is larger than the object's known"
4630 echo "name entries count. The LFSCK will repair the object's nlink"
4631 echo "attribute to match the known name entries count"
4634 check_mount_and_prep
4636 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4637 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4639 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4640 echo "nlink attribute is larger than its name entries count."
# The fail_loc is active only while the hard link is being created.
4642 #define OBD_FAIL_LFSCK_MORE_NLINK 0x1625
4643 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
4644 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4645 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4646 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# mdc locks are cancelled before each stat of the link count.
# NOTE(review): presumably so the client re-fetches attributes - confirm.
4648 cancel_lru_locks mdc
4649 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
4650 [ $count -eq 3 ] || error "(4) Cannot inject error: $count"
4652 echo "Trigger namespace LFSCK to repair the nlink count"
4653 $START_NAMESPACE -r -A ||
4654 error "(5) Fail to start LFSCK for namespace"
4656 wait_all_targets_blocked namespace completed 6
4658 local repaired=$($SHOW_NAMESPACE |
4659 awk '/^nlinks_repaired/ { print $2 }')
4660 [ $repaired -eq 1 ] ||
4661 error "(7) Fail to repair nlink count: $repaired"
4663 cancel_lru_locks mdc
4664 count=$(stat --format=%h $DIR/$tdir/d0/foo)
4665 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
4667 # Disable 29a, we only allow nlink to be updated if the known linkEA
4668 # entries is larger than nlink count.
4670 #run_test 29a "LFSCK can repair bad nlink count (1)"
# test_29b body (the function header and other short lines are not part of
# this listing).
# Scenario: hard link d0/h1 to d0/foo is created while fail_loc 0x1626
# (OBD_FAIL_LFSCK_LESS_NLINK) is set on $SINGLEMDS, and stat is expected to
# show a link count of 1. Namespace LFSCK (-r -A) must report
# nlinks_repaired == 1 and the link count must then read 2.
4673 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4674 skip "MDS older than 2.6.50, LU-5517"
4677 echo "The object's nlink attribute is smaller than the object's known"
4678 echo "name entries count. The LFSCK will repair the object's nlink"
4679 echo "attribute to match the known name entries count"
4682 check_mount_and_prep
4684 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4685 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4687 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4688 echo "nlink attribute is smaller than its name entries count."
# The fail_loc is active only while the hard link is being created.
4690 #define OBD_FAIL_LFSCK_LESS_NLINK 0x1626
4691 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
4692 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4693 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4694 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# mdc locks are cancelled before each stat of the link count.
# NOTE(review): presumably so the client re-fetches attributes - confirm.
4696 cancel_lru_locks mdc
4697 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
4698 [ $count -eq 1 ] || error "(4) Cannot inject error: $count"
4700 echo "Trigger namespace LFSCK to repair the nlink count"
4701 $START_NAMESPACE -r -A ||
4702 error "(5) Fail to start LFSCK for namespace"
4704 wait_all_targets_blocked namespace completed 6
4706 local repaired=$($SHOW_NAMESPACE |
4707 awk '/^nlinks_repaired/ { print $2 }')
4708 [ $repaired -eq 1 ] ||
4709 error "(7) Fail to repair nlink count: $repaired"
4711 cancel_lru_locks mdc
4712 count=$(stat --format=%h $DIR/$tdir/d0/foo)
4713 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
4715 run_test 29b "LFSCK can repair bad nlink count (2)"
# Test 29c: verify the linkEA size limitation (skipped unless the MDS is
# newer than 2.6.50, LU-5517). Enough hard links are created on guard/f0 to
# overflow its linkEA; the test then checks that the overflow mark blocks
# directory migration until a namespace LFSCK clears it.
4719 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4720 skip "MDS older than 2.6.50, LU-5517"
4723 echo "The namespace LFSCK will create many hard links to the target"
4724 echo "file as to exceed the linkEA size limitation. Under such case"
4725 echo "the linkEA will be marked as overflow that will prevent the"
4726 echo "target file to be migrated. Then remove some hard links to"
4727 echo "make the left hard links to be held within the linkEA size"
4728 echo "limitation. But before the namespace LFSCK adding all the"
4729 echo "missed linkEA entries back, the overflow mark (timestamp)"
4730 echo "will not be cleared."
4733 check_mount_and_prep
# guard/f0 is the link target; foo/ is created on the last MDT
# (index MDSCOUNT - 1) and holds the hard links.
4735 mkdir -p $DIR/$tdir/guard || error "(0.1) Fail to mkdir"
4736 $LFS mkdir -i $((MDSCOUNT - 1)) $DIR/$tdir/foo ||
4737 error "(0.2) Fail to mkdir"
4738 touch $DIR/$tdir/guard/f0 || error "(1) Fail to create"
# The FID of f0 is the migration indicator used below: the checks treat an
# unchanged FID as "not migrated" and a changed FID as "migrated".
4739 local oldfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4741 # define MAX_LINKEA_SIZE 4096
4742 # sizeof(link_ea_header) = 24
4743 # sizeof(link_ea_entry) = 18
4744 # nlink_min=$(((MAX_LINKEA_SIZE - sizeof(link_ea_header)) /
4745 # (sizeof(link_ea_entry) + name_length))
4746 # If the average name length is 12 bytes, then 150 hard links
4747 # is totally enough to overflow the linkEA
4748 echo "Create 150 hard links should succeed although the linkEA overflow"
4749 createmany -l $DIR/$tdir/guard/f0 $DIR/$tdir/foo/ttttttttttt 150 ||
4750 error "(2) Fail to hard link"
4752 cancel_lru_locks mdc
# Step 3 (only with 2+ MDTs): migrating guard/ to MDT index 1 must fail while
# the linkEA is overflowed; stderr of 'lfs migrate' is discarded on purpose.
4753 if [ $MDSCOUNT -ge 2 ]; then
4754 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null &&
4755 error "(3.1) Migrate should fail"
4757 echo "The object with linkEA overflow should NOT be migrated"
4758 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4759 [ "$newfid" == "$oldfid" ] ||
4760 error "(3.2) Migrate should fail: $newfid != $oldfid"
4763 # Remove 100 hard links, then the linkEA should have space
4764 # to hold the missed linkEA entries.
4765 echo "Remove 100 hard links to save space for the missed linkEA entries"
4766 unlinkmany $DIR/$tdir/foo/ttttttttttt 100 || error "(4) Fail to unlink"
# Step 5 (only with 2+ MDTs): migration must still be refused after the
# unlinks, because LFSCK has not run yet.
4768 if [ $MDSCOUNT -ge 2 ]; then
4769 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null &&
4770 error "(5.1) Migrate should fail"
4772 # The overflow timestamp is still there, so migration will fail.
4773 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4774 [ "$newfid" == "$oldfid" ] ||
4775 error "(5.2) Migrate should fail: $newfid != $oldfid"
4778 # sleep 3 seconds to guarantee that the overflow is recognized
4781 echo "Trigger namespace LFSCK to clear the overflow timestamp"
4782 $START_NAMESPACE -r -A ||
4783 error "(6) Fail to start LFSCK for namespace"
4785 wait_all_targets_blocked namespace completed 7
# Expected counters on the MDT queried by SHOW_NAMESPACE: exactly one
# overflow mark cleared and no nlink repair.
4787 local repaired=$($SHOW_NAMESPACE |
4788 awk '/^linkea_overflow_cleared/ { print $2 }')
4789 [ $repaired -eq 1 ] ||
4790 error "(8) Fail to clear linkea overflow: $repaired"
4792 repaired=$($SHOW_NAMESPACE |
4793 awk '/^nlinks_repaired/ { print $2 }')
4794 [ $repaired -eq 0 ] ||
4795 error "(9) Unexpected nlink repaired: $repaired"
# Step 10 (only with 2+ MDTs): with the overflow mark cleared the same
# migration must now succeed and f0 must get a new FID.
4797 if [ $MDSCOUNT -ge 2 ]; then
4798 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
4799 error "(10.1) Migrate failure"
4801 # Migration should succeed after clear the overflow timestamp.
4802 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4803 [ "$newfid" != "$oldfid" ] ||
4804 error "(10.2) Migrate should succeed"
4806 ls -l $DIR/$tdir/foo > /dev/null ||
4807 error "(11) 'ls' failed after migration"
# Cleanup of the link target and of the directory holding the links.
4810 rm -f $DIR/$tdir/guard/f0 || error "(12) Fail to unlink f0"
4811 rm -rf $DIR/$tdir/foo || error "(13) Fail to rmdir foo"
4813 run_test 29c "verify linkEA size limitation"
# Test 30: namespace LFSCK recovers orphans from the backend /lost+found.
# Skipped unless the MDT backend is ldiskfs, FILESET is unset and the MDS is
# newer than 2.6.50 (LU-5518).
4816 [[ $mds1_FSTYPE == ldiskfs ]] || skip "only ldiskfs has lost+found"
4817 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4818 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4819 skip "MDS older than 2.6.50, LU-5518"
4822 echo "The namespace LFSCK will move the orphans from backend"
4823 echo "/lost+found directory to normal client visible namespace"
4824 echo "or to global visible ./lustre/lost+found/MDTxxxx/ directory"
4827 check_mount_and_prep
4829 $LFS mkdir -i 0 $DIR/$tdir/foo || error "(1) Fail to mkdir foo"
# The file created here is f0, so the failure message names f0 as well.
4830 touch $DIR/$tdir/foo/f0 || error "(2) Fail to touch f0"
4832 echo "Inject failure stub on MDT0 to simulate the case that"
4833 echo "directory d0 has no linkEA entry, then the LFSCK will"
4834 echo "move it into .lustre/lost+found/MDTxxxx/ later."
4836 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
4837 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
4838 mkdir $DIR/$tdir/foo/d0 || error "(3) Fail to mkdir d0"
4839 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Both FIDs are needed later to build the expected orphan name of d0.
4841 local pfid=$($LFS path2fid $DIR/$tdir/foo)
4842 local cfid=$($LFS path2fid $DIR/$tdir/foo/d0)
4844 touch $DIR/$tdir/foo/d0/f1 || error "(4) Fail to touch f1"
4845 mkdir $DIR/$tdir/foo/d0/d1 || error "(5) Fail to mkdir d1"
4847 echo "Inject failure stub on MDT0 to simulate the case that the"
4848 echo "object's name entry will be removed, but not destroy the"
4849 echo "object. Then backend e2fsck will handle it as orphan and"
4850 echo "add them into the backend /lost+found directory."
# Four objects (d1, f1, d0, f0) are unlinked while the fail_loc is set.
4852 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4853 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4854 rmdir $DIR/$tdir/foo/d0/d1 || error "(6) Fail to rmdir d1"
4855 rm -f $DIR/$tdir/foo/d0/f1 || error "(7) Fail to unlink f1"
4856 rmdir $DIR/$tdir/foo/d0 || error "(8) Fail to rmdir d0"
4857 rm -f $DIR/$tdir/foo/f0 || error "(9) Fail to unlink f0"
4858 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Take the client and the MDT offline, run e2fsck -y on the MDT device, then
# restart the MDT with the noscrub mount options.
4860 umount_client $MOUNT || error "(10) Fail to stop client!"
4862 stop $SINGLEMDS || error "(11) Fail to stop MDT0"
4865 run_e2fsck $(facet_host $SINGLEMDS) $MDT_DEVNAME "-y" ||
4866 error "(12) Fail to run e2fsck"
4868 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
4869 error "(13) Fail to start MDT0"
4871 echo "Trigger namespace LFSCK to recover backend orphans"
4872 $START_NAMESPACE -r -A ||
4873 error "(14) Fail to start LFSCK for namespace"
4875 wait_all_targets_blocked namespace completed 15
# At least four moves are expected; presumably one per object unlinked above.
4877 local repaired=$($SHOW_NAMESPACE |
4878 awk '/^local_lost_found_moved/ { print $2 }')
4879 [ $repaired -ge 4 ] ||
4880 error "(16) Fail to recover backend orphans: $repaired"
4882 mount_client $MOUNT || error "(17) Fail to start client!"
4884 stat $DIR/$tdir/foo/f0 || error "(18) f0 is not recovered"
4886 ls -ail $MOUNT/.lustre/lost+found/
4888 echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
4889 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4890 error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4892 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
4894 local cname=$MOUNT/.lustre/lost+found/MDT0000/${cfid}-${pfid}-D-0
# cname is a freshly built string and is never empty, so a non-empty test can
# never fail; check that the recovered directory really exists instead.
4895 [ -d "$cname" ] || error "(20) d0 is not recovered"
4897 stat ${cname}/d1 || error "(21) d1 is not recovered"
4898 stat ${cname}/f1 || error "(22) f1 is not recovered"
4900 run_test 30 "LFSCK can recover the orphans from backend /lost+found"
# Test 31a: name entries with a bad name hash under a striped directory
# (case 1, fail_val=0). Skipped with fewer than 2 MDTs or an MDS not newer
# than 2.6.50 (LU-5519).
4903 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4904 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4905 skip "MDS older than 2.6.50, LU-5519"
4908 echo "For the name entry under a striped directory, if the name"
4909 echo "hash does not match the shard, then the LFSCK will repair"
4910 echo "the bad name entry"
4913 check_mount_and_prep
# Striped directory with one stripe per MDT, master on MDT index 0.
4915 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4916 error "(1) Fail to create striped directory"
4918 echo "Inject failure stub on client to simulate the case that"
4919 echo "some name entry should be inserted into other non-first"
4920 echo "shard, but inserted into the first shard by wrong"
# The fail_loc is set with a plain $LCTL (no do_facet), i.e. on the node
# running this script, and is cleared right after the subdirectories exist.
4922 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
4923 $LCTL set_param fail_loc=0x1628 fail_val=0
4924 createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
4925 error "(2) Fail to create file under striped directory"
4926 $LCTL set_param fail_loc=0 fail_val=0
4928 echo "Trigger namespace LFSCK to repair bad name hash"
4929 $START_NAMESPACE -r -A ||
4930 error "(3) Fail to start LFSCK for namespace"
4932 wait_all_targets_blocked namespace completed 4
# At least one bad name hash must have been repaired.
4934 local repaired=$($SHOW_NAMESPACE |
4935 awk '/^name_hash_repaired/ { print $2 }')
4936 [ $repaired -ge 1 ] ||
4937 error "(5) Fail to repair bad name hash: $repaired"
# Number of entries that 'lfs find -H badtype' reports for the directory.
4939 local rc=$($LFS find -H badtype $DIR/$tdir/striped_dir | wc -l)
4941 error "Fail to find flag bad type: $rc"
# Remount the client before verifying every dN is reachable and removable.
4943 umount_client $MOUNT || error "(6) umount failed"
4944 mount_client $MOUNT || error "(7) mount failed"
4946 for ((i = 0; i < $MDSCOUNT; i++)); do
4947 stat $DIR/$tdir/striped_dir/d$i ||
4948 error "(8) Fail to stat d$i after LFSCK"
4949 rmdir $DIR/$tdir/striped_dir/d$i ||
4950 error "(9) Fail to unlink d$i after LFSCK"
4953 rmdir $DIR/$tdir/striped_dir ||
4954 error "(10) Fail to remove the striped directory after LFSCK"
4956 run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
# Test 31b: name entries with a bad name hash under a striped directory
# (case 2, fail_val=1, result read from mds2). Skipped with fewer than 2 MDTs
# or an MDS not newer than 2.6.50 (LU-5519).
4959 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4960 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4961 skip "MDS older than 2.6.50, LU-5519"
4964 echo "For the name entry under a striped directory, if the name"
4965 echo "hash does not match the shard, then the LFSCK will repair"
4966 echo "the bad name entry"
4969 check_mount_and_prep
# Striped directory with one stripe per MDT, master on MDT index 0.
4971 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4972 error "(1) Fail to create striped directory"
4974 echo "Inject failure stub on client to simulate the case that"
4975 echo "some name entry should be inserted into other non-second"
4976 echo "shard, but inserted into the secod shard by wrong"
# The fail_loc is set with a plain $LCTL (no do_facet); MDSCOUNT * 5
# subdirectories are created while it is active.
4978 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
4979 $LCTL set_param fail_loc=0x1628 fail_val=1
4980 createmany -d $DIR/$tdir/striped_dir/d $((MDSCOUNT * 5)) ||
4981 error "(2) Fail to create file under striped directory"
4982 $LCTL set_param fail_loc=0 fail_val=0
4984 echo "Trigger namespace LFSCK to repair bad name hash"
4985 $START_NAMESPACE -r -A ||
4986 error "(3) Fail to start LFSCK for namespace"
4988 wait_all_targets_blocked namespace completed 4
# Unlike the sibling test, the repair counter is read from mds2.
4990 local repaired=$(do_facet mds2 $LCTL get_param -n \
4991 mdd.$(facet_svc mds2).lfsck_namespace |
4992 awk '/^name_hash_repaired/ { print $2 }')
4993 echo "repaired $repaired name entries with bad hash"
4994 [ $repaired -ge 1 ] ||
4995 error "(5) Fail to repair bad name hash: $repaired"
# Remount the client before verifying every dN is reachable and removable.
4997 umount_client $MOUNT || error "(6) umount failed"
4998 mount_client $MOUNT || error "(7) mount failed"
5000 for ((i = 0; i < $((MDSCOUNT * 5)); i++)); do
5001 stat $DIR/$tdir/striped_dir/d$i ||
5002 error "(8) Fail to stat d$i after LFSCK"
5003 rmdir $DIR/$tdir/striped_dir/d$i ||
5004 error "(9) Fail to unlink d$i after LFSCK"
5007 rmdir $DIR/$tdir/striped_dir ||
5008 error "(10) Fail to remove the striped directory after LFSCK"
5010 run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
# Test 31c: re-generate a lost master LMV EA when nothing was created under
# the striped directory afterwards. Skipped with fewer than 2 MDTs or an MDS
# not newer than 2.6.50 (LU-5519).
5013 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5014 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5015 skip "MDS older than 2.6.50, LU-5519"
5018 echo "For some reason, the master MDT-object of the striped directory"
5019 echo "may lost its master LMV EA. If nobody created files under the"
5020 echo "master directly after the master LMV EA lost, then the LFSCK"
5021 echo "should re-generate the master LMV EA."
5024 check_mount_and_prep
5026 echo "Inject failure stub on MDT0 to simulate the case that the"
5027 echo "master MDT-object of the striped directory lost the LMV EA."
# The fail_loc is active on $SINGLEMDS only while the striped directory is
# being created.
5029 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
5030 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
5031 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5032 error "(1) Fail to create striped directory"
5033 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
5035 echo "Trigger namespace LFSCK to re-generate master LMV EA"
5036 $START_NAMESPACE -r -A ||
5037 error "(2) Fail to start LFSCK for namespace"
5039 wait_all_targets_blocked namespace completed 3
# Exactly one striped directory must have been repaired.
5041 local repaired=$($SHOW_NAMESPACE |
5042 awk '/^striped_dirs_repaired/ { print $2 }')
5043 [ $repaired -eq 1 ] ||
5044 error "(4) Fail to re-generate master LMV EA: $repaired"
# 'lfs find -H lostlmv' must report exactly one entry for the directory.
5046 local rc=$($LFS find -H lostlmv $DIR/$tdir/striped_dir | wc -l)
5047 [ $rc -eq 1 ] || error "Fail to find flag lost LMV: $rc"
5049 umount_client $MOUNT || error "(5) umount failed"
5050 mount_client $MOUNT || error "(6) mount failed"
# After the remount the directory must list as empty and be removable.
5052 local empty=$(ls $DIR/$tdir/striped_dir/)
5053 [ -z "$empty" ] || error "(7) The master LMV EA is not repaired: $empty"
5055 rmdir $DIR/$tdir/striped_dir ||
5056 error "(8) Fail to remove the striped directory after LFSCK"
5058 run_test 31c "Re-generate the lost master LMV EA for striped directory"
# Test 31d: a striped directory that lost its master LMV EA and was modified
# afterwards must not be repaired but made read-only. Skipped with fewer than
# 2 MDTs or an MDS not newer than 2.6.50 (LU-5519).
5061 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5062 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5063 skip "MDS older than 2.6.50, LU-5519"
5066 echo "For some reason, the master MDT-object of the striped directory"
5067 echo "may lost its master LMV EA. If somebody created files under the"
5068 echo "master directly after the master LMV EA lost, then the LFSCK"
5069 echo "should NOT re-generate the master LMV EA, instead, it should"
5070 echo "change the broken striped dirctory as read-only to prevent"
5071 echo "further damage"
5074 check_mount_and_prep
5076 echo "Inject failure stub on MDT0 to simulate the case that the"
5077 echo "master MDT-object of the striped directory lost the LMV EA."
# The fail_loc is active on $SINGLEMDS only while the striped directory is
# being created.
5079 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
5080 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
5081 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5082 error "(1) Fail to create striped directory"
5083 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0
# Remount, then create a file under the broken directory before LFSCK runs;
# this is the "modified after broken" condition of the test.
5085 umount_client $MOUNT || error "(2) umount failed"
5086 mount_client $MOUNT || error "(3) mount failed"
5088 touch $DIR/$tdir/striped_dir/dummy ||
5089 error "(4) Fail to touch under broken striped directory"
5091 echo "Trigger namespace LFSCK to find out the inconsistency"
5092 $START_NAMESPACE -r -A ||
5093 error "(5) Fail to start LFSCK for namespace"
5095 wait_all_targets_blocked namespace completed 6
# No striped directory may be reported as repaired in this scenario.
5097 local repaired=$($SHOW_NAMESPACE |
5098 awk '/^striped_dirs_repaired/ { print $2 }')
5099 [ $repaired -eq 0 ] ||
5100 error "(7) Re-generate master LMV EA unexpected: $repaired"
5102 stat $DIR/$tdir/striped_dir/dummy ||
5103 error "(8) Fail to stat $DIR/$tdir/striped_dir/dummy"
# Creating a new file must fail now; a successful touch is the error case.
5105 touch $DIR/$tdir/striped_dir/foo &&
5106 error "(9) The broken striped directory should be read-only"
# Clear the immutable attribute so the directory can be removed.
5108 chattr -i $DIR/$tdir/striped_dir ||
5109 error "(10) Fail to chattr on the broken striped directory"
5111 rmdir $DIR/$tdir/striped_dir ||
5112 error "(11) Fail to remove the striped directory after LFSCK"
5114 run_test 31d "Set broken striped directory (modified after broken) as read-only"
# Test 31e: re-generate a lost slave LMV EA (case 1, fail_val=0, counter
# read through SHOW_NAMESPACE). Skipped with fewer than 2 MDTs or an MDS not
# newer than 2.6.50 (LU-5519).
5117 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5118 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5119 skip "MDS older than 2.6.50, LU-5519"
5122 echo "For some reason, the slave MDT-object of the striped directory"
5123 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
5124 echo "slave LMV EA."
5127 check_mount_and_prep
5129 echo "Inject failure stub on MDT0 to simulate the case that the"
5130 echo "slave MDT-object (that resides on the same MDT as the master"
5131 echo "MDT-object resides on) lost the LMV EA."
# The fail_loc is active on $SINGLEMDS only while the striped directory is
# being created.
5133 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
5134 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=0
5135 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5136 error "(1) Fail to create striped directory"
5137 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5139 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
5140 $START_NAMESPACE -r -A ||
5141 error "(2) Fail to start LFSCK for namespace"
5143 wait_all_targets_blocked namespace completed 3
# Exactly one shard must have been repaired.
5145 local repaired=$($SHOW_NAMESPACE |
5146 awk '/^striped_shards_repaired/ { print $2 }')
5147 [ $repaired -eq 1 ] ||
5148 error "(4) Fail to re-generate slave LMV EA: $repaired"
5150 rmdir $DIR/$tdir/striped_dir ||
5151 error "(5) Fail to remove the striped directory after LFSCK"
5153 run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
# Test 31f: re-generate a lost slave LMV EA (case 2, fail_val=1, counter
# read from mds2). Skipped with fewer than 2 MDTs or an MDS not newer than
# 2.6.50 (LU-5519).
5156 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5157 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5158 skip "MDS older than 2.6.50, LU-5519"
5161 echo "For some reason, the slave MDT-object of the striped directory"
5162 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
5163 echo "slave LMV EA."
5166 check_mount_and_prep
5168 echo "Inject failure stub on MDT0 to simulate the case that the"
5169 echo "slave MDT-object (that resides on different MDT as the master"
5170 echo "MDT-object resides on) lost the LMV EA."
# The fail_loc is active on $SINGLEMDS only while the striped directory is
# being created.
5172 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
5173 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=1
5174 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5175 error "(1) Fail to create striped directory"
5176 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5178 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
5179 $START_NAMESPACE -r -A ||
5180 error "(2) Fail to start LFSCK for namespace"
5182 wait_all_targets_blocked namespace completed 3
# Exactly one shard must have been repaired, as reported by mds2.
5184 local repaired=$(do_facet mds2 $LCTL get_param -n \
5185 mdd.$(facet_svc mds2).lfsck_namespace |
5186 awk '/^striped_shards_repaired/ { print $2 }')
5187 [ $repaired -eq 1 ] ||
5188 error "(4) Fail to re-generate slave LMV EA: $repaired"
5190 rmdir $DIR/$tdir/striped_dir ||
5191 error "(5) Fail to remove the striped directory after LFSCK"
5193 run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
# Test 31g: repair a slave LMV EA whose stripe index is corrupted. Skipped
# with fewer than 2 MDTs or an MDS not newer than 2.6.50 (LU-5519).
5196 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5197 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5198 skip "MDS older than 2.6.50, LU-5519"
5201 echo "For some reason, the stripe index in the slave LMV EA is"
5202 echo "corrupted. The LFSCK should repair the slave LMV EA."
5205 check_mount_and_prep
5207 echo "Inject failure stub on MDT0 to simulate the case that the"
5208 echo "slave LMV EA on the first shard of the striped directory"
5209 echo "claims the same index as the second shard claims"
# The fail_loc is active on $SINGLEMDS only while the striped directory is
# being created.
5211 #define OBD_FAIL_LFSCK_BAD_SLAVE_LMV 0x162b
5212 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162b fail_val=0
5213 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5214 error "(1) Fail to create striped directory"
5215 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5217 echo "Trigger namespace LFSCK to repair the slave LMV EA"
5218 $START_NAMESPACE -r -A ||
5219 error "(2) Fail to start LFSCK for namespace"
5221 wait_all_targets_blocked namespace completed 3
# Exactly one shard must have been repaired.
5223 local repaired=$($SHOW_NAMESPACE |
5224 awk '/^striped_shards_repaired/ { print $2 }')
5225 [ $repaired -eq 1 ] ||
5226 error "(4) Fail to repair slave LMV EA: $repaired"
# After a remount the directory must accept create, unlink and rmdir.
5228 umount_client $MOUNT || error "(5) umount failed"
5229 mount_client $MOUNT || error "(6) mount failed"
5231 touch $DIR/$tdir/striped_dir/foo ||
5232 error "(7) Fail to touch file after the LFSCK"
5234 rm -f $DIR/$tdir/striped_dir/foo ||
5235 error "(8) Fail to unlink file after the LFSCK"
5237 rmdir $DIR/$tdir/striped_dir ||
5238 error "(9) Fail to remove the striped directory after LFSCK"
5240 run_test 31g "Repair the corrupted slave LMV EA"
# Test 31h: repair a corrupted shard name entry of a striped directory.
# Skipped with fewer than 2 MDTs or an MDS not newer than 2.6.50 (LU-5519).
5243 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5244 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5245 skip "MDS older than 2.6.50, LU-5519"
5248 echo "For some reason, the shard's name entry in the striped"
5249 echo "directory may be corrupted. The LFSCK should repair the"
5250 echo "bad shard's name entry."
5253 check_mount_and_prep
5255 echo "Inject failure stub on MDT0 to simulate the case that the"
5256 echo "first shard's name entry in the striped directory claims"
5257 echo "the same index as the second shard's name entry claims."
# The fail_loc is active on $SINGLEMDS only while the striped directory is
# being created.
5259 #define OBD_FAIL_LFSCK_BAD_SLAVE_NAME 0x162c
5260 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162c fail_val=0
5261 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5262 error "(1) Fail to create striped directory"
5263 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5265 echo "Trigger namespace LFSCK to repair the shard's name entry"
5266 $START_NAMESPACE -r -A ||
5267 error "(2) Fail to start LFSCK for namespace"
5269 wait_all_targets_blocked namespace completed 3
# Exactly one directory entry must have been repaired.
5271 local repaired=$($SHOW_NAMESPACE |
5272 awk '/^dirent_repaired/ { print $2 }')
5273 [ $repaired -eq 1 ] ||
5274 error "(4) Fail to repair shard's name entry: $repaired"
# After a remount the directory must accept create, unlink and rmdir.
5276 umount_client $MOUNT || error "(5) umount failed"
5277 mount_client $MOUNT || error "(6) mount failed"
5279 touch $DIR/$tdir/striped_dir/foo ||
5280 error "(7) Fail to touch file after the LFSCK"
5282 rm -f $DIR/$tdir/striped_dir/foo ||
5283 error "(8) Fail to unlink file after the LFSCK"
5285 rmdir $DIR/$tdir/striped_dir ||
5286 error "(9) Fail to remove the striped directory after LFSCK"
5288 run_test 31h "Repair the corrupted shard's name entry"
# Test 32a: a layout LFSCK that is still scanning must be stoppable after
# ost1 has been stopped.
5293 umount_client $MOUNT
# Start the layout LFSCK on $SINGLEMDS with the engine-delay fail_loc set
# (fail_val=3) and require that it is observed in scanning-phase1.
5295 #define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d
5296 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
5297 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
5299 local STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
5300 [ "$STATUS" == "scanning-phase1" ] ||
5301 error "(2) Expect 'scanning-phase1', but got '$STATUS'"
# Stop ost1 while LFSCK is running, then clear the fail_loc.
5304 stop ost1 > /dev/null || error "(3) Fail to stop OST1!"
5306 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# The actual check: lfsck_stop must succeed with ost1 down.
5310 $STOP_LFSCK || error "(4) Fail to stop LFSCK!"
# Bring ost1 back with the noscrub mount options.
5312 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
5313 error "(5) Fail to start ost1"
5315 run_test 32a "stop LFSCK when some OST failed"
# Test 32b: a namespace LFSCK that is still scanning must be stoppable after
# mds2 has been stopped. Skipped with fewer than 2 MDTs.
5319 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# dp/ lives on MDT index 1 and holds two striped directories (one stripe per
# MDT, master on MDT index 0).
5322 $LFS mkdir -i 1 $DIR/$tdir/dp ||
5323 error "(1) Fail to create $DIR/$tdir/dp"
5324 $LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc1 ||
5325 error "(2) Fail to create $DIR/$tdir/dp/dc1"
5326 $LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc2 ||
5327 error "(3) Fail to create $DIR/$tdir/dp/dc2"
5328 umount_client $MOUNT
# Start the namespace LFSCK with the engine-delay fail_loc set (fail_val=3)
# and poll $SINGLEMDS until the status reads scanning-phase1; the trailing 32
# is presumably a timeout in seconds (wait_update_facet is defined elsewhere).
5330 #define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d
5331 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
5332 $START_NAMESPACE -r -A || error "(4) Fail to start LFSCK for namespace!"
5334 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5335 mdd.${MDT_DEV}.lfsck_namespace |
5336 awk '/^status/ { print \\\$2 }'" "scanning-phase1" 32 || {
5338 error "(5) unexpected status"
# Stop mds2 while LFSCK is running, then clear the fail_loc.
5342 stop mds2 > /dev/null || error "(6) Fail to stop MDT2!"
5344 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# The actual check: lfsck_stop must succeed with mds2 down.
5348 $STOP_LFSCK || error "(7) Fail to stop LFSCK!"
# Bring mds2 back with the noscrub mount options.
5350 start mds2 $(mdsdevname 2) $MOUNT_OPTS_NOSCRUB > /dev/null ||
5351 error "(8) Fail to start MDT2"
5353 run_test 32b "stop LFSCK when some MDT failed"
# Test 33: the command-line options given to lfsck_start must show up in the
# 'param' line of the corresponding lfsck status output.
# Layout LFSCK started with --dryrun -o -r must report
# 'dryrun,all_targets,orphan'.
5359 $START_LAYOUT --dryrun -o -r ||
5360 error "(1) Fail to start layout LFSCK"
5361 wait_all_targets_blocked layout completed 2
5363 local PARAMS=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
5364 [ "$PARAMS" == "dryrun,all_targets,orphan" ] ||
5365 error "(3) Expect 'dryrun,all_targets,orphan', got '$PARAMS'"
# Namespace LFSCK started with -e abort -A -r must report
# 'failout,all_targets'.
5367 $START_NAMESPACE -e abort -A -r ||
5368 error "(4) Fail to start namespace LFSCK"
5369 wait_all_targets_blocked namespace completed 5
5371 PARAMS=$($SHOW_NAMESPACE | awk '/^param/ { print $2 }')
5372 [ "$PARAMS" == "failout,all_targets" ] ||
5373 error "(6) Expect 'failout,all_targets', got '$PARAMS'"
5375 run_test 33 "check LFSCK paramters"
# Test 34: namespace LFSCK rebuilds a lost agent object. Skipped with fewer
# than 2 MDTs or when the mds1 backend is not ZFS.
5379 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5380 [ "$mds1_FSTYPE" != zfs ] && skip "Only valid for ZFS backend"
# Create a directory on MDT index 1 while the fail_loc is set on $SINGLEMDS.
5384 #define OBD_FAIL_LFSCK_NO_AGENTOBJ 0x1630
5385 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1630
5386 $LFS mkdir -i 1 $DIR/$tdir/dummy ||
5387 error "(1) Fail to create $DIR/$tdir/dummy"
# First pass: run LFSCK (without -A) and poll $SINGLEMDS until the status is
# 'completed'; the trailing 32 is presumably a timeout in seconds.
5389 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
5390 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
5391 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5392 mdd.${MDT_DEV}.lfsck_namespace |
5393 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5395 error "(3) unexpected status"
# The first pass must report exactly one repaired directory entry.
5398 local repaired=$($SHOW_NAMESPACE |
5399 awk '/^dirent_repaired/ { print $2 }')
5400 [ $repaired -eq 1 ] ||
5401 error "(4) Fail to repair the lost agent object: $repaired"
# Second pass: a repeated run must find nothing left to repair.
5403 $START_NAMESPACE -r || error "(5) Fail to start LFSCK for namespace!"
5404 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5405 mdd.${MDT_DEV}.lfsck_namespace |
5406 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5408 error "(6) unexpected status"
5411 repaired=$($SHOW_NAMESPACE | awk '/^dirent_repaired/ { print $2 }')
5412 [ $repaired -eq 0 ] ||
5413 error "(7) Unexpected repairing: $repaired"
5415 run_test 34 "LFSCK can rebuild the lost agent object"
# Test 35: namespace LFSCK rebuilds a lost agent entry. Skipped with fewer
# than 2 MDTs. All status and counter checks are made against mds2.
5419 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# Create a directory on MDT index 1 while the fail_loc is set on mds2.
5423 #define OBD_FAIL_LFSCK_NO_AGENTENT 0x1631
5424 do_facet mds2 $LCTL set_param fail_loc=0x1631
5425 $LFS mkdir -i 1 $DIR/$tdir/dummy ||
5426 error "(1) Fail to create $DIR/$tdir/dummy"
# First pass: run LFSCK on all targets and wait for mds2 to complete. The
# failure message names MDS2 explicitly, because no loop variable exists
# here and mds2 is the facet being polled.
5429 do_facet mds2 $LCTL set_param fail_loc=0
5430 $START_NAMESPACE -A -r || error "(2) Fail to start LFSCK for namespace!"
5431 wait_update_facet mds2 "$LCTL get_param -n \
5432 mdd.$(facet_svc mds2).lfsck_namespace |
5433 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
5434 error "(3) MDS2 is not the expected 'completed'"
# The first pass must report exactly one repaired agent entry.
5436 local repaired=$(do_facet mds2 $LCTL get_param -n \
5437 mdd.$(facet_svc mds2).lfsck_namespace |
5438 awk '/^agent_entries_repaired/ { print $2 }')
5439 [ $repaired -eq 1 ] ||
5440 error "(4) Fail to repair the lost agent entry: $repaired"
5442 echo "stopall to cleanup object cache"
5445 setupall > /dev/null
# Second pass after the restart: a repeated run must find nothing to repair.
5447 $START_NAMESPACE -A -r || error "(5) Fail to start LFSCK for namespace!"
5448 wait_update_facet mds2 "$LCTL get_param -n \
5449 mdd.$(facet_svc mds2).lfsck_namespace |
5450 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
5451 error "(6) MDS2 is not the expected 'completed'"
5453 repaired=$(do_facet mds2 $LCTL get_param -n \
5454 mdd.$(facet_svc mds2).lfsck_namespace |
5455 awk '/^agent_entries_repaired/ { print $2 }')
5456 [ $repaired -eq 0 ] ||
5457 error "(7) Unexpected repairing: $repaired"
5459 run_test 35 "LFSCK can rebuild the lost agent entry"
# Test 36a: layout LFSCK rebuilds the LOV EA of mirrored files that each lost
# one mirror. Skipped with fewer than 3 OSTs.
5462 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5465 echo "The target MDT-object's LOV EA corrupted as to lose one of the "
5466 echo "mirrors information. The layout LFSCK should rebuild the LOV EA "
5467 echo "with the PFID EA of related OST-object(s) belong to the mirror."
5470 check_mount_and_prep
# Print the grant parameters now and register the same diagnostics (plus
# 'lfs df') through stack_trap.
5474 lctl get_param osc.*.*grant*
5475 stack_trap "lfs df $DIR; lfs df -i $DIR; lctl get_param osc.*.*grant*"
# f0, f1 and f2 get the same layout: three -N mirrors, each with two -E
# components placed on explicit OST indices via -o.
5477 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5478 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f0 ||
5479 error "(0) Fail to create mirror file $DIR/$tdir/f0"
5480 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5481 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f1 ||
5482 error "(1) Fail to create mirror file $DIR/$tdir/f1"
5483 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5484 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f2 ||
5485 error "(2) Fail to create mirror file $DIR/$tdir/f2"
# Write 4 MiB to each file and resync its mirrors.
5487 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5488 error "(3) Fail to write $DIR/$tdir/f0"
5489 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=4 ||
5490 error "(4) Fail to write $DIR/$tdir/f1"
5491 dd if=/dev/zero of=$DIR/$tdir/f2 bs=1M count=4 ||
5492 error "(5) Fail to write $DIR/$tdir/f2"
5494 $LFS mirror resync $DIR/$tdir/f0 ||
5495 error "(6) Fail to resync $DIR/$tdir/f0"
5496 $LFS mirror resync $DIR/$tdir/f1 ||
5497 error "(7) Fail to resync $DIR/$tdir/f1"
5498 $LFS mirror resync $DIR/$tdir/f2 ||
5499 error "(8) Fail to resync $DIR/$tdir/f2"
5501 cancel_lru_locks mdc
5502 cancel_lru_locks osc
5504 $LFS getstripe $DIR/$tdir/f0 ||
5505 error "(9) Fail to getstripe for $DIR/$tdir/f0"
5506 $LFS getstripe $DIR/$tdir/f1 ||
5507 error "(10) Fail to getstripe for $DIR/$tdir/f1"
5508 $LFS getstripe $DIR/$tdir/f2 ||
5509 error "(11) Fail to getstripe for $DIR/$tdir/f2"
# With the fail_loc set on mds1, split-and-destroy a different mirror from
# each file: mirror 1 of f0, mirror 2 of f1, mirror 3 of f2.
5511 echo "Inject failure, to simulate the case of missing one mirror in LOV"
5512 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5513 do_facet mds1 $LCTL set_param fail_loc=0x1616
5515 $LFS mirror split --mirror-id 1 -d $DIR/$tdir/f0 ||
5516 error "(12) Fail to split 1st mirror from $DIR/$tdir/f0"
5517 $LFS mirror split --mirror-id 2 -d $DIR/$tdir/f1 ||
5518 error "(13) Fail to split 2nd mirror from $DIR/$tdir/f1"
5519 $LFS mirror split --mirror-id 3 -d $DIR/$tdir/f2 ||
5520 error "(14) Fail to split 3rd mirror from $DIR/$tdir/f2"
5524 do_facet mds1 $LCTL set_param fail_loc=0
# The split mirror id must no longer be listed; a grep match is the error.
5526 $LFS getstripe $DIR/$tdir/f0 | grep "lcme_mirror_id:.*1" &&
5527 error "(15) The 1st of mirror is not destroyed"
5528 $LFS getstripe $DIR/$tdir/f1 | grep "lcme_mirror_id:.*2" &&
5529 error "(16) The 2nd of mirror is not destroyed"
5530 $LFS getstripe $DIR/$tdir/f2 | grep "lcme_mirror_id:.*3" &&
5531 error "(17) The 3rd of mirror is not destroyed"
# Each file must be down to two mirrors before LFSCK runs.
5535 mirrors=$($LFS getstripe -N $DIR/$tdir/f0)
5536 [ $mirrors -eq 2 ] || error "(18) $DIR/$tdir/f0 has $mirrors mirrors"
5537 mirrors=$($LFS getstripe -N $DIR/$tdir/f1)
5538 [ $mirrors -eq 2 ] || error "(19) $DIR/$tdir/f1 has $mirrors mirrors"
5539 mirrors=$($LFS getstripe -N $DIR/$tdir/f2)
5540 [ $mirrors -eq 2 ] || error "(20) $DIR/$tdir/f2 has $mirrors mirrors"
5542 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5543 $START_LAYOUT -r -o || error "(21) Fail to start LFSCK for layout!"
5545 for k in $(seq $MDSCOUNT); do
5546 # The LFSCK status query interval is 30 seconds. For the case
5547 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5548 # time to guarantee the status sync up.
5549 wait_update_facet mds${k} "$LCTL get_param -n \
5550 mdd.$(facet_svc mds${k}).lfsck_layout |
5551 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5552 error "(22) MDS${k} is not the expected 'completed'"
# The OSTs are sampled once (no waiting) and must already be 'completed'.
5555 for k in $(seq $OSTCOUNT); do
5556 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5557 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5558 awk '/^status/ { print $2 }')
5559 [ "$cur_status" == "completed" ] ||
5560 error "(23) OST${k} Expect 'completed', but got '$cur_status'"
# mds1 must report exactly 9 repaired orphans.
5563 local repaired=$(do_facet mds1 $LCTL get_param -n \
5564 mdd.$(facet_svc mds1).lfsck_layout |
5565 awk '/^repaired_orphan/ { print $2 }')
5566 [ $repaired -eq 9 ] ||
5567 error "(24) Expect 9 fixed on mds1, but got: $repaired"
# After LFSCK every file must be back to three mirrors.
5569 mirrors=$($LFS getstripe -N $DIR/$tdir/f0)
5570 [ $mirrors -eq 3 ] || error "(25) $DIR/$tdir/f0 has $mirrors mirrors"
5571 mirrors=$($LFS getstripe -N $DIR/$tdir/f1)
5572 [ $mirrors -eq 3 ] || error "(26) $DIR/$tdir/f1 has $mirrors mirrors"
5573 mirrors=$($LFS getstripe -N $DIR/$tdir/f2)
5574 [ $mirrors -eq 3 ] || error "(27) $DIR/$tdir/f2 has $mirrors mirrors"
# The previously removed mirror id must be listed again; on failure the full
# layout is printed before the error is raised.
5576 $LFS getstripe $DIR/$tdir/f0 | grep "lcme_mirror_id:.*1" || {
5577 $LFS getstripe $DIR/$tdir/f0
5578 error "(28) The 1st of mirror is not recovered"
5581 $LFS getstripe $DIR/$tdir/f1 | grep "lcme_mirror_id:.*2" || {
5582 $LFS getstripe $DIR/$tdir/f1
5583 error "(29) The 2nd of mirror is not recovered"
5586 $LFS getstripe $DIR/$tdir/f2 | grep "lcme_mirror_id:.*3" || {
5587 $LFS getstripe $DIR/$tdir/f2
5588 error "(30) The 3rd of mirror is not recovered"
5591 run_test 36a "rebuild LOV EA for mirrored file (1)"
# Test 36b: layout LFSCK rebuilds the LOV EA of a mirrored file whose
# MDT-object was lost. Skipped when FILESET is set or with fewer than 3 OSTs.
5594 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
5595 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5598 echo "The mirrored file lost its MDT-object, but relatd OST-objects "
5599 echo "are still there. The layout LFSCK should rebuild the LOV EA "
5600 echo "with the PFID EA of related OST-object(s) belong to the file. "
5603 check_mount_and_prep
# f0 has three -N mirrors, each with two -E components placed on explicit
# OST indices via -o.
5605 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5606 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f0 ||
5607 error "(0) Fail to create mirror file $DIR/$tdir/f0"
# The FID is kept to build the expected lost+found name further down.
5609 local fid=$($LFS path2fid $DIR/$tdir/f0)
5611 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5612 error "(1) Fail to write $DIR/$tdir/f0"
5613 $LFS mirror resync $DIR/$tdir/f0 ||
5614 error "(2) Fail to resync $DIR/$tdir/f0"
5616 cancel_lru_locks mdc
5617 cancel_lru_locks osc
5619 $LFS getstripe $DIR/$tdir/f0 ||
5620 error "(3) Fail to getstripe for $DIR/$tdir/f0"
# Remove f0 while the fail_loc is set on mds1.
5622 echo "Inject failure, to simulate the case of missing the MDT-object"
5623 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5624 do_facet mds1 $LCTL set_param fail_loc=0x1616
5625 rm -f $DIR/$tdir/f0 || error "(4) Fail to remove $DIR/$tdir/f0"
5629 do_facet mds1 $LCTL set_param fail_loc=0
5631 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5632 $START_LAYOUT -r -o || error "(5) Fail to start LFSCK for layout!"
5634 for k in $(seq $MDSCOUNT); do
5635 # The LFSCK status query interval is 30 seconds. For the case
5636 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5637 # time to guarantee the status sync up.
5638 wait_update_facet mds${k} "$LCTL get_param -n \
5639 mdd.$(facet_svc mds${k}).lfsck_layout |
5640 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5641 error "(6) MDS${k} is not the expected 'completed'"
# The OSTs are sampled once (no waiting) and must already be 'completed'.
5644 for k in $(seq $OSTCOUNT); do
5645 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5646 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5647 awk '/^status/ { print $2 }')
5648 [ "$cur_status" == "completed" ] ||
5649 error "(7) OST${k} Expect 'completed', but got '$cur_status'"
# mds1 must report exactly 9 repaired orphans.
5652 local count=$(do_facet mds1 $LCTL get_param -n \
5653 mdd.$(facet_svc mds1).lfsck_layout |
5654 awk '/^repaired_orphan/ { print $2 }')
5655 [ $count -eq 9 ] || error "(8) Expect 9 fixed on mds1, but got: $count"
# The rebuilt file is looked up under .lustre/lost+found/MDT0000 as
# <fid>-R-0 and must have 3 mirrors and 6 components.
5657 local name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
5658 count=$($LFS getstripe --mirror-count $name)
5659 [ $count -eq 3 ] || error "(9) $DIR/$tdir/f0 has $count mirrors"
5661 count=$($LFS getstripe --component-count $name)
5662 [ $count -eq 6 ] || error "(10) $DIR/$tdir/f0 has $count components"
# All three mirror ids must be listed; on failure the full layout is printed
# before the error is raised.
5664 $LFS getstripe $name | grep "lcme_mirror_id:.*1" || {
5665 $LFS getstripe $name
5666 error "(11) The 1st of mirror is not recovered"
5669 $LFS getstripe $name | grep "lcme_mirror_id:.*2" || {
5670 $LFS getstripe $name
5671 error "(12) The 2nd of mirror is not recovered"
5674 $LFS getstripe $name | grep "lcme_mirror_id:.*3" || {
5675 $LFS getstripe $name
5676 error "(13) The 3rd of mirror is not recovered"
5679 run_test 36b "rebuild LOV EA for mirrored file (2)"
# Test 36c: a two-mirror file is written, resynced, written again (leaving
# one mirror stale) and then loses its MDT-object. Layout LFSCK must rebuild
# the LOV EA under .lustre/lost+found and keep the per-component flags.
# Skipped when FILESET is set or when fewer than 3 OSTs are configured.
5682 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
5683 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5686 echo "The mirrored file has been modified, not resynced yet, then "
5687 echo "lost its MDT-object, but relatd OST-objects are still there. "
5688 echo "The layout LFSCK should rebuild the LOV EA and relatd status "
5689 echo "with the PFID EA of related OST-object(s) belong to the file. "
5692 check_mount_and_prep
# Two mirrors (-N), each with a bounded first component on two explicit
# OSTs and an unbounded tail component on a third OST.
# NOTE(review): the setstripe line ends in a continuation, but no line
# carrying the target path is visible before the error call -- confirm
# against the full file.
5694 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5696 error "(0) Fail to create mirror file $DIR/$tdir/f0"
# Capture the FID before the file is removed; the lost+found name checked
# further down is derived from it.
5698 local fid=$($LFS path2fid $DIR/$tdir/f0)
5700 # The 1st dd && resync makes all related OST-objects have been written
5701 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5702 error "(1.1) Fail to write $DIR/$tdir/f0"
5703 $LFS mirror resync $DIR/$tdir/f0 ||
5704 error "(1.2) Fail to resync $DIR/$tdir/f0"
5705 # The 2nd dd makes one mirror to be stale
5706 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5707 error "(1.3) Fail to write $DIR/$tdir/f0"
5709 cancel_lru_locks mdc
5710 cancel_lru_locks osc
5712 $LFS getstripe $DIR/$tdir/f0 ||
5713 error "(2) Fail to getstripe for $DIR/$tdir/f0"
# Remember the lcme_flags values found in the first and in the last 10
# lines of the getstripe output; they are compared after the rebuild.
5715 local saved_flags1=$($LFS getstripe $DIR/$tdir/f0 | head -n 10 |
5716 awk '/lcme_flags/ { print $2 }')
5717 local saved_flags2=$($LFS getstripe $DIR/$tdir/f0 | tail -n 10 |
5718 awk '/lcme_flags/ { print $2 }')
# Remove f0 while fail_loc 0x1616 (OBD_FAIL_LFSCK_LOST_MDTOBJ) is set on
# mds1, then clear fail_loc again.
5720 echo "Inject failure, to simulate the case of missing the MDT-object"
5721 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5722 do_facet mds1 $LCTL set_param fail_loc=0x1616
5723 rm -f $DIR/$tdir/f0 || error "(3) Fail to remove $DIR/$tdir/f0"
5727 do_facet mds1 $LCTL set_param fail_loc=0
5729 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5730 $START_LAYOUT -r -o || error "(4) Fail to start LFSCK for layout!"
# Wait for each MDT to report lfsck_layout status "completed".
5732 for k in $(seq $MDSCOUNT); do
5733 # The LFSCK status query internal is 30 seconds. For the case
5734 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5735 # time to guarantee the status sync up.
5736 wait_update_facet mds${k} "$LCTL get_param -n \
5737 mdd.$(facet_svc mds${k}).lfsck_layout |
5738 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5739 error "(5) MDS${k} is not the expected 'completed'"
# Each OST is queried once (no waiting) and must already be "completed".
5742 for k in $(seq $OSTCOUNT); do
5743 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5744 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5745 awk '/^status/ { print $2 }')
5746 [ "$cur_status" == "completed" ] ||
5747 error "(6) OST${k} Expect 'completed', but got '$cur_status'"
# mds1 must report exactly 6 repaired orphans; the message states the same
# number that the condition tests.
5750 local count=$(do_facet mds1 $LCTL get_param -n \
5751 mdd.$(facet_svc mds1).lfsck_layout |
5752 awk '/^repaired_orphan/ { print $2 }')
5753 [ $count -eq 6 ] || error "(7) Expect 6 fixed on mds1, but got: $count"
# The rebuilt file is looked up as <fid>-R-0 under lost+found/MDT0000 and
# must have 2 mirrors and 4 components.
5755 local name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
5756 count=$($LFS getstripe --mirror-count $name)
5757 [ $count -eq 2 ] || error "(8) $DIR/$tdir/f0 has $count mirrors"
5759 count=$($LFS getstripe --component-count $name)
5760 [ $count -eq 4 ] || error "(9) $DIR/$tdir/f0 has $count components"
# The flags at the head and at the tail of the rebuilt layout must equal the
# values saved before the MDT-object was lost.
5762 local flags=$($LFS getstripe $name | head -n 10 |
5763 awk '/lcme_flags/ { print $2 }')
5764 [ "$flags" == "$saved_flags1" ] || {
5765 $LFS getstripe $name
5766 error "(10) expect flags $saved_flags1, got $flags"
5769 flags=$($LFS getstripe $name | tail -n 10 |
5770 awk '/lcme_flags/ { print $2 }')
5771 [ "$flags" == "$saved_flags2" ] || {
5772 $LFS getstripe $name
5773 error "(11) expect flags $saved_flags2, got $flags"
5776 run_test 36c "rebuild LOV EA for mirrored file (3)"
# Test 37: namespace LFSCK is run while directory d0 on MDT0 is involved
# with a paused background multiop; all targets must reach "completed".
# NOTE(review): $PID is expected to hold the multiop PID; its assignment is
# not among the lines visible here -- confirm against the full file.
5782 local t_dir="$DIR/$tdir/d0"
5783 check_mount_and_prep
5785 $LFS mkdir -i 0 $t_dir || error "(2) Fail to mkdir $t_dir on MDT0"
5786 multiop_bg_pause $t_dir D_c || { error "multiop failed: $?"; return 1; }
# On a failed LFSCK start, signal the paused multiop first and report the
# error afterwards, so the signal is sent even when error does not return.
5790 $START_NAMESPACE -r -A || {
5791 kill -USR1 $PID; error "(3) Fail to start LFSCK for namespace!"; }
5793 wait_all_targets_blocked namespace completed 4
5798 run_test 37 "LFSCK must skip a ORPHAN"
# Test 38: a foreign file must come through namespace and layout LFSCK
# unchanged, with nothing reported as repaired, and must still reject
# restriping, reads and writes afterwards.
# Skipped unless MDS1_VERSION is newer than 2.12.51.
5802 [[ "$MDS1_VERSION" -le $(version_code 2.12.51) ]] &&
5803 skip "Need MDS version newer than 2.12.51"
5805 test_mkdir $DIR/$tdir
# Two kernel-generated UUIDs, joined by '@' below, form the foreign value.
5806 local uuid1=$(cat /proc/sys/kernel/random/uuid)
5807 local uuid2=$(cat /proc/sys/kernel/random/uuid)
5809 # create foreign file
5810 $LFS setstripe --foreign=none --flags 0xda05 \
5811 -x "${uuid1}@${uuid2}" $DIR/$tdir/$tfile ||
5812 error "$DIR/$tdir/$tfile: create failed"
# Check every foreign LOV EA field that getstripe prints: magic, length,
# type, flags and value. The expected length 73 presumably corresponds to
# "<uuid>@<uuid>" (36 + 1 + 36 characters) -- confirm.
5814 $LFS getstripe -v $DIR/$tdir/$tfile |
5815 grep "lfm_magic:.*0x0BD70BD0" ||
5816 error "$DIR/$tdir/$tfile: invalid LOV EA foreign magic"
5817 # lfm_length is LOV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
5818 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_length:.*73" ||
5819 error "$DIR/$tdir/$tfile: invalid LOV EA foreign size"
5820 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_type:.*none" ||
5821 error "$DIR/$tdir/$tfile: invalid LOV EA foreign type"
5822 $LFS getstripe -v $DIR/$tdir/$tfile |
5823 grep "lfm_flags:.*0x0000DA05" ||
5824 error "$DIR/$tdir/$tfile: invalid LOV EA foreign flags"
5825 $LFS getstripe $DIR/$tdir/$tfile |
5826 grep "lfm_value:.*${uuid1}@${uuid2}" ||
5827 error "$DIR/$tdir/$tfile: invalid LOV EA foreign value"
5829 # modify striping should fail
5830 $LFS setstripe -c 2 $DIR/$tdir/$tfile &&
5831 error "$DIR/$tdir/$tfile: setstripe should fail"
# Run namespace LFSCK with -r -A, wait for the targets to report
# "completed", then read namespace_repaired from lfsck_query on mds1.
5833 $START_NAMESPACE -r -A || error "Fail to start LFSCK for namespace"
5835 wait_all_targets_blocked namespace completed 1
5837 # check that "global" namespace_repaired == 0 !!!
5838 local repaired=$(do_facet mds1 \
5839 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
5840 awk '/^namespace_repaired/ { print \\\$2 }'")
5841 [ $repaired -eq 0 ] ||
5842 error "(2) Expect no namespace repair, but got: $repaired"
# Same sequence for layout LFSCK, checking layout_repaired.
5844 $START_LAYOUT -A -r || error "Fail to start LFSCK for layout"
5846 wait_all_targets_blocked layout completed 2
5848 # check that "global" layout_repaired == 0 !!!
5849 local repaired=$(do_facet mds1 \
5850 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
5851 awk '/^layout_repaired/ { print \\\$2 }'")
5852 [ $repaired -eq 0 ] ||
5853 error "(2) Expect no layout repair, but got: $repaired"
# Repeat the field checks from before the LFSCK runs; the foreign LOV EA
# must be unchanged.
5855 echo "post-lfsck checks of foreign file"
5857 $LFS getstripe -v $DIR/$tdir/$tfile |
5858 grep "lfm_magic:.*0x0BD70BD0" ||
5859 error "$DIR/$tdir/$tfile: invalid LOV EA foreign magic"
5860 # lfm_length is LOV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
5861 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_length:.*73" ||
5862 error "$DIR/$tdir/$tfile: invalid LOV EA foreign size"
5863 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_type:.*none" ||
5864 error "$DIR/$tdir/$tfile: invalid LOV EA foreign type"
5865 $LFS getstripe -v $DIR/$tdir/$tfile |
5866 grep "lfm_flags:.*0x0000DA05" ||
5867 error "$DIR/$tdir/$tfile: invalid LOV EA foreign flags"
5868 $LFS getstripe $DIR/$tdir/$tfile |
5869 grep "lfm_value:.*${uuid1}@${uuid2}" ||
5870 error "$DIR/$tdir/$tfile: invalid LOV EA foreign value"
5872 # modify striping should fail
5873 $LFS setstripe -c 2 $DIR/$tdir/$tfile &&
5874 error "$DIR/$tdir/$tfile: setstripe should fail"
# Reading from and writing to the foreign file are both expected to fail.
5877 cat $DIR/$tdir/$tfile && error "$DIR/$tdir/$tfile: read should fail"
5878 cat /etc/passwd > $DIR/$tdir/$tfile &&
5879 error "$DIR/$tdir/$tfile: write should fail"
5881 #remove foreign file
5882 rm $DIR/$tdir/$tfile ||
5883 error "$DIR/$tdir/$tfile: remove of foreign file has failed"
5885 run_test 38 "LFSCK does not break foreign file and reverse is also true"
# Test 39: a foreign directory must come through namespace and layout LFSCK
# unchanged, with nothing reported as repaired. File creation inside it must
# keep failing, while chmod, chown and rmdir on it must keep working.
# Skipped unless MDS1_VERSION is newer than 2.12.51.
5889 [[ "$MDS1_VERSION" -le $(version_code 2.12.51) ]] &&
5890 skip "Need MDS version newer than 2.12.51"
5892 test_mkdir $DIR/$tdir
# Two kernel-generated UUIDs, joined by '@' below, form the foreign value.
5893 local uuid1=$(cat /proc/sys/kernel/random/uuid)
5894 local uuid2=$(cat /proc/sys/kernel/random/uuid)
5896 # create foreign dir
5897 $LFS mkdir --foreign=none --xattr="${uuid1}@${uuid2}" --flags=0xda05 \
5898 $DIR/$tdir/${tdir}2 ||
5899 error "$DIR/$tdir/${tdir}2: create failed"
# Check every foreign LMV EA field that getdirstripe prints: magic, length,
# type, flags and value.
5901 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
5902 grep "lfm_magic:.*0x0CD50CD0" ||
5903 error "$DIR/$tdir/${tdir}2: invalid LMV EA magic"
5904 # lfm_length is LMV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
5905 # - sizeof(lfm_type) - sizeof(lfm_flags)
5906 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_length:.*73" ||
5907 error "$DIR/$tdir/${tdir}2: invalid LMV EA size"
5908 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_type:.*none" ||
5909 error "$DIR/$tdir/${tdir}2: invalid LMV EA type"
5910 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
5911 grep "lfm_flags:.*0x0000DA05" ||
5912 error "$DIR/$tdir/${tdir}2: invalid LMV EA flags"
5913 $LFS getdirstripe $DIR/$tdir/${tdir}2 |
5914 grep "lfm_value.*${uuid1}@${uuid2}" ||
5915 error "$DIR/$tdir/${tdir}2: invalid LMV EA value"
5917 # file create in dir should fail
# An unexpected success is reported through error, like every other check.
5918 touch $DIR/$tdir/${tdir}2/$tfile &&
5919 error "$DIR/${tdir}2: file create should fail"
# Mode and ownership changes on the foreign directory must succeed.
5922 chmod 777 $DIR/$tdir/${tdir}2 ||
5923 error "$DIR/${tdir}2: chmod failed"
5926 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/${tdir}2 ||
5927 error "$DIR/${tdir}2: chown failed"
# Run namespace LFSCK with -r -A, wait for the targets to report
# "completed", then read namespace_repaired from lfsck_query on mds1.
5929 $START_NAMESPACE -r -A || error "Fail to start LFSCK for namespace"
5931 wait_all_targets_blocked namespace completed 1
5933 # check that "global" namespace_repaired == 0 !!!
5934 local repaired=$(do_facet mds1 \
5935 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
5936 awk '/^namespace_repaired/ { print \\\$2 }'")
5937 [ $repaired -eq 0 ] ||
5938 error "(2) Expect nothing to be repaired, but got: $repaired"
# Same sequence for layout LFSCK, checking layout_repaired.
5940 $START_LAYOUT -A -r || error "Fail to start LFSCK for layout"
5942 wait_all_targets_blocked layout completed 2
5944 # check that "global" layout_repaired == 0 !!!
5945 local repaired=$(do_facet mds1 \
5946 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
5947 awk '/^layout_repaired/ { print \\\$2 }'")
5948 [ $repaired -eq 0 ] ||
5949 error "(2) Expect no layout repair, but got: $repaired"
# Repeat the field checks from before the LFSCK runs; the foreign LMV EA
# must be unchanged.
5951 echo "post-lfsck checks of foreign dir"
5953 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
5954 grep "lfm_magic:.*0x0CD50CD0" ||
5955 error "$DIR/$tdir/${tdir}2: invalid LMV EA magic"
5956 # lfm_length is LMV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
5957 # - sizeof(lfm_type) - sizeof(lfm_flags)
5958 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_length:.*73" ||
5959 error "$DIR/$tdir/${tdir}2: invalid LMV EA size"
5960 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_type:.*none" ||
5961 error "$DIR/$tdir/${tdir}2: invalid LMV EA type"
5962 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
5963 grep "lfm_flags:.*0x0000DA05" ||
5964 error "$DIR/$tdir/${tdir}2: invalid LMV EA flags"
5965 $LFS getdirstripe $DIR/$tdir/${tdir}2 |
5966 grep "lfm_value.*${uuid1}@${uuid2}" ||
5967 error "$DIR/$tdir/${tdir}2: invalid LMV EA value"
5969 # file create in dir should fail
# An unexpected success is reported through error, like every other check.
5970 touch $DIR/$tdir/${tdir}2/$tfile &&
5971 error "$DIR/${tdir}2: file create should fail"
# Mode and ownership changes must still succeed after LFSCK.
5974 chmod 777 $DIR/$tdir/${tdir}2 ||
5975 error "$DIR/${tdir}2: chmod failed"
5978 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/${tdir}2 ||
5979 error "$DIR/${tdir}2: chown failed"
# Finally the foreign directory must be removable.
5982 rmdir $DIR/$tdir/${tdir}2 ||
5983 error "$DIR/$tdir/${tdir}2: remove of foreign dir has failed"
5985 run_test 39 "LFSCK does not break foreign dir and reverse is also true"
# Test 40a: the layout of a file inside a migrated directory must be the
# same before and after a layout LFSCK run.
# Skipped unless at least 2 MDTs are configured.
5988 [[ $MDSCOUNT -ge 2 ]] || skip "needs >= 2 MDTs"
5990 check_mount_and_prep
# Create dir1 with MDT index 1 and give it a three-component layout
# (boundaries at 1M, 128M and eof).
# NOTE(review): the results of mkdir, setstripe and the later migrate are
# not checked; a failure there only shows up in the final comparison.
5991 $LFS mkdir -i 1 $DIR/$tdir/dir1
5992 $LFS setstripe -E 1M -c1 -S 1M -E 128M -c2 -S 4M -E eof $DIR/$tdir/dir1
# Create f1 in dir1 and record its layout as the reference value.
5994 touch $DIR/$tdir/dir1/f1
5995 local layout1=$(get_layout_param $DIR/$tdir/dir1/f1)
5997 echo "Migrate $DIR/$tdir/dir1 from MDT1 to MDT0"
5998 $LFS migrate -m 0 $DIR/$tdir/dir1
# Start layout LFSCK with -r on MDT_DEV and wait for status "completed".
6000 echo "trigger LFSCK for layout"
6001 do_facet $SINGLEMDS $LCTL lfsck_start -M ${MDT_DEV} -t layout -r
6003 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
6004 mdd.${MDT_DEV}.lfsck_layout |
6005 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
6007 error "(2) unexpected status"
# The layout read after LFSCK must equal the one recorded before migration.
6010 local layout2=$(get_layout_param $DIR/$tdir/dir1/f1)
6012 [[ "$layout1" == "$layout2" ]] || error "layout lost after lfsck"
6014 run_test 40a "LFSCK correctly fixes lmm_oi in composite layout"
# Test 41: run LFSCK over a file created with -z components and fail if the
# MDS debug log afterwards contains "lfsck_layout_verify_header".
# The current debug setting of SINGLEMDS is saved so that it can be put back
# at the end.
# NOTE(review): the restore is the last statement only; whether it is
# reached after an earlier error depends on how error behaves -- confirm.
6018 local old_debug=$(do_facet $SINGLEMDS $LCTL get_param -n debug)
# Add lfsck to the debug mask, create the test file, then run "dk" with its
# output discarded (presumably to empty the debug log before LFSCK starts).
6020 do_facet $SINGLEMDS $LCTL set_param debug=+lfsck
6021 $LFS setstripe -E 1G -z 64M -E -1 -z 128M $DIR/$tfile
6022 do_facet $SINGLEMDS $LCTL dk > /dev/null
# Start all LFSCK types with -A -r -n on (-n on is presumably dry-run mode
# -- confirm against lctl lfsck_start) and wait for layout "completed".
6024 echo "trigger LFSCK for SEL layout"
6025 do_facet $SINGLEMDS $LCTL lfsck_start -M ${MDT_DEV} -A -t all -r -n on
6026 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
6027 mdd.${MDT_DEV}.lfsck_layout |
6028 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
6030 error "(2) unexpected status"
# Any debug-log line mentioning lfsck_layout_verify_header is treated as a
# failure.
6033 local errors=$(do_facet $SINGLEMDS $LCTL dk |
6034 grep "lfsck_layout_verify_header")
6036 [[ "x$errors" == "x" ]] || {
6038 error "lfsck failed"
# Put the saved debug setting back.
6041 do_facet $SINGLEMDS "$LCTL set_param debug='$old_debug'"
6043 run_test 41 "SEL support in LFSCK"
# Suite teardown: put back the MDSSIZE, OSTSIZE and OSTCOUNT values that
# were saved into SAVED_* at the top of the script, before the tests
# replaced them with smaller ones.
6045 # restore MDS/OST size
6046 MDSSIZE=${SAVED_MDSSIZE}
6047 OSTSIZE=${SAVED_OSTSIZE}
6048 OSTCOUNT=${SAVED_OSTCOUNT}
# REFORMAT="yes" is set only for this one command.
6050 # cleanup the system at last
6051 REFORMAT="yes" cleanup_and_setup_lustre
6054 check_and_cleanup_lustre