3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
# Locate the Lustre tree (default: the parent of this script's directory)
# and source the shared test framework from it.
11 LUSTRE=${LUSTRE:-$(dirname $0)/..}
12 . $LUSTRE/tests/test-framework.sh
16 # bug number for skipped test:
17 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT "
18 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
# When SLOW is "no", EXCEPT_SLOW is set to the empty string.
20 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
# Leave with status 0 (not a failure) when require_dsh_mds does not succeed.
23 require_dsh_mds || exit 0
# Skip when check_versions fails; per the skip message this is the
# interoperation (mixed-version) case.
27 if ! check_versions; then
28 skip "It is NOT necessary to test lfsck under interoperation mode"
# Require MDS1_VERSION to be at least 2.3.60.
32 (( $MDS1_VERSION >= $(version_code 2.3.60) )) ||
33 skip "Need MDS version at least 2.3.60"
# Keep copies of the incoming sizing values before they are overridden below.
# NOTE(review): presumably restored at the end of the script - confirm.
37 SAVED_MDSSIZE=${MDSSIZE}
38 SAVED_OSTSIZE=${OSTSIZE}
39 SAVED_OSTCOUNT=${OSTCOUNT}
40 # Use small MDS and OST sizes to speed up formatting,
41 # but not too small: MDSSIZE/OSTSIZE affect the default journal size.
43 [ "$mds1_FSTYPE" == zfs ] && MDSSIZE=300000
45 [ "$ost1_FSTYPE" == zfs ] && OSTSIZE=300000
47 # Many OSTs are not needed; cap the count to reduce format/start/stop overhead.
49 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
51 # Build a clean test environment (REFORMAT=yes is passed to the setup call).
52 REFORMAT="yes" check_and_setup_lustre
# Command strings shared by the tests below. They are stored as plain strings
# and later expanded unquoted (e.g. `$START_NAMESPACE -r`), so extra options
# can be appended at the call site.
#
# MDT_DEV is the device label of the $SINGLEMDS target; OST_DEV is the first
# OST of the filesystem, named from $FSNAME.
54 MDT_DEV=$(devicelabel $SINGLEMDS $(facet_device $SINGLEMDS))
55 OST_DEV="${FSNAME}-OST0000"
# Start LFSCK of type namespace / layout on the MDT, or layout on ost1.
56 START_NAMESPACE="do_facet $SINGLEMDS \
57 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
58 START_LAYOUT="do_facet $SINGLEMDS \
59 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
60 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
# Stop LFSCK on the MDT.
61 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
# Print the lfsck_namespace / lfsck_layout parameter of the MDT (mdd.*) or of
# ost1 (obdfilter.*); the tests parse fields such as "status" out of this output.
62 SHOW_NAMESPACE="do_facet $SINGLEMDS \
63 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
64 SHOW_LAYOUT="do_facet $SINGLEMDS \
65 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
66 SHOW_LAYOUT_ON_OST="do_facet ost1 \
67 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# MDS mount option sets: base options plus user_xattr, optionally combined
# with noscrub or skip_lfsck.
68 MOUNT_OPTS_SCRUB="$MDS_MOUNT_OPTS -o user_xattr"
69 MOUNT_OPTS_NOSCRUB="$MDS_MOUNT_OPTS -o user_xattr,noscrub"
70 MOUNT_OPTS_SKIP_LFSCK="$MDS_MOUNT_OPTS -o user_xattr,skip_lfsck"
# Populate $DIR/$tdir with test data, driven by $ndirs, $nfiles and $igif:
#  - if $igif is non-empty, fail_loc 0x1504 (OBD_FAIL_FID_IGIF) is set on
#    $SINGLEMDS while the data is created and cleared again afterwards;
#  - the *.sh files of the Lustre tests directory are copied in;
#  - if $ndirs > 0, $ndirs directories d<N> and $ndirs files f<N> are created;
#  - if $nfiles > 0, $nfiles files are created inside each d<N>;
#  - $ndirs further directories e<N> are created.
79 echo "preparing... $nfiles * $ndirs files will be created $(date)."
# NOTE(review): $igif is unquoted inside [ ]; this behaves as intended for an
# empty or single-word value only.
80 if [ ! -z $igif ]; then
81 #define OBD_FAIL_FID_IGIF 0x1504
82 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
85 cp $LUSTRE/tests/*.sh $DIR/$tdir/
86 if [ $ndirs -gt 0 ]; then
87 createmany -d $DIR/$tdir/d $ndirs
88 createmany -m $DIR/$tdir/f $ndirs
89 if [ $nfiles -gt 0 ]; then
90 for ((i = 0; i < $ndirs; i++)); do
91 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
92 /dev/null || error "createmany $nfiles"
95 createmany -d $DIR/$tdir/e $ndirs
# With $igif set, also create "dummy" while the fail_loc is still active,
# then reset fail_loc to 0.
98 if [ ! -z $igif ]; then
99 touch $DIR/$tdir/dummy
100 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
103 echo "prepared $(date)."
# Start $facet on its device with mount options $opts, discarding the output
# of `start`; on failure raise an error tagged with the caller-supplied $err.
110 local dev=$(facet_device $facet)
112 start $facet $dev $opts > /dev/null ||
113 error "($err) Fail to start $facet!"
# Run a read-only e2fsck ("-n") against the device of MDS facet $mds.
# Does nothing (returns 0) unless mds1_FSTYPE is ldiskfs. The facet is stopped
# first, checked, and then started again with $MOUNT_OPTS_NOSCRUB.
# Error tags: (0) stop failed, (2) e2fsck detected an inconsistency,
# 3 is passed on to start_facet as its error tag.
116 run_e2fsck_on_mds_facet() {
117 [ $mds1_FSTYPE == ldiskfs ] || return 0
# NOTE(review): the message below reads "Fail to the stop"; probably meant
# "Fail to stop the". Left unchanged because it is runtime output.
121 stop $mds > /dev/null || error "(0) Fail to the stop $mds"
122 local host=$(facet_active_host $mds)
123 local dev=$(facet_device $mds)
125 run_e2fsck $host $dev "-n" |
127 run_e2fsck $host $dev "-n"
128 error "(2) Detected inconsistency on $mds"
130 start_facet $mds "$MOUNT_OPTS_NOSCRUB" 3
# Query the $com LFSCK component on ${FSNAME}-MDT0000 (lfsck_query with -w)
# and take the second field of the "<com>_mdts_<status>" line as the number
# of MDTs in state $status. If that number differs from $MDSCOUNT, print the
# plain query output and raise an error tagged with $err.
# $com, $status and $err are presumably the positional arguments, as in the
# call `wait_all_targets_blocked namespace completed 4` - TODO confirm.
133 wait_all_targets_blocked() {
138 local count=$(do_facet mds1 \
139 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 -w |
140 awk '/^${com}_mdts_${status}/ { print \\\$2 }'")
141 [[ $count -eq $MDSCOUNT ]] || {
142 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
143 error "($err) only $count of $MDSCOUNT MDTs are in ${status}"
# Polling form of the same check: wait_update_facet is given the query
# (without -w), the expected value $MDSCOUNT and the limit $LTIME; if it
# fails, print the query output and raise an error tagged with $err.
152 wait_update_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 |
153 awk '/^${com}_mdts_${status}/ { print \\\$2 }'" \
154 "$MDSCOUNT" $LTIME || {
155 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
156 error "($err) some MDTs are not in ${status}"
# Test 0: control the namespace LFSCK by hand.
# Sequence: start (reset) with fail_loc 0x1600 set -> expect scanning-phase1,
# stop -> expect stopped, start again -> expect scanning-phase1, clear the
# fail_loc -> expect completed with nothing repaired, then reset and rerun
# and check that success_count grew by exactly one. Finally stopall.
163 #define OBD_FAIL_LFSCK_DELAY1 0x1600
164 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
165 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
167 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
169 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
170 [ "$STATUS" == "scanning-phase1" ] ||
171 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
173 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
175 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
176 [ "$STATUS" == "stopped" ] ||
177 error "(6) Expect 'stopped', but got '$STATUS'"
# Restart without -r (no reset) after the explicit stop.
179 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
181 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
182 [ "$STATUS" == "scanning-phase1" ] ||
183 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the fail_loc/fail_val and wait (limit 32) for the status "completed".
185 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
186 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
187 mdd.${MDT_DEV}.lfsck_namespace |
188 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
190 error "(9) unexpected status"
# No inconsistency was injected, so updated_phase1 must be 0.
193 local repaired=$($SHOW_NAMESPACE |
194 awk '/^updated_phase1/ { print $2 }')
195 [ $repaired -eq 0 ] ||
196 error "(10) Expect nothing to be repaired, but got: $repaired"
# Record success_count, rerun with -r (reset) and expect it to be one higher.
198 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
199 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
200 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
201 mdd.${MDT_DEV}.lfsck_namespace |
202 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
204 error "(12) unexpected status"
207 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
208 [ $((scanned1 + 1)) -eq $scanned2 ] ||
209 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
211 echo "stopall, should NOT crash LU-3649"
212 stopall || error "(14) Fail to stopall"
214 run_test 0 "Control LFSCK manually"
# Test 1a: repair of a crashed FID-in-dirent.
# "dummy" is created while fail_loc 0x1501 (OBD_FAIL_FID_INDIR) is set, then
# a reset namespace LFSCK must complete and report exactly one repair.
219 #define OBD_FAIL_FID_INDIR 0x1501
220 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
221 touch $DIR/$tdir/dummy
223 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
225 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
226 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
227 mdd.${MDT_DEV}.lfsck_namespace |
228 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
230 error "(4) unexpected status"
# Prefer the dirent_repaired counter; fall back to updated_phase1 when the
# server output has no dirent_repaired line.
233 local repaired=$($SHOW_NAMESPACE |
234 awk '/^dirent_repaired/ { print $2 }')
235 # for interop with old server
236 [ -z "$repaired" ] &&
237 repaired=$($SHOW_NAMESPACE |
238 awk '/^updated_phase1/ { print $2 }')
240 [ $repaired -eq 1 ] ||
241 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
# Cross-check the backing device with e2fsck, then remount the client.
243 run_e2fsck_on_mds_facet $SINGLEMDS
245 mount_client $MOUNT || error "(6) Fail to start client!"
# List the directory while fail_loc 0x1505 (OBD_FAIL_FID_LOOKUP) is set;
# a failure is reported as "no FID-in-dirent".
247 #define OBD_FAIL_FID_LOOKUP 0x1505
248 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
249 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
251 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
253 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
# Test 1b: repair of a missing FID-in-LMA (ldiskfs only; skipped otherwise).
# "dummy" is created under fail_loc 0x1502 (OBD_FAIL_FID_INLMA); the namespace
# LFSCK then runs with fail_loc 0x1506 (OBD_FAIL_FID_NOLMA) set and must
# complete with exactly one repair.
257 [ "$mds1_FSTYPE" != ldiskfs ] &&
258 skip "OI Scrub not implemented for ZFS"
262 #define OBD_FAIL_FID_INLMA 0x1502
263 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
264 touch $DIR/$tdir/dummy
266 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
268 #define OBD_FAIL_FID_NOLMA 0x1506
269 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
270 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
271 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
272 mdd.${MDT_DEV}.lfsck_namespace |
273 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
275 error "(4) unexpected status"
# Prefer dirent_repaired; fall back to updated_phase1 when it is absent.
278 local repaired=$($SHOW_NAMESPACE |
279 awk '/^dirent_repaired/ { print $2 }')
280 # for interop with old server
281 [ -z "$repaired" ] &&
282 repaired=$($SHOW_NAMESPACE |
283 awk '/^updated_phase1/ { print $2 }')
285 [ $repaired -eq 1 ] ||
286 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
# Clear the fail_loc before the e2fsck cross-check and client remount.
288 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
289 run_e2fsck_on_mds_facet $SINGLEMDS
291 mount_client $MOUNT || error "(6) Fail to start client!"
# stat the file while fail_loc 0x1505 (OBD_FAIL_FID_LOOKUP) is set;
# a failure is reported as "no FID-in-LMA".
293 #define OBD_FAIL_FID_LOOKUP 0x1505
294 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
295 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
297 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
299 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
# Test 1c: repair of a lost FID-in-dirent.
# "dummy" is created under fail_loc 0x1504 (OBD_FAIL_FID_IGIF); a reset
# namespace LFSCK must then complete and report exactly one repair.
304 #define OBD_FAIL_FID_IGIF 0x1504
305 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
306 touch $DIR/$tdir/dummy
308 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
310 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
311 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
312 mdd.${MDT_DEV}.lfsck_namespace |
313 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
315 error "(4) unexpected status"
# Prefer dirent_repaired; fall back to updated_phase1 when it is absent.
318 local repaired=$($SHOW_NAMESPACE |
319 awk '/^dirent_repaired/ { print $2 }')
320 # for interop with old server
321 [ -z "$repaired" ] &&
322 repaired=$($SHOW_NAMESPACE |
323 awk '/^updated_phase1/ { print $2 }')
325 [ $repaired -eq 1 ] ||
326 error "(5) Fail to repair lost FID-in-dirent: $repaired"
328 run_e2fsck_on_mds_facet $SINGLEMDS
330 mount_client $MOUNT || error "(6) Fail to start client!"
# List the directory while fail_loc 0x1505 (OBD_FAIL_FID_LOOKUP) is set.
332 #define OBD_FAIL_FID_LOOKUP 0x1505
333 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
334 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
336 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
338 run_test 1c "LFSCK can find out and repair lost FID-in-dirent"
# Test 1d: fix a mismatch between a directory's FID in its LMA and the parent
# FID recorded in the linkEA of its children.
# Needs MDS >= 2.13.57 and at least 2 MDTs. Children of $tdir are a regular
# file, a local subdirectory and a directory created on MDT index 1.
341 [ $MDS1_VERSION -lt $(version_code 2.13.57) ] &&
342 skip "MDS older than 2.13.57"
343 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
347 touch $DIR/$tdir/$tfile
348 mkdir $DIR/$tdir/subdir
349 $LFS mkdir -i 1 $DIR/$tdir/remotedir
# Print the parent FID and each child's linkEA before the corruption.
350 $LFS path2fid $DIR/$tdir
351 ll_decode_linkea $DIR/$tdir/$tfile
352 ll_decode_linkea $DIR/$tdir/subdir
353 ll_decode_linkea $DIR/$tdir/remotedir
355 local mntpt=$(facet_mntpt mds1)
357 # unlink OI files to remove the stale entry
358 local saved_opts=$MDS_MOUNT_OPTS
361 mount_fstype mds1 $mntpt
362 # Increase the $tdir FID oid in its LMA: dump trusted.lma as hex, turn the
# "0" that precedes the last 8 characters of the value line into "1", and
# write the attribute back with setfattr --restore.
363 do_facet mds1 "getfattr -d -m trusted.lma -e hex \
364 --absolute-names $mntpt/ROOT/$tdir | \
365 sed -E 's/0(.{8})$/1\1/' | setfattr --restore=-"
366 unmount_fstype mds1 $mntpt
369 # The FID oid in the LMA was increased above and is not in the OI table.
370 # Run scrub first to generate the OI mapping so that the following namespace
371 # check can fix the linkEA correctly; this is not normally necessary.
372 do_facet mds1 $LCTL lfsck_start -M ${MDT_DEV} -t scrub ||
373 error "failed to start LFSCK for scrub!"
374 wait_update_facet mds1 "$LCTL get_param -n \
375 osd-*.$(facet_svc mds1).oi_scrub |
376 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
377 error "unexpected status"
# Namespace LFSCK with -r -A, waited for on the mds1 facet.
379 $START_NAMESPACE -r -A || error "fail to start LFSCK for namespace!"
380 wait_update_facet mds1 "$LCTL get_param -n \
381 mdd.${MDT_DEV}.lfsck_namespace |
382 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
384 error "unexpected status"
# Print the same information again after the repair.
386 $LFS path2fid $DIR/$tdir
387 ll_decode_linkea $DIR/$tdir/$tfile
388 ll_decode_linkea $DIR/$tdir/subdir
389 ll_decode_linkea $DIR/$tdir/remotedir
# Every child's linkEA pfid (third field of the "pfid" line) must now equal
# the FID reported for $tdir.
394 fid=$($LFS path2fid $DIR/$tdir)
395 for f in $tfile subdir remotedir; do
396 pfid=$(ll_decode_linkea $DIR/$tdir/$f |
397 awk '/pfid/ { print $3 }')
399 [ "$pfid" == "$fid" ] || error "$fid in LMA != $pfid in linkea"
402 run_test 1d "LFSCK can fix mismatch of FID in LMA and FID in child linkea"
# Test 2a: repair of a crashed linkEA entry.
# "dummy" is created under fail_loc 0x1603 (OBD_FAIL_LFSCK_LINKEA_CRASH); a
# reset namespace LFSCK must complete with exactly one linkEA repair, after
# which the file has one link and fid2path resolves back to its path.
407 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
408 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
409 touch $DIR/$tdir/dummy
411 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
413 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
414 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
415 mdd.${MDT_DEV}.lfsck_namespace |
416 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
418 error "(4) unexpected status"
# Prefer linkea_repaired; fall back to updated_phase2 when it is absent.
421 local repaired=$($SHOW_NAMESPACE |
422 awk '/^linkea_repaired/ { print $2 }')
423 # for interop with old server
424 [ -z "$repaired" ] &&
425 repaired=$($SHOW_NAMESPACE |
426 awk '/^updated_phase2/ { print $2 }')
428 [ $repaired -eq 1 ] ||
429 error "(5) Fail to repair crashed linkEA: $repaired"
431 run_e2fsck_on_mds_facet $SINGLEMDS
433 mount_client $MOUNT || error "(6) Fail to start client!"
# stat output must contain "Links: 1".
435 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
436 error "(7) Fail to stat $DIR/$tdir/dummy"
# Round trip path -> FID -> path must give back the original path.
438 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
439 local dummyname=$($LFS fid2path $DIR $dummyfid)
440 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
441 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
443 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
# Test 2b: removal of an invalid linkEA entry.
# "dummy" is created under fail_loc 0x1604 (OBD_FAIL_LFSCK_LINKEA_MORE); a
# reset namespace LFSCK must complete with updated_phase2 equal to 1, after
# which the file has one link and fid2path resolves back to its path.
449 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
450 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
451 touch $DIR/$tdir/dummy
453 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
455 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
456 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
457 mdd.${MDT_DEV}.lfsck_namespace |
458 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
460 error "(4) unexpected status"
463 local repaired=$($SHOW_NAMESPACE |
464 awk '/^updated_phase2/ { print $2 }')
465 [ $repaired -eq 1 ] ||
466 error "(5) Fail to repair crashed linkEA: $repaired"
468 run_e2fsck_on_mds_facet $SINGLEMDS
470 mount_client $MOUNT || error "(6) Fail to start client!"
# stat output must contain "Links: 1".
472 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
473 error "(7) Fail to stat $DIR/$tdir/dummy"
# Round trip path -> FID -> path must give back the original path.
475 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
476 local dummyname=$($LFS fid2path $DIR $dummyfid)
477 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
478 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
480 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
# Test 2c: removal of a repeated linkEA entry.
# "dummy" is created under fail_loc 0x1605 (OBD_FAIL_LFSCK_LINKEA_MORE2); a
# reset namespace LFSCK must complete with updated_phase2 equal to 1.
# NOTE(review): the strict '>' also skips an MDS at exactly 2.4.90, although
# the message says "older than" - confirm which is intended.
484 (( $MDS1_VERSION > $(version_code 2.4.90) )) ||
485 skip "MDS older than 2.4.90"
489 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
490 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
491 touch $DIR/$tdir/dummy
493 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
495 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
496 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
497 mdd.${MDT_DEV}.lfsck_namespace |
498 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
500 error "(4) unexpected status"
503 local repaired=$($SHOW_NAMESPACE |
504 awk '/^updated_phase2/ { print $2 }')
505 [ $repaired -eq 1 ] ||
506 error "(5) Fail to repair crashed linkEA: $repaired"
508 run_e2fsck_on_mds_facet $SINGLEMDS
510 mount_client $MOUNT || error "(6) Fail to start client!"
# stat output must contain "Links: 1".
512 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
513 error "(7) Fail to stat $DIR/$tdir/dummy"
# Round trip path -> FID -> path must give back the original path.
515 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
516 local dummyname=$($LFS fid2path $DIR $dummyfid)
517 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
518 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
520 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
# Test 2d: recovery of a missing linkEA entry.
# "dummy" is created under fail_loc 0x161d (OBD_FAIL_LFSCK_NO_LINKEA); a
# reset namespace LFSCK must complete with linkea_repaired equal to 1.
# NOTE(review): the strict '>' also skips an MDS at exactly 2.6.50, although
# the message says "older than" - confirm which is intended.
524 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
525 skip "MDS older than 2.6.50, LU-4788"
529 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
530 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
531 touch $DIR/$tdir/dummy
533 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
535 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
536 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
537 mdd.${MDT_DEV}.lfsck_namespace |
538 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
540 error "(4) unexpected status"
543 local repaired=$($SHOW_NAMESPACE |
544 awk '/^linkea_repaired/ { print $2 }')
545 [ $repaired -eq 1 ] ||
546 error "(5) Fail to repair crashed linkEA: $repaired"
548 run_e2fsck_on_mds_facet $SINGLEMDS
550 mount_client $MOUNT || error "(6) Fail to start client!"
# stat output must contain "Links: 1".
552 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
553 error "(7) Fail to stat $DIR/$tdir/dummy"
# Round trip path -> FID -> path must give back the original path.
555 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
556 local dummyname=$($LFS fid2path $DIR $dummyfid)
557 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
558 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
560 run_test 2d "LFSCK can recover the missing linkEA entry"
# Test 2e: namespace LFSCK verifies the linkEA of a remote object.
# Needs at least 2 MDTs. d0 is created on MDT index 1; d0/d1 is created on
# MDT index 0 while fail_loc 0x1603 (OBD_FAIL_LFSCK_LINKEA_CRASH) is set.
# NOTE(review): the strict '>' also skips an MDS at exactly 2.6.50, although
# the message says "older than" - confirm which is intended.
564 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
565 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
566 skip "MDS older than 2.6.50, LU-5511"
570 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
572 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
573 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
574 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
575 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Start with -r -A, then require every MDT to report the namespace component
# as completed (error tag 4).
577 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
579 wait_all_targets_blocked namespace completed 4
581 local repaired=$($SHOW_NAMESPACE |
582 awk '/^linkea_repaired/ { print $2 }')
583 [ $repaired -eq 1 ] ||
584 error "(5) Fail to repair crashed linkEA: $repaired"
# Round trip path -> FID -> path must give back the original path.
586 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
587 local name=$($LFS fid2path $DIR $fid)
588 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
589 error "(6) Fail to repair linkEA: $fid $name"
591 run_test 2e "namespace LFSCK can verify remote object linkEA"
# Test 3: verification of multiple-linked (hard-linked) objects.
# Two clean hard links are made into "dummy"; two more are made inside
# "edir", one under fail_loc 0x1603 and one under fail_loc 0x1604. A reset
# namespace LFSCK must then report checked_phase2 >= 4 and
# multiple_linked_repaired >= 2.
# NOTE(review): the strict '>' also skips an MDS at exactly 2.6.50, although
# the message says "older than" - confirm which is intended.
595 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
596 skip "MDS older than 2.6.50, LU-4788"
600 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
601 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
602 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
604 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
605 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
606 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
608 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
609 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
610 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
612 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
613 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
614 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
616 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
618 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
619 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
620 mdd.${MDT_DEV}.lfsck_namespace |
621 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
623 error "(10) unexpected status"
626 local checked=$($SHOW_NAMESPACE |
627 awk '/^checked_phase2/ { print $2 }')
628 [ $checked -ge 4 ] ||
629 error "(11) Fail to check multiple-linked object: $checked"
631 local repaired=$($SHOW_NAMESPACE |
632 awk '/^multiple_linked_repaired/ { print $2 }')
633 [ $repaired -ge 2 ] ||
634 error "(12) Fail to repair multiple-linked object: $repaired"
636 run_test 3 "LFSCK can verify multiple-linked objects"
# Test 4: FID-in-dirent is rebuilt after an MDT file-level backup/restore
# (ldiskfs only; skipped otherwise).
# The client is unmounted and the MDS stopped, mds_backup_restore is run, and
# the MDS is restarted with noscrub. The namespace LFSCK is expected to show
# flags "inconsistent" while in scanning-phase1, then to complete with empty
# flags and at least 9 repairs.
640 [ "$mds1_FSTYPE" != ldiskfs ] &&
641 skip "OI Scrub not implemented for ZFS"
644 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
645 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop $SINGLEMDS!"
647 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
648 echo "start $SINGLEMDS with disabling OI scrub"
649 start_facet $SINGLEMDS "$MOUNT_OPTS_NOSCRUB" 2
651 #define OBD_FAIL_LFSCK_DELAY2 0x1601
652 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
653 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
# Wait on the "flags" field here, not on "status".
654 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
655 mdd.${MDT_DEV}.lfsck_namespace |
656 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
658 error "(5) unexpected status"
661 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
662 [ "$STATUS" == "scanning-phase1" ] ||
663 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the fail_loc/fail_val and wait for completion.
665 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
666 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
667 mdd.${MDT_DEV}.lfsck_namespace |
668 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
670 error "(7) unexpected status"
673 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
674 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
# Prefer dirent_repaired; fall back to updated_phase1 when it is absent.
676 local repaired=$($SHOW_NAMESPACE |
677 awk '/^dirent_repaired/ { print $2 }')
678 # for interop with old server
679 [ -z "$repaired" ] &&
680 repaired=$($SHOW_NAMESPACE |
681 awk '/^updated_phase1/ { print $2 }')
683 [ $repaired -ge 9 ] ||
684 error "(9) Fail to re-generate FID-in-dirent: $repaired"
686 run_e2fsck_on_mds_facet $SINGLEMDS
688 mount_client $MOUNT || error "(10) Fail to start client!"
# List the directory while fail_loc 0x1505 (OBD_FAIL_FID_LOOKUP) is set.
690 #define OBD_FAIL_FID_LOOKUP 0x1505
691 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
692 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
693 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
695 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
# Test 5: handling of IGIF object upgrading (ldiskfs only; skipped otherwise).
# Same flow as the backup/restore test, except that mds_backup_restore gets
# an extra argument "1", the expected flags are "inconsistent,upgrade", at
# least 2 repairs are required, and the final checks also cover stat and the
# path -> FID -> path round trip of "dummy".
699 [ "$mds1_FSTYPE" != ldiskfs ] &&
700 skip "OI Scrub not implemented for ZFS"
703 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
704 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop $SINGLEMDS!"
706 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
707 echo "start $SINGLEMDS with disabling OI scrub"
708 start_facet $SINGLEMDS "$MOUNT_OPTS_NOSCRUB" 2
710 #define OBD_FAIL_LFSCK_DELAY2 0x1601
711 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
712 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
# Wait on the "flags" field here, not on "status".
713 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
714 mdd.${MDT_DEV}.lfsck_namespace |
715 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
717 error "(5) unexpected status"
720 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
721 [ "$STATUS" == "scanning-phase1" ] ||
722 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the fail_loc/fail_val and wait for completion.
724 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
725 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
726 mdd.${MDT_DEV}.lfsck_namespace |
727 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
729 error "(7) unexpected status"
732 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
733 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
# Prefer dirent_repaired; fall back to updated_phase1 when it is absent.
735 local repaired=$($SHOW_NAMESPACE |
736 awk '/^dirent_repaired/ { print $2 }')
737 # for interop with old server
738 [ -z "$repaired" ] &&
739 repaired=$($SHOW_NAMESPACE |
740 awk '/^updated_phase1/ { print $2 }')
742 [ $repaired -ge 2 ] ||
743 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
745 run_e2fsck_on_mds_facet $SINGLEMDS
747 mount_client $MOUNT || error "(10) Fail to start client!"
# stat and ls are both run while fail_loc 0x1505 (OBD_FAIL_FID_LOOKUP) is set.
749 #define OBD_FAIL_FID_LOOKUP 0x1505
750 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
751 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
753 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
755 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Round trip path -> FID -> path must give back the original path.
756 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
757 local dummyname=$($LFS fid2path $DIR $dummyfid)
758 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
759 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
761 run_test 5 "LFSCK can handle IGIF object upgrading"
# Test 6a: LFSCK resumes from its last checkpoint (1).
# A reset run is started under fail_loc 0x1600, forced into status "failed"
# with fail_loc 0x80001608, and its last_checkpoint_position is recorded
# (POS0). A restart without -r must then report a latest_start_position
# (POS1) that is greater than POS0, and finally run to completion.
766 #define OBD_FAIL_LFSCK_DELAY1 0x1600
767 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
768 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
770 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
771 [ "$STATUS" == "scanning-phase1" ] ||
772 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
774 # Sleep 3 sec to guarantee at least one object processed by LFSCK
776 # Fail the LFSCK to guarantee there is at least one checkpoint
777 #define OBD_FAIL_LFSCK_FATAL1 0x1608
778 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
779 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
780 mdd.${MDT_DEV}.lfsck_namespace |
781 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
783 error "(4) unexpected status"
786 local POS0=$($SHOW_NAMESPACE |
787 awk '/^last_checkpoint_position/ { print $2 }' |
790 #define OBD_FAIL_LFSCK_DELAY1 0x1600
791 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
792 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
794 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
795 [ "$STATUS" == "scanning-phase1" ] ||
796 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
798 local POS1=$($SHOW_NAMESPACE |
799 awk '/^latest_start_position/ { print $2 }' |
801 [[ $POS0 -lt $POS1 ]] ||
802 error "(7) Expect larger than: $POS0, but got $POS1"
# Clear the fail_loc/fail_val and let the resumed run complete.
804 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
805 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
806 mdd.${MDT_DEV}.lfsck_namespace |
807 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
809 error "(8) unexpected status"
812 run_test 6a "LFSCK resumes from last checkpoint (1)"
# Test 6b: LFSCK resumes from its last checkpoint (2).
# Like the first checkpoint test, but using fail_loc 0x1601 and the fatal
# fail_loc 0x80001609, and comparing two parts of the position line: field 2
# (O_POS*) and field 4 (D_POS*). When either D_POS value is "N/A" or "0x0"
# the O_POS values are compared; the D_POS values are compared by the other
# check (7.2).
817 #define OBD_FAIL_LFSCK_DELAY2 0x1601
818 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
819 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
821 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
822 [ "$STATUS" == "scanning-phase1" ] ||
823 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
825 # Sleep 5 sec to guarantee that we are in the directory scanning
827 # Fail the LFSCK to guarantee there is at least one checkpoint
828 #define OBD_FAIL_LFSCK_FATAL2 0x1609
829 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
830 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
831 mdd.${MDT_DEV}.lfsck_namespace |
832 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
834 error "(4) unexpected status"
837 local O_POS0=$($SHOW_NAMESPACE |
838 awk '/^last_checkpoint_position/ { print $2 }' |
841 local D_POS0=$($SHOW_NAMESPACE |
842 awk '/^last_checkpoint_position/ { print $4 }')
844 #define OBD_FAIL_LFSCK_DELAY2 0x1601
845 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
846 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
848 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
849 [ "$STATUS" == "scanning-phase1" ] ||
850 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
852 local O_POS1=$($SHOW_NAMESPACE |
853 awk '/^latest_start_position/ { print $2 }' |
855 local D_POS1=$($SHOW_NAMESPACE |
856 awk '/^latest_start_position/ { print $4 }')
858 echo "Additional debug for 6b"
860 if [ "$D_POS0" == "N/A" -o "$D_POS0" == "0x0" \
861 -o "$D_POS1" == "0x0" -o "$D_POS1" == "N/A" ]; then
862 [[ $O_POS0 -lt $O_POS1 ]] ||
863 error "(7.1) $O_POS1 is not larger than $O_POS0"
865 [[ $D_POS0 -lt $D_POS1 ]] ||
866 error "(7.2) $D_POS1 is not larger than $D_POS0"
# Clear the fail_loc/fail_val and let the resumed run complete.
869 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
870 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
871 mdd.${MDT_DEV}.lfsck_namespace |
872 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
874 error "(8) unexpected status"
877 run_test 6b "LFSCK resumes from last checkpoint (2)"
# Test 7a: a non-stopped LFSCK restarts by itself after an MDS remount (1).
# The run is started under fail_loc 0x1601 and checked to be in
# scanning-phase1; the MDS is then stopped without stopping LFSCK and started
# again with $MOUNT_OPTS_SCRUB. No lfsck_start is issued after the restart,
# yet the status is expected to reach "completed" (limit 30).
884 #define OBD_FAIL_LFSCK_DELAY2 0x1601
885 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
886 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
888 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
889 [ "$STATUS" == "scanning-phase1" ] ||
890 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
892 # Sleep 3 sec to guarantee at least one object processed by LFSCK
894 echo "stop $SINGLEMDS"
895 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop $SINGLEMDS!"
# The fail_loc/fail_val are cleared between the stop and the start.
897 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
898 echo "start $SINGLEMDS"
899 start_facet $SINGLEMDS "$MOUNT_OPTS_SCRUB" 5
901 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
902 mdd.${MDT_DEV}.lfsck_namespace |
903 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
905 error "(6) unexpected status"
908 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
# Test 7b: a non-stopped LFSCK restarts by itself after an MDS remount (2).
# Twenty files dummy0..dummy19 are created under fail_loc 0x1604
# (OBD_FAIL_LFSCK_LINKEA_MORE). The run is started under fail_loc 0x1602 and
# awaited until scanning-phase2; the MDS is then stopped and started again
# with $MOUNT_OPTS_SCRUB, and the status is expected to reach "completed"
# (limit 30) without a new lfsck_start.
914 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
915 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
916 for ((i = 0; i < 20; i++)); do
917 touch $DIR/$tdir/dummy${i}
920 #define OBD_FAIL_LFSCK_DELAY3 0x1602
921 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
922 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
923 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
924 mdd.${MDT_DEV}.lfsck_namespace |
925 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
927 error "(4) unexpected status"
931 echo "stop $SINGLEMDS"
932 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop $SINGLEMDS!"
# The fail_loc/fail_val are cleared between the stop and the start.
934 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
935 echo "start $SINGLEMDS"
936 start_facet $SINGLEMDS "$MOUNT_OPTS_SCRUB" 6
938 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
939 mdd.${MDT_DEV}.lfsck_namespace |
940 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
942 error "(7) unexpected status"
945 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
956 formatall > /dev/null
962 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
963 [ "$STATUS" == "init" ] ||
964 namespace_error "(2) Expect 'init', but got '$STATUS'"
966 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
967 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
968 mkdir $DIR/$tdir/crashed
970 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
971 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
972 for ((i = 0; i < 5; i++)); do
973 touch $DIR/$tdir/dummy${i}
976 umount_client $MOUNT || error "(3) Fail to stop client!"
978 #define OBD_FAIL_LFSCK_DELAY2 0x1601
979 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
981 namespace_error "(4) Fail to start LFSCK for namespace!"
983 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
984 [ "$STATUS" == "scanning-phase1" ] ||
985 namespace_error "(5) Expect 'scanning-phase1', but got '$STATUS'"
987 $STOP_LFSCK || namespace_error "(6) Fail to stop LFSCK!"
989 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
990 [ "$STATUS" == "stopped" ] ||
991 namespace_error "(7) Expect 'stopped', but got '$STATUS'"
994 namespace_error "(8) Fail to start LFSCK for namespace!"
996 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
997 [ "$STATUS" == "scanning-phase1" ] ||
998 namespace_error "(9) Expect 'scanning-phase1', but got '$STATUS'"
1000 #define OBD_FAIL_LFSCK_FATAL2 0x1609
1001 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
1002 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1003 mdd.${MDT_DEV}.lfsck_namespace |
1004 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
1006 namespace_error "(10) unexpected status"
1009 #define OBD_FAIL_LFSCK_DELAY1 0x1600
1010 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
1012 namespace_error "(11) Fail to start LFSCK for namespace!"
1014 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1015 [ "$STATUS" == "scanning-phase1" ] ||
1016 namespace_error "(12) Expect 'scanning-phase1', but got '$STATUS'"
1018 #define OBD_FAIL_LFSCK_CRASH 0x160a
1019 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
1022 echo "stop $SINGLEMDS"
1023 stop $SINGLEMDS > /dev/null || namespace_error "(13) Fail to stop MDS!"
1025 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
1026 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
1028 echo "start $SINGLEMDS"
1029 start_facet $SINGLEMDS "$MOUNT_OPTS_SCRUB" 14
1031 local timeout=$(max_recovery_time)
1034 while [ $timer -lt $timeout ]; do
1035 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1036 mdt.${MDT_DEV}.recovery_status |
1037 awk '/^status/ { print \\\$2 }'")
1038 [ "$STATUS" != "RECOVERING" ] && break;
1040 timer=$((timer + 1))
1043 [ $timer != $timeout ] || (
1044 do_facet $SINGLEMDS "$LCTL get_param -n \
1045 mdt.${MDT_DEV}.recovery_status"
1046 error "(14.1) recovery timeout"
1049 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1050 [ "$STATUS" == "crashed" ] ||
1051 namespace_error "(15) Expect 'crashed', but got '$STATUS'"
1053 #define OBD_FAIL_LFSCK_DELAY2 0x1601
1054 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
1056 namespace_error "(16) Fail to start LFSCK for namespace!"
1058 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1059 [ "$STATUS" == "scanning-phase1" ] ||
1060 namespace_error "(17) Expect 'scanning-phase1', but got '$STATUS'"
1062 echo "stop $SINGLEMDS"
1063 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop $SINGLEMDS!"
1065 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
1066 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
1068 echo "start $SINGLEMDS"
1069 start_facet $SINGLEMDS "$MOUNT_OPTS_SCRUB" 19
1072 while [ $timer -lt $timeout ]; do
1073 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1074 mdt.${MDT_DEV}.recovery_status |
1075 awk '/^status/ { print \\\$2 }'")
1076 [ "$STATUS" != "RECOVERING" ] && break;
1078 timer=$((timer + 1))
1081 [ $timer != $timeout ] || (
1082 do_facet $SINGLEMDS "$LCTL get_param -n \
1083 mdt.${MDT_DEV}.recovery_status"
1084 error "(19.1) recovery timeout"
1087 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1088 [ "$STATUS" == "paused" ] ||
1089 namespace_error "(20) Expect 'paused', but got '$STATUS'"
1091 echo "stop $SINGLEMDS"
1092 stop $SINGLEMDS > /dev/null || error "(20.1) Fail to stop MDS!"
1094 echo "start $SINGLEMDS without resume LFSCK"
1095 start_facet $SINGLEMDS "$MOUNT_OPTS_SKIP_LFSCK" 20.2
1098 while [ $timer -lt $timeout ]; do
1099 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1100 mdt.${MDT_DEV}.recovery_status |
1101 awk '/^status/ { print \\\$2 }'")
1102 [ "$STATUS" != "RECOVERING" ] && break;
1104 timer=$((timer + 1))
1107 [ $timer != $timeout ] || (
1108 do_facet $SINGLEMDS "$LCTL get_param -n \
1109 mdt.${MDT_DEV}.recovery_status"
1110 error "(20.3) recovery timeout"
1113 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1114 [ "$STATUS" == "paused" ] ||
1115 namespace_error "(20.4) Expect 'paused', but got '$STATUS'"
1117 #define OBD_FAIL_LFSCK_DELAY3 0x1602
1118 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
1121 namespace_error "(21) Fail to start LFSCK for namespace!"
1122 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1123 mdd.${MDT_DEV}.lfsck_namespace |
1124 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
1126 namespace_error "(22) unexpected status"
1129 # wait to process one inode at least (OBD_FAIL_LFSCK_DELAY3)
1132 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1133 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
1134 namespace_error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
1136 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1137 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1138 mdd.${MDT_DEV}.lfsck_namespace |
1139 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1141 namespace_error "(24) unexpected status"
1144 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1146 namespace_error "(25) Expect empty flags, but got '$FLAGS'"
1148 run_test 8 "LFSCK state machine"
# test_9a: layout LFSCK speed control, checked through average_speed_phase1.
# NOTE(review): this listing omits some lines of the function (its header,
# the closing fi/else/brace lines, the waits between steps, and the
# assignments of RUN_TIME1, RUN_TIME2 and TIME_DIFF); confirm them against
# the complete file before relying on the comments below.
# Skip when /proc/cpuinfo has no line matching the pattern processor.*: 1
1151 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1152 skip "Testing on UP system, the speed may be inaccurate."
# Populate a directory created with lfs mkdir -i 0 with 5000 files.
1156 check_mount_and_prep
1157 $LFS mkdir -i 0 $DIR/$tdir/lfsck || error "(1) Fail to mkdir lfsck"
1158 $LFS setstripe -c 1 -i -1 $DIR/$tdir/lfsck
1159 createmany -o $DIR/$tdir/lfsck/f 5000
# Stage 1: start layout LFSCK with -r -s BASE_SPEED1 and expect it to still
# be in scanning-phase1 when the status is sampled.
1161 local BASE_SPEED1=100
1163 $START_LAYOUT -r -s $BASE_SPEED1 || error "(2) Fail to start LFSCK!"
1166 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
1167 [ "$STATUS" == "scanning-phase1" ] ||
1168 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
1170 local SPEED=$($SHOW_LAYOUT |
1171 awk '/^average_speed_phase1/ { print $2 }')
1173 # There may be time error, normally it should be less than 2 seconds.
1174 # We allow another 20% schedule error.
1176 # MAX_MARGIN = 1.3 = 13 / 10
# Upper bound for stage 1, evaluated left to right in integer arithmetic:
# BASE_SPEED1 scaled by (RUN_TIME1 + TIME_DIFF) / RUN_TIME1, then by 13 / 10.
1177 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) /
1178 RUN_TIME1 * 13 / 10))
1179 [ $SPEED -lt $MAX_SPEED ] || {
1181 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1182 error "(4) Speed $SPEED, expected < $MAX_SPEED"
1185 # adjust speed limit
# Stage 2: raise the limit on the running scan through lfsck_speed_limit.
1186 local BASE_SPEED2=300
1188 do_facet $SINGLEMDS \
1189 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1192 SPEED=$($SHOW_LAYOUT | awk '/^average_speed_phase1/ { print $2 }')
1193 # MIN_MARGIN = 0.7 = 7 / 10
# Lower bound: both stages with TIME_DIFF subtracted from each run time,
# divided by the total run time, then scaled by 7 / 10.
1194 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) +
1195 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) /
1196 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
# A speed at or below MIN_SPEED is reported through error_ignore LU-5624
# when the MDS backend is not ldiskfs. NOTE(review): the else branch that
# should precede the (5.2) message is not visible here.
1197 [ $SPEED -gt $MIN_SPEED ] || {
1198 if [ $mds1_FSTYPE != ldiskfs ]; then
1199 error_ignore LU-5624 \
1200 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
1203 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
1207 # MAX_MARGIN = 1.3 = 13 / 10
# Upper bound over both stages, same shape as MIN_SPEED but with TIME_DIFF
# added and a 13 / 10 margin.
1208 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) +
1209 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) /
1210 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1211 [ $SPEED -lt $MAX_SPEED ] || {
1213 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1214 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1215 error "(6) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit to 0 on every MDT and OST node, then wait for the
# layout LFSCK status to become completed.
# NOTE(review): the trailing 30 is presumably a timeout in seconds.
1218 do_nodes $(comma_list $(mdts_nodes)) \
1219 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1220 do_nodes $(comma_list $(osts_nodes)) \
1221 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1223 wait_update_facet $SINGLEMDS \
1224 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout |
1225 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
1226 error "(7) Failed to get expected 'completed'"
1228 run_test 9a "LFSCK speed control (1)"
# test_9b: namespace LFSCK speed control, checked through
# average_speed_phase2.
# NOTE(review): this listing omits some lines of the function (header,
# initial preparation, closing done/fi/else/brace lines, the waits, and the
# assignments of RUN_TIME1, RUN_TIME2 and TIME_DIFF); confirm them against
# the complete file.
# Skip when /proc/cpuinfo has no line matching the pattern processor.*: 1
1231 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1232 skip "Testing on UP system, the speed may be inaccurate."
# Create 50 directories, 50 files, and 50 files inside each directory while
# fail_loc 0x1604 is set on the MDS.
1238 echo "Preparing another 50 * 50 files (with error) at $(date)."
1239 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1240 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1241 createmany -d $DIR/$tdir/d 50
1242 createmany -m $DIR/$tdir/f 50
1243 for ((i = 0; i < 50; i++)); do
1244 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
# Run a first namespace LFSCK with fail_loc 0x160c set and wait for its
# status to read stopped. The scan restarted below without -r is then
# expected to report scanning-phase2.
1247 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
1248 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1249 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
1250 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1251 mdd.${MDT_DEV}.lfsck_namespace |
1252 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1254 error "(5) unexpected status"
1257 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1258 echo "Prepared at $(date)."
# Stage 1: restart with -s BASE_SPEED1 and sample the phase 2 speed.
1260 local BASE_SPEED1=50
1262 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
1265 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1266 [ "$STATUS" == "scanning-phase2" ] ||
1267 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
1269 local SPEED=$($SHOW_NAMESPACE |
1270 awk '/^average_speed_phase2/ { print $2 }')
1271 # There may be time error, normally it should be less than 2 seconds.
1272 # We allow another 20% schedule error.
1274 # MAX_MARGIN = 1.3 = 13 / 10
# Upper bound for stage 1 in integer arithmetic: BASE_SPEED1 scaled by
# (RUN_TIME1 + TIME_DIFF) / RUN_TIME1, then by 13 / 10.
1275 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) /
1276 RUN_TIME1 * 13 / 10))
1277 [ $SPEED -lt $MAX_SPEED ] || {
1279 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1280 error "(8) Speed $SPEED, expected < $MAX_SPEED"
1283 # adjust speed limit
# Stage 2: raise the limit on the running scan through lfsck_speed_limit.
1284 local BASE_SPEED2=150
1286 do_facet $SINGLEMDS \
1287 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1290 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
1291 # MIN_MARGIN = 0.7 = 7 / 10
# Lower bound over both stages: TIME_DIFF subtracted from each run time,
# divided by the total run time, then scaled by 7 / 10.
1292 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) +
1293 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) /
1294 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
# A speed at or below MIN_SPEED is reported through error_ignore LU-5624
# when the MDS backend is not ldiskfs. NOTE(review): the else branch that
# should precede the (9.2) message is not visible here.
1295 [ $SPEED -gt $MIN_SPEED ] || {
1296 if [ $mds1_FSTYPE != ldiskfs ]; then
1297 error_ignore LU-5624 \
1298 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1301 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
1305 # MAX_MARGIN = 1.3 = 13 / 10
1306 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) +
1307 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) /
1308 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1309 [ $SPEED -lt $MAX_SPEED ] || {
1311 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1312 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1313 error "(10) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit to 0 on every MDT and OST node, then wait for the
# namespace LFSCK status to become completed.
1316 do_nodes $(comma_list $(mdts_nodes)) \
1317 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1318 do_nodes $(comma_list $(osts_nodes)) \
1319 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1320 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1321 mdd.${MDT_DEV}.lfsck_namespace |
1322 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1324 error "(11) unexpected status"
1327 run_test 9b "LFSCK speed control (2)"
# test_10: client operations must keep working while a speed-limited
# namespace LFSCK is in scanning-phase1.
# NOTE(review): this listing omits some lines of the function (header,
# initial preparation, loop terminators, closing braces); confirm them
# against the complete file.
# Runs only when the MDS backend is ldiskfs.
1331 [[ $mds1_FSTYPE == ldiskfs ]] || skip "lookup(..)/linkea on ZFS issue"
1335 echo "Preparing more files with error at $(date)."
1336 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1337 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
# Even indexes 0..998 are created while fail_loc 0x1603 is set.
1339 for ((i = 0; i < 1000; i = $((i+2)))); do
1340 mkdir -p $DIR/$tdir/d${i}
1341 touch $DIR/$tdir/f${i}
1342 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1345 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1346 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
# Odd indexes 1..999 are created while fail_loc 0x1604 is set.
1348 for ((i = 1; i < 1000; i = $((i+2)))); do
1349 mkdir -p $DIR/$tdir/d${i}
1350 touch $DIR/$tdir/f${i}
1351 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1354 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1355 echo "Prepared at $(date)."
# Add a hard link to f200, which is unlinked by its original name below.
1357 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
# Remount the client before starting the scan.
1359 umount_client $MOUNT
1360 mount_client $MOUNT || error "(3) Fail to start client!"
# Start namespace LFSCK with -r -s 100 so that it is still in phase 1 while
# the operations below run.
1362 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1365 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1366 [ "$STATUS" == "scanning-phase1" ] ||
1367 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Exercise list, create, mkdir, unlink, recursive remove, rename, hard link
# and symlink; each one must succeed.
1369 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1371 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1373 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1375 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1377 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1379 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1381 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1383 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1384 error "(14) Fail to softlink!"
# The scan must still be in phase 1 after the operations.
1386 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1387 [ "$STATUS" == "scanning-phase1" ] ||
1388 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
# Set lfsck_speed_limit to 0 on every MDT and OST node and wait for the
# status to become completed.
1390 do_nodes $(comma_list $(mdts_nodes)) \
1391 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1392 do_nodes $(comma_list $(osts_nodes)) \
1393 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1394 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1395 mdd.${MDT_DEV}.lfsck_namespace |
1396 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1398 error "(16) unexpected status"
1401 run_test 10 "System is available during LFSCK scanning"
# ost_remove_lastid: mount the backing filesystem of ost${ost} locally,
# delete LAST_ID and d0/0 under O/${idx} on it, then unmount it again.
# Returns 1 when mount_fstype fails and 2 when unmount_fstype fails.
# NOTE(review): the assignments of ost and idx and the closing brace are not
# visible in this listing; the visible caller passes two positional
# arguments (ost_remove_lastid 1 0), presumably ost then idx.
1404 ost_remove_lastid() {
1407 local rcmd="do_facet ost${ost}"
1409 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1411 # step 1: local mount
1412 mount_fstype ost${ost} || return 1
1413 # step 2: remove the specified LAST_ID
# The facet_mntpt substitution and the brace expansion are both evaluated by
# the calling shell before rcmd runs, so rm receives two full paths.
1414 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
1416 unmount_fstype ost${ost} || return 2
# test_11a: layout LFSCK run on the OST rebuilds a LAST_ID file that was
# removed from disk.
# NOTE(review): this listing omits some lines of the function (header, the
# steps between file creation and ost_remove_lastid, closing braces);
# confirm them against the complete file.
# Requires an MDS version newer than 2.5.55.
1420 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1421 skip "MDS older than 2.5.55, LU-1267"
# Create 64 single-stripe files on OST index 0.
1423 check_mount_and_prep
1424 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1425 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
1430 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
# NOTE(review): MOUNT_OPTS_NOSCRUB is built from MDS_MOUNT_OPTS at the top
# of the file but is used here to start an OST; confirm this is intended.
1432 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1433 error "(2) Fail to start ost1"
# Set the delay fail_loc, with fail_val 3, before starting the scan so that
# the crashed_lastid flag can be observed.
1435 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1436 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1438 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1439 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
1441 wait_update_facet ost1 "$LCTL get_param -n \
1442 obdfilter.${OST_DEV}.lfsck_layout |
1443 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1445 error "(5) unexpected status"
# Clear the fail_loc and wait for the status to become completed.
1448 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1450 wait_update_facet ost1 "$LCTL get_param -n \
1451 obdfilter.${OST_DEV}.lfsck_layout |
1452 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1454 error "(6) unexpected status"
# The flags field must be empty once the scan has completed.
1457 echo "the LAST_ID(s) should have been rebuilt"
1458 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1459 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
1461 run_test 11a "LFSCK can rebuild lost last_id"
# test_11b: layout LFSCK run on the OST rebuilds an on-disk LAST_ID that is
# behind the object ids already used by the MDS.
# NOTE(review): this listing omits some lines of the function (header, loop
# body tail, closing braces); confirm them against the complete file.
# Requires an MDS version newer than 2.5.55.
1464 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1465 skip "MDS older than 2.5.55, LU-1267"
1467 check_mount_and_prep
1468 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1470 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1471 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1472 do_facet ost1 $LCTL set_param fail_loc=0x160d
# Create 32 more files than the count reported by precreated_ost_obj_count.
1474 local count=$(precreated_ost_obj_count 0 0)
1476 createmany -o $DIR/$tdir/f $((count + 32))
# Record the sequence and last id that the MDS-side osp device reports.
1478 local proc_path="${FSNAME}-OST0000-osc-MDT0000"
1479 local seq=$(do_facet mds1 $LCTL get_param -n \
1480 osp.${proc_path}.prealloc_last_seq)
1481 local id_used=$(do_facet mds1 $LCTL get_param -n \
1482 osp.${proc_path}.prealloc_last_id)
# Restart ost1 with fail_loc 0x215 set.
1484 umount_client $MOUNT
1485 stop ost1 || error "(1) Fail to stop ost1"
1487 #define OBD_FAIL_OST_ENOSPC 0x215
1488 do_facet ost1 $LCTL set_param fail_loc=0x215
1490 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1491 error "(2) Fail to start ost1"
# Poll up to 60 times for the last_id entry whose line matches seq.
# NOTE(review): the wait between polls and the loop terminator are not
# visible in this listing.
1493 for ((i = 0; i < 60; i++)); do
1494 id_ost1=$(do_facet ost1 \
1495 "$LCTL get_param -n obdfilter.$ost1_svc.last_id" |
1496 awk -F: "/$seq/ { print \$2 }")
1497 [ -n "$id_ost1" ] && break
1501 echo "the on-disk LAST_ID should be smaller than the expected one"
1502 [ $id_used -gt $id_ost1 ] ||
1503 error "(4) expect id_used '$id_used' > id_ost1 '$id_ost1'"
1505 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1506 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1508 wait_update_facet ost1 \
1509 "$LCTL get_param -n obdfilter.$ost1_svc.lfsck_layout |
1510 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1512 error "(6) unexpected status"
# Restart ost1 again so that last_id is read back after the repair.
1515 stop ost1 || error "(7) Fail to stop ost1"
1517 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1518 error "(8) Fail to start ost1"
1520 echo "the on-disk LAST_ID should have been rebuilt"
1521 # last_id may be larger than $id_used if objects were created/skipped
1522 wait_update_facet_cond ost1 \
1523 "$LCTL get_param -n obdfilter.$ost1_svc.last_id |
1524 awk -F: '/$seq/ { print \\\$2 }'" "-ge" "$id_used" 60 || {
1525 do_facet ost1 $LCTL get_param obdfilter.$ost1_svc.last_id
1526 error "(9) expect last_id >= id_used $seq:$id_used"
# Clear the fail_loc and stop all services.
1529 do_facet ost1 $LCTL set_param fail_loc=0
1530 stopall || error "(10) Fail to stopall"
1532 run_test 11b "LFSCK can rebuild crashed last_id"
# test_12a: a single lctl command with -A starts or stops LFSCK on all
# targets, first for the namespace type and then for the layout type.
# NOTE(review): this listing omits some lines of the function (header, loop
# terminators, closing brace). wait_all_targets and
# wait_all_targets_blocked are defined outside this chunk; their last
# argument is presumably the step number used in failure messages.
# Requires at least two MDTs and an MDS version newer than 2.5.55.
1535 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
1536 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1537 skip "MDS older than 2.5.55, LU-3950"
# Create one directory per MDT index and 100 files in each.
1539 check_mount_and_prep
1540 for k in $(seq $MDSCOUNT); do
1541 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1542 createmany -o $DIR/$tdir/${k}/f 100 ||
1543 error "(0) Fail to create 100 files."
1546 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1547 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1548 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1550 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1551 wait_all_targets namespace scanning-phase1 3
1553 echo "Stop namespace LFSCK on all targets by single lctl command."
1554 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1555 error "(4) Fail to stop LFSCK on all devices!"
1557 echo "All the LFSCK targets should be in 'stopped' status."
1558 wait_all_targets_blocked namespace stopped 5
1560 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1561 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1562 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1564 echo "All the LFSCK targets should be in 'completed' status."
1565 wait_all_targets_blocked namespace completed 7
# The layout half of the test runs with full debug logging enabled.
1567 start_full_debug_logging
1569 echo "Start layout LFSCK on all targets by single command (-s 1)."
1570 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1571 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1573 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1574 wait_all_targets layout scanning-phase1 9
1576 echo "Stop layout LFSCK on all targets by single lctl command."
1577 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1578 error "(10) Fail to stop LFSCK on all devices!"
1580 echo "All the LFSCK targets should be in 'stopped' status."
1581 wait_all_targets_blocked layout stopped 11
# Every OST must also report the stopped status for layout LFSCK.
1583 for k in $(seq $OSTCOUNT); do
1584 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1585 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1586 awk '/^status/ { print $2 }')
1587 [ "$STATUS" == "stopped" ] ||
1588 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1591 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1592 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1593 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1595 echo "All the LFSCK targets should be in 'completed' status."
1596 wait_all_targets_blocked layout completed 14
1598 stop_full_debug_logging
1600 run_test 12a "single command to trigger LFSCK on all devices"
# test_12b: lfsck_start without -M must pick the device automatically when
# -A is given, and must fail on a node with several MDT devices when
# neither -M nor -A is given.
# NOTE(review): this listing omits some lines of the function (header,
# closing fi and brace); confirm them against the complete file.
# Requires an MDS version newer than 2.5.55.
1603 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1604 skip "MDS older than 2.5.55, LU-3950"
1606 check_mount_and_prep
1608 echo "Start LFSCK without '-M' specified."
1609 do_facet mds1 $LCTL lfsck_start -A -r ||
1610 error "(0) Fail to start LFSCK without '-M'"
# Both the namespace and the layout scans are expected to complete.
1612 wait_all_targets_blocked namespace completed 1
1613 wait_all_targets_blocked layout completed 2
# Count the lctl dl lines whose third field contains mdt.
1615 local count=$(do_facet mds1 $LCTL dl |
1616 awk '{ print $3 }' | grep mdt | wc -l)
1617 if [ $count -gt 1 ]; then
1619 echo "Start layout LFSCK on the node with multipe targets,"
1620 echo "but not specify '-M'/'-A' option. Should get failure."
# Success of lfsck_start is the error case here; the trailing true keeps
# the exit status zero when lfsck_start fails as expected.
1622 do_facet mds1 $LCTL lfsck_start -t layout -r &&
1623 error "(3) Start layout LFSCK should fail" || true
1626 run_test 12b "auto detect Lustre device"
# test_13: layout LFSCK repairs files whose lmm_oi was corrupted through
# fail_loc 0x160f at creation time.
# NOTE(review): this listing omits some lines of the function (header and
# closing braces); confirm them against the complete file.
# Requires an MDS version newer than 2.5.55.
1629 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1630 skip "MDS older than 2.5.55, LU-3593"
1633 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1634 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1635 echo "MDT-object FID."
1638 check_mount_and_prep
# Two files are created while the fail_loc is set: one through createmany
# and the composite-layout file f1 through lfs setstripe.
1640 echo "Inject failure stub to simulate bad lmm_oi"
1641 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1642 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
1643 createmany -o $DIR/$tdir/f 1
1644 $LFS setstripe -E 1M -S 1M -E -1 $DIR/$tdir/f1 ||
1645 error "(0) Fail to create PFL $DIR/$tdir/f1"
1646 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1648 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1649 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1651 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1652 mdd.${MDT_DEV}.lfsck_layout |
1653 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1655 error "(2) unexpected status"
# The repaired_others counter must be exactly 2.
1658 local repaired=$($SHOW_LAYOUT |
1659 awk '/^repaired_others/ { print $2 }')
1660 [ $repaired -eq 2 ] ||
1661 error "(3) Fail to repair crashed lmm_oi: $repaired"
1663 run_test 13 "LFSCK can repair crashed lmm_oi"
# test_14a: layout LFSCK detects MDT-objects that reference missing
# OST-objects, and re-creates the OST-objects only when started with -c.
# NOTE(review): this listing omits some lines of the function (header, loop
# terminator, closing braces, and the step announced by the stopall
# message near the end); confirm them against the complete file.
# Requires an MDS version newer than 2.5.55.
1666 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1667 skip "MDS older than 2.5.55, LU-3590"
1670 echo "The OST-object referenced by the MDT-object should be there;"
1671 echo "otherwise, the LFSCK should re-create the missing OST-object."
1672 echo "without '--delay-create-ostobj' option."
1675 check_mount_and_prep
1676 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# With fail_loc 0x1610 set on ost1, create count + 16 plain files, guard0,
# 16 composite-layout files and guard1.
1678 echo "Inject failure stub to simulate dangling referenced MDT-object"
1679 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1680 do_facet ost1 $LCTL set_param fail_loc=0x1610
1681 local count=$(precreated_ost_obj_count 0 0)
1683 createmany -o $DIR/$tdir/f $((count + 16)) ||
1684 error "(0.1) Fail to create $DIR/$tdir/fx"
1685 touch $DIR/$tdir/guard0
1687 for ((i = 0; i < 16; i++)); do
1688 $LFS setstripe -E 512K -S 256K -o 0 -E 2M \
1689 $DIR/$tdir/f_comp${i} ||
1690 error "(0.2) Fail to create $DIR/$tdir/f_comp${i}"
1692 touch $DIR/$tdir/guard1
1694 do_facet ost1 $LCTL set_param fail_loc=0
1696 start_full_debug_logging
1698 # exhaust other pre-created dangling cases
1699 count=$(precreated_ost_obj_count 0 0)
1700 createmany -o $DIR/$tdir/a $count ||
1701 error "(0.5) Fail to create $count files."
1703 echo "'ls' should fail because of dangling referenced MDT-object"
1704 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
# First pass: started with -r only.
1706 echo "Trigger layout LFSCK to find out dangling reference"
1707 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1709 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1710 mdd.${MDT_DEV}.lfsck_layout |
1711 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1713 error "(3) unexpected status"
1716 local repaired=$($SHOW_LAYOUT |
1717 awk '/^repaired_dangling/ { print $2 }')
1718 [ $repaired -ge 32 ] ||
1719 error "(4) Fail to repair dangling reference: $repaired"
# After the first pass the guard files must still fail to stat.
1721 echo "'stat' should fail because of not repair dangling by default"
1722 stat $DIR/$tdir/guard0 > /dev/null 2>&1 &&
1723 error "(5.1) stat should fail"
1724 stat $DIR/$tdir/guard1 > /dev/null 2>&1 &&
1725 error "(5.2) stat should fail"
# Second pass: started with -r -c.
1727 echo "Trigger layout LFSCK to repair dangling reference"
1728 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1730 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1731 mdd.${MDT_DEV}.lfsck_layout |
1732 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1734 error "(7) unexpected status"
1737 # There may be some async LFSCK updates in processing, wait for
1738 # a while until the target reparation has been done. LU-4970.
# Poll stat until each guard file reports a Size of 0.
1740 echo "'stat' should success after layout LFSCK repairing"
1741 wait_update_facet client "stat $DIR/$tdir/guard0 |
1742 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1743 stat $DIR/$tdir/guard0
1745 error "(8.1) unexpected size"
1748 wait_update_facet client "stat $DIR/$tdir/guard1 |
1749 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1750 stat $DIR/$tdir/guard1
1752 error "(8.2) unexpected size"
1755 repaired=$($SHOW_LAYOUT |
1756 awk '/^repaired_dangling/ { print $2 }')
1757 [ $repaired -ge 32 ] ||
1758 error "(9) Fail to repair dangling reference: $repaired"
1760 stop_full_debug_logging
1762 echo "stopall to cleanup object cache"
1765 setupall > /dev/null
1767 run_test 14a "LFSCK can repair MDT-object with dangling LOV EA reference (1)"
# test_14b: same dangling-reference scenario as the 14a test, but layout
# LFSCK is started with the extra -o and -d options.
# NOTE(review): this listing omits some lines of the function (header,
# closing braces, and the step announced by the stopall message near the
# end); confirm them against the complete file.
# Requires an MDS version newer than 2.5.55.
1770 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1771 skip "MDS older than 2.5.55, LU-3590"
1774 echo "The OST-object referenced by the MDT-object should be there;"
1775 echo "otherwise, the LFSCK should re-create the missing OST-object."
1776 echo "with '--delay-create-ostobj' option."
1779 check_mount_and_prep
1780 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# With fail_loc 0x1610 set on ost1, create count + 31 files plus guard.
1782 echo "Inject failure stub to simulate dangling referenced MDT-object"
1783 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1784 do_facet ost1 $LCTL set_param fail_loc=0x1610
1785 local count=$(precreated_ost_obj_count 0 0)
1787 createmany -o $DIR/$tdir/f $((count + 31))
1788 touch $DIR/$tdir/guard
1789 do_facet ost1 $LCTL set_param fail_loc=0
1791 start_full_debug_logging
1793 # exhaust other pre-created dangling cases
1794 count=$(precreated_ost_obj_count 0 0)
1795 createmany -o $DIR/$tdir/a $count ||
1796 error "(0) Fail to create $count files."
1798 echo "'ls' should fail because of dangling referenced MDT-object"
1799 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
# First pass: started with -r -o -d.
1801 echo "Trigger layout LFSCK to find out dangling reference"
1802 $START_LAYOUT -r -o -d || error "(2) Fail to start LFSCK for layout!"
1804 wait_all_targets_blocked layout completed 3
1806 local repaired=$($SHOW_LAYOUT |
1807 awk '/^repaired_dangling/ { print $2 }')
1808 [ $repaired -ge 32 ] ||
1809 error "(4) Fail to repair dangling reference: $repaired"
# After the first pass the guard file must still fail to stat.
1811 echo "'stat' should fail because of not repair dangling by default"
1812 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
# Second pass: the same options plus -c.
1814 echo "Trigger layout LFSCK to repair dangling reference"
1815 $START_LAYOUT -r -o -c -d || error "(6) Fail to start LFSCK for layout!"
1817 wait_all_targets_blocked layout completed 7
1819 # There may be some async LFSCK updates in processing, wait for
1820 # a while until the target reparation has been done. LU-4970.
# Poll stat until the guard file reports a Size of 0.
1822 echo "'stat' should success after layout LFSCK repairing"
1823 wait_update_facet client "stat $DIR/$tdir/guard |
1824 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1825 stat $DIR/$tdir/guard
1827 error "(8) unexpected size"
1830 repaired=$($SHOW_LAYOUT |
1831 awk '/^repaired_dangling/ { print $2 }')
1832 [ $repaired -ge 32 ] ||
1833 error "(9) Fail to repair dangling reference: $repaired"
1835 stop_full_debug_logging
1837 echo "stopall to cleanup object cache"
1840 setupall > /dev/null
1842 run_test 14b "LFSCK can repair MDT-object with dangling LOV EA reference (2)"
# test_15a: layout LFSCK repairs OST-objects whose back pointer names an
# MDT-object that does not exist.
# NOTE(review): this listing omits some lines of the function (header, the
# target path of the setstripe command, closing braces); confirm them
# against the complete file.
# Requires an MDS version newer than 2.5.55.
1845 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1846 skip "MDS older than 2.5.55, LU-3591"
1849 echo "If the OST-object referenced by the MDT-object back points"
1850 echo "to some non-exist MDT-object, then the LFSCK should repair"
1851 echo "the OST-object to back point to the right MDT-object."
1854 check_mount_and_prep
1855 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# fail_loc 0x1611 is set on all OST nodes while f0 and the composite-layout
# file f1 are written.
1857 echo "Inject failure stub to make the OST-object to back point to"
1858 echo "non-exist MDT-object."
1859 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
1861 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
1862 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1863 $LFS setstripe -E 1M -S 256K -c 1 -E -1 -S 512K -c $OSTCOUNT \
1865 error "(0) Fail to create PFL $DIR/$tdir/f1"
1866 # 'dd' will trigger punch RPC firstly on every OST-objects.
1867 # So even though some OST-object will not be write by 'dd',
1868 # as long as it is allocated (may be NOT allocated in pfl_3b)
1869 # its layout information will be set also.
1870 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4K count=257
1871 cancel_lru_locks osc
1872 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1874 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1875 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1877 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1878 mdd.${MDT_DEV}.lfsck_layout |
1879 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1881 error "(2) unexpected status"
# The repaired_unmatched_pair counter must be at least 3.
1884 local repaired=$($SHOW_LAYOUT |
1885 awk '/^repaired_unmatched_pair/ { print $2 }')
1886 [ $repaired -ge 3 ] ||
1887 error "(3) Fail to repair unmatched pair: $repaired"
1889 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# test_15b: layout LFSCK repairs OST-objects whose back pointer names a
# different MDT-object that does not reference them.
# NOTE(review): this listing omits some lines of the function (header, the
# initial assignment of stripes, the target path of the setstripe command,
# closing braces); confirm them against the complete file.
# Requires an MDS version newer than 2.5.55.
1892 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1893 skip "MDS older than 2.5.55, LU-3591"
1896 echo "If the OST-object referenced by the MDT-object back points"
1897 echo "to other MDT-object that doesn't recognize the OST-object,"
1898 echo "then the LFSCK should repair it to back point to the right"
1899 echo "MDT-object (the first one)."
# Write the guard file before any fail_loc is set.
1902 check_mount_and_prep
1903 mkdir -p $DIR/$tdir/0
1904 $LFS setstripe -c 1 -i 0 $DIR/$tdir/0
1905 dd if=/dev/zero of=$DIR/$tdir/0/guard bs=1M count=1
1906 cancel_lru_locks osc
1908 echo "Inject failure stub to make the OST-object to back point to"
1909 echo "other MDT-object"
# Use two stripes for the first component when at least two OSTs exist.
1912 [ $OSTCOUNT -ge 2 ] && stripes=2
# fail_loc 0x1612 is set on all OST nodes while f0 and the composite-layout
# file f1 are written.
1914 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1915 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1612
1916 dd if=/dev/zero of=$DIR/$tdir/0/f0 bs=1M count=1
1917 $LFS setstripe -E 1M -S 256K -c $stripes -E 2M -S 512K -c 1 \
1919 error "(0) Fail to create PFL $DIR/$tdir/f1"
1920 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2
1921 cancel_lru_locks osc
1922 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1924 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1925 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1927 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1928 mdd.${MDT_DEV}.lfsck_layout |
1929 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1931 error "(2) unexpected status"
# The repaired_unmatched_pair counter must be exactly 4.
1934 local repaired=$($SHOW_LAYOUT |
1935 awk '/^repaired_unmatched_pair/ { print $2 }')
1936 [ $repaired -eq 4 ] ||
1937 error "(3) Fail to repair unmatched pair: $repaired"
1939 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# test_15c: layout LFSCK racing with a delayed directory migration must not
# treat the shared layout as a multiple-reference case.
# NOTE(review): this listing omits some lines of the function (header,
# steps around the background migrate, closing brace); confirm them
# against the complete file.
# Requires at least two MDTs and an MDS version strictly between 2.5.55
# and 2.7.55.
1942 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
1943 (( $MDS1_VERSION < $(version_code 2.7.55) )) ||
1944 skip "MDS newer than 2.7.55, LU-6475"
1945 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1946 skip "MDS older than 2.5.55, LU-3591"
1949 echo "According to current metadata migration implementation,"
1950 echo "before the old MDT-object is removed, both the new MDT-object"
1951 echo "and old MDT-object will reference the same LOV layout. Then if"
1952 echo "the layout LFSCK finds the new MDT-object by race, it will"
1953 echo "regard related OST-object(s) as multiple referenced case, and"
1954 echo "will try to create new OST-object(s) for the new MDT-object."
1955 echo "To avoid such trouble, the layout LFSCK needs to lock the old"
1956 echo "MDT-object before confirm the multiple referenced case."
# Create directory a1 with lfs mkdir -i 1 and write one 1M file into it.
1959 check_mount_and_prep
1960 $LFS mkdir -i 1 $DIR/$tdir/a1
1961 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1962 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=1
1963 cancel_lru_locks osc
1965 echo "Inject failure stub on MDT1 to delay the migration"
1967 #define OBD_FAIL_MIGRATE_DELAY 0x1803
1968 do_facet mds2 $LCTL set_param fail_val=5 fail_loc=0x1803
# The migrate command is started in the background so that the scan below
# runs while it is still in progress.
1969 echo "Migrate $DIR/$tdir/a1 from MDT1 to MDT0 with delay"
1970 $LFS migrate -m 0 $DIR/$tdir/a1 &
1973 echo "Trigger layout LFSCK to race with the migration"
1974 $START_LAYOUT -A -r || error "(1) Fail to start layout LFSCK!"
1976 wait_all_targets_blocked layout completed 2
1978 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
# Expect exactly one unmatched-pair repair and no multiple-reference repair.
1979 local repaired=$($SHOW_LAYOUT |
1980 awk '/^repaired_unmatched_pair/ { print $2 }')
1981 [ $repaired -eq 1 ] ||
1982 error "(3) Fail to repair unmatched pair: $repaired"
1984 repaired=$($SHOW_LAYOUT |
1985 awk '/^repaired_multiple_referenced/ { print $2 }')
1986 [ $repaired -eq 0 ] ||
1987 error "(4) Unexpectedly repaird multiple references: $repaired"
1989 run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)"
# test_16: layout LFSCK repairs an OST-object whose owner differs from the
# owner recorded on the MDT-object.
# NOTE(review): this listing omits some lines of the function (header, the
# creation of directory d1, steps before the scan starts, closing braces);
# confirm them against the complete file.
# Requires an MDS version newer than 2.5.55.
1992 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1993 skip "MDS older than 2.5.55, LU-3594"
1996 echo "If the OST-object's owner information does not match the owner"
1997 echo "information stored in the MDT-object, then the LFSCK trust the"
1998 echo "MDT-object and update the OST-object's owner information."
2001 check_mount_and_prep
2002 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2003 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
2004 cancel_lru_locks osc
2006 # created but no setattr or write to the file.
# 100 files are created as the RUNAS user in d1 and left untouched.
2008 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/d1
2009 $RUNAS createmany -o $DIR/$tdir/d1/o 100 || error "create failed"
# Change the owner of f0 while fail_loc 0x1613 is set on the MDS.
2011 echo "Inject failure stub to skip OST-object owner changing"
2012 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
2013 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
2014 chown 1.1 $DIR/$tdir/f0
2015 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2017 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
2020 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
2022 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2023 mdd.${MDT_DEV}.lfsck_layout |
2024 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2026 error "(2) unexpected status"
# Exactly one inconsistent owner must be reported as repaired.
2029 local repaired=$($SHOW_LAYOUT |
2030 awk '/^repaired_inconsistent_owner/ { print $2 }')
2031 [ $repaired -eq 1 ] ||
2032 error "(3) Fail to repair inconsistent owner: $repaired"
2034 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# test_17: layout LFSCK gives each MDT-object its own OST-object when
# several MDT-objects reference the same one.
# NOTE(review): this listing omits some lines of the function (header, the
# end of the introductory message, the target path of the setstripe
# command, closing braces); confirm them against the complete file.
# Requires an MDS version newer than 2.5.55.
2037 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2038 skip "MDS older than 2.5.55, LU-3594"
2041 echo "If more than one MDT-objects reference the same OST-object,"
2042 echo "and the OST-object only recognizes one MDT-object, then the"
2043 echo "LFSCK should create new OST-objects for such non-recognized"
2047 check_mount_and_prep
2048 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2050 echo "Inject failure stub to make two MDT-objects to refernce"
2051 echo "the OST-object"
# With fail_loc 0x1614 set on the MDS, write 1M to guard, then create a
# plain file through createmany and the composite-layout file f1.
2053 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
2054 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
2055 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
2056 cancel_lru_locks mdc
2057 cancel_lru_locks osc
2059 createmany -o $DIR/$tdir/f 1
2060 cancel_lru_locks mdc
2061 cancel_lru_locks osc
2063 $LFS setstripe -E 2M -S 256K -o 0 -E 4M -S 512K -o 0 \
2065 error "(0) Fail to create PFL $DIR/$tdir/f1"
2066 cancel_lru_locks mdc
2067 cancel_lru_locks osc
2068 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# Before the repair, f0 and f1 are expected to report the 1048576 bytes
# that were written to guard.
2070 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
2071 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard use the same OST-objects"
2072 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
2073 [ $size -eq 1048576 ] ||
2074 error "(1.1) f0 (wrong) size should be 1048576, but got $size"
2076 size=$(ls -l $DIR/$tdir/f1 | awk '{ print $5 }')
2077 [ $size -eq 1048576 ] ||
2078 error "(1.2) f1 (wrong) size should be 1048576, but got $size"
2080 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
2083 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
2085 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2086 mdd.${MDT_DEV}.lfsck_layout |
2087 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2089 error "(3) unexpected status"
# Exactly two multiple-reference repairs are expected.
2092 local repaired=$($SHOW_LAYOUT |
2093 awk '/^repaired_multiple_referenced/ { print $2 }')
2094 [ $repaired -eq 2 ] ||
2095 error "(4) Fail to repair multiple references: $repaired"
# After the repair, writing 2M to f0 or f1 must leave the size of guard at
# 1048576 bytes.
2097 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
2098 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
2099 error "(5) Fail to write f0."
2100 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
2101 [ $size -eq 1048576 ] ||
2102 error "(6) guard size should be 1048576, but got $size"
2104 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard should use diff OST-objects"
2105 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2 ||
2106 error "(7) Fail to write f1."
2107 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
2108 [ $size -eq 1048576 ] ||
2109 error "(8) guard size should be 1048576, but got $size"
2111 run_test 17 "LFSCK can repair multiple references"
# Top-level statement between test 17 and test 18a: adds "cache" to the
# lctl debug setting on the node running this script; output is discarded.
2113 $LCTL set_param debug=+cache > /dev/null
# test_18a body (function header/closing lines, "fi" and "done" are not part
# of this listing). Scenario: files a1/f1, a2/f2 (only with >= 2 MDTs) and
# the PFL file f3 are written, then chown'ed while fail_loc 0x1615 is armed
# so that, per the echo text, the MDT-objects lose their layout EA. Layout
# LFSCK run with "-r -o" must report the expected repaired_orphan counts and
# the original file sizes must be visible again.
2116 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2117 skip "MDS older than 2.5.55, LU-3336"
2120 echo "The target MDT-object is there, but related stripe information"
2121 echo "is lost or partly lost. The LFSCK should regenerate the missing"
2122 echo "layout EA entries."
2125 check_mount_and_prep
2126 $LFS mkdir -i 0 $DIR/$tdir/a1
2127 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2128 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# With "ls -il" the inode number is column 1, so column 6 is the file size.
2130 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2132 $LFS path2fid $DIR/$tdir/a1/f1
2133 $LFS getstripe $DIR/$tdir/a1/f1
2135 if [ $MDSCOUNT -ge 2 ]; then
2136 $LFS mkdir -i 1 $DIR/$tdir/a2
2137 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2138 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2139 $LFS path2fid $DIR/$tdir/a2/f2
2140 $LFS getstripe $DIR/$tdir/a2/f2
2143 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2144 error "(0) Fail to create PFL $DIR/$tdir/f3"
2146 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2148 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2150 $LFS path2fid $DIR/$tdir/f3
2151 $LFS getstripe $DIR/$tdir/f3
2153 cancel_lru_locks osc
2155 echo "Inject failure, to make the MDT-object lost its layout EA"
2156 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
# Each chown runs while 0x1615 is armed on the MDS that holds the file's
# directory (mds1 for a1 and f3, mds2 for a2).
2157 do_facet mds1 $LCTL set_param fail_loc=0x1615
2158 chown 1.1 $DIR/$tdir/a1/f1
2160 if [ $MDSCOUNT -ge 2 ]; then
2161 do_facet mds2 $LCTL set_param fail_loc=0x1615
2162 chown 1.1 $DIR/$tdir/a2/f2
2165 chown 1.1 $DIR/$tdir/f3
2170 do_facet mds1 $LCTL set_param fail_loc=0
2171 if [ $MDSCOUNT -ge 2 ]; then
2172 do_facet mds2 $LCTL set_param fail_loc=0
2175 cancel_lru_locks mdc
2176 cancel_lru_locks osc
2178 echo "The file size should be incorrect since layout EA is lost"
2179 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2180 [ "$cur_size" != "$saved_size1" ] ||
2181 error "(1) Expect incorrect file1 size"
2183 if [ $MDSCOUNT -ge 2 ]; then
# f2 is compared against f1's saved size: both were written with the same
# dd (bs=1M count=2).
2184 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2185 [ "$cur_size" != "$saved_size1" ] ||
2186 error "(2) Expect incorrect file2 size"
2189 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2190 [ "$cur_size" != "$saved_size2" ] ||
2191 error "(1.2) Expect incorrect file3 size"
2193 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2194 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
2196 for k in $(seq $MDSCOUNT); do
2197 # The LFSCK status query internal is 30 seconds. For the case
2198 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2199 # time to guarantee the status sync up.
2200 wait_update_facet mds${k} "$LCTL get_param -n \
2201 mdd.$(facet_svc mds${k}).lfsck_layout |
2202 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2203 error "(4) MDS${k} is not the expected 'completed'"
# OST status is read once, without polling.
2206 for k in $(seq $OSTCOUNT); do
2207 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2208 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2209 awk '/^status/ { print $2 }')
2210 [ "$cur_status" == "completed" ] ||
2211 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2214 local repaired=$(do_facet mds1 $LCTL get_param -n \
2215 mdd.$(facet_svc mds1).lfsck_layout |
2216 awk '/^repaired_orphan/ { print $2 }')
2217 [ $repaired -eq 3 ] ||
2218 error "(6.1) Expect 3 fixed on mds1, but got: $repaired"
2220 if [ $MDSCOUNT -ge 2 ]; then
2221 repaired=$(do_facet mds2 $LCTL get_param -n \
2222 mdd.$(facet_svc mds2).lfsck_layout |
2223 awk '/^repaired_orphan/ { print $2 }')
2224 [ $repaired -eq 2 ] ||
2225 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
2228 $LFS path2fid $DIR/$tdir/a1/f1
2229 $LFS getstripe $DIR/$tdir/a1/f1
2231 if [ $MDSCOUNT -ge 2 ]; then
2232 $LFS path2fid $DIR/$tdir/a2/f2
2233 $LFS getstripe $DIR/$tdir/a2/f2
2236 $LFS path2fid $DIR/$tdir/f3
2237 $LFS getstripe $DIR/$tdir/f3
2239 echo "The file size should be correct after layout LFSCK scanning"
2240 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2241 [ "$cur_size" == "$saved_size1" ] ||
2242 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2244 if [ $MDSCOUNT -ge 2 ]; then
2245 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2246 [ "$cur_size" == "$saved_size1" ] ||
2247 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# This check is about f3; the message names file3 so a failure is not
# mistaken for the a1/f1 check (7).
2250 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2251 [ "$cur_size" == "$saved_size2" ] ||
2252 error "(9) Expect file3 size $saved_size2, but got $cur_size"
2254 run_test 18a "Find out orphan OST-object and repair it (1)"
# test_18b body (function header/closing lines, "fi" and "done" are not part
# of this listing). Scenario: a1/f1, a2/f2 (only with >= 2 MDTs) and the PFL
# file f3 are written and their FIDs recorded; a1/f1 and a2/f2 are removed
# while fail_loc 0x1616 is armed. A --dryrun layout LFSCK must only count
# the orphans; a real run must report the expected repaired_orphan counts.
# The files are then moved back from .lustre/lost+found/MDTxxxx/<fid>-R-0
# and their sizes verified.
2257 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2258 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2259 skip "MDS older than 2.5.55, LU-3336"
2262 echo "The target MDT-object is lost. The LFSCK should re-create the"
2263 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
2264 echo "can move it back to normal namespace manually."
2267 check_mount_and_prep
2268 $LFS mkdir -i 0 $DIR/$tdir/a1
2269 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2270 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# With "ls -il" the inode number is column 1, so column 6 is the file size.
2271 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2272 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2274 $LFS getstripe $DIR/$tdir/a1/f1
2276 if [ $MDSCOUNT -ge 2 ]; then
2277 $LFS mkdir -i 1 $DIR/$tdir/a2
2278 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2279 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
# Declared local like fid1/fid3 so it does not leak out of the test.
2280 local fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
2282 $LFS getstripe $DIR/$tdir/a2/f2
2285 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2286 error "(0) Fail to create PFL $DIR/$tdir/f3"
2288 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2290 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2291 local fid3=$($LFS path2fid $DIR/$tdir/f3)
2293 $LFS getstripe $DIR/$tdir/f3
2295 cancel_lru_locks osc
2297 echo "Inject failure, to simulate the case of missing the MDT-object"
2298 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
# Each rm runs while 0x1616 is armed on the MDS that holds the file's
# directory.
2299 do_facet mds1 $LCTL set_param fail_loc=0x1616
2300 rm -f $DIR/$tdir/a1/f1
2302 if [ $MDSCOUNT -ge 2 ]; then
2303 do_facet mds2 $LCTL set_param fail_loc=0x1616
2304 rm -f $DIR/$tdir/a2/f2
2312 do_facet mds1 $LCTL set_param fail_loc=0
2313 if [ $MDSCOUNT -ge 2 ]; then
2314 do_facet mds2 $LCTL set_param fail_loc=0
2317 cancel_lru_locks mdc
2318 cancel_lru_locks osc
2320 # dryrun mode only checks for orphans, it does not repair them
2321 echo "Trigger layout LFSCK --dryrun to find out orphan OST-object"
2322 $START_LAYOUT --dryrun -o -r ||
2323 error "Fail to start layout LFSCK in dryrun mode"
2324 wait_all_targets_blocked layout completed 2
# The "param" line of lfsck_layout must list exactly the options given above.
2326 local PARAMS=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
2327 [ "$PARAMS" == "dryrun,all_targets,orphan" ] ||
2328 error "Expect 'dryrun,all_targets,orphan', got '$PARAMS'"
2330 local orphans=$(do_facet mds1 $LCTL get_param -n \
2331 mdd.$(facet_svc mds1).lfsck_layout |
2332 awk '/^inconsistent_orphan/ { print $2 }')
2333 [ $orphans -eq 3 ] ||
2334 error "Expect 3 found on mds1, but got: $orphans"
2336 # orphan parents should not be created
2338 for subdir in $MOUNT/.lustre/lost+found/*; do
2339 [ ! "$(ls -A $subdir)" ] || error "$subdir not empty"
2342 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2343 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2345 for k in $(seq $MDSCOUNT); do
2346 # The LFSCK status query internal is 30 seconds. For the case
2347 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2348 # time to guarantee the status sync up.
2349 wait_update_facet mds${k} "$LCTL get_param -n \
2350 mdd.$(facet_svc mds${k}).lfsck_layout |
2351 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2352 error "(2) MDS${k} is not the expected 'completed'"
# OST status is read once, without polling.
2355 for k in $(seq $OSTCOUNT); do
2356 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2357 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2358 awk '/^status/ { print $2 }')
2359 [ "$cur_status" == "completed" ] ||
2360 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
2363 local repaired=$(do_facet mds1 $LCTL get_param -n \
2364 mdd.$(facet_svc mds1).lfsck_layout |
2365 awk '/^repaired_orphan/ { print $2 }')
2366 [ $repaired -eq 3 ] ||
2367 error "(4.1) Expect 3 fixed on mds1, but got: $repaired"
2369 if [ $MDSCOUNT -ge 2 ]; then
2370 repaired=$(do_facet mds2 $LCTL get_param -n \
2371 mdd.$(facet_svc mds2).lfsck_layout |
2372 awk '/^repaired_orphan/ { print $2 }')
2373 [ $repaired -eq 2 ] ||
2374 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
# The re-created files are looked up by name "<original FID>-R-0".
2377 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2378 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2379 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2381 if [ $MDSCOUNT -ge 2 ]; then
2382 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
2383 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
2386 mv $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0 $DIR/$tdir/f3 ||
2387 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
2389 $LFS path2fid $DIR/$tdir/a1/f1
2390 $LFS getstripe $DIR/$tdir/a1/f1
2392 if [ $MDSCOUNT -ge 2 ]; then
2393 $LFS path2fid $DIR/$tdir/a2/f2
2394 $LFS getstripe $DIR/$tdir/a2/f2
2397 $LFS path2fid $DIR/$tdir/f3
2398 $LFS getstripe $DIR/$tdir/f3
2400 echo "The file size should be correct after layout LFSCK scanning"
2401 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2402 [ "$cur_size" == "$saved_size1" ] ||
2403 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2405 if [ $MDSCOUNT -ge 2 ]; then
2406 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2407 [ "$cur_size" == "$saved_size1" ] ||
2408 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# This check is about f3; the message names file3 so a failure is not
# mistaken for the a1/f1 check (7).
2411 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2412 [ "$cur_size" == "$saved_size2" ] ||
2413 error "(9) Expect file3 size $saved_size2, but got $cur_size"
2415 run_test 18b "Find out orphan OST-object and repair it (2)"
# test_18c body (function header/closing lines, "fi" and "done" are not part
# of this listing). Scenario: files are written while fail_loc 0x1617 is
# armed on all OST nodes (per the echo text: missing parent FID), then
# a1/f1 and a2/f2 are removed while fail_loc 0x1616 is armed on the MDS.
# After layout LFSCK "-r -o", mds1 must report $expected repaired orphans
# ($expected is assigned in lines outside this listing), mds2 must report 0,
# and stubs named *-N-* must exist only under lost+found/MDT0000.
2418 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2419 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2420 skip "MDS older than 2.5.55, LU-3336"
2423 echo "The target MDT-object is lost, and the OST-object FID is missing."
2424 echo "The LFSCK should re-create the MDT-object with new FID under the "
2425 echo "directory .lustre/lost+found/MDTxxxx."
2428 check_mount_and_prep
2429 $LFS mkdir -i 0 $DIR/$tdir/a1
2430 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2432 echo "Inject failure, to simulate the case of missing parent FID"
2433 #define OBD_FAIL_LFSCK_NOPFID 0x1617
# Armed on every OST node for the duration of the writes below.
2434 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1617
2436 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2437 $LFS getstripe $DIR/$tdir/a1/f1
2439 if [ $MDSCOUNT -ge 2 ]; then
2440 $LFS mkdir -i 1 $DIR/$tdir/a2
2441 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a2
2442 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2443 $LFS getstripe $DIR/$tdir/a2/f2
2446 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2447 error "(0) Fail to create PFL $DIR/$tdir/f3"
2449 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2450 $LFS getstripe $DIR/$tdir/f3
2452 cancel_lru_locks osc
2453 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
2455 echo "Inject failure, to simulate the case of missing the MDT-object"
2456 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2457 do_facet mds1 $LCTL set_param fail_loc=0x1616
2458 rm -f $DIR/$tdir/a1/f1
2460 if [ $MDSCOUNT -ge 2 ]; then
2461 do_facet mds2 $LCTL set_param fail_loc=0x1616
2462 rm -f $DIR/$tdir/a2/f2
2470 do_facet mds1 $LCTL set_param fail_loc=0
2471 if [ $MDSCOUNT -ge 2 ]; then
2472 do_facet mds2 $LCTL set_param fail_loc=0
2475 cancel_lru_locks mdc
2476 cancel_lru_locks osc
2478 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2479 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2481 for k in $(seq $MDSCOUNT); do
2482 # The LFSCK status query internal is 30 seconds. For the case
2483 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2484 # time to guarantee the status sync up.
2485 wait_update_facet mds${k} "$LCTL get_param -n \
2486 mdd.$(facet_svc mds${k}).lfsck_layout |
2487 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2488 error "(2) MDS${k} is not the expected 'completed'"
# OST status is read once, without polling.
2491 for k in $(seq $OSTCOUNT); do
2492 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2493 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2494 awk '/^status/ { print $2 }')
2495 [ "$cur_status" == "completed" ] ||
2496 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
2499 if [ $MDSCOUNT -ge 2 ]; then
2505 local repaired=$(do_facet mds1 $LCTL get_param -n \
2506 mdd.$(facet_svc mds1).lfsck_layout |
2507 awk '/^repaired_orphan/ { print $2 }')
2508 [ $repaired -eq $expected ] ||
2509 error "(4) Expect $expected fixed on mds1, but got: $repaired"
2511 if [ $MDSCOUNT -ge 2 ]; then
2512 repaired=$(do_facet mds2 $LCTL get_param -n \
2513 mdd.$(facet_svc mds2).lfsck_layout |
2514 awk '/^repaired_orphan/ { print $2 }')
2515 [ $repaired -eq 0 ] ||
2516 error "(5) Expect 0 fixed on mds2, but got: $repaired"
2519 ls -ail $MOUNT/.lustre/lost+found/
2521 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
2522 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
# The -name pattern is quoted so find receives it literally; unquoted, the
# shell would expand it first if the current directory held a matching name.
# cname is declared local at both assignments because this first one is
# conditional.
2523 local cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-N-*')
2525 error "(6) .lustre/lost+found/MDT0001/ should be empty"
2528 echo "There should be some stub under .lustre/lost+found/MDT0000/"
2529 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2530 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
2532 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-N-*')
2533 [ ! -z "$cname" ] ||
2534 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2536 run_test 18c "Find out orphan OST-object and repair it (3)"
# test_18d body (function header/closing lines and "done" are not part of
# this listing). Scenario: f1..f4 are created under a1 (f4 is PFL); with
# fail_loc 0x1618 armed, f2 and f4 are chown'ed and f1 and f3 removed, which
# per the echo text leaves f2/f4 with dangling references while their
# original OST-objects survive as orphans. After a restart (setupall) and a
# layout LFSCK "-r -o -c -d", the test expects 2 repaired_orphan,
# 0 repaired_dangling, and the original sizes of f2 and f4.
2539 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2540 skip "MDS older than 2.5.55, LU-3336"
2543 echo "The target MDT-object layout EA is corrupted, but the right"
2544 echo "OST-object is still alive as orphan. The layout LFSCK will"
2545 echo "not create new OST-object to occupy such slot."
2548 check_mount_and_prep
2550 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2551 echo "guard" > $DIR/$tdir/a1/f1
2552 echo "foo" > $DIR/$tdir/a1/f2
2554 echo "guard" > $DIR/$tdir/a1/f3
2555 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2556 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2557 echo "foo" > $DIR/$tdir/a1/f4
# With "ls -il" the inode number is column 1, so column 6 is the file size.
2559 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2560 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
# FID and layout of every file are printed for the test log only.
2561 $LFS path2fid $DIR/$tdir/a1/f1
2562 $LFS getstripe $DIR/$tdir/a1/f1
2563 $LFS path2fid $DIR/$tdir/a1/f2
2564 $LFS getstripe $DIR/$tdir/a1/f2
2565 $LFS path2fid $DIR/$tdir/a1/f3
2566 $LFS getstripe $DIR/$tdir/a1/f3
2567 $LFS path2fid $DIR/$tdir/a1/f4
2568 $LFS getstripe $DIR/$tdir/a1/f4
2569 cancel_lru_locks osc
2571 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2572 echo "to reference the same OST-object (which is f1's OST-obejct)."
2573 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2574 echo "dangling reference case, but f2's old OST-object is there."
2576 echo "The failure also makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2577 echo "to reference the same OST-object (which is f3's OST-obejct)."
2578 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2579 echo "dangling reference case, but f4's old OST-object is there."
2582 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
# The two chown calls and the two rm calls all run with 0x1618 armed.
2583 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2584 chown 1.1 $DIR/$tdir/a1/f2
2585 chown 1.1 $DIR/$tdir/a1/f4
2586 rm -f $DIR/$tdir/a1/f1
2587 rm -f $DIR/$tdir/a1/f3
2590 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2592 echo "stopall to cleanup object cache"
2595 setupall > /dev/null
2597 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2598 $START_LAYOUT -r -o -c -d || error "(2) Fail to start LFSCK for layout!"
2600 for k in $(seq $MDSCOUNT); do
2601 # The LFSCK status query internal is 30 seconds. For the case
2602 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2603 # time to guarantee the status sync up.
2604 wait_update_facet mds${k} "$LCTL get_param -n \
2605 mdd.$(facet_svc mds${k}).lfsck_layout |
2606 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2607 error "(3) MDS${k} is not the expected 'completed'"
# OST status is read once, without polling.
2610 for k in $(seq $OSTCOUNT); do
2611 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2612 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2613 awk '/^status/ { print $2 }')
2614 [ "$cur_status" == "completed" ] ||
2615 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
# Both f2 and f4 must be counted as orphan repairs and neither as a
# dangling-reference repair.
2618 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2619 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2620 awk '/^repaired_orphan/ { print $2 }')
2621 [ $repaired -eq 2 ] ||
2622 error "(5) Expect 2 orphans have been fixed, but got: $repaired"
2624 repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2625 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2626 awk '/^repaired_dangling/ { print $2 }')
2627 [ $repaired -eq 0 ] ||
2628 error "(6) Expect 0 dangling has been fixed, but got: $repaired"
2630 echo "The file size should be correct after layout LFSCK scanning"
2631 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2632 [ "$cur_size" == "$saved_size1" ] ||
2633 error "(7) Expect file2 size $saved_size1, but got $cur_size"
2635 cur_size=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2636 [ "$cur_size" == "$saved_size2" ] ||
2637 error "(8) Expect file4 size $saved_size2, but got $cur_size"
# Contents are printed for the log; they are not compared here.
2639 echo "The LFSCK should find back the original data."
2640 cat $DIR/$tdir/a1/f2
2641 $LFS path2fid $DIR/$tdir/a1/f2
2642 $LFS getstripe $DIR/$tdir/a1/f2
2643 cat $DIR/$tdir/a1/f4
2644 $LFS path2fid $DIR/$tdir/a1/f4
2645 $LFS getstripe $DIR/$tdir/a1/f4
2647 run_test 18d "Find out orphan OST-object and repair it (4)"
# test_18e body (function header/closing lines, "fi" and "done" are not part
# of this listing). Same corruption setup as test 18d (fail_loc 0x1618), but
# the layout LFSCK is started with fail_loc 0x1602/fail_val=10 armed and,
# once mds1 reports "scanning-phase2", new data is appended to f2 and f4.
# The test then expects 2 repaired_orphan and two *-C-* stub files under
# .lustre/lost+found/MDT0000 whose sizes match the saved f2/f4 sizes.
2650 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2651 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2652 skip "MDS older than 2.5.55, LU-3336"
2655 echo "The target MDT-object layout EA slot is occpuied by some new"
2656 echo "created OST-object when repair dangling reference case. Such"
2657 echo "conflict OST-object has been modified by others. To keep the"
2658 echo "new data, the LFSCK will create a new file to refernece this"
2659 echo "old orphan OST-object."
2662 check_mount_and_prep
2664 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2665 echo "guard" > $DIR/$tdir/a1/f1
2666 echo "foo" > $DIR/$tdir/a1/f2
2668 echo "guard" > $DIR/$tdir/a1/f3
2669 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2670 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2671 echo "foo" > $DIR/$tdir/a1/f4
# With "ls -il" the inode number is column 1, so column 6 is the file size.
2673 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2674 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2676 $LFS path2fid $DIR/$tdir/a1/f1
2677 $LFS getstripe $DIR/$tdir/a1/f1
2678 $LFS path2fid $DIR/$tdir/a1/f2
2679 $LFS getstripe $DIR/$tdir/a1/f2
2680 $LFS path2fid $DIR/$tdir/a1/f3
2681 $LFS getstripe $DIR/$tdir/a1/f3
2682 $LFS path2fid $DIR/$tdir/a1/f4
2683 $LFS getstripe $DIR/$tdir/a1/f4
2684 cancel_lru_locks osc
2686 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2687 echo "to reference the same OST-object (which is f1's OST-obejct)."
2688 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2689 echo "dangling reference case, but f2's old OST-object is there."
2691 echo "Also the failure makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2692 echo "to reference the same OST-object (which is f3's OST-obejct)."
2693 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2694 echo "dangling reference case, but f4's old OST-object is there."
2697 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2698 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2699 chown 1.1 $DIR/$tdir/a1/f2
2700 chown 1.1 $DIR/$tdir/a1/f4
2701 rm -f $DIR/$tdir/a1/f1
2702 rm -f $DIR/$tdir/a1/f3
2705 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2707 echo "stopall to cleanup object cache"
2710 setupall > /dev/null
2712 #define OBD_FAIL_LFSCK_DELAY3 0x1602
# Armed before LFSCK starts and cleared only after the writes to f2/f4.
2713 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
2715 start_full_debug_logging
2717 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2718 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
2720 wait_update_facet mds1 "$LCTL get_param -n \
2721 mdd.$(facet_svc mds1).lfsck_layout |
2722 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2723 error "(3) MDS1 is not the expected 'scanning-phase2'"
2725 # to guarantee all updates are synced.
2729 echo "Write new data to f2/f4 to modify the new created OST-object."
2730 echo "dummy" >> $DIR/$tdir/a1/f2 || error "write a1/f2 failed"
2731 echo "dummy" >> $DIR/$tdir/a1/f4 || error "write a1/f4 failed"
2733 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
2735 for k in $(seq $MDSCOUNT); do
2736 # The LFSCK status query internal is 30 seconds. For the case
2737 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2738 # time to guarantee the status sync up.
2739 wait_update_facet mds${k} "$LCTL get_param -n \
2740 mdd.$(facet_svc mds${k}).lfsck_layout |
2741 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2742 error "(4) MDS${k} is not the expected 'completed'"
# OST status is read once, without polling.
2745 for k in $(seq $OSTCOUNT); do
2746 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2747 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2748 awk '/^status/ { print $2 }')
2749 [ "$cur_status" == "completed" ] ||
2750 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2753 stop_full_debug_logging
2755 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2756 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2757 awk '/^repaired_orphan/ { print $2 }')
2758 [ $repaired -eq 2 ] ||
2759 error "(6) Expect 2 orphans have been fixed, but got: $repaired"
2761 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2762 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2763 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# Here the glob is meant to be expanded by the shell (ls takes the paths).
2765 local count=$(ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-* | wc -l)
2766 if [ $count -ne 2 ]; then
2767 ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-*
2768 error "(8) Expect 2 stubs under lost+found, but got $count"
2771 echo "The stub file should keep the original f2 or f4 data"
# For find the -name pattern is quoted so find receives it literally;
# unquoted, the shell would expand it first if the current directory held a
# matching name. cname is local so it does not leak out of the test.
2772 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-C-*' | head -n 1)
# The stub size must equal one of the two saved sizes.
2773 local cur_size=$(ls -il $cname | awk '{ print $6 }')
2774 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2775 error "(9) Got unexpected $cur_size"
2778 $LFS path2fid $cname
2779 $LFS getstripe $cname
2781 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-C-*' | tail -n 1)
2782 cur_size=$(ls -il $cname | awk '{ print $6 }')
2783 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2784 error "(10) Got unexpected $cur_size"
2787 $LFS path2fid $cname
2788 $LFS getstripe $cname
# Contents are printed for the log; they are not compared here.
2790 echo "The f2/f4 should contains new data."
2791 cat $DIR/$tdir/a1/f2
2792 $LFS path2fid $DIR/$tdir/a1/f2
2793 $LFS getstripe $DIR/$tdir/a1/f2
2794 cat $DIR/$tdir/a1/f4
2795 $LFS path2fid $DIR/$tdir/a1/f4
2796 $LFS getstripe $DIR/$tdir/a1/f4
2798 run_test 18e "Find out orphan OST-object and repair it (5)"
# test_18f body (function header/closing lines, "fi" and "done" are not part
# of this listing). Needs at least 2 OSTs. Scenario: f1/f2 (and f3/f4 with
# >= 2 MDTs) are removed while fail_loc 0x1616 is armed; the first layout
# LFSCK runs with fail_loc 0x161c armed on mds1 and must end as "partial" on
# every MDS and on ost1, "completed" on ost2, with 1 repaired_orphan per
# MDS. A second, undisturbed run must end "completed" everywhere with
# 2 repaired_orphan per MDS.
2801 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
2804 echo "The target MDT-object is lost. The LFSCK should re-create the"
2805 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2806 echo "to verify some OST-object(s) during the first stage-scanning,"
2807 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2808 echo "should not be affected."
2811 check_mount_and_prep
2812 $LFS mkdir -i 0 $DIR/$tdir/a1
2813 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2814 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2815 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2816 $LFS mkdir -i 0 $DIR/$tdir/a2
2817 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
2818 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2819 $LFS getstripe $DIR/$tdir/a1/f1
2820 $LFS getstripe $DIR/$tdir/a2/f2
2822 if [ $MDSCOUNT -ge 2 ]; then
2823 $LFS mkdir -i 1 $DIR/$tdir/a3
2824 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a3
2825 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2826 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2827 $LFS mkdir -i 1 $DIR/$tdir/a4
2828 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
2829 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2830 $LFS getstripe $DIR/$tdir/a3/f3
2831 $LFS getstripe $DIR/$tdir/a4/f4
2834 cancel_lru_locks osc
2836 echo "Inject failure, to simulate the case of missing the MDT-object"
2837 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2838 do_facet mds1 $LCTL set_param fail_loc=0x1616
2839 rm -f $DIR/$tdir/a1/f1
2840 rm -f $DIR/$tdir/a2/f2
2842 if [ $MDSCOUNT -ge 2 ]; then
2843 do_facet mds2 $LCTL set_param fail_loc=0x1616
2844 rm -f $DIR/$tdir/a3/f3
2845 rm -f $DIR/$tdir/a4/f4
2851 do_facet mds1 $LCTL set_param fail_loc=0
2852 if [ $MDSCOUNT -ge 2 ]; then
2853 do_facet mds2 $LCTL set_param fail_loc=0
2856 cancel_lru_locks mdc
2857 cancel_lru_locks osc
2859 echo "Inject failure, to simulate the OST0 fail to handle"
2860 echo "MDT0 LFSCK request during the first-stage scanning."
2861 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
# Stays armed through the whole first LFSCK run; cleared after the status
# checks below.
2862 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
2864 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2865 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2867 for k in $(seq $MDSCOUNT); do
2868 # The LFSCK status query internal is 30 seconds. For the case
2869 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2870 # time to guarantee the status sync up.
2871 wait_update_facet mds${k} "$LCTL get_param -n \
2872 mdd.$(facet_svc mds${k}).lfsck_layout |
2873 awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
2874 error "(2) MDS${k} is not the expected 'partial'"
2877 wait_update_facet ost1 "$LCTL get_param -n \
2878 obdfilter.$(facet_svc ost1).lfsck_layout |
2879 awk '/^status/ { print \\\$2 }'" "partial" $LTIME || {
2880 error "(3) OST1 is not the expected 'partial'"
2883 wait_update_facet ost2 "$LCTL get_param -n \
2884 obdfilter.$(facet_svc ost2).lfsck_layout |
2885 awk '/^status/ { print \\\$2 }'" "completed" $LTIME || {
2886 error "(4) OST2 is not the expected 'completed'"
2889 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
# First run: one orphan repaired per MDS.
2891 local repaired=$(do_facet mds1 $LCTL get_param -n \
2892 mdd.$(facet_svc mds1).lfsck_layout |
2893 awk '/^repaired_orphan/ { print $2 }')
2894 [ $repaired -eq 1 ] ||
2895 error "(5) Expect 1 fixed on mds{1}, but got: $repaired"
2897 if [ $MDSCOUNT -ge 2 ]; then
2898 repaired=$(do_facet mds2 $LCTL get_param -n \
2899 mdd.$(facet_svc mds2).lfsck_layout |
2900 awk '/^repaired_orphan/ { print $2 }')
2901 [ $repaired -eq 1 ] ||
2902 error "(6) Expect 1 fixed on mds{2}, but got: $repaired"
2905 echo "Trigger layout LFSCK on all devices again to cleanup"
2906 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
2908 for k in $(seq $MDSCOUNT); do
2909 # The LFSCK status query internal is 30 seconds. For the case
2910 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2911 # time to guarantee the status sync up.
2912 wait_update_facet mds${k} "$LCTL get_param -n \
2913 mdd.$(facet_svc mds${k}).lfsck_layout |
2914 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2915 error "(8) MDS${k} is not the expected 'completed'"
2918 for k in $(seq $OSTCOUNT); do
# Declared local, as in the sibling tests, so it does not leak out of the
# test.
2919 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2920 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2921 awk '/^status/ { print $2 }')
2922 [ "$cur_status" == "completed" ] ||
2923 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
# Second run: two orphans repaired per MDS.
2927 local repaired=$(do_facet mds1 $LCTL get_param -n \
2928 mdd.$(facet_svc mds1).lfsck_layout |
2929 awk '/^repaired_orphan/ { print $2 }')
2930 [ $repaired -eq 2 ] ||
2931 error "(10) Expect 2 fixed on mds{1}, but got: $repaired"
2933 if [ $MDSCOUNT -ge 2 ]; then
2934 repaired=$(do_facet mds2 $LCTL get_param -n \
2935 mdd.$(facet_svc mds2).lfsck_layout |
2936 awk '/^repaired_orphan/ { print $2 }')
2937 [ $repaired -eq 2 ] ||
2938 error "(11) Expect 2 fixed on mds{2}, but got: $repaired"
2941 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# test_18g body (function header/closing lines and "done" are not part of
# this listing). Scenario: a1/f1 is striped with "-c -1" and written with
# $OSTCOUNT 1MB blocks, then removed while fail_loc 0x162e is armed (per the
# echo text: MDT-object lost, OI mapping kept). Layout LFSCK "-r -o" must
# report $OSTCOUNT repaired orphans and the file must be recoverable from
# .lustre/lost+found/MDT0000/<fid>-R-0.
2944 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2947 echo "The target MDT-object is lost, but related OI mapping is there"
2948 echo "The LFSCK should recreate the lost MDT-object without affected"
2949 echo "by the stale OI mapping."
2952 check_mount_and_prep
2953 $LFS mkdir -i 0 $DIR/$tdir/a1
2954 $LFS setstripe -c -1 -i 0 -S 1M $DIR/$tdir/a1
2955 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=$OSTCOUNT
# The FID is recorded now because the lost+found entry is looked up by it
# after the repair.
2956 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2958 $LFS getstripe $DIR/$tdir/a1/f1
2959 cancel_lru_locks osc
2961 echo "Inject failure to simulate lost MDT-object but keep OI mapping"
2962 #define OBD_FAIL_LFSCK_LOST_MDTOBJ2 0x162e
2963 do_facet mds1 $LCTL set_param fail_loc=0x162e
2964 rm -f $DIR/$tdir/a1/f1
2966 do_facet mds1 $LCTL set_param fail_loc=0
2967 cancel_lru_locks mdc
2968 cancel_lru_locks osc
2970 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2971 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2973 for k in $(seq $MDSCOUNT); do
2974 # The LFSCK status query internal is 30 seconds. For the case
2975 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2976 # time to guarantee the status sync up.
2977 wait_update_facet mds${k} "$LCTL get_param -n \
2978 mdd.$(facet_svc mds${k}).lfsck_layout |
2979 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2980 error "(2) MDS${k} is not the expected 'completed'"
# OST status is read once, without polling.
2983 for k in $(seq $OSTCOUNT); do
2984 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2985 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2986 awk '/^status/ { print $2 }')
2987 [ "$cur_status" == "completed" ] ||
2988 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# The expected count equals $OSTCOUNT, matching the "-c -1" striping and
# the dd count used above.
2991 local repaired=$(do_facet mds1 $LCTL get_param -n \
2992 mdd.$(facet_svc mds1).lfsck_layout |
2993 awk '/^repaired_orphan/ { print $2 }')
2994 [ $repaired -eq $OSTCOUNT ] ||
2995 error "(4) Expect $OSTCOUNT fixed, but got: $repaired"
2997 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2998 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2999 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3001 $LFS path2fid $DIR/$tdir/a1/f1
3002 $LFS getstripe $DIR/$tdir/a1/f1
3004 run_test 18g "Find out orphan OST-object and repair it (7)"
# test_18h: corrupt the extent range of a PFL file through fault injection,
# check that a write to the damaged file fails, then run layout LFSCK and
# verify that it reports 2 repaired orphans, that the data still matches the
# guard copy and that writes work again.
# Globals read: DIR, tdir, LUSTRE, LFS, LCTL, SINGLEMDS, MDSCOUNT, OSTCOUNT,
#               LTIME, START_LAYOUT, SHOW_LAYOUT
test_18h() {
	echo "The PFL extent crashed. During the first cycle LFSCK scanning,"
	echo "the layout LFSCK will keep the bad PFL file(s) there without"
	echo "scanning its OST-object(s). Then in the second stage scanning,"
	echo "the OST will return related OST-object(s) to the MDT as orphan."
	echo "And then the LFSCK on the MDT can rebuild the PFL extent with"
	echo "the 'orphan(s)' stripe information."

	check_mount_and_prep

	$LFS setstripe -E 2M -S 1M -c 1 -E -1 $DIR/$tdir/f0 ||
		error "(0) Fail to create PFL $DIR/$tdir/f0"

	# Put data into the first component ...
	cat $LUSTRE/tests/test-framework.sh > $DIR/$tdir/f0 ||
		error "(1.1) Fail to write $DIR/$tdir/f0"

	# ... and, starting at the 2MB boundary, into the second one.
	dd if=$LUSTRE/tests/test-framework.sh of=$DIR/$tdir/f0 bs=1M seek=2 ||
		error "(1.2) Fail to write $DIR/$tdir/f0"

	# Reference copy used to verify the data after the repair.
	cp $DIR/$tdir/f0 $DIR/$tdir/guard

	echo "Inject failure stub to simulate bad PFL extent range"
	#define OBD_FAIL_LFSCK_BAD_PFL_RANGE	0x162f
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162f

	# The attribute change is issued while the fail_loc is armed.
	# ':' is the standard owner:group separator; '.' is obsolete.
	chown 1:1 $DIR/$tdir/f0

	cancel_lru_locks mdc
	cancel_lru_locks osc
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 &&
		error "(2) Write to bad PFL file should fail"

	echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
	$START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query interval is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
			error "(4.1) MDS${k} is not the expected 'completed'"
	done

	# Keep the per-OST status local, as the sibling tests do, so it does
	# not leak into the global scope.
	local cur_status
	for k in $(seq $OSTCOUNT); do
		cur_status=$(do_facet ost${k} $LCTL get_param -n \
				obdfilter.$(facet_svc ost${k}).lfsck_layout |
				awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(4.2) OST${k} Expect 'completed', but got '$cur_status'"
	done

	local repaired=$($SHOW_LAYOUT |
			 awk '/^repaired_orphan/ { print $2 }')
	[ $repaired -eq 2 ] ||
		error "(5) Fail to repair crashed PFL range: $repaired"

	echo "Data in $DIR/$tdir/f0 should not be broken"
	diff $DIR/$tdir/f0 $DIR/$tdir/guard ||
		error "(6) Data in $DIR/$tdir/f0 is broken"

	echo "Write should succeed after LFSCK repairing the bad PFL range"
	dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 ||
		error "(7) Write should succeed after LFSCK"
}
run_test 18h "LFSCK can repair crashed PFL extent range"
# Drop the 'cache' flag from the debug mask for the tests that follow.
$LCTL set_param debug=-cache >/dev/null

# test_19a: with parent-FID verification enabled on OST0000 and the
# INVALID_PFID fail_loc armed on the client, reads of a plain file and of a
# PFL file are both expected to be rejected.
test_19a() {
	if ! (( $MDS1_VERSION > $(version_code 2.5.55) )); then
		skip "MDS older than 2.5.55, LU-3951"
	fi

	check_mount_and_prep
	$LFS setstripe -c 1 -i 0 $DIR/$tdir

	local verify_pfid=obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid

	# Create the files while verification is switched off.
	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
		$verify_pfid 0

	echo "foo1" > $DIR/$tdir/a0
	if ! $LFS setstripe -E 512K -S 512K -o 0 -E -1 -S 1M $DIR/$tdir/a1; then
		error "(0) Fail to create PFL $DIR/$tdir/a1"
	fi
	echo "foo2" > $DIR/$tdir/a1
	echo "guard" > $DIR/$tdir/a2
	cancel_lru_locks osc

	echo "Inject failure, then client will offer wrong parent FID when read"
	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
		$verify_pfid 1

	#define OBD_FAIL_LFSCK_INVALID_PFID	0x1619
	$LCTL set_param fail_loc=0x1619

	echo "Read RPC with wrong parent FID should be denied"
	if cat $DIR/$tdir/a0; then
		error "(3.1) Read a0 should be denied!"
	fi
	if cat $DIR/$tdir/a1; then
		error "(3.2) Read a1 should be denied!"
	fi
	$LCTL set_param fail_loc=0
}
run_test 19a "OST-object inconsistency self detect"
# test_19b: create a plain file and a PFL file while the UNMATCHED_PAIR1
# fail_loc is armed on the OSTs, so their OST-objects point back to a
# non-existent MDT-object.  With lfsck_verify_pfid disabled nothing may be
# repaired; once it is enabled, reading both files must succeed and the
# "repaired" counter of lfsck_verify_pfid must reach 2 (one per file).
# Globals read: DIR, tdir, FSNAME, LFS, LCTL, MDS1_VERSION
test_19b() {
	(( $MDS1_VERSION > $(version_code 2.5.55) )) ||
		skip "MDS older than 2.5.55, LU-3951"

	check_mount_and_prep
	$LFS setstripe -c 1 -i 0 $DIR/$tdir

	echo "Inject failure stub to make the OST-object to back point to"
	echo "non-exist MDT-object"

	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
		obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0

	#define OBD_FAIL_LFSCK_UNMATCHED_PAIR1	0x1611
	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
	echo "foo1" > $DIR/$tdir/f0
	$LFS setstripe -E 1M -S 1M -o 0 -E 4M -S 256K $DIR/$tdir/f1 ||
		error "(0) Fail to create PFL $DIR/$tdir/f1"
	echo "foo2" > $DIR/$tdir/f1
	cancel_lru_locks osc
	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0

	do_facet ost1 $LCTL set_param -n \
		obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
	echo "Nothing should be fixed since self detect and repair is disabled"
	local repaired=$(do_facet ost1 $LCTL get_param -n \
			obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
			awk '/^repaired/ { print $2 }')
	[ $repaired -eq 0 ] ||
		error "(1) Expected 0 repaired, but got $repaired"

	echo "Read RPC with right parent FID should be accepted,"
	echo "and cause parent FID on OST to be fixed"

	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
		obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1

	cat $DIR/$tdir/f0 || error "(2.1) Read f0 should not be denied!"
	cat $DIR/$tdir/f1 || error "(2.2) Read f1 should not be denied!"

	repaired=$(do_facet ost1 $LCTL get_param -n \
		obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
		awk '/^repaired/ { print $2 }')
	# Two files were read, so two repairs are expected; the message now
	# states the same number the check uses.
	[ $repaired -eq 2 ] ||
		error "(3) Expected 2 repaired, but got $repaired"
}
run_test 19b "OST-object inconsistency self repair"
# Values printed by 'lfs getstripe -L' that test_20a/test_20b compare
# against: a layout carrying the "hole" flag and a plain raid0 layout.
PATTERN_WITH_HOLE="40000001"
PATTERN_WITHOUT_HOLE="raid0"

# test_20a: lose the MDT-object of several striped files, each together with
# a different OST-object, then let layout LFSCK re-create the files under
# .lustre/lost+found/MDT0000 and check how the resulting files (with a hole
# in the LOV EA) behave for getstripe, stat, read, write, chown, touch and
# unlink.
#   f0: only the MDT-object is lost
#   f1: MDT-object and OST-object0 are lost
#   f2: MDT-object and OST-object1 are lost
#   f3: MDT-object and OST-object2 are lost (only with more than 2 OSTs)
# Globals read: OSTCOUNT, MDSCOUNT, FILESET, MDS1_VERSION, DIR, tdir, MOUNT,
#               LFS, LCTL, START_LAYOUT, RUNAS_ID, RUNAS_GID
# Globals written: LOV_PATTERN_F_HOLE
test_20a() {
	[ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
	[ -n "$FILESET" ] && skip "Not functional for FILESET set"
	(( $MDS1_VERSION > $(version_code 2.5.55) )) ||
		skip "MDS older than 2.5.55, LU-4887"

	echo "The target MDT-object and some of its OST-object are lost."
	echo "The LFSCK should find out the left OST-objects and re-create"
	echo "the MDT-object under the directory .lustre/lost+found/MDTxxxx/"
	echo "with the partial OST-objects (LOV EA hole)."

	echo "New client can access the file with LOV EA hole via normal"
	echo "system tools or commands without crash the system."

	echo "For old client, even though it cannot access the file with"
	echo "LOV EA hole, it should not cause the system crash."

	check_mount_and_prep
	$LFS mkdir -i 0 $DIR/$tdir/a1

	# Number of 4KB blocks written to every file: all stripes but the
	# last one are filled (256 blocks each), the last one gets 1 block.
	local bcount
	if [ $OSTCOUNT -gt 2 ]; then
		$LFS setstripe -c 3 -i 0 -S 1M $DIR/$tdir/a1
		bcount=$((256 * 2 + 1))
	else
		$LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a1
		bcount=$((256 + 1))
	fi

	# 256 blocks on the stripe0.
	# 1 block on the stripe1 for 2 OSTs case.
	# 256 blocks on the stripe1 for other cases.
	# 1 block on the stripe2 if OSTs > 2
	dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
	dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
	dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount

	local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
	local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
	local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)
	local fid3

	$LFS getstripe $DIR/$tdir/a1/f0
	$LFS getstripe $DIR/$tdir/a1/f1
	$LFS getstripe $DIR/$tdir/a1/f2

	if [ $OSTCOUNT -gt 2 ]; then
		dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
		fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
		$LFS getstripe $DIR/$tdir/a1/f3
	fi

	cancel_lru_locks osc

	echo "Inject failure..."
	echo "To simulate f0 lost MDT-object"
	#define OBD_FAIL_LFSCK_LOST_MDTOBJ	0x1616
	do_facet mds1 $LCTL set_param fail_loc=0x1616
	rm -f $DIR/$tdir/a1/f0

	echo "To simulate f1 lost MDT-object and OST-object0"
	#define OBD_FAIL_LFSCK_LOST_SPEOBJ	0x161a
	do_facet mds1 $LCTL set_param fail_loc=0x161a
	rm -f $DIR/$tdir/a1/f1

	echo "To simulate f2 lost MDT-object and OST-object1"
	do_facet mds1 $LCTL set_param fail_val=1
	rm -f $DIR/$tdir/a1/f2

	if [ $OSTCOUNT -gt 2 ]; then
		echo "To simulate f3 lost MDT-object and OST-object2"
		do_facet mds1 $LCTL set_param fail_val=2
		rm -f $DIR/$tdir/a1/f3
	fi

	umount_client $MOUNT

	do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
	$START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query interval is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" 32 ||
			error "(2) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		local cur_status=$(do_facet ost${k} $LCTL get_param -n \
				obdfilter.$(facet_svc ost${k}).lfsck_layout |
				awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(3) OST${k} Expect 'completed', but got '$cur_status'"
	done

	local repaired=$(do_facet mds1 $LCTL get_param -n \
			 mdd.$(facet_svc mds1).lfsck_layout |
			 awk '/^repaired_orphan/ { print $2 }')
	if [ $OSTCOUNT -gt 2 ]; then
		[ $repaired -eq 9 ] ||
		error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
	else
		[ $repaired -eq 4 ] ||
		error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
	fi

	mount_client $MOUNT || error "(5.0) Fail to start client!"

	LOV_PATTERN_F_HOLE=0x40000000

	#
	# ${fid0}-R-0 is the old f0
	#
	local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
	echo "Check $name, which is the old f0"

	$LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"

	local pattern=$($LFS getstripe -L $name)
	[[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
		error "(5.2) NOT expect pattern flag hole, but got $pattern"

	local stripes=$($LFS getstripe -c $name)
	if [ $OSTCOUNT -gt 2 ]; then
		[ $stripes -eq 3 ] ||
		error "(5.3.1) expect the stripe count is 3, but got $stripes"
	else
		[ $stripes -eq 2 ] ||
		error "(5.3.2) expect the stripe count is 2, but got $stripes"
	fi

	local size=$(stat $name | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(5.4) expect the size $((4096 * $bcount)), but got $size"

	cat $name > /dev/null || error "(5.5) cannot read $name"

	echo "dummy" >> $name || error "(5.6) cannot write $name"

	chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"

	touch $name || error "(5.8) cannot touch $name"

	rm -f $name || error "(5.9) cannot unlink $name"

	#
	# ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
	if [ $OSTCOUNT -gt 2 ]; then
		echo "Check $name, it contains the old f1's stripe1 and stripe2"
	else
		echo "Check $name, it contains the old f1's stripe1"
	fi

	$LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"

	pattern=$($LFS getstripe -L $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(6.2) expect pattern flag hole, but got $pattern"

	stripes=$($LFS getstripe -c $name)
	if [ $OSTCOUNT -gt 2 ]; then
		[ $stripes -eq 3 ] ||
		error "(6.3.1) expect the stripe count is 3, but got $stripes"
	else
		[ $stripes -eq 2 ] ||
		error "(6.3.2) expect the stripe count is 2, but got $stripes"
	fi

	size=$(stat $name | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(6.4) expect the size $((4096 * $bcount)), but got $size"

	cat $name > /dev/null && error "(6.5) normal read $name should fail"

	# noerror keeps dd going past read errors and sync pads every failed
	# block, so each lost 4KB block shows up as one "Input/output error".
	local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
			 bs=4096 2>&1 | grep "Input/output error" | wc -l)

	# stripe0 is dummy
	[ $failures -eq 256 ] ||
		error "(6.6) expect 256 IO failures, but get $failures"

	size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(6.7) expect the size $((4096 * $bcount)), but got $size"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
		error "(6.8) write to the LOV EA hole should fail"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
		error "(6.9) write to normal stripe should NOT fail"

	echo "foo" >> $name && error "(6.10) append write $name should fail"

	chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"

	touch $name || error "(6.12) cannot touch $name"

	rm -f $name || error "(6.13) cannot unlink $name"

	#
	# ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
	if [ $OSTCOUNT -gt 2 ]; then
		echo "Check $name, it contains the old f2's stripe0 and stripe2"
	else
		echo "Check $name, it contains the old f2's stripe0"
	fi

	$LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"

	pattern=$($LFS getstripe -L $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(7.2) expect pattern flag hole, but got $pattern"

	stripes=$($LFS getstripe -c $name)
	size=$(stat $name | awk '/Size:/ { print $2 }')
	if [ $OSTCOUNT -gt 2 ]; then
		[ $stripes -eq 3 ] ||
		error "(7.3.1) expect the stripe count is 3, but got $stripes"

		# stripe1 is dummy
		[ $size -eq $((4096 * $bcount)) ] ||
		error "(7.4.1) expect size $((4096 * $bcount)), but got $size"

		cat $name > /dev/null &&
			error "(7.5.1) normal read $name should fail"

		failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
			   bs=4096 2>&1 | grep "Input/output error" | wc -l)
		# stripe1 is dummy
		[ $failures -eq 256 ] ||
		error "(7.6) expect 256 IO failures, but get $failures"

		size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
		[ $size -eq $((4096 * $bcount)) ] ||
		error "(7.7) expect the size $((4096 * $bcount)), but got $size"

		dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
		seek=300 && error "(7.8.0) write to the LOV EA hole should fail"

		dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
		error "(7.8.1) write to normal stripe should NOT fail"

		echo "foo" >> $name &&
			error "(7.8.3) append write $name should fail"

		chown $RUNAS_ID:$RUNAS_GID $name ||
			error "(7.9.1) cannot chown on $name"

		touch $name || error "(7.10.1) cannot touch $name"
	else
		[ $stripes -eq 2 ] ||
		error "(7.3.2) expect the stripe count is 2, but got $stripes"

		# stripe1 is dummy
		[ $size -eq $((4096 * (256 + 0))) ] ||
		error "(7.4.2) expect the size $((4096 * 256)), but got $size"

		cat $name > /dev/null &&
			error "(7.5.2) normal read $name should fail"

		failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
			   bs=4096 2>&1 | grep "Input/output error" | wc -l)
		[ $failures -eq 256 ] ||
		error "(7.6.2) expect 256 IO failures, but get $failures"

		# The dump holds the 256 readable blocks of stripe0 plus one
		# padded block for each of the 256 failed reads.
		bcount=$((256 * 2))
		size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
		[ $size -eq $((4096 * $bcount)) ] ||
		error "(7.7.2) expect the size $((4096 * $bcount)), got $size"

		dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
		seek=256 && error "(7.8.2) write to the LOV EA hole should fail"

		chown $RUNAS_ID:$RUNAS_GID $name ||
			error "(7.9.2) cannot chown on $name"

		touch $name || error "(7.10.2) cannot touch $name"
	fi

	rm -f $name || error "(7.11) cannot unlink $name"

	[ $OSTCOUNT -le 2 ] && return

	#
	# ${fid3}-R-0 should contains the old f3's stripe0 and stripe1
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
	echo "Check $name, which contains the old f3's stripe0 and stripe1"

	$LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"

	pattern=$($LFS getstripe -L $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(8.2) expect pattern flag hole, but got $pattern"

	stripes=$($LFS getstripe -c $name)
	[ $stripes -eq 3 ] ||
		error "(8.3) expect the stripe count is 3, but got $stripes"

	size=$(stat $name | awk '/Size:/ { print $2 }')
	# stripe2 is lost
	[ $size -eq $((4096 * (256 + 256 + 0))) ] ||
		error "(8.4) expect the size $((4096 * 512)), but got $size"

	cat $name > /dev/null &&
		error "(8.5) normal read $name should fail"

	failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
		   bs=4096 2>&1 | grep "Input/output error" | wc -l)
	# stripe2 is dummy
	[ $failures -eq 256 ] ||
		error "(8.6) expect 256 IO failures, but get $failures"

	# The dump holds the 512 readable blocks of stripe0 and stripe1 plus
	# one padded block for each of the 256 failed reads.
	bcount=$((256 * 3))
	size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(8.7) expect the size $((4096 * $bcount)), but got $size"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
		seek=512 && error "(8.8) write to the LOV EA hole should fail"

	chown $RUNAS_ID:$RUNAS_GID $name ||
		error "(8.9) cannot chown on $name"

	touch $name || error "(8.10) cannot touch $name"

	rm -f $name || error "(8.11) cannot unlink $name"
}
run_test 20a "Handle the orphan with dummy LOV EA slot properly"
# test_20b: PFL variant of the LOV EA hole test.  Three files with two
# components ([0, 2M) and [2M, EOF), two 1MB stripes each) are written and
# then lose their MDT-object:
#   f0: only the MDT-object is lost
#   f1: additionally the first OST-object of every component is lost
#   f2: additionally the second OST-object of every component is lost
# Layout LFSCK must report 8 repaired orphans on mds1 and re-create the files
# under .lustre/lost+found/MDT0000; the layout, size and I/O behaviour of the
# re-created files are then checked.
# Globals read: OSTCOUNT, MDSCOUNT, FILESET, MDS1_VERSION, DIR, tdir, MOUNT,
#               LFS, LCTL, SINGLEMDS, START_LAYOUT, RUNAS_ID, RUNAS_GID,
#               PATTERN_WITH_HOLE, PATTERN_WITHOUT_HOLE
test_20b() {
	[ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
	[ -n "$FILESET" ] && skip "Not functional for FILESET set"
	(( $MDS1_VERSION > $(version_code 2.5.55) )) ||
		skip "MDS older than 2.5.55, LU-4887"

	echo "The target MDT-object and some of its OST-object are lost."
	echo "The LFSCK should find out the left OST-objects and re-create"
	echo "the MDT-object under the directory .lustre/lost+found/MDTxxxx/"
	echo "with the partial OST-objects (LOV EA hole)."

	echo "New client can access the file with LOV EA hole via normal"
	echo "system tools or commands without crash the system - PFL case."

	check_mount_and_prep

	$LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f0 ||
		error "(0) Fail to create PFL file $DIR/$tdir/f0"
	$LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f1 ||
		error "(1) Fail to create PFL file $DIR/$tdir/f1"
	$LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f2 ||
		error "(2) Fail to create PFL file $DIR/$tdir/f2"

	# 3MB plus one 4KB block: the last block lands on the second stripe
	# of the second component.
	local bcount=$((256 * 3 + 1))

	dd if=/dev/zero of=$DIR/$tdir/f0 bs=4096 count=$bcount
	dd if=/dev/zero of=$DIR/$tdir/f1 bs=4096 count=$bcount
	dd if=/dev/zero of=$DIR/$tdir/f2 bs=4096 count=$bcount

	local fid0=$($LFS path2fid $DIR/$tdir/f0)
	local fid1=$($LFS path2fid $DIR/$tdir/f1)
	local fid2=$($LFS path2fid $DIR/$tdir/f2)

	$LFS getstripe $DIR/$tdir/f0
	$LFS getstripe $DIR/$tdir/f1
	$LFS getstripe $DIR/$tdir/f2

	cancel_lru_locks mdc
	cancel_lru_locks osc

	# Each fault is triggered by unlinking the file while the fail_loc is
	# armed, the same way test_20a does it.
	echo "Inject failure..."
	echo "To simulate f0 lost MDT-object"
	#define OBD_FAIL_LFSCK_LOST_MDTOBJ	0x1616
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1616
	rm -f $DIR/$tdir/f0

	echo "To simulate the case of f1 lost MDT-object and "
	echo "the first OST-object in each PFL component"
	#define OBD_FAIL_LFSCK_LOST_SPEOBJ	0x161a
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161a
	rm -f $DIR/$tdir/f1

	echo "To simulate the case of f2 lost MDT-object and "
	echo "the second OST-object in each PFL component"
	do_facet $SINGLEMDS $LCTL set_param fail_val=1
	rm -f $DIR/$tdir/f2

	do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
	$START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query interval is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" 32 ||
			error "(4) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		local cur_status=$(do_facet ost${k} $LCTL get_param -n \
				obdfilter.$(facet_svc ost${k}).lfsck_layout |
				awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(5) OST${k} Expect 'completed', but got '$cur_status'"
	done

	local repaired=$(do_facet mds1 $LCTL get_param -n \
			 mdd.$(facet_svc mds1).lfsck_layout |
			 awk '/^repaired_orphan/ { print $2 }')
	[ $repaired -eq 8 ] ||
		error "(6) Expect 8 fixed on mds1, but got: $repaired"

	#
	# ${fid0}-R-0 is the old f0
	#
	local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
	echo "Check $name, which is the old f0"

	$LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"

	local pattern=$($LFS getstripe -L -I1 $name)
	[[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
		error "(7.2.1) NOT expect pattern flag hole, but got $pattern"

	pattern=$($LFS getstripe -L -I2 $name)
	[[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
		error "(7.2.2) NOT expect pattern flag hole, but got $pattern"

	local stripes=$($LFS getstripe -c -I1 $name)
	[ $stripes -eq 2 ] ||
		error "(7.3.1) expect 2 stripes, but got $stripes"

	stripes=$($LFS getstripe -c -I2 $name)
	[ $stripes -eq 2 ] ||
		error "(7.3.2) expect 2 stripes, but got $stripes"

	local e_start=$($LFS getstripe -I1 $name |
			awk '/lcme_extent.e_start:/ { print $2 }')
	[ $e_start -eq 0 ] ||
		error "(7.4.1) expect the COMP1 start at 0, got $e_start"

	local e_end=$($LFS getstripe -I1 $name |
		      awk '/lcme_extent.e_end:/ { print $2 }')
	[ $e_end -eq 2097152 ] ||
		error "(7.4.2) expect the COMP1 end at 2097152, got $e_end"

	e_start=$($LFS getstripe -I2 $name |
		  awk '/lcme_extent.e_start:/ { print $2 }')
	[ $e_start -eq 2097152 ] ||
		error "(7.5.1) expect the COMP2 start at 2097152, got $e_start"

	e_end=$($LFS getstripe -I2 $name |
		awk '/lcme_extent.e_end:/ { print $2 }')
	[ "$e_end" = "EOF" ] ||
		error "(7.5.2) expect the COMP2 end at (EOF), got $e_end"

	local size=$(stat $name | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(7.6) expect the size $((4096 * $bcount)), but got $size"

	cat $name > /dev/null || error "(7.7) cannot read $name"

	echo "dummy" >> $name || error "(7.8) cannot write $name"

	chown $RUNAS_ID:$RUNAS_GID $name || error "(7.9) cannot chown on $name"

	touch $name || error "(7.10) cannot touch $name"

	rm -f $name || error "(7.11) cannot unlink $name"

	#
	# ${fid1}-R-0 contains the old f1's second stripe in each COMP
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
	echo "Check $name, it contains f1's second OST-object in each COMP"

	$LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"

	pattern=$($LFS getstripe -L -I1 $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(8.2.1) expect pattern flag hole, but got $pattern"

	pattern=$($LFS getstripe -L -I2 $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(8.2.2) expect pattern flag hole, but got $pattern"

	stripes=$($LFS getstripe -c -I1 $name)
	[ $stripes -eq 2 ] ||
		error "(8.3.1) expect 2 stripes, but got $stripes"

	stripes=$($LFS getstripe -c -I2 $name)
	[ $stripes -eq 2 ] ||
		error "(8.3.2) expect 2 stripes, but got $stripes"

	e_start=$($LFS getstripe -I1 $name |
		  awk '/lcme_extent.e_start:/ { print $2 }')
	[ $e_start -eq 0 ] ||
		error "(8.4.1) expect the COMP1 start at 0, got $e_start"

	e_end=$($LFS getstripe -I1 $name |
		awk '/lcme_extent.e_end:/ { print $2 }')
	[ $e_end -eq 2097152 ] ||
		error "(8.4.2) expect the COMP1 end at 2097152, got $e_end"

	e_start=$($LFS getstripe -I2 $name |
		  awk '/lcme_extent.e_start:/ { print $2 }')
	[ $e_start -eq 2097152 ] ||
		error "(8.5.1) expect the COMP2 start at 2097152, got $e_start"

	e_end=$($LFS getstripe -I2 $name |
		awk '/lcme_extent.e_end:/ { print $2 }')
	[ "$e_end" = "EOF" ] ||
		error "(8.5.2) expect the COMP2 end at (EOF), got $e_end"

	size=$(stat $name | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(8.6) expect the size $((4096 * $bcount)), but got $size"

	cat $name > /dev/null && error "(8.7) normal read $name should fail"

	local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
			 bs=4096 2>&1 | grep "Input/output error" | wc -l)

	# The first stripe in each COMP was lost
	[ $failures -eq 512 ] ||
		error "(8.8) expect 512 IO failures, but get $failures"

	size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(8.9) expect the size $((4096 * $bcount)), but got $size"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
		error "(8.10) write to the LOV EA hole should fail"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
		error "(8.11) write to normal stripe should NOT fail"

	echo "foo" >> $name && error "(8.12) append write $name should fail"

	chown $RUNAS_ID:$RUNAS_GID $name || error "(8.13) cannot chown on $name"

	touch $name || error "(8.14) cannot touch $name"

	rm -f $name || error "(8.15) cannot unlink $name"

	#
	# ${fid2}-R-0 contains the old f2's first stripe in each COMP
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
	echo "Check $name, it contains f2's first stripe in each COMP"

	$LFS getstripe -v $name || error "(9.1) cannot getstripe on $name"

	pattern=$($LFS getstripe -L -I1 $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(9.2.1) expect pattern flag hole, but got $pattern"

	pattern=$($LFS getstripe -L -I2 $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(9.2.2) expect pattern flag hole, but got $pattern"

	stripes=$($LFS getstripe -c -I1 $name)
	[ $stripes -eq 2 ] ||
		error "(9.3.1) expect 2 stripes, but got $stripes"

	stripes=$($LFS getstripe -c -I2 $name)
	[ $stripes -eq 2 ] ||
		error "(9.3.2) expect 2 stripes, but got $stripes"

	e_start=$($LFS getstripe -I1 $name |
		  awk '/lcme_extent.e_start:/ { print $2 }')
	[ $e_start -eq 0 ] ||
		error "(9.4.1) expect the COMP1 start at 0, got $e_start"

	e_end=$($LFS getstripe -I1 $name |
		awk '/lcme_extent.e_end:/ { print $2 }')
	[ $e_end -eq 2097152 ] ||
		error "(9.4.2) expect the COMP1 end at 2097152, got $e_end"

	e_start=$($LFS getstripe -I2 $name |
		  awk '/lcme_extent.e_start:/ { print $2 }')
	[ $e_start -eq 2097152 ] ||
		error "(9.5.1) expect the COMP2 start at 2097152, got $e_start"

	e_end=$($LFS getstripe -I2 $name |
		awk '/lcme_extent.e_end:/ { print $2 }')
	[ "$e_end" = "EOF" ] ||
		error "(9.5.2) expect the COMP2 end at (EOF), got $e_end"

	size=$(stat $name | awk '/Size:/ { print $2 }')
	# The second stripe in COMP was lost, so we do not know there
	# have ever been some data before. 'stat' will regard it as
	# no data on the lost stripe.
	bcount=$((256 * 3))
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(9.6) expect size $((4096 * $bcount)), but got $size"

	cat $name > /dev/null &&
		error "(9.7) normal read $name should fail"

	failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
		   bs=4096 2>&1 | grep "Input/output error" | wc -l)
	[ $failures -eq 512 ] ||
		error "(9.8) expect 512 IO failures, but get $failures"

	size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
	# The second stripe in COMP was lost, so we do not know there
	# have ever been some data before. Since 'dd' skip failure,
	# it will regard the lost stripe contains data.
	bcount=$((256 * 4))
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(9.9) expect the size $((4096 * $bcount)), but got $size"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
		seek=300 && error "(9.10) write to the LOV EA hole should fail"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
		error "(9.11) write to normal stripe should NOT fail"

	echo "foo" >> $name &&
		error "(9.12) append write $name should fail"

	chown $RUNAS_ID:$RUNAS_GID $name ||
		error "(9.13) cannot chown on $name"

	touch $name || error "(9.14) cannot touch $name"

	rm -f $name || error "(9.15) cannot unlink $name"
}
run_test 20b "Handle the orphan with dummy LOV EA slot properly - PFL case"
# test_21: start LFSCK on MDT0000 without naming a component (rate limited
# with -s 1) and check that both the namespace and the layout component
# report 'scanning-phase1'; then stop LFSCK again.
test_21() {
	if ! (( $MDS1_VERSION > $(version_code 2.5.59) )); then
		skip "MDS older than 2.5.59, LU-4887"
	fi

	check_mount_and_prep
	if ! createmany -o $DIR/$tdir/f 100; then
		error "(0) Fail to create 100 files"
	fi

	local mdt=${FSNAME}-MDT0000

	echo "Start all LFSCK components by default (-s 1)"
	if ! do_facet mds1 $LCTL lfsck_start -M $mdt -s 1 -r; then
		error "Fail to start LFSCK"
	fi

	echo "namespace LFSCK should be in 'scanning-phase1' status"
	local STATUS
	STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
	if [ "$STATUS" != "scanning-phase1" ]; then
		error "Expect namespace 'scanning-phase1', but got '$STATUS'"
	fi

	echo "layout LFSCK should be in 'scanning-phase1' status"
	STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
	if [ "$STATUS" != "scanning-phase1" ]; then
		error "Expect layout 'scanning-phase1', but got '$STATUS'"
	fi

	echo "Stop all LFSCK components by default"
	if ! do_facet mds1 $LCTL lfsck_stop -M $mdt; then
		error "Fail to stop LFSCK"
	fi
}
run_test 21 "run all LFSCK components by default"
# test_22a: create foo/dummy on MDT0 while the BAD_PARENT fail_loc is armed,
# remove the 'guard' directory, then run namespace LFSCK on all MDTs and
# expect exactly one repaired unmatched pair, after which listing the
# directory must work.
# Globals read: MDSCOUNT, MDS1_VERSION, DIR, tdir, LFS, LCTL, SINGLEMDS,
#               START_NAMESPACE, SHOW_NAMESPACE
test_22a() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5511"

	# The double quotes around .. are escaped so that they are printed
	# instead of silently closing and reopening the string.
	echo "The parent_A references the child directory via some name entry,"
	echo "but the child directory back references another parent_B via its"
	echo "\"..\" name entry. The parent_B does not exist. Then the namespace"
	echo "LFSCK will repair the child directory's \"..\" name entry."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
	$LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"

	echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
	echo "The dummy's dotdot name entry references the guard."
	#define OBD_FAIL_LFSCK_BAD_PARENT	0x161e
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
	$LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
		error "(3) Fail to mkdir on MDT0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	# With the guard removed, the parent referenced by dummy's dotdot
	# entry no longer exists.
	rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"

	echo "Trigger namespace LFSCK to repair unmatched pairs"
	$START_NAMESPACE -A -r ||
		error "(5) Fail to start LFSCK for namespace"

	# The trailing 6 continues the (N) numbering of the error messages.
	wait_all_targets_blocked namespace completed 6

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^unmatched_pairs_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(7) Fail to repair unmatched pairs: $repaired"

	echo "'ls' should success after namespace LFSCK repairing"
	ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
		error "(8) ls should success."
}
run_test 22a "LFSCK can repair unmatched pairs (1)"
# test_22b: create foo/dummy on MDT0 while the BAD_PARENT fail_loc is armed,
# check that fid2path no longer resolves dummy's FID to its real path, run
# namespace LFSCK on all MDTs, expect exactly one repaired unmatched pair and
# a working fid2path afterwards.
# Globals read: MDSCOUNT, MDS1_VERSION, DIR, tdir, LFS, LCTL, SINGLEMDS,
#               START_NAMESPACE, SHOW_NAMESPACE
test_22b() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5511"

	# The double quotes around .. are escaped so that they are printed
	# instead of silently closing and reopening the string.
	echo "The parent_A references the child directory via the name entry_B,"
	echo "but the child directory back references another parent_C via its"
	echo "\"..\" name entry. The parent_C exists, but there is no the name"
	echo "entry_B under the parent_C. Then the namespace LFSCK will repair"
	echo "the child directory's \"..\" name entry and its linkEA."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
	$LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"

	echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
	echo "and bad linkEA. The dummy's dotdot name entry references the"
	echo "guard. The dummy's linkEA references a non-existent name entry."
	#define OBD_FAIL_LFSCK_BAD_PARENT	0x161e
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
	$LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
		error "(3) Fail to mkdir on MDT0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	local dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
	echo "fid2path should NOT work on the dummy's FID $dummyfid"
	local dummyname=$($LFS fid2path $DIR $dummyfid)
	[ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
		error "(4) fid2path works unexpectedly."

	echo "Trigger namespace LFSCK to repair unmatched pairs"
	$START_NAMESPACE -A -r ||
		error "(5) Fail to start LFSCK for namespace"

	# The trailing 6 continues the (N) numbering of the error messages.
	wait_all_targets_blocked namespace completed 6

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^unmatched_pairs_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(7) Fail to repair unmatched pairs: $repaired"

	echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
	dummyname=$($LFS fid2path $DIR $dummyfid)
	[ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
		error "(8) fid2path does not work"
}
run_test 22b "LFSCK can repair unmatched pairs (2)"
# Test 23a: a name entry whose MDT-object is gone (dangling entry). A first
# LFSCK run only counts it; a second run with -C is expected to make the
# entry usable again.
3940 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3941 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
3942 skip "MDS older than 2.6.50, LU-5512"
3945 echo "The name entry is there, but the MDT-object for such name "
3946 echo "entry does not exist. The namespace LFSCK should find out "
3947 echo "and repair the inconsistency as required."
3950 check_mount_and_prep
# d0 lives on MDT index 0, its child d1 on MDT index 1.
3952 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3953 $LFS mkdir -i 1 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT1"
3955 echo "Inject failure stub on MDT1 to simulate dangling name entry"
# fail_loc 0x1620 is armed on facet mds2 only for the rmdir of d1.
3956 #define OBD_FAIL_LFSCK_DANGLING2 0x1620
3957 do_facet mds2 $LCTL set_param fail_loc=0x1620
3958 rmdir $DIR/$tdir/d0/d1 || error "(3) Fail to rmdir d1"
3959 do_facet mds2 $LCTL set_param fail_loc=0
3961 echo "'ls' should fail because of dangling name entry"
3962 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(4) ls should fail."
# First pass: started without -C.
3964 echo "Trigger namespace LFSCK to find out dangling name entry"
3965 $START_NAMESPACE -A -r ||
3966 error "(5) Fail to start LFSCK for namespace"
3968 wait_all_targets_blocked namespace completed 6
# dangling_repaired is expected to read 1 even though, as checked right after,
# the path is still not accessible.
3970 local repaired=$($SHOW_NAMESPACE |
3971 awk '/^dangling_repaired/ { print $2 }')
3972 [ $repaired -eq 1 ] ||
3973 error "(7) Fail to repair dangling name entry: $repaired"
3975 echo "'ls' should fail because not re-create MDT-object by default"
3976 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(8) ls should fail."
# Second pass: same start command plus -C.
# NOTE(review): -C presumably asks LFSCK to re-create the missing MDT-object
# -- confirm against the lctl lfsck_start documentation.
3978 echo "Trigger namespace LFSCK again to repair dangling name entry"
3979 $START_NAMESPACE -A -r -C ||
3980 error "(9) Fail to start LFSCK for namespace"
3982 wait_all_targets_blocked namespace completed 10
3984 repaired=$($SHOW_NAMESPACE |
3985 awk '/^dangling_repaired/ { print $2 }')
3986 [ $repaired -eq 1 ] ||
3987 error "(11) Fail to repair dangling name entry: $repaired"
3989 echo "'ls' should success after namespace LFSCK repairing"
3990 ls -ail $DIR/$tdir/d0/d1 > /dev/null || error "(12) ls should success."
3992 run_test 23a "LFSCK can repair dangling name entry (1)"
# Test 23b: a hard link name entry is made to reference a non-existent
# object. LFSCK (started with -C) is expected to count one dangling repair and
# one multiple-linked repair, after which the link reads the data of the real
# target file.
3995 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
3996 skip "MDS older than 2.6.50, LU-5512"
3999 echo "The objectA has multiple hard links, one of them corresponding"
4000 echo "to the name entry_B. But there is something wrong for the name"
4001 echo "entry_B and cause entry_B to references non-exist object_C."
4002 echo "In the first-stage scanning, the LFSCK will think the entry_B"
4003 echo "as dangling, and re-create the lost object_C. When the LFSCK"
4004 echo "comes to the second-stage scanning, it will find that the"
4005 echo "former re-creating object_C is not proper, and will try to"
4006 echo "replace the object_C with the real object_A."
4009 check_mount_and_prep
# The bare path2fid calls below only print the FIDs into the test output.
4011 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
4012 $LFS path2fid $DIR/$tdir/d0
# Ten extra files t0..t9; they are unlinked again further down (LU-7429).
4014 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
4016 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
4017 $LFS path2fid $DIR/$tdir/d0/f0
4019 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
4020 $LFS path2fid $DIR/$tdir/d0/f1
# First ':'-separated field of each FID, i.e. the sequence part.
4022 local SEQ0=$($LFS path2fid $DIR/$tdir/d0/f0 | awk -F':' '{print $1}')
4023 local SEQ1=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $1}')
4025 if [ "$SEQ0" != "$SEQ1" ]; then
4026 # To guarantee that the f0 and f1 are in the same FID seq
4027 rm -f $DIR/$tdir/d0/f0 ||
4028 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
4029 echo "dummy" > $DIR/$tdir/d0/f0 ||
4030 error "(3.2) Fail to touch on MDT0"
4031 $LFS path2fid $DIR/$tdir/d0/f0
# Second FID field of f1, re-printed by printf %d as a decimal integer and
# passed to the failure stub as fail_val.
4034 local OID=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $2}')
4035 OID=$(printf %d $OID)
4037 echo "Inject failure stub on MDT0 to simulate dangling name entry"
# fail_loc 0x1621 (with fail_val=$OID) is armed only while the hard link
# foo -> f0 is created.
4038 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
4039 do_facet $SINGLEMDS $LCTL set_param fail_val=$OID fail_loc=0x1621
4040 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
4041 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
4043 # If there is creation after the dangling injection, it may re-use
4044 # the just released local object (inode) that is referenced by the
4045 # dangling name entry. It will fail the dangling injection.
4046 # So before deleting the target object for the dangling name entry,
4047 # remove some other objects to avoid the target object being reused
4048 # by some potential creations. LU-7429
4049 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
4051 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
4053 echo "'ls' should fail because of dangling name entry"
4054 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4055 error "(6) ls should fail."
4057 echo "Trigger namespace LFSCK to find out dangling name entry"
4058 $START_NAMESPACE -r -C ||
4059 error "(7) Fail to start LFSCK for namespace"
# Poll the status line of lfsck_namespace on $SINGLEMDS until it reads
# "completed". NOTE(review): 32 is presumably the wait limit in seconds.
4061 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4062 mdd.${MDT_DEV}.lfsck_namespace |
4063 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4065 error "(8) unexpected status"
4068 local repaired=$($SHOW_NAMESPACE |
4069 awk '/^dangling_repaired/ { print $2 }')
4070 [ $repaired -eq 1 ] ||
4071 error "(9) Fail to repair dangling name entry: $repaired"
4073 repaired=$($SHOW_NAMESPACE |
4074 awk '/^multiple_linked_repaired/ { print $2 }')
4075 [ $repaired -eq 1 ] ||
4076 error "(10) Fail to drop the former created object: $repaired"
# foo must now read back the content that was written to f0.
4078 local data=$(cat $DIR/$tdir/d0/foo)
4079 [ "$data" == "dummy" ] ||
4080 error "(11) The $DIR/$tdir/d0/foo is not recovered: $data"
4082 run_test 23b "LFSCK can repair dangling name entry (2)"
# Teardown steps: clear fail_val/fail_loc on $SINGLEMDS, wait until the
# namespace LFSCK status on $SINGLEMDS reads "completed", then stop full debug
# logging.
# NOTE(review): this looks like the teardown for test 23c below (which starts
# full debug logging and leaves fail_loc 0x1602 armed) -- confirm.
4085 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
4086 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4087 mdd.${MDT_DEV}.lfsck_namespace |
4088 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4090 error "(10) unexpected status"
4093 stop_full_debug_logging
# Test 23c: same dangling hard-link injection as 23b, but the re-created
# object is written to while LFSCK is delayed, so LFSCK is expected NOT to
# replace it with the original file.
4097 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4098 skip "MDS older than 2.6.50, LU-5512"
4101 echo "The objectA has multiple hard links, one of them corresponding"
4102 echo "to the name entry_B. But there is something wrong for the name"
4103 echo "entry_B and cause entry_B to references non-exist object_C."
4104 echo "In the first-stage scanning, the LFSCK will think the entry_B"
4105 echo "as dangling, and re-create the lost object_C. And then others"
4106 echo "modified the re-created object_C. When the LFSCK comes to the"
4107 echo "second-stage scanning, it will find that the former re-creating"
4108 echo "object_C maybe wrong and try to replace the object_C with the"
4109 echo "real object_A. But because object_C has been modified, so the"
4110 echo "LFSCK cannot replace it."
4113 start_full_debug_logging
4115 check_mount_and_prep
4117 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
4118 parent_fid="$($LFS path2fid $DIR/$tdir/d0)"
4119 echo "parent_fid=$parent_fid"
# Ten extra files t0..t9; they are unlinked again further down (LU-7429).
4121 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
4123 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
4124 f0_fid="$($LFS path2fid $DIR/$tdir/d0/f0)"
4125 echo "f0_fid=$f0_fid"
4127 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
4128 f1_fid="$($LFS path2fid $DIR/$tdir/d0/f1)"
4129 echo "f1_fid=$f1_fid"
# Compare the FID sequence (the text before the first ':') of f0 and f1.
# The FIDs are held in f0_fid/f1_fid; %%:* strips everything from the first
# colon onwards. The previous expression read the never-assigned names
# fid_f0/fid_f1, so both sides were empty and this branch could never run.
4131 if [ "${f0_fid%%:*}" != "${f1_fid%%:*}" ]; then
4132 # To guarantee that the f0 and f1 are in the same FID seq
4133 rm -f $DIR/$tdir/d0/f0 ||
4134 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
4135 echo "dummy" > $DIR/$tdir/d0/f0 ||
4136 error "(3.2) Fail to touch on MDT0"
4137 f0_fid="$($LFS path2fid $DIR/$tdir/d0/f0)"
4138 echo "f0_fid=$f0_fid (replaced)"
# Second ':'-separated field of f1's FID, passed to the stub as fail_val.
4141 local oid=$(awk -F':' '{ printf $2 }' <<< $f1_fid)
4143 echo "Inject failure stub on MDT0 to simulate dangling name entry"
# fail_loc 0x1621 (with fail_val=$oid) is armed only while the hard link
# foo -> f0 is created.
4144 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
4145 do_facet $SINGLEMDS $LCTL set_param fail_val=$oid fail_loc=0x1621
4146 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
4147 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
4149 # If there is creation after the dangling injection, it may re-use
4150 # the just released local object (inode) that is referenced by the
4151 # dangling name entry. It will fail the dangling injection.
4152 # So before deleting the target object for the dangling name entry,
4153 # remove some other objects to avoid the target object being reused
4154 # by some potential creations. LU-7429
4155 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
4157 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
4159 echo "'ls' should fail because of dangling name entry"
4160 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4161 error "(6) ls should fail."
# This fail_loc is armed with fail_val=10 before LFSCK starts and is not
# cleared in the lines that follow here.
4163 #define OBD_FAIL_LFSCK_DELAY3 0x1602
4164 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
4166 echo "Trigger namespace LFSCK to find out dangling name entry"
4167 $START_NAMESPACE -r -C ||
4168 error "(7) Fail to start LFSCK for namespace"
# Wait for foo to show up with size 0 on the client.
4170 wait_update_facet client "stat -c%s $DIR/$tdir/d0/foo" "0" $LTIME || {
4171 # While unexpected by the test, it is valid for LFSCK to repair
4172 # the link to the original object before any data is written.
4173 local size=$(stat -c %s $DIR/$tdir/d0/foo)
4175 if [ "$size" = "6" -a "$(<$DIR/$tdir/d0/foo)" = "dummy" ]; then
4176 log "LFSCK repaired file prematurely"
4181 stat $DIR/$tdir/d0/foo
4183 error "(8) unexpected size"
# Modify foo by appending to it.
4186 echo "data" >> $DIR/$tdir/d0/foo || error "(9) Fail to write"
4187 cancel_lru_locks osc
4191 local repaired=$($SHOW_NAMESPACE |
4192 awk '/^dangling_repaired/ { print $2 }')
4193 [ $repaired -eq 1 ] ||
4194 error "(11) Fail to repair dangling name entry: $repaired"
# The content must differ from f0's "dummy": the modified object is kept.
4196 local data=$(cat $DIR/$tdir/d0/foo)
4197 [ "$data" != "dummy" ] ||
4198 error "(12) The $DIR/$tdir/d0/foo should not be recovered"
4200 run_test 23c "LFSCK can repair dangling name entry (3)"
# Test 24: two MDT-objects carry a linkEA entry for the same name entry while
# the name entry references only one of them. LFSCK is expected to count one
# multiple_referenced repair and to leave an orphan under
# .lustre/lost+found/MDT0000/.
4203 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4204 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4205 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4206 skip "MDS older than 2.6.50, LU-5513"
4209 echo "Two MDT-objects back reference the same name entry via their"
4210 echo "each own linkEA entry, but the name entry only references one"
4211 echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
4212 echo "for the MDT-object that is not recognized. If such MDT-object"
4213 echo "has no other linkEA entry after the removing, then the LFSCK"
4214 echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."
4217 check_mount_and_prep
4219 mkdir_on_mdt -i1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
# The bare path2fid calls only print the FIDs into the test output.
4221 mkdir $DIR/$tdir/d0/guard || error "(1) Fail to mkdir guard"
4222 $LFS path2fid $DIR/$tdir/d0/guard
4224 mkdir $DIR/$tdir/d0/dummy || error "(2) Fail to mkdir dummy"
4225 $LFS path2fid $DIR/$tdir/d0/dummy
# pfid is later used as the parent-FID component of the expected orphan name.
4228 if [ $mds1_FSTYPE != ldiskfs ]; then
4229 pfid=$($LFS path2fid $DIR/$tdir/d0/guard)
4231 pfid=$($LFS path2fid $DIR/$tdir/d0/dummy)
4234 touch $DIR/$tdir/d0/guard/foo ||
4235 error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"
4237 echo "Inject failure stub on MDT0 to simulate the case that"
4238 echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
4239 echo "that references $DIR/$tdir/d0/guard/foo."
4240 echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
4241 echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
4242 echo "there with the same linkEA entry as another MDT-object"
4243 echo "$DIR/$tdir/d0/guard/foo has"
# fail_loc 0x1622 stays armed across both the creation and the rmdir of
# dummy/foo; cfid records that directory's FID in between.
4245 #define OBD_FAIL_LFSCK_MUL_REF 0x1622
4246 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
4247 mkdir_on_mdt -i0 $DIR/$tdir/d0/dummy/foo ||
4248 error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
4249 $LFS path2fid $DIR/$tdir/d0/dummy/foo
4250 local cfid=$($LFS path2fid $DIR/$tdir/d0/dummy/foo)
4251 rmdir $DIR/$tdir/d0/dummy/foo ||
4252 error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
4253 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4255 echo "stat $DIR/$tdir/d0/dummy/foo should fail"
4256 stat $DIR/$tdir/d0/dummy/foo > /dev/null 2>&1 &&
4257 error "(6) stat successfully unexpectedly"
4259 echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
4260 $START_NAMESPACE -A -r ||
4261 error "(7) Fail to start LFSCK for namespace"
4263 wait_all_targets_blocked namespace completed 8
4265 local repaired=$($SHOW_NAMESPACE |
4266 awk '/^multiple_referenced_repaired/ { print $2 }')
4267 [ $repaired -eq 1 ] ||
4268 error "(9) Fail to repair multiple referenced name entry: $repaired"
4270 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
4271 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4272 error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# The orphan is looked up under the name <child FID>-<parent FID>-D-0.
4274 local cname="$cfid-$pfid-D-0"
4275 ls -ail $MOUNT/.lustre/lost+found/MDT0000/$cname ||
4276 error "(11) .lustre/lost+found/MDT0000/ should not be empty"
4278 run_test 24 "LFSCK can repair multiple-referenced name entry"
# Test 25: a name entry is created with a wrong file type recorded in it; the
# namespace LFSCK is expected to count one bad_file_type repair.
# Runs only when mds1 uses ldiskfs and the MDS is newer than 2.6.50.
4281 [[ $mds1_FSTYPE == ldiskfs ]] || skip "only ldiskfs fixes dirent type"
4282 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4283 skip "MDS older than 2.6.50, LU-5515"
4286 echo "The file type in the name entry does not match the file type"
4287 echo "claimed by the referenced object. Then the LFSCK will update"
4288 echo "the file type in the name entry."
4291 check_mount_and_prep
4293 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4295 echo "Inject failure stub on MDT0 to simulate the case that"
4296 echo "the file type stored in the name entry is wrong."
# fail_loc 0x1623 is armed on $SINGLEMDS only while d0/foo is created.
4298 #define OBD_FAIL_LFSCK_BAD_TYPE 0x1623
4299 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
4300 touch $DIR/$tdir/d0/foo || error "(2) Fail to touch $DIR/$tdir/d0/foo"
4301 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Started with -r only (no -A), so completion is polled on $SINGLEMDS alone.
4303 echo "Trigger namespace LFSCK to repair bad file type in the name entry"
4304 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace"
4306 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4307 mdd.${MDT_DEV}.lfsck_namespace |
4308 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4310 error "(4) unexpected status"
4313 local repaired=$($SHOW_NAMESPACE |
4314 awk '/^bad_file_type_repaired/ { print $2 }')
4315 [ $repaired -eq 1 ] ||
4316 error "(5) Fail to repair bad file type in name entry: $repaired"
4318 ls -ail $DIR/$tdir/d0 || error "(6) Fail to 'ls' the $DIR/$tdir/d0"
4320 run_test 25 "LFSCK can repair bad file type in the name entry"
# Test 26a: the name entry of a file is removed while the object and its
# linkEA are kept (local case, everything under a directory on MDT index 0).
# LFSCK is expected to count one lost_dirent repair and bring the name back
# with the same FID.
4323 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4324 skip "MDS older than 2.6.50, LU-5516"
4327 echo "The local name entry back referenced by the MDT-object is lost."
4328 echo "The namespace LFSCK will add the missing local name entry back"
4329 echo "to the normal namespace."
4332 check_mount_and_prep
4334 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4335 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
# Remember foo's FID so it can be compared after the repair.
4336 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
4338 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
4339 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4341 echo "Inject failure stub on MDT0 to simulate the case that"
4342 echo "foo's name entry will be removed, but the foo's object"
4343 echo "and its linkEA are kept in the system."
# fail_loc 0x1624 is armed on $SINGLEMDS only while foo is unlinked.
4345 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4346 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4347 rm -f $DIR/$tdir/d0/foo || error "(4) Fail to unlink $DIR/$tdir/d0/foo"
4348 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4350 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4351 error "(5) 'ls' should fail"
4353 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4354 $START_NAMESPACE -r -A ||
4355 error "(6) Fail to start LFSCK for namespace"
4357 wait_all_targets_blocked namespace completed 7
4359 local repaired=$($SHOW_NAMESPACE |
4360 awk '/^lost_dirent_repaired/ { print $2 }')
4361 [ $repaired -eq 1 ] ||
4362 error "(8) Fail to repair lost dirent: $repaired"
4364 ls -ail $DIR/$tdir/d0/foo ||
4365 error "(9) Fail to 'ls' $DIR/$tdir/d0/foo"
# The restored name must resolve to the FID recorded before the injection.
4367 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
4368 [ "$foofid" == "$foofid2" ] ||
4369 error "(10) foo's FID changed: $foofid, $foofid2"
4371 run_test 26a "LFSCK can add the missing local name entry back to the namespace"
# Test 26b: remote variant of 26a. The parent d0 is on MDT index 1 and the
# child directory foo on MDT index 0; foo's name entry is removed while its
# object and linkEA are kept. LFSCK is expected to count one lost_dirent
# repair and bring the name back with the same FID.
4374 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4375 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4376 skip "MDS older than 2.6.50, LU-5516"
4379 echo "The remote name entry back referenced by the MDT-object is lost."
4380 echo "The namespace LFSCK will add the missing remote name entry back"
4381 echo "to the normal namespace."
4384 check_mount_and_prep
4386 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4387 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
# Remember foo's FID so it can be compared after the repair.
4388 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
4390 echo "Inject failure stub on MDT0 to simulate the case that"
4391 echo "foo's name entry will be removed, but the foo's object"
4392 echo "and its linkEA are kept in the system."
# fail_loc 0x1624 is armed on $SINGLEMDS only while foo is removed.
4394 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4395 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4396 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
4397 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4399 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4400 error "(4) 'ls' should fail"
4402 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4403 $START_NAMESPACE -r -A ||
4404 error "(5) Fail to start LFSCK for namespace"
4406 wait_all_targets_blocked namespace completed 6
4408 local repaired=$($SHOW_NAMESPACE |
4409 awk '/^lost_dirent_repaired/ { print $2 }')
4410 [ $repaired -eq 1 ] ||
4411 error "(7) Fail to repair lost dirent: $repaired"
4413 ls -ail $DIR/$tdir/d0/foo ||
4414 error "(8) Fail to 'ls' $DIR/$tdir/d0/foo"
# The restored name must resolve to the FID recorded before the injection.
4416 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
4417 [ "$foofid" == "$foofid2" ] ||
4418 error "(9) foo's FID changed: $foofid, $foofid2"
4420 run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
# Test 27a: both names of a file and then its parent directory d0 (on MDT
# index 0) are removed while the file's object and linkEA are kept. LFSCK is
# expected to count one lost_dirent repair and to leave an entry matching
# *-P-* under .lustre/lost+found/MDT0000/.
4423 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4424 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4425 skip "MDS older than 2.6.50, LU-5516"
4428 echo "The local parent referenced by the MDT-object linkEA is lost."
4429 echo "The namespace LFSCK will re-create the lost parent as orphan."
4432 check_mount_and_prep
4434 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4435 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4436 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
4437 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4439 echo "Inject failure stub on MDT0 to simulate the case that"
4440 echo "foo's name entry will be removed, but the foo's object"
4441 echo "and its linkEA are kept in the system. And then remove"
4442 echo "another hard link and the parent directory."
# fail_loc 0x1624 stays armed on $SINGLEMDS while both names (foo and dummy)
# are unlinked; it is cleared before d0 itself is removed.
4444 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4445 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4446 rm -f $DIR/$tdir/d0/foo ||
4447 error "(4) Fail to unlink $DIR/$tdir/d0/foo"
4448 rm -f $DIR/$tdir/d0/dummy ||
4449 error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
4450 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4452 rm -rf $DIR/$tdir/d0 || error "(5) Fail to unlink the dir d0"
4453 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(6) 'ls' should fail"
4455 echo "Trigger namespace LFSCK to repair the lost parent"
4456 $START_NAMESPACE -r -A ||
4457 error "(6) Fail to start LFSCK for namespace"
4459 wait_all_targets_blocked namespace completed 7
4461 local repaired=$($SHOW_NAMESPACE |
4462 awk '/^lost_dirent_repaired/ { print $2 }')
4463 [ $repaired -eq 1 ] ||
4464 error "(8) Fail to repair lost dirent: $repaired"
4466 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
4467 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4468 error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4470 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# The -name pattern is quoted so that find receives the glob itself; unquoted,
# the shell could expand it against files in the current directory first.
4472 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-P-*')
4473 [ ! -z "$cname" ] ||
4474 error "(10) .lustre/lost+found/MDT0000/ should not be empty"
4476 run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
# Test 27b: remote variant of 27a. The parent d0 is on MDT index 1 and the
# child directory foo on MDT index 0; foo's name entry and then d0 are removed
# while foo's object and linkEA are kept. LFSCK is expected to count one
# lost_dirent repair and to leave an entry matching *-P-* under
# .lustre/lost+found/MDT0001/.
4479 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4480 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4481 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4482 skip "MDS older than 2.6.50, LU-5516"
4485 echo "The remote parent referenced by the MDT-object linkEA is lost."
4486 echo "The namespace LFSCK will re-create the lost parent as orphan."
4489 check_mount_and_prep
4491 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4492 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
# Printed into the test output only.
4494 $LFS path2fid $DIR/$tdir/d0
4496 echo "Inject failure stub on MDT0 to simulate the case that"
4497 echo "foo's name entry will be removed, but the foo's object"
4498 echo "and its linkEA are kept in the system. And then remove"
4499 echo "the parent directory."
# fail_loc 0x1624 is armed on $SINGLEMDS only while foo is removed; d0 is
# removed afterwards with the fail_loc cleared.
4501 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4502 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4503 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
4504 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4506 rmdir $DIR/$tdir/d0 || error "(4) Fail to unlink the dir d0"
4507 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(5) 'ls' should fail"
4509 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4510 $START_NAMESPACE -r -A ||
4511 error "(6) Fail to start LFSCK for namespace"
4513 wait_all_targets_blocked namespace completed 7
4515 local repaired=$($SHOW_NAMESPACE |
4516 awk '/^lost_dirent_repaired/ { print $2 }')
4517 [ $repaired -eq 1 ] ||
4518 error "(8) Fail to repair lost dirent: $repaired"
4520 ls -ail $MOUNT/.lustre/lost+found/
4522 echo "There should be an orphan under .lustre/lost+found/MDT0001/"
4523 [ -d $MOUNT/.lustre/lost+found/MDT0001 ] ||
4524 error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"
4526 ls -ail $MOUNT/.lustre/lost+found/MDT0001/
# The -name pattern is quoted so that find receives the glob itself; unquoted,
# the shell could expand it against files in the current directory first.
4528 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-P-*')
4529 [ ! -z "$cname" ] ||
4530 error "(10) .lustre/lost+found/MDT0001/ should not be empty"
4532 run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
# Test 28: name entries are lost on two MDTs, and one MDT is additionally made
# to fail during the first LFSCK run. The first run is expected to end
# "partial" on mds1 (0 lost_dirent repairs) and "completed" on mds2 (1
# repair); a second, undisturbed run is expected to show 1 repair on mds1 and
# 0 on mds2.
4535 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4536 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4537 skip "MDS older than 2.6.50, LU-5506"
4540 echo "The target name entry is lost. The LFSCK should insert the"
4541 echo "orphan MDT-object under .lustre/lost+found/MDTxxxx. But if"
4542 echo "the MDT (on which the orphan MDT-object resides) has ever"
4543 echo "failed to respond some name entry verification during the"
4544 echo "first stage-scanning, then the LFSCK should skip to handle"
4545 echo "orphan MDT-object on this MDT. But other MDTs should not"
4549 check_mount_and_prep
# d1 on MDT index 0 with children a1, a2 on MDT index 1.
4550 $LFS mkdir -i 0 $DIR/$tdir/d1
4551 $LFS mkdir -i 1 $DIR/$tdir/d1/a1
4552 $LFS mkdir -i 1 $DIR/$tdir/d1/a2
# d2 on MDT index 1 with children a1, a2 on MDT index 0.
4554 $LFS mkdir -i 1 $DIR/$tdir/d2
4555 $LFS mkdir -i 0 $DIR/$tdir/d2/a1
4556 $LFS mkdir -i 0 $DIR/$tdir/d2/a2
4558 echo "Inject failure stub on MDT0 to simulate the case that"
4559 echo "d1/a1's name entry will be removed, but the d1/a1's object"
4560 echo "and its linkEA are kept in the system. And the case that"
4561 echo "d2/a2's name entry will be removed, but the d2/a2's object"
4562 echo "and its linkEA are kept in the system."
# fail_loc 0x1624 is armed on both mds1 and mds2 while d1/a1 and d2/a2 are
# removed, then cleared on both.
4564 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4565 do_facet mds1 $LCTL set_param fail_loc=0x1624
4566 do_facet mds2 $LCTL set_param fail_loc=0x1624
4567 rmdir $DIR/$tdir/d1/a1 || error "(1) Fail to rmdir $DIR/$tdir/d1/a1"
4568 rmdir $DIR/$tdir/d2/a2 || error "(2) Fail to rmdir $DIR/$tdir/d2/a2"
4569 do_facet mds1 $LCTL set_param fail_loc=0
4570 do_facet mds2 $LCTL set_param fail_loc=0
4572 cancel_lru_locks mdc
4573 cancel_lru_locks osc
4575 echo "Inject failure, to simulate the MDT0 fail to handle"
4576 echo "MDT1 LFSCK request during the first-stage scanning."
# This fail_loc is set on mds2 and stays armed for the whole first LFSCK run.
4577 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
4578 do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0
4580 echo "Trigger namespace LFSCK on all devices to find out orphan object"
4581 $START_NAMESPACE -r -A ||
4582 error "(3) Fail to start LFSCK for namespace"
# First run: mds1 must end in status "partial", mds2 in "completed".
4584 wait_update_facet mds1 "$LCTL get_param -n \
4585 mdd.$(facet_svc mds1).lfsck_namespace |
4586 awk '/^status/ { print \\\$2 }'" "partial" 32 || {
4587 error "(4) mds1 is not the expected 'partial'"
4590 wait_update_facet mds2 "$LCTL get_param -n \
4591 mdd.$(facet_svc mds2).lfsck_namespace |
4592 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4593 error "(5) mds2 is not the expected 'completed'"
4596 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
# lost_dirent_repaired after the first run: 0 on mds1, 1 on mds2.
4598 local repaired=$(do_facet mds1 $LCTL get_param -n \
4599 mdd.$(facet_svc mds1).lfsck_namespace |
4600 awk '/^lost_dirent_repaired/ { print $2 }')
4601 [ $repaired -eq 0 ] ||
4602 error "(6) Expect 0 fixed on mds1, but got: $repaired"
4604 repaired=$(do_facet mds2 $LCTL get_param -n \
4605 mdd.$(facet_svc mds2).lfsck_namespace |
4606 awk '/^lost_dirent_repaired/ { print $2 }')
4607 [ $repaired -eq 1 ] ||
4608 error "(7) Expect 1 fixed on mds2, but got: $repaired"
# Second run, with no fail_loc armed.
4610 echo "Trigger namespace LFSCK on all devices again to cleanup"
4611 $START_NAMESPACE -r -A ||
4612 error "(8) Fail to start LFSCK for namespace"
4614 wait_all_targets_blocked namespace completed 9
# lost_dirent_repaired after the second run: 1 on mds1, 0 on mds2.
4616 local repaired=$(do_facet mds1 $LCTL get_param -n \
4617 mdd.$(facet_svc mds1).lfsck_namespace |
4618 awk '/^lost_dirent_repaired/ { print $2 }')
4619 [ $repaired -eq 1 ] ||
4620 error "(10) Expect 1 fixed on mds1, but got: $repaired"
4622 repaired=$(do_facet mds2 $LCTL get_param -n \
4623 mdd.$(facet_svc mds2).lfsck_namespace |
4624 awk '/^lost_dirent_repaired/ { print $2 }')
4625 [ $repaired -eq 0 ] ||
4626 error "(11) Expect 0 fixed on mds2, but got: $repaired"
4628 run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
# Test 29a (disabled: its run_test line at the end is commented out): a hard
# link is created with a failure stub armed so that foo reports nlink 3; the
# namespace LFSCK is then expected to count one nlinks repair and bring the
# count back to 2.
4631 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4632 skip "MDS older than 2.6.50, LU-5517"
4635 echo "The object's nlink attribute is larger than the object's known"
4636 echo "name entries count. The LFSCK will repair the object's nlink"
4637 echo "attribute to match the known name entries count"
4640 check_mount_and_prep
4642 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4643 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4645 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4646 echo "nlink attribute is larger than its name entries count."
# fail_loc 0x1625 is armed on $SINGLEMDS only while the hard link h1 is made.
4648 #define OBD_FAIL_LFSCK_MORE_NLINK 0x1625
4649 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
4650 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4651 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4652 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Confirm the injection took effect: stat must report 3 links for the two
# names foo and h1.
4654 cancel_lru_locks mdc
4655 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
4656 [ $count -eq 3 ] || error "(4) Cannot inject error: $count"
4658 echo "Trigger namespace LFSCK to repair the nlink count"
4659 $START_NAMESPACE -r -A ||
4660 error "(5) Fail to start LFSCK for namespace"
4662 wait_all_targets_blocked namespace completed 6
4664 local repaired=$($SHOW_NAMESPACE |
4665 awk '/^nlinks_repaired/ { print $2 }')
4666 [ $repaired -eq 1 ] ||
4667 error "(7) Fail to repair nlink count: $repaired"
# Re-read the link count after dropping the client's mdc locks.
4669 cancel_lru_locks mdc
4670 count=$(stat --format=%h $DIR/$tdir/d0/foo)
4671 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
4673 # Disable 29a, we only allow nlink to be updated if the known linkEA
4674 # entries is larger than nlink count.
4676 #run_test 29a "LFSCK can repair bad nlink count (1)"
# Test 29b: a hard link is created with a failure stub armed so that foo
# reports nlink 1 although it has two names; the namespace LFSCK is expected
# to count one nlinks repair and bring the count to 2.
4679 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4680 skip "MDS older than 2.6.50, LU-5517"
4683 echo "The object's nlink attribute is smaller than the object's known"
4684 echo "name entries count. The LFSCK will repair the object's nlink"
4685 echo "attribute to match the known name entries count"
4688 check_mount_and_prep
4690 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4691 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4693 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4694 echo "nlink attribute is smaller than its name entries count."
# fail_loc 0x1626 is armed on $SINGLEMDS only while the hard link h1 is made.
4696 #define OBD_FAIL_LFSCK_LESS_NLINK 0x1626
4697 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
4698 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4699 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4700 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Confirm the injection took effect: stat must report 1 link for the two
# names foo and h1.
4702 cancel_lru_locks mdc
4703 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
4704 [ $count -eq 1 ] || error "(4) Cannot inject error: $count"
4706 echo "Trigger namespace LFSCK to repair the nlink count"
4707 $START_NAMESPACE -r -A ||
4708 error "(5) Fail to start LFSCK for namespace"
4710 wait_all_targets_blocked namespace completed 6
4712 local repaired=$($SHOW_NAMESPACE |
4713 awk '/^nlinks_repaired/ { print $2 }')
4714 [ $repaired -eq 1 ] ||
4715 error "(7) Fail to repair nlink count: $repaired"
# Re-read the link count after dropping the client's mdc locks.
4717 cancel_lru_locks mdc
4718 count=$(stat --format=%h $DIR/$tdir/d0/foo)
4719 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
4721 run_test 29b "LFSCK can repair bad nlink count (2)"
# Test 29c: "verify linkEA size limitation" (registered by the run_test line
# that closes this block).
# Visible flow: create 150 hard links to guard/f0, check that migrating the
# guard directory fails (only when MDSCOUNT >= 2), unlink 100 of the links,
# check that migration still fails, run the namespace LFSCK on all targets,
# then expect linkea_overflow_cleared == 1, nlinks_repaired == 0 and, on
# multi-MDT setups, a successful migration that changes the FID of f0.
# NOTE(review): the function header/closing brace and the `fi` that close the
# `if` statements below are not visible in this listing - confirm against the
# complete file.
4725 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4726 skip "MDS older than 2.6.50, LU-5517"
4729 echo "The namespace LFSCK will create many hard links to the target"
4730 echo "file as to exceed the linkEA size limitation. Under such case"
4731 echo "the linkEA will be marked as overflow that will prevent the"
4732 echo "target file to be migrated. Then remove some hard links to"
4733 echo "make the left hard links to be held within the linkEA size"
4734 echo "limitation. But before the namespace LFSCK adding all the"
4735 echo "missed linkEA entries back, the overflow mark (timestamp)"
4736 echo "will not be cleared."
4739 check_mount_and_prep
4741 mkdir -p $DIR/$tdir/guard || error "(0.1) Fail to mkdir"
# foo is created on the MDT with the highest index (MDSCOUNT - 1).
4742 $LFS mkdir -i $((MDSCOUNT - 1)) $DIR/$tdir/foo ||
4743 error "(0.2) Fail to mkdir"
4744 touch $DIR/$tdir/guard/f0 || error "(1) Fail to create"
# Remember the FID before any migration attempt; it is compared against the
# FID read back after each "$LFS migrate" below.
4745 local oldfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4747 # define MAX_LINKEA_SIZE 4096
4748 # sizeof(link_ea_header) = 24
4749 # sizeof(link_ea_entry) = 18
4750 # nlink_min=$(((MAX_LINKEA_SIZE - sizeof(link_ea_header)) /
4751 # (sizeof(link_ea_entry) + name_length))
4752 # If the average name length is 12 bytes, then 150 hard links
4753 # is totally enough to overflow the linkEA
4754 echo "Create 150 hard links should succeed although the linkEA overflow"
4755 createmany -l $DIR/$tdir/guard/f0 $DIR/$tdir/foo/ttttttttttt 150 ||
4756 error "(2) Fail to hard link"
4758 cancel_lru_locks mdc
4759 if [ $MDSCOUNT -ge 2 ]; then
# "&& error": a successful migrate is the failure case here.
4760 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null &&
4761 error "(3.1) Migrate should fail"
4763 echo "The object with linkEA overflow should NOT be migrated"
4764 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4765 [ "$newfid" == "$oldfid" ] ||
4766 error "(3.2) Migrate should fail: $newfid != $oldfid"
4769 # Remove 100 hard links, then the linkEA should have space
4770 # to hold the missed linkEA entries.
4771 echo "Remove 100 hard links to save space for the missed linkEA entries"
4772 unlinkmany $DIR/$tdir/foo/ttttttttttt 100 || error "(4) Fail to unlink"
4774 if [ $MDSCOUNT -ge 2 ]; then
4775 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null &&
4776 error "(5.1) Migrate should fail"
4778 # The overflow timestamp is still there, so migration will fail.
4779 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4780 [ "$newfid" == "$oldfid" ] ||
4781 error "(5.2) Migrate should fail: $newfid != $oldfid"
4784 # sleep 3 seconds to guarantee that the overflow is recognized
# NOTE(review): the sleep command this comment describes is not visible in
# this listing - confirm against the complete file.
4787 echo "Trigger namespace LFSCK to clear the overflow timestamp"
4788 $START_NAMESPACE -r -A ||
4789 error "(6) Fail to start LFSCK for namespace"
4791 wait_all_targets_blocked namespace completed 7
# Read the counters from the lfsck_namespace parameter: field 2 of the line
# that starts with the counter name.
4793 local repaired=$($SHOW_NAMESPACE |
4794 awk '/^linkea_overflow_cleared/ { print $2 }')
4795 [ $repaired -eq 1 ] ||
4796 error "(8) Fail to clear linkea overflow: $repaired"
4798 repaired=$($SHOW_NAMESPACE |
4799 awk '/^nlinks_repaired/ { print $2 }')
4800 [ $repaired -eq 0 ] ||
4801 error "(9) Unexpected nlink repaired: $repaired"
4803 if [ $MDSCOUNT -ge 2 ]; then
4804 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
4805 error "(10.1) Migrate failure"
4807 # Migration should succeed after clear the overflow timestamp.
4808 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4809 [ "$newfid" != "$oldfid" ] ||
4810 error "(10.2) Migrate should succeed"
4812 ls -l $DIR/$tdir/foo > /dev/null ||
4813 error "(11) 'ls' failed after migration"
4816 rm -f $DIR/$tdir/guard/f0 || error "(12) Fail to unlink f0"
4817 rm -rf $DIR/$tdir/foo || error "(13) Fail to rmdir foo"
4819 run_test 29c "verify linkEA size limitation"
# Test 30: "LFSCK can recover the orphans from backend /lost+found"
# (registered by the run_test line that closes this block).
# Only runs on an ldiskfs MDT, without FILESET, on MDS newer than 2.6.50.
# Visible flow: create foo/f0 and foo/d0/{f1,d1}; d0 is created under
# fail_loc 0x161d (no linkEA entry, per the echo text); the entries are then
# removed under fail_loc 0x1624 (name entry removed, object kept, per the
# echo text); the MDT is stopped, e2fsck'ed and restarted; the namespace
# LFSCK must report local_lost_found_moved >= 4; finally f0 must be back in
# place and d0 (with d1 and f1) must show up under
# .lustre/lost+found/MDT0000/.
4822 [[ $mds1_FSTYPE == ldiskfs ]] || skip "only ldiskfs has lost+found"
4823 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4824 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4825 skip "MDS older than 2.6.50, LU-5518"
4828 echo "The namespace LFSCK will move the orphans from backend"
4829 echo "/lost+found directory to normal client visible namespace"
4830 echo "or to global visible ./lustre/lost+found/MDTxxxx/ directory"
4833 check_mount_and_prep
4835 $LFS mkdir -i 0 $DIR/$tdir/foo || error "(1) Fail to mkdir foo"
# Fixed: the message used to name f1 although the file being created is f0.
4836 touch $DIR/$tdir/foo/f0 || error "(2) Fail to touch f0"
4838 echo "Inject failure stub on MDT0 to simulate the case that"
4839 echo "directory d0 has no linkEA entry, then the LFSCK will"
4840 echo "move it into .lustre/lost+found/MDTxxxx/ later."
4842 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
4843 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
4844 mkdir $DIR/$tdir/foo/d0 || error "(3) Fail to mkdir d0"
4845 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Parent and child FIDs are captured now; they are used below to build the
# name expected for d0 under .lustre/lost+found/MDT0000/.
4847 local pfid=$($LFS path2fid $DIR/$tdir/foo)
4848 local cfid=$($LFS path2fid $DIR/$tdir/foo/d0)
4850 touch $DIR/$tdir/foo/d0/f1 || error "(4) Fail to touch f1"
4851 mkdir $DIR/$tdir/foo/d0/d1 || error "(5) Fail to mkdir d1"
4853 echo "Inject failure stub on MDT0 to simulate the case that the"
4854 echo "object's name entry will be removed, but not destroy the"
4855 echo "object. Then backend e2fsck will handle it as orphan and"
4856 echo "add them into the backend /lost+found directory."
4858 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4859 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4860 rmdir $DIR/$tdir/foo/d0/d1 || error "(6) Fail to rmdir d1"
4861 rm -f $DIR/$tdir/foo/d0/f1 || error "(7) Fail to unlink f1"
4862 rmdir $DIR/$tdir/foo/d0 || error "(8) Fail to rmdir d0"
4863 rm -f $DIR/$tdir/foo/f0 || error "(9) Fail to unlink f0"
4864 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4866 umount_client $MOUNT || error "(10) Fail to stop client!"
# The MDT is stopped so that e2fsck can run on its backing device.
4868 stop $SINGLEMDS || error "(11) Fail to stop $SINGLEMDS"
4870 local dev=$(facet_device $SINGLEMDS)
4872 echo "run e2fsck on $SINGLEMDS"
4873 run_e2fsck $(facet_active_host $SINGLEMDS) $dev "-y" ||
4874 error "(12) Fail to run e2fsck"
4876 start_facet $SINGLEMDS "$MOUNT_OPTS_NOSCRUB" 13
4878 echo "Trigger namespace LFSCK to recover backend orphans"
4879 $START_NAMESPACE -r -A ||
4880 error "(14) Fail to start LFSCK for namespace"
4882 wait_all_targets_blocked namespace completed 15
4884 local repaired=$($SHOW_NAMESPACE |
4885 awk '/^local_lost_found_moved/ { print $2 }')
4886 [ $repaired -ge 4 ] ||
4887 error "(16) Fail to recover backend orphans: $repaired"
4889 mount_client $MOUNT || error "(17) Fail to start client!"
4891 stat $DIR/$tdir/foo/f0 || error "(18) f0 is not recovered"
4893 ls -ail $MOUNT/.lustre/lost+found/
4895 echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
4896 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4897 error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4899 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
4901 local cname=$MOUNT/.lustre/lost+found/MDT0000/${cfid}-${pfid}-D-0
# Fixed: cname is a constructed, always non-empty string, so the former
# '[ ! -z "$cname" ]' test could never fail. Check that the recovered
# directory actually exists instead.
4902 [ -d "$cname" ] || error "(20) d0 is not recovered"
4904 stat ${cname}/d1 || error "(21) d1 is not recovered"
4905 stat ${cname}/f1 || error "(22) f1 is not recovered"
4907 run_test 30 "LFSCK can recover the orphans from backend /lost+found"
# Test 31a: "The LFSCK can find/repair the name entry with bad name hash (1)"
# (registered by the run_test line that closes this block).
# Needs at least 2 MDTs and an MDS newer than 2.6.50.
# Visible flow: create a directory striped over all MDTs starting at index 0,
# create MDSCOUNT sub-directories while the client has fail_loc 0x1628 with
# fail_val=0, run the namespace LFSCK, expect name_hash_repaired >= 1, then
# remount the client and stat/rmdir every sub-directory.
4910 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4911 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4912 skip "MDS older than 2.6.50, LU-5519"
4915 echo "For the name entry under a striped directory, if the name"
4916 echo "hash does not match the shard, then the LFSCK will repair"
4917 echo "the bad name entry"
4920 check_mount_and_prep
4922 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4923 error "(1) Fail to create striped directory"
4925 echo "Inject failure stub on client to simulate the case that"
4926 echo "some name entry should be inserted into other non-first"
4927 echo "shard, but inserted into the first shard by wrong"
4929 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
# The fail_loc is set with a plain $LCTL call (no do_facet), i.e. locally,
# matching the "on client" wording of the echo above.
4930 $LCTL set_param fail_loc=0x1628 fail_val=0
4931 createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
4932 error "(2) Fail to create file under striped directory"
4933 $LCTL set_param fail_loc=0 fail_val=0
4935 echo "Trigger namespace LFSCK to repair bad name hash"
4936 $START_NAMESPACE -r -A ||
4937 error "(3) Fail to start LFSCK for namespace"
4939 wait_all_targets_blocked namespace completed 4
4941 local repaired=$($SHOW_NAMESPACE |
4942 awk '/^name_hash_repaired/ { print $2 }')
4943 [ $repaired -ge 1 ] ||
4944 error "(5) Fail to repair bad name hash: $repaired"
# Count the entries that "lfs find -H badtype" reports under the directory.
4946 local rc=$($LFS find -H badtype $DIR/$tdir/striped_dir | wc -l)
# NOTE(review): as listed, this error call has no guarding condition on the
# preceding line (test 31c guards its equivalent with "[ $rc -eq 1 ] ||") -
# confirm against the complete file.
4948 error "Fail to find flag bad type: $rc"
# Remount the client before the entries are looked up again.
4950 umount_client $MOUNT || error "(6) umount failed"
4951 mount_client $MOUNT || error "(7) mount failed"
# NOTE(review): the "done" closing this loop is not visible in this listing.
4953 for ((i = 0; i < $MDSCOUNT; i++)); do
4954 stat $DIR/$tdir/striped_dir/d$i ||
4955 error "(8) Fail to stat d$i after LFSCK"
4956 rmdir $DIR/$tdir/striped_dir/d$i ||
4957 error "(9) Fail to unlink d$i after LFSCK"
4960 rmdir $DIR/$tdir/striped_dir ||
4961 error "(10) Fail to remove the striped directory after LFSCK"
4963 run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
# Test 31b: "The LFSCK can find/repair the name entry with bad name hash (2)"
# (registered by the run_test line that closes this block).
# Same shape as test 31a, but the fail_loc 0x1628 is set with fail_val=1,
# MDSCOUNT * 5 sub-directories are created, and the name_hash_repaired
# counter is read from the lfsck_namespace parameter of mds2.
4966 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4967 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4968 skip "MDS older than 2.6.50, LU-5519"
4971 echo "For the name entry under a striped directory, if the name"
4972 echo "hash does not match the shard, then the LFSCK will repair"
4973 echo "the bad name entry"
4976 check_mount_and_prep
4978 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4979 error "(1) Fail to create striped directory"
4981 echo "Inject failure stub on client to simulate the case that"
4982 echo "some name entry should be inserted into other non-second"
4983 echo "shard, but inserted into the secod shard by wrong"
4985 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
4986 $LCTL set_param fail_loc=0x1628 fail_val=1
4987 createmany -d $DIR/$tdir/striped_dir/d $((MDSCOUNT * 5)) ||
4988 error "(2) Fail to create file under striped directory"
4989 $LCTL set_param fail_loc=0 fail_val=0
4991 echo "Trigger namespace LFSCK to repair bad name hash"
4992 $START_NAMESPACE -r -A ||
4993 error "(3) Fail to start LFSCK for namespace"
4995 wait_all_targets_blocked namespace completed 4
# The counter is read on mds2 rather than through $SHOW_NAMESPACE.
4997 local repaired=$(do_facet mds2 $LCTL get_param -n \
4998 mdd.$(facet_svc mds2).lfsck_namespace |
4999 awk '/^name_hash_repaired/ { print $2 }')
5000 echo "repaired $repaired name entries with bad hash"
5001 [ $repaired -ge 1 ] ||
5002 error "(5) Fail to repair bad name hash: $repaired"
5004 umount_client $MOUNT || error "(6) umount failed"
5005 mount_client $MOUNT || error "(7) mount failed"
# NOTE(review): the "done" closing this loop is not visible in this listing.
5007 for ((i = 0; i < $((MDSCOUNT * 5)); i++)); do
5008 stat $DIR/$tdir/striped_dir/d$i ||
5009 error "(8) Fail to stat d$i after LFSCK"
5010 rmdir $DIR/$tdir/striped_dir/d$i ||
5011 error "(9) Fail to unlink d$i after LFSCK"
5014 rmdir $DIR/$tdir/striped_dir ||
5015 error "(10) Fail to remove the striped directory after LFSCK"
5017 run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
# Test 31c: "Re-generate the lost master LMV EA for striped directory"
# (registered by the run_test line that closes this block).
# Visible flow: create the striped directory while $SINGLEMDS has fail_loc
# 0x1629 set, run the namespace LFSCK, expect striped_dirs_repaired == 1 and
# exactly one "lfs find -H lostlmv" hit, then remount the client and check
# that the directory lists as empty and can be removed.
5020 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5021 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5022 skip "MDS older than 2.6.50, LU-5519"
5025 echo "For some reason, the master MDT-object of the striped directory"
5026 echo "may lost its master LMV EA. If nobody created files under the"
5027 echo "master directly after the master LMV EA lost, then the LFSCK"
5028 echo "should re-generate the master LMV EA."
5031 check_mount_and_prep
5033 echo "Inject failure stub on MDT0 to simulate the case that the"
5034 echo "master MDT-object of the striped directory lost the LMV EA."
5036 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
5037 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
5038 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5039 error "(1) Fail to create striped directory"
5040 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
5042 echo "Trigger namespace LFSCK to re-generate master LMV EA"
5043 $START_NAMESPACE -r -A ||
5044 error "(2) Fail to start LFSCK for namespace"
5046 wait_all_targets_blocked namespace completed 3
5048 local repaired=$($SHOW_NAMESPACE |
5049 awk '/^striped_dirs_repaired/ { print $2 }')
5050 [ $repaired -eq 1 ] ||
5051 error "(4) Fail to re-generate master LMV EA: $repaired"
# rc holds a line count (wc -l), not an exit status.
5053 local rc=$($LFS find -H lostlmv $DIR/$tdir/striped_dir | wc -l)
5054 [ $rc -eq 1 ] || error "Fail to find flag lost LMV: $rc"
5056 umount_client $MOUNT || error "(5) umount failed"
5057 mount_client $MOUNT || error "(6) mount failed"
# Any output from ls means the directory is unexpectedly not empty.
5059 local empty=$(ls $DIR/$tdir/striped_dir/)
5060 [ -z "$empty" ] || error "(7) The master LMV EA is not repaired: $empty"
5062 rmdir $DIR/$tdir/striped_dir ||
5063 error "(8) Fail to remove the striped directory after LFSCK"
5065 run_test 31c "Re-generate the lost master LMV EA for striped directory"
# Test 31d: "Set broken striped directory (modified after broken) as
# read-only" (registered by the run_test line that closes this block).
# Visible flow: create the striped directory under fail_loc 0x1629, remount
# the client, create a file inside the directory, run the namespace LFSCK and
# expect striped_dirs_repaired == 0. Afterwards the existing file must still
# stat, creating a new file must fail, and the directory is removed after
# "chattr -i".
5068 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5069 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5070 skip "MDS older than 2.6.50, LU-5519"
5073 echo "For some reason, the master MDT-object of the striped directory"
5074 echo "may lost its master LMV EA. If somebody created files under the"
5075 echo "master directly after the master LMV EA lost, then the LFSCK"
5076 echo "should NOT re-generate the master LMV EA, instead, it should"
5077 echo "change the broken striped dirctory as read-only to prevent"
5078 echo "further damage"
5081 check_mount_and_prep
5083 echo "Inject failure stub on MDT0 to simulate the case that the"
5084 echo "master MDT-object of the striped directory lost the LMV EA."
5086 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
5087 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
5088 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5089 error "(1) Fail to create striped directory"
5090 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0
5092 umount_client $MOUNT || error "(2) umount failed"
5093 mount_client $MOUNT || error "(3) mount failed"
# This is the "modified after broken" step from the test description.
5095 touch $DIR/$tdir/striped_dir/dummy ||
5096 error "(4) Fail to touch under broken striped directory"
5098 echo "Trigger namespace LFSCK to find out the inconsistency"
5099 $START_NAMESPACE -r -A ||
5100 error "(5) Fail to start LFSCK for namespace"
5102 wait_all_targets_blocked namespace completed 6
5104 local repaired=$($SHOW_NAMESPACE |
5105 awk '/^striped_dirs_repaired/ { print $2 }')
5106 [ $repaired -eq 0 ] ||
5107 error "(7) Re-generate master LMV EA unexpected: $repaired"
5109 stat $DIR/$tdir/striped_dir/dummy ||
5110 error "(8) Fail to stat $DIR/$tdir/striped_dir/dummy"
# "&& error": a successful create is the failure case here.
5112 touch $DIR/$tdir/striped_dir/foo &&
5113 error "(9) The broken striped directory should be read-only"
# The immutable attribute is cleared before the directory is removed.
5115 chattr -i $DIR/$tdir/striped_dir ||
5116 error "(10) Fail to chattr on the broken striped directory"
5118 rmdir $DIR/$tdir/striped_dir ||
5119 error "(11) Fail to remove the striped directory after LFSCK"
5121 run_test 31d "Set broken striped directory (modified after broken) as read-only"
# Test 31e: "Re-generate the lost slave LMV EA for striped directory (1)"
# (registered by the run_test line that closes this block).
# Visible flow: create the striped directory while $SINGLEMDS has fail_loc
# 0x162a with fail_val=0, run the namespace LFSCK, expect
# striped_shards_repaired == 1 (read via $SHOW_NAMESPACE), then remove the
# directory.
5124 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5125 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5126 skip "MDS older than 2.6.50, LU-5519"
5129 echo "For some reason, the slave MDT-object of the striped directory"
5130 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
5131 echo "slave LMV EA."
5134 check_mount_and_prep
5136 echo "Inject failure stub on MDT0 to simulate the case that the"
5137 echo "slave MDT-object (that resides on the same MDT as the master"
5138 echo "MDT-object resides on) lost the LMV EA."
5140 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
5141 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=0
5142 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5143 error "(1) Fail to create striped directory"
5144 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5146 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
5147 $START_NAMESPACE -r -A ||
5148 error "(2) Fail to start LFSCK for namespace"
5150 wait_all_targets_blocked namespace completed 3
5152 local repaired=$($SHOW_NAMESPACE |
5153 awk '/^striped_shards_repaired/ { print $2 }')
5154 [ $repaired -eq 1 ] ||
5155 error "(4) Fail to re-generate slave LMV EA: $repaired"
5157 rmdir $DIR/$tdir/striped_dir ||
5158 error "(5) Fail to remove the striped directory after LFSCK"
5160 run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
# Test 31f: "Re-generate the lost slave LMV EA for striped directory (2)"
# (registered by the run_test line that closes this block).
# Same shape as test 31e, but the fail_loc 0x162a is set with fail_val=1 and
# the striped_shards_repaired counter is read from the lfsck_namespace
# parameter of mds2.
5163 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5164 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5165 skip "MDS older than 2.6.50, LU-5519"
5168 echo "For some reason, the slave MDT-object of the striped directory"
5169 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
5170 echo "slave LMV EA."
5173 check_mount_and_prep
5175 echo "Inject failure stub on MDT0 to simulate the case that the"
5176 echo "slave MDT-object (that resides on different MDT as the master"
5177 echo "MDT-object resides on) lost the LMV EA."
5179 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
5180 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=1
5181 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5182 error "(1) Fail to create striped directory"
5183 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5185 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
5186 $START_NAMESPACE -r -A ||
5187 error "(2) Fail to start LFSCK for namespace"
5189 wait_all_targets_blocked namespace completed 3
# The counter is read on mds2 rather than through $SHOW_NAMESPACE.
5191 local repaired=$(do_facet mds2 $LCTL get_param -n \
5192 mdd.$(facet_svc mds2).lfsck_namespace |
5193 awk '/^striped_shards_repaired/ { print $2 }')
5194 [ $repaired -eq 1 ] ||
5195 error "(4) Fail to re-generate slave LMV EA: $repaired"
5197 rmdir $DIR/$tdir/striped_dir ||
5198 error "(5) Fail to remove the striped directory after LFSCK"
5200 run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
# Test 31g: "Repair the corrupted slave LMV EA" (registered by the run_test
# line that closes this block).
# Visible flow: create the striped directory while $SINGLEMDS has fail_loc
# 0x162b with fail_val=0, run the namespace LFSCK, expect
# striped_shards_repaired == 1, remount the client, then check that a file
# can be created and removed inside the directory before removing it.
5203 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5204 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5205 skip "MDS older than 2.6.50, LU-5519"
5208 echo "For some reason, the stripe index in the slave LMV EA is"
5209 echo "corrupted. The LFSCK should repair the slave LMV EA."
5212 check_mount_and_prep
5214 echo "Inject failure stub on MDT0 to simulate the case that the"
5215 echo "slave LMV EA on the first shard of the striped directory"
5216 echo "claims the same index as the second shard claims"
5218 #define OBD_FAIL_LFSCK_BAD_SLAVE_LMV 0x162b
5219 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162b fail_val=0
5220 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5221 error "(1) Fail to create striped directory"
5222 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5224 echo "Trigger namespace LFSCK to repair the slave LMV EA"
5225 $START_NAMESPACE -r -A ||
5226 error "(2) Fail to start LFSCK for namespace"
5228 wait_all_targets_blocked namespace completed 3
5230 local repaired=$($SHOW_NAMESPACE |
5231 awk '/^striped_shards_repaired/ { print $2 }')
5232 [ $repaired -eq 1 ] ||
5233 error "(4) Fail to repair slave LMV EA: $repaired"
5235 umount_client $MOUNT || error "(5) umount failed"
5236 mount_client $MOUNT || error "(6) mount failed"
# After the repair the directory must be usable again: create and remove a
# file in it.
5238 touch $DIR/$tdir/striped_dir/foo ||
5239 error "(7) Fail to touch file after the LFSCK"
5241 rm -f $DIR/$tdir/striped_dir/foo ||
5242 error "(8) Fail to unlink file after the LFSCK"
5244 rmdir $DIR/$tdir/striped_dir ||
5245 error "(9) Fail to remove the striped directory after LFSCK"
5247 run_test 31g "Repair the corrupted slave LMV EA"
# Test 31h: "Repair the corrupted shard's name entry" (registered by the
# run_test line that closes this block).
# Same shape as test 31g, but with fail_loc 0x162c, and the counter checked
# after the namespace LFSCK is dirent_repaired (expected to be 1).
5250 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5251 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5252 skip "MDS older than 2.6.50, LU-5519"
5255 echo "For some reason, the shard's name entry in the striped"
5256 echo "directory may be corrupted. The LFSCK should repair the"
5257 echo "bad shard's name entry."
5260 check_mount_and_prep
5262 echo "Inject failure stub on MDT0 to simulate the case that the"
5263 echo "first shard's name entry in the striped directory claims"
5264 echo "the same index as the second shard's name entry claims."
5266 #define OBD_FAIL_LFSCK_BAD_SLAVE_NAME 0x162c
5267 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162c fail_val=0
5268 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5269 error "(1) Fail to create striped directory"
5270 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5272 echo "Trigger namespace LFSCK to repair the shard's name entry"
5273 $START_NAMESPACE -r -A ||
5274 error "(2) Fail to start LFSCK for namespace"
5276 wait_all_targets_blocked namespace completed 3
5278 local repaired=$($SHOW_NAMESPACE |
5279 awk '/^dirent_repaired/ { print $2 }')
5280 [ $repaired -eq 1 ] ||
5281 error "(4) Fail to repair shard's name entry: $repaired"
5283 umount_client $MOUNT || error "(5) umount failed"
5284 mount_client $MOUNT || error "(6) mount failed"
# After the repair the directory must be usable again: create and remove a
# file in it.
5286 touch $DIR/$tdir/striped_dir/foo ||
5287 error "(7) Fail to touch file after the LFSCK"
5289 rm -f $DIR/$tdir/striped_dir/foo ||
5290 error "(8) Fail to unlink file after the LFSCK"
5292 rmdir $DIR/$tdir/striped_dir ||
5293 error "(9) Fail to remove the striped directory after LFSCK"
5295 run_test 31h "Repair the corrupted shard's name entry"
# Test 32a: "stop LFSCK when some OST failed" (registered by the run_test
# line that closes this block).
# Visible flow: unmount the client, start the layout LFSCK with fail_loc
# 0x162d / fail_val=3 set on $SINGLEMDS, check that the status is
# scanning-phase1, stop ost1, clear the fail_loc, require that the LFSCK can
# still be stopped, then start ost1 again.
# NOTE(review): any preparation step before the umount (and the function
# header) is not visible in this listing - confirm against the complete file.
5300 umount_client $MOUNT
5302 #define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d
5303 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
5304 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
5306 local STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
5307 [ "$STATUS" == "scanning-phase1" ] ||
5308 error "(2) Expect 'scanning-phase1', but got '$STATUS'"
# The OST is taken down while the LFSCK is still in scanning-phase1.
5311 stop ost1 > /dev/null || error "(3) Fail to stop OST1!"
5313 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# The point of the test: stopping the LFSCK must succeed with ost1 down.
5317 $STOP_LFSCK || error "(4) Fail to stop LFSCK!"
5319 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
5320 error "(5) Fail to start ost1"
5322 run_test 32a "stop LFSCK when some OST failed"
# Test 32b: "stop LFSCK when some MDT failed" (registered by the run_test
# line that closes this block).
# Visible flow: create dp on MDT index 1 with two striped sub-directories,
# unmount the client, start the namespace LFSCK on all targets with fail_loc
# 0x162d / fail_val=3 set on $SINGLEMDS, wait for scanning-phase1, stop mds2,
# clear the fail_loc, require that the LFSCK can still be stopped, then start
# mds2 again.
5326 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5329 $LFS mkdir -i 1 $DIR/$tdir/dp ||
5330 error "(1) Fail to create $DIR/$tdir/dp"
5331 $LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc1 ||
5332 error "(2) Fail to create $DIR/$tdir/dp/dc1"
5333 $LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc2 ||
5334 error "(3) Fail to create $DIR/$tdir/dp/dc2"
5335 umount_client $MOUNT
5337 #define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d
5338 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
5339 $START_NAMESPACE -r -A || error "(4) Fail to start LFSCK for namespace!"
# Wait (last argument: 32) until the status line of lfsck_namespace reads
# scanning-phase1. The awk field reference is written as \\\$2 so that it
# survives the quoting layers between here and the remote command.
# NOTE(review): the closing brace of the "|| {" group is not visible in this
# listing.
5341 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5342 mdd.${MDT_DEV}.lfsck_namespace |
5343 awk '/^status/ { print \\\$2 }'" "scanning-phase1" 32 || {
5345 error "(5) unexpected status"
5349 stop mds2 > /dev/null || error "(6) Fail to stop MDT2!"
5351 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# The point of the test: stopping the LFSCK must succeed with mds2 down.
5355 $STOP_LFSCK || error "(7) Fail to stop LFSCK!"
5357 start mds2 $(mdsdevname 2) $MOUNT_OPTS_NOSCRUB > /dev/null ||
5358 error "(8) Fail to start MDT2"
5360 run_test 32b "stop LFSCK when some MDT failed"
# Test 33: "check LFSCK paramters" (registered by the run_test line that
# closes this block).
# Starts the layout LFSCK with "--dryrun -o -r" and the namespace LFSCK with
# "-e abort -A -r", and after each run compares field 2 of the "param" line
# of the corresponding lfsck_* parameter with the expected flag list.
5366 $START_LAYOUT --dryrun -o -r ||
5367 error "(1) Fail to start layout LFSCK"
5368 wait_all_targets_blocked layout completed 2
5370 local PARAMS=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
5371 [ "$PARAMS" == "dryrun,all_targets,orphan" ] ||
5372 error "(3) Expect 'dryrun,all_targets,orphan', got '$PARAMS'"
5374 $START_NAMESPACE -e abort -A -r ||
5375 error "(4) Fail to start namespace LFSCK"
5376 wait_all_targets_blocked namespace completed 5
# "-e abort" is expected to be reported as "failout" in the param line.
5378 PARAMS=$($SHOW_NAMESPACE | awk '/^param/ { print $2 }')
5379 [ "$PARAMS" == "failout,all_targets" ] ||
5380 error "(6) Expect 'failout,all_targets', got '$PARAMS'"
5382 run_test 33 "check LFSCK paramters"
# Test 34: "LFSCK can rebuild the lost agent object" (registered by the
# run_test line that closes this block).
# Skipped unless there are at least 2 MDTs and mds1 uses a ZFS backend.
# Visible flow: create a directory on MDT index 1 while $SINGLEMDS has
# fail_loc 0x1630 set, run the namespace LFSCK and expect
# dirent_repaired == 1, then run it a second time and expect
# dirent_repaired == 0.
5386 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5387 [ "$mds1_FSTYPE" != zfs ] && skip "Only valid for ZFS backend"
5391 #define OBD_FAIL_LFSCK_NO_AGENTOBJ 0x1630
5392 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1630
5393 $LFS mkdir -i 1 $DIR/$tdir/dummy ||
5394 error "(1) Fail to create $DIR/$tdir/dummy"
5396 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
5397 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
# Wait (last argument: 32) for the status line to read "completed".
# NOTE(review): the closing brace of each "|| {" group in this block is not
# visible in this listing.
5398 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5399 mdd.${MDT_DEV}.lfsck_namespace |
5400 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5402 error "(3) unexpected status"
5405 local repaired=$($SHOW_NAMESPACE |
5406 awk '/^dirent_repaired/ { print $2 }')
5407 [ $repaired -eq 1 ] ||
5408 error "(4) Fail to repair the lost agent object: $repaired"
# Second pass: nothing should be left to repair.
5410 $START_NAMESPACE -r || error "(5) Fail to start LFSCK for namespace!"
5411 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5412 mdd.${MDT_DEV}.lfsck_namespace |
5413 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5415 error "(6) unexpected status"
5418 repaired=$($SHOW_NAMESPACE | awk '/^dirent_repaired/ { print $2 }')
5419 [ $repaired -eq 0 ] ||
5420 error "(7) Unexpected repairing: $repaired"
5422 run_test 34 "LFSCK can rebuild the lost agent object"
# Test 35: "LFSCK can rebuild the lost agent entry" (registered by the
# run_test line that closes this block).
# Visible flow: create a directory on MDT index 1 while mds2 has fail_loc
# 0x1631 set, run the namespace LFSCK on all targets, wait for mds2 to report
# "completed" and expect agent_entries_repaired == 1 there; after setupall,
# run the LFSCK again and expect agent_entries_repaired == 0.
5426 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5430 #define OBD_FAIL_LFSCK_NO_AGENTENT 0x1631
5431 do_facet mds2 $LCTL set_param fail_loc=0x1631
5432 $LFS mkdir -i 1 $DIR/$tdir/dummy ||
5433 error "(1) Fail to create $DIR/$tdir/dummy"
5436 do_facet mds2 $LCTL set_param fail_loc=0
5437 $START_NAMESPACE -A -r || error "(2) Fail to start LFSCK for namespace!"
# Fixed: the message used to interpolate ${k}, which is never set in this
# test; the facet being waited on is mds2.
5438 wait_update_facet mds2 "$LCTL get_param -n \
5439 mdd.$(facet_svc mds2).lfsck_namespace |
5440 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
5441 error "(3) MDS2 is not the expected 'completed'"
5443 local repaired=$(do_facet mds2 $LCTL get_param -n \
5444 mdd.$(facet_svc mds2).lfsck_namespace |
5445 awk '/^agent_entries_repaired/ { print $2 }')
5446 [ $repaired -eq 1 ] ||
5447 error "(4) Fail to repair the lost agent entry: $repaired"
5449 echo "stopall to cleanup object cache"
# NOTE(review): the stop step announced by the echo above is not visible in
# this listing - confirm against the complete file.
5452 setupall > /dev/null
# Second pass: nothing should be left to repair.
5454 $START_NAMESPACE -A -r || error "(5) Fail to start LFSCK for namespace!"
# Fixed: same unset ${k} in the message as in check (3).
5455 wait_update_facet mds2 "$LCTL get_param -n \
5456 mdd.$(facet_svc mds2).lfsck_namespace |
5457 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
5458 error "(6) MDS2 is not the expected 'completed'"
5460 repaired=$(do_facet mds2 $LCTL get_param -n \
5461 mdd.$(facet_svc mds2).lfsck_namespace |
5462 awk '/^agent_entries_repaired/ { print $2 }')
5463 [ $repaired -eq 0 ] ||
5464 error "(7) Unexpected repairing: $repaired"
5466 run_test 35 "LFSCK can rebuild the lost agent entry"
# Test 36a: "rebuild LOV EA for mirrored file (1)" (registered by the
# run_test line that closes this block).
# Needs at least 3 OSTs.
# Visible flow: create three files (f0, f1, f2) with the same three-mirror
# composite layout, write 4 MiB to each and resync them; with fail_loc 0x1616
# set on mds1, split mirror 1 from f0, mirror 2 from f1 and mirror 3 from f2;
# check that each file is down to 2 mirrors; run the layout LFSCK with
# "-r -o"; finally expect repaired_orphan == 9 on mds1 and all three files
# back to 3 mirrors with the split mirror id present again.
# NOTE(review): the "done" of the for loops and the closing brace of the
# "|| {" groups are not visible in this listing - confirm against the
# complete file.
5469 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5472 echo "The target MDT-object's LOV EA corrupted as to lose one of the "
5473 echo "mirrors information. The layout LFSCK should rebuild the LOV EA "
5474 echo "with the PFID EA of related OST-object(s) belong to the mirror."
5477 check_mount_and_prep
# Diagnostics only: dump the grant parameters now, and register a stack_trap
# that prints space usage, inode usage and the grant parameters again.
5481 lctl get_param osc.*.*grant*
5482 stack_trap "lfs df $DIR; lfs df -i $DIR; lctl get_param osc.*.*grant*"
# Each file gets three mirrors (-N), each mirror made of two components with
# explicit OST lists (-o).
5484 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5485 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f0 ||
5486 error "(0) Fail to create mirror file $DIR/$tdir/f0"
5487 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5488 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f1 ||
5489 error "(1) Fail to create mirror file $DIR/$tdir/f1"
5490 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5491 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f2 ||
5492 error "(2) Fail to create mirror file $DIR/$tdir/f2"
5494 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5495 error "(3) Fail to write $DIR/$tdir/f0"
5496 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=4 ||
5497 error "(4) Fail to write $DIR/$tdir/f1"
5498 dd if=/dev/zero of=$DIR/$tdir/f2 bs=1M count=4 ||
5499 error "(5) Fail to write $DIR/$tdir/f2"
5501 $LFS mirror resync $DIR/$tdir/f0 ||
5502 error "(6) Fail to resync $DIR/$tdir/f0"
5503 $LFS mirror resync $DIR/$tdir/f1 ||
5504 error "(7) Fail to resync $DIR/$tdir/f1"
5505 $LFS mirror resync $DIR/$tdir/f2 ||
5506 error "(8) Fail to resync $DIR/$tdir/f2"
5508 cancel_lru_locks mdc
5509 cancel_lru_locks osc
5511 $LFS getstripe $DIR/$tdir/f0 ||
5512 error "(9) Fail to getstripe for $DIR/$tdir/f0"
5513 $LFS getstripe $DIR/$tdir/f1 ||
5514 error "(10) Fail to getstripe for $DIR/$tdir/f1"
5515 $LFS getstripe $DIR/$tdir/f2 ||
5516 error "(11) Fail to getstripe for $DIR/$tdir/f2"
5518 echo "Inject failure, to simulate the case of missing one mirror in LOV"
5519 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5520 do_facet mds1 $LCTL set_param fail_loc=0x1616
# A different mirror id is split (-d) from each of the three files.
5522 $LFS mirror split --mirror-id 1 -d $DIR/$tdir/f0 ||
5523 error "(12) Fail to split 1st mirror from $DIR/$tdir/f0"
5524 $LFS mirror split --mirror-id 2 -d $DIR/$tdir/f1 ||
5525 error "(13) Fail to split 2nd mirror from $DIR/$tdir/f1"
5526 $LFS mirror split --mirror-id 3 -d $DIR/$tdir/f2 ||
5527 error "(14) Fail to split 3rd mirror from $DIR/$tdir/f2"
5531 do_facet mds1 $LCTL set_param fail_loc=0
# "&& error": finding the mirror id in the layout is the failure case here.
5533 $LFS getstripe $DIR/$tdir/f0 | grep "lcme_mirror_id:.*1" &&
5534 error "(15) The 1st of mirror is not destroyed"
5535 $LFS getstripe $DIR/$tdir/f1 | grep "lcme_mirror_id:.*2" &&
5536 error "(16) The 2nd of mirror is not destroyed"
5537 $LFS getstripe $DIR/$tdir/f2 | grep "lcme_mirror_id:.*3" &&
5538 error "(17) The 3rd of mirror is not destroyed"
# NOTE(review): no "local" declaration for mirrors is visible in this
# listing - confirm against the complete file.
5542 mirrors=$($LFS getstripe -N $DIR/$tdir/f0)
5543 [ $mirrors -eq 2 ] || error "(18) $DIR/$tdir/f0 has $mirrors mirrors"
5544 mirrors=$($LFS getstripe -N $DIR/$tdir/f1)
5545 [ $mirrors -eq 2 ] || error "(19) $DIR/$tdir/f1 has $mirrors mirrors"
5546 mirrors=$($LFS getstripe -N $DIR/$tdir/f2)
5547 [ $mirrors -eq 2 ] || error "(20) $DIR/$tdir/f2 has $mirrors mirrors"
5549 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5550 $START_LAYOUT -r -o || error "(21) Fail to start LFSCK for layout!"
5552 for k in $(seq $MDSCOUNT); do
5553 # The LFSCK status query interval is 30 seconds. For the case
5554 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5555 # time to guarantee the status sync up.
5556 wait_update_facet mds${k} "$LCTL get_param -n \
5557 mdd.$(facet_svc mds${k}).lfsck_layout |
5558 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5559 error "(22) MDS${k} is not the expected 'completed'"
# The OSTs are checked once each, without waiting.
5562 for k in $(seq $OSTCOUNT); do
5563 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5564 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5565 awk '/^status/ { print $2 }')
5566 [ "$cur_status" == "completed" ] ||
5567 error "(23) OST${k} Expect 'completed', but got '$cur_status'"
5570 local repaired=$(do_facet mds1 $LCTL get_param -n \
5571 mdd.$(facet_svc mds1).lfsck_layout |
5572 awk '/^repaired_orphan/ { print $2 }')
5573 [ $repaired -eq 9 ] ||
5574 error "(24) Expect 9 fixed on mds1, but got: $repaired"
5576 mirrors=$($LFS getstripe -N $DIR/$tdir/f0)
5577 [ $mirrors -eq 3 ] || error "(25) $DIR/$tdir/f0 has $mirrors mirrors"
5578 mirrors=$($LFS getstripe -N $DIR/$tdir/f1)
5579 [ $mirrors -eq 3 ] || error "(26) $DIR/$tdir/f1 has $mirrors mirrors"
5580 mirrors=$($LFS getstripe -N $DIR/$tdir/f2)
5581 [ $mirrors -eq 3 ] || error "(27) $DIR/$tdir/f2 has $mirrors mirrors"
# On a missing mirror id the full layout is printed before the error is
# raised.
5583 $LFS getstripe $DIR/$tdir/f0 | grep "lcme_mirror_id:.*1" || {
5584 $LFS getstripe $DIR/$tdir/f0
5585 error "(28) The 1st of mirror is not recovered"
5588 $LFS getstripe $DIR/$tdir/f1 | grep "lcme_mirror_id:.*2" || {
5589 $LFS getstripe $DIR/$tdir/f1
5590 error "(29) The 2nd of mirror is not recovered"
5593 $LFS getstripe $DIR/$tdir/f2 | grep "lcme_mirror_id:.*3" || {
5594 $LFS getstripe $DIR/$tdir/f2
5595 error "(30) The 3rd of mirror is not recovered"
5598 run_test 36a "rebuild LOV EA for mirrored file (1)"
# Test 36b: "rebuild LOV EA for mirrored file (2)" (registered by the
# run_test line that closes this block).
# Skipped when FILESET is set or fewer than 3 OSTs are available.
# Visible flow: create a three-mirror file f0, remember its FID, write 4 MiB
# and resync; remove the file while mds1 has fail_loc 0x1616 set; run the
# layout LFSCK with "-r -o"; expect repaired_orphan == 9 on mds1 and a file
# named <fid>-R-0 under .lustre/lost+found/MDT0000/ that has 3 mirrors,
# 6 components and mirror ids 1, 2 and 3.
# NOTE(review): the "done" of the for loops and the closing brace of the
# "|| {" groups are not visible in this listing - confirm against the
# complete file.
5601 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
5602 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5605 echo "The mirrored file lost its MDT-object, but relatd OST-objects "
5606 echo "are still there. The layout LFSCK should rebuild the LOV EA "
5607 echo "with the PFID EA of related OST-object(s) belong to the file. "
5610 check_mount_and_prep
5612 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5613 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f0 ||
5614 error "(0) Fail to create mirror file $DIR/$tdir/f0"
# The FID is needed later to locate the rebuilt file under lost+found.
5616 local fid=$($LFS path2fid $DIR/$tdir/f0)
5618 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5619 error "(1) Fail to write $DIR/$tdir/f0"
5620 $LFS mirror resync $DIR/$tdir/f0 ||
5621 error "(2) Fail to resync $DIR/$tdir/f0"
5623 cancel_lru_locks mdc
5624 cancel_lru_locks osc
5626 $LFS getstripe $DIR/$tdir/f0 ||
5627 error "(3) Fail to getstripe for $DIR/$tdir/f0"
5629 echo "Inject failure, to simulate the case of missing the MDT-object"
5630 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5631 do_facet mds1 $LCTL set_param fail_loc=0x1616
5632 rm -f $DIR/$tdir/f0 || error "(4) Fail to remove $DIR/$tdir/f0"
5636 do_facet mds1 $LCTL set_param fail_loc=0
5638 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5639 $START_LAYOUT -r -o || error "(5) Fail to start LFSCK for layout!"
5641 for k in $(seq $MDSCOUNT); do
5642 # The LFSCK status query interval is 30 seconds. For the case
5643 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5644 # time to guarantee the status sync up.
5645 wait_update_facet mds${k} "$LCTL get_param -n \
5646 mdd.$(facet_svc mds${k}).lfsck_layout |
5647 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5648 error "(6) MDS${k} is not the expected 'completed'"
# The OSTs are checked once each, without waiting.
5651 for k in $(seq $OSTCOUNT); do
5652 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5653 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5654 awk '/^status/ { print $2 }')
5655 [ "$cur_status" == "completed" ] ||
5656 error "(7) OST${k} Expect 'completed', but got '$cur_status'"
5659 local count=$(do_facet mds1 $LCTL get_param -n \
5660 mdd.$(facet_svc mds1).lfsck_layout |
5661 awk '/^repaired_orphan/ { print $2 }')
5662 [ $count -eq 9 ] || error "(8) Expect 9 fixed on mds1, but got: $count"
# count is reused below for the mirror count and then the component count
# of the recovered file.
5664 local name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
5665 count=$($LFS getstripe --mirror-count $name)
5666 [ $count -eq 3 ] || error "(9) $DIR/$tdir/f0 has $count mirrors"
5668 count=$($LFS getstripe --component-count $name)
5669 [ $count -eq 6 ] || error "(10) $DIR/$tdir/f0 has $count components"
# On a missing mirror id the full layout is printed before the error is
# raised.
5671 $LFS getstripe $name | grep "lcme_mirror_id:.*1" || {
5672 $LFS getstripe $name
5673 error "(11) The 1st of mirror is not recovered"
5676 $LFS getstripe $name | grep "lcme_mirror_id:.*2" || {
5677 $LFS getstripe $name
5678 error "(12) The 2nd of mirror is not recovered"
5681 $LFS getstripe $name | grep "lcme_mirror_id:.*3" || {
5682 $LFS getstripe $name
5683 error "(13) The 3rd of mirror is not recovered"
5686 run_test 36b "rebuild LOV EA for mirrored file (2)"
# test_36c (excerpt): a 2-mirror file is written and resynced, then written
# again without a resync (one mirror becomes stale), and finally loses its
# MDT-object. Layout LFSCK must rebuild the LOV EA under .lustre/lost+found
# with the same lcme_flags that were recorded before the loss.
# Skipped when FILESET is set or when fewer than 3 OSTs are configured.
5689 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
5690 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5693 echo "The mirrored file has been modified, not resynced yet, then "
5694 echo "lost its MDT-object, but relatd OST-objects are still there. "
5695 echo "The layout LFSCK should rebuild the LOV EA and relatd status "
5696 echo "with the PFID EA of related OST-object(s) belong to the file. "
5699 check_mount_and_prep
# Two mirrors (-N), each made of two components (-E <end>), with the OST
# indices pinned by -o: 3 OST-objects per mirror, 6 in total.
5701 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5703 error "(0) Fail to create mirror file $DIR/$tdir/f0"
# Keep the FID: the recovered file is looked up by a FID-based name below.
5705 local fid=$($LFS path2fid $DIR/$tdir/f0)
5707 # The 1st dd + resync ensures all related OST-objects have been written
5708 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5709 error "(1.1) Fail to write $DIR/$tdir/f0"
5710 $LFS mirror resync $DIR/$tdir/f0 ||
5711 error "(1.2) Fail to resync $DIR/$tdir/f0"
5712 # The 2nd dd leaves one mirror stale
5713 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5714 error "(1.3) Fail to write $DIR/$tdir/f0"
5716 cancel_lru_locks mdc
5717 cancel_lru_locks osc
5719 $LFS getstripe $DIR/$tdir/f0 ||
5720 error "(2) Fail to getstripe for $DIR/$tdir/f0"
# Record the lcme_flags found in the first and in the last 10 lines of the
# getstripe output; the same two samples are taken again after recovery.
5722 local saved_flags1=$($LFS getstripe $DIR/$tdir/f0 | head -n 10 |
5723 awk '/lcme_flags/ { print $2 }')
5724 local saved_flags2=$($LFS getstripe $DIR/$tdir/f0 | tail -n 10 |
5725 awk '/lcme_flags/ { print $2 }')
5727 echo "Inject failure, to simulate the case of missing the MDT-object"
5728 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
# fail_loc 0x1616 is set on mds1 for the rm below and cleared afterwards.
5729 do_facet mds1 $LCTL set_param fail_loc=0x1616
5730 rm -f $DIR/$tdir/f0 || error "(3) Fail to remove $DIR/$tdir/f0"
5734 do_facet mds1 $LCTL set_param fail_loc=0
5736 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
# START_LAYOUT is the "lctl lfsck_start ... -t layout" command line built at
# the top of the script; -r and -o are appended to it here.
5737 $START_LAYOUT -r -o || error "(4) Fail to start LFSCK for layout!"
# Wait for every MDT to report status "completed"; 32 is the limit handed
# to wait_update_facet (presumably seconds -- confirm).
5739 for k in $(seq $MDSCOUNT); do
5740 # The LFSCK status query interval is 30 seconds. In case some
5741 # LFSCK_NOTIFY RPCs fail or get lost, wait long enough to
5742 # guarantee that the status has been synced up.
5743 wait_update_facet mds${k} "$LCTL get_param -n \
5744 mdd.$(facet_svc mds${k}).lfsck_layout |
5745 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5746 error "(5) MDS${k} is not the expected 'completed'"
# OSTs are sampled once, with no waiting: each one must already report
# "completed".
5749 for k in $(seq $OSTCOUNT); do
5750 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5751 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5752 awk '/^status/ { print $2 }')
5753 [ "$cur_status" == "completed" ] ||
5754 error "(6) OST${k} Expect 'completed', but got '$cur_status'"
# mds1 must report 6 repaired orphans -- presumably one per OST-object of
# the layout created above (confirm). The message states the same number
# that the condition tests.
5757 local count=$(do_facet mds1 $LCTL get_param -n \
5758 mdd.$(facet_svc mds1).lfsck_layout |
5759 awk '/^repaired_orphan/ { print $2 }')
5760 [ $count -eq 6 ] || error "(7) Expect 6 fixed on mds1, but got: $count"
# Path at which the recovered file is expected: lost+found of MDT0000,
# named <fid>-R-0.
5762 local name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
# The rebuilt layout must again have 2 mirrors and 4 components.
5763 count=$($LFS getstripe --mirror-count $name)
5764 [ $count -eq 2 ] || error "(8) $DIR/$tdir/f0 has $count mirrors"
5766 count=$($LFS getstripe --component-count $name)
5767 [ $count -eq 4 ] || error "(9) $DIR/$tdir/f0 has $count components"
# The flags sampled from the head and the tail of the recovered layout must
# equal the values saved before the MDT-object was lost; on a mismatch the
# full layout is printed before the test is failed.
5769 local flags=$($LFS getstripe $name | head -n 10 |
5770 awk '/lcme_flags/ { print $2 }')
5771 [ "$flags" == "$saved_flags1" ] || {
5772 $LFS getstripe $name
5773 error "(10) expect flags $saved_flags1, got $flags"
5776 flags=$($LFS getstripe $name | tail -n 10 |
5777 awk '/lcme_flags/ { print $2 }')
5778 [ "$flags" == "$saved_flags2" ] || {
5779 $LFS getstripe $name
5780 error "(11) expect flags $saved_flags2, got $flags"
5783 run_test 36c "rebuild LOV EA for mirrored file (3)"
# test_37 (excerpt, registered as "LFSCK must skip a ORPHAN"): creates
# directory d0 on MDT0, hands it to multiop_bg_pause, starts namespace LFSCK
# and waits for all targets to report "completed".
5789 local t_dir="$DIR/$tdir/d0"
5790 check_mount_and_prep
5792 $LFS mkdir -i 0 $t_dir || error "(2) Fail to mkdir $t_dir on MDT0"
# NOTE(review): presumably multiop opens the directory and pauses in the
# background while holding it open -- confirm against test-framework.sh.
5793 multiop_bg_pause $t_dir D_c || { error "multiop failed: $?"; return 1; }
# START_NAMESPACE is the "lctl lfsck_start ... -t namespace" command line
# built at the top of the script; -r and -A are appended to it here.
# NOTE(review): $PID is not assigned on any line visible in this excerpt --
# confirm it holds the background multiop's PID before it is signalled.
# NOTE(review): if error() terminates the test, the kill that follows it on
# the same line never runs -- confirm.
5797 $START_NAMESPACE -r -A || {
5798 error "(3) Fail to start LFSCK for namespace!"; kill -USR1 $PID; }
5800 wait_all_targets_blocked namespace completed 4
5805 run_test 37 "LFSCK must skip a ORPHAN"
# test_38 (excerpt): creates a file with a foreign LOV EA, verifies its
# fields, runs namespace and layout LFSCK and checks that neither repaired
# anything, then verifies that the foreign file is unchanged, still rejects
# restriping, reading and writing, and can be removed.
# Skipped unless the MDS version is newer than 2.12.51.
5809 [[ "$MDS1_VERSION" -le $(version_code 2.12.51) ]] &&
5810 skip "Need MDS version newer than 2.12.51"
5812 test_mkdir $DIR/$tdir
# Two random UUIDs joined by "@" form the foreign value. Each UUID string is
# 36 characters long, so the value is 36 + 1 + 36 = 73 bytes, which is the
# lfm_length checked below.
5813 local uuid1=$(cat /proc/sys/kernel/random/uuid)
5814 local uuid2=$(cat /proc/sys/kernel/random/uuid)
5816 # create foreign file
5817 $LFS setstripe --foreign=none --flags 0xda05 \
5818 -x "${uuid1}@${uuid2}" $DIR/$tdir/$tfile ||
5819 error "$DIR/$tdir/$tfile: create failed"
# Pre-LFSCK checks: magic, length, type, flags and value as reported by
# "lfs getstripe".
5821 $LFS getstripe -v $DIR/$tdir/$tfile |
5822 grep "lfm_magic:.*0x0BD70BD0" ||
5823 error "$DIR/$tdir/$tfile: invalid LOV EA foreign magic"
5824 # lfm_length is LOV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
5825 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_length:.*73" ||
5826 error "$DIR/$tdir/$tfile: invalid LOV EA foreign size"
5827 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_type:.*none" ||
5828 error "$DIR/$tdir/$tfile: invalid LOV EA foreign type"
5829 $LFS getstripe -v $DIR/$tdir/$tfile |
5830 grep "lfm_flags:.*0x0000DA05" ||
5831 error "$DIR/$tdir/$tfile: invalid LOV EA foreign flags"
5832 $LFS getstripe $DIR/$tdir/$tfile |
5833 grep "lfm_value:.*${uuid1}@${uuid2}" ||
5834 error "$DIR/$tdir/$tfile: invalid LOV EA foreign value"
5836 # modifying the striping should fail
5837 $LFS setstripe -c 2 $DIR/$tdir/$tfile &&
5838 error "$DIR/$tdir/$tfile: setstripe should fail"
# Namespace pass: START_NAMESPACE (built at the top of the script) with
# -r -A, then wait until all targets report "completed".
5840 $START_NAMESPACE -r -A || error "Fail to start LFSCK for namespace"
5842 wait_all_targets_blocked namespace completed 1
5844 # check that "global" namespace_repaired == 0 !!!
5845 local repaired=$(do_facet mds1 \
5846 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
5847 awk '/^namespace_repaired/ { print \\\$2 }'")
5848 [ $repaired -eq 0 ] ||
5849 error "(2) Expect no namespace repair, but got: $repaired"
# Layout pass: same procedure with START_LAYOUT.
5851 $START_LAYOUT -A -r || error "Fail to start LFSCK for layout"
5853 wait_all_targets_blocked layout completed 2
5855 # check that "global" layout_repaired == 0 !!!
5856 local repaired=$(do_facet mds1 \
5857 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
5858 awk '/^layout_repaired/ { print \\\$2 }'")
5859 [ $repaired -eq 0 ] ||
5860 error "(2) Expect no layout repair, but got: $repaired"
5862 echo "post-lfsck checks of foreign file"
# Same five field checks as before the LFSCK runs.
5864 $LFS getstripe -v $DIR/$tdir/$tfile |
5865 grep "lfm_magic:.*0x0BD70BD0" ||
5866 error "$DIR/$tdir/$tfile: invalid LOV EA foreign magic"
5867 # lfm_length is LOV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
5868 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_length:.*73" ||
5869 error "$DIR/$tdir/$tfile: invalid LOV EA foreign size"
5870 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_type:.*none" ||
5871 error "$DIR/$tdir/$tfile: invalid LOV EA foreign type"
5872 $LFS getstripe -v $DIR/$tdir/$tfile |
5873 grep "lfm_flags:.*0x0000DA05" ||
5874 error "$DIR/$tdir/$tfile: invalid LOV EA foreign flags"
5875 $LFS getstripe $DIR/$tdir/$tfile |
5876 grep "lfm_value:.*${uuid1}@${uuid2}" ||
5877 error "$DIR/$tdir/$tfile: invalid LOV EA foreign value"
5879 # modifying the striping should fail
5880 $LFS setstripe -c 2 $DIR/$tdir/$tfile &&
5881 error "$DIR/$tdir/$tfile: setstripe should fail"
# Reading from and writing to the foreign file are both expected to fail.
5884 cat $DIR/$tdir/$tfile && error "$DIR/$tdir/$tfile: read should fail"
5885 cat /etc/passwd > $DIR/$tdir/$tfile &&
5886 error "$DIR/$tdir/$tfile: write should fail"
5888 # remove foreign file
5889 rm $DIR/$tdir/$tfile ||
5890 error "$DIR/$tdir/$tfile: remove of foreign file has failed"
5892 run_test 38 "LFSCK does not break foreign file and reverse is also true"
# test_39 (excerpt): creates a directory with a foreign LMV EA, verifies its
# fields, runs namespace and layout LFSCK and checks that neither repaired
# anything, then verifies that the foreign directory is unchanged, still
# rejects file creation, accepts chmod/chown, and can be removed.
# Skipped unless the MDS version is newer than 2.12.51.
5896 [[ "$MDS1_VERSION" -le $(version_code 2.12.51) ]] &&
5897 skip "Need MDS version newer than 2.12.51"
5899 test_mkdir $DIR/$tdir
# Two random UUIDs joined by "@" form the foreign value. Each UUID string is
# 36 characters long, so the value is 36 + 1 + 36 = 73 bytes, which is the
# lfm_length checked below.
5900 local uuid1=$(cat /proc/sys/kernel/random/uuid)
5901 local uuid2=$(cat /proc/sys/kernel/random/uuid)
5903 # create foreign dir
5904 $LFS mkdir --foreign=none --xattr="${uuid1}@${uuid2}" --flags=0xda05 \
5905 $DIR/$tdir/${tdir}2 ||
5906 error "$DIR/$tdir/${tdir}2: create failed"
# Pre-LFSCK checks: magic, length, type, flags and value as reported by
# "lfs getdirstripe".
5908 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
5909 grep "lfm_magic:.*0x0CD50CD0" ||
5910 error "$DIR/$tdir/${tdir}2: invalid LMV EA magic"
5911 # lfm_length is LMV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
5912 # - sizeof(lfm_type) - sizeof(lfm_flags)
5913 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_length:.*73" ||
5914 error "$DIR/$tdir/${tdir}2: invalid LMV EA size"
5915 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_type:.*none" ||
5916 error "$DIR/$tdir/${tdir}2: invalid LMV EA type"
5917 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
5918 grep "lfm_flags:.*0x0000DA05" ||
5919 error "$DIR/$tdir/${tdir}2: invalid LMV EA flags"
5920 $LFS getdirstripe $DIR/$tdir/${tdir}2 |
5921 grep "lfm_value.*${uuid1}@${uuid2}" ||
5922 error "$DIR/$tdir/${tdir}2: invalid LMV EA value"
5924 # file create in dir should fail
# error must be called explicitly: a bare string after && would be executed
# as a command name and would not fail the test.
5925 touch $DIR/$tdir/${tdir}2/$tfile &&
5926 error "$DIR/$tdir/${tdir}2: file create should fail"
# chmod and chown on the foreign directory are expected to succeed.
5929 chmod 777 $DIR/$tdir/${tdir}2 ||
5930 error "$DIR/$tdir/${tdir}2: chmod failed"
5933 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/${tdir}2 ||
5934 error "$DIR/$tdir/${tdir}2: chown failed"
# Namespace pass: START_NAMESPACE (built at the top of the script) with
# -r -A, then wait until all targets report "completed".
5936 $START_NAMESPACE -r -A || error "Fail to start LFSCK for namespace"
5938 wait_all_targets_blocked namespace completed 1
5940 # check that "global" namespace_repaired == 0 !!!
5941 local repaired=$(do_facet mds1 \
5942 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
5943 awk '/^namespace_repaired/ { print \\\$2 }'")
5944 [ $repaired -eq 0 ] ||
5945 error "(2) Expect nothing to be repaired, but got: $repaired"
# Layout pass: same procedure with START_LAYOUT.
5947 $START_LAYOUT -A -r || error "Fail to start LFSCK for layout"
5949 wait_all_targets_blocked layout completed 2
5951 # check that "global" layout_repaired == 0 !!!
5952 local repaired=$(do_facet mds1 \
5953 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
5954 awk '/^layout_repaired/ { print \\\$2 }'")
5955 [ $repaired -eq 0 ] ||
5956 error "(2) Expect no layout repair, but got: $repaired"
5958 echo "post-lfsck checks of foreign dir"
# Same five field checks as before the LFSCK runs.
5960 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
5961 grep "lfm_magic:.*0x0CD50CD0" ||
5962 error "$DIR/$tdir/${tdir}2: invalid LMV EA magic"
5963 # lfm_length is LMV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
5964 # - sizeof(lfm_type) - sizeof(lfm_flags)
5965 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_length:.*73" ||
5966 error "$DIR/$tdir/${tdir}2: invalid LMV EA size"
5967 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_type:.*none" ||
5968 error "$DIR/$tdir/${tdir}2: invalid LMV EA type"
5969 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
5970 grep "lfm_flags:.*0x0000DA05" ||
5971 error "$DIR/$tdir/${tdir}2: invalid LMV EA flags"
5972 $LFS getdirstripe $DIR/$tdir/${tdir}2 |
5973 grep "lfm_value.*${uuid1}@${uuid2}" ||
5974 error "$DIR/$tdir/${tdir}2: invalid LMV EA value"
5976 # file create in dir should fail
# As above, the failure has to be reported through error.
5977 touch $DIR/$tdir/${tdir}2/$tfile &&
5978 error "$DIR/$tdir/${tdir}2: file create should fail"
# chmod and chown must still succeed after both LFSCK passes.
5981 chmod 777 $DIR/$tdir/${tdir}2 ||
5982 error "$DIR/$tdir/${tdir}2: chmod failed"
5985 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/${tdir}2 ||
5986 error "$DIR/$tdir/${tdir}2: chown failed"
# Finally the foreign directory must be removable with a plain rmdir.
5989 rmdir $DIR/$tdir/${tdir}2 ||
5990 error "$DIR/$tdir/${tdir}2: remove of foreign dir has failed"
5992 run_test 39 "LFSCK does not break foreign dir and reverse is also true"
# test_40a (excerpt): a file created in a directory that carries a composite
# layout must report the same layout parameters after the directory has been
# migrated from MDT1 to MDT0 and layout LFSCK has completed.
# Skipped with fewer than 2 MDTs.
5995 [[ $MDSCOUNT -ge 2 ]] || skip "needs >= 2 MDTs"
5997 check_mount_and_prep
# dir1 is created on MDT index 1 and given a three-component layout:
# up to 1M with 1 stripe of 1M, up to 128M with 2 stripes of 4M, then a
# last component up to EOF.
5998 $LFS mkdir -i 1 $DIR/$tdir/dir1
5999 $LFS setstripe -E 1M -c1 -S 1M -E 128M -c2 -S 4M -E eof $DIR/$tdir/dir1
# f1 is created in dir1 and its layout parameters are saved for the
# comparison at the end.
6001 touch $DIR/$tdir/dir1/f1
6002 local layout1=$(get_layout_param $DIR/$tdir/dir1/f1)
6004 echo "Migrate $DIR/$tdir/dir1 from MDT1 to MDT0"
6005 $LFS migrate -m 0 $DIR/$tdir/dir1
6007 echo "trigger LFSCK for layout"
# Same command line as START_LAYOUT (built at the top of the script), with
# -r appended.
6008 do_facet $SINGLEMDS $LCTL lfsck_start -M ${MDT_DEV} -t layout -r
# Wait for the MDT to report status "completed"; 32 is the limit handed to
# wait_update_facet (presumably seconds -- confirm).
6010 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
6011 mdd.${MDT_DEV}.lfsck_layout |
6012 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
6014 error "(2) unexpected status"
# The layout parameters read back after LFSCK must match the saved ones.
6017 local layout2=$(get_layout_param $DIR/$tdir/dir1/f1)
6019 [[ "$layout1" == "$layout2" ]] || error "layout lost after lfsck"
6021 run_test 40a "LFSCK correctly fixes lmm_oi in composite layout"
# test_41 (excerpt, registered as "SEL support in LFSCK"): creates a file
# with "setstripe -z" components, runs LFSCK with lfsck debugging enabled
# on the MDS, and fails if the MDS debug log mentions
# lfsck_layout_verify_header.
# The current MDS debug mask is saved here and written back by the last
# statement of the test.
6025 local old_debug=$(do_facet $SINGLEMDS $LCTL get_param -n debug)
6027 do_facet $SINGLEMDS $LCTL set_param debug=+lfsck
6028 $LFS setstripe -E 1G -z 64M -E -1 -z 128M $DIR/$tfile
# The debug log is dumped to /dev/null -- presumably to empty it so that the
# later "dk" only contains messages from this LFSCK run (confirm).
6029 do_facet $SINGLEMDS $LCTL dk > /dev/null
6031 echo "trigger LFSCK for SEL layout"
6032 do_facet $SINGLEMDS $LCTL lfsck_start -M ${MDT_DEV} -A -t all -r -n on
# Wait for the layout component to report status "completed"; 32 is the
# limit handed to wait_update_facet (presumably seconds -- confirm).
6033 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
6034 mdd.${MDT_DEV}.lfsck_layout |
6035 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
6037 error "(2) unexpected status"
# Any debug-log line containing lfsck_layout_verify_header counts as a
# failure.
6040 local errors=$(do_facet $SINGLEMDS $LCTL dk |
6041 grep "lfsck_layout_verify_header")
6043 [[ "x$errors" == "x" ]] || {
6045 error "lfsck failed"
# NOTE(review): if error() above terminates the test, this restore of the
# debug mask is not reached -- confirm.
6048 do_facet $SINGLEMDS "$LCTL set_param debug='$old_debug'"
6050 run_test 41 "SEL support in LFSCK"
# Script epilogue. MDSSIZE, OSTSIZE and OSTCOUNT were saved into SAVED_* at
# the top of this script before being lowered to speed up formatting; the
# saved values are put back before the final reformat.
6052 # restore the MDS/OST sizes and the OST count
6053 MDSSIZE=${SAVED_MDSSIZE}
6054 OSTSIZE=${SAVED_OSTSIZE}
6055 OSTCOUNT=${SAVED_OSTCOUNT}
6057 # finally, clean up and set up the system again with REFORMAT=yes
6058 REFORMAT="yes" cleanup_and_setup_lustre
6061 check_and_cleanup_lustre