3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
# Locate the Lustre tree (defaults to the parent of this script's directory)
# and source the shared test framework from it.
11 LUSTRE=${LUSTRE:-$(dirname $0)/..}
12 . $LUSTRE/tests/test-framework.sh
16 # bug number for skipped test:
17 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT "
18 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
# With SLOW=no, EXCEPT_SLOW is set to the empty string here.
20 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
# Leave with status 0 when require_dsh_mds does not succeed.
23 require_dsh_mds || exit 0
# Interoperation mode (check_versions failing) and MDS versions below 2.3.60
# are skipped rather than tested.
# NOTE(review): the keyword closing this "if" is not visible in this view.
27 if ! check_versions; then
28 skip "It is NOT necessary to test lfsck under interoperation mode"
32 (( $MDS1_VERSION >= $(version_code 2.3.60) )) ||
33 skip "Need MDS version at least 2.3.60"
# Keep the incoming sizes and OST count in SAVED_* before they may be
# overridden below.
37 SAVED_MDSSIZE=${MDSSIZE}
38 SAVED_OSTSIZE=${OSTSIZE}
39 SAVED_OSTCOUNT=${OSTCOUNT}
40 # use a small MDS + OST size to speed up formatting
41 # do not make MDSSIZE/OSTSIZE too small, since that affects the default journal size
43 [ "$mds1_FSTYPE" == zfs ] && MDSSIZE=300000
45 [ "$ost1_FSTYPE" == zfs ] && OSTSIZE=300000
47 # no need for too many OSTs; capping them reduces the format/start/stop overhead
49 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
51 # build up a clean test environment.
52 REFORMAT="yes" check_and_setup_lustre
# Target names and command prefixes shared by the tests. The START_*, STOP_*
# and SHOW_* strings are expanded unquoted at their call sites, which lets a
# caller append further lfsck_start options (for example "-r").
54 MDT_DEV=$(devicelabel $SINGLEMDS $(facet_device $SINGLEMDS))
55 OST_DEV="${FSNAME}-OST0000"
56 START_NAMESPACE="do_facet $SINGLEMDS \
57 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
58 START_LAYOUT="do_facet $SINGLEMDS \
59 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
60 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
61 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
62 SHOW_NAMESPACE="do_facet $SINGLEMDS \
63 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
64 SHOW_LAYOUT="do_facet $SINGLEMDS \
65 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
66 SHOW_LAYOUT_ON_OST="do_facet ost1 \
67 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# MDS mount option variants: user_xattr alone, with noscrub, and with
# skip_lfsck, each appended to $MDS_MOUNT_OPTS.
68 MOUNT_OPTS_SCRUB="$MDS_MOUNT_OPTS -o user_xattr"
69 MOUNT_OPTS_NOSCRUB="$MDS_MOUNT_OPTS -o user_xattr,noscrub"
70 MOUNT_OPTS_SKIP_LFSCK="$MDS_MOUNT_OPTS -o user_xattr,skip_lfsck"
# Fragment of a file-population helper; its header and the keywords closing
# the "if"/"for" constructs are not visible in this view. It reads $nfiles,
# $ndirs and $igif.
79 echo "preparing... $nfiles * $ndirs files will be created $(date)."
# When $igif is non-empty, fail_loc 0x1504 (OBD_FAIL_FID_IGIF) is set on
# $SINGLEMDS before anything is created.
80 if [ ! -z $igif ]; then
81 #define OBD_FAIL_FID_IGIF 0x1504
82 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
# Populate $DIR/$tdir: copy the test scripts in, then call createmany with
# -d for the d* names and -m for the f* names ($ndirs each), and with -m for
# $nfiles entries inside every d<i>.
85 cp $LUSTRE/tests/*.sh $DIR/$tdir/
86 if [ $ndirs -gt 0 ]; then
87 createmany -d $DIR/$tdir/d $ndirs
88 createmany -m $DIR/$tdir/f $ndirs
89 if [ $nfiles -gt 0 ]; then
90 for ((i = 0; i < $ndirs; i++)); do
91 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
92 /dev/null || error "createmany $nfiles"
95 createmany -d $DIR/$tdir/e $ndirs
# With $igif set, one more file is touched and fail_loc is cleared again.
98 if [ ! -z $igif ]; then
99 touch $DIR/$tdir/dummy
100 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
103 echo "prepared $(date)."
# Fragment of a facet-start helper (header not visible; presumably the
# start_facet called elsewhere in this file -- confirm). It resolves the
# device of $facet, starts the facet with $opts, and raises an error tagged
# with $err when the start fails. Output of "start" is discarded.
110 local dev=$(facet_device $facet)
112 start $facet $dev $opts > /dev/null ||
113 error "($err) Fail to start $facet!"
# Stop an MDS facet, run run_e2fsck with "-n" against its device, and start
# the facet again with the noscrub mount options.
# Returns 0 immediately unless mds1 is ldiskfs-backed.
# NOTE(review): the assignment of $mds and the closing brace are not visible
# in this view.
116 run_e2fsck_on_mds_facet() {
117 [ $mds1_FSTYPE == ldiskfs ] || return 0
121 stop $mds > /dev/null || error "(0) Fail to the stop $mds"
122 local host=$(facet_active_host $mds)
123 local dev=$(facet_device $mds)
# NOTE(review): the command this pipeline feeds, and whatever guards the
# rerun and error below, are not visible in this view.
125 run_e2fsck $host $dev "-n" |
127 run_e2fsck $host $dev "-n"
128 error "(2) Detected inconsistency on $mds"
130 start_facet $mds "$MOUNT_OPTS_NOSCRUB" 3
# Run "lfsck_query -w" on mds1 for component $com and read the number on the
# "${com}_mdts_${status}" line. If it differs from $MDSCOUNT, print the full
# query output and raise an error tagged with $err.
# NOTE(review): the assignments of $com, $status and $err and the closing
# braces are not visible in this view.
133 wait_all_targets_blocked() {
138 local count=$(do_facet mds1 \
139 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 -w |
140 awk '/^${com}_mdts_${status}/ { print \\\$2 }'")
141 [[ $count -eq $MDSCOUNT ]] || {
142 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
143 error "($err) only $count of $MDSCOUNT MDTs are in ${status}"
# Fragment of a second wait helper (header not visible). It hands
# wait_update_facet the same lfsck_query/awk pipeline without "-w", with
# $MDSCOUNT as the expected value and $LTIME as the last argument (presumably
# a timeout -- confirm). On failure it prints the query output and errors.
152 wait_update_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 |
153 awk '/^${com}_mdts_${status}/ { print \\\$2 }'" \
154 "$MDSCOUNT" $LTIME || {
155 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
156 error "($err) some MDTs are not in ${status}"
# Body of test 0 (function header not visible in this view): start, stop and
# restart the namespace LFSCK by hand and check the reported status each time.
# fail_loc 0x1600 with fail_val=3 is set first; presumably this slows the
# scan enough for "scanning-phase1" to be observed -- confirm.
163 #define OBD_FAIL_LFSCK_DELAY1 0x1600
164 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
165 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
167 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
169 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
170 [ "$STATUS" == "scanning-phase1" ] ||
171 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
# Stop the scan and confirm the status becomes "stopped".
173 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
175 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
176 [ "$STATUS" == "stopped" ] ||
177 error "(6) Expect 'stopped', but got '$STATUS'"
# Start again, this time without "-r".
179 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
181 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
182 [ "$STATUS" == "scanning-phase1" ] ||
183 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the fault injection and wait for the status to read "completed".
185 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
186 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
187 mdd.${MDT_DEV}.lfsck_namespace |
188 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
190 error "(9) unexpected status"
# Nothing was corrupted in this test, so updated_phase1 must be 0.
193 local repaired=$($SHOW_NAMESPACE |
194 awk '/^updated_phase1/ { print $2 }')
195 [ $repaired -eq 0 ] ||
196 error "(10) Expect nothing to be repaired, but got: $repaired"
# Run once more with "-r" and require success_count to grow by exactly one.
198 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
199 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
200 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
201 mdd.${MDT_DEV}.lfsck_namespace |
202 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
204 error "(12) unexpected status"
207 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
208 [ $((scanned1 + 1)) -eq $scanned2 ] ||
209 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
211 echo "stopall, should NOT crash LU-3649"
212 stopall || error "(14) Fail to stopall"
214 run_test 0 "Control LFSCK manually"
# Body of test 1a (function header not visible in this view): touch a file
# while fail_loc 0x1501 (OBD_FAIL_FID_INDIR) is set, run a namespace LFSCK
# with "-r" to completion, and require exactly one repair.
219 #define OBD_FAIL_FID_INDIR 0x1501
220 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
221 touch $DIR/$tdir/dummy
223 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
225 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
226 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
227 mdd.${MDT_DEV}.lfsck_namespace |
228 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
230 error "(4) unexpected status"
# The repair count is read from dirent_repaired; if that line is absent the
# updated_phase1 line is used instead.
233 local repaired=$($SHOW_NAMESPACE |
234 awk '/^dirent_repaired/ { print $2 }')
235 # for interop with old server
236 [ -z "$repaired" ] &&
237 repaired=$($SHOW_NAMESPACE |
238 awk '/^updated_phase1/ { print $2 }')
240 [ $repaired -eq 1 ] ||
241 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
243 run_e2fsck_on_mds_facet $SINGLEMDS
245 mount_client $MOUNT || error "(6) Fail to start client!"
# List the directory with fail_loc 0x1505 (OBD_FAIL_FID_LOOKUP) set, then
# clear the fault injection.
247 #define OBD_FAIL_FID_LOOKUP 0x1505
248 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
249 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
251 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
253 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
# Body of test 1b (function header not visible in this view). Skipped unless
# mds1 is ldiskfs. A file is touched while fail_loc 0x1502
# (OBD_FAIL_FID_INLMA) is set; the namespace LFSCK then runs with fail_loc
# 0x1506 (OBD_FAIL_FID_NOLMA) set and must report exactly one repair.
257 [ "$mds1_FSTYPE" != ldiskfs ] &&
258 skip "OI Scrub not implemented for ZFS"
262 #define OBD_FAIL_FID_INLMA 0x1502
263 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
264 touch $DIR/$tdir/dummy
266 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
268 #define OBD_FAIL_FID_NOLMA 0x1506
269 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
270 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
271 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
272 mdd.${MDT_DEV}.lfsck_namespace |
273 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
275 error "(4) unexpected status"
# dirent_repaired is preferred; updated_phase1 is the fallback when the
# first counter is missing from the output.
278 local repaired=$($SHOW_NAMESPACE |
279 awk '/^dirent_repaired/ { print $2 }')
280 # for interop with old server
281 [ -z "$repaired" ] &&
282 repaired=$($SHOW_NAMESPACE |
283 awk '/^updated_phase1/ { print $2 }')
285 [ $repaired -eq 1 ] ||
286 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
# Clear 0x1506 before the e2fsck pass and the client remount.
288 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
289 run_e2fsck_on_mds_facet $SINGLEMDS
291 mount_client $MOUNT || error "(6) Fail to start client!"
# stat the file with fail_loc 0x1505 (OBD_FAIL_FID_LOOKUP) set.
293 #define OBD_FAIL_FID_LOOKUP 0x1505
294 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
295 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
297 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
299 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
# Body of test 1c (function header not visible in this view): touch a file
# while fail_loc 0x1504 (OBD_FAIL_FID_IGIF) is set, run a namespace LFSCK
# with "-r" to completion, and require exactly one repair.
304 #define OBD_FAIL_FID_IGIF 0x1504
305 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
306 touch $DIR/$tdir/dummy
308 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
310 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
311 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
312 mdd.${MDT_DEV}.lfsck_namespace |
313 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
315 error "(4) unexpected status"
# dirent_repaired is preferred; updated_phase1 is the fallback when the
# first counter is missing from the output.
318 local repaired=$($SHOW_NAMESPACE |
319 awk '/^dirent_repaired/ { print $2 }')
320 # for interop with old server
321 [ -z "$repaired" ] &&
322 repaired=$($SHOW_NAMESPACE |
323 awk '/^updated_phase1/ { print $2 }')
325 [ $repaired -eq 1 ] ||
326 error "(5) Fail to repair lost FID-in-dirent: $repaired"
328 run_e2fsck_on_mds_facet $SINGLEMDS
330 mount_client $MOUNT || error "(6) Fail to start client!"
# List the directory with fail_loc 0x1505 (OBD_FAIL_FID_LOOKUP) set.
332 #define OBD_FAIL_FID_LOOKUP 0x1505
333 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
334 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
336 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
338 run_test 1c "LFSCK can find out and repair lost FID-in-dirent"
# Body of test 1d (function header not visible in this view). Skipped for MDS
# versions below 2.13.57 or with fewer than two MDTs.
341 [ $MDS1_VERSION -lt $(version_code 2.13.57) ] &&
342 skip "MDS older than 2.13.57"
343 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
# Create three children of $tdir (a file, a directory and a directory made
# with "lfs mkdir -i 1") and print the parent FID and each child's linkEA.
347 touch $DIR/$tdir/$tfile
348 mkdir $DIR/$tdir/subdir
349 $LFS mkdir -i 1 $DIR/$tdir/remotedir
350 $LFS path2fid $DIR/$tdir
351 ll_decode_linkea $DIR/$tdir/$tfile
352 ll_decode_linkea $DIR/$tdir/subdir
353 ll_decode_linkea $DIR/$tdir/remotedir
355 local mntpt=$(facet_mntpt mds1)
357 # unlink OI files to remove the stale entry
358 local saved_opts=$MDS_MOUNT_OPTS
# Mount the mds1 backing filesystem directly and rewrite trusted.lma on
# ROOT/$tdir: the sed turns the "0" that sits nine characters before the end
# of the line into "1".
361 mount_fstype mds1 $mntpt
362 # increase $tdir FID oid in LMA
363 do_facet mds1 "getfattr -d -m trusted.lma -e hex \
364 --absolute-names $mntpt/ROOT/$tdir | \
365 sed -E 's/0(.{8})$/1\1/' | setfattr --restore=-"
366 unmount_fstype mds1 $mntpt
369 # the FID oid in LMA was increased above, and it's not in OI table,
370 # run scrub first to generate mapping in OI, so the following namespace
371 # check can fix linkea correctly, this is not necessary normally.
372 do_facet mds1 $LCTL lfsck_start -M ${MDT_DEV} -t scrub ||
373 error "failed to start LFSCK for scrub!"
374 wait_update_facet mds1 "$LCTL get_param -n \
375 osd-*.$(facet_svc mds1).oi_scrub |
376 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
377 error "unexpected status"
# Namespace LFSCK started with "-r -A", then waited on until "completed".
379 $START_NAMESPACE -r -A || error "fail to start LFSCK for namespace!"
380 wait_update_facet mds1 "$LCTL get_param -n \
381 mdd.${MDT_DEV}.lfsck_namespace |
382 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
384 error "unexpected status"
# Print the same information again for comparison with the earlier output.
386 $LFS path2fid $DIR/$tdir
387 ll_decode_linkea $DIR/$tdir/$tfile
388 ll_decode_linkea $DIR/$tdir/subdir
389 ll_decode_linkea $DIR/$tdir/remotedir
# Every child's linkEA parent FID must now equal the FID reported for $tdir.
394 fid=$($LFS path2fid $DIR/$tdir)
395 for f in $tfile subdir remotedir; do
396 pfid=$(ll_decode_linkea $DIR/$tdir/$f |
397 awk '/pfid/ { print $3 }')
399 [ "$pfid" == "$fid" ] || error "$fid in LMA != $pfid in linkea"
402 run_test 1d "LFSCK can fix mismatch of FID in LMA and FID in child linkea"
# Body of test 2a (function header not visible in this view): touch a file
# while fail_loc 0x1603 (OBD_FAIL_LFSCK_LINKEA_CRASH) is set, run a namespace
# LFSCK with "-r" to completion, and require exactly one linkEA repair.
407 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
408 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
409 touch $DIR/$tdir/dummy
411 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
413 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
414 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
415 mdd.${MDT_DEV}.lfsck_namespace |
416 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
418 error "(4) unexpected status"
# linkea_repaired is preferred; updated_phase2 is the fallback when the
# first counter is missing from the output.
421 local repaired=$($SHOW_NAMESPACE |
422 awk '/^linkea_repaired/ { print $2 }')
423 # for interop with old server
424 [ -z "$repaired" ] &&
425 repaired=$($SHOW_NAMESPACE |
426 awk '/^updated_phase2/ { print $2 }')
428 [ $repaired -eq 1 ] ||
429 error "(5) Fail to repair crashed linkEA: $repaired"
431 run_e2fsck_on_mds_facet $SINGLEMDS
433 mount_client $MOUNT || error "(6) Fail to start client!"
# The file must show a link count of 1, and its FID must map back to the
# original path through fid2path.
435 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
436 error "(7) Fail to stat $DIR/$tdir/dummy"
438 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
439 local dummyname=$($LFS fid2path $DIR $dummyfid)
440 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
441 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
443 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
# Body of test 2b (function header not visible in this view): touch a file
# while fail_loc 0x1604 (OBD_FAIL_LFSCK_LINKEA_MORE) is set, run a namespace
# LFSCK with "-r" to completion, and require updated_phase2 to be exactly 1.
449 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
450 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
451 touch $DIR/$tdir/dummy
453 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
455 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
456 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
457 mdd.${MDT_DEV}.lfsck_namespace |
458 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
460 error "(4) unexpected status"
463 local repaired=$($SHOW_NAMESPACE |
464 awk '/^updated_phase2/ { print $2 }')
465 [ $repaired -eq 1 ] ||
466 error "(5) Fail to repair crashed linkEA: $repaired"
468 run_e2fsck_on_mds_facet $SINGLEMDS
470 mount_client $MOUNT || error "(6) Fail to start client!"
# The file must show a link count of 1, and its FID must map back to the
# original path through fid2path.
472 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
473 error "(7) Fail to stat $DIR/$tdir/dummy"
475 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
476 local dummyname=$($LFS fid2path $DIR $dummyfid)
477 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
478 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
480 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
# Body of test 2c (function header not visible in this view). Skipped unless
# the MDS version is above 2.4.90. A file is touched while fail_loc 0x1605
# (OBD_FAIL_LFSCK_LINKEA_MORE2) is set; the namespace LFSCK must then report
# updated_phase2 equal to 1.
484 (( $MDS1_VERSION > $(version_code 2.4.90) )) ||
485 skip "MDS older than 2.4.90"
489 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
490 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
491 touch $DIR/$tdir/dummy
493 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
495 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
496 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
497 mdd.${MDT_DEV}.lfsck_namespace |
498 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
500 error "(4) unexpected status"
503 local repaired=$($SHOW_NAMESPACE |
504 awk '/^updated_phase2/ { print $2 }')
505 [ $repaired -eq 1 ] ||
506 error "(5) Fail to repair crashed linkEA: $repaired"
508 run_e2fsck_on_mds_facet $SINGLEMDS
510 mount_client $MOUNT || error "(6) Fail to start client!"
# The file must show a link count of 1, and its FID must map back to the
# original path through fid2path.
512 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
513 error "(7) Fail to stat $DIR/$tdir/dummy"
515 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
516 local dummyname=$($LFS fid2path $DIR $dummyfid)
517 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
518 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
520 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
# Body of test 2d (function header not visible in this view). Skipped unless
# the MDS version is above 2.6.50. A file is touched while fail_loc 0x161d
# (OBD_FAIL_LFSCK_NO_LINKEA) is set; the namespace LFSCK must then report
# linkea_repaired equal to 1.
524 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
525 skip "MDS older than 2.6.50, LU-4788"
529 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
530 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
531 touch $DIR/$tdir/dummy
533 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
535 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
536 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
537 mdd.${MDT_DEV}.lfsck_namespace |
538 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
540 error "(4) unexpected status"
543 local repaired=$($SHOW_NAMESPACE |
544 awk '/^linkea_repaired/ { print $2 }')
545 [ $repaired -eq 1 ] ||
546 error "(5) Fail to repair crashed linkEA: $repaired"
548 run_e2fsck_on_mds_facet $SINGLEMDS
550 mount_client $MOUNT || error "(6) Fail to start client!"
# The file must show a link count of 1, and its FID must map back to the
# original path through fid2path.
552 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
553 error "(7) Fail to stat $DIR/$tdir/dummy"
555 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
556 local dummyname=$($LFS fid2path $DIR $dummyfid)
557 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
558 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
560 run_test 2d "LFSCK can recover the missing linkEA entry"
# Body of test 2e (function header not visible in this view). Skipped with
# fewer than two MDTs or when the MDS version is not above 2.6.50.
564 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
565 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
566 skip "MDS older than 2.6.50, LU-5511"
# d0 is created on MDT1; d1 is created beneath it on MDT0 while fail_loc
# 0x1603 (OBD_FAIL_LFSCK_LINKEA_CRASH) is set.
570 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
572 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
573 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
574 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
575 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Start the namespace LFSCK with "-r -A" and wait until the namespace
# component is reported "completed"; 4 is the error tag on failure.
577 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
579 wait_all_targets_blocked namespace completed 4
581 local repaired=$($SHOW_NAMESPACE |
582 awk '/^linkea_repaired/ { print $2 }')
583 [ $repaired -eq 1 ] ||
584 error "(5) Fail to repair crashed linkEA: $repaired"
# The FID of d1 must map back to its path through fid2path.
586 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
587 local name=$($LFS fid2path $DIR $fid)
588 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
589 error "(6) Fail to repair linkEA: $fid $name"
591 run_test 2e "namespace LFSCK can verify remote object linkEA"
# Body of test 3 (function header not visible in this view). Skipped unless
# the MDS version is above 2.6.50.
595 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
596 skip "MDS older than 2.6.50, LU-4788"
# Two hard links are made with no fault injection active.
600 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
601 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
602 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
# Two more are made under edir: one with fail_loc 0x1603
# (OBD_FAIL_LFSCK_LINKEA_CRASH) set, one with 0x1604
# (OBD_FAIL_LFSCK_LINKEA_MORE) set.
604 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
605 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
606 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
608 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
609 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
610 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
612 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
613 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
614 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
616 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
618 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
619 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
620 mdd.${MDT_DEV}.lfsck_namespace |
621 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
623 error "(10) unexpected status"
# Expect at least 4 objects in checked_phase2 and at least 2 in
# multiple_linked_repaired.
626 local checked=$($SHOW_NAMESPACE |
627 awk '/^checked_phase2/ { print $2 }')
628 [ $checked -ge 4 ] ||
629 error "(11) Fail to check multiple-linked object: $checked"
631 local repaired=$($SHOW_NAMESPACE |
632 awk '/^multiple_linked_repaired/ { print $2 }')
633 [ $repaired -ge 2 ] ||
634 error "(12) Fail to repair multiple-linked object: $repaired"
636 run_test 3 "LFSCK can verify multiple-linked objects"
# Body of test 4 (function header not visible in this view). Skipped unless
# mds1 is ldiskfs.
640 [ "$mds1_FSTYPE" != ldiskfs ] &&
641 skip "OI Scrub not implemented for ZFS"
# Stop the client and the MDS, run mds_backup_restore, and bring the MDS back
# with the noscrub mount options (error tag 2).
644 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
645 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop $SINGLEMDS!"
647 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
648 echo "start $SINGLEMDS with disabling OI scrub"
649 start_facet $SINGLEMDS "$MOUNT_OPTS_NOSCRUB" 2
# With fail_loc 0x1601 (OBD_FAIL_LFSCK_DELAY2) and fail_val=1 set, start the
# namespace LFSCK and wait for its flags line to read "inconsistent".
651 #define OBD_FAIL_LFSCK_DELAY2 0x1601
652 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
653 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
654 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
655 mdd.${MDT_DEV}.lfsck_namespace |
656 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
658 error "(5) unexpected status"
661 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
662 [ "$STATUS" == "scanning-phase1" ] ||
663 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the fault injection and wait for the status to read "completed".
665 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
666 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
667 mdd.${MDT_DEV}.lfsck_namespace |
668 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
670 error "(7) unexpected status"
# After completion the flags must be empty and at least 9 repairs reported.
673 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
674 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
676 local repaired=$($SHOW_NAMESPACE |
677 awk '/^dirent_repaired/ { print $2 }')
678 # for interop with old server
679 [ -z "$repaired" ] &&
680 repaired=$($SHOW_NAMESPACE |
681 awk '/^updated_phase1/ { print $2 }')
683 [ $repaired -ge 9 ] ||
684 error "(9) Fail to re-generate FID-in-dirent: $repaired"
686 run_e2fsck_on_mds_facet $SINGLEMDS
688 mount_client $MOUNT || error "(10) Fail to start client!"
# List the directory with fail_loc 0x1505 (OBD_FAIL_FID_LOOKUP) set.
690 #define OBD_FAIL_FID_LOOKUP 0x1505
691 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
692 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
693 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
695 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
# Body of test 5 (function header not visible in this view). Skipped unless
# mds1 is ldiskfs.
699 [ "$mds1_FSTYPE" != ldiskfs ] &&
700 skip "OI Scrub not implemented for ZFS"
# Stop the client and the MDS, run mds_backup_restore with an extra argument
# of 1, and bring the MDS back with the noscrub mount options (error tag 2).
703 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
704 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop $SINGLEMDS!"
706 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
707 echo "start $SINGLEMDS with disabling OI scrub"
708 start_facet $SINGLEMDS "$MOUNT_OPTS_NOSCRUB" 2
# With fail_loc 0x1601 (OBD_FAIL_LFSCK_DELAY2) and fail_val=1 set, start the
# namespace LFSCK and wait for its flags line to read "inconsistent,upgrade".
710 #define OBD_FAIL_LFSCK_DELAY2 0x1601
711 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
712 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
713 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
714 mdd.${MDT_DEV}.lfsck_namespace |
715 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
717 error "(5) unexpected status"
720 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
721 [ "$STATUS" == "scanning-phase1" ] ||
722 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the fault injection and wait for the status to read "completed".
724 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
725 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
726 mdd.${MDT_DEV}.lfsck_namespace |
727 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
729 error "(7) unexpected status"
# After completion the flags must be empty and at least 2 repairs reported.
732 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
733 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
735 local repaired=$($SHOW_NAMESPACE |
736 awk '/^dirent_repaired/ { print $2 }')
737 # for interop with old server
738 [ -z "$repaired" ] &&
739 repaired=$($SHOW_NAMESPACE |
740 awk '/^updated_phase1/ { print $2 }')
742 [ $repaired -ge 2 ] ||
743 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
745 run_e2fsck_on_mds_facet $SINGLEMDS
747 mount_client $MOUNT || error "(10) Fail to start client!"
# stat the file and list the directory with fail_loc 0x1505
# (OBD_FAIL_FID_LOOKUP) set.
749 #define OBD_FAIL_FID_LOOKUP 0x1505
750 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
751 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
753 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
# With injection cleared, the file's FID must map back to its path.
755 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
756 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
757 local dummyname=$($LFS fid2path $DIR $dummyfid)
758 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
759 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
761 run_test 5 "LFSCK can handle IGIF object upgrading"
# Body of test 6a (function header not visible in this view): make a running
# namespace LFSCK fail, restart it without "-r", and check that the reported
# latest_start_position is larger than the earlier last_checkpoint_position.
766 #define OBD_FAIL_LFSCK_DELAY1 0x1600
767 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
768 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
770 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
771 [ "$STATUS" == "scanning-phase1" ] ||
772 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
# NOTE(review): the sleep that the next comment describes is not visible in
# this view.
774 # Sleep 3 sec to guarantee at least one object processed by LFSCK
776 # Fail the LFSCK to guarantee there is at least one checkpoint
777 #define OBD_FAIL_LFSCK_FATAL1 0x1608
778 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
779 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
780 mdd.${MDT_DEV}.lfsck_namespace |
781 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
783 error "(4) unexpected status"
# NOTE(review): the final stage of the POS0 pipeline below is not visible in
# this view.
786 local POS0=$($SHOW_NAMESPACE |
787 awk '/^last_checkpoint_position/ { print $2 }' |
790 #define OBD_FAIL_LFSCK_DELAY1 0x1600
791 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
792 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
794 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
795 [ "$STATUS" == "scanning-phase1" ] ||
796 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# NOTE(review): the final stage of the POS1 pipeline below is not visible in
# this view.
798 local POS1=$($SHOW_NAMESPACE |
799 awk '/^latest_start_position/ { print $2 }' |
801 [[ $POS0 -lt $POS1 ]] ||
802 error "(7) Expect larger than: $POS0, but got $POS1"
# Clear the fault injection and wait for the status to read "completed".
804 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
805 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
806 mdd.${MDT_DEV}.lfsck_namespace |
807 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
809 error "(8) unexpected status"
812 run_test 6a "LFSCK resumes from last checkpoint (1)"
# Body of test 6b (function header not visible in this view): same scenario
# as the "(1)" variant, but with fail_loc 0x1601/0x1609 and with both the
# second and the fourth field of the position lines compared.
817 #define OBD_FAIL_LFSCK_DELAY2 0x1601
818 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
819 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
821 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
822 [ "$STATUS" == "scanning-phase1" ] ||
823 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
# NOTE(review): the sleep that the next comment describes is not visible in
# this view.
825 # Sleep 5 sec to guarantee that we are in the directory scanning
827 # Fail the LFSCK to guarantee there is at least one checkpoint
828 #define OBD_FAIL_LFSCK_FATAL2 0x1609
829 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
830 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
831 mdd.${MDT_DEV}.lfsck_namespace |
832 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
834 error "(4) unexpected status"
# O_POS0 and D_POS0 come from fields 2 and 4 of last_checkpoint_position.
# NOTE(review): the final stage of the O_POS0 pipeline is not visible here.
837 local O_POS0=$($SHOW_NAMESPACE |
838 awk '/^last_checkpoint_position/ { print $2 }' |
841 local D_POS0=$($SHOW_NAMESPACE |
842 awk '/^last_checkpoint_position/ { print $4 }')
844 #define OBD_FAIL_LFSCK_DELAY2 0x1601
845 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
846 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
848 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
849 [ "$STATUS" == "scanning-phase1" ] ||
850 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# O_POS1 and D_POS1 come from fields 2 and 4 of latest_start_position.
# NOTE(review): the final stage of the O_POS1 pipeline is not visible here.
852 local O_POS1=$($SHOW_NAMESPACE |
853 awk '/^latest_start_position/ { print $2 }' |
855 local D_POS1=$($SHOW_NAMESPACE |
856 awk '/^latest_start_position/ { print $4 }')
858 echo "Additional debug for 6b"
# If either D_POS value is "N/A" or "0x0", the O_POS values are compared.
# NOTE(review): the D_POS comparison below presumably sits in an else branch
# whose keyword is not visible in this view -- confirm.
860 if [ "$D_POS0" == "N/A" -o "$D_POS0" == "0x0" \
861 -o "$D_POS1" == "0x0" -o "$D_POS1" == "N/A" ]; then
862 [[ $O_POS0 -lt $O_POS1 ]] ||
863 error "(7.1) $O_POS1 is not larger than $O_POS0"
865 [[ $D_POS0 -lt $D_POS1 ]] ||
866 error "(7.2) $D_POS1 is not larger than $D_POS0"
# Clear the fault injection and wait for the status to read "completed".
869 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
870 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
871 mdd.${MDT_DEV}.lfsck_namespace |
872 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
874 error "(8) unexpected status"
877 run_test 6b "LFSCK resumes from last checkpoint (2)"
# Body of test 7a (function header not visible in this view): stop the MDS
# while the namespace LFSCK reports "scanning-phase1", start it again with
# the MOUNT_OPTS_SCRUB options, and expect the status to reach "completed".
884 #define OBD_FAIL_LFSCK_DELAY2 0x1601
885 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
886 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
888 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
889 [ "$STATUS" == "scanning-phase1" ] ||
890 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
# NOTE(review): the sleep that the next comment describes is not visible in
# this view.
892 # Sleep 3 sec to guarantee at least one object processed by LFSCK
894 echo "stop $SINGLEMDS"
895 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop $SINGLEMDS!"
# The fault injection is cleared between the stop and the restart; no
# lfsck_start is issued after the restart.
897 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
898 echo "start $SINGLEMDS"
899 start_facet $SINGLEMDS "$MOUNT_OPTS_SCRUB" 5
901 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
902 mdd.${MDT_DEV}.lfsck_namespace |
903 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
905 error "(6) unexpected status"
908 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
# Body of test 7b (function header not visible in this view): like the "(1)"
# variant, but the MDS is stopped once the status reads "scanning-phase2".
# Twenty files are touched first with fail_loc 0x1604
# (OBD_FAIL_LFSCK_LINKEA_MORE) set.
914 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
915 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
916 for ((i = 0; i < 20; i++)); do
917 touch $DIR/$tdir/dummy${i}
# With fail_loc 0x1602 (OBD_FAIL_LFSCK_DELAY3) and fail_val=1 set, start the
# LFSCK and wait for "scanning-phase2".
920 #define OBD_FAIL_LFSCK_DELAY3 0x1602
921 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
922 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
923 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
924 mdd.${MDT_DEV}.lfsck_namespace |
925 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
927 error "(4) unexpected status"
931 echo "stop $SINGLEMDS"
932 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop $SINGLEMDS!"
# The fault injection is cleared between the stop and the restart; no
# lfsck_start is issued after the restart.
934 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
935 echo "start $SINGLEMDS"
936 start_facet $SINGLEMDS "$MOUNT_OPTS_SCRUB" 6
938 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
939 mdd.${MDT_DEV}.lfsck_namespace |
940 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
942 error "(7) unexpected status"
945 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
956 formatall > /dev/null
962 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
963 [ "$STATUS" == "init" ] ||
964 namespace_error "(2) Expect 'init', but got '$STATUS'"
966 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
967 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
968 mkdir $DIR/$tdir/crashed
970 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
971 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
972 for ((i = 0; i < 5; i++)); do
973 touch $DIR/$tdir/dummy${i}
976 umount_client $MOUNT || error "(3) Fail to stop client!"
978 #define OBD_FAIL_LFSCK_DELAY2 0x1601
979 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
981 namespace_error "(4) Fail to start LFSCK for namespace!"
983 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
984 [ "$STATUS" == "scanning-phase1" ] ||
985 namespace_error "(5) Expect 'scanning-phase1', but got '$STATUS'"
987 $STOP_LFSCK || namespace_error "(6) Fail to stop LFSCK!"
989 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
990 [ "$STATUS" == "stopped" ] ||
991 namespace_error "(7) Expect 'stopped', but got '$STATUS'"
994 namespace_error "(8) Fail to start LFSCK for namespace!"
996 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
997 [ "$STATUS" == "scanning-phase1" ] ||
998 namespace_error "(9) Expect 'scanning-phase1', but got '$STATUS'"
1000 #define OBD_FAIL_LFSCK_FATAL2 0x1609
1001 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
1002 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1003 mdd.${MDT_DEV}.lfsck_namespace |
1004 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
1006 namespace_error "(10) unexpected status"
1009 #define OBD_FAIL_LFSCK_DELAY1 0x1600
1010 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
1012 namespace_error "(11) Fail to start LFSCK for namespace!"
1014 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1015 [ "$STATUS" == "scanning-phase1" ] ||
1016 namespace_error "(12) Expect 'scanning-phase1', but got '$STATUS'"
1018 #define OBD_FAIL_LFSCK_CRASH 0x160a
1019 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
1022 echo "stop $SINGLEMDS"
1023 stop $SINGLEMDS > /dev/null || namespace_error "(13) Fail to stop MDS!"
1025 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
1026 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
1028 echo "start $SINGLEMDS"
1029 start_facet $SINGLEMDS "$MOUNT_OPTS_SCRUB" 14
1031 local timeout=$(max_recovery_time)
1034 while [ $timer -lt $timeout ]; do
1035 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1036 mdt.${MDT_DEV}.recovery_status |
1037 awk '/^status/ { print \\\$2 }'")
1038 [ "$STATUS" != "RECOVERING" ] && break;
1040 timer=$((timer + 1))
1043 [ $timer != $timeout ] || (
1044 do_facet $SINGLEMDS "$LCTL get_param -n \
1045 mdt.${MDT_DEV}.recovery_status"
1046 error "(14.1) recovery timeout"
1049 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1050 [ "$STATUS" == "crashed" ] ||
1051 namespace_error "(15) Expect 'crashed', but got '$STATUS'"
1053 #define OBD_FAIL_LFSCK_DELAY2 0x1601
1054 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
1057 [ -n "$($SHOW_NAMESPACE |
1058 grep -E "status: init|status: completed")" ] && {
1060 namespace_error "(16) Fail to start LFSCK for namespace!"
1061 } || echo "lfsck for namespace has been started"
1063 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1064 [ "$STATUS" == "scanning-phase1" ] ||
1065 namespace_error "(17) Expect 'scanning-phase1', but got '$STATUS'"
1067 echo "stop $SINGLEMDS"
1068 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop $SINGLEMDS!"
1070 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
1071 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
1073 echo "start $SINGLEMDS"
1074 start_facet $SINGLEMDS "$MOUNT_OPTS_SCRUB" 19
1077 while [ $timer -lt $timeout ]; do
1078 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1079 mdt.${MDT_DEV}.recovery_status |
1080 awk '/^status/ { print \\\$2 }'")
1081 [ "$STATUS" != "RECOVERING" ] && break;
1083 timer=$((timer + 1))
1086 [ $timer != $timeout ] || (
1087 do_facet $SINGLEMDS "$LCTL get_param -n \
1088 mdt.${MDT_DEV}.recovery_status"
1089 error "(19.1) recovery timeout"
1092 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1093 [ "$STATUS" == "paused" ] ||
1094 namespace_error "(20) Expect 'paused', but got '$STATUS'"
1096 echo "stop $SINGLEMDS"
1097 stop $SINGLEMDS > /dev/null || error "(20.1) Fail to stop MDS!"
1099 echo "start $SINGLEMDS without resume LFSCK"
1100 start_facet $SINGLEMDS "$MOUNT_OPTS_SKIP_LFSCK" 20.2
1103 while [ $timer -lt $timeout ]; do
1104 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1105 mdt.${MDT_DEV}.recovery_status |
1106 awk '/^status/ { print \\\$2 }'")
1107 [ "$STATUS" != "RECOVERING" ] && break;
1109 timer=$((timer + 1))
1112 [ $timer != $timeout ] || (
1113 do_facet $SINGLEMDS "$LCTL get_param -n \
1114 mdt.${MDT_DEV}.recovery_status"
1115 error "(20.3) recovery timeout"
1118 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1119 [ "$STATUS" == "paused" ] ||
1120 namespace_error "(20.4) Expect 'paused', but got '$STATUS'"
1122 #define OBD_FAIL_LFSCK_DELAY3 0x1602
1123 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
1126 namespace_error "(21) Fail to start LFSCK for namespace!"
1127 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1128 mdd.${MDT_DEV}.lfsck_namespace |
1129 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
1131 namespace_error "(22) unexpected status"
1134 # wait to process one inode at least (OBD_FAIL_LFSCK_DELAY3)
1137 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1138 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
1139 namespace_error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
1141 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1142 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1143 mdd.${MDT_DEV}.lfsck_namespace |
1144 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1146 namespace_error "(24) unexpected status"
1149 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1151 namespace_error "(25) Expect empty flags, but got '$FLAGS'"
1153 run_test 8 "LFSCK state machine"
# test_9a (body excerpt): check that the layout LFSCK obeys its speed limit,
# first at BASE_SPEED1 and then after the limit is raised to BASE_SPEED2.
# NOTE(review): the embedded line numbers have gaps, so parts of this function
# (its header, the RUN_TIME1/RUN_TIME2/TIME_DIFF setup, closing keywords) are
# not visible in this listing -- confirm against the complete script.
#
# Skip when /proc/cpuinfo has no "processor ...: 1" entry.
1156 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1157 skip "Testing on UP system, the speed may be inaccurate."
# Build the data set to be scanned: a directory created with "-i 0" holding
# 5000 files.
1161 check_mount_and_prep
1162 $LFS mkdir -i 0 $DIR/$tdir/lfsck || error "(1) Fail to mkdir lfsck"
1163 $LFS setstripe -c 1 -i -1 $DIR/$tdir/lfsck
1164 createmany -o $DIR/$tdir/lfsck/f 5000
1166 local BASE_SPEED1=100
# Start the layout LFSCK with reset (-r) and the first speed limit (-s).
1168 $START_LAYOUT -r -s $BASE_SPEED1 || error "(2) Fail to start LFSCK!"
# With the limit applied the scan is expected to still be in phase 1.
1171 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
1172 [ "$STATUS" == "scanning-phase1" ] ||
1173 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
1175 local SPEED=$($SHOW_LAYOUT |
1176 awk '/^average_speed_phase1/ { print $2 }')
1178 # There may be time error, normally it should be less than 2 seconds.
1179 # We allow another 20% schedule error.
1181 # MAX_MARGIN = 1.3 = 13 / 10
# Upper bound: the limit scaled by (RUN_TIME1 + TIME_DIFF) / RUN_TIME1 and by
# the 13/10 margin, all in integer shell arithmetic.
1182 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) /
1183 RUN_TIME1 * 13 / 10))
1184 [ $SPEED -lt $MAX_SPEED ] || {
1186 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1187 error "(4) Speed $SPEED, expected < $MAX_SPEED"
1190 # adjust speed limit
1191 local BASE_SPEED2=300
# Change the limit of the running scan through the lfsck_speed_limit parameter.
1193 do_facet $SINGLEMDS \
1194 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1197 SPEED=$($SHOW_LAYOUT | awk '/^average_speed_phase1/ { print $2 }')
1198 # MIN_MARGIN = 0.7 = 7 / 10
# Lower bound: time-weighted mix of both limits, each run time shortened by
# TIME_DIFF, scaled by the 7/10 margin.
1199 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) +
1200 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) /
1201 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
# On a non-ldiskfs MDT a too-slow result is reported through error_ignore
# (LU-5624); the command that emits message (5.2) is not visible here.
1202 [ $SPEED -gt $MIN_SPEED ] || {
1203 if [ $mds1_FSTYPE != ldiskfs ]; then
1204 error_ignore LU-5624 \
1205 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
1208 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
1212 # MAX_MARGIN = 1.3 = 13 / 10
1213 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) +
1214 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) /
1215 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1216 [ $SPEED -lt $MAX_SPEED ] || {
1218 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1219 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1220 error "(6) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit to 0 on every MDT and OST node (presumably "no limit"
# -- TODO confirm) so the scan can finish, then wait for 'completed'.
1223 do_nodes $(comma_list $(mdts_nodes)) \
1224 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1225 do_nodes $(comma_list $(osts_nodes)) \
1226 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1228 wait_update_facet $SINGLEMDS \
1229 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout |
1230 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
1231 error "(7) Failed to get expected 'completed'"
# Register the function above as test 9a.
1233 run_test 9a "LFSCK speed control (1)"
# test_9b (body excerpt): check the speed limit of the namespace LFSCK while
# it is in its second scanning phase.
# NOTE(review): the embedded line numbers have gaps; the function header, the
# RUN_TIME1/RUN_TIME2/TIME_DIFF setup and closing keywords are not visible.
#
# Skip when /proc/cpuinfo has no "processor ...: 1" entry.
1236 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1237 skip "Testing on UP system, the speed may be inaccurate."
# Create 50 directories, 50 files, and 50 files inside each directory while
# fail_loc 0x1604 is set on the MDS ("with error" per the echo text).
1243 echo "Preparing another 50 * 50 files (with error) at $(date)."
1244 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1245 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1246 createmany -d $DIR/$tdir/d 50
1247 createmany -m $DIR/$tdir/f 50
1248 for ((i = 0; i < 50; i++)); do
1249 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
# First pass: with fail_loc 0x160c set, start a reset scan and wait until the
# namespace LFSCK reports 'stopped'.
1252 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
1253 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1254 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
1255 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1256 mdd.${MDT_DEV}.lfsck_namespace |
1257 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1259 error "(5) unexpected status"
1262 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1263 echo "Prepared at $(date)."
1265 local BASE_SPEED1=50
# Second pass: start again without -r, limited to BASE_SPEED1; the scan is
# expected to be in phase 2 when checked.
1267 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
1270 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1271 [ "$STATUS" == "scanning-phase2" ] ||
1272 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
1274 local SPEED=$($SHOW_NAMESPACE |
1275 awk '/^average_speed_phase2/ { print $2 }')
1276 # There may be time error, normally it should be less than 2 seconds.
1277 # We allow another 20% schedule error.
1279 # MAX_MARGIN = 1.3 = 13 / 10
# Upper bound for the first limit, computed in integer shell arithmetic.
1280 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) /
1281 RUN_TIME1 * 13 / 10))
1282 [ $SPEED -lt $MAX_SPEED ] || {
1284 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1285 error "(8) Speed $SPEED, expected < $MAX_SPEED"
1288 # adjust speed limit
1289 local BASE_SPEED2=150
# Raise the limit of the running scan through the lfsck_speed_limit parameter.
1291 do_facet $SINGLEMDS \
1292 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1295 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
1296 # MIN_MARGIN = 0.7 = 7 / 10
# Lower bound: time-weighted mix of both limits scaled by the 7/10 margin.
1297 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) +
1298 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) /
1299 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
# On a non-ldiskfs MDT a too-slow result goes through error_ignore (LU-5624);
# the command that emits message (9.2) is not visible here.
1300 [ $SPEED -gt $MIN_SPEED ] || {
1301 if [ $mds1_FSTYPE != ldiskfs ]; then
1302 error_ignore LU-5624 \
1303 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1306 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
1310 # MAX_MARGIN = 1.3 = 13 / 10
1311 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) +
1312 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) /
1313 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1314 [ $SPEED -lt $MAX_SPEED ] || {
1316 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1317 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1318 error "(10) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit to 0 on every MDT and OST node (presumably "no limit"
# -- TODO confirm) and wait for the scan to report 'completed'.
1321 do_nodes $(comma_list $(mdts_nodes)) \
1322 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1323 do_nodes $(comma_list $(osts_nodes)) \
1324 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1325 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1326 mdd.${MDT_DEV}.lfsck_namespace |
1327 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1329 error "(11) unexpected status"
# Register the function above as test 9b.
1332 run_test 9b "LFSCK speed control (2)"
# test_10 (body excerpt): run ordinary client operations while a rate-limited
# namespace LFSCK is in phase 1 and check that they all succeed.
# NOTE(review): the embedded line numbers have gaps; the function header and
# loop terminators are not visible in this listing.
#
# Only runs when the first MDS uses ldiskfs.
1336 [[ $mds1_FSTYPE == ldiskfs ]] || skip "lookup(..)/linkea on ZFS issue"
# Even-numbered entries (d0, f0, d2, ...) are created with fail_loc 0x1603 set.
1340 echo "Preparing more files with error at $(date)."
1341 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1342 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
1344 for ((i = 0; i < 1000; i = $((i+2)))); do
1345 mkdir -p $DIR/$tdir/d${i}
1346 touch $DIR/$tdir/f${i}
1347 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
# Odd-numbered entries are created with fail_loc 0x1604 set instead.
1350 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1351 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1353 for ((i = 1; i < 1000; i = $((i+2)))); do
1354 mkdir -p $DIR/$tdir/d${i}
1355 touch $DIR/$tdir/f${i}
1356 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1359 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1360 echo "Prepared at $(date)."
# Give f200 a second name; f200 itself is unlinked during the scan below.
1362 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
# Remount the client before starting the scan.
1364 umount_client $MOUNT
1365 mount_client $MOUNT || error "(3) Fail to start client!"
# Start a reset scan limited with "-s 100" so it stays in phase 1 while the
# operations below run.
1367 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1370 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1371 [ "$STATUS" == "scanning-phase1" ] ||
1372 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Client operations that must work during the scan: list, create, mkdir,
# unlink, recursive remove, rename, hard link and symlink.
1374 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1376 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1378 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1380 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1382 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1384 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1386 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1388 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1389 error "(14) Fail to softlink!"
# The scan must still be in phase 1 after those operations.
1391 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1392 [ "$STATUS" == "scanning-phase1" ] ||
1393 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
# Set lfsck_speed_limit to 0 everywhere (presumably "no limit" -- TODO
# confirm) and wait for the scan to report 'completed'.
1395 do_nodes $(comma_list $(mdts_nodes)) \
1396 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1397 do_nodes $(comma_list $(osts_nodes)) \
1398 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1399 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1400 mdd.${MDT_DEV}.lfsck_namespace |
1401 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1403 error "(16) unexpected status"
# Register the function above as test 10.
1406 run_test 10 "System is available during LFSCK scanning"
# ost_remove_lastid: remove the LAST_ID file (and the d0/0 object) under
# O/<idx>/ from the locally mounted backing filesystem of ost<ost>.
# Reads ${ost} and ${idx}; their assignments are not visible in this listing
# (presumably $1 and $2 -- the visible caller passes "1 0"; TODO confirm).
# Returns 1 if the local mount fails and 2 if the unmount fails.
1409 ost_remove_lastid() {
# Commands are run on the OST's node through do_facet.
1412 local rcmd="do_facet ost${ost}"
1414 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1416 # step 1: local mount
1417 mount_fstype ost${ost} || return 1
1418 # step 2: remove the specified LAST_ID
# The brace expansion names both O/<idx>/LAST_ID and O/<idx>/d0/0; -f keeps
# rm quiet about paths that do not exist.
1419 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
1421 unmount_fstype ost${ost} || return 2
# test_11a (body excerpt): delete LAST_ID on ost1 and check that a layout
# LFSCK started on the OST rebuilds it.
# NOTE(review): the embedded line numbers have gaps; the function header and
# the statements between file creation and LAST_ID removal are not visible.
#
# Requires an MDS newer than 2.5.55.
1425 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1426 skip "MDS older than 2.5.55, LU-1267"
# Create 64 single-stripe files using "-i 0".
1428 check_mount_and_prep
1429 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1430 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
# Remove LAST_ID for ost1 / idx 0, then start ost1 with the noscrub options.
1435 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
1437 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1438 error "(2) Fail to start ost1"
# fail_loc 0x160e with fail_val=3 is set while the 'crashed_lastid' flag is
# being waited for.
1440 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1441 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1443 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1444 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
1446 wait_update_facet ost1 "$LCTL get_param -n \
1447 obdfilter.${OST_DEV}.lfsck_layout |
1448 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1450 error "(5) unexpected status"
# Clear the injected failure and wait for the scan to report 'completed'.
1453 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1455 wait_update_facet ost1 "$LCTL get_param -n \
1456 obdfilter.${OST_DEV}.lfsck_layout |
1457 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1459 error "(6) unexpected status"
# After completion the flags line must be empty.
1462 echo "the LAST_ID(s) should have been rebuilt"
1463 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1464 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
# Register the function above as test 11a.
1466 run_test 11a "LFSCK can rebuild lost last_id"
# test_11b (body excerpt): make the on-disk LAST_ID of ost1 fall behind the
# ids used by the MDS, then check that a layout LFSCK on the OST brings it
# back to at least the used id.
# NOTE(review): the embedded line numbers have gaps; the function header and
# loop terminators are not visible in this listing.
#
# Requires an MDS newer than 2.5.55.
1469 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1470 skip "MDS older than 2.5.55, LU-1267"
1472 check_mount_and_prep
1473 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1475 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1476 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1477 do_facet ost1 $LCTL set_param fail_loc=0x160d
# Create 32 files more than precreated_ost_obj_count reports.
1479 local count=$(precreated_ost_obj_count 0 0)
1481 createmany -o $DIR/$tdir/f $((count + 32))
# Record the sequence and last id the MDS has for this OST (osp.*.prealloc_*).
1483 local proc_path="${FSNAME}-OST0000-osc-MDT0000"
1484 local seq=$(do_facet mds1 $LCTL get_param -n \
1485 osp.${proc_path}.prealloc_last_seq)
1486 local id_used=$(do_facet mds1 $LCTL get_param -n \
1487 osp.${proc_path}.prealloc_last_id)
# Restart ost1 with fail_loc 0x215 set.
1489 umount_client $MOUNT
1490 stop ost1 || error "(1) Fail to stop ost1"
1492 #define OBD_FAIL_OST_ENOSPC 0x215
1493 do_facet ost1 $LCTL set_param fail_loc=0x215
1495 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1496 error "(2) Fail to start ost1"
# Poll up to 60 times until ost1 reports a last_id entry matching $seq.
1498 for ((i = 0; i < 60; i++)); do
1499 id_ost1=$(do_facet ost1 \
1500 "$LCTL get_param -n obdfilter.$ost1_svc.last_id" |
1501 awk -F: "/$seq/ { print \$2 }")
1502 [ -n "$id_ost1" ] && break
1506 echo "the on-disk LAST_ID should be smaller than the expected one"
1507 [ $id_used -gt $id_ost1 ] ||
1508 error "(4) expect id_used '$id_used' > id_ost1 '$id_ost1'"
1510 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1511 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1513 wait_update_facet ost1 \
1514 "$LCTL get_param -n obdfilter.$ost1_svc.lfsck_layout |
1515 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1517 error "(6) unexpected status"
# Restart ost1 again before reading last_id back.
1520 stop ost1 || error "(7) Fail to stop ost1"
1522 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1523 error "(8) Fail to start ost1"
1525 echo "the on-disk LAST_ID should have been rebuilt"
1526 # last_id may be larger than $id_used if objects were created/skipped
1527 wait_update_facet_cond ost1 \
1528 "$LCTL get_param -n obdfilter.$ost1_svc.last_id |
1529 awk -F: '/$seq/ { print \\\$2 }'" "-ge" "$id_used" 60 || {
1530 do_facet ost1 $LCTL get_param obdfilter.$ost1_svc.last_id
1531 error "(9) expect last_id >= id_used $seq:$id_used"
# Clear the injected failure and stop all services.
1534 do_facet ost1 $LCTL set_param fail_loc=0
1535 stopall || error "(10) Fail to stopall"
# Register the function above as test 11b.
1537 run_test 11b "LFSCK can rebuild crashed last_id"
# test_12a (body excerpt): start and stop namespace and layout LFSCK on all
# targets with a single lctl command (-A) issued on mds1.
# NOTE(review): the embedded line numbers have gaps; the function header and
# loop terminators are not visible in this listing.
#
# Requires at least two MDTs and an MDS newer than 2.5.55.
1540 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
1541 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1542 skip "MDS older than 2.5.55, LU-3950"
# One directory per MDT (index k-1), each holding 100 files.
1544 check_mount_and_prep
1545 for k in $(seq $MDSCOUNT); do
1546 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1547 createmany -o $DIR/$tdir/${k}/f 100 ||
1548 error "(0) Fail to create 100 files."
# Namespace LFSCK: start with "-s 1", expect phase 1 everywhere, stop, then
# restart with "-s 0" and expect completion.
1551 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1552 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1553 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1555 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1556 wait_all_targets namespace scanning-phase1 3
1558 echo "Stop namespace LFSCK on all targets by single lctl command."
1559 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1560 error "(4) Fail to stop LFSCK on all devices!"
1562 echo "All the LFSCK targets should be in 'stopped' status."
1563 wait_all_targets_blocked namespace stopped 5
1565 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1566 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1567 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1569 echo "All the LFSCK targets should be in 'completed' status."
1570 wait_all_targets_blocked namespace completed 7
# The layout half of the test runs between start/stop_full_debug_logging.
1572 start_full_debug_logging
# Layout LFSCK: same start / stop / restart sequence as above.
1574 echo "Start layout LFSCK on all targets by single command (-s 1)."
1575 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1576 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1578 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1579 wait_all_targets layout scanning-phase1 9
1581 echo "Stop layout LFSCK on all targets by single lctl command."
1582 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1583 error "(10) Fail to stop LFSCK on all devices!"
1585 echo "All the LFSCK targets should be in 'stopped' status."
1586 wait_all_targets_blocked layout stopped 11
# Additionally check each OST's own lfsck_layout status.
1588 for k in $(seq $OSTCOUNT); do
1589 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1590 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1591 awk '/^status/ { print $2 }')
1592 [ "$STATUS" == "stopped" ] ||
1593 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1596 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1597 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1598 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1600 echo "All the LFSCK targets should be in 'completed' status."
1601 wait_all_targets_blocked layout completed 14
1603 stop_full_debug_logging
# Register the function above as test 12a.
1605 run_test 12a "single command to trigger LFSCK on all devices"
# test_12b (body excerpt): check that lfsck_start works without '-M', and that
# it fails on a node with several MDT devices when neither '-M' nor '-A' is
# given.
# NOTE(review): the embedded line numbers have gaps; the function header and
# the closing 'fi' are not visible in this listing.
#
# Requires an MDS newer than 2.5.55.
1608 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1609 skip "MDS older than 2.5.55, LU-3950"
1611 check_mount_and_prep
1613 echo "Start LFSCK without '-M' specified."
1614 do_facet mds1 $LCTL lfsck_start -A -r ||
1615 error "(0) Fail to start LFSCK without '-M'"
# Both LFSCK types are expected to reach 'completed'.
1617 wait_all_targets_blocked namespace completed 1
1618 wait_all_targets_blocked layout completed 2
# Count the devices in "lctl dl" on mds1 whose third column contains "mdt".
1620 local count=$(do_facet mds1 $LCTL dl |
1621 awk '{ print $3 }' | grep mdt | wc -l)
1622 if [ $count -gt 1 ]; then
1624 echo "Start layout LFSCK on the node with multipe targets,"
1625 echo "but not specify '-M'/'-A' option. Should get failure."
# Success of lfsck_start is the error case here; "|| true" turns the expected
# failure into a zero exit status.
1627 do_facet mds1 $LCTL lfsck_start -t layout -r &&
1628 error "(3) Start layout LFSCK should fail" || true
# Register the function above as test 12b.
1631 run_test 12b "auto detect Lustre device"
# test_13 (body excerpt): create files while a "bad lmm_oi" failure is
# injected on the MDS, then check that the layout LFSCK repairs them.
# NOTE(review): the embedded line numbers have gaps; the function header and
# closing braces are not visible in this listing.
#
# Requires an MDS newer than 2.5.55.
1634 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1635 skip "MDS older than 2.5.55, LU-3593"
1638 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1639 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1640 echo "MDT-object FID."
1643 check_mount_and_prep
# Two files are created with fail_loc 0x160f set: one via createmany and one
# composite (-E ... -E -1) file f1.
1645 echo "Inject failure stub to simulate bad lmm_oi"
1646 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1647 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
1648 createmany -o $DIR/$tdir/f 1
1649 $LFS setstripe -E 1M -S 1M -E -1 $DIR/$tdir/f1 ||
1650 error "(0) Fail to create PFL $DIR/$tdir/f1"
1651 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1653 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1654 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1656 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1657 mdd.${MDT_DEV}.lfsck_layout |
1658 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1660 error "(2) unexpected status"
# Exactly two repairs must be counted under repaired_others.
1663 local repaired=$($SHOW_LAYOUT |
1664 awk '/^repaired_others/ { print $2 }')
1665 [ $repaired -eq 2 ] ||
1666 error "(3) Fail to repair crashed lmm_oi: $repaired"
# Register the function above as test 13.
1668 run_test 13 "LFSCK can repair crashed lmm_oi"
# test_14a (body excerpt): create files whose OST-objects are dangling, check
# that a plain layout LFSCK only counts them, and that a run with '-c' makes
# the guard files accessible again.
# NOTE(review): the embedded line numbers have gaps; the function header, loop
# terminators and some short statements are not visible in this listing.
#
# Requires an MDS newer than 2.5.55.
1671 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1672 skip "MDS older than 2.5.55, LU-3590"
1675 echo "The OST-object referenced by the MDT-object should be there;"
1676 echo "otherwise, the LFSCK should re-create the missing OST-object."
1677 echo "without '--delay-create-ostobj' option."
1680 check_mount_and_prep
1681 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# With fail_loc 0x1610 set on ost1, create count+16 plain files, guard0,
# 16 composite files and guard1.
1683 echo "Inject failure stub to simulate dangling referenced MDT-object"
1684 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1685 do_facet ost1 $LCTL set_param fail_loc=0x1610
1686 local count=$(precreated_ost_obj_count 0 0)
1688 createmany -o $DIR/$tdir/f $((count + 16)) ||
1689 error "(0.1) Fail to create $DIR/$tdir/fx"
1690 touch $DIR/$tdir/guard0
1692 for ((i = 0; i < 16; i++)); do
1693 $LFS setstripe -E 512K -S 256K -o 0 -E 2M \
1694 $DIR/$tdir/f_comp${i} ||
1695 error "(0.2) Fail to create $DIR/$tdir/f_comp${i}"
1697 touch $DIR/$tdir/guard1
1699 do_facet ost1 $LCTL set_param fail_loc=0
1701 start_full_debug_logging
1703 # exhaust other pre-created dangling cases
1704 count=$(precreated_ost_obj_count 0 0)
1705 createmany -o $DIR/$tdir/a $count ||
1706 error "(0.5) Fail to create $count files."
# A successful 'ls' is the error case here.
1708 echo "'ls' should fail because of dangling referenced MDT-object"
1709 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
# First scan: without '-c'.
1711 echo "Trigger layout LFSCK to find out dangling reference"
1712 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1714 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1715 mdd.${MDT_DEV}.lfsck_layout |
1716 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1718 error "(3) unexpected status"
# At least 32 dangling references must be counted.
1721 local repaired=$($SHOW_LAYOUT |
1722 awk '/^repaired_dangling/ { print $2 }')
1723 [ $repaired -ge 32 ] ||
1724 error "(4) Fail to repair dangling reference: $repaired"
# The guard files must still be inaccessible after the first scan.
1726 echo "'stat' should fail because of not repair dangling by default"
1727 stat $DIR/$tdir/guard0 > /dev/null 2>&1 &&
1728 error "(5.1) stat should fail"
1729 stat $DIR/$tdir/guard1 > /dev/null 2>&1 &&
1730 error "(5.2) stat should fail"
# Second scan: with '-c'.
1732 echo "Trigger layout LFSCK to repair dangling reference"
1733 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1735 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1736 mdd.${MDT_DEV}.lfsck_layout |
1737 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1739 error "(7) unexpected status"
1742 # There may be some async LFSCK updates in processing, wait for
1743 # a while until the target reparation has been done. LU-4970.
# Poll until stat reports size 0 for each guard file.
1745 echo "'stat' should success after layout LFSCK repairing"
1746 wait_update_facet client "stat $DIR/$tdir/guard0 |
1747 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1748 stat $DIR/$tdir/guard0
1750 error "(8.1) unexpected size"
1753 wait_update_facet client "stat $DIR/$tdir/guard1 |
1754 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1755 stat $DIR/$tdir/guard1
1757 error "(8.2) unexpected size"
1760 repaired=$($SHOW_LAYOUT |
1761 awk '/^repaired_dangling/ { print $2 }')
1762 [ $repaired -ge 32 ] ||
1763 error "(9) Fail to repair dangling reference: $repaired"
1765 stop_full_debug_logging
# The stop step announced by this echo is not visible in this listing; only
# the following setupall is.
1767 echo "stopall to cleanup object cache"
1770 setupall > /dev/null
# Register the function above as test 14a.
1772 run_test 14a "LFSCK can repair MDT-object with dangling LOV EA reference (1)"
# test_14b (body excerpt): same dangling-reference scenario as test 14a, but
# the layout LFSCK is started with the additional '-o -d' options.
# NOTE(review): the embedded line numbers have gaps; the function header,
# closing braces and some short statements are not visible in this listing.
#
# Requires an MDS newer than 2.5.55.
1775 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1776 skip "MDS older than 2.5.55, LU-3590"
1779 echo "The OST-object referenced by the MDT-object should be there;"
1780 echo "otherwise, the LFSCK should re-create the missing OST-object."
1781 echo "with '--delay-create-ostobj' option."
1784 check_mount_and_prep
1785 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# With fail_loc 0x1610 set on ost1, create count+31 files plus the guard file.
1787 echo "Inject failure stub to simulate dangling referenced MDT-object"
1788 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1789 do_facet ost1 $LCTL set_param fail_loc=0x1610
1790 local count=$(precreated_ost_obj_count 0 0)
1792 createmany -o $DIR/$tdir/f $((count + 31))
1793 touch $DIR/$tdir/guard
1794 do_facet ost1 $LCTL set_param fail_loc=0
1796 start_full_debug_logging
1798 # exhaust other pre-created dangling cases
1799 count=$(precreated_ost_obj_count 0 0)
1800 createmany -o $DIR/$tdir/a $count ||
1801 error "(0) Fail to create $count files."
# A successful 'ls' is the error case here.
1803 echo "'ls' should fail because of dangling referenced MDT-object"
1804 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
# First scan: without '-c'.
1806 echo "Trigger layout LFSCK to find out dangling reference"
1807 $START_LAYOUT -r -o -d || error "(2) Fail to start LFSCK for layout!"
1809 wait_all_targets_blocked layout completed 3
# At least 32 dangling references must be counted.
1811 local repaired=$($SHOW_LAYOUT |
1812 awk '/^repaired_dangling/ { print $2 }')
1813 [ $repaired -ge 32 ] ||
1814 error "(4) Fail to repair dangling reference: $repaired"
# The guard file must still be inaccessible after the first scan.
1816 echo "'stat' should fail because of not repair dangling by default"
1817 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
# Second scan: with '-c' added.
1819 echo "Trigger layout LFSCK to repair dangling reference"
1820 $START_LAYOUT -r -o -c -d || error "(6) Fail to start LFSCK for layout!"
1822 wait_all_targets_blocked layout completed 7
1824 # There may be some async LFSCK updates in processing, wait for
1825 # a while until the target reparation has been done. LU-4970.
# Poll until stat reports size 0 for the guard file.
1827 echo "'stat' should success after layout LFSCK repairing"
1828 wait_update_facet client "stat $DIR/$tdir/guard |
1829 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1830 stat $DIR/$tdir/guard
1832 error "(8) unexpected size"
1835 repaired=$($SHOW_LAYOUT |
1836 awk '/^repaired_dangling/ { print $2 }')
1837 [ $repaired -ge 32 ] ||
1838 error "(9) Fail to repair dangling reference: $repaired"
1840 stop_full_debug_logging
# The stop step announced by this echo is not visible in this listing; only
# the following setupall is.
1842 echo "stopall to cleanup object cache"
1845 setupall > /dev/null
# Register the function above as test 14b.
1847 run_test 14b "LFSCK can repair MDT-object with dangling LOV EA reference (2)"
# test_15a (body excerpt): write files while fail_loc 0x1611 is set on all OST
# nodes, then check that the layout LFSCK repairs at least three unmatched
# MDT-object/OST-object pairs.
# NOTE(review): the embedded line numbers have gaps; the function header,
# closing braces and the target path of the setstripe command are not visible.
#
# Requires an MDS newer than 2.5.55.
1850 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1851 skip "MDS older than 2.5.55, LU-3591"
1854 echo "If the OST-object referenced by the MDT-object back points"
1855 echo "to some non-exist MDT-object, then the LFSCK should repair"
1856 echo "the OST-object to back point to the right MDT-object."
1859 check_mount_and_prep
1860 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1862 echo "Inject failure stub to make the OST-object to back point to"
1863 echo "non-exist MDT-object."
1864 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
# f0 is a plain 1 MiB file; per message (0) the composite layout is for f1.
1866 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
1867 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1868 $LFS setstripe -E 1M -S 256K -c 1 -E -1 -S 512K -c $OSTCOUNT \
1870 error "(0) Fail to create PFL $DIR/$tdir/f1"
1871 # 'dd' will trigger punch RPC firstly on every OST-objects.
1872 # So even though some OST-object will not be write by 'dd',
1873 # as long as it is allocated (may be NOT allocated in pfl_3b)
1874 # its layout information will be set also.
# 257 blocks of 4K is 4 KiB more than the 1M boundary of the first component.
1875 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4K count=257
1876 cancel_lru_locks osc
1877 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1879 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1880 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1882 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1883 mdd.${MDT_DEV}.lfsck_layout |
1884 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1886 error "(2) unexpected status"
# At least three repairs must be counted under repaired_unmatched_pair.
1889 local repaired=$($SHOW_LAYOUT |
1890 awk '/^repaired_unmatched_pair/ { print $2 }')
1891 [ $repaired -ge 3 ] ||
1892 error "(3) Fail to repair unmatched pair: $repaired"
# Register the function above as test 15a.
1894 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# test_15b (body excerpt): write files while fail_loc 0x1612 is set on all OST
# nodes, then check that the layout LFSCK repairs exactly four unmatched
# MDT-object/OST-object pairs.
# NOTE(review): the embedded line numbers have gaps; the function header, the
# initial assignment of 'stripes' and the setstripe target path are not visible.
#
# Requires an MDS newer than 2.5.55.
1897 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1898 skip "MDS older than 2.5.55, LU-3591"
1901 echo "If the OST-object referenced by the MDT-object back points"
1902 echo "to other MDT-object that doesn't recognize the OST-object,"
1903 echo "then the LFSCK should repair it to back point to the right"
1904 echo "MDT-object (the first one)."
# The guard file is written before any failure is injected.
1907 check_mount_and_prep
1908 mkdir -p $DIR/$tdir/0
1909 $LFS setstripe -c 1 -i 0 $DIR/$tdir/0
1910 dd if=/dev/zero of=$DIR/$tdir/0/guard bs=1M count=1
1911 cancel_lru_locks osc
1913 echo "Inject failure stub to make the OST-object to back point to"
1914 echo "other MDT-object"
# Use two stripes for the first component when at least two OSTs exist.
1917 [ $OSTCOUNT -ge 2 ] && stripes=2
1919 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1920 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1612
1921 dd if=/dev/zero of=$DIR/$tdir/0/f0 bs=1M count=1
1922 $LFS setstripe -E 1M -S 256K -c $stripes -E 2M -S 512K -c 1 \
1924 error "(0) Fail to create PFL $DIR/$tdir/f1"
# 2 MiB of data reaches the end of the second (-E 2M) component.
1925 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2
1926 cancel_lru_locks osc
1927 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1929 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1930 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1932 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1933 mdd.${MDT_DEV}.lfsck_layout |
1934 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1936 error "(2) unexpected status"
# Exactly four repairs must be counted under repaired_unmatched_pair.
1939 local repaired=$($SHOW_LAYOUT |
1940 awk '/^repaired_unmatched_pair/ { print $2 }')
1941 [ $repaired -eq 4 ] ||
1942 error "(3) Fail to repair unmatched pair: $repaired"
# Register the function above as test 15b.
1944 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# test_15c (body excerpt): race a layout LFSCK against a delayed directory
# migration and check that it reports one unmatched pair and no
# multiple-referenced repair.
# NOTE(review): the embedded line numbers have gaps; the function header and
# the statements right after the background migrate are not visible.
#
# Requires at least two MDTs and an MDS version strictly between 2.5.55 and
# 2.7.55.
1947 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
1948 (( $MDS1_VERSION < $(version_code 2.7.55) )) ||
1949 skip "MDS newer than 2.7.55, LU-6475"
1950 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1951 skip "MDS older than 2.5.55, LU-3591"
1954 echo "According to current metadata migration implementation,"
1955 echo "before the old MDT-object is removed, both the new MDT-object"
1956 echo "and old MDT-object will reference the same LOV layout. Then if"
1957 echo "the layout LFSCK finds the new MDT-object by race, it will"
1958 echo "regard related OST-object(s) as multiple referenced case, and"
1959 echo "will try to create new OST-object(s) for the new MDT-object."
1960 echo "To avoid such trouble, the layout LFSCK needs to lock the old"
1961 echo "MDT-object before confirm the multiple referenced case."
# Directory a1 is created with MDT index 1 and holds one 1 MiB file.
1964 check_mount_and_prep
1965 $LFS mkdir -i 1 $DIR/$tdir/a1
1966 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1967 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=1
1968 cancel_lru_locks osc
1970 echo "Inject failure stub on MDT1 to delay the migration"
# The facet for MDT1 is mds2 (facet names are 1-based).
1972 #define OBD_FAIL_MIGRATE_DELAY 0x1803
1973 do_facet mds2 $LCTL set_param fail_val=5 fail_loc=0x1803
1974 echo "Migrate $DIR/$tdir/a1 from MDT1 to MDT0 with delay"
# The migration runs in the background so the LFSCK below can overlap with it.
1975 $LFS migrate -m 0 $DIR/$tdir/a1 &
1978 echo "Trigger layout LFSCK to race with the migration"
1979 $START_LAYOUT -A -r || error "(1) Fail to start layout LFSCK!"
1981 wait_all_targets_blocked layout completed 2
1983 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
# Expect exactly one unmatched-pair repair ...
1984 local repaired=$($SHOW_LAYOUT |
1985 awk '/^repaired_unmatched_pair/ { print $2 }')
1986 [ $repaired -eq 1 ] ||
1987 error "(3) Fail to repair unmatched pair: $repaired"
# ... and no multiple-referenced repair.
1989 repaired=$($SHOW_LAYOUT |
1990 awk '/^repaired_multiple_referenced/ { print $2 }')
1991 [ $repaired -eq 0 ] ||
1992 error "(4) Unexpectedly repaird multiple references: $repaired"
# Register the function above as test 15c.
1994 run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)"
# test_15d (body excerpt): disturb a running directory migration with injected
# failures, then check that a namespace LFSCK on all targets still completes.
# NOTE(review): the embedded line numbers have gaps; the function header, the
# loop body's remaining statements and its terminator are not visible.
#
# Requires at least two MDTs.
1997 (( $MDSCOUNT > 1 )) || skip "needs >= 2 MDTs"
1999 check_mount_and_prep
# A directory created with "-c -1" whose default layout for new
# sub-directories is "-i -1 -c 1", filled with 100 files and 100 directories.
2001 $LFS mkdir -c -1 $DIR/$tdir || error "create $tdir failed"
2002 $LFS setdirstripe -D -i -1 -c 1 $DIR/$tdir ||
2003 error "setdirstripe failed"
2005 createmany -o $DIR/$tdir/f 100 || error "create sub files failed"
2006 createmany -d $DIR/$tdir/s 100 || error "create sub dirs failed"
# The migration runs in the background while failures are injected below.
2008 echo "Migrate $DIR/$tdir to MDT1"
2009 $LFS migrate -m 1 $DIR/$tdir &
2013 # fail sub transactions on random MDTs, which may cause some file
2015 #define OBD_FAIL_OUT_EIO 0x1709
# NOTE(review): facet names used elsewhere in this file are 1-based (mds1,
# mds2, ost1); with i starting at 0 this addresses "mds0" and never reaches
# mds$MDSCOUNT -- looks like an off-by-one, confirm against the full script.
2016 for ((i = 0; i < $MDSCOUNT; i++)); do
2017 do_facet mds$i $LCTL set_param fail_loc=0x1709
2019 do_facet mds$i $LCTL set_param fail_loc=0
2024 # LFSCK can't fully fix migrating directories, and may leave some
2025 # files inaccessible, but it shouldn't cause crash
2026 $START_NAMESPACE -A -r ||
2027 error "Fail to start LFSCK for namespace"
2029 wait_all_targets_blocked namespace completed 1
2031 # resume migration may fail because some file may be inaccessible, but
2032 # it shouldn't cause crash
# The exit status of this second migrate is deliberately not checked.
2033 $LFS migrate -m 1 $DIR/$tdir
2035 # rm $tdir to avoid cleanup failure in the end
2037 $LFS rm_entry $DIR/$tdir/*
# Reformat and set the filesystem up again afterwards.
2039 REFORMAT="yes" cleanup_and_setup_lustre
# Register the function above as test 15d.
2041 run_test 15d "LFSCK don't crash upon dir migration failure"
# test_16 (body excerpt): change a file's owner while fail_loc 0x1613 is set
# on the MDS, then check that the layout LFSCK reports exactly one repaired
# inconsistent owner.
# NOTE(review): the embedded line numbers have gaps; the function header, the
# creation of d1 and closing braces are not visible in this listing.
#
# Requires an MDS newer than 2.5.55.
2044 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2045 skip "MDS older than 2.5.55, LU-3594"
2048 echo "If the OST-object's owner information does not match the owner"
2049 echo "information stored in the MDT-object, then the LFSCK trust the"
2050 echo "MDT-object and update the OST-object's owner information."
# f0 gets 1 MiB of data; its owner is changed further below.
2053 check_mount_and_prep
2054 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2055 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
2056 cancel_lru_locks osc
2058 # created but no setattr or write to the file.
# 100 files are created under d1 as the RUNAS user and left untouched.
2060 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/d1
2061 $RUNAS createmany -o $DIR/$tdir/d1/o 100 || error "create failed"
2063 echo "Inject failure stub to skip OST-object owner changing"
2064 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
# The chown to 1.1 happens only while the failure is injected.
2065 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
2066 chown 1.1 $DIR/$tdir/f0
2067 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2069 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
2072 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
2074 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2075 mdd.${MDT_DEV}.lfsck_layout |
2076 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2078 error "(2) unexpected status"
# Exactly one repair is expected, so the untouched files under d1 must not be
# counted.
2081 local repaired=$($SHOW_LAYOUT |
2082 awk '/^repaired_inconsistent_owner/ { print $2 }')
2083 [ $repaired -eq 1 ] ||
2084 error "(3) Fail to repair inconsistent owner: $repaired"
# Register the function above as test 16.
2086 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# Test 17 body: the layout LFSCK repairs MDT-objects that reference the same
# OST-object.
# NOTE(review): the embedded line numbers skip values, so some original lines
# (function header, block terminators, short statements) are not shown here;
# these comments describe only the visible statements.
# Skip when the MDS version is not newer than 2.5.55.
2089 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2090 skip "MDS older than 2.5.55, LU-3594"
2093 echo "If more than one MDT-objects reference the same OST-object,"
2094 echo "and the OST-object only recognizes one MDT-object, then the"
2095 echo "LFSCK should create new OST-objects for such non-recognized"
2099 check_mount_and_prep
2100 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2102 echo "Inject failure stub to make two MDT-objects to refernce"
2103 echo "the OST-object"
2105 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
# fail_loc 0x1614 (with fail_val=0) stays set on the MDS while "guard" is
# written (1MB), one file is created via createmany (presumably f0 - it is
# the name checked below), and a PFL file is created; mdc/osc locks are
# cancelled after each step.
2106 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
2107 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
2108 cancel_lru_locks mdc
2109 cancel_lru_locks osc
2111 createmany -o $DIR/$tdir/f 1
2112 cancel_lru_locks mdc
2113 cancel_lru_locks osc
2115 $LFS setstripe -E 2M -S 256K -o 0 -E 4M -S 512K -o 0 \
2117 error "(0) Fail to create PFL $DIR/$tdir/f1"
2118 cancel_lru_locks mdc
2119 cancel_lru_locks osc
# Clear the injected failure.
2120 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
2122 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
2123 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard use the same OST-objects"
# Before repair, f0 and f1 are expected to report 1048576 bytes, i.e. the
# amount written to guard (field 5 of "ls -l" is the size).
2124 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
2125 [ $size -eq 1048576 ] ||
2126 error "(1.1) f0 (wrong) size should be 1048576, but got $size"
2128 size=$(ls -l $DIR/$tdir/f1 | awk '{ print $5 }')
2129 [ $size -eq 1048576 ] ||
2130 error "(1.2) f1 (wrong) size should be 1048576, but got $size"
2132 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
# Start the layout LFSCK with -r and wait for status "completed".
2135 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
2137 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2138 mdd.${MDT_DEV}.lfsck_layout |
2139 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2141 error "(3) unexpected status"
# Two repaired_multiple_referenced entries are expected.
2144 local repaired=$($SHOW_LAYOUT |
2145 awk '/^repaired_multiple_referenced/ { print $2 }')
2146 [ $repaired -eq 2 ] ||
2147 error "(4) Fail to repair multiple references: $repaired"
# After repair, writing 2MB to f0 (and then f1) must leave guard's size at
# 1048576.
2149 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
2150 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
2151 error "(5) Fail to write f0."
2152 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
2153 [ $size -eq 1048576 ] ||
2154 error "(6) guard size should be 1048576, but got $size"
2156 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard should use diff OST-objects"
2157 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2 ||
2158 error "(7) Fail to write f1."
2159 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
2160 [ $size -eq 1048576 ] ||
2161 error "(8) guard size should be 1048576, but got $size"
# Register the test under number 17.
2163 run_test 17 "LFSCK can repair multiple references"
# Add "cache" to the local debug setting via lctl; stdout is discarded.
2165 $LCTL set_param debug=+cache > /dev/null
# Test 18a body: files lose their layout EA through an injected failure and
# the layout LFSCK (with -o) is expected to repair the orphan OST-objects.
# NOTE(review): the embedded line numbers skip values, so some original lines
# (function header, block terminators, short statements) are not shown here;
# these comments describe only the visible statements.
# Skip when the MDS version is not newer than 2.5.55.
2168 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2169 skip "MDS older than 2.5.55, LU-3336"
2172 echo "The target MDT-object is there, but related stripe information"
2173 echo "is lost or partly lost. The LFSCK should regenerate the missing"
2174 echo "layout EA entries."
# a1 lives on MDT index 0; f1 gets a 1-stripe layout and 2MB of data.
2177 check_mount_and_prep
2178 $LFS mkdir -i 0 $DIR/$tdir/a1
2179 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2180 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# "ls -il" prints the inode number first, so the size is field 6.
2182 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2184 $LFS path2fid $DIR/$tdir/a1/f1
2185 $LFS getstripe $DIR/$tdir/a1/f1
# With two or more MDTs, a2 is created on MDT index 1 with a 2-stripe layout
# and f2 also gets 2MB of data.
2187 if [ $MDSCOUNT -ge 2 ]; then
2188 $LFS mkdir -i 1 $DIR/$tdir/a2
2189 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2190 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2191 $LFS path2fid $DIR/$tdir/a2/f2
2192 $LFS getstripe $DIR/$tdir/a2/f2
# f3 is a PFL (composite layout) file, also filled with 2MB.
2195 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2196 error "(0) Fail to create PFL $DIR/$tdir/f3"
2198 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2200 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2202 $LFS path2fid $DIR/$tdir/f3
2203 $LFS getstripe $DIR/$tdir/f3
2205 cancel_lru_locks osc
2207 echo "Inject failure, to make the MDT-object lost its layout EA"
2208 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
# fail_loc 0x1615 is set on mds1 (and mds2 when present) while the files are
# chown'ed, then cleared on the same facets.
2209 do_facet mds1 $LCTL set_param fail_loc=0x1615
2210 chown 1.1 $DIR/$tdir/a1/f1
2212 if [ $MDSCOUNT -ge 2 ]; then
2213 do_facet mds2 $LCTL set_param fail_loc=0x1615
2214 chown 1.1 $DIR/$tdir/a2/f2
2217 chown 1.1 $DIR/$tdir/f3
2222 do_facet mds1 $LCTL set_param fail_loc=0
2223 if [ $MDSCOUNT -ge 2 ]; then
2224 do_facet mds2 $LCTL set_param fail_loc=0
2227 cancel_lru_locks mdc
2228 cancel_lru_locks osc
# Before repair every file must report a size different from the saved one.
# f2 is compared with f1's saved size; both were written with the same dd
# parameters.
2230 echo "The file size should be incorrect since layout EA is lost"
2231 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2232 [ "$cur_size" != "$saved_size1" ] ||
2233 error "(1) Expect incorrect file1 size"
2235 if [ $MDSCOUNT -ge 2 ]; then
2236 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2237 [ "$cur_size" != "$saved_size1" ] ||
2238 error "(2) Expect incorrect file2 size"
2241 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2242 [ "$cur_size" != "$saved_size2" ] ||
2243 error "(1.2) Expect incorrect file3 size"
2245 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2246 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
# Every MDS must reach status "completed" within $LTIME.
2248 for k in $(seq $MDSCOUNT); do
2249 # The LFSCK status query interval is 30 seconds. For the case
2250 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2251 # time to guarantee the status sync up.
2252 wait_update_facet mds${k} "$LCTL get_param -n \
2253 mdd.$(facet_svc mds${k}).lfsck_layout |
2254 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2255 error "(4) MDS${k} is not the expected 'completed'"
# Every OST is checked once (no waiting) for status "completed".
2258 for k in $(seq $OSTCOUNT); do
2259 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2260 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2261 awk '/^status/ { print $2 }')
2262 [ "$cur_status" == "completed" ] ||
2263 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan counts: 3 on mds1 and, when present, 2 on mds2.
2266 local repaired=$(do_facet mds1 $LCTL get_param -n \
2267 mdd.$(facet_svc mds1).lfsck_layout |
2268 awk '/^repaired_orphan/ { print $2 }')
2269 [ $repaired -eq 3 ] ||
2270 error "(6.1) Expect 3 fixed on mds1, but got: $repaired"
2272 if [ $MDSCOUNT -ge 2 ]; then
2273 repaired=$(do_facet mds2 $LCTL get_param -n \
2274 mdd.$(facet_svc mds2).lfsck_layout |
2275 awk '/^repaired_orphan/ { print $2 }')
2276 [ $repaired -eq 2 ] ||
2277 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
# Print FID and layout of each file again after the repair.
2280 $LFS path2fid $DIR/$tdir/a1/f1
2281 $LFS getstripe $DIR/$tdir/a1/f1
2283 if [ $MDSCOUNT -ge 2 ]; then
2284 $LFS path2fid $DIR/$tdir/a2/f2
2285 $LFS getstripe $DIR/$tdir/a2/f2
2288 $LFS path2fid $DIR/$tdir/f3
2289 $LFS getstripe $DIR/$tdir/f3
# After repair the sizes must match the saved values again.
2291 echo "The file size should be correct after layout LFSCK scanning"
2292 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2293 [ "$cur_size" == "$saved_size1" ] ||
2294 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2296 if [ $MDSCOUNT -ge 2 ]; then
2297 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2298 [ "$cur_size" == "$saved_size1" ] ||
2299 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# This check is on f3, so the message names file3 (it used to say file1).
2302 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2303 [ "$cur_size" == "$saved_size2" ] ||
2304 error "(9) Expect file3 size $saved_size2, but got $cur_size"
# Register the test under number 18a.
2306 run_test 18a "Find out orphan OST-object and repair it (1)"
# Test 18b body: MDT-objects are lost through an injected failure; a dryrun
# layout LFSCK must only report the orphans, and a real run must re-create
# the files under .lustre/lost+found/MDTxxxx so they can be moved back.
# NOTE(review): the embedded line numbers skip values, so some original lines
# (function header, block terminators, short statements) are not shown here;
# these comments describe only the visible statements.
# Skip when FILESET is set or the MDS version is not newer than 2.5.55.
2309 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2310 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2311 skip "MDS older than 2.5.55, LU-3336"
2314 echo "The target MDT-object is lost. The LFSCK should re-create the"
2315 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
2316 echo "can move it back to normal namespace manually."
# f1: 1-stripe file under a1 (MDT index 0), 2MB of data; its size and FID
# are saved for the later checks.
2319 check_mount_and_prep
2320 $LFS mkdir -i 0 $DIR/$tdir/a1
2321 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2322 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# "ls -il" prints the inode number first, so the size is field 6.
2323 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2324 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2326 $LFS getstripe $DIR/$tdir/a1/f1
# With two or more MDTs, f2 is a 2-stripe file under a2 (MDT index 1).
2328 if [ $MDSCOUNT -ge 2 ]; then
2329 $LFS mkdir -i 1 $DIR/$tdir/a2
2330 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2331 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2332 fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
2334 $LFS getstripe $DIR/$tdir/a2/f2
# f3 is a PFL (composite layout) file, also filled with 2MB.
2337 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2338 error "(0) Fail to create PFL $DIR/$tdir/f3"
2340 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2342 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2343 local fid3=$($LFS path2fid $DIR/$tdir/f3)
2345 $LFS getstripe $DIR/$tdir/f3
2347 cancel_lru_locks osc
2349 echo "Inject failure, to simulate the case of missing the MDT-object"
2350 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
# fail_loc 0x1616 is set on mds1 (and mds2 when present) while the files are
# removed, then cleared on the same facets.
2351 do_facet mds1 $LCTL set_param fail_loc=0x1616
2352 rm -f $DIR/$tdir/a1/f1
2354 if [ $MDSCOUNT -ge 2 ]; then
2355 do_facet mds2 $LCTL set_param fail_loc=0x1616
2356 rm -f $DIR/$tdir/a2/f2
2364 do_facet mds1 $LCTL set_param fail_loc=0
2365 if [ $MDSCOUNT -ge 2 ]; then
2366 do_facet mds2 $LCTL set_param fail_loc=0
2369 cancel_lru_locks mdc
2370 cancel_lru_locks osc
2372 # dryrun mode only checks orphans, it does not repair
2373 echo "Trigger layout LFSCK --dryrun to find out orphan OST-object"
2374 $START_LAYOUT --dryrun -o -r ||
2375 error "Fail to start layout LFSCK in dryrun mode"
2376 wait_all_targets_blocked layout completed 2
# The "param" line of the report must list exactly these three flags.
2378 local PARAMS=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
2379 [ "$PARAMS" == "dryrun,all_targets,orphan" ] ||
2380 error "Expect 'dryrun,all_targets,orphan', got '$PARAMS'"
# The dryrun must count 3 inconsistent_orphan entries on mds1.
2382 local orphans=$(do_facet mds1 $LCTL get_param -n \
2383 mdd.$(facet_svc mds1).lfsck_layout |
2384 awk '/^inconsistent_orphan/ { print $2 }')
2385 [ $orphans -eq 3 ] ||
2386 error "Expect 3 found on mds1, but got: $orphans"
2388 # orphan parents should not be created
# Every entry directly under lost+found must be empty after the dryrun.
2390 for subdir in $MOUNT/.lustre/lost+found/*; do
2391 [ ! "$(ls -A $subdir)" ] || error "$subdir not empty"
2394 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2395 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# Every MDS must reach status "completed" within $LTIME.
2397 for k in $(seq $MDSCOUNT); do
2398 # The LFSCK status query interval is 30 seconds. For the case
2399 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2400 # time to guarantee the status sync up.
2401 wait_update_facet mds${k} "$LCTL get_param -n \
2402 mdd.$(facet_svc mds${k}).lfsck_layout |
2403 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2404 error "(2) MDS${k} is not the expected 'completed'"
# Every OST is checked once (no waiting) for status "completed".
2407 for k in $(seq $OSTCOUNT); do
2408 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2409 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2410 awk '/^status/ { print $2 }')
2411 [ "$cur_status" == "completed" ] ||
2412 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan counts: 3 on mds1 and, when present, 2 on mds2.
2415 local repaired=$(do_facet mds1 $LCTL get_param -n \
2416 mdd.$(facet_svc mds1).lfsck_layout |
2417 awk '/^repaired_orphan/ { print $2 }')
2418 [ $repaired -eq 3 ] ||
2419 error "(4.1) Expect 3 fixed on mds1, but got: $repaired"
2421 if [ $MDSCOUNT -ge 2 ]; then
2422 repaired=$(do_facet mds2 $LCTL get_param -n \
2423 mdd.$(facet_svc mds2).lfsck_layout |
2424 awk '/^repaired_orphan/ { print $2 }')
2425 [ $repaired -eq 2 ] ||
2426 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
# The re-created files are looked up as <fid>-R-0 under lost+found and moved
# back to their original paths.
2429 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2430 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2431 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2433 if [ $MDSCOUNT -ge 2 ]; then
2434 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
2435 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
# Labelled (6.1): "(5)" is already used for the f1 move above, so a shared
# label would not tell which move failed.
2438 mv $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0 $DIR/$tdir/f3 ||
2439 error "(6.1) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
# Print FID and layout of each restored file.
2441 $LFS path2fid $DIR/$tdir/a1/f1
2442 $LFS getstripe $DIR/$tdir/a1/f1
2444 if [ $MDSCOUNT -ge 2 ]; then
2445 $LFS path2fid $DIR/$tdir/a2/f2
2446 $LFS getstripe $DIR/$tdir/a2/f2
2449 $LFS path2fid $DIR/$tdir/f3
2450 $LFS getstripe $DIR/$tdir/f3
# The restored files must report the sizes saved before the failure; f2 is
# compared with f1's saved size (both were written with the same dd
# parameters).
2452 echo "The file size should be correct after layout LFSCK scanning"
2453 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2454 [ "$cur_size" == "$saved_size1" ] ||
2455 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2457 if [ $MDSCOUNT -ge 2 ]; then
2458 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2459 [ "$cur_size" == "$saved_size1" ] ||
2460 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# This check is on f3, so the message names file3 (it used to say file1).
2463 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2464 [ "$cur_size" == "$saved_size2" ] ||
2465 error "(9) Expect file3 size $saved_size2, but got $cur_size"
# Register the test under number 18b.
2467 run_test 18b "Find out orphan OST-object and repair it (2)"
# Test 18c body: OST-objects are written while fail_loc 0x1617 is set on the
# OSS nodes, then the MDT-objects are lost; the layout LFSCK is expected to
# leave "-N-" stubs under .lustre/lost+found/MDT0000 only.
# NOTE(review): the embedded line numbers skip values, so some original lines
# (function header, block terminators, short statements such as the
# assignment of "expected") are not shown here; these comments describe only
# the visible statements.
# Skip when FILESET is set or the MDS version is not newer than 2.5.55.
2470 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2471 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2472 skip "MDS older than 2.5.55, LU-3336"
2475 echo "The target MDT-object is lost, and the OST-object FID is missing."
2476 echo "The LFSCK should re-create the MDT-object with new FID under the "
2477 echo "directory .lustre/lost+found/MDTxxxx."
2480 check_mount_and_prep
2481 $LFS mkdir -i 0 $DIR/$tdir/a1
2482 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2484 echo "Inject failure, to simulate the case of missing parent FID"
2485 #define OBD_FAIL_LFSCK_NOPFID 0x1617
# fail_loc 0x1617 is set on all OSS nodes while f1, f2 (only with two or
# more MDTs) and the PFL file f3 are written, and is cleared afterwards.
2486 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1617
2488 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2489 $LFS getstripe $DIR/$tdir/a1/f1
2491 if [ $MDSCOUNT -ge 2 ]; then
2492 $LFS mkdir -i 1 $DIR/$tdir/a2
2493 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a2
2494 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2495 $LFS getstripe $DIR/$tdir/a2/f2
2498 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2499 error "(0) Fail to create PFL $DIR/$tdir/f3"
2501 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2502 $LFS getstripe $DIR/$tdir/f3
2504 cancel_lru_locks osc
2505 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
2507 echo "Inject failure, to simulate the case of missing the MDT-object"
2508 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
# fail_loc 0x1616 is set on mds1 (and mds2 when present) while the files are
# removed, then cleared on the same facets.
2509 do_facet mds1 $LCTL set_param fail_loc=0x1616
2510 rm -f $DIR/$tdir/a1/f1
2512 if [ $MDSCOUNT -ge 2 ]; then
2513 do_facet mds2 $LCTL set_param fail_loc=0x1616
2514 rm -f $DIR/$tdir/a2/f2
2522 do_facet mds1 $LCTL set_param fail_loc=0
2523 if [ $MDSCOUNT -ge 2 ]; then
2524 do_facet mds2 $LCTL set_param fail_loc=0
2527 cancel_lru_locks mdc
2528 cancel_lru_locks osc
2530 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2531 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# Every MDS must reach status "completed" within $LTIME.
2533 for k in $(seq $MDSCOUNT); do
2534 # The LFSCK status query interval is 30 seconds. For the case
2535 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2536 # time to guarantee the status sync up.
2537 wait_update_facet mds${k} "$LCTL get_param -n \
2538 mdd.$(facet_svc mds${k}).lfsck_layout |
2539 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2540 error "(2) MDS${k} is not the expected 'completed'"
# Every OST is checked once (no waiting) for status "completed".
2543 for k in $(seq $OSTCOUNT); do
2544 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2545 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2546 awk '/^status/ { print $2 }')
2547 [ "$cur_status" == "completed" ] ||
2548 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
2551 if [ $MDSCOUNT -ge 2 ]; then
# mds1's repaired_orphan count must equal $expected; mds2 (when present)
# must report 0.
2557 local repaired=$(do_facet mds1 $LCTL get_param -n \
2558 mdd.$(facet_svc mds1).lfsck_layout |
2559 awk '/^repaired_orphan/ { print $2 }')
2560 [ $repaired -eq $expected ] ||
2561 error "(4) Expect $expected fixed on mds1, but got: $repaired"
2563 if [ $MDSCOUNT -ge 2 ]; then
2564 repaired=$(do_facet mds2 $LCTL get_param -n \
2565 mdd.$(facet_svc mds2).lfsck_layout |
2566 awk '/^repaired_orphan/ { print $2 }')
2567 [ $repaired -eq 0 ] ||
2568 error "(5) Expect 0 fixed on mds2, but got: $repaired"
2571 ls -ail $MOUNT/.lustre/lost+found/
# The -name pattern is quoted so that find receives it literally; unquoted,
# the shell could expand it against files in the current directory.
2573 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
2574 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
2575 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-N-*')
2577 error "(6) .lustre/lost+found/MDT0001/ should be empty"
2580 echo "There should be some stub under .lustre/lost+found/MDT0000/"
2581 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2582 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
2584 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-N-*')
2585 [ ! -z "$cname" ] ||
2586 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
# Register the test under number 18c.
2588 run_test 18c "Find out orphan OST-object and repair it (3)"
# Test 18d body: f2/f4 are made to reference another file's OST-object, that
# file is removed, and the layout LFSCK is expected to re-attach the original
# orphan OST-objects (2 repaired_orphan, 0 repaired_dangling).
# NOTE(review): the embedded line numbers skip values, so some original lines
# (function header, block terminators, short statements) are not shown here;
# these comments describe only the visible statements.
# Skip when the MDS version is not newer than 2.5.55.
2591 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2592 skip "MDS older than 2.5.55, LU-3336"
2595 echo "The target MDT-object layout EA is corrupted, but the right"
2596 echo "OST-object is still alive as orphan. The layout LFSCK will"
2597 echo "not create new OST-object to occupy such slot."
2600 check_mount_and_prep
# f1/f3 hold "guard", f2/f4 hold "foo"; f4 is created as a PFL file.
2602 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2603 echo "guard" > $DIR/$tdir/a1/f1
2604 echo "foo" > $DIR/$tdir/a1/f2
2606 echo "guard" > $DIR/$tdir/a1/f3
2607 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2608 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2609 echo "foo" > $DIR/$tdir/a1/f4
# "ls -il" prints the inode number first, so the size is field 6.
2611 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2612 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2613 $LFS path2fid $DIR/$tdir/a1/f1
2614 $LFS getstripe $DIR/$tdir/a1/f1
2615 $LFS path2fid $DIR/$tdir/a1/f2
2616 $LFS getstripe $DIR/$tdir/a1/f2
2617 $LFS path2fid $DIR/$tdir/a1/f3
2618 $LFS getstripe $DIR/$tdir/a1/f3
2619 $LFS path2fid $DIR/$tdir/a1/f4
2620 $LFS getstripe $DIR/$tdir/a1/f4
2621 cancel_lru_locks osc
2623 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2624 echo "to reference the same OST-object (which is f1's OST-obejct)."
2625 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2626 echo "dangling reference case, but f2's old OST-object is there."
2628 echo "The failure also makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2629 echo "to reference the same OST-object (which is f3's OST-obejct)."
2630 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2631 echo "dangling reference case, but f4's old OST-object is there."
2634 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
# fail_loc 0x1618 is set on the MDS while f2/f4 are chown'ed and f1/f3 are
# removed, then cleared.
2635 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2636 chown 1.1 $DIR/$tdir/a1/f2
2637 chown 1.1 $DIR/$tdir/a1/f4
2638 rm -f $DIR/$tdir/a1/f1
2639 rm -f $DIR/$tdir/a1/f3
2642 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2644 echo "stopall to cleanup object cache"
2647 setupall > /dev/null
2649 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2650 $START_LAYOUT -r -o -c -d || error "(2) Fail to start LFSCK for layout!"
# Every MDS must reach status "completed" within $LTIME.
2652 for k in $(seq $MDSCOUNT); do
2653 # The LFSCK status query interval is 30 seconds. For the case
2654 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2655 # time to guarantee the status sync up.
2656 wait_update_facet mds${k} "$LCTL get_param -n \
2657 mdd.$(facet_svc mds${k}).lfsck_layout |
2658 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2659 error "(3) MDS${k} is not the expected 'completed'"
# Every OST is checked once (no waiting) for status "completed".
2662 for k in $(seq $OSTCOUNT); do
2663 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2664 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2665 awk '/^status/ { print $2 }')
2666 [ "$cur_status" == "completed" ] ||
2667 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
# Expected counters on the MDS: repaired_orphan 2, repaired_dangling 0.
2670 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2671 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2672 awk '/^repaired_orphan/ { print $2 }')
2673 [ $repaired -eq 2 ] ||
2674 error "(5) Expect 2 orphans have been fixed, but got: $repaired"
2676 repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2677 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2678 awk '/^repaired_dangling/ { print $2 }')
2679 [ $repaired -eq 0 ] ||
2680 error "(6) Expect 0 dangling has been fixed, but got: $repaired"
# f2 and f4 must report the sizes saved before the failure injection.
2682 echo "The file size should be correct after layout LFSCK scanning"
2683 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2684 [ "$cur_size" == "$saved_size1" ] ||
2685 error "(7) Expect file2 size $saved_size1, but got $cur_size"
2687 cur_size=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2688 [ "$cur_size" == "$saved_size2" ] ||
2689 error "(8) Expect file4 size $saved_size2, but got $cur_size"
# Dump content, FID and layout of f2/f4 for the log; nothing is asserted here.
2691 echo "The LFSCK should find back the original data."
2692 cat $DIR/$tdir/a1/f2
2693 $LFS path2fid $DIR/$tdir/a1/f2
2694 $LFS getstripe $DIR/$tdir/a1/f2
2695 cat $DIR/$tdir/a1/f4
2696 $LFS path2fid $DIR/$tdir/a1/f4
2697 $LFS getstripe $DIR/$tdir/a1/f4
# Register the test under number 18d.
2699 run_test 18d "Find out orphan OST-object and repair it (4)"
# Test 18e body: same corruption as the "(4)" variant, but new data is
# appended to f2/f4 while the LFSCK is in scanning-phase2, so the old orphan
# OST-objects are expected to show up as "-C-" stub files under
# .lustre/lost+found/MDT0000.
# NOTE(review): the embedded line numbers skip values, so some original lines
# (function header, block terminators, short statements) are not shown here;
# these comments describe only the visible statements.
# Skip when FILESET is set or the MDS version is not newer than 2.5.55.
2702 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2703 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2704 skip "MDS older than 2.5.55, LU-3336"
2707 echo "The target MDT-object layout EA slot is occpuied by some new"
2708 echo "created OST-object when repair dangling reference case. Such"
2709 echo "conflict OST-object has been modified by others. To keep the"
2710 echo "new data, the LFSCK will create a new file to refernece this"
2711 echo "old orphan OST-object."
2714 check_mount_and_prep
# f1/f3 hold "guard", f2/f4 hold "foo"; f4 is created as a PFL file.
2716 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2717 echo "guard" > $DIR/$tdir/a1/f1
2718 echo "foo" > $DIR/$tdir/a1/f2
2720 echo "guard" > $DIR/$tdir/a1/f3
2721 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2722 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2723 echo "foo" > $DIR/$tdir/a1/f4
# "ls -il" prints the inode number first, so the size is field 6.
2725 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2726 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2728 $LFS path2fid $DIR/$tdir/a1/f1
2729 $LFS getstripe $DIR/$tdir/a1/f1
2730 $LFS path2fid $DIR/$tdir/a1/f2
2731 $LFS getstripe $DIR/$tdir/a1/f2
2732 $LFS path2fid $DIR/$tdir/a1/f3
2733 $LFS getstripe $DIR/$tdir/a1/f3
2734 $LFS path2fid $DIR/$tdir/a1/f4
2735 $LFS getstripe $DIR/$tdir/a1/f4
2736 cancel_lru_locks osc
2738 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2739 echo "to reference the same OST-object (which is f1's OST-obejct)."
2740 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2741 echo "dangling reference case, but f2's old OST-object is there."
2743 echo "Also the failure makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2744 echo "to reference the same OST-object (which is f3's OST-obejct)."
2745 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2746 echo "dangling reference case, but f4's old OST-object is there."
2749 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
# fail_loc 0x1618 is set on the MDS while f2/f4 are chown'ed and f1/f3 are
# removed, then cleared.
2750 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2751 chown 1.1 $DIR/$tdir/a1/f2
2752 chown 1.1 $DIR/$tdir/a1/f4
2753 rm -f $DIR/$tdir/a1/f1
2754 rm -f $DIR/$tdir/a1/f3
2757 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2759 echo "stopall to cleanup object cache"
2762 setupall > /dev/null
2764 #define OBD_FAIL_LFSCK_DELAY3 0x1602
# fail_loc 0x1602 with fail_val=10 stays set until f2/f4 have been appended
# to below.
2765 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
2767 start_full_debug_logging
2769 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2770 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
# Wait until mds1 reports "scanning-phase2" before modifying the files.
2772 wait_update_facet mds1 "$LCTL get_param -n \
2773 mdd.$(facet_svc mds1).lfsck_layout |
2774 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2775 error "(3) MDS1 is not the expected 'scanning-phase2'"
2777 # to guarantee all updates are synced.
2781 echo "Write new data to f2/f4 to modify the new created OST-object."
2782 echo "dummy" >> $DIR/$tdir/a1/f2 || error "write a1/f2 failed"
2783 echo "dummy" >> $DIR/$tdir/a1/f4 || error "write a1/f4 failed"
2785 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
# Every MDS must reach status "completed" within $LTIME.
2787 for k in $(seq $MDSCOUNT); do
2788 # The LFSCK status query interval is 30 seconds. For the case
2789 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2790 # time to guarantee the status sync up.
2791 wait_update_facet mds${k} "$LCTL get_param -n \
2792 mdd.$(facet_svc mds${k}).lfsck_layout |
2793 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2794 error "(4) MDS${k} is not the expected 'completed'"
# Every OST is checked once (no waiting) for status "completed".
2797 for k in $(seq $OSTCOUNT); do
2798 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2799 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2800 awk '/^status/ { print $2 }')
2801 [ "$cur_status" == "completed" ] ||
2802 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2805 stop_full_debug_logging
# Two repaired_orphan entries are expected on the MDS.
2807 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2808 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2809 awk '/^repaired_orphan/ { print $2 }')
2810 [ $repaired -eq 2 ] ||
2811 error "(6) Expect 2 orphans have been fixed, but got: $repaired"
2813 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2814 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2815 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# Exactly two "-C-" stub entries are expected; they are listed on mismatch.
2817 local count=$(ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-* | wc -l)
2818 if [ $count -ne 2 ]; then
2819 ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-*
2820 error "(8) Expect 2 stubs under lost+found, but got $count"
# The -name pattern is quoted so that find receives it literally; unquoted,
# the shell could expand it against files in the current directory.
# Each stub (first and last find result) must have one of the saved sizes.
2823 echo "The stub file should keep the original f2 or f4 data"
2824 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-C-*' | head -n 1)
2825 local cur_size=$(ls -il $cname | awk '{ print $6 }')
2826 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2827 error "(9) Got unexpected $cur_size"
2830 $LFS path2fid $cname
2831 $LFS getstripe $cname
2833 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-C-*' | tail -n 1)
2834 cur_size=$(ls -il $cname | awk '{ print $6 }')
2835 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2836 error "(10) Got unexpected $cur_size"
2839 $LFS path2fid $cname
2840 $LFS getstripe $cname
# Dump content, FID and layout of f2/f4 for the log; nothing is asserted here.
2842 echo "The f2/f4 should contains new data."
2843 cat $DIR/$tdir/a1/f2
2844 $LFS path2fid $DIR/$tdir/a1/f2
2845 $LFS getstripe $DIR/$tdir/a1/f2
2846 cat $DIR/$tdir/a1/f4
2847 $LFS path2fid $DIR/$tdir/a1/f4
2848 $LFS getstripe $DIR/$tdir/a1/f4
# Register the test under number 18e.
2850 run_test 18e "Find out orphan OST-object and repair it (5)"
# Test 18f body: MDT-objects are lost, then a first layout LFSCK runs with
# fail_loc 0x161c set on mds1 and is expected to end "partial"; a second run
# without the failure is expected to complete and repair the rest.
# NOTE(review): the embedded line numbers skip values, so some original lines
# (function header, block terminators, short statements) are not shown here;
# these comments describe only the visible statements.
# Needs at least two OSTs.
2853 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
2856 echo "The target MDT-object is lost. The LFSCK should re-create the"
2857 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2858 echo "to verify some OST-object(s) during the first stage-scanning,"
2859 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2860 echo "should not be affected."
# On MDT index 0: a1 (1 stripe) holds guard and f1, a2 (2 stripes) holds f2;
# each file gets 2MB.
2863 check_mount_and_prep
2864 $LFS mkdir -i 0 $DIR/$tdir/a1
2865 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2866 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2867 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2868 $LFS mkdir -i 0 $DIR/$tdir/a2
2869 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
2870 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2871 $LFS getstripe $DIR/$tdir/a1/f1
2872 $LFS getstripe $DIR/$tdir/a2/f2
# With two or more MDTs the same setup is repeated on MDT index 1 (a3/a4).
2874 if [ $MDSCOUNT -ge 2 ]; then
2875 $LFS mkdir -i 1 $DIR/$tdir/a3
2876 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a3
2877 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2878 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2879 $LFS mkdir -i 1 $DIR/$tdir/a4
2880 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
2881 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2882 $LFS getstripe $DIR/$tdir/a3/f3
2883 $LFS getstripe $DIR/$tdir/a4/f4
2886 cancel_lru_locks osc
2888 echo "Inject failure, to simulate the case of missing the MDT-object"
2889 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
# fail_loc 0x1616 is set on mds1 (and mds2 when present) while the files are
# removed, then cleared on the same facets.
2890 do_facet mds1 $LCTL set_param fail_loc=0x1616
2891 rm -f $DIR/$tdir/a1/f1
2892 rm -f $DIR/$tdir/a2/f2
2894 if [ $MDSCOUNT -ge 2 ]; then
2895 do_facet mds2 $LCTL set_param fail_loc=0x1616
2896 rm -f $DIR/$tdir/a3/f3
2897 rm -f $DIR/$tdir/a4/f4
2903 do_facet mds1 $LCTL set_param fail_loc=0
2904 if [ $MDSCOUNT -ge 2 ]; then
2905 do_facet mds2 $LCTL set_param fail_loc=0
2908 cancel_lru_locks mdc
2909 cancel_lru_locks osc
2911 echo "Inject failure, to simulate the OST0 fail to handle"
2912 echo "MDT0 LFSCK request during the first-stage scanning."
2913 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
# fail_loc 0x161c (fail_val=0) stays set on mds1 for the whole first run.
2914 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
2916 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2917 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# First run: every MDS and ost1 must end "partial", ost2 "completed".
2919 for k in $(seq $MDSCOUNT); do
2920 # The LFSCK status query interval is 30 seconds. For the case
2921 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2922 # time to guarantee the status sync up.
2923 wait_update_facet mds${k} "$LCTL get_param -n \
2924 mdd.$(facet_svc mds${k}).lfsck_layout |
2925 awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
2926 error "(2) MDS${k} is not the expected 'partial'"
2929 wait_update_facet ost1 "$LCTL get_param -n \
2930 obdfilter.$(facet_svc ost1).lfsck_layout |
2931 awk '/^status/ { print \\\$2 }'" "partial" $LTIME || {
2932 error "(3) OST1 is not the expected 'partial'"
2935 wait_update_facet ost2 "$LCTL get_param -n \
2936 obdfilter.$(facet_svc ost2).lfsck_layout |
2937 awk '/^status/ { print \\\$2 }'" "completed" $LTIME || {
2938 error "(4) OST2 is not the expected 'completed'"
# Clear the injected failure before checking the counters.
2941 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
# After the first run, repaired_orphan must be 1 on mds1 and (when present)
# 1 on mds2.
2943 local repaired=$(do_facet mds1 $LCTL get_param -n \
2944 mdd.$(facet_svc mds1).lfsck_layout |
2945 awk '/^repaired_orphan/ { print $2 }')
2946 [ $repaired -eq 1 ] ||
2947 error "(5) Expect 1 fixed on mds{1}, but got: $repaired"
2949 if [ $MDSCOUNT -ge 2 ]; then
2950 repaired=$(do_facet mds2 $LCTL get_param -n \
2951 mdd.$(facet_svc mds2).lfsck_layout |
2952 awk '/^repaired_orphan/ { print $2 }')
2953 [ $repaired -eq 1 ] ||
2954 error "(6) Expect 1 fixed on mds{2}, but got: $repaired"
2957 echo "Trigger layout LFSCK on all devices again to cleanup"
2958 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
# Second run: every MDS must reach "completed" within $LTIME and every OST
# is checked once for "completed".
2960 for k in $(seq $MDSCOUNT); do
2961 # The LFSCK status query interval is 30 seconds. For the case
2962 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2963 # time to guarantee the status sync up.
2964 wait_update_facet mds${k} "$LCTL get_param -n \
2965 mdd.$(facet_svc mds${k}).lfsck_layout |
2966 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2967 error "(8) MDS${k} is not the expected 'completed'"
2970 for k in $(seq $OSTCOUNT); do
2971 cur_status=$(do_facet ost${k} $LCTL get_param -n \
2972 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2973 awk '/^status/ { print $2 }')
2974 [ "$cur_status" == "completed" ] ||
2975 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
# After the second run, repaired_orphan must be 2 on mds1 and (when present)
# 2 on mds2.
2979 local repaired=$(do_facet mds1 $LCTL get_param -n \
2980 mdd.$(facet_svc mds1).lfsck_layout |
2981 awk '/^repaired_orphan/ { print $2 }')
2982 [ $repaired -eq 2 ] ||
2983 error "(10) Expect 2 fixed on mds{1}, but got: $repaired"
2985 if [ $MDSCOUNT -ge 2 ]; then
2986 repaired=$(do_facet mds2 $LCTL get_param -n \
2987 mdd.$(facet_svc mds2).lfsck_layout |
2988 awk '/^repaired_orphan/ { print $2 }')
2989 [ $repaired -eq 2 ] ||
2990 error "(11) Expect 2 fixed on mds{2}, but got: $repaired"
# Register the test under number 18f.
2993 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# Body of test 18g: a file created under a "-c -1" striped directory is
# unlinked while fail_loc 0x162e is set on mds1, which (per the messages
# below) loses the MDT-object but keeps its OI mapping. Layout LFSCK must
# then report $OSTCOUNT repaired orphans, and the re-created file under
# .lustre/lost+found/MDT0000 is moved back to its original path.
2996 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2999 echo "The target MDT-object is lost, but related OI mapping is there"
3000 echo "The LFSCK should recreate the lost MDT-object without affected"
3001 echo "by the stale OI mapping."
3004 check_mount_and_prep
3005 $LFS mkdir -i 0 $DIR/$tdir/a1
3006 $LFS setstripe -c -1 -i 0 -S 1M $DIR/$tdir/a1
# Write one 1M block per OST, and remember the FID: the lost+found entry
# checked at the end is named after it.
3007 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=$OSTCOUNT
3008 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
3010 $LFS getstripe $DIR/$tdir/a1/f1
3011 cancel_lru_locks osc
3013 echo "Inject failure to simulate lost MDT-object but keep OI mapping"
3014 #define OBD_FAIL_LFSCK_LOST_MDTOBJ2 0x162e
# The unlink is the operation performed while the fail_loc is armed; the
# fail_loc is cleared again right after it.
3015 do_facet mds1 $LCTL set_param fail_loc=0x162e
3016 rm -f $DIR/$tdir/a1/f1
3018 do_facet mds1 $LCTL set_param fail_loc=0
3019 cancel_lru_locks mdc
3020 cancel_lru_locks osc
3022 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3023 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
3025 for k in $(seq $MDSCOUNT); do
3026 # The LFSCK status query interval is 30 seconds. In case some
3027 # LFSCK_NOTIFY RPCs fail or get lost, wait long enough to
3028 # guarantee that the status has been synced up.
3029 wait_update_facet mds${k} "$LCTL get_param -n \
3030 mdd.$(facet_svc mds${k}).lfsck_layout |
3031 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
3032 error "(2) MDS${k} is not the expected 'completed'"
# The OST status is read once (no wait_update_facet) and must already be
# 'completed' on every OST.
3035 for k in $(seq $OSTCOUNT); do
3036 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3037 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3038 awk '/^status/ { print $2 }')
3039 [ "$cur_status" == "completed" ] ||
3040 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# repaired_orphan on mds1 must equal the number of OSTs.
3043 local repaired=$(do_facet mds1 $LCTL get_param -n \
3044 mdd.$(facet_svc mds1).lfsck_layout |
3045 awk '/^repaired_orphan/ { print $2 }')
3046 [ $repaired -eq $OSTCOUNT ] ||
3047 error "(4) Expect $OSTCOUNT fixed, but got: $repaired"
3049 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
3050 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
3051 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
# Informational only: the exit status of these two commands is not checked.
3053 $LFS path2fid $DIR/$tdir/a1/f1
3054 $LFS getstripe $DIR/$tdir/a1/f1
3056 run_test 18g "Find out orphan OST-object and repair it (7)"
# Body of test 18h: a two-component PFL file is damaged by running chown
# while fail_loc 0x162f is set on $SINGLEMDS (simulating a bad PFL extent
# range, per the message below). Writes must fail before the repair; after
# layout LFSCK the data must match the guard copy and writes must work again.
3060 echo "The PFL extent crashed. During the first cycle LFSCK scanning,"
3061 echo "the layout LFSCK will keep the bad PFL file(s) there without"
3062 echo "scanning its OST-object(s). Then in the second stage scanning,"
3063 echo "the OST will return related OST-object(s) to the MDT as orphan."
3064 echo "And then the LFSCK on the MDT can rebuild the PFL extent with"
3065 echo "the 'orphan(s)' stripe information."
3068 check_mount_and_prep
3070 $LFS setstripe -E 2M -S 1M -c 1 -E -1 $DIR/$tdir/f0 ||
3071 error "(0) Fail to create PFL $DIR/$tdir/f0"
# Put data at offset 0 and again at offset 2M (bs=1M seek=2), i.e. on both
# sides of the 2M component boundary configured above.
3073 cat $LUSTRE/tests/test-framework.sh > $DIR/$tdir/f0 ||
3074 error "(1.1) Fail to write $DIR/$tdir/f0"
3076 dd if=$LUSTRE/tests/test-framework.sh of=$DIR/$tdir/f0 bs=1M seek=2 ||
3077 error "(1.2) Fail to write $DIR/$tdir/f0"
# Reference copy used by the diff in step (6).
3079 cp $DIR/$tdir/f0 $DIR/$tdir/guard
3081 echo "Inject failure stub to simulate bad PFL extent range"
3082 #define OBD_FAIL_LFSCK_BAD_PFL_RANGE 0x162f
# chown is the operation performed while the fail_loc is armed.
3083 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162f
3085 chown 1.1 $DIR/$tdir/f0
3087 cancel_lru_locks mdc
3088 cancel_lru_locks osc
3089 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Negative check: a successful write here is the failure case.
3091 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 &&
3092 error "(2) Write to bad PFL file should fail"
3094 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
3095 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
3097 for k in $(seq $MDSCOUNT); do
3098 # The LFSCK status query interval is 30 seconds. In case some
3099 # LFSCK_NOTIFY RPCs fail or get lost, wait long enough to
3100 # guarantee that the status has been synced up.
3101 wait_update_facet mds${k} "$LCTL get_param -n \
3102 mdd.$(facet_svc mds${k}).lfsck_layout |
3103 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
3104 error "(4.1) MDS${k} is not the expected 'completed'"
# The OST status is read once and must already be 'completed'.
3107 for k in $(seq $OSTCOUNT); do
3108 cur_status=$(do_facet ost${k} $LCTL get_param -n \
3109 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3110 awk '/^status/ { print $2 }')
3111 [ "$cur_status" == "completed" ] ||
3112 error "(4.2) OST${k} Expect 'completed', but got '$cur_status'"
# Exactly two repaired orphans are expected on $SINGLEMDS
# (presumably one per PFL component -- TODO confirm).
3116 local repaired=$($SHOW_LAYOUT |
3117 awk '/^repaired_orphan/ { print $2 }')
3118 [ $repaired -eq 2 ] ||
3119 error "(5) Fail to repair crashed PFL range: $repaired"
3121 echo "Data in $DIR/$tdir/f0 should not be broken"
3122 diff $DIR/$tdir/f0 $DIR/$tdir/guard ||
3123 error "(6) Data in $DIR/$tdir/f0 is broken"
3125 echo "Write should succeed after LFSCK repairing the bad PFL range"
3126 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 ||
3127 error "(7) Write should succeed after LFSCK"
3129 run_test 18h "LFSCK can repair crashed PFL extent range"
3131 $LCTL set_param debug=-cache > /dev/null
# Body of test 19a: files are created with lfsck_verify_pfid set to 0 on
# OST0000; the parameter is then set to 1 and fail_loc 0x1619 is set on the
# client so that (per the message below) reads carry a wrong parent FID.
# Both reads must be rejected.
3134 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
3135 skip "MDS older than 2.5.55, LU-3951"
3137 check_mount_and_prep
3138 $LFS setstripe -c 1 -i 0 $DIR/$tdir
3140 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
3141 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
# a0: plain file, a1: two-component PFL file, a2: written but not read back
# in the visible part of this test.
3143 echo "foo1" > $DIR/$tdir/a0
3144 $LFS setstripe -E 512K -S 512K -o 0 -E -1 -S 1M $DIR/$tdir/a1 ||
3145 error "(0) Fail to create PFL $DIR/$tdir/a1"
3146 echo "foo2" > $DIR/$tdir/a1
3147 echo "guard" > $DIR/$tdir/a2
3148 cancel_lru_locks osc
3150 echo "Inject failure, then client will offer wrong parent FID when read"
3151 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
3152 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
3154 #define OBD_FAIL_LFSCK_INVALID_PFID 0x1619
# Note: this fail_loc is set with a local $LCTL call, not via do_facet.
3155 $LCTL set_param fail_loc=0x1619
3157 echo "Read RPC with wrong parent FID should be denied"
# Negative checks: a successful read is the failure case.
3158 cat $DIR/$tdir/a0 && error "(3.1) Read a0 should be denied!"
3159 cat $DIR/$tdir/a1 && error "(3.2) Read a1 should be denied!"
3160 $LCTL set_param fail_loc=0
3162 run_test 19a "OST-object inconsistency self detect"
# Body of test 19b: f0 and f1 are created while fail_loc 0x1611 is set on the
# OST nodes, which (per the messages below) makes their OST-objects point
# back to a non-existent MDT-object. With lfsck_verify_pfid set to 1, reading
# both files must succeed and the "repaired" counter reported by
# lfsck_verify_pfid on OST0000 must reach 2.
3165 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
3166 skip "MDS older than 2.5.55, LU-3951"
3168 check_mount_and_prep
3169 $LFS setstripe -c 1 -i 0 $DIR/$tdir
3171 echo "Inject failure stub to make the OST-object to back point to"
3172 echo "non-exist MDT-object"
3174 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
3175 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
3177 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
3178 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
# f0: plain file, f1: two-component PFL file; both are written while the
# fail_loc is armed.
3179 echo "foo1" > $DIR/$tdir/f0
3180 $LFS setstripe -E 1M -S 1M -o 0 -E 4M -S 256K $DIR/$tdir/f1 ||
3181 error "(0) Fail to create PFL $DIR/$tdir/f1"
3182 echo "foo2" > $DIR/$tdir/f1
3183 cancel_lru_locks osc
3184 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
# NOTE(review): writing 0 here presumably also resets the counters read
# below -- confirm against the lfsck_verify_pfid parameter documentation.
3186 do_facet ost1 $LCTL set_param -n \
3187 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
3188 echo "Nothing should be fixed since self detect and repair is disabled"
3189 local repaired=$(do_facet ost1 $LCTL get_param -n \
3190 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
3191 awk '/^repaired/ { print $2 }')
3192 [ $repaired -eq 0 ] ||
3193 error "(1) Expected 0 repaired, but got $repaired"
3195 echo "Read RPC with right parent FID should be accepted,"
3196 echo "and cause parent FID on OST to be fixed"
3198 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
3199 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
3201 cat $DIR/$tdir/f0 || error "(2.1) Read f0 should not be denied!"
3202 cat $DIR/$tdir/f1 || error "(2.2) Read f1 should not be denied!"
# Two files were read, so two repairs are asserted; the failure message
# names the same count as the check.
3204 repaired=$(do_facet ost1 $LCTL get_param -n \
3205 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
3206 awk '/^repaired/ { print $2 }')
3207 [ $repaired -eq 2 ] ||
3208 error "(3) Expected 2 repaired, but got $repaired"
3210 run_test 19b "OST-object inconsistency self repair"
3212 PATTERN_WITH_HOLE="40000001"
3213 PATTERN_WITHOUT_HOLE="raid0"
3216 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
3217 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
3218 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
3219 skip "MDS older than 2.5.55, LU-4887"
3222 echo "The target MDT-object and some of its OST-object are lost."
3223 echo "The LFSCK should find out the left OST-objects and re-create"
3224 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
3225 echo "with the partial OST-objects (LOV EA hole)."
3227 echo "New client can access the file with LOV EA hole via normal"
3228 echo "system tools or commands without crash the system."
3230 echo "For old client, even though it cannot access the file with"
3231 echo "LOV EA hole, it should not cause the system crash."
3234 check_mount_and_prep
3235 $LFS mkdir -i 0 $DIR/$tdir/a1
3236 if [ $OSTCOUNT -gt 2 ]; then
3237 $LFS setstripe -c 3 -i 0 -S 1M $DIR/$tdir/a1
3240 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a1
3244 # 256 blocks on the stripe0.
3245 # 1 block on the stripe1 for 2 OSTs case.
3246 # 256 blocks on the stripe1 for other cases.
3247 # 1 block on the stripe2 if OSTs > 2
3248 dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
3249 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
3250 dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount
3252 local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
3253 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
3254 local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)
3257 $LFS getstripe $DIR/$tdir/a1/f0
3259 $LFS getstripe $DIR/$tdir/a1/f1
3261 $LFS getstripe $DIR/$tdir/a1/f2
3263 if [ $OSTCOUNT -gt 2 ]; then
3264 dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
3265 fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
3267 $LFS getstripe $DIR/$tdir/a1/f3
3270 cancel_lru_locks osc
3272 echo "Inject failure..."
3273 echo "To simulate f0 lost MDT-object"
3274 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
3275 do_facet mds1 $LCTL set_param fail_loc=0x1616
3276 rm -f $DIR/$tdir/a1/f0
3278 echo "To simulate f1 lost MDT-object and OST-object0"
3279 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
3280 do_facet mds1 $LCTL set_param fail_loc=0x161a
3281 rm -f $DIR/$tdir/a1/f1
3283 echo "To simulate f2 lost MDT-object and OST-object1"
3284 do_facet mds1 $LCTL set_param fail_val=1
3285 rm -f $DIR/$tdir/a1/f2
3287 if [ $OSTCOUNT -gt 2 ]; then
3288 echo "To simulate f3 lost MDT-object and OST-object2"
3289 do_facet mds1 $LCTL set_param fail_val=2
3290 rm -f $DIR/$tdir/a1/f3
3293 umount_client $MOUNT
3296 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
3298 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3299 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
3301 for k in $(seq $MDSCOUNT); do
3302 # The LFSCK status query interval is 30 seconds. In case some
3303 # LFSCK_NOTIFY RPCs fail or get lost, wait long enough to
3304 # guarantee that the status has been synced up.
3305 wait_update_facet mds${k} "$LCTL get_param -n \
3306 mdd.$(facet_svc mds${k}).lfsck_layout |
3307 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3308 error "(2) MDS${k} is not the expected 'completed'"
3311 for k in $(seq $OSTCOUNT); do
3312 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3313 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3314 awk '/^status/ { print $2 }')
3315 [ "$cur_status" == "completed" ] ||
3316 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
3319 local repaired=$(do_facet mds1 $LCTL get_param -n \
3320 mdd.$(facet_svc mds1).lfsck_layout |
3321 awk '/^repaired_orphan/ { print $2 }')
3322 if [ $OSTCOUNT -gt 2 ]; then
3323 [ $repaired -eq 9 ] ||
3324 error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
3326 [ $repaired -eq 4 ] ||
3327 error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
3330 mount_client $MOUNT || error "(5.0) Fail to start client!"
3332 LOV_PATTERN_F_HOLE=0x40000000
3335 # ${fid0}-R-0 is the old f0
3337 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
3338 echo "Check $name, which is the old f0"
3340 $LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"
3342 local pattern=$($LFS getstripe -L $name)
3343 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3344 error "(5.2) NOT expect pattern flag hole, but got $pattern"
3346 local stripes=$($LFS getstripe -c $name)
3347 if [ $OSTCOUNT -gt 2 ]; then
3348 [ $stripes -eq 3 ] ||
3349 error "(5.3.1) expect the stripe count is 3, but got $stripes"
3351 [ $stripes -eq 2 ] ||
3352 error "(5.3.2) expect the stripe count is 2, but got $stripes"
3355 local size=$(stat $name | awk '/Size:/ { print $2 }')
3356 [ $size -eq $((4096 * $bcount)) ] ||
3357 error "(5.4) expect the size $((4096 * $bcount)), but got $size"
3359 cat $name > /dev/null || error "(5.5) cannot read $name"
3361 echo "dummy" >> $name || error "(5.6) cannot write $name"
3363 chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"
3365 touch $name || error "(5.8) cannot touch $name"
3367 rm -f $name || error "(5.9) cannot unlink $name"
3370 # ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
3372 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3373 if [ $OSTCOUNT -gt 2 ]; then
3374 echo "Check $name, it contains the old f1's stripe1 and stripe2"
3376 echo "Check $name, it contains the old f1's stripe1"
3379 $LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"
3381 pattern=$($LFS getstripe -L $name)
3382 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3383 error "(6.2) expect pattern flag hole, but got $pattern"
3385 stripes=$($LFS getstripe -c $name)
3386 if [ $OSTCOUNT -gt 2 ]; then
3387 [ $stripes -eq 3 ] ||
3388 error "(6.3.1) expect the stripe count is 3, but got $stripes"
3390 [ $stripes -eq 2 ] ||
3391 error "(6.3.2) expect the stripe count is 2, but got $stripes"
3394 size=$(stat $name | awk '/Size:/ { print $2 }')
3395 [ $size -eq $((4096 * $bcount)) ] ||
3396 error "(6.4) expect the size $((4096 * $bcount)), but got $size"
3398 cat $name > /dev/null && error "(6.5) normal read $name should fail"
3400 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3401 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3404 [ $failures -eq 256 ] ||
3405 error "(6.6) expect 256 IO failures, but get $failures"
3407 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3408 [ $size -eq $((4096 * $bcount)) ] ||
3409 error "(6.7) expect the size $((4096 * $bcount)), but got $size"
3411 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
3412 error "(6.8) write to the LOV EA hole should fail"
3414 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
3415 error "(6.9) write to normal stripe should NOT fail"
3417 echo "foo" >> $name && error "(6.10) append write $name should fail"
3419 chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"
3421 touch $name || error "(6.12) cannot touch $name"
3423 rm -f $name || error "(6.13) cannot unlink $name"
3426 # ${fid2}-R-0 contains the old f2's stripe0 (and stripe2 if OSTs > 2)
3428 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
3429 if [ $OSTCOUNT -gt 2 ]; then
3430 echo "Check $name, it contains the old f2's stripe0 and stripe2"
3432 echo "Check $name, it contains the old f2's stripe0"
3435 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
3437 pattern=$($LFS getstripe -L $name)
3438 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3439 error "(7.2) expect pattern flag hole, but got $pattern"
3441 stripes=$($LFS getstripe -c $name)
3442 size=$(stat $name | awk '/Size:/ { print $2 }')
3443 if [ $OSTCOUNT -gt 2 ]; then
3444 [ $stripes -eq 3 ] ||
3445 error "(7.3.1) expect the stripe count is 3, but got $stripes"
3447 [ $size -eq $((4096 * $bcount)) ] ||
3448 error "(7.4.1) expect size $((4096 * $bcount)), but got $size"
3450 cat $name > /dev/null &&
3451 error "(7.5.1) normal read $name should fail"
3453 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3454 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3456 [ $failures -eq 256 ] ||
3457 error "(7.6) expect 256 IO failures, but get $failures"
3459 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3460 [ $size -eq $((4096 * $bcount)) ] ||
3461 error "(7.7) expect the size $((4096 * $bcount)), but got $size"
3463 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3464 seek=300 && error "(7.8.0) write to the LOV EA hole should fail"
3466 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
3467 error "(7.8.1) write to normal stripe should NOT fail"
3469 echo "foo" >> $name &&
3470 error "(7.8.3) append write $name should fail"
3472 chown $RUNAS_ID:$RUNAS_GID $name ||
3473 error "(7.9.1) cannot chown on $name"
3475 touch $name || error "(7.10.1) cannot touch $name"
3477 [ $stripes -eq 2 ] ||
3478 error "(7.3.2) expect the stripe count is 2, but got $stripes"
3481 [ $size -eq $((4096 * (256 + 0))) ] ||
3482 error "(7.4.2) expect the size $((4096 * 256)), but got $size"
3484 cat $name > /dev/null &&
3485 error "(7.5.2) normal read $name should fail"
3487 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3488 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3489 [ $failures -eq 256 ] ||
3490 error "(7.6.2) expect 256 IO failures, but get $failures"
3493 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3494 [ $size -eq $((4096 * $bcount)) ] ||
3495 error "(7.7.2) expect the size $((4096 * $bcount)), got $size"
3497 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3498 seek=256 && error "(7.8.2) write to the LOV EA hole should fail"
3500 chown $RUNAS_ID:$RUNAS_GID $name ||
3501 error "(7.9.2) cannot chown on $name"
3503 touch $name || error "(7.10.2) cannot touch $name"
3506 rm -f $name || error "(7.11) cannot unlink $name"
3508 [ $OSTCOUNT -le 2 ] && return
3511 # ${fid3}-R-0 should contain the old f3's stripe0 and stripe1
3513 name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
3514 echo "Check $name, which contains the old f3's stripe0 and stripe1"
3516 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
3518 pattern=$($LFS getstripe -L $name)
3519 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3520 error "(8.2) expect pattern flag hole, but got $pattern"
3522 stripes=$($LFS getstripe -c $name)
3523 [ $stripes -eq 3 ] ||
3524 error "(8.3) expect the stripe count is 3, but got $stripes"
3526 size=$(stat $name | awk '/Size:/ { print $2 }')
3528 [ $size -eq $((4096 * (256 + 256 + 0))) ] ||
3529 error "(8.4) expect the size $((4096 * 512)), but got $size"
3531 cat $name > /dev/null &&
3532 error "(8.5) normal read $name should fail"
3534 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3535 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3537 [ $failures -eq 256 ] ||
3538 error "(8.6) expect 256 IO failures, but get $failures"
3541 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3542 [ $size -eq $((4096 * $bcount)) ] ||
3543 error "(8.7) expect the size $((4096 * $bcount)), but got $size"
3545 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3546 seek=512 && error "(8.8) write to the LOV EA hole should fail"
3548 chown $RUNAS_ID:$RUNAS_GID $name ||
3549 error "(8.9) cannot chown on $name"
3551 touch $name || error "(8.10) cannot touch $name"
3553 rm -f $name || error "(8.11) cannot unlink $name"
3555 run_test 20a "Handle the orphan with dummy LOV EA slot properly"
3558 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
3559 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
3560 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
3561 skip "MDS older than 2.5.55, LU-4887"
3564 echo "The target MDT-object and some of its OST-object are lost."
3565 echo "The LFSCK should find out the left OST-objects and re-create"
3566 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
3567 echo "with the partial OST-objects (LOV EA hole)."
3569 echo "New client can access the file with LOV EA hole via normal"
3570 echo "system tools or commands without crash the system - PFL case."
3573 check_mount_and_prep
3575 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f0 ||
3576 error "(0) Fail to create PFL file $DIR/$tdir/f0"
3577 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f1 ||
3578 error "(1) Fail to create PFL file $DIR/$tdir/f1"
3579 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f2 ||
3580 error "(2) Fail to create PFL file $DIR/$tdir/f2"
3582 local bcount=$((256 * 3 + 1))
3584 dd if=/dev/zero of=$DIR/$tdir/f0 bs=4096 count=$bcount
3585 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4096 count=$bcount
3586 dd if=/dev/zero of=$DIR/$tdir/f2 bs=4096 count=$bcount
3588 local fid0=$($LFS path2fid $DIR/$tdir/f0)
3589 local fid1=$($LFS path2fid $DIR/$tdir/f1)
3590 local fid2=$($LFS path2fid $DIR/$tdir/f2)
3593 $LFS getstripe $DIR/$tdir/f0
3595 $LFS getstripe $DIR/$tdir/f1
3597 $LFS getstripe $DIR/$tdir/f2
3599 cancel_lru_locks mdc
3600 cancel_lru_locks osc
3602 echo "Inject failure..."
3603 echo "To simulate f0 lost MDT-object"
3604 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
3605 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1616
3608 echo "To simulate the case of f1 lost MDT-object and "
3609 echo "the first OST-object in each PFL component"
3610 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
3611 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161a
3614 echo "To simulate the case of f2 lost MDT-object and "
3615 echo "the second OST-object in each PFL component"
3616 do_facet $SINGLEMDS $LCTL set_param fail_val=1
3621 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
3623 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3624 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
3626 for k in $(seq $MDSCOUNT); do
3627 # The LFSCK status query interval is 30 seconds. In case some
3628 # LFSCK_NOTIFY RPCs fail or get lost, wait long enough to
3629 # guarantee that the status has been synced up.
3630 wait_update_facet mds${k} "$LCTL get_param -n \
3631 mdd.$(facet_svc mds${k}).lfsck_layout |
3632 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3633 error "(4) MDS${k} is not the expected 'completed'"
3636 for k in $(seq $OSTCOUNT); do
3637 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3638 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3639 awk '/^status/ { print $2 }')
3640 [ "$cur_status" == "completed" ] ||
3641 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
3644 local repaired=$(do_facet mds1 $LCTL get_param -n \
3645 mdd.$(facet_svc mds1).lfsck_layout |
3646 awk '/^repaired_orphan/ { print $2 }')
3647 [ $repaired -eq 8 ] ||
3648 error "(6) Expect 8 fixed on mds1, but got: $repaired"
3651 # ${fid0}-R-0 is the old f0
3653 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
3654 echo "Check $name, which is the old f0"
3656 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
3658 local pattern=$($LFS getstripe -L -I1 $name)
3659 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3660 error "(7.2.1) NOT expect pattern flag hole, but got $pattern"
3662 pattern=$($LFS getstripe -L -I2 $name)
3663 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3664 error "(7.2.2) NOT expect pattern flag hole, but got $pattern"
3666 local stripes=$($LFS getstripe -c -I1 $name)
3667 [ $stripes -eq 2 ] ||
3668 error "(7.3.1) expect 2 stripes, but got $stripes"
3670 stripes=$($LFS getstripe -c -I2 $name)
3671 [ $stripes -eq 2 ] ||
3672 error "(7.3.2) expect 2 stripes, but got $stripes"
3674 local e_start=$($LFS getstripe -I1 $name |
3675 awk '/lcme_extent.e_start:/ { print $2 }')
3676 [ $e_start -eq 0 ] ||
3677 error "(7.4.1) expect the COMP1 start at 0, got $e_start"
3679 local e_end=$($LFS getstripe -I1 $name |
3680 awk '/lcme_extent.e_end:/ { print $2 }')
3681 [ $e_end -eq 2097152 ] ||
3682 error "(7.4.2) expect the COMP1 end at 2097152, got $e_end"
3684 e_start=$($LFS getstripe -I2 $name |
3685 awk '/lcme_extent.e_start:/ { print $2 }')
3686 [ $e_start -eq 2097152 ] ||
3687 error "(7.5.1) expect the COMP2 start at 2097152, got $e_start"
3689 e_end=$($LFS getstripe -I2 $name |
3690 awk '/lcme_extent.e_end:/ { print $2 }')
3691 [ "$e_end" = "EOF" ] ||
3692 error "(7.5.2) expect the COMP2 end at (EOF), got $e_end"
3694 local size=$(stat $name | awk '/Size:/ { print $2 }')
3695 [ $size -eq $((4096 * $bcount)) ] ||
3696 error "(7.6) expect the size $((4096 * $bcount)), but got $size"
3698 cat $name > /dev/null || error "(7.7) cannot read $name"
3700 echo "dummy" >> $name || error "(7.8) cannot write $name"
3702 chown $RUNAS_ID:$RUNAS_GID $name || error "(7.9) cannot chown on $name"
3704 touch $name || error "(7.10) cannot touch $name"
3706 rm -f $name || error "(7.11) cannot unlink $name"
3709 # ${fid1}-R-0 contains the old f1's second stripe in each COMP
3711 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3712 echo "Check $name, it contains f1's second OST-object in each COMP"
3714 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
3716 pattern=$($LFS getstripe -L -I1 $name)
3717 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3718 error "(8.2.1) expect pattern flag hole, but got $pattern"
3720 pattern=$($LFS getstripe -L -I2 $name)
3721 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3722 error "(8.2.2) expect pattern flag hole, but got $pattern"
# Stripe count of ${fid1}-R-0, queried with -I1 and then -I2; each check has
# its own tag (8.3.1 / 8.3.2) so a failure identifies which query failed.
3724 stripes=$($LFS getstripe -c -I1 $name)
3725 [ $stripes -eq 2 ] ||
3726 error "(8.3.1) expect 2 stripes, but got $stripes"
3728 stripes=$($LFS getstripe -c -I2 $name)
3729 [ $stripes -eq 2 ] ||
3730 error "(8.3.2) expect 2 stripes, but got $stripes"
3732 e_start=$($LFS getstripe -I1 $name |
3733 awk '/lcme_extent.e_start:/ { print $2 }')
3734 [ $e_start -eq 0 ] ||
3735 error "(8.4.1) expect the COMP1 start at 0, got $e_start"
3737 e_end=$($LFS getstripe -I1 $name |
3738 awk '/lcme_extent.e_end:/ { print $2 }')
3739 [ $e_end -eq 2097152 ] ||
3740 error "(8.4.2) expect the COMP1 end at 2097152, got $e_end"
3742 e_start=$($LFS getstripe -I2 $name |
3743 awk '/lcme_extent.e_start:/ { print $2 }')
3744 [ $e_start -eq 2097152 ] ||
3745 error "(8.5.1) expect the COMP2 start at 2097152, got $e_start"
3747 e_end=$($LFS getstripe -I2 $name |
3748 awk '/lcme_extent.e_end:/ { print $2 }')
3749 [ "$e_end" = "EOF" ] ||
3750 error "(8.5.2) expect the COMP2 end at (EOF), got $e_end"
3752 size=$(stat $name | awk '/Size:/ { print $2 }')
3753 [ $size -eq $((4096 * $bcount)) ] ||
3754 error "(8.6) expect the size $((4096 * $bcount)), but got $size"
3756 cat $name > /dev/null && error "(8.7) normal read $name should fail"
3758 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3759 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3761 # The first stripe in each COMP was lost
3762 [ $failures -eq 512 ] ||
3763 error "(8.8) expect 512 IO failures, but get $failures"
3765 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3766 [ $size -eq $((4096 * $bcount)) ] ||
3767 error "(8.9) expect the size $((4096 * $bcount)), but got $size"
3769 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
3770 error "(8.10) write to the LOV EA hole should fail"
3772 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
3773 error "(8.11) write to normal stripe should NOT fail"
3775 echo "foo" >> $name && error "(8.12) append write $name should fail"
3777 chown $RUNAS_ID:$RUNAS_GID $name || error "(8.13) cannot chown on $name"
3779 touch $name || error "(8.14) cannot touch $name"
3781 rm -f $name || error "(8.15) cannot unlink $name"
3784 # ${fid2}-R-0 contains the old f2's first stripe in each COMP
3786 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
3787 echo "Check $name, it contains f2's first stripe in each COMP"
3789 $LFS getstripe -v $name || error "(9.1) cannot getstripe on $name"
3791 pattern=$($LFS getstripe -L -I1 $name)
3792 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3793 error "(9.2.1) expect pattern flag hole, but got $pattern"
3795 pattern=$($LFS getstripe -L -I2 $name)
3796 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3797 error "(9.2.2) expect pattern flag hole, but got $pattern"
# Stripe count of ${fid2}-R-0, queried with -I1 and then -I2; each check has
# its own tag (9.3.1 / 9.3.2) so a failure identifies which query failed.
3799 stripes=$($LFS getstripe -c -I1 $name)
3800 [ $stripes -eq 2 ] ||
3801 error "(9.3.1) expect 2 stripes, but got $stripes"
3803 stripes=$($LFS getstripe -c -I2 $name)
3804 [ $stripes -eq 2 ] ||
3805 error "(9.3.2) expect 2 stripes, but got $stripes"
3807 e_start=$($LFS getstripe -I1 $name |
3808 awk '/lcme_extent.e_start:/ { print $2 }')
3809 [ $e_start -eq 0 ] ||
3810 error "(9.4.1) expect the COMP1 start at 0, got $e_start"
3812 e_end=$($LFS getstripe -I1 $name |
3813 awk '/lcme_extent.e_end:/ { print $2 }')
3814 [ $e_end -eq 2097152 ] ||
3815 error "(9.4.2) expect the COMP1 end at 2097152, got $e_end"
3817 e_start=$($LFS getstripe -I2 $name |
3818 awk '/lcme_extent.e_start:/ { print $2 }')
3819 [ $e_start -eq 2097152 ] ||
3820 error "(9.5.1) expect the COMP2 start at 2097152, got $e_start"
3822 e_end=$($LFS getstripe -I2 $name |
3823 awk '/lcme_extent.e_end:/ { print $2 }')
3824 [ "$e_end" = "EOF" ] ||
3825 error "(9.5.2) expect the COMP2 end at (EOF), got $e_end"
3827 size=$(stat $name | awk '/Size:/ { print $2 }')
3828 # The second stripe in COMP was lost, so we do not know there
3829 # have ever been some data before. 'stat' will regard it as
3830 # no data on the lost stripe.
3832 [ $size -eq $((4096 * $bcount)) ] ||
3833 error "(9.6) expect size $((4096 * $bcount)), but got $size"
3835 cat $name > /dev/null &&
3836 error "(9.7) normal read $name should fail"
# Count the "Input/output error" lines dd prints while copying with
# conv=sync,noerror. 512 failures are asserted, and the message reports
# the same number as the check.
3838 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3839 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3840 [ $failures -eq 512 ] ||
3841 error "(9.8) expect 512 IO failures, but get $failures"
3843 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3844 # The second stripe in COMP was lost, so we do not know there
3845 # have ever been some data before. Since 'dd' skip failure,
3846 # it will regard the lost stripe contains data.
3848 [ $size -eq $((4096 * $bcount)) ] ||
3849 error "(9.9) expect the size $((4096 * $bcount)), but got $size"
3851 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3852 seek=300 && error "(9.10) write to the LOV EA hole should fail"
3854 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
3855 error "(9.11) write to normal stripe should NOT fail"
3857 echo "foo" >> $name &&
3858 error "(9.12) append write $name should fail"
3860 chown $RUNAS_ID:$RUNAS_GID $name ||
3861 error "(9.13) cannot chown on $name"
3863 touch $name || error "(9.14) cannot touch $name"
# Last check of the ${fid2}-R-0 series (9.x): the file must be removable.
3865 rm -f $name || error "(9.15) cannot unlink $name"
3867 run_test 20b "Handle the orphan with dummy LOV EA slot properly - PFL case"
# Body of test 21: start LFSCK on MDT0000 without a -t type option and verify
# that both the namespace and the layout component report 'scanning-phase1';
# then stop LFSCK, again without naming a type.
3870 (( $MDS1_VERSION > $(version_code 2.5.59) )) ||
3871 skip "MDS older than 2.5.59, LU-4887"
3873 check_mount_and_prep
3874 createmany -o $DIR/$tdir/f 100 || error "(0) Fail to create 100 files"
3876 echo "Start all LFSCK components by default (-s 1)"
# NOTE(review): "-s 1" presumably throttles the scan so that it is still in
# phase 1 when the status is read below -- confirm against lctl lfsck_start.
3877 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -s 1 -r ||
3878 error "Fail to start LFSCK"
3880 echo "namespace LFSCK should be in 'scanning-phase1' status"
3881 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
3882 [ "$STATUS" == "scanning-phase1" ] ||
3883 error "Expect namespace 'scanning-phase1', but got '$STATUS'"
3885 echo "layout LFSCK should be in 'scanning-phase1' status"
3886 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
3887 [ "$STATUS" == "scanning-phase1" ] ||
3888 error "Expect layout 'scanning-phase1', but got '$STATUS'"
3890 echo "Stop all LFSCK components by default"
3891 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 ||
3892 error "Fail to stop LFSCK"
3894 run_test 21 "run all LFSCK components by default"
# Body of test 22a: directory foo/dummy is created on MDT0 while fail_loc
# 0x161e is set, which (per the messages below) gives it a bad ".." entry
# that references "guard". "guard" is then removed, namespace LFSCK is run on
# all targets (-A), and exactly one unmatched pair must be reported repaired.
3897 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3898 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
3899 skip "MDS older than 2.6.50, LU-5511"
# The inner double quotes are escaped: unescaped, they terminate the string
# and the quotes around .. never reach the output.
3902 echo "The parent_A references the child directory via some name entry,"
3903 echo "but the child directory back references another parent_B via its"
3904 echo "\"..\" name entry. The parent_B does not exist. Then the namespace"
3905 echo "LFSCK will repair the child directory's \"..\" name entry."
3908 check_mount_and_prep
3910 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
3911 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
3913 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
3914 echo "The dummy's dotdot name entry references the guard."
3915 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
# The mkdir of foo/dummy is the operation performed while the fail_loc is
# armed.
3916 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
3917 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
3918 error "(3) Fail to mkdir on MDT0"
3919 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Remove the directory that the bad ".." entry refers to.
3921 rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"
3923 echo "Trigger namespace LFSCK to repair unmatched pairs"
3924 $START_NAMESPACE -A -r ||
3925 error "(5) Fail to start LFSCK for namespace"
3927 wait_all_targets_blocked namespace completed 6
3929 local repaired=$($SHOW_NAMESPACE |
3930 awk '/^unmatched_pairs_repaired/ { print $2 }')
3931 [ $repaired -eq 1 ] ||
3932 error "(7) Fail to repair unmatched pairs: $repaired"
3934 echo "'ls' should success after namespace LFSCK repairing"
3935 ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
3936 error "(8) ls should success."
3938 run_test 22a "LFSCK can repair unmatched pairs (1)"
# test_22b: namespace LFSCK repairs a child directory whose ".." entry and
# linkEA reference an existing parent that holds no name entry for the child.
# Needs at least 2 MDTs and an MDS newer than 2.6.50 (LU-5511).
3941 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3942 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
3943 skip "MDS older than 2.6.50, LU-5511"
3946 echo "The parent_A references the child directory via the name entry_B,"
3947 echo "but the child directory back references another parent_C via its"
# The inner quotes are escaped so the banner really prints "..". Unescaped,
# the shell closes and reopens the string and the quotes vanish from the output.
3948 echo "\"..\" name entry. The parent_C exists, but there is no the name"
3949 echo "entry_B under the parent_C. Then the namespace LFSCK will repair"
3950 echo "the child directory's \"..\" name entry and its linkEA."
3953 check_mount_and_prep
# guard and foo both live on MDT1; dummy will be created under foo on MDT0.
3955 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
3956 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
3958 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
3959 echo "and bad linkEA. The dummy's dotdot name entry references the"
3960 echo "guard. The dummy's linkEA references n non-exist name entry."
# The fault is armed only around the mkdir and disarmed right after it.
3961 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
3962 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
3963 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
3964 error "(3) Fail to mkdir on MDT0"
3965 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Before the repair, fid2path must not resolve the FID back to foo/dummy.
3967 local dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
3968 echo "fid2path should NOT work on the dummy's FID $dummyfid"
3969 local dummyname=$($LFS fid2path $DIR $dummyfid)
3970 [ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
3971 error "(4) fid2path works unexpectedly."
3973 echo "Trigger namespace LFSCK to repair unmatched pairs"
3974 $START_NAMESPACE -A -r ||
3975 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): wait_all_targets_blocked is defined elsewhere; presumably it
# waits for "completed" on all targets and uses 6 as the error label -- confirm.
3977 wait_all_targets_blocked namespace completed 6
# Exactly one unmatched pair (dummy) is expected to have been repaired.
3979 local repaired=$($SHOW_NAMESPACE |
3980 awk '/^unmatched_pairs_repaired/ { print $2 }')
3981 [ $repaired -eq 1 ] ||
3982 error "(7) Fail to repair unmatched pairs: $repaired"
# After the repair the same FID must resolve to the original path.
3984 echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
3985 local dummyname=$($LFS fid2path $DIR $dummyfid)
3986 [ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
3987 error "(8) fid2path does not work"
3989 run_test 22b "LFSCK can repair unmatched pairs (2)"
# test_23a: a name entry exists but the MDT-object it references does not
# (dangling entry). A first LFSCK run only counts the problem; a second run
# with "-C" is expected to make the path usable again.
# Needs at least 2 MDTs and an MDS newer than 2.6.50 (LU-5512).
3992 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3993 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
3994 skip "MDS older than 2.6.50, LU-5512"
3997 echo "The name entry is there, but the MDT-object for such name "
3998 echo "entry does not exist. The namespace LFSCK should find out "
3999 echo "and repair the inconsistency as required."
4002 check_mount_and_prep
# d0 is on MDT0, its child d1 on MDT1.
4004 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
4005 $LFS mkdir -i 1 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT1"
4007 echo "Inject failure stub on MDT1 to simulate dangling name entry"
# The fault is armed on mds2 only around the rmdir, then disarmed.
4008 #define OBD_FAIL_LFSCK_DANGLING2 0x1620
4009 do_facet mds2 $LCTL set_param fail_loc=0x1620
4010 rmdir $DIR/$tdir/d0/d1 || error "(3) Fail to rmdir d1"
4011 do_facet mds2 $LCTL set_param fail_loc=0
# Negative check: a successful 'ls' means the injection did not take effect.
4013 echo "'ls' should fail because of dangling name entry"
4014 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(4) ls should fail."
4016 echo "Trigger namespace LFSCK to find out dangling name entry"
4017 $START_NAMESPACE -A -r ||
4018 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): wait_all_targets_blocked is defined elsewhere; presumably it
# waits for "completed" on all targets and uses 6 as the error label -- confirm.
4020 wait_all_targets_blocked namespace completed 6
4022 local repaired=$($SHOW_NAMESPACE |
4023 awk '/^dangling_repaired/ { print $2 }')
4024 [ $repaired -eq 1 ] ||
4025 error "(7) Fail to repair dangling name entry: $repaired"
# Without "-C" the counter is bumped but the path must still be unusable.
4027 echo "'ls' should fail because not re-create MDT-object by default"
4028 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(8) ls should fail."
# Second pass adds "-C"; afterwards 'ls' on d1 is expected to succeed.
4030 echo "Trigger namespace LFSCK again to repair dangling name entry"
4031 $START_NAMESPACE -A -r -C ||
4032 error "(9) Fail to start LFSCK for namespace"
4034 wait_all_targets_blocked namespace completed 10
4036 repaired=$($SHOW_NAMESPACE |
4037 awk '/^dangling_repaired/ { print $2 }')
4038 [ $repaired -eq 1 ] ||
4039 error "(11) Fail to repair dangling name entry: $repaired"
4041 echo "'ls' should success after namespace LFSCK repairing"
4042 ls -ail $DIR/$tdir/d0/d1 > /dev/null || error "(12) ls should success."
4044 run_test 23a "LFSCK can repair dangling name entry (1)"
# test_23b: hard link "foo" is made to reference the FID of f1 instead of f0
# (fault 0x1621 with fail_val = f1's OID); f1 is then removed so foo dangles.
# LFSCK with "-C" is expected to count one dangling repair and one
# multiple-linked repair, after which foo must hold f0's content ("dummy").
# Skipped unless the MDS is newer than 2.6.50 (LU-5512).
4047 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4048 skip "MDS older than 2.6.50, LU-5512"
4051 echo "The objectA has multiple hard links, one of them corresponding"
4052 echo "to the name entry_B. But there is something wrong for the name"
4053 echo "entry_B and cause entry_B to references non-exist object_C."
4054 echo "In the first-stage scanning, the LFSCK will think the entry_B"
4055 echo "as dangling, and re-create the lost object_C. When the LFSCK"
4056 echo "comes to the second-stage scanning, it will find that the"
4057 echo "former re-creating object_C is not proper, and will try to"
4058 echo "replace the object_C with the real object_A."
4061 check_mount_and_prep
# The bare path2fid calls below only print FIDs into the test log.
4063 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
4064 $LFS path2fid $DIR/$tdir/d0
# Filler files t0..t9; they are unlinked later, before f1 is removed (LU-7429).
4066 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
4068 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
4069 $LFS path2fid $DIR/$tdir/d0/f0
4071 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
4072 $LFS path2fid $DIR/$tdir/d0/f1
# First ':'-separated field of the FID is its sequence.
4074 local SEQ0=$($LFS path2fid $DIR/$tdir/d0/f0 | awk -F':' '{print $1}')
4075 local SEQ1=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $1}')
# NOTE(review): the line closing this 'if' is not visible in this view.
4077 if [ "$SEQ0" != "$SEQ1" ]; then
4078 # To guarantee that the f0 and f1 are in the same FID seq
4079 rm -f $DIR/$tdir/d0/f0 ||
4080 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
4081 echo "dummy" > $DIR/$tdir/d0/f0 ||
4082 error "(3.2) Fail to touch on MDT0"
4083 $LFS path2fid $DIR/$tdir/d0/f0
# Second FID field is the object ID; printf %d turns it into decimal for fail_val.
4086 local OID=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $2}')
4087 OID=$(printf %d $OID)
4089 echo "Inject failure stub on MDT0 to simulate dangling name entry"
# The fault is armed only around the 'ln', then fail_val/fail_loc are cleared.
4090 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
4091 do_facet $SINGLEMDS $LCTL set_param fail_val=$OID fail_loc=0x1621
4092 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
4093 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
4095 # If there is creation after the dangling injection, it may re-use
4096 # the just released local object (inode) that is referenced by the
4097 # dangling name entry. It will fail the dangling injection.
4098 # So before deleting the target object for the dangling name entry,
4099 # remove some other objects to avoid the target object being reused
4100 # by some potential creations. LU-7429
4101 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
4103 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
# Negative check: a successful 'ls' means the injection did not take effect.
4105 echo "'ls' should fail because of dangling name entry"
4106 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4107 error "(6) ls should fail."
4109 echo "Trigger namespace LFSCK to find out dangling name entry"
4110 $START_NAMESPACE -r -C ||
4111 error "(7) Fail to start LFSCK for namespace"
# Poll the namespace status on $SINGLEMDS until it reads "completed"; 32 is
# the limit handed to wait_update_facet (presumably seconds -- confirm).
4113 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4114 mdd.${MDT_DEV}.lfsck_namespace |
4115 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4117 error "(8) unexpected status"
4120 local repaired=$($SHOW_NAMESPACE |
4121 awk '/^dangling_repaired/ { print $2 }')
4122 [ $repaired -eq 1 ] ||
4123 error "(9) Fail to repair dangling name entry: $repaired"
4125 repaired=$($SHOW_NAMESPACE |
4126 awk '/^multiple_linked_repaired/ { print $2 }')
4127 [ $repaired -eq 1 ] ||
4128 error "(10) Fail to drop the former created object: $repaired"
# foo must now read back f0's content.
4130 local data=$(cat $DIR/$tdir/d0/foo)
4131 [ "$data" == "dummy" ] ||
4132 error "(11) The $DIR/$tdir/d0/foo is not recovered: $data"
4134 run_test 23b "LFSCK can repair dangling name entry (2)"
# Cleanup steps that appear to belong to test 23c: the "(10)" label fills the
# gap in that test's error numbering, and the debug logging stopped here is
# started in that test. NOTE(review): the function header is not visible in
# this view -- confirm the helper's name and where it is called.
# Clears fail_val/fail_loc on $SINGLEMDS, polls the namespace LFSCK status
# until it reads "completed" (limit 32, presumably seconds), then stops full
# debug logging.
4137 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
4138 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4139 mdd.${MDT_DEV}.lfsck_namespace |
4140 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4142 error "(10) unexpected status"
4145 stop_full_debug_logging
# test_23c: same dangling-hard-link setup as test 23b, but the object that
# LFSCK re-creates for "foo" is modified by the test before LFSCK finishes.
# LFSCK must then count the dangling repair but must NOT replace foo with the
# original object, so foo must not read back as "dummy".
# Skipped unless the MDS is newer than 2.6.50 (LU-5512).
4149 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4150 skip "MDS older than 2.6.50, LU-5512"
4153 echo "The objectA has multiple hard links, one of them corresponding"
4154 echo "to the name entry_B. But there is something wrong for the name"
4155 echo "entry_B and cause entry_B to references non-exist object_C."
4156 echo "In the first-stage scanning, the LFSCK will think the entry_B"
4157 echo "as dangling, and re-create the lost object_C. And then others"
4158 echo "modified the re-created object_C. When the LFSCK comes to the"
4159 echo "second-stage scanning, it will find that the former re-creating"
4160 echo "object_C maybe wrong and try to replace the object_C with the"
4161 echo "real object_A. But because object_C has been modified, so the"
4162 echo "LFSCK cannot replace it."
4165 start_full_debug_logging
4167 check_mount_and_prep
# The FID variables are declared local so they do not leak into later tests.
4169 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
4170 local parent_fid="$($LFS path2fid $DIR/$tdir/d0)"
4171 echo "parent_fid=$parent_fid"
# Filler files t0..t9; they are unlinked later, before f1 is removed (LU-7429).
4173 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
4175 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
4176 local f0_fid="$($LFS path2fid $DIR/$tdir/d0/f0)"
4177 echo "f0_fid=$f0_fid"
4179 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
4180 local f1_fid="$($LFS path2fid $DIR/$tdir/d0/f1)"
4181 echo "f1_fid=$f1_fid"
# Compare the FID sequences, i.e. everything before the first ':'. The old
# test read the unset names fid_f0/fid_f1 and used a regex-looking pattern in
# a glob-only expansion, so both sides were always empty and f0 was never
# re-created. NOTE(review): the line closing this 'if' is not visible here.
4183 if [ "${f0_fid%%:*}" != "${f1_fid%%:*}" ]; then
4184 # To guarantee that the f0 and f1 are in the same FID seq
4185 rm -f $DIR/$tdir/d0/f0 ||
4186 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
4187 echo "dummy" > $DIR/$tdir/d0/f0 ||
4188 error "(3.2) Fail to touch on MDT0"
4189 f0_fid="$($LFS path2fid $DIR/$tdir/d0/f0)"
4190 echo "f0_fid=$f0_fid (replaced)"
# Second ':'-separated FID field is the object ID handed to the fault as fail_val.
4193 local oid=$(awk -F':' '{ printf $2 }' <<< $f1_fid)
4195 echo "Inject failure stub on MDT0 to simulate dangling name entry"
# The fault is armed only around the 'ln', then fail_val/fail_loc are cleared.
4196 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
4197 do_facet $SINGLEMDS $LCTL set_param fail_val=$oid fail_loc=0x1621
4198 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
4199 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
4201 # If there is creation after the dangling injection, it may re-use
4202 # the just released local object (inode) that is referenced by the
4203 # dangling name entry. It will fail the dangling injection.
4204 # So before deleting the target object for the dangling name entry,
4205 # remove some other objects to avoid the target object being reused
4206 # by some potential creations. LU-7429
4207 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
4209 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
# Negative check: a successful 'ls' means the injection did not take effect.
4211 echo "'ls' should fail because of dangling name entry"
4212 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4213 error "(6) ls should fail."
# NOTE(review): presumably delays LFSCK (fail_val=10) so the test can modify
# the re-created object before the second-stage scan -- confirm.
4215 #define OBD_FAIL_LFSCK_DELAY3 0x1602
4216 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
4218 echo "Trigger namespace LFSCK to find out dangling name entry"
4219 $START_NAMESPACE -r -C ||
4220 error "(7) Fail to start LFSCK for namespace"
# Wait until foo can be stat'ed with size 0. $LTIME is not defined in the
# visible part of the file -- confirm where the limit comes from.
# NOTE(review): the lines that end the early-repair branch and close the
# '|| {' group are not visible in this view.
4222 wait_update_facet client "stat -c%s $DIR/$tdir/d0/foo" "0" $LTIME || {
4223 # While unexpected by the test, it is valid for LFSCK to repair
4224 # the link to the original object before any data is written.
4225 local size=$(stat -c %s $DIR/$tdir/d0/foo)
4227 if [ "$size" = "6" -a "$(<$DIR/$tdir/d0/foo)" = "dummy" ]; then
4228 log "LFSCK repaired file prematurely"
4233 stat $DIR/$tdir/d0/foo
4235 error "(8) unexpected size"
# Appending to foo is the modification the banner above refers to.
4238 echo "data" >> $DIR/$tdir/d0/foo || error "(9) Fail to write"
4239 cancel_lru_locks osc
4243 local repaired=$($SHOW_NAMESPACE |
4244 awk '/^dangling_repaired/ { print $2 }')
4245 [ $repaired -eq 1 ] ||
4246 error "(11) Fail to repair dangling name entry: $repaired"
# foo must not have been swapped back to f0's object.
4248 local data=$(cat $DIR/$tdir/d0/foo)
4249 [ "$data" != "dummy" ] ||
4250 error "(12) The $DIR/$tdir/d0/foo should not be recovered"
4252 run_test 23c "LFSCK can repair dangling name entry (3)"
# test_24: two MDT-objects carry a linkEA entry for the same name entry while
# the name entry references only one of them. Namespace LFSCK is expected to
# count one multiple-referenced repair and to leave the unreferenced object as
# an orphan named "<cfid>-<pfid>-D-0" under .lustre/lost+found/MDT0000/.
# Needs at least 2 MDTs, no FILESET, and an MDS newer than 2.6.50 (LU-5513).
4255 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4256 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4257 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4258 skip "MDS older than 2.6.50, LU-5513"
4261 echo "Two MDT-objects back reference the same name entry via their"
4262 echo "each own linkEA entry, but the name entry only references one"
4263 echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
4264 echo "for the MDT-object that is not recognized. If such MDT-object"
4265 echo "has no other linkEA entry after the removing, then the LFSCK"
4266 echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."
4269 check_mount_and_prep
# The bare path2fid calls below only print FIDs into the test log.
4271 mkdir_on_mdt -i1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4273 mkdir $DIR/$tdir/d0/guard || error "(1) Fail to mkdir guard"
4274 $LFS path2fid $DIR/$tdir/d0/guard
4276 mkdir $DIR/$tdir/d0/dummy || error "(2) Fail to mkdir dummy"
4277 $LFS path2fid $DIR/$tdir/d0/dummy
# pfid is the parent FID expected in the orphan's name: guard's FID when the
# MDT backend is not ldiskfs. NOTE(review): the second assignment (dummy's
# FID) presumably sits in an 'else' branch whose 'else'/'fi' lines, like any
# declaration of pfid, are not visible in this view -- confirm.
4280 if [ $mds1_FSTYPE != ldiskfs ]; then
4281 pfid=$($LFS path2fid $DIR/$tdir/d0/guard)
4283 pfid=$($LFS path2fid $DIR/$tdir/d0/dummy)
4286 touch $DIR/$tdir/d0/guard/foo ||
4287 error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"
4289 echo "Inject failure stub on MDT0 to simulate the case that"
4290 echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
4291 echo "that references $DIR/$tdir/d0/guard/foo."
4292 echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
4293 echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
4294 echo "there with the same linkEA entry as another MDT-object"
4295 echo "$DIR/$tdir/d0/guard/foo has"
# The fault stays armed across both the mkdir and the rmdir of dummy/foo;
# cfid records the FID of the object that is expected to become the orphan.
4297 #define OBD_FAIL_LFSCK_MUL_REF 0x1622
4298 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
4299 mkdir_on_mdt -i0 $DIR/$tdir/d0/dummy/foo ||
4300 error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
4301 $LFS path2fid $DIR/$tdir/d0/dummy/foo
4302 local cfid=$($LFS path2fid $DIR/$tdir/d0/dummy/foo)
4303 rmdir $DIR/$tdir/d0/dummy/foo ||
4304 error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
4305 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Negative check: the name entry must really be gone.
4307 echo "stat $DIR/$tdir/d0/dummy/foo should fail"
4308 stat $DIR/$tdir/d0/dummy/foo > /dev/null 2>&1 &&
4309 error "(6) stat successfully unexpectedly"
4311 echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
4312 $START_NAMESPACE -A -r ||
4313 error "(7) Fail to start LFSCK for namespace"
# NOTE(review): wait_all_targets_blocked is defined elsewhere; presumably it
# waits for "completed" on all targets and uses 8 as the error label -- confirm.
4315 wait_all_targets_blocked namespace completed 8
4317 local repaired=$($SHOW_NAMESPACE |
4318 awk '/^multiple_referenced_repaired/ { print $2 }')
4319 [ $repaired -eq 1 ] ||
4320 error "(9) Fail to repair multiple referenced name entry: $repaired"
4322 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
4323 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4324 error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# The orphan is looked up by its expected name, built from child and parent FID.
4326 local cname="$cfid-$pfid-D-0"
4327 ls -ail $MOUNT/.lustre/lost+found/MDT0000/$cname ||
4328 error "(11) .lustre/lost+found/MDT0000/ should not be empty"
4330 run_test 24 "LFSCK can repair multiple-referenced name entry"
# test_25: a name entry is created with a wrong file type (fault 0x1623);
# namespace LFSCK is expected to count one bad_file_type repair and the
# directory must be listable afterwards.
# Runs only on ldiskfs MDTs with an MDS newer than 2.6.50 (LU-5515).
4333 [[ $mds1_FSTYPE == ldiskfs ]] || skip "only ldiskfs fixes dirent type"
4334 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4335 skip "MDS older than 2.6.50, LU-5515"
4338 echo "The file type in the name entry does not match the file type"
4339 echo "claimed by the referenced object. Then the LFSCK will update"
4340 echo "the file type in the name entry."
4343 check_mount_and_prep
4345 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4347 echo "Inject failure stub on MDT0 to simulate the case that"
4348 echo "the file type stored in the name entry is wrong."
# The fault is armed only around the 'touch', then disarmed.
4350 #define OBD_FAIL_LFSCK_BAD_TYPE 0x1623
4351 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
4352 touch $DIR/$tdir/d0/foo || error "(2) Fail to touch $DIR/$tdir/d0/foo"
4353 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4355 echo "Trigger namespace LFSCK to repair bad file type in the name entry"
4356 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace"
# Poll the namespace status on $SINGLEMDS until it reads "completed"; 32 is
# the limit handed to wait_update_facet (presumably seconds -- confirm).
4358 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4359 mdd.${MDT_DEV}.lfsck_namespace |
4360 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4362 error "(4) unexpected status"
4365 local repaired=$($SHOW_NAMESPACE |
4366 awk '/^bad_file_type_repaired/ { print $2 }')
4367 [ $repaired -eq 1 ] ||
4368 error "(5) Fail to repair bad file type in name entry: $repaired"
4370 ls -ail $DIR/$tdir/d0 || error "(6) Fail to 'ls' the $DIR/$tdir/d0"
4372 run_test 25 "LFSCK can repair bad file type in the name entry"
# test_26a: the local name entry "foo" is removed while its object and linkEA
# are kept (fault 0x1624). Namespace LFSCK is expected to count one
# lost_dirent repair and to bring foo back with the same FID.
# Skipped unless the MDS is newer than 2.6.50 (LU-5516).
4375 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4376 skip "MDS older than 2.6.50, LU-5516"
4379 echo "The local name entry back referenced by the MDT-object is lost."
4380 echo "The namespace LFSCK will add the missing local name entry back"
4381 echo "to the normal namespace."
4384 check_mount_and_prep
# foofid is remembered so the restored entry can be checked against it.
4386 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4387 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4388 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
# A second hard link is created before foo's name entry is removed.
4390 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
4391 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4393 echo "Inject failure stub on MDT0 to simulate the case that"
4394 echo "foo's name entry will be removed, but the foo's object"
4395 echo "and its linkEA are kept in the system."
# The fault is armed only around the 'rm', then disarmed.
4397 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4398 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4399 rm -f $DIR/$tdir/d0/foo || error "(4) Fail to unlink $DIR/$tdir/d0/foo"
4400 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Negative check: the name entry must really be gone.
4402 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4403 error "(5) 'ls' should fail"
# The message now says "local": d0 and foo are both on MDT0 in this test, and
# the old wording ("remote") had been copied from the remote variant.
4405 echo "Trigger namespace LFSCK to repair the missing local name entry"
4406 $START_NAMESPACE -r -A ||
4407 error "(6) Fail to start LFSCK for namespace"
# NOTE(review): wait_all_targets_blocked is defined elsewhere; presumably it
# waits for "completed" on all targets and uses 7 as the error label -- confirm.
4409 wait_all_targets_blocked namespace completed 7
4411 local repaired=$($SHOW_NAMESPACE |
4412 awk '/^lost_dirent_repaired/ { print $2 }')
4413 [ $repaired -eq 1 ] ||
4414 error "(8) Fail to repair lost dirent: $repaired"
4416 ls -ail $DIR/$tdir/d0/foo ||
4417 error "(9) Fail to 'ls' $DIR/$tdir/d0/foo"
# The restored entry must reference the original object.
4419 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
4420 [ "$foofid" == "$foofid2" ] ||
4421 error "(10) foo's FID changed: $foofid, $foofid2"
4423 run_test 26a "LFSCK can add the missing local name entry back to the namespace"
# test_26b: remote variant of test 26a. Directory foo lives on MDT0 under a
# parent on MDT1; foo's name entry is removed while its object and linkEA are
# kept (fault 0x1624). Namespace LFSCK is expected to count one lost_dirent
# repair and to bring foo back with the same FID.
# Needs at least 2 MDTs and an MDS newer than 2.6.50 (LU-5516).
4426 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4427 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4428 skip "MDS older than 2.6.50, LU-5516"
4431 echo "The remote name entry back referenced by the MDT-object is lost."
4432 echo "The namespace LFSCK will add the missing remote name entry back"
4433 echo "to the normal namespace."
4436 check_mount_and_prep
# foofid is remembered so the restored entry can be checked against it.
4438 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4439 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
4440 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
4442 echo "Inject failure stub on MDT0 to simulate the case that"
4443 echo "foo's name entry will be removed, but the foo's object"
4444 echo "and its linkEA are kept in the system."
# The fault is armed only around the 'rmdir', then disarmed.
4446 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4447 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4448 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
4449 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Negative check: the name entry must really be gone.
4451 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4452 error "(4) 'ls' should fail"
4454 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4455 $START_NAMESPACE -r -A ||
4456 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): wait_all_targets_blocked is defined elsewhere; presumably it
# waits for "completed" on all targets and uses 6 as the error label -- confirm.
4458 wait_all_targets_blocked namespace completed 6
4460 local repaired=$($SHOW_NAMESPACE |
4461 awk '/^lost_dirent_repaired/ { print $2 }')
4462 [ $repaired -eq 1 ] ||
4463 error "(7) Fail to repair lost dirent: $repaired"
4465 ls -ail $DIR/$tdir/d0/foo ||
4466 error "(8) Fail to 'ls' $DIR/$tdir/d0/foo"
# The restored entry must reference the original object.
4468 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
4469 [ "$foofid" == "$foofid2" ] ||
4470 error "(9) foo's FID changed: $foofid, $foofid2"
4472 run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
# test_27a: foo's object and linkEA are kept while its name entries and then
# its parent directory d0 are removed. Namespace LFSCK is expected to count
# one lost_dirent repair and to leave an entry matching "*-P-*" under
# .lustre/lost+found/MDT0000/.
# Needs no FILESET and an MDS newer than 2.6.50 (LU-5516).
4475 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4476 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4477 skip "MDS older than 2.6.50, LU-5516"
4480 echo "The local parent referenced by the MDT-object linkEA is lost."
4481 echo "The namespace LFSCK will re-create the lost parent as orphan."
4484 check_mount_and_prep
4486 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4487 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4488 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
4489 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4491 echo "Inject failure stub on MDT0 to simulate the case that"
4492 echo "foo's name entry will be removed, but the foo's object"
4493 echo "and its linkEA are kept in the system. And then remove"
4494 echo "another hard link and the parent directory."
# The fault stays armed while both hard links are removed, then is disarmed.
4496 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4497 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4498 rm -f $DIR/$tdir/d0/foo ||
4499 error "(4) Fail to unlink $DIR/$tdir/d0/foo"
4500 rm -f $DIR/$tdir/d0/dummy ||
4501 error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
4502 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# The parent directory is removed without the fault armed.
4504 rm -rf $DIR/$tdir/d0 || error "(5) Fail to unlink the dir d0"
4505 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(6) 'ls' should fail"
4507 echo "Trigger namespace LFSCK to repair the lost parent"
4508 $START_NAMESPACE -r -A ||
4509 error "(6) Fail to start LFSCK for namespace"
# NOTE(review): wait_all_targets_blocked is defined elsewhere; presumably it
# waits for "completed" on all targets and uses 7 as the error label -- confirm.
4511 wait_all_targets_blocked namespace completed 7
4513 local repaired=$($SHOW_NAMESPACE |
4514 awk '/^lost_dirent_repaired/ { print $2 }')
4515 [ $repaired -eq 1 ] ||
4516 error "(8) Fail to repair lost dirent: $repaired"
4518 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
4519 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4520 error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4522 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# The pattern is quoted so that find receives it verbatim; unquoted, the shell
# could expand it against files in the current directory first. cname is
# declared local so it does not leak into later tests.
4524 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-P-*')
4525 [ ! -z "$cname" ] ||
4526 error "(10) .lustre/lost+found/MDT0000/ should not be empty"
4528 run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
# test_27b: remote variant of test 27a. Directory foo lives on MDT0 under a
# parent d0 on MDT1; foo's name entry is removed while its object and linkEA
# are kept (fault 0x1624), then d0 is removed. Namespace LFSCK is expected to
# count one lost_dirent repair and to leave an entry matching "*-P-*" under
# .lustre/lost+found/MDT0001/.
# Needs at least 2 MDTs, no FILESET, and an MDS newer than 2.6.50 (LU-5516).
4531 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4532 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4533 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4534 skip "MDS older than 2.6.50, LU-5516"
4537 echo "The remote parent referenced by the MDT-object linkEA is lost."
4538 echo "The namespace LFSCK will re-create the lost parent as orphan."
4541 check_mount_and_prep
4543 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4544 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
# Printed for the test log only.
4546 $LFS path2fid $DIR/$tdir/d0
4548 echo "Inject failure stub on MDT0 to simulate the case that"
4549 echo "foo's name entry will be removed, but the foo's object"
4550 echo "and its linkEA are kept in the system. And then remove"
4551 echo "the parent directory."
# The fault is armed only around the 'rmdir' of foo, then disarmed.
4553 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4554 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4555 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
4556 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# The parent directory is removed without the fault armed.
4558 rmdir $DIR/$tdir/d0 || error "(4) Fail to unlink the dir d0"
4559 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(5) 'ls' should fail"
4561 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4562 $START_NAMESPACE -r -A ||
4563 error "(6) Fail to start LFSCK for namespace"
# NOTE(review): wait_all_targets_blocked is defined elsewhere; presumably it
# waits for "completed" on all targets and uses 7 as the error label -- confirm.
4565 wait_all_targets_blocked namespace completed 7
4567 local repaired=$($SHOW_NAMESPACE |
4568 awk '/^lost_dirent_repaired/ { print $2 }')
4569 [ $repaired -eq 1 ] ||
4570 error "(8) Fail to repair lost dirent: $repaired"
4572 ls -ail $MOUNT/.lustre/lost+found/
4574 echo "There should be an orphan under .lustre/lost+found/MDT0001/"
4575 [ -d $MOUNT/.lustre/lost+found/MDT0001 ] ||
4576 error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"
4578 ls -ail $MOUNT/.lustre/lost+found/MDT0001/
# The pattern is quoted so that find receives it verbatim; unquoted, the shell
# could expand it against files in the current directory first. cname is
# declared local so it does not leak into later tests.
4580 local cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-P-*')
4581 [ ! -z "$cname" ] ||
4582 error "(10) .lustre/lost+found/MDT0001/ should not be empty"
4584 run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
# test_28: one name entry is lost on each of two MDTs (d1/a1 and d2/a2), then
# fault 0x161c is armed on mds2 while LFSCK runs on all devices. The first run
# is expected to end 'partial' on mds1 with 0 lost_dirent repairs and
# 'completed' on mds2 with 1. After the fault is cleared, a second run is
# expected to report 1 repair on mds1 and 0 on mds2.
# Needs at least 2 MDTs and an MDS newer than 2.6.50 (LU-5506).
4587 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4588 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4589 skip "MDS older than 2.6.50, LU-5506"
# NOTE(review): the last line of this banner is not visible in this view.
4592 echo "The target name entry is lost. The LFSCK should insert the"
4593 echo "orphan MDT-object under .lustre/lost+found/MDTxxxx. But if"
4594 echo "the MDT (on which the orphan MDT-object resides) has ever"
4595 echo "failed to respond some name entry verification during the"
4596 echo "first stage-scanning, then the LFSCK should skip to handle"
4597 echo "orphan MDT-object on this MDT. But other MDTs should not"
4601 check_mount_and_prep
# d1 is on MDT0 with children on MDT1; d2 is on MDT1 with children on MDT0.
# These mkdir calls are not checked for failure.
4602 $LFS mkdir -i 0 $DIR/$tdir/d1
4603 $LFS mkdir -i 1 $DIR/$tdir/d1/a1
4604 $LFS mkdir -i 1 $DIR/$tdir/d1/a2
4606 $LFS mkdir -i 1 $DIR/$tdir/d2
4607 $LFS mkdir -i 0 $DIR/$tdir/d2/a1
4608 $LFS mkdir -i 0 $DIR/$tdir/d2/a2
4610 echo "Inject failure stub on MDT0 to simulate the case that"
4611 echo "d1/a1's name entry will be removed, but the d1/a1's object"
4612 echo "and its linkEA are kept in the system. And the case that"
4613 echo "d2/a2's name entry will be removed, but the d2/a2's object"
4614 echo "and its linkEA are kept in the system."
# The fault is armed on both MDSes around the two rmdir calls, then disarmed.
4616 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4617 do_facet mds1 $LCTL set_param fail_loc=0x1624
4618 do_facet mds2 $LCTL set_param fail_loc=0x1624
4619 rmdir $DIR/$tdir/d1/a1 || error "(1) Fail to rmdir $DIR/$tdir/d1/a1"
4620 rmdir $DIR/$tdir/d2/a2 || error "(2) Fail to rmdir $DIR/$tdir/d2/a2"
4621 do_facet mds1 $LCTL set_param fail_loc=0
4622 do_facet mds2 $LCTL set_param fail_loc=0
4624 cancel_lru_locks mdc
4625 cancel_lru_locks osc
4627 echo "Inject failure, to simulate the MDT0 fail to handle"
4628 echo "MDT1 LFSCK request during the first-stage scanning."
# This fault stays armed on mds2 for the whole first LFSCK run.
4629 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
4630 do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0
4632 echo "Trigger namespace LFSCK on all devices to find out orphan object"
4633 $START_NAMESPACE -r -A ||
4634 error "(3) Fail to start LFSCK for namespace"
# Expected end states of the first run: 'partial' on mds1, 'completed' on
# mds2; 32 is the limit handed to wait_update_facet (presumably seconds).
4636 wait_update_facet mds1 "$LCTL get_param -n \
4637 mdd.$(facet_svc mds1).lfsck_namespace |
4638 awk '/^status/ { print \\\$2 }'" "partial" 32 || {
4639 error "(4) mds1 is not the expected 'partial'"
4642 wait_update_facet mds2 "$LCTL get_param -n \
4643 mdd.$(facet_svc mds2).lfsck_namespace |
4644 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4645 error "(5) mds2 is not the expected 'completed'"
4648 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
# First run: nothing repaired on mds1, one lost dirent repaired on mds2.
4650 local repaired=$(do_facet mds1 $LCTL get_param -n \
4651 mdd.$(facet_svc mds1).lfsck_namespace |
4652 awk '/^lost_dirent_repaired/ { print $2 }')
4653 [ $repaired -eq 0 ] ||
4654 error "(6) Expect 0 fixed on mds1, but got: $repaired"
4656 repaired=$(do_facet mds2 $LCTL get_param -n \
4657 mdd.$(facet_svc mds2).lfsck_namespace |
4658 awk '/^lost_dirent_repaired/ { print $2 }')
4659 [ $repaired -eq 1 ] ||
4660 error "(7) Expect 1 fixed on mds2, but got: $repaired"
# Second run with the fault cleared: the counts are expected to swap.
4662 echo "Trigger namespace LFSCK on all devices again to cleanup"
4663 $START_NAMESPACE -r -A ||
4664 error "(8) Fail to start LFSCK for namespace"
4666 wait_all_targets_blocked namespace completed 9
4668 local repaired=$(do_facet mds1 $LCTL get_param -n \
4669 mdd.$(facet_svc mds1).lfsck_namespace |
4670 awk '/^lost_dirent_repaired/ { print $2 }')
4671 [ $repaired -eq 1 ] ||
4672 error "(10) Expect 1 fixed on mds1, but got: $repaired"
4674 repaired=$(do_facet mds2 $LCTL get_param -n \
4675 mdd.$(facet_svc mds2).lfsck_namespace |
4676 awk '/^lost_dirent_repaired/ { print $2 }')
4677 [ $repaired -eq 0 ] ||
4678 error "(11) Expect 0 fixed on mds2, but got: $repaired"
4680 run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
# test_29a (disabled, see the commented-out run_test at the end): a hard link
# is created under fault 0x1625 so that foo's nlink reads 3; namespace LFSCK
# is then expected to count one nlinks repair and bring nlink back to 2.
# Skipped unless the MDS is newer than 2.6.50 (LU-5517).
4683 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4684 skip "MDS older than 2.6.50, LU-5517"
4687 echo "The object's nlink attribute is larger than the object's known"
4688 echo "name entries count. The LFSCK will repair the object's nlink"
4689 echo "attribute to match the known name entries count"
4692 check_mount_and_prep
4694 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4695 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4697 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4698 echo "nlink attribute is larger than its name entries count."
# The fault is armed only around the 'ln', then disarmed.
4700 #define OBD_FAIL_LFSCK_MORE_NLINK 0x1625
4701 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
4702 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4703 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4704 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Two names exist (foo, h1), so an nlink of 3 confirms the injection worked.
4706 cancel_lru_locks mdc
4707 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
4708 [ $count -eq 3 ] || error "(4) Cannot inject error: $count"
4710 echo "Trigger namespace LFSCK to repair the nlink count"
4711 $START_NAMESPACE -r -A ||
4712 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): wait_all_targets_blocked is defined elsewhere; presumably it
# waits for "completed" on all targets and uses 6 as the error label -- confirm.
4714 wait_all_targets_blocked namespace completed 6
4716 local repaired=$($SHOW_NAMESPACE |
4717 awk '/^nlinks_repaired/ { print $2 }')
4718 [ $repaired -eq 1 ] ||
4719 error "(7) Fail to repair nlink count: $repaired"
# MDC locks are dropped before the attribute is read again.
4721 cancel_lru_locks mdc
4722 count=$(stat --format=%h $DIR/$tdir/d0/foo)
4723 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
4725 # Disable 29a, we only allow nlink to be updated if the known linkEA
4726 # entries is larger than nlink count.
4728 #run_test 29a "LFSCK can repair bad nlink count (1)"
# test_29b: a hard link is created under fault 0x1626 so that foo's nlink
# stays at 1 although two names exist; namespace LFSCK is expected to count
# one nlinks repair and raise nlink to 2.
# Skipped unless the MDS is newer than 2.6.50 (LU-5517).
4731 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4732 skip "MDS older than 2.6.50, LU-5517"
4735 echo "The object's nlink attribute is smaller than the object's known"
4736 echo "name entries count. The LFSCK will repair the object's nlink"
4737 echo "attribute to match the known name entries count"
4740 check_mount_and_prep
4742 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4743 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4745 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4746 echo "nlink attribute is smaller than its name entries count."
# The fault is armed only around the 'ln', then disarmed.
4748 #define OBD_FAIL_LFSCK_LESS_NLINK 0x1626
4749 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
4750 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4751 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4752 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Two names exist (foo, h1), so an nlink of 1 confirms the injection worked.
4754 cancel_lru_locks mdc
4755 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
4756 [ $count -eq 1 ] || error "(4) Cannot inject error: $count"
4758 echo "Trigger namespace LFSCK to repair the nlink count"
4759 $START_NAMESPACE -r -A ||
4760 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): wait_all_targets_blocked is defined elsewhere; presumably it
# waits for "completed" on all targets and uses 6 as the error label -- confirm.
4762 wait_all_targets_blocked namespace completed 6
4764 local repaired=$($SHOW_NAMESPACE |
4765 awk '/^nlinks_repaired/ { print $2 }')
4766 [ $repaired -eq 1 ] ||
4767 error "(7) Fail to repair nlink count: $repaired"
# MDC locks are dropped before the attribute is read again.
4769 cancel_lru_locks mdc
4770 count=$(stat --format=%h $DIR/$tdir/d0/foo)
4771 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
4773 run_test 29b "LFSCK can repair bad nlink count (2)"
# test_29c: overflow the linkEA of guard/f0 with 150 hard links, remove 100 of
# them, and check that the namespace LFSCK reports one
# linkea_overflow_cleared and zero nlinks_repaired. With two or more MDTs the
# test also checks that 'lfs migrate' of the guard directory fails before the
# LFSCK run and succeeds after it (judged by the FID of f0).
# NOTE(review): the function wrapper, the three 'fi' terminators and the
# 'sleep 3' announced by its comment were not present in the reviewed text;
# they are restored here -- confirm against upstream.
test_29c() {
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5517"

	echo "The namespace LFSCK will create many hard links to the target"
	echo "file as to exceed the linkEA size limitation. Under such case"
	echo "the linkEA will be marked as overflow that will prevent the"
	echo "target file to be migrated. Then remove some hard links to"
	echo "make the left hard links to be held within the linkEA size"
	echo "limitation. But before the namespace LFSCK adding all the"
	echo "missed linkEA entries back, the overflow mark (timestamp)"
	echo "will not be cleared."

	check_mount_and_prep

	mkdir -p $DIR/$tdir/guard || error "(0.1) Fail to mkdir"
	$LFS mkdir -i $((MDSCOUNT - 1)) $DIR/$tdir/foo ||
		error "(0.2) Fail to mkdir"
	touch $DIR/$tdir/guard/f0 || error "(1) Fail to create"
	local oldfid=$($LFS path2fid $DIR/$tdir/guard/f0)
	local newfid
	local repaired

	# define MAX_LINKEA_SIZE	4096
	# sizeof(link_ea_header) = 24
	# sizeof(link_ea_entry) = 18
	# nlink_min=$(((MAX_LINKEA_SIZE - sizeof(link_ea_header)) /
	#	      (sizeof(link_ea_entry) + name_length))
	# If the average name length is 12 bytes, then 150 hard links
	# is totally enough to overflow the linkEA
	echo "Create 150 hard links should succeed although the linkEA overflow"
	createmany -l $DIR/$tdir/guard/f0 $DIR/$tdir/foo/ttttttttttt 150 ||
		error "(2) Fail to hard link"

	cancel_lru_locks mdc
	if [ $MDSCOUNT -ge 2 ]; then
		$LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null &&
			error "(3.1) Migrate should fail"

		echo "The object with linkEA overflow should NOT be migrated"
		newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
		[ "$newfid" == "$oldfid" ] ||
			error "(3.2) Migrate should fail: $newfid != $oldfid"
	fi

	# Remove 100 hard links, then the linkEA should have space
	# to hold the missed linkEA entries.
	echo "Remove 100 hard links to save space for the missed linkEA entries"
	unlinkmany $DIR/$tdir/foo/ttttttttttt 100 || error "(4) Fail to unlink"

	if [ $MDSCOUNT -ge 2 ]; then
		$LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null &&
			error "(5.1) Migrate should fail"

		# The overflow timestamp is still there, so migration will fail.
		newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
		[ "$newfid" == "$oldfid" ] ||
			error "(5.2) Migrate should fail: $newfid != $oldfid"
	fi

	# sleep 3 seconds to guarantee that the overflow is recognized
	sleep 3

	echo "Trigger namespace LFSCK to clear the overflow timestamp"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 7

	repaired=$($SHOW_NAMESPACE |
		   awk '/^linkea_overflow_cleared/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(8) Fail to clear linkea overflow: $repaired"

	repaired=$($SHOW_NAMESPACE |
		   awk '/^nlinks_repaired/ { print $2 }')
	[ $repaired -eq 0 ] ||
		error "(9) Unexpected nlink repaired: $repaired"

	if [ $MDSCOUNT -ge 2 ]; then
		$LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
			error "(10.1) Migrate failure"

		# Migration should succeed after clear the overflow timestamp.
		newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
		[ "$newfid" != "$oldfid" ] ||
			error "(10.2) Migrate should succeed"

		ls -l $DIR/$tdir/foo > /dev/null ||
			error "(11) 'ls' failed after migration"
	fi

	rm -f $DIR/$tdir/guard/f0 || error "(12) Fail to unlink f0"
	rm -rf $DIR/$tdir/foo || error "(13) Fail to rmdir foo"
}
run_test 29c "verify linkEA size limitation"
# test_30: remove name entries while fail_loc 0x1624 is set on the MDS, run
# e2fsck on the stopped MDT device, then check that the namespace LFSCK
# reports at least four local_lost_found_moved and that f0, d0, d1 and f1 are
# reachable from the client again (d0 under .lustre/lost+found/MDT0000/).
# Runs only on an ldiskfs MDT, without FILESET, and on MDS newer than 2.6.50.
#
# Changes from the reviewed text:
#  - check (20) tested that the constructed path string was non-empty, which
#    can never fail; it now tests that the directory exists;
#  - error (2) named f1 although the failing command touches f0.
# NOTE(review): the function wrapper and closing brace were not present in the
# reviewed text and are restored here -- confirm against upstream.
test_30() {
	[[ $mds1_FSTYPE == ldiskfs ]] || skip "only ldiskfs has lost+found"
	[ -n "$FILESET" ] && skip "Not functional for FILESET set"
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5518"

	echo "The namespace LFSCK will move the orphans from backend"
	echo "/lost+found directory to normal client visible namespace"
	echo "or to global visible ./lustre/lost+found/MDTxxxx/ directory"

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/foo || error "(1) Fail to mkdir foo"
	touch $DIR/$tdir/foo/f0 || error "(2) Fail to touch f0"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "directory d0 has no linkEA entry, then the LFSCK will"
	echo "move it into .lustre/lost+found/MDTxxxx/ later."

	#define OBD_FAIL_LFSCK_NO_LINKEA	0x161d
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
	mkdir $DIR/$tdir/foo/d0 || error "(3) Fail to mkdir d0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	# Both FIDs are needed later to build d0's name under lost+found.
	local pfid=$($LFS path2fid $DIR/$tdir/foo)
	local cfid=$($LFS path2fid $DIR/$tdir/foo/d0)

	touch $DIR/$tdir/foo/d0/f1 || error "(4) Fail to touch f1"
	mkdir $DIR/$tdir/foo/d0/d1 || error "(5) Fail to mkdir d1"

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "object's name entry will be removed, but not destroy the"
	echo "object. Then backend e2fsck will handle it as orphan and"
	echo "add them into the backend /lost+found directory."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rmdir $DIR/$tdir/foo/d0/d1 || error "(6) Fail to rmdir d1"
	rm -f $DIR/$tdir/foo/d0/f1 || error "(7) Fail to unlink f1"
	rmdir $DIR/$tdir/foo/d0 || error "(8) Fail to rmdir d0"
	rm -f $DIR/$tdir/foo/f0 || error "(9) Fail to unlink f0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	umount_client $MOUNT || error "(10) Fail to stop client!"

	stop $SINGLEMDS || error "(11) Fail to stop $SINGLEMDS"

	local dev=$(facet_device $SINGLEMDS)

	echo "run e2fsck on $SINGLEMDS"
	run_e2fsck $(facet_active_host $SINGLEMDS) $dev "-y" ||
		error "(12) Fail to run e2fsck"

	start_facet $SINGLEMDS "$MOUNT_OPTS_NOSCRUB" 13

	echo "Trigger namespace LFSCK to recover backend orphans"
	$START_NAMESPACE -r -A ||
		error "(14) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 15

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^local_lost_found_moved/ { print $2 }')
	[ $repaired -ge 4 ] ||
		error "(16) Fail to recover backend orphans: $repaired"

	mount_client $MOUNT || error "(17) Fail to start client!"

	stat $DIR/$tdir/foo/f0 || error "(18) f0 is not recovered"

	ls -ail $MOUNT/.lustre/lost+found/

	echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
	[ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
		error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"

	ls -ail $MOUNT/.lustre/lost+found/MDT0000/

	local cname=$MOUNT/.lustre/lost+found/MDT0000/${cfid}-${pfid}-D-0
	# cname is always a non-empty string, so test for the directory itself.
	[ -d "$cname" ] || error "(20) d0 is not recovered"

	stat ${cname}/d1 || error "(21) d1 is not recovered"
	stat ${cname}/f1 || error "(22) f1 is not recovered"
}
run_test 30 "LFSCK can recover the orphans from backend /lost+found"
# test_31a: create $MDSCOUNT sub-directories in a striped directory while the
# client has fail_loc=0x1628 fail_val=0 set, then check that the namespace
# LFSCK reports at least one name_hash_repaired and that every entry can be
# stat'ed and removed after a client remount. Needs >= 2 MDTs.
# NOTE(review): the function wrapper, the 'done' terminator and the condition
# guarding the "Fail to find flag bad type" error were not present in the
# reviewed text. Without a guard that error fires unconditionally. The guard
# restored below (at least one match) is an assumption -- confirm the exact
# expected count against upstream.
test_31a() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5519"

	echo "For the name entry under a striped directory, if the name"
	echo "hash does not match the shard, then the LFSCK will repair"
	echo "the bad name entry"

	check_mount_and_prep

	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"

	echo "Inject failure stub on client to simulate the case that"
	echo "some name entry should be inserted into other non-first"
	echo "shard, but inserted into the first shard by wrong"

	#define OBD_FAIL_LFSCK_BAD_NAME_HASH	0x1628
	$LCTL set_param fail_loc=0x1628 fail_val=0
	createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
		error "(2) Fail to create file under striped directory"
	$LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger namespace LFSCK to repair bad name hash"
	$START_NAMESPACE -r -A ||
		error "(3) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 4

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^name_hash_repaired/ { print $2 }')
	[ $repaired -ge 1 ] ||
		error "(5) Fail to repair bad name hash: $repaired"

	local rc=$($LFS find -H badtype $DIR/$tdir/striped_dir | wc -l)
	[ $rc -ge 1 ] ||
		error "Fail to find flag bad type: $rc"

	umount_client $MOUNT || error "(6) umount failed"
	mount_client $MOUNT || error "(7) mount failed"

	local i
	for ((i = 0; i < $MDSCOUNT; i++)); do
		stat $DIR/$tdir/striped_dir/d$i ||
			error "(8) Fail to stat d$i after LFSCK"
		rmdir $DIR/$tdir/striped_dir/d$i ||
			error "(9) Fail to unlink d$i after LFSCK"
	done

	rmdir $DIR/$tdir/striped_dir ||
		error "(10) Fail to remove the striped directory after LFSCK"
}
run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
# test_31b: create MDSCOUNT * 5 sub-directories in a striped directory while
# the client has fail_loc=0x1628 fail_val=1 set, then check that the namespace
# LFSCK on mds2 reports at least one name_hash_repaired and that every entry
# can be stat'ed and removed after a client remount. Needs >= 2 MDTs.
# NOTE(review): the function wrapper and the 'done' terminator were not
# present in the reviewed text and are restored here -- confirm against
# upstream.
test_31b() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5519"

	echo "For the name entry under a striped directory, if the name"
	echo "hash does not match the shard, then the LFSCK will repair"
	echo "the bad name entry"

	check_mount_and_prep

	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"

	echo "Inject failure stub on client to simulate the case that"
	echo "some name entry should be inserted into other non-second"
	echo "shard, but inserted into the secod shard by wrong"

	#define OBD_FAIL_LFSCK_BAD_NAME_HASH	0x1628
	$LCTL set_param fail_loc=0x1628 fail_val=1
	createmany -d $DIR/$tdir/striped_dir/d $((MDSCOUNT * 5)) ||
		error "(2) Fail to create file under striped directory"
	$LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger namespace LFSCK to repair bad name hash"
	$START_NAMESPACE -r -A ||
		error "(3) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 4

	# The counter is read from mds2 rather than from $SINGLEMDS here.
	local repaired=$(do_facet mds2 $LCTL get_param -n \
			 mdd.$(facet_svc mds2).lfsck_namespace |
			 awk '/^name_hash_repaired/ { print $2 }')
	echo "repaired $repaired name entries with bad hash"
	[ $repaired -ge 1 ] ||
		error "(5) Fail to repair bad name hash: $repaired"

	umount_client $MOUNT || error "(6) umount failed"
	mount_client $MOUNT || error "(7) mount failed"

	local i
	for ((i = 0; i < $((MDSCOUNT * 5)); i++)); do
		stat $DIR/$tdir/striped_dir/d$i ||
			error "(8) Fail to stat d$i after LFSCK"
		rmdir $DIR/$tdir/striped_dir/d$i ||
			error "(9) Fail to unlink d$i after LFSCK"
	done

	rmdir $DIR/$tdir/striped_dir ||
		error "(10) Fail to remove the striped directory after LFSCK"
}
run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
# test_31c: create a striped directory while fail_loc 0x1629 is set on the
# MDS, then check that the namespace LFSCK reports one striped_dirs_repaired,
# that 'lfs find -H lostlmv' matches once, and that the directory lists empty
# and can be removed after a client remount. Needs >= 2 MDTs.
# NOTE(review): the function wrapper and closing brace were not present in the
# reviewed text and are restored here -- confirm against upstream.
test_31c() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5519"

	echo "For some reason, the master MDT-object of the striped directory"
	echo "may lost its master LMV EA. If nobody created files under the"
	echo "master directly after the master LMV EA lost, then the LFSCK"
	echo "should re-generate the master LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "master MDT-object of the striped directory lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_MASTER_LMV	0x1629
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	echo "Trigger namespace LFSCK to re-generate master LMV EA"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 3

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^striped_dirs_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(4) Fail to re-generate master LMV EA: $repaired"

	local rc=$($LFS find -H lostlmv $DIR/$tdir/striped_dir | wc -l)
	[ $rc -eq 1 ] || error "Fail to find flag lost LMV: $rc"

	umount_client $MOUNT || error "(5) umount failed"
	mount_client $MOUNT || error "(6) mount failed"

	# Nothing was created under the directory, so the listing must be empty.
	local empty=$(ls $DIR/$tdir/striped_dir/)
	[ -z "$empty" ] || error "(7) The master LMV EA is not repaired: $empty"

	rmdir $DIR/$tdir/striped_dir ||
		error "(8) Fail to remove the striped directory after LFSCK"
}
run_test 31c "Re-generate the lost master LMV EA for striped directory"
# test_31d: create a striped directory while fail_loc 0x1629 is set on the
# MDS, create a file under it after a client remount, then check that the
# namespace LFSCK reports zero striped_dirs_repaired, that the existing file
# is still stat-able, that a new create fails, and that the directory can be
# removed after 'chattr -i'. Needs >= 2 MDTs.
# NOTE(review): the function wrapper and closing brace were not present in the
# reviewed text and are restored here -- confirm against upstream.
test_31d() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5519"

	echo "For some reason, the master MDT-object of the striped directory"
	echo "may lost its master LMV EA. If somebody created files under the"
	echo "master directly after the master LMV EA lost, then the LFSCK"
	echo "should NOT re-generate the master LMV EA, instead, it should"
	echo "change the broken striped dirctory as read-only to prevent"
	echo "further damage"

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "master MDT-object of the striped directory lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_MASTER_LMV	0x1629
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0

	umount_client $MOUNT || error "(2) umount failed"
	mount_client $MOUNT || error "(3) mount failed"

	touch $DIR/$tdir/striped_dir/dummy ||
		error "(4) Fail to touch under broken striped directory"

	echo "Trigger namespace LFSCK to find out the inconsistency"
	$START_NAMESPACE -r -A ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^striped_dirs_repaired/ { print $2 }')
	[ $repaired -eq 0 ] ||
		error "(7) Re-generate master LMV EA unexpected: $repaired"

	stat $DIR/$tdir/striped_dir/dummy ||
		error "(8) Fail to stat $DIR/$tdir/striped_dir/dummy"

	# A successful create here means the directory was not made read-only.
	touch $DIR/$tdir/striped_dir/foo &&
		error "(9) The broken striped directory should be read-only"

	chattr -i $DIR/$tdir/striped_dir ||
		error "(10) Fail to chattr on the broken striped directory"

	rmdir $DIR/$tdir/striped_dir ||
		error "(11) Fail to remove the striped directory after LFSCK"
}
run_test 31d "Set broken striped directory (modified after broken) as read-only"
# test_31e: create a striped directory while fail_loc=0x162a fail_val=0 is set
# on the MDS, then check that the namespace LFSCK on $SINGLEMDS reports one
# striped_shards_repaired and that the directory can be removed.
# Needs >= 2 MDTs.
# NOTE(review): the function wrapper and closing brace were not present in the
# reviewed text and are restored here -- confirm against upstream.
test_31e() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5519"

	echo "For some reason, the slave MDT-object of the striped directory"
	echo "may lost its slave LMV EA. The LFSCK should re-generate the"
	echo "slave LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "slave MDT-object (that resides on the same MDT as the master"
	echo "MDT-object resides on) lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_SLAVE_LMV	0x162a
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=0
	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to re-generate slave LMV EA"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 3

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^striped_shards_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(4) Fail to re-generate slave LMV EA: $repaired"

	rmdir $DIR/$tdir/striped_dir ||
		error "(5) Fail to remove the striped directory after LFSCK"
}
run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
# test_31f: create a striped directory while fail_loc=0x162a fail_val=1 is set
# on the MDS, then check that the namespace LFSCK on mds2 reports one
# striped_shards_repaired and that the directory can be removed.
# Needs >= 2 MDTs.
# NOTE(review): the function wrapper and closing brace were not present in the
# reviewed text and are restored here -- confirm against upstream.
test_31f() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5519"

	echo "For some reason, the slave MDT-object of the striped directory"
	echo "may lost its slave LMV EA. The LFSCK should re-generate the"
	echo "slave LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "slave MDT-object (that resides on different MDT as the master"
	echo "MDT-object resides on) lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_SLAVE_LMV	0x162a
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=1
	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to re-generate slave LMV EA"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 3

	# The counter is read from mds2 rather than from $SINGLEMDS here.
	local repaired=$(do_facet mds2 $LCTL get_param -n \
			 mdd.$(facet_svc mds2).lfsck_namespace |
			 awk '/^striped_shards_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(4) Fail to re-generate slave LMV EA: $repaired"

	rmdir $DIR/$tdir/striped_dir ||
		error "(5) Fail to remove the striped directory after LFSCK"
}
run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
# test_31g: create a striped directory while fail_loc=0x162b fail_val=0 is set
# on the MDS, then check that the namespace LFSCK reports one
# striped_shards_repaired and that, after a client remount, a file can be
# created and removed in the directory and the directory itself removed.
# Needs >= 2 MDTs.
# NOTE(review): the function wrapper and closing brace were not present in the
# reviewed text and are restored here -- confirm against upstream.
test_31g() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5519"

	echo "For some reason, the stripe index in the slave LMV EA is"
	echo "corrupted. The LFSCK should repair the slave LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "slave LMV EA on the first shard of the striped directory"
	echo "claims the same index as the second shard claims"

	#define OBD_FAIL_LFSCK_BAD_SLAVE_LMV	0x162b
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162b fail_val=0
	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to repair the slave LMV EA"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 3

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^striped_shards_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(4) Fail to repair slave LMV EA: $repaired"

	umount_client $MOUNT || error "(5) umount failed"
	mount_client $MOUNT || error "(6) mount failed"

	touch $DIR/$tdir/striped_dir/foo ||
		error "(7) Fail to touch file after the LFSCK"

	rm -f $DIR/$tdir/striped_dir/foo ||
		error "(8) Fail to unlink file after the LFSCK"

	rmdir $DIR/$tdir/striped_dir ||
		error "(9) Fail to remove the striped directory after LFSCK"
}
run_test 31g "Repair the corrupted slave LMV EA"
# test_31h: create a striped directory while fail_loc=0x162c fail_val=0 is set
# on the MDS, then check that the namespace LFSCK reports one dirent_repaired
# and that, after a client remount, a file can be created and removed in the
# directory and the directory itself removed. Needs >= 2 MDTs.
# NOTE(review): the function wrapper and closing brace were not present in the
# reviewed text and are restored here -- confirm against upstream.
test_31h() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5519"

	echo "For some reason, the shard's name entry in the striped"
	echo "directory may be corrupted. The LFSCK should repair the"
	echo "bad shard's name entry."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "first shard's name entry in the striped directory claims"
	echo "the same index as the second shard's name entry claims."

	#define OBD_FAIL_LFSCK_BAD_SLAVE_NAME	0x162c
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162c fail_val=0
	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to repair the shard's name entry"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 3

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^dirent_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(4) Fail to repair shard's name entry: $repaired"

	umount_client $MOUNT || error "(5) umount failed"
	mount_client $MOUNT || error "(6) mount failed"

	touch $DIR/$tdir/striped_dir/foo ||
		error "(7) Fail to touch file after the LFSCK"

	rm -f $DIR/$tdir/striped_dir/foo ||
		error "(8) Fail to unlink file after the LFSCK"

	rmdir $DIR/$tdir/striped_dir ||
		error "(9) Fail to remove the striped directory after LFSCK"
}
run_test 31h "Repair the corrupted shard's name entry"
# test_32a: start a layout LFSCK with fail_loc=0x162d fail_val=3 set on the
# MDS, check that it reports status 'scanning-phase1', stop ost1, and check
# that the LFSCK can still be stopped. ost1 is restarted at the end.
# NOTE(review): the function wrapper and closing brace were not present in the
# reviewed text and are restored here. A setup call before the umount and a
# delay before $STOP_LFSCK may also have been lost in extraction -- confirm
# against upstream before relying on this body.
test_32a() {
	umount_client $MOUNT

	#define OBD_FAIL_LFSCK_ENGINE_DELAY	0x162d
	do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
	$START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"

	local STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
	[ "$STATUS" == "scanning-phase1" ] ||
		error "(2) Expect 'scanning-phase1', but got '$STATUS'"

	stop ost1 > /dev/null || error "(3) Fail to stop OST1!"

	do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0

	$STOP_LFSCK || error "(4) Fail to stop LFSCK!"

	start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
		error "(5) Fail to start ost1"
}
run_test 32a "stop LFSCK when some OST failed"
# test_32b: start a namespace LFSCK on all targets with fail_loc=0x162d
# fail_val=3 set on the MDS, wait for status 'scanning-phase1', stop mds2, and
# check that the LFSCK can still be stopped. mds2 is restarted at the end.
# Needs >= 2 MDTs.
# NOTE(review): the function wrapper and the closing '}' of the '|| {' group
# were not present in the reviewed text and are restored here. The
# $SHOW_NAMESPACE dump inside that group is an assumption about the lost
# line. A setup call before the first mkdir and a delay before $STOP_LFSCK
# may also have been lost -- confirm against upstream.
test_32b() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return

	$LFS mkdir -i 1 $DIR/$tdir/dp ||
		error "(1) Fail to create $DIR/$tdir/dp"
	$LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc1 ||
		error "(2) Fail to create $DIR/$tdir/dp/dc1"
	$LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc2 ||
		error "(3) Fail to create $DIR/$tdir/dp/dc2"
	umount_client $MOUNT

	#define OBD_FAIL_LFSCK_ENGINE_DELAY	0x162d
	do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
	$START_NAMESPACE -r -A || error "(4) Fail to start LFSCK for namespace!"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "scanning-phase1" 32 || {
		# Print the LFSCK state before failing.
		$SHOW_NAMESPACE
		error "(5) unexpected status"
	}

	stop mds2 > /dev/null || error "(6) Fail to stop MDT2!"

	do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0

	$STOP_LFSCK || error "(7) Fail to stop LFSCK!"

	start mds2 $(mdsdevname 2) $MOUNT_OPTS_NOSCRUB > /dev/null ||
		error "(8) Fail to start MDT2"
}
run_test 32b "stop LFSCK when some MDT failed"
# test_33: check the 'param' line reported by LFSCK. A layout run started with
# '--dryrun -o -r' must report 'dryrun,all_targets,orphan', and a namespace
# run started with '-e abort -A -r' must report 'failout,all_targets'.
# NOTE(review): the function wrapper and closing brace were not present in the
# reviewed text and are restored here. A setup call at the top may also have
# been lost in extraction -- confirm against upstream.
test_33() {
	$START_LAYOUT --dryrun -o -r ||
		error "(1) Fail to start layout LFSCK"
	wait_all_targets_blocked layout completed 2

	local PARAMS=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
	[ "$PARAMS" == "dryrun,all_targets,orphan" ] ||
		error "(3) Expect 'dryrun,all_targets,orphan', got '$PARAMS'"

	$START_NAMESPACE -e abort -A -r ||
		error "(4) Fail to start namespace LFSCK"
	wait_all_targets_blocked namespace completed 5

	PARAMS=$($SHOW_NAMESPACE | awk '/^param/ { print $2 }')
	[ "$PARAMS" == "failout,all_targets" ] ||
		error "(6) Expect 'failout,all_targets', got '$PARAMS'"
}
run_test 33 "check LFSCK paramters"
# test_34: create a directory on MDT index 1 while fail_loc 0x1630 is set on
# the MDS, then check that a first namespace LFSCK reports one dirent_repaired
# and that a second run reports zero. Needs >= 2 MDTs and a ZFS MDT.
# NOTE(review): the function wrapper and the closing '}' of both '|| {' groups
# were not present in the reviewed text and are restored here. The
# $SHOW_NAMESPACE dumps inside those groups are an assumption about the lost
# lines. A setup call before the fault injection may also have been lost --
# confirm against upstream.
test_34() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
	[ "$mds1_FSTYPE" != zfs ] && skip "Only valid for ZFS backend"

	#define OBD_FAIL_LFSCK_NO_AGENTOBJ	0x1630
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1630
	$LFS mkdir -i 1 $DIR/$tdir/dummy ||
		error "(1) Fail to create $DIR/$tdir/dummy"

	do_facet $SINGLEMDS $LCTL set_param fail_loc=0
	$START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Print the LFSCK state before failing.
		$SHOW_NAMESPACE
		error "(3) unexpected status"
	}

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^dirent_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(4) Fail to repair the lost agent object: $repaired"

	# A second pass must find nothing left to repair.
	$START_NAMESPACE -r || error "(5) Fail to start LFSCK for namespace!"
	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Print the LFSCK state before failing.
		$SHOW_NAMESPACE
		error "(6) unexpected status"
	}

	repaired=$($SHOW_NAMESPACE | awk '/^dirent_repaired/ { print $2 }')
	[ $repaired -eq 0 ] ||
		error "(7) Unexpected repairing: $repaired"
}
run_test 34 "LFSCK can rebuild the lost agent object"
# test_35: create a directory on MDT index 1 while fail_loc 0x1631 is set on
# mds2, then check that a first namespace LFSCK reports one
# agent_entries_repaired on mds2 and that a second run, after the filesystem
# has been stopped and set up again, reports zero. Needs >= 2 MDTs.
#
# Changes from the reviewed text:
#  - errors (3) and (6) expanded ${k}, which is never assigned in this test,
#    so the MDS number was printed empty; both checks query mds2, so the
#    messages now say MDS2.
# NOTE(review): the function wrapper and closing brace were not present in the
# reviewed text and are restored here. The 'stopall' after the "stopall to
# cleanup object cache" message was also absent and is restored with the same
# redirection as the visible 'setupall'. A setup call before the fault
# injection may have been lost as well -- confirm against upstream.
test_35() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return

	#define OBD_FAIL_LFSCK_NO_AGENTENT	0x1631
	do_facet mds2 $LCTL set_param fail_loc=0x1631
	$LFS mkdir -i 1 $DIR/$tdir/dummy ||
		error "(1) Fail to create $DIR/$tdir/dummy"

	do_facet mds2 $LCTL set_param fail_loc=0
	$START_NAMESPACE -A -r || error "(2) Fail to start LFSCK for namespace!"
	wait_update_facet mds2 "$LCTL get_param -n \
		mdd.$(facet_svc mds2).lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
		error "(3) MDS2 is not the expected 'completed'"

	local repaired=$(do_facet mds2 $LCTL get_param -n \
			 mdd.$(facet_svc mds2).lfsck_namespace |
			 awk '/^agent_entries_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(4) Fail to repair the lost agent entry: $repaired"

	echo "stopall to cleanup object cache"
	stopall > /dev/null
	setupall > /dev/null

	# A second pass must find nothing left to repair.
	$START_NAMESPACE -A -r || error "(5) Fail to start LFSCK for namespace!"
	wait_update_facet mds2 "$LCTL get_param -n \
		mdd.$(facet_svc mds2).lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
		error "(6) MDS2 is not the expected 'completed'"

	repaired=$(do_facet mds2 $LCTL get_param -n \
		   mdd.$(facet_svc mds2).lfsck_namespace |
		   awk '/^agent_entries_repaired/ { print $2 }')
	[ $repaired -eq 0 ] ||
		error "(7) Unexpected repairing: $repaired"
}
run_test 35 "LFSCK can rebuild the lost agent entry"
# test_36a: create three files with three mirrors each, write and resync them,
# then split one mirror off each file while fail_loc 0x1616 is set on mds1.
# After checking that each file is down to two mirrors, run the layout LFSCK
# with orphan handling and check that every MDT and OST reports 'completed',
# that mds1 reports nine repaired_orphan, and that each file is back to three
# mirrors with the split mirror id present again. Needs >= 3 OSTs.
#
# Changes from the reviewed text:
#  - 'mirrors', 'k' and 'cur_status' are declared local once, so they no
#    longer leak into (or get re-declared in) the caller's scope.
# NOTE(review): the function wrapper, both 'done' terminators and the closing
# '}' of the three '|| {' groups were not present in the reviewed text and
# are restored here. Short commands between the mirror splits and the
# fail_loc reset (for example a sync or a delay) may also have been lost in
# extraction -- confirm against upstream.
test_36a() {
	[ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return

	echo "The target MDT-object's LOV EA corrupted as to lose one of the "
	echo "mirrors information. The layout LFSCK should rebuild the LOV EA "
	echo "with the PFID EA of related OST-object(s) belong to the mirror."

	check_mount_and_prep

	local mirrors
	local k
	local cur_status

	# Print the grant parameters now and again when the test unwinds.
	lctl get_param osc.*.*grant*
	stack_trap "lfs df $DIR; lfs df -i $DIR; lctl get_param osc.*.*grant*"

	$LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
		-N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f0 ||
		error "(0) Fail to create mirror file $DIR/$tdir/f0"
	$LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
		-N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f1 ||
		error "(1) Fail to create mirror file $DIR/$tdir/f1"
	$LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
		-N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f2 ||
		error "(2) Fail to create mirror file $DIR/$tdir/f2"

	dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
		error "(3) Fail to write $DIR/$tdir/f0"
	dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=4 ||
		error "(4) Fail to write $DIR/$tdir/f1"
	dd if=/dev/zero of=$DIR/$tdir/f2 bs=1M count=4 ||
		error "(5) Fail to write $DIR/$tdir/f2"

	$LFS mirror resync $DIR/$tdir/f0 ||
		error "(6) Fail to resync $DIR/$tdir/f0"
	$LFS mirror resync $DIR/$tdir/f1 ||
		error "(7) Fail to resync $DIR/$tdir/f1"
	$LFS mirror resync $DIR/$tdir/f2 ||
		error "(8) Fail to resync $DIR/$tdir/f2"

	cancel_lru_locks mdc
	cancel_lru_locks osc

	$LFS getstripe $DIR/$tdir/f0 ||
		error "(9) Fail to getstripe for $DIR/$tdir/f0"
	$LFS getstripe $DIR/$tdir/f1 ||
		error "(10) Fail to getstripe for $DIR/$tdir/f1"
	$LFS getstripe $DIR/$tdir/f2 ||
		error "(11) Fail to getstripe for $DIR/$tdir/f2"

	echo "Inject failure, to simulate the case of missing one mirror in LOV"
	#define OBD_FAIL_LFSCK_LOST_MDTOBJ	0x1616
	do_facet mds1 $LCTL set_param fail_loc=0x1616

	$LFS mirror split --mirror-id 1 -d $DIR/$tdir/f0 ||
		error "(12) Fail to split 1st mirror from $DIR/$tdir/f0"
	$LFS mirror split --mirror-id 2 -d $DIR/$tdir/f1 ||
		error "(13) Fail to split 2nd mirror from $DIR/$tdir/f1"
	$LFS mirror split --mirror-id 3 -d $DIR/$tdir/f2 ||
		error "(14) Fail to split 3rd mirror from $DIR/$tdir/f2"

	do_facet mds1 $LCTL set_param fail_loc=0

	# A grep match means the split mirror is still listed in the layout.
	$LFS getstripe $DIR/$tdir/f0 | grep "lcme_mirror_id:.*1" &&
		error "(15) The 1st of mirror is not destroyed"
	$LFS getstripe $DIR/$tdir/f1 | grep "lcme_mirror_id:.*2" &&
		error "(16) The 2nd of mirror is not destroyed"
	$LFS getstripe $DIR/$tdir/f2 | grep "lcme_mirror_id:.*3" &&
		error "(17) The 3rd of mirror is not destroyed"

	mirrors=$($LFS getstripe -N $DIR/$tdir/f0)
	[ $mirrors -eq 2 ] || error "(18) $DIR/$tdir/f0 has $mirrors mirrors"
	mirrors=$($LFS getstripe -N $DIR/$tdir/f1)
	[ $mirrors -eq 2 ] || error "(19) $DIR/$tdir/f1 has $mirrors mirrors"
	mirrors=$($LFS getstripe -N $DIR/$tdir/f2)
	[ $mirrors -eq 2 ] || error "(20) $DIR/$tdir/f2 has $mirrors mirrors"

	echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
	$START_LAYOUT -r -o || error "(21) Fail to start LFSCK for layout!"

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query internal is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" 32 ||
			error "(22) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		cur_status=$(do_facet ost${k} $LCTL get_param -n \
			     obdfilter.$(facet_svc ost${k}).lfsck_layout |
			     awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(23) OST${k} Expect 'completed', but got '$cur_status'"
	done

	local repaired=$(do_facet mds1 $LCTL get_param -n \
			 mdd.$(facet_svc mds1).lfsck_layout |
			 awk '/^repaired_orphan/ { print $2 }')
	[ $repaired -eq 9 ] ||
		error "(24) Expect 9 fixed on mds1, but got: $repaired"

	mirrors=$($LFS getstripe -N $DIR/$tdir/f0)
	[ $mirrors -eq 3 ] || error "(25) $DIR/$tdir/f0 has $mirrors mirrors"
	mirrors=$($LFS getstripe -N $DIR/$tdir/f1)
	[ $mirrors -eq 3 ] || error "(26) $DIR/$tdir/f1 has $mirrors mirrors"
	mirrors=$($LFS getstripe -N $DIR/$tdir/f2)
	[ $mirrors -eq 3 ] || error "(27) $DIR/$tdir/f2 has $mirrors mirrors"

	# On failure, print the full layout before reporting the error.
	$LFS getstripe $DIR/$tdir/f0 | grep "lcme_mirror_id:.*1" || {
		$LFS getstripe $DIR/$tdir/f0
		error "(28) The 1st of mirror is not recovered"
	}

	$LFS getstripe $DIR/$tdir/f1 | grep "lcme_mirror_id:.*2" || {
		$LFS getstripe $DIR/$tdir/f1
		error "(29) The 2nd of mirror is not recovered"
	}

	$LFS getstripe $DIR/$tdir/f2 | grep "lcme_mirror_id:.*3" || {
		$LFS getstripe $DIR/$tdir/f2
		error "(30) The 3rd of mirror is not recovered"
	}
}
run_test 36a "rebuild LOV EA for mirrored file (1)"
# Body of the test registered as "36b" by the run_test call at the end of this
# block (the function header/footer lines are not part of this excerpt).
# Flow: create a three-mirror composite file, write and resync it, remove it
# while fail_loc 0x1616 is set on mds1, then run layout LFSCK with "-r -o" and
# verify that the file shows up under .lustre/lost+found/MDT0000 with three
# mirrors and six components.
5653 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
# The layout created below places objects on OST indices 0, 1 and 2 explicitly.
5654 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5657 echo "The mirrored file lost its MDT-object, but relatd OST-objects "
5658 echo "are still there. The layout LFSCK should rebuild the LOV EA "
5659 echo "with the PFID EA of related OST-object(s) belong to the file. "
5662 check_mount_and_prep
# Three mirrors (-N), each with a 2-object component followed by a 1-object
# component: 3 mirrors * 3 objects = 9 OST objects, 6 components in total.
5664 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5665 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f0 ||
5666 error "(0) Fail to create mirror file $DIR/$tdir/f0"
# Remember the FID now; it is used later to build the lost+found entry name.
5668 local fid=$($LFS path2fid $DIR/$tdir/f0)
# Write 4MB so that every component is touched, then resync the mirrors.
5670 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5671 error "(1) Fail to write $DIR/$tdir/f0"
5672 $LFS mirror resync $DIR/$tdir/f0 ||
5673 error "(2) Fail to resync $DIR/$tdir/f0"
# presumably drops the client's cached MDC/OSC locks — confirm in
# test-framework.sh
5675 cancel_lru_locks mdc
5676 cancel_lru_locks osc
# Print the layout before the failure is injected (also acts as a sanity check).
5678 $LFS getstripe $DIR/$tdir/f0 ||
5679 error "(3) Fail to getstripe for $DIR/$tdir/f0"
5681 echo "Inject failure, to simulate the case of missing the MDT-object"
5682 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5683 do_facet mds1 $LCTL set_param fail_loc=0x1616
# The file is removed while the fail_loc is active on mds1.
5684 rm -f $DIR/$tdir/f0 || error "(4) Fail to remove $DIR/$tdir/f0"
# Clear the fail_loc again before LFSCK is started.
5688 do_facet mds1 $LCTL set_param fail_loc=0
5690 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5691 $START_LAYOUT -r -o || error "(5) Fail to start LFSCK for layout!"
# Wait on every MDT until lfsck_layout reports status "completed"; 32 is
# presumably the wait limit in seconds — TODO confirm against
# wait_update_facet.
5693 for k in $(seq $MDSCOUNT); do
5694 # The LFSCK status query interval is 30 seconds. For the case
5695 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5696 # time to guarantee the status sync up.
5697 wait_update_facet mds${k} "$LCTL get_param -n \
5698 mdd.$(facet_svc mds${k}).lfsck_layout |
5699 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5700 error "(6) MDS${k} is not the expected 'completed'"
# The OSTs are queried once each, without waiting; each must already report
# "completed".
5703 for k in $(seq $OSTCOUNT); do
5704 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5705 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5706 awk '/^status/ { print $2 }')
5707 [ "$cur_status" == "completed" ] ||
5708 error "(7) OST${k} Expect 'completed', but got '$cur_status'"
# repaired_orphan on mds1 must equal the 9 OST objects of the removed file.
5711 local count=$(do_facet mds1 $LCTL get_param -n \
5712 mdd.$(facet_svc mds1).lfsck_layout |
5713 awk '/^repaired_orphan/ { print $2 }')
5714 [ $count -eq 9 ] || error "(8) Expect 9 fixed on mds1, but got: $count"
# Path under lost+found where the test expects the rebuilt file: <fid>-R-0.
5716 local name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
# $count is reused for the mirror count and then the component count.
5717 count=$($LFS getstripe --mirror-count $name)
5718 [ $count -eq 3 ] || error "(9) $DIR/$tdir/f0 has $count mirrors"
5720 count=$($LFS getstripe --component-count $name)
5721 [ $count -eq 6 ] || error "(10) $DIR/$tdir/f0 has $count components"
# Each mirror id (1, 2, 3) must appear in the rebuilt layout; on a miss the
# full layout is printed before the error is raised.
5723 $LFS getstripe $name | grep "lcme_mirror_id:.*1" || {
5724 $LFS getstripe $name
5725 error "(11) The 1st of mirror is not recovered"
5728 $LFS getstripe $name | grep "lcme_mirror_id:.*2" || {
5729 $LFS getstripe $name
5730 error "(12) The 2nd of mirror is not recovered"
5733 $LFS getstripe $name | grep "lcme_mirror_id:.*3" || {
5734 $LFS getstripe $name
5735 error "(13) The 3rd of mirror is not recovered"
5738 run_test 36b "rebuild LOV EA for mirrored file (2)"
# Body of the test registered as "36c" by the run_test call at the end of this
# block (the function header/footer lines are not part of this excerpt).
# Flow: create a mirrored composite file, write + resync it, write again so
# that a mirror becomes stale, record the lcme_flags, remove the file while
# fail_loc 0x1616 is set on mds1, run layout LFSCK with "-r -o", and verify
# that the entry under .lustre/lost+found/MDT0000 has 2 mirrors, 4 components
# and the same lcme_flags as before.
5741 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
# The layout created below places objects on OST indices 0, 1 and 2 explicitly.
5742 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5745 echo "The mirrored file has been modified, not resynced yet, then "
5746 echo "lost its MDT-object, but relatd OST-objects are still there. "
5747 echo "The layout LFSCK should rebuild the LOV EA and relatd status "
5748 echo "with the PFID EA of related OST-object(s) belong to the file. "
5751 check_mount_and_prep
# The visible part of the command defines two mirrors (-N), each with a
# 2-object component and a 1-object component, i.e. 6 OST objects and
# 4 components; this matches the counts checked further down.
# NOTE(review): the continuation line of this command is not visible in this
# excerpt.
5753 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5755 error "(0) Fail to create mirror file $DIR/$tdir/f0"
# Remember the FID now; it is used later to build the lost+found entry name.
5757 local fid=$($LFS path2fid $DIR/$tdir/f0)
5759 # The 1st dd && resync makes all related OST-objects have been written
5760 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5761 error "(1.1) Fail to write $DIR/$tdir/f0"
5762 $LFS mirror resync $DIR/$tdir/f0 ||
5763 error "(1.2) Fail to resync $DIR/$tdir/f0"
5764 # The 2nd dd makes one mirror to be stale
5765 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5766 error "(1.3) Fail to write $DIR/$tdir/f0"
# presumably drops the client's cached MDC/OSC locks — confirm in
# test-framework.sh
5768 cancel_lru_locks mdc
5769 cancel_lru_locks osc
5771 $LFS getstripe $DIR/$tdir/f0 ||
5772 error "(2) Fail to getstripe for $DIR/$tdir/f0"
# Record the lcme_flags values found in the first and in the last 10 lines of
# the getstripe output; they are compared with the rebuilt file after LFSCK.
5774 local saved_flags1=$($LFS getstripe $DIR/$tdir/f0 | head -n 10 |
5775 awk '/lcme_flags/ { print $2 }')
5776 local saved_flags2=$($LFS getstripe $DIR/$tdir/f0 | tail -n 10 |
5777 awk '/lcme_flags/ { print $2 }')
5779 echo "Inject failure, to simulate the case of missing the MDT-object"
5780 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5781 do_facet mds1 $LCTL set_param fail_loc=0x1616
# The file is removed while the fail_loc is active on mds1.
5782 rm -f $DIR/$tdir/f0 || error "(3) Fail to remove $DIR/$tdir/f0"
# Clear the fail_loc again before LFSCK is started.
5786 do_facet mds1 $LCTL set_param fail_loc=0
5788 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5789 $START_LAYOUT -r -o || error "(4) Fail to start LFSCK for layout!"
# Wait on every MDT until lfsck_layout reports status "completed"; 32 is
# presumably the wait limit in seconds — TODO confirm against
# wait_update_facet.
5791 for k in $(seq $MDSCOUNT); do
5792 # The LFSCK status query interval is 30 seconds. For the case
5793 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5794 # time to guarantee the status sync up.
5795 wait_update_facet mds${k} "$LCTL get_param -n \
5796 mdd.$(facet_svc mds${k}).lfsck_layout |
5797 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5798 error "(5) MDS${k} is not the expected 'completed'"
# The OSTs are queried once each, without waiting; each must already report
# "completed".
5801 for k in $(seq $OSTCOUNT); do
5802 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5803 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5804 awk '/^status/ { print $2 }')
5805 [ "$cur_status" == "completed" ] ||
5806 error "(6) OST${k} Expect 'completed', but got '$cur_status'"
# repaired_orphan on mds1 must equal 6. The failure message used to say
# "Expect 9", which contradicted the value actually tested.
5809 local count=$(do_facet mds1 $LCTL get_param -n \
5810 mdd.$(facet_svc mds1).lfsck_layout |
5811 awk '/^repaired_orphan/ { print $2 }')
5812 [ $count -eq 6 ] || error "(7) Expect 6 fixed on mds1, but got: $count"
# Path under lost+found where the test expects the rebuilt file: <fid>-R-0.
5814 local name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
# $count is reused for the mirror count and then the component count.
5815 count=$($LFS getstripe --mirror-count $name)
5816 [ $count -eq 2 ] || error "(8) $DIR/$tdir/f0 has $count mirrors"
5818 count=$($LFS getstripe --component-count $name)
5819 [ $count -eq 4 ] || error "(9) $DIR/$tdir/f0 has $count components"
# The lcme_flags extracted the same way as before the removal must be
# unchanged; on a mismatch the full layout is printed before the error.
5821 local flags=$($LFS getstripe $name | head -n 10 |
5822 awk '/lcme_flags/ { print $2 }')
5823 [ "$flags" == "$saved_flags1" ] || {
5824 $LFS getstripe $name
5825 error "(10) expect flags $saved_flags1, got $flags"
5828 flags=$($LFS getstripe $name | tail -n 10 |
5829 awk '/lcme_flags/ { print $2 }')
5830 [ "$flags" == "$saved_flags2" ] || {
5831 $LFS getstripe $name
5832 error "(11) expect flags $saved_flags2, got $flags"
5835 run_test 36c "rebuild LOV EA for mirrored file (3)"
# Body of the test registered as "37" by the run_test call at the end of this
# block; several of its lines are not part of this excerpt.
# Visible flow: create directory d0 on MDT0, start a background multiop on it,
# start namespace LFSCK with "-r -A" and wait for all targets to report
# "completed".
5841 local t_dir="$DIR/$tdir/d0"
5842 check_mount_and_prep
5844 $LFS mkdir -i 0 $t_dir || error "(2) Fail to mkdir $t_dir on MDT0"
# presumably opens the directory in a background multiop process that pauses
# until signalled — confirm against multiop_bg_pause in test-framework.sh
5845 multiop_bg_pause $t_dir D_c || { error "multiop failed: $?"; return 1; }
# NOTE(review): $PID is not assigned in any line visible here; presumably it
# holds the background multiop pid — confirm. Also check whether the kill is
# reachable if error() terminates the test.
5849 $START_NAMESPACE -r -A || {
5850 error "(3) Fail to start LFSCK for namespace!"; kill -USR1 $PID; }
# presumably waits (limit 4) until every target reports namespace LFSCK
# status "completed" — TODO confirm
5852 wait_all_targets_blocked namespace completed 4
5857 run_test 37 "LFSCK must skip a ORPHAN"
# Body of the test registered as "38" by the run_test call at the end of this
# block (the function header/footer lines are not part of this excerpt).
# Flow: create a file with a foreign LOV EA, verify its fields, run namespace
# and layout LFSCK and require that neither repairs anything, then verify the
# same fields again and check that restriping, reading and writing the file
# still fail while removing it works.
# Skipped unless the MDS version is newer than 2.12.51.
5861 [[ "$MDS1_VERSION" -le $(version_code 2.12.51) ]] &&
5862 skip "Need MDS version newer than 2.12.51"
5864 test_mkdir $DIR/$tdir
# Two random UUIDs from the kernel; "uuid1@uuid2" is used as the foreign value.
5865 local uuid1=$(cat /proc/sys/kernel/random/uuid)
5866 local uuid2=$(cat /proc/sys/kernel/random/uuid)
5868 # create foreign file
5869 $LFS setstripe --foreign=none --flags 0xda05 \
5870 -x "${uuid1}@${uuid2}" $DIR/$tdir/$tfile ||
5871 error "$DIR/$tdir/$tfile: create failed"
# Pre-LFSCK checks: each grep looks for one expected field in the getstripe
# output (magic, length, type, flags, value).
5873 $LFS getstripe -v $DIR/$tdir/$tfile |
5874 grep "lfm_magic:.*0x0BD70BD0" ||
5875 error "$DIR/$tdir/$tfile: invalid LOV EA foreign magic"
5876 # lfm_length is LOV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
# 73 matches the length of "uuid1@uuid2", assuming the usual 36-character
# UUID text form (36 + 1 + 36).
5877 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_length:.*73" ||
5878 error "$DIR/$tdir/$tfile: invalid LOV EA foreign size"
5879 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_type:.*none" ||
5880 error "$DIR/$tdir/$tfile: invalid LOV EA foreign type"
5881 $LFS getstripe -v $DIR/$tdir/$tfile |
5882 grep "lfm_flags:.*0x0000DA05" ||
5883 error "$DIR/$tdir/$tfile: invalid LOV EA foreign flags"
5884 $LFS getstripe $DIR/$tdir/$tfile |
5885 grep "lfm_value:.*${uuid1}@${uuid2}" ||
5886 error "$DIR/$tdir/$tfile: invalid LOV EA foreign value"
5888 # modify striping should fail
5889 $LFS setstripe -c 2 $DIR/$tdir/$tfile &&
5890 error "$DIR/$tdir/$tfile: setstripe should fail"
# Namespace LFSCK pass, started with "-r -A".
5892 $START_NAMESPACE -r -A || error "Fail to start LFSCK for namespace"
5894 wait_all_targets_blocked namespace completed 1
5896 # check that "global" namespace_repaired == 0 !!!
# The pipeline is passed as a single quoted string to do_facet; the \\\$2
# escaping keeps awk's $2 from being expanded before awk sees it.
5897 local repaired=$(do_facet mds1 \
5898 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
5899 awk '/^namespace_repaired/ { print \\\$2 }'")
5900 [ $repaired -eq 0 ] ||
5901 error "(2) Expect no namespace repair, but got: $repaired"
# Layout LFSCK pass, started with "-A -r".
5903 $START_LAYOUT -A -r || error "Fail to start LFSCK for layout"
5905 wait_all_targets_blocked layout completed 2
5907 # check that "global" layout_repaired == 0 !!!
# $repaired is declared a second time and the "(2)" label is reused here.
5908 local repaired=$(do_facet mds1 \
5909 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
5910 awk '/^layout_repaired/ { print \\\$2 }'")
5911 [ $repaired -eq 0 ] ||
5912 error "(2) Expect no layout repair, but got: $repaired"
# Post-LFSCK checks: the same field checks as before the LFSCK runs.
5914 echo "post-lfsck checks of foreign file"
5916 $LFS getstripe -v $DIR/$tdir/$tfile |
5917 grep "lfm_magic:.*0x0BD70BD0" ||
5918 error "$DIR/$tdir/$tfile: invalid LOV EA foreign magic"
5919 # lfm_length is LOV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
5920 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_length:.*73" ||
5921 error "$DIR/$tdir/$tfile: invalid LOV EA foreign size"
5922 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_type:.*none" ||
5923 error "$DIR/$tdir/$tfile: invalid LOV EA foreign type"
5924 $LFS getstripe -v $DIR/$tdir/$tfile |
5925 grep "lfm_flags:.*0x0000DA05" ||
5926 error "$DIR/$tdir/$tfile: invalid LOV EA foreign flags"
5927 $LFS getstripe $DIR/$tdir/$tfile |
5928 grep "lfm_value:.*${uuid1}@${uuid2}" ||
5929 error "$DIR/$tdir/$tfile: invalid LOV EA foreign value"
5931 # modify striping should fail
5932 $LFS setstripe -c 2 $DIR/$tdir/$tfile &&
5933 error "$DIR/$tdir/$tfile: setstripe should fail"
# Reading from and writing to the foreign file are both expected to fail.
5936 cat $DIR/$tdir/$tfile && error "$DIR/$tdir/$tfile: read should fail"
5937 cat /etc/passwd > $DIR/$tdir/$tfile &&
5938 error "$DIR/$tdir/$tfile: write should fail"
5940 #remove foreign file
5941 rm $DIR/$tdir/$tfile ||
5942 error "$DIR/$tdir/$tfile: remove of foreign file has failed"
5944 run_test 38 "LFSCK does not break foreign file and reverse is also true"
# Body of the test registered as "39" by the run_test call at the end of this
# block (the function header/footer lines are not part of this excerpt).
# Flow: create a directory with a foreign LMV EA, verify its fields, check
# that file creation inside it fails while chmod/chown work, run namespace and
# layout LFSCK and require that neither repairs anything, then repeat the same
# checks and finally remove the directory.
# Skipped unless the MDS version is newer than 2.12.51.
5948 [[ "$MDS1_VERSION" -le $(version_code 2.12.51) ]] &&
5949 skip "Need MDS version newer than 2.12.51"
5951 test_mkdir $DIR/$tdir
# Two random UUIDs from the kernel; "uuid1@uuid2" is used as the foreign value.
5952 local uuid1=$(cat /proc/sys/kernel/random/uuid)
5953 local uuid2=$(cat /proc/sys/kernel/random/uuid)
5955 # create foreign dir
5956 $LFS mkdir --foreign=none --xattr="${uuid1}@${uuid2}" --flags=0xda05 \
5957 $DIR/$tdir/${tdir}2 ||
5958 error "$DIR/$tdir/${tdir}2: create failed"
# Pre-LFSCK checks: each grep looks for one expected field in the
# getdirstripe output (magic, length, type, flags, value).
5960 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
5961 grep "lfm_magic:.*0x0CD50CD0" ||
5962 error "$DIR/$tdir/${tdir}2: invalid LMV EA magic"
5963 # lfm_length is LMV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
5964 # - sizeof(lfm_type) - sizeof(lfm_flags)
# 73 matches the length of "uuid1@uuid2", assuming the usual 36-character
# UUID text form (36 + 1 + 36).
5965 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_length:.*73" ||
5966 error "$DIR/$tdir/${tdir}2: invalid LMV EA size"
5967 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_type:.*none" ||
5968 error "$DIR/$tdir/${tdir}2: invalid LMV EA type"
5969 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
5970 grep "lfm_flags:.*0x0000DA05" ||
5971 error "$DIR/$tdir/${tdir}2: invalid LMV EA flags"
5972 $LFS getdirstripe $DIR/$tdir/${tdir}2 |
5973 grep "lfm_value.*${uuid1}@${uuid2}" ||
5974 error "$DIR/$tdir/${tdir}2: invalid LMV EA value"
5976 # file create in dir should fail
# An unexpected success must go through error(); previously the message
# string itself was executed as a command because "error" was missing.
5977 touch $DIR/$tdir/${tdir}2/$tfile &&
5978 error "$DIR/${tdir}2: file create should fail"
# chmod and chown on the foreign directory are expected to succeed.
5981 chmod 777 $DIR/$tdir/${tdir}2 ||
5982 error "$DIR/${tdir}2: chmod failed"
5985 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/${tdir}2 ||
5986 error "$DIR/${tdir}2: chown failed"
# Namespace LFSCK pass, started with "-r -A".
5988 $START_NAMESPACE -r -A || error "Fail to start LFSCK for namespace"
5990 wait_all_targets_blocked namespace completed 1
5992 # check that "global" namespace_repaired == 0 !!!
# The pipeline is passed as a single quoted string to do_facet; the \\\$2
# escaping keeps awk's $2 from being expanded before awk sees it.
5993 local repaired=$(do_facet mds1 \
5994 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
5995 awk '/^namespace_repaired/ { print \\\$2 }'")
5996 [ $repaired -eq 0 ] ||
5997 error "(2) Expect nothing to be repaired, but got: $repaired"
# Layout LFSCK pass, started with "-A -r".
5999 $START_LAYOUT -A -r || error "Fail to start LFSCK for layout"
6001 wait_all_targets_blocked layout completed 2
6003 # check that "global" layout_repaired == 0 !!!
6004 local repaired=$(do_facet mds1 \
6005 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
6006 awk '/^layout_repaired/ { print \\\$2 }'")
6007 [ $repaired -eq 0 ] ||
6008 error "(2) Expect no layout repair, but got: $repaired"
# Post-LFSCK checks: the same checks as before the LFSCK runs.
6010 echo "post-lfsck checks of foreign dir"
6012 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
6013 grep "lfm_magic:.*0x0CD50CD0" ||
6014 error "$DIR/$tdir/${tdir}2: invalid LMV EA magic"
6015 # lfm_length is LMV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
6016 # - sizeof(lfm_type) - sizeof(lfm_flags)
6017 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_length:.*73" ||
6018 error "$DIR/$tdir/${tdir}2: invalid LMV EA size"
6019 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_type:.*none" ||
6020 error "$DIR/$tdir/${tdir}2: invalid LMV EA type"
6021 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
6022 grep "lfm_flags:.*0x0000DA05" ||
6023 error "$DIR/$tdir/${tdir}2: invalid LMV EA flags"
6024 $LFS getdirstripe $DIR/$tdir/${tdir}2 |
6025 grep "lfm_value.*${uuid1}@${uuid2}" ||
6026 error "$DIR/$tdir/${tdir}2: invalid LMV EA value"
6028 # file create in dir should fail
# Same fix as in the pre-LFSCK check: report through error().
6029 touch $DIR/$tdir/${tdir}2/$tfile &&
6030 error "$DIR/${tdir}2: file create should fail"
6033 chmod 777 $DIR/$tdir/${tdir}2 ||
6034 error "$DIR/${tdir}2: chmod failed"
6037 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/${tdir}2 ||
6038 error "$DIR/${tdir}2: chown failed"
# Finally the foreign directory must be removable with rmdir.
6041 rmdir $DIR/$tdir/${tdir}2 ||
6042 error "$DIR/$tdir/${tdir}2: remove of foreign dir has failed"
6044 run_test 39 "LFSCK does not break foreign dir and reverse is also true"
# Body of the test registered as "40a" by the run_test call at the end of this
# block (the function header/footer lines are not part of this excerpt).
# Flow: create dir1 on MDT index 1 with a composite default layout, create a
# file in it and record its layout, migrate dir1 to MDT index 0, run layout
# LFSCK on $MDT_DEV and check that the file's layout is unchanged.
6047 [[ $MDSCOUNT -ge 2 ]] || skip "needs >= 2 MDTs"
6049 check_mount_and_prep
# NOTE(review): the return codes of the mkdir, setstripe, touch, migrate and
# lfsck_start commands below are not checked.
6050 $LFS mkdir -i 1 $DIR/$tdir/dir1
6051 $LFS setstripe -E 1M -c1 -S 1M -E 128M -c2 -S 4M -E eof $DIR/$tdir/dir1
6053 touch $DIR/$tdir/dir1/f1
# Layout of the file before migration and LFSCK.
6054 local layout1=$(get_layout_param $DIR/$tdir/dir1/f1)
6056 echo "Migrate $DIR/$tdir/dir1 from MDT1 to MDT0"
6057 $LFS migrate -m 0 $DIR/$tdir/dir1
6059 echo "trigger LFSCK for layout"
6060 do_facet $SINGLEMDS $LCTL lfsck_start -M ${MDT_DEV} -t layout -r
# Wait until lfsck_layout on $SINGLEMDS reports status "completed"; 32 is
# presumably the wait limit in seconds — TODO confirm against
# wait_update_facet.
6062 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
6063 mdd.${MDT_DEV}.lfsck_layout |
6064 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
6066 error "(2) unexpected status"
# Layout of the same file after LFSCK; it must equal the one recorded earlier.
6069 local layout2=$(get_layout_param $DIR/$tdir/dir1/f1)
6071 [[ "$layout1" == "$layout2" ]] || error "layout lost after lfsck"
6073 run_test 40a "LFSCK correctly fixes lmm_oi in composite layout"
# Body of the test registered as "41" by the run_test call at the end of this
# block (the function header/footer lines are not part of this excerpt).
# Flow: enable lfsck debug messages on $SINGLEMDS, create a file with a
# two-component layout that uses the -z option, run LFSCK of all types with
# "-A -r -n on", and fail if the debug log contains any
# "lfsck_layout_verify_header" line.
# Save the current debug setting so it can be restored at the end.
6077 local old_debug=$(do_facet $SINGLEMDS $LCTL get_param -n debug)
6079 do_facet $SINGLEMDS $LCTL set_param debug=+lfsck
# NOTE(review): the return code of setstripe is not checked.
6080 $LFS setstripe -E 1G -z 64M -E -1 -z 128M $DIR/$tfile
# Dump the debug log and discard the output; presumably this empties the log
# so that only messages from the LFSCK run below remain — TODO confirm.
6081 do_facet $SINGLEMDS $LCTL dk > /dev/null
6083 echo "trigger LFSCK for SEL layout"
6084 do_facet $SINGLEMDS $LCTL lfsck_start -M ${MDT_DEV} -A -t all -r -n on
# Wait until lfsck_layout on $SINGLEMDS reports status "completed"; 32 is
# presumably the wait limit in seconds — TODO confirm against
# wait_update_facet.
6085 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
6086 mdd.${MDT_DEV}.lfsck_layout |
6087 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
6089 error "(2) unexpected status"
# Any debug-log line mentioning lfsck_layout_verify_header counts as a failure.
6092 local errors=$(do_facet $SINGLEMDS $LCTL dk |
6093 grep "lfsck_layout_verify_header")
6095 [[ "x$errors" == "x" ]] || {
6097 error "lfsck failed"
# Restore the debug setting saved at the start.
# NOTE(review): if error() terminates the test this restore is not reached —
# confirm.
6100 do_facet $SINGLEMDS "$LCTL set_param debug='$old_debug'"
6102 run_test 41 "SEL support in LFSCK"
# End-of-script teardown: put back the size/count settings that were saved in
# SAVED_MDSSIZE, SAVED_OSTSIZE and SAVED_OSTCOUNT near the top of this script,
# then reformat and clean up.
6104 # restore MDS/OST size and the OST count
6105 MDSSIZE=${SAVED_MDSSIZE}
6106 OSTSIZE=${SAVED_OSTSIZE}
6107 OSTCOUNT=${SAVED_OSTCOUNT}
6109 # cleanup the system at last
# REFORMAT="yes" is set only for this one command.
6110 REFORMAT="yes" cleanup_and_setup_lustre
6113 check_and_cleanup_lustre