# Suite setup: source the Lustre test framework, decide whether the suite
# should run at all, then reformat a small filesystem for the tests.
3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
11 LUSTRE=${LUSTRE:-$(dirname $0)/..}
12 . $LUSTRE/tests/test-framework.sh
16 # bug number for skipped test:
17 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT "
18 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
20 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
# Exit quietly (status 0) when require_dsh_mds fails.
23 require_dsh_mds || exit 0
# Skip the suite when check_versions fails or the MDS is older than 2.3.60.
27 if ! check_versions; then
28 skip "It is NOT necessary to test lfsck under interoperation mode"
32 (( $MDS1_VERSION >= $(version_code 2.3.60) )) ||
33 skip "Need MDS version at least 2.3.60"
# Remember the incoming sizes/count before they are overridden below.
37 SAVED_MDSSIZE=${MDSSIZE}
38 SAVED_OSTSIZE=${OSTSIZE}
39 SAVED_OSTCOUNT=${OSTCOUNT}
40 # use small MDS + OST size to speed formatting time
41 # do not use too small MDSSIZE/OSTSIZE, which affect the default journal size
43 [ "$mds1_FSTYPE" == zfs ] && MDSSIZE=300000
45 [ "$ost1_FSTYPE" == zfs ] && OSTSIZE=300000
47 # no need for too many OSTs; cap at 4 to reduce the format/start/stop overhead
49 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
51 # build up a clean test environment.
52 REFORMAT="yes" check_and_setup_lustre
# Pre-built command strings shared by the tests. Each one is expanded
# unquoted at the call site (e.g. "$START_NAMESPACE -r"), so extra lctl
# options can simply be appended.
#   START_*  start a namespace/layout LFSCK on the MDT, or layout on ost1
#   STOP_LFSCK  stop LFSCK on the MDT
#   SHOW_*   print the lfsck_namespace / lfsck_layout parameter
54 MDT_DEV=$(devicelabel $SINGLEMDS $(facet_device $SINGLEMDS))
55 OST_DEV="${FSNAME}-OST0000"
56 START_NAMESPACE="do_facet $SINGLEMDS \
57 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
58 START_LAYOUT="do_facet $SINGLEMDS \
59 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
60 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
61 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
62 SHOW_NAMESPACE="do_facet $SINGLEMDS \
63 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
64 SHOW_LAYOUT="do_facet $SINGLEMDS \
65 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
66 SHOW_LAYOUT_ON_OST="do_facet ost1 \
67 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# MDS mount option sets, all based on $MDS_MOUNT_OPTS plus user_xattr:
# plain, with "noscrub" (used when starting with OI scrub disabled), and
# with "skip_lfsck" (used when starting without resuming LFSCK).
68 MOUNT_OPTS_SCRUB="$MDS_MOUNT_OPTS -o user_xattr"
69 MOUNT_OPTS_NOSCRUB="$MDS_MOUNT_OPTS -o user_xattr,noscrub"
70 MOUNT_OPTS_SKIP_LFSCK="$MDS_MOUNT_OPTS -o user_xattr,skip_lfsck"
# Populate $DIR/$tdir with test data: copies of the test scripts, and, when
# $ndirs > 0, $ndirs directories (d*) and $ndirs files (f*), $nfiles files
# inside each d* directory (when $nfiles > 0), and $ndirs directories (e*).
# When $igif is non-empty, the objects are created while fail_loc=0x1504
# (OBD_FAIL_FID_IGIF) is set on $SINGLEMDS; the fail_loc is cleared again
# after "dummy" has been touched.
79 echo "preparing... $nfiles * $ndirs files will be created $(date)."
80 if [ ! -z $igif ]; then
81 #define OBD_FAIL_FID_IGIF 0x1504
82 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
85 cp $LUSTRE/tests/*.sh $DIR/$tdir/
86 if [ $ndirs -gt 0 ]; then
87 createmany -d $DIR/$tdir/d $ndirs
88 createmany -m $DIR/$tdir/f $ndirs
89 if [ $nfiles -gt 0 ]; then
90 for ((i = 0; i < $ndirs; i++)); do
91 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
92 /dev/null || error "createmany $nfiles"
95 createmany -d $DIR/$tdir/e $ndirs
98 if [ ! -z $igif ]; then
99 touch $DIR/$tdir/dummy
100 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
103 echo "prepared $(date)."
# Start $facet on its backing device with mount options $opts, discarding
# the command output; a failure is reported through error() tagged with $err.
110 local dev=$(facet_device $facet)
112 start $facet $dev $opts > /dev/null ||
113 error "($err) Fail to start $facet!"
# Run a read-only e2fsck ("-n") against the backing device of MDS facet
# $mds (callers pass $SINGLEMDS). Returns 0 immediately unless mds1 is
# ldiskfs. Otherwise the facet is stopped, checked, and restarted with
# $MOUNT_OPTS_NOSCRUB.
116 run_e2fsck_on_mds_facet() {
117 [ $mds1_FSTYPE == ldiskfs ] || return 0
# The target must be stopped before its device is checked.
121 stop $mds > /dev/null || error "(0) Fail to the stop $mds"
122 local host=$(facet_active_host $mds)
123 local dev=$(facet_device $mds)
125 run_e2fsck $host $dev "-n" |
127 run_e2fsck $host $dev "-n"
128 error "(2) Detected inconsistency on $mds"
130 start_facet $mds "$MOUNT_OPTS_NOSCRUB" 3
# Query LFSCK component $com on ${FSNAME}-MDT0000 (lfsck_query with -w) and
# require that the "${com}_mdts_${status}" counter equals $MDSCOUNT. On a
# mismatch the plain query output is dumped and the test fails, tagged $err.
133 wait_all_targets_blocked() {
138 local count=$(do_facet mds1 \
139 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 -w |
140 awk '/^${com}_mdts_${status}/ { print \\\$2 }'")
141 [[ $count -eq $MDSCOUNT ]] || {
142 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
143 error "($err) only $count of $MDSCOUNT MDTs are in ${status}"
# Polling variant: repeat the lfsck_query through wait_update_facet until the
# "${com}_mdts_${status}" counter equals $MDSCOUNT. $LTIME is passed as the
# wait limit (presumably seconds - confirm against wait_update_facet). On
# failure the query output is dumped and the test fails, tagged $err.
152 wait_update_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 |
153 awk '/^${com}_mdts_${status}/ { print \\\$2 }'" \
154 "$MDSCOUNT" $LTIME || {
155 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
156 error "($err) some MDTs are not in ${status}"
# Test 0: drive the namespace LFSCK by hand. With fail_loc 0x1600
# (OBD_FAIL_LFSCK_DELAY1) set, start it with a reset (-r) and expect
# "scanning-phase1"; stop it and expect "stopped"; start it again and expect
# "scanning-phase1" once more.
163 #define OBD_FAIL_LFSCK_DELAY1 0x1600
164 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
165 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
167 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
169 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
170 [ "$STATUS" == "scanning-phase1" ] ||
171 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
173 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
175 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
176 [ "$STATUS" == "stopped" ] ||
177 error "(6) Expect 'stopped', but got '$STATUS'"
179 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
181 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
182 [ "$STATUS" == "scanning-phase1" ] ||
183 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the injected fault and wait for the scan to report "completed".
185 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
186 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
187 mdd.${MDT_DEV}.lfsck_namespace |
188 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
190 error "(9) unexpected status"
193 local repaired=$($SHOW_NAMESPACE |
194 awk '/^updated_phase1/ { print $2 }')
195 [ $repaired -eq 0 ] ||
196 error "(10) Expect nothing to be repaired, but got: $repaired"
# A second full run (reset) must raise success_count by exactly one.
198 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
199 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
200 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
201 mdd.${MDT_DEV}.lfsck_namespace |
202 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
204 error "(12) unexpected status"
207 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
208 [ $((scanned1 + 1)) -eq $scanned2 ] ||
209 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
211 echo "stopall, should NOT crash LU-3649"
212 stopall || error "(14) Fail to stopall"
214 run_test 0 "Control LFSCK manually"
# Test 1a: create "dummy" while fail_loc 0x1501 (OBD_FAIL_FID_INDIR) is set
# on the MDS, then run a full namespace LFSCK and expect exactly one repair.
219 #define OBD_FAIL_FID_INDIR 0x1501
220 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
221 touch $DIR/$tdir/dummy
223 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
225 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
226 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
227 mdd.${MDT_DEV}.lfsck_namespace |
228 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
230 error "(4) unexpected status"
# Prefer the dirent_repaired counter; fall back to updated_phase1 when it is
# absent from the output.
233 local repaired=$($SHOW_NAMESPACE |
234 awk '/^dirent_repaired/ { print $2 }')
235 # for interop with old server
236 [ -z "$repaired" ] &&
237 repaired=$($SHOW_NAMESPACE |
238 awk '/^updated_phase1/ { print $2 }')
240 [ $repaired -eq 1 ] ||
241 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
243 run_e2fsck_on_mds_facet $SINGLEMDS
245 mount_client $MOUNT || error "(6) Fail to start client!"
# List the directory with fail_loc 0x1505 (OBD_FAIL_FID_LOOKUP) set; the
# listing must succeed now that the entry has been repaired.
247 #define OBD_FAIL_FID_LOOKUP 0x1505
248 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
249 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
251 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
253 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
# Test 1b (ldiskfs only): create "dummy" under fail_loc 0x1502
# (OBD_FAIL_FID_INLMA), then run a full namespace LFSCK while fail_loc 0x1506
# (OBD_FAIL_FID_NOLMA) is set and expect exactly one repair.
257 [ "$mds1_FSTYPE" != ldiskfs ] &&
258 skip "OI Scrub not implemented for ZFS"
262 #define OBD_FAIL_FID_INLMA 0x1502
263 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
264 touch $DIR/$tdir/dummy
266 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
268 #define OBD_FAIL_FID_NOLMA 0x1506
269 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
270 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
271 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
272 mdd.${MDT_DEV}.lfsck_namespace |
273 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
275 error "(4) unexpected status"
# Prefer the dirent_repaired counter; fall back to updated_phase1 when it is
# absent from the output.
278 local repaired=$($SHOW_NAMESPACE |
279 awk '/^dirent_repaired/ { print $2 }')
280 # for interop with old server
281 [ -z "$repaired" ] &&
282 repaired=$($SHOW_NAMESPACE |
283 awk '/^updated_phase1/ { print $2 }')
285 [ $repaired -eq 1 ] ||
286 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
288 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
289 run_e2fsck_on_mds_facet $SINGLEMDS
291 mount_client $MOUNT || error "(6) Fail to start client!"
# stat must succeed with fail_loc 0x1505 (OBD_FAIL_FID_LOOKUP) set.
293 #define OBD_FAIL_FID_LOOKUP 0x1505
294 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
295 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
297 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
299 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
# Test 1c: create "dummy" while fail_loc 0x1504 (OBD_FAIL_FID_IGIF) is set,
# then run a full namespace LFSCK and expect exactly one repair.
304 #define OBD_FAIL_FID_IGIF 0x1504
305 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
306 touch $DIR/$tdir/dummy
308 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
310 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
311 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
312 mdd.${MDT_DEV}.lfsck_namespace |
313 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
315 error "(4) unexpected status"
# Prefer the dirent_repaired counter; fall back to updated_phase1 when it is
# absent from the output.
318 local repaired=$($SHOW_NAMESPACE |
319 awk '/^dirent_repaired/ { print $2 }')
320 # for interop with old server
321 [ -z "$repaired" ] &&
322 repaired=$($SHOW_NAMESPACE |
323 awk '/^updated_phase1/ { print $2 }')
325 [ $repaired -eq 1 ] ||
326 error "(5) Fail to repair lost FID-in-dirent: $repaired"
328 run_e2fsck_on_mds_facet $SINGLEMDS
330 mount_client $MOUNT || error "(6) Fail to start client!"
# The directory listing must succeed with fail_loc 0x1505
# (OBD_FAIL_FID_LOOKUP) set.
332 #define OBD_FAIL_FID_LOOKUP 0x1505
333 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
334 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
336 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
338 run_test 1c "LFSCK can find out and repair lost FID-in-dirent"
# Test 1d (MDS >= 2.13.57, at least 2 MDTs): create a file, a local
# directory and a directory on MDT index 1 under $tdir, alter $tdir's
# trusted.lma xattr directly on the mounted MDT backend, then check that
# LFSCK brings the children's linkEA parent FID back in line with $tdir's FID.
341 [ $MDS1_VERSION -lt $(version_code 2.13.57) ] &&
342 skip "MDS older than 2.13.57"
343 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
347 touch $DIR/$tdir/$tfile
348 mkdir $DIR/$tdir/subdir
349 $LFS mkdir -i 1 $DIR/$tdir/remotedir
# Print the FID and linkEA contents before the corruption (output only).
350 $LFS path2fid $DIR/$tdir
351 ll_decode_linkea $DIR/$tdir/$tfile
352 ll_decode_linkea $DIR/$tdir/subdir
353 ll_decode_linkea $DIR/$tdir/remotedir
355 local mntpt=$(facet_mntpt mds1)
357 # unlink OI files to remove the stale entry
358 local saved_opts=$MDS_MOUNT_OPTS
361 mount_fstype mds1 $mntpt
362 # increase $tdir FID oid in LMA
# The sed expression turns the '0' that sits nine characters before the end
# of the hex dump line into '1'; setfattr --restore writes the result back.
363 do_facet mds1 "getfattr -d -m trusted.lma -e hex \
364 --absolute-names $mntpt/ROOT/$tdir | \
365 sed -E 's/0(.{8})$/1\1/' | setfattr --restore=-"
366 unmount_fstype mds1 $mntpt
369 # the FID oid in LMA was increased above, and it's not in OI table,
370 # run scrub first to generate mapping in OI, so the following namespace
371 # check can fix linkea correctly, this is not necessary normally.
372 do_facet mds1 $LCTL lfsck_start -M ${MDT_DEV} -t scrub ||
373 error "failed to start LFSCK for scrub!"
374 wait_update_facet mds1 "$LCTL get_param -n \
375 osd-*.$(facet_svc mds1).oi_scrub |
376 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
377 error "unexpected status"
# Namespace LFSCK with reset (-r) and -A, then wait for "completed".
379 $START_NAMESPACE -r -A || error "fail to start LFSCK for namespace!"
380 wait_update_facet mds1 "$LCTL get_param -n \
381 mdd.${MDT_DEV}.lfsck_namespace |
382 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
384 error "unexpected status"
386 $LFS path2fid $DIR/$tdir
387 ll_decode_linkea $DIR/$tdir/$tfile
388 ll_decode_linkea $DIR/$tdir/subdir
389 ll_decode_linkea $DIR/$tdir/remotedir
# Every child's parent FID (third field of the "pfid" line printed by
# ll_decode_linkea) must equal the FID now reported for $tdir.
394 fid=$($LFS path2fid $DIR/$tdir)
395 for f in $tfile subdir remotedir; do
396 pfid=$(ll_decode_linkea $DIR/$tdir/$f |
397 awk '/pfid/ { print $3 }')
399 [ "$pfid" == "$fid" ] || error "$fid in LMA != $pfid in linkea"
402 run_test 1d "LFSCK can fix mismatch of FID in LMA and FID in child linkea"
# Test 2a: create "dummy" while fail_loc 0x1603 (OBD_FAIL_LFSCK_LINKEA_CRASH)
# is set, run a full namespace LFSCK, and expect exactly one linkEA repair.
407 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
408 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
409 touch $DIR/$tdir/dummy
411 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
413 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
414 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
415 mdd.${MDT_DEV}.lfsck_namespace |
416 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
418 error "(4) unexpected status"
# Prefer the linkea_repaired counter; fall back to updated_phase2 when it is
# absent from the output.
421 local repaired=$($SHOW_NAMESPACE |
422 awk '/^linkea_repaired/ { print $2 }')
423 # for interop with old server
424 [ -z "$repaired" ] &&
425 repaired=$($SHOW_NAMESPACE |
426 awk '/^updated_phase2/ { print $2 }')
428 [ $repaired -eq 1 ] ||
429 error "(5) Fail to repair crashed linkEA: $repaired"
431 run_e2fsck_on_mds_facet $SINGLEMDS
433 mount_client $MOUNT || error "(6) Fail to start client!"
# After the repair the file must show one link and fid2path must map its FID
# back to the original path.
435 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
436 error "(7) Fail to stat $DIR/$tdir/dummy"
438 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
439 local dummyname=$($LFS fid2path $DIR $dummyfid)
440 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
441 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
443 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
# Test 2b: create "dummy" while fail_loc 0x1604 (OBD_FAIL_LFSCK_LINKEA_MORE)
# is set, run a full namespace LFSCK, and expect updated_phase2 to be 1.
449 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
450 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
451 touch $DIR/$tdir/dummy
453 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
455 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
456 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
457 mdd.${MDT_DEV}.lfsck_namespace |
458 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
460 error "(4) unexpected status"
463 local repaired=$($SHOW_NAMESPACE |
464 awk '/^updated_phase2/ { print $2 }')
465 [ $repaired -eq 1 ] ||
466 error "(5) Fail to repair crashed linkEA: $repaired"
468 run_e2fsck_on_mds_facet $SINGLEMDS
470 mount_client $MOUNT || error "(6) Fail to start client!"
# The file must show one link and fid2path must map its FID back to the
# original path.
472 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
473 error "(7) Fail to stat $DIR/$tdir/dummy"
475 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
476 local dummyname=$($LFS fid2path $DIR $dummyfid)
477 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
478 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
480 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
# Test 2c (MDS newer than 2.4.90): create "dummy" while fail_loc 0x1605
# (OBD_FAIL_LFSCK_LINKEA_MORE2) is set, run a full namespace LFSCK, and
# expect updated_phase2 to be 1.
484 (( $MDS1_VERSION > $(version_code 2.4.90) )) ||
485 skip "MDS older than 2.4.90"
489 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
490 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
491 touch $DIR/$tdir/dummy
493 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
495 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
496 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
497 mdd.${MDT_DEV}.lfsck_namespace |
498 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
500 error "(4) unexpected status"
503 local repaired=$($SHOW_NAMESPACE |
504 awk '/^updated_phase2/ { print $2 }')
505 [ $repaired -eq 1 ] ||
506 error "(5) Fail to repair crashed linkEA: $repaired"
508 run_e2fsck_on_mds_facet $SINGLEMDS
510 mount_client $MOUNT || error "(6) Fail to start client!"
# The file must show one link and fid2path must map its FID back to the
# original path.
512 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
513 error "(7) Fail to stat $DIR/$tdir/dummy"
515 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
516 local dummyname=$($LFS fid2path $DIR $dummyfid)
517 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
518 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
520 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
# Test 2d (MDS newer than 2.6.50): create "dummy" while fail_loc 0x161d
# (OBD_FAIL_LFSCK_NO_LINKEA) is set, run a full namespace LFSCK, and expect
# linkea_repaired to be 1.
524 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
525 skip "MDS older than 2.6.50, LU-4788"
529 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
530 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
531 touch $DIR/$tdir/dummy
533 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
535 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
536 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
537 mdd.${MDT_DEV}.lfsck_namespace |
538 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
540 error "(4) unexpected status"
543 local repaired=$($SHOW_NAMESPACE |
544 awk '/^linkea_repaired/ { print $2 }')
545 [ $repaired -eq 1 ] ||
546 error "(5) Fail to repair crashed linkEA: $repaired"
548 run_e2fsck_on_mds_facet $SINGLEMDS
550 mount_client $MOUNT || error "(6) Fail to start client!"
# The file must show one link and fid2path must map its FID back to the
# original path.
552 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
553 error "(7) Fail to stat $DIR/$tdir/dummy"
555 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
556 local dummyname=$($LFS fid2path $DIR $dummyfid)
557 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
558 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
560 run_test 2d "LFSCK can recover the missing linkEA entry"
# Test 2e (at least 2 MDTs, MDS newer than 2.6.50): create d0 on MDT index 1
# and, with fail_loc 0x1603 (OBD_FAIL_LFSCK_LINKEA_CRASH) set, d0/d1 on MDT
# index 0. A namespace LFSCK started with -r -A must then report one linkEA
# repair, and fid2path must resolve d1's FID to its path.
564 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
565 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
566 skip "MDS older than 2.6.50, LU-5511"
570 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
572 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
573 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
574 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
575 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
577 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
# Require all $MDSCOUNT MDTs to report "completed" (error tag 4).
579 wait_all_targets_blocked namespace completed 4
581 local repaired=$($SHOW_NAMESPACE |
582 awk '/^linkea_repaired/ { print $2 }')
583 [ $repaired -eq 1 ] ||
584 error "(5) Fail to repair crashed linkEA: $repaired"
586 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
587 local name=$($LFS fid2path $DIR $fid)
588 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
589 error "(6) Fail to repair linkEA: $fid $name"
591 run_test 2e "namespace LFSCK can verify remote object linkEA"
# Test 3 (MDS newer than 2.6.50): build several hard-linked files, two of
# the links being created under injected faults (0x1603, then 0x1604), and
# verify that a full namespace LFSCK checks at least 4 objects in phase 2
# and reports at least 2 multiple-linked repairs.
595 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
596 skip "MDS older than 2.6.50, LU-4788"
600 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
601 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
602 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
604 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
605 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
606 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
608 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
609 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
610 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
612 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
613 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
614 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
616 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
618 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
619 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
620 mdd.${MDT_DEV}.lfsck_namespace |
621 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
623 error "(10) unexpected status"
626 local checked=$($SHOW_NAMESPACE |
627 awk '/^checked_phase2/ { print $2 }')
628 [ $checked -ge 4 ] ||
629 error "(11) Fail to check multiple-linked object: $checked"
631 local repaired=$($SHOW_NAMESPACE |
632 awk '/^multiple_linked_repaired/ { print $2 }')
633 [ $repaired -ge 2 ] ||
634 error "(12) Fail to repair multiple-linked object: $repaired"
636 run_test 3 "LFSCK can verify multiple-linked objects"
# Test 4 (ldiskfs only): after mds_backup_restore on the MDS, start it with
# $MOUNT_OPTS_NOSCRUB and run a full namespace LFSCK under fail_loc 0x1601
# (OBD_FAIL_LFSCK_DELAY2). Expect flags "inconsistent" while the status is
# "scanning-phase1", then "completed" with empty flags and at least 9 repairs.
640 [ "$mds1_FSTYPE" != ldiskfs ] &&
641 skip "OI Scrub not implemented for ZFS"
644 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
645 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop $SINGLEMDS!"
647 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
648 echo "start $SINGLEMDS with disabling OI scrub"
649 start_facet $SINGLEMDS "$MOUNT_OPTS_NOSCRUB" 2
651 #define OBD_FAIL_LFSCK_DELAY2 0x1601
652 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
653 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
654 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
655 mdd.${MDT_DEV}.lfsck_namespace |
656 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
658 error "(5) unexpected status"
661 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
662 [ "$STATUS" == "scanning-phase1" ] ||
663 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the injected delay and let the scan run to completion.
665 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
666 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
667 mdd.${MDT_DEV}.lfsck_namespace |
668 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
670 error "(7) unexpected status"
673 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
674 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
# Prefer the dirent_repaired counter; fall back to updated_phase1 when it is
# absent from the output.
676 local repaired=$($SHOW_NAMESPACE |
677 awk '/^dirent_repaired/ { print $2 }')
678 # for interop with old server
679 [ -z "$repaired" ] &&
680 repaired=$($SHOW_NAMESPACE |
681 awk '/^updated_phase1/ { print $2 }')
683 [ $repaired -ge 9 ] ||
684 error "(9) Fail to re-generate FID-in-dirent: $repaired"
686 run_e2fsck_on_mds_facet $SINGLEMDS
688 mount_client $MOUNT || error "(10) Fail to start client!"
690 #define OBD_FAIL_FID_LOOKUP 0x1505
691 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
692 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
693 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
695 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
# Test 5 (ldiskfs only): same flow as the backup/restore test, but
# mds_backup_restore is called with a second argument of 1 and the flags
# expected during phase 1 are "inconsistent,upgrade". At least 2 repairs are
# required, and afterwards "dummy" must be reachable by stat, by directory
# listing, and by fid2path.
699 [ "$mds1_FSTYPE" != ldiskfs ] &&
700 skip "OI Scrub not implemented for ZFS"
703 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
704 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop $SINGLEMDS!"
706 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
707 echo "start $SINGLEMDS with disabling OI scrub"
708 start_facet $SINGLEMDS "$MOUNT_OPTS_NOSCRUB" 2
710 #define OBD_FAIL_LFSCK_DELAY2 0x1601
711 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
712 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
713 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
714 mdd.${MDT_DEV}.lfsck_namespace |
715 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
717 error "(5) unexpected status"
720 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
721 [ "$STATUS" == "scanning-phase1" ] ||
722 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the injected delay and let the scan run to completion.
724 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
725 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
726 mdd.${MDT_DEV}.lfsck_namespace |
727 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
729 error "(7) unexpected status"
732 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
733 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
# Prefer the dirent_repaired counter; fall back to updated_phase1 when it is
# absent from the output.
735 local repaired=$($SHOW_NAMESPACE |
736 awk '/^dirent_repaired/ { print $2 }')
737 # for interop with old server
738 [ -z "$repaired" ] &&
739 repaired=$($SHOW_NAMESPACE |
740 awk '/^updated_phase1/ { print $2 }')
742 [ $repaired -ge 2 ] ||
743 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
745 run_e2fsck_on_mds_facet $SINGLEMDS
747 mount_client $MOUNT || error "(10) Fail to start client!"
749 #define OBD_FAIL_FID_LOOKUP 0x1505
750 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
751 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
753 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
755 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
756 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
757 local dummyname=$($LFS fid2path $DIR $dummyfid)
758 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
759 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
761 run_test 5 "LFSCK can handle IGIF object upgrading"
# Test 6a: start a full namespace LFSCK under fail_loc 0x1600
# (OBD_FAIL_LFSCK_DELAY1), force it to "failed" with fail_loc 0x80001608,
# record last_checkpoint_position, restart without reset, and require that
# latest_start_position is larger than the recorded checkpoint.
766 #define OBD_FAIL_LFSCK_DELAY1 0x1600
767 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
768 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
770 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
771 [ "$STATUS" == "scanning-phase1" ] ||
772 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
774 # Sleep 3 sec to guarantee at least one object processed by LFSCK
776 # Fail the LFSCK to guarantee there is at least one checkpoint
777 #define OBD_FAIL_LFSCK_FATAL1 0x1608
778 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
779 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
780 mdd.${MDT_DEV}.lfsck_namespace |
781 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
783 error "(4) unexpected status"
786 local POS0=$($SHOW_NAMESPACE |
787 awk '/^last_checkpoint_position/ { print $2 }' |
790 #define OBD_FAIL_LFSCK_DELAY1 0x1600
791 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
792 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
794 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
795 [ "$STATUS" == "scanning-phase1" ] ||
796 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
798 local POS1=$($SHOW_NAMESPACE |
799 awk '/^latest_start_position/ { print $2 }' |
801 [[ $POS0 -lt $POS1 ]] ||
802 error "(7) Expect larger than: $POS0, but got $POS1"
# Clear the injected delay and let the resumed scan run to completion.
804 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
805 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
806 mdd.${MDT_DEV}.lfsck_namespace |
807 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
809 error "(8) unexpected status"
812 run_test 6a "LFSCK resumes from last checkpoint (1)"
# Test 6b: like the first checkpoint-resume test, but using fail_loc 0x1601
# (OBD_FAIL_LFSCK_DELAY2) and 0x80001609 (OBD_FAIL_LFSCK_FATAL2). Two fields
# of the position lines are recorded: field 2 (O_POS*) and field 4 (D_POS*).
817 #define OBD_FAIL_LFSCK_DELAY2 0x1601
818 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
819 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
821 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
822 [ "$STATUS" == "scanning-phase1" ] ||
823 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
825 # Sleep 5 sec to guarantee that we are in the directory scanning
827 # Fail the LFSCK to guarantee there is at least one checkpoint
828 #define OBD_FAIL_LFSCK_FATAL2 0x1609
829 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
830 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
831 mdd.${MDT_DEV}.lfsck_namespace |
832 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
834 error "(4) unexpected status"
837 local O_POS0=$($SHOW_NAMESPACE |
838 awk '/^last_checkpoint_position/ { print $2 }' |
841 local D_POS0=$($SHOW_NAMESPACE |
842 awk '/^last_checkpoint_position/ { print $4 }')
844 #define OBD_FAIL_LFSCK_DELAY2 0x1601
845 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
846 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
848 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
849 [ "$STATUS" == "scanning-phase1" ] ||
850 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
852 local O_POS1=$($SHOW_NAMESPACE |
853 awk '/^latest_start_position/ { print $2 }' |
855 local D_POS1=$($SHOW_NAMESPACE |
856 awk '/^latest_start_position/ { print $4 }')
858 echo "Additional debug for 6b"
# When either D_POS value is "N/A" or "0x0", check (7.1) compares the O_POS
# values; check (7.2) compares the D_POS values.
860 if [ "$D_POS0" == "N/A" -o "$D_POS0" == "0x0" \
861 -o "$D_POS1" == "0x0" -o "$D_POS1" == "N/A" ]; then
862 [[ $O_POS0 -lt $O_POS1 ]] ||
863 error "(7.1) $O_POS1 is not larger than $O_POS0"
865 [[ $D_POS0 -lt $D_POS1 ]] ||
866 error "(7.2) $D_POS1 is not larger than $D_POS0"
869 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
870 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
871 mdd.${MDT_DEV}.lfsck_namespace |
872 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
874 error "(8) unexpected status"
877 run_test 6b "LFSCK resumes from last checkpoint (2)"
# Test 7a: start a full namespace LFSCK under fail_loc 0x1601
# (OBD_FAIL_LFSCK_DELAY2), stop the MDS while the status is
# "scanning-phase1", restart it with $MOUNT_OPTS_SCRUB, and expect the status
# to become "completed" without LFSCK being started again by the test.
884 #define OBD_FAIL_LFSCK_DELAY2 0x1601
885 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
886 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
888 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
889 [ "$STATUS" == "scanning-phase1" ] ||
890 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
892 # Sleep 3 sec to guarantee at least one object processed by LFSCK
894 echo "stop $SINGLEMDS"
895 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop $SINGLEMDS!"
897 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
898 echo "start $SINGLEMDS"
899 start_facet $SINGLEMDS "$MOUNT_OPTS_SCRUB" 5
901 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
902 mdd.${MDT_DEV}.lfsck_namespace |
903 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
905 error "(6) unexpected status"
908 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
# Test 7b: create 20 files under fail_loc 0x1604
# (OBD_FAIL_LFSCK_LINKEA_MORE), start a full namespace LFSCK under fail_loc
# 0x1602 (OBD_FAIL_LFSCK_DELAY3), and wait for "scanning-phase2". Then stop
# the MDS, restart it with $MOUNT_OPTS_SCRUB, and expect "completed" without
# LFSCK being started again by the test.
914 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
915 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
916 for ((i = 0; i < 20; i++)); do
917 touch $DIR/$tdir/dummy${i}
920 #define OBD_FAIL_LFSCK_DELAY3 0x1602
921 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
922 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
923 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
924 mdd.${MDT_DEV}.lfsck_namespace |
925 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
927 error "(4) unexpected status"
931 echo "stop $SINGLEMDS"
932 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop $SINGLEMDS!"
934 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
935 echo "start $SINGLEMDS"
936 start_facet $SINGLEMDS "$MOUNT_OPTS_SCRUB" 6
938 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
939 mdd.${MDT_DEV}.lfsck_namespace |
940 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
942 error "(7) unexpected status"
945 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
956 formatall > /dev/null
962 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
963 [ "$STATUS" == "init" ] ||
964 namespace_error "(2) Expect 'init', but got '$STATUS'"
966 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
967 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
968 mkdir $DIR/$tdir/crashed
970 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
971 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
972 for ((i = 0; i < 5; i++)); do
973 touch $DIR/$tdir/dummy${i}
976 umount_client $MOUNT || error "(3) Fail to stop client!"
978 #define OBD_FAIL_LFSCK_DELAY2 0x1601
979 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
981 namespace_error "(4) Fail to start LFSCK for namespace!"
983 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
984 [ "$STATUS" == "scanning-phase1" ] ||
985 namespace_error "(5) Expect 'scanning-phase1', but got '$STATUS'"
987 $STOP_LFSCK || namespace_error "(6) Fail to stop LFSCK!"
989 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
990 [ "$STATUS" == "stopped" ] ||
991 namespace_error "(7) Expect 'stopped', but got '$STATUS'"
994 namespace_error "(8) Fail to start LFSCK for namespace!"
996 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
997 [ "$STATUS" == "scanning-phase1" ] ||
998 namespace_error "(9) Expect 'scanning-phase1', but got '$STATUS'"
1000 #define OBD_FAIL_LFSCK_FATAL2 0x1609
1001 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
1002 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1003 mdd.${MDT_DEV}.lfsck_namespace |
1004 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
1006 namespace_error "(10) unexpected status"
1009 #define OBD_FAIL_LFSCK_DELAY1 0x1600
1010 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
1012 namespace_error "(11) Fail to start LFSCK for namespace!"
1014 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1015 [ "$STATUS" == "scanning-phase1" ] ||
1016 namespace_error "(12) Expect 'scanning-phase1', but got '$STATUS'"
1018 #define OBD_FAIL_LFSCK_CRASH 0x160a
1019 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
1022 echo "stop $SINGLEMDS"
1023 stop $SINGLEMDS > /dev/null || namespace_error "(13) Fail to stop MDS!"
1025 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
1026 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
1028 echo "start $SINGLEMDS"
1029 start_facet $SINGLEMDS "$MOUNT_OPTS_SCRUB" 14
1031 local timeout=$(max_recovery_time)
1034 while [ $timer -lt $timeout ]; do
1035 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1036 mdt.${MDT_DEV}.recovery_status |
1037 awk '/^status/ { print \\\$2 }'")
1038 [ "$STATUS" != "RECOVERING" ] && break;
1040 timer=$((timer + 1))
1043 [ $timer != $timeout ] || (
1044 do_facet $SINGLEMDS "$LCTL get_param -n \
1045 mdt.${MDT_DEV}.recovery_status"
1046 error "(14.1) recovery timeout"
1049 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1050 [ "$STATUS" == "crashed" ] ||
1051 namespace_error "(15) Expect 'crashed', but got '$STATUS'"
1053 #define OBD_FAIL_LFSCK_DELAY2 0x1601
1054 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
1056 namespace_error "(16) Fail to start LFSCK for namespace!"
1058 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1059 [ "$STATUS" == "scanning-phase1" ] ||
1060 namespace_error "(17) Expect 'scanning-phase1', but got '$STATUS'"
1062 echo "stop $SINGLEMDS"
1063 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop $SINGLEMDS!"
1065 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
1066 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
1068 echo "start $SINGLEMDS"
1069 start_facet $SINGLEMDS "$MOUNT_OPTS_SCRUB" 19
1072 while [ $timer -lt $timeout ]; do
1073 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1074 mdt.${MDT_DEV}.recovery_status |
1075 awk '/^status/ { print \\\$2 }'")
1076 [ "$STATUS" != "RECOVERING" ] && break;
1078 timer=$((timer + 1))
1081 [ $timer != $timeout ] || (
1082 do_facet $SINGLEMDS "$LCTL get_param -n \
1083 mdt.${MDT_DEV}.recovery_status"
1084 error "(19.1) recovery timeout"
1087 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1088 [ "$STATUS" == "paused" ] ||
1089 namespace_error "(20) Expect 'paused', but got '$STATUS'"
1091 echo "stop $SINGLEMDS"
1092 stop $SINGLEMDS > /dev/null || error "(20.1) Fail to stop MDS!"
1094 echo "start $SINGLEMDS without resume LFSCK"
1095 start_facet $SINGLEMDS "$MOUNT_OPTS_SKIP_LFSCK" 20.2
1098 while [ $timer -lt $timeout ]; do
1099 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1100 mdt.${MDT_DEV}.recovery_status |
1101 awk '/^status/ { print \\\$2 }'")
1102 [ "$STATUS" != "RECOVERING" ] && break;
1104 timer=$((timer + 1))
1107 [ $timer != $timeout ] || (
1108 do_facet $SINGLEMDS "$LCTL get_param -n \
1109 mdt.${MDT_DEV}.recovery_status"
1110 error "(20.3) recovery timeout"
1113 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1114 [ "$STATUS" == "paused" ] ||
1115 namespace_error "(20.4) Expect 'paused', but got '$STATUS'"
1117 #define OBD_FAIL_LFSCK_DELAY3 0x1602
1118 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
1121 namespace_error "(21) Fail to start LFSCK for namespace!"
1122 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1123 mdd.${MDT_DEV}.lfsck_namespace |
1124 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
1126 namespace_error "(22) unexpected status"
1129 # wait to process one inode at least (OBD_FAIL_LFSCK_DELAY3)
1132 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1133 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
1134 namespace_error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
1136 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1137 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1138 mdd.${MDT_DEV}.lfsck_namespace |
1139 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1141 namespace_error "(24) unexpected status"
1144 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1146 namespace_error "(25) Expect empty flags, but got '$FLAGS'"
1148 run_test 8 "LFSCK state machine"
# Body of test 9a, registered at the end of this block as
# "LFSCK speed control (1)". It starts a layout LFSCK with a speed limit,
# samples average_speed_phase1, changes the limit and samples again, and
# checks that the reported average stays inside the computed margins.
# NOTE(review): RUN_TIME1, RUN_TIME2 and TIME_DIFF are read below but are
# assigned on lines that are not part of this listing - confirm them there.
#
# Skip when /proc/cpuinfo has no line matching "processor.*: 1" (UP system).
1151 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1152 skip "Testing on UP system, the speed may be inaccurate."
# Create the working directory with "lfs mkdir -i 0", set a stripe count of 1
# on it and populate it with createmany (5000 entries) for LFSCK to scan.
1156 check_mount_and_prep
1157 $LFS mkdir -i 0 $DIR/$tdir/lfsck || error "(1) Fail to mkdir lfsck"
1158 $LFS setstripe -c 1 -i -1 $DIR/$tdir/lfsck
1159 createmany -o $DIR/$tdir/lfsck/f 5000
1161 local BASE_SPEED1=100
# Start the layout LFSCK with -r and the first speed limit passed through -s.
1163 $START_LAYOUT -r -s $BASE_SPEED1 || error "(2) Fail to start LFSCK!"
# The scan is expected to still be in phase 1 when its status is sampled.
1166 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
1167 [ "$STATUS" == "scanning-phase1" ] ||
1168 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
# Second field of the "average_speed_phase1" line of the lfsck_layout output.
1170 local SPEED=$($SHOW_LAYOUT |
1171 awk '/^average_speed_phase1/ { print $2 }')
1173 # There may be time error, normally it should be less than 2 seconds.
1174 # We allow another 20% schedule error.
1176 # MAX_MARGIN = 1.3 = 13 / 10
1177 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) /
1178 RUN_TIME1 * 13 / 10))
# Fail with message (4) unless the sampled speed is below the upper bound.
1179 [ $SPEED -lt $MAX_SPEED ] || {
1181 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1182 error "(4) Speed $SPEED, expected < $MAX_SPEED"
1185 # adjust speed limit
1186 local BASE_SPEED2=300
# Change the limit through the mdd.<MDT>.lfsck_speed_limit parameter.
1188 do_facet $SINGLEMDS \
1189 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
# Sample the average speed again after the limit change.
1192 SPEED=$($SHOW_LAYOUT | awk '/^average_speed_phase1/ { print $2 }')
1193 # MIN_MARGIN = 0.7 = 7 / 10
1194 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) +
1195 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) /
1196 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
# Lower-bound check: when mds1_FSTYPE is not ldiskfs the shortfall is reported
# through error_ignore (LU-5624). NOTE(review): the branch that emits message
# (5.2) is only partly visible in this listing.
1197 [ $SPEED -gt $MIN_SPEED ] || {
1198 if [ $mds1_FSTYPE != ldiskfs ]; then
1199 error_ignore LU-5624 \
1200 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
1203 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
1207 # MAX_MARGIN = 1.3 = 13 / 10
1208 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) +
1209 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) /
1210 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
# Upper-bound check for the combined run; failure is reported as (6).
1211 [ $SPEED -lt $MAX_SPEED ] || {
1213 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1214 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1215 error "(6) Speed $SPEED, expected < $MAX_SPEED"
# Write 0 to lfsck_speed_limit on every MDT and OST node (presumably "no
# limit" - confirm against the Lustre manual), then wait for the layout
# LFSCK status to become "completed".
1218 do_nodes $(comma_list $(mdts_nodes)) \
1219 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1220 do_nodes $(comma_list $(osts_nodes)) \
1221 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1223 wait_update_facet $SINGLEMDS \
1224 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout |
1225 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
1226 error "(7) Failed to get expected 'completed'"
# Register the test body above with the test framework.
1228 run_test 9a "LFSCK speed control (1)"
# Body of test 9b, registered at the end of this block as
# "LFSCK speed control (2)". It prepares files while a fail_loc is set, runs
# a namespace LFSCK until it reports "stopped", then starts it again with a
# speed limit and checks average_speed_phase2 against computed margins.
# NOTE(review): RUN_TIME1, RUN_TIME2 and TIME_DIFF are read below but are
# assigned on lines that are not part of this listing - confirm them there.
#
# Skip when /proc/cpuinfo has no line matching "processor.*: 1" (UP system).
1231 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1232 skip "Testing on UP system, the speed may be inaccurate."
# Create 50 directories, 50 files, and 50 files in each directory while
# fail_loc 0x1604 (OBD_FAIL_LFSCK_LINKEA_MORE) is set on the MDS.
1238 echo "Preparing another 50 * 50 files (with error) at $(date)."
1239 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1240 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1241 createmany -d $DIR/$tdir/d 50
1242 createmany -m $DIR/$tdir/f 50
1243 for ((i = 0; i < 50; i++)); do
1244 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
# With fail_loc 0x160c (OBD_FAIL_LFSCK_NO_DOUBLESCAN) set, run the namespace
# LFSCK with -r and wait until its status line reads "stopped".
1247 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
1248 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1249 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
1250 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1251 mdd.${MDT_DEV}.lfsck_namespace |
1252 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1254 error "(5) unexpected status"
# Clear the fail_loc before the measured run.
1257 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1258 echo "Prepared at $(date)."
1260 local BASE_SPEED1=50
# Start the namespace LFSCK again, this time without -r and with the first
# speed limit; it is expected to be in phase 2 when sampled.
1262 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
1265 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1266 [ "$STATUS" == "scanning-phase2" ] ||
1267 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
# Second field of the "average_speed_phase2" line of the namespace output.
1269 local SPEED=$($SHOW_NAMESPACE |
1270 awk '/^average_speed_phase2/ { print $2 }')
1271 # There may be time error, normally it should be less than 2 seconds.
1272 # We allow another 20% schedule error.
1274 # MAX_MARGIN = 1.3 = 13 / 10
1275 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) /
1276 RUN_TIME1 * 13 / 10))
# Fail with message (8) unless the sampled speed is below the upper bound.
1277 [ $SPEED -lt $MAX_SPEED ] || {
1279 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1280 error "(8) Speed $SPEED, expected < $MAX_SPEED"
1283 # adjust speed limit
1284 local BASE_SPEED2=150
# Change the limit through the mdd.<MDT>.lfsck_speed_limit parameter.
1286 do_facet $SINGLEMDS \
1287 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
# Sample the average speed again after the limit change.
1290 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
1291 # MIN_MARGIN = 0.7 = 7 / 10
1292 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) +
1293 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) /
1294 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
# Lower-bound check: when mds1_FSTYPE is not ldiskfs the shortfall is reported
# through error_ignore (LU-5624). NOTE(review): the branch that emits message
# (9.2) is only partly visible in this listing.
1295 [ $SPEED -gt $MIN_SPEED ] || {
1296 if [ $mds1_FSTYPE != ldiskfs ]; then
1297 error_ignore LU-5624 \
1298 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1301 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
1305 # MAX_MARGIN = 1.3 = 13 / 10
1306 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) +
1307 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) /
1308 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
# Upper-bound check for the combined run; failure is reported as (10).
1309 [ $SPEED -lt $MAX_SPEED ] || {
1311 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1312 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1313 error "(10) Speed $SPEED, expected < $MAX_SPEED"
# Write 0 to lfsck_speed_limit on every MDT and OST node (presumably "no
# limit" - confirm against the Lustre manual), then wait for the namespace
# LFSCK status to become "completed".
1316 do_nodes $(comma_list $(mdts_nodes)) \
1317 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1318 do_nodes $(comma_list $(osts_nodes)) \
1319 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1320 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1321 mdd.${MDT_DEV}.lfsck_namespace |
1322 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1324 error "(11) unexpected status"
# Register the test body above with the test framework.
1327 run_test 9b "LFSCK speed control (2)"
# Body of test 10, registered at the end of this block as
# "System is available during LFSCK scanning". It creates entries while
# fail_locs are set, starts a speed-limited namespace LFSCK and verifies that
# ordinary file operations succeed while the scan is still in phase 1.
#
# Only run when the MDT backing filesystem is ldiskfs.
1331 [[ $mds1_FSTYPE == ldiskfs ]] || skip "lookup(..)/linkea on ZFS issue"
# Even-numbered entries (0, 2, ... 998) are created under fail_loc 0x1603
# (OBD_FAIL_LFSCK_LINKEA_CRASH).
1335 echo "Preparing more files with error at $(date)."
1336 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1337 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
1339 for ((i = 0; i < 1000; i = $((i+2)))); do
1340 mkdir -p $DIR/$tdir/d${i}
1341 touch $DIR/$tdir/f${i}
1342 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
# Odd-numbered entries (1, 3, ... 999) are created under fail_loc 0x1604
# (OBD_FAIL_LFSCK_LINKEA_MORE).
1345 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1346 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1348 for ((i = 1; i < 1000; i = $((i+2)))); do
1349 mkdir -p $DIR/$tdir/d${i}
1350 touch $DIR/$tdir/f${i}
1351 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
# Clear the fail_loc once the data set is prepared.
1354 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1355 echo "Prepared at $(date)."
# Add a hard link to f200 inside d200, then remount the client.
1357 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
1359 umount_client $MOUNT
1360 mount_client $MOUNT || error "(3) Fail to start client!"
# Start the namespace LFSCK with -r and a speed limit of 100 so that it is
# still in phase 1 while the operations below run.
1362 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1365 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1366 [ "$STATUS" == "scanning-phase1" ] ||
1367 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Each of the following client operations must succeed during the scan:
# recursive listing, create, mkdir, unlink, recursive remove, rename,
# hard link and symbolic link.
1369 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1371 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1373 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1375 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1377 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1379 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1381 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1383 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1384 error "(14) Fail to softlink!"
# The scan must still be in phase 1 after the operations above.
1386 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1387 [ "$STATUS" == "scanning-phase1" ] ||
1388 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
# Write 0 to lfsck_speed_limit on every MDT and OST node (presumably "no
# limit" - confirm against the Lustre manual) and wait for "completed".
1390 do_nodes $(comma_list $(mdts_nodes)) \
1391 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1392 do_nodes $(comma_list $(osts_nodes)) \
1393 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1394 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1395 mdd.${MDT_DEV}.lfsck_namespace |
1396 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1398 error "(16) unexpected status"
# Register the test body above with the test framework.
1401 run_test 10 "System is available during LFSCK scanning"
# Remove the LAST_ID file (and d0/0) under O/<idx> on the locally mounted
# backing filesystem of facet ost<ost>.
# Returns 1 if mount_fstype fails and 2 if unmount_fstype fails.
# NOTE(review): the assignments of "ost" and "idx" are not visible in this
# listing; the caller invokes "ost_remove_lastid 1 0", so presumably $1 is
# the OST number and $2 the idx - confirm against the full file.
1404 ost_remove_lastid() {
# Commands for the OST facet are run through do_facet.
1407 local rcmd="do_facet ost${ost}"
1409 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1411 # step 1: local mount
1412 mount_fstype ost${ost} || return 1
1413 # step 2: remove the specified LAST_ID
1414 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
# Unmount the local mount again.
1416 unmount_fstype ost${ost} || return 2
# Body of test 11a, registered at the end of this block as
# "LFSCK can rebuild lost last_id". It removes LAST_ID on ost1, starts a
# layout LFSCK on the OST and checks that the flags end up empty.
#
# Requires an MDS version newer than 2.5.55.
1420 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1421 skip "MDS older than 2.5.55, LU-1267"
# Create 64 files with a stripe count of 1 at OST index 0.
1423 check_mount_and_prep
1424 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1425 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
# Remove LAST_ID on ost1 and start ost1 with the "noscrub" mount options.
# NOTE(review): the step(s) between file creation and this call are not
# visible in this listing (presumably ost1 is stopped first) - confirm.
1430 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
1432 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1433 error "(2) Fail to start ost1"
# Set fail_loc 0x160e (OBD_FAIL_LFSCK_DELAY4) with fail_val=3 on ost1 before
# starting the scan.
1435 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1436 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1438 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1439 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
# While the fail_loc is set, the flags line is expected to show
# "crashed_lastid".
1441 wait_update_facet ost1 "$LCTL get_param -n \
1442 obdfilter.${OST_DEV}.lfsck_layout |
1443 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1445 error "(5) unexpected status"
# Clear the fail_loc and wait for the scan to report "completed".
1448 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1450 wait_update_facet ost1 "$LCTL get_param -n \
1451 obdfilter.${OST_DEV}.lfsck_layout |
1452 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1454 error "(6) unexpected status"
# After completion the flags field must be empty.
1457 echo "the LAST_ID(s) should have been rebuilt"
1458 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1459 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
# Register the test body above with the test framework.
1461 run_test 11a "LFSCK can rebuild lost last_id"
# Body of test 11b, registered at the end of this block as
# "LFSCK can rebuild crashed last_id". It creates objects while the on-disk
# LAST_ID update is skipped, restarts ost1, and checks that a layout LFSCK on
# the OST brings last_id up to at least the id recorded on the MDS.
#
# Requires an MDS version newer than 2.5.55.
1464 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1465 skip "MDS older than 2.5.55, LU-1267"
1467 check_mount_and_prep
1468 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1470 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1471 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1472 do_facet ost1 $LCTL set_param fail_loc=0x160d
# Create 32 more files than the value returned by precreated_ost_obj_count.
1474 local count=$(precreated_ost_obj_count 0 0)
1476 createmany -o $DIR/$tdir/f $((count + 32))
# Record prealloc_last_seq and prealloc_last_id from the osp device on mds1.
1478 local proc_path="${FSNAME}-OST0000-osc-MDT0000"
1479 local seq=$(do_facet mds1 $LCTL get_param -n \
1480 osp.${proc_path}.prealloc_last_seq)
1481 local id_used=$(do_facet mds1 $LCTL get_param -n \
1482 osp.${proc_path}.prealloc_last_id)
# Unmount the client and restart ost1 with fail_loc 0x215
# (OBD_FAIL_OST_ENOSPC) set.
1484 umount_client $MOUNT
1485 stop ost1 || error "(1) Fail to stop ost1"
1487 #define OBD_FAIL_OST_ENOSPC 0x215
1488 do_facet ost1 $LCTL set_param fail_loc=0x215
1490 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1491 error "(2) Fail to start ost1"
# Poll (up to 60 iterations) the OST's last_id output for the line matching
# $seq and take the field after ':'; stop as soon as a value is found.
# NOTE(review): the rest of the loop body is not visible in this listing.
1493 for ((i = 0; i < 60; i++)); do
1494 id_ost1=$(do_facet ost1 \
1495 "$LCTL get_param -n obdfilter.$ost1_svc.last_id" |
1496 awk -F: "/$seq/ { print \$2 }")
1497 [ -n "$id_ost1" ] && break
# The id recorded on the MDS must be greater than the one the OST reports.
1501 echo "the on-disk LAST_ID should be smaller than the expected one"
1502 [ $id_used -gt $id_ost1 ] ||
1503 error "(4) expect id_used '$id_used' > id_ost1 '$id_ost1'"
1505 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1506 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1508 wait_update_facet ost1 \
1509 "$LCTL get_param -n obdfilter.$ost1_svc.lfsck_layout |
1510 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1512 error "(6) unexpected status"
# Restart ost1 before reading last_id again.
1515 stop ost1 || error "(7) Fail to stop ost1"
1517 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1518 error "(8) Fail to start ost1"
1520 echo "the on-disk LAST_ID should have been rebuilt"
1521 # last_id may be larger than $id_used if objects were created/skipped
1522 wait_update_facet_cond ost1 \
1523 "$LCTL get_param -n obdfilter.$ost1_svc.last_id |
1524 awk -F: '/$seq/ { print \\\$2 }'" "-ge" "$id_used" 60 || {
1525 do_facet ost1 $LCTL get_param obdfilter.$ost1_svc.last_id
1526 error "(9) expect last_id >= id_used $seq:$id_used"
# Clear the fail_loc on ost1 and stop all services.
1529 do_facet ost1 $LCTL set_param fail_loc=0
1530 stopall || error "(10) Fail to stopall"
# Register the test body above with the test framework.
1532 run_test 11b "LFSCK can rebuild crashed last_id"
# Body of test 12a, registered at the end of this block as
# "single command to trigger LFSCK on all devices". It starts, stops and
# restarts first the namespace LFSCK and then the layout LFSCK with -A from
# mds1, checking the status reached on the targets after each step.
# NOTE(review): the trailing number given to wait_all_targets and
# wait_all_targets_blocked is presumably the step id used in their error
# message - confirm against the helpers' definitions.
#
# Needs at least two MDTs and an MDS version newer than 2.5.55.
1535 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
1536 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1537 skip "MDS older than 2.5.55, LU-3950"
# For k = 1..MDSCOUNT create a directory with "lfs mkdir -i <k-1>" and 100
# files inside it.
1539 check_mount_and_prep
1540 for k in $(seq $MDSCOUNT); do
1541 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1542 createmany -o $DIR/$tdir/${k}/f 100 ||
1543 error "(0) Fail to create 100 files."
# Namespace LFSCK: start with "-s 1", expect scanning-phase1.
1546 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1547 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1548 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1550 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1551 wait_all_targets namespace scanning-phase1 3
# Stop with a single lfsck_stop -A, expect stopped.
1553 echo "Stop namespace LFSCK on all targets by single lctl command."
1554 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1555 error "(4) Fail to stop LFSCK on all devices!"
1557 echo "All the LFSCK targets should be in 'stopped' status."
1558 wait_all_targets_blocked namespace stopped 5
# Restart with "-s 0", expect completed.
1560 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1561 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1562 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1564 echo "All the LFSCK targets should be in 'completed' status."
1565 wait_all_targets_blocked namespace completed 7
# The layout half of the test runs between start_full_debug_logging and
# stop_full_debug_logging.
1567 start_full_debug_logging
# Layout LFSCK: start with "-s 1", expect scanning-phase1.
1569 echo "Start layout LFSCK on all targets by single command (-s 1)."
1570 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1571 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1573 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1574 wait_all_targets layout scanning-phase1 9
# Stop with a single lfsck_stop -A, expect stopped.
1576 echo "Stop layout LFSCK on all targets by single lctl command."
1577 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1578 error "(10) Fail to stop LFSCK on all devices!"
1580 echo "All the LFSCK targets should be in 'stopped' status."
1581 wait_all_targets_blocked layout stopped 11
# Additionally read lfsck_layout on every OST (1..OSTCOUNT) and require the
# status "stopped" there too.
1583 for k in $(seq $OSTCOUNT); do
1584 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1585 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1586 awk '/^status/ { print $2 }')
1587 [ "$STATUS" == "stopped" ] ||
1588 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
# Restart with "-s 0", expect completed.
1591 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1592 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1593 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1595 echo "All the LFSCK targets should be in 'completed' status."
1596 wait_all_targets_blocked layout completed 14
1598 stop_full_debug_logging
# Register the test body above with the test framework.
1600 run_test 12a "single command to trigger LFSCK on all devices"
# Body of test 12b, registered at the end of this block as
# "auto detect Lustre device". It starts LFSCK without naming a device via
# -M and, when mds1 hosts more than one MDT, checks that a start without
# -M and -A is rejected.
#
# Requires an MDS version newer than 2.5.55.
1603 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1604 skip "MDS older than 2.5.55, LU-3950"
1606 check_mount_and_prep
# Start with -A -r only; both namespace and layout are expected to reach
# "completed".
1608 echo "Start LFSCK without '-M' specified."
1609 do_facet mds1 $LCTL lfsck_start -A -r ||
1610 error "(0) Fail to start LFSCK without '-M'"
1612 wait_all_targets_blocked namespace completed 1
1613 wait_all_targets_blocked layout completed 2
# Count the lines of "lctl dl" on mds1 whose third field contains "mdt".
1615 local count=$(do_facet mds1 $LCTL dl |
1616 awk '{ print $3 }' | grep mdt | wc -l)
# With more than one such device the command below must fail; if it
# succeeds, error (3) is raised.
1617 if [ $count -gt 1 ]; then
1619 echo "Start layout LFSCK on the node with multipe targets,"
1620 echo "but not specify '-M'/'-A' option. Should get failure."
1622 do_facet mds1 $LCTL lfsck_start -t layout -r &&
1623 error "(3) Start layout LFSCK should fail" || true
# Register the test body above with the test framework.
1626 run_test 12b "auto detect Lustre device"
# Body of test 13, registered at the end of this block as
# "LFSCK can repair crashed lmm_oi". Two files are created while a fail_loc
# is set, then a layout LFSCK must report exactly 2 in "repaired_others".
#
# Requires an MDS version newer than 2.5.55.
1629 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1630 skip "MDS older than 2.5.55, LU-3593"
1633 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1634 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1635 echo "MDT-object FID."
1638 check_mount_and_prep
# With fail_loc 0x160f (OBD_FAIL_LFSCK_BAD_LMMOI) set, create one file via
# createmany and one composite-layout (-E ... -E -1) file f1, then clear it.
1640 echo "Inject failure stub to simulate bad lmm_oi"
1641 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1642 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
1643 createmany -o $DIR/$tdir/f 1
1644 $LFS setstripe -E 1M -S 1M -E -1 $DIR/$tdir/f1 ||
1645 error "(0) Fail to create PFL $DIR/$tdir/f1"
1646 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Run the layout LFSCK and wait for "completed".
1648 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1649 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1651 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1652 mdd.${MDT_DEV}.lfsck_layout |
1653 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1655 error "(2) unexpected status"
# Exactly two repairs are expected, matching the two files created above.
1658 local repaired=$($SHOW_LAYOUT |
1659 awk '/^repaired_others/ { print $2 }')
1660 [ $repaired -eq 2 ] ||
1661 error "(3) Fail to repair crashed lmm_oi: $repaired"
# Register the test body above with the test framework.
1663 run_test 13 "LFSCK can repair crashed lmm_oi"
# Body of test 14a, registered at the end of this block as
# "LFSCK can repair MDT-object with dangling LOV EA reference (1)".
# Files are created while fail_loc 0x1610 is set on ost1; a first layout
# LFSCK run must count the dangling references, and a second run started
# with -c must make the guard files stat-able again.
#
# Requires an MDS version newer than 2.5.55.
1666 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1667 skip "MDS older than 2.5.55, LU-3590"
1670 echo "The OST-object referenced by the MDT-object should be there;"
1671 echo "otherwise, the LFSCK should re-create the missing OST-object."
1672 echo "without '--delay-create-ostobj' option."
1675 check_mount_and_prep
1676 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# With the fail_loc set: create (precreated count + 16) plain files, the
# guard0 file, 16 composite-layout files f_comp0..15 and the guard1 file.
1678 echo "Inject failure stub to simulate dangling referenced MDT-object"
1679 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1680 do_facet ost1 $LCTL set_param fail_loc=0x1610
1681 local count=$(precreated_ost_obj_count 0 0)
1683 createmany -o $DIR/$tdir/f $((count + 16)) ||
1684 error "(0.1) Fail to create $DIR/$tdir/fx"
1685 touch $DIR/$tdir/guard0
1687 for ((i = 0; i < 16; i++)); do
1688 $LFS setstripe -E 512K -S 256K -o 0 -E 2M \
1689 $DIR/$tdir/f_comp${i} ||
1690 error "(0.2) Fail to create $DIR/$tdir/f_comp${i}"
1692 touch $DIR/$tdir/guard1
1694 do_facet ost1 $LCTL set_param fail_loc=0
1696 start_full_debug_logging
1698 # exhaust other pre-created dangling cases
1699 count=$(precreated_ost_obj_count 0 0)
1700 createmany -o $DIR/$tdir/a $count ||
1701 error "(0.5) Fail to create $count files."
# Listing the directory is expected to fail at this point.
1703 echo "'ls' should fail because of dangling referenced MDT-object"
1704 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
# First run: started without -c; at least 32 dangling references must be
# counted in "repaired_dangling".
1706 echo "Trigger layout LFSCK to find out dangling reference"
1707 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1709 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1710 mdd.${MDT_DEV}.lfsck_layout |
1711 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1713 error "(3) unexpected status"
1716 local repaired=$($SHOW_LAYOUT |
1717 awk '/^repaired_dangling/ { print $2 }')
1718 [ $repaired -ge 32 ] ||
1719 error "(4) Fail to repair dangling reference: $repaired"
# After the first run both guard files must still fail to stat.
1721 echo "'stat' should fail because of not repair dangling by default"
1722 stat $DIR/$tdir/guard0 > /dev/null 2>&1 &&
1723 error "(5.1) stat should fail"
1724 stat $DIR/$tdir/guard1 > /dev/null 2>&1 &&
1725 error "(5.2) stat should fail"
# Second run: started with -c in addition to -r.
1727 echo "Trigger layout LFSCK to repair dangling reference"
1728 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1730 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1731 mdd.${MDT_DEV}.lfsck_layout |
1732 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1734 error "(7) unexpected status"
1737 # There may be some async LFSCK updates in processing, wait for
1738 # a while until the target reparation has been done. LU-4970.
# Poll stat on each guard file until the field after "Size" reads 0.
1740 echo "'stat' should success after layout LFSCK repairing"
1741 wait_update_facet client "stat $DIR/$tdir/guard0 |
1742 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1743 stat $DIR/$tdir/guard0
1745 error "(8.1) unexpected size"
1748 wait_update_facet client "stat $DIR/$tdir/guard1 |
1749 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1750 stat $DIR/$tdir/guard1
1752 error "(8.2) unexpected size"
# The second run must also report at least 32 in "repaired_dangling".
1755 repaired=$($SHOW_LAYOUT |
1756 awk '/^repaired_dangling/ { print $2 }')
1757 [ $repaired -ge 32 ] ||
1758 error "(9) Fail to repair dangling reference: $repaired"
1760 stop_full_debug_logging
# NOTE(review): the stop step announced by the echo below is not visible in
# this listing; only the following setupall is.
1762 echo "stopall to cleanup object cache"
1765 setupall > /dev/null
# Register the test body above with the test framework.
1767 run_test 14a "LFSCK can repair MDT-object with dangling LOV EA reference (1)"
# Body of test 14b, registered at the end of this block as
# "LFSCK can repair MDT-object with dangling LOV EA reference (2)".
# Same scenario as the dangling-reference check with plain files only, but
# the layout LFSCK is started with the additional -o and -d options.
#
# Requires an MDS version newer than 2.5.55.
1770 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1771 skip "MDS older than 2.5.55, LU-3590"
1774 echo "The OST-object referenced by the MDT-object should be there;"
1775 echo "otherwise, the LFSCK should re-create the missing OST-object."
1776 echo "with '--delay-create-ostobj' option."
1779 check_mount_and_prep
1780 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# With fail_loc 0x1610 set on ost1: create (precreated count + 31) files and
# the guard file, then clear the fail_loc.
1782 echo "Inject failure stub to simulate dangling referenced MDT-object"
1783 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1784 do_facet ost1 $LCTL set_param fail_loc=0x1610
1785 local count=$(precreated_ost_obj_count 0 0)
1787 createmany -o $DIR/$tdir/f $((count + 31))
1788 touch $DIR/$tdir/guard
1789 do_facet ost1 $LCTL set_param fail_loc=0
1791 start_full_debug_logging
1793 # exhaust other pre-created dangling cases
1794 count=$(precreated_ost_obj_count 0 0)
1795 createmany -o $DIR/$tdir/a $count ||
1796 error "(0) Fail to create $count files."
# Listing the directory is expected to fail at this point.
1798 echo "'ls' should fail because of dangling referenced MDT-object"
1799 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
# First run: -r -o -d (no -c); at least 32 dangling references must be
# counted in "repaired_dangling".
1801 echo "Trigger layout LFSCK to find out dangling reference"
1802 $START_LAYOUT -r -o -d || error "(2) Fail to start LFSCK for layout!"
1804 wait_all_targets_blocked layout completed 3
1806 local repaired=$($SHOW_LAYOUT |
1807 awk '/^repaired_dangling/ { print $2 }')
1808 [ $repaired -ge 32 ] ||
1809 error "(4) Fail to repair dangling reference: $repaired"
# After the first run the guard file must still fail to stat.
1811 echo "'stat' should fail because of not repair dangling by default"
1812 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
# Second run: same options plus -c.
1814 echo "Trigger layout LFSCK to repair dangling reference"
1815 $START_LAYOUT -r -o -c -d || error "(6) Fail to start LFSCK for layout!"
1817 wait_all_targets_blocked layout completed 7
1819 # There may be some async LFSCK updates in processing, wait for
1820 # a while until the target reparation has been done. LU-4970.
# Poll stat on the guard file until the field after "Size" reads 0.
1822 echo "'stat' should success after layout LFSCK repairing"
1823 wait_update_facet client "stat $DIR/$tdir/guard |
1824 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1825 stat $DIR/$tdir/guard
1827 error "(8) unexpected size"
# The second run must also report at least 32 in "repaired_dangling".
1830 repaired=$($SHOW_LAYOUT |
1831 awk '/^repaired_dangling/ { print $2 }')
1832 [ $repaired -ge 32 ] ||
1833 error "(9) Fail to repair dangling reference: $repaired"
1835 stop_full_debug_logging
# NOTE(review): the stop step announced by the echo below is not visible in
# this listing; only the following setupall is.
1837 echo "stopall to cleanup object cache"
1840 setupall > /dev/null
# Register the test body above with the test framework.
1842 run_test 14b "LFSCK can repair MDT-object with dangling LOV EA reference (2)"
# Body of test 15a, registered at the end of this block as
# "LFSCK can repair unmatched MDT-object/OST-object pairs (1)".
# Files are written while fail_loc 0x1611 is set on all OST nodes, then a
# layout LFSCK must report at least 3 in "repaired_unmatched_pair".
#
# Requires an MDS version newer than 2.5.55.
1845 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1846 skip "MDS older than 2.5.55, LU-3591"
1849 echo "If the OST-object referenced by the MDT-object back points"
1850 echo "to some non-exist MDT-object, then the LFSCK should repair"
1851 echo "the OST-object to back point to the right MDT-object."
1854 check_mount_and_prep
1855 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# With the fail_loc set, write f0 (1 MiB) and create the composite-layout
# file f1. NOTE(review): the line carrying the setstripe target path is not
# visible in this listing.
1857 echo "Inject failure stub to make the OST-object to back point to"
1858 echo "non-exist MDT-object."
1859 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
1861 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
1862 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1863 $LFS setstripe -E 1M -S 256K -c 1 -E -1 -S 512K -c $OSTCOUNT \
1865 error "(0) Fail to create PFL $DIR/$tdir/f1"
1866 # 'dd' will trigger punch RPC firstly on every OST-objects.
1867 # So even though some OST-object will not be write by 'dd',
1868 # as long as it is allocated (may be NOT allocated in pfl_3b)
1869 # its layout information will be set also.
1870 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4K count=257
# Drop the client's osc locks, then clear the fail_loc on all OST nodes.
1871 cancel_lru_locks osc
1872 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
# Run the layout LFSCK and wait for "completed".
1874 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1875 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1877 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1878 mdd.${MDT_DEV}.lfsck_layout |
1879 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1881 error "(2) unexpected status"
# At least three unmatched pairs must have been repaired.
1884 local repaired=$($SHOW_LAYOUT |
1885 awk '/^repaired_unmatched_pair/ { print $2 }')
1886 [ $repaired -ge 3 ] ||
1887 error "(3) Fail to repair unmatched pair: $repaired"
# Register the test body above with the test framework.
1889 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# Body of test 15b, registered at the end of this block as
# "LFSCK can repair unmatched MDT-object/OST-object pairs (2)".
# Files are written while fail_loc 0x1612 is set on all OST nodes, then a
# layout LFSCK must report exactly 4 in "repaired_unmatched_pair".
#
# Requires an MDS version newer than 2.5.55.
1892 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1893 skip "MDS older than 2.5.55, LU-3591"
1896 echo "If the OST-object referenced by the MDT-object back points"
1897 echo "to other MDT-object that doesn't recognize the OST-object,"
1898 echo "then the LFSCK should repair it to back point to the right"
1899 echo "MDT-object (the first one)."
# Write the guard file under sub-directory "0" before any fail_loc is set.
1902 check_mount_and_prep
1903 mkdir -p $DIR/$tdir/0
1904 $LFS setstripe -c 1 -i 0 $DIR/$tdir/0
1905 dd if=/dev/zero of=$DIR/$tdir/0/guard bs=1M count=1
1906 cancel_lru_locks osc
1908 echo "Inject failure stub to make the OST-object to back point to"
1909 echo "other MDT-object"
# Use two stripes when at least two OSTs exist. NOTE(review): the initial
# assignment of "stripes" is not visible in this listing.
1912 [ $OSTCOUNT -ge 2 ] && stripes=2
# With the fail_loc set, write 0/f0 and create and write the composite file
# f1. NOTE(review): the line carrying the setstripe target path is not
# visible in this listing.
1914 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1915 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1612
1916 dd if=/dev/zero of=$DIR/$tdir/0/f0 bs=1M count=1
1917 $LFS setstripe -E 1M -S 256K -c $stripes -E 2M -S 512K -c 1 \
1919 error "(0) Fail to create PFL $DIR/$tdir/f1"
1920 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2
1921 cancel_lru_locks osc
1922 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
# Run the layout LFSCK and wait for "completed".
1924 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1925 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1927 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1928 mdd.${MDT_DEV}.lfsck_layout |
1929 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1931 error "(2) unexpected status"
# Exactly four unmatched pairs must have been repaired.
1934 local repaired=$($SHOW_LAYOUT |
1935 awk '/^repaired_unmatched_pair/ { print $2 }')
1936 [ $repaired -eq 4 ] ||
1937 error "(3) Fail to repair unmatched pair: $repaired"
# Register the test body above with the test framework.
1939 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# Body of test 15c, registered at the end of this block as
# "LFSCK can repair unmatched MDT-object/OST-object pairs (3)".
# A directory migration is delayed with a fail_loc on mds2 while a layout
# LFSCK runs on all targets; the scan must repair one unmatched pair and no
# multiple-referenced object.
#
# Needs at least two MDTs and an MDS version strictly between 2.5.55 and
# 2.7.55.
1942 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
1943 (( $MDS1_VERSION < $(version_code 2.7.55) )) ||
1944 skip "MDS newer than 2.7.55, LU-6475"
1945 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1946 skip "MDS older than 2.5.55, LU-3591"
1949 echo "According to current metadata migration implementation,"
1950 echo "before the old MDT-object is removed, both the new MDT-object"
1951 echo "and old MDT-object will reference the same LOV layout. Then if"
1952 echo "the layout LFSCK finds the new MDT-object by race, it will"
1953 echo "regard related OST-object(s) as multiple referenced case, and"
1954 echo "will try to create new OST-object(s) for the new MDT-object."
1955 echo "To avoid such trouble, the layout LFSCK needs to lock the old"
1956 echo "MDT-object before confirm the multiple referenced case."
# Create directory a1 with "lfs mkdir -i 1" and write a 1 MiB file in it.
1959 check_mount_and_prep
1960 $LFS mkdir -i 1 $DIR/$tdir/a1
1961 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1962 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=1
1963 cancel_lru_locks osc
1965 echo "Inject failure stub on MDT1 to delay the migration"
# Set fail_loc 0x1803 (OBD_FAIL_MIGRATE_DELAY) with fail_val=5 on mds2 and
# start the migration in the background.
1967 #define OBD_FAIL_MIGRATE_DELAY 0x1803
1968 do_facet mds2 $LCTL set_param fail_val=5 fail_loc=0x1803
1969 echo "Migrate $DIR/$tdir/a1 from MDT1 to MDT0 with delay"
1970 $LFS migrate -m 0 $DIR/$tdir/a1 &
# Start the layout LFSCK on all targets while the migration is pending.
1973 echo "Trigger layout LFSCK to race with the migration"
1974 $START_LAYOUT -A -r || error "(1) Fail to start layout LFSCK!"
1976 wait_all_targets_blocked layout completed 2
# Clear the fail_loc, then check the repair counters.
1978 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
1979 local repaired=$($SHOW_LAYOUT |
1980 awk '/^repaired_unmatched_pair/ { print $2 }')
1981 [ $repaired -eq 1 ] ||
1982 error "(3) Fail to repair unmatched pair: $repaired"
# No multiple-referenced repair may have been made.
1984 repaired=$($SHOW_LAYOUT |
1985 awk '/^repaired_multiple_referenced/ { print $2 }')
1986 [ $repaired -eq 0 ] ||
1987 error "(4) Unexpectedly repaird multiple references: $repaired"
# Register the test body above with the test framework.
1989 run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)"
# Body of test 15d, registered at the end of this block as
# "LFSCK don't crash upon dir migration failure". A directory migration is
# started in the background, a fail_loc is toggled on the MDS facets, and a
# namespace LFSCK is then run on all targets.
#
# Needs at least two MDTs.
1992 (( $MDSCOUNT > 1 )) || skip "needs >= 2 MDTs"
1994 check_mount_and_prep
# Create the test directory with "lfs mkdir -c -1", set a default directory
# layout on it, and populate it with 100 files and 100 sub-directories.
1996 $LFS mkdir -c -1 $DIR/$tdir || error "create $tdir failed"
1997 $LFS setdirstripe -D -i -1 -c 1 $DIR/$tdir ||
1998 error "setdirstripe failed"
2000 createmany -o $DIR/$tdir/f 100 || error "create sub files failed"
2001 createmany -d $DIR/$tdir/s 100 || error "create sub dirs failed"
# The migration runs in the background while failures are injected.
2003 echo "Migrate $DIR/$tdir to MDT1"
2004 $LFS migrate -m 1 $DIR/$tdir &
2008 # fail sub transactions on random MDTs, which may cause some file
# NOTE(review): the loop below addresses facets mds0 .. mds(MDSCOUNT-1),
# while other tests in this file use mds1 and mds2 - confirm that the
# 0-based facet name is intended. Part of the loop body is not visible in
# this listing.
2010 #define OBD_FAIL_OUT_EIO 0x1709
2011 for ((i = 0; i < $MDSCOUNT; i++)); do
2012 do_facet mds$i $LCTL set_param fail_loc=0x1709
2014 do_facet mds$i $LCTL set_param fail_loc=0
2019 # LFSCK can't fully fix migrating directories, and may leave some
2020 # files inaccessible, but it shouldn't cause crash
2021 $START_NAMESPACE -A -r ||
2022 error "Fail to start LFSCK for namespace"
2024 wait_all_targets_blocked namespace completed 1
2026 # resume migration may fail because some file may be inaccessible, but
2027 # it shouldn't cause crash
2028 $LFS migrate -m 1 $DIR/$tdir
2030 # rm $tdir to avoid cleanup failure in the end
2032 $LFS rm_entry $DIR/$tdir/*
# Finish by calling cleanup_and_setup_lustre with REFORMAT="yes".
2034 REFORMAT="yes" cleanup_and_setup_lustre
# Register the test body above with the test framework.
2036 run_test 15d "LFSCK don't crash upon dir migration failure"
# Body of test 16, registered at the end of this block as
# "LFSCK can repair inconsistent MDT-object/OST-object owner". The owner of
# f0 is changed while fail_loc 0x1613 is set on the MDS, then a layout LFSCK
# must report exactly 1 in "repaired_inconsistent_owner".
#
# Requires an MDS version newer than 2.5.55.
2039 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2040 skip "MDS older than 2.5.55, LU-3594"
2043 echo "If the OST-object's owner information does not match the owner"
2044 echo "information stored in the MDT-object, then the LFSCK trust the"
2045 echo "MDT-object and update the OST-object's owner information."
# Write a 1 MiB file f0 with a stripe count of 1 at OST index 0.
2048 check_mount_and_prep
2049 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2050 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
2051 cancel_lru_locks osc
2053 # created but no setattr or write to the file.
# Give d1 to the RUNAS user and create 100 files in it as that user.
# NOTE(review): the creation of d1 is not visible in this listing.
2055 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/d1
2056 $RUNAS createmany -o $DIR/$tdir/d1/o 100 || error "create failed"
# Change the owner of f0 while the fail_loc is set, then clear it.
2058 echo "Inject failure stub to skip OST-object owner changing"
2059 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
2060 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
2061 chown 1.1 $DIR/$tdir/f0
2062 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Run the layout LFSCK and wait for "completed".
2064 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
2067 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
2069 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2070 mdd.${MDT_DEV}.lfsck_layout |
2071 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2073 error "(2) unexpected status"
# Exactly one owner repair is expected.
2076 local repaired=$($SHOW_LAYOUT |
2077 awk '/^repaired_inconsistent_owner/ { print $2 }')
2078 [ $repaired -eq 1 ] ||
2079 error "(3) Fail to repair inconsistent owner: $repaired"
# Register the test body above with the test framework.
2081 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# test_17 body: layout LFSCK must repair the case where several MDT-objects
# reference the same OST-object.
# NOTE(review): this listing omits some lines of the original function
# (header, closers, a few statements); the comments added here describe only
# the lines that are visible.
2084 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2085 skip "MDS older than 2.5.55, LU-3594"
2088 echo "If more than one MDT-objects reference the same OST-object,"
2089 echo "and the OST-object only recognizes one MDT-object, then the"
2090 echo "LFSCK should create new OST-objects for such non-recognized"
2094 check_mount_and_prep
2095 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2097 echo "Inject failure stub to make two MDT-objects to refernce"
2098 echo "the OST-object"
# Everything up to the fail_loc reset below runs with fail_loc 0x1614 set on
# the MDS: a 1MB "guard" file, then f0 (createmany) and a PFL file f1.
2100 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
2101 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
2102 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
2103 cancel_lru_locks mdc
2104 cancel_lru_locks osc
2106 createmany -o $DIR/$tdir/f 1
2107 cancel_lru_locks mdc
2108 cancel_lru_locks osc
2110 $LFS setstripe -E 2M -S 256K -o 0 -E 4M -S 512K -o 0 \
2112 error "(0) Fail to create PFL $DIR/$tdir/f1"
2113 cancel_lru_locks mdc
2114 cancel_lru_locks osc
2115 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# Before the repair f0 and f1 are expected to report 1048576 bytes, the
# amount written to guard, although nothing was written to them here.
2117 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
2118 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard use the same OST-objects"
2119 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
2120 [ $size -eq 1048576 ] ||
2121 error "(1.1) f0 (wrong) size should be 1048576, but got $size"
2123 size=$(ls -l $DIR/$tdir/f1 | awk '{ print $5 }')
2124 [ $size -eq 1048576 ] ||
2125 error "(1.2) f1 (wrong) size should be 1048576, but got $size"
2127 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
2130 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
# Poll the MDT's lfsck_layout status until it reads "completed"; 32 is the
# limit handed to wait_update_facet (presumably seconds - confirm in
# test-framework.sh).
2132 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2133 mdd.${MDT_DEV}.lfsck_layout |
2134 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2136 error "(3) unexpected status"
# Two multiple-reference repairs are expected (one each for f0 and f1).
2139 local repaired=$($SHOW_LAYOUT |
2140 awk '/^repaired_multiple_referenced/ { print $2 }')
2141 [ $repaired -eq 2 ] ||
2142 error "(4) Fail to repair multiple references: $repaired"
# After the repair, writing 2MB to f0 and to f1 must leave guard at 1048576.
2144 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
2145 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
2146 error "(5) Fail to write f0."
2147 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
2148 [ $size -eq 1048576 ] ||
2149 error "(6) guard size should be 1048576, but got $size"
2151 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard should use diff OST-objects"
2152 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2 ||
2153 error "(7) Fail to write f1."
2154 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
2155 [ $size -eq 1048576 ] ||
2156 error "(8) guard size should be 1048576, but got $size"
2158 run_test 17 "LFSCK can repair multiple references"
# Top-level statement: add the "cache" flag to the local debug mask
# (presumably for the orphan tests that follow - confirm).
2160 $LCTL set_param debug=+cache > /dev/null
# test_18a body: the MDT-objects survive but lose their layout EA; layout
# LFSCK run with -o (orphan handling) must regenerate the layout so that the
# file sizes read correctly again.
# NOTE(review): this listing omits some lines of the original function
# (header, closers, a few statements); the comments added here describe only
# the lines that are visible.
2163 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2164 skip "MDS older than 2.5.55, LU-3336"
2167 echo "The target MDT-object is there, but related stripe information"
2168 echo "is lost or partly lost. The LFSCK should regenerate the missing"
2169 echo "layout EA entries."
# a1/f1: 2MB file, single stripe on OST index 0, directory on MDT index 0.
2172 check_mount_and_prep
2173 $LFS mkdir -i 0 $DIR/$tdir/a1
2174 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2175 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2177 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2179 $LFS path2fid $DIR/$tdir/a1/f1
2180 $LFS getstripe $DIR/$tdir/a1/f1
# a2/f2 (only with >= 2 MDTs): 2MB file, two stripes, directory on MDT 1.
2182 if [ $MDSCOUNT -ge 2 ]; then
2183 $LFS mkdir -i 1 $DIR/$tdir/a2
2184 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2185 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2186 $LFS path2fid $DIR/$tdir/a2/f2
2187 $LFS getstripe $DIR/$tdir/a2/f2
# f3: 2MB PFL file with two components.
2190 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2191 error "(0) Fail to create PFL $DIR/$tdir/f3"
2193 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2195 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2197 $LFS path2fid $DIR/$tdir/f3
2198 $LFS getstripe $DIR/$tdir/f3
2200 cancel_lru_locks osc
# chown each file while fail_loc 0x1615 is set on the MDS that serves it.
2202 echo "Inject failure, to make the MDT-object lost its layout EA"
2203 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
2204 do_facet mds1 $LCTL set_param fail_loc=0x1615
2205 chown 1.1 $DIR/$tdir/a1/f1
2207 if [ $MDSCOUNT -ge 2 ]; then
2208 do_facet mds2 $LCTL set_param fail_loc=0x1615
2209 chown 1.1 $DIR/$tdir/a2/f2
2212 chown 1.1 $DIR/$tdir/f3
2217 do_facet mds1 $LCTL set_param fail_loc=0
2218 if [ $MDSCOUNT -ge 2 ]; then
2219 do_facet mds2 $LCTL set_param fail_loc=0
2222 cancel_lru_locks mdc
2223 cancel_lru_locks osc
2225 echo "The file size should be incorrect since layout EA is lost"
2226 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2227 [ "$cur_size" != "$saved_size1" ] ||
2228 error "(1) Expect incorrect file1 size"
# f2 was written with the same dd size as f1, so saved_size1 is reused.
2230 if [ $MDSCOUNT -ge 2 ]; then
2231 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2232 [ "$cur_size" != "$saved_size1" ] ||
2233 error "(2) Expect incorrect file2 size"
2236 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2237 [ "$cur_size" != "$saved_size2" ] ||
2238 error "(1.2) Expect incorrect file3 size"
2240 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2241 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
2243 for k in $(seq $MDSCOUNT); do
2244 # The LFSCK status query interval is 30 seconds. For the case
2245 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2246 # time to guarantee the status sync up.
2247 wait_update_facet mds${k} "$LCTL get_param -n \
2248 mdd.$(facet_svc mds${k}).lfsck_layout |
2249 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2250 error "(4) MDS${k} is not the expected 'completed'"
# The OSTs are checked once, without polling.
2253 for k in $(seq $OSTCOUNT); do
2254 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2255 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2256 awk '/^status/ { print $2 }')
2257 [ "$cur_status" == "completed" ] ||
2258 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan counts: 3 on mds1 and, with >= 2 MDTs, 2 on mds2.
2261 local repaired=$(do_facet mds1 $LCTL get_param -n \
2262 mdd.$(facet_svc mds1).lfsck_layout |
2263 awk '/^repaired_orphan/ { print $2 }')
2264 [ $repaired -eq 3 ] ||
2265 error "(6.1) Expect 3 fixed on mds1, but got: $repaired"
2267 if [ $MDSCOUNT -ge 2 ]; then
2268 repaired=$(do_facet mds2 $LCTL get_param -n \
2269 mdd.$(facet_svc mds2).lfsck_layout |
2270 awk '/^repaired_orphan/ { print $2 }')
2271 [ $repaired -eq 2 ] ||
2272 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
2275 $LFS path2fid $DIR/$tdir/a1/f1
2276 $LFS getstripe $DIR/$tdir/a1/f1
2278 if [ $MDSCOUNT -ge 2 ]; then
2279 $LFS path2fid $DIR/$tdir/a2/f2
2280 $LFS getstripe $DIR/$tdir/a2/f2
2283 $LFS path2fid $DIR/$tdir/f3
2284 $LFS getstripe $DIR/$tdir/f3
2286 echo "The file size should be correct after layout LFSCK scanning"
2287 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2288 [ "$cur_size" == "$saved_size1" ] ||
2289 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2291 if [ $MDSCOUNT -ge 2 ]; then
2292 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2293 [ "$cur_size" == "$saved_size1" ] ||
2294 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# This check is on f3, so the message names file3.
2297 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2298 [ "$cur_size" == "$saved_size2" ] ||
2299 error "(9) Expect file3 size $saved_size2, but got $cur_size"
2301 run_test 18a "Find out orphan OST-object and repair it (1)"
# test_18b body: the MDT-objects themselves are lost; layout LFSCK run with -o
# must re-create them under .lustre/lost+found/MDTxxxx, after which the test
# moves them back into the namespace and checks their sizes.
# NOTE(review): this listing omits some lines of the original function
# (header, closers, a few statements); the comments added here describe only
# the lines that are visible.
2304 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2305 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2306 skip "MDS older than 2.5.55, LU-3336"
2309 echo "The target MDT-object is lost. The LFSCK should re-create the"
2310 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
2311 echo "can move it back to normal namespace manually."
# Create the files and remember each size and FID; the FID is used later to
# build the expected lost+found name "<fid>-R-0".
2314 check_mount_and_prep
2315 $LFS mkdir -i 0 $DIR/$tdir/a1
2316 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2317 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2318 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2319 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2321 $LFS getstripe $DIR/$tdir/a1/f1
2323 if [ $MDSCOUNT -ge 2 ]; then
2324 $LFS mkdir -i 1 $DIR/$tdir/a2
2325 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2326 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
# fid2 is declared local like fid1/fid3 so it does not leak to global scope.
2327 local fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
2329 $LFS getstripe $DIR/$tdir/a2/f2
2332 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2333 error "(0) Fail to create PFL $DIR/$tdir/f3"
2335 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2337 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2338 local fid3=$($LFS path2fid $DIR/$tdir/f3)
2340 $LFS getstripe $DIR/$tdir/f3
2342 cancel_lru_locks osc
# Remove the files while fail_loc 0x1616 is set on the serving MDS.
2344 echo "Inject failure, to simulate the case of missing the MDT-object"
2345 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2346 do_facet mds1 $LCTL set_param fail_loc=0x1616
2347 rm -f $DIR/$tdir/a1/f1
2349 if [ $MDSCOUNT -ge 2 ]; then
2350 do_facet mds2 $LCTL set_param fail_loc=0x1616
2351 rm -f $DIR/$tdir/a2/f2
2359 do_facet mds1 $LCTL set_param fail_loc=0
2360 if [ $MDSCOUNT -ge 2 ]; then
2361 do_facet mds2 $LCTL set_param fail_loc=0
2364 cancel_lru_locks mdc
2365 cancel_lru_locks osc
2367 # dryrun mode only checks for orphans, it does not repair them
2368 echo "Trigger layout LFSCK --dryrun to find out orphan OST-object"
2369 $START_LAYOUT --dryrun -o -r ||
2370 error "Fail to start layout LFSCK in dryrun mode"
2371 wait_all_targets_blocked layout completed 2
2373 local PARAMS=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
2374 [ "$PARAMS" == "dryrun,all_targets,orphan" ] ||
2375 error "Expect 'dryrun,all_targets,orphan', got '$PARAMS'"
2377 local orphans=$(do_facet mds1 $LCTL get_param -n \
2378 mdd.$(facet_svc mds1).lfsck_layout |
2379 awk '/^inconsistent_orphan/ { print $2 }')
2380 [ $orphans -eq 3 ] ||
2381 error "Expect 3 found on mds1, but got: $orphans"
2383 # orphan parents should not be created
2385 for subdir in $MOUNT/.lustre/lost+found/*; do
2386 [ ! "$(ls -A $subdir)" ] || error "$subdir not empty"
# Second run, without --dryrun, so the orphans are actually repaired.
2389 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2390 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2392 for k in $(seq $MDSCOUNT); do
2393 # The LFSCK status query interval is 30 seconds. For the case
2394 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2395 # time to guarantee the status sync up.
2396 wait_update_facet mds${k} "$LCTL get_param -n \
2397 mdd.$(facet_svc mds${k}).lfsck_layout |
2398 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2399 error "(2) MDS${k} is not the expected 'completed'"
2402 for k in $(seq $OSTCOUNT); do
2403 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2404 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2405 awk '/^status/ { print $2 }')
2406 [ "$cur_status" == "completed" ] ||
2407 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan counts: 3 on mds1 and, with >= 2 MDTs, 2 on mds2.
2410 local repaired=$(do_facet mds1 $LCTL get_param -n \
2411 mdd.$(facet_svc mds1).lfsck_layout |
2412 awk '/^repaired_orphan/ { print $2 }')
2413 [ $repaired -eq 3 ] ||
2414 error "(4.1) Expect 3 fixed on mds1, but got: $repaired"
2416 if [ $MDSCOUNT -ge 2 ]; then
2417 repaired=$(do_facet mds2 $LCTL get_param -n \
2418 mdd.$(facet_svc mds2).lfsck_layout |
2419 awk '/^repaired_orphan/ { print $2 }')
2420 [ $repaired -eq 2 ] ||
2421 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
2424 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2425 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2426 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2428 if [ $MDSCOUNT -ge 2 ]; then
2429 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
2430 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
# Tagged (6.1) so this failure is distinguishable from the f1 move (5).
2433 mv $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0 $DIR/$tdir/f3 ||
2434 error "(6.1) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
2436 $LFS path2fid $DIR/$tdir/a1/f1
2437 $LFS getstripe $DIR/$tdir/a1/f1
2439 if [ $MDSCOUNT -ge 2 ]; then
2440 $LFS path2fid $DIR/$tdir/a2/f2
2441 $LFS getstripe $DIR/$tdir/a2/f2
2444 $LFS path2fid $DIR/$tdir/f3
2445 $LFS getstripe $DIR/$tdir/f3
2447 echo "The file size should be correct after layout LFSCK scanning"
2448 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2449 [ "$cur_size" == "$saved_size1" ] ||
2450 error "(7) Expect file1 size $saved_size1, but got $cur_size"
# f2 was written with the same dd size as f1, so saved_size1 is reused.
2452 if [ $MDSCOUNT -ge 2 ]; then
2453 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2454 [ "$cur_size" == "$saved_size1" ] ||
2455 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# This check is on f3, so the message names file3.
2458 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2459 [ "$cur_size" == "$saved_size2" ] ||
2460 error "(9) Expect file3 size $saved_size2, but got $cur_size"
2462 run_test 18b "Find out orphan OST-object and repair it (2)"
# test_18c body: the MDT-objects are lost and the OST-objects were written
# while fail_loc 0x1617 was set on the OSS nodes; layout LFSCK run with -o is
# expected to leave "*-N-*" stubs under .lustre/lost+found/MDT0000.
# NOTE(review): this listing omits some lines of the original function
# (header, closers, a few statements such as the assignment of $expected);
# the comments added here describe only the lines that are visible.
2465 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2466 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2467 skip "MDS older than 2.5.55, LU-3336"
2470 echo "The target MDT-object is lost, and the OST-object FID is missing."
2471 echo "The LFSCK should re-create the MDT-object with new FID under the "
2472 echo "directory .lustre/lost+found/MDTxxxx."
2475 check_mount_and_prep
2476 $LFS mkdir -i 0 $DIR/$tdir/a1
2477 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
# All file data below is written while fail_loc 0x1617 is set on every OSS.
2479 echo "Inject failure, to simulate the case of missing parent FID"
2480 #define OBD_FAIL_LFSCK_NOPFID 0x1617
2481 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1617
2483 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2484 $LFS getstripe $DIR/$tdir/a1/f1
2486 if [ $MDSCOUNT -ge 2 ]; then
2487 $LFS mkdir -i 1 $DIR/$tdir/a2
2488 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a2
2489 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2490 $LFS getstripe $DIR/$tdir/a2/f2
2493 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2494 error "(0) Fail to create PFL $DIR/$tdir/f3"
2496 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2497 $LFS getstripe $DIR/$tdir/f3
2499 cancel_lru_locks osc
2500 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
# Remove f1 (and f2) while fail_loc 0x1616 is set on the serving MDS.
2502 echo "Inject failure, to simulate the case of missing the MDT-object"
2503 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2504 do_facet mds1 $LCTL set_param fail_loc=0x1616
2505 rm -f $DIR/$tdir/a1/f1
2507 if [ $MDSCOUNT -ge 2 ]; then
2508 do_facet mds2 $LCTL set_param fail_loc=0x1616
2509 rm -f $DIR/$tdir/a2/f2
2517 do_facet mds1 $LCTL set_param fail_loc=0
2518 if [ $MDSCOUNT -ge 2 ]; then
2519 do_facet mds2 $LCTL set_param fail_loc=0
2522 cancel_lru_locks mdc
2523 cancel_lru_locks osc
2525 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2526 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2528 for k in $(seq $MDSCOUNT); do
2529 # The LFSCK status query interval is 30 seconds. For the case
2530 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2531 # time to guarantee the status sync up.
2532 wait_update_facet mds${k} "$LCTL get_param -n \
2533 mdd.$(facet_svc mds${k}).lfsck_layout |
2534 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2535 error "(2) MDS${k} is not the expected 'completed'"
2538 for k in $(seq $OSTCOUNT); do
2539 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2540 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2541 awk '/^status/ { print $2 }')
2542 [ "$cur_status" == "completed" ] ||
2543 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
2546 if [ $MDSCOUNT -ge 2 ]; then
# mds1 must report $expected repaired orphans; mds2 must report none.
2552 local repaired=$(do_facet mds1 $LCTL get_param -n \
2553 mdd.$(facet_svc mds1).lfsck_layout |
2554 awk '/^repaired_orphan/ { print $2 }')
2555 [ $repaired -eq $expected ] ||
2556 error "(4) Expect $expected fixed on mds1, but got: $repaired"
2558 if [ $MDSCOUNT -ge 2 ]; then
2559 repaired=$(do_facet mds2 $LCTL get_param -n \
2560 mdd.$(facet_svc mds2).lfsck_layout |
2561 awk '/^repaired_orphan/ { print $2 }')
2562 [ $repaired -eq 0 ] ||
2563 error "(5) Expect 0 fixed on mds2, but got: $repaired"
2566 ls -ail $MOUNT/.lustre/lost+found/
# The -name pattern is quoted so that find receives it verbatim; unquoted,
# the shell could expand it against files in the current directory.
2568 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
2569 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
2570 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name "*-N-*")
2572 error "(6) .lustre/lost+found/MDT0001/ should be empty"
2575 echo "There should be some stub under .lustre/lost+found/MDT0000/"
2576 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2577 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
2579 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-N-*")
2580 [ ! -z "$cname" ] ||
2581 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2583 run_test 18c "Find out orphan OST-object and repair it (3)"
# test_18d body: f2 and f4 are made to reference other files' OST-objects,
# which are then removed; layout LFSCK (-o -c -d) is expected to re-attach the
# original orphan OST-objects rather than create new ones.
# NOTE(review): this listing omits some lines of the original function
# (header, closers, a few statements); the comments added here describe only
# the lines that are visible.
2586 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2587 skip "MDS older than 2.5.55, LU-3336"
2590 echo "The target MDT-object layout EA is corrupted, but the right"
2591 echo "OST-object is still alive as orphan. The layout LFSCK will"
2592 echo "not create new OST-object to occupy such slot."
2595 check_mount_and_prep
# f1/f3 are the "guard" files, f2 (plain) and f4 (PFL) hold the data "foo".
2597 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2598 echo "guard" > $DIR/$tdir/a1/f1
2599 echo "foo" > $DIR/$tdir/a1/f2
2601 echo "guard" > $DIR/$tdir/a1/f3
2602 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2603 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2604 echo "foo" > $DIR/$tdir/a1/f4
# Remember the sizes of f2 and f4 for the comparison after the repair.
2606 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2607 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2608 $LFS path2fid $DIR/$tdir/a1/f1
2609 $LFS getstripe $DIR/$tdir/a1/f1
2610 $LFS path2fid $DIR/$tdir/a1/f2
2611 $LFS getstripe $DIR/$tdir/a1/f2
2612 $LFS path2fid $DIR/$tdir/a1/f3
2613 $LFS getstripe $DIR/$tdir/a1/f3
2614 $LFS path2fid $DIR/$tdir/a1/f4
2615 $LFS getstripe $DIR/$tdir/a1/f4
2616 cancel_lru_locks osc
2618 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2619 echo "to reference the same OST-object (which is f1's OST-obejct)."
2620 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2621 echo "dangling reference case, but f2's old OST-object is there."
2623 echo "The failure also makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2624 echo "to reference the same OST-object (which is f3's OST-obejct)."
2625 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2626 echo "dangling reference case, but f4's old OST-object is there."
# The chown and rm calls below run while fail_loc 0x1618 is set on the MDS.
2629 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2630 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2631 chown 1.1 $DIR/$tdir/a1/f2
2632 chown 1.1 $DIR/$tdir/a1/f4
2633 rm -f $DIR/$tdir/a1/f1
2634 rm -f $DIR/$tdir/a1/f3
2637 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2639 echo "stopall to cleanup object cache"
2642 setupall > /dev/null
2644 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2645 $START_LAYOUT -r -o -c -d || error "(2) Fail to start LFSCK for layout!"
2647 for k in $(seq $MDSCOUNT); do
2648 # The LFSCK status query interval is 30 seconds. For the case
2649 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2650 # time to guarantee the status sync up.
2651 wait_update_facet mds${k} "$LCTL get_param -n \
2652 mdd.$(facet_svc mds${k}).lfsck_layout |
2653 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2654 error "(3) MDS${k} is not the expected 'completed'"
2657 for k in $(seq $OSTCOUNT); do
2658 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2659 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2660 awk '/^status/ { print $2 }')
2661 [ "$cur_status" == "completed" ] ||
2662 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
# Expected counters: 2 repaired orphans and 0 repaired dangling references.
2665 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2666 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2667 awk '/^repaired_orphan/ { print $2 }')
2668 [ $repaired -eq 2 ] ||
2669 error "(5) Expect 2 orphans have been fixed, but got: $repaired"
2671 repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2672 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2673 awk '/^repaired_dangling/ { print $2 }')
2674 [ $repaired -eq 0 ] ||
2675 error "(6) Expect 0 dangling has been fixed, but got: $repaired"
2677 echo "The file size should be correct after layout LFSCK scanning"
2678 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2679 [ "$cur_size" == "$saved_size1" ] ||
2680 error "(7) Expect file2 size $saved_size1, but got $cur_size"
2682 cur_size=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2683 [ "$cur_size" == "$saved_size2" ] ||
2684 error "(8) Expect file4 size $saved_size2, but got $cur_size"
# Print content, FID and layout of f2/f4 to the test log; nothing is asserted.
2686 echo "The LFSCK should find back the original data."
2687 cat $DIR/$tdir/a1/f2
2688 $LFS path2fid $DIR/$tdir/a1/f2
2689 $LFS getstripe $DIR/$tdir/a1/f2
2690 cat $DIR/$tdir/a1/f4
2691 $LFS path2fid $DIR/$tdir/a1/f4
2692 $LFS getstripe $DIR/$tdir/a1/f4
2694 run_test 18d "Find out orphan OST-object and repair it (4)"
# test_18e body: same setup as the dangling-reference case, but f2/f4 are
# modified while LFSCK is in scanning-phase2; the old orphan OST-objects are
# then expected to show up as "*-C-*" stubs under .lustre/lost+found/MDT0000.
# NOTE(review): this listing omits some lines of the original function
# (header, closers, a few statements); the comments added here describe only
# the lines that are visible.
2697 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2698 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2699 skip "MDS older than 2.5.55, LU-3336"
2702 echo "The target MDT-object layout EA slot is occpuied by some new"
2703 echo "created OST-object when repair dangling reference case. Such"
2704 echo "conflict OST-object has been modified by others. To keep the"
2705 echo "new data, the LFSCK will create a new file to refernece this"
2706 echo "old orphan OST-object."
2709 check_mount_and_prep
# f1/f3 are the "guard" files, f2 (plain) and f4 (PFL) hold the data "foo".
2711 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2712 echo "guard" > $DIR/$tdir/a1/f1
2713 echo "foo" > $DIR/$tdir/a1/f2
2715 echo "guard" > $DIR/$tdir/a1/f3
2716 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2717 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2718 echo "foo" > $DIR/$tdir/a1/f4
# Remember the original sizes of f2 and f4; the stubs are compared to them.
2720 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2721 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2723 $LFS path2fid $DIR/$tdir/a1/f1
2724 $LFS getstripe $DIR/$tdir/a1/f1
2725 $LFS path2fid $DIR/$tdir/a1/f2
2726 $LFS getstripe $DIR/$tdir/a1/f2
2727 $LFS path2fid $DIR/$tdir/a1/f3
2728 $LFS getstripe $DIR/$tdir/a1/f3
2729 $LFS path2fid $DIR/$tdir/a1/f4
2730 $LFS getstripe $DIR/$tdir/a1/f4
2731 cancel_lru_locks osc
2733 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2734 echo "to reference the same OST-object (which is f1's OST-obejct)."
2735 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2736 echo "dangling reference case, but f2's old OST-object is there."
2738 echo "Also the failure makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2739 echo "to reference the same OST-object (which is f3's OST-obejct)."
2740 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2741 echo "dangling reference case, but f4's old OST-object is there."
# The chown and rm calls below run while fail_loc 0x1618 is set on the MDS.
2744 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2745 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2746 chown 1.1 $DIR/$tdir/a1/f2
2747 chown 1.1 $DIR/$tdir/a1/f4
2748 rm -f $DIR/$tdir/a1/f1
2749 rm -f $DIR/$tdir/a1/f3
2752 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2754 echo "stopall to cleanup object cache"
2757 setupall > /dev/null
# fail_loc 0x1602 with fail_val=10 stays set until after the writes below.
2759 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2760 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
2762 start_full_debug_logging
2764 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2765 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
2767 wait_update_facet mds1 "$LCTL get_param -n \
2768 mdd.$(facet_svc mds1).lfsck_layout |
2769 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2770 error "(3) MDS1 is not the expected 'scanning-phase2'"
2772 # to guarantee all updates are synced.
2776 echo "Write new data to f2/f4 to modify the new created OST-object."
2777 echo "dummy" >> $DIR/$tdir/a1/f2 || error "write a1/f2 failed"
2778 echo "dummy" >> $DIR/$tdir/a1/f4 || error "write a1/f4 failed"
2780 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
2782 for k in $(seq $MDSCOUNT); do
2783 # The LFSCK status query interval is 30 seconds. For the case
2784 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2785 # time to guarantee the status sync up.
2786 wait_update_facet mds${k} "$LCTL get_param -n \
2787 mdd.$(facet_svc mds${k}).lfsck_layout |
2788 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2789 error "(4) MDS${k} is not the expected 'completed'"
2792 for k in $(seq $OSTCOUNT); do
2793 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2794 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2795 awk '/^status/ { print $2 }')
2796 [ "$cur_status" == "completed" ] ||
2797 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2800 stop_full_debug_logging
2802 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2803 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2804 awk '/^repaired_orphan/ { print $2 }')
2805 [ $repaired -eq 2 ] ||
2806 error "(6) Expect 2 orphans have been fixed, but got: $repaired"
2808 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2809 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2810 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# Here the shell glob is intentional: ls lists the matching stub files.
2812 local count=$(ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-* | wc -l)
2813 if [ $count -ne 2 ]; then
2814 ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-*
2815 error "(8) Expect 2 stubs under lost+found, but got $count"
# The -name pattern is quoted so that find receives it verbatim; unquoted,
# the shell could expand it against files in the current directory.
2818 echo "The stub file should keep the original f2 or f4 data"
2819 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-C-*" | head -n 1)
2820 local cur_size=$(ls -il $cname | awk '{ print $6 }')
2821 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2822 error "(9) Got unexpected $cur_size"
2825 $LFS path2fid $cname
2826 $LFS getstripe $cname
2828 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-C-*" | tail -n 1)
2829 cur_size=$(ls -il $cname | awk '{ print $6 }')
2830 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2831 error "(10) Got unexpected $cur_size"
2834 $LFS path2fid $cname
2835 $LFS getstripe $cname
# Print content, FID and layout of f2/f4 to the test log; nothing is asserted.
2837 echo "The f2/f4 should contains new data."
2838 cat $DIR/$tdir/a1/f2
2839 $LFS path2fid $DIR/$tdir/a1/f2
2840 $LFS getstripe $DIR/$tdir/a1/f2
2841 cat $DIR/$tdir/a1/f4
2842 $LFS path2fid $DIR/$tdir/a1/f4
2843 $LFS getstripe $DIR/$tdir/a1/f4
2845 run_test 18e "Find out orphan OST-object and repair it (5)"
# test_18f body: with fail_loc 0x161c set on mds1 the first layout LFSCK run
# is expected to end as "partial" on the MDTs and on ost1; a second run
# without the fail_loc must then complete everywhere.
# NOTE(review): this listing omits some lines of the original function
# (header, closers, a few statements); the comments added here describe only
# the lines that are visible.
2848 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
2851 echo "The target MDT-object is lost. The LFSCK should re-create the"
2852 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2853 echo "to verify some OST-object(s) during the first stage-scanning,"
2854 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2855 echo "should not be affected."
# Directories on MDT 0: a1 (1 stripe, files guard and f1) and a2 (2 stripes,
# file f2); every file is 2MB.
2858 check_mount_and_prep
2859 $LFS mkdir -i 0 $DIR/$tdir/a1
2860 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2861 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2862 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2863 $LFS mkdir -i 0 $DIR/$tdir/a2
2864 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
2865 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2866 $LFS getstripe $DIR/$tdir/a1/f1
2867 $LFS getstripe $DIR/$tdir/a2/f2
# The same pair of directories (a3, a4) on MDT 1 when there are >= 2 MDTs.
2869 if [ $MDSCOUNT -ge 2 ]; then
2870 $LFS mkdir -i 1 $DIR/$tdir/a3
2871 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a3
2872 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2873 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2874 $LFS mkdir -i 1 $DIR/$tdir/a4
2875 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
2876 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2877 $LFS getstripe $DIR/$tdir/a3/f3
2878 $LFS getstripe $DIR/$tdir/a4/f4
2881 cancel_lru_locks osc
# Remove f1..f4 while fail_loc 0x1616 is set on the serving MDS.
2883 echo "Inject failure, to simulate the case of missing the MDT-object"
2884 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2885 do_facet mds1 $LCTL set_param fail_loc=0x1616
2886 rm -f $DIR/$tdir/a1/f1
2887 rm -f $DIR/$tdir/a2/f2
2889 if [ $MDSCOUNT -ge 2 ]; then
2890 do_facet mds2 $LCTL set_param fail_loc=0x1616
2891 rm -f $DIR/$tdir/a3/f3
2892 rm -f $DIR/$tdir/a4/f4
2898 do_facet mds1 $LCTL set_param fail_loc=0
2899 if [ $MDSCOUNT -ge 2 ]; then
2900 do_facet mds2 $LCTL set_param fail_loc=0
2903 cancel_lru_locks mdc
2904 cancel_lru_locks osc
2906 echo "Inject failure, to simulate the OST0 fail to handle"
2907 echo "MDT0 LFSCK request during the first-stage scanning."
2908 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
2909 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
# First run: expect "partial" on every MDT and on ost1, "completed" on ost2.
2911 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2912 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2914 for k in $(seq $MDSCOUNT); do
2915 # The LFSCK status query interval is 30 seconds. For the case
2916 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2917 # time to guarantee the status sync up.
2918 wait_update_facet mds${k} "$LCTL get_param -n \
2919 mdd.$(facet_svc mds${k}).lfsck_layout |
2920 awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
2921 error "(2) MDS${k} is not the expected 'partial'"
2924 wait_update_facet ost1 "$LCTL get_param -n \
2925 obdfilter.$(facet_svc ost1).lfsck_layout |
2926 awk '/^status/ { print \\\$2 }'" "partial" $LTIME || {
2927 error "(3) OST1 is not the expected 'partial'"
2930 wait_update_facet ost2 "$LCTL get_param -n \
2931 obdfilter.$(facet_svc ost2).lfsck_layout |
2932 awk '/^status/ { print \\\$2 }'" "completed" $LTIME || {
2933 error "(4) OST2 is not the expected 'completed'"
2936 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
# After the partial run each MDT is expected to have repaired one orphan.
2938 local repaired=$(do_facet mds1 $LCTL get_param -n \
2939 mdd.$(facet_svc mds1).lfsck_layout |
2940 awk '/^repaired_orphan/ { print $2 }')
2941 [ $repaired -eq 1 ] ||
2942 error "(5) Expect 1 fixed on mds{1}, but got: $repaired"
2944 if [ $MDSCOUNT -ge 2 ]; then
2945 repaired=$(do_facet mds2 $LCTL get_param -n \
2946 mdd.$(facet_svc mds2).lfsck_layout |
2947 awk '/^repaired_orphan/ { print $2 }')
2948 [ $repaired -eq 1 ] ||
2949 error "(6) Expect 1 fixed on mds{2}, but got: $repaired"
# Second run with the fail_loc cleared: everything must reach "completed".
2952 echo "Trigger layout LFSCK on all devices again to cleanup"
2953 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
2955 for k in $(seq $MDSCOUNT); do
2956 # The LFSCK status query interval is 30 seconds. For the case
2957 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2958 # time to guarantee the status sync up.
2959 wait_update_facet mds${k} "$LCTL get_param -n \
2960 mdd.$(facet_svc mds${k}).lfsck_layout |
2961 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2962 error "(8) MDS${k} is not the expected 'completed'"
2965 for k in $(seq $OSTCOUNT); do
2966 cur_status=$(do_facet ost${k} $LCTL get_param -n \
2967 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2968 awk '/^status/ { print $2 }')
2969 [ "$cur_status" == "completed" ] ||
2970 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
# After the second run each MDT is expected to report two repaired orphans.
2974 local repaired=$(do_facet mds1 $LCTL get_param -n \
2975 mdd.$(facet_svc mds1).lfsck_layout |
2976 awk '/^repaired_orphan/ { print $2 }')
2977 [ $repaired -eq 2 ] ||
2978 error "(10) Expect 2 fixed on mds{1}, but got: $repaired"
2980 if [ $MDSCOUNT -ge 2 ]; then
2981 repaired=$(do_facet mds2 $LCTL get_param -n \
2982 mdd.$(facet_svc mds2).lfsck_layout |
2983 awk '/^repaired_orphan/ { print $2 }')
2984 [ $repaired -eq 2 ] ||
2985 error "(11) Expect 2 fixed on mds{2}, but got: $repaired"
2988 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# test_18g: a file striped with "-c -1" under a1 is written, then removed
# while fail_loc 0x162e (OBD_FAIL_LFSCK_LOST_MDTOBJ2) is set on mds1.
# Layout LFSCK is then started with "-r -o"; the test expects every MDT and
# OST to report 'completed', mds1 to report $OSTCOUNT repaired orphans, and
# a ${fid1}-R-0 entry under .lustre/lost+found/MDT0000 that can be moved
# back to the original path.
2991 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2994 echo "The target MDT-object is lost, but related OI mapping is there"
2995 echo "The LFSCK should recreate the lost MDT-object without affected"
2996 echo "by the stale OI mapping."
2999 check_mount_and_prep
3000 $LFS mkdir -i 0 $DIR/$tdir/a1
3001 $LFS setstripe -c -1 -i 0 -S 1M $DIR/$tdir/a1
# Write $OSTCOUNT blocks of 1M, matching the 1M stripe size set above.
3002 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=$OSTCOUNT
# Remember the FID: the lost+found entry checked below is named after it.
3003 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
3005 $LFS getstripe $DIR/$tdir/a1/f1
3006 cancel_lru_locks osc
3008 echo "Inject failure to simulate lost MDT-object but keep OI mapping"
3009 #define OBD_FAIL_LFSCK_LOST_MDTOBJ2 0x162e
3010 do_facet mds1 $LCTL set_param fail_loc=0x162e
3011 rm -f $DIR/$tdir/a1/f1
# Clear the failure injection before LFSCK is started.
3013 do_facet mds1 $LCTL set_param fail_loc=0
3014 cancel_lru_locks mdc
3015 cancel_lru_locks osc
3017 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3018 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
3020 for k in $(seq $MDSCOUNT); do
3021 # The LFSCK status query interval is 30 seconds. For the case
3022 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3023 # time to guarantee the status sync up.
3024 wait_update_facet mds${k} "$LCTL get_param -n \
3025 mdd.$(facet_svc mds${k}).lfsck_layout |
3026 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
3027 error "(2) MDS${k} is not the expected 'completed'"
# OSTs are checked once, without waiting, after all MDTs reported 'completed'.
3030 for k in $(seq $OSTCOUNT); do
3031 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3032 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3033 awk '/^status/ { print $2 }')
3034 [ "$cur_status" == "completed" ] ||
3035 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# One repaired orphan is expected per OST (the dd above wrote $OSTCOUNT MiB).
3038 local repaired=$(do_facet mds1 $LCTL get_param -n \
3039 mdd.$(facet_svc mds1).lfsck_layout |
3040 awk '/^repaired_orphan/ { print $2 }')
3041 [ $repaired -eq $OSTCOUNT ] ||
3042 error "(4) Expect $OSTCOUNT fixed, but got: $repaired"
# The message below now names the same .lustre/lost+found path that the
# mv command actually uses.
3044 echo "Move the files from .lustre/lost+found/MDTxxxx to namespace"
3045 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
3046 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3048 $LFS path2fid $DIR/$tdir/a1/f1
3049 $LFS getstripe $DIR/$tdir/a1/f1
3051 run_test 18g "Find out orphan OST-object and repair it (7)"
# test_18h: creates a two-component PFL file f0 (first component ends at 2M),
# fills it, keeps a copy as "guard", then changes its owner while fail_loc
# 0x162f (OBD_FAIL_LFSCK_BAD_PFL_RANGE) is set on $SINGLEMDS. A write to the
# file is expected to fail until layout LFSCK ("-r -o") has run; afterwards
# the test expects repaired_orphan == 2, unchanged data and a working write.
3055 echo "The PFL extent crashed. During the first cycle LFSCK scanning,"
3056 echo "the layout LFSCK will keep the bad PFL file(s) there without"
3057 echo "scanning its OST-object(s). Then in the second stage scanning,"
3058 echo "the OST will return related OST-object(s) to the MDT as orphan."
3059 echo "And then the LFSCK on the MDT can rebuild the PFL extent with"
3060 echo "the 'orphan(s)' stripe information."
3063 check_mount_and_prep
3065 $LFS setstripe -E 2M -S 1M -c 1 -E -1 $DIR/$tdir/f0 ||
3066 error "(0) Fail to create PFL $DIR/$tdir/f0"
# Write at offset 0 and again at a 2M offset (bs=1M seek=2), i.e. at the
# start of each of the two components defined above.
3068 cat $LUSTRE/tests/test-framework.sh > $DIR/$tdir/f0 ||
3069 error "(1.1) Fail to write $DIR/$tdir/f0"
3071 dd if=$LUSTRE/tests/test-framework.sh of=$DIR/$tdir/f0 bs=1M seek=2 ||
3072 error "(1.2) Fail to write $DIR/$tdir/f0"
# Reference copy used by the diff in check (6).
3074 cp $DIR/$tdir/f0 $DIR/$tdir/guard
3076 echo "Inject failure stub to simulate bad PFL extent range"
3077 #define OBD_FAIL_LFSCK_BAD_PFL_RANGE 0x162f
3078 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162f
# Use ':' as the owner/group separator; the '.' form is deprecated in chown.
3080 chown 1:1 $DIR/$tdir/f0
3082 cancel_lru_locks mdc
3083 cancel_lru_locks osc
3084 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3086 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 &&
3087 error "(2) Write to bad PFL file should fail"
# The message describes what this test injects (a bad PFL extent range).
3089 echo "Trigger layout LFSCK to find out the bad PFL extent range and fix it"
3090 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
3092 for k in $(seq $MDSCOUNT); do
3093 # The LFSCK status query interval is 30 seconds. For the case
3094 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3095 # time to guarantee the status sync up.
3096 wait_update_facet mds${k} "$LCTL get_param -n \
3097 mdd.$(facet_svc mds${k}).lfsck_layout |
3098 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
3099 error "(4.1) MDS${k} is not the expected 'completed'"
# OSTs are checked once, without waiting, after all MDTs reported 'completed'.
3102 for k in $(seq $OSTCOUNT); do
3103 cur_status=$(do_facet ost${k} $LCTL get_param -n \
3104 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3105 awk '/^status/ { print $2 }')
3106 [ "$cur_status" == "completed" ] ||
3107 error "(4.2) OST${k} Expect 'completed', but got '$cur_status'"
# Two repaired orphans are expected on the MDT addressed by $SHOW_LAYOUT.
3111 local repaired=$($SHOW_LAYOUT |
3112 awk '/^repaired_orphan/ { print $2 }')
3113 [ $repaired -eq 2 ] ||
3114 error "(5) Fail to repair crashed PFL range: $repaired"
3116 echo "Data in $DIR/$tdir/f0 should not be broken"
3117 diff $DIR/$tdir/f0 $DIR/$tdir/guard ||
3118 error "(6) Data in $DIR/$tdir/f0 is broken"
3120 echo "Write should succeed after LFSCK repairing the bad PFL range"
3121 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 ||
3122 error "(7) Write should succeed after LFSCK"
3124 run_test 18h "LFSCK can repair crashed PFL extent range"
# Applies "debug=-cache" through $LCTL on the node running this script;
# the command's standard output is discarded.
# NOTE(review): presumably this removes the "cache" flag from the Lustre
# debug mask for the tests that follow — confirm.
3126 $LCTL set_param debug=-cache > /dev/null
# test_19a: requires an MDS newer than 2.5.55. With lfsck_verify_pfid set to
# 0 on OST0000 it creates a plain file a0, a PFL file a1 and a file a2, all
# under a directory striped on OST index 0. It then sets lfsck_verify_pfid
# to 1, sets fail_loc 0x1619 (OBD_FAIL_LFSCK_INVALID_PFID) and expects reads
# of a0 and a1 to fail.
3129 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
3130 skip "MDS older than 2.5.55, LU-3951"
3132 check_mount_and_prep
3133 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# Turn lfsck_verify_pfid off on OST0000 while the files are created.
3135 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
3136 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
3138 echo "foo1" > $DIR/$tdir/a0
3139 $LFS setstripe -E 512K -S 512K -o 0 -E -1 -S 1M $DIR/$tdir/a1 ||
3140 error "(0) Fail to create PFL $DIR/$tdir/a1"
3141 echo "foo2" > $DIR/$tdir/a1
3142 echo "guard" > $DIR/$tdir/a2
3143 cancel_lru_locks osc
3145 echo "Inject failure, then client will offer wrong parent FID when read"
3146 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
3147 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
# fail_loc is set with a plain $LCTL call (no do_facet/do_nodes), i.e. on the
# node running this script.
3149 #define OBD_FAIL_LFSCK_INVALID_PFID 0x1619
3150 $LCTL set_param fail_loc=0x1619
3152 echo "Read RPC with wrong parent FID should be denied"
# A successful cat is the failure case here, hence "&& error".
3153 cat $DIR/$tdir/a0 && error "(3.1) Read a0 should be denied!"
3154 cat $DIR/$tdir/a1 && error "(3.2) Read a1 should be denied!"
3155 $LCTL set_param fail_loc=0
3157 run_test 19a "OST-object inconsistency self detect"
# test_19b: requires an MDS newer than 2.5.55. Files f0 (plain) and f1 (PFL)
# are created while fail_loc 0x1611 (OBD_FAIL_LFSCK_UNMATCHED_PAIR1) is set
# on the OST nodes and lfsck_verify_pfid is 0 on OST0000. The "repaired"
# value read from lfsck_verify_pfid must be 0 at that point; after
# lfsck_verify_pfid is set to 1 and both files are read, it must be 2.
3160 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
3161 skip "MDS older than 2.5.55, LU-3951"
3163 check_mount_and_prep
3164 $LFS setstripe -c 1 -i 0 $DIR/$tdir
3166 echo "Inject failure stub to make the OST-object to back point to"
3167 echo "non-exist MDT-object"
3169 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
3170 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
3172 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
3173 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
3174 echo "foo1" > $DIR/$tdir/f0
3175 $LFS setstripe -E 1M -S 1M -o 0 -E 4M -S 256K $DIR/$tdir/f1 ||
3176 error "(0) Fail to create PFL $DIR/$tdir/f1"
3177 echo "foo2" > $DIR/$tdir/f1
3178 cancel_lru_locks osc
3179 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
# lfsck_verify_pfid is written with 0 once more on ost1 before the
# "repaired" value is read back.
3181 do_facet ost1 $LCTL set_param -n \
3182 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
3183 echo "Nothing should be fixed since self detect and repair is disabled"
3184 local repaired=$(do_facet ost1 $LCTL get_param -n \
3185 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
3186 awk '/^repaired/ { print $2 }')
3187 [ $repaired -eq 0 ] ||
3188 error "(1) Expected 0 repaired, but got $repaired"
3190 echo "Read RPC with right parent FID should be accepted,"
3191 echo "and cause parent FID on OST to be fixed"
3193 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
3194 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
3196 cat $DIR/$tdir/f0 || error "(2.1) Read f0 should not be denied!"
3197 cat $DIR/$tdir/f1 || error "(2.2) Read f1 should not be denied!"
# Two files were read above (f0 and f1), so two repairs are expected; the
# error text states the same number the check enforces.
3199 repaired=$(do_facet ost1 $LCTL get_param -n \
3200 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
3201 awk '/^repaired/ { print $2 }')
3202 [ $repaired -eq 2 ] ||
3203 error "(3) Expected 2 repaired, but got $repaired"
3205 run_test 19b "OST-object inconsistency self repair"
# Strings compared against the output of "$LFS getstripe -L" by the checks
# that follow: PATTERN_WITH_HOLE is expected for files recovered with a
# LOV EA hole, PATTERN_WITHOUT_HOLE for files recovered intact.
# NOTE(review): 40000001 looks like the hole flag 0x40000000 combined with
# the raid0 pattern value — confirm against the lfs output format.
3207 PATTERN_WITH_HOLE="40000001"
3208 PATTERN_WITHOUT_HOLE="raid0"
# test_20a: needs at least 2 OSTs, no FILESET and an MDS newer than 2.5.55.
# Files f0..f2 (plus f3 when there are more than 2 OSTs) are written with
# $bcount 4096-byte blocks under a1, which is striped over 3 OSTs (or 2).
# They are then removed under fail_loc 0x1616 (f0) and 0x161a (f1, then f2
# with fail_val=1, then f3 with fail_val=2). The client is unmounted, layout
# LFSCK is run with "-r -o", and mds1 must report 9 repaired orphans (4 with
# only 2 OSTs). After remounting, each ${fidN}-R-0 entry under
# .lustre/lost+found/MDT0000 is checked for pattern, stripe count, size and
# read/write behaviour, then unlinked.
# NOTE(review): the assignment of bcount is not visible here; the stripe
# layout comment below suggests 257 or 513 blocks — confirm.
3211 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
3212 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
3213 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
3214 skip "MDS older than 2.5.55, LU-4887"
3217 echo "The target MDT-object and some of its OST-object are lost."
3218 echo "The LFSCK should find out the left OST-objects and re-create"
3219 echo "the MDT-object under the directory .lustre/lost+found/MDTxxxx/"
3220 echo "with the partial OST-objects (LOV EA hole)."
3222 echo "New client can access the file with LOV EA hole via normal"
3223 echo "system tools or commands without crash the system."
3225 echo "For old client, even though it cannot access the file with"
3226 echo "LOV EA hole, it should not cause the system crash."
3229 check_mount_and_prep
3230 $LFS mkdir -i 0 $DIR/$tdir/a1
3231 if [ $OSTCOUNT -gt 2 ]; then
3232 $LFS setstripe -c 3 -i 0 -S 1M $DIR/$tdir/a1
3235 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a1
3239 # 256 blocks on the stripe0.
3240 # 1 block on the stripe1 for 2 OSTs case.
3241 # 256 blocks on the stripe1 for other cases.
3242 # 1 block on the stripe2 if OSTs > 2
3243 dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
3244 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
3245 dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount
# FIDs are recorded now because the lost+found entries are named after them.
3247 local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
3248 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
3249 local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)
3252 $LFS getstripe $DIR/$tdir/a1/f0
3254 $LFS getstripe $DIR/$tdir/a1/f1
3256 $LFS getstripe $DIR/$tdir/a1/f2
3258 if [ $OSTCOUNT -gt 2 ]; then
3259 dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
3260 fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
3262 $LFS getstripe $DIR/$tdir/a1/f3
3265 cancel_lru_locks osc
3267 echo "Inject failure..."
3268 echo "To simulate f0 lost MDT-object"
3269 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
3270 do_facet mds1 $LCTL set_param fail_loc=0x1616
3271 rm -f $DIR/$tdir/a1/f0
3273 echo "To simulate f1 lost MDT-object and OST-object0"
3274 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
3275 do_facet mds1 $LCTL set_param fail_loc=0x161a
3276 rm -f $DIR/$tdir/a1/f1
# fail_loc stays at 0x161a; only fail_val changes for f2 and f3.
3278 echo "To simulate f2 lost MDT-object and OST-object1"
3279 do_facet mds1 $LCTL set_param fail_val=1
3280 rm -f $DIR/$tdir/a1/f2
3282 if [ $OSTCOUNT -gt 2 ]; then
3283 echo "To simulate f3 lost MDT-object and OST-object2"
3284 do_facet mds1 $LCTL set_param fail_val=2
3285 rm -f $DIR/$tdir/a1/f3
# The client stays unmounted while LFSCK runs; it is mounted again at (5.0).
3288 umount_client $MOUNT
3291 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
3293 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3294 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
3296 for k in $(seq $MDSCOUNT); do
3297 # The LFSCK status query interval is 30 seconds. For the case
3298 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3299 # time to guarantee the status sync up.
3300 wait_update_facet mds${k} "$LCTL get_param -n \
3301 mdd.$(facet_svc mds${k}).lfsck_layout |
3302 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3303 error "(2) MDS${k} is not the expected 'completed'"
3306 for k in $(seq $OSTCOUNT); do
3307 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3308 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3309 awk '/^status/ { print $2 }')
3310 [ "$cur_status" == "completed" ] ||
3311 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected orphan count: 3 + 2 + 2 + 2 objects with 3 stripes per file,
# 2 + 1 + 1 with 2 stripes per file.
3314 local repaired=$(do_facet mds1 $LCTL get_param -n \
3315 mdd.$(facet_svc mds1).lfsck_layout |
3316 awk '/^repaired_orphan/ { print $2 }')
3317 if [ $OSTCOUNT -gt 2 ]; then
3318 [ $repaired -eq 9 ] ||
3319 error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
3321 [ $repaired -eq 4 ] ||
3322 error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
3325 mount_client $MOUNT || error "(5.0) Fail to start client!"
# NOTE(review): LOV_PATTERN_F_HOLE is assigned here but not read by any
# statement visible in this test; the checks use PATTERN_WITH_HOLE instead.
3327 LOV_PATTERN_F_HOLE=0x40000000
3330 # ${fid0}-R-0 is the old f0
3332 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
3333 echo "Check $name, which is the old f0"
3335 $LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"
3337 local pattern=$($LFS getstripe -L $name)
3338 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3339 error "(5.2) NOT expect pattern flag hole, but got $pattern"
3341 local stripes=$($LFS getstripe -c $name)
3342 if [ $OSTCOUNT -gt 2 ]; then
3343 [ $stripes -eq 3 ] ||
3344 error "(5.3.1) expect the stripe count is 3, but got $stripes"
3346 [ $stripes -eq 2 ] ||
3347 error "(5.3.2) expect the stripe count is 2, but got $stripes"
3350 local size=$(stat $name | awk '/Size:/ { print $2 }')
3351 [ $size -eq $((4096 * $bcount)) ] ||
3352 error "(5.4) expect the size $((4096 * $bcount)), but got $size"
3354 cat $name > /dev/null || error "(5.5) cannot read $name"
3356 echo "dummy" >> $name || error "(5.6) cannot write $name"
3358 chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"
3360 touch $name || error "(5.8) cannot touch $name"
3362 rm -f $name || error "(5.9) cannot unlink $name"
3365 # ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
3367 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3368 if [ $OSTCOUNT -gt 2 ]; then
3369 echo "Check $name, it contains the old f1's stripe1 and stripe2"
3371 echo "Check $name, it contains the old f1's stripe1"
3374 $LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"
3376 pattern=$($LFS getstripe -L $name)
3377 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3378 error "(6.2) expect pattern flag hole, but got $pattern"
3380 stripes=$($LFS getstripe -c $name)
3381 if [ $OSTCOUNT -gt 2 ]; then
3382 [ $stripes -eq 3 ] ||
3383 error "(6.3.1) expect the stripe count is 3, but got $stripes"
3385 [ $stripes -eq 2 ] ||
3386 error "(6.3.2) expect the stripe count is 2, but got $stripes"
3389 size=$(stat $name | awk '/Size:/ { print $2 }')
3390 [ $size -eq $((4096 * $bcount)) ] ||
3391 error "(6.4) expect the size $((4096 * $bcount)), but got $size"
3393 cat $name > /dev/null && error "(6.5) normal read $name should fail"
# dd keeps going after read errors (conv=sync,noerror); the number of
# "Input/output error" lines is the number of unreadable 4096-byte blocks.
3395 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3396 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3399 [ $failures -eq 256 ] ||
3400 error "(6.6) expect 256 IO failures, but get $failures"
3402 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3403 [ $size -eq $((4096 * $bcount)) ] ||
3404 error "(6.7) expect the size $((4096 * $bcount)), but got $size"
# Block 0 lies on the lost stripe0; block 300 (past 1M) lies on stripe1.
3406 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
3407 error "(6.8) write to the LOV EA hole should fail"
3409 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
3410 error "(6.9) write to normal stripe should NOT fail"
3412 echo "foo" >> $name && error "(6.10) append write $name should fail"
3414 chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"
3416 touch $name || error "(6.12) cannot touch $name"
3418 rm -f $name || error "(6.13) cannot unlink $name"
3421 # ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
3423 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
3424 if [ $OSTCOUNT -gt 2 ]; then
3425 echo "Check $name, it contains the old f2's stripe0 and stripe2"
3427 echo "Check $name, it contains the old f2's stripe0"
3430 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
3432 pattern=$($LFS getstripe -L $name)
3433 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3434 error "(7.2) expect pattern flag hole, but got $pattern"
3436 stripes=$($LFS getstripe -c $name)
3437 size=$(stat $name | awk '/Size:/ { print $2 }')
# The checks labelled 7.x.1 (plus 7.6 and 7.7) run with more than 2 OSTs;
# those labelled 7.x.2 run in the 2-OST case.
3438 if [ $OSTCOUNT -gt 2 ]; then
3439 [ $stripes -eq 3 ] ||
3440 error "(7.3.1) expect the stripe count is 3, but got $stripes"
3442 [ $size -eq $((4096 * $bcount)) ] ||
3443 error "(7.4.1) expect size $((4096 * $bcount)), but got $size"
3445 cat $name > /dev/null &&
3446 error "(7.5.1) normal read $name should fail"
3448 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3449 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3451 [ $failures -eq 256 ] ||
3452 error "(7.6) expect 256 IO failures, but get $failures"
3454 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3455 [ $size -eq $((4096 * $bcount)) ] ||
3456 error "(7.7) expect the size $((4096 * $bcount)), but got $size"
3458 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3459 seek=300 && error "(7.8.0) write to the LOV EA hole should fail"
3461 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
3462 error "(7.8.1) write to normal stripe should NOT fail"
3464 echo "foo" >> $name &&
3465 error "(7.8.3) append write $name should fail"
3467 chown $RUNAS_ID:$RUNAS_GID $name ||
3468 error "(7.9.1) cannot chown on $name"
3470 touch $name || error "(7.10.1) cannot touch $name"
3472 [ $stripes -eq 2 ] ||
3473 error "(7.3.2) expect the stripe count is 2, but got $stripes"
3476 [ $size -eq $((4096 * (256 + 0))) ] ||
3477 error "(7.4.2) expect the size $((4096 * 256)), but got $size"
3479 cat $name > /dev/null &&
3480 error "(7.5.2) normal read $name should fail"
3482 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3483 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3484 [ $failures -eq 256 ] ||
3485 error "(7.6.2) expect 256 IO failures, but get $failures"
3488 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3489 [ $size -eq $((4096 * $bcount)) ] ||
3490 error "(7.7.2) expect the size $((4096 * $bcount)), got $size"
3492 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3493 seek=256 && error "(7.8.2) write to the LOV EA hole should fail"
3495 chown $RUNAS_ID:$RUNAS_GID $name ||
3496 error "(7.9.2) cannot chown on $name"
3498 touch $name || error "(7.10.2) cannot touch $name"
3501 rm -f $name || error "(7.11) cannot unlink $name"
# f3 only exists with more than 2 OSTs, so the remaining checks are skipped
# otherwise.
3503 [ $OSTCOUNT -le 2 ] && return
3506 # ${fid3}-R-0 should contain the old f3's stripe0 and stripe1
3508 name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
3509 echo "Check $name, which contains the old f3's stripe0 and stripe1"
3511 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
3513 pattern=$($LFS getstripe -L $name)
3514 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3515 error "(8.2) expect pattern flag hole, but got $pattern"
3517 stripes=$($LFS getstripe -c $name)
3518 [ $stripes -eq 3 ] ||
3519 error "(8.3) expect the stripe count is 3, but got $stripes"
3521 size=$(stat $name | awk '/Size:/ { print $2 }')
3523 [ $size -eq $((4096 * (256 + 256 + 0))) ] ||
3524 error "(8.4) expect the size $((4096 * 512)), but got $size"
3526 cat $name > /dev/null &&
3527 error "(8.5) normal read $name should fail"
3529 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3530 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3532 [ $failures -eq 256 ] ||
3533 error "(8.6) expect 256 IO failures, but get $failures"
3536 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3537 [ $size -eq $((4096 * $bcount)) ] ||
3538 error "(8.7) expect the size $((4096 * $bcount)), but got $size"
3540 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3541 seek=512 && error "(8.8) write to the LOV EA hole should fail"
3543 chown $RUNAS_ID:$RUNAS_GID $name ||
3544 error "(8.9) cannot chown on $name"
3546 touch $name || error "(8.10) cannot touch $name"
3548 rm -f $name || error "(8.11) cannot unlink $name"
3550 run_test 20a "Handle the orphan with dummy LOV EA slot properly"
# test_20b: PFL variant of the LOV EA hole test. Needs at least 2 OSTs, no
# FILESET and an MDS newer than 2.5.55. Three PFL files f0..f2 are created
# with two components each (both "-c 2 -S 1M", the first ending at 2M) and
# filled with 256 * 3 + 1 blocks of 4096 bytes. fail_loc 0x1616, then 0x161a,
# then fail_val=1 are set on $SINGLEMDS for f0, f1 and f2 respectively, after
# which layout LFSCK is run with "-r -o" and mds1 must report 8 repaired
# orphans. Each ${fidN}-R-0 entry under .lustre/lost+found/MDT0000 is then
# checked per component (pattern, stripe count, extent) and for size and
# read/write behaviour, and finally unlinked.
# NOTE(review): the commands that remove f0..f2 between the set_param calls
# are not visible here — confirm against the full file.
3553 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
3554 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
3555 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
3556 skip "MDS older than 2.5.55, LU-4887"
3559 echo "The target MDT-object and some of its OST-object are lost."
3560 echo "The LFSCK should find out the left OST-objects and re-create"
3561 echo "the MDT-object under the directory .lustre/lost+found/MDTxxxx/"
3562 echo "with the partial OST-objects (LOV EA hole)."
3564 echo "New client can access the file with LOV EA hole via normal"
3565 echo "system tools or commands without crash the system - PFL case."
3568 check_mount_and_prep
3570 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f0 ||
3571 error "(0) Fail to create PFL file $DIR/$tdir/f0"
3572 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f1 ||
3573 error "(1) Fail to create PFL file $DIR/$tdir/f1"
3574 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f2 ||
3575 error "(2) Fail to create PFL file $DIR/$tdir/f2"
# 769 blocks of 4096 bytes = 3M + 4K, so data reaches both stripes of both
# components.
3577 local bcount=$((256 * 3 + 1))
3579 dd if=/dev/zero of=$DIR/$tdir/f0 bs=4096 count=$bcount
3580 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4096 count=$bcount
3581 dd if=/dev/zero of=$DIR/$tdir/f2 bs=4096 count=$bcount
# FIDs are recorded now because the lost+found entries are named after them.
3583 local fid0=$($LFS path2fid $DIR/$tdir/f0)
3584 local fid1=$($LFS path2fid $DIR/$tdir/f1)
3585 local fid2=$($LFS path2fid $DIR/$tdir/f2)
3588 $LFS getstripe $DIR/$tdir/f0
3590 $LFS getstripe $DIR/$tdir/f1
3592 $LFS getstripe $DIR/$tdir/f2
3594 cancel_lru_locks mdc
3595 cancel_lru_locks osc
3597 echo "Inject failure..."
3598 echo "To simulate f0 lost MDT-object"
3599 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
3600 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1616
3603 echo "To simulate the case of f1 lost MDT-object and "
3604 echo "the first OST-object in each PFL component"
3605 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
3606 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161a
3609 echo "To simulate the case of f2 lost MDT-object and "
3610 echo "the second OST-object in each PFL component"
3611 do_facet $SINGLEMDS $LCTL set_param fail_val=1
3616 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
3618 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3619 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
3621 for k in $(seq $MDSCOUNT); do
3622 # The LFSCK status query interval is 30 seconds. For the case
3623 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3624 # time to guarantee the status sync up.
3625 wait_update_facet mds${k} "$LCTL get_param -n \
3626 mdd.$(facet_svc mds${k}).lfsck_layout |
3627 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3628 error "(4) MDS${k} is not the expected 'completed'"
3631 for k in $(seq $OSTCOUNT); do
3632 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3633 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3634 awk '/^status/ { print $2 }')
3635 [ "$cur_status" == "completed" ] ||
3636 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# Expected orphan count: 4 objects for f0 plus 2 surviving objects each for
# f1 and f2.
3639 local repaired=$(do_facet mds1 $LCTL get_param -n \
3640 mdd.$(facet_svc mds1).lfsck_layout |
3641 awk '/^repaired_orphan/ { print $2 }')
3642 [ $repaired -eq 8 ] ||
3643 error "(6) Expect 8 fixed on mds1, but got: $repaired"
3646 # ${fid0}-R-0 is the old f0
3648 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
3649 echo "Check $name, which is the old f0"
3651 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
# -I1 / -I2 select the first / second component for each getstripe query.
3653 local pattern=$($LFS getstripe -L -I1 $name)
3654 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3655 error "(7.2.1) NOT expect pattern flag hole, but got $pattern"
3657 pattern=$($LFS getstripe -L -I2 $name)
3658 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3659 error "(7.2.2) NOT expect pattern flag hole, but got $pattern"
3661 local stripes=$($LFS getstripe -c -I1 $name)
3662 [ $stripes -eq 2 ] ||
3663 error "(7.3.1) expect 2 stripes, but got $stripes"
3665 stripes=$($LFS getstripe -c -I2 $name)
3666 [ $stripes -eq 2 ] ||
3667 error "(7.3.2) expect 2 stripes, but got $stripes"
3669 local e_start=$($LFS getstripe -I1 $name |
3670 awk '/lcme_extent.e_start:/ { print $2 }')
3671 [ $e_start -eq 0 ] ||
3672 error "(7.4.1) expect the COMP1 start at 0, got $e_start"
3674 local e_end=$($LFS getstripe -I1 $name |
3675 awk '/lcme_extent.e_end:/ { print $2 }')
3676 [ $e_end -eq 2097152 ] ||
3677 error "(7.4.2) expect the COMP1 end at 2097152, got $e_end"
3679 e_start=$($LFS getstripe -I2 $name |
3680 awk '/lcme_extent.e_start:/ { print $2 }')
3681 [ $e_start -eq 2097152 ] ||
3682 error "(7.5.1) expect the COMP2 start at 2097152, got $e_start"
3684 e_end=$($LFS getstripe -I2 $name |
3685 awk '/lcme_extent.e_end:/ { print $2 }')
3686 [ "$e_end" = "EOF" ] ||
3687 error "(7.5.2) expect the COMP2 end at (EOF), got $e_end"
3689 local size=$(stat $name | awk '/Size:/ { print $2 }')
3690 [ $size -eq $((4096 * $bcount)) ] ||
3691 error "(7.6) expect the size $((4096 * $bcount)), but got $size"
3693 cat $name > /dev/null || error "(7.7) cannot read $name"
3695 echo "dummy" >> $name || error "(7.8) cannot write $name"
3697 chown $RUNAS_ID:$RUNAS_GID $name || error "(7.9) cannot chown on $name"
3699 touch $name || error "(7.10) cannot touch $name"
3701 rm -f $name || error "(7.11) cannot unlink $name"
3704 # ${fid1}-R-0 contains the old f1's second stripe in each COMP
3706 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3707 echo "Check $name, it contains f1's second OST-object in each COMP"
3709 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
3711 pattern=$($LFS getstripe -L -I1 $name)
3712 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3713 error "(8.2.1) expect pattern flag hole, but got $pattern"
3715 pattern=$($LFS getstripe -L -I2 $name)
3716 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3717 error "(8.2.2) expect pattern flag hole, but got $pattern"
3719 stripes=$($LFS getstripe -c -I1 $name)
3720 [ $stripes -eq 2 ] ||
3721 error "(8.3.1) expect 2 stripes, but got $stripes"
3723 stripes=$($LFS getstripe -c -I2 $name)
3724 [ $stripes -eq 2 ] ||
3725 error "(8.3.2) expect 2 stripes, but got $stripes"
3727 e_start=$($LFS getstripe -I1 $name |
3728 awk '/lcme_extent.e_start:/ { print $2 }')
3729 [ $e_start -eq 0 ] ||
3730 error "(8.4.1) expect the COMP1 start at 0, got $e_start"
3732 e_end=$($LFS getstripe -I1 $name |
3733 awk '/lcme_extent.e_end:/ { print $2 }')
3734 [ $e_end -eq 2097152 ] ||
3735 error "(8.4.2) expect the COMP1 end at 2097152, got $e_end"
3737 e_start=$($LFS getstripe -I2 $name |
3738 awk '/lcme_extent.e_start:/ { print $2 }')
3739 [ $e_start -eq 2097152 ] ||
3740 error "(8.5.1) expect the COMP2 start at 2097152, got $e_start"
3742 e_end=$($LFS getstripe -I2 $name |
3743 awk '/lcme_extent.e_end:/ { print $2 }')
3744 [ "$e_end" = "EOF" ] ||
3745 error "(8.5.2) expect the COMP2 end at (EOF), got $e_end"
3747 size=$(stat $name | awk '/Size:/ { print $2 }')
3748 [ $size -eq $((4096 * $bcount)) ] ||
3749 error "(8.6) expect the size $((4096 * $bcount)), but got $size"
3751 cat $name > /dev/null && error "(8.7) normal read $name should fail"
# dd keeps going after read errors (conv=sync,noerror); the number of
# "Input/output error" lines is the number of unreadable 4096-byte blocks.
3753 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3754 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3756 # The first stripe in each COMP was lost
3757 [ $failures -eq 512 ] ||
3758 error "(8.8) expect 512 IO failures, but get $failures"
3760 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3761 [ $size -eq $((4096 * $bcount)) ] ||
3762 error "(8.9) expect the size $((4096 * $bcount)), but got $size"
# Block 0 lies on the lost first stripe; block 300 (past 1M) on the second.
3764 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
3765 error "(8.10) write to the LOV EA hole should fail"
3767 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
3768 error "(8.11) write to normal stripe should NOT fail"
3770 echo "foo" >> $name && error "(8.12) append write $name should fail"
3772 chown $RUNAS_ID:$RUNAS_GID $name || error "(8.13) cannot chown on $name"
3774 touch $name || error "(8.14) cannot touch $name"
3776 rm -f $name || error "(8.15) cannot unlink $name"
3779 # ${fid2}-R-0 contains the old f2's first stripe in each COMP
3781 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
3782 echo "Check $name, it contains f2's first stripe in each COMP"
3784 $LFS getstripe -v $name || error "(9.1) cannot getstripe on $name"
3786 pattern=$($LFS getstripe -L -I1 $name)
3787 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3788 error "(9.2.1) expect pattern flag hole, but got $pattern"
3790 pattern=$($LFS getstripe -L -I2 $name)
3791 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3792 error "(9.2.2) expect pattern flag hole, but got $pattern"
3794 stripes=$($LFS getstripe -c -I1 $name)
3795 [ $stripes -eq 2 ] ||
3796 error "(9.3.1) expect 2 stripes, but got $stripes"
3798 stripes=$($LFS getstripe -c -I2 $name)
3799 [ $stripes -eq 2 ] ||
3800 error "(9.3.2) expect 2 stripes, but got $stripes"
3802 e_start=$($LFS getstripe -I1 $name |
3803 awk '/lcme_extent.e_start:/ { print $2 }')
3804 [ $e_start -eq 0 ] ||
3805 error "(9.4.1) expect the COMP1 start at 0, got $e_start"
3807 e_end=$($LFS getstripe -I1 $name |
3808 awk '/lcme_extent.e_end:/ { print $2 }')
3809 [ $e_end -eq 2097152 ] ||
3810 error "(9.4.2) expect the COMP1 end at 2097152, got $e_end"
3812 e_start=$($LFS getstripe -I2 $name |
3813 awk '/lcme_extent.e_start:/ { print $2 }')
3814 [ $e_start -eq 2097152 ] ||
3815 error "(9.5.1) expect the COMP2 start at 2097152, got $e_start"
3817 e_end=$($LFS getstripe -I2 $name |
3818 awk '/lcme_extent.e_end:/ { print $2 }')
3819 [ "$e_end" = "EOF" ] ||
3820 error "(9.5.2) expect the COMP2 end at (EOF), got $e_end"
3822 size=$(stat $name | awk '/Size:/ { print $2 }')
3823 # The second stripe in COMP was lost, so we do not know there
3824 # have ever been some data before. 'stat' will regard it as
3825 # no data on the lost stripe.
# NOTE(review): the check below nevertheless expects the full original
# size — confirm whether the comment above is still accurate.
3827 [ $size -eq $((4096 * $bcount)) ] ||
3828 error "(9.6) expect size $((4096 * $bcount)), but got $size"
3830 cat $name > /dev/null &&
3831 error "(9.7) normal read $name should fail"
# The error text reports the same count (512) that the check enforces.
3833 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3834 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3835 [ $failures -eq 512 ] ||
3836 error "(9.8) expect 512 IO failures, but get $failures"
3838 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3839 # The second stripe in COMP was lost, so we do not know there
3840 # have ever been some data before. Since 'dd' skip failure,
3841 # it will regard the lost stripe contains data.
3843 [ $size -eq $((4096 * $bcount)) ] ||
3844 error "(9.9) expect the size $((4096 * $bcount)), but got $size"
# Block 300 (past 1M) lies on the lost second stripe; block 0 on the first.
3846 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3847 seek=300 && error "(9.10) write to the LOV EA hole should fail"
3849 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
3850 error "(9.11) write to normal stripe should NOT fail"
3852 echo "foo" >> $name &&
3853 error "(9.12) append write $name should fail"
3855 chown $RUNAS_ID:$RUNAS_GID $name ||
3856 error "(9.13) cannot chown on $name"
3858 touch $name || error "(9.14) cannot touch $name"
3860 rm -f $name || error "(9.15) cannot unlink $name"
3862 run_test 20b "Handle the orphan with dummy LOV EA slot properly - PFL case"
# test_21: requires an MDS newer than 2.5.59. Creates 100 files, starts
# LFSCK on ${FSNAME}-MDT0000 without a "-t" type option (with "-s 1 -r"),
# and expects both the namespace and the layout component to report status
# 'scanning-phase1'. LFSCK is then stopped again.
3865 (( $MDS1_VERSION > $(version_code 2.5.59) )) ||
3866 skip "MDS older than 2.5.59, LU-4887"
3868 check_mount_and_prep
3869 createmany -o $DIR/$tdir/f 100 || error "(0) Fail to create 100 files"
# NOTE(review): "-s 1" presumably throttles the scan so that it is still in
# phase 1 when the status is sampled below — confirm.
3871 echo "Start all LFSCK components by default (-s 1)"
3872 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -s 1 -r ||
3873 error "Fail to start LFSCK"
3875 echo "namespace LFSCK should be in 'scanning-phase1' status"
3876 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
3877 [ "$STATUS" == "scanning-phase1" ] ||
3878 error "Expect namespace 'scanning-phase1', but got '$STATUS'"
3880 echo "layout LFSCK should be in 'scanning-phase1' status"
3881 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
3882 [ "$STATUS" == "scanning-phase1" ] ||
3883 error "Expect layout 'scanning-phase1', but got '$STATUS'"
3885 echo "Stop all LFSCK components by default"
3886 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 ||
3887 error "Fail to stop LFSCK"
3889 run_test 21 "run all LFSCK components by default"
# test_22a: needs at least 2 MDTs and an MDS newer than 2.6.50. Directories
# "guard" and "foo" are created on MDT index 1; "foo/dummy" is then created
# on MDT index 0 while fail_loc 0x161e (OBD_FAIL_LFSCK_BAD_PARENT) is set on
# $SINGLEMDS, and "guard" is removed. Namespace LFSCK ("-A -r") must then
# report unmatched_pairs_repaired == 1 and "ls" on foo/dummy must succeed.
3892 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3893 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
3894 skip "MDS older than 2.6.50, LU-5511"
# The inner double quotes are escaped so that ".." is printed with its
# quotes; unescaped, they would end and reopen the string and be dropped.
3897 echo "The parent_A references the child directory via some name entry,"
3898 echo "but the child directory back references another parent_B via its"
3899 echo "\"..\" name entry. The parent_B does not exist. Then the namespace"
3900 echo "LFSCK will repair the child directory's \"..\" name entry."
3903 check_mount_and_prep
3905 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
3906 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
3908 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
3909 echo "The dummy's dotdot name entry references the guard."
3910 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
3911 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
3912 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
3913 error "(3) Fail to mkdir on MDT0"
3914 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Removing "guard" leaves dummy's injected parent reference dangling.
3916 rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"
3918 echo "Trigger namespace LFSCK to repair unmatched pairs"
3919 $START_NAMESPACE -A -r ||
3920 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): the trailing 6 is presumably the step number that
# wait_all_targets_blocked reports on failure — confirm against the helper.
3922 wait_all_targets_blocked namespace completed 6
3924 local repaired=$($SHOW_NAMESPACE |
3925 awk '/^unmatched_pairs_repaired/ { print $2 }')
3926 [ $repaired -eq 1 ] ||
3927 error "(7) Fail to repair unmatched pairs: $repaired"
3929 echo "'ls' should success after namespace LFSCK repairing"
3930 ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
3931 error "(8) ls should success."
3933 run_test 22a "LFSCK can repair unmatched pairs (1)"
# test_22b: like 22a, but the wrongly referenced parent still exists, so
# both the ".." entry and the linkEA are wrong. fid2path must not resolve
# to the real path before namespace LFSCK runs and must do so afterwards.
test_22b() {
	# skip terminates the test itself, so no explicit return is needed.
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5511"

	# The inner quotes are escaped so the message really prints "..".
	echo "The parent_A references the child directory via the name entry_B,"
	echo "but the child directory back references another parent_C via its"
	echo "\"..\" name entry. The parent_C exists, but there is no the name"
	echo "entry_B under the parent_C. Then the namespace LFSCK will repair"
	echo "the child directory's \"..\" name entry and its linkEA."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
	$LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"

	echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
	echo "and bad linkEA. The dummy's dotdot name entry references the"
	echo "guard. The dummy's linkEA references n non-exist name entry."
	#define OBD_FAIL_LFSCK_BAD_PARENT	0x161e
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
	$LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
		error "(3) Fail to mkdir on MDT0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	local dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
	echo "fid2path should NOT work on the dummy's FID $dummyfid"
	local dummyname=$($LFS fid2path $DIR $dummyfid)
	[ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
		error "(4) fid2path works unexpectedly."

	echo "Trigger namespace LFSCK to repair unmatched pairs"
	$START_NAMESPACE -A -r ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^unmatched_pairs_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(7) Fail to repair unmatched pairs: $repaired"

	echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
	# dummyname is already local; re-evaluate it after the repair.
	dummyname=$($LFS fid2path $DIR $dummyfid)
	[ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
		error "(8) fid2path does not work"
}
run_test 22b "LFSCK can repair unmatched pairs (2)"
# test_23a: a remote directory is removed under fail_loc 0x1620, leaving a
# dangling name entry. The first LFSCK run must count it while 'ls' keeps
# failing; a second run with -C must make 'ls' succeed.
test_23a() {
	# skip terminates the test itself, so no explicit return is needed.
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5512"

	echo "The name entry is there, but the MDT-object for such name "
	echo "entry does not exist. The namespace LFSCK should find out "
	echo "and repair the inconsistency as required."

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
	$LFS mkdir -i 1 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT1"

	echo "Inject failure stub on MDT1 to simulate dangling name entry"
	#define OBD_FAIL_LFSCK_DANGLING2	0x1620
	do_facet mds2 $LCTL set_param fail_loc=0x1620
	rmdir $DIR/$tdir/d0/d1 || error "(3) Fail to rmdir d1"
	do_facet mds2 $LCTL set_param fail_loc=0

	echo "'ls' should fail because of dangling name entry"
	ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(4) ls should fail."

	echo "Trigger namespace LFSCK to find out dangling name entry"
	$START_NAMESPACE -A -r ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^dangling_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(7) Fail to repair dangling name entry: $repaired"

	echo "'ls' should fail because not re-create MDT-object by default"
	ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(8) ls should fail."

	# Second pass: -C is the only difference from the first run.
	echo "Trigger namespace LFSCK again to repair dangling name entry"
	$START_NAMESPACE -A -r -C ||
		error "(9) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 10

	repaired=$($SHOW_NAMESPACE |
		   awk '/^dangling_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(11) Fail to repair dangling name entry: $repaired"

	echo "'ls' should success after namespace LFSCK repairing"
	ls -ail $DIR/$tdir/d0/d1 > /dev/null || error "(12) ls should success."
}
run_test 23a "LFSCK can repair dangling name entry (1)"
# test_23b: hard link "foo" is created under fail_loc 0x1621 with fail_val
# set to f1's OID, then f1 is unlinked so that "foo" dangles. LFSCK run
# with -C must report one dangling and one multiple-linked repair, and
# "foo" must end up with f0's content ("dummy").
test_23b() {
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5512"

	echo "The objectA has multiple hard links, one of them corresponding"
	echo "to the name entry_B. But there is something wrong for the name"
	echo "entry_B and cause entry_B to references non-exist object_C."
	echo "In the first-stage scanning, the LFSCK will think the entry_B"
	echo "as dangling, and re-create the lost object_C. When the LFSCK"
	echo "comes to the second-stage scanning, it will find that the"
	echo "former re-creating object_C is not proper, and will try to"
	echo "replace the object_C with the real object_A."

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
	$LFS path2fid $DIR/$tdir/d0

	createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"

	echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
	$LFS path2fid $DIR/$tdir/d0/f0

	echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
	$LFS path2fid $DIR/$tdir/d0/f1

	# First ':'-separated field of the FID, i.e. the sequence part.
	local SEQ0=$($LFS path2fid $DIR/$tdir/d0/f0 | awk -F':' '{print $1}')
	local SEQ1=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $1}')

	if [ "$SEQ0" != "$SEQ1" ]; then
		# To guarantee that the f0 and f1 are in the same FID seq
		rm -f $DIR/$tdir/d0/f0 ||
			error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
		echo "dummy" > $DIR/$tdir/d0/f0 ||
			error "(3.2) Fail to touch on MDT0"
		$LFS path2fid $DIR/$tdir/d0/f0
	fi

	# Second FID field (the OID), converted to decimal for fail_val.
	local OID=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $2}')
	OID=$(printf %d $OID)

	echo "Inject failure stub on MDT0 to simulate dangling name entry"
	#define OBD_FAIL_LFSCK_DANGLING3	0x1621
	do_facet $SINGLEMDS $LCTL set_param fail_val=$OID fail_loc=0x1621
	ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
	do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0

	# If there is creation after the dangling injection, it may re-use
	# the just released local object (inode) that is referenced by the
	# dangling name entry. It will fail the dangling injection.
	# So before deleting the target object for the dangling name entry,
	# remove some other objects to avoid the target object being reused
	# by some potential creations. LU-7429
	unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"

	rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"

	echo "'ls' should fail because of dangling name entry"
	ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
		error "(6) ls should fail."

	echo "Trigger namespace LFSCK to find out dangling name entry"
	$START_NAMESPACE -r -C ||
		error "(7) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		error "(8) unexpected status"
	}

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^dangling_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(9) Fail to repair dangling name entry: $repaired"

	repaired=$($SHOW_NAMESPACE |
		   awk '/^multiple_linked_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(10) Fail to drop the former created object: $repaired"

	local data=$(cat $DIR/$tdir/d0/foo)
	[ "$data" == "dummy" ] ||
		error "(11) The $DIR/$tdir/d0/foo is not recovered: $data"
}
run_test 23b "LFSCK can repair dangling name entry (2)"
# cleanup_23c: undo what test_23c set up - clear the fail_loc/fail_val
# injection, wait for namespace LFSCK to report "completed", and stop the
# full debug logging that test_23c started.
cleanup_23c() {
	do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		error "(10) unexpected status"
	}

	stop_full_debug_logging
}

# test_23c: same dangling-entry injection as 23b, but LFSCK is delayed
# (fail_loc 0x1602) and the re-created object is appended to before LFSCK
# finishes, so "foo" must NOT end up with f0's content.
test_23c() {
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5512"

	echo "The objectA has multiple hard links, one of them corresponding"
	echo "to the name entry_B. But there is something wrong for the name"
	echo "entry_B and cause entry_B to references non-exist object_C."
	echo "In the first-stage scanning, the LFSCK will think the entry_B"
	echo "as dangling, and re-create the lost object_C. And then others"
	echo "modified the re-created object_C. When the LFSCK comes to the"
	echo "second-stage scanning, it will find that the former re-creating"
	echo "object_C maybe wrong and try to replace the object_C with the"
	echo "real object_A. But because object_C has been modified, so the"
	echo "LFSCK cannot replace it."

	start_full_debug_logging

	check_mount_and_prep

	# Keep the FIDs local so they do not leak into later tests.
	local parent_fid f0_fid f1_fid

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
	parent_fid="$($LFS path2fid $DIR/$tdir/d0)"
	echo "parent_fid=$parent_fid"

	createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"

	echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
	f0_fid="$($LFS path2fid $DIR/$tdir/d0/f0)"
	echo "f0_fid=$f0_fid"

	echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
	f1_fid="$($LFS path2fid $DIR/$tdir/d0/f1)"
	echo "f1_fid=$f1_fid"

	# Compare the part before the first ':' (the FID sequence). The
	# previous check read never-assigned names (fid_f0/fid_f1) and used
	# a regex-style pattern in a glob context, so it compared two empty
	# strings and could never trigger.
	if [ "${f0_fid%%:*}" != "${f1_fid%%:*}" ]; then
		# To guarantee that the f0 and f1 are in the same FID seq
		rm -f $DIR/$tdir/d0/f0 ||
			error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
		echo "dummy" > $DIR/$tdir/d0/f0 ||
			error "(3.2) Fail to touch on MDT0"
		f0_fid="$($LFS path2fid $DIR/$tdir/d0/f0)"
		echo "f0_fid=$f0_fid (replaced)"
	fi

	# Second ':'-separated field of f1's FID; printed through an explicit
	# "%s" so the field is never interpreted as a format string.
	local oid=$(awk -F':' '{ printf "%s", $2 }' <<< $f1_fid)

	echo "Inject failure stub on MDT0 to simulate dangling name entry"
	#define OBD_FAIL_LFSCK_DANGLING3	0x1621
	do_facet $SINGLEMDS $LCTL set_param fail_val=$oid fail_loc=0x1621
	ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
	do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0

	# If there is creation after the dangling injection, it may re-use
	# the just released local object (inode) that is referenced by the
	# dangling name entry. It will fail the dangling injection.
	# So before deleting the target object for the dangling name entry,
	# remove some other objects to avoid the target object being reused
	# by some potential creations. LU-7429
	unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"

	rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"

	echo "'ls' should fail because of dangling name entry"
	ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
		error "(6) ls should fail."

	#define OBD_FAIL_LFSCK_DELAY3		0x1602
	do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602

	echo "Trigger namespace LFSCK to find out dangling name entry"
	$START_NAMESPACE -r -C ||
		error "(7) Fail to start LFSCK for namespace"

	# Wait until the re-created (empty) object is visible as "foo".
	wait_update_facet client "stat -c%s $DIR/$tdir/d0/foo" "0" $LTIME || {
		# While unexpected by the test, it is valid for LFSCK to repair
		# the link to the original object before any data is written.
		local size=$(stat -c %s $DIR/$tdir/d0/foo)

		if [[ "$size" == "6" && "$(<$DIR/$tdir/d0/foo)" == "dummy" ]]; then
			log "LFSCK repaired file prematurely"
			# Valid outcome: clean up and end the test here
			# instead of falling through to the error below.
			cleanup_23c
			return 0
		fi

		stat $DIR/$tdir/d0/foo
		error "(8) unexpected size"
	}

	echo "data" >> $DIR/$tdir/d0/foo || error "(9) Fail to write"
	cancel_lru_locks osc

	# Clears the delay injection and waits for "completed" (step 10).
	cleanup_23c

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^dangling_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(11) Fail to repair dangling name entry: $repaired"

	local data=$(cat $DIR/$tdir/d0/foo)
	[ "$data" != "dummy" ] ||
		error "(12) The $DIR/$tdir/d0/foo should not be recovered"
}
run_test 23c "LFSCK can repair dangling name entry (3)"
# test_24: under fail_loc 0x1622 a directory d0/dummy/foo is created and
# its name entry removed again, leaving an object whose linkEA duplicates
# d0/guard/foo's. Namespace LFSCK must report one multiple-referenced
# repair and an orphan named "<cfid>-<pfid>-D-0" must exist under
# .lustre/lost+found/MDT0000/.
test_24() {
	# skip terminates the test itself, so no explicit return is needed.
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
	[ -n "$FILESET" ] && skip "Not functional for FILESET set"
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5513"

	echo "Two MDT-objects back reference the same name entry via their"
	echo "each own linkEA entry, but the name entry only references one"
	echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
	echo "for the MDT-object that is not recognized. If such MDT-object"
	echo "has no other linkEA entry after the removing, then the LFSCK"
	echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."

	check_mount_and_prep

	mkdir_on_mdt -i1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"

	mkdir $DIR/$tdir/d0/guard || error "(1) Fail to mkdir guard"
	$LFS path2fid $DIR/$tdir/d0/guard

	mkdir $DIR/$tdir/d0/dummy || error "(2) Fail to mkdir dummy"
	$LFS path2fid $DIR/$tdir/d0/dummy

	# Parent FID used in the expected orphan name: guard's FID on
	# non-ldiskfs backends, dummy's FID on ldiskfs. The two assignments
	# are alternatives; without the else the first one was dead.
	local pfid
	if [ $mds1_FSTYPE != ldiskfs ]; then
		pfid=$($LFS path2fid $DIR/$tdir/d0/guard)
	else
		pfid=$($LFS path2fid $DIR/$tdir/d0/dummy)
	fi

	touch $DIR/$tdir/d0/guard/foo ||
		error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
	echo "that references $DIR/$tdir/d0/guard/foo."
	echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
	echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
	echo "there with the same linkEA entry as another MDT-object"
	echo "$DIR/$tdir/d0/guard/foo has"

	#define OBD_FAIL_LFSCK_MUL_REF		0x1622
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
	mkdir_on_mdt -i0 $DIR/$tdir/d0/dummy/foo ||
		error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
	$LFS path2fid $DIR/$tdir/d0/dummy/foo
	# Remember the child FID before its name entry is removed.
	local cfid=$($LFS path2fid $DIR/$tdir/d0/dummy/foo)
	rmdir $DIR/$tdir/d0/dummy/foo ||
		error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	echo "stat $DIR/$tdir/d0/dummy/foo should fail"
	stat $DIR/$tdir/d0/dummy/foo > /dev/null 2>&1 &&
		error "(6) stat successfully unexpectedly"

	echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
	$START_NAMESPACE -A -r ||
		error "(7) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 8

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^multiple_referenced_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
	error "(9) Fail to repair multiple referenced name entry: $repaired"

	echo "There should be an orphan under .lustre/lost+found/MDT0000/"
	[ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
		error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"

	local cname="$cfid-$pfid-D-0"
	ls -ail $MOUNT/.lustre/lost+found/MDT0000/$cname ||
		error "(11) .lustre/lost+found/MDT0000/ should not be empty"
}
run_test 24 "LFSCK can repair multiple-referenced name entry"
# test_25: a file is created under fail_loc 0x1623 so that its name entry
# carries a wrong file type; namespace LFSCK must report exactly one
# bad_file_type repair and the directory must still be listable.
test_25() {
	[[ $mds1_FSTYPE == ldiskfs ]] || skip "only ldiskfs fixes dirent type"
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5515"

	echo "The file type in the name entry does not match the file type"
	echo "claimed by the referenced object. Then the LFSCK will update"
	echo "the file type in the name entry."

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "the file type stored in the name entry is wrong."

	#define OBD_FAIL_LFSCK_BAD_TYPE		0x1623
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
	touch $DIR/$tdir/d0/foo || error "(2) Fail to touch $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	echo "Trigger namespace LFSCK to repair bad file type in the name entry"
	$START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace"

	# Poll the status line on the MDS until it reads "completed".
	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		error "(4) unexpected status"
	}

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^bad_file_type_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
	error "(5) Fail to repair bad file type in name entry: $repaired"

	ls -ail $DIR/$tdir/d0 || error "(6) Fail to 'ls' the $DIR/$tdir/d0"
}
run_test 25 "LFSCK can repair bad file type in the name entry"
# test_26a: a file with a second hard link is unlinked under fail_loc
# 0x1624 so that only its name entry disappears. Namespace LFSCK must
# report one lost_dirent repair and "foo" must come back with the same FID.
test_26a() {
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5516"

	echo "The local name entry back referenced by the MDT-object is lost."
	echo "The namespace LFSCK will add the missing local name entry back"
	echo "to the normal namespace."

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
	# FID before the injection, compared against the restored entry.
	local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)

	ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rm -f $DIR/$tdir/d0/foo || error "(4) Fail to unlink $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
		error "(5) 'ls' should fail"

	# This test covers the local entry; the message used to say "remote".
	echo "Trigger namespace LFSCK to repair the missing local name entry"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 7

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(8) Fail to repair lost dirent: $repaired"

	ls -ail $DIR/$tdir/d0/foo ||
		error "(9) Fail to 'ls' $DIR/$tdir/d0/foo"

	local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
	[ "$foofid" == "$foofid2" ] ||
		error "(10) foo's FID changed: $foofid, $foofid2"
}
run_test 26a "LFSCK can add the missing local name entry back to the namespace"
# test_26b: remote variant of 26a - d0 is created with "-i 1" and foo with
# "-i 0"; foo's name entry is removed under fail_loc 0x1624. Namespace
# LFSCK must report one lost_dirent repair and restore foo with its FID.
test_26b() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5516"

	echo "The remote name entry back referenced by the MDT-object is lost."
	echo "The namespace LFSCK will add the missing remote name entry back"
	echo "to the normal namespace."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	$LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
	# FID before the injection, compared against the restored entry.
	local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
		error "(4) 'ls' should fail"

	echo "Trigger namespace LFSCK to repair the missing remote name entry"
	$START_NAMESPACE -r -A ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(7) Fail to repair lost dirent: $repaired"

	ls -ail $DIR/$tdir/d0/foo ||
		error "(8) Fail to 'ls' $DIR/$tdir/d0/foo"

	local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
	[ "$foofid" == "$foofid2" ] ||
		error "(9) foo's FID changed: $foofid, $foofid2"
}
run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
# test_27a: both hard links of a file are unlinked under fail_loc 0x1624
# and the parent directory is removed. Namespace LFSCK must report one
# lost_dirent repair and an entry matching "*-P-*" must appear under
# .lustre/lost+found/MDT0000/.
test_27a() {
	[ -n "$FILESET" ] && skip "Not functional for FILESET set"
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5516"

	echo "The local parent referenced by the MDT-object linkEA is lost."
	echo "The namespace LFSCK will re-create the lost parent as orphan."

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
	ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system. And then remove"
	echo "another hard link and the parent directory."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rm -f $DIR/$tdir/d0/foo ||
		error "(4) Fail to unlink $DIR/$tdir/d0/foo"
	rm -f $DIR/$tdir/d0/dummy ||
		error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	rm -rf $DIR/$tdir/d0 || error "(5) Fail to unlink the dir d0"
	ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(6) 'ls' should fail"

	echo "Trigger namespace LFSCK to repair the lost parent"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 7

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(8) Fail to repair lost dirent: $repaired"

	echo "There should be an orphan under .lustre/lost+found/MDT0000/"
	[ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
		error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"

	ls -ail $MOUNT/.lustre/lost+found/MDT0000/

	# Quote the pattern so find receives it verbatim; unquoted, the
	# shell would expand it against the current directory first.
	local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-P-*')
	[ ! -z "$cname" ] ||
		error "(10) .lustre/lost+found/MDT0000/ should not be empty"
}
run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
# test_27b: remote variant of 27a - d0 is created with "-i 1" and foo with
# "-i 0"; foo's name entry is removed under fail_loc 0x1624 and d0 is then
# removed. Namespace LFSCK must report one lost_dirent repair and an entry
# matching "*-P-*" must appear under .lustre/lost+found/MDT0001/.
test_27b() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
	[ -n "$FILESET" ] && skip "Not functional for FILESET set"
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5516"

	echo "The remote parent referenced by the MDT-object linkEA is lost."
	echo "The namespace LFSCK will re-create the lost parent as orphan."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	$LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"

	$LFS path2fid $DIR/$tdir/d0

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system. And then remove"
	echo "the parent directory."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	rmdir $DIR/$tdir/d0 || error "(4) Fail to unlink the dir d0"
	ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(5) 'ls' should fail"

	echo "Trigger namespace LFSCK to repair the missing remote name entry"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 7

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(8) Fail to repair lost dirent: $repaired"

	ls -ail $MOUNT/.lustre/lost+found/

	echo "There should be an orphan under .lustre/lost+found/MDT0001/"
	[ -d $MOUNT/.lustre/lost+found/MDT0001 ] ||
		error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"

	ls -ail $MOUNT/.lustre/lost+found/MDT0001/

	# Quote the pattern so find receives it verbatim; unquoted, the
	# shell would expand it against the current directory first.
	local cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-P-*')
	[ ! -z "$cname" ] ||
		error "(10) .lustre/lost+found/MDT0001/ should not be empty"
}
run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
# test_28: two name entries (d1/a1 and d2/a2) are removed under fail_loc
# 0x1624 on both MDSes, then LFSCK is run with fail_loc 0x161c set on mds2.
# Expected after the first run: mds1 "partial" with 0 lost_dirent repairs,
# mds2 "completed" with 1. A second, uninjected run must then show 1
# repair on mds1 and 0 on mds2.
test_28() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5506"

	echo "The target name entry is lost. The LFSCK should insert the"
	echo "orphan MDT-object under .lustre/lost+found/MDTxxxx. But if"
	echo "the MDT (on which the orphan MDT-object resides) has ever"
	echo "failed to respond some name entry verification during the"
	echo "first stage-scanning, then the LFSCK should skip to handle"
	echo "orphan MDT-object on this MDT. But other MDTs should not"
	# Completes the sentence above, which otherwise ends mid-phrase.
	echo "be affected."

	check_mount_and_prep
	$LFS mkdir -i 0 $DIR/$tdir/d1
	$LFS mkdir -i 1 $DIR/$tdir/d1/a1
	$LFS mkdir -i 1 $DIR/$tdir/d1/a2

	$LFS mkdir -i 1 $DIR/$tdir/d2
	$LFS mkdir -i 0 $DIR/$tdir/d2/a1
	$LFS mkdir -i 0 $DIR/$tdir/d2/a2

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "d1/a1's name entry will be removed, but the d1/a1's object"
	echo "and its linkEA are kept in the system. And the case that"
	echo "d2/a2's name entry will be removed, but the d2/a2's object"
	echo "and its linkEA are kept in the system."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet mds1 $LCTL set_param fail_loc=0x1624
	do_facet mds2 $LCTL set_param fail_loc=0x1624
	rmdir $DIR/$tdir/d1/a1 || error "(1) Fail to rmdir $DIR/$tdir/d1/a1"
	rmdir $DIR/$tdir/d2/a2 || error "(2) Fail to rmdir $DIR/$tdir/d2/a2"
	do_facet mds1 $LCTL set_param fail_loc=0
	do_facet mds2 $LCTL set_param fail_loc=0

	cancel_lru_locks mdc
	cancel_lru_locks osc

	echo "Inject failure, to simulate the MDT0 fail to handle"
	echo "MDT1 LFSCK request during the first-stage scanning."
	#define OBD_FAIL_LFSCK_BAD_NETWORK	0x161c
	do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0

	echo "Trigger namespace LFSCK on all devices to find out orphan object"
	$START_NAMESPACE -r -A ||
		error "(3) Fail to start LFSCK for namespace"

	wait_update_facet mds1 "$LCTL get_param -n \
		mdd.$(facet_svc mds1).lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "partial" 32 || {
		error "(4) mds1 is not the expected 'partial'"
	}

	wait_update_facet mds2 "$LCTL get_param -n \
		mdd.$(facet_svc mds2).lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		error "(5) mds2 is not the expected 'completed'"
	}

	do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0

	local repaired=$(do_facet mds1 $LCTL get_param -n \
			 mdd.$(facet_svc mds1).lfsck_namespace |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ $repaired -eq 0 ] ||
		error "(6) Expect 0 fixed on mds1, but got: $repaired"

	repaired=$(do_facet mds2 $LCTL get_param -n \
		   mdd.$(facet_svc mds2).lfsck_namespace |
		   awk '/^lost_dirent_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(7) Expect 1 fixed on mds2, but got: $repaired"

	echo "Trigger namespace LFSCK on all devices again to cleanup"
	$START_NAMESPACE -r -A ||
		error "(8) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 9

	# repaired is already local; just re-read the counters.
	repaired=$(do_facet mds1 $LCTL get_param -n \
		   mdd.$(facet_svc mds1).lfsck_namespace |
		   awk '/^lost_dirent_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(10) Expect 1 fixed on mds1, but got: $repaired"

	repaired=$(do_facet mds2 $LCTL get_param -n \
		   mdd.$(facet_svc mds2).lfsck_namespace |
		   awk '/^lost_dirent_repaired/ { print $2 }')
	[ $repaired -eq 0 ] ||
		error "(11) Expect 0 fixed on mds2, but got: $repaired"
}
run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
# test_29a (currently not registered, see the note after the function):
# a hard link is created under fail_loc 0x1625 so that stat reports 3
# links; namespace LFSCK is expected to bring the count back to 2.
test_29a() {
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5517"

	echo "The object's nlink attribute is larger than the object's known"
	echo "name entries count. The LFSCK will repair the object's nlink"
	echo "attribute to match the known name entries count"

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"

	echo "Inject failure stub on MDT0 to simulate the case that foo's"
	echo "nlink attribute is larger than its name entries count."

	#define OBD_FAIL_LFSCK_MORE_NLINK	0x1625
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
	ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	# Drop cached attributes so stat reflects the injected link count.
	cancel_lru_locks mdc
	local count=$(stat --format=%h $DIR/$tdir/d0/foo)
	[ $count -eq 3 ] || error "(4) Cannot inject error: $count"

	echo "Trigger namespace LFSCK to repair the nlink count"
	$START_NAMESPACE -r -A ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^nlinks_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(7) Fail to repair nlink count: $repaired"

	cancel_lru_locks mdc
	count=$(stat --format=%h $DIR/$tdir/d0/foo)
	[ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
}
# 29a is disabled: nlink is only allowed to be updated if the number of
# known linkEA entries is larger than the nlink count.
#run_test 29a "LFSCK can repair bad nlink count (1)"
# test_29b: a hard link is created under fail_loc 0x1626 so that stat
# reports only 1 link; namespace LFSCK must report one nlinks repair and
# the count must become 2.
test_29b() {
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5517"

	echo "The object's nlink attribute is smaller than the object's known"
	echo "name entries count. The LFSCK will repair the object's nlink"
	echo "attribute to match the known name entries count"

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"

	echo "Inject failure stub on MDT0 to simulate the case that foo's"
	echo "nlink attribute is smaller than its name entries count."

	#define OBD_FAIL_LFSCK_LESS_NLINK	0x1626
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
	ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	# Drop cached attributes so stat reflects the injected link count.
	cancel_lru_locks mdc
	local count=$(stat --format=%h $DIR/$tdir/d0/foo)
	[ $count -eq 1 ] || error "(4) Cannot inject error: $count"

	echo "Trigger namespace LFSCK to repair the nlink count"
	$START_NAMESPACE -r -A ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^nlinks_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(7) Fail to repair nlink count: $repaired"

	cancel_lru_locks mdc
	count=$(stat --format=%h $DIR/$tdir/d0/foo)
	[ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
}
run_test 29b "LFSCK can repair bad nlink count (2)"
# Test 29c: linkEA size limitation and the overflow mark.
# Skipped unless MDS1_VERSION is newer than 2.6.50 (LU-5517).
# Flow: create 150 hard links to guard/f0, check that migrating guard
# fails (only when MDSCOUNT >= 2), remove 100 links, check migration still
# fails, run namespace LFSCK, expect linkea_overflow_cleared == 1 and
# nlinks_repaired == 0, then check migration succeeds.
4772 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4773 skip "MDS older than 2.6.50, LU-5517"
4776 echo "The namespace LFSCK will create many hard links to the target"
4777 echo "file as to exceed the linkEA size limitation. Under such case"
4778 echo "the linkEA will be marked as overflow that will prevent the"
4779 echo "target file to be migrated. Then remove some hard links to"
4780 echo "make the left hard links to be held within the linkEA size"
4781 echo "limitation. But before the namespace LFSCK adding all the"
4782 echo "missed linkEA entries back, the overflow mark (timestamp)"
4783 echo "will not be cleared."
4786 check_mount_and_prep
# guard/ holds the link target; foo/ (on the last MDT index) holds the links.
4788 mkdir -p $DIR/$tdir/guard || error "(0.1) Fail to mkdir"
4789 $LFS mkdir -i $((MDSCOUNT - 1)) $DIR/$tdir/foo ||
4790 error "(0.2) Fail to mkdir"
4791 touch $DIR/$tdir/guard/f0 || error "(1) Fail to create"
# Remember the FID; an unchanged FID later means the file was not migrated.
4792 local oldfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4794 # define MAX_LINKEA_SIZE 4096
4795 # sizeof(link_ea_header) = 24
4796 # sizeof(link_ea_entry) = 18
4797 # nlink_min=$(((MAX_LINKEA_SIZE - sizeof(link_ea_header)) /
4798 # (sizeof(link_ea_entry) + name_length))
4799 # If the average name length is 12 bytes, then 150 hard links
4800 # is totally enough to overflow the linkEA
4801 echo "Create 150 hard links should succeed although the linkEA overflow"
4802 createmany -l $DIR/$tdir/guard/f0 $DIR/$tdir/foo/ttttttttttt 150 ||
4803 error "(2) Fail to hard link"
4805 cancel_lru_locks mdc
# Migration checks only apply with at least two MDTs.
4806 if [ $MDSCOUNT -ge 2 ]; then
4807 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null &&
4808 error "(3.1) Migrate should fail"
4810 echo "The object with linkEA overflow should NOT be migrated"
4811 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4812 [ "$newfid" == "$oldfid" ] ||
4813 error "(3.2) Migrate should fail: $newfid != $oldfid"
4816 # Remove 100 hard links, then the linkEA should have space
4817 # to hold the missed linkEA entries.
4818 echo "Remove 100 hard links to save space for the missed linkEA entries"
4819 unlinkmany $DIR/$tdir/foo/ttttttttttt 100 || error "(4) Fail to unlink"
4821 if [ $MDSCOUNT -ge 2 ]; then
4822 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null &&
4823 error "(5.1) Migrate should fail"
4825 # The overflow timestamp is still there, so migration will fail.
4826 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4827 [ "$newfid" == "$oldfid" ] ||
4828 error "(5.2) Migrate should fail: $newfid != $oldfid"
4831 # sleep 3 seconds to guarantee that the overflow is recognized
4834 echo "Trigger namespace LFSCK to clear the overflow timestamp"
4835 $START_NAMESPACE -r -A ||
4836 error "(6) Fail to start LFSCK for namespace"
4838 wait_all_targets_blocked namespace completed 7
# One overflow mark must have been cleared ...
4840 local repaired=$($SHOW_NAMESPACE |
4841 awk '/^linkea_overflow_cleared/ { print $2 }')
4842 [ $repaired -eq 1 ] ||
4843 error "(8) Fail to clear linkea overflow: $repaired"
# ... without any nlink repair being counted.
4845 repaired=$($SHOW_NAMESPACE |
4846 awk '/^nlinks_repaired/ { print $2 }')
4847 [ $repaired -eq 0 ] ||
4848 error "(9) Unexpected nlink repaired: $repaired"
4850 if [ $MDSCOUNT -ge 2 ]; then
4851 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
4852 error "(10.1) Migrate failure"
4854 # Migration should succeed after clear the overflow timestamp.
4855 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4856 [ "$newfid" != "$oldfid" ] ||
4857 error "(10.2) Migrate should succeed"
4859 ls -l $DIR/$tdir/foo > /dev/null ||
4860 error "(11) 'ls' failed after migration"
# Final cleanup of the target file and the remaining hard links.
4863 rm -f $DIR/$tdir/guard/f0 || error "(12) Fail to unlink f0"
4864 rm -rf $DIR/$tdir/foo || error "(13) Fail to rmdir foo"
4866 run_test 29c "verify linkEA size limitation"
# Test 30: recover orphans from the backend /lost+found.
# Skipped unless mds1 uses ldiskfs, FILESET is empty, and MDS1_VERSION is
# newer than 2.6.50 (LU-5518).
# Flow: create foo/f0 and foo/d0 (d0 is created while fail_loc 0x161d is
# set), populate d0, remove everything while fail_loc 0x1624 is set, run
# e2fsck on the stopped MDT, restart it, run namespace LFSCK, then check
# that f0 is back in place and d0 (with d1 and f1) appears under
# .lustre/lost+found/MDT0000/.
4869 [[ $mds1_FSTYPE == ldiskfs ]] || skip "only ldiskfs has lost+found"
4870 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4871 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4872 skip "MDS older than 2.6.50, LU-5518"
4875 echo "The namespace LFSCK will move the orphans from backend"
4876 echo "/lost+found directory to normal client visible namespace"
4877 echo "or to global visible ./lustre/lost+found/MDTxxxx/ directory"
4880 check_mount_and_prep
4882 $LFS mkdir -i 0 $DIR/$tdir/foo || error "(1) Fail to mkdir foo"
# The message names f0, the file actually being created here.
4883 touch $DIR/$tdir/foo/f0 || error "(2) Fail to touch f0"
4885 echo "Inject failure stub on MDT0 to simulate the case that"
4886 echo "directory d0 has no linkEA entry, then the LFSCK will"
4887 echo "move it into .lustre/lost+found/MDTxxxx/ later."
4889 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
4890 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
4891 mkdir $DIR/$tdir/foo/d0 || error "(3) Fail to mkdir d0"
4892 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Parent and child FIDs are needed later to build the lost+found name.
4894 local pfid=$($LFS path2fid $DIR/$tdir/foo)
4895 local cfid=$($LFS path2fid $DIR/$tdir/foo/d0)
4897 touch $DIR/$tdir/foo/d0/f1 || error "(4) Fail to touch f1"
4898 mkdir $DIR/$tdir/foo/d0/d1 || error "(5) Fail to mkdir d1"
4900 echo "Inject failure stub on MDT0 to simulate the case that the"
4901 echo "object's name entry will be removed, but not destroy the"
4902 echo "object. Then backend e2fsck will handle it as orphan and"
4903 echo "add them into the backend /lost+found directory."
# All four removals happen while fail_loc 0x1624 is set.
4905 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4906 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4907 rmdir $DIR/$tdir/foo/d0/d1 || error "(6) Fail to rmdir d1"
4908 rm -f $DIR/$tdir/foo/d0/f1 || error "(7) Fail to unlink f1"
4909 rmdir $DIR/$tdir/foo/d0 || error "(8) Fail to rmdir d0"
4910 rm -f $DIR/$tdir/foo/f0 || error "(9) Fail to unlink f0"
4911 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# The client is unmounted and the MDT stopped before e2fsck runs on it.
4913 umount_client $MOUNT || error "(10) Fail to stop client!"
4915 stop $SINGLEMDS || error "(11) Fail to stop $SINGLEMDS"
4917 local dev=$(facet_device $SINGLEMDS)
4919 echo "run e2fsck on $SINGLEMDS"
4920 run_e2fsck $(facet_active_host $SINGLEMDS) $dev "-y" ||
4921 error "(12) Fail to run e2fsck"
4923 start_facet $SINGLEMDS "$MOUNT_OPTS_NOSCRUB" 13
4925 echo "Trigger namespace LFSCK to recover backend orphans"
4926 $START_NAMESPACE -r -A ||
4927 error "(14) Fail to start LFSCK for namespace"
4929 wait_all_targets_blocked namespace completed 15
# Four objects were orphaned above (d1, f1, d0, f0), hence at least 4 moves.
4931 local repaired=$($SHOW_NAMESPACE |
4932 awk '/^local_lost_found_moved/ { print $2 }')
4933 [ $repaired -ge 4 ] ||
4934 error "(16) Fail to recover backend orphans: $repaired"
4936 mount_client $MOUNT || error "(17) Fail to start client!"
4938 stat $DIR/$tdir/foo/f0 || error "(18) f0 is not recovered"
4940 ls -ail $MOUNT/.lustre/lost+found/
4942 echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
4943 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4944 error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4946 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
4948 local cname=$MOUNT/.lustre/lost+found/MDT0000/${cfid}-${pfid}-D-0
# cname is a non-empty string by construction, so a -z test on it can never
# fail; check that the recovered directory really exists instead.
4949 [ -d "$cname" ] || error "(20) d0 is not recovered"
4951 stat ${cname}/d1 || error "(21) d1 is not recovered"
4952 stat ${cname}/f1 || error "(22) f1 is not recovered"
4954 run_test 30 "LFSCK can recover the orphans from backend /lost+found"
# Test 31a: name entry stored in the wrong shard of a striped directory
# (fail_val=0 variant).
# Skipped with fewer than 2 MDTs or MDS1_VERSION not newer than 2.6.50.
# Flow: create a directory striped over all MDTs, create MDSCOUNT
# sub-directories while the client has fail_loc 0x1628 set, run namespace
# LFSCK, expect name_hash_repaired >= 1, remount the client and verify every
# entry can be stat'ed and removed.
4957 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4958 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4959 skip "MDS older than 2.6.50, LU-5519"
4962 echo "For the name entry under a striped directory, if the name"
4963 echo "hash does not match the shard, then the LFSCK will repair"
4964 echo "the bad name entry"
4967 check_mount_and_prep
4969 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4970 error "(1) Fail to create striped directory"
4972 echo "Inject failure stub on client to simulate the case that"
4973 echo "some name entry should be inserted into other non-first"
4974 echo "shard, but inserted into the first shard by wrong"
# fail_loc is set with a plain $LCTL (no do_facet), i.e. on the local node.
4976 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
4977 $LCTL set_param fail_loc=0x1628 fail_val=0
4978 createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
4979 error "(2) Fail to create file under striped directory"
4980 $LCTL set_param fail_loc=0 fail_val=0
4982 echo "Trigger namespace LFSCK to repair bad name hash"
4983 $START_NAMESPACE -r -A ||
4984 error "(3) Fail to start LFSCK for namespace"
4986 wait_all_targets_blocked namespace completed 4
4988 local repaired=$($SHOW_NAMESPACE |
4989 awk '/^name_hash_repaired/ { print $2 }')
4990 [ $repaired -ge 1 ] ||
4991 error "(5) Fail to repair bad name hash: $repaired"
# Count entries that "lfs find -H badtype" reports under the directory.
# NOTE(review): as shown, the error call below has no guard on $rc and would
# always fire; a condition on $rc presumably precedes it - confirm.
4993 local rc=$($LFS find -H badtype $DIR/$tdir/striped_dir | wc -l)
4995 error "Fail to find flag bad type: $rc"
# Remount the client before re-checking the entries.
4997 umount_client $MOUNT || error "(6) umount failed"
4998 mount_client $MOUNT || error "(7) mount failed"
# Every d$i created above must still be reachable and removable.
5000 for ((i = 0; i < $MDSCOUNT; i++)); do
5001 stat $DIR/$tdir/striped_dir/d$i ||
5002 error "(8) Fail to stat d$i after LFSCK"
5003 rmdir $DIR/$tdir/striped_dir/d$i ||
5004 error "(9) Fail to unlink d$i after LFSCK"
5007 rmdir $DIR/$tdir/striped_dir ||
5008 error "(10) Fail to remove the striped directory after LFSCK"
5010 run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
# Test 31b: name entry stored in the wrong shard of a striped directory
# (fail_val=1 variant).
# Skipped with fewer than 2 MDTs or MDS1_VERSION not newer than 2.6.50.
# Differs from 31a in that MDSCOUNT * 5 sub-directories are created and the
# name_hash_repaired counter is read from mds2 instead of $SINGLEMDS.
5013 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5014 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5015 skip "MDS older than 2.6.50, LU-5519"
5018 echo "For the name entry under a striped directory, if the name"
5019 echo "hash does not match the shard, then the LFSCK will repair"
5020 echo "the bad name entry"
5023 check_mount_and_prep
5025 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5026 error "(1) Fail to create striped directory"
5028 echo "Inject failure stub on client to simulate the case that"
5029 echo "some name entry should be inserted into other non-second"
5030 echo "shard, but inserted into the secod shard by wrong"
# fail_loc is set with a plain $LCTL (no do_facet), i.e. on the local node.
5032 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
5033 $LCTL set_param fail_loc=0x1628 fail_val=1
5034 createmany -d $DIR/$tdir/striped_dir/d $((MDSCOUNT * 5)) ||
5035 error "(2) Fail to create file under striped directory"
5036 $LCTL set_param fail_loc=0 fail_val=0
5038 echo "Trigger namespace LFSCK to repair bad name hash"
5039 $START_NAMESPACE -r -A ||
5040 error "(3) Fail to start LFSCK for namespace"
5042 wait_all_targets_blocked namespace completed 4
# The repair counter is queried on mds2 for this variant.
5044 local repaired=$(do_facet mds2 $LCTL get_param -n \
5045 mdd.$(facet_svc mds2).lfsck_namespace |
5046 awk '/^name_hash_repaired/ { print $2 }')
5047 echo "repaired $repaired name entries with bad hash"
5048 [ $repaired -ge 1 ] ||
5049 error "(5) Fail to repair bad name hash: $repaired"
# Remount the client before re-checking the entries.
5051 umount_client $MOUNT || error "(6) umount failed"
5052 mount_client $MOUNT || error "(7) mount failed"
# Every d$i created above must still be reachable and removable.
5054 for ((i = 0; i < $((MDSCOUNT * 5)); i++)); do
5055 stat $DIR/$tdir/striped_dir/d$i ||
5056 error "(8) Fail to stat d$i after LFSCK"
5057 rmdir $DIR/$tdir/striped_dir/d$i ||
5058 error "(9) Fail to unlink d$i after LFSCK"
5061 rmdir $DIR/$tdir/striped_dir ||
5062 error "(10) Fail to remove the striped directory after LFSCK"
5064 run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
# Test 31c: re-generate a lost master LMV EA when nothing was created under
# the directory afterwards.
# Skipped with fewer than 2 MDTs or MDS1_VERSION not newer than 2.6.50.
# Flow: create the striped directory while fail_loc 0x1629 is set on
# $SINGLEMDS, run namespace LFSCK, expect striped_dirs_repaired == 1 and one
# hit from "lfs find -H lostlmv", remount, verify the directory lists empty
# and can be removed.
5067 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5068 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5069 skip "MDS older than 2.6.50, LU-5519"
5072 echo "For some reason, the master MDT-object of the striped directory"
5073 echo "may lost its master LMV EA. If nobody created files under the"
5074 echo "master directly after the master LMV EA lost, then the LFSCK"
5075 echo "should re-generate the master LMV EA."
5078 check_mount_and_prep
5080 echo "Inject failure stub on MDT0 to simulate the case that the"
5081 echo "master MDT-object of the striped directory lost the LMV EA."
# The striped directory is created between setting and clearing fail_loc.
5083 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
5084 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
5085 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5086 error "(1) Fail to create striped directory"
5087 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
5089 echo "Trigger namespace LFSCK to re-generate master LMV EA"
5090 $START_NAMESPACE -r -A ||
5091 error "(2) Fail to start LFSCK for namespace"
5093 wait_all_targets_blocked namespace completed 3
5095 local repaired=$($SHOW_NAMESPACE |
5096 awk '/^striped_dirs_repaired/ { print $2 }')
5097 [ $repaired -eq 1 ] ||
5098 error "(4) Fail to re-generate master LMV EA: $repaired"
# Exactly one entry is expected to carry the lostlmv flag.
5100 local rc=$($LFS find -H lostlmv $DIR/$tdir/striped_dir | wc -l)
5101 [ $rc -eq 1 ] || error "Fail to find flag lost LMV: $rc"
5103 umount_client $MOUNT || error "(5) umount failed"
5104 mount_client $MOUNT || error "(6) mount failed"
# A non-empty listing is treated as evidence that the repair did not work.
5106 local empty=$(ls $DIR/$tdir/striped_dir/)
5107 [ -z "$empty" ] || error "(7) The master LMV EA is not repaired: $empty"
5109 rmdir $DIR/$tdir/striped_dir ||
5110 error "(8) Fail to remove the striped directory after LFSCK"
5112 run_test 31c "Re-generate the lost master LMV EA for striped directory"
# Test 31d: lost master LMV EA where the directory was modified afterwards.
# Skipped with fewer than 2 MDTs or MDS1_VERSION not newer than 2.6.50.
# Flow: create the striped directory while fail_loc 0x1629 is set, remount,
# create "dummy" inside it, run namespace LFSCK, expect
# striped_dirs_repaired == 0, verify dummy is still there, verify new
# creations fail, then clear the immutable attribute and remove the
# directory.
5115 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5116 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5117 skip "MDS older than 2.6.50, LU-5519"
5120 echo "For some reason, the master MDT-object of the striped directory"
5121 echo "may lost its master LMV EA. If somebody created files under the"
5122 echo "master directly after the master LMV EA lost, then the LFSCK"
5123 echo "should NOT re-generate the master LMV EA, instead, it should"
5124 echo "change the broken striped dirctory as read-only to prevent"
5125 echo "further damage"
5128 check_mount_and_prep
5130 echo "Inject failure stub on MDT0 to simulate the case that the"
5131 echo "master MDT-object of the striped directory lost the LMV EA."
5133 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
5134 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
5135 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5136 error "(1) Fail to create striped directory"
5137 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0
# Remount first, then modify the broken directory.
5139 umount_client $MOUNT || error "(2) umount failed"
5140 mount_client $MOUNT || error "(3) mount failed"
5142 touch $DIR/$tdir/striped_dir/dummy ||
5143 error "(4) Fail to touch under broken striped directory"
5145 echo "Trigger namespace LFSCK to find out the inconsistency"
5146 $START_NAMESPACE -r -A ||
5147 error "(5) Fail to start LFSCK for namespace"
5149 wait_all_targets_blocked namespace completed 6
# Zero repairs expected: the master LMV EA must not be re-generated here.
5151 local repaired=$($SHOW_NAMESPACE |
5152 awk '/^striped_dirs_repaired/ { print $2 }')
5153 [ $repaired -eq 0 ] ||
5154 error "(7) Re-generate master LMV EA unexpected: $repaired"
5156 stat $DIR/$tdir/striped_dir/dummy ||
5157 error "(8) Fail to stat $DIR/$tdir/striped_dir/dummy"
# Inverted check: a successful touch is the failure case.
5159 touch $DIR/$tdir/striped_dir/foo &&
5160 error "(9) The broken striped directory should be read-only"
# Drop the immutable attribute so the directory can be removed.
5162 chattr -i $DIR/$tdir/striped_dir ||
5163 error "(10) Fail to chattr on the broken striped directory"
5165 rmdir $DIR/$tdir/striped_dir ||
5166 error "(11) Fail to remove the striped directory after LFSCK"
5168 run_test 31d "Set broken striped directory (modified after broken) as read-only"
# Test 31e: re-generate a lost slave LMV EA (fail_val=0 variant).
# Skipped with fewer than 2 MDTs or MDS1_VERSION not newer than 2.6.50.
# Flow: create the striped directory while fail_loc 0x162a / fail_val 0 is
# set on $SINGLEMDS, run namespace LFSCK, expect
# striped_shards_repaired == 1 on $SINGLEMDS, then remove the directory.
5171 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5172 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5173 skip "MDS older than 2.6.50, LU-5519"
5176 echo "For some reason, the slave MDT-object of the striped directory"
5177 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
5178 echo "slave LMV EA."
5181 check_mount_and_prep
5183 echo "Inject failure stub on MDT0 to simulate the case that the"
5184 echo "slave MDT-object (that resides on the same MDT as the master"
5185 echo "MDT-object resides on) lost the LMV EA."
# The striped directory is created between setting and clearing fail_loc.
5187 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
5188 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=0
5189 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5190 error "(1) Fail to create striped directory"
5191 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5193 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
5194 $START_NAMESPACE -r -A ||
5195 error "(2) Fail to start LFSCK for namespace"
5197 wait_all_targets_blocked namespace completed 3
# Exactly one shard repair is expected in the $SINGLEMDS report.
5199 local repaired=$($SHOW_NAMESPACE |
5200 awk '/^striped_shards_repaired/ { print $2 }')
5201 [ $repaired -eq 1 ] ||
5202 error "(4) Fail to re-generate slave LMV EA: $repaired"
5204 rmdir $DIR/$tdir/striped_dir ||
5205 error "(5) Fail to remove the striped directory after LFSCK"
5207 run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
# Test 31f: re-generate a lost slave LMV EA (fail_val=1 variant).
# Skipped with fewer than 2 MDTs or MDS1_VERSION not newer than 2.6.50.
# Same flow as 31e, but fail_val is 1 and the striped_shards_repaired
# counter is read from mds2 instead of $SINGLEMDS.
5210 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5211 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5212 skip "MDS older than 2.6.50, LU-5519"
5215 echo "For some reason, the slave MDT-object of the striped directory"
5216 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
5217 echo "slave LMV EA."
5220 check_mount_and_prep
5222 echo "Inject failure stub on MDT0 to simulate the case that the"
5223 echo "slave MDT-object (that resides on different MDT as the master"
5224 echo "MDT-object resides on) lost the LMV EA."
# The striped directory is created between setting and clearing fail_loc.
5226 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
5227 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=1
5228 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5229 error "(1) Fail to create striped directory"
5230 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5232 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
5233 $START_NAMESPACE -r -A ||
5234 error "(2) Fail to start LFSCK for namespace"
5236 wait_all_targets_blocked namespace completed 3
# The repair counter is queried on mds2 for this variant.
5238 local repaired=$(do_facet mds2 $LCTL get_param -n \
5239 mdd.$(facet_svc mds2).lfsck_namespace |
5240 awk '/^striped_shards_repaired/ { print $2 }')
5241 [ $repaired -eq 1 ] ||
5242 error "(4) Fail to re-generate slave LMV EA: $repaired"
5244 rmdir $DIR/$tdir/striped_dir ||
5245 error "(5) Fail to remove the striped directory after LFSCK"
5247 run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
# Test 31g: repair a slave LMV EA whose stripe index is corrupted.
# Skipped with fewer than 2 MDTs or MDS1_VERSION not newer than 2.6.50.
# Flow: create the striped directory while fail_loc 0x162b is set on
# $SINGLEMDS, run namespace LFSCK, expect striped_shards_repaired == 1,
# remount, then verify a file can be created and removed in the directory
# and the directory itself can be removed.
5250 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5251 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5252 skip "MDS older than 2.6.50, LU-5519"
5255 echo "For some reason, the stripe index in the slave LMV EA is"
5256 echo "corrupted. The LFSCK should repair the slave LMV EA."
5259 check_mount_and_prep
5261 echo "Inject failure stub on MDT0 to simulate the case that the"
5262 echo "slave LMV EA on the first shard of the striped directory"
5263 echo "claims the same index as the second shard claims"
# The striped directory is created between setting and clearing fail_loc.
5265 #define OBD_FAIL_LFSCK_BAD_SLAVE_LMV 0x162b
5266 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162b fail_val=0
5267 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5268 error "(1) Fail to create striped directory"
5269 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5271 echo "Trigger namespace LFSCK to repair the slave LMV EA"
5272 $START_NAMESPACE -r -A ||
5273 error "(2) Fail to start LFSCK for namespace"
5275 wait_all_targets_blocked namespace completed 3
# Exactly one shard repair is expected in the $SINGLEMDS report.
5277 local repaired=$($SHOW_NAMESPACE |
5278 awk '/^striped_shards_repaired/ { print $2 }')
5279 [ $repaired -eq 1 ] ||
5280 error "(4) Fail to repair slave LMV EA: $repaired"
5282 umount_client $MOUNT || error "(5) umount failed"
5283 mount_client $MOUNT || error "(6) mount failed"
# Functional check: the repaired directory must accept create and unlink.
5285 touch $DIR/$tdir/striped_dir/foo ||
5286 error "(7) Fail to touch file after the LFSCK"
5288 rm -f $DIR/$tdir/striped_dir/foo ||
5289 error "(8) Fail to unlink file after the LFSCK"
5291 rmdir $DIR/$tdir/striped_dir ||
5292 error "(9) Fail to remove the striped directory after LFSCK"
5294 run_test 31g "Repair the corrupted slave LMV EA"
# Test 31h: repair a corrupted shard name entry in a striped directory.
# Skipped with fewer than 2 MDTs or MDS1_VERSION not newer than 2.6.50.
# Flow: create the striped directory while fail_loc 0x162c is set on
# $SINGLEMDS, run namespace LFSCK, expect dirent_repaired == 1, remount,
# then verify a file can be created and removed in the directory and the
# directory itself can be removed.
5297 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5298 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5299 skip "MDS older than 2.6.50, LU-5519"
5302 echo "For some reason, the shard's name entry in the striped"
5303 echo "directory may be corrupted. The LFSCK should repair the"
5304 echo "bad shard's name entry."
5307 check_mount_and_prep
5309 echo "Inject failure stub on MDT0 to simulate the case that the"
5310 echo "first shard's name entry in the striped directory claims"
5311 echo "the same index as the second shard's name entry claims."
# The striped directory is created between setting and clearing fail_loc.
5313 #define OBD_FAIL_LFSCK_BAD_SLAVE_NAME 0x162c
5314 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162c fail_val=0
5315 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5316 error "(1) Fail to create striped directory"
5317 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5319 echo "Trigger namespace LFSCK to repair the shard's name entry"
5320 $START_NAMESPACE -r -A ||
5321 error "(2) Fail to start LFSCK for namespace"
5323 wait_all_targets_blocked namespace completed 3
# Exactly one directory-entry repair is expected in the $SINGLEMDS report.
5325 local repaired=$($SHOW_NAMESPACE |
5326 awk '/^dirent_repaired/ { print $2 }')
5327 [ $repaired -eq 1 ] ||
5328 error "(4) Fail to repair shard's name entry: $repaired"
5330 umount_client $MOUNT || error "(5) umount failed"
5331 mount_client $MOUNT || error "(6) mount failed"
# Functional check: the repaired directory must accept create and unlink.
5333 touch $DIR/$tdir/striped_dir/foo ||
5334 error "(7) Fail to touch file after the LFSCK"
5336 rm -f $DIR/$tdir/striped_dir/foo ||
5337 error "(8) Fail to unlink file after the LFSCK"
5339 rmdir $DIR/$tdir/striped_dir ||
5340 error "(9) Fail to remove the striped directory after LFSCK"
5342 run_test 31h "Repair the corrupted shard's name entry"
# Test 32a: LFSCK must be stoppable while an OST is down.
# Flow: unmount the client, start layout LFSCK on $SINGLEMDS with fail_loc
# 0x162d / fail_val 3 set, confirm it is in scanning-phase1, stop ost1,
# clear fail_loc, stop LFSCK, and finally restart ost1.
5347 umount_client $MOUNT
5349 #define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d
5350 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
5351 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
# The scan must still be in phase 1 when the OST is taken down.
5353 local STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
5354 [ "$STATUS" == "scanning-phase1" ] ||
5355 error "(2) Expect 'scanning-phase1', but got '$STATUS'"
5358 stop ost1 > /dev/null || error "(3) Fail to stop OST1!"
5360 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# The actual assertion of this test: lfsck_stop succeeds with ost1 stopped.
5364 $STOP_LFSCK || error "(4) Fail to stop LFSCK!"
# NOTE(review): MOUNT_OPTS_NOSCRUB is built from MDS_MOUNT_OPTS at the top
# of the file but is used here to start an OST - confirm this is intended.
5366 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
5367 error "(5) Fail to start ost1"
5369 run_test 32a "stop LFSCK when some OST failed"
# Test 32b: LFSCK must be stoppable while an MDT is down.
# Skipped with fewer than 2 MDTs.
# Flow: create dp on MDT index 1 with two striped children dc1/dc2, unmount
# the client, start namespace LFSCK with fail_loc 0x162d / fail_val 3 set,
# wait for scanning-phase1, stop mds2, clear fail_loc, stop LFSCK, and
# finally restart mds2.
5373 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5376 $LFS mkdir -i 1 $DIR/$tdir/dp ||
5377 error "(1) Fail to create $DIR/$tdir/dp"
5378 $LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc1 ||
5379 error "(2) Fail to create $DIR/$tdir/dp/dc1"
5380 $LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc2 ||
5381 error "(3) Fail to create $DIR/$tdir/dp/dc2"
5382 umount_client $MOUNT
5384 #define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d
5385 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
5386 $START_NAMESPACE -r -A || error "(4) Fail to start LFSCK for namespace!"
# Poll the status field on $SINGLEMDS until it reads scanning-phase1; the
# last argument (32) is presumably a timeout in seconds - confirm helper.
5388 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5389 mdd.${MDT_DEV}.lfsck_namespace |
5390 awk '/^status/ { print \\\$2 }'" "scanning-phase1" 32 || {
5392 error "(5) unexpected status"
5396 stop mds2 > /dev/null || error "(6) Fail to stop MDT2!"
5398 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# The actual assertion of this test: lfsck_stop succeeds with mds2 stopped.
5402 $STOP_LFSCK || error "(7) Fail to stop LFSCK!"
5404 start mds2 $(mdsdevname 2) $MOUNT_OPTS_NOSCRUB > /dev/null ||
5405 error "(8) Fail to start MDT2"
5407 run_test 32b "stop LFSCK when some MDT failed"
# Test 33: verify that start options show up in the LFSCK "param" line.
# Layout LFSCK started with "--dryrun -o -r" must report
# "dryrun,all_targets,orphan"; namespace LFSCK started with
# "-e abort -A -r" must report "failout,all_targets".
5413 $START_LAYOUT --dryrun -o -r ||
5414 error "(1) Fail to start layout LFSCK"
5415 wait_all_targets_blocked layout completed 2
# Second field of the "param" line in the lfsck_layout report.
5417 local PARAMS=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
5418 [ "$PARAMS" == "dryrun,all_targets,orphan" ] ||
5419 error "(3) Expect 'dryrun,all_targets,orphan', got '$PARAMS'"
5421 $START_NAMESPACE -e abort -A -r ||
5422 error "(4) Fail to start namespace LFSCK"
5423 wait_all_targets_blocked namespace completed 5
# Same check against the lfsck_namespace report.
5425 PARAMS=$($SHOW_NAMESPACE | awk '/^param/ { print $2 }')
5426 [ "$PARAMS" == "failout,all_targets" ] ||
5427 error "(6) Expect 'failout,all_targets', got '$PARAMS'"
5429 run_test 33 "check LFSCK paramters"
# Test 34: rebuild a lost agent object.
# Skipped with fewer than 2 MDTs or when mds1 is not ZFS-backed.
# Flow: create a directory on MDT index 1 while fail_loc 0x1630 is set on
# $SINGLEMDS, run namespace LFSCK and expect dirent_repaired == 1, then run
# it a second time and expect dirent_repaired == 0.
5433 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5434 [ "$mds1_FSTYPE" != zfs ] && skip "Only valid for ZFS backend"
5438 #define OBD_FAIL_LFSCK_NO_AGENTOBJ 0x1630
5439 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1630
5440 $LFS mkdir -i 1 $DIR/$tdir/dummy ||
5441 error "(1) Fail to create $DIR/$tdir/dummy"
5443 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# First pass: started without -A, and only $SINGLEMDS is polled.
5444 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
5445 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5446 mdd.${MDT_DEV}.lfsck_namespace |
5447 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5449 error "(3) unexpected status"
5452 local repaired=$($SHOW_NAMESPACE |
5453 awk '/^dirent_repaired/ { print $2 }')
5454 [ $repaired -eq 1 ] ||
5455 error "(4) Fail to repair the lost agent object: $repaired"
# Second pass: nothing should be left to repair.
5457 $START_NAMESPACE -r || error "(5) Fail to start LFSCK for namespace!"
5458 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5459 mdd.${MDT_DEV}.lfsck_namespace |
5460 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5462 error "(6) unexpected status"
5465 repaired=$($SHOW_NAMESPACE | awk '/^dirent_repaired/ { print $2 }')
5466 [ $repaired -eq 0 ] ||
5467 error "(7) Unexpected repairing: $repaired"
5469 run_test 34 "LFSCK can rebuild the lost agent object"
# Test 35: rebuild a lost agent entry.
# Skipped with fewer than 2 MDTs.
# Flow: create a directory on MDT index 1 while fail_loc 0x1631 is set on
# mds2, run namespace LFSCK on all targets and expect
# agent_entries_repaired == 1 on mds2, restart the filesystem, then run
# LFSCK again and expect agent_entries_repaired == 0.
5473 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5477 #define OBD_FAIL_LFSCK_NO_AGENTENT 0x1631
5478 do_facet mds2 $LCTL set_param fail_loc=0x1631
5479 $LFS mkdir -i 1 $DIR/$tdir/dummy ||
5480 error "(1) Fail to create $DIR/$tdir/dummy"
5483 do_facet mds2 $LCTL set_param fail_loc=0
5484 $START_NAMESPACE -A -r || error "(2) Fail to start LFSCK for namespace!"
# Only mds2 is polled, so the failure message names MDS2 directly; no loop
# variable "k" is assigned anywhere in this test body.
5485 wait_update_facet mds2 "$LCTL get_param -n \
5486 mdd.$(facet_svc mds2).lfsck_namespace |
5487 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
5488 error "(3) MDS2 is not the expected 'completed'"
# First pass: exactly one agent entry repair is expected on mds2.
5490 local repaired=$(do_facet mds2 $LCTL get_param -n \
5491 mdd.$(facet_svc mds2).lfsck_namespace |
5492 awk '/^agent_entries_repaired/ { print $2 }')
5493 [ $repaired -eq 1 ] ||
5494 error "(4) Fail to repair the lost agent entry: $repaired"
5496 echo "stopall to cleanup object cache"
5499 setupall > /dev/null
# Second pass after the restart: nothing should be left to repair.
5501 $START_NAMESPACE -A -r || error "(5) Fail to start LFSCK for namespace!"
5502 wait_update_facet mds2 "$LCTL get_param -n \
5503 mdd.$(facet_svc mds2).lfsck_namespace |
5504 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
5505 error "(6) MDS2 is not the expected 'completed'"
5507 repaired=$(do_facet mds2 $LCTL get_param -n \
5508 mdd.$(facet_svc mds2).lfsck_namespace |
5509 awk '/^agent_entries_repaired/ { print $2 }')
5510 [ $repaired -eq 0 ] ||
5511 error "(7) Unexpected repairing: $repaired"
5513 run_test 35 "LFSCK can rebuild the lost agent entry"
# Test 36a: rebuild the LOV EA of mirrored files that lost one mirror.
# Skipped with fewer than 3 OSTs.
# Flow: create three files f0..f2, each with three mirrors laid out on
# explicit OSTs, write 4 MiB to each and resync; with fail_loc 0x1616 set on
# mds1, split and destroy mirror 1 of f0, mirror 2 of f1 and mirror 3 of f2;
# verify each file is down to 2 mirrors; run layout LFSCK with -r -o; expect
# repaired_orphan == 9 on mds1 and all three files back to 3 mirrors with
# the removed mirror id present again.
5516 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5519 echo "The target MDT-object's LOV EA corrupted as to lose one of the "
5520 echo "mirrors information. The layout LFSCK should rebuild the LOV EA "
5521 echo "with the PFID EA of related OST-object(s) belong to the mirror."
5524 check_mount_and_prep
# Diagnostics: dump grant parameters now and register a dump of space,
# inode usage and grants via stack_trap (presumably run at test exit).
5528 lctl get_param osc.*.*grant*
5529 stack_trap "lfs df $DIR; lfs df -i $DIR; lctl get_param osc.*.*grant*"
# Each file gets three mirrors (-N), each mirror two components (-E) placed
# on explicit OST indices (-o).
5531 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5532 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f0 ||
5533 error "(0) Fail to create mirror file $DIR/$tdir/f0"
5534 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5535 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f1 ||
5536 error "(1) Fail to create mirror file $DIR/$tdir/f1"
5537 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5538 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f2 ||
5539 error "(2) Fail to create mirror file $DIR/$tdir/f2"
# Write 4 MiB to each file, then resync its mirrors.
5541 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5542 error "(3) Fail to write $DIR/$tdir/f0"
5543 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=4 ||
5544 error "(4) Fail to write $DIR/$tdir/f1"
5545 dd if=/dev/zero of=$DIR/$tdir/f2 bs=1M count=4 ||
5546 error "(5) Fail to write $DIR/$tdir/f2"
5548 $LFS mirror resync $DIR/$tdir/f0 ||
5549 error "(6) Fail to resync $DIR/$tdir/f0"
5550 $LFS mirror resync $DIR/$tdir/f1 ||
5551 error "(7) Fail to resync $DIR/$tdir/f1"
5552 $LFS mirror resync $DIR/$tdir/f2 ||
5553 error "(8) Fail to resync $DIR/$tdir/f2"
5555 cancel_lru_locks mdc
5556 cancel_lru_locks osc
# Print the layouts before corruption (output is informational).
5558 $LFS getstripe $DIR/$tdir/f0 ||
5559 error "(9) Fail to getstripe for $DIR/$tdir/f0"
5560 $LFS getstripe $DIR/$tdir/f1 ||
5561 error "(10) Fail to getstripe for $DIR/$tdir/f1"
5562 $LFS getstripe $DIR/$tdir/f2 ||
5563 error "(11) Fail to getstripe for $DIR/$tdir/f2"
5565 echo "Inject failure, to simulate the case of missing one mirror in LOV"
5566 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5567 do_facet mds1 $LCTL set_param fail_loc=0x1616
# A different mirror id is split off and destroyed (-d) from each file.
5569 $LFS mirror split --mirror-id 1 -d $DIR/$tdir/f0 ||
5570 error "(12) Fail to split 1st mirror from $DIR/$tdir/f0"
5571 $LFS mirror split --mirror-id 2 -d $DIR/$tdir/f1 ||
5572 error "(13) Fail to split 2nd mirror from $DIR/$tdir/f1"
5573 $LFS mirror split --mirror-id 3 -d $DIR/$tdir/f2 ||
5574 error "(14) Fail to split 3rd mirror from $DIR/$tdir/f2"
5578 do_facet mds1 $LCTL set_param fail_loc=0
# Inverted checks: finding the removed mirror id in the layout is a failure.
5580 $LFS getstripe $DIR/$tdir/f0 | grep "lcme_mirror_id:.*1" &&
5581 error "(15) The 1st of mirror is not destroyed"
5582 $LFS getstripe $DIR/$tdir/f1 | grep "lcme_mirror_id:.*2" &&
5583 error "(16) The 2nd of mirror is not destroyed"
5584 $LFS getstripe $DIR/$tdir/f2 | grep "lcme_mirror_id:.*3" &&
5585 error "(17) The 3rd of mirror is not destroyed"
# Each file must now report two mirrors.
5589 mirrors=$($LFS getstripe -N $DIR/$tdir/f0)
5590 [ $mirrors -eq 2 ] || error "(18) $DIR/$tdir/f0 has $mirrors mirrors"
5591 mirrors=$($LFS getstripe -N $DIR/$tdir/f1)
5592 [ $mirrors -eq 2 ] || error "(19) $DIR/$tdir/f1 has $mirrors mirrors"
5593 mirrors=$($LFS getstripe -N $DIR/$tdir/f2)
5594 [ $mirrors -eq 2 ] || error "(20) $DIR/$tdir/f2 has $mirrors mirrors"
5596 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5597 $START_LAYOUT -r -o || error "(21) Fail to start LFSCK for layout!"
# Wait for every MDT to report "completed".
5599 for k in $(seq $MDSCOUNT); do
5600 # The LFSCK status query interval is 30 seconds. For the case
5601 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5602 # time to guarantee the status sync up.
5603 wait_update_facet mds${k} "$LCTL get_param -n \
5604 mdd.$(facet_svc mds${k}).lfsck_layout |
5605 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5606 error "(22) MDS${k} is not the expected 'completed'"
# OSTs are checked once, without waiting.
5609 for k in $(seq $OSTCOUNT); do
5610 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5611 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5612 awk '/^status/ { print $2 }')
5613 [ "$cur_status" == "completed" ] ||
5614 error "(23) OST${k} Expect 'completed', but got '$cur_status'"
# 9 orphan repairs are expected on mds1; presumably 3 OST objects for each
# of the 3 destroyed mirrors - TODO confirm.
5617 local repaired=$(do_facet mds1 $LCTL get_param -n \
5618 mdd.$(facet_svc mds1).lfsck_layout |
5619 awk '/^repaired_orphan/ { print $2 }')
5620 [ $repaired -eq 9 ] ||
5621 error "(24) Expect 9 fixed on mds1, but got: $repaired"
# Each file must be back to three mirrors.
5623 mirrors=$($LFS getstripe -N $DIR/$tdir/f0)
5624 [ $mirrors -eq 3 ] || error "(25) $DIR/$tdir/f0 has $mirrors mirrors"
5625 mirrors=$($LFS getstripe -N $DIR/$tdir/f1)
5626 [ $mirrors -eq 3 ] || error "(26) $DIR/$tdir/f1 has $mirrors mirrors"
5627 mirrors=$($LFS getstripe -N $DIR/$tdir/f2)
5628 [ $mirrors -eq 3 ] || error "(27) $DIR/$tdir/f2 has $mirrors mirrors"
# The removed mirror id must be present in the layout again; on failure the
# full layout is printed before raising the error.
5630 $LFS getstripe $DIR/$tdir/f0 | grep "lcme_mirror_id:.*1" || {
5631 $LFS getstripe $DIR/$tdir/f0
5632 error "(28) The 1st of mirror is not recovered"
5635 $LFS getstripe $DIR/$tdir/f1 | grep "lcme_mirror_id:.*2" || {
5636 $LFS getstripe $DIR/$tdir/f1
5637 error "(29) The 2nd of mirror is not recovered"
5640 $LFS getstripe $DIR/$tdir/f2 | grep "lcme_mirror_id:.*3" || {
5641 $LFS getstripe $DIR/$tdir/f2
5642 error "(30) The 3rd of mirror is not recovered"
5645 run_test 36a "rebuild LOV EA for mirrored file (1)"
# Body of the test registered below as 36b. Visible flow: create a
# three-mirror file, write and resync it, remove it while fail_loc 0x1616
# is set on mds1, run layout LFSCK with "-r -o", then check that the
# object found under .lustre/lost+found/MDT0000 has 3 mirrors and
# 6 components again.
# NOTE(review): this listing omits short lines (function header, done,
# closing braces, blanks), so only the visible statements are annotated.
5648 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
5649 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5652 echo "The mirrored file lost its MDT-object, but relatd OST-objects "
5653 echo "are still there. The layout LFSCK should rebuild the LOV EA "
5654 echo "with the PFID EA of related OST-object(s) belong to the file. "
5657 check_mount_and_prep
# Three -N groups => three mirrors, each with two components whose OSTs
# are pinned with -o.
5659 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5660 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f0 ||
5661 error "(0) Fail to create mirror file $DIR/$tdir/f0"
# Keep the FID: the lost+found entry checked below is named after it.
5663 local fid=$($LFS path2fid $DIR/$tdir/f0)
5665 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5666 error "(1) Fail to write $DIR/$tdir/f0"
5667 $LFS mirror resync $DIR/$tdir/f0 ||
5668 error "(2) Fail to resync $DIR/$tdir/f0"
5670 cancel_lru_locks mdc
5671 cancel_lru_locks osc
5673 $LFS getstripe $DIR/$tdir/f0 ||
5674 error "(3) Fail to getstripe for $DIR/$tdir/f0"
5676 echo "Inject failure, to simulate the case of missing the MDT-object"
5677 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5678 do_facet mds1 $LCTL set_param fail_loc=0x1616
5679 rm -f $DIR/$tdir/f0 || error "(4) Fail to remove $DIR/$tdir/f0"
5683 do_facet mds1 $LCTL set_param fail_loc=0
5685 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5686 $START_LAYOUT -r -o || error "(5) Fail to start LFSCK for layout!"
5688 for k in $(seq $MDSCOUNT); do
5689 # The LFSCK status query interval is 30 seconds. In case some
5690 # LFSCK_NOTIFY RPCs fail or get lost, we wait long enough
5691 # to guarantee that the status has been synced up.
5692 wait_update_facet mds${k} "$LCTL get_param -n \
5693 mdd.$(facet_svc mds${k}).lfsck_layout |
5694 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5695 error "(6) MDS${k} is not the expected 'completed'"
# Each OST must report 'completed' in its own lfsck_layout status too.
5698 for k in $(seq $OSTCOUNT); do
5699 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5700 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5701 awk '/^status/ { print $2 }')
5702 [ "$cur_status" == "completed" ] ||
5703 error "(7) OST${k} Expect 'completed', but got '$cur_status'"
# mds1 must report exactly 9 repaired orphans (presumably one per
# OST-object of the three mirrors -- TODO confirm).
5706 local count=$(do_facet mds1 $LCTL get_param -n \
5707 mdd.$(facet_svc mds1).lfsck_layout |
5708 awk '/^repaired_orphan/ { print $2 }')
5709 [ $count -eq 9 ] || error "(8) Expect 9 fixed on mds1, but got: $count"
# The rebuilt file is expected at <fid>-R-0 under lost+found/MDT0000;
# "count" is reused for the mirror and component counts of that file.
5711 local name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
5712 count=$($LFS getstripe --mirror-count $name)
5713 [ $count -eq 3 ] || error "(9) $DIR/$tdir/f0 has $count mirrors"
5715 count=$($LFS getstripe --component-count $name)
5716 [ $count -eq 6 ] || error "(10) $DIR/$tdir/f0 has $count components"
# Every mirror id (1..3) must show up in the recovered layout; on a miss
# the full layout is dumped before failing.
5718 $LFS getstripe $name | grep "lcme_mirror_id:.*1" || {
5719 $LFS getstripe $name
5720 error "(11) The 1st of mirror is not recovered"
5723 $LFS getstripe $name | grep "lcme_mirror_id:.*2" || {
5724 $LFS getstripe $name
5725 error "(12) The 2nd of mirror is not recovered"
5728 $LFS getstripe $name | grep "lcme_mirror_id:.*3" || {
5729 $LFS getstripe $name
5730 error "(13) The 3rd of mirror is not recovered"
5733 run_test 36b "rebuild LOV EA for mirrored file (2)"
# Body of the test registered below as 36c. Visible flow: create a
# two-mirror file, write + resync it, write again so that the mirrors are
# no longer in sync, record the lcme_flags, remove the file while fail_loc
# 0x1616 is set on mds1, run layout LFSCK with "-r -o", then check that
# the object under .lustre/lost+found/MDT0000 has 2 mirrors, 4 components
# and the same lcme_flags as before the removal.
# NOTE(review): this listing omits short lines (function header, done,
# closing braces, blanks); original line 5749 -- presumably the setstripe
# target path and "||" -- is also absent.
5736 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
5737 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5740 echo "The mirrored file has been modified, not resynced yet, then "
5741 echo "lost its MDT-object, but relatd OST-objects are still there. "
5742 echo "The layout LFSCK should rebuild the LOV EA and relatd status "
5743 echo "with the PFID EA of related OST-object(s) belong to the file. "
5746 check_mount_and_prep
5748 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5750 error "(0) Fail to create mirror file $DIR/$tdir/f0"
# Keep the FID: the lost+found entry checked below is named after it.
5752 local fid=$($LFS path2fid $DIR/$tdir/f0)
5754 # The 1st dd + resync ensures all related OST-objects have been written
5755 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5756 error "(1.1) Fail to write $DIR/$tdir/f0"
5757 $LFS mirror resync $DIR/$tdir/f0 ||
5758 error "(1.2) Fail to resync $DIR/$tdir/f0"
5759 # The 2nd dd (with no resync afterwards) leaves one mirror stale
5760 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5761 error "(1.3) Fail to write $DIR/$tdir/f0"
5763 cancel_lru_locks mdc
5764 cancel_lru_locks osc
5766 $LFS getstripe $DIR/$tdir/f0 ||
5767 error "(2) Fail to getstripe for $DIR/$tdir/f0"
# Record the lcme_flags seen in the first and in the last 10 lines of the
# getstripe output; they are compared with the recovered file below.
5769 local saved_flags1=$($LFS getstripe $DIR/$tdir/f0 | head -n 10 |
5770 awk '/lcme_flags/ { print $2 }')
5771 local saved_flags2=$($LFS getstripe $DIR/$tdir/f0 | tail -n 10 |
5772 awk '/lcme_flags/ { print $2 }')
5774 echo "Inject failure, to simulate the case of missing the MDT-object"
5775 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5776 do_facet mds1 $LCTL set_param fail_loc=0x1616
5777 rm -f $DIR/$tdir/f0 || error "(3) Fail to remove $DIR/$tdir/f0"
5781 do_facet mds1 $LCTL set_param fail_loc=0
5783 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5784 $START_LAYOUT -r -o || error "(4) Fail to start LFSCK for layout!"
5786 for k in $(seq $MDSCOUNT); do
5787 # The LFSCK status query interval is 30 seconds. In case some
5788 # LFSCK_NOTIFY RPCs fail or get lost, we wait long enough
5789 # to guarantee that the status has been synced up.
5790 wait_update_facet mds${k} "$LCTL get_param -n \
5791 mdd.$(facet_svc mds${k}).lfsck_layout |
5792 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5793 error "(5) MDS${k} is not the expected 'completed'"
# Each OST must report 'completed' in its own lfsck_layout status too.
5796 for k in $(seq $OSTCOUNT); do
5797 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5798 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5799 awk '/^status/ { print $2 }')
5800 [ "$cur_status" == "completed" ] ||
5801 error "(6) OST${k} Expect 'completed', but got '$cur_status'"
# This test expects 6 repaired orphans (2 mirrors), not 9 as in 36b; the
# failure message now states the value that is actually checked.
5804 local count=$(do_facet mds1 $LCTL get_param -n \
5805 mdd.$(facet_svc mds1).lfsck_layout |
5806 awk '/^repaired_orphan/ { print $2 }')
5807 [ $count -eq 6 ] || error "(7) Expect 6 fixed on mds1, but got: $count"
# The rebuilt file is expected at <fid>-R-0 under lost+found/MDT0000;
# "count" is reused for the mirror and component counts of that file.
5809 local name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
5810 count=$($LFS getstripe --mirror-count $name)
5811 [ $count -eq 2 ] || error "(8) $DIR/$tdir/f0 has $count mirrors"
5813 count=$($LFS getstripe --component-count $name)
5814 [ $count -eq 4 ] || error "(9) $DIR/$tdir/f0 has $count components"
# The lcme_flags at both ends of the layout must match the values saved
# before the MDT-object was lost; dump the layout on mismatch.
5816 local flags=$($LFS getstripe $name | head -n 10 |
5817 awk '/lcme_flags/ { print $2 }')
5818 [ "$flags" == "$saved_flags1" ] || {
5819 $LFS getstripe $name
5820 error "(10) expect flags $saved_flags1, got $flags"
5823 flags=$($LFS getstripe $name | tail -n 10 |
5824 awk '/lcme_flags/ { print $2 }')
5825 [ "$flags" == "$saved_flags2" ] || {
5826 $LFS getstripe $name
5827 error "(11) expect flags $saved_flags2, got $flags"
5830 run_test 36c "rebuild LOV EA for mirrored file (3)"
# Body of the test registered below as 37 ("LFSCK must skip a ORPHAN").
# Visible flow: create $t_dir on MDT index 0, start a paused background
# multiop on it, start namespace LFSCK with "-r -A" and wait for the
# targets to reach "completed".
# NOTE(review): several short lines are missing from this listing; in
# particular $PID is assigned on a line that is not shown (presumably from
# the background multiop started below -- confirm).
5836 local t_dir="$DIR/$tdir/d0"
5837 check_mount_and_prep
5839 $LFS mkdir -i 0 $t_dir || error "(2) Fail to mkdir $t_dir on MDT0"
5840 multiop_bg_pause $t_dir D_c || { error "multiop failed: $?"; return 1; }
# Signal the paused multiop BEFORE reporting the failure: error usually
# ends the test (confirm in test-framework.sh), which would make a
# trailing kill unreachable and leave the background process paused.
5844 $START_NAMESPACE -r -A || {
5845 kill -USR1 $PID; error "(3) Fail to start LFSCK for namespace!"; }
5847 wait_all_targets_blocked namespace completed 4
5852 run_test 37 "LFSCK must skip a ORPHAN"
# Body of the test registered below as 38. Visible flow: create a foreign
# file whose LOV EA value is "<uuid1>@<uuid2>", verify its lfm_* fields,
# run namespace and layout LFSCK on all targets and require that neither
# reports any repair, then verify the same lfm_* fields again, check that
# setstripe/read/write on the file fail, and finally remove it.
# NOTE(review): this listing omits short lines (function header, closing
# brace, blanks); only the visible statements are annotated.
5856 [[ "$MDS1_VERSION" -le $(version_code 2.12.51) ]] &&
5857 skip "Need MDS version newer than 2.12.51"
5859 test_mkdir $DIR/$tdir
# Two random UUID strings form the opaque foreign-layout payload.
5860 local uuid1=$(cat /proc/sys/kernel/random/uuid)
5861 local uuid2=$(cat /proc/sys/kernel/random/uuid)
5863 # create foreign file
5864 $LFS setstripe --foreign=none --flags 0xda05 \
5865 -x "${uuid1}@${uuid2}" $DIR/$tdir/$tfile ||
5866 error "$DIR/$tdir/$tfile: create failed"
5868 $LFS getstripe -v $DIR/$tdir/$tfile |
5869 grep "lfm_magic:.*0x0BD70BD0" ||
5870 error "$DIR/$tdir/$tfile: invalid LOV EA foreign magic"
5871 # lfm_length is LOV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
# presumably 73 = two 36-character UUID strings plus '@' -- TODO confirm
5872 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_length:.*73" ||
5873 error "$DIR/$tdir/$tfile: invalid LOV EA foreign size"
5874 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_type:.*none" ||
5875 error "$DIR/$tdir/$tfile: invalid LOV EA foreign type"
5876 $LFS getstripe -v $DIR/$tdir/$tfile |
5877 grep "lfm_flags:.*0x0000DA05" ||
5878 error "$DIR/$tdir/$tfile: invalid LOV EA foreign flags"
5879 $LFS getstripe $DIR/$tdir/$tfile |
5880 grep "lfm_value:.*${uuid1}@${uuid2}" ||
5881 error "$DIR/$tdir/$tfile: invalid LOV EA foreign value"
5883 # modifying the striping should fail
5884 $LFS setstripe -c 2 $DIR/$tdir/$tfile &&
5885 error "$DIR/$tdir/$tfile: setstripe should fail"
5887 $START_NAMESPACE -r -A || error "Fail to start LFSCK for namespace"
5889 wait_all_targets_blocked namespace completed 1
5891 # check that the "global" namespace_repaired counter is 0
5892 local repaired=$(do_facet mds1 \
5893 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
5894 awk '/^namespace_repaired/ { print \\\$2 }'")
5895 [ $repaired -eq 0 ] ||
5896 error "(2) Expect no namespace repair, but got: $repaired"
5898 $START_LAYOUT -A -r || error "Fail to start LFSCK for layout"
5900 wait_all_targets_blocked layout completed 2
5902 # check that the "global" layout_repaired counter is 0
5903 local repaired=$(do_facet mds1 \
5904 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
5905 awk '/^layout_repaired/ { print \\\$2 }'")
5906 [ $repaired -eq 0 ] ||
5907 error "(2) Expect no layout repair, but got: $repaired"
5909 echo "post-lfsck checks of foreign file"
# Same field checks as before LFSCK: the foreign LOV EA must be unchanged.
5911 $LFS getstripe -v $DIR/$tdir/$tfile |
5912 grep "lfm_magic:.*0x0BD70BD0" ||
5913 error "$DIR/$tdir/$tfile: invalid LOV EA foreign magic"
5914 # lfm_length is LOV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
5915 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_length:.*73" ||
5916 error "$DIR/$tdir/$tfile: invalid LOV EA foreign size"
5917 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_type:.*none" ||
5918 error "$DIR/$tdir/$tfile: invalid LOV EA foreign type"
5919 $LFS getstripe -v $DIR/$tdir/$tfile |
5920 grep "lfm_flags:.*0x0000DA05" ||
5921 error "$DIR/$tdir/$tfile: invalid LOV EA foreign flags"
5922 $LFS getstripe $DIR/$tdir/$tfile |
5923 grep "lfm_value:.*${uuid1}@${uuid2}" ||
5924 error "$DIR/$tdir/$tfile: invalid LOV EA foreign value"
5926 # modifying the striping should still fail
5927 $LFS setstripe -c 2 $DIR/$tdir/$tfile &&
5928 error "$DIR/$tdir/$tfile: setstripe should fail"
5931 cat $DIR/$tdir/$tfile && error "$DIR/$tdir/$tfile: read should fail"
5932 cat /etc/passwd > $DIR/$tdir/$tfile &&
5933 error "$DIR/$tdir/$tfile: write should fail"
5935 # remove the foreign file
5936 rm $DIR/$tdir/$tfile ||
5937 error "$DIR/$tdir/$tfile: remove of foreign file has failed"
5939 run_test 38 "LFSCK does not break foreign file and reverse is also true"
# Body of the test registered below as 39. Visible flow: create a foreign
# directory $DIR/$tdir/${tdir}2 whose LMV EA value is "<uuid1>@<uuid2>",
# verify its lfm_* fields, check that file creation inside it fails while
# chmod/chown succeed, run namespace and layout LFSCK on all targets and
# require that neither reports any repair, then repeat all checks and
# finally rmdir the foreign directory.
# Fixes in this version: the two "file create should fail" checks now
# call error (the message used to be executed as a command), and the
# failure messages name the directory that is actually operated on.
# NOTE(review): this listing omits short lines (function header, closing
# brace, blanks); only the visible statements are annotated.
5943 [[ "$MDS1_VERSION" -le $(version_code 2.12.51) ]] &&
5944 skip "Need MDS version newer than 2.12.51"
5946 test_mkdir $DIR/$tdir
# Two random UUID strings form the opaque foreign-layout payload.
5947 local uuid1=$(cat /proc/sys/kernel/random/uuid)
5948 local uuid2=$(cat /proc/sys/kernel/random/uuid)
5950 # create foreign dir
5951 $LFS mkdir --foreign=none --xattr="${uuid1}@${uuid2}" --flags=0xda05 \
5952 $DIR/$tdir/${tdir}2 ||
5953 error "$DIR/$tdir/${tdir}2: create failed"
5955 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
5956 grep "lfm_magic:.*0x0CD50CD0" ||
5957 error "$DIR/$tdir/${tdir}2: invalid LMV EA magic"
5958 # lfm_length is LMV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
5959 # - sizeof(lfm_type) - sizeof(lfm_flags)
5960 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_length:.*73" ||
5961 error "$DIR/$tdir/${tdir}2: invalid LMV EA size"
5962 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_type:.*none" ||
5963 error "$DIR/$tdir/${tdir}2: invalid LMV EA type"
5964 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
5965 grep "lfm_flags:.*0x0000DA05" ||
5966 error "$DIR/$tdir/${tdir}2: invalid LMV EA flags"
5967 $LFS getdirstripe $DIR/$tdir/${tdir}2 |
5968 grep "lfm_value.*${uuid1}@${uuid2}" ||
5969 error "$DIR/$tdir/${tdir}2: invalid LMV EA value"
5971 # file create in dir should fail
5972 touch $DIR/$tdir/${tdir}2/$tfile &&
5973 error "$DIR/$tdir/${tdir}2: file create should fail"
5976 chmod 777 $DIR/$tdir/${tdir}2 ||
5977 error "$DIR/$tdir/${tdir}2: chmod failed"
5980 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/${tdir}2 ||
5981 error "$DIR/$tdir/${tdir}2: chown failed"
5983 $START_NAMESPACE -r -A || error "Fail to start LFSCK for namespace"
5985 wait_all_targets_blocked namespace completed 1
5987 # check that the "global" namespace_repaired counter is 0
5988 local repaired=$(do_facet mds1 \
5989 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
5990 awk '/^namespace_repaired/ { print \\\$2 }'")
5991 [ $repaired -eq 0 ] ||
5992 error "(2) Expect nothing to be repaired, but got: $repaired"
5994 $START_LAYOUT -A -r || error "Fail to start LFSCK for layout"
5996 wait_all_targets_blocked layout completed 2
5998 # check that the "global" layout_repaired counter is 0
5999 local repaired=$(do_facet mds1 \
6000 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
6001 awk '/^layout_repaired/ { print \\\$2 }'")
6002 [ $repaired -eq 0 ] ||
6003 error "(2) Expect no layout repair, but got: $repaired"
6005 echo "post-lfsck checks of foreign dir"
# Same field checks as before LFSCK: the foreign LMV EA must be unchanged.
6007 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
6008 grep "lfm_magic:.*0x0CD50CD0" ||
6009 error "$DIR/$tdir/${tdir}2: invalid LMV EA magic"
6010 # lfm_length is LMV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
6011 # - sizeof(lfm_type) - sizeof(lfm_flags)
6012 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_length:.*73" ||
6013 error "$DIR/$tdir/${tdir}2: invalid LMV EA size"
6014 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_type:.*none" ||
6015 error "$DIR/$tdir/${tdir}2: invalid LMV EA type"
6016 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
6017 grep "lfm_flags:.*0x0000DA05" ||
6018 error "$DIR/$tdir/${tdir}2: invalid LMV EA flags"
6019 $LFS getdirstripe $DIR/$tdir/${tdir}2 |
6020 grep "lfm_value.*${uuid1}@${uuid2}" ||
6021 error "$DIR/$tdir/${tdir}2: invalid LMV EA value"
6023 # file create in dir should fail
6024 touch $DIR/$tdir/${tdir}2/$tfile &&
6025 error "$DIR/$tdir/${tdir}2: file create should fail"
6028 chmod 777 $DIR/$tdir/${tdir}2 ||
6029 error "$DIR/$tdir/${tdir}2: chmod failed"
6032 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/${tdir}2 ||
6033 error "$DIR/$tdir/${tdir}2: chown failed"
6036 rmdir $DIR/$tdir/${tdir}2 ||
6037 error "$DIR/$tdir/${tdir}2: remove of foreign dir has failed"
6039 run_test 39 "LFSCK does not break foreign dir and reverse is also true"
# Body of the test registered below as 40a. Visible flow: create dir1 on
# MDT index 1 with a three-component default layout, create f1 in it and
# record its layout via get_layout_param, migrate dir1 to MDT index 0,
# run layout LFSCK with -r on $SINGLEMDS, wait for status "completed",
# and require that f1's layout parameters are unchanged.
# NOTE(review): this listing omits short lines (function header, closing
# braces, blanks and at least one statement inside the failure branch).
6042 [[ $MDSCOUNT -ge 2 ]] || skip "needs >= 2 MDTs"
6044 check_mount_and_prep
6045 $LFS mkdir -i 1 $DIR/$tdir/dir1
6046 $LFS setstripe -E 1M -c1 -S 1M -E 128M -c2 -S 4M -E eof $DIR/$tdir/dir1
6048 touch $DIR/$tdir/dir1/f1
# Reference layout, compared with the post-LFSCK layout at the end.
6049 local layout1=$(get_layout_param $DIR/$tdir/dir1/f1)
6051 echo "Migrate $DIR/$tdir/dir1 from MDT1 to MDT0"
6052 $LFS migrate -m 0 $DIR/$tdir/dir1
6054 echo "trigger LFSCK for layout"
6055 do_facet $SINGLEMDS $LCTL lfsck_start -M ${MDT_DEV} -t layout -r
# 32 is presumably a timeout in seconds -- confirm against
# wait_update_facet in test-framework.sh.
6057 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
6058 mdd.${MDT_DEV}.lfsck_layout |
6059 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
6061 error "(2) unexpected status"
6064 local layout2=$(get_layout_param $DIR/$tdir/dir1/f1)
6066 [[ "$layout1" == "$layout2" ]] || error "layout lost after lfsck"
6068 run_test 40a "LFSCK correctly fixes lmm_oi in composite layout"
# Body of the test registered below as 41 ("SEL support in LFSCK").
# Visible flow: save the debug mask of $SINGLEMDS and add "lfsck" to it,
# create $DIR/$tfile with two components that use -z, run LFSCK
# ("-A -t all -r -n on"), wait for the layout status "completed", fail if
# the debug log mentions lfsck_layout_verify_header, then restore the
# saved debug mask.
# NOTE(review): this listing omits short lines (function header, closing
# braces, blanks and statements inside the failure branches).
6072 local old_debug=$(do_facet $SINGLEMDS $LCTL get_param -n debug)
6074 do_facet $SINGLEMDS $LCTL set_param debug=+lfsck
6075 $LFS setstripe -E 1G -z 64M -E -1 -z 128M $DIR/$tfile
# Output is discarded here; presumably this flushes earlier debug
# messages so that the later "dk" only sees LFSCK output -- TODO confirm.
6076 do_facet $SINGLEMDS $LCTL dk > /dev/null
6078 echo "trigger LFSCK for SEL layout"
6079 do_facet $SINGLEMDS $LCTL lfsck_start -M ${MDT_DEV} -A -t all -r -n on
6080 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
6081 mdd.${MDT_DEV}.lfsck_layout |
6082 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
6084 error "(2) unexpected status"
# Any debug-log line naming lfsck_layout_verify_header counts as a failure.
6087 local errors=$(do_facet $SINGLEMDS $LCTL dk |
6088 grep "lfsck_layout_verify_header")
6090 [[ "x$errors" == "x" ]] || {
6092 error "lfsck failed"
# Put back the debug mask captured at the start.
6095 do_facet $SINGLEMDS "$LCTL set_param debug='$old_debug'"
6097 run_test 41 "SEL support in LFSCK"
6099 # restore the MDS/OST sizes and the OST count saved at the top of the script
6100 MDSSIZE=${SAVED_MDSSIZE}
6101 OSTSIZE=${SAVED_OSTSIZE}
6102 OSTCOUNT=${SAVED_OSTCOUNT}
6104 # finally clean up the system, with REFORMAT=yes, after restoring the values
6105 REFORMAT="yes" cleanup_and_setup_lustre
6108 check_and_cleanup_lustre