# Top-level setup: source the test framework, apply the skip conditions,
# save and adjust MDSSIZE/OSTSIZE/OSTCOUNT, reformat, and define the
# command strings and MDS mount options used later in this script.
3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
11 LUSTRE=${LUSTRE:-$(dirname $0)/..}
12 . $LUSTRE/tests/test-framework.sh
16 # bug number for skipped test:
17 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT "
18 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
20 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
23 require_dsh_mds || exit 0
27 if ! check_versions; then
28 skip "It is NOT necessary to test lfsck under interoperation mode"
32 (( $MDS1_VERSION >= $(version_code 2.3.60) )) ||
33 skip "Need MDS version at least 2.3.60"
# Remember the caller's sizes and OST count before overriding them below.
37 SAVED_MDSSIZE=${MDSSIZE}
38 SAVED_OSTSIZE=${OSTSIZE}
39 SAVED_OSTCOUNT=${OSTCOUNT}
40 # Use small MDS and OST sizes to speed up formatting.
41 # Do not make MDSSIZE/OSTSIZE too small, since that affects the default journal size.
43 [ "$mds1_FSTYPE" == zfs ] && MDSSIZE=300000
45 [ "$ost1_FSTYPE" == zfs ] && OSTSIZE=300000
47 # Cap the OST count at 4 to reduce the format/start/stop overhead.
49 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
51 # build up a clean test environment.
52 REFORMAT="yes" check_and_setup_lustre
# Target names and reusable command strings: start/stop LFSCK and read the
# lfsck_namespace / lfsck_layout parameters on $SINGLEMDS and on ost1.
54 MDT_DEV=$(devicelabel $SINGLEMDS $(facet_device $SINGLEMDS))
55 OST_DEV="${FSNAME}-OST0000"
56 START_NAMESPACE="do_facet $SINGLEMDS \
57 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
58 START_LAYOUT="do_facet $SINGLEMDS \
59 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
60 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
61 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
62 SHOW_NAMESPACE="do_facet $SINGLEMDS \
63 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
64 SHOW_LAYOUT="do_facet $SINGLEMDS \
65 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
66 SHOW_LAYOUT_ON_OST="do_facet ost1 \
67 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# MDS mount option variants: $MDS_MOUNT_OPTS plus user_xattr, optionally
# combined with noscrub or skip_lfsck.
68 MOUNT_OPTS_SCRUB="$MDS_MOUNT_OPTS -o user_xattr"
69 MOUNT_OPTS_NOSCRUB="$MDS_MOUNT_OPTS -o user_xattr,noscrub"
70 MOUNT_OPTS_SKIP_LFSCK="$MDS_MOUNT_OPTS -o user_xattr,skip_lfsck"
# Populate $DIR/$tdir with test data: copy the test scripts in, then use
# createmany to add $ndirs "d" entries and $ndirs "f" entries, $nfiles "f"
# entries under each d${i}, and $ndirs "e" entries. When $igif is set, the
# work runs with fail_loc=0x1504 and a "dummy" file is touched before the
# failpoint is cleared.
79 echo "preparing... $nfiles * $ndirs files will be created $(date)."
80 if [ ! -z $igif ]; then
81 #define OBD_FAIL_FID_IGIF 0x1504
82 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
85 cp $LUSTRE/tests/*.sh $DIR/$tdir/
86 if [ $ndirs -gt 0 ]; then
87 createmany -d $DIR/$tdir/d $ndirs
88 createmany -m $DIR/$tdir/f $ndirs
89 if [ $nfiles -gt 0 ]; then
90 for ((i = 0; i < $ndirs; i++)); do
91 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
92 /dev/null || error "createmany $nfiles"
95 createmany -d $DIR/$tdir/e $ndirs
98 if [ ! -z $igif ]; then
99 touch $DIR/$tdir/dummy
# Clear the failpoint that was set at the start of the preparation.
100 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
103 echo "prepared $(date)."
# Look up the device of $facet and start it with $opts, discarding stdout;
# on failure, report an error tagged with ($err).
110 local dev=$(facet_device $facet)
112 start $facet $dev $opts > /dev/null ||
113 error "($err) Fail to start $facet!"
# Check an MDS facet with run_e2fsck (passing "-n"): returns 0 immediately
# unless mds1 is ldiskfs; otherwise stops the facet, runs the check on its
# active host and device, and restarts it with the noscrub mount options.
116 run_e2fsck_on_mds_facet() {
117 [ $mds1_FSTYPE == ldiskfs ] || return 0
121 stop $mds > /dev/null || error "(0) Fail to the stop $mds"
122 local host=$(facet_active_host $mds)
123 local dev=$(facet_device $mds)
125 run_e2fsck $host $dev "-n" |
127 run_e2fsck $host $dev "-n"
128 error "(2) Detected inconsistency on $mds"
130 start_facet $mds "$MOUNT_OPTS_NOSCRUB" 3
# Query the $com LFSCK on ${FSNAME}-MDT0000 with -w and require that the
# value on the ${com}_mdts_${status} line equals $MDSCOUNT; otherwise dump
# the query output and fail with error ($err).
133 wait_all_targets_blocked() {
138 local count=$(do_facet mds1 \
139 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 -w |
140 awk '/^${com}_mdts_${status}/ { print \\\$2 }'")
141 [[ $count -eq $MDSCOUNT ]] || {
142 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
143 error "($err) only $count of $MDSCOUNT MDTs are in ${status}"
# Same check without -w, done through wait_update_facet with $LTIME as its
# last argument (presumably a polling timeout -- confirm in test-framework.sh).
152 wait_update_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 |
153 awk '/^${com}_mdts_${status}/ { print \\\$2 }'" \
154 "$MDSCOUNT" $LTIME || {
155 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
156 error "($err) some MDTs are not in ${status}"
# Test 0: drive the namespace LFSCK by hand. Start it with -r under
# failpoint 0x1600 (fail_val=3) and expect "scanning-phase1"; stop it and
# expect "stopped"; start it again and expect "scanning-phase1"; clear the
# failpoint and wait for "completed". Then updated_phase1 must be 0, and a
# further -r run must raise success_count by exactly one. Finally stopall
# must succeed.
163 #define OBD_FAIL_LFSCK_DELAY1 0x1600
164 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
165 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
167 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
169 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
170 [ "$STATUS" == "scanning-phase1" ] ||
171 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
173 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
175 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
176 [ "$STATUS" == "stopped" ] ||
177 error "(6) Expect 'stopped', but got '$STATUS'"
179 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
181 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
182 [ "$STATUS" == "scanning-phase1" ] ||
183 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the delay failpoint so that the scan can run to completion.
185 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
186 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
187 mdd.${MDT_DEV}.lfsck_namespace |
188 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
190 error "(9) unexpected status"
193 local repaired=$($SHOW_NAMESPACE |
194 awk '/^updated_phase1/ { print $2 }')
195 [ $repaired -eq 0 ] ||
196 error "(10) Expect nothing to be repaired, but got: $repaired"
# Record success_count, rerun with -r, and compare the counter afterwards.
198 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
199 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
200 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
201 mdd.${MDT_DEV}.lfsck_namespace |
202 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
204 error "(12) unexpected status"
207 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
208 [ $((scanned1 + 1)) -eq $scanned2 ] ||
209 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
211 echo "stopall, should NOT crash LU-3649"
212 stopall || error "(14) Fail to stopall"
214 run_test 0 "Control LFSCK manually"
# Test 1a: create $DIR/$tdir/dummy under failpoint 0x1501, run the
# namespace LFSCK with -r, and expect exactly one repair (dirent_repaired,
# or updated_phase1 when the former is empty). Then run e2fsck on the MDS,
# remount the client, and list the directory under failpoint 0x1505.
219 #define OBD_FAIL_FID_INDIR 0x1501
220 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
221 touch $DIR/$tdir/dummy
223 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
225 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
226 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
227 mdd.${MDT_DEV}.lfsck_namespace |
228 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
230 error "(4) unexpected status"
233 local repaired=$($SHOW_NAMESPACE |
234 awk '/^dirent_repaired/ { print $2 }')
235 # for interop with old server
236 [ -z "$repaired" ] &&
237 repaired=$($SHOW_NAMESPACE |
238 awk '/^updated_phase1/ { print $2 }')
240 [ $repaired -eq 1 ] ||
241 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
243 run_e2fsck_on_mds_facet $SINGLEMDS
245 mount_client $MOUNT || error "(6) Fail to start client!"
247 #define OBD_FAIL_FID_LOOKUP 0x1505
248 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
249 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
251 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
253 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
# Test 1b: skipped unless mds1 is ldiskfs. Create $DIR/$tdir/dummy under
# failpoint 0x1502, then run the namespace LFSCK with -r while failpoint
# 0x1506 is set and expect exactly one repair (dirent_repaired, or
# updated_phase1 when the former is empty). Afterwards run e2fsck on the
# MDS, remount the client, and stat the file under failpoint 0x1505.
257 [ "$mds1_FSTYPE" != ldiskfs ] &&
258 skip "OI Scrub not implemented for ZFS"
262 #define OBD_FAIL_FID_INLMA 0x1502
263 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
264 touch $DIR/$tdir/dummy
266 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
268 #define OBD_FAIL_FID_NOLMA 0x1506
269 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
270 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
271 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
272 mdd.${MDT_DEV}.lfsck_namespace |
273 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
275 error "(4) unexpected status"
278 local repaired=$($SHOW_NAMESPACE |
279 awk '/^dirent_repaired/ { print $2 }')
280 # for interop with old server
281 [ -z "$repaired" ] &&
282 repaired=$($SHOW_NAMESPACE |
283 awk '/^updated_phase1/ { print $2 }')
285 [ $repaired -eq 1 ] ||
286 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
288 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
289 run_e2fsck_on_mds_facet $SINGLEMDS
291 mount_client $MOUNT || error "(6) Fail to start client!"
293 #define OBD_FAIL_FID_LOOKUP 0x1505
294 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
295 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
297 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
299 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
# Test 1c: create $DIR/$tdir/dummy under failpoint 0x1504, run the
# namespace LFSCK with -r, and expect exactly one repair (dirent_repaired,
# or updated_phase1 when the former is empty). Then run e2fsck on the MDS,
# remount the client, and list the directory under failpoint 0x1505.
304 #define OBD_FAIL_FID_IGIF 0x1504
305 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
306 touch $DIR/$tdir/dummy
308 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
310 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
311 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
312 mdd.${MDT_DEV}.lfsck_namespace |
313 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
315 error "(4) unexpected status"
318 local repaired=$($SHOW_NAMESPACE |
319 awk '/^dirent_repaired/ { print $2 }')
320 # for interop with old server
321 [ -z "$repaired" ] &&
322 repaired=$($SHOW_NAMESPACE |
323 awk '/^updated_phase1/ { print $2 }')
325 [ $repaired -eq 1 ] ||
326 error "(5) Fail to repair lost FID-in-dirent: $repaired"
328 run_e2fsck_on_mds_facet $SINGLEMDS
330 mount_client $MOUNT || error "(6) Fail to start client!"
332 #define OBD_FAIL_FID_LOOKUP 0x1505
333 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
334 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
336 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
338 run_test 1c "LFSCK can find out and repair lost FID-in-dirent"
# Test 1d: needs MDS >= 2.13.57 and at least 2 MDTs. Create a file, a
# subdirectory and a directory made with "lfs mkdir -i 1" under $DIR/$tdir
# and print the directory FID and each child's linkEA. Then rewrite the
# trusted.lma xattr of $tdir on the mounted mds1 backing filesystem, run an
# OI scrub followed by the namespace LFSCK (-r -A), and require that every
# child's linkEA pfid equals the FID reported for $tdir.
341 [ $MDS1_VERSION -lt $(version_code 2.13.57) ] &&
342 skip "MDS older than 2.13.57"
343 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
347 touch $DIR/$tdir/$tfile
348 mkdir $DIR/$tdir/subdir
349 $LFS mkdir -i 1 $DIR/$tdir/remotedir
350 $LFS path2fid $DIR/$tdir
351 ll_decode_linkea $DIR/$tdir/$tfile
352 ll_decode_linkea $DIR/$tdir/subdir
353 ll_decode_linkea $DIR/$tdir/remotedir
355 local mntpt=$(facet_mntpt mds1)
357 # unlink OI files to remove the stale entry
358 local saved_opts=$MDS_MOUNT_OPTS
361 mount_fstype mds1 $mntpt
362 # increase $tdir FID oid in LMA
# The sed expression turns the "0" that precedes the last 8 characters of a
# line into "1" before the dump is fed back through setfattr --restore.
363 do_facet mds1 "getfattr -d -m trusted.lma -e hex \
364 --absolute-names $mntpt/ROOT/$tdir | \
365 sed -E 's/0(.{8})$/1\1/' | setfattr --restore=-"
366 unmount_fstype mds1 $mntpt
369 # the FID oid in LMA was increased above, and it's not in OI table,
370 # run scrub first to generate mapping in OI, so the following namespace
371 # check can fix linkea correctly, this is not necessary normally.
372 do_facet mds1 $LCTL lfsck_start -M ${MDT_DEV} -t scrub ||
373 error "failed to start LFSCK for scrub!"
374 wait_update_facet mds1 "$LCTL get_param -n \
375 osd-*.$(facet_svc mds1).oi_scrub |
376 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
377 error "unexpected status"
379 $START_NAMESPACE -r -A || error "fail to start LFSCK for namespace!"
380 wait_update_facet mds1 "$LCTL get_param -n \
381 mdd.${MDT_DEV}.lfsck_namespace |
382 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
384 error "unexpected status"
386 $LFS path2fid $DIR/$tdir
387 ll_decode_linkea $DIR/$tdir/$tfile
388 ll_decode_linkea $DIR/$tdir/subdir
389 ll_decode_linkea $DIR/$tdir/remotedir
# Compare the parent FID stored in each child's linkEA with the directory FID.
394 fid=$($LFS path2fid $DIR/$tdir)
395 for f in $tfile subdir remotedir; do
396 pfid=$(ll_decode_linkea $DIR/$tdir/$f |
397 awk '/pfid/ { print $3 }')
399 [ "$pfid" == "$fid" ] || error "$fid in LMA != $pfid in linkea"
402 run_test 1d "LFSCK can fix mismatch of FID in LMA and FID in child linkea"
# Test 2a: create $DIR/$tdir/dummy under failpoint 0x1603, run the
# namespace LFSCK with -r, and expect exactly one repair (linkea_repaired,
# or updated_phase2 when the former is empty). After e2fsck and a client
# remount, stat must show "Links: 1" and fid2path on the file's FID must
# return its path.
407 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
408 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
409 touch $DIR/$tdir/dummy
411 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
413 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
414 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
415 mdd.${MDT_DEV}.lfsck_namespace |
416 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
418 error "(4) unexpected status"
421 local repaired=$($SHOW_NAMESPACE |
422 awk '/^linkea_repaired/ { print $2 }')
423 # for interop with old server
424 [ -z "$repaired" ] &&
425 repaired=$($SHOW_NAMESPACE |
426 awk '/^updated_phase2/ { print $2 }')
428 [ $repaired -eq 1 ] ||
429 error "(5) Fail to repair crashed linkEA: $repaired"
431 run_e2fsck_on_mds_facet $SINGLEMDS
433 mount_client $MOUNT || error "(6) Fail to start client!"
435 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
436 error "(7) Fail to stat $DIR/$tdir/dummy"
438 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
439 local dummyname=$($LFS fid2path $DIR $dummyfid)
440 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
441 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
443 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
# Test 2b: create $DIR/$tdir/dummy under failpoint 0x1604, run the
# namespace LFSCK with -r, and expect updated_phase2 to be exactly 1. After
# e2fsck and a client remount, stat must show "Links: 1" and fid2path on
# the file's FID must return its path.
449 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
450 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
451 touch $DIR/$tdir/dummy
453 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
455 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
456 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
457 mdd.${MDT_DEV}.lfsck_namespace |
458 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
460 error "(4) unexpected status"
463 local repaired=$($SHOW_NAMESPACE |
464 awk '/^updated_phase2/ { print $2 }')
465 [ $repaired -eq 1 ] ||
466 error "(5) Fail to repair crashed linkEA: $repaired"
468 run_e2fsck_on_mds_facet $SINGLEMDS
470 mount_client $MOUNT || error "(6) Fail to start client!"
472 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
473 error "(7) Fail to stat $DIR/$tdir/dummy"
475 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
476 local dummyname=$($LFS fid2path $DIR $dummyfid)
477 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
478 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
480 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
# Test 2c: skipped unless the MDS version is above 2.4.90. Create
# $DIR/$tdir/dummy under failpoint 0x1605, run the namespace LFSCK with -r,
# and expect updated_phase2 to be exactly 1. After e2fsck and a client
# remount, stat must show "Links: 1" and fid2path on the file's FID must
# return its path.
484 (( $MDS1_VERSION > $(version_code 2.4.90) )) ||
485 skip "MDS older than 2.4.90"
489 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
490 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
491 touch $DIR/$tdir/dummy
493 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
495 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
496 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
497 mdd.${MDT_DEV}.lfsck_namespace |
498 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
500 error "(4) unexpected status"
503 local repaired=$($SHOW_NAMESPACE |
504 awk '/^updated_phase2/ { print $2 }')
505 [ $repaired -eq 1 ] ||
506 error "(5) Fail to repair crashed linkEA: $repaired"
508 run_e2fsck_on_mds_facet $SINGLEMDS
510 mount_client $MOUNT || error "(6) Fail to start client!"
512 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
513 error "(7) Fail to stat $DIR/$tdir/dummy"
515 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
516 local dummyname=$($LFS fid2path $DIR $dummyfid)
517 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
518 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
520 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
# Test 2d: skipped unless the MDS version is above 2.6.50. Create
# $DIR/$tdir/dummy under failpoint 0x161d, run the namespace LFSCK with -r,
# and expect linkea_repaired to be exactly 1. After e2fsck and a client
# remount, stat must show "Links: 1" and fid2path on the file's FID must
# return its path.
524 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
525 skip "MDS older than 2.6.50, LU-4788"
529 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
530 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
531 touch $DIR/$tdir/dummy
533 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
535 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
536 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
537 mdd.${MDT_DEV}.lfsck_namespace |
538 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
540 error "(4) unexpected status"
543 local repaired=$($SHOW_NAMESPACE |
544 awk '/^linkea_repaired/ { print $2 }')
545 [ $repaired -eq 1 ] ||
546 error "(5) Fail to repair crashed linkEA: $repaired"
548 run_e2fsck_on_mds_facet $SINGLEMDS
550 mount_client $MOUNT || error "(6) Fail to start client!"
552 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
553 error "(7) Fail to stat $DIR/$tdir/dummy"
555 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
556 local dummyname=$($LFS fid2path $DIR $dummyfid)
557 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
558 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
560 run_test 2d "LFSCK can recover the missing linkEA entry"
# Test 2e: needs at least 2 MDTs and an MDS version above 2.6.50. Create d0
# with "lfs mkdir -i 1", then d0/d1 with "lfs mkdir -i 0" under failpoint
# 0x1603. Run the namespace LFSCK with -r -A, require all MDTs to report
# "completed", and expect linkea_repaired to be exactly 1; fid2path on the
# FID of d0/d1 must return its path.
564 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
565 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
566 skip "MDS older than 2.6.50, LU-5511"
570 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
572 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
573 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
574 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
575 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
577 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
579 wait_all_targets_blocked namespace completed 4
581 local repaired=$($SHOW_NAMESPACE |
582 awk '/^linkea_repaired/ { print $2 }')
583 [ $repaired -eq 1 ] ||
584 error "(5) Fail to repair crashed linkEA: $repaired"
586 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
587 local name=$($LFS fid2path $DIR $fid)
588 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
589 error "(6) Fail to repair linkEA: $fid $name"
591 run_test 2e "namespace LFSCK can verify remote object linkEA"
# Test 3: skipped unless the MDS version is above 2.6.50. Hard-link d0/f0
# and d0/f1 into a new "dummy" directory, then create edir/f0 and edir/f1
# and hard-link them to edir/w0 (under failpoint 0x1603) and edir/w1 (under
# failpoint 0x1604). After a namespace LFSCK run with -r, checked_phase2
# must be at least 4 and multiple_linked_repaired at least 2.
595 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
596 skip "MDS older than 2.6.50, LU-4788"
600 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
601 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
602 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
604 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
605 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
606 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
608 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
609 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
610 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
612 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
613 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
614 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
616 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
618 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
619 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
620 mdd.${MDT_DEV}.lfsck_namespace |
621 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
623 error "(10) unexpected status"
626 local checked=$($SHOW_NAMESPACE |
627 awk '/^checked_phase2/ { print $2 }')
628 [ $checked -ge 4 ] ||
629 error "(11) Fail to check multiple-linked object: $checked"
631 local repaired=$($SHOW_NAMESPACE |
632 awk '/^multiple_linked_repaired/ { print $2 }')
633 [ $repaired -ge 2 ] ||
634 error "(12) Fail to repair multiple-linked object: $repaired"
636 run_test 3 "LFSCK can verify multiple-linked objects"
# Test 4: skipped unless mds1 is ldiskfs. Unmount the client, stop the MDS,
# run mds_backup_restore on it, and restart it with the noscrub options.
# Start the namespace LFSCK with -r under failpoint 0x1601 (fail_val=1),
# wait for flags "inconsistent" while status is "scanning-phase1", then
# clear the failpoint and wait for "completed". Flags must end up empty and
# at least 9 repairs must be reported (dirent_repaired, or updated_phase1
# when the former is empty). Finishes with e2fsck, a client remount, and a
# directory listing under failpoint 0x1505.
640 [ "$mds1_FSTYPE" != ldiskfs ] &&
641 skip "OI Scrub not implemented for ZFS"
644 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
645 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop $SINGLEMDS!"
647 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
648 echo "start $SINGLEMDS with disabling OI scrub"
649 start_facet $SINGLEMDS "$MOUNT_OPTS_NOSCRUB" 2
651 #define OBD_FAIL_LFSCK_DELAY2 0x1601
652 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
653 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
654 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
655 mdd.${MDT_DEV}.lfsck_namespace |
656 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
658 error "(5) unexpected status"
661 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
662 [ "$STATUS" == "scanning-phase1" ] ||
663 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
665 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
666 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
667 mdd.${MDT_DEV}.lfsck_namespace |
668 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
670 error "(7) unexpected status"
673 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
674 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
676 local repaired=$($SHOW_NAMESPACE |
677 awk '/^dirent_repaired/ { print $2 }')
678 # for interop with old server
679 [ -z "$repaired" ] &&
680 repaired=$($SHOW_NAMESPACE |
681 awk '/^updated_phase1/ { print $2 }')
683 [ $repaired -ge 9 ] ||
684 error "(9) Fail to re-generate FID-in-dirent: $repaired"
686 run_e2fsck_on_mds_facet $SINGLEMDS
688 mount_client $MOUNT || error "(10) Fail to start client!"
690 #define OBD_FAIL_FID_LOOKUP 0x1505
691 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
692 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
693 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
695 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
# Test 5: skipped unless mds1 is ldiskfs. Same flow as the backup/restore
# test, but mds_backup_restore gets an extra argument "1" and the flags
# awaited during phase 1 are "inconsistent,upgrade". After completion the
# flags must be empty and at least 2 repairs must be reported
# (dirent_repaired, or updated_phase1 when the former is empty). After
# e2fsck and a client remount, stat and ls run under failpoint 0x1505, and
# fid2path on the FID of "dummy" must return its path.
699 [ "$mds1_FSTYPE" != ldiskfs ] &&
700 skip "OI Scrub not implemented for ZFS"
703 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
704 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop $SINGLEMDS!"
706 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
707 echo "start $SINGLEMDS with disabling OI scrub"
708 start_facet $SINGLEMDS "$MOUNT_OPTS_NOSCRUB" 2
710 #define OBD_FAIL_LFSCK_DELAY2 0x1601
711 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
712 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
713 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
714 mdd.${MDT_DEV}.lfsck_namespace |
715 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
717 error "(5) unexpected status"
720 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
721 [ "$STATUS" == "scanning-phase1" ] ||
722 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
724 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
725 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
726 mdd.${MDT_DEV}.lfsck_namespace |
727 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
729 error "(7) unexpected status"
732 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
733 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
735 local repaired=$($SHOW_NAMESPACE |
736 awk '/^dirent_repaired/ { print $2 }')
737 # for interop with old server
738 [ -z "$repaired" ] &&
739 repaired=$($SHOW_NAMESPACE |
740 awk '/^updated_phase1/ { print $2 }')
742 [ $repaired -ge 2 ] ||
743 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
745 run_e2fsck_on_mds_facet $SINGLEMDS
747 mount_client $MOUNT || error "(10) Fail to start client!"
749 #define OBD_FAIL_FID_LOOKUP 0x1505
750 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
751 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
753 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
755 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
756 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
757 local dummyname=$($LFS fid2path $DIR $dummyfid)
758 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
759 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
761 run_test 5 "LFSCK can handle IGIF object upgrading"
# Test 6a: start the namespace LFSCK with -r under failpoint 0x1600
# (fail_val=1) and expect "scanning-phase1"; then set fail_loc=0x80001608
# and wait for status "failed". Read last_checkpoint_position as POS0,
# restart without -r under failpoint 0x1600, and require POS0 to be less
# than latest_start_position (POS1). Finally clear the failpoint and wait
# for "completed".
766 #define OBD_FAIL_LFSCK_DELAY1 0x1600
767 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
768 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
770 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
771 [ "$STATUS" == "scanning-phase1" ] ||
772 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
774 # Sleep 3 sec to guarantee at least one object processed by LFSCK
776 # Fail the LFSCK to guarantee there is at least one checkpoint
777 #define OBD_FAIL_LFSCK_FATAL1 0x1608
778 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
779 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
780 mdd.${MDT_DEV}.lfsck_namespace |
781 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
783 error "(4) unexpected status"
786 local POS0=$($SHOW_NAMESPACE |
787 awk '/^last_checkpoint_position/ { print $2 }' |
790 #define OBD_FAIL_LFSCK_DELAY1 0x1600
791 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
792 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
794 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
795 [ "$STATUS" == "scanning-phase1" ] ||
796 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
798 local POS1=$($SHOW_NAMESPACE |
799 awk '/^latest_start_position/ { print $2 }' |
801 [[ $POS0 -lt $POS1 ]] ||
802 error "(7) Expect larger than: $POS0, but got $POS1"
804 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
805 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
806 mdd.${MDT_DEV}.lfsck_namespace |
807 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
809 error "(8) unexpected status"
812 run_test 6a "LFSCK resumes from last checkpoint (1)"
# Test 6b: like 6a but with failpoint 0x1601 for the delay and
# fail_loc=0x80001609 to force status "failed". Both the second and fourth
# fields of last_checkpoint_position (O_POS0, D_POS0) and of
# latest_start_position (O_POS1, D_POS1) are read. When either D value is
# "N/A" or "0x0" the O positions are compared, otherwise the D positions;
# in both cases the restart position must be the larger one. Finally the
# failpoint is cleared and the scan must reach "completed".
817 #define OBD_FAIL_LFSCK_DELAY2 0x1601
818 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
819 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
821 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
822 [ "$STATUS" == "scanning-phase1" ] ||
823 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
825 # Sleep 5 sec to guarantee that we are in the directory scanning
827 # Fail the LFSCK to guarantee there is at least one checkpoint
828 #define OBD_FAIL_LFSCK_FATAL2 0x1609
829 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
830 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
831 mdd.${MDT_DEV}.lfsck_namespace |
832 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
834 error "(4) unexpected status"
837 local O_POS0=$($SHOW_NAMESPACE |
838 awk '/^last_checkpoint_position/ { print $2 }' |
841 local D_POS0=$($SHOW_NAMESPACE |
842 awk '/^last_checkpoint_position/ { print $4 }')
844 #define OBD_FAIL_LFSCK_DELAY2 0x1601
845 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
846 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
848 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
849 [ "$STATUS" == "scanning-phase1" ] ||
850 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
852 local O_POS1=$($SHOW_NAMESPACE |
853 awk '/^latest_start_position/ { print $2 }' |
855 local D_POS1=$($SHOW_NAMESPACE |
856 awk '/^latest_start_position/ { print $4 }')
858 echo "Additional debug for 6b"
860 if [ "$D_POS0" == "N/A" -o "$D_POS0" == "0x0" \
861 -o "$D_POS1" == "0x0" -o "$D_POS1" == "N/A" ]; then
862 [[ $O_POS0 -lt $O_POS1 ]] ||
863 error "(7.1) $O_POS1 is not larger than $O_POS0"
865 [[ $D_POS0 -lt $D_POS1 ]] ||
866 error "(7.2) $D_POS1 is not larger than $D_POS0"
869 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
870 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
871 mdd.${MDT_DEV}.lfsck_namespace |
872 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
874 error "(8) unexpected status"
877 run_test 6b "LFSCK resumes from last checkpoint (2)"
# Test 7a: start the namespace LFSCK with -r under failpoint 0x1601
# (fail_val=1) and expect "scanning-phase1". Stop the MDS while the scan is
# in that state, clear the failpoint, start the MDS again with the scrub
# mount options, and require the LFSCK status to reach "completed" without
# issuing another start command.
884 #define OBD_FAIL_LFSCK_DELAY2 0x1601
885 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
886 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
888 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
889 [ "$STATUS" == "scanning-phase1" ] ||
890 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
892 # Sleep 3 sec to guarantee at least one object processed by LFSCK
894 echo "stop $SINGLEMDS"
895 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop $SINGLEMDS!"
897 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
898 echo "start $SINGLEMDS"
899 start_facet $SINGLEMDS "$MOUNT_OPTS_SCRUB" 5
901 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
902 mdd.${MDT_DEV}.lfsck_namespace |
903 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
905 error "(6) unexpected status"
908 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
# Test 7b: touch dummy0..dummy19 under failpoint 0x1604, then start the
# namespace LFSCK with -r under failpoint 0x1602 (fail_val=1) and wait for
# "scanning-phase2". Stop the MDS in that state, clear the failpoint, start
# the MDS again with the scrub mount options, and require the LFSCK status
# to reach "completed" without issuing another start command.
914 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
915 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
916 for ((i = 0; i < 20; i++)); do
917 touch $DIR/$tdir/dummy${i}
920 #define OBD_FAIL_LFSCK_DELAY3 0x1602
921 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
922 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
923 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
924 mdd.${MDT_DEV}.lfsck_namespace |
925 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
927 error "(4) unexpected status"
931 echo "stop $SINGLEMDS"
932 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop $SINGLEMDS!"
934 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
935 echo "start $SINGLEMDS"
936 start_facet $SINGLEMDS "$MOUNT_OPTS_SCRUB" 6
938 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
939 mdd.${MDT_DEV}.lfsck_namespace |
940 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
942 error "(7) unexpected status"
945 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
956 formatall > /dev/null
962 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
963 [ "$STATUS" == "init" ] ||
964 namespace_error "(2) Expect 'init', but got '$STATUS'"
966 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
967 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
968 mkdir $DIR/$tdir/crashed
970 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
971 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
972 for ((i = 0; i < 5; i++)); do
973 touch $DIR/$tdir/dummy${i}
976 umount_client $MOUNT || error "(3) Fail to stop client!"
978 #define OBD_FAIL_LFSCK_DELAY2 0x1601
979 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
981 namespace_error "(4) Fail to start LFSCK for namespace!"
983 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
984 [ "$STATUS" == "scanning-phase1" ] ||
985 namespace_error "(5) Expect 'scanning-phase1', but got '$STATUS'"
987 $STOP_LFSCK || namespace_error "(6) Fail to stop LFSCK!"
989 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
990 [ "$STATUS" == "stopped" ] ||
991 namespace_error "(7) Expect 'stopped', but got '$STATUS'"
994 namespace_error "(8) Fail to start LFSCK for namespace!"
996 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
997 [ "$STATUS" == "scanning-phase1" ] ||
998 namespace_error "(9) Expect 'scanning-phase1', but got '$STATUS'"
1000 #define OBD_FAIL_LFSCK_FATAL2 0x1609
1001 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
1002 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1003 mdd.${MDT_DEV}.lfsck_namespace |
1004 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
1006 namespace_error "(10) unexpected status"
1009 #define OBD_FAIL_LFSCK_DELAY1 0x1600
1010 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
1012 namespace_error "(11) Fail to start LFSCK for namespace!"
1014 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1015 [ "$STATUS" == "scanning-phase1" ] ||
1016 namespace_error "(12) Expect 'scanning-phase1', but got '$STATUS'"
1018 #define OBD_FAIL_LFSCK_CRASH 0x160a
1019 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
1022 echo "stop $SINGLEMDS"
1023 stop $SINGLEMDS > /dev/null || namespace_error "(13) Fail to stop MDS!"
1025 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
1026 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
1028 echo "start $SINGLEMDS"
1029 start_facet $SINGLEMDS "$MOUNT_OPTS_SCRUB" 14
1031 local timeout=$(max_recovery_time)
1034 while [ $timer -lt $timeout ]; do
1035 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1036 mdt.${MDT_DEV}.recovery_status |
1037 awk '/^status/ { print \\\$2 }'")
1038 [ "$STATUS" != "RECOVERING" ] && break;
1040 timer=$((timer + 1))
1043 [ $timer != $timeout ] || (
1044 do_facet $SINGLEMDS "$LCTL get_param -n \
1045 mdt.${MDT_DEV}.recovery_status"
1046 error "(14.1) recovery timeout"
1049 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1050 [ "$STATUS" == "crashed" ] ||
1051 namespace_error "(15) Expect 'crashed', but got '$STATUS'"
1053 #define OBD_FAIL_LFSCK_DELAY2 0x1601
1054 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
1056 namespace_error "(16) Fail to start LFSCK for namespace!"
1058 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1059 [ "$STATUS" == "scanning-phase1" ] ||
1060 namespace_error "(17) Expect 'scanning-phase1', but got '$STATUS'"
1062 echo "stop $SINGLEMDS"
1063 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop $SINGLEMDS!"
1065 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
1066 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
1068 echo "start $SINGLEMDS"
1069 start_facet $SINGLEMDS "$MOUNT_OPTS_SCRUB" 19
1072 while [ $timer -lt $timeout ]; do
1073 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1074 mdt.${MDT_DEV}.recovery_status |
1075 awk '/^status/ { print \\\$2 }'")
1076 [ "$STATUS" != "RECOVERING" ] && break;
1078 timer=$((timer + 1))
1081 [ $timer != $timeout ] || (
1082 do_facet $SINGLEMDS "$LCTL get_param -n \
1083 mdt.${MDT_DEV}.recovery_status"
1084 error "(19.1) recovery timeout"
1087 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1088 [ "$STATUS" == "paused" ] ||
1089 namespace_error "(20) Expect 'paused', but got '$STATUS'"
1091 echo "stop $SINGLEMDS"
1092 stop $SINGLEMDS > /dev/null || error "(20.1) Fail to stop MDS!"
1094 echo "start $SINGLEMDS without resume LFSCK"
1095 start_facet $SINGLEMDS "$MOUNT_OPTS_SKIP_LFSCK" 20.2
1098 while [ $timer -lt $timeout ]; do
1099 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1100 mdt.${MDT_DEV}.recovery_status |
1101 awk '/^status/ { print \\\$2 }'")
1102 [ "$STATUS" != "RECOVERING" ] && break;
1104 timer=$((timer + 1))
1107 [ $timer != $timeout ] || (
1108 do_facet $SINGLEMDS "$LCTL get_param -n \
1109 mdt.${MDT_DEV}.recovery_status"
1110 error "(20.3) recovery timeout"
1113 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1114 [ "$STATUS" == "paused" ] ||
1115 namespace_error "(20.4) Expect 'paused', but got '$STATUS'"
1117 #define OBD_FAIL_LFSCK_DELAY3 0x1602
1118 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
1121 namespace_error "(21) Fail to start LFSCK for namespace!"
1122 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1123 mdd.${MDT_DEV}.lfsck_namespace |
1124 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
1126 namespace_error "(22) unexpected status"
1129 # wait to process one inode at least (OBD_FAIL_LFSCK_DELAY3)
1132 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1133 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
1134 namespace_error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
1136 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1137 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1138 mdd.${MDT_DEV}.lfsck_namespace |
1139 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1141 namespace_error "(24) unexpected status"
1144 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1146 namespace_error "(25) Expect empty flags, but got '$FLAGS'"
1148 run_test 8 "LFSCK state machine"
# test 9a: start layout LFSCK with a speed limit, sample the reported
# average_speed_phase1 and check it against bounds computed from the
# limit(s) in force, then lift the limit and wait for completion.
# All bounds use integer shell arithmetic, hence margins such as 13 / 10.
# Skip when /proc/cpuinfo has no "processor ... : 1" line.
1151 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1152 skip "Testing on UP system, the speed may be inaccurate."
# Create 5000 files under a directory made with "lfs mkdir -i 0".
1156 check_mount_and_prep
1157 $LFS mkdir -i 0 $DIR/$tdir/lfsck || error "(1) Fail to mkdir lfsck"
1158 $LFS setstripe -c 1 -i -1 $DIR/$tdir/lfsck
1159 createmany -o $DIR/$tdir/lfsck/f 5000
# First speed limit, handed to lfsck_start through -s below.
1161 local BASE_SPEED1=100
1163 $START_LAYOUT -r -s $BASE_SPEED1 || error "(2) Fail to start LFSCK!"
# The scan is expected to still be in phase 1 when it is sampled.
1166 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
1167 [ "$STATUS" == "scanning-phase1" ] ||
1168 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
1170 local SPEED=$($SHOW_LAYOUT |
1171 awk '/^average_speed_phase1/ { print $2 }')
1173 # There may be time error, normally it should be less than 2 seconds.
1174 # We allow another 20% schedule error.
1176 # MAX_MARGIN = 1.3 = 13 / 10
1177 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) /
1178 RUN_TIME1 * 13 / 10))
1179 [ $SPEED -lt $MAX_SPEED ] || {
1181 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1182 error "(4) Speed $SPEED, expected < $MAX_SPEED"
1185 # adjust speed limit
1186 local BASE_SPEED2=300
1188 do_facet $SINGLEMDS \
1189 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
# Sample the average speed again now that the second limit applies; the
# expected range is a time-weighted mix of both limits.
1192 SPEED=$($SHOW_LAYOUT | awk '/^average_speed_phase1/ { print $2 }')
1193 # MIN_MARGIN = 0.7 = 7 / 10
1194 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) +
1195 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) /
1196 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
1197 [ $SPEED -gt $MIN_SPEED ] || {
# On a non-ldiskfs MDT the shortfall is reported via error_ignore (LU-5624).
1198 if [ $mds1_FSTYPE != ldiskfs ]; then
1199 error_ignore LU-5624 \
1200 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
1203 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
1207 # MAX_MARGIN = 1.3 = 13 / 10
1208 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) +
1209 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) /
1210 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1211 [ $SPEED -lt $MAX_SPEED ] || {
1213 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1214 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1215 error "(6) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit to 0 on all MDT and OST nodes, then wait for the
# layout LFSCK status on $SINGLEMDS to read "completed".
1218 do_nodes $(comma_list $(mdts_nodes)) \
1219 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1220 do_nodes $(comma_list $(osts_nodes)) \
1221 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1223 wait_update_facet $SINGLEMDS \
1224 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout |
1225 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
1226 error "(7) Failed to get expected 'completed'"
1228 run_test 9a "LFSCK speed control (1)"
# test 9b: speed-limit checks for namespace LFSCK. The scan is first run
# to a "stopped" state with double scan disabled, then restarted with a
# speed limit so that average_speed_phase2 can be checked against bounds
# computed from the limit(s) in force.
# Skip when /proc/cpuinfo has no "processor ... : 1" line.
1231 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1232 skip "Testing on UP system, the speed may be inaccurate."
# Create 50 directories, 50 files, and 50 files inside each directory
# while fail_loc 0x1604 (OBD_FAIL_LFSCK_LINKEA_MORE) is set on the MDS.
1238 echo "Preparing another 50 * 50 files (with error) at $(date)."
1239 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1240 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1241 createmany -d $DIR/$tdir/d 50
1242 createmany -m $DIR/$tdir/f 50
1243 for ((i = 0; i < 50; i++)); do
1244 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
# With fail_loc 0x160c (OBD_FAIL_LFSCK_NO_DOUBLESCAN) set, the first run
# is expected to end in status "stopped" rather than "completed".
1247 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
1248 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1249 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
1250 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1251 mdd.${MDT_DEV}.lfsck_namespace |
1252 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1254 error "(5) unexpected status"
1257 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1258 echo "Prepared at $(date)."
# First speed limit; the restart below omits -r and is expected to be in
# scanning-phase2 when sampled.
1260 local BASE_SPEED1=50
1262 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
1265 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1266 [ "$STATUS" == "scanning-phase2" ] ||
1267 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
1269 local SPEED=$($SHOW_NAMESPACE |
1270 awk '/^average_speed_phase2/ { print $2 }')
1271 # There may be time error, normally it should be less than 2 seconds.
1272 # We allow another 20% schedule error.
1274 # MAX_MARGIN = 1.3 = 13 / 10
1275 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) /
1276 RUN_TIME1 * 13 / 10))
1277 [ $SPEED -lt $MAX_SPEED ] || {
1279 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1280 error "(8) Speed $SPEED, expected < $MAX_SPEED"
1283 # adjust speed limit
1284 local BASE_SPEED2=150
1286 do_facet $SINGLEMDS \
1287 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
# Sample again under the second limit; the expected range is a
# time-weighted mix of both limits.
1290 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
1291 # MIN_MARGIN = 0.7 = 7 / 10
1292 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) +
1293 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) /
1294 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
1295 [ $SPEED -gt $MIN_SPEED ] || {
# On a non-ldiskfs MDT the shortfall is reported via error_ignore (LU-5624).
1296 if [ $mds1_FSTYPE != ldiskfs ]; then
1297 error_ignore LU-5624 \
1298 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1301 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
1305 # MAX_MARGIN = 1.3 = 13 / 10
1306 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) +
1307 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) /
1308 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1309 [ $SPEED -lt $MAX_SPEED ] || {
1311 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1312 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1313 error "(10) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit to 0 on all MDT and OST nodes, then wait for the
# namespace LFSCK status on $SINGLEMDS to read "completed".
1316 do_nodes $(comma_list $(mdts_nodes)) \
1317 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1318 do_nodes $(comma_list $(osts_nodes)) \
1319 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1320 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1321 mdd.${MDT_DEV}.lfsck_namespace |
1322 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1324 error "(11) unexpected status"
1327 run_test 9b "LFSCK speed control (2)"
# test 10: run ordinary client operations (ls, touch, mkdir, unlink, rm,
# mv, hard link, symlink) while a speed-limited namespace LFSCK is in
# scanning-phase1, and verify each one succeeds.
# Only runs when the MDT backend is ldiskfs.
1331 [[ $mds1_FSTYPE == ldiskfs ]] || skip "lookup(..)/linkea on ZFS issue"
# Even-numbered d<i>/f<i> entries are created with fail_loc 0x1603 set,
# odd-numbered ones with fail_loc 0x1604 set.
1335 echo "Preparing more files with error at $(date)."
1336 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1337 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
1339 for ((i = 0; i < 1000; i = $((i+2)))); do
1340 mkdir -p $DIR/$tdir/d${i}
1341 touch $DIR/$tdir/f${i}
1342 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1345 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1346 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1348 for ((i = 1; i < 1000; i = $((i+2)))); do
1349 mkdir -p $DIR/$tdir/d${i}
1350 touch $DIR/$tdir/f${i}
1351 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1354 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1355 echo "Prepared at $(date)."
# Add a second name for f200 inside d200.
1357 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
# Remount the client before starting the scan.
1359 umount_client $MOUNT
1360 mount_client $MOUNT || error "(3) Fail to start client!"
# Start the scan with a speed limit of 100 (-s 100).
1362 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1365 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1366 [ "$STATUS" == "scanning-phase1" ] ||
1367 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Client-side operations issued while the scan is running.
1369 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1371 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1373 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1375 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1377 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1379 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1381 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1383 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1384 error "(14) Fail to softlink!"
# The scan must still be in phase 1 after all of the operations above.
1386 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1387 [ "$STATUS" == "scanning-phase1" ] ||
1388 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
# Set lfsck_speed_limit to 0 on all MDT and OST nodes, then wait for the
# namespace LFSCK status on $SINGLEMDS to read "completed".
1390 do_nodes $(comma_list $(mdts_nodes)) \
1391 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1392 do_nodes $(comma_list $(osts_nodes)) \
1393 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1394 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1395 mdd.${MDT_DEV}.lfsck_namespace |
1396 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1398 error "(16) unexpected status"
1401 run_test 10 "System is available during LFSCK scanning"
# Remove O/<idx>/LAST_ID and O/<idx>/d0/0 from the backend filesystem of
# ost<ost>, using a local mount of that OST.
# Uses $ost and $idx; test 11a calls this as "ost_remove_lastid 1 0", so
# they are presumably the two positional arguments - TODO confirm.
# Returns 1 if the local mount fails and 2 if the unmount fails.
1404 ost_remove_lastid() {
# Prefix that runs a command on the node serving ost<ost>.
1407 local rcmd="do_facet ost${ost}"
1409 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1411 # step 1: local mount
1412 mount_fstype ost${ost} || return 1
1413 # step 2: remove the specified LAST_ID
1414 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
1416 unmount_fstype ost${ost} || return 2
# test 11a: delete LAST_ID on ost1, then run layout LFSCK on that OST and
# verify the "crashed_lastid" flag is raised during the scan and cleared
# once the scan has completed.
# Requires an MDS version newer than 2.5.55.
1420 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1421 skip "MDS older than 2.5.55, LU-1267"
# Create 64 single-stripe files placed on OST index 0.
1423 check_mount_and_prep
1424 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1425 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
1430 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
# Bring ost1 back using the "noscrub" mount options.
1432 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1433 error "(2) Fail to start ost1"
# fail_loc 0x160e with fail_val=3 is set while the flag is polled below.
1435 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1436 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1438 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1439 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
# Wait until the flags field of lfsck_layout on ost1 reads "crashed_lastid".
1441 wait_update_facet ost1 "$LCTL get_param -n \
1442 obdfilter.${OST_DEV}.lfsck_layout |
1443 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1445 error "(5) unexpected status"
# Clear the failure injection and wait for the scan to complete.
1448 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1450 wait_update_facet ost1 "$LCTL get_param -n \
1451 obdfilter.${OST_DEV}.lfsck_layout |
1452 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1454 error "(6) unexpected status"
# After completion the flags field must be empty again.
1457 echo "the LAST_ID(s) should have been rebuilt"
1458 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1459 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
1461 run_test 11a "LFSCK can rebuild lost last_id"
# test 11b: make the on-disk LAST_ID of ost1 lag behind the last id used
# by the MDS, then run layout LFSCK on the OST and verify that after a
# restart the OST reports last_id >= the id the MDS had used.
# Requires an MDS version newer than 2.5.55.
1464 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1465 skip "MDS older than 2.5.55, LU-1267"
1467 check_mount_and_prep
1468 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1470 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1471 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1472 do_facet ost1 $LCTL set_param fail_loc=0x160d
# Create 32 files more than the value returned by precreated_ost_obj_count.
1474 local count=$(precreated_ost_obj_count 0 0)
1476 createmany -o $DIR/$tdir/f $((count + 32))
# Record the sequence and last id that mds1 reports for OST0000.
1478 local proc_path="${FSNAME}-OST0000-osc-MDT0000"
1479 local seq=$(do_facet mds1 $LCTL get_param -n \
1480 osp.${proc_path}.prealloc_last_seq)
1481 local id_used=$(do_facet mds1 $LCTL get_param -n \
1482 osp.${proc_path}.prealloc_last_id)
# Restart ost1 with fail_loc 0x215 (OBD_FAIL_OST_ENOSPC) set.
1484 umount_client $MOUNT
1485 stop ost1 || error "(1) Fail to stop ost1"
1487 #define OBD_FAIL_OST_ENOSPC 0x215
1488 do_facet ost1 $LCTL set_param fail_loc=0x215
1490 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1491 error "(2) Fail to start ost1"
# Retry up to 60 times until ost1 reports a last_id entry matching $seq;
# the value after the ':' separator is kept in id_ost1.
1493 for ((i = 0; i < 60; i++)); do
1494 id_ost1=$(do_facet ost1 \
1495 "$LCTL get_param -n obdfilter.$ost1_svc.last_id" |
1496 awk -F: "/$seq/ { print \$2 }")
1497 [ -n "$id_ost1" ] && break
1501 echo "the on-disk LAST_ID should be smaller than the expected one"
1502 [ $id_used -gt $id_ost1 ] ||
1503 error "(4) expect id_used '$id_used' > id_ost1 '$id_ost1'"
1505 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1506 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1508 wait_update_facet ost1 \
1509 "$LCTL get_param -n obdfilter.$ost1_svc.lfsck_layout |
1510 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1512 error "(6) unexpected status"
# Restart ost1 so that last_id is reported again after a fresh start.
1515 stop ost1 || error "(7) Fail to stop ost1"
1517 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1518 error "(8) Fail to start ost1"
1520 echo "the on-disk LAST_ID should have been rebuilt"
1521 # last_id may be larger than $id_used if objects were created/skipped
1522 wait_update_facet_cond ost1 \
1523 "$LCTL get_param -n obdfilter.$ost1_svc.last_id |
1524 awk -F: '/$seq/ { print \\\$2 }'" "-ge" "$id_used" 60 || {
1525 do_facet ost1 $LCTL get_param obdfilter.$ost1_svc.last_id
1526 error "(9) expect last_id >= id_used $seq:$id_used"
# Clear the failure injection and stop all services.
1529 do_facet ost1 $LCTL set_param fail_loc=0
1530 stopall || error "(10) Fail to stopall"
1532 run_test 11b "LFSCK can rebuild crashed last_id"
# test 12a: drive LFSCK on all targets from a single lctl command on mds1
# (-A), first for the namespace component and then for layout: start with
# "-s 1", stop, and restart with "-s 0", checking the status on every
# target after each step.
# NOTE(review): the trailing number given to wait_all_targets and
# wait_all_targets_blocked looks like a step id for failure messages;
# confirm against the helper definitions.
# Requires at least 2 MDTs and an MDS version newer than 2.5.55.
1535 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
1536 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1537 skip "MDS older than 2.5.55, LU-3950"
# Create one directory per MDT index, with 100 files in each.
1539 check_mount_and_prep
1540 for k in $(seq $MDSCOUNT); do
1541 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1542 createmany -o $DIR/$tdir/${k}/f 100 ||
1543 error "(0) Fail to create 100 files."
1546 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1547 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1548 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1550 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1551 wait_all_targets namespace scanning-phase1 3
1553 echo "Stop namespace LFSCK on all targets by single lctl command."
1554 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1555 error "(4) Fail to stop LFSCK on all devices!"
1557 echo "All the LFSCK targets should be in 'stopped' status."
1558 wait_all_targets_blocked namespace stopped 5
1560 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1561 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1562 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1564 echo "All the LFSCK targets should be in 'completed' status."
1565 wait_all_targets_blocked namespace completed 7
# Full debug logging is enabled for the layout half of the test only.
1567 start_full_debug_logging
1569 echo "Start layout LFSCK on all targets by single command (-s 1)."
1570 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1571 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1573 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1574 wait_all_targets layout scanning-phase1 9
1576 echo "Stop layout LFSCK on all targets by single lctl command."
1577 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1578 error "(10) Fail to stop LFSCK on all devices!"
1580 echo "All the LFSCK targets should be in 'stopped' status."
1581 wait_all_targets_blocked layout stopped 11
# Additionally read the layout status directly from every OST.
1583 for k in $(seq $OSTCOUNT); do
1584 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1585 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1586 awk '/^status/ { print $2 }')
1587 [ "$STATUS" == "stopped" ] ||
1588 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1591 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1592 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1593 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1595 echo "All the LFSCK targets should be in 'completed' status."
1596 wait_all_targets_blocked layout completed 14
1598 stop_full_debug_logging
1600 run_test 12a "single command to trigger LFSCK on all devices"
# test 12b: start LFSCK without naming a device (-M). With -A it must
# succeed and complete for both namespace and layout; without -M and -A
# on a node that hosts several MDT devices it must fail.
# Requires an MDS version newer than 2.5.55.
1603 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1604 skip "MDS older than 2.5.55, LU-3950"
1606 check_mount_and_prep
1608 echo "Start LFSCK without '-M' specified."
1609 do_facet mds1 $LCTL lfsck_start -A -r ||
1610 error "(0) Fail to start LFSCK without '-M'"
1612 wait_all_targets_blocked namespace completed 1
1613 wait_all_targets_blocked layout completed 2
# Count the "lctl dl" entries on mds1 whose third column contains "mdt".
1615 local count=$(do_facet mds1 $LCTL dl |
1616 awk '{ print $3 }' | grep mdt | wc -l)
1617 if [ $count -gt 1 ]; then
1619 echo "Start layout LFSCK on the node with multipe targets,"
1620 echo "but not specify '-M'/'-A' option. Should get failure."
# Success of lfsck_start is the error case here; "|| true" keeps the
# expected failure from propagating a non-zero status.
1622 do_facet mds1 $LCTL lfsck_start -t layout -r &&
1623 error "(3) Start layout LFSCK should fail" || true
1626 run_test 12b "auto detect Lustre device"
# test 13: create files while fail_loc 0x160f (OBD_FAIL_LFSCK_BAD_LMMOI)
# is set on the MDS, run layout LFSCK, and verify that the
# "repaired_others" counter reports exactly 2 repairs.
# Requires an MDS version newer than 2.5.55.
1629 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1630 skip "MDS older than 2.5.55, LU-3593"
1633 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1634 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1635 echo "MDT-object FID."
1638 check_mount_and_prep
# Two files are created under the injected failure: one through
# createmany and the composite-layout (-E ... -E -1) file f1.
1640 echo "Inject failure stub to simulate bad lmm_oi"
1641 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1642 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
1643 createmany -o $DIR/$tdir/f 1
1644 $LFS setstripe -E 1M -S 1M -E -1 $DIR/$tdir/f1 ||
1645 error "(0) Fail to create PFL $DIR/$tdir/f1"
1646 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1648 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1649 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1651 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1652 mdd.${MDT_DEV}.lfsck_layout |
1653 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1655 error "(2) unexpected status"
# Read the repair counter from the layout LFSCK report.
1658 local repaired=$($SHOW_LAYOUT |
1659 awk '/^repaired_others/ { print $2 }')
1660 [ $repaired -eq 2 ] ||
1661 error "(3) Fail to repair crashed lmm_oi: $repaired"
1663 run_test 13 "LFSCK can repair crashed lmm_oi"
# test 14a: create files while fail_loc 0x1610 (OBD_FAIL_LFSCK_DANGLING)
# is set on ost1, then run layout LFSCK twice: the first run (-r) must
# count the dangling references without making the guard files usable,
# the second run (-r -c) must leave guard0/guard1 stat-able with size 0.
# Requires an MDS version newer than 2.5.55.
1666 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1667 skip "MDS older than 2.5.55, LU-3590"
1670 echo "The OST-object referenced by the MDT-object should be there;"
1671 echo "otherwise, the LFSCK should re-create the missing OST-object."
1672 echo "without '--delay-create-ostobj' option."
1675 check_mount_and_prep
1676 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1678 echo "Inject failure stub to simulate dangling referenced MDT-object"
1679 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1680 do_facet ost1 $LCTL set_param fail_loc=0x1610
# Create 16 files more than the value returned by
# precreated_ost_obj_count, then guard0, then 16 composite-layout files
# and guard1, all while the failure is injected.
1681 local count=$(precreated_ost_obj_count 0 0)
1683 createmany -o $DIR/$tdir/f $((count + 16)) ||
1684 error "(0.1) Fail to create $DIR/$tdir/fx"
1685 touch $DIR/$tdir/guard0
1687 for ((i = 0; i < 16; i++)); do
1688 $LFS setstripe -E 512K -S 256K -o 0 -E 2M \
1689 $DIR/$tdir/f_comp${i} ||
1690 error "(0.2) Fail to create $DIR/$tdir/f_comp${i}"
1692 touch $DIR/$tdir/guard1
1694 do_facet ost1 $LCTL set_param fail_loc=0
1696 start_full_debug_logging
1698 # exhaust other pre-created dangling cases
1699 count=$(precreated_ost_obj_count 0 0)
1700 createmany -o $DIR/$tdir/a $count ||
1701 error "(0.5) Fail to create $count files."
1703 echo "'ls' should fail because of dangling referenced MDT-object"
1704 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
1706 echo "Trigger layout LFSCK to find out dangling reference"
1707 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1709 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1710 mdd.${MDT_DEV}.lfsck_layout |
1711 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1713 error "(3) unexpected status"
# At least 32 dangling references must be counted by the first run.
1716 local repaired=$($SHOW_LAYOUT |
1717 awk '/^repaired_dangling/ { print $2 }')
1718 [ $repaired -ge 32 ] ||
1719 error "(4) Fail to repair dangling reference: $repaired"
1721 echo "'stat' should fail because of not repair dangling by default"
1722 stat $DIR/$tdir/guard0 > /dev/null 2>&1 &&
1723 error "(5.1) stat should fail"
1724 stat $DIR/$tdir/guard1 > /dev/null 2>&1 &&
1725 error "(5.2) stat should fail"
# Second run adds -c to the lfsck_start options.
1727 echo "Trigger layout LFSCK to repair dangling reference"
1728 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1730 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1731 mdd.${MDT_DEV}.lfsck_layout |
1732 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1734 error "(7) unexpected status"
1737 # There may be some async LFSCK updates in processing, wait for
1738 # a while until the target reparation has been done. LU-4970.
1740 echo "'stat' should success after layout LFSCK repairing"
1741 wait_update_facet client "stat $DIR/$tdir/guard0 |
1742 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1743 stat $DIR/$tdir/guard0
1745 error "(8.1) unexpected size"
1748 wait_update_facet client "stat $DIR/$tdir/guard1 |
1749 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1750 stat $DIR/$tdir/guard1
1752 error "(8.2) unexpected size"
1755 repaired=$($SHOW_LAYOUT |
1756 awk '/^repaired_dangling/ { print $2 }')
1757 [ $repaired -ge 32 ] ||
1758 error "(9) Fail to repair dangling reference: $repaired"
1760 stop_full_debug_logging
1762 echo "stopall to cleanup object cache"
1765 setupall > /dev/null
1767 run_test 14a "LFSCK can repair MDT-object with dangling LOV EA reference (1)"
# test 14b: dangling-reference scenario driven with the extra lfsck_start
# options "-o -d". The first run (-r -o -d) must count the dangling
# references without making the guard file usable; the second run
# (-r -o -c -d) must leave the guard file stat-able with size 0.
# Requires an MDS version newer than 2.5.55.
1770 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1771 skip "MDS older than 2.5.55, LU-3590"
1774 echo "The OST-object referenced by the MDT-object should be there;"
1775 echo "otherwise, the LFSCK should re-create the missing OST-object."
1776 echo "with '--delay-create-ostobj' option."
1779 check_mount_and_prep
1780 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1782 echo "Inject failure stub to simulate dangling referenced MDT-object"
1783 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1784 do_facet ost1 $LCTL set_param fail_loc=0x1610
# Create 31 files more than the value returned by
# precreated_ost_obj_count, plus the guard file, under the failure.
1785 local count=$(precreated_ost_obj_count 0 0)
1787 createmany -o $DIR/$tdir/f $((count + 31))
1788 touch $DIR/$tdir/guard
1789 do_facet ost1 $LCTL set_param fail_loc=0
1791 start_full_debug_logging
1793 # exhaust other pre-created dangling cases
1794 count=$(precreated_ost_obj_count 0 0)
1795 createmany -o $DIR/$tdir/a $count ||
1796 error "(0) Fail to create $count files."
1798 echo "'ls' should fail because of dangling referenced MDT-object"
1799 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
1801 echo "Trigger layout LFSCK to find out dangling reference"
1802 $START_LAYOUT -r -o -d || error "(2) Fail to start LFSCK for layout!"
1804 wait_all_targets_blocked layout completed 3
# At least 32 dangling references must be counted by the first run.
1806 local repaired=$($SHOW_LAYOUT |
1807 awk '/^repaired_dangling/ { print $2 }')
1808 [ $repaired -ge 32 ] ||
1809 error "(4) Fail to repair dangling reference: $repaired"
1811 echo "'stat' should fail because of not repair dangling by default"
1812 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
# Second run adds -c to the lfsck_start options.
1814 echo "Trigger layout LFSCK to repair dangling reference"
1815 $START_LAYOUT -r -o -c -d || error "(6) Fail to start LFSCK for layout!"
1817 wait_all_targets_blocked layout completed 7
1819 # There may be some async LFSCK updates in processing, wait for
1820 # a while until the target reparation has been done. LU-4970.
1822 echo "'stat' should success after layout LFSCK repairing"
1823 wait_update_facet client "stat $DIR/$tdir/guard |
1824 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1825 stat $DIR/$tdir/guard
1827 error "(8) unexpected size"
1830 repaired=$($SHOW_LAYOUT |
1831 awk '/^repaired_dangling/ { print $2 }')
1832 [ $repaired -ge 32 ] ||
1833 error "(9) Fail to repair dangling reference: $repaired"
1835 stop_full_debug_logging
1837 echo "stopall to cleanup object cache"
1840 setupall > /dev/null
1842 run_test 14b "LFSCK can repair MDT-object with dangling LOV EA reference (2)"
# test 15a: write files while fail_loc 0x1611
# (OBD_FAIL_LFSCK_UNMATCHED_PAIR1) is set on all OST nodes, run layout
# LFSCK, and verify that "repaired_unmatched_pair" reports at least 3.
# Requires an MDS version newer than 2.5.55.
1845 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1846 skip "MDS older than 2.5.55, LU-3591"
1849 echo "If the OST-object referenced by the MDT-object back points"
1850 echo "to some non-exist MDT-object, then the LFSCK should repair"
1851 echo "the OST-object to back point to the right MDT-object."
1854 check_mount_and_prep
1855 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1857 echo "Inject failure stub to make the OST-object to back point to"
1858 echo "non-exist MDT-object."
1859 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
1861 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
1862 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1863 $LFS setstripe -E 1M -S 256K -c 1 -E -1 -S 512K -c $OSTCOUNT \
1865 error "(0) Fail to create PFL $DIR/$tdir/f1"
1866 # 'dd' will trigger punch RPC firstly on every OST-objects.
1867 # So even though some OST-object will not be write by 'dd',
1868 # as long as it is allocated (may be NOT allocated in pfl_3b)
1869 # its layout information will be set also.
1870 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4K count=257
1871 cancel_lru_locks osc
1872 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1874 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1875 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1877 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1878 mdd.${MDT_DEV}.lfsck_layout |
1879 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1881 error "(2) unexpected status"
# Read the repair counter from the layout LFSCK report.
1884 local repaired=$($SHOW_LAYOUT |
1885 awk '/^repaired_unmatched_pair/ { print $2 }')
1886 [ $repaired -ge 3 ] ||
1887 error "(3) Fail to repair unmatched pair: $repaired"
1889 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# test 15b: write files while fail_loc 0x1612
# (OBD_FAIL_LFSCK_UNMATCHED_PAIR2) is set on all OST nodes, run layout
# LFSCK, and verify that "repaired_unmatched_pair" reports exactly 4.
# Requires an MDS version newer than 2.5.55.
1892 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1893 skip "MDS older than 2.5.55, LU-3591"
1896 echo "If the OST-object referenced by the MDT-object back points"
1897 echo "to other MDT-object that doesn't recognize the OST-object,"
1898 echo "then the LFSCK should repair it to back point to the right"
1899 echo "MDT-object (the first one)."
# The guard file is written before any failure is injected.
1902 check_mount_and_prep
1903 mkdir -p $DIR/$tdir/0
1904 $LFS setstripe -c 1 -i 0 $DIR/$tdir/0
1905 dd if=/dev/zero of=$DIR/$tdir/0/guard bs=1M count=1
1906 cancel_lru_locks osc
1908 echo "Inject failure stub to make the OST-object to back point to"
1909 echo "other MDT-object"
# Use a stripe count of 2 for the first component when >= 2 OSTs exist.
1912 [ $OSTCOUNT -ge 2 ] && stripes=2
1914 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1915 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1612
1916 dd if=/dev/zero of=$DIR/$tdir/0/f0 bs=1M count=1
1917 $LFS setstripe -E 1M -S 256K -c $stripes -E 2M -S 512K -c 1 \
1919 error "(0) Fail to create PFL $DIR/$tdir/f1"
1920 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2
1921 cancel_lru_locks osc
1922 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1924 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1925 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1927 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1928 mdd.${MDT_DEV}.lfsck_layout |
1929 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1931 error "(2) unexpected status"
# Read the repair counter from the layout LFSCK report.
1934 local repaired=$($SHOW_LAYOUT |
1935 awk '/^repaired_unmatched_pair/ { print $2 }')
1936 [ $repaired -eq 4 ] ||
1937 error "(3) Fail to repair unmatched pair: $repaired"
1939 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# test 15c: race layout LFSCK against a delayed directory migration and
# verify it reports exactly 1 repaired unmatched pair and 0 repaired
# multiple references.
# Requires at least 2 MDTs and an MDS version strictly between 2.5.55 and
# 2.7.55 (skipped for anything at or above 2.7.55, LU-6475).
1942 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
1943 (( $MDS1_VERSION < $(version_code 2.7.55) )) ||
1944 skip "MDS newer than 2.7.55, LU-6475"
1945 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
1946 skip "MDS older than 2.5.55, LU-3591"
1949 echo "According to current metadata migration implementation,"
1950 echo "before the old MDT-object is removed, both the new MDT-object"
1951 echo "and old MDT-object will reference the same LOV layout. Then if"
1952 echo "the layout LFSCK finds the new MDT-object by race, it will"
1953 echo "regard related OST-object(s) as multiple referenced case, and"
1954 echo "will try to create new OST-object(s) for the new MDT-object."
1955 echo "To avoid such trouble, the layout LFSCK needs to lock the old"
1956 echo "MDT-object before confirm the multiple referenced case."
# Directory a1 is created with "lfs mkdir -i 1" and holds one 1 MiB file.
1959 check_mount_and_prep
1960 $LFS mkdir -i 1 $DIR/$tdir/a1
1961 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1962 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=1
1963 cancel_lru_locks osc
1965 echo "Inject failure stub on MDT1 to delay the migration"
1967 #define OBD_FAIL_MIGRATE_DELAY 0x1803
1968 do_facet mds2 $LCTL set_param fail_val=5 fail_loc=0x1803
# The migration runs in the background so LFSCK can start alongside it.
1969 echo "Migrate $DIR/$tdir/a1 from MDT1 to MDT0 with delay"
1970 $LFS migrate -m 0 $DIR/$tdir/a1 &
1973 echo "Trigger layout LFSCK to race with the migration"
1974 $START_LAYOUT -A -r || error "(1) Fail to start layout LFSCK!"
1976 wait_all_targets_blocked layout completed 2
1978 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
# Check both repair counters in the layout LFSCK report.
1979 local repaired=$($SHOW_LAYOUT |
1980 awk '/^repaired_unmatched_pair/ { print $2 }')
1981 [ $repaired -eq 1 ] ||
1982 error "(3) Fail to repair unmatched pair: $repaired"
1984 repaired=$($SHOW_LAYOUT |
1985 awk '/^repaired_multiple_referenced/ { print $2 }')
1986 [ $repaired -eq 0 ] ||
1987 error "(4) Unexpectedly repaird multiple references: $repaired"
1989 run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)"
# test 15d: disturb a background directory migration with fail_loc 0x1709
# (OBD_FAIL_OUT_EIO), then run namespace LFSCK on all targets and resume
# the migration. Only setup, LFSCK start and the final rm are checked;
# the two migrate commands and rm_entry have no error check.
# Requires at least 2 MDTs.
1992 (( $MDSCOUNT > 1 )) || skip "needs >= 2 MDTs"
1994 check_mount_and_prep
# Build the test directory with 100 files and 100 sub-directories.
1996 $LFS mkdir -c -1 $DIR/$tdir || error "create $tdir failed"
1997 $LFS setdirstripe -D -i -1 -c 1 $DIR/$tdir ||
1998 error "setdirstripe failed"
2000 createmany -o $DIR/$tdir/f 100 || error "create sub files failed"
2001 createmany -d $DIR/$tdir/s 100 || error "create sub dirs failed"
# The migration runs in the background while failures are injected below.
2003 echo "Migrate $DIR/$tdir to MDT1"
2004 $LFS migrate -m 1 $DIR/$tdir &
2008 # fail sub transactions on random MDTs, which may cause some file
2010 #define OBD_FAIL_OUT_EIO 0x1709
# NOTE(review): i runs from 0 to MDSCOUNT-1, so the first facet addressed
# is "mds0", whereas other tests in this file use mds1/mds2 - confirm
# that the intended facets are reached.
2011 for ((i = 0; i < $MDSCOUNT; i++)); do
2012 do_facet mds$i $LCTL set_param fail_loc=0x1709
2014 do_facet mds$i $LCTL set_param fail_loc=0
2019 # LFSCK can't fully fix migrating directories, and may leave some
2020 # files inaccessible, but it shouldn't cause crash
2021 $START_NAMESPACE -A -r ||
2022 error "Fail to start LFSCK for namespace"
2024 wait_all_targets_blocked namespace completed 1
2026 # resume migration may fail because some file may be inaccessible, but
2027 # it shouldn't cause crash
2028 $LFS migrate -m 1 $DIR/$tdir
2030 # rm $tdir to avoid cleanup failure in the end
2032 $LFS rm_entry $DIR/$tdir/*
2033 rm -rf $DIR/$tdir || error "rm $tdir failed"
2035 run_test 15d "LFSCK don't crash upon dir migration failure"
# test 16: change the owner of a file while fail_loc 0x1613
# (OBD_FAIL_LFSCK_BAD_OWNER) is set on the MDS, run layout LFSCK, and
# verify that "repaired_inconsistent_owner" reports exactly 1.
# Requires an MDS version newer than 2.5.55.
2038 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2039 skip "MDS older than 2.5.55, LU-3594"
2042 echo "If the OST-object's owner information does not match the owner"
2043 echo "information stored in the MDT-object, then the LFSCK trust the"
2044 echo "MDT-object and update the OST-object's owner information."
# f0 is written with 1 MiB of data before the owner change.
2047 check_mount_and_prep
2048 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2049 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
2050 cancel_lru_locks osc
2052 # created but no setattr or write to the file.
# 100 files are created as $RUNAS under d1; nothing below changes their
# owner.
2054 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/d1
2055 $RUNAS createmany -o $DIR/$tdir/d1/o 100 || error "create failed"
2057 echo "Inject failure stub to skip OST-object owner changing"
2058 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
2059 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
2060 chown 1.1 $DIR/$tdir/f0
2061 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2063 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
2066 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
2068 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2069 mdd.${MDT_DEV}.lfsck_layout |
2070 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2072 error "(2) unexpected status"
# Exactly one repair is expected: the file whose owner was changed under
# the injected failure.
2075 local repaired=$($SHOW_LAYOUT |
2076 awk '/^repaired_inconsistent_owner/ { print $2 }')
2077 [ $repaired -eq 1 ] ||
2078 error "(3) Fail to repair inconsistent owner: $repaired"
2080 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# test_17: layout LFSCK must repair MDT-objects that reference an OST-object
# which recognizes only one of them.
# While fail_loc 0x1614 is set on $SINGLEMDS, "guard" (1MB), a createmany
# file and a PFL file are created; per the messages below f0 and f1 then
# share guard's OST-objects, which is why both report guard's 1MB size.
# After layout LFSCK completes, repaired_multiple_referenced must be 2 and
# rewriting f0/f1 with 2MB must leave guard at 1048576 bytes.
# Skipped unless the MDS version is newer than 2.5.55.
2083 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2084 skip "MDS older than 2.5.55, LU-3594"
2087 echo "If more than one MDT-objects reference the same OST-object,"
2088 echo "and the OST-object only recognizes one MDT-object, then the"
2089 echo "LFSCK should create new OST-objects for such non-recognized"
2093 check_mount_and_prep
2094 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2096 echo "Inject failure stub to make two MDT-objects to refernce"
2097 echo "the OST-object"
2099 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
2100 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
2101 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
2102 cancel_lru_locks mdc
2103 cancel_lru_locks osc
2105 createmany -o $DIR/$tdir/f 1
2106 cancel_lru_locks mdc
2107 cancel_lru_locks osc
# NOTE(review): the line-continuation below is followed directly by the
# error call in this listing; the setstripe target path is not visible here.
2109 $LFS setstripe -E 2M -S 256K -o 0 -E 4M -S 512K -o 0 \
2111 error "(0) Fail to create PFL $DIR/$tdir/f1"
2112 cancel_lru_locks mdc
2113 cancel_lru_locks osc
2114 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
2116 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
2117 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard use the same OST-objects"
# Nothing was written to f0/f1, yet both must report 1048576 bytes, the
# amount written to guard; the error text calls this the "(wrong)" size.
2118 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
2119 [ $size -eq 1048576 ] ||
2120 error "(1.1) f0 (wrong) size should be 1048576, but got $size"
2122 size=$(ls -l $DIR/$tdir/f1 | awk '{ print $5 }')
2123 [ $size -eq 1048576 ] ||
2124 error "(1.2) f1 (wrong) size should be 1048576, but got $size"
2126 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
2129 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
2131 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2132 mdd.${MDT_DEV}.lfsck_layout |
2133 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2135 error "(3) unexpected status"
# Two repairs are expected, one for f0 and one for f1.
2138 local repaired=$($SHOW_LAYOUT |
2139 awk '/^repaired_multiple_referenced/ { print $2 }')
2140 [ $repaired -eq 2 ] ||
2141 error "(4) Fail to repair multiple references: $repaired"
# After the repair, a 2MB write to f0 (and then f1) must not change guard's
# size, showing that the files no longer share guard's OST-objects.
2143 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
2144 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
2145 error "(5) Fail to write f0."
2146 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
2147 [ $size -eq 1048576 ] ||
2148 error "(6) guard size should be 1048576, but got $size"
2150 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard should use diff OST-objects"
2151 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2 ||
2152 error "(7) Fail to write f1."
2153 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
2154 [ $size -eq 1048576 ] ||
2155 error "(8) guard size should be 1048576, but got $size"
2157 run_test 17 "LFSCK can repair multiple references"
# Add "cache" to the local debug mask; the command's output is discarded.
# NOTE(review): presumably wanted for the orphan tests that follow - confirm.
2159 $LCTL set_param debug=+cache > /dev/null
# test_18a: the MDT-objects still exist but their layout EA is lost; layout
# LFSCK started with "-r -o" must regenerate the missing layout EA entries.
# Files: a1/f1 (mkdir -i 0), a2/f2 (mkdir -i 1, only when MDSCOUNT >= 2) and
# the PFL file f3. fail_loc 0x1615 is set on the MDS(es) while each file is
# chown'ed. Expected afterwards: every MDS "completed", every OST
# "completed", repaired_orphan 3 on mds1 and 2 on mds2, original sizes back.
# Skipped unless the MDS version is newer than 2.5.55.
2162 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2163 skip "MDS older than 2.5.55, LU-3336"
2166 echo "The target MDT-object is there, but related stripe information"
2167 echo "is lost or partly lost. The LFSCK should regenerate the missing"
2168 echo "layout EA entries."
2171 check_mount_and_prep
2172 $LFS mkdir -i 0 $DIR/$tdir/a1
2173 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2174 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# "ls -i" prepends the inode number, so the size is field 6 rather than 5.
2176 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2178 $LFS path2fid $DIR/$tdir/a1/f1
2179 $LFS getstripe $DIR/$tdir/a1/f1
2181 if [ $MDSCOUNT -ge 2 ]; then
2182 $LFS mkdir -i 1 $DIR/$tdir/a2
2183 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2184 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2185 $LFS path2fid $DIR/$tdir/a2/f2
2186 $LFS getstripe $DIR/$tdir/a2/f2
2189 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2190 error "(0) Fail to create PFL $DIR/$tdir/f3"
2192 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2194 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2196 $LFS path2fid $DIR/$tdir/f3
2197 $LFS getstripe $DIR/$tdir/f3
2199 cancel_lru_locks osc
2201 echo "Inject failure, to make the MDT-object lost its layout EA"
2202 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
2203 do_facet mds1 $LCTL set_param fail_loc=0x1615
2204 chown 1.1 $DIR/$tdir/a1/f1
2206 if [ $MDSCOUNT -ge 2 ]; then
2207 do_facet mds2 $LCTL set_param fail_loc=0x1615
2208 chown 1.1 $DIR/$tdir/a2/f2
2211 chown 1.1 $DIR/$tdir/f3
2216 do_facet mds1 $LCTL set_param fail_loc=0
2217 if [ $MDSCOUNT -ge 2 ]; then
2218 do_facet mds2 $LCTL set_param fail_loc=0
2221 cancel_lru_locks mdc
2222 cancel_lru_locks osc
2224 echo "The file size should be incorrect since layout EA is lost"
2225 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2226 [ "$cur_size" != "$saved_size1" ] ||
2227 error "(1) Expect incorrect file1 size"
2229 if [ $MDSCOUNT -ge 2 ]; then
# f2 is compared against saved_size1: f1 and f2 were both written with
# bs=1M count=2, so no separate saved size is kept for f2.
2230 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2231 [ "$cur_size" != "$saved_size1" ] ||
2232 error "(2) Expect incorrect file2 size"
2235 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2236 [ "$cur_size" != "$saved_size2" ] ||
2237 error "(1.2) Expect incorrect file3 size"
2239 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2240 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
2242 for k in $(seq $MDSCOUNT); do
2243 # The LFSCK status query interval is 30 seconds. For the case
2244 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2245 # time to guarantee the status sync up.
2246 wait_update_facet mds${k} "$LCTL get_param -n \
2247 mdd.$(facet_svc mds${k}).lfsck_layout |
2248 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2249 error "(4) MDS${k} is not the expected 'completed'"
# The OSTs are checked once, without polling, after all MDSes completed.
2252 for k in $(seq $OSTCOUNT); do
2253 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2254 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2255 awk '/^status/ { print $2 }')
2256 [ "$cur_status" == "completed" ] ||
2257 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2260 local repaired=$(do_facet mds1 $LCTL get_param -n \
2261 mdd.$(facet_svc mds1).lfsck_layout |
2262 awk '/^repaired_orphan/ { print $2 }')
2263 [ $repaired -eq 3 ] ||
2264 error "(6.1) Expect 3 fixed on mds1, but got: $repaired"
2266 if [ $MDSCOUNT -ge 2 ]; then
2267 repaired=$(do_facet mds2 $LCTL get_param -n \
2268 mdd.$(facet_svc mds2).lfsck_layout |
2269 awk '/^repaired_orphan/ { print $2 }')
2270 [ $repaired -eq 2 ] ||
2271 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
2274 $LFS path2fid $DIR/$tdir/a1/f1
2275 $LFS getstripe $DIR/$tdir/a1/f1
2277 if [ $MDSCOUNT -ge 2 ]; then
2278 $LFS path2fid $DIR/$tdir/a2/f2
2279 $LFS getstripe $DIR/$tdir/a2/f2
2282 $LFS path2fid $DIR/$tdir/f3
2283 $LFS getstripe $DIR/$tdir/f3
2285 echo "The file size should be correct after layout LFSCK scanning"
2286 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2287 [ "$cur_size" == "$saved_size1" ] ||
2288 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2290 if [ $MDSCOUNT -ge 2 ]; then
2291 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2292 [ "$cur_size" == "$saved_size1" ] ||
2293 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# NOTE(review): check (9) inspects f3, but its message says "file1".
2296 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2297 [ "$cur_size" == "$saved_size2" ] ||
2298 error "(9) Expect file1 size $saved_size2, but got $cur_size"
2300 run_test 18a "Find out orphan OST-object and repair it (1)"
# test_18b: the MDT-object itself is lost; layout LFSCK must re-create it
# under .lustre/lost+found/MDTxxxx so it can be moved back by hand.
# Files: a1/f1, a2/f2 (only when MDSCOUNT >= 2) and the PFL file f3; their
# FIDs are saved, then fail_loc 0x1616 is set on the MDS(es) while the files
# are removed. A --dryrun pass must report 3 inconsistent_orphan on mds1 and
# leave the lost+found subdirectories empty; the real pass must report
# repaired_orphan 3 on mds1 / 2 on mds2, after which the "<fid>-R-0" entries
# are moved back and the file sizes are compared with the saved ones.
# Skipped when FILESET is set or the MDS is not newer than 2.5.55.
2303 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2304 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2305 skip "MDS older than 2.5.55, LU-3336"
2308 echo "The target MDT-object is lost. The LFSCK should re-create the"
2309 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
2310 echo "can move it back to normal namespace manually."
2313 check_mount_and_prep
2314 $LFS mkdir -i 0 $DIR/$tdir/a1
2315 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2316 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# "ls -i" prepends the inode number, so the size is field 6 rather than 5.
2317 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2318 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2320 $LFS getstripe $DIR/$tdir/a1/f1
2322 if [ $MDSCOUNT -ge 2 ]; then
2323 $LFS mkdir -i 1 $DIR/$tdir/a2
2324 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2325 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
# NOTE(review): fid2 has no visible "local" declaration, unlike fid1/fid3.
2326 fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
2328 $LFS getstripe $DIR/$tdir/a2/f2
2331 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2332 error "(0) Fail to create PFL $DIR/$tdir/f3"
2334 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2336 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2337 local fid3=$($LFS path2fid $DIR/$tdir/f3)
2339 $LFS getstripe $DIR/$tdir/f3
2341 cancel_lru_locks osc
2343 echo "Inject failure, to simulate the case of missing the MDT-object"
2344 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2345 do_facet mds1 $LCTL set_param fail_loc=0x1616
2346 rm -f $DIR/$tdir/a1/f1
2348 if [ $MDSCOUNT -ge 2 ]; then
2349 do_facet mds2 $LCTL set_param fail_loc=0x1616
2350 rm -f $DIR/$tdir/a2/f2
2358 do_facet mds1 $LCTL set_param fail_loc=0
2359 if [ $MDSCOUNT -ge 2 ]; then
2360 do_facet mds2 $LCTL set_param fail_loc=0
2363 cancel_lru_locks mdc
2364 cancel_lru_locks osc
2366 # dryrun mode only checks for orphans, it does not repair them
2367 echo "Trigger layout LFSCK --dryrun to find out orphan OST-object"
2368 $START_LAYOUT --dryrun -o -r ||
2369 error "Fail to start layout LFSCK in dryrun mode"
2370 wait_all_targets_blocked layout completed 2
# The "param" line must list exactly the options the run was started with.
2372 local PARAMS=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
2373 [ "$PARAMS" == "dryrun,all_targets,orphan" ] ||
2374 error "Expect 'dryrun,all_targets,orphan', got '$PARAMS'"
2376 local orphans=$(do_facet mds1 $LCTL get_param -n \
2377 mdd.$(facet_svc mds1).lfsck_layout |
2378 awk '/^inconsistent_orphan/ { print $2 }')
2379 [ $orphans -eq 3 ] ||
2380 error "Expect 3 found on mds1, but got: $orphans"
2382 # orphan parents should not be created
2384 for subdir in $MOUNT/.lustre/lost+found/*; do
2385 [ ! "$(ls -A $subdir)" ] || error "$subdir not empty"
2388 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2389 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2391 for k in $(seq $MDSCOUNT); do
2392 # The LFSCK status query interval is 30 seconds. For the case
2393 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2394 # time to guarantee the status sync up.
2395 wait_update_facet mds${k} "$LCTL get_param -n \
2396 mdd.$(facet_svc mds${k}).lfsck_layout |
2397 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2398 error "(2) MDS${k} is not the expected 'completed'"
2401 for k in $(seq $OSTCOUNT); do
2402 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2403 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2404 awk '/^status/ { print $2 }')
2405 [ "$cur_status" == "completed" ] ||
2406 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
2409 local repaired=$(do_facet mds1 $LCTL get_param -n \
2410 mdd.$(facet_svc mds1).lfsck_layout |
2411 awk '/^repaired_orphan/ { print $2 }')
2412 [ $repaired -eq 3 ] ||
2413 error "(4.1) Expect 3 fixed on mds1, but got: $repaired"
2415 if [ $MDSCOUNT -ge 2 ]; then
2416 repaired=$(do_facet mds2 $LCTL get_param -n \
2417 mdd.$(facet_svc mds2).lfsck_layout |
2418 awk '/^repaired_orphan/ { print $2 }')
2419 [ $repaired -eq 2 ] ||
2420 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
# The re-created files are looked up as "<original FID>-R-0" in lost+found.
2423 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2424 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2425 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2427 if [ $MDSCOUNT -ge 2 ]; then
2428 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
2429 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
# NOTE(review): the f3 move below reuses the "(5)" label of the f1 move.
2432 mv $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0 $DIR/$tdir/f3 ||
2433 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
2435 $LFS path2fid $DIR/$tdir/a1/f1
2436 $LFS getstripe $DIR/$tdir/a1/f1
2438 if [ $MDSCOUNT -ge 2 ]; then
2439 $LFS path2fid $DIR/$tdir/a2/f2
2440 $LFS getstripe $DIR/$tdir/a2/f2
2443 $LFS path2fid $DIR/$tdir/f3
2444 $LFS getstripe $DIR/$tdir/f3
2446 echo "The file size should be correct after layout LFSCK scanning"
2447 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2448 [ "$cur_size" == "$saved_size1" ] ||
2449 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2451 if [ $MDSCOUNT -ge 2 ]; then
# f2 is compared against saved_size1: f1 and f2 were both written with
# bs=1M count=2.
2452 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2453 [ "$cur_size" == "$saved_size1" ] ||
2454 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# NOTE(review): check (9) inspects f3, but its message says "file1".
2457 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2458 [ "$cur_size" == "$saved_size2" ] ||
2459 error "(9) Expect file1 size $saved_size2, but got $cur_size"
2461 run_test 18b "Find out orphan OST-object and repair it (2)"
# test_18c: the MDT-object is lost and the OST-object's parent FID is missing
# as well; layout LFSCK must re-create the MDT-object with a new FID under
# .lustre/lost+found/MDTxxxx.
# fail_loc 0x1617 is set on all OST nodes while the files are written, then
# fail_loc 0x1616 is set on the MDS(es) while the files are removed.
# Expected: repaired_orphan equals $expected on mds1 and 0 on mds2, no
# "*-N-*" entry under lost+found/MDT0001, at least one under MDT0000.
# Skipped when FILESET is set or the MDS is not newer than 2.5.55.
2464 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2465 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2466 skip "MDS older than 2.5.55, LU-3336"
2469 echo "The target MDT-object is lost, and the OST-object FID is missing."
2470 echo "The LFSCK should re-create the MDT-object with new FID under the "
2471 echo "directory .lustre/lost+found/MDTxxxx."
2474 check_mount_and_prep
2475 $LFS mkdir -i 0 $DIR/$tdir/a1
2476 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2478 echo "Inject failure, to simulate the case of missing parent FID"
2479 #define OBD_FAIL_LFSCK_NOPFID 0x1617
2480 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1617
2482 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2483 $LFS getstripe $DIR/$tdir/a1/f1
2485 if [ $MDSCOUNT -ge 2 ]; then
2486 $LFS mkdir -i 1 $DIR/$tdir/a2
2487 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a2
2488 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2489 $LFS getstripe $DIR/$tdir/a2/f2
2492 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2493 error "(0) Fail to create PFL $DIR/$tdir/f3"
2495 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2496 $LFS getstripe $DIR/$tdir/f3
2498 cancel_lru_locks osc
2499 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
2501 echo "Inject failure, to simulate the case of missing the MDT-object"
2502 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2503 do_facet mds1 $LCTL set_param fail_loc=0x1616
2504 rm -f $DIR/$tdir/a1/f1
2506 if [ $MDSCOUNT -ge 2 ]; then
2507 do_facet mds2 $LCTL set_param fail_loc=0x1616
2508 rm -f $DIR/$tdir/a2/f2
2516 do_facet mds1 $LCTL set_param fail_loc=0
2517 if [ $MDSCOUNT -ge 2 ]; then
2518 do_facet mds2 $LCTL set_param fail_loc=0
2521 cancel_lru_locks mdc
2522 cancel_lru_locks osc
2524 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2525 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2527 for k in $(seq $MDSCOUNT); do
2528 # The LFSCK status query interval is 30 seconds. For the case
2529 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2530 # time to guarantee the status sync up.
2531 wait_update_facet mds${k} "$LCTL get_param -n \
2532 mdd.$(facet_svc mds${k}).lfsck_layout |
2533 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2534 error "(2) MDS${k} is not the expected 'completed'"
2537 for k in $(seq $OSTCOUNT); do
2538 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2539 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2540 awk '/^status/ { print $2 }')
2541 [ "$cur_status" == "completed" ] ||
2542 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# NOTE(review): $expected is not assigned in the visible lines; presumably
# it is set depending on MDSCOUNT in the branch below - confirm.
2545 if [ $MDSCOUNT -ge 2 ]; then
2551 local repaired=$(do_facet mds1 $LCTL get_param -n \
2552 mdd.$(facet_svc mds1).lfsck_layout |
2553 awk '/^repaired_orphan/ { print $2 }')
2554 [ $repaired -eq $expected ] ||
2555 error "(4) Expect $expected fixed on mds1, but got: $repaired"
2557 if [ $MDSCOUNT -ge 2 ]; then
2558 repaired=$(do_facet mds2 $LCTL get_param -n \
2559 mdd.$(facet_svc mds2).lfsck_layout |
2560 awk '/^repaired_orphan/ { print $2 }')
2561 [ $repaired -eq 0 ] ||
2562 error "(5) Expect 0 fixed on mds2, but got: $repaired"
2565 ls -ail $MOUNT/.lustre/lost+found/
# NOTE(review): the -name pattern *-N-* is unquoted in both find calls, so
# the shell may expand it against the current directory before find runs;
# cname also has no visible "local" declaration.
2567 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
2568 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
2569 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name *-N-*)
2571 error "(6) .lustre/lost+found/MDT0001/ should be empty"
2574 echo "There should be some stub under .lustre/lost+found/MDT0000/"
2575 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2576 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
2578 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name *-N-*)
2579 [ ! -z "$cname" ] ||
2580 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2582 run_test 18c "Find out orphan OST-object and repair it (3)"
# test_18d: the layout EA of an MDT-object is corrupted while the right
# OST-object survives as an orphan; layout LFSCK must reuse that orphan
# instead of creating a new OST-object for the slot.
# Setup: f1/f3 hold "guard", f2 and the PFL file f4 hold "foo". With
# fail_loc 0x1618 set, f2 and f4 are chown'ed and f1/f3 are removed, which
# per the messages below leaves f2/f4 as dangling references.
# Expected after "lfsck_start -r -o -c -d": repaired_orphan 2,
# repaired_dangling 0, and f2/f4 back at their saved sizes.
# Skipped unless the MDS version is newer than 2.5.55.
2585 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2586 skip "MDS older than 2.5.55, LU-3336"
2589 echo "The target MDT-object layout EA is corrupted, but the right"
2590 echo "OST-object is still alive as orphan. The layout LFSCK will"
2591 echo "not create new OST-object to occupy such slot."
2594 check_mount_and_prep
2596 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2597 echo "guard" > $DIR/$tdir/a1/f1
2598 echo "foo" > $DIR/$tdir/a1/f2
2600 echo "guard" > $DIR/$tdir/a1/f3
2601 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2602 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2603 echo "foo" > $DIR/$tdir/a1/f4
# "ls -i" prepends the inode number, so the size is field 6 rather than 5.
2605 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2606 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2607 $LFS path2fid $DIR/$tdir/a1/f1
2608 $LFS getstripe $DIR/$tdir/a1/f1
2609 $LFS path2fid $DIR/$tdir/a1/f2
2610 $LFS getstripe $DIR/$tdir/a1/f2
2611 $LFS path2fid $DIR/$tdir/a1/f3
2612 $LFS getstripe $DIR/$tdir/a1/f3
2613 $LFS path2fid $DIR/$tdir/a1/f4
2614 $LFS getstripe $DIR/$tdir/a1/f4
2615 cancel_lru_locks osc
2617 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2618 echo "to reference the same OST-object (which is f1's OST-obejct)."
2619 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2620 echo "dangling reference case, but f2's old OST-object is there."
2622 echo "The failure also makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2623 echo "to reference the same OST-object (which is f3's OST-obejct)."
2624 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2625 echo "dangling reference case, but f4's old OST-object is there."
2628 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2629 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2630 chown 1.1 $DIR/$tdir/a1/f2
2631 chown 1.1 $DIR/$tdir/a1/f4
2632 rm -f $DIR/$tdir/a1/f1
2633 rm -f $DIR/$tdir/a1/f3
2636 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2638 echo "stopall to cleanup object cache"
2641 setupall > /dev/null
2643 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2644 $START_LAYOUT -r -o -c -d || error "(2) Fail to start LFSCK for layout!"
2646 for k in $(seq $MDSCOUNT); do
2647 # The LFSCK status query interval is 30 seconds. For the case
2648 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2649 # time to guarantee the status sync up.
2650 wait_update_facet mds${k} "$LCTL get_param -n \
2651 mdd.$(facet_svc mds${k}).lfsck_layout |
2652 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2653 error "(3) MDS${k} is not the expected 'completed'"
2656 for k in $(seq $OSTCOUNT); do
2657 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2658 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2659 awk '/^status/ { print $2 }')
2660 [ "$cur_status" == "completed" ] ||
2661 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
# Both files must be counted as orphan repairs and none as dangling repairs.
2664 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2665 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2666 awk '/^repaired_orphan/ { print $2 }')
2667 [ $repaired -eq 2 ] ||
2668 error "(5) Expect 2 orphans have been fixed, but got: $repaired"
2670 repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2671 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2672 awk '/^repaired_dangling/ { print $2 }')
2673 [ $repaired -eq 0 ] ||
2674 error "(6) Expect 0 dangling has been fixed, but got: $repaired"
2676 echo "The file size should be correct after layout LFSCK scanning"
2677 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2678 [ "$cur_size" == "$saved_size1" ] ||
2679 error "(7) Expect file2 size $saved_size1, but got $cur_size"
2681 cur_size=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2682 [ "$cur_size" == "$saved_size2" ] ||
2683 error "(8) Expect file4 size $saved_size2, but got $cur_size"
# The file contents are only printed to the log here, not compared.
2685 echo "The LFSCK should find back the original data."
2686 cat $DIR/$tdir/a1/f2
2687 $LFS path2fid $DIR/$tdir/a1/f2
2688 $LFS getstripe $DIR/$tdir/a1/f2
2689 cat $DIR/$tdir/a1/f4
2690 $LFS path2fid $DIR/$tdir/a1/f4
2691 $LFS getstripe $DIR/$tdir/a1/f4
2693 run_test 18d "Find out orphan OST-object and repair it (4)"
# test_18e: same corruption as test_18d, but the layout EA slot gets occupied
# by a newly created OST-object that is then modified while LFSCK is still
# running; to keep the new data LFSCK must create a separate file for the old
# orphan OST-object.
# Flow: prepare f1..f4 as in 18d, start layout LFSCK with fail_loc 0x1602
# (fail_val=10) set, wait for "scanning-phase2", append "dummy" to f2/f4,
# clear the fail_loc and wait for completion.
# Expected: repaired_orphan 2 and exactly two "*-C-*" entries under
# .lustre/lost+found/MDT0000 whose sizes are checked against the saved ones.
# Skipped when FILESET is set or the MDS is not newer than 2.5.55.
2696 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2697 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
2698 skip "MDS older than 2.5.55, LU-3336"
2701 echo "The target MDT-object layout EA slot is occpuied by some new"
2702 echo "created OST-object when repair dangling reference case. Such"
2703 echo "conflict OST-object has been modified by others. To keep the"
2704 echo "new data, the LFSCK will create a new file to refernece this"
2705 echo "old orphan OST-object."
2708 check_mount_and_prep
2710 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2711 echo "guard" > $DIR/$tdir/a1/f1
2712 echo "foo" > $DIR/$tdir/a1/f2
2714 echo "guard" > $DIR/$tdir/a1/f3
2715 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2716 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2717 echo "foo" > $DIR/$tdir/a1/f4
# "ls -i" prepends the inode number, so the size is field 6 rather than 5.
2719 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2720 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2722 $LFS path2fid $DIR/$tdir/a1/f1
2723 $LFS getstripe $DIR/$tdir/a1/f1
2724 $LFS path2fid $DIR/$tdir/a1/f2
2725 $LFS getstripe $DIR/$tdir/a1/f2
2726 $LFS path2fid $DIR/$tdir/a1/f3
2727 $LFS getstripe $DIR/$tdir/a1/f3
2728 $LFS path2fid $DIR/$tdir/a1/f4
2729 $LFS getstripe $DIR/$tdir/a1/f4
2730 cancel_lru_locks osc
2732 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2733 echo "to reference the same OST-object (which is f1's OST-obejct)."
2734 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2735 echo "dangling reference case, but f2's old OST-object is there."
2737 echo "Also the failure makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2738 echo "to reference the same OST-object (which is f3's OST-obejct)."
2739 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2740 echo "dangling reference case, but f4's old OST-object is there."
2743 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2744 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2745 chown 1.1 $DIR/$tdir/a1/f2
2746 chown 1.1 $DIR/$tdir/a1/f4
2747 rm -f $DIR/$tdir/a1/f1
2748 rm -f $DIR/$tdir/a1/f3
2751 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2753 echo "stopall to cleanup object cache"
2756 setupall > /dev/null
# The fail_loc below stays set until after f2/f4 are rewritten further down.
# NOTE(review): judging by the define's name it delays the scan so the test
# can write during phase 2 - confirm against the LFSCK source.
2758 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2759 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
2761 start_full_debug_logging
2763 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2764 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
2766 wait_update_facet mds1 "$LCTL get_param -n \
2767 mdd.$(facet_svc mds1).lfsck_layout |
2768 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2769 error "(3) MDS1 is not the expected 'scanning-phase2'"
2771 # to guarantee all updates are synced.
2775 echo "Write new data to f2/f4 to modify the new created OST-object."
2776 echo "dummy" >> $DIR/$tdir/a1/f2 || error "write a1/f2 failed"
2777 echo "dummy" >> $DIR/$tdir/a1/f4 || error "write a1/f4 failed"
2779 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
2781 for k in $(seq $MDSCOUNT); do
2782 # The LFSCK status query interval is 30 seconds. For the case
2783 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2784 # time to guarantee the status sync up.
2785 wait_update_facet mds${k} "$LCTL get_param -n \
2786 mdd.$(facet_svc mds${k}).lfsck_layout |
2787 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2788 error "(4) MDS${k} is not the expected 'completed'"
2791 for k in $(seq $OSTCOUNT); do
2792 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2793 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2794 awk '/^status/ { print $2 }')
2795 [ "$cur_status" == "completed" ] ||
2796 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2799 stop_full_debug_logging
2801 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2802 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2803 awk '/^repaired_orphan/ { print $2 }')
2804 [ $repaired -eq 2 ] ||
2805 error "(6) Expect 2 orphans have been fixed, but got: $repaired"
2807 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2808 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2809 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# Count the conflict stubs ("-C-" names); one each is expected for f2 and f4.
2811 local count=$(ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-* | wc -l)
2812 if [ $count -ne 2 ]; then
2813 ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-*
2814 error "(8) Expect 2 stubs under lost+found, but got $count"
# The first and last find results are taken as the two stubs; each size must
# equal one of the saved sizes, whichever file it belongs to.
# NOTE(review): the -name pattern *-C-* is unquoted, so the shell may expand
# it against the current directory; cname has no visible "local" declaration.
2817 echo "The stub file should keep the original f2 or f4 data"
2818 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name *-C-* | head -n 1)
2819 local cur_size=$(ls -il $cname | awk '{ print $6 }')
2820 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2821 error "(9) Got unexpected $cur_size"
2824 $LFS path2fid $cname
2825 $LFS getstripe $cname
2827 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name *-C-* | tail -n 1)
2828 cur_size=$(ls -il $cname | awk '{ print $6 }')
2829 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2830 error "(10) Got unexpected $cur_size"
2833 $LFS path2fid $cname
2834 $LFS getstripe $cname
# The file contents are only printed to the log here, not compared.
2836 echo "The f2/f4 should contains new data."
2837 cat $DIR/$tdir/a1/f2
2838 $LFS path2fid $DIR/$tdir/a1/f2
2839 $LFS getstripe $DIR/$tdir/a1/f2
2840 cat $DIR/$tdir/a1/f4
2841 $LFS path2fid $DIR/$tdir/a1/f4
2842 $LFS getstripe $DIR/$tdir/a1/f4
2844 run_test 18e "Find out orphan OST-object and repair it (5)"
# test_18f: when an OST fails to verify OST-objects during the first scanning
# stage, layout LFSCK must skip orphan handling for that OST only.
# Setup: a1/f1 and a2/f2 (mkdir -i 0), plus a3/f3 and a4/f4 (mkdir -i 1) when
# MDSCOUNT >= 2; the files are removed under fail_loc 0x1616.
# First run, with fail_loc 0x161c on mds1: every MDS and ost1 must end as
# "partial", ost2 as "completed", repaired_orphan 1 per MDS.
# Second run, without the fault: everything "completed", repaired_orphan 2
# per MDS.
# Requires at least 2 OSTs.
2847 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
2850 echo "The target MDT-object is lost. The LFSCK should re-create the"
2851 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2852 echo "to verify some OST-object(s) during the first stage-scanning,"
2853 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2854 echo "should not be affected."
2857 check_mount_and_prep
2858 $LFS mkdir -i 0 $DIR/$tdir/a1
2859 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2860 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2861 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2862 $LFS mkdir -i 0 $DIR/$tdir/a2
2863 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
2864 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2865 $LFS getstripe $DIR/$tdir/a1/f1
2866 $LFS getstripe $DIR/$tdir/a2/f2
2868 if [ $MDSCOUNT -ge 2 ]; then
2869 $LFS mkdir -i 1 $DIR/$tdir/a3
2870 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a3
2871 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2872 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2873 $LFS mkdir -i 1 $DIR/$tdir/a4
2874 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
2875 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2876 $LFS getstripe $DIR/$tdir/a3/f3
2877 $LFS getstripe $DIR/$tdir/a4/f4
2880 cancel_lru_locks osc
2882 echo "Inject failure, to simulate the case of missing the MDT-object"
2883 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2884 do_facet mds1 $LCTL set_param fail_loc=0x1616
2885 rm -f $DIR/$tdir/a1/f1
2886 rm -f $DIR/$tdir/a2/f2
2888 if [ $MDSCOUNT -ge 2 ]; then
2889 do_facet mds2 $LCTL set_param fail_loc=0x1616
2890 rm -f $DIR/$tdir/a3/f3
2891 rm -f $DIR/$tdir/a4/f4
2897 do_facet mds1 $LCTL set_param fail_loc=0
2898 if [ $MDSCOUNT -ge 2 ]; then
2899 do_facet mds2 $LCTL set_param fail_loc=0
2902 cancel_lru_locks mdc
2903 cancel_lru_locks osc
2905 echo "Inject failure, to simulate the OST0 fail to handle"
2906 echo "MDT0 LFSCK request during the first-stage scanning."
2907 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
2908 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
2910 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2911 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2913 for k in $(seq $MDSCOUNT); do
2914 # The LFSCK status query interval is 30 seconds. For the case
2915 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2916 # time to guarantee the status sync up.
2917 wait_update_facet mds${k} "$LCTL get_param -n \
2918 mdd.$(facet_svc mds${k}).lfsck_layout |
2919 awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
2920 error "(2) MDS${k} is not the expected 'partial'"
# ost1 is the OST targeted by the injected failure, so it must end "partial";
# ost2 is unaffected and must end "completed".
2923 wait_update_facet ost1 "$LCTL get_param -n \
2924 obdfilter.$(facet_svc ost1).lfsck_layout |
2925 awk '/^status/ { print \\\$2 }'" "partial" $LTIME || {
2926 error "(3) OST1 is not the expected 'partial'"
2929 wait_update_facet ost2 "$LCTL get_param -n \
2930 obdfilter.$(facet_svc ost2).lfsck_layout |
2931 awk '/^status/ { print \\\$2 }'" "completed" $LTIME || {
2932 error "(4) OST2 is not the expected 'completed'"
2935 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
# Only one orphan per MDS is expected to be repaired in the partial run.
2937 local repaired=$(do_facet mds1 $LCTL get_param -n \
2938 mdd.$(facet_svc mds1).lfsck_layout |
2939 awk '/^repaired_orphan/ { print $2 }')
2940 [ $repaired -eq 1 ] ||
2941 error "(5) Expect 1 fixed on mds{1}, but got: $repaired"
2943 if [ $MDSCOUNT -ge 2 ]; then
2944 repaired=$(do_facet mds2 $LCTL get_param -n \
2945 mdd.$(facet_svc mds2).lfsck_layout |
2946 awk '/^repaired_orphan/ { print $2 }')
2947 [ $repaired -eq 1 ] ||
2948 error "(6) Expect 1 fixed on mds{2}, but got: $repaired"
2951 echo "Trigger layout LFSCK on all devices again to cleanup"
2952 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
2954 for k in $(seq $MDSCOUNT); do
2955 # The LFSCK status query interval is 30 seconds. For the case
2956 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2957 # time to guarantee the status sync up.
2958 wait_update_facet mds${k} "$LCTL get_param -n \
2959 mdd.$(facet_svc mds${k}).lfsck_layout |
2960 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2961 error "(8) MDS${k} is not the expected 'completed'"
# NOTE(review): cur_status has no visible "local" declaration in this test.
2964 for k in $(seq $OSTCOUNT); do
2965 cur_status=$(do_facet ost${k} $LCTL get_param -n \
2966 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2967 awk '/^status/ { print $2 }')
2968 [ "$cur_status" == "completed" ] ||
2969 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
# NOTE(review): "repaired" is declared local a second time here.
2973 local repaired=$(do_facet mds1 $LCTL get_param -n \
2974 mdd.$(facet_svc mds1).lfsck_layout |
2975 awk '/^repaired_orphan/ { print $2 }')
2976 [ $repaired -eq 2 ] ||
2977 error "(10) Expect 2 fixed on mds{1}, but got: $repaired"
2979 if [ $MDSCOUNT -ge 2 ]; then
2980 repaired=$(do_facet mds2 $LCTL get_param -n \
2981 mdd.$(facet_svc mds2).lfsck_layout |
2982 awk '/^repaired_orphan/ { print $2 }')
2983 [ $repaired -eq 2 ] ||
2984 error "(11) Expect 2 fixed on mds{2}, but got: $repaired"
2987 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# test 18g: the MDT-object of a striped file is lost while its OI mapping
# stays behind (fail_loc 0x162e is set on mds1 during the unlink).  Layout
# LFSCK is then expected to report $OSTCOUNT repaired orphans on mds1 and
# to leave ${fid1}-R-0 under .lustre/lost+found/MDT0000, which is finally
# moved back to the original path.
2990 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2993 echo "The target MDT-object is lost, but related OI mapping is there"
2994 echo "The LFSCK should recreate the lost MDT-object without affected"
2995 echo "by the stale OI mapping."
2998 check_mount_and_prep
# Directory a1 is created on MDT index 0 with stripe count -1 and 1M
# stripes; f1 is then filled with $OSTCOUNT MB of zeroes.
2999 $LFS mkdir -i 0 $DIR/$tdir/a1
3000 $LFS setstripe -c -1 -i 0 -S 1M $DIR/$tdir/a1
3001 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=$OSTCOUNT
# Remember the FID: the lost+found entry checked below is named after it.
3002 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
3004 $LFS getstripe $DIR/$tdir/a1/f1
3005 cancel_lru_locks osc
3007 echo "Inject failure to simulate lost MDT-object but keep OI mapping"
3008 #define OBD_FAIL_LFSCK_LOST_MDTOBJ2 0x162e
3009 do_facet mds1 $LCTL set_param fail_loc=0x162e
3010 rm -f $DIR/$tdir/a1/f1
# Clear the fail_loc again and drop cached locks on the client.
3012 do_facet mds1 $LCTL set_param fail_loc=0
3013 cancel_lru_locks mdc
3014 cancel_lru_locks osc
3016 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3017 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
3019 for k in $(seq $MDSCOUNT); do
3020 # The LFSCK status query interval is 30 seconds. For the case
3021 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3022 # time to guarantee the status sync up.
3023 wait_update_facet mds${k} "$LCTL get_param -n \
3024 mdd.$(facet_svc mds${k}).lfsck_layout |
3025 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
3026 error "(2) MDS${k} is not the expected 'completed'"
# Each OST status is sampled once, without waiting, after the MDT loop.
3029 for k in $(seq $OSTCOUNT); do
3030 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3031 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3032 awk '/^status/ { print $2 }')
3033 [ "$cur_status" == "completed" ] ||
3034 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# The repaired_orphan counter on mds1 must equal the number of OSTs.
3037 local repaired=$(do_facet mds1 $LCTL get_param -n \
3038 mdd.$(facet_svc mds1).lfsck_layout |
3039 awk '/^repaired_orphan/ { print $2 }')
3040 [ $repaired -eq $OSTCOUNT ] ||
3041 error "(4) Expect $OSTCOUNT fixed, but got: $repaired"
3043 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
3044 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
3045 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
# Print FID and layout of the restored file for the test log.
3047 $LFS path2fid $DIR/$tdir/a1/f1
3048 $LFS getstripe $DIR/$tdir/a1/f1
3050 run_test 18g "Find out orphan OST-object and repair it (7)"
# test 18h: a two-component PFL file gets a corrupted extent range
# (fail_loc 0x162f is set on $SINGLEMDS while the file is chown'ed).
# Writing to the file must then fail; after layout LFSCK the
# repaired_orphan counter must be 2, the data must still match the
# "guard" copy, and writing must work again.
3054 echo "The PFL extent crashed. During the first cycle LFSCK scanning,"
3055 echo "the layout LFSCK will keep the bad PFL file(s) there without"
3056 echo "scanning its OST-object(s). Then in the second stage scanning,"
3057 echo "the OST will return related OST-object(s) to the MDT as orphan."
3058 echo "And then the LFSCK on the MDT can rebuild the PFL extent with"
3059 echo "the 'orphan(s)' stripe information."
3062 check_mount_and_prep
# First component covers [0, 2M), the second one runs to EOF.
3064 $LFS setstripe -E 2M -S 1M -c 1 -E -1 $DIR/$tdir/f0 ||
3065 error "(0) Fail to create PFL $DIR/$tdir/f0"
3067 cat $LUSTRE/tests/test-framework.sh > $DIR/$tdir/f0 ||
3068 error "(1.1) Fail to write $DIR/$tdir/f0"
# seek=2 with bs=1M starts this write at the 2M component boundary.
3070 dd if=$LUSTRE/tests/test-framework.sh of=$DIR/$tdir/f0 bs=1M seek=2 ||
3071 error "(1.2) Fail to write $DIR/$tdir/f0"
# Reference copy used for the data comparison after the repair.
3073 cp $DIR/$tdir/f0 $DIR/$tdir/guard
3075 echo "Inject failure stub to simulate bad PFL extent range"
3076 #define OBD_FAIL_LFSCK_BAD_PFL_RANGE 0x162f
3077 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162f
3079 chown 1.1 $DIR/$tdir/f0
3081 cancel_lru_locks mdc
3082 cancel_lru_locks osc
3083 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Negative check: success of the write is the failure case.
3085 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 &&
3086 error "(2) Write to bad PFL file should fail"
3088 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
3089 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
3091 for k in $(seq $MDSCOUNT); do
3092 # The LFSCK status query interval is 30 seconds. For the case
3093 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3094 # time to guarantee the status sync up.
3095 wait_update_facet mds${k} "$LCTL get_param -n \
3096 mdd.$(facet_svc mds${k}).lfsck_layout |
3097 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
3098 error "(4.1) MDS${k} is not the expected 'completed'"
3101 for k in $(seq $OSTCOUNT); do
# Declared local so the status does not leak into the global scope.
3102 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3103 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3104 awk '/^status/ { print $2 }')
3105 [ "$cur_status" == "completed" ] ||
3106 error "(4.2) OST${k} Expect 'completed', but got '$cur_status'"
# Two repaired orphans are expected (presumably one per PFL component).
3110 local repaired=$($SHOW_LAYOUT |
3111 awk '/^repaired_orphan/ { print $2 }')
3112 [ $repaired -eq 2 ] ||
3113 error "(5) Fail to repair crashed PFL range: $repaired"
3115 echo "Data in $DIR/$tdir/f0 should not be broken"
3116 diff $DIR/$tdir/f0 $DIR/$tdir/guard ||
3117 error "(6) Data in $DIR/$tdir/f0 is broken"
3119 echo "Write should succeed after LFSCK repairing the bad PFL range"
3120 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 ||
3121 error "(7) Write should succeed after LFSCK"
3123 run_test 18h "LFSCK can repair crashed PFL extent range"
# Top-level statement run between test definitions: passes "debug=-cache"
# to set_param on the local node (presumably removing the "cache" flag from
# the debug mask -- confirm); the command's output is discarded.
3125 $LCTL set_param debug=-cache > /dev/null
# test 19a: with lfsck_verify_pfid enabled on OST0000 and fail_loc 0x1619
# set on the local node, reading a plain file (a0) and a PFL file (a1) is
# expected to fail for both.
3128 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
3129 skip "MDS older than 2.5.55, LU-3951"
3131 check_mount_and_prep
# New files under $tdir get a single stripe starting on OST index 0.
3132 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# Parent FID verification is switched off while the files are created.
3134 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
3135 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
3137 echo "foo1" > $DIR/$tdir/a0
3138 $LFS setstripe -E 512K -S 512K -o 0 -E -1 -S 1M $DIR/$tdir/a1 ||
3139 error "(0) Fail to create PFL $DIR/$tdir/a1"
3140 echo "foo2" > $DIR/$tdir/a1
3141 echo "guard" > $DIR/$tdir/a2
3142 cancel_lru_locks osc
3144 echo "Inject failure, then client will offer wrong parent FID when read"
3145 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
3146 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
3148 #define OBD_FAIL_LFSCK_INVALID_PFID 0x1619
3149 $LCTL set_param fail_loc=0x1619
3151 echo "Read RPC with wrong parent FID should be denied"
# Negative checks: a successful read is the failure case.
3152 cat $DIR/$tdir/a0 && error "(3.1) Read a0 should be denied!"
3153 cat $DIR/$tdir/a1 && error "(3.2) Read a1 should be denied!"
3154 $LCTL set_param fail_loc=0
3156 run_test 19a "OST-object inconsistency self detect"
# test 19b: two files (plain f0 and PFL f1) are written while fail_loc
# 0x1611 is set on the OSS nodes, so their OST-objects point back to a
# non-existent MDT-object.  With lfsck_verify_pfid disabled the "repaired"
# counter must stay 0; after enabling it and reading both files the
# counter must be 2.
3159 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
3160 skip "MDS older than 2.5.55, LU-3951"
3162 check_mount_and_prep
3163 $LFS setstripe -c 1 -i 0 $DIR/$tdir
3165 echo "Inject failure stub to make the OST-object to back point to"
3166 echo "non-exist MDT-object"
3168 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
3169 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
3171 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
3172 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
3173 echo "foo1" > $DIR/$tdir/f0
3174 $LFS setstripe -E 1M -S 1M -o 0 -E 4M -S 256K $DIR/$tdir/f1 ||
3175 error "(0) Fail to create PFL $DIR/$tdir/f1"
3176 echo "foo2" > $DIR/$tdir/f1
3177 cancel_lru_locks osc
3178 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
# NOTE(review): lfsck_verify_pfid was already set to 0 above; this second
# write of 0 looks redundant -- confirm before removing.
3180 do_facet ost1 $LCTL set_param -n \
3181 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
3182 echo "Nothing should be fixed since self detect and repair is disabled"
# The counter is the second field of the line starting with "repaired"
# in the lfsck_verify_pfid parameter output.
3183 local repaired=$(do_facet ost1 $LCTL get_param -n \
3184 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
3185 awk '/^repaired/ { print $2 }')
3186 [ $repaired -eq 0 ] ||
3187 error "(1) Expected 0 repaired, but got $repaired"
3189 echo "Read RPC with right parent FID should be accepted,"
3190 echo "and cause parent FID on OST to be fixed"
3192 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
3193 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
3195 cat $DIR/$tdir/f0 || error "(2.1) Read f0 should not be denied!"
3196 cat $DIR/$tdir/f1 || error "(2.2) Read f1 should not be denied!"
# Both f0 and f1 were read, so two repairs are expected; the error text
# states the same number as the check.
3198 repaired=$(do_facet ost1 $LCTL get_param -n \
3199 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
3200 awk '/^repaired/ { print $2 }')
3201 [ $repaired -eq 2 ] ||
3202 error "(3) Expected 2 repaired, but got $repaired"
3204 run_test 19b "OST-object inconsistency self repair"
# Expected `lfs getstripe -L` output, compared by tests 20a and 20b:
# one value for a layout flagged as having a hole, one for a normal layout.
# NOTE(review): 40000001 presumably is the hole flag 0x40000000 combined
# with the raid0 pattern bit -- confirm.
3206 PATTERN_WITH_HOLE="40000001"
3207 PATTERN_WITHOUT_HOLE="raid0"
# test 20a: files f0..f2 (plus f3 with more than 2 OSTs) lose their
# MDT-object, and f1..f3 additionally lose one OST-object each (selected
# through fail_val).  After layout LFSCK the files must show up as
# ${fid}-R-0 under .lustre/lost+found/MDT0000; f0 with a normal layout,
# the others with the "hole" pattern.  Each recovered file is then checked
# for stripe count, size, read/write behaviour, chown, touch and unlink.
3210 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
3211 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
3212 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
3213 skip "MDS older than 2.5.55, LU-4887"
3216 echo "The target MDT-object and some of its OST-object are lost."
3217 echo "The LFSCK should find out the left OST-objects and re-create"
3218 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
3219 echo "with the partial OST-objects (LOV EA hole)."
3221 echo "New client can access the file with LOV EA hole via normal"
3222 echo "system tools or commands without crash the system."
3224 echo "For old client, even though it cannot access the file with"
3225 echo "LOV EA hole, it should not cause the system crash."
3228 check_mount_and_prep
3229 $LFS mkdir -i 0 $DIR/$tdir/a1
# 3 stripes when more than 2 OSTs are available, otherwise 2 stripes.
3230 if [ $OSTCOUNT -gt 2 ]; then
3231 $LFS setstripe -c 3 -i 0 -S 1M $DIR/$tdir/a1
3234 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a1
3238 # 256 blocks on the stripe0.
3239 # 1 block on the stripe1 for 2 OSTs case.
3240 # 256 blocks on the stripe1 for other cases.
3241 # 1 block on the stripe2 if OSTs > 2
3242 dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
3243 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
3244 dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount
# The FIDs name the lost+found entries inspected after the LFSCK run.
3246 local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
3247 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
3248 local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)
3251 $LFS getstripe $DIR/$tdir/a1/f0
3253 $LFS getstripe $DIR/$tdir/a1/f1
3255 $LFS getstripe $DIR/$tdir/a1/f2
3257 if [ $OSTCOUNT -gt 2 ]; then
3258 dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
# Declared local like fid0..fid2 so it does not leak into the global scope.
3259 local fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
3261 $LFS getstripe $DIR/$tdir/a1/f3
3264 cancel_lru_locks osc
3266 echo "Inject failure..."
3267 echo "To simulate f0 lost MDT-object"
3268 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
3269 do_facet mds1 $LCTL set_param fail_loc=0x1616
3270 rm -f $DIR/$tdir/a1/f0
3272 echo "To simulate f1 lost MDT-object and OST-object0"
3273 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
3274 do_facet mds1 $LCTL set_param fail_loc=0x161a
3275 rm -f $DIR/$tdir/a1/f1
# fail_val selects which OST-object is lost for the following unlinks.
3277 echo "To simulate f2 lost MDT-object and OST-object1"
3278 do_facet mds1 $LCTL set_param fail_val=1
3279 rm -f $DIR/$tdir/a1/f2
3281 if [ $OSTCOUNT -gt 2 ]; then
3282 echo "To simulate f3 lost MDT-object and OST-object2"
3283 do_facet mds1 $LCTL set_param fail_val=2
3284 rm -f $DIR/$tdir/a1/f3
3287 umount_client $MOUNT
3290 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
3292 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3293 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
3295 for k in $(seq $MDSCOUNT); do
3296 # The LFSCK status query interval is 30 seconds. For the case
3297 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3298 # time to guarantee the status sync up.
3299 wait_update_facet mds${k} "$LCTL get_param -n \
3300 mdd.$(facet_svc mds${k}).lfsck_layout |
3301 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3302 error "(2) MDS${k} is not the expected 'completed'"
3305 for k in $(seq $OSTCOUNT); do
3306 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3307 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3308 awk '/^status/ { print $2 }')
3309 [ "$cur_status" == "completed" ] ||
3310 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan count: 9 with more than 2 OSTs, otherwise 4.
3313 local repaired=$(do_facet mds1 $LCTL get_param -n \
3314 mdd.$(facet_svc mds1).lfsck_layout |
3315 awk '/^repaired_orphan/ { print $2 }')
3316 if [ $OSTCOUNT -gt 2 ]; then
3317 [ $repaired -eq 9 ] ||
3318 error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
3320 [ $repaired -eq 4 ] ||
3321 error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
3324 mount_client $MOUNT || error "(5.0) Fail to start client!"
# NOTE(review): not referenced again in the visible part of this test;
# the pattern checks below use PATTERN_WITH_HOLE instead.
3326 LOV_PATTERN_F_HOLE=0x40000000
3329 # ${fid0}-R-0 is the old f0
3331 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
3332 echo "Check $name, which is the old f0"
3334 $LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"
3336 local pattern=$($LFS getstripe -L $name)
3337 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3338 error "(5.2) NOT expect pattern flag hole, but got $pattern"
3340 local stripes=$($LFS getstripe -c $name)
3341 if [ $OSTCOUNT -gt 2 ]; then
3342 [ $stripes -eq 3 ] ||
3343 error "(5.3.1) expect the stripe count is 3, but got $stripes"
3345 [ $stripes -eq 2 ] ||
3346 error "(5.3.2) expect the stripe count is 2, but got $stripes"
3349 local size=$(stat $name | awk '/Size:/ { print $2 }')
3350 [ $size -eq $((4096 * $bcount)) ] ||
3351 error "(5.4) expect the size $((4096 * $bcount)), but got $size"
3353 cat $name > /dev/null || error "(5.5) cannot read $name"
3355 echo "dummy" >> $name || error "(5.6) cannot write $name"
3357 chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"
3359 touch $name || error "(5.8) cannot touch $name"
3361 rm -f $name || error "(5.9) cannot unlink $name"
3364 # ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
3366 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3367 if [ $OSTCOUNT -gt 2 ]; then
3368 echo "Check $name, it contains the old f1's stripe1 and stripe2"
3370 echo "Check $name, it contains the old f1's stripe1"
3373 $LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"
3375 pattern=$($LFS getstripe -L $name)
3376 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3377 error "(6.2) expect pattern flag hole, but got $pattern"
3379 stripes=$($LFS getstripe -c $name)
3380 if [ $OSTCOUNT -gt 2 ]; then
3381 [ $stripes -eq 3 ] ||
3382 error "(6.3.1) expect the stripe count is 3, but got $stripes"
3384 [ $stripes -eq 2 ] ||
3385 error "(6.3.2) expect the stripe count is 2, but got $stripes"
3388 size=$(stat $name | awk '/Size:/ { print $2 }')
3389 [ $size -eq $((4096 * $bcount)) ] ||
3390 error "(6.4) expect the size $((4096 * $bcount)), but got $size"
3392 cat $name > /dev/null && error "(6.5) normal read $name should fail"
# dd keeps going after read errors (conv=noerror); the number of
# "Input/output error" lines it prints is the number of failed blocks.
3394 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3395 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3398 [ $failures -eq 256 ] ||
3399 error "(6.6) expect 256 IO failures, but get $failures"
3401 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3402 [ $size -eq $((4096 * $bcount)) ] ||
3403 error "(6.7) expect the size $((4096 * $bcount)), but got $size"
3405 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
3406 error "(6.8) write to the LOV EA hole should fail"
3408 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
3409 error "(6.9) write to normal stripe should NOT fail"
3411 echo "foo" >> $name && error "(6.10) append write $name should fail"
3413 chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"
3415 touch $name || error "(6.12) cannot touch $name"
3417 rm -f $name || error "(6.13) cannot unlink $name"
3420 # ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
3422 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
3423 if [ $OSTCOUNT -gt 2 ]; then
3424 echo "Check $name, it contains the old f2's stripe0 and stripe2"
3426 echo "Check $name, it contains the old f2's stripe0"
3429 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
3431 pattern=$($LFS getstripe -L $name)
3432 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3433 error "(7.2) expect pattern flag hole, but got $pattern"
3435 stripes=$($LFS getstripe -c $name)
3436 size=$(stat $name | awk '/Size:/ { print $2 }')
# The x.1 checks below are for more than 2 OSTs, the x.2 checks for
# exactly 2 OSTs.
3437 if [ $OSTCOUNT -gt 2 ]; then
3438 [ $stripes -eq 3 ] ||
3439 error "(7.3.1) expect the stripe count is 3, but got $stripes"
3441 [ $size -eq $((4096 * $bcount)) ] ||
3442 error "(7.4.1) expect size $((4096 * $bcount)), but got $size"
3444 cat $name > /dev/null &&
3445 error "(7.5.1) normal read $name should fail"
3447 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3448 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3450 [ $failures -eq 256 ] ||
3451 error "(7.6) expect 256 IO failures, but get $failures"
3453 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3454 [ $size -eq $((4096 * $bcount)) ] ||
3455 error "(7.7) expect the size $((4096 * $bcount)), but got $size"
3457 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3458 seek=300 && error "(7.8.0) write to the LOV EA hole should fail"
3460 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
3461 error "(7.8.1) write to normal stripe should NOT fail"
3463 echo "foo" >> $name &&
3464 error "(7.8.3) append write $name should fail"
3466 chown $RUNAS_ID:$RUNAS_GID $name ||
3467 error "(7.9.1) cannot chown on $name"
3469 touch $name || error "(7.10.1) cannot touch $name"
3471 [ $stripes -eq 2 ] ||
3472 error "(7.3.2) expect the stripe count is 2, but got $stripes"
3475 [ $size -eq $((4096 * (256 + 0))) ] ||
3476 error "(7.4.2) expect the size $((4096 * 256)), but got $size"
3478 cat $name > /dev/null &&
3479 error "(7.5.2) normal read $name should fail"
3481 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3482 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3483 [ $failures -eq 256 ] ||
3484 error "(7.6.2) expect 256 IO failures, but get $failures"
3487 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3488 [ $size -eq $((4096 * $bcount)) ] ||
3489 error "(7.7.2) expect the size $((4096 * $bcount)), got $size"
3491 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3492 seek=256 && error "(7.8.2) write to the LOV EA hole should fail"
3494 chown $RUNAS_ID:$RUNAS_GID $name ||
3495 error "(7.9.2) cannot chown on $name"
3497 touch $name || error "(7.10.2) cannot touch $name"
3500 rm -f $name || error "(7.11) cannot unlink $name"
# f3 only exists when more than 2 OSTs are configured.
3502 [ $OSTCOUNT -le 2 ] && return
3505 # ${fid3}-R-0 should contains the old f3's stripe0 and stripe1
3507 name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
3508 echo "Check $name, which contains the old f3's stripe0 and stripe1"
3510 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
3512 pattern=$($LFS getstripe -L $name)
3513 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3514 error "(8.2) expect pattern flag hole, but got $pattern"
3516 stripes=$($LFS getstripe -c $name)
3517 [ $stripes -eq 3 ] ||
3518 error "(8.3) expect the stripe count is 3, but got $stripes"
3520 size=$(stat $name | awk '/Size:/ { print $2 }')
3522 [ $size -eq $((4096 * (256 + 256 + 0))) ] ||
3523 error "(8.4) expect the size $((4096 * 512)), but got $size"
3525 cat $name > /dev/null &&
3526 error "(8.5) normal read $name should fail"
3528 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3529 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3531 [ $failures -eq 256 ] ||
3532 error "(8.6) expect 256 IO failures, but get $failures"
3535 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3536 [ $size -eq $((4096 * $bcount)) ] ||
3537 error "(8.7) expect the size $((4096 * $bcount)), but got $size"
3539 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3540 seek=512 && error "(8.8) write to the LOV EA hole should fail"
3542 chown $RUNAS_ID:$RUNAS_GID $name ||
3543 error "(8.9) cannot chown on $name"
3545 touch $name || error "(8.10) cannot touch $name"
3547 rm -f $name || error "(8.11) cannot unlink $name"
3549 run_test 20a "Handle the orphan with dummy LOV EA slot properly"
# test 20b: PFL variant of test 20a.  Three two-component files (2 stripes
# per component) lose their MDT-object; f1 and f2 additionally lose one
# OST-object per component (selected through fail_val).  After layout
# LFSCK, 8 repaired orphans are expected on mds1 and each file must be
# found as ${fid}-R-0 under .lustre/lost+found/MDT0000 with the expected
# pattern, stripe counts, component extents, size and I/O behaviour.
# Error tags in sections 8 and 9 are unique, and the (9.8) message states
# the same count as its check.
3552 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
3553 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
3554 (( $MDS1_VERSION > $(version_code 2.5.55) )) ||
3555 skip "MDS older than 2.5.55, LU-4887"
3558 echo "The target MDT-object and some of its OST-object are lost."
3559 echo "The LFSCK should find out the left OST-objects and re-create"
3560 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
3561 echo "with the partial OST-objects (LOV EA hole)."
3563 echo "New client can access the file with LOV EA hole via normal"
3564 echo "system tools or commands without crash the system - PFL case."
3567 check_mount_and_prep
# Component 1 covers [0, 2M), component 2 runs to EOF; 2 stripes of 1M each.
3569 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f0 ||
3570 error "(0) Fail to create PFL file $DIR/$tdir/f0"
3571 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f1 ||
3572 error "(1) Fail to create PFL file $DIR/$tdir/f1"
3573 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f2 ||
3574 error "(2) Fail to create PFL file $DIR/$tdir/f2"
# 769 blocks of 4096 bytes: 3M plus one block.
3576 local bcount=$((256 * 3 + 1))
3578 dd if=/dev/zero of=$DIR/$tdir/f0 bs=4096 count=$bcount
3579 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4096 count=$bcount
3580 dd if=/dev/zero of=$DIR/$tdir/f2 bs=4096 count=$bcount
# The FIDs name the lost+found entries inspected after the LFSCK run.
3582 local fid0=$($LFS path2fid $DIR/$tdir/f0)
3583 local fid1=$($LFS path2fid $DIR/$tdir/f1)
3584 local fid2=$($LFS path2fid $DIR/$tdir/f2)
3587 $LFS getstripe $DIR/$tdir/f0
3589 $LFS getstripe $DIR/$tdir/f1
3591 $LFS getstripe $DIR/$tdir/f2
3593 cancel_lru_locks mdc
3594 cancel_lru_locks osc
3596 echo "Inject failure..."
3597 echo "To simulate f0 lost MDT-object"
3598 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
3599 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1616
3602 echo "To simulate the case of f1 lost MDT-object and "
3603 echo "the first OST-object in each PFL component"
3604 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
3605 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161a
3608 echo "To simulate the case of f2 lost MDT-object and "
3609 echo "the second OST-object in each PFL component"
3610 do_facet $SINGLEMDS $LCTL set_param fail_val=1
3615 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
3617 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3618 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
3620 for k in $(seq $MDSCOUNT); do
3621 # The LFSCK status query interval is 30 seconds. For the case
3622 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3623 # time to guarantee the status sync up.
3624 wait_update_facet mds${k} "$LCTL get_param -n \
3625 mdd.$(facet_svc mds${k}).lfsck_layout |
3626 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3627 error "(4) MDS${k} is not the expected 'completed'"
3630 for k in $(seq $OSTCOUNT); do
3631 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3632 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3633 awk '/^status/ { print $2 }')
3634 [ "$cur_status" == "completed" ] ||
3635 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
3638 local repaired=$(do_facet mds1 $LCTL get_param -n \
3639 mdd.$(facet_svc mds1).lfsck_layout |
3640 awk '/^repaired_orphan/ { print $2 }')
3641 [ $repaired -eq 8 ] ||
3642 error "(6) Expect 8 fixed on mds1, but got: $repaired"
3645 # ${fid0}-R-0 is the old f0
3647 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
3648 echo "Check $name, which is the old f0"
3650 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
# -I1 / -I2 restrict getstripe to component 1 / component 2.
3652 local pattern=$($LFS getstripe -L -I1 $name)
3653 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3654 error "(7.2.1) NOT expect pattern flag hole, but got $pattern"
3656 pattern=$($LFS getstripe -L -I2 $name)
3657 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3658 error "(7.2.2) NOT expect pattern flag hole, but got $pattern"
3660 local stripes=$($LFS getstripe -c -I1 $name)
3661 [ $stripes -eq 2 ] ||
3662 error "(7.3.1) expect 2 stripes, but got $stripes"
3664 stripes=$($LFS getstripe -c -I2 $name)
3665 [ $stripes -eq 2 ] ||
3666 error "(7.3.2) expect 2 stripes, but got $stripes"
3668 local e_start=$($LFS getstripe -I1 $name |
3669 awk '/lcme_extent.e_start:/ { print $2 }')
3670 [ $e_start -eq 0 ] ||
3671 error "(7.4.1) expect the COMP1 start at 0, got $e_start"
3673 local e_end=$($LFS getstripe -I1 $name |
3674 awk '/lcme_extent.e_end:/ { print $2 }')
3675 [ $e_end -eq 2097152 ] ||
3676 error "(7.4.2) expect the COMP1 end at 2097152, got $e_end"
3678 e_start=$($LFS getstripe -I2 $name |
3679 awk '/lcme_extent.e_start:/ { print $2 }')
3680 [ $e_start -eq 2097152 ] ||
3681 error "(7.5.1) expect the COMP2 start at 2097152, got $e_start"
3683 e_end=$($LFS getstripe -I2 $name |
3684 awk '/lcme_extent.e_end:/ { print $2 }')
3685 [ "$e_end" = "EOF" ] ||
3686 error "(7.5.2) expect the COMP2 end at (EOF), got $e_end"
3688 local size=$(stat $name | awk '/Size:/ { print $2 }')
3689 [ $size -eq $((4096 * $bcount)) ] ||
3690 error "(7.6) expect the size $((4096 * $bcount)), but got $size"
3692 cat $name > /dev/null || error "(7.7) cannot read $name"
3694 echo "dummy" >> $name || error "(7.8) cannot write $name"
3696 chown $RUNAS_ID:$RUNAS_GID $name || error "(7.9) cannot chown on $name"
3698 touch $name || error "(7.10) cannot touch $name"
3700 rm -f $name || error "(7.11) cannot unlink $name"
3703 # ${fid1}-R-0 contains the old f1's second stripe in each COMP
3705 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3706 echo "Check $name, it contains f1's second OST-object in each COMP"
3708 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
3710 pattern=$($LFS getstripe -L -I1 $name)
3711 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3712 error "(8.2.1) expect pattern flag hole, but got $pattern"
3714 pattern=$($LFS getstripe -L -I2 $name)
3715 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3716 error "(8.2.2) expect pattern flag hole, but got $pattern"
3718 stripes=$($LFS getstripe -c -I1 $name)
3719 [ $stripes -eq 2 ] ||
3720 error "(8.3.1) expect 2 stripes, but got $stripes"
3722 stripes=$($LFS getstripe -c -I2 $name)
3723 [ $stripes -eq 2 ] ||
3724 error "(8.3.2) expect 2 stripes, but got $stripes"
3726 e_start=$($LFS getstripe -I1 $name |
3727 awk '/lcme_extent.e_start:/ { print $2 }')
3728 [ $e_start -eq 0 ] ||
3729 error "(8.4.1) expect the COMP1 start at 0, got $e_start"
3731 e_end=$($LFS getstripe -I1 $name |
3732 awk '/lcme_extent.e_end:/ { print $2 }')
3733 [ $e_end -eq 2097152 ] ||
3734 error "(8.4.2) expect the COMP1 end at 2097152, got $e_end"
3736 e_start=$($LFS getstripe -I2 $name |
3737 awk '/lcme_extent.e_start:/ { print $2 }')
3738 [ $e_start -eq 2097152 ] ||
3739 error "(8.5.1) expect the COMP2 start at 2097152, got $e_start"
3741 e_end=$($LFS getstripe -I2 $name |
3742 awk '/lcme_extent.e_end:/ { print $2 }')
3743 [ "$e_end" = "EOF" ] ||
3744 error "(8.5.2) expect the COMP2 end at (EOF), got $e_end"
3746 size=$(stat $name | awk '/Size:/ { print $2 }')
3747 [ $size -eq $((4096 * $bcount)) ] ||
3748 error "(8.6) expect the size $((4096 * $bcount)), but got $size"
3750 cat $name > /dev/null && error "(8.7) normal read $name should fail"
# dd keeps going after read errors (conv=noerror); the number of
# "Input/output error" lines it prints is the number of failed blocks.
3752 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3753 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3755 # The first stripe in each COMP was lost
3756 [ $failures -eq 512 ] ||
3757 error "(8.8) expect 512 IO failures, but get $failures"
3759 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3760 [ $size -eq $((4096 * $bcount)) ] ||
3761 error "(8.9) expect the size $((4096 * $bcount)), but got $size"
3763 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
3764 error "(8.10) write to the LOV EA hole should fail"
3766 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
3767 error "(8.11) write to normal stripe should NOT fail"
3769 echo "foo" >> $name && error "(8.12) append write $name should fail"
3771 chown $RUNAS_ID:$RUNAS_GID $name || error "(8.13) cannot chown on $name"
3773 touch $name || error "(8.14) cannot touch $name"
3775 rm -f $name || error "(8.15) cannot unlink $name"
3778 # ${fid2}-R-0 contains the old f2's first stripe in each COMP
3780 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
3781 echo "Check $name, it contains f2's first stripe in each COMP"
3783 $LFS getstripe -v $name || error "(9.1) cannot getstripe on $name"
3785 pattern=$($LFS getstripe -L -I1 $name)
3786 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3787 error "(9.2.1) expect pattern flag hole, but got $pattern"
3789 pattern=$($LFS getstripe -L -I2 $name)
3790 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3791 error "(9.2.2) expect pattern flag hole, but got $pattern"
3793 stripes=$($LFS getstripe -c -I1 $name)
3794 [ $stripes -eq 2 ] ||
3795 error "(9.3.1) expect 2 stripes, but got $stripes"
3797 stripes=$($LFS getstripe -c -I2 $name)
3798 [ $stripes -eq 2 ] ||
3799 error "(9.3.2) expect 2 stripes, but got $stripes"
3801 e_start=$($LFS getstripe -I1 $name |
3802 awk '/lcme_extent.e_start:/ { print $2 }')
3803 [ $e_start -eq 0 ] ||
3804 error "(9.4.1) expect the COMP1 start at 0, got $e_start"
3806 e_end=$($LFS getstripe -I1 $name |
3807 awk '/lcme_extent.e_end:/ { print $2 }')
3808 [ $e_end -eq 2097152 ] ||
3809 error "(9.4.2) expect the COMP1 end at 2097152, got $e_end"
3811 e_start=$($LFS getstripe -I2 $name |
3812 awk '/lcme_extent.e_start:/ { print $2 }')
3813 [ $e_start -eq 2097152 ] ||
3814 error "(9.5.1) expect the COMP2 start at 2097152, got $e_start"
3816 e_end=$($LFS getstripe -I2 $name |
3817 awk '/lcme_extent.e_end:/ { print $2 }')
3818 [ "$e_end" = "EOF" ] ||
3819 error "(9.5.2) expect the COMP2 end at (EOF), got $e_end"
3821 size=$(stat $name | awk '/Size:/ { print $2 }')
3822 # The second stripe in COMP was lost, so we do not know there
3823 # have ever been some data before. 'stat' will regard it as
3824 # no data on the lost stripe.
3826 [ $size -eq $((4096 * $bcount)) ] ||
3827 error "(9.6) expect size $((4096 * $bcount)), but got $size"
3829 cat $name > /dev/null &&
3830 error "(9.7) normal read $name should fail"
3832 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3833 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3834 [ $failures -eq 512 ] ||
3835 error "(9.8) expect 512 IO failures, but get $failures"
3837 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3838 # The second stripe in COMP was lost, so we do not know there
3839 # have ever been some data before. Since 'dd' skip failure,
3840 # it will regard the lost stripe contains data.
3842 [ $size -eq $((4096 * $bcount)) ] ||
3843 error "(9.9) expect the size $((4096 * $bcount)), but got $size"
3845 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3846 seek=300 && error "(9.10) write to the LOV EA hole should fail"
3848 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
3849 error "(9.11) write to normal stripe should NOT fail"
3851 echo "foo" >> $name &&
3852 error "(9.12) append write $name should fail"
3854 chown $RUNAS_ID:$RUNAS_GID $name ||
3855 error "(9.13) cannot chown on $name"
3857 touch $name || error "(9.14) cannot touch $name"
3859 rm -f $name || error "(9.15) cannot unlink $name"
3861 run_test 20b "Handle the orphan with dummy LOV EA slot properly - PFL case"
# test 21: lfsck_start without a "-t" type argument is expected to start
# both the namespace and the layout component; each must report
# 'scanning-phase1' right after the start, and lfsck_stop must succeed.
3864 (( $MDS1_VERSION > $(version_code 2.5.59) )) ||
3865 skip "MDS older than 2.5.59, LU-4887"
3867 check_mount_and_prep
3868 createmany -o $DIR/$tdir/f 100 || error "(0) Fail to create 100 files"
3870 echo "Start all LFSCK components by default (-s 1)"
# NOTE(review): "-s 1" presumably throttles the scan so that it is still
# in phase 1 when the status is sampled below -- confirm.
3871 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -s 1 -r ||
3872 error "Fail to start LFSCK"
3874 echo "namespace LFSCK should be in 'scanning-phase1' status"
3875 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
3876 [ "$STATUS" == "scanning-phase1" ] ||
3877 error "Expect namespace 'scanning-phase1', but got '$STATUS'"
3879 echo "layout LFSCK should be in 'scanning-phase1' status"
3880 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
3881 [ "$STATUS" == "scanning-phase1" ] ||
3882 error "Expect layout 'scanning-phase1', but got '$STATUS'"
3884 echo "Stop all LFSCK components by default"
3885 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 ||
3886 error "Fail to stop LFSCK"
3888 run_test 21 "run all LFSCK components by default"
# test 22a: directory foo/dummy is created on MDT0 while fail_loc 0x161e
# is set on $SINGLEMDS, then the "guard" directory is removed.  Namespace
# LFSCK on all targets is expected to report exactly one
# unmatched_pairs_repaired, after which listing foo/dummy must work.
3891 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3892 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
3893 skip "MDS older than 2.6.50, LU-5511"
# The inner quotes around ".." are escaped so they are printed literally
# instead of ending and reopening the double-quoted string.
3896 echo "The parent_A references the child directory via some name entry,"
3897 echo "but the child directory back references another parent_B via its"
3898 echo "\"..\" name entry. The parent_B does not exist. Then the namespace"
3899 echo "LFSCK will repair the child directory's \"..\" name entry."
3902 check_mount_and_prep
3904 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
3905 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
3907 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
3908 echo "The dummy's dotdot name entry references the guard."
3909 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
3910 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
3911 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
3912 error "(3) Fail to mkdir on MDT0"
3913 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Removing "guard" leaves dummy's ".." pointing at a non-existent parent.
3915 rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"
3917 echo "Trigger namespace LFSCK to repair unmatched pairs"
3918 $START_NAMESPACE -A -r ||
3919 error "(5) Fail to start LFSCK for namespace"
3921 wait_all_targets_blocked namespace completed 6
3923 local repaired=$($SHOW_NAMESPACE |
3924 awk '/^unmatched_pairs_repaired/ { print $2 }')
3925 [ $repaired -eq 1 ] ||
3926 error "(7) Fail to repair unmatched pairs: $repaired"
3928 echo "'ls' should success after namespace LFSCK repairing"
3929 ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
3930 error "(8) ls should success."
3932 run_test 22a "LFSCK can repair unmatched pairs (1)"
# Unmatched parent/child pair, case 2: as in 22a the child foo/dummy is
# created under fail_loc 0x161e (OBD_FAIL_LFSCK_BAD_PARENT), but the "guard"
# directory is kept. fid2path on the child must not resolve to its real path
# before the namespace LFSCK and must resolve to it afterwards.
test_22b() {
	[ "$MDSCOUNT" -lt 2 ] && skip "needs >= 2 MDTs" && return
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5511"

	# The quotes around ".." are escaped so that they really get printed.
	echo "The parent_A references the child directory via the name entry_B,"
	echo "but the child directory back references another parent_C via its"
	echo "\"..\" name entry. The parent_C exists, but there is no the name"
	echo "entry_B under the parent_C. Then the namespace LFSCK will repair"
	echo "the child directory's \"..\" name entry and its linkEA."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
	$LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"

	echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
	echo "and bad linkEA. The dummy's dotdot name entry references the"
	echo "guard. The dummy's linkEA references a non-exist name entry."
	#define OBD_FAIL_LFSCK_BAD_PARENT	0x161e
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
	$LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
		error "(3) Fail to mkdir on MDT0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	local dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)

	echo "fid2path should NOT work on the dummy's FID $dummyfid"
	# Assigned as part of 'local' on purpose: a failing fid2path is the
	# expected outcome here and must not end the test by itself.
	local name_before=$($LFS fid2path $DIR $dummyfid)
	[ "$name_before" != "$DIR/$tdir/foo/dummy" ] ||
		error "(4) fid2path works unexpectedly."

	echo "Trigger namespace LFSCK to repair unmatched pairs"
	$START_NAMESPACE -A -r ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^unmatched_pairs_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(7) Fail to repair unmatched pairs: $repaired"

	echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
	local name_after=$($LFS fid2path $DIR $dummyfid)
	[ "$name_after" == "$DIR/$tdir/foo/dummy" ] ||
		error "(8) fid2path does not work"
}
run_test 22b "LFSCK can repair unmatched pairs (2)"
# Dangling name entry, case 1: d0/d1 is removed while fail_loc 0x1620
# (OBD_FAIL_LFSCK_DANGLING2) is set on mds2. A plain namespace LFSCK run must
# count one dangling entry while 'ls' keeps failing; a second run started
# with -C must make 'ls' on the entry succeed.
test_23a() {
	if [ $MDSCOUNT -lt 2 ]; then
		skip "needs >= 2 MDTs" && return
	fi
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5512"

	echo "The name entry is there, but the MDT-object for such name "
	echo "entry does not exist. The namespace LFSCK should find out "
	echo "and repair the inconsistency as required."

	check_mount_and_prep

	local parent=$DIR/$tdir/d0
	local victim=$parent/d1

	$LFS mkdir -i 0 $parent || error "(1) Fail to mkdir d0 on MDT0"
	$LFS mkdir -i 1 $victim || error "(2) Fail to mkdir d1 on MDT1"

	echo "Inject failure stub on MDT1 to simulate dangling name entry"
	#define OBD_FAIL_LFSCK_DANGLING2	0x1620
	do_facet mds2 $LCTL set_param fail_loc=0x1620
	rmdir $victim || error "(3) Fail to rmdir d1"
	do_facet mds2 $LCTL set_param fail_loc=0

	echo "'ls' should fail because of dangling name entry"
	if ls -ail $victim > /dev/null 2>&1; then
		error "(4) ls should fail."
	fi

	echo "Trigger namespace LFSCK to find out dangling name entry"
	$START_NAMESPACE -A -r ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	local fixed=$($SHOW_NAMESPACE |
		      awk '/^dangling_repaired/ { print $2 }')
	[ "$fixed" -eq 1 ] ||
		error "(7) Fail to repair dangling name entry: $fixed"

	echo "'ls' should fail because not re-create MDT-object by default"
	if ls -ail $victim > /dev/null 2>&1; then
		error "(8) ls should fail."
	fi

	echo "Trigger namespace LFSCK again to repair dangling name entry"
	$START_NAMESPACE -A -r -C ||
		error "(9) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 10

	fixed=$($SHOW_NAMESPACE |
		awk '/^dangling_repaired/ { print $2 }')
	[ "$fixed" -eq 1 ] ||
		error "(11) Fail to repair dangling name entry: $fixed"

	echo "'ls' should success after namespace LFSCK repairing"
	ls -ail $victim > /dev/null || error "(12) ls should success."
}
run_test 23a "LFSCK can repair dangling name entry (1)"
# Dangling name entry, case 2: the hard link "foo" to f0 is created while
# fail_loc 0x1621 (OBD_FAIL_LFSCK_DANGLING3) is set with fail_val = OID of
# f1, and f1 is then unlinked. After a namespace LFSCK started with -C the
# test expects one dangling_repaired, one multiple_linked_repaired, and
# "foo" to contain f0's data again.
test_23b() {
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5512"

	echo "The objectA has multiple hard links, one of them corresponding"
	echo "to the name entry_B. But there is something wrong for the name"
	echo "entry_B and cause entry_B to references non-exist object_C."
	echo "In the first-stage scanning, the LFSCK will think the entry_B"
	echo "as dangling, and re-create the lost object_C. When the LFSCK"
	echo "comes to the second-stage scanning, it will find that the"
	echo "former re-creating object_C is not proper, and will try to"
	echo "replace the object_C with the real object_A."

	check_mount_and_prep

	local dir=$DIR/$tdir/d0

	$LFS mkdir -i 0 $dir || error "(1) Fail to mkdir d0 on MDT0"
	$LFS path2fid $dir

	createmany -o $dir/t 10 || error "(1.5) Fail to creatmany"

	echo "dummy" > $dir/f0 || error "(2) Fail to touch on MDT0"
	$LFS path2fid $dir/f0

	echo "dead" > $dir/f1 || error "(3) Fail to touch on MDT0"
	$LFS path2fid $dir/f1

	local seq_f0=$($LFS path2fid $dir/f0 | awk -F':' '{print $1}')
	local seq_f1=$($LFS path2fid $dir/f1 | awk -F':' '{print $1}')

	if [ "$seq_f0" != "$seq_f1" ]; then
		# To guarantee that the f0 and f1 are in the same FID seq
		rm -f $dir/f0 ||
			error "(3.1) Fail to unlink $dir/f0"
		echo "dummy" > $dir/f0 ||
			error "(3.2) Fail to touch on MDT0"
		$LFS path2fid $dir/f0
	fi

	# fail_val is given the OID of f1 in decimal form.
	local oid=$($LFS path2fid $dir/f1 | awk -F':' '{print $2}')
	oid=$(printf %d $oid)

	echo "Inject failure stub on MDT0 to simulate dangling name entry"
	#define OBD_FAIL_LFSCK_DANGLING3	0x1621
	do_facet $SINGLEMDS $LCTL set_param fail_val=$oid fail_loc=0x1621
	ln $dir/f0 $dir/foo || error "(4) Fail to hard link"
	do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0

	# If there is creation after the dangling injection, it may re-use
	# the just released local object (inode) that is referenced by the
	# dangling name entry. It will fail the dangling injection.
	# So before deleting the target object for the dangling name entry,
	# remove some other objects to avoid the target object being reused
	# by some potential creations. LU-7429
	unlinkmany $dir/t 10 || error "(5.0) Fail to unlinkmany"

	rm -f $dir/f1 || error "(5) Fail to unlink $dir/f1"

	echo "'ls' should fail because of dangling name entry"
	if ls -ail $dir/foo > /dev/null 2>&1; then
		error "(6) ls should fail."
	fi

	echo "Trigger namespace LFSCK to find out dangling name entry"
	$START_NAMESPACE -r -C ||
		error "(7) Fail to start LFSCK for namespace"

	local query="$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'"
	wait_update_facet $SINGLEMDS "$query" "completed" 32 ||
		error "(8) unexpected status"

	local fixed=$($SHOW_NAMESPACE |
		      awk '/^dangling_repaired/ { print $2 }')
	[ "$fixed" -eq 1 ] ||
		error "(9) Fail to repair dangling name entry: $fixed"

	fixed=$($SHOW_NAMESPACE |
		awk '/^multiple_linked_repaired/ { print $2 }')
	[ "$fixed" -eq 1 ] ||
		error "(10) Fail to drop the former created object: $fixed"

	local content=$(cat $dir/foo)
	[[ "$content" == "dummy" ]] ||
		error "(11) The $dir/foo is not recovered: $content"
}
run_test 23b "LFSCK can repair dangling name entry (2)"
# Helper for test_23c: clear the injected fail_loc/fail_val, wait for the
# namespace LFSCK on $SINGLEMDS to report "completed", then turn the full
# debug logging off again.
# NOTE(review): the helper's name and its call sites inside test_23c were
# restored from context (its error id "(10)" sits between "(9)" and "(11)"
# of the test) -- confirm against the upstream script.
cleanup_23c() {
	do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		error "(10) unexpected status"
	}

	stop_full_debug_logging
}

# Dangling name entry, case 3: same injection as 23b (fail_loc 0x1621 with
# fail_val = OID of f1), but the LFSCK is delayed via fail_loc 0x1602 and the
# re-created object is appended to while the scan is pending. The test then
# expects one dangling_repaired and that "foo" does NOT contain f0's data.
test_23c() {
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5512"

	echo "The objectA has multiple hard links, one of them corresponding"
	echo "to the name entry_B. But there is something wrong for the name"
	echo "entry_B and cause entry_B to references non-exist object_C."
	echo "In the first-stage scanning, the LFSCK will think the entry_B"
	echo "as dangling, and re-create the lost object_C. And then others"
	echo "modified the re-created object_C. When the LFSCK comes to the"
	echo "second-stage scanning, it will find that the former re-creating"
	echo "object_C maybe wrong and try to replace the object_C with the"
	echo "real object_A. But because object_C has been modified, so the"
	echo "LFSCK cannot replace it."

	start_full_debug_logging

	check_mount_and_prep

	# Keep the FIDs function-local instead of leaking them as globals.
	local parent_fid
	local f0_fid
	local f1_fid

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
	parent_fid="$($LFS path2fid $DIR/$tdir/d0)"
	echo "parent_fid=$parent_fid"

	createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"

	echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
	f0_fid="$($LFS path2fid $DIR/$tdir/d0/f0)"
	echo "f0_fid=$f0_fid"

	echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
	f1_fid="$($LFS path2fid $DIR/$tdir/d0/f1)"
	echo "f1_fid=$f1_fid"

	# Compare only the sequence part, i.e. everything before the first
	# ':' of each FID. "%%:*" is a shell pattern; the former "/:.*/" was
	# a regex-style pattern applied to variables that were never set.
	if [ "${f0_fid%%:*}" != "${f1_fid%%:*}" ]; then
		# To guarantee that the f0 and f1 are in the same FID seq
		rm -f $DIR/$tdir/d0/f0 ||
			error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
		echo "dummy" > $DIR/$tdir/d0/f0 ||
			error "(3.2) Fail to touch on MDT0"
		f0_fid="$($LFS path2fid $DIR/$tdir/d0/f0)"
		echo "f0_fid=$f0_fid (replaced)"
	fi

	# Second ':'-separated field of f1's FID, printed as plain data.
	local oid=$(awk -F':' '{ print $2 }' <<< "$f1_fid")

	echo "Inject failure stub on MDT0 to simulate dangling name entry"
	#define OBD_FAIL_LFSCK_DANGLING3	0x1621
	do_facet $SINGLEMDS $LCTL set_param fail_val=$oid fail_loc=0x1621
	ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
	do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0

	# If there is creation after the dangling injection, it may re-use
	# the just released local object (inode) that is referenced by the
	# dangling name entry. It will fail the dangling injection.
	# So before deleting the target object for the dangling name entry,
	# remove some other objects to avoid the target object being reused
	# by some potential creations. LU-7429
	unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"

	rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"

	echo "'ls' should fail because of dangling name entry"
	ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
		error "(6) ls should fail."

	#define OBD_FAIL_LFSCK_DELAY3		0x1602
	do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602

	echo "Trigger namespace LFSCK to find out dangling name entry"
	$START_NAMESPACE -r -C ||
		error "(7) Fail to start LFSCK for namespace"

	wait_update_facet client "stat -c%s $DIR/$tdir/d0/foo" "0" $LTIME || {
		# While unexpected by the test, it is valid for LFSCK to repair
		# the link to the original object before any data is written.
		local size=$(stat -c %s $DIR/$tdir/d0/foo)

		if [[ "$size" == "6" &&
		      "$(<$DIR/$tdir/d0/foo)" == "dummy" ]]; then
			log "LFSCK repaired file prematurely"
			cleanup_23c
			return 0
		fi

		stat $DIR/$tdir/d0/foo
		error "(8) unexpected size"
	}

	echo "data" >> $DIR/$tdir/d0/foo || error "(9) Fail to write"
	cancel_lru_locks osc

	# Release the delayed LFSCK and wait for it before reading results.
	cleanup_23c

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^dangling_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(11) Fail to repair dangling name entry: $repaired"

	local data=$(cat $DIR/$tdir/d0/foo)
	[ "$data" != "dummy" ] ||
		error "(12) The $DIR/$tdir/d0/foo should not be recovered"
}
run_test 23c "LFSCK can repair dangling name entry (3)"
# Multiple-referenced name entry: d0/dummy/foo is created and removed while
# fail_loc 0x1622 (OBD_FAIL_LFSCK_MUL_REF) is set. The namespace LFSCK must
# report one multiple_referenced_repaired and an entry named
# "<child fid>-<parent fid>-D-0" must show up under
# .lustre/lost+found/MDT0000/.
test_24() {
	[ "$MDSCOUNT" -lt 2 ] && skip "needs >= 2 MDTs" && return
	[ -n "$FILESET" ] && skip "Not functional for FILESET set"
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5513"

	echo "Two MDT-objects back reference the same name entry via their"
	echo "each own linkEA entry, but the name entry only references one"
	echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
	echo "for the MDT-object that is not recognized. If such MDT-object"
	echo "has no other linkEA entry after the removing, then the LFSCK"
	echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."

	check_mount_and_prep

	mkdir_on_mdt -i1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"

	mkdir $DIR/$tdir/d0/guard || error "(1) Fail to mkdir guard"
	$LFS path2fid $DIR/$tdir/d0/guard

	mkdir $DIR/$tdir/d0/dummy || error "(2) Fail to mkdir dummy"
	$LFS path2fid $DIR/$tdir/d0/dummy

	# Parent FID used in the orphan's name; which directory it comes
	# from depends on the MDT backend type. Declared local so it does
	# not leak into later tests, and the backend type is quoted so an
	# empty value cannot break the test expression.
	local pfid
	if [ "$mds1_FSTYPE" != ldiskfs ]; then
		pfid=$($LFS path2fid $DIR/$tdir/d0/guard)
	else
		pfid=$($LFS path2fid $DIR/$tdir/d0/dummy)
	fi

	touch $DIR/$tdir/d0/guard/foo ||
		error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
	echo "that references $DIR/$tdir/d0/guard/foo."
	echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
	echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
	echo "there with the same linkEA entry as another MDT-object"
	echo "$DIR/$tdir/d0/guard/foo has"

	#define OBD_FAIL_LFSCK_MUL_REF		0x1622
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
	mkdir_on_mdt -i0 $DIR/$tdir/d0/dummy/foo ||
		error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
	$LFS path2fid $DIR/$tdir/d0/dummy/foo
	local cfid=$($LFS path2fid $DIR/$tdir/d0/dummy/foo)
	rmdir $DIR/$tdir/d0/dummy/foo ||
		error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	echo "stat $DIR/$tdir/d0/dummy/foo should fail"
	stat $DIR/$tdir/d0/dummy/foo > /dev/null 2>&1 &&
		error "(6) stat successfully unexpectedly"

	echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
	$START_NAMESPACE -A -r ||
		error "(7) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 8

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^multiple_referenced_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
	error "(9) Fail to repair multiple referenced name entry: $repaired"

	echo "There should be an orphan under .lustre/lost+found/MDT0000/"
	[ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
		error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"

	local cname="$cfid-$pfid-D-0"
	ls -ail $MOUNT/.lustre/lost+found/MDT0000/$cname ||
		error "(11) .lustre/lost+found/MDT0000/ should not be empty"
}
run_test 24 "LFSCK can repair multiple-referenced name entry"
# Bad file type in a name entry (ldiskfs only): d0/foo is created while
# fail_loc 0x1623 (OBD_FAIL_LFSCK_BAD_TYPE) is set. The namespace LFSCK must
# report one bad_file_type_repaired and listing d0 must work afterwards.
test_25() {
	[[ $mds1_FSTYPE == ldiskfs ]] || skip "only ldiskfs fixes dirent type"
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5515"

	echo "The file type in the name entry does not match the file type"
	echo "claimed by the referenced object. Then the LFSCK will update"
	echo "the file type in the name entry."

	check_mount_and_prep

	local dir=$DIR/$tdir/d0

	$LFS mkdir -i 0 $dir || error "(1) Fail to mkdir d0"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "the file type stored in the name entry is wrong."

	#define OBD_FAIL_LFSCK_BAD_TYPE		0x1623
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
	touch $dir/foo || error "(2) Fail to touch $dir/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	echo "Trigger namespace LFSCK to repair bad file type in the name entry"
	$START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace"

	local query="$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'"
	wait_update_facet $SINGLEMDS "$query" "completed" 32 ||
		error "(4) unexpected status"

	local fixed=$($SHOW_NAMESPACE |
		      awk '/^bad_file_type_repaired/ { print $2 }')
	[ "$fixed" -eq 1 ] ||
	error "(5) Fail to repair bad file type in name entry: $fixed"

	ls -ail $dir || error "(6) Fail to 'ls' the $dir"
}
run_test 25 "LFSCK can repair bad file type in the name entry"
# Lost local name entry: d0/foo (which also has the hard link d0/dummy) is
# unlinked while fail_loc 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) is set. The
# namespace LFSCK must report one lost_dirent_repaired, and d0/foo must be
# visible again with an unchanged FID.
test_26a() {
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5516"

	echo "The local name entry back referenced by the MDT-object is lost."
	echo "The namespace LFSCK will add the missing local name entry back"
	echo "to the normal namespace."

	check_mount_and_prep

	local dir=$DIR/$tdir/d0
	local foo=$dir/foo

	$LFS mkdir -i 0 $dir || error "(1) Fail to mkdir d0"
	touch $foo || error "(2) Fail to create foo"
	local fid_before=$($LFS path2fid $foo)

	ln $foo $dir/dummy ||
		error "(3) Fail to hard link to $foo"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rm -f $foo || error "(4) Fail to unlink $foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	if ls -ail $foo > /dev/null 2>&1; then
		error "(5) 'ls' should fail"
	fi

	echo "Trigger namespace LFSCK to repair the missing remote name entry"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 7

	local fixed=$($SHOW_NAMESPACE |
		      awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$fixed" -eq 1 ] ||
		error "(8) Fail to repair lost dirent: $fixed"

	ls -ail $foo ||
		error "(9) Fail to 'ls' $foo"

	local fid_after=$($LFS path2fid $foo)
	[[ "$fid_before" == "$fid_after" ]] ||
		error "(10) foo's FID changed: $fid_before, $fid_after"
}
run_test 26a "LFSCK can add the missing local name entry back to the namespace"
# Lost remote name entry: the directory d0/foo (d0 on MDT1, foo on MDT0) is
# removed while fail_loc 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) is set. The
# namespace LFSCK must report one lost_dirent_repaired, and d0/foo must be
# visible again with an unchanged FID.
test_26b() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5516"

	echo "The remote name entry back referenced by the MDT-object is lost."
	echo "The namespace LFSCK will add the missing remote name entry back"
	echo "to the normal namespace."

	check_mount_and_prep

	local dir=$DIR/$tdir/d0
	local foo=$dir/foo

	$LFS mkdir -i 1 $dir || error "(1) Fail to mkdir d0"
	$LFS mkdir -i 0 $foo || error "(2) Fail to mkdir foo"
	local fid_before=$($LFS path2fid $foo)

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rmdir $foo || error "(3) Fail to rmdir $foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	if ls -ail $foo > /dev/null 2>&1; then
		error "(4) 'ls' should fail"
	fi

	echo "Trigger namespace LFSCK to repair the missing remote name entry"
	$START_NAMESPACE -r -A ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	local fixed=$($SHOW_NAMESPACE |
		      awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$fixed" -eq 1 ] ||
		error "(7) Fail to repair lost dirent: $fixed"

	ls -ail $foo ||
		error "(8) Fail to 'ls' $foo"

	local fid_after=$($LFS path2fid $foo)
	[[ "$fid_before" == "$fid_after" ]] ||
		error "(9) foo's FID changed: $fid_before, $fid_after"
}
run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
# Lost local parent: d0/foo and its hard link d0/dummy are unlinked while
# fail_loc 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) is set, then d0 itself is
# removed. The namespace LFSCK must report one lost_dirent_repaired and an
# entry whose name contains "-P-" must exist under
# .lustre/lost+found/MDT0000/.
test_27a() {
	[ -n "$FILESET" ] && skip "Not functional for FILESET set"
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5516"

	echo "The local parent referenced by the MDT-object linkEA is lost."
	echo "The namespace LFSCK will re-create the lost parent as orphan."

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
	ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system. And then remove"
	echo "another hard link and the parent directory."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rm -f $DIR/$tdir/d0/foo ||
		error "(4) Fail to unlink $DIR/$tdir/d0/foo"
	rm -f $DIR/$tdir/d0/dummy ||
		error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	rm -rf $DIR/$tdir/d0 || error "(5) Fail to unlink the dir d0"
	ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(6) 'ls' should fail"

	echo "Trigger namespace LFSCK to repair the lost parent"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 7

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(8) Fail to repair lost dirent: $repaired"

	echo "There should be an orphan under .lustre/lost+found/MDT0000/"
	[ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
		error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"

	ls -ail $MOUNT/.lustre/lost+found/MDT0000/

	# The pattern is quoted so that find receives it verbatim; unquoted,
	# the shell could expand it against files in the current directory.
	local cname
	cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-P-*')
	[[ -n "$cname" ]] ||
		error "(10) .lustre/lost+found/MDT0000/ should not be empty"
}
run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
# Lost remote parent: the directory d0/foo (d0 on MDT1, foo on MDT0) is
# removed while fail_loc 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) is set, then d0
# itself is removed. The namespace LFSCK must report one lost_dirent_repaired
# and an entry whose name contains "-P-" must exist under
# .lustre/lost+found/MDT0001/.
test_27b() {
	[ "$MDSCOUNT" -lt 2 ] && skip "needs >= 2 MDTs"
	[ -n "$FILESET" ] && skip "Not functional for FILESET set"
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5516"

	echo "The remote parent referenced by the MDT-object linkEA is lost."
	echo "The namespace LFSCK will re-create the lost parent as orphan."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	$LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"

	$LFS path2fid $DIR/$tdir/d0

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system. And then remove"
	echo "the parent directory."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	rmdir $DIR/$tdir/d0 || error "(4) Fail to unlink the dir d0"
	ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(5) 'ls' should fail"

	echo "Trigger namespace LFSCK to repair the missing remote name entry"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 7

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(8) Fail to repair lost dirent: $repaired"

	ls -ail $MOUNT/.lustre/lost+found/

	echo "There should be an orphan under .lustre/lost+found/MDT0001/"
	[ -d $MOUNT/.lustre/lost+found/MDT0001 ] ||
		error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"

	ls -ail $MOUNT/.lustre/lost+found/MDT0001/

	# The pattern is quoted so that find receives it verbatim; unquoted,
	# the shell could expand it against files in the current directory.
	local cname
	cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-P-*')
	[[ -n "$cname" ]] ||
		error "(10) .lustre/lost+found/MDT0001/ should not be empty"
}
run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
# Orphan handling with one failing MDT: d1/a1 and d2/a2 are removed while
# fail_loc 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) is set on both MDSes. The
# first namespace LFSCK runs with fail_loc 0x161c (OBD_FAIL_LFSCK_BAD_NETWORK)
# on mds2 and is expected to end "partial" on mds1 (0 repaired) and
# "completed" on mds2 (1 repaired); a second, undisturbed run is expected to
# report 1 repaired on mds1 and 0 on mds2.
test_28() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5506"

	echo "The target name entry is lost. The LFSCK should insert the"
	echo "orphan MDT-object under .lustre/lost+found/MDTxxxx. But if"
	echo "the MDT (on which the orphan MDT-object resides) has ever"
	echo "failed to respond some name entry verification during the"
	echo "first stage-scanning, then the LFSCK should skip to handle"
	echo "orphan MDT-object on this MDT. But other MDTs should not"

	check_mount_and_prep
	$LFS mkdir -i 0 $DIR/$tdir/d1
	$LFS mkdir -i 1 $DIR/$tdir/d1/a1
	$LFS mkdir -i 1 $DIR/$tdir/d1/a2

	$LFS mkdir -i 1 $DIR/$tdir/d2
	$LFS mkdir -i 0 $DIR/$tdir/d2/a1
	$LFS mkdir -i 0 $DIR/$tdir/d2/a2

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "d1/a1's name entry will be removed, but the d1/a1's object"
	echo "and its linkEA are kept in the system. And the case that"
	echo "d2/a2's name entry will be removed, but the d2/a2's object"
	echo "and its linkEA are kept in the system."

	local facet

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	for facet in mds1 mds2; do
		do_facet $facet $LCTL set_param fail_loc=0x1624
	done
	rmdir $DIR/$tdir/d1/a1 || error "(1) Fail to rmdir $DIR/$tdir/d1/a1"
	rmdir $DIR/$tdir/d2/a2 || error "(2) Fail to rmdir $DIR/$tdir/d2/a2"
	for facet in mds1 mds2; do
		do_facet $facet $LCTL set_param fail_loc=0
	done

	cancel_lru_locks mdc
	cancel_lru_locks osc

	echo "Inject failure, to simulate the MDT0 fail to handle"
	echo "MDT1 LFSCK request during the first-stage scanning."
	#define OBD_FAIL_LFSCK_BAD_NETWORK	0x161c
	do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0

	echo "Trigger namespace LFSCK on all devices to find out orphan object"
	$START_NAMESPACE -r -A ||
		error "(3) Fail to start LFSCK for namespace"

	local query1="$LCTL get_param -n \
		mdd.$(facet_svc mds1).lfsck_namespace |
		awk '/^status/ { print \\\$2 }'"
	wait_update_facet mds1 "$query1" "partial" 32 ||
		error "(4) mds1 is not the expected 'partial'"

	local query2="$LCTL get_param -n \
		mdd.$(facet_svc mds2).lfsck_namespace |
		awk '/^status/ { print \\\$2 }'"
	wait_update_facet mds2 "$query2" "completed" 32 ||
		error "(5) mds2 is not the expected 'completed'"

	do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0

	local fixed=$(do_facet mds1 $LCTL get_param -n \
		      mdd.$(facet_svc mds1).lfsck_namespace |
		      awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$fixed" -eq 0 ] ||
		error "(6) Expect 0 fixed on mds1, but got: $fixed"

	fixed=$(do_facet mds2 $LCTL get_param -n \
		mdd.$(facet_svc mds2).lfsck_namespace |
		awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$fixed" -eq 1 ] ||
		error "(7) Expect 1 fixed on mds2, but got: $fixed"

	echo "Trigger namespace LFSCK on all devices again to cleanup"
	$START_NAMESPACE -r -A ||
		error "(8) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 9

	fixed=$(do_facet mds1 $LCTL get_param -n \
		mdd.$(facet_svc mds1).lfsck_namespace |
		awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$fixed" -eq 1 ] ||
		error "(10) Expect 1 fixed on mds1, but got: $fixed"

	fixed=$(do_facet mds2 $LCTL get_param -n \
		mdd.$(facet_svc mds2).lfsck_namespace |
		awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$fixed" -eq 0 ] ||
		error "(11) Expect 0 fixed on mds2, but got: $fixed"
}
run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
# Bad nlink count, case 1 (currently not registered, see the note below the
# function): the hard link d0/h1 is created while fail_loc 0x1625
# (OBD_FAIL_LFSCK_MORE_NLINK) is set, which must leave foo with nlink 3. The
# namespace LFSCK is expected to report one nlinks_repaired and to bring the
# count back to 2.
test_29a() {
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5517"

	echo "The object's nlink attribute is larger than the object's known"
	echo "name entries count. The LFSCK will repair the object's nlink"
	echo "attribute to match the known name entries count"

	check_mount_and_prep

	local dir=$DIR/$tdir/d0
	local foo=$dir/foo

	$LFS mkdir -i 0 $dir || error "(1) Fail to mkdir d0"
	touch $foo || error "(2) Fail to create foo"

	echo "Inject failure stub on MDT0 to simulate the case that foo's"
	echo "nlink attribute is larger than its name entries count."

	#define OBD_FAIL_LFSCK_MORE_NLINK	0x1625
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
	ln $foo $dir/h1 ||
		error "(3) Fail to hard link to $foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	cancel_lru_locks mdc
	local nlink=$(stat --format=%h $foo)
	[ "$nlink" -eq 3 ] || error "(4) Cannot inject error: $nlink"

	echo "Trigger namespace LFSCK to repair the nlink count"
	$START_NAMESPACE -r -A ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	local fixed=$($SHOW_NAMESPACE |
		      awk '/^nlinks_repaired/ { print $2 }')
	[ "$fixed" -eq 1 ] ||
		error "(7) Fail to repair nlink count: $fixed"

	cancel_lru_locks mdc
	nlink=$(stat --format=%h $foo)
	[ "$nlink" -eq 2 ] || error "(8) Fail to repair nlink count: $nlink"
}
# Disable 29a, we only allow nlink to be updated if the known linkEA
# entries is larger than nlink count.
#
#run_test 29a "LFSCK can repair bad nlink count (1)"
# Bad nlink count, case 2: the hard link d0/h1 is created while fail_loc
# 0x1626 (OBD_FAIL_LFSCK_LESS_NLINK) is set, which must leave foo with
# nlink 1. The namespace LFSCK must report one nlinks_repaired and bring the
# count to 2.
test_29b() {
	(( $MDS1_VERSION > $(version_code 2.6.50) )) ||
		skip "MDS older than 2.6.50, LU-5517"

	echo "The object's nlink attribute is smaller than the object's known"
	echo "name entries count. The LFSCK will repair the object's nlink"
	echo "attribute to match the known name entries count"

	check_mount_and_prep

	local dir=$DIR/$tdir/d0
	local foo=$dir/foo

	$LFS mkdir -i 0 $dir || error "(1) Fail to mkdir d0"
	touch $foo || error "(2) Fail to create foo"

	echo "Inject failure stub on MDT0 to simulate the case that foo's"
	echo "nlink attribute is smaller than its name entries count."

	#define OBD_FAIL_LFSCK_LESS_NLINK	0x1626
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
	ln $foo $dir/h1 ||
		error "(3) Fail to hard link to $foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	cancel_lru_locks mdc
	local nlink=$(stat --format=%h $foo)
	[ "$nlink" -eq 1 ] || error "(4) Cannot inject error: $nlink"

	echo "Trigger namespace LFSCK to repair the nlink count"
	$START_NAMESPACE -r -A ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	local fixed=$($SHOW_NAMESPACE |
		      awk '/^nlinks_repaired/ { print $2 }')
	[ "$fixed" -eq 1 ] ||
		error "(7) Fail to repair nlink count: $fixed"

	cancel_lru_locks mdc
	nlink=$(stat --format=%h $foo)
	[ "$nlink" -eq 2 ] || error "(8) Fail to repair nlink count: $nlink"
}
run_test 29b "LFSCK can repair bad nlink count (2)"
# Body of test_29c (registered by the run_test line at the end; the function
# header, closing brace and the 'fi' lines of the if-blocks are not visible in
# this listing).
# Flow: create 150 hard links to guard/f0, check that migrating guard fails and
# f0 keeps its FID, unlink 100 of the links and check that migration still
# fails, run namespace LFSCK and expect linkea_overflow_cleared == 1 and
# nlinks_repaired == 0, then check that migration succeeds and f0's FID changes.
# The migrate checks only run when $MDSCOUNT >= 2.
4771 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4772 skip "MDS older than 2.6.50, LU-5517"
4775 echo "The namespace LFSCK will create many hard links to the target"
4776 echo "file as to exceed the linkEA size limitation. Under such case"
4777 echo "the linkEA will be marked as overflow that will prevent the"
4778 echo "target file to be migrated. Then remove some hard links to"
4779 echo "make the left hard links to be held within the linkEA size"
4780 echo "limitation. But before the namespace LFSCK adding all the"
4781 echo "missed linkEA entries back, the overflow mark (timestamp)"
4782 echo "will not be cleared."
4785 check_mount_and_prep
4787 mkdir -p $DIR/$tdir/guard || error "(0.1) Fail to mkdir"
4788 $LFS mkdir -i $((MDSCOUNT - 1)) $DIR/$tdir/foo ||
4789 error "(0.2) Fail to mkdir"
4790 touch $DIR/$tdir/guard/f0 || error "(1) Fail to create"
# Remember f0's FID; later steps compare against it to tell whether a
# migration actually happened.
4791 local oldfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4793 # define MAX_LINKEA_SIZE 4096
4794 # sizeof(link_ea_header) = 24
4795 # sizeof(link_ea_entry) = 18
4796 # nlink_min=$(((MAX_LINKEA_SIZE - sizeof(link_ea_header)) /
4797 # (sizeof(link_ea_entry) + name_length))
4798 # If the average name length is 12 bytes, then 150 hard links
4799 # is totally enough to overflow the linkEA
4800 echo "Create 150 hard links should succeed although the linkEA overflow"
4801 createmany -l $DIR/$tdir/guard/f0 $DIR/$tdir/foo/ttttttttttt 150 ||
4802 error "(2) Fail to hard link"
4804 cancel_lru_locks mdc
# Negative check: error() fires only if the migrate command succeeds.
4805 if [ $MDSCOUNT -ge 2 ]; then
4806 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null &&
4807 error "(3.1) Migrate should fail"
4809 echo "The object with linkEA overflow should NOT be migrated"
4810 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4811 [ "$newfid" == "$oldfid" ] ||
4812 error "(3.2) Migrate should fail: $newfid != $oldfid"
4815 # Remove 100 hard links, then the linkEA should have space
4816 # to hold the missed linkEA entries.
4817 echo "Remove 100 hard links to save space for the missed linkEA entries"
4818 unlinkmany $DIR/$tdir/foo/ttttttttttt 100 || error "(4) Fail to unlink"
4820 if [ $MDSCOUNT -ge 2 ]; then
4821 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null &&
4822 error "(5.1) Migrate should fail"
4824 # The overflow timestamp is still there, so migration will fail.
4825 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4826 [ "$newfid" == "$oldfid" ] ||
4827 error "(5.2) Migrate should fail: $newfid != $oldfid"
# NOTE(review): the sleep command this comment refers to is not visible in
# this listing.
4830 # sleep 3 seconds to guarantee that the overflow is recognized
4833 echo "Trigger namespace LFSCK to clear the overflow timestamp"
4834 $START_NAMESPACE -r -A ||
4835 error "(6) Fail to start LFSCK for namespace"
4837 wait_all_targets_blocked namespace completed 7
# Exactly one overflow mark must have been cleared ...
4839 local repaired=$($SHOW_NAMESPACE |
4840 awk '/^linkea_overflow_cleared/ { print $2 }')
4841 [ $repaired -eq 1 ] ||
4842 error "(8) Fail to clear linkea overflow: $repaired"
# ... and no nlink repair may be reported by this run.
4844 repaired=$($SHOW_NAMESPACE |
4845 awk '/^nlinks_repaired/ { print $2 }')
4846 [ $repaired -eq 0 ] ||
4847 error "(9) Unexpected nlink repaired: $repaired"
# Positive check: this time the migrate must succeed and f0's FID must change.
4849 if [ $MDSCOUNT -ge 2 ]; then
4850 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
4851 error "(10.1) Migrate failure"
4853 # Migration should succeed after clear the overflow timestamp.
4854 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4855 [ "$newfid" != "$oldfid" ] ||
4856 error "(10.2) Migrate should succeed"
4858 ls -l $DIR/$tdir/foo > /dev/null ||
4859 error "(11) 'ls' failed after migration"
# Cleanup of the files created by this test.
4862 rm -f $DIR/$tdir/guard/f0 || error "(12) Fail to unlink f0"
4863 rm -rf $DIR/$tdir/foo || error "(13) Fail to rmdir foo"
4865 run_test 29c "verify linkEA size limitation"
# Body of test_30 (registered by the run_test line at the end; the function
# header and closing brace are not visible in this listing).
# Flow: create foo/f0 and foo/d0 (d0 made while fail_loc 0x161d is set), add
# d0/f1 and d0/d1, remove all four while fail_loc 0x1624 is set, unmount the
# client, stop the MDS, run e2fsck -y on its device, restart the MDS with the
# noscrub options and run namespace LFSCK. Afterwards at least 4 objects must
# be counted in local_lost_found_moved, f0 must be reachable at its original
# path, and d0 (with d1 and f1 inside) must appear under
# .lustre/lost+found/MDT0000/.
# Skipped unless mds1 is ldiskfs, FILESET is unset and the MDS is newer than
# 2.6.50.
4868 [[ $mds1_FSTYPE == ldiskfs ]] || skip "only ldiskfs has lost+found"
4869 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4870 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4871 skip "MDS older than 2.6.50, LU-5518"
4874 echo "The namespace LFSCK will move the orphans from backend"
4875 echo "/lost+found directory to normal client visible namespace"
4876 echo "or to global visible ./lustre/lost+found/MDTxxxx/ directory"
4879 check_mount_and_prep
4881 $LFS mkdir -i 0 $DIR/$tdir/foo || error "(1) Fail to mkdir foo"
# Fix: the message used to name f1 although the file being created is f0.
4882 touch $DIR/$tdir/foo/f0 || error "(2) Fail to touch f0"
4884 echo "Inject failure stub on MDT0 to simulate the case that"
4885 echo "directory d0 has no linkEA entry, then the LFSCK will"
4886 echo "move it into .lustre/lost+found/MDTxxxx/ later."
4888 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
4889 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
4890 mkdir $DIR/$tdir/foo/d0 || error "(3) Fail to mkdir d0"
4891 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Both FIDs are recorded now because they are used later to build the name
# expected under .lustre/lost+found/MDT0000/.
4893 local pfid=$($LFS path2fid $DIR/$tdir/foo)
4894 local cfid=$($LFS path2fid $DIR/$tdir/foo/d0)
4896 touch $DIR/$tdir/foo/d0/f1 || error "(4) Fail to touch f1"
4897 mkdir $DIR/$tdir/foo/d0/d1 || error "(5) Fail to mkdir d1"
4899 echo "Inject failure stub on MDT0 to simulate the case that the"
4900 echo "object's name entry will be removed, but not destroy the"
4901 echo "object. Then backend e2fsck will handle it as orphan and"
4902 echo "add them into the backend /lost+found directory."
4904 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4905 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4906 rmdir $DIR/$tdir/foo/d0/d1 || error "(6) Fail to rmdir d1"
4907 rm -f $DIR/$tdir/foo/d0/f1 || error "(7) Fail to unlink f1"
4908 rmdir $DIR/$tdir/foo/d0 || error "(8) Fail to rmdir d0"
4909 rm -f $DIR/$tdir/foo/f0 || error "(9) Fail to unlink f0"
4910 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# e2fsck is run on the MDS device only after the client is unmounted and the
# MDS is stopped.
4912 umount_client $MOUNT || error "(10) Fail to stop client!"
4914 stop $SINGLEMDS || error "(11) Fail to stop $SINGLEMDS"
4916 local dev=$(facet_device $SINGLEMDS)
4918 echo "run e2fsck on $SINGLEMDS"
4919 run_e2fsck $(facet_active_host $SINGLEMDS) $dev "-y" ||
4920 error "(12) Fail to run e2fsck"
# NOTE(review): the trailing 13 looks like the step id reported by start_facet
# on failure (it continues the numbering) - confirm.
4922 start_facet $SINGLEMDS "$MOUNT_OPTS_NOSCRUB" 13
4924 echo "Trigger namespace LFSCK to recover backend orphans"
4925 $START_NAMESPACE -r -A ||
4926 error "(14) Fail to start LFSCK for namespace"
4928 wait_all_targets_blocked namespace completed 15
# Four objects were orphaned above (d1, f1, d0, f0), hence the lower bound 4.
4930 local repaired=$($SHOW_NAMESPACE |
4931 awk '/^local_lost_found_moved/ { print $2 }')
4932 [ $repaired -ge 4 ] ||
4933 error "(16) Fail to recover backend orphans: $repaired"
4935 mount_client $MOUNT || error "(17) Fail to start client!"
4937 stat $DIR/$tdir/foo/f0 || error "(18) f0 is not recovered"
# The two ls calls below have no error check; they only print directory
# contents into the test output.
4939 ls -ail $MOUNT/.lustre/lost+found/
4941 echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
4942 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4943 error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4945 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
4947 local cname=$MOUNT/.lustre/lost+found/MDT0000/${cfid}-${pfid}-D-0
# Fix: the old check tested that the string $cname was non-empty, which is
# always true for a path assembled just above, so step (20) could never fail.
# Check that the recovered directory actually exists instead.
4948 [ -d "$cname" ] || error "(20) d0 is not recovered"
4950 stat ${cname}/d1 || error "(21) d1 is not recovered"
4951 stat ${cname}/f1 || error "(22) f1 is not recovered"
4953 run_test 30 "LFSCK can recover the orphans from backend /lost+found"
# Body of test_31a (registered by the run_test line at the end; the function
# header, closing brace and the loop's 'done' are not visible in this listing).
# Flow: create a directory striped over $MDSCOUNT MDTs, create $MDSCOUNT
# sub-directories while the client-side fail_loc 0x1628 / fail_val=0 is set,
# run namespace LFSCK and expect name_hash_repaired >= 1; then remount the
# client and stat + rmdir every entry and finally the striped directory.
4956 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
4957 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
4958 skip "MDS older than 2.6.50, LU-5519"
4961 echo "For the name entry under a striped directory, if the name"
4962 echo "hash does not match the shard, then the LFSCK will repair"
4963 echo "the bad name entry"
4966 check_mount_and_prep
4968 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4969 error "(1) Fail to create striped directory"
4971 echo "Inject failure stub on client to simulate the case that"
4972 echo "some name entry should be inserted into other non-first"
4973 echo "shard, but inserted into the first shard by wrong"
# Unlike most tests in this file, fail_loc is set with a local $LCTL call
# (no do_facet), i.e. on the node running the script.
4975 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
4976 $LCTL set_param fail_loc=0x1628 fail_val=0
4977 createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
4978 error "(2) Fail to create file under striped directory"
4979 $LCTL set_param fail_loc=0 fail_val=0
4981 echo "Trigger namespace LFSCK to repair bad name hash"
4982 $START_NAMESPACE -r -A ||
4983 error "(3) Fail to start LFSCK for namespace"
4985 wait_all_targets_blocked namespace completed 4
4987 local repaired=$($SHOW_NAMESPACE |
4988 awk '/^name_hash_repaired/ { print $2 }')
4989 [ $repaired -ge 1 ] ||
4990 error "(5) Fail to repair bad name hash: $repaired"
# rc holds the number of lines printed by 'lfs find -H badtype'.
# NOTE(review): the condition guarding the error below (listing line 4993) is
# not visible in this listing.
4992 local rc=$($LFS find -H badtype $DIR/$tdir/striped_dir | wc -l)
4994 error "Fail to find flag bad type: $rc"
# Remount the client before re-checking the entries.
4996 umount_client $MOUNT || error "(6) umount failed"
4997 mount_client $MOUNT || error "(7) mount failed"
# Every d$i created above must be stat-able and removable.
4999 for ((i = 0; i < $MDSCOUNT; i++)); do
5000 stat $DIR/$tdir/striped_dir/d$i ||
5001 error "(8) Fail to stat d$i after LFSCK"
5002 rmdir $DIR/$tdir/striped_dir/d$i ||
5003 error "(9) Fail to unlink d$i after LFSCK"
5006 rmdir $DIR/$tdir/striped_dir ||
5007 error "(10) Fail to remove the striped directory after LFSCK"
5009 run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
# Body of test_31b (registered by the run_test line at the end; the function
# header, closing brace and the loop's 'done' are not visible in this listing).
# Same scenario as test 31a but with fail_val=1 and MDSCOUNT * 5 entries; the
# name_hash_repaired counter is read from mds2's lfsck_namespace statistics
# instead of $SHOW_NAMESPACE.
5012 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5013 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5014 skip "MDS older than 2.6.50, LU-5519"
5017 echo "For the name entry under a striped directory, if the name"
5018 echo "hash does not match the shard, then the LFSCK will repair"
5019 echo "the bad name entry"
5022 check_mount_and_prep
5024 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5025 error "(1) Fail to create striped directory"
5027 echo "Inject failure stub on client to simulate the case that"
5028 echo "some name entry should be inserted into other non-second"
5029 echo "shard, but inserted into the secod shard by wrong"
# fail_loc is set with a local $LCTL call (no do_facet), i.e. on the node
# running the script.
5031 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
5032 $LCTL set_param fail_loc=0x1628 fail_val=1
5033 createmany -d $DIR/$tdir/striped_dir/d $((MDSCOUNT * 5)) ||
5034 error "(2) Fail to create file under striped directory"
5035 $LCTL set_param fail_loc=0 fail_val=0
5037 echo "Trigger namespace LFSCK to repair bad name hash"
5038 $START_NAMESPACE -r -A ||
5039 error "(3) Fail to start LFSCK for namespace"
5041 wait_all_targets_blocked namespace completed 4
# The repair counter is queried on mds2 here.
5043 local repaired=$(do_facet mds2 $LCTL get_param -n \
5044 mdd.$(facet_svc mds2).lfsck_namespace |
5045 awk '/^name_hash_repaired/ { print $2 }')
5046 echo "repaired $repaired name entries with bad hash"
5047 [ $repaired -ge 1 ] ||
5048 error "(5) Fail to repair bad name hash: $repaired"
# Remount the client before re-checking the entries.
5050 umount_client $MOUNT || error "(6) umount failed"
5051 mount_client $MOUNT || error "(7) mount failed"
# Every d$i created above must be stat-able and removable.
5053 for ((i = 0; i < $((MDSCOUNT * 5)); i++)); do
5054 stat $DIR/$tdir/striped_dir/d$i ||
5055 error "(8) Fail to stat d$i after LFSCK"
5056 rmdir $DIR/$tdir/striped_dir/d$i ||
5057 error "(9) Fail to unlink d$i after LFSCK"
5060 rmdir $DIR/$tdir/striped_dir ||
5061 error "(10) Fail to remove the striped directory after LFSCK"
5063 run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
# Body of test_31c (registered by the run_test line at the end; the function
# header and closing brace are not visible in this listing).
# Flow: create a striped directory while fail_loc 0x1629 is set on $SINGLEMDS,
# run namespace LFSCK, expect striped_dirs_repaired == 1 and exactly one line
# from 'lfs find -H lostlmv', then remount and check that the directory lists
# as empty and can be removed.
5066 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5067 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5068 skip "MDS older than 2.6.50, LU-5519"
5071 echo "For some reason, the master MDT-object of the striped directory"
5072 echo "may lost its master LMV EA. If nobody created files under the"
5073 echo "master directly after the master LMV EA lost, then the LFSCK"
5074 echo "should re-generate the master LMV EA."
5077 check_mount_and_prep
5079 echo "Inject failure stub on MDT0 to simulate the case that the"
5080 echo "master MDT-object of the striped directory lost the LMV EA."
5082 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
5083 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
5084 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5085 error "(1) Fail to create striped directory"
5086 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
5088 echo "Trigger namespace LFSCK to re-generate master LMV EA"
5089 $START_NAMESPACE -r -A ||
5090 error "(2) Fail to start LFSCK for namespace"
5092 wait_all_targets_blocked namespace completed 3
5094 local repaired=$($SHOW_NAMESPACE |
5095 awk '/^striped_dirs_repaired/ { print $2 }')
5096 [ $repaired -eq 1 ] ||
5097 error "(4) Fail to re-generate master LMV EA: $repaired"
# rc is the number of lines printed by 'lfs find -H lostlmv'; exactly one
# match is expected.
5099 local rc=$($LFS find -H lostlmv $DIR/$tdir/striped_dir | wc -l)
5100 [ $rc -eq 1 ] || error "Fail to find flag lost LMV: $rc"
5102 umount_client $MOUNT || error "(5) umount failed"
5103 mount_client $MOUNT || error "(6) mount failed"
# Nothing was created inside the directory, so ls must print nothing.
5105 local empty=$(ls $DIR/$tdir/striped_dir/)
5106 [ -z "$empty" ] || error "(7) The master LMV EA is not repaired: $empty"
5108 rmdir $DIR/$tdir/striped_dir ||
5109 error "(8) Fail to remove the striped directory after LFSCK"
5111 run_test 31c "Re-generate the lost master LMV EA for striped directory"
# Body of test_31d (registered by the run_test line at the end; the function
# header and closing brace are not visible in this listing).
# Flow: create a striped directory while fail_loc 0x1629 is set on $SINGLEMDS,
# remount the client, create 'dummy' inside the directory, run namespace LFSCK
# and expect striped_dirs_repaired == 0. Afterwards 'dummy' must still be
# stat-able, creating a new file must fail, and after 'chattr -i' the
# directory is removed with rmdir.
5114 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5115 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5116 skip "MDS older than 2.6.50, LU-5519"
5119 echo "For some reason, the master MDT-object of the striped directory"
5120 echo "may lost its master LMV EA. If somebody created files under the"
5121 echo "master directly after the master LMV EA lost, then the LFSCK"
5122 echo "should NOT re-generate the master LMV EA, instead, it should"
5123 echo "change the broken striped dirctory as read-only to prevent"
5124 echo "further damage"
5127 check_mount_and_prep
5129 echo "Inject failure stub on MDT0 to simulate the case that the"
5130 echo "master MDT-object of the striped directory lost the LMV EA."
5132 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
5133 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
5134 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5135 error "(1) Fail to create striped directory"
5136 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0
5138 umount_client $MOUNT || error "(2) umount failed"
5139 mount_client $MOUNT || error "(3) mount failed"
# This modification after the injected breakage is what distinguishes this
# test from test 31c.
5141 touch $DIR/$tdir/striped_dir/dummy ||
5142 error "(4) Fail to touch under broken striped directory"
5144 echo "Trigger namespace LFSCK to find out the inconsistency"
5145 $START_NAMESPACE -r -A ||
5146 error "(5) Fail to start LFSCK for namespace"
5148 wait_all_targets_blocked namespace completed 6
# Here a repair count of 0 is the expected outcome.
5150 local repaired=$($SHOW_NAMESPACE |
5151 awk '/^striped_dirs_repaired/ { print $2 }')
5152 [ $repaired -eq 0 ] ||
5153 error "(7) Re-generate master LMV EA unexpected: $repaired"
5155 stat $DIR/$tdir/striped_dir/dummy ||
5156 error "(8) Fail to stat $DIR/$tdir/striped_dir/dummy"
# Negative check: error() fires only if the touch succeeds.
5158 touch $DIR/$tdir/striped_dir/foo &&
5159 error "(9) The broken striped directory should be read-only"
# The immutable attribute is cleared before the directory is removed.
5161 chattr -i $DIR/$tdir/striped_dir ||
5162 error "(10) Fail to chattr on the broken striped directory"
5164 rmdir $DIR/$tdir/striped_dir ||
5165 error "(11) Fail to remove the striped directory after LFSCK"
5167 run_test 31d "Set broken striped directory (modified after broken) as read-only"
# Body of test_31e (registered by the run_test line at the end; the function
# header and closing brace are not visible in this listing).
# Flow: create a striped directory while fail_loc 0x162a / fail_val=0 is set
# on $SINGLEMDS, run namespace LFSCK, expect striped_shards_repaired == 1 in
# $SHOW_NAMESPACE, then remove the directory.
5170 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5171 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5172 skip "MDS older than 2.6.50, LU-5519"
5175 echo "For some reason, the slave MDT-object of the striped directory"
5176 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
5177 echo "slave LMV EA."
5180 check_mount_and_prep
5182 echo "Inject failure stub on MDT0 to simulate the case that the"
5183 echo "slave MDT-object (that resides on the same MDT as the master"
5184 echo "MDT-object resides on) lost the LMV EA."
5186 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
5187 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=0
5188 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5189 error "(1) Fail to create striped directory"
5190 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5192 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
5193 $START_NAMESPACE -r -A ||
5194 error "(2) Fail to start LFSCK for namespace"
5196 wait_all_targets_blocked namespace completed 3
# Exactly one shard repair is expected.
5198 local repaired=$($SHOW_NAMESPACE |
5199 awk '/^striped_shards_repaired/ { print $2 }')
5200 [ $repaired -eq 1 ] ||
5201 error "(4) Fail to re-generate slave LMV EA: $repaired"
5203 rmdir $DIR/$tdir/striped_dir ||
5204 error "(5) Fail to remove the striped directory after LFSCK"
5206 run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
# Body of test_31f (registered by the run_test line at the end; the function
# header and closing brace are not visible in this listing).
# Same scenario as test 31e but with fail_val=1, and the
# striped_shards_repaired counter is read from mds2's lfsck_namespace
# statistics.
# NOTE(review): the trailing '&& return' is not used by tests 31a-31e, which
# call skip alone; it looks redundant if skip exits - confirm.
5209 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5210 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5211 skip "MDS older than 2.6.50, LU-5519"
5214 echo "For some reason, the slave MDT-object of the striped directory"
5215 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
5216 echo "slave LMV EA."
5219 check_mount_and_prep
5221 echo "Inject failure stub on MDT0 to simulate the case that the"
5222 echo "slave MDT-object (that resides on different MDT as the master"
5223 echo "MDT-object resides on) lost the LMV EA."
5225 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
5226 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=1
5227 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5228 error "(1) Fail to create striped directory"
5229 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5231 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
5232 $START_NAMESPACE -r -A ||
5233 error "(2) Fail to start LFSCK for namespace"
5235 wait_all_targets_blocked namespace completed 3
# The repair counter is queried on mds2 here.
5237 local repaired=$(do_facet mds2 $LCTL get_param -n \
5238 mdd.$(facet_svc mds2).lfsck_namespace |
5239 awk '/^striped_shards_repaired/ { print $2 }')
5240 [ $repaired -eq 1 ] ||
5241 error "(4) Fail to re-generate slave LMV EA: $repaired"
5243 rmdir $DIR/$tdir/striped_dir ||
5244 error "(5) Fail to remove the striped directory after LFSCK"
5246 run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
# Body of test_31g (registered by the run_test line at the end; the function
# header and closing brace are not visible in this listing).
# Flow: create a striped directory while fail_loc 0x162b / fail_val=0 is set
# on $SINGLEMDS, run namespace LFSCK, expect striped_shards_repaired == 1,
# remount the client, then verify that a file can be created and removed in
# the directory and that the directory itself can be removed.
# NOTE(review): the trailing '&& return' is not used by tests 31a-31e, which
# call skip alone; it looks redundant if skip exits - confirm.
5249 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5250 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5251 skip "MDS older than 2.6.50, LU-5519"
5254 echo "For some reason, the stripe index in the slave LMV EA is"
5255 echo "corrupted. The LFSCK should repair the slave LMV EA."
5258 check_mount_and_prep
5260 echo "Inject failure stub on MDT0 to simulate the case that the"
5261 echo "slave LMV EA on the first shard of the striped directory"
5262 echo "claims the same index as the second shard claims"
5264 #define OBD_FAIL_LFSCK_BAD_SLAVE_LMV 0x162b
5265 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162b fail_val=0
5266 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5267 error "(1) Fail to create striped directory"
5268 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5270 echo "Trigger namespace LFSCK to repair the slave LMV EA"
5271 $START_NAMESPACE -r -A ||
5272 error "(2) Fail to start LFSCK for namespace"
5274 wait_all_targets_blocked namespace completed 3
# Exactly one shard repair is expected.
5276 local repaired=$($SHOW_NAMESPACE |
5277 awk '/^striped_shards_repaired/ { print $2 }')
5278 [ $repaired -eq 1 ] ||
5279 error "(4) Fail to repair slave LMV EA: $repaired"
5281 umount_client $MOUNT || error "(5) umount failed"
5282 mount_client $MOUNT || error "(6) mount failed"
# Functional check after the repair: create and remove a file in the directory.
5284 touch $DIR/$tdir/striped_dir/foo ||
5285 error "(7) Fail to touch file after the LFSCK"
5287 rm -f $DIR/$tdir/striped_dir/foo ||
5288 error "(8) Fail to unlink file after the LFSCK"
5290 rmdir $DIR/$tdir/striped_dir ||
5291 error "(9) Fail to remove the striped directory after LFSCK"
5293 run_test 31g "Repair the corrupted slave LMV EA"
# Body of test_31h (registered by the run_test line at the end; the function
# header and closing brace are not visible in this listing).
# Flow: create a striped directory while fail_loc 0x162c / fail_val=0 is set
# on $SINGLEMDS, run namespace LFSCK, expect dirent_repaired == 1, remount the
# client, then verify that a file can be created and removed in the directory
# and that the directory itself can be removed.
# NOTE(review): the trailing '&& return' is not used by tests 31a-31e, which
# call skip alone; it looks redundant if skip exits - confirm.
5296 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5297 (( $MDS1_VERSION > $(version_code 2.6.50) )) ||
5298 skip "MDS older than 2.6.50, LU-5519"
5301 echo "For some reason, the shard's name entry in the striped"
5302 echo "directory may be corrupted. The LFSCK should repair the"
5303 echo "bad shard's name entry."
5306 check_mount_and_prep
5308 echo "Inject failure stub on MDT0 to simulate the case that the"
5309 echo "first shard's name entry in the striped directory claims"
5310 echo "the same index as the second shard's name entry claims."
5312 #define OBD_FAIL_LFSCK_BAD_SLAVE_NAME 0x162c
5313 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162c fail_val=0
5314 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5315 error "(1) Fail to create striped directory"
5316 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5318 echo "Trigger namespace LFSCK to repair the shard's name entry"
5319 $START_NAMESPACE -r -A ||
5320 error "(2) Fail to start LFSCK for namespace"
5322 wait_all_targets_blocked namespace completed 3
# Exactly one directory-entry repair is expected.
5324 local repaired=$($SHOW_NAMESPACE |
5325 awk '/^dirent_repaired/ { print $2 }')
5326 [ $repaired -eq 1 ] ||
5327 error "(4) Fail to repair shard's name entry: $repaired"
5329 umount_client $MOUNT || error "(5) umount failed"
5330 mount_client $MOUNT || error "(6) mount failed"
# Functional check after the repair: create and remove a file in the directory.
5332 touch $DIR/$tdir/striped_dir/foo ||
5333 error "(7) Fail to touch file after the LFSCK"
5335 rm -f $DIR/$tdir/striped_dir/foo ||
5336 error "(8) Fail to unlink file after the LFSCK"
5338 rmdir $DIR/$tdir/striped_dir ||
5339 error "(9) Fail to remove the striped directory after LFSCK"
5341 run_test 31h "Repair the corrupted shard's name entry"
# Body of test_32a (registered by the run_test line at the end; the function
# header, closing brace and several short lines are not visible in this
# listing).
# Flow: unmount the client, set fail_val=3 / fail_loc=0x162d on $SINGLEMDS,
# start layout LFSCK and require status 'scanning-phase1', stop ost1, reset
# fail_loc, stop the LFSCK, then start ost1 again.
5346 umount_client $MOUNT
5348 #define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d
5349 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
5350 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
# The scan must still be in its first phase when ost1 is taken down.
5352 local STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
5353 [ "$STATUS" == "scanning-phase1" ] ||
5354 error "(2) Expect 'scanning-phase1', but got '$STATUS'"
5357 stop ost1 > /dev/null || error "(3) Fail to stop OST1!"
5359 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# The step under test: lfsck_stop must succeed although ost1 is down.
5363 $STOP_LFSCK || error "(4) Fail to stop LFSCK!"
5365 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
5366 error "(5) Fail to start ost1"
5368 run_test 32a "stop LFSCK when some OST failed"
# Body of test_32b (registered by the run_test line at the end; the function
# header, closing brace and several short lines are not visible in this
# listing).
# Flow: create dp on MDT index 1 with two striped sub-directories, unmount the
# client, set fail_val=3 / fail_loc=0x162d on $SINGLEMDS, start namespace
# LFSCK and wait for status 'scanning-phase1', stop mds2, reset fail_loc, stop
# the LFSCK, then start mds2 again.
# NOTE(review): the trailing '&& return' is not used by tests 31a-31e, which
# call skip alone; it looks redundant if skip exits - confirm.
5372 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5375 $LFS mkdir -i 1 $DIR/$tdir/dp ||
5376 error "(1) Fail to create $DIR/$tdir/dp"
5377 $LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc1 ||
5378 error "(2) Fail to create $DIR/$tdir/dp/dc1"
5379 $LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc2 ||
5380 error "(3) Fail to create $DIR/$tdir/dp/dc2"
5381 umount_client $MOUNT
5383 #define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d
5384 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
5385 $START_NAMESPACE -r -A || error "(4) Fail to start LFSCK for namespace!"
# Poll the status field until it reads 'scanning-phase1'. The \\\$2 escaping
# keeps $2 from being expanded locally inside the double-quoted command.
# NOTE(review): 32 is presumably the wait limit in seconds - confirm.
5387 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5388 mdd.${MDT_DEV}.lfsck_namespace |
5389 awk '/^status/ { print \\\$2 }'" "scanning-phase1" 32 || {
5391 error "(5) unexpected status"
5395 stop mds2 > /dev/null || error "(6) Fail to stop MDT2!"
5397 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# The step under test: lfsck_stop must succeed although mds2 is down.
5401 $STOP_LFSCK || error "(7) Fail to stop LFSCK!"
5403 start mds2 $(mdsdevname 2) $MOUNT_OPTS_NOSCRUB > /dev/null ||
5404 error "(8) Fail to start MDT2"
5406 run_test 32b "stop LFSCK when some MDT failed"
# Body of test_33 (registered by the run_test line at the end; the function
# header and closing brace are not visible in this listing).
# Starts a layout LFSCK with '--dryrun -o -r' and a namespace LFSCK with
# '-e abort -A -r', waits for each to complete, and compares the 'param' field
# of the corresponding statistics with the expected string.
5412 $START_LAYOUT --dryrun -o -r ||
5413 error "(1) Fail to start layout LFSCK"
5414 wait_all_targets_blocked layout completed 2
# Layout run: the param field must read exactly 'dryrun,all_targets,orphan'.
5416 local PARAMS=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
5417 [ "$PARAMS" == "dryrun,all_targets,orphan" ] ||
5418 error "(3) Expect 'dryrun,all_targets,orphan', got '$PARAMS'"
5420 $START_NAMESPACE -e abort -A -r ||
5421 error "(4) Fail to start namespace LFSCK"
5422 wait_all_targets_blocked namespace completed 5
# Namespace run: the param field must read exactly 'failout,all_targets'.
5424 PARAMS=$($SHOW_NAMESPACE | awk '/^param/ { print $2 }')
5425 [ "$PARAMS" == "failout,all_targets" ] ||
5426 error "(6) Expect 'failout,all_targets', got '$PARAMS'"
5428 run_test 33 "check LFSCK paramters"
# Body of test_34 (registered by the run_test line at the end; the function
# header, closing brace and several short lines are not visible in this
# listing).
# Flow: create a directory on MDT index 1 while fail_loc 0x1630 is set on
# $SINGLEMDS, run namespace LFSCK and expect dirent_repaired == 1, then run it
# a second time and expect dirent_repaired == 0.
# Skipped unless there are at least 2 MDTs and mds1 uses zfs.
5432 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
5433 [ "$mds1_FSTYPE" != zfs ] && skip "Only valid for ZFS backend"
5437 #define OBD_FAIL_LFSCK_NO_AGENTOBJ 0x1630
5438 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1630
5439 $LFS mkdir -i 1 $DIR/$tdir/dummy ||
5440 error "(1) Fail to create $DIR/$tdir/dummy"
5442 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
5443 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
# Poll the status field on $SINGLEMDS until it reads 'completed'.
5444 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5445 mdd.${MDT_DEV}.lfsck_namespace |
5446 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5448 error "(3) unexpected status"
# First run: exactly one directory-entry repair is expected.
5451 local repaired=$($SHOW_NAMESPACE |
5452 awk '/^dirent_repaired/ { print $2 }')
5453 [ $repaired -eq 1 ] ||
5454 error "(4) Fail to repair the lost agent object: $repaired"
5456 $START_NAMESPACE -r || error "(5) Fail to start LFSCK for namespace!"
5457 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5458 mdd.${MDT_DEV}.lfsck_namespace |
5459 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5461 error "(6) unexpected status"
# Second run: nothing may be left to repair.
5464 repaired=$($SHOW_NAMESPACE | awk '/^dirent_repaired/ { print $2 }')
5465 [ $repaired -eq 0 ] ||
5466 error "(7) Unexpected repairing: $repaired"
5468 run_test 34 "LFSCK can rebuild the lost agent object"
# Body of test_35 (registered by the run_test line at the end; the function
# header, closing brace and several short lines are not visible in this
# listing).
# Flow: create a directory on MDT index 1 while fail_loc 0x1631 is set on
# mds2, run namespace LFSCK on all targets and expect agent_entries_repaired
# == 1 on mds2; after the restart step (only setupall is visible here) run the
# LFSCK again and expect agent_entries_repaired == 0.
# NOTE(review): $LTIME is not defined in the visible part of the file;
# presumably a wait limit set elsewhere - confirm.
5472 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5476 #define OBD_FAIL_LFSCK_NO_AGENTENT 0x1631
5477 do_facet mds2 $LCTL set_param fail_loc=0x1631
5478 $LFS mkdir -i 1 $DIR/$tdir/dummy ||
5479 error "(1) Fail to create $DIR/$tdir/dummy"
5482 do_facet mds2 $LCTL set_param fail_loc=0
5483 $START_NAMESPACE -A -r || error "(2) Fail to start LFSCK for namespace!"
# Poll mds2's status field until it reads 'completed'.
# Fix: the message used ${k}, which is never set in this test, and so printed
# "MDS is not ..."; the facet being polled is mds2.
5484 wait_update_facet mds2 "$LCTL get_param -n \
5485 mdd.$(facet_svc mds2).lfsck_namespace |
5486 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
5487 error "(3) MDS2 is not the expected 'completed'"
# First run: exactly one agent entry repair is expected on mds2.
5489 local repaired=$(do_facet mds2 $LCTL get_param -n \
5490 mdd.$(facet_svc mds2).lfsck_namespace |
5491 awk '/^agent_entries_repaired/ { print $2 }')
5492 [ $repaired -eq 1 ] ||
5493 error "(4) Fail to repair the lost agent entry: $repaired"
# NOTE(review): the stop command announced by this echo is not visible in
# this listing.
5495 echo "stopall to cleanup object cache"
5498 setupall > /dev/null
5500 $START_NAMESPACE -A -r || error "(5) Fail to start LFSCK for namespace!"
# Same ${k} fix as in step (3).
5501 wait_update_facet mds2 "$LCTL get_param -n \
5502 mdd.$(facet_svc mds2).lfsck_namespace |
5503 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
5504 error "(6) MDS2 is not the expected 'completed'"
# Second run: nothing may be left to repair.
5506 repaired=$(do_facet mds2 $LCTL get_param -n \
5507 mdd.$(facet_svc mds2).lfsck_namespace |
5508 awk '/^agent_entries_repaired/ { print $2 }')
5509 [ $repaired -eq 0 ] ||
5510 error "(7) Unexpected repairing: $repaired"
5512 run_test 35 "LFSCK can rebuild the lost agent entry"
# Body of test_36a (registered by the run_test line at the end; the function
# header, closing brace, 'done' lines and the closing '}' of the brace groups
# are not visible in this listing).
# Flow: create three files f0..f2, each with three two-component mirrors on
# explicit OSTs, write 4 MiB to each and resync. With fail_loc 0x1616 set on
# mds1, split and delete mirror 1 of f0, mirror 2 of f1 and mirror 3 of f2,
# and verify each file is left with 2 mirrors. Then run layout LFSCK with
# '-r -o', wait for 'completed' on every MDS, check 'completed' on every OST,
# and expect repaired_orphan == 9 on mds1, 3 mirrors per file and the removed
# mirror id present again.
# NOTE(review): the trailing '&& return' after skip is not used by tests such
# as 31a; it looks redundant if skip exits - confirm.
5515 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5518 echo "The target MDT-object's LOV EA corrupted as to lose one of the "
5519 echo "mirrors information. The layout LFSCK should rebuild the LOV EA "
5520 echo "with the PFID EA of related OST-object(s) belong to the mirror."
5523 check_mount_and_prep
# Diagnostic output only: grant parameters are printed now, and a stack_trap
# is registered that prints space, inode usage and grants again.
5527 lctl get_param osc.*.*grant*
5528 stack_trap "lfs df $DIR; lfs df -i $DIR; lctl get_param osc.*.*grant*"
# Each -N starts one mirror; every mirror has two components (-E) placed on
# the OST indexes given by -o. The same layout is used for f0, f1 and f2.
5530 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5531 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f0 ||
5532 error "(0) Fail to create mirror file $DIR/$tdir/f0"
5533 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5534 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f1 ||
5535 error "(1) Fail to create mirror file $DIR/$tdir/f1"
5536 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5537 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f2 ||
5538 error "(2) Fail to create mirror file $DIR/$tdir/f2"
5540 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5541 error "(3) Fail to write $DIR/$tdir/f0"
5542 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=4 ||
5543 error "(4) Fail to write $DIR/$tdir/f1"
5544 dd if=/dev/zero of=$DIR/$tdir/f2 bs=1M count=4 ||
5545 error "(5) Fail to write $DIR/$tdir/f2"
5547 $LFS mirror resync $DIR/$tdir/f0 ||
5548 error "(6) Fail to resync $DIR/$tdir/f0"
5549 $LFS mirror resync $DIR/$tdir/f1 ||
5550 error "(7) Fail to resync $DIR/$tdir/f1"
5551 $LFS mirror resync $DIR/$tdir/f2 ||
5552 error "(8) Fail to resync $DIR/$tdir/f2"
5554 cancel_lru_locks mdc
5555 cancel_lru_locks osc
# Print the layouts before the fault injection (output goes to the test log).
5557 $LFS getstripe $DIR/$tdir/f0 ||
5558 error "(9) Fail to getstripe for $DIR/$tdir/f0"
5559 $LFS getstripe $DIR/$tdir/f1 ||
5560 error "(10) Fail to getstripe for $DIR/$tdir/f1"
5561 $LFS getstripe $DIR/$tdir/f2 ||
5562 error "(11) Fail to getstripe for $DIR/$tdir/f2"
5564 echo "Inject failure, to simulate the case of missing one mirror in LOV"
5565 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5566 do_facet mds1 $LCTL set_param fail_loc=0x1616
# A different mirror id is split off (-d: and destroyed) from each file.
5568 $LFS mirror split --mirror-id 1 -d $DIR/$tdir/f0 ||
5569 error "(12) Fail to split 1st mirror from $DIR/$tdir/f0"
5570 $LFS mirror split --mirror-id 2 -d $DIR/$tdir/f1 ||
5571 error "(13) Fail to split 2nd mirror from $DIR/$tdir/f1"
5572 $LFS mirror split --mirror-id 3 -d $DIR/$tdir/f2 ||
5573 error "(14) Fail to split 3rd mirror from $DIR/$tdir/f2"
5577 do_facet mds1 $LCTL set_param fail_loc=0
# Negative checks: error() fires if the removed mirror id is still listed.
5579 $LFS getstripe $DIR/$tdir/f0 | grep "lcme_mirror_id:.*1" &&
5580 error "(15) The 1st of mirror is not destroyed"
5581 $LFS getstripe $DIR/$tdir/f1 | grep "lcme_mirror_id:.*2" &&
5582 error "(16) The 2nd of mirror is not destroyed"
5583 $LFS getstripe $DIR/$tdir/f2 | grep "lcme_mirror_id:.*3" &&
5584 error "(17) The 3rd of mirror is not destroyed"
# Each file must now report 2 mirrors.
# NOTE(review): no declaration of 'mirrors' is visible in this listing.
5588 mirrors=$($LFS getstripe -N $DIR/$tdir/f0)
5589 [ $mirrors -eq 2 ] || error "(18) $DIR/$tdir/f0 has $mirrors mirrors"
5590 mirrors=$($LFS getstripe -N $DIR/$tdir/f1)
5591 [ $mirrors -eq 2 ] || error "(19) $DIR/$tdir/f1 has $mirrors mirrors"
5592 mirrors=$($LFS getstripe -N $DIR/$tdir/f2)
5593 [ $mirrors -eq 2 ] || error "(20) $DIR/$tdir/f2 has $mirrors mirrors"
5595 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5596 $START_LAYOUT -r -o || error "(21) Fail to start LFSCK for layout!"
5598 for k in $(seq $MDSCOUNT); do
5599 # The LFSCK status query interval is 30 seconds. For the case
5600 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5601 # time to guarantee the status sync up.
5602 wait_update_facet mds${k} "$LCTL get_param -n \
5603 mdd.$(facet_svc mds${k}).lfsck_layout |
5604 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5605 error "(22) MDS${k} is not the expected 'completed'"
# The OSTs are checked once, without polling.
5608 for k in $(seq $OSTCOUNT); do
5609 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5610 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5611 awk '/^status/ { print $2 }')
5612 [ "$cur_status" == "completed" ] ||
5613 error "(23) OST${k} Expect 'completed', but got '$cur_status'"
# 3 files, each with one removed mirror of 2 components whose -o lists name
# 3 OST indexes in total, which matches the expected 9 orphan repairs.
5616 local repaired=$(do_facet mds1 $LCTL get_param -n \
5617 mdd.$(facet_svc mds1).lfsck_layout |
5618 awk '/^repaired_orphan/ { print $2 }')
5619 [ $repaired -eq 9 ] ||
5620 error "(24) Expect 9 fixed on mds1, but got: $repaired"
# After the repair every file must report 3 mirrors again.
5622 mirrors=$($LFS getstripe -N $DIR/$tdir/f0)
5623 [ $mirrors -eq 3 ] || error "(25) $DIR/$tdir/f0 has $mirrors mirrors"
5624 mirrors=$($LFS getstripe -N $DIR/$tdir/f1)
5625 [ $mirrors -eq 3 ] || error "(26) $DIR/$tdir/f1 has $mirrors mirrors"
5626 mirrors=$($LFS getstripe -N $DIR/$tdir/f2)
5627 [ $mirrors -eq 3 ] || error "(27) $DIR/$tdir/f2 has $mirrors mirrors"
# The removed mirror id must be listed again; on failure the full layout is
# printed before error() is called.
5629 $LFS getstripe $DIR/$tdir/f0 | grep "lcme_mirror_id:.*1" || {
5630 $LFS getstripe $DIR/$tdir/f0
5631 error "(28) The 1st of mirror is not recovered"
5634 $LFS getstripe $DIR/$tdir/f1 | grep "lcme_mirror_id:.*2" || {
5635 $LFS getstripe $DIR/$tdir/f1
5636 error "(29) The 2nd of mirror is not recovered"
5639 $LFS getstripe $DIR/$tdir/f2 | grep "lcme_mirror_id:.*3" || {
5640 $LFS getstripe $DIR/$tdir/f2
5641 error "(30) The 3rd of mirror is not recovered"
5644 run_test 36a "rebuild LOV EA for mirrored file (1)"
# Body of the test registered below as 36b: a mirrored file loses its
# MDT-object (simulated with fail_loc 0x1616 on mds1) and the layout LFSCK
# is expected to re-create it under .lustre/lost+found/MDT0000 with all
# three mirrors and six components.
5647 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
5648 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
# Describe the scenario in the test log.
5651 echo "The mirrored file lost its MDT-object, but relatd OST-objects "
5652 echo "are still there. The layout LFSCK should rebuild the LOV EA "
5653 echo "with the PFID EA of related OST-object(s) belong to the file. "
5656 check_mount_and_prep
# Three mirrors (-N), each made of two components pinned to explicit OST
# indices 0..2 via -o, which is why at least 3 OSTs are required above.
5658 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5659 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f0 ||
5660 error "(0) Fail to create mirror file $DIR/$tdir/f0"
# Remember the FID now: the recovered object is looked up by it later.
5662 local fid=$($LFS path2fid $DIR/$tdir/f0)
# Write 4MB, then resync the mirrors.
5664 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5665 error "(1) Fail to write $DIR/$tdir/f0"
5666 $LFS mirror resync $DIR/$tdir/f0 ||
5667 error "(2) Fail to resync $DIR/$tdir/f0"
# Cancel the client's mdc/osc LRU locks before dumping the layout.
5669 cancel_lru_locks mdc
5670 cancel_lru_locks osc
5672 $LFS getstripe $DIR/$tdir/f0 ||
5673 error "(3) Fail to getstripe for $DIR/$tdir/f0"
5675 echo "Inject failure, to simulate the case of missing the MDT-object"
5676 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5677 do_facet mds1 $LCTL set_param fail_loc=0x1616
5678 rm -f $DIR/$tdir/f0 || error "(4) Fail to remove $DIR/$tdir/f0"
# Clear the fault injection again before starting LFSCK.
5682 do_facet mds1 $LCTL set_param fail_loc=0
5684 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5685 $START_LAYOUT -r -o || error "(5) Fail to start LFSCK for layout!"
# Wait for every MDT to report status "completed".
5687 for k in $(seq $MDSCOUNT); do
5688 # The LFSCK status query interval is 30 seconds. In case some
5689 # LFSCK_NOTIFY RPCs fail or get lost, wait long enough
5690 # to guarantee that the status has been synced up.
5691 wait_update_facet mds${k} "$LCTL get_param -n \
5692 mdd.$(facet_svc mds${k}).lfsck_layout |
5693 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5694 error "(6) MDS${k} is not the expected 'completed'"
# Every OST must report "completed" as well (checked once, no waiting).
5697 for k in $(seq $OSTCOUNT); do
5698 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5699 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5700 awk '/^status/ { print $2 }')
5701 [ "$cur_status" == "completed" ] ||
5702 error "(7) OST${k} Expect 'completed', but got '$cur_status'"
# mds1 must report 9 repaired orphans; 9 equals the number of OST indices
# listed via -o in the setstripe above (3 per mirror) -- presumably one
# orphan per OST-object, TODO confirm.
5705 local count=$(do_facet mds1 $LCTL get_param -n \
5706 mdd.$(facet_svc mds1).lfsck_layout |
5707 awk '/^repaired_orphan/ { print $2 }')
5708 [ $count -eq 9 ] || error "(8) Expect 9 fixed on mds1, but got: $count"
# The recovered file is expected at lost+found/MDT0000/<fid>-R-0; from here
# on $count is reused for the mirror and component counts of that file.
5710 local name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
5711 count=$($LFS getstripe --mirror-count $name)
5712 [ $count -eq 3 ] || error "(9) $DIR/$tdir/f0 has $count mirrors"
5714 count=$($LFS getstripe --component-count $name)
5715 [ $count -eq 6 ] || error "(10) $DIR/$tdir/f0 has $count components"
# Each of the mirror IDs 1, 2 and 3 must appear in the recovered layout;
# on failure the full layout is dumped before reporting the error.
5717 $LFS getstripe $name | grep "lcme_mirror_id:.*1" || {
5718 $LFS getstripe $name
5719 error "(11) The 1st of mirror is not recovered"
5722 $LFS getstripe $name | grep "lcme_mirror_id:.*2" || {
5723 $LFS getstripe $name
5724 error "(12) The 2nd of mirror is not recovered"
5727 $LFS getstripe $name | grep "lcme_mirror_id:.*3" || {
5728 $LFS getstripe $name
5729 error "(13) The 3rd of mirror is not recovered"
5732 run_test 36b "rebuild LOV EA for mirrored file (2)"
# Body of the test registered below as 36c: a two-mirror file is written,
# resynced and written again (leaving one mirror stale, see below), then
# loses its MDT-object (fail_loc 0x1616 on mds1). The layout LFSCK is
# expected to re-create it under .lustre/lost+found/MDT0000 with two
# mirrors, four components and the same lcme_flags as before the removal.
5735 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
5736 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
# Describe the scenario in the test log.
5739 echo "The mirrored file has been modified, not resynced yet, then "
5740 echo "lost its MDT-object, but relatd OST-objects are still there. "
5741 echo "The layout LFSCK should rebuild the LOV EA and relatd status "
5742 echo "with the PFID EA of related OST-object(s) belong to the file. "
5745 check_mount_and_prep
# Two mirrors (-N), each made of two components pinned to explicit OST
# indices 0..2 via -o, which is why at least 3 OSTs are required above.
5747 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5749 error "(0) Fail to create mirror file $DIR/$tdir/f0"
# Remember the FID now: the recovered object is looked up by it later.
5751 local fid=$($LFS path2fid $DIR/$tdir/f0)
5753 # The 1st dd && resync makes all related OST-objects have been written
5754 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5755 error "(1.1) Fail to write $DIR/$tdir/f0"
5756 $LFS mirror resync $DIR/$tdir/f0 ||
5757 error "(1.2) Fail to resync $DIR/$tdir/f0"
5758 # The 2nd dd makes one mirror to be stale
5759 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5760 error "(1.3) Fail to write $DIR/$tdir/f0"
# Cancel the client's mdc/osc LRU locks before dumping the layout.
5762 cancel_lru_locks mdc
5763 cancel_lru_locks osc
5765 $LFS getstripe $DIR/$tdir/f0 ||
5766 error "(2) Fail to getstripe for $DIR/$tdir/f0"
# Record the lcme_flags found in the first and in the last 10 lines of the
# getstripe output; they are compared against the recovered file below.
5768 local saved_flags1=$($LFS getstripe $DIR/$tdir/f0 | head -n 10 |
5769 awk '/lcme_flags/ { print $2 }')
5770 local saved_flags2=$($LFS getstripe $DIR/$tdir/f0 | tail -n 10 |
5771 awk '/lcme_flags/ { print $2 }')
5773 echo "Inject failure, to simulate the case of missing the MDT-object"
5774 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5775 do_facet mds1 $LCTL set_param fail_loc=0x1616
5776 rm -f $DIR/$tdir/f0 || error "(3) Fail to remove $DIR/$tdir/f0"
# Clear the fault injection again before starting LFSCK.
5780 do_facet mds1 $LCTL set_param fail_loc=0
5782 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5783 $START_LAYOUT -r -o || error "(4) Fail to start LFSCK for layout!"
# Wait for every MDT to report status "completed".
5785 for k in $(seq $MDSCOUNT); do
5786 # The LFSCK status query interval is 30 seconds. In case some
5787 # LFSCK_NOTIFY RPCs fail or get lost, wait long enough
5788 # to guarantee that the status has been synced up.
5789 wait_update_facet mds${k} "$LCTL get_param -n \
5790 mdd.$(facet_svc mds${k}).lfsck_layout |
5791 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5792 error "(5) MDS${k} is not the expected 'completed'"
# Every OST must report "completed" as well (checked once, no waiting).
5795 for k in $(seq $OSTCOUNT); do
5796 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5797 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5798 awk '/^status/ { print $2 }')
5799 [ "$cur_status" == "completed" ] ||
5800 error "(6) OST${k} Expect 'completed', but got '$cur_status'"
# mds1 must report 6 repaired orphans; 6 equals the number of OST indices
# listed via -o in the setstripe above (3 per mirror, 2 mirrors) --
# presumably one orphan per OST-object, TODO confirm. The error message
# names the same value that the check uses.
5803 local count=$(do_facet mds1 $LCTL get_param -n \
5804 mdd.$(facet_svc mds1).lfsck_layout |
5805 awk '/^repaired_orphan/ { print $2 }')
5806 [ $count -eq 6 ] || error "(7) Expect 6 fixed on mds1, but got: $count"
# The recovered file is expected at lost+found/MDT0000/<fid>-R-0; from here
# on $count is reused for the mirror and component counts of that file.
5808 local name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
5809 count=$($LFS getstripe --mirror-count $name)
5810 [ $count -eq 2 ] || error "(8) $DIR/$tdir/f0 has $count mirrors"
5812 count=$($LFS getstripe --component-count $name)
5813 [ $count -eq 4 ] || error "(9) $DIR/$tdir/f0 has $count components"
# The lcme_flags at the head and at the tail of the recovered layout must
# match what was recorded before the file was removed.
5815 local flags=$($LFS getstripe $name | head -n 10 |
5816 awk '/lcme_flags/ { print $2 }')
5817 [ "$flags" == "$saved_flags1" ] || {
5818 $LFS getstripe $name
5819 error "(10) expect flags $saved_flags1, got $flags"
5822 flags=$($LFS getstripe $name | tail -n 10 |
5823 awk '/lcme_flags/ { print $2 }')
5824 [ "$flags" == "$saved_flags2" ] || {
5825 $LFS getstripe $name
5826 error "(11) expect flags $saved_flags2, got $flags"
5829 run_test 36c "rebuild LOV EA for mirrored file (3)"
# Body of the test registered below as 37: start a namespace LFSCK while a
# background multiop is operating on a directory created on MDT0, and
# expect the scan to reach "completed".
5835 local t_dir="$DIR/$tdir/d0"
5836 check_mount_and_prep
5838 $LFS mkdir -i 0 $t_dir || error "(2) Fail to mkdir $t_dir on MDT0"
# NOTE(review): the exact meaning of the multiop command string "D_c" is
# not visible here -- presumably it opens the directory and pauses until
# signalled; confirm against multiop.
5839 multiop_bg_pause $t_dir D_c || { error "multiop failed: $?"; return 1; }
# START_NAMESPACE runs "lctl lfsck_start -t namespace" on $SINGLEMDS.
# NOTE(review): $PID is not assigned in the lines visible here; it is
# presumably the background multiop's PID -- confirm.
5843 $START_NAMESPACE -r -A || {
5844 error "(3) Fail to start LFSCK for namespace!"; kill -USR1 $PID; }
# NOTE(review): the trailing 4 is presumably the step tag used by the
# helper in its error message -- confirm against its definition.
5846 wait_all_targets_blocked namespace completed 4
5851 run_test 37 "LFSCK must skip a ORPHAN"
# Body of the test registered below as 38: create a file with a foreign
# LOV EA, run namespace and layout LFSCK, and verify that neither reports
# any repair and that the foreign layout is unchanged afterwards.
5855 [[ "$MDS1_VERSION" -le $(version_code 2.12.51) ]] &&
5856 skip "Need MDS version newer than 2.12.51"
5858 test_mkdir $DIR/$tdir
# Two random UUIDs, joined with '@', serve as the foreign layout value.
5859 local uuid1=$(cat /proc/sys/kernel/random/uuid)
5860 local uuid2=$(cat /proc/sys/kernel/random/uuid)
5862 # create foreign file
5863 $LFS setstripe --foreign=none --flags 0xda05 \
5864 -x "${uuid1}@${uuid2}" $DIR/$tdir/$tfile ||
5865 error "$DIR/$tdir/$tfile: create failed"
# Pre-LFSCK checks: magic, length, type, flags and value of the foreign EA
# must match what was just set.
5867 $LFS getstripe -v $DIR/$tdir/$tfile |
5868 grep "lfm_magic:.*0x0BD70BD0" ||
5869 error "$DIR/$tdir/$tfile: invalid LOV EA foreign magic"
5870 # lfm_length is LOV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
# 73 = 36 + 1 + 36: two textual UUIDs joined by '@'.
5871 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_length:.*73" ||
5872 error "$DIR/$tdir/$tfile: invalid LOV EA foreign size"
5873 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_type:.*none" ||
5874 error "$DIR/$tdir/$tfile: invalid LOV EA foreign type"
5875 $LFS getstripe -v $DIR/$tdir/$tfile |
5876 grep "lfm_flags:.*0x0000DA05" ||
5877 error "$DIR/$tdir/$tfile: invalid LOV EA foreign flags"
5878 $LFS getstripe $DIR/$tdir/$tfile |
5879 grep "lfm_value:.*${uuid1}@${uuid2}" ||
5880 error "$DIR/$tdir/$tfile: invalid LOV EA foreign value"
5882 # modify striping should fail
5883 $LFS setstripe -c 2 $DIR/$tdir/$tfile &&
5884 error "$DIR/$tdir/$tfile: setstripe should fail"
# Namespace LFSCK first (START_NAMESPACE runs "lctl lfsck_start -t
# namespace" on $SINGLEMDS).
5886 $START_NAMESPACE -r -A || error "Fail to start LFSCK for namespace"
5888 wait_all_targets_blocked namespace completed 1
5890 # check that "global" namespace_repaired == 0 !!!
# The pipeline is passed to do_facet as one string; $2 is escaped,
# presumably so that awk on the target node sees it unexpanded -- confirm.
5891 local repaired=$(do_facet mds1 \
5892 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
5893 awk '/^namespace_repaired/ { print \\\$2 }'")
5894 [ $repaired -eq 0 ] ||
5895 error "(2) Expect no namespace repair, but got: $repaired"
# Then layout LFSCK, with the same "nothing repaired" expectation.
5897 $START_LAYOUT -A -r || error "Fail to start LFSCK for layout"
5899 wait_all_targets_blocked layout completed 2
5901 # check that "global" layout_repaired == 0 !!!
5902 local repaired=$(do_facet mds1 \
5903 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
5904 awk '/^layout_repaired/ { print \\\$2 }'")
5905 [ $repaired -eq 0 ] ||
5906 error "(2) Expect no layout repair, but got: $repaired"
# Post-LFSCK checks: the same five foreign EA fields must still match.
5908 echo "post-lfsck checks of foreign file"
5910 $LFS getstripe -v $DIR/$tdir/$tfile |
5911 grep "lfm_magic:.*0x0BD70BD0" ||
5912 error "$DIR/$tdir/$tfile: invalid LOV EA foreign magic"
5913 # lfm_length is LOV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
5914 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_length:.*73" ||
5915 error "$DIR/$tdir/$tfile: invalid LOV EA foreign size"
5916 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_type:.*none" ||
5917 error "$DIR/$tdir/$tfile: invalid LOV EA foreign type"
5918 $LFS getstripe -v $DIR/$tdir/$tfile |
5919 grep "lfm_flags:.*0x0000DA05" ||
5920 error "$DIR/$tdir/$tfile: invalid LOV EA foreign flags"
5921 $LFS getstripe $DIR/$tdir/$tfile |
5922 grep "lfm_value:.*${uuid1}@${uuid2}" ||
5923 error "$DIR/$tdir/$tfile: invalid LOV EA foreign value"
5925 # modify striping should fail
5926 $LFS setstripe -c 2 $DIR/$tdir/$tfile &&
5927 error "$DIR/$tdir/$tfile: setstripe should fail"
# Reading from and writing to the foreign file are both expected to fail.
5930 cat $DIR/$tdir/$tfile && error "$DIR/$tdir/$tfile: read should fail"
5931 cat /etc/passwd > $DIR/$tdir/$tfile &&
5932 error "$DIR/$tdir/$tfile: write should fail"
5934 #remove foreign file
5935 rm $DIR/$tdir/$tfile ||
5936 error "$DIR/$tdir/$tfile: remove of foreign file has failed"
5938 run_test 38 "LFSCK does not break foreign file and reverse is also true"
# Body of the test registered below as 39: create a directory with a
# foreign LMV EA, run namespace and layout LFSCK, and verify that neither
# reports any repair and that the foreign directory is unchanged afterwards.
5942 [[ "$MDS1_VERSION" -le $(version_code 2.12.51) ]] &&
5943 skip "Need MDS version newer than 2.12.51"
5945 test_mkdir $DIR/$tdir
# Two random UUIDs, joined with '@', serve as the foreign layout value.
5946 local uuid1=$(cat /proc/sys/kernel/random/uuid)
5947 local uuid2=$(cat /proc/sys/kernel/random/uuid)
5949 # create foreign dir
5950 $LFS mkdir --foreign=none --xattr="${uuid1}@${uuid2}" --flags=0xda05 \
5951 $DIR/$tdir/${tdir}2 ||
5952 error "$DIR/$tdir/${tdir}2: create failed"
# Pre-LFSCK checks: magic, length, type, flags and value of the foreign EA
# must match what was just set.
5954 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
5955 grep "lfm_magic:.*0x0CD50CD0" ||
5956 error "$DIR/$tdir/${tdir}2: invalid LMV EA magic"
5957 # lfm_length is LMV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
5958 # - sizeof(lfm_type) - sizeof(lfm_flags)
# 73 = 36 + 1 + 36: two textual UUIDs joined by '@'.
5959 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_length:.*73" ||
5960 error "$DIR/$tdir/${tdir}2: invalid LMV EA size"
5961 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_type:.*none" ||
5962 error "$DIR/$tdir/${tdir}2: invalid LMV EA type"
5963 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
5964 grep "lfm_flags:.*0x0000DA05" ||
5965 error "$DIR/$tdir/${tdir}2: invalid LMV EA flags"
5966 $LFS getdirstripe $DIR/$tdir/${tdir}2 |
5967 grep "lfm_value.*${uuid1}@${uuid2}" ||
5968 error "$DIR/$tdir/${tdir}2: invalid LMV EA value"
5970 # file create in dir should fail
# An unexpected success must be reported through error(); without it the
# shell would try to execute the message string as a command.
5971 touch $DIR/$tdir/${tdir}2/$tfile &&
5972 error "$DIR/${tdir}2: file create should fail"
# chmod and chown on the foreign directory are expected to succeed.
5975 chmod 777 $DIR/$tdir/${tdir}2 ||
5976 error "$DIR/${tdir}2: chmod failed"
5979 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/${tdir}2 ||
5980 error "$DIR/${tdir}2: chown failed"
# Namespace LFSCK first (START_NAMESPACE runs "lctl lfsck_start -t
# namespace" on $SINGLEMDS).
5982 $START_NAMESPACE -r -A || error "Fail to start LFSCK for namespace"
5984 wait_all_targets_blocked namespace completed 1
5986 # check that "global" namespace_repaired == 0 !!!
# The pipeline is passed to do_facet as one string; $2 is escaped,
# presumably so that awk on the target node sees it unexpanded -- confirm.
5987 local repaired=$(do_facet mds1 \
5988 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
5989 awk '/^namespace_repaired/ { print \\\$2 }'")
5990 [ $repaired -eq 0 ] ||
5991 error "(2) Expect nothing to be repaired, but got: $repaired"
# Then layout LFSCK, with the same "nothing repaired" expectation.
5993 $START_LAYOUT -A -r || error "Fail to start LFSCK for layout"
5995 wait_all_targets_blocked layout completed 2
5997 # check that "global" layout_repaired == 0 !!!
5998 local repaired=$(do_facet mds1 \
5999 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
6000 awk '/^layout_repaired/ { print \\\$2 }'")
6001 [ $repaired -eq 0 ] ||
6002 error "(2) Expect no layout repair, but got: $repaired"
# Post-LFSCK checks: the same five foreign EA fields must still match.
6004 echo "post-lfsck checks of foreign dir"
6006 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
6007 grep "lfm_magic:.*0x0CD50CD0" ||
6008 error "$DIR/$tdir/${tdir}2: invalid LMV EA magic"
6009 # lfm_length is LMV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
6010 # - sizeof(lfm_type) - sizeof(lfm_flags)
6011 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_length:.*73" ||
6012 error "$DIR/$tdir/${tdir}2: invalid LMV EA size"
6013 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 | grep "lfm_type:.*none" ||
6014 error "$DIR/$tdir/${tdir}2: invalid LMV EA type"
6015 $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
6016 grep "lfm_flags:.*0x0000DA05" ||
6017 error "$DIR/$tdir/${tdir}2: invalid LMV EA flags"
6018 $LFS getdirstripe $DIR/$tdir/${tdir}2 |
6019 grep "lfm_value.*${uuid1}@${uuid2}" ||
6020 error "$DIR/$tdir/${tdir}2: invalid LMV EA value"
6022 # file create in dir should fail
# As above: report an unexpected success through error().
6023 touch $DIR/$tdir/${tdir}2/$tfile &&
6024 error "$DIR/${tdir}2: file create should fail"
# chmod and chown must still succeed after LFSCK.
6027 chmod 777 $DIR/$tdir/${tdir}2 ||
6028 error "$DIR/${tdir}2: chmod failed"
6031 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/${tdir}2 ||
6032 error "$DIR/${tdir}2: chown failed"
# Finally the foreign directory must be removable.
6035 rmdir $DIR/$tdir/${tdir}2 ||
6036 error "$DIR/$tdir/${tdir}2: remove of foreign dir has failed"
6038 run_test 39 "LFSCK does not break foreign dir and reverse is also true"
# Body of the test registered below as 40a: a file created in a directory
# with a composite default layout must keep the same layout after the
# directory is migrated to another MDT and a layout LFSCK has run.
6041 [[ $MDSCOUNT -ge 2 ]] || skip "needs >= 2 MDTs"
6043 check_mount_and_prep
# Directory on MDT index 1 with a three-component default layout.
6044 $LFS mkdir -i 1 $DIR/$tdir/dir1
6045 $LFS setstripe -E 1M -c1 -S 1M -E 128M -c2 -S 4M -E eof $DIR/$tdir/dir1
# Record the file's layout before migration as the reference.
6047 touch $DIR/$tdir/dir1/f1
6048 local layout1=$(get_layout_param $DIR/$tdir/dir1/f1)
6050 echo "Migrate $DIR/$tdir/dir1 from MDT1 to MDT0"
6051 $LFS migrate -m 0 $DIR/$tdir/dir1
6053 echo "trigger LFSCK for layout"
6054 do_facet $SINGLEMDS $LCTL lfsck_start -M ${MDT_DEV} -t layout -r
# Wait until lfsck_layout on $SINGLEMDS reports status "completed".
6056 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
6057 mdd.${MDT_DEV}.lfsck_layout |
6058 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
6060 error "(2) unexpected status"
# The layout read back after LFSCK must equal the one recorded above.
6063 local layout2=$(get_layout_param $DIR/$tdir/dir1/f1)
6065 [[ "$layout1" == "$layout2" ]] || error "layout lost after lfsck"
6067 run_test 40a "LFSCK correctly fixes lmm_oi in composite layout"
# Body of the test registered below as 41: run LFSCK over a file created
# with "-z" components (the SEL layout named in the test title) and fail if
# the MDS debug log mentions lfsck_layout_verify_header.
# Save the current debug mask on $SINGLEMDS so it can be restored at the end.
6071 local old_debug=$(do_facet $SINGLEMDS $LCTL get_param -n debug)
6073 do_facet $SINGLEMDS $LCTL set_param debug=+lfsck
6074 $LFS setstripe -E 1G -z 64M -E -1 -z 128M $DIR/$tfile
# Dump the debug log and discard it -- presumably to empty the buffer so
# that the later dump only holds messages from this LFSCK run; confirm.
6075 do_facet $SINGLEMDS $LCTL dk > /dev/null
6077 echo "trigger LFSCK for SEL layout"
6078 do_facet $SINGLEMDS $LCTL lfsck_start -M ${MDT_DEV} -A -t all -r -n on
# Wait until lfsck_layout on $SINGLEMDS reports status "completed".
6079 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
6080 mdd.${MDT_DEV}.lfsck_layout |
6081 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
6083 error "(2) unexpected status"
# Any debug-log line mentioning lfsck_layout_verify_header counts as a
# failure.
6086 local errors=$(do_facet $SINGLEMDS $LCTL dk |
6087 grep "lfsck_layout_verify_header")
6089 [[ "x$errors" == "x" ]] || {
6091 error "lfsck failed"
# Restore the debug mask saved at the beginning.
6094 do_facet $SINGLEMDS "$LCTL set_param debug='$old_debug'"
6096 run_test 41 "SEL support in LFSCK"
6098 # Restore the MDS/OST sizes and the OST count that were saved in the
# SAVED_* variables at the top of this script.
6099 MDSSIZE=${SAVED_MDSSIZE}
6100 OSTSIZE=${SAVED_OSTSIZE}
6101 OSTCOUNT=${SAVED_OSTCOUNT}
6103 # Finally, clean up the system, reformatting with the restored settings.
6104 REFORMAT="yes" cleanup_and_setup_lustre
6107 check_and_cleanup_lustre