# Script preamble: seeds the test exclusion list, sources the shared test
# framework and the configuration file, then bails out early when the
# environment is unsuitable.
# NOTE(review): the embedded line numbers jump (4 -> 11, 12 -> 14, ...), so
# some original lines are not visible in this excerpt.
3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
11 # Bug numbers for the excepted tests
# Start the permanent exclusion list from the caller-supplied variable.
12 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT"
# When SLOW is "no", EXCEPT_SLOW is set to the empty string.
14 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
15 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
# Default LUSTRE to the parent of the directory holding this script.
17 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
18 . $LUSTRE/tests/test-framework.sh
# Source the config file; CONFIG defaults to cfg/$NAME.sh under the tests dir.
20 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
# Exit quietly (status 0) if require_dsh_mds fails.
23 require_dsh_mds || exit 0
# Skip when check_versions fails.
# NOTE(review): the rest of this `if` (including its `fi`) is not visible here.
27 if ! check_versions; then
28 skip "It is NOT necessary to test lfsck under interoperation mode"
# Skip and exit when the MDS version code is below 2.3.60.
32 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.3.60) ]] &&
33 skip "Need MDS version at least 2.3.60" && exit 0
# Environment sizing and formatting, followed by version-dependent additions
# to ALWAYS_EXCEPT.
# Remember the incoming sizes/count before they are overridden.
37 SAVED_MDSSIZE=${MDSSIZE}
38 SAVED_OSTSIZE=${OSTSIZE}
39 SAVED_OSTCOUNT=${OSTCOUNT}
40 # Use small MDS and OST sizes to speed up formatting.
41 # Do not make MDSSIZE/OSTSIZE too small: they affect the default journal size.
# NOTE(review): embedded lines 42 and 44 are not visible; only the zfs
# overrides are shown here.
43 [ $(facet_fstype $SINGLEMDS) == zfs ] && MDSSIZE=300000
45 [ $(facet_fstype ost1) == zfs ] && OSTSIZE=300000
47 # There is no need for many OSTs; cap them to reduce format/start/stop overhead.
49 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
51 # build up a clean test environment.
52 REFORMAT="yes" check_and_setup_lustre
# MDS version code <= 2.4.90: exclude test 2c.
54 [[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.4.90) ]] &&
55 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2c"
# ost1 version code < 2.5.55: exclude tests 11 through 21.
57 [[ $(lustre_version_code ost1) -lt $(version_code 2.5.55) ]] &&
58 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 11 12 13 14 15 16 17 18 19 20 21"
# MDS version code < 2.6.50: exclude 2d, 2e, 3 and 22 through 31.
60 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.50) ]] &&
61 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2d 2e 3 22 23 24 25 26 27 28 29 30 31"
63 # DNE does not support striped directory on zfs-based backend yet.
# Any non-ldiskfs MDS backend excludes test 31.
64 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
65 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 31"
# Shared command strings and mount options used by the tests below.
# The START_*/STOP_*/SHOW_* values are plain strings that the tests expand
# unquoted (e.g. `$START_NAMESPACE -r`) so that word splitting turns them
# back into a command line.
# Target names of the first MDT and first OST of the filesystem.
69 MDT_DEV="${FSNAME}-MDT0000"
70 OST_DEV="${FSNAME}-OST0000"
# Device name for $SINGLEMDS; the "mds" text is stripped from the facet name
# before it is passed to mdsdevname.
71 MDT_DEVNAME=$(mdsdevname ${SINGLEMDS//mds/})
# Start LFSCK on the MDT, namespace or layout type.
72 START_NAMESPACE="do_facet $SINGLEMDS \
73 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
74 START_LAYOUT="do_facet $SINGLEMDS \
75 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
# Start layout LFSCK directly on ost1.
76 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
77 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
# Dump the LFSCK parameter files; tests pipe these through awk to pick fields.
78 SHOW_NAMESPACE="do_facet $SINGLEMDS \
79 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
80 SHOW_LAYOUT="do_facet $SINGLEMDS \
81 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
82 SHOW_LAYOUT_ON_OST="do_facet ost1 \
83 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# Mount option sets passed to `start` by the tests.
84 MOUNT_OPTS_SCRUB="-o user_xattr"
85 MOUNT_OPTS_NOSCRUB="-o user_xattr,noscrub"
86 MOUNT_OPTS_SKIP_LFSCK="-o user_xattr,skip_lfsck"
# Body of a data-preparation helper: populates $DIR/$tdir with files and
# directories, optionally under fail_loc 0x1504.
# NOTE(review): the function header, the assignments of nfiles/ndirs/igif and
# the closing fi/done/} lines are not visible in this excerpt.
95 echo "preparing... $nfiles * $ndirs files will be created $(date)."
# A non-empty igif enables fail_loc 0x1504 for the creations below.
# NOTE(review): $igif is unquoted; `[ -n "$igif" ]` would be the robust form.
96 if [ ! -z $igif ]; then
97 #define OBD_FAIL_FID_IGIF 0x1504
98 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
# Copy the test scripts into the test directory.
101 cp $LUSTRE/tests/*.sh $DIR/$tdir/
102 if [ $ndirs -gt 0 ]; then
# Create $ndirs entries with prefix d (-d) and $ndirs with prefix f (-m).
103 createmany -d $DIR/$tdir/d $ndirs
104 createmany -m $DIR/$tdir/f $ndirs
105 if [ $nfiles -gt 0 ]; then
# Put $nfiles entries under each d${i}; output is discarded.
106 for ((i = 0; i < $ndirs; i++)); do
107 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
108 /dev/null || error "createmany $nfiles"
111 createmany -d $DIR/$tdir/e $ndirs
# With igif set, create one more file and then clear fail_loc.
114 if [ ! -z $igif ]; then
115 touch $DIR/$tdir/dummy
116 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
119 echo "prepared $(date)."
# run_e2fsck_on_mdt0: stops $SINGLEMDS, runs run_e2fsck with "-n" against
# MDT device 1 on the facet's active host, and restarts the MDT with the
# noscrub mount options. Returns immediately for non-ldiskfs backends.
# NOTE(review): embedded lines 127, 130 and the closing brace are not visible,
# so the stage that consumes the first run_e2fsck pipe is unknown from here.
122 run_e2fsck_on_mdt0() {
123 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] && return
# NOTE(review): the message reads "Fail to the stop MDT0"; probably meant
# "Fail to stop MDT0". It is a runtime string, so it is left unchanged.
125 stop $SINGLEMDS > /dev/null || error "(0) Fail to the stop MDT0"
126 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n" |
128 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n"
129 error "(2) Detected inconsistency on MDT0"
131 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
132 error "(3) Fail to start MDT0"
# wait_all_targets_blocked: queries LFSCK state on mds1 with `lfsck_query -w`
# and fails unless the "<com>_mdts_<status>" counter equals $MDSCOUNT.
# NOTE(review): the assignments of com/status/err and the closing braces are
# not visible here; the call site `wait_all_targets_blocked namespace
# completed 4` suggests they are $1, $2 and $3 -- confirm.
135 wait_all_targets_blocked() {
# Take field 2 of the line starting with "${com}_mdts_${status}".
140 local count=$(do_facet mds1 \
141 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 -w |
142 awk '/^${com}_mdts_${status}/ { print \\\$2 }'")
# On mismatch, dump the full query output and then raise the error.
143 [[ $count -eq $MDSCOUNT ]] || {
144 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
145 error "($err) only $count of $MDSCOUNT MDTs are in ${status}"
# Body of a second wait helper: uses wait_update_facet on mds1 to wait for the
# "<com>_mdts_<status>" counter of `lfsck_query` (no -w here) to equal
# $MDSCOUNT, passing $LTIME as the last argument.
# NOTE(review): the function header, the definitions of com/status/err/LTIME
# and the closing braces are not visible in this excerpt.
154 wait_update_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 |
155 awk '/^${com}_mdts_${status}/ { print \\\$2 }'" \
156 "$MDSCOUNT" $LTIME || {
# On failure, dump the full query output and then raise the error.
157 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
158 error "($err) some MDTs are not in ${status}"
# Test 0 body: start, stop and restart namespace LFSCK by hand and check the
# reported status at each step, then verify counters after completion.
# NOTE(review): the function header, any preparation call, and closing braces
# are not visible in this excerpt.
165 #define OBD_FAIL_LFSCK_DELAY1 0x1600
166 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
# -r is passed on the first start (and again for the reset at step 11).
167 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
169 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
# Field 2 of the "status" line is the current LFSCK state.
171 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
172 [ "$STATUS" == "scanning-phase1" ] ||
173 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
175 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
177 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
178 [ "$STATUS" == "stopped" ] ||
179 error "(6) Expect 'stopped', but got '$STATUS'"
# Restart without -r.
181 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
183 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
184 [ "$STATUS" == "scanning-phase1" ] ||
185 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the fail injection and wait for the status to read "completed".
# NOTE(review): 32 is presumably a timeout in seconds -- confirm against
# wait_update_facet.
187 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
188 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
189 mdd.${MDT_DEV}.lfsck_namespace |
190 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
192 error "(9) unexpected status"
# updated_phase1 must be 0: nothing should have been repaired.
195 local repaired=$($SHOW_NAMESPACE |
196 awk '/^updated_phase1/ { print $2 }')
197 [ $repaired -eq 0 ] ||
198 error "(10) Expect nothing to be repaired, but got: $repaired"
# success_count must grow by exactly one after a second full run.
200 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
201 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
202 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
203 mdd.${MDT_DEV}.lfsck_namespace |
204 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
206 error "(12) unexpected status"
209 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
210 [ $((scanned1 + 1)) -eq $scanned2 ] ||
211 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
213 echo "stopall, should NOT crash LU-3649"
214 stopall || error "(14) Fail to stopall"
216 run_test 0 "Control LFSCK manually"
# Test 1a body: create a file under fail_loc 0x1501, run namespace LFSCK and
# expect exactly one repair, then list the directory under fail_loc 0x1505.
# NOTE(review): the function header, preparation steps and closing braces are
# not visible in this excerpt.
221 #define OBD_FAIL_FID_INDIR 0x1501
222 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
223 touch $DIR/$tdir/dummy
225 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
227 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
228 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
229 mdd.${MDT_DEV}.lfsck_namespace |
230 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
232 error "(4) unexpected status"
# Prefer the dirent_repaired counter; fall back to updated_phase1 when the
# former is absent from the output.
235 local repaired=$($SHOW_NAMESPACE |
236 awk '/^dirent_repaired/ { print $2 }')
237 # for interop with old server
238 [ -z "$repaired" ] &&
239 repaired=$($SHOW_NAMESPACE |
240 awk '/^updated_phase1/ { print $2 }')
242 [ $repaired -eq 1 ] ||
243 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
247 mount_client $MOUNT || error "(6) Fail to start client!"
# The listing must succeed while fail_loc 0x1505 is active.
249 #define OBD_FAIL_FID_LOOKUP 0x1505
250 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
251 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
253 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
255 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
# Test 1b body: ldiskfs only. Create a file under fail_loc 0x1502, run
# namespace LFSCK with fail_loc 0x1506 set, expect exactly one repair, then
# stat the file under fail_loc 0x1505.
# NOTE(review): the function header, preparation steps and closing braces are
# not visible in this excerpt.
259 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
260 skip "OI Scrub not implemented for ZFS" && return
264 #define OBD_FAIL_FID_INLMA 0x1502
265 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
266 touch $DIR/$tdir/dummy
268 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# 0x1506 stays set for the whole LFSCK run and is cleared after the check.
270 #define OBD_FAIL_FID_NOLMA 0x1506
271 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
272 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
273 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
274 mdd.${MDT_DEV}.lfsck_namespace |
275 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
277 error "(4) unexpected status"
# Prefer dirent_repaired; fall back to updated_phase1 when it is absent.
280 local repaired=$($SHOW_NAMESPACE |
281 awk '/^dirent_repaired/ { print $2 }')
282 # for interop with old server
283 [ -z "$repaired" ] &&
284 repaired=$($SHOW_NAMESPACE |
285 awk '/^updated_phase1/ { print $2 }')
287 [ $repaired -eq 1 ] ||
288 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
290 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
293 mount_client $MOUNT || error "(6) Fail to start client!"
# The stat must succeed while fail_loc 0x1505 is active.
295 #define OBD_FAIL_FID_LOOKUP 0x1505
296 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
297 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
299 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
301 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
# Test 1c body: create a file under fail_loc 0x1504, run namespace LFSCK and
# expect exactly one repair, then list the directory under fail_loc 0x1505.
# NOTE(review): the function header, preparation steps and closing braces are
# not visible in this excerpt.
306 #define OBD_FAIL_FID_IGIF 0x1504
307 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
308 touch $DIR/$tdir/dummy
310 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
312 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
313 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
314 mdd.${MDT_DEV}.lfsck_namespace |
315 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
317 error "(4) unexpected status"
# Prefer dirent_repaired; fall back to updated_phase1 when it is absent.
320 local repaired=$($SHOW_NAMESPACE |
321 awk '/^dirent_repaired/ { print $2 }')
322 # for interop with old server
323 [ -z "$repaired" ] &&
324 repaired=$($SHOW_NAMESPACE |
325 awk '/^updated_phase1/ { print $2 }')
327 [ $repaired -eq 1 ] ||
328 error "(5) Fail to repair lost FID-in-dirent: $repaired"
332 mount_client $MOUNT || error "(6) Fail to start client!"
# The listing must succeed while fail_loc 0x1505 is active.
334 #define OBD_FAIL_FID_LOOKUP 0x1505
335 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
336 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
338 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
340 run_test 1c "LFSCK can find out and repair lost FID-in-dirent"
# Test 1d body: alter the trusted.lma xattr of $tdir on the MDT backend,
# run OI scrub and then namespace LFSCK on all targets (-A), and check that
# each child's linkea parent FID equals the directory's FID.
# NOTE(review): the function header, the MDT stop/start steps around the
# backend mount, and closing braces/done are not visible in this excerpt.
343 [ $MDS1_VERSION -lt $(version_code 2.13.57) ] &&
344 skip "MDS older than 2.13.57"
345 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
# Create a file, a local subdirectory and a directory on MDT index 1, then
# print the directory FID and each child's linkea before the change.
349 touch $DIR/$tdir/$tfile
350 mkdir $DIR/$tdir/subdir
351 $LFS mkdir -i 1 $DIR/$tdir/remotedir
352 $LFS path2fid $DIR/$tdir
353 ll_decode_linkea $DIR/$tdir/$tfile
354 ll_decode_linkea $DIR/$tdir/subdir
355 ll_decode_linkea $DIR/$tdir/remotedir
357 local mntpt=$(facet_mntpt mds1)
359 # unlink OI files to remove the stale entry
360 local saved_opts=$MDS_MOUNT_OPTS
363 mount_fstype mds1 $mntpt
364 # increase $tdir FID oid in LMA
# The sed turns the '0' that sits just before the final 8 characters of the
# hex dump line into '1'; setfattr --restore=- then writes the value back.
365 do_facet mds1 "getfattr -d -m trusted.lma -e hex \
366 --absolute-names $mntpt/ROOT/$tdir | \
367 sed -E 's/0(.{8})$/1\1/' | setfattr --restore=-"
368 unmount_fstype mds1 $mntpt
371 # The FID oid in LMA was increased above, so it is not in the OI table.
372 # Run scrub first to generate the mapping in OI so that the following
373 # namespace check can fix linkea correctly; this is not normally necessary.
374 do_facet mds1 $LCTL lfsck_start -M ${MDT_DEV} -t scrub ||
375 error "failed to start LFSCK for scrub!"
376 wait_update_facet mds1 "$LCTL get_param -n \
377 osd-*.$(facet_svc mds1).oi_scrub |
378 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
379 error "unexpected status"
381 $START_NAMESPACE -r -A || error "fail to start LFSCK for namespace!"
382 wait_update_facet mds1 "$LCTL get_param -n \
383 mdd.${MDT_DEV}.lfsck_namespace |
384 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
386 error "unexpected status"
# Print the FID and linkea values again after LFSCK.
388 $LFS path2fid $DIR/$tdir
389 ll_decode_linkea $DIR/$tdir/$tfile
390 ll_decode_linkea $DIR/$tdir/subdir
391 ll_decode_linkea $DIR/$tdir/remotedir
# Compare the directory FID against field 3 of each child's "pfid" line.
# NOTE(review): no `local` for fid/pfid/f is visible in this excerpt.
396 fid=$($LFS path2fid $DIR/$tdir)
397 for f in $tfile subdir remotedir; do
398 pfid=$(ll_decode_linkea $DIR/$tdir/$f |
399 awk '/pfid/ { print $3 }')
401 [ "$pfid" == "$fid" ] || error "$fid in LMA != $pfid in linkea"
404 run_test 1d "LFSCK can fix mismatch of FID in LMA and FID in child linkea"
# Test 2a body: create a file under fail_loc 0x1603, run namespace LFSCK and
# expect exactly one linkEA repair, then check the link count and that
# fid2path maps the file's FID back to its path.
# NOTE(review): the function header, preparation steps and closing braces are
# not visible in this excerpt.
409 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
410 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
411 touch $DIR/$tdir/dummy
413 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
415 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
416 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
417 mdd.${MDT_DEV}.lfsck_namespace |
418 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
420 error "(4) unexpected status"
# Prefer linkea_repaired; fall back to updated_phase2 when it is absent.
423 local repaired=$($SHOW_NAMESPACE |
424 awk '/^linkea_repaired/ { print $2 }')
425 # for interop with old server
426 [ -z "$repaired" ] &&
427 repaired=$($SHOW_NAMESPACE |
428 awk '/^updated_phase2/ { print $2 }')
430 [ $repaired -eq 1 ] ||
431 error "(5) Fail to repair crashed linkEA: $repaired"
435 mount_client $MOUNT || error "(6) Fail to start client!"
# stat output must contain "Links: 1".
437 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
438 error "(7) Fail to stat $DIR/$tdir/dummy"
# Round trip: path -> FID -> path must return the original path.
440 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
441 local dummyname=$($LFS fid2path $DIR $dummyfid)
442 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
443 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
445 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
# Test 2b body: create a file under fail_loc 0x1604, run namespace LFSCK and
# expect updated_phase2 to be exactly 1, then check the link count and the
# path -> FID -> path round trip.
# NOTE(review): the function header, preparation steps and closing braces are
# not visible in this excerpt.
451 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
452 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
453 touch $DIR/$tdir/dummy
455 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
457 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
458 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
459 mdd.${MDT_DEV}.lfsck_namespace |
460 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
462 error "(4) unexpected status"
# This test reads updated_phase2 directly, with no fallback counter.
465 local repaired=$($SHOW_NAMESPACE |
466 awk '/^updated_phase2/ { print $2 }')
467 [ $repaired -eq 1 ] ||
468 error "(5) Fail to repair crashed linkEA: $repaired"
472 mount_client $MOUNT || error "(6) Fail to start client!"
# stat output must contain "Links: 1".
474 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
475 error "(7) Fail to stat $DIR/$tdir/dummy"
# Round trip: path -> FID -> path must return the original path.
477 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
478 local dummyname=$($LFS fid2path $DIR $dummyfid)
479 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
480 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
482 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
# Test 2c body: create a file under fail_loc 0x1605, run namespace LFSCK and
# expect updated_phase2 to be exactly 1, then check the link count and the
# path -> FID -> path round trip.
# NOTE(review): the function header, preparation steps and closing braces are
# not visible in this excerpt.
488 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
489 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
490 touch $DIR/$tdir/dummy
492 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
494 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
495 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
496 mdd.${MDT_DEV}.lfsck_namespace |
497 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
499 error "(4) unexpected status"
# This test reads updated_phase2 directly, with no fallback counter.
502 local repaired=$($SHOW_NAMESPACE |
503 awk '/^updated_phase2/ { print $2 }')
504 [ $repaired -eq 1 ] ||
505 error "(5) Fail to repair crashed linkEA: $repaired"
509 mount_client $MOUNT || error "(6) Fail to start client!"
# stat output must contain "Links: 1".
511 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
512 error "(7) Fail to stat $DIR/$tdir/dummy"
# Round trip: path -> FID -> path must return the original path.
514 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
515 local dummyname=$($LFS fid2path $DIR $dummyfid)
516 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
517 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
519 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
# Test 2d body: create a file under fail_loc 0x161d, run namespace LFSCK and
# expect linkea_repaired to be exactly 1, then check the link count and the
# path -> FID -> path round trip.
# NOTE(review): the function header, preparation steps and closing braces are
# not visible in this excerpt.
525 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
526 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
527 touch $DIR/$tdir/dummy
529 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
531 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
532 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
533 mdd.${MDT_DEV}.lfsck_namespace |
534 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
536 error "(4) unexpected status"
# This test reads linkea_repaired directly, with no fallback counter.
539 local repaired=$($SHOW_NAMESPACE |
540 awk '/^linkea_repaired/ { print $2 }')
541 [ $repaired -eq 1 ] ||
542 error "(5) Fail to repair crashed linkEA: $repaired"
546 mount_client $MOUNT || error "(6) Fail to start client!"
# stat output must contain "Links: 1".
548 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
549 error "(7) Fail to stat $DIR/$tdir/dummy"
# Round trip: path -> FID -> path must return the original path.
551 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
552 local dummyname=$($LFS fid2path $DIR $dummyfid)
553 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
554 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
556 run_test 2d "LFSCK can recover the missing linkEA entry"
# Test 2e body: needs at least two MDTs. Creates d0 on MDT index 1 and, under
# fail_loc 0x1603, d0/d1 on MDT index 0; runs namespace LFSCK on all targets
# (-A); expects one linkEA repair and a correct fid2path result for d0/d1.
# NOTE(review): the function header, preparation steps and closing brace are
# not visible in this excerpt.
560 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
564 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
566 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
567 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
568 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
569 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
571 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
# Arguments: LFSCK component, expected status, error tag used in the message.
573 wait_all_targets_blocked namespace completed 4
575 local repaired=$($SHOW_NAMESPACE |
576 awk '/^linkea_repaired/ { print $2 }')
577 [ $repaired -eq 1 ] ||
578 error "(5) Fail to repair crashed linkEA: $repaired"
# Round trip: path -> FID -> path must return the original path.
580 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
581 local name=$($LFS fid2path $DIR $fid)
582 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
583 error "(6) Fail to repair linkEA: $fid $name"
585 run_test 2e "namespace LFSCK can verify remote object linkEA"
# Test 3 body: build hard links (two of them under fail_loc 0x1603 and
# 0x1604), run namespace LFSCK and check the checked_phase2 and
# multiple_linked_repaired counters.
# NOTE(review): the function header, the step that creates d0/f0 and d0/f1,
# and closing braces are not visible in this excerpt.
591 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
592 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
593 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
595 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
596 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
597 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
# First injected link: edir/w0 -> edir/f0 under 0x1603.
599 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
600 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
601 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
# Second injected link: edir/w1 -> edir/f1 under 0x1604.
603 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
604 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
605 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
607 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
609 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
610 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
611 mdd.${MDT_DEV}.lfsck_namespace |
612 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
614 error "(10) unexpected status"
# At least 4 objects must have been checked in phase 2.
617 local checked=$($SHOW_NAMESPACE |
618 awk '/^checked_phase2/ { print $2 }')
619 [ $checked -ge 4 ] ||
620 error "(11) Fail to check multiple-linked object: $checked"
# At least 2 multiple-linked repairs are expected.
622 local repaired=$($SHOW_NAMESPACE |
623 awk '/^multiple_linked_repaired/ { print $2 }')
624 [ $repaired -ge 2 ] ||
625 error "(12) Fail to repair multiple-linked object: $repaired"
627 run_test 3 "LFSCK can verify multiple-linked objects"
# Test 4 body: ldiskfs only. Backs up and restores the MDT, restarts it with
# the noscrub mount options, runs namespace LFSCK under fail_loc 0x1601 and
# checks flags, status and the repair counter; finally lists the directory
# under fail_loc 0x1505.
# NOTE(review): the function header, preparation steps and closing braces are
# not visible in this excerpt.
631 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
632 skip "OI Scrub not implemented for ZFS" && return
635 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
636 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
638 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
639 echo "start $SINGLEMDS with disabling OI scrub"
640 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
641 error "(2) Fail to start MDS!"
643 #define OBD_FAIL_LFSCK_DELAY2 0x1601
644 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
645 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
# Wait for the "flags" line (not "status") to read "inconsistent".
646 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
647 mdd.${MDT_DEV}.lfsck_namespace |
648 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
650 error "(5) unexpected status"
653 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
654 [ "$STATUS" == "scanning-phase1" ] ||
655 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the fail injection and wait for completion.
657 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
658 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
659 mdd.${MDT_DEV}.lfsck_namespace |
660 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
662 error "(7) unexpected status"
# Flags must be empty once LFSCK has completed.
# NOTE(review): FLAGS is assigned without `local` in the visible lines.
665 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
666 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
# Prefer dirent_repaired; fall back to updated_phase1 when it is absent.
668 local repaired=$($SHOW_NAMESPACE |
669 awk '/^dirent_repaired/ { print $2 }')
670 # for interop with old server
671 [ -z "$repaired" ] &&
672 repaired=$($SHOW_NAMESPACE |
673 awk '/^updated_phase1/ { print $2 }')
675 [ $repaired -ge 9 ] ||
676 error "(9) Fail to re-generate FID-in-dirent: $repaired"
680 mount_client $MOUNT || error "(10) Fail to start client!"
682 #define OBD_FAIL_FID_LOOKUP 0x1505
683 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
684 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
685 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
687 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
# Test 5 body: ldiskfs only. Same flow as the backup/restore test, but
# mds_backup_restore gets an extra argument "1", the expected flags are
# "inconsistent,upgrade", and at least 2 repairs are required. Ends with
# stat/ls under fail_loc 0x1505 and a path -> FID -> path round trip.
# NOTE(review): the function header, preparation steps and closing braces are
# not visible in this excerpt.
691 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
692 skip "OI Scrub not implemented for ZFS" && return
695 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
696 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
698 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
699 echo "start $SINGLEMDS with disabling OI scrub"
700 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
701 error "(2) Fail to start MDS!"
703 #define OBD_FAIL_LFSCK_DELAY2 0x1601
704 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
705 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
# Wait for the "flags" line to read "inconsistent,upgrade".
706 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
707 mdd.${MDT_DEV}.lfsck_namespace |
708 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
710 error "(5) unexpected status"
713 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
714 [ "$STATUS" == "scanning-phase1" ] ||
715 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the fail injection and wait for completion.
717 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
718 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
719 mdd.${MDT_DEV}.lfsck_namespace |
720 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
722 error "(7) unexpected status"
# Flags must be empty once LFSCK has completed.
# NOTE(review): FLAGS is assigned without `local` in the visible lines.
725 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
726 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
# Prefer dirent_repaired; fall back to updated_phase1 when it is absent.
728 local repaired=$($SHOW_NAMESPACE |
729 awk '/^dirent_repaired/ { print $2 }')
730 # for interop with old server
731 [ -z "$repaired" ] &&
732 repaired=$($SHOW_NAMESPACE |
733 awk '/^updated_phase1/ { print $2 }')
735 [ $repaired -ge 2 ] ||
736 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
740 mount_client $MOUNT || error "(10) Fail to start client!"
# Both stat and ls must succeed while fail_loc 0x1505 is active.
742 #define OBD_FAIL_FID_LOOKUP 0x1505
743 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
744 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
746 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
748 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Round trip: path -> FID -> path must return the original path.
749 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
750 local dummyname=$($LFS fid2path $DIR $dummyfid)
751 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
752 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
754 run_test 5 "LFSCK can handle IGIF object upgrading"
# Test 6a body: start namespace LFSCK under fail_loc 0x1600, force it into
# "failed" with fail_loc 0x80001608, restart without -r and check that
# latest_start_position is greater than the earlier last_checkpoint_position.
# NOTE(review): the function header, the sleep mentioned by the comment at
# embedded line 767, the final pipeline stage of the POS0/POS1 assignments
# (their visible lines end in `|`), and closing braces are not visible here.
759 #define OBD_FAIL_LFSCK_DELAY1 0x1600
760 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
761 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
763 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
764 [ "$STATUS" == "scanning-phase1" ] ||
765 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
767 # Sleep 3 sec to guarantee at least one object processed by LFSCK
769 # Fail the LFSCK to guarantee there is at least one checkpoint
770 #define OBD_FAIL_LFSCK_FATAL1 0x1608
771 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
772 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
773 mdd.${MDT_DEV}.lfsck_namespace |
774 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
776 error "(4) unexpected status"
779 local POS0=$($SHOW_NAMESPACE |
780 awk '/^last_checkpoint_position/ { print $2 }' |
783 #define OBD_FAIL_LFSCK_DELAY1 0x1600
784 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
785 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
787 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
788 [ "$STATUS" == "scanning-phase1" ] ||
789 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
791 local POS1=$($SHOW_NAMESPACE |
792 awk '/^latest_start_position/ { print $2 }' |
794 [[ $POS0 -lt $POS1 ]] ||
795 error "(7) Expect larger than: $POS0, but got $POS1"
# Clear the fail injection and wait for completion.
797 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
798 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
799 mdd.${MDT_DEV}.lfsck_namespace |
800 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
802 error "(8) unexpected status"
805 run_test 6a "LFSCK resumes from last checkpoint (1)"
# Test 6b body: same shape as the previous checkpoint test but with fail_loc
# 0x1601 and 0x80001609. It records two positions per sample: O_POS* from
# field 2 and D_POS* from field 4 of the position line.
# NOTE(review): the function header, the sleep mentioned at embedded line 818,
# the final pipeline stage of the O_POS0/O_POS1 assignments, the else/fi of
# the position comparison, and closing braces are not visible here.
810 #define OBD_FAIL_LFSCK_DELAY2 0x1601
811 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
812 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
814 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
815 [ "$STATUS" == "scanning-phase1" ] ||
816 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
818 # Sleep 5 sec to guarantee that we are in the directory scanning
820 # Fail the LFSCK to guarantee there is at least one checkpoint
821 #define OBD_FAIL_LFSCK_FATAL2 0x1609
822 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
823 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
824 mdd.${MDT_DEV}.lfsck_namespace |
825 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
827 error "(4) unexpected status"
830 local O_POS0=$($SHOW_NAMESPACE |
831 awk '/^last_checkpoint_position/ { print $2 }' |
834 local D_POS0=$($SHOW_NAMESPACE |
835 awk '/^last_checkpoint_position/ { print $4 }')
837 #define OBD_FAIL_LFSCK_DELAY2 0x1601
838 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
839 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
841 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
842 [ "$STATUS" == "scanning-phase1" ] ||
843 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
845 local O_POS1=$($SHOW_NAMESPACE |
846 awk '/^latest_start_position/ { print $2 }' |
848 local D_POS1=$($SHOW_NAMESPACE |
849 awk '/^latest_start_position/ { print $4 }')
851 echo "Additional debug for 6b"
# If either D position is "N/A" or "0x0", compare the O positions; the D
# comparison below is presumably the else branch (not visible) -- confirm.
853 if [ "$D_POS0" == "N/A" -o "$D_POS0" == "0x0" \
854 -o "$D_POS1" == "0x0" -o "$D_POS1" == "N/A" ]; then
855 [[ $O_POS0 -lt $O_POS1 ]] ||
856 error "(7.1) $O_POS1 is not larger than $O_POS0"
858 [[ $D_POS0 -lt $D_POS1 ]] ||
859 error "(7.2) $D_POS1 is not larger than $D_POS0"
# Clear the fail injection and wait for completion.
862 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
863 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
864 mdd.${MDT_DEV}.lfsck_namespace |
865 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
867 error "(8) unexpected status"
870 run_test 6b "LFSCK resumes from last checkpoint (2)"
# Test 7a body: start namespace LFSCK under fail_loc 0x1601, stop the MDS
# while the status is scanning-phase1, restart it with the scrub mount
# options, and expect the LFSCK status to reach "completed" without an
# explicit lfsck_start after the remount.
# NOTE(review): the function header, the sleep mentioned at embedded line 885
# and closing braces are not visible in this excerpt.
877 #define OBD_FAIL_LFSCK_DELAY2 0x1601
878 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
879 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
881 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
882 [ "$STATUS" == "scanning-phase1" ] ||
883 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
885 # Sleep 3 sec to guarantee at least one object processed by LFSCK
887 echo "stop $SINGLEMDS"
888 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop MDS!"
# The fail injection is cleared after the stop and before the restart.
890 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
891 echo "start $SINGLEMDS"
892 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
893 error "(5) Fail to start MDS!"
# Note the last argument is 30 here, not the 32 used by the other waits.
895 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
896 mdd.${MDT_DEV}.lfsck_namespace |
897 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
899 error "(6) unexpected status"
902 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
# Test 7b body: create 20 files under fail_loc 0x1604, start namespace LFSCK
# under fail_loc 0x1602, wait for scanning-phase2, stop and restart the MDS,
# and expect the LFSCK status to reach "completed" without an explicit
# lfsck_start after the remount.
# NOTE(review): the function header, the `done` of the loop and closing braces
# are not visible in this excerpt.
908 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
909 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
910 for ((i = 0; i < 20; i++)); do
911 touch $DIR/$tdir/dummy${i}
914 #define OBD_FAIL_LFSCK_DELAY3 0x1602
915 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
916 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
# Unlike the previous remount test, wait until phase 2 before stopping.
917 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
918 mdd.${MDT_DEV}.lfsck_namespace |
919 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
921 error "(4) unexpected status"
925 echo "stop $SINGLEMDS"
926 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop MDS!"
# The fail injection is cleared after the stop and before the restart.
928 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
929 echo "start $SINGLEMDS"
930 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
931 error "(6) Fail to start MDS!"
933 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
934 mdd.${MDT_DEV}.lfsck_namespace |
935 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
937 error "(7) unexpected status"
940 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
945 formatall > /dev/null
951 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
952 [ "$STATUS" == "init" ] ||
953 error "(2) Expect 'init', but got '$STATUS'"
955 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
956 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
957 mkdir $DIR/$tdir/crashed
959 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
960 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
961 for ((i = 0; i < 5; i++)); do
962 touch $DIR/$tdir/dummy${i}
965 umount_client $MOUNT || error "(3) Fail to stop client!"
967 #define OBD_FAIL_LFSCK_DELAY2 0x1601
968 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
969 $START_NAMESPACE || error "(4) Fail to start LFSCK for namespace!"
971 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
972 [ "$STATUS" == "scanning-phase1" ] ||
973 error "(5) Expect 'scanning-phase1', but got '$STATUS'"
975 $STOP_LFSCK || error "(6) Fail to stop LFSCK!"
977 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
978 [ "$STATUS" == "stopped" ] ||
979 error "(7) Expect 'stopped', but got '$STATUS'"
981 $START_NAMESPACE || error "(8) Fail to start LFSCK for namespace!"
983 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
984 [ "$STATUS" == "scanning-phase1" ] ||
985 error "(9) Expect 'scanning-phase1', but got '$STATUS'"
987 #define OBD_FAIL_LFSCK_FATAL2 0x1609
988 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
989 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
990 mdd.${MDT_DEV}.lfsck_namespace |
991 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
993 error "(10) unexpected status"
996 #define OBD_FAIL_LFSCK_DELAY1 0x1600
997 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
998 $START_NAMESPACE || error "(11) Fail to start LFSCK for namespace!"
1000 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1001 [ "$STATUS" == "scanning-phase1" ] ||
1002 error "(12) Expect 'scanning-phase1', but got '$STATUS'"
1004 #define OBD_FAIL_LFSCK_CRASH 0x160a
1005 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
1008 echo "stop $SINGLEMDS"
1009 stop $SINGLEMDS > /dev/null || error "(13) Fail to stop MDS!"
1011 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
1012 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
1014 echo "start $SINGLEMDS"
1015 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
1016 error "(14) Fail to start MDS!"
1018 local timeout=$(max_recovery_time)
1021 while [ $timer -lt $timeout ]; do
1022 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1023 mdt.${MDT_DEV}.recovery_status |
1024 awk '/^status/ { print \\\$2 }'")
1025 [ "$STATUS" != "RECOVERING" ] && break;
1027 timer=$((timer + 1))
1030 [ $timer != $timeout ] ||
1031 error "(14.1) recovery timeout"
1033 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1034 [ "$STATUS" == "crashed" ] ||
1035 error "(15) Expect 'crashed', but got '$STATUS'"
1037 #define OBD_FAIL_LFSCK_DELAY2 0x1601
1038 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
1039 $START_NAMESPACE || error "(16) Fail to start LFSCK for namespace!"
1041 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1042 [ "$STATUS" == "scanning-phase1" ] ||
1043 error "(17) Expect 'scanning-phase1', but got '$STATUS'"
1045 echo "stop $SINGLEMDS"
1046 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop MDS!"
1048 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
1049 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
1051 echo "start $SINGLEMDS"
1052 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
1053 error "(19) Fail to start MDS!"
1056 while [ $timer -lt $timeout ]; do
1057 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1058 mdt.${MDT_DEV}.recovery_status |
1059 awk '/^status/ { print \\\$2 }'")
1060 [ "$STATUS" != "RECOVERING" ] && break;
1062 timer=$((timer + 1))
1065 [ $timer != $timeout ] ||
1066 error "(19.1) recovery timeout"
1068 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1069 [ "$STATUS" == "paused" ] ||
1070 error "(20) Expect 'paused', but got '$STATUS'"
1072 echo "stop $SINGLEMDS"
1073 stop $SINGLEMDS > /dev/null || error "(20.1) Fail to stop MDS!"
1075 echo "start $SINGLEMDS without resume LFSCK"
1076 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SKIP_LFSCK > /dev/null ||
1077 error "(20.2) Fail to start MDS!"
1080 while [ $timer -lt $timeout ]; do
1081 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1082 mdt.${MDT_DEV}.recovery_status |
1083 awk '/^status/ { print \\\$2 }'")
1084 [ "$STATUS" != "RECOVERING" ] && break;
1086 timer=$((timer + 1))
1089 [ $timer != $timeout ] ||
1090 error "(20.3) recovery timeout"
1092 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1093 [ "$STATUS" == "paused" ] ||
1094 error "(20.4) Expect 'paused', but got '$STATUS'"
1096 #define OBD_FAIL_LFSCK_DELAY3 0x1602
1097 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
1099 $START_NAMESPACE || error "(21) Fail to start LFSCK for namespace!"
1100 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1101 mdd.${MDT_DEV}.lfsck_namespace |
1102 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
1104 error "(22) unexpected status"
1107 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1108 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
1109 error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
1111 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1112 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1113 mdd.${MDT_DEV}.lfsck_namespace |
1114 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1116 error "(24) unexpected status"
1119 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1120 [ -z "$FLAGS" ] || error "(25) Expect empty flags, but got '$FLAGS'"
1122 run_test 8 "LFSCK state machine"
# Body of the test registered as 9a by the run_test line that closes this
# block. It starts a layout LFSCK with a speed limit, raises the limit while
# the scan is running, and checks the reported average_speed_phase1 against
# upper/lower bounds computed from the two limits.
# NOTE(review): RUN_TIME1, RUN_TIME2 and TIME_DIFF are read below but are not
# assigned on any line visible in this listing, and the block terminators
# (fi / else / }) are not visible either -- confirm against the full file.
# An empty grep result means /proc/cpuinfo has no "processor : 1" entry.
1125 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1126 skip "Testing on UP system, the speed may be inaccurate."
# Create $DIR/$tdir/lfsck and 5000 files in it for the scan to walk.
1130 check_mount_and_prep
1131 $LFS mkdir -i 0 $DIR/$tdir/lfsck || error "(1) Fail to mkdir lfsck"
1132 $LFS setstripe -c 1 -i -1 $DIR/$tdir/lfsck
1133 createmany -o $DIR/$tdir/lfsck/f 5000
# First speed limit, passed to lfsck_start through -s.
1135 local BASE_SPEED1=100
1137 $START_LAYOUT -r -s $BASE_SPEED1 || error "(2) Fail to start LFSCK!"
# The scan must still be in phase 1 when the speed is sampled.
1140 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
1141 [ "$STATUS" == "scanning-phase1" ] ||
1142 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
1144 local SPEED=$($SHOW_LAYOUT |
1145 awk '/^average_speed_phase1/ { print $2 }')
1147 # There may be time error, normally it should be less than 2 seconds.
1148 # We allow another 20% schedule error.
# NOTE(review): the text above says 20% but the factor applied below is
# 13 / 10 -- confirm which margin is intended.
# Shell integer arithmetic: the bound is truncated at each division.
1150 # MAX_MARGIN = 1.3 = 13 / 10
1151 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1152 RUN_TIME1 * 13 / 10))
1153 [ $SPEED -lt $MAX_SPEED ] || {
1155 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1156 error "(4) Speed $SPEED, expected < $MAX_SPEED"
1159 # adjust speed limit
1160 local BASE_SPEED2=300
1162 do_facet $SINGLEMDS \
1163 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
# Re-sample; the expected average is now a time-weighted mix of both limits.
1166 SPEED=$($SHOW_LAYOUT | awk '/^average_speed_phase1/ { print $2 }')
1167 # MIN_MARGIN = 0.7 = 7 / 10
1168 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1169 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1170 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
# On a non-ldiskfs MDT the lower-bound miss is reported via error_ignore
# (tagged LU-5624); the call that emits message (5.2) is not visible here.
1171 [ $SPEED -gt $MIN_SPEED ] || {
1172 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1173 error_ignore LU-5624 \
1174 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
1177 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
1181 # MAX_MARGIN = 1.3 = 13 / 10
1182 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1183 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1184 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1185 [ $SPEED -lt $MAX_SPEED ] || {
1187 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1188 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1189 error "(6) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit to 0 on every MDT and OST node (presumably "no
# limit" -- confirm) and wait up to 30s for the layout scan to complete.
1192 do_nodes $(comma_list $(mdts_nodes)) \
1193 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1194 do_nodes $(comma_list $(osts_nodes)) \
1195 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1197 wait_update_facet $SINGLEMDS \
1198 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout |
1199 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
1200 error "(7) Failed to get expected 'completed'"
1202 run_test 9a "LFSCK speed control (1)"
# Body of the test registered as 9b by the run_test line that closes this
# block. It prepares files while fail_loc 0x1604 is set, runs a namespace
# LFSCK under fail_loc 0x160c until it reports 'stopped', then restarts it
# with a speed limit and checks average_speed_phase2 against computed bounds.
# NOTE(review): RUN_TIME1, RUN_TIME2 and TIME_DIFF are read below but are not
# assigned on any visible line; loop/brace terminators are not visible
# either, and the loop index i is not declared local on a visible line.
# An empty grep result means /proc/cpuinfo has no "processor : 1" entry.
1205 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1206 skip "Testing on UP system, the speed may be inaccurate."
1212 echo "Preparing another 50 * 50 files (with error) at $(date)."
1213 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1214 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
# 50 directories and 50 files at the top level, then 50 files per directory.
1215 createmany -d $DIR/$tdir/d 50
1216 createmany -m $DIR/$tdir/f 50
1217 for ((i = 0; i < 50; i++)); do
1218 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
1221 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
1222 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
# With 0x160c set, the scan is expected to reach 'stopped' within 10s.
1223 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
1224 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1225 mdd.${MDT_DEV}.lfsck_namespace |
1226 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1228 error "(5) unexpected status"
1231 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1232 echo "Prepared at $(date)."
# First speed limit; the restart below omits -r and is expected to be in
# phase 2 when sampled.
1234 local BASE_SPEED1=50
1236 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
1239 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1240 [ "$STATUS" == "scanning-phase2" ] ||
1241 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
1243 local SPEED=$($SHOW_NAMESPACE |
1244 awk '/^average_speed_phase2/ { print $2 }')
1245 # There may be time error, normally it should be less than 2 seconds.
1246 # We allow another 20% schedule error.
# NOTE(review): the text above says 20% but the factor applied below is
# 13 / 10 -- confirm which margin is intended.
1248 # MAX_MARGIN = 1.3 = 13 / 10
1249 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1250 RUN_TIME1 * 13 / 10))
1251 [ $SPEED -lt $MAX_SPEED ] || {
1253 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1254 error "(8) Speed $SPEED, expected < $MAX_SPEED"
1257 # adjust speed limit
1258 local BASE_SPEED2=150
1260 do_facet $SINGLEMDS \
1261 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
# Re-sample; the expected average is now a time-weighted mix of both limits.
1264 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
1265 # MIN_MARGIN = 0.7 = 7 / 10
1266 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1267 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1268 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
# On a non-ldiskfs MDT the lower-bound miss is reported via error_ignore
# (tagged LU-5624); the call that emits message (9.2) is not visible here.
1269 [ $SPEED -gt $MIN_SPEED ] || {
1270 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1271 error_ignore LU-5624 \
1272 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1275 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
1279 # MAX_MARGIN = 1.3 = 13 / 10
1280 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1281 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1282 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1283 [ $SPEED -lt $MAX_SPEED ] || {
1285 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1286 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1287 error "(10) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit to 0 on every MDT and OST node (presumably "no
# limit" -- confirm) and wait up to 32s for the namespace scan to complete.
1290 do_nodes $(comma_list $(mdts_nodes)) \
1291 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1292 do_nodes $(comma_list $(osts_nodes)) \
1293 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1294 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1295 mdd.${MDT_DEV}.lfsck_namespace |
1296 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1298 error "(11) unexpected status"
1301 run_test 9b "LFSCK speed control (2)"
# Body of the test registered as 10 by the run_test line that closes this
# block. It creates 1000 directory/file pairs under two different fail_loc
# settings, starts a rate-limited namespace LFSCK, and performs ordinary
# client operations (ls, touch, mkdir, unlink, rm, mv, ln, ln -s) while the
# scan is still in phase 1.
# NOTE(review): the loop terminators and some lines between the visible ones
# are missing from this listing; i is not declared local on a visible line.
# Only runs when the MDT backend is ldiskfs.
1305 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
1306 skip "lookup(..)/linkea on ZFS issue" && return
1310 echo "Preparing more files with error at $(date)."
1311 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1312 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
# Even indices 0,2,...,998 are created while fail_loc is 0x1603.
1314 for ((i = 0; i < 1000; i = $((i+2)))); do
1315 mkdir -p $DIR/$tdir/d${i}
1316 touch $DIR/$tdir/f${i}
1317 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1320 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1321 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
# Odd indices 1,3,...,999 are created while fail_loc is 0x1604.
1323 for ((i = 1; i < 1000; i = $((i+2)))); do
1324 mkdir -p $DIR/$tdir/d${i}
1325 touch $DIR/$tdir/f${i}
1326 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1329 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1330 echo "Prepared at $(date)."
# Extra hard link created after fail_loc has been cleared.
1332 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
# Remount the client before starting the scan.
1334 umount_client $MOUNT
1335 mount_client $MOUNT || error "(3) Fail to start client!"
# -s 100 throttles the scan; it is checked to be in phase 1 both before and
# after the client operations below.
1337 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1340 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1341 [ "$STATUS" == "scanning-phase1" ] ||
1342 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Each operation must succeed while LFSCK is running.
1344 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1346 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1348 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1350 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1352 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1354 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1356 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1358 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1359 error "(14) Fail to softlink!"
1361 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1362 [ "$STATUS" == "scanning-phase1" ] ||
1363 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
# Set lfsck_speed_limit to 0 on every MDT and OST node (presumably "no
# limit" -- confirm) and wait up to 32s for 'completed'.
1365 do_nodes $(comma_list $(mdts_nodes)) \
1366 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1367 do_nodes $(comma_list $(osts_nodes)) \
1368 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1369 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1370 mdd.${MDT_DEV}.lfsck_namespace |
1371 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1373 error "(16) unexpected status"
1376 run_test 10 "System is available during LFSCK scanning"
# Remove the LAST_ID file (and the d0/0 entry) of one object sequence
# directory from a locally mounted OST backend.
#
# Arguments: $1 - OST number; the facet used is "ost$1"
#            $2 - name of the directory under <mountpoint>/O/ to clean
# Outputs:   a progress line, plus whatever the helpers print
# Returns:   1 if the local mount fails, 2 if the unmount fails, 0 otherwise.
#            The exit status of the remote rm is deliberately not checked
#            (rm -f already tolerates missing files).
ost_remove_lastid() {
	local ost=$1
	local idx=$2
	local facet="ost${ost}"
	local mntpt

	echo "remove LAST_ID on ost${ost}: idx=${idx}"

	# step 1: local mount
	mount_fstype "${facet}" || return 1

	# step 2: remove the specified LAST_ID
	# The mount point is looked up only after the mount has succeeded.
	mntpt=$(facet_mntpt "${facet}")
	do_facet "${facet}" rm -fv \
		"${mntpt}/O/${idx}/LAST_ID" "${mntpt}/O/${idx}/d0/0"

	# step 3: release the local mount again
	unmount_fstype "${facet}" || return 2
}
# Body of the test registered as 11a by the run_test line that closes this
# block. It removes LAST_ID from ost1 with ost_remove_lastid, restarts ost1,
# runs a layout LFSCK on the OST and expects the flags to show
# 'crashed_lastid' first and to be empty once the scan has completed.
# NOTE(review): lines between the file creation and ost_remove_lastid are
# not visible in this listing; ost1 is presumably stopped there, since it is
# started again below -- confirm against the full file.
1395 check_mount_and_prep
1396 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1397 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
# Arguments are OST number 1 and sequence directory 0.
1402 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
1404 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1405 error "(2) Fail to start ost1"
1407 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1408 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1410 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1411 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
# While the fail_loc is armed, poll the flags field for up to 60s.
1413 wait_update_facet ost1 "$LCTL get_param -n \
1414 obdfilter.${OST_DEV}.lfsck_layout |
1415 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1417 error "(5) unexpected status"
# Clear the fail_loc and wait up to 32s for the scan to finish.
1420 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1422 wait_update_facet ost1 "$LCTL get_param -n \
1423 obdfilter.${OST_DEV}.lfsck_layout |
1424 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1426 error "(6) unexpected status"
1429 echo "the LAST_ID(s) should have been rebuilt"
# FLAGS is assigned without 'local' on this line.
1430 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1431 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
1433 run_test 11a "LFSCK can rebuild lost last_id"
# Body of the test registered as 11b by the run_test line that closes this
# block. It records the in-memory last_id for one sequence on ost1
# (lastid1) while fail_loc 0x160d is set, restarts ost1, checks that the
# value read back (lastid2) is smaller, runs a layout LFSCK on the OST, and
# after another restart expects last_id to equal lastid1 again.
# NOTE(review): loop/brace terminators are not visible in this listing, and
# lastid2 and i are not declared local on any visible line.
1436 check_mount_and_prep
1437 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1439 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1440 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1441 do_facet ost1 $LCTL set_param fail_loc=0x160d
# Create 32 more files than the precreated object count reported for (0, 0).
1443 local count=$(precreated_ost_obj_count 0 0)
1445 createmany -o $DIR/$tdir/f $((count + 32))
# seq selects the matching "<seq>:<id>" line of last_id; the awk below
# prints the part after the colon.
# NOTE(review): $seq is passed to grep unquoted here and in the loop below.
1447 local proc_path="${FSNAME}-OST0000-osc-MDT0000"
1448 local seq=$(do_facet mds1 $LCTL get_param -n \
1449 osp.${proc_path}.prealloc_last_seq)
1450 local lastid1=$(do_facet ost1 "lctl get_param -n \
1451 obdfilter.${ost1_svc}.last_id" | grep $seq |
1452 awk -F: '{ print $2 }')
1454 umount_client $MOUNT
1455 stop ost1 || error "(1) Fail to stop ost1"
1457 #define OBD_FAIL_OST_ENOSPC 0x215
1458 do_facet ost1 $LCTL set_param fail_loc=0x215
1460 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1461 error "(2) Fail to start ost1"
# Retry up to 60 times until last_id yields a value for seq.
# NOTE(review): $lastid2 is unquoted in the test below; with an empty value
# it collapses to '[ ! -z ]', which happens to be false, and a multi-word
# value would be a syntax error -- consider [ -n "$lastid2" ].
1463 for ((i = 0; i < 60; i++)); do
1464 lastid2=$(do_facet ost1 "lctl get_param -n \
1465 obdfilter.${ost1_svc}.last_id" | grep $seq |
1466 awk -F: '{ print $2 }')
1467 [ ! -z $lastid2 ] && break;
1471 echo "the on-disk LAST_ID should be smaller than the expected one"
1472 [ $lastid1 -gt $lastid2 ] ||
1473 error "(4) expect lastid1 [ $lastid1 ] > lastid2 [ $lastid2 ]"
1475 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1476 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1478 wait_update_facet ost1 "$LCTL get_param -n \
1479 obdfilter.${OST_DEV}.lfsck_layout |
1480 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1482 error "(6) unexpected status"
# Restart ost1 so that last_id is read again after the scan.
1485 stop ost1 || error "(7) Fail to stop ost1"
1487 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1488 error "(8) Fail to start ost1"
1490 echo "the on-disk LAST_ID should have been rebuilt"
# Poll for up to 60s; on failure dump last_id before raising the error.
1491 wait_update_facet ost1 "$LCTL get_param -n \
1492 obdfilter.${ost1_svc}.last_id | grep $seq |
1493 awk -F: '{ print \\\$2 }'" "$lastid1" 60 || {
1494 do_facet ost1 $LCTL get_param -n \
1495 obdfilter.${ost1_svc}.last_id
1496 error "(9) expect lastid1 $seq:$lastid1"
1499 do_facet ost1 $LCTL set_param fail_loc=0
1500 stopall || error "(10) Fail to stopall"
1502 run_test 11b "LFSCK can rebuild crashed last_id"
# Body of the test registered as 12a by the run_test line that closes this
# block. For the namespace and then the layout component it issues
# lfsck_start -A from mds1, expects every target to reach 'scanning-phase1',
# stops all with lfsck_stop -A, expects 'stopped', then restarts with -s 0
# and expects 'completed'.
# NOTE(review): loop terminators are not visible in this listing, and k is
# not declared local on any visible line.
1505 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
1507 check_mount_and_prep
# One directory per MDT (index k - 1), each holding 100 files.
1508 for k in $(seq $MDSCOUNT); do
1509 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1510 createmany -o $DIR/$tdir/${k}/f 100 ||
1511 error "(0) Fail to create 100 files."
1514 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1515 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1516 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
# The trailing number on the wait_all_targets* calls follows the "(N)" step
# numbering of the error messages; presumably it tags the failure message.
1518 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1519 wait_all_targets namespace scanning-phase1 3
1521 echo "Stop namespace LFSCK on all targets by single lctl command."
1522 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1523 error "(4) Fail to stop LFSCK on all devices!"
1525 echo "All the LFSCK targets should be in 'stopped' status."
1526 wait_all_targets_blocked namespace stopped 5
1528 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1529 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1530 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1532 echo "All the LFSCK targets should be in 'completed' status."
1533 wait_all_targets_blocked namespace completed 7
# The layout half of the test runs with full debug logging enabled.
1535 start_full_debug_logging
1537 echo "Start layout LFSCK on all targets by single command (-s 1)."
1538 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1539 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1541 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1542 wait_all_targets layout scanning-phase1 9
1544 echo "Stop layout LFSCK on all targets by single lctl command."
1545 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1546 error "(10) Fail to stop LFSCK on all devices!"
1548 echo "All the LFSCK targets should be in 'stopped' status."
1549 wait_all_targets_blocked layout stopped 11
# Additionally read the layout status on each OST directly.
1551 for k in $(seq $OSTCOUNT); do
1552 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1553 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1554 awk '/^status/ { print $2 }')
1555 [ "$STATUS" == "stopped" ] ||
1556 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1559 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1560 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1561 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1563 echo "All the LFSCK targets should be in 'completed' status."
1564 wait_all_targets_blocked layout completed 14
1566 stop_full_debug_logging
1568 run_test 12a "single command to trigger LFSCK on all devices"
# Body of the test registered as 12b by the run_test line that closes this
# block. It starts LFSCK from mds1 with -A but without -M and expects both
# components to complete; when mds1 lists more than one mdt device it also
# expects a start without -M/-A to fail.
# NOTE(review): the 'fi' and any other lines inside the if-body that are
# missing from this listing cannot be seen here -- confirm against the file.
1571 check_mount_and_prep
1573 echo "Start LFSCK without '-M' specified."
1574 do_facet mds1 $LCTL lfsck_start -A -r ||
1575 error "(0) Fail to start LFSCK without '-M'"
1577 wait_all_targets_blocked namespace completed 1
1578 wait_all_targets_blocked layout completed 2
# Count the 'lctl dl' rows whose third column contains "mdt".
# NOTE(review): 'grep mdt | wc -l' could be 'grep -c mdt'.
1580 local count=$(do_facet mds1 $LCTL dl |
1581 awk '{ print $3 }' | grep mdt | wc -l)
1582 if [ $count -gt 1 ]; then
1584 echo "Start layout LFSCK on the node with multipe targets,"
1585 echo "but not specify '-M'/'-A' option. Should get failure."
# Inverted check: success of lfsck_start is the error case. The trailing
# '|| true' turns the expected failure status into success.
# NOTE(review): in 'A && B || true' the '|| true' would also mask a non-zero
# return from error, should error ever return instead of exiting.
1587 do_facet mds1 $LCTL lfsck_start -t layout -r &&
1588 error "(3) Start layout LFSCK should fail" || true
1591 run_test 12b "auto detect Lustre device"
# Body of the test registered as 13 by the run_test line that closes this
# block. It creates two files while fail_loc 0x160f is set, runs a layout
# LFSCK and expects the repaired_others counter to be exactly 2.
# NOTE(review): the function header, brace terminators and the remaining
# lines of the '|| {' failure branch are not visible in this listing.
1595 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1596 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1597 echo "MDT-object FID."
1600 check_mount_and_prep
1602 echo "Inject failure stub to simulate bad lmm_oi"
1603 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1604 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
# Two files are created under the fail_loc: f0 by createmany (one file) and
# the two-component f1 by 'lfs setstripe -E'.
1605 createmany -o $DIR/$tdir/f 1
1606 $LFS setstripe -E 1M -S 1M -E -1 $DIR/$tdir/f1 ||
1607 error "(0) Fail to create PFL $DIR/$tdir/f1"
1608 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1610 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1611 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
# Poll the layout status for up to 32s.
1613 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1614 mdd.${MDT_DEV}.lfsck_layout |
1615 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1617 error "(2) unexpected status"
# One repair is expected per file created above.
1620 local repaired=$($SHOW_LAYOUT |
1621 awk '/^repaired_others/ { print $2 }')
1622 [ $repaired -eq 2 ] ||
1623 error "(3) Fail to repair crashed lmm_oi: $repaired"
1625 run_test 13 "LFSCK can repair crashed lmm_oi"
# Body of the test registered as 14a by the run_test line that closes this
# block. It creates files while fail_loc 0x1610 is set on ost1, runs a
# layout LFSCK once without -c and expects at least 32 repaired_dangling
# entries while stat on the guard files still fails, then runs it again
# with -c and expects the guard files to become stat-able with size 0.
# NOTE(review): the function header, loop/brace terminators and the lines
# between "stopall to cleanup object cache" and setupall are not visible in
# this listing; i is not declared local on any visible line.
1629 echo "The OST-object referenced by the MDT-object should be there;"
1630 echo "otherwise, the LFSCK should re-create the missing OST-object."
1631 echo "without '--delay-create-ostobj' option."
1634 check_mount_and_prep
1635 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1637 echo "Inject failure stub to simulate dangling referenced MDT-object"
1638 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1639 do_facet ost1 $LCTL set_param fail_loc=0x1610
# Create 16 more files than the precreated object count reported for (0, 0),
# then guard0, 16 composite files and guard1, all under the fail_loc.
1640 local count=$(precreated_ost_obj_count 0 0)
1642 createmany -o $DIR/$tdir/f $((count + 16)) ||
1643 error "(0.1) Fail to create $DIR/$tdir/fx"
1644 touch $DIR/$tdir/guard0
1646 for ((i = 0; i < 16; i++)); do
1647 $LFS setstripe -E 512K -S 256K -o 0 -E 2M \
1648 $DIR/$tdir/f_comp${i} ||
1649 error "(0.2) Fail to create $DIR/$tdir/f_comp${i}"
1651 touch $DIR/$tdir/guard1
1653 do_facet ost1 $LCTL set_param fail_loc=0
1655 start_full_debug_logging
1657 # exhaust other pre-created dangling cases
1658 count=$(precreated_ost_obj_count 0 0)
1659 createmany -o $DIR/$tdir/a $count ||
1660 error "(0.5) Fail to create $count files."
# Inverted checks: success of ls/stat is the failure case here.
1662 echo "'ls' should fail because of dangling referenced MDT-object"
1663 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
1665 echo "Trigger layout LFSCK to find out dangling reference"
1666 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1668 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1669 mdd.${MDT_DEV}.lfsck_layout |
1670 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1672 error "(3) unexpected status"
1675 local repaired=$($SHOW_LAYOUT |
1676 awk '/^repaired_dangling/ { print $2 }')
1677 [ $repaired -ge 32 ] ||
1678 error "(4) Fail to repair dangling reference: $repaired"
1680 echo "'stat' should fail because of not repair dangling by default"
1681 stat $DIR/$tdir/guard0 > /dev/null 2>&1 &&
1682 error "(5.1) stat should fail"
1683 stat $DIR/$tdir/guard1 > /dev/null 2>&1 &&
1684 error "(5.2) stat should fail"
# Second pass adds -c to the lfsck_start options.
1686 echo "Trigger layout LFSCK to repair dangling reference"
1687 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1689 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1690 mdd.${MDT_DEV}.lfsck_layout |
1691 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1693 error "(7) unexpected status"
1696 # There may be some async LFSCK updates in processing, wait for
1697 # a while until the target reparation has been done. LU-4970.
# Poll stat for up to 32s until the Size field reads 0; on timeout show the
# stat output before raising the error.
1699 echo "'stat' should success after layout LFSCK repairing"
1700 wait_update_facet client "stat $DIR/$tdir/guard0 |
1701 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1702 stat $DIR/$tdir/guard0
1704 error "(8.1) unexpected size"
1707 wait_update_facet client "stat $DIR/$tdir/guard1 |
1708 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1709 stat $DIR/$tdir/guard1
1711 error "(8.2) unexpected size"
1714 repaired=$($SHOW_LAYOUT |
1715 awk '/^repaired_dangling/ { print $2 }')
1716 [ $repaired -ge 32 ] ||
1717 error "(9) Fail to repair dangling reference: $repaired"
1719 stop_full_debug_logging
1721 echo "stopall to cleanup object cache"
1724 setupall > /dev/null
1726 run_test 14a "LFSCK can repair MDT-object with dangling LOV EA reference (1)"
# Body of the test registered as 14b by the run_test line that closes this
# block. Same scenario as the preceding dangling-reference test, but both
# LFSCK runs add -o -d to the start options and completion is awaited with
# wait_all_targets_blocked.
# NOTE(review): the function header, brace terminators and the lines
# between "stopall to cleanup object cache" and setupall are not visible in
# this listing.
1730 echo "The OST-object referenced by the MDT-object should be there;"
1731 echo "otherwise, the LFSCK should re-create the missing OST-object."
1732 echo "with '--delay-create-ostobj' option."
1735 check_mount_and_prep
1736 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1738 echo "Inject failure stub to simulate dangling referenced MDT-object"
1739 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1740 do_facet ost1 $LCTL set_param fail_loc=0x1610
# Create 31 more files than the precreated object count reported for (0, 0),
# plus the guard file, all under the fail_loc.
# NOTE(review): unlike the sibling test, this createmany has no '|| error'.
1741 local count=$(precreated_ost_obj_count 0 0)
1743 createmany -o $DIR/$tdir/f $((count + 31))
1744 touch $DIR/$tdir/guard
1745 do_facet ost1 $LCTL set_param fail_loc=0
1747 start_full_debug_logging
1749 # exhaust other pre-created dangling cases
1750 count=$(precreated_ost_obj_count 0 0)
1751 createmany -o $DIR/$tdir/a $count ||
1752 error "(0) Fail to create $count files."
# Inverted checks: success of ls/stat is the failure case here.
1754 echo "'ls' should fail because of dangling referenced MDT-object"
1755 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
1757 echo "Trigger layout LFSCK to find out dangling reference"
1758 $START_LAYOUT -r -o -d || error "(2) Fail to start LFSCK for layout!"
1760 wait_all_targets_blocked layout completed 3
1762 local repaired=$($SHOW_LAYOUT |
1763 awk '/^repaired_dangling/ { print $2 }')
1764 [ $repaired -ge 32 ] ||
1765 error "(4) Fail to repair dangling reference: $repaired"
1767 echo "'stat' should fail because of not repair dangling by default"
1768 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
# Second pass adds -c to the lfsck_start options.
1770 echo "Trigger layout LFSCK to repair dangling reference"
1771 $START_LAYOUT -r -o -c -d || error "(6) Fail to start LFSCK for layout!"
1773 wait_all_targets_blocked layout completed 7
1775 # There may be some async LFSCK updates in processing, wait for
1776 # a while until the target reparation has been done. LU-4970.
# Poll stat for up to 32s until the Size field reads 0; on timeout show the
# stat output before raising the error.
1778 echo "'stat' should success after layout LFSCK repairing"
1779 wait_update_facet client "stat $DIR/$tdir/guard |
1780 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1781 stat $DIR/$tdir/guard
1783 error "(8) unexpected size"
1786 repaired=$($SHOW_LAYOUT |
1787 awk '/^repaired_dangling/ { print $2 }')
1788 [ $repaired -ge 32 ] ||
1789 error "(9) Fail to repair dangling reference: $repaired"
1791 stop_full_debug_logging
1793 echo "stopall to cleanup object cache"
1796 setupall > /dev/null
1798 run_test 14b "LFSCK can repair MDT-object with dangling LOV EA reference (2)"
# Body of the test registered as 15a by the run_test line that closes this
# block. It writes files while fail_loc 0x1611 is set on all OST nodes, runs
# a layout LFSCK and expects repaired_unmatched_pair to be at least 3.
# NOTE(review): the function header and brace terminators are not visible
# in this listing, and the line that the 'lfs setstripe ... \' continuation
# runs into (presumably the target path followed by '||') is missing too.
1802 echo "If the OST-object referenced by the MDT-object back points"
1803 echo "to some non-exist MDT-object, then the LFSCK should repair"
1804 echo "the OST-object to back point to the right MDT-object."
1807 check_mount_and_prep
1808 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1810 echo "Inject failure stub to make the OST-object to back point to"
1811 echo "non-exist MDT-object."
1812 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
1814 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
1815 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1816 $LFS setstripe -E 1M -S 256K -c 1 -E -1 -S 512K -c $OSTCOUNT \
1818 error "(0) Fail to create PFL $DIR/$tdir/f1"
1819 # 'dd' will trigger punch RPC firstly on every OST-objects.
1820 # So even though some OST-object will not be write by 'dd',
1821 # as long as it is allocated (may be NOT allocated in pfl_3b)
1822 # its layout information will be set also.
# 257 blocks of 4K is one block past the 1M boundary of the first component.
1823 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4K count=257
1824 cancel_lru_locks osc
1825 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1827 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1828 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
# Poll the layout status for up to 32s.
1830 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1831 mdd.${MDT_DEV}.lfsck_layout |
1832 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1834 error "(2) unexpected status"
1837 local repaired=$($SHOW_LAYOUT |
1838 awk '/^repaired_unmatched_pair/ { print $2 }')
1839 [ $repaired -ge 3 ] ||
1840 error "(3) Fail to repair unmatched pair: $repaired"
1842 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# Body of the test registered as 15b by the run_test line that closes this
# block. It writes a guard file normally, then writes f0 and a composite f1
# while fail_loc 0x1612 is set on all OST nodes, runs a layout LFSCK and
# expects repaired_unmatched_pair to be exactly 4.
# NOTE(review): the function header and brace terminators are not visible
# in this listing; the initial assignment of 'stripes' and the line that the
# 'lfs setstripe ... \' continuation runs into are missing as well.
1846 echo "If the OST-object referenced by the MDT-object back points"
1847 echo "to other MDT-object that doesn't recognize the OST-object,"
1848 echo "then the LFSCK should repair it to back point to the right"
1849 echo "MDT-object (the first one)."
1852 check_mount_and_prep
1853 mkdir -p $DIR/$tdir/0
1854 $LFS setstripe -c 1 -i 0 $DIR/$tdir/0
1855 dd if=/dev/zero of=$DIR/$tdir/0/guard bs=1M count=1
1856 cancel_lru_locks osc
1858 echo "Inject failure stub to make the OST-object to back point to"
1859 echo "other MDT-object"
# Use two stripes for the first component when at least two OSTs exist.
1862 [ $OSTCOUNT -ge 2 ] && stripes=2
1864 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1865 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1612
1866 dd if=/dev/zero of=$DIR/$tdir/0/f0 bs=1M count=1
1867 $LFS setstripe -E 1M -S 256K -c $stripes -E 2M -S 512K -c 1 \
1869 error "(0) Fail to create PFL $DIR/$tdir/f1"
1870 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2
1871 cancel_lru_locks osc
1872 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1874 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1875 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
# Poll the layout status for up to 32s.
1877 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1878 mdd.${MDT_DEV}.lfsck_layout |
1879 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1881 error "(2) unexpected status"
# NOTE(review): the expected count is fixed at 4 although the first
# component uses $stripes stripes -- confirm it holds when OSTCOUNT is 1.
1884 local repaired=$($SHOW_LAYOUT |
1885 awk '/^repaired_unmatched_pair/ { print $2 }')
1886 [ $repaired -eq 4 ] ||
1887 error "(3) Fail to repair unmatched pair: $repaired"
1889 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# Body of the test registered as 15c by the run_test line that closes this
# block. It starts a delayed 'lfs migrate' of a directory in the background,
# runs a layout LFSCK on all targets at the same time, and checks the
# repaired_unmatched_pair and repaired_multiple_referenced counters.
# NOTE(review): the function header and any lines between the visible ones
# (for example after the background migrate) are missing from this listing.
# Needs two MDTs and is skipped on MDS versions >= 2.7.55 (see LU-6437).
1892 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
1894 [ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.55) ] &&
1895 skip "Skip the test after 2.7.55 see LU-6437" && return
1898 echo "According to current metadata migration implementation,"
1899 echo "before the old MDT-object is removed, both the new MDT-object"
1900 echo "and old MDT-object will reference the same LOV layout. Then if"
1901 echo "the layout LFSCK finds the new MDT-object by race, it will"
1902 echo "regard related OST-object(s) as multiple referenced case, and"
1903 echo "will try to create new OST-object(s) for the new MDT-object."
1904 echo "To avoid such trouble, the layout LFSCK needs to lock the old"
1905 echo "MDT-object before confirm the multiple referenced case."
1908 check_mount_and_prep
# Directory a1 is created with -i 1 and holds a single 1M file.
1909 $LFS mkdir -i 1 $DIR/$tdir/a1
1910 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1911 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=1
1912 cancel_lru_locks osc
1914 echo "Inject failure stub on MDT1 to delay the migration"
1916 #define OBD_FAIL_MIGRATE_DELAY 0x1803
1917 do_facet mds2 $LCTL set_param fail_val=5 fail_loc=0x1803
1918 echo "Migrate $DIR/$tdir/a1 from MDT1 to MDT0 with delay"
# Runs in the background so that the LFSCK below overlaps with it.
# NOTE(review): no 'wait' for this job is visible in this listing.
1919 $LFS migrate -m 0 $DIR/$tdir/a1 &
1922 echo "Trigger layout LFSCK to race with the migration"
1923 $START_LAYOUT -A -r || error "(1) Fail to start layout LFSCK!"
1925 wait_all_targets_blocked layout completed 2
1927 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
# Exactly one unmatched pair and no multiple-reference repair are expected.
1928 local repaired=$($SHOW_LAYOUT |
1929 awk '/^repaired_unmatched_pair/ { print $2 }')
1930 [ $repaired -eq 1 ] ||
1931 error "(3) Fail to repair unmatched pair: $repaired"
1933 repaired=$($SHOW_LAYOUT |
1934 awk '/^repaired_multiple_referenced/ { print $2 }')
1935 [ $repaired -eq 0 ] ||
1936 error "(4) Unexpectedly repaird multiple references: $repaired"
1938 run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)"
# Body of the test registered as 16 by the run_test line that closes this
# block. It changes the owner of a file while fail_loc 0x1613 is set on the
# MDS, runs a layout LFSCK and expects repaired_inconsistent_owner to be 1.
# NOTE(review): the function header, brace terminators and the lines just
# before the LFSCK start are not visible in this listing.
1942 echo "If the OST-object's owner information does not match the owner"
1943 echo "information stored in the MDT-object, then the LFSCK trust the"
1944 echo "MDT-object and update the OST-object's owner information."
1947 check_mount_and_prep
1948 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1949 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1950 cancel_lru_locks osc
1952 echo "Inject failure stub to skip OST-object owner changing"
1953 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
1954 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
# NOTE(review): '1.1' uses the legacy '.' owner/group separator; POSIX
# specifies ':' ('chown 1:1') -- consider switching.
1955 chown 1.1 $DIR/$tdir/f0
1956 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1958 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
1961 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
# Poll the layout status for up to 32s.
1963 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1964 mdd.${MDT_DEV}.lfsck_layout |
1965 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1967 error "(2) unexpected status"
# Only f0 had its owner changed, so exactly one repair is expected.
1970 local repaired=$($SHOW_LAYOUT |
1971 awk '/^repaired_inconsistent_owner/ { print $2 }')
1972 [ $repaired -eq 1 ] ||
1973 error "(3) Fail to repair inconsistent owner: $repaired"
1975 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# test_17 (registered by the run_test 17 line at the end): make several
# MDT-objects reference one OST-object, run layout LFSCK, and verify that it
# reports two repairs and that the files no longer share data with "guard".
# NOTE(review): every line here carries a listing line-number column, and the
# gaps in that column (plus the dangling `\` and `|| {` below) show that short
# lines -- function header, closers, some operands -- are absent from this
# fragment, so it is not runnable as shown.
1979 echo "If more than one MDT-objects reference the same OST-object,"
1980 echo "and the OST-object only recognizes one MDT-object, then the"
1981 echo "LFSCK should create new OST-objects for such non-recognized"
# Prepare the test directory with a one-stripe layout starting at OST index 0.
1985 check_mount_and_prep
1986 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1988 echo "Inject failure stub to make two MDT-objects to refernce"
1989 echo "the OST-object"
# Arm fail_loc 0x1614 on $SINGLEMDS, then create "guard" (1 MiB of zeros),
# one createmany file with name prefix "f", and a two-component PFL file.
# cancel_lru_locks is called for mdc and osc after each creation step.
1991 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
1992 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
1993 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1994 cancel_lru_locks mdc
1995 cancel_lru_locks osc
1997 createmany -o $DIR/$tdir/f 1
1998 cancel_lru_locks mdc
1999 cancel_lru_locks osc
# NOTE(review): the target path and the `||` that should sit between the
# continued setstripe line and the error call are not visible in this listing;
# the error text suggests the target is $DIR/$tdir/f1.
2001 $LFS setstripe -E 2M -S 256K -o 0 -E 4M -S 512K -o 0 \
2003 error "(0) Fail to create PFL $DIR/$tdir/f1"
2004 cancel_lru_locks mdc
2005 cancel_lru_locks osc
# Disarm the fault injection on the MDS.
2006 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
2008 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
2009 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard use the same OST-objects"
# While the reference is shared, f0 and f1 must both report the 1048576-byte
# size that was written to guard (field 5 of `ls -l` is the size).
2010 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
2011 [ $size -eq 1048576 ] ||
2012 error "(1.1) f0 (wrong) size should be 1048576, but got $size"
2014 size=$(ls -l $DIR/$tdir/f1 | awk '{ print $5 }')
2015 [ $size -eq 1048576 ] ||
2016 error "(1.2) f1 (wrong) size should be 1048576, but got $size"
2018 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
# Start layout LFSCK with -r and poll the MDT's lfsck_layout status until it
# reads "completed". The trailing 32 is presumably the wait limit in seconds
# -- TODO confirm against wait_update_facet.
2021 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
2023 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
2024 mdd.${MDT_DEV}.lfsck_layout |
2025 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
2027 error "(3) unexpected status"
# Exactly two multiple-reference repairs are expected (presumably one each
# for f0 and f1). $SHOW_LAYOUT is defined outside this chunk.
2030 local repaired=$($SHOW_LAYOUT |
2031 awk '/^repaired_multiple_referenced/ { print $2 }')
2032 [ $repaired -eq 2 ] ||
2033 error "(4) Fail to repair multiple references: $repaired"
# After the repair, overwriting f0 (and then f1) with 2 MiB must leave guard
# at 1048576 bytes, i.e. the writes no longer land in guard's OST-object.
2035 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
2036 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
2037 error "(5) Fail to write f0."
2038 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
2039 [ $size -eq 1048576 ] ||
2040 error "(6) guard size should be 1048576, but got $size"
2042 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard should use diff OST-objects"
2043 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2 ||
2044 error "(7) Fail to write f1."
2045 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
2046 [ $size -eq 1048576 ] ||
2047 error "(8) guard size should be 1048576, but got $size"
# Register the test with the framework under id 17.
2049 run_test 17 "LFSCK can repair multiple references"
# Add the "cache" flag to the local debug mask before the test_18* group; the
# matching debug=-cache follows run_test 18h further down.
2051 $LCTL set_param debug=+cache > /dev/null
# test_18a (registered by run_test 18a at the end): make MDT-objects lose
# their layout EA, confirm the visible file sizes become wrong, run layout
# LFSCK on all devices, and confirm the sizes are restored.
# NOTE(review): lines carry a listing line-number column, and short lines
# (function header, fi/done/closing brace, possibly other statements) are
# absent from this fragment, so it is not runnable as shown.
2055 echo "The target MDT-object is there, but related stripe information"
2056 echo "is lost or partly lost. The LFSCK should regenerate the missing"
2057 echo "layout EA entries."
# File 1: directory a1 on MDT index 0, one stripe on OST index 0, 2 MiB data.
2060 check_mount_and_prep
2061 $LFS mkdir -i 0 $DIR/$tdir/a1
2062 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2063 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# With `ls -il` the inode number is field 1, so the size is field 6.
2065 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
# FID and stripe layout are printed for the test log only.
2067 $LFS path2fid $DIR/$tdir/a1/f1
2068 $LFS getstripe $DIR/$tdir/a1/f1
# File 2 (only with two or more MDTs): directory a2 on MDT index 1, two
# stripes of 1M starting at OST index 1, 2 MiB data.
2070 if [ $MDSCOUNT -ge 2 ]; then
2071 $LFS mkdir -i 1 $DIR/$tdir/a2
2072 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2073 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2074 $LFS path2fid $DIR/$tdir/a2/f2
2075 $LFS getstripe $DIR/$tdir/a2/f2
# File 3: a two-component PFL file (first component ends at 1M on OST 0,
# second runs to EOF), also filled with 2 MiB.
2078 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2079 error "(0) Fail to create PFL $DIR/$tdir/f3"
2081 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2083 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2085 $LFS path2fid $DIR/$tdir/f3
2086 $LFS getstripe $DIR/$tdir/f3
2088 cancel_lru_locks osc
2090 echo "Inject failure, to make the MDT-object lost its layout EA"
# With fail_loc 0x1615 armed on an MDS, the chown on each file is the
# operation used to trigger the injected loss.
2091 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
2092 do_facet mds1 $LCTL set_param fail_loc=0x1615
2093 chown 1.1 $DIR/$tdir/a1/f1
2095 if [ $MDSCOUNT -ge 2 ]; then
2096 do_facet mds2 $LCTL set_param fail_loc=0x1615
2097 chown 1.1 $DIR/$tdir/a2/f2
2100 chown 1.1 $DIR/$tdir/f3
# Disarm the fault injection on every MDS that was armed.
2105 do_facet mds1 $LCTL set_param fail_loc=0
2106 if [ $MDSCOUNT -ge 2 ]; then
2107 do_facet mds2 $LCTL set_param fail_loc=0
2110 cancel_lru_locks mdc
2111 cancel_lru_locks osc
2113 echo "The file size should be incorrect since layout EA is lost"
2114 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2115 [ "$cur_size" != "$saved_size1" ] ||
2116 error "(1) Expect incorrect file1 size"
# f2 was written with the same dd parameters as f1, so saved_size1 is reused
# as its reference size.
2118 if [ $MDSCOUNT -ge 2 ]; then
2119 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2120 [ "$cur_size" != "$saved_size1" ] ||
2121 error "(2) Expect incorrect file2 size"
2124 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2125 [ "$cur_size" != "$saved_size2" ] ||
2126 error "(1.2) Expect incorrect file3 size"
2128 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2129 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
# Poll every MDT until its lfsck_layout status reads "completed"; $LTIME is
# defined outside this chunk.
2131 for k in $(seq $MDSCOUNT); do
2132 # The LFSCK status query internal is 30 seconds. For the case
2133 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2134 # time to guarantee the status sync up.
2135 wait_update_facet mds${k} "$LCTL get_param -n \
2136 mdd.$(facet_svc mds${k}).lfsck_layout |
2137 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2138 error "(4) MDS${k} is not the expected 'completed'"
# The OSTs are checked once, without polling.
2141 for k in $(seq $OSTCOUNT); do
2142 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2143 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2144 awk '/^status/ { print $2 }')
2145 [ "$cur_status" == "completed" ] ||
2146 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan counters: 3 on mds1 and, when present, 2 on mds2.
2149 local repaired=$(do_facet mds1 $LCTL get_param -n \
2150 mdd.$(facet_svc mds1).lfsck_layout |
2151 awk '/^repaired_orphan/ { print $2 }')
2152 [ $repaired -eq 3 ] ||
2153 error "(6.1) Expect 3 fixed on mds1, but got: $repaired"
2155 if [ $MDSCOUNT -ge 2 ]; then
2156 repaired=$(do_facet mds2 $LCTL get_param -n \
2157 mdd.$(facet_svc mds2).lfsck_layout |
2158 awk '/^repaired_orphan/ { print $2 }')
2159 [ $repaired -eq 2 ] ||
2160 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
# Print FID and layout again so the log shows the post-repair state.
2163 $LFS path2fid $DIR/$tdir/a1/f1
2164 $LFS getstripe $DIR/$tdir/a1/f1
2166 if [ $MDSCOUNT -ge 2 ]; then
2167 $LFS path2fid $DIR/$tdir/a2/f2
2168 $LFS getstripe $DIR/$tdir/a2/f2
2171 $LFS path2fid $DIR/$tdir/f3
2172 $LFS getstripe $DIR/$tdir/f3
2174 echo "The file size should be correct after layout LFSCK scanning"
2175 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2176 [ "$cur_size" == "$saved_size1" ] ||
2177 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2179 if [ $MDSCOUNT -ge 2 ]; then
2180 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2181 [ "$cur_size" == "$saved_size1" ] ||
2182 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# NOTE(review): this check is on f3, although the message text says "file1".
2185 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2186 [ "$cur_size" == "$saved_size2" ] ||
2187 error "(9) Expect file1 size $saved_size2, but got $cur_size"
# Register the test with the framework under id 18a.
2189 run_test 18a "Find out orphan OST-object and repair it (1)"
# test_18b (registered by run_test 18b at the end): remove files while
# fail_loc 0x1616 is armed, run layout LFSCK on all devices, then move the
# recovered <fid>-R-0 entries from .lustre/lost+found/MDTxxxx back to their
# original paths and compare sizes.
# NOTE(review): lines carry a listing line-number column, and short lines
# (function header, fi/done/closing brace, possibly other statements) are
# absent from this fragment, so it is not runnable as shown.
2193 echo "The target MDT-object is lost. The LFSCK should re-create the"
2194 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
2195 echo "can move it back to normal namespace manually."
# File 1: a1 on MDT index 0, one stripe on OST index 0, 2 MiB data. Size
# (field 6 of `ls -il`) and FID are remembered for the checks below.
2198 check_mount_and_prep
2199 $LFS mkdir -i 0 $DIR/$tdir/a1
2200 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2201 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2202 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2203 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2205 $LFS getstripe $DIR/$tdir/a1/f1
# File 2 (only with two or more MDTs): a2 on MDT index 1, two 1M stripes.
# NOTE(review): fid2 is assigned without `local`, unlike fid1 and fid3.
2207 if [ $MDSCOUNT -ge 2 ]; then
2208 $LFS mkdir -i 1 $DIR/$tdir/a2
2209 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2210 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2211 fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
2213 $LFS getstripe $DIR/$tdir/a2/f2
# File 3: two-component PFL file filled with 2 MiB.
2216 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2217 error "(0) Fail to create PFL $DIR/$tdir/f3"
2219 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2221 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2222 local fid3=$($LFS path2fid $DIR/$tdir/f3)
2224 $LFS getstripe $DIR/$tdir/f3
2226 cancel_lru_locks osc
2228 echo "Inject failure, to simulate the case of missing the MDT-object"
# With fail_loc 0x1616 armed on an MDS, each file is removed with rm -f.
# NOTE(review): no rm of f3 is visible in this listing, although f3 is later
# moved back from lost+found -- it may be among the omitted lines.
2229 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2230 do_facet mds1 $LCTL set_param fail_loc=0x1616
2231 rm -f $DIR/$tdir/a1/f1
2233 if [ $MDSCOUNT -ge 2 ]; then
2234 do_facet mds2 $LCTL set_param fail_loc=0x1616
2235 rm -f $DIR/$tdir/a2/f2
# Disarm the fault injection on every MDS that was armed.
2243 do_facet mds1 $LCTL set_param fail_loc=0
2244 if [ $MDSCOUNT -ge 2 ]; then
2245 do_facet mds2 $LCTL set_param fail_loc=0
2248 cancel_lru_locks mdc
2249 cancel_lru_locks osc
2251 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2252 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# Poll every MDT until its lfsck_layout status reads "completed"; $LTIME is
# defined outside this chunk.
2254 for k in $(seq $MDSCOUNT); do
2255 # The LFSCK status query internal is 30 seconds. For the case
2256 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2257 # time to guarantee the status sync up.
2258 wait_update_facet mds${k} "$LCTL get_param -n \
2259 mdd.$(facet_svc mds${k}).lfsck_layout |
2260 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2261 error "(2) MDS${k} is not the expected 'completed'"
# The OSTs are checked once, without polling.
2264 for k in $(seq $OSTCOUNT); do
2265 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2266 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2267 awk '/^status/ { print $2 }')
2268 [ "$cur_status" == "completed" ] ||
2269 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan counters: 3 on mds1 and, when present, 2 on mds2.
2272 local repaired=$(do_facet mds1 $LCTL get_param -n \
2273 mdd.$(facet_svc mds1).lfsck_layout |
2274 awk '/^repaired_orphan/ { print $2 }')
2275 [ $repaired -eq 3 ] ||
2276 error "(4.1) Expect 3 fixed on mds1, but got: $repaired"
2278 if [ $MDSCOUNT -ge 2 ]; then
2279 repaired=$(do_facet mds2 $LCTL get_param -n \
2280 mdd.$(facet_svc mds2).lfsck_layout |
2281 awk '/^repaired_orphan/ { print $2 }')
2282 [ $repaired -eq 2 ] ||
2283 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
# Recovered entries are addressed as <original FID>-R-0 under the lost+found
# directory of the MDT that held the file.
2286 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2287 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2288 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2290 if [ $MDSCOUNT -ge 2 ]; then
2291 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
2292 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
# NOTE(review): error tag "(5)" is reused here for the f3 move; it is already
# used for the f1 move above, so the two failures are indistinguishable by tag.
2295 mv $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0 $DIR/$tdir/f3 ||
2296 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
# Print FID and layout of the restored files for the test log.
2298 $LFS path2fid $DIR/$tdir/a1/f1
2299 $LFS getstripe $DIR/$tdir/a1/f1
2301 if [ $MDSCOUNT -ge 2 ]; then
2302 $LFS path2fid $DIR/$tdir/a2/f2
2303 $LFS getstripe $DIR/$tdir/a2/f2
2306 $LFS path2fid $DIR/$tdir/f3
2307 $LFS getstripe $DIR/$tdir/f3
2309 echo "The file size should be correct after layout LFSCK scanning"
2310 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2311 [ "$cur_size" == "$saved_size1" ] ||
2312 error "(7) Expect file1 size $saved_size1, but got $cur_size"
# f2 was written with the same dd parameters as f1, so saved_size1 is reused.
2314 if [ $MDSCOUNT -ge 2 ]; then
2315 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2316 [ "$cur_size" == "$saved_size1" ] ||
2317 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# NOTE(review): this check is on f3, although the message text says "file1".
2320 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2321 [ "$cur_size" == "$saved_size2" ] ||
2322 error "(9) Expect file1 size $saved_size2, but got $cur_size"
# Register the test with the framework under id 18b.
2324 run_test 18b "Find out orphan OST-object and repair it (2)"
# test_18c (registered by run_test 18c at the end): create files while the
# OSTs have fail_loc 0x1617 armed, remove them while fail_loc 0x1616 is armed
# on the MDS, run layout LFSCK on all devices, and look for *-N-* entries
# under .lustre/lost+found/MDT0000 (and none under MDT0001).
# NOTE(review): lines carry a listing line-number column, and short lines
# (function header, fi/done/closing brace, and some statements) are absent
# from this fragment, so it is not runnable as shown.
2328 echo "The target MDT-object is lost, and the OST-object FID is missing."
2329 echo "The LFSCK should re-create the MDT-object with new FID under the "
2330 echo "directory .lustre/lost+found/MDTxxxx."
2333 check_mount_and_prep
2334 $LFS mkdir -i 0 $DIR/$tdir/a1
2335 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2337 echo "Inject failure, to simulate the case of missing parent FID"
# The OST-side fault is armed on all OST nodes before any data is written and
# stays armed until the three files below exist.
2338 #define OBD_FAIL_LFSCK_NOPFID 0x1617
2339 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1617
2341 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2342 $LFS getstripe $DIR/$tdir/a1/f1
# Second file only with two or more MDTs: directory on MDT index 1, but the
# single stripe is still placed on OST index 0.
2344 if [ $MDSCOUNT -ge 2 ]; then
2345 $LFS mkdir -i 1 $DIR/$tdir/a2
2346 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a2
2347 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2348 $LFS getstripe $DIR/$tdir/a2/f2
# Third file: two-component PFL file filled with 2 MiB.
2351 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2352 error "(0) Fail to create PFL $DIR/$tdir/f3"
2354 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2355 $LFS getstripe $DIR/$tdir/f3
2357 cancel_lru_locks osc
# Disarm the OST-side fault.
2358 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
2360 echo "Inject failure, to simulate the case of missing the MDT-object"
# With fail_loc 0x1616 armed on an MDS, the files are removed with rm -f.
2361 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2362 do_facet mds1 $LCTL set_param fail_loc=0x1616
2363 rm -f $DIR/$tdir/a1/f1
2365 if [ $MDSCOUNT -ge 2 ]; then
2366 do_facet mds2 $LCTL set_param fail_loc=0x1616
2367 rm -f $DIR/$tdir/a2/f2
# Disarm the MDS-side fault on every MDS that was armed.
2375 do_facet mds1 $LCTL set_param fail_loc=0
2376 if [ $MDSCOUNT -ge 2 ]; then
2377 do_facet mds2 $LCTL set_param fail_loc=0
2380 cancel_lru_locks mdc
2381 cancel_lru_locks osc
2383 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2384 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# Poll every MDT until its lfsck_layout status reads "completed"; $LTIME is
# defined outside this chunk.
2386 for k in $(seq $MDSCOUNT); do
2387 # The LFSCK status query internal is 30 seconds. For the case
2388 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2389 # time to guarantee the status sync up.
2390 wait_update_facet mds${k} "$LCTL get_param -n \
2391 mdd.$(facet_svc mds${k}).lfsck_layout |
2392 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2393 error "(2) MDS${k} is not the expected 'completed'"
# The OSTs are checked once, without polling.
2396 for k in $(seq $OSTCOUNT); do
2397 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2398 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2399 awk '/^status/ { print $2 }')
2400 [ "$cur_status" == "completed" ] ||
2401 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# NOTE(review): the body of this `if` is not visible; $expected, used a few
# lines below, is presumably assigned here depending on MDSCOUNT -- its
# values cannot be determined from this listing.
2404 if [ $MDSCOUNT -ge 2 ]; then
2410 local repaired=$(do_facet mds1 $LCTL get_param -n \
2411 mdd.$(facet_svc mds1).lfsck_layout |
2412 awk '/^repaired_orphan/ { print $2 }')
2413 [ $repaired -eq $expected ] ||
2414 error "(4) Expect $expected fixed on mds1, but got: $repaired"
# All repairs are expected to be accounted on mds1; mds2 must report 0.
2416 if [ $MDSCOUNT -ge 2 ]; then
2417 repaired=$(do_facet mds2 $LCTL get_param -n \
2418 mdd.$(facet_svc mds2).lfsck_layout |
2419 awk '/^repaired_orphan/ { print $2 }')
2420 [ $repaired -eq 0 ] ||
2421 error "(5) Expect 0 fixed on mds2, but got: $repaired"
2424 ls -ail $MOUNT/.lustre/lost+found/
2426 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
2427 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
# NOTE(review): the -name pattern is unquoted, so the shell may expand *-N-*
# against the current directory before find sees it; cname is also not
# declared local. The test that decides whether error "(6)" fires is not
# visible in this listing.
2428 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name *-N-*)
2430 error "(6) .lustre/lost+found/MDT0001/ should be empty"
2433 echo "There should be some stub under .lustre/lost+found/MDT0000/"
2434 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2435 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# At least one *-N-* entry must exist under MDT0000 (same unquoted-pattern
# caveat as above).
2437 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name *-N-*)
2438 [ ! -z "$cname" ] ||
2439 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
# Register the test with the framework under id 18c.
2441 run_test 18c "Find out orphan OST-object and repair it (3)"
# test_18d (registered by run_test 18d at the end): with fail_loc 0x1618
# armed, chown f2/f4 and remove f1/f3, restart the file system, run layout
# LFSCK with -r -o -c -d, and expect 2 orphan repairs, 0 dangling repairs and
# unchanged sizes for f2 and f4.
# NOTE(review): lines carry a listing line-number column, and short lines
# (function header, fi/done/closing brace, and some statements) are absent
# from this fragment, so it is not runnable as shown.
2445 echo "The target MDT-object layout EA is corrupted, but the right"
2446 echo "OST-object is still alive as orphan. The layout LFSCK will"
2447 echo "not create new OST-object to occupy such slot."
# Four small files under a1: f1/f3 hold "guard", f2/f4 hold "foo"; f4 is a
# two-component PFL file, the others use the directory's one-stripe default.
# NOTE(review): no command creating $DIR/$tdir/a1 is visible before the
# setstripe below; it may be among the omitted lines.
2450 check_mount_and_prep
2452 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2453 echo "guard" > $DIR/$tdir/a1/f1
2454 echo "foo" > $DIR/$tdir/a1/f2
2456 echo "guard" > $DIR/$tdir/a1/f3
2457 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2458 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2459 echo "foo" > $DIR/$tdir/a1/f4
# Reference sizes for the final checks (field 6 of `ls -il` is the size).
2461 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2462 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
# FIDs and layouts are printed for the test log only.
2463 $LFS path2fid $DIR/$tdir/a1/f1
2464 $LFS getstripe $DIR/$tdir/a1/f1
2465 $LFS path2fid $DIR/$tdir/a1/f2
2466 $LFS getstripe $DIR/$tdir/a1/f2
2467 $LFS path2fid $DIR/$tdir/a1/f3
2468 $LFS getstripe $DIR/$tdir/a1/f3
2469 $LFS path2fid $DIR/$tdir/a1/f4
2470 $LFS getstripe $DIR/$tdir/a1/f4
2471 cancel_lru_locks osc
2473 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2474 echo "to reference the same OST-object (which is f1's OST-obejct)."
2475 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2476 echo "dangling reference case, but f2's old OST-object is there."
2478 echo "The failure also makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2479 echo "to reference the same OST-object (which is f3's OST-obejct)."
2480 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2481 echo "dangling reference case, but f4's old OST-object is there."
# The chown calls run while the fault is armed; f1 and f3 are then removed.
2484 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2485 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2486 chown 1.1 $DIR/$tdir/a1/f2
2487 chown 1.1 $DIR/$tdir/a1/f4
2488 rm -f $DIR/$tdir/a1/f1
2489 rm -f $DIR/$tdir/a1/f3
2492 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# NOTE(review): only setupall is visible; the stop step announced by the echo
# is presumably on an omitted line.
2494 echo "stopall to cleanup object cache"
2497 setupall > /dev/null
2499 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2500 $START_LAYOUT -r -o -c -d || error "(2) Fail to start LFSCK for layout!"
# Poll every MDT until its lfsck_layout status reads "completed"; $LTIME is
# defined outside this chunk.
2502 for k in $(seq $MDSCOUNT); do
2503 # The LFSCK status query internal is 30 seconds. For the case
2504 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2505 # time to guarantee the status sync up.
2506 wait_update_facet mds${k} "$LCTL get_param -n \
2507 mdd.$(facet_svc mds${k}).lfsck_layout |
2508 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2509 error "(3) MDS${k} is not the expected 'completed'"
# The OSTs are checked once, without polling.
2512 for k in $(seq $OSTCOUNT); do
2513 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2514 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2515 awk '/^status/ { print $2 }')
2516 [ "$cur_status" == "completed" ] ||
2517 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
# Two orphan repairs are expected (presumably f2's and f4's old objects).
2520 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2521 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2522 awk '/^repaired_orphan/ { print $2 }')
2523 [ $repaired -eq 2 ] ||
2524 error "(5) Expect 2 orphans have been fixed, but got: $repaired"
# No dangling repair may be counted, matching the header statement that no
# new OST-object is created for the slot.
2526 repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2527 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2528 awk '/^repaired_dangling/ { print $2 }')
2529 [ $repaired -eq 0 ] ||
2530 error "(6) Expect 0 dangling has been fixed, but got: $repaired"
2532 echo "The file size should be correct after layout LFSCK scanning"
2533 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2534 [ "$cur_size" == "$saved_size1" ] ||
2535 error "(7) Expect file2 size $saved_size1, but got $cur_size"
2537 cur_size=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2538 [ "$cur_size" == "$saved_size2" ] ||
2539 error "(8) Expect file4 size $saved_size2, but got $cur_size"
# Content, FID and layout are printed for the log; the content itself is not
# asserted.
2541 echo "The LFSCK should find back the original data."
2542 cat $DIR/$tdir/a1/f2
2543 $LFS path2fid $DIR/$tdir/a1/f2
2544 $LFS getstripe $DIR/$tdir/a1/f2
2545 cat $DIR/$tdir/a1/f4
2546 $LFS path2fid $DIR/$tdir/a1/f4
2547 $LFS getstripe $DIR/$tdir/a1/f4
# Register the test with the framework under id 18d.
2549 run_test 18d "Find out orphan OST-object and repair it (4)"
# test_18e (registered by run_test 18e at the end): same setup as the previous
# test, but LFSCK is delayed (fail_loc 0x1602, fail_val=10) so that f2/f4 can
# be appended to while the MDT is in scanning-phase2. Two *-C-* stub files
# are then expected under .lustre/lost+found/MDT0000.
# NOTE(review): lines carry a listing line-number column, and short lines
# (function header, fi/done/closing brace, and some statements) are absent
# from this fragment, so it is not runnable as shown.
2553 echo "The target MDT-object layout EA slot is occpuied by some new"
2554 echo "created OST-object when repair dangling reference case. Such"
2555 echo "conflict OST-object has been modified by others. To keep the"
2556 echo "new data, the LFSCK will create a new file to refernece this"
2557 echo "old orphan OST-object."
# Four small files under a1: f1/f3 hold "guard", f2/f4 hold "foo"; f4 is a
# two-component PFL file.
# NOTE(review): no command creating $DIR/$tdir/a1 is visible before the
# setstripe below; it may be among the omitted lines.
2560 check_mount_and_prep
2562 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2563 echo "guard" > $DIR/$tdir/a1/f1
2564 echo "foo" > $DIR/$tdir/a1/f2
2566 echo "guard" > $DIR/$tdir/a1/f3
2567 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2568 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2569 echo "foo" > $DIR/$tdir/a1/f4
# Original sizes of f2 and f4 (field 6 of `ls -il`), compared against the
# stub files at the end.
2571 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2572 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
# FIDs and layouts are printed for the test log only.
2574 $LFS path2fid $DIR/$tdir/a1/f1
2575 $LFS getstripe $DIR/$tdir/a1/f1
2576 $LFS path2fid $DIR/$tdir/a1/f2
2577 $LFS getstripe $DIR/$tdir/a1/f2
2578 $LFS path2fid $DIR/$tdir/a1/f3
2579 $LFS getstripe $DIR/$tdir/a1/f3
2580 $LFS path2fid $DIR/$tdir/a1/f4
2581 $LFS getstripe $DIR/$tdir/a1/f4
2582 cancel_lru_locks osc
2584 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2585 echo "to reference the same OST-object (which is f1's OST-obejct)."
2586 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2587 echo "dangling reference case, but f2's old OST-object is there."
2589 echo "Also the failure makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2590 echo "to reference the same OST-object (which is f3's OST-obejct)."
2591 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2592 echo "dangling reference case, but f4's old OST-object is there."
# The chown calls run while the fault is armed; f1 and f3 are then removed.
2595 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2596 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2597 chown 1.1 $DIR/$tdir/a1/f2
2598 chown 1.1 $DIR/$tdir/a1/f4
2599 rm -f $DIR/$tdir/a1/f1
2600 rm -f $DIR/$tdir/a1/f3
2603 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# NOTE(review): only setupall is visible; the stop step announced by the echo
# is presumably on an omitted line.
2605 echo "stopall to cleanup object cache"
2608 setupall > /dev/null
# Arm the LFSCK delay so there is a window to modify f2/f4 during the scan.
2610 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2611 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
2613 start_full_debug_logging
2615 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2616 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
# Wait until MDS1 reports "scanning-phase2" before touching the files.
2618 wait_update_facet mds1 "$LCTL get_param -n \
2619 mdd.$(facet_svc mds1).lfsck_layout |
2620 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2621 error "(3) MDS1 is not the expected 'scanning-phase2'"
# NOTE(review): the command(s) this comment refers to are not visible here.
2623 # to guarantee all updates are synced.
2627 echo "Write new data to f2/f4 to modify the new created OST-object."
2628 echo "dummy" >> $DIR/$tdir/a1/f2 || error "write a1/f2 failed"
2629 echo "dummy" >> $DIR/$tdir/a1/f4 || error "write a1/f4 failed"
# Remove the delay so the scan can run to completion.
2631 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
# Poll every MDT until its lfsck_layout status reads "completed".
2633 for k in $(seq $MDSCOUNT); do
2634 # The LFSCK status query internal is 30 seconds. For the case
2635 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2636 # time to guarantee the status sync up.
2637 wait_update_facet mds${k} "$LCTL get_param -n \
2638 mdd.$(facet_svc mds${k}).lfsck_layout |
2639 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2640 error "(4) MDS${k} is not the expected 'completed'"
# The OSTs are checked once, without polling.
2643 for k in $(seq $OSTCOUNT); do
2644 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2645 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2646 awk '/^status/ { print $2 }')
2647 [ "$cur_status" == "completed" ] ||
2648 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2651 stop_full_debug_logging
2653 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2654 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2655 awk '/^repaired_orphan/ { print $2 }')
2656 [ $repaired -eq 2 ] ||
2657 error "(6) Expect 2 orphans have been fixed, but got: $repaired"
2659 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2660 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2661 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# Count the *-C-* entries by counting lines of `ls -l` output; on a mismatch
# the listing is printed before failing.
2663 local count=$(ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-* | wc -l)
2664 if [ $count -ne 2 ]; then
2665 ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-*
2666 error "(8) Expect 2 stubs under lost+found, but got $count"
2669 echo "The stub file should keep the original f2 or f4 data"
# First stub: its size must equal one of the two saved sizes; the error fires
# only when it differs from both.
# NOTE(review): the -name pattern is unquoted, so the shell may expand *-C-*
# against the current directory before find sees it; cname is not declared
# local.
2670 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name *-C-* | head -n 1)
2671 local cur_size=$(ls -il $cname | awk '{ print $6 }')
2672 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2673 error "(9) Got unexpected $cur_size"
2676 $LFS path2fid $cname
2677 $LFS getstripe $cname
# Second stub (last line of the find output): same size check.
2679 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name *-C-* | tail -n 1)
2680 cur_size=$(ls -il $cname | awk '{ print $6 }')
2681 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2682 error "(10) Got unexpected $cur_size"
2685 $LFS path2fid $cname
2686 $LFS getstripe $cname
# Content, FID and layout of f2/f4 are printed for the log; the content
# itself is not asserted.
2688 echo "The f2/f4 should contains new data."
2689 cat $DIR/$tdir/a1/f2
2690 $LFS path2fid $DIR/$tdir/a1/f2
2691 $LFS getstripe $DIR/$tdir/a1/f2
2692 cat $DIR/$tdir/a1/f4
2693 $LFS path2fid $DIR/$tdir/a1/f4
2694 $LFS getstripe $DIR/$tdir/a1/f4
# Register the test with the framework under id 18e.
2696 run_test 18e "Find out orphan OST-object and repair it (5)"
# test_18f (registered by run_test 18f at the end): remove files while
# fail_loc 0x1616 is armed, then run layout LFSCK with fail_loc 0x161c armed
# on mds1 and expect "partial" results; a second, fault-free run must then
# reach "completed" everywhere.
# NOTE(review): lines carry a listing line-number column, and short lines
# (function header, fi/done/closing brace, and some statements) are absent
# from this fragment, so it is not runnable as shown.
# The test is skipped unless at least two OSTs are configured.
2699 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
2702 echo "The target MDT-object is lost. The LFSCK should re-create the"
2703 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2704 echo "to verify some OST-object(s) during the first stage-scanning,"
2705 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2706 echo "should not be affected."
# MDT index 0: a1 holds one-stripe files (guard, f1) on OST index 0; a2 holds
# f2 with two 1M stripes starting at OST index 0. Each file gets 2 MiB.
2709 check_mount_and_prep
2710 $LFS mkdir -i 0 $DIR/$tdir/a1
2711 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2712 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2713 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2714 $LFS mkdir -i 0 $DIR/$tdir/a2
2715 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
2716 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2717 $LFS getstripe $DIR/$tdir/a1/f1
2718 $LFS getstripe $DIR/$tdir/a2/f2
# MDT index 1 (only with two or more MDTs): the same arrangement in a3/a4.
2720 if [ $MDSCOUNT -ge 2 ]; then
2721 $LFS mkdir -i 1 $DIR/$tdir/a3
2722 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a3
2723 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2724 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2725 $LFS mkdir -i 1 $DIR/$tdir/a4
2726 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
2727 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2728 $LFS getstripe $DIR/$tdir/a3/f3
2729 $LFS getstripe $DIR/$tdir/a4/f4
2732 cancel_lru_locks osc
2734 echo "Inject failure, to simulate the case of missing the MDT-object"
# With fail_loc 0x1616 armed on an MDS, the files (not the guards) are
# removed with rm -f.
2735 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2736 do_facet mds1 $LCTL set_param fail_loc=0x1616
2737 rm -f $DIR/$tdir/a1/f1
2738 rm -f $DIR/$tdir/a2/f2
2740 if [ $MDSCOUNT -ge 2 ]; then
2741 do_facet mds2 $LCTL set_param fail_loc=0x1616
2742 rm -f $DIR/$tdir/a3/f3
2743 rm -f $DIR/$tdir/a4/f4
# Disarm the fault injection on every MDS that was armed.
2749 do_facet mds1 $LCTL set_param fail_loc=0
2750 if [ $MDSCOUNT -ge 2 ]; then
2751 do_facet mds2 $LCTL set_param fail_loc=0
2754 cancel_lru_locks mdc
2755 cancel_lru_locks osc
2757 echo "Inject failure, to simulate the OST0 fail to handle"
2758 echo "MDT0 LFSCK request during the first-stage scanning."
# Second fault: armed on mds1 only, with fail_val=0.
2759 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
2760 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
2762 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2763 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# First run: every MDT and ost1 must end as "partial", ost2 as "completed".
# $LTIME is defined outside this chunk.
2765 for k in $(seq $MDSCOUNT); do
2766 # The LFSCK status query internal is 30 seconds. For the case
2767 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2768 # time to guarantee the status sync up.
2769 wait_update_facet mds${k} "$LCTL get_param -n \
2770 mdd.$(facet_svc mds${k}).lfsck_layout |
2771 awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
2772 error "(2) MDS${k} is not the expected 'partial'"
2775 wait_update_facet ost1 "$LCTL get_param -n \
2776 obdfilter.$(facet_svc ost1).lfsck_layout |
2777 awk '/^status/ { print \\\$2 }'" "partial" $LTIME || {
2778 error "(3) OST1 is not the expected 'partial'"
2781 wait_update_facet ost2 "$LCTL get_param -n \
2782 obdfilter.$(facet_svc ost2).lfsck_layout |
2783 awk '/^status/ { print \\\$2 }'" "completed" $LTIME || {
2784 error "(4) OST2 is not the expected 'completed'"
# Disarm the second fault.
2787 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
# After the partial run each MDT is expected to have repaired 1 orphan.
# NOTE(review): "mds{1}" / "mds{2}" in the messages are printed literally,
# braces included.
2789 local repaired=$(do_facet mds1 $LCTL get_param -n \
2790 mdd.$(facet_svc mds1).lfsck_layout |
2791 awk '/^repaired_orphan/ { print $2 }')
2792 [ $repaired -eq 1 ] ||
2793 error "(5) Expect 1 fixed on mds{1}, but got: $repaired"
2795 if [ $MDSCOUNT -ge 2 ]; then
2796 repaired=$(do_facet mds2 $LCTL get_param -n \
2797 mdd.$(facet_svc mds2).lfsck_layout |
2798 awk '/^repaired_orphan/ { print $2 }')
2799 [ $repaired -eq 1 ] ||
2800 error "(6) Expect 1 fixed on mds{2}, but got: $repaired"
# Second run without any fault: everything must reach "completed".
2803 echo "Trigger layout LFSCK on all devices again to cleanup"
2804 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
2806 for k in $(seq $MDSCOUNT); do
2807 # The LFSCK status query internal is 30 seconds. For the case
2808 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2809 # time to guarantee the status sync up.
2810 wait_update_facet mds${k} "$LCTL get_param -n \
2811 mdd.$(facet_svc mds${k}).lfsck_layout |
2812 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2813 error "(8) MDS${k} is not the expected 'completed'"
# NOTE(review): cur_status is assigned here without a visible `local`.
2816 for k in $(seq $OSTCOUNT); do
2817 cur_status=$(do_facet ost${k} $LCTL get_param -n \
2818 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2819 awk '/^status/ { print $2 }')
2820 [ "$cur_status" == "completed" ] ||
2821 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
# The second run is expected to report 2 orphan repairs per MDT.
# NOTE(review): `local repaired` repeats the declaration made earlier.
2825 local repaired=$(do_facet mds1 $LCTL get_param -n \
2826 mdd.$(facet_svc mds1).lfsck_layout |
2827 awk '/^repaired_orphan/ { print $2 }')
2828 [ $repaired -eq 2 ] ||
2829 error "(10) Expect 2 fixed on mds{1}, but got: $repaired"
2831 if [ $MDSCOUNT -ge 2 ]; then
2832 repaired=$(do_facet mds2 $LCTL get_param -n \
2833 mdd.$(facet_svc mds2).lfsck_layout |
2834 awk '/^repaired_orphan/ { print $2 }')
2835 [ $repaired -eq 2 ] ||
2836 error "(11) Expect 2 fixed on mds{2}, but got: $repaired"
# Register the test with the framework under id 18f.
2839 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# test_18g (registered by run_test 18g at the end): remove a file striped
# over all OSTs while fail_loc 0x162e is armed, run layout LFSCK on all
# devices, expect $OSTCOUNT orphan repairs, and move the recovered
# <fid>-R-0 entry back to its original path.
# NOTE(review): lines carry a listing line-number column, and short lines
# (function header, done/closing brace, possibly other statements) are absent
# from this fragment, so it is not runnable as shown.
2843 echo "The target MDT-object is lost, but related OI mapping is there"
2844 echo "The LFSCK should recreate the lost MDT-object without affected"
2845 echo "by the stale OI mapping."
# -c -1 stripes over all OSTs with 1M stripes; writing $OSTCOUNT MiB puts
# one stripe-sized chunk on each OST.
2848 check_mount_and_prep
2849 $LFS mkdir -i 0 $DIR/$tdir/a1
2850 $LFS setstripe -c -1 -i 0 -S 1M $DIR/$tdir/a1
2851 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=$OSTCOUNT
# The FID is needed later to locate the recovered entry in lost+found.
2852 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2854 $LFS getstripe $DIR/$tdir/a1/f1
2855 cancel_lru_locks osc
2857 echo "Inject failure to simulate lost MDT-object but keep OI mapping"
# The file is removed while the fault is armed on mds1.
2858 #define OBD_FAIL_LFSCK_LOST_MDTOBJ2 0x162e
2859 do_facet mds1 $LCTL set_param fail_loc=0x162e
2860 rm -f $DIR/$tdir/a1/f1
2862 do_facet mds1 $LCTL set_param fail_loc=0
2863 cancel_lru_locks mdc
2864 cancel_lru_locks osc
2866 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2867 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# Poll every MDT until its lfsck_layout status reads "completed"; $LTIME is
# defined outside this chunk.
2869 for k in $(seq $MDSCOUNT); do
2870 # The LFSCK status query internal is 30 seconds. For the case
2871 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2872 # time to guarantee the status sync up.
2873 wait_update_facet mds${k} "$LCTL get_param -n \
2874 mdd.$(facet_svc mds${k}).lfsck_layout |
2875 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2876 error "(2) MDS${k} is not the expected 'completed'"
# The OSTs are checked once, without polling.
2879 for k in $(seq $OSTCOUNT); do
2880 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2881 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2882 awk '/^status/ { print $2 }')
2883 [ "$cur_status" == "completed" ] ||
2884 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# One orphan repair per OST is expected.
2887 local repaired=$(do_facet mds1 $LCTL get_param -n \
2888 mdd.$(facet_svc mds1).lfsck_layout |
2889 awk '/^repaired_orphan/ { print $2 }')
2890 [ $repaired -eq $OSTCOUNT ] ||
2891 error "(4) Expect $OSTCOUNT fixed, but got: $repaired"
# The recovered entry is addressed as <original FID>-R-0 under MDT0000.
2893 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2894 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2895 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
# FID and layout of the restored file are printed for the test log.
2897 $LFS path2fid $DIR/$tdir/a1/f1
2898 $LFS getstripe $DIR/$tdir/a1/f1
# Register the test with the framework under id 18g.
2900 run_test 18g "Find out orphan OST-object and repair it (7)"
2904 echo "The PFL extent crashed. During the first cycle LFSCK scanning,"
2905 echo "the layout LFSCK will keep the bad PFL file(s) there without"
2906 echo "scanning its OST-object(s). Then in the second stage scanning,"
2907 echo "the OST will return related OST-object(s) to the MDT as orphan."
2908 echo "And then the LFSCK on the MDT can rebuild the PFL extent with"
2909 echo "the 'orphan(s)' stripe information."
2912 check_mount_and_prep
2914 $LFS setstripe -E 2M -S 1M -c 1 -E -1 $DIR/$tdir/f0 ||
2915 error "(0) Fail to create PFL $DIR/$tdir/f0"
2917 cat $LUSTRE/tests/test-framework.sh > $DIR/$tdir/f0 ||
2918 error "(1.1) Fail to write $DIR/$tdir/f0"
2920 dd if=$LUSTRE/tests/test-framework.sh of=$DIR/$tdir/f0 bs=1M seek=2 ||
2921 error "(1.2) Fail to write $DIR/$tdir/f0"
2923 cp $DIR/$tdir/f0 $DIR/$tdir/guard
2925 echo "Inject failure stub to simulate bad PFL extent range"
2926 #define OBD_FAIL_LFSCK_BAD_PFL_RANGE 0x162f
2927 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162f
2929 chown 1.1 $DIR/$tdir/f0
2931 cancel_lru_locks mdc
2932 cancel_lru_locks osc
2933 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2935 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 &&
2936 error "(2) Write to bad PFL file should fail"
2938 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
2939 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
2941 for k in $(seq $MDSCOUNT); do
2942 # The LFSCK status query internal is 30 seconds. For the case
2943 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2944 # time to guarantee the status sync up.
2945 wait_update_facet mds${k} "$LCTL get_param -n \
2946 mdd.$(facet_svc mds${k}).lfsck_layout |
2947 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2948 error "(4.1) MDS${k} is not the expected 'completed'"
2951 for k in $(seq $OSTCOUNT); do
2952 cur_status=$(do_facet ost${k} $LCTL get_param -n \
2953 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2954 awk '/^status/ { print $2 }')
2955 [ "$cur_status" == "completed" ] ||
2956 error "(4.2) OST${k} Expect 'completed', but got '$cur_status'"
2960 local repaired=$($SHOW_LAYOUT |
2961 awk '/^repaired_orphan/ { print $2 }')
2962 [ $repaired -eq 2 ] ||
2963 error "(5) Fail to repair crashed PFL range: $repaired"
2965 echo "Data in $DIR/$tdir/f0 should not be broken"
2966 diff $DIR/$tdir/f0 $DIR/$tdir/guard ||
2967 error "(6) Data in $DIR/$tdir/f0 is broken"
2969 echo "Write should succeed after LFSCK repairing the bad PFL range"
2970 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 ||
2971 error "(7) Write should succeed after LFSCK"
# Hand test 18h and its one-line description to run_test.
# NOTE(review): run_test is not defined in this chunk; presumably it comes
# from the sourced test framework — confirm.
2973 run_test 18h "LFSCK can repair crashed PFL extent range"
# Apply "debug=-cache" locally through lctl; the command output is discarded.
# NOTE(review): presumably this removes "cache" from the debug mask — confirm.
2975 $LCTL set_param debug=-cache > /dev/null
# Test 19a body: after lfsck_verify_pfid is set to 1 for OST0000 and
# fail_loc 0x1619 is armed locally, reads of a0 and a1 are expected to fail.
# NOTE(review): check_mount_and_prep, do_nodes, cancel_lru_locks and error
# are defined outside this chunk — confirm their exact behaviour.
2978 check_mount_and_prep
2979 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# Write lfsck_verify_pfid=0 on all OST nodes before the files are created.
# NOTE(review): presumably 0 turns the parent-FID verification off — confirm.
2981 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2982 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
# a0 is created by a plain write; a1 is created as a two-component PFL file
# before being written; a2 is written but not read in the lines shown here.
2984 echo "foo1" > $DIR/$tdir/a0
2985 $LFS setstripe -E 512K -S 512K -o 0 -E -1 -S 1M $DIR/$tdir/a1 ||
2986 error "(0) Fail to create PFL $DIR/$tdir/a1"
2987 echo "foo2" > $DIR/$tdir/a1
2988 echo "guard" > $DIR/$tdir/a2
2989 cancel_lru_locks osc
2991 echo "Inject failure, then client will offer wrong parent FID when read"
# Switch lfsck_verify_pfid to 1 on the OST nodes, then arm the failure
# stub on the node running this script (no do_facet/do_nodes wrapper).
2992 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2993 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2995 #define OBD_FAIL_LFSCK_INVALID_PFID 0x1619
2996 $LCTL set_param fail_loc=0x1619
2998 echo "Read RPC with wrong parent FID should be denied"
# A successful cat is the failure case here: error is called only when
# the read unexpectedly succeeds.
2999 cat $DIR/$tdir/a0 && error "(3.1) Read a0 should be denied!"
3000 cat $DIR/$tdir/a1 && error "(3.2) Read a1 should be denied!"
# Clear the failure stub again.
3001 $LCTL set_param fail_loc=0
# Register test 19a with its description.
3003 run_test 19a "OST-object inconsistency self detect"
# Test 19b body: create two files while fail_loc 0x1611 is armed on the OST
# nodes, confirm that the "repaired" counter read from lfsck_verify_pfid is 0
# while verification is set to 0, then set it to 1, read both files and
# expect the counter to become 2 (one per file read).
# NOTE(review): check_mount_and_prep, do_nodes, do_facet, cancel_lru_locks
# and error are defined outside this chunk — confirm their exact behaviour.
3006 check_mount_and_prep
3007 $LFS setstripe -c 1 -i 0 $DIR/$tdir
3009 echo "Inject failure stub to make the OST-object to back point to"
3010 echo "non-exist MDT-object"
3012 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
3013 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
3015 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
3016 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
# f0 is created by a plain write, f1 as a two-component PFL file; both are
# written while the failure stub is armed.
3017 echo "foo1" > $DIR/$tdir/f0
3018 $LFS setstripe -E 1M -S 1M -o 0 -E 4M -S 256K $DIR/$tdir/f1 ||
3019 error "(0) Fail to create PFL $DIR/$tdir/f1"
3020 echo "foo2" > $DIR/$tdir/f1
3021 cancel_lru_locks osc
3022 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
# Write 0 to lfsck_verify_pfid on ost1 once more and read the parameter back.
# NOTE(review): presumably writing the parameter resets its counters — confirm.
3024 do_facet ost1 $LCTL set_param -n \
3025 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
3026 echo "Nothing should be fixed since self detect and repair is disabled"
3027 local repaired=$(do_facet ost1 $LCTL get_param -n \
3028 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
3029 awk '/^repaired/ { print $2 }')
3030 [ $repaired -eq 0 ] ||
3031 error "(1) Expected 0 repaired, but got $repaired"
3033 echo "Read RPC with right parent FID should be accepted,"
3034 echo "and cause parent FID on OST to be fixed"
3036 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
3037 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
3039 cat $DIR/$tdir/f0 || error "(2.1) Read f0 should not be denied!"
3040 cat $DIR/$tdir/f1 || error "(2.2) Read f1 should not be denied!"
3042 repaired=$(do_facet ost1 $LCTL get_param -n \
3043 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
3044 awk '/^repaired/ { print $2 }')
# Two files were read above, so two repairs are expected; the message
# states the same number that the check enforces.
3045 [ $repaired -eq 2 ] ||
3046 error "(3) Expected 2 repaired, but got $repaired"
# Register test 19b with its description.
3048 run_test 19b "OST-object inconsistency self repair"
# Expected outputs of "lfs getstripe -L", compared against in tests 20a/20b:
# one value for a layout flagged as having a hole, one for a layout without.
# NOTE(review): 40000001 looks like the hole flag 0x40000000 combined with
# the raid0 pattern value 1 — confirm.
3050 PATTERN_WITH_HOLE="40000001"
3051 PATTERN_WITHOUT_HOLE="raid0"
# Test 20a, part 1: create striped files under a1, record their FIDs, then
# unlink them while failure stubs are armed on mds1 so that (per the echo
# texts) the MDT-object and selected OST-objects appear lost.
# The test is skipped when fewer than two OSTs are configured.
3054 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
3057 echo "The target MDT-object and some of its OST-object are lost."
3058 echo "The LFSCK should find out the left OST-objects and re-create"
3059 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
3060 echo "with the partial OST-objects (LOV EA hole)."
3062 echo "New client can access the file with LOV EA hole via normal"
3063 echo "system tools or commands without crash the system."
3065 echo "For old client, even though it cannot access the file with"
3066 echo "LOV EA hole, it should not cause the system crash."
3069 check_mount_and_prep
3070 $LFS mkdir -i 0 $DIR/$tdir/a1
# Three stripes when more than two OSTs are available; the second
# setstripe line is the two-stripe variant.
# NOTE(review): the assignment of bcount is not on any line visible in this
# listing — confirm where it is set before the dd commands below use it.
3071 if [ $OSTCOUNT -gt 2 ]; then
3072 $LFS setstripe -c 3 -i 0 -S 1M $DIR/$tdir/a1
3075 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a1
3079 # 256 blocks on the stripe0.
3080 # 1 block on the stripe1 for 2 OSTs case.
3081 # 256 blocks on the stripe1 for other cases.
3082 # 1 block on the stripe2 if OSTs > 2
3083 dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
3084 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
3085 dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount
# Remember the FIDs: the later checks look the files up by FID under
# .lustre/lost+found.
3087 local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
3088 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
3089 local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)
3092 $LFS getstripe $DIR/$tdir/a1/f0
3094 $LFS getstripe $DIR/$tdir/a1/f1
3096 $LFS getstripe $DIR/$tdir/a1/f2
# f3 only exists when a third stripe is available. fid3 is declared local
# like fid0-fid2 so that it does not leak into the global scope.
3098 if [ $OSTCOUNT -gt 2 ]; then
3099 dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
3100 local fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
3102 $LFS getstripe $DIR/$tdir/a1/f3
3105 cancel_lru_locks osc
3107 echo "Inject failure..."
3108 echo "To simulate f0 lost MDT-object"
3109 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
3110 do_facet mds1 $LCTL set_param fail_loc=0x1616
3111 rm -f $DIR/$tdir/a1/f0
3113 echo "To simulate f1 lost MDT-object and OST-object0"
3114 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
3115 do_facet mds1 $LCTL set_param fail_loc=0x161a
3116 rm -f $DIR/$tdir/a1/f1
# fail_val is changed between unlinks; per the echo texts it selects which
# OST-object index is lost together with the MDT-object.
3118 echo "To simulate f2 lost MDT-object and OST-object1"
3119 do_facet mds1 $LCTL set_param fail_val=1
3120 rm -f $DIR/$tdir/a1/f2
3122 if [ $OSTCOUNT -gt 2 ]; then
3123 echo "To simulate f3 lost MDT-object and OST-object2"
3124 do_facet mds1 $LCTL set_param fail_val=2
3125 rm -f $DIR/$tdir/a1/f3
3128 umount_client $MOUNT
# Disarm the failure stubs on mds1.
3131 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
# Test 20a, part 2: start the layout LFSCK via $START_LAYOUT with "-r -o",
# check that every MDT and OST reports status "completed", then compare the
# repaired_orphan counter on mds1 with the expected value.
3133 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3134 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
3136 for k in $(seq $MDSCOUNT); do
3137 # The LFSCK status query interval is 30 seconds. For the case
3138 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3139 # time to guarantee the status sync up.
3140 wait_update_facet mds${k} "$LCTL get_param -n \
3141 mdd.$(facet_svc mds${k}).lfsck_layout |
3142 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3143 error "(2) MDS${k} is not the expected 'completed'"
# The OST status is sampled once per OST, without a wait helper.
3146 for k in $(seq $OSTCOUNT); do
3147 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3148 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3149 awk '/^status/ { print $2 }')
3150 [ "$cur_status" == "completed" ] ||
3151 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan value: 9 with more than two OSTs; the second
# check (presumably the else branch, not shown in this listing) expects 4.
3154 local repaired=$(do_facet mds1 $LCTL get_param -n \
3155 mdd.$(facet_svc mds1).lfsck_layout |
3156 awk '/^repaired_orphan/ { print $2 }')
3157 if [ $OSTCOUNT -gt 2 ]; then
3158 [ $repaired -eq 9 ] ||
3159 error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
3161 [ $repaired -eq 4 ] ||
3162 error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
# Test 20a, part 3: remount the client and verify the entry named after fid0
# under .lustre/lost+found/MDT0000: no hole pattern, full stripe count,
# expected size, and ordinary read/append/chown/touch/unlink all succeed.
3165 mount_client $MOUNT || error "(5.0) Fail to start client!"
# NOTE(review): LOV_PATTERN_F_HOLE is assigned here but not read on any line
# visible in this chunk — confirm whether it is still needed.
3167 LOV_PATTERN_F_HOLE=0x40000000
3170 # ${fid0}-R-0 is the old f0
3172 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
3173 echo "Check $name, which is the old f0"
3175 $LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"
3177 local pattern=$($LFS getstripe -L $name)
3178 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3179 error "(5.2) NOT expect pattern flag hole, but got $pattern"
# Stripe count must match the count used at creation: 3 with more than two
# OSTs; the second check (presumably the else branch) expects 2.
3181 local stripes=$($LFS getstripe -c $name)
3182 if [ $OSTCOUNT -gt 2 ]; then
3183 [ $stripes -eq 3 ] ||
3184 error "(5.3.1) expect the stripe count is 3, but got $stripes"
3186 [ $stripes -eq 2 ] ||
3187 error "(5.3.2) expect the stripe count is 2, but got $stripes"
# Size is taken from the "Size:" line of stat output and compared with
# 4096 * bcount.
3190 local size=$(stat $name | awk '/Size:/ { print $2 }')
3191 [ $size -eq $((4096 * $bcount)) ] ||
3192 error "(5.4) expect the size $((4096 * $bcount)), but got $size"
3194 cat $name > /dev/null || error "(5.5) cannot read $name"
3196 echo "dummy" >> $name || error "(5.6) cannot write $name"
3198 chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"
3200 touch $name || error "(5.8) cannot touch $name"
3202 rm -f $name || error "(5.9) cannot unlink $name"
# Test 20a, part 4: verify the entry named after fid1. Its layout must carry
# the hole pattern; a plain read, a write at offset 0 and an append must
# fail, while a write at block 300, chown, touch and unlink must succeed.
3205 # ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
3207 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3208 if [ $OSTCOUNT -gt 2 ]; then
3209 echo "Check $name, it contains the old f1's stripe1 and stripe2"
3211 echo "Check $name, it contains the old f1's stripe1"
3214 $LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"
3216 pattern=$($LFS getstripe -L $name)
3217 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3218 error "(6.2) expect pattern flag hole, but got $pattern"
3220 stripes=$($LFS getstripe -c $name)
3221 if [ $OSTCOUNT -gt 2 ]; then
3222 [ $stripes -eq 3 ] ||
3223 error "(6.3.1) expect the stripe count is 3, but got $stripes"
3225 [ $stripes -eq 2 ] ||
3226 error "(6.3.2) expect the stripe count is 2, but got $stripes"
3229 size=$(stat $name | awk '/Size:/ { print $2 }')
3230 [ $size -eq $((4096 * $bcount)) ] ||
3231 error "(6.4) expect the size $((4096 * $bcount)), but got $size"
# Here a successful cat is the failure case.
3233 cat $name > /dev/null && error "(6.5) normal read $name should fail"
# dd with conv=sync,noerror keeps going after read errors; the number of
# "Input/output error" lines on its stderr is counted and must be 256.
3235 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3236 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3239 [ $failures -eq 256 ] ||
3240 error "(6.6) expect 256 IO failures, but get $failures"
# The dump file written by dd must still have the full expected size.
3242 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3243 [ $size -eq $((4096 * $bcount)) ] ||
3244 error "(6.7) expect the size $((4096 * $bcount)), but got $size"
# Block 0 is expected to be in the hole; block 300 in an existing stripe.
3246 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
3247 error "(6.8) write to the LOV EA hole should fail"
3249 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
3250 error "(6.9) write to normal stripe should NOT fail"
3252 echo "foo" >> $name && error "(6.10) append write $name should fail"
3254 chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"
3256 touch $name || error "(6.12) cannot touch $name"
3258 rm -f $name || error "(6.13) cannot unlink $name"
# Test 20a, part 5: verify the entry named after fid2. The checks come in
# two variants: the x.1 labels belong to the "more than two OSTs" branch,
# the x.2 labels to the other one (the else/fi lines are not shown in this
# listing). Both variants end with the common unlink at label (7.11).
3261 # ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
3263 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
3264 if [ $OSTCOUNT -gt 2 ]; then
3265 echo "Check $name, it contains the old f2's stripe0 and stripe2"
3267 echo "Check $name, it contains the old f2's stripe0"
3270 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
3272 pattern=$($LFS getstripe -L $name)
3273 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3274 error "(7.2) expect pattern flag hole, but got $pattern"
3276 stripes=$($LFS getstripe -c $name)
3277 size=$(stat $name | awk '/Size:/ { print $2 }')
# Variant for more than two OSTs: 3 stripes, size 4096 * bcount.
3278 if [ $OSTCOUNT -gt 2 ]; then
3279 [ $stripes -eq 3 ] ||
3280 error "(7.3.1) expect the stripe count is 3, but got $stripes"
3282 [ $size -eq $((4096 * $bcount)) ] ||
3283 error "(7.4.1) expect size $((4096 * $bcount)), but got $size"
3285 cat $name > /dev/null &&
3286 error "(7.5.1) normal read $name should fail"
3288 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3289 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3291 [ $failures -eq 256 ] ||
3292 error "(7.6) expect 256 IO failures, but get $failures"
3294 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3295 [ $size -eq $((4096 * $bcount)) ] ||
3296 error "(7.7) expect the size $((4096 * $bcount)), but got $size"
# In this variant block 300 is expected to hit the hole, block 0 is not.
3298 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3299 seek=300 && error "(7.8.0) write to the LOV EA hole should fail"
3301 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
3302 error "(7.8.1) write to normal stripe should NOT fail"
3304 echo "foo" >> $name &&
3305 error "(7.8.3) append write $name should fail"
3307 chown $RUNAS_ID:$RUNAS_GID $name ||
3308 error "(7.9.1) cannot chown on $name"
3310 touch $name || error "(7.10.1) cannot touch $name"
# Two-OST variant: 2 stripes and a reported size of 4096 * 256.
3312 [ $stripes -eq 2 ] ||
3313 error "(7.3.2) expect the stripe count is 2, but got $stripes"
3316 [ $size -eq $((4096 * (256 + 0))) ] ||
3317 error "(7.4.2) expect the size $((4096 * 256)), but got $size"
3319 cat $name > /dev/null &&
3320 error "(7.5.2) normal read $name should fail"
3322 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3323 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3324 [ $failures -eq 256 ] ||
3325 error "(7.6.2) expect 256 IO failures, but get $failures"
3328 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3329 [ $size -eq $((4096 * $bcount)) ] ||
3330 error "(7.7.2) expect the size $((4096 * $bcount)), got $size"
# In this variant block 256 is expected to hit the hole.
3332 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3333 seek=256 && error "(7.8.2) write to the LOV EA hole should fail"
3335 chown $RUNAS_ID:$RUNAS_GID $name ||
3336 error "(7.9.2) cannot chown on $name"
3338 touch $name || error "(7.10.2) cannot touch $name"
3341 rm -f $name || error "(7.11) cannot unlink $name"
# Test 20a, part 6: f3 only exists with more than two OSTs, so return early
# otherwise. Then verify the entry named after fid3: hole pattern, 3 stripes,
# reported size 4096 * 512, failing plain read and hole write, and working
# chown/touch/unlink.
3343 [ $OSTCOUNT -le 2 ] && return
3346 # ${fid3}-R-0 should contains the old f3's stripe0 and stripe1
3348 name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
3349 echo "Check $name, which contains the old f3's stripe0 and stripe1"
3351 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
3353 pattern=$($LFS getstripe -L $name)
3354 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3355 error "(8.2) expect pattern flag hole, but got $pattern"
3357 stripes=$($LFS getstripe -c $name)
3358 [ $stripes -eq 3 ] ||
3359 error "(8.3) expect the stripe count is 3, but got $stripes"
3361 size=$(stat $name | awk '/Size:/ { print $2 }')
3363 [ $size -eq $((4096 * (256 + 256 + 0))) ] ||
3364 error "(8.4) expect the size $((4096 * 512)), but got $size"
# A successful plain read is the failure case.
3366 cat $name > /dev/null &&
3367 error "(8.5) normal read $name should fail"
# Count the "Input/output error" lines dd prints while it continues past
# read errors (conv=sync,noerror).
3369 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3370 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3372 [ $failures -eq 256 ] ||
3373 error "(8.6) expect 256 IO failures, but get $failures"
3376 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3377 [ $size -eq $((4096 * $bcount)) ] ||
3378 error "(8.7) expect the size $((4096 * $bcount)), but got $size"
# Block 512 is expected to fall into the hole.
3380 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3381 seek=512 && error "(8.8) write to the LOV EA hole should fail"
3383 chown $RUNAS_ID:$RUNAS_GID $name ||
3384 error "(8.9) cannot chown on $name"
3386 touch $name || error "(8.10) cannot touch $name"
3388 rm -f $name || error "(8.11) cannot unlink $name"
# Register test 20a with its description.
3390 run_test 20a "Handle the orphan with dummy LOV EA slot properly"
# Test 20b, part 1 (PFL variant of 20a): create three two-component PFL
# files, fill them, record their FIDs, then arm failure stubs on $SINGLEMDS.
# The test is skipped when fewer than two OSTs are configured.
# NOTE(review): no command that removes f0/f1/f2 is visible between the
# fail_loc settings in this listing — confirm against the full script.
3393 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
3396 echo "The target MDT-object and some of its OST-object are lost."
3397 echo "The LFSCK should find out the left OST-objects and re-create"
3398 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
3399 echo "with the partial OST-objects (LOV EA hole)."
3401 echo "New client can access the file with LOV EA hole via normal"
3402 echo "system tools or commands without crash the system - PFL case."
3405 check_mount_and_prep
# Each file gets component 1 up to 2M and component 2 up to EOF, both with
# 1M stripe size and 2 stripes.
3407 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f0 ||
3408 error "(0) Fail to create PFL file $DIR/$tdir/f0"
3409 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f1 ||
3410 error "(1) Fail to create PFL file $DIR/$tdir/f1"
3411 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f2 ||
3412 error "(2) Fail to create PFL file $DIR/$tdir/f2"
# 769 blocks of 4096 bytes: three full 1M stripes plus one extra block.
3414 local bcount=$((256 * 3 + 1))
3416 dd if=/dev/zero of=$DIR/$tdir/f0 bs=4096 count=$bcount
3417 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4096 count=$bcount
3418 dd if=/dev/zero of=$DIR/$tdir/f2 bs=4096 count=$bcount
# The FIDs are used later to locate the entries under .lustre/lost+found.
3420 local fid0=$($LFS path2fid $DIR/$tdir/f0)
3421 local fid1=$($LFS path2fid $DIR/$tdir/f1)
3422 local fid2=$($LFS path2fid $DIR/$tdir/f2)
3425 $LFS getstripe $DIR/$tdir/f0
3427 $LFS getstripe $DIR/$tdir/f1
3429 $LFS getstripe $DIR/$tdir/f2
3431 cancel_lru_locks mdc
3432 cancel_lru_locks osc
3434 echo "Inject failure..."
3435 echo "To simulate f0 lost MDT-object"
3436 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
3437 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1616
3440 echo "To simulate the case of f1 lost MDT-object and "
3441 echo "the first OST-object in each PFL component"
3442 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
3443 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161a
3446 echo "To simulate the case of f2 lost MDT-object and "
3447 echo "the second OST-object in each PFL component"
3448 do_facet $SINGLEMDS $LCTL set_param fail_val=1
# Disarm the failure stubs.
3453 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# Test 20b, part 2: start the layout LFSCK via $START_LAYOUT with "-r -o",
# require status "completed" on every MDT and OST, and expect the
# repaired_orphan counter on mds1 to be 8.
3455 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3456 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
3458 for k in $(seq $MDSCOUNT); do
3459 # The LFSCK status query interval is 30 seconds. For the case
3460 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3461 # time to guarantee the status sync up.
3462 wait_update_facet mds${k} "$LCTL get_param -n \
3463 mdd.$(facet_svc mds${k}).lfsck_layout |
3464 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3465 error "(4) MDS${k} is not the expected 'completed'"
# The OST status is sampled once per OST, without a wait helper.
3468 for k in $(seq $OSTCOUNT); do
3469 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3470 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3471 awk '/^status/ { print $2 }')
3472 [ "$cur_status" == "completed" ] ||
3473 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
3476 local repaired=$(do_facet mds1 $LCTL get_param -n \
3477 mdd.$(facet_svc mds1).lfsck_layout |
3478 awk '/^repaired_orphan/ { print $2 }')
3479 [ $repaired -eq 8 ] ||
3480 error "(6) Expect 8 fixed on mds1, but got: $repaired"
# Test 20b, part 3: verify the entry named after fid0. Both components
# (selected with -I1 / -I2) must have no hole pattern, 2 stripes and the
# original extents [0, 2097152) and [2097152, EOF); size and the usual
# read/append/chown/touch/unlink operations must work.
3483 # ${fid0}-R-0 is the old f0
3485 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
3486 echo "Check $name, which is the old f0"
3488 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
3490 local pattern=$($LFS getstripe -L -I1 $name)
3491 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3492 error "(7.2.1) NOT expect pattern flag hole, but got $pattern"
3494 pattern=$($LFS getstripe -L -I2 $name)
3495 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3496 error "(7.2.2) NOT expect pattern flag hole, but got $pattern"
3498 local stripes=$($LFS getstripe -c -I1 $name)
3499 [ $stripes -eq 2 ] ||
3500 error "(7.3.1) expect 2 stripes, but got $stripes"
3502 stripes=$($LFS getstripe -c -I2 $name)
3503 [ $stripes -eq 2 ] ||
3504 error "(7.3.2) expect 2 stripes, but got $stripes"
# Extent boundaries are parsed from the lcme_extent.* lines of getstripe.
3506 local e_start=$($LFS getstripe -I1 $name |
3507 awk '/lcme_extent.e_start:/ { print $2 }')
3508 [ $e_start -eq 0 ] ||
3509 error "(7.4.1) expect the COMP1 start at 0, got $e_start"
3511 local e_end=$($LFS getstripe -I1 $name |
3512 awk '/lcme_extent.e_end:/ { print $2 }')
3513 [ $e_end -eq 2097152 ] ||
3514 error "(7.4.2) expect the COMP1 end at 2097152, got $e_end"
3516 e_start=$($LFS getstripe -I2 $name |
3517 awk '/lcme_extent.e_start:/ { print $2 }')
3518 [ $e_start -eq 2097152 ] ||
3519 error "(7.5.1) expect the COMP2 start at 2097152, got $e_start"
# The end of the last component is compared as the string "EOF".
3521 e_end=$($LFS getstripe -I2 $name |
3522 awk '/lcme_extent.e_end:/ { print $2 }')
3523 [ "$e_end" = "EOF" ] ||
3524 error "(7.5.2) expect the COMP2 end at (EOF), got $e_end"
3526 local size=$(stat $name | awk '/Size:/ { print $2 }')
3527 [ $size -eq $((4096 * $bcount)) ] ||
3528 error "(7.6) expect the size $((4096 * $bcount)), but got $size"
3530 cat $name > /dev/null || error "(7.7) cannot read $name"
3532 echo "dummy" >> $name || error "(7.8) cannot write $name"
3534 chown $RUNAS_ID:$RUNAS_GID $name || error "(7.9) cannot chown on $name"
3536 touch $name || error "(7.10) cannot touch $name"
3538 rm -f $name || error "(7.11) cannot unlink $name"
# Test 20b, part 4: verify the entry named after fid1. Both components must
# carry the hole pattern, keep 2 stripes and the original extents; plain
# read, write at offset 0 and append must fail, while a write at block 300,
# chown, touch and unlink must succeed.
3541 # ${fid1}-R-0 contains the old f1's second stripe in each COMP
3543 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3544 echo "Check $name, it contains f1's second OST-object in each COMP"
3546 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
3548 pattern=$($LFS getstripe -L -I1 $name)
3549 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3550 error "(8.2.1) expect pattern flag hole, but got $pattern"
3552 pattern=$($LFS getstripe -L -I2 $name)
3553 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3554 error "(8.2.2) expect pattern flag hole, but got $pattern"
# The COMP1 check is labelled (8.3.1) so that a failure can be told apart
# from the COMP2 check (8.3.2) that follows.
3556 stripes=$($LFS getstripe -c -I1 $name)
3557 [ $stripes -eq 2 ] ||
3558 error "(8.3.1) expect 2 stripes, but got $stripes"
3560 stripes=$($LFS getstripe -c -I2 $name)
3561 [ $stripes -eq 2 ] ||
3562 error "(8.3.2) expect 2 stripes, but got $stripes"
# Extent boundaries are parsed from the lcme_extent.* lines of getstripe.
3564 e_start=$($LFS getstripe -I1 $name |
3565 awk '/lcme_extent.e_start:/ { print $2 }')
3566 [ $e_start -eq 0 ] ||
3567 error "(8.4.1) expect the COMP1 start at 0, got $e_start"
3569 e_end=$($LFS getstripe -I1 $name |
3570 awk '/lcme_extent.e_end:/ { print $2 }')
3571 [ $e_end -eq 2097152 ] ||
3572 error "(8.4.2) expect the COMP1 end at 2097152, got $e_end"
3574 e_start=$($LFS getstripe -I2 $name |
3575 awk '/lcme_extent.e_start:/ { print $2 }')
3576 [ $e_start -eq 2097152 ] ||
3577 error "(8.5.1) expect the COMP2 start at 2097152, got $e_start"
3579 e_end=$($LFS getstripe -I2 $name |
3580 awk '/lcme_extent.e_end:/ { print $2 }')
3581 [ "$e_end" = "EOF" ] ||
3582 error "(8.5.2) expect the COMP2 end at (EOF), got $e_end"
3584 size=$(stat $name | awk '/Size:/ { print $2 }')
3585 [ $size -eq $((4096 * $bcount)) ] ||
3586 error "(8.6) expect the size $((4096 * $bcount)), but got $size"
# A successful plain read is the failure case.
3588 cat $name > /dev/null && error "(8.7) normal read $name should fail"
# dd continues past read errors (conv=sync,noerror); its
# "Input/output error" lines are counted.
3590 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3591 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3593 # The first stripe in each COMP was lost
3594 [ $failures -eq 512 ] ||
3595 error "(8.8) expect 512 IO failures, but get $failures"
3597 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3598 [ $size -eq $((4096 * $bcount)) ] ||
3599 error "(8.9) expect the size $((4096 * $bcount)), but got $size"
# Block 0 is expected to be in the hole; block 300 in an existing stripe.
3601 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
3602 error "(8.10) write to the LOV EA hole should fail"
3604 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
3605 error "(8.11) write to normal stripe should NOT fail"
3607 echo "foo" >> $name && error "(8.12) append write $name should fail"
3609 chown $RUNAS_ID:$RUNAS_GID $name || error "(8.13) cannot chown on $name"
3611 touch $name || error "(8.14) cannot touch $name"
3613 rm -f $name || error "(8.15) cannot unlink $name"
# Test 20b, part 5: verify the entry named after fid2. Both components must
# carry the hole pattern, keep 2 stripes and the original extents; plain
# read, write at block 300 and append must fail, while a write at offset 0,
# chown, touch and unlink must succeed.
3616 # ${fid2}-R-0 contains the old f2's first stripe in each COMP
3618 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
3619 echo "Check $name, it contains f2's first stripe in each COMP"
3621 $LFS getstripe -v $name || error "(9.1) cannot getstripe on $name"
3623 pattern=$($LFS getstripe -L -I1 $name)
3624 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3625 error "(9.2.1) expect pattern flag hole, but got $pattern"
3627 pattern=$($LFS getstripe -L -I2 $name)
3628 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3629 error "(9.2.2) expect pattern flag hole, but got $pattern"
# The COMP1 check is labelled (9.3.1) so that a failure can be told apart
# from the COMP2 check (9.3.2) that follows.
3631 stripes=$($LFS getstripe -c -I1 $name)
3632 [ $stripes -eq 2 ] ||
3633 error "(9.3.1) expect 2 stripes, but got $stripes"
3635 stripes=$($LFS getstripe -c -I2 $name)
3636 [ $stripes -eq 2 ] ||
3637 error "(9.3.2) expect 2 stripes, but got $stripes"
# Extent boundaries are parsed from the lcme_extent.* lines of getstripe.
3639 e_start=$($LFS getstripe -I1 $name |
3640 awk '/lcme_extent.e_start:/ { print $2 }')
3641 [ $e_start -eq 0 ] ||
3642 error "(9.4.1) expect the COMP1 start at 0, got $e_start"
3644 e_end=$($LFS getstripe -I1 $name |
3645 awk '/lcme_extent.e_end:/ { print $2 }')
3646 [ $e_end -eq 2097152 ] ||
3647 error "(9.4.2) expect the COMP1 end at 2097152, got $e_end"
3649 e_start=$($LFS getstripe -I2 $name |
3650 awk '/lcme_extent.e_start:/ { print $2 }')
3651 [ $e_start -eq 2097152 ] ||
3652 error "(9.5.1) expect the COMP2 start at 2097152, got $e_start"
3654 e_end=$($LFS getstripe -I2 $name |
3655 awk '/lcme_extent.e_end:/ { print $2 }')
3656 [ "$e_end" = "EOF" ] ||
3657 error "(9.5.2) expect the COMP2 end at (EOF), got $e_end"
3659 size=$(stat $name | awk '/Size:/ { print $2 }')
3660 # The second stripe in COMP was lost, so we do not know there
3661 # have ever been some data before. 'stat' will regard it as
3662 # no data on the lost stripe.
# NOTE(review): the comment above implies a different expected size than in
# the earlier checks; any adjustment of bcount is not visible in this
# listing — confirm against the full script.
3664 [ $size -eq $((4096 * $bcount)) ] ||
3665 error "(9.6) expect size $((4096 * $bcount)), but got $size"
# A successful plain read is the failure case.
3667 cat $name > /dev/null &&
3668 error "(9.7) normal read $name should fail"
# The message reports the same number of failures (512) that the check
# enforces.
3670 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3671 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3672 [ $failures -eq 512 ] ||
3673 error "(9.8) expect 512 IO failures, but get $failures"
3675 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3676 # The second stripe in COMP was lost, so we do not know there
3677 # have ever been some data before. Since 'dd' skip failure,
3678 # it will regard the lost stripe contains data.
3680 [ $size -eq $((4096 * $bcount)) ] ||
3681 error "(9.9) expect the size $((4096 * $bcount)), but got $size"
# Block 300 is expected to hit the hole; block 0 an existing stripe.
3683 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3684 seek=300 && error "(9.10) write to the LOV EA hole should fail"
3686 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
3687 error "(9.11) write to normal stripe should NOT fail"
3689 echo "foo" >> $name &&
3690 error "(9.12) append write $name should fail"
3692 chown $RUNAS_ID:$RUNAS_GID $name ||
3693 error "(9.13) cannot chown on $name"
3695 touch $name || error "(9.14) cannot touch $name"
# Labelled (9.15) to continue the 9.x series of this section.
3697 rm -f $name || error "(9.15) cannot unlink $name"
# Register test 20b with its description.
3699 run_test 20b "Handle the orphan with dummy LOV EA slot properly - PFL case"
# Test 21 body: start LFSCK on MDT0000 without a "-t" type argument and
# check that both the namespace and the layout component report
# "scanning-phase1", then stop LFSCK again.
# Skipped when the MDS version is below 2.5.59.
3702 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.5.59) ]] &&
3703 skip "ignore the test if MDS is older than 2.5.59" && return
3705 check_mount_and_prep
3706 createmany -o $DIR/$tdir/f 100 || error "(0) Fail to create 100 files"
3708 echo "Start all LFSCK components by default (-s 1)"
# NOTE(review): presumably "-s 1" limits the scan speed so that the status
# can still be observed in phase 1 — confirm.
3709 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -s 1 -r ||
3710 error "Fail to start LFSCK"
3712 echo "namespace LFSCK should be in 'scanning-phase1' status"
# The status is the second field of the line starting with "status".
3713 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
3714 [ "$STATUS" == "scanning-phase1" ] ||
3715 error "Expect namespace 'scanning-phase1', but got '$STATUS'"
3717 echo "layout LFSCK should be in 'scanning-phase1' status"
3718 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
3719 [ "$STATUS" == "scanning-phase1" ] ||
3720 error "Expect layout 'scanning-phase1', but got '$STATUS'"
3722 echo "Stop all LFSCK components by default"
3723 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 ||
3724 error "Fail to stop LFSCK"
# Register test 21 with its description.
3726 run_test 21 "run all LFSCK components by default"
# Test 22a body: create foo/dummy on MDT0 while fail_loc 0x161e is armed,
# remove the guard directory, run the namespace LFSCK with "-A -r" and
# expect exactly one unmatched pair to be repaired, after which listing
# foo/dummy must succeed. Skipped with fewer than two MDTs.
3729 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# The inner quotes around .. are escaped so that they are printed instead
# of closing and reopening the surrounding double-quoted string.
3732 echo "The parent_A references the child directory via some name entry,"
3733 echo "but the child directory back references another parent_B via its"
3734 echo "\"..\" name entry. The parent_B does not exist. Then the namespace"
3735 echo "LFSCK will repair the child directory's \"..\" name entry."
3738 check_mount_and_prep
3740 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
3741 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
3743 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
3744 echo "The dummy's dotdot name entry references the guard."
3745 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
3746 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
3747 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
3748 error "(3) Fail to mkdir on MDT0"
3749 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Removing guard leaves dummy's dotdot entry (per the echo text above)
# pointing at a parent that no longer exists.
3751 rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"
3753 echo "Trigger namespace LFSCK to repair unmatched pairs"
3754 $START_NAMESPACE -A -r ||
3755 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): wait_all_targets_blocked is defined outside this chunk;
# the trailing 6 is presumably the error label it reports — confirm.
3757 wait_all_targets_blocked namespace completed 6
3759 local repaired=$($SHOW_NAMESPACE |
3760 awk '/^unmatched_pairs_repaired/ { print $2 }')
3761 [ $repaired -eq 1 ] ||
3762 error "(7) Fail to repair unmatched pairs: $repaired"
3764 echo "'ls' should success after namespace LFSCK repairing"
3765 ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
3766 error "(8) ls should success."
# Register test 22a with its description.
3768 run_test 22a "LFSCK can repair unmatched pairs (1)"
# Test 22b body: create foo/dummy on MDT0 while fail_loc 0x161e is armed,
# check that fid2path does not resolve dummy's FID to its real path, run
# the namespace LFSCK with "-A -r", expect exactly one unmatched pair to be
# repaired, and check that fid2path then returns the real path.
# Skipped with fewer than two MDTs.
3771 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# The inner quotes around .. are escaped so that they are printed instead
# of closing and reopening the surrounding double-quoted string.
3774 echo "The parent_A references the child directory via the name entry_B,"
3775 echo "but the child directory back references another parent_C via its"
3776 echo "\"..\" name entry. The parent_C exists, but there is no the name"
3777 echo "entry_B under the parent_C. Then the namespace LFSCK will repair"
3778 echo "the child directory's \"..\" name entry and its linkEA."
3781 check_mount_and_prep
3783 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
3784 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
3786 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
3787 echo "and bad linkEA. The dummy's dotdot name entry references the"
3788 echo "guard. The dummy's linkEA references n non-exist name entry."
3789 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
3790 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
3791 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
3792 error "(3) Fail to mkdir on MDT0"
3793 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Before the repair, fid2path must not return dummy's real path.
3795 local dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
3796 echo "fid2path should NOT work on the dummy's FID $dummyfid"
3797 local dummyname=$($LFS fid2path $DIR $dummyfid)
3798 [ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
3799 error "(4) fid2path works unexpectedly."
3801 echo "Trigger namespace LFSCK to repair unmatched pairs"
3802 $START_NAMESPACE -A -r ||
3803 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): wait_all_targets_blocked is defined outside this chunk;
# the trailing 6 is presumably the error label it reports — confirm.
3805 wait_all_targets_blocked namespace completed 6
3807 local repaired=$($SHOW_NAMESPACE |
3808 awk '/^unmatched_pairs_repaired/ { print $2 }')
3809 [ $repaired -eq 1 ] ||
3810 error "(7) Fail to repair unmatched pairs: $repaired"
# dummyname is already local (declared above), so it is only reassigned.
3812 echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
3813 dummyname=$($LFS fid2path $DIR $dummyfid)
3814 [ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
3815 error "(8) fid2path does not work"
# Register test 22b with its description.
3817 run_test 22b "LFSCK can repair unmatched pairs (2)"
# Test 23a body: remove d0/d1 while fail_loc 0x1620 is armed on mds2, so
# that (per the echo text) a dangling name entry is left behind. A first
# namespace LFSCK run ("-A -r") must count one dangling repair while ls
# still fails; a second run with the extra "-C" option must make ls succeed.
# Skipped with fewer than two MDTs.
3820 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3823 echo "The name entry is there, but the MDT-object for such name "
3824 echo "entry does not exist. The namespace LFSCK should find out "
3825 echo "and repair the inconsistency as required."
3828 check_mount_and_prep
3830 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3831 $LFS mkdir -i 1 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT1"
3833 echo "Inject failure stub on MDT1 to simulate dangling name entry"
3834 #define OBD_FAIL_LFSCK_DANGLING2 0x1620
3835 do_facet mds2 $LCTL set_param fail_loc=0x1620
3836 rmdir $DIR/$tdir/d0/d1 || error "(3) Fail to rmdir d1"
3837 do_facet mds2 $LCTL set_param fail_loc=0
3839 echo "'ls' should fail because of dangling name entry"
# A successful ls is the failure case here.
3840 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(4) ls should fail."
3842 echo "Trigger namespace LFSCK to find out dangling name entry"
3843 $START_NAMESPACE -A -r ||
3844 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): wait_all_targets_blocked is defined outside this chunk;
# the trailing number is presumably the error label it reports — confirm.
3846 wait_all_targets_blocked namespace completed 6
3848 local repaired=$($SHOW_NAMESPACE |
3849 awk '/^dangling_repaired/ { print $2 }')
3850 [ $repaired -eq 1 ] ||
3851 error "(7) Fail to repair dangling name entry: $repaired"
3853 echo "'ls' should fail because not re-create MDT-object by default"
3854 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(8) ls should fail."
3856 echo "Trigger namespace LFSCK again to repair dangling name entry"
# Second run adds "-C"; per the echo texts this is what lets LFSCK
# re-create the missing MDT-object.
3857 $START_NAMESPACE -A -r -C ||
3858 error "(9) Fail to start LFSCK for namespace"
3860 wait_all_targets_blocked namespace completed 10
3862 repaired=$($SHOW_NAMESPACE |
3863 awk '/^dangling_repaired/ { print $2 }')
3864 [ $repaired -eq 1 ] ||
3865 error "(11) Fail to repair dangling name entry: $repaired"
3867 echo "'ls' should success after namespace LFSCK repairing"
3868 ls -ail $DIR/$tdir/d0/d1 > /dev/null || error "(12) ls should success."
# Register test 23a with its description.
3870 run_test 23a "LFSCK can repair dangling name entry (1)"
# Test 23b (registered by the run_test line that closes this block).
# Scenario: with fail_loc 0x1621 and fail_val set to f1's object id, a hard
# link 'foo' to f0 is created; f1 is then removed and 'ls foo' is expected
# to fail. After one namespace LFSCK run (-r -C) the test requires
# dangling_repaired == 1, multiple_linked_repaired == 1, and that reading
# foo returns "dummy" (the content written to f0).
# NOTE(review): the function header, closing braces and 'fi' lines are not
# present in this listing.
3874 echo "The objectA has multiple hard links, one of them corresponding"
3875 echo "to the name entry_B. But there is something wrong for the name"
3876 echo "entry_B and cause entry_B to references non-exist object_C."
3877 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3878 echo "as dangling, and re-create the lost object_C. When the LFSCK"
3879 echo "comes to the second-stage scanning, it will find that the"
3880 echo "former re-creating object_C is not proper, and will try to"
3881 echo "replace the object_C with the real object_A."
3884 check_mount_and_prep
3886 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
# The bare path2fid calls only print the FID into the test log.
3887 $LFS path2fid $DIR/$tdir/d0
# Ten filler files t0..t9; they are unlinked again before f1 is removed.
3889 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
3891 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3892 $LFS path2fid $DIR/$tdir/d0/f0
3894 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
3895 $LFS path2fid $DIR/$tdir/d0/f1
# First ':'-separated field of each FID string, compared below.
3897 local SEQ0=$($LFS path2fid $DIR/$tdir/d0/f0 | awk -F':' '{print $1}')
3898 local SEQ1=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $1}')
3900 if [ "$SEQ0" != "$SEQ1" ]; then
3901 # To guarantee that the f0 and f1 are in the same FID seq
3902 rm -f $DIR/$tdir/d0/f0 ||
3903 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
3904 echo "dummy" > $DIR/$tdir/d0/f0 ||
3905 error "(3.2) Fail to touch on MDT0"
3906 $LFS path2fid $DIR/$tdir/d0/f0
# Second ':'-separated field of f1's FID, converted to decimal by printf %d
# and passed as fail_val below.
3909 local OID=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $2}')
3910 OID=$(printf %d $OID)
3912 echo "Inject failure stub on MDT0 to simulate dangling name entry"
3913 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
3914 do_facet $SINGLEMDS $LCTL set_param fail_val=$OID fail_loc=0x1621
3915 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
3916 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3918 # If there is creation after the dangling injection, it may re-use
3919 # the just released local object (inode) that is referenced by the
3920 # dangling name entry. It will fail the dangling injection.
3921 # So before deleting the target object for the dangling name entry,
3922 # remove some other objects to avoid the target object being reused
3923 # by some potential creations. LU-7429
3924 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
3926 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
3928 echo "'ls' should fail because of dangling name entry"
3929 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3930 error "(6) ls should fail."
3932 echo "Trigger namespace LFSCK to find out dangling name entry"
3933 $START_NAMESPACE -r -C ||
3934 error "(7) Fail to start LFSCK for namespace"
# Wait for the 'status' field of lfsck_namespace on the MDT to read
# "completed"; 32 is the last argument given to wait_update_facet
# (presumably a timeout in seconds -- confirm in test-framework.sh).
3936 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3937 mdd.${MDT_DEV}.lfsck_namespace |
3938 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3940 error "(8) unexpected status"
3943 local repaired=$($SHOW_NAMESPACE |
3944 awk '/^dangling_repaired/ { print $2 }')
3945 [ $repaired -eq 1 ] ||
3946 error "(9) Fail to repair dangling name entry: $repaired"
3948 repaired=$($SHOW_NAMESPACE |
3949 awk '/^multiple_linked_repaired/ { print $2 }')
3950 [ $repaired -eq 1 ] ||
3951 error "(10) Fail to drop the former created object: $repaired"
# foo must again resolve to f0's data.
3953 local data=$(cat $DIR/$tdir/d0/foo)
3954 [ "$data" == "dummy" ] ||
3955 error "(11) The $DIR/$tdir/d0/foo is not recovered: $data"
3957 run_test 23b "LFSCK can repair dangling name entry (2)"
# The six statements below precede test 23c's description. They clear
# fail_val/fail_loc on the MDS, wait for the namespace LFSCK status to read
# "completed", and stop full debug logging.
# NOTE(review): presumably the body of a cleanup helper for test 23c (the
# error number "(10)" fits between 23c's "(9)" and "(11)", and
# stop_full_debug_logging pairs with start_full_debug_logging below); the
# function header and its call site are not present in this listing.
3960 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3961 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3962 mdd.${MDT_DEV}.lfsck_namespace |
3963 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3965 error "(10) unexpected status"
3968 stop_full_debug_logging
# Test 23c (registered by the run_test line that closes this block).
# Scenario: same dangling hard-link injection as test 23b (fail_loc 0x1621),
# but LFSCK is delayed with fail_loc 0x1602 / fail_val 10 so the test can
# append data to 'foo' once its size reads 0. The test then requires
# dangling_repaired == 1 and that foo's content is NOT "dummy".
# NOTE(review): the function header, closing braces, 'fi' lines and other
# short lines are not present in this listing.
3973 echo "The objectA has multiple hard links, one of them corresponding"
3974 echo "to the name entry_B. But there is something wrong for the name"
3975 echo "entry_B and cause entry_B to references non-exist object_C."
3976 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3977 echo "as dangling, and re-create the lost object_C. And then others"
3978 echo "modified the re-created object_C. When the LFSCK comes to the"
3979 echo "second-stage scanning, it will find that the former re-creating"
3980 echo "object_C maybe wrong and try to replace the object_C with the"
3981 echo "real object_A. But because object_C has been modified, so the"
3982 echo "LFSCK cannot replace it."
3985 start_full_debug_logging
3987 check_mount_and_prep
3989 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
# The FID variables are declared local so they do not leak into later tests.
3990 local parent_fid="$($LFS path2fid $DIR/$tdir/d0)"
3991 echo "parent_fid=$parent_fid"
3993 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
3995 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3996 local f0_fid="$($LFS path2fid $DIR/$tdir/d0/f0)"
3997 echo "f0_fid=$f0_fid"
3999 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
4000 local f1_fid="$($LFS path2fid $DIR/$tdir/d0/f1)"
4001 echo "f1_fid=$f1_fid"
# Compare everything before the first ':' of each FID string (the same field
# test 23b extracts with awk -F':' '{print $1}'). The previous expression
# referenced the never-assigned names fid_f0/fid_f1 and used a regex-style
# pattern in a glob context, so both sides were always empty and this branch
# could never run.
4003 if [ "${f0_fid%%:*}" != "${f1_fid%%:*}" ]; then
4004 # To guarantee that the f0 and f1 are in the same FID seq
4005 rm -f $DIR/$tdir/d0/f0 ||
4006 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
4007 echo "dummy" > $DIR/$tdir/d0/f0 ||
4008 error "(3.2) Fail to touch on MDT0"
4009 f0_fid="$($LFS path2fid $DIR/$tdir/d0/f0)"
4010 echo "f0_fid=$f0_fid (replaced)"
# Second ':'-separated field of f1's FID, passed as fail_val below.
4013 local oid=$(awk -F':' '{ printf $2 }' <<< $f1_fid)
4015 echo "Inject failure stub on MDT0 to simulate dangling name entry"
4016 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
4017 do_facet $SINGLEMDS $LCTL set_param fail_val=$oid fail_loc=0x1621
4018 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
4019 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
4021 # If there is creation after the dangling injection, it may re-use
4022 # the just released local object (inode) that is referenced by the
4023 # dangling name entry. It will fail the dangling injection.
4024 # So before deleting the target object for the dangling name entry,
4025 # remove some other objects to avoid the target object being reused
4026 # by some potential creations. LU-7429
4027 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
4029 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
4031 echo "'ls' should fail because of dangling name entry"
4032 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4033 error "(6) ls should fail."
4035 #define OBD_FAIL_LFSCK_DELAY3 0x1602
4036 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
4038 echo "Trigger namespace LFSCK to find out dangling name entry"
4039 $START_NAMESPACE -r -C ||
4040 error "(7) Fail to start LFSCK for namespace"
# Wait until foo's size reads 0; the fallback group handles the case where
# foo already holds the original data.
4042 wait_update_facet client "stat -c%s $DIR/$tdir/d0/foo" "0" $LTIME || {
4043 # While unexpected by the test, it is valid for LFSCK to repair
4044 # the link to the original object before any data is written.
4045 local size=$(stat -c %s $DIR/$tdir/d0/foo)
4047 if [ "$size" = "6" -a "$(<$DIR/$tdir/d0/foo)" = "dummy" ]; then
4048 log "LFSCK repaired file prematurely"
4053 stat $DIR/$tdir/d0/foo
4055 error "(8) unexpected size"
# Modify the object behind foo, then drop cached OSC locks.
4058 echo "data" >> $DIR/$tdir/d0/foo || error "(9) Fail to write"
4059 cancel_lru_locks osc
4063 local repaired=$($SHOW_NAMESPACE |
4064 awk '/^dangling_repaired/ { print $2 }')
4065 [ $repaired -eq 1 ] ||
4066 error "(11) Fail to repair dangling name entry: $repaired"
4068 local data=$(cat $DIR/$tdir/d0/foo)
4069 [ "$data" != "dummy" ] ||
4070 error "(12) The $DIR/$tdir/d0/foo should not be recovered"
4072 run_test 23c "LFSCK can repair dangling name entry (3)"
# Test 24 (registered by the run_test line that closes this block).
# Scenario: with fail_loc 0x1622 set on the MDS, directory d0/dummy/foo is
# created on MDT0 and removed again; namespace LFSCK (-A -r) must then report
# multiple_referenced_repaired == 1 and an entry named "<cfid>-<pfid>-D-0"
# must be listable under .lustre/lost+found/MDT0000/.
# NOTE(review): the function header, closing brace and 'else'/'fi' lines are
# not present in this listing.
4075 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4078 echo "Two MDT-objects back reference the same name entry via their"
4079 echo "each own linkEA entry, but the name entry only references one"
4080 echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
4081 echo "for the MDT-object that is not recognized. If such MDT-object"
4082 echo "has no other linkEA entry after the removing, then the LFSCK"
4083 echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."
4086 check_mount_and_prep
4088 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4090 mkdir $DIR/$tdir/d0/guard || error "(1) Fail to mkdir guard"
4091 $LFS path2fid $DIR/$tdir/d0/guard
4093 mkdir $DIR/$tdir/d0/dummy || error "(2) Fail to mkdir dummy"
4094 $LFS path2fid $DIR/$tdir/d0/dummy
# pfid (the parent FID used in the expected orphan name) is taken from
# 'guard' or from 'dummy' depending on the MDS backend type.
# NOTE(review): the 'else' and 'fi' lines are not shown here -- confirm the
# branch structure against the full file. pfid is not declared local.
4097 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
4098 pfid=$($LFS path2fid $DIR/$tdir/d0/guard)
4100 pfid=$($LFS path2fid $DIR/$tdir/d0/dummy)
4103 touch $DIR/$tdir/d0/guard/foo ||
4104 error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"
4106 echo "Inject failure stub on MDT0 to simulate the case that"
4107 echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
4108 echo "that references $DIR/$tdir/d0/guard/foo."
4109 echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
4110 echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
4111 echo "there with the same linkEA entry as another MDT-object"
4112 echo "$DIR/$tdir/d0/guard/foo has"
4114 #define OBD_FAIL_LFSCK_MUL_REF 0x1622
# The fail_loc stays armed across both the mkdir and the rmdir of dummy/foo.
4115 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
4116 $LFS mkdir -i 0 $DIR/$tdir/d0/dummy/foo ||
4117 error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
4118 $LFS path2fid $DIR/$tdir/d0/dummy/foo
# cfid must be captured before the name entry is removed.
4119 local cfid=$($LFS path2fid $DIR/$tdir/d0/dummy/foo)
4120 rmdir $DIR/$tdir/d0/dummy/foo ||
4121 error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
4122 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4124 echo "stat $DIR/$tdir/d0/dummy/foo should fail"
4125 stat $DIR/$tdir/d0/dummy/foo > /dev/null 2>&1 &&
4126 error "(6) stat successfully unexpectedly"
4128 echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
4129 $START_NAMESPACE -A -r ||
4130 error "(7) Fail to start LFSCK for namespace"
4132 wait_all_targets_blocked namespace completed 8
4134 local repaired=$($SHOW_NAMESPACE |
4135 awk '/^multiple_referenced_repaired/ { print $2 }')
4136 [ $repaired -eq 1 ] ||
4137 error "(9) Fail to repair multiple referenced name entry: $repaired"
4139 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
4140 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4141 error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# Expected orphan name is built from the child FID and the parent FID.
4143 local cname="$cfid-$pfid-D-0"
4144 ls -ail $MOUNT/.lustre/lost+found/MDT0000/$cname ||
4145 error "(11) .lustre/lost+found/MDT0000/ should not be empty"
4147 run_test 24 "LFSCK can repair multiple-referenced name entry"
# Test 25 (registered by the run_test line that closes this block).
# Scenario: with fail_loc 0x1623 set on the MDS, file d0/foo is created;
# namespace LFSCK (-r) must then complete, report
# bad_file_type_repaired == 1, and 'ls' of d0 must succeed.
# NOTE(review): the function header and closing braces are not present in
# this listing.
# Skipped when the MDS backend is not ldiskfs.
4150 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
4151 skip "ldiskfs only test" && return
4154 echo "The file type in the name entry does not match the file type"
4155 echo "claimed by the referenced object. Then the LFSCK will update"
4156 echo "the file type in the name entry."
4159 check_mount_and_prep
4161 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4163 echo "Inject failure stub on MDT0 to simulate the case that"
4164 echo "the file type stored in the name entry is wrong."
4166 #define OBD_FAIL_LFSCK_BAD_TYPE 0x1623
# The fail_loc is armed only around the creation of foo.
4167 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
4168 touch $DIR/$tdir/d0/foo || error "(2) Fail to touch $DIR/$tdir/d0/foo"
4169 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4171 echo "Trigger namespace LFSCK to repair bad file type in the name entry"
4172 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace"
# Wait for the 'status' field of lfsck_namespace to read "completed".
4174 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4175 mdd.${MDT_DEV}.lfsck_namespace |
4176 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4178 error "(4) unexpected status"
4181 local repaired=$($SHOW_NAMESPACE |
4182 awk '/^bad_file_type_repaired/ { print $2 }')
4183 [ $repaired -eq 1 ] ||
4184 error "(5) Fail to repair bad file type in name entry: $repaired"
4186 ls -ail $DIR/$tdir/d0 || error "(6) Fail to 'ls' the $DIR/$tdir/d0"
4188 run_test 25 "LFSCK can repair bad file type in the name entry"
# Test 26a (registered by the run_test line that closes this block).
# Scenario: d0/foo gets a second hard link 'dummy'; with fail_loc 0x1624 set
# on the MDS, 'foo' is unlinked and 'ls foo' is expected to fail. Namespace
# LFSCK (-r -A) must then report lost_dirent_repaired == 1, 'ls foo' must
# succeed, and foo's FID must equal the FID recorded before the injection.
# NOTE(review): the function header and closing brace are not present in
# this listing.
4192 echo "The local name entry back referenced by the MDT-object is lost."
4193 echo "The namespace LFSCK will add the missing local name entry back"
4194 echo "to the normal namespace."
4197 check_mount_and_prep
4199 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4200 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
# FID recorded before the injection, compared with the FID after repair.
4201 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
4203 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
4204 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4206 echo "Inject failure stub on MDT0 to simulate the case that"
4207 echo "foo's name entry will be removed, but the foo's object"
4208 echo "and its linkEA are kept in the system."
4210 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fail_loc is armed only around the unlink of foo.
4211 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4212 rm -f $DIR/$tdir/d0/foo || error "(4) Fail to unlink $DIR/$tdir/d0/foo"
4213 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4215 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4216 error "(5) 'ls' should fail"
# This test covers the local name entry case (see the description echoed
# above and the run_test title), so the log message says "local".
4218 echo "Trigger namespace LFSCK to repair the missing local name entry"
4219 $START_NAMESPACE -r -A ||
4220 error "(6) Fail to start LFSCK for namespace"
4222 wait_all_targets_blocked namespace completed 7
4224 local repaired=$($SHOW_NAMESPACE |
4225 awk '/^lost_dirent_repaired/ { print $2 }')
4226 [ $repaired -eq 1 ] ||
4227 error "(8) Fail to repair lost dirent: $repaired"
4229 ls -ail $DIR/$tdir/d0/foo ||
4230 error "(9) Fail to 'ls' $DIR/$tdir/d0/foo"
# The repaired entry must reference the original object.
4232 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
4233 [ "$foofid" == "$foofid2" ] ||
4234 error "(10) foo's FID changed: $foofid, $foofid2"
4236 run_test 26a "LFSCK can add the missing local name entry back to the namespace"
# Test 26b (registered by the run_test line that closes this block).
# Scenario: d0 is created on MDT1 and d0/foo on MDT0; with fail_loc 0x1624
# set on the MDS, foo is removed with rmdir and 'ls foo' is expected to
# fail. Namespace LFSCK (-r -A) must then report lost_dirent_repaired == 1,
# 'ls foo' must succeed, and foo's FID must be unchanged.
# NOTE(review): the function header and closing brace are not present in
# this listing.
4239 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4242 echo "The remote name entry back referenced by the MDT-object is lost."
4243 echo "The namespace LFSCK will add the missing remote name entry back"
4244 echo "to the normal namespace."
4247 check_mount_and_prep
4249 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4250 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
# FID recorded before the injection, compared with the FID after repair.
4251 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
4253 echo "Inject failure stub on MDT0 to simulate the case that"
4254 echo "foo's name entry will be removed, but the foo's object"
4255 echo "and its linkEA are kept in the system."
4257 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fail_loc is armed only around the rmdir of foo.
4258 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4259 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
4260 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4262 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4263 error "(4) 'ls' should fail"
4265 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4266 $START_NAMESPACE -r -A ||
4267 error "(5) Fail to start LFSCK for namespace"
4269 wait_all_targets_blocked namespace completed 6
4271 local repaired=$($SHOW_NAMESPACE |
4272 awk '/^lost_dirent_repaired/ { print $2 }')
4273 [ $repaired -eq 1 ] ||
4274 error "(7) Fail to repair lost dirent: $repaired"
4276 ls -ail $DIR/$tdir/d0/foo ||
4277 error "(8) Fail to 'ls' $DIR/$tdir/d0/foo"
# The repaired entry must reference the original object.
4279 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
4280 [ "$foofid" == "$foofid2" ] ||
4281 error "(9) foo's FID changed: $foofid, $foofid2"
4283 run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
# Test 27a (registered by the run_test line that closes this block).
# Scenario: d0/foo and its hard link d0/dummy are unlinked while fail_loc
# 0x1624 is set on the MDS, then d0 itself is removed. Namespace LFSCK
# (-r -A) must report lost_dirent_repaired == 1 and at least one entry whose
# name matches *-P-* must exist under .lustre/lost+found/MDT0000/.
# NOTE(review): the function header and closing brace are not present in
# this listing.
4287 echo "The local parent referenced by the MDT-object linkEA is lost."
4288 echo "The namespace LFSCK will re-create the lost parent as orphan."
4291 check_mount_and_prep
4293 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4294 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4295 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
4296 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4298 echo "Inject failure stub on MDT0 to simulate the case that"
4299 echo "foo's name entry will be removed, but the foo's object"
4300 echo "and its linkEA are kept in the system. And then remove"
4301 echo "another hard link and the parent directory."
4303 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fail_loc stays armed while both names are unlinked.
4304 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4305 rm -f $DIR/$tdir/d0/foo ||
4306 error "(4) Fail to unlink $DIR/$tdir/d0/foo"
4307 rm -f $DIR/$tdir/d0/dummy ||
4308 error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
4309 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4311 rm -rf $DIR/$tdir/d0 || error "(5) Fail to unlink the dir d0"
4312 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(6) 'ls' should fail"
4314 echo "Trigger namespace LFSCK to repair the lost parent"
4315 $START_NAMESPACE -r -A ||
4316 error "(6) Fail to start LFSCK for namespace"
4318 wait_all_targets_blocked namespace completed 7
4320 local repaired=$($SHOW_NAMESPACE |
4321 awk '/^lost_dirent_repaired/ { print $2 }')
4322 [ $repaired -eq 1 ] ||
4323 error "(8) Fail to repair lost dirent: $repaired"
4325 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
4326 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4327 error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4329 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# The -name pattern is quoted so that find receives it verbatim; unquoted,
# the shell would expand it first if the current directory happened to
# contain a matching file. cname is local so it does not leak to other tests.
4331 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-P-*")
4332 [ ! -z "$cname" ] ||
4333 error "(10) .lustre/lost+found/MDT0000/ should not be empty"
4335 run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
# Test 27b (registered by the run_test line that closes this block).
# Scenario: d0 is created on MDT1 and d0/foo on MDT0; foo is removed with
# rmdir while fail_loc 0x1624 is set on the MDS, then d0 is removed.
# Namespace LFSCK (-r -A) must report lost_dirent_repaired == 1 and at least
# one entry whose name matches *-P-* must exist under
# .lustre/lost+found/MDT0001/.
# NOTE(review): the function header and closing brace are not present in
# this listing.
4338 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4341 echo "The remote parent referenced by the MDT-object linkEA is lost."
4342 echo "The namespace LFSCK will re-create the lost parent as orphan."
4345 check_mount_and_prep
4347 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4348 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
# Printed into the test log only; the value is not captured.
4350 $LFS path2fid $DIR/$tdir/d0
4352 echo "Inject failure stub on MDT0 to simulate the case that"
4353 echo "foo's name entry will be removed, but the foo's object"
4354 echo "and its linkEA are kept in the system. And then remove"
4355 echo "the parent directory."
4357 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fail_loc is armed only around the rmdir of foo.
4358 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4359 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
4360 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4362 rmdir $DIR/$tdir/d0 || error "(4) Fail to unlink the dir d0"
4363 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(5) 'ls' should fail"
4365 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4366 $START_NAMESPACE -r -A ||
4367 error "(6) Fail to start LFSCK for namespace"
4369 wait_all_targets_blocked namespace completed 7
4371 local repaired=$($SHOW_NAMESPACE |
4372 awk '/^lost_dirent_repaired/ { print $2 }')
4373 [ $repaired -eq 1 ] ||
4374 error "(8) Fail to repair lost dirent: $repaired"
4376 ls -ail $MOUNT/.lustre/lost+found/
4378 echo "There should be an orphan under .lustre/lost+found/MDT0001/"
4379 [ -d $MOUNT/.lustre/lost+found/MDT0001 ] ||
4380 error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"
4382 ls -ail $MOUNT/.lustre/lost+found/MDT0001/
# The -name pattern is quoted so that find receives it verbatim; unquoted,
# the shell would expand it first if the current directory happened to
# contain a matching file. cname is local so it does not leak to other tests.
4384 local cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name "*-P-*")
4385 [ ! -z "$cname" ] ||
4386 error "(10) .lustre/lost+found/MDT0001/ should not be empty"
4388 run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
# Test 28 (registered by the run_test line that closes this block).
# Scenario: d1/a1 and d2/a2 are removed while fail_loc 0x1624 is set on both
# mds1 and mds2. fail_loc 0x161c is then set on mds2 and namespace LFSCK is
# started on all targets. The first run must end with status 'partial' on
# mds1 and 'completed' on mds2, with lost_dirent_repaired 0 on mds1 and 1 on
# mds2. After clearing the fail_loc a second run must complete everywhere,
# with lost_dirent_repaired 1 on mds1 and 0 on mds2.
# NOTE(review): the function header, closing braces and some short lines are
# not present in this listing; the last description echo below appears to be
# cut off mid-sentence for the same reason.
4391 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4394 echo "The target name entry is lost. The LFSCK should insert the"
4395 echo "orphan MDT-object under .lustre/lost+found/MDTxxxx. But if"
4396 echo "the MDT (on which the orphan MDT-object resides) has ever"
4397 echo "failed to respond some name entry verification during the"
4398 echo "first stage-scanning, then the LFSCK should skip to handle"
4399 echo "orphan MDT-object on this MDT. But other MDTs should not"
4403 check_mount_and_prep
# d1 on MDT0 with children on MDT1, and d2 on MDT1 with children on MDT0.
# The results of these mkdir calls are not checked.
4404 $LFS mkdir -i 0 $DIR/$tdir/d1
4405 $LFS mkdir -i 1 $DIR/$tdir/d1/a1
4406 $LFS mkdir -i 1 $DIR/$tdir/d1/a2
4408 $LFS mkdir -i 1 $DIR/$tdir/d2
4409 $LFS mkdir -i 0 $DIR/$tdir/d2/a1
4410 $LFS mkdir -i 0 $DIR/$tdir/d2/a2
4412 echo "Inject failure stub on MDT0 to simulate the case that"
4413 echo "d1/a1's name entry will be removed, but the d1/a1's object"
4414 echo "and its linkEA are kept in the system. And the case that"
4415 echo "d2/a2's name entry will be removed, but the d2/a2's object"
4416 echo "and its linkEA are kept in the system."
4418 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fail_loc is armed on both MDS facets around the two rmdir calls.
4419 do_facet mds1 $LCTL set_param fail_loc=0x1624
4420 do_facet mds2 $LCTL set_param fail_loc=0x1624
4421 rmdir $DIR/$tdir/d1/a1 || error "(1) Fail to rmdir $DIR/$tdir/d1/a1"
4422 rmdir $DIR/$tdir/d2/a2 || error "(2) Fail to rmdir $DIR/$tdir/d2/a2"
4423 do_facet mds1 $LCTL set_param fail_loc=0
4424 do_facet mds2 $LCTL set_param fail_loc=0
4426 cancel_lru_locks mdc
4427 cancel_lru_locks osc
4429 echo "Inject failure, to simulate the MDT0 fail to handle"
4430 echo "MDT1 LFSCK request during the first-stage scanning."
4431 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
# Note that this fail_loc is set on the mds2 facet and stays armed until
# both status waits below have finished.
4432 do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0
4434 echo "Trigger namespace LFSCK on all devices to find out orphan object"
4435 $START_NAMESPACE -r -A ||
4436 error "(3) Fail to start LFSCK for namespace"
4438 wait_update_facet mds1 "$LCTL get_param -n \
4439 mdd.$(facet_svc mds1).lfsck_namespace |
4440 awk '/^status/ { print \\\$2 }'" "partial" 32 || {
4441 error "(4) mds1 is not the expected 'partial'"
4444 wait_update_facet mds2 "$LCTL get_param -n \
4445 mdd.$(facet_svc mds2).lfsck_namespace |
4446 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4447 error "(5) mds2 is not the expected 'completed'"
4450 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
# Per-MDS repair counters after the first (partially failed) run.
4452 local repaired=$(do_facet mds1 $LCTL get_param -n \
4453 mdd.$(facet_svc mds1).lfsck_namespace |
4454 awk '/^lost_dirent_repaired/ { print $2 }')
4455 [ $repaired -eq 0 ] ||
4456 error "(6) Expect 0 fixed on mds1, but got: $repaired"
4458 repaired=$(do_facet mds2 $LCTL get_param -n \
4459 mdd.$(facet_svc mds2).lfsck_namespace |
4460 awk '/^lost_dirent_repaired/ { print $2 }')
4461 [ $repaired -eq 1 ] ||
4462 error "(7) Expect 1 fixed on mds2, but got: $repaired"
4464 echo "Trigger namespace LFSCK on all devices again to cleanup"
4465 $START_NAMESPACE -r -A ||
4466 error "(8) Fail to start LFSCK for namespace"
4468 wait_all_targets_blocked namespace completed 9
# Per-MDS repair counters after the second run; 'repaired' is re-declared
# local here although it was already declared above.
4470 local repaired=$(do_facet mds1 $LCTL get_param -n \
4471 mdd.$(facet_svc mds1).lfsck_namespace |
4472 awk '/^lost_dirent_repaired/ { print $2 }')
4473 [ $repaired -eq 1 ] ||
4474 error "(10) Expect 1 fixed on mds1, but got: $repaired"
4476 repaired=$(do_facet mds2 $LCTL get_param -n \
4477 mdd.$(facet_svc mds2).lfsck_namespace |
4478 awk '/^lost_dirent_repaired/ { print $2 }')
4479 [ $repaired -eq 0 ] ||
4480 error "(11) Expect 0 fixed on mds2, but got: $repaired"
4482 run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
# Test 29a body. Its run_test registration at the end of this block is
# commented out, so these statements are not registered as a test here.
# Scenario: with fail_loc 0x1625 set on the MDS a hard link h1 to d0/foo is
# created and the link count of foo is expected to read 3; after namespace
# LFSCK (-r -A) nlinks_repaired must be 1 and the link count must read 2.
# NOTE(review): the function header and closing brace are not present in
# this listing.
4486 echo "The object's nlink attribute is larger than the object's known"
4487 echo "name entries count. The LFSCK will repair the object's nlink"
4488 echo "attribute to match the known name entries count"
4491 check_mount_and_prep
4493 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4494 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4496 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4497 echo "nlink attribute is larger than its name entries count."
4499 #define OBD_FAIL_LFSCK_MORE_NLINK 0x1625
# The fail_loc is armed only around the creation of the hard link.
4500 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
4501 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4502 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4503 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Drop cached MDC locks before reading the link count with stat.
4505 cancel_lru_locks mdc
4506 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
4507 [ $count -eq 3 ] || error "(4) Cannot inject error: $count"
4509 echo "Trigger namespace LFSCK to repair the nlink count"
4510 $START_NAMESPACE -r -A ||
4511 error "(5) Fail to start LFSCK for namespace"
4513 wait_all_targets_blocked namespace completed 6
4515 local repaired=$($SHOW_NAMESPACE |
4516 awk '/^nlinks_repaired/ { print $2 }')
4517 [ $repaired -eq 1 ] ||
4518 error "(7) Fail to repair nlink count: $repaired"
4520 cancel_lru_locks mdc
4521 count=$(stat --format=%h $DIR/$tdir/d0/foo)
4522 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
4524 # Disable 29a, we only allow nlink to be updated if the known linkEA
4525 # entries is larger than nlink count.
4527 #run_test 29a "LFSCK can repair bad nlink count (1)"
# Test 29b (registered by the run_test line that closes this block).
# Scenario: with fail_loc 0x1626 set on the MDS a hard link h1 to d0/foo is
# created and the link count of foo is expected to read 1; after namespace
# LFSCK (-r -A) nlinks_repaired must be 1 and the link count must read 2.
# NOTE(review): the function header and closing brace are not present in
# this listing.
4531 echo "The object's nlink attribute is smaller than the object's known"
4532 echo "name entries count. The LFSCK will repair the object's nlink"
4533 echo "attribute to match the known name entries count"
4536 check_mount_and_prep
4538 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4539 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4541 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4542 echo "nlink attribute is smaller than its name entries count."
4544 #define OBD_FAIL_LFSCK_LESS_NLINK 0x1626
# The fail_loc is armed only around the creation of the hard link.
4545 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
4546 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4547 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4548 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Drop cached MDC locks before reading the link count with stat.
4550 cancel_lru_locks mdc
4551 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
4552 [ $count -eq 1 ] || error "(4) Cannot inject error: $count"
4554 echo "Trigger namespace LFSCK to repair the nlink count"
4555 $START_NAMESPACE -r -A ||
4556 error "(5) Fail to start LFSCK for namespace"
4558 wait_all_targets_blocked namespace completed 6
4560 local repaired=$($SHOW_NAMESPACE |
4561 awk '/^nlinks_repaired/ { print $2 }')
4562 [ $repaired -eq 1 ] ||
4563 error "(7) Fail to repair nlink count: $repaired"
4565 cancel_lru_locks mdc
4566 count=$(stat --format=%h $DIR/$tdir/d0/foo)
4567 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
4569 run_test 29b "LFSCK can repair bad nlink count (2)"
# Test 29c (registered by the run_test line that closes this block).
# Scenario: 150 hard links to guard/f0 are created under foo/, then 100 of
# them are removed. With two or more MDTs, migrating 'guard' to MDT1 is
# expected to fail both times and f0's FID must stay unchanged. After
# namespace LFSCK (-r -A) linkea_overflow_cleared must be 1 and
# nlinks_repaired 0; with two or more MDTs the migration must then succeed
# and f0's FID must change.
# NOTE(review): the function header, closing brace, 'fi' lines and other
# short lines (e.g. the sleep the comment below refers to) are not present
# in this listing.
4574 echo "The namespace LFSCK will create many hard links to the target"
4575 echo "file as to exceed the linkEA size limitation. Under such case"
4576 echo "the linkEA will be marked as overflow that will prevent the"
4577 echo "target file to be migrated. Then remove some hard links to"
4578 echo "make the left hard links to be held within the linkEA size"
4579 echo "limitation. But before the namespace LFSCK adding all the"
4580 echo "missed linkEA entries back, the overflow mark (timestamp)"
4581 echo "will not be cleared."
4584 check_mount_and_prep
4586 mkdir -p $DIR/$tdir/guard || error "(0.1) Fail to mkdir"
# foo is placed on the highest-numbered MDT.
4587 $LFS mkdir -i $((MDSCOUNT - 1)) $DIR/$tdir/foo ||
4588 error "(0.2) Fail to mkdir"
4589 touch $DIR/$tdir/guard/f0 || error "(1) Fail to create"
# FID before any migration attempt; later checks compare against it.
4590 local oldfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4592 # define MAX_LINKEA_SIZE 4096
4593 # sizeof(link_ea_header) = 24
4594 # sizeof(link_ea_entry) = 18
4595 # nlink_min=$(((MAX_LINKEA_SIZE - sizeof(link_ea_header)) /
4596 # (sizeof(link_ea_entry) + name_length))
4597 # If the average name length is 12 bytes, then 150 hard links
4598 # is totally enough to overflow the linkEA
4599 echo "Create 150 hard links should succeed although the linkEA overflow"
4600 createmany -l $DIR/$tdir/guard/f0 $DIR/$tdir/foo/ttttttttttt 150 ||
4601 error "(2) Fail to hard link"
4603 cancel_lru_locks mdc
4604 if [ $MDSCOUNT -ge 2 ]; then
4605 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null &&
4606 error "(3.1) Migrate should fail"
4608 echo "The object with linkEA overflow should NOT be migrated"
4609 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4610 [ "$newfid" == "$oldfid" ] ||
4611 error "(3.2) Migrate should fail: $newfid != $oldfid"
4614 # Remove 100 hard links, then the linkEA should have space
4615 # to hold the missed linkEA entries.
4616 echo "Remove 100 hard links to save space for the missed linkEA entries"
4617 unlinkmany $DIR/$tdir/foo/ttttttttttt 100 || error "(4) Fail to unlink"
4619 if [ $MDSCOUNT -ge 2 ]; then
4620 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null &&
4621 error "(5.1) Migrate should fail"
4623 # The overflow timestamp is still there, so migration will fail.
4624 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4625 [ "$newfid" == "$oldfid" ] ||
4626 error "(5.2) Migrate should fail: $newfid != $oldfid"
4629 # sleep 3 seconds to guarantee that the overflow is recognized
4632 echo "Trigger namespace LFSCK to clear the overflow timestamp"
4633 $START_NAMESPACE -r -A ||
4634 error "(6) Fail to start LFSCK for namespace"
4636 wait_all_targets_blocked namespace completed 7
4638 local repaired=$($SHOW_NAMESPACE |
4639 awk '/^linkea_overflow_cleared/ { print $2 }')
4640 [ $repaired -eq 1 ] ||
4641 error "(8) Fail to clear linkea overflow: $repaired"
# No nlink repair is expected from this run.
4643 repaired=$($SHOW_NAMESPACE |
4644 awk '/^nlinks_repaired/ { print $2 }')
4645 [ $repaired -eq 0 ] ||
4646 error "(9) Unexpected nlink repaired: $repaired"
4648 if [ $MDSCOUNT -ge 2 ]; then
4649 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
4650 error "(10.1) Migrate failure"
4652 # Migration should succeed after clear the overflow timestamp.
4653 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4654 [ "$newfid" != "$oldfid" ] ||
4655 error "(10.2) Migrate should succeed"
4657 ls -l $DIR/$tdir/foo > /dev/null ||
4658 error "(11) 'ls' failed after migration"
4661 rm -f $DIR/$tdir/guard/f0 || error "(12) Fail to unlink f0"
4662 rm -rf $DIR/$tdir/foo || error "(13) Fail to rmdir foo"
4664 run_test 29c "verify linkEA size limitation"
4667 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
4668 skip "ldiskfs only test" && return
4671 echo "The namespace LFSCK will move the orphans from backend"
4672 echo "/lost+found directory to normal client visible namespace"
4673 echo "or to global visible ./lustre/lost+found/MDTxxxx/ directory"
4676 check_mount_and_prep
4678 $LFS mkdir -i 0 $DIR/$tdir/foo || error "(1) Fail to mkdir foo"
4679 touch $DIR/$tdir/foo/f0 || error "(2) Fail to touch f1"
4681 echo "Inject failure stub on MDT0 to simulate the case that"
4682 echo "directory d0 has no linkEA entry, then the LFSCK will"
4683 echo "move it into .lustre/lost+found/MDTxxxx/ later."
4685 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
4686 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
4687 mkdir $DIR/$tdir/foo/d0 || error "(3) Fail to mkdir d0"
4688 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4690 local pfid=$($LFS path2fid $DIR/$tdir/foo)
4691 local cfid=$($LFS path2fid $DIR/$tdir/foo/d0)
4693 touch $DIR/$tdir/foo/d0/f1 || error "(4) Fail to touch f1"
4694 mkdir $DIR/$tdir/foo/d0/d1 || error "(5) Fail to mkdir d1"
4696 echo "Inject failure stub on MDT0 to simulate the case that the"
4697 echo "object's name entry will be removed, but not destroy the"
4698 echo "object. Then backend e2fsck will handle it as orphan and"
4699 echo "add them into the backend /lost+found directory."
4701 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4702 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4703 rmdir $DIR/$tdir/foo/d0/d1 || error "(6) Fail to rmdir d1"
4704 rm -f $DIR/$tdir/foo/d0/f1 || error "(7) Fail to unlink f1"
4705 rmdir $DIR/$tdir/foo/d0 || error "(8) Fail to rmdir d0"
4706 rm -f $DIR/$tdir/foo/f0 || error "(9) Fail to unlink f0"
4707 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4709 umount_client $MOUNT || error "(10) Fail to stop client!"
4711 stop $SINGLEMDS || error "(11) Fail to stop MDT0"
4714 run_e2fsck $(facet_host $SINGLEMDS) $MDT_DEVNAME "-y" ||
4715 error "(12) Fail to run e2fsck"
4717 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
4718 error "(13) Fail to start MDT0"
4720 echo "Trigger namespace LFSCK to recover backend orphans"
4721 $START_NAMESPACE -r -A ||
4722 error "(14) Fail to start LFSCK for namespace"
4724 wait_all_targets_blocked namespace completed 15
4726 local repaired=$($SHOW_NAMESPACE |
4727 awk '/^local_lost_found_moved/ { print $2 }')
4728 [ $repaired -ge 4 ] ||
4729 error "(16) Fail to recover backend orphans: $repaired"
4731 mount_client $MOUNT || error "(17) Fail to start client!"
4733 stat $DIR/$tdir/foo/f0 || error "(18) f0 is not recovered"
4735 ls -ail $MOUNT/.lustre/lost+found/
4737 echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
4738 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4739 error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4741 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
4743 local cname=$MOUNT/.lustre/lost+found/MDT0000/${cfid}-${pfid}-D-0
4744 [ -d "$cname" ] || error "(20) d0 is not recovered"
4746 stat ${cname}/d1 || error "(21) d1 is not recovered"
4747 stat ${cname}/f1 || error "(22) f1 is not recovered"
4749 run_test 30 "LFSCK can recover the orphans from backend /lost+found"
4752 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4755 echo "For the name entry under a striped directory, if the name"
4756 echo "hash does not match the shard, then the LFSCK will repair"
4757 echo "the bad name entry"
4760 check_mount_and_prep
4762 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4763 error "(1) Fail to create striped directory"
4765 echo "Inject failure stub on client to simulate the case that"
4766 echo "some name entry should be inserted into other non-first"
4767 echo "shard, but inserted into the first shard by wrong"
4769 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
4770 $LCTL set_param fail_loc=0x1628 fail_val=0
4771 createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
4772 error "(2) Fail to create file under striped directory"
4773 $LCTL set_param fail_loc=0 fail_val=0
4775 echo "Trigger namespace LFSCK to repair bad name hash"
4776 $START_NAMESPACE -r -A ||
4777 error "(3) Fail to start LFSCK for namespace"
4779 wait_all_targets_blocked namespace completed 4
4781 local repaired=$($SHOW_NAMESPACE |
4782 awk '/^name_hash_repaired/ { print $2 }')
4783 [ $repaired -ge 1 ] ||
4784 error "(5) Fail to repair bad name hash: $repaired"
4786 umount_client $MOUNT || error "(6) umount failed"
4787 mount_client $MOUNT || error "(7) mount failed"
4789 for ((i = 0; i < $MDSCOUNT; i++)); do
4790 stat $DIR/$tdir/striped_dir/d$i ||
4791 error "(8) Fail to stat d$i after LFSCK"
4792 rmdir $DIR/$tdir/striped_dir/d$i ||
4793 error "(9) Fail to unlink d$i after LFSCK"
4796 rmdir $DIR/$tdir/striped_dir ||
4797 error "(10) Fail to remove the striped directory after LFSCK"
4799 run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
4802 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4805 echo "For the name entry under a striped directory, if the name"
4806 echo "hash does not match the shard, then the LFSCK will repair"
4807 echo "the bad name entry"
4810 check_mount_and_prep
4812 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4813 error "(1) Fail to create striped directory"
4815 echo "Inject failure stub on client to simulate the case that"
4816 echo "some name entry should be inserted into other non-second"
4817 echo "shard, but inserted into the second shard by wrong"
4819 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
4820 $LCTL set_param fail_loc=0x1628 fail_val=1
4821 createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
4822 error "(2) Fail to create file under striped directory"
4823 $LCTL set_param fail_loc=0 fail_val=0
4825 echo "Trigger namespace LFSCK to repair bad name hash"
4826 $START_NAMESPACE -r -A ||
4827 error "(3) Fail to start LFSCK for namespace"
4829 wait_all_targets_blocked namespace completed 4
4831 local repaired=$(do_facet mds2 $LCTL get_param -n \
4832 mdd.$(facet_svc mds2).lfsck_namespace |
4833 awk '/^name_hash_repaired/ { print $2 }')
4834 [ $repaired -ge 1 ] ||
4835 error "(5) Fail to repair bad name hash: $repaired"
4837 umount_client $MOUNT || error "(6) umount failed"
4838 mount_client $MOUNT || error "(7) mount failed"
4840 for ((i = 0; i < $MDSCOUNT; i++)); do
4841 stat $DIR/$tdir/striped_dir/d$i ||
4842 error "(8) Fail to stat d$i after LFSCK"
4843 rmdir $DIR/$tdir/striped_dir/d$i ||
4844 error "(9) Fail to unlink d$i after LFSCK"
4847 rmdir $DIR/$tdir/striped_dir ||
4848 error "(10) Fail to remove the striped directory after LFSCK"
4850 run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
4853 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4856 echo "For some reason, the master MDT-object of the striped directory"
4857 echo "may lost its master LMV EA. If nobody created files under the"
4858 echo "master directly after the master LMV EA lost, then the LFSCK"
4859 echo "should re-generate the master LMV EA."
4862 check_mount_and_prep
4864 echo "Inject failure stub on MDT0 to simulate the case that the"
4865 echo "master MDT-object of the striped directory lost the LMV EA."
4867 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
4868 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
4869 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4870 error "(1) Fail to create striped directory"
4871 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4873 echo "Trigger namespace LFSCK to re-generate master LMV EA"
4874 $START_NAMESPACE -r -A ||
4875 error "(2) Fail to start LFSCK for namespace"
4877 wait_all_targets_blocked namespace completed 3
4879 local repaired=$($SHOW_NAMESPACE |
4880 awk '/^striped_dirs_repaired/ { print $2 }')
4881 [ $repaired -eq 1 ] ||
4882 error "(4) Fail to re-generate master LMV EA: $repaired"
4884 umount_client $MOUNT || error "(5) umount failed"
4885 mount_client $MOUNT || error "(6) mount failed"
4887 local empty=$(ls $DIR/$tdir/striped_dir/)
4888 [ -z "$empty" ] || error "(7) The master LMV EA is not repaired: $empty"
4890 rmdir $DIR/$tdir/striped_dir ||
4891 error "(8) Fail to remove the striped directory after LFSCK"
4893 run_test 31c "Re-generate the lost master LMV EA for striped directory"
4896 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4899 echo "For some reason, the master MDT-object of the striped directory"
4900 echo "may lost its master LMV EA. If somebody created files under the"
4901 echo "master directly after the master LMV EA lost, then the LFSCK"
4902 echo "should NOT re-generate the master LMV EA, instead, it should"
4903 echo "change the broken striped directory as read-only to prevent"
4904 echo "further damage"
4907 check_mount_and_prep
4909 echo "Inject failure stub on MDT0 to simulate the case that the"
4910 echo "master MDT-object of the striped directory lost the LMV EA."
4912 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
4913 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
4914 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4915 error "(1) Fail to create striped directory"
4916 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0
4918 umount_client $MOUNT || error "(2) umount failed"
4919 mount_client $MOUNT || error "(3) mount failed"
4921 touch $DIR/$tdir/striped_dir/dummy ||
4922 error "(4) Fail to touch under broken striped directory"
4924 echo "Trigger namespace LFSCK to find out the inconsistency"
4925 $START_NAMESPACE -r -A ||
4926 error "(5) Fail to start LFSCK for namespace"
4928 wait_all_targets_blocked namespace completed 6
4930 local repaired=$($SHOW_NAMESPACE |
4931 awk '/^striped_dirs_repaired/ { print $2 }')
4932 [ $repaired -eq 0 ] ||
4933 error "(7) Re-generate master LMV EA unexpected: $repaired"
4935 stat $DIR/$tdir/striped_dir/dummy ||
4936 error "(8) Fail to stat $DIR/$tdir/striped_dir/dummy"
4938 touch $DIR/$tdir/striped_dir/foo &&
4939 error "(9) The broken striped directory should be read-only"
4941 chattr -i $DIR/$tdir/striped_dir ||
4942 error "(10) Fail to chattr on the broken striped directory"
4944 rmdir $DIR/$tdir/striped_dir ||
4945 error "(11) Fail to remove the striped directory after LFSCK"
4947 run_test 31d "Set broken striped directory (modified after broken) as read-only"
4950 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4953 echo "For some reason, the slave MDT-object of the striped directory"
4954 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
4955 echo "slave LMV EA."
4958 check_mount_and_prep
4960 echo "Inject failure stub on MDT0 to simulate the case that the"
4961 echo "slave MDT-object (that resides on the same MDT as the master"
4962 echo "MDT-object resides on) lost the LMV EA."
4964 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
4965 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=0
4966 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4967 error "(1) Fail to create striped directory"
4968 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4970 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
4971 $START_NAMESPACE -r -A ||
4972 error "(2) Fail to start LFSCK for namespace"
4974 wait_all_targets_blocked namespace completed 3
4976 local repaired=$($SHOW_NAMESPACE |
4977 awk '/^striped_shards_repaired/ { print $2 }')
4978 [ $repaired -eq 1 ] ||
4979 error "(4) Fail to re-generate slave LMV EA: $repaired"
4981 rmdir $DIR/$tdir/striped_dir ||
4982 error "(5) Fail to remove the striped directory after LFSCK"
4984 run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
4987 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4990 echo "For some reason, the slave MDT-object of the striped directory"
4991 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
4992 echo "slave LMV EA."
4995 check_mount_and_prep
4997 echo "Inject failure stub on MDT0 to simulate the case that the"
4998 echo "slave MDT-object (that resides on different MDT as the master"
4999 echo "MDT-object resides on) lost the LMV EA."
5001 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
5002 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=1
5003 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5004 error "(1) Fail to create striped directory"
5005 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5007 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
5008 $START_NAMESPACE -r -A ||
5009 error "(2) Fail to start LFSCK for namespace"
5011 wait_all_targets_blocked namespace completed 3
5013 local repaired=$(do_facet mds2 $LCTL get_param -n \
5014 mdd.$(facet_svc mds2).lfsck_namespace |
5015 awk '/^striped_shards_repaired/ { print $2 }')
5016 [ $repaired -eq 1 ] ||
5017 error "(4) Fail to re-generate slave LMV EA: $repaired"
5019 rmdir $DIR/$tdir/striped_dir ||
5020 error "(5) Fail to remove the striped directory after LFSCK"
5022 run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
5025 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5028 echo "For some reason, the stripe index in the slave LMV EA is"
5029 echo "corrupted. The LFSCK should repair the slave LMV EA."
5032 check_mount_and_prep
5034 echo "Inject failure stub on MDT0 to simulate the case that the"
5035 echo "slave LMV EA on the first shard of the striped directory"
5036 echo "claims the same index as the second shard claims"
5038 #define OBD_FAIL_LFSCK_BAD_SLAVE_LMV 0x162b
5039 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162b fail_val=0
5040 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5041 error "(1) Fail to create striped directory"
5042 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5044 echo "Trigger namespace LFSCK to repair the slave LMV EA"
5045 $START_NAMESPACE -r -A ||
5046 error "(2) Fail to start LFSCK for namespace"
5048 wait_all_targets_blocked namespace completed 3
5050 local repaired=$($SHOW_NAMESPACE |
5051 awk '/^striped_shards_repaired/ { print $2 }')
5052 [ $repaired -eq 1 ] ||
5053 error "(4) Fail to repair slave LMV EA: $repaired"
5055 umount_client $MOUNT || error "(5) umount failed"
5056 mount_client $MOUNT || error "(6) mount failed"
5058 touch $DIR/$tdir/striped_dir/foo ||
5059 error "(7) Fail to touch file after the LFSCK"
5061 rm -f $DIR/$tdir/striped_dir/foo ||
5062 error "(8) Fail to unlink file after the LFSCK"
5064 rmdir $DIR/$tdir/striped_dir ||
5065 error "(9) Fail to remove the striped directory after LFSCK"
5067 run_test 31g "Repair the corrupted slave LMV EA"
5070 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5073 echo "For some reason, the shard's name entry in the striped"
5074 echo "directory may be corrupted. The LFSCK should repair the"
5075 echo "bad shard's name entry."
5078 check_mount_and_prep
5080 echo "Inject failure stub on MDT0 to simulate the case that the"
5081 echo "first shard's name entry in the striped directory claims"
5082 echo "the same index as the second shard's name entry claims."
5084 #define OBD_FAIL_LFSCK_BAD_SLAVE_NAME 0x162c
5085 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162c fail_val=0
5086 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5087 error "(1) Fail to create striped directory"
5088 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5090 echo "Trigger namespace LFSCK to repair the shard's name entry"
5091 $START_NAMESPACE -r -A ||
5092 error "(2) Fail to start LFSCK for namespace"
5094 wait_all_targets_blocked namespace completed 3
5096 local repaired=$($SHOW_NAMESPACE |
5097 awk '/^dirent_repaired/ { print $2 }')
5098 [ $repaired -eq 1 ] ||
5099 error "(4) Fail to repair shard's name entry: $repaired"
5101 umount_client $MOUNT || error "(5) umount failed"
5102 mount_client $MOUNT || error "(6) mount failed"
5104 touch $DIR/$tdir/striped_dir/foo ||
5105 error "(7) Fail to touch file after the LFSCK"
5107 rm -f $DIR/$tdir/striped_dir/foo ||
5108 error "(8) Fail to unlink file after the LFSCK"
5110 rmdir $DIR/$tdir/striped_dir ||
5111 error "(9) Fail to remove the striped directory after LFSCK"
5113 run_test 31h "Repair the corrupted shard's name entry"
5118 umount_client $MOUNT
5120 #define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d
5121 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
5122 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
5124 local STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
5125 [ "$STATUS" == "scanning-phase1" ] ||
5126 error "(2) Expect 'scanning-phase1', but got '$STATUS'"
5129 stop ost1 > /dev/null || error "(3) Fail to stop OST1!"
5131 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
5135 $STOP_LFSCK || error "(4) Fail to stop LFSCK!"
5137 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
5138 error "(5) Fail to start ost1"
5140 run_test 32a "stop LFSCK when some OST failed"
5144 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5147 $LFS mkdir -i 1 $DIR/$tdir/dp ||
5148 error "(1) Fail to create $DIR/$tdir/dp"
5149 $LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc1 ||
5150 error "(2) Fail to create $DIR/$tdir/dp/dc1"
5151 $LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc2 ||
5152 error "(3) Fail to create $DIR/$tdir/dp/dc2"
5153 umount_client $MOUNT
5155 #define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d
5156 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
5157 $START_NAMESPACE -r -A || error "(4) Fail to start LFSCK for namespace!"
5159 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5160 mdd.${MDT_DEV}.lfsck_namespace |
5161 awk '/^status/ { print \\\$2 }'" "scanning-phase1" 32 || {
5163 error "(5) unexpected status"
5167 stop mds2 > /dev/null || error "(6) Fail to stop MDT2!"
5169 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
5173 $STOP_LFSCK || error "(7) Fail to stop LFSCK!"
5175 start mds2 $(mdsdevname 2) $MOUNT_OPTS_NOSCRUB > /dev/null ||
5176 error "(8) Fail to start MDT2"
5178 run_test 32b "stop LFSCK when some MDT failed"
5184 $START_LAYOUT --dryrun -o -r ||
5185 error "(1) Fail to start layout LFSCK"
5186 wait_all_targets_blocked layout completed 2
5188 local PARAMS=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
5189 [ "$PARAMS" == "dryrun,all_targets,orphan" ] ||
5190 error "(3) Expect 'dryrun,all_targets,orphan', got '$PARAMS'"
5192 $START_NAMESPACE -e abort -A -r ||
5193 error "(4) Fail to start namespace LFSCK"
5194 wait_all_targets_blocked namespace completed 5
5196 PARAMS=$($SHOW_NAMESPACE | awk '/^param/ { print $2 }')
5197 [ "$PARAMS" == "failout,all_targets" ] ||
5198 error "(6) Expect 'failout,all_targets', got '$PARAMS'"
5200 run_test 33 "check LFSCK parameters"
5204 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5205 [ $(facet_fstype $SINGLEMDS) != zfs ] &&
5206 skip "Only valid for ZFS backend" && return
5210 #define OBD_FAIL_LFSCK_NO_AGENTOBJ 0x1630
5211 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1630
5212 $LFS mkdir -i 1 $DIR/$tdir/dummy ||
5213 error "(1) Fail to create $DIR/$tdir/dummy"
5215 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
5216 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
5217 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5218 mdd.${MDT_DEV}.lfsck_namespace |
5219 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5221 error "(3) unexpected status"
5224 local repaired=$($SHOW_NAMESPACE |
5225 awk '/^dirent_repaired/ { print $2 }')
5226 [ $repaired -eq 1 ] ||
5227 error "(4) Fail to repair the lost agent object: $repaired"
5229 $START_NAMESPACE -r || error "(5) Fail to start LFSCK for namespace!"
5230 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5231 mdd.${MDT_DEV}.lfsck_namespace |
5232 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5234 error "(6) unexpected status"
5237 repaired=$($SHOW_NAMESPACE | awk '/^dirent_repaired/ { print $2 }')
5238 [ $repaired -eq 0 ] ||
5239 error "(7) Unexpected repairing: $repaired"
5241 run_test 34 "LFSCK can rebuild the lost agent object"
5245 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5249 #define OBD_FAIL_LFSCK_NO_AGENTENT 0x1631
5250 do_facet mds2 $LCTL set_param fail_loc=0x1631
5251 $LFS mkdir -i 1 $DIR/$tdir/dummy ||
5252 error "(1) Fail to create $DIR/$tdir/dummy"
5255 do_facet mds2 $LCTL set_param fail_loc=0
5256 $START_NAMESPACE -A -r || error "(2) Fail to start LFSCK for namespace!"
5257 wait_update_facet mds2 "$LCTL get_param -n \
5258 mdd.$(facet_svc mds2).lfsck_namespace |
5259 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
5260 error "(3) MDS2 is not the expected 'completed'"
5262 local repaired=$(do_facet mds2 $LCTL get_param -n \
5263 mdd.$(facet_svc mds2).lfsck_namespace |
5264 awk '/^agent_entries_repaired/ { print $2 }')
5265 [ $repaired -eq 1 ] ||
5266 error "(4) Fail to repair the lost agent entry: $repaired"
5268 echo "stopall to cleanup object cache"
5271 setupall > /dev/null
5273 $START_NAMESPACE -A -r || error "(5) Fail to start LFSCK for namespace!"
5274 wait_update_facet mds2 "$LCTL get_param -n \
5275 mdd.$(facet_svc mds2).lfsck_namespace |
5276 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
5277 error "(6) MDS2 is not the expected 'completed'"
5279 repaired=$(do_facet mds2 $LCTL get_param -n \
5280 mdd.$(facet_svc mds2).lfsck_namespace |
5281 awk '/^agent_entries_repaired/ { print $2 }')
5282 [ $repaired -eq 0 ] ||
5283 error "(7) Unexpected repairing: $repaired"
5285 run_test 35 "LFSCK can rebuild the lost agent entry"
5288 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5291 echo "The target MDT-object's LOV EA corrupted as to lose one of the "
5292 echo "mirrors information. The layout LFSCK should rebuild the LOV EA "
5293 echo "with the PFID EA of related OST-object(s) belong to the mirror."
5296 check_mount_and_prep
5298 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5299 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f0 ||
5300 error "(0) Fail to create mirror file $DIR/$tdir/f0"
5301 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5302 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f1 ||
5303 error "(1) Fail to create mirror file $DIR/$tdir/f1"
5304 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5305 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f2 ||
5306 error "(2) Fail to create mirror file $DIR/$tdir/f2"
5308 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5309 error "(3) Fail to write $DIR/$tdir/f0"
5310 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=4 ||
5311 error "(4) Fail to write $DIR/$tdir/f1"
5312 dd if=/dev/zero of=$DIR/$tdir/f2 bs=1M count=4 ||
5313 error "(5) Fail to write $DIR/$tdir/f2"
5315 $LFS mirror resync $DIR/$tdir/f0 ||
5316 error "(6) Fail to resync $DIR/$tdir/f0"
5317 $LFS mirror resync $DIR/$tdir/f1 ||
5318 error "(7) Fail to resync $DIR/$tdir/f1"
5319 $LFS mirror resync $DIR/$tdir/f2 ||
5320 error "(8) Fail to resync $DIR/$tdir/f2"
5322 cancel_lru_locks mdc
5323 cancel_lru_locks osc
5325 $LFS getstripe $DIR/$tdir/f0 ||
5326 error "(9) Fail to getstripe for $DIR/$tdir/f0"
5327 $LFS getstripe $DIR/$tdir/f1 ||
5328 error "(10) Fail to getstripe for $DIR/$tdir/f1"
5329 $LFS getstripe $DIR/$tdir/f2 ||
5330 error "(11) Fail to getstripe for $DIR/$tdir/f2"
5332 echo "Inject failure, to simulate the case of missing one mirror in LOV"
5333 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5334 do_facet mds1 $LCTL set_param fail_loc=0x1616
5336 $LFS mirror split --mirror-id 1 -d $DIR/$tdir/f0 ||
5337 error "(12) Fail to split 1st mirror from $DIR/$tdir/f0"
5338 $LFS mirror split --mirror-id 2 -d $DIR/$tdir/f1 ||
5339 error "(13) Fail to split 2nd mirror from $DIR/$tdir/f1"
5340 $LFS mirror split --mirror-id 3 -d $DIR/$tdir/f2 ||
5341 error "(14) Fail to split 3rd mirror from $DIR/$tdir/f2"
5345 do_facet mds1 $LCTL set_param fail_loc=0
5347 $LFS getstripe $DIR/$tdir/f0 | grep "lcme_mirror_id:.*1" &&
5348 error "(15) The 1st of mirror is not destroyed"
5349 $LFS getstripe $DIR/$tdir/f1 | grep "lcme_mirror_id:.*2" &&
5350 error "(16) The 2nd of mirror is not destroyed"
5351 $LFS getstripe $DIR/$tdir/f2 | grep "lcme_mirror_id:.*3" &&
5352 error "(17) The 3rd of mirror is not destroyed"
5356 mirrors=$($LFS getstripe -N $DIR/$tdir/f0)
5357 [ $mirrors -eq 2 ] || error "(18) $DIR/$tdir/f0 has $mirrors mirrors"
5358 mirrors=$($LFS getstripe -N $DIR/$tdir/f1)
5359 [ $mirrors -eq 2 ] || error "(19) $DIR/$tdir/f1 has $mirrors mirrors"
5360 mirrors=$($LFS getstripe -N $DIR/$tdir/f2)
5361 [ $mirrors -eq 2 ] || error "(20) $DIR/$tdir/f2 has $mirrors mirrors"
5363 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5364 $START_LAYOUT -r -o || error "(21) Fail to start LFSCK for layout!"
5366 for k in $(seq $MDSCOUNT); do
5367 # The LFSCK status query interval is 30 seconds. In case some
5368 # LFSCK_NOTIFY RPCs fail or are lost, we wait long enough
5369 # to guarantee that the status is synced up.
5370 wait_update_facet mds${k} "$LCTL get_param -n \
5371 mdd.$(facet_svc mds${k}).lfsck_layout |
5372 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5373 error "(22) MDS${k} is not the expected 'completed'"
5376 for k in $(seq $OSTCOUNT); do
5377 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5378 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5379 awk '/^status/ { print $2 }')
5380 [ "$cur_status" == "completed" ] ||
5381 error "(23) OST${k} Expect 'completed', but got '$cur_status'"
5384 local repaired=$(do_facet mds1 $LCTL get_param -n \
5385 mdd.$(facet_svc mds1).lfsck_layout |
5386 awk '/^repaired_orphan/ { print $2 }')
5387 [ $repaired -eq 9 ] ||
5388 error "(24) Expect 9 fixed on mds1, but got: $repaired"
5390 mirrors=$($LFS getstripe -N $DIR/$tdir/f0)
5391 [ $mirrors -eq 3 ] || error "(25) $DIR/$tdir/f0 has $mirrors mirrors"
5392 mirrors=$($LFS getstripe -N $DIR/$tdir/f1)
5393 [ $mirrors -eq 3 ] || error "(26) $DIR/$tdir/f1 has $mirrors mirrors"
5394 mirrors=$($LFS getstripe -N $DIR/$tdir/f2)
5395 [ $mirrors -eq 3 ] || error "(27) $DIR/$tdir/f2 has $mirrors mirrors"
5397 $LFS getstripe $DIR/$tdir/f0 | grep "lcme_mirror_id:.*1" || {
5398 $LFS getstripe $DIR/$tdir/f0
5399 error "(28) The 1st of mirror is not recovered"
5402 $LFS getstripe $DIR/$tdir/f1 | grep "lcme_mirror_id:.*2" || {
5403 $LFS getstripe $DIR/$tdir/f1
5404 error "(29) The 2nd of mirror is not recovered"
5407 $LFS getstripe $DIR/$tdir/f2 | grep "lcme_mirror_id:.*3" || {
5408 $LFS getstripe $DIR/$tdir/f2
5409 error "(30) The 3rd of mirror is not recovered"
5412 run_test 36a "rebuild LOV EA for mirrored file (1)"
5415 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5418 echo "The mirrored file lost its MDT-object, but related OST-objects "
5419 echo "are still there. The layout LFSCK should rebuild the LOV EA "
5420 echo "with the PFID EA of related OST-object(s) belong to the file. "
5423 check_mount_and_prep
5425 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5426 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f0 ||
5427 error "(0) Fail to create mirror file $DIR/$tdir/f0"
5429 local fid=$($LFS path2fid $DIR/$tdir/f0)
5431 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5432 error "(1) Fail to write $DIR/$tdir/f0"
5433 $LFS mirror resync $DIR/$tdir/f0 ||
5434 error "(2) Fail to resync $DIR/$tdir/f0"
5436 cancel_lru_locks mdc
5437 cancel_lru_locks osc
5439 $LFS getstripe $DIR/$tdir/f0 ||
5440 error "(3) Fail to getstripe for $DIR/$tdir/f0"
5442 echo "Inject failure, to simulate the case of missing the MDT-object"
5443 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5444 do_facet mds1 $LCTL set_param fail_loc=0x1616
5445 rm -f $DIR/$tdir/f0 || error "(4) Fail to remove $DIR/$tdir/f0"
5449 do_facet mds1 $LCTL set_param fail_loc=0
5451 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5452 $START_LAYOUT -r -o || error "(5) Fail to start LFSCK for layout!"
5454 for k in $(seq $MDSCOUNT); do
5455 # The LFSCK status query interval is 30 seconds. In case some
5456 # LFSCK_NOTIFY RPCs fail or are lost, we wait long enough
5457 # to guarantee that the status is synced up.
5458 wait_update_facet mds${k} "$LCTL get_param -n \
5459 mdd.$(facet_svc mds${k}).lfsck_layout |
5460 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5461 error "(6) MDS${k} is not the expected 'completed'"
5464 for k in $(seq $OSTCOUNT); do
5465 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5466 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5467 awk '/^status/ { print $2 }')
5468 [ "$cur_status" == "completed" ] ||
5469 error "(7) OST${k} Expect 'completed', but got '$cur_status'"
5472 local count=$(do_facet mds1 $LCTL get_param -n \
5473 mdd.$(facet_svc mds1).lfsck_layout |
5474 awk '/^repaired_orphan/ { print $2 }')
5475 [ $count -eq 9 ] || error "(8) Expect 9 fixed on mds1, but got: $count"
5477 local name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
5478 count=$($LFS getstripe --mirror-count $name)
5479 [ $count -eq 3 ] || error "(9) $DIR/$tdir/f0 has $count mirrors"
5481 count=$($LFS getstripe --component-count $name)
5482 [ $count -eq 6 ] || error "(10) $DIR/$tdir/f0 has $count components"
5484 $LFS getstripe $name | grep "lcme_mirror_id:.*1" || {
5485 $LFS getstripe $name
5486 error "(11) The 1st of mirror is not recovered"
5489 $LFS getstripe $name | grep "lcme_mirror_id:.*2" || {
5490 $LFS getstripe $name
5491 error "(12) The 2nd of mirror is not recovered"
5494 $LFS getstripe $name | grep "lcme_mirror_id:.*3" || {
5495 $LFS getstripe $name
5496 error "(13) The 3rd of mirror is not recovered"
5499 run_test 36b "rebuild LOV EA for mirrored file (2)"
5502 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5505 echo "The mirrored file has been modified, not resynced yet, then "
5506 echo "lost its MDT-object, but related OST-objects are still there. "
5507 echo "The layout LFSCK should rebuild the LOV EA and related status "
5508 echo "with the PFID EA of related OST-object(s) belong to the file. "
5511 check_mount_and_prep
5513 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5515 error "(0) Fail to create mirror file $DIR/$tdir/f0"
5517 local fid=$($LFS path2fid $DIR/$tdir/f0)
5519 # The 1st dd && resync makes all related OST-objects have been written
5520 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5521 error "(1.1) Fail to write $DIR/$tdir/f0"
5522 $LFS mirror resync $DIR/$tdir/f0 ||
5523 error "(1.2) Fail to resync $DIR/$tdir/f0"
5524 # The 2nd dd makes one mirror to be stale
5525 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5526 error "(1.3) Fail to write $DIR/$tdir/f0"
5528 cancel_lru_locks mdc
5529 cancel_lru_locks osc
5531 $LFS getstripe $DIR/$tdir/f0 ||
5532 error "(2) Fail to getstripe for $DIR/$tdir/f0"
5534 local saved_flags1=$($LFS getstripe $DIR/$tdir/f0 | head -n 10 |
5535 awk '/lcme_flags/ { print $2 }')
5536 local saved_flags2=$($LFS getstripe $DIR/$tdir/f0 | tail -n 10 |
5537 awk '/lcme_flags/ { print $2 }')
5539 echo "Inject failure, to simulate the case of missing the MDT-object"
5540 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5541 do_facet mds1 $LCTL set_param fail_loc=0x1616
5542 rm -f $DIR/$tdir/f0 || error "(3) Fail to remove $DIR/$tdir/f0"
5546 do_facet mds1 $LCTL set_param fail_loc=0
5548 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5549 $START_LAYOUT -r -o || error "(4) Fail to start LFSCK for layout!"
5551 for k in $(seq $MDSCOUNT); do
5552 # The LFSCK status query interval is 30 seconds. In case some
5553 # LFSCK_NOTIFY RPCs fail or are lost, we wait long enough
5554 # to guarantee that the status is synced up.
5555 wait_update_facet mds${k} "$LCTL get_param -n \
5556 mdd.$(facet_svc mds${k}).lfsck_layout |
5557 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5558 error "(5) MDS${k} is not the expected 'completed'"
5561 for k in $(seq $OSTCOUNT); do
5562 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5563 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5564 awk '/^status/ { print $2 }')
5565 [ "$cur_status" == "completed" ] ||
5566 error "(6) OST${k} Expect 'completed', but got '$cur_status'"
5569 local count=$(do_facet mds1 $LCTL get_param -n \
5570 mdd.$(facet_svc mds1).lfsck_layout |
5571 awk '/^repaired_orphan/ { print $2 }')
5572 [ $count -eq 6 ] || error "(7) Expect 6 fixed on mds1, but got: $count"
5574 local name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
5575 count=$($LFS getstripe --mirror-count $name)
5576 [ $count -eq 2 ] || error "(8) $DIR/$tdir/f0 has $count mirrors"
5578 count=$($LFS getstripe --component-count $name)
5579 [ $count -eq 4 ] || error "(9) $DIR/$tdir/f0 has $count components"
5581 local flags=$($LFS getstripe $name | head -n 10 |
5582 awk '/lcme_flags/ { print $2 }')
5583 [ "$flags" == "$saved_flags1" ] || {
5584 $LFS getstripe $name
5585 error "(10) expect flags $saved_flags1, got $flags"
5588 flags=$($LFS getstripe $name | tail -n 10 |
5589 awk '/lcme_flags/ { print $2 }')
5590 [ "$flags" == "$saved_flags2" ] || {
5591 $LFS getstripe $name
5592 error "(11) expect flags $saved_flags2, got $flags"
5595 run_test 36c "rebuild LOV EA for mirrored file (3)"
5601 local t_dir="$DIR/$tdir/d0"
5602 check_mount_and_prep
5604 $LFS mkdir -i 0 $t_dir || error "(2) Fail to mkdir $t_dir on MDT0"
5605 multiop_bg_pause $t_dir D_c || { error "multiop failed: $?"; return 1; }
5609 $START_NAMESPACE -r -A || { # signal the paused multiop before reporting
5610 kill -USR1 $PID; error "(3) Fail to start LFSCK for namespace!"; }
5612 wait_all_targets_blocked namespace completed 4
5617 run_test 37 "LFSCK must skip a ORPHAN"
5620 [[ $MDSCOUNT -ge 2 ]] || skip "needs >= 2 MDTs"
5622 check_mount_and_prep
5623 $LFS mkdir -i 1 $DIR/$tdir/dir1
5624 $LFS setstripe -E 1M -c1 -S 1M -E 128M -c2 -S 4M -E eof $DIR/$tdir/dir1
5626 touch $DIR/$tdir/dir1/f1
5627 local layout1=$(get_layout_param $DIR/$tdir/dir1/f1)
5629 echo "Migrate $DIR/$tdir/dir1 from MDT1 to MDT0"
5630 $LFS migrate -m 0 $DIR/$tdir/dir1
5632 echo "trigger LFSCK for layout"
5633 do_facet $SINGLEMDS $LCTL lfsck_start -M ${MDT_DEV} -t layout -r
5635 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5636 mdd.${MDT_DEV}.lfsck_layout |
5637 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5639 error "(2) unexpected status"
5642 local layout2=$(get_layout_param $DIR/$tdir/dir1/f1)
5644 [[ "$layout1" == "$layout2" ]] || error "layout lost after lfsck"
5646 run_test 40a "LFSCK correctly fixes lmm_oi in composite layout"
5648 # restore the MDS/OST sizes and the OST count saved at the start of the script
5649 MDSSIZE=${SAVED_MDSSIZE}
5650 OSTSIZE=${SAVED_OSTSIZE}
5651 OSTCOUNT=${SAVED_OSTCOUNT}
5653 # finally, clean up the system (REFORMAT=yes is passed to the helper)
5654 REFORMAT="yes" cleanup_and_setup_lustre
5657 check_and_cleanup_lustre