3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
11 # Bug numbers for the tests excluded via ALWAYS_EXCEPT
12 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT"
14 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
15 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
# Notes on the two assignments above: ALWAYS_EXCEPT is seeded from the
# SANITY_LFSCK_EXCEPT environment variable (more test numbers are appended
# further down, depending on server version and backend), and EXCEPT_SLOW is
# set to the empty string when SLOW is "no".

# Default LUSTRE to the parent of the directory holding this script, then
# source the shared test framework from it.
17 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
18 . $LUSTRE/tests/test-framework.sh
# ${CONFIG:=...} assigns the default path to CONFIG when it is unset or empty,
# then the resulting file is sourced.
20 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
# Leave with status 0 (not a failure) when require_dsh_mds does not succeed.
23 require_dsh_mds || exit 0
# Skip under interoperation mode, i.e. when check_versions fails.
# NOTE(review): the rest of this if-block (including its closing "fi") is not
# visible in this listing.
27 if ! check_versions; then
28 skip "It is NOT necessary to test lfsck under interoperation mode"
# Skip and exit with status 0 when the MDS version is older than 2.3.60.
32 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.3.60) ]] &&
33 skip "Need MDS version at least 2.3.60" && exit 0
# Save the configured MDS/OST sizes and OST count before they are overridden
# below (where they are restored is not visible in this section).
37 SAVED_MDSSIZE=${MDSSIZE}
38 SAVED_OSTSIZE=${OSTSIZE}
39 SAVED_OSTCOUNT=${OSTCOUNT}
40 # use small MDS + OST size to speed formatting time
41 # do not use too small MDSSIZE/OSTSIZE, which affect the default journal size
# zfs-backed targets get an explicit size of 300000; the value used for other
# backends is not visible in this listing.
43 [ $(facet_fstype $SINGLEMDS) == zfs ] && MDSSIZE=300000
45 [ $(facet_fstype ost1) == zfs ] && OSTSIZE=300000
47 # There is no need for many OSTs; cap the count at 4 to reduce the
# format/start/stop overhead.
49 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
51 # build up a clean test environment.
52 REFORMAT="yes" check_and_setup_lustre
# Extend ALWAYS_EXCEPT by server version: each check below appends the listed
# test numbers when the queried facet's version is below the given version
# (for the 2.4.90 check: less than or equal to it).
54 [[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.4.90) ]] &&
55 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2c"
57 [[ $(lustre_version_code ost1) -lt $(version_code 2.5.55) ]] &&
58 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 11 12 13 14 15 16 17 18 19 20 21"
60 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.50) ]] &&
61 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2d 2e 3 22 23 24 25 26 27 28 29 30 31"
63 # DNE does not support striped directory on zfs-based backend yet.
# Test 31 is excluded for every MDS backend other than ldiskfs.
64 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
65 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 31"
# Device names and command-line shortcuts shared by the tests below.
# The START_*/STOP_*/SHOW_* values are plain strings holding a command line;
# the tests run them by expanding the variable unquoted (for example
# "$START_NAMESPACE -r"), so they rely on shell word splitting.
69 MDT_DEV="${FSNAME}-MDT0000"
70 OST_DEV="${FSNAME}-OST0000"
# ${SINGLEMDS//mds/} removes every "mds" substring from the facet name before
# passing the remainder to mdsdevname (presumably leaving the MDS index).
71 MDT_DEVNAME=$(mdsdevname ${SINGLEMDS//mds/})
# Start LFSCK of type "namespace" / "layout" on the MDT, or "layout" on ost1.
72 START_NAMESPACE="do_facet $SINGLEMDS \
73 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
74 START_LAYOUT="do_facet $SINGLEMDS \
75 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
76 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
# Stop LFSCK on the MDT.
77 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
# Read the LFSCK state parameters; the tests parse fields such as "status"
# and "flags" from this output with awk.
78 SHOW_NAMESPACE="do_facet $SINGLEMDS \
79 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
80 SHOW_LAYOUT="do_facet $SINGLEMDS \
81 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
82 SHOW_LAYOUT_ON_OST="do_facet ost1 \
83 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# Mount option sets passed to "start" when a test restarts the MDS.
84 MOUNT_OPTS_SCRUB="-o user_xattr"
85 MOUNT_OPTS_NOSCRUB="-o user_xattr,noscrub"
86 MOUNT_OPTS_SKIP_LFSCK="-o user_xattr,skip_lfsck"
# Body of a file-population helper. Its header, the assignments of ndirs,
# nfiles and igif, and the closing fi/done/} lines are not visible in this
# listing. It copies the test scripts into $DIR/$tdir and creates ndirs
# directories and files, plus nfiles files inside each "d<i>" directory.
95 echo "preparing... $nfiles * $ndirs files will be created $(date)."
# NOTE(review): $igif is unquoted; when it is empty the test collapses to
# "[ ! -z ]", which is false, so the branch is skipped as intended.
96 if [ ! -z $igif ]; then
97 #define OBD_FAIL_FID_IGIF 0x1504
98 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
101 cp $LUSTRE/tests/*.sh $DIR/$tdir/
102 if [ $ndirs -gt 0 ]; then
103 createmany -d $DIR/$tdir/d $ndirs
104 createmany -m $DIR/$tdir/f $ndirs
105 if [ $nfiles -gt 0 ]; then
106 for ((i = 0; i < $ndirs; i++)); do
107 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
108 /dev/null || error "createmany $nfiles"
111 createmany -d $DIR/$tdir/e $ndirs
# With igif set, create one more file and then clear the fail_loc that was
# set at the top of this helper.
114 if [ ! -z $igif ]; then
115 touch $DIR/$tdir/dummy
116 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
119 echo "prepared $(date)."
# Stop the MDT, run a read-only ("-n") e2fsck against its device, and start
# the MDT again with the noscrub mount options. Does nothing unless the MDS
# backend is ldiskfs.
# NOTE(review): several lines of this function (the command at the end of the
# pipeline and the closing braces) are not visible in this listing.
122 run_e2fsck_on_mdt0() {
123 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] && return
125 stop $SINGLEMDS > /dev/null || error "(0) Fail to the stop MDT0"
126 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n" |
128 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n"
129 error "(2) Detected inconsistency on MDT0"
131 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
132 error "(3) Fail to start MDT0"
# Query LFSCK component $com on MDT0000 with "lfsck_query ... -w" and require
# that the "<com>_mdts_<status>" counter equals $MDSCOUNT; otherwise dump the
# query output and fail with error code label $err.
# NOTE(review): the assignments of com, status and err are not visible in this
# listing; the call "wait_all_targets_blocked namespace completed 4" suggests
# they are the three positional arguments, in that order.
135 wait_all_targets_blocked() {
140 local count=$(do_facet mds1 \
141 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 -w |
142 awk '/^${com}_mdts_${status}/ { print \\\$2 }'")
143 [[ $count -eq $MDSCOUNT ]] || {
144 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
145 error "($err) only $count of $MDSCOUNT MDTs are in ${status}"
# NOTE(review): the lines below appear to belong to a second helper whose
# header is not visible here. It polls the same counter through
# wait_update_facet (without "-w") until it equals $MDSCOUNT, passing $LTIME
# as the last argument.
154 wait_update_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 |
155 awk '/^${com}_mdts_${status}/ { print \\\$2 }'" \
156 "$MDSCOUNT" $LTIME || {
157 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
158 error "($err) some MDTs are not in ${status}"
# Body of test 0 ("Control LFSCK manually"): start, stop and restart the
# namespace LFSCK by hand and check the reported status after each step.
# The function header, closing braces and some statements are not visible in
# this listing.
165 #define OBD_FAIL_LFSCK_DELAY1 0x1600
166 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
167 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
169 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
# With the delay fail_loc still set, the scan is expected to be in phase 1.
171 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
172 [ "$STATUS" == "scanning-phase1" ] ||
173 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
175 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
177 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
178 [ "$STATUS" == "stopped" ] ||
179 error "(6) Expect 'stopped', but got '$STATUS'"
# Restart without -r and expect the scan to be in phase 1 again.
181 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
183 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
184 [ "$STATUS" == "scanning-phase1" ] ||
185 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the fail_loc and wait for the status to become "completed"
# (32 is presumably the wait limit in seconds - confirm in wait_update_facet).
187 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
188 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
189 mdd.${MDT_DEV}.lfsck_namespace |
190 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
192 error "(9) unexpected status"
195 local repaired=$($SHOW_NAMESPACE |
196 awk '/^updated_phase1/ { print $2 }')
197 [ $repaired -eq 0 ] ||
198 error "(10) Expect nothing to be repaired, but got: $repaired"
# Run once more with -r; success_count must grow by exactly one.
200 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
201 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
202 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
203 mdd.${MDT_DEV}.lfsck_namespace |
204 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
206 error "(12) unexpected status"
209 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
210 [ $((scanned1 + 1)) -eq $scanned2 ] ||
211 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
213 echo "stopall, should NOT crash LU-3649"
214 stopall || error "(14) Fail to stopall"
216 run_test 0 "Control LFSCK manually"
# Body of test 1a: create a file while fail_loc 0x1501 (OBD_FAIL_FID_INDIR)
# is set, run the namespace LFSCK, and expect exactly one repair.
# The function header and several statements are not visible in this listing.
221 #define OBD_FAIL_FID_INDIR 0x1501
222 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
223 touch $DIR/$tdir/dummy
225 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
227 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
228 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
229 mdd.${MDT_DEV}.lfsck_namespace |
230 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
232 error "(4) unexpected status"
# Prefer the "dirent_repaired" counter; when that field is absent, fall back
# to "updated_phase1".
235 local repaired=$($SHOW_NAMESPACE |
236 awk '/^dirent_repaired/ { print $2 }')
237 # for interop with old server
238 [ -z "$repaired" ] &&
239 repaired=$($SHOW_NAMESPACE |
240 awk '/^updated_phase1/ { print $2 }')
242 [ $repaired -eq 1 ] ||
243 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
247 mount_client $MOUNT || error "(6) Fail to start client!"
# List the directory while fail_loc 0x1505 is set; a failure here is reported
# as a missing FID-in-dirent.
249 #define OBD_FAIL_FID_LOOKUP 0x1505
250 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
251 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
253 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
255 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
# Body of test 1b: skipped unless the MDS backend is ldiskfs. Creates a file
# under fail_loc 0x1502 (OBD_FAIL_FID_INLMA), then runs the namespace LFSCK
# under fail_loc 0x1506 (OBD_FAIL_FID_NOLMA) and expects exactly one repair.
# The function header and several statements are not visible in this listing.
259 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
260 skip "OI Scrub not implemented for ZFS" && return
264 #define OBD_FAIL_FID_INLMA 0x1502
265 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
266 touch $DIR/$tdir/dummy
268 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
270 #define OBD_FAIL_FID_NOLMA 0x1506
271 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
272 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
273 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
274 mdd.${MDT_DEV}.lfsck_namespace |
275 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
277 error "(4) unexpected status"
# Prefer "dirent_repaired"; fall back to "updated_phase1" when it is absent.
280 local repaired=$($SHOW_NAMESPACE |
281 awk '/^dirent_repaired/ { print $2 }')
282 # for interop with old server
283 [ -z "$repaired" ] &&
284 repaired=$($SHOW_NAMESPACE |
285 awk '/^updated_phase1/ { print $2 }')
287 [ $repaired -eq 1 ] ||
288 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
290 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
293 mount_client $MOUNT || error "(6) Fail to start client!"
# stat the file while fail_loc 0x1505 is set; a failure here is reported as a
# missing FID-in-LMA.
295 #define OBD_FAIL_FID_LOOKUP 0x1505
296 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
297 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
299 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
301 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
# Body of test 1c: create a file while fail_loc 0x1504 (OBD_FAIL_FID_IGIF) is
# set, run the namespace LFSCK, and expect exactly one repair.
# The function header and several statements are not visible in this listing.
306 #define OBD_FAIL_FID_IGIF 0x1504
307 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
308 touch $DIR/$tdir/dummy
310 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
312 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
313 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
314 mdd.${MDT_DEV}.lfsck_namespace |
315 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
317 error "(4) unexpected status"
# Prefer "dirent_repaired"; fall back to "updated_phase1" when it is absent.
320 local repaired=$($SHOW_NAMESPACE |
321 awk '/^dirent_repaired/ { print $2 }')
322 # for interop with old server
323 [ -z "$repaired" ] &&
324 repaired=$($SHOW_NAMESPACE |
325 awk '/^updated_phase1/ { print $2 }')
327 [ $repaired -eq 1 ] ||
328 error "(5) Fail to repair lost FID-in-dirent: $repaired"
332 mount_client $MOUNT || error "(6) Fail to start client!"
# List the directory while fail_loc 0x1505 is set; a failure here is reported
# as a missing FID-in-dirent.
334 #define OBD_FAIL_FID_LOOKUP 0x1505
335 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
336 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
338 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
340 run_test 1c "LFSCK can find out and repair lost FID-in-dirent"
# Body of test 2a: create a file while fail_loc 0x1603
# (OBD_FAIL_LFSCK_LINKEA_CRASH) is set, run the namespace LFSCK, expect one
# linkEA repair, then verify the file from a client.
# The function header and several statements are not visible in this listing.
345 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
346 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
347 touch $DIR/$tdir/dummy
349 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
351 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
352 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
353 mdd.${MDT_DEV}.lfsck_namespace |
354 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
356 error "(4) unexpected status"
# Prefer "linkea_repaired"; fall back to "updated_phase2" when it is absent.
359 local repaired=$($SHOW_NAMESPACE |
360 awk '/^linkea_repaired/ { print $2 }')
361 # for interop with old server
362 [ -z "$repaired" ] &&
363 repaired=$($SHOW_NAMESPACE |
364 awk '/^updated_phase2/ { print $2 }')
366 [ $repaired -eq 1 ] ||
367 error "(5) Fail to repair crashed linkEA: $repaired"
371 mount_client $MOUNT || error "(6) Fail to start client!"
# The file must report a link count of 1, and its FID must map back to the
# same path.
373 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
374 error "(7) Fail to stat $DIR/$tdir/dummy"
376 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
377 local dummyname=$($LFS fid2path $DIR $dummyfid)
378 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
379 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
381 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
# Body of test 2b: create a file while fail_loc 0x1604
# (OBD_FAIL_LFSCK_LINKEA_MORE) is set, run the namespace LFSCK, expect
# "updated_phase2" to be 1, then verify the file from a client.
# The function header and several statements are not visible in this listing.
387 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
388 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
389 touch $DIR/$tdir/dummy
391 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
393 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
394 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
395 mdd.${MDT_DEV}.lfsck_namespace |
396 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
398 error "(4) unexpected status"
401 local repaired=$($SHOW_NAMESPACE |
402 awk '/^updated_phase2/ { print $2 }')
403 [ $repaired -eq 1 ] ||
404 error "(5) Fail to repair crashed linkEA: $repaired"
408 mount_client $MOUNT || error "(6) Fail to start client!"
# The file must report a link count of 1, and its FID must map back to the
# same path.
410 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
411 error "(7) Fail to stat $DIR/$tdir/dummy"
413 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
414 local dummyname=$($LFS fid2path $DIR $dummyfid)
415 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
416 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
418 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
# Body of test 2c: create a file while fail_loc 0x1605
# (OBD_FAIL_LFSCK_LINKEA_MORE2) is set, run the namespace LFSCK, expect
# "updated_phase2" to be 1, then verify the file from a client.
# The function header and several statements are not visible in this listing.
424 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
425 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
426 touch $DIR/$tdir/dummy
428 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
430 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
431 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
432 mdd.${MDT_DEV}.lfsck_namespace |
433 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
435 error "(4) unexpected status"
438 local repaired=$($SHOW_NAMESPACE |
439 awk '/^updated_phase2/ { print $2 }')
440 [ $repaired -eq 1 ] ||
441 error "(5) Fail to repair crashed linkEA: $repaired"
445 mount_client $MOUNT || error "(6) Fail to start client!"
# The file must report a link count of 1, and its FID must map back to the
# same path.
447 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
448 error "(7) Fail to stat $DIR/$tdir/dummy"
450 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
451 local dummyname=$($LFS fid2path $DIR $dummyfid)
452 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
453 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
455 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
# Body of test 2d: create a file while fail_loc 0x161d
# (OBD_FAIL_LFSCK_NO_LINKEA) is set, run the namespace LFSCK, expect
# "linkea_repaired" to be 1, then verify the file from a client.
# The function header and several statements are not visible in this listing.
461 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
462 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
463 touch $DIR/$tdir/dummy
465 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
467 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
468 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
469 mdd.${MDT_DEV}.lfsck_namespace |
470 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
472 error "(4) unexpected status"
475 local repaired=$($SHOW_NAMESPACE |
476 awk '/^linkea_repaired/ { print $2 }')
477 [ $repaired -eq 1 ] ||
478 error "(5) Fail to repair crashed linkEA: $repaired"
482 mount_client $MOUNT || error "(6) Fail to start client!"
# The file must report a link count of 1, and its FID must map back to the
# same path.
484 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
485 error "(7) Fail to stat $DIR/$tdir/dummy"
487 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
488 local dummyname=$($LFS fid2path $DIR $dummyfid)
489 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
490 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
492 run_test 2d "LFSCK can recover the missing linkEA entry"
# Body of test 2e: needs at least two MDTs. Creates d0 on MDT index 1 and,
# with fail_loc 0x1603 set, d0/d1 on MDT index 0; then runs the namespace
# LFSCK with "-r -A" and expects one linkEA repair.
# The function header and some statements are not visible in this listing.
496 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
500 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
502 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
503 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
504 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
505 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
507 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
# Require the "namespace_mdts_completed" counter to equal $MDSCOUNT; "4" is
# the label used in the error message on failure.
509 wait_all_targets_blocked namespace completed 4
511 local repaired=$($SHOW_NAMESPACE |
512 awk '/^linkea_repaired/ { print $2 }')
513 [ $repaired -eq 1 ] ||
514 error "(5) Fail to repair crashed linkEA: $repaired"
# The FID of d0/d1 must map back to the same path.
516 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
517 local name=$($LFS fid2path $DIR $fid)
518 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
519 error "(6) Fail to repair linkEA: $fid $name"
521 run_test 2e "namespace LFSCK can verify remote object linkEA"
# Body of test 3: build hard-linked files, two of them created while the
# linkEA fail_locs 0x1603 and 0x1604 are set, then run the namespace LFSCK
# and check the phase-2 counters.
# The function header and some statements are not visible in this listing;
# $DIR/$tdir/d0/f0 and f1 are presumably created by the elided preparation
# step.
527 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
528 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
529 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
531 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
532 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
533 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
535 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
536 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
537 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
539 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
540 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
541 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
543 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
545 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
546 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
547 mdd.${MDT_DEV}.lfsck_namespace |
548 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
550 error "(10) unexpected status"
# At least 4 objects must have been checked in phase 2, and at least 2
# multiple-linked repairs must have been made.
553 local checked=$($SHOW_NAMESPACE |
554 awk '/^checked_phase2/ { print $2 }')
555 [ $checked -ge 4 ] ||
556 error "(11) Fail to check multiple-linked object: $checked"
558 local repaired=$($SHOW_NAMESPACE |
559 awk '/^multiple_linked_repaired/ { print $2 }')
560 [ $repaired -ge 2 ] ||
561 error "(12) Fail to repair multiple-linked object: $repaired"
563 run_test 3 "LFSCK can verify multiple-linked objects"
# Body of test 4: skipped unless the MDS backend is ldiskfs. Backs up and
# restores the MDT via mds_backup_restore, restarts it with the noscrub mount
# options, runs the namespace LFSCK and checks that at least 9 dirent repairs
# were made.
# The function header and some statements are not visible in this listing.
567 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
568 skip "OI Scrub not implemented for ZFS" && return
571 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
572 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
574 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
575 echo "start $SINGLEMDS with disabling OI scrub"
576 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
577 error "(2) Fail to start MDS!"
# With the delay fail_loc set, wait until the flags field reads
# "inconsistent" and confirm the scan is still in phase 1.
579 #define OBD_FAIL_LFSCK_DELAY2 0x1601
580 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
581 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
582 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
583 mdd.${MDT_DEV}.lfsck_namespace |
584 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
586 error "(5) unexpected status"
589 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
590 [ "$STATUS" == "scanning-phase1" ] ||
591 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the fail_loc, wait for completion, and require the flags to be empty.
593 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
594 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
595 mdd.${MDT_DEV}.lfsck_namespace |
596 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
598 error "(7) unexpected status"
# NOTE(review): FLAGS is assigned without "local" here.
601 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
602 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
# Prefer "dirent_repaired"; fall back to "updated_phase1" when it is absent.
604 local repaired=$($SHOW_NAMESPACE |
605 awk '/^dirent_repaired/ { print $2 }')
606 # for interop with old server
607 [ -z "$repaired" ] &&
608 repaired=$($SHOW_NAMESPACE |
609 awk '/^updated_phase1/ { print $2 }')
611 [ $repaired -ge 9 ] ||
612 error "(9) Fail to re-generate FID-in-dirent: $repaired"
616 mount_client $MOUNT || error "(10) Fail to start client!"
618 #define OBD_FAIL_FID_LOOKUP 0x1505
619 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
620 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
621 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
623 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
# Body of test 5: skipped unless the MDS backend is ldiskfs. Same flow as
# test 4, but mds_backup_restore is called with an extra argument "1", the
# expected flags are "inconsistent,upgrade", and at least 2 dirent repairs
# are required.
# The function header and some statements are not visible in this listing.
627 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
628 skip "OI Scrub not implemented for ZFS" && return
631 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
632 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
634 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
635 echo "start $SINGLEMDS with disabling OI scrub"
636 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
637 error "(2) Fail to start MDS!"
639 #define OBD_FAIL_LFSCK_DELAY2 0x1601
640 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
641 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
642 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
643 mdd.${MDT_DEV}.lfsck_namespace |
644 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
646 error "(5) unexpected status"
649 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
650 [ "$STATUS" == "scanning-phase1" ] ||
651 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the fail_loc, wait for completion, and require the flags to be empty.
653 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
654 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
655 mdd.${MDT_DEV}.lfsck_namespace |
656 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
658 error "(7) unexpected status"
# NOTE(review): FLAGS is assigned without "local" here.
661 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
662 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
# Prefer "dirent_repaired"; fall back to "updated_phase1" when it is absent.
664 local repaired=$($SHOW_NAMESPACE |
665 awk '/^dirent_repaired/ { print $2 }')
666 # for interop with old server
667 [ -z "$repaired" ] &&
668 repaired=$($SHOW_NAMESPACE |
669 awk '/^updated_phase1/ { print $2 }')
671 [ $repaired -ge 2 ] ||
672 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
676 mount_client $MOUNT || error "(10) Fail to start client!"
# Both stat and ls must succeed while fail_loc 0x1505 is set.
678 #define OBD_FAIL_FID_LOOKUP 0x1505
679 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
680 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
682 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
# The FID of the file must map back to the same path.
684 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
685 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
686 local dummyname=$($LFS fid2path $DIR $dummyfid)
687 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
688 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
690 run_test 5 "LFSCK can handle IGIF object upgrading"
# Body of test 6a: start the namespace LFSCK under a delay fail_loc, force it
# into "failed", restart it without -r and check that the new start position
# is beyond the last checkpoint position.
# The function header, the sleep calls and the last stage of the two POS
# pipelines are not visible in this listing.
695 #define OBD_FAIL_LFSCK_DELAY1 0x1600
696 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
697 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
699 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
700 [ "$STATUS" == "scanning-phase1" ] ||
701 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
703 # Sleep 3 sec to guarantee at least one object processed by LFSCK
705 # Fail the LFSCK to guarantee there is at least one checkpoint
# NOTE(review): the value written is 0x1608 with the 0x80000000 bit added;
# presumably that bit is a fail-once flag - confirm against the fail_loc
# definitions.
706 #define OBD_FAIL_LFSCK_FATAL1 0x1608
707 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
708 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
709 mdd.${MDT_DEV}.lfsck_namespace |
710 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
712 error "(4) unexpected status"
# POS0: second field of the "last_checkpoint_position" line.
715 local POS0=$($SHOW_NAMESPACE |
716 awk '/^last_checkpoint_position/ { print $2 }' |
719 #define OBD_FAIL_LFSCK_DELAY1 0x1600
720 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
721 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
723 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
724 [ "$STATUS" == "scanning-phase1" ] ||
725 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# POS1: second field of the "latest_start_position" line; it must be
# numerically greater than POS0.
727 local POS1=$($SHOW_NAMESPACE |
728 awk '/^latest_start_position/ { print $2 }' |
730 [[ $POS0 -lt $POS1 ]] ||
731 error "(7) Expect larger than: $POS0, but got $POS1"
733 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
734 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
735 mdd.${MDT_DEV}.lfsck_namespace |
736 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
738 error "(8) unexpected status"
741 run_test 6a "LFSCK resumes from last checkpoint (1)"
# Body of test 6b: like test 6a, but using fail_locs 0x1601 and 0x1609 and
# comparing both the second and the fourth field of the checkpoint/start
# position lines.
# The function header, the sleep calls, the last stage of the O_POS pipelines
# and the else/fi lines of the final comparison are not visible in this
# listing.
746 #define OBD_FAIL_LFSCK_DELAY2 0x1601
747 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
748 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
750 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
751 [ "$STATUS" == "scanning-phase1" ] ||
752 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
754 # Sleep 5 sec to guarantee that we are in the directory scanning
756 # Fail the LFSCK to guarantee there is at least one checkpoint
757 #define OBD_FAIL_LFSCK_FATAL2 0x1609
758 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
759 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
760 mdd.${MDT_DEV}.lfsck_namespace |
761 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
763 error "(4) unexpected status"
# O_POS0 / D_POS0: second and fourth fields of "last_checkpoint_position".
766 local O_POS0=$($SHOW_NAMESPACE |
767 awk '/^last_checkpoint_position/ { print $2 }' |
770 local D_POS0=$($SHOW_NAMESPACE |
771 awk '/^last_checkpoint_position/ { print $4 }')
773 #define OBD_FAIL_LFSCK_DELAY2 0x1601
774 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
775 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
777 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
778 [ "$STATUS" == "scanning-phase1" ] ||
779 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# O_POS1 / D_POS1: second and fourth fields of "latest_start_position".
781 local O_POS1=$($SHOW_NAMESPACE |
782 awk '/^latest_start_position/ { print $2 }' |
784 local D_POS1=$($SHOW_NAMESPACE |
785 awk '/^latest_start_position/ { print $4 }')
787 echo "Additional debug for 6b"
# When either D_POS value is "N/A" or "0x0", compare the O_POS values;
# the D_POS comparison (7.2) presumably sits in the elided else branch.
789 if [ "$D_POS0" == "N/A" -o "$D_POS0" == "0x0" \
790 -o "$D_POS1" == "0x0" -o "$D_POS1" == "N/A" ]; then
791 [[ $O_POS0 -lt $O_POS1 ]] ||
792 error "(7.1) $O_POS1 is not larger than $O_POS0"
794 [[ $D_POS0 -lt $D_POS1 ]] ||
795 error "(7.2) $D_POS1 is not larger than $D_POS0"
798 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
799 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
800 mdd.${MDT_DEV}.lfsck_namespace |
801 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
803 error "(8) unexpected status"
806 run_test 6b "LFSCK resumes from last checkpoint (2)"
# Body of test 7a: start the namespace LFSCK under a delay fail_loc, stop and
# restart the MDS while the scan is in phase 1, and expect the status to
# reach "completed" after the restart without starting LFSCK again.
# The function header and the sleep call are not visible in this listing.
813 #define OBD_FAIL_LFSCK_DELAY2 0x1601
814 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
815 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
817 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
818 [ "$STATUS" == "scanning-phase1" ] ||
819 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
821 # Sleep 3 sec to guarantee at least one object processed by LFSCK
823 echo "stop $SINGLEMDS"
824 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop MDS!"
# The fail_loc is cleared while the MDS target is stopped, then the MDS is
# started with the plain (scrub-enabled) mount options.
826 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
827 echo "start $SINGLEMDS"
828 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
829 error "(5) Fail to start MDS!"
831 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
832 mdd.${MDT_DEV}.lfsck_namespace |
833 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
835 error "(6) unexpected status"
838 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
# Body of test 7b: create 20 files while fail_loc 0x1604 is set, start the
# namespace LFSCK under fail_loc 0x1602, wait for "scanning-phase2", then
# stop and restart the MDS and expect the status to reach "completed".
# The function header and the loop's "done" are not visible in this listing.
844 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
845 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
846 for ((i = 0; i < 20; i++)); do
847 touch $DIR/$tdir/dummy${i}
850 #define OBD_FAIL_LFSCK_DELAY3 0x1602
851 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
852 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
853 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
854 mdd.${MDT_DEV}.lfsck_namespace |
855 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
857 error "(4) unexpected status"
861 echo "stop $SINGLEMDS"
862 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop MDS!"
# The fail_loc is cleared while the MDS target is stopped, then the MDS is
# started with the plain (scrub-enabled) mount options.
864 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
865 echo "start $SINGLEMDS"
866 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
867 error "(6) Fail to start MDS!"
869 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
870 mdd.${MDT_DEV}.lfsck_namespace |
871 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
873 error "(7) unexpected status"
876 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
# Body of test 8 ("LFSCK state machine"): drive the namespace LFSCK through
# the statuses init, scanning-phase1, stopped, failed, crashed, paused,
# scanning-phase2 and completed, checking the reported status at each step.
# The function header, closing braces, loop terminators, sleeps and the
# initialisation of "timer" are not visible in this listing.
881 formatall > /dev/null
887 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
888 [ "$STATUS" == "init" ] ||
889 error "(2) Expect 'init', but got '$STATUS'"
# Create one directory under fail_loc 0x1603 and five files under 0x1604.
891 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
892 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
893 mkdir $DIR/$tdir/crashed
895 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
896 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
897 for ((i = 0; i < 5; i++)); do
898 touch $DIR/$tdir/dummy${i}
901 umount_client $MOUNT || error "(3) Fail to stop client!"
# init -> scanning-phase1 -> stopped -> scanning-phase1.
903 #define OBD_FAIL_LFSCK_DELAY2 0x1601
904 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
905 $START_NAMESPACE || error "(4) Fail to start LFSCK for namespace!"
907 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
908 [ "$STATUS" == "scanning-phase1" ] ||
909 error "(5) Expect 'scanning-phase1', but got '$STATUS'"
911 $STOP_LFSCK || error "(6) Fail to stop LFSCK!"
913 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
914 [ "$STATUS" == "stopped" ] ||
915 error "(7) Expect 'stopped', but got '$STATUS'"
917 $START_NAMESPACE || error "(8) Fail to start LFSCK for namespace!"
919 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
920 [ "$STATUS" == "scanning-phase1" ] ||
921 error "(9) Expect 'scanning-phase1', but got '$STATUS'"
# scanning-phase1 -> failed, then restart into scanning-phase1.
923 #define OBD_FAIL_LFSCK_FATAL2 0x1609
924 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
925 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
926 mdd.${MDT_DEV}.lfsck_namespace |
927 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
929 error "(10) unexpected status"
932 #define OBD_FAIL_LFSCK_DELAY1 0x1600
933 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
934 $START_NAMESPACE || error "(11) Fail to start LFSCK for namespace!"
936 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
937 [ "$STATUS" == "scanning-phase1" ] ||
938 error "(12) Expect 'scanning-phase1', but got '$STATUS'"
# Stop the MDS with fail_loc 0x160a set and restart it with 0x160b set; the
# status is then expected to read "crashed".
940 #define OBD_FAIL_LFSCK_CRASH 0x160a
941 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
944 echo "stop $SINGLEMDS"
945 stop $SINGLEMDS > /dev/null || error "(13) Fail to stop MDS!"
947 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
948 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
950 echo "start $SINGLEMDS"
951 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
952 error "(14) Fail to start MDS!"
# Poll recovery_status until it no longer reads RECOVERING, bounded by
# max_recovery_time.
# NOTE(review): the timeout check below compares with "!=", a string
# comparison rather than a numeric one.
954 local timeout=$(max_recovery_time)
957 while [ $timer -lt $timeout ]; do
958 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
959 mdt.${MDT_DEV}.recovery_status |
960 awk '/^status/ { print \\\$2 }'")
961 [ "$STATUS" != "RECOVERING" ] && break;
966 [ $timer != $timeout ] ||
967 error "(14.1) recovery timeout"
969 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
970 [ "$STATUS" == "crashed" ] ||
971 error "(15) Expect 'crashed', but got '$STATUS'"
# crashed -> scanning-phase1, then another MDS stop/start with fail_loc
# 0x160b set; the status is then expected to read "paused".
973 #define OBD_FAIL_LFSCK_DELAY2 0x1601
974 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
975 $START_NAMESPACE || error "(16) Fail to start LFSCK for namespace!"
977 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
978 [ "$STATUS" == "scanning-phase1" ] ||
979 error "(17) Expect 'scanning-phase1', but got '$STATUS'"
981 echo "stop $SINGLEMDS"
982 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop MDS!"
984 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
985 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
987 echo "start $SINGLEMDS"
988 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
989 error "(19) Fail to start MDS!"
992 while [ $timer -lt $timeout ]; do
993 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
994 mdt.${MDT_DEV}.recovery_status |
995 awk '/^status/ { print \\\$2 }'")
996 [ "$STATUS" != "RECOVERING" ] && break;
1001 [ $timer != $timeout ] ||
1002 error "(19.1) recovery timeout"
1004 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1005 [ "$STATUS" == "paused" ] ||
1006 error "(20) Expect 'paused', but got '$STATUS'"
# Restart once more, this time with the skip_lfsck mount option; the status
# must still read "paused" afterwards.
1008 echo "stop $SINGLEMDS"
1009 stop $SINGLEMDS > /dev/null || error "(20.1) Fail to stop MDS!"
1011 echo "start $SINGLEMDS without resume LFSCK"
1012 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SKIP_LFSCK > /dev/null ||
1013 error "(20.2) Fail to start MDS!"
1016 while [ $timer -lt $timeout ]; do
1017 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1018 mdt.${MDT_DEV}.recovery_status |
1019 awk '/^status/ { print \\\$2 }'")
1020 [ "$STATUS" != "RECOVERING" ] && break;
1022 timer=$((timer + 1))
1025 [ $timer != $timeout ] ||
1026 error "(20.3) recovery timeout"
1028 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1029 [ "$STATUS" == "paused" ] ||
1030 error "(20.4) Expect 'paused', but got '$STATUS'"
# paused -> scanning-phase2 (flags "scanned-once,inconsistent") -> completed
# with empty flags.
1032 #define OBD_FAIL_LFSCK_DELAY3 0x1602
1033 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
1035 $START_NAMESPACE || error "(21) Fail to start LFSCK for namespace!"
1036 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1037 mdd.${MDT_DEV}.lfsck_namespace |
1038 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
1040 error "(22) unexpected status"
1043 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1044 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
1045 error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
1047 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1048 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1049 mdd.${MDT_DEV}.lfsck_namespace |
1050 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1052 error "(24) unexpected status"
1055 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1056 [ -z "$FLAGS" ] || error "(25) Expect empty flags, but got '$FLAGS'"
1058 run_test 8 "LFSCK state machine"
# Test 9a -- "LFSCK speed control (1)": layout LFSCK under a speed limit.
# Visible flow: populate $DIR/$tdir/lfsck, start the layout LFSCK with
# "-s $BASE_SPEED1", bound-check average_speed_phase1, change the limit to
# $BASE_SPEED2 through lfsck_speed_limit, bound-check again, then set the
# limit to 0 on all MDT/OST nodes and wait for status "completed".
# NOTE(review): the function header/closing lines and the assignments of
# RUN_TIME1, RUN_TIME2 and TIME_DIFF do not appear in this span -- confirm
# against the complete file before relying on the arithmetic below.
# Skip unless /proc/cpuinfo has a line matching "processor.*: 1".
1061 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1062 skip "Testing on UP system, the speed may be inaccurate."
# Prepare the directory on MDT index 0 and fill it with files to scan.
1066 check_mount_and_prep
1067 $LFS mkdir -i 0 $DIR/$tdir/lfsck || error "(1) Fail to mkdir lfsck"
1068 $LFS setstripe -c 1 -i -1 $DIR/$tdir/lfsck
1069 createmany -o $DIR/$tdir/lfsck/f 5000
# First speed limit, passed to lfsck_start via -s.
1071 local BASE_SPEED1=100
1073 $START_LAYOUT -r -s $BASE_SPEED1 || error "(2) Fail to start LFSCK!"
# The scan must still be in phase 1 when the speed is sampled.
1076 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
1077 [ "$STATUS" == "scanning-phase1" ] ||
1078 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
1080 local SPEED=$($SHOW_LAYOUT |
1081 awk '/^average_speed_phase1/ { print $2 }')
1083 # There may be time error, normally it should be less than 2 seconds.
1084 # We allow another 20% schedule error.
1086 # MAX_MARGIN = 1.3 = 13 / 10
# Upper bound: the limit scaled by (RUN_TIME1 + TIME_DIFF) / RUN_TIME1 and
# then by 13/10; shell arithmetic is integer and evaluated left to right.
1087 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1088 RUN_TIME1 * 13 / 10))
1089 [ $SPEED -lt $MAX_SPEED ] || {
1091 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1092 error "(4) Speed $SPEED, expected < $MAX_SPEED"
1095 # adjust speed limit
1096 local BASE_SPEED2=300
1098 do_facet $SINGLEMDS \
1099 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1102 SPEED=$($SHOW_LAYOUT | awk '/^average_speed_phase1/ { print $2 }')
1103 # MIN_MARGIN = 0.7 = 7 / 10
# Lower bound: time-weighted mean of both limits, each run time shortened
# by TIME_DIFF, then scaled by 7/10.
1104 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1105 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1106 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
# On a non-ldiskfs MDT a too-low speed is reported through error_ignore
# (LU-5624). The "(5.2)" message belongs to the other branch, whose leading
# lines are not visible in this span.
1107 [ $SPEED -gt $MIN_SPEED ] || {
1108 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1109 error_ignore LU-5624 \
1110 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
1113 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
1117 # MAX_MARGIN = 1.3 = 13 / 10
# Upper bound for the combined run, same weighting with TIME_DIFF added.
1118 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1119 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1120 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1121 [ $SPEED -lt $MAX_SPEED ] || {
1123 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1124 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1125 error "(6) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit to 0 on every MDT and OST node (presumably "no
# limit" -- confirm), then wait for the layout LFSCK to report "completed".
1128 do_nodes $(comma_list $(mdts_nodes)) \
1129 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1130 do_nodes $(comma_list $(osts_nodes)) \
1131 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1133 wait_update_facet $SINGLEMDS \
1134 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout |
1135 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
1136 error "(7) Failed to get expected 'completed'"
# Register the test under id 9a.
1138 run_test 9a "LFSCK speed control (1)"
# Test 9b -- "LFSCK speed control (2)": namespace LFSCK phase-2 speed limit.
# Visible flow: create files while fail_loc 0x1604 is set, run a first
# namespace LFSCK with fail_loc 0x160c until it reports "stopped", restart
# it with "-s $BASE_SPEED1", bound-check average_speed_phase2, change the
# limit to $BASE_SPEED2, bound-check again, then set the limit to 0 and
# wait for "completed".
# NOTE(review): the function header/closing lines, loop/if terminators and
# the assignments of RUN_TIME1, RUN_TIME2 and TIME_DIFF are not visible in
# this span -- confirm against the complete file.
# Skip unless /proc/cpuinfo has a line matching "processor.*: 1".
1141 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1142 skip "Testing on UP system, the speed may be inaccurate."
1148 echo "Preparing another 50 * 50 files (with error) at $(date)."
1149 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
# Files and directories below are created while the 0x1604 fail_loc is set
# on the MDS.
1150 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1151 createmany -d $DIR/$tdir/d 50
1152 createmany -m $DIR/$tdir/f 50
1153 for ((i = 0; i < 50; i++)); do
1154 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
1157 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
# First pass: with 0x160c set the namespace LFSCK is expected to end up in
# status "stopped".
1158 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1159 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
1160 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1161 mdd.${MDT_DEV}.lfsck_namespace |
1162 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1164 error "(5) unexpected status"
1167 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1168 echo "Prepared at $(date)."
# First speed limit, passed to lfsck_start via -s (no -r here).
1170 local BASE_SPEED1=50
1172 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
# The scan is expected to be in phase 2 when the speed is sampled.
1175 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1176 [ "$STATUS" == "scanning-phase2" ] ||
1177 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
1179 local SPEED=$($SHOW_NAMESPACE |
1180 awk '/^average_speed_phase2/ { print $2 }')
1181 # There may be time error, normally it should be less than 2 seconds.
1182 # We allow another 20% schedule error.
1184 # MAX_MARGIN = 1.3 = 13 / 10
# Upper bound: the limit scaled by (RUN_TIME1 + TIME_DIFF) / RUN_TIME1 and
# then by 13/10, in integer arithmetic.
1185 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1186 RUN_TIME1 * 13 / 10))
1187 [ $SPEED -lt $MAX_SPEED ] || {
1189 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1190 error "(8) Speed $SPEED, expected < $MAX_SPEED"
1193 # adjust speed limit
1194 local BASE_SPEED2=150
1196 do_facet $SINGLEMDS \
1197 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1200 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
1201 # MIN_MARGIN = 0.7 = 7 / 10
# Lower bound: time-weighted mean of both limits with TIME_DIFF subtracted
# from each run time, scaled by 7/10.
1202 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1203 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1204 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
# On a non-ldiskfs MDT a too-low speed is reported through error_ignore
# (LU-5624). The "(9.2)" message belongs to the other branch, whose leading
# lines are not visible in this span.
1205 [ $SPEED -gt $MIN_SPEED ] || {
1206 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1207 error_ignore LU-5624 \
1208 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1211 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
1215 # MAX_MARGIN = 1.3 = 13 / 10
# Upper bound for the combined run, same weighting with TIME_DIFF added.
1216 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1217 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1218 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1219 [ $SPEED -lt $MAX_SPEED ] || {
1221 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1222 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1223 error "(10) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit to 0 on every MDT and OST node, then wait for the
# namespace LFSCK to report "completed".
1226 do_nodes $(comma_list $(mdts_nodes)) \
1227 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1228 do_nodes $(comma_list $(osts_nodes)) \
1229 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1230 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1231 mdd.${MDT_DEV}.lfsck_namespace |
1232 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1234 error "(11) unexpected status"
# Register the test under id 9b.
1237 run_test 9b "LFSCK speed control (2)"
# Test 10 -- "System is available during LFSCK scanning".
# Visible flow: create directories/files under two different fail_loc
# settings, remount the client, start a speed-limited namespace LFSCK and,
# while it is in "scanning-phase1", run ordinary file operations (ls,
# touch, mkdir, unlink, rm, mv, ln, ln -s); finally set the speed limit to
# 0 and wait for "completed".
# NOTE(review): the function header/closing lines and loop terminators are
# not visible in this span -- confirm against the complete file.
# Only run on an ldiskfs MDT.
1241 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
1242 skip "lookup(..)/linkea on ZFS issue" && return
1246 echo "Preparing more files with error at $(date)."
1247 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1248 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
# Even indices 0, 2, ..., 998 are created while fail_loc 0x1603 is set.
1250 for ((i = 0; i < 1000; i = $((i+2)))); do
1251 mkdir -p $DIR/$tdir/d${i}
1252 touch $DIR/$tdir/f${i}
1253 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1256 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1257 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
# Odd indices 1, 3, ..., 999 are created while fail_loc 0x1604 is set.
1259 for ((i = 1; i < 1000; i = $((i+2)))); do
1260 mkdir -p $DIR/$tdir/d${i}
1261 touch $DIR/$tdir/f${i}
1262 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1265 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1266 echo "Prepared at $(date)."
# Extra hard link created after the fail_loc has been cleared.
1268 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
# Remount the client before starting the scan.
1270 umount_client $MOUNT
1271 mount_client $MOUNT || error "(3) Fail to start client!"
# Start the namespace LFSCK with a speed limit of 100 (-s 100).
1273 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1276 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1277 [ "$STATUS" == "scanning-phase1" ] ||
1278 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Ordinary client operations; each one must succeed.
1280 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1282 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1284 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1286 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1288 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1290 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1292 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1294 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1295 error "(14) Fail to softlink!"
# The scan must still be in phase 1 after the operations above.
1297 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1298 [ "$STATUS" == "scanning-phase1" ] ||
1299 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
# Set lfsck_speed_limit to 0 everywhere and wait for "completed".
1301 do_nodes $(comma_list $(mdts_nodes)) \
1302 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1303 do_nodes $(comma_list $(osts_nodes)) \
1304 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1305 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1306 mdd.${MDT_DEV}.lfsck_namespace |
1307 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1309 error "(16) unexpected status"
# Register the test under id 10.
1312 run_test 10 "System is available during LFSCK scanning"
# ost_remove_lastid: mount an OST backend locally, delete one LAST_ID
# file, and unmount again.
# Uses ${ost} (facet number, as in "ost${ost}") and ${idx} (directory under
# O/); the visible caller passes "1 0".
# Returns 1 if mount_fstype fails and 2 if unmount_fstype fails.
# NOTE(review): the assignments of ost/idx and the closing brace are not
# visible in this span -- presumably ost=$1 and idx=$2; confirm.
1315 ost_remove_lastid() {
# Prefix used to run the removal on the OST facet.
1318 local rcmd="do_facet ost${ost}"
1320 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1322 # step 1: local mount
1323 mount_fstype ost${ost} || return 1
1324 # step 2: remove the specified LAST_ID
# Brace expansion yields two paths: O/${idx}/LAST_ID and O/${idx}/d0/0.
1325 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
# Unmount the backend again (the original step comment is not visible).
1327 unmount_fstype ost${ost} || return 2
# Test 11a -- "LFSCK can rebuild lost last_id".
# Visible flow: create 64 single-stripe files on OST index 0, remove the
# LAST_ID via ost_remove_lastid, restart ost1, run the layout LFSCK on the
# OST and check that the "crashed_lastid" flag appears and is cleared once
# the scan has completed.
# NOTE(review): the function header/closing lines and the statements that
# stop the client/ost1 before the removal are not visible in this span.
1331 check_mount_and_prep
1332 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1333 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
# Remove LAST_ID for ost1, idx 0.
1338 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
1340 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1341 error "(2) Fail to start ost1"
1343 #define OBD_FAIL_LFSCK_DELAY4 0x160e
# fail_val=3 with fail_loc 0x160e is set before the scan so that the
# intermediate flag can be observed (presumably a delay -- confirm).
1344 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1346 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1347 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
# Wait until the OST-side lfsck_layout flags read "crashed_lastid".
1349 wait_update_facet ost1 "$LCTL get_param -n \
1350 obdfilter.${OST_DEV}.lfsck_layout |
1351 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1353 error "(5) unexpected status"
1356 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
# With the fail_loc cleared the scan is expected to reach "completed".
1358 wait_update_facet ost1 "$LCTL get_param -n \
1359 obdfilter.${OST_DEV}.lfsck_layout |
1360 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1362 error "(6) unexpected status"
1365 echo "the LAST_ID(s) should have been rebuilt"
# After completion the flags field must be empty.
1366 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1367 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
# Register the test under id 11a.
1369 run_test 11a "LFSCK can rebuild lost last_id"
# Test 11b -- "LFSCK can rebuild crashed last_id".
# Visible flow: with fail_loc 0x160d set on ost1, create files and record
# the in-memory last_id (lastid1); restart ost1 and read last_id again
# (lastid2), expecting lastid1 > lastid2; run the layout LFSCK on the OST,
# restart ost1 once more and wait until last_id equals lastid1.
# NOTE(review): the function header/closing lines and the tail of the
# polling loop (sleep/done) are not visible in this span.
1372 check_mount_and_prep
1373 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1375 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1376 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1377 do_facet ost1 $LCTL set_param fail_loc=0x160d
# Create 32 more files than precreated_ost_obj_count reports (presumably so
# that objects beyond the pre-created pool get allocated -- confirm).
1379 local count=$(precreated_ost_obj_count 0 0)
1381 createmany -o $DIR/$tdir/f $((count + 32))
# Read the sequence used by the MDT0000 -> OST0000 osp device, then pick
# the last_id entry of that sequence (field 2 of the "seq:id" line).
1383 local proc_path="${FSNAME}-OST0000-osc-MDT0000"
1384 local seq=$(do_facet mds1 $LCTL get_param -n \
1385 osp.${proc_path}.prealloc_last_seq)
1386 local lastid1=$(do_facet ost1 "lctl get_param -n \
1387 obdfilter.${ost1_svc}.last_id" | grep $seq |
1388 awk -F: '{ print $2 }')
1390 umount_client $MOUNT
1391 stop ost1 || error "(1) Fail to stop ost1"
1393 #define OBD_FAIL_OST_ENOSPC 0x215
# ost1 is restarted with fail_loc 0x215 set.
1394 do_facet ost1 $LCTL set_param fail_loc=0x215
1396 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1397 error "(2) Fail to start ost1"
# Poll up to 60 times until a last_id value for $seq can be read.
# NOTE(review): lastid2 is not declared local in the visible lines.
1399 for ((i = 0; i < 60; i++)); do
1400 lastid2=$(do_facet ost1 "lctl get_param -n \
1401 obdfilter.${ost1_svc}.last_id" | grep $seq |
1402 awk -F: '{ print $2 }')
1403 [ ! -z $lastid2 ] && break;
1407 echo "the on-disk LAST_ID should be smaller than the expected one"
1408 [ $lastid1 -gt $lastid2 ] ||
1409 error "(4) expect lastid1 [ $lastid1 ] > lastid2 [ $lastid2 ]"
1411 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1412 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1414 wait_update_facet ost1 "$LCTL get_param -n \
1415 obdfilter.${OST_DEV}.lfsck_layout |
1416 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1418 error "(6) unexpected status"
# Restart ost1 so that last_id is read back after the LFSCK run.
1421 stop ost1 || error "(7) Fail to stop ost1"
1423 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1424 error "(8) Fail to start ost1"
1426 echo "the on-disk LAST_ID should have been rebuilt"
# Wait until last_id for $seq equals lastid1; dump last_id on failure.
1427 wait_update_facet ost1 "$LCTL get_param -n \
1428 obdfilter.${ost1_svc}.last_id | grep $seq |
1429 awk -F: '{ print \\\$2 }'" "$lastid1" 60 || {
1430 do_facet ost1 $LCTL get_param -n \
1431 obdfilter.${ost1_svc}.last_id
1432 error "(9) expect lastid1 $seq:$lastid1"
# Clear the fail_loc and stop everything.
1435 do_facet ost1 $LCTL set_param fail_loc=0
1436 stopall || error "(10) Fail to stopall"
# Register the test under id 11b.
1438 run_test 11b "LFSCK can rebuild crashed last_id"
# Test 12a -- "single command to trigger LFSCK on all devices".
# Visible flow: create 100 files per MDT, then for the namespace LFSCK and
# afterwards the layout LFSCK: start on all targets with "-A -s 1", expect
# "scanning-phase1", stop with "lfsck_stop -A", expect "stopped", restart
# with "-s 0" and expect "completed".
# NOTE(review): the function header/closing lines and loop terminators are
# not visible in this span -- confirm against the complete file.
# Requires at least two MDTs.
1441 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
1443 check_mount_and_prep
# One directory per MDT (index k-1), each filled with 100 files.
1444 for k in $(seq $MDSCOUNT); do
1445 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1446 createmany -o $DIR/$tdir/${k}/f 100 ||
1447 error "(0) Fail to create 100 files."
1450 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1451 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1452 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1454 echo "All the LFSCK targets should be in 'scanning-phase1' status."
# The trailing number passed to wait_all_targets* looks like the step id
# used in its error message -- confirm against the helper.
1455 wait_all_targets namespace scanning-phase1 3
1457 echo "Stop namespace LFSCK on all targets by single lctl command."
1458 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1459 error "(4) Fail to stop LFSCK on all devices!"
1461 echo "All the LFSCK targets should be in 'stopped' status."
1462 wait_all_targets_blocked namespace stopped 5
1464 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1465 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1466 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1468 echo "All the LFSCK targets should be in 'completed' status."
1469 wait_all_targets_blocked namespace completed 7
# Full debug logging is enabled for the layout half of the test only.
1471 start_full_debug_logging
1473 echo "Start layout LFSCK on all targets by single command (-s 1)."
1474 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1475 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1477 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1478 wait_all_targets layout scanning-phase1 9
1480 echo "Stop layout LFSCK on all targets by single lctl command."
1481 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1482 error "(10) Fail to stop LFSCK on all devices!"
1484 echo "All the LFSCK targets should be in 'stopped' status."
1485 wait_all_targets_blocked layout stopped 11
# Additionally verify the layout LFSCK status on every OST directly.
1487 for k in $(seq $OSTCOUNT); do
1488 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1489 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1490 awk '/^status/ { print $2 }')
1491 [ "$STATUS" == "stopped" ] ||
1492 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1495 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1496 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1497 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1499 echo "All the LFSCK targets should be in 'completed' status."
1500 wait_all_targets_blocked layout completed 14
1502 stop_full_debug_logging
# Register the test under id 12a.
1504 run_test 12a "single command to trigger LFSCK on all devices"
# Test 12b -- "auto detect Lustre device".
# Starts LFSCK with "-A -r" but without "-M" and waits for both the
# namespace and the layout LFSCK to report "completed". When mds1 hosts
# more than one device whose type column contains "mdt", starting a layout
# LFSCK without "-M"/"-A" is expected to fail.
# NOTE(review): the function header/closing lines and the closing "fi" are
# not visible in this span -- confirm against the complete file.
1507 check_mount_and_prep
1509 echo "Start LFSCK without '-M' specified."
1510 do_facet mds1 $LCTL lfsck_start -A -r ||
1511 error "(0) Fail to start LFSCK without '-M'"
1513 wait_all_targets_blocked namespace completed 1
1514 wait_all_targets_blocked layout completed 2
# Count the "lctl dl" entries on mds1 whose third column contains "mdt".
1516 local count=$(do_facet mds1 $LCTL dl |
1517 awk '{ print $3 }' | grep mdt | wc -l)
1518 if [ $count -gt 1 ]; then
1520 echo "Start layout LFSCK on the node with multipe targets,"
1521 echo "but not specify '-M'/'-A' option. Should get failure."
# Success of lfsck_start is the error case here; "|| true" keeps the
# expected failure from propagating as a non-zero status.
1523 do_facet mds1 $LCTL lfsck_start -t layout -r &&
1524 error "(3) Start layout LFSCK should fail" || true
# Register the test under id 12b.
1527 run_test 12b "auto detect Lustre device"
# Test 13 -- "LFSCK can repair crashed lmm_oi".
# Creates one plain file and one PFL file while fail_loc 0x160f is set on
# the MDS, runs the layout LFSCK and expects repaired_others to be 2.
# NOTE(review): the function header/closing lines are not visible in this
# span -- confirm against the complete file.
1531 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1532 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1533 echo "MDT-object FID."
1536 check_mount_and_prep
1538 echo "Inject failure stub to simulate bad lmm_oi"
1539 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1540 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
# Two files are created under the fail_loc: f0 (createmany) and the
# two-component PFL file f1.
1541 createmany -o $DIR/$tdir/f 1
1542 $LFS setstripe -E 1M -S 1M -E -1 $DIR/$tdir/f1 ||
1543 error "(0) Fail to create PFL $DIR/$tdir/f1"
1544 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1546 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1547 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1549 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1550 mdd.${MDT_DEV}.lfsck_layout |
1551 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1553 error "(2) unexpected status"
# Exactly two repairs are expected in the repaired_others counter.
1556 local repaired=$($SHOW_LAYOUT |
1557 awk '/^repaired_others/ { print $2 }')
1558 [ $repaired -eq 2 ] ||
1559 error "(3) Fail to repair crashed lmm_oi: $repaired"
# Register the test under id 13.
1561 run_test 13 "LFSCK can repair crashed lmm_oi"
# Test 14a -- "LFSCK can repair MDT-object with dangling LOV EA
# reference (1)", i.e. without the delay-create-ostobj option.
# Visible flow: create plain and PFL files plus two guard files while
# fail_loc 0x1610 is set on ost1; verify that ls fails; run the layout
# LFSCK once with "-r" (guards must still fail to stat) and once with
# "-r -c" (guards must then stat with size 0); both runs must report
# repaired_dangling >= 32.
# NOTE(review): the function header/closing lines, loop terminators and the
# stopall call announced by the last echo are not visible in this span.
1565 echo "The OST-object referenced by the MDT-object should be there;"
1566 echo "otherwise, the LFSCK should re-create the missing OST-object."
1567 echo "without '--delay-create-ostobj' option."
1570 check_mount_and_prep
1571 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1573 echo "Inject failure stub to simulate dangling referenced MDT-object"
1574 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1575 do_facet ost1 $LCTL set_param fail_loc=0x1610
# Create 16 more files than precreated_ost_obj_count reports, then guard0.
1576 local count=$(precreated_ost_obj_count 0 0)
1578 createmany -o $DIR/$tdir/f $((count + 16)) ||
1579 error "(0.1) Fail to create $DIR/$tdir/fx"
1580 touch $DIR/$tdir/guard0
# 16 two-component PFL files, followed by guard1.
1582 for ((i = 0; i < 16; i++)); do
1583 $LFS setstripe -E 512K -S 256K -o 0 -E 2M \
1584 $DIR/$tdir/f_comp${i} ||
1585 error "(0.2) Fail to create $DIR/$tdir/f_comp${i}"
1587 touch $DIR/$tdir/guard1
1589 do_facet ost1 $LCTL set_param fail_loc=0
1591 start_full_debug_logging
1593 # exhaust other pre-created dangling cases
1594 count=$(precreated_ost_obj_count 0 0)
1595 createmany -o $DIR/$tdir/a $count ||
1596 error "(0.5) Fail to create $count files."
1598 echo "'ls' should fail because of dangling referenced MDT-object"
1599 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
1601 echo "Trigger layout LFSCK to find out dangling reference"
# First pass: "-r" only.
1602 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1604 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1605 mdd.${MDT_DEV}.lfsck_layout |
1606 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1608 error "(3) unexpected status"
1611 local repaired=$($SHOW_LAYOUT |
1612 awk '/^repaired_dangling/ { print $2 }')
1613 [ $repaired -ge 32 ] ||
1614 error "(4) Fail to repair dangling reference: $repaired"
1616 echo "'stat' should fail because of not repair dangling by default"
1617 stat $DIR/$tdir/guard0 > /dev/null 2>&1 &&
1618 error "(5.1) stat should fail"
1619 stat $DIR/$tdir/guard1 > /dev/null 2>&1 &&
1620 error "(5.2) stat should fail"
1622 echo "Trigger layout LFSCK to repair dangling reference"
# Second pass: "-r -c"; after it the guard files are expected to stat.
1623 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1625 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1626 mdd.${MDT_DEV}.lfsck_layout |
1627 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1629 error "(7) unexpected status"
1632 # There may be some async LFSCK updates in processing, wait for
1633 # a while until the target reparation has been done. LU-4970.
1635 echo "'stat' should success after layout LFSCK repairing"
# Poll stat until the Size field of each guard file reads 0.
1636 wait_update_facet client "stat $DIR/$tdir/guard0 |
1637 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1638 stat $DIR/$tdir/guard0
1640 error "(8.1) unexpected size"
1643 wait_update_facet client "stat $DIR/$tdir/guard1 |
1644 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1645 stat $DIR/$tdir/guard1
1647 error "(8.2) unexpected size"
1650 repaired=$($SHOW_LAYOUT |
1651 awk '/^repaired_dangling/ { print $2 }')
1652 [ $repaired -ge 32 ] ||
1653 error "(9) Fail to repair dangling reference: $repaired"
1655 stop_full_debug_logging
1657 echo "stopall to cleanup object cache"
1660 setupall > /dev/null
# Register the test under id 14a.
1662 run_test 14a "LFSCK can repair MDT-object with dangling LOV EA reference (1)"
# Test 14b -- "LFSCK can repair MDT-object with dangling LOV EA
# reference (2)", i.e. with the delay-create-ostobj option (-d).
# Same scenario as the "(1)" variant, but with a single guard file and the
# layout LFSCK started with "-r -o -d" and then "-r -o -c -d"; completion
# is checked with wait_all_targets_blocked.
# NOTE(review): the function header/closing lines and the stopall call
# announced by the last echo are not visible in this span.
1666 echo "The OST-object referenced by the MDT-object should be there;"
1667 echo "otherwise, the LFSCK should re-create the missing OST-object."
1668 echo "with '--delay-create-ostobj' option."
1671 check_mount_and_prep
1672 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1674 echo "Inject failure stub to simulate dangling referenced MDT-object"
1675 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1676 do_facet ost1 $LCTL set_param fail_loc=0x1610
# Create 31 more files than precreated_ost_obj_count reports, then guard.
1677 local count=$(precreated_ost_obj_count 0 0)
1679 createmany -o $DIR/$tdir/f $((count + 31))
1680 touch $DIR/$tdir/guard
1681 do_facet ost1 $LCTL set_param fail_loc=0
1683 start_full_debug_logging
1685 # exhaust other pre-created dangling cases
1686 count=$(precreated_ost_obj_count 0 0)
1687 createmany -o $DIR/$tdir/a $count ||
1688 error "(0) Fail to create $count files."
1690 echo "'ls' should fail because of dangling referenced MDT-object"
1691 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
1693 echo "Trigger layout LFSCK to find out dangling reference"
# First pass: "-r -o -d".
1694 $START_LAYOUT -r -o -d || error "(2) Fail to start LFSCK for layout!"
1696 wait_all_targets_blocked layout completed 3
1698 local repaired=$($SHOW_LAYOUT |
1699 awk '/^repaired_dangling/ { print $2 }')
1700 [ $repaired -ge 32 ] ||
1701 error "(4) Fail to repair dangling reference: $repaired"
1703 echo "'stat' should fail because of not repair dangling by default"
1704 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
1706 echo "Trigger layout LFSCK to repair dangling reference"
# Second pass adds "-c"; afterwards the guard file is expected to stat.
1707 $START_LAYOUT -r -o -c -d || error "(6) Fail to start LFSCK for layout!"
1709 wait_all_targets_blocked layout completed 7
1711 # There may be some async LFSCK updates in processing, wait for
1712 # a while until the target reparation has been done. LU-4970.
1714 echo "'stat' should success after layout LFSCK repairing"
# Poll stat until the Size field of the guard file reads 0.
1715 wait_update_facet client "stat $DIR/$tdir/guard |
1716 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1717 stat $DIR/$tdir/guard
1719 error "(8) unexpected size"
1722 repaired=$($SHOW_LAYOUT |
1723 awk '/^repaired_dangling/ { print $2 }')
1724 [ $repaired -ge 32 ] ||
1725 error "(9) Fail to repair dangling reference: $repaired"
1727 stop_full_debug_logging
1729 echo "stopall to cleanup object cache"
1732 setupall > /dev/null
# Register the test under id 14b.
1734 run_test 14b "LFSCK can repair MDT-object with dangling LOV EA reference (2)"
# Test 15a -- "LFSCK can repair unmatched MDT-object/OST-object
# pairs (1)".
# Writes a plain file and a PFL file while fail_loc 0x1611 is set on all
# OST nodes, runs the layout LFSCK and expects repaired_unmatched_pair to
# be at least 3.
# NOTE(review): the function header/closing lines and the file operand of
# the PFL setstripe command are not visible in this span -- confirm
# against the complete file.
1738 echo "If the OST-object referenced by the MDT-object back points"
1739 echo "to some non-exist MDT-object, then the LFSCK should repair"
1740 echo "the OST-object to back point to the right MDT-object."
1743 check_mount_and_prep
1744 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1746 echo "Inject failure stub to make the OST-object to back point to"
1747 echo "non-exist MDT-object."
1748 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
1750 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
1751 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1752 $LFS setstripe -E 1M -S 256K -c 1 -E -1 -S 512K -c $OSTCOUNT \
1754 error "(0) Fail to create PFL $DIR/$tdir/f1"
1755 # 'dd' will trigger punch RPC firstly on every OST-objects.
1756 # So even though some OST-object will not be write by 'dd',
1757 # as long as it is allocated (may be NOT allocated in pfl_3b)
1758 # its layout information will be set also.
# 257 * 4K is 4K more than the 1M extent of the first component.
1759 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4K count=257
1760 cancel_lru_locks osc
1761 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1763 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1764 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1766 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1767 mdd.${MDT_DEV}.lfsck_layout |
1768 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1770 error "(2) unexpected status"
# At least three repaired pairs are expected.
1773 local repaired=$($SHOW_LAYOUT |
1774 awk '/^repaired_unmatched_pair/ { print $2 }')
1775 [ $repaired -ge 3 ] ||
1776 error "(3) Fail to repair unmatched pair: $repaired"
# Register the test under id 15a.
1778 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# Test 15b -- "LFSCK can repair unmatched MDT-object/OST-object
# pairs (2)".
# Writes a guard file normally, then a plain file and a PFL file while
# fail_loc 0x1612 is set on all OST nodes, runs the layout LFSCK and
# expects repaired_unmatched_pair to equal 4.
# NOTE(review): the function header/closing lines, the initial assignment
# of "stripes" and the file operand of the PFL setstripe command are not
# visible in this span -- confirm against the complete file.
1782 echo "If the OST-object referenced by the MDT-object back points"
1783 echo "to other MDT-object that doesn't recognize the OST-object,"
1784 echo "then the LFSCK should repair it to back point to the right"
1785 echo "MDT-object (the first one)."
1788 check_mount_and_prep
1789 mkdir -p $DIR/$tdir/0
1790 $LFS setstripe -c 1 -i 0 $DIR/$tdir/0
# The guard file is written before the fail_loc is set.
1791 dd if=/dev/zero of=$DIR/$tdir/0/guard bs=1M count=1
1792 cancel_lru_locks osc
1794 echo "Inject failure stub to make the OST-object to back point to"
1795 echo "other MDT-object"
# Use two stripes for the first PFL component when at least two OSTs exist.
1798 [ $OSTCOUNT -ge 2 ] && stripes=2
1800 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1801 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1612
1802 dd if=/dev/zero of=$DIR/$tdir/0/f0 bs=1M count=1
1803 $LFS setstripe -E 1M -S 256K -c $stripes -E 2M -S 512K -c 1 \
1805 error "(0) Fail to create PFL $DIR/$tdir/f1"
1806 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2
1807 cancel_lru_locks osc
1808 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1810 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1811 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1813 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1814 mdd.${MDT_DEV}.lfsck_layout |
1815 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1817 error "(2) unexpected status"
# Exactly four repaired pairs are expected.
1820 local repaired=$($SHOW_LAYOUT |
1821 awk '/^repaired_unmatched_pair/ { print $2 }')
1822 [ $repaired -eq 4 ] ||
1823 error "(3) Fail to repair unmatched pair: $repaired"
# Register the test under id 15b.
1825 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# Test 15c -- "LFSCK can repair unmatched MDT-object/OST-object
# pairs (3)": layout LFSCK racing with a delayed directory migration.
# Expects repaired_unmatched_pair == 1 and repaired_multiple_referenced == 0
# after the scan.
# NOTE(review): the function header/closing lines and any statement between
# the background migrate and the LFSCK start (e.g. a sleep) are not visible
# in this span -- confirm against the complete file.
# Requires >= 2 MDTs and an MDS older than 2.7.55 (see LU-6437).
1828 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
1830 [ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.55) ] &&
1831 skip "Skip the test after 2.7.55 see LU-6437" && return
1834 echo "According to current metadata migration implementation,"
1835 echo "before the old MDT-object is removed, both the new MDT-object"
1836 echo "and old MDT-object will reference the same LOV layout. Then if"
1837 echo "the layout LFSCK finds the new MDT-object by race, it will"
1838 echo "regard related OST-object(s) as multiple referenced case, and"
1839 echo "will try to create new OST-object(s) for the new MDT-object."
1840 echo "To avoid such trouble, the layout LFSCK needs to lock the old"
1841 echo "MDT-object before confirm the multiple referenced case."
1844 check_mount_and_prep
# Directory a1 lives on MDT index 1; its file is striped on OST index 0.
1845 $LFS mkdir -i 1 $DIR/$tdir/a1
1846 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1847 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=1
1848 cancel_lru_locks osc
1850 echo "Inject failure stub on MDT1 to delay the migration"
1852 #define OBD_FAIL_MIGRATE_DELAY 0x1803
1853 do_facet mds2 $LCTL set_param fail_val=5 fail_loc=0x1803
1854 echo "Migrate $DIR/$tdir/a1 from MDT1 to MDT0 with delay"
# The migration runs in the background so the LFSCK can start meanwhile.
1855 $LFS migrate -m 0 $DIR/$tdir/a1 &
1858 echo "Trigger layout LFSCK to race with the migration"
1859 $START_LAYOUT -A -r || error "(1) Fail to start layout LFSCK!"
1861 wait_all_targets_blocked layout completed 2
1863 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
1864 local repaired=$($SHOW_LAYOUT |
1865 awk '/^repaired_unmatched_pair/ { print $2 }')
1866 [ $repaired -eq 1 ] ||
1867 error "(3) Fail to repair unmatched pair: $repaired"
# No multiple-reference repair may have been made.
1869 repaired=$($SHOW_LAYOUT |
1870 awk '/^repaired_multiple_referenced/ { print $2 }')
1871 [ $repaired -eq 0 ] ||
1872 error "(4) Unexpectedly repaird multiple references: $repaired"
# Register the test under id 15c.
1874 run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)"
# Test 16 -- "LFSCK can repair inconsistent MDT-object/OST-object owner".
# Changes the owner of a file to 1.1 while fail_loc 0x1613 is set on the
# MDS, runs the layout LFSCK and expects repaired_inconsistent_owner to
# equal 1.
# NOTE(review): the function header/closing lines are not visible in this
# span -- confirm against the complete file.
1878 echo "If the OST-object's owner information does not match the owner"
1879 echo "information stored in the MDT-object, then the LFSCK trust the"
1880 echo "MDT-object and update the OST-object's owner information."
1883 check_mount_and_prep
1884 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1885 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1886 cancel_lru_locks osc
1888 echo "Inject failure stub to skip OST-object owner changing"
1889 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
1890 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
# The chown happens while the fail_loc is set.
1891 chown 1.1 $DIR/$tdir/f0
1892 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1894 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
1897 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1899 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1900 mdd.${MDT_DEV}.lfsck_layout |
1901 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1903 error "(2) unexpected status"
# Exactly one owner repair is expected.
1906 local repaired=$($SHOW_LAYOUT |
1907 awk '/^repaired_inconsistent_owner/ { print $2 }')
1908 [ $repaired -eq 1 ] ||
1909 error "(3) Fail to repair inconsistent owner: $repaired"
# Register the test under id 16.
1911 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# Test 17 -- "LFSCK can repair multiple references".
# Visible flow: with fail_loc 0x1614 set on the MDS, write a 1M guard file
# and create f0 and a PFL file f1; both must then show the guard's size
# (1048576). After the layout LFSCK (repaired_multiple_referenced == 2),
# writing 2M to f0 and to f1 must leave the guard's size unchanged.
# NOTE(review): the function header/closing lines, the last line of the
# introductory echo text and the file operand of the PFL setstripe command
# are not visible in this span -- confirm against the complete file.
1915 echo "If more than one MDT-objects reference the same OST-object,"
1916 echo "and the OST-object only recognizes one MDT-object, then the"
1917 echo "LFSCK should create new OST-objects for such non-recognized"
1921 check_mount_and_prep
1922 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1924 echo "Inject failure stub to make two MDT-objects to refernce"
1925 echo "the OST-object"
1927 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
1928 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
1929 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
# Client lock caches are dropped between the individual creation steps.
1930 cancel_lru_locks mdc
1931 cancel_lru_locks osc
1933 createmany -o $DIR/$tdir/f 1
1934 cancel_lru_locks mdc
1935 cancel_lru_locks osc
1937 $LFS setstripe -E 2M -S 256K -o 0 -E 4M -S 512K -o 0 \
1939 error "(0) Fail to create PFL $DIR/$tdir/f1"
1940 cancel_lru_locks mdc
1941 cancel_lru_locks osc
1942 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1944 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
1945 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard use the same OST-objects"
# Field 5 of "ls -l" is the size; f0 and f1 are expected to report the
# guard's 1M although nothing was written to them.
1946 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
1947 [ $size -eq 1048576 ] ||
1948 error "(1.1) f0 (wrong) size should be 1048576, but got $size"
1950 size=$(ls -l $DIR/$tdir/f1 | awk '{ print $5 }')
1951 [ $size -eq 1048576 ] ||
1952 error "(1.2) f1 (wrong) size should be 1048576, but got $size"
1954 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
1957 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1959 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1960 mdd.${MDT_DEV}.lfsck_layout |
1961 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1963 error "(3) unexpected status"
# Exactly two multiple-reference repairs are expected.
1966 local repaired=$($SHOW_LAYOUT |
1967 awk '/^repaired_multiple_referenced/ { print $2 }')
1968 [ $repaired -eq 2 ] ||
1969 error "(4) Fail to repair multiple references: $repaired"
1971 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
# Growing f0 to 2M must not change the guard's size.
1972 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
1973 error "(5) Fail to write f0."
1974 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1975 [ $size -eq 1048576 ] ||
1976 error "(6) guard size should be 1048576, but got $size"
1978 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard should use diff OST-objects"
# Same check after growing f1 to 2M.
1979 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2 ||
1980 error "(7) Fail to write f1."
1981 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1982 [ $size -eq 1048576 ] ||
1983 error "(8) guard size should be 1048576, but got $size"
# Register the test under id 17.
1985 run_test 17 "LFSCK can repair multiple references"
# Top-level statement (not attached to a visible test body): pass
# "debug=+cache" to lctl set_param on the local node, discarding stdout.
# Presumably this adds the "cache" flag to the debug mask for the tests
# that follow -- confirm against the lctl documentation.
1987 $LCTL set_param debug=+cache > /dev/null
1991 echo "The target MDT-object is there, but related stripe information"
1992 echo "is lost or partly lost. The LFSCK should regenerate the missing"
1993 echo "layout EA entries."
1996 check_mount_and_prep
1997 $LFS mkdir -i 0 $DIR/$tdir/a1
1998 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1999 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2001 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2003 $LFS path2fid $DIR/$tdir/a1/f1
2004 $LFS getstripe $DIR/$tdir/a1/f1
2006 if [ $MDSCOUNT -ge 2 ]; then
2007 $LFS mkdir -i 1 $DIR/$tdir/a2
2008 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2009 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2010 $LFS path2fid $DIR/$tdir/a2/f2
2011 $LFS getstripe $DIR/$tdir/a2/f2
2014 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2015 error "(0) Fail to create PFL $DIR/$tdir/f3"
2017 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2019 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2021 $LFS path2fid $DIR/$tdir/f3
2022 $LFS getstripe $DIR/$tdir/f3
2024 cancel_lru_locks osc
2026 echo "Inject failure, to make the MDT-object lost its layout EA"
2027 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
2028 do_facet mds1 $LCTL set_param fail_loc=0x1615
2029 chown 1.1 $DIR/$tdir/a1/f1
2031 if [ $MDSCOUNT -ge 2 ]; then
2032 do_facet mds2 $LCTL set_param fail_loc=0x1615
2033 chown 1.1 $DIR/$tdir/a2/f2
2036 chown 1.1 $DIR/$tdir/f3
2041 do_facet mds1 $LCTL set_param fail_loc=0
2042 if [ $MDSCOUNT -ge 2 ]; then
2043 do_facet mds2 $LCTL set_param fail_loc=0
2046 cancel_lru_locks mdc
2047 cancel_lru_locks osc
2049 echo "The file size should be incorrect since layout EA is lost"
2050 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2051 [ "$cur_size" != "$saved_size1" ] ||
2052 error "(1) Expect incorrect file1 size"
2054 if [ $MDSCOUNT -ge 2 ]; then
2055 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2056 [ "$cur_size" != "$saved_size1" ] ||
2057 error "(2) Expect incorrect file2 size"
2060 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2061 [ "$cur_size" != "$saved_size2" ] ||
2062 error "(1.2) Expect incorrect file3 size"
2064 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2065 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
2067 for k in $(seq $MDSCOUNT); do
2068 # The LFSCK status query internal is 30 seconds. For the case
2069 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2070 # time to guarantee the status sync up.
2071 wait_update_facet mds${k} "$LCTL get_param -n \
2072 mdd.$(facet_svc mds${k}).lfsck_layout |
2073 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2074 error "(4) MDS${k} is not the expected 'completed'"
2077 for k in $(seq $OSTCOUNT); do
2078 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2079 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2080 awk '/^status/ { print $2 }')
2081 [ "$cur_status" == "completed" ] ||
2082 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2085 local repaired=$(do_facet mds1 $LCTL get_param -n \
2086 mdd.$(facet_svc mds1).lfsck_layout |
2087 awk '/^repaired_orphan/ { print $2 }')
2088 [ $repaired -eq 3 ] ||
2089 error "(6.1) Expect 3 fixed on mds1, but got: $repaired"
2091 if [ $MDSCOUNT -ge 2 ]; then
2092 repaired=$(do_facet mds2 $LCTL get_param -n \
2093 mdd.$(facet_svc mds2).lfsck_layout |
2094 awk '/^repaired_orphan/ { print $2 }')
2095 [ $repaired -eq 2 ] ||
2096 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
2099 $LFS path2fid $DIR/$tdir/a1/f1
2100 $LFS getstripe $DIR/$tdir/a1/f1
2102 if [ $MDSCOUNT -ge 2 ]; then
2103 $LFS path2fid $DIR/$tdir/a2/f2
2104 $LFS getstripe $DIR/$tdir/a2/f2
2107 $LFS path2fid $DIR/$tdir/f3
2108 $LFS getstripe $DIR/$tdir/f3
2110 echo "The file size should be correct after layout LFSCK scanning"
2111 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2112 [ "$cur_size" == "$saved_size1" ] ||
2113 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2115 if [ $MDSCOUNT -ge 2 ]; then
2116 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2117 [ "$cur_size" == "$saved_size1" ] ||
2118 error "(8) Expect file2 size $saved_size1, but got $cur_size"
2121 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2122 [ "$cur_size" == "$saved_size2" ] ||
2123 error "(9) Expect file1 size $saved_size2, but got $cur_size"
2125 run_test 18a "Find out orphan OST-object and repair it (1)"
# Body of test 18b (registered by the run_test line that closes this block).
# Scenario: MDT-objects are removed under fail_loc 0x1616; the layout LFSCK
# is expected to re-create them as <fid>-R-0 stubs under
# .lustre/lost+found/MDTxxxx, which the test then moves back by hand.
# NOTE(review): the function header and the short closing lines
# ("fi", "done", "}") are not visible in this extract of the file.
2129 echo "The target MDT-object is lost. The LFSCK should re-create the"
2130 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
2131 echo "can move it back to normal namespace manually."
# a1/f1: single stripe, 2MB; remember its size and FID.
# "ls -il" prints the inode number first, so the size is awk field 6.
2134 check_mount_and_prep
2135 $LFS mkdir -i 0 $DIR/$tdir/a1
2136 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2137 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2138 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2139 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2141 $LFS getstripe $DIR/$tdir/a1/f1
# With at least two MDTs, add a 2-stripe file under a directory created
# with "-i 1"; its FID is kept in a local as well.
2143 if [ $MDSCOUNT -ge 2 ]; then
2144 $LFS mkdir -i 1 $DIR/$tdir/a2
2145 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2146 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2147 local fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
2149 $LFS getstripe $DIR/$tdir/a2/f2
# f3: composite (PFL) file with two components, [0, 1M) and [1M, EOF).
2152 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2153 error "(0) Fail to create PFL $DIR/$tdir/f3"
2155 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2157 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2158 local fid3=$($LFS path2fid $DIR/$tdir/f3)
2160 $LFS getstripe $DIR/$tdir/f3
2162 cancel_lru_locks osc
# The files are removed while fail_loc 0x1616 is set on the MDS.
# NOTE(review): no removal of f3 is visible in this extract although its
# stub is moved back from lost+found further down.
2164 echo "Inject failure, to simulate the case of missing the MDT-object"
2165 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2166 do_facet mds1 $LCTL set_param fail_loc=0x1616
2167 rm -f $DIR/$tdir/a1/f1
2169 if [ $MDSCOUNT -ge 2 ]; then
2170 do_facet mds2 $LCTL set_param fail_loc=0x1616
2171 rm -f $DIR/$tdir/a2/f2
# Clear the fault injection and drop cached locks.
2179 do_facet mds1 $LCTL set_param fail_loc=0
2180 if [ $MDSCOUNT -ge 2 ]; then
2181 do_facet mds2 $LCTL set_param fail_loc=0
2184 cancel_lru_locks mdc
2185 cancel_lru_locks osc
2187 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2188 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# Wait (up to $LTIME, defined outside this view) for every MDS to report
# status "completed".
2190 for k in $(seq $MDSCOUNT); do
2191 # The LFSCK status query interval is 30 seconds. For the case
2192 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2193 # time to guarantee the status sync up.
2194 wait_update_facet mds${k} "$LCTL get_param -n \
2195 mdd.$(facet_svc mds${k}).lfsck_layout |
2196 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2197 error "(2) MDS${k} is not the expected 'completed'"
# Every OST must report "completed" as well (checked once, no waiting).
2200 for k in $(seq $OSTCOUNT); do
2201 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2202 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2203 awk '/^status/ { print $2 }')
2204 [ "$cur_status" == "completed" ] ||
2205 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# repaired_orphan counters: 3 expected on mds1, 2 on mds2.
2208 local repaired=$(do_facet mds1 $LCTL get_param -n \
2209 mdd.$(facet_svc mds1).lfsck_layout |
2210 awk '/^repaired_orphan/ { print $2 }')
2211 [ $repaired -eq 3 ] ||
2212 error "(4.1) Expect 3 fixed on mds1, but got: $repaired"
2214 if [ $MDSCOUNT -ge 2 ]; then
2215 repaired=$(do_facet mds2 $LCTL get_param -n \
2216 mdd.$(facet_svc mds2).lfsck_layout |
2217 awk '/^repaired_orphan/ { print $2 }')
2218 [ $repaired -eq 2 ] ||
2219 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
# The re-created objects are looked up as "<fid>-R-0" and moved back to
# their original paths.
2222 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2223 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2224 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2226 if [ $MDSCOUNT -ge 2 ]; then
2227 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
2228 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
2231 mv $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0 $DIR/$tdir/f3 ||
2232 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
# Print FID and layout of the restored files for the test log.
2234 $LFS path2fid $DIR/$tdir/a1/f1
2235 $LFS getstripe $DIR/$tdir/a1/f1
2237 if [ $MDSCOUNT -ge 2 ]; then
2238 $LFS path2fid $DIR/$tdir/a2/f2
2239 $LFS getstripe $DIR/$tdir/a2/f2
2242 $LFS path2fid $DIR/$tdir/f3
2243 $LFS getstripe $DIR/$tdir/f3
# The sizes must match the values saved before the removal. f2 was written
# with the same dd parameters as f1, so saved_size1 is reused for it.
2245 echo "The file size should be correct after layout LFSCK scanning"
2246 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2247 [ "$cur_size" == "$saved_size1" ] ||
2248 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2250 if [ $MDSCOUNT -ge 2 ]; then
2251 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2252 [ "$cur_size" == "$saved_size1" ] ||
2253 error "(8) Expect file2 size $saved_size1, but got $cur_size"
2256 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2257 [ "$cur_size" == "$saved_size2" ] ||
2258 error "(9) Expect file3 size $saved_size2, but got $cur_size"
2260 run_test 18b "Find out orphan OST-object and repair it (2)"
# Body of test 18c (registered by the run_test line that closes this block).
# Scenario: files are written while the OSTs run with fail_loc 0x1617
# (missing parent FID), then the MDT-objects are removed under fail_loc
# 0x1616. The layout LFSCK is expected to re-create them with new FIDs as
# "*-N-*" stubs under .lustre/lost+found/MDT0000.
# NOTE(review): the function header and the short closing lines
# ("fi", "done", "}") are not visible in this extract of the file.
2264 echo "The target MDT-object is lost, and the OST-object FID is missing."
2265 echo "The LFSCK should re-create the MDT-object with new FID under the "
2266 echo "directory .lustre/lost+found/MDTxxxx."
2269 check_mount_and_prep
2270 $LFS mkdir -i 0 $DIR/$tdir/a1
2271 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
# The fail_loc is set on all OST nodes before any data is written.
2273 echo "Inject failure, to simulate the case of missing parent FID"
2274 #define OBD_FAIL_LFSCK_NOPFID 0x1617
2275 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1617
2277 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2278 $LFS getstripe $DIR/$tdir/a1/f1
# With at least two MDTs, add a file under a directory created with "-i 1".
2280 if [ $MDSCOUNT -ge 2 ]; then
2281 $LFS mkdir -i 1 $DIR/$tdir/a2
2282 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a2
2283 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2284 $LFS getstripe $DIR/$tdir/a2/f2
# f3: composite (PFL) file with two components, [0, 1M) and [1M, EOF).
2287 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2288 error "(0) Fail to create PFL $DIR/$tdir/f3"
2290 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2291 $LFS getstripe $DIR/$tdir/f3
# Clear the OST-side fault injection.
2293 cancel_lru_locks osc
2294 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
# The files are removed while fail_loc 0x1616 is set on the MDS.
2296 echo "Inject failure, to simulate the case of missing the MDT-object"
2297 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2298 do_facet mds1 $LCTL set_param fail_loc=0x1616
2299 rm -f $DIR/$tdir/a1/f1
2301 if [ $MDSCOUNT -ge 2 ]; then
2302 do_facet mds2 $LCTL set_param fail_loc=0x1616
2303 rm -f $DIR/$tdir/a2/f2
# Clear the MDS-side fault injection and drop cached locks.
2311 do_facet mds1 $LCTL set_param fail_loc=0
2312 if [ $MDSCOUNT -ge 2 ]; then
2313 do_facet mds2 $LCTL set_param fail_loc=0
2316 cancel_lru_locks mdc
2317 cancel_lru_locks osc
2319 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2320 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# Wait (up to $LTIME, defined outside this view) for every MDS to report
# status "completed".
2322 for k in $(seq $MDSCOUNT); do
2323 # The LFSCK status query interval is 30 seconds. For the case
2324 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2325 # time to guarantee the status sync up.
2326 wait_update_facet mds${k} "$LCTL get_param -n \
2327 mdd.$(facet_svc mds${k}).lfsck_layout |
2328 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2329 error "(2) MDS${k} is not the expected 'completed'"
# Every OST must report "completed" as well (checked once, no waiting).
2332 for k in $(seq $OSTCOUNT); do
2333 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2334 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2335 awk '/^status/ { print $2 }')
2336 [ "$cur_status" == "completed" ] ||
2337 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# NOTE(review): the lines that assign $expected (depending on MDSCOUNT)
# are not visible in this extract.
2340 if [ $MDSCOUNT -ge 2 ]; then
2346 local repaired=$(do_facet mds1 $LCTL get_param -n \
2347 mdd.$(facet_svc mds1).lfsck_layout |
2348 awk '/^repaired_orphan/ { print $2 }')
2349 [ $repaired -eq $expected ] ||
2350 error "(4) Expect $expected fixed on mds1, but got: $repaired"
# mds2 is expected to have repaired nothing.
2352 if [ $MDSCOUNT -ge 2 ]; then
2353 repaired=$(do_facet mds2 $LCTL get_param -n \
2354 mdd.$(facet_svc mds2).lfsck_layout |
2355 awk '/^repaired_orphan/ { print $2 }')
2356 [ $repaired -eq 0 ] ||
2357 error "(5) Expect 0 fixed on mds2, but got: $repaired"
2360 ls -ail $MOUNT/.lustre/lost+found/
# The -name pattern is quoted so that find, not the shell, performs the
# match; cname is kept local to the test.
2362 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
2363 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
2364 local cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-N-*')
2366 error "(6) .lustre/lost+found/MDT0001/ should be empty"
2369 echo "There should be some stub under .lustre/lost+found/MDT0000/"
2370 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2371 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
2373 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-N-*')
2374 [ ! -z "$cname" ] ||
2375 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2377 run_test 18c "Find out orphan OST-object and repair it (3)"
# Body of test 18d (registered by the run_test line that closes this block).
# Scenario: under fail_loc 0x1618 the layouts of f2/f4 are changed to point
# at the OST-objects of f1/f3, which are then removed. The original
# OST-objects of f2/f4 remain as orphans and the layout LFSCK is expected
# to put them back instead of creating new objects.
# NOTE(review): the function header and the short closing lines
# ("done", "}") are not visible in this extract of the file.
2381 echo "The target MDT-object layout EA is corrupted, but the right"
2382 echo "OST-object is still alive as orphan. The layout LFSCK will"
2383 echo "not create new OST-object to occupy such slot."
2386 check_mount_and_prep
# f1/f2/f3 use the directory default (1 stripe, OST index 0); f4 is a
# two-component PFL file.
2388 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2389 echo "guard" > $DIR/$tdir/a1/f1
2390 echo "foo" > $DIR/$tdir/a1/f2
2392 echo "guard" > $DIR/$tdir/a1/f3
2393 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2394 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2395 echo "foo" > $DIR/$tdir/a1/f4
# "ls -il" prints the inode number first, so the size is awk field 6.
2397 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2398 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
# Print FID and layout of all four files for the test log.
2399 $LFS path2fid $DIR/$tdir/a1/f1
2400 $LFS getstripe $DIR/$tdir/a1/f1
2401 $LFS path2fid $DIR/$tdir/a1/f2
2402 $LFS getstripe $DIR/$tdir/a1/f2
2403 $LFS path2fid $DIR/$tdir/a1/f3
2404 $LFS getstripe $DIR/$tdir/a1/f3
2405 $LFS path2fid $DIR/$tdir/a1/f4
2406 $LFS getstripe $DIR/$tdir/a1/f4
2407 cancel_lru_locks osc
2409 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2410 echo "to reference the same OST-object (which is f1's OST-obejct)."
2411 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2412 echo "dangling reference case, but f2's old OST-object is there."
2414 echo "The failure also makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2415 echo "to reference the same OST-object (which is f3's OST-obejct)."
2416 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2417 echo "dangling reference case, but f4's old OST-object is there."
# The chown calls on f2/f4 and the removal of f1/f3 happen while fail_loc
# 0x1618 is set on the MDS.
2420 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2421 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2422 chown 1.1 $DIR/$tdir/a1/f2
2423 chown 1.1 $DIR/$tdir/a1/f4
2424 rm -f $DIR/$tdir/a1/f1
2425 rm -f $DIR/$tdir/a1/f3
2428 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# NOTE(review): only the setupall half of the restart is visible in this
# extract.
2430 echo "stopall to cleanup object cache"
2433 setupall > /dev/null
# Here the LFSCK is additionally started with the -c and -d options.
2435 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2436 $START_LAYOUT -r -o -c -d || error "(2) Fail to start LFSCK for layout!"
# Wait (up to $LTIME, defined outside this view) for every MDS to report
# status "completed".
2438 for k in $(seq $MDSCOUNT); do
2439 # The LFSCK status query internal is 30 seconds. For the case
2440 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2441 # time to guarantee the status sync up.
2442 wait_update_facet mds${k} "$LCTL get_param -n \
2443 mdd.$(facet_svc mds${k}).lfsck_layout |
2444 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2445 error "(3) MDS${k} is not the expected 'completed'"
# Every OST must report "completed" as well (checked once, no waiting).
2448 for k in $(seq $OSTCOUNT); do
2449 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2450 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2451 awk '/^status/ { print $2 }')
2452 [ "$cur_status" == "completed" ] ||
2453 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
# Two orphans repaired, and no dangling reference repaired.
2456 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2457 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2458 awk '/^repaired_orphan/ { print $2 }')
2459 [ $repaired -eq 2 ] ||
2460 error "(5) Expect 2 orphans have been fixed, but got: $repaired"
2462 repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2463 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2464 awk '/^repaired_dangling/ { print $2 }')
2465 [ $repaired -eq 0 ] ||
2466 error "(6) Expect 0 dangling has been fixed, but got: $repaired"
# f2 and f4 must have their original sizes again.
2468 echo "The file size should be correct after layout LFSCK scanning"
2469 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2470 [ "$cur_size" == "$saved_size1" ] ||
2471 error "(7) Expect file2 size $saved_size1, but got $cur_size"
2473 cur_size=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2474 [ "$cur_size" == "$saved_size2" ] ||
2475 error "(8) Expect file4 size $saved_size2, but got $cur_size"
# Dump content, FID and layout to the test log; the content itself is not
# asserted here.
2477 echo "The LFSCK should find back the original data."
2478 cat $DIR/$tdir/a1/f2
2479 $LFS path2fid $DIR/$tdir/a1/f2
2480 $LFS getstripe $DIR/$tdir/a1/f2
2481 cat $DIR/$tdir/a1/f4
2482 $LFS path2fid $DIR/$tdir/a1/f4
2483 $LFS getstripe $DIR/$tdir/a1/f4
2485 run_test 18d "Find out orphan OST-object and repair it (4)"
# Body of test 18e (registered by the run_test line that closes this block).
# Scenario: same corruption as test 18d, but the LFSCK is delayed (fail_loc
# 0x1602) and f2/f4 are written while it is in scanning-phase2. The old
# orphan OST-objects are then expected to show up as "*-C-*" stub files
# under .lustre/lost+found/MDT0000 while f2/f4 keep the new data.
# NOTE(review): the function header and the short closing lines
# ("fi", "done", "}") are not visible in this extract of the file.
2489 echo "The target MDT-object layout EA slot is occpuied by some new"
2490 echo "created OST-object when repair dangling reference case. Such"
2491 echo "conflict OST-object has been modified by others. To keep the"
2492 echo "new data, the LFSCK will create a new file to refernece this"
2493 echo "old orphan OST-object."
2496 check_mount_and_prep
# f1/f2/f3 use the directory default (1 stripe, OST index 0); f4 is a
# two-component PFL file.
2498 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2499 echo "guard" > $DIR/$tdir/a1/f1
2500 echo "foo" > $DIR/$tdir/a1/f2
2502 echo "guard" > $DIR/$tdir/a1/f3
2503 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2504 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2505 echo "foo" > $DIR/$tdir/a1/f4
# "ls -il" prints the inode number first, so the size is awk field 6.
2507 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2508 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
# Print FID and layout of all four files for the test log.
2510 $LFS path2fid $DIR/$tdir/a1/f1
2511 $LFS getstripe $DIR/$tdir/a1/f1
2512 $LFS path2fid $DIR/$tdir/a1/f2
2513 $LFS getstripe $DIR/$tdir/a1/f2
2514 $LFS path2fid $DIR/$tdir/a1/f3
2515 $LFS getstripe $DIR/$tdir/a1/f3
2516 $LFS path2fid $DIR/$tdir/a1/f4
2517 $LFS getstripe $DIR/$tdir/a1/f4
2518 cancel_lru_locks osc
2520 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2521 echo "to reference the same OST-object (which is f1's OST-obejct)."
2522 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2523 echo "dangling reference case, but f2's old OST-object is there."
2525 echo "Also the failure makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2526 echo "to reference the same OST-object (which is f3's OST-obejct)."
2527 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2528 echo "dangling reference case, but f4's old OST-object is there."
# The chown calls on f2/f4 and the removal of f1/f3 happen while fail_loc
# 0x1618 is set on the MDS.
2531 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2532 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2533 chown 1.1 $DIR/$tdir/a1/f2
2534 chown 1.1 $DIR/$tdir/a1/f4
2535 rm -f $DIR/$tdir/a1/f1
2536 rm -f $DIR/$tdir/a1/f3
2539 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# NOTE(review): only the setupall half of the restart is visible in this
# extract.
2541 echo "stopall to cleanup object cache"
2544 setupall > /dev/null
# Delay the LFSCK (fail_val=10) so the test can write while it is running.
2546 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2547 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
2549 start_full_debug_logging
2551 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2552 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
# Wait (up to $LTIME, defined outside this view) until mds1 reports
# "scanning-phase2".
2554 wait_update_facet mds1 "$LCTL get_param -n \
2555 mdd.$(facet_svc mds1).lfsck_layout |
2556 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2557 error "(3) MDS1 is not the expected 'scanning-phase2'"
2559 # to guarantee all updates are synced.
# Append to f2/f4 while the LFSCK is still in its second phase.
2563 echo "Write new data to f2/f4 to modify the new created OST-object."
2564 echo "dummy" >> $DIR/$tdir/a1/f2 || error "write a1/f2 failed"
2565 echo "dummy" >> $DIR/$tdir/a1/f4 || error "write a1/f4 failed"
# Remove the delay so the LFSCK can finish.
2567 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
2569 for k in $(seq $MDSCOUNT); do
2570 # The LFSCK status query interval is 30 seconds. For the case
2571 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2572 # time to guarantee the status sync up.
2573 wait_update_facet mds${k} "$LCTL get_param -n \
2574 mdd.$(facet_svc mds${k}).lfsck_layout |
2575 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2576 error "(4) MDS${k} is not the expected 'completed'"
# Every OST must report "completed" as well (checked once, no waiting).
2579 for k in $(seq $OSTCOUNT); do
2580 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2581 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2582 awk '/^status/ { print $2 }')
2583 [ "$cur_status" == "completed" ] ||
2584 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2587 stop_full_debug_logging
2589 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2590 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2591 awk '/^repaired_orphan/ { print $2 }')
2592 [ $repaired -eq 2 ] ||
2593 error "(6) Expect 2 orphans have been fixed, but got: $repaired"
2595 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2596 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2597 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# Exactly two "*-C-*" stubs are expected; on mismatch they are listed
# before the test fails.
2599 local count=$(ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-* | wc -l)
2600 if [ $count -ne 2 ]; then
2601 ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-*
2602 error "(8) Expect 2 stubs under lost+found, but got $count"
# First stub (head -n 1): its size must equal one of the two saved sizes.
# The -name pattern is quoted so that find, not the shell, performs the
# match; cname is kept local to the test.
2605 echo "The stub file should keep the original f2 or f4 data"
2606 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-C-*' | head -n 1)
2607 local cur_size=$(ls -il $cname | awk '{ print $6 }')
2608 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2609 error "(9) Got unexpected $cur_size"
2612 $LFS path2fid $cname
2613 $LFS getstripe $cname
# Second stub (tail -n 1): same size check.
2615 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-C-*' | tail -n 1)
2616 cur_size=$(ls -il $cname | awk '{ print $6 }')
2617 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2618 error "(10) Got unexpected $cur_size"
2621 $LFS path2fid $cname
2622 $LFS getstripe $cname
# Dump content, FID and layout of f2/f4 to the test log; the content itself
# is not asserted here.
2624 echo "The f2/f4 should contains new data."
2625 cat $DIR/$tdir/a1/f2
2626 $LFS path2fid $DIR/$tdir/a1/f2
2627 $LFS getstripe $DIR/$tdir/a1/f2
2628 cat $DIR/$tdir/a1/f4
2629 $LFS path2fid $DIR/$tdir/a1/f4
2630 $LFS getstripe $DIR/$tdir/a1/f4
2632 run_test 18e "Find out orphan OST-object and repair it (5)"
# Body of test 18f (registered by the run_test line that closes this block).
# Scenario: MDT-objects are removed under fail_loc 0x1616, then the first
# LFSCK run is disturbed with fail_loc 0x161c so that it ends as "partial"
# on the MDSes and on ost1. Only part of the orphans is repaired in that
# run; a second, undisturbed run must complete and repair the rest.
# NOTE(review): the function header and the short closing lines
# ("fi", "done", "}") are not visible in this extract of the file.
# The test is skipped with fewer than two OSTs.
2635 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
2638 echo "The target MDT-object is lost. The LFSCK should re-create the"
2639 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2640 echo "to verify some OST-object(s) during the first stage-scanning,"
2641 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2642 echo "should not be affected."
# a1: 1-stripe files "guard" and f1; a2: 2-stripe file f2. Both directories
# are created with "-i 0".
2645 check_mount_and_prep
2646 $LFS mkdir -i 0 $DIR/$tdir/a1
2647 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2648 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2649 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2650 $LFS mkdir -i 0 $DIR/$tdir/a2
2651 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
2652 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2653 $LFS getstripe $DIR/$tdir/a1/f1
2654 $LFS getstripe $DIR/$tdir/a2/f2
# With at least two MDTs, build the same structure (a3/a4) under
# directories created with "-i 1".
2656 if [ $MDSCOUNT -ge 2 ]; then
2657 $LFS mkdir -i 1 $DIR/$tdir/a3
2658 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a3
2659 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2660 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2661 $LFS mkdir -i 1 $DIR/$tdir/a4
2662 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
2663 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2664 $LFS getstripe $DIR/$tdir/a3/f3
2665 $LFS getstripe $DIR/$tdir/a4/f4
2668 cancel_lru_locks osc
# The files are removed while fail_loc 0x1616 is set on the MDS.
2670 echo "Inject failure, to simulate the case of missing the MDT-object"
2671 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2672 do_facet mds1 $LCTL set_param fail_loc=0x1616
2673 rm -f $DIR/$tdir/a1/f1
2674 rm -f $DIR/$tdir/a2/f2
2676 if [ $MDSCOUNT -ge 2 ]; then
2677 do_facet mds2 $LCTL set_param fail_loc=0x1616
2678 rm -f $DIR/$tdir/a3/f3
2679 rm -f $DIR/$tdir/a4/f4
# Clear the fault injection and drop cached locks.
2685 do_facet mds1 $LCTL set_param fail_loc=0
2686 if [ $MDSCOUNT -ge 2 ]; then
2687 do_facet mds2 $LCTL set_param fail_loc=0
2690 cancel_lru_locks mdc
2691 cancel_lru_locks osc
# Second fault, set on mds1 only, for the duration of the first LFSCK run.
2693 echo "Inject failure, to simulate the OST0 fail to handle"
2694 echo "MDT0 LFSCK request during the first-stage scanning."
2695 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
2696 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
2698 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2699 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# First run: every MDS is expected to end as "partial".
2701 for k in $(seq $MDSCOUNT); do
2702 # The LFSCK status query interval is 30 seconds. For the case
2703 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2704 # time to guarantee the status sync up.
2705 wait_update_facet mds${k} "$LCTL get_param -n \
2706 mdd.$(facet_svc mds${k}).lfsck_layout |
2707 awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
2708 error "(2) MDS${k} is not the expected 'partial'"
# ost1 is expected to end as "partial", ost2 as "completed".
2711 wait_update_facet ost1 "$LCTL get_param -n \
2712 obdfilter.$(facet_svc ost1).lfsck_layout |
2713 awk '/^status/ { print \\\$2 }'" "partial" $LTIME || {
2714 error "(3) OST1 is not the expected 'partial'"
2717 wait_update_facet ost2 "$LCTL get_param -n \
2718 obdfilter.$(facet_svc ost2).lfsck_layout |
2719 awk '/^status/ { print \\\$2 }'" "completed" $LTIME || {
2720 error "(4) OST2 is not the expected 'completed'"
2723 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
# After the disturbed run only one orphan per MDS is expected as repaired.
2725 local repaired=$(do_facet mds1 $LCTL get_param -n \
2726 mdd.$(facet_svc mds1).lfsck_layout |
2727 awk '/^repaired_orphan/ { print $2 }')
2728 [ $repaired -eq 1 ] ||
2729 error "(5) Expect 1 fixed on mds{1}, but got: $repaired"
2731 if [ $MDSCOUNT -ge 2 ]; then
2732 repaired=$(do_facet mds2 $LCTL get_param -n \
2733 mdd.$(facet_svc mds2).lfsck_layout |
2734 awk '/^repaired_orphan/ { print $2 }')
2735 [ $repaired -eq 1 ] ||
2736 error "(6) Expect 1 fixed on mds{2}, but got: $repaired"
# Second run without fault injection: everything must complete.
2739 echo "Trigger layout LFSCK on all devices again to cleanup"
2740 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
2742 for k in $(seq $MDSCOUNT); do
2743 # The LFSCK status query interval is 30 seconds. For the case
2744 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2745 # time to guarantee the status sync up.
2746 wait_update_facet mds${k} "$LCTL get_param -n \
2747 mdd.$(facet_svc mds${k}).lfsck_layout |
2748 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2749 error "(8) MDS${k} is not the expected 'completed'"
# cur_status is declared local so it does not leak out of the test.
2752 for k in $(seq $OSTCOUNT); do
2753 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2754 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2755 awk '/^status/ { print $2 }')
2756 [ "$cur_status" == "completed" ] ||
2757 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
# The second run is expected to report two repaired orphans per MDS.
2761 local repaired=$(do_facet mds1 $LCTL get_param -n \
2762 mdd.$(facet_svc mds1).lfsck_layout |
2763 awk '/^repaired_orphan/ { print $2 }')
2764 [ $repaired -eq 2 ] ||
2765 error "(10) Expect 2 fixed on mds{1}, but got: $repaired"
2767 if [ $MDSCOUNT -ge 2 ]; then
2768 repaired=$(do_facet mds2 $LCTL get_param -n \
2769 mdd.$(facet_svc mds2).lfsck_layout |
2770 awk '/^repaired_orphan/ { print $2 }')
2771 [ $repaired -eq 2 ] ||
2772 error "(11) Expect 2 fixed on mds{2}, but got: $repaired"
2775 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# Body of test 18g (registered by the run_test line that closes this block).
# Scenario: a file striped over all OSTs is removed under fail_loc 0x162e
# (MDT-object lost, OI mapping kept); the layout LFSCK is expected to
# re-create it as "<fid>-R-0" under .lustre/lost+found/MDT0000.
# NOTE(review): the function header and the short closing lines
# ("done", "}") are not visible in this extract of the file.
2779 echo "The target MDT-object is lost, but related OI mapping is there"
2780 echo "The LFSCK should recreate the lost MDT-object without affected"
2781 echo "by the stale OI mapping."
# "-c -1" requests all available OSTs as stripes; one 1MB block is written
# per OST ($OSTCOUNT blocks in total).
2784 check_mount_and_prep
2785 $LFS mkdir -i 0 $DIR/$tdir/a1
2786 $LFS setstripe -c -1 -i 0 -S 1M $DIR/$tdir/a1
2787 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=$OSTCOUNT
2788 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2790 $LFS getstripe $DIR/$tdir/a1/f1
2791 cancel_lru_locks osc
# The file is removed while fail_loc 0x162e is set on mds1.
2793 echo "Inject failure to simulate lost MDT-object but keep OI mapping"
2794 #define OBD_FAIL_LFSCK_LOST_MDTOBJ2 0x162e
2795 do_facet mds1 $LCTL set_param fail_loc=0x162e
2796 rm -f $DIR/$tdir/a1/f1
# Clear the fault injection and drop cached locks.
2798 do_facet mds1 $LCTL set_param fail_loc=0
2799 cancel_lru_locks mdc
2800 cancel_lru_locks osc
2802 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2803 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# Wait (up to $LTIME, defined outside this view) for every MDS to report
# status "completed".
2805 for k in $(seq $MDSCOUNT); do
2806 # The LFSCK status query internal is 30 seconds. For the case
2807 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2808 # time to guarantee the status sync up.
2809 wait_update_facet mds${k} "$LCTL get_param -n \
2810 mdd.$(facet_svc mds${k}).lfsck_layout |
2811 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2812 error "(2) MDS${k} is not the expected 'completed'"
# Every OST must report "completed" as well (checked once, no waiting).
2815 for k in $(seq $OSTCOUNT); do
2816 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2817 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2818 awk '/^status/ { print $2 }')
2819 [ "$cur_status" == "completed" ] ||
2820 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# One repaired orphan is expected per OST.
2823 local repaired=$(do_facet mds1 $LCTL get_param -n \
2824 mdd.$(facet_svc mds1).lfsck_layout |
2825 awk '/^repaired_orphan/ { print $2 }')
2826 [ $repaired -eq $OSTCOUNT ] ||
2827 error "(4) Expect $OSTCOUNT fixed, but got: $repaired"
# Move the re-created object back to its original path and log its FID and
# layout.
2829 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2830 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2831 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2833 $LFS path2fid $DIR/$tdir/a1/f1
2834 $LFS getstripe $DIR/$tdir/a1/f1
2836 run_test 18g "Find out orphan OST-object and repair it (7)"
# Body of test 18h (registered by the run_test line near the end of this
# block). Scenario: the extent range of a PFL file is corrupted under
# fail_loc 0x162f; writes to the file must fail until the layout LFSCK has
# rebuilt the layout from the orphan OST-objects, without changing the data.
# NOTE(review): the function header and the short closing lines
# ("done", "}") are not visible in this extract of the file.
2840 echo "The PFL extent crashed. During the first cycle LFSCK scanning,"
2841 echo "the layout LFSCK will keep the bad PFL file(s) there without"
2842 echo "scanning its OST-object(s). Then in the second stage scanning,"
2843 echo "the OST will return related OST-object(s) to the MDT as orphan."
2844 echo "And then the LFSCK on the MDT can rebuild the PFL extent with"
2845 echo "the 'orphan(s)' stripe information."
2848 check_mount_and_prep
# f0: PFL file whose first component ends at 2M; the second has no end (-1).
2850 $LFS setstripe -E 2M -S 1M -c 1 -E -1 $DIR/$tdir/f0 ||
2851 error "(0) Fail to create PFL $DIR/$tdir/f0"
# Write test-framework.sh at offset 0 and again at offset 2M (seek=2 with
# bs=1M).
2853 cat $LUSTRE/tests/test-framework.sh > $DIR/$tdir/f0 ||
2854 error "(1.1) Fail to write $DIR/$tdir/f0"
2856 dd if=$LUSTRE/tests/test-framework.sh of=$DIR/$tdir/f0 bs=1M seek=2 ||
2857 error "(1.2) Fail to write $DIR/$tdir/f0"
# Reference copy used for the data comparison after the repair.
2859 cp $DIR/$tdir/f0 $DIR/$tdir/guard
# The chown is issued while fail_loc 0x162f is set on the MDS.
2861 echo "Inject failure stub to simulate bad PFL extent range"
2862 #define OBD_FAIL_LFSCK_BAD_PFL_RANGE 0x162f
2863 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162f
2865 chown 1.1 $DIR/$tdir/f0
2867 cancel_lru_locks mdc
2868 cancel_lru_locks osc
2869 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Inverted check: the test fails if this write succeeds.
2871 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 &&
2872 error "(2) Write to bad PFL file should fail"
2874 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
2875 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
# Wait (up to $LTIME, defined outside this view) for every MDS to report
# status "completed".
2877 for k in $(seq $MDSCOUNT); do
2878 # The LFSCK status query interval is 30 seconds. For the case
2879 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2880 # time to guarantee the status sync up.
2881 wait_update_facet mds${k} "$LCTL get_param -n \
2882 mdd.$(facet_svc mds${k}).lfsck_layout |
2883 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2884 error "(4.1) MDS${k} is not the expected 'completed'"
# cur_status is declared local so it does not leak out of the test.
2887 for k in $(seq $OSTCOUNT); do
2888 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2889 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2890 awk '/^status/ { print $2 }')
2891 [ "$cur_status" == "completed" ] ||
2892 error "(4.2) OST${k} Expect 'completed', but got '$cur_status'"
# $SHOW_LAYOUT is defined outside this view; two repaired orphans are
# expected.
2896 local repaired=$($SHOW_LAYOUT |
2897 awk '/^repaired_orphan/ { print $2 }')
2898 [ $repaired -eq 2 ] ||
2899 error "(5) Fail to repair crashed PFL range: $repaired"
# The repaired file must still be identical to the reference copy.
2901 echo "Data in $DIR/$tdir/f0 should not be broken"
2902 diff $DIR/$tdir/f0 $DIR/$tdir/guard ||
2903 error "(6) Data in $DIR/$tdir/f0 is broken"
2905 echo "Write should succeed after LFSCK repairing the bad PFL range"
2906 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 ||
2907 error "(7) Write should succeed after LFSCK"
2909 run_test 18h "LFSCK can repair crashed PFL extent range"
# Remove "cache" from the local Lustre debug setting again; the command
# output is discarded.
2911 $LCTL set_param debug=-cache > /dev/null
# Body of test 19a (registered by the run_test line that closes this block).
# Scenario: with lfsck_verify_pfid enabled on OST0000, reads issued while
# the client-side fail_loc 0x1619 is set must be rejected.
# NOTE(review): the function header and the closing "}" are not visible in
# this extract of the file.
2914 check_mount_and_prep
2915 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# Create the files with lfsck_verify_pfid set to 0 on OST0000.
2917 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2918 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
# a0 and a2 use the directory default layout; a1 is a two-component PFL
# file.
2920 echo "foo1" > $DIR/$tdir/a0
2921 $LFS setstripe -E 512K -S 512K -o 0 -E -1 -S 1M $DIR/$tdir/a1 ||
2922 error "(0) Fail to create PFL $DIR/$tdir/a1"
2923 echo "foo2" > $DIR/$tdir/a1
2924 echo "guard" > $DIR/$tdir/a2
2925 cancel_lru_locks osc
# Switch lfsck_verify_pfid to 1 before the reads.
2927 echo "Inject failure, then client will offer wrong parent FID when read"
2928 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2929 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
# This fail_loc is set with a plain $LCTL call, i.e. on the local node.
2931 #define OBD_FAIL_LFSCK_INVALID_PFID 0x1619
2932 $LCTL set_param fail_loc=0x1619
# Inverted checks: the test fails if either read succeeds.
2934 echo "Read RPC with wrong parent FID should be denied"
2935 cat $DIR/$tdir/a0 && error "(3.1) Read a0 should be denied!"
2936 cat $DIR/$tdir/a1 && error "(3.2) Read a1 should be denied!"
2937 $LCTL set_param fail_loc=0
2939 run_test 19a "OST-object inconsistency self detect"
# Body of test 19b (registered by the run_test line that closes this block).
# Scenario: files are created while the OSTs run with fail_loc 0x1611, so
# their OST-objects point back to a non-existent MDT-object. With
# lfsck_verify_pfid disabled nothing is repaired; after enabling it, reading
# both files is expected to raise the "repaired" counter to 2.
# NOTE(review): the function header and the closing "}" are not visible in
# this extract of the file.
2942 check_mount_and_prep
2943 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2945 echo "Inject failure stub to make the OST-object to back point to"
2946 echo "non-exist MDT-object"
2948 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2949 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
# f0 uses the directory default layout; f1 is a two-component PFL file.
# Both are written while the fail_loc is set on all OST nodes.
2951 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
2952 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
2953 echo "foo1" > $DIR/$tdir/f0
2954 $LFS setstripe -E 1M -S 1M -o 0 -E 4M -S 256K $DIR/$tdir/f1 ||
2955 error "(0) Fail to create PFL $DIR/$tdir/f1"
2956 echo "foo2" > $DIR/$tdir/f1
2957 cancel_lru_locks osc
2958 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
# Baseline: lfsck_verify_pfid is written as 0 and then read back; its
# "repaired" line must show 0.
2960 do_facet ost1 $LCTL set_param -n \
2961 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
2962 echo "Nothing should be fixed since self detect and repair is disabled"
2963 local repaired=$(do_facet ost1 $LCTL get_param -n \
2964 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2965 awk '/^repaired/ { print $2 }')
2966 [ $repaired -eq 0 ] ||
2967 error "(1) Expected 0 repaired, but got $repaired"
2969 echo "Read RPC with right parent FID should be accepted,"
2970 echo "and cause parent FID on OST to be fixed"
# Enable the verification, then read both files.
2972 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2973 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2975 cat $DIR/$tdir/f0 || error "(2.1) Read f0 should not be denied!"
2976 cat $DIR/$tdir/f1 || error "(2.2) Read f1 should not be denied!"
# Two files were read, so two repairs are expected; the error text reports
# the same number that is checked.
2978 repaired=$(do_facet ost1 $LCTL get_param -n \
2979 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2980 awk '/^repaired/ { print $2 }')
2981 [ $repaired -eq 2 ] ||
2982 error "(3) Expected 2 repaired, but got $repaired"
2984 run_test 19b "OST-object inconsistency self repair"
# Expected outputs of "$LFS getstripe -L" that the orphan tests below compare
# against: the value for a layout flagged with a LOV EA hole, and the value
# for a layout without one.
2986 PATTERN_WITH_HOLE="40000001"
2987 PATTERN_WITHOUT_HOLE="raid0"
# Test 20a (registered by the run_test line at the end of this block).
# Writes f0..f2 (and f3 when there are more than 2 OSTs) into a striped
# directory a1, removes them while fail_loc 0x1616 / 0x161a (with varying
# fail_val) is set on mds1, runs layout LFSCK, and then checks the files that
# show up as .lustre/lost+found/MDT0000/<fid>-R-0: layout pattern, stripe
# count, size, read/write behaviour, chown, touch and unlink.
# NOTE(review): the embedded line numbers have gaps, so several lines of this
# test are not visible here (function header, else/fi/done lines, and
# presumably the assignments of $bcount) - confirm against the full file.
2990 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
2993 echo "The target MDT-object and some of its OST-object are lost."
2994 echo "The LFSCK should find out the left OST-objects and re-create"
2995 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
2996 echo "with the partial OST-objects (LOV EA hole)."
2998 echo "New client can access the file with LOV EA hole via normal"
2999 echo "system tools or commands without crash the system."
3001 echo "For old client, even though it cannot access the file with"
3002 echo "LOV EA hole, it should not cause the system crash."
3005 check_mount_and_prep
3006 $LFS mkdir -i 0 $DIR/$tdir/a1
# Stripe a1 over 3 OSTs when more than 2 are available, otherwise over 2.
3007 if [ $OSTCOUNT -gt 2 ]; then
3008 $LFS setstripe -c 3 -i 0 -S 1M $DIR/$tdir/a1
3011 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a1
3015 # 256 blocks on the stripe0.
3016 # 1 block on the stripe1 for 2 OSTs case.
3017 # 256 blocks on the stripe1 for other cases.
3018 # 1 block on the stripe2 if OSTs > 2
3019 dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
3020 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
3021 dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount
# Remember the FIDs: the recovered files are looked up by "<fid>-R-0" later.
3023 local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
3024 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
3025 local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)
3028 $LFS getstripe $DIR/$tdir/a1/f0
3030 $LFS getstripe $DIR/$tdir/a1/f1
3032 $LFS getstripe $DIR/$tdir/a1/f2
3034 if [ $OSTCOUNT -gt 2 ]; then
3035 dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
3036 fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
3038 $LFS getstripe $DIR/$tdir/a1/f3
3041 cancel_lru_locks osc
3043 echo "Inject failure..."
3044 echo "To simulate f0 lost MDT-object"
3045 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
3046 do_facet mds1 $LCTL set_param fail_loc=0x1616
3047 rm -f $DIR/$tdir/a1/f0
3049 echo "To simulate f1 lost MDT-object and OST-object0"
3050 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
3051 do_facet mds1 $LCTL set_param fail_loc=0x161a
3052 rm -f $DIR/$tdir/a1/f1
3054 echo "To simulate f2 lost MDT-object and OST-object1"
3055 do_facet mds1 $LCTL set_param fail_val=1
3056 rm -f $DIR/$tdir/a1/f2
3058 if [ $OSTCOUNT -gt 2 ]; then
3059 echo "To simulate f3 lost MDT-object and OST-object2"
3060 do_facet mds1 $LCTL set_param fail_val=2
3061 rm -f $DIR/$tdir/a1/f3
3064 umount_client $MOUNT
# Clear the failure injection before LFSCK is started.
3067 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
3069 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3070 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
3072 for k in $(seq $MDSCOUNT); do
3073 # The LFSCK status query internal is 30 seconds. For the case
3074 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3075 # time to guarantee the status sync up.
3076 wait_update_facet mds${k} "$LCTL get_param -n \
3077 mdd.$(facet_svc mds${k}).lfsck_layout |
3078 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3079 error "(2) MDS${k} is not the expected 'completed'"
3082 for k in $(seq $OSTCOUNT); do
3083 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3084 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3085 awk '/^status/ { print $2 }')
3086 [ "$cur_status" == "completed" ] ||
3087 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# The expected repaired_orphan count on mds1 depends on the OST count:
# 9 with more than 2 OSTs, 4 otherwise.
3090 local repaired=$(do_facet mds1 $LCTL get_param -n \
3091 mdd.$(facet_svc mds1).lfsck_layout |
3092 awk '/^repaired_orphan/ { print $2 }')
3093 if [ $OSTCOUNT -gt 2 ]; then
3094 [ $repaired -eq 9 ] ||
3095 error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
3097 [ $repaired -eq 4 ] ||
3098 error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
3101 mount_client $MOUNT || error "(5.0) Fail to start client!"
3103 LOV_PATTERN_F_HOLE=0x40000000
3106 # ${fid0}-R-0 is the old f0
3108 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
3109 echo "Check $name, which is the old f0"
3111 $LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"
3113 local pattern=$($LFS getstripe -L $name)
3114 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3115 error "(5.2) NOT expect pattern flag hole, but got $pattern"
3117 local stripes=$($LFS getstripe -c $name)
3118 if [ $OSTCOUNT -gt 2 ]; then
3119 [ $stripes -eq 3 ] ||
3120 error "(5.3.1) expect the stripe count is 3, but got $stripes"
3122 [ $stripes -eq 2 ] ||
3123 error "(5.3.2) expect the stripe count is 2, but got $stripes"
3126 local size=$(stat $name | awk '/Size:/ { print $2 }')
3127 [ $size -eq $((4096 * $bcount)) ] ||
3128 error "(5.4) expect the size $((4096 * $bcount)), but got $size"
3130 cat $name > /dev/null || error "(5.5) cannot read $name"
3132 echo "dummy" >> $name || error "(5.6) cannot write $name"
3134 chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"
3136 touch $name || error "(5.8) cannot touch $name"
3138 rm -f $name || error "(5.9) cannot unlink $name"
3141 # ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
3143 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3144 if [ $OSTCOUNT -gt 2 ]; then
3145 echo "Check $name, it contains the old f1's stripe1 and stripe2"
3147 echo "Check $name, it contains the old f1's stripe1"
3150 $LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"
3152 pattern=$($LFS getstripe -L $name)
3153 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3154 error "(6.2) expect pattern flag hole, but got $pattern"
3156 stripes=$($LFS getstripe -c $name)
3157 if [ $OSTCOUNT -gt 2 ]; then
3158 [ $stripes -eq 3 ] ||
3159 error "(6.3.1) expect the stripe count is 3, but got $stripes"
3161 [ $stripes -eq 2 ] ||
3162 error "(6.3.2) expect the stripe count is 2, but got $stripes"
3165 size=$(stat $name | awk '/Size:/ { print $2 }')
3166 [ $size -eq $((4096 * $bcount)) ] ||
3167 error "(6.4) expect the size $((4096 * $bcount)), but got $size"
# A plain read must fail; dd with conv=sync,noerror keeps going past the
# errors so the number of "Input/output error" messages can be counted.
3169 cat $name > /dev/null && error "(6.5) normal read $name should fail"
3171 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3172 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3175 [ $failures -eq 256 ] ||
3176 error "(6.6) expect 256 IO failures, but get $failures"
3178 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3179 [ $size -eq $((4096 * $bcount)) ] ||
3180 error "(6.7) expect the size $((4096 * $bcount)), but got $size"
3182 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
3183 error "(6.8) write to the LOV EA hole should fail"
3185 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
3186 error "(6.9) write to normal stripe should NOT fail"
3188 echo "foo" >> $name && error "(6.10) append write $name should fail"
3190 chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"
3192 touch $name || error "(6.12) cannot touch $name"
3194 rm -f $name || error "(6.13) cannot unlink $name"
3197 # ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
3199 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
3200 if [ $OSTCOUNT -gt 2 ]; then
3201 echo "Check $name, it contains the old f2's stripe0 and stripe2"
3203 echo "Check $name, it contains the old f2's stripe0"
3206 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
3208 pattern=$($LFS getstripe -L $name)
3209 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3210 error "(7.2) expect pattern flag hole, but got $pattern"
3212 stripes=$($LFS getstripe -c $name)
3213 size=$(stat $name | awk '/Size:/ { print $2 }')
# The checks (7.x.1 / 7.6 / 7.7 / 7.8.x) below are the "more than 2 OSTs"
# variant; the (7.x.2) checks further down are the 2-OST variant
# (the else/fi lines separating them are not visible in this listing).
3214 if [ $OSTCOUNT -gt 2 ]; then
3215 [ $stripes -eq 3 ] ||
3216 error "(7.3.1) expect the stripe count is 3, but got $stripes"
3218 [ $size -eq $((4096 * $bcount)) ] ||
3219 error "(7.4.1) expect size $((4096 * $bcount)), but got $size"
3221 cat $name > /dev/null &&
3222 error "(7.5.1) normal read $name should fail"
3224 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3225 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3227 [ $failures -eq 256 ] ||
3228 error "(7.6) expect 256 IO failures, but get $failures"
3230 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3231 [ $size -eq $((4096 * $bcount)) ] ||
3232 error "(7.7) expect the size $((4096 * $bcount)), but got $size"
3234 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3235 seek=300 && error "(7.8.0) write to the LOV EA hole should fail"
3237 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
3238 error "(7.8.1) write to normal stripe should NOT fail"
3240 echo "foo" >> $name &&
3241 error "(7.8.3) append write $name should fail"
3243 chown $RUNAS_ID:$RUNAS_GID $name ||
3244 error "(7.9.1) cannot chown on $name"
3246 touch $name || error "(7.10.1) cannot touch $name"
3248 [ $stripes -eq 2 ] ||
3249 error "(7.3.2) expect the stripe count is 2, but got $stripes"
3252 [ $size -eq $((4096 * (256 + 0))) ] ||
3253 error "(7.4.2) expect the size $((4096 * 256)), but got $size"
3255 cat $name > /dev/null &&
3256 error "(7.5.2) normal read $name should fail"
3258 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3259 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3260 [ $failures -eq 256 ] ||
3261 error "(7.6.2) expect 256 IO failures, but get $failures"
3264 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3265 [ $size -eq $((4096 * $bcount)) ] ||
3266 error "(7.7.2) expect the size $((4096 * $bcount)), got $size"
3268 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3269 seek=256 && error "(7.8.2) write to the LOV EA hole should fail"
3271 chown $RUNAS_ID:$RUNAS_GID $name ||
3272 error "(7.9.2) cannot chown on $name"
3274 touch $name || error "(7.10.2) cannot touch $name"
3277 rm -f $name || error "(7.11) cannot unlink $name"
# f3 only exists when there are more than 2 OSTs; nothing more to check otherwise.
3279 [ $OSTCOUNT -le 2 ] && return
3282 # ${fid3}-R-0 should contains the old f3's stripe0 and stripe1
3284 name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
3285 echo "Check $name, which contains the old f3's stripe0 and stripe1"
3287 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
3289 pattern=$($LFS getstripe -L $name)
3290 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3291 error "(8.2) expect pattern flag hole, but got $pattern"
3293 stripes=$($LFS getstripe -c $name)
3294 [ $stripes -eq 3 ] ||
3295 error "(8.3) expect the stripe count is 3, but got $stripes"
3297 size=$(stat $name | awk '/Size:/ { print $2 }')
3299 [ $size -eq $((4096 * (256 + 256 + 0))) ] ||
3300 error "(8.4) expect the size $((4096 * 512)), but got $size"
3302 cat $name > /dev/null &&
3303 error "(8.5) normal read $name should fail"
3305 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3306 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3308 [ $failures -eq 256 ] ||
3309 error "(8.6) expect 256 IO failures, but get $failures"
3312 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3313 [ $size -eq $((4096 * $bcount)) ] ||
3314 error "(8.7) expect the size $((4096 * $bcount)), but got $size"
3316 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3317 seek=512 && error "(8.8) write to the LOV EA hole should fail"
3319 chown $RUNAS_ID:$RUNAS_GID $name ||
3320 error "(8.9) cannot chown on $name"
3322 touch $name || error "(8.10) cannot touch $name"
3324 rm -f $name || error "(8.11) cannot unlink $name"
3326 run_test 20a "Handle the orphan with dummy LOV EA slot properly"
# Test 20b (registered by the run_test line at the end of this block).
# PFL variant of the orphan test: three files with a two-component layout
# (-E 2M / -E -1, two stripes each) are written, failure injection
# 0x1616 / 0x161a is set on $SINGLEMDS, layout LFSCK is run, and the files
# found as .lustre/lost+found/MDT0000/<fid>-R-0 are checked per component
# (-I1 / -I2): pattern, stripe count, extent start/end, size, read/write
# behaviour, chown, touch and unlink.
# Error tags are "(section.check)"; each tag in this block is unique and the
# numbers quoted in a message match the condition that guards it.
# NOTE(review): the embedded line numbers have gaps, so several lines of this
# test are not visible here (function header, done lines, and whatever
# commands follow each fail_loc/fail_val setting) - confirm against the full
# file.
3329 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
3332 echo "The target MDT-object and some of its OST-object are lost."
3333 echo "The LFSCK should find out the left OST-objects and re-create"
3334 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
3335 echo "with the partial OST-objects (LOV EA hole)."
3337 echo "New client can access the file with LOV EA hole via normal"
3338 echo "system tools or commands without crash the system - PFL case."
3341 check_mount_and_prep
3343 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f0 ||
3344 error "(0) Fail to create PFL file $DIR/$tdir/f0"
3345 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f1 ||
3346 error "(1) Fail to create PFL file $DIR/$tdir/f1"
3347 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f2 ||
3348 error "(2) Fail to create PFL file $DIR/$tdir/f2"
# 769 blocks of 4096 bytes: 3 MiB plus one block.
3350 local bcount=$((256 * 3 + 1))
3352 dd if=/dev/zero of=$DIR/$tdir/f0 bs=4096 count=$bcount
3353 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4096 count=$bcount
3354 dd if=/dev/zero of=$DIR/$tdir/f2 bs=4096 count=$bcount
# Remember the FIDs: the recovered files are looked up by "<fid>-R-0" later.
3356 local fid0=$($LFS path2fid $DIR/$tdir/f0)
3357 local fid1=$($LFS path2fid $DIR/$tdir/f1)
3358 local fid2=$($LFS path2fid $DIR/$tdir/f2)
3361 $LFS getstripe $DIR/$tdir/f0
3363 $LFS getstripe $DIR/$tdir/f1
3365 $LFS getstripe $DIR/$tdir/f2
3367 cancel_lru_locks mdc
3368 cancel_lru_locks osc
3370 echo "Inject failure..."
3371 echo "To simulate f0 lost MDT-object"
3372 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
3373 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1616
3376 echo "To simulate the case of f1 lost MDT-object and "
3377 echo "the first OST-object in each PFL component"
3378 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
3379 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161a
3382 echo "To simulate the case of f2 lost MDT-object and "
3383 echo "the second OST-object in each PFL component"
3384 do_facet $SINGLEMDS $LCTL set_param fail_val=1
# Clear the failure injection before LFSCK is started.
3389 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
3391 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3392 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
3394 for k in $(seq $MDSCOUNT); do
3395 # The LFSCK status query internal is 30 seconds. For the case
3396 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3397 # time to guarantee the status sync up.
3398 wait_update_facet mds${k} "$LCTL get_param -n \
3399 mdd.$(facet_svc mds${k}).lfsck_layout |
3400 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3401 error "(4) MDS${k} is not the expected 'completed'"
3404 for k in $(seq $OSTCOUNT); do
3405 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3406 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3407 awk '/^status/ { print $2 }')
3408 [ "$cur_status" == "completed" ] ||
3409 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
3412 local repaired=$(do_facet mds1 $LCTL get_param -n \
3413 mdd.$(facet_svc mds1).lfsck_layout |
3414 awk '/^repaired_orphan/ { print $2 }')
3415 [ $repaired -eq 8 ] ||
3416 error "(6) Expect 8 fixed on mds1, but got: $repaired"
3419 # ${fid0}-R-0 is the old f0
3421 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
3422 echo "Check $name, which is the old f0"
3424 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
3426 local pattern=$($LFS getstripe -L -I1 $name)
3427 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3428 error "(7.2.1) NOT expect pattern flag hole, but got $pattern"
3430 pattern=$($LFS getstripe -L -I2 $name)
3431 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3432 error "(7.2.2) NOT expect pattern flag hole, but got $pattern"
3434 local stripes=$($LFS getstripe -c -I1 $name)
3435 [ $stripes -eq 2 ] ||
3436 error "(7.3.1) expect 2 stripes, but got $stripes"
3438 stripes=$($LFS getstripe -c -I2 $name)
3439 [ $stripes -eq 2 ] ||
3440 error "(7.3.2) expect 2 stripes, but got $stripes"
3442 local e_start=$($LFS getstripe -I1 $name |
3443 awk '/lcme_extent.e_start:/ { print $2 }')
3444 [ $e_start -eq 0 ] ||
3445 error "(7.4.1) expect the COMP1 start at 0, got $e_start"
3447 local e_end=$($LFS getstripe -I1 $name |
3448 awk '/lcme_extent.e_end:/ { print $2 }')
3449 [ $e_end -eq 2097152 ] ||
3450 error "(7.4.2) expect the COMP1 end at 2097152, got $e_end"
3452 e_start=$($LFS getstripe -I2 $name |
3453 awk '/lcme_extent.e_start:/ { print $2 }')
3454 [ $e_start -eq 2097152 ] ||
3455 error "(7.5.1) expect the COMP2 start at 2097152, got $e_start"
3457 e_end=$($LFS getstripe -I2 $name |
3458 awk '/lcme_extent.e_end:/ { print $2 }')
3459 [ "$e_end" = "EOF" ] ||
3460 error "(7.5.2) expect the COMP2 end at (EOF), got $e_end"
3462 local size=$(stat $name | awk '/Size:/ { print $2 }')
3463 [ $size -eq $((4096 * $bcount)) ] ||
3464 error "(7.6) expect the size $((4096 * $bcount)), but got $size"
3466 cat $name > /dev/null || error "(7.7) cannot read $name"
3468 echo "dummy" >> $name || error "(7.8) cannot write $name"
3470 chown $RUNAS_ID:$RUNAS_GID $name || error "(7.9) cannot chown on $name"
3472 touch $name || error "(7.10) cannot touch $name"
3474 rm -f $name || error "(7.11) cannot unlink $name"
3477 # ${fid1}-R-0 contains the old f1's second stripe in each COMP
3479 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3480 echo "Check $name, it contains f1's second OST-object in each COMP"
3482 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
3484 pattern=$($LFS getstripe -L -I1 $name)
3485 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3486 error "(8.2.1) expect pattern flag hole, but got $pattern"
3488 pattern=$($LFS getstripe -L -I2 $name)
3489 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3490 error "(8.2.2) expect pattern flag hole, but got $pattern"
3492 stripes=$($LFS getstripe -c -I1 $name)
3493 [ $stripes -eq 2 ] ||
3494 error "(8.3.1) expect 2 stripes, but got $stripes"
3496 stripes=$($LFS getstripe -c -I2 $name)
3497 [ $stripes -eq 2 ] ||
3498 error "(8.3.2) expect 2 stripes, but got $stripes"
3500 e_start=$($LFS getstripe -I1 $name |
3501 awk '/lcme_extent.e_start:/ { print $2 }')
3502 [ $e_start -eq 0 ] ||
3503 error "(8.4.1) expect the COMP1 start at 0, got $e_start"
3505 e_end=$($LFS getstripe -I1 $name |
3506 awk '/lcme_extent.e_end:/ { print $2 }')
3507 [ $e_end -eq 2097152 ] ||
3508 error "(8.4.2) expect the COMP1 end at 2097152, got $e_end"
3510 e_start=$($LFS getstripe -I2 $name |
3511 awk '/lcme_extent.e_start:/ { print $2 }')
3512 [ $e_start -eq 2097152 ] ||
3513 error "(8.5.1) expect the COMP2 start at 2097152, got $e_start"
3515 e_end=$($LFS getstripe -I2 $name |
3516 awk '/lcme_extent.e_end:/ { print $2 }')
3517 [ "$e_end" = "EOF" ] ||
3518 error "(8.5.2) expect the COMP2 end at (EOF), got $e_end"
3520 size=$(stat $name | awk '/Size:/ { print $2 }')
3521 [ $size -eq $((4096 * $bcount)) ] ||
3522 error "(8.6) expect the size $((4096 * $bcount)), but got $size"
# A plain read must fail; dd with conv=sync,noerror keeps going past the
# errors so the number of "Input/output error" messages can be counted.
3524 cat $name > /dev/null && error "(8.7) normal read $name should fail"
3526 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3527 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3529 # The first stripe in each COMP was lost
3530 [ $failures -eq 512 ] ||
3531 error "(8.8) expect 512 IO failures, but get $failures"
3533 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3534 [ $size -eq $((4096 * $bcount)) ] ||
3535 error "(8.9) expect the size $((4096 * $bcount)), but got $size"
3537 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
3538 error "(8.10) write to the LOV EA hole should fail"
3540 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
3541 error "(8.11) write to normal stripe should NOT fail"
3543 echo "foo" >> $name && error "(8.12) append write $name should fail"
3545 chown $RUNAS_ID:$RUNAS_GID $name || error "(8.13) cannot chown on $name"
3547 touch $name || error "(8.14) cannot touch $name"
3549 rm -f $name || error "(8.15) cannot unlink $name"
3552 # ${fid2}-R-0 contains the old f2's first stripe in each COMP
3554 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
3555 echo "Check $name, it contains f2's first stripe in each COMP"
3557 $LFS getstripe -v $name || error "(9.1) cannot getstripe on $name"
3559 pattern=$($LFS getstripe -L -I1 $name)
3560 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3561 error "(9.2.1) expect pattern flag hole, but got $pattern"
3563 pattern=$($LFS getstripe -L -I2 $name)
3564 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3565 error "(9.2.2) expect pattern flag hole, but got $pattern"
3567 stripes=$($LFS getstripe -c -I1 $name)
3568 [ $stripes -eq 2 ] ||
3569 error "(9.3.1) expect 2 stripes, but got $stripes"
3571 stripes=$($LFS getstripe -c -I2 $name)
3572 [ $stripes -eq 2 ] ||
3573 error "(9.3.2) expect 2 stripes, but got $stripes"
3575 e_start=$($LFS getstripe -I1 $name |
3576 awk '/lcme_extent.e_start:/ { print $2 }')
3577 [ $e_start -eq 0 ] ||
3578 error "(9.4.1) expect the COMP1 start at 0, got $e_start"
3580 e_end=$($LFS getstripe -I1 $name |
3581 awk '/lcme_extent.e_end:/ { print $2 }')
3582 [ $e_end -eq 2097152 ] ||
3583 error "(9.4.2) expect the COMP1 end at 2097152, got $e_end"
3585 e_start=$($LFS getstripe -I2 $name |
3586 awk '/lcme_extent.e_start:/ { print $2 }')
3587 [ $e_start -eq 2097152 ] ||
3588 error "(9.5.1) expect the COMP2 start at 2097152, got $e_start"
3590 e_end=$($LFS getstripe -I2 $name |
3591 awk '/lcme_extent.e_end:/ { print $2 }')
3592 [ "$e_end" = "EOF" ] ||
3593 error "(9.5.2) expect the COMP2 end at (EOF), got $e_end"
3595 size=$(stat $name | awk '/Size:/ { print $2 }')
3596 # The second stripe in COMP was lost, so we do not know there
3597 # have ever been some data before. 'stat' will regard it as
3598 # no data on the lost stripe.
3600 [ $size -eq $((4096 * $bcount)) ] ||
3601 error "(9.6) expect size $((4096 * $bcount)), but got $size"
3603 cat $name > /dev/null &&
3604 error "(9.7) normal read $name should fail"
3606 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3607 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3608 [ $failures -eq 512 ] ||
3609 error "(9.8) expect 512 IO failures, but get $failures"
3611 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3612 # The second stripe in COMP was lost, so we do not know there
3613 # have ever been some data before. Since 'dd' skip failure,
3614 # it will regard the lost stripe contains data.
3616 [ $size -eq $((4096 * $bcount)) ] ||
3617 error "(9.9) expect the size $((4096 * $bcount)), but got $size"
3619 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3620 seek=300 && error "(9.10) write to the LOV EA hole should fail"
3622 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
3623 error "(9.11) write to normal stripe should NOT fail"
3625 echo "foo" >> $name &&
3626 error "(9.12) append write $name should fail"
3628 chown $RUNAS_ID:$RUNAS_GID $name ||
3629 error "(9.13) cannot chown on $name"
3631 touch $name || error "(9.14) cannot touch $name"
3633 rm -f $name || error "(9.15) cannot unlink $name"
3635 run_test 20b "Handle the orphan with dummy LOV EA slot properly - PFL case"
# Test 21 (registered by the run_test line at the end of this block).
# Skipped when the MDS version is below 2.5.59. Creates 100 files, starts
# LFSCK on MDT0000 without a -t type argument (with -s 1 -r), expects both
# the namespace and the layout component to report 'scanning-phase1', then
# stops LFSCK again.
# NOTE(review): $SHOW_NAMESPACE / $SHOW_LAYOUT are defined outside this chunk;
# they are used here as commands whose output contains a "status" line.
# The embedded line numbers have gaps, so the function header and closing
# brace are not visible here.
3638 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.5.59) ]] &&
3639 skip "ignore the test if MDS is older than 2.5.59" && return
3641 check_mount_and_prep
3642 createmany -o $DIR/$tdir/f 100 || error "(0) Fail to create 100 files"
3644 echo "Start all LFSCK components by default (-s 1)"
3645 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -s 1 -r ||
3646 error "Fail to start LFSCK"
3648 echo "namespace LFSCK should be in 'scanning-phase1' status"
3649 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
3650 [ "$STATUS" == "scanning-phase1" ] ||
3651 error "Expect namespace 'scanning-phase1', but got '$STATUS'"
3653 echo "layout LFSCK should be in 'scanning-phase1' status"
3654 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
3655 [ "$STATUS" == "scanning-phase1" ] ||
3656 error "Expect layout 'scanning-phase1', but got '$STATUS'"
3658 echo "Stop all LFSCK components by default"
3659 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 ||
3660 error "Fail to stop LFSCK"
3662 run_test 21 "run all LFSCK components by default"
# Test 22a (registered by the run_test line at the end of this block).
# Needs at least 2 MDTs. Creates guard and foo on MDT1, creates foo/dummy on
# MDT0 while fail_loc=0x161e is set on $SINGLEMDS, removes guard, runs
# namespace LFSCK on all targets (-A -r) and expects exactly one
# unmatched_pairs_repaired and a working 'ls' on foo/dummy afterwards.
# The double quotes around ".." in the banner text are escaped so that they
# are printed instead of terminating the string.
# NOTE(review): the embedded line numbers have gaps, so the function header
# and closing brace are not visible here. Error tag (6) is not used in this
# block; 6 is passed to wait_all_targets_blocked instead.
3665 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3668 echo "The parent_A references the child directory via some name entry,"
3669 echo "but the child directory back references another parent_B via its"
3670 echo "\"..\" name entry. The parent_B does not exist. Then the namespace"
3671 echo "LFSCK will repair the child directory's \"..\" name entry."
3674 check_mount_and_prep
3676 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
3677 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
3679 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
3680 echo "The dummy's dotdot name entry references the guard."
3681 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
3682 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
3683 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
3684 error "(3) Fail to mkdir on MDT0"
# The injection is only needed while foo/dummy is created.
3685 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3687 rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"
3689 echo "Trigger namespace LFSCK to repair unmatched pairs"
3690 $START_NAMESPACE -A -r ||
3691 error "(5) Fail to start LFSCK for namespace"
3693 wait_all_targets_blocked namespace completed 6
3695 local repaired=$($SHOW_NAMESPACE |
3696 awk '/^unmatched_pairs_repaired/ { print $2 }')
3697 [ $repaired -eq 1 ] ||
3698 error "(7) Fail to repair unmatched pairs: $repaired"
3700 echo "'ls' should success after namespace LFSCK repairing"
3701 ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
3702 error "(8) ls should success."
3704 run_test 22a "LFSCK can repair unmatched pairs (1)"
# Test 22b (registered by the run_test line at the end of this block).
# Needs at least 2 MDTs. Creates guard and foo on MDT1, creates foo/dummy on
# MDT0 while fail_loc=0x161e is set on $SINGLEMDS, verifies that fid2path on
# dummy's FID does not yield its real path, runs namespace LFSCK on all
# targets (-A -r), expects exactly one unmatched_pairs_repaired and then
# expects fid2path to return $DIR/$tdir/foo/dummy.
# The double quotes around ".." in the banner text are escaped so that they
# are printed instead of terminating the string.
# NOTE(review): the embedded line numbers have gaps, so the function header
# and closing brace are not visible here. Error tag (6) is not used in this
# block; 6 is passed to wait_all_targets_blocked instead.
3707 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3710 echo "The parent_A references the child directory via the name entry_B,"
3711 echo "but the child directory back references another parent_C via its"
3712 echo "\"..\" name entry. The parent_C exists, but there is no the name"
3713 echo "entry_B under the parent_C. Then the namespace LFSCK will repair"
3714 echo "the child directory's \"..\" name entry and its linkEA."
3717 check_mount_and_prep
3719 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
3720 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
3722 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
3723 echo "and bad linkEA. The dummy's dotdot name entry references the"
3724 echo "guard. The dummy's linkEA references n non-exist name entry."
3725 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
3726 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
3727 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
3728 error "(3) Fail to mkdir on MDT0"
# The injection is only needed while foo/dummy is created.
3729 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3731 local dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
3732 echo "fid2path should NOT work on the dummy's FID $dummyfid"
3733 local dummyname=$($LFS fid2path $DIR $dummyfid)
3734 [ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
3735 error "(4) fid2path works unexpectedly."
3737 echo "Trigger namespace LFSCK to repair unmatched pairs"
3738 $START_NAMESPACE -A -r ||
3739 error "(5) Fail to start LFSCK for namespace"
3741 wait_all_targets_blocked namespace completed 6
3743 local repaired=$($SHOW_NAMESPACE |
3744 awk '/^unmatched_pairs_repaired/ { print $2 }')
3745 [ $repaired -eq 1 ] ||
3746 error "(7) Fail to repair unmatched pairs: $repaired"
3748 echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
# dummyname is already declared local above; only re-assign it here.
3749 dummyname=$($LFS fid2path $DIR $dummyfid)
3750 [ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
3751 error "(8) fid2path does not work"
3753 run_test 22b "LFSCK can repair unmatched pairs (2)"
# Test 23a (registered by the run_test line at the end of this block).
# Needs at least 2 MDTs. Creates d0 on MDT0 and d0/d1 on MDT1, removes d1
# while fail_loc=0x1620 is set on mds2, and expects 'ls' on d0/d1 to fail.
# A first namespace LFSCK run (-A -r) must report one dangling_repaired while
# 'ls' still fails; a second run with the additional -C option must again
# report one dangling_repaired, after which 'ls' must succeed.
# NOTE(review): the embedded line numbers have gaps, so the function header
# and closing brace are not visible here. Error tags (6) and (10) are not
# used in this block; 6 and 10 are passed to wait_all_targets_blocked instead.
3756 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3759 echo "The name entry is there, but the MDT-object for such name "
3760 echo "entry does not exist. The namespace LFSCK should find out "
3761 echo "and repair the inconsistency as required."
3764 check_mount_and_prep
3766 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3767 $LFS mkdir -i 1 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT1"
3769 echo "Inject failure stub on MDT1 to simulate dangling name entry"
3770 #define OBD_FAIL_LFSCK_DANGLING2 0x1620
3771 do_facet mds2 $LCTL set_param fail_loc=0x1620
3772 rmdir $DIR/$tdir/d0/d1 || error "(3) Fail to rmdir d1"
3773 do_facet mds2 $LCTL set_param fail_loc=0
3775 echo "'ls' should fail because of dangling name entry"
# A successful ls is the failure case here.
3776 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(4) ls should fail."
3778 echo "Trigger namespace LFSCK to find out dangling name entry"
3779 $START_NAMESPACE -A -r ||
3780 error "(5) Fail to start LFSCK for namespace"
3782 wait_all_targets_blocked namespace completed 6
3784 local repaired=$($SHOW_NAMESPACE |
3785 awk '/^dangling_repaired/ { print $2 }')
3786 [ $repaired -eq 1 ] ||
3787 error "(7) Fail to repair dangling name entry: $repaired"
3789 echo "'ls' should fail because not re-create MDT-object by default"
3790 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(8) ls should fail."
3792 echo "Trigger namespace LFSCK again to repair dangling name entry"
# Second run adds -C; per the messages above and below, this is the run
# after which the missing MDT-object is expected to exist again.
3793 $START_NAMESPACE -A -r -C ||
3794 error "(9) Fail to start LFSCK for namespace"
3796 wait_all_targets_blocked namespace completed 10
3798 repaired=$($SHOW_NAMESPACE |
3799 awk '/^dangling_repaired/ { print $2 }')
3800 [ $repaired -eq 1 ] ||
3801 error "(11) Fail to repair dangling name entry: $repaired"
3803 echo "'ls' should success after namespace LFSCK repairing"
3804 ls -ail $DIR/$tdir/d0/d1 > /dev/null || error "(12) ls should success."
3806 run_test 23a "LFSCK can repair dangling name entry (1)"
# Test 23b (registered by the run_test line at the end of this block).
# Creates d0 with ten filler files plus f0 ("dummy") and f1 ("dead"), makes
# sure f0 and f1 share a FID sequence, hard-links f0 as foo while
# fail_loc=0x1621 and fail_val=<OID of f1> are set on $SINGLEMDS, then removes
# the filler files and f1. 'ls' on foo must fail; after namespace LFSCK
# (-r -C) one dangling_repaired and one multiple_linked_repaired are expected
# and foo must contain "dummy".
# NOTE(review): the embedded line numbers have gaps, so some lines of this
# test (function header, fi, closing braces, and a line inside the
# "|| { ... }" group after wait_update_facet) are not visible here.
3810 echo "The objectA has multiple hard links, one of them corresponding"
3811 echo "to the name entry_B. But there is something wrong for the name"
3812 echo "entry_B and cause entry_B to references non-exist object_C."
3813 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3814 echo "as dangling, and re-create the lost object_C. When the LFSCK"
3815 echo "comes to the second-stage scanning, it will find that the"
3816 echo "former re-creating object_C is not proper, and will try to"
3817 echo "replace the object_C with the real object_A."
3820 check_mount_and_prep
3822 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3823 $LFS path2fid $DIR/$tdir/d0
3825 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
3827 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3828 $LFS path2fid $DIR/$tdir/d0/f0
3830 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
3831 $LFS path2fid $DIR/$tdir/d0/f1
# The sequence is the first ':'-separated field of the path2fid output.
3833 local SEQ0=$($LFS path2fid $DIR/$tdir/d0/f0 | awk -F':' '{print $1}')
3834 local SEQ1=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $1}')
3836 if [ "$SEQ0" != "$SEQ1" ]; then
3837 # To guarantee that the f0 and f1 are in the same FID seq
3838 rm -f $DIR/$tdir/d0/f0 ||
3839 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
3840 echo "dummy" > $DIR/$tdir/d0/f0 ||
3841 error "(3.2) Fail to touch on MDT0"
3842 $LFS path2fid $DIR/$tdir/d0/f0
# The OID is the second ':'-separated field; printf %d rewrites it as a
# decimal number before it is passed as fail_val.
3845 local OID=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $2}')
3846 OID=$(printf %d $OID)
3848 echo "Inject failure stub on MDT0 to simulate dangling name entry"
3849 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
3850 do_facet $SINGLEMDS $LCTL set_param fail_val=$OID fail_loc=0x1621
3851 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
3852 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3854 # If there is creation after the dangling injection, it may re-use
3855 # the just released local object (inode) that is referenced by the
3856 # dangling name entry. It will fail the dangling injection.
3857 # So before deleting the target object for the dangling name entry,
3858 # remove some other objects to avoid the target object being reused
3859 # by some potential creations. LU-7429
3860 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
3862 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
3864 echo "'ls' should fail because of dangling name entry"
3865 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3866 error "(6) ls should fail."
3868 echo "Trigger namespace LFSCK to find out dangling name entry"
3869 $START_NAMESPACE -r -C ||
3870 error "(7) Fail to start LFSCK for namespace"
3872 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3873 mdd.${MDT_DEV}.lfsck_namespace |
3874 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3876 error "(8) unexpected status"
3879 local repaired=$($SHOW_NAMESPACE |
3880 awk '/^dangling_repaired/ { print $2 }')
3881 [ $repaired -eq 1 ] ||
3882 error "(9) Fail to repair dangling name entry: $repaired"
3884 repaired=$($SHOW_NAMESPACE |
3885 awk '/^multiple_linked_repaired/ { print $2 }')
3886 [ $repaired -eq 1 ] ||
3887 error "(10) Fail to drop the former created object: $repaired"
# foo was linked to f0, so it must read back f0's content.
3889 local data=$(cat $DIR/$tdir/d0/foo)
3890 [ "$data" == "dummy" ] ||
3891 error "(11) The $DIR/$tdir/d0/foo is not recovered: $data"
3893 run_test 23b "LFSCK can repair dangling name entry (2)"
3896 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3897 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3898 mdd.${MDT_DEV}.lfsck_namespace |
3899 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3901 error "(10) unexpected status"
3904 stop_full_debug_logging
3909 echo "The objectA has multiple hard links, one of them corresponding"
3910 echo "to the name entry_B. But there is something wrong for the name"
3911 echo "entry_B and cause entry_B to references non-exist object_C."
3912 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3913 echo "as dangling, and re-create the lost object_C. And then others"
3914 echo "modified the re-created object_C. When the LFSCK comes to the"
3915 echo "second-stage scanning, it will find that the former re-creating"
3916 echo "object_C maybe wrong and try to replace the object_C with the"
3917 echo "real object_A. But because object_C has been modified, so the"
3918 echo "LFSCK cannot replace it."
3921 start_full_debug_logging
3923 check_mount_and_prep
3925 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3926 parent_fid="$($LFS path2fid $DIR/$tdir/d0)"
3927 echo "parent_fid=$parent_fid"
3929 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
3931 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3932 f0_fid="$($LFS path2fid $DIR/$tdir/d0/f0)"
3933 echo "f0_fid=$f0_fid"
3935 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
3936 f1_fid="$($LFS path2fid $DIR/$tdir/d0/f1)"
3937 echo "f1_fid=$f1_fid"
3939 if [ "${fid_f0/:.*/}" != "${fid_f1/:.*/}" ]; then
3940 # To guarantee that the f0 and f1 are in the same FID seq
3941 rm -f $DIR/$tdir/d0/f0 ||
3942 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
3943 echo "dummy" > $DIR/$tdir/d0/f0 ||
3944 error "(3.2) Fail to touch on MDT0"
3945 f0_fid="$($LFS path2fid $DIR/$tdir/d0/f0)"
3946 echo "f0_fid=$f0_fid (replaced)"
3949 local oid=$(awk -F':' '{ printf $2 }' <<< $f1_fid)
3951 echo "Inject failure stub on MDT0 to simulate dangling name entry"
3952 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
3953 do_facet $SINGLEMDS $LCTL set_param fail_val=$oid fail_loc=0x1621
3954 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
3955 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3957 # If there is creation after the dangling injection, it may re-use
3958 # the just released local object (inode) that is referenced by the
3959 # dangling name entry. It will fail the dangling injection.
3960 # So before deleting the target object for the dangling name entry,
3961 # remove some other objects to avoid the target object being reused
3962 # by some potential creations. LU-7429
3963 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
3965 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
3967 echo "'ls' should fail because of dangling name entry"
3968 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3969 error "(6) ls should fail."
3971 #define OBD_FAIL_LFSCK_DELAY3 0x1602
3972 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
3974 echo "Trigger namespace LFSCK to find out dangling name entry"
3975 $START_NAMESPACE -r -C ||
3976 error "(7) Fail to start LFSCK for namespace"
3978 wait_update_facet client "stat -c%s $DIR/$tdir/d0/foo" "0" $LTIME || {
3979 # While unexpected by the test, it is valid for LFSCK to repair
3980 # the link to the original object before any data is written.
3981 local size=$(stat -c %s $DIR/$tdir/d0/foo)
3983 if [ "$size" = "6" -a "$(<$DIR/$tdir/d0/foo)" = "dummy" ]; then
3984 log "LFSCK repaired file prematurely"
3989 stat $DIR/$tdir/d0/foo
3991 error "(8) unexpected size"
3994 echo "data" >> $DIR/$tdir/d0/foo || error "(9) Fail to write"
3995 cancel_lru_locks osc
3999 local repaired=$($SHOW_NAMESPACE |
4000 awk '/^dangling_repaired/ { print $2 }')
4001 [ $repaired -eq 1 ] ||
4002 error "(11) Fail to repair dangling name entry: $repaired"
4004 local data=$(cat $DIR/$tdir/d0/foo)
4005 [ "$data" != "dummy" ] ||
4006 error "(12) The $DIR/$tdir/d0/foo should not be recovered"
4008 run_test 23c "LFSCK can repair dangling name entry (3)"
4011 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4014 echo "Two MDT-objects back reference the same name entry via their"
4015 echo "each own linkEA entry, but the name entry only references one"
4016 echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
4017 echo "for the MDT-object that is not recognized. If such MDT-object"
4018 echo "has no other linkEA entry after the removing, then the LFSCK"
4019 echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."
4022 check_mount_and_prep
4024 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
# Sub-numbered label so this failure is not confused with the d0 mkdir.
mkdir $DIR/$tdir/d0/guard || error "(1.1) Fail to mkdir guard"
4027 $LFS path2fid $DIR/$tdir/d0/guard
4029 mkdir $DIR/$tdir/d0/dummy || error "(2) Fail to mkdir dummy"
4030 $LFS path2fid $DIR/$tdir/d0/dummy
4033 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
4034 pfid=$($LFS path2fid $DIR/$tdir/d0/guard)
4036 pfid=$($LFS path2fid $DIR/$tdir/d0/dummy)
4039 touch $DIR/$tdir/d0/guard/foo ||
4040 error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"
4042 echo "Inject failure stub on MDT0 to simulate the case that"
4043 echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
4044 echo "that references $DIR/$tdir/d0/guard/foo."
4045 echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
4046 echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
4047 echo "there with the same linkEA entry as another MDT-object"
4048 echo "$DIR/$tdir/d0/guard/foo has"
4050 #define OBD_FAIL_LFSCK_MUL_REF 0x1622
4051 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
4052 $LFS mkdir -i 0 $DIR/$tdir/d0/dummy/foo ||
4053 error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
4054 $LFS path2fid $DIR/$tdir/d0/dummy/foo
4055 local cfid=$($LFS path2fid $DIR/$tdir/d0/dummy/foo)
4056 rmdir $DIR/$tdir/d0/dummy/foo ||
4057 error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
4058 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4060 echo "stat $DIR/$tdir/d0/dummy/foo should fail"
4061 stat $DIR/$tdir/d0/dummy/foo > /dev/null 2>&1 &&
4062 error "(6) stat successfully unexpectedly"
4064 echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
4065 $START_NAMESPACE -A -r ||
4066 error "(7) Fail to start LFSCK for namespace"
4068 wait_all_targets_blocked namespace completed 8
4070 local repaired=$($SHOW_NAMESPACE |
4071 awk '/^multiple_referenced_repaired/ { print $2 }')
4072 [ $repaired -eq 1 ] ||
4073 error "(9) Fail to repair multiple referenced name entry: $repaired"
4075 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
4076 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4077 error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4079 local cname="$cfid-$pfid-D-0"
4080 ls -ail $MOUNT/.lustre/lost+found/MDT0000/$cname ||
4081 error "(11) .lustre/lost+found/MDT0000/ should not be empty"
4083 run_test 24 "LFSCK can repair multiple-referenced name entry"
4086 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
4087 skip "ldiskfs only test" && return
4090 echo "The file type in the name entry does not match the file type"
4091 echo "claimed by the referenced object. Then the LFSCK will update"
4092 echo "the file type in the name entry."
4095 check_mount_and_prep
4097 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4099 echo "Inject failure stub on MDT0 to simulate the case that"
4100 echo "the file type stored in the name entry is wrong."
4102 #define OBD_FAIL_LFSCK_BAD_TYPE 0x1623
4103 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
4104 touch $DIR/$tdir/d0/foo || error "(2) Fail to touch $DIR/$tdir/d0/foo"
4105 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4107 echo "Trigger namespace LFSCK to repair bad file type in the name entry"
4108 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace"
4110 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4111 mdd.${MDT_DEV}.lfsck_namespace |
4112 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4114 error "(4) unexpected status"
4117 local repaired=$($SHOW_NAMESPACE |
4118 awk '/^bad_file_type_repaired/ { print $2 }')
4119 [ $repaired -eq 1 ] ||
4120 error "(5) Fail to repair bad file type in name entry: $repaired"
4122 ls -ail $DIR/$tdir/d0 || error "(6) Fail to 'ls' the $DIR/$tdir/d0"
4124 run_test 25 "LFSCK can repair bad file type in the name entry"
4128 echo "The local name entry back referenced by the MDT-object is lost."
4129 echo "The namespace LFSCK will add the missing local name entry back"
4130 echo "to the normal namespace."
4133 check_mount_and_prep
4135 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4136 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4137 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
4139 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
4140 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4142 echo "Inject failure stub on MDT0 to simulate the case that"
4143 echo "foo's name entry will be removed, but the foo's object"
4144 echo "and its linkEA are kept in the system."
4146 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4147 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4148 rm -f $DIR/$tdir/d0/foo || error "(4) Fail to unlink $DIR/$tdir/d0/foo"
4149 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4151 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4152 error "(5) 'ls' should fail"
# This test removes a local name entry (foo under d0, both created on MDT0).
echo "Trigger namespace LFSCK to repair the missing local name entry"
4155 $START_NAMESPACE -r -A ||
4156 error "(6) Fail to start LFSCK for namespace"
4158 wait_all_targets_blocked namespace completed 7
4160 local repaired=$($SHOW_NAMESPACE |
4161 awk '/^lost_dirent_repaired/ { print $2 }')
4162 [ $repaired -eq 1 ] ||
4163 error "(8) Fail to repair lost dirent: $repaired"
4165 ls -ail $DIR/$tdir/d0/foo ||
4166 error "(9) Fail to 'ls' $DIR/$tdir/d0/foo"
4168 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
4169 [ "$foofid" == "$foofid2" ] ||
4170 error "(10) foo's FID changed: $foofid, $foofid2"
4172 run_test 26a "LFSCK can add the missing local name entry back to the namespace"
4175 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4178 echo "The remote name entry back referenced by the MDT-object is lost."
4179 echo "The namespace LFSCK will add the missing remote name entry back"
4180 echo "to the normal namespace."
4183 check_mount_and_prep
4185 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4186 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
4187 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
4189 echo "Inject failure stub on MDT0 to simulate the case that"
4190 echo "foo's name entry will be removed, but the foo's object"
4191 echo "and its linkEA are kept in the system."
4193 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4194 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4195 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
4196 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4198 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4199 error "(4) 'ls' should fail"
4201 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4202 $START_NAMESPACE -r -A ||
4203 error "(5) Fail to start LFSCK for namespace"
4205 wait_all_targets_blocked namespace completed 6
4207 local repaired=$($SHOW_NAMESPACE |
4208 awk '/^lost_dirent_repaired/ { print $2 }')
4209 [ $repaired -eq 1 ] ||
4210 error "(7) Fail to repair lost dirent: $repaired"
4212 ls -ail $DIR/$tdir/d0/foo ||
4213 error "(8) Fail to 'ls' $DIR/$tdir/d0/foo"
4215 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
4216 [ "$foofid" == "$foofid2" ] ||
4217 error "(9) foo's FID changed: $foofid, $foofid2"
4219 run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
4223 echo "The local parent referenced by the MDT-object linkEA is lost."
4224 echo "The namespace LFSCK will re-create the lost parent as orphan."
4227 check_mount_and_prep
4229 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4230 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4231 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
4232 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4234 echo "Inject failure stub on MDT0 to simulate the case that"
4235 echo "foo's name entry will be removed, but the foo's object"
4236 echo "and its linkEA are kept in the system. And then remove"
4237 echo "another hard link and the parent directory."
4239 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4240 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4241 rm -f $DIR/$tdir/d0/foo ||
4242 error "(4) Fail to unlink $DIR/$tdir/d0/foo"
4243 rm -f $DIR/$tdir/d0/dummy ||
4244 error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
4245 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Sub-numbered step labels keep every failure message in this test unique.
rm -rf $DIR/$tdir/d0 || error "(5.1) Fail to unlink the dir d0"
ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(5.2) 'ls' should fail"
4250 echo "Trigger namespace LFSCK to repair the lost parent"
4251 $START_NAMESPACE -r -A ||
4252 error "(6) Fail to start LFSCK for namespace"
4254 wait_all_targets_blocked namespace completed 7
4256 local repaired=$($SHOW_NAMESPACE |
4257 awk '/^lost_dirent_repaired/ { print $2 }')
4258 [ $repaired -eq 1 ] ||
4259 error "(8) Fail to repair lost dirent: $repaired"
4261 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
4262 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4263 error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4265 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# Quote the pattern so it reaches find intact; unquoted, the shell would
# expand it against any matching file in the current directory.
local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-P-*')
[ -n "$cname" ] ||
	error "(10) .lustre/lost+found/MDT0000/ should not be empty"
4271 run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
4274 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4277 echo "The remote parent referenced by the MDT-object linkEA is lost."
4278 echo "The namespace LFSCK will re-create the lost parent as orphan."
4281 check_mount_and_prep
4283 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4284 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
4286 $LFS path2fid $DIR/$tdir/d0
4288 echo "Inject failure stub on MDT0 to simulate the case that"
4289 echo "foo's name entry will be removed, but the foo's object"
4290 echo "and its linkEA are kept in the system. And then remove"
4291 echo "the parent directory."
4293 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4294 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4295 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
4296 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4298 rmdir $DIR/$tdir/d0 || error "(4) Fail to unlink the dir d0"
4299 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(5) 'ls' should fail"
# This test removes the remote parent d0, not just a name entry.
echo "Trigger namespace LFSCK to repair the lost remote parent"
4302 $START_NAMESPACE -r -A ||
4303 error "(6) Fail to start LFSCK for namespace"
4305 wait_all_targets_blocked namespace completed 7
4307 local repaired=$($SHOW_NAMESPACE |
4308 awk '/^lost_dirent_repaired/ { print $2 }')
4309 [ $repaired -eq 1 ] ||
4310 error "(8) Fail to repair lost dirent: $repaired"
4312 ls -ail $MOUNT/.lustre/lost+found/
4314 echo "There should be an orphan under .lustre/lost+found/MDT0001/"
4315 [ -d $MOUNT/.lustre/lost+found/MDT0001 ] ||
4316 error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"
4318 ls -ail $MOUNT/.lustre/lost+found/MDT0001/
# Quote the pattern so it reaches find intact; unquoted, the shell would
# expand it against any matching file in the current directory.
local cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-P-*')
[ -n "$cname" ] ||
	error "(10) .lustre/lost+found/MDT0001/ should not be empty"
4324 run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
4327 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4330 echo "The target name entry is lost. The LFSCK should insert the"
4331 echo "orphan MDT-object under .lustre/lost+found/MDTxxxx. But if"
4332 echo "the MDT (on which the orphan MDT-object resides) has ever"
4333 echo "failed to respond some name entry verification during the"
4334 echo "first stage-scanning, then the LFSCK should skip to handle"
4335 echo "orphan MDT-object on this MDT. But other MDTs should not"
4339 check_mount_and_prep
# Build two directory trees whose parents and children sit on different
# MDTs. Fail immediately if any mkdir fails, so that the later rmdir and
# LFSCK steps do not report misleading errors.
$LFS mkdir -i 0 $DIR/$tdir/d1 || error "(0.1) Fail to mkdir d1"
$LFS mkdir -i 1 $DIR/$tdir/d1/a1 || error "(0.2) Fail to mkdir d1/a1"
$LFS mkdir -i 1 $DIR/$tdir/d1/a2 || error "(0.3) Fail to mkdir d1/a2"

$LFS mkdir -i 1 $DIR/$tdir/d2 || error "(0.4) Fail to mkdir d2"
$LFS mkdir -i 0 $DIR/$tdir/d2/a1 || error "(0.5) Fail to mkdir d2/a1"
$LFS mkdir -i 0 $DIR/$tdir/d2/a2 || error "(0.6) Fail to mkdir d2/a2"
4348 echo "Inject failure stub on MDT0 to simulate the case that"
4349 echo "d1/a1's name entry will be removed, but the d1/a1's object"
4350 echo "and its linkEA are kept in the system. And the case that"
4351 echo "d2/a2's name entry will be removed, but the d2/a2's object"
4352 echo "and its linkEA are kept in the system."
4354 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4355 do_facet mds1 $LCTL set_param fail_loc=0x1624
4356 do_facet mds2 $LCTL set_param fail_loc=0x1624
4357 rmdir $DIR/$tdir/d1/a1 || error "(1) Fail to rmdir $DIR/$tdir/d1/a1"
4358 rmdir $DIR/$tdir/d2/a2 || error "(2) Fail to rmdir $DIR/$tdir/d2/a2"
4359 do_facet mds1 $LCTL set_param fail_loc=0
4360 do_facet mds2 $LCTL set_param fail_loc=0
4362 cancel_lru_locks mdc
4363 cancel_lru_locks osc
4365 echo "Inject failure, to simulate the MDT0 fail to handle"
4366 echo "MDT1 LFSCK request during the first-stage scanning."
4367 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
4368 do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0
4370 echo "Trigger namespace LFSCK on all devices to find out orphan object"
4371 $START_NAMESPACE -r -A ||
4372 error "(3) Fail to start LFSCK for namespace"
4374 wait_update_facet mds1 "$LCTL get_param -n \
4375 mdd.$(facet_svc mds1).lfsck_namespace |
4376 awk '/^status/ { print \\\$2 }'" "partial" 32 || {
4377 error "(4) mds1 is not the expected 'partial'"
4380 wait_update_facet mds2 "$LCTL get_param -n \
4381 mdd.$(facet_svc mds2).lfsck_namespace |
4382 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4383 error "(5) mds2 is not the expected 'completed'"
4386 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
4388 local repaired=$(do_facet mds1 $LCTL get_param -n \
4389 mdd.$(facet_svc mds1).lfsck_namespace |
4390 awk '/^lost_dirent_repaired/ { print $2 }')
4391 [ $repaired -eq 0 ] ||
4392 error "(6) Expect 0 fixed on mds1, but got: $repaired"
4394 repaired=$(do_facet mds2 $LCTL get_param -n \
4395 mdd.$(facet_svc mds2).lfsck_namespace |
4396 awk '/^lost_dirent_repaired/ { print $2 }')
4397 [ $repaired -eq 1 ] ||
4398 error "(7) Expect 1 fixed on mds2, but got: $repaired"
4400 echo "Trigger namespace LFSCK on all devices again to cleanup"
4401 $START_NAMESPACE -r -A ||
4402 error "(8) Fail to start LFSCK for namespace"
4404 wait_all_targets_blocked namespace completed 9
4406 local repaired=$(do_facet mds1 $LCTL get_param -n \
4407 mdd.$(facet_svc mds1).lfsck_namespace |
4408 awk '/^lost_dirent_repaired/ { print $2 }')
4409 [ $repaired -eq 1 ] ||
4410 error "(10) Expect 1 fixed on mds1, but got: $repaired"
4412 repaired=$(do_facet mds2 $LCTL get_param -n \
4413 mdd.$(facet_svc mds2).lfsck_namespace |
4414 awk '/^lost_dirent_repaired/ { print $2 }')
4415 [ $repaired -eq 0 ] ||
4416 error "(11) Expect 0 fixed on mds2, but got: $repaired"
4418 run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
4422 echo "The object's nlink attribute is larger than the object's known"
4423 echo "name entries count. The LFSCK will repair the object's nlink"
4424 echo "attribute to match the known name entries count"
4427 check_mount_and_prep
4429 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4430 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4432 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4433 echo "nlink attribute is larger than its name entries count."
4435 #define OBD_FAIL_LFSCK_MORE_NLINK 0x1625
4436 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
4437 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4438 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4439 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4441 cancel_lru_locks mdc
4442 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
4443 [ $count -eq 3 ] || error "(4) Cannot inject error: $count"
4445 echo "Trigger namespace LFSCK to repair the nlink count"
4446 $START_NAMESPACE -r -A ||
4447 error "(5) Fail to start LFSCK for namespace"
4449 wait_all_targets_blocked namespace completed 6
4451 local repaired=$($SHOW_NAMESPACE |
4452 awk '/^nlinks_repaired/ { print $2 }')
4453 [ $repaired -eq 1 ] ||
4454 error "(7) Fail to repair nlink count: $repaired"
4456 cancel_lru_locks mdc
4457 count=$(stat --format=%h $DIR/$tdir/d0/foo)
4458 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
# Test 29a is disabled: LFSCK only updates nlink when the number of known
# linkEA entries is larger than the nlink count.
4463 #run_test 29a "LFSCK can repair bad nlink count (1)"
4467 echo "The object's nlink attribute is smaller than the object's known"
4468 echo "name entries count. The LFSCK will repair the object's nlink"
4469 echo "attribute to match the known name entries count"
4472 check_mount_and_prep
4474 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4475 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4477 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4478 echo "nlink attribute is smaller than its name entries count."
4480 #define OBD_FAIL_LFSCK_LESS_NLINK 0x1626
4481 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
4482 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4483 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4484 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4486 cancel_lru_locks mdc
4487 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
4488 [ $count -eq 1 ] || error "(4) Cannot inject error: $count"
4490 echo "Trigger namespace LFSCK to repair the nlink count"
4491 $START_NAMESPACE -r -A ||
4492 error "(5) Fail to start LFSCK for namespace"
4494 wait_all_targets_blocked namespace completed 6
4496 local repaired=$($SHOW_NAMESPACE |
4497 awk '/^nlinks_repaired/ { print $2 }')
4498 [ $repaired -eq 1 ] ||
4499 error "(7) Fail to repair nlink count: $repaired"
4501 cancel_lru_locks mdc
4502 count=$(stat --format=%h $DIR/$tdir/d0/foo)
4503 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
4505 run_test 29b "LFSCK can repair bad nlink count (2)"
4510 echo "The namespace LFSCK will create many hard links to the target"
4511 echo "file as to exceed the linkEA size limitation. Under such case"
4512 echo "the linkEA will be marked as overflow that will prevent the"
4513 echo "target file to be migrated. Then remove some hard links to"
4514 echo "make the left hard links to be held within the linkEA size"
4515 echo "limitation. But before the namespace LFSCK adding all the"
4516 echo "missed linkEA entries back, the overflow mark (timestamp)"
4517 echo "will not be cleared."
4520 check_mount_and_prep
4522 mkdir -p $DIR/$tdir/guard || error "(0.1) Fail to mkdir"
4523 $LFS mkdir -i $((MDSCOUNT - 1)) $DIR/$tdir/foo ||
4524 error "(0.2) Fail to mkdir"
4525 touch $DIR/$tdir/guard/f0 || error "(1) Fail to create"
4526 local oldfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4528 # define MAX_LINKEA_SIZE 4096
4529 # sizeof(link_ea_header) = 24
4530 # sizeof(link_ea_entry) = 18
4531 # nlink_min=$(((MAX_LINKEA_SIZE - sizeof(link_ea_header)) /
4532 # (sizeof(link_ea_entry) + name_length))
4533 # If the average name length is 12 bytes, then 150 hard links
4534 # is totally enough to overflow the linkEA
4535 echo "Create 150 hard links should succeed although the linkEA overflow"
4536 createmany -l $DIR/$tdir/guard/f0 $DIR/$tdir/foo/ttttttttttt 150 ||
4537 error "(2) Fail to hard link"
4539 cancel_lru_locks mdc
4540 if [ $MDSCOUNT -ge 2 ]; then
4541 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null &&
4542 error "(3.1) Migrate should fail"
4544 echo "The object with linkEA overflow should NOT be migrated"
4545 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4546 [ "$newfid" == "$oldfid" ] ||
4547 error "(3.2) Migrate should fail: $newfid != $oldfid"
4550 # Remove 100 hard links, then the linkEA should have space
4551 # to hold the missed linkEA entries.
4552 echo "Remove 100 hard links to save space for the missed linkEA entries"
4553 unlinkmany $DIR/$tdir/foo/ttttttttttt 100 || error "(4) Fail to unlink"
4555 if [ $MDSCOUNT -ge 2 ]; then
4556 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null &&
4557 error "(5.1) Migrate should fail"
4559 # The overflow timestamp is still there, so migration will fail.
4560 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4561 [ "$newfid" == "$oldfid" ] ||
4562 error "(5.2) Migrate should fail: $newfid != $oldfid"
4565 # sleep 3 seconds to guarantee that the overflow is recognized
4568 echo "Trigger namespace LFSCK to clear the overflow timestamp"
4569 $START_NAMESPACE -r -A ||
4570 error "(6) Fail to start LFSCK for namespace"
4572 wait_all_targets_blocked namespace completed 7
4574 local repaired=$($SHOW_NAMESPACE |
4575 awk '/^linkea_overflow_cleared/ { print $2 }')
4576 [ $repaired -eq 1 ] ||
4577 error "(8) Fail to clear linkea overflow: $repaired"
4579 repaired=$($SHOW_NAMESPACE |
4580 awk '/^nlinks_repaired/ { print $2 }')
4581 [ $repaired -eq 0 ] ||
4582 error "(9) Unexpected nlink repaired: $repaired"
4584 if [ $MDSCOUNT -ge 2 ]; then
4585 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
4586 error "(10.1) Migrate failure"
4588 # Migration should succeed after clear the overflow timestamp.
4589 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4590 [ "$newfid" != "$oldfid" ] ||
4591 error "(10.2) Migrate should succeed"
4593 ls -l $DIR/$tdir/foo > /dev/null ||
4594 error "(11) 'ls' failed after migration"
4597 rm -f $DIR/$tdir/guard/f0 || error "(12) Fail to unlink f0"
4598 rm -rf $DIR/$tdir/foo || error "(13) Fail to rmdir foo"
4600 run_test 29c "verify linkEA size limitation"
4603 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
4604 skip "ldiskfs only test" && return
4607 echo "The namespace LFSCK will move the orphans from backend"
4608 echo "/lost+found directory to normal client visible namespace"
# The globally visible directory is .lustre/lost+found under the mount point.
echo "or to global visible .lustre/lost+found/MDTxxxx/ directory"
4612 check_mount_and_prep
4614 $LFS mkdir -i 0 $DIR/$tdir/foo || error "(1) Fail to mkdir foo"
# The message names the file actually being created (f0).
touch $DIR/$tdir/foo/f0 || error "(2) Fail to touch f0"
4617 echo "Inject failure stub on MDT0 to simulate the case that"
4618 echo "directory d0 has no linkEA entry, then the LFSCK will"
4619 echo "move it into .lustre/lost+found/MDTxxxx/ later."
4621 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
4622 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
4623 mkdir $DIR/$tdir/foo/d0 || error "(3) Fail to mkdir d0"
4624 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4626 local pfid=$($LFS path2fid $DIR/$tdir/foo)
4627 local cfid=$($LFS path2fid $DIR/$tdir/foo/d0)
4629 touch $DIR/$tdir/foo/d0/f1 || error "(4) Fail to touch f1"
4630 mkdir $DIR/$tdir/foo/d0/d1 || error "(5) Fail to mkdir d1"
4632 echo "Inject failure stub on MDT0 to simulate the case that the"
4633 echo "object's name entry will be removed, but not destroy the"
4634 echo "object. Then backend e2fsck will handle it as orphan and"
4635 echo "add them into the backend /lost+found directory."
4637 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4638 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4639 rmdir $DIR/$tdir/foo/d0/d1 || error "(6) Fail to rmdir d1"
4640 rm -f $DIR/$tdir/foo/d0/f1 || error "(7) Fail to unlink f1"
4641 rmdir $DIR/$tdir/foo/d0 || error "(8) Fail to rmdir d0"
4642 rm -f $DIR/$tdir/foo/f0 || error "(9) Fail to unlink f0"
4643 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4645 umount_client $MOUNT || error "(10) Fail to stop client!"
4647 stop $SINGLEMDS || error "(11) Fail to stop MDT0"
4650 run_e2fsck $(facet_host $SINGLEMDS) $MDT_DEVNAME "-y" ||
4651 error "(12) Fail to run e2fsck"
4653 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
4654 error "(13) Fail to start MDT0"
4656 echo "Trigger namespace LFSCK to recover backend orphans"
4657 $START_NAMESPACE -r -A ||
4658 error "(14) Fail to start LFSCK for namespace"
4660 wait_all_targets_blocked namespace completed 15
4662 local repaired=$($SHOW_NAMESPACE |
4663 awk '/^local_lost_found_moved/ { print $2 }')
4664 [ $repaired -ge 4 ] ||
4665 error "(16) Fail to recover backend orphans: $repaired"
4667 mount_client $MOUNT || error "(17) Fail to start client!"
4669 stat $DIR/$tdir/foo/f0 || error "(18) f0 is not recovered"
4671 ls -ail $MOUNT/.lustre/lost+found/
4673 echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
4674 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4675 error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4677 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# The expected orphan path is built from the child and parent FIDs, so the
# string is never empty. Check that the directory really exists instead.
local cname=$MOUNT/.lustre/lost+found/MDT0000/${cfid}-${pfid}-D-0
[ -d "$cname" ] || error "(20) d0 is not recovered"
4682 stat ${cname}/d1 || error "(21) d1 is not recovered"
4683 stat ${cname}/f1 || error "(22) f1 is not recovered"
4685 run_test 30 "LFSCK can recover the orphans from backend /lost+found"
# test_31a: a name entry that was inserted into the wrong shard of a striped
# directory (name hash mismatch, injected with fail_val=0) must be repaired by
# the namespace LFSCK. Skipped when fewer than two MDTs are configured.
4688 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4691 echo "For the name entry under a striped directory, if the name"
4692 echo "hash does not match the shard, then the LFSCK will repair"
4693 echo "the bad name entry"
4696 check_mount_and_prep
# The striped directory is created before the fault is armed, so the injection
# only affects the entries created under it below.
4698 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4699 error "(1) Fail to create striped directory"
4701 echo "Inject failure stub on client to simulate the case that"
4702 echo "some name entry should be inserted into other non-first"
4703 echo "shard, but inserted into the first shard by wrong"
# The fault is armed with a plain $LCTL (no do_facet), matching the "on client"
# wording above, and is cleared again right after the entries are created.
4705 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
4706 $LCTL set_param fail_loc=0x1628 fail_val=0
4707 createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
4708 error "(2) Fail to create file under striped directory"
4709 $LCTL set_param fail_loc=0 fail_val=0
4711 echo "Trigger namespace LFSCK to repair bad name hash"
4712 $START_NAMESPACE -r -A ||
4713 error "(3) Fail to start LFSCK for namespace"
# NOTE(review): no error step "(4)" appears in this test; presumably the
# trailing 4 is the step id wait_all_targets_blocked reports on failure.
4715 wait_all_targets_blocked namespace completed 4
# Extract the name_hash_repaired counter from the $SHOW_NAMESPACE output; at
# least one repair must have been recorded.
4717 local repaired=$($SHOW_NAMESPACE |
4718 awk '/^name_hash_repaired/ { print $2 }')
4719 [ $repaired -ge 1 ] ||
4720 error "(5) Fail to repair bad name hash: $repaired"
# Remount the client before verifying -- presumably to drop client-side cached
# state (TODO confirm).
4722 umount_client $MOUNT || error "(6) umount failed"
4723 mount_client $MOUNT || error "(7) mount failed"
# Every entry d0..d<MDSCOUNT-1> must be stat-able and removable after repair.
# NOTE(review): the loop terminator is not visible in this excerpt.
4725 for ((i = 0; i < $MDSCOUNT; i++)); do
4726 stat $DIR/$tdir/striped_dir/d$i ||
4727 error "(8) Fail to stat d$i after LFSCK"
4728 rmdir $DIR/$tdir/striped_dir/d$i ||
4729 error "(9) Fail to unlink d$i after LFSCK"
# Finally the (now empty) striped directory itself must be removable.
4732 rmdir $DIR/$tdir/striped_dir ||
4733 error "(10) Fail to remove the striped directory after LFSCK"
4735 run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
# test_31b: same scenario as the bad-name-hash case with fail_val=0, but the
# fault is injected with fail_val=1 and the repair counter is read from mds2.
# Skipped when fewer than two MDTs are configured.
4738 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4741 echo "For the name entry under a striped directory, if the name"
4742 echo "hash does not match the shard, then the LFSCK will repair"
4743 echo "the bad name entry"
4746 check_mount_and_prep
# Create the striped directory first; the fault is armed only afterwards.
4748 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4749 error "(1) Fail to create striped directory"
4751 echo "Inject failure stub on client to simulate the case that"
4752 echo "some name entry should be inserted into other non-second"
4753 echo "shard, but inserted into the secod shard by wrong"
# fail_val=1 is the only difference from the fail_val=0 variant of the
# injection; per the echo text above it selects the second shard.
4755 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
4756 $LCTL set_param fail_loc=0x1628 fail_val=1
4757 createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
4758 error "(2) Fail to create file under striped directory"
4759 $LCTL set_param fail_loc=0 fail_val=0
4761 echo "Trigger namespace LFSCK to repair bad name hash"
4762 $START_NAMESPACE -r -A ||
4763 error "(3) Fail to start LFSCK for namespace"
# NOTE(review): no error step "(4)" appears in this test; presumably the
# trailing 4 is the step id wait_all_targets_blocked reports on failure.
4765 wait_all_targets_blocked namespace completed 4
# The counter is queried on mds2 (not through $SHOW_NAMESPACE): the
# lfsck_namespace parameter of mds2's service is read and the
# name_hash_repaired field extracted.
4767 local repaired=$(do_facet mds2 $LCTL get_param -n \
4768 mdd.$(facet_svc mds2).lfsck_namespace |
4769 awk '/^name_hash_repaired/ { print $2 }')
4770 [ $repaired -ge 1 ] ||
4771 error "(5) Fail to repair bad name hash: $repaired"
# Remount the client before verifying -- presumably to drop client-side cached
# state (TODO confirm).
4773 umount_client $MOUNT || error "(6) umount failed"
4774 mount_client $MOUNT || error "(7) mount failed"
# Every entry d0..d<MDSCOUNT-1> must be stat-able and removable after repair.
# NOTE(review): the loop terminator is not visible in this excerpt.
4776 for ((i = 0; i < $MDSCOUNT; i++)); do
4777 stat $DIR/$tdir/striped_dir/d$i ||
4778 error "(8) Fail to stat d$i after LFSCK"
4779 rmdir $DIR/$tdir/striped_dir/d$i ||
4780 error "(9) Fail to unlink d$i after LFSCK"
# Finally the (now empty) striped directory itself must be removable.
4783 rmdir $DIR/$tdir/striped_dir ||
4784 error "(10) Fail to remove the striped directory after LFSCK"
4786 run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
# test_31c: a striped directory whose master object lost its LMV EA, and which
# was not modified afterwards, must get the master LMV EA re-generated by the
# namespace LFSCK. Skipped when fewer than two MDTs are configured.
4789 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4792 echo "For some reason, the master MDT-object of the striped directory"
4793 echo "may lost its master LMV EA. If nobody created files under the"
4794 echo "master directly after the master LMV EA lost, then the LFSCK"
4795 echo "should re-generate the master LMV EA."
4798 check_mount_and_prep
4800 echo "Inject failure stub on MDT0 to simulate the case that the"
4801 echo "master MDT-object of the striped directory lost the LMV EA."
# The fault is armed on $SINGLEMDS only for the duration of the setdirstripe
# call and cleared immediately afterwards.
4803 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
4804 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
4805 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4806 error "(1) Fail to create striped directory"
4807 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4809 echo "Trigger namespace LFSCK to re-generate master LMV EA"
4810 $START_NAMESPACE -r -A ||
4811 error "(2) Fail to start LFSCK for namespace"
# NOTE(review): no error step "(3)" appears in this test; presumably the
# trailing 3 is the step id wait_all_targets_blocked reports on failure.
4813 wait_all_targets_blocked namespace completed 3
# Exactly one striped directory is expected to have been repaired.
4815 local repaired=$($SHOW_NAMESPACE |
4816 awk '/^striped_dirs_repaired/ { print $2 }')
4817 [ $repaired -eq 1 ] ||
4818 error "(4) Fail to re-generate master LMV EA: $repaired"
4820 umount_client $MOUNT || error "(5) umount failed"
4821 mount_client $MOUNT || error "(6) mount failed"
# After the remount the directory must list as empty; any output from ls is
# reported as the master LMV EA not having been repaired.
4823 local empty=$(ls $DIR/$tdir/striped_dir/)
4824 [ -z "$empty" ] || error "(7) The master LMV EA is not repaired: $empty"
4826 rmdir $DIR/$tdir/striped_dir ||
4827 error "(8) Fail to remove the striped directory after LFSCK"
4829 run_test 31c "Re-generate the lost master LMV EA for striped directory"
# test_31d: a striped directory lost its master LMV EA and was then modified.
# The namespace LFSCK must NOT re-generate the EA; the directory is expected to
# end up read-only instead. Skipped when fewer than two MDTs are configured.
4832 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4835 echo "For some reason, the master MDT-object of the striped directory"
4836 echo "may lost its master LMV EA. If somebody created files under the"
4837 echo "master directly after the master LMV EA lost, then the LFSCK"
4838 echo "should NOT re-generate the master LMV EA, instead, it should"
4839 echo "change the broken striped dirctory as read-only to prevent"
4840 echo "further damage"
4843 check_mount_and_prep
4845 echo "Inject failure stub on MDT0 to simulate the case that the"
4846 echo "master MDT-object of the striped directory lost the LMV EA."
# The fault is armed on $SINGLEMDS only while the striped directory is created.
4848 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
4849 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
4850 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4851 error "(1) Fail to create striped directory"
4852 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0
# Remount, then create a file under the broken directory: this is the
# "modified after broken" condition that distinguishes this test.
4854 umount_client $MOUNT || error "(2) umount failed"
4855 mount_client $MOUNT || error "(3) mount failed"
4857 touch $DIR/$tdir/striped_dir/dummy ||
4858 error "(4) Fail to touch under broken striped directory"
4860 echo "Trigger namespace LFSCK to find out the inconsistency"
4861 $START_NAMESPACE -r -A ||
4862 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): no error step "(6)" appears in this test; presumably the
# trailing 6 is the step id wait_all_targets_blocked reports on failure.
4864 wait_all_targets_blocked namespace completed 6
# Zero repaired striped directories is the expected outcome here.
4866 local repaired=$($SHOW_NAMESPACE |
4867 awk '/^striped_dirs_repaired/ { print $2 }')
4868 [ $repaired -eq 0 ] ||
4869 error "(7) Re-generate master LMV EA unexpected: $repaired"
# The existing file must still be visible ...
4871 stat $DIR/$tdir/striped_dir/dummy ||
4872 error "(8) Fail to stat $DIR/$tdir/striped_dir/dummy"
# ... but creating a new one must fail (success of touch is the error case).
4874 touch $DIR/$tdir/striped_dir/foo &&
4875 error "(9) The broken striped directory should be read-only"
# Clear the immutable flag so that the directory can be removed for cleanup.
4877 chattr -i $DIR/$tdir/striped_dir ||
4878 error "(10) Fail to chattr on the broken striped directory"
4880 rmdir $DIR/$tdir/striped_dir ||
4881 error "(11) Fail to remove the striped directory after LFSCK"
4883 run_test 31d "Set broken striped directory (modified after broken) as read-only"
# test_31e: the slave object that lives on the same MDT as the master lost its
# slave LMV EA (fail_val=0); the namespace LFSCK must re-generate it.
# Skipped when fewer than two MDTs are configured.
4886 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4889 echo "For some reason, the slave MDT-object of the striped directory"
4890 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
4891 echo "slave LMV EA."
4894 check_mount_and_prep
4896 echo "Inject failure stub on MDT0 to simulate the case that the"
4897 echo "slave MDT-object (that resides on the same MDT as the master"
4898 echo "MDT-object resides on) lost the LMV EA."
# The fault is armed on $SINGLEMDS only while the striped directory is
# created; both fail_loc and fail_val are reset afterwards.
4900 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
4901 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=0
4902 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4903 error "(1) Fail to create striped directory"
4904 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4906 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
4907 $START_NAMESPACE -r -A ||
4908 error "(2) Fail to start LFSCK for namespace"
# NOTE(review): no error step "(3)" appears in this test; presumably the
# trailing 3 is the step id wait_all_targets_blocked reports on failure.
4910 wait_all_targets_blocked namespace completed 3
# Exactly one repaired shard is expected in the $SHOW_NAMESPACE output.
4912 local repaired=$($SHOW_NAMESPACE |
4913 awk '/^striped_shards_repaired/ { print $2 }')
4914 [ $repaired -eq 1 ] ||
4915 error "(4) Fail to re-generate slave LMV EA: $repaired"
4917 rmdir $DIR/$tdir/striped_dir ||
4918 error "(5) Fail to remove the striped directory after LFSCK"
4920 run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
# test_31f: a slave object on a different MDT than the master lost its slave
# LMV EA (fail_val=1); the namespace LFSCK must re-generate it. The repair
# counter is read from mds2. Skipped when fewer than two MDTs are configured.
4923 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4926 echo "For some reason, the slave MDT-object of the striped directory"
4927 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
4928 echo "slave LMV EA."
4931 check_mount_and_prep
4933 echo "Inject failure stub on MDT0 to simulate the case that the"
4934 echo "slave MDT-object (that resides on different MDT as the master"
4935 echo "MDT-object resides on) lost the LMV EA."
# The fault is armed on $SINGLEMDS only while the striped directory is
# created; both fail_loc and fail_val are reset afterwards.
4937 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
4938 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=1
4939 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4940 error "(1) Fail to create striped directory"
4941 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4943 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
4944 $START_NAMESPACE -r -A ||
4945 error "(2) Fail to start LFSCK for namespace"
# NOTE(review): no error step "(3)" appears in this test; presumably the
# trailing 3 is the step id wait_all_targets_blocked reports on failure.
4947 wait_all_targets_blocked namespace completed 3
# The counter is queried on mds2: exactly one repaired shard is expected in
# that MDT's lfsck_namespace parameter.
4949 local repaired=$(do_facet mds2 $LCTL get_param -n \
4950 mdd.$(facet_svc mds2).lfsck_namespace |
4951 awk '/^striped_shards_repaired/ { print $2 }')
4952 [ $repaired -eq 1 ] ||
4953 error "(4) Fail to re-generate slave LMV EA: $repaired"
4955 rmdir $DIR/$tdir/striped_dir ||
4956 error "(5) Fail to remove the striped directory after LFSCK"
4958 run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
# test_31g: the stripe index stored in a slave LMV EA is corrupted; the
# namespace LFSCK must repair it, after which the directory must be usable
# again. Skipped when fewer than two MDTs are configured.
4961 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4964 echo "For some reason, the stripe index in the slave LMV EA is"
4965 echo "corrupted. The LFSCK should repair the slave LMV EA."
4968 check_mount_and_prep
4970 echo "Inject failure stub on MDT0 to simulate the case that the"
4971 echo "slave LMV EA on the first shard of the striped directory"
4972 echo "claims the same index as the second shard claims"
# The fault is armed on $SINGLEMDS only while the striped directory is
# created; both fail_loc and fail_val are reset afterwards.
4974 #define OBD_FAIL_LFSCK_BAD_SLAVE_LMV 0x162b
4975 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162b fail_val=0
4976 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4977 error "(1) Fail to create striped directory"
4978 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4980 echo "Trigger namespace LFSCK to repair the slave LMV EA"
4981 $START_NAMESPACE -r -A ||
4982 error "(2) Fail to start LFSCK for namespace"
# NOTE(review): no error step "(3)" appears in this test; presumably the
# trailing 3 is the step id wait_all_targets_blocked reports on failure.
4984 wait_all_targets_blocked namespace completed 3
# Exactly one repaired shard is expected in the $SHOW_NAMESPACE output.
4986 local repaired=$($SHOW_NAMESPACE |
4987 awk '/^striped_shards_repaired/ { print $2 }')
4988 [ $repaired -eq 1 ] ||
4989 error "(4) Fail to repair slave LMV EA: $repaired"
# Remount, then prove the repaired directory accepts create, unlink and rmdir.
4991 umount_client $MOUNT || error "(5) umount failed"
4992 mount_client $MOUNT || error "(6) mount failed"
4994 touch $DIR/$tdir/striped_dir/foo ||
4995 error "(7) Fail to touch file after the LFSCK"
4997 rm -f $DIR/$tdir/striped_dir/foo ||
4998 error "(8) Fail to unlink file after the LFSCK"
5000 rmdir $DIR/$tdir/striped_dir ||
5001 error "(9) Fail to remove the striped directory after LFSCK"
5003 run_test 31g "Repair the corrupted slave LMV EA"
# test_31h: the name entry of a shard inside a striped directory is corrupted;
# the namespace LFSCK must repair that entry, after which the directory must be
# usable again. Skipped when fewer than two MDTs are configured.
5006 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5009 echo "For some reason, the shard's name entry in the striped"
5010 echo "directory may be corrupted. The LFSCK should repair the"
5011 echo "bad shard's name entry."
5014 check_mount_and_prep
5016 echo "Inject failure stub on MDT0 to simulate the case that the"
5017 echo "first shard's name entry in the striped directory claims"
5018 echo "the same index as the second shard's name entry claims."
# The fault is armed on $SINGLEMDS only while the striped directory is
# created; both fail_loc and fail_val are reset afterwards.
5020 #define OBD_FAIL_LFSCK_BAD_SLAVE_NAME 0x162c
5021 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162c fail_val=0
5022 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5023 error "(1) Fail to create striped directory"
5024 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5026 echo "Trigger namespace LFSCK to repair the shard's name entry"
5027 $START_NAMESPACE -r -A ||
5028 error "(2) Fail to start LFSCK for namespace"
# NOTE(review): no error step "(3)" appears in this test; presumably the
# trailing 3 is the step id wait_all_targets_blocked reports on failure.
5030 wait_all_targets_blocked namespace completed 3
# Unlike the LMV EA tests, the counter checked here is dirent_repaired; exactly
# one repaired entry is expected.
5032 local repaired=$($SHOW_NAMESPACE |
5033 awk '/^dirent_repaired/ { print $2 }')
5034 [ $repaired -eq 1 ] ||
5035 error "(4) Fail to repair shard's name entry: $repaired"
# Remount, then prove the repaired directory accepts create, unlink and rmdir.
5037 umount_client $MOUNT || error "(5) umount failed"
5038 mount_client $MOUNT || error "(6) mount failed"
5040 touch $DIR/$tdir/striped_dir/foo ||
5041 error "(7) Fail to touch file after the LFSCK"
5043 rm -f $DIR/$tdir/striped_dir/foo ||
5044 error "(8) Fail to unlink file after the LFSCK"
5046 rmdir $DIR/$tdir/striped_dir ||
5047 error "(9) Fail to remove the striped directory after LFSCK"
5049 run_test 31h "Repair the corrupted shard's name entry"
# test_32a: a running layout LFSCK must still be stoppable after an OST has
# gone away. The client is unmounted for the duration of the test.
# NOTE(review): any preparation done before this umount is not visible in this
# excerpt.
5054 umount_client $MOUNT
# Arm the engine-delay fault (fail_val=3) before starting the scan -- presumably
# so that the scan is still in phase 1 when the status is sampled below.
5056 #define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d
5057 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
5058 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
# The status is read once, immediately after the start, and must be
# 'scanning-phase1'.
5060 local STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
5061 [ "$STATUS" == "scanning-phase1" ] ||
5062 error "(2) Expect 'scanning-phase1', but got '$STATUS'"
# Take ost1 down while the scan is in progress, then release the delay.
5065 stop ost1 > /dev/null || error "(3) Fail to stop OST1!"
5067 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# The actual assertion of this test: stopping LFSCK must succeed with ost1 down.
5071 $STOP_LFSCK || error "(4) Fail to stop LFSCK!"
# Bring ost1 back so that later tests find the full configuration.
5073 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
5074 error "(5) Fail to start ost1"
5076 run_test 32a "stop LFSCK when some OST failed"
# test_32b: a running namespace LFSCK must still be stoppable after an MDT
# (mds2) has gone away. Skipped when fewer than two MDTs are configured.
5080 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# Build a small cross-MDT tree: dp with "-i 1", and two children created with
# "-i 0 -c $MDSCOUNT" below it; then unmount the client.
5083 $LFS mkdir -i 1 $DIR/$tdir/dp ||
5084 error "(1) Fail to create $DIR/$tdir/dp"
5085 $LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc1 ||
5086 error "(2) Fail to create $DIR/$tdir/dp/dc1"
5087 $LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc2 ||
5088 error "(3) Fail to create $DIR/$tdir/dp/dc2"
5089 umount_client $MOUNT
# Arm the engine-delay fault (fail_val=3) before starting the scan -- presumably
# so that the scan stays in phase 1 long enough to be observed.
5091 #define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d
5092 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
5093 $START_NAMESPACE -r -A || error "(4) Fail to start LFSCK for namespace!"
# Poll the status field on $SINGLEMDS until it reads 'scanning-phase1' (last
# argument 32, presumably a timeout in seconds). The \\\$2 escaping presumably
# keeps awk's $2 from being expanded before it reaches the remote awk.
# NOTE(review): the closing of the "|| {" group is not visible in this excerpt.
5095 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5096 mdd.${MDT_DEV}.lfsck_namespace |
5097 awk '/^status/ { print \\\$2 }'" "scanning-phase1" 32 || {
5099 error "(5) unexpected status"
# Take mds2 down while the scan is in progress, then release the delay.
5103 stop mds2 > /dev/null || error "(6) Fail to stop MDT2!"
5105 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# The actual assertion of this test: stopping LFSCK must succeed with mds2 down.
5109 $STOP_LFSCK || error "(7) Fail to stop LFSCK!"
# Bring mds2 back so that later tests find the full configuration.
5111 start mds2 $(mdsdevname 2) $MOUNT_OPTS_NOSCRUB > /dev/null ||
5112 error "(8) Fail to start MDT2"
5114 run_test 32b "stop LFSCK when some MDT failed"
# test_33: the options given to lfsck_start must be reflected in the "param"
# line of the corresponding LFSCK status output.
# Layout LFSCK started with --dryrun -o -r ...
5120 $START_LAYOUT --dryrun -o -r ||
5121 error "(1) Fail to start layout LFSCK"
# NOTE(review): no error step "(2)" appears here; presumably the trailing 2 is
# the step id wait_all_targets_blocked reports on failure.
5122 wait_all_targets_blocked layout completed 2
# ... must report exactly "dryrun,all_targets,orphan".
5124 local PARAMS=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
5125 [ "$PARAMS" == "dryrun,all_targets,orphan" ] ||
5126 error "(3) Expect 'dryrun,all_targets,orphan', got '$PARAMS'"
# Namespace LFSCK started with -e abort -A -r ...
5128 $START_NAMESPACE -e abort -A -r ||
5129 error "(4) Fail to start namespace LFSCK"
# NOTE(review): as above, 5 is presumably the helper's failure step id.
5130 wait_all_targets_blocked namespace completed 5
# ... must report exactly "failout,all_targets".
5132 PARAMS=$($SHOW_NAMESPACE | awk '/^param/ { print $2 }')
5133 [ "$PARAMS" == "failout,all_targets" ] ||
5134 error "(6) Expect 'failout,all_targets', got '$PARAMS'"
5136 run_test 33 "check LFSCK paramters"
# test_34: the namespace LFSCK must rebuild a lost agent object, and a second
# run must find nothing left to repair. Runs only with at least two MDTs and a
# ZFS backend on $SINGLEMDS.
5140 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5141 [ $(facet_fstype $SINGLEMDS) != zfs ] &&
5142 skip "Only valid for ZFS backend" && return
# The fault is armed on $SINGLEMDS while a directory is created with "-i 1",
# and cleared afterwards.
5146 #define OBD_FAIL_LFSCK_NO_AGENTOBJ 0x1630
5147 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1630
5148 $LFS mkdir -i 1 $DIR/$tdir/dummy ||
5149 error "(1) Fail to create $DIR/$tdir/dummy"
5151 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# First pass: started without -A, and only the status on $SINGLEMDS is polled
# (last argument 32, presumably a timeout in seconds).
# NOTE(review): the closing of the "|| {" group is not visible in this excerpt.
5152 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
5153 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5154 mdd.${MDT_DEV}.lfsck_namespace |
5155 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5157 error "(3) unexpected status"
# Exactly one repaired directory entry is expected after the first pass.
5160 local repaired=$($SHOW_NAMESPACE |
5161 awk '/^dirent_repaired/ { print $2 }')
5162 [ $repaired -eq 1 ] ||
5163 error "(4) Fail to repair the lost agent object: $repaired"
# Second pass: the same scan must now complete with zero repairs.
5165 $START_NAMESPACE -r || error "(5) Fail to start LFSCK for namespace!"
5166 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5167 mdd.${MDT_DEV}.lfsck_namespace |
5168 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5170 error "(6) unexpected status"
5173 repaired=$($SHOW_NAMESPACE | awk '/^dirent_repaired/ { print $2 }')
5174 [ $repaired -eq 0 ] ||
5175 error "(7) Unexpected repairing: $repaired"
5177 run_test 34 "LFSCK can rebuild the lost agent object"
# test_35: the namespace LFSCK must rebuild a lost agent entry, and a second
# run (after the servers were restarted) must find nothing left to repair.
# Skipped when fewer than two MDTs are configured.
5181 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# The fault is armed on mds2 while a directory is created with "-i 1", and
# cleared afterwards.
5185 #define OBD_FAIL_LFSCK_NO_AGENTENT 0x1631
5186 do_facet mds2 $LCTL set_param fail_loc=0x1631
5187 $LFS mkdir -i 1 $DIR/$tdir/dummy ||
5188 error "(1) Fail to create $DIR/$tdir/dummy"
5191 do_facet mds2 $LCTL set_param fail_loc=0
# First pass on all targets (-A); completion is polled on mds2 only, with the
# timeout taken from $LTIME (defined outside this block).
# The failure message names MDS2 directly because this wait is hard-wired to
# mds2; nothing visible in this test assigns a per-MDT loop variable.
5192 $START_NAMESPACE -A -r || error "(2) Fail to start LFSCK for namespace!"
5193 wait_update_facet mds2 "$LCTL get_param -n \
5194 mdd.$(facet_svc mds2).lfsck_namespace |
5195 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
5196 error "(3) MDS2 is not the expected 'completed'"
# Exactly one repaired agent entry is expected in mds2's lfsck_namespace output.
5198 local repaired=$(do_facet mds2 $LCTL get_param -n \
5199 mdd.$(facet_svc mds2).lfsck_namespace |
5200 awk '/^agent_entries_repaired/ { print $2 }')
5201 [ $repaired -eq 1 ] ||
5202 error "(4) Fail to repair the lost agent entry: $repaired"
# NOTE(review): the stopall step announced by this message is not visible in
# this excerpt; only the setupall that brings the system back up is.
5204 echo "stopall to cleanup object cache"
5207 setupall > /dev/null
# Second pass: the same scan must now complete on mds2 with zero repairs.
5209 $START_NAMESPACE -A -r || error "(5) Fail to start LFSCK for namespace!"
5210 wait_update_facet mds2 "$LCTL get_param -n \
5211 mdd.$(facet_svc mds2).lfsck_namespace |
5212 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
5213 error "(6) MDS2 is not the expected 'completed'"
5215 repaired=$(do_facet mds2 $LCTL get_param -n \
5216 mdd.$(facet_svc mds2).lfsck_namespace |
5217 awk '/^agent_entries_repaired/ { print $2 }')
5218 [ $repaired -eq 0 ] ||
5219 error "(7) Unexpected repairing: $repaired"
5221 run_test 35 "LFSCK can rebuild the lost agent entry"
# test_36a: three mirrored files each lose one mirror from their LOV EA (a
# different mirror id per file); the layout LFSCK must rebuild the missing
# mirror from the orphaned OST-objects. Skipped with fewer than three OSTs.
5224 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5227 echo "The target MDT-object's LOV EA corrupted as to lose one of the "
5228 echo "mirrors information. The layout LFSCK should rebuild the LOV EA "
5229 echo "with the PFID EA of related OST-object(s) belong to the mirror."
5232 check_mount_and_prep
# f0, f1 and f2 are created with the identical three-mirror layout: each mirror
# (-N) has two components (-E), the first on two OSTs and the second on one.
5234 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5235 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f0 ||
5236 error "(0) Fail to create mirror file $DIR/$tdir/f0"
5237 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5238 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f1 ||
5239 error "(1) Fail to create mirror file $DIR/$tdir/f1"
5240 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5241 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f2 ||
5242 error "(2) Fail to create mirror file $DIR/$tdir/f2"
# Write 4 MiB to each file and resync -- presumably so that every component of
# every mirror has instantiated OST-objects (TODO confirm).
5244 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5245 error "(3) Fail to write $DIR/$tdir/f0"
5246 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=4 ||
5247 error "(4) Fail to write $DIR/$tdir/f1"
5248 dd if=/dev/zero of=$DIR/$tdir/f2 bs=1M count=4 ||
5249 error "(5) Fail to write $DIR/$tdir/f2"
5251 $LFS mirror resync $DIR/$tdir/f0 ||
5252 error "(6) Fail to resync $DIR/$tdir/f0"
5253 $LFS mirror resync $DIR/$tdir/f1 ||
5254 error "(7) Fail to resync $DIR/$tdir/f1"
5255 $LFS mirror resync $DIR/$tdir/f2 ||
5256 error "(8) Fail to resync $DIR/$tdir/f2"
5258 cancel_lru_locks mdc
5259 cancel_lru_locks osc
# Dump the layouts before the damage is injected (output is informational;
# only the exit status is checked).
5261 $LFS getstripe $DIR/$tdir/f0 ||
5262 error "(9) Fail to getstripe for $DIR/$tdir/f0"
5263 $LFS getstripe $DIR/$tdir/f1 ||
5264 error "(10) Fail to getstripe for $DIR/$tdir/f1"
5265 $LFS getstripe $DIR/$tdir/f2 ||
5266 error "(11) Fail to getstripe for $DIR/$tdir/f2"
5268 echo "Inject failure, to simulate the case of missing one mirror in LOV"
# With the fault armed on mds1, a different mirror id is split off and deleted
# (-d) from each file: id 1 from f0, id 2 from f1, id 3 from f2.
5269 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5270 do_facet mds1 $LCTL set_param fail_loc=0x1616
5272 $LFS mirror split --mirror-id 1 -d $DIR/$tdir/f0 ||
5273 error "(12) Fail to split 1st mirror from $DIR/$tdir/f0"
5274 $LFS mirror split --mirror-id 2 -d $DIR/$tdir/f1 ||
5275 error "(13) Fail to split 2nd mirror from $DIR/$tdir/f1"
5276 $LFS mirror split --mirror-id 3 -d $DIR/$tdir/f2 ||
5277 error "(14) Fail to split 3rd mirror from $DIR/$tdir/f2"
5281 do_facet mds1 $LCTL set_param fail_loc=0
# Sanity check of the injected state: the removed mirror id must no longer be
# listed (a grep match is the error case) ...
5283 $LFS getstripe $DIR/$tdir/f0 | grep "lcme_mirror_id:.*1" &&
5284 error "(15) The 1st of mirror is not destroyed"
5285 $LFS getstripe $DIR/$tdir/f1 | grep "lcme_mirror_id:.*2" &&
5286 error "(16) The 2nd of mirror is not destroyed"
5287 $LFS getstripe $DIR/$tdir/f2 | grep "lcme_mirror_id:.*3" &&
5288 error "(17) The 3rd of mirror is not destroyed"
# ... and each file must report two remaining mirrors.
5292 mirrors=$($LFS getstripe -N $DIR/$tdir/f0)
5293 [ $mirrors -eq 2 ] || error "(18) $DIR/$tdir/f0 has $mirrors mirrors"
5294 mirrors=$($LFS getstripe -N $DIR/$tdir/f1)
5295 [ $mirrors -eq 2 ] || error "(19) $DIR/$tdir/f1 has $mirrors mirrors"
5296 mirrors=$($LFS getstripe -N $DIR/$tdir/f2)
5297 [ $mirrors -eq 2 ] || error "(20) $DIR/$tdir/f2 has $mirrors mirrors"
5299 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5300 $START_LAYOUT -r -o || error "(21) Fail to start LFSCK for layout!"
# Wait for every MDT to report 'completed'.
# NOTE(review): the loop terminators of the two loops below are not visible in
# this excerpt.
5302 for k in $(seq $MDSCOUNT); do
5303 # The LFSCK status query interval is 30 seconds. For the case
5304 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5305 # time to guarantee the status sync up.
5306 wait_update_facet mds${k} "$LCTL get_param -n \
5307 mdd.$(facet_svc mds${k}).lfsck_layout |
5308 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5309 error "(22) MDS${k} is not the expected 'completed'"
# The OSTs are sampled once each (no polling) and must already be 'completed'.
5312 for k in $(seq $OSTCOUNT); do
5313 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5314 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5315 awk '/^status/ { print $2 }')
5316 [ "$cur_status" == "completed" ] ||
5317 error "(23) OST${k} Expect 'completed', but got '$cur_status'"
# Nine repaired orphans are expected on mds1 -- presumably three OST-objects
# for the one deleted mirror in each of the three files (TODO confirm).
5320 local repaired=$(do_facet mds1 $LCTL get_param -n \
5321 mdd.$(facet_svc mds1).lfsck_layout |
5322 awk '/^repaired_orphan/ { print $2 }')
5323 [ $repaired -eq 9 ] ||
5324 error "(24) Expect 9 fixed on mds1, but got: $repaired"
# Every file must be back to three mirrors ...
5326 mirrors=$($LFS getstripe -N $DIR/$tdir/f0)
5327 [ $mirrors -eq 3 ] || error "(25) $DIR/$tdir/f0 has $mirrors mirrors"
5328 mirrors=$($LFS getstripe -N $DIR/$tdir/f1)
5329 [ $mirrors -eq 3 ] || error "(26) $DIR/$tdir/f1 has $mirrors mirrors"
5330 mirrors=$($LFS getstripe -N $DIR/$tdir/f2)
5331 [ $mirrors -eq 3 ] || error "(27) $DIR/$tdir/f2 has $mirrors mirrors"
# ... and the specific mirror id removed from each file must be listed again.
# On failure the full layout is dumped before the error is raised.
# NOTE(review): the closings of the "|| {" groups are not visible in this excerpt.
5333 $LFS getstripe $DIR/$tdir/f0 | grep "lcme_mirror_id:.*1" || {
5334 $LFS getstripe $DIR/$tdir/f0
5335 error "(28) The 1st of mirror is not recovered"
5338 $LFS getstripe $DIR/$tdir/f1 | grep "lcme_mirror_id:.*2" || {
5339 $LFS getstripe $DIR/$tdir/f1
5340 error "(29) The 2nd of mirror is not recovered"
5343 $LFS getstripe $DIR/$tdir/f2 | grep "lcme_mirror_id:.*3" || {
5344 $LFS getstripe $DIR/$tdir/f2
5345 error "(30) The 3rd of mirror is not recovered"
5348 run_test 36a "rebuild LOV EA for mirrored file (1)"
# test_36b: a fully resynced three-mirror file loses its MDT-object while its
# OST-objects survive; the layout LFSCK must rebuild the file (all mirrors and
# components) under .lustre/lost+found. Skipped with fewer than three OSTs.
5351 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5354 echo "The mirrored file lost its MDT-object, but relatd OST-objects "
5355 echo "are still there. The layout LFSCK should rebuild the LOV EA "
5356 echo "with the PFID EA of related OST-object(s) belong to the file. "
5359 check_mount_and_prep
# Three mirrors (-N), two components each (-E): 6 components in total, which
# is what the component-count check at the end expects.
5361 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5362 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f0 ||
5363 error "(0) Fail to create mirror file $DIR/$tdir/f0"
# Remember the FID now: the recovered file is later looked up by a name built
# from it.
5365 local fid=$($LFS path2fid $DIR/$tdir/f0)
5367 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5368 error "(1) Fail to write $DIR/$tdir/f0"
5369 $LFS mirror resync $DIR/$tdir/f0 ||
5370 error "(2) Fail to resync $DIR/$tdir/f0"
5372 cancel_lru_locks mdc
5373 cancel_lru_locks osc
5375 $LFS getstripe $DIR/$tdir/f0 ||
5376 error "(3) Fail to getstripe for $DIR/$tdir/f0"
5378 echo "Inject failure, to simulate the case of missing the MDT-object"
# The file is removed while the fault is armed on mds1; per the echo above this
# simulates losing only the MDT-object.
5379 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5380 do_facet mds1 $LCTL set_param fail_loc=0x1616
5381 rm -f $DIR/$tdir/f0 || error "(4) Fail to remove $DIR/$tdir/f0"
5385 do_facet mds1 $LCTL set_param fail_loc=0
5387 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5388 $START_LAYOUT -r -o || error "(5) Fail to start LFSCK for layout!"
# Wait for every MDT to report 'completed'.
# NOTE(review): the loop terminators of the two loops below are not visible in
# this excerpt.
5390 for k in $(seq $MDSCOUNT); do
5391 # The LFSCK status query interval is 30 seconds. For the case
5392 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5393 # time to guarantee the status sync up.
5394 wait_update_facet mds${k} "$LCTL get_param -n \
5395 mdd.$(facet_svc mds${k}).lfsck_layout |
5396 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5397 error "(6) MDS${k} is not the expected 'completed'"
# The OSTs are sampled once each (no polling) and must already be 'completed'.
5400 for k in $(seq $OSTCOUNT); do
5401 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5402 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5403 awk '/^status/ { print $2 }')
5404 [ "$cur_status" == "completed" ] ||
5405 error "(7) OST${k} Expect 'completed', but got '$cur_status'"
# Nine repaired orphans are expected on mds1 -- presumably three OST-objects
# per mirror times three mirrors (TODO confirm).
5408 local count=$(do_facet mds1 $LCTL get_param -n \
5409 mdd.$(facet_svc mds1).lfsck_layout |
5410 awk '/^repaired_orphan/ { print $2 }')
5411 [ $count -eq 9 ] || error "(8) Expect 9 fixed on mds1, but got: $count"
# The rebuilt file is expected under lost+found/MDT0000 as "<fid>-R-0"; it must
# again have 3 mirrors and 6 components. $count is reused for each value.
5413 local name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
5414 count=$($LFS getstripe --mirror-count $name)
5415 [ $count -eq 3 ] || error "(9) $DIR/$tdir/f0 has $count mirrors"
5417 count=$($LFS getstripe --component-count $name)
5418 [ $count -eq 6 ] || error "(10) $DIR/$tdir/f0 has $count components"
# Each mirror id 1..3 must be listed; on failure the full layout is dumped
# before the error is raised.
# NOTE(review): the closings of the "|| {" groups are not visible in this excerpt.
5420 $LFS getstripe $name | grep "lcme_mirror_id:.*1" || {
5421 $LFS getstripe $name
5422 error "(11) The 1st of mirror is not recovered"
5425 $LFS getstripe $name | grep "lcme_mirror_id:.*2" || {
5426 $LFS getstripe $name
5427 error "(12) The 2nd of mirror is not recovered"
5430 $LFS getstripe $name | grep "lcme_mirror_id:.*3" || {
5431 $LFS getstripe $name
5432 error "(13) The 3rd of mirror is not recovered"
5435 run_test 36b "rebuild LOV EA for mirrored file (2)"
# test_36c: a two-mirror file is modified after its last resync (so one mirror
# is stale) and then loses its MDT-object; the layout LFSCK must rebuild the
# file under .lustre/lost+found with the same component flags as before.
# Skipped with fewer than three OSTs.
5438 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5441 echo "The mirrored file has been modified, not resynced yet, then "
5442 echo "lost its MDT-object, but relatd OST-objects are still there. "
5443 echo "The layout LFSCK should rebuild the LOV EA and relatd status "
5444 echo "with the PFID EA of related OST-object(s) belong to the file. "
5447 check_mount_and_prep
# Two mirrors (-N), two components each (-E): 4 components in total, matching
# the mirror-count and component-count checks at the end.
# NOTE(review): the continuation that names the target file is not visible
# between the next two lines of this excerpt.
5449 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5451 error "(0) Fail to create mirror file $DIR/$tdir/f0"
# Remember the FID now: the recovered file is later looked up by a name built
# from it.
5453 local fid=$($LFS path2fid $DIR/$tdir/f0)
5455 # The 1st dd && resync makes all related OST-objects have been written
5456 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5457 error "(1.1) Fail to write $DIR/$tdir/f0"
5458 $LFS mirror resync $DIR/$tdir/f0 ||
5459 error "(1.2) Fail to resync $DIR/$tdir/f0"
5460 # The 2nd dd makes one mirror to be stale
5461 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5462 error "(1.3) Fail to write $DIR/$tdir/f0"
5464 cancel_lru_locks mdc
5465 cancel_lru_locks osc
5467 $LFS getstripe $DIR/$tdir/f0 ||
5468 error "(2) Fail to getstripe for $DIR/$tdir/f0"
# Record the lcme_flags values found in the first and in the last ten lines of
# the getstripe output; the same two slices are compared after recovery.
5470 local saved_flags1=$($LFS getstripe $DIR/$tdir/f0 | head -n 10 |
5471 awk '/lcme_flags/ { print $2 }')
5472 local saved_flags2=$($LFS getstripe $DIR/$tdir/f0 | tail -n 10 |
5473 awk '/lcme_flags/ { print $2 }')
5475 echo "Inject failure, to simulate the case of missing the MDT-object"
# The file is removed while the fault is armed on mds1; per the echo above this
# simulates losing only the MDT-object.
5476 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5477 do_facet mds1 $LCTL set_param fail_loc=0x1616
5478 rm -f $DIR/$tdir/f0 || error "(3) Fail to remove $DIR/$tdir/f0"
5482 do_facet mds1 $LCTL set_param fail_loc=0
5484 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5485 $START_LAYOUT -r -o || error "(4) Fail to start LFSCK for layout!"
# Wait for every MDT to report 'completed'.
# NOTE(review): the loop terminators of the two loops below are not visible in
# this excerpt.
5487 for k in $(seq $MDSCOUNT); do
5488 # The LFSCK status query interval is 30 seconds. For the case
5489 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5490 # time to guarantee the status sync up.
5491 wait_update_facet mds${k} "$LCTL get_param -n \
5492 mdd.$(facet_svc mds${k}).lfsck_layout |
5493 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5494 error "(5) MDS${k} is not the expected 'completed'"
# The OSTs are sampled once each (no polling) and must already be 'completed'.
5497 for k in $(seq $OSTCOUNT); do
5498 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5499 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5500 awk '/^status/ { print $2 }')
5501 [ "$cur_status" == "completed" ] ||
5502 error "(6) OST${k} Expect 'completed', but got '$cur_status'"
# Six repaired orphans are expected on mds1; the failure message states the
# same number the condition checks (2 mirrors here, presumably 3 OST-objects
# each -- TODO confirm the per-mirror count).
5505 local count=$(do_facet mds1 $LCTL get_param -n \
5506 mdd.$(facet_svc mds1).lfsck_layout |
5507 awk '/^repaired_orphan/ { print $2 }')
5508 [ $count -eq 6 ] || error "(7) Expect 6 fixed on mds1, but got: $count"
# The rebuilt file is expected under lost+found/MDT0000 as "<fid>-R-0"; it must
# again have 2 mirrors and 4 components. $count is reused for each value.
5510 local name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
5511 count=$($LFS getstripe --mirror-count $name)
5512 [ $count -eq 2 ] || error "(8) $DIR/$tdir/f0 has $count mirrors"
5514 count=$($LFS getstripe --component-count $name)
5515 [ $count -eq 4 ] || error "(9) $DIR/$tdir/f0 has $count components"
# The component flags in the head and tail slices must equal the values saved
# before the MDT-object was lost; on mismatch the full layout is dumped first.
# NOTE(review): the closings of the "|| {" groups are not visible in this excerpt.
5517 local flags=$($LFS getstripe $name | head -n 10 |
5518 awk '/lcme_flags/ { print $2 }')
5519 [ "$flags" == "$saved_flags1" ] || {
5520 $LFS getstripe $name
5521 error "(10) expect flags $saved_flags1, got $flags"
5524 flags=$($LFS getstripe $name | tail -n 10 |
5525 awk '/lcme_flags/ { print $2 }')
5526 [ "$flags" == "$saved_flags2" ] || {
5527 $LFS getstripe $name
5528 error "(11) expect flags $saved_flags2, got $flags"
5531 run_test 36c "rebuild LOV EA for mirrored file (3)"
# test_37 ("LFSCK must skip a ORPHAN"): runs a namespace LFSCK on all targets
# while a background multiop is paused on a directory created on MDT0.
# NOTE(review): the statements that assign PID, and whatever follows the final
# wait, are not visible in this excerpt.
5537 local t_dir="$DIR/$tdir/d0"
5538 check_mount_and_prep
5540 $LFS mkdir -i 0 $t_dir || error "(2) Fail to mkdir $t_dir on MDT0"
# Start the background multiop on the directory; $? in the message is the exit
# status of multiop_bg_pause.
5541 multiop_bg_pause $t_dir D_c || { error "multiop failed: $?"; return 1; }
# If LFSCK cannot be started, signal the paused multiop BEFORE reporting the
# failure, so that the background process is released even when error() does
# not return to this function.
5545 $START_NAMESPACE -r -A || {
5546 kill -USR1 $PID; error "(3) Fail to start LFSCK for namespace!"; }
# NOTE(review): the trailing 4 is presumably the step id the helper reports on
# failure; no explicit error step "(4)" appears in this test.
5548 wait_all_targets_blocked namespace completed 4
5553 run_test 37 "LFSCK must skip a ORPHAN"
# test_40a: the composite layout of a file must be unchanged after its parent
# directory was migrated from MDT1 to MDT0 and a layout LFSCK has run.
# The guard uses the same "skip ... && return" form as the other tests in this
# file, so the test body is not entered on a single-MDT setup even if skip()
# returns to the caller.
5556 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5558 check_mount_and_prep
# Every setup step is checked: if any of them failed silently, both layout
# snapshots below could be equally empty and the final comparison would pass
# without testing anything.
5559 $LFS mkdir -i 1 $DIR/$tdir/dir1 || error "(1.1) Fail to mkdir $DIR/$tdir/dir1"
5560 $LFS setstripe -E 1M -c1 -S 1M -E 128M -c2 -S 4M -E eof $DIR/$tdir/dir1 || error "(1.2) Fail to set layout on $DIR/$tdir/dir1"
5562 touch $DIR/$tdir/dir1/f1 || error "(1.3) Fail to create $DIR/$tdir/dir1/f1"
# Snapshot of the file layout before migration and LFSCK.
5563 local layout1=$(get_layout_param $DIR/$tdir/dir1/f1)
5565 echo "Migrate $DIR/$tdir/dir1 from MDT1 to MDT0"
# NOTE(review): the migrate result is deliberately left unchecked here, as
# before; confirm whether a failed migration should fail the test.
5566 $LFS migrate -m 0 $DIR/$tdir/dir1
5568 echo "trigger LFSCK for layout"
5569 do_facet $SINGLEMDS $LCTL lfsck_start -M ${MDT_DEV} -t layout -r || error "(1.4) Fail to start LFSCK for layout!"
# Poll the layout LFSCK status on $SINGLEMDS until it reads 'completed' (last
# argument 32, presumably a timeout in seconds).
# NOTE(review): the closing of the "|| {" group is not visible in this excerpt.
5571 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5572 mdd.${MDT_DEV}.lfsck_layout |
5573 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5575 error "(2) unexpected status"
# Snapshot after LFSCK; it must be identical to the one taken before.
5578 local layout2=$(get_layout_param $DIR/$tdir/dir1/f1)
5580 [[ "$layout1" == "$layout2" ]] || error "layout lost after lfsck"
5582 run_test 40a "LFSCK correctly fixes lmm_oi in composite layout"
# Suite teardown. The SAVED_* values were captured at the top of the file
# before MDSSIZE/OSTSIZE/OSTCOUNT were reduced for faster formatting; put them
# back so that the reformat below uses the caller's original configuration.
5584 # restore MDS/OST size
5585 MDSSIZE=${SAVED_MDSSIZE}
5586 OSTSIZE=${SAVED_OSTSIZE}
5587 OSTCOUNT=${SAVED_OSTCOUNT}
# REFORMAT="yes" is passed in the environment of this single command only.
5589 # cleanup the system at last
5590 REFORMAT="yes" cleanup_and_setup_lustre
5593 check_and_cleanup_lustre