3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
11 #Bug number for excepting test
# Global setup: seed the list of excluded tests, load the test framework and
# the configuration file, gate on server version, cap target sizes/counts,
# reformat the filesystem, and extend the exclusion list per server version.
12 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT"
14 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
15 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
# LUSTRE defaults to the parent of the directory that contains this script.
17 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
18 . $LUSTRE/tests/test-framework.sh
# ":=" assigns the default path to CONFIG (when unset/empty) and sources it.
20 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
23 require_dsh_mds || exit 0
27 if ! check_versions; then
28 skip "It is NOT necessary to test lfsck under interoperation mode"
# NOTE(review): "skip ... && exit 0" only exits when skip returns success.
32 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.3.60) ]] &&
33 skip "Need MDS version at least 2.3.60" && exit 0
# Remember the caller's sizes/count before they are overridden below.
37 SAVED_MDSSIZE=${MDSSIZE}
38 SAVED_OSTSIZE=${OSTSIZE}
39 SAVED_OSTCOUNT=${OSTCOUNT}
40 # use small MDS + OST size to speed formatting time
41 # do not use too small MDSSIZE/OSTSIZE, which affect the default journal size
43 [ $(facet_fstype $SINGLEMDS) == zfs ] && MDSSIZE=300000
45 [ $(facet_fstype ost1) == zfs ] && OSTSIZE=300000
47 # no need too many OSTs, to reduce the format/start/stop overhead
49 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
51 # build up a clean test environment.
52 REFORMAT="yes" check_and_setup_lustre
# Each gate below appends test names to ALWAYS_EXCEPT when the queried
# server version is older than (or, for 2c, not newer than) the given release.
54 [[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.4.90) ]] &&
55 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2c"
57 [[ $(lustre_version_code ost1) -lt $(version_code 2.5.55) ]] &&
58 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 11 12 13 14 15 16 17 18 19 20 21"
60 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.50) ]] &&
61 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2d 2e 3 22 23 24 25 26 27 28 29 30 31"
63 # DNE does not support striped directory on zfs-based backend yet.
64 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
65 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 31"
# Shared command strings and mount options used by the tests. The *_DEV names
# are built from FSNAME; the START_*/STOP_*/SHOW_* values are unquoted command
# lines that the tests expand as "$START_NAMESPACE -r", "$SHOW_LAYOUT | awk".
69 MDT_DEV="${FSNAME}-MDT0000"
70 OST_DEV="${FSNAME}-OST0000"
# Strip the "mds" text from SINGLEMDS and pass the remainder to mdsdevname.
71 MDT_DEVNAME=$(mdsdevname ${SINGLEMDS//mds/})
72 START_NAMESPACE="do_facet $SINGLEMDS \
73 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
74 START_LAYOUT="do_facet $SINGLEMDS \
75 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
76 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
77 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
# The SHOW_* commands read the LFSCK state parameters that tests parse with awk.
78 SHOW_NAMESPACE="do_facet $SINGLEMDS \
79 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
80 SHOW_LAYOUT="do_facet $SINGLEMDS \
81 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
82 SHOW_LAYOUT_ON_OST="do_facet ost1 \
83 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# Mount option sets passed to "start" when an MDT is (re)mounted by a test.
84 MOUNT_OPTS_SCRUB="-o user_xattr"
85 MOUNT_OPTS_NOSCRUB="-o user_xattr,noscrub"
86 MOUNT_OPTS_SKIP_LFSCK="-o user_xattr,skip_lfsck"
# Data-preparation fragment (the enclosing header is not visible here).
# Uses $ndirs, $nfiles and $igif: populates $DIR/$tdir and, when $igif is
# non-empty, wraps the creation in fail_loc 0x1504 (OBD_FAIL_FID_IGIF).
95 echo "preparing... $nfiles * $ndirs files will be created $(date)."
96 if [ ! -z $igif ]; then
97 #define OBD_FAIL_FID_IGIF 0x1504
98 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
# Seed the test directory with the shell scripts from the tests directory.
101 cp $LUSTRE/tests/*.sh $DIR/$tdir/
102 if [ $ndirs -gt 0 ]; then
103 createmany -d $DIR/$tdir/d $ndirs
104 createmany -m $DIR/$tdir/f $ndirs
# Per-directory files are created only when nfiles > 0; a createmany failure
# is fatal via error.
105 if [ $nfiles -gt 0 ]; then
106 for ((i = 0; i < $ndirs; i++)); do
107 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
108 /dev/null || error "createmany $nfiles"
111 createmany -d $DIR/$tdir/e $ndirs
# In the igif case, create one extra file and then clear the fail_loc.
114 if [ ! -z $igif ]; then
115 touch $DIR/$tdir/dummy
116 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
119 echo "prepared $(date)."
# Stop the MDT, run e2fsck with "-n" against the device named by
# "mdsdevname 1", and restart the MDT with the noscrub mount options.
# Returns immediately when the MDT backend is not ldiskfs.
122 run_e2fsck_on_mdt0() {
123 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] && return
125 stop $SINGLEMDS > /dev/null || error "(0) Fail to the stop MDT0"
126 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n" |
128 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n"
129 error "(2) Detected inconsistency on MDT0"
# Bring MDT0 back with OI scrub disabled (MOUNT_OPTS_NOSCRUB).
131 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
132 error "(3) Fail to start MDT0"
# Query LFSCK component $com on MDT0000 of mds1 with "-w" and read field 2 of
# the "${com}_mdts_${status}" line; raise error ($err) unless that count
# equals MDSCOUNT. The query output is dumped before the error is raised.
135 wait_all_targets_blocked() {
140 local count=$(do_facet mds1 \
141 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 -w |
142 awk '/^${com}_mdts_${status}/ { print \\\$2 }'")
143 [[ $count -eq $MDSCOUNT ]] || {
144 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
145 error "($err) only $count of $MDSCOUNT MDTs are in ${status}"
# NOTE(review): the polling variant below (wait_update_facet, no "-w", bounded
# by $LTIME) presumably belongs to a sibling helper whose header is not
# visible in this extract - confirm against the full file.
154 wait_update_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 |
155 awk '/^${com}_mdts_${status}/ { print \\\$2 }'" \
156 "$MDSCOUNT" $LTIME || {
157 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
158 error "($err) some MDTs are not in ${status}"
# Test 0 body: drive the namespace LFSCK by hand (start with reset, stop,
# restart) and check the reported status after each step. The scan is held
# back with fail_loc 0x1600 / fail_val 3 while the statuses are sampled.
165 #define OBD_FAIL_LFSCK_DELAY1 0x1600
166 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
167 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
169 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
171 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
172 [ "$STATUS" == "scanning-phase1" ] ||
173 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
175 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
177 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
178 [ "$STATUS" == "stopped" ] ||
179 error "(6) Expect 'stopped', but got '$STATUS'"
# Restart without -r, i.e. without requesting a reset.
181 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
183 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
184 [ "$STATUS" == "scanning-phase1" ] ||
185 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the injected delay and wait (up to 32) for the status "completed".
187 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
188 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
189 mdd.${MDT_DEV}.lfsck_namespace |
190 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
192 error "(9) unexpected status"
195 local repaired=$($SHOW_NAMESPACE |
196 awk '/^updated_phase1/ { print $2 }')
197 [ $repaired -eq 0 ] ||
198 error "(10) Expect nothing to be repaired, but got: $repaired"
# A further reset run that completes must raise success_count by exactly one.
200 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
201 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
202 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
203 mdd.${MDT_DEV}.lfsck_namespace |
204 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
206 error "(12) unexpected status"
209 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
210 [ $((scanned1 + 1)) -eq $scanned2 ] ||
211 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
213 echo "stopall, should NOT crash LU-3649"
214 stopall || error "(14) Fail to stopall"
216 run_test 0 "Control LFSCK manually"
# Test 1a body: create "dummy" while fail_loc 0x1501 (OBD_FAIL_FID_INDIR) is
# set, run a reset namespace LFSCK to completion, and require exactly one
# repair to be reported.
221 #define OBD_FAIL_FID_INDIR 0x1501
222 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
223 touch $DIR/$tdir/dummy
225 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
227 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
228 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
229 mdd.${MDT_DEV}.lfsck_namespace |
230 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
232 error "(4) unexpected status"
# Prefer the dirent_repaired counter; fall back to updated_phase1 when the
# server does not report it.
235 local repaired=$($SHOW_NAMESPACE |
236 awk '/^dirent_repaired/ { print $2 }')
237 # for interop with old server
238 [ -z "$repaired" ] &&
239 repaired=$($SHOW_NAMESPACE |
240 awk '/^updated_phase1/ { print $2 }')
242 [ $repaired -eq 1 ] ||
243 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
247 mount_client $MOUNT || error "(6) Fail to start client!"
# Listing the directory must succeed while fail_loc 0x1505 is active.
249 #define OBD_FAIL_FID_LOOKUP 0x1505
250 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
251 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
253 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
255 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
# Test 1b body (skipped unless the MDT backend is ldiskfs): create "dummy"
# under fail_loc 0x1502 (OBD_FAIL_FID_INLMA), run a reset namespace LFSCK
# with fail_loc 0x1506 (OBD_FAIL_FID_NOLMA) set, and require exactly one
# repair to be reported.
259 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
260 skip "OI Scrub not implemented for ZFS" && return
264 #define OBD_FAIL_FID_INLMA 0x1502
265 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
266 touch $DIR/$tdir/dummy
268 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
270 #define OBD_FAIL_FID_NOLMA 0x1506
271 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
272 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
273 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
274 mdd.${MDT_DEV}.lfsck_namespace |
275 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
277 error "(4) unexpected status"
# Prefer the dirent_repaired counter; fall back to updated_phase1 when the
# server does not report it.
280 local repaired=$($SHOW_NAMESPACE |
281 awk '/^dirent_repaired/ { print $2 }')
282 # for interop with old server
283 [ -z "$repaired" ] &&
284 repaired=$($SHOW_NAMESPACE |
285 awk '/^updated_phase1/ { print $2 }')
287 [ $repaired -eq 1 ] ||
288 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
290 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
293 mount_client $MOUNT || error "(6) Fail to start client!"
# stat of the file must succeed while fail_loc 0x1505 is active.
295 #define OBD_FAIL_FID_LOOKUP 0x1505
296 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
297 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
299 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
301 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
# Test 1c body: create "dummy" while fail_loc 0x1504 (OBD_FAIL_FID_IGIF) is
# set, run a reset namespace LFSCK to completion, and require exactly one
# repair to be reported.
306 #define OBD_FAIL_FID_IGIF 0x1504
307 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
308 touch $DIR/$tdir/dummy
310 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
312 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
313 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
314 mdd.${MDT_DEV}.lfsck_namespace |
315 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
317 error "(4) unexpected status"
# Prefer the dirent_repaired counter; fall back to updated_phase1 when the
# server does not report it.
320 local repaired=$($SHOW_NAMESPACE |
321 awk '/^dirent_repaired/ { print $2 }')
322 # for interop with old server
323 [ -z "$repaired" ] &&
324 repaired=$($SHOW_NAMESPACE |
325 awk '/^updated_phase1/ { print $2 }')
327 [ $repaired -eq 1 ] ||
328 error "(5) Fail to repair lost FID-in-dirent: $repaired"
332 mount_client $MOUNT || error "(6) Fail to start client!"
# Listing the directory must succeed while fail_loc 0x1505 is active.
334 #define OBD_FAIL_FID_LOOKUP 0x1505
335 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
336 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
338 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
340 run_test 1c "LFSCK can find out and repair lost FID-in-dirent"
# Test 2a body: create "dummy" while fail_loc 0x1603
# (OBD_FAIL_LFSCK_LINKEA_CRASH) is set, run a reset namespace LFSCK to
# completion, and require exactly one linkEA repair to be reported.
345 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
346 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
347 touch $DIR/$tdir/dummy
349 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
351 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
352 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
353 mdd.${MDT_DEV}.lfsck_namespace |
354 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
356 error "(4) unexpected status"
# Prefer the linkea_repaired counter; fall back to updated_phase2 when the
# server does not report it.
359 local repaired=$($SHOW_NAMESPACE |
360 awk '/^linkea_repaired/ { print $2 }')
361 # for interop with old server
362 [ -z "$repaired" ] &&
363 repaired=$($SHOW_NAMESPACE |
364 awk '/^updated_phase2/ { print $2 }')
366 [ $repaired -eq 1 ] ||
367 error "(5) Fail to repair crashed linkEA: $repaired"
371 mount_client $MOUNT || error "(6) Fail to start client!"
373 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
374 error "(7) Fail to stat $DIR/$tdir/dummy"
# Round trip: path -> FID -> path must give back the original path.
376 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
377 local dummyname=$($LFS fid2path $DIR $dummyfid)
378 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
379 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
381 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
# Test 2b body: create "dummy" while fail_loc 0x1604
# (OBD_FAIL_LFSCK_LINKEA_MORE) is set, run a reset namespace LFSCK to
# completion, and require updated_phase2 to be exactly 1.
387 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
388 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
389 touch $DIR/$tdir/dummy
391 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
393 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
394 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
395 mdd.${MDT_DEV}.lfsck_namespace |
396 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
398 error "(4) unexpected status"
401 local repaired=$($SHOW_NAMESPACE |
402 awk '/^updated_phase2/ { print $2 }')
403 [ $repaired -eq 1 ] ||
404 error "(5) Fail to repair crashed linkEA: $repaired"
408 mount_client $MOUNT || error "(6) Fail to start client!"
410 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
411 error "(7) Fail to stat $DIR/$tdir/dummy"
# Round trip: path -> FID -> path must give back the original path.
413 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
414 local dummyname=$($LFS fid2path $DIR $dummyfid)
415 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
416 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
418 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
# Test 2c body: create "dummy" while fail_loc 0x1605
# (OBD_FAIL_LFSCK_LINKEA_MORE2) is set, run a reset namespace LFSCK to
# completion, and require updated_phase2 to be exactly 1.
424 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
425 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
426 touch $DIR/$tdir/dummy
428 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
430 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
431 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
432 mdd.${MDT_DEV}.lfsck_namespace |
433 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
435 error "(4) unexpected status"
438 local repaired=$($SHOW_NAMESPACE |
439 awk '/^updated_phase2/ { print $2 }')
440 [ $repaired -eq 1 ] ||
441 error "(5) Fail to repair crashed linkEA: $repaired"
445 mount_client $MOUNT || error "(6) Fail to start client!"
447 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
448 error "(7) Fail to stat $DIR/$tdir/dummy"
# Round trip: path -> FID -> path must give back the original path.
450 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
451 local dummyname=$($LFS fid2path $DIR $dummyfid)
452 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
453 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
455 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
# Test 2d body: create "dummy" while fail_loc 0x161d
# (OBD_FAIL_LFSCK_NO_LINKEA) is set, run a reset namespace LFSCK to
# completion, and require linkea_repaired to be exactly 1.
461 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
462 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
463 touch $DIR/$tdir/dummy
465 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
467 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
468 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
469 mdd.${MDT_DEV}.lfsck_namespace |
470 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
472 error "(4) unexpected status"
475 local repaired=$($SHOW_NAMESPACE |
476 awk '/^linkea_repaired/ { print $2 }')
477 [ $repaired -eq 1 ] ||
478 error "(5) Fail to repair crashed linkEA: $repaired"
482 mount_client $MOUNT || error "(6) Fail to start client!"
484 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
485 error "(7) Fail to stat $DIR/$tdir/dummy"
# Round trip: path -> FID -> path must give back the original path.
487 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
488 local dummyname=$($LFS fid2path $DIR $dummyfid)
489 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
490 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
492 run_test 2d "LFSCK can recover the missing linkEA entry"
# Test 2e body (needs at least two MDTs): make d0 with "-i 1" and, under
# fail_loc 0x1603 (OBD_FAIL_LFSCK_LINKEA_CRASH), d0/d1 with "-i 0"; then run
# namespace LFSCK with "-r -A" and require linkea_repaired to be exactly 1.
496 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
500 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
502 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
503 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
504 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
505 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
507 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
# Require every MDT to report "completed"; 4 is the error tag used on failure.
509 wait_all_targets_blocked namespace completed 4
511 local repaired=$($SHOW_NAMESPACE |
512 awk '/^linkea_repaired/ { print $2 }')
513 [ $repaired -eq 1 ] ||
514 error "(5) Fail to repair crashed linkEA: $repaired"
# Round trip: path -> FID -> path must give back the original path.
516 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
517 local name=$($LFS fid2path $DIR $fid)
518 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
519 error "(6) Fail to repair linkEA: $fid $name"
521 run_test 2e "namespace LFSCK can verify remote object linkEA"
# Test 3 body: build hard-linked files, two of them linked while fail_loc
# 0x1603 and 0x1604 are set, then run a reset namespace LFSCK and require
# checked_phase2 >= 4 and multiple_linked_repaired >= 2.
527 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
528 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
529 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
531 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
532 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
533 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
# The next two hard links are each created under a different fail_loc.
535 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
536 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
537 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
539 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
540 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
541 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
543 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
545 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
546 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
547 mdd.${MDT_DEV}.lfsck_namespace |
548 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
550 error "(10) unexpected status"
553 local checked=$($SHOW_NAMESPACE |
554 awk '/^checked_phase2/ { print $2 }')
555 [ $checked -ge 4 ] ||
556 error "(11) Fail to check multiple-linked object: $checked"
558 local repaired=$($SHOW_NAMESPACE |
559 awk '/^multiple_linked_repaired/ { print $2 }')
560 [ $repaired -ge 2 ] ||
561 error "(12) Fail to repair multiple-linked object: $repaired"
563 run_test 3 "LFSCK can verify multiple-linked objects"
# Test 4 body (skipped unless the MDT backend is ldiskfs): back up and restore
# the MDT, start it with noscrub, run a reset namespace LFSCK and check that
# the flags go from "inconsistent" to empty with at least 9 repairs reported.
567 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
568 skip "OI Scrub not implemented for ZFS" && return
571 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
572 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
574 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
575 echo "start $SINGLEMDS with disabling OI scrub"
576 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
577 error "(2) Fail to start MDS!"
# fail_loc 0x1601 / fail_val 1 is set before the flags and status are sampled.
579 #define OBD_FAIL_LFSCK_DELAY2 0x1601
580 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
581 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
582 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
583 mdd.${MDT_DEV}.lfsck_namespace |
584 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
586 error "(5) unexpected status"
589 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
590 [ "$STATUS" == "scanning-phase1" ] ||
591 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
593 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
594 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
595 mdd.${MDT_DEV}.lfsck_namespace |
596 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
598 error "(7) unexpected status"
# NOTE(review): FLAGS is assigned without "local" here.
601 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
602 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
604 local repaired=$($SHOW_NAMESPACE |
605 awk '/^dirent_repaired/ { print $2 }')
606 # for interop with old server
607 [ -z "$repaired" ] &&
608 repaired=$($SHOW_NAMESPACE |
609 awk '/^updated_phase1/ { print $2 }')
611 [ $repaired -ge 9 ] ||
612 error "(9) Fail to re-generate FID-in-dirent: $repaired"
616 mount_client $MOUNT || error "(10) Fail to start client!"
# Listing the directory must succeed while fail_loc 0x1505 is active.
618 #define OBD_FAIL_FID_LOOKUP 0x1505
619 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
620 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
621 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
623 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
# Test 5 body (skipped unless the MDT backend is ldiskfs): back up and restore
# the MDT (mds_backup_restore is called with an extra "1" argument), start it
# with noscrub, run a reset namespace LFSCK and check that the flags go from
# "inconsistent,upgrade" to empty with at least 2 repairs reported.
627 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
628 skip "OI Scrub not implemented for ZFS" && return
631 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
632 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
634 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
635 echo "start $SINGLEMDS with disabling OI scrub"
636 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
637 error "(2) Fail to start MDS!"
# fail_loc 0x1601 / fail_val 1 is set before the flags and status are sampled.
639 #define OBD_FAIL_LFSCK_DELAY2 0x1601
640 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
641 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
642 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
643 mdd.${MDT_DEV}.lfsck_namespace |
644 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
646 error "(5) unexpected status"
649 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
650 [ "$STATUS" == "scanning-phase1" ] ||
651 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
653 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
654 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
655 mdd.${MDT_DEV}.lfsck_namespace |
656 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
658 error "(7) unexpected status"
# NOTE(review): FLAGS is assigned without "local" here.
661 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
662 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
664 local repaired=$($SHOW_NAMESPACE |
665 awk '/^dirent_repaired/ { print $2 }')
666 # for interop with old server
667 [ -z "$repaired" ] &&
668 repaired=$($SHOW_NAMESPACE |
669 awk '/^updated_phase1/ { print $2 }')
671 [ $repaired -ge 2 ] ||
672 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
676 mount_client $MOUNT || error "(10) Fail to start client!"
# Both stat of the file and listing of the directory must succeed while
# fail_loc 0x1505 is active.
678 #define OBD_FAIL_FID_LOOKUP 0x1505
679 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
680 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
682 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
684 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Round trip: path -> FID -> path must give back the original path.
685 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
686 local dummyname=$($LFS fid2path $DIR $dummyfid)
687 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
688 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
690 run_test 5 "LFSCK can handle IGIF object upgrading"
# Test 6a body: start a reset namespace LFSCK under fail_loc 0x1600, force it
# to "failed" with fail_loc 0x80001608, restart it without -r and require
# latest_start_position (POS1) to be greater than the previously recorded
# last_checkpoint_position (POS0).
695 #define OBD_FAIL_LFSCK_DELAY1 0x1600
696 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
697 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
699 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
700 [ "$STATUS" == "scanning-phase1" ] ||
701 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
703 # Sleep 3 sec to guarantee at least one object processed by LFSCK
705 # Fail the LFSCK to guarantee there is at least one checkpoint
706 #define OBD_FAIL_LFSCK_FATAL1 0x1608
707 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
708 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
709 mdd.${MDT_DEV}.lfsck_namespace |
710 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
712 error "(4) unexpected status"
715 local POS0=$($SHOW_NAMESPACE |
716 awk '/^last_checkpoint_position/ { print $2 }' |
719 #define OBD_FAIL_LFSCK_DELAY1 0x1600
720 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
721 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
723 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
724 [ "$STATUS" == "scanning-phase1" ] ||
725 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
727 local POS1=$($SHOW_NAMESPACE |
728 awk '/^latest_start_position/ { print $2 }' |
730 [[ $POS0 -lt $POS1 ]] ||
731 error "(7) Expect larger than: $POS0, but got $POS1"
# Clear the fault injection and wait (up to 32) for the status "completed".
733 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
734 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
735 mdd.${MDT_DEV}.lfsck_namespace |
736 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
738 error "(8) unexpected status"
741 run_test 6a "LFSCK resumes from last checkpoint (1)"
# Test 6b body: as 6a, but with fail_loc 0x1601 for the delay and 0x80001609
# for the forced failure. Field 2 (O_POS*) and field 4 (D_POS*) of the
# position lines are read; field 4 is compared unless either value is "N/A",
# in which case field 2 is compared instead.
746 #define OBD_FAIL_LFSCK_DELAY2 0x1601
747 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
748 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
750 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
751 [ "$STATUS" == "scanning-phase1" ] ||
752 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
754 # Sleep 5 sec to guarantee that we are in the directory scanning
756 # Fail the LFSCK to guarantee there is at least one checkpoint
757 #define OBD_FAIL_LFSCK_FATAL2 0x1609
758 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
759 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
760 mdd.${MDT_DEV}.lfsck_namespace |
761 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
763 error "(4) unexpected status"
766 local O_POS0=$($SHOW_NAMESPACE |
767 awk '/^last_checkpoint_position/ { print $2 }' |
770 local D_POS0=$($SHOW_NAMESPACE |
771 awk '/^last_checkpoint_position/ { print $4 }')
773 #define OBD_FAIL_LFSCK_DELAY2 0x1601
774 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
775 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
777 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
778 [ "$STATUS" == "scanning-phase1" ] ||
779 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
781 local O_POS1=$($SHOW_NAMESPACE |
782 awk '/^latest_start_position/ { print $2 }' |
784 local D_POS1=$($SHOW_NAMESPACE |
785 awk '/^latest_start_position/ { print $4 }')
787 echo "Additional debug for 6b"
789 if [ "$D_POS0" == "N/A" -o "$D_POS1" == "N/A" ]; then
790 [[ $O_POS0 -lt $O_POS1 ]] ||
791 error "(7.1) $O_POS1 is not larger than $O_POS0"
793 [[ $D_POS0 -lt $D_POS1 ]] ||
794 error "(7.2) $D_POS1 is not larger than $D_POS0"
# Clear the fault injection and wait (up to 32) for the status "completed".
797 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
798 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
799 mdd.${MDT_DEV}.lfsck_namespace |
800 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
802 error "(8) unexpected status"
805 run_test 6b "LFSCK resumes from last checkpoint (2)"
# Test 7a body: start a reset namespace LFSCK under fail_loc 0x1601, confirm
# it is in scanning-phase1, stop and restart the MDS, and require the LFSCK
# status to reach "completed" (timeout 30) without an explicit restart.
812 #define OBD_FAIL_LFSCK_DELAY2 0x1601
813 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
814 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
816 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
817 [ "$STATUS" == "scanning-phase1" ] ||
818 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
820 # Sleep 3 sec to guarantee at least one object processed by LFSCK
822 echo "stop $SINGLEMDS"
823 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop MDS!"
# The fault injection is cleared while the MDS is stopped, before remount.
825 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
826 echo "start $SINGLEMDS"
827 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
828 error "(5) Fail to start MDS!"
830 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
831 mdd.${MDT_DEV}.lfsck_namespace |
832 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
834 error "(6) unexpected status"
837 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
# Test 7b body: create 20 files under fail_loc 0x1604, start a reset
# namespace LFSCK under fail_loc 0x1602 and wait for scanning-phase2; then
# stop and restart the MDS and require the status to reach "completed"
# (timeout 30) without an explicit restart.
843 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
844 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
845 for ((i = 0; i < 20; i++)); do
846 touch $DIR/$tdir/dummy${i}
849 #define OBD_FAIL_LFSCK_DELAY3 0x1602
850 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
851 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
852 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
853 mdd.${MDT_DEV}.lfsck_namespace |
854 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
856 error "(4) unexpected status"
860 echo "stop $SINGLEMDS"
861 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop MDS!"
# The fault injection is cleared while the MDS is stopped, before remount.
863 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
864 echo "start $SINGLEMDS"
865 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
866 error "(6) Fail to start MDS!"
868 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
869 mdd.${MDT_DEV}.lfsck_namespace |
870 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
872 error "(7) unexpected status"
875 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
# Test 8 body: walk the namespace LFSCK through its states and check the
# reported status after each transition. Expected sequence, as asserted below:
# init -> scanning-phase1 -> stopped -> scanning-phase1 -> failed ->
# scanning-phase1 -> crashed (after MDS restart) -> scanning-phase1 ->
# paused (after MDS restart) -> paused (restart with skip_lfsck) ->
# scanning-phase2 -> completed.
880 formatall > /dev/null
886 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
887 [ "$STATUS" == "init" ] ||
888 error "(2) Expect 'init', but got '$STATUS'"
# Create one directory and five files, each group under its own fail_loc.
890 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
891 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
892 mkdir $DIR/$tdir/crashed
894 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
895 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
896 for ((i = 0; i < 5; i++)); do
897 touch $DIR/$tdir/dummy${i}
900 umount_client $MOUNT || error "(3) Fail to stop client!"
902 #define OBD_FAIL_LFSCK_DELAY2 0x1601
903 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
904 $START_NAMESPACE || error "(4) Fail to start LFSCK for namespace!"
906 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
907 [ "$STATUS" == "scanning-phase1" ] ||
908 error "(5) Expect 'scanning-phase1', but got '$STATUS'"
910 $STOP_LFSCK || error "(6) Fail to stop LFSCK!"
912 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
913 [ "$STATUS" == "stopped" ] ||
914 error "(7) Expect 'stopped', but got '$STATUS'"
916 $START_NAMESPACE || error "(8) Fail to start LFSCK for namespace!"
918 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
919 [ "$STATUS" == "scanning-phase1" ] ||
920 error "(9) Expect 'scanning-phase1', but got '$STATUS'"
# Force the running scan into the "failed" state.
922 #define OBD_FAIL_LFSCK_FATAL2 0x1609
923 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
924 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
925 mdd.${MDT_DEV}.lfsck_namespace |
926 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
928 error "(10) unexpected status"
931 #define OBD_FAIL_LFSCK_DELAY1 0x1600
932 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
933 $START_NAMESPACE || error "(11) Fail to start LFSCK for namespace!"
935 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
936 [ "$STATUS" == "scanning-phase1" ] ||
937 error "(12) Expect 'scanning-phase1', but got '$STATUS'"
# Stop the MDS with fail_loc 0x160a set, then restart it with 0x160b set;
# the status read after recovery is expected to be "crashed".
939 #define OBD_FAIL_LFSCK_CRASH 0x160a
940 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
943 echo "stop $SINGLEMDS"
944 stop $SINGLEMDS > /dev/null || error "(13) Fail to stop MDS!"
946 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
947 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
949 echo "start $SINGLEMDS"
950 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
951 error "(14) Fail to start MDS!"
# Poll recovery_status until it is no longer RECOVERING, bounded by the value
# returned by max_recovery_time.
953 local timeout=$(max_recovery_time)
956 while [ $timer -lt $timeout ]; do
957 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
958 mdt.${MDT_DEV}.recovery_status |
959 awk '/^status/ { print \\\$2 }'")
960 [ "$STATUS" != "RECOVERING" ] && break;
965 [ $timer != $timeout ] ||
966 error "(14.1) recovery timeout"
968 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
969 [ "$STATUS" == "crashed" ] ||
970 error "(15) Expect 'crashed', but got '$STATUS'"
972 #define OBD_FAIL_LFSCK_DELAY2 0x1601
973 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
974 $START_NAMESPACE || error "(16) Fail to start LFSCK for namespace!"
976 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
977 [ "$STATUS" == "scanning-phase1" ] ||
978 error "(17) Expect 'scanning-phase1', but got '$STATUS'"
# Restart the MDS again with fail_loc 0x160b set; this time the status read
# after recovery is expected to be "paused".
980 echo "stop $SINGLEMDS"
981 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop MDS!"
983 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
984 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
986 echo "start $SINGLEMDS"
987 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
988 error "(19) Fail to start MDS!"
991 while [ $timer -lt $timeout ]; do
992 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
993 mdt.${MDT_DEV}.recovery_status |
994 awk '/^status/ { print \\\$2 }'")
995 [ "$STATUS" != "RECOVERING" ] && break;
1000 [ $timer != $timeout ] ||
1001 error "(19.1) recovery timeout"
1003 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1004 [ "$STATUS" == "paused" ] ||
1005 error "(20) Expect 'paused', but got '$STATUS'"
# Remount with the skip_lfsck option; the status must still read "paused".
1007 echo "stop $SINGLEMDS"
1008 stop $SINGLEMDS > /dev/null || error "(20.1) Fail to stop MDS!"
1010 echo "start $SINGLEMDS without resume LFSCK"
1011 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SKIP_LFSCK > /dev/null ||
1012 error "(20.2) Fail to start MDS!"
1015 while [ $timer -lt $timeout ]; do
1016 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1017 mdt.${MDT_DEV}.recovery_status |
1018 awk '/^status/ { print \\\$2 }'")
1019 [ "$STATUS" != "RECOVERING" ] && break;
1021 timer=$((timer + 1))
1024 [ $timer != $timeout ] ||
1025 error "(20.3) recovery timeout"
1027 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1028 [ "$STATUS" == "paused" ] ||
1029 error "(20.4) Expect 'paused', but got '$STATUS'"
# Resume by hand under fail_loc 0x1602 and check the flags in scanning-phase2.
1031 #define OBD_FAIL_LFSCK_DELAY3 0x1602
1032 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
1034 $START_NAMESPACE || error "(21) Fail to start LFSCK for namespace!"
1035 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1036 mdd.${MDT_DEV}.lfsck_namespace |
1037 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
1039 error "(22) unexpected status"
1042 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1043 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
1044 error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
# Clear the fault injection; the scan must complete and leave empty flags.
1046 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1047 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1048 mdd.${MDT_DEV}.lfsck_namespace |
1049 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1051 error "(24) unexpected status"
1054 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1055 [ -z "$FLAGS" ] || error "(25) Expect empty flags, but got '$FLAGS'"
1057 run_test 8 "LFSCK state machine"
# Test 9a body: LFSCK speed control for the layout scan.
# Creates 5000 files, starts the layout LFSCK with -s $BASE_SPEED1, and
# compares average_speed_phase1 with bounds computed from the configured
# limits, first under BASE_SPEED1 and again after switching to BASE_SPEED2.
# NOTE(review): RUN_TIME1, RUN_TIME2 and TIME_DIFF are used below but are not
# assigned in the lines visible here -- confirm where they are set.
# Skip when /proc/cpuinfo has no line matching processor.*: 1
1060 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1061 skip "Testing on UP system, the speed may be inaccurate."
# Populate a directory on MDT index 0 with single-stripe files.
1065 check_mount_and_prep
1066 $LFS mkdir -i 0 $DIR/$tdir/lfsck || error "(1) Fail to mkdir lfsck"
1067 $LFS setstripe -c 1 -i -1 $DIR/$tdir/lfsck
1068 createmany -o $DIR/$tdir/lfsck/f 5000
1070 local BASE_SPEED1=100
# Start the layout LFSCK with the first speed limit.
1072 $START_LAYOUT -r -s $BASE_SPEED1 || error "(2) Fail to start LFSCK!"
# The scan must still be in phase 1 when the speed is sampled.
1075 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
1076 [ "$STATUS" == "scanning-phase1" ] ||
1077 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
1079 local SPEED=$($SHOW_LAYOUT |
1080 awk '/^average_speed_phase1/ { print $2 }')
1082 # There may be a timing error; normally it should be less than 2 seconds.
1083 # We allow another 20% scheduling error.
1085 # MAX_MARGIN = 1.3 = 13 / 10
1086 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1087 RUN_TIME1 * 13 / 10))
1088 [ $SPEED -lt $MAX_SPEED ] || {
1090 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1091 error "(4) Speed $SPEED, expected < $MAX_SPEED"
1094 # adjust speed limit
1095 local BASE_SPEED2=300
1097 do_facet $SINGLEMDS \
1098 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
# Re-sample the average speed; it is now checked against both a lower and
# an upper bound that combine the two limits over the two run times.
1101 SPEED=$($SHOW_LAYOUT | awk '/^average_speed_phase1/ { print $2 }')
1102 # MIN_MARGIN = 0.7 = 7 / 10
1103 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1104 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1105 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
1106 [ $SPEED -gt $MIN_SPEED ] || {
1107 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1108 error_ignore LU-5624 \
1109 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
1112 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
1116 # MAX_MARGIN = 1.3 = 13 / 10
1117 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1118 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1119 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1120 [ $SPEED -lt $MAX_SPEED ] || {
1122 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1123 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1124 error "(6) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit to 0 on all MDT and OST nodes, then wait for the
# layout LFSCK on the MDT to report completed.
1127 do_nodes $(comma_list $(mdts_nodes)) \
1128 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1129 do_nodes $(comma_list $(osts_nodes)) \
1130 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1132 wait_update_facet $SINGLEMDS \
1133 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout |
1134 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
1135 error "(7) Failed to get expected 'completed'"
1137 run_test 9a "LFSCK speed control (1)"
# Test 9b body: LFSCK speed control for the namespace scan, phase 2.
# Files are created under fail_loc 0x1604, a first namespace LFSCK run is
# made under fail_loc 0x160c and is expected to end up stopped, and then a
# rate-limited run is started whose average_speed_phase2 is checked against
# bounds computed from BASE_SPEED1 and BASE_SPEED2.
# NOTE(review): RUN_TIME1, RUN_TIME2 and TIME_DIFF are used below but are not
# assigned in the lines visible here -- confirm where they are set.
1140 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1141 skip "Testing on UP system, the speed may be inaccurate."
1147 echo "Preparing another 50 * 50 files (with error) at $(date)."
1148 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1149 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1150 createmany -d $DIR/$tdir/d 50
1151 createmany -m $DIR/$tdir/f 50
1152 for ((i = 0; i < 50; i++)); do
1153 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
# First pass: run the namespace LFSCK with the NO_DOUBLESCAN fail_loc set
# and wait for the status to become stopped.
1156 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
1157 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1158 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
1159 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1160 mdd.${MDT_DEV}.lfsck_namespace |
1161 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1163 error "(5) unexpected status"
1166 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1167 echo "Prepared at $(date)."
1169 local BASE_SPEED1=50
# Second pass: start again (without -r) at the first speed limit; the test
# expects the scan to be in phase 2 when it samples the speed.
1171 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
1174 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1175 [ "$STATUS" == "scanning-phase2" ] ||
1176 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
1178 local SPEED=$($SHOW_NAMESPACE |
1179 awk '/^average_speed_phase2/ { print $2 }')
1180 # There may be a timing error; normally it should be less than 2 seconds.
1181 # We allow another 20% scheduling error.
1183 # MAX_MARGIN = 1.3 = 13 / 10
1184 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1185 RUN_TIME1 * 13 / 10))
1186 [ $SPEED -lt $MAX_SPEED ] || {
1188 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1189 error "(8) Speed $SPEED, expected < $MAX_SPEED"
1192 # adjust speed limit
1193 local BASE_SPEED2=150
1195 do_facet $SINGLEMDS \
1196 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1199 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
1200 # MIN_MARGIN = 0.7 = 7 / 10
1201 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1202 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1203 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
1204 [ $SPEED -gt $MIN_SPEED ] || {
1205 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1206 error_ignore LU-5624 \
1207 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1210 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
1214 # MAX_MARGIN = 1.3 = 13 / 10
1215 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1216 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1217 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1218 [ $SPEED -lt $MAX_SPEED ] || {
1220 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1221 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1222 error "(10) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit to 0 everywhere and wait for the namespace LFSCK to
# report completed.
1225 do_nodes $(comma_list $(mdts_nodes)) \
1226 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1227 do_nodes $(comma_list $(osts_nodes)) \
1228 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1229 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1230 mdd.${MDT_DEV}.lfsck_namespace |
1231 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1233 error "(11) unexpected status"
1236 run_test 9b "LFSCK speed control (2)"
# Test 10 body: the filesystem must stay usable while the namespace LFSCK
# is scanning. Files are created under two different fail_loc values, a
# rate-limited namespace LFSCK is started, and ordinary client operations
# (ls, touch, mkdir, unlink, rm, mv, ln, ln -s) are run while the status is
# still scanning-phase1.
# Only runs when the MDS backend is ldiskfs.
1240 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
1241 skip "lookup(..)/linkea on ZFS issue" && return
1245 echo "Preparing more files with error at $(date)."
# Even-numbered entries are created with fail_loc 0x1603 set.
1246 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1247 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
1249 for ((i = 0; i < 1000; i = $((i+2)))); do
1250 mkdir -p $DIR/$tdir/d${i}
1251 touch $DIR/$tdir/f${i}
1252 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
# Odd-numbered entries are created with fail_loc 0x1604 set.
1255 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1256 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1258 for ((i = 1; i < 1000; i = $((i+2)))); do
1259 mkdir -p $DIR/$tdir/d${i}
1260 touch $DIR/$tdir/f${i}
1261 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1264 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1265 echo "Prepared at $(date)."
1267 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
# Remount the client before starting the scan.
1269 umount_client $MOUNT
1270 mount_client $MOUNT || error "(3) Fail to start client!"
# Start the namespace LFSCK with -s 100 and confirm it is in phase 1.
1272 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1275 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1276 [ "$STATUS" == "scanning-phase1" ] ||
1277 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Client operations issued while the scan is running; each must succeed.
1279 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1281 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1283 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1285 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1287 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1289 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1291 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1293 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1294 error "(14) Fail to softlink!"
# The scan must still be in phase 1 after those operations.
1296 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1297 [ "$STATUS" == "scanning-phase1" ] ||
1298 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
# Set lfsck_speed_limit to 0 everywhere and wait for completed.
1300 do_nodes $(comma_list $(mdts_nodes)) \
1301 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1302 do_nodes $(comma_list $(osts_nodes)) \
1303 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1304 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1305 mdd.${MDT_DEV}.lfsck_namespace |
1306 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1308 error "(16) unexpected status"
1311 run_test 10 "System is available during LFSCK scanning"
# ost_remove_lastid: mount an OST backend locally, delete the LAST_ID file
# and the d0/0 entry under O/<idx>/, then unmount it again.
# Returns 1 if the local mount fails and 2 if the unmount fails.
# NOTE(review): the assignments of ost and idx are not among the visible
# lines; the caller in this file passes two positional arguments -- confirm
# that they map to ost and idx in that order.
1314 ost_remove_lastid() {
1317 local rcmd="do_facet ost${ost}"
1319 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1321 # step 1: local mount
1322 mount_fstype ost${ost} || return 1
1323 # step 2: remove the specified LAST_ID
1324 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
1326 unmount_fstype ost${ost} || return 2
# Test 11a body: the layout LFSCK on an OST can rebuild a lost LAST_ID.
# After creating 64 files on OST index 0, the LAST_ID is removed with
# ost_remove_lastid, ost1 is started again, and the layout LFSCK is run on
# the OST. The lfsck_layout flags are expected to show crashed_lastid while
# the fail_loc delay is active and to be empty once the scan has completed.
# NOTE(review): the step that stops ost1 before the removal is not among the
# visible lines.
1330 check_mount_and_prep
1331 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1332 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
1337 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
1339 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1340 error "(2) Fail to start ost1"
1342 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1343 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1345 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1346 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
# While the fail_loc is set, the flags field should read crashed_lastid.
1348 wait_update_facet ost1 "$LCTL get_param -n \
1349 obdfilter.${OST_DEV}.lfsck_layout |
1350 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1352 error "(5) unexpected status"
# Clear the fail_loc and wait for the scan to finish.
1355 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1357 wait_update_facet ost1 "$LCTL get_param -n \
1358 obdfilter.${OST_DEV}.lfsck_layout |
1359 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1361 error "(6) unexpected status"
1364 echo "the LAST_ID(s) should have been rebuilt"
1365 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1366 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
1368 run_test 11a "LFSCK can rebuild lost last_id"
# Test 11b body: the layout LFSCK on an OST can rebuild a stale on-disk
# LAST_ID. Files are created while fail_loc 0x160d is set on ost1, the
# in-memory last_id (lastid1) is recorded, ost1 is restarted, and the value
# read back (lastid2) is expected to be smaller. After the layout LFSCK has
# completed and ost1 has been restarted once more, last_id for the same
# sequence is expected to equal lastid1.
1371 check_mount_and_prep
1372 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1374 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1375 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1376 do_facet ost1 $LCTL set_param fail_loc=0x160d
# Create 32 more files than the count reported by precreated_ost_obj_count.
1378 local count=$(precreated_ost_obj_count 0 0)
1380 createmany -o $DIR/$tdir/f $((count + 32))
# Record the sequence in use and the matching last_id entry on ost1.
1382 local proc_path="${FSNAME}-OST0000-osc-MDT0000"
1383 local seq=$(do_facet mds1 $LCTL get_param -n \
1384 osp.${proc_path}.prealloc_last_seq)
1385 local lastid1=$(do_facet ost1 "lctl get_param -n \
1386 obdfilter.${ost1_svc}.last_id" | grep $seq |
1387 awk -F: '{ print $2 }')
1389 umount_client $MOUNT
1390 stop ost1 || error "(1) Fail to stop ost1"
1392 #define OBD_FAIL_OST_ENOSPC 0x215
1393 do_facet ost1 $LCTL set_param fail_loc=0x215
1395 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1396 error "(2) Fail to start ost1"
# Retry up to 60 times until last_id for the sequence can be read back.
1398 for ((i = 0; i < 60; i++)); do
1399 lastid2=$(do_facet ost1 "lctl get_param -n \
1400 obdfilter.${ost1_svc}.last_id" | grep $seq |
1401 awk -F: '{ print $2 }')
1402 [ ! -z $lastid2 ] && break;
1406 echo "the on-disk LAST_ID should be smaller than the expected one"
1407 [ $lastid1 -gt $lastid2 ] ||
1408 error "(4) expect lastid1 [ $lastid1 ] > lastid2 [ $lastid2 ]"
1410 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1411 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1413 wait_update_facet ost1 "$LCTL get_param -n \
1414 obdfilter.${OST_DEV}.lfsck_layout |
1415 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1417 error "(6) unexpected status"
# Restart ost1 so that last_id is read again after the LFSCK run.
1420 stop ost1 || error "(7) Fail to stop ost1"
1422 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1423 error "(8) Fail to start ost1"
1425 echo "the on-disk LAST_ID should have been rebuilt"
1426 wait_update_facet ost1 "$LCTL get_param -n \
1427 obdfilter.${ost1_svc}.last_id | grep $seq |
1428 awk -F: '{ print \\\$2 }'" "$lastid1" 60 || {
1429 do_facet ost1 $LCTL get_param -n \
1430 obdfilter.${ost1_svc}.last_id
1431 error "(9) expect lastid1 $seq:$lastid1"
1434 do_facet ost1 $LCTL set_param fail_loc=0
1435 stopall || error "(10) Fail to stopall"
1437 run_test 11b "LFSCK can rebuild crashed last_id"
# Test 12a body: one lctl command with -A drives LFSCK on all targets.
# For the namespace type and then the layout type the same cycle is run:
# start with -A -s 1 and expect scanning-phase1 on all targets, stop with
# -A and expect stopped, then restart with -A -s 0 and expect completed.
# Needs at least two MDTs.
1440 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# Create one directory per MDT index and 100 files in each.
1442 check_mount_and_prep
1443 for k in $(seq $MDSCOUNT); do
1444 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1445 createmany -o $DIR/$tdir/${k}/f 100 ||
1446 error "(0) Fail to create 100 files."
1449 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1450 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1451 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1453 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1454 wait_all_targets namespace scanning-phase1 3
1456 echo "Stop namespace LFSCK on all targets by single lctl command."
1457 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1458 error "(4) Fail to stop LFSCK on all devices!"
1460 echo "All the LFSCK targets should be in 'stopped' status."
1461 wait_all_targets_blocked namespace stopped 5
1463 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1464 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1465 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1467 echo "All the LFSCK targets should be in 'completed' status."
1468 wait_all_targets_blocked namespace completed 7
# The layout half of the test runs with full debug logging enabled.
1470 start_full_debug_logging
1472 echo "Start layout LFSCK on all targets by single command (-s 1)."
1473 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1474 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1476 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1477 wait_all_targets layout scanning-phase1 9
1479 echo "Stop layout LFSCK on all targets by single lctl command."
1480 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1481 error "(10) Fail to stop LFSCK on all devices!"
1483 echo "All the LFSCK targets should be in 'stopped' status."
1484 wait_all_targets_blocked layout stopped 11
# Additionally read lfsck_layout on every OST and require status stopped.
1486 for k in $(seq $OSTCOUNT); do
1487 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1488 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1489 awk '/^status/ { print $2 }')
1490 [ "$STATUS" == "stopped" ] ||
1491 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1494 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1495 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1496 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1498 echo "All the LFSCK targets should be in 'completed' status."
1499 wait_all_targets_blocked layout completed 14
1501 stop_full_debug_logging
1503 run_test 12a "single command to trigger LFSCK on all devices"
# Test 12b body: lfsck_start without -M should find the device itself.
# A start with -A -r and no -M must succeed and both LFSCK types must reach
# completed. If the lctl device list on mds1 shows more than one entry whose
# third column matches mdt, a start with neither -M nor -A must fail.
1506 check_mount_and_prep
1508 echo "Start LFSCK without '-M' specified."
1509 do_facet mds1 $LCTL lfsck_start -A -r ||
1510 error "(0) Fail to start LFSCK without '-M'"
1512 wait_all_targets_blocked namespace completed 1
1513 wait_all_targets_blocked layout completed 2
# Count the entries in the device list on mds1 whose third column has mdt.
1515 local count=$(do_facet mds1 $LCTL dl |
1516 awk '{ print $3 }' | grep mdt | wc -l)
1517 if [ $count -gt 1 ]; then
1519 echo "Start layout LFSCK on the node with multipe targets,"
1520 echo "but not specify '-M'/'-A' option. Should get failure."
# Success of lfsck_start is the error case here; the trailing true keeps a
# failing start from propagating a non-zero status.
1522 do_facet mds1 $LCTL lfsck_start -t layout -r &&
1523 error "(3) Start layout LFSCK should fail" || true
1526 run_test 12b "auto detect Lustre device"
# Test 13 body: the layout LFSCK repairs a bad lmm_oi in the layout EA.
# Two files are created while fail_loc 0x160f is set on the MDS (one plain
# file through createmany and one PFL file through lfs setstripe -E), and the
# layout LFSCK is then expected to report repaired_others equal to 2.
1530 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1531 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1532 echo "MDT-object FID."
1535 check_mount_and_prep
1537 echo "Inject failure stub to simulate bad lmm_oi"
1538 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1539 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
1540 createmany -o $DIR/$tdir/f 1
1541 $LFS setstripe -E 1M -S 1M -E -1 $DIR/$tdir/f1 ||
1542 error "(0) Fail to create PFL $DIR/$tdir/f1"
1543 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1545 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1546 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1548 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1549 mdd.${MDT_DEV}.lfsck_layout |
1550 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1552 error "(2) unexpected status"
# One repair is expected for each of the two files created above.
1555 local repaired=$($SHOW_LAYOUT |
1556 awk '/^repaired_others/ { print $2 }')
1557 [ $repaired -eq 2 ] ||
1558 error "(3) Fail to repair crashed lmm_oi: $repaired"
1560 run_test 13 "LFSCK can repair crashed lmm_oi"
# Test 14a body: the layout LFSCK handles MDT-objects whose LOV EA refers to
# a missing OST-object, without the delay-create-ostobj option.
# Files are created while fail_loc 0x1610 is set on ost1. A first LFSCK run
# (-r) must count at least 32 repaired_dangling while stat on the guard
# files still fails; a second run with -c added must leave the guard files
# stat-able with size 0.
1564 echo "The OST-object referenced by the MDT-object should be there;"
1565 echo "otherwise, the LFSCK should re-create the missing OST-object."
1566 echo "without '--delay-create-ostobj' option."
1569 check_mount_and_prep
1570 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1572 echo "Inject failure stub to simulate dangling referenced MDT-object"
1573 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1574 do_facet ost1 $LCTL set_param fail_loc=0x1610
1575 local count=$(precreated_ost_obj_count 0 0)
1577 createmany -o $DIR/$tdir/f $((count + 16)) ||
1578 error "(0.1) Fail to create $DIR/$tdir/fx"
1579 touch $DIR/$tdir/guard0
# Also create 16 composite (PFL) files, followed by a second guard file.
1581 for ((i = 0; i < 16; i++)); do
1582 $LFS setstripe -E 512K -S 256K -o 0 -E 2M \
1583 $DIR/$tdir/f_comp${i} ||
1584 error "(0.2) Fail to create $DIR/$tdir/f_comp${i}"
1586 touch $DIR/$tdir/guard1
1588 do_facet ost1 $LCTL set_param fail_loc=0
1590 start_full_debug_logging
1592 # exhaust other pre-created dangling cases
1593 count=$(precreated_ost_obj_count 0 0)
1594 createmany -o $DIR/$tdir/a $count ||
1595 error "(0.5) Fail to create $count files."
1597 echo "'ls' should fail because of dangling referenced MDT-object"
1598 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
1600 echo "Trigger layout LFSCK to find out dangling reference"
1601 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1603 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1604 mdd.${MDT_DEV}.lfsck_layout |
1605 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1607 error "(3) unexpected status"
1610 local repaired=$($SHOW_LAYOUT |
1611 awk '/^repaired_dangling/ { print $2 }')
1612 [ $repaired -ge 32 ] ||
1613 error "(4) Fail to repair dangling reference: $repaired"
1615 echo "'stat' should fail because of not repair dangling by default"
1616 stat $DIR/$tdir/guard0 > /dev/null 2>&1 &&
1617 error "(5.1) stat should fail"
1618 stat $DIR/$tdir/guard1 > /dev/null 2>&1 &&
1619 error "(5.2) stat should fail"
1621 echo "Trigger layout LFSCK to repair dangling reference"
1622 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1624 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1625 mdd.${MDT_DEV}.lfsck_layout |
1626 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1628 error "(7) unexpected status"
1631 # There may be some async LFSCK updates still in progress; wait for
1632 # a while until the target has been repaired. LU-4970.
1634 echo "'stat' should success after layout LFSCK repairing"
1635 wait_update_facet client "stat $DIR/$tdir/guard0 |
1636 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1637 stat $DIR/$tdir/guard0
1639 error "(8.1) unexpected size"
1642 wait_update_facet client "stat $DIR/$tdir/guard1 |
1643 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1644 stat $DIR/$tdir/guard1
1646 error "(8.2) unexpected size"
1649 repaired=$($SHOW_LAYOUT |
1650 awk '/^repaired_dangling/ { print $2 }')
1651 [ $repaired -ge 32 ] ||
1652 error "(9) Fail to repair dangling reference: $repaired"
1654 stop_full_debug_logging
# NOTE(review): the echo announces a stopall, but only setupall is among the
# visible lines -- confirm the stop step against the full file.
1656 echo "stopall to cleanup object cache"
1659 setupall > /dev/null
1661 run_test 14a "LFSCK can repair MDT-object with dangling LOV EA reference (1)"
# Test 14b body: same dangling-reference scenario as test 14a, but the layout
# LFSCK is started with -o -d in addition, and completion is awaited on all
# targets through wait_all_targets_blocked.
# A first run (-r -o -d) must count at least 32 repaired_dangling while stat
# on the guard file still fails; a second run with -c added must leave the
# guard file stat-able with size 0.
1665 echo "The OST-object referenced by the MDT-object should be there;"
1666 echo "otherwise, the LFSCK should re-create the missing OST-object."
1667 echo "with '--delay-create-ostobj' option."
1670 check_mount_and_prep
1671 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1673 echo "Inject failure stub to simulate dangling referenced MDT-object"
1674 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1675 do_facet ost1 $LCTL set_param fail_loc=0x1610
1676 local count=$(precreated_ost_obj_count 0 0)
1678 createmany -o $DIR/$tdir/f $((count + 31))
1679 touch $DIR/$tdir/guard
1680 do_facet ost1 $LCTL set_param fail_loc=0
1682 start_full_debug_logging
1684 # exhaust other pre-created dangling cases
1685 count=$(precreated_ost_obj_count 0 0)
1686 createmany -o $DIR/$tdir/a $count ||
1687 error "(0) Fail to create $count files."
1689 echo "'ls' should fail because of dangling referenced MDT-object"
1690 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
1692 echo "Trigger layout LFSCK to find out dangling reference"
1693 $START_LAYOUT -r -o -d || error "(2) Fail to start LFSCK for layout!"
1695 wait_all_targets_blocked layout completed 3
1697 local repaired=$($SHOW_LAYOUT |
1698 awk '/^repaired_dangling/ { print $2 }')
1699 [ $repaired -ge 32 ] ||
1700 error "(4) Fail to repair dangling reference: $repaired"
1702 echo "'stat' should fail because of not repair dangling by default"
1703 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
1705 echo "Trigger layout LFSCK to repair dangling reference"
1706 $START_LAYOUT -r -o -c -d || error "(6) Fail to start LFSCK for layout!"
1708 wait_all_targets_blocked layout completed 7
1710 # There may be some async LFSCK updates still in progress; wait for
1711 # a while until the target has been repaired. LU-4970.
1713 echo "'stat' should success after layout LFSCK repairing"
1714 wait_update_facet client "stat $DIR/$tdir/guard |
1715 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1716 stat $DIR/$tdir/guard
1718 error "(8) unexpected size"
1721 repaired=$($SHOW_LAYOUT |
1722 awk '/^repaired_dangling/ { print $2 }')
1723 [ $repaired -ge 32 ] ||
1724 error "(9) Fail to repair dangling reference: $repaired"
1726 stop_full_debug_logging
# NOTE(review): the echo announces a stopall, but only setupall is among the
# visible lines -- confirm the stop step against the full file.
1728 echo "stopall to cleanup object cache"
1731 setupall > /dev/null
1733 run_test 14b "LFSCK can repair MDT-object with dangling LOV EA reference (2)"
# Test 15a body: the layout LFSCK repairs OST-objects that point back to a
# non-existent MDT-object. A plain file f0 and a PFL file f1 are written
# while fail_loc 0x1611 is set on all OST nodes, and the LFSCK is then
# expected to report at least 3 repaired_unmatched_pair.
1737 echo "If the OST-object referenced by the MDT-object back points"
1738 echo "to some non-exist MDT-object, then the LFSCK should repair"
1739 echo "the OST-object to back point to the right MDT-object."
1742 check_mount_and_prep
1743 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1745 echo "Inject failure stub to make the OST-object to back point to"
1746 echo "non-exist MDT-object."
1747 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
1749 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
1750 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1751 $LFS setstripe -E 1M -S 256K -c 1 -E -1 -S 512K -c $OSTCOUNT \
1753 error "(0) Fail to create PFL $DIR/$tdir/f1"
1754 # 'dd' will first trigger a punch RPC on every OST-object.
1755 # So even though some OST-objects will not be written by 'dd',
1756 # as long as an object is allocated (it may NOT be allocated in pfl_3b)
1757 # its layout information will be set as well.
1758 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4K count=257
1759 cancel_lru_locks osc
1760 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1762 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1763 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1765 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1766 mdd.${MDT_DEV}.lfsck_layout |
1767 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1769 error "(2) unexpected status"
1772 local repaired=$($SHOW_LAYOUT |
1773 awk '/^repaired_unmatched_pair/ { print $2 }')
1774 [ $repaired -ge 3 ] ||
1775 error "(3) Fail to repair unmatched pair: $repaired"
1777 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# Test 15b body: the layout LFSCK repairs OST-objects that point back to a
# different MDT-object which does not recognize them. A guard file is
# written first, then f0 and a PFL file f1 are written while fail_loc 0x1612
# is set on all OST nodes, and the LFSCK is expected to report exactly 4
# repaired_unmatched_pair.
# NOTE(review): stripes is raised to 2 when there are at least two OSTs; its
# default assignment is not among the visible lines.
1781 echo "If the OST-object referenced by the MDT-object back points"
1782 echo "to other MDT-object that doesn't recognize the OST-object,"
1783 echo "then the LFSCK should repair it to back point to the right"
1784 echo "MDT-object (the first one)."
1787 check_mount_and_prep
1788 mkdir -p $DIR/$tdir/0
1789 $LFS setstripe -c 1 -i 0 $DIR/$tdir/0
1790 dd if=/dev/zero of=$DIR/$tdir/0/guard bs=1M count=1
1791 cancel_lru_locks osc
1793 echo "Inject failure stub to make the OST-object to back point to"
1794 echo "other MDT-object"
1797 [ $OSTCOUNT -ge 2 ] && stripes=2
1799 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1800 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1612
1801 dd if=/dev/zero of=$DIR/$tdir/0/f0 bs=1M count=1
1802 $LFS setstripe -E 1M -S 256K -c $stripes -E 2M -S 512K -c 1 \
1804 error "(0) Fail to create PFL $DIR/$tdir/f1"
1805 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2
1806 cancel_lru_locks osc
1807 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1809 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1810 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1812 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1813 mdd.${MDT_DEV}.lfsck_layout |
1814 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1816 error "(2) unexpected status"
1819 local repaired=$($SHOW_LAYOUT |
1820 awk '/^repaired_unmatched_pair/ { print $2 }')
1821 [ $repaired -eq 4 ] ||
1822 error "(3) Fail to repair unmatched pair: $repaired"
1824 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# Test 15c body: the layout LFSCK racing with a delayed directory migration.
# A directory on MDT index 1 is migrated to MDT index 0 in the background
# while fail_loc 0x1803 is set on mds2, and a layout LFSCK is started on all
# targets. The run must report exactly 1 repaired_unmatched_pair and
# 0 repaired_multiple_referenced.
# Needs at least two MDTs, and is skipped when the MDS version is 2.7.55 or
# later (LU-6437).
1827 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
1829 [ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.55) ] &&
1830 skip "Skip the test after 2.7.55 see LU-6437" && return
1833 echo "According to current metadata migration implementation,"
1834 echo "before the old MDT-object is removed, both the new MDT-object"
1835 echo "and old MDT-object will reference the same LOV layout. Then if"
1836 echo "the layout LFSCK finds the new MDT-object by race, it will"
1837 echo "regard related OST-object(s) as multiple referenced case, and"
1838 echo "will try to create new OST-object(s) for the new MDT-object."
1839 echo "To avoid such trouble, the layout LFSCK needs to lock the old"
1840 echo "MDT-object before confirm the multiple referenced case."
1843 check_mount_and_prep
1844 $LFS mkdir -i 1 $DIR/$tdir/a1
1845 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1846 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=1
1847 cancel_lru_locks osc
1849 echo "Inject failure stub on MDT1 to delay the migration"
1851 #define OBD_FAIL_MIGRATE_DELAY 0x1803
1852 do_facet mds2 $LCTL set_param fail_val=5 fail_loc=0x1803
1853 echo "Migrate $DIR/$tdir/a1 from MDT1 to MDT0 with delay"
# The migration runs in the background so that the LFSCK can overlap it.
1854 $LFS migrate -m 0 $DIR/$tdir/a1 &
1857 echo "Trigger layout LFSCK to race with the migration"
1858 $START_LAYOUT -A -r || error "(1) Fail to start layout LFSCK!"
1860 wait_all_targets_blocked layout completed 2
1862 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
1863 local repaired=$($SHOW_LAYOUT |
1864 awk '/^repaired_unmatched_pair/ { print $2 }')
1865 [ $repaired -eq 1 ] ||
1866 error "(3) Fail to repair unmatched pair: $repaired"
1868 repaired=$($SHOW_LAYOUT |
1869 awk '/^repaired_multiple_referenced/ { print $2 }')
1870 [ $repaired -eq 0 ] ||
1871 error "(4) Unexpectedly repaird multiple references: $repaired"
1873 run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)"
# Test 16 body: the layout LFSCK repairs an OST-object whose owner differs
# from the owner of its MDT-object. The owner of f0 is changed with chown
# while fail_loc 0x1613 is set on the MDS, and the LFSCK is then expected to
# report exactly 1 repaired_inconsistent_owner.
1877 echo "If the OST-object's owner information does not match the owner"
1878 echo "information stored in the MDT-object, then the LFSCK trust the"
1879 echo "MDT-object and update the OST-object's owner information."
1882 check_mount_and_prep
1883 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1884 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1885 cancel_lru_locks osc
1887 # created but no setattr or write to the file.
# NOTE(review): the creation of d1 is not among the visible lines.
1889 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/d1
1890 $RUNAS createmany -o $DIR/$tdir/d1/o 100 || error "create failed"
1892 echo "Inject failure stub to skip OST-object owner changing"
1893 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
1894 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
1895 chown 1.1 $DIR/$tdir/f0
1896 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1898 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
1901 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1903 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1904 mdd.${MDT_DEV}.lfsck_layout |
1905 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1907 error "(2) unexpected status"
# Only f0 is expected to be counted as an owner repair.
1910 local repaired=$($SHOW_LAYOUT |
1911 awk '/^repaired_inconsistent_owner/ { print $2 }')
1912 [ $repaired -eq 1 ] ||
1913 error "(3) Fail to repair inconsistent owner: $repaired"
1915 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# Test 17 body: the layout LFSCK repairs OST-objects referenced by more than
# one MDT-object. A 1 MiB guard file is written, then f0 and a PFL file f1
# are created while fail_loc 0x1614 is set on the MDS. Before the repair f0
# and f1 both report the size of guard (1048576). After the LFSCK run
# (exactly 2 repaired_multiple_referenced expected), writing 2 MiB to f0 or
# to f1 must leave the size of guard unchanged.
1919 echo "If more than one MDT-objects reference the same OST-object,"
1920 echo "and the OST-object only recognizes one MDT-object, then the"
1921 echo "LFSCK should create new OST-objects for such non-recognized"
1925 check_mount_and_prep
1926 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1928 echo "Inject failure stub to make two MDT-objects to refernce"
1929 echo "the OST-object"
1931 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
1932 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
1933 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1934 cancel_lru_locks mdc
1935 cancel_lru_locks osc
1937 createmany -o $DIR/$tdir/f 1
1938 cancel_lru_locks mdc
1939 cancel_lru_locks osc
1941 $LFS setstripe -E 2M -S 256K -o 0 -E 4M -S 512K -o 0 \
1943 error "(0) Fail to create PFL $DIR/$tdir/f1"
1944 cancel_lru_locks mdc
1945 cancel_lru_locks osc
1946 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# Before the repair, both files are expected to show the size of guard.
1948 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
1949 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard use the same OST-objects"
1950 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
1951 [ $size -eq 1048576 ] ||
1952 error "(1.1) f0 (wrong) size should be 1048576, but got $size"
1954 size=$(ls -l $DIR/$tdir/f1 | awk '{ print $5 }')
1955 [ $size -eq 1048576 ] ||
1956 error "(1.2) f1 (wrong) size should be 1048576, but got $size"
1958 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
1961 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1963 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1964 mdd.${MDT_DEV}.lfsck_layout |
1965 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1967 error "(3) unexpected status"
1970 local repaired=$($SHOW_LAYOUT |
1971 awk '/^repaired_multiple_referenced/ { print $2 }')
1972 [ $repaired -eq 2 ] ||
1973 error "(4) Fail to repair multiple references: $repaired"
# After the repair, growing f0 or f1 must not change the size of guard.
1975 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
1976 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
1977 error "(5) Fail to write f0."
1978 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1979 [ $size -eq 1048576 ] ||
1980 error "(6) guard size should be 1048576, but got $size"
1982 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard should use diff OST-objects"
1983 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2 ||
1984 error "(7) Fail to write f1."
1985 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1986 [ $size -eq 1048576 ] ||
1987 error "(8) guard size should be 1048576, but got $size"
1989 run_test 17 "LFSCK can repair multiple references"
# Set the local debug parameter to +cache for the tests that follow; the
# command output is discarded.
# NOTE(review): presumably the leading + adds cache to the existing debug
# mask rather than replacing it -- confirm against the lctl documentation.
1991 $LCTL set_param debug=+cache > /dev/null
# test_18a: the MDT-objects survive but their layout EA is dropped (fail_loc
# 0x1615 is armed while chown runs). A layout LFSCK started with "-r -o" is
# then expected to regenerate the layout EA so the file sizes are right again.
# Files involved: a1/f1, a2/f2 (only when MDSCOUNT >= 2) and the PFL file f3.
# NOTE(review): the function header and the fi/done/} terminators are not
# part of this listing - confirm against the full script.
1995 echo "The target MDT-object is there, but related stripe information"
1996 echo "is lost or partly lost. The LFSCK should regenerate the missing"
1997 echo "layout EA entries."
2000 check_mount_and_prep
2001 $LFS mkdir -i 0 $DIR/$tdir/a1
2002 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2003 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# "ls -il" prints the inode number first, so the size is awk field 6.
2005 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2007 $LFS path2fid $DIR/$tdir/a1/f1
2008 $LFS getstripe $DIR/$tdir/a1/f1
2010 if [ $MDSCOUNT -ge 2 ]; then
2011 $LFS mkdir -i 1 $DIR/$tdir/a2
2012 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2013 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2014 $LFS path2fid $DIR/$tdir/a2/f2
2015 $LFS getstripe $DIR/$tdir/a2/f2
2018 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2019 error "(0) Fail to create PFL $DIR/$tdir/f3"
2021 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2023 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2025 $LFS path2fid $DIR/$tdir/f3
2026 $LFS getstripe $DIR/$tdir/f3
2028 cancel_lru_locks osc
2030 echo "Inject failure, to make the MDT-object lost its layout EA"
2031 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
2032 do_facet mds1 $LCTL set_param fail_loc=0x1615
2033 chown 1.1 $DIR/$tdir/a1/f1
2035 if [ $MDSCOUNT -ge 2 ]; then
2036 do_facet mds2 $LCTL set_param fail_loc=0x1615
2037 chown 1.1 $DIR/$tdir/a2/f2
2040 chown 1.1 $DIR/$tdir/f3
# Disarm the fault injection before checking the damage.
2045 do_facet mds1 $LCTL set_param fail_loc=0
2046 if [ $MDSCOUNT -ge 2 ]; then
2047 do_facet mds2 $LCTL set_param fail_loc=0
2050 cancel_lru_locks mdc
2051 cancel_lru_locks osc
2053 echo "The file size should be incorrect since layout EA is lost"
2054 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2055 [ "$cur_size" != "$saved_size1" ] ||
2056 error "(1) Expect incorrect file1 size"
2058 if [ $MDSCOUNT -ge 2 ]; then
2059 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2060 [ "$cur_size" != "$saved_size1" ] ||
2061 error "(2) Expect incorrect file2 size"
2064 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2065 [ "$cur_size" != "$saved_size2" ] ||
2066 error "(1.2) Expect incorrect file3 size"
2068 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2069 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
2071 for k in $(seq $MDSCOUNT); do
2072 # The LFSCK status query interval is 30 seconds. For the case
2073 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2074 # time to guarantee the status sync up.
2075 wait_update_facet mds${k} "$LCTL get_param -n \
2076 mdd.$(facet_svc mds${k}).lfsck_layout |
2077 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2078 error "(4) MDS${k} is not the expected 'completed'"
2081 for k in $(seq $OSTCOUNT); do
2082 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2083 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2084 awk '/^status/ { print $2 }')
2085 [ "$cur_status" == "completed" ] ||
2086 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# NOTE(review): 3 on mds1 is presumably one object of f1 plus two of the PFL
# file f3, and 2 on mds2 the two stripes of f2 - confirm.
2089 local repaired=$(do_facet mds1 $LCTL get_param -n \
2090 mdd.$(facet_svc mds1).lfsck_layout |
2091 awk '/^repaired_orphan/ { print $2 }')
2092 [ $repaired -eq 3 ] ||
2093 error "(6.1) Expect 3 fixed on mds1, but got: $repaired"
2095 if [ $MDSCOUNT -ge 2 ]; then
2096 repaired=$(do_facet mds2 $LCTL get_param -n \
2097 mdd.$(facet_svc mds2).lfsck_layout |
2098 awk '/^repaired_orphan/ { print $2 }')
2099 [ $repaired -eq 2 ] ||
2100 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
2103 $LFS path2fid $DIR/$tdir/a1/f1
2104 $LFS getstripe $DIR/$tdir/a1/f1
2106 if [ $MDSCOUNT -ge 2 ]; then
2107 $LFS path2fid $DIR/$tdir/a2/f2
2108 $LFS getstripe $DIR/$tdir/a2/f2
2111 $LFS path2fid $DIR/$tdir/f3
2112 $LFS getstripe $DIR/$tdir/f3
2114 echo "The file size should be correct after layout LFSCK scanning"
2115 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2116 [ "$cur_size" == "$saved_size1" ] ||
2117 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2119 if [ $MDSCOUNT -ge 2 ]; then
# f2 was written with the same dd parameters as f1, so saved_size1 is reused.
2120 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2121 [ "$cur_size" == "$saved_size1" ] ||
2122 error "(8) Expect file2 size $saved_size1, but got $cur_size"
2125 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2126 [ "$cur_size" == "$saved_size2" ] ||
2127 error "(9) Expect file3 size $saved_size2, but got $cur_size"
2129 run_test 18a "Find out orphan OST-object and repair it (1)"
# test_18b: the MDT-objects themselves are lost (fail_loc 0x1616 is armed
# while the files are removed). A layout LFSCK started with "-r -o" is
# expected to re-create them under .lustre/lost+found/MDTxxxx as
# "<fid>-R-0"; the test then moves them back and checks the file sizes.
# Skipped when FILESET is set.
# NOTE(review): the function header and the fi/done/} terminators are not
# part of this listing - confirm against the full script.
2132 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2135 echo "The target MDT-object is lost. The LFSCK should re-create the"
2136 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
2137 echo "can move it back to normal namespace manually."
2140 check_mount_and_prep
2141 $LFS mkdir -i 0 $DIR/$tdir/a1
2142 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2143 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# "ls -il" prints the inode number first, so the size is awk field 6.
2144 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
# The FIDs are saved because the recovered stubs are named after them.
2145 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2147 $LFS getstripe $DIR/$tdir/a1/f1
2149 if [ $MDSCOUNT -ge 2 ]; then
2150 $LFS mkdir -i 1 $DIR/$tdir/a2
2151 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2152 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2153 local fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
2155 $LFS getstripe $DIR/$tdir/a2/f2
2158 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2159 error "(0) Fail to create PFL $DIR/$tdir/f3"
2161 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2163 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2164 local fid3=$($LFS path2fid $DIR/$tdir/f3)
2166 $LFS getstripe $DIR/$tdir/f3
2168 cancel_lru_locks osc
2170 echo "Inject failure, to simulate the case of missing the MDT-object"
2171 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2172 do_facet mds1 $LCTL set_param fail_loc=0x1616
2173 rm -f $DIR/$tdir/a1/f1
2175 if [ $MDSCOUNT -ge 2 ]; then
2176 do_facet mds2 $LCTL set_param fail_loc=0x1616
2177 rm -f $DIR/$tdir/a2/f2
# Disarm the fault injection before starting LFSCK.
2185 do_facet mds1 $LCTL set_param fail_loc=0
2186 if [ $MDSCOUNT -ge 2 ]; then
2187 do_facet mds2 $LCTL set_param fail_loc=0
2190 cancel_lru_locks mdc
2191 cancel_lru_locks osc
2193 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2194 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2196 for k in $(seq $MDSCOUNT); do
2197 # The LFSCK status query interval is 30 seconds. For the case
2198 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2199 # time to guarantee the status sync up.
2200 wait_update_facet mds${k} "$LCTL get_param -n \
2201 mdd.$(facet_svc mds${k}).lfsck_layout |
2202 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2203 error "(2) MDS${k} is not the expected 'completed'"
2206 for k in $(seq $OSTCOUNT); do
2207 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2208 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2209 awk '/^status/ { print $2 }')
2210 [ "$cur_status" == "completed" ] ||
2211 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
2214 local repaired=$(do_facet mds1 $LCTL get_param -n \
2215 mdd.$(facet_svc mds1).lfsck_layout |
2216 awk '/^repaired_orphan/ { print $2 }')
2217 [ $repaired -eq 3 ] ||
2218 error "(4.1) Expect 3 fixed on mds1, but got: $repaired"
2220 if [ $MDSCOUNT -ge 2 ]; then
2221 repaired=$(do_facet mds2 $LCTL get_param -n \
2222 mdd.$(facet_svc mds2).lfsck_layout |
2223 awk '/^repaired_orphan/ { print $2 }')
2224 [ $repaired -eq 2 ] ||
2225 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
2228 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2229 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2230 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2232 if [ $MDSCOUNT -ge 2 ]; then
2233 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
2234 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
# Own label for the f3 move so a failure is not confused with the f1 move.
2237 mv $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0 $DIR/$tdir/f3 ||
2238 error "(6.1) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
2240 $LFS path2fid $DIR/$tdir/a1/f1
2241 $LFS getstripe $DIR/$tdir/a1/f1
2243 if [ $MDSCOUNT -ge 2 ]; then
2244 $LFS path2fid $DIR/$tdir/a2/f2
2245 $LFS getstripe $DIR/$tdir/a2/f2
2248 $LFS path2fid $DIR/$tdir/f3
2249 $LFS getstripe $DIR/$tdir/f3
2251 echo "The file size should be correct after layout LFSCK scanning"
2252 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2253 [ "$cur_size" == "$saved_size1" ] ||
2254 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2256 if [ $MDSCOUNT -ge 2 ]; then
# f2 was written with the same dd parameters as f1, so saved_size1 is reused.
2257 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2258 [ "$cur_size" == "$saved_size1" ] ||
2259 error "(8) Expect file2 size $saved_size1, but got $cur_size"
2262 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2263 [ "$cur_size" == "$saved_size2" ] ||
2264 error "(9) Expect file3 size $saved_size2, but got $cur_size"
2266 run_test 18b "Find out orphan OST-object and repair it (2)"
# test_18c: the files are written while fail_loc 0x1617 is set on the OST
# nodes (described below as "missing parent FID"), then the MDT-objects are
# removed under fail_loc 0x1616. A layout LFSCK started with "-r -o" is
# expected to re-create the MDT-objects with new FIDs as "*-N-*" stubs under
# .lustre/lost+found/MDT0000, and to leave nothing under MDT0001.
# Skipped when FILESET is set.
# NOTE(review): the function header, the fi/done/} terminators, the
# assignment of $expected and the emptiness test guarding error (6) are not
# part of this listing - confirm against the full script.
2269 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2272 echo "The target MDT-object is lost, and the OST-object FID is missing."
2273 echo "The LFSCK should re-create the MDT-object with new FID under the "
2274 echo "directory .lustre/lost+found/MDTxxxx."
2277 check_mount_and_prep
2278 $LFS mkdir -i 0 $DIR/$tdir/a1
2279 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2281 echo "Inject failure, to simulate the case of missing parent FID"
2282 #define OBD_FAIL_LFSCK_NOPFID 0x1617
2283 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1617
2285 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2286 $LFS getstripe $DIR/$tdir/a1/f1
2288 if [ $MDSCOUNT -ge 2 ]; then
2289 $LFS mkdir -i 1 $DIR/$tdir/a2
2290 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a2
2291 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2292 $LFS getstripe $DIR/$tdir/a2/f2
2295 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2296 error "(0) Fail to create PFL $DIR/$tdir/f3"
2298 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2299 $LFS getstripe $DIR/$tdir/f3
2301 cancel_lru_locks osc
# All files are written; clear the OST-side fault injection.
2302 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
2304 echo "Inject failure, to simulate the case of missing the MDT-object"
2305 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2306 do_facet mds1 $LCTL set_param fail_loc=0x1616
2307 rm -f $DIR/$tdir/a1/f1
2309 if [ $MDSCOUNT -ge 2 ]; then
2310 do_facet mds2 $LCTL set_param fail_loc=0x1616
2311 rm -f $DIR/$tdir/a2/f2
2319 do_facet mds1 $LCTL set_param fail_loc=0
2320 if [ $MDSCOUNT -ge 2 ]; then
2321 do_facet mds2 $LCTL set_param fail_loc=0
2324 cancel_lru_locks mdc
2325 cancel_lru_locks osc
2327 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2328 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2330 for k in $(seq $MDSCOUNT); do
2331 # The LFSCK status query interval is 30 seconds. For the case
2332 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2333 # time to guarantee the status sync up.
2334 wait_update_facet mds${k} "$LCTL get_param -n \
2335 mdd.$(facet_svc mds${k}).lfsck_layout |
2336 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2337 error "(2) MDS${k} is not the expected 'completed'"
2340 for k in $(seq $OSTCOUNT); do
2341 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2342 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2343 awk '/^status/ { print $2 }')
2344 [ "$cur_status" == "completed" ] ||
2345 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
2348 if [ $MDSCOUNT -ge 2 ]; then
2354 local repaired=$(do_facet mds1 $LCTL get_param -n \
2355 mdd.$(facet_svc mds1).lfsck_layout |
2356 awk '/^repaired_orphan/ { print $2 }')
2357 [ $repaired -eq $expected ] ||
2358 error "(4) Expect $expected fixed on mds1, but got: $repaired"
2360 if [ $MDSCOUNT -ge 2 ]; then
2361 repaired=$(do_facet mds2 $LCTL get_param -n \
2362 mdd.$(facet_svc mds2).lfsck_layout |
2363 awk '/^repaired_orphan/ { print $2 }')
2364 [ $repaired -eq 0 ] ||
2365 error "(5) Expect 0 fixed on mds2, but got: $repaired"
2368 ls -ail $MOUNT/.lustre/lost+found/
2370 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
2371 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
# The pattern is quoted so that find receives it verbatim instead of the
# shell expanding it against the current directory.
2372 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name "*-N-*")
2374 error "(6) .lustre/lost+found/MDT0001/ should be empty"
2377 echo "There should be some stub under .lustre/lost+found/MDT0000/"
2378 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2379 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
2381 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-N-*")
2382 [ ! -z "$cname" ] ||
2383 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2385 run_test 18c "Find out orphan OST-object and repair it (3)"
# test_18d: with fail_loc 0x1618 armed, chown makes f2 reference f1's
# OST-object and f4 reference f3's; f1 and f3 are then removed, leaving f2
# and f4 dangling while their own original objects still exist. After a
# layout LFSCK the test expects 2 repaired orphans, 0 repaired dangling
# references, and the original sizes of f2 and f4.
# NOTE(review): the function header, the fi/done/} terminators and some
# short statements are not part of this listing - confirm against the full
# script.
2389 echo "The target MDT-object layout EA is corrupted, but the right"
2390 echo "OST-object is still alive as orphan. The layout LFSCK will"
2391 echo "not create new OST-object to occupy such slot."
2394 check_mount_and_prep
2396 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2397 echo "guard" > $DIR/$tdir/a1/f1
2398 echo "foo" > $DIR/$tdir/a1/f2
2400 echo "guard" > $DIR/$tdir/a1/f3
2401 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2402 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2403 echo "foo" > $DIR/$tdir/a1/f4
# "ls -il" prints the inode number first, so the size is awk field 6.
2405 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2406 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2407 $LFS path2fid $DIR/$tdir/a1/f1
2408 $LFS getstripe $DIR/$tdir/a1/f1
2409 $LFS path2fid $DIR/$tdir/a1/f2
2410 $LFS getstripe $DIR/$tdir/a1/f2
2411 $LFS path2fid $DIR/$tdir/a1/f3
2412 $LFS getstripe $DIR/$tdir/a1/f3
2413 $LFS path2fid $DIR/$tdir/a1/f4
2414 $LFS getstripe $DIR/$tdir/a1/f4
2415 cancel_lru_locks osc
2417 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2418 echo "to reference the same OST-object (which is f1's OST-obejct)."
2419 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2420 echo "dangling reference case, but f2's old OST-object is there."
2422 echo "The failure also makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2423 echo "to reference the same OST-object (which is f3's OST-obejct)."
2424 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2425 echo "dangling reference case, but f4's old OST-object is there."
2428 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2429 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2430 chown 1.1 $DIR/$tdir/a1/f2
2431 chown 1.1 $DIR/$tdir/a1/f4
2432 rm -f $DIR/$tdir/a1/f1
2433 rm -f $DIR/$tdir/a1/f3
2436 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2438 echo "stopall to cleanup object cache"
2441 setupall > /dev/null
2443 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
# NOTE(review): "-c -d" are extra lfsck_start options used only by this
# test; presumably they control (delayed) creation of missing OST-objects -
# confirm against the lctl lfsck_start usage.
2444 $START_LAYOUT -r -o -c -d || error "(2) Fail to start LFSCK for layout!"
2446 for k in $(seq $MDSCOUNT); do
2447 # The LFSCK status query internal is 30 seconds. For the case
2448 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2449 # time to guarantee the status sync up.
2450 wait_update_facet mds${k} "$LCTL get_param -n \
2451 mdd.$(facet_svc mds${k}).lfsck_layout |
2452 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2453 error "(3) MDS${k} is not the expected 'completed'"
2456 for k in $(seq $OSTCOUNT); do
2457 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2458 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2459 awk '/^status/ { print $2 }')
2460 [ "$cur_status" == "completed" ] ||
2461 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
# Both files must be fixed through the orphan path (old objects attached
# again), not through the dangling path.
2464 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2465 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2466 awk '/^repaired_orphan/ { print $2 }')
2467 [ $repaired -eq 2 ] ||
2468 error "(5) Expect 2 orphans have been fixed, but got: $repaired"
2470 repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2471 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2472 awk '/^repaired_dangling/ { print $2 }')
2473 [ $repaired -eq 0 ] ||
2474 error "(6) Expect 0 dangling has been fixed, but got: $repaired"
2476 echo "The file size should be correct after layout LFSCK scanning"
2477 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2478 [ "$cur_size" == "$saved_size1" ] ||
2479 error "(7) Expect file2 size $saved_size1, but got $cur_size"
2481 cur_size=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2482 [ "$cur_size" == "$saved_size2" ] ||
2483 error "(8) Expect file4 size $saved_size2, but got $cur_size"
2485 echo "The LFSCK should find back the original data."
2486 cat $DIR/$tdir/a1/f2
2487 $LFS path2fid $DIR/$tdir/a1/f2
2488 $LFS getstripe $DIR/$tdir/a1/f2
2489 cat $DIR/$tdir/a1/f4
2490 $LFS path2fid $DIR/$tdir/a1/f4
2491 $LFS getstripe $DIR/$tdir/a1/f4
2493 run_test 18d "Find out orphan OST-object and repair it (4)"
# test_18e: same corruption as the (4) variant (f2/f4 are made to reference
# the objects of f1/f3 via fail_loc 0x1618, then f1/f3 are removed), but the
# LFSCK is held with fail_loc 0x1602 / fail_val=10 while new data is
# appended to f2 and f4. The test then expects 2 repaired orphans and two
# "*-C-*" stub files under .lustre/lost+found/MDT0000 whose sizes match the
# original f2/f4 sizes. Skipped when FILESET is set.
# NOTE(review): the function header, the fi/done/} terminators and some
# short statements are not part of this listing - confirm against the full
# script.
2496 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2499 echo "The target MDT-object layout EA slot is occpuied by some new"
2500 echo "created OST-object when repair dangling reference case. Such"
2501 echo "conflict OST-object has been modified by others. To keep the"
2502 echo "new data, the LFSCK will create a new file to refernece this"
2503 echo "old orphan OST-object."
2506 check_mount_and_prep
2508 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2509 echo "guard" > $DIR/$tdir/a1/f1
2510 echo "foo" > $DIR/$tdir/a1/f2
2512 echo "guard" > $DIR/$tdir/a1/f3
2513 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2514 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2515 echo "foo" > $DIR/$tdir/a1/f4
# "ls -il" prints the inode number first, so the size is awk field 6.
2517 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2518 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2520 $LFS path2fid $DIR/$tdir/a1/f1
2521 $LFS getstripe $DIR/$tdir/a1/f1
2522 $LFS path2fid $DIR/$tdir/a1/f2
2523 $LFS getstripe $DIR/$tdir/a1/f2
2524 $LFS path2fid $DIR/$tdir/a1/f3
2525 $LFS getstripe $DIR/$tdir/a1/f3
2526 $LFS path2fid $DIR/$tdir/a1/f4
2527 $LFS getstripe $DIR/$tdir/a1/f4
2528 cancel_lru_locks osc
2530 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2531 echo "to reference the same OST-object (which is f1's OST-obejct)."
2532 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2533 echo "dangling reference case, but f2's old OST-object is there."
2535 echo "Also the failure makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2536 echo "to reference the same OST-object (which is f3's OST-obejct)."
2537 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2538 echo "dangling reference case, but f4's old OST-object is there."
2541 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2542 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2543 chown 1.1 $DIR/$tdir/a1/f2
2544 chown 1.1 $DIR/$tdir/a1/f4
2545 rm -f $DIR/$tdir/a1/f1
2546 rm -f $DIR/$tdir/a1/f3
2549 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2551 echo "stopall to cleanup object cache"
2554 setupall > /dev/null
# Armed before LFSCK starts and cleared only after f2/f4 were appended to
# (see the fail_val=0 fail_loc=0 reset below).
2556 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2557 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
2559 start_full_debug_logging
2561 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2562 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
2564 wait_update_facet mds1 "$LCTL get_param -n \
2565 mdd.$(facet_svc mds1).lfsck_layout |
2566 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2567 error "(3) MDS1 is not the expected 'scanning-phase2'"
2569 # to guarantee all updates are synced.
2573 echo "Write new data to f2/f4 to modify the new created OST-object."
2574 echo "dummy" >> $DIR/$tdir/a1/f2 || error "write a1/f2 failed"
2575 echo "dummy" >> $DIR/$tdir/a1/f4 || error "write a1/f4 failed"
2577 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
2579 for k in $(seq $MDSCOUNT); do
2580 # The LFSCK status query interval is 30 seconds. For the case
2581 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2582 # time to guarantee the status sync up.
2583 wait_update_facet mds${k} "$LCTL get_param -n \
2584 mdd.$(facet_svc mds${k}).lfsck_layout |
2585 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2586 error "(4) MDS${k} is not the expected 'completed'"
2589 for k in $(seq $OSTCOUNT); do
2590 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2591 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2592 awk '/^status/ { print $2 }')
2593 [ "$cur_status" == "completed" ] ||
2594 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2597 stop_full_debug_logging
2599 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2600 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2601 awk '/^repaired_orphan/ { print $2 }')
2602 [ $repaired -eq 2 ] ||
2603 error "(6) Expect 2 orphans have been fixed, but got: $repaired"
2605 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2606 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2607 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
2609 local count=$(ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-* | wc -l)
2610 if [ $count -ne 2 ]; then
2611 ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-*
2612 error "(8) Expect 2 stubs under lost+found, but got $count"
2615 echo "The stub file should keep the original f2 or f4 data"
# The find pattern is quoted so that find receives it verbatim instead of
# the shell expanding it against the current directory.
2616 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-C-*" | head -n 1)
2617 local cur_size=$(ls -il $cname | awk '{ print $6 }')
2618 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2619 error "(9) Got unexpected $cur_size"
2622 $LFS path2fid $cname
2623 $LFS getstripe $cname
2625 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-C-*" | tail -n 1)
2626 cur_size=$(ls -il $cname | awk '{ print $6 }')
2627 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2628 error "(10) Got unexpected $cur_size"
2631 $LFS path2fid $cname
2632 $LFS getstripe $cname
2634 echo "The f2/f4 should contains new data."
2635 cat $DIR/$tdir/a1/f2
2636 $LFS path2fid $DIR/$tdir/a1/f2
2637 $LFS getstripe $DIR/$tdir/a1/f2
2638 cat $DIR/$tdir/a1/f4
2639 $LFS path2fid $DIR/$tdir/a1/f4
2640 $LFS getstripe $DIR/$tdir/a1/f4
2642 run_test 18e "Find out orphan OST-object and repair it (5)"
# test_18f: MDT-objects are removed under fail_loc 0x1616, then a layout
# LFSCK runs while fail_loc 0x161c / fail_val=0 is set on mds1 (described
# below as OST0 failing to handle the MDT0 request in the first stage).
# Expected after the first run: status "partial" on every MDS and on ost1,
# "completed" on ost2, and 1 repaired orphan per MDT. After clearing the
# fault and running LFSCK again: "completed" everywhere and 2 repaired
# orphans per MDT. Requires at least 2 OSTs.
# NOTE(review): the function header and the fi/done/} terminators are not
# part of this listing - confirm against the full script.
2645 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
2648 echo "The target MDT-object is lost. The LFSCK should re-create the"
2649 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2650 echo "to verify some OST-object(s) during the first stage-scanning,"
2651 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2652 echo "should not be affected."
2655 check_mount_and_prep
2656 $LFS mkdir -i 0 $DIR/$tdir/a1
2657 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2658 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2659 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2660 $LFS mkdir -i 0 $DIR/$tdir/a2
2661 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
2662 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2663 $LFS getstripe $DIR/$tdir/a1/f1
2664 $LFS getstripe $DIR/$tdir/a2/f2
2666 if [ $MDSCOUNT -ge 2 ]; then
2667 $LFS mkdir -i 1 $DIR/$tdir/a3
2668 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a3
2669 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2670 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2671 $LFS mkdir -i 1 $DIR/$tdir/a4
2672 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
2673 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2674 $LFS getstripe $DIR/$tdir/a3/f3
2675 $LFS getstripe $DIR/$tdir/a4/f4
2678 cancel_lru_locks osc
2680 echo "Inject failure, to simulate the case of missing the MDT-object"
2681 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2682 do_facet mds1 $LCTL set_param fail_loc=0x1616
2683 rm -f $DIR/$tdir/a1/f1
2684 rm -f $DIR/$tdir/a2/f2
2686 if [ $MDSCOUNT -ge 2 ]; then
2687 do_facet mds2 $LCTL set_param fail_loc=0x1616
2688 rm -f $DIR/$tdir/a3/f3
2689 rm -f $DIR/$tdir/a4/f4
2695 do_facet mds1 $LCTL set_param fail_loc=0
2696 if [ $MDSCOUNT -ge 2 ]; then
2697 do_facet mds2 $LCTL set_param fail_loc=0
2700 cancel_lru_locks mdc
2701 cancel_lru_locks osc
2703 echo "Inject failure, to simulate the OST0 fail to handle"
2704 echo "MDT0 LFSCK request during the first-stage scanning."
2705 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
2706 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
2708 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2709 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2711 for k in $(seq $MDSCOUNT); do
2712 # The LFSCK status query interval is 30 seconds. For the case
2713 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2714 # time to guarantee the status sync up.
2715 wait_update_facet mds${k} "$LCTL get_param -n \
2716 mdd.$(facet_svc mds${k}).lfsck_layout |
2717 awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
2718 error "(2) MDS${k} is not the expected 'partial'"
2721 wait_update_facet ost1 "$LCTL get_param -n \
2722 obdfilter.$(facet_svc ost1).lfsck_layout |
2723 awk '/^status/ { print \\\$2 }'" "partial" $LTIME || {
2724 error "(3) OST1 is not the expected 'partial'"
2727 wait_update_facet ost2 "$LCTL get_param -n \
2728 obdfilter.$(facet_svc ost2).lfsck_layout |
2729 awk '/^status/ { print \\\$2 }'" "completed" $LTIME || {
2730 error "(4) OST2 is not the expected 'completed'"
2733 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
# First run: only one orphan per MDT is expected while one OST is failing.
2735 local repaired=$(do_facet mds1 $LCTL get_param -n \
2736 mdd.$(facet_svc mds1).lfsck_layout |
2737 awk '/^repaired_orphan/ { print $2 }')
2738 [ $repaired -eq 1 ] ||
2739 error "(5) Expect 1 fixed on mds{1}, but got: $repaired"
2741 if [ $MDSCOUNT -ge 2 ]; then
2742 repaired=$(do_facet mds2 $LCTL get_param -n \
2743 mdd.$(facet_svc mds2).lfsck_layout |
2744 awk '/^repaired_orphan/ { print $2 }')
2745 [ $repaired -eq 1 ] ||
2746 error "(6) Expect 1 fixed on mds{2}, but got: $repaired"
2749 echo "Trigger layout LFSCK on all devices again to cleanup"
2750 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
2752 for k in $(seq $MDSCOUNT); do
2753 # The LFSCK status query interval is 30 seconds. For the case
2754 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2755 # time to guarantee the status sync up.
2756 wait_update_facet mds${k} "$LCTL get_param -n \
2757 mdd.$(facet_svc mds${k}).lfsck_layout |
2758 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2759 error "(8) MDS${k} is not the expected 'completed'"
2762 for k in $(seq $OSTCOUNT); do
# Declared local so the status does not leak into the caller's scope.
2763 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2764 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2765 awk '/^status/ { print $2 }')
2766 [ "$cur_status" == "completed" ] ||
2767 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
# Second run, fault cleared: two orphans per MDT are expected.
2771 local repaired=$(do_facet mds1 $LCTL get_param -n \
2772 mdd.$(facet_svc mds1).lfsck_layout |
2773 awk '/^repaired_orphan/ { print $2 }')
2774 [ $repaired -eq 2 ] ||
2775 error "(10) Expect 2 fixed on mds{1}, but got: $repaired"
2777 if [ $MDSCOUNT -ge 2 ]; then
2778 repaired=$(do_facet mds2 $LCTL get_param -n \
2779 mdd.$(facet_svc mds2).lfsck_layout |
2780 awk '/^repaired_orphan/ { print $2 }')
2781 [ $repaired -eq 2 ] ||
2782 error "(11) Expect 2 fixed on mds{2}, but got: $repaired"
2785 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# test_18g: a1/f1 is removed while fail_loc 0x162e is set on mds1
# (described below as losing the MDT-object but keeping its OI mapping).
# A layout LFSCK started with "-r -o" is expected to report $OSTCOUNT
# repaired orphans and to leave "<fid>-R-0" under
# .lustre/lost+found/MDT0000, which is then moved back to a1/f1.
# Skipped when FILESET is set.
# NOTE(review): the function header and the done/} terminators are not
# part of this listing - confirm against the full script.
2788 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2791 echo "The target MDT-object is lost, but related OI mapping is there"
2792 echo "The LFSCK should recreate the lost MDT-object without affected"
2793 echo "by the stale OI mapping."
2796 check_mount_and_prep
2797 $LFS mkdir -i 0 $DIR/$tdir/a1
# NOTE(review): "-c -1" presumably stripes over every OST; together with
# count=$OSTCOUNT this matches the $OSTCOUNT orphans expected below - confirm.
2798 $LFS setstripe -c -1 -i 0 -S 1M $DIR/$tdir/a1
2799 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=$OSTCOUNT
# The FID is saved because the recovered stub is named after it.
2800 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2802 $LFS getstripe $DIR/$tdir/a1/f1
2803 cancel_lru_locks osc
2805 echo "Inject failure to simulate lost MDT-object but keep OI mapping"
2806 #define OBD_FAIL_LFSCK_LOST_MDTOBJ2 0x162e
2807 do_facet mds1 $LCTL set_param fail_loc=0x162e
2808 rm -f $DIR/$tdir/a1/f1
2810 do_facet mds1 $LCTL set_param fail_loc=0
2811 cancel_lru_locks mdc
2812 cancel_lru_locks osc
2814 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2815 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2817 for k in $(seq $MDSCOUNT); do
2818 # The LFSCK status query internal is 30 seconds. For the case
2819 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2820 # time to guarantee the status sync up.
2821 wait_update_facet mds${k} "$LCTL get_param -n \
2822 mdd.$(facet_svc mds${k}).lfsck_layout |
2823 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2824 error "(2) MDS${k} is not the expected 'completed'"
2827 for k in $(seq $OSTCOUNT); do
2828 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2829 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2830 awk '/^status/ { print $2 }')
2831 [ "$cur_status" == "completed" ] ||
2832 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
2835 local repaired=$(do_facet mds1 $LCTL get_param -n \
2836 mdd.$(facet_svc mds1).lfsck_layout |
2837 awk '/^repaired_orphan/ { print $2 }')
2838 [ $repaired -eq $OSTCOUNT ] ||
2839 error "(4) Expect $OSTCOUNT fixed, but got: $repaired"
2841 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2842 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2843 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2845 $LFS path2fid $DIR/$tdir/a1/f1
2846 $LFS getstripe $DIR/$tdir/a1/f1
2848 run_test 18g "Find out orphan OST-object and repair it (7)"
# test_18h: a PFL file f0 is written and copied to "guard", then chown is
# run while fail_loc 0x162f is set (described below as a bad PFL extent
# range). A write to f0 must then fail. After a layout LFSCK started with
# "-r -o" the test expects 2 repaired orphans, f0 identical to guard, and
# writes to f0 succeeding again.
# NOTE(review): the function header and the done/} terminators are not
# part of this listing - confirm against the full script.
2852 echo "The PFL extent crashed. During the first cycle LFSCK scanning,"
2853 echo "the layout LFSCK will keep the bad PFL file(s) there without"
2854 echo "scanning its OST-object(s). Then in the second stage scanning,"
2855 echo "the OST will return related OST-object(s) to the MDT as orphan."
2856 echo "And then the LFSCK on the MDT can rebuild the PFL extent with"
2857 echo "the 'orphan(s)' stripe information."
2860 check_mount_and_prep
2862 $LFS setstripe -E 2M -S 1M -c 1 -E -1 $DIR/$tdir/f0 ||
2863 error "(0) Fail to create PFL $DIR/$tdir/f0"
2865 cat $LUSTRE/tests/test-framework.sh > $DIR/$tdir/f0 ||
2866 error "(1.1) Fail to write $DIR/$tdir/f0"
# Second write starts at a 2M offset (seek=2 with bs=1M), i.e. at the
# boundary given by "-E 2M" above.
2868 dd if=$LUSTRE/tests/test-framework.sh of=$DIR/$tdir/f0 bs=1M seek=2 ||
2869 error "(1.2) Fail to write $DIR/$tdir/f0"
# Reference copy used by the diff after the repair.
2871 cp $DIR/$tdir/f0 $DIR/$tdir/guard
2873 echo "Inject failure stub to simulate bad PFL extent range"
2874 #define OBD_FAIL_LFSCK_BAD_PFL_RANGE 0x162f
2875 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162f
2877 chown 1.1 $DIR/$tdir/f0
2879 cancel_lru_locks mdc
2880 cancel_lru_locks osc
2881 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2883 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 &&
2884 error "(2) Write to bad PFL file should fail"
2886 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
2887 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
2889 for k in $(seq $MDSCOUNT); do
2890 # The LFSCK status query interval is 30 seconds. For the case
2891 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2892 # time to guarantee the status sync up.
2893 wait_update_facet mds${k} "$LCTL get_param -n \
2894 mdd.$(facet_svc mds${k}).lfsck_layout |
2895 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2896 error "(4.1) MDS${k} is not the expected 'completed'"
2899 for k in $(seq $OSTCOUNT); do
# Declared local so the status does not leak into the caller's scope.
2900 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2901 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2902 awk '/^status/ { print $2 }')
2903 [ "$cur_status" == "completed" ] ||
2904 error "(4.2) OST${k} Expect 'completed', but got '$cur_status'"
2908 local repaired=$($SHOW_LAYOUT |
2909 awk '/^repaired_orphan/ { print $2 }')
2910 [ $repaired -eq 2 ] ||
2911 error "(5) Fail to repair crashed PFL range: $repaired"
2913 echo "Data in $DIR/$tdir/f0 should not be broken"
2914 diff $DIR/$tdir/f0 $DIR/$tdir/guard ||
2915 error "(6) Data in $DIR/$tdir/f0 is broken"
2917 echo "Write should succeed after LFSCK repairing the bad PFL range"
2918 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 ||
2919 error "(7) Write should succeed after LFSCK"
2921 run_test 18h "LFSCK can repair crashed PFL extent range"
# Remove "cache" from the debug setting again; it was added by the
# "debug=+cache" statement earlier in this file.
2923 $LCTL set_param debug=-cache > /dev/null
# test_19a: files a0, a1 (PFL) and a2 are created with lfsck_verify_pfid
# set to 0 on OST0000; the parameter is then set to 1 and fail_loc 0x1619
# is set on the local node. Reading a0 and a1 is expected to fail.
# NOTE(review): the function header and closing brace are not part of this
# listing - confirm against the full script.
2926 check_mount_and_prep
2927 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2929 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2930 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
2932 echo "foo1" > $DIR/$tdir/a0
2933 $LFS setstripe -E 512K -S 512K -o 0 -E -1 -S 1M $DIR/$tdir/a1 ||
2934 error "(0) Fail to create PFL $DIR/$tdir/a1"
2935 echo "foo2" > $DIR/$tdir/a1
2936 echo "guard" > $DIR/$tdir/a2
2937 cancel_lru_locks osc
2939 echo "Inject failure, then client will offer wrong parent FID when read"
2940 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2941 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
# Unlike the OST-side settings above, this fail_loc is set with a plain
# local $LCTL call, i.e. on the node running the test.
2943 #define OBD_FAIL_LFSCK_INVALID_PFID 0x1619
2944 $LCTL set_param fail_loc=0x1619
2946 echo "Read RPC with wrong parent FID should be denied"
# A successful cat is the failure case here.
2947 cat $DIR/$tdir/a0 && error "(3.1) Read a0 should be denied!"
2948 cat $DIR/$tdir/a1 && error "(3.2) Read a1 should be denied!"
2949 $LCTL set_param fail_loc=0
2951 run_test 19a "OST-object inconsistency self detect"
# test_19b: f0 and f1 (PFL) are created while fail_loc 0x1611 is set on the
# OST nodes and lfsck_verify_pfid is 0 (described below as making the
# OST-objects point back to a non-existent MDT-object). With verification
# still off the "repaired" counter must be 0; after setting
# lfsck_verify_pfid to 1 and reading both files it must be 2.
# NOTE(review): the function header and closing brace are not part of this
# listing - confirm against the full script.
2954 check_mount_and_prep
2955 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2957 echo "Inject failure stub to make the OST-object to back point to"
2958 echo "non-exist MDT-object"
2960 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2961 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
2963 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
2964 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
2965 echo "foo1" > $DIR/$tdir/f0
2966 $LFS setstripe -E 1M -S 1M -o 0 -E 4M -S 256K $DIR/$tdir/f1 ||
2967 error "(0) Fail to create PFL $DIR/$tdir/f1"
2968 echo "foo2" > $DIR/$tdir/f1
2969 cancel_lru_locks osc
2970 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
2972 do_facet ost1 $LCTL set_param -n \
2973 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
2974 echo "Nothing should be fixed since self detect and repair is disabled"
2975 local repaired=$(do_facet ost1 $LCTL get_param -n \
2976 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2977 awk '/^repaired/ { print $2 }')
2978 [ $repaired -eq 0 ] ||
2979 error "(1) Expected 0 repaired, but got $repaired"
2981 echo "Read RPC with right parent FID should be accepted,"
2982 echo "and cause parent FID on OST to be fixed"
2984 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2985 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2987 cat $DIR/$tdir/f0 || error "(2.1) Read f0 should not be denied!"
2988 cat $DIR/$tdir/f1 || error "(2.2) Read f1 should not be denied!"
# Two files were read, so two repairs are expected; the message states the
# same number as the check.
2990 repaired=$(do_facet ost1 $LCTL get_param -n \
2991 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2992 awk '/^repaired/ { print $2 }')
2993 [ $repaired -eq 2 ] ||
2994 error "(3) Expected 2 repaired, but got $repaired"
2996 run_test 19b "OST-object inconsistency self repair"
# Values the tests below compare against the output of "lfs getstripe -L":
# one for a layout flagged as having a LOV EA hole, one for a plain layout.
PATTERN_WITH_HOLE=40000001
PATTERN_WITHOUT_HOLE=raid0
# test_20a: orphans rebuilt by layout LFSCK with a LOV EA hole must be
# usable from the client.
#
# Files f0..f2 (and f3 with more than 2 OSTs) are written under a striped
# directory, then unlinked while fail_loc/fail_val are set on mds1 so that
# f0 loses only its MDT-object and f1..f3 also lose one OST-object each.
# After "lfsck_start -t layout -r -o" completes, every file is expected
# under $MOUNT/.lustre/lost+found/MDT0000/<fid>-R-0 and is checked for
# pattern, stripe count, size, read/write behaviour, chown, touch and unlink.
test_20a() {
	[ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
	# Return after skipping, as the OST-count check above does.
	[ -n "$FILESET" ] && skip "Not functional for FILESET set" && return

	echo "The target MDT-object and some of its OST-object are lost."
	echo "The LFSCK should find out the left OST-objects and re-create"
	echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
	echo "with the partial OST-objects (LOV EA hole)."

	echo "New client can access the file with LOV EA hole via normal"
	echo "system tools or commands without crash the system."

	echo "For old client, even though it cannot access the file with"
	echo "LOV EA hole, it should not cause the system crash."

	check_mount_and_prep
	$LFS mkdir -i 0 $DIR/$tdir/a1

	# Number of 4096-byte blocks written to every file.
	# NOTE(review): values follow from the per-stripe layout listed below
	# (1M stripe = 256 blocks) - confirm.
	local bcount
	local fid3
	local k
	if [ $OSTCOUNT -gt 2 ]; then
		$LFS setstripe -c 3 -i 0 -S 1M $DIR/$tdir/a1
		bcount=513
	else
		$LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a1
		bcount=257
	fi

	# 256 blocks on the stripe0.
	# 1 block on the stripe1 for 2 OSTs case.
	# 256 blocks on the stripe1 for other cases.
	# 1 block on the stripe2 if OSTs > 2
	dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
	dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
	dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount

	local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
	local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
	local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)

	$LFS getstripe $DIR/$tdir/a1/f0
	$LFS getstripe $DIR/$tdir/a1/f1
	$LFS getstripe $DIR/$tdir/a1/f2

	if [ $OSTCOUNT -gt 2 ]; then
		dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
		fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
		$LFS getstripe $DIR/$tdir/a1/f3
	fi

	cancel_lru_locks osc

	echo "Inject failure..."
	echo "To simulate f0 lost MDT-object"
	#define OBD_FAIL_LFSCK_LOST_MDTOBJ	0x1616
	do_facet mds1 $LCTL set_param fail_loc=0x1616
	rm -f $DIR/$tdir/a1/f0

	echo "To simulate f1 lost MDT-object and OST-object0"
	#define OBD_FAIL_LFSCK_LOST_SPEOBJ	0x161a
	do_facet mds1 $LCTL set_param fail_loc=0x161a
	rm -f $DIR/$tdir/a1/f1

	echo "To simulate f2 lost MDT-object and OST-object1"
	do_facet mds1 $LCTL set_param fail_val=1
	rm -f $DIR/$tdir/a1/f2

	if [ $OSTCOUNT -gt 2 ]; then
		echo "To simulate f3 lost MDT-object and OST-object2"
		do_facet mds1 $LCTL set_param fail_val=2
		rm -f $DIR/$tdir/a1/f3
	fi

	umount_client $MOUNT
	do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
	$START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query interval is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" 32 ||
			error "(2) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		local cur_status=$(do_facet ost${k} $LCTL get_param -n \
				obdfilter.$(facet_svc ost${k}).lfsck_layout |
				awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(3) OST${k} Expect 'completed', but got '$cur_status'"
	done

	local repaired=$(do_facet mds1 $LCTL get_param -n \
			 mdd.$(facet_svc mds1).lfsck_layout |
			 awk '/^repaired_orphan/ { print $2 }')
	if [ $OSTCOUNT -gt 2 ]; then
		[ $repaired -eq 9 ] ||
		error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
	else
		[ $repaired -eq 4 ] ||
		error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
	fi

	mount_client $MOUNT || error "(5.0) Fail to start client!"

	LOV_PATTERN_F_HOLE=0x40000000

	#
	# ${fid0}-R-0 is the old f0
	#
	local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
	echo "Check $name, which is the old f0"

	$LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"

	local pattern=$($LFS getstripe -L $name)
	[[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
		error "(5.2) NOT expect pattern flag hole, but got $pattern"

	local stripes=$($LFS getstripe -c $name)
	if [ $OSTCOUNT -gt 2 ]; then
		[ $stripes -eq 3 ] ||
		error "(5.3.1) expect the stripe count is 3, but got $stripes"
	else
		[ $stripes -eq 2 ] ||
		error "(5.3.2) expect the stripe count is 2, but got $stripes"
	fi

	local size=$(stat $name | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(5.4) expect the size $((4096 * $bcount)), but got $size"

	cat $name > /dev/null || error "(5.5) cannot read $name"

	echo "dummy" >> $name || error "(5.6) cannot write $name"

	chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"

	touch $name || error "(5.8) cannot touch $name"

	rm -f $name || error "(5.9) cannot unlink $name"

	#
	# ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
	if [ $OSTCOUNT -gt 2 ]; then
		echo "Check $name, it contains the old f1's stripe1 and stripe2"
	else
		echo "Check $name, it contains the old f1's stripe1"
	fi

	$LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"

	pattern=$($LFS getstripe -L $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(6.2) expect pattern flag hole, but got $pattern"

	stripes=$($LFS getstripe -c $name)
	if [ $OSTCOUNT -gt 2 ]; then
		[ $stripes -eq 3 ] ||
		error "(6.3.1) expect the stripe count is 3, but got $stripes"
	else
		[ $stripes -eq 2 ] ||
		error "(6.3.2) expect the stripe count is 2, but got $stripes"
	fi

	size=$(stat $name | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(6.4) expect the size $((4096 * $bcount)), but got $size"

	cat $name > /dev/null && error "(6.5) normal read $name should fail"

	# conv=noerror lets dd continue past failed reads; count the failures.
	local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
			 bs=4096 2>&1 | grep "Input/output error" | wc -l)

	[ $failures -eq 256 ] ||
		error "(6.6) expect 256 IO failures, but get $failures"

	size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(6.7) expect the size $((4096 * $bcount)), but got $size"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
		error "(6.8) write to the LOV EA hole should fail"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
		error "(6.9) write to normal stripe should NOT fail"

	echo "foo" >> $name && error "(6.10) append write $name should fail"

	chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"

	touch $name || error "(6.12) cannot touch $name"

	rm -f $name || error "(6.13) cannot unlink $name"

	#
	# ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
	if [ $OSTCOUNT -gt 2 ]; then
		echo "Check $name, it contains the old f2's stripe0 and stripe2"
	else
		echo "Check $name, it contains the old f2's stripe0"
	fi

	$LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"

	pattern=$($LFS getstripe -L $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(7.2) expect pattern flag hole, but got $pattern"

	stripes=$($LFS getstripe -c $name)
	size=$(stat $name | awk '/Size:/ { print $2 }')
	if [ $OSTCOUNT -gt 2 ]; then
		[ $stripes -eq 3 ] ||
		error "(7.3.1) expect the stripe count is 3, but got $stripes"

		[ $size -eq $((4096 * $bcount)) ] ||
		error "(7.4.1) expect size $((4096 * $bcount)), but got $size"

		cat $name > /dev/null &&
			error "(7.5.1) normal read $name should fail"

		failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
			   bs=4096 2>&1 | grep "Input/output error" | wc -l)

		[ $failures -eq 256 ] ||
		error "(7.6) expect 256 IO failures, but get $failures"

		size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
		[ $size -eq $((4096 * $bcount)) ] ||
		error "(7.7) expect the size $((4096 * $bcount)), but got $size"

		dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
		seek=300 && error "(7.8.0) write to the LOV EA hole should fail"

		dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
		error "(7.8.1) write to normal stripe should NOT fail"

		echo "foo" >> $name &&
			error "(7.8.3) append write $name should fail"

		chown $RUNAS_ID:$RUNAS_GID $name ||
			error "(7.9.1) cannot chown on $name"

		touch $name || error "(7.10.1) cannot touch $name"
	else
		[ $stripes -eq 2 ] ||
		error "(7.3.2) expect the stripe count is 2, but got $stripes"

		# Only stripe0's 256 blocks count towards the size here.
		[ $size -eq $((4096 * (256 + 0))) ] ||
		error "(7.4.2) expect the size $((4096 * 256)), but got $size"

		cat $name > /dev/null &&
			error "(7.5.2) normal read $name should fail"

		failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
			   bs=4096 2>&1 | grep "Input/output error" | wc -l)
		[ $failures -eq 256 ] ||
		error "(7.6.2) expect 256 IO failures, but get $failures"

		size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
		[ $size -eq $((4096 * $bcount)) ] ||
		error "(7.7.2) expect the size $((4096 * $bcount)), got $size"

		dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
		seek=256 && error "(7.8.2) write to the LOV EA hole should fail"

		chown $RUNAS_ID:$RUNAS_GID $name ||
			error "(7.9.2) cannot chown on $name"

		touch $name || error "(7.10.2) cannot touch $name"
	fi

	rm -f $name || error "(7.11) cannot unlink $name"

	# f3 only exists when there are more than 2 OSTs.
	[ $OSTCOUNT -le 2 ] && return

	#
	# ${fid3}-R-0 should contains the old f3's stripe0 and stripe1
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
	echo "Check $name, which contains the old f3's stripe0 and stripe1"

	$LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"

	pattern=$($LFS getstripe -L $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(8.2) expect pattern flag hole, but got $pattern"

	stripes=$($LFS getstripe -c $name)
	[ $stripes -eq 3 ] ||
		error "(8.3) expect the stripe count is 3, but got $stripes"

	size=$(stat $name | awk '/Size:/ { print $2 }')
	# Only stripe0 and stripe1 (256 blocks each) count towards the size.
	[ $size -eq $((4096 * (256 + 256 + 0))) ] ||
		error "(8.4) expect the size $((4096 * 512)), but got $size"

	cat $name > /dev/null &&
		error "(8.5) normal read $name should fail"

	failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
		   bs=4096 2>&1 | grep "Input/output error" | wc -l)

	[ $failures -eq 256 ] ||
		error "(8.6) expect 256 IO failures, but get $failures"

	size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(8.7) expect the size $((4096 * $bcount)), but got $size"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
		seek=512 && error "(8.8) write to the LOV EA hole should fail"

	chown $RUNAS_ID:$RUNAS_GID $name ||
		error "(8.9) cannot chown on $name"

	touch $name || error "(8.10) cannot touch $name"

	rm -f $name || error "(8.11) cannot unlink $name"
}
3339 run_test 20a "Handle the orphan with dummy LOV EA slot properly"
# test_20b: PFL variant of the LOV EA hole test.
#
# Three two-component PFL files (2 stripes per component) are written and
# then unlinked while fail_loc/fail_val are set on $SINGLEMDS, so that f0
# loses only its MDT-object, f1 also loses the first OST-object of each
# component and f2 the second.  After "lfsck_start -t layout -r -o"
# completes, each file is expected under
# $MOUNT/.lustre/lost+found/MDT0000/<fid>-R-0 and is checked per component
# for pattern, stripe count and extent, then for size and I/O behaviour.
test_20b() {
	[ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
	# Return after skipping, as the OST-count check above does.
	[ -n "$FILESET" ] && skip "Not functional for FILESET set" && return

	echo "The target MDT-object and some of its OST-object are lost."
	echo "The LFSCK should find out the left OST-objects and re-create"
	echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
	echo "with the partial OST-objects (LOV EA hole)."

	echo "New client can access the file with LOV EA hole via normal"
	echo "system tools or commands without crash the system - PFL case."

	check_mount_and_prep

	$LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f0 ||
		error "(0) Fail to create PFL file $DIR/$tdir/f0"
	$LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f1 ||
		error "(1) Fail to create PFL file $DIR/$tdir/f1"
	$LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f2 ||
		error "(2) Fail to create PFL file $DIR/$tdir/f2"

	# Number of 4096-byte blocks written to every file (3M + 4K).
	local bcount=$((256 * 3 + 1))
	local k

	dd if=/dev/zero of=$DIR/$tdir/f0 bs=4096 count=$bcount
	dd if=/dev/zero of=$DIR/$tdir/f1 bs=4096 count=$bcount
	dd if=/dev/zero of=$DIR/$tdir/f2 bs=4096 count=$bcount

	local fid0=$($LFS path2fid $DIR/$tdir/f0)
	local fid1=$($LFS path2fid $DIR/$tdir/f1)
	local fid2=$($LFS path2fid $DIR/$tdir/f2)

	$LFS getstripe $DIR/$tdir/f0
	$LFS getstripe $DIR/$tdir/f1
	$LFS getstripe $DIR/$tdir/f2

	cancel_lru_locks mdc
	cancel_lru_locks osc

	# Each file has to be unlinked while its fail_loc/fail_val is set,
	# otherwise nothing is lost and LFSCK has nothing to find.
	# NOTE(review): the three "rm -f" calls mirror the injection sequence
	# used by test_20a - confirm.
	echo "Inject failure..."
	echo "To simulate f0 lost MDT-object"
	#define OBD_FAIL_LFSCK_LOST_MDTOBJ	0x1616
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1616
	rm -f $DIR/$tdir/f0

	echo "To simulate the case of f1 lost MDT-object and "
	echo "the first OST-object in each PFL component"
	#define OBD_FAIL_LFSCK_LOST_SPEOBJ	0x161a
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161a
	rm -f $DIR/$tdir/f1

	echo "To simulate the case of f2 lost MDT-object and "
	echo "the second OST-object in each PFL component"
	do_facet $SINGLEMDS $LCTL set_param fail_val=1
	rm -f $DIR/$tdir/f2

	do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
	$START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query interval is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" 32 ||
			error "(4) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		local cur_status=$(do_facet ost${k} $LCTL get_param -n \
				obdfilter.$(facet_svc ost${k}).lfsck_layout |
				awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(5) OST${k} Expect 'completed', but got '$cur_status'"
	done

	local repaired=$(do_facet mds1 $LCTL get_param -n \
			 mdd.$(facet_svc mds1).lfsck_layout |
			 awk '/^repaired_orphan/ { print $2 }')
	[ $repaired -eq 8 ] ||
		error "(6) Expect 8 fixed on mds1, but got: $repaired"

	#
	# ${fid0}-R-0 is the old f0
	#
	local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
	echo "Check $name, which is the old f0"

	$LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"

	local pattern=$($LFS getstripe -L -I1 $name)
	[[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
		error "(7.2.1) NOT expect pattern flag hole, but got $pattern"

	pattern=$($LFS getstripe -L -I2 $name)
	[[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
		error "(7.2.2) NOT expect pattern flag hole, but got $pattern"

	local stripes=$($LFS getstripe -c -I1 $name)
	[ $stripes -eq 2 ] ||
		error "(7.3.1) expect 2 stripes, but got $stripes"

	stripes=$($LFS getstripe -c -I2 $name)
	[ $stripes -eq 2 ] ||
		error "(7.3.2) expect 2 stripes, but got $stripes"

	local e_start=$($LFS getstripe -I1 $name |
			awk '/lcme_extent.e_start:/ { print $2 }')
	[ $e_start -eq 0 ] ||
		error "(7.4.1) expect the COMP1 start at 0, got $e_start"

	local e_end=$($LFS getstripe -I1 $name |
		      awk '/lcme_extent.e_end:/ { print $2 }')
	[ $e_end -eq 2097152 ] ||
		error "(7.4.2) expect the COMP1 end at 2097152, got $e_end"

	e_start=$($LFS getstripe -I2 $name |
		  awk '/lcme_extent.e_start:/ { print $2 }')
	[ $e_start -eq 2097152 ] ||
		error "(7.5.1) expect the COMP2 start at 2097152, got $e_start"

	e_end=$($LFS getstripe -I2 $name |
		awk '/lcme_extent.e_end:/ { print $2 }')
	[ "$e_end" = "EOF" ] ||
		error "(7.5.2) expect the COMP2 end at (EOF), got $e_end"

	local size=$(stat $name | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(7.6) expect the size $((4096 * $bcount)), but got $size"

	cat $name > /dev/null || error "(7.7) cannot read $name"

	echo "dummy" >> $name || error "(7.8) cannot write $name"

	chown $RUNAS_ID:$RUNAS_GID $name || error "(7.9) cannot chown on $name"

	touch $name || error "(7.10) cannot touch $name"

	rm -f $name || error "(7.11) cannot unlink $name"

	#
	# ${fid1}-R-0 contains the old f1's second stripe in each COMP
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
	echo "Check $name, it contains f1's second OST-object in each COMP"

	$LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"

	pattern=$($LFS getstripe -L -I1 $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(8.2.1) expect pattern flag hole, but got $pattern"

	pattern=$($LFS getstripe -L -I2 $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(8.2.2) expect pattern flag hole, but got $pattern"

	stripes=$($LFS getstripe -c -I1 $name)
	[ $stripes -eq 2 ] ||
		error "(8.3.1) expect 2 stripes, but got $stripes"

	stripes=$($LFS getstripe -c -I2 $name)
	[ $stripes -eq 2 ] ||
		error "(8.3.2) expect 2 stripes, but got $stripes"

	e_start=$($LFS getstripe -I1 $name |
		  awk '/lcme_extent.e_start:/ { print $2 }')
	[ $e_start -eq 0 ] ||
		error "(8.4.1) expect the COMP1 start at 0, got $e_start"

	e_end=$($LFS getstripe -I1 $name |
		awk '/lcme_extent.e_end:/ { print $2 }')
	[ $e_end -eq 2097152 ] ||
		error "(8.4.2) expect the COMP1 end at 2097152, got $e_end"

	e_start=$($LFS getstripe -I2 $name |
		  awk '/lcme_extent.e_start:/ { print $2 }')
	[ $e_start -eq 2097152 ] ||
		error "(8.5.1) expect the COMP2 start at 2097152, got $e_start"

	e_end=$($LFS getstripe -I2 $name |
		awk '/lcme_extent.e_end:/ { print $2 }')
	[ "$e_end" = "EOF" ] ||
		error "(8.5.2) expect the COMP2 end at (EOF), got $e_end"

	size=$(stat $name | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(8.6) expect the size $((4096 * $bcount)), but got $size"

	cat $name > /dev/null && error "(8.7) normal read $name should fail"

	# conv=noerror lets dd continue past failed reads; count the failures.
	local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
			 bs=4096 2>&1 | grep "Input/output error" | wc -l)

	# The first stripe in each COMP was lost
	[ $failures -eq 512 ] ||
		error "(8.8) expect 512 IO failures, but get $failures"

	size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(8.9) expect the size $((4096 * $bcount)), but got $size"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
		error "(8.10) write to the LOV EA hole should fail"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
		error "(8.11) write to normal stripe should NOT fail"

	echo "foo" >> $name && error "(8.12) append write $name should fail"

	chown $RUNAS_ID:$RUNAS_GID $name || error "(8.13) cannot chown on $name"

	touch $name || error "(8.14) cannot touch $name"

	rm -f $name || error "(8.15) cannot unlink $name"

	#
	# ${fid2}-R-0 contains the old f2's first stripe in each COMP
	#
	name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
	echo "Check $name, it contains f2's first stripe in each COMP"

	$LFS getstripe -v $name || error "(9.1) cannot getstripe on $name"

	pattern=$($LFS getstripe -L -I1 $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(9.2.1) expect pattern flag hole, but got $pattern"

	pattern=$($LFS getstripe -L -I2 $name)
	[[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
		error "(9.2.2) expect pattern flag hole, but got $pattern"

	stripes=$($LFS getstripe -c -I1 $name)
	[ $stripes -eq 2 ] ||
		error "(9.3.1) expect 2 stripes, but got $stripes"

	stripes=$($LFS getstripe -c -I2 $name)
	[ $stripes -eq 2 ] ||
		error "(9.3.2) expect 2 stripes, but got $stripes"

	e_start=$($LFS getstripe -I1 $name |
		  awk '/lcme_extent.e_start:/ { print $2 }')
	[ $e_start -eq 0 ] ||
		error "(9.4.1) expect the COMP1 start at 0, got $e_start"

	e_end=$($LFS getstripe -I1 $name |
		awk '/lcme_extent.e_end:/ { print $2 }')
	[ $e_end -eq 2097152 ] ||
		error "(9.4.2) expect the COMP1 end at 2097152, got $e_end"

	e_start=$($LFS getstripe -I2 $name |
		  awk '/lcme_extent.e_start:/ { print $2 }')
	[ $e_start -eq 2097152 ] ||
		error "(9.5.1) expect the COMP2 start at 2097152, got $e_start"

	e_end=$($LFS getstripe -I2 $name |
		awk '/lcme_extent.e_end:/ { print $2 }')
	[ "$e_end" = "EOF" ] ||
		error "(9.5.2) expect the COMP2 end at (EOF), got $e_end"

	size=$(stat $name | awk '/Size:/ { print $2 }')
	# The second stripe in COMP was lost, so we do not know there
	# have ever been some data before. 'stat' will regard it as
	# no data on the lost stripe.
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(9.6) expect size $((4096 * $bcount)), but got $size"

	cat $name > /dev/null &&
		error "(9.7) normal read $name should fail"

	failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
		   bs=4096 2>&1 | grep "Input/output error" | wc -l)
	[ $failures -eq 512 ] ||
		error "(9.8) expect 512 IO failures, but get $failures"

	size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
	# The second stripe in COMP was lost, so we do not know there
	# have ever been some data before. Since 'dd' skip failure,
	# it will regard the lost stripe contains data.
	[ $size -eq $((4096 * $bcount)) ] ||
		error "(9.9) expect the size $((4096 * $bcount)), but got $size"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
		seek=300 && error "(9.10) write to the LOV EA hole should fail"

	dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
		error "(9.11) write to normal stripe should NOT fail"

	echo "foo" >> $name &&
		error "(9.12) append write $name should fail"

	chown $RUNAS_ID:$RUNAS_GID $name ||
		error "(9.13) cannot chown on $name"

	touch $name || error "(9.14) cannot touch $name"

	rm -f $name || error "(9.15) cannot unlink $name"
}
3649 run_test 20b "Handle the orphan with dummy LOV EA slot properly - PFL case"
# test_21: start LFSCK on MDT0000 without naming a component and expect both
# the namespace and the layout component to report "scanning-phase1"; then
# stop LFSCK again.  Skipped when the MDS is older than 2.5.59.
test_21() {
	if [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.5.59) ]] &&
	   skip "ignore the test if MDS is older than 2.5.59"; then
		return
	fi

	check_mount_and_prep
	if ! createmany -o $DIR/$tdir/f 100; then
		error "(0) Fail to create 100 files"
	fi

	echo "Start all LFSCK components by default (-s 1)"
	if ! do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -s 1 -r; then
		error "Fail to start LFSCK"
	fi

	local state

	echo "namespace LFSCK should be in 'scanning-phase1' status"
	state=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
	if [ "$state" != "scanning-phase1" ]; then
		error "Expect namespace 'scanning-phase1', but got '$state'"
	fi

	echo "layout LFSCK should be in 'scanning-phase1' status"
	state=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
	if [ "$state" != "scanning-phase1" ]; then
		error "Expect layout 'scanning-phase1', but got '$state'"
	fi

	echo "Stop all LFSCK components by default"
	if ! do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000; then
		error "Fail to stop LFSCK"
	fi
}
3676 run_test 21 "run all LFSCK components by default"
# test_22a: namespace LFSCK repairs a directory whose ".." entry points at a
# parent that no longer exists.
#
# "dummy" is created under foo while fail_loc 0x161e
# (OBD_FAIL_LFSCK_BAD_PARENT) is set on $SINGLEMDS, then "guard" is removed.
# After "lfsck_start -t namespace -A -r" the unmatched_pairs_repaired
# counter must be 1 and "ls" on dummy must succeed.  Needs at least 2 MDTs.
test_22a() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return

	# The inner double quotes are escaped so that ".." is printed quoted,
	# instead of closing and reopening the string.
	echo "The parent_A references the child directory via some name entry,"
	echo "but the child directory back references another parent_B via its"
	echo "\"..\" name entry. The parent_B does not exist. Then the namespace"
	echo "LFSCK will repair the child directory's \"..\" name entry."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
	$LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"

	echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
	echo "The dummy's dotdot name entry references the guard."
	#define OBD_FAIL_LFSCK_BAD_PARENT	0x161e
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
	$LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
		error "(3) Fail to mkdir on MDT0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"

	echo "Trigger namespace LFSCK to repair unmatched pairs"
	$START_NAMESPACE -A -r ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^unmatched_pairs_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(7) Fail to repair unmatched pairs: $repaired"

	echo "'ls' should success after namespace LFSCK repairing"
	ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
		error "(8) ls should success."
}
3718 run_test 22a "LFSCK can repair unmatched pairs (1)"
# test_22b: namespace LFSCK repairs a directory whose ".." entry and linkEA
# point at a parent that exists but holds no matching name entry.
#
# "dummy" is created under foo while fail_loc 0x161e
# (OBD_FAIL_LFSCK_BAD_PARENT) is set on $SINGLEMDS.  Before LFSCK,
# "lfs fid2path" on dummy's FID must not yield its real path; after
# "lfsck_start -t namespace -A -r" the unmatched_pairs_repaired counter must
# be 1 and fid2path must return the real path.  Needs at least 2 MDTs.
test_22b() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return

	# The inner double quotes are escaped so that ".." is printed quoted,
	# instead of closing and reopening the string.
	echo "The parent_A references the child directory via the name entry_B,"
	echo "but the child directory back references another parent_C via its"
	echo "\"..\" name entry. The parent_C exists, but there is no the name"
	echo "entry_B under the parent_C. Then the namespace LFSCK will repair"
	echo "the child directory's \"..\" name entry and its linkEA."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
	$LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"

	echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
	echo "and bad linkEA. The dummy's dotdot name entry references the"
	echo "guard. The dummy's linkEA references n non-exist name entry."
	#define OBD_FAIL_LFSCK_BAD_PARENT	0x161e
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
	$LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
		error "(3) Fail to mkdir on MDT0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	local dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
	echo "fid2path should NOT work on the dummy's FID $dummyfid"
	local dummyname=$($LFS fid2path $DIR $dummyfid)
	[ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
		error "(4) fid2path works unexpectedly."

	echo "Trigger namespace LFSCK to repair unmatched pairs"
	$START_NAMESPACE -A -r ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^unmatched_pairs_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(7) Fail to repair unmatched pairs: $repaired"

	echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
	# dummyname is already local; just refresh it.
	dummyname=$($LFS fid2path $DIR $dummyfid)
	[ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
		error "(8) fid2path does not work"
}
3767 run_test 22b "LFSCK can repair unmatched pairs (2)"
# test_23a: namespace LFSCK handling of a dangling name entry, i.e. a name
# whose MDT-object is gone.
#
# d0/d1 is removed while fail_loc 0x1620 (OBD_FAIL_LFSCK_DANGLING2) is set
# on mds2.  A first LFSCK run ("-A -r") must count one dangling_repaired but
# leave "ls" failing; a second run with "-C" added must make "ls" succeed.
# Needs at least 2 MDTs.
test_23a() {
	if [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"; then
		return
	fi

	echo "The name entry is there, but the MDT-object for such name "
	echo "entry does not exist. The namespace LFSCK should find out "
	echo "and repair the inconsistency as required."

	check_mount_and_prep

	local parent=$DIR/$tdir/d0
	local child=$parent/d1
	local fixed

	$LFS mkdir -i 0 $parent || error "(1) Fail to mkdir d0 on MDT0"
	$LFS mkdir -i 1 $child || error "(2) Fail to mkdir d1 on MDT1"

	echo "Inject failure stub on MDT1 to simulate dangling name entry"
	#define OBD_FAIL_LFSCK_DANGLING2	0x1620
	do_facet mds2 $LCTL set_param fail_loc=0x1620
	rmdir $child || error "(3) Fail to rmdir d1"
	do_facet mds2 $LCTL set_param fail_loc=0

	echo "'ls' should fail because of dangling name entry"
	if ls -ail $child > /dev/null 2>&1; then
		error "(4) ls should fail."
	fi

	echo "Trigger namespace LFSCK to find out dangling name entry"
	if ! $START_NAMESPACE -A -r; then
		error "(5) Fail to start LFSCK for namespace"
	fi

	wait_all_targets_blocked namespace completed 6

	fixed=$($SHOW_NAMESPACE | awk '/^dangling_repaired/ { print $2 }')
	if ! [ $fixed -eq 1 ]; then
		error "(7) Fail to repair dangling name entry: $fixed"
	fi

	echo "'ls' should fail because not re-create MDT-object by default"
	if ls -ail $child > /dev/null 2>&1; then
		error "(8) ls should fail."
	fi

	echo "Trigger namespace LFSCK again to repair dangling name entry"
	if ! $START_NAMESPACE -A -r -C; then
		error "(9) Fail to start LFSCK for namespace"
	fi

	wait_all_targets_blocked namespace completed 10

	fixed=$($SHOW_NAMESPACE | awk '/^dangling_repaired/ { print $2 }')
	if ! [ $fixed -eq 1 ]; then
		error "(11) Fail to repair dangling name entry: $fixed"
	fi

	echo "'ls' should success after namespace LFSCK repairing"
	ls -ail $child > /dev/null || error "(12) ls should success."
}
3820 run_test 23a "LFSCK can repair dangling name entry (1)"
# test_23b: namespace LFSCK on a hard link whose name entry points at a
# non-existent object.
#
# f0 and f1 are created (f0 is re-created once if its FID sequence differs
# from f1's).  A hard link "foo" to f0 is made while fail_loc 0x1621
# (OBD_FAIL_LFSCK_DANGLING3) is set with fail_val = f1's OID, then f1 is
# unlinked so that "ls" on foo fails.  After "lfsck_start -t namespace -r -C"
# completes, dangling_repaired and multiple_linked_repaired must both be 1
# and foo must read back f0's content.
test_23b() {
	echo "The objectA has multiple hard links, one of them corresponding"
	echo "to the name entry_B. But there is something wrong for the name"
	echo "entry_B and cause entry_B to references non-exist object_C."
	echo "In the first-stage scanning, the LFSCK will think the entry_B"
	echo "as dangling, and re-create the lost object_C. When the LFSCK"
	echo "comes to the second-stage scanning, it will find that the"
	echo "former re-creating object_C is not proper, and will try to"
	echo "replace the object_C with the real object_A."

	check_mount_and_prep

	local base=$DIR/$tdir/d0

	$LFS mkdir -i 0 $base || error "(1) Fail to mkdir d0 on MDT0"
	$LFS path2fid $base

	createmany -o $base/t 10 || error "(1.5) Fail to creatmany"

	echo "dummy" > $base/f0 || error "(2) Fail to touch on MDT0"
	$LFS path2fid $base/f0

	echo "dead" > $base/f1 || error "(3) Fail to touch on MDT0"
	$LFS path2fid $base/f1

	local seq_f0=$($LFS path2fid $base/f0 | awk -F':' '{print $1}')
	local seq_f1=$($LFS path2fid $base/f1 | awk -F':' '{print $1}')

	if [ "$seq_f0" != "$seq_f1" ]; then
		# To guarantee that the f0 and f1 are in the same FID seq
		if ! rm -f $base/f0; then
			error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
		fi
		if ! echo "dummy" > $base/f0; then
			error "(3.2) Fail to touch on MDT0"
		fi
		$LFS path2fid $base/f0
	fi

	# OID field of f1's FID, converted to decimal for fail_val.
	local oid_raw=$($LFS path2fid $base/f1 | awk -F':' '{print $2}')
	local oid_dec=$(printf %d $oid_raw)

	echo "Inject failure stub on MDT0 to simulate dangling name entry"
	#define OBD_FAIL_LFSCK_DANGLING3	0x1621
	do_facet $SINGLEMDS $LCTL set_param fail_val=$oid_dec fail_loc=0x1621
	ln $base/f0 $base/foo || error "(4) Fail to hard link"
	do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0

	# If there is creation after the dangling injection, it may re-use
	# the just released local object (inode) that is referenced by the
	# dangling name entry. It will fail the dangling injection.
	# So before deleting the target object for the dangling name entry,
	# remove some other objects to avoid the target object being reused
	# by some potential creations. LU-7429
	unlinkmany $base/t 10 || error "(5.0) Fail to unlinkmany"

	rm -f $base/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"

	echo "'ls' should fail because of dangling name entry"
	if ls -ail $base/foo > /dev/null 2>&1; then
		error "(6) ls should fail."
	fi

	echo "Trigger namespace LFSCK to find out dangling name entry"
	if ! $START_NAMESPACE -r -C; then
		error "(7) Fail to start LFSCK for namespace"
	fi

	if ! wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32; then
		error "(8) unexpected status"
	fi

	local fixed=$($SHOW_NAMESPACE |
		      awk '/^dangling_repaired/ { print $2 }')
	if ! [ $fixed -eq 1 ]; then
		error "(9) Fail to repair dangling name entry: $fixed"
	fi

	fixed=$($SHOW_NAMESPACE |
		awk '/^multiple_linked_repaired/ { print $2 }')
	if ! [ $fixed -eq 1 ]; then
		error "(10) Fail to drop the former created object: $fixed"
	fi

	local content=$(cat $base/foo)
	[ "$content" == "dummy" ] ||
		error "(11) The $DIR/$tdir/d0/foo is not recovered: $content"
}
3907 run_test 23b "LFSCK can repair dangling name entry (2)"
# NOTE(review): the header of the enclosing definition is not visible in
# this excerpt; these statements look like a cleanup helper - confirm.
# They reset fail_val/fail_loc on $SINGLEMDS, check that the namespace
# LFSCK status on MDT0 reads "completed" (error (10) otherwise), and then
# call stop_full_debug_logging.
3910 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3911 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3912 mdd.${MDT_DEV}.lfsck_namespace |
3913 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3915 error "(10) unexpected status"
3918 stop_full_debug_logging
# Body of the test registered below as 23c (the function header is not
# part of this excerpt). Scenario, as printed by the echo lines: a hard
# link "foo" is made dangling via fail_loc 0x1621, LFSCK re-creates the
# missing object, and the test appends "data" to the re-created file
# before LFSCK can replace it. The test then requires
# dangling_repaired == 1 and that foo does NOT read back as "dummy".
3923 echo "The objectA has multiple hard links, one of them corresponding"
3924 echo "to the name entry_B. But there is something wrong for the name"
3925 echo "entry_B and cause entry_B to references non-exist object_C."
3926 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3927 echo "as dangling, and re-create the lost object_C. And then others"
3928 echo "modified the re-created object_C. When the LFSCK comes to the"
3929 echo "second-stage scanning, it will find that the former re-creating"
3930 echo "object_C maybe wrong and try to replace the object_C with the"
3931 echo "real object_A. But because object_C has been modified, so the"
3932 echo "LFSCK cannot replace it."
3935 start_full_debug_logging
3937 check_mount_and_prep
3939 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3940 parent_fid="$($LFS path2fid $DIR/$tdir/d0)"
3941 echo "parent_fid=$parent_fid"
3943 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
3945 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3946 f0_fid="$($LFS path2fid $DIR/$tdir/d0/f0)"
3947 echo "f0_fid=$f0_fid"
3949 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
3950 f1_fid="$($LFS path2fid $DIR/$tdir/d0/f1)"
3951 echo "f1_fid=$f1_fid"
# Compare the part of each FID that precedes its first ':' (the sequence).
# ${var%%:*} removes the longest suffix starting at a ':'; the operands
# are the f0_fid/f1_fid variables assigned above.
3953 if [ "${f0_fid%%:*}" != "${f1_fid%%:*}" ]; then
3954 # To guarantee that the f0 and f1 are in the same FID seq
3955 rm -f $DIR/$tdir/d0/f0 ||
3956 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
3957 echo "dummy" > $DIR/$tdir/d0/f0 ||
3958 error "(3.2) Fail to touch on MDT0"
3959 f0_fid="$($LFS path2fid $DIR/$tdir/d0/f0)"
3960 echo "f0_fid=$f0_fid (replaced)"
# oid is the second ':'-separated field of f1's FID, used as fail_val.
3963 local oid=$(awk -F':' '{ printf $2 }' <<< $f1_fid)
3965 echo "Inject failure stub on MDT0 to simulate dangling name entry"
3966 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
3967 do_facet $SINGLEMDS $LCTL set_param fail_val=$oid fail_loc=0x1621
3968 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
3969 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3971 # If there is creation after the dangling injection, it may re-use
3972 # the just released local object (inode) that is referenced by the
3973 # dangling name entry. It will fail the dangling injection.
3974 # So before deleting the target object for the dangling name entry,
3975 # remove some other objects to avoid the target object being reused
3976 # by some potential creations. LU-7429
3977 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
3979 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
3981 echo "'ls' should fail because of dangling name entry"
3982 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3983 error "(6) ls should fail."
# NOTE(review): presumably this delays LFSCK so that the write below lands
# before the second-stage scanning - confirm against the fail_loc handler.
3985 #define OBD_FAIL_LFSCK_DELAY3 0x1602
3986 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
3988 echo "Trigger namespace LFSCK to find out dangling name entry"
3989 $START_NAMESPACE -r -C ||
3990 error "(7) Fail to start LFSCK for namespace"
# Expect stat to report size 0 for foo (the freshly re-created object).
# The fallback branch accepts size 6 with content "dummy", i.e. the link
# already points at the original f0 data again.
3992 wait_update_facet client "stat -c%s $DIR/$tdir/d0/foo" "0" $LTIME || {
3993 # While unexpected by the test, it is valid for LFSCK to repair
3994 # the link to the original object before any data is written.
3995 local size=$(stat -c %s $DIR/$tdir/d0/foo)
3997 if [ "$size" = "6" -a "$(<$DIR/$tdir/d0/foo)" = "dummy" ]; then
3998 log "LFSCK repaired file prematurely"
4003 stat $DIR/$tdir/d0/foo
4005 error "(8) unexpected size"
# Modify the re-created object so that LFSCK can no longer replace it.
4008 echo "data" >> $DIR/$tdir/d0/foo || error "(9) Fail to write"
4009 cancel_lru_locks osc
4013 local repaired=$($SHOW_NAMESPACE |
4014 awk '/^dangling_repaired/ { print $2 }')
4015 [ $repaired -eq 1 ] ||
4016 error "(11) Fail to repair dangling name entry: $repaired"
4018 local data=$(cat $DIR/$tdir/d0/foo)
4019 [ "$data" != "dummy" ] ||
4020 error "(12) The $DIR/$tdir/d0/foo should not be recovered"
4022 run_test 23c "LFSCK can repair dangling name entry (3)"
# Body of the test registered below as 24 (the function header is not part
# of this excerpt). Skipped with fewer than 2 MDTs or when FILESET is set.
# Scenario, as printed by the echo lines: with fail_loc 0x1622 set,
# d0/dummy/foo is created and its name entry removed, leaving an
# MDT-object with the same linkEA entry as d0/guard/foo. After a
# namespace LFSCK run the test requires multiple_referenced_repaired == 1
# and an entry named "$cfid-$pfid-D-0" under .lustre/lost+found/MDT0000.
4025 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4026 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4029 echo "Two MDT-objects back reference the same name entry via their"
4030 echo "each own linkEA entry, but the name entry only references one"
4031 echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
4032 echo "for the MDT-object that is not recognized. If such MDT-object"
4033 echo "has no other linkEA entry after the removing, then the LFSCK"
4034 echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."
4037 check_mount_and_prep
4039 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4041 mkdir $DIR/$tdir/d0/guard || error "(1) Fail to mkdir guard"
4042 $LFS path2fid $DIR/$tdir/d0/guard
4044 mkdir $DIR/$tdir/d0/dummy || error "(2) Fail to mkdir dummy"
4045 $LFS path2fid $DIR/$tdir/d0/dummy
# pfid is the parent FID used in the expected orphan name below.
# NOTE(review): the two assignments look like the branches of an
# if/else (guard's FID on non-ldiskfs backends, dummy's FID otherwise),
# but the else/fi lines are not visible in this excerpt - confirm.
4048 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
4049 pfid=$($LFS path2fid $DIR/$tdir/d0/guard)
4051 pfid=$($LFS path2fid $DIR/$tdir/d0/dummy)
4054 touch $DIR/$tdir/d0/guard/foo ||
4055 error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"
4057 echo "Inject failure stub on MDT0 to simulate the case that"
4058 echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
4059 echo "that references $DIR/$tdir/d0/guard/foo."
4060 echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
4061 echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
4062 echo "there with the same linkEA entry as another MDT-object"
4063 echo "$DIR/$tdir/d0/guard/foo has"
4065 #define OBD_FAIL_LFSCK_MUL_REF 0x1622
4066 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
4067 $LFS mkdir -i 0 $DIR/$tdir/d0/dummy/foo ||
4068 error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
4069 $LFS path2fid $DIR/$tdir/d0/dummy/foo
# cfid is recorded before the rmdir; it forms the first part of the
# expected orphan name.
4070 local cfid=$($LFS path2fid $DIR/$tdir/d0/dummy/foo)
4071 rmdir $DIR/$tdir/d0/dummy/foo ||
4072 error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
4073 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4075 echo "stat $DIR/$tdir/d0/dummy/foo should fail"
4076 stat $DIR/$tdir/d0/dummy/foo > /dev/null 2>&1 &&
4077 error "(6) stat successfully unexpectedly"
4079 echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
4080 $START_NAMESPACE -A -r ||
4081 error "(7) Fail to start LFSCK for namespace"
# NOTE(review): the trailing 8 fills the gap in the (N) step numbering -
# presumably the step id reported on failure; confirm against
# wait_all_targets_blocked.
4083 wait_all_targets_blocked namespace completed 8
4085 local repaired=$($SHOW_NAMESPACE |
4086 awk '/^multiple_referenced_repaired/ { print $2 }')
4087 [ $repaired -eq 1 ] ||
4088 error "(9) Fail to repair multiple referenced name entry: $repaired"
4090 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
4091 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4092 error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4094 local cname="$cfid-$pfid-D-0"
4095 ls -ail $MOUNT/.lustre/lost+found/MDT0000/$cname ||
4096 error "(11) .lustre/lost+found/MDT0000/ should not be empty"
4098 run_test 24 "LFSCK can repair multiple-referenced name entry"
# Body of the test registered below as 25 (the function header is not part
# of this excerpt). Skipped unless the $SINGLEMDS backend is ldiskfs.
# d0/foo is created while fail_loc 0x1623 is set (wrong file type in the
# name entry, per the echo lines). After a namespace LFSCK run the test
# requires bad_file_type_repaired == 1 and a working 'ls' of d0.
4101 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
4102 skip "ldiskfs only test" && return
4105 echo "The file type in the name entry does not match the file type"
4106 echo "claimed by the referenced object. Then the LFSCK will update"
4107 echo "the file type in the name entry."
4110 check_mount_and_prep
4112 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4114 echo "Inject failure stub on MDT0 to simulate the case that"
4115 echo "the file type stored in the name entry is wrong."
4117 #define OBD_FAIL_LFSCK_BAD_TYPE 0x1623
4118 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
4119 touch $DIR/$tdir/d0/foo || error "(2) Fail to touch $DIR/$tdir/d0/foo"
4120 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4122 echo "Trigger namespace LFSCK to repair bad file type in the name entry"
4123 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace"
# Check the "status" line of MDT0's lfsck_namespace against "completed".
4125 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4126 mdd.${MDT_DEV}.lfsck_namespace |
4127 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4129 error "(4) unexpected status"
4132 local repaired=$($SHOW_NAMESPACE |
4133 awk '/^bad_file_type_repaired/ { print $2 }')
4134 [ $repaired -eq 1 ] ||
4135 error "(5) Fail to repair bad file type in name entry: $repaired"
4137 ls -ail $DIR/$tdir/d0 || error "(6) Fail to 'ls' the $DIR/$tdir/d0"
4139 run_test 25 "LFSCK can repair bad file type in the name entry"
# Body of the test registered below as 26a (the function header is not
# part of this excerpt). d0/foo gets a second hard link, then foo is
# unlinked while fail_loc 0x1624 is set (name entry removed, object and
# linkEA kept, per the echo lines). After a namespace LFSCK run the test
# requires lost_dirent_repaired == 1, a working 'ls' of d0/foo, and the
# same FID for foo as before the unlink.
4143 echo "The local name entry back referenced by the MDT-object is lost."
4144 echo "The namespace LFSCK will add the missing local name entry back"
4145 echo "to the normal namespace."
4148 check_mount_and_prep
4150 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4151 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
# Remember foo's FID so it can be compared after the repair.
4152 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
4154 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
4155 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4157 echo "Inject failure stub on MDT0 to simulate the case that"
4158 echo "foo's name entry will be removed, but the foo's object"
4159 echo "and its linkEA are kept in the system."
4161 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4162 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4163 rm -f $DIR/$tdir/d0/foo || error "(4) Fail to unlink $DIR/$tdir/d0/foo"
4164 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4166 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4167 error "(5) 'ls' should fail"
4169 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4170 $START_NAMESPACE -r -A ||
4171 error "(6) Fail to start LFSCK for namespace"
4173 wait_all_targets_blocked namespace completed 7
4175 local repaired=$($SHOW_NAMESPACE |
4176 awk '/^lost_dirent_repaired/ { print $2 }')
4177 [ $repaired -eq 1 ] ||
4178 error "(8) Fail to repair lost dirent: $repaired"
4180 ls -ail $DIR/$tdir/d0/foo ||
4181 error "(9) Fail to 'ls' $DIR/$tdir/d0/foo"
4183 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
4184 [ "$foofid" == "$foofid2" ] ||
4185 error "(10) foo's FID changed: $foofid, $foofid2"
4187 run_test 26a "LFSCK can add the missing local name entry back to the namespace"
# Body of the test registered below as 26b (the function header is not
# part of this excerpt). Skipped with fewer than 2 MDTs. d0 is created
# with "-i 1" and d0/foo with "-i 0"; foo is then removed while fail_loc
# 0x1624 is set (name entry removed, object and linkEA kept, per the echo
# lines). After a namespace LFSCK run the test requires
# lost_dirent_repaired == 1, a working 'ls' of d0/foo, and an unchanged
# FID for foo.
4190 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4193 echo "The remote name entry back referenced by the MDT-object is lost."
4194 echo "The namespace LFSCK will add the missing remote name entry back"
4195 echo "to the normal namespace."
4198 check_mount_and_prep
4200 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4201 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
# Remember foo's FID so it can be compared after the repair.
4202 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
4204 echo "Inject failure stub on MDT0 to simulate the case that"
4205 echo "foo's name entry will be removed, but the foo's object"
4206 echo "and its linkEA are kept in the system."
4208 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4209 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4210 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
4211 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4213 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4214 error "(4) 'ls' should fail"
4216 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4217 $START_NAMESPACE -r -A ||
4218 error "(5) Fail to start LFSCK for namespace"
4220 wait_all_targets_blocked namespace completed 6
4222 local repaired=$($SHOW_NAMESPACE |
4223 awk '/^lost_dirent_repaired/ { print $2 }')
4224 [ $repaired -eq 1 ] ||
4225 error "(7) Fail to repair lost dirent: $repaired"
4227 ls -ail $DIR/$tdir/d0/foo ||
4228 error "(8) Fail to 'ls' $DIR/$tdir/d0/foo"
4230 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
4231 [ "$foofid" == "$foofid2" ] ||
4232 error "(9) foo's FID changed: $foofid, $foofid2"
4234 run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
# Body of the test registered below as 27a (the function header is not
# part of this excerpt). Not run when FILESET is set. d0/foo and its hard
# link d0/dummy are unlinked while fail_loc 0x1624 is set (name entry
# removed, object and linkEA kept, per the echo lines), then d0 itself is
# removed. After a namespace LFSCK run the test requires
# lost_dirent_repaired == 1 and at least one "*-P-*" entry under
# .lustre/lost+found/MDT0000/.
4237 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4240 echo "The local parent referenced by the MDT-object linkEA is lost."
4241 echo "The namespace LFSCK will re-create the lost parent as orphan."
4244 check_mount_and_prep
4246 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4247 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4248 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
4249 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4251 echo "Inject failure stub on MDT0 to simulate the case that"
4252 echo "foo's name entry will be removed, but the foo's object"
4253 echo "and its linkEA are kept in the system. And then remove"
4254 echo "another hard link and the parent directory."
4256 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4257 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4258 rm -f $DIR/$tdir/d0/foo ||
4259 error "(4) Fail to unlink $DIR/$tdir/d0/foo"
4260 rm -f $DIR/$tdir/d0/dummy ||
4261 error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
4262 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4264 rm -rf $DIR/$tdir/d0 || error "(5) Fail to unlink the dir d0"
4265 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(6) 'ls' should fail"
4267 echo "Trigger namespace LFSCK to repair the lost parent"
4268 $START_NAMESPACE -r -A ||
4269 error "(6) Fail to start LFSCK for namespace"
4271 wait_all_targets_blocked namespace completed 7
4273 local repaired=$($SHOW_NAMESPACE |
4274 awk '/^lost_dirent_repaired/ { print $2 }')
4275 [ $repaired -eq 1 ] ||
4276 error "(8) Fail to repair lost dirent: $repaired"
4278 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
4279 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4280 error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4282 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# The pattern is quoted so that find receives it verbatim; unquoted, the
# shell could expand it against files in the current directory first.
4284 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-P-*')
4285 [ ! -z "$cname" ] ||
4286 error "(10) .lustre/lost+found/MDT0000/ should not be empty"
4288 run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
# Body of the test registered below as 27b (the function header is not
# part of this excerpt). Skipped with fewer than 2 MDTs; not run when
# FILESET is set. d0 is created with "-i 1" and d0/foo with "-i 0"; foo is
# removed while fail_loc 0x1624 is set (name entry removed, object and
# linkEA kept, per the echo lines), then d0 is removed. After a namespace
# LFSCK run the test requires lost_dirent_repaired == 1 and at least one
# "*-P-*" entry under .lustre/lost+found/MDT0001/.
4291 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4292 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4295 echo "The remote parent referenced by the MDT-object linkEA is lost."
4296 echo "The namespace LFSCK will re-create the lost parent as orphan."
4299 check_mount_and_prep
4301 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4302 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
4304 $LFS path2fid $DIR/$tdir/d0
4306 echo "Inject failure stub on MDT0 to simulate the case that"
4307 echo "foo's name entry will be removed, but the foo's object"
4308 echo "and its linkEA are kept in the system. And then remove"
4309 echo "the parent directory."
4311 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4312 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4313 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
4314 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4316 rmdir $DIR/$tdir/d0 || error "(4) Fail to unlink the dir d0"
4317 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(5) 'ls' should fail"
4319 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4320 $START_NAMESPACE -r -A ||
4321 error "(6) Fail to start LFSCK for namespace"
4323 wait_all_targets_blocked namespace completed 7
4325 local repaired=$($SHOW_NAMESPACE |
4326 awk '/^lost_dirent_repaired/ { print $2 }')
4327 [ $repaired -eq 1 ] ||
4328 error "(8) Fail to repair lost dirent: $repaired"
4330 ls -ail $MOUNT/.lustre/lost+found/
4332 echo "There should be an orphan under .lustre/lost+found/MDT0001/"
4333 [ -d $MOUNT/.lustre/lost+found/MDT0001 ] ||
4334 error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"
4336 ls -ail $MOUNT/.lustre/lost+found/MDT0001/
# The pattern is quoted so that find receives it verbatim; unquoted, the
# shell could expand it against files in the current directory first.
4338 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-P-*')
4339 [ ! -z "$cname" ] ||
4340 error "(10) .lustre/lost+found/MDT0001/ should not be empty"
4342 run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
# Body of the test registered below as 28 (the function header is not part
# of this excerpt). Skipped with fewer than 2 MDTs. d1/a1 and d2/a2 are
# removed while fail_loc 0x1624 is set on mds1 and mds2, then fail_loc
# 0x161c is set on mds2 for the first namespace LFSCK run. That run must
# end with status "partial" on mds1 and "completed" on mds2, with
# lost_dirent_repaired 0 on mds1 and 1 on mds2. A second run with the
# failure cleared must report 1 on mds1 and 0 on mds2.
4345 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4348 echo "The target name entry is lost. The LFSCK should insert the"
4349 echo "orphan MDT-object under .lustre/lost+found/MDTxxxx. But if"
4350 echo "the MDT (on which the orphan MDT-object resides) has ever"
4351 echo "failed to respond some name entry verification during the"
4352 echo "first stage-scanning, then the LFSCK should skip to handle"
4353 echo "orphan MDT-object on this MDT. But other MDTs should not"
4357 check_mount_and_prep
# Each parent directory is created with one MDT index and its children
# with the other.
4358 $LFS mkdir -i 0 $DIR/$tdir/d1
4359 $LFS mkdir -i 1 $DIR/$tdir/d1/a1
4360 $LFS mkdir -i 1 $DIR/$tdir/d1/a2
4362 $LFS mkdir -i 1 $DIR/$tdir/d2
4363 $LFS mkdir -i 0 $DIR/$tdir/d2/a1
4364 $LFS mkdir -i 0 $DIR/$tdir/d2/a2
4366 echo "Inject failure stub on MDT0 to simulate the case that"
4367 echo "d1/a1's name entry will be removed, but the d1/a1's object"
4368 echo "and its linkEA are kept in the system. And the case that"
4369 echo "d2/a2's name entry will be removed, but the d2/a2's object"
4370 echo "and its linkEA are kept in the system."
4372 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4373 do_facet mds1 $LCTL set_param fail_loc=0x1624
4374 do_facet mds2 $LCTL set_param fail_loc=0x1624
4375 rmdir $DIR/$tdir/d1/a1 || error "(1) Fail to rmdir $DIR/$tdir/d1/a1"
4376 rmdir $DIR/$tdir/d2/a2 || error "(2) Fail to rmdir $DIR/$tdir/d2/a2"
4377 do_facet mds1 $LCTL set_param fail_loc=0
4378 do_facet mds2 $LCTL set_param fail_loc=0
4380 cancel_lru_locks mdc
4381 cancel_lru_locks osc
4383 echo "Inject failure, to simulate the MDT0 fail to handle"
4384 echo "MDT1 LFSCK request during the first-stage scanning."
4385 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
4386 do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0
4388 echo "Trigger namespace LFSCK on all devices to find out orphan object"
4389 $START_NAMESPACE -r -A ||
4390 error "(3) Fail to start LFSCK for namespace"
# First run: mds1 is expected to end as "partial", mds2 as "completed".
4392 wait_update_facet mds1 "$LCTL get_param -n \
4393 mdd.$(facet_svc mds1).lfsck_namespace |
4394 awk '/^status/ { print \\\$2 }'" "partial" 32 || {
4395 error "(4) mds1 is not the expected 'partial'"
4398 wait_update_facet mds2 "$LCTL get_param -n \
4399 mdd.$(facet_svc mds2).lfsck_namespace |
4400 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4401 error "(5) mds2 is not the expected 'completed'"
4404 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
4406 local repaired=$(do_facet mds1 $LCTL get_param -n \
4407 mdd.$(facet_svc mds1).lfsck_namespace |
4408 awk '/^lost_dirent_repaired/ { print $2 }')
4409 [ $repaired -eq 0 ] ||
4410 error "(6) Expect 0 fixed on mds1, but got: $repaired"
4412 repaired=$(do_facet mds2 $LCTL get_param -n \
4413 mdd.$(facet_svc mds2).lfsck_namespace |
4414 awk '/^lost_dirent_repaired/ { print $2 }')
4415 [ $repaired -eq 1 ] ||
4416 error "(7) Expect 1 fixed on mds2, but got: $repaired"
# Second run, with the injected failure cleared: the expected counters
# are swapped (1 on mds1, 0 on mds2).
4418 echo "Trigger namespace LFSCK on all devices again to cleanup"
4419 $START_NAMESPACE -r -A ||
4420 error "(8) Fail to start LFSCK for namespace"
4422 wait_all_targets_blocked namespace completed 9
4424 local repaired=$(do_facet mds1 $LCTL get_param -n \
4425 mdd.$(facet_svc mds1).lfsck_namespace |
4426 awk '/^lost_dirent_repaired/ { print $2 }')
4427 [ $repaired -eq 1 ] ||
4428 error "(10) Expect 1 fixed on mds1, but got: $repaired"
4430 repaired=$(do_facet mds2 $LCTL get_param -n \
4431 mdd.$(facet_svc mds2).lfsck_namespace |
4432 awk '/^lost_dirent_repaired/ { print $2 }')
4433 [ $repaired -eq 0 ] ||
4434 error "(11) Expect 0 fixed on mds2, but got: $repaired"
4436 run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
# Body of test 29a (the function header is not part of this excerpt).
# The run_test line at the end is commented out, so this body is not
# registered. A hard link to d0/foo is created while fail_loc 0x1625 is
# set; the test expects stat to report a link count of 3 afterwards,
# nlinks_repaired == 1 after a namespace LFSCK run, and a link count of 2
# once repaired.
4440 echo "The object's nlink attribute is larger than the object's known"
4441 echo "name entries count. The LFSCK will repair the object's nlink"
4442 echo "attribute to match the known name entries count"
4445 check_mount_and_prep
4447 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4448 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4450 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4451 echo "nlink attribute is larger than its name entries count."
4453 #define OBD_FAIL_LFSCK_MORE_NLINK 0x1625
4454 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
4455 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4456 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4457 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Confirm the injection took effect: two names exist, but the link count
# must read 3.
4459 cancel_lru_locks mdc
4460 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
4461 [ $count -eq 3 ] || error "(4) Cannot inject error: $count"
4463 echo "Trigger namespace LFSCK to repair the nlink count"
4464 $START_NAMESPACE -r -A ||
4465 error "(5) Fail to start LFSCK for namespace"
4467 wait_all_targets_blocked namespace completed 6
4469 local repaired=$($SHOW_NAMESPACE |
4470 awk '/^nlinks_repaired/ { print $2 }')
4471 [ $repaired -eq 1 ] ||
4472 error "(7) Fail to repair nlink count: $repaired"
4474 cancel_lru_locks mdc
4475 count=$(stat --format=%h $DIR/$tdir/d0/foo)
4476 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
4478 # Disable 29a, we only allow nlink to be updated if the known linkEA
4479 # entries is larger than nlink count.
4481 #run_test 29a "LFSCK can repair bad nlink count (1)"
# Body of the test registered below as 29b (the function header is not
# part of this excerpt). A hard link to d0/foo is created while fail_loc
# 0x1626 is set; the test expects stat to report a link count of 1
# afterwards, nlinks_repaired == 1 after a namespace LFSCK run, and a
# link count of 2 once repaired.
4485 echo "The object's nlink attribute is smaller than the object's known"
4486 echo "name entries count. The LFSCK will repair the object's nlink"
4487 echo "attribute to match the known name entries count"
4490 check_mount_and_prep
4492 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4493 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4495 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4496 echo "nlink attribute is smaller than its name entries count."
4498 #define OBD_FAIL_LFSCK_LESS_NLINK 0x1626
4499 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
4500 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4501 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4502 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Confirm the injection took effect: two names exist, but the link count
# must read 1.
4504 cancel_lru_locks mdc
4505 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
4506 [ $count -eq 1 ] || error "(4) Cannot inject error: $count"
4508 echo "Trigger namespace LFSCK to repair the nlink count"
4509 $START_NAMESPACE -r -A ||
4510 error "(5) Fail to start LFSCK for namespace"
4512 wait_all_targets_blocked namespace completed 6
4514 local repaired=$($SHOW_NAMESPACE |
4515 awk '/^nlinks_repaired/ { print $2 }')
4516 [ $repaired -eq 1 ] ||
4517 error "(7) Fail to repair nlink count: $repaired"
4519 cancel_lru_locks mdc
4520 count=$(stat --format=%h $DIR/$tdir/d0/foo)
4521 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
4523 run_test 29b "LFSCK can repair bad nlink count (2)"
# Body of the test registered below as 29c (the function header is not
# part of this excerpt). 150 hard links to guard/f0 are created under
# foo/, then 100 of them are removed. With 2 or more MDTs, migrating
# guard is expected to fail both after the creation and after the
# removal (f0's FID unchanged). After a namespace LFSCK run the test
# requires linkea_overflow_cleared == 1 and nlinks_repaired == 0, and
# with 2 or more MDTs the migration must then succeed (f0's FID changes).
# NOTE(review): the fi lines closing the three MDSCOUNT branches are not
# visible in this excerpt.
4528 echo "The namespace LFSCK will create many hard links to the target"
4529 echo "file as to exceed the linkEA size limitation. Under such case"
4530 echo "the linkEA will be marked as overflow that will prevent the"
4531 echo "target file to be migrated. Then remove some hard links to"
4532 echo "make the left hard links to be held within the linkEA size"
4533 echo "limitation. But before the namespace LFSCK adding all the"
4534 echo "missed linkEA entries back, the overflow mark (timestamp)"
4535 echo "will not be cleared."
4538 check_mount_and_prep
4540 mkdir -p $DIR/$tdir/guard || error "(0.1) Fail to mkdir"
4541 $LFS mkdir -i $((MDSCOUNT - 1)) $DIR/$tdir/foo ||
4542 error "(0.2) Fail to mkdir"
4543 touch $DIR/$tdir/guard/f0 || error "(1) Fail to create"
# oldfid is the reference for the "did migration happen" checks below.
4544 local oldfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4546 # define MAX_LINKEA_SIZE 4096
4547 # sizeof(link_ea_header) = 24
4548 # sizeof(link_ea_entry) = 18
4549 # nlink_min=$(((MAX_LINKEA_SIZE - sizeof(link_ea_header)) /
4550 # (sizeof(link_ea_entry) + name_length))
4551 # If the average name length is 12 bytes, then 150 hard links
4552 # is totally enough to overflow the linkEA
4553 echo "Create 150 hard links should succeed although the linkEA overflow"
4554 createmany -l $DIR/$tdir/guard/f0 $DIR/$tdir/foo/ttttttttttt 150 ||
4555 error "(2) Fail to hard link"
4557 cancel_lru_locks mdc
4558 if [ $MDSCOUNT -ge 2 ]; then
4559 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null &&
4560 error "(3.1) Migrate should fail"
4562 echo "The object with linkEA overflow should NOT be migrated"
4563 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4564 [ "$newfid" == "$oldfid" ] ||
4565 error "(3.2) Migrate should fail: $newfid != $oldfid"
4568 # Remove 100 hard links, then the linkEA should have space
4569 # to hold the missed linkEA entries.
4570 echo "Remove 100 hard links to save space for the missed linkEA entries"
4571 unlinkmany $DIR/$tdir/foo/ttttttttttt 100 || error "(4) Fail to unlink"
4573 if [ $MDSCOUNT -ge 2 ]; then
4574 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null &&
4575 error "(5.1) Migrate should fail"
4577 # The overflow timestamp is still there, so migration will fail.
4578 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4579 [ "$newfid" == "$oldfid" ] ||
4580 error "(5.2) Migrate should fail: $newfid != $oldfid"
4583 # sleep 3 seconds to guarantee that the overflow is recognized
4586 echo "Trigger namespace LFSCK to clear the overflow timestamp"
4587 $START_NAMESPACE -r -A ||
4588 error "(6) Fail to start LFSCK for namespace"
4590 wait_all_targets_blocked namespace completed 7
4592 local repaired=$($SHOW_NAMESPACE |
4593 awk '/^linkea_overflow_cleared/ { print $2 }')
4594 [ $repaired -eq 1 ] ||
4595 error "(8) Fail to clear linkea overflow: $repaired"
4597 repaired=$($SHOW_NAMESPACE |
4598 awk '/^nlinks_repaired/ { print $2 }')
4599 [ $repaired -eq 0 ] ||
4600 error "(9) Unexpected nlink repaired: $repaired"
4602 if [ $MDSCOUNT -ge 2 ]; then
4603 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
4604 error "(10.1) Migrate failure"
4606 # Migration should succeed after clear the overflow timestamp.
4607 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4608 [ "$newfid" != "$oldfid" ] ||
4609 error "(10.2) Migrate should succeed"
4611 ls -l $DIR/$tdir/foo > /dev/null ||
4612 error "(11) 'ls' failed after migration"
4615 rm -f $DIR/$tdir/guard/f0 || error "(12) Fail to unlink f0"
4616 rm -rf $DIR/$tdir/foo || error "(13) Fail to rmdir foo"
4618 run_test 29c "verify linkEA size limitation"
# Body of the test registered below as 30 (the function header is not part
# of this excerpt). Skipped unless the $SINGLEMDS backend is ldiskfs; not
# run when FILESET is set. foo/d0 is created while fail_loc 0x161d is set
# (no linkEA, per the echo lines); d0/d1, d0/f1, d0 and foo/f0 are then
# removed while fail_loc 0x1624 is set (name entries removed, objects
# kept). The client is unmounted, the MDT stopped, e2fsck run on the MDT
# device, and the MDT restarted. After a namespace LFSCK run the test
# requires local_lost_found_moved >= 4, foo/f0 to be back, and d0 (with
# d1 and f1 inside) to exist as "${cfid}-${pfid}-D-0" under
# .lustre/lost+found/MDT0000/.
4621 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
4622 skip "ldiskfs only test" && return
4623 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4626 echo "The namespace LFSCK will move the orphans from backend"
4627 echo "/lost+found directory to normal client visible namespace"
4628 echo "or to global visible ./lustre/lost+found/MDTxxxx/ directory"
4631 check_mount_and_prep
4633 $LFS mkdir -i 0 $DIR/$tdir/foo || error "(1) Fail to mkdir foo"
# The message names f0, the file actually created here.
4634 touch $DIR/$tdir/foo/f0 || error "(2) Fail to touch f0"
4636 echo "Inject failure stub on MDT0 to simulate the case that"
4637 echo "directory d0 has no linkEA entry, then the LFSCK will"
4638 echo "move it into .lustre/lost+found/MDTxxxx/ later."
4640 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
4641 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
4642 mkdir $DIR/$tdir/foo/d0 || error "(3) Fail to mkdir d0"
4643 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Parent and child FIDs form the expected orphan name checked at the end.
4645 local pfid=$($LFS path2fid $DIR/$tdir/foo)
4646 local cfid=$($LFS path2fid $DIR/$tdir/foo/d0)
4648 touch $DIR/$tdir/foo/d0/f1 || error "(4) Fail to touch f1"
4649 mkdir $DIR/$tdir/foo/d0/d1 || error "(5) Fail to mkdir d1"
4651 echo "Inject failure stub on MDT0 to simulate the case that the"
4652 echo "object's name entry will be removed, but not destroy the"
4653 echo "object. Then backend e2fsck will handle it as orphan and"
4654 echo "add them into the backend /lost+found directory."
4656 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4657 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4658 rmdir $DIR/$tdir/foo/d0/d1 || error "(6) Fail to rmdir d1"
4659 rm -f $DIR/$tdir/foo/d0/f1 || error "(7) Fail to unlink f1"
4660 rmdir $DIR/$tdir/foo/d0 || error "(8) Fail to rmdir d0"
4661 rm -f $DIR/$tdir/foo/f0 || error "(9) Fail to unlink f0"
4662 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4664 umount_client $MOUNT || error "(10) Fail to stop client!"
4666 stop $SINGLEMDS || error "(11) Fail to stop MDT0"
4669 run_e2fsck $(facet_host $SINGLEMDS) $MDT_DEVNAME "-y" ||
4670 error "(12) Fail to run e2fsck"
4672 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
4673 error "(13) Fail to start MDT0"
4675 echo "Trigger namespace LFSCK to recover backend orphans"
4676 $START_NAMESPACE -r -A ||
4677 error "(14) Fail to start LFSCK for namespace"
4679 wait_all_targets_blocked namespace completed 15
4681 local repaired=$($SHOW_NAMESPACE |
4682 awk '/^local_lost_found_moved/ { print $2 }')
4683 [ $repaired -ge 4 ] ||
4684 error "(16) Fail to recover backend orphans: $repaired"
4686 mount_client $MOUNT || error "(17) Fail to start client!"
4688 stat $DIR/$tdir/foo/f0 || error "(18) f0 is not recovered"
4690 ls -ail $MOUNT/.lustre/lost+found/
4692 echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
4693 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4694 error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4696 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
4698 local cname=$MOUNT/.lustre/lost+found/MDT0000/${cfid}-${pfid}-D-0
# cname is always a non-empty string, so check (20) tests that the orphan
# directory really exists rather than that the variable is set.
4699 [ -d "$cname" ] || error "(20) d0 is not recovered"
4701 stat ${cname}/d1 || error "(21) d1 is not recovered"
4702 stat ${cname}/f1 || error "(22) f1 is not recovered"
4704 run_test 30 "LFSCK can recover the orphans from backend /lost+found"
# Body of the test registered below as 31a (the function header is not
# part of this excerpt). Skipped with fewer than 2 MDTs. A directory
# striped over $MDSCOUNT MDTs is created, then $MDSCOUNT sub-directories
# are created in it while fail_loc 0x1628 / fail_val 0 is set through
# the local $LCTL (not via do_facet). After a namespace LFSCK run the
# test requires name_hash_repaired >= 1, remounts the client, and checks
# that every d$i can be stat'ed and removed, followed by the striped
# directory itself.
4707 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4710 echo "For the name entry under a striped directory, if the name"
4711 echo "hash does not match the shard, then the LFSCK will repair"
4712 echo "the bad name entry"
4715 check_mount_and_prep
4717 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4718 error "(1) Fail to create striped directory"
4720 echo "Inject failure stub on client to simulate the case that"
4721 echo "some name entry should be inserted into other non-first"
4722 echo "shard, but inserted into the first shard by wrong"
4724 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
4725 $LCTL set_param fail_loc=0x1628 fail_val=0
4726 createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
4727 error "(2) Fail to create file under striped directory"
4728 $LCTL set_param fail_loc=0 fail_val=0
4730 echo "Trigger namespace LFSCK to repair bad name hash"
4731 $START_NAMESPACE -r -A ||
4732 error "(3) Fail to start LFSCK for namespace"
4734 wait_all_targets_blocked namespace completed 4
4736 local repaired=$($SHOW_NAMESPACE |
4737 awk '/^name_hash_repaired/ { print $2 }')
4738 [ $repaired -ge 1 ] ||
4739 error "(5) Fail to repair bad name hash: $repaired"
# The client is unmounted and mounted again before the entries are
# looked up.
4741 umount_client $MOUNT || error "(6) umount failed"
4742 mount_client $MOUNT || error "(7) mount failed"
# NOTE(review): the "done" closing this loop is not visible in this
# excerpt.
4744 for ((i = 0; i < $MDSCOUNT; i++)); do
4745 stat $DIR/$tdir/striped_dir/d$i ||
4746 error "(8) Fail to stat d$i after LFSCK"
4747 rmdir $DIR/$tdir/striped_dir/d$i ||
4748 error "(9) Fail to unlink d$i after LFSCK"
4751 rmdir $DIR/$tdir/striped_dir ||
4752 error "(10) Fail to remove the striped directory after LFSCK"
4754 run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
# Body of the test registered as "31b" by the run_test call that ends this
# block.
# NOTE(review): the enclosing function header, the loop terminator and the
# closing brace are not visible in this listing (presumably `test_31b() {`,
# `done`, `}`) -- confirm against the full file.
#
# Same scenario as the "31a" body, but the fail_loc 0x1628 is armed with
# fail_val=1 and the name_hash_repaired counter is read from the
# lfsck_namespace parameter of facet mds2 instead of via $SHOW_NAMESPACE.
4757 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4760 echo "For the name entry under a striped directory, if the name"
4761 echo "hash does not match the shard, then the LFSCK will repair"
4762 echo "the bad name entry"
4765 check_mount_and_prep
4767 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4768 error "(1) Fail to create striped directory"
4770 echo "Inject failure stub on client to simulate the case that"
4771 echo "some name entry should be inserted into other non-second"
4772 echo "shard, but inserted into the secod shard by wrong"
4774 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
4775 $LCTL set_param fail_loc=0x1628 fail_val=1
4776 createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
4777 error "(2) Fail to create file under striped directory"
4778 $LCTL set_param fail_loc=0 fail_val=0
4780 echo "Trigger namespace LFSCK to repair bad name hash"
4781 $START_NAMESPACE -r -A ||
4782 error "(3) Fail to start LFSCK for namespace"
# NOTE(review): the trailing "4" is presumably the step id reported by the
# helper on failure (no explicit "(4)" error exists in this block) -- confirm.
4784 wait_all_targets_blocked namespace completed 4
# The repair counter is taken from mds2, not from $SINGLEMDS.
4786 local repaired=$(do_facet mds2 $LCTL get_param -n \
4787 mdd.$(facet_svc mds2).lfsck_namespace |
4788 awk '/^name_hash_repaired/ { print $2 }')
4789 [ $repaired -ge 1 ] ||
4790 error "(5) Fail to repair bad name hash: $repaired"
4792 umount_client $MOUNT || error "(6) umount failed"
4793 mount_client $MOUNT || error "(7) mount failed"
4795 for ((i = 0; i < $MDSCOUNT; i++)); do
4796 stat $DIR/$tdir/striped_dir/d$i ||
4797 error "(8) Fail to stat d$i after LFSCK"
4798 rmdir $DIR/$tdir/striped_dir/d$i ||
4799 error "(9) Fail to unlink d$i after LFSCK"
4802 rmdir $DIR/$tdir/striped_dir ||
4803 error "(10) Fail to remove the striped directory after LFSCK"
4805 run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
# Body of the test registered as "31c" by the run_test call that ends this
# block.
# NOTE(review): the enclosing function header and closing brace are not
# visible in this listing (presumably `test_31c() {` ... `}`) -- confirm.
#
# Flow: skip unless MDSCOUNT >= 2; arm fail_loc=0x1629
# (OBD_FAIL_LFSCK_LOST_MASTER_LMV) on $SINGLEMDS while the striped directory
# is created, then clear it; run namespace LFSCK with -r -A and require
# striped_dirs_repaired to be exactly 1; remount the client, require that
# listing the directory prints nothing, and remove it.
4808 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4811 echo "For some reason, the master MDT-object of the striped directory"
4812 echo "may lost its master LMV EA. If nobody created files under the"
4813 echo "master directly after the master LMV EA lost, then the LFSCK"
4814 echo "should re-generate the master LMV EA."
4817 check_mount_and_prep
4819 echo "Inject failure stub on MDT0 to simulate the case that the"
4820 echo "master MDT-object of the striped directory lost the LMV EA."
4822 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
4823 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
4824 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4825 error "(1) Fail to create striped directory"
4826 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4828 echo "Trigger namespace LFSCK to re-generate master LMV EA"
4829 $START_NAMESPACE -r -A ||
4830 error "(2) Fail to start LFSCK for namespace"
# NOTE(review): the trailing "3" is presumably the step id reported by the
# helper on failure (no explicit "(3)" error exists in this block) -- confirm.
4832 wait_all_targets_blocked namespace completed 3
4834 local repaired=$($SHOW_NAMESPACE |
4835 awk '/^striped_dirs_repaired/ { print $2 }')
4836 [ $repaired -eq 1 ] ||
4837 error "(4) Fail to re-generate master LMV EA: $repaired"
4839 umount_client $MOUNT || error "(5) umount failed"
4840 mount_client $MOUNT || error "(6) mount failed"
# Any output from ls here is treated as proof that the repair did not work.
4842 local empty=$(ls $DIR/$tdir/striped_dir/)
4843 [ -z "$empty" ] || error "(7) The master LMV EA is not repaired: $empty"
4845 rmdir $DIR/$tdir/striped_dir ||
4846 error "(8) Fail to remove the striped directory after LFSCK"
4848 run_test 31c "Re-generate the lost master LMV EA for striped directory"
# Body of the test registered as "31d" by the run_test call that ends this
# block.
# NOTE(review): the enclosing function header and closing brace are not
# visible in this listing (presumably `test_31d() {` ... `}`) -- confirm.
#
# Flow: skip unless MDSCOUNT >= 2; create the striped directory while
# fail_loc=0x1629 (OBD_FAIL_LFSCK_LOST_MASTER_LMV) is armed on $SINGLEMDS;
# remount the client and create "dummy" inside the directory; run namespace
# LFSCK with -r -A and require striped_dirs_repaired to be 0; then verify
# that "dummy" is still reachable, that a new create is refused, and that
# the directory can be removed after `chattr -i`.
4851 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4854 echo "For some reason, the master MDT-object of the striped directory"
4855 echo "may lost its master LMV EA. If somebody created files under the"
4856 echo "master directly after the master LMV EA lost, then the LFSCK"
4857 echo "should NOT re-generate the master LMV EA, instead, it should"
4858 echo "change the broken striped dirctory as read-only to prevent"
4859 echo "further damage"
4862 check_mount_and_prep
4864 echo "Inject failure stub on MDT0 to simulate the case that the"
4865 echo "master MDT-object of the striped directory lost the LMV EA."
4867 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
4868 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
4869 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4870 error "(1) Fail to create striped directory"
4871 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0
4873 umount_client $MOUNT || error "(2) umount failed"
4874 mount_client $MOUNT || error "(3) mount failed"
4876 touch $DIR/$tdir/striped_dir/dummy ||
4877 error "(4) Fail to touch under broken striped directory"
4879 echo "Trigger namespace LFSCK to find out the inconsistency"
4880 $START_NAMESPACE -r -A ||
4881 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): the trailing "6" is presumably the step id reported by the
# helper on failure (no explicit "(6)" error exists in this block) -- confirm.
4883 wait_all_targets_blocked namespace completed 6
# Zero repairs is the expected outcome in this scenario.
4885 local repaired=$($SHOW_NAMESPACE |
4886 awk '/^striped_dirs_repaired/ { print $2 }')
4887 [ $repaired -eq 0 ] ||
4888 error "(7) Re-generate master LMV EA unexpected: $repaired"
4890 stat $DIR/$tdir/striped_dir/dummy ||
4891 error "(8) Fail to stat $DIR/$tdir/striped_dir/dummy"
# Inverted check: the test fails only if this touch SUCCEEDS.
4893 touch $DIR/$tdir/striped_dir/foo &&
4894 error "(9) The broken striped directory should be read-only"
# Clear the immutable attribute so that the directory can be removed.
4896 chattr -i $DIR/$tdir/striped_dir ||
4897 error "(10) Fail to chattr on the broken striped directory"
4899 rmdir $DIR/$tdir/striped_dir ||
4900 error "(11) Fail to remove the striped directory after LFSCK"
4902 run_test 31d "Set broken striped directory (modified after broken) as read-only"
# Body of the test registered as "31e" by the run_test call that ends this
# block.
# NOTE(review): the enclosing function header and closing brace are not
# visible in this listing (presumably `test_31e() {` ... `}`) -- confirm.
#
# Flow: skip unless MDSCOUNT >= 2; create the striped directory while
# fail_loc=0x162a (OBD_FAIL_LFSCK_LOST_SLAVE_LMV) with fail_val=0 is armed on
# $SINGLEMDS; run namespace LFSCK with -r -A and require
# striped_shards_repaired (as shown by $SHOW_NAMESPACE) to be exactly 1;
# finally remove the directory.
4905 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4908 echo "For some reason, the slave MDT-object of the striped directory"
4909 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
4910 echo "slave LMV EA."
4913 check_mount_and_prep
4915 echo "Inject failure stub on MDT0 to simulate the case that the"
4916 echo "slave MDT-object (that resides on the same MDT as the master"
4917 echo "MDT-object resides on) lost the LMV EA."
4919 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
4920 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=0
4921 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4922 error "(1) Fail to create striped directory"
4923 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4925 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
4926 $START_NAMESPACE -r -A ||
4927 error "(2) Fail to start LFSCK for namespace"
# NOTE(review): the trailing "3" is presumably the step id reported by the
# helper on failure (no explicit "(3)" error exists in this block) -- confirm.
4929 wait_all_targets_blocked namespace completed 3
4931 local repaired=$($SHOW_NAMESPACE |
4932 awk '/^striped_shards_repaired/ { print $2 }')
4933 [ $repaired -eq 1 ] ||
4934 error "(4) Fail to re-generate slave LMV EA: $repaired"
4936 rmdir $DIR/$tdir/striped_dir ||
4937 error "(5) Fail to remove the striped directory after LFSCK"
4939 run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
# Body of the test registered as "31f" by the run_test call that ends this
# block.
# NOTE(review): the enclosing function header and closing brace are not
# visible in this listing (presumably `test_31f() {` ... `}`) -- confirm.
#
# Same scenario as the "31e" body, but fail_loc 0x162a is armed with
# fail_val=1 and striped_shards_repaired is read from the lfsck_namespace
# parameter of facet mds2 instead of via $SHOW_NAMESPACE.
4942 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4945 echo "For some reason, the slave MDT-object of the striped directory"
4946 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
4947 echo "slave LMV EA."
4950 check_mount_and_prep
4952 echo "Inject failure stub on MDT0 to simulate the case that the"
4953 echo "slave MDT-object (that resides on different MDT as the master"
4954 echo "MDT-object resides on) lost the LMV EA."
4956 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
4957 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=1
4958 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4959 error "(1) Fail to create striped directory"
4960 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4962 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
4963 $START_NAMESPACE -r -A ||
4964 error "(2) Fail to start LFSCK for namespace"
# NOTE(review): the trailing "3" is presumably the step id reported by the
# helper on failure (no explicit "(3)" error exists in this block) -- confirm.
4966 wait_all_targets_blocked namespace completed 3
# The repair counter is taken from mds2, not from $SINGLEMDS.
4968 local repaired=$(do_facet mds2 $LCTL get_param -n \
4969 mdd.$(facet_svc mds2).lfsck_namespace |
4970 awk '/^striped_shards_repaired/ { print $2 }')
4971 [ $repaired -eq 1 ] ||
4972 error "(4) Fail to re-generate slave LMV EA: $repaired"
4974 rmdir $DIR/$tdir/striped_dir ||
4975 error "(5) Fail to remove the striped directory after LFSCK"
4977 run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
# Body of the test registered as "31g" by the run_test call that ends this
# block.
# NOTE(review): the enclosing function header and closing brace are not
# visible in this listing (presumably `test_31g() {` ... `}`) -- confirm.
#
# Flow: skip unless MDSCOUNT >= 2; create the striped directory while
# fail_loc=0x162b (OBD_FAIL_LFSCK_BAD_SLAVE_LMV) with fail_val=0 is armed on
# $SINGLEMDS; run namespace LFSCK with -r -A and require
# striped_shards_repaired to be exactly 1; remount the client, then create
# and delete a file in the directory and remove the directory itself.
4980 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4983 echo "For some reason, the stripe index in the slave LMV EA is"
4984 echo "corrupted. The LFSCK should repair the slave LMV EA."
4987 check_mount_and_prep
4989 echo "Inject failure stub on MDT0 to simulate the case that the"
4990 echo "slave LMV EA on the first shard of the striped directory"
4991 echo "claims the same index as the second shard claims"
4993 #define OBD_FAIL_LFSCK_BAD_SLAVE_LMV 0x162b
4994 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162b fail_val=0
4995 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4996 error "(1) Fail to create striped directory"
4997 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4999 echo "Trigger namespace LFSCK to repair the slave LMV EA"
5000 $START_NAMESPACE -r -A ||
5001 error "(2) Fail to start LFSCK for namespace"
# NOTE(review): the trailing "3" is presumably the step id reported by the
# helper on failure (no explicit "(3)" error exists in this block) -- confirm.
5003 wait_all_targets_blocked namespace completed 3
5005 local repaired=$($SHOW_NAMESPACE |
5006 awk '/^striped_shards_repaired/ { print $2 }')
5007 [ $repaired -eq 1 ] ||
5008 error "(4) Fail to repair slave LMV EA: $repaired"
5010 umount_client $MOUNT || error "(5) umount failed"
5011 mount_client $MOUNT || error "(6) mount failed"
# Create/unlink round trip to show the repaired directory is usable.
5013 touch $DIR/$tdir/striped_dir/foo ||
5014 error "(7) Fail to touch file after the LFSCK"
5016 rm -f $DIR/$tdir/striped_dir/foo ||
5017 error "(8) Fail to unlink file after the LFSCK"
5019 rmdir $DIR/$tdir/striped_dir ||
5020 error "(9) Fail to remove the striped directory after LFSCK"
5022 run_test 31g "Repair the corrupted slave LMV EA"
# Body of the test registered as "31h" by the run_test call that ends this
# block.
# NOTE(review): the enclosing function header and closing brace are not
# visible in this listing (presumably `test_31h() {` ... `}`) -- confirm.
#
# Flow: skip unless MDSCOUNT >= 2; create the striped directory while
# fail_loc=0x162c (OBD_FAIL_LFSCK_BAD_SLAVE_NAME) with fail_val=0 is armed on
# $SINGLEMDS; run namespace LFSCK with -r -A and require dirent_repaired to
# be exactly 1; remount the client, then create and delete a file in the
# directory and remove the directory itself.
5025 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5028 echo "For some reason, the shard's name entry in the striped"
5029 echo "directory may be corrupted. The LFSCK should repair the"
5030 echo "bad shard's name entry."
5033 check_mount_and_prep
5035 echo "Inject failure stub on MDT0 to simulate the case that the"
5036 echo "first shard's name entry in the striped directory claims"
5037 echo "the same index as the second shard's name entry claims."
5039 #define OBD_FAIL_LFSCK_BAD_SLAVE_NAME 0x162c
5040 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162c fail_val=0
5041 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5042 error "(1) Fail to create striped directory"
5043 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5045 echo "Trigger namespace LFSCK to repair the shard's name entry"
5046 $START_NAMESPACE -r -A ||
5047 error "(2) Fail to start LFSCK for namespace"
# NOTE(review): the trailing "3" is presumably the step id reported by the
# helper on failure (no explicit "(3)" error exists in this block) -- confirm.
5049 wait_all_targets_blocked namespace completed 3
5051 local repaired=$($SHOW_NAMESPACE |
5052 awk '/^dirent_repaired/ { print $2 }')
5053 [ $repaired -eq 1 ] ||
5054 error "(4) Fail to repair shard's name entry: $repaired"
5056 umount_client $MOUNT || error "(5) umount failed"
5057 mount_client $MOUNT || error "(6) mount failed"
# Create/unlink round trip to show the repaired directory is usable.
5059 touch $DIR/$tdir/striped_dir/foo ||
5060 error "(7) Fail to touch file after the LFSCK"
5062 rm -f $DIR/$tdir/striped_dir/foo ||
5063 error "(8) Fail to unlink file after the LFSCK"
5065 rmdir $DIR/$tdir/striped_dir ||
5066 error "(9) Fail to remove the striped directory after LFSCK"
5068 run_test 31h "Repair the corrupted shard's name entry"
# Body of the test registered as "32a" by the run_test call that ends this
# block.
# NOTE(review): the function header, its closing brace and several interior
# lines are not visible in this listing (the inner numbering has gaps before
# and inside this block) -- confirm against the full file.
#
# Flow: unmount the client; arm fail_loc=0x162d (OBD_FAIL_LFSCK_ENGINE_DELAY)
# with fail_val=3 on $SINGLEMDS; start layout LFSCK with -r and require its
# status to be "scanning-phase1"; stop ost1; clear the fail_loc; require
# $STOP_LFSCK to succeed; start ost1 again with $MOUNT_OPTS_NOSCRUB.
5073 umount_client $MOUNT
5075 #define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d
5076 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
5077 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
5079 local STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
5080 [ "$STATUS" == "scanning-phase1" ] ||
5081 error "(2) Expect 'scanning-phase1', but got '$STATUS'"
5084 stop ost1 > /dev/null || error "(3) Fail to stop OST1!"
5086 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# The point of the test: stopping LFSCK must still work with ost1 down.
5090 $STOP_LFSCK || error "(4) Fail to stop LFSCK!"
5092 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
5093 error "(5) Fail to start ost1"
5095 run_test 32a "stop LFSCK when some OST failed"
# Body of the test registered as "32b" by the run_test call that ends this
# block.
# NOTE(review): the function header, the closing braces (including the one
# for the `|| {` group below) and several interior lines are not visible in
# this listing -- confirm against the full file.
#
# Flow: skip unless MDSCOUNT >= 2; create dp with `-i 1` and two
# sub-directories with `-i 0 -c $MDSCOUNT`; unmount the client; arm
# fail_loc=0x162d (OBD_FAIL_LFSCK_ENGINE_DELAY) with fail_val=3 on
# $SINGLEMDS; start namespace LFSCK with -r -A and wait until the status on
# ${MDT_DEV} reads "scanning-phase1"; stop mds2; clear the fail_loc; require
# $STOP_LFSCK to succeed; start mds2 again with $MOUNT_OPTS_NOSCRUB.
5099 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5102 $LFS mkdir -i 1 $DIR/$tdir/dp ||
5103 error "(1) Fail to create $DIR/$tdir/dp"
5104 $LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc1 ||
5105 error "(2) Fail to create $DIR/$tdir/dp/dc1"
5106 $LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc2 ||
5107 error "(3) Fail to create $DIR/$tdir/dp/dc2"
5108 umount_client $MOUNT
5110 #define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d
5111 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
5112 $START_NAMESPACE -r -A || error "(4) Fail to start LFSCK for namespace!"
# NOTE(review): "32" is presumably the maximum wait in seconds -- confirm
# against wait_update_facet. The \\\$2 escaping keeps $2 literal for awk
# through the quoting layers of the remote command.
5114 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5115 mdd.${MDT_DEV}.lfsck_namespace |
5116 awk '/^status/ { print \\\$2 }'" "scanning-phase1" 32 || {
5118 error "(5) unexpected status"
5122 stop mds2 > /dev/null || error "(6) Fail to stop MDT2!"
5124 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# The point of the test: stopping LFSCK must still work with mds2 down.
5128 $STOP_LFSCK || error "(7) Fail to stop LFSCK!"
5130 start mds2 $(mdsdevname 2) $MOUNT_OPTS_NOSCRUB > /dev/null ||
5131 error "(8) Fail to start MDT2"
5133 run_test 32b "stop LFSCK when some MDT failed"
# Body of the test registered as "33" by the run_test call that ends this
# block.
# NOTE(review): the function header, its closing brace and any preparation
# lines before the first command are not visible in this listing -- confirm.
#
# Flow: start layout LFSCK with `--dryrun -o -r`, wait for completion and
# require the reported "param" field to be exactly
# "dryrun,all_targets,orphan"; then start namespace LFSCK with
# `-e abort -A -r`, wait for completion and require the "param" field to be
# exactly "failout,all_targets".
5139 $START_LAYOUT --dryrun -o -r ||
5140 error "(1) Fail to start layout LFSCK"
# NOTE(review): the trailing number is presumably the step id reported by the
# helper on failure (steps (2) and (5) have no explicit error) -- confirm.
5141 wait_all_targets_blocked layout completed 2
5143 local PARAMS=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
5144 [ "$PARAMS" == "dryrun,all_targets,orphan" ] ||
5145 error "(3) Expect 'dryrun,all_targets,orphan', got '$PARAMS'"
5147 $START_NAMESPACE -e abort -A -r ||
5148 error "(4) Fail to start namespace LFSCK"
5149 wait_all_targets_blocked namespace completed 5
5151 PARAMS=$($SHOW_NAMESPACE | awk '/^param/ { print $2 }')
5152 [ "$PARAMS" == "failout,all_targets" ] ||
5153 error "(6) Expect 'failout,all_targets', got '$PARAMS'"
5155 run_test 33 "check LFSCK paramters"
# Body of the test registered as "34" by the run_test call that ends this
# block.
# NOTE(review): the function header, the closing braces (including those of
# the two `|| {` groups below) and some interior lines are not visible in
# this listing -- confirm against the full file.
#
# Flow: skip unless MDSCOUNT >= 2 and the $SINGLEMDS backend is zfs; create
# a directory with `-i 1` while fail_loc=0x1630
# (OBD_FAIL_LFSCK_NO_AGENTOBJ) is armed on $SINGLEMDS; clear the fail_loc;
# run namespace LFSCK with -r until ${MDT_DEV} reports "completed" and
# require dirent_repaired to be exactly 1; run it a second time and require
# dirent_repaired to be 0.
5159 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5160 [ $(facet_fstype $SINGLEMDS) != zfs ] &&
5161 skip "Only valid for ZFS backend" && return
5165 #define OBD_FAIL_LFSCK_NO_AGENTOBJ 0x1630
5166 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1630
5167 $LFS mkdir -i 1 $DIR/$tdir/dummy ||
5168 error "(1) Fail to create $DIR/$tdir/dummy"
5170 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
5171 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
# NOTE(review): "32" is presumably the maximum wait in seconds -- confirm
# against wait_update_facet.
5172 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5173 mdd.${MDT_DEV}.lfsck_namespace |
5174 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5176 error "(3) unexpected status"
5179 local repaired=$($SHOW_NAMESPACE |
5180 awk '/^dirent_repaired/ { print $2 }')
5181 [ $repaired -eq 1 ] ||
5182 error "(4) Fail to repair the lost agent object: $repaired"
# Second pass: nothing should be left to repair.
5184 $START_NAMESPACE -r || error "(5) Fail to start LFSCK for namespace!"
5185 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5186 mdd.${MDT_DEV}.lfsck_namespace |
5187 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5189 error "(6) unexpected status"
5192 repaired=$($SHOW_NAMESPACE | awk '/^dirent_repaired/ { print $2 }')
5193 [ $repaired -eq 0 ] ||
5194 error "(7) Unexpected repairing: $repaired"
5196 run_test 34 "LFSCK can rebuild the lost agent object"
# Body of the test registered as "35" by the run_test call that ends this
# block.
# NOTE(review): the function header, its closing brace and some interior
# lines (e.g. the command that follows the "stopall" echo) are not visible
# in this listing -- confirm against the full file.
#
# Flow: skip unless MDSCOUNT >= 2; create a directory with `-i 1` while
# fail_loc=0x1631 (OBD_FAIL_LFSCK_NO_AGENTENT) is armed on mds2; clear the
# fail_loc; run namespace LFSCK with -A -r until mds2 reports "completed"
# and require agent_entries_repaired on mds2 to be exactly 1; after
# setupall, run it a second time and require agent_entries_repaired to be 0.
#
# Fix: error messages (3) and (6) used to interpolate ${k}, which is never
# set in this block, so they printed "MDS is not ...". Both waits poll mds2,
# so the messages now name MDS2 explicitly.
# NOTE(review): $LTIME is not assigned in the visible lines; presumably it is
# set elsewhere in the file -- confirm.
5200 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5204 #define OBD_FAIL_LFSCK_NO_AGENTENT 0x1631
5205 do_facet mds2 $LCTL set_param fail_loc=0x1631
5206 $LFS mkdir -i 1 $DIR/$tdir/dummy ||
5207 error "(1) Fail to create $DIR/$tdir/dummy"
5210 do_facet mds2 $LCTL set_param fail_loc=0
5211 $START_NAMESPACE -A -r || error "(2) Fail to start LFSCK for namespace!"
5212 wait_update_facet mds2 "$LCTL get_param -n \
5213 mdd.$(facet_svc mds2).lfsck_namespace |
5214 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
5215 error "(3) MDS2 is not the expected 'completed'"
5217 local repaired=$(do_facet mds2 $LCTL get_param -n \
5218 mdd.$(facet_svc mds2).lfsck_namespace |
5219 awk '/^agent_entries_repaired/ { print $2 }')
5220 [ $repaired -eq 1 ] ||
5221 error "(4) Fail to repair the lost agent entry: $repaired"
5223 echo "stopall to cleanup object cache"
5226 setupall > /dev/null
# Second pass: nothing should be left to repair.
5228 $START_NAMESPACE -A -r || error "(5) Fail to start LFSCK for namespace!"
5229 wait_update_facet mds2 "$LCTL get_param -n \
5230 mdd.$(facet_svc mds2).lfsck_namespace |
5231 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
5232 error "(6) MDS2 is not the expected 'completed'"
5234 repaired=$(do_facet mds2 $LCTL get_param -n \
5235 mdd.$(facet_svc mds2).lfsck_namespace |
5236 awk '/^agent_entries_repaired/ { print $2 }')
5237 [ $repaired -eq 0 ] ||
5238 error "(7) Unexpected repairing: $repaired"
5240 run_test 35 "LFSCK can rebuild the lost agent entry"
# Body of the test registered as "36a" by the run_test call that ends this
# block.
# NOTE(review): the function header, loop terminators, closing braces
# (including those of the `|| {` groups) and some interior lines are not
# visible in this listing -- confirm against the full file.
#
# Flow: skip unless OSTCOUNT >= 3; create three mirrored files f0..f2 with
# identical layouts, write 4MB to each and resync them; with fail_loc=0x1616
# (OBD_FAIL_LFSCK_LOST_MDTOBJ) armed on mds1, split-and-delete mirror 1 of
# f0, mirror 2 of f1 and mirror 3 of f2; verify that each file now reports 2
# mirrors and no longer lists the removed mirror id; run layout LFSCK with
# -r -o, wait for "completed" on every MDS and check it on every OST;
# require repaired_orphan on mds1 to be 9 and every file to report 3 mirrors
# again, including the previously removed mirror id.
5243 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5246 echo "The target MDT-object's LOV EA corrupted as to lose one of the "
5247 echo "mirrors information. The layout LFSCK should rebuild the LOV EA "
5248 echo "with the PFID EA of related OST-object(s) belong to the mirror."
5251 check_mount_and_prep
5253 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5254 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f0 ||
5255 error "(0) Fail to create mirror file $DIR/$tdir/f0"
5256 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5257 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f1 ||
5258 error "(1) Fail to create mirror file $DIR/$tdir/f1"
5259 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5260 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f2 ||
5261 error "(2) Fail to create mirror file $DIR/$tdir/f2"
5263 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5264 error "(3) Fail to write $DIR/$tdir/f0"
5265 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=4 ||
5266 error "(4) Fail to write $DIR/$tdir/f1"
5267 dd if=/dev/zero of=$DIR/$tdir/f2 bs=1M count=4 ||
5268 error "(5) Fail to write $DIR/$tdir/f2"
5270 $LFS mirror resync $DIR/$tdir/f0 ||
5271 error "(6) Fail to resync $DIR/$tdir/f0"
5272 $LFS mirror resync $DIR/$tdir/f1 ||
5273 error "(7) Fail to resync $DIR/$tdir/f1"
5274 $LFS mirror resync $DIR/$tdir/f2 ||
5275 error "(8) Fail to resync $DIR/$tdir/f2"
5277 cancel_lru_locks mdc
5278 cancel_lru_locks osc
5280 $LFS getstripe $DIR/$tdir/f0 ||
5281 error "(9) Fail to getstripe for $DIR/$tdir/f0"
5282 $LFS getstripe $DIR/$tdir/f1 ||
5283 error "(10) Fail to getstripe for $DIR/$tdir/f1"
5284 $LFS getstripe $DIR/$tdir/f2 ||
5285 error "(11) Fail to getstripe for $DIR/$tdir/f2"
5287 echo "Inject failure, to simulate the case of missing one mirror in LOV"
5288 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5289 do_facet mds1 $LCTL set_param fail_loc=0x1616
5291 $LFS mirror split --mirror-id 1 -d $DIR/$tdir/f0 ||
5292 error "(12) Fail to split 1st mirror from $DIR/$tdir/f0"
5293 $LFS mirror split --mirror-id 2 -d $DIR/$tdir/f1 ||
5294 error "(13) Fail to split 2nd mirror from $DIR/$tdir/f1"
5295 $LFS mirror split --mirror-id 3 -d $DIR/$tdir/f2 ||
5296 error "(14) Fail to split 3rd mirror from $DIR/$tdir/f2"
5300 do_facet mds1 $LCTL set_param fail_loc=0
# Inverted checks: each fails only if grep still FINDS the removed mirror id.
5302 $LFS getstripe $DIR/$tdir/f0 | grep "lcme_mirror_id:.*1" &&
5303 error "(15) The 1st of mirror is not destroyed"
5304 $LFS getstripe $DIR/$tdir/f1 | grep "lcme_mirror_id:.*2" &&
5305 error "(16) The 2nd of mirror is not destroyed"
5306 $LFS getstripe $DIR/$tdir/f2 | grep "lcme_mirror_id:.*3" &&
5307 error "(17) The 3rd of mirror is not destroyed"
# NOTE(review): no declaration of "mirrors" is visible in this listing;
# presumably it is declared on a line not shown here -- confirm.
5311 mirrors=$($LFS getstripe -N $DIR/$tdir/f0)
5312 [ $mirrors -eq 2 ] || error "(18) $DIR/$tdir/f0 has $mirrors mirrors"
5313 mirrors=$($LFS getstripe -N $DIR/$tdir/f1)
5314 [ $mirrors -eq 2 ] || error "(19) $DIR/$tdir/f1 has $mirrors mirrors"
5315 mirrors=$($LFS getstripe -N $DIR/$tdir/f2)
5316 [ $mirrors -eq 2 ] || error "(20) $DIR/$tdir/f2 has $mirrors mirrors"
5318 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5319 $START_LAYOUT -r -o || error "(21) Fail to start LFSCK for layout!"
5321 for k in $(seq $MDSCOUNT); do
5322 # The LFSCK status query interval is 30 seconds. For the case
5323 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5324 # time to guarantee the status sync up.
5325 wait_update_facet mds${k} "$LCTL get_param -n \
5326 mdd.$(facet_svc mds${k}).lfsck_layout |
5327 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5328 error "(22) MDS${k} is not the expected 'completed'"
# OSTs are checked once, without waiting, after all MDSes have completed.
5331 for k in $(seq $OSTCOUNT); do
5332 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5333 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5334 awk '/^status/ { print $2 }')
5335 [ "$cur_status" == "completed" ] ||
5336 error "(23) OST${k} Expect 'completed', but got '$cur_status'"
# NOTE(review): 9 presumably equals 3 files x 3 OST-objects of the mirror
# that was split from each file -- confirm.
5339 local repaired=$(do_facet mds1 $LCTL get_param -n \
5340 mdd.$(facet_svc mds1).lfsck_layout |
5341 awk '/^repaired_orphan/ { print $2 }')
5342 [ $repaired -eq 9 ] ||
5343 error "(24) Expect 9 fixed on mds1, but got: $repaired"
5345 mirrors=$($LFS getstripe -N $DIR/$tdir/f0)
5346 [ $mirrors -eq 3 ] || error "(25) $DIR/$tdir/f0 has $mirrors mirrors"
5347 mirrors=$($LFS getstripe -N $DIR/$tdir/f1)
5348 [ $mirrors -eq 3 ] || error "(26) $DIR/$tdir/f1 has $mirrors mirrors"
5349 mirrors=$($LFS getstripe -N $DIR/$tdir/f2)
5350 [ $mirrors -eq 3 ] || error "(27) $DIR/$tdir/f2 has $mirrors mirrors"
5352 $LFS getstripe $DIR/$tdir/f0 | grep "lcme_mirror_id:.*1" || {
5353 $LFS getstripe $DIR/$tdir/f0
5354 error "(28) The 1st of mirror is not recovered"
5357 $LFS getstripe $DIR/$tdir/f1 | grep "lcme_mirror_id:.*2" || {
5358 $LFS getstripe $DIR/$tdir/f1
5359 error "(29) The 2nd of mirror is not recovered"
5362 $LFS getstripe $DIR/$tdir/f2 | grep "lcme_mirror_id:.*3" || {
5363 $LFS getstripe $DIR/$tdir/f2
5364 error "(30) The 3rd of mirror is not recovered"
5367 run_test 36a "rebuild LOV EA for mirrored file (1)"
# Body of the test registered as "36b" by the run_test call that ends this
# block.
# NOTE(review): the function header, loop terminators, closing braces
# (including those of the `|| {` groups) and some interior lines are not
# visible in this listing -- confirm against the full file.
#
# Flow: skip unless OSTCOUNT >= 3; create a three-mirror file f0, remember
# its FID, write 4MB and resync; remove f0 while fail_loc=0x1616
# (OBD_FAIL_LFSCK_LOST_MDTOBJ) is armed on mds1; run layout LFSCK with
# -r -o, wait for "completed" on every MDS and check it on every OST;
# require repaired_orphan on mds1 to be 9; then inspect the object named
# ${fid}-R-0 under .lustre/lost+found/MDT0000 and require 3 mirrors,
# 6 components and mirror ids 1, 2 and 3.
5370 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5373 echo "The mirrored file lost its MDT-object, but relatd OST-objects "
5374 echo "are still there. The layout LFSCK should rebuild the LOV EA "
5375 echo "with the PFID EA of related OST-object(s) belong to the file. "
5378 check_mount_and_prep
5380 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5381 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f0 ||
5382 error "(0) Fail to create mirror file $DIR/$tdir/f0"
# The FID is saved before the file is removed; it is used later to build the
# expected lost+found name.
5384 local fid=$($LFS path2fid $DIR/$tdir/f0)
5386 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5387 error "(1) Fail to write $DIR/$tdir/f0"
5388 $LFS mirror resync $DIR/$tdir/f0 ||
5389 error "(2) Fail to resync $DIR/$tdir/f0"
5391 cancel_lru_locks mdc
5392 cancel_lru_locks osc
5394 $LFS getstripe $DIR/$tdir/f0 ||
5395 error "(3) Fail to getstripe for $DIR/$tdir/f0"
5397 echo "Inject failure, to simulate the case of missing the MDT-object"
5398 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5399 do_facet mds1 $LCTL set_param fail_loc=0x1616
5400 rm -f $DIR/$tdir/f0 || error "(4) Fail to remove $DIR/$tdir/f0"
5404 do_facet mds1 $LCTL set_param fail_loc=0
5406 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5407 $START_LAYOUT -r -o || error "(5) Fail to start LFSCK for layout!"
5409 for k in $(seq $MDSCOUNT); do
5410 # The LFSCK status query interval is 30 seconds. For the case
5411 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5412 # time to guarantee the status sync up.
5413 wait_update_facet mds${k} "$LCTL get_param -n \
5414 mdd.$(facet_svc mds${k}).lfsck_layout |
5415 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5416 error "(6) MDS${k} is not the expected 'completed'"
# OSTs are checked once, without waiting, after all MDSes have completed.
5419 for k in $(seq $OSTCOUNT); do
5420 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5421 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5422 awk '/^status/ { print $2 }')
5423 [ "$cur_status" == "completed" ] ||
5424 error "(7) OST${k} Expect 'completed', but got '$cur_status'"
# NOTE(review): 9 presumably equals 3 mirrors x 3 OST-objects each -- confirm.
5427 local count=$(do_facet mds1 $LCTL get_param -n \
5428 mdd.$(facet_svc mds1).lfsck_layout |
5429 awk '/^repaired_orphan/ { print $2 }')
5430 [ $count -eq 9 ] || error "(8) Expect 9 fixed on mds1, but got: $count"
# "count" is reused below for the mirror count and the component count.
5432 local name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
5433 count=$($LFS getstripe --mirror-count $name)
5434 [ $count -eq 3 ] || error "(9) $DIR/$tdir/f0 has $count mirrors"
5436 count=$($LFS getstripe --component-count $name)
5437 [ $count -eq 6 ] || error "(10) $DIR/$tdir/f0 has $count components"
5439 $LFS getstripe $name | grep "lcme_mirror_id:.*1" || {
5440 $LFS getstripe $name
5441 error "(11) The 1st of mirror is not recovered"
5444 $LFS getstripe $name | grep "lcme_mirror_id:.*2" || {
5445 $LFS getstripe $name
5446 error "(12) The 2nd of mirror is not recovered"
5449 $LFS getstripe $name | grep "lcme_mirror_id:.*3" || {
5450 $LFS getstripe $name
5451 error "(13) The 3rd of mirror is not recovered"
5454 run_test 36b "rebuild LOV EA for mirrored file (2)"
# Body of the test registered as "36c" by the run_test call that ends this
# block.
# NOTE(review): the function header, loop terminators, closing braces
# (including those of the `|| {` groups) and some interior lines are not
# visible in this listing -- confirm against the full file.
#
# Flow: skip unless OSTCOUNT >= 3; create a mirrored file f0, remember its
# FID, write 4MB and resync, then write 4MB again without resyncing; save
# the lcme_flags seen in the first and last 10 lines of the getstripe
# output; remove f0 while fail_loc=0x1616 (OBD_FAIL_LFSCK_LOST_MDTOBJ) is
# armed on mds1; run layout LFSCK with -r -o, wait for "completed" on every
# MDS and check it on every OST; require repaired_orphan on mds1 to be 6;
# then inspect ${fid}-R-0 under .lustre/lost+found/MDT0000 and require
# 2 mirrors, 4 components and the same lcme_flags as saved before.
#
# Fix: the message of check (7) used to say "Expect 9" although the check is
# `-eq 6`. 6 is consistent with the 2 mirrors / 4 components asserted below,
# so the message now says "Expect 6".
5457 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5460 echo "The mirrored file has been modified, not resynced yet, then "
5461 echo "lost its MDT-object, but relatd OST-objects are still there. "
5462 echo "The layout LFSCK should rebuild the LOV EA and relatd status "
5463 echo "with the PFID EA of related OST-object(s) belong to the file. "
5466 check_mount_and_prep
# NOTE(review): the continuation line of the following command is not visible
# in this listing (the inner numbering skips one line after the trailing
# backslash) -- confirm against the full file.
5468 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5470 error "(0) Fail to create mirror file $DIR/$tdir/f0"
5472 local fid=$($LFS path2fid $DIR/$tdir/f0)
5474 # The 1st dd && resync makes all related OST-objects have been written
5475 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5476 error "(1.1) Fail to write $DIR/$tdir/f0"
5477 $LFS mirror resync $DIR/$tdir/f0 ||
5478 error "(1.2) Fail to resync $DIR/$tdir/f0"
5479 # The 2nd dd makes one mirror to be stale
5480 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5481 error "(1.3) Fail to write $DIR/$tdir/f0"
5483 cancel_lru_locks mdc
5484 cancel_lru_locks osc
5486 $LFS getstripe $DIR/$tdir/f0 ||
5487 error "(2) Fail to getstripe for $DIR/$tdir/f0"
# Flags are sampled from the head and the tail of the getstripe output so
# that they can be compared with the rebuilt layout afterwards.
5489 local saved_flags1=$($LFS getstripe $DIR/$tdir/f0 | head -n 10 |
5490 awk '/lcme_flags/ { print $2 }')
5491 local saved_flags2=$($LFS getstripe $DIR/$tdir/f0 | tail -n 10 |
5492 awk '/lcme_flags/ { print $2 }')
5494 echo "Inject failure, to simulate the case of missing the MDT-object"
5495 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5496 do_facet mds1 $LCTL set_param fail_loc=0x1616
5497 rm -f $DIR/$tdir/f0 || error "(3) Fail to remove $DIR/$tdir/f0"
5501 do_facet mds1 $LCTL set_param fail_loc=0
5503 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5504 $START_LAYOUT -r -o || error "(4) Fail to start LFSCK for layout!"
5506 for k in $(seq $MDSCOUNT); do
5507 # The LFSCK status query interval is 30 seconds. For the case
5508 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5509 # time to guarantee the status sync up.
5510 wait_update_facet mds${k} "$LCTL get_param -n \
5511 mdd.$(facet_svc mds${k}).lfsck_layout |
5512 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5513 error "(5) MDS${k} is not the expected 'completed'"
# OSTs are checked once, without waiting, after all MDSes have completed.
5516 for k in $(seq $OSTCOUNT); do
5517 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5518 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5519 awk '/^status/ { print $2 }')
5520 [ "$cur_status" == "completed" ] ||
5521 error "(6) OST${k} Expect 'completed', but got '$cur_status'"
5524 local count=$(do_facet mds1 $LCTL get_param -n \
5525 mdd.$(facet_svc mds1).lfsck_layout |
5526 awk '/^repaired_orphan/ { print $2 }')
5527 [ $count -eq 6 ] || error "(7) Expect 6 fixed on mds1, but got: $count"
# "count" is reused below for the mirror count and the component count.
5529 local name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
5530 count=$($LFS getstripe --mirror-count $name)
5531 [ $count -eq 2 ] || error "(8) $DIR/$tdir/f0 has $count mirrors"
5533 count=$($LFS getstripe --component-count $name)
5534 [ $count -eq 4 ] || error "(9) $DIR/$tdir/f0 has $count components"
5536 local flags=$($LFS getstripe $name | head -n 10 |
5537 awk '/lcme_flags/ { print $2 }')
5538 [ "$flags" == "$saved_flags1" ] || {
5539 $LFS getstripe $name
5540 error "(10) expect flags $saved_flags1, got $flags"
5543 flags=$($LFS getstripe $name | tail -n 10 |
5544 awk '/lcme_flags/ { print $2 }')
5545 [ "$flags" == "$saved_flags2" ] || {
5546 $LFS getstripe $name
5547 error "(11) expect flags $saved_flags2, got $flags"
5550 run_test 36c "rebuild LOV EA for mirrored file (3)"
# Body of the test registered as "37" by the run_test call that ends this
# block.
# NOTE(review): the function header, its closing brace and several interior
# lines are not visible in this listing (the inner numbering has gaps before,
# inside and after the visible commands) -- confirm against the full file.
#
# Flow: create $DIR/$tdir/d0 with `-i 0`; run `multiop_bg_pause` on it with
# the "D_c" command string; start namespace LFSCK with -r -A and wait for
# completion on all targets.
# NOTE(review): $PID is not assigned in the visible lines; presumably it is
# captured right after multiop_bg_pause -- confirm.
5556 local t_dir="$DIR/$tdir/d0"
5557 check_mount_and_prep
5559 $LFS mkdir -i 0 $t_dir || error "(2) Fail to mkdir $t_dir on MDT0"
5560 multiop_bg_pause $t_dir D_c || { error "multiop failed: $?"; return 1; }
# On a failed start the paused background process is signalled with USR1
# after the error is reported.
5564 $START_NAMESPACE -r -A || {
5565 error "(3) Fail to start LFSCK for namespace!"; kill -USR1 $PID; }
# NOTE(review): the trailing "4" is presumably the step id reported by the
# helper on failure -- confirm.
5567 wait_all_targets_blocked namespace completed 4
5572 run_test 37 "LFSCK must skip a ORPHAN"
# Script epilogue: undo the sizing overrides made at the top of the script
# and tear the test file system down.
# NOTE(review): some lines between the last two commands are not visible in
# this listing (the inner numbering has a gap) -- confirm against the full
# file.
5575 # restore MDS/OST size
# MDSSIZE/OSTSIZE/OSTCOUNT were saved into SAVED_* before being reduced for
# faster formatting; put the original values back.
5576 MDSSIZE=${SAVED_MDSSIZE}
5577 OSTSIZE=${SAVED_OSTSIZE}
5578 OSTCOUNT=${SAVED_OSTCOUNT}
5580 # cleanup the system at last
# REFORMAT="yes" is set only for this one command.
5581 REFORMAT="yes" cleanup_and_setup_lustre
5584 check_and_cleanup_lustre