# Suite preamble. NOTE(review): every line in this view starts with a number
# that is not contiguous (3, 4, 11, ...), so some original lines -- for
# example the end of the `if` below -- are not visible here.
3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
11 #Bug number for excepting test
12 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT"
14 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
15 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
# Default LUSTRE to the parent of this script's directory, then source the
# test framework and the configuration file selected through CONFIG/NAME.
17 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
18 . $LUSTRE/tests/test-framework.sh
20 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
# Leave with status 0 when require_dsh_mds fails.
23 require_dsh_mds || exit 0
27 if ! check_versions; then
28 skip "It is NOT necessary to test lfsck under interoperation mode"
# Skip and exit 0 when the MDS version code is below that of 2.3.60.
32 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.3.60) ]] &&
33 skip "Need MDS version at least 2.3.60" && exit 0
# Remember the incoming sizes and OST count before the overrides below.
37 SAVED_MDSSIZE=${MDSSIZE}
38 SAVED_OSTSIZE=${OSTSIZE}
39 SAVED_OSTCOUNT=${OSTCOUNT}
40 # use small MDS + OST size to speed formatting time
41 # do not use too small MDSSIZE/OSTSIZE, which affect the default journal size
43 [ $(facet_fstype $SINGLEMDS) == zfs ] && MDSSIZE=300000
45 [ $(facet_fstype ost1) == zfs ] && OSTSIZE=300000
47 # no need for too many OSTs; cap the count to reduce the format/start/stop overhead
49 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
51 # build up a clean test environment.
52 REFORMAT="yes" check_and_setup_lustre
# Append test ids to ALWAYS_EXCEPT depending on the MDS/OST version codes
# and on the MDS backend type.
54 [[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.4.90) ]] &&
55 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2c"
57 [[ $(lustre_version_code ost1) -lt $(version_code 2.5.55) ]] &&
58 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 11 12 13 14 15 16 17 18 19 20 21"
60 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.50) ]] &&
61 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2d 2e 3 22 23 24 25 26 27 28 29 30 31"
63 # DNE does not support striped directory on zfs-based backend yet.
64 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
65 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 31"
# Shared names and command strings used by the tests in this file.
# MDT_DEV/OST_DEV are the first MDT/OST device names built from FSNAME.
69 MDT_DEV="${FSNAME}-MDT0000"
70 OST_DEV="${FSNAME}-OST0000"
71 MDT_DEVNAME=$(mdsdevname ${SINGLEMDS//mds/})
# Command lines are stored as plain strings; the tests expand them unquoted
# and append options, e.g. `$START_NAMESPACE -r`.
72 START_NAMESPACE="do_facet $SINGLEMDS \
73 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
74 START_LAYOUT="do_facet $SINGLEMDS \
75 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
76 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
77 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
# SHOW_* print the lfsck_namespace / lfsck_layout parameter of the target.
78 SHOW_NAMESPACE="do_facet $SINGLEMDS \
79 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
80 SHOW_LAYOUT="do_facet $SINGLEMDS \
81 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
82 SHOW_LAYOUT_ON_OST="do_facet ost1 \
83 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# Mount option sets passed to `start` when the tests restart the MDS.
84 MOUNT_OPTS_SCRUB="-o user_xattr"
85 MOUNT_OPTS_NOSCRUB="-o user_xattr,noscrub"
86 MOUNT_OPTS_SKIP_LFSCK="-o user_xattr,skip_lfsck"
# Body fragment of a preparation helper; its header and the closing
# fi/done lines are not visible in this view. It reads ndirs, nfiles and
# igif, which are assigned outside the visible lines.
95 echo "preparing... $nfiles * $ndirs files will be created $(date)."
# When igif is non-empty, set fail_loc=0x1504 on the MDS before populating
# the test directory; it is reset to 0 near the end of this fragment.
96 if [ ! -z $igif ]; then
97 #define OBD_FAIL_FID_IGIF 0x1504
98 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
# Populate $DIR/$tdir: copies of the test scripts, plus createmany-made
# entries named d*, f*, d<i>/f* and e*.
101 cp $LUSTRE/tests/*.sh $DIR/$tdir/
102 if [ $ndirs -gt 0 ]; then
103 createmany -d $DIR/$tdir/d $ndirs
104 createmany -m $DIR/$tdir/f $ndirs
105 if [ $nfiles -gt 0 ]; then
106 for ((i = 0; i < $ndirs; i++)); do
107 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
108 /dev/null || error "createmany $nfiles"
111 createmany -d $DIR/$tdir/e $ndirs
114 if [ ! -z $igif ]; then
115 touch $DIR/$tdir/dummy
116 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
119 echo "prepared $(date)."
# run_e2fsck_on_mdt0: returns immediately unless the MDS backend is ldiskfs;
# otherwise stops the MDS, calls run_e2fsck with "-n" on the device reported
# by `mdsdevname 1`, and starts the MDS again with the noscrub mount options.
# NOTE(review): the embedded numbering skips lines inside this function
# (e.g. 127, 130, 133), so the exact failure handling around the two
# run_e2fsck calls and the closing brace are not visible here.
122 run_e2fsck_on_mdt0() {
123 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] && return
125 stop $SINGLEMDS > /dev/null || error "(0) Fail to the stop MDT0"
126 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n" |
128 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n"
129 error "(2) Detected inconsistency on MDT0"
131 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
132 error "(3) Fail to start MDT0"
# wait_all_targets_blocked: runs `lfsck_query -t $com ... -w` on mds1, takes
# the second field of the line starting with "${com}_mdts_${status}", and
# requires it to equal MDSCOUNT; otherwise it prints the query output and
# raises error ($err).
# NOTE(review): com, status and err are assigned on lines not visible here;
# the call `wait_all_targets_blocked namespace completed 4` later in this
# file suggests they are the three positional arguments -- confirm.
135 wait_all_targets_blocked() {
140 local count=$(do_facet mds1 \
141 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 -w |
142 awk '/^${com}_mdts_${status}/ { print \\\$2 }'")
143 [[ $count -eq $MDSCOUNT ]] || {
144 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
145 error "($err) only $count of $MDSCOUNT MDTs are in ${status}"
# Fragment of a second wait helper (the embedded numbering jumps from 145 to
# 154, so its header is not visible). It polls `lfsck_query` on mds1 through
# wait_update_facet until the "${com}_mdts_${status}" value equals MDSCOUNT,
# passing $LTIME as the last argument; on failure it prints the query output
# and raises error ($err).
154 wait_update_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 |
155 awk '/^${com}_mdts_${status}/ { print \\\$2 }'" \
156 "$MDSCOUNT" $LTIME || {
157 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
158 error "($err) some MDTs are not in ${status}"
# Fragment of the test registered at the end of this block as
# run_test 0 "Control LFSCK manually"; the function header/footer and some
# other lines are not visible in this view.
# Visible flow: set fail_val=3 fail_loc=0x1600, start namespace LFSCK with
# -r, expect scanning-phase1, stop it and expect stopped, start it again and
# expect scanning-phase1, clear fail_loc/fail_val, then wait for completed.
165 #define OBD_FAIL_LFSCK_DELAY1 0x1600
166 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
167 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
169 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
171 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
172 [ "$STATUS" == "scanning-phase1" ] ||
173 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
175 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
177 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
178 [ "$STATUS" == "stopped" ] ||
179 error "(6) Expect 'stopped', but got '$STATUS'"
181 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
183 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
184 [ "$STATUS" == "scanning-phase1" ] ||
185 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
187 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# Poll the status field of lfsck_namespace until it reads "completed"; the
# trailing 32 is handed to wait_update_facet (presumably a timeout in
# seconds -- confirm against the framework).
188 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
189 mdd.${MDT_DEV}.lfsck_namespace |
190 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
192 error "(9) unexpected status"
# Nothing was damaged in this test, so updated_phase1 must be 0.
195 local repaired=$($SHOW_NAMESPACE |
196 awk '/^updated_phase1/ { print $2 }')
197 [ $repaired -eq 0 ] ||
198 error "(10) Expect nothing to be repaired, but got: $repaired"
# A second full run (-r) must raise success_count by exactly one.
200 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
201 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
202 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
203 mdd.${MDT_DEV}.lfsck_namespace |
204 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
206 error "(12) unexpected status"
209 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
210 [ $((scanned1 + 1)) -eq $scanned2 ] ||
211 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
213 echo "stopall, should NOT crash LU-3649"
214 stopall || error "(14) Fail to stopall"
216 run_test 0 "Control LFSCK manually"
# Fragment of the test registered below as run_test 1a; header/footer and
# some other lines are not visible in this view.
# Visible flow: create "dummy" while fail_loc=0x1501 is set, clear fail_loc,
# run namespace LFSCK (-r) to completion, expect exactly one repair, mount
# the client and list the directory while fail_loc=0x1505 is set.
221 #define OBD_FAIL_FID_INDIR 0x1501
222 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
223 touch $DIR/$tdir/dummy
225 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
227 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
228 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
229 mdd.${MDT_DEV}.lfsck_namespace |
230 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
232 error "(4) unexpected status"
# Read dirent_repaired; when that field is empty fall back to
# updated_phase1 (see the interop comment below).
235 local repaired=$($SHOW_NAMESPACE |
236 awk '/^dirent_repaired/ { print $2 }')
237 # for interop with old server
238 [ -z "$repaired" ] &&
239 repaired=$($SHOW_NAMESPACE |
240 awk '/^updated_phase1/ { print $2 }')
242 [ $repaired -eq 1 ] ||
243 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
247 mount_client $MOUNT || error "(6) Fail to start client!"
249 #define OBD_FAIL_FID_LOOKUP 0x1505
250 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
251 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
253 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
255 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
# Fragment of the test registered below as run_test 1b; header/footer and
# some other lines are not visible in this view.
# Visible flow: skip unless the MDS backend is ldiskfs; create "dummy" while
# fail_loc=0x1502 is set; run namespace LFSCK (-r) to completion while
# fail_loc=0x1506 is set; expect exactly one repair; mount the client and
# stat the file while fail_loc=0x1505 is set.
259 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
260 skip "OI Scrub not implemented for ZFS" && return
264 #define OBD_FAIL_FID_INLMA 0x1502
265 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
266 touch $DIR/$tdir/dummy
268 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
270 #define OBD_FAIL_FID_NOLMA 0x1506
271 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
272 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
273 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
274 mdd.${MDT_DEV}.lfsck_namespace |
275 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
277 error "(4) unexpected status"
# Read dirent_repaired; when that field is empty fall back to
# updated_phase1 (see the interop comment below).
280 local repaired=$($SHOW_NAMESPACE |
281 awk '/^dirent_repaired/ { print $2 }')
282 # for interop with old server
283 [ -z "$repaired" ] &&
284 repaired=$($SHOW_NAMESPACE |
285 awk '/^updated_phase1/ { print $2 }')
287 [ $repaired -eq 1 ] ||
288 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
290 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
293 mount_client $MOUNT || error "(6) Fail to start client!"
295 #define OBD_FAIL_FID_LOOKUP 0x1505
296 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
297 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
299 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
301 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
# Fragment of the test registered below as run_test 1c; header/footer and
# some other lines are not visible in this view.
# Visible flow: create "dummy" while fail_loc=0x1504 is set, clear fail_loc,
# run namespace LFSCK (-r) to completion, expect exactly one repair, mount
# the client and list the directory while fail_loc=0x1505 is set.
306 #define OBD_FAIL_FID_IGIF 0x1504
307 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
308 touch $DIR/$tdir/dummy
310 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
312 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
313 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
314 mdd.${MDT_DEV}.lfsck_namespace |
315 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
317 error "(4) unexpected status"
# Read dirent_repaired; when that field is empty fall back to
# updated_phase1 (see the interop comment below).
320 local repaired=$($SHOW_NAMESPACE |
321 awk '/^dirent_repaired/ { print $2 }')
322 # for interop with old server
323 [ -z "$repaired" ] &&
324 repaired=$($SHOW_NAMESPACE |
325 awk '/^updated_phase1/ { print $2 }')
327 [ $repaired -eq 1 ] ||
328 error "(5) Fail to repair lost FID-in-dirent: $repaired"
332 mount_client $MOUNT || error "(6) Fail to start client!"
334 #define OBD_FAIL_FID_LOOKUP 0x1505
335 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
336 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
338 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
340 run_test 1c "LFSCK can find out and repair lost FID-in-dirent"
# Fragment of the test registered below as run_test 2a; header/footer and
# some other lines are not visible in this view.
# Visible flow: create "dummy" while fail_loc=0x1603 is set, clear fail_loc,
# run namespace LFSCK (-r) to completion, expect exactly one repair, mount
# the client, then check the link count and the path2fid/fid2path round trip.
345 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
346 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
347 touch $DIR/$tdir/dummy
349 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
351 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
352 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
353 mdd.${MDT_DEV}.lfsck_namespace |
354 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
356 error "(4) unexpected status"
# Read linkea_repaired; when that field is empty fall back to
# updated_phase2 (see the interop comment below).
359 local repaired=$($SHOW_NAMESPACE |
360 awk '/^linkea_repaired/ { print $2 }')
361 # for interop with old server
362 [ -z "$repaired" ] &&
363 repaired=$($SHOW_NAMESPACE |
364 awk '/^updated_phase2/ { print $2 }')
366 [ $repaired -eq 1 ] ||
367 error "(5) Fail to repair crashed linkEA: $repaired"
371 mount_client $MOUNT || error "(6) Fail to start client!"
373 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
374 error "(7) Fail to stat $DIR/$tdir/dummy"
# The FID obtained from the path must map back to the same path.
376 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
377 local dummyname=$($LFS fid2path $DIR $dummyfid)
378 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
379 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
381 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
# Fragment of the test registered below as run_test 2b; header/footer and
# some other lines are not visible in this view.
# Visible flow: create "dummy" while fail_loc=0x1604 is set, clear fail_loc,
# run namespace LFSCK (-r) to completion, expect updated_phase2 to be 1,
# mount the client, then check the link count and the path2fid/fid2path
# round trip.
387 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
388 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
389 touch $DIR/$tdir/dummy
391 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
393 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
394 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
395 mdd.${MDT_DEV}.lfsck_namespace |
396 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
398 error "(4) unexpected status"
401 local repaired=$($SHOW_NAMESPACE |
402 awk '/^updated_phase2/ { print $2 }')
403 [ $repaired -eq 1 ] ||
404 error "(5) Fail to repair crashed linkEA: $repaired"
408 mount_client $MOUNT || error "(6) Fail to start client!"
410 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
411 error "(7) Fail to stat $DIR/$tdir/dummy"
# The FID obtained from the path must map back to the same path.
413 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
414 local dummyname=$($LFS fid2path $DIR $dummyfid)
415 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
416 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
418 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
# Fragment of the test registered below as run_test 2c; header/footer and
# some other lines are not visible in this view.
# Visible flow: create "dummy" while fail_loc=0x1605 is set, clear fail_loc,
# run namespace LFSCK (-r) to completion, expect updated_phase2 to be 1,
# mount the client, then check the link count and the path2fid/fid2path
# round trip.
424 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
425 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
426 touch $DIR/$tdir/dummy
428 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
430 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
431 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
432 mdd.${MDT_DEV}.lfsck_namespace |
433 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
435 error "(4) unexpected status"
438 local repaired=$($SHOW_NAMESPACE |
439 awk '/^updated_phase2/ { print $2 }')
440 [ $repaired -eq 1 ] ||
441 error "(5) Fail to repair crashed linkEA: $repaired"
445 mount_client $MOUNT || error "(6) Fail to start client!"
447 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
448 error "(7) Fail to stat $DIR/$tdir/dummy"
# The FID obtained from the path must map back to the same path.
450 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
451 local dummyname=$($LFS fid2path $DIR $dummyfid)
452 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
453 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
455 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
# Fragment of the test registered below as run_test 2d; header/footer and
# some other lines are not visible in this view.
# Visible flow: create "dummy" while fail_loc=0x161d is set, clear fail_loc,
# run namespace LFSCK (-r) to completion, expect linkea_repaired to be 1,
# mount the client, then check the link count and the path2fid/fid2path
# round trip.
461 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
462 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
463 touch $DIR/$tdir/dummy
465 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
467 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
468 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
469 mdd.${MDT_DEV}.lfsck_namespace |
470 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
472 error "(4) unexpected status"
475 local repaired=$($SHOW_NAMESPACE |
476 awk '/^linkea_repaired/ { print $2 }')
477 [ $repaired -eq 1 ] ||
478 error "(5) Fail to repair crashed linkEA: $repaired"
482 mount_client $MOUNT || error "(6) Fail to start client!"
484 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
485 error "(7) Fail to stat $DIR/$tdir/dummy"
# The FID obtained from the path must map back to the same path.
487 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
488 local dummyname=$($LFS fid2path $DIR $dummyfid)
489 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
490 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
492 run_test 2d "LFSCK can recover the missing linkEA entry"
# Fragment of the test registered below as run_test 2e; header/footer and
# some other lines are not visible in this view.
# Visible flow: skip when MDSCOUNT < 2; create d0 with `lfs mkdir -i 1`,
# then d0/d1 with `-i 0` while fail_loc=0x1603 is set; start namespace LFSCK
# with -r -A; wait through wait_all_targets_blocked; expect linkea_repaired
# to be 1 and the path2fid/fid2path round trip of d0/d1 to hold.
496 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
500 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
502 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
503 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
504 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
505 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
507 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
509 wait_all_targets_blocked namespace completed 4
511 local repaired=$($SHOW_NAMESPACE |
512 awk '/^linkea_repaired/ { print $2 }')
513 [ $repaired -eq 1 ] ||
514 error "(5) Fail to repair crashed linkEA: $repaired"
516 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
517 local name=$($LFS fid2path $DIR $fid)
518 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
519 error "(6) Fail to repair linkEA: $fid $name"
521 run_test 2e "namespace LFSCK can verify remote object linkEA"
# Fragment of the test registered below as run_test 3; header/footer and
# some other lines are not visible in this view.
# Visible flow: hard-link d0/f0 and d0/f1 into a new "dummy" directory;
# create edir with f0 and f1; add link w0 while fail_loc=0x1603 is set and
# link w1 while fail_loc=0x1604 is set; run namespace LFSCK (-r) to
# completion; expect checked_phase2 >= 4 and multiple_linked_repaired >= 2.
527 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
528 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
529 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
531 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
532 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
533 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
535 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
536 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
537 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
539 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
540 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
541 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
543 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
545 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
546 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
547 mdd.${MDT_DEV}.lfsck_namespace |
548 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
550 error "(10) unexpected status"
553 local checked=$($SHOW_NAMESPACE |
554 awk '/^checked_phase2/ { print $2 }')
555 [ $checked -ge 4 ] ||
556 error "(11) Fail to check multiple-linked object: $checked"
558 local repaired=$($SHOW_NAMESPACE |
559 awk '/^multiple_linked_repaired/ { print $2 }')
560 [ $repaired -ge 2 ] ||
561 error "(12) Fail to repair multiple-linked object: $repaired"
563 run_test 3 "LFSCK can verify multiple-linked objects"
# Fragment of the test registered below as run_test 4; header/footer and
# some other lines are not visible in this view.
# Visible flow: skip unless the MDS backend is ldiskfs; unmount the client,
# stop the MDS, run mds_backup_restore, restart the MDS with the noscrub
# options; start namespace LFSCK (-r) with fail_val=1 fail_loc=0x1601; wait
# for flags "inconsistent" and expect status scanning-phase1; clear
# fail_loc and wait for completed; expect empty flags and at least 9
# repairs; mount the client and list the directory with fail_loc=0x1505.
567 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
568 skip "OI Scrub not implemented for ZFS" && return
571 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
572 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
574 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
575 echo "start $SINGLEMDS with disabling OI scrub"
576 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
577 error "(2) Fail to start MDS!"
579 #define OBD_FAIL_LFSCK_DELAY2 0x1601
580 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
581 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
582 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
583 mdd.${MDT_DEV}.lfsck_namespace |
584 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
586 error "(5) unexpected status"
589 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
590 [ "$STATUS" == "scanning-phase1" ] ||
591 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
593 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
594 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
595 mdd.${MDT_DEV}.lfsck_namespace |
596 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
598 error "(7) unexpected status"
# NOTE(review): FLAGS is assigned without `local` here; whether it was
# declared on a line not visible in this view cannot be told.
601 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
602 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
# Read dirent_repaired; when that field is empty fall back to
# updated_phase1 (see the interop comment below).
604 local repaired=$($SHOW_NAMESPACE |
605 awk '/^dirent_repaired/ { print $2 }')
606 # for interop with old server
607 [ -z "$repaired" ] &&
608 repaired=$($SHOW_NAMESPACE |
609 awk '/^updated_phase1/ { print $2 }')
611 [ $repaired -ge 9 ] ||
612 error "(9) Fail to re-generate FID-in-dirent: $repaired"
616 mount_client $MOUNT || error "(10) Fail to start client!"
618 #define OBD_FAIL_FID_LOOKUP 0x1505
619 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
620 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
621 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
623 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
# Fragment of the test registered below as run_test 5; header/footer and
# some other lines are not visible in this view.
# Visible flow: skip unless the MDS backend is ldiskfs; unmount the client,
# stop the MDS, run `mds_backup_restore $SINGLEMDS 1`, restart the MDS with
# the noscrub options; start namespace LFSCK (-r) with fail_val=1
# fail_loc=0x1601; wait for flags "inconsistent,upgrade" and expect status
# scanning-phase1; clear fail_loc and wait for completed; expect empty flags
# and at least 2 repairs; mount the client, stat and list with
# fail_loc=0x1505, then check the path2fid/fid2path round trip of "dummy".
627 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
628 skip "OI Scrub not implemented for ZFS" && return
631 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
632 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
634 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
635 echo "start $SINGLEMDS with disabling OI scrub"
636 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
637 error "(2) Fail to start MDS!"
639 #define OBD_FAIL_LFSCK_DELAY2 0x1601
640 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
641 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
642 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
643 mdd.${MDT_DEV}.lfsck_namespace |
644 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
646 error "(5) unexpected status"
649 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
650 [ "$STATUS" == "scanning-phase1" ] ||
651 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
653 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
654 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
655 mdd.${MDT_DEV}.lfsck_namespace |
656 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
658 error "(7) unexpected status"
# NOTE(review): FLAGS is assigned without `local` here; whether it was
# declared on a line not visible in this view cannot be told.
661 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
662 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
# Read dirent_repaired; when that field is empty fall back to
# updated_phase1 (see the interop comment below).
664 local repaired=$($SHOW_NAMESPACE |
665 awk '/^dirent_repaired/ { print $2 }')
666 # for interop with old server
667 [ -z "$repaired" ] &&
668 repaired=$($SHOW_NAMESPACE |
669 awk '/^updated_phase1/ { print $2 }')
671 [ $repaired -ge 2 ] ||
672 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
676 mount_client $MOUNT || error "(10) Fail to start client!"
678 #define OBD_FAIL_FID_LOOKUP 0x1505
679 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
680 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
682 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
684 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
685 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
686 local dummyname=$($LFS fid2path $DIR $dummyfid)
687 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
688 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
690 run_test 5 "LFSCK can handle IGIF object upgrading"
# Fragment of the test registered below as run_test 6a; header/footer and
# some other lines (including the tails of the two POS pipelines) are not
# visible in this view.
# Visible flow: start namespace LFSCK (-r) with fail_val=1 fail_loc=0x1600
# and expect scanning-phase1; set fail_loc=0x80001608 and wait for status
# "failed"; read last_checkpoint_position into POS0; re-arm 0x1600, start
# LFSCK without -r and expect scanning-phase1; read latest_start_position
# into POS1 and require POS0 < POS1; clear fail_loc and wait for completed.
695 #define OBD_FAIL_LFSCK_DELAY1 0x1600
696 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
697 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
699 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
700 [ "$STATUS" == "scanning-phase1" ] ||
701 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
703 # Sleep 3 sec to guarantee at least one object processed by LFSCK
705 # Fail the LFSCK to guarantee there is at least one checkpoint
706 #define OBD_FAIL_LFSCK_FATAL1 0x1608
707 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
708 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
709 mdd.${MDT_DEV}.lfsck_namespace |
710 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
712 error "(4) unexpected status"
715 local POS0=$($SHOW_NAMESPACE |
716 awk '/^last_checkpoint_position/ { print $2 }' |
719 #define OBD_FAIL_LFSCK_DELAY1 0x1600
720 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
721 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
723 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
724 [ "$STATUS" == "scanning-phase1" ] ||
725 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
727 local POS1=$($SHOW_NAMESPACE |
728 awk '/^latest_start_position/ { print $2 }' |
730 [[ $POS0 -lt $POS1 ]] ||
731 error "(7) Expect larger than: $POS0, but got $POS1"
733 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
734 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
735 mdd.${MDT_DEV}.lfsck_namespace |
736 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
738 error "(8) unexpected status"
741 run_test 6a "LFSCK resumes from last checkpoint (1)"
# Fragment of the test registered below as run_test 6b; header/footer and
# some other lines (including the tails of the O_POS pipelines and the line
# separating the two position comparisons) are not visible in this view.
# Visible flow: start namespace LFSCK (-r) with fail_val=1 fail_loc=0x1601
# and expect scanning-phase1; set fail_loc=0x80001609 and wait for status
# "failed"; read fields 2 and 4 of last_checkpoint_position (O_POS0,
# D_POS0); re-arm 0x1601, start LFSCK without -r and expect
# scanning-phase1; read the same fields of latest_start_position (O_POS1,
# D_POS1) and compare them; clear fail_loc and wait for completed.
746 #define OBD_FAIL_LFSCK_DELAY2 0x1601
747 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
748 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
750 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
751 [ "$STATUS" == "scanning-phase1" ] ||
752 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
754 # Sleep 5 sec to guarantee that we are in the directory scanning
756 # Fail the LFSCK to guarantee there is at least one checkpoint
757 #define OBD_FAIL_LFSCK_FATAL2 0x1609
758 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
759 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
760 mdd.${MDT_DEV}.lfsck_namespace |
761 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
763 error "(4) unexpected status"
766 local O_POS0=$($SHOW_NAMESPACE |
767 awk '/^last_checkpoint_position/ { print $2 }' |
770 local D_POS0=$($SHOW_NAMESPACE |
771 awk '/^last_checkpoint_position/ { print $4 }')
773 #define OBD_FAIL_LFSCK_DELAY2 0x1601
774 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
775 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
777 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
778 [ "$STATUS" == "scanning-phase1" ] ||
779 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
781 local O_POS1=$($SHOW_NAMESPACE |
782 awk '/^latest_start_position/ { print $2 }' |
784 local D_POS1=$($SHOW_NAMESPACE |
785 awk '/^latest_start_position/ { print $4 }')
# When either D_POS value is the literal N/A, the O_POS values are compared.
787 if [ "$D_POS0" == "N/A" -o "$D_POS1" == "N/A" ]; then
788 [[ $O_POS0 -lt $O_POS1 ]] ||
789 error "(7.1) $O_POS1 is not larger than $O_POS0"
791 [[ $D_POS0 -lt $D_POS1 ]] ||
792 error "(7.2) $D_POS1 is not larger than $D_POS0"
795 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
796 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
797 mdd.${MDT_DEV}.lfsck_namespace |
798 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
800 error "(8) unexpected status"
803 run_test 6b "LFSCK resumes from last checkpoint (2)"
# Fragment of the test registered below as run_test 7a; header/footer and
# some other lines are not visible in this view.
# Visible flow: start namespace LFSCK (-r) with fail_val=1 fail_loc=0x1601
# and expect scanning-phase1; stop the MDS, reset fail_loc/fail_val, start
# the MDS again with MOUNT_OPTS_SCRUB; then wait for status "completed"
# without issuing another lfsck_start.
810 #define OBD_FAIL_LFSCK_DELAY2 0x1601
811 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
812 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
814 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
815 [ "$STATUS" == "scanning-phase1" ] ||
816 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
818 # Sleep 3 sec to guarantee at least one object processed by LFSCK
820 echo "stop $SINGLEMDS"
821 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop MDS!"
823 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
824 echo "start $SINGLEMDS"
825 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
826 error "(5) Fail to start MDS!"
828 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
829 mdd.${MDT_DEV}.lfsck_namespace |
830 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
832 error "(6) unexpected status"
835 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
# Fragment of the test registered below as run_test 7b; header/footer, the
# end of the for loop and some other lines are not visible in this view.
# Visible flow: create dummy0..dummy19 while fail_loc=0x1604 is set; start
# namespace LFSCK (-r) with fail_val=1 fail_loc=0x1602 and wait for status
# scanning-phase2; stop the MDS, reset fail_loc/fail_val, start the MDS
# again with MOUNT_OPTS_SCRUB; then wait for status "completed" without
# issuing another lfsck_start.
841 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
842 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
843 for ((i = 0; i < 20; i++)); do
844 touch $DIR/$tdir/dummy${i}
847 #define OBD_FAIL_LFSCK_DELAY3 0x1602
848 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
849 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
850 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
851 mdd.${MDT_DEV}.lfsck_namespace |
852 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
854 error "(4) unexpected status"
858 echo "stop $SINGLEMDS"
859 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop MDS!"
861 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
862 echo "start $SINGLEMDS"
863 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
864 error "(6) Fail to start MDS!"
866 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
867 mdd.${MDT_DEV}.lfsck_namespace |
868 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
870 error "(7) unexpected status"
873 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
# Fragment of the test registered at the end of this block as
# run_test 8 "LFSCK state machine"; the header/footer, loop terminators and
# some other lines are not visible in this view.
# Statuses checked by the visible code, in order: init, scanning-phase1,
# stopped, scanning-phase1, failed, scanning-phase1, crashed,
# scanning-phase1, paused, paused, scanning-phase2, completed.
878 formatall > /dev/null
884 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
885 [ "$STATUS" == "init" ] ||
886 error "(2) Expect 'init', but got '$STATUS'"
# Create one directory with fail_loc=0x1603 set and five files with
# fail_loc=0x1604 set, then unmount the client.
888 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
889 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
890 mkdir $DIR/$tdir/crashed
892 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
893 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
894 for ((i = 0; i < 5; i++)); do
895 touch $DIR/$tdir/dummy${i}
898 umount_client $MOUNT || error "(3) Fail to stop client!"
# start -> scanning-phase1 -> lfsck_stop -> stopped -> start ->
# scanning-phase1
900 #define OBD_FAIL_LFSCK_DELAY2 0x1601
901 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
902 $START_NAMESPACE || error "(4) Fail to start LFSCK for namespace!"
904 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
905 [ "$STATUS" == "scanning-phase1" ] ||
906 error "(5) Expect 'scanning-phase1', but got '$STATUS'"
908 $STOP_LFSCK || error "(6) Fail to stop LFSCK!"
910 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
911 [ "$STATUS" == "stopped" ] ||
912 error "(7) Expect 'stopped', but got '$STATUS'"
914 $START_NAMESPACE || error "(8) Fail to start LFSCK for namespace!"
916 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
917 [ "$STATUS" == "scanning-phase1" ] ||
918 error "(9) Expect 'scanning-phase1', but got '$STATUS'"
# Setting fail_loc=0x80001609 is expected to drive the status to "failed".
920 #define OBD_FAIL_LFSCK_FATAL2 0x1609
921 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
922 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
923 mdd.${MDT_DEV}.lfsck_namespace |
924 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
926 error "(10) unexpected status"
929 #define OBD_FAIL_LFSCK_DELAY1 0x1600
930 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
931 $START_NAMESPACE || error "(11) Fail to start LFSCK for namespace!"
933 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
934 [ "$STATUS" == "scanning-phase1" ] ||
935 error "(12) Expect 'scanning-phase1', but got '$STATUS'"
# Stop the MDS with fail_loc=0x160a set and start it with fail_loc=0x160b
# set; the status is then expected to be "crashed".
937 #define OBD_FAIL_LFSCK_CRASH 0x160a
938 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
941 echo "stop $SINGLEMDS"
942 stop $SINGLEMDS > /dev/null || error "(13) Fail to stop MDS!"
944 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
945 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
947 echo "start $SINGLEMDS"
948 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
949 error "(14) Fail to start MDS!"
# Poll mdt.<MDT>.recovery_status until it is no longer RECOVERING, bounded
# by max_recovery_time. The initialisation and increment of `timer` for
# this loop are on lines not visible in this view.
951 local timeout=$(max_recovery_time)
954 while [ $timer -lt $timeout ]; do
955 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
956 mdt.${MDT_DEV}.recovery_status |
957 awk '/^status/ { print \\\$2 }'")
958 [ "$STATUS" != "RECOVERING" ] && break;
963 [ $timer != $timeout ] ||
964 error "(14.1) recovery timeout"
966 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
967 [ "$STATUS" == "crashed" ] ||
968 error "(15) Expect 'crashed', but got '$STATUS'"
# Start LFSCK again, restart the MDS with fail_loc=0x160b set, and expect
# the status "paused" after recovery.
970 #define OBD_FAIL_LFSCK_DELAY2 0x1601
971 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
972 $START_NAMESPACE || error "(16) Fail to start LFSCK for namespace!"
974 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
975 [ "$STATUS" == "scanning-phase1" ] ||
976 error "(17) Expect 'scanning-phase1', but got '$STATUS'"
978 echo "stop $SINGLEMDS"
979 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop MDS!"
981 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
982 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
984 echo "start $SINGLEMDS"
985 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
986 error "(19) Fail to start MDS!"
989 while [ $timer -lt $timeout ]; do
990 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
991 mdt.${MDT_DEV}.recovery_status |
992 awk '/^status/ { print \\\$2 }'")
993 [ "$STATUS" != "RECOVERING" ] && break;
998 [ $timer != $timeout ] ||
999 error "(19.1) recovery timeout"
1001 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1002 [ "$STATUS" == "paused" ] ||
1003 error "(20) Expect 'paused', but got '$STATUS'"
# Restart once more, this time with the skip_lfsck mount options; the
# status is expected to remain "paused".
1005 echo "stop $SINGLEMDS"
1006 stop $SINGLEMDS > /dev/null || error "(20.1) Fail to stop MDS!"
1008 echo "start $SINGLEMDS without resume LFSCK"
1009 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SKIP_LFSCK > /dev/null ||
1010 error "(20.2) Fail to start MDS!"
1013 while [ $timer -lt $timeout ]; do
1014 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1015 mdt.${MDT_DEV}.recovery_status |
1016 awk '/^status/ { print \\\$2 }'")
1017 [ "$STATUS" != "RECOVERING" ] && break;
1019 timer=$((timer + 1))
1022 [ $timer != $timeout ] ||
1023 error "(20.3) recovery timeout"
1025 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1026 [ "$STATUS" == "paused" ] ||
1027 error "(20.4) Expect 'paused', but got '$STATUS'"
# Final run: with fail_val=2 fail_loc=0x1602 wait for scanning-phase2 and
# check the flags, then clear fail_loc, wait for completed and expect
# empty flags.
1029 #define OBD_FAIL_LFSCK_DELAY3 0x1602
1030 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
1032 $START_NAMESPACE || error "(21) Fail to start LFSCK for namespace!"
1033 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1034 mdd.${MDT_DEV}.lfsck_namespace |
1035 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
1037 error "(22) unexpected status"
1040 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1041 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
1042 error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
1044 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1045 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1046 mdd.${MDT_DEV}.lfsck_namespace |
1047 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1049 error "(24) unexpected status"
1052 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1053 [ -z "$FLAGS" ] || error "(25) Expect empty flags, but got '$FLAGS'"
1055 run_test 8 "LFSCK state machine"
# Test 9a body ("LFSCK speed control (1)"): checks that the layout LFSCK's
# reported average_speed_phase1 stays within bounds derived from the
# configured speed limit, before and after the limit is changed.
# NOTE(review): the function header, the closing fi/}/ lines, and the
# assignments of RUN_TIME1, RUN_TIME2 and TIME_DIFF (plus, presumably, the
# sleeps between measurements) are not among the lines visible here.
# Skip when /proc/cpuinfo has no "processor ... : 1" entry.
1058 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1059 skip "Testing on UP system, the speed may be inaccurate."
# Populate a directory with 5000 files for the scan to walk.
1063 check_mount_and_prep
1064 $LFS mkdir -i 0 $DIR/$tdir/lfsck || error "(1) Fail to mkdir lfsck"
1065 $LFS setstripe -c 1 -i -1 $DIR/$tdir/lfsck
1066 createmany -o $DIR/$tdir/lfsck/f 5000
# First speed limit, passed to lfsck_start through -s.
1068 local BASE_SPEED1=100
1070 $START_LAYOUT -r -s $BASE_SPEED1 || error "(2) Fail to start LFSCK!"
# The scan must still be in phase 1 when it is sampled.
1073 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
1074 [ "$STATUS" == "scanning-phase1" ] ||
1075 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
1077 local SPEED=$($SHOW_LAYOUT |
1078 awk '/^average_speed_phase1/ { print $2 }')
1080 # There may be time error, normally it should be less than 2 seconds.
1081 # We allow another 20% schedule error.
1083 # MAX_MARGIN = 1.3 = 13 / 10
# Upper bound: the limit scaled by (RUN_TIME1 + TIME_DIFF) / RUN_TIME1 and
# by 13/10, all in integer arithmetic.
1084 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1085 RUN_TIME1 * 13 / 10))
1086 [ $SPEED -lt $MAX_SPEED ] || {
1088 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1089 error "(4) Speed $SPEED, expected < $MAX_SPEED"
1092 # adjust speed limit
1093 local BASE_SPEED2=300
1095 do_facet $SINGLEMDS \
1096 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
# Re-sample the average speed after the limit change.
1099 SPEED=$($SHOW_LAYOUT | awk '/^average_speed_phase1/ { print $2 }')
1100 # MIN_MARGIN = 0.7 = 7 / 10
# Lower bound: time-weighted mix of both limits, each interval shortened by
# TIME_DIFF, divided by the total run time and scaled by 7/10.
1101 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1102 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1103 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
# A too-slow result is routed through error_ignore (tagged LU-5624) when
# the MDS backend is not ldiskfs.
# NOTE(review): the else branch that owns the "(5.2)" message is not fully
# visible here; presumably it calls error - confirm.
1104 [ $SPEED -gt $MIN_SPEED ] || {
1105 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1106 error_ignore LU-5624 \
1107 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
1110 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
1114 # MAX_MARGIN = 1.3 = 13 / 10
1115 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1116 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1117 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1118 [ $SPEED -lt $MAX_SPEED ] || {
1120 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1121 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1122 error "(6) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit to 0 on every MDT and OST node, then wait for the
# layout scan to report "completed".
1125 do_nodes $(comma_list $(mdts_nodes)) \
1126 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1127 do_nodes $(comma_list $(osts_nodes)) \
1128 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1130 wait_update_facet $SINGLEMDS \
1131 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout |
1132 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
1133 error "(7) Failed to get expected 'completed'"
1135 run_test 9a "LFSCK speed control (1)"
# Test 9b body ("LFSCK speed control (2)"): same bounds check as 9a, but on
# the namespace LFSCK's average_speed_phase2.
# NOTE(review): the function header, loop/if terminators, and the
# assignments of RUN_TIME1, RUN_TIME2 and TIME_DIFF are not among the lines
# visible here.
# Skip when /proc/cpuinfo has no "processor ... : 1" entry.
1138 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1139 skip "Testing on UP system, the speed may be inaccurate."
1145 echo "Preparing another 50 * 50 files (with error) at $(date)."
1146 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
# With the fail_loc armed, create 50 directories, 50 files, and 50 files
# inside each directory.
1147 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1148 createmany -d $DIR/$tdir/d 50
1149 createmany -m $DIR/$tdir/f 50
1150 for ((i = 0; i < 50; i++)); do
1151 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
1154 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
# First run: with this fail_loc the scan is expected to end up "stopped".
1155 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1156 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
1157 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1158 mdd.${MDT_DEV}.lfsck_namespace |
1159 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1161 error "(5) unexpected status"
1164 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1165 echo "Prepared at $(date)."
# Second run (no -r): started with the first speed limit and expected to be
# in phase 2 when sampled.
1167 local BASE_SPEED1=50
1169 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
1172 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1173 [ "$STATUS" == "scanning-phase2" ] ||
1174 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
1176 local SPEED=$($SHOW_NAMESPACE |
1177 awk '/^average_speed_phase2/ { print $2 }')
1178 # There may be time error, normally it should be less than 2 seconds.
1179 # We allow another 20% schedule error.
1181 # MAX_MARGIN = 1.3 = 13 / 10
1182 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1183 RUN_TIME1 * 13 / 10))
1184 [ $SPEED -lt $MAX_SPEED ] || {
1186 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1187 error "(8) Speed $SPEED, expected < $MAX_SPEED"
1190 # adjust speed limit
1191 local BASE_SPEED2=150
1193 do_facet $SINGLEMDS \
1194 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
# Re-sample after the limit change and check the time-weighted bounds.
1197 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
1198 # MIN_MARGIN = 0.7 = 7 / 10
1199 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1200 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1201 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
# A too-slow result is routed through error_ignore (tagged LU-5624) when
# the MDS backend is not ldiskfs.
# NOTE(review): the else branch that owns the "(9.2)" message is not fully
# visible here; presumably it calls error - confirm.
1202 [ $SPEED -gt $MIN_SPEED ] || {
1203 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1204 error_ignore LU-5624 \
1205 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1208 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
1212 # MAX_MARGIN = 1.3 = 13 / 10
1213 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1214 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1215 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1216 [ $SPEED -lt $MAX_SPEED ] || {
1218 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1219 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1220 error "(10) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit to 0 on every MDT and OST node, then wait for the
# namespace scan to report "completed".
1223 do_nodes $(comma_list $(mdts_nodes)) \
1224 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1225 do_nodes $(comma_list $(osts_nodes)) \
1226 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1227 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1228 mdd.${MDT_DEV}.lfsck_namespace |
1229 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1231 error "(11) unexpected status"
1234 run_test 9b "LFSCK speed control (2)"
# Test 10 body ("System is available during LFSCK scanning"): runs ordinary
# client operations while a rate-limited namespace LFSCK is in phase 1 and
# checks that they succeed and that the scan is still in phase 1 afterwards.
# NOTE(review): the function header, the loop terminators and any lines
# between the visible ones (e.g. sleeps) are not among the lines shown here.
# Only run when the MDS backend is ldiskfs.
1238 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
1239 skip "lookup(..)/linkea on ZFS issue" && return
1243 echo "Preparing more files with error at $(date)."
1244 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1245 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
# Even indices 0..998 are created under fail_loc 0x1603.
1247 for ((i = 0; i < 1000; i = $((i+2)))); do
1248 mkdir -p $DIR/$tdir/d${i}
1249 touch $DIR/$tdir/f${i}
1250 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1253 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1254 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
# Odd indices 1..999 are created under fail_loc 0x1604.
1256 for ((i = 1; i < 1000; i = $((i+2)))); do
1257 mkdir -p $DIR/$tdir/d${i}
1258 touch $DIR/$tdir/f${i}
1259 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1262 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1263 echo "Prepared at $(date)."
# Add a hard link to f200, then remount the client.
1265 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
1267 umount_client $MOUNT
1268 mount_client $MOUNT || error "(3) Fail to start client!"
# Start the namespace scan with -s 100 and require it to be in phase 1.
1270 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1273 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1274 [ "$STATUS" == "scanning-phase1" ] ||
1275 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Client operations that must succeed while the scan is running: recursive
# listing, create, mkdir, unlink, recursive remove, rename, hard link and
# symbolic link.
1277 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1279 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1281 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1283 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1285 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1287 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1289 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1291 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1292 error "(14) Fail to softlink!"
# The scan must still be in phase 1 after those operations.
1294 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1295 [ "$STATUS" == "scanning-phase1" ] ||
1296 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
# Set lfsck_speed_limit to 0 on every MDT and OST node and wait for
# "completed".
1298 do_nodes $(comma_list $(mdts_nodes)) \
1299 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1300 do_nodes $(comma_list $(osts_nodes)) \
1301 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1302 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1303 mdd.${MDT_DEV}.lfsck_namespace |
1304 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1306 error "(16) unexpected status"
1309 run_test 10 "System is available during LFSCK scanning"
# Remove the LAST_ID and d0/0 entries under O/${idx} on OST facet ost${ost},
# with the target mounted locally through mount_fstype for the duration.
# NOTE(review): ${ost} and ${idx} are assigned on lines not visible here;
# the visible caller invokes "ost_remove_lastid 1 0", so presumably they
# are $1 and $2 - confirm.
# Returns 1 if mount_fstype fails and 2 if unmount_fstype fails.
1312 ost_remove_lastid() {
1315 local rcmd="do_facet ost${ost}"
1317 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1319 # step 1: local mount
1320 mount_fstype ost${ost} || return 1
1321 # step 2: remove the specified LAST_ID
# The {LAST_ID,d0/0} brace list is expanded by the local shell before the
# command is handed to do_facet; rm -f ignores entries that do not exist.
1322 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
1324 unmount_fstype ost${ost} || return 2
# Test 11a body ("LFSCK can rebuild lost last_id"): removes LAST_ID on ost1,
# restarts the OST, and checks that a layout LFSCK on the OST first reports
# the "crashed_lastid" flag and ends "completed" with no flags.
# NOTE(review): the function header and the lines between 1330 and 1335
# (presumably stopping the client/ost1 before the removal) are not visible
# here.
1328 check_mount_and_prep
1329 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1330 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
1335 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
# Bring ost1 back using $MOUNT_OPTS_NOSCRUB.
1337 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1338 error "(2) Fail to start ost1"
1340 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1341 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1343 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1344 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
# While the fail_loc is armed, the flags field must show "crashed_lastid".
1346 wait_update_facet ost1 "$LCTL get_param -n \
1347 obdfilter.${OST_DEV}.lfsck_layout |
1348 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1350 error "(5) unexpected status"
# Clear the fail_loc and wait for the scan to finish.
1353 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1355 wait_update_facet ost1 "$LCTL get_param -n \
1356 obdfilter.${OST_DEV}.lfsck_layout |
1357 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1359 error "(6) unexpected status"
1362 echo "the LAST_ID(s) should have been rebuilt"
# After completion the flags field must be empty.
1363 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1364 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
1366 run_test 11a "LFSCK can rebuild lost last_id"
# Test 11b body ("LFSCK can rebuild crashed last_id"): makes the last_id
# value seen after an ost1 restart lag behind the value seen before it,
# then checks that a layout LFSCK on the OST restores the earlier value.
# NOTE(review): the function header, the loop terminator and any lines
# inside the polling loop after the break test are not visible here.
1369 check_mount_and_prep
1370 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1372 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1373 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1374 do_facet ost1 $LCTL set_param fail_loc=0x160d
# Create 32 more files than the count returned by precreated_ost_obj_count.
1376 local count=$(precreated_ost_obj_count 0 0)
1378 createmany -o $DIR/$tdir/f $((count + 32))
# Record the last_id value for the sequence the MDT is allocating from.
# The last_id output is matched on $seq and split on ':'.
1380 local proc_path="${FSNAME}-OST0000-osc-MDT0000"
1381 local seq=$(do_facet mds1 $LCTL get_param -n \
1382 osp.${proc_path}.prealloc_last_seq)
1383 local lastid1=$(do_facet ost1 "lctl get_param -n \
1384 obdfilter.${ost1_svc}.last_id" | grep $seq |
1385 awk -F: '{ print $2 }')
1387 umount_client $MOUNT
1388 stop ost1 || error "(1) Fail to stop ost1"
1390 #define OBD_FAIL_OST_ENOSPC 0x215
1391 do_facet ost1 $LCTL set_param fail_loc=0x215
1393 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1394 error "(2) Fail to start ost1"
# Poll (at most 60 iterations) until last_id reports a value for $seq.
1396 for ((i = 0; i < 60; i++)); do
1397 lastid2=$(do_facet ost1 "lctl get_param -n \
1398 obdfilter.${ost1_svc}.last_id" | grep $seq |
1399 awk -F: '{ print $2 }')
1400 [ ! -z $lastid2 ] && break;
1404 echo "the on-disk LAST_ID should be smaller than the expected one"
1405 [ $lastid1 -gt $lastid2 ] ||
1406 error "(4) expect lastid1 [ $lastid1 ] > lastid2 [ $lastid2 ]"
1408 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1409 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1411 wait_update_facet ost1 "$LCTL get_param -n \
1412 obdfilter.${OST_DEV}.lfsck_layout |
1413 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1415 error "(6) unexpected status"
# Restart ost1 again so last_id is read after the LFSCK run.
1418 stop ost1 || error "(7) Fail to stop ost1"
1420 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1421 error "(8) Fail to start ost1"
1423 echo "the on-disk LAST_ID should have been rebuilt"
# The value for $seq must return to $lastid1; on failure the raw last_id
# output is printed before the error.
1424 wait_update_facet ost1 "$LCTL get_param -n \
1425 obdfilter.${ost1_svc}.last_id | grep $seq |
1426 awk -F: '{ print \\\$2 }'" "$lastid1" 60 || {
1427 do_facet ost1 $LCTL get_param -n \
1428 obdfilter.${ost1_svc}.last_id
1429 error "(9) expect lastid1 $seq:$lastid1"
1432 do_facet ost1 $LCTL set_param fail_loc=0
1433 stopall || error "(10) Fail to stopall"
1435 run_test 11b "LFSCK can rebuild crashed last_id"
# Test 12a body ("single command to trigger LFSCK on all devices"): drives
# start / stop / restart of the namespace and then the layout LFSCK with -A
# from mds1 and checks the status reached on the targets after each step.
# NOTE(review): the function header and loop terminators are not visible
# here. The trailing number passed to wait_all_targets and
# wait_all_targets_blocked looks like the step id used in their failure
# message - confirm against those helpers.
1438 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
1440 check_mount_and_prep
# One directory per MDT (index k-1), each holding 100 files.
1441 for k in $(seq $MDSCOUNT); do
1442 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1443 createmany -o $DIR/$tdir/${k}/f 100 ||
1444 error "(0) Fail to create 100 files."
1447 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1448 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1449 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1451 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1452 wait_all_targets namespace scanning-phase1 3
1454 echo "Stop namespace LFSCK on all targets by single lctl command."
1455 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1456 error "(4) Fail to stop LFSCK on all devices!"
1458 echo "All the LFSCK targets should be in 'stopped' status."
1459 wait_all_targets_blocked namespace stopped 5
1461 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1462 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1463 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1465 echo "All the LFSCK targets should be in 'completed' status."
1466 wait_all_targets_blocked namespace completed 7
# Full debug logging is enabled for the layout half of the test only.
1468 start_full_debug_logging
1470 echo "Start layout LFSCK on all targets by single command (-s 1)."
1471 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1472 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1474 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1475 wait_all_targets layout scanning-phase1 9
1477 echo "Stop layout LFSCK on all targets by single lctl command."
1478 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1479 error "(10) Fail to stop LFSCK on all devices!"
1481 echo "All the LFSCK targets should be in 'stopped' status."
1482 wait_all_targets_blocked layout stopped 11
# Additionally check each OST's own lfsck_layout status directly.
1484 for k in $(seq $OSTCOUNT); do
1485 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1486 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1487 awk '/^status/ { print $2 }')
1488 [ "$STATUS" == "stopped" ] ||
1489 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1492 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1493 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1494 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1496 echo "All the LFSCK targets should be in 'completed' status."
1497 wait_all_targets_blocked layout completed 14
1499 stop_full_debug_logging
1501 run_test 12a "single command to trigger LFSCK on all devices"
# Test 12b body ("auto detect Lustre device"): starts LFSCK with -A but no
# -M and expects both namespace and layout scans to complete; on a node with
# more than one MDT device, a start without -M/-A is expected to fail.
# NOTE(review): the function header and the closing fi are not visible here.
1504 check_mount_and_prep
1506 echo "Start LFSCK without '-M' specified."
1507 do_facet mds1 $LCTL lfsck_start -A -r ||
1508 error "(0) Fail to start LFSCK without '-M'"
1510 wait_all_targets_blocked namespace completed 1
1511 wait_all_targets_blocked layout completed 2
# Count the "lctl dl" rows on mds1 whose third column contains "mdt".
1513 local count=$(do_facet mds1 $LCTL dl |
1514 awk '{ print $3 }' | grep mdt | wc -l)
1515 if [ $count -gt 1 ]; then
1517 echo "Start layout LFSCK on the node with multipe targets,"
1518 echo "but not specify '-M'/'-A' option. Should get failure."
# A successful start is the failure case here; "|| true" keeps the expected
# non-zero exit from propagating.
1520 do_facet mds1 $LCTL lfsck_start -t layout -r &&
1521 error "(3) Start layout LFSCK should fail" || true
1524 run_test 12b "auto detect Lustre device"
# Test 13 body ("LFSCK can repair crashed lmm_oi"): creates two files while
# fail_loc 0x160f is armed on the MDS, runs a layout LFSCK and expects
# repaired_others to equal 2.
# NOTE(review): the function header is not visible here.
1528 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1529 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1530 echo "MDT-object FID."
1533 check_mount_and_prep
1535 echo "Inject failure stub to simulate bad lmm_oi"
1536 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1537 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
# f0 comes from createmany; f1 is created with a two-component (-E) layout.
1538 createmany -o $DIR/$tdir/f 1
1539 $LFS setstripe -E 1M -S 1M -E -1 $DIR/$tdir/f1 ||
1540 error "(0) Fail to create PFL $DIR/$tdir/f1"
1541 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1543 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1544 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1546 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1547 mdd.${MDT_DEV}.lfsck_layout |
1548 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1550 error "(2) unexpected status"
# Exactly two repairs are expected, matching the two files created above.
1553 local repaired=$($SHOW_LAYOUT |
1554 awk '/^repaired_others/ { print $2 }')
1555 [ $repaired -eq 2 ] ||
1556 error "(3) Fail to repair crashed lmm_oi: $repaired"
1558 run_test 13 "LFSCK can repair crashed lmm_oi"
# Test 14a body ("LFSCK can repair MDT-object with dangling LOV EA
# reference (1)"): creates files while fail_loc 0x1610 is armed on ost1,
# runs a layout LFSCK without -c (files must stay un-stat-able), then with
# -c, after which the guard files must stat with size 0.
# NOTE(review): the function header, loop terminator, closing braces and the
# line between "stopall to cleanup object cache" and setupall (presumably
# the stopall call itself) are not visible here.
1562 echo "The OST-object referenced by the MDT-object should be there;"
1563 echo "otherwise, the LFSCK should re-create the missing OST-object."
1564 echo "without '--delay-create-ostobj' option."
1567 check_mount_and_prep
1568 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1570 echo "Inject failure stub to simulate dangling referenced MDT-object"
1571 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1572 do_facet ost1 $LCTL set_param fail_loc=0x1610
# Create 16 more files than the count returned by precreated_ost_obj_count,
# plus a guard file.
1573 local count=$(precreated_ost_obj_count 0 0)
1575 createmany -o $DIR/$tdir/f $((count + 16)) ||
1576 error "(0.1) Fail to create $DIR/$tdir/fx"
1577 touch $DIR/$tdir/guard0
# 16 composite-layout files, followed by a second guard file.
1579 for ((i = 0; i < 16; i++)); do
1580 $LFS setstripe -E 512K -S 256K -o 0 -E 2M \
1581 $DIR/$tdir/f_comp${i} ||
1582 error "(0.2) Fail to create $DIR/$tdir/f_comp${i}"
1584 touch $DIR/$tdir/guard1
1586 do_facet ost1 $LCTL set_param fail_loc=0
1588 start_full_debug_logging
1590 # exhaust other pre-created dangling cases
1591 count=$(precreated_ost_obj_count 0 0)
1592 createmany -o $DIR/$tdir/a $count ||
1593 error "(0.5) Fail to create $count files."
1595 echo "'ls' should fail because of dangling referenced MDT-object"
1596 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
1598 echo "Trigger layout LFSCK to find out dangling reference"
1599 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1601 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1602 mdd.${MDT_DEV}.lfsck_layout |
1603 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1605 error "(3) unexpected status"
# At least 32 dangling references must be counted in this first pass.
1608 local repaired=$($SHOW_LAYOUT |
1609 awk '/^repaired_dangling/ { print $2 }')
1610 [ $repaired -ge 32 ] ||
1611 error "(4) Fail to repair dangling reference: $repaired"
1613 echo "'stat' should fail because of not repair dangling by default"
1614 stat $DIR/$tdir/guard0 > /dev/null 2>&1 &&
1615 error "(5.1) stat should fail"
1616 stat $DIR/$tdir/guard1 > /dev/null 2>&1 &&
1617 error "(5.2) stat should fail"
1619 echo "Trigger layout LFSCK to repair dangling reference"
# Second pass adds -c to the start command.
1620 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1622 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1623 mdd.${MDT_DEV}.lfsck_layout |
1624 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1626 error "(7) unexpected status"
1629 # There may be some async LFSCK updates in processing, wait for
1630 # a while until the target reparation has been done. LU-4970.
1632 echo "'stat' should success after layout LFSCK repairing"
# Poll until stat reports size 0 for each guard file; on failure the stat
# output is shown before the error.
1633 wait_update_facet client "stat $DIR/$tdir/guard0 |
1634 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1635 stat $DIR/$tdir/guard0
1637 error "(8.1) unexpected size"
1640 wait_update_facet client "stat $DIR/$tdir/guard1 |
1641 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1642 stat $DIR/$tdir/guard1
1644 error "(8.2) unexpected size"
1647 repaired=$($SHOW_LAYOUT |
1648 awk '/^repaired_dangling/ { print $2 }')
1649 [ $repaired -ge 32 ] ||
1650 error "(9) Fail to repair dangling reference: $repaired"
1652 stop_full_debug_logging
1654 echo "stopall to cleanup object cache"
1657 setupall > /dev/null
1659 run_test 14a "LFSCK can repair MDT-object with dangling LOV EA reference (1)"
# Test 14b body ("LFSCK can repair MDT-object with dangling LOV EA
# reference (2)"): same scenario as 14a with a single guard file, but the
# layout LFSCK is started with -o -d (and -c on the second pass) and
# completion is awaited through wait_all_targets_blocked.
# NOTE(review): the function header, closing braces and the line between
# "stopall to cleanup object cache" and setupall (presumably the stopall
# call itself) are not visible here.
1663 echo "The OST-object referenced by the MDT-object should be there;"
1664 echo "otherwise, the LFSCK should re-create the missing OST-object."
1665 echo "with '--delay-create-ostobj' option."
1668 check_mount_and_prep
1669 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1671 echo "Inject failure stub to simulate dangling referenced MDT-object"
1672 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1673 do_facet ost1 $LCTL set_param fail_loc=0x1610
# Create 31 more files than the count returned by precreated_ost_obj_count,
# plus one guard file (32 beyond that count in total).
1674 local count=$(precreated_ost_obj_count 0 0)
1676 createmany -o $DIR/$tdir/f $((count + 31))
1677 touch $DIR/$tdir/guard
1678 do_facet ost1 $LCTL set_param fail_loc=0
1680 start_full_debug_logging
1682 # exhaust other pre-created dangling cases
1683 count=$(precreated_ost_obj_count 0 0)
1684 createmany -o $DIR/$tdir/a $count ||
1685 error "(0) Fail to create $count files."
1687 echo "'ls' should fail because of dangling referenced MDT-object"
1688 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
1690 echo "Trigger layout LFSCK to find out dangling reference"
1691 $START_LAYOUT -r -o -d || error "(2) Fail to start LFSCK for layout!"
1693 wait_all_targets_blocked layout completed 3
# At least 32 dangling references must be counted in this first pass.
1695 local repaired=$($SHOW_LAYOUT |
1696 awk '/^repaired_dangling/ { print $2 }')
1697 [ $repaired -ge 32 ] ||
1698 error "(4) Fail to repair dangling reference: $repaired"
1700 echo "'stat' should fail because of not repair dangling by default"
1701 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
1703 echo "Trigger layout LFSCK to repair dangling reference"
# Second pass adds -c to the start command.
1704 $START_LAYOUT -r -o -c -d || error "(6) Fail to start LFSCK for layout!"
1706 wait_all_targets_blocked layout completed 7
1708 # There may be some async LFSCK updates in processing, wait for
1709 # a while until the target reparation has been done. LU-4970.
1711 echo "'stat' should success after layout LFSCK repairing"
# Poll until stat reports size 0 for the guard file; on failure the stat
# output is shown before the error.
1712 wait_update_facet client "stat $DIR/$tdir/guard |
1713 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1714 stat $DIR/$tdir/guard
1716 error "(8) unexpected size"
1719 repaired=$($SHOW_LAYOUT |
1720 awk '/^repaired_dangling/ { print $2 }')
1721 [ $repaired -ge 32 ] ||
1722 error "(9) Fail to repair dangling reference: $repaired"
1724 stop_full_debug_logging
1726 echo "stopall to cleanup object cache"
1729 setupall > /dev/null
1731 run_test 14b "LFSCK can repair MDT-object with dangling LOV EA reference (2)"
# Test 15a body ("LFSCK can repair unmatched MDT-object/OST-object pairs
# (1)"): writes files while fail_loc 0x1611 is armed on all OST nodes, runs
# a layout LFSCK and expects repaired_unmatched_pair to be at least 3.
# NOTE(review): the function header, the closing brace and the continuation
# line that names the setstripe target (between 1749 and 1751) are not
# visible here.
1735 echo "If the OST-object referenced by the MDT-object back points"
1736 echo "to some non-exist MDT-object, then the LFSCK should repair"
1737 echo "the OST-object to back point to the right MDT-object."
1740 check_mount_and_prep
1741 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1743 echo "Inject failure stub to make the OST-object to back point to"
1744 echo "non-exist MDT-object."
1745 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
1747 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
1748 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1749 $LFS setstripe -E 1M -S 256K -c 1 -E -1 -S 512K -c $OSTCOUNT \
1751 error "(0) Fail to create PFL $DIR/$tdir/f1"
1752 # 'dd' will trigger punch RPC firstly on every OST-objects.
1753 # So even though some OST-object will not be write by 'dd',
1754 # as long as it is allocated (may be NOT allocated in pfl_3b)
1755 # its layout information will be set also.
# 257 blocks of 4K write just past the 1M boundary of the first component.
1756 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4K count=257
1757 cancel_lru_locks osc
1758 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1760 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1761 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1763 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1764 mdd.${MDT_DEV}.lfsck_layout |
1765 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1767 error "(2) unexpected status"
1770 local repaired=$($SHOW_LAYOUT |
1771 awk '/^repaired_unmatched_pair/ { print $2 }')
1772 [ $repaired -ge 3 ] ||
1773 error "(3) Fail to repair unmatched pair: $repaired"
1775 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# Test 15b body ("LFSCK can repair unmatched MDT-object/OST-object pairs
# (2)"): writes a guard file first, then writes files while fail_loc 0x1612
# is armed on all OST nodes, runs a layout LFSCK and expects
# repaired_unmatched_pair to equal 4.
# NOTE(review): the function header, the closing brace, the initial
# assignment of "stripes" (before line 1795) and the continuation line that
# names the setstripe target (between 1800 and 1802) are not visible here.
1779 echo "If the OST-object referenced by the MDT-object back points"
1780 echo "to other MDT-object that doesn't recognize the OST-object,"
1781 echo "then the LFSCK should repair it to back point to the right"
1782 echo "MDT-object (the first one)."
1785 check_mount_and_prep
1786 mkdir -p $DIR/$tdir/0
1787 $LFS setstripe -c 1 -i 0 $DIR/$tdir/0
1788 dd if=/dev/zero of=$DIR/$tdir/0/guard bs=1M count=1
1789 cancel_lru_locks osc
1791 echo "Inject failure stub to make the OST-object to back point to"
1792 echo "other MDT-object"
# Use two stripes for the first component when at least two OSTs exist.
1795 [ $OSTCOUNT -ge 2 ] && stripes=2
1797 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1798 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1612
1799 dd if=/dev/zero of=$DIR/$tdir/0/f0 bs=1M count=1
1800 $LFS setstripe -E 1M -S 256K -c $stripes -E 2M -S 512K -c 1 \
1802 error "(0) Fail to create PFL $DIR/$tdir/f1"
1803 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2
1804 cancel_lru_locks osc
1805 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1807 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1808 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1810 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1811 mdd.${MDT_DEV}.lfsck_layout |
1812 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1814 error "(2) unexpected status"
1817 local repaired=$($SHOW_LAYOUT |
1818 awk '/^repaired_unmatched_pair/ { print $2 }')
1819 [ $repaired -eq 4 ] ||
1820 error "(3) Fail to repair unmatched pair: $repaired"
1822 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# Test 15c body ("LFSCK can repair unmatched MDT-object/OST-object pairs
# (3)"): races a layout LFSCK against a delayed directory migration and
# expects one unmatched-pair repair and zero multiple-reference repairs.
# NOTE(review): the function header and any lines after the backgrounded
# migrate (e.g. a sleep) are not visible here.
# Needs at least two MDTs, and is skipped on MDS versions >= 2.7.55.
1825 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
1827 [ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.55) ] &&
1828 skip "Skip the test after 2.7.55 see LU-6437" && return
1831 echo "According to current metadata migration implementation,"
1832 echo "before the old MDT-object is removed, both the new MDT-object"
1833 echo "and old MDT-object will reference the same LOV layout. Then if"
1834 echo "the layout LFSCK finds the new MDT-object by race, it will"
1835 echo "regard related OST-object(s) as multiple referenced case, and"
1836 echo "will try to create new OST-object(s) for the new MDT-object."
1837 echo "To avoid such trouble, the layout LFSCK needs to lock the old"
1838 echo "MDT-object before confirm the multiple referenced case."
# Directory a1 is created on MDT index 1 with one 1M file inside.
1841 check_mount_and_prep
1842 $LFS mkdir -i 1 $DIR/$tdir/a1
1843 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1844 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=1
1845 cancel_lru_locks osc
1847 echo "Inject failure stub on MDT1 to delay the migration"
1849 #define OBD_FAIL_MIGRATE_DELAY 0x1803
1850 do_facet mds2 $LCTL set_param fail_val=5 fail_loc=0x1803
1851 echo "Migrate $DIR/$tdir/a1 from MDT1 to MDT0 with delay"
# The migration runs in the background so the LFSCK below can overlap it.
1852 $LFS migrate -m 0 $DIR/$tdir/a1 &
1855 echo "Trigger layout LFSCK to race with the migration"
1856 $START_LAYOUT -A -r || error "(1) Fail to start layout LFSCK!"
1858 wait_all_targets_blocked layout completed 2
1860 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
1861 local repaired=$($SHOW_LAYOUT |
1862 awk '/^repaired_unmatched_pair/ { print $2 }')
1863 [ $repaired -eq 1 ] ||
1864 error "(3) Fail to repair unmatched pair: $repaired"
# No multiple-reference repair may have been made.
1866 repaired=$($SHOW_LAYOUT |
1867 awk '/^repaired_multiple_referenced/ { print $2 }')
1868 [ $repaired -eq 0 ] ||
1869 error "(4) Unexpectedly repaird multiple references: $repaired"
1871 run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)"
# Test 16 body ("LFSCK can repair inconsistent MDT-object/OST-object
# owner"): changes a file's owner while fail_loc 0x1613 is armed on the
# MDS, runs a layout LFSCK and expects repaired_inconsistent_owner == 1.
# NOTE(review): the function header, the creation of $DIR/$tdir/d1 (line
# 1886) and the lines between 1896 and 1899 are not visible here.
1875 echo "If the OST-object's owner information does not match the owner"
1876 echo "information stored in the MDT-object, then the LFSCK trust the"
1877 echo "MDT-object and update the OST-object's owner information."
1880 check_mount_and_prep
1881 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1882 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1883 cancel_lru_locks osc
1885 # created but no setattr or write to the file.
# 100 files created as the $RUNAS user inside d1; they are never written.
1887 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/d1
1888 $RUNAS createmany -o $DIR/$tdir/d1/o 100 || error "create failed"
1890 echo "Inject failure stub to skip OST-object owner changing"
1891 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
1892 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
1893 chown 1.1 $DIR/$tdir/f0
1894 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1896 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
1899 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1901 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1902 mdd.${MDT_DEV}.lfsck_layout |
1903 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1905 error "(2) unexpected status"
# Exactly one owner repair is expected (f0).
1908 local repaired=$($SHOW_LAYOUT |
1909 awk '/^repaired_inconsistent_owner/ { print $2 }')
1910 [ $repaired -eq 1 ] ||
1911 error "(3) Fail to repair inconsistent owner: $repaired"
1913 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# Test 17 body ("LFSCK can repair multiple references"): with fail_loc
# 0x1614 armed on the MDS, creates f0 and f1 so that they share guard's
# OST-objects, runs a layout LFSCK, expects two multiple-reference repairs,
# then checks that writing f0/f1 no longer changes guard's size.
# NOTE(review): the function header, closing brace, the end of the first
# echo sentence (lines 1920-1921) and the continuation line naming the
# setstripe target (between 1939 and 1941) are not visible here.
1917 echo "If more than one MDT-objects reference the same OST-object,"
1918 echo "and the OST-object only recognizes one MDT-object, then the"
1919 echo "LFSCK should create new OST-objects for such non-recognized"
1923 check_mount_and_prep
1924 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1926 echo "Inject failure stub to make two MDT-objects to refernce"
1927 echo "the OST-object"
1929 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
1930 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
# guard is written with 1M of data; f0 and f1 are created afterwards with
# lock caches dropped between steps.
1931 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1932 cancel_lru_locks mdc
1933 cancel_lru_locks osc
1935 createmany -o $DIR/$tdir/f 1
1936 cancel_lru_locks mdc
1937 cancel_lru_locks osc
1939 $LFS setstripe -E 2M -S 256K -o 0 -E 4M -S 512K -o 0 \
1941 error "(0) Fail to create PFL $DIR/$tdir/f1"
1942 cancel_lru_locks mdc
1943 cancel_lru_locks osc
1944 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1946 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
1947 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard use the same OST-objects"
# Before repair, f0 and f1 both report guard's 1M size although nothing was
# written to them.
1948 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
1949 [ $size -eq 1048576 ] ||
1950 error "(1.1) f0 (wrong) size should be 1048576, but got $size"
1952 size=$(ls -l $DIR/$tdir/f1 | awk '{ print $5 }')
1953 [ $size -eq 1048576 ] ||
1954 error "(1.2) f1 (wrong) size should be 1048576, but got $size"
1956 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
1959 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1961 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1962 mdd.${MDT_DEV}.lfsck_layout |
1963 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1965 error "(3) unexpected status"
1968 local repaired=$($SHOW_LAYOUT |
1969 awk '/^repaired_multiple_referenced/ { print $2 }')
1970 [ $repaired -eq 2 ] ||
1971 error "(4) Fail to repair multiple references: $repaired"
1973 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
# After repair, writing 2M to f0 (then f1) must leave guard at 1M.
1974 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
1975 error "(5) Fail to write f0."
1976 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1977 [ $size -eq 1048576 ] ||
1978 error "(6) guard size should be 1048576, but got $size"
1980 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard should use diff OST-objects"
1981 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2 ||
1982 error "(7) Fail to write f1."
1983 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1984 [ $size -eq 1048576 ] ||
1985 error "(8) guard size should be 1048576, but got $size"
1987 run_test 17 "LFSCK can repair multiple references"
# Top-level statement between tests: add "cache" to the lctl debug setting
# on the node running this script; the command's output is discarded.
# NOTE(review): presumably wanted for the tests that follow - confirm.
1989 $LCTL set_param debug=+cache > /dev/null
1993 echo "The target MDT-object is there, but related stripe information"
1994 echo "is lost or partly lost. The LFSCK should regenerate the missing"
1995 echo "layout EA entries."
1998 check_mount_and_prep
1999 $LFS mkdir -i 0 $DIR/$tdir/a1
2000 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2001 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2003 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2005 $LFS path2fid $DIR/$tdir/a1/f1
2006 $LFS getstripe $DIR/$tdir/a1/f1
2008 if [ $MDSCOUNT -ge 2 ]; then
2009 $LFS mkdir -i 1 $DIR/$tdir/a2
2010 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2011 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2012 $LFS path2fid $DIR/$tdir/a2/f2
2013 $LFS getstripe $DIR/$tdir/a2/f2
2016 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2017 error "(0) Fail to create PFL $DIR/$tdir/f3"
2019 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2021 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2023 $LFS path2fid $DIR/$tdir/f3
2024 $LFS getstripe $DIR/$tdir/f3
2026 cancel_lru_locks osc
2028 echo "Inject failure, to make the MDT-object lost its layout EA"
2029 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
2030 do_facet mds1 $LCTL set_param fail_loc=0x1615
2031 chown 1.1 $DIR/$tdir/a1/f1
2033 if [ $MDSCOUNT -ge 2 ]; then
2034 do_facet mds2 $LCTL set_param fail_loc=0x1615
2035 chown 1.1 $DIR/$tdir/a2/f2
# Use the POSIX owner:group separator; the '.' form is a deprecated
# GNU extension that newer coreutils warn about.
chown 1:1 $DIR/$tdir/f3
2043 do_facet mds1 $LCTL set_param fail_loc=0
2044 if [ $MDSCOUNT -ge 2 ]; then
2045 do_facet mds2 $LCTL set_param fail_loc=0
2048 cancel_lru_locks mdc
2049 cancel_lru_locks osc
2051 echo "The file size should be incorrect since layout EA is lost"
2052 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2053 [ "$cur_size" != "$saved_size1" ] ||
2054 error "(1) Expect incorrect file1 size"
2056 if [ $MDSCOUNT -ge 2 ]; then
2057 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2058 [ "$cur_size" != "$saved_size1" ] ||
2059 error "(2) Expect incorrect file2 size"
2062 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2063 [ "$cur_size" != "$saved_size2" ] ||
2064 error "(1.2) Expect incorrect file3 size"
2066 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2067 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
2069 for k in $(seq $MDSCOUNT); do
# The LFSCK status query interval is 30 seconds. In case some
# LFSCK_NOTIFY RPCs fail or are lost, wait long enough to
# guarantee that the status is synced up.
2073 wait_update_facet mds${k} "$LCTL get_param -n \
2074 mdd.$(facet_svc mds${k}).lfsck_layout |
2075 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2076 error "(4) MDS${k} is not the expected 'completed'"
2079 for k in $(seq $OSTCOUNT); do
2080 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2081 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2082 awk '/^status/ { print $2 }')
2083 [ "$cur_status" == "completed" ] ||
2084 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2087 local repaired=$(do_facet mds1 $LCTL get_param -n \
2088 mdd.$(facet_svc mds1).lfsck_layout |
2089 awk '/^repaired_orphan/ { print $2 }')
2090 [ $repaired -eq 3 ] ||
2091 error "(6.1) Expect 3 fixed on mds1, but got: $repaired"
2093 if [ $MDSCOUNT -ge 2 ]; then
2094 repaired=$(do_facet mds2 $LCTL get_param -n \
2095 mdd.$(facet_svc mds2).lfsck_layout |
2096 awk '/^repaired_orphan/ { print $2 }')
2097 [ $repaired -eq 2 ] ||
2098 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
2101 $LFS path2fid $DIR/$tdir/a1/f1
2102 $LFS getstripe $DIR/$tdir/a1/f1
2104 if [ $MDSCOUNT -ge 2 ]; then
2105 $LFS path2fid $DIR/$tdir/a2/f2
2106 $LFS getstripe $DIR/$tdir/a2/f2
2109 $LFS path2fid $DIR/$tdir/f3
2110 $LFS getstripe $DIR/$tdir/f3
2112 echo "The file size should be correct after layout LFSCK scanning"
2113 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2114 [ "$cur_size" == "$saved_size1" ] ||
2115 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2117 if [ $MDSCOUNT -ge 2 ]; then
2118 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2119 [ "$cur_size" == "$saved_size1" ] ||
2120 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# Compare f3's current size with saved_size2; the failure message
# names file3 so it matches the file actually being checked.
cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
[ "$cur_size" == "$saved_size2" ] ||
	error "(9) Expect file3 size $saved_size2, but got $cur_size"
2127 run_test 18a "Find out orphan OST-object and repair it (1)"
2130 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2133 echo "The target MDT-object is lost. The LFSCK should re-create the"
2134 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
2135 echo "can move it back to normal namespace manually."
2138 check_mount_and_prep
2139 $LFS mkdir -i 0 $DIR/$tdir/a1
2140 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2141 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2142 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2143 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2145 $LFS getstripe $DIR/$tdir/a1/f1
2147 if [ $MDSCOUNT -ge 2 ]; then
2148 $LFS mkdir -i 1 $DIR/$tdir/a2
2149 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2150 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2151 fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
2153 $LFS getstripe $DIR/$tdir/a2/f2
2156 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2157 error "(0) Fail to create PFL $DIR/$tdir/f3"
2159 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2161 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2162 local fid3=$($LFS path2fid $DIR/$tdir/f3)
2164 $LFS getstripe $DIR/$tdir/f3
2166 cancel_lru_locks osc
2168 echo "Inject failure, to simulate the case of missing the MDT-object"
2169 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2170 do_facet mds1 $LCTL set_param fail_loc=0x1616
2171 rm -f $DIR/$tdir/a1/f1
2173 if [ $MDSCOUNT -ge 2 ]; then
2174 do_facet mds2 $LCTL set_param fail_loc=0x1616
2175 rm -f $DIR/$tdir/a2/f2
2183 do_facet mds1 $LCTL set_param fail_loc=0
2184 if [ $MDSCOUNT -ge 2 ]; then
2185 do_facet mds2 $LCTL set_param fail_loc=0
2188 cancel_lru_locks mdc
2189 cancel_lru_locks osc
2191 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2192 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2194 for k in $(seq $MDSCOUNT); do
# The LFSCK status query interval is 30 seconds. In case some
# LFSCK_NOTIFY RPCs fail or are lost, wait long enough to
# guarantee that the status is synced up.
2198 wait_update_facet mds${k} "$LCTL get_param -n \
2199 mdd.$(facet_svc mds${k}).lfsck_layout |
2200 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2201 error "(2) MDS${k} is not the expected 'completed'"
2204 for k in $(seq $OSTCOUNT); do
2205 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2206 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2207 awk '/^status/ { print $2 }')
2208 [ "$cur_status" == "completed" ] ||
2209 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
2212 local repaired=$(do_facet mds1 $LCTL get_param -n \
2213 mdd.$(facet_svc mds1).lfsck_layout |
2214 awk '/^repaired_orphan/ { print $2 }')
2215 [ $repaired -eq 3 ] ||
2216 error "(4.1) Expect 3 fixed on mds1, but got: $repaired"
2218 if [ $MDSCOUNT -ge 2 ]; then
2219 repaired=$(do_facet mds2 $LCTL get_param -n \
2220 mdd.$(facet_svc mds2).lfsck_layout |
2221 awk '/^repaired_orphan/ { print $2 }')
2222 [ $repaired -eq 2 ] ||
2223 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
2226 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2227 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2228 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2230 if [ $MDSCOUNT -ge 2 ]; then
2231 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
2232 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
# Move the recovered f3 stub back into the namespace. The failure gets
# its own label so it cannot be confused with the "(5)" used for f1.
mv $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0 $DIR/$tdir/f3 ||
	error "(6.1) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
2238 $LFS path2fid $DIR/$tdir/a1/f1
2239 $LFS getstripe $DIR/$tdir/a1/f1
2241 if [ $MDSCOUNT -ge 2 ]; then
2242 $LFS path2fid $DIR/$tdir/a2/f2
2243 $LFS getstripe $DIR/$tdir/a2/f2
2246 $LFS path2fid $DIR/$tdir/f3
2247 $LFS getstripe $DIR/$tdir/f3
2249 echo "The file size should be correct after layout LFSCK scanning"
2250 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2251 [ "$cur_size" == "$saved_size1" ] ||
2252 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2254 if [ $MDSCOUNT -ge 2 ]; then
2255 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2256 [ "$cur_size" == "$saved_size1" ] ||
2257 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# Compare f3's current size with saved_size2; the failure message
# names file3 so it matches the file actually being checked.
cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
[ "$cur_size" == "$saved_size2" ] ||
	error "(9) Expect file3 size $saved_size2, but got $cur_size"
2264 run_test 18b "Find out orphan OST-object and repair it (2)"
2267 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2270 echo "The target MDT-object is lost, and the OST-object FID is missing."
2271 echo "The LFSCK should re-create the MDT-object with new FID under the "
2272 echo "directory .lustre/lost+found/MDTxxxx."
2275 check_mount_and_prep
2276 $LFS mkdir -i 0 $DIR/$tdir/a1
2277 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2279 echo "Inject failure, to simulate the case of missing parent FID"
2280 #define OBD_FAIL_LFSCK_NOPFID 0x1617
2281 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1617
2283 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2284 $LFS getstripe $DIR/$tdir/a1/f1
2286 if [ $MDSCOUNT -ge 2 ]; then
2287 $LFS mkdir -i 1 $DIR/$tdir/a2
2288 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a2
2289 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2290 $LFS getstripe $DIR/$tdir/a2/f2
2293 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2294 error "(0) Fail to create PFL $DIR/$tdir/f3"
2296 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2297 $LFS getstripe $DIR/$tdir/f3
2299 cancel_lru_locks osc
2300 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
2302 echo "Inject failure, to simulate the case of missing the MDT-object"
2303 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2304 do_facet mds1 $LCTL set_param fail_loc=0x1616
2305 rm -f $DIR/$tdir/a1/f1
2307 if [ $MDSCOUNT -ge 2 ]; then
2308 do_facet mds2 $LCTL set_param fail_loc=0x1616
2309 rm -f $DIR/$tdir/a2/f2
2317 do_facet mds1 $LCTL set_param fail_loc=0
2318 if [ $MDSCOUNT -ge 2 ]; then
2319 do_facet mds2 $LCTL set_param fail_loc=0
2322 cancel_lru_locks mdc
2323 cancel_lru_locks osc
2325 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2326 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2328 for k in $(seq $MDSCOUNT); do
# The LFSCK status query interval is 30 seconds. In case some
# LFSCK_NOTIFY RPCs fail or are lost, wait long enough to
# guarantee that the status is synced up.
2332 wait_update_facet mds${k} "$LCTL get_param -n \
2333 mdd.$(facet_svc mds${k}).lfsck_layout |
2334 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2335 error "(2) MDS${k} is not the expected 'completed'"
2338 for k in $(seq $OSTCOUNT); do
2339 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2340 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2341 awk '/^status/ { print $2 }')
2342 [ "$cur_status" == "completed" ] ||
2343 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
2346 if [ $MDSCOUNT -ge 2 ]; then
2352 local repaired=$(do_facet mds1 $LCTL get_param -n \
2353 mdd.$(facet_svc mds1).lfsck_layout |
2354 awk '/^repaired_orphan/ { print $2 }')
2355 [ $repaired -eq $expected ] ||
2356 error "(4) Expect $expected fixed on mds1, but got: $repaired"
2358 if [ $MDSCOUNT -ge 2 ]; then
2359 repaired=$(do_facet mds2 $LCTL get_param -n \
2360 mdd.$(facet_svc mds2).lfsck_layout |
2361 awk '/^repaired_orphan/ { print $2 }')
2362 [ $repaired -eq 0 ] ||
2363 error "(5) Expect 0 fixed on mds2, but got: $repaired"
2366 ls -ail $MOUNT/.lustre/lost+found/
2368 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
2369 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
# Quote the -name pattern so the shell cannot expand it against the
# current directory before find sees it; keep cname function-local.
local cname
cname=$(find "$MOUNT/.lustre/lost+found/MDT0001/" -name '*-N-*')
2372 error "(6) .lustre/lost+found/MDT0001/ should be empty"
2375 echo "There should be some stub under .lustre/lost+found/MDT0000/"
2376 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2377 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# Look for "-N-" stubs under MDT0000. The pattern is quoted so the
# shell cannot expand it against the current directory before find
# sees it; cname is kept function-local.
local cname
cname=$(find "$MOUNT/.lustre/lost+found/MDT0000/" -name '*-N-*')
[ -n "$cname" ] ||
	error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2383 run_test 18c "Find out orphan OST-object and repair it (3)"
2387 echo "The target MDT-object layout EA is corrupted, but the right"
2388 echo "OST-object is still alive as orphan. The layout LFSCK will"
2389 echo "not create new OST-object to occupy such slot."
2392 check_mount_and_prep
2394 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2395 echo "guard" > $DIR/$tdir/a1/f1
2396 echo "foo" > $DIR/$tdir/a1/f2
2398 echo "guard" > $DIR/$tdir/a1/f3
2399 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2400 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2401 echo "foo" > $DIR/$tdir/a1/f4
2403 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2404 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2405 $LFS path2fid $DIR/$tdir/a1/f1
2406 $LFS getstripe $DIR/$tdir/a1/f1
2407 $LFS path2fid $DIR/$tdir/a1/f2
2408 $LFS getstripe $DIR/$tdir/a1/f2
2409 $LFS path2fid $DIR/$tdir/a1/f3
2410 $LFS getstripe $DIR/$tdir/a1/f3
2411 $LFS path2fid $DIR/$tdir/a1/f4
2412 $LFS getstripe $DIR/$tdir/a1/f4
2413 cancel_lru_locks osc
2415 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2416 echo "to reference the same OST-object (which is f1's OST-obejct)."
2417 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2418 echo "dangling reference case, but f2's old OST-object is there."
2420 echo "The failure also makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2421 echo "to reference the same OST-object (which is f3's OST-obejct)."
2422 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2423 echo "dangling reference case, but f4's old OST-object is there."
2426 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2427 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
# Use the POSIX owner:group separator; the '.' form is a deprecated
# GNU extension that newer coreutils warn about.
chown 1:1 $DIR/$tdir/a1/f2
chown 1:1 $DIR/$tdir/a1/f4
2430 rm -f $DIR/$tdir/a1/f1
2431 rm -f $DIR/$tdir/a1/f3
2434 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2436 echo "stopall to cleanup object cache"
2439 setupall > /dev/null
2441 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2442 $START_LAYOUT -r -o -c -d || error "(2) Fail to start LFSCK for layout!"
2444 for k in $(seq $MDSCOUNT); do
# The LFSCK status query interval is 30 seconds. In case some
# LFSCK_NOTIFY RPCs fail or are lost, wait long enough to
# guarantee that the status is synced up.
2448 wait_update_facet mds${k} "$LCTL get_param -n \
2449 mdd.$(facet_svc mds${k}).lfsck_layout |
2450 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2451 error "(3) MDS${k} is not the expected 'completed'"
2454 for k in $(seq $OSTCOUNT); do
2455 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2456 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2457 awk '/^status/ { print $2 }')
2458 [ "$cur_status" == "completed" ] ||
2459 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
2462 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2463 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2464 awk '/^repaired_orphan/ { print $2 }')
2465 [ $repaired -eq 2 ] ||
2466 error "(5) Expect 2 orphans have been fixed, but got: $repaired"
2468 repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2469 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2470 awk '/^repaired_dangling/ { print $2 }')
2471 [ $repaired -eq 0 ] ||
2472 error "(6) Expect 0 dangling has been fixed, but got: $repaired"
2474 echo "The file size should be correct after layout LFSCK scanning"
2475 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2476 [ "$cur_size" == "$saved_size1" ] ||
2477 error "(7) Expect file2 size $saved_size1, but got $cur_size"
2479 cur_size=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2480 [ "$cur_size" == "$saved_size2" ] ||
2481 error "(8) Expect file4 size $saved_size2, but got $cur_size"
2483 echo "The LFSCK should find back the original data."
2484 cat $DIR/$tdir/a1/f2
2485 $LFS path2fid $DIR/$tdir/a1/f2
2486 $LFS getstripe $DIR/$tdir/a1/f2
2487 cat $DIR/$tdir/a1/f4
2488 $LFS path2fid $DIR/$tdir/a1/f4
2489 $LFS getstripe $DIR/$tdir/a1/f4
2491 run_test 18d "Find out orphan OST-object and repair it (4)"
2494 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2497 echo "The target MDT-object layout EA slot is occpuied by some new"
2498 echo "created OST-object when repair dangling reference case. Such"
2499 echo "conflict OST-object has been modified by others. To keep the"
2500 echo "new data, the LFSCK will create a new file to refernece this"
2501 echo "old orphan OST-object."
2504 check_mount_and_prep
2506 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2507 echo "guard" > $DIR/$tdir/a1/f1
2508 echo "foo" > $DIR/$tdir/a1/f2
2510 echo "guard" > $DIR/$tdir/a1/f3
2511 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2512 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2513 echo "foo" > $DIR/$tdir/a1/f4
2515 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2516 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2518 $LFS path2fid $DIR/$tdir/a1/f1
2519 $LFS getstripe $DIR/$tdir/a1/f1
2520 $LFS path2fid $DIR/$tdir/a1/f2
2521 $LFS getstripe $DIR/$tdir/a1/f2
2522 $LFS path2fid $DIR/$tdir/a1/f3
2523 $LFS getstripe $DIR/$tdir/a1/f3
2524 $LFS path2fid $DIR/$tdir/a1/f4
2525 $LFS getstripe $DIR/$tdir/a1/f4
2526 cancel_lru_locks osc
2528 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2529 echo "to reference the same OST-object (which is f1's OST-obejct)."
2530 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2531 echo "dangling reference case, but f2's old OST-object is there."
2533 echo "Also the failure makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2534 echo "to reference the same OST-object (which is f3's OST-obejct)."
2535 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2536 echo "dangling reference case, but f4's old OST-object is there."
2539 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2540 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
# Use the POSIX owner:group separator; the '.' form is a deprecated
# GNU extension that newer coreutils warn about.
chown 1:1 $DIR/$tdir/a1/f2
chown 1:1 $DIR/$tdir/a1/f4
2543 rm -f $DIR/$tdir/a1/f1
2544 rm -f $DIR/$tdir/a1/f3
2547 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2549 echo "stopall to cleanup object cache"
2552 setupall > /dev/null
2554 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2555 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
2557 start_full_debug_logging
2559 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2560 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
2562 wait_update_facet mds1 "$LCTL get_param -n \
2563 mdd.$(facet_svc mds1).lfsck_layout |
2564 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2565 error "(3) MDS1 is not the expected 'scanning-phase2'"
2567 # to guarantee all updates are synced.
2571 echo "Write new data to f2/f4 to modify the new created OST-object."
2572 echo "dummy" >> $DIR/$tdir/a1/f2 || error "write a1/f2 failed"
2573 echo "dummy" >> $DIR/$tdir/a1/f4 || error "write a1/f4 failed"
2575 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
2577 for k in $(seq $MDSCOUNT); do
# The LFSCK status query interval is 30 seconds. In case some
# LFSCK_NOTIFY RPCs fail or are lost, wait long enough to
# guarantee that the status is synced up.
2581 wait_update_facet mds${k} "$LCTL get_param -n \
2582 mdd.$(facet_svc mds${k}).lfsck_layout |
2583 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2584 error "(4) MDS${k} is not the expected 'completed'"
2587 for k in $(seq $OSTCOUNT); do
2588 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2589 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2590 awk '/^status/ { print $2 }')
2591 [ "$cur_status" == "completed" ] ||
2592 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2595 stop_full_debug_logging
2597 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2598 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2599 awk '/^repaired_orphan/ { print $2 }')
2600 [ $repaired -eq 2 ] ||
2601 error "(6) Expect 2 orphans have been fixed, but got: $repaired"
2603 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2604 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2605 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
2607 local count=$(ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-* | wc -l)
2608 if [ $count -ne 2 ]; then
2609 ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-*
2610 error "(8) Expect 2 stubs under lost+found, but got $count"
2613 echo "The stub file should keep the original f2 or f4 data"
# Check the first "-C-" stub: its size must equal one of the saved
# sizes. The -name pattern is quoted so the shell cannot expand it
# before find sees it, and $cname is quoted so an empty result makes
# ls fail instead of listing the current directory.
local cname
cname=$(find "$MOUNT/.lustre/lost+found/MDT0000/" -name '*-C-*' | head -n 1)
local cur_size=$(ls -il "$cname" | awk '{ print $6 }')
[ "$cur_size" != "$saved_size1" ] && [ "$cur_size" != "$saved_size2" ] &&
	error "(9) Got unexpected $cur_size"
2620 $LFS path2fid $cname
2621 $LFS getstripe $cname
# Check the last "-C-" stub the same way. The -name pattern is quoted
# so the shell cannot expand it before find sees it, and $cname is
# quoted so an empty result makes ls fail instead of listing the
# current directory.
cname=$(find "$MOUNT/.lustre/lost+found/MDT0000/" -name '*-C-*' | tail -n 1)
cur_size=$(ls -il "$cname" | awk '{ print $6 }')
[ "$cur_size" != "$saved_size1" ] && [ "$cur_size" != "$saved_size2" ] &&
	error "(10) Got unexpected $cur_size"
2629 $LFS path2fid $cname
2630 $LFS getstripe $cname
2632 echo "The f2/f4 should contains new data."
2633 cat $DIR/$tdir/a1/f2
2634 $LFS path2fid $DIR/$tdir/a1/f2
2635 $LFS getstripe $DIR/$tdir/a1/f2
2636 cat $DIR/$tdir/a1/f4
2637 $LFS path2fid $DIR/$tdir/a1/f4
2638 $LFS getstripe $DIR/$tdir/a1/f4
2640 run_test 18e "Find out orphan OST-object and repair it (5)"
2643 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
2646 echo "The target MDT-object is lost. The LFSCK should re-create the"
2647 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2648 echo "to verify some OST-object(s) during the first stage-scanning,"
2649 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2650 echo "should not be affected."
2653 check_mount_and_prep
2654 $LFS mkdir -i 0 $DIR/$tdir/a1
2655 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2656 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2657 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2658 $LFS mkdir -i 0 $DIR/$tdir/a2
2659 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
2660 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2661 $LFS getstripe $DIR/$tdir/a1/f1
2662 $LFS getstripe $DIR/$tdir/a2/f2
2664 if [ $MDSCOUNT -ge 2 ]; then
2665 $LFS mkdir -i 1 $DIR/$tdir/a3
2666 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a3
2667 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2668 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2669 $LFS mkdir -i 1 $DIR/$tdir/a4
2670 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
2671 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2672 $LFS getstripe $DIR/$tdir/a3/f3
2673 $LFS getstripe $DIR/$tdir/a4/f4
2676 cancel_lru_locks osc
2678 echo "Inject failure, to simulate the case of missing the MDT-object"
2679 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2680 do_facet mds1 $LCTL set_param fail_loc=0x1616
2681 rm -f $DIR/$tdir/a1/f1
2682 rm -f $DIR/$tdir/a2/f2
2684 if [ $MDSCOUNT -ge 2 ]; then
2685 do_facet mds2 $LCTL set_param fail_loc=0x1616
2686 rm -f $DIR/$tdir/a3/f3
2687 rm -f $DIR/$tdir/a4/f4
2693 do_facet mds1 $LCTL set_param fail_loc=0
2694 if [ $MDSCOUNT -ge 2 ]; then
2695 do_facet mds2 $LCTL set_param fail_loc=0
2698 cancel_lru_locks mdc
2699 cancel_lru_locks osc
2701 echo "Inject failure, to simulate the OST0 fail to handle"
2702 echo "MDT0 LFSCK request during the first-stage scanning."
2703 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
2704 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
2706 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2707 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2709 for k in $(seq $MDSCOUNT); do
# The LFSCK status query interval is 30 seconds. In case some
# LFSCK_NOTIFY RPCs fail or are lost, wait long enough to
# guarantee that the status is synced up.
2713 wait_update_facet mds${k} "$LCTL get_param -n \
2714 mdd.$(facet_svc mds${k}).lfsck_layout |
2715 awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
2716 error "(2) MDS${k} is not the expected 'partial'"
2719 wait_update_facet ost1 "$LCTL get_param -n \
2720 obdfilter.$(facet_svc ost1).lfsck_layout |
2721 awk '/^status/ { print \\\$2 }'" "partial" $LTIME || {
2722 error "(3) OST1 is not the expected 'partial'"
2725 wait_update_facet ost2 "$LCTL get_param -n \
2726 obdfilter.$(facet_svc ost2).lfsck_layout |
2727 awk '/^status/ { print \\\$2 }'" "completed" $LTIME || {
2728 error "(4) OST2 is not the expected 'completed'"
2731 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
2733 local repaired=$(do_facet mds1 $LCTL get_param -n \
2734 mdd.$(facet_svc mds1).lfsck_layout |
2735 awk '/^repaired_orphan/ { print $2 }')
2736 [ $repaired -eq 1 ] ||
2737 error "(5) Expect 1 fixed on mds{1}, but got: $repaired"
2739 if [ $MDSCOUNT -ge 2 ]; then
2740 repaired=$(do_facet mds2 $LCTL get_param -n \
2741 mdd.$(facet_svc mds2).lfsck_layout |
2742 awk '/^repaired_orphan/ { print $2 }')
2743 [ $repaired -eq 1 ] ||
2744 error "(6) Expect 1 fixed on mds{2}, but got: $repaired"
2747 echo "Trigger layout LFSCK on all devices again to cleanup"
2748 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
2750 for k in $(seq $MDSCOUNT); do
# The LFSCK status query interval is 30 seconds. In case some
# LFSCK_NOTIFY RPCs fail or are lost, wait long enough to
# guarantee that the status is synced up.
2754 wait_update_facet mds${k} "$LCTL get_param -n \
2755 mdd.$(facet_svc mds${k}).lfsck_layout |
2756 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2757 error "(8) MDS${k} is not the expected 'completed'"
2760 for k in $(seq $OSTCOUNT); do
# Read the layout LFSCK status of ost${k} and require 'completed'.
# cur_status is declared local so it does not leak into the global
# scope, matching the other OST status loops in this file.
local cur_status=$(do_facet ost${k} $LCTL get_param -n \
	obdfilter.$(facet_svc ost${k}).lfsck_layout |
	awk '/^status/ { print $2 }')
[ "$cur_status" == "completed" ] ||
	error "(9) OST${k} Expect 'completed', but got '$cur_status'"
2769 local repaired=$(do_facet mds1 $LCTL get_param -n \
2770 mdd.$(facet_svc mds1).lfsck_layout |
2771 awk '/^repaired_orphan/ { print $2 }')
2772 [ $repaired -eq 2 ] ||
2773 error "(10) Expect 2 fixed on mds{1}, but got: $repaired"
2775 if [ $MDSCOUNT -ge 2 ]; then
2776 repaired=$(do_facet mds2 $LCTL get_param -n \
2777 mdd.$(facet_svc mds2).lfsck_layout |
2778 awk '/^repaired_orphan/ { print $2 }')
2779 [ $repaired -eq 2 ] ||
2780 error "(11) Expect 2 fixed on mds{2}, but got: $repaired"
2783 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
2786 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
2789 echo "The target MDT-object is lost, but related OI mapping is there"
2790 echo "The LFSCK should recreate the lost MDT-object without affected"
2791 echo "by the stale OI mapping."
2794 check_mount_and_prep
2795 $LFS mkdir -i 0 $DIR/$tdir/a1
2796 $LFS setstripe -c -1 -i 0 -S 1M $DIR/$tdir/a1
2797 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=$OSTCOUNT
2798 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2800 $LFS getstripe $DIR/$tdir/a1/f1
2801 cancel_lru_locks osc
2803 echo "Inject failure to simulate lost MDT-object but keep OI mapping"
2804 #define OBD_FAIL_LFSCK_LOST_MDTOBJ2 0x162e
2805 do_facet mds1 $LCTL set_param fail_loc=0x162e
2806 rm -f $DIR/$tdir/a1/f1
2808 do_facet mds1 $LCTL set_param fail_loc=0
2809 cancel_lru_locks mdc
2810 cancel_lru_locks osc
2812 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2813 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2815 for k in $(seq $MDSCOUNT); do
# The LFSCK status query interval is 30 seconds. In case some
# LFSCK_NOTIFY RPCs fail or are lost, wait long enough to
# guarantee that the status is synced up.
2819 wait_update_facet mds${k} "$LCTL get_param -n \
2820 mdd.$(facet_svc mds${k}).lfsck_layout |
2821 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2822 error "(2) MDS${k} is not the expected 'completed'"
2825 for k in $(seq $OSTCOUNT); do
2826 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2827 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2828 awk '/^status/ { print $2 }')
2829 [ "$cur_status" == "completed" ] ||
2830 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
2833 local repaired=$(do_facet mds1 $LCTL get_param -n \
2834 mdd.$(facet_svc mds1).lfsck_layout |
2835 awk '/^repaired_orphan/ { print $2 }')
2836 [ $repaired -eq $OSTCOUNT ] ||
2837 error "(4) Expect $OSTCOUNT fixed, but got: $repaired"
2839 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2840 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2841 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2843 $LFS path2fid $DIR/$tdir/a1/f1
2844 $LFS getstripe $DIR/$tdir/a1/f1
2846 run_test 18g "Find out orphan OST-object and repair it (7)"
2850 echo "The PFL extent crashed. During the first cycle LFSCK scanning,"
2851 echo "the layout LFSCK will keep the bad PFL file(s) there without"
2852 echo "scanning its OST-object(s). Then in the second stage scanning,"
2853 echo "the OST will return related OST-object(s) to the MDT as orphan."
2854 echo "And then the LFSCK on the MDT can rebuild the PFL extent with"
2855 echo "the 'orphan(s)' stripe information."
2858 check_mount_and_prep
2860 $LFS setstripe -E 2M -S 1M -c 1 -E -1 $DIR/$tdir/f0 ||
2861 error "(0) Fail to create PFL $DIR/$tdir/f0"
2863 cat $LUSTRE/tests/test-framework.sh > $DIR/$tdir/f0 ||
2864 error "(1.1) Fail to write $DIR/$tdir/f0"
2866 dd if=$LUSTRE/tests/test-framework.sh of=$DIR/$tdir/f0 bs=1M seek=2 ||
2867 error "(1.2) Fail to write $DIR/$tdir/f0"
2869 cp $DIR/$tdir/f0 $DIR/$tdir/guard
2871 echo "Inject failure stub to simulate bad PFL extent range"
2872 #define OBD_FAIL_LFSCK_BAD_PFL_RANGE 0x162f
2873 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162f
# Use the POSIX owner:group separator; the '.' form is a deprecated
# GNU extension that newer coreutils warn about.
chown 1:1 $DIR/$tdir/f0
2877 cancel_lru_locks mdc
2878 cancel_lru_locks osc
2879 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2881 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 &&
2882 error "(2) Write to bad PFL file should fail"
2884 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
2885 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
2887 for k in $(seq $MDSCOUNT); do
# The LFSCK status query interval is 30 seconds. In case some
# LFSCK_NOTIFY RPCs fail or are lost, wait long enough to
# guarantee that the status is synced up.
2891 wait_update_facet mds${k} "$LCTL get_param -n \
2892 mdd.$(facet_svc mds${k}).lfsck_layout |
2893 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2894 error "(4.1) MDS${k} is not the expected 'completed'"
2897 for k in $(seq $OSTCOUNT); do
# Read the layout LFSCK status of ost${k} and require 'completed'.
# cur_status is declared local so it does not leak into the global
# scope, matching the other OST status loops in this file.
local cur_status=$(do_facet ost${k} $LCTL get_param -n \
	obdfilter.$(facet_svc ost${k}).lfsck_layout |
	awk '/^status/ { print $2 }')
[ "$cur_status" == "completed" ] ||
	error "(4.2) OST${k} Expect 'completed', but got '$cur_status'"
2906 local repaired=$($SHOW_LAYOUT |
2907 awk '/^repaired_orphan/ { print $2 }')
2908 [ $repaired -eq 2 ] ||
2909 error "(5) Fail to repair crashed PFL range: $repaired"
2911 echo "Data in $DIR/$tdir/f0 should not be broken"
2912 diff $DIR/$tdir/f0 $DIR/$tdir/guard ||
2913 error "(6) Data in $DIR/$tdir/f0 is broken"
2915 echo "Write should succeed after LFSCK repairing the bad PFL range"
2916 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 ||
2917 error "(7) Write should succeed after LFSCK"
2919 run_test 18h "LFSCK can repair crashed PFL extent range"
2921 $LCTL set_param debug=-cache > /dev/null
2924 check_mount_and_prep
2925 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2927 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2928 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
2930 echo "foo1" > $DIR/$tdir/a0
2931 $LFS setstripe -E 512K -S 512K -o 0 -E -1 -S 1M $DIR/$tdir/a1 ||
2932 error "(0) Fail to create PFL $DIR/$tdir/a1"
2933 echo "foo2" > $DIR/$tdir/a1
2934 echo "guard" > $DIR/$tdir/a2
2935 cancel_lru_locks osc
2937 echo "Inject failure, then client will offer wrong parent FID when read"
2938 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2939 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2941 #define OBD_FAIL_LFSCK_INVALID_PFID 0x1619
2942 $LCTL set_param fail_loc=0x1619
2944 echo "Read RPC with wrong parent FID should be denied"
2945 cat $DIR/$tdir/a0 && error "(3.1) Read a0 should be denied!"
2946 cat $DIR/$tdir/a1 && error "(3.2) Read a1 should be denied!"
2947 $LCTL set_param fail_loc=0
2949 run_test 19a "OST-object inconsistency self detect"
# Test 19b body: OST-objects are created while fail_loc 0x1611 is set (per the
# echo below: they back-point to a non-existent MDT-object). Nothing must be
# repaired while lfsck_verify_pfid is 0; once it is 1, reading the two files
# must succeed and the "repaired" counter must reach 2.
2952 check_mount_and_prep
2953 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2955 echo "Inject failure stub to make the OST-object to back point to"
2956 echo "non-exist MDT-object"
# Keep self detection off while the inconsistent objects are created.
2958 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2959 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
2961 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
2962 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
# f0 is a plain file, f1 a two-component PFL file; both are written while the
# fail_loc is active on the OST nodes.
2963 echo "foo1" > $DIR/$tdir/f0
2964 $LFS setstripe -E 1M -S 1M -o 0 -E 4M -S 256K $DIR/$tdir/f1 ||
2965 error "(0) Fail to create PFL $DIR/$tdir/f1"
2966 echo "foo2" > $DIR/$tdir/f1
2967 cancel_lru_locks osc
2968 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
# Sets the same value (0) again, this time via do_facet ost1; the parameter
# is then read back and its "repaired" line is parsed.
2970 do_facet ost1 $LCTL set_param -n \
2971 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
2972 echo "Nothing should be fixed since self detect and repair is disabled"
2973 local repaired=$(do_facet ost1 $LCTL get_param -n \
2974 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2975 awk '/^repaired/ { print $2 }')
2976 [ $repaired -eq 0 ] ||
2977 error "(1) Expected 0 repaired, but got $repaired"
2979 echo "Read RPC with right parent FID should be accepted,"
2980 echo "and cause parent FID on OST to be fixed"
2982 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2983 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2985 cat $DIR/$tdir/f0 || error "(2.1) Read f0 should not be denied!"
2986 cat $DIR/$tdir/f1 || error "(2.2) Read f1 should not be denied!"
2988 repaired=$(do_facet ost1 $LCTL get_param -n \
2989 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2990 awk '/^repaired/ { print $2 }')
# Two files (f0 and f1) were read above, so two repairs are expected; the
# failure message now states the same number the check uses.
2991 [ $repaired -eq 2 ] ||
2992 error "(3) Expected 2 repaired, but got $repaired"
2994 run_test 19b "OST-object inconsistency self repair"
2996 PATTERN_WITH_HOLE="40000001"
2997 PATTERN_WITHOUT_HOLE="raid0"
# Skip guards and banner of test 20a. Both guards now stop the test after
# skip(); every other skip guard in this file is followed by "&& return" or
# "&& exit 0", and without it the FILESET case went on to run the test.
3000 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
3001 [ -n "$FILESET" ] && skip "Not functional for FILESET set" && return
3004 echo "The target MDT-object and some of its OST-object are lost."
3005 echo "The LFSCK should find out the left OST-objects and re-create"
3006 echo "the MDT-object under the directory .lustre/lost+found/MDTxxxx/"
3007 echo "with the partial OST-objects (LOV EA hole)."
3009 echo "New client can access the file with LOV EA hole via normal"
3010 echo "system tools or commands without crash the system."
3012 echo "For old client, even though it cannot access the file with"
3013 echo "LOV EA hole, it should not cause the system crash."
3016 check_mount_and_prep
3017 $LFS mkdir -i 0 $DIR/$tdir/a1
3018 if [ $OSTCOUNT -gt 2 ]; then
3019 $LFS setstripe -c 3 -i 0 -S 1M $DIR/$tdir/a1
3022 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a1
3026 # 256 blocks on the stripe0.
3027 # 1 block on the stripe1 for 2 OSTs case.
3028 # 256 blocks on the stripe1 for other cases.
3029 # 1 block on the stripe2 if OSTs > 2
3030 dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
3031 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
3032 dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount
3034 local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
3035 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
3036 local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)
3039 $LFS getstripe $DIR/$tdir/a1/f0
3041 $LFS getstripe $DIR/$tdir/a1/f1
3043 $LFS getstripe $DIR/$tdir/a1/f2
3045 if [ $OSTCOUNT -gt 2 ]; then
3046 dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
3047 fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
3049 $LFS getstripe $DIR/$tdir/a1/f3
3052 cancel_lru_locks osc
3054 echo "Inject failure..."
3055 echo "To simulate f0 lost MDT-object"
3056 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
3057 do_facet mds1 $LCTL set_param fail_loc=0x1616
3058 rm -f $DIR/$tdir/a1/f0
3060 echo "To simulate f1 lost MDT-object and OST-object0"
3061 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
3062 do_facet mds1 $LCTL set_param fail_loc=0x161a
3063 rm -f $DIR/$tdir/a1/f1
3065 echo "To simulate f2 lost MDT-object and OST-object1"
3066 do_facet mds1 $LCTL set_param fail_val=1
3067 rm -f $DIR/$tdir/a1/f2
3069 if [ $OSTCOUNT -gt 2 ]; then
3070 echo "To simulate f3 lost MDT-object and OST-object2"
3071 do_facet mds1 $LCTL set_param fail_val=2
3072 rm -f $DIR/$tdir/a1/f3
3075 umount_client $MOUNT
3078 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
3080 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3081 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
3083 for k in $(seq $MDSCOUNT); do
3084 # The LFSCK status query interval is 30 seconds. In case some
3085 # LFSCK_NOTIFY RPCs fail or get lost, wait long enough to
3086 # guarantee that the status has been synced up.
3087 wait_update_facet mds${k} "$LCTL get_param -n \
3088 mdd.$(facet_svc mds${k}).lfsck_layout |
3089 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3090 error "(2) MDS${k} is not the expected 'completed'"
3093 for k in $(seq $OSTCOUNT); do
3094 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3095 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3096 awk '/^status/ { print $2 }')
3097 [ "$cur_status" == "completed" ] ||
3098 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
3101 local repaired=$(do_facet mds1 $LCTL get_param -n \
3102 mdd.$(facet_svc mds1).lfsck_layout |
3103 awk '/^repaired_orphan/ { print $2 }')
3104 if [ $OSTCOUNT -gt 2 ]; then
3105 [ $repaired -eq 9 ] ||
3106 error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
3108 [ $repaired -eq 4 ] ||
3109 error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
3112 mount_client $MOUNT || error "(5.0) Fail to start client!"
3114 LOV_PATTERN_F_HOLE=0x40000000
3117 # ${fid0}-R-0 is the old f0
3119 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
3120 echo "Check $name, which is the old f0"
3122 $LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"
3124 local pattern=$($LFS getstripe -L $name)
3125 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3126 error "(5.2) NOT expect pattern flag hole, but got $pattern"
3128 local stripes=$($LFS getstripe -c $name)
3129 if [ $OSTCOUNT -gt 2 ]; then
3130 [ $stripes -eq 3 ] ||
3131 error "(5.3.1) expect the stripe count is 3, but got $stripes"
3133 [ $stripes -eq 2 ] ||
3134 error "(5.3.2) expect the stripe count is 2, but got $stripes"
3137 local size=$(stat $name | awk '/Size:/ { print $2 }')
3138 [ $size -eq $((4096 * $bcount)) ] ||
3139 error "(5.4) expect the size $((4096 * $bcount)), but got $size"
3141 cat $name > /dev/null || error "(5.5) cannot read $name"
3143 echo "dummy" >> $name || error "(5.6) cannot write $name"
3145 chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"
3147 touch $name || error "(5.8) cannot touch $name"
3149 rm -f $name || error "(5.9) cannot unlink $name"
3152 # ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
3154 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3155 if [ $OSTCOUNT -gt 2 ]; then
3156 echo "Check $name, it contains the old f1's stripe1 and stripe2"
3158 echo "Check $name, it contains the old f1's stripe1"
3161 $LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"
3163 pattern=$($LFS getstripe -L $name)
3164 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3165 error "(6.2) expect pattern flag hole, but got $pattern"
3167 stripes=$($LFS getstripe -c $name)
3168 if [ $OSTCOUNT -gt 2 ]; then
3169 [ $stripes -eq 3 ] ||
3170 error "(6.3.1) expect the stripe count is 3, but got $stripes"
3172 [ $stripes -eq 2 ] ||
3173 error "(6.3.2) expect the stripe count is 2, but got $stripes"
3176 size=$(stat $name | awk '/Size:/ { print $2 }')
3177 [ $size -eq $((4096 * $bcount)) ] ||
3178 error "(6.4) expect the size $((4096 * $bcount)), but got $size"
3180 cat $name > /dev/null && error "(6.5) normal read $name should fail"
3182 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3183 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3186 [ $failures -eq 256 ] ||
3187 error "(6.6) expect 256 IO failures, but get $failures"
3189 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3190 [ $size -eq $((4096 * $bcount)) ] ||
3191 error "(6.7) expect the size $((4096 * $bcount)), but got $size"
3193 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
3194 error "(6.8) write to the LOV EA hole should fail"
3196 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
3197 error "(6.9) write to normal stripe should NOT fail"
3199 echo "foo" >> $name && error "(6.10) append write $name should fail"
3201 chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"
3203 touch $name || error "(6.12) cannot touch $name"
3205 rm -f $name || error "(6.13) cannot unlink $name"
3208 # ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
3210 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
3211 if [ $OSTCOUNT -gt 2 ]; then
3212 echo "Check $name, it contains the old f2's stripe0 and stripe2"
3214 echo "Check $name, it contains the old f2's stripe0"
3217 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
3219 pattern=$($LFS getstripe -L $name)
3220 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3221 error "(7.2) expect pattern flag hole, but got $pattern"
3223 stripes=$($LFS getstripe -c $name)
3224 size=$(stat $name | awk '/Size:/ { print $2 }')
3225 if [ $OSTCOUNT -gt 2 ]; then
3226 [ $stripes -eq 3 ] ||
3227 error "(7.3.1) expect the stripe count is 3, but got $stripes"
3229 [ $size -eq $((4096 * $bcount)) ] ||
3230 error "(7.4.1) expect size $((4096 * $bcount)), but got $size"
3232 cat $name > /dev/null &&
3233 error "(7.5.1) normal read $name should fail"
3235 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3236 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3238 [ $failures -eq 256 ] ||
3239 error "(7.6) expect 256 IO failures, but get $failures"
3241 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3242 [ $size -eq $((4096 * $bcount)) ] ||
3243 error "(7.7) expect the size $((4096 * $bcount)), but got $size"
3245 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3246 seek=300 && error "(7.8.0) write to the LOV EA hole should fail"
3248 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
3249 error "(7.8.1) write to normal stripe should NOT fail"
3251 echo "foo" >> $name &&
3252 error "(7.8.3) append write $name should fail"
3254 chown $RUNAS_ID:$RUNAS_GID $name ||
3255 error "(7.9.1) cannot chown on $name"
3257 touch $name || error "(7.10.1) cannot touch $name"
3259 [ $stripes -eq 2 ] ||
3260 error "(7.3.2) expect the stripe count is 2, but got $stripes"
3263 [ $size -eq $((4096 * (256 + 0))) ] ||
3264 error "(7.4.2) expect the size $((4096 * 256)), but got $size"
3266 cat $name > /dev/null &&
3267 error "(7.5.2) normal read $name should fail"
3269 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3270 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3271 [ $failures -eq 256 ] ||
3272 error "(7.6.2) expect 256 IO failures, but get $failures"
3275 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3276 [ $size -eq $((4096 * $bcount)) ] ||
3277 error "(7.7.2) expect the size $((4096 * $bcount)), got $size"
3279 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3280 seek=256 && error "(7.8.2) write to the LOV EA hole should fail"
3282 chown $RUNAS_ID:$RUNAS_GID $name ||
3283 error "(7.9.2) cannot chown on $name"
3285 touch $name || error "(7.10.2) cannot touch $name"
3288 rm -f $name || error "(7.11) cannot unlink $name"
3290 [ $OSTCOUNT -le 2 ] && return
3293 # ${fid3}-R-0 should contain the old f3's stripe0 and stripe1
3295 name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
3296 echo "Check $name, which contains the old f3's stripe0 and stripe1"
3298 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
3300 pattern=$($LFS getstripe -L $name)
3301 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3302 error "(8.2) expect pattern flag hole, but got $pattern"
3304 stripes=$($LFS getstripe -c $name)
3305 [ $stripes -eq 3 ] ||
3306 error "(8.3) expect the stripe count is 3, but got $stripes"
3308 size=$(stat $name | awk '/Size:/ { print $2 }')
3310 [ $size -eq $((4096 * (256 + 256 + 0))) ] ||
3311 error "(8.4) expect the size $((4096 * 512)), but got $size"
3313 cat $name > /dev/null &&
3314 error "(8.5) normal read $name should fail"
3316 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3317 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3319 [ $failures -eq 256 ] ||
3320 error "(8.6) expect 256 IO failures, but get $failures"
3323 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3324 [ $size -eq $((4096 * $bcount)) ] ||
3325 error "(8.7) expect the size $((4096 * $bcount)), but got $size"
3327 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3328 seek=512 && error "(8.8) write to the LOV EA hole should fail"
3330 chown $RUNAS_ID:$RUNAS_GID $name ||
3331 error "(8.9) cannot chown on $name"
3333 touch $name || error "(8.10) cannot touch $name"
3335 rm -f $name || error "(8.11) cannot unlink $name"
3337 run_test 20a "Handle the orphan with dummy LOV EA slot properly"
# Skip guards and banner of test 20b (PFL case). Both guards now stop the
# test after skip(); every other skip guard in this file is followed by
# "&& return" or "&& exit 0", and without it the FILESET case went on to run.
3340 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
3341 [ -n "$FILESET" ] && skip "Not functional for FILESET set" && return
3344 echo "The target MDT-object and some of its OST-object are lost."
3345 echo "The LFSCK should find out the left OST-objects and re-create"
3346 echo "the MDT-object under the directory .lustre/lost+found/MDTxxxx/"
3347 echo "with the partial OST-objects (LOV EA hole)."
3349 echo "New client can access the file with LOV EA hole via normal"
3350 echo "system tools or commands without crash the system - PFL case."
3353 check_mount_and_prep
3355 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f0 ||
3356 error "(0) Fail to create PFL file $DIR/$tdir/f0"
3357 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f1 ||
3358 error "(1) Fail to create PFL file $DIR/$tdir/f1"
3359 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f2 ||
3360 error "(2) Fail to create PFL file $DIR/$tdir/f2"
3362 local bcount=$((256 * 3 + 1))
3364 dd if=/dev/zero of=$DIR/$tdir/f0 bs=4096 count=$bcount
3365 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4096 count=$bcount
3366 dd if=/dev/zero of=$DIR/$tdir/f2 bs=4096 count=$bcount
3368 local fid0=$($LFS path2fid $DIR/$tdir/f0)
3369 local fid1=$($LFS path2fid $DIR/$tdir/f1)
3370 local fid2=$($LFS path2fid $DIR/$tdir/f2)
3373 $LFS getstripe $DIR/$tdir/f0
3375 $LFS getstripe $DIR/$tdir/f1
3377 $LFS getstripe $DIR/$tdir/f2
3379 cancel_lru_locks mdc
3380 cancel_lru_locks osc
3382 echo "Inject failure..."
3383 echo "To simulate f0 lost MDT-object"
3384 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
3385 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1616
3388 echo "To simulate the case of f1 lost MDT-object and "
3389 echo "the first OST-object in each PFL component"
3390 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
3391 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161a
3394 echo "To simulate the case of f2 lost MDT-object and "
3395 echo "the second OST-object in each PFL component"
3396 do_facet $SINGLEMDS $LCTL set_param fail_val=1
3401 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
3403 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3404 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
3406 for k in $(seq $MDSCOUNT); do
3407 # The LFSCK status query interval is 30 seconds. In case some
3408 # LFSCK_NOTIFY RPCs fail or get lost, wait long enough to
3409 # guarantee that the status has been synced up.
3410 wait_update_facet mds${k} "$LCTL get_param -n \
3411 mdd.$(facet_svc mds${k}).lfsck_layout |
3412 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3413 error "(4) MDS${k} is not the expected 'completed'"
3416 for k in $(seq $OSTCOUNT); do
3417 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3418 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3419 awk '/^status/ { print $2 }')
3420 [ "$cur_status" == "completed" ] ||
3421 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
3424 local repaired=$(do_facet mds1 $LCTL get_param -n \
3425 mdd.$(facet_svc mds1).lfsck_layout |
3426 awk '/^repaired_orphan/ { print $2 }')
3427 [ $repaired -eq 8 ] ||
3428 error "(6) Expect 8 fixed on mds1, but got: $repaired"
3431 # ${fid0}-R-0 is the old f0
3433 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
3434 echo "Check $name, which is the old f0"
3436 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
3438 local pattern=$($LFS getstripe -L -I1 $name)
3439 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3440 error "(7.2.1) NOT expect pattern flag hole, but got $pattern"
3442 pattern=$($LFS getstripe -L -I2 $name)
3443 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3444 error "(7.2.2) NOT expect pattern flag hole, but got $pattern"
3446 local stripes=$($LFS getstripe -c -I1 $name)
3447 [ $stripes -eq 2 ] ||
3448 error "(7.3.1) expect 2 stripes, but got $stripes"
3450 stripes=$($LFS getstripe -c -I2 $name)
3451 [ $stripes -eq 2 ] ||
3452 error "(7.3.2) expect 2 stripes, but got $stripes"
3454 local e_start=$($LFS getstripe -I1 $name |
3455 awk '/lcme_extent.e_start:/ { print $2 }')
3456 [ $e_start -eq 0 ] ||
3457 error "(7.4.1) expect the COMP1 start at 0, got $e_start"
3459 local e_end=$($LFS getstripe -I1 $name |
3460 awk '/lcme_extent.e_end:/ { print $2 }')
3461 [ $e_end -eq 2097152 ] ||
3462 error "(7.4.2) expect the COMP1 end at 2097152, got $e_end"
3464 e_start=$($LFS getstripe -I2 $name |
3465 awk '/lcme_extent.e_start:/ { print $2 }')
3466 [ $e_start -eq 2097152 ] ||
3467 error "(7.5.1) expect the COMP2 start at 2097152, got $e_start"
3469 e_end=$($LFS getstripe -I2 $name |
3470 awk '/lcme_extent.e_end:/ { print $2 }')
3471 [ "$e_end" = "EOF" ] ||
3472 error "(7.5.2) expect the COMP2 end at (EOF), got $e_end"
3474 local size=$(stat $name | awk '/Size:/ { print $2 }')
3475 [ $size -eq $((4096 * $bcount)) ] ||
3476 error "(7.6) expect the size $((4096 * $bcount)), but got $size"
3478 cat $name > /dev/null || error "(7.7) cannot read $name"
3480 echo "dummy" >> $name || error "(7.8) cannot write $name"
3482 chown $RUNAS_ID:$RUNAS_GID $name || error "(7.9) cannot chown on $name"
3484 touch $name || error "(7.10) cannot touch $name"
3486 rm -f $name || error "(7.11) cannot unlink $name"
3489 # ${fid1}-R-0 contains the old f1's second stripe in each COMP
3491 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3492 echo "Check $name, it contains f1's second OST-object in each COMP"
3494 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
3496 pattern=$($LFS getstripe -L -I1 $name)
3497 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3498 error "(8.2.1) expect pattern flag hole, but got $pattern"
3500 pattern=$($LFS getstripe -L -I2 $name)
3501 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3502 error "(8.2.2) expect pattern flag hole, but got $pattern"
# Stripe-count checks for ${fid1}-R-0, one per PFL component (-I1, -I2).
# The component-1 failure is tagged (8.3.1) so it can be told apart from the
# component-2 failure (8.3.2); both used to report (8.3.2).
3504 stripes=$($LFS getstripe -c -I1 $name)
3505 [ $stripes -eq 2 ] ||
3506 error "(8.3.1) expect 2 stripes, but got $stripes"
3508 stripes=$($LFS getstripe -c -I2 $name)
3509 [ $stripes -eq 2 ] ||
3510 error "(8.3.2) expect 2 stripes, but got $stripes"
3512 e_start=$($LFS getstripe -I1 $name |
3513 awk '/lcme_extent.e_start:/ { print $2 }')
3514 [ $e_start -eq 0 ] ||
3515 error "(8.4.1) expect the COMP1 start at 0, got $e_start"
3517 e_end=$($LFS getstripe -I1 $name |
3518 awk '/lcme_extent.e_end:/ { print $2 }')
3519 [ $e_end -eq 2097152 ] ||
3520 error "(8.4.2) expect the COMP1 end at 2097152, got $e_end"
3522 e_start=$($LFS getstripe -I2 $name |
3523 awk '/lcme_extent.e_start:/ { print $2 }')
3524 [ $e_start -eq 2097152 ] ||
3525 error "(8.5.1) expect the COMP2 start at 2097152, got $e_start"
3527 e_end=$($LFS getstripe -I2 $name |
3528 awk '/lcme_extent.e_end:/ { print $2 }')
3529 [ "$e_end" = "EOF" ] ||
3530 error "(8.5.2) expect the COMP2 end at (EOF), got $e_end"
3532 size=$(stat $name | awk '/Size:/ { print $2 }')
3533 [ $size -eq $((4096 * $bcount)) ] ||
3534 error "(8.6) expect the size $((4096 * $bcount)), but got $size"
3536 cat $name > /dev/null && error "(8.7) normal read $name should fail"
3538 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3539 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3541 # The first stripe in each COMP was lost
3542 [ $failures -eq 512 ] ||
3543 error "(8.8) expect 512 IO failures, but get $failures"
3545 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3546 [ $size -eq $((4096 * $bcount)) ] ||
3547 error "(8.9) expect the size $((4096 * $bcount)), but got $size"
3549 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
3550 error "(8.10) write to the LOV EA hole should fail"
3552 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
3553 error "(8.11) write to normal stripe should NOT fail"
3555 echo "foo" >> $name && error "(8.12) append write $name should fail"
3557 chown $RUNAS_ID:$RUNAS_GID $name || error "(8.13) cannot chown on $name"
3559 touch $name || error "(8.14) cannot touch $name"
3561 rm -f $name || error "(8.15) cannot unlink $name"
3564 # ${fid2}-R-0 contains the old f2's first stripe in each COMP
3566 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
3567 echo "Check $name, it contains f2's first stripe in each COMP"
3569 $LFS getstripe -v $name || error "(9.1) cannot getstripe on $name"
3571 pattern=$($LFS getstripe -L -I1 $name)
3572 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3573 error "(9.2.1) expect pattern flag hole, but got $pattern"
3575 pattern=$($LFS getstripe -L -I2 $name)
3576 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3577 error "(9.2.2) expect pattern flag hole, but got $pattern"
# Stripe-count checks for ${fid2}-R-0, one per PFL component (-I1, -I2).
# The component-1 failure is tagged (9.3.1) so it can be told apart from the
# component-2 failure (9.3.2); both used to report (9.3.2).
3579 stripes=$($LFS getstripe -c -I1 $name)
3580 [ $stripes -eq 2 ] ||
3581 error "(9.3.1) expect 2 stripes, but got $stripes"
3583 stripes=$($LFS getstripe -c -I2 $name)
3584 [ $stripes -eq 2 ] ||
3585 error "(9.3.2) expect 2 stripes, but got $stripes"
3587 e_start=$($LFS getstripe -I1 $name |
3588 awk '/lcme_extent.e_start:/ { print $2 }')
3589 [ $e_start -eq 0 ] ||
3590 error "(9.4.1) expect the COMP1 start at 0, got $e_start"
3592 e_end=$($LFS getstripe -I1 $name |
3593 awk '/lcme_extent.e_end:/ { print $2 }')
3594 [ $e_end -eq 2097152 ] ||
3595 error "(9.4.2) expect the COMP1 end at 2097152, got $e_end"
3597 e_start=$($LFS getstripe -I2 $name |
3598 awk '/lcme_extent.e_start:/ { print $2 }')
3599 [ $e_start -eq 2097152 ] ||
3600 error "(9.5.1) expect the COMP2 start at 2097152, got $e_start"
3602 e_end=$($LFS getstripe -I2 $name |
3603 awk '/lcme_extent.e_end:/ { print $2 }')
3604 [ "$e_end" = "EOF" ] ||
3605 error "(9.5.2) expect the COMP2 end at (EOF), got $e_end"
3607 size=$(stat $name | awk '/Size:/ { print $2 }')
3608 # The second stripe in COMP was lost, so we do not know there
3609 # have ever been some data before. 'stat' will regard it as
3610 # no data on the lost stripe.
3612 [ $size -eq $((4096 * $bcount)) ] ||
3613 error "(9.6) expect size $((4096 * $bcount)), but got $size"
3615 cat $name > /dev/null &&
3616 error "(9.7) normal read $name should fail"
# Copy the file with dd, continuing past read errors (conv=noerror), and
# count the "Input/output error" lines dd reports. 512 failures are expected;
# the message now quotes the number that is actually checked (it said 256).
3618 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3619 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3620 [ $failures -eq 512 ] ||
3621 error "(9.8) expect 512 IO failures, but get $failures"
3623 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3624 # The second stripe in COMP was lost, so we do not know there
3625 # have ever been some data before. Since 'dd' skip failure,
3626 # it will regard the lost stripe contains data.
3628 [ $size -eq $((4096 * $bcount)) ] ||
3629 error "(9.9) expect the size $((4096 * $bcount)), but got $size"
3631 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3632 seek=300 && error "(9.10) write to the LOV EA hole should fail"
3634 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
3635 error "(9.11) write to normal stripe should NOT fail"
3637 echo "foo" >> $name &&
3638 error "(9.12) append write $name should fail"
3640 chown $RUNAS_ID:$RUNAS_GID $name ||
3641 error "(9.13) cannot chown on $name"
3643 touch $name || error "(9.14) cannot touch $name"
# Last check of the ${fid2}-R-0 (9.x) series; the tag used to read (7.15),
# which collides with the numbering of the ${fid0}-R-0 checks.
3645 rm -f $name || error "(9.15) cannot unlink $name"
3647 run_test 20b "Handle the orphan with dummy LOV EA slot properly - PFL case"
# Test 21 body: start LFSCK on MDT0000 without naming a component type, check
# that both the namespace and the layout component report "scanning-phase1",
# then stop LFSCK again.
# Skipped when the MDS version code is below 2.5.59.
3650 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.5.59) ]] &&
3651 skip "ignore the test if MDS is older than 2.5.59" && return
3653 check_mount_and_prep
3654 createmany -o $DIR/$tdir/f 100 || error "(0) Fail to create 100 files"
3656 echo "Start all LFSCK components by default (-s 1)"
# No "-t <type>" is passed here, unlike START_NAMESPACE/START_LAYOUT.
# NOTE(review): "-s 1" presumably throttles the scan so it is still in
# phase 1 when the status is read below - confirm against lctl lfsck_start.
3657 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -s 1 -r ||
3658 error "Fail to start LFSCK"
3660 echo "namespace LFSCK should be in 'scanning-phase1' status"
# The status is the second field of the line starting with "status".
3661 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
3662 [ "$STATUS" == "scanning-phase1" ] ||
3663 error "Expect namespace 'scanning-phase1', but got '$STATUS'"
3665 echo "layout LFSCK should be in 'scanning-phase1' status"
3666 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
3667 [ "$STATUS" == "scanning-phase1" ] ||
3668 error "Expect layout 'scanning-phase1', but got '$STATUS'"
3670 echo "Stop all LFSCK components by default"
# Likewise stopped without naming a component type.
3671 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 ||
3672 error "Fail to stop LFSCK"
3674 run_test 21 "run all LFSCK components by default"
# Test 22a body (needs >= 2 MDTs): create foo/dummy on MDT0 while fail_loc
# 0x161e is set, remove the "guard" directory, run namespace LFSCK on all
# targets and expect exactly one unmatched pair to be repaired, after which
# listing foo/dummy must work.
3677 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3680 echo "The parent_A references the child directory via some name entry,"
3681 echo "but the child directory back references another parent_B via its"
# The inner double quotes are escaped so that ".." is printed with its quotes;
# unescaped, they closed and reopened the string and were silently dropped.
3682 echo "\"..\" name entry. The parent_B does not exist. Then the namespace"
3683 echo "LFSCK will repair the child directory's \"..\" name entry."
3686 check_mount_and_prep
# Both helper directories are placed on MDT index 1.
3688 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
3689 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
3691 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
3692 echo "The dummy's dotdot name entry references the guard."
# The fail_loc is active only for the creation of foo/dummy on MDT index 0.
3693 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
3694 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
3695 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
3696 error "(3) Fail to mkdir on MDT0"
3697 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Per the banner above, removing guard makes the referenced parent disappear.
3699 rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"
3701 echo "Trigger namespace LFSCK to repair unmatched pairs"
3702 $START_NAMESPACE -A -r ||
3703 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): the trailing 6 looks like the error tag used by the helper
# (it fills the gap between (5) and (7)) - confirm in its definition.
3705 wait_all_targets_blocked namespace completed 6
3707 local repaired=$($SHOW_NAMESPACE |
3708 awk '/^unmatched_pairs_repaired/ { print $2 }')
3709 [ $repaired -eq 1 ] ||
3710 error "(7) Fail to repair unmatched pairs: $repaired"
3712 echo "'ls' should success after namespace LFSCK repairing"
3713 ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
3714 error "(8) ls should success."
3716 run_test 22a "LFSCK can repair unmatched pairs (1)"
# Test 22b body (needs >= 2 MDTs): like 22a, but the "guard" parent is kept.
# fid2path on the dummy directory must not resolve to its real path before
# the namespace LFSCK run, and must resolve to it afterwards; exactly one
# unmatched pair is expected to be repaired.
3719 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3722 echo "The parent_A references the child directory via the name entry_B,"
3723 echo "but the child directory back references another parent_C via its"
# The inner double quotes are escaped so that ".." is printed with its quotes;
# unescaped, they closed and reopened the string and were silently dropped.
3724 echo "\"..\" name entry. The parent_C exists, but there is no the name"
3725 echo "entry_B under the parent_C. Then the namespace LFSCK will repair"
3726 echo "the child directory's \"..\" name entry and its linkEA."
3729 check_mount_and_prep
# Both helper directories are placed on MDT index 1.
3731 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
3732 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
3734 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
3735 echo "and bad linkEA. The dummy's dotdot name entry references the"
3736 echo "guard. The dummy's linkEA references n non-exist name entry."
# The fail_loc is active only for the creation of foo/dummy on MDT index 0.
3737 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
3738 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
3739 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
3740 error "(3) Fail to mkdir on MDT0"
3741 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Remember the FID now; it is resolved again after the LFSCK run.
3743 local dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
3744 echo "fid2path should NOT work on the dummy's FID $dummyfid"
3745 local dummyname=$($LFS fid2path $DIR $dummyfid)
3746 [ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
3747 error "(4) fid2path works unexpectedly."
3749 echo "Trigger namespace LFSCK to repair unmatched pairs"
3750 $START_NAMESPACE -A -r ||
3751 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): the trailing 6 looks like the error tag used by the helper
# (it fills the gap between (5) and (7)) - confirm in its definition.
3753 wait_all_targets_blocked namespace completed 6
3755 local repaired=$($SHOW_NAMESPACE |
3756 awk '/^unmatched_pairs_repaired/ { print $2 }')
3757 [ $repaired -eq 1 ] ||
3758 error "(7) Fail to repair unmatched pairs: $repaired"
3760 echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
3761 local dummyname=$($LFS fid2path $DIR $dummyfid)
3762 [ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
3763 error "(8) fid2path does not work"
3765 run_test 22b "LFSCK can repair unmatched pairs (2)"
# Test 23a body (needs >= 2 MDTs): remove d0/d1 while fail_loc 0x1620 is set
# on mds2 so that a dangling name entry is left behind, then run namespace
# LFSCK twice: first without -C (ls must still fail afterwards), then with -C
# (ls must succeed afterwards). Each run must report dangling_repaired == 1.
3768 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3771 echo "The name entry is there, but the MDT-object for such name "
3772 echo "entry does not exist. The namespace LFSCK should find out "
3773 echo "and repair the inconsistency as required."
3776 check_mount_and_prep
# Parent d0 on MDT index 0, child d1 on MDT index 1.
3778 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3779 $LFS mkdir -i 1 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT1"
3781 echo "Inject failure stub on MDT1 to simulate dangling name entry"
# The fail_loc is active on mds2 only for the rmdir.
3782 #define OBD_FAIL_LFSCK_DANGLING2 0x1620
3783 do_facet mds2 $LCTL set_param fail_loc=0x1620
3784 rmdir $DIR/$tdir/d0/d1 || error "(3) Fail to rmdir d1"
3785 do_facet mds2 $LCTL set_param fail_loc=0
3787 echo "'ls' should fail because of dangling name entry"
# A successful ls here means the injection did not take effect.
3788 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(4) ls should fail."
3790 echo "Trigger namespace LFSCK to find out dangling name entry"
# First run: no -C.
3791 $START_NAMESPACE -A -r ||
3792 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): the trailing 6 looks like the error tag used by the helper
# (it fills the gap between (5) and (7)) - confirm in its definition.
3794 wait_all_targets_blocked namespace completed 6
3796 local repaired=$($SHOW_NAMESPACE |
3797 awk '/^dangling_repaired/ { print $2 }')
3798 [ $repaired -eq 1 ] ||
3799 error "(7) Fail to repair dangling name entry: $repaired"
3801 echo "'ls' should fail because not re-create MDT-object by default"
3802 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(8) ls should fail."
3804 echo "Trigger namespace LFSCK again to repair dangling name entry"
# Second run: with -C; per the messages around it, this is the run that
# re-creates the missing MDT-object.
3805 $START_NAMESPACE -A -r -C ||
3806 error "(9) Fail to start LFSCK for namespace"
3808 wait_all_targets_blocked namespace completed 10
3810 repaired=$($SHOW_NAMESPACE |
3811 awk '/^dangling_repaired/ { print $2 }')
3812 [ $repaired -eq 1 ] ||
3813 error "(11) Fail to repair dangling name entry: $repaired"
3815 echo "'ls' should success after namespace LFSCK repairing"
3816 ls -ail $DIR/$tdir/d0/d1 > /dev/null || error "(12) ls should success."
3818 run_test 23a "LFSCK can repair dangling name entry (1)"
# Test 23b body: hard-link f0 as "foo" while fail_loc 0x1621 is set with
# fail_val = object id of f1, then unlink f1 so that "foo" becomes dangling.
# Namespace LFSCK with -C is expected to report dangling_repaired == 1 and
# multiple_linked_repaired == 1, and "foo" must then contain f0's data.
3822 echo "The objectA has multiple hard links, one of them corresponding"
3823 echo "to the name entry_B. But there is something wrong for the name"
3824 echo "entry_B and cause entry_B to references non-exist object_C."
3825 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3826 echo "as dangling, and re-create the lost object_C. When the LFSCK"
3827 echo "comes to the second-stage scanning, it will find that the"
3828 echo "former re-creating object_C is not proper, and will try to"
3829 echo "replace the object_C with the real object_A."
3832 check_mount_and_prep
# The bare path2fid calls below only print the FIDs into the test log.
3834 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3835 $LFS path2fid $DIR/$tdir/d0
# Ten filler files t*; they are unlinked again before f1 is removed.
3837 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
3839 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3840 $LFS path2fid $DIR/$tdir/d0/f0
3842 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
3843 $LFS path2fid $DIR/$tdir/d0/f1
# First ':'-separated field of each FID.
3845 local SEQ0=$($LFS path2fid $DIR/$tdir/d0/f0 | awk -F':' '{print $1}')
3846 local SEQ1=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $1}')
# NOTE(review): after f0 is re-created, the sequence is not compared again
# in the lines visible here.
3848 if [ "$SEQ0" != "$SEQ1" ]; then
3849 # To guarantee that the f0 and f1 are in the same FID seq
3850 rm -f $DIR/$tdir/d0/f0 ||
3851 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
3852 echo "dummy" > $DIR/$tdir/d0/f0 ||
3853 error "(3.2) Fail to touch on MDT0"
3854 $LFS path2fid $DIR/$tdir/d0/f0
# Second ':'-separated field of f1's FID, converted to decimal by printf %d
# before it is passed as fail_val.
3857 local OID=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $2}')
3858 OID=$(printf %d $OID)
3860 echo "Inject failure stub on MDT0 to simulate dangling name entry"
# fail_val/fail_loc are active only for the hard-link creation.
3861 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
3862 do_facet $SINGLEMDS $LCTL set_param fail_val=$OID fail_loc=0x1621
3863 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
3864 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3866 # If there is creation after the dangling injection, it may re-use
3867 # the just released local object (inode) that is referenced by the
3868 # dangling name entry. It will fail the dangling injection.
3869 # So before deleting the target object for the dangling name entry,
3870 # remove some other objects to avoid the target object being reused
3871 # by some potential creations. LU-7429
3872 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
3874 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
3876 echo "'ls' should fail because of dangling name entry"
# A successful ls here means the injection did not take effect.
3877 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3878 error "(6) ls should fail."
3880 echo "Trigger namespace LFSCK to find out dangling name entry"
# Started with -C but without -A, unlike tests 22a/22b/23a.
3881 $START_NAMESPACE -r -C ||
3882 error "(7) Fail to start LFSCK for namespace"
# Wait for the namespace status on $SINGLEMDS to read "completed"; the
# trailing 32 is passed to wait_update_facet after the expected value.
3884 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3885 mdd.${MDT_DEV}.lfsck_namespace |
3886 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3888 error "(8) unexpected status"
3891 local repaired=$($SHOW_NAMESPACE |
3892 awk '/^dangling_repaired/ { print $2 }')
3893 [ $repaired -eq 1 ] ||
3894 error "(9) Fail to repair dangling name entry: $repaired"
3896 repaired=$($SHOW_NAMESPACE |
3897 awk '/^multiple_linked_repaired/ { print $2 }')
3898 [ $repaired -eq 1 ] ||
3899 error "(10) Fail to drop the former created object: $repaired"
# "foo" must now hold the content written to f0 ("dummy"), not f1's "dead".
3901 local data=$(cat $DIR/$tdir/d0/foo)
3902 [ "$data" == "dummy" ] ||
3903 error "(11) The $DIR/$tdir/d0/foo is not recovered: $data"
3905 run_test 23b "LFSCK can repair dangling name entry (2)"
# Cleanup steps located between tests 23b and 23c. The enclosing definition's
# header is not visible in this listing -- presumably a cleanup helper used
# by test 23c; confirm against the full file.
# Clears fail_val/fail_loc on the MDS, polls until the namespace LFSCK status
# reads "completed", then stops full debug logging.
3908 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3909 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3910 mdd.${MDT_DEV}.lfsck_namespace |
3911 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3913 error "(10) unexpected status"
3916 stop_full_debug_logging
# Test 23c body (registered by the run_test line at the end of this block).
# Scenario: same dangling-entry injection as test 23b (fail_loc 0x1621 with
# fail_val = f1's OID), but LFSCK is slowed down with fail_loc 0x1602
# (OBD_FAIL_LFSCK_DELAY3, fail_val=10) so that the test can append data to
# the re-created object behind "foo" before the second-stage scan. The test
# then expects dangling_repaired == 1 and that "foo" does NOT read back as
# "dummy".
# NOTE(review): the function header and block terminators are not visible
# in this listing -- confirm structure against the full file.
3921 echo "The objectA has multiple hard links, one of them corresponding"
3922 echo "to the name entry_B. But there is something wrong for the name"
3923 echo "entry_B and cause entry_B to references non-exist object_C."
3924 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3925 echo "as dangling, and re-create the lost object_C. And then others"
3926 echo "modified the re-created object_C. When the LFSCK comes to the"
3927 echo "second-stage scanning, it will find that the former re-creating"
3928 echo "object_C maybe wrong and try to replace the object_C with the"
3929 echo "real object_A. But because object_C has been modified, so the"
3930 echo "LFSCK cannot replace it."
3933 start_full_debug_logging
3935 check_mount_and_prep
3937 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3938 parent_fid="$($LFS path2fid $DIR/$tdir/d0)"
3939 echo "parent_fid=$parent_fid"
# Ten extra files t0..t9; they are unlinked again before f1 is removed
# (see the LU-7429 comment further down).
3941 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
3943 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3944 f0_fid="$($LFS path2fid $DIR/$tdir/d0/f0)"
3945 echo "f0_fid=$f0_fid"
3947 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
3948 f1_fid="$($LFS path2fid $DIR/$tdir/d0/f1)"
3949 echo "f1_fid=$f1_fid"
# Compare the sequence part of both FIDs, i.e. everything before the first
# ':'. The variables holding the FIDs are f0_fid/f1_fid; "%%:*" strips the
# longest suffix that starts with ':' (shell pattern, not a regex).
3951 if [ "${f0_fid%%:*}" != "${f1_fid%%:*}" ]; then
3952 # To guarantee that the f0 and f1 are in the same FID seq
3953 rm -f $DIR/$tdir/d0/f0 ||
3954 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
3955 echo "dummy" > $DIR/$tdir/d0/f0 ||
3956 error "(3.2) Fail to touch on MDT0"
3957 f0_fid="$($LFS path2fid $DIR/$tdir/d0/f0)"
3958 echo "f0_fid=$f0_fid (replaced)"
# Second ':'-separated field of f1's FID, used as fail_val below.
3961 local oid=$(awk -F':' '{ printf $2 }' <<< $f1_fid)
3963 echo "Inject failure stub on MDT0 to simulate dangling name entry"
3964 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
3965 do_facet $SINGLEMDS $LCTL set_param fail_val=$oid fail_loc=0x1621
3966 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
3967 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3969 # If there is creation after the dangling injection, it may re-use
3970 # the just released local object (inode) that is referenced by the
3971 # dangling name entry. It will fail the dangling injection.
3972 # So before deleting the target object for the dangling name entry,
3973 # remove some other objects to avoid the target object being reused
3974 # by some potential creations. LU-7429
3975 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
3977 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
# A successful ls at this point means the injection did not take effect.
3979 echo "'ls' should fail because of dangling name entry"
3980 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3981 error "(6) ls should fail."
3983 #define OBD_FAIL_LFSCK_DELAY3 0x1602
3984 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
3986 echo "Trigger namespace LFSCK to find out dangling name entry"
3987 $START_NAMESPACE -r -C ||
3988 error "(7) Fail to start LFSCK for namespace"
# Wait until foo is visible on the client with size 0, i.e. the re-created
# (empty) object is in place. If that wait fails, the fallback below accepts
# the case where foo already has size 6 and content "dummy".
3990 wait_update_facet client "stat -c%s $DIR/$tdir/d0/foo" "0" $LTIME || {
3991 # While unexpected by the test, it is valid for LFSCK to repair
3992 # the link to the original object before any data is written.
3993 local size=$(stat -c %s $DIR/$tdir/d0/foo)
3995 if [ "$size" = "6" -a "$(<$DIR/$tdir/d0/foo)" = "dummy" ]; then
3996 log "LFSCK repaired file prematurely"
4001 stat $DIR/$tdir/d0/foo
4003 error "(8) unexpected size"
# Modify the re-created object so that LFSCK must not replace it.
4006 echo "data" >> $DIR/$tdir/d0/foo || error "(9) Fail to write"
4007 cancel_lru_locks osc
4011 local repaired=$($SHOW_NAMESPACE |
4012 awk '/^dangling_repaired/ { print $2 }')
4013 [ $repaired -eq 1 ] ||
4014 error "(11) Fail to repair dangling name entry: $repaired"
# foo must not have been switched back to f0's content.
4016 local data=$(cat $DIR/$tdir/d0/foo)
4017 [ "$data" != "dummy" ] ||
4018 error "(12) The $DIR/$tdir/d0/foo should not be recovered"
4020 run_test 23c "LFSCK can repair dangling name entry (3)"
# Test 24 body (registered by the run_test line at the end of this block).
# Requires at least two MDTs. With fail_loc 0x1622 (OBD_FAIL_LFSCK_MUL_REF)
# armed, directory d0/dummy/foo is created on MDT0 and removed again; the
# test then expects namespace LFSCK to report
# multiple_referenced_repaired == 1 and an entry named
# "<cfid>-<pfid>-D-0" under .lustre/lost+found/MDT0000/.
# NOTE(review): the function header and block terminators are not visible
# in this listing -- confirm structure against the full file.
4023 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# NOTE(review): unlike the guard above, this skip is not followed by
# "&& return"; whether the test continues depends on how skip is
# implemented -- confirm.
4024 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4027 echo "Two MDT-objects back reference the same name entry via their"
4028 echo "each own linkEA entry, but the name entry only references one"
4029 echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
4030 echo "for the MDT-object that is not recognized. If such MDT-object"
4031 echo "has no other linkEA entry after the removing, then the LFSCK"
4032 echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."
4035 check_mount_and_prep
4037 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4039 mkdir $DIR/$tdir/d0/guard || error "(1) Fail to mkdir guard"
4040 $LFS path2fid $DIR/$tdir/d0/guard
4042 mkdir $DIR/$tdir/d0/dummy || error "(2) Fail to mkdir dummy"
4043 $LFS path2fid $DIR/$tdir/d0/dummy
# pfid is taken from guard or from dummy depending on the MDS backend type
# (the line separating the two branches is not visible in this listing).
4046 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
4047 pfid=$($LFS path2fid $DIR/$tdir/d0/guard)
4049 pfid=$($LFS path2fid $DIR/$tdir/d0/dummy)
4052 touch $DIR/$tdir/d0/guard/foo ||
4053 error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"
4055 echo "Inject failure stub on MDT0 to simulate the case that"
4056 echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
4057 echo "that references $DIR/$tdir/d0/guard/foo."
4058 echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
4059 echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
4060 echo "there with the same linkEA entry as another MDT-object"
4061 echo "$DIR/$tdir/d0/guard/foo has"
4063 #define OBD_FAIL_LFSCK_MUL_REF 0x1622
4064 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
4065 $LFS mkdir -i 0 $DIR/$tdir/d0/dummy/foo ||
4066 error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
4067 $LFS path2fid $DIR/$tdir/d0/dummy/foo
# Remember the child's FID before its name entry is removed; it is used to
# build the expected lost+found name below.
4068 local cfid=$($LFS path2fid $DIR/$tdir/d0/dummy/foo)
4069 rmdir $DIR/$tdir/d0/dummy/foo ||
4070 error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
4071 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4073 echo "stat $DIR/$tdir/d0/dummy/foo should fail"
4074 stat $DIR/$tdir/d0/dummy/foo > /dev/null 2>&1 &&
4075 error "(6) stat successfully unexpectedly"
4077 echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
4078 $START_NAMESPACE -A -r ||
4079 error "(7) Fail to start LFSCK for namespace"
# The trailing number appears to be the step tag used by the helper when the
# wait fails (the local error numbering skips it) -- confirm in the helper.
4081 wait_all_targets_blocked namespace completed 8
4083 local repaired=$($SHOW_NAMESPACE |
4084 awk '/^multiple_referenced_repaired/ { print $2 }')
4085 [ $repaired -eq 1 ] ||
4086 error "(9) Fail to repair multiple referenced name entry: $repaired"
4088 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
4089 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4090 error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# Expected orphan name: child FID, parent FID, then the "-D-0" suffix.
4092 local cname="$cfid-$pfid-D-0"
4093 ls -ail $MOUNT/.lustre/lost+found/MDT0000/$cname ||
4094 error "(11) .lustre/lost+found/MDT0000/ should not be empty"
4096 run_test 24 "LFSCK can repair multiple-referenced name entry"
# Test 25 body (registered by the run_test line at the end of this block).
# Runs only when the MDS backend is ldiskfs. A file is created while
# fail_loc 0x1623 (OBD_FAIL_LFSCK_BAD_TYPE) is armed; namespace LFSCK is then
# expected to report bad_file_type_repaired == 1 and the directory must be
# listable afterwards.
# NOTE(review): the function header and block terminators are not visible
# in this listing -- confirm structure against the full file.
4099 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
4100 skip "ldiskfs only test" && return
4103 echo "The file type in the name entry does not match the file type"
4104 echo "claimed by the referenced object. Then the LFSCK will update"
4105 echo "the file type in the name entry."
4108 check_mount_and_prep
4110 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4112 echo "Inject failure stub on MDT0 to simulate the case that"
4113 echo "the file type stored in the name entry is wrong."
4115 #define OBD_FAIL_LFSCK_BAD_TYPE 0x1623
4116 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
4117 touch $DIR/$tdir/d0/foo || error "(2) Fail to touch $DIR/$tdir/d0/foo"
4118 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4120 echo "Trigger namespace LFSCK to repair bad file type in the name entry"
4121 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace"
# Poll the "status" field of mdd.${MDT_DEV}.lfsck_namespace until it reads
# "completed"; 32 is the last wait_update_facet argument (presumably a
# timeout in seconds -- confirm in test-framework.sh).
4123 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4124 mdd.${MDT_DEV}.lfsck_namespace |
4125 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4127 error "(4) unexpected status"
4130 local repaired=$($SHOW_NAMESPACE |
4131 awk '/^bad_file_type_repaired/ { print $2 }')
4132 [ $repaired -eq 1 ] ||
4133 error "(5) Fail to repair bad file type in name entry: $repaired"
4135 ls -ail $DIR/$tdir/d0 || error "(6) Fail to 'ls' the $DIR/$tdir/d0"
4137 run_test 25 "LFSCK can repair bad file type in the name entry"
# Test 26a body (registered by the run_test line at the end of this block).
# File foo (with an extra hard link "dummy") is unlinked while fail_loc
# 0x1624 (OBD_FAIL_LFSCK_NO_NAMEENTRY) is armed. Namespace LFSCK is expected
# to report lost_dirent_repaired == 1, after which foo must be listable
# again and must still have the FID recorded before the injection.
# NOTE(review): the function header and block terminators are not visible
# in this listing -- confirm structure against the full file.
4141 echo "The local name entry back referenced by the MDT-object is lost."
4142 echo "The namespace LFSCK will add the missing local name entry back"
4143 echo "to the normal namespace."
4146 check_mount_and_prep
4148 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4149 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
# FID before the injection; compared with the FID after repair at the end.
4150 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
4152 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
4153 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4155 echo "Inject failure stub on MDT0 to simulate the case that"
4156 echo "foo's name entry will be removed, but the foo's object"
4157 echo "and its linkEA are kept in the system."
4159 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4160 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4161 rm -f $DIR/$tdir/d0/foo || error "(4) Fail to unlink $DIR/$tdir/d0/foo"
4162 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# The name must be gone from the client's point of view before LFSCK runs.
4164 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4165 error "(5) 'ls' should fail"
4167 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4168 $START_NAMESPACE -r -A ||
4169 error "(6) Fail to start LFSCK for namespace"
# The trailing number appears to be the step tag used by the helper when the
# wait fails (the local error numbering skips it) -- confirm in the helper.
4171 wait_all_targets_blocked namespace completed 7
4173 local repaired=$($SHOW_NAMESPACE |
4174 awk '/^lost_dirent_repaired/ { print $2 }')
4175 [ $repaired -eq 1 ] ||
4176 error "(8) Fail to repair lost dirent: $repaired"
4178 ls -ail $DIR/$tdir/d0/foo ||
4179 error "(9) Fail to 'ls' $DIR/$tdir/d0/foo"
4181 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
4182 [ "$foofid" == "$foofid2" ] ||
4183 error "(10) foo's FID changed: $foofid, $foofid2"
4185 run_test 26a "LFSCK can add the missing local name entry back to the namespace"
# Test 26b body (registered by the run_test line at the end of this block).
# Requires at least two MDTs. Directory d0 is created with "-i 1" and its
# child foo with "-i 0"; foo is removed while fail_loc 0x1624
# (OBD_FAIL_LFSCK_NO_NAMEENTRY) is armed. Namespace LFSCK is expected to
# report lost_dirent_repaired == 1, after which foo must be listable again
# with an unchanged FID.
# NOTE(review): the function header and block terminators are not visible
# in this listing -- confirm structure against the full file.
4188 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4191 echo "The remote name entry back referenced by the MDT-object is lost."
4192 echo "The namespace LFSCK will add the missing remote name entry back"
4193 echo "to the normal namespace."
4196 check_mount_and_prep
4198 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4199 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
# FID before the injection; compared with the FID after repair at the end.
4200 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
4202 echo "Inject failure stub on MDT0 to simulate the case that"
4203 echo "foo's name entry will be removed, but the foo's object"
4204 echo "and its linkEA are kept in the system."
4206 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4207 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4208 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
4209 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# The name must be gone from the client's point of view before LFSCK runs.
4211 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4212 error "(4) 'ls' should fail"
4214 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4215 $START_NAMESPACE -r -A ||
4216 error "(5) Fail to start LFSCK for namespace"
# The trailing number appears to be the step tag used by the helper when the
# wait fails (the local error numbering skips it) -- confirm in the helper.
4218 wait_all_targets_blocked namespace completed 6
4220 local repaired=$($SHOW_NAMESPACE |
4221 awk '/^lost_dirent_repaired/ { print $2 }')
4222 [ $repaired -eq 1 ] ||
4223 error "(7) Fail to repair lost dirent: $repaired"
4225 ls -ail $DIR/$tdir/d0/foo ||
4226 error "(8) Fail to 'ls' $DIR/$tdir/d0/foo"
4228 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
4229 [ "$foofid" == "$foofid2" ] ||
4230 error "(9) foo's FID changed: $foofid, $foofid2"
4232 run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
# Test 27a body (registered by the run_test line at the end of this block).
# File foo and its hard link "dummy" are both unlinked while fail_loc 0x1624
# (OBD_FAIL_LFSCK_NO_NAMEENTRY) is armed, then the parent directory d0 is
# removed. Namespace LFSCK is expected to report lost_dirent_repaired == 1
# and to leave an entry whose name matches "*-P-*" under
# .lustre/lost+found/MDT0000/.
# NOTE(review): the function header and block terminators are not visible
# in this listing -- confirm structure against the full file.
4235 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4238 echo "The local parent referenced by the MDT-object linkEA is lost."
4239 echo "The namespace LFSCK will re-create the lost parent as orphan."
4242 check_mount_and_prep
4244 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4245 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4246 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
4247 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4249 echo "Inject failure stub on MDT0 to simulate the case that"
4250 echo "foo's name entry will be removed, but the foo's object"
4251 echo "and its linkEA are kept in the system. And then remove"
4252 echo "another hard link and the parent directory."
4254 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4255 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4256 rm -f $DIR/$tdir/d0/foo ||
4257 error "(4) Fail to unlink $DIR/$tdir/d0/foo"
4258 rm -f $DIR/$tdir/d0/dummy ||
4259 error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
4260 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4262 rm -rf $DIR/$tdir/d0 || error "(5) Fail to unlink the dir d0"
4263 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(6) 'ls' should fail"
4265 echo "Trigger namespace LFSCK to repair the lost parent"
4266 $START_NAMESPACE -r -A ||
4267 error "(6) Fail to start LFSCK for namespace"
# The trailing number appears to be the step tag used by the helper when the
# wait fails (the local error numbering skips it) -- confirm in the helper.
4269 wait_all_targets_blocked namespace completed 7
4271 local repaired=$($SHOW_NAMESPACE |
4272 awk '/^lost_dirent_repaired/ { print $2 }')
4273 [ $repaired -eq 1 ] ||
4274 error "(8) Fail to repair lost dirent: $repaired"
4276 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
4277 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4278 error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4280 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# The -name pattern is quoted so that find receives it verbatim; unquoted,
# the shell would expand it first if anything in the current directory
# happened to match.
4282 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-P-*')
4283 [ ! -z "$cname" ] ||
4284 error "(10) .lustre/lost+found/MDT0000/ should not be empty"
4286 run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
# Test 27b body (registered by the run_test line at the end of this block).
# Requires at least two MDTs. Directory d0 is created with "-i 1" and its
# child foo with "-i 0"; foo is removed while fail_loc 0x1624
# (OBD_FAIL_LFSCK_NO_NAMEENTRY) is armed, then d0 is removed. Namespace LFSCK
# is expected to report lost_dirent_repaired == 1 and to leave an entry
# whose name matches "*-P-*" under .lustre/lost+found/MDT0001/.
# NOTE(review): the function header and block terminators are not visible
# in this listing -- confirm structure against the full file.
4289 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4290 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4293 echo "The remote parent referenced by the MDT-object linkEA is lost."
4294 echo "The namespace LFSCK will re-create the lost parent as orphan."
4297 check_mount_and_prep
4299 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4300 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
4302 $LFS path2fid $DIR/$tdir/d0
4304 echo "Inject failure stub on MDT0 to simulate the case that"
4305 echo "foo's name entry will be removed, but the foo's object"
4306 echo "and its linkEA are kept in the system. And then remove"
4307 echo "the parent directory."
4309 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4310 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4311 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
4312 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4314 rmdir $DIR/$tdir/d0 || error "(4) Fail to unlink the dir d0"
4315 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(5) 'ls' should fail"
4317 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4318 $START_NAMESPACE -r -A ||
4319 error "(6) Fail to start LFSCK for namespace"
# The trailing number appears to be the step tag used by the helper when the
# wait fails (the local error numbering skips it) -- confirm in the helper.
4321 wait_all_targets_blocked namespace completed 7
4323 local repaired=$($SHOW_NAMESPACE |
4324 awk '/^lost_dirent_repaired/ { print $2 }')
4325 [ $repaired -eq 1 ] ||
4326 error "(8) Fail to repair lost dirent: $repaired"
4328 ls -ail $MOUNT/.lustre/lost+found/
4330 echo "There should be an orphan under .lustre/lost+found/MDT0001/"
4331 [ -d $MOUNT/.lustre/lost+found/MDT0001 ] ||
4332 error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"
4334 ls -ail $MOUNT/.lustre/lost+found/MDT0001/
# The -name pattern is quoted so that find receives it verbatim; unquoted,
# the shell would expand it first if anything in the current directory
# happened to match.
4336 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-P-*')
4337 [ ! -z "$cname" ] ||
4338 error "(10) .lustre/lost+found/MDT0001/ should not be empty"
4340 run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
# Test 28 body (registered by the run_test line at the end of this block).
# Requires at least two MDTs. Two cross-MDT directory trees are built
# (d1 with "-i 0" holding children created with "-i 1", and d2 the other way
# round). d1/a1 and d2/a2 are removed while fail_loc 0x1624
# (OBD_FAIL_LFSCK_NO_NAMEENTRY) is armed on mds1 and mds2. Then fail_loc
# 0x161c (OBD_FAIL_LFSCK_BAD_NETWORK) is set on mds2 and LFSCK is started on
# all devices. First run: mds1 must end "partial" with
# lost_dirent_repaired == 0, mds2 must end "completed" with
# lost_dirent_repaired == 1. Second run, without the injection: mds1 must
# report 1 and mds2 must report 0.
# NOTE(review): the function header and block terminators are not visible
# in this listing -- confirm structure against the full file.
4343 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4346 echo "The target name entry is lost. The LFSCK should insert the"
4347 echo "orphan MDT-object under .lustre/lost+found/MDTxxxx. But if"
4348 echo "the MDT (on which the orphan MDT-object resides) has ever"
4349 echo "failed to respond some name entry verification during the"
4350 echo "first stage-scanning, then the LFSCK should skip to handle"
4351 echo "orphan MDT-object on this MDT. But other MDTs should not"
4355 check_mount_and_prep
4356 $LFS mkdir -i 0 $DIR/$tdir/d1
4357 $LFS mkdir -i 1 $DIR/$tdir/d1/a1
4358 $LFS mkdir -i 1 $DIR/$tdir/d1/a2
4360 $LFS mkdir -i 1 $DIR/$tdir/d2
4361 $LFS mkdir -i 0 $DIR/$tdir/d2/a1
4362 $LFS mkdir -i 0 $DIR/$tdir/d2/a2
4364 echo "Inject failure stub on MDT0 to simulate the case that"
4365 echo "d1/a1's name entry will be removed, but the d1/a1's object"
4366 echo "and its linkEA are kept in the system. And the case that"
4367 echo "d2/a2's name entry will be removed, but the d2/a2's object"
4368 echo "and its linkEA are kept in the system."
4370 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4371 do_facet mds1 $LCTL set_param fail_loc=0x1624
4372 do_facet mds2 $LCTL set_param fail_loc=0x1624
4373 rmdir $DIR/$tdir/d1/a1 || error "(1) Fail to rmdir $DIR/$tdir/d1/a1"
4374 rmdir $DIR/$tdir/d2/a2 || error "(2) Fail to rmdir $DIR/$tdir/d2/a2"
4375 do_facet mds1 $LCTL set_param fail_loc=0
4376 do_facet mds2 $LCTL set_param fail_loc=0
4378 cancel_lru_locks mdc
4379 cancel_lru_locks osc
4381 echo "Inject failure, to simulate the MDT0 fail to handle"
4382 echo "MDT1 LFSCK request during the first-stage scanning."
4383 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
4384 do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0
4386 echo "Trigger namespace LFSCK on all devices to find out orphan object"
4387 $START_NAMESPACE -r -A ||
4388 error "(3) Fail to start LFSCK for namespace"
# First run: the two MDSes are expected to finish in different states.
4390 wait_update_facet mds1 "$LCTL get_param -n \
4391 mdd.$(facet_svc mds1).lfsck_namespace |
4392 awk '/^status/ { print \\\$2 }'" "partial" 32 || {
4393 error "(4) mds1 is not the expected 'partial'"
4396 wait_update_facet mds2 "$LCTL get_param -n \
4397 mdd.$(facet_svc mds2).lfsck_namespace |
4398 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4399 error "(5) mds2 is not the expected 'completed'"
# Remove the injection before inspecting the counters and re-running.
4402 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
4404 local repaired=$(do_facet mds1 $LCTL get_param -n \
4405 mdd.$(facet_svc mds1).lfsck_namespace |
4406 awk '/^lost_dirent_repaired/ { print $2 }')
4407 [ $repaired -eq 0 ] ||
4408 error "(6) Expect 0 fixed on mds1, but got: $repaired"
4410 repaired=$(do_facet mds2 $LCTL get_param -n \
4411 mdd.$(facet_svc mds2).lfsck_namespace |
4412 awk '/^lost_dirent_repaired/ { print $2 }')
4413 [ $repaired -eq 1 ] ||
4414 error "(7) Expect 1 fixed on mds2, but got: $repaired"
4416 echo "Trigger namespace LFSCK on all devices again to cleanup"
4417 $START_NAMESPACE -r -A ||
4418 error "(8) Fail to start LFSCK for namespace"
# The trailing number appears to be the step tag used by the helper when the
# wait fails (the local error numbering skips it) -- confirm in the helper.
4420 wait_all_targets_blocked namespace completed 9
# Second run: the expected per-MDS counts are swapped relative to the first.
4422 local repaired=$(do_facet mds1 $LCTL get_param -n \
4423 mdd.$(facet_svc mds1).lfsck_namespace |
4424 awk '/^lost_dirent_repaired/ { print $2 }')
4425 [ $repaired -eq 1 ] ||
4426 error "(10) Expect 1 fixed on mds1, but got: $repaired"
4428 repaired=$(do_facet mds2 $LCTL get_param -n \
4429 mdd.$(facet_svc mds2).lfsck_namespace |
4430 awk '/^lost_dirent_repaired/ { print $2 }')
4431 [ $repaired -eq 0 ] ||
4432 error "(11) Expect 0 fixed on mds2, but got: $repaired"
4434 run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
# Test 29a body. Its run_test registration at the end of this block is
# commented out, so these steps are not registered from here.
# Scenario: a hard link to foo is created while fail_loc 0x1625
# (OBD_FAIL_LFSCK_MORE_NLINK) is armed; the test expects stat to show a link
# count of 3 afterwards, namespace LFSCK to report nlinks_repaired == 1, and
# the link count to be 2 after repair.
# NOTE(review): the function header and block terminators are not visible
# in this listing -- confirm structure against the full file.
4438 echo "The object's nlink attribute is larger than the object's known"
4439 echo "name entries count. The LFSCK will repair the object's nlink"
4440 echo "attribute to match the known name entries count"
4443 check_mount_and_prep
4445 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4446 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4448 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4449 echo "nlink attribute is larger than its name entries count."
4451 #define OBD_FAIL_LFSCK_MORE_NLINK 0x1625
4452 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
4453 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4454 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4455 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Drop cached metadata locks before reading the link count with stat.
4457 cancel_lru_locks mdc
4458 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
4459 [ $count -eq 3 ] || error "(4) Cannot inject error: $count"
4461 echo "Trigger namespace LFSCK to repair the nlink count"
4462 $START_NAMESPACE -r -A ||
4463 error "(5) Fail to start LFSCK for namespace"
# The trailing number appears to be the step tag used by the helper when the
# wait fails (the local error numbering skips it) -- confirm in the helper.
4465 wait_all_targets_blocked namespace completed 6
4467 local repaired=$($SHOW_NAMESPACE |
4468 awk '/^nlinks_repaired/ { print $2 }')
4469 [ $repaired -eq 1 ] ||
4470 error "(7) Fail to repair nlink count: $repaired"
4472 cancel_lru_locks mdc
4473 count=$(stat --format=%h $DIR/$tdir/d0/foo)
4474 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
4476 # Disable 29a, we only allow nlink to be updated if the known linkEA
4477 # entries is larger than nlink count.
4479 #run_test 29a "LFSCK can repair bad nlink count (1)"
# Test 29b body (registered by the run_test line at the end of this block).
# A hard link to foo is created while fail_loc 0x1626
# (OBD_FAIL_LFSCK_LESS_NLINK) is armed; the test expects stat to show a link
# count of 1 afterwards, namespace LFSCK to report nlinks_repaired == 1, and
# the link count to be 2 after repair.
# NOTE(review): the function header and block terminators are not visible
# in this listing -- confirm structure against the full file.
4483 echo "The object's nlink attribute is smaller than the object's known"
4484 echo "name entries count. The LFSCK will repair the object's nlink"
4485 echo "attribute to match the known name entries count"
4488 check_mount_and_prep
4490 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4491 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4493 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4494 echo "nlink attribute is smaller than its name entries count."
4496 #define OBD_FAIL_LFSCK_LESS_NLINK 0x1626
4497 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
4498 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4499 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4500 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Drop cached metadata locks before reading the link count with stat.
4502 cancel_lru_locks mdc
4503 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
4504 [ $count -eq 1 ] || error "(4) Cannot inject error: $count"
4506 echo "Trigger namespace LFSCK to repair the nlink count"
4507 $START_NAMESPACE -r -A ||
4508 error "(5) Fail to start LFSCK for namespace"
# The trailing number appears to be the step tag used by the helper when the
# wait fails (the local error numbering skips it) -- confirm in the helper.
4510 wait_all_targets_blocked namespace completed 6
4512 local repaired=$($SHOW_NAMESPACE |
4513 awk '/^nlinks_repaired/ { print $2 }')
4514 [ $repaired -eq 1 ] ||
4515 error "(7) Fail to repair nlink count: $repaired"
4517 cancel_lru_locks mdc
4518 count=$(stat --format=%h $DIR/$tdir/d0/foo)
4519 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
4521 run_test 29b "LFSCK can repair bad nlink count (2)"
# Test 29c body (registered by the run_test line at the end of this block).
# 150 hard links to guard/f0 are created under foo/, then 100 of them are
# removed. With two or more MDTs, "lfs migrate -m 1" on guard is expected to
# fail (f0's FID unchanged) both after creating the links and after removing
# some. Namespace LFSCK is then expected to report
# linkea_overflow_cleared == 1 and nlinks_repaired == 0, after which the
# migration must succeed and change f0's FID.
# NOTE(review): the function header and block terminators are not visible
# in this listing -- confirm structure against the full file.
4526 echo "The namespace LFSCK will create many hard links to the target"
4527 echo "file as to exceed the linkEA size limitation. Under such case"
4528 echo "the linkEA will be marked as overflow that will prevent the"
4529 echo "target file to be migrated. Then remove some hard links to"
4530 echo "make the left hard links to be held within the linkEA size"
4531 echo "limitation. But before the namespace LFSCK adding all the"
4532 echo "missed linkEA entries back, the overflow mark (timestamp)"
4533 echo "will not be cleared."
4536 check_mount_and_prep
4538 mkdir -p $DIR/$tdir/guard || error "(0.1) Fail to mkdir"
# foo is created with MDT index MDSCOUNT - 1.
4539 $LFS mkdir -i $((MDSCOUNT - 1)) $DIR/$tdir/foo ||
4540 error "(0.2) Fail to mkdir"
4541 touch $DIR/$tdir/guard/f0 || error "(1) Fail to create"
# FID before any migration attempt; used to detect whether f0 was migrated.
4542 local oldfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4544 # define MAX_LINKEA_SIZE 4096
4545 # sizeof(link_ea_header) = 24
4546 # sizeof(link_ea_entry) = 18
4547 # nlink_min=$(((MAX_LINKEA_SIZE - sizeof(link_ea_header)) /
4548 # (sizeof(link_ea_entry) + name_length))
4549 # If the average name length is 12 bytes, then 150 hard links
4550 # is totally enough to overflow the linkEA
4551 echo "Create 150 hard links should succeed although the linkEA overflow"
4552 createmany -l $DIR/$tdir/guard/f0 $DIR/$tdir/foo/ttttttttttt 150 ||
4553 error "(2) Fail to hard link"
4555 cancel_lru_locks mdc
# Migration attempts are only made when at least two MDTs exist.
4556 if [ $MDSCOUNT -ge 2 ]; then
4557 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null &&
4558 error "(3.1) Migrate should fail"
4560 echo "The object with linkEA overflow should NOT be migrated"
4561 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4562 [ "$newfid" == "$oldfid" ] ||
4563 error "(3.2) Migrate should fail: $newfid != $oldfid"
4566 # Remove 100 hard links, then the linkEA should have space
4567 # to hold the missed linkEA entries.
4568 echo "Remove 100 hard links to save space for the missed linkEA entries"
4569 unlinkmany $DIR/$tdir/foo/ttttttttttt 100 || error "(4) Fail to unlink"
4571 if [ $MDSCOUNT -ge 2 ]; then
4572 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null &&
4573 error "(5.1) Migrate should fail"
4575 # The overflow timestamp is still there, so migration will fail.
4576 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4577 [ "$newfid" == "$oldfid" ] ||
4578 error "(5.2) Migrate should fail: $newfid != $oldfid"
# NOTE(review): the sleep this comment describes is not visible in this
# listing -- confirm it is present in the full file.
4581 # sleep 3 seconds to guarantee that the overflow is recognized
4584 echo "Trigger namespace LFSCK to clear the overflow timestamp"
4585 $START_NAMESPACE -r -A ||
4586 error "(6) Fail to start LFSCK for namespace"
# The trailing number appears to be the step tag used by the helper when the
# wait fails (the local error numbering skips it) -- confirm in the helper.
4588 wait_all_targets_blocked namespace completed 7
4590 local repaired=$($SHOW_NAMESPACE |
4591 awk '/^linkea_overflow_cleared/ { print $2 }')
4592 [ $repaired -eq 1 ] ||
4593 error "(8) Fail to clear linkea overflow: $repaired"
# No nlink repair is expected in this scenario.
4595 repaired=$($SHOW_NAMESPACE |
4596 awk '/^nlinks_repaired/ { print $2 }')
4597 [ $repaired -eq 0 ] ||
4598 error "(9) Unexpected nlink repaired: $repaired"
4600 if [ $MDSCOUNT -ge 2 ]; then
4601 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
4602 error "(10.1) Migrate failure"
4604 # Migration should succeed after clear the overflow timestamp.
4605 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4606 [ "$newfid" != "$oldfid" ] ||
4607 error "(10.2) Migrate should succeed"
4609 ls -l $DIR/$tdir/foo > /dev/null ||
4610 error "(11) 'ls' failed after migration"
4613 rm -f $DIR/$tdir/guard/f0 || error "(12) Fail to unlink f0"
4614 rm -rf $DIR/$tdir/foo || error "(13) Fail to rmdir foo"
4616 run_test 29c "verify linkEA size limitation"
# Test 30 body (registered by the run_test line at the end of this block).
# Runs only when the MDS backend is ldiskfs. Directory foo/d0 is created
# while fail_loc 0x161d (OBD_FAIL_LFSCK_NO_LINKEA) is armed; then d0/d1,
# d0/f1, d0 and foo/f0 are removed while fail_loc 0x1624
# (OBD_FAIL_LFSCK_NO_NAMEENTRY) is armed. The client is unmounted, the MDS
# is stopped, e2fsck -y is run on the MDT device and the MDS is restarted.
# Namespace LFSCK is then expected to report local_lost_found_moved >= 4,
# foo/f0 must be reachable again, and d0 (with d1 and f1 inside) must appear
# under .lustre/lost+found/MDT0000/ as "<cfid>-<pfid>-D-0".
# NOTE(review): the function header and block terminators are not visible
# in this listing -- confirm structure against the full file.
4619 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
4620 skip "ldiskfs only test" && return
4621 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
4624 echo "The namespace LFSCK will move the orphans from backend"
4625 echo "/lost+found directory to normal client visible namespace"
4626 echo "or to global visible ./lustre/lost+found/MDTxxxx/ directory"
4629 check_mount_and_prep
4631 $LFS mkdir -i 0 $DIR/$tdir/foo || error "(1) Fail to mkdir foo"
# The file touched here is f0, so the failure message names f0.
4632 touch $DIR/$tdir/foo/f0 || error "(2) Fail to touch f0"
4634 echo "Inject failure stub on MDT0 to simulate the case that"
4635 echo "directory d0 has no linkEA entry, then the LFSCK will"
4636 echo "move it into .lustre/lost+found/MDTxxxx/ later."
4638 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
4639 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
4640 mkdir $DIR/$tdir/foo/d0 || error "(3) Fail to mkdir d0"
4641 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Parent and child FIDs, used to build the expected lost+found name below.
4643 local pfid=$($LFS path2fid $DIR/$tdir/foo)
4644 local cfid=$($LFS path2fid $DIR/$tdir/foo/d0)
4646 touch $DIR/$tdir/foo/d0/f1 || error "(4) Fail to touch f1"
4647 mkdir $DIR/$tdir/foo/d0/d1 || error "(5) Fail to mkdir d1"
4649 echo "Inject failure stub on MDT0 to simulate the case that the"
4650 echo "object's name entry will be removed, but not destroy the"
4651 echo "object. Then backend e2fsck will handle it as orphan and"
4652 echo "add them into the backend /lost+found directory."
4654 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4655 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4656 rmdir $DIR/$tdir/foo/d0/d1 || error "(6) Fail to rmdir d1"
4657 rm -f $DIR/$tdir/foo/d0/f1 || error "(7) Fail to unlink f1"
4658 rmdir $DIR/$tdir/foo/d0 || error "(8) Fail to rmdir d0"
4659 rm -f $DIR/$tdir/foo/f0 || error "(9) Fail to unlink f0"
4660 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Take the client and the MDS down so that e2fsck can run on the MDT device.
4662 umount_client $MOUNT || error "(10) Fail to stop client!"
4664 stop $SINGLEMDS || error "(11) Fail to stop MDT0"
4667 run_e2fsck $(facet_host $SINGLEMDS) $MDT_DEVNAME "-y" ||
4668 error "(12) Fail to run e2fsck"
4670 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
4671 error "(13) Fail to start MDT0"
4673 echo "Trigger namespace LFSCK to recover backend orphans"
4674 $START_NAMESPACE -r -A ||
4675 error "(14) Fail to start LFSCK for namespace"
# The trailing number appears to be the step tag used by the helper when the
# wait fails (the local error numbering skips it) -- confirm in the helper.
4677 wait_all_targets_blocked namespace completed 15
4679 local repaired=$($SHOW_NAMESPACE |
4680 awk '/^local_lost_found_moved/ { print $2 }')
4681 [ $repaired -ge 4 ] ||
4682 error "(16) Fail to recover backend orphans: $repaired"
4684 mount_client $MOUNT || error "(17) Fail to start client!"
4686 stat $DIR/$tdir/foo/f0 || error "(18) f0 is not recovered"
4688 ls -ail $MOUNT/.lustre/lost+found/
4690 echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
4691 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4692 error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4694 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# cname is a constructed path and is never empty, so step (20) checks that
# the directory actually exists. The expansions are quoted because the name
# is built from lfs path2fid output, which may contain shell glob
# characters.
4696 local cname="$MOUNT/.lustre/lost+found/MDT0000/${cfid}-${pfid}-D-0"
4697 [ -d "$cname" ] || error "(20) d0 is not recovered"
4699 stat "${cname}/d1" || error "(21) d1 is not recovered"
4700 stat "${cname}/f1" || error "(22) f1 is not recovered"
4702 run_test 30 "LFSCK can recover the orphans from backend /lost+found"
# Test 31a body (registered by the run_test line at the end of this block).
# Requires at least two MDTs. A striped directory is created with
# "-i 0 -c $MDSCOUNT"; while fail_loc 0x1628 (OBD_FAIL_LFSCK_BAD_NAME_HASH)
# with fail_val=0 is set via a local lctl call, $MDSCOUNT subdirectories are
# created in it. Namespace LFSCK is expected to report
# name_hash_repaired >= 1; after a client remount every subdirectory must be
# stat-able and removable, and finally the striped directory itself.
# NOTE(review): the function header and block terminators are not visible
# in this listing -- confirm structure against the full file.
4705 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4708 echo "For the name entry under a striped directory, if the name"
4709 echo "hash does not match the shard, then the LFSCK will repair"
4710 echo "the bad name entry"
4713 check_mount_and_prep
4715 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4716 error "(1) Fail to create striped directory"
4718 echo "Inject failure stub on client to simulate the case that"
4719 echo "some name entry should be inserted into other non-first"
4720 echo "shard, but inserted into the first shard by wrong"
4722 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
# Set with a plain $LCTL call (no do_facet), unlike the MDS-side injections
# used by the other tests in this file.
4723 $LCTL set_param fail_loc=0x1628 fail_val=0
4724 createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
4725 error "(2) Fail to create file under striped directory"
4726 $LCTL set_param fail_loc=0 fail_val=0
4728 echo "Trigger namespace LFSCK to repair bad name hash"
4729 $START_NAMESPACE -r -A ||
4730 error "(3) Fail to start LFSCK for namespace"
# The trailing number appears to be the step tag used by the helper when the
# wait fails (the local error numbering skips it) -- confirm in the helper.
4732 wait_all_targets_blocked namespace completed 4
4734 local repaired=$($SHOW_NAMESPACE |
4735 awk '/^name_hash_repaired/ { print $2 }')
4736 [ $repaired -ge 1 ] ||
4737 error "(5) Fail to repair bad name hash: $repaired"
# Remount the client before verifying the repaired entries.
4739 umount_client $MOUNT || error "(6) umount failed"
4740 mount_client $MOUNT || error "(7) mount failed"
# Every subdirectory d0..d(MDSCOUNT-1) must be reachable and removable.
4742 for ((i = 0; i < $MDSCOUNT; i++)); do
4743 stat $DIR/$tdir/striped_dir/d$i ||
4744 error "(8) Fail to stat d$i after LFSCK"
4745 rmdir $DIR/$tdir/striped_dir/d$i ||
4746 error "(9) Fail to unlink d$i after LFSCK"
4749 rmdir $DIR/$tdir/striped_dir ||
4750 error "(10) Fail to remove the striped directory after LFSCK"
4752 run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
# Test 31b: same scenario as the "(1)" variant of this test, but the fail_loc
# is armed with fail_val=1 and the repair counter is read from mds2.
#
# Flow: skip unless there are at least 2 MDTs; create a directory striped
# over $MDSCOUNT MDTs; create $MDSCOUNT sub-directories while fail_loc
# 0x1628 (fail_val=1) is armed locally; run the namespace LFSCK; require
# name_hash_repaired >= 1 in mds2's lfsck_namespace parameter; remount the
# client and verify that every d$i can still be stat'ed and removed.
4755 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4758 echo "For the name entry under a striped directory, if the name"
4759 echo "hash does not match the shard, then the LFSCK will repair"
4760 echo "the bad name entry"
4763 check_mount_and_prep
4765 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4766 error "(1) Fail to create striped directory"
4768 echo "Inject failure stub on client to simulate the case that"
4769 echo "some name entry should be inserted into other non-second"
4770 echo "shard, but inserted into the secod shard by wrong"
# The fail_loc is armed only around createmany and cleared right after it.
4772 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
4773 $LCTL set_param fail_loc=0x1628 fail_val=1
4774 createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
4775 error "(2) Fail to create file under striped directory"
4776 $LCTL set_param fail_loc=0 fail_val=0
4778 echo "Trigger namespace LFSCK to repair bad name hash"
4779 $START_NAMESPACE -r -A ||
4780 error "(3) Fail to start LFSCK for namespace"
# NOTE(review): the trailing 4 is presumably the step number reported by
# wait_all_targets_blocked on failure (no error "(4)" exists here) - confirm.
4782 wait_all_targets_blocked namespace completed 4
# Unlike the first-shard variant, the counter is queried on mds2 directly.
4784 local repaired=$(do_facet mds2 $LCTL get_param -n \
4785 mdd.$(facet_svc mds2).lfsck_namespace |
4786 awk '/^name_hash_repaired/ { print $2 }')
4787 [ $repaired -ge 1 ] ||
4788 error "(5) Fail to repair bad name hash: $repaired"
# Remount the client before re-checking the entries.
4790 umount_client $MOUNT || error "(6) umount failed"
4791 mount_client $MOUNT || error "(7) mount failed"
4793 for ((i = 0; i < $MDSCOUNT; i++)); do
4794 stat $DIR/$tdir/striped_dir/d$i ||
4795 error "(8) Fail to stat d$i after LFSCK"
4796 rmdir $DIR/$tdir/striped_dir/d$i ||
4797 error "(9) Fail to unlink d$i after LFSCK"
4800 rmdir $DIR/$tdir/striped_dir ||
4801 error "(10) Fail to remove the striped directory after LFSCK"
4803 run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
# Test 31c: the master MDT-object of a striped directory lost its master
# LMV EA and nothing was created under it afterwards; the namespace LFSCK
# must re-generate the EA.
#
# Flow: skip unless there are at least 2 MDTs; create the striped directory
# while fail_loc 0x1629 is armed on $SINGLEMDS; run the namespace LFSCK;
# require striped_dirs_repaired == 1; remount the client; require that the
# directory lists as empty and can be removed.
4806 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4809 echo "For some reason, the master MDT-object of the striped directory"
4810 echo "may lost its master LMV EA. If nobody created files under the"
4811 echo "master directly after the master LMV EA lost, then the LFSCK"
4812 echo "should re-generate the master LMV EA."
4815 check_mount_and_prep
4817 echo "Inject failure stub on MDT0 to simulate the case that the"
4818 echo "master MDT-object of the striped directory lost the LMV EA."
# The fail_loc is armed only while the striped directory is being created.
4820 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
4821 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
4822 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4823 error "(1) Fail to create striped directory"
4824 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4826 echo "Trigger namespace LFSCK to re-generate master LMV EA"
4827 $START_NAMESPACE -r -A ||
4828 error "(2) Fail to start LFSCK for namespace"
# NOTE(review): the trailing 3 is presumably the step number reported by
# wait_all_targets_blocked on failure (no error "(3)" exists here) - confirm.
4830 wait_all_targets_blocked namespace completed 3
4832 local repaired=$($SHOW_NAMESPACE |
4833 awk '/^striped_dirs_repaired/ { print $2 }')
4834 [ $repaired -eq 1 ] ||
4835 error "(4) Fail to re-generate master LMV EA: $repaired"
4837 umount_client $MOUNT || error "(5) umount failed"
4838 mount_client $MOUNT || error "(6) mount failed"
# Any output from ls means the directory is not seen as empty after repair.
4840 local empty=$(ls $DIR/$tdir/striped_dir/)
4841 [ -z "$empty" ] || error "(7) The master LMV EA is not repaired: $empty"
4843 rmdir $DIR/$tdir/striped_dir ||
4844 error "(8) Fail to remove the striped directory after LFSCK"
4846 run_test 31c "Re-generate the lost master LMV EA for striped directory"
# Test 31d: the master MDT-object of a striped directory lost its master
# LMV EA and a file WAS created under it afterwards; the namespace LFSCK
# must not re-generate the EA and the directory must become read-only.
#
# Flow: skip unless there are at least 2 MDTs; create the striped directory
# while fail_loc 0x1629 is armed on $SINGLEMDS; remount the client; touch
# "dummy" under the broken directory; run the namespace LFSCK; require
# striped_dirs_repaired == 0; require that "dummy" is still stat-able and
# that creating "foo" fails; then chattr -i and remove the directory.
4849 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4852 echo "For some reason, the master MDT-object of the striped directory"
4853 echo "may lost its master LMV EA. If somebody created files under the"
4854 echo "master directly after the master LMV EA lost, then the LFSCK"
4855 echo "should NOT re-generate the master LMV EA, instead, it should"
4856 echo "change the broken striped dirctory as read-only to prevent"
4857 echo "further damage"
4860 check_mount_and_prep
4862 echo "Inject failure stub on MDT0 to simulate the case that the"
4863 echo "master MDT-object of the striped directory lost the LMV EA."
# The fail_loc is armed only while the striped directory is being created.
4865 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
4866 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
4867 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4868 error "(1) Fail to create striped directory"
4869 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0
4871 umount_client $MOUNT || error "(2) umount failed"
4872 mount_client $MOUNT || error "(3) mount failed"
# Modify the broken directory before the LFSCK runs.
4874 touch $DIR/$tdir/striped_dir/dummy ||
4875 error "(4) Fail to touch under broken striped directory"
4877 echo "Trigger namespace LFSCK to find out the inconsistency"
4878 $START_NAMESPACE -r -A ||
4879 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): the trailing 6 is presumably the step number reported by
# wait_all_targets_blocked on failure (no error "(6)" exists here) - confirm.
4881 wait_all_targets_blocked namespace completed 6
4883 local repaired=$($SHOW_NAMESPACE |
4884 awk '/^striped_dirs_repaired/ { print $2 }')
4885 [ $repaired -eq 0 ] ||
4886 error "(7) Re-generate master LMV EA unexpected: $repaired"
4888 stat $DIR/$tdir/striped_dir/dummy ||
4889 error "(8) Fail to stat $DIR/$tdir/striped_dir/dummy"
# Inverted check: a successful touch is the failure case here.
4891 touch $DIR/$tdir/striped_dir/foo &&
4892 error "(9) The broken striped directory should be read-only"
# Clear the immutable attribute so the directory can be removed.
4894 chattr -i $DIR/$tdir/striped_dir ||
4895 error "(10) Fail to chattr on the broken striped directory"
4897 rmdir $DIR/$tdir/striped_dir ||
4898 error "(11) Fail to remove the striped directory after LFSCK"
4900 run_test 31d "Set broken striped directory (modified after broken) as read-only"
# Test 31e: a slave MDT-object of a striped directory lost its slave LMV EA
# (fail_val=0 variant); the namespace LFSCK must re-generate it.
#
# Flow: skip unless there are at least 2 MDTs; create the striped directory
# while fail_loc 0x162a (fail_val=0) is armed on $SINGLEMDS; run the
# namespace LFSCK; require striped_shards_repaired == 1 in the
# $SHOW_NAMESPACE output; remove the directory.
4903 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4906 echo "For some reason, the slave MDT-object of the striped directory"
4907 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
4908 echo "slave LMV EA."
4911 check_mount_and_prep
4913 echo "Inject failure stub on MDT0 to simulate the case that the"
4914 echo "slave MDT-object (that resides on the same MDT as the master"
4915 echo "MDT-object resides on) lost the LMV EA."
# The fail_loc is armed only while the striped directory is being created.
4917 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
4918 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=0
4919 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4920 error "(1) Fail to create striped directory"
4921 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4923 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
4924 $START_NAMESPACE -r -A ||
4925 error "(2) Fail to start LFSCK for namespace"
# NOTE(review): the trailing 3 is presumably the step number reported by
# wait_all_targets_blocked on failure (no error "(3)" exists here) - confirm.
4927 wait_all_targets_blocked namespace completed 3
4929 local repaired=$($SHOW_NAMESPACE |
4930 awk '/^striped_shards_repaired/ { print $2 }')
4931 [ $repaired -eq 1 ] ||
4932 error "(4) Fail to re-generate slave LMV EA: $repaired"
4934 rmdir $DIR/$tdir/striped_dir ||
4935 error "(5) Fail to remove the striped directory after LFSCK"
4937 run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
# Test 31f: a slave MDT-object of a striped directory lost its slave LMV EA
# (fail_val=1 variant); the namespace LFSCK must re-generate it.
#
# Flow: skip unless there are at least 2 MDTs; create the striped directory
# while fail_loc 0x162a (fail_val=1) is armed on $SINGLEMDS; run the
# namespace LFSCK; require striped_shards_repaired == 1 in mds2's
# lfsck_namespace parameter; remove the directory.
4940 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4943 echo "For some reason, the slave MDT-object of the striped directory"
4944 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
4945 echo "slave LMV EA."
4948 check_mount_and_prep
4950 echo "Inject failure stub on MDT0 to simulate the case that the"
4951 echo "slave MDT-object (that resides on different MDT as the master"
4952 echo "MDT-object resides on) lost the LMV EA."
# The fail_loc is armed only while the striped directory is being created.
4954 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
4955 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=1
4956 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4957 error "(1) Fail to create striped directory"
4958 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4960 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
4961 $START_NAMESPACE -r -A ||
4962 error "(2) Fail to start LFSCK for namespace"
# NOTE(review): the trailing 3 is presumably the step number reported by
# wait_all_targets_blocked on failure (no error "(3)" exists here) - confirm.
4964 wait_all_targets_blocked namespace completed 3
# The repair counter is queried on mds2 directly.
4966 local repaired=$(do_facet mds2 $LCTL get_param -n \
4967 mdd.$(facet_svc mds2).lfsck_namespace |
4968 awk '/^striped_shards_repaired/ { print $2 }')
4969 [ $repaired -eq 1 ] ||
4970 error "(4) Fail to re-generate slave LMV EA: $repaired"
4972 rmdir $DIR/$tdir/striped_dir ||
4973 error "(5) Fail to remove the striped directory after LFSCK"
4975 run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
# Test 31g: the stripe index stored in a slave LMV EA is corrupted; the
# namespace LFSCK must repair the slave LMV EA.
#
# Flow: skip unless there are at least 2 MDTs; create the striped directory
# while fail_loc 0x162b (fail_val=0) is armed on $SINGLEMDS; run the
# namespace LFSCK; require striped_shards_repaired == 1; remount the client;
# verify a file can be created and removed under the directory, then remove
# the directory itself.
4978 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4981 echo "For some reason, the stripe index in the slave LMV EA is"
4982 echo "corrupted. The LFSCK should repair the slave LMV EA."
4985 check_mount_and_prep
4987 echo "Inject failure stub on MDT0 to simulate the case that the"
4988 echo "slave LMV EA on the first shard of the striped directory"
4989 echo "claims the same index as the second shard claims"
# The fail_loc is armed only while the striped directory is being created.
4991 #define OBD_FAIL_LFSCK_BAD_SLAVE_LMV 0x162b
4992 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162b fail_val=0
4993 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4994 error "(1) Fail to create striped directory"
4995 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4997 echo "Trigger namespace LFSCK to repair the slave LMV EA"
4998 $START_NAMESPACE -r -A ||
4999 error "(2) Fail to start LFSCK for namespace"
# NOTE(review): the trailing 3 is presumably the step number reported by
# wait_all_targets_blocked on failure (no error "(3)" exists here) - confirm.
5001 wait_all_targets_blocked namespace completed 3
5003 local repaired=$($SHOW_NAMESPACE |
5004 awk '/^striped_shards_repaired/ { print $2 }')
5005 [ $repaired -eq 1 ] ||
5006 error "(4) Fail to repair slave LMV EA: $repaired"
5008 umount_client $MOUNT || error "(5) umount failed"
5009 mount_client $MOUNT || error "(6) mount failed"
# The repaired directory must accept a create and an unlink.
5011 touch $DIR/$tdir/striped_dir/foo ||
5012 error "(7) Fail to touch file after the LFSCK"
5014 rm -f $DIR/$tdir/striped_dir/foo ||
5015 error "(8) Fail to unlink file after the LFSCK"
5017 rmdir $DIR/$tdir/striped_dir ||
5018 error "(9) Fail to remove the striped directory after LFSCK"
5020 run_test 31g "Repair the corrupted slave LMV EA"
# Test 31h: a shard's name entry inside a striped directory is corrupted;
# the namespace LFSCK must repair the bad name entry.
#
# Flow: skip unless there are at least 2 MDTs; create the striped directory
# while fail_loc 0x162c (fail_val=0) is armed on $SINGLEMDS; run the
# namespace LFSCK; require dirent_repaired == 1; remount the client; verify
# a file can be created and removed under the directory, then remove the
# directory itself.
5023 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5026 echo "For some reason, the shard's name entry in the striped"
5027 echo "directory may be corrupted. The LFSCK should repair the"
5028 echo "bad shard's name entry."
5031 check_mount_and_prep
5033 echo "Inject failure stub on MDT0 to simulate the case that the"
5034 echo "first shard's name entry in the striped directory claims"
5035 echo "the same index as the second shard's name entry claims."
# The fail_loc is armed only while the striped directory is being created.
5037 #define OBD_FAIL_LFSCK_BAD_SLAVE_NAME 0x162c
5038 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162c fail_val=0
5039 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5040 error "(1) Fail to create striped directory"
5041 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5043 echo "Trigger namespace LFSCK to repair the shard's name entry"
5044 $START_NAMESPACE -r -A ||
5045 error "(2) Fail to start LFSCK for namespace"
# NOTE(review): the trailing 3 is presumably the step number reported by
# wait_all_targets_blocked on failure (no error "(3)" exists here) - confirm.
5047 wait_all_targets_blocked namespace completed 3
5049 local repaired=$($SHOW_NAMESPACE |
5050 awk '/^dirent_repaired/ { print $2 }')
5051 [ $repaired -eq 1 ] ||
5052 error "(4) Fail to repair shard's name entry: $repaired"
5054 umount_client $MOUNT || error "(5) umount failed"
5055 mount_client $MOUNT || error "(6) mount failed"
# The repaired directory must accept a create and an unlink.
5057 touch $DIR/$tdir/striped_dir/foo ||
5058 error "(7) Fail to touch file after the LFSCK"
5060 rm -f $DIR/$tdir/striped_dir/foo ||
5061 error "(8) Fail to unlink file after the LFSCK"
5063 rmdir $DIR/$tdir/striped_dir ||
5064 error "(9) Fail to remove the striped directory after LFSCK"
5066 run_test 31h "Repair the corrupted shard's name entry"
# Test 32a: the layout LFSCK can be stopped while ost1 is down.
#
# Flow: unmount the client; arm fail_loc 0x162d (fail_val=3) on $SINGLEMDS;
# start the layout LFSCK and require status "scanning-phase1"; stop ost1;
# clear the fail_loc; stop the LFSCK; start ost1 again.
5071 umount_client $MOUNT
# NOTE(review): judging by the macro name the fail_loc delays the LFSCK
# engine so that it is still scanning when ost1 is stopped - confirm.
5073 #define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d
5074 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
5075 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
5077 local STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
5078 [ "$STATUS" == "scanning-phase1" ] ||
5079 error "(2) Expect 'scanning-phase1', but got '$STATUS'"
5082 stop ost1 > /dev/null || error "(3) Fail to stop OST1!"
5084 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# The actual assertion of this test: stopping must succeed with ost1 down.
5088 $STOP_LFSCK || error "(4) Fail to stop LFSCK!"
# Bring ost1 back for the tests that follow.
5090 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
5091 error "(5) Fail to start ost1"
5093 run_test 32a "stop LFSCK when some OST failed"
# Test 32b: the namespace LFSCK can be stopped while mds2 is down.
#
# Flow: skip unless there are at least 2 MDTs; create a directory on MDT
# index 1 holding two directories striped over $MDSCOUNT MDTs; unmount the
# client; arm fail_loc 0x162d (fail_val=3) on $SINGLEMDS; start the
# namespace LFSCK and wait for status "scanning-phase1"; stop mds2; clear
# the fail_loc; stop the LFSCK; start mds2 again.
5097 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5100 $LFS mkdir -i 1 $DIR/$tdir/dp ||
5101 error "(1) Fail to create $DIR/$tdir/dp"
5102 $LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc1 ||
5103 error "(2) Fail to create $DIR/$tdir/dp/dc1"
5104 $LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc2 ||
5105 error "(3) Fail to create $DIR/$tdir/dp/dc2"
5106 umount_client $MOUNT
5108 #define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d
5109 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
5110 $START_NAMESPACE -r -A || error "(4) Fail to start LFSCK for namespace!"
# NOTE(review): the trailing 32 is presumably a timeout in seconds for
# wait_update_facet - confirm against the helper's definition.
5112 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5113 mdd.${MDT_DEV}.lfsck_namespace |
5114 awk '/^status/ { print \\\$2 }'" "scanning-phase1" 32 || {
5116 error "(5) unexpected status"
5120 stop mds2 > /dev/null || error "(6) Fail to stop MDT2!"
5122 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# The actual assertion of this test: stopping must succeed with mds2 down.
5126 $STOP_LFSCK || error "(7) Fail to stop LFSCK!"
# Bring mds2 back for the tests that follow.
5128 start mds2 $(mdsdevname 2) $MOUNT_OPTS_NOSCRUB > /dev/null ||
5129 error "(8) Fail to start MDT2"
5131 run_test 32b "stop LFSCK when some MDT failed"
# Test 33: the "param" line reported by LFSCK reflects the options it was
# started with.
#
# Flow: start the layout LFSCK with "--dryrun -o -r", wait for completion
# and require param == "dryrun,all_targets,orphan"; then start the namespace
# LFSCK with "-e abort -A -r", wait for completion and require
# param == "failout,all_targets".
5137 $START_LAYOUT --dryrun -o -r ||
5138 error "(1) Fail to start layout LFSCK"
# NOTE(review): the trailing number is presumably the step number reported
# by wait_all_targets_blocked on failure - confirm.
5139 wait_all_targets_blocked layout completed 2
5141 local PARAMS=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
5142 [ "$PARAMS" == "dryrun,all_targets,orphan" ] ||
5143 error "(3) Expect 'dryrun,all_targets,orphan', got '$PARAMS'"
5145 $START_NAMESPACE -e abort -A -r ||
5146 error "(4) Fail to start namespace LFSCK"
5147 wait_all_targets_blocked namespace completed 5
# PARAMS is reused for the namespace result.
5149 PARAMS=$($SHOW_NAMESPACE | awk '/^param/ { print $2 }')
5150 [ "$PARAMS" == "failout,all_targets" ] ||
5151 error "(6) Expect 'failout,all_targets', got '$PARAMS'"
5153 run_test 33 "check LFSCK paramters"
# Test 34: the namespace LFSCK rebuilds a lost agent object (ZFS only).
#
# Flow: skip unless there are at least 2 MDTs and $SINGLEMDS uses a zfs
# backend; create a directory on MDT index 1 while fail_loc 0x1630 is armed
# on $SINGLEMDS; run the namespace LFSCK and require dirent_repaired == 1;
# run it a second time and require dirent_repaired == 0, i.e. nothing is
# left to repair.
5157 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5158 [ $(facet_fstype $SINGLEMDS) != zfs ] &&
5159 skip "Only valid for ZFS backend" && return
5163 #define OBD_FAIL_LFSCK_NO_AGENTOBJ 0x1630
5164 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1630
5165 $LFS mkdir -i 1 $DIR/$tdir/dummy ||
5166 error "(1) Fail to create $DIR/$tdir/dummy"
5168 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# First pass: expected to find and repair exactly one entry.
5169 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
5170 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5171 mdd.${MDT_DEV}.lfsck_namespace |
5172 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5174 error "(3) unexpected status"
5177 local repaired=$($SHOW_NAMESPACE |
5178 awk '/^dirent_repaired/ { print $2 }')
5179 [ $repaired -eq 1 ] ||
5180 error "(4) Fail to repair the lost agent object: $repaired"
# Second pass: the namespace must now be clean.
5182 $START_NAMESPACE -r || error "(5) Fail to start LFSCK for namespace!"
5183 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5184 mdd.${MDT_DEV}.lfsck_namespace |
5185 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5187 error "(6) unexpected status"
5190 repaired=$($SHOW_NAMESPACE | awk '/^dirent_repaired/ { print $2 }')
5191 [ $repaired -eq 0 ] ||
5192 error "(7) Unexpected repairing: $repaired"
5194 run_test 34 "LFSCK can rebuild the lost agent object"
# Test 35: the namespace LFSCK rebuilds a lost agent entry.
#
# Flow: skip unless there are at least 2 MDTs; create a directory on MDT
# index 1 while fail_loc 0x1631 is armed on mds2; run the namespace LFSCK on
# all targets and require agent_entries_repaired == 1 on mds2; set the
# filesystem up again; run the LFSCK a second time and require
# agent_entries_repaired == 0, i.e. nothing is left to repair.
#
# The status is only ever polled on mds2, so failures (3) and (6) name MDS2
# explicitly instead of interpolating ${k}, which this test never assigns.
5198 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5202 #define OBD_FAIL_LFSCK_NO_AGENTENT 0x1631
5203 do_facet mds2 $LCTL set_param fail_loc=0x1631
5204 $LFS mkdir -i 1 $DIR/$tdir/dummy ||
5205 error "(1) Fail to create $DIR/$tdir/dummy"
5208 do_facet mds2 $LCTL set_param fail_loc=0
# First pass: expected to find and repair exactly one agent entry.
5209 $START_NAMESPACE -A -r || error "(2) Fail to start LFSCK for namespace!"
5210 wait_update_facet mds2 "$LCTL get_param -n \
5211 mdd.$(facet_svc mds2).lfsck_namespace |
5212 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
5213 error "(3) MDS2 is not the expected 'completed'"
5215 local repaired=$(do_facet mds2 $LCTL get_param -n \
5216 mdd.$(facet_svc mds2).lfsck_namespace |
5217 awk '/^agent_entries_repaired/ { print $2 }')
5218 [ $repaired -eq 1 ] ||
5219 error "(4) Fail to repair the lost agent entry: $repaired"
5221 echo "stopall to cleanup object cache"
5224 setupall > /dev/null
# Second pass: the namespace must now be clean.
5226 $START_NAMESPACE -A -r || error "(5) Fail to start LFSCK for namespace!"
5227 wait_update_facet mds2 "$LCTL get_param -n \
5228 mdd.$(facet_svc mds2).lfsck_namespace |
5229 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
5230 error "(6) MDS2 is not the expected 'completed'"
5232 repaired=$(do_facet mds2 $LCTL get_param -n \
5233 mdd.$(facet_svc mds2).lfsck_namespace |
5234 awk '/^agent_entries_repaired/ { print $2 }')
5235 [ $repaired -eq 0 ] ||
5236 error "(7) Unexpected repairing: $repaired"
5238 run_test 35 "LFSCK can rebuild the lost agent entry"
# Test 36a: a mirrored file lost one mirror from its LOV EA while the
# related OST-objects survived; the layout LFSCK must rebuild the LOV EA.
#
# Flow: skip unless there are at least 3 OSTs; create f0, f1 and f2 with the
# same three-mirror layout, write 4 MiB to each and resync; while fail_loc
# 0x1616 is armed on mds1, split mirror 1 from f0, mirror 2 from f1 and
# mirror 3 from f2; check each file reports 2 mirrors and no longer shows
# the split mirror id; run the layout LFSCK with "-r -o"; require status
# "completed" on every MDT and OST and repaired_orphan == 9 on mds1; finally
# require 3 mirrors per file and that the split mirror id is back.
5241 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5244 echo "The target MDT-object's LOV EA corrupted as to lose one of the "
5245 echo "mirrors information. The layout LFSCK should rebuild the LOV EA "
5246 echo "with the PFID EA of related OST-object(s) belong to the mirror."
5249 check_mount_and_prep
# All three files get the identical layout: three -N mirrors, each made of
# two components with explicit OST lists.
5251 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5252 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f0 ||
5253 error "(0) Fail to create mirror file $DIR/$tdir/f0"
5254 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5255 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f1 ||
5256 error "(1) Fail to create mirror file $DIR/$tdir/f1"
5257 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5258 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f2 ||
5259 error "(2) Fail to create mirror file $DIR/$tdir/f2"
5261 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5262 error "(3) Fail to write $DIR/$tdir/f0"
5263 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=4 ||
5264 error "(4) Fail to write $DIR/$tdir/f1"
5265 dd if=/dev/zero of=$DIR/$tdir/f2 bs=1M count=4 ||
5266 error "(5) Fail to write $DIR/$tdir/f2"
5268 $LFS mirror resync $DIR/$tdir/f0 ||
5269 error "(6) Fail to resync $DIR/$tdir/f0"
5270 $LFS mirror resync $DIR/$tdir/f1 ||
5271 error "(7) Fail to resync $DIR/$tdir/f1"
5272 $LFS mirror resync $DIR/$tdir/f2 ||
5273 error "(8) Fail to resync $DIR/$tdir/f2"
5275 cancel_lru_locks mdc
5276 cancel_lru_locks osc
# Print the layouts before the injection; only the exit status is checked.
5278 $LFS getstripe $DIR/$tdir/f0 ||
5279 error "(9) Fail to getstripe for $DIR/$tdir/f0"
5280 $LFS getstripe $DIR/$tdir/f1 ||
5281 error "(10) Fail to getstripe for $DIR/$tdir/f1"
5282 $LFS getstripe $DIR/$tdir/f2 ||
5283 error "(11) Fail to getstripe for $DIR/$tdir/f2"
5285 echo "Inject failure, to simulate the case of missing one mirror in LOV"
5286 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5287 do_facet mds1 $LCTL set_param fail_loc=0x1616
# A different mirror id is split from each file while the fail_loc is armed.
5289 $LFS mirror split --mirror-id 1 -d $DIR/$tdir/f0 ||
5290 error "(12) Fail to split 1st mirror from $DIR/$tdir/f0"
5291 $LFS mirror split --mirror-id 2 -d $DIR/$tdir/f1 ||
5292 error "(13) Fail to split 2nd mirror from $DIR/$tdir/f1"
5293 $LFS mirror split --mirror-id 3 -d $DIR/$tdir/f2 ||
5294 error "(14) Fail to split 3rd mirror from $DIR/$tdir/f2"
5298 do_facet mds1 $LCTL set_param fail_loc=0
# Inverted checks: a grep match means the split mirror is still listed.
5300 $LFS getstripe $DIR/$tdir/f0 | grep "lcme_mirror_id:.*1" &&
5301 error "(15) The 1st of mirror is not destroyed"
5302 $LFS getstripe $DIR/$tdir/f1 | grep "lcme_mirror_id:.*2" &&
5303 error "(16) The 2nd of mirror is not destroyed"
5304 $LFS getstripe $DIR/$tdir/f2 | grep "lcme_mirror_id:.*3" &&
5305 error "(17) The 3rd of mirror is not destroyed"
5309 mirrors=$($LFS getstripe -N $DIR/$tdir/f0)
5310 [ $mirrors -eq 2 ] || error "(18) $DIR/$tdir/f0 has $mirrors mirrors"
5311 mirrors=$($LFS getstripe -N $DIR/$tdir/f1)
5312 [ $mirrors -eq 2 ] || error "(19) $DIR/$tdir/f1 has $mirrors mirrors"
5313 mirrors=$($LFS getstripe -N $DIR/$tdir/f2)
5314 [ $mirrors -eq 2 ] || error "(20) $DIR/$tdir/f2 has $mirrors mirrors"
5316 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5317 $START_LAYOUT -r -o || error "(21) Fail to start LFSCK for layout!"
5319 for k in $(seq $MDSCOUNT); do
5320 # The LFSCK status query internal is 30 seconds. For the case
5321 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5322 # time to guarantee the status sync up.
5323 wait_update_facet mds${k} "$LCTL get_param -n \
5324 mdd.$(facet_svc mds${k}).lfsck_layout |
5325 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5326 error "(22) MDS${k} is not the expected 'completed'"
# OSTs are checked once, without polling, after the MDT wait above.
5329 for k in $(seq $OSTCOUNT); do
5330 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5331 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5332 awk '/^status/ { print $2 }')
5333 [ "$cur_status" == "completed" ] ||
5334 error "(23) OST${k} Expect 'completed', but got '$cur_status'"
5337 local repaired=$(do_facet mds1 $LCTL get_param -n \
5338 mdd.$(facet_svc mds1).lfsck_layout |
5339 awk '/^repaired_orphan/ { print $2 }')
5340 [ $repaired -eq 9 ] ||
5341 error "(24) Expect 9 fixed on mds1, but got: $repaired"
# After the repair every file must report its full set of three mirrors.
5343 mirrors=$($LFS getstripe -N $DIR/$tdir/f0)
5344 [ $mirrors -eq 3 ] || error "(25) $DIR/$tdir/f0 has $mirrors mirrors"
5345 mirrors=$($LFS getstripe -N $DIR/$tdir/f1)
5346 [ $mirrors -eq 3 ] || error "(26) $DIR/$tdir/f1 has $mirrors mirrors"
5347 mirrors=$($LFS getstripe -N $DIR/$tdir/f2)
5348 [ $mirrors -eq 3 ] || error "(27) $DIR/$tdir/f2 has $mirrors mirrors"
5350 $LFS getstripe $DIR/$tdir/f0 | grep "lcme_mirror_id:.*1" || {
5351 $LFS getstripe $DIR/$tdir/f0
5352 error "(28) The 1st of mirror is not recovered"
5355 $LFS getstripe $DIR/$tdir/f1 | grep "lcme_mirror_id:.*2" || {
5356 $LFS getstripe $DIR/$tdir/f1
5357 error "(29) The 2nd of mirror is not recovered"
5360 $LFS getstripe $DIR/$tdir/f2 | grep "lcme_mirror_id:.*3" || {
5361 $LFS getstripe $DIR/$tdir/f2
5362 error "(30) The 3rd of mirror is not recovered"
5365 run_test 36a "rebuild LOV EA for mirrored file (1)"
# Test 36b: a mirrored file lost its MDT-object while the related
# OST-objects survived; the layout LFSCK must rebuild the LOV EA.
#
# Flow: skip unless there are at least 3 OSTs; create f0 with a three-mirror
# layout and record its FID; write 4 MiB and resync; remove f0 while
# fail_loc 0x1616 is armed on mds1; run the layout LFSCK with "-r -o";
# require status "completed" on every MDT and OST and repaired_orphan == 9
# on mds1; then inspect .lustre/lost+found/MDT0000/<fid>-R-0 and require
# 3 mirrors, 6 components and mirror ids 1, 2 and 3.
5368 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5371 echo "The mirrored file lost its MDT-object, but relatd OST-objects "
5372 echo "are still there. The layout LFSCK should rebuild the LOV EA "
5373 echo "with the PFID EA of related OST-object(s) belong to the file. "
5376 check_mount_and_prep
5378 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5379 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f0 ||
5380 error "(0) Fail to create mirror file $DIR/$tdir/f0"
# The FID is needed later to locate the recovered file under lost+found.
5382 local fid=$($LFS path2fid $DIR/$tdir/f0)
5384 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5385 error "(1) Fail to write $DIR/$tdir/f0"
5386 $LFS mirror resync $DIR/$tdir/f0 ||
5387 error "(2) Fail to resync $DIR/$tdir/f0"
5389 cancel_lru_locks mdc
5390 cancel_lru_locks osc
5392 $LFS getstripe $DIR/$tdir/f0 ||
5393 error "(3) Fail to getstripe for $DIR/$tdir/f0"
5395 echo "Inject failure, to simulate the case of missing the MDT-object"
5396 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5397 do_facet mds1 $LCTL set_param fail_loc=0x1616
5398 rm -f $DIR/$tdir/f0 || error "(4) Fail to remove $DIR/$tdir/f0"
5402 do_facet mds1 $LCTL set_param fail_loc=0
5404 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5405 $START_LAYOUT -r -o || error "(5) Fail to start LFSCK for layout!"
5407 for k in $(seq $MDSCOUNT); do
5408 # The LFSCK status query internal is 30 seconds. For the case
5409 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5410 # time to guarantee the status sync up.
5411 wait_update_facet mds${k} "$LCTL get_param -n \
5412 mdd.$(facet_svc mds${k}).lfsck_layout |
5413 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5414 error "(6) MDS${k} is not the expected 'completed'"
# OSTs are checked once, without polling, after the MDT wait above.
5417 for k in $(seq $OSTCOUNT); do
5418 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5419 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5420 awk '/^status/ { print $2 }')
5421 [ "$cur_status" == "completed" ] ||
5422 error "(7) OST${k} Expect 'completed', but got '$cur_status'"
5425 local count=$(do_facet mds1 $LCTL get_param -n \
5426 mdd.$(facet_svc mds1).lfsck_layout |
5427 awk '/^repaired_orphan/ { print $2 }')
5428 [ $count -eq 9 ] || error "(8) Expect 9 fixed on mds1, but got: $count"
# The file is looked up under lost+found by the FID saved before removal;
# count is reused for the mirror and component counts.
5430 local name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
5431 count=$($LFS getstripe --mirror-count $name)
5432 [ $count -eq 3 ] || error "(9) $DIR/$tdir/f0 has $count mirrors"
5434 count=$($LFS getstripe --component-count $name)
5435 [ $count -eq 6 ] || error "(10) $DIR/$tdir/f0 has $count components"
5437 $LFS getstripe $name | grep "lcme_mirror_id:.*1" || {
5438 $LFS getstripe $name
5439 error "(11) The 1st of mirror is not recovered"
5442 $LFS getstripe $name | grep "lcme_mirror_id:.*2" || {
5443 $LFS getstripe $name
5444 error "(12) The 2nd of mirror is not recovered"
5447 $LFS getstripe $name | grep "lcme_mirror_id:.*3" || {
5448 $LFS getstripe $name
5449 error "(13) The 3rd of mirror is not recovered"
5452 run_test 36b "rebuild LOV EA for mirrored file (2)"
# Test 36c: a mirrored file was modified but not resynced, then lost its
# MDT-object while the related OST-objects survived; the layout LFSCK must
# rebuild the LOV EA together with the per-component flags.
#
# Flow: skip unless there are at least 3 OSTs; create mirrored file f0 and
# record its FID; write 4 MiB and resync, then write again without a resync;
# save the lcme_flags seen in the first and last 10 lines of the getstripe
# output; remove f0 while fail_loc 0x1616 is armed on mds1; run the layout
# LFSCK with "-r -o"; require status "completed" on every MDT and OST and
# repaired_orphan == 6 on mds1; then inspect
# .lustre/lost+found/MDT0000/<fid>-R-0 and require 2 mirrors, 4 components
# and the same two sets of flags as saved before the removal.
#
# Failure message (7) states the value that is actually tested (6).
5455 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5458 echo "The mirrored file has been modified, not resynced yet, then "
5459 echo "lost its MDT-object, but relatd OST-objects are still there. "
5460 echo "The layout LFSCK should rebuild the LOV EA and relatd status "
5461 echo "with the PFID EA of related OST-object(s) belong to the file. "
5464 check_mount_and_prep
5466 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5468 error "(0) Fail to create mirror file $DIR/$tdir/f0"
# The FID is needed later to locate the recovered file under lost+found.
5470 local fid=$($LFS path2fid $DIR/$tdir/f0)
5472 # The 1st dd && resync makes all related OST-objects have been written
5473 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5474 error "(1.1) Fail to write $DIR/$tdir/f0"
5475 $LFS mirror resync $DIR/$tdir/f0 ||
5476 error "(1.2) Fail to resync $DIR/$tdir/f0"
5477 # The 2nd dd makes one mirror to be stale
5478 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5479 error "(1.3) Fail to write $DIR/$tdir/f0"
5481 cancel_lru_locks mdc
5482 cancel_lru_locks osc
5484 $LFS getstripe $DIR/$tdir/f0 ||
5485 error "(2) Fail to getstripe for $DIR/$tdir/f0"
# Remember the flags from the head and the tail of the layout dump so they
# can be compared with the rebuilt layout.
5487 local saved_flags1=$($LFS getstripe $DIR/$tdir/f0 | head -n 10 |
5488 awk '/lcme_flags/ { print $2 }')
5489 local saved_flags2=$($LFS getstripe $DIR/$tdir/f0 | tail -n 10 |
5490 awk '/lcme_flags/ { print $2 }')
5492 echo "Inject failure, to simulate the case of missing the MDT-object"
5493 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
5494 do_facet mds1 $LCTL set_param fail_loc=0x1616
5495 rm -f $DIR/$tdir/f0 || error "(3) Fail to remove $DIR/$tdir/f0"
5499 do_facet mds1 $LCTL set_param fail_loc=0
5501 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5502 $START_LAYOUT -r -o || error "(4) Fail to start LFSCK for layout!"
5504 for k in $(seq $MDSCOUNT); do
5505 # The LFSCK status query interval is 30 seconds. For the case
5506 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5507 # time to guarantee the status sync up.
5508 wait_update_facet mds${k} "$LCTL get_param -n \
5509 mdd.$(facet_svc mds${k}).lfsck_layout |
5510 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5511 error "(5) MDS${k} is not the expected 'completed'"
# OSTs are checked once, without polling, after the MDT wait above.
5514 for k in $(seq $OSTCOUNT); do
5515 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5516 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5517 awk '/^status/ { print $2 }')
5518 [ "$cur_status" == "completed" ] ||
5519 error "(6) OST${k} Expect 'completed', but got '$cur_status'"
5522 local count=$(do_facet mds1 $LCTL get_param -n \
5523 mdd.$(facet_svc mds1).lfsck_layout |
5524 awk '/^repaired_orphan/ { print $2 }')
5525 [ $count -eq 6 ] || error "(7) Expect 6 fixed on mds1, but got: $count"
# The file is looked up under lost+found by the FID saved before removal;
# count is reused for the mirror and component counts.
5527 local name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
5528 count=$($LFS getstripe --mirror-count $name)
5529 [ $count -eq 2 ] || error "(8) $DIR/$tdir/f0 has $count mirrors"
5531 count=$($LFS getstripe --component-count $name)
5532 [ $count -eq 4 ] || error "(9) $DIR/$tdir/f0 has $count components"
5534 local flags=$($LFS getstripe $name | head -n 10 |
5535 awk '/lcme_flags/ { print $2 }')
5536 [ "$flags" == "$saved_flags1" ] || {
5537 $LFS getstripe $name
5538 error "(10) expect flags $saved_flags1, got $flags"
5541 flags=$($LFS getstripe $name | tail -n 10 |
5542 awk '/lcme_flags/ { print $2 }')
5543 [ "$flags" == "$saved_flags2" ] || {
5544 $LFS getstripe $name
5545 error "(11) expect flags $saved_flags2, got $flags"
5548 run_test 36c "rebuild LOV EA for mirrored file (3)"
# Test 37: the namespace LFSCK must complete on all targets while a
# directory is being held by a paused background multiop.
#
# Flow: create $DIR/$tdir/d0 on MDT index 0; start multiop_bg_pause on it
# with the "D_c" command string; start the namespace LFSCK; wait until all
# targets report "completed".
5554 local t_dir="$DIR/$tdir/d0"
5555 check_mount_and_prep
5557 $LFS mkdir -i 0 $t_dir || error "(2) Fail to mkdir $t_dir on MDT0"
5558 multiop_bg_pause $t_dir D_c || { error "multiop failed: $?"; return 1; }
# NOTE(review): $PID is not assigned in the lines shown here; presumably it
# holds the pid of the background multiop - confirm.
5562 $START_NAMESPACE -r -A || {
5563 error "(3) Fail to start LFSCK for namespace!"; kill -USR1 $PID; }
# NOTE(review): the trailing 4 is presumably the step number reported by
# wait_all_targets_blocked on failure - confirm.
5565 wait_all_targets_blocked namespace completed 4
5570 run_test 37 "LFSCK must skip a ORPHAN"
# End-of-script teardown. The SAVED_* values were captured near the top of
# the script, before MDSSIZE/OSTSIZE/OSTCOUNT were reduced to speed up
# formatting; they are put back here so the reformat below uses the
# original configuration.
5573 # restore MDS/OST size
5574 MDSSIZE=${SAVED_MDSSIZE}
5575 OSTSIZE=${SAVED_OSTSIZE}
5576 OSTCOUNT=${SAVED_OSTCOUNT}
5578 # cleanup the system at last
5579 REFORMAT="yes" cleanup_and_setup_lustre
5582 check_and_cleanup_lustre