3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
11 # Bug numbers for the tests excluded via ALWAYS_EXCEPT
12 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT"
14 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
15 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
17 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
18 . $LUSTRE/tests/test-framework.sh
20 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
23 require_dsh_mds || exit 0
27 if ! check_versions; then
28 skip "It is NOT necessary to test lfsck under interoperation mode"
32 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.3.60) ]] &&
33 skip "Need MDS version at least 2.3.60" && exit 0
37 SAVED_MDSSIZE=${MDSSIZE}
38 SAVED_OSTSIZE=${OSTSIZE}
39 SAVED_OSTCOUNT=${OSTCOUNT}
40 # use small MDS + OST size to speed formatting time
41 # do not make MDSSIZE/OSTSIZE too small, since that affects the default journal size
43 [ $(facet_fstype $SINGLEMDS) == zfs ] && MDSSIZE=300000
45 [ $(facet_fstype ost1) == zfs ] && OSTSIZE=300000
47 # there is no need for many OSTs; cap the count to reduce the format/start/stop overhead
49 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
51 # build up a clean test environment.
52 REFORMAT="yes" check_and_setup_lustre
# Version- and backend-dependent exclusions: each gate below appends test
# names to ALWAYS_EXCEPT when its version/backend check holds.
# NOTE(review): ALWAYS_EXCEPT is presumably consumed by test-framework.sh to
# skip the listed tests -- confirm against the framework.
#
# MDS ($SINGLEMDS) version code at or below 2.4.90: exclude test 2c.
54 [[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.4.90) ]] &&
55 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2c"
# ost1 version code below 2.5.55: exclude tests 11 through 21.
57 [[ $(lustre_version_code ost1) -lt $(version_code 2.5.55) ]] &&
58 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 11 12 13 14 15 16 17 18 19 20 21"
# MDS version code below 2.6.50: exclude tests 2d, 2e, 3 and 22 through 31.
60 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.50) ]] &&
61 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2d 2e 3 22 23 24 25 26 27 28 29 30 31"
63 # DNE does not support striped directory on zfs-based backend yet.
# The check is "not ldiskfs", so any non-ldiskfs MDS backend excludes test 31
# (which may already be listed by the 2.6.50 gate above; the duplicate entry
# is simply appended again).
64 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
65 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 31"
# Shared device names, LFSCK command strings and mount options used by the
# tests in this file.
#
# Target names are built from $FSNAME with fixed index-0 suffixes.
69 MDT_DEV="${FSNAME}-MDT0000"
70 OST_DEV="${FSNAME}-OST0000"
# Device for the MDS: mdsdevname is called with $SINGLEMDS after every "mds"
# substring has been removed (e.g. "mds1" becomes "1").
71 MDT_DEVNAME=$(mdsdevname ${SINGLEMDS//mds/})
# The command strings below are double-quoted, so $SINGLEMDS, $LCTL and the
# device names are expanded once, here, at assignment time. A backslash-newline
# inside the quotes is a line continuation and leaves no newline in the value.
# Later code runs them unquoted (e.g. "$START_NAMESPACE -r"), relying on word
# splitting to turn the string back into a command plus arguments.
#
# Start LFSCK on the MDT for the namespace / layout component.
72 START_NAMESPACE="do_facet $SINGLEMDS \
73 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
74 START_LAYOUT="do_facet $SINGLEMDS \
75 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
# Same layout start command, but issued on facet ost1 against the OST device.
76 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
# Stop LFSCK on the MDT.
77 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
# Dump the LFSCK state parameters; callers pipe the output through awk to pick
# out single fields such as "status" or "flags".
78 SHOW_NAMESPACE="do_facet $SINGLEMDS \
79 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
80 SHOW_LAYOUT="do_facet $SINGLEMDS \
81 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
82 SHOW_LAYOUT_ON_OST="do_facet ost1 \
83 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# Mount option sets passed to "start" when (re)mounting the MDS:
#   SCRUB      - user_xattr only
#   NOSCRUB    - adds noscrub (the tests announce this as "disabling OI scrub")
#   SKIP_LFSCK - adds skip_lfsck (announced as starting "without resume LFSCK")
84 MOUNT_OPTS_SCRUB="-o user_xattr"
85 MOUNT_OPTS_NOSCRUB="-o user_xattr,noscrub"
86 MOUNT_OPTS_SKIP_LFSCK="-o user_xattr,skip_lfsck"
95 echo "preparing... $nfiles * $ndirs files will be created $(date)."
96 if [ ! -z $igif ]; then
97 #define OBD_FAIL_FID_IGIF 0x1504
98 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
101 cp $LUSTRE/tests/*.sh $DIR/$tdir/
102 if [ $ndirs -gt 0 ]; then
103 createmany -d $DIR/$tdir/d $ndirs
104 createmany -m $DIR/$tdir/f $ndirs
105 if [ $nfiles -gt 0 ]; then
106 for ((i = 0; i < $ndirs; i++)); do
107 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
108 /dev/null || error "createmany $nfiles"
111 createmany -d $DIR/$tdir/e $ndirs
114 if [ ! -z $igif ]; then
115 touch $DIR/$tdir/dummy
116 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
119 echo "prepared $(date)."
122 run_e2fsck_on_mdt0() {
123 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] && return
125 stop $SINGLEMDS > /dev/null || error "(0) Fail to the stop MDT0"
126 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n" |
128 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n"
129 error "(2) Detected inconsistency on MDT0"
131 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
132 error "(3) Fail to start MDT0"
135 wait_all_targets_blocked() {
140 local count=$(do_facet mds1 \
141 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 -w |
142 awk '/^${com}_mdts_${status}/ { print \\\$2 }'")
143 [[ $count -eq $MDSCOUNT ]] || {
144 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
145 error "($err) only $count of $MDSCOUNT MDTs are in ${status}"
154 wait_update_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 |
155 awk '/^${com}_mdts_${status}/ { print \\\$2 }'" \
156 "$MDSCOUNT" $LTIME || {
157 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
158 error "($err) some MDTs are not in ${status}"
165 #define OBD_FAIL_LFSCK_DELAY1 0x1600
166 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
167 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
169 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
171 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
172 [ "$STATUS" == "scanning-phase1" ] ||
173 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
175 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
177 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
178 [ "$STATUS" == "stopped" ] ||
179 error "(6) Expect 'stopped', but got '$STATUS'"
181 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
183 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
184 [ "$STATUS" == "scanning-phase1" ] ||
185 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
187 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
188 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
189 mdd.${MDT_DEV}.lfsck_namespace |
190 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
192 error "(9) unexpected status"
195 local repaired=$($SHOW_NAMESPACE |
196 awk '/^updated_phase1/ { print $2 }')
197 [ $repaired -eq 0 ] ||
198 error "(10) Expect nothing to be repaired, but got: $repaired"
200 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
201 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
202 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
203 mdd.${MDT_DEV}.lfsck_namespace |
204 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
206 error "(12) unexpected status"
209 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
210 [ $((scanned1 + 1)) -eq $scanned2 ] ||
211 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
213 echo "stopall, should NOT crash LU-3649"
214 stopall || error "(14) Fail to stopall"
216 run_test 0 "Control LFSCK manually"
221 #define OBD_FAIL_FID_INDIR 0x1501
222 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
223 touch $DIR/$tdir/dummy
225 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
227 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
228 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
229 mdd.${MDT_DEV}.lfsck_namespace |
230 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
232 error "(4) unexpected status"
235 local repaired=$($SHOW_NAMESPACE |
236 awk '/^dirent_repaired/ { print $2 }')
237 # for interop with old server
238 [ -z "$repaired" ] &&
239 repaired=$($SHOW_NAMESPACE |
240 awk '/^updated_phase1/ { print $2 }')
242 [ $repaired -eq 1 ] ||
243 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
247 mount_client $MOUNT || error "(6) Fail to start client!"
249 #define OBD_FAIL_FID_LOOKUP 0x1505
250 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
251 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
253 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
255 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
259 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
260 skip "OI Scrub not implemented for ZFS" && return
264 #define OBD_FAIL_FID_INLMA 0x1502
265 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
266 touch $DIR/$tdir/dummy
268 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
270 #define OBD_FAIL_FID_NOLMA 0x1506
271 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
272 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
273 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
274 mdd.${MDT_DEV}.lfsck_namespace |
275 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
277 error "(4) unexpected status"
280 local repaired=$($SHOW_NAMESPACE |
281 awk '/^dirent_repaired/ { print $2 }')
282 # for interop with old server
283 [ -z "$repaired" ] &&
284 repaired=$($SHOW_NAMESPACE |
285 awk '/^updated_phase1/ { print $2 }')
287 [ $repaired -eq 1 ] ||
288 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
290 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
293 mount_client $MOUNT || error "(6) Fail to start client!"
295 #define OBD_FAIL_FID_LOOKUP 0x1505
296 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
297 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
299 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
301 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
306 #define OBD_FAIL_FID_IGIF 0x1504
307 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
308 touch $DIR/$tdir/dummy
310 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
312 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
313 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
314 mdd.${MDT_DEV}.lfsck_namespace |
315 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
317 error "(4) unexpected status"
320 local repaired=$($SHOW_NAMESPACE |
321 awk '/^dirent_repaired/ { print $2 }')
322 # for interop with old server
323 [ -z "$repaired" ] &&
324 repaired=$($SHOW_NAMESPACE |
325 awk '/^updated_phase1/ { print $2 }')
327 [ $repaired -eq 1 ] ||
328 error "(5) Fail to repair lost FID-in-dirent: $repaired"
332 mount_client $MOUNT || error "(6) Fail to start client!"
334 #define OBD_FAIL_FID_LOOKUP 0x1505
335 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
336 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
338 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
340 run_test 1c "LFSCK can find out and repair lost FID-in-dirent"
345 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
346 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
347 touch $DIR/$tdir/dummy
349 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
351 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
352 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
353 mdd.${MDT_DEV}.lfsck_namespace |
354 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
356 error "(4) unexpected status"
359 local repaired=$($SHOW_NAMESPACE |
360 awk '/^linkea_repaired/ { print $2 }')
361 # for interop with old server
362 [ -z "$repaired" ] &&
363 repaired=$($SHOW_NAMESPACE |
364 awk '/^updated_phase2/ { print $2 }')
366 [ $repaired -eq 1 ] ||
367 error "(5) Fail to repair crashed linkEA: $repaired"
371 mount_client $MOUNT || error "(6) Fail to start client!"
373 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
374 error "(7) Fail to stat $DIR/$tdir/dummy"
376 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
377 local dummyname=$($LFS fid2path $DIR $dummyfid)
378 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
379 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
381 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
387 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
388 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
389 touch $DIR/$tdir/dummy
391 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
393 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
394 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
395 mdd.${MDT_DEV}.lfsck_namespace |
396 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
398 error "(4) unexpected status"
401 local repaired=$($SHOW_NAMESPACE |
402 awk '/^updated_phase2/ { print $2 }')
403 [ $repaired -eq 1 ] ||
404 error "(5) Fail to repair crashed linkEA: $repaired"
408 mount_client $MOUNT || error "(6) Fail to start client!"
410 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
411 error "(7) Fail to stat $DIR/$tdir/dummy"
413 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
414 local dummyname=$($LFS fid2path $DIR $dummyfid)
415 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
416 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
418 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
424 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
425 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
426 touch $DIR/$tdir/dummy
428 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
430 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
431 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
432 mdd.${MDT_DEV}.lfsck_namespace |
433 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
435 error "(4) unexpected status"
438 local repaired=$($SHOW_NAMESPACE |
439 awk '/^updated_phase2/ { print $2 }')
440 [ $repaired -eq 1 ] ||
441 error "(5) Fail to repair crashed linkEA: $repaired"
445 mount_client $MOUNT || error "(6) Fail to start client!"
447 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
448 error "(7) Fail to stat $DIR/$tdir/dummy"
450 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
451 local dummyname=$($LFS fid2path $DIR $dummyfid)
452 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
453 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
455 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
461 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
462 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
463 touch $DIR/$tdir/dummy
465 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
467 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
468 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
469 mdd.${MDT_DEV}.lfsck_namespace |
470 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
472 error "(4) unexpected status"
475 local repaired=$($SHOW_NAMESPACE |
476 awk '/^linkea_repaired/ { print $2 }')
477 [ $repaired -eq 1 ] ||
478 error "(5) Fail to repair crashed linkEA: $repaired"
482 mount_client $MOUNT || error "(6) Fail to start client!"
484 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
485 error "(7) Fail to stat $DIR/$tdir/dummy"
487 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
488 local dummyname=$($LFS fid2path $DIR $dummyfid)
489 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
490 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
492 run_test 2d "LFSCK can recover the missing linkEA entry"
496 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
500 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
502 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
503 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
504 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
505 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
507 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
509 wait_all_targets_blocked namespace completed 4
511 local repaired=$($SHOW_NAMESPACE |
512 awk '/^linkea_repaired/ { print $2 }')
513 [ $repaired -eq 1 ] ||
514 error "(5) Fail to repair crashed linkEA: $repaired"
516 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
517 local name=$($LFS fid2path $DIR $fid)
518 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
519 error "(6) Fail to repair linkEA: $fid $name"
521 run_test 2e "namespace LFSCK can verify remote object linkEA"
527 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
528 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
529 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
531 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
532 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
533 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
535 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
536 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
537 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
539 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
540 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
541 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
543 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
545 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
546 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
547 mdd.${MDT_DEV}.lfsck_namespace |
548 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
550 error "(10) unexpected status"
553 local checked=$($SHOW_NAMESPACE |
554 awk '/^checked_phase2/ { print $2 }')
555 [ $checked -ge 4 ] ||
556 error "(11) Fail to check multiple-linked object: $checked"
558 local repaired=$($SHOW_NAMESPACE |
559 awk '/^multiple_linked_repaired/ { print $2 }')
560 [ $repaired -ge 2 ] ||
561 error "(12) Fail to repair multiple-linked object: $repaired"
563 run_test 3 "LFSCK can verify multiple-linked objects"
567 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
568 skip "OI Scrub not implemented for ZFS" && return
571 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
572 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
574 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
575 echo "start $SINGLEMDS with disabling OI scrub"
576 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
577 error "(2) Fail to start MDS!"
579 #define OBD_FAIL_LFSCK_DELAY2 0x1601
580 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
581 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
582 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
583 mdd.${MDT_DEV}.lfsck_namespace |
584 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
586 error "(5) unexpected status"
589 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
590 [ "$STATUS" == "scanning-phase1" ] ||
591 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
593 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
594 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
595 mdd.${MDT_DEV}.lfsck_namespace |
596 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
598 error "(7) unexpected status"
601 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
602 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
604 local repaired=$($SHOW_NAMESPACE |
605 awk '/^dirent_repaired/ { print $2 }')
606 # for interop with old server
607 [ -z "$repaired" ] &&
608 repaired=$($SHOW_NAMESPACE |
609 awk '/^updated_phase1/ { print $2 }')
611 [ $repaired -ge 9 ] ||
612 error "(9) Fail to re-generate FID-in-dirent: $repaired"
616 mount_client $MOUNT || error "(10) Fail to start client!"
618 #define OBD_FAIL_FID_LOOKUP 0x1505
619 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
620 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
621 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
623 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
627 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
628 skip "OI Scrub not implemented for ZFS" && return
631 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
632 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
634 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
635 echo "start $SINGLEMDS with disabling OI scrub"
636 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
637 error "(2) Fail to start MDS!"
639 #define OBD_FAIL_LFSCK_DELAY2 0x1601
640 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
641 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
642 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
643 mdd.${MDT_DEV}.lfsck_namespace |
644 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
646 error "(5) unexpected status"
649 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
650 [ "$STATUS" == "scanning-phase1" ] ||
651 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
653 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
654 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
655 mdd.${MDT_DEV}.lfsck_namespace |
656 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
658 error "(7) unexpected status"
661 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
662 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
664 local repaired=$($SHOW_NAMESPACE |
665 awk '/^dirent_repaired/ { print $2 }')
666 # for interop with old server
667 [ -z "$repaired" ] &&
668 repaired=$($SHOW_NAMESPACE |
669 awk '/^updated_phase1/ { print $2 }')
671 [ $repaired -ge 2 ] ||
672 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
676 mount_client $MOUNT || error "(10) Fail to start client!"
678 #define OBD_FAIL_FID_LOOKUP 0x1505
679 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
680 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
682 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
684 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
685 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
686 local dummyname=$($LFS fid2path $DIR $dummyfid)
687 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
688 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
690 run_test 5 "LFSCK can handle IGIF object upgrading"
695 #define OBD_FAIL_LFSCK_DELAY1 0x1600
696 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
697 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
699 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
700 [ "$STATUS" == "scanning-phase1" ] ||
701 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
703 # Sleep 3 sec to guarantee at least one object processed by LFSCK
705 # Fail the LFSCK to guarantee there is at least one checkpoint
706 #define OBD_FAIL_LFSCK_FATAL1 0x1608
707 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
708 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
709 mdd.${MDT_DEV}.lfsck_namespace |
710 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
712 error "(4) unexpected status"
715 local POS0=$($SHOW_NAMESPACE |
716 awk '/^last_checkpoint_position/ { print $2 }' |
719 #define OBD_FAIL_LFSCK_DELAY1 0x1600
720 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
721 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
723 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
724 [ "$STATUS" == "scanning-phase1" ] ||
725 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
727 local POS1=$($SHOW_NAMESPACE |
728 awk '/^latest_start_position/ { print $2 }' |
730 [[ $POS0 -lt $POS1 ]] ||
731 error "(7) Expect larger than: $POS0, but got $POS1"
733 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
734 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
735 mdd.${MDT_DEV}.lfsck_namespace |
736 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
738 error "(8) unexpected status"
741 run_test 6a "LFSCK resumes from last checkpoint (1)"
746 #define OBD_FAIL_LFSCK_DELAY2 0x1601
747 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
748 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
750 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
751 [ "$STATUS" == "scanning-phase1" ] ||
752 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
754 # Sleep 5 sec to guarantee that we are in the directory scanning
756 # Fail the LFSCK to guarantee there is at least one checkpoint
757 #define OBD_FAIL_LFSCK_FATAL2 0x1609
758 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
759 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
760 mdd.${MDT_DEV}.lfsck_namespace |
761 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
763 error "(4) unexpected status"
766 local O_POS0=$($SHOW_NAMESPACE |
767 awk '/^last_checkpoint_position/ { print $2 }' |
770 local D_POS0=$($SHOW_NAMESPACE |
771 awk '/^last_checkpoint_position/ { print $4 }')
773 #define OBD_FAIL_LFSCK_DELAY2 0x1601
774 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
775 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
777 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
778 [ "$STATUS" == "scanning-phase1" ] ||
779 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
781 local O_POS1=$($SHOW_NAMESPACE |
782 awk '/^latest_start_position/ { print $2 }' |
784 local D_POS1=$($SHOW_NAMESPACE |
785 awk '/^latest_start_position/ { print $4 }')
787 if [ "$D_POS0" == "N/A" -o "$D_POS1" == "N/A" ]; then
788 [[ $O_POS0 -lt $O_POS1 ]] ||
789 error "(7.1) $O_POS1 is not larger than $O_POS0"
791 [[ $D_POS0 -lt $D_POS1 ]] ||
792 error "(7.2) $D_POS1 is not larger than $D_POS0"
795 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
796 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
797 mdd.${MDT_DEV}.lfsck_namespace |
798 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
800 error "(8) unexpected status"
803 run_test 6b "LFSCK resumes from last checkpoint (2)"
810 #define OBD_FAIL_LFSCK_DELAY2 0x1601
811 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
812 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
814 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
815 [ "$STATUS" == "scanning-phase1" ] ||
816 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
818 # Sleep 3 sec to guarantee at least one object processed by LFSCK
820 echo "stop $SINGLEMDS"
821 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop MDS!"
823 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
824 echo "start $SINGLEMDS"
825 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
826 error "(5) Fail to start MDS!"
828 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
829 mdd.${MDT_DEV}.lfsck_namespace |
830 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
832 error "(6) unexpected status"
835 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
841 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
842 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
843 for ((i = 0; i < 20; i++)); do
844 touch $DIR/$tdir/dummy${i}
847 #define OBD_FAIL_LFSCK_DELAY3 0x1602
848 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
849 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
850 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
851 mdd.${MDT_DEV}.lfsck_namespace |
852 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
854 error "(4) unexpected status"
858 echo "stop $SINGLEMDS"
859 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop MDS!"
861 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
862 echo "start $SINGLEMDS"
863 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
864 error "(6) Fail to start MDS!"
866 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
867 mdd.${MDT_DEV}.lfsck_namespace |
868 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
870 error "(7) unexpected status"
873 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
878 formatall > /dev/null
884 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
885 [ "$STATUS" == "init" ] ||
886 error "(2) Expect 'init', but got '$STATUS'"
888 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
889 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
890 mkdir $DIR/$tdir/crashed
892 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
893 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
894 for ((i = 0; i < 5; i++)); do
895 touch $DIR/$tdir/dummy${i}
898 umount_client $MOUNT || error "(3) Fail to stop client!"
900 #define OBD_FAIL_LFSCK_DELAY2 0x1601
901 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
902 $START_NAMESPACE || error "(4) Fail to start LFSCK for namespace!"
904 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
905 [ "$STATUS" == "scanning-phase1" ] ||
906 error "(5) Expect 'scanning-phase1', but got '$STATUS'"
908 $STOP_LFSCK || error "(6) Fail to stop LFSCK!"
910 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
911 [ "$STATUS" == "stopped" ] ||
912 error "(7) Expect 'stopped', but got '$STATUS'"
914 $START_NAMESPACE || error "(8) Fail to start LFSCK for namespace!"
916 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
917 [ "$STATUS" == "scanning-phase1" ] ||
918 error "(9) Expect 'scanning-phase1', but got '$STATUS'"
920 #define OBD_FAIL_LFSCK_FATAL2 0x1609
921 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
922 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
923 mdd.${MDT_DEV}.lfsck_namespace |
924 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
926 error "(10) unexpected status"
929 #define OBD_FAIL_LFSCK_DELAY1 0x1600
930 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
931 $START_NAMESPACE || error "(11) Fail to start LFSCK for namespace!"
933 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
934 [ "$STATUS" == "scanning-phase1" ] ||
935 error "(12) Expect 'scanning-phase1', but got '$STATUS'"
937 #define OBD_FAIL_LFSCK_CRASH 0x160a
938 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
941 echo "stop $SINGLEMDS"
942 stop $SINGLEMDS > /dev/null || error "(13) Fail to stop MDS!"
944 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
945 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
947 echo "start $SINGLEMDS"
948 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
949 error "(14) Fail to start MDS!"
951 local timeout=$(max_recovery_time)
954 while [ $timer -lt $timeout ]; do
955 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
956 mdt.${MDT_DEV}.recovery_status |
957 awk '/^status/ { print \\\$2 }'")
958 [ "$STATUS" != "RECOVERING" ] && break;
963 [ $timer != $timeout ] ||
964 error "(14.1) recovery timeout"
966 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
967 [ "$STATUS" == "crashed" ] ||
968 error "(15) Expect 'crashed', but got '$STATUS'"
970 #define OBD_FAIL_LFSCK_DELAY2 0x1601
971 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
972 $START_NAMESPACE || error "(16) Fail to start LFSCK for namespace!"
974 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
975 [ "$STATUS" == "scanning-phase1" ] ||
976 error "(17) Expect 'scanning-phase1', but got '$STATUS'"
978 echo "stop $SINGLEMDS"
979 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop MDS!"
981 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
982 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
984 echo "start $SINGLEMDS"
985 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
986 error "(19) Fail to start MDS!"
989 while [ $timer -lt $timeout ]; do
990 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
991 mdt.${MDT_DEV}.recovery_status |
992 awk '/^status/ { print \\\$2 }'")
993 [ "$STATUS" != "RECOVERING" ] && break;
998 [ $timer != $timeout ] ||
999 error "(19.1) recovery timeout"
1001 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1002 [ "$STATUS" == "paused" ] ||
1003 error "(20) Expect 'paused', but got '$STATUS'"
1005 echo "stop $SINGLEMDS"
1006 stop $SINGLEMDS > /dev/null || error "(20.1) Fail to stop MDS!"
1008 echo "start $SINGLEMDS without resume LFSCK"
1009 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SKIP_LFSCK > /dev/null ||
1010 error "(20.2) Fail to start MDS!"
1013 while [ $timer -lt $timeout ]; do
1014 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1015 mdt.${MDT_DEV}.recovery_status |
1016 awk '/^status/ { print \\\$2 }'")
1017 [ "$STATUS" != "RECOVERING" ] && break;
1019 timer=$((timer + 1))
1022 [ $timer != $timeout ] ||
1023 error "(20.3) recovery timeout"
1025 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1026 [ "$STATUS" == "paused" ] ||
1027 error "(20.4) Expect 'paused', but got '$STATUS'"
1029 #define OBD_FAIL_LFSCK_DELAY3 0x1602
1030 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
1032 $START_NAMESPACE || error "(21) Fail to start LFSCK for namespace!"
1033 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1034 mdd.${MDT_DEV}.lfsck_namespace |
1035 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
1037 error "(22) unexpected status"
1040 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1041 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
1042 error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
1044 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1045 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1046 mdd.${MDT_DEV}.lfsck_namespace |
1047 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1049 error "(24) unexpected status"
1052 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1053 [ -z "$FLAGS" ] || error "(25) Expect empty flags, but got '$FLAGS'"
1055 run_test 8 "LFSCK state machine"
# LFSCK speed control (1): run the layout LFSCK under a speed limit, change
# the limit while it is running, and compare average_speed_phase1 with
# bounds derived from the two limits.
# Skip unless /proc/cpuinfo has a line matching the pattern processor.*: 1
1058 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1059 skip "Testing on UP system, the speed may be inaccurate."
# Create 5000 files in the lfsck subdirectory as input for the scan.
1063 check_mount_and_prep
1064 $LFS mkdir -i 0 $DIR/$tdir/lfsck || error "(1) Fail to mkdir lfsck"
1065 $LFS setstripe -c 1 -i -1 $DIR/$tdir/lfsck
1066 createmany -o $DIR/$tdir/lfsck/f 5000
# First speed value, handed to the layout LFSCK start command through -s.
1068 local BASE_SPEED1=100
1070 $START_LAYOUT -r -s $BASE_SPEED1 || error "(2) Fail to start LFSCK!"
# The test requires the status to be scanning-phase1 at this point.
1073 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
1074 [ "$STATUS" == "scanning-phase1" ] ||
1075 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
# Sample the average phase-1 speed reported by the layout LFSCK.
1077 local SPEED=$($SHOW_LAYOUT |
1078 awk '/^average_speed_phase1/ { print $2 }')
1080 # There may be time error, normally it should be less than 2 seconds.
1081 # We allow another 20% schedule error.
1083 # MAX_MARGIN = 1.3 = 13 / 10
# NOTE(review): RUN_TIME1, RUN_TIME2 and TIME_DIFF are read below but are
# not assigned in the lines visible here; confirm where they are set.
# Integer arithmetic: BASE_SPEED1 is scaled by
# (RUN_TIME1 + TIME_DIFF) / RUN_TIME1 and then by 13 / 10.
1084 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1085 RUN_TIME1 * 13 / 10))
1086 [ $SPEED -lt $MAX_SPEED ] || {
1088 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1089 error "(4) Speed $SPEED, expected < $MAX_SPEED"
1092 # adjust speed limit
1093 local BASE_SPEED2=300
# The new limit is written to the lfsck_speed_limit parameter of the MDT
# while the scan started above is still the one being measured.
1095 do_facet $SINGLEMDS \
1096 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1099 SPEED=$($SHOW_LAYOUT | awk '/^average_speed_phase1/ { print $2 }')
1100 # MIN_MARGIN = 0.7 = 7 / 10
# Lower bound: both limits weighted by their run times, each shortened by
# TIME_DIFF, divided by the total run time, then scaled by 7 / 10.
1101 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1102 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1103 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
1104 [ $SPEED -gt $MIN_SPEED ] || {
# On a non-ldiskfs MDS backend the shortfall is reported through
# error_ignore with the ticket id LU-5624.
1105 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1106 error_ignore LU-5624 \
1107 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
1110 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
1114 # MAX_MARGIN = 1.3 = 13 / 10
# Upper bound: same weighting as the lower bound, but each run time is
# lengthened by TIME_DIFF and the result is scaled by 13 / 10.
1115 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1116 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1117 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1118 [ $SPEED -lt $MAX_SPEED ] || {
1120 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1121 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1122 error "(6) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit to 0 on every MDT node and every OST node, then
# wait for the layout LFSCK on the MDT to report completed.
1125 do_nodes $(comma_list $(mdts_nodes)) \
1126 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1127 do_nodes $(comma_list $(osts_nodes)) \
1128 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
# The trailing 30 is passed to wait_update_facet (presumably a timeout in
# seconds; confirm against the helper).
1130 wait_update_facet $SINGLEMDS \
1131 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout |
1132 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
1133 error "(7) Failed to get expected 'completed'"
1135 run_test 9a "LFSCK speed control (1)"
# LFSCK speed control (2): run the namespace LFSCK under a speed limit,
# change the limit while it is running, and compare average_speed_phase2
# with bounds derived from the two limits.
# Skip unless /proc/cpuinfo has a line matching the pattern processor.*: 1
1138 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1139 skip "Testing on UP system, the speed may be inaccurate."
# The entries below are created with createmany while fail_loc 0x1604
# (OBD_FAIL_LFSCK_LINKEA_MORE) is set on the MDS.
1145 echo "Preparing another 50 * 50 files (with error) at $(date)."
1146 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1147 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1148 createmany -d $DIR/$tdir/d 50
1149 createmany -m $DIR/$tdir/f 50
1150 for ((i = 0; i < 50; i++)); do
1151 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
# With fail_loc 0x160c (OBD_FAIL_LFSCK_NO_DOUBLESCAN) set, the namespace
# LFSCK started with -r is expected to reach the status stopped.
1154 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
1155 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1156 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
1157 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1158 mdd.${MDT_DEV}.lfsck_namespace |
1159 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1161 error "(5) unexpected status"
1164 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1165 echo "Prepared at $(date)."
# First speed value. The LFSCK is started again without -r and is then
# required to be in scanning-phase2.
1167 local BASE_SPEED1=50
1169 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
1172 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1173 [ "$STATUS" == "scanning-phase2" ] ||
1174 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
# Sample the average phase-2 speed reported by the namespace LFSCK.
1176 local SPEED=$($SHOW_NAMESPACE |
1177 awk '/^average_speed_phase2/ { print $2 }')
1178 # There may be time error, normally it should be less than 2 seconds.
1179 # We allow another 20% schedule error.
1181 # MAX_MARGIN = 1.3 = 13 / 10
# NOTE(review): RUN_TIME1, RUN_TIME2 and TIME_DIFF are read below but are
# not assigned in the lines visible here; confirm where they are set.
1182 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1183 RUN_TIME1 * 13 / 10))
1184 [ $SPEED -lt $MAX_SPEED ] || {
1186 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1187 error "(8) Speed $SPEED, expected < $MAX_SPEED"
1190 # adjust speed limit
1191 local BASE_SPEED2=150
# The new limit is written to the lfsck_speed_limit parameter of the MDT.
1193 do_facet $SINGLEMDS \
1194 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1197 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
1198 # MIN_MARGIN = 0.7 = 7 / 10
# Lower bound: both limits weighted by their run times, each shortened by
# TIME_DIFF, divided by the total run time, then scaled by 7 / 10.
1199 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1200 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1201 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
1202 [ $SPEED -gt $MIN_SPEED ] || {
# On a non-ldiskfs MDS backend the shortfall is reported through
# error_ignore with the ticket id LU-5624.
1203 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1204 error_ignore LU-5624 \
1205 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1208 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
1212 # MAX_MARGIN = 1.3 = 13 / 10
# Upper bound: same weighting as the lower bound, but each run time is
# lengthened by TIME_DIFF and the result is scaled by 13 / 10.
1213 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1214 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1215 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1216 [ $SPEED -lt $MAX_SPEED ] || {
1218 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1219 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1220 error "(10) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit to 0 on every MDT node and every OST node, then
# wait for the namespace LFSCK on the MDT to report completed.
1223 do_nodes $(comma_list $(mdts_nodes)) \
1224 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1225 do_nodes $(comma_list $(osts_nodes)) \
1226 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1227 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1228 mdd.${MDT_DEV}.lfsck_namespace |
1229 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1231 error "(11) unexpected status"
1234 run_test 9b "LFSCK speed control (2)"
# System is available during LFSCK scanning: while the namespace LFSCK is
# in scanning-phase1, a set of ordinary client operations must succeed.
# The test is skipped when the MDS backend is not ldiskfs.
1238 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
1239 skip "lookup(..)/linkea on ZFS issue" && return
# Even indexes 0, 2, ... 998 are created while fail_loc 0x1603
# (OBD_FAIL_LFSCK_LINKEA_CRASH) is set on the MDS.
1243 echo "Preparing more files with error at $(date)."
1244 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1245 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
1247 for ((i = 0; i < 1000; i = $((i+2)))); do
1248 mkdir -p $DIR/$tdir/d${i}
1249 touch $DIR/$tdir/f${i}
1250 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
# Odd indexes 1, 3, ... 999 are created while fail_loc 0x1604
# (OBD_FAIL_LFSCK_LINKEA_MORE) is set on the MDS.
1253 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1254 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1256 for ((i = 1; i < 1000; i = $((i+2)))); do
1257 mkdir -p $DIR/$tdir/d${i}
1258 touch $DIR/$tdir/f${i}
1259 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1262 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1263 echo "Prepared at $(date)."
# Add one hard link after the fail_loc has been cleared.
1265 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
# Remount the client before the scan is started.
1267 umount_client $MOUNT
1268 mount_client $MOUNT || error "(3) Fail to start client!"
# Start the namespace LFSCK with -r and a speed value of 100.
1270 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1273 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1274 [ "$STATUS" == "scanning-phase1" ] ||
1275 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Each of the following operations must succeed while the scan is running:
# recursive listing, create, mkdir, unlink, recursive remove, rename,
# hard link and symbolic link.
1277 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1279 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1281 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1283 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1285 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1287 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1289 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1291 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1292 error "(14) Fail to softlink!"
# The scan is required to be in scanning-phase1 still after the operations.
1294 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1295 [ "$STATUS" == "scanning-phase1" ] ||
1296 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
# Set lfsck_speed_limit to 0 on every MDT node and every OST node, then
# wait for the namespace LFSCK on the MDT to report completed.
1298 do_nodes $(comma_list $(mdts_nodes)) \
1299 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1300 do_nodes $(comma_list $(osts_nodes)) \
1301 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1302 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1303 mdd.${MDT_DEV}.lfsck_namespace |
1304 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1306 error "(16) unexpected status"
1309 run_test 10 "System is available during LFSCK scanning"
# Remove the LAST_ID file and the d0/0 entry under O/<idx> on the local
# mount of OST number <ost>.
# NOTE(review): ost and idx are read below but not assigned in the lines
# visible here; presumably they are taken from the positional arguments.
# Returns 1 when mount_fstype fails and 2 when unmount_fstype fails.
1312 ost_remove_lastid() {
# All remote commands for this OST go through do_facet.
1315 local rcmd="do_facet ost${ost}"
1317 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1319 # step 1: local mount
1320 mount_fstype ost${ost} || return 1
1321 # step 2: remove the specified LAST_ID
# The exit status of rm is not checked; -f is passed to rm.
1322 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
1324 unmount_fstype ost${ost} || return 2
# LFSCK can rebuild lost last_id: LAST_ID is removed on ost1, then the
# layout LFSCK on the OST must flag crashed_lastid, complete, and end up
# with empty flags.
1328 check_mount_and_prep
1329 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1330 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
# Arguments 1 and 0 are handed to ost_remove_lastid (OST number and idx).
1335 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
# ost1 is started with the mount options held in MOUNT_OPTS_NOSCRUB.
1337 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1338 error "(2) Fail to start ost1"
# fail_loc 0x160e (OBD_FAIL_LFSCK_DELAY4) with fail_val 3 is set on ost1
# before the LFSCK is triggered, and cleared once the flag was observed.
1340 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1341 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1343 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1344 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
# Wait until the flags field of lfsck_layout on the OST is crashed_lastid.
1346 wait_update_facet ost1 "$LCTL get_param -n \
1347 obdfilter.${OST_DEV}.lfsck_layout |
1348 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1350 error "(5) unexpected status"
1353 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
# Wait until the status field of lfsck_layout on the OST is completed.
1355 wait_update_facet ost1 "$LCTL get_param -n \
1356 obdfilter.${OST_DEV}.lfsck_layout |
1357 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1359 error "(6) unexpected status"
1362 echo "the LAST_ID(s) should have been rebuilt"
# After completion the flags field must be empty.
1363 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1364 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
1366 run_test 11a "LFSCK can rebuild lost last_id"
# LFSCK can rebuild crashed last_id: files are created while the on-disk
# LAST_ID update is skipped, so after a restart of ost1 the reported
# last_id is smaller; the layout LFSCK on the OST must bring it back.
1369 check_mount_and_prep
1370 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1372 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1373 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1374 do_facet ost1 $LCTL set_param fail_loc=0x160d
# Create 32 files more than the count returned by precreated_ost_obj_count.
1376 local count=$(precreated_ost_obj_count 0 0)
1378 createmany -o $DIR/$tdir/f $((count + 32))
# seq is read from prealloc_last_seq of the osp device on mds1; lastid1 is
# the second colon-separated field of the last_id line on ost1 matching seq.
1380 local proc_path="${FSNAME}-OST0000-osc-MDT0000"
1381 local seq=$(do_facet mds1 $LCTL get_param -n \
1382 osp.${proc_path}.prealloc_last_seq)
1383 local lastid1=$(do_facet ost1 "lctl get_param -n \
1384 obdfilter.${ost1_svc}.last_id" | grep $seq |
1385 awk -F: '{ print $2 }')
1387 umount_client $MOUNT
1388 stop ost1 || error "(1) Fail to stop ost1"
# fail_loc 0x215 (OBD_FAIL_OST_ENOSPC) is set on ost1 before it is started
# again and is only cleared at the end of the test.
1390 #define OBD_FAIL_OST_ENOSPC 0x215
1391 do_facet ost1 $LCTL set_param fail_loc=0x215
1393 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1394 error "(2) Fail to start ost1"
# Poll last_id on ost1 for at most 60 iterations until a value for seq is
# reported. lastid2 is not declared local in the lines visible here.
1396 for ((i = 0; i < 60; i++)); do
1397 lastid2=$(do_facet ost1 "lctl get_param -n \
1398 obdfilter.${ost1_svc}.last_id" | grep $seq |
1399 awk -F: '{ print $2 }')
1400 [ ! -z $lastid2 ] && break;
1404 echo "the on-disk LAST_ID should be smaller than the expected one"
1405 [ $lastid1 -gt $lastid2 ] ||
1406 error "(4) expect lastid1 [ $lastid1 ] > lastid2 [ $lastid2 ]"
1408 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1409 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1411 wait_update_facet ost1 "$LCTL get_param -n \
1412 obdfilter.${OST_DEV}.lfsck_layout |
1413 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1415 error "(6) unexpected status"
# Restart ost1 once more so that last_id is read again after the LFSCK run.
1418 stop ost1 || error "(7) Fail to stop ost1"
1420 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1421 error "(8) Fail to start ost1"
1423 echo "the on-disk LAST_ID should have been rebuilt"
# The reported last_id for seq must become equal to lastid1; on failure the
# full last_id output is printed before the error is raised.
1424 wait_update_facet ost1 "$LCTL get_param -n \
1425 obdfilter.${ost1_svc}.last_id | grep $seq |
1426 awk -F: '{ print \\\$2 }'" "$lastid1" 60 || {
1427 do_facet ost1 $LCTL get_param -n \
1428 obdfilter.${ost1_svc}.last_id
1429 error "(9) expect lastid1 $seq:$lastid1"
1432 do_facet ost1 $LCTL set_param fail_loc=0
1433 stopall || error "(10) Fail to stopall"
1435 run_test 11b "LFSCK can rebuild crashed last_id"
# Single command to trigger LFSCK on all devices: namespace and layout
# LFSCK are each started with -A, stopped with -A, and started again, and
# the status of all targets is checked after every step.
# The test is skipped when MDSCOUNT is below 2.
1438 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# One directory per MDT (index k - 1) is created, with 100 files in each.
1440 check_mount_and_prep
1441 for k in $(seq $MDSCOUNT); do
1442 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1443 createmany -o $DIR/$tdir/${k}/f 100 ||
1444 error "(0) Fail to create 100 files."
1447 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1448 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1449 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1451 echo "All the LFSCK targets should be in 'scanning-phase1' status."
# The trailing number given to wait_all_targets and wait_all_targets_blocked
# continues the step numbering of the error messages (presumably it is used
# in the failure message of the helper; confirm against the helper).
1452 wait_all_targets namespace scanning-phase1 3
1454 echo "Stop namespace LFSCK on all targets by single lctl command."
1455 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1456 error "(4) Fail to stop LFSCK on all devices!"
1458 echo "All the LFSCK targets should be in 'stopped' status."
1459 wait_all_targets_blocked namespace stopped 5
1461 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1462 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1463 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1465 echo "All the LFSCK targets should be in 'completed' status."
1466 wait_all_targets_blocked namespace completed 7
# Full debug logging is enabled for the layout part and disabled again at
# the end of the test.
1468 start_full_debug_logging
1470 echo "Start layout LFSCK on all targets by single command (-s 1)."
1471 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1472 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1474 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1475 wait_all_targets layout scanning-phase1 9
1477 echo "Stop layout LFSCK on all targets by single lctl command."
1478 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1479 error "(10) Fail to stop LFSCK on all devices!"
1481 echo "All the LFSCK targets should be in 'stopped' status."
1482 wait_all_targets_blocked layout stopped 11
# In addition, every OST is queried directly and must report stopped.
1484 for k in $(seq $OSTCOUNT); do
1485 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1486 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1487 awk '/^status/ { print $2 }')
1488 [ "$STATUS" == "stopped" ] ||
1489 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1492 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1493 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1494 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1496 echo "All the LFSCK targets should be in 'completed' status."
1497 wait_all_targets_blocked layout completed 14
1499 stop_full_debug_logging
1501 run_test 12a "single command to trigger LFSCK on all devices"
# Auto detect Lustre device: lfsck_start is issued without -M and both the
# namespace and the layout LFSCK must reach completed on all targets.
1504 check_mount_and_prep
1506 echo "Start LFSCK without '-M' specified."
1507 do_facet mds1 $LCTL lfsck_start -A -r ||
1508 error "(0) Fail to start LFSCK without '-M'"
1510 wait_all_targets_blocked namespace completed 1
1511 wait_all_targets_blocked layout completed 2
# count is the number of lines of the device list on mds1 whose third
# column contains the string mdt.
1513 local count=$(do_facet mds1 $LCTL dl |
1514 awk '{ print $3 }' | grep mdt | wc -l)
1515 if [ $count -gt 1 ]; then
1517 echo "Start layout LFSCK on the node with multipe targets,"
1518 echo "but not specify '-M'/'-A' option. Should get failure."
# Success of lfsck_start is the failure case here; the trailing true keeps
# the exit status of the list at zero when lfsck_start fails as expected.
1520 do_facet mds1 $LCTL lfsck_start -t layout -r &&
1521 error "(3) Start layout LFSCK should fail" || true
1524 run_test 12b "auto detect Lustre device"
# LFSCK can repair crashed lmm_oi: two files are created while a failure
# stub is active on the MDS, and the layout LFSCK must then report exactly
# two repairs in repaired_others.
1528 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1529 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1530 echo "MDT-object FID."
1533 check_mount_and_prep
1535 echo "Inject failure stub to simulate bad lmm_oi"
# Both the plain file from createmany and the composite file f1 are created
# while fail_loc 0x160f (OBD_FAIL_LFSCK_BAD_LMMOI) is set.
1536 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1537 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
1538 createmany -o $DIR/$tdir/f 1
1539 $LFS setstripe -E 1M -S 1M -E -1 $DIR/$tdir/f1 ||
1540 error "(0) Fail to create PFL $DIR/$tdir/f1"
1541 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1543 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1544 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1546 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1547 mdd.${MDT_DEV}.lfsck_layout |
1548 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1550 error "(2) unexpected status"
# The repaired_others counter must be exactly 2.
1553 local repaired=$($SHOW_LAYOUT |
1554 awk '/^repaired_others/ { print $2 }')
1555 [ $repaired -eq 2 ] ||
1556 error "(3) Fail to repair crashed lmm_oi: $repaired"
1558 run_test 13 "LFSCK can repair crashed lmm_oi"
# LFSCK can repair MDT-object with dangling LOV EA reference (1): files are
# created while a failure stub is active on ost1; a first layout LFSCK run
# only counts the dangling references, a second run with -c repairs them.
1562 echo "The OST-object referenced by the MDT-object should be there;"
1563 echo "otherwise, the LFSCK should re-create the missing OST-object."
1564 echo "without '--delay-create-ostobj' option."
1567 check_mount_and_prep
1568 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1570 echo "Inject failure stub to simulate dangling referenced MDT-object"
1571 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1572 do_facet ost1 $LCTL set_param fail_loc=0x1610
# Create 16 files more than the count returned by precreated_ost_obj_count,
# then guard0, then 16 composite files, then guard1, all under the fail_loc.
1573 local count=$(precreated_ost_obj_count 0 0)
1575 createmany -o $DIR/$tdir/f $((count + 16)) ||
1576 error "(0.1) Fail to create $DIR/$tdir/fx"
1577 touch $DIR/$tdir/guard0
1579 for ((i = 0; i < 16; i++)); do
1580 $LFS setstripe -E 512K -S 256K -o 0 -E 2M \
1581 $DIR/$tdir/f_comp${i} ||
1582 error "(0.2) Fail to create $DIR/$tdir/f_comp${i}"
1584 touch $DIR/$tdir/guard1
1586 do_facet ost1 $LCTL set_param fail_loc=0
1588 start_full_debug_logging
1590 # exhaust other pre-created dangling cases
1591 count=$(precreated_ost_obj_count 0 0)
1592 createmany -o $DIR/$tdir/a $count ||
1593 error "(0.5) Fail to create $count files."
1595 echo "'ls' should fail because of dangling referenced MDT-object"
# A successful listing is the failure case for this step.
1596 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
1598 echo "Trigger layout LFSCK to find out dangling reference"
1599 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1601 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1602 mdd.${MDT_DEV}.lfsck_layout |
1603 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1605 error "(3) unexpected status"
# At least 32 dangling references must be counted by the first run.
1608 local repaired=$($SHOW_LAYOUT |
1609 awk '/^repaired_dangling/ { print $2 }')
1610 [ $repaired -ge 32 ] ||
1611 error "(4) Fail to repair dangling reference: $repaired"
1613 echo "'stat' should fail because of not repair dangling by default"
1614 stat $DIR/$tdir/guard0 > /dev/null 2>&1 &&
1615 error "(5.1) stat should fail"
1616 stat $DIR/$tdir/guard1 > /dev/null 2>&1 &&
1617 error "(5.2) stat should fail"
1619 echo "Trigger layout LFSCK to repair dangling reference"
# The second run adds -c to the start command.
1620 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1622 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1623 mdd.${MDT_DEV}.lfsck_layout |
1624 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1626 error "(7) unexpected status"
1629 # There may be some async LFSCK updates in processing, wait for
1630 # a while until the target reparation has been done. LU-4970.
1632 echo "'stat' should success after layout LFSCK repairing"
# Both guard files must eventually report a Size of 0 through stat; on
# failure the stat output is printed before the error is raised.
1633 wait_update_facet client "stat $DIR/$tdir/guard0 |
1634 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1635 stat $DIR/$tdir/guard0
1637 error "(8.1) unexpected size"
1640 wait_update_facet client "stat $DIR/$tdir/guard1 |
1641 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1642 stat $DIR/$tdir/guard1
1644 error "(8.2) unexpected size"
1647 repaired=$($SHOW_LAYOUT |
1648 awk '/^repaired_dangling/ { print $2 }')
1649 [ $repaired -ge 32 ] ||
1650 error "(9) Fail to repair dangling reference: $repaired"
1652 stop_full_debug_logging
1654 echo "stopall to cleanup object cache"
1657 setupall > /dev/null
1659 run_test 14a "LFSCK can repair MDT-object with dangling LOV EA reference (1)"
# LFSCK can repair MDT-object with dangling LOV EA reference (2): same
# scenario with the -o and -d options added to the layout LFSCK start
# command (presumably -d is the delay-create-ostobj option named in the
# message below; confirm against the lctl documentation).
1663 echo "The OST-object referenced by the MDT-object should be there;"
1664 echo "otherwise, the LFSCK should re-create the missing OST-object."
1665 echo "with '--delay-create-ostobj' option."
1668 check_mount_and_prep
1669 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1671 echo "Inject failure stub to simulate dangling referenced MDT-object"
1672 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1673 do_facet ost1 $LCTL set_param fail_loc=0x1610
# Create 31 files more than the count returned by precreated_ost_obj_count
# plus the guard file, all under the fail_loc. The exit status of this
# createmany call is not checked.
1674 local count=$(precreated_ost_obj_count 0 0)
1676 createmany -o $DIR/$tdir/f $((count + 31))
1677 touch $DIR/$tdir/guard
1678 do_facet ost1 $LCTL set_param fail_loc=0
1680 start_full_debug_logging
1682 # exhaust other pre-created dangling cases
1683 count=$(precreated_ost_obj_count 0 0)
1684 createmany -o $DIR/$tdir/a $count ||
1685 error "(0) Fail to create $count files."
1687 echo "'ls' should fail because of dangling referenced MDT-object"
# A successful listing is the failure case for this step.
1688 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
1690 echo "Trigger layout LFSCK to find out dangling reference"
1691 $START_LAYOUT -r -o -d || error "(2) Fail to start LFSCK for layout!"
1693 wait_all_targets_blocked layout completed 3
# At least 32 dangling references must be counted by the first run.
1695 local repaired=$($SHOW_LAYOUT |
1696 awk '/^repaired_dangling/ { print $2 }')
1697 [ $repaired -ge 32 ] ||
1698 error "(4) Fail to repair dangling reference: $repaired"
1700 echo "'stat' should fail because of not repair dangling by default"
1701 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
1703 echo "Trigger layout LFSCK to repair dangling reference"
# The second run adds -c to the start command.
1704 $START_LAYOUT -r -o -c -d || error "(6) Fail to start LFSCK for layout!"
1706 wait_all_targets_blocked layout completed 7
1708 # There may be some async LFSCK updates in processing, wait for
1709 # a while until the target reparation has been done. LU-4970.
1711 echo "'stat' should success after layout LFSCK repairing"
# The guard file must eventually report a Size of 0 through stat; on
# failure the stat output is printed before the error is raised.
1712 wait_update_facet client "stat $DIR/$tdir/guard |
1713 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1714 stat $DIR/$tdir/guard
1716 error "(8) unexpected size"
1719 repaired=$($SHOW_LAYOUT |
1720 awk '/^repaired_dangling/ { print $2 }')
1721 [ $repaired -ge 32 ] ||
1722 error "(9) Fail to repair dangling reference: $repaired"
1724 stop_full_debug_logging
1726 echo "stopall to cleanup object cache"
1729 setupall > /dev/null
1731 run_test 14b "LFSCK can repair MDT-object with dangling LOV EA reference (2)"
# LFSCK can repair unmatched MDT-object/OST-object pairs (1): files are
# written while a failure stub is active on all OST nodes, and the layout
# LFSCK must then report at least 3 repairs in repaired_unmatched_pair.
1735 echo "If the OST-object referenced by the MDT-object back points"
1736 echo "to some non-exist MDT-object, then the LFSCK should repair"
1737 echo "the OST-object to back point to the right MDT-object."
1740 check_mount_and_prep
1741 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1743 echo "Inject failure stub to make the OST-object to back point to"
1744 echo "non-exist MDT-object."
1745 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
# The fail_loc is set on every OST node, not only on ost1.
1747 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
1748 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1749 $LFS setstripe -E 1M -S 256K -c 1 -E -1 -S 512K -c $OSTCOUNT \
1751 error "(0) Fail to create PFL $DIR/$tdir/f1"
1752 # 'dd' will trigger punch RPC firstly on every OST-objects.
1753 # So even though some OST-object will not be write by 'dd',
1754 # as long as it is allocated (may be NOT allocated in pfl_3b)
1755 # its layout information will be set also.
# 257 blocks of 4K are written, one block past the 1M end of the first
# component given to setstripe above.
1756 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4K count=257
1757 cancel_lru_locks osc
1758 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1760 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1761 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1763 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1764 mdd.${MDT_DEV}.lfsck_layout |
1765 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1767 error "(2) unexpected status"
# The repaired_unmatched_pair counter must be at least 3.
1770 local repaired=$($SHOW_LAYOUT |
1771 awk '/^repaired_unmatched_pair/ { print $2 }')
1772 [ $repaired -ge 3 ] ||
1773 error "(3) Fail to repair unmatched pair: $repaired"
1775 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# LFSCK can repair unmatched MDT-object/OST-object pairs (2): files are
# written while a failure stub is active on all OST nodes, and the layout
# LFSCK must then report exactly 4 repairs in repaired_unmatched_pair.
1779 echo "If the OST-object referenced by the MDT-object back points"
1780 echo "to other MDT-object that doesn't recognize the OST-object,"
1781 echo "then the LFSCK should repair it to back point to the right"
1782 echo "MDT-object (the first one)."
# The guard file is written before the failure stub is enabled.
1785 check_mount_and_prep
1786 mkdir -p $DIR/$tdir/0
1787 $LFS setstripe -c 1 -i 0 $DIR/$tdir/0
1788 dd if=/dev/zero of=$DIR/$tdir/0/guard bs=1M count=1
1789 cancel_lru_locks osc
1791 echo "Inject failure stub to make the OST-object to back point to"
1792 echo "other MDT-object"
# stripes becomes 2 when at least two OSTs exist. NOTE(review): its initial
# value is not assigned in the lines visible here; confirm the default.
1795 [ $OSTCOUNT -ge 2 ] && stripes=2
1797 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1798 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1612
1799 dd if=/dev/zero of=$DIR/$tdir/0/f0 bs=1M count=1
1800 $LFS setstripe -E 1M -S 256K -c $stripes -E 2M -S 512K -c 1 \
1802 error "(0) Fail to create PFL $DIR/$tdir/f1"
1803 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2
1804 cancel_lru_locks osc
1805 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1807 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1808 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1810 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1811 mdd.${MDT_DEV}.lfsck_layout |
1812 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1814 error "(2) unexpected status"
# The repaired_unmatched_pair counter must be exactly 4.
1817 local repaired=$($SHOW_LAYOUT |
1818 awk '/^repaired_unmatched_pair/ { print $2 }')
1819 [ $repaired -eq 4 ] ||
1820 error "(3) Fail to repair unmatched pair: $repaired"
1822 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# LFSCK can repair unmatched MDT-object/OST-object pairs (3): the layout
# LFSCK is started while a delayed directory migration is in progress and
# must not treat the shared layout as a multiple referenced case.
# Skipped with fewer than 2 MDTs and on MDS versions from 2.7.55 on.
1825 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
1827 [ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.55) ] &&
1828 skip "Skip the test after 2.7.55 see LU-6437" && return
1831 echo "According to current metadata migration implementation,"
1832 echo "before the old MDT-object is removed, both the new MDT-object"
1833 echo "and old MDT-object will reference the same LOV layout. Then if"
1834 echo "the layout LFSCK finds the new MDT-object by race, it will"
1835 echo "regard related OST-object(s) as multiple referenced case, and"
1836 echo "will try to create new OST-object(s) for the new MDT-object."
1837 echo "To avoid such trouble, the layout LFSCK needs to lock the old"
1838 echo "MDT-object before confirm the multiple referenced case."
# Directory a1 is created with MDT index 1 and holds one 1M file.
1841 check_mount_and_prep
1842 $LFS mkdir -i 1 $DIR/$tdir/a1
1843 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1844 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=1
1845 cancel_lru_locks osc
1847 echo "Inject failure stub on MDT1 to delay the migration"
1849 #define OBD_FAIL_MIGRATE_DELAY 0x1803
1850 do_facet mds2 $LCTL set_param fail_val=5 fail_loc=0x1803
1851 echo "Migrate $DIR/$tdir/a1 from MDT1 to MDT0 with delay"
# The migration runs in the background so that the LFSCK below can be
# started while it is still in progress.
1852 $LFS migrate -m 0 $DIR/$tdir/a1 &
1855 echo "Trigger layout LFSCK to race with the migration"
1856 $START_LAYOUT -A -r || error "(1) Fail to start layout LFSCK!"
1858 wait_all_targets_blocked layout completed 2
1860 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
# Exactly one unmatched pair repair and no multiple reference repair are
# expected.
1861 local repaired=$($SHOW_LAYOUT |
1862 awk '/^repaired_unmatched_pair/ { print $2 }')
1863 [ $repaired -eq 1 ] ||
1864 error "(3) Fail to repair unmatched pair: $repaired"
1866 repaired=$($SHOW_LAYOUT |
1867 awk '/^repaired_multiple_referenced/ { print $2 }')
1868 [ $repaired -eq 0 ] ||
1869 error "(4) Unexpectedly repaird multiple references: $repaired"
1871 run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)"
# LFSCK can repair inconsistent MDT-object/OST-object owner: the owner of
# f0 is changed while a failure stub is active on the MDS, and the layout
# LFSCK must then report exactly 1 repair in repaired_inconsistent_owner.
1875 echo "If the OST-object's owner information does not match the owner"
1876 echo "information stored in the MDT-object, then the LFSCK trust the"
1877 echo "MDT-object and update the OST-object's owner information."
1880 check_mount_and_prep
1881 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1882 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1883 cancel_lru_locks osc
1885 # created but no setattr or write to the file.
# NOTE(review): the creation of directory d1 is not among the lines visible
# here. 100 files are created in d1 through RUNAS after the chown.
1887 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/d1
1888 $RUNAS createmany -o $DIR/$tdir/d1/o 100 || error "create failed"
1890 echo "Inject failure stub to skip OST-object owner changing"
1891 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
# Only the chown of f0 runs while the fail_loc is set.
1892 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
1893 chown 1.1 $DIR/$tdir/f0
1894 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1896 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
1899 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1901 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1902 mdd.${MDT_DEV}.lfsck_layout |
1903 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1905 error "(2) unexpected status"
# The repaired_inconsistent_owner counter must be exactly 1.
1908 local repaired=$($SHOW_LAYOUT |
1909 awk '/^repaired_inconsistent_owner/ { print $2 }')
1910 [ $repaired -eq 1 ] ||
1911 error "(3) Fail to repair inconsistent owner: $repaired"
1913 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# LFSCK can repair multiple references: f0 and f1 are created while a
# failure stub is active on the MDS so that they share OST-objects with
# guard; after the layout LFSCK, writing f0 and f1 must leave the size of
# guard unchanged.
1917 echo "If more than one MDT-objects reference the same OST-object,"
1918 echo "and the OST-object only recognizes one MDT-object, then the"
1919 echo "LFSCK should create new OST-objects for such non-recognized"
1923 check_mount_and_prep
1924 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1926 echo "Inject failure stub to make two MDT-objects to refernce"
1927 echo "the OST-object"
1929 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
1930 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
# guard is written first (1M); mdc and osc locks are cancelled after each
# creation step below.
1931 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1932 cancel_lru_locks mdc
1933 cancel_lru_locks osc
1935 createmany -o $DIR/$tdir/f 1
1936 cancel_lru_locks mdc
1937 cancel_lru_locks osc
1939 $LFS setstripe -E 2M -S 256K -o 0 -E 4M -S 512K -o 0 \
1941 error "(0) Fail to create PFL $DIR/$tdir/f1"
1942 cancel_lru_locks mdc
1943 cancel_lru_locks osc
1944 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1946 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
1947 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard use the same OST-objects"
# Before the repair both f0 and f1 must show the 1048576 bytes written to
# guard, taken from the fifth column of the ls -l output.
1948 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
1949 [ $size -eq 1048576 ] ||
1950 error "(1.1) f0 (wrong) size should be 1048576, but got $size"
1952 size=$(ls -l $DIR/$tdir/f1 | awk '{ print $5 }')
1953 [ $size -eq 1048576 ] ||
1954 error "(1.2) f1 (wrong) size should be 1048576, but got $size"
1956 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
1959 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1961 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1962 mdd.${MDT_DEV}.lfsck_layout |
1963 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1965 error "(3) unexpected status"
# The repaired_multiple_referenced counter must be exactly 2.
1968 local repaired=$($SHOW_LAYOUT |
1969 awk '/^repaired_multiple_referenced/ { print $2 }')
1970 [ $repaired -eq 2 ] ||
1971 error "(4) Fail to repair multiple references: $repaired"
1973 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
# Writing 2M to f0 must not change the 1048576 byte size of guard.
1974 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
1975 error "(5) Fail to write f0."
1976 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1977 [ $size -eq 1048576 ] ||
1978 error "(6) guard size should be 1048576, but got $size"
1980 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard should use diff OST-objects"
# Writing 2M to f1 must not change the 1048576 byte size of guard either.
1981 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2 ||
1982 error "(7) Fail to write f1."
1983 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1984 [ $size -eq 1048576 ] ||
1985 error "(8) guard size should be 1048576, but got $size"
1987 run_test 17 "LFSCK can repair multiple references"
# Add the cache flag to the debug parameter on the local node; the output
# of the command is discarded.
1989 $LCTL set_param debug=+cache > /dev/null
# Test 18a: the MDT-object still exists but its layout EA (stripe
# information) is lost. Layout LFSCK with orphan handling (-o) is expected
# to regenerate the layout so that the file sizes are correct again.
# NOTE(review): short lines (function header, fi/done, closing brace, and
# possibly a few commands) are not visible in this listing.
1993 echo "The target MDT-object is there, but related stripe information"
1994 echo "is lost or partly lost. The LFSCK should regenerate the missing"
1995 echo "layout EA entries."
# a1/f1: directory created with MDT index 0, single stripe starting at OST
# index 0, 2MB of data.
1998 check_mount_and_prep
1999 $LFS mkdir -i 0 $DIR/$tdir/a1
2000 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2001 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# With "ls -il" the inode number comes first, so the size is field 6.
2003 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2005 $LFS path2fid $DIR/$tdir/a1/f1
2006 $LFS getstripe $DIR/$tdir/a1/f1
# a2/f2 (only with >= 2 MDTs): directory on MDT index 1, two 1MB stripes.
2008 if [ $MDSCOUNT -ge 2 ]; then
2009 $LFS mkdir -i 1 $DIR/$tdir/a2
2010 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2011 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2012 $LFS path2fid $DIR/$tdir/a2/f2
2013 $LFS getstripe $DIR/$tdir/a2/f2
# f3: PFL file with two components ([0, 1M) and [1M, EOF)).
2016 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2017 error "(0) Fail to create PFL $DIR/$tdir/f3"
2019 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2021 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2023 $LFS path2fid $DIR/$tdir/f3
2024 $LFS getstripe $DIR/$tdir/f3
2026 cancel_lru_locks osc
# With fail_loc 0x1615 set on the MDS, the chown calls below are the
# operations that trigger the layout EA loss.
2028 echo "Inject failure, to make the MDT-object lost its layout EA"
2029 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
2030 do_facet mds1 $LCTL set_param fail_loc=0x1615
2031 chown 1.1 $DIR/$tdir/a1/f1
2033 if [ $MDSCOUNT -ge 2 ]; then
2034 do_facet mds2 $LCTL set_param fail_loc=0x1615
2035 chown 1.1 $DIR/$tdir/a2/f2
2038 chown 1.1 $DIR/$tdir/f3
# Clear the fault injection and drop cached locks so sizes are re-fetched.
2043 do_facet mds1 $LCTL set_param fail_loc=0
2044 if [ $MDSCOUNT -ge 2 ]; then
2045 do_facet mds2 $LCTL set_param fail_loc=0
2048 cancel_lru_locks mdc
2049 cancel_lru_locks osc
2051 echo "The file size should be incorrect since layout EA is lost"
2052 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2053 [ "$cur_size" != "$saved_size1" ] ||
2054 error "(1) Expect incorrect file1 size"
# f2 is compared against saved_size1: f1 and f2 were both written with the
# same dd parameters (bs=1M count=2).
2056 if [ $MDSCOUNT -ge 2 ]; then
2057 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2058 [ "$cur_size" != "$saved_size1" ] ||
2059 error "(2) Expect incorrect file2 size"
2062 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2063 [ "$cur_size" != "$saved_size2" ] ||
2064 error "(1.2) Expect incorrect file3 size"
2066 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2067 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
2069 for k in $(seq $MDSCOUNT); do
2070 # The LFSCK status query internal is 30 seconds. For the case
2071 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2072 # time to guarantee the status sync up.
2073 wait_update_facet mds${k} "$LCTL get_param -n \
2074 mdd.$(facet_svc mds${k}).lfsck_layout |
2075 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2076 error "(4) MDS${k} is not the expected 'completed'"
# Every OST must also report a completed layout LFSCK.
2079 for k in $(seq $OSTCOUNT); do
2080 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2081 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2082 awk '/^status/ { print $2 }')
2083 [ "$cur_status" == "completed" ] ||
2084 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# Expected orphan repairs on mds1: 3 (presumably one object for f1 plus one
# per PFL component of f3 -- TODO confirm).
2087 local repaired=$(do_facet mds1 $LCTL get_param -n \
2088 mdd.$(facet_svc mds1).lfsck_layout |
2089 awk '/^repaired_orphan/ { print $2 }')
2090 [ $repaired -eq 3 ] ||
2091 error "(6.1) Expect 3 fixed on mds1, but got: $repaired"
2093 if [ $MDSCOUNT -ge 2 ]; then
2094 repaired=$(do_facet mds2 $LCTL get_param -n \
2095 mdd.$(facet_svc mds2).lfsck_layout |
2096 awk '/^repaired_orphan/ { print $2 }')
2097 [ $repaired -eq 2 ] ||
2098 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
# Print FID and layout after the repair for the test log.
2101 $LFS path2fid $DIR/$tdir/a1/f1
2102 $LFS getstripe $DIR/$tdir/a1/f1
2104 if [ $MDSCOUNT -ge 2 ]; then
2105 $LFS path2fid $DIR/$tdir/a2/f2
2106 $LFS getstripe $DIR/$tdir/a2/f2
2109 $LFS path2fid $DIR/$tdir/f3
2110 $LFS getstripe $DIR/$tdir/f3
2112 echo "The file size should be correct after layout LFSCK scanning"
2113 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2114 [ "$cur_size" == "$saved_size1" ] ||
2115 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2117 if [ $MDSCOUNT -ge 2 ]; then
2118 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2119 [ "$cur_size" == "$saved_size1" ] ||
2120 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# This check is on f3, so the failure message names file3 (it used to say
# file1, which pointed at the wrong file when the check failed).
2123 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2124 [ "$cur_size" == "$saved_size2" ] ||
2125 error "(9) Expect file3 size $saved_size2, but got $cur_size"
2127 run_test 18a "Find out orphan OST-object and repair it (1)"
# Test 18b: the MDT-object itself is lost. Layout LFSCK with orphan handling
# is expected to re-create it under .lustre/lost+found/MDTxxxx as
# <fid>-R-0; the test then moves the files back and checks their sizes.
# NOTE(review): short lines (function header, fi/done, closing brace, and
# possibly a few commands) are not visible in this listing.
2131 echo "The target MDT-object is lost. The LFSCK should re-create the"
2132 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
2133 echo "can move it back to normal namespace manually."
# a1/f1: directory on MDT index 0, one stripe from OST index 0; remember
# size and FID so the recovered object can be located later.
2136 check_mount_and_prep
2137 $LFS mkdir -i 0 $DIR/$tdir/a1
2138 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2139 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2140 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2141 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2143 $LFS getstripe $DIR/$tdir/a1/f1
# a2/f2 (only with >= 2 MDTs): directory on MDT index 1, two 1MB stripes.
2145 if [ $MDSCOUNT -ge 2 ]; then
2146 $LFS mkdir -i 1 $DIR/$tdir/a2
2147 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2148 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2149 fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
2151 $LFS getstripe $DIR/$tdir/a2/f2
# f3: two-component PFL file.
2154 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2155 error "(0) Fail to create PFL $DIR/$tdir/f3"
2157 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2159 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2160 local fid3=$($LFS path2fid $DIR/$tdir/f3)
2162 $LFS getstripe $DIR/$tdir/f3
2164 cancel_lru_locks osc
# With fail_loc 0x1616 set on the MDS, "rm" is the operation that leaves the
# OST-objects behind. NOTE(review): the removal of f3 is not visible in this
# listing but is implied by the later lost+found lookup for fid3.
2166 echo "Inject failure, to simulate the case of missing the MDT-object"
2167 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2168 do_facet mds1 $LCTL set_param fail_loc=0x1616
2169 rm -f $DIR/$tdir/a1/f1
2171 if [ $MDSCOUNT -ge 2 ]; then
2172 do_facet mds2 $LCTL set_param fail_loc=0x1616
2173 rm -f $DIR/$tdir/a2/f2
2181 do_facet mds1 $LCTL set_param fail_loc=0
2182 if [ $MDSCOUNT -ge 2 ]; then
2183 do_facet mds2 $LCTL set_param fail_loc=0
2186 cancel_lru_locks mdc
2187 cancel_lru_locks osc
2189 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2190 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2192 for k in $(seq $MDSCOUNT); do
2193 # The LFSCK status query internal is 30 seconds. For the case
2194 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2195 # time to guarantee the status sync up.
2196 wait_update_facet mds${k} "$LCTL get_param -n \
2197 mdd.$(facet_svc mds${k}).lfsck_layout |
2198 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2199 error "(2) MDS${k} is not the expected 'completed'"
# Every OST must also report a completed layout LFSCK.
2202 for k in $(seq $OSTCOUNT); do
2203 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2204 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2205 awk '/^status/ { print $2 }')
2206 [ "$cur_status" == "completed" ] ||
2207 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan counters: 3 on mds1, 2 on mds2.
2210 local repaired=$(do_facet mds1 $LCTL get_param -n \
2211 mdd.$(facet_svc mds1).lfsck_layout |
2212 awk '/^repaired_orphan/ { print $2 }')
2213 [ $repaired -eq 3 ] ||
2214 error "(4.1) Expect 3 fixed on mds1, but got: $repaired"
2216 if [ $MDSCOUNT -ge 2 ]; then
2217 repaired=$(do_facet mds2 $LCTL get_param -n \
2218 mdd.$(facet_svc mds2).lfsck_layout |
2219 awk '/^repaired_orphan/ { print $2 }')
2220 [ $repaired -eq 2 ] ||
2221 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
# Recovered objects are looked up as <fid>-R-0 under the lost+found
# directory of the MDT that owned the original file.
2224 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2225 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2226 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2228 if [ $MDSCOUNT -ge 2 ]; then
2229 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
2230 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
# Uses its own label "(6.1)": this step previously reused "(5)", making a
# failure here indistinguishable from a failure to move f1.
2233 mv $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0 $DIR/$tdir/f3 ||
2234 error "(6.1) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
# Print FID and layout of the restored files for the test log.
2236 $LFS path2fid $DIR/$tdir/a1/f1
2237 $LFS getstripe $DIR/$tdir/a1/f1
2239 if [ $MDSCOUNT -ge 2 ]; then
2240 $LFS path2fid $DIR/$tdir/a2/f2
2241 $LFS getstripe $DIR/$tdir/a2/f2
2244 $LFS path2fid $DIR/$tdir/f3
2245 $LFS getstripe $DIR/$tdir/f3
2247 echo "The file size should be correct after layout LFSCK scanning"
2248 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2249 [ "$cur_size" == "$saved_size1" ] ||
2250 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2252 if [ $MDSCOUNT -ge 2 ]; then
2253 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2254 [ "$cur_size" == "$saved_size1" ] ||
2255 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# This check is on f3, so the failure message names file3 (was "file1").
2258 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2259 [ "$cur_size" == "$saved_size2" ] ||
2260 error "(9) Expect file3 size $saved_size2, but got $cur_size"
2262 run_test 18b "Find out orphan OST-object and repair it (2)"
# Test 18c: the MDT-object is lost AND the OST-objects carry no parent FID.
# Layout LFSCK is expected to re-create MDT-objects with new FIDs; the test
# looks for "*-N-*" entries under .lustre/lost+found/MDT0000 and expects
# none under MDT0001.
# NOTE(review): short lines (function header, fi/done/else, closing brace,
# the assignments of $expected, and possibly a few commands) are not visible
# in this listing.
2266 echo "The target MDT-object is lost, and the OST-object FID is missing."
2267 echo "The LFSCK should re-create the MDT-object with new FID under the "
2268 echo "directory .lustre/lost+found/MDTxxxx."
2271 check_mount_and_prep
2272 $LFS mkdir -i 0 $DIR/$tdir/a1
2273 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
# fail_loc 0x1617 is set on all OST nodes while the files are written and
# cleared afterwards.
2275 echo "Inject failure, to simulate the case of missing parent FID"
2276 #define OBD_FAIL_LFSCK_NOPFID 0x1617
2277 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1617
2279 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2280 $LFS getstripe $DIR/$tdir/a1/f1
# a2/f2 (only with >= 2 MDTs): directory on MDT index 1, one stripe from
# OST index 0.
2282 if [ $MDSCOUNT -ge 2 ]; then
2283 $LFS mkdir -i 1 $DIR/$tdir/a2
2284 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a2
2285 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2286 $LFS getstripe $DIR/$tdir/a2/f2
# f3: two-component PFL file.
2289 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2290 error "(0) Fail to create PFL $DIR/$tdir/f3"
2292 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2293 $LFS getstripe $DIR/$tdir/f3
2295 cancel_lru_locks osc
2296 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
# Second injection: with fail_loc 0x1616 on the MDS, "rm" leaves the
# OST-objects behind as orphans.
2298 echo "Inject failure, to simulate the case of missing the MDT-object"
2299 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2300 do_facet mds1 $LCTL set_param fail_loc=0x1616
2301 rm -f $DIR/$tdir/a1/f1
2303 if [ $MDSCOUNT -ge 2 ]; then
2304 do_facet mds2 $LCTL set_param fail_loc=0x1616
2305 rm -f $DIR/$tdir/a2/f2
2313 do_facet mds1 $LCTL set_param fail_loc=0
2314 if [ $MDSCOUNT -ge 2 ]; then
2315 do_facet mds2 $LCTL set_param fail_loc=0
2318 cancel_lru_locks mdc
2319 cancel_lru_locks osc
2321 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2322 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2324 for k in $(seq $MDSCOUNT); do
2325 # The LFSCK status query internal is 30 seconds. For the case
2326 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2327 # time to guarantee the status sync up.
2328 wait_update_facet mds${k} "$LCTL get_param -n \
2329 mdd.$(facet_svc mds${k}).lfsck_layout |
2330 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2331 error "(2) MDS${k} is not the expected 'completed'"
# Every OST must also report a completed layout LFSCK.
2334 for k in $(seq $OSTCOUNT); do
2335 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2336 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2337 awk '/^status/ { print $2 }')
2338 [ "$cur_status" == "completed" ] ||
2339 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# $expected depends on MDSCOUNT; its assignments are on lines not shown here.
2342 if [ $MDSCOUNT -ge 2 ]; then
2348 local repaired=$(do_facet mds1 $LCTL get_param -n \
2349 mdd.$(facet_svc mds1).lfsck_layout |
2350 awk '/^repaired_orphan/ { print $2 }')
2351 [ $repaired -eq $expected ] ||
2352 error "(4) Expect $expected fixed on mds1, but got: $repaired"
2354 if [ $MDSCOUNT -ge 2 ]; then
2355 repaired=$(do_facet mds2 $LCTL get_param -n \
2356 mdd.$(facet_svc mds2).lfsck_layout |
2357 awk '/^repaired_orphan/ { print $2 }')
2358 [ $repaired -eq 0 ] ||
2359 error "(5) Expect 0 fixed on mds2, but got: $repaired"
2362 ls -ail $MOUNT/.lustre/lost+found/
# The -name pattern is quoted so that find receives the glob itself; when
# unquoted, the shell would expand it first if any file in the current
# working directory happened to match "*-N-*".
2364 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
2365 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
2366 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name "*-N-*")
2368 error "(6) .lustre/lost+found/MDT0001/ should be empty"
2371 echo "There should be some stub under .lustre/lost+found/MDT0000/"
2372 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2373 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
2375 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-N-*")
2376 [ ! -z "$cname" ] ||
2377 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2379 run_test 18c "Find out orphan OST-object and repair it (3)"
# Test 18d: the layout EA of f2/f4 is redirected to another file's
# OST-object, which is then destroyed, while the original OST-objects stay
# alive as orphans. Layout LFSCK (-r -o -c -d) is expected to re-attach the
# orphans (repaired_orphan == 2) without counting any dangling repair.
# NOTE(review): short lines (function header, fi/done, closing brace, and
# possibly a few commands) are not visible in this listing.
2383 echo "The target MDT-object layout EA is corrupted, but the right"
2384 echo "OST-object is still alive as orphan. The layout LFSCK will"
2385 echo "not create new OST-object to occupy such slot."
2388 check_mount_and_prep
# f1/f2/f3 use the directory default (one stripe from OST index 0); f4 is a
# two-component PFL file. NOTE(review): creation of $DIR/$tdir/a1 itself is
# not visible in this listing.
2390 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2391 echo "guard" > $DIR/$tdir/a1/f1
2392 echo "foo" > $DIR/$tdir/a1/f2
2394 echo "guard" > $DIR/$tdir/a1/f3
2395 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2396 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2397 echo "foo" > $DIR/$tdir/a1/f4
# Sizes of the files that will be corrupted (field 6 of "ls -il").
2399 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2400 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2401 $LFS path2fid $DIR/$tdir/a1/f1
2402 $LFS getstripe $DIR/$tdir/a1/f1
2403 $LFS path2fid $DIR/$tdir/a1/f2
2404 $LFS getstripe $DIR/$tdir/a1/f2
2405 $LFS path2fid $DIR/$tdir/a1/f3
2406 $LFS getstripe $DIR/$tdir/a1/f3
2407 $LFS path2fid $DIR/$tdir/a1/f4
2408 $LFS getstripe $DIR/$tdir/a1/f4
2409 cancel_lru_locks osc
2411 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2412 echo "to reference the same OST-object (which is f1's OST-obejct)."
2413 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2414 echo "dangling reference case, but f2's old OST-object is there."
2416 echo "The failure also makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2417 echo "to reference the same OST-object (which is f3's OST-obejct)."
2418 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2419 echo "dangling reference case, but f4's old OST-object is there."
# With fail_loc 0x1618 set, chown on f2/f4 is the triggering operation;
# f1/f3 are then removed.
2422 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2423 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2424 chown 1.1 $DIR/$tdir/a1/f2
2425 chown 1.1 $DIR/$tdir/a1/f4
2426 rm -f $DIR/$tdir/a1/f1
2427 rm -f $DIR/$tdir/a1/f3
2430 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# NOTE(review): the matching stopall call is not visible in this listing.
2432 echo "stopall to cleanup object cache"
2435 setupall > /dev/null
2437 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2438 $START_LAYOUT -r -o -c -d || error "(2) Fail to start LFSCK for layout!"
2440 for k in $(seq $MDSCOUNT); do
2441 # The LFSCK status query internal is 30 seconds. For the case
2442 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2443 # time to guarantee the status sync up.
2444 wait_update_facet mds${k} "$LCTL get_param -n \
2445 mdd.$(facet_svc mds${k}).lfsck_layout |
2446 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2447 error "(3) MDS${k} is not the expected 'completed'"
# Every OST must also report a completed layout LFSCK.
2450 for k in $(seq $OSTCOUNT); do
2451 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2452 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2453 awk '/^status/ { print $2 }')
2454 [ "$cur_status" == "completed" ] ||
2455 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
# Two orphans re-attached, zero dangling references repaired.
2458 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2459 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2460 awk '/^repaired_orphan/ { print $2 }')
2461 [ $repaired -eq 2 ] ||
2462 error "(5) Expect 2 orphans have been fixed, but got: $repaired"
2464 repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2465 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2466 awk '/^repaired_dangling/ { print $2 }')
2467 [ $repaired -eq 0 ] ||
2468 error "(6) Expect 0 dangling has been fixed, but got: $repaired"
2470 echo "The file size should be correct after layout LFSCK scanning"
2471 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2472 [ "$cur_size" == "$saved_size1" ] ||
2473 error "(7) Expect file2 size $saved_size1, but got $cur_size"
2475 cur_size=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2476 [ "$cur_size" == "$saved_size2" ] ||
2477 error "(8) Expect file4 size $saved_size2, but got $cur_size"
# Dump content, FID and layout of the repaired files for the test log.
2479 echo "The LFSCK should find back the original data."
2480 cat $DIR/$tdir/a1/f2
2481 $LFS path2fid $DIR/$tdir/a1/f2
2482 $LFS getstripe $DIR/$tdir/a1/f2
2483 cat $DIR/$tdir/a1/f4
2484 $LFS path2fid $DIR/$tdir/a1/f4
2485 $LFS getstripe $DIR/$tdir/a1/f4
2487 run_test 18d "Find out orphan OST-object and repair it (4)"
# Test 18e: same corruption as 18d, but new data is written to f2/f4 while
# LFSCK is held in scanning-phase2, so the layout slot is occupied by a
# modified OST-object. The old orphan objects are then expected to show up
# as "*-C-*" stub files under .lustre/lost+found/MDT0000.
# NOTE(review): short lines (function header, fi/done, closing brace, and
# possibly a few commands) are not visible in this listing.
2491 echo "The target MDT-object layout EA slot is occpuied by some new"
2492 echo "created OST-object when repair dangling reference case. Such"
2493 echo "conflict OST-object has been modified by others. To keep the"
2494 echo "new data, the LFSCK will create a new file to refernece this"
2495 echo "old orphan OST-object."
2498 check_mount_and_prep
# f1/f2/f3 use the directory default (one stripe from OST index 0); f4 is a
# two-component PFL file. NOTE(review): creation of $DIR/$tdir/a1 itself is
# not visible in this listing.
2500 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2501 echo "guard" > $DIR/$tdir/a1/f1
2502 echo "foo" > $DIR/$tdir/a1/f2
2504 echo "guard" > $DIR/$tdir/a1/f3
2505 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2506 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2507 echo "foo" > $DIR/$tdir/a1/f4
# Original sizes of f2/f4 (field 6 of "ls -il"); the stubs are compared
# against these later.
2509 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2510 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2512 $LFS path2fid $DIR/$tdir/a1/f1
2513 $LFS getstripe $DIR/$tdir/a1/f1
2514 $LFS path2fid $DIR/$tdir/a1/f2
2515 $LFS getstripe $DIR/$tdir/a1/f2
2516 $LFS path2fid $DIR/$tdir/a1/f3
2517 $LFS getstripe $DIR/$tdir/a1/f3
2518 $LFS path2fid $DIR/$tdir/a1/f4
2519 $LFS getstripe $DIR/$tdir/a1/f4
2520 cancel_lru_locks osc
2522 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2523 echo "to reference the same OST-object (which is f1's OST-obejct)."
2524 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2525 echo "dangling reference case, but f2's old OST-object is there."
2527 echo "Also the failure makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2528 echo "to reference the same OST-object (which is f3's OST-obejct)."
2529 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2530 echo "dangling reference case, but f4's old OST-object is there."
# With fail_loc 0x1618 set, chown on f2/f4 is the triggering operation;
# f1/f3 are then removed.
2533 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2534 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2535 chown 1.1 $DIR/$tdir/a1/f2
2536 chown 1.1 $DIR/$tdir/a1/f4
2537 rm -f $DIR/$tdir/a1/f1
2538 rm -f $DIR/$tdir/a1/f3
2541 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# NOTE(review): the matching stopall call is not visible in this listing.
2543 echo "stopall to cleanup object cache"
2546 setupall > /dev/null
# fail_loc 0x1602 with fail_val=10 delays LFSCK so the test can write to
# f2/f4 while the scan is in phase 2.
2548 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2549 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
2551 start_full_debug_logging
2553 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2554 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
2556 wait_update_facet mds1 "$LCTL get_param -n \
2557 mdd.$(facet_svc mds1).lfsck_layout |
2558 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2559 error "(3) MDS1 is not the expected 'scanning-phase2'"
2561 # to guarantee all updates are synced.
2565 echo "Write new data to f2/f4 to modify the new created OST-object."
2566 echo "dummy" >> $DIR/$tdir/a1/f2 || error "write a1/f2 failed"
2567 echo "dummy" >> $DIR/$tdir/a1/f4 || error "write a1/f4 failed"
# Release the delay so LFSCK can finish.
2569 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
2571 for k in $(seq $MDSCOUNT); do
2572 # The LFSCK status query internal is 30 seconds. For the case
2573 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2574 # time to guarantee the status sync up.
2575 wait_update_facet mds${k} "$LCTL get_param -n \
2576 mdd.$(facet_svc mds${k}).lfsck_layout |
2577 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2578 error "(4) MDS${k} is not the expected 'completed'"
# Every OST must also report a completed layout LFSCK.
2581 for k in $(seq $OSTCOUNT); do
2582 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2583 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2584 awk '/^status/ { print $2 }')
2585 [ "$cur_status" == "completed" ] ||
2586 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2589 stop_full_debug_logging
2591 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2592 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2593 awk '/^repaired_orphan/ { print $2 }')
2594 [ $repaired -eq 2 ] ||
2595 error "(6) Expect 2 orphans have been fixed, but got: $repaired"
2597 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2598 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2599 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# Here the glob is intentionally expanded by the shell: count the stubs.
2601 local count=$(ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-* | wc -l)
2602 if [ $count -ne 2 ]; then
2603 ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-*
2604 error "(8) Expect 2 stubs under lost+found, but got $count"
# The -name pattern is quoted so that find receives the glob itself; when
# unquoted, the shell would expand it first if any file in the current
# working directory happened to match "*-C-*". The first and the last match
# are each checked to have one of the original f2/f4 sizes.
2607 echo "The stub file should keep the original f2 or f4 data"
2608 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-C-*" | head -n 1)
2609 local cur_size=$(ls -il $cname | awk '{ print $6 }')
2610 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2611 error "(9) Got unexpected $cur_size"
2614 $LFS path2fid $cname
2615 $LFS getstripe $cname
2617 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-C-*" | tail -n 1)
2618 cur_size=$(ls -il $cname | awk '{ print $6 }')
2619 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2620 error "(10) Got unexpected $cur_size"
2623 $LFS path2fid $cname
2624 $LFS getstripe $cname
# Dump content, FID and layout of f2/f4 for the test log.
2626 echo "The f2/f4 should contains new data."
2627 cat $DIR/$tdir/a1/f2
2628 $LFS path2fid $DIR/$tdir/a1/f2
2629 $LFS getstripe $DIR/$tdir/a1/f2
2630 cat $DIR/$tdir/a1/f4
2631 $LFS path2fid $DIR/$tdir/a1/f4
2632 $LFS getstripe $DIR/$tdir/a1/f4
2634 run_test 18e "Find out orphan OST-object and repair it (5)"
# Test 18f: MDT-objects are lost and, during the first LFSCK run, fail_loc
# 0x161c is set on mds1. That run is expected to end as 'partial' on the
# MDTs and on ost1 while ost2 completes, repairing only one orphan per MDT.
# A second run without the fault is expected to complete everywhere and
# repair two orphans per MDT.
# NOTE(review): short lines (function header, fi/done, closing braces, and
# possibly a few commands) are not visible in this listing.
# Requires at least two OSTs.
2637 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
2640 echo "The target MDT-object is lost. The LFSCK should re-create the"
2641 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2642 echo "to verify some OST-object(s) during the first stage-scanning,"
2643 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2644 echo "should not be affected."
# MDT index 0: a1 holds single-stripe files (guard, f1), a2 holds a
# two-stripe file (f2); all layouts start at OST index 0.
2647 check_mount_and_prep
2648 $LFS mkdir -i 0 $DIR/$tdir/a1
2649 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2650 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2651 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2652 $LFS mkdir -i 0 $DIR/$tdir/a2
2653 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
2654 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2655 $LFS getstripe $DIR/$tdir/a1/f1
2656 $LFS getstripe $DIR/$tdir/a2/f2
# Same file set on MDT index 1 (a3, a4) when at least two MDTs exist.
2658 if [ $MDSCOUNT -ge 2 ]; then
2659 $LFS mkdir -i 1 $DIR/$tdir/a3
2660 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a3
2661 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2662 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2663 $LFS mkdir -i 1 $DIR/$tdir/a4
2664 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
2665 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2666 $LFS getstripe $DIR/$tdir/a3/f3
2667 $LFS getstripe $DIR/$tdir/a4/f4
2670 cancel_lru_locks osc
# With fail_loc 0x1616 set on the MDS, "rm" leaves the OST-objects behind.
2672 echo "Inject failure, to simulate the case of missing the MDT-object"
2673 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2674 do_facet mds1 $LCTL set_param fail_loc=0x1616
2675 rm -f $DIR/$tdir/a1/f1
2676 rm -f $DIR/$tdir/a2/f2
2678 if [ $MDSCOUNT -ge 2 ]; then
2679 do_facet mds2 $LCTL set_param fail_loc=0x1616
2680 rm -f $DIR/$tdir/a3/f3
2681 rm -f $DIR/$tdir/a4/f4
2687 do_facet mds1 $LCTL set_param fail_loc=0
2688 if [ $MDSCOUNT -ge 2 ]; then
2689 do_facet mds2 $LCTL set_param fail_loc=0
2692 cancel_lru_locks mdc
2693 cancel_lru_locks osc
# fail_val=0 presumably selects OST index 0 as the failing target, per the
# echo text below -- TODO confirm against the fail_loc implementation.
2695 echo "Inject failure, to simulate the OST0 fail to handle"
2696 echo "MDT0 LFSCK request during the first-stage scanning."
2697 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
2698 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
2700 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2701 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2703 for k in $(seq $MDSCOUNT); do
2704 # The LFSCK status query internal is 30 seconds. For the case
2705 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2706 # time to guarantee the status sync up.
2707 wait_update_facet mds${k} "$LCTL get_param -n \
2708 mdd.$(facet_svc mds${k}).lfsck_layout |
2709 awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
2710 error "(2) MDS${k} is not the expected 'partial'"
# ost1 is expected to end as 'partial', ost2 as 'completed'.
2713 wait_update_facet ost1 "$LCTL get_param -n \
2714 obdfilter.$(facet_svc ost1).lfsck_layout |
2715 awk '/^status/ { print \\\$2 }'" "partial" $LTIME || {
2716 error "(3) OST1 is not the expected 'partial'"
2719 wait_update_facet ost2 "$LCTL get_param -n \
2720 obdfilter.$(facet_svc ost2).lfsck_layout |
2721 awk '/^status/ { print \\\$2 }'" "completed" $LTIME || {
2722 error "(4) OST2 is not the expected 'completed'"
2725 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
# After the partial run only one orphan per MDT is expected to be repaired.
2727 local repaired=$(do_facet mds1 $LCTL get_param -n \
2728 mdd.$(facet_svc mds1).lfsck_layout |
2729 awk '/^repaired_orphan/ { print $2 }')
2730 [ $repaired -eq 1 ] ||
2731 error "(5) Expect 1 fixed on mds{1}, but got: $repaired"
2733 if [ $MDSCOUNT -ge 2 ]; then
2734 repaired=$(do_facet mds2 $LCTL get_param -n \
2735 mdd.$(facet_svc mds2).lfsck_layout |
2736 awk '/^repaired_orphan/ { print $2 }')
2737 [ $repaired -eq 1 ] ||
2738 error "(6) Expect 1 fixed on mds{2}, but got: $repaired"
# Second run with no fault injected.
2741 echo "Trigger layout LFSCK on all devices again to cleanup"
2742 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
2744 for k in $(seq $MDSCOUNT); do
2745 # The LFSCK status query internal is 30 seconds. For the case
2746 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2747 # time to guarantee the status sync up.
2748 wait_update_facet mds${k} "$LCTL get_param -n \
2749 mdd.$(facet_svc mds${k}).lfsck_layout |
2750 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2751 error "(8) MDS${k} is not the expected 'completed'"
# NOTE(review): cur_status has no visible 'local' declaration in this test.
2754 for k in $(seq $OSTCOUNT); do
2755 cur_status=$(do_facet ost${k} $LCTL get_param -n \
2756 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2757 awk '/^status/ { print $2 }')
2758 [ "$cur_status" == "completed" ] ||
2759 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
# The second run is expected to report two repaired orphans per MDT.
2763 local repaired=$(do_facet mds1 $LCTL get_param -n \
2764 mdd.$(facet_svc mds1).lfsck_layout |
2765 awk '/^repaired_orphan/ { print $2 }')
2766 [ $repaired -eq 2 ] ||
2767 error "(10) Expect 2 fixed on mds{1}, but got: $repaired"
2769 if [ $MDSCOUNT -ge 2 ]; then
2770 repaired=$(do_facet mds2 $LCTL get_param -n \
2771 mdd.$(facet_svc mds2).lfsck_layout |
2772 awk '/^repaired_orphan/ { print $2 }')
2773 [ $repaired -eq 2 ] ||
2774 error "(11) Expect 2 fixed on mds{2}, but got: $repaired"
2777 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# Test 18g: the MDT-object is lost while its OI mapping remains (fail_loc
# 0x162e). Layout LFSCK is expected to re-create the object as <fid>-R-0
# under .lustre/lost+found/MDT0000 and report $OSTCOUNT repaired orphans.
# NOTE(review): short lines (function header, done, closing brace, and
# possibly a few commands) are not visible in this listing.
2781 echo "The target MDT-object is lost, but related OI mapping is there"
2782 echo "The LFSCK should recreate the lost MDT-object without affected"
2783 echo "by the stale OI mapping."
# f1 is striped over all OSTs (-c -1) with 1MB stripes and written with
# $OSTCOUNT MB of data; its FID is saved to find the recovered object.
2786 check_mount_and_prep
2787 $LFS mkdir -i 0 $DIR/$tdir/a1
2788 $LFS setstripe -c -1 -i 0 -S 1M $DIR/$tdir/a1
2789 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=$OSTCOUNT
2790 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2792 $LFS getstripe $DIR/$tdir/a1/f1
2793 cancel_lru_locks osc
2795 echo "Inject failure to simulate lost MDT-object but keep OI mapping"
2796 #define OBD_FAIL_LFSCK_LOST_MDTOBJ2 0x162e
2797 do_facet mds1 $LCTL set_param fail_loc=0x162e
2798 rm -f $DIR/$tdir/a1/f1
2800 do_facet mds1 $LCTL set_param fail_loc=0
2801 cancel_lru_locks mdc
2802 cancel_lru_locks osc
2804 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2805 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2807 for k in $(seq $MDSCOUNT); do
2808 # The LFSCK status query internal is 30 seconds. For the case
2809 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2810 # time to guarantee the status sync up.
2811 wait_update_facet mds${k} "$LCTL get_param -n \
2812 mdd.$(facet_svc mds${k}).lfsck_layout |
2813 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2814 error "(2) MDS${k} is not the expected 'completed'"
# Every OST must also report a completed layout LFSCK.
2817 for k in $(seq $OSTCOUNT); do
2818 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2819 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2820 awk '/^status/ { print $2 }')
2821 [ "$cur_status" == "completed" ] ||
2822 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# The expected repaired_orphan count equals the number of OSTs.
2825 local repaired=$(do_facet mds1 $LCTL get_param -n \
2826 mdd.$(facet_svc mds1).lfsck_layout |
2827 awk '/^repaired_orphan/ { print $2 }')
2828 [ $repaired -eq $OSTCOUNT ] ||
2829 error "(4) Expect $OSTCOUNT fixed, but got: $repaired"
2831 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2832 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2833 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
# Print FID and layout of the restored file for the test log.
2835 $LFS path2fid $DIR/$tdir/a1/f1
2836 $LFS getstripe $DIR/$tdir/a1/f1
2838 run_test 18g "Find out orphan OST-object and repair it (7)"
# Test 18h: a PFL file's extent range is corrupted (fail_loc 0x162f). A
# write to the bad file must fail; after layout LFSCK with orphan handling
# the data must match the guard copy and writes must succeed again.
# NOTE(review): short lines (function header, done, closing brace, and
# possibly a few commands) are not visible in this listing.
2842 echo "The PFL extent crashed. During the first cycle LFSCK scanning,"
2843 echo "the layout LFSCK will keep the bad PFL file(s) there without"
2844 echo "scanning its OST-object(s). Then in the second stage scanning,"
2845 echo "the OST will return related OST-object(s) to the MDT as orphan."
2846 echo "And then the LFSCK on the MDT can rebuild the PFL extent with"
2847 echo "the 'orphan(s)' stripe information."
2850 check_mount_and_prep
# f0: two components, [0, 2M) with one 1MB-striped object, then [2M, EOF).
2852 $LFS setstripe -E 2M -S 1M -c 1 -E -1 $DIR/$tdir/f0 ||
2853 error "(0) Fail to create PFL $DIR/$tdir/f0"
# Write test-framework.sh at offset 0 and again at offset 2MB (seek=2 with
# bs=1M); the second write lands at the start of the second component.
2855 cat $LUSTRE/tests/test-framework.sh > $DIR/$tdir/f0 ||
2856 error "(1.1) Fail to write $DIR/$tdir/f0"
2858 dd if=$LUSTRE/tests/test-framework.sh of=$DIR/$tdir/f0 bs=1M seek=2 ||
2859 error "(1.2) Fail to write $DIR/$tdir/f0"
# Reference copy used for the post-repair data comparison.
2861 cp $DIR/$tdir/f0 $DIR/$tdir/guard
2863 echo "Inject failure stub to simulate bad PFL extent range"
2864 #define OBD_FAIL_LFSCK_BAD_PFL_RANGE 0x162f
2865 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162f
# chown is the triggering operation while the fail_loc is set.
2867 chown 1.1 $DIR/$tdir/f0
2869 cancel_lru_locks mdc
2870 cancel_lru_locks osc
2871 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Negative check: the test fails if this write succeeds.
2873 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 &&
2874 error "(2) Write to bad PFL file should fail"
2876 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
2877 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
2879 for k in $(seq $MDSCOUNT); do
2880 # The LFSCK status query internal is 30 seconds. For the case
2881 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2882 # time to guarantee the status sync up.
2883 wait_update_facet mds${k} "$LCTL get_param -n \
2884 mdd.$(facet_svc mds${k}).lfsck_layout |
2885 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2886 error "(4.1) MDS${k} is not the expected 'completed'"
# NOTE(review): cur_status has no visible 'local' declaration in this test.
2889 for k in $(seq $OSTCOUNT); do
2890 cur_status=$(do_facet ost${k} $LCTL get_param -n \
2891 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2892 awk '/^status/ { print $2 }')
2893 [ "$cur_status" == "completed" ] ||
2894 error "(4.2) OST${k} Expect 'completed', but got '$cur_status'"
# $SHOW_LAYOUT is defined outside this block; presumably it prints the
# lfsck_layout status of the single MDS -- verify at its definition.
2898 local repaired=$($SHOW_LAYOUT |
2899 awk '/^repaired_orphan/ { print $2 }')
2900 [ $repaired -eq 2 ] ||
2901 error "(5) Fail to repair crashed PFL range: $repaired"
2903 echo "Data in $DIR/$tdir/f0 should not be broken"
2904 diff $DIR/$tdir/f0 $DIR/$tdir/guard ||
2905 error "(6) Data in $DIR/$tdir/f0 is broken"
2907 echo "Write should succeed after LFSCK repairing the bad PFL range"
2908 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 ||
2909 error "(7) Write should succeed after LFSCK"
2911 run_test 18h "LFSCK can repair crashed PFL extent range"
# Remove the 'cache' flag from the local lctl debug mask again (it was added
# before test 18a). Output is discarded.
2913 $LCTL set_param debug=-cache > /dev/null
# Test 19a: with lfsck_verify_pfid enabled on OST0000 and client-side
# fail_loc 0x1619 set, reads of a plain file (a0) and a PFL file (a1) must
# be denied.
# NOTE(review): short lines (function header, closing brace, and possibly a
# few commands) are not visible in this listing.
2916 check_mount_and_prep
2917 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# Parent-FID verification is switched off on OST0000 while the files are
# created.
2919 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2920 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
2922 echo "foo1" > $DIR/$tdir/a0
2923 $LFS setstripe -E 512K -S 512K -o 0 -E -1 -S 1M $DIR/$tdir/a1 ||
2924 error "(0) Fail to create PFL $DIR/$tdir/a1"
2925 echo "foo2" > $DIR/$tdir/a1
2926 echo "guard" > $DIR/$tdir/a2
2927 cancel_lru_locks osc
# Turn verification on, then set the fail_loc on the local node (plain
# $LCTL, not do_facet/do_nodes).
2929 echo "Inject failure, then client will offer wrong parent FID when read"
2930 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2931 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2933 #define OBD_FAIL_LFSCK_INVALID_PFID 0x1619
2934 $LCTL set_param fail_loc=0x1619
# Negative checks: the test fails if either read succeeds.
2936 echo "Read RPC with wrong parent FID should be denied"
2937 cat $DIR/$tdir/a0 && error "(3.1) Read a0 should be denied!"
2938 cat $DIR/$tdir/a1 && error "(3.2) Read a1 should be denied!"
2939 $LCTL set_param fail_loc=0
2941 run_test 19a "OST-object inconsistency self detect"
# Test 19b, setup: create f0 and the PFL file f1 while fail_loc 0x1611
# (OBD_FAIL_LFSCK_UNMATCHED_PAIR1) is armed on the OST nodes, then check that
# the "repaired" value read from lfsck_verify_pfid on ost1 is 0 before
# verification is switched on.
2944 check_mount_and_prep
2945 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2947 echo "Inject failure stub to make the OST-object to back point to"
2948 echo "non-exist MDT-object"
2950 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2951 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
2953 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
2954 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
2955 echo "foo1" > $DIR/$tdir/f0
2956 $LFS setstripe -E 1M -S 1M -o 0 -E 4M -S 256K $DIR/$tdir/f1 ||
2957 error "(0) Fail to create PFL $DIR/$tdir/f1"
2958 echo "foo2" > $DIR/$tdir/f1
2959 cancel_lru_locks osc
# The fail_loc is cleared once both files have been written.
2960 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
# Write 0 to lfsck_verify_pfid on ost1 again right before sampling it.
# NOTE(review): presumably this also resets the counters -- confirm.
2962 do_facet ost1 $LCTL set_param -n \
2963 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
2964 echo "Nothing should be fixed since self detect and repair is disabled"
# awk prints the second field of the line that starts with "repaired".
2965 local repaired=$(do_facet ost1 $LCTL get_param -n \
2966 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2967 awk '/^repaired/ { print $2 }')
2968 [ $repaired -eq 0 ] ||
2969 error "(1) Expected 0 repaired, but got $repaired"
2971 echo "Read RPC with right parent FID should be accepted,"
2972 echo "and cause parent FID on OST to be fixed"
# Set lfsck_verify_pfid to 1 on every OST node before the files are read.
2974 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2975 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
# Test 19b, verification: read both files (f0 and f1); each read must
# succeed. Then the "repaired" value reported by lfsck_verify_pfid on ost1
# must equal 2.
2977 cat $DIR/$tdir/f0 || error "(2.1) Read f0 should not be denied!"
2978 cat $DIR/$tdir/f1 || error "(2.2) Read f1 should not be denied!"
2980 repaired=$(do_facet ost1 $LCTL get_param -n \
2981 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2982 awk '/^repaired/ { print $2 }')
# Two files were read, so the condition tests for 2; the failure message
# names the same expected value so the log is not misleading.
2983 [ $repaired -eq 2 ] ||
2984 error "(3) Expected 2 repaired, but got $repaired"
2986 run_test 19b "OST-object inconsistency self repair"
# Strings that the output of `$LFS getstripe -L` is compared against in the
# tests below: the first where a pattern hole is expected, the second where
# it is not.
2988 PATTERN_WITH_HOLE="40000001"
2989 PATTERN_WITHOUT_HOLE="raid0"
# Test 20a, preparation: needs at least two OSTs. Creates directory a1 on
# MDT index 0 with a 3-stripe or 2-stripe default layout (1M stripe size)
# depending on OSTCOUNT, writes f0..f2 (and f3 when OSTCOUNT > 2), and
# records each file's FID for the lost+found lookups done later.
2992 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
2995 echo "The target MDT-object and some of its OST-object are lost."
2996 echo "The LFSCK should find out the left OST-objects and re-create"
2997 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
2998 echo "with the partial OST-objects (LOV EA hole)."
3000 echo "New client can access the file with LOV EA hole via normal"
3001 echo "system tools or commands without crash the system."
3003 echo "For old client, even though it cannot access the file with"
3004 echo "LOV EA hole, it should not cause the system crash."
3007 check_mount_and_prep
3008 $LFS mkdir -i 0 $DIR/$tdir/a1
3009 if [ $OSTCOUNT -gt 2 ]; then
3010 $LFS setstripe -c 3 -i 0 -S 1M $DIR/$tdir/a1
3013 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a1
# NOTE(review): $bcount is used by the dd calls below, but no assignment to
# it is visible in this span -- confirm where it is set.
3017 # 256 blocks on the stripe0.
3018 # 1 block on the stripe1 for 2 OSTs case.
3019 # 256 blocks on the stripe1 for other cases.
3020 # 1 block on the stripe2 if OSTs > 2
3021 dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
3022 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
3023 dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount
# The FIDs are captured now because the files are removed further down.
3025 local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
3026 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
3027 local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)
# getstripe output goes to the test log; its exit status is not checked.
3030 $LFS getstripe $DIR/$tdir/a1/f0
3032 $LFS getstripe $DIR/$tdir/a1/f1
3034 $LFS getstripe $DIR/$tdir/a1/f2
3036 if [ $OSTCOUNT -gt 2 ]; then
3037 dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
3038 fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
3040 $LFS getstripe $DIR/$tdir/a1/f3
3043 cancel_lru_locks osc
# Test 20a, fault injection: remove f0..f3 while different fail_loc/fail_val
# combinations are set on mds1, then unmount the client and clear both.
# Per the echo messages, 0x1616 simulates a lost MDT-object and 0x161a a lost
# MDT-object plus one OST-object; fail_val is stepped 1, 2 for f2 and f3.
3045 echo "Inject failure..."
3046 echo "To simulate f0 lost MDT-object"
3047 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
3048 do_facet mds1 $LCTL set_param fail_loc=0x1616
3049 rm -f $DIR/$tdir/a1/f0
3051 echo "To simulate f1 lost MDT-object and OST-object0"
3052 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
3053 do_facet mds1 $LCTL set_param fail_loc=0x161a
3054 rm -f $DIR/$tdir/a1/f1
3056 echo "To simulate f2 lost MDT-object and OST-object1"
# fail_loc stays at 0x161a from above; only fail_val changes for f2 and f3.
3057 do_facet mds1 $LCTL set_param fail_val=1
3058 rm -f $DIR/$tdir/a1/f2
3060 if [ $OSTCOUNT -gt 2 ]; then
3061 echo "To simulate f3 lost MDT-object and OST-object2"
3062 do_facet mds1 $LCTL set_param fail_val=2
3063 rm -f $DIR/$tdir/a1/f3
3066 umount_client $MOUNT
3069 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
# Test 20a, LFSCK run: start layout LFSCK through $START_LAYOUT with "-r -o",
# wait until every MDS reports status "completed", check every OST reports the
# same, then compare mds1's repaired_orphan count with the expected value
# (9 when OSTCOUNT > 2, otherwise 4).
3071 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3072 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
3074 for k in $(seq $MDSCOUNT); do
3075 # The LFSCK status query internal is 30 seconds. For the case
3076 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3077 # time to guarantee the status sync up.
3078 wait_update_facet mds${k} "$LCTL get_param -n \
3079 mdd.$(facet_svc mds${k}).lfsck_layout |
3080 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3081 error "(2) MDS${k} is not the expected 'completed'"
# OST status is sampled once with do_facet, not polled like the MDS status.
3084 for k in $(seq $OSTCOUNT); do
3085 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3086 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3087 awk '/^status/ { print $2 }')
3088 [ "$cur_status" == "completed" ] ||
3089 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
3092 local repaired=$(do_facet mds1 $LCTL get_param -n \
3093 mdd.$(facet_svc mds1).lfsck_layout |
3094 awk '/^repaired_orphan/ { print $2 }')
3095 if [ $OSTCOUNT -gt 2 ]; then
3096 [ $repaired -eq 9 ] ||
3097 error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
3099 [ $repaired -eq 4 ] ||
3100 error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
# Test 20a, recovered f0: remount the client and inspect
# .lustre/lost+found/MDT0000/${fid0}-R-0. It must have no pattern hole, the
# full stripe count and the original size, and must accept read, append,
# chown, touch and unlink.
3103 mount_client $MOUNT || error "(5.0) Fail to start client!"
3105 LOV_PATTERN_F_HOLE=0x40000000
3108 # ${fid0}-R-0 is the old f0
3110 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
3111 echo "Check $name, which is the old f0"
3113 $LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"
3115 local pattern=$($LFS getstripe -L $name)
3116 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3117 error "(5.2) NOT expect pattern flag hole, but got $pattern"
3119 local stripes=$($LFS getstripe -c $name)
3120 if [ $OSTCOUNT -gt 2 ]; then
3121 [ $stripes -eq 3 ] ||
3122 error "(5.3.1) expect the stripe count is 3, but got $stripes"
3124 [ $stripes -eq 2 ] ||
3125 error "(5.3.2) expect the stripe count is 2, but got $stripes"
# The size is taken from the second field of stat's "Size:" line.
3128 local size=$(stat $name | awk '/Size:/ { print $2 }')
3129 [ $size -eq $((4096 * $bcount)) ] ||
3130 error "(5.4) expect the size $((4096 * $bcount)), but got $size"
3132 cat $name > /dev/null || error "(5.5) cannot read $name"
3134 echo "dummy" >> $name || error "(5.6) cannot write $name"
3136 chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"
3138 touch $name || error "(5.8) cannot touch $name"
3140 rm -f $name || error "(5.9) cannot unlink $name"
# Test 20a, recovered f1: ${fid1}-R-0 must carry the hole pattern. A plain
# read, a write at offset 0 and an append are all expected to fail; a write
# at block 300, chown, touch and unlink are expected to succeed.
3143 # ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
3145 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3146 if [ $OSTCOUNT -gt 2 ]; then
3147 echo "Check $name, it contains the old f1's stripe1 and stripe2"
3149 echo "Check $name, it contains the old f1's stripe1"
3152 $LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"
3154 pattern=$($LFS getstripe -L $name)
3155 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3156 error "(6.2) expect pattern flag hole, but got $pattern"
3158 stripes=$($LFS getstripe -c $name)
3159 if [ $OSTCOUNT -gt 2 ]; then
3160 [ $stripes -eq 3 ] ||
3161 error "(6.3.1) expect the stripe count is 3, but got $stripes"
3163 [ $stripes -eq 2 ] ||
3164 error "(6.3.2) expect the stripe count is 2, but got $stripes"
3167 size=$(stat $name | awk '/Size:/ { print $2 }')
3168 [ $size -eq $((4096 * $bcount)) ] ||
3169 error "(6.4) expect the size $((4096 * $bcount)), but got $size"
3171 cat $name > /dev/null && error "(6.5) normal read $name should fail"
# dd with conv=sync,noerror continues past read errors; the number of
# "Input/output error" lines it prints is counted.
3173 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3174 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3177 [ $failures -eq 256 ] ||
3178 error "(6.6) expect 256 IO failures, but get $failures"
3180 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3181 [ $size -eq $((4096 * $bcount)) ] ||
3182 error "(6.7) expect the size $((4096 * $bcount)), but got $size"
3184 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
3185 error "(6.8) write to the LOV EA hole should fail"
3187 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
3188 error "(6.9) write to normal stripe should NOT fail"
3190 echo "foo" >> $name && error "(6.10) append write $name should fail"
3192 chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"
3194 touch $name || error "(6.12) cannot touch $name"
3196 rm -f $name || error "(6.13) cannot unlink $name"
# Test 20a, recovered f2: ${fid2}-R-0 must carry the hole pattern. Two groups
# of checks follow: the first is opened by `if [ $OSTCOUNT -gt 2 ]` and
# expects 3 stripes; the second (labels ending in ".2") expects 2 stripes and
# a size of 4096 * 256. The file is unlinked at the end.
# NOTE(review): the second group presumably is the two-OST alternative --
# confirm against the branch structure.
3199 # ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
3201 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
3202 if [ $OSTCOUNT -gt 2 ]; then
3203 echo "Check $name, it contains the old f2's stripe0 and stripe2"
3205 echo "Check $name, it contains the old f2's stripe0"
3208 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
3210 pattern=$($LFS getstripe -L $name)
3211 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3212 error "(7.2) expect pattern flag hole, but got $pattern"
# Stripe count and size are sampled once and used by both groups below.
3214 stripes=$($LFS getstripe -c $name)
3215 size=$(stat $name | awk '/Size:/ { print $2 }')
3216 if [ $OSTCOUNT -gt 2 ]; then
3217 [ $stripes -eq 3 ] ||
3218 error "(7.3.1) expect the stripe count is 3, but got $stripes"
3220 [ $size -eq $((4096 * $bcount)) ] ||
3221 error "(7.4.1) expect size $((4096 * $bcount)), but got $size"
3223 cat $name > /dev/null &&
3224 error "(7.5.1) normal read $name should fail"
3226 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3227 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3229 [ $failures -eq 256 ] ||
3230 error "(7.6) expect 256 IO failures, but get $failures"
3232 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3233 [ $size -eq $((4096 * $bcount)) ] ||
3234 error "(7.7) expect the size $((4096 * $bcount)), but got $size"
# In this group the write at block 300 must fail and the write at offset 0
# must succeed.
3236 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3237 seek=300 && error "(7.8.0) write to the LOV EA hole should fail"
3239 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
3240 error "(7.8.1) write to normal stripe should NOT fail"
3242 echo "foo" >> $name &&
3243 error "(7.8.3) append write $name should fail"
3245 chown $RUNAS_ID:$RUNAS_GID $name ||
3246 error "(7.9.1) cannot chown on $name"
3248 touch $name || error "(7.10.1) cannot touch $name"
3250 [ $stripes -eq 2 ] ||
3251 error "(7.3.2) expect the stripe count is 2, but got $stripes"
3254 [ $size -eq $((4096 * (256 + 0))) ] ||
3255 error "(7.4.2) expect the size $((4096 * 256)), but got $size"
3257 cat $name > /dev/null &&
3258 error "(7.5.2) normal read $name should fail"
3260 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3261 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3262 [ $failures -eq 256 ] ||
3263 error "(7.6.2) expect 256 IO failures, but get $failures"
3266 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3267 [ $size -eq $((4096 * $bcount)) ] ||
3268 error "(7.7.2) expect the size $((4096 * $bcount)), got $size"
3270 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3271 seek=256 && error "(7.8.2) write to the LOV EA hole should fail"
3273 chown $RUNAS_ID:$RUNAS_GID $name ||
3274 error "(7.9.2) cannot chown on $name"
3276 touch $name || error "(7.10.2) cannot touch $name"
3279 rm -f $name || error "(7.11) cannot unlink $name"
# Test 20a, recovered f3: runs only when OSTCOUNT > 2 (returns early
# otherwise). ${fid3}-R-0 must carry the hole pattern, report 3 stripes and a
# size of 4096 * 512; a plain read and a write at block 512 must fail, while
# chown, touch and unlink must succeed.
3281 [ $OSTCOUNT -le 2 ] && return
3284 # ${fid3}-R-0 should contains the old f3's stripe0 and stripe1
3286 name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
3287 echo "Check $name, which contains the old f3's stripe0 and stripe1"
3289 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
3291 pattern=$($LFS getstripe -L $name)
3292 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3293 error "(8.2) expect pattern flag hole, but got $pattern"
3295 stripes=$($LFS getstripe -c $name)
3296 [ $stripes -eq 3 ] ||
3297 error "(8.3) expect the stripe count is 3, but got $stripes"
3299 size=$(stat $name | awk '/Size:/ { print $2 }')
3301 [ $size -eq $((4096 * (256 + 256 + 0))) ] ||
3302 error "(8.4) expect the size $((4096 * 512)), but got $size"
3304 cat $name > /dev/null &&
3305 error "(8.5) normal read $name should fail"
3307 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3308 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3310 [ $failures -eq 256 ] ||
3311 error "(8.6) expect 256 IO failures, but get $failures"
3314 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3315 [ $size -eq $((4096 * $bcount)) ] ||
3316 error "(8.7) expect the size $((4096 * $bcount)), but got $size"
3318 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3319 seek=512 && error "(8.8) write to the LOV EA hole should fail"
3321 chown $RUNAS_ID:$RUNAS_GID $name ||
3322 error "(8.9) cannot chown on $name"
3324 touch $name || error "(8.10) cannot touch $name"
3326 rm -f $name || error "(8.11) cannot unlink $name"
3328 run_test 20a "Handle the orphan with dummy LOV EA slot properly"
# Test 20b, preparation and fault injection (PFL variant of 20a): needs at
# least two OSTs. Creates three files with a two-component layout
# ([0, 2M) and [2M, EOF), 2 stripes of 1M each), writes 256 * 3 + 1 blocks of
# 4096 bytes to each, records their FIDs, then steps the fail_loc/fail_val
# settings on $SINGLEMDS and finally clears them.
3331 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
3334 echo "The target MDT-object and some of its OST-object are lost."
3335 echo "The LFSCK should find out the left OST-objects and re-create"
3336 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
3337 echo "with the partial OST-objects (LOV EA hole)."
3339 echo "New client can access the file with LOV EA hole via normal"
3340 echo "system tools or commands without crash the system - PFL case."
3343 check_mount_and_prep
3345 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f0 ||
3346 error "(0) Fail to create PFL file $DIR/$tdir/f0"
3347 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f1 ||
3348 error "(1) Fail to create PFL file $DIR/$tdir/f1"
3349 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f2 ||
3350 error "(2) Fail to create PFL file $DIR/$tdir/f2"
3352 local bcount=$((256 * 3 + 1))
3354 dd if=/dev/zero of=$DIR/$tdir/f0 bs=4096 count=$bcount
3355 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4096 count=$bcount
3356 dd if=/dev/zero of=$DIR/$tdir/f2 bs=4096 count=$bcount
# The FIDs name the entries looked up under lost+found later in the test.
3358 local fid0=$($LFS path2fid $DIR/$tdir/f0)
3359 local fid1=$($LFS path2fid $DIR/$tdir/f1)
3360 local fid2=$($LFS path2fid $DIR/$tdir/f2)
3363 $LFS getstripe $DIR/$tdir/f0
3365 $LFS getstripe $DIR/$tdir/f1
3367 $LFS getstripe $DIR/$tdir/f2
3369 cancel_lru_locks mdc
3370 cancel_lru_locks osc
# NOTE(review): no command that removes f0/f1/f2 is visible between the
# fail_loc settings below -- confirm how the loss is actually triggered.
3372 echo "Inject failure..."
3373 echo "To simulate f0 lost MDT-object"
3374 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
3375 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1616
3378 echo "To simulate the case of f1 lost MDT-object and "
3379 echo "the first OST-object in each PFL component"
3380 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
3381 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161a
3384 echo "To simulate the case of f2 lost MDT-object and "
3385 echo "the second OST-object in each PFL component"
3386 do_facet $SINGLEMDS $LCTL set_param fail_val=1
3391 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# Test 20b, LFSCK run: start layout LFSCK through $START_LAYOUT with "-r -o",
# wait for every MDS to report "completed", check every OST reports the same,
# and require repaired_orphan on mds1 to be 8.
3393 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3394 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
3396 for k in $(seq $MDSCOUNT); do
3397 # The LFSCK status query internal is 30 seconds. For the case
3398 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3399 # time to guarantee the status sync up.
3400 wait_update_facet mds${k} "$LCTL get_param -n \
3401 mdd.$(facet_svc mds${k}).lfsck_layout |
3402 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3403 error "(4) MDS${k} is not the expected 'completed'"
# OST status is sampled once with do_facet, not polled like the MDS status.
3406 for k in $(seq $OSTCOUNT); do
3407 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3408 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3409 awk '/^status/ { print $2 }')
3410 [ "$cur_status" == "completed" ] ||
3411 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
3414 local repaired=$(do_facet mds1 $LCTL get_param -n \
3415 mdd.$(facet_svc mds1).lfsck_layout |
3416 awk '/^repaired_orphan/ { print $2 }')
3417 [ $repaired -eq 8 ] ||
3418 error "(6) Expect 8 fixed on mds1, but got: $repaired"
# Test 20b, recovered f0: ${fid0}-R-0 must show no pattern hole in either
# component (selected with -I1 / -I2), 2 stripes per component, extents
# [0, 2097152) and [2097152, EOF), the original size, and must accept read,
# append, chown, touch and unlink.
3421 # ${fid0}-R-0 is the old f0
3423 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
3424 echo "Check $name, which is the old f0"
3426 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
3428 local pattern=$($LFS getstripe -L -I1 $name)
3429 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3430 error "(7.2.1) NOT expect pattern flag hole, but got $pattern"
3432 pattern=$($LFS getstripe -L -I2 $name)
3433 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3434 error "(7.2.2) NOT expect pattern flag hole, but got $pattern"
3436 local stripes=$($LFS getstripe -c -I1 $name)
3437 [ $stripes -eq 2 ] ||
3438 error "(7.3.1) expect 2 stripes, but got $stripes"
3440 stripes=$($LFS getstripe -c -I2 $name)
3441 [ $stripes -eq 2 ] ||
3442 error "(7.3.2) expect 2 stripes, but got $stripes"
# Extent bounds are parsed from the lcme_extent.* lines of getstripe output.
3444 local e_start=$($LFS getstripe -I1 $name |
3445 awk '/lcme_extent.e_start:/ { print $2 }')
3446 [ $e_start -eq 0 ] ||
3447 error "(7.4.1) expect the COMP1 start at 0, got $e_start"
3449 local e_end=$($LFS getstripe -I1 $name |
3450 awk '/lcme_extent.e_end:/ { print $2 }')
3451 [ $e_end -eq 2097152 ] ||
3452 error "(7.4.2) expect the COMP1 end at 2097152, got $e_end"
3454 e_start=$($LFS getstripe -I2 $name |
3455 awk '/lcme_extent.e_start:/ { print $2 }')
3456 [ $e_start -eq 2097152 ] ||
3457 error "(7.5.1) expect the COMP2 start at 2097152, got $e_start"
# The end of the second component is compared as the string "EOF".
3459 e_end=$($LFS getstripe -I2 $name |
3460 awk '/lcme_extent.e_end:/ { print $2 }')
3461 [ "$e_end" = "EOF" ] ||
3462 error "(7.5.2) expect the COMP2 end at (EOF), got $e_end"
3464 local size=$(stat $name | awk '/Size:/ { print $2 }')
3465 [ $size -eq $((4096 * $bcount)) ] ||
3466 error "(7.6) expect the size $((4096 * $bcount)), but got $size"
3468 cat $name > /dev/null || error "(7.7) cannot read $name"
3470 echo "dummy" >> $name || error "(7.8) cannot write $name"
3472 chown $RUNAS_ID:$RUNAS_GID $name || error "(7.9) cannot chown on $name"
3474 touch $name || error "(7.10) cannot touch $name"
3476 rm -f $name || error "(7.11) cannot unlink $name"
# Test 20b, recovered f1, pattern check: both components of ${fid1}-R-0
# (selected with -I1 and -I2) must report the hole pattern.
3479 # ${fid1}-R-0 contains the old f1's second stripe in each COMP
3481 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3482 echo "Check $name, it contains f1's second OST-object in each COMP"
3484 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
3486 pattern=$($LFS getstripe -L -I1 $name)
3487 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3488 error "(8.2.1) expect pattern flag hole, but got $pattern"
3490 pattern=$($LFS getstripe -L -I2 $name)
3491 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3492 error "(8.2.2) expect pattern flag hole, but got $pattern"
# Test 20b, recovered f1, stripe counts: each component of $name must report
# 2 stripes. The labels are distinct per component ((8.3.1) for -I1, (8.3.2)
# for -I2) so a failure identifies which component was wrong.
3494 stripes=$($LFS getstripe -c -I1 $name)
3495 [ $stripes -eq 2 ] ||
3496 error "(8.3.1) expect 2 stripes, but got $stripes"
3498 stripes=$($LFS getstripe -c -I2 $name)
3499 [ $stripes -eq 2 ] ||
3500 error "(8.3.2) expect 2 stripes, but got $stripes"
# Test 20b, recovered f1, extents and I/O: the component extents must be
# [0, 2097152) and [2097152, EOF) and the size unchanged. A plain read, a
# write at offset 0 and an append must fail; a write at block 300, chown,
# touch and unlink must succeed.
3502 e_start=$($LFS getstripe -I1 $name |
3503 awk '/lcme_extent.e_start:/ { print $2 }')
3504 [ $e_start -eq 0 ] ||
3505 error "(8.4.1) expect the COMP1 start at 0, got $e_start"
3507 e_end=$($LFS getstripe -I1 $name |
3508 awk '/lcme_extent.e_end:/ { print $2 }')
3509 [ $e_end -eq 2097152 ] ||
3510 error "(8.4.2) expect the COMP1 end at 2097152, got $e_end"
3512 e_start=$($LFS getstripe -I2 $name |
3513 awk '/lcme_extent.e_start:/ { print $2 }')
3514 [ $e_start -eq 2097152 ] ||
3515 error "(8.5.1) expect the COMP2 start at 2097152, got $e_start"
3517 e_end=$($LFS getstripe -I2 $name |
3518 awk '/lcme_extent.e_end:/ { print $2 }')
3519 [ "$e_end" = "EOF" ] ||
3520 error "(8.5.2) expect the COMP2 end at (EOF), got $e_end"
3522 size=$(stat $name | awk '/Size:/ { print $2 }')
3523 [ $size -eq $((4096 * $bcount)) ] ||
3524 error "(8.6) expect the size $((4096 * $bcount)), but got $size"
3526 cat $name > /dev/null && error "(8.7) normal read $name should fail"
# dd with conv=sync,noerror continues past read errors; the number of
# "Input/output error" lines it prints is counted.
3528 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3529 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3531 # The first stripe in each COMP was lost
3532 [ $failures -eq 512 ] ||
3533 error "(8.8) expect 512 IO failures, but get $failures"
3535 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3536 [ $size -eq $((4096 * $bcount)) ] ||
3537 error "(8.9) expect the size $((4096 * $bcount)), but got $size"
3539 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
3540 error "(8.10) write to the LOV EA hole should fail"
3542 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
3543 error "(8.11) write to normal stripe should NOT fail"
3545 echo "foo" >> $name && error "(8.12) append write $name should fail"
3547 chown $RUNAS_ID:$RUNAS_GID $name || error "(8.13) cannot chown on $name"
3549 touch $name || error "(8.14) cannot touch $name"
3551 rm -f $name || error "(8.15) cannot unlink $name"
# Test 20b, recovered f2, pattern check: both components of ${fid2}-R-0
# (selected with -I1 and -I2) must report the hole pattern.
3554 # ${fid2}-R-0 contains the old f2's first stripe in each COMP
3556 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
3557 echo "Check $name, it contains f2's first stripe in each COMP"
3559 $LFS getstripe -v $name || error "(9.1) cannot getstripe on $name"
3561 pattern=$($LFS getstripe -L -I1 $name)
3562 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3563 error "(9.2.1) expect pattern flag hole, but got $pattern"
3565 pattern=$($LFS getstripe -L -I2 $name)
3566 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3567 error "(9.2.2) expect pattern flag hole, but got $pattern"
# Test 20b, recovered f2, stripe counts: each component of $name must report
# 2 stripes. The labels are distinct per component ((9.3.1) for -I1, (9.3.2)
# for -I2) so a failure identifies which component was wrong.
3569 stripes=$($LFS getstripe -c -I1 $name)
3570 [ $stripes -eq 2 ] ||
3571 error "(9.3.1) expect 2 stripes, but got $stripes"
3573 stripes=$($LFS getstripe -c -I2 $name)
3574 [ $stripes -eq 2 ] ||
3575 error "(9.3.2) expect 2 stripes, but got $stripes"
# Test 20b, recovered f2, extents, size and plain read: the component extents
# must be [0, 2097152) and [2097152, EOF), the size must match the expression
# below, and a plain read must fail.
3577 e_start=$($LFS getstripe -I1 $name |
3578 awk '/lcme_extent.e_start:/ { print $2 }')
3579 [ $e_start -eq 0 ] ||
3580 error "(9.4.1) expect the COMP1 start at 0, got $e_start"
3582 e_end=$($LFS getstripe -I1 $name |
3583 awk '/lcme_extent.e_end:/ { print $2 }')
3584 [ $e_end -eq 2097152 ] ||
3585 error "(9.4.2) expect the COMP1 end at 2097152, got $e_end"
3587 e_start=$($LFS getstripe -I2 $name |
3588 awk '/lcme_extent.e_start:/ { print $2 }')
3589 [ $e_start -eq 2097152 ] ||
3590 error "(9.5.1) expect the COMP2 start at 2097152, got $e_start"
3592 e_end=$($LFS getstripe -I2 $name |
3593 awk '/lcme_extent.e_end:/ { print $2 }')
3594 [ "$e_end" = "EOF" ] ||
3595 error "(9.5.2) expect the COMP2 end at (EOF), got $e_end"
3597 size=$(stat $name | awk '/Size:/ { print $2 }')
3598 # The second stripe in COMP was lost, so we do not know there
3599 # have ever been some data before. 'stat' will regard it as
3600 # no data on the lost stripe.
# NOTE(review): the comment above suggests a smaller size than the original
# $bcount, yet no reassignment of bcount is visible here -- confirm.
3602 [ $size -eq $((4096 * $bcount)) ] ||
3603 error "(9.6) expect size $((4096 * $bcount)), but got $size"
3605 cat $name > /dev/null &&
3606 error "(9.7) normal read $name should fail"
# Test 20b, recovered f2, read-error count: dd with conv=sync,noerror keeps
# going past read errors, and the "Input/output error" lines it prints are
# counted. 512 are expected; the failure message states the same number the
# condition tests.
3608 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3609 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3610 [ $failures -eq 512 ] ||
3611 error "(9.8) expect 512 IO failures, but get $failures"
# Test 20b, recovered f2, dump size and write checks: the dd dump file must
# have the expected size; a write at block 300 and an append must fail, while
# a write at offset 0, chown and touch must succeed.
3613 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3614 # The second stripe in COMP was lost, so we do not know there
3615 # have ever been some data before. Since 'dd' skip failure,
3616 # it will regard the lost stripe contains data.
3618 [ $size -eq $((4096 * $bcount)) ] ||
3619 error "(9.9) expect the size $((4096 * $bcount)), but got $size"
3621 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3622 seek=300 && error "(9.10) write to the LOV EA hole should fail"
3624 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
3625 error "(9.11) write to normal stripe should NOT fail"
3627 echo "foo" >> $name &&
3628 error "(9.12) append write $name should fail"
3630 chown $RUNAS_ID:$RUNAS_GID $name ||
3631 error "(9.13) cannot chown on $name"
3633 touch $name || error "(9.14) cannot touch $name"
# Test 20b, recovered f2, final step: the recovered file must be removable.
# The label continues the (9.x) series used by the preceding f2 checks.
3635 rm -f $name || error "(9.15) cannot unlink $name"
3637 run_test 20b "Handle the orphan with dummy LOV EA slot properly - PFL case"
# Test 21: skipped when the MDS version is below 2.5.59. Creates 100 files,
# starts LFSCK on MDT0000 without a -t type option (with "-s 1 -r"), expects
# the status lines from $SHOW_NAMESPACE and $SHOW_LAYOUT to both read
# "scanning-phase1", then stops LFSCK.
3640 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.5.59) ]] &&
3641 skip "ignore the test if MDS is older than 2.5.59" && return
3643 check_mount_and_prep
3644 createmany -o $DIR/$tdir/f 100 || error "(0) Fail to create 100 files"
3646 echo "Start all LFSCK components by default (-s 1)"
# NOTE(review): "-s 1" presumably throttles the scan so it is still in
# phase 1 when the status is sampled -- confirm against lctl lfsck_start.
3647 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -s 1 -r ||
3648 error "Fail to start LFSCK"
3650 echo "namespace LFSCK should be in 'scanning-phase1' status"
3651 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
3652 [ "$STATUS" == "scanning-phase1" ] ||
3653 error "Expect namespace 'scanning-phase1', but got '$STATUS'"
3655 echo "layout LFSCK should be in 'scanning-phase1' status"
3656 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
3657 [ "$STATUS" == "scanning-phase1" ] ||
3658 error "Expect layout 'scanning-phase1', but got '$STATUS'"
3660 echo "Stop all LFSCK components by default"
3661 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 ||
3662 error "Fail to stop LFSCK"
3664 run_test 21 "run all LFSCK components by default"
# Test 22a, guard and description: needs at least two MDTs, then prints what
# the test exercises. The inner double quotes around .. are escaped so they
# are printed literally instead of closing and reopening the string.
3667 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3670 echo "The parent_A references the child directory via some name entry,"
3671 echo "but the child directory back references another parent_B via its"
3672 echo "\"..\" name entry. The parent_B does not exist. Then the namespace"
3673 echo "LFSCK will repair the child directory's \"..\" name entry."
# Test 22a, body: create guard and foo on MDT index 1, create foo/dummy on
# MDT index 0 while fail_loc 0x161e (OBD_FAIL_LFSCK_BAD_PARENT) is set on
# $SINGLEMDS, remove guard, run namespace LFSCK with "-A -r", and require
# unmatched_pairs_repaired to be 1 and `ls` on foo/dummy to succeed.
3676 check_mount_and_prep
3678 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
3679 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
3681 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
3682 echo "The dummy's dotdot name entry references the guard."
3683 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
3684 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
3685 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
3686 error "(3) Fail to mkdir on MDT0"
3687 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Per the echo above, dummy's dotdot entry references guard, which is
# removed here.
3689 rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"
3691 echo "Trigger namespace LFSCK to repair unmatched pairs"
3692 $START_NAMESPACE -A -r ||
3693 error "(5) Fail to start LFSCK for namespace"
3695 wait_all_targets_blocked namespace completed 6
3697 local repaired=$($SHOW_NAMESPACE |
3698 awk '/^unmatched_pairs_repaired/ { print $2 }')
3699 [ $repaired -eq 1 ] ||
3700 error "(7) Fail to repair unmatched pairs: $repaired"
3702 echo "'ls' should success after namespace LFSCK repairing"
3703 ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
3704 error "(8) ls should success."
3706 run_test 22a "LFSCK can repair unmatched pairs (1)"
# Test 22b, guard and description: needs at least two MDTs, then prints what
# the test exercises. The inner double quotes around .. are escaped so they
# are printed literally instead of closing and reopening the string.
3709 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3712 echo "The parent_A references the child directory via the name entry_B,"
3713 echo "but the child directory back references another parent_C via its"
3714 echo "\"..\" name entry. The parent_C exists, but there is no the name"
3715 echo "entry_B under the parent_C. Then the namespace LFSCK will repair"
3716 echo "the child directory's \"..\" name entry and its linkEA."
# Test 22b, body: like 22a, but guard is kept. Before LFSCK, fid2path on
# dummy's FID must NOT return $DIR/$tdir/foo/dummy; after namespace LFSCK
# ("-A -r") unmatched_pairs_repaired must be 1 and fid2path must return that
# path.
3719 check_mount_and_prep
3721 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
3722 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
3724 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
3725 echo "and bad linkEA. The dummy's dotdot name entry references the"
3726 echo "guard. The dummy's linkEA references n non-exist name entry."
3727 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
3728 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
3729 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
3730 error "(3) Fail to mkdir on MDT0"
3731 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# The FID is captured once and reused for both fid2path lookups.
3733 local dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
3734 echo "fid2path should NOT work on the dummy's FID $dummyfid"
3735 local dummyname=$($LFS fid2path $DIR $dummyfid)
3736 [ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
3737 error "(4) fid2path works unexpectedly."
3739 echo "Trigger namespace LFSCK to repair unmatched pairs"
3740 $START_NAMESPACE -A -r ||
3741 error "(5) Fail to start LFSCK for namespace"
3743 wait_all_targets_blocked namespace completed 6
3745 local repaired=$($SHOW_NAMESPACE |
3746 awk '/^unmatched_pairs_repaired/ { print $2 }')
3747 [ $repaired -eq 1 ] ||
3748 error "(7) Fail to repair unmatched pairs: $repaired"
3750 echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
3751 local dummyname=$($LFS fid2path $DIR $dummyfid)
3752 [ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
3753 error "(8) fid2path does not work"
3755 run_test 22b "LFSCK can repair unmatched pairs (2)"
# Test 23a: needs at least two MDTs. Creates d0 on MDT index 0 and d0/d1 on
# MDT index 1, removes d1 while fail_loc 0x1620 (OBD_FAIL_LFSCK_DANGLING2) is
# set on mds2, and expects `ls` on d1 to fail. A first namespace LFSCK
# ("-A -r") must report dangling_repaired 1 with `ls` still failing; a second
# run with the extra "-C" option must report 1 again and `ls` must succeed.
3758 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3761 echo "The name entry is there, but the MDT-object for such name "
3762 echo "entry does not exist. The namespace LFSCK should find out "
3763 echo "and repair the inconsistency as required."
3766 check_mount_and_prep
3768 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3769 $LFS mkdir -i 1 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT1"
3771 echo "Inject failure stub on MDT1 to simulate dangling name entry"
3772 #define OBD_FAIL_LFSCK_DANGLING2 0x1620
3773 do_facet mds2 $LCTL set_param fail_loc=0x1620
3774 rmdir $DIR/$tdir/d0/d1 || error "(3) Fail to rmdir d1"
3775 do_facet mds2 $LCTL set_param fail_loc=0
3777 echo "'ls' should fail because of dangling name entry"
3778 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(4) ls should fail."
3780 echo "Trigger namespace LFSCK to find out dangling name entry"
3781 $START_NAMESPACE -A -r ||
3782 error "(5) Fail to start LFSCK for namespace"
3784 wait_all_targets_blocked namespace completed 6
3786 local repaired=$($SHOW_NAMESPACE |
3787 awk '/^dangling_repaired/ { print $2 }')
3788 [ $repaired -eq 1 ] ||
3789 error "(7) Fail to repair dangling name entry: $repaired"
3791 echo "'ls' should fail because not re-create MDT-object by default"
3792 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(8) ls should fail."
3794 echo "Trigger namespace LFSCK again to repair dangling name entry"
# The only difference from the first run is the added "-C" option.
3795 $START_NAMESPACE -A -r -C ||
3796 error "(9) Fail to start LFSCK for namespace"
3798 wait_all_targets_blocked namespace completed 10
3800 repaired=$($SHOW_NAMESPACE |
3801 awk '/^dangling_repaired/ { print $2 }')
3802 [ $repaired -eq 1 ] ||
3803 error "(11) Fail to repair dangling name entry: $repaired"
3805 echo "'ls' should success after namespace LFSCK repairing"
3806 ls -ail $DIR/$tdir/d0/d1 > /dev/null || error "(12) ls should success."
3808 run_test 23a "LFSCK can repair dangling name entry (1)"
# Test 23b: create d0 with ten filler files plus f0 ("dummy") and f1 ("dead"),
# make sure f0 and f1 share a FID sequence, hard-link f0 as foo while
# fail_loc 0x1621 is set with fail_val = f1's object id, then remove the
# fillers and f1. `ls` on foo must fail. After namespace LFSCK ("-r -C")
# completes, dangling_repaired and multiple_linked_repaired must both be 1 and
# foo must contain "dummy".
3812 echo "The objectA has multiple hard links, one of them corresponding"
3813 echo "to the name entry_B. But there is something wrong for the name"
3814 echo "entry_B and cause entry_B to references non-exist object_C."
3815 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3816 echo "as dangling, and re-create the lost object_C. When the LFSCK"
3817 echo "comes to the second-stage scanning, it will find that the"
3818 echo "former re-creating object_C is not proper, and will try to"
3819 echo "replace the object_C with the real object_A."
3822 check_mount_and_prep
3824 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3825 $LFS path2fid $DIR/$tdir/d0
3827 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
3829 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3830 $LFS path2fid $DIR/$tdir/d0/f0
3832 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
3833 $LFS path2fid $DIR/$tdir/d0/f1
# The first ':'-separated field of the FID is treated as its sequence.
3835 local SEQ0=$($LFS path2fid $DIR/$tdir/d0/f0 | awk -F':' '{print $1}')
3836 local SEQ1=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $1}')
3838 if [ "$SEQ0" != "$SEQ1" ]; then
3839 # To guarantee that the f0 and f1 are in the same FID seq
3840 rm -f $DIR/$tdir/d0/f0 ||
3841 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
3842 echo "dummy" > $DIR/$tdir/d0/f0 ||
3843 error "(3.2) Fail to touch on MDT0"
3844 $LFS path2fid $DIR/$tdir/d0/f0
# The second FID field is converted to decimal by printf before being passed
# as fail_val.
3847 local OID=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $2}')
3848 OID=$(printf %d $OID)
3850 echo "Inject failure stub on MDT0 to simulate dangling name entry"
3851 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
3852 do_facet $SINGLEMDS $LCTL set_param fail_val=$OID fail_loc=0x1621
3853 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
3854 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3856 # If there is creation after the dangling injection, it may re-use
3857 # the just released local object (inode) that is referenced by the
3858 # dangling name entry. It will fail the dangling injection.
3859 # So before deleting the target object for the dangling name entry,
3860 # remove some other objects to avoid the target object being reused
3861 # by some potential creations. LU-7429
3862 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
3864 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
3866 echo "'ls' should fail because of dangling name entry"
3867 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3868 error "(6) ls should fail."
3870 echo "Trigger namespace LFSCK to find out dangling name entry"
3871 $START_NAMESPACE -r -C ||
3872 error "(7) Fail to start LFSCK for namespace"
3874 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3875 mdd.${MDT_DEV}.lfsck_namespace |
3876 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3878 error "(8) unexpected status"
3881 local repaired=$($SHOW_NAMESPACE |
3882 awk '/^dangling_repaired/ { print $2 }')
3883 [ $repaired -eq 1 ] ||
3884 error "(9) Fail to repair dangling name entry: $repaired"
3886 repaired=$($SHOW_NAMESPACE |
3887 awk '/^multiple_linked_repaired/ { print $2 }')
3888 [ $repaired -eq 1 ] ||
3889 error "(10) Fail to drop the former created object: $repaired"
# foo was hard-linked to f0, so it should read back f0's content.
3891 local data=$(cat $DIR/$tdir/d0/foo)
3892 [ "$data" == "dummy" ] ||
3893 error "(11) The $DIR/$tdir/d0/foo is not recovered: $data"
3895 run_test 23b "LFSCK can repair dangling name entry (2)"
# Cleanup steps: clear fail_val/fail_loc on the MDS, wait until the
# namespace LFSCK status reads "completed", then stop full debug logging.
# NOTE(review): presumably the cleanup helper paired with test 23c, which
# calls start_full_debug_logging; confirm against the function header.
3898 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3899 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3900 mdd.${MDT_DEV}.lfsck_namespace |
3901 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3903 error "(10) unexpected status"
3906 stop_full_debug_logging
# Test 23c (registered by the run_test line that ends this block).
# As in 23b, a hard link "foo" is created while fail_loc 0x1621 is armed and
# f1 is then removed so that "foo" dangles. Here the client appends data to
# "foo" once it shows up with size 0, and the test then requires one
# dangling_repaired while "foo" must NOT read back f0's original content.
3911 echo "The objectA has multiple hard links, one of them corresponding"
3912 echo "to the name entry_B. But there is something wrong for the name"
3913 echo "entry_B and cause entry_B to references non-exist object_C."
3914 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3915 echo "as dangling, and re-create the lost object_C. And then others"
3916 echo "modified the re-created object_C. When the LFSCK comes to the"
3917 echo "second-stage scanning, it will find that the former re-creating"
3918 echo "object_C maybe wrong and try to replace the object_C with the"
3919 echo "real object_A. But because object_C has been modified, so the"
3920 echo "LFSCK cannot replace it."
3923 start_full_debug_logging
3925 check_mount_and_prep
3927 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3928 parent_fid="$($LFS path2fid $DIR/$tdir/d0)"
3929 echo "parent_fid=$parent_fid"
# Ten extra files t0..t9; they are unlinked again further down (LU-7429).
3931 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
3933 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3934 f0_fid="$($LFS path2fid $DIR/$tdir/d0/f0)"
3935 echo "f0_fid=$f0_fid"
3937 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
3938 f1_fid="$($LFS path2fid $DIR/$tdir/d0/f1)"
3939 echo "f1_fid=$f1_fid"
# Compare the part of each FID before its first ':' (the sequence).
# The variables assigned above are f0_fid/f1_fid; the names fid_f0/fid_f1
# used here before were never set, so both sides expanded to "" and this
# branch could never run. "%%:*" removes everything from the first ':' on.
3941 if [ "${f0_fid%%:*}" != "${f1_fid%%:*}" ]; then
3942 # To guarantee that the f0 and f1 are in the same FID seq
3943 rm -f $DIR/$tdir/d0/f0 ||
3944 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
3945 echo "dummy" > $DIR/$tdir/d0/f0 ||
3946 error "(3.2) Fail to touch on MDT0"
3947 f0_fid="$($LFS path2fid $DIR/$tdir/d0/f0)"
3948 echo "f0_fid=$f0_fid (replaced)"
# Second ':'-separated field of f1's FID, handed to fail_val below.
3951 local oid=$(awk -F':' '{ printf $2 }' <<< $f1_fid)
3953 echo "Inject failure stub on MDT0 to simulate dangling name entry"
3954 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
# The fault is armed only around the ln and cleared right after it.
3955 do_facet $SINGLEMDS $LCTL set_param fail_val=$oid fail_loc=0x1621
3956 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
3957 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3959 # If there is creation after the dangling injection, it may re-use
3960 # the just released local object (inode) that is referenced by the
3961 # dangling name entry. It will fail the dangling injection.
3962 # So before deleting the target object for the dangling name entry,
3963 # remove some other objects to avoid the target object being reused
3964 # by some potential creations. LU-7429
3965 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
3967 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
3969 echo "'ls' should fail because of dangling name entry"
3970 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3971 error "(6) ls should fail."
# Armed before LFSCK starts. NOTE(review): presumably delays LFSCK so the
# client write below happens before the second-stage scan; confirm.
3973 #define OBD_FAIL_LFSCK_DELAY3 0x1602
3974 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
3976 echo "Trigger namespace LFSCK to find out dangling name entry"
3977 $START_NAMESPACE -r -C ||
3978 error "(7) Fail to start LFSCK for namespace"
# Wait for "foo" to become visible with size 0 on the client.
3980 wait_update_facet client "stat -c%s $DIR/$tdir/d0/foo" "0" $LTIME || {
3981 # While unexpected by the test, it is valid for LFSCK to repair
3982 # the link to the original object before any data is written.
3983 local size=$(stat -c %s $DIR/$tdir/d0/foo)
3985 if [ "$size" = "6" -a "$(<$DIR/$tdir/d0/foo)" = "dummy" ]; then
3986 log "LFSCK repaired file prematurely"
3991 stat $DIR/$tdir/d0/foo
3993 error "(8) unexpected size"
# Modify the object behind "foo" and drop the client's cached osc locks.
3996 echo "data" >> $DIR/$tdir/d0/foo || error "(9) Fail to write"
3997 cancel_lru_locks osc
4001 local repaired=$($SHOW_NAMESPACE |
4002 awk '/^dangling_repaired/ { print $2 }')
4003 [ $repaired -eq 1 ] ||
4004 error "(11) Fail to repair dangling name entry: $repaired"
# After the write, "foo" must not have been switched back to f0's content.
4006 local data=$(cat $DIR/$tdir/d0/foo)
4007 [ "$data" != "dummy" ] ||
4008 error "(12) The $DIR/$tdir/d0/foo should not be recovered"
4010 run_test 23c "LFSCK can repair dangling name entry (3)"
# Test 24 (registered by the run_test line that ends this block).
# Skipped when fewer than 2 MDTs are configured. With fail_loc 0x1622 armed,
# the directory dummy/foo is created on MDT0 and removed again; namespace
# LFSCK on all targets (-A) is then expected to report exactly one
# multiple_referenced_repaired and to leave an entry named
# <cfid>-<pfid>-D-0 under .lustre/lost+found/MDT0000/.
4013 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4016 echo "Two MDT-objects back reference the same name entry via their"
4017 echo "each own linkEA entry, but the name entry only references one"
4018 echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
4019 echo "for the MDT-object that is not recognized. If such MDT-object"
4020 echo "has no other linkEA entry after the removing, then the LFSCK"
4021 echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."
4024 check_mount_and_prep
4026 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4028 mkdir $DIR/$tdir/d0/guard || error "(1) Fail to mkdir guard"
4029 $LFS path2fid $DIR/$tdir/d0/guard
4031 mkdir $DIR/$tdir/d0/dummy || error "(2) Fail to mkdir dummy"
4032 $LFS path2fid $DIR/$tdir/d0/dummy
# pfid (parent FID part of the expected orphan name) is taken from guard or
# from dummy depending on the MDS backend type.
# NOTE(review): confirm which assignment belongs to which backend.
4035 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
4036 pfid=$($LFS path2fid $DIR/$tdir/d0/guard)
4038 pfid=$($LFS path2fid $DIR/$tdir/d0/dummy)
4041 touch $DIR/$tdir/d0/guard/foo ||
4042 error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"
4044 echo "Inject failure stub on MDT0 to simulate the case that"
4045 echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
4046 echo "that references $DIR/$tdir/d0/guard/foo."
4047 echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
4048 echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
4049 echo "there with the same linkEA entry as another MDT-object"
4050 echo "$DIR/$tdir/d0/guard/foo has"
4052 #define OBD_FAIL_LFSCK_MUL_REF 0x1622
# The fault stays armed across both the mkdir and the rmdir of dummy/foo.
4053 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
4054 $LFS mkdir -i 0 $DIR/$tdir/d0/dummy/foo ||
4055 error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
4056 $LFS path2fid $DIR/$tdir/d0/dummy/foo
# Remember the child's FID before its name entry is removed; it forms the
# first part of the expected orphan name.
4057 local cfid=$($LFS path2fid $DIR/$tdir/d0/dummy/foo)
4058 rmdir $DIR/$tdir/d0/dummy/foo ||
4059 error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
4060 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4062 echo "stat $DIR/$tdir/d0/dummy/foo should fail"
4063 stat $DIR/$tdir/d0/dummy/foo > /dev/null 2>&1 &&
4064 error "(6) stat successfully unexpectedly"
4066 echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
4067 $START_NAMESPACE -A -r ||
4068 error "(7) Fail to start LFSCK for namespace"
4070 wait_all_targets_blocked namespace completed 8
4072 local repaired=$($SHOW_NAMESPACE |
4073 awk '/^multiple_referenced_repaired/ { print $2 }')
4074 [ $repaired -eq 1 ] ||
4075 error "(9) Fail to repair multiple referenced name entry: $repaired"
4077 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
4078 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4079 error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# Expected orphan name: child FID, parent FID, then the "-D-0" suffix.
4081 local cname="$cfid-$pfid-D-0"
4082 ls -ail $MOUNT/.lustre/lost+found/MDT0000/$cname ||
4083 error "(11) .lustre/lost+found/MDT0000/ should not be empty"
4085 run_test 24 "LFSCK can repair multiple-referenced name entry"
# Test 25 (registered by the run_test line that ends this block).
# Skipped unless the MDS backend is ldiskfs. A file is created while
# fail_loc 0x1623 is armed; namespace LFSCK is then expected to report
# exactly one bad_file_type_repaired, and listing the directory must work.
4088 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
4089 skip "ldiskfs only test" && return
4092 echo "The file type in the name entry does not match the file type"
4093 echo "claimed by the referenced object. Then the LFSCK will update"
4094 echo "the file type in the name entry."
4097 check_mount_and_prep
4099 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4101 echo "Inject failure stub on MDT0 to simulate the case that"
4102 echo "the file type stored in the name entry is wrong."
4104 #define OBD_FAIL_LFSCK_BAD_TYPE 0x1623
# The fault is armed only around the touch and cleared right after it.
4105 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
4106 touch $DIR/$tdir/d0/foo || error "(2) Fail to touch $DIR/$tdir/d0/foo"
4107 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4109 echo "Trigger namespace LFSCK to repair bad file type in the name entry"
4110 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace"
# Poll the lfsck_namespace "status" line on the MDS until it reads
# "completed".
4112 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4113 mdd.${MDT_DEV}.lfsck_namespace |
4114 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4116 error "(4) unexpected status"
4119 local repaired=$($SHOW_NAMESPACE |
4120 awk '/^bad_file_type_repaired/ { print $2 }')
4121 [ $repaired -eq 1 ] ||
4122 error "(5) Fail to repair bad file type in name entry: $repaired"
4124 ls -ail $DIR/$tdir/d0 || error "(6) Fail to 'ls' the $DIR/$tdir/d0"
4126 run_test 25 "LFSCK can repair bad file type in the name entry"
# Test 26a (registered by the run_test line that ends this block).
# foo gets a second hard link (dummy) and is then unlinked while fail_loc
# 0x1624 is armed. Namespace LFSCK on all targets is expected to report
# exactly one lost_dirent_repaired, after which foo must be listable again
# and still have the FID recorded before the unlink.
4130 echo "The local name entry back referenced by the MDT-object is lost."
4131 echo "The namespace LFSCK will add the missing local name entry back"
4132 echo "to the normal namespace."
4135 check_mount_and_prep
4137 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4138 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
# FID before the injection, compared with the FID after the repair.
4139 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
4141 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
4142 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4144 echo "Inject failure stub on MDT0 to simulate the case that"
4145 echo "foo's name entry will be removed, but the foo's object"
4146 echo "and its linkEA are kept in the system."
4148 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fault is armed only around the rm and cleared right after it.
4149 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4150 rm -f $DIR/$tdir/d0/foo || error "(4) Fail to unlink $DIR/$tdir/d0/foo"
4151 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4153 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4154 error "(5) 'ls' should fail"
4156 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4157 $START_NAMESPACE -r -A ||
4158 error "(6) Fail to start LFSCK for namespace"
4160 wait_all_targets_blocked namespace completed 7
4162 local repaired=$($SHOW_NAMESPACE |
4163 awk '/^lost_dirent_repaired/ { print $2 }')
4164 [ $repaired -eq 1 ] ||
4165 error "(8) Fail to repair lost dirent: $repaired"
4167 ls -ail $DIR/$tdir/d0/foo ||
4168 error "(9) Fail to 'ls' $DIR/$tdir/d0/foo"
# The restored name must resolve to the same FID as before.
4170 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
4171 [ "$foofid" == "$foofid2" ] ||
4172 error "(10) foo's FID changed: $foofid, $foofid2"
4174 run_test 26a "LFSCK can add the missing local name entry back to the namespace"
# Test 26b (registered by the run_test line that ends this block).
# Skipped when fewer than 2 MDTs are configured. Directory d0 is created
# with "-i 1" and its child foo with "-i 0"; foo is removed while fail_loc
# 0x1624 is armed. Namespace LFSCK on all targets is expected to report
# exactly one lost_dirent_repaired, after which foo must be listable again
# with an unchanged FID.
4177 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4180 echo "The remote name entry back referenced by the MDT-object is lost."
4181 echo "The namespace LFSCK will add the missing remote name entry back"
4182 echo "to the normal namespace."
4185 check_mount_and_prep
4187 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4188 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
# FID before the injection, compared with the FID after the repair.
4189 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
4191 echo "Inject failure stub on MDT0 to simulate the case that"
4192 echo "foo's name entry will be removed, but the foo's object"
4193 echo "and its linkEA are kept in the system."
4195 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fault is armed only around the rmdir and cleared right after it.
4196 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4197 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
4198 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4200 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4201 error "(4) 'ls' should fail"
4203 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4204 $START_NAMESPACE -r -A ||
4205 error "(5) Fail to start LFSCK for namespace"
4207 wait_all_targets_blocked namespace completed 6
4209 local repaired=$($SHOW_NAMESPACE |
4210 awk '/^lost_dirent_repaired/ { print $2 }')
4211 [ $repaired -eq 1 ] ||
4212 error "(7) Fail to repair lost dirent: $repaired"
4214 ls -ail $DIR/$tdir/d0/foo ||
4215 error "(8) Fail to 'ls' $DIR/$tdir/d0/foo"
# The restored name must resolve to the same FID as before.
4217 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
4218 [ "$foofid" == "$foofid2" ] ||
4219 error "(9) foo's FID changed: $foofid, $foofid2"
4221 run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
# Test 27a (registered by the run_test line that ends this block).
# foo and its hard link dummy are both unlinked while fail_loc 0x1624 is
# armed, then the parent directory d0 is removed. Namespace LFSCK on all
# targets is expected to report exactly one lost_dirent_repaired and to
# leave an entry matching "*-P-*" under .lustre/lost+found/MDT0000/.
4225 echo "The local parent referenced by the MDT-object linkEA is lost."
4226 echo "The namespace LFSCK will re-create the lost parent as orphan."
4229 check_mount_and_prep
4231 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4232 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4233 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
4234 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4236 echo "Inject failure stub on MDT0 to simulate the case that"
4237 echo "foo's name entry will be removed, but the foo's object"
4238 echo "and its linkEA are kept in the system. And then remove"
4239 echo "another hard link and the parent directory."
4241 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fault stays armed across both unlinks and is cleared afterwards.
4242 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4243 rm -f $DIR/$tdir/d0/foo ||
4244 error "(4) Fail to unlink $DIR/$tdir/d0/foo"
4245 rm -f $DIR/$tdir/d0/dummy ||
4246 error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
4247 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4249 rm -rf $DIR/$tdir/d0 || error "(5) Fail to unlink the dir d0"
4250 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(6) 'ls' should fail"
4252 echo "Trigger namespace LFSCK to repair the lost parent"
4253 $START_NAMESPACE -r -A ||
4254 error "(6) Fail to start LFSCK for namespace"
4256 wait_all_targets_blocked namespace completed 7
4258 local repaired=$($SHOW_NAMESPACE |
4259 awk '/^lost_dirent_repaired/ { print $2 }')
4260 [ $repaired -eq 1 ] ||
4261 error "(8) Fail to repair lost dirent: $repaired"
4263 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
4264 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4265 error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4267 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# The -name pattern is quoted so that find receives it verbatim; unquoted,
# the shell would expand it first if the current directory happened to
# contain a matching name. cname is declared local so it does not leak
# into later tests.
4269 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-P-*')
4270 [ ! -z "$cname" ] ||
4271 error "(10) .lustre/lost+found/MDT0000/ should not be empty"
4273 run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
# Test 27b (registered by the run_test line that ends this block).
# Skipped when fewer than 2 MDTs are configured. Directory d0 is created
# with "-i 1" and its child foo with "-i 0"; foo is removed while fail_loc
# 0x1624 is armed and d0 is removed afterwards. Namespace LFSCK on all
# targets is expected to report exactly one lost_dirent_repaired and to
# leave an entry matching "*-P-*" under .lustre/lost+found/MDT0001/.
4276 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4279 echo "The remote parent referenced by the MDT-object linkEA is lost."
4280 echo "The namespace LFSCK will re-create the lost parent as orphan."
4283 check_mount_and_prep
4285 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4286 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
4288 $LFS path2fid $DIR/$tdir/d0
4290 echo "Inject failure stub on MDT0 to simulate the case that"
4291 echo "foo's name entry will be removed, but the foo's object"
4292 echo "and its linkEA are kept in the system. And then remove"
4293 echo "the parent directory."
4295 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fault is armed only around the rmdir of foo and cleared afterwards.
4296 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4297 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
4298 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4300 rmdir $DIR/$tdir/d0 || error "(4) Fail to unlink the dir d0"
4301 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(5) 'ls' should fail"
4303 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4304 $START_NAMESPACE -r -A ||
4305 error "(6) Fail to start LFSCK for namespace"
4307 wait_all_targets_blocked namespace completed 7
4309 local repaired=$($SHOW_NAMESPACE |
4310 awk '/^lost_dirent_repaired/ { print $2 }')
4311 [ $repaired -eq 1 ] ||
4312 error "(8) Fail to repair lost dirent: $repaired"
4314 ls -ail $MOUNT/.lustre/lost+found/
4316 echo "There should be an orphan under .lustre/lost+found/MDT0001/"
4317 [ -d $MOUNT/.lustre/lost+found/MDT0001 ] ||
4318 error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"
4320 ls -ail $MOUNT/.lustre/lost+found/MDT0001/
# The -name pattern is quoted so that find receives it verbatim; unquoted,
# the shell would expand it first if the current directory happened to
# contain a matching name. cname is declared local so it does not leak
# into later tests.
4322 local cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-P-*')
4323 [ ! -z "$cname" ] ||
4324 error "(10) .lustre/lost+found/MDT0001/ should not be empty"
4326 run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
# Test 28 (registered by the run_test line that ends this block).
# Skipped when fewer than 2 MDTs are configured. Two cross-MDT directory
# trees are built and one child of each (d1/a1, d2/a2) is removed while
# fail_loc 0x1624 is armed on both mds1 and mds2. A first LFSCK run with
# fail_loc 0x161c armed on mds2 must end "partial" on mds1 (0 lost dirents
# repaired) and "completed" on mds2 (1 repaired). A second run without the
# fault must then repair 1 on mds1 and 0 on mds2.
4329 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4332 echo "The target name entry is lost. The LFSCK should insert the"
4333 echo "orphan MDT-object under .lustre/lost+found/MDTxxxx. But if"
4334 echo "the MDT (on which the orphan MDT-object resides) has ever"
4335 echo "failed to respond some name entry verification during the"
4336 echo "first stage-scanning, then the LFSCK should skip to handle"
4337 echo "orphan MDT-object on this MDT. But other MDTs should not"
4341 check_mount_and_prep
# NOTE(review): none of the six mkdir calls below checks its exit status,
# unlike the mkdir calls in the neighbouring tests.
4342 $LFS mkdir -i 0 $DIR/$tdir/d1
4343 $LFS mkdir -i 1 $DIR/$tdir/d1/a1
4344 $LFS mkdir -i 1 $DIR/$tdir/d1/a2
4346 $LFS mkdir -i 1 $DIR/$tdir/d2
4347 $LFS mkdir -i 0 $DIR/$tdir/d2/a1
4348 $LFS mkdir -i 0 $DIR/$tdir/d2/a2
4350 echo "Inject failure stub on MDT0 to simulate the case that"
4351 echo "d1/a1's name entry will be removed, but the d1/a1's object"
4352 echo "and its linkEA are kept in the system. And the case that"
4353 echo "d2/a2's name entry will be removed, but the d2/a2's object"
4354 echo "and its linkEA are kept in the system."
4356 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# Armed on both MDS facets around the two rmdir calls, then cleared.
4357 do_facet mds1 $LCTL set_param fail_loc=0x1624
4358 do_facet mds2 $LCTL set_param fail_loc=0x1624
4359 rmdir $DIR/$tdir/d1/a1 || error "(1) Fail to rmdir $DIR/$tdir/d1/a1"
4360 rmdir $DIR/$tdir/d2/a2 || error "(2) Fail to rmdir $DIR/$tdir/d2/a2"
4361 do_facet mds1 $LCTL set_param fail_loc=0
4362 do_facet mds2 $LCTL set_param fail_loc=0
4364 cancel_lru_locks mdc
4365 cancel_lru_locks osc
4367 echo "Inject failure, to simulate the MDT0 fail to handle"
4368 echo "MDT1 LFSCK request during the first-stage scanning."
4369 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
4370 do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0
4372 echo "Trigger namespace LFSCK on all devices to find out orphan object"
4373 $START_NAMESPACE -r -A ||
4374 error "(3) Fail to start LFSCK for namespace"
# First run: mds1 must end in "partial", mds2 in "completed".
4376 wait_update_facet mds1 "$LCTL get_param -n \
4377 mdd.$(facet_svc mds1).lfsck_namespace |
4378 awk '/^status/ { print \\\$2 }'" "partial" 32 || {
4379 error "(4) mds1 is not the expected 'partial'"
4382 wait_update_facet mds2 "$LCTL get_param -n \
4383 mdd.$(facet_svc mds2).lfsck_namespace |
4384 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4385 error "(5) mds2 is not the expected 'completed'"
4388 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
4390 local repaired=$(do_facet mds1 $LCTL get_param -n \
4391 mdd.$(facet_svc mds1).lfsck_namespace |
4392 awk '/^lost_dirent_repaired/ { print $2 }')
4393 [ $repaired -eq 0 ] ||
4394 error "(6) Expect 0 fixed on mds1, but got: $repaired"
4396 repaired=$(do_facet mds2 $LCTL get_param -n \
4397 mdd.$(facet_svc mds2).lfsck_namespace |
4398 awk '/^lost_dirent_repaired/ { print $2 }')
4399 [ $repaired -eq 1 ] ||
4400 error "(7) Expect 1 fixed on mds2, but got: $repaired"
4402 echo "Trigger namespace LFSCK on all devices again to cleanup"
4403 $START_NAMESPACE -r -A ||
4404 error "(8) Fail to start LFSCK for namespace"
4406 wait_all_targets_blocked namespace completed 9
# Second run: the expected counts are swapped (1 on mds1, 0 on mds2).
4408 local repaired=$(do_facet mds1 $LCTL get_param -n \
4409 mdd.$(facet_svc mds1).lfsck_namespace |
4410 awk '/^lost_dirent_repaired/ { print $2 }')
4411 [ $repaired -eq 1 ] ||
4412 error "(10) Expect 1 fixed on mds1, but got: $repaired"
4414 repaired=$(do_facet mds2 $LCTL get_param -n \
4415 mdd.$(facet_svc mds2).lfsck_namespace |
4416 awk '/^lost_dirent_repaired/ { print $2 }')
4417 [ $repaired -eq 0 ] ||
4418 error "(11) Expect 0 fixed on mds2, but got: $repaired"
4420 run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
# Test 29a body. Its run_test line at the end of this block is commented
# out, so this test is not registered. A hard link is created while
# fail_loc 0x1625 is armed, the test checks that stat then reports 3 links,
# and namespace LFSCK is expected to report one nlinks_repaired and bring
# the link count back to 2.
4424 echo "The object's nlink attribute is larger than the object's known"
4425 echo "name entries count. The LFSCK will repair the object's nlink"
4426 echo "attribute to match the known name entries count"
4429 check_mount_and_prep
4431 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4432 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4434 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4435 echo "nlink attribute is larger than its name entries count."
4437 #define OBD_FAIL_LFSCK_MORE_NLINK 0x1625
# The fault is armed only around the ln and cleared right after it.
4438 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
4439 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4440 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4441 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Drop cached mdc locks before stat, then verify the injection took effect:
# two names exist (foo, h1) but 3 links must be reported.
4443 cancel_lru_locks mdc
4444 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
4445 [ $count -eq 3 ] || error "(4) Cannot inject error: $count"
4447 echo "Trigger namespace LFSCK to repair the nlink count"
4448 $START_NAMESPACE -r -A ||
4449 error "(5) Fail to start LFSCK for namespace"
4451 wait_all_targets_blocked namespace completed 6
4453 local repaired=$($SHOW_NAMESPACE |
4454 awk '/^nlinks_repaired/ { print $2 }')
4455 [ $repaired -eq 1 ] ||
4456 error "(7) Fail to repair nlink count: $repaired"
4458 cancel_lru_locks mdc
4459 count=$(stat --format=%h $DIR/$tdir/d0/foo)
4460 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
4462 # Disable 29a, we only allow nlink to be updated if the known linkEA
4463 # entries is larger than nlink count.
4465 #run_test 29a "LFSCK can repair bad nlink count (1)"
# Test 29b (registered by the run_test line that ends this block).
# A hard link is created while fail_loc 0x1626 is armed, the test checks
# that stat then reports only 1 link, and namespace LFSCK is expected to
# report one nlinks_repaired and bring the link count up to 2.
4469 echo "The object's nlink attribute is smaller than the object's known"
4470 echo "name entries count. The LFSCK will repair the object's nlink"
4471 echo "attribute to match the known name entries count"
4474 check_mount_and_prep
4476 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4477 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4479 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4480 echo "nlink attribute is smaller than its name entries count."
4482 #define OBD_FAIL_LFSCK_LESS_NLINK 0x1626
# The fault is armed only around the ln and cleared right after it.
4483 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
4484 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4485 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4486 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Drop cached mdc locks before stat, then verify the injection took effect:
# two names exist (foo, h1) but only 1 link must be reported.
4488 cancel_lru_locks mdc
4489 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
4490 [ $count -eq 1 ] || error "(4) Cannot inject error: $count"
4492 echo "Trigger namespace LFSCK to repair the nlink count"
4493 $START_NAMESPACE -r -A ||
4494 error "(5) Fail to start LFSCK for namespace"
4496 wait_all_targets_blocked namespace completed 6
4498 local repaired=$($SHOW_NAMESPACE |
4499 awk '/^nlinks_repaired/ { print $2 }')
4500 [ $repaired -eq 1 ] ||
4501 error "(7) Fail to repair nlink count: $repaired"
4503 cancel_lru_locks mdc
4504 count=$(stat --format=%h $DIR/$tdir/d0/foo)
4505 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
4507 run_test 29b "LFSCK can repair bad nlink count (2)"
# Test 29c (registered by the run_test line that ends this block).
# 150 hard links to guard/f0 are created under foo/, and 100 of them are
# removed again. With at least 2 MDTs, migrating guard is expected to fail
# both before and after the removal. Namespace LFSCK must then report one
# linkea_overflow_cleared and zero nlinks_repaired, after which (with at
# least 2 MDTs) the migration must succeed and f0's FID must change.
4512 echo "The namespace LFSCK will create many hard links to the target"
4513 echo "file as to exceed the linkEA size limitation. Under such case"
4514 echo "the linkEA will be marked as overflow that will prevent the"
4515 echo "target file to be migrated. Then remove some hard links to"
4516 echo "make the left hard links to be held within the linkEA size"
4517 echo "limitation. But before the namespace LFSCK adding all the"
4518 echo "missed linkEA entries back, the overflow mark (timestamp)"
4519 echo "will not be cleared."
4522 check_mount_and_prep
4524 mkdir -p $DIR/$tdir/guard || error "(0.1) Fail to mkdir"
# foo is created on the MDT with the highest index (MDSCOUNT - 1).
4525 $LFS mkdir -i $((MDSCOUNT - 1)) $DIR/$tdir/foo ||
4526 error "(0.2) Fail to mkdir"
4527 touch $DIR/$tdir/guard/f0 || error "(1) Fail to create"
# FID before any migration attempt; an unchanged FID later is taken as
# proof that a migration did not happen.
4528 local oldfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4530 # define MAX_LINKEA_SIZE 4096
4531 # sizeof(link_ea_header) = 24
4532 # sizeof(link_ea_entry) = 18
4533 # nlink_min=$(((MAX_LINKEA_SIZE - sizeof(link_ea_header)) /
4534 # (sizeof(link_ea_entry) + name_length))
4535 # If the average name length is 12 bytes, then 150 hard links
4536 # is totally enough to overflow the linkEA
4537 echo "Create 150 hard links should succeed although the linkEA overflow"
4538 createmany -l $DIR/$tdir/guard/f0 $DIR/$tdir/foo/ttttttttttt 150 ||
4539 error "(2) Fail to hard link"
4541 cancel_lru_locks mdc
4542 if [ $MDSCOUNT -ge 2 ]; then
4543 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null &&
4544 error "(3.1) Migrate should fail"
4546 echo "The object with linkEA overflow should NOT be migrated"
4547 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4548 [ "$newfid" == "$oldfid" ] ||
4549 error "(3.2) Migrate should fail: $newfid != $oldfid"
4552 # Remove 100 hard links, then the linkEA should have space
4553 # to hold the missed linkEA entries.
4554 echo "Remove 100 hard links to save space for the missed linkEA entries"
4555 unlinkmany $DIR/$tdir/foo/ttttttttttt 100 || error "(4) Fail to unlink"
4557 if [ $MDSCOUNT -ge 2 ]; then
4558 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null &&
4559 error "(5.1) Migrate should fail"
4561 # The overflow timestamp is still there, so migration will fail.
4562 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4563 [ "$newfid" == "$oldfid" ] ||
4564 error "(5.2) Migrate should fail: $newfid != $oldfid"
4567 # sleep 3 seconds to guarantee that the overflow is recognized
4570 echo "Trigger namespace LFSCK to clear the overflow timestamp"
4571 $START_NAMESPACE -r -A ||
4572 error "(6) Fail to start LFSCK for namespace"
4574 wait_all_targets_blocked namespace completed 7
# Exactly one overflow mark cleared, and no nlink repair, are expected.
4576 local repaired=$($SHOW_NAMESPACE |
4577 awk '/^linkea_overflow_cleared/ { print $2 }')
4578 [ $repaired -eq 1 ] ||
4579 error "(8) Fail to clear linkea overflow: $repaired"
4581 repaired=$($SHOW_NAMESPACE |
4582 awk '/^nlinks_repaired/ { print $2 }')
4583 [ $repaired -eq 0 ] ||
4584 error "(9) Unexpected nlink repaired: $repaired"
4586 if [ $MDSCOUNT -ge 2 ]; then
4587 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
4588 error "(10.1) Migrate failure"
4590 # Migration should succeed after clear the overflow timestamp.
4591 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4592 [ "$newfid" != "$oldfid" ] ||
4593 error "(10.2) Migrate should succeed"
4595 ls -l $DIR/$tdir/foo > /dev/null ||
4596 error "(11) 'ls' failed after migration"
4599 rm -f $DIR/$tdir/guard/f0 || error "(12) Fail to unlink f0"
4600 rm -rf $DIR/$tdir/foo || error "(13) Fail to rmdir foo"
4602 run_test 29c "verify linkEA size limitation"
# Test 30 (registered by the run_test line that ends this block).
# Skipped unless the MDS backend is ldiskfs. Directory d0 is created while
# fail_loc 0x161d is armed, then d0, its children and f0 are removed while
# fail_loc 0x1624 is armed. The client is unmounted, the MDS is stopped,
# e2fsck -y is run on the MDT device and the MDS is restarted. Namespace
# LFSCK must then report at least 4 local_lost_found_moved; f0 must be
# back under foo/, and d0 (with d1 and f1 inside) must appear as
# <cfid>-<pfid>-D-0 under .lustre/lost+found/MDT0000/.
4605 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
4606 skip "ldiskfs only test" && return
4609 echo "The namespace LFSCK will move the orphans from backend"
4610 echo "/lost+found directory to normal client visible namespace"
4611 echo "or to global visible ./lustre/lost+found/MDTxxxx/ directory"
4614 check_mount_and_prep
4616 $LFS mkdir -i 0 $DIR/$tdir/foo || error "(1) Fail to mkdir foo"
4617 touch $DIR/$tdir/foo/f0 || error "(2) Fail to touch f1"
4619 echo "Inject failure stub on MDT0 to simulate the case that"
4620 echo "directory d0 has no linkEA entry, then the LFSCK will"
4621 echo "move it into .lustre/lost+found/MDTxxxx/ later."
4623 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
# The fault is armed only around the mkdir of d0 and cleared afterwards.
4624 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
4625 mkdir $DIR/$tdir/foo/d0 || error "(3) Fail to mkdir d0"
4626 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Parent and child FIDs; together they form the expected orphan name.
4628 local pfid=$($LFS path2fid $DIR/$tdir/foo)
4629 local cfid=$($LFS path2fid $DIR/$tdir/foo/d0)
4631 touch $DIR/$tdir/foo/d0/f1 || error "(4) Fail to touch f1"
4632 mkdir $DIR/$tdir/foo/d0/d1 || error "(5) Fail to mkdir d1"
4634 echo "Inject failure stub on MDT0 to simulate the case that the"
4635 echo "object's name entry will be removed, but not destroy the"
4636 echo "object. Then backend e2fsck will handle it as orphan and"
4637 echo "add them into the backend /lost+found directory."
4639 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fault stays armed across all four removals and is cleared afterwards.
4640 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4641 rmdir $DIR/$tdir/foo/d0/d1 || error "(6) Fail to rmdir d1"
4642 rm -f $DIR/$tdir/foo/d0/f1 || error "(7) Fail to unlink f1"
4643 rmdir $DIR/$tdir/foo/d0 || error "(8) Fail to rmdir d0"
4644 rm -f $DIR/$tdir/foo/f0 || error "(9) Fail to unlink f0"
4645 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Take the client and the MDS offline so e2fsck can run on the MDT device.
4647 umount_client $MOUNT || error "(10) Fail to stop client!"
4649 stop $SINGLEMDS || error "(11) Fail to stop MDT0"
4652 run_e2fsck $(facet_host $SINGLEMDS) $MDT_DEVNAME "-y" ||
4653 error "(12) Fail to run e2fsck"
4655 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
4656 error "(13) Fail to start MDT0"
4658 echo "Trigger namespace LFSCK to recover backend orphans"
4659 $START_NAMESPACE -r -A ||
4660 error "(14) Fail to start LFSCK for namespace"
4662 wait_all_targets_blocked namespace completed 15
4664 local repaired=$($SHOW_NAMESPACE |
4665 awk '/^local_lost_found_moved/ { print $2 }')
4666 [ $repaired -ge 4 ] ||
4667 error "(16) Fail to recover backend orphans: $repaired"
4669 mount_client $MOUNT || error "(17) Fail to start client!"
4671 stat $DIR/$tdir/foo/f0 || error "(18) f0 is not recovered"
4673 ls -ail $MOUNT/.lustre/lost+found/
4675 echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
4676 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4677 error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4679 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
4681 local cname=$MOUNT/.lustre/lost+found/MDT0000/${cfid}-${pfid}-D-0
# cname is a path string that was just assigned, so testing it for
# non-emptiness could never fail; check that the orphan directory really
# exists so that error (20) is reachable.
4682 [ -d "$cname" ] || error "(20) d0 is not recovered"
4684 stat ${cname}/d1 || error "(21) d1 is not recovered"
4685 stat ${cname}/f1 || error "(22) f1 is not recovered"
4687 run_test 30 "LFSCK can recover the orphans from backend /lost+found"
# Test 31a (registered by the run_test line that ends this block).
# Skipped when fewer than 2 MDTs are configured. A striped directory with
# MDSCOUNT stripes is created, and MDSCOUNT sub-directories are created in
# it while fail_loc 0x1628 (fail_val=0) is set on the client. Namespace
# LFSCK must report at least one name_hash_repaired; after a client
# remount every sub-directory must be stat-able and removable, and finally
# the striped directory itself must be removable.
4690 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4693 echo "For the name entry under a striped directory, if the name"
4694 echo "hash does not match the shard, then the LFSCK will repair"
4695 echo "the bad name entry"
4698 check_mount_and_prep
4700 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4701 error "(1) Fail to create striped directory"
4703 echo "Inject failure stub on client to simulate the case that"
4704 echo "some name entry should be inserted into other non-first"
4705 echo "shard, but inserted into the first shard by wrong"
4707 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
# set_param is run locally here (no do_facet), i.e. on the client node.
4708 $LCTL set_param fail_loc=0x1628 fail_val=0
4709 createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
4710 error "(2) Fail to create file under striped directory"
4711 $LCTL set_param fail_loc=0 fail_val=0
4713 echo "Trigger namespace LFSCK to repair bad name hash"
4714 $START_NAMESPACE -r -A ||
4715 error "(3) Fail to start LFSCK for namespace"
4717 wait_all_targets_blocked namespace completed 4
4719 local repaired=$($SHOW_NAMESPACE |
4720 awk '/^name_hash_repaired/ { print $2 }')
4721 [ $repaired -ge 1 ] ||
4722 error "(5) Fail to repair bad name hash: $repaired"
# Remount the client before checking the repaired entries.
4724 umount_client $MOUNT || error "(6) umount failed"
4725 mount_client $MOUNT || error "(7) mount failed"
# Every created sub-directory d0..d(MDSCOUNT-1) must be reachable and
# removable.
4727 for ((i = 0; i < $MDSCOUNT; i++)); do
4728 stat $DIR/$tdir/striped_dir/d$i ||
4729 error "(8) Fail to stat d$i after LFSCK"
4730 rmdir $DIR/$tdir/striped_dir/d$i ||
4731 error "(9) Fail to unlink d$i after LFSCK"
4734 rmdir $DIR/$tdir/striped_dir ||
4735 error "(10) Fail to remove the striped directory after LFSCK"
4737 run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
# test_31b: same scenario as the fail_val=0 variant, but the failure stub is
# armed with fail_val=1 and the name_hash_repaired counter is read from mds2.
# Skipped when fewer than 2 MDTs are configured.
test_31b() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return

	local striped_dir=$DIR/$tdir/striped_dir
	local repaired
	local i

	echo "For the name entry under a striped directory, if the name"
	echo "hash does not match the shard, then the LFSCK will repair"
	echo "the bad name entry"

	check_mount_and_prep

	$LFS setdirstripe -i 0 -c $MDSCOUNT $striped_dir ||
		error "(1) Fail to create striped directory"

	echo "Inject failure stub on client to simulate the case that"
	echo "some name entry should be inserted into other non-second"
	echo "shard, but inserted into the secod shard by wrong"

	#define OBD_FAIL_LFSCK_BAD_NAME_HASH	0x1628
	$LCTL set_param fail_loc=0x1628 fail_val=1
	createmany -d $striped_dir/d $MDSCOUNT ||
		error "(2) Fail to create file under striped directory"
	$LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger namespace LFSCK to repair bad name hash"
	$START_NAMESPACE -r -A ||
		error "(3) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 4

	# The counter is taken from the second MDT, not from $SINGLEMDS.
	repaired=$(do_facet mds2 $LCTL get_param -n \
		   mdd.$(facet_svc mds2).lfsck_namespace |
		   awk '/^name_hash_repaired/ { print $2 }')
	[ "$repaired" -ge 1 ] ||
		error "(5) Fail to repair bad name hash: $repaired"

	# Remount the client before accessing the repaired entries.
	umount_client $MOUNT || error "(6) umount failed"
	mount_client $MOUNT || error "(7) mount failed"

	for ((i = 0; i < MDSCOUNT; i++)); do
		stat $striped_dir/d$i ||
			error "(8) Fail to stat d$i after LFSCK"
		rmdir $striped_dir/d$i ||
			error "(9) Fail to unlink d$i after LFSCK"
	done

	rmdir $striped_dir ||
		error "(10) Fail to remove the striped directory after LFSCK"
}
run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
# test_31c: create a striped directory while fail_loc 0x1629 is armed on
# $SINGLEMDS (master LMV EA lost), run the namespace LFSCK and expect exactly
# one striped_dirs_repaired; afterwards the directory must list as empty and
# be removable. Skipped when fewer than 2 MDTs are configured.
test_31c() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return

	local striped_dir=$DIR/$tdir/striped_dir
	local repaired
	local empty

	echo "For some reason, the master MDT-object of the striped directory"
	echo "may lost its master LMV EA. If nobody created files under the"
	echo "master directly after the master LMV EA lost, then the LFSCK"
	echo "should re-generate the master LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "master MDT-object of the striped directory lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_MASTER_LMV	0x1629
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
	$LFS setdirstripe -i 0 -c $MDSCOUNT $striped_dir ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	echo "Trigger namespace LFSCK to re-generate master LMV EA"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 3

	repaired=$($SHOW_NAMESPACE |
		   awk '/^striped_dirs_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to re-generate master LMV EA: $repaired"

	umount_client $MOUNT || error "(5) umount failed"
	mount_client $MOUNT || error "(6) mount failed"

	# The assignment is kept apart from "local" so that a failing ls is
	# not mistaken for an empty (i.e. healthy) directory.
	empty=$(ls $striped_dir/) ||
		error "(7) Fail to list $striped_dir"
	[ -z "$empty" ] || error "(7) The master LMV EA is not repaired: $empty"

	rmdir $striped_dir ||
		error "(8) Fail to remove the striped directory after LFSCK"
}
run_test 31c "Re-generate the lost master LMV EA for striped directory"
# test_31d: like the lost-master-LMV-EA case, but a file is created under the
# broken directory before the LFSCK runs. The LFSCK must then report zero
# striped_dirs_repaired, the existing file must stay visible, and further
# creates must fail until "chattr -i" is applied to the directory.
# Skipped when fewer than 2 MDTs are configured.
test_31d() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return

	local striped_dir=$DIR/$tdir/striped_dir
	local repaired

	echo "For some reason, the master MDT-object of the striped directory"
	echo "may lost its master LMV EA. If somebody created files under the"
	echo "master directly after the master LMV EA lost, then the LFSCK"
	echo "should NOT re-generate the master LMV EA, instead, it should"
	echo "change the broken striped dirctory as read-only to prevent"
	echo "further damage"

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "master MDT-object of the striped directory lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_MASTER_LMV	0x1629
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
	$LFS setdirstripe -i 0 -c $MDSCOUNT $striped_dir ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0

	umount_client $MOUNT || error "(2) umount failed"
	mount_client $MOUNT || error "(3) mount failed"

	# Modify the broken directory before the LFSCK gets to see it.
	touch $striped_dir/dummy ||
		error "(4) Fail to touch under broken striped directory"

	echo "Trigger namespace LFSCK to find out the inconsistency"
	$START_NAMESPACE -r -A ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	# "[" (not "[[") on purpose: an empty counter must fail, not count as 0.
	repaired=$($SHOW_NAMESPACE |
		   awk '/^striped_dirs_repaired/ { print $2 }')
	[ "$repaired" -eq 0 ] ||
		error "(7) Re-generate master LMV EA unexpected: $repaired"

	stat $striped_dir/dummy ||
		error "(8) Fail to stat $DIR/$tdir/striped_dir/dummy"

	# A successful create here means the directory was not protected.
	touch $striped_dir/foo &&
		error "(9) The broken striped directory should be read-only"

	chattr -i $striped_dir ||
		error "(10) Fail to chattr on the broken striped directory"

	rmdir $striped_dir ||
		error "(11) Fail to remove the striped directory after LFSCK"
}
run_test 31d "Set broken striped directory (modified after broken) as read-only"
# test_31e: create a striped directory while fail_loc 0x162a / fail_val=0 is
# armed on $SINGLEMDS (slave LMV EA lost), then expect the namespace LFSCK on
# $SINGLEMDS to report exactly one striped_shards_repaired.
# Skipped when fewer than 2 MDTs are configured.
test_31e() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return

	local striped_dir=$DIR/$tdir/striped_dir
	local repaired

	echo "For some reason, the slave MDT-object of the striped directory"
	echo "may lost its slave LMV EA. The LFSCK should re-generate the"
	echo "slave LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "slave MDT-object (that resides on the same MDT as the master"
	echo "MDT-object resides on) lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_SLAVE_LMV	0x162a
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=0
	$LFS setdirstripe -i 0 -c $MDSCOUNT $striped_dir ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to re-generate slave LMV EA"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 3

	repaired=$($SHOW_NAMESPACE |
		   awk '/^striped_shards_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to re-generate slave LMV EA: $repaired"

	rmdir $striped_dir ||
		error "(5) Fail to remove the striped directory after LFSCK"
}
run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
# test_31f: same as the fail_val=0 slave-LMV-EA case, but the stub is armed
# with fail_val=1 and the striped_shards_repaired counter is read from mds2.
# Skipped when fewer than 2 MDTs are configured.
test_31f() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return

	local striped_dir=$DIR/$tdir/striped_dir
	local repaired

	echo "For some reason, the slave MDT-object of the striped directory"
	echo "may lost its slave LMV EA. The LFSCK should re-generate the"
	echo "slave LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "slave MDT-object (that resides on different MDT as the master"
	echo "MDT-object resides on) lost the LMV EA."

	#define OBD_FAIL_LFSCK_LOST_SLAVE_LMV	0x162a
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=1
	$LFS setdirstripe -i 0 -c $MDSCOUNT $striped_dir ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to re-generate slave LMV EA"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 3

	# The counter is taken from the second MDT, not from $SINGLEMDS.
	repaired=$(do_facet mds2 $LCTL get_param -n \
		   mdd.$(facet_svc mds2).lfsck_namespace |
		   awk '/^striped_shards_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to re-generate slave LMV EA: $repaired"

	rmdir $striped_dir ||
		error "(5) Fail to remove the striped directory after LFSCK"
}
run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
# test_31g: create a striped directory while fail_loc 0x162b is armed on
# $SINGLEMDS (corrupted stripe index in a slave LMV EA), expect one
# striped_shards_repaired from the namespace LFSCK, then verify that files
# can be created and removed in the directory after a client remount.
# Skipped when fewer than 2 MDTs are configured.
test_31g() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return

	local striped_dir=$DIR/$tdir/striped_dir
	local repaired

	echo "For some reason, the stripe index in the slave LMV EA is"
	echo "corrupted. The LFSCK should repair the slave LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "slave LMV EA on the first shard of the striped directory"
	echo "claims the same index as the second shard claims"

	#define OBD_FAIL_LFSCK_BAD_SLAVE_LMV	0x162b
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162b fail_val=0
	$LFS setdirstripe -i 0 -c $MDSCOUNT $striped_dir ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to repair the slave LMV EA"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 3

	repaired=$($SHOW_NAMESPACE |
		   awk '/^striped_shards_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to repair slave LMV EA: $repaired"

	umount_client $MOUNT || error "(5) umount failed"
	mount_client $MOUNT || error "(6) mount failed"

	touch $striped_dir/foo ||
		error "(7) Fail to touch file after the LFSCK"

	rm -f $striped_dir/foo ||
		error "(8) Fail to unlink file after the LFSCK"

	rmdir $striped_dir ||
		error "(9) Fail to remove the striped directory after LFSCK"
}
run_test 31g "Repair the corrupted slave LMV EA"
# test_31h: create a striped directory while fail_loc 0x162c is armed on
# $SINGLEMDS (corrupted shard name entry), expect one dirent_repaired from
# the namespace LFSCK, then verify that files can be created and removed in
# the directory after a client remount.
# Skipped when fewer than 2 MDTs are configured.
test_31h() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return

	local striped_dir=$DIR/$tdir/striped_dir
	local repaired

	echo "For some reason, the shard's name entry in the striped"
	echo "directory may be corrupted. The LFSCK should repair the"
	echo "bad shard's name entry."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "first shard's name entry in the striped directory claims"
	echo "the same index as the second shard's name entry claims."

	#define OBD_FAIL_LFSCK_BAD_SLAVE_NAME	0x162c
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162c fail_val=0
	$LFS setdirstripe -i 0 -c $MDSCOUNT $striped_dir ||
		error "(1) Fail to create striped directory"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to repair the shard's name entry"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 3

	repaired=$($SHOW_NAMESPACE |
		   awk '/^dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to repair shard's name entry: $repaired"

	umount_client $MOUNT || error "(5) umount failed"
	mount_client $MOUNT || error "(6) mount failed"

	touch $striped_dir/foo ||
		error "(7) Fail to touch file after the LFSCK"

	rm -f $striped_dir/foo ||
		error "(8) Fail to unlink file after the LFSCK"

	rmdir $striped_dir ||
		error "(9) Fail to remove the striped directory after LFSCK"
}
run_test 31h "Repair the corrupted shard's name entry"
# test_32a: start a layout LFSCK that is slowed down by fail_loc 0x162d
# (fail_val=3), stop ost1 while the scan reports "scanning-phase1", and check
# that the LFSCK can still be stopped. ost1 is started again at the end.
test_32a() {
	local status

	umount_client $MOUNT

	#define OBD_FAIL_LFSCK_ENGINE_DELAY	0x162d
	do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
	$START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"

	status=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
	[ "$status" == "scanning-phase1" ] ||
		error "(2) Expect 'scanning-phase1', but got '$status'"

	stop ost1 > /dev/null || error "(3) Fail to stop OST1!"

	# Drop the delay so the engine is no longer held back.
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0

	$STOP_LFSCK || error "(4) Fail to stop LFSCK!"

	start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
		error "(5) Fail to start ost1"
}
run_test 32a "stop LFSCK when some OST failed"
# test_32b: start a namespace LFSCK on all targets that is slowed down by
# fail_loc 0x162d (fail_val=3), stop mds2 once $SINGLEMDS reports
# "scanning-phase1", and check that the LFSCK can still be stopped.
# mds2 is started again at the end.
# Skipped when fewer than 2 MDTs are configured.
test_32b() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return

	$LFS mkdir -i 1 $DIR/$tdir/dp ||
		error "(1) Fail to create $DIR/$tdir/dp"
	$LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc1 ||
		error "(2) Fail to create $DIR/$tdir/dp/dc1"
	$LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc2 ||
		error "(3) Fail to create $DIR/$tdir/dp/dc2"
	umount_client $MOUNT

	#define OBD_FAIL_LFSCK_ENGINE_DELAY	0x162d
	do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
	$START_NAMESPACE -r -A || error "(4) Fail to start LFSCK for namespace!"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "scanning-phase1" 32 ||
		error "(5) unexpected status"

	stop mds2 > /dev/null || error "(6) Fail to stop MDT2!"

	# Drop the delay so the engine is no longer held back.
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0

	$STOP_LFSCK || error "(7) Fail to stop LFSCK!"

	start mds2 $(mdsdevname 2) $MOUNT_OPTS_NOSCRUB > /dev/null ||
		error "(8) Fail to start MDT2"
}
run_test 32b "stop LFSCK when some MDT failed"
# test_33: check that the options given to lfsck_start show up in the
# "param" line of the LFSCK status: "--dryrun -o" for layout and
# "-e abort -A" for namespace.
test_33() {
	local params

	$START_LAYOUT --dryrun -o -r ||
		error "(1) Fail to start layout LFSCK"
	wait_all_targets_blocked layout completed 2

	params=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
	[ "$params" == "dryrun,all_targets,orphan" ] ||
		error "(3) Expect 'dryrun,all_targets,orphan', got '$params'"

	$START_NAMESPACE -e abort -A -r ||
		error "(4) Fail to start namespace LFSCK"
	wait_all_targets_blocked namespace completed 5

	params=$($SHOW_NAMESPACE | awk '/^param/ { print $2 }')
	[ "$params" == "failout,all_targets" ] ||
		error "(6) Expect 'failout,all_targets', got '$params'"
}
run_test 33 "check LFSCK paramters"
# test_34: create a remote directory on MDT index 1 while fail_loc 0x1630 is
# armed on $SINGLEMDS, then run the namespace LFSCK twice: the first run must
# report one dirent_repaired, the second one must report none.
# Skipped with fewer than 2 MDTs or when $SINGLEMDS is not ZFS based.
test_34() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
	[ $(facet_fstype $SINGLEMDS) != zfs ] &&
		skip "Only valid for ZFS backend" && return

	local repaired

	#define OBD_FAIL_LFSCK_NO_AGENTOBJ	0x1630
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1630
	$LFS mkdir -i 1 $DIR/$tdir/dummy ||
		error "(1) Fail to create $DIR/$tdir/dummy"

	do_facet $SINGLEMDS $LCTL set_param fail_loc=0
	$START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 ||
		error "(3) unexpected status"

	repaired=$($SHOW_NAMESPACE |
		   awk '/^dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to repair the lost agent object: $repaired"

	# Second pass: nothing should be left to repair.
	$START_NAMESPACE -r || error "(5) Fail to start LFSCK for namespace!"
	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 ||
		error "(6) unexpected status"

	# "[" (not "[[") on purpose: an empty counter must fail, not count as 0.
	repaired=$($SHOW_NAMESPACE | awk '/^dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 0 ] ||
		error "(7) Unexpected repairing: $repaired"
}
run_test 34 "LFSCK can rebuild the lost agent object"
# test_35: create a remote directory on MDT index 1 while fail_loc 0x1631 is
# armed on mds2, then run the namespace LFSCK on all targets twice with a
# stopall/setupall cycle in between: the first run must report one
# agent_entries_repaired on mds2, the second one must report none.
# Skipped when fewer than 2 MDTs are configured.
test_35() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return

	local repaired

	#define OBD_FAIL_LFSCK_NO_AGENTENT	0x1631
	do_facet mds2 $LCTL set_param fail_loc=0x1631
	$LFS mkdir -i 1 $DIR/$tdir/dummy ||
		error "(1) Fail to create $DIR/$tdir/dummy"

	do_facet mds2 $LCTL set_param fail_loc=0
	$START_NAMESPACE -A -r || error "(2) Fail to start LFSCK for namespace!"
	# Only mds2 is polled here, so the message names it explicitly
	# (no loop variable exists in this function).
	wait_update_facet mds2 "$LCTL get_param -n \
		mdd.$(facet_svc mds2).lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
		error "(3) MDS2 is not the expected 'completed'"

	repaired=$(do_facet mds2 $LCTL get_param -n \
		   mdd.$(facet_svc mds2).lfsck_namespace |
		   awk '/^agent_entries_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to repair the lost agent entry: $repaired"

	echo "stopall to cleanup object cache"
	# The announced stopall has to actually happen before setupall.
	stopall > /dev/null
	setupall > /dev/null

	$START_NAMESPACE -A -r || error "(5) Fail to start LFSCK for namespace!"
	wait_update_facet mds2 "$LCTL get_param -n \
		mdd.$(facet_svc mds2).lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
		error "(6) MDS2 is not the expected 'completed'"

	# "[" (not "[[") on purpose: an empty counter must fail, not count as 0.
	repaired=$(do_facet mds2 $LCTL get_param -n \
		   mdd.$(facet_svc mds2).lfsck_namespace |
		   awk '/^agent_entries_repaired/ { print $2 }')
	[ "$repaired" -eq 0 ] ||
		error "(7) Unexpected repairing: $repaired"
}
run_test 35 "LFSCK can rebuild the lost agent entry"
# test_36a: three 3-mirror files f0/f1/f2 each lose one mirror (id 1, 2 and 3
# respectively) via "lfs mirror split -d" while fail_loc 0x1616 is armed on
# mds1. The layout LFSCK (-r -o) must then report 9 repaired_orphan on mds1
# and every file must be back to 3 mirrors including the one that was split.
# Skipped when fewer than 3 OSTs are configured.
test_36a() {
	[ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return

	# ordinal[i] is the wording used in the messages for file f$i,
	# whose mirror with id i+1 is the one being split off.
	local -a ordinal=("1st" "2nd" "3rd")
	local file mirrors repaired cur_status
	local i k

	echo "The target MDT-object's LOV EA corrupted as to lose one of the "
	echo "mirrors information. The layout LFSCK should rebuild the LOV EA "
	echo "with the PFID EA of related OST-object(s) belong to the mirror."

	check_mount_and_prep

	for i in 0 1 2; do
		file=$DIR/$tdir/f$i
		$LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 \
			-N -E 2M -o 1,2 -E -1 -o 0 \
			-N -E 3M -o 2,0 -E -1 -o 1 $file ||
			error "($i) Fail to create mirror file $file"
	done

	for i in 0 1 2; do
		file=$DIR/$tdir/f$i
		dd if=/dev/zero of=$file bs=1M count=4 ||
			error "($((i + 3))) Fail to write $file"
	done

	for i in 0 1 2; do
		file=$DIR/$tdir/f$i
		$LFS mirror resync $file ||
			error "($((i + 6))) Fail to resync $file"
	done

	cancel_lru_locks mdc
	cancel_lru_locks osc

	for i in 0 1 2; do
		file=$DIR/$tdir/f$i
		$LFS getstripe $file ||
			error "($((i + 9))) Fail to getstripe for $file"
	done

	echo "Inject failure, to simulate the case of missing one mirror in LOV"
	#define OBD_FAIL_LFSCK_LOST_MDTOBJ	0x1616
	do_facet mds1 $LCTL set_param fail_loc=0x1616

	for i in 0 1 2; do
		file=$DIR/$tdir/f$i
		$LFS mirror split --mirror-id $((i + 1)) -d $file ||
			error "($((i + 12))) Fail to split ${ordinal[i]} mirror from $file"
	done

	do_facet mds1 $LCTL set_param fail_loc=0

	# The split mirror must no longer be listed ...
	for i in 0 1 2; do
		file=$DIR/$tdir/f$i
		$LFS getstripe $file | grep "lcme_mirror_id:.*$((i + 1))" &&
			error "($((i + 15))) The ${ordinal[i]} of mirror is not destroyed"
	done

	# ... and two mirrors must be left per file.
	for i in 0 1 2; do
		file=$DIR/$tdir/f$i
		mirrors=$($LFS getstripe -N $file)
		[ "$mirrors" -eq 2 ] ||
			error "($((i + 18))) $file has $mirrors mirrors"
	done

	echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
	$START_LAYOUT -r -o || error "(21) Fail to start LFSCK for layout!"

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query internal is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" 32 ||
			error "(22) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		cur_status=$(do_facet ost${k} $LCTL get_param -n \
			     obdfilter.$(facet_svc ost${k}).lfsck_layout |
			     awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(23) OST${k} Expect 'completed', but got '$cur_status'"
	done

	repaired=$(do_facet mds1 $LCTL get_param -n \
		   mdd.$(facet_svc mds1).lfsck_layout |
		   awk '/^repaired_orphan/ { print $2 }')
	[ "$repaired" -eq 9 ] ||
		error "(24) Expect 9 fixed on mds1, but got: $repaired"

	for i in 0 1 2; do
		file=$DIR/$tdir/f$i
		mirrors=$($LFS getstripe -N $file)
		[ "$mirrors" -eq 3 ] ||
			error "($((i + 25))) $file has $mirrors mirrors"
	done

	for i in 0 1 2; do
		file=$DIR/$tdir/f$i
		$LFS getstripe $file | grep "lcme_mirror_id:.*$((i + 1))" || {
			# Dump the full layout to help diagnosing the failure.
			$LFS getstripe $file
			error "($((i + 28))) The ${ordinal[i]} of mirror is not recovered"
		}
	done
}
run_test 36a "rebuild LOV EA for mirrored file (1)"
# test_36b: a 3-mirror file is removed while fail_loc 0x1616 is armed on mds1.
# The layout LFSCK (-r -o) must then report 9 repaired_orphan on mds1 and
# rebuild the file as .lustre/lost+found/MDT0000/<fid>-R-0 with 3 mirrors and
# 6 components.
# Skipped when fewer than 3 OSTs are configured.
test_36b() {
	[ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return

	local file=$DIR/$tdir/f0
	local fid name count cur_status
	local id k

	echo "The mirrored file lost its MDT-object, but relatd OST-objects "
	echo "are still there. The layout LFSCK should rebuild the LOV EA "
	echo "with the PFID EA of related OST-object(s) belong to the file. "

	check_mount_and_prep

	$LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
		-N -E 3M -o 2,0 -E -1 -o 1 $file ||
		error "(0) Fail to create mirror file $DIR/$tdir/f0"

	# Remember the FID: it names the recovered object further down.
	fid=$($LFS path2fid $file)

	dd if=/dev/zero of=$file bs=1M count=4 ||
		error "(1) Fail to write $DIR/$tdir/f0"
	$LFS mirror resync $file ||
		error "(2) Fail to resync $DIR/$tdir/f0"

	cancel_lru_locks mdc
	cancel_lru_locks osc

	$LFS getstripe $file ||
		error "(3) Fail to getstripe for $DIR/$tdir/f0"

	echo "Inject failure, to simulate the case of missing the MDT-object"
	#define OBD_FAIL_LFSCK_LOST_MDTOBJ	0x1616
	do_facet mds1 $LCTL set_param fail_loc=0x1616
	rm -f $file || error "(4) Fail to remove $DIR/$tdir/f0"

	do_facet mds1 $LCTL set_param fail_loc=0

	echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
	$START_LAYOUT -r -o || error "(5) Fail to start LFSCK for layout!"

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query internal is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" 32 ||
			error "(6) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		cur_status=$(do_facet ost${k} $LCTL get_param -n \
			     obdfilter.$(facet_svc ost${k}).lfsck_layout |
			     awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(7) OST${k} Expect 'completed', but got '$cur_status'"
	done

	count=$(do_facet mds1 $LCTL get_param -n \
		mdd.$(facet_svc mds1).lfsck_layout |
		awk '/^repaired_orphan/ { print $2 }')
	[ "$count" -eq 9 ] || error "(8) Expect 9 fixed on mds1, but got: $count"

	name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
	count=$($LFS getstripe --mirror-count $name)
	[ "$count" -eq 3 ] || error "(9) $DIR/$tdir/f0 has $count mirrors"

	count=$($LFS getstripe --component-count $name)
	[ "$count" -eq 6 ] || error "(10) $DIR/$tdir/f0 has $count components"

	$LFS getstripe $name | grep "lcme_mirror_id:.*1" || {
		$LFS getstripe $name
		error "(11) The 1st of mirror is not recovered"
	}

	$LFS getstripe $name | grep "lcme_mirror_id:.*2" || {
		$LFS getstripe $name
		error "(12) The 2nd of mirror is not recovered"
	}

	$LFS getstripe $name | grep "lcme_mirror_id:.*3" || {
		$LFS getstripe $name
		error "(13) The 3rd of mirror is not recovered"
	}
}
run_test 36b "rebuild LOV EA for mirrored file (2)"
# test_36c: a 2-mirror file is written, resynced and written again (leaving
# one mirror stale), then removed while fail_loc 0x1616 is armed on mds1.
# The layout LFSCK (-r -o) must report 6 repaired_orphan on mds1 and rebuild
# the file as .lustre/lost+found/MDT0000/<fid>-R-0 with 2 mirrors,
# 4 components and the lcme_flags that were recorded before the removal.
# Skipped when fewer than 3 OSTs are configured.
test_36c() {
	[ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return

	local file=$DIR/$tdir/f0
	local fid name count flags cur_status
	local saved_flags1 saved_flags2
	local k

	echo "The mirrored file has been modified, not resynced yet, then "
	echo "lost its MDT-object, but relatd OST-objects are still there. "
	echo "The layout LFSCK should rebuild the LOV EA and relatd status "
	echo "with the PFID EA of related OST-object(s) belong to the file. "

	check_mount_and_prep

	# The target path and the "||" belong to the setstripe command;
	# without them "error" would be passed to lfs as an argument.
	$LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
		$file ||
		error "(0) Fail to create mirror file $DIR/$tdir/f0"

	# Remember the FID: it names the recovered object further down.
	fid=$($LFS path2fid $file)

	# The 1st dd && resync makes all related OST-objects have been written
	dd if=/dev/zero of=$file bs=1M count=4 ||
		error "(1.1) Fail to write $DIR/$tdir/f0"
	$LFS mirror resync $file ||
		error "(1.2) Fail to resync $DIR/$tdir/f0"
	# The 2nd dd makes one mirror to be stale
	dd if=/dev/zero of=$file bs=1M count=4 ||
		error "(1.3) Fail to write $DIR/$tdir/f0"

	cancel_lru_locks mdc
	cancel_lru_locks osc

	$LFS getstripe $file ||
		error "(2) Fail to getstripe for $DIR/$tdir/f0"

	# lcme_flags found in the first / last 10 lines of the layout dump.
	saved_flags1=$($LFS getstripe $file | head -n 10 |
		       awk '/lcme_flags/ { print $2 }')
	saved_flags2=$($LFS getstripe $file | tail -n 10 |
		       awk '/lcme_flags/ { print $2 }')

	echo "Inject failure, to simulate the case of missing the MDT-object"
	#define OBD_FAIL_LFSCK_LOST_MDTOBJ	0x1616
	do_facet mds1 $LCTL set_param fail_loc=0x1616
	rm -f $file || error "(3) Fail to remove $DIR/$tdir/f0"

	do_facet mds1 $LCTL set_param fail_loc=0

	echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
	$START_LAYOUT -r -o || error "(4) Fail to start LFSCK for layout!"

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query internal is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" 32 ||
			error "(5) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		cur_status=$(do_facet ost${k} $LCTL get_param -n \
			     obdfilter.$(facet_svc ost${k}).lfsck_layout |
			     awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(6) OST${k} Expect 'completed', but got '$cur_status'"
	done

	# The message now states the value that is actually checked (6).
	count=$(do_facet mds1 $LCTL get_param -n \
		mdd.$(facet_svc mds1).lfsck_layout |
		awk '/^repaired_orphan/ { print $2 }')
	[ "$count" -eq 6 ] || error "(7) Expect 6 fixed on mds1, but got: $count"

	name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
	count=$($LFS getstripe --mirror-count $name)
	[ "$count" -eq 2 ] || error "(8) $DIR/$tdir/f0 has $count mirrors"

	count=$($LFS getstripe --component-count $name)
	[ "$count" -eq 4 ] || error "(9) $DIR/$tdir/f0 has $count components"

	flags=$($LFS getstripe $name | head -n 10 |
		awk '/lcme_flags/ { print $2 }')
	[ "$flags" == "$saved_flags1" ] || {
		$LFS getstripe $name
		error "(10) expect flags $saved_flags1, got $flags"
	}

	flags=$($LFS getstripe $name | tail -n 10 |
		awk '/lcme_flags/ { print $2 }')
	[ "$flags" == "$saved_flags2" ] || {
		$LFS getstripe $name
		error "(11) expect flags $saved_flags2, got $flags"
	}
}
run_test 36c "rebuild LOV EA for mirrored file (3)"
# test_37: keep a directory on MDT index 0 open through a paused multiop and
# run the namespace LFSCK on all targets until it reports "completed".
test_37() {
	local t_dir="$DIR/$tdir/d0"
	local PID

	check_mount_and_prep

	$LFS mkdir -i 0 $t_dir || error "(2) Fail to mkdir $t_dir on MDT0"
	multiop_bg_pause $t_dir D_c || { error "multiop failed: $?"; return 1; }
	# NOTE(review): assumes multiop_bg_pause leaves the paused multiop as
	# the most recent background job - confirm against test-framework.sh.
	PID=$!

	# Release the paused multiop before reporting the failure; a
	# command placed after error() may never be reached.
	$START_NAMESPACE -r -A || {
		kill -USR1 $PID
		error "(3) Fail to start LFSCK for namespace!"
	}

	wait_all_targets_blocked namespace completed 4
}
run_test 37 "LFSCK must skip a ORPHAN"
# Put back the MDS/OST sizing that was saved before the tests shrank it.
MDSSIZE="${SAVED_MDSSIZE}"
OSTSIZE="${SAVED_OSTSIZE}"
OSTCOUNT="${SAVED_OSTCOUNT}"

# Reformat and set up again with the restored values as the last step.
REFORMAT="yes" cleanup_and_setup_lustre

check_and_cleanup_lustre