# Suite preamble: build the exclusion list, source the test framework and
# the configuration, bail out on unsupported setups, then set up a clean,
# reformatted test environment for the tests below.
3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
11 #Bug number for excepting test LU-10406
12 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT 31c"
14 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
15 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
# Default LUSTRE to the parent of the directory that holds this script.
17 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
18 . $LUSTRE/tests/test-framework.sh
# Source the config file; CONFIG defaults to tests/cfg/$NAME.sh under LUSTRE.
20 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
# Exit with status 0 when require_dsh_mds fails.
23 require_dsh_mds || exit 0
# Skip when check_versions fails, i.e. under interoperation mode.
27 if ! check_versions; then
28 skip "It is NOT necessary to test lfsck under interoperation mode"
# Skip and exit unless the MDS version code is at least that of 2.3.60.
32 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.3.60) ]] &&
33 skip "Need MDS version at least 2.3.60" && exit 0
# Remember the configured sizes and OST count before they are overridden.
# NOTE(review): the code that restores them is not visible in this excerpt.
37 SAVED_MDSSIZE=${MDSSIZE}
38 SAVED_OSTSIZE=${OSTSIZE}
39 SAVED_OSTCOUNT=${OSTCOUNT}
40 # use small MDS + OST size to speed formatting time
41 # do not use too small MDSSIZE/OSTSIZE, which affect the default journal size
# Only the zfs overrides are visible here; units of 300000 are not shown.
43 [ $(facet_fstype $SINGLEMDS) == zfs ] && MDSSIZE=300000
45 [ $(facet_fstype ost1) == zfs ] && OSTSIZE=300000
47 # no need for too many OSTs; cap at 4 to reduce the format/start/stop overhead
49 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
51 # build up a clean test environment.
52 REFORMAT="yes" check_and_setup_lustre
# Extend ALWAYS_EXCEPT with the tests excluded for the running server
# versions (MDS up to 2.4.90, ost1 below 2.5.55, MDS below 2.6.50) and for
# a non-ldiskfs MDS backend.
54 [[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.4.90) ]] &&
55 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2c"
57 [[ $(lustre_version_code ost1) -lt $(version_code 2.5.55) ]] &&
58 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 11 12 13 14 15 16 17 18 19 20 21"
60 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.50) ]] &&
61 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2d 2e 3 22 23 24 25 26 27 28 29 30 31"
63 # DNE does not support striped directory on zfs-based backend yet.
64 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
65 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 31"
# Shared names and command strings used by the tests below.
# The command strings are expanded unquoted at the call sites (for example
# as $START_NAMESPACE -r), so extra options can be appended there.
# Names of the first MDT and first OST of the filesystem under test.
69 MDT_DEV="${FSNAME}-MDT0000"
70 OST_DEV="${FSNAME}-OST0000"
# Strip every "mds" substring from SINGLEMDS and pass the rest to mdsdevname.
71 MDT_DEVNAME=$(mdsdevname ${SINGLEMDS//mds/})
# Start LFSCK of type namespace or layout on the MDT, or layout on ost1.
72 START_NAMESPACE="do_facet $SINGLEMDS \
73 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
74 START_LAYOUT="do_facet $SINGLEMDS \
75 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
76 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
# Stop LFSCK on the MDT.
77 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
# Dump the LFSCK state parameters that the tests parse with awk.
78 SHOW_NAMESPACE="do_facet $SINGLEMDS \
79 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
80 SHOW_LAYOUT="do_facet $SINGLEMDS \
81 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
82 SHOW_LAYOUT_ON_OST="do_facet ost1 \
83 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# Mount option sets for starting the MDT: plain user_xattr, with noscrub,
# and with skip_lfsck.
84 MOUNT_OPTS_SCRUB="-o user_xattr"
85 MOUNT_OPTS_NOSCRUB="-o user_xattr,noscrub"
86 MOUNT_OPTS_SKIP_LFSCK="-o user_xattr,skip_lfsck"
# Body of a file-preparation helper; its header, the assignments of nfiles,
# ndirs and igif, and several closing keywords are not in this excerpt.
# It populates $DIR/$tdir with test scripts plus createmany-generated
# entries, optionally under the OBD_FAIL_FID_IGIF fail_loc.
95 echo "preparing... $nfiles * $ndirs files will be created $(date)."
# When igif is non-empty, enable fail_loc 0x1504 on the MDS first.
96 if [ ! -z $igif ]; then
97 #define OBD_FAIL_FID_IGIF 0x1504
98 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
101 cp $LUSTRE/tests/*.sh $DIR/$tdir/
# Create ndirs entries named d* (createmany -d) and f* (createmany -m),
# then nfiles entries inside each d<i>.
102 if [ $ndirs -gt 0 ]; then
103 createmany -d $DIR/$tdir/d $ndirs
104 createmany -m $DIR/$tdir/f $ndirs
105 if [ $nfiles -gt 0 ]; then
106 for ((i = 0; i < $ndirs; i++)); do
107 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
108 /dev/null || error "createmany $nfiles"
# NOTE(review): the lines around this createmany are missing, so the
# branch it belongs to cannot be determined from this excerpt.
111 createmany -d $DIR/$tdir/e $ndirs
# With igif set, create one more file and then clear the fail_loc.
114 if [ ! -z $igif ]; then
115 touch $DIR/$tdir/dummy
116 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
119 echo "prepared $(date)."
# Stop the MDS, run a read-only (-n) e2fsck on the MDT device, report an
# error on inconsistency, then start the MDT again with the noscrub mount
# options. Does nothing unless the MDS backend is ldiskfs.
# NOTE(review): the tail of the first e2fsck pipeline and the closing
# brace are not visible in this excerpt.
122 run_e2fsck_on_mdt0() {
123 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] && return
125 stop $SINGLEMDS > /dev/null || error "(0) Fail to the stop MDT0"
126 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n" |
128 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n"
129 error "(2) Detected inconsistency on MDT0"
131 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
132 error "(3) Fail to start MDT0"
# Check through lfsck_query on mds1 that all MDSCOUNT MDTs report the
# given status for the given LFSCK component, and fail otherwise.
# The variables com, status and err are assigned on lines that are not
# visible here (presumably from the positional arguments - confirm).
135 wait_all_targets_blocked() {
# Blocking query (-w): read the count from the <com>_mdts_<status> line.
140 local count=$(do_facet mds1 \
141 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 -w |
142 awk '/^${com}_mdts_${status}/ { print \\\$2 }'")
# On mismatch dump the full query output before raising the error.
143 [[ $count -eq $MDSCOUNT ]] || {
144 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
145 error "($err) only $count of $MDSCOUNT MDTs are in ${status}"
# NOTE(review): original lines are missing before this point; the polling
# variant below (no -w, retried through wait_update_facet with LTIME) may
# belong to a separate helper whose header is not visible.
154 wait_update_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 |
155 awk '/^${com}_mdts_${status}/ { print \\\$2 }'" \
156 "$MDSCOUNT" $LTIME || {
157 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
158 error "($err) some MDTs are not in ${status}"
# Body of the test registered below with run_test 0; its function header
# and some lines are not visible in this excerpt.
# Flow: inject a delay, start namespace LFSCK, stop it, resume it, let it
# complete, then reset and rerun to check the success counter.
165 #define OBD_FAIL_LFSCK_DELAY1 0x1600
166 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
167 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
169 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
# With the fail_loc set, the scan is expected to still be in phase 1.
171 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
172 [ "$STATUS" == "scanning-phase1" ] ||
173 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
# Manual stop must move the status to stopped.
175 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
177 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
178 [ "$STATUS" == "stopped" ] ||
179 error "(6) Expect 'stopped', but got '$STATUS'"
# Start again without -r and expect the scan to be in phase 1 again.
181 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
183 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
184 [ "$STATUS" == "scanning-phase1" ] ||
185 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the fail_loc and wait for the status to become completed
# (32 is the last wait_update_facet argument; presumably a timeout).
187 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
188 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
189 mdd.${MDT_DEV}.lfsck_namespace |
190 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
192 error "(9) unexpected status"
# No fail_loc was active while files were created, so nothing should
# have been repaired.
195 local repaired=$($SHOW_NAMESPACE |
196 awk '/^updated_phase1/ { print $2 }')
197 [ $repaired -eq 0 ] ||
198 error "(10) Expect nothing to be repaired, but got: $repaired"
# A reset run (-r) that completes must raise success_count by exactly one.
200 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
201 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
202 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
203 mdd.${MDT_DEV}.lfsck_namespace |
204 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
206 error "(12) unexpected status"
209 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
210 [ $((scanned1 + 1)) -eq $scanned2 ] ||
211 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
213 echo "stopall, should NOT crash LU-3649"
214 stopall || error "(14) Fail to stopall"
216 run_test 0 "Control LFSCK manually"
# Body of the test registered below with run_test 1a; its function header
# and some lines are not visible in this excerpt.
# Create one file under fail_loc 0x1501, run namespace LFSCK, and expect
# exactly one repaired dirent.
221 #define OBD_FAIL_FID_INDIR 0x1501
222 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
223 touch $DIR/$tdir/dummy
225 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
227 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
228 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
229 mdd.${MDT_DEV}.lfsck_namespace |
230 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
232 error "(4) unexpected status"
# Read dirent_repaired; fall back to updated_phase1 when it is absent.
235 local repaired=$($SHOW_NAMESPACE |
236 awk '/^dirent_repaired/ { print $2 }')
237 # for interop with old server
238 [ -z "$repaired" ] &&
239 repaired=$($SHOW_NAMESPACE |
240 awk '/^updated_phase1/ { print $2 }')
242 [ $repaired -eq 1 ] ||
243 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
# Mount the client and list the directory while fail_loc 0x1505 is set.
247 mount_client $MOUNT || error "(6) Fail to start client!"
249 #define OBD_FAIL_FID_LOOKUP 0x1505
250 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
251 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
253 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
255 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
# Body of the test registered below with run_test 1b; its function header
# and some lines are not visible in this excerpt.
# Runs only on an ldiskfs MDS. Create one file under fail_loc 0x1502, run
# namespace LFSCK under fail_loc 0x1506, and expect exactly one repair.
259 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
260 skip "OI Scrub not implemented for ZFS" && return
264 #define OBD_FAIL_FID_INLMA 0x1502
265 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
266 touch $DIR/$tdir/dummy
268 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# This fail_loc stays set for the whole LFSCK run and is cleared afterwards.
270 #define OBD_FAIL_FID_NOLMA 0x1506
271 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
272 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
273 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
274 mdd.${MDT_DEV}.lfsck_namespace |
275 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
277 error "(4) unexpected status"
# Read dirent_repaired; fall back to updated_phase1 when it is absent.
280 local repaired=$($SHOW_NAMESPACE |
281 awk '/^dirent_repaired/ { print $2 }')
282 # for interop with old server
283 [ -z "$repaired" ] &&
284 repaired=$($SHOW_NAMESPACE |
285 awk '/^updated_phase1/ { print $2 }')
287 [ $repaired -eq 1 ] ||
288 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
290 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Mount the client and stat the file while fail_loc 0x1505 is set.
293 mount_client $MOUNT || error "(6) Fail to start client!"
295 #define OBD_FAIL_FID_LOOKUP 0x1505
296 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
297 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
299 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
301 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
# Body of the test registered below with run_test 1c; its function header
# and some lines are not visible in this excerpt.
# Create one file under fail_loc 0x1504, run namespace LFSCK, and expect
# exactly one repaired dirent.
306 #define OBD_FAIL_FID_IGIF 0x1504
307 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
308 touch $DIR/$tdir/dummy
310 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
312 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
313 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
314 mdd.${MDT_DEV}.lfsck_namespace |
315 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
317 error "(4) unexpected status"
# Read dirent_repaired; fall back to updated_phase1 when it is absent.
320 local repaired=$($SHOW_NAMESPACE |
321 awk '/^dirent_repaired/ { print $2 }')
322 # for interop with old server
323 [ -z "$repaired" ] &&
324 repaired=$($SHOW_NAMESPACE |
325 awk '/^updated_phase1/ { print $2 }')
327 [ $repaired -eq 1 ] ||
328 error "(5) Fail to repair lost FID-in-dirent: $repaired"
# Mount the client and list the directory while fail_loc 0x1505 is set.
332 mount_client $MOUNT || error "(6) Fail to start client!"
334 #define OBD_FAIL_FID_LOOKUP 0x1505
335 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
336 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
338 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
340 run_test 1c "LFSCK can find out and repair lost FID-in-dirent"
# Body of the test registered below with run_test 2a; its function header
# and some lines are not visible in this excerpt.
# Create one file under fail_loc 0x1603, run namespace LFSCK, and expect
# exactly one linkEA repair plus a consistent path/FID round trip.
345 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
346 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
347 touch $DIR/$tdir/dummy
349 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
351 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
352 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
353 mdd.${MDT_DEV}.lfsck_namespace |
354 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
356 error "(4) unexpected status"
# Read linkea_repaired; fall back to updated_phase2 when it is absent.
359 local repaired=$($SHOW_NAMESPACE |
360 awk '/^linkea_repaired/ { print $2 }')
361 # for interop with old server
362 [ -z "$repaired" ] &&
363 repaired=$($SHOW_NAMESPACE |
364 awk '/^updated_phase2/ { print $2 }')
366 [ $repaired -eq 1 ] ||
367 error "(5) Fail to repair crashed linkEA: $repaired"
371 mount_client $MOUNT || error "(6) Fail to start client!"
# The file must report a link count of 1.
373 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
374 error "(7) Fail to stat $DIR/$tdir/dummy"
# Resolving the path to a FID and back must yield the same path.
376 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
377 local dummyname=$($LFS fid2path $DIR $dummyfid)
378 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
379 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
381 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
# Body of the test registered below with run_test 2b; its function header
# and some lines are not visible in this excerpt.
# Create one file under fail_loc 0x1604, run namespace LFSCK, and expect
# updated_phase2 to be 1 plus a consistent path/FID round trip.
387 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
388 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
389 touch $DIR/$tdir/dummy
391 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
393 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
394 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
395 mdd.${MDT_DEV}.lfsck_namespace |
396 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
398 error "(4) unexpected status"
401 local repaired=$($SHOW_NAMESPACE |
402 awk '/^updated_phase2/ { print $2 }')
403 [ $repaired -eq 1 ] ||
404 error "(5) Fail to repair crashed linkEA: $repaired"
408 mount_client $MOUNT || error "(6) Fail to start client!"
# The file must report a link count of 1.
410 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
411 error "(7) Fail to stat $DIR/$tdir/dummy"
# Resolving the path to a FID and back must yield the same path.
413 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
414 local dummyname=$($LFS fid2path $DIR $dummyfid)
415 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
416 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
418 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
# Body of the test registered below with run_test 2c; its function header
# and some lines are not visible in this excerpt.
# Create one file under fail_loc 0x1605, run namespace LFSCK, and expect
# updated_phase2 to be 1 plus a consistent path/FID round trip.
424 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
425 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
426 touch $DIR/$tdir/dummy
428 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
430 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
431 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
432 mdd.${MDT_DEV}.lfsck_namespace |
433 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
435 error "(4) unexpected status"
438 local repaired=$($SHOW_NAMESPACE |
439 awk '/^updated_phase2/ { print $2 }')
440 [ $repaired -eq 1 ] ||
441 error "(5) Fail to repair crashed linkEA: $repaired"
445 mount_client $MOUNT || error "(6) Fail to start client!"
# The file must report a link count of 1.
447 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
448 error "(7) Fail to stat $DIR/$tdir/dummy"
# Resolving the path to a FID and back must yield the same path.
450 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
451 local dummyname=$($LFS fid2path $DIR $dummyfid)
452 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
453 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
455 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
# Body of the test registered below with run_test 2d; its function header
# and some lines are not visible in this excerpt.
# Create one file under fail_loc 0x161d, run namespace LFSCK, and expect
# linkea_repaired to be 1 plus a consistent path/FID round trip.
461 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
462 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
463 touch $DIR/$tdir/dummy
465 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
467 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
468 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
469 mdd.${MDT_DEV}.lfsck_namespace |
470 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
472 error "(4) unexpected status"
475 local repaired=$($SHOW_NAMESPACE |
476 awk '/^linkea_repaired/ { print $2 }')
477 [ $repaired -eq 1 ] ||
478 error "(5) Fail to repair crashed linkEA: $repaired"
482 mount_client $MOUNT || error "(6) Fail to start client!"
# The file must report a link count of 1.
484 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
485 error "(7) Fail to stat $DIR/$tdir/dummy"
# Resolving the path to a FID and back must yield the same path.
487 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
488 local dummyname=$($LFS fid2path $DIR $dummyfid)
489 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
490 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
492 run_test 2d "LFSCK can recover the missing linkEA entry"
# Body of the test registered below with run_test 2e; its function header
# and some lines are not visible in this excerpt.
# Needs at least two MDTs: the parent directory is created with -i 1 and
# the child with -i 0 while fail_loc 0x1603 is set.
496 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
500 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
502 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
503 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
504 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
505 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Start with -r -A (presumably all targets - confirm) and require every
# MDT to report completed; 4 is the error tag passed to the helper.
507 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
509 wait_all_targets_blocked namespace completed 4
511 local repaired=$($SHOW_NAMESPACE |
512 awk '/^linkea_repaired/ { print $2 }')
513 [ $repaired -eq 1 ] ||
514 error "(5) Fail to repair crashed linkEA: $repaired"
# Resolving the child path to a FID and back must yield the same path.
516 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
517 local name=$($LFS fid2path $DIR $fid)
518 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
519 error "(6) Fail to repair linkEA: $fid $name"
521 run_test 2e "namespace LFSCK can verify remote object linkEA"
# Body of the test registered below with run_test 3; its function header
# and some lines are not visible in this excerpt.
# Build hard links, two of them under linkEA fail_locs, then check the
# phase 2 counters of a namespace LFSCK run.
# Hard links created without any fail_loc.
527 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
528 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
529 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
531 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
532 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
533 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
# One hard link is created under each of the two fail_locs below.
535 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
536 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
537 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
539 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
540 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
541 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
543 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
545 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
546 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
547 mdd.${MDT_DEV}.lfsck_namespace |
548 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
550 error "(10) unexpected status"
# At least 4 objects checked in phase 2 and at least 2 repaired.
553 local checked=$($SHOW_NAMESPACE |
554 awk '/^checked_phase2/ { print $2 }')
555 [ $checked -ge 4 ] ||
556 error "(11) Fail to check multiple-linked object: $checked"
558 local repaired=$($SHOW_NAMESPACE |
559 awk '/^multiple_linked_repaired/ { print $2 }')
560 [ $repaired -ge 2 ] ||
561 error "(12) Fail to repair multiple-linked object: $repaired"
563 run_test 3 "LFSCK can verify multiple-linked objects"
# Body of the test registered below with run_test 4; its function header
# and some lines are not visible in this excerpt.
# Runs only on an ldiskfs MDS. Backup/restore the MDT, start it with
# noscrub, then let namespace LFSCK repair the dirents.
567 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
568 skip "OI Scrub not implemented for ZFS" && return
571 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
572 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
574 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
575 echo "start $SINGLEMDS with disabling OI scrub"
576 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
577 error "(2) Fail to start MDS!"
# Under this fail_loc, wait until the flags field reads inconsistent and
# confirm the scan is in phase 1.
579 #define OBD_FAIL_LFSCK_DELAY2 0x1601
580 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
581 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
582 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
583 mdd.${MDT_DEV}.lfsck_namespace |
584 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
586 error "(5) unexpected status"
589 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
590 [ "$STATUS" == "scanning-phase1" ] ||
591 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the fail_loc, wait for completion, then the flags must be empty.
593 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
594 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
595 mdd.${MDT_DEV}.lfsck_namespace |
596 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
598 error "(7) unexpected status"
# NOTE(review): FLAGS is assigned without local in the visible lines.
601 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
602 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
# Read dirent_repaired (fallback updated_phase1); at least 9 are expected.
604 local repaired=$($SHOW_NAMESPACE |
605 awk '/^dirent_repaired/ { print $2 }')
606 # for interop with old server
607 [ -z "$repaired" ] &&
608 repaired=$($SHOW_NAMESPACE |
609 awk '/^updated_phase1/ { print $2 }')
611 [ $repaired -ge 9 ] ||
612 error "(9) Fail to re-generate FID-in-dirent: $repaired"
# Mount the client and list the directory while fail_loc 0x1505 is set.
616 mount_client $MOUNT || error "(10) Fail to start client!"
618 #define OBD_FAIL_FID_LOOKUP 0x1505
619 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
620 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
621 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
623 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
# Body of the test registered below with run_test 5; its function header
# and some lines are not visible in this excerpt.
# Runs only on an ldiskfs MDS. Same shape as test 4, but
# mds_backup_restore gets an extra argument 1 and the expected flags are
# inconsistent,upgrade.
627 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
628 skip "OI Scrub not implemented for ZFS" && return
631 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
632 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
634 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
635 echo "start $SINGLEMDS with disabling OI scrub"
636 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
637 error "(2) Fail to start MDS!"
# Under this fail_loc, wait for the expected flags and confirm phase 1.
639 #define OBD_FAIL_LFSCK_DELAY2 0x1601
640 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
641 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
642 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
643 mdd.${MDT_DEV}.lfsck_namespace |
644 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
646 error "(5) unexpected status"
649 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
650 [ "$STATUS" == "scanning-phase1" ] ||
651 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the fail_loc, wait for completion, then the flags must be empty.
653 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
654 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
655 mdd.${MDT_DEV}.lfsck_namespace |
656 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
658 error "(7) unexpected status"
# NOTE(review): FLAGS is assigned without local in the visible lines.
661 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
662 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
# Read dirent_repaired (fallback updated_phase1); at least 2 are expected.
664 local repaired=$($SHOW_NAMESPACE |
665 awk '/^dirent_repaired/ { print $2 }')
666 # for interop with old server
667 [ -z "$repaired" ] &&
668 repaired=$($SHOW_NAMESPACE |
669 awk '/^updated_phase1/ { print $2 }')
671 [ $repaired -ge 2 ] ||
672 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
# Mount the client; stat and ls must work while fail_loc 0x1505 is set.
676 mount_client $MOUNT || error "(10) Fail to start client!"
678 #define OBD_FAIL_FID_LOOKUP 0x1505
679 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
680 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
682 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
# Resolving the path to a FID and back must yield the same path.
684 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
685 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
686 local dummyname=$($LFS fid2path $DIR $dummyfid)
687 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
688 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
690 run_test 5 "LFSCK can handle IGIF object upgrading"
# Body of the test registered below with run_test 6a; its function header
# and some lines (including the sleep and the tails of the POS0/POS1
# pipelines) are not visible in this excerpt.
# Flow: start a delayed scan, force it to fail, restart it and check
# that the new start position lies beyond the last checkpoint.
695 #define OBD_FAIL_LFSCK_DELAY1 0x1600
696 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
697 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
699 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
700 [ "$STATUS" == "scanning-phase1" ] ||
701 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
703 # Sleep 3 sec to guarantee at least one object processed by LFSCK
705 # Fail the LFSCK to guarantee there is at least one checkpoint
# NOTE(review): the value set is 0x1608 with 0x80000000 added; the high
# bit is presumably a one-shot flag - confirm against the fail_loc docs.
706 #define OBD_FAIL_LFSCK_FATAL1 0x1608
707 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
708 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
709 mdd.${MDT_DEV}.lfsck_namespace |
710 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
712 error "(4) unexpected status"
# Second field of the last_checkpoint_position line.
715 local POS0=$($SHOW_NAMESPACE |
716 awk '/^last_checkpoint_position/ { print $2 }' |
719 #define OBD_FAIL_LFSCK_DELAY1 0x1600
720 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
721 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
723 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
724 [ "$STATUS" == "scanning-phase1" ] ||
725 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Second field of the latest_start_position line; must exceed POS0.
727 local POS1=$($SHOW_NAMESPACE |
728 awk '/^latest_start_position/ { print $2 }' |
730 [[ $POS0 -lt $POS1 ]] ||
731 error "(7) Expect larger than: $POS0, but got $POS1"
# Clear the fail_loc and wait for the scan to complete.
733 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
734 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
735 mdd.${MDT_DEV}.lfsck_namespace |
736 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
738 error "(8) unexpected status"
741 run_test 6a "LFSCK resumes from last checkpoint (1)"
# Body of the test registered below with run_test 6b; its function header
# and some lines (including the sleep, the tails of the O_POS pipelines
# and the else/fi keywords) are not visible in this excerpt.
# Same flow as test 6a, using fail_locs 0x1601 and 0x1609 and comparing
# two position fields instead of one.
746 #define OBD_FAIL_LFSCK_DELAY2 0x1601
747 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
748 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
750 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
751 [ "$STATUS" == "scanning-phase1" ] ||
752 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
754 # Sleep 5 sec to guarantee that we are in the directory scanning
756 # Fail the LFSCK to guarantee there is at least one checkpoint
# NOTE(review): the value set is 0x1609 with 0x80000000 added; the high
# bit is presumably a one-shot flag - confirm against the fail_loc docs.
757 #define OBD_FAIL_LFSCK_FATAL2 0x1609
758 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
759 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
760 mdd.${MDT_DEV}.lfsck_namespace |
761 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
763 error "(4) unexpected status"
# O_POS0 and D_POS0 are the 2nd and 4th fields of the
# last_checkpoint_position line.
766 local O_POS0=$($SHOW_NAMESPACE |
767 awk '/^last_checkpoint_position/ { print $2 }' |
770 local D_POS0=$($SHOW_NAMESPACE |
771 awk '/^last_checkpoint_position/ { print $4 }')
773 #define OBD_FAIL_LFSCK_DELAY2 0x1601
774 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
775 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
777 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
778 [ "$STATUS" == "scanning-phase1" ] ||
779 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# O_POS1 and D_POS1 are the 2nd and 4th fields of the
# latest_start_position line.
781 local O_POS1=$($SHOW_NAMESPACE |
782 awk '/^latest_start_position/ { print $2 }' |
784 local D_POS1=$($SHOW_NAMESPACE |
785 awk '/^latest_start_position/ { print $4 }')
# When either D_POS value is N/A compare the O_POS values; the D_POS
# comparison presumably sits in an else branch that is not visible.
787 if [ "$D_POS0" == "N/A" -o "$D_POS1" == "N/A" ]; then
788 [[ $O_POS0 -lt $O_POS1 ]] ||
789 error "(7.1) $O_POS1 is not larger than $O_POS0"
791 [[ $D_POS0 -lt $D_POS1 ]] ||
792 error "(7.2) $D_POS1 is not larger than $D_POS0"
# Clear the fail_loc and wait for the scan to complete.
795 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
796 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
797 mdd.${MDT_DEV}.lfsck_namespace |
798 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
800 error "(8) unexpected status"
803 run_test 6b "LFSCK resumes from last checkpoint (2)"
# Body of the test registered below with run_test 7a; its function header
# and some lines (including the sleep) are not visible in this excerpt.
# Flow: start a delayed scan, restart the MDS while it is in phase 1,
# and expect the scan to reach completed without another lfsck_start.
810 #define OBD_FAIL_LFSCK_DELAY2 0x1601
811 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
812 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
814 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
815 [ "$STATUS" == "scanning-phase1" ] ||
816 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
818 # Sleep 3 sec to guarantee at least one object processed by LFSCK
820 echo "stop $SINGLEMDS"
821 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop MDS!"
# The fail_loc is cleared while the MDS is stopped, before the restart.
823 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
824 echo "start $SINGLEMDS"
825 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
826 error "(5) Fail to start MDS!"
828 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
829 mdd.${MDT_DEV}.lfsck_namespace |
830 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
832 error "(6) unexpected status"
835 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
# Body of the test registered below with run_test 7b; its function header
# and some lines (including the done keyword) are not visible here.
# Flow: create 20 files under fail_loc 0x1604, hold the scan in phase 2,
# restart the MDS, and expect the scan to reach completed.
841 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
842 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
843 for ((i = 0; i < 20; i++)); do
844 touch $DIR/$tdir/dummy${i}
# Under this fail_loc, wait until the status reads scanning-phase2.
847 #define OBD_FAIL_LFSCK_DELAY3 0x1602
848 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
849 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
850 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
851 mdd.${MDT_DEV}.lfsck_namespace |
852 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
854 error "(4) unexpected status"
858 echo "stop $SINGLEMDS"
859 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop MDS!"
# The fail_loc is cleared while the MDS is stopped, before the restart.
861 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
862 echo "start $SINGLEMDS"
863 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
864 error "(6) Fail to start MDS!"
866 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
867 mdd.${MDT_DEV}.lfsck_namespace |
868 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
870 error "(7) unexpected status"
873 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
# Body of the test registered below with run_test 8; its function header
# and a number of lines (setup after formatall, loop bodies, done
# keywords, the timer initialisation) are not visible in this excerpt.
# It walks the namespace LFSCK status through init, scanning-phase1,
# stopped, failed, crashed, paused, scanning-phase2 and completed.
878 formatall > /dev/null
# Right after a reformat the status is expected to be init.
884 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
885 [ "$STATUS" == "init" ] ||
886 error "(2) Expect 'init', but got '$STATUS'"
# Create one directory and five files under linkEA fail_locs.
888 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
889 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
890 mkdir $DIR/$tdir/crashed
892 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
893 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
894 for ((i = 0; i < 5; i++)); do
895 touch $DIR/$tdir/dummy${i}
898 umount_client $MOUNT || error "(3) Fail to stop client!"
# Start, stop and start again: scanning-phase1, stopped, scanning-phase1.
900 #define OBD_FAIL_LFSCK_DELAY2 0x1601
901 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
902 $START_NAMESPACE || error "(4) Fail to start LFSCK for namespace!"
904 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
905 [ "$STATUS" == "scanning-phase1" ] ||
906 error "(5) Expect 'scanning-phase1', but got '$STATUS'"
908 $STOP_LFSCK || error "(6) Fail to stop LFSCK!"
910 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
911 [ "$STATUS" == "stopped" ] ||
912 error "(7) Expect 'stopped', but got '$STATUS'"
914 $START_NAMESPACE || error "(8) Fail to start LFSCK for namespace!"
916 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
917 [ "$STATUS" == "scanning-phase1" ] ||
918 error "(9) Expect 'scanning-phase1', but got '$STATUS'"
# Force the running scan into the failed status.
920 #define OBD_FAIL_LFSCK_FATAL2 0x1609
921 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
922 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
923 mdd.${MDT_DEV}.lfsck_namespace |
924 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
926 error "(10) unexpected status"
# Restart from failed; the scan must be in phase 1 again.
929 #define OBD_FAIL_LFSCK_DELAY1 0x1600
930 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
931 $START_NAMESPACE || error "(11) Fail to start LFSCK for namespace!"
933 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
934 [ "$STATUS" == "scanning-phase1" ] ||
935 error "(12) Expect 'scanning-phase1', but got '$STATUS'"
# Stop the MDS with fail_loc 0x160a set, then start it with 0x160b set;
# the status is expected to be crashed afterwards.
937 #define OBD_FAIL_LFSCK_CRASH 0x160a
938 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
941 echo "stop $SINGLEMDS"
942 stop $SINGLEMDS > /dev/null || error "(13) Fail to stop MDS!"
944 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
945 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
947 echo "start $SINGLEMDS"
948 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
949 error "(14) Fail to start MDS!"
# Poll recovery_status until it is no longer RECOVERING, bounded by
# max_recovery_time; the timer setup and increment are not visible here.
951 local timeout=$(max_recovery_time)
954 while [ $timer -lt $timeout ]; do
955 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
956 mdt.${MDT_DEV}.recovery_status |
957 awk '/^status/ { print \\\$2 }'")
958 [ "$STATUS" != "RECOVERING" ] && break;
963 [ $timer != $timeout ] ||
964 error "(14.1) recovery timeout"
966 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
967 [ "$STATUS" == "crashed" ] ||
968 error "(15) Expect 'crashed', but got '$STATUS'"
# Restart from crashed, then restart the MDS with 0x160b set; the status
# is expected to be paused afterwards.
970 #define OBD_FAIL_LFSCK_DELAY2 0x1601
971 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
972 $START_NAMESPACE || error "(16) Fail to start LFSCK for namespace!"
974 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
975 [ "$STATUS" == "scanning-phase1" ] ||
976 error "(17) Expect 'scanning-phase1', but got '$STATUS'"
978 echo "stop $SINGLEMDS"
979 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop MDS!"
981 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
982 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
984 echo "start $SINGLEMDS"
985 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
986 error "(19) Fail to start MDS!"
# Same recovery polling as above.
989 while [ $timer -lt $timeout ]; do
990 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
991 mdt.${MDT_DEV}.recovery_status |
992 awk '/^status/ { print \\\$2 }'")
993 [ "$STATUS" != "RECOVERING" ] && break;
998 [ $timer != $timeout ] ||
999 error "(19.1) recovery timeout"
1001 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1002 [ "$STATUS" == "paused" ] ||
1003 error "(20) Expect 'paused', but got '$STATUS'"
# Restart the MDS with the skip_lfsck mount option; the status must
# remain paused.
1005 echo "stop $SINGLEMDS"
1006 stop $SINGLEMDS > /dev/null || error "(20.1) Fail to stop MDS!"
1008 echo "start $SINGLEMDS without resume LFSCK"
1009 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SKIP_LFSCK > /dev/null ||
1010 error "(20.2) Fail to start MDS!"
# Same recovery polling as above; here the timer increment is visible.
1013 while [ $timer -lt $timeout ]; do
1014 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1015 mdt.${MDT_DEV}.recovery_status |
1016 awk '/^status/ { print \\\$2 }'")
1017 [ "$STATUS" != "RECOVERING" ] && break;
1019 timer=$((timer + 1))
1022 [ $timer != $timeout ] ||
1023 error "(20.3) recovery timeout"
1025 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1026 [ "$STATUS" == "paused" ] ||
1027 error "(20.4) Expect 'paused', but got '$STATUS'"
# Restart from paused under fail_loc 0x1602, wait for phase 2 and check
# the flags seen there.
1029 #define OBD_FAIL_LFSCK_DELAY3 0x1602
1030 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
1032 $START_NAMESPACE || error "(21) Fail to start LFSCK for namespace!"
1033 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1034 mdd.${MDT_DEV}.lfsck_namespace |
1035 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
1037 error "(22) unexpected status"
1040 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1041 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
1042 error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
# Clear the fail_loc, wait for completion, then the flags must be empty.
1044 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1045 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1046 mdd.${MDT_DEV}.lfsck_namespace |
1047 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1049 error "(24) unexpected status"
1052 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1053 [ -z "$FLAGS" ] || error "(25) Expect empty flags, but got '$FLAGS'"
1055 run_test 8 "LFSCK state machine"
# test_9a: checks that the layout LFSCK's reported average phase-1 speed
# tracks the configured speed limit, first with BASE_SPEED1 and then after
# the limit is changed to BASE_SPEED2.
# The test is skipped when /proc/cpuinfo has no line matching
# "processor.*: 1" (reported below as a UP system).
1058 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1059 skip "Testing on UP system, the speed may be inaccurate."
# Create a directory on MDT index 0 and fill it with 5000 files to scan.
1063 check_mount_and_prep
1064 $LFS mkdir -i 0 $DIR/$tdir/lfsck || error "(1) Fail to mkdir lfsck"
1065 $LFS setstripe -c 1 -i -1 $DIR/$tdir/lfsck
1066 createmany -o $DIR/$tdir/lfsck/f 5000
# Start the layout LFSCK, passing BASE_SPEED1 through the -s option.
1068 local BASE_SPEED1=100
1070 $START_LAYOUT -r -s $BASE_SPEED1 || error "(2) Fail to start LFSCK!"
# The scan must still be in phase 1 when the speed is sampled.
1073 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
1074 [ "$STATUS" == "scanning-phase1" ] ||
1075 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
# Sample the second field of the "average_speed_phase1" line.
1077 local SPEED=$($SHOW_LAYOUT |
1078 awk '/^average_speed_phase1/ { print $2 }')
# Upper bound for the first sample, in shell integer arithmetic:
#   BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / RUN_TIME1 * 13 / 10
1080 # There may be time error, normally it should be less than 2 seconds.
1081 # We allow another 20% schedule error.
1083 # MAX_MARGIN = 1.3 = 13 / 10
1084 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1085 RUN_TIME1 * 13 / 10))
1086 [ $SPEED -lt $MAX_SPEED ] || {
1088 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1089 error "(4) Speed $SPEED, expected < $MAX_SPEED"
# Change the limit on the running scan by writing BASE_SPEED2 to the
# MDT's lfsck_speed_limit parameter.
1092 # adjust speed limit
1093 local BASE_SPEED2=300
1095 do_facet $SINGLEMDS \
1096 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
# Re-sample the average speed. It must lie strictly between MIN_SPEED and
# MAX_SPEED, both derived from the two limits weighted by RUN_TIME1 and
# RUN_TIME2 (with TIME_DIFF subtracted resp. added).
1099 SPEED=$($SHOW_LAYOUT | awk '/^average_speed_phase1/ { print $2 }')
1100 # MIN_MARGIN = 0.7 = 7 / 10
1101 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1102 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1103 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
# When the MDS backend is not ldiskfs, a too-low speed is reported through
# error_ignore with ticket LU-5624.
1104 [ $SPEED -gt $MIN_SPEED ] || {
1105 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1106 error_ignore LU-5624 \
1107 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
1110 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
1114 # MAX_MARGIN = 1.3 = 13 / 10
1115 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1116 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1117 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1118 [ $SPEED -lt $MAX_SPEED ] || {
1120 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1121 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1122 error "(6) Speed $SPEED, expected < $MAX_SPEED"
# Write 0 to lfsck_speed_limit on every MDT and OST node (presumably this
# removes the limit - TODO confirm), then wait for status "completed".
1125 do_nodes $(comma_list $(mdts_nodes)) \
1126 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1127 do_nodes $(comma_list $(osts_nodes)) \
1128 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
# NOTE(review): the trailing 30 is presumably the wait limit in seconds.
1130 wait_update_facet $SINGLEMDS \
1131 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout |
1132 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
1133 error "(7) Failed to get expected 'completed'"
1135 run_test 9a "LFSCK speed control (1)"
# test_9b: same speed-limit check as test 9a, but for the namespace LFSCK
# and its phase-2 average speed ("average_speed_phase2").
# The test is skipped when /proc/cpuinfo has no line matching
# "processor.*: 1" (reported below as a UP system).
1138 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1139 skip "Testing on UP system, the speed may be inaccurate."
# Create 50 directories, 50 files, and 50 files inside each directory
# while fail_loc 0x1604 (OBD_FAIL_LFSCK_LINKEA_MORE) is set on the MDS.
1145 echo "Preparing another 50 * 50 files (with error) at $(date)."
1146 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1147 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1148 createmany -d $DIR/$tdir/d 50
1149 createmany -m $DIR/$tdir/f 50
1150 for ((i = 0; i < 50; i++)); do
1151 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
# With fail_loc 0x160c (OBD_FAIL_LFSCK_NO_DOUBLESCAN) set, run a first
# namespace LFSCK and wait until its status reads "stopped".
1154 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
1155 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1156 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
1157 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1158 mdd.${MDT_DEV}.lfsck_namespace |
1159 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1161 error "(5) unexpected status"
# Clear the failure injection before the measured run.
1164 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1165 echo "Prepared at $(date)."
# Start the namespace LFSCK again (no -r this time), passing BASE_SPEED1
# through the -s option.
1167 local BASE_SPEED1=50
1169 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
# The scan is expected to be in phase 2 when the speed is sampled.
1172 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1173 [ "$STATUS" == "scanning-phase2" ] ||
1174 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
1176 local SPEED=$($SHOW_NAMESPACE |
1177 awk '/^average_speed_phase2/ { print $2 }')
# Upper bound for the first sample, in shell integer arithmetic:
#   BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / RUN_TIME1 * 13 / 10
1178 # There may be time error, normally it should be less than 2 seconds.
1179 # We allow another 20% schedule error.
1181 # MAX_MARGIN = 1.3 = 13 / 10
1182 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1183 RUN_TIME1 * 13 / 10))
1184 [ $SPEED -lt $MAX_SPEED ] || {
1186 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1187 error "(8) Speed $SPEED, expected < $MAX_SPEED"
# Change the limit on the running scan to BASE_SPEED2.
1190 # adjust speed limit
1191 local BASE_SPEED2=150
1193 do_facet $SINGLEMDS \
1194 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
# Re-sample the average speed. It must lie strictly between MIN_SPEED and
# MAX_SPEED, both derived from the two limits weighted by RUN_TIME1 and
# RUN_TIME2.
1197 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
1198 # MIN_MARGIN = 0.7 = 7 / 10
1199 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1200 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1201 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
# When the MDS backend is not ldiskfs, a too-low speed is reported through
# error_ignore with ticket LU-5624.
1202 [ $SPEED -gt $MIN_SPEED ] || {
1203 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1204 error_ignore LU-5624 \
1205 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1208 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
1212 # MAX_MARGIN = 1.3 = 13 / 10
1213 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1214 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1215 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1216 [ $SPEED -lt $MAX_SPEED ] || {
1218 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1219 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1220 error "(10) Speed $SPEED, expected < $MAX_SPEED"
# Write 0 to lfsck_speed_limit on every MDT and OST node (presumably this
# removes the limit - TODO confirm), then wait for status "completed".
1223 do_nodes $(comma_list $(mdts_nodes)) \
1224 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1225 do_nodes $(comma_list $(osts_nodes)) \
1226 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1227 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1228 mdd.${MDT_DEV}.lfsck_namespace |
1229 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1231 error "(11) unexpected status"
1234 run_test 9b "LFSCK speed control (2)"
# test_10: runs ordinary file operations from the client while a
# speed-limited namespace LFSCK is in phase 1, and checks that they all
# succeed and the scan afterwards reaches "completed".
# Only runs when the MDS backend is ldiskfs.
1238 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
1239 skip "lookup(..)/linkea on ZFS issue" && return
# Even-numbered d<i>/f<i> entries (0, 2, ..., 998) are created while
# fail_loc 0x1603 (OBD_FAIL_LFSCK_LINKEA_CRASH) is set on the MDS.
1243 echo "Preparing more files with error at $(date)."
1244 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1245 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
1247 for ((i = 0; i < 1000; i = $((i+2)))); do
1248 mkdir -p $DIR/$tdir/d${i}
1249 touch $DIR/$tdir/f${i}
1250 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
# Odd-numbered entries (1, 3, ..., 999) are created while fail_loc 0x1604
# (OBD_FAIL_LFSCK_LINKEA_MORE) is set instead.
1253 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1254 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1256 for ((i = 1; i < 1000; i = $((i+2)))); do
1257 mkdir -p $DIR/$tdir/d${i}
1258 touch $DIR/$tdir/f${i}
1259 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
# Clear the failure injection, add one hard link, and remount the client.
1262 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1263 echo "Prepared at $(date)."
1265 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
1267 umount_client $MOUNT
1268 mount_client $MOUNT || error "(3) Fail to start client!"
# Start the namespace LFSCK with -s 100 and confirm it is in phase 1.
1270 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1273 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1274 [ "$STATUS" == "scanning-phase1" ] ||
1275 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Client operations performed during the scan: recursive listing, create,
# mkdir, unlink, recursive remove, rename, hard link and symbolic link.
1277 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1279 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1281 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1283 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1285 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1287 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1289 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1291 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1292 error "(14) Fail to softlink!"
# The scan must still be in phase 1 after those operations.
1294 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1295 [ "$STATUS" == "scanning-phase1" ] ||
1296 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
# Write 0 to lfsck_speed_limit on every MDT and OST node, then wait for
# the namespace LFSCK status to read "completed".
1298 do_nodes $(comma_list $(mdts_nodes)) \
1299 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1300 do_nodes $(comma_list $(osts_nodes)) \
1301 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1302 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1303 mdd.${MDT_DEV}.lfsck_namespace |
1304 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1306 error "(16) unexpected status"
1309 run_test 10 "System is available during LFSCK scanning"
# ost_remove_lastid: mounts the backing filesystem of ost${ost}, removes
# O/${idx}/LAST_ID and O/${idx}/d0/0 under its mount point, and unmounts it.
# NOTE(review): ${ost} and ${idx} are presumably taken from the first and
# second arguments (the caller passes "1 0") - confirm against the full body.
# Returns 1 if mount_fstype fails and 2 if unmount_fstype fails.
1312 ost_remove_lastid() {
# Commands built on ${rcmd} are run through do_facet on ost${ost}.
1315 local rcmd="do_facet ost${ost}"
1317 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1319 # step 1: local mount
1320 mount_fstype ost${ost} || return 1
1321 # step 2: remove the specified LAST_ID
# rm -f: the status of the removal itself is not checked here.
1322 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
1324 unmount_fstype ost${ost} || return 2
# test_11a: removes LAST_ID on ost1 and checks that a layout LFSCK started
# on the OST first reports the "crashed_lastid" flag and then completes
# with no flags left.
# Create 64 single-stripe files on OST index 0.
1328 check_mount_and_prep
1329 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1330 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
# Remove LAST_ID (arguments "1 0"), then start ost1 again using
# $MOUNT_OPTS_NOSCRUB.
1335 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
1337 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1338 error "(2) Fail to start ost1"
# Inject OBD_FAIL_LFSCK_DELAY4 with fail_val=3 on ost1 before starting LFSCK.
1340 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1341 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1343 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1344 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
# While the injection is active the "flags" line must read "crashed_lastid".
1346 wait_update_facet ost1 "$LCTL get_param -n \
1347 obdfilter.${OST_DEV}.lfsck_layout |
1348 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1350 error "(5) unexpected status"
# Clear the injection and wait for the scan to report "completed".
1353 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1355 wait_update_facet ost1 "$LCTL get_param -n \
1356 obdfilter.${OST_DEV}.lfsck_layout |
1357 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1359 error "(6) unexpected status"
# After completion the "flags" field must be empty.
1362 echo "the LAST_ID(s) should have been rebuilt"
1363 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1364 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
1366 run_test 11a "LFSCK can rebuild lost last_id"
# test_11b: creates objects on ost1 while OBD_FAIL_LFSCK_SKIP_LASTID is
# set, so the last_id seen after a restart is smaller than the one seen
# before. It then checks that a layout LFSCK on the OST brings the value
# back to the earlier one.
1369 check_mount_and_prep
1370 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1372 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1373 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1374 do_facet ost1 $LCTL set_param fail_loc=0x160d
# Create 32 more files than the value returned by precreated_ost_obj_count.
1376 local count=$(precreated_ost_obj_count 0 0)
1378 createmany -o $DIR/$tdir/f $((count + 32))
# Record the sequence from the MDT's osp prealloc_last_seq and, from
# ost1's last_id output, the ':'-separated second field of the line
# matching that sequence.
1380 local proc_path="${FSNAME}-OST0000-osc-MDT0000"
1381 local seq=$(do_facet mds1 $LCTL get_param -n \
1382 osp.${proc_path}.prealloc_last_seq)
1383 local lastid1=$(do_facet ost1 "lctl get_param -n \
1384 obdfilter.${ost1_svc}.last_id" | grep $seq |
1385 awk -F: '{ print $2 }')
# Restart ost1 with fail_loc 0x215 (OBD_FAIL_OST_ENOSPC) set.
1387 umount_client $MOUNT
1388 stop ost1 || error "(1) Fail to stop ost1"
1390 #define OBD_FAIL_OST_ENOSPC 0x215
1391 do_facet ost1 $LCTL set_param fail_loc=0x215
1393 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1394 error "(2) Fail to start ost1"
# Poll up to 60 iterations until last_id yields a non-empty value for
# the recorded sequence.
1396 for ((i = 0; i < 60; i++)); do
1397 lastid2=$(do_facet ost1 "lctl get_param -n \
1398 obdfilter.${ost1_svc}.last_id" | grep $seq |
1399 awk -F: '{ print $2 }')
1400 [ ! -z $lastid2 ] && break;
1404 echo "the on-disk LAST_ID should be smaller than the expected one"
1405 [ $lastid1 -gt $lastid2 ] ||
1406 error "(4) expect lastid1 [ $lastid1 ] > lastid2 [ $lastid2 ]"
# Run the layout LFSCK on the OST and wait for status "completed".
1408 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1409 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1411 wait_update_facet ost1 "$LCTL get_param -n \
1412 obdfilter.${OST_DEV}.lfsck_layout |
1413 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1415 error "(6) unexpected status"
# Restart ost1 once more. last_id for the sequence must now equal lastid1.
1418 stop ost1 || error "(7) Fail to stop ost1"
1420 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1421 error "(8) Fail to start ost1"
1423 echo "the on-disk LAST_ID should have been rebuilt"
1424 wait_update_facet ost1 "$LCTL get_param -n \
1425 obdfilter.${ost1_svc}.last_id | grep $seq |
1426 awk -F: '{ print \\\$2 }'" "$lastid1" 60 || {
1427 do_facet ost1 $LCTL get_param -n \
1428 obdfilter.${ost1_svc}.last_id
1429 error "(9) expect lastid1 $seq:$lastid1"
# Clear the failure injection and stop all services.
1432 do_facet ost1 $LCTL set_param fail_loc=0
1433 stopall || error "(10) Fail to stopall"
1435 run_test 11b "LFSCK can rebuild crashed last_id"
# test_12a: drives namespace and layout LFSCK on all targets through single
# lctl commands issued on mds1 (lfsck_start/lfsck_stop with -A) and checks
# the status reached after each step.
# Requires at least two MDTs.
1438 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# For each k in 1..MDSCOUNT create a directory on MDT index k-1 holding
# 100 files.
1440 check_mount_and_prep
1441 for k in $(seq $MDSCOUNT); do
1442 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1443 createmany -o $DIR/$tdir/${k}/f 100 ||
1444 error "(0) Fail to create 100 files."
# Namespace LFSCK: start with -s 1, stop, then restart with -s 0.
# NOTE(review): the trailing number handed to wait_all_targets and
# wait_all_targets_blocked follows the "(N)" step numbering used in the
# error messages; presumably it labels the failure - confirm in the helpers.
1447 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1448 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1449 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1451 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1452 wait_all_targets namespace scanning-phase1 3
1454 echo "Stop namespace LFSCK on all targets by single lctl command."
1455 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1456 error "(4) Fail to stop LFSCK on all devices!"
1458 echo "All the LFSCK targets should be in 'stopped' status."
1459 wait_all_targets_blocked namespace stopped 5
1461 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1462 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1463 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1465 echo "All the LFSCK targets should be in 'completed' status."
1466 wait_all_targets_blocked namespace completed 7
# Layout LFSCK: same start/stop/restart sequence, run between
# start_full_debug_logging and stop_full_debug_logging.
1468 start_full_debug_logging
1470 echo "Start layout LFSCK on all targets by single command (-s 1)."
1471 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1472 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1474 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1475 wait_all_targets layout scanning-phase1 9
1477 echo "Stop layout LFSCK on all targets by single lctl command."
1478 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1479 error "(10) Fail to stop LFSCK on all devices!"
1481 echo "All the LFSCK targets should be in 'stopped' status."
1482 wait_all_targets_blocked layout stopped 11
# Additionally check each OST's own lfsck_layout status directly.
1484 for k in $(seq $OSTCOUNT); do
1485 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1486 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1487 awk '/^status/ { print $2 }')
1488 [ "$STATUS" == "stopped" ] ||
1489 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1492 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1493 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1494 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1496 echo "All the LFSCK targets should be in 'completed' status."
1497 wait_all_targets_blocked layout completed 14
1499 stop_full_debug_logging
1501 run_test 12a "single command to trigger LFSCK on all devices"
# test_12b: starts LFSCK with "-A -r" but without "-M" and checks that both
# the namespace and layout scans reach "completed". If mds1 hosts more than
# one MDT device, it also checks that a start without -M and -A fails.
1504 check_mount_and_prep
1506 echo "Start LFSCK without '-M' specified."
1507 do_facet mds1 $LCTL lfsck_start -A -r ||
1508 error "(0) Fail to start LFSCK without '-M'"
1510 wait_all_targets_blocked namespace completed 1
1511 wait_all_targets_blocked layout completed 2
# Count the "lctl dl" lines on mds1 whose third column contains "mdt".
1513 local count=$(do_facet mds1 $LCTL dl |
1514 awk '{ print $3 }' | grep mdt | wc -l)
1515 if [ $count -gt 1 ]; then
1517 echo "Start layout LFSCK on the node with multipe targets,"
1518 echo "but not specify '-M'/'-A' option. Should get failure."
# A successful start is the error case here; "|| true" keeps the status of
# this statement zero when the start fails as expected.
1520 do_facet mds1 $LCTL lfsck_start -t layout -r &&
1521 error "(3) Start layout LFSCK should fail" || true
1524 run_test 12b "auto detect Lustre device"
# test_13: creates files while OBD_FAIL_LFSCK_BAD_LMMOI is set on the MDS,
# then checks that a layout LFSCK completes and reports repaired_others
# equal to 2.
1528 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1529 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1530 echo "MDT-object FID."
1533 check_mount_and_prep
# Two files are created under the injection: one through createmany and
# one composite-layout (PFL) file f1 through "lfs setstripe -E".
1535 echo "Inject failure stub to simulate bad lmm_oi"
1536 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1537 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
1538 createmany -o $DIR/$tdir/f 1
1539 $LFS setstripe -E 1M -S 1M -E -1 $DIR/$tdir/f1 ||
1540 error "(0) Fail to create PFL $DIR/$tdir/f1"
1541 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Run the layout LFSCK and wait for status "completed".
1543 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1544 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1546 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1547 mdd.${MDT_DEV}.lfsck_layout |
1548 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1550 error "(2) unexpected status"
# Exactly two repairs are expected in the "repaired_others" counter.
1553 local repaired=$($SHOW_LAYOUT |
1554 awk '/^repaired_others/ { print $2 }')
1555 [ $repaired -eq 2 ] ||
1556 error "(3) Fail to repair crashed lmm_oi: $repaired"
1558 run_test 13 "LFSCK can repair crashed lmm_oi"
# test_14a: creates files while OBD_FAIL_LFSCK_DANGLING is set on ost1.
# A first layout LFSCK (-r) must count the dangling references but leave
# them in place; a second one (-r -c) must make the guard files usable.
1562 echo "The OST-object referenced by the MDT-object should be there;"
1563 echo "otherwise, the LFSCK should re-create the missing OST-object."
1564 echo "without '--delay-create-ostobj' option."
1567 check_mount_and_prep
1568 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# Under the injection create count+16 plain files, guard0, 16 composite
# (PFL) files and guard1.
1570 echo "Inject failure stub to simulate dangling referenced MDT-object"
1571 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1572 do_facet ost1 $LCTL set_param fail_loc=0x1610
1573 local count=$(precreated_ost_obj_count 0 0)
1575 createmany -o $DIR/$tdir/f $((count + 16)) ||
1576 error "(0.1) Fail to create $DIR/$tdir/fx"
1577 touch $DIR/$tdir/guard0
1579 for ((i = 0; i < 16; i++)); do
1580 $LFS setstripe -E 512K -S 256K -o 0 -E 2M \
1581 $DIR/$tdir/f_comp${i} ||
1582 error "(0.2) Fail to create $DIR/$tdir/f_comp${i}"
1584 touch $DIR/$tdir/guard1
1586 do_facet ost1 $LCTL set_param fail_loc=0
1588 start_full_debug_logging
1590 # exhaust other pre-created dangling cases
1591 count=$(precreated_ost_obj_count 0 0)
1592 createmany -o $DIR/$tdir/a $count ||
1593 error "(0.5) Fail to create $count files."
# A successful listing here is treated as a test failure.
1595 echo "'ls' should fail because of dangling referenced MDT-object"
1596 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
# First pass: started with -r only.
1598 echo "Trigger layout LFSCK to find out dangling reference"
1599 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1601 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1602 mdd.${MDT_DEV}.lfsck_layout |
1603 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1605 error "(3) unexpected status"
# At least 32 entries are expected in the "repaired_dangling" counter.
1608 local repaired=$($SHOW_LAYOUT |
1609 awk '/^repaired_dangling/ { print $2 }')
1610 [ $repaired -ge 32 ] ||
1611 error "(4) Fail to repair dangling reference: $repaired"
# After the first pass both guard files must still fail stat.
1613 echo "'stat' should fail because of not repair dangling by default"
1614 stat $DIR/$tdir/guard0 > /dev/null 2>&1 &&
1615 error "(5.1) stat should fail"
1616 stat $DIR/$tdir/guard1 > /dev/null 2>&1 &&
1617 error "(5.2) stat should fail"
# Second pass: started with -r -c.
1619 echo "Trigger layout LFSCK to repair dangling reference"
1620 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1622 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1623 mdd.${MDT_DEV}.lfsck_layout |
1624 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1626 error "(7) unexpected status"
1629 # There may be some async LFSCK updates in processing, wait for
1630 # a while until the target reparation has been done. LU-4970.
# Wait until stat on each guard file reports "0" as its Size field.
1632 echo "'stat' should success after layout LFSCK repairing"
1633 wait_update_facet client "stat $DIR/$tdir/guard0 |
1634 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1635 stat $DIR/$tdir/guard0
1637 error "(8.1) unexpected size"
1640 wait_update_facet client "stat $DIR/$tdir/guard1 |
1641 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1642 stat $DIR/$tdir/guard1
1644 error "(8.2) unexpected size"
# The counter must again be at least 32 after the second pass.
1647 repaired=$($SHOW_LAYOUT |
1648 awk '/^repaired_dangling/ { print $2 }')
1649 [ $repaired -ge 32 ] ||
1650 error "(9) Fail to repair dangling reference: $repaired"
1652 stop_full_debug_logging
1654 echo "stopall to cleanup object cache"
# The output of setupall is discarded.
1657 setupall > /dev/null
1659 run_test 14a "LFSCK can repair MDT-object with dangling LOV EA reference (1)"
# test_14b: variant of test 14a. Both layout LFSCK runs additionally pass
# -o and -d, and completion is awaited with wait_all_targets_blocked.
# NOTE(review): -d presumably corresponds to the '--delay-create-ostobj'
# option named in the message below - confirm against lctl lfsck_start.
1663 echo "The OST-object referenced by the MDT-object should be there;"
1664 echo "otherwise, the LFSCK should re-create the missing OST-object."
1665 echo "with '--delay-create-ostobj' option."
1668 check_mount_and_prep
1669 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# Under the injection create count+31 files plus the guard file.
1671 echo "Inject failure stub to simulate dangling referenced MDT-object"
1672 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1673 do_facet ost1 $LCTL set_param fail_loc=0x1610
1674 local count=$(precreated_ost_obj_count 0 0)
1676 createmany -o $DIR/$tdir/f $((count + 31))
1677 touch $DIR/$tdir/guard
1678 do_facet ost1 $LCTL set_param fail_loc=0
1680 start_full_debug_logging
1682 # exhaust other pre-created dangling cases
1683 count=$(precreated_ost_obj_count 0 0)
1684 createmany -o $DIR/$tdir/a $count ||
1685 error "(0) Fail to create $count files."
# A successful listing here is treated as a test failure.
1687 echo "'ls' should fail because of dangling referenced MDT-object"
1688 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
# First pass: started with -r -o -d.
1690 echo "Trigger layout LFSCK to find out dangling reference"
1691 $START_LAYOUT -r -o -d || error "(2) Fail to start LFSCK for layout!"
1693 wait_all_targets_blocked layout completed 3
# At least 32 entries are expected in the "repaired_dangling" counter.
1695 local repaired=$($SHOW_LAYOUT |
1696 awk '/^repaired_dangling/ { print $2 }')
1697 [ $repaired -ge 32 ] ||
1698 error "(4) Fail to repair dangling reference: $repaired"
# After the first pass the guard file must still fail stat.
1700 echo "'stat' should fail because of not repair dangling by default"
1701 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
# Second pass: started with -r -o -c -d.
1703 echo "Trigger layout LFSCK to repair dangling reference"
1704 $START_LAYOUT -r -o -c -d || error "(6) Fail to start LFSCK for layout!"
1706 wait_all_targets_blocked layout completed 7
1708 # There may be some async LFSCK updates in processing, wait for
1709 # a while until the target reparation has been done. LU-4970.
# Wait until stat on the guard file reports "0" as its Size field.
1711 echo "'stat' should success after layout LFSCK repairing"
1712 wait_update_facet client "stat $DIR/$tdir/guard |
1713 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1714 stat $DIR/$tdir/guard
1716 error "(8) unexpected size"
# The counter must again be at least 32 after the second pass.
1719 repaired=$($SHOW_LAYOUT |
1720 awk '/^repaired_dangling/ { print $2 }')
1721 [ $repaired -ge 32 ] ||
1722 error "(9) Fail to repair dangling reference: $repaired"
1724 stop_full_debug_logging
1726 echo "stopall to cleanup object cache"
# The output of setupall is discarded.
1729 setupall > /dev/null
1731 run_test 14b "LFSCK can repair MDT-object with dangling LOV EA reference (2)"
# test_15a: writes files while OBD_FAIL_LFSCK_UNMATCHED_PAIR1 is set on all
# OST nodes, then checks that a layout LFSCK completes and reports at least
# 3 entries in repaired_unmatched_pair.
1735 echo "If the OST-object referenced by the MDT-object back points"
1736 echo "to some non-exist MDT-object, then the LFSCK should repair"
1737 echo "the OST-object to back point to the right MDT-object."
1740 check_mount_and_prep
1741 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1743 echo "Inject failure stub to make the OST-object to back point to"
1744 echo "non-exist MDT-object."
1745 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
# The injection is set on every OST node, not only ost1.
1747 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
1748 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1749 $LFS setstripe -E 1M -S 256K -c 1 -E -1 -S 512K -c $OSTCOUNT \
1751 error "(0) Fail to create PFL $DIR/$tdir/f1"
1752 # 'dd' will trigger punch RPC firstly on every OST-objects.
1753 # So even though some OST-object will not be write by 'dd',
1754 # as long as it is allocated (may be NOT allocated in pfl_3b)
1755 # its layout information will be set also.
1756 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4K count=257
1757 cancel_lru_locks osc
1758 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
# Run the layout LFSCK and wait for status "completed".
1760 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1761 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1763 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1764 mdd.${MDT_DEV}.lfsck_layout |
1765 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1767 error "(2) unexpected status"
# A lower bound (>= 3) is checked here, not an exact count.
1770 local repaired=$($SHOW_LAYOUT |
1771 awk '/^repaired_unmatched_pair/ { print $2 }')
1772 [ $repaired -ge 3 ] ||
1773 error "(3) Fail to repair unmatched pair: $repaired"
1775 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# test_15b: writes files while OBD_FAIL_LFSCK_UNMATCHED_PAIR2 is set on all
# OST nodes, then checks that a layout LFSCK completes and reports exactly
# 4 entries in repaired_unmatched_pair.
1779 echo "If the OST-object referenced by the MDT-object back points"
1780 echo "to other MDT-object that doesn't recognize the OST-object,"
1781 echo "then the LFSCK should repair it to back point to the right"
1782 echo "MDT-object (the first one)."
# The guard file is written before any failure injection is set.
1785 check_mount_and_prep
1786 mkdir -p $DIR/$tdir/0
1787 $LFS setstripe -c 1 -i 0 $DIR/$tdir/0
1788 dd if=/dev/zero of=$DIR/$tdir/0/guard bs=1M count=1
1789 cancel_lru_locks osc
1791 echo "Inject failure stub to make the OST-object to back point to"
1792 echo "other MDT-object"
# Use 2 stripes for the first PFL component when at least two OSTs exist.
1795 [ $OSTCOUNT -ge 2 ] && stripes=2
1797 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1798 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1612
1799 dd if=/dev/zero of=$DIR/$tdir/0/f0 bs=1M count=1
1800 $LFS setstripe -E 1M -S 256K -c $stripes -E 2M -S 512K -c 1 \
1802 error "(0) Fail to create PFL $DIR/$tdir/f1"
1803 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2
1804 cancel_lru_locks osc
1805 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
# Run the layout LFSCK and wait for status "completed".
1807 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1808 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1810 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1811 mdd.${MDT_DEV}.lfsck_layout |
1812 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1814 error "(2) unexpected status"
# An exact count (4) is required here.
1817 local repaired=$($SHOW_LAYOUT |
1818 awk '/^repaired_unmatched_pair/ { print $2 }')
1819 [ $repaired -eq 4 ] ||
1820 error "(3) Fail to repair unmatched pair: $repaired"
1822 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# test_15c: starts a delayed directory migration in the background and runs
# a layout LFSCK on all targets while it is in flight. It then checks the
# repaired_unmatched_pair and repaired_multiple_referenced counters.
# Requires at least two MDTs, and is skipped when the MDS version is
# 2.7.55 or newer (LU-6437).
1825 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
1827 [ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.55) ] &&
1828 skip "Skip the test after 2.7.55 see LU-6437" && return
1831 echo "According to current metadata migration implementation,"
1832 echo "before the old MDT-object is removed, both the new MDT-object"
1833 echo "and old MDT-object will reference the same LOV layout. Then if"
1834 echo "the layout LFSCK finds the new MDT-object by race, it will"
1835 echo "regard related OST-object(s) as multiple referenced case, and"
1836 echo "will try to create new OST-object(s) for the new MDT-object."
1837 echo "To avoid such trouble, the layout LFSCK needs to lock the old"
1838 echo "MDT-object before confirm the multiple referenced case."
# Directory a1 is created on MDT index 1 and holds one 1MB file.
1841 check_mount_and_prep
1842 $LFS mkdir -i 1 $DIR/$tdir/a1
1843 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1844 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=1
1845 cancel_lru_locks osc
1847 echo "Inject failure stub on MDT1 to delay the migration"
1849 #define OBD_FAIL_MIGRATE_DELAY 0x1803
1850 do_facet mds2 $LCTL set_param fail_val=5 fail_loc=0x1803
# The migration to MDT index 0 runs in the background ('&').
1851 echo "Migrate $DIR/$tdir/a1 from MDT1 to MDT0 with delay"
1852 $LFS migrate -m 0 $DIR/$tdir/a1 &
1855 echo "Trigger layout LFSCK to race with the migration"
1856 $START_LAYOUT -A -r || error "(1) Fail to start layout LFSCK!"
1858 wait_all_targets_blocked layout completed 2
# Clear the injection, then expect exactly one unmatched-pair repair and
# no multiple-reference repair.
1860 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
1861 local repaired=$($SHOW_LAYOUT |
1862 awk '/^repaired_unmatched_pair/ { print $2 }')
1863 [ $repaired -eq 1 ] ||
1864 error "(3) Fail to repair unmatched pair: $repaired"
1866 repaired=$($SHOW_LAYOUT |
1867 awk '/^repaired_multiple_referenced/ { print $2 }')
1868 [ $repaired -eq 0 ] ||
1869 error "(4) Unexpectedly repaird multiple references: $repaired"
1871 run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)"
# test_16: changes a file's owner while OBD_FAIL_LFSCK_BAD_OWNER is set on
# the MDS, then checks that a layout LFSCK completes and reports exactly one
# entry in repaired_inconsistent_owner.
1875 echo "If the OST-object's owner information does not match the owner"
1876 echo "information stored in the MDT-object, then the LFSCK trust the"
1877 echo "MDT-object and update the OST-object's owner information."
# Create a 1MB single-stripe file on OST index 0.
1880 check_mount_and_prep
1881 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1882 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1883 cancel_lru_locks osc
# The chown to 1.1 happens only while the injection is active.
1885 echo "Inject failure stub to skip OST-object owner changing"
1886 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
1887 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
1888 chown 1.1 $DIR/$tdir/f0
1889 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Run the layout LFSCK and wait for status "completed".
1891 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
1894 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1896 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1897 mdd.${MDT_DEV}.lfsck_layout |
1898 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1900 error "(2) unexpected status"
# Exactly one repair is expected.
1903 local repaired=$($SHOW_LAYOUT |
1904 awk '/^repaired_inconsistent_owner/ { print $2 }')
1905 [ $repaired -eq 1 ] ||
1906 error "(3) Fail to repair inconsistent owner: $repaired"
1908 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# test_17: creates files while OBD_FAIL_LFSCK_MULTIPLE_REF is set on the
# MDS so that f0 and f1 report guard's size. After a layout LFSCK reports
# two repaired_multiple_referenced entries, it checks that writing 2MB to
# f0 and f1 leaves guard's size unchanged.
1912 echo "If more than one MDT-objects reference the same OST-object,"
1913 echo "and the OST-object only recognizes one MDT-object, then the"
1914 echo "LFSCK should create new OST-objects for such non-recognized"
1918 check_mount_and_prep
1919 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1921 echo "Inject failure stub to make two MDT-objects to refernce"
1922 echo "the OST-object"
# Under the injection: write guard (1MB), create one file through
# createmany, and create the PFL file f1. Client mdc and osc locks are
# cancelled after each step.
1924 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
1925 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
1926 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1927 cancel_lru_locks mdc
1928 cancel_lru_locks osc
1930 createmany -o $DIR/$tdir/f 1
1931 cancel_lru_locks mdc
1932 cancel_lru_locks osc
1934 $LFS setstripe -E 2M -S 256K -o 0 -E 4M -S 512K -o 0 \
1936 error "(0) Fail to create PFL $DIR/$tdir/f1"
1937 cancel_lru_locks mdc
1938 cancel_lru_locks osc
1939 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# Before the repair, f0 and f1 must both report 1048576 bytes (the fifth
# "ls -l" column), although only guard was written.
1941 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
1942 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard use the same OST-objects"
1943 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
1944 [ $size -eq 1048576 ] ||
1945 error "(1.1) f0 (wrong) size should be 1048576, but got $size"
1947 size=$(ls -l $DIR/$tdir/f1 | awk '{ print $5 }')
1948 [ $size -eq 1048576 ] ||
1949 error "(1.2) f1 (wrong) size should be 1048576, but got $size"
# Run the layout LFSCK and wait for status "completed".
1951 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
1954 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1956 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1957 mdd.${MDT_DEV}.lfsck_layout |
1958 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1960 error "(3) unexpected status"
# Exactly two repairs are expected.
1963 local repaired=$($SHOW_LAYOUT |
1964 awk '/^repaired_multiple_referenced/ { print $2 }')
1965 [ $repaired -eq 2 ] ||
1966 error "(4) Fail to repair multiple references: $repaired"
# After the repair, writing 2MB to f0 must leave guard at 1048576 bytes.
1968 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
1969 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
1970 error "(5) Fail to write f0."
1971 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1972 [ $size -eq 1048576 ] ||
1973 error "(6) guard size should be 1048576, but got $size"
# Same check after writing 2MB to f1.
1975 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard should use diff OST-objects"
1976 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2 ||
1977 error "(7) Fail to write f1."
1978 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1979 [ $size -eq 1048576 ] ||
1980 error "(8) guard size should be 1048576, but got $size"
1982 run_test 17 "LFSCK can repair multiple references"
# Add "cache" to the local debug parameter ("+cache" form); the command's
# standard output is discarded.
1984 $LCTL set_param debug=+cache > /dev/null
# Body of the test registered below as 18a.
# Scenario: the MDT-objects still exist, but their layout EA is dropped by
# OBD_FAIL_LFSCK_LOST_STRIPE. A layout LFSCK started with "-r -o" is expected
# to rebuild the layout from the orphan OST-objects, after which the files
# must report their original sizes again.
1988 echo "The target MDT-object is there, but related stripe information"
1989 echo "is lost or partly lost. The LFSCK should regenerate the missing"
1990 echo "layout EA entries."
1993 check_mount_and_prep
# a1/f1: 2MB single-stripe file below a directory created on MDT index 0.
1994 $LFS mkdir -i 0 $DIR/$tdir/a1
1995 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1996 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# With "ls -il" the inode number comes first, so awk field 6 is the size.
1998 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2000 $LFS path2fid $DIR/$tdir/a1/f1
2001 $LFS getstripe $DIR/$tdir/a1/f1
# a2/f2 (two stripes, directory on MDT index 1) is only created with >= 2 MDTs.
2003 if [ $MDSCOUNT -ge 2 ]; then
2004 $LFS mkdir -i 1 $DIR/$tdir/a2
2005 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2006 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2007 $LFS path2fid $DIR/$tdir/a2/f2
2008 $LFS getstripe $DIR/$tdir/a2/f2
# f3: composite (PFL) file with a [0, 1M) component and a second one up to EOF.
2011 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2012 error "(0) Fail to create PFL $DIR/$tdir/f3"
2014 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2016 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2018 $LFS path2fid $DIR/$tdir/f3
2019 $LFS getstripe $DIR/$tdir/f3
2021 cancel_lru_locks osc
# The chown calls below are issued while the fail_loc is armed on the MDS.
2023 echo "Inject failure, to make the MDT-object lost its layout EA"
2024 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
2025 do_facet mds1 $LCTL set_param fail_loc=0x1615
2026 chown 1.1 $DIR/$tdir/a1/f1
2028 if [ $MDSCOUNT -ge 2 ]; then
2029 do_facet mds2 $LCTL set_param fail_loc=0x1615
2030 chown 1.1 $DIR/$tdir/a2/f2
2033 chown 1.1 $DIR/$tdir/f3
# Disarm the fault injection and drop cached locks before re-reading sizes.
2038 do_facet mds1 $LCTL set_param fail_loc=0
2039 if [ $MDSCOUNT -ge 2 ]; then
2040 do_facet mds2 $LCTL set_param fail_loc=0
2043 cancel_lru_locks mdc
2044 cancel_lru_locks osc
2046 echo "The file size should be incorrect since layout EA is lost"
2047 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2048 [ "$cur_size" != "$saved_size1" ] ||
2049 error "(1) Expect incorrect file1 size"
# f2 was written with the same dd parameters as f1, hence saved_size1 is reused.
2051 if [ $MDSCOUNT -ge 2 ]; then
2052 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2053 [ "$cur_size" != "$saved_size1" ] ||
2054 error "(2) Expect incorrect file2 size"
2057 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2058 [ "$cur_size" != "$saved_size2" ] ||
2059 error "(1.2) Expect incorrect file3 size"
2061 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2062 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
2064 for k in $(seq $MDSCOUNT); do
2065 # The LFSCK status query internal is 30 seconds. For the case
2066 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2067 # time to guarantee the status sync up.
2068 wait_update_facet mds${k} "$LCTL get_param -n \
2069 mdd.$(facet_svc mds${k}).lfsck_layout |
2070 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2071 error "(4) MDS${k} is not the expected 'completed'"
# OSTs are sampled once (no waiting) after every MDT reported "completed".
2074 for k in $(seq $OSTCOUNT); do
2075 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2076 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2077 awk '/^status/ { print $2 }')
2078 [ "$cur_status" == "completed" ] ||
2079 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# repaired_orphan counters: 3 are required on mds1 and 2 on mds2.
2082 local repaired=$(do_facet mds1 $LCTL get_param -n \
2083 mdd.$(facet_svc mds1).lfsck_layout |
2084 awk '/^repaired_orphan/ { print $2 }')
2085 [ $repaired -eq 3 ] ||
2086 error "(6.1) Expect 3 fixed on mds1, but got: $repaired"
2088 if [ $MDSCOUNT -ge 2 ]; then
2089 repaired=$(do_facet mds2 $LCTL get_param -n \
2090 mdd.$(facet_svc mds2).lfsck_layout |
2091 awk '/^repaired_orphan/ { print $2 }')
2092 [ $repaired -eq 2 ] ||
2093 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
# Dump FID and layout of every file after the repair (diagnostic output only).
2096 $LFS path2fid $DIR/$tdir/a1/f1
2097 $LFS getstripe $DIR/$tdir/a1/f1
2099 if [ $MDSCOUNT -ge 2 ]; then
2100 $LFS path2fid $DIR/$tdir/a2/f2
2101 $LFS getstripe $DIR/$tdir/a2/f2
2104 $LFS path2fid $DIR/$tdir/f3
2105 $LFS getstripe $DIR/$tdir/f3
2107 echo "The file size should be correct after layout LFSCK scanning"
2108 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2109 [ "$cur_size" == "$saved_size1" ] ||
2110 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2112 if [ $MDSCOUNT -ge 2 ]; then
2113 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2114 [ "$cur_size" == "$saved_size1" ] ||
2115 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# This check is about f3, so the message names file3.
2118 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2119 [ "$cur_size" == "$saved_size2" ] ||
2120 error "(9) Expect file3 size $saved_size2, but got $cur_size"
2122 run_test 18a "Find out orphan OST-object and repair it (1)"
# Body of the test registered below as 18b.
# Scenario: the MDT-objects themselves are removed while
# OBD_FAIL_LFSCK_LOST_MDTOBJ is armed. Layout LFSCK is expected to re-create
# them as <fid>-R-0 entries under .lustre/lost+found/MDTxxxx; the test then
# moves them back into the namespace and verifies the sizes.
2126 echo "The target MDT-object is lost. The LFSCK should re-create the"
2127 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
2128 echo "can move it back to normal namespace manually."
2131 check_mount_and_prep
2132 $LFS mkdir -i 0 $DIR/$tdir/a1
2133 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2134 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# With "ls -il" the inode number comes first, so awk field 6 is the size.
2135 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
# The FIDs are remembered because the lost+found entry names are built from them.
2136 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2138 $LFS getstripe $DIR/$tdir/a1/f1
2140 if [ $MDSCOUNT -ge 2 ]; then
2141 $LFS mkdir -i 1 $DIR/$tdir/a2
2142 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2143 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
# Declared local like fid1/fid3 so it does not leak into later tests.
2144 local fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
2146 $LFS getstripe $DIR/$tdir/a2/f2
# f3: composite (PFL) file with a [0, 1M) component and a second one up to EOF.
2149 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2150 error "(0) Fail to create PFL $DIR/$tdir/f3"
2152 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2154 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2155 local fid3=$($LFS path2fid $DIR/$tdir/f3)
2157 $LFS getstripe $DIR/$tdir/f3
2159 cancel_lru_locks osc
# The rm calls below are issued while the fail_loc is armed on the MDS.
2161 echo "Inject failure, to simulate the case of missing the MDT-object"
2162 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2163 do_facet mds1 $LCTL set_param fail_loc=0x1616
2164 rm -f $DIR/$tdir/a1/f1
2166 if [ $MDSCOUNT -ge 2 ]; then
2167 do_facet mds2 $LCTL set_param fail_loc=0x1616
2168 rm -f $DIR/$tdir/a2/f2
2176 do_facet mds1 $LCTL set_param fail_loc=0
2177 if [ $MDSCOUNT -ge 2 ]; then
2178 do_facet mds2 $LCTL set_param fail_loc=0
2181 cancel_lru_locks mdc
2182 cancel_lru_locks osc
2184 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2185 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2187 for k in $(seq $MDSCOUNT); do
2188 # The LFSCK status query internal is 30 seconds. For the case
2189 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2190 # time to guarantee the status sync up.
2191 wait_update_facet mds${k} "$LCTL get_param -n \
2192 mdd.$(facet_svc mds${k}).lfsck_layout |
2193 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2194 error "(2) MDS${k} is not the expected 'completed'"
# OSTs are sampled once (no waiting) after every MDT reported "completed".
2197 for k in $(seq $OSTCOUNT); do
2198 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2199 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2200 awk '/^status/ { print $2 }')
2201 [ "$cur_status" == "completed" ] ||
2202 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# repaired_orphan counters: 3 are required on mds1 and 2 on mds2.
2205 local repaired=$(do_facet mds1 $LCTL get_param -n \
2206 mdd.$(facet_svc mds1).lfsck_layout |
2207 awk '/^repaired_orphan/ { print $2 }')
2208 [ $repaired -eq 3 ] ||
2209 error "(4.1) Expect 3 fixed on mds1, but got: $repaired"
2211 if [ $MDSCOUNT -ge 2 ]; then
2212 repaired=$(do_facet mds2 $LCTL get_param -n \
2213 mdd.$(facet_svc mds2).lfsck_layout |
2214 awk '/^repaired_orphan/ { print $2 }')
2215 [ $repaired -eq 2 ] ||
2216 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
2219 echo "Move the files from .lustre/lost+found/MDTxxxx to namespace"
2220 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2221 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2223 if [ $MDSCOUNT -ge 2 ]; then
2224 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
2225 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
# Distinct label so a failure here is not confused with step (5) for f1.
2228 mv $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0 $DIR/$tdir/f3 ||
2229 error "(6.1) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
# Dump FID and layout of the restored files (diagnostic output only).
2231 $LFS path2fid $DIR/$tdir/a1/f1
2232 $LFS getstripe $DIR/$tdir/a1/f1
2234 if [ $MDSCOUNT -ge 2 ]; then
2235 $LFS path2fid $DIR/$tdir/a2/f2
2236 $LFS getstripe $DIR/$tdir/a2/f2
2239 $LFS path2fid $DIR/$tdir/f3
2240 $LFS getstripe $DIR/$tdir/f3
2242 echo "The file size should be correct after layout LFSCK scanning"
2243 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2244 [ "$cur_size" == "$saved_size1" ] ||
2245 error "(7) Expect file1 size $saved_size1, but got $cur_size"
# f2 was written with the same dd parameters as f1, hence saved_size1 is reused.
2247 if [ $MDSCOUNT -ge 2 ]; then
2248 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2249 [ "$cur_size" == "$saved_size1" ] ||
2250 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# This check is about f3, so the message names file3.
2253 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2254 [ "$cur_size" == "$saved_size2" ] ||
2255 error "(9) Expect file3 size $saved_size2, but got $cur_size"
2257 run_test 18b "Find out orphan OST-object and repair it (2)"
# Body of the test registered below as 18c.
# Scenario: files are written while OBD_FAIL_LFSCK_NOPFID is armed on the
# OSTs, then the MDT-objects are removed under OBD_FAIL_LFSCK_LOST_MDTOBJ.
# Layout LFSCK is expected to re-create MDT-objects with new FIDs; the test
# looks for "*-N-*" entries under .lustre/lost+found/MDT0000 only.
2261 echo "The target MDT-object is lost, and the OST-object FID is missing."
2262 echo "The LFSCK should re-create the MDT-object with new FID under the "
2263 echo "directory .lustre/lost+found/MDTxxxx."
2266 check_mount_and_prep
2267 $LFS mkdir -i 0 $DIR/$tdir/a1
2268 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
# The OST-side fail_loc stays armed for all three file writes below.
2270 echo "Inject failure, to simulate the case of missing parent FID"
2271 #define OBD_FAIL_LFSCK_NOPFID 0x1617
2272 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1617
2274 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2275 $LFS getstripe $DIR/$tdir/a1/f1
2277 if [ $MDSCOUNT -ge 2 ]; then
2278 $LFS mkdir -i 1 $DIR/$tdir/a2
2279 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a2
2280 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2281 $LFS getstripe $DIR/$tdir/a2/f2
# f3: composite (PFL) file with a [0, 1M) component and a second one up to EOF.
2284 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2285 error "(0) Fail to create PFL $DIR/$tdir/f3"
2287 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2288 $LFS getstripe $DIR/$tdir/f3
2290 cancel_lru_locks osc
2291 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
# The rm calls below are issued while the fail_loc is armed on the MDS.
2293 echo "Inject failure, to simulate the case of missing the MDT-object"
2294 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2295 do_facet mds1 $LCTL set_param fail_loc=0x1616
2296 rm -f $DIR/$tdir/a1/f1
2298 if [ $MDSCOUNT -ge 2 ]; then
2299 do_facet mds2 $LCTL set_param fail_loc=0x1616
2300 rm -f $DIR/$tdir/a2/f2
2308 do_facet mds1 $LCTL set_param fail_loc=0
2309 if [ $MDSCOUNT -ge 2 ]; then
2310 do_facet mds2 $LCTL set_param fail_loc=0
2313 cancel_lru_locks mdc
2314 cancel_lru_locks osc
2316 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2317 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2319 for k in $(seq $MDSCOUNT); do
2320 # The LFSCK status query internal is 30 seconds. For the case
2321 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2322 # time to guarantee the status sync up.
2323 wait_update_facet mds${k} "$LCTL get_param -n \
2324 mdd.$(facet_svc mds${k}).lfsck_layout |
2325 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2326 error "(2) MDS${k} is not the expected 'completed'"
# OSTs are sampled once (no waiting) after every MDT reported "completed".
2329 for k in $(seq $OSTCOUNT); do
2330 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2331 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2332 awk '/^status/ { print $2 }')
2333 [ "$cur_status" == "completed" ] ||
2334 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# NOTE(review): $expected is compared below but is not assigned on any line
# visible in this block; confirm where it is set for the 1-MDT/multi-MDT cases.
2337 if [ $MDSCOUNT -ge 2 ]; then
2343 local repaired=$(do_facet mds1 $LCTL get_param -n \
2344 mdd.$(facet_svc mds1).lfsck_layout |
2345 awk '/^repaired_orphan/ { print $2 }')
2346 [ $repaired -eq $expected ] ||
2347 error "(4) Expect $expected fixed on mds1, but got: $repaired"
# mds2 must report zero repaired orphans in this scenario.
2349 if [ $MDSCOUNT -ge 2 ]; then
2350 repaired=$(do_facet mds2 $LCTL get_param -n \
2351 mdd.$(facet_svc mds2).lfsck_layout |
2352 awk '/^repaired_orphan/ { print $2 }')
2353 [ $repaired -eq 0 ] ||
2354 error "(5) Expect 0 fixed on mds2, but got: $repaired"
2357 ls -ail $MOUNT/.lustre/lost+found/
# The -name pattern is quoted so the shell cannot expand it against the
# current directory before find sees it.
2359 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
2360 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
2361 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-N-*')
2363 error "(6) .lustre/lost+found/MDT0001/ should be empty"
2366 echo "There should be some stub under .lustre/lost+found/MDT0000/"
2367 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2368 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
2370 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-N-*')
2371 [ ! -z "$cname" ] ||
2372 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2374 run_test 18c "Find out orphan OST-object and repair it (3)"
# Body of the test registered below as 18d.
# Scenario: OBD_FAIL_LFSCK_CHANGE_STRIPE is used to corrupt the layout EA of
# f2 and f4 while their original OST-objects stay behind as orphans. Layout
# LFSCK (started with "-r -o -c -d") must report 2 repaired orphans and
# 0 repaired dangling references, and f2/f4 must show their saved sizes.
2378 echo "The target MDT-object layout EA is corrupted, but the right"
2379 echo "OST-object is still alive as orphan. The layout LFSCK will"
2380 echo "not create new OST-object to occupy such slot."
2383 check_mount_and_prep
# f1/f2 are plain single-stripe files; f3 is plain and f4 is a PFL file.
2385 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2386 echo "guard" > $DIR/$tdir/a1/f1
2387 echo "foo" > $DIR/$tdir/a1/f2
2389 echo "guard" > $DIR/$tdir/a1/f3
2390 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2391 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2392 echo "foo" > $DIR/$tdir/a1/f4
# With "ls -il" the inode number comes first, so awk field 6 is the size.
2394 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2395 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
# Dump FID and layout of all four files (diagnostic output only).
2396 $LFS path2fid $DIR/$tdir/a1/f1
2397 $LFS getstripe $DIR/$tdir/a1/f1
2398 $LFS path2fid $DIR/$tdir/a1/f2
2399 $LFS getstripe $DIR/$tdir/a1/f2
2400 $LFS path2fid $DIR/$tdir/a1/f3
2401 $LFS getstripe $DIR/$tdir/a1/f3
2402 $LFS path2fid $DIR/$tdir/a1/f4
2403 $LFS getstripe $DIR/$tdir/a1/f4
2404 cancel_lru_locks osc
2406 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2407 echo "to reference the same OST-object (which is f1's OST-obejct)."
2408 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2409 echo "dangling reference case, but f2's old OST-object is there."
2411 echo "The failure also makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2412 echo "to reference the same OST-object (which is f3's OST-obejct)."
2413 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2414 echo "dangling reference case, but f4's old OST-object is there."
# chown on f2/f4 and removal of f1/f3 all happen while the fail_loc is armed.
2417 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2418 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2419 chown 1.1 $DIR/$tdir/a1/f2
2420 chown 1.1 $DIR/$tdir/a1/f4
2421 rm -f $DIR/$tdir/a1/f1
2422 rm -f $DIR/$tdir/a1/f3
2425 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2427 echo "stopall to cleanup object cache"
2430 setupall > /dev/null
2432 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2433 $START_LAYOUT -r -o -c -d || error "(2) Fail to start LFSCK for layout!"
2435 for k in $(seq $MDSCOUNT); do
2436 # The LFSCK status query internal is 30 seconds. For the case
2437 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2438 # time to guarantee the status sync up.
2439 wait_update_facet mds${k} "$LCTL get_param -n \
2440 mdd.$(facet_svc mds${k}).lfsck_layout |
2441 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2442 error "(3) MDS${k} is not the expected 'completed'"
# OSTs are sampled once (no waiting) after every MDT reported "completed".
2445 for k in $(seq $OSTCOUNT); do
2446 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2447 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2448 awk '/^status/ { print $2 }')
2449 [ "$cur_status" == "completed" ] ||
2450 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
# Two counters are read from the same lfsck_layout output:
# repaired_orphan must be 2 and repaired_dangling must be 0.
2453 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2454 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2455 awk '/^repaired_orphan/ { print $2 }')
2456 [ $repaired -eq 2 ] ||
2457 error "(5) Expect 2 orphans have been fixed, but got: $repaired"
2459 repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2460 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2461 awk '/^repaired_dangling/ { print $2 }')
2462 [ $repaired -eq 0 ] ||
2463 error "(6) Expect 0 dangling has been fixed, but got: $repaired"
2465 echo "The file size should be correct after layout LFSCK scanning"
2466 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2467 [ "$cur_size" == "$saved_size1" ] ||
2468 error "(7) Expect file2 size $saved_size1, but got $cur_size"
2470 cur_size=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2471 [ "$cur_size" == "$saved_size2" ] ||
2472 error "(8) Expect file4 size $saved_size2, but got $cur_size"
# The content is only printed; it is not compared against "foo" here.
2474 echo "The LFSCK should find back the original data."
2475 cat $DIR/$tdir/a1/f2
2476 $LFS path2fid $DIR/$tdir/a1/f2
2477 $LFS getstripe $DIR/$tdir/a1/f2
2478 cat $DIR/$tdir/a1/f4
2479 $LFS path2fid $DIR/$tdir/a1/f4
2480 $LFS getstripe $DIR/$tdir/a1/f4
2482 run_test 18d "Find out orphan OST-object and repair it (4)"
# Body of the test registered below as 18e.
# Scenario: same corruption as 18d (OBD_FAIL_LFSCK_CHANGE_STRIPE), but LFSCK
# is slowed down with OBD_FAIL_LFSCK_DELAY3 so the test can append new data
# to f2/f4 once mds1 reports "scanning-phase2". The test then expects two
# "*-C-*" stub files under .lustre/lost+found/MDT0000 whose sizes match the
# sizes saved for f2/f4.
2486 echo "The target MDT-object layout EA slot is occpuied by some new"
2487 echo "created OST-object when repair dangling reference case. Such"
2488 echo "conflict OST-object has been modified by others. To keep the"
2489 echo "new data, the LFSCK will create a new file to refernece this"
2490 echo "old orphan OST-object."
2493 check_mount_and_prep
# f1/f2 are plain single-stripe files; f3 is plain and f4 is a PFL file.
2495 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2496 echo "guard" > $DIR/$tdir/a1/f1
2497 echo "foo" > $DIR/$tdir/a1/f2
2499 echo "guard" > $DIR/$tdir/a1/f3
2500 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2501 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2502 echo "foo" > $DIR/$tdir/a1/f4
# With "ls -il" the inode number comes first, so awk field 6 is the size.
2504 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2505 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
# Dump FID and layout of all four files (diagnostic output only).
2507 $LFS path2fid $DIR/$tdir/a1/f1
2508 $LFS getstripe $DIR/$tdir/a1/f1
2509 $LFS path2fid $DIR/$tdir/a1/f2
2510 $LFS getstripe $DIR/$tdir/a1/f2
2511 $LFS path2fid $DIR/$tdir/a1/f3
2512 $LFS getstripe $DIR/$tdir/a1/f3
2513 $LFS path2fid $DIR/$tdir/a1/f4
2514 $LFS getstripe $DIR/$tdir/a1/f4
2515 cancel_lru_locks osc
2517 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2518 echo "to reference the same OST-object (which is f1's OST-obejct)."
2519 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2520 echo "dangling reference case, but f2's old OST-object is there."
2522 echo "Also the failure makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2523 echo "to reference the same OST-object (which is f3's OST-obejct)."
2524 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2525 echo "dangling reference case, but f4's old OST-object is there."
# chown on f2/f4 and removal of f1/f3 all happen while the fail_loc is armed.
2528 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2529 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2530 chown 1.1 $DIR/$tdir/a1/f2
2531 chown 1.1 $DIR/$tdir/a1/f4
2532 rm -f $DIR/$tdir/a1/f1
2533 rm -f $DIR/$tdir/a1/f3
2536 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2538 echo "stopall to cleanup object cache"
2541 setupall > /dev/null
# The delay fail_loc is armed with fail_val=10 before LFSCK is started.
2543 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2544 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
2546 start_full_debug_logging
2548 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2549 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
2551 wait_update_facet mds1 "$LCTL get_param -n \
2552 mdd.$(facet_svc mds1).lfsck_layout |
2553 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2554 error "(3) MDS1 is not the expected 'scanning-phase2'"
2556 # to guarantee all updates are synced.
2560 echo "Write new data to f2/f4 to modify the new created OST-object."
2561 echo "dummy" >> $DIR/$tdir/a1/f2 || error "write a1/f2 failed"
2562 echo "dummy" >> $DIR/$tdir/a1/f4 || error "write a1/f4 failed"
# Release the delay so LFSCK can run to completion.
2564 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
2566 for k in $(seq $MDSCOUNT); do
2567 # The LFSCK status query internal is 30 seconds. For the case
2568 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2569 # time to guarantee the status sync up.
2570 wait_update_facet mds${k} "$LCTL get_param -n \
2571 mdd.$(facet_svc mds${k}).lfsck_layout |
2572 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2573 error "(4) MDS${k} is not the expected 'completed'"
# OSTs are sampled once (no waiting) after every MDT reported "completed".
2576 for k in $(seq $OSTCOUNT); do
2577 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2578 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2579 awk '/^status/ { print $2 }')
2580 [ "$cur_status" == "completed" ] ||
2581 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2584 stop_full_debug_logging
2586 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2587 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2588 awk '/^repaired_orphan/ { print $2 }')
2589 [ $repaired -eq 2 ] ||
2590 error "(6) Expect 2 orphans have been fixed, but got: $repaired"
2592 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2593 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2594 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# Here the shell glob is intentional: ls receives the expanded stub names.
2596 local count=$(ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-* | wc -l)
2597 if [ $count -ne 2 ]; then
2598 ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-*
2599 error "(8) Expect 2 stubs under lost+found, but got $count"
# The find -name pattern is quoted so the shell cannot expand it against the
# current directory before find sees it.
2602 echo "The stub file should keep the original f2 or f4 data"
2603 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-C-*' | head -n 1)
2604 local cur_size=$(ls -il $cname | awk '{ print $6 }')
2605 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2606 error "(9) Got unexpected $cur_size"
2609 $LFS path2fid $cname
2610 $LFS getstripe $cname
2612 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-C-*' | tail -n 1)
2613 cur_size=$(ls -il $cname | awk '{ print $6 }')
2614 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2615 error "(10) Got unexpected $cur_size"
2618 $LFS path2fid $cname
2619 $LFS getstripe $cname
# The content of f2/f4 is only printed; it is not compared here.
2621 echo "The f2/f4 should contains new data."
2622 cat $DIR/$tdir/a1/f2
2623 $LFS path2fid $DIR/$tdir/a1/f2
2624 $LFS getstripe $DIR/$tdir/a1/f2
2625 cat $DIR/$tdir/a1/f4
2626 $LFS path2fid $DIR/$tdir/a1/f4
2627 $LFS getstripe $DIR/$tdir/a1/f4
2629 run_test 18e "Find out orphan OST-object and repair it (5)"
# Body of the test registered below as 18f (skipped with fewer than 2 OSTs).
# Scenario: MDT-objects are removed under OBD_FAIL_LFSCK_LOST_MDTOBJ, then the
# first LFSCK run is started with OBD_FAIL_LFSCK_BAD_NETWORK armed on mds1.
# That run must end as "partial" on the MDTs and on ost1 but "completed" on
# ost2, with 1 repaired orphan per MDT. A second, undisturbed run must end as
# "completed" everywhere with 2 repaired orphans per MDT.
2632 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
2635 echo "The target MDT-object is lost. The LFSCK should re-create the"
2636 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2637 echo "to verify some OST-object(s) during the first stage-scanning,"
2638 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2639 echo "should not be affected."
2642 check_mount_and_prep
# a1: single-stripe files (guard + f1); a2: two-stripe file f2. Both on MDT 0.
2643 $LFS mkdir -i 0 $DIR/$tdir/a1
2644 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2645 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2646 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2647 $LFS mkdir -i 0 $DIR/$tdir/a2
2648 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
2649 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2650 $LFS getstripe $DIR/$tdir/a1/f1
2651 $LFS getstripe $DIR/$tdir/a2/f2
# Same layout again on MDT index 1 (a3/a4) when a second MDT is available.
2653 if [ $MDSCOUNT -ge 2 ]; then
2654 $LFS mkdir -i 1 $DIR/$tdir/a3
2655 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a3
2656 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2657 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2658 $LFS mkdir -i 1 $DIR/$tdir/a4
2659 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
2660 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2661 $LFS getstripe $DIR/$tdir/a3/f3
2662 $LFS getstripe $DIR/$tdir/a4/f4
2665 cancel_lru_locks osc
# The rm calls below are issued while the fail_loc is armed on the MDS.
2667 echo "Inject failure, to simulate the case of missing the MDT-object"
2668 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2669 do_facet mds1 $LCTL set_param fail_loc=0x1616
2670 rm -f $DIR/$tdir/a1/f1
2671 rm -f $DIR/$tdir/a2/f2
2673 if [ $MDSCOUNT -ge 2 ]; then
2674 do_facet mds2 $LCTL set_param fail_loc=0x1616
2675 rm -f $DIR/$tdir/a3/f3
2676 rm -f $DIR/$tdir/a4/f4
2682 do_facet mds1 $LCTL set_param fail_loc=0
2683 if [ $MDSCOUNT -ge 2 ]; then
2684 do_facet mds2 $LCTL set_param fail_loc=0
2687 cancel_lru_locks mdc
2688 cancel_lru_locks osc
2690 echo "Inject failure, to simulate the OST0 fail to handle"
2691 echo "MDT0 LFSCK request during the first-stage scanning."
2692 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
2693 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
2695 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2696 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2698 for k in $(seq $MDSCOUNT); do
2699 # The LFSCK status query internal is 30 seconds. For the case
2700 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2701 # time to guarantee the status sync up.
2702 wait_update_facet mds${k} "$LCTL get_param -n \
2703 mdd.$(facet_svc mds${k}).lfsck_layout |
2704 awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
2705 error "(2) MDS${k} is not the expected 'partial'"
2708 wait_update_facet ost1 "$LCTL get_param -n \
2709 obdfilter.$(facet_svc ost1).lfsck_layout |
2710 awk '/^status/ { print \\\$2 }'" "partial" $LTIME || {
2711 error "(3) OST1 is not the expected 'partial'"
2714 wait_update_facet ost2 "$LCTL get_param -n \
2715 obdfilter.$(facet_svc ost2).lfsck_layout |
2716 awk '/^status/ { print \\\$2 }'" "completed" $LTIME || {
2717 error "(4) OST2 is not the expected 'completed'"
2720 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
# First run: one repaired orphan is required on each MDT.
2722 local repaired=$(do_facet mds1 $LCTL get_param -n \
2723 mdd.$(facet_svc mds1).lfsck_layout |
2724 awk '/^repaired_orphan/ { print $2 }')
2725 [ $repaired -eq 1 ] ||
2726 error "(5) Expect 1 fixed on mds1, but got: $repaired"
2728 if [ $MDSCOUNT -ge 2 ]; then
2729 repaired=$(do_facet mds2 $LCTL get_param -n \
2730 mdd.$(facet_svc mds2).lfsck_layout |
2731 awk '/^repaired_orphan/ { print $2 }')
2732 [ $repaired -eq 1 ] ||
2733 error "(6) Expect 1 fixed on mds2, but got: $repaired"
2736 echo "Trigger layout LFSCK on all devices again to cleanup"
2737 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
2739 for k in $(seq $MDSCOUNT); do
2740 # The LFSCK status query internal is 30 seconds. For the case
2741 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2742 # time to guarantee the status sync up.
2743 wait_update_facet mds${k} "$LCTL get_param -n \
2744 mdd.$(facet_svc mds${k}).lfsck_layout |
2745 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2746 error "(8) MDS${k} is not the expected 'completed'"
# cur_status is declared local here, matching the sibling tests, so it does
# not leak into the global scope.
2749 for k in $(seq $OSTCOUNT); do
2750 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2751 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2752 awk '/^status/ { print $2 }')
2753 [ "$cur_status" == "completed" ] ||
2754 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
# Second run: two repaired orphans are required on each MDT.
2758 local repaired=$(do_facet mds1 $LCTL get_param -n \
2759 mdd.$(facet_svc mds1).lfsck_layout |
2760 awk '/^repaired_orphan/ { print $2 }')
2761 [ $repaired -eq 2 ] ||
2762 error "(10) Expect 2 fixed on mds1, but got: $repaired"
2764 if [ $MDSCOUNT -ge 2 ]; then
2765 repaired=$(do_facet mds2 $LCTL get_param -n \
2766 mdd.$(facet_svc mds2).lfsck_layout |
2767 awk '/^repaired_orphan/ { print $2 }')
2768 [ $repaired -eq 2 ] ||
2769 error "(11) Expect 2 fixed on mds2, but got: $repaired"
2772 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# Body of the test registered below as 18g.
# Scenario: a file striped over all OSTs loses its MDT-object under
# OBD_FAIL_LFSCK_LOST_MDTOBJ2 (the OI mapping is kept). Layout LFSCK must
# report $OSTCOUNT repaired orphans, and the re-created <fid>-R-0 entry is
# moved back from .lustre/lost+found/MDT0000 into the namespace.
2776 echo "The target MDT-object is lost, but related OI mapping is there"
2777 echo "The LFSCK should recreate the lost MDT-object without affected"
2778 echo "by the stale OI mapping."
2781 check_mount_and_prep
# "-c -1" stripes over all OSTs; one 1MB block is written per OST.
2782 $LFS mkdir -i 0 $DIR/$tdir/a1
2783 $LFS setstripe -c -1 -i 0 -S 1M $DIR/$tdir/a1
2784 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=$OSTCOUNT
# The FID is remembered because the lost+found entry name is built from it.
2785 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2787 $LFS getstripe $DIR/$tdir/a1/f1
2788 cancel_lru_locks osc
2790 echo "Inject failure to simulate lost MDT-object but keep OI mapping"
2791 #define OBD_FAIL_LFSCK_LOST_MDTOBJ2 0x162e
2792 do_facet mds1 $LCTL set_param fail_loc=0x162e
2793 rm -f $DIR/$tdir/a1/f1
2795 do_facet mds1 $LCTL set_param fail_loc=0
2796 cancel_lru_locks mdc
2797 cancel_lru_locks osc
2799 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2800 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2802 for k in $(seq $MDSCOUNT); do
2803 # The LFSCK status query internal is 30 seconds. For the case
2804 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2805 # time to guarantee the status sync up.
2806 wait_update_facet mds${k} "$LCTL get_param -n \
2807 mdd.$(facet_svc mds${k}).lfsck_layout |
2808 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2809 error "(2) MDS${k} is not the expected 'completed'"
# OSTs are sampled once (no waiting) after every MDT reported "completed".
2812 for k in $(seq $OSTCOUNT); do
2813 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2814 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2815 awk '/^status/ { print $2 }')
2816 [ "$cur_status" == "completed" ] ||
2817 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# One repaired orphan is required per OST.
2820 local repaired=$(do_facet mds1 $LCTL get_param -n \
2821 mdd.$(facet_svc mds1).lfsck_layout |
2822 awk '/^repaired_orphan/ { print $2 }')
2823 [ $repaired -eq $OSTCOUNT ] ||
2824 error "(4) Expect $OSTCOUNT fixed, but got: $repaired"
2826 echo "Move the files from .lustre/lost+found/MDTxxxx to namespace"
2827 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2828 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2830 $LFS path2fid $DIR/$tdir/a1/f1
2831 $LFS getstripe $DIR/$tdir/a1/f1
2833 run_test 18g "Find out orphan OST-object and repair it (7)"
# Body of the test registered below as 18h.
# Scenario: a PFL file gets a bad extent range via
# OBD_FAIL_LFSCK_BAD_PFL_RANGE. Writing to it must fail before LFSCK; after a
# layout LFSCK with 2 repaired orphans the data must equal the "guard" copy
# and writing must succeed again.
2837 echo "The PFL extent crashed. During the first cycle LFSCK scanning,"
2838 echo "the layout LFSCK will keep the bad PFL file(s) there without"
2839 echo "scanning its OST-object(s). Then in the second stage scanning,"
2840 echo "the OST will return related OST-object(s) to the MDT as orphan."
2841 echo "And then the LFSCK on the MDT can rebuild the PFL extent with"
2842 echo "the 'orphan(s)' stripe information."
2845 check_mount_and_prep
# f0: PFL file with a [0, 2M) component and a second one up to EOF.
2847 $LFS setstripe -E 2M -S 1M -c 1 -E -1 $DIR/$tdir/f0 ||
2848 error "(0) Fail to create PFL $DIR/$tdir/f0"
# Data is written at offset 0 and again at offset 2M (seek=2 with bs=1M).
2850 cat $LUSTRE/tests/test-framework.sh > $DIR/$tdir/f0 ||
2851 error "(1.1) Fail to write $DIR/$tdir/f0"
2853 dd if=$LUSTRE/tests/test-framework.sh of=$DIR/$tdir/f0 bs=1M seek=2 ||
2854 error "(1.2) Fail to write $DIR/$tdir/f0"
# Reference copy used for the diff after the repair.
2856 cp $DIR/$tdir/f0 $DIR/$tdir/guard
2858 echo "Inject failure stub to simulate bad PFL extent range"
2859 #define OBD_FAIL_LFSCK_BAD_PFL_RANGE 0x162f
2860 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162f
2862 chown 1.1 $DIR/$tdir/f0
2864 cancel_lru_locks mdc
2865 cancel_lru_locks osc
2866 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Inverted check: the test fails if this write succeeds.
2868 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 &&
2869 error "(2) Write to bad PFL file should fail"
2871 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
2872 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
2874 for k in $(seq $MDSCOUNT); do
2875 # The LFSCK status query internal is 30 seconds. For the case
2876 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2877 # time to guarantee the status sync up.
2878 wait_update_facet mds${k} "$LCTL get_param -n \
2879 mdd.$(facet_svc mds${k}).lfsck_layout |
2880 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2881 error "(4.1) MDS${k} is not the expected 'completed'"
# cur_status is declared local here, matching the sibling tests, so it does
# not leak into the global scope.
2884 for k in $(seq $OSTCOUNT); do
2885 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2886 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2887 awk '/^status/ { print $2 }')
2888 [ "$cur_status" == "completed" ] ||
2889 error "(4.2) OST${k} Expect 'completed', but got '$cur_status'"
2893 local repaired=$($SHOW_LAYOUT |
2894 awk '/^repaired_orphan/ { print $2 }')
2895 [ $repaired -eq 2 ] ||
2896 error "(5) Fail to repair crashed PFL range: $repaired"
2898 echo "Data in $DIR/$tdir/f0 should not be broken"
2899 diff $DIR/$tdir/f0 $DIR/$tdir/guard ||
2900 error "(6) Data in $DIR/$tdir/f0 is broken"
2902 echo "Write should succeed after LFSCK repairing the bad PFL range"
2903 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 ||
2904 error "(7) Write should succeed after LFSCK"
2906 run_test 18h "LFSCK can repair crashed PFL extent range"
# Remove the "cache" flag from the debug mask again; it was added with
# "debug=+cache" before the 18x tests.
2908 $LCTL set_param debug=-cache > /dev/null
# Body of the test registered below as 19a.
# Files are created with lfsck_verify_pfid set to 0 on OST0000; the setting is
# then switched to 1 and OBD_FAIL_LFSCK_INVALID_PFID is armed locally. Reading
# a0 (plain) and a1 (PFL) is expected to fail.
2911 check_mount_and_prep
2912 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2914 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2915 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
2917 echo "foo1" > $DIR/$tdir/a0
2918 $LFS setstripe -E 512K -S 512K -o 0 -E -1 -S 1M $DIR/$tdir/a1 ||
2919 error "(0) Fail to create PFL $DIR/$tdir/a1"
2920 echo "foo2" > $DIR/$tdir/a1
2921 echo "guard" > $DIR/$tdir/a2
2922 cancel_lru_locks osc
2924 echo "Inject failure, then client will offer wrong parent FID when read"
2925 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2926 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
# This fail_loc is set with a plain $LCTL call, i.e. on the node running the
# script, not through do_facet/do_nodes.
2928 #define OBD_FAIL_LFSCK_INVALID_PFID 0x1619
2929 $LCTL set_param fail_loc=0x1619
# Inverted checks: the test fails if either read succeeds.
2931 echo "Read RPC with wrong parent FID should be denied"
2932 cat $DIR/$tdir/a0 && error "(3.1) Read a0 should be denied!"
2933 cat $DIR/$tdir/a1 && error "(3.2) Read a1 should be denied!"
2934 $LCTL set_param fail_loc=0
2936 run_test 19a "OST-object inconsistency self detect"
# Body of the test registered below as 19b.
# Two files (f0 plain, f1 PFL) are written while OBD_FAIL_LFSCK_UNMATCHED_PAIR1
# is armed on the OSTs and lfsck_verify_pfid is 0. The "repaired" counter read
# from lfsck_verify_pfid on ost1 must be 0 at first; after lfsck_verify_pfid is
# set to 1 and both files are read, it must be 2 (one per file).
2939 check_mount_and_prep
2940 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2942 echo "Inject failure stub to make the OST-object to back point to"
2943 echo "non-exist MDT-object"
2945 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2946 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
2948 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
2949 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
2950 echo "foo1" > $DIR/$tdir/f0
2951 $LFS setstripe -E 1M -S 1M -o 0 -E 4M -S 256K $DIR/$tdir/f1 ||
2952 error "(0) Fail to create PFL $DIR/$tdir/f1"
2953 echo "foo2" > $DIR/$tdir/f1
2954 cancel_lru_locks osc
2955 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
2957 do_facet ost1 $LCTL set_param -n \
2958 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
2959 echo "Nothing should be fixed since self detect and repair is disabled"
2960 local repaired=$(do_facet ost1 $LCTL get_param -n \
2961 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2962 awk '/^repaired/ { print $2 }')
2963 [ $repaired -eq 0 ] ||
2964 error "(1) Expected 0 repaired, but got $repaired"
2966 echo "Read RPC with right parent FID should be accepted,"
2967 echo "and cause parent FID on OST to be fixed"
2969 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2970 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2972 cat $DIR/$tdir/f0 || error "(2.1) Read f0 should not be denied!"
2973 cat $DIR/$tdir/f1 || error "(2.2) Read f1 should not be denied!"
# Both f0 and f1 were read, so the check requires 2; the message states the
# same number as the condition.
2975 repaired=$(do_facet ost1 $LCTL get_param -n \
2976 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2977 awk '/^repaired/ { print $2 }')
2978 [ $repaired -eq 2 ] ||
2979 error "(3) Expected 2 repaired, but got $repaired"
2981 run_test 19b "OST-object inconsistency self repair"
# Expected "lfs getstripe -L" outputs used by tests 20a/20b: the first for a
# layout that contains a LOV EA hole, the second for an intact layout.
# NOTE(review): "40000001" looks like the hole flag 0x40000000 combined with
# the raid0 pattern value -- confirm.
2983 PATTERN_WITH_HOLE="40000001"
2984 PATTERN_WITHOUT_HOLE="raid0"
# Test 20a body (registered by run_test below): files lose their MDT-object
# (and, for f1..f3, one OST-object) through fault injection.  Layout LFSCK
# must re-create them under .lustre/lost+found/MDT0000/ as <fid>-R-0, with a
# LOV EA hole where a stripe is missing, and the client must be able to
# stat/read/write/chown/touch/unlink them as checked below.
2987 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
2990 echo "The target MDT-object and some of its OST-object are lost."
2991 echo "The LFSCK should find out the left OST-objects and re-create"
2992 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
2993 echo "with the partial OST-objects (LOV EA hole)."
2995 echo "New client can access the file with LOV EA hole via normal"
2996 echo "system tools or commands without crash the system."
2998 echo "For old client, even though it cannot access the file with"
2999 echo "LOV EA hole, it should not cause the system crash."
3002 check_mount_and_prep
3003 $LFS mkdir -i 0 $DIR/$tdir/a1
# Stripe a1 over 3 OSTs when more than 2 are available, otherwise over 2.
3004 if [ $OSTCOUNT -gt 2 ]; then
3005 $LFS setstripe -c 3 -i 0 -S 1M $DIR/$tdir/a1
3008 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a1
3012 # 256 blocks on the stripe0.
3013 # 1 block on the stripe1 for 2 OSTs case.
3014 # 256 blocks on the stripe1 for other cases.
3015 # 1 block on the stripe2 if OSTs > 2
# NOTE(review): $bcount is presumably 257 for 2 OSTs and 513 otherwise,
# matching the block counts listed above -- confirm where it is assigned.
3016 dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
3017 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
3018 dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount
# Remember the FIDs: the re-created orphans are looked up further down as
# <fid>-R-0 under .lustre/lost+found/MDT0000.
3020 local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
3021 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
3022 local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)
3025 $LFS getstripe $DIR/$tdir/a1/f0
3027 $LFS getstripe $DIR/$tdir/a1/f1
3029 $LFS getstripe $DIR/$tdir/a1/f2
# A fourth file is only used when there is a third stripe to lose.
3031 if [ $OSTCOUNT -gt 2 ]; then
3032 dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
3033 fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
3035 $LFS getstripe $DIR/$tdir/a1/f3
3038 cancel_lru_locks osc
# Each file is removed while a different fail_loc/fail_val is set on mds1;
# fail_val selects the stripe index that is lost together with the
# MDT-object, as the echo messages state.
3040 echo "Inject failure..."
3041 echo "To simulate f0 lost MDT-object"
3042 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
3043 do_facet mds1 $LCTL set_param fail_loc=0x1616
3044 rm -f $DIR/$tdir/a1/f0
3046 echo "To simulate f1 lost MDT-object and OST-object0"
3047 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
3048 do_facet mds1 $LCTL set_param fail_loc=0x161a
3049 rm -f $DIR/$tdir/a1/f1
3051 echo "To simulate f2 lost MDT-object and OST-object1"
3052 do_facet mds1 $LCTL set_param fail_val=1
3053 rm -f $DIR/$tdir/a1/f2
3055 if [ $OSTCOUNT -gt 2 ]; then
3056 echo "To simulate f3 lost MDT-object and OST-object2"
3057 do_facet mds1 $LCTL set_param fail_val=2
3058 rm -f $DIR/$tdir/a1/f3
# The client is unmounted here and mounted again after LFSCK (step 5.0).
3061 umount_client $MOUNT
3064 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
3066 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3067 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
3069 for k in $(seq $MDSCOUNT); do
3070 # The LFSCK status query interval is 30 seconds. For the case
3071 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3072 # time to guarantee the status sync up.
3073 wait_update_facet mds${k} "$LCTL get_param -n \
3074 mdd.$(facet_svc mds${k}).lfsck_layout |
3075 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3076 error "(2) MDS${k} is not the expected 'completed'"
# The OSTs are checked once, without waiting: they must already report
# "completed".
3079 for k in $(seq $OSTCOUNT); do
3080 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3081 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3082 awk '/^status/ { print $2 }')
3083 [ "$cur_status" == "completed" ] ||
3084 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# repaired_orphan counter reported by mds1: 9 expected with more than 2
# OSTs, 4 otherwise.
3087 local repaired=$(do_facet mds1 $LCTL get_param -n \
3088 mdd.$(facet_svc mds1).lfsck_layout |
3089 awk '/^repaired_orphan/ { print $2 }')
3090 if [ $OSTCOUNT -gt 2 ]; then
3091 [ $repaired -eq 9 ] ||
3092 error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
3094 [ $repaired -eq 4 ] ||
3095 error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
3098 mount_client $MOUNT || error "(5.0) Fail to start client!"
# NOTE(review): LOV_PATTERN_F_HOLE is assigned here but not referenced in
# the remainder of this test as shown.
3100 LOV_PATTERN_F_HOLE=0x40000000
3103 # ${fid0}-R-0 is the old f0
3105 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
3106 echo "Check $name, which is the old f0"
3108 $LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"
# f0 lost no OST-object, so its layout must not carry the hole pattern and
# every operation below must succeed.
3110 local pattern=$($LFS getstripe -L $name)
3111 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3112 error "(5.2) NOT expect pattern flag hole, but got $pattern"
3114 local stripes=$($LFS getstripe -c $name)
3115 if [ $OSTCOUNT -gt 2 ]; then
3116 [ $stripes -eq 3 ] ||
3117 error "(5.3.1) expect the stripe count is 3, but got $stripes"
3119 [ $stripes -eq 2 ] ||
3120 error "(5.3.2) expect the stripe count is 2, but got $stripes"
3123 local size=$(stat $name | awk '/Size:/ { print $2 }')
3124 [ $size -eq $((4096 * $bcount)) ] ||
3125 error "(5.4) expect the size $((4096 * $bcount)), but got $size"
3127 cat $name > /dev/null || error "(5.5) cannot read $name"
3129 echo "dummy" >> $name || error "(5.6) cannot write $name"
3131 chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"
3133 touch $name || error "(5.8) cannot touch $name"
3135 rm -f $name || error "(5.9) cannot unlink $name"
3138 # ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
3140 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3141 if [ $OSTCOUNT -gt 2 ]; then
3142 echo "Check $name, it contains the old f1's stripe1 and stripe2"
3144 echo "Check $name, it contains the old f1's stripe1"
3147 $LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"
3149 pattern=$($LFS getstripe -L $name)
3150 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3151 error "(6.2) expect pattern flag hole, but got $pattern"
3153 stripes=$($LFS getstripe -c $name)
3154 if [ $OSTCOUNT -gt 2 ]; then
3155 [ $stripes -eq 3 ] ||
3156 error "(6.3.1) expect the stripe count is 3, but got $stripes"
3158 [ $stripes -eq 2 ] ||
3159 error "(6.3.2) expect the stripe count is 2, but got $stripes"
3162 size=$(stat $name | awk '/Size:/ { print $2 }')
3163 [ $size -eq $((4096 * $bcount)) ] ||
3164 error "(6.4) expect the size $((4096 * $bcount)), but got $size"
3166 cat $name > /dev/null && error "(6.5) normal read $name should fail"
# Copy the file block by block, continuing past read errors, and count the
# "Input/output error" messages dd prints: one per unreadable 4096-byte
# block of the missing stripe.
3168 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3169 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3172 [ $failures -eq 256 ] ||
3173 error "(6.6) expect 256 IO failures, but get $failures"
3175 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3176 [ $size -eq $((4096 * $bcount)) ] ||
3177 error "(6.7) expect the size $((4096 * $bcount)), but got $size"
# Block 0 lies in the lost stripe0 (write must fail); block 300 lies in a
# surviving stripe (write must succeed).
3179 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
3180 error "(6.8) write to the LOV EA hole should fail"
3182 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
3183 error "(6.9) write to normal stripe should NOT fail"
3185 echo "foo" >> $name && error "(6.10) append write $name should fail"
3187 chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"
3189 touch $name || error "(6.12) cannot touch $name"
3191 rm -f $name || error "(6.13) cannot unlink $name"
3194 # ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
3196 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
3197 if [ $OSTCOUNT -gt 2 ]; then
3198 echo "Check $name, it contains the old f2's stripe0 and stripe2"
3200 echo "Check $name, it contains the old f2's stripe0"
3203 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
3205 pattern=$($LFS getstripe -L $name)
3206 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3207 error "(7.2) expect pattern flag hole, but got $pattern"
3209 stripes=$($LFS getstripe -c $name)
3210 size=$(stat $name | awk '/Size:/ { print $2 }')
# The expectations for f2 differ by OST count: the "*.1"/"7.6"/"7.7"/"7.8.x"
# checks apply with more than 2 OSTs, the "*.2" checks with exactly 2.
3211 if [ $OSTCOUNT -gt 2 ]; then
3212 [ $stripes -eq 3 ] ||
3213 error "(7.3.1) expect the stripe count is 3, but got $stripes"
3215 [ $size -eq $((4096 * $bcount)) ] ||
3216 error "(7.4.1) expect size $((4096 * $bcount)), but got $size"
3218 cat $name > /dev/null &&
3219 error "(7.5.1) normal read $name should fail"
3221 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3222 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3224 [ $failures -eq 256 ] ||
3225 error "(7.6) expect 256 IO failures, but get $failures"
3227 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3228 [ $size -eq $((4096 * $bcount)) ] ||
3229 error "(7.7) expect the size $((4096 * $bcount)), but got $size"
3231 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3232 seek=300 && error "(7.8.0) write to the LOV EA hole should fail"
3234 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
3235 error "(7.8.1) write to normal stripe should NOT fail"
3237 echo "foo" >> $name &&
3238 error "(7.8.3) append write $name should fail"
3240 chown $RUNAS_ID:$RUNAS_GID $name ||
3241 error "(7.9.1) cannot chown on $name"
3243 touch $name || error "(7.10.1) cannot touch $name"
3245 [ $stripes -eq 2 ] ||
3246 error "(7.3.2) expect the stripe count is 2, but got $stripes"
3249 [ $size -eq $((4096 * (256 + 0))) ] ||
3250 error "(7.4.2) expect the size $((4096 * 256)), but got $size"
3252 cat $name > /dev/null &&
3253 error "(7.5.2) normal read $name should fail"
3255 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3256 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3257 [ $failures -eq 256 ] ||
3258 error "(7.6.2) expect 256 IO failures, but get $failures"
3261 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3262 [ $size -eq $((4096 * $bcount)) ] ||
3263 error "(7.7.2) expect the size $((4096 * $bcount)), got $size"
3265 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3266 seek=256 && error "(7.8.2) write to the LOV EA hole should fail"
3268 chown $RUNAS_ID:$RUNAS_GID $name ||
3269 error "(7.9.2) cannot chown on $name"
3271 touch $name || error "(7.10.2) cannot touch $name"
3274 rm -f $name || error "(7.11) cannot unlink $name"
# f3 only exists with more than 2 OSTs; nothing left to check otherwise.
3276 [ $OSTCOUNT -le 2 ] && return
3279 # ${fid3}-R-0 should contain the old f3's stripe0 and stripe1
3281 name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
3282 echo "Check $name, which contains the old f3's stripe0 and stripe1"
3284 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
3286 pattern=$($LFS getstripe -L $name)
3287 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3288 error "(8.2) expect pattern flag hole, but got $pattern"
3290 stripes=$($LFS getstripe -c $name)
3291 [ $stripes -eq 3 ] ||
3292 error "(8.3) expect the stripe count is 3, but got $stripes"
3294 size=$(stat $name | awk '/Size:/ { print $2 }')
3296 [ $size -eq $((4096 * (256 + 256 + 0))) ] ||
3297 error "(8.4) expect the size $((4096 * 512)), but got $size"
3299 cat $name > /dev/null &&
3300 error "(8.5) normal read $name should fail"
3302 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3303 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3305 [ $failures -eq 256 ] ||
3306 error "(8.6) expect 256 IO failures, but get $failures"
3309 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3310 [ $size -eq $((4096 * $bcount)) ] ||
3311 error "(8.7) expect the size $((4096 * $bcount)), but got $size"
3313 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3314 seek=512 && error "(8.8) write to the LOV EA hole should fail"
3316 chown $RUNAS_ID:$RUNAS_GID $name ||
3317 error "(8.9) cannot chown on $name"
3319 touch $name || error "(8.10) cannot touch $name"
3321 rm -f $name || error "(8.11) cannot unlink $name"
3323 run_test 20a "Handle the orphan with dummy LOV EA slot properly"
# Test 20b body (registered by run_test below): PFL variant of test 20a.
# Three files with two components each ([0, 2M) and [2M, EOF), 2 stripes of
# 1M per component) lose their MDT-object; f1 and f2 additionally lose one
# OST-object per component.  After layout LFSCK the re-created files under
# .lustre/lost+found/MDT0000/<fid>-R-0 are checked component by component.
# Failure messages carry a "(step.substep)" label; each label is unique and
# the numbers quoted in a message match the value its check requires.
3326 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
3329 echo "The target MDT-object and some of its OST-object are lost."
3330 echo "The LFSCK should find out the left OST-objects and re-create"
3331 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
3332 echo "with the partial OST-objects (LOV EA hole)."
3334 echo "New client can access the file with LOV EA hole via normal"
3335 echo "system tools or commands without crash the system - PFL case."
3338 check_mount_and_prep
3340 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f0 ||
3341 error "(0) Fail to create PFL file $DIR/$tdir/f0"
3342 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f1 ||
3343 error "(1) Fail to create PFL file $DIR/$tdir/f1"
3344 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f2 ||
3345 error "(2) Fail to create PFL file $DIR/$tdir/f2"
# 769 blocks of 4096 bytes: 3M plus one block, so both components of each
# file receive data.
3347 local bcount=$((256 * 3 + 1))
3349 dd if=/dev/zero of=$DIR/$tdir/f0 bs=4096 count=$bcount
3350 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4096 count=$bcount
3351 dd if=/dev/zero of=$DIR/$tdir/f2 bs=4096 count=$bcount
# Remember the FIDs: the re-created orphans are looked up further down as
# <fid>-R-0 under .lustre/lost+found/MDT0000.
3353 local fid0=$($LFS path2fid $DIR/$tdir/f0)
3354 local fid1=$($LFS path2fid $DIR/$tdir/f1)
3355 local fid2=$($LFS path2fid $DIR/$tdir/f2)
3358 $LFS getstripe $DIR/$tdir/f0
3360 $LFS getstripe $DIR/$tdir/f1
3362 $LFS getstripe $DIR/$tdir/f2
3364 cancel_lru_locks mdc
3365 cancel_lru_locks osc
3367 echo "Inject failure..."
3368 echo "To simulate f0 lost MDT-object"
3369 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
3370 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1616
3373 echo "To simulate the case of f1 lost MDT-object and "
3374 echo "the first OST-object in each PFL component"
3375 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
3376 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161a
3379 echo "To simulate the case of f2 lost MDT-object and "
3380 echo "the second OST-object in each PFL component"
3381 do_facet $SINGLEMDS $LCTL set_param fail_val=1
3386 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
3388 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3389 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
3391 for k in $(seq $MDSCOUNT); do
3392 # The LFSCK status query interval is 30 seconds. For the case
3393 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3394 # time to guarantee the status sync up.
3395 wait_update_facet mds${k} "$LCTL get_param -n \
3396 mdd.$(facet_svc mds${k}).lfsck_layout |
3397 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3398 error "(4) MDS${k} is not the expected 'completed'"
# The OSTs are checked once, without waiting: they must already report
# "completed".
3401 for k in $(seq $OSTCOUNT); do
3402 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3403 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3404 awk '/^status/ { print $2 }')
3405 [ "$cur_status" == "completed" ] ||
3406 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# repaired_orphan counter reported by mds1.
3409 local repaired=$(do_facet mds1 $LCTL get_param -n \
3410 mdd.$(facet_svc mds1).lfsck_layout |
3411 awk '/^repaired_orphan/ { print $2 }')
3412 [ $repaired -eq 8 ] ||
3413 error "(6) Expect 8 fixed on mds1, but got: $repaired"
3416 # ${fid0}-R-0 is the old f0
3418 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
3419 echo "Check $name, which is the old f0"
3421 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
# -I1 / -I2 select the first / second PFL component of the file.
3423 local pattern=$($LFS getstripe -L -I1 $name)
3424 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3425 error "(7.2.1) NOT expect pattern flag hole, but got $pattern"
3427 pattern=$($LFS getstripe -L -I2 $name)
3428 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3429 error "(7.2.2) NOT expect pattern flag hole, but got $pattern"
3431 local stripes=$($LFS getstripe -c -I1 $name)
3432 [ $stripes -eq 2 ] ||
3433 error "(7.3.1) expect 2 stripes, but got $stripes"
3435 stripes=$($LFS getstripe -c -I2 $name)
3436 [ $stripes -eq 2 ] ||
3437 error "(7.3.2) expect 2 stripes, but got $stripes"
# The component extents must match the layout requested at creation time:
# [0, 2097152) and [2097152, EOF).
3439 local e_start=$($LFS getstripe -I1 $name |
3440 awk '/lcme_extent.e_start:/ { print $2 }')
3441 [ $e_start -eq 0 ] ||
3442 error "(7.4.1) expect the COMP1 start at 0, got $e_start"
3444 local e_end=$($LFS getstripe -I1 $name |
3445 awk '/lcme_extent.e_end:/ { print $2 }')
3446 [ $e_end -eq 2097152 ] ||
3447 error "(7.4.2) expect the COMP1 end at 2097152, got $e_end"
3449 e_start=$($LFS getstripe -I2 $name |
3450 awk '/lcme_extent.e_start:/ { print $2 }')
3451 [ $e_start -eq 2097152 ] ||
3452 error "(7.5.1) expect the COMP2 start at 2097152, got $e_start"
3454 e_end=$($LFS getstripe -I2 $name |
3455 awk '/lcme_extent.e_end:/ { print $2 }')
3456 [ "$e_end" = "EOF" ] ||
3457 error "(7.5.2) expect the COMP2 end at (EOF), got $e_end"
3459 local size=$(stat $name | awk '/Size:/ { print $2 }')
3460 [ $size -eq $((4096 * $bcount)) ] ||
3461 error "(7.6) expect the size $((4096 * $bcount)), but got $size"
3463 cat $name > /dev/null || error "(7.7) cannot read $name"
3465 echo "dummy" >> $name || error "(7.8) cannot write $name"
3467 chown $RUNAS_ID:$RUNAS_GID $name || error "(7.9) cannot chown on $name"
3469 touch $name || error "(7.10) cannot touch $name"
3471 rm -f $name || error "(7.11) cannot unlink $name"
3474 # ${fid1}-R-0 contains the old f1's second stripe in each COMP
3476 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3477 echo "Check $name, it contains f1's second OST-object in each COMP"
3479 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
3481 pattern=$($LFS getstripe -L -I1 $name)
3482 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3483 error "(8.2.1) expect pattern flag hole, but got $pattern"
3485 pattern=$($LFS getstripe -L -I2 $name)
3486 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3487 error "(8.2.2) expect pattern flag hole, but got $pattern"
3489 stripes=$($LFS getstripe -c -I1 $name)
3490 [ $stripes -eq 2 ] ||
3491 error "(8.3.1) expect 2 stripes, but got $stripes"
3493 stripes=$($LFS getstripe -c -I2 $name)
3494 [ $stripes -eq 2 ] ||
3495 error "(8.3.2) expect 2 stripes, but got $stripes"
3497 e_start=$($LFS getstripe -I1 $name |
3498 awk '/lcme_extent.e_start:/ { print $2 }')
3499 [ $e_start -eq 0 ] ||
3500 error "(8.4.1) expect the COMP1 start at 0, got $e_start"
3502 e_end=$($LFS getstripe -I1 $name |
3503 awk '/lcme_extent.e_end:/ { print $2 }')
3504 [ $e_end -eq 2097152 ] ||
3505 error "(8.4.2) expect the COMP1 end at 2097152, got $e_end"
3507 e_start=$($LFS getstripe -I2 $name |
3508 awk '/lcme_extent.e_start:/ { print $2 }')
3509 [ $e_start -eq 2097152 ] ||
3510 error "(8.5.1) expect the COMP2 start at 2097152, got $e_start"
3512 e_end=$($LFS getstripe -I2 $name |
3513 awk '/lcme_extent.e_end:/ { print $2 }')
3514 [ "$e_end" = "EOF" ] ||
3515 error "(8.5.2) expect the COMP2 end at (EOF), got $e_end"
3517 size=$(stat $name | awk '/Size:/ { print $2 }')
3518 [ $size -eq $((4096 * $bcount)) ] ||
3519 error "(8.6) expect the size $((4096 * $bcount)), but got $size"
3521 cat $name > /dev/null && error "(8.7) normal read $name should fail"
# Copy the file block by block, continuing past read errors, and count the
# "Input/output error" messages dd prints.
3523 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3524 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3526 # The first stripe in each COMP was lost
3527 [ $failures -eq 512 ] ||
3528 error "(8.8) expect 512 IO failures, but get $failures"
3530 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3531 [ $size -eq $((4096 * $bcount)) ] ||
3532 error "(8.9) expect the size $((4096 * $bcount)), but got $size"
# Block 0 lies in a lost stripe (write must fail); block 300 lies in a
# surviving stripe (write must succeed).
3534 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
3535 error "(8.10) write to the LOV EA hole should fail"
3537 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
3538 error "(8.11) write to normal stripe should NOT fail"
3540 echo "foo" >> $name && error "(8.12) append write $name should fail"
3542 chown $RUNAS_ID:$RUNAS_GID $name || error "(8.13) cannot chown on $name"
3544 touch $name || error "(8.14) cannot touch $name"
3546 rm -f $name || error "(8.15) cannot unlink $name"
3549 # ${fid2}-R-0 contains the old f2's first stripe in each COMP
3551 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
3552 echo "Check $name, it contains f2's first stripe in each COMP"
3554 $LFS getstripe -v $name || error "(9.1) cannot getstripe on $name"
3556 pattern=$($LFS getstripe -L -I1 $name)
3557 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3558 error "(9.2.1) expect pattern flag hole, but got $pattern"
3560 pattern=$($LFS getstripe -L -I2 $name)
3561 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3562 error "(9.2.2) expect pattern flag hole, but got $pattern"
3564 stripes=$($LFS getstripe -c -I1 $name)
3565 [ $stripes -eq 2 ] ||
3566 error "(9.3.1) expect 2 stripes, but got $stripes"
3568 stripes=$($LFS getstripe -c -I2 $name)
3569 [ $stripes -eq 2 ] ||
3570 error "(9.3.2) expect 2 stripes, but got $stripes"
3572 e_start=$($LFS getstripe -I1 $name |
3573 awk '/lcme_extent.e_start:/ { print $2 }')
3574 [ $e_start -eq 0 ] ||
3575 error "(9.4.1) expect the COMP1 start at 0, got $e_start"
3577 e_end=$($LFS getstripe -I1 $name |
3578 awk '/lcme_extent.e_end:/ { print $2 }')
3579 [ $e_end -eq 2097152 ] ||
3580 error "(9.4.2) expect the COMP1 end at 2097152, got $e_end"
3582 e_start=$($LFS getstripe -I2 $name |
3583 awk '/lcme_extent.e_start:/ { print $2 }')
3584 [ $e_start -eq 2097152 ] ||
3585 error "(9.5.1) expect the COMP2 start at 2097152, got $e_start"
3587 e_end=$($LFS getstripe -I2 $name |
3588 awk '/lcme_extent.e_end:/ { print $2 }')
3589 [ "$e_end" = "EOF" ] ||
3590 error "(9.5.2) expect the COMP2 end at (EOF), got $e_end"
3592 size=$(stat $name | awk '/Size:/ { print $2 }')
3593 # The second stripe in COMP was lost, so we do not know there
3594 # have ever been some data before. 'stat' will regard it as
3595 # no data on the lost stripe.
# NOTE(review): the check below nevertheless expects the full original
# size -- confirm whether the comment above or the expectation is stale.
3597 [ $size -eq $((4096 * $bcount)) ] ||
3598 error "(9.6) expect size $((4096 * $bcount)), but got $size"
3600 cat $name > /dev/null &&
3601 error "(9.7) normal read $name should fail"
3603 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3604 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3605 [ $failures -eq 512 ] ||
3606 error "(9.8) expect 512 IO failures, but get $failures"
3608 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3609 # The second stripe in COMP was lost, so we do not know there
3610 # have ever been some data before. Since 'dd' skip failure,
3611 # it will regard the lost stripe contains data.
3613 [ $size -eq $((4096 * $bcount)) ] ||
3614 error "(9.9) expect the size $((4096 * $bcount)), but got $size"
# Here the roles are swapped: block 300 lies in a lost stripe, block 0 in
# a surviving one.
3616 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3617 seek=300 && error "(9.10) write to the LOV EA hole should fail"
3619 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
3620 error "(9.11) write to normal stripe should NOT fail"
3622 echo "foo" >> $name &&
3623 error "(9.12) append write $name should fail"
3625 chown $RUNAS_ID:$RUNAS_GID $name ||
3626 error "(9.13) cannot chown on $name"
3628 touch $name || error "(9.14) cannot touch $name"
3630 rm -f $name || error "(9.15) cannot unlink $name"
3632 run_test 20b "Handle the orphan with dummy LOV EA slot properly - PFL case"
# Test 21 body (registered by run_test below): lfsck_start without a "-t"
# type option must start both the namespace and the layout component; both
# are expected to report "scanning-phase1" right after the start.
3635 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.5.59) ]] &&
3636 skip "ignore the test if MDS is older than 2.5.59" && return
3638 check_mount_and_prep
3639 createmany -o $DIR/$tdir/f 100 || error "(0) Fail to create 100 files"
3641 echo "Start all LFSCK components by default (-s 1)"
# NOTE(review): "-s 1" presumably throttles the scan so that it is still in
# phase 1 when the status is sampled below -- confirm.
3642 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -s 1 -r ||
3643 error "Fail to start LFSCK"
3645 echo "namespace LFSCK should be in 'scanning-phase1' status"
3646 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
3647 [ "$STATUS" == "scanning-phase1" ] ||
3648 error "Expect namespace 'scanning-phase1', but got '$STATUS'"
3650 echo "layout LFSCK should be in 'scanning-phase1' status"
3651 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
3652 [ "$STATUS" == "scanning-phase1" ] ||
3653 error "Expect layout 'scanning-phase1', but got '$STATUS'"
3655 echo "Stop all LFSCK components by default"
# Stopping without a type option as well; only the command's exit status is
# checked.
3656 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 ||
3657 error "Fail to stop LFSCK"
3659 run_test 21 "run all LFSCK components by default"
# Test 22a body (registered by run_test below): a directory is created on
# MDT0 while a fault injection makes its ".." entry reference another
# directory ("guard"), which is removed afterwards.  Namespace LFSCK must
# report exactly one repaired unmatched pair and "ls" must work afterwards.
3662 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# NOTE(review): in the two echo lines containing "".." the shell sees an
# empty string followed by unquoted .. and a new quoted string, so the
# output shows .. without any quote characters.
3665 echo "The parent_A references the child directory via some name entry,"
3666 echo "but the child directory back references another parent_B via its"
3667 echo "".." name entry. The parent_B does not exist. Then the namespace"
3668 echo "LFSCK will repair the child directory's ".." name entry."
3671 check_mount_and_prep
3673 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
3674 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
3676 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
3677 echo "The dummy's dotdot name entry references the guard."
# The fail_loc is only active while "dummy" is created.
3678 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
3679 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
3680 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
3681 error "(3) Fail to mkdir on MDT0"
3682 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Removing "guard" leaves dummy's ".." pointing at a non-existent parent.
3684 rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"
3686 echo "Trigger namespace LFSCK to repair unmatched pairs"
3687 $START_NAMESPACE -A -r ||
3688 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): the trailing 6 is presumably the step number that
# wait_all_targets_blocked uses in its own error message -- confirm.
3690 wait_all_targets_blocked namespace completed 6
3692 local repaired=$($SHOW_NAMESPACE |
3693 awk '/^unmatched_pairs_repaired/ { print $2 }')
3694 [ $repaired -eq 1 ] ||
3695 error "(7) Fail to repair unmatched pairs: $repaired"
3697 echo "'ls' should success after namespace LFSCK repairing"
3698 ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
3699 error "(8) ls should success."
3701 run_test 22a "LFSCK can repair unmatched pairs (1)"
# Test 22b body (registered by run_test below): like 22a, but the wrongly
# referenced parent ("guard") is kept.  fid2path on the child must not
# return the real path before LFSCK and must return it afterwards; LFSCK
# must report exactly one repaired unmatched pair.
3704 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3707 echo "The parent_A references the child directory via the name entry_B,"
3708 echo "but the child directory back references another parent_C via its"
3709 echo "".." name entry. The parent_C exists, but there is no the name"
3710 echo "entry_B under the parent_C. Then the namespace LFSCK will repair"
3711 echo "the child directory's ".." name entry and its linkEA."
3714 check_mount_and_prep
3716 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
3717 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
3719 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
3720 echo "and bad linkEA. The dummy's dotdot name entry references the"
3721 echo "guard. The dummy's linkEA references n non-exist name entry."
# The fail_loc is only active while "dummy" is created.
3722 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
3723 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
3724 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
3725 error "(3) Fail to mkdir on MDT0"
3726 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Keep the FID so the same object can be resolved again after LFSCK.
3728 local dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
3729 echo "fid2path should NOT work on the dummy's FID $dummyfid"
3730 local dummyname=$($LFS fid2path $DIR $dummyfid)
3731 [ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
3732 error "(4) fid2path works unexpectedly."
3734 echo "Trigger namespace LFSCK to repair unmatched pairs"
3735 $START_NAMESPACE -A -r ||
3736 error "(5) Fail to start LFSCK for namespace"
3738 wait_all_targets_blocked namespace completed 6
3740 local repaired=$($SHOW_NAMESPACE |
3741 awk '/^unmatched_pairs_repaired/ { print $2 }')
3742 [ $repaired -eq 1 ] ||
3743 error "(7) Fail to repair unmatched pairs: $repaired"
3745 echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
# dummyname is declared with "local" a second time; this simply reassigns
# the variable declared above.
3746 local dummyname=$($LFS fid2path $DIR $dummyfid)
3747 [ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
3748 error "(8) fid2path does not work"
3750 run_test 22b "LFSCK can repair unmatched pairs (2)"
# Test 23a body (registered by run_test below): a remote directory is
# removed while a fault injection is active on mds2, leaving a dangling
# name entry.  A first namespace LFSCK run must count the dangling entry
# while "ls" keeps failing; a second run with "-C" must make "ls" succeed.
3753 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3756 echo "The name entry is there, but the MDT-object for such name "
3757 echo "entry does not exist. The namespace LFSCK should find out "
3758 echo "and repair the inconsistency as required."
3761 check_mount_and_prep
# d0 lives on MDT0, its child d1 on MDT1.
3763 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3764 $LFS mkdir -i 1 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT1"
3766 echo "Inject failure stub on MDT1 to simulate dangling name entry"
3767 #define OBD_FAIL_LFSCK_DANGLING2 0x1620
3768 do_facet mds2 $LCTL set_param fail_loc=0x1620
3769 rmdir $DIR/$tdir/d0/d1 || error "(3) Fail to rmdir d1"
3770 do_facet mds2 $LCTL set_param fail_loc=0
3772 echo "'ls' should fail because of dangling name entry"
3773 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(4) ls should fail."
3775 echo "Trigger namespace LFSCK to find out dangling name entry"
3776 $START_NAMESPACE -A -r ||
3777 error "(5) Fail to start LFSCK for namespace"
3779 wait_all_targets_blocked namespace completed 6
3781 local repaired=$($SHOW_NAMESPACE |
3782 awk '/^dangling_repaired/ { print $2 }')
3783 [ $repaired -eq 1 ] ||
3784 error "(7) Fail to repair dangling name entry: $repaired"
3786 echo "'ls' should fail because not re-create MDT-object by default"
3787 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(8) ls should fail."
3789 echo "Trigger namespace LFSCK again to repair dangling name entry"
# Second run: same options plus "-C"; per the messages around it, this is
# the run that re-creates the missing MDT-object.
3790 $START_NAMESPACE -A -r -C ||
3791 error "(9) Fail to start LFSCK for namespace"
3793 wait_all_targets_blocked namespace completed 10
3795 repaired=$($SHOW_NAMESPACE |
3796 awk '/^dangling_repaired/ { print $2 }')
3797 [ $repaired -eq 1 ] ||
3798 error "(11) Fail to repair dangling name entry: $repaired"
3800 echo "'ls' should success after namespace LFSCK repairing"
3801 ls -ail $DIR/$tdir/d0/d1 > /dev/null || error "(12) ls should success."
3803 run_test 23a "LFSCK can repair dangling name entry (1)"
# Test 23b body (registered by run_test below): a hard link to f0 is created
# while a fault injection (fail_val = object id of f1) is active, then f1 is
# removed so the new name entry dangles.  Namespace LFSCK with "-C" must
# report one dangling repair and one multiple-linked repair, and reading
# the link must return f0's content ("dummy").
3807 echo "The objectA has multiple hard links, one of them corresponding"
3808 echo "to the name entry_B. But there is something wrong for the name"
3809 echo "entry_B and cause entry_B to references non-exist object_C."
3810 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3811 echo "as dangling, and re-create the lost object_C. When the LFSCK"
3812 echo "comes to the second-stage scanning, it will find that the"
3813 echo "former re-creating object_C is not proper, and will try to"
3814 echo "replace the object_C with the real object_A."
3817 check_mount_and_prep
3819 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3820 $LFS path2fid $DIR/$tdir/d0
# Extra files t0..t9; they are unlinked again before f1 is removed (see the
# LU-7429 comment further down).
3822 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
3824 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3825 $LFS path2fid $DIR/$tdir/d0/f0
3827 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
3828 $LFS path2fid $DIR/$tdir/d0/f1
# A FID is printed as colon-separated fields; field 1 is taken as the
# sequence and field 2 (below) as the object id.
3830 local SEQ0=$($LFS path2fid $DIR/$tdir/d0/f0 | awk -F':' '{print $1}')
3831 local SEQ1=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $1}')
3833 if [ "$SEQ0" != "$SEQ1" ]; then
3834 # To guarantee that the f0 and f1 are in the same FID seq
3835 rm -f $DIR/$tdir/d0/f0 ||
3836 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
3837 echo "dummy" > $DIR/$tdir/d0/f0 ||
3838 error "(3.2) Fail to touch on MDT0"
3839 $LFS path2fid $DIR/$tdir/d0/f0
3842 local OID=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $2}')
# Convert the object id to decimal before passing it as fail_val.
3843 OID=$(printf %d $OID)
3845 echo "Inject failure stub on MDT0 to simulate dangling name entry"
3846 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
3847 do_facet $SINGLEMDS $LCTL set_param fail_val=$OID fail_loc=0x1621
3848 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
3849 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3851 # If there is creation after the dangling injection, it may re-use
3852 # the just released local object (inode) that is referenced by the
3853 # dangling name entry. It will fail the dangling injection.
3854 # So before deleting the target object for the dangling name entry,
3855 # remove some other objects to avoid the target object being reused
3856 # by some potential creations. LU-7429
3857 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
3859 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
3861 echo "'ls' should fail because of dangling name entry"
3862 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3863 error "(6) ls should fail."
3865 echo "Trigger namespace LFSCK to find out dangling name entry"
3866 $START_NAMESPACE -r -C ||
3867 error "(7) Fail to start LFSCK for namespace"
# Wait until the namespace LFSCK on $SINGLEMDS reports "completed".
3869 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3870 mdd.${MDT_DEV}.lfsck_namespace |
3871 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3873 error "(8) unexpected status"
3876 local repaired=$($SHOW_NAMESPACE |
3877 awk '/^dangling_repaired/ { print $2 }')
3878 [ $repaired -eq 1 ] ||
3879 error "(9) Fail to repair dangling name entry: $repaired"
3881 repaired=$($SHOW_NAMESPACE |
3882 awk '/^multiple_linked_repaired/ { print $2 }')
3883 [ $repaired -eq 1 ] ||
3884 error "(10) Fail to drop the former created object: $repaired"
# The link must now resolve to f0's data.
3886 local data=$(cat $DIR/$tdir/d0/foo)
3887 [ "$data" == "dummy" ] ||
3888 error "(11) The $DIR/$tdir/d0/foo is not recovered: $data"
3890 run_test 23b "LFSCK can repair dangling name entry (2)"
# test_23c: namespace LFSCK repair of a dangling name entry, case (3).
#
# Same injection as the previous case (fail_loc 0x1621 while hard-linking
# f0 as "foo", then unlinking f1), but the LFSCK is started with fail_loc
# 0x1602 / fail_val=10 armed. Once "foo" stats with Size 0 the test appends
# "data" to it, clears fail_loc, waits for status "completed", and expects
# dangling_repaired == 1 while "foo" must NOT read back "dummy".
# Full debug logging is enabled for the duration of the scenario.
test_23c() {
	printf '%s\n' \
		"The objectA has multiple hard links, one of them corresponding" \
		"to the name entry_B. But there is something wrong for the name" \
		"entry_B and cause entry_B to references non-exist object_C." \
		"In the first-stage scanning, the LFSCK will think the entry_B" \
		"as dangling, and re-create the lost object_C. And then others" \
		"modified the re-created object_C. When the LFSCK comes to the" \
		"second-stage scanning, it will find that the former re-creating" \
		"object_C maybe wrong and try to replace the object_C with the" \
		"real object_A. But because object_C has been modified, so the" \
		"LFSCK cannot replace it."

	start_full_debug_logging

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 ||
		error "(1) Fail to mkdir d0 on MDT0"
	$LFS path2fid $DIR/$tdir/d0

	createmany -o $DIR/$tdir/d0/t 10 ||
		error "(1.5) Fail to creatmany"

	echo "dummy" > $DIR/$tdir/d0/f0 ||
		error "(2) Fail to touch on MDT0"
	$LFS path2fid $DIR/$tdir/d0/f0

	echo "dead" > $DIR/$tdir/d0/f1 ||
		error "(3) Fail to touch on MDT0"
	$LFS path2fid $DIR/$tdir/d0/f1

	# The FID text is split on ':'; field 1 is compared here, field 2 is
	# used for fail_val further down.
	local seq_f0
	local seq_f1
	seq_f0=$($LFS path2fid $DIR/$tdir/d0/f0 | awk -F':' '{print $1}')
	seq_f1=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $1}')

	if [[ "$seq_f0" != "$seq_f1" ]]; then
		# Re-create f0 so that f0 and f1 share the same FID seq
		rm -f $DIR/$tdir/d0/f0 ||
			error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
		echo "dummy" > $DIR/$tdir/d0/f0 ||
			error "(3.2) Fail to touch on MDT0"
		$LFS path2fid $DIR/$tdir/d0/f0
	fi

	local oid
	oid=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $2}')
	oid=$(printf %d $oid)

	echo "Inject failure stub on MDT0 to simulate dangling name entry"
	#define OBD_FAIL_LFSCK_DANGLING3	0x1621
	do_facet $SINGLEMDS $LCTL set_param fail_val=$oid fail_loc=0x1621
	ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo ||
		error "(4) Fail to hard link"
	do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0

	# LU-7429: a creation that happens after the injection may re-use the
	# local object (inode) released below, which the dangling name entry
	# still references, and that would defeat the injection. Unlink some
	# other objects first so potential creations do not pick the target.
	unlinkmany $DIR/$tdir/d0/t 10 ||
		error "(5.0) Fail to unlinkmany"

	rm -f $DIR/$tdir/d0/f1 ||
		error "(5) Fail to unlink $DIR/$tdir/d0/f1"

	echo "'ls' should fail because of dangling name entry"
	if ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1; then
		error "(6) ls should fail."
	fi

	#define OBD_FAIL_LFSCK_DELAY3		0x1602
	do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602

	echo "Trigger namespace LFSCK to find out dangling name entry"
	$START_NAMESPACE -r -C ||
		error "(7) Fail to start LFSCK for namespace"

	# Wait until "foo" is visible again with Size 0; on timeout show the
	# current stat output before failing.
	wait_update_facet client "stat $DIR/$tdir/d0/foo |
		awk '/Size/ { print \\\$2 }'" "0" $LTIME || {
		stat $DIR/$tdir/d0/foo
		error "(8) unexpected size"
	}

	# Modify the object while fail_loc 0x1602 is still armed.
	echo "data" >> $DIR/$tdir/d0/foo ||
		error "(9) Fail to write"
	cancel_lru_locks osc

	do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 ||
		error "(10) unexpected status"

	stop_full_debug_logging

	local nfixed
	nfixed=$($SHOW_NAMESPACE |
		 awk '/^dangling_repaired/ { print $2 }')
	[ $nfixed -eq 1 ] ||
		error "(11) Fail to repair dangling name entry: $nfixed"

	local content
	content=$(cat $DIR/$tdir/d0/foo)
	[[ "$content" != "dummy" ]] ||
		error "(12) The $DIR/$tdir/d0/foo should not be recovered"
}
run_test 23c "LFSCK can repair dangling name entry (3)"
# test_24: namespace LFSCK repair of a multiple-referenced name entry.
# Needs at least two MDTs.
#
# With fail_loc 0x1622 armed, d0/dummy/foo is created on MDT0 and removed
# again; afterwards the LFSCK must report multiple_referenced_repaired == 1
# and an entry named "<cfid>-<pfid>-D-0" must be listable under
# .lustre/lost+found/MDT0000/. <pfid> is the FID of d0/guard on non-ldiskfs
# backends and of d0/dummy on ldiskfs.
test_24() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return

	echo "Two MDT-objects back reference the same name entry via their"
	echo "each own linkEA entry, but the name entry only references one"
	echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
	echo "for the MDT-object that is not recognized. If such MDT-object"
	echo "has no other linkEA entry after the removing, then the LFSCK"
	echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"

	mkdir $DIR/$tdir/d0/guard || error "(1) Fail to mkdir guard"
	$LFS path2fid $DIR/$tdir/d0/guard

	mkdir $DIR/$tdir/d0/dummy || error "(2) Fail to mkdir dummy"
	$LFS path2fid $DIR/$tdir/d0/dummy

	# Keep pfid function-local so it cannot leak into (or clobber) the
	# global scope shared with the other tests.
	local pfid
	if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
		pfid=$($LFS path2fid $DIR/$tdir/d0/guard)
	else
		pfid=$($LFS path2fid $DIR/$tdir/d0/dummy)
	fi

	touch $DIR/$tdir/d0/guard/foo ||
		error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
	echo "that references $DIR/$tdir/d0/guard/foo."
	echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
	echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
	echo "there with the same linkEA entry as another MDT-object"
	echo "$DIR/$tdir/d0/guard/foo has"

	#define OBD_FAIL_LFSCK_MUL_REF		0x1622
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
	$LFS mkdir -i 0 $DIR/$tdir/d0/dummy/foo ||
		error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
	$LFS path2fid $DIR/$tdir/d0/dummy/foo
	local cfid=$($LFS path2fid $DIR/$tdir/d0/dummy/foo)
	rmdir $DIR/$tdir/d0/dummy/foo ||
		error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	echo "stat $DIR/$tdir/d0/dummy/foo should fail"
	stat $DIR/$tdir/d0/dummy/foo > /dev/null 2>&1 &&
		error "(6) stat successfully unexpectedly"

	echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
	$START_NAMESPACE -A -r ||
		error "(7) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 8

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^multiple_referenced_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
	error "(9) Fail to repair multiple referenced name entry: $repaired"

	echo "There should be an orphan under .lustre/lost+found/MDT0000/"
	[ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
		error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"

	# Orphan name is built from the child FID and the parent FID chosen
	# above.
	local cname="$cfid-$pfid-D-0"
	ls -ail $MOUNT/.lustre/lost+found/MDT0000/$cname ||
		error "(11) .lustre/lost+found/MDT0000/ should not be empty"
}
run_test 24 "LFSCK can repair multiple-referenced name entry"
# test_25: namespace LFSCK repair of a wrong file type stored in a name
# entry. Skipped unless the MDS backend is ldiskfs.
#
# d0/foo is created while fail_loc 0x1623 is armed; the LFSCK must then
# report bad_file_type_repaired == 1 and d0 must still be listable.
test_25() {
	if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
		skip "ldiskfs only test" && return
	fi

	printf '%s\n' \
		"The file type in the name entry does not match the file type" \
		"claimed by the referenced object. Then the LFSCK will update" \
		"the file type in the name entry."

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 ||
		error "(1) Fail to mkdir d0"

	printf '%s\n' \
		"Inject failure stub on MDT0 to simulate the case that" \
		"the file type stored in the name entry is wrong."

	#define OBD_FAIL_LFSCK_BAD_TYPE		0x1623
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
	touch $DIR/$tdir/d0/foo ||
		error "(2) Fail to touch $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	echo "Trigger namespace LFSCK to repair bad file type in the name entry"
	$START_NAMESPACE -r ||
		error "(3) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 ||
		error "(4) unexpected status"

	local nfixed
	nfixed=$($SHOW_NAMESPACE |
		 awk '/^bad_file_type_repaired/ { print $2 }')
	[ $nfixed -eq 1 ] ||
	error "(5) Fail to repair bad file type in name entry: $nfixed"

	ls -ail $DIR/$tdir/d0 ||
		error "(6) Fail to 'ls' the $DIR/$tdir/d0"
}
run_test 25 "LFSCK can repair bad file type in the name entry"
# test_26a: namespace LFSCK re-adds a lost local name entry.
#
# d0/foo gets a second hard link (d0/dummy); foo is then removed while
# fail_loc 0x1624 is armed. After the LFSCK, lost_dirent_repaired must be 1,
# d0/foo must be listable again, and its FID must equal the FID recorded
# before the removal.
test_26a() {
	echo "The local name entry back referenced by the MDT-object is lost."
	echo "The namespace LFSCK will add the missing local name entry back"
	echo "to the normal namespace."

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
	local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)

	ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rm -f $DIR/$tdir/d0/foo || error "(4) Fail to unlink $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
		error "(5) 'ls' should fail"

	# This test covers the local name entry, so the progress message says
	# "local" (it previously said "remote").
	echo "Trigger namespace LFSCK to repair the missing local name entry"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 7

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(8) Fail to repair lost dirent: $repaired"

	ls -ail $DIR/$tdir/d0/foo ||
		error "(9) Fail to 'ls' $DIR/$tdir/d0/foo"

	# The restored entry must reference the very same object.
	local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
	[ "$foofid" == "$foofid2" ] ||
		error "(10) foo's FID changed: $foofid, $foofid2"
}
run_test 26a "LFSCK can add the missing local name entry back to the namespace"
# test_26b: namespace LFSCK re-adds a lost remote name entry.
# Needs at least two MDTs.
#
# d0 is created with "-i 1" and d0/foo with "-i 0"; foo is removed while
# fail_loc 0x1624 is armed. After the LFSCK, lost_dirent_repaired must be 1,
# d0/foo must be listable again, and its FID must be unchanged.
test_26b() {
	if [ $MDSCOUNT -lt 2 ]; then
		skip "needs >= 2 MDTs" && return
	fi

	printf '%s\n' \
		"The remote name entry back referenced by the MDT-object is lost." \
		"The namespace LFSCK will add the missing remote name entry back" \
		"to the normal namespace."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/d0 ||
		error "(1) Fail to mkdir d0"
	$LFS mkdir -i 0 $DIR/$tdir/d0/foo ||
		error "(2) Fail to mkdir foo"

	local fid_before
	fid_before=$($LFS path2fid $DIR/$tdir/d0/foo)

	printf '%s\n' \
		"Inject failure stub on MDT0 to simulate the case that" \
		"foo's name entry will be removed, but the foo's object" \
		"and its linkEA are kept in the system."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rmdir $DIR/$tdir/d0/foo ||
		error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	if ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1; then
		error "(4) 'ls' should fail"
	fi

	echo "Trigger namespace LFSCK to repair the missing remote name entry"
	$START_NAMESPACE -r -A ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	local nfixed
	nfixed=$($SHOW_NAMESPACE |
		 awk '/^lost_dirent_repaired/ { print $2 }')
	[ $nfixed -eq 1 ] ||
		error "(7) Fail to repair lost dirent: $nfixed"

	ls -ail $DIR/$tdir/d0/foo ||
		error "(8) Fail to 'ls' $DIR/$tdir/d0/foo"

	# The restored entry must reference the very same object.
	local fid_after
	fid_after=$($LFS path2fid $DIR/$tdir/d0/foo)
	[[ "$fid_before" == "$fid_after" ]] ||
		error "(9) foo's FID changed: $fid_before, $fid_after"
}
run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
# test_27a: namespace LFSCK re-creates a lost local parent directory as an
# orphan.
#
# d0/foo and its hard link d0/dummy are removed while fail_loc 0x1624 is
# armed, then d0 itself is removed. After the LFSCK, lost_dirent_repaired
# must be 1 and .lustre/lost+found/MDT0000/ must contain an entry whose
# name matches *-P-*.
test_27a() {
	echo "The local parent referenced by the MDT-object linkEA is lost."
	echo "The namespace LFSCK will re-create the lost parent as orphan."

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
	ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system. And then remove"
	echo "another hard link and the parent directory."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rm -f $DIR/$tdir/d0/foo ||
		error "(4) Fail to unlink $DIR/$tdir/d0/foo"
	rm -f $DIR/$tdir/d0/dummy ||
		error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	rm -rf $DIR/$tdir/d0 || error "(5) Fail to unlink the dir d0"
	ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(6) 'ls' should fail"

	echo "Trigger namespace LFSCK to repair the lost parent"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 7

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(8) Fail to repair lost dirent: $repaired"

	echo "There should be an orphan under .lustre/lost+found/MDT0000/"
	[ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
		error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"

	ls -ail $MOUNT/.lustre/lost+found/MDT0000/

	# The pattern is quoted so that find receives it verbatim; unquoted,
	# the shell would expand it first whenever the current directory
	# holds a matching name. cname is local so it does not leak out of
	# this test.
	local cname
	cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-P-*')
	[ ! -z "$cname" ] ||
		error "(10) .lustre/lost+found/MDT0000/ should not be empty"
}
run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
# test_27b: namespace LFSCK re-creates a lost remote parent directory as an
# orphan. Needs at least two MDTs.
#
# d0 is created with "-i 1" and d0/foo with "-i 0"; foo is removed while
# fail_loc 0x1624 is armed, then d0 is removed. After the LFSCK,
# lost_dirent_repaired must be 1 and .lustre/lost+found/MDT0001/ must
# contain an entry whose name matches *-P-*.
test_27b() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return

	echo "The remote parent referenced by the MDT-object linkEA is lost."
	echo "The namespace LFSCK will re-create the lost parent as orphan."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	$LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"

	$LFS path2fid $DIR/$tdir/d0

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system. And then remove"
	echo "the parent directory."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	rmdir $DIR/$tdir/d0 || error "(4) Fail to unlink the dir d0"
	ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(5) 'ls' should fail"

	echo "Trigger namespace LFSCK to repair the missing remote name entry"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 7

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(8) Fail to repair lost dirent: $repaired"

	ls -ail $MOUNT/.lustre/lost+found/

	echo "There should be an orphan under .lustre/lost+found/MDT0001/"
	[ -d $MOUNT/.lustre/lost+found/MDT0001 ] ||
		error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"

	ls -ail $MOUNT/.lustre/lost+found/MDT0001/

	# The pattern is quoted so that find receives it verbatim; unquoted,
	# the shell would expand it first whenever the current directory
	# holds a matching name. cname is local so it does not leak out of
	# this test.
	local cname
	cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-P-*')
	[ ! -z "$cname" ] ||
		error "(10) .lustre/lost+found/MDT0001/ should not be empty"
}
run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
# test_28: orphan handling is skipped on an MDT that failed during the
# first-stage scan. Needs at least two MDTs.
#
# d1/a1 and d2/a2 are removed while fail_loc 0x1624 is armed on mds1 and
# mds2. The first LFSCK run happens with fail_loc 0x161c set on mds2 and
# must end "partial" on mds1 (0 lost dirents repaired) and "completed" on
# mds2 (1 repaired). A second run without the injection must then repair 1
# on mds1 and 0 on mds2.
test_28() {
	if [ $MDSCOUNT -lt 2 ]; then
		skip "needs >= 2 MDTs" && return
	fi

	printf '%s\n' \
		"The target name entry is lost. The LFSCK should insert the" \
		"orphan MDT-object under .lustre/lost+found/MDTxxxx. But if" \
		"the MDT (on which the orphan MDT-object resides) has ever" \
		"failed to respond some name entry verification during the" \
		"first stage-scanning, then the LFSCK should skip to handle" \
		"orphan MDT-object on this MDT. But other MDTs should not"

	check_mount_and_prep
	$LFS mkdir -i 0 $DIR/$tdir/d1
	$LFS mkdir -i 1 $DIR/$tdir/d1/a1
	$LFS mkdir -i 1 $DIR/$tdir/d1/a2

	$LFS mkdir -i 1 $DIR/$tdir/d2
	$LFS mkdir -i 0 $DIR/$tdir/d2/a1
	$LFS mkdir -i 0 $DIR/$tdir/d2/a2

	printf '%s\n' \
		"Inject failure stub on MDT0 to simulate the case that" \
		"d1/a1's name entry will be removed, but the d1/a1's object" \
		"and its linkEA are kept in the system. And the case that" \
		"d2/a2's name entry will be removed, but the d2/a2's object" \
		"and its linkEA are kept in the system."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet mds1 $LCTL set_param fail_loc=0x1624
	do_facet mds2 $LCTL set_param fail_loc=0x1624
	rmdir $DIR/$tdir/d1/a1 ||
		error "(1) Fail to rmdir $DIR/$tdir/d1/a1"
	rmdir $DIR/$tdir/d2/a2 ||
		error "(2) Fail to rmdir $DIR/$tdir/d2/a2"
	do_facet mds1 $LCTL set_param fail_loc=0
	do_facet mds2 $LCTL set_param fail_loc=0

	cancel_lru_locks mdc
	cancel_lru_locks osc

	printf '%s\n' \
		"Inject failure, to simulate the MDT0 fail to handle" \
		"MDT1 LFSCK request during the first-stage scanning."
	#define OBD_FAIL_LFSCK_BAD_NETWORK	0x161c
	do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0

	echo "Trigger namespace LFSCK on all devices to find out orphan object"
	$START_NAMESPACE -r -A ||
		error "(3) Fail to start LFSCK for namespace"

	wait_update_facet mds1 "$LCTL get_param -n \
		mdd.$(facet_svc mds1).lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "partial" 32 ||
		error "(4) mds1 is not the expected 'partial'"

	wait_update_facet mds2 "$LCTL get_param -n \
		mdd.$(facet_svc mds2).lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 ||
		error "(5) mds2 is not the expected 'completed'"

	do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0

	# First pass: nothing repaired on mds1, one entry repaired on mds2.
	local nfixed
	nfixed=$(do_facet mds1 $LCTL get_param -n \
		 mdd.$(facet_svc mds1).lfsck_namespace |
		 awk '/^lost_dirent_repaired/ { print $2 }')
	[ $nfixed -eq 0 ] ||
		error "(6) Expect 0 fixed on mds1, but got: $nfixed"

	nfixed=$(do_facet mds2 $LCTL get_param -n \
		 mdd.$(facet_svc mds2).lfsck_namespace |
		 awk '/^lost_dirent_repaired/ { print $2 }')
	[ $nfixed -eq 1 ] ||
		error "(7) Expect 1 fixed on mds2, but got: $nfixed"

	echo "Trigger namespace LFSCK on all devices again to cleanup"
	$START_NAMESPACE -r -A ||
		error "(8) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 9

	# Second pass: the counts are expected the other way round.
	nfixed=$(do_facet mds1 $LCTL get_param -n \
		 mdd.$(facet_svc mds1).lfsck_namespace |
		 awk '/^lost_dirent_repaired/ { print $2 }')
	[ $nfixed -eq 1 ] ||
		error "(10) Expect 1 fixed on mds1, but got: $nfixed"

	nfixed=$(do_facet mds2 $LCTL get_param -n \
		 mdd.$(facet_svc mds2).lfsck_namespace |
		 awk '/^lost_dirent_repaired/ { print $2 }')
	[ $nfixed -eq 0 ] ||
		error "(11) Expect 0 fixed on mds2, but got: $nfixed"
}
run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
# test_29a: nlink larger than the number of known name entries.
#
# d0/foo is hard-linked as d0/h1 while fail_loc 0x1625 is armed, which must
# leave foo with a link count of 3; the LFSCK is then expected to report
# nlinks_repaired == 1 and bring the link count back to 2.
# The run_test line below the function is commented out, so this test is
# currently not executed.
test_29a() {
	printf '%s\n' \
		"The object's nlink attribute is larger than the object's known" \
		"name entries count. The LFSCK will repair the object's nlink" \
		"attribute to match the known name entries count"

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 ||
		error "(1) Fail to mkdir d0"
	touch $DIR/$tdir/d0/foo ||
		error "(2) Fail to create foo"

	printf '%s\n' \
		"Inject failure stub on MDT0 to simulate the case that foo's" \
		"nlink attribute is larger than its name entries count."

	#define OBD_FAIL_LFSCK_MORE_NLINK	0x1625
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
	ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	cancel_lru_locks mdc
	local nlink
	nlink=$(stat --format=%h $DIR/$tdir/d0/foo)
	[ $nlink -eq 3 ] ||
		error "(4) Cannot inject error: $nlink"

	echo "Trigger namespace LFSCK to repair the nlink count"
	$START_NAMESPACE -r -A ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	local nfixed
	nfixed=$($SHOW_NAMESPACE |
		 awk '/^nlinks_repaired/ { print $2 }')
	[ $nfixed -eq 1 ] ||
		error "(7) Fail to repair nlink count: $nfixed"

	cancel_lru_locks mdc
	nlink=$(stat --format=%h $DIR/$tdir/d0/foo)
	[ $nlink -eq 2 ] ||
		error "(8) Fail to repair nlink count: $nlink"
}
# 29a is disabled: nlink is only allowed to be updated when the number of
# known linkEA entries is larger than the nlink count.
#run_test 29a "LFSCK can repair bad nlink count (1)"
# test_29b: nlink smaller than the number of known name entries.
#
# d0/foo is hard-linked as d0/h1 while fail_loc 0x1626 is armed, which must
# leave foo with a link count of 1; the LFSCK must then report
# nlinks_repaired == 1 and the link count must become 2.
test_29b() {
	printf '%s\n' \
		"The object's nlink attribute is smaller than the object's known" \
		"name entries count. The LFSCK will repair the object's nlink" \
		"attribute to match the known name entries count"

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 ||
		error "(1) Fail to mkdir d0"
	touch $DIR/$tdir/d0/foo ||
		error "(2) Fail to create foo"

	printf '%s\n' \
		"Inject failure stub on MDT0 to simulate the case that foo's" \
		"nlink attribute is smaller than its name entries count."

	#define OBD_FAIL_LFSCK_LESS_NLINK	0x1626
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
	ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	cancel_lru_locks mdc
	local nlink
	nlink=$(stat --format=%h $DIR/$tdir/d0/foo)
	[ $nlink -eq 1 ] ||
		error "(4) Cannot inject error: $nlink"

	echo "Trigger namespace LFSCK to repair the nlink count"
	$START_NAMESPACE -r -A ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	local nfixed
	nfixed=$($SHOW_NAMESPACE |
		 awk '/^nlinks_repaired/ { print $2 }')
	[ $nfixed -eq 1 ] ||
		error "(7) Fail to repair nlink count: $nfixed"

	cancel_lru_locks mdc
	nlink=$(stat --format=%h $DIR/$tdir/d0/foo)
	[ $nlink -eq 2 ] ||
		error "(8) Fail to repair nlink count: $nlink"
}
run_test 29b "LFSCK can repair bad nlink count (2)"
# test_29c: linkEA size limitation and the overflow mark.
#
# 150 hard links to guard/f0 are created under foo/, then 100 of them are
# removed. With two or more MDTs, "lfs migrate -m 1" of guard/ is attempted
# after each step and f0's FID is expected to stay unchanged. The namespace
# LFSCK must then report linkea_overflow_cleared == 1 and
# nlinks_repaired == 0; after that (two or more MDTs) a migrate must change
# f0's FID and foo/ must still be listable.
test_29c() {
	echo "The namespace LFSCK will create many hard links to the target"
	echo "file as to exceed the linkEA size limitation. Under such case"
	echo "the linkEA will be marked as overflow that will prevent the"
	echo "target file to be migrated. Then remove some hard links to"
	echo "make the left hard links to be held within the linkEA size"
	echo "limitation. But before the namespace LFSCK adding all the"
	echo "missed linkEA entries back, the overflow mark (timestamp)"
	echo "will not be cleared."

	check_mount_and_prep

	mkdir -p $DIR/$tdir/guard || error "(0.1) Fail to mkdir"
	$LFS mkdir -i $((MDSCOUNT - 1)) $DIR/$tdir/foo ||
		error "(0.2) Fail to mkdir"
	touch $DIR/$tdir/guard/f0 || error "(1) Fail to create"
	local oldfid=$($LFS path2fid $DIR/$tdir/guard/f0)

	# define MAX_LINKEA_SIZE	4096
	# sizeof(link_ea_header) = 24
	# sizeof(link_ea_entry) = 18
	# nlink_min=$(((MAX_LINKEA_SIZE - sizeof(link_ea_header)) /
	#	      (sizeof(link_ea_entry) + name_length))
	# If the average name length is 12 bytes, then 150 hard links
	# is totally enough to overflow the linkEA
	echo "Create 150 hard links should succeed although the linkEA overflow"
	createmany -l $DIR/$tdir/guard/f0 $DIR/$tdir/foo/ttttttttttt 150 ||
		error "(2) Fail to hard link"

	cancel_lru_locks mdc
	if [ $MDSCOUNT -ge 2 ]; then
		$LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
			error "(3.1) Migrate failure"

		echo "The object with linkEA overflow should NOT be migrated"
		local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
		[ "$newfid" == "$oldfid" ] ||
			error "(3.2) Migrate should fail: $newfid != $oldfid"
	fi

	# Remove 100 hard links, then the linkEA should have space
	# to hold the missed linkEA entries.
	echo "Remove 100 hard links to save space for the missed linkEA entries"
	unlinkmany $DIR/$tdir/foo/ttttttttttt 100 || error "(4) Fail to unlink"

	if [ $MDSCOUNT -ge 2 ]; then
		$LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
			error "(5.1) Migrate failure"

		# The overflow timestamp is still there, so migration will fail.
		local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
		[ "$newfid" == "$oldfid" ] ||
			error "(5.2) Migrate should fail: $newfid != $oldfid"
	fi

	# sleep 3 seconds to guarantee that the overflow is recognized
	sleep 3

	echo "Trigger namespace LFSCK to clear the overflow timestamp"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 7

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^linkea_overflow_cleared/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(8) Fail to clear linkea overflow: $repaired"

	repaired=$($SHOW_NAMESPACE |
		   awk '/^nlinks_repaired/ { print $2 }')
	[ $repaired -eq 0 ] ||
		error "(9) Unexpected nlink repaired: $repaired"

	if [ $MDSCOUNT -ge 2 ]; then
		$LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
			error "(10.1) Migrate failure"

		# Migration should succeed after clear the overflow timestamp.
		local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
		[ "$newfid" != "$oldfid" ] ||
			error "(10.2) Migrate should succeed"

		ls -l $DIR/$tdir/foo > /dev/null ||
			error "(11) 'ls' failed after migration"
	fi

	rm -f $DIR/$tdir/guard/f0 || error "(12) Fail to unlink f0"
	rm -rf $DIR/$tdir/foo || error "(13) Fail to rmdir foo"
}
run_test 29c "verify linkEA size limitation"
# test_30: namespace LFSCK recovers orphans from the backend /lost+found.
# Skipped unless the MDS backend is ldiskfs.
#
# foo/d0 is created while fail_loc 0x161d is armed; then d0/d1, d0/f1, d0
# and foo/f0 are removed while fail_loc 0x1624 is armed. The client is
# unmounted, the MDT stopped, e2fsck run with "-y" on the MDT device, and
# the MDT restarted. The LFSCK must report local_lost_found_moved >= 4;
# afterwards foo/f0 must stat, and d0 must exist as the directory
# .lustre/lost+found/MDT0000/<cfid>-<pfid>-D-0 still holding d1 and f1.
test_30() {
	[ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
		skip "ldiskfs only test" && return

	echo "The namespace LFSCK will move the orphans from backend"
	echo "/lost+found directory to normal client visible namespace"
	echo "or to global visible ./lustre/lost+found/MDTxxxx/ directory"

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/foo || error "(1) Fail to mkdir foo"
	# The message names f0, the file actually touched here (it used to
	# say f1).
	touch $DIR/$tdir/foo/f0 || error "(2) Fail to touch f0"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "directory d0 has no linkEA entry, then the LFSCK will"
	echo "move it into .lustre/lost+found/MDTxxxx/ later."

	#define OBD_FAIL_LFSCK_NO_LINKEA	0x161d
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
	mkdir $DIR/$tdir/foo/d0 || error "(3) Fail to mkdir d0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	local pfid=$($LFS path2fid $DIR/$tdir/foo)
	local cfid=$($LFS path2fid $DIR/$tdir/foo/d0)

	touch $DIR/$tdir/foo/d0/f1 || error "(4) Fail to touch f1"
	mkdir $DIR/$tdir/foo/d0/d1 || error "(5) Fail to mkdir d1"

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "object's name entry will be removed, but not destroy the"
	echo "object. Then backend e2fsck will handle it as orphan and"
	echo "add them into the backend /lost+found directory."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rmdir $DIR/$tdir/foo/d0/d1 || error "(6) Fail to rmdir d1"
	rm -f $DIR/$tdir/foo/d0/f1 || error "(7) Fail to unlink f1"
	rmdir $DIR/$tdir/foo/d0 || error "(8) Fail to rmdir d0"
	rm -f $DIR/$tdir/foo/f0 || error "(9) Fail to unlink f0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	umount_client $MOUNT || error "(10) Fail to stop client!"

	stop $SINGLEMDS || error "(11) Fail to stop MDT0"

	run_e2fsck $(facet_host $SINGLEMDS) $MDT_DEVNAME "-y" ||
		error "(12) Fail to run e2fsck"

	start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
		error "(13) Fail to start MDT0"

	echo "Trigger namespace LFSCK to recover backend orphans"
	$START_NAMESPACE -r -A ||
		error "(14) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 15

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^local_lost_found_moved/ { print $2 }')
	[ $repaired -ge 4 ] ||
		error "(16) Fail to recover backend orphans: $repaired"

	mount_client $MOUNT || error "(17) Fail to start client!"

	stat $DIR/$tdir/foo/f0 || error "(18) f0 is not recovered"

	ls -ail $MOUNT/.lustre/lost+found/

	echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
	[ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
		error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"

	ls -ail $MOUNT/.lustre/lost+found/MDT0000/

	# cname is a path assembled from known parts, so a non-empty test on
	# it could never fail; check that the directory really exists.
	local cname=$MOUNT/.lustre/lost+found/MDT0000/${cfid}-${pfid}-D-0
	[ -d "$cname" ] || error "(20) d0 is not recovered"

	stat ${cname}/d1 || error "(21) d1 is not recovered"
	stat ${cname}/f1 || error "(22) f1 is not recovered"
}
run_test 30 "LFSCK can recover the orphans from backend /lost+found"
# test_31a: namespace LFSCK repair of name entries with a bad name hash,
# case (1). Needs at least two MDTs.
#
# A directory striped over $MDSCOUNT MDTs is populated with $MDSCOUNT
# sub-directories while client-side fail_loc 0x1628 / fail_val=0 is armed.
# The LFSCK must report name_hash_repaired >= 1; after a client remount
# every sub-directory must stat and be removable, and so must the striped
# directory itself.
test_31a() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return

	echo "For the name entry under a striped directory, if the name"
	echo "hash does not match the shard, then the LFSCK will repair"
	echo "the bad name entry"

	check_mount_and_prep

	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"

	echo "Inject failure stub on client to simulate the case that"
	echo "some name entry should be inserted into other non-first"
	echo "shard, but inserted into the first shard by wrong"

	#define OBD_FAIL_LFSCK_BAD_NAME_HASH	0x1628
	$LCTL set_param fail_loc=0x1628 fail_val=0
	createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
		error "(2) Fail to create file under striped directory"
	$LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger namespace LFSCK to repair bad name hash"
	$START_NAMESPACE -r -A ||
		error "(3) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 4

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^name_hash_repaired/ { print $2 }')
	[ $repaired -ge 1 ] ||
		error "(5) Fail to repair bad name hash: $repaired"

	umount_client $MOUNT || error "(6) umount failed"
	mount_client $MOUNT || error "(7) mount failed"

	# Loop counter is local so it does not leak into the global scope.
	local i
	for ((i = 0; i < $MDSCOUNT; i++)); do
		stat $DIR/$tdir/striped_dir/d$i ||
			error "(8) Fail to stat d$i after LFSCK"
		rmdir $DIR/$tdir/striped_dir/d$i ||
			error "(9) Fail to unlink d$i after LFSCK"
	done

	rmdir $DIR/$tdir/striped_dir ||
		error "(10) Fail to remove the striped directory after LFSCK"
}
run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
# test_31b: namespace LFSCK repair of name entries with a bad name hash,
# case (2). Needs at least two MDTs.
#
# Same flow as case (1) but with fail_val=1 during creation, and the
# name_hash_repaired counter (>= 1 expected) is read from mds2. After a
# client remount every sub-directory must stat and be removable, and so
# must the striped directory itself.
test_31b() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return

	echo "For the name entry under a striped directory, if the name"
	echo "hash does not match the shard, then the LFSCK will repair"
	echo "the bad name entry"

	check_mount_and_prep

	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"

	echo "Inject failure stub on client to simulate the case that"
	echo "some name entry should be inserted into other non-second"
	echo "shard, but inserted into the secod shard by wrong"

	#define OBD_FAIL_LFSCK_BAD_NAME_HASH	0x1628
	$LCTL set_param fail_loc=0x1628 fail_val=1
	createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
		error "(2) Fail to create file under striped directory"
	$LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger namespace LFSCK to repair bad name hash"
	$START_NAMESPACE -r -A ||
		error "(3) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 4

	local repaired=$(do_facet mds2 $LCTL get_param -n \
			 mdd.$(facet_svc mds2).lfsck_namespace |
			 awk '/^name_hash_repaired/ { print $2 }')
	[ $repaired -ge 1 ] ||
		error "(5) Fail to repair bad name hash: $repaired"

	umount_client $MOUNT || error "(6) umount failed"
	mount_client $MOUNT || error "(7) mount failed"

	# Loop counter is local so it does not leak into the global scope.
	local i
	for ((i = 0; i < $MDSCOUNT; i++)); do
		stat $DIR/$tdir/striped_dir/d$i ||
			error "(8) Fail to stat d$i after LFSCK"
		rmdir $DIR/$tdir/striped_dir/d$i ||
			error "(9) Fail to unlink d$i after LFSCK"
	done

	rmdir $DIR/$tdir/striped_dir ||
		error "(10) Fail to remove the striped directory after LFSCK"
}
run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
# Test 31c: the master object of a striped directory lost its master LMV EA
# and nothing was created under it afterwards; the namespace LFSCK must
# re-generate the EA. Skipped unless there are at least 2 MDTs.
# Note: "31c" is listed in ALWAYS_EXCEPT at the top of this file (LU-10406).
4773 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4776 echo "For some reason, the master MDT-object of the striped directory"
4777 echo "may lost its master LMV EA. If nobody created files under the"
4778 echo "master directly after the master LMV EA lost, then the LFSCK"
4779 echo "should re-generate the master LMV EA."
4782 check_mount_and_prep
4784 echo "Inject failure stub on MDT0 to simulate the case that the"
4785 echo "master MDT-object of the striped directory lost the LMV EA."
# The failure stub is armed on $SINGLEMDS only while the striped directory is
# being created, then cleared.
4787 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
4788 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
4789 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4790 error "(1) Fail to create striped directory"
4791 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4793 echo "Trigger namespace LFSCK to re-generate master LMV EA"
4794 $START_NAMESPACE -r -A ||
4795 error "(2) Fail to start LFSCK for namespace"
# Presumably blocks until every target reports "completed"; the trailing 3
# looks like the step number used by the helper on failure -- TODO confirm.
4797 wait_all_targets_blocked namespace completed 3
# Exactly one striped directory must have been repaired.
4799 local repaired=$($SHOW_NAMESPACE |
4800 awk '/^striped_dirs_repaired/ { print $2 }')
4801 [ $repaired -eq 1 ] ||
4802 error "(4) Fail to re-generate master LMV EA: $repaired"
# Remount the client before verifying (presumably to drop client-side caches).
4804 umount_client $MOUNT || error "(5) umount failed"
4805 mount_client $MOUNT || error "(6) mount failed"
# The repaired directory must list as empty and be removable.
4807 local empty=$(ls $DIR/$tdir/striped_dir/)
4808 [ -z "$empty" ] || error "(7) The master LMV EA is not repaired: $empty"
4810 rmdir $DIR/$tdir/striped_dir ||
4811 error "(8) Fail to remove the striped directory after LFSCK"
4813 run_test 31c "Re-generate the lost master LMV EA for striped directory"
# Test 31d: the master object of a striped directory lost its master LMV EA
# and a file was created under it afterwards; the namespace LFSCK must NOT
# re-generate the EA and the directory must end up read-only.
# Skipped unless there are at least 2 MDTs.
4816 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4819 echo "For some reason, the master MDT-object of the striped directory"
4820 echo "may lost its master LMV EA. If somebody created files under the"
4821 echo "master directly after the master LMV EA lost, then the LFSCK"
4822 echo "should NOT re-generate the master LMV EA, instead, it should"
4823 echo "change the broken striped dirctory as read-only to prevent"
4824 echo "further damage"
4827 check_mount_and_prep
4829 echo "Inject failure stub on MDT0 to simulate the case that the"
4830 echo "master MDT-object of the striped directory lost the LMV EA."
# The failure stub is armed on $SINGLEMDS only while the striped directory is
# being created, then cleared.
4832 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
4833 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
4834 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4835 error "(1) Fail to create striped directory"
4836 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0
# Remount the client, then modify the broken directory before LFSCK runs.
4838 umount_client $MOUNT || error "(2) umount failed"
4839 mount_client $MOUNT || error "(3) mount failed"
4841 touch $DIR/$tdir/striped_dir/dummy ||
4842 error "(4) Fail to touch under broken striped directory"
4844 echo "Trigger namespace LFSCK to find out the inconsistency"
4845 $START_NAMESPACE -r -A ||
4846 error "(5) Fail to start LFSCK for namespace"
# Presumably blocks until every target reports "completed"; the trailing 6
# looks like the step number used by the helper on failure -- TODO confirm.
4848 wait_all_targets_blocked namespace completed 6
# No striped directory may have been repaired in this scenario.
4850 local repaired=$($SHOW_NAMESPACE |
4851 awk '/^striped_dirs_repaired/ { print $2 }')
4852 [ $repaired -eq 0 ] ||
4853 error "(7) Re-generate master LMV EA unexpected: $repaired"
# The existing file must still be visible ...
4855 stat $DIR/$tdir/striped_dir/dummy ||
4856 error "(8) Fail to stat $DIR/$tdir/striped_dir/dummy"
# ... but creating a new one must fail (a successful touch is the error).
4858 touch $DIR/$tdir/striped_dir/foo &&
4859 error "(9) The broken striped directory should be read-only"
# Clear the immutable attribute so that the directory can be removed.
4861 chattr -i $DIR/$tdir/striped_dir ||
4862 error "(10) Fail to chattr on the broken striped directory"
4864 rmdir $DIR/$tdir/striped_dir ||
4865 error "(11) Fail to remove the striped directory after LFSCK"
4867 run_test 31d "Set broken striped directory (modified after broken) as read-only"
# Test 31e: a slave object of a striped directory lost its slave LMV EA; the
# namespace LFSCK must re-generate it. Here the stub is armed with fail_val=0
# and the repair counter is read via $SHOW_NAMESPACE.
# Skipped unless there are at least 2 MDTs.
4870 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4873 echo "For some reason, the slave MDT-object of the striped directory"
4874 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
4875 echo "slave LMV EA."
4878 check_mount_and_prep
4880 echo "Inject failure stub on MDT0 to simulate the case that the"
4881 echo "slave MDT-object (that resides on the same MDT as the master"
4882 echo "MDT-object resides on) lost the LMV EA."
# The failure stub is armed on $SINGLEMDS only while the striped directory is
# being created, then cleared. fail_val=0 presumably selects the shard on the
# master's own MDT -- TODO confirm.
4884 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
4885 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=0
4886 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4887 error "(1) Fail to create striped directory"
4888 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4890 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
4891 $START_NAMESPACE -r -A ||
4892 error "(2) Fail to start LFSCK for namespace"
# Presumably blocks until every target reports "completed"; the trailing 3
# looks like the step number used by the helper on failure -- TODO confirm.
4894 wait_all_targets_blocked namespace completed 3
# Exactly one shard must have been repaired.
4896 local repaired=$($SHOW_NAMESPACE |
4897 awk '/^striped_shards_repaired/ { print $2 }')
4898 [ $repaired -eq 1 ] ||
4899 error "(4) Fail to re-generate slave LMV EA: $repaired"
4901 rmdir $DIR/$tdir/striped_dir ||
4902 error "(5) Fail to remove the striped directory after LFSCK"
4904 run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
# Test 31f: same scenario as test 31e, but the stub is armed with fail_val=1
# and the repair counter is read from mds2.
# Skipped unless there are at least 2 MDTs.
4907 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4910 echo "For some reason, the slave MDT-object of the striped directory"
4911 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
4912 echo "slave LMV EA."
4915 check_mount_and_prep
4917 echo "Inject failure stub on MDT0 to simulate the case that the"
4918 echo "slave MDT-object (that resides on different MDT as the master"
4919 echo "MDT-object resides on) lost the LMV EA."
# The failure stub is armed on $SINGLEMDS only while the striped directory is
# being created, then cleared. fail_val=1 presumably selects a shard on a
# different MDT than the master -- TODO confirm.
4921 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
4922 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=1
4923 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4924 error "(1) Fail to create striped directory"
4925 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4927 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
4928 $START_NAMESPACE -r -A ||
4929 error "(2) Fail to start LFSCK for namespace"
# Presumably blocks until every target reports "completed"; the trailing 3
# looks like the step number used by the helper on failure -- TODO confirm.
4931 wait_all_targets_blocked namespace completed 3
# Read striped_shards_repaired from the lfsck_namespace parameter of mds2;
# exactly one repaired shard is expected there.
4933 local repaired=$(do_facet mds2 $LCTL get_param -n \
4934 mdd.$(facet_svc mds2).lfsck_namespace |
4935 awk '/^striped_shards_repaired/ { print $2 }')
4936 [ $repaired -eq 1 ] ||
4937 error "(4) Fail to re-generate slave LMV EA: $repaired"
4939 rmdir $DIR/$tdir/striped_dir ||
4940 error "(5) Fail to remove the striped directory after LFSCK"
4942 run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
# Test 31g: the stripe index stored in a slave LMV EA is corrupted; the
# namespace LFSCK must repair it so the striped directory is usable again.
# Skipped unless there are at least 2 MDTs.
4945 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4948 echo "For some reason, the stripe index in the slave LMV EA is"
4949 echo "corrupted. The LFSCK should repair the slave LMV EA."
4952 check_mount_and_prep
4954 echo "Inject failure stub on MDT0 to simulate the case that the"
4955 echo "slave LMV EA on the first shard of the striped directory"
4956 echo "claims the same index as the second shard claims"
# The failure stub is armed on $SINGLEMDS only while the striped directory is
# being created, then cleared.
4958 #define OBD_FAIL_LFSCK_BAD_SLAVE_LMV 0x162b
4959 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162b fail_val=0
4960 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4961 error "(1) Fail to create striped directory"
4962 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4964 echo "Trigger namespace LFSCK to repair the slave LMV EA"
4965 $START_NAMESPACE -r -A ||
4966 error "(2) Fail to start LFSCK for namespace"
# Presumably blocks until every target reports "completed"; the trailing 3
# looks like the step number used by the helper on failure -- TODO confirm.
4968 wait_all_targets_blocked namespace completed 3
# Exactly one shard must have been repaired.
4970 local repaired=$($SHOW_NAMESPACE |
4971 awk '/^striped_shards_repaired/ { print $2 }')
4972 [ $repaired -eq 1 ] ||
4973 error "(4) Fail to repair slave LMV EA: $repaired"
# Remount the client before verifying (presumably to drop client-side caches).
4975 umount_client $MOUNT || error "(5) umount failed"
4976 mount_client $MOUNT || error "(6) mount failed"
# The repaired directory must accept file creation and removal.
4978 touch $DIR/$tdir/striped_dir/foo ||
4979 error "(7) Fail to touch file after the LFSCK"
4981 rm -f $DIR/$tdir/striped_dir/foo ||
4982 error "(8) Fail to unlink file after the LFSCK"
4984 rmdir $DIR/$tdir/striped_dir ||
4985 error "(9) Fail to remove the striped directory after LFSCK"
4987 run_test 31g "Repair the corrupted slave LMV EA"
# Test 31h: a shard's name entry inside the striped directory is corrupted;
# the namespace LFSCK must repair that entry so the directory is usable
# again. Skipped unless there are at least 2 MDTs.
4990 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4993 echo "For some reason, the shard's name entry in the striped"
4994 echo "directory may be corrupted. The LFSCK should repair the"
4995 echo "bad shard's name entry."
4998 check_mount_and_prep
5000 echo "Inject failure stub on MDT0 to simulate the case that the"
5001 echo "first shard's name entry in the striped directory claims"
5002 echo "the same index as the second shard's name entry claims."
# The failure stub is armed on $SINGLEMDS only while the striped directory is
# being created, then cleared.
5004 #define OBD_FAIL_LFSCK_BAD_SLAVE_NAME 0x162c
5005 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162c fail_val=0
5006 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5007 error "(1) Fail to create striped directory"
5008 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5010 echo "Trigger namespace LFSCK to repair the shard's name entry"
5011 $START_NAMESPACE -r -A ||
5012 error "(2) Fail to start LFSCK for namespace"
# Presumably blocks until every target reports "completed"; the trailing 3
# looks like the step number used by the helper on failure -- TODO confirm.
5014 wait_all_targets_blocked namespace completed 3
# Exactly one directory entry must have been repaired.
5016 local repaired=$($SHOW_NAMESPACE |
5017 awk '/^dirent_repaired/ { print $2 }')
5018 [ $repaired -eq 1 ] ||
5019 error "(4) Fail to repair shard's name entry: $repaired"
# Remount the client before verifying (presumably to drop client-side caches).
5021 umount_client $MOUNT || error "(5) umount failed"
5022 mount_client $MOUNT || error "(6) mount failed"
# The repaired directory must accept file creation and removal.
5024 touch $DIR/$tdir/striped_dir/foo ||
5025 error "(7) Fail to touch file after the LFSCK"
5027 rm -f $DIR/$tdir/striped_dir/foo ||
5028 error "(8) Fail to unlink file after the LFSCK"
5030 rmdir $DIR/$tdir/striped_dir ||
5031 error "(9) Fail to remove the striped directory after LFSCK"
5033 run_test 31h "Repair the corrupted shard's name entry"
# Test 32a: a running layout LFSCK must still be stoppable after OST1 has
# been shut down underneath it.
5038 umount_client $MOUNT
# Arm the engine-delay stub on $SINGLEMDS (fail_val=3) before starting the
# layout LFSCK; presumably this slows the scan enough for the status check
# below to observe phase 1 -- TODO confirm.
5040 #define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d
5041 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
5042 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
# The scan must still be in its first phase at this point.
5044 local STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
5045 [ "$STATUS" == "scanning-phase1" ] ||
5046 error "(2) Expect 'scanning-phase1', but got '$STATUS'"
# Take OST1 down while LFSCK is running, then clear the delay stub.
5049 stop ost1 > /dev/null || error "(3) Fail to stop OST1!"
5051 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# Stopping LFSCK must succeed even though OST1 is unavailable.
5055 $STOP_LFSCK || error "(4) Fail to stop LFSCK!"
# Bring OST1 back for the following tests.
5057 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
5058 error "(5) Fail to start ost1"
5060 run_test 32a "stop LFSCK when some OST failed"
# Test 32b: a running namespace LFSCK must still be stoppable after MDT2
# (facet mds2) has been shut down underneath it.
# Skipped unless there are at least 2 MDTs.
5064 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# Build a small cross-MDT tree: a parent on MDT index 1 with two children
# striped over all MDTs starting at MDT index 0.
5067 $LFS mkdir -i 1 $DIR/$tdir/dp ||
5068 error "(1) Fail to create $DIR/$tdir/dp"
5069 $LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc1 ||
5070 error "(2) Fail to create $DIR/$tdir/dp/dc1"
5071 $LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc2 ||
5072 error "(3) Fail to create $DIR/$tdir/dp/dc2"
5073 umount_client $MOUNT
# Arm the engine-delay stub on $SINGLEMDS (fail_val=3) before starting the
# namespace LFSCK; presumably this keeps the scan in phase 1 long enough for
# the check below -- TODO confirm.
5075 #define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d
5076 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
5077 $START_NAMESPACE -r -A || error "(4) Fail to start LFSCK for namespace!"
# Poll the status on $SINGLEMDS until it reads "scanning-phase1"; the final
# argument (32) is presumably a timeout in seconds -- TODO confirm.
5079 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5080 mdd.${MDT_DEV}.lfsck_namespace |
5081 awk '/^status/ { print \\\$2 }'" "scanning-phase1" 32 || {
5083 error "(5) unexpected status"
# Take MDT2 down while LFSCK is running, then clear the delay stub.
5087 stop mds2 > /dev/null || error "(6) Fail to stop MDT2!"
5089 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# Stopping LFSCK must succeed even though MDT2 is unavailable.
5093 $STOP_LFSCK || error "(7) Fail to stop LFSCK!"
# Bring MDT2 back for the following tests.
5095 start mds2 $(mdsdevname 2) $MOUNT_OPTS_NOSCRUB > /dev/null ||
5096 error "(8) Fail to start MDT2"
5098 run_test 32b "stop LFSCK when some MDT failed"
# Test 33: verify that the options passed to lfsck_start are reflected in the
# "param" line of the LFSCK status output.
# Layout LFSCK started with "--dryrun -o -r" must report
# 'dryrun,all_targets,orphan'.
5104 $START_LAYOUT --dryrun -o -r ||
5105 error "(1) Fail to start layout LFSCK"
# Presumably blocks until every target reports "completed"; the trailing 2
# looks like the step number used by the helper on failure -- TODO confirm.
5106 wait_all_targets_blocked layout completed 2
5108 local PARAMS=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
5109 [ "$PARAMS" == "dryrun,all_targets,orphan" ] ||
5110 error "(3) Expect 'dryrun,all_targets,orphan', got '$PARAMS'"
# Namespace LFSCK started with "-e abort -A -r" must report
# 'failout,all_targets'.
5112 $START_NAMESPACE -e abort -A -r ||
5113 error "(4) Fail to start namespace LFSCK"
5114 wait_all_targets_blocked namespace completed 5
5116 PARAMS=$($SHOW_NAMESPACE | awk '/^param/ { print $2 }')
5117 [ "$PARAMS" == "failout,all_targets" ] ||
5118 error "(6) Expect 'failout,all_targets', got '$PARAMS'"
5120 run_test 33 "check LFSCK paramters"
# Test 34: the namespace LFSCK must rebuild a lost agent object for a remote
# directory, and a second run must find nothing left to repair.
# Skipped unless there are at least 2 MDTs and the MDS backend is ZFS.
5124 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5125 [ $(facet_fstype $SINGLEMDS) != zfs ] &&
5126 skip "Only valid for ZFS backend" && return
# Arm the stub on $SINGLEMDS while a directory is created on MDT index 1,
# then clear it before starting LFSCK.
5130 #define OBD_FAIL_LFSCK_NO_AGENTOBJ 0x1630
5131 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1630
5132 $LFS mkdir -i 1 $DIR/$tdir/dummy ||
5133 error "(1) Fail to create $DIR/$tdir/dummy"
5135 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# First run (started without -A): poll $SINGLEMDS until the status reads
# "completed"; the final argument (32) is presumably a timeout in seconds.
5136 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
5137 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5138 mdd.${MDT_DEV}.lfsck_namespace |
5139 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5141 error "(3) unexpected status"
# Exactly one directory entry must have been repaired by the first run.
5144 local repaired=$($SHOW_NAMESPACE |
5145 awk '/^dirent_repaired/ { print $2 }')
5146 [ $repaired -eq 1 ] ||
5147 error "(4) Fail to repair the lost agent object: $repaired"
# Second run: the namespace is now consistent, so nothing may be repaired.
5149 $START_NAMESPACE -r || error "(5) Fail to start LFSCK for namespace!"
5150 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5151 mdd.${MDT_DEV}.lfsck_namespace |
5152 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5154 error "(6) unexpected status"
5157 repaired=$($SHOW_NAMESPACE | awk '/^dirent_repaired/ { print $2 }')
5158 [ $repaired -eq 0 ] ||
5159 error "(7) Unexpected repairing: $repaired"
5161 run_test 34 "LFSCK can rebuild the lost agent object"
# Test 35: the namespace LFSCK must rebuild a lost agent entry for a remote
# directory, and a second run after a restart must find nothing to repair.
# Skipped unless there are at least 2 MDTs.
5165 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# Arm the stub on mds2 while a directory is created on MDT index 1, then
# clear it before starting LFSCK.
5169 #define OBD_FAIL_LFSCK_NO_AGENTENT 0x1631
5170 do_facet mds2 $LCTL set_param fail_loc=0x1631
5171 $LFS mkdir -i 1 $DIR/$tdir/dummy ||
5172 error "(1) Fail to create $DIR/$tdir/dummy"
5175 do_facet mds2 $LCTL set_param fail_loc=0
# First run: poll mds2 until its namespace LFSCK status reads "completed".
# The error text names MDS2 explicitly: this test only ever polls mds2 and
# has no loop variable "k", so the former "MDS${k}" printed a blank or stale
# MDS number.
5176 $START_NAMESPACE -A -r || error "(2) Fail to start LFSCK for namespace!"
5177 wait_update_facet mds2 "$LCTL get_param -n \
5178 mdd.$(facet_svc mds2).lfsck_namespace |
5179 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
5180 error "(3) MDS2 is not the expected 'completed'"
# Exactly one agent entry must have been repaired on mds2.
5182 local repaired=$(do_facet mds2 $LCTL get_param -n \
5183 mdd.$(facet_svc mds2).lfsck_namespace |
5184 awk '/^agent_entries_repaired/ { print $2 }')
5185 [ $repaired -eq 1 ] ||
5186 error "(4) Fail to repair the lost agent entry: $repaired"
5188 echo "stopall to cleanup object cache"
5191 setupall > /dev/null
# Second run after the restart: nothing may be left to repair.
5193 $START_NAMESPACE -A -r || error "(5) Fail to start LFSCK for namespace!"
5194 wait_update_facet mds2 "$LCTL get_param -n \
5195 mdd.$(facet_svc mds2).lfsck_namespace |
5196 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
5197 error "(6) MDS2 is not the expected 'completed'"
5199 repaired=$(do_facet mds2 $LCTL get_param -n \
5200 mdd.$(facet_svc mds2).lfsck_namespace |
5201 awk '/^agent_entries_repaired/ { print $2 }')
5202 [ $repaired -eq 0 ] ||
5203 error "(7) Unexpected repairing: $repaired"
5205 run_test 35 "LFSCK can rebuild the lost agent entry"
5207 # Restore the MDS/OST sizes and the OST count that were saved near the top
# of this script before they were reduced for the LFSCK tests.
5208 MDSSIZE=${SAVED_MDSSIZE}
5209 OSTSIZE=${SAVED_OSTSIZE}
5210 OSTCOUNT=${SAVED_OSTCOUNT}
5212 # Finally, clean up the system: reformat with the restored settings so
# that later test suites do not inherit the reduced configuration.
5213 REFORMAT="yes" cleanup_and_setup_lustre
5216 check_and_cleanup_lustre