3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
# NOTE(review): the leading number on each line is the original file's line
# number; gaps in that numbering mean some original lines are not shown here.
11 # Bug number for excepting test
# Seed the always-skipped list from the caller-supplied SANITY_LFSCK_EXCEPT.
12 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT"
# When SLOW is "no", EXCEPT_SLOW is set to the empty string.
14 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
15 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
# LUSTRE defaults to the parent of the directory that contains this script.
17 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
# Source the shared test framework, then the configuration file; CONFIG
# defaults (and is assigned) to $LUSTRE/tests/cfg/$NAME.sh when unset/empty.
18 . $LUSTRE/tests/test-framework.sh
20 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
# Leave with status 0 when require_dsh_mds fails.
23 require_dsh_mds || exit 0
# Skip when check_versions reports failure (the rest of this "if" is not shown).
27 if ! check_versions; then
28 skip "It is NOT necessary to test lfsck under interoperation mode"
# Require an MDS version code of at least 2.3.60; otherwise skip and exit 0.
32 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.3.60) ]] &&
33 skip "Need MDS version at least 2.3.60" && exit 0
# Remember the incoming sizes/count before they are overridden below.
# NOTE(review): presumably restored later in the script - not visible here.
37 SAVED_MDSSIZE=${MDSSIZE}
38 SAVED_OSTSIZE=${OSTSIZE}
39 SAVED_OSTCOUNT=${OSTCOUNT}
40 # use small MDS + OST size to speed formatting time
41 # do not make MDSSIZE/OSTSIZE too small, since that affects the default journal size
# Only the zfs overrides are visible in this listing.
43 [ $(facet_fstype $SINGLEMDS) == zfs ] && MDSSIZE=300000
45 [ $(facet_fstype ost1) == zfs ] && OSTSIZE=300000
47 # there is no need for too many OSTs; cap them to reduce the format/start/stop overhead
49 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
51 # build up a clean test environment.
52 REFORMAT="yes" check_and_setup_lustre
# Version-gated exclusions: each check appends test names to ALWAYS_EXCEPT
# when the queried server's version code is below (or, for 2c, not above)
# the stated release.
54 [[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.4.90) ]] &&
55 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2c"
57 [[ $(lustre_version_code ost1) -lt $(version_code 2.5.55) ]] &&
58 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 11 12 13 14 15 16 17 18 19 20 21"
60 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.50) ]] &&
61 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2d 2e 3 22 23 24 25 26 27 28 29 30 31"
63 # DNE does not support striped directory on zfs-based backend yet.
64 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
65 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 31"
# Device names and command templates shared by the tests below.
# The START_*/STOP_*/SHOW_* values are plain strings that the tests expand
# unquoted to run them (e.g. "$START_NAMESPACE -r"), so extra options can be
# appended at the call site.
69 MDT_DEV="${FSNAME}-MDT0000"
70 OST_DEV="${FSNAME}-OST0000"
# The mdsdevname argument is $SINGLEMDS with every "mds" substring removed.
71 MDT_DEVNAME=$(mdsdevname ${SINGLEMDS//mds/})
72 START_NAMESPACE="do_facet $SINGLEMDS \
73 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
74 START_LAYOUT="do_facet $SINGLEMDS \
75 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
76 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
77 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
# SHOW_* read the lfsck_namespace / lfsck_layout parameters with get_param -n.
78 SHOW_NAMESPACE="do_facet $SINGLEMDS \
79 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
80 SHOW_LAYOUT="do_facet $SINGLEMDS \
81 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
82 SHOW_LAYOUT_ON_OST="do_facet ost1 \
83 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# Mount option sets passed to "start" when a test restarts the MDS.
84 MOUNT_OPTS_SCRUB="-o user_xattr"
85 MOUNT_OPTS_NOSCRUB="-o user_xattr,noscrub"
86 MOUNT_OPTS_SKIP_LFSCK="-o user_xattr,skip_lfsck"
# File-preparation fragment. The enclosing function header, the assignments of
# nfiles/ndirs/igif and the closing fi/done lines are not shown in this listing.
95 echo "preparing... $nfiles * $ndirs files will be created $(date)."
# When igif is non-empty, arm fail_loc 0x1504 on the MDS while files are created.
96 if [ ! -z $igif ]; then
97 #define OBD_FAIL_FID_IGIF 0x1504
98 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
# Populate the test directory with the test scripts themselves.
101 cp $LUSTRE/tests/*.sh $DIR/$tdir/
# With ndirs > 0: create the d* directories and f* files, then (with
# nfiles > 0) nfiles files named f* inside each d<i>.
102 if [ $ndirs -gt 0 ]; then
103 createmany -d $DIR/$tdir/d $ndirs
104 createmany -m $DIR/$tdir/f $ndirs
105 if [ $nfiles -gt 0 ]; then
106 for ((i = 0; i < $ndirs; i++)); do
107 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
108 /dev/null || error "createmany $nfiles"
111 createmany -d $DIR/$tdir/e $ndirs
# When igif is non-empty, create one more file and then clear fail_loc.
114 if [ ! -z $igif ]; then
115 touch $DIR/$tdir/dummy
116 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
119 echo "prepared $(date)."
# Stops the MDS facet, runs run_e2fsck with "-n" against the device returned
# by "mdsdevname 1", and restarts the MDS with the noscrub mount options.
# Returns immediately when the MDS backend is not ldiskfs.
# NOTE(review): the command the first run_e2fsck is piped into and the
# closing braces are not shown in this listing.
122 run_e2fsck_on_mdt0() {
123 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] && return
125 stop $SINGLEMDS > /dev/null || error "(0) Fail to the stop MDT0"
126 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n" |
128 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n"
129 error "(2) Detected inconsistency on MDT0"
131 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
132 error "(3) Fail to start MDT0"
# Queries LFSCK state on mds1 with "lfsck_query ... -w" and requires that the
# value on the "<com>_mdts_<status>" line equals MDSCOUNT; otherwise it dumps
# the plain query output and raises an error tagged with $err.
# NOTE(review): the lines that set com/status/err are not shown in this
# listing; "-w" presumably makes the query wait - confirm against lctl docs.
135 wait_all_targets_blocked() {
140 local count=$(do_facet mds1 \
141 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 -w |
142 awk '/^${com}_mdts_${status}/ { print \\\$2 }'")
143 [[ $count -eq $MDSCOUNT ]] || {
144 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
145 error "($err) only $count of $MDSCOUNT MDTs are in ${status}"
# Polling fragment (enclosing function header not shown): waits, via
# wait_update_facet with $LTIME, until the "<com>_mdts_<status>" value from
# lfsck_query on mds1 equals MDSCOUNT; on failure it dumps the query output
# and raises an error tagged with $err.
154 wait_update_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 |
155 awk '/^${com}_mdts_${status}/ { print \\\$2 }'" \
156 "$MDSCOUNT" $LTIME || {
157 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
158 error "($err) some MDTs are not in ${status}"
# Body of test 0 (registered by run_test below; function header not shown).
# Flow: start namespace LFSCK with fail_loc 0x1600 armed, check it reports
# scanning-phase1, stop it, restart it, clear the fail_loc, wait for
# "completed", then check the repair and success counters.
165 #define OBD_FAIL_LFSCK_DELAY1 0x1600
166 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
167 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
169 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
171 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
172 [ "$STATUS" == "scanning-phase1" ] ||
173 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
175 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
177 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
178 [ "$STATUS" == "stopped" ] ||
179 error "(6) Expect 'stopped', but got '$STATUS'"
# Restart without -r this time.
181 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
183 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
184 [ "$STATUS" == "scanning-phase1" ] ||
185 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the injected failure, then wait for the scan to report "completed".
187 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
188 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
189 mdd.${MDT_DEV}.lfsck_namespace |
190 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
192 error "(9) unexpected status"
# No repairs are expected.
195 local repaired=$($SHOW_NAMESPACE |
196 awk '/^updated_phase1/ { print $2 }')
197 [ $repaired -eq 0 ] ||
198 error "(10) Expect nothing to be repaired, but got: $repaired"
# Run once more with -r (step 11's message calls this a reset) and expect
# success_count to grow by exactly one.
200 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
201 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
202 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
203 mdd.${MDT_DEV}.lfsck_namespace |
204 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
206 error "(12) unexpected status"
209 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
210 [ $((scanned1 + 1)) -eq $scanned2 ] ||
211 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
213 echo "stopall, should NOT crash LU-3649"
214 stopall || error "(14) Fail to stopall"
216 run_test 0 "Control LFSCK manually"
# Body of test 1a (registered by run_test below; function header not shown).
# Creates one file while fail_loc 0x1501 is armed, runs namespace LFSCK,
# expects exactly one repair, then lists the directory with fail_loc 0x1505.
221 #define OBD_FAIL_FID_INDIR 0x1501
222 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
223 touch $DIR/$tdir/dummy
225 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
227 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
228 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
229 mdd.${MDT_DEV}.lfsck_namespace |
230 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
232 error "(4) unexpected status"
# Prefer the dirent_repaired counter; fall back to updated_phase1 when the
# server does not report it.
235 local repaired=$($SHOW_NAMESPACE |
236 awk '/^dirent_repaired/ { print $2 }')
237 # for interop with old server
238 [ -z "$repaired" ] &&
239 repaired=$($SHOW_NAMESPACE |
240 awk '/^updated_phase1/ { print $2 }')
242 [ $repaired -eq 1 ] ||
243 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
247 mount_client $MOUNT || error "(6) Fail to start client!"
# The directory listing must succeed while fail_loc 0x1505 is armed.
249 #define OBD_FAIL_FID_LOOKUP 0x1505
250 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
251 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
253 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
255 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
# Body of test 1b (registered by run_test below; function header not shown).
# Skipped unless the MDS backend is ldiskfs. Creates one file while fail_loc
# 0x1502 is armed, runs namespace LFSCK with fail_loc 0x1506 armed, expects
# exactly one repair, then stats the file with fail_loc 0x1505.
259 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
260 skip "OI Scrub not implemented for ZFS" && return
264 #define OBD_FAIL_FID_INLMA 0x1502
265 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
266 touch $DIR/$tdir/dummy
268 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
270 #define OBD_FAIL_FID_NOLMA 0x1506
271 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
272 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
273 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
274 mdd.${MDT_DEV}.lfsck_namespace |
275 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
277 error "(4) unexpected status"
# Prefer the dirent_repaired counter; fall back to updated_phase1 when the
# server does not report it.
280 local repaired=$($SHOW_NAMESPACE |
281 awk '/^dirent_repaired/ { print $2 }')
282 # for interop with old server
283 [ -z "$repaired" ] &&
284 repaired=$($SHOW_NAMESPACE |
285 awk '/^updated_phase1/ { print $2 }')
287 [ $repaired -eq 1 ] ||
288 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
290 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
293 mount_client $MOUNT || error "(6) Fail to start client!"
# stat must succeed while fail_loc 0x1505 is armed.
295 #define OBD_FAIL_FID_LOOKUP 0x1505
296 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
297 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
299 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
301 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
# Body of test 1c (registered by run_test below; function header not shown).
# Creates one file while fail_loc 0x1504 is armed, runs namespace LFSCK,
# expects exactly one repair, then lists the directory with fail_loc 0x1505.
306 #define OBD_FAIL_FID_IGIF 0x1504
307 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
308 touch $DIR/$tdir/dummy
310 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
312 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
313 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
314 mdd.${MDT_DEV}.lfsck_namespace |
315 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
317 error "(4) unexpected status"
# Prefer the dirent_repaired counter; fall back to updated_phase1 when the
# server does not report it.
320 local repaired=$($SHOW_NAMESPACE |
321 awk '/^dirent_repaired/ { print $2 }')
322 # for interop with old server
323 [ -z "$repaired" ] &&
324 repaired=$($SHOW_NAMESPACE |
325 awk '/^updated_phase1/ { print $2 }')
327 [ $repaired -eq 1 ] ||
328 error "(5) Fail to repair lost FID-in-dirent: $repaired"
332 mount_client $MOUNT || error "(6) Fail to start client!"
# The directory listing must succeed while fail_loc 0x1505 is armed.
334 #define OBD_FAIL_FID_LOOKUP 0x1505
335 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
336 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
338 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
340 run_test 1c "LFSCK can find out and repair lost FID-in-dirent"
# Body of test 2a (registered by run_test below; function header not shown).
# Creates one file while fail_loc 0x1603 is armed, runs namespace LFSCK,
# expects exactly one repair, then checks the link count and that
# path2fid followed by fid2path returns the original path.
345 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
346 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
347 touch $DIR/$tdir/dummy
349 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
351 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
352 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
353 mdd.${MDT_DEV}.lfsck_namespace |
354 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
356 error "(4) unexpected status"
# Prefer the linkea_repaired counter; fall back to updated_phase2 when the
# server does not report it.
359 local repaired=$($SHOW_NAMESPACE |
360 awk '/^linkea_repaired/ { print $2 }')
361 # for interop with old server
362 [ -z "$repaired" ] &&
363 repaired=$($SHOW_NAMESPACE |
364 awk '/^updated_phase2/ { print $2 }')
366 [ $repaired -eq 1 ] ||
367 error "(5) Fail to repair crashed linkEA: $repaired"
371 mount_client $MOUNT || error "(6) Fail to start client!"
373 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
374 error "(7) Fail to stat $DIR/$tdir/dummy"
# Round-trip path -> FID -> path must give back the same name.
376 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
377 local dummyname=$($LFS fid2path $DIR $dummyfid)
378 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
379 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
381 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
# Body of test 2b (registered by run_test below; function header not shown).
# Creates one file while fail_loc 0x1604 is armed, runs namespace LFSCK,
# expects updated_phase2 to be exactly 1, then checks the link count and the
# path -> FID -> path round trip.
387 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
388 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
389 touch $DIR/$tdir/dummy
391 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
393 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
394 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
395 mdd.${MDT_DEV}.lfsck_namespace |
396 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
398 error "(4) unexpected status"
401 local repaired=$($SHOW_NAMESPACE |
402 awk '/^updated_phase2/ { print $2 }')
403 [ $repaired -eq 1 ] ||
404 error "(5) Fail to repair crashed linkEA: $repaired"
408 mount_client $MOUNT || error "(6) Fail to start client!"
410 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
411 error "(7) Fail to stat $DIR/$tdir/dummy"
# Round-trip path -> FID -> path must give back the same name.
413 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
414 local dummyname=$($LFS fid2path $DIR $dummyfid)
415 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
416 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
418 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
# Body of test 2c (registered by run_test below; function header not shown).
# Creates one file while fail_loc 0x1605 is armed, runs namespace LFSCK,
# expects updated_phase2 to be exactly 1, then checks the link count and the
# path -> FID -> path round trip.
424 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
425 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
426 touch $DIR/$tdir/dummy
428 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
430 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
431 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
432 mdd.${MDT_DEV}.lfsck_namespace |
433 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
435 error "(4) unexpected status"
438 local repaired=$($SHOW_NAMESPACE |
439 awk '/^updated_phase2/ { print $2 }')
440 [ $repaired -eq 1 ] ||
441 error "(5) Fail to repair crashed linkEA: $repaired"
445 mount_client $MOUNT || error "(6) Fail to start client!"
447 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
448 error "(7) Fail to stat $DIR/$tdir/dummy"
# Round-trip path -> FID -> path must give back the same name.
450 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
451 local dummyname=$($LFS fid2path $DIR $dummyfid)
452 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
453 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
455 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
# Body of test 2d (registered by run_test below; function header not shown).
# Creates one file while fail_loc 0x161d is armed, runs namespace LFSCK,
# expects linkea_repaired to be exactly 1, then checks the link count and the
# path -> FID -> path round trip.
461 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
462 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
463 touch $DIR/$tdir/dummy
465 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
467 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
468 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
469 mdd.${MDT_DEV}.lfsck_namespace |
470 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
472 error "(4) unexpected status"
475 local repaired=$($SHOW_NAMESPACE |
476 awk '/^linkea_repaired/ { print $2 }')
477 [ $repaired -eq 1 ] ||
478 error "(5) Fail to repair crashed linkEA: $repaired"
482 mount_client $MOUNT || error "(6) Fail to start client!"
484 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
485 error "(7) Fail to stat $DIR/$tdir/dummy"
# Round-trip path -> FID -> path must give back the same name.
487 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
488 local dummyname=$($LFS fid2path $DIR $dummyfid)
489 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
490 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
492 run_test 2d "LFSCK can recover the missing linkEA entry"
# Body of test 2e (registered by run_test below; function header not shown).
# Needs at least two MDTs. Creates d0 with "-i 1" and, while fail_loc 0x1603
# is armed, d0/d1 with "-i 0"; then runs namespace LFSCK with "-r -A",
# expects linkea_repaired to be exactly 1 and checks the path -> FID -> path
# round trip for d0/d1.
496 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
500 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
502 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
503 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
504 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
505 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
507 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
# Arguments are: component, expected status, error tag.
509 wait_all_targets_blocked namespace completed 4
511 local repaired=$($SHOW_NAMESPACE |
512 awk '/^linkea_repaired/ { print $2 }')
513 [ $repaired -eq 1 ] ||
514 error "(5) Fail to repair crashed linkEA: $repaired"
516 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
517 local name=$($LFS fid2path $DIR $fid)
518 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
519 error "(6) Fail to repair linkEA: $fid $name"
521 run_test 2e "namespace LFSCK can verify remote object linkEA"
# Body of test 3 (registered by run_test below; function header not shown).
# Builds hard links (two without fault injection, one under fail_loc 0x1603
# and one under fail_loc 0x1604), runs namespace LFSCK and checks the
# checked_phase2 and multiple_linked_repaired counters.
# NOTE(review): d0/f0 and d0/f1 are assumed to exist already (created by
# preparation code not shown here) - confirm.
527 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
528 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
529 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
531 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
532 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
533 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
535 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
536 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
537 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
539 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
540 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
541 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
543 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
545 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
546 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
547 mdd.${MDT_DEV}.lfsck_namespace |
548 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
550 error "(10) unexpected status"
# At least 4 objects must have been checked in phase 2 ...
553 local checked=$($SHOW_NAMESPACE |
554 awk '/^checked_phase2/ { print $2 }')
555 [ $checked -ge 4 ] ||
556 error "(11) Fail to check multiple-linked object: $checked"
# ... and at least 2 multiple-linked objects repaired.
558 local repaired=$($SHOW_NAMESPACE |
559 awk '/^multiple_linked_repaired/ { print $2 }')
560 [ $repaired -ge 2 ] ||
561 error "(12) Fail to repair multiple-linked object: $repaired"
563 run_test 3 "LFSCK can verify multiple-linked objects"
# Body of test 4 (registered by run_test below; function header not shown).
# Skipped unless the MDS backend is ldiskfs. Unmounts the client, stops the
# MDS, runs mds_backup_restore, restarts the MDS with noscrub, then runs
# namespace LFSCK and checks its flags, status and repair counter.
567 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
568 skip "OI Scrub not implemented for ZFS" && return
571 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
572 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
574 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
575 echo "start $SINGLEMDS with disabling OI scrub"
576 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
577 error "(2) Fail to start MDS!"
# With fail_loc 0x1601 armed, wait until the flags field reads "inconsistent"
# and confirm the scan is still in phase 1.
579 #define OBD_FAIL_LFSCK_DELAY2 0x1601
580 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
581 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
582 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
583 mdd.${MDT_DEV}.lfsck_namespace |
584 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
586 error "(5) unexpected status"
589 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
590 [ "$STATUS" == "scanning-phase1" ] ||
591 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the injected failure and wait for completion; flags must then be empty.
593 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
594 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
595 mdd.${MDT_DEV}.lfsck_namespace |
596 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
598 error "(7) unexpected status"
# NOTE(review): FLAGS is assigned without "local" in the lines shown here.
601 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
602 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
# Prefer the dirent_repaired counter; fall back to updated_phase1 when the
# server does not report it. At least 9 repairs are required.
604 local repaired=$($SHOW_NAMESPACE |
605 awk '/^dirent_repaired/ { print $2 }')
606 # for interop with old server
607 [ -z "$repaired" ] &&
608 repaired=$($SHOW_NAMESPACE |
609 awk '/^updated_phase1/ { print $2 }')
611 [ $repaired -ge 9 ] ||
612 error "(9) Fail to re-generate FID-in-dirent: $repaired"
616 mount_client $MOUNT || error "(10) Fail to start client!"
618 #define OBD_FAIL_FID_LOOKUP 0x1505
619 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
620 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
621 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
623 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
# Body of test 5 (registered by run_test below; function header not shown).
# Skipped unless the MDS backend is ldiskfs. Same backup/restore + LFSCK flow
# as test 4, but mds_backup_restore gets an extra argument "1", the expected
# flags are "inconsistent,upgrade", at least 2 repairs are required, and the
# path -> FID -> path round trip is checked at the end.
627 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
628 skip "OI Scrub not implemented for ZFS" && return
631 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
632 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
634 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
635 echo "start $SINGLEMDS with disabling OI scrub"
636 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
637 error "(2) Fail to start MDS!"
639 #define OBD_FAIL_LFSCK_DELAY2 0x1601
640 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
641 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
642 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
643 mdd.${MDT_DEV}.lfsck_namespace |
644 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
646 error "(5) unexpected status"
649 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
650 [ "$STATUS" == "scanning-phase1" ] ||
651 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the injected failure and wait for completion; flags must then be empty.
653 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
654 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
655 mdd.${MDT_DEV}.lfsck_namespace |
656 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
658 error "(7) unexpected status"
# NOTE(review): FLAGS is assigned without "local" in the lines shown here.
661 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
662 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
# Prefer the dirent_repaired counter; fall back to updated_phase1 when the
# server does not report it.
664 local repaired=$($SHOW_NAMESPACE |
665 awk '/^dirent_repaired/ { print $2 }')
666 # for interop with old server
667 [ -z "$repaired" ] &&
668 repaired=$($SHOW_NAMESPACE |
669 awk '/^updated_phase1/ { print $2 }')
671 [ $repaired -ge 2 ] ||
672 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
676 mount_client $MOUNT || error "(10) Fail to start client!"
# Both stat and ls must succeed while fail_loc 0x1505 is armed.
678 #define OBD_FAIL_FID_LOOKUP 0x1505
679 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
680 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
682 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
684 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
685 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
686 local dummyname=$($LFS fid2path $DIR $dummyfid)
687 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
688 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
690 run_test 5 "LFSCK can handle IGIF object upgrading"
# Body of test 6a (registered by run_test below; function header not shown).
# Starts namespace LFSCK under fail_loc 0x1600, forces it to fail with
# fail_loc 0x80001608, records last_checkpoint_position, restarts without -r
# and requires latest_start_position to be larger than that checkpoint.
# NOTE(review): the sleep mentioned below and the tails of the POS0/POS1
# pipelines are not shown in this listing.
695 #define OBD_FAIL_LFSCK_DELAY1 0x1600
696 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
697 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
699 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
700 [ "$STATUS" == "scanning-phase1" ] ||
701 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
703 # Sleep 3 sec to guarantee that at least one object is processed by LFSCK
705 # Fail the LFSCK to guarantee there is at least one checkpoint
706 #define OBD_FAIL_LFSCK_FATAL1 0x1608
707 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
708 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
709 mdd.${MDT_DEV}.lfsck_namespace |
710 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
712 error "(4) unexpected status"
715 local POS0=$($SHOW_NAMESPACE |
716 awk '/^last_checkpoint_position/ { print $2 }' |
719 #define OBD_FAIL_LFSCK_DELAY1 0x1600
720 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
721 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
723 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
724 [ "$STATUS" == "scanning-phase1" ] ||
725 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
727 local POS1=$($SHOW_NAMESPACE |
728 awk '/^latest_start_position/ { print $2 }' |
730 [[ $POS0 -lt $POS1 ]] ||
731 error "(7) Expect larger than: $POS0, but got $POS1"
733 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
734 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
735 mdd.${MDT_DEV}.lfsck_namespace |
736 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
738 error "(8) unexpected status"
741 run_test 6a "LFSCK resumes from last checkpoint (1)"
# Body of test 6b (registered by run_test below; function header not shown).
# Like test 6a, but uses fail_loc 0x1601 / 0x80001609 and reads two fields
# from each position line: field 2 (O_POS*) and field 4 (D_POS*). When either
# D_POS value is "N/A" or "0x0" the O_POS values are compared; the D_POS
# comparison follows (the else/fi lines are not shown in this listing).
746 #define OBD_FAIL_LFSCK_DELAY2 0x1601
747 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
748 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
750 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
751 [ "$STATUS" == "scanning-phase1" ] ||
752 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
754 # Sleep 5 sec to guarantee that we are in the directory scanning
756 # Fail the LFSCK to guarantee there is at least one checkpoint
757 #define OBD_FAIL_LFSCK_FATAL2 0x1609
758 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
759 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
760 mdd.${MDT_DEV}.lfsck_namespace |
761 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
763 error "(4) unexpected status"
766 local O_POS0=$($SHOW_NAMESPACE |
767 awk '/^last_checkpoint_position/ { print $2 }' |
770 local D_POS0=$($SHOW_NAMESPACE |
771 awk '/^last_checkpoint_position/ { print $4 }')
773 #define OBD_FAIL_LFSCK_DELAY2 0x1601
774 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
775 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
777 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
778 [ "$STATUS" == "scanning-phase1" ] ||
779 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
781 local O_POS1=$($SHOW_NAMESPACE |
782 awk '/^latest_start_position/ { print $2 }' |
784 local D_POS1=$($SHOW_NAMESPACE |
785 awk '/^latest_start_position/ { print $4 }')
787 echo "Additional debug for 6b"
789 if [ "$D_POS0" == "N/A" -o "$D_POS0" == "0x0" \
790 -o "$D_POS1" == "0x0" -o "$D_POS1" == "N/A" ]; then
791 [[ $O_POS0 -lt $O_POS1 ]] ||
792 error "(7.1) $O_POS1 is not larger than $O_POS0"
794 [[ $D_POS0 -lt $D_POS1 ]] ||
795 error "(7.2) $D_POS1 is not larger than $D_POS0"
798 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
799 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
800 mdd.${MDT_DEV}.lfsck_namespace |
801 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
803 error "(8) unexpected status"
806 run_test 6b "LFSCK resumes from last checkpoint (2)"
# Body of test 7a (registered by run_test below; function header not shown).
# Starts namespace LFSCK under fail_loc 0x1601, stops and restarts the MDS
# while the scan is in phase 1, and then waits for the status to become
# "completed" without issuing another lfsck_start.
813 #define OBD_FAIL_LFSCK_DELAY2 0x1601
814 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
815 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
817 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
818 [ "$STATUS" == "scanning-phase1" ] ||
819 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
821 # Sleep 3 sec to guarantee that at least one object is processed by LFSCK
823 echo "stop $SINGLEMDS"
824 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop MDS!"
# The fail_loc is cleared before the MDS is started again.
826 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
827 echo "start $SINGLEMDS"
828 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
829 error "(5) Fail to start MDS!"
831 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
832 mdd.${MDT_DEV}.lfsck_namespace |
833 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
835 error "(6) unexpected status"
838 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
# Body of test 7b (registered by run_test below; function header not shown).
# Creates 20 files while fail_loc 0x1604 is armed, starts namespace LFSCK
# under fail_loc 0x1602, waits for scanning-phase2, restarts the MDS, and
# then waits for "completed" without issuing another lfsck_start.
844 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
845 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
846 for ((i = 0; i < 20; i++)); do
847 touch $DIR/$tdir/dummy${i}
850 #define OBD_FAIL_LFSCK_DELAY3 0x1602
851 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
852 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
853 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
854 mdd.${MDT_DEV}.lfsck_namespace |
855 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
857 error "(4) unexpected status"
861 echo "stop $SINGLEMDS"
862 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop MDS!"
# The fail_loc is cleared before the MDS is started again.
864 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
865 echo "start $SINGLEMDS"
866 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
867 error "(6) Fail to start MDS!"
869 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
870 mdd.${MDT_DEV}.lfsck_namespace |
871 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
873 error "(7) unexpected status"
876 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
# Body of test 8 (registered by run_test below; function header not shown).
# Walks the namespace LFSCK through the statuses checked below, in order:
# init, scanning-phase1, stopped, scanning-phase1, failed, scanning-phase1,
# crashed, scanning-phase1, paused (twice), scanning-phase2, completed.
# NOTE(review): several original lines are missing from this listing,
# including the setup between formatall and the first status check, the
# "timer" initialisation and the sleep/increment/done lines of the first two
# recovery loops.
881 formatall > /dev/null
887 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
888 [ "$STATUS" == "init" ] ||
889 error "(2) Expect 'init', but got '$STATUS'"
# Create one directory under fail_loc 0x1603 and five files under 0x1604.
891 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
892 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
893 mkdir $DIR/$tdir/crashed
895 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
896 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
897 for ((i = 0; i < 5; i++)); do
898 touch $DIR/$tdir/dummy${i}
901 umount_client $MOUNT || error "(3) Fail to stop client!"
# start -> scanning-phase1 -> stop -> stopped -> start -> scanning-phase1
903 #define OBD_FAIL_LFSCK_DELAY2 0x1601
904 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
905 $START_NAMESPACE || error "(4) Fail to start LFSCK for namespace!"
907 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
908 [ "$STATUS" == "scanning-phase1" ] ||
909 error "(5) Expect 'scanning-phase1', but got '$STATUS'"
911 $STOP_LFSCK || error "(6) Fail to stop LFSCK!"
913 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
914 [ "$STATUS" == "stopped" ] ||
915 error "(7) Expect 'stopped', but got '$STATUS'"
917 $START_NAMESPACE || error "(8) Fail to start LFSCK for namespace!"
919 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
920 [ "$STATUS" == "scanning-phase1" ] ||
921 error "(9) Expect 'scanning-phase1', but got '$STATUS'"
# Force the running scan into the "failed" status.
923 #define OBD_FAIL_LFSCK_FATAL2 0x1609
924 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
925 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
926 mdd.${MDT_DEV}.lfsck_namespace |
927 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
929 error "(10) unexpected status"
932 #define OBD_FAIL_LFSCK_DELAY1 0x1600
933 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
934 $START_NAMESPACE || error "(11) Fail to start LFSCK for namespace!"
936 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
937 [ "$STATUS" == "scanning-phase1" ] ||
938 error "(12) Expect 'scanning-phase1', but got '$STATUS'"
# Restart the MDS with fail_loc 0x160a set before the stop and 0x160b set
# before the start; "crashed" is expected afterwards.
940 #define OBD_FAIL_LFSCK_CRASH 0x160a
941 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
944 echo "stop $SINGLEMDS"
945 stop $SINGLEMDS > /dev/null || error "(13) Fail to stop MDS!"
947 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
948 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
950 echo "start $SINGLEMDS"
951 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
952 error "(14) Fail to start MDS!"
# Poll recovery_status until it is no longer "RECOVERING", bounded by
# max_recovery_time.
954 local timeout=$(max_recovery_time)
957 while [ $timer -lt $timeout ]; do
958 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
959 mdt.${MDT_DEV}.recovery_status |
960 awk '/^status/ { print \\\$2 }'")
961 [ "$STATUS" != "RECOVERING" ] && break;
966 [ $timer != $timeout ] ||
967 error "(14.1) recovery timeout"
969 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
970 [ "$STATUS" == "crashed" ] ||
971 error "(15) Expect 'crashed', but got '$STATUS'"
973 #define OBD_FAIL_LFSCK_DELAY2 0x1601
974 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
975 $START_NAMESPACE || error "(16) Fail to start LFSCK for namespace!"
977 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
978 [ "$STATUS" == "scanning-phase1" ] ||
979 error "(17) Expect 'scanning-phase1', but got '$STATUS'"
# Restart the MDS during a running scan with fail_loc 0x160b armed; "paused"
# is expected afterwards.
981 echo "stop $SINGLEMDS"
982 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop MDS!"
984 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
985 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
987 echo "start $SINGLEMDS"
988 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
989 error "(19) Fail to start MDS!"
992 while [ $timer -lt $timeout ]; do
993 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
994 mdt.${MDT_DEV}.recovery_status |
995 awk '/^status/ { print \\\$2 }'")
996 [ "$STATUS" != "RECOVERING" ] && break;
1001 [ $timer != $timeout ] ||
1002 error "(19.1) recovery timeout"
1004 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1005 [ "$STATUS" == "paused" ] ||
1006 error "(20) Expect 'paused', but got '$STATUS'"
# Restart once more, this time mounting with skip_lfsck; "paused" is still
# expected.
1008 echo "stop $SINGLEMDS"
1009 stop $SINGLEMDS > /dev/null || error "(20.1) Fail to stop MDS!"
1011 echo "start $SINGLEMDS without resume LFSCK"
1012 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SKIP_LFSCK > /dev/null ||
1013 error "(20.2) Fail to start MDS!"
1016 while [ $timer -lt $timeout ]; do
1017 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
1018 mdt.${MDT_DEV}.recovery_status |
1019 awk '/^status/ { print \\\$2 }'")
1020 [ "$STATUS" != "RECOVERING" ] && break;
1022 timer=$((timer + 1))
1025 [ $timer != $timeout ] ||
1026 error "(20.3) recovery timeout"
1028 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1029 [ "$STATUS" == "paused" ] ||
1030 error "(20.4) Expect 'paused', but got '$STATUS'"
# Resume under fail_loc 0x1602, wait for phase 2 and check the flags there.
1032 #define OBD_FAIL_LFSCK_DELAY3 0x1602
1033 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
1035 $START_NAMESPACE || error "(21) Fail to start LFSCK for namespace!"
1036 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1037 mdd.${MDT_DEV}.lfsck_namespace |
1038 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
1040 error "(22) unexpected status"
1043 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1044 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
1045 error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
# Clear the injected failure, wait for completion; flags must then be empty.
1047 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1048 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1049 mdd.${MDT_DEV}.lfsck_namespace |
1050 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1052 error "(24) unexpected status"
1055 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1056 [ -z "$FLAGS" ] || error "(25) Expect empty flags, but got '$FLAGS'"
1058 run_test 8 "LFSCK state machine"
# Test 9a: speed control of the layout LFSCK (registered by the run_test
# line that closes this block).
# NOTE(review): RUN_TIME1, RUN_TIME2 and TIME_DIFF are read below but are not
# assigned on any line visible in this block -- confirm where they are set.
#
# Skip unless /proc/cpuinfo contains a "processor ...: 1" entry.
1061 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1062 skip "Testing on UP system, the speed may be inaccurate."
# Prepare the test directory and fill a sub-directory with 5000 files so
# the scan has something to walk through.
1066 check_mount_and_prep
1067 $LFS mkdir -i 0 $DIR/$tdir/lfsck || error "(1) Fail to mkdir lfsck"
1068 $LFS setstripe -c 1 -i -1 $DIR/$tdir/lfsck
1069 createmany -o $DIR/$tdir/lfsck/f 5000
# Phase A: start the layout LFSCK with BASE_SPEED1 passed through -s.
1071 local BASE_SPEED1=100
1073 $START_LAYOUT -r -s $BASE_SPEED1 || error "(2) Fail to start LFSCK!"
# The scan must still be in phase 1 when it is sampled.
1076 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
1077 [ "$STATUS" == "scanning-phase1" ] ||
1078 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
# Sample the reported phase-1 average speed.
1080 local SPEED=$($SHOW_LAYOUT |
1081 awk '/^average_speed_phase1/ { print $2 }')
1083 # There may be time error, normally it should be less than 2 seconds.
1084 # We allow another 20% schedule error.
1086 # MAX_MARGIN = 1.3 = 13 / 10
# Upper bound (integer arithmetic): BASE_SPEED1 scaled by
# (RUN_TIME1 + TIME_DIFF) / RUN_TIME1, then by 13/10.
1087 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1088 RUN_TIME1 * 13 / 10))
# Fail with diagnostics when the sampled speed is not below the bound.
1089 [ $SPEED -lt $MAX_SPEED ] || {
1091 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1092 error "(4) Speed $SPEED, expected < $MAX_SPEED"
1095 # adjust speed limit
# Phase B: raise the limit on the running scan through the
# mdd.<MDT>.lfsck_speed_limit parameter.
1096 local BASE_SPEED2=300
1098 do_facet $SINGLEMDS \
1099 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
# Re-sample the average speed, which now covers both phases.
1102 SPEED=$($SHOW_LAYOUT | awk '/^average_speed_phase1/ { print $2 }')
1103 # MIN_MARGIN = 0.7 = 7 / 10
# Lower bound: time-weighted mean of both limits, each run time shortened by
# TIME_DIFF, divided by the total run time and scaled by 7/10.
1104 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1105 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1106 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
# When the speed is too low: a non-ldiskfs MDS backend reports it through
# error_ignore with LU-5624. The "(5.2)" message below belongs to a branch
# whose command line is not visible here.
1107 [ $SPEED -gt $MIN_SPEED ] || {
1108 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1109 error_ignore LU-5624 \
1110 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
1113 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
1117 # MAX_MARGIN = 1.3 = 13 / 10
# Upper bound for the combined run: same weighting as above but with each
# run time lengthened by TIME_DIFF and a 13/10 margin.
1118 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1119 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1120 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1121 [ $SPEED -lt $MAX_SPEED ] || {
1123 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1124 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1125 error "(6) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit to 0 on every MDT node and every OST node.
1128 do_nodes $(comma_list $(mdts_nodes)) \
1129 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1130 do_nodes $(comma_list $(osts_nodes)) \
1131 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
# Wait for the layout LFSCK status on the MDT to read "completed"
# (wait_update_facet is given 30 as its last argument).
1133 wait_update_facet $SINGLEMDS \
1134 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout |
1135 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
1136 error "(7) Failed to get expected 'completed'"
1138 run_test 9a "LFSCK speed control (1)"
# Test 9b: speed control of the namespace LFSCK during phase 2 (registered
# by the run_test line that closes this block).
# NOTE(review): RUN_TIME1, RUN_TIME2 and TIME_DIFF are read below but are not
# assigned on any line visible in this block -- confirm where they are set.
#
# Skip unless /proc/cpuinfo contains a "processor ...: 1" entry.
1141 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1142 skip "Testing on UP system, the speed may be inaccurate."
# Create 50 directories, 50 files, and 50 files inside each directory while
# fail_loc 0x1604 (OBD_FAIL_LFSCK_LINKEA_MORE) is set on the MDS.
1148 echo "Preparing another 50 * 50 files (with error) at $(date)."
1149 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1150 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1151 createmany -d $DIR/$tdir/d 50
1152 createmany -m $DIR/$tdir/f 50
1153 for ((i = 0; i < 50; i++)); do
1154 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
# Run one namespace scan with fail_loc 0x160c
# (OBD_FAIL_LFSCK_NO_DOUBLESCAN) and wait for it to report "stopped".
1157 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
1158 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1159 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
1160 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1161 mdd.${MDT_DEV}.lfsck_namespace |
1162 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1164 error "(5) unexpected status"
# Clear the fault injection; preparation is done.
1167 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1168 echo "Prepared at $(date)."
# Phase A: start the namespace LFSCK (no -r this time) with BASE_SPEED1
# passed through -s.
1170 local BASE_SPEED1=50
1172 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
# The scan is expected to be in phase 2 when sampled.
1175 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1176 [ "$STATUS" == "scanning-phase2" ] ||
1177 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
# Sample the reported phase-2 average speed.
1179 local SPEED=$($SHOW_NAMESPACE |
1180 awk '/^average_speed_phase2/ { print $2 }')
1181 # There may be time error, normally it should be less than 2 seconds.
1182 # We allow another 20% schedule error.
1184 # MAX_MARGIN = 1.3 = 13 / 10
# Upper bound (integer arithmetic): BASE_SPEED1 scaled by
# (RUN_TIME1 + TIME_DIFF) / RUN_TIME1, then by 13/10.
1185 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1186 RUN_TIME1 * 13 / 10))
1187 [ $SPEED -lt $MAX_SPEED ] || {
1189 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1190 error "(8) Speed $SPEED, expected < $MAX_SPEED"
1193 # adjust speed limit
# Phase B: raise the limit on the running scan.
1194 local BASE_SPEED2=150
1196 do_facet $SINGLEMDS \
1197 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
# Re-sample the average speed, which now covers both phases.
1200 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
1201 # MIN_MARGIN = 0.7 = 7 / 10
# Lower bound: time-weighted mean of both limits, each run time shortened by
# TIME_DIFF, divided by the total run time and scaled by 7/10.
1202 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1203 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1204 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
# When the speed is too low: a non-ldiskfs MDS backend reports it through
# error_ignore with LU-5624. The "(9.2)" message below belongs to a branch
# whose command line is not visible here.
1205 [ $SPEED -gt $MIN_SPEED ] || {
1206 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1207 error_ignore LU-5624 \
1208 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1211 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
1215 # MAX_MARGIN = 1.3 = 13 / 10
# Upper bound for the combined run, with each run time lengthened by
# TIME_DIFF and a 13/10 margin.
1216 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1217 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1218 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1219 [ $SPEED -lt $MAX_SPEED ] || {
1221 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1222 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1223 error "(10) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit to 0 on every MDT node and every OST node, then wait
# for the namespace LFSCK to report "completed".
1226 do_nodes $(comma_list $(mdts_nodes)) \
1227 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1228 do_nodes $(comma_list $(osts_nodes)) \
1229 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1230 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1231 mdd.${MDT_DEV}.lfsck_namespace |
1232 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1234 error "(11) unexpected status"
1237 run_test 9b "LFSCK speed control (2)"
# Test 10: ordinary file operations must keep working while the namespace
# LFSCK is scanning (registered by the run_test line closing this block).
#
# Only runs when the MDS backend is ldiskfs.
1241 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
1242 skip "lookup(..)/linkea on ZFS issue" && return
# Populate even-numbered d<i>/f<i> entries with fail_loc 0x1603
# (OBD_FAIL_LFSCK_LINKEA_CRASH) set on the MDS.
1246 echo "Preparing more files with error at $(date)."
1247 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1248 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
1250 for ((i = 0; i < 1000; i = $((i+2)))); do
1251 mkdir -p $DIR/$tdir/d${i}
1252 touch $DIR/$tdir/f${i}
1253 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
# Populate odd-numbered entries with fail_loc 0x1604
# (OBD_FAIL_LFSCK_LINKEA_MORE) instead.
1256 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1257 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1259 for ((i = 1; i < 1000; i = $((i+2)))); do
1260 mkdir -p $DIR/$tdir/d${i}
1261 touch $DIR/$tdir/f${i}
1262 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
# Clear the fault injection; preparation is done.
1265 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1266 echo "Prepared at $(date)."
# Add one extra hard link to f200 (it is unlinked by name further down).
1268 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
# Remount the client before starting the scan.
1270 umount_client $MOUNT
1271 mount_client $MOUNT || error "(3) Fail to start client!"
# Start a namespace LFSCK with -s 100 and confirm it is in phase 1.
1273 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1276 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1277 [ "$STATUS" == "scanning-phase1" ] ||
1278 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Exercise the filesystem: recursive listing, create, mkdir, unlink,
# recursive remove, rename, hard link and symlink. Each must succeed.
1280 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1282 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1284 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1286 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1288 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1290 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1292 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1294 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1295 error "(14) Fail to softlink!"
# The scan must still be in phase 1 after those operations.
1297 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1298 [ "$STATUS" == "scanning-phase1" ] ||
1299 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
# Set lfsck_speed_limit to 0 on all MDT and OST nodes and wait for the
# namespace LFSCK to report "completed".
1301 do_nodes $(comma_list $(mdts_nodes)) \
1302 $LCTL set_param -n mdd.*.lfsck_speed_limit 0
1303 do_nodes $(comma_list $(osts_nodes)) \
1304 $LCTL set_param -n obdfilter.*.lfsck_speed_limit 0
1305 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1306 mdd.${MDT_DEV}.lfsck_namespace |
1307 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1309 error "(16) unexpected status"
1312 run_test 10 "System is available during LFSCK scanning"
# ost_remove_lastid: delete the LAST_ID file (and d0/0) under O/<idx> on the
# locally mounted backing filesystem of ost<ost>.
# Returns 1 if the local mount fails and 2 if the unmount fails.
# NOTE(review): "ost" and "idx" are not assigned on any line visible here;
# presumably they come from the positional arguments -- confirm.
1315 ost_remove_lastid() {
# Prefix used to run the removal on the OST facet.
1318 local rcmd="do_facet ost${ost}"
1320 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1322 # step 1: local mount
1323 mount_fstype ost${ost} || return 1
1324 # step 2: remove the specified LAST_ID
# The {LAST_ID,d0/0} brace list is expanded by this shell into two paths
# before the command is handed to do_facet.
1325 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
1327 unmount_fstype ost${ost} || return 2
# Test 11a: the layout LFSCK on an OST rebuilds a removed LAST_ID file
# (registered by the run_test line closing this block).
#
# Create 64 single-stripe files placed on OST index 0.
1331 check_mount_and_prep
1332 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1333 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
# Remove LAST_ID through the ost_remove_lastid helper (arguments "1 0").
# NOTE(review): the helper mounts the OST backend locally, so the OST is
# presumably stopped on a line not visible here -- confirm.
1338 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
# Bring ost1 back using the $MOUNT_OPTS_NOSCRUB mount options.
1340 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1341 error "(2) Fail to start ost1"
# Inject OBD_FAIL_LFSCK_DELAY4 with fail_val=3 on ost1 before starting
# the scan.
1343 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1344 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1346 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1347 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
# While the fault is injected the OST must report the "crashed_lastid" flag.
1349 wait_update_facet ost1 "$LCTL get_param -n \
1350 obdfilter.${OST_DEV}.lfsck_layout |
1351 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1353 error "(5) unexpected status"
# Remove the fault injection and wait for the scan to complete.
1356 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1358 wait_update_facet ost1 "$LCTL get_param -n \
1359 obdfilter.${OST_DEV}.lfsck_layout |
1360 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1362 error "(6) unexpected status"
# After completion the flags field must be empty.
1365 echo "the LAST_ID(s) should have been rebuilt"
1366 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1367 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
1369 run_test 11a "LFSCK can rebuild lost last_id"
# Test 11b: the layout LFSCK on an OST rebuilds an on-disk LAST_ID that is
# behind the expected value (registered by the run_test line closing this
# block).
1372 check_mount_and_prep
1373 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1375 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1376 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1377 do_facet ost1 $LCTL set_param fail_loc=0x160d
# Create 32 more files than the value returned by precreated_ost_obj_count.
1379 local count=$(precreated_ost_obj_count 0 0)
1381 createmany -o $DIR/$tdir/f $((count + 32))
# Read the sequence from the MDS-side osp device, then pick the matching
# "<seq>:<id>" entry from the OST last_id parameter and keep the part after
# the colon as lastid1.
1383 local proc_path="${FSNAME}-OST0000-osc-MDT0000"
1384 local seq=$(do_facet mds1 $LCTL get_param -n \
1385 osp.${proc_path}.prealloc_last_seq)
1386 local lastid1=$(do_facet ost1 "lctl get_param -n \
1387 obdfilter.${ost1_svc}.last_id" | grep $seq |
1388 awk -F: '{ print $2 }')
# Unmount the client and stop ost1.
1390 umount_client $MOUNT
1391 stop ost1 || error "(1) Fail to stop ost1"
# Restart ost1 with fail_loc 0x215 (OBD_FAIL_OST_ENOSPC) set.
1393 #define OBD_FAIL_OST_ENOSPC 0x215
1394 do_facet ost1 $LCTL set_param fail_loc=0x215
1396 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1397 error "(2) Fail to start ost1"
# Poll (at most 60 iterations) until last_id for the sequence is readable.
# NOTE(review): lastid2 is not declared local on any visible line.
1399 for ((i = 0; i < 60; i++)); do
1400 lastid2=$(do_facet ost1 "lctl get_param -n \
1401 obdfilter.${ost1_svc}.last_id" | grep $seq |
1402 awk -F: '{ print $2 }')
1403 [ ! -z $lastid2 ] && break;
# The value read after the restart must be smaller than the one recorded
# before it.
1407 echo "the on-disk LAST_ID should be smaller than the expected one"
1408 [ $lastid1 -gt $lastid2 ] ||
1409 error "(4) expect lastid1 [ $lastid1 ] > lastid2 [ $lastid2 ]"
# Run the layout LFSCK on the OST and wait for it to complete.
1411 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1412 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1414 wait_update_facet ost1 "$LCTL get_param -n \
1415 obdfilter.${OST_DEV}.lfsck_layout |
1416 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1418 error "(6) unexpected status"
# Restart ost1 once more so last_id is read again after the repair.
1421 stop ost1 || error "(7) Fail to stop ost1"
1423 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1424 error "(8) Fail to start ost1"
# last_id for the sequence must now equal lastid1; on failure the raw
# parameter is printed before raising the error.
1426 echo "the on-disk LAST_ID should have been rebuilt"
1427 wait_update_facet ost1 "$LCTL get_param -n \
1428 obdfilter.${ost1_svc}.last_id | grep $seq |
1429 awk -F: '{ print \\\$2 }'" "$lastid1" 60 || {
1430 do_facet ost1 $LCTL get_param -n \
1431 obdfilter.${ost1_svc}.last_id
1432 error "(9) expect lastid1 $seq:$lastid1"
# Clear the fault injection and stop everything.
1435 do_facet ost1 $LCTL set_param fail_loc=0
1436 stopall || error "(10) Fail to stopall"
1438 run_test 11b "LFSCK can rebuild crashed last_id"
# Test 12a: one lctl command (-A) starts and stops LFSCK on all targets,
# first for the namespace type and then for the layout type (registered by
# the run_test line closing this block).
# NOTE(review): the trailing number given to wait_all_targets and
# wait_all_targets_blocked follows the (N) step numbering of the error
# messages -- presumably a step id; confirm against those helpers.
1441 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# Create one directory per MDT (index k-1), each holding 100 files.
1443 check_mount_and_prep
1444 for k in $(seq $MDSCOUNT); do
1445 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1446 createmany -o $DIR/$tdir/${k}/f 100 ||
1447 error "(0) Fail to create 100 files."
# Namespace LFSCK: start everywhere with -s 1, expect phase 1.
1450 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1451 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1452 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1454 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1455 wait_all_targets namespace scanning-phase1 3
# Stop everywhere with one command, expect "stopped".
1457 echo "Stop namespace LFSCK on all targets by single lctl command."
1458 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1459 error "(4) Fail to stop LFSCK on all devices!"
1461 echo "All the LFSCK targets should be in 'stopped' status."
1462 wait_all_targets_blocked namespace stopped 5
# Restart with -s 0, expect "completed".
1464 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1465 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1466 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1468 echo "All the LFSCK targets should be in 'completed' status."
1469 wait_all_targets_blocked namespace completed 7
# Full debug logging brackets the layout part of the test.
1471 start_full_debug_logging
# Layout LFSCK: same start / stop / restart sequence.
1473 echo "Start layout LFSCK on all targets by single command (-s 1)."
1474 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1475 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1477 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1478 wait_all_targets layout scanning-phase1 9
1480 echo "Stop layout LFSCK on all targets by single lctl command."
1481 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1482 error "(10) Fail to stop LFSCK on all devices!"
1484 echo "All the LFSCK targets should be in 'stopped' status."
1485 wait_all_targets_blocked layout stopped 11
# Check each OST directly: its layout LFSCK status must be "stopped" too.
1487 for k in $(seq $OSTCOUNT); do
1488 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1489 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1490 awk '/^status/ { print $2 }')
1491 [ "$STATUS" == "stopped" ] ||
1492 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1495 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1496 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1497 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1499 echo "All the LFSCK targets should be in 'completed' status."
1500 wait_all_targets_blocked layout completed 14
1502 stop_full_debug_logging
1504 run_test 12a "single command to trigger LFSCK on all devices"
# Test 12b: lfsck_start works without an explicit -M device (registered by
# the run_test line closing this block).
1507 check_mount_and_prep
# With -A but no -M the command must succeed, and both LFSCK types must
# reach "completed".
1509 echo "Start LFSCK without '-M' specified."
1510 do_facet mds1 $LCTL lfsck_start -A -r ||
1511 error "(0) Fail to start LFSCK without '-M'"
1513 wait_all_targets_blocked namespace completed 1
1514 wait_all_targets_blocked layout completed 2
# Count the entries in "lctl dl" on mds1 whose third column contains "mdt".
1516 local count=$(do_facet mds1 $LCTL dl |
1517 awk '{ print $3 }' | grep mdt | wc -l)
# With more than one such entry, a start without -M and without -A is
# expected to fail. The trailing "|| true" keeps that expected failure from
# becoming the status of the list.
1518 if [ $count -gt 1 ]; then
1520 echo "Start layout LFSCK on the node with multipe targets,"
1521 echo "but not specify '-M'/'-A' option. Should get failure."
1523 do_facet mds1 $LCTL lfsck_start -t layout -r &&
1524 error "(3) Start layout LFSCK should fail" || true
1527 run_test 12b "auto detect Lustre device"
# Test 13: the layout LFSCK repairs a bad lmm_oi in the layout EA
# (registered by the run_test line closing this block).
1531 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1532 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1533 echo "MDT-object FID."
1536 check_mount_and_prep
# With fail_loc 0x160f (OBD_FAIL_LFSCK_BAD_LMMOI) set on the MDS, create
# one plain file through createmany and one composite (-E ...) file, f1.
1538 echo "Inject failure stub to simulate bad lmm_oi"
1539 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1540 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
1541 createmany -o $DIR/$tdir/f 1
1542 $LFS setstripe -E 1M -S 1M -E -1 $DIR/$tdir/f1 ||
1543 error "(0) Fail to create PFL $DIR/$tdir/f1"
1544 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Run the layout LFSCK and wait for it to complete.
1546 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1547 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1549 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1550 mdd.${MDT_DEV}.lfsck_layout |
1551 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1553 error "(2) unexpected status"
# Exactly two repairs must be counted under repaired_others, matching the
# two files created above.
1556 local repaired=$($SHOW_LAYOUT |
1557 awk '/^repaired_others/ { print $2 }')
1558 [ $repaired -eq 2 ] ||
1559 error "(3) Fail to repair crashed lmm_oi: $repaired"
1561 run_test 13 "LFSCK can repair crashed lmm_oi"
# Test 14a: the layout LFSCK detects MDT-objects whose OST-objects are
# missing and, when asked to, re-creates them (registered by the run_test
# line closing this block).
1565 echo "The OST-object referenced by the MDT-object should be there;"
1566 echo "otherwise, the LFSCK should re-create the missing OST-object."
1567 echo "without '--delay-create-ostobj' option."
1570 check_mount_and_prep
1571 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# With fail_loc 0x1610 (OBD_FAIL_LFSCK_DANGLING) set on ost1, create 16 more
# files than the value returned by precreated_ost_obj_count, then "guard0".
1573 echo "Inject failure stub to simulate dangling referenced MDT-object"
1574 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1575 do_facet ost1 $LCTL set_param fail_loc=0x1610
1576 local count=$(precreated_ost_obj_count 0 0)
1578 createmany -o $DIR/$tdir/f $((count + 16)) ||
1579 error "(0.1) Fail to create $DIR/$tdir/fx"
1580 touch $DIR/$tdir/guard0
# Also create 16 composite files (two -E components), then "guard1".
1582 for ((i = 0; i < 16; i++)); do
1583 $LFS setstripe -E 512K -S 256K -o 0 -E 2M \
1584 $DIR/$tdir/f_comp${i} ||
1585 error "(0.2) Fail to create $DIR/$tdir/f_comp${i}"
1587 touch $DIR/$tdir/guard1
# Clear the fault injection.
1589 do_facet ost1 $LCTL set_param fail_loc=0
1591 start_full_debug_logging
1593 # exhaust other pre-created dangling cases
1594 count=$(precreated_ost_obj_count 0 0)
1595 createmany -o $DIR/$tdir/a $count ||
1596 error "(0.5) Fail to create $count files."
# Listing the directory with attributes is expected to fail at this point.
1598 echo "'ls' should fail because of dangling referenced MDT-object"
1599 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
# First pass: a plain "-r" scan. At least 32 dangling references must be
# counted, yet stat on the guard files must still fail afterwards.
1601 echo "Trigger layout LFSCK to find out dangling reference"
1602 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1604 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1605 mdd.${MDT_DEV}.lfsck_layout |
1606 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1608 error "(3) unexpected status"
1611 local repaired=$($SHOW_LAYOUT |
1612 awk '/^repaired_dangling/ { print $2 }')
1613 [ $repaired -ge 32 ] ||
1614 error "(4) Fail to repair dangling reference: $repaired"
1616 echo "'stat' should fail because of not repair dangling by default"
1617 stat $DIR/$tdir/guard0 > /dev/null 2>&1 &&
1618 error "(5.1) stat should fail"
1619 stat $DIR/$tdir/guard1 > /dev/null 2>&1 &&
1620 error "(5.2) stat should fail"
# Second pass: the same scan with -c added, which the message below
# describes as the repairing run.
1622 echo "Trigger layout LFSCK to repair dangling reference"
1623 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1625 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1626 mdd.${MDT_DEV}.lfsck_layout |
1627 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1629 error "(7) unexpected status"
1632 # There may be some async LFSCK updates in processing, wait for
1633 # a while until the target reparation has been done. LU-4970.
# Poll stat from the client until each guard file reports Size 0; on failure
# show the stat output before raising the error.
1635 echo "'stat' should success after layout LFSCK repairing"
1636 wait_update_facet client "stat $DIR/$tdir/guard0 |
1637 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1638 stat $DIR/$tdir/guard0
1640 error "(8.1) unexpected size"
1643 wait_update_facet client "stat $DIR/$tdir/guard1 |
1644 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1645 stat $DIR/$tdir/guard1
1647 error "(8.2) unexpected size"
# The repairing pass must again count at least 32 dangling repairs.
1650 repaired=$($SHOW_LAYOUT |
1651 awk '/^repaired_dangling/ { print $2 }')
1652 [ $repaired -ge 32 ] ||
1653 error "(9) Fail to repair dangling reference: $repaired"
1655 stop_full_debug_logging
# NOTE(review): the message announces a stopall, but only the setupall call
# is visible in this block.
1657 echo "stopall to cleanup object cache"
1660 setupall > /dev/null
1662 run_test 14a "LFSCK can repair MDT-object with dangling LOV EA reference (1)"
# Test 14b: same dangling-reference scenario as the preceding test, but the
# scans are started with the extra -o and -d options (registered by the
# run_test line closing this block).
1666 echo "The OST-object referenced by the MDT-object should be there;"
1667 echo "otherwise, the LFSCK should re-create the missing OST-object."
1668 echo "with '--delay-create-ostobj' option."
1671 check_mount_and_prep
1672 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# With fail_loc 0x1610 (OBD_FAIL_LFSCK_DANGLING) set on ost1, create 31 more
# files than the value returned by precreated_ost_obj_count, then "guard".
1674 echo "Inject failure stub to simulate dangling referenced MDT-object"
1675 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1676 do_facet ost1 $LCTL set_param fail_loc=0x1610
1677 local count=$(precreated_ost_obj_count 0 0)
1679 createmany -o $DIR/$tdir/f $((count + 31))
1680 touch $DIR/$tdir/guard
1681 do_facet ost1 $LCTL set_param fail_loc=0
1683 start_full_debug_logging
1685 # exhaust other pre-created dangling cases
1686 count=$(precreated_ost_obj_count 0 0)
1687 createmany -o $DIR/$tdir/a $count ||
1688 error "(0) Fail to create $count files."
# Listing the directory with attributes is expected to fail at this point.
1690 echo "'ls' should fail because of dangling referenced MDT-object"
1691 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
# First pass (-r -o -d): at least 32 dangling references must be counted,
# yet stat on the guard file must still fail.
1693 echo "Trigger layout LFSCK to find out dangling reference"
1694 $START_LAYOUT -r -o -d || error "(2) Fail to start LFSCK for layout!"
1696 wait_all_targets_blocked layout completed 3
1698 local repaired=$($SHOW_LAYOUT |
1699 awk '/^repaired_dangling/ { print $2 }')
1700 [ $repaired -ge 32 ] ||
1701 error "(4) Fail to repair dangling reference: $repaired"
1703 echo "'stat' should fail because of not repair dangling by default"
1704 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
# Second pass: the same options plus -c, which the message below describes
# as the repairing run.
1706 echo "Trigger layout LFSCK to repair dangling reference"
1707 $START_LAYOUT -r -o -c -d || error "(6) Fail to start LFSCK for layout!"
1709 wait_all_targets_blocked layout completed 7
1711 # There may be some async LFSCK updates in processing, wait for
1712 # a while until the target reparation has been done. LU-4970.
# Poll stat from the client until the guard file reports Size 0; on failure
# show the stat output before raising the error.
1714 echo "'stat' should success after layout LFSCK repairing"
1715 wait_update_facet client "stat $DIR/$tdir/guard |
1716 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1717 stat $DIR/$tdir/guard
1719 error "(8) unexpected size"
# The repairing pass must again count at least 32 dangling repairs.
1722 repaired=$($SHOW_LAYOUT |
1723 awk '/^repaired_dangling/ { print $2 }')
1724 [ $repaired -ge 32 ] ||
1725 error "(9) Fail to repair dangling reference: $repaired"
1727 stop_full_debug_logging
# NOTE(review): the message announces a stopall, but only the setupall call
# is visible in this block.
1729 echo "stopall to cleanup object cache"
1732 setupall > /dev/null
1734 run_test 14b "LFSCK can repair MDT-object with dangling LOV EA reference (2)"
# Test 15a: the layout LFSCK repairs OST-objects whose back pointer names a
# non-existent MDT-object (registered by the run_test line closing this
# block).
1738 echo "If the OST-object referenced by the MDT-object back points"
1739 echo "to some non-exist MDT-object, then the LFSCK should repair"
1740 echo "the OST-object to back point to the right MDT-object."
1743 check_mount_and_prep
1744 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1746 echo "Inject failure stub to make the OST-object to back point to"
1747 echo "non-exist MDT-object."
1748 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
# Set the fail_loc on every OST node, then write the plain file f0 and set
# up the composite file f1.
1750 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
1751 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
# NOTE(review): the continuation line carrying the target path of this
# setstripe command is not visible here.
1752 $LFS setstripe -E 1M -S 256K -c 1 -E -1 -S 512K -c $OSTCOUNT \
1754 error "(0) Fail to create PFL $DIR/$tdir/f1"
1755 # 'dd' will trigger punch RPC firstly on every OST-objects.
1756 # So even though some OST-object will not be write by 'dd',
1757 # as long as it is allocated (may be NOT allocated in pfl_3b)
1758 # its layout information will be set also.
# 257 blocks of 4K is 4K more than 1M, i.e. just past the end of f1's first
# component.
1759 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4K count=257
1760 cancel_lru_locks osc
1761 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
# Run the layout LFSCK and wait for it to complete.
1763 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1764 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1766 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1767 mdd.${MDT_DEV}.lfsck_layout |
1768 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1770 error "(2) unexpected status"
# At least three unmatched pairs must have been repaired.
1773 local repaired=$($SHOW_LAYOUT |
1774 awk '/^repaired_unmatched_pair/ { print $2 }')
1775 [ $repaired -ge 3 ] ||
1776 error "(3) Fail to repair unmatched pair: $repaired"
1778 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# Test 15b: the layout LFSCK repairs OST-objects whose back pointer names a
# different, existing MDT-object (registered by the run_test line closing
# this block).
1782 echo "If the OST-object referenced by the MDT-object back points"
1783 echo "to other MDT-object that doesn't recognize the OST-object,"
1784 echo "then the LFSCK should repair it to back point to the right"
1785 echo "MDT-object (the first one)."
# Create the "guard" file before any fault is injected.
1788 check_mount_and_prep
1789 mkdir -p $DIR/$tdir/0
1790 $LFS setstripe -c 1 -i 0 $DIR/$tdir/0
1791 dd if=/dev/zero of=$DIR/$tdir/0/guard bs=1M count=1
1792 cancel_lru_locks osc
1794 echo "Inject failure stub to make the OST-object to back point to"
1795 echo "other MDT-object"
# Use two stripes for the first component when at least two OSTs exist.
# NOTE(review): the initial assignment of "stripes" is not visible here.
1798 [ $OSTCOUNT -ge 2 ] && stripes=2
1800 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1801 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1612
1802 dd if=/dev/zero of=$DIR/$tdir/0/f0 bs=1M count=1
# NOTE(review): the continuation line carrying the target path of this
# setstripe command is not visible here.
1803 $LFS setstripe -E 1M -S 256K -c $stripes -E 2M -S 512K -c 1 \
1805 error "(0) Fail to create PFL $DIR/$tdir/f1"
1806 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2
1807 cancel_lru_locks osc
1808 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
# Run the layout LFSCK and wait for it to complete.
1810 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1811 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1813 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1814 mdd.${MDT_DEV}.lfsck_layout |
1815 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1817 error "(2) unexpected status"
# Exactly four unmatched pairs must have been repaired.
1820 local repaired=$($SHOW_LAYOUT |
1821 awk '/^repaired_unmatched_pair/ { print $2 }')
1822 [ $repaired -eq 4 ] ||
1823 error "(3) Fail to repair unmatched pair: $repaired"
1825 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# Test 15c: a layout LFSCK racing with a directory migration must not treat
# the shared layout as a multiple-reference case (registered by the run_test
# line closing this block).
#
# Needs at least two MDTs, and is skipped when the MDS version is 2.7.55 or
# newer.
1828 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
1830 [ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.55) ] &&
1831 skip "Skip the test after 2.7.55 see LU-6437" && return
1834 echo "According to current metadata migration implementation,"
1835 echo "before the old MDT-object is removed, both the new MDT-object"
1836 echo "and old MDT-object will reference the same LOV layout. Then if"
1837 echo "the layout LFSCK finds the new MDT-object by race, it will"
1838 echo "regard related OST-object(s) as multiple referenced case, and"
1839 echo "will try to create new OST-object(s) for the new MDT-object."
1840 echo "To avoid such trouble, the layout LFSCK needs to lock the old"
1841 echo "MDT-object before confirm the multiple referenced case."
# Create directory a1 with MDT index 1 and write a 1M single-stripe file in it.
1844 check_mount_and_prep
1845 $LFS mkdir -i 1 $DIR/$tdir/a1
1846 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1847 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=1
1848 cancel_lru_locks osc
1850 echo "Inject failure stub on MDT1 to delay the migration"
1852 #define OBD_FAIL_MIGRATE_DELAY 0x1803
1853 do_facet mds2 $LCTL set_param fail_val=5 fail_loc=0x1803
# Start the migration in the background so the LFSCK below runs concurrently.
1854 echo "Migrate $DIR/$tdir/a1 from MDT1 to MDT0 with delay"
1855 $LFS migrate -m 0 $DIR/$tdir/a1 &
1858 echo "Trigger layout LFSCK to race with the migration"
1859 $START_LAYOUT -A -r || error "(1) Fail to start layout LFSCK!"
1861 wait_all_targets_blocked layout completed 2
# Clear the fault injection, then check the counters: one unmatched pair
# repaired and no multiple-reference repairs.
1863 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
1864 local repaired=$($SHOW_LAYOUT |
1865 awk '/^repaired_unmatched_pair/ { print $2 }')
1866 [ $repaired -eq 1 ] ||
1867 error "(3) Fail to repair unmatched pair: $repaired"
1869 repaired=$($SHOW_LAYOUT |
1870 awk '/^repaired_multiple_referenced/ { print $2 }')
1871 [ $repaired -eq 0 ] ||
1872 error "(4) Unexpectedly repaird multiple references: $repaired"
1874 run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)"
# Test 16: the layout LFSCK repairs an OST-object whose owner differs from
# the MDT-object's owner (registered by the run_test line closing this
# block).
1878 echo "If the OST-object's owner information does not match the owner"
1879 echo "information stored in the MDT-object, then the LFSCK trust the"
1880 echo "MDT-object and update the OST-object's owner information."
# Write one 1M single-stripe file, f0.
1883 check_mount_and_prep
1884 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1885 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1886 cancel_lru_locks osc
1888 # created but no setattr or write to the file.
# NOTE(review): the creation of $DIR/$tdir/d1 is not visible here.
# 100 files are created in d1 under $RUNAS and never written to.
1890 chown $RUNAS_ID:$RUNAS_GID $DIR/$tdir/d1
1891 $RUNAS createmany -o $DIR/$tdir/d1/o 100 || error "create failed"
# Change f0's owner while fail_loc 0x1613 (OBD_FAIL_LFSCK_BAD_OWNER) is set
# on the MDS, then clear the injection.
1893 echo "Inject failure stub to skip OST-object owner changing"
1894 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
1895 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
1896 chown 1.1 $DIR/$tdir/f0
1897 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Run the layout LFSCK and wait for it to complete.
1899 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
1902 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1904 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1905 mdd.${MDT_DEV}.lfsck_layout |
1906 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1908 error "(2) unexpected status"
# Exactly one inconsistent owner must have been repaired.
1911 local repaired=$($SHOW_LAYOUT |
1912 awk '/^repaired_inconsistent_owner/ { print $2 }')
1913 [ $repaired -eq 1 ] ||
1914 error "(3) Fail to repair inconsistent owner: $repaired"
1916 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# Test 17: the layout LFSCK gives separate OST-objects to MDT-objects that
# wrongly share one (registered by the run_test line closing this block).
1920 echo "If more than one MDT-objects reference the same OST-object,"
1921 echo "and the OST-object only recognizes one MDT-object, then the"
1922 echo "LFSCK should create new OST-objects for such non-recognized"
1926 check_mount_and_prep
1927 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1929 echo "Inject failure stub to make two MDT-objects to refernce"
1930 echo "the OST-object"
# With fail_loc 0x1614 (OBD_FAIL_LFSCK_MULTIPLE_REF) set on the MDS, write
# a 1M "guard" file, then create f0 (through createmany) and the composite
# file f1, dropping cached mdc/osc locks after each step.
1932 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
1933 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
1934 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1935 cancel_lru_locks mdc
1936 cancel_lru_locks osc
1938 createmany -o $DIR/$tdir/f 1
1939 cancel_lru_locks mdc
1940 cancel_lru_locks osc
# NOTE(review): the continuation line carrying the target path of this
# setstripe command is not visible here.
1942 $LFS setstripe -E 2M -S 256K -o 0 -E 4M -S 512K -o 0 \
1944 error "(0) Fail to create PFL $DIR/$tdir/f1"
1945 cancel_lru_locks mdc
1946 cancel_lru_locks osc
1947 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# f0 and f1 were never written, yet both must report the guard's 1M size.
1949 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
1950 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard use the same OST-objects"
1951 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
1952 [ $size -eq 1048576 ] ||
1953 error "(1.1) f0 (wrong) size should be 1048576, but got $size"
1955 size=$(ls -l $DIR/$tdir/f1 | awk '{ print $5 }')
1956 [ $size -eq 1048576 ] ||
1957 error "(1.2) f1 (wrong) size should be 1048576, but got $size"
# Run the layout LFSCK and wait for it to complete.
1959 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
1962 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1964 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1965 mdd.${MDT_DEV}.lfsck_layout |
1966 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1968 error "(3) unexpected status"
# Exactly two multiple-reference repairs are expected (f0 and f1).
1971 local repaired=$($SHOW_LAYOUT |
1972 awk '/^repaired_multiple_referenced/ { print $2 }')
1973 [ $repaired -eq 2 ] ||
1974 error "(4) Fail to repair multiple references: $repaired"
# Growing f0 to 2M must leave the guard at 1M.
1976 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
1977 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
1978 error "(5) Fail to write f0."
1979 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1980 [ $size -eq 1048576 ] ||
1981 error "(6) guard size should be 1048576, but got $size"
# The same check for f1.
1983 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard should use diff OST-objects"
1984 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2 ||
1985 error "(7) Fail to write f1."
1986 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1987 [ $size -eq 1048576 ] ||
1988 error "(8) guard size should be 1048576, but got $size"
1990 run_test 17 "LFSCK can repair multiple references"
# Top-level statement between tests: pass "debug=+cache" to lctl set_param
# on the local node, discarding its standard output.
# NOTE(review): the "+" prefix presumably adds "cache" to the existing debug
# mask rather than replacing it -- confirm against the lctl documentation.
1992 $LCTL set_param debug=+cache > /dev/null
1996 echo "The target MDT-object is there, but related stripe information"
1997 echo "is lost or partly lost. The LFSCK should regenerate the missing"
1998 echo "layout EA entries."
2001 check_mount_and_prep
2002 $LFS mkdir -i 0 $DIR/$tdir/a1
2003 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2004 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2006 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2008 $LFS path2fid $DIR/$tdir/a1/f1
2009 $LFS getstripe $DIR/$tdir/a1/f1
2011 if [ $MDSCOUNT -ge 2 ]; then
2012 $LFS mkdir -i 1 $DIR/$tdir/a2
2013 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2014 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2015 $LFS path2fid $DIR/$tdir/a2/f2
2016 $LFS getstripe $DIR/$tdir/a2/f2
2019 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2020 error "(0) Fail to create PFL $DIR/$tdir/f3"
2022 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2024 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2026 $LFS path2fid $DIR/$tdir/f3
2027 $LFS getstripe $DIR/$tdir/f3
2029 cancel_lru_locks osc
2031 echo "Inject failure, to make the MDT-object lost its layout EA"
2032 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
2033 do_facet mds1 $LCTL set_param fail_loc=0x1615
2034 chown 1.1 $DIR/$tdir/a1/f1
2036 if [ $MDSCOUNT -ge 2 ]; then
2037 do_facet mds2 $LCTL set_param fail_loc=0x1615
2038 chown 1.1 $DIR/$tdir/a2/f2
2041 chown 1.1 $DIR/$tdir/f3
2046 do_facet mds1 $LCTL set_param fail_loc=0
2047 if [ $MDSCOUNT -ge 2 ]; then
2048 do_facet mds2 $LCTL set_param fail_loc=0
2051 cancel_lru_locks mdc
2052 cancel_lru_locks osc
2054 echo "The file size should be incorrect since layout EA is lost"
2055 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2056 [ "$cur_size" != "$saved_size1" ] ||
2057 error "(1) Expect incorrect file1 size"
2059 if [ $MDSCOUNT -ge 2 ]; then
2060 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2061 [ "$cur_size" != "$saved_size1" ] ||
2062 error "(2) Expect incorrect file2 size"
2065 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2066 [ "$cur_size" != "$saved_size2" ] ||
2067 error "(1.2) Expect incorrect file3 size"
2069 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2070 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
2072 for k in $(seq $MDSCOUNT); do
2073 # The LFSCK status query internal is 30 seconds. For the case
2074 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2075 # time to guarantee the status sync up.
2076 wait_update_facet mds${k} "$LCTL get_param -n \
2077 mdd.$(facet_svc mds${k}).lfsck_layout |
2078 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2079 error "(4) MDS${k} is not the expected 'completed'"
2082 for k in $(seq $OSTCOUNT); do
2083 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2084 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2085 awk '/^status/ { print $2 }')
2086 [ "$cur_status" == "completed" ] ||
2087 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2090 local repaired=$(do_facet mds1 $LCTL get_param -n \
2091 mdd.$(facet_svc mds1).lfsck_layout |
2092 awk '/^repaired_orphan/ { print $2 }')
2093 [ $repaired -eq 3 ] ||
2094 error "(6.1) Expect 3 fixed on mds1, but got: $repaired"
2096 if [ $MDSCOUNT -ge 2 ]; then
2097 repaired=$(do_facet mds2 $LCTL get_param -n \
2098 mdd.$(facet_svc mds2).lfsck_layout |
2099 awk '/^repaired_orphan/ { print $2 }')
2100 [ $repaired -eq 2 ] ||
2101 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
2104 $LFS path2fid $DIR/$tdir/a1/f1
2105 $LFS getstripe $DIR/$tdir/a1/f1
2107 if [ $MDSCOUNT -ge 2 ]; then
2108 $LFS path2fid $DIR/$tdir/a2/f2
2109 $LFS getstripe $DIR/$tdir/a2/f2
2112 $LFS path2fid $DIR/$tdir/f3
2113 $LFS getstripe $DIR/$tdir/f3
2115 echo "The file size should be correct after layout LFSCK scanning"
2116 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2117 [ "$cur_size" == "$saved_size1" ] ||
2118 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2120 if [ $MDSCOUNT -ge 2 ]; then
2121 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2122 [ "$cur_size" == "$saved_size1" ] ||
2123 error "(8) Expect file2 size $saved_size1, but got $cur_size"
2126 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2127 [ "$cur_size" == "$saved_size2" ] ||
2128 error "(9) Expect file1 size $saved_size2, but got $cur_size"
2130 run_test 18a "Find out orphan OST-object and repair it (1)"
# test_18b: the MDT-objects themselves are lost; layout LFSCK re-creates
# them under .lustre/lost+found/MDTxxxx and the files are then moved back
# into the namespace by hand.
#
# Steps, as performed below:
#  - create a1/f1, optionally a2/f2 (MDSCOUNT >= 2) and the PFL file f3,
#    2MB each, remembering their sizes and FIDs;
#  - remove them while fail_loc 0x1616 (OBD_FAIL_LFSCK_LOST_MDTOBJ) is set,
#    then clear fail_loc and cancel the mdc/osc locks;
#  - start layout LFSCK with "-r -o", wait for "completed" everywhere and
#    check repaired_orphan (3 on mds1, 2 on mds2);
#  - move the "<fid>-R-0" entries out of lost+found and check the sizes.
#
# Skipped when FILESET is set.
test_18b() {
	# Other tests in this file follow skip with "&& return"; without it
	# the body would keep running if skip does not terminate the test.
	[ -n "$FILESET" ] && skip "Not functional for FILESET set" && return

	local k
	local fid2

	echo "The target MDT-object is lost. The LFSCK should re-create the"
	echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
	echo "can move it back to normal namespace manually."

	check_mount_and_prep
	$LFS mkdir -i 0 $DIR/$tdir/a1
	$LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
	dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
	# With "ls -il" the inode number is field 1, so the size is field 6.
	local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
	local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
	echo ${fid1}
	$LFS getstripe $DIR/$tdir/a1/f1

	if [ $MDSCOUNT -ge 2 ]; then
		$LFS mkdir -i 1 $DIR/$tdir/a2
		$LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
		dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
		fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
		echo ${fid2}
		$LFS getstripe $DIR/$tdir/a2/f2
	fi

	$LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
		error "(0) Fail to create PFL $DIR/$tdir/f3"

	dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2

	local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
	local fid3=$($LFS path2fid $DIR/$tdir/f3)
	echo ${fid3}
	$LFS getstripe $DIR/$tdir/f3

	cancel_lru_locks osc

	echo "Inject failure, to simulate the case of missing the MDT-object"
	#define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
	do_facet mds1 $LCTL set_param fail_loc=0x1616
	rm -f $DIR/$tdir/a1/f1

	if [ $MDSCOUNT -ge 2 ]; then
		do_facet mds2 $LCTL set_param fail_loc=0x1616
		rm -f $DIR/$tdir/a2/f2
	fi

	# NOTE(review): this removal is not visible in the listing, but the
	# later move of ${fid3}-R-0 out of lost+found and the expected count
	# of 3 orphans on mds1 both require f3 to be lost too -- confirm.
	rm -f $DIR/$tdir/f3

	# NOTE(review): the listing's own numbering skips further lines here;
	# a sync/settle step may belong before fail_loc is cleared -- confirm.

	do_facet mds1 $LCTL set_param fail_loc=0
	if [ $MDSCOUNT -ge 2 ]; then
		do_facet mds2 $LCTL set_param fail_loc=0
	fi

	cancel_lru_locks mdc
	cancel_lru_locks osc

	echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
	$START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query internal is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
			error "(2) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		local cur_status=$(do_facet ost${k} $LCTL get_param -n \
				obdfilter.$(facet_svc ost${k}).lfsck_layout |
				awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(3) OST${k} Expect 'completed', but got '$cur_status'"
	done

	local repaired=$(do_facet mds1 $LCTL get_param -n \
			 mdd.$(facet_svc mds1).lfsck_layout |
			 awk '/^repaired_orphan/ { print $2 }')
	[ $repaired -eq 3 ] ||
	error "(4.1) Expect 3 fixed on mds1, but got: $repaired"

	if [ $MDSCOUNT -ge 2 ]; then
		repaired=$(do_facet mds2 $LCTL get_param -n \
			   mdd.$(facet_svc mds2).lfsck_layout |
			   awk '/^repaired_orphan/ { print $2 }')
		[ $repaired -eq 2 ] ||
		error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
	fi

	echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
	mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
	error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"

	if [ $MDSCOUNT -ge 2 ]; then
		local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
		mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
	fi

	# Was labelled "(5)" a second time; given its own label so a failure
	# report identifies which move broke.
	mv $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0 $DIR/$tdir/f3 ||
	error "(6.1) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"

	$LFS path2fid $DIR/$tdir/a1/f1
	$LFS getstripe $DIR/$tdir/a1/f1

	if [ $MDSCOUNT -ge 2 ]; then
		$LFS path2fid $DIR/$tdir/a2/f2
		$LFS getstripe $DIR/$tdir/a2/f2
	fi

	$LFS path2fid $DIR/$tdir/f3
	$LFS getstripe $DIR/$tdir/f3

	echo "The file size should be correct after layout LFSCK scanning"
	local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
	[ "$cur_size" == "$saved_size1" ] ||
		error "(7) Expect file1 size $saved_size1, but got $cur_size"

	if [ $MDSCOUNT -ge 2 ]; then
		# f2 was written with the same dd parameters as f1.
		cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
		[ "$cur_size" == "$saved_size1" ] ||
		error "(8) Expect file2 size $saved_size1, but got $cur_size"
	fi

	# The message used to say "file1" although f3 is what is checked.
	cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
	[ "$cur_size" == "$saved_size2" ] ||
		error "(9) Expect file3 size $saved_size2, but got $cur_size"
}
run_test 18b "Find out orphan OST-object and repair it (2)"
# test_18c: the MDT-objects are lost AND the OST-objects carry no parent
# FID; layout LFSCK has to re-create the MDT-objects with new FIDs under
# .lustre/lost+found/MDTxxxx.
#
# Steps, as performed below:
#  - with fail_loc 0x1617 (OBD_FAIL_LFSCK_NOPFID) set on all OST nodes,
#    write a1/f1, optionally a2/f2 (MDSCOUNT >= 2) and the PFL file f3;
#  - remove the files while fail_loc 0x1616 (OBD_FAIL_LFSCK_LOST_MDTOBJ) is
#    set on the MDS, then clear fail_loc and cancel the mdc/osc locks;
#  - start layout LFSCK with "-r -o", wait for "completed" everywhere;
#  - check repaired_orphan ($expected on mds1, 0 on mds2) and that "*-N-*"
#    stubs exist under lost+found/MDT0000 but not under MDT0001.
#
# Skipped when FILESET is set.
test_18c() {
	# Other tests in this file follow skip with "&& return"; without it
	# the body would keep running if skip does not terminate the test.
	[ -n "$FILESET" ] && skip "Not functional for FILESET set" && return

	local k
	local cname
	local expected

	echo "The target MDT-object is lost, and the OST-object FID is missing."
	echo "The LFSCK should re-create the MDT-object with new FID under the "
	echo "directory .lustre/lost+found/MDTxxxx."

	check_mount_and_prep
	$LFS mkdir -i 0 $DIR/$tdir/a1
	$LFS setstripe -c 1 -i 0 $DIR/$tdir/a1

	echo "Inject failure, to simulate the case of missing parent FID"
	#define OBD_FAIL_LFSCK_NOPFID 0x1617
	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1617

	dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
	$LFS getstripe $DIR/$tdir/a1/f1

	if [ $MDSCOUNT -ge 2 ]; then
		$LFS mkdir -i 1 $DIR/$tdir/a2
		$LFS setstripe -c 1 -i 0 $DIR/$tdir/a2
		dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
		$LFS getstripe $DIR/$tdir/a2/f2
	fi

	$LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
		error "(0) Fail to create PFL $DIR/$tdir/f3"

	dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
	$LFS getstripe $DIR/$tdir/f3

	cancel_lru_locks osc
	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0

	echo "Inject failure, to simulate the case of missing the MDT-object"
	#define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
	do_facet mds1 $LCTL set_param fail_loc=0x1616
	rm -f $DIR/$tdir/a1/f1

	if [ $MDSCOUNT -ge 2 ]; then
		do_facet mds2 $LCTL set_param fail_loc=0x1616
		rm -f $DIR/$tdir/a2/f2
	fi

	# NOTE(review): this removal is not visible in the listing; it is
	# restored on the assumption that f3's objects are meant to become
	# orphans like those of f1/f2 -- confirm.
	rm -f $DIR/$tdir/f3

	# NOTE(review): the listing's own numbering skips further lines here;
	# a sync/settle step may belong before fail_loc is cleared -- confirm.

	do_facet mds1 $LCTL set_param fail_loc=0
	if [ $MDSCOUNT -ge 2 ]; then
		do_facet mds2 $LCTL set_param fail_loc=0
	fi

	cancel_lru_locks mdc
	cancel_lru_locks osc

	echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
	$START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query internal is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
			error "(2) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		local cur_status=$(do_facet ost${k} $LCTL get_param -n \
				obdfilter.$(facet_svc ost${k}).lfsck_layout |
				awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(3) OST${k} Expect 'completed', but got '$cur_status'"
	done

	# NOTE(review): the assignments to "expected" are not visible in the
	# listing. Values assume one orphan for each single-stripe file
	# (f1, plus f2 with two MDTs) and two for the two-component f3, all
	# accounted on mds1 since mds2 must report 0 -- confirm.
	if [ $MDSCOUNT -ge 2 ]; then
		expected=4
	else
		expected=3
	fi

	local repaired=$(do_facet mds1 $LCTL get_param -n \
			 mdd.$(facet_svc mds1).lfsck_layout |
			 awk '/^repaired_orphan/ { print $2 }')
	[ $repaired -eq $expected ] ||
		error "(4) Expect $expected fixed on mds1, but got: $repaired"

	if [ $MDSCOUNT -ge 2 ]; then
		repaired=$(do_facet mds2 $LCTL get_param -n \
			   mdd.$(facet_svc mds2).lfsck_layout |
			   awk '/^repaired_orphan/ { print $2 }')
		[ $repaired -eq 0 ] ||
			error "(5) Expect 0 fixed on mds2, but got: $repaired"
	fi

	ls -ail $MOUNT/.lustre/lost+found/

	# The -name pattern is quoted so the shell cannot expand it against
	# files in the current directory before find sees it.
	echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
	if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
		cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-N-*')
		# NOTE(review): the emptiness check itself is not visible in
		# the listing; restored from the wording of the error message.
		[ -z "$cname" ] ||
			error "(6) .lustre/lost+found/MDT0001/ should be empty"
	fi

	echo "There should be some stub under .lustre/lost+found/MDT0000/"
	[ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
		error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"

	cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-N-*')
	[ ! -z "$cname" ] ||
		error "(8) .lustre/lost+found/MDT0000/ should not be empty"
}
run_test 18c "Find out orphan OST-object and repair it (3)"
# test_18d: a file's layout EA is corrupted while its real OST-object is
# still alive as an orphan; layout LFSCK must re-attach the orphan instead
# of creating a new OST-object for the slot.
#
# Steps, as performed below:
#  - create a1/f1..f4 (f4 is a two-component PFL file), save the sizes of
#    f2 and f4;
#  - with fail_loc 0x1618 (OBD_FAIL_LFSCK_CHANGE_STRIPE) set, chown f2/f4
#    and remove f1/f3 (the echo text below describes the intended effect);
#  - restart the filesystem, start layout LFSCK with "-r -o -c -d" and wait
#    for "completed" everywhere;
#  - expect repaired_orphan == 2, repaired_dangling == 0, and the saved
#    sizes of f2/f4 to be reported again.
test_18d() {
	local k

	echo "The target MDT-object layout EA is corrupted, but the right"
	echo "OST-object is still alive as orphan. The layout LFSCK will"
	echo "not create new OST-object to occupy such slot."

	check_mount_and_prep
	# NOTE(review): the directory creation is not visible in the listing;
	# without it "setstripe" would create a1 as a regular file and the
	# writes to a1/f* below would fail -- confirm the exact command.
	mkdir $DIR/$tdir/a1
	$LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
	echo "guard" > $DIR/$tdir/a1/f1
	echo "foo" > $DIR/$tdir/a1/f2

	echo "guard" > $DIR/$tdir/a1/f3
	$LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
		error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
	echo "foo" > $DIR/$tdir/a1/f4

	# With "ls -il" the inode number is field 1, so the size is field 6.
	local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
	local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
	$LFS path2fid $DIR/$tdir/a1/f1
	$LFS getstripe $DIR/$tdir/a1/f1
	$LFS path2fid $DIR/$tdir/a1/f2
	$LFS getstripe $DIR/$tdir/a1/f2
	$LFS path2fid $DIR/$tdir/a1/f3
	$LFS getstripe $DIR/$tdir/a1/f3
	$LFS path2fid $DIR/$tdir/a1/f4
	$LFS getstripe $DIR/$tdir/a1/f4
	cancel_lru_locks osc

	echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
	echo "to reference the same OST-object (which is f1's OST-obejct)."
	echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
	echo "dangling reference case, but f2's old OST-object is there."

	echo "The failure also makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
	echo "to reference the same OST-object (which is f3's OST-obejct)."
	echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
	echo "dangling reference case, but f4's old OST-object is there."

	#define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
	chown 1.1 $DIR/$tdir/a1/f2
	chown 1.1 $DIR/$tdir/a1/f4
	rm -f $DIR/$tdir/a1/f1
	rm -f $DIR/$tdir/a1/f3

	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	echo "stopall to cleanup object cache"
	# NOTE(review): the stop itself is not visible in the listing although
	# it is announced above and paired with setupall below; restored.
	stopall > /dev/null
	setupall > /dev/null

	echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
	$START_LAYOUT -r -o -c -d || error "(2) Fail to start LFSCK for layout!"

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query internal is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
			error "(3) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		local cur_status=$(do_facet ost${k} $LCTL get_param -n \
				obdfilter.$(facet_svc ost${k}).lfsck_layout |
				awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(4) OST${k} Expect 'completed', but got '$cur_status'"
	done

	local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
			 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
			 awk '/^repaired_orphan/ { print $2 }')
	[ $repaired -eq 2 ] ||
		error "(5) Expect 2 orphans have been fixed, but got: $repaired"

	repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
		   mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
		   awk '/^repaired_dangling/ { print $2 }')
	[ $repaired -eq 0 ] ||
		error "(6) Expect 0 dangling has been fixed, but got: $repaired"

	echo "The file size should be correct after layout LFSCK scanning"
	local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
	[ "$cur_size" == "$saved_size1" ] ||
		error "(7) Expect file2 size $saved_size1, but got $cur_size"

	cur_size=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
	[ "$cur_size" == "$saved_size2" ] ||
		error "(8) Expect file4 size $saved_size2, but got $cur_size"

	echo "The LFSCK should find back the original data."
	cat $DIR/$tdir/a1/f2
	$LFS path2fid $DIR/$tdir/a1/f2
	$LFS getstripe $DIR/$tdir/a1/f2
	cat $DIR/$tdir/a1/f4
	$LFS path2fid $DIR/$tdir/a1/f4
	$LFS getstripe $DIR/$tdir/a1/f4
}
run_test 18d "Find out orphan OST-object and repair it (4)"
# test_18e: same corruption as test_18d, but the new OST-objects that
# occupy the layout slots are modified while LFSCK is still running; the
# old orphan OST-objects must then be attached to new "*-C-*" stub files
# under .lustre/lost+found/MDT0000 instead of replacing the new data.
#
# Steps, as performed below:
#  - create a1/f1..f4 and corrupt f2/f4 under fail_loc 0x1618
#    (OBD_FAIL_LFSCK_CHANGE_STRIPE), then restart the filesystem;
#  - set fail_loc 0x1602 (OBD_FAIL_LFSCK_DELAY3) with fail_val=10, start
#    layout LFSCK with "-r -o -c" and wait for "scanning-phase2" on mds1;
#  - append to f2/f4, clear the fail_loc, wait for "completed" everywhere;
#  - expect repaired_orphan == 2 and exactly two "*-C-*" stubs whose size
#    equals one of the saved f2/f4 sizes.
#
# Skipped when FILESET is set.
test_18e() {
	# Other tests in this file follow skip with "&& return"; without it
	# the body would keep running if skip does not terminate the test.
	[ -n "$FILESET" ] && skip "Not functional for FILESET set" && return

	local k
	local cname

	echo "The target MDT-object layout EA slot is occpuied by some new"
	echo "created OST-object when repair dangling reference case. Such"
	echo "conflict OST-object has been modified by others. To keep the"
	echo "new data, the LFSCK will create a new file to refernece this"
	echo "old orphan OST-object."

	check_mount_and_prep
	# NOTE(review): the directory creation is not visible in the listing;
	# without it "setstripe" would create a1 as a regular file and the
	# writes to a1/f* below would fail -- confirm the exact command.
	mkdir $DIR/$tdir/a1
	$LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
	echo "guard" > $DIR/$tdir/a1/f1
	echo "foo" > $DIR/$tdir/a1/f2

	echo "guard" > $DIR/$tdir/a1/f3
	$LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
		error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
	echo "foo" > $DIR/$tdir/a1/f4

	# With "ls -il" the inode number is field 1, so the size is field 6.
	local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
	local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')

	$LFS path2fid $DIR/$tdir/a1/f1
	$LFS getstripe $DIR/$tdir/a1/f1
	$LFS path2fid $DIR/$tdir/a1/f2
	$LFS getstripe $DIR/$tdir/a1/f2
	$LFS path2fid $DIR/$tdir/a1/f3
	$LFS getstripe $DIR/$tdir/a1/f3
	$LFS path2fid $DIR/$tdir/a1/f4
	$LFS getstripe $DIR/$tdir/a1/f4
	cancel_lru_locks osc

	echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
	echo "to reference the same OST-object (which is f1's OST-obejct)."
	echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
	echo "dangling reference case, but f2's old OST-object is there."

	echo "Also the failure makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
	echo "to reference the same OST-object (which is f3's OST-obejct)."
	echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
	echo "dangling reference case, but f4's old OST-object is there."

	#define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
	chown 1.1 $DIR/$tdir/a1/f2
	chown 1.1 $DIR/$tdir/a1/f4
	rm -f $DIR/$tdir/a1/f1
	rm -f $DIR/$tdir/a1/f3

	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	echo "stopall to cleanup object cache"
	# NOTE(review): the stop itself is not visible in the listing although
	# it is announced above and paired with setupall below; restored.
	stopall > /dev/null
	setupall > /dev/null

	#define OBD_FAIL_LFSCK_DELAY3 0x1602
	do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602

	start_full_debug_logging

	echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
	$START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"

	wait_update_facet mds1 "$LCTL get_param -n \
		mdd.$(facet_svc mds1).lfsck_layout |
		awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
		error "(3) MDS1 is not the expected 'scanning-phase2'"

	# to guarantee all updates are synced.
	# NOTE(review): the statement this comment belongs to is not visible
	# in the listing; a plain sync is restored here -- confirm.
	sync

	echo "Write new data to f2/f4 to modify the new created OST-object."
	echo "dummy" >> $DIR/$tdir/a1/f2 || error "write a1/f2 failed"
	echo "dummy" >> $DIR/$tdir/a1/f4 || error "write a1/f4 failed"

	do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query internal is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
			error "(4) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		local cur_status=$(do_facet ost${k} $LCTL get_param -n \
				obdfilter.$(facet_svc ost${k}).lfsck_layout |
				awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(5) OST${k} Expect 'completed', but got '$cur_status'"
	done

	stop_full_debug_logging

	local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
			 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
			 awk '/^repaired_orphan/ { print $2 }')
	[ $repaired -eq 2 ] ||
		error "(6) Expect 2 orphans have been fixed, but got: $repaired"

	echo "There should be stub file under .lustre/lost+found/MDT0000/"
	[ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
		error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"

	local count=$(ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-* | wc -l)
	if [ $count -ne 2 ]; then
		ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-*
		error "(8) Expect 2 stubs under lost+found, but got $count"
	fi

	# The -name patterns below are quoted so the shell cannot expand them
	# against files in the current directory before find sees them.
	echo "The stub file should keep the original f2 or f4 data"
	cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-C-*' |
		head -n 1)
	local cur_size=$(ls -il $cname | awk '{ print $6 }')
	# Two simple tests joined by && replace the obsolescent "-a" operator.
	[ "$cur_size" != "$saved_size1" ] && [ "$cur_size" != "$saved_size2" ] &&
		error "(9) Got unexpected $cur_size"

	$LFS path2fid $cname
	$LFS getstripe $cname

	cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-C-*' |
		tail -n 1)
	cur_size=$(ls -il $cname | awk '{ print $6 }')
	[ "$cur_size" != "$saved_size1" ] && [ "$cur_size" != "$saved_size2" ] &&
		error "(10) Got unexpected $cur_size"

	$LFS path2fid $cname
	$LFS getstripe $cname

	echo "The f2/f4 should contains new data."
	cat $DIR/$tdir/a1/f2
	$LFS path2fid $DIR/$tdir/a1/f2
	$LFS getstripe $DIR/$tdir/a1/f2
	cat $DIR/$tdir/a1/f4
	$LFS path2fid $DIR/$tdir/a1/f4
	$LFS getstripe $DIR/$tdir/a1/f4
}
run_test 18e "Find out orphan OST-object and repair it (5)"
# test_18f: when one OST fails to verify objects during first-stage
# scanning, layout LFSCK must skip orphan handling for that OST only.
#
# Steps, as performed below:
#  - create guard/f1 under a1 (1 stripe) and f2 under a2 (2 stripes), plus
#    a3/a4 with f3/f4 created via "lfs mkdir -i 1" when MDSCOUNT >= 2;
#  - remove f1/f2 (and f3/f4) while fail_loc 0x1616
#    (OBD_FAIL_LFSCK_LOST_MDTOBJ) is set, then clear it;
#  - set fail_loc 0x161c (OBD_FAIL_LFSCK_BAD_NETWORK) with fail_val=0 on
#    mds1 and start layout LFSCK with "-r -o": every MDT and ost1 must end
#    "partial", ost2 "completed", repaired_orphan == 1 per MDS;
#  - clear fail_loc and run LFSCK again: everything "completed" and
#    repaired_orphan == 2 per MDS.
#
# Needs at least two OSTs.
test_18f() {
	[ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return

	local k
	local cur_status
	local repaired

	echo "The target MDT-object is lost. The LFSCK should re-create the"
	echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
	echo "to verify some OST-object(s) during the first stage-scanning,"
	echo "the LFSCK should skip orphan OST-objects for such OST. Others"
	echo "should not be affected."

	check_mount_and_prep
	$LFS mkdir -i 0 $DIR/$tdir/a1
	$LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
	dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
	dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
	$LFS mkdir -i 0 $DIR/$tdir/a2
	$LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
	dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
	$LFS getstripe $DIR/$tdir/a1/f1
	$LFS getstripe $DIR/$tdir/a2/f2

	if [ $MDSCOUNT -ge 2 ]; then
		$LFS mkdir -i 1 $DIR/$tdir/a3
		$LFS setstripe -c 1 -i 0 $DIR/$tdir/a3
		dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
		dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
		$LFS mkdir -i 1 $DIR/$tdir/a4
		$LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
		dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
		$LFS getstripe $DIR/$tdir/a3/f3
		$LFS getstripe $DIR/$tdir/a4/f4
	fi

	cancel_lru_locks osc

	echo "Inject failure, to simulate the case of missing the MDT-object"
	#define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
	do_facet mds1 $LCTL set_param fail_loc=0x1616
	rm -f $DIR/$tdir/a1/f1
	rm -f $DIR/$tdir/a2/f2

	if [ $MDSCOUNT -ge 2 ]; then
		do_facet mds2 $LCTL set_param fail_loc=0x1616
		rm -f $DIR/$tdir/a3/f3
		rm -f $DIR/$tdir/a4/f4
	fi

	# NOTE(review): the listing's own numbering skips lines here; a
	# sync/settle step may belong before fail_loc is cleared -- confirm.

	do_facet mds1 $LCTL set_param fail_loc=0
	if [ $MDSCOUNT -ge 2 ]; then
		do_facet mds2 $LCTL set_param fail_loc=0
	fi

	cancel_lru_locks mdc
	cancel_lru_locks osc

	echo "Inject failure, to simulate the OST0 fail to handle"
	echo "MDT0 LFSCK request during the first-stage scanning."
	#define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
	do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0

	echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
	$START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query internal is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
			error "(2) MDS${k} is not the expected 'partial'"
	done

	wait_update_facet ost1 "$LCTL get_param -n \
		obdfilter.$(facet_svc ost1).lfsck_layout |
		awk '/^status/ { print \\\$2 }'" "partial" $LTIME || {
		error "(3) OST1 is not the expected 'partial'"
	}

	wait_update_facet ost2 "$LCTL get_param -n \
		obdfilter.$(facet_svc ost2).lfsck_layout |
		awk '/^status/ { print \\\$2 }'" "completed" $LTIME || {
		error "(4) OST2 is not the expected 'completed'"
	}

	do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0

	repaired=$(do_facet mds1 $LCTL get_param -n \
		   mdd.$(facet_svc mds1).lfsck_layout |
		   awk '/^repaired_orphan/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(5) Expect 1 fixed on mds{1}, but got: $repaired"

	if [ $MDSCOUNT -ge 2 ]; then
		repaired=$(do_facet mds2 $LCTL get_param -n \
			   mdd.$(facet_svc mds2).lfsck_layout |
			   awk '/^repaired_orphan/ { print $2 }')
		[ $repaired -eq 1 ] ||
		error "(6) Expect 1 fixed on mds{2}, but got: $repaired"
	fi

	echo "Trigger layout LFSCK on all devices again to cleanup"
	$START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query internal is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
			error "(8) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		cur_status=$(do_facet ost${k} $LCTL get_param -n \
			     obdfilter.$(facet_svc ost${k}).lfsck_layout |
			     awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(9) OST${k} Expect 'completed', but got '$cur_status'"
	done

	repaired=$(do_facet mds1 $LCTL get_param -n \
		   mdd.$(facet_svc mds1).lfsck_layout |
		   awk '/^repaired_orphan/ { print $2 }')
	[ $repaired -eq 2 ] ||
		error "(10) Expect 2 fixed on mds{1}, but got: $repaired"

	if [ $MDSCOUNT -ge 2 ]; then
		repaired=$(do_facet mds2 $LCTL get_param -n \
			   mdd.$(facet_svc mds2).lfsck_layout |
			   awk '/^repaired_orphan/ { print $2 }')
		[ $repaired -eq 2 ] ||
		error "(11) Expect 2 fixed on mds{2}, but got: $repaired"
	fi
}
run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# test_18g: the MDT-object is lost while its OI mapping is left behind;
# layout LFSCK must still re-create the MDT-object.
#
# Steps, as performed below:
#  - create a1/f1 with "-c -1" striping and write OSTCOUNT MB to it,
#    remembering its FID;
#  - remove it while fail_loc 0x162e (OBD_FAIL_LFSCK_LOST_MDTOBJ2) is set
#    on mds1, then clear fail_loc and cancel the mdc/osc locks;
#  - start layout LFSCK with "-r -o", wait for "completed" everywhere and
#    expect repaired_orphan == OSTCOUNT on mds1;
#  - move "<fid>-R-0" from lost+found/MDT0000 back to a1/f1.
#
# Skipped when FILESET is set.
test_18g() {
	# Other tests in this file follow skip with "&& return"; without it
	# the body would keep running if skip does not terminate the test.
	[ -n "$FILESET" ] && skip "Not functional for FILESET set" && return

	local k

	echo "The target MDT-object is lost, but related OI mapping is there"
	echo "The LFSCK should recreate the lost MDT-object without affected"
	echo "by the stale OI mapping."

	check_mount_and_prep
	$LFS mkdir -i 0 $DIR/$tdir/a1
	$LFS setstripe -c -1 -i 0 -S 1M $DIR/$tdir/a1
	dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=$OSTCOUNT
	local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
	echo ${fid1}
	$LFS getstripe $DIR/$tdir/a1/f1
	cancel_lru_locks osc

	echo "Inject failure to simulate lost MDT-object but keep OI mapping"
	#define OBD_FAIL_LFSCK_LOST_MDTOBJ2 0x162e
	do_facet mds1 $LCTL set_param fail_loc=0x162e
	rm -f $DIR/$tdir/a1/f1

	do_facet mds1 $LCTL set_param fail_loc=0
	cancel_lru_locks mdc
	cancel_lru_locks osc

	echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
	$START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query internal is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
			error "(2) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		local cur_status=$(do_facet ost${k} $LCTL get_param -n \
				obdfilter.$(facet_svc ost${k}).lfsck_layout |
				awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(3) OST${k} Expect 'completed', but got '$cur_status'"
	done

	local repaired=$(do_facet mds1 $LCTL get_param -n \
			 mdd.$(facet_svc mds1).lfsck_layout |
			 awk '/^repaired_orphan/ { print $2 }')
	[ $repaired -eq $OSTCOUNT ] ||
		error "(4) Expect $OSTCOUNT fixed, but got: $repaired"

	echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
	mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
	error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"

	$LFS path2fid $DIR/$tdir/a1/f1
	$LFS getstripe $DIR/$tdir/a1/f1
}
run_test 18g "Find out orphan OST-object and repair it (7)"
# test_18h: a PFL file's extent range is corrupted; layout LFSCK must
# rebuild it from the OST-objects reported as orphans.
#
# Steps, as performed below:
#  - create the two-component PFL file f0, fill it from test-framework.sh
#    (once from offset 0, once at seek=2 with bs=1M) and copy it to guard;
#  - chown f0 while fail_loc 0x162f (OBD_FAIL_LFSCK_BAD_PFL_RANGE) is set,
#    cancel the mdc/osc locks, clear fail_loc;
#  - a write to f0 must now fail;
#  - start layout LFSCK with "-r -o", wait for "completed" everywhere and
#    expect repaired_orphan == 2 from $SHOW_LAYOUT;
#  - f0 must be identical to guard and writable again.
test_18h() {
	local k
	local cur_status

	echo "The PFL extent crashed. During the first cycle LFSCK scanning,"
	echo "the layout LFSCK will keep the bad PFL file(s) there without"
	echo "scanning its OST-object(s). Then in the second stage scanning,"
	echo "the OST will return related OST-object(s) to the MDT as orphan."
	echo "And then the LFSCK on the MDT can rebuild the PFL extent with"
	echo "the 'orphan(s)' stripe information."

	check_mount_and_prep

	$LFS setstripe -E 2M -S 1M -c 1 -E -1 $DIR/$tdir/f0 ||
		error "(0) Fail to create PFL $DIR/$tdir/f0"

	cat $LUSTRE/tests/test-framework.sh > $DIR/$tdir/f0 ||
		error "(1.1) Fail to write $DIR/$tdir/f0"

	dd if=$LUSTRE/tests/test-framework.sh of=$DIR/$tdir/f0 bs=1M seek=2 ||
		error "(1.2) Fail to write $DIR/$tdir/f0"

	cp $DIR/$tdir/f0 $DIR/$tdir/guard

	echo "Inject failure stub to simulate bad PFL extent range"
	#define OBD_FAIL_LFSCK_BAD_PFL_RANGE 0x162f
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162f

	chown 1.1 $DIR/$tdir/f0

	cancel_lru_locks mdc
	cancel_lru_locks osc
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	# Success of the write is the failure condition here.
	dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 &&
		error "(2) Write to bad PFL file should fail"

	echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
	$START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"

	for k in $(seq $MDSCOUNT); do
		# The LFSCK status query internal is 30 seconds. For the case
		# of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
		# time to guarantee the status sync up.
		wait_update_facet mds${k} "$LCTL get_param -n \
			mdd.$(facet_svc mds${k}).lfsck_layout |
			awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
			error "(4.1) MDS${k} is not the expected 'completed'"
	done

	for k in $(seq $OSTCOUNT); do
		cur_status=$(do_facet ost${k} $LCTL get_param -n \
			     obdfilter.$(facet_svc ost${k}).lfsck_layout |
			     awk '/^status/ { print $2 }')
		[ "$cur_status" == "completed" ] ||
		error "(4.2) OST${k} Expect 'completed', but got '$cur_status'"
	done

	local repaired=$($SHOW_LAYOUT |
			 awk '/^repaired_orphan/ { print $2 }')
	[ $repaired -eq 2 ] ||
		error "(5) Fail to repair crashed PFL range: $repaired"

	echo "Data in $DIR/$tdir/f0 should not be broken"
	diff $DIR/$tdir/f0 $DIR/$tdir/guard ||
		error "(6) Data in $DIR/$tdir/f0 is broken"

	echo "Write should succeed after LFSCK repairing the bad PFL range"
	dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 ||
		error "(7) Write should succeed after LFSCK"
}
run_test 18h "LFSCK can repair crashed PFL extent range"

# Turn the "cache" debug flag off again (output discarded).
$LCTL set_param debug=-cache > /dev/null
# test_19a: with lfsck_verify_pfid enabled on OST0000, a read that carries
# a wrong parent FID must be denied.
#
# Steps, as performed below:
#  - write a0, the two-component PFL file a1 and a2 with
#    obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid set to 0;
#  - set lfsck_verify_pfid to 1, then set fail_loc 0x1619
#    (OBD_FAIL_LFSCK_INVALID_PFID) through the local $LCTL;
#  - reading a0 and a1 must fail; finally clear the local fail_loc.
test_19a() {
	check_mount_and_prep
	$LFS setstripe -c 1 -i 0 $DIR/$tdir

	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
		obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0

	echo "foo1" > $DIR/$tdir/a0
	$LFS setstripe -E 512K -S 512K -o 0 -E -1 -S 1M $DIR/$tdir/a1 ||
		error "(0) Fail to create PFL $DIR/$tdir/a1"
	echo "foo2" > $DIR/$tdir/a1
	echo "guard" > $DIR/$tdir/a2
	cancel_lru_locks osc

	echo "Inject failure, then client will offer wrong parent FID when read"
	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
		obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1

	#define OBD_FAIL_LFSCK_INVALID_PFID 0x1619
	$LCTL set_param fail_loc=0x1619

	# A successful read is the failure condition here.
	echo "Read RPC with wrong parent FID should be denied"
	cat $DIR/$tdir/a0 && error "(3.1) Read a0 should be denied!"
	cat $DIR/$tdir/a1 && error "(3.2) Read a1 should be denied!"
	$LCTL set_param fail_loc=0
}
run_test 19a "OST-object inconsistency self detect"
# test_19b: OST-objects whose parent FID is wrong get repaired by the OST
# itself once a read with the right parent FID arrives.
#
# Steps, as performed below:
#  - with lfsck_verify_pfid set to 0 and fail_loc 0x1611
#    (OBD_FAIL_LFSCK_UNMATCHED_PAIR1) set on all OST nodes, write f0 and
#    the PFL file f1, then clear fail_loc;
#  - with verification still set to 0, the "repaired" value read from
#    lfsck_verify_pfid on ost1 must be 0;
#  - set lfsck_verify_pfid to 1, read f0 and f1 (both must succeed) and
#    expect "repaired" to be 2.
test_19b() {
	check_mount_and_prep
	$LFS setstripe -c 1 -i 0 $DIR/$tdir

	echo "Inject failure stub to make the OST-object to back point to"
	echo "non-exist MDT-object"

	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
		obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0

	#define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
	echo "foo1" > $DIR/$tdir/f0
	$LFS setstripe -E 1M -S 1M -o 0 -E 4M -S 256K $DIR/$tdir/f1 ||
		error "(0) Fail to create PFL $DIR/$tdir/f1"
	echo "foo2" > $DIR/$tdir/f1
	cancel_lru_locks osc
	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0

	do_facet ost1 $LCTL set_param -n \
		obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
	echo "Nothing should be fixed since self detect and repair is disabled"
	local repaired=$(do_facet ost1 $LCTL get_param -n \
			 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
			 awk '/^repaired/ { print $2 }')
	[ $repaired -eq 0 ] ||
		error "(1) Expected 0 repaired, but got $repaired"

	echo "Read RPC with right parent FID should be accepted,"
	echo "and cause parent FID on OST to be fixed"

	do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
		obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1

	cat $DIR/$tdir/f0 || error "(2.1) Read f0 should not be denied!"
	cat $DIR/$tdir/f1 || error "(2.2) Read f1 should not be denied!"

	repaired=$(do_facet ost1 $LCTL get_param -n \
		   obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
		   awk '/^repaired/ { print $2 }')
	# Two files (f0 and f1) are read above, so two repairs are expected;
	# the message used to claim "Expected 1" while the check required 2.
	[ $repaired -eq 2 ] ||
		error "(3) Expected 2 repaired, but got $repaired"
}
run_test 19b "OST-object inconsistency self repair"
# Expected output of "$LFS getstripe -L" in the checks below: the first value
# is compared for recovered files that should carry the LOV EA hole flag, the
# second for files whose layout should be intact.
2999 PATTERN_WITH_HOLE="40000001"
3000 PATTERN_WITHOUT_HOLE="raid0"
# Statements of LFSCK test 20a (registered by the run_test line that ends this
# block).
# NOTE(review): the function header, closing brace, the else/fi/done lines of
# the conditionals and loops, and the definition of $bcount are not visible in
# this excerpt - confirm against the full file before editing the logic.
#
# Flow: create files f0..f2 (plus f3 with more than 2 OSTs) in a striped
# directory, unlink them under fail_loc 0x1616 / 0x161a with varying fail_val,
# run layout LFSCK with "-r -o", then inspect the files that appear under
# .lustre/lost+found/MDT0000/ as <fid>-R-0: layout pattern, stripe count,
# size, read/write behaviour, chown, touch and unlink.
3003 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
3004 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
3007 echo "The target MDT-object and some of its OST-object are lost."
3008 echo "The LFSCK should find out the left OST-objects and re-create"
3009 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
3010 echo "with the partial OST-objects (LOV EA hole)."
3012 echo "New client can access the file with LOV EA hole via normal"
3013 echo "system tools or commands without crash the system."
3015 echo "For old client, even though it cannot access the file with"
3016 echo "LOV EA hole, it should not cause the system crash."
3019 check_mount_and_prep
3020 $LFS mkdir -i 0 $DIR/$tdir/a1
# Three stripes when more than 2 OSTs exist; the two-stripe setstripe that
# follows presumably sits in an else branch not shown here - TODO confirm.
3021 if [ $OSTCOUNT -gt 2 ]; then
3022 $LFS setstripe -c 3 -i 0 -S 1M $DIR/$tdir/a1
3025 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a1
3029 # 256 blocks on the stripe0.
3030 # 1 block on the stripe1 for 2 OSTs case.
3031 # 256 blocks on the stripe1 for other cases.
3032 # 1 block on the stripe2 if OSTs > 2
3033 dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
3034 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
3035 dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount
# Remember the FIDs now: the recovered files are looked up as <fid>-R-0 later.
3037 local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
3038 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
3039 local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)
# Print the layouts to the test output.
3042 $LFS getstripe $DIR/$tdir/a1/f0
3044 $LFS getstripe $DIR/$tdir/a1/f1
3046 $LFS getstripe $DIR/$tdir/a1/f2
# A fourth file is only created when a third stripe exists.
3048 if [ $OSTCOUNT -gt 2 ]; then
3049 dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
3050 fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
3052 $LFS getstripe $DIR/$tdir/a1/f3
3055 cancel_lru_locks osc
3057 echo "Inject failure..."
3058 echo "To simulate f0 lost MDT-object"
3059 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
3060 do_facet mds1 $LCTL set_param fail_loc=0x1616
3061 rm -f $DIR/$tdir/a1/f0
3063 echo "To simulate f1 lost MDT-object and OST-object0"
3064 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
3065 do_facet mds1 $LCTL set_param fail_loc=0x161a
3066 rm -f $DIR/$tdir/a1/f1
3068 echo "To simulate f2 lost MDT-object and OST-object1"
# fail_loc stays at 0x161a; per the echo text, fail_val picks which OST-object
# is lost together with the MDT-object.
3069 do_facet mds1 $LCTL set_param fail_val=1
3070 rm -f $DIR/$tdir/a1/f2
3072 if [ $OSTCOUNT -gt 2 ]; then
3073 echo "To simulate f3 lost MDT-object and OST-object2"
3074 do_facet mds1 $LCTL set_param fail_val=2
3075 rm -f $DIR/$tdir/a1/f3
# The client stays unmounted while LFSCK runs and is remounted at step (5.0).
3078 umount_client $MOUNT
3081 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
3083 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3084 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# Wait on every MDT until its lfsck_layout status reads "completed".
3086 for k in $(seq $MDSCOUNT); do
3087 # The LFSCK status query internal is 30 seconds. For the case
3088 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3089 # time to guarantee the status sync up.
3090 wait_update_facet mds${k} "$LCTL get_param -n \
3091 mdd.$(facet_svc mds${k}).lfsck_layout |
3092 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3093 error "(2) MDS${k} is not the expected 'completed'"
# Every OST is then sampled once (no waiting) and must already be "completed".
3096 for k in $(seq $OSTCOUNT); do
3097 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3098 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3099 awk '/^status/ { print $2 }')
3100 [ "$cur_status" == "completed" ] ||
3101 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# repaired_orphan on mds1 must be 9 with more than 2 OSTs, otherwise 4.
3104 local repaired=$(do_facet mds1 $LCTL get_param -n \
3105 mdd.$(facet_svc mds1).lfsck_layout |
3106 awk '/^repaired_orphan/ { print $2 }')
3107 if [ $OSTCOUNT -gt 2 ]; then
3108 [ $repaired -eq 9 ] ||
3109 error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
3111 [ $repaired -eq 4 ] ||
3112 error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
3115 mount_client $MOUNT || error "(5.0) Fail to start client!"
# NOTE(review): this variable is assigned but not read anywhere in the visible
# lines - confirm whether it is still needed.
3117 LOV_PATTERN_F_HOLE=0x40000000
3120 # ${fid0}-R-0 is the old f0
3122 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
3123 echo "Check $name, which is the old f0"
3125 $LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"
# f0 lost only its MDT-object, so the pattern must not carry the hole flag.
3127 local pattern=$($LFS getstripe -L $name)
3128 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3129 error "(5.2) NOT expect pattern flag hole, but got $pattern"
3131 local stripes=$($LFS getstripe -c $name)
3132 if [ $OSTCOUNT -gt 2 ]; then
3133 [ $stripes -eq 3 ] ||
3134 error "(5.3.1) expect the stripe count is 3, but got $stripes"
3136 [ $stripes -eq 2 ] ||
3137 error "(5.3.2) expect the stripe count is 2, but got $stripes"
3140 local size=$(stat $name | awk '/Size:/ { print $2 }')
3141 [ $size -eq $((4096 * $bcount)) ] ||
3142 error "(5.4) expect the size $((4096 * $bcount)), but got $size"
# The fully recovered file must support read, append, chown, touch and unlink.
3144 cat $name > /dev/null || error "(5.5) cannot read $name"
3146 echo "dummy" >> $name || error "(5.6) cannot write $name"
3148 chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"
3150 touch $name || error "(5.8) cannot touch $name"
3152 rm -f $name || error "(5.9) cannot unlink $name"
3155 # ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
3157 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3158 if [ $OSTCOUNT -gt 2 ]; then
3159 echo "Check $name, it contains the old f1's stripe1 and stripe2"
3161 echo "Check $name, it contains the old f1's stripe1"
3164 $LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"
3166 pattern=$($LFS getstripe -L $name)
3167 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3168 error "(6.2) expect pattern flag hole, but got $pattern"
3170 stripes=$($LFS getstripe -c $name)
3171 if [ $OSTCOUNT -gt 2 ]; then
3172 [ $stripes -eq 3 ] ||
3173 error "(6.3.1) expect the stripe count is 3, but got $stripes"
3175 [ $stripes -eq 2 ] ||
3176 error "(6.3.2) expect the stripe count is 2, but got $stripes"
3179 size=$(stat $name | awk '/Size:/ { print $2 }')
3180 [ $size -eq $((4096 * $bcount)) ] ||
3181 error "(6.4) expect the size $((4096 * $bcount)), but got $size"
# A plain sequential read must fail for a file with a hole.
3183 cat $name > /dev/null && error "(6.5) normal read $name should fail"
# dd with conv=sync,noerror keeps going after read errors; count the
# "Input/output error" lines it prints.
3185 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3186 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3189 [ $failures -eq 256 ] ||
3190 error "(6.6) expect 256 IO failures, but get $failures"
3192 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3193 [ $size -eq $((4096 * $bcount)) ] ||
3194 error "(6.7) expect the size $((4096 * $bcount)), but got $size"
# Block 0 falls into the lost stripe0, so the write must fail ...
3196 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
3197 error "(6.8) write to the LOV EA hole should fail"
# ... while block 300 (beyond the first 1M stripe) must be writable.
3199 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
3200 error "(6.9) write to normal stripe should NOT fail"
3202 echo "foo" >> $name && error "(6.10) append write $name should fail"
3204 chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"
3206 touch $name || error "(6.12) cannot touch $name"
3208 rm -f $name || error "(6.13) cannot unlink $name"
3211 # ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
3213 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
3214 if [ $OSTCOUNT -gt 2 ]; then
3215 echo "Check $name, it contains the old f2's stripe0 and stripe2"
3217 echo "Check $name, it contains the old f2's stripe0"
3220 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
3222 pattern=$($LFS getstripe -L $name)
3223 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3224 error "(7.2) expect pattern flag hole, but got $pattern"
3226 stripes=$($LFS getstripe -c $name)
3227 size=$(stat $name | awk '/Size:/ { print $2 }')
# The x.1 checks below apply with more than 2 OSTs; the x.2 checks further
# down presumably form the else branch (else line not visible) - TODO confirm.
3228 if [ $OSTCOUNT -gt 2 ]; then
3229 [ $stripes -eq 3 ] ||
3230 error "(7.3.1) expect the stripe count is 3, but got $stripes"
3232 [ $size -eq $((4096 * $bcount)) ] ||
3233 error "(7.4.1) expect size $((4096 * $bcount)), but got $size"
3235 cat $name > /dev/null &&
3236 error "(7.5.1) normal read $name should fail"
3238 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3239 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3241 [ $failures -eq 256 ] ||
3242 error "(7.6) expect 256 IO failures, but get $failures"
3244 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3245 [ $size -eq $((4096 * $bcount)) ] ||
3246 error "(7.7) expect the size $((4096 * $bcount)), but got $size"
# Here stripe1 is the hole: block 300 must be rejected, block 0 accepted.
3248 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3249 seek=300 && error "(7.8.0) write to the LOV EA hole should fail"
3251 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
3252 error "(7.8.1) write to normal stripe should NOT fail"
3254 echo "foo" >> $name &&
3255 error "(7.8.3) append write $name should fail"
3257 chown $RUNAS_ID:$RUNAS_GID $name ||
3258 error "(7.9.1) cannot chown on $name"
3260 touch $name || error "(7.10.1) cannot touch $name"
3262 [ $stripes -eq 2 ] ||
3263 error "(7.3.2) expect the stripe count is 2, but got $stripes"
# In this variant the expected size is 256 blocks only.
3266 [ $size -eq $((4096 * (256 + 0))) ] ||
3267 error "(7.4.2) expect the size $((4096 * 256)), but got $size"
3269 cat $name > /dev/null &&
3270 error "(7.5.2) normal read $name should fail"
3272 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3273 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3274 [ $failures -eq 256 ] ||
3275 error "(7.6.2) expect 256 IO failures, but get $failures"
3278 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3279 [ $size -eq $((4096 * $bcount)) ] ||
3280 error "(7.7.2) expect the size $((4096 * $bcount)), got $size"
3282 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3283 seek=256 && error "(7.8.2) write to the LOV EA hole should fail"
3285 chown $RUNAS_ID:$RUNAS_GID $name ||
3286 error "(7.9.2) cannot chown on $name"
3288 touch $name || error "(7.10.2) cannot touch $name"
3291 rm -f $name || error "(7.11) cannot unlink $name"
# f3 only exists with more than 2 OSTs; nothing further to check otherwise.
3293 [ $OSTCOUNT -le 2 ] && return
3296 # ${fid3}-R-0 should contains the old f3's stripe0 and stripe1
3298 name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
3299 echo "Check $name, which contains the old f3's stripe0 and stripe1"
3301 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
3303 pattern=$($LFS getstripe -L $name)
3304 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3305 error "(8.2) expect pattern flag hole, but got $pattern"
3307 stripes=$($LFS getstripe -c $name)
3308 [ $stripes -eq 3 ] ||
3309 error "(8.3) expect the stripe count is 3, but got $stripes"
3311 size=$(stat $name | awk '/Size:/ { print $2 }')
# Expected size is 512 blocks: nothing is counted for the lost stripe2.
3313 [ $size -eq $((4096 * (256 + 256 + 0))) ] ||
3314 error "(8.4) expect the size $((4096 * 512)), but got $size"
3316 cat $name > /dev/null &&
3317 error "(8.5) normal read $name should fail"
3319 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3320 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3322 [ $failures -eq 256 ] ||
3323 error "(8.6) expect 256 IO failures, but get $failures"
3326 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3327 [ $size -eq $((4096 * $bcount)) ] ||
3328 error "(8.7) expect the size $((4096 * $bcount)), but got $size"
# Block 512 is the first block of stripe2, i.e. inside the hole.
3330 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3331 seek=512 && error "(8.8) write to the LOV EA hole should fail"
3333 chown $RUNAS_ID:$RUNAS_GID $name ||
3334 error "(8.9) cannot chown on $name"
3336 touch $name || error "(8.10) cannot touch $name"
3338 rm -f $name || error "(8.11) cannot unlink $name"
3340 run_test 20a "Handle the orphan with dummy LOV EA slot properly"
# Statements of LFSCK test 20b (registered by the run_test line that ends this
# block): the PFL variant of the LOV EA hole handling check.
# NOTE(review): the function header, closing brace, the done lines of the
# loops and several statements between the fail_loc settings are not visible
# in this excerpt - confirm against the full file before editing the logic.
#
# Flow: create three two-component PFL files (2M boundary, 2 stripes per
# component), write 256 * 3 + 1 blocks of 4096 bytes into each, inject
# fail_loc 0x1616 / 0x161a, run layout LFSCK with "-r -o", require 8 repaired
# orphans on mds1, then inspect the recovered <fid>-R-0 files under
# .lustre/lost+found/MDT0000/ per component: pattern, stripe count, extent
# boundaries, size, read/write behaviour, chown, touch and unlink.
# Error labels are numbered per step (7.x = f0, 8.x = f1, 9.x = f2).
3343 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
3344 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
3347 echo "The target MDT-object and some of its OST-object are lost."
3348 echo "The LFSCK should find out the left OST-objects and re-create"
3349 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
3350 echo "with the partial OST-objects (LOV EA hole)."
3352 echo "New client can access the file with LOV EA hole via normal"
3353 echo "system tools or commands without crash the system - PFL case."
3356 check_mount_and_prep
# Component 1 covers [0, 2M), component 2 covers [2M, EOF); both use two
# stripes of 1M.
3358 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f0 ||
3359 error "(0) Fail to create PFL file $DIR/$tdir/f0"
3360 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f1 ||
3361 error "(1) Fail to create PFL file $DIR/$tdir/f1"
3362 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f2 ||
3363 error "(2) Fail to create PFL file $DIR/$tdir/f2"
# 3 full 1M stripes plus one extra 4096-byte block.
3365 local bcount=$((256 * 3 + 1))
3367 dd if=/dev/zero of=$DIR/$tdir/f0 bs=4096 count=$bcount
3368 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4096 count=$bcount
3369 dd if=/dev/zero of=$DIR/$tdir/f2 bs=4096 count=$bcount
# Remember the FIDs now: the recovered files are looked up as <fid>-R-0 later.
3371 local fid0=$($LFS path2fid $DIR/$tdir/f0)
3372 local fid1=$($LFS path2fid $DIR/$tdir/f1)
3373 local fid2=$($LFS path2fid $DIR/$tdir/f2)
3376 $LFS getstripe $DIR/$tdir/f0
3378 $LFS getstripe $DIR/$tdir/f1
3380 $LFS getstripe $DIR/$tdir/f2
3382 cancel_lru_locks mdc
3383 cancel_lru_locks osc
3385 echo "Inject failure..."
3386 echo "To simulate f0 lost MDT-object"
3387 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
3388 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1616
3391 echo "To simulate the case of f1 lost MDT-object and "
3392 echo "the first OST-object in each PFL component"
3393 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
3394 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161a
3397 echo "To simulate the case of f2 lost MDT-object and "
3398 echo "the second OST-object in each PFL component"
3399 do_facet $SINGLEMDS $LCTL set_param fail_val=1
# Disarm the fault injection before starting LFSCK.
3404 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
3406 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3407 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
# Wait on every MDT until its lfsck_layout status reads "completed".
3409 for k in $(seq $MDSCOUNT); do
3410 # The LFSCK status query internal is 30 seconds. For the case
3411 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3412 # time to guarantee the status sync up.
3413 wait_update_facet mds${k} "$LCTL get_param -n \
3414 mdd.$(facet_svc mds${k}).lfsck_layout |
3415 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3416 error "(4) MDS${k} is not the expected 'completed'"
# Every OST is then sampled once (no waiting) and must already be "completed".
3419 for k in $(seq $OSTCOUNT); do
3420 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3421 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3422 awk '/^status/ { print $2 }')
3423 [ "$cur_status" == "completed" ] ||
3424 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
3427 local repaired=$(do_facet mds1 $LCTL get_param -n \
3428 mdd.$(facet_svc mds1).lfsck_layout |
3429 awk '/^repaired_orphan/ { print $2 }')
3430 [ $repaired -eq 8 ] ||
3431 error "(6) Expect 8 fixed on mds1, but got: $repaired"
3434 # ${fid0}-R-0 is the old f0
3436 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
3437 echo "Check $name, which is the old f0"
3439 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
# -I1 / -I2 select the first / second PFL component.
3441 local pattern=$($LFS getstripe -L -I1 $name)
3442 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3443 error "(7.2.1) NOT expect pattern flag hole, but got $pattern"
3445 pattern=$($LFS getstripe -L -I2 $name)
3446 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3447 error "(7.2.2) NOT expect pattern flag hole, but got $pattern"
3449 local stripes=$($LFS getstripe -c -I1 $name)
3450 [ $stripes -eq 2 ] ||
3451 error "(7.3.1) expect 2 stripes, but got $stripes"
3453 stripes=$($LFS getstripe -c -I2 $name)
3454 [ $stripes -eq 2 ] ||
3455 error "(7.3.2) expect 2 stripes, but got $stripes"
# The component extents must match the original layout: [0, 2097152) and
# [2097152, EOF).
3457 local e_start=$($LFS getstripe -I1 $name |
3458 awk '/lcme_extent.e_start:/ { print $2 }')
3459 [ $e_start -eq 0 ] ||
3460 error "(7.4.1) expect the COMP1 start at 0, got $e_start"
3462 local e_end=$($LFS getstripe -I1 $name |
3463 awk '/lcme_extent.e_end:/ { print $2 }')
3464 [ $e_end -eq 2097152 ] ||
3465 error "(7.4.2) expect the COMP1 end at 2097152, got $e_end"
3467 e_start=$($LFS getstripe -I2 $name |
3468 awk '/lcme_extent.e_start:/ { print $2 }')
3469 [ $e_start -eq 2097152 ] ||
3470 error "(7.5.1) expect the COMP2 start at 2097152, got $e_start"
3472 e_end=$($LFS getstripe -I2 $name |
3473 awk '/lcme_extent.e_end:/ { print $2 }')
3474 [ "$e_end" = "EOF" ] ||
3475 error "(7.5.2) expect the COMP2 end at (EOF), got $e_end"
3477 local size=$(stat $name | awk '/Size:/ { print $2 }')
3478 [ $size -eq $((4096 * $bcount)) ] ||
3479 error "(7.6) expect the size $((4096 * $bcount)), but got $size"
# The fully recovered file must support read, append, chown, touch and unlink.
3481 cat $name > /dev/null || error "(7.7) cannot read $name"
3483 echo "dummy" >> $name || error "(7.8) cannot write $name"
3485 chown $RUNAS_ID:$RUNAS_GID $name || error "(7.9) cannot chown on $name"
3487 touch $name || error "(7.10) cannot touch $name"
3489 rm -f $name || error "(7.11) cannot unlink $name"
3492 # ${fid1}-R-0 contains the old f1's second stripe in each COMP
3494 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3495 echo "Check $name, it contains f1's second OST-object in each COMP"
3497 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
3499 pattern=$($LFS getstripe -L -I1 $name)
3500 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3501 error "(8.2.1) expect pattern flag hole, but got $pattern"
3503 pattern=$($LFS getstripe -L -I2 $name)
3504 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3505 error "(8.2.2) expect pattern flag hole, but got $pattern"
# Distinct labels for the two components so a failure identifies which one.
3507 stripes=$($LFS getstripe -c -I1 $name)
3508 [ $stripes -eq 2 ] ||
3509 error "(8.3.1) expect 2 stripes, but got $stripes"
3511 stripes=$($LFS getstripe -c -I2 $name)
3512 [ $stripes -eq 2 ] ||
3513 error "(8.3.2) expect 2 stripes, but got $stripes"
3515 e_start=$($LFS getstripe -I1 $name |
3516 awk '/lcme_extent.e_start:/ { print $2 }')
3517 [ $e_start -eq 0 ] ||
3518 error "(8.4.1) expect the COMP1 start at 0, got $e_start"
3520 e_end=$($LFS getstripe -I1 $name |
3521 awk '/lcme_extent.e_end:/ { print $2 }')
3522 [ $e_end -eq 2097152 ] ||
3523 error "(8.4.2) expect the COMP1 end at 2097152, got $e_end"
3525 e_start=$($LFS getstripe -I2 $name |
3526 awk '/lcme_extent.e_start:/ { print $2 }')
3527 [ $e_start -eq 2097152 ] ||
3528 error "(8.5.1) expect the COMP2 start at 2097152, got $e_start"
3530 e_end=$($LFS getstripe -I2 $name |
3531 awk '/lcme_extent.e_end:/ { print $2 }')
3532 [ "$e_end" = "EOF" ] ||
3533 error "(8.5.2) expect the COMP2 end at (EOF), got $e_end"
3535 size=$(stat $name | awk '/Size:/ { print $2 }')
3536 [ $size -eq $((4096 * $bcount)) ] ||
3537 error "(8.6) expect the size $((4096 * $bcount)), but got $size"
# A plain sequential read must fail for a file with a hole.
3539 cat $name > /dev/null && error "(8.7) normal read $name should fail"
# dd with conv=sync,noerror keeps going after read errors; count the
# "Input/output error" lines it prints.
3541 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3542 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3544 # The first stripe in each COMP was lost
3545 [ $failures -eq 512 ] ||
3546 error "(8.8) expect 512 IO failures, but get $failures"
3548 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3549 [ $size -eq $((4096 * $bcount)) ] ||
3550 error "(8.9) expect the size $((4096 * $bcount)), but got $size"
# Block 0 is in the lost first stripe, block 300 is in the surviving second
# stripe of component 1.
3552 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
3553 error "(8.10) write to the LOV EA hole should fail"
3555 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
3556 error "(8.11) write to normal stripe should NOT fail"
3558 echo "foo" >> $name && error "(8.12) append write $name should fail"
3560 chown $RUNAS_ID:$RUNAS_GID $name || error "(8.13) cannot chown on $name"
3562 touch $name || error "(8.14) cannot touch $name"
3564 rm -f $name || error "(8.15) cannot unlink $name"
3567 # ${fid2}-R-0 contains the old f2's first stripe in each COMP
3569 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
3570 echo "Check $name, it contains f2's first stripe in each COMP"
3572 $LFS getstripe -v $name || error "(9.1) cannot getstripe on $name"
3574 pattern=$($LFS getstripe -L -I1 $name)
3575 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3576 error "(9.2.1) expect pattern flag hole, but got $pattern"
3578 pattern=$($LFS getstripe -L -I2 $name)
3579 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3580 error "(9.2.2) expect pattern flag hole, but got $pattern"
# Distinct labels for the two components so a failure identifies which one.
3582 stripes=$($LFS getstripe -c -I1 $name)
3583 [ $stripes -eq 2 ] ||
3584 error "(9.3.1) expect 2 stripes, but got $stripes"
3586 stripes=$($LFS getstripe -c -I2 $name)
3587 [ $stripes -eq 2 ] ||
3588 error "(9.3.2) expect 2 stripes, but got $stripes"
3590 e_start=$($LFS getstripe -I1 $name |
3591 awk '/lcme_extent.e_start:/ { print $2 }')
3592 [ $e_start -eq 0 ] ||
3593 error "(9.4.1) expect the COMP1 start at 0, got $e_start"
3595 e_end=$($LFS getstripe -I1 $name |
3596 awk '/lcme_extent.e_end:/ { print $2 }')
3597 [ $e_end -eq 2097152 ] ||
3598 error "(9.4.2) expect the COMP1 end at 2097152, got $e_end"
3600 e_start=$($LFS getstripe -I2 $name |
3601 awk '/lcme_extent.e_start:/ { print $2 }')
3602 [ $e_start -eq 2097152 ] ||
3603 error "(9.5.1) expect the COMP2 start at 2097152, got $e_start"
3605 e_end=$($LFS getstripe -I2 $name |
3606 awk '/lcme_extent.e_end:/ { print $2 }')
3607 [ "$e_end" = "EOF" ] ||
3608 error "(9.5.2) expect the COMP2 end at (EOF), got $e_end"
3610 size=$(stat $name | awk '/Size:/ { print $2 }')
3611 # The second stripe in COMP was lost, so we do not know there
3612 # have ever been some data before. 'stat' will regard it as
3613 # no data on the lost stripe.
3615 [ $size -eq $((4096 * $bcount)) ] ||
3616 error "(9.6) expect size $((4096 * $bcount)), but got $size"
3618 cat $name > /dev/null &&
3619 error "(9.7) normal read $name should fail"
3621 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3622 bs=4096 2>&1 | grep "Input/output error" | wc -l)
# The message states the same number as the check.
3623 [ $failures -eq 512 ] ||
3624 error "(9.8) expect 512 IO failures, but get $failures"
3626 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3627 # The second stripe in COMP was lost, so we do not know there
3628 # have ever been some data before. Since 'dd' skip failure,
3629 # it will regard the lost stripe contains data.
3631 [ $size -eq $((4096 * $bcount)) ] ||
3632 error "(9.9) expect the size $((4096 * $bcount)), but got $size"
# Mirror image of the f1 case: block 300 is in the hole, block 0 is writable.
3634 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3635 seek=300 && error "(9.10) write to the LOV EA hole should fail"
3637 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
3638 error "(9.11) write to normal stripe should NOT fail"
3640 echo "foo" >> $name &&
3641 error "(9.12) append write $name should fail"
3643 chown $RUNAS_ID:$RUNAS_GID $name ||
3644 error "(9.13) cannot chown on $name"
3646 touch $name || error "(9.14) cannot touch $name"
3648 rm -f $name || error "(9.15) cannot unlink $name"
3650 run_test 20b "Handle the orphan with dummy LOV EA slot properly - PFL case"
# Statements of LFSCK test 21 (registered by the run_test line that ends this
# block).
# NOTE(review): the enclosing function header and closing brace are not
# visible in this excerpt - confirm against the full file.
#
# Flow: create 100 files, start LFSCK on MDT0000 without a "-t" type argument,
# require both the namespace and the layout component to report
# "scanning-phase1", then stop LFSCK.
# Skipped when the MDS version is below 2.5.59.
3653 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.5.59) ]] &&
3654 skip "ignore the test if MDS is older than 2.5.59" && return
3656 check_mount_and_prep
3657 createmany -o $DIR/$tdir/f 100 || error "(0) Fail to create 100 files"
3659 echo "Start all LFSCK components by default (-s 1)"
# NOTE(review): "-s 1" presumably throttles the scan so that it is still in
# phase 1 when sampled below - confirm against the lfsck_start documentation.
3660 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -s 1 -r ||
3661 error "Fail to start LFSCK"
3663 echo "namespace LFSCK should be in 'scanning-phase1' status"
3664 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
3665 [ "$STATUS" == "scanning-phase1" ] ||
3666 error "Expect namespace 'scanning-phase1', but got '$STATUS'"
3668 echo "layout LFSCK should be in 'scanning-phase1' status"
3669 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
3670 [ "$STATUS" == "scanning-phase1" ] ||
3671 error "Expect layout 'scanning-phase1', but got '$STATUS'"
3673 echo "Stop all LFSCK components by default"
# As with the start, no "-t" type is passed to the stop command.
3674 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 ||
3675 error "Fail to stop LFSCK"
3677 run_test 21 "run all LFSCK components by default"
# Statements of LFSCK test 22a (registered by the run_test line that ends this
# block).
# NOTE(review): the enclosing function header and closing brace are not
# visible in this excerpt - confirm against the full file.
#
# Flow: create "guard" and "foo" on MDT1, create foo/dummy on MDT0 while
# fail_loc 0x161e is armed, remove "guard", run namespace LFSCK with "-A -r",
# require exactly one unmatched pair to be repaired, and check that listing
# foo/dummy succeeds. Needs at least 2 MDTs.
3680 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3683 echo "The parent_A references the child directory via some name entry,"
3684 echo "but the child directory back references another parent_B via its"
# The quotes around .. are escaped so they are printed literally instead of
# closing and reopening the shell string.
3685 echo "\"..\" name entry. The parent_B does not exist. Then the namespace"
3686 echo "LFSCK will repair the child directory's \"..\" name entry."
3689 check_mount_and_prep
3691 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
3692 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
3694 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
3695 echo "The dummy's dotdot name entry references the guard."
3696 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
3697 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
3698 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
3699 error "(3) Fail to mkdir on MDT0"
# Disarm the fault injection as soon as the directory exists.
3700 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Removing "guard" matches the "parent_B does not exist" scenario printed
# above.
3702 rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"
3704 echo "Trigger namespace LFSCK to repair unmatched pairs"
3705 $START_NAMESPACE -A -r ||
3706 error "(5) Fail to start LFSCK for namespace"
3708 wait_all_targets_blocked namespace completed 6
3710 local repaired=$($SHOW_NAMESPACE |
3711 awk '/^unmatched_pairs_repaired/ { print $2 }')
3712 [ $repaired -eq 1 ] ||
3713 error "(7) Fail to repair unmatched pairs: $repaired"
3715 echo "'ls' should success after namespace LFSCK repairing"
3716 ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
3717 error "(8) ls should success."
3719 run_test 22a "LFSCK can repair unmatched pairs (1)"
# Statements of LFSCK test 22b (registered by the run_test line that ends this
# block).
# NOTE(review): the enclosing function header and closing brace are not
# visible in this excerpt - confirm against the full file.
#
# Flow: create "guard" and "foo" on MDT1, create foo/dummy on MDT0 while
# fail_loc 0x161e is armed, check that fid2path does not resolve dummy's FID
# to its real path, run namespace LFSCK with "-A -r", require exactly one
# unmatched pair to be repaired, and check that fid2path now resolves
# correctly. Unlike 22a, "guard" is left in place. Needs at least 2 MDTs.
3722 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3725 echo "The parent_A references the child directory via the name entry_B,"
3726 echo "but the child directory back references another parent_C via its"
# The quotes around .. are escaped so they are printed literally instead of
# closing and reopening the shell string.
3727 echo "\"..\" name entry. The parent_C exists, but there is no the name"
3728 echo "entry_B under the parent_C. Then the namespace LFSCK will repair"
3729 echo "the child directory's \"..\" name entry and its linkEA."
3732 check_mount_and_prep
3734 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
3735 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
3737 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
3738 echo "and bad linkEA. The dummy's dotdot name entry references the"
3739 echo "guard. The dummy's linkEA references n non-exist name entry."
3740 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
3741 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
3742 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
3743 error "(3) Fail to mkdir on MDT0"
# Disarm the fault injection as soon as the directory exists.
3744 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3746 local dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
3747 echo "fid2path should NOT work on the dummy's FID $dummyfid"
# Before the repair the FID must not map back to the real path.
3748 local dummyname=$($LFS fid2path $DIR $dummyfid)
3749 [ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
3750 error "(4) fid2path works unexpectedly."
3752 echo "Trigger namespace LFSCK to repair unmatched pairs"
3753 $START_NAMESPACE -A -r ||
3754 error "(5) Fail to start LFSCK for namespace"
3756 wait_all_targets_blocked namespace completed 6
3758 local repaired=$($SHOW_NAMESPACE |
3759 awk '/^unmatched_pairs_repaired/ { print $2 }')
3760 [ $repaired -eq 1 ] ||
3761 error "(7) Fail to repair unmatched pairs: $repaired"
3763 echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
# dummyname is already declared local above; plain reassignment suffices.
3764 dummyname=$($LFS fid2path $DIR $dummyfid)
3765 [ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
3766 error "(8) fid2path does not work"
3768 run_test 22b "LFSCK can repair unmatched pairs (2)"
# Statements of LFSCK test 23a (registered by the run_test line that ends this
# block).
# NOTE(review): the enclosing function header and closing brace are not
# visible in this excerpt - confirm against the full file.
#
# Flow: create d0 on MDT0 and d0/d1 on MDT1, rmdir d1 while fail_loc 0x1620 is
# armed on mds2 so that a dangling name entry is left behind, then run
# namespace LFSCK twice: first with "-A -r" (the entry is counted as repaired
# but 'ls' must still fail), then with "-A -r -C" (after which 'ls' must
# succeed). Needs at least 2 MDTs.
3771 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3774 echo "The name entry is there, but the MDT-object for such name "
3775 echo "entry does not exist. The namespace LFSCK should find out "
3776 echo "and repair the inconsistency as required."
3779 check_mount_and_prep
3781 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3782 $LFS mkdir -i 1 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT1"
3784 echo "Inject failure stub on MDT1 to simulate dangling name entry"
3785 #define OBD_FAIL_LFSCK_DANGLING2 0x1620
3786 do_facet mds2 $LCTL set_param fail_loc=0x1620
3787 rmdir $DIR/$tdir/d0/d1 || error "(3) Fail to rmdir d1"
# Disarm the fault injection right after the rmdir.
3788 do_facet mds2 $LCTL set_param fail_loc=0
3790 echo "'ls' should fail because of dangling name entry"
# A successful ls is the failure case here, hence '&& error'.
3791 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(4) ls should fail."
3793 echo "Trigger namespace LFSCK to find out dangling name entry"
3794 $START_NAMESPACE -A -r ||
3795 error "(5) Fail to start LFSCK for namespace"
3797 wait_all_targets_blocked namespace completed 6
3799 local repaired=$($SHOW_NAMESPACE |
3800 awk '/^dangling_repaired/ { print $2 }')
3801 [ $repaired -eq 1 ] ||
3802 error "(7) Fail to repair dangling name entry: $repaired"
3804 echo "'ls' should fail because not re-create MDT-object by default"
3805 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(8) ls should fail."
3807 echo "Trigger namespace LFSCK again to repair dangling name entry"
# Second pass adds -C; per the echo texts, the lost MDT-object is not
# re-created by default, so this pass is the one expected to restore it.
3808 $START_NAMESPACE -A -r -C ||
3809 error "(9) Fail to start LFSCK for namespace"
3811 wait_all_targets_blocked namespace completed 10
3813 repaired=$($SHOW_NAMESPACE |
3814 awk '/^dangling_repaired/ { print $2 }')
3815 [ $repaired -eq 1 ] ||
3816 error "(11) Fail to repair dangling name entry: $repaired"
3818 echo "'ls' should success after namespace LFSCK repairing"
3819 ls -ail $DIR/$tdir/d0/d1 > /dev/null || error "(12) ls should success."
3821 run_test 23a "LFSCK can repair dangling name entry (1)"
# test_23b: dangling name entry whose real target has other hard links.
#
# A hard link "foo" to f0 is created while OBD_FAIL_LFSCK_DANGLING3 is armed
# with fail_val set to the OID of f1; f1 is then unlinked, after which 'ls'
# on "foo" is expected to fail. A namespace LFSCK run must then report
# dangling_repaired == 1 and multiple_linked_repaired == 1, and "foo" must
# read back f0's content ("dummy").
test_23b() {
	echo "The objectA has multiple hard links, one of them corresponding"
	echo "to the name entry_B. But there is something wrong for the name"
	echo "entry_B and cause entry_B to references non-exist object_C."
	echo "In the first-stage scanning, the LFSCK will think the entry_B"
	echo "as dangling, and re-create the lost object_C. When the LFSCK"
	echo "comes to the second-stage scanning, it will find that the"
	echo "former re-creating object_C is not proper, and will try to"
	echo "replace the object_C with the real object_A."

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
	$LFS path2fid $DIR/$tdir/d0

	createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"

	echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
	$LFS path2fid $DIR/$tdir/d0/f0

	echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
	$LFS path2fid $DIR/$tdir/d0/f1

	# First ':'-separated field of each FID (the sequence part).
	local SEQ0
	local SEQ1
	SEQ0=$($LFS path2fid $DIR/$tdir/d0/f0 | awk -F':' '{print $1}')
	SEQ1=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $1}')

	if [ "$SEQ0" != "$SEQ1" ]; then
		# To guarantee that the f0 and f1 are in the same FID seq
		rm -f $DIR/$tdir/d0/f0 ||
			error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
		echo "dummy" > $DIR/$tdir/d0/f0 ||
			error "(3.2) Fail to touch on MDT0"
		$LFS path2fid $DIR/$tdir/d0/f0
	fi

	# Second FID field of f1, converted to decimal for fail_val.
	local OID
	OID=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $2}')
	OID=$(printf %d "$OID")

	echo "Inject failure stub on MDT0 to simulate dangling name entry"
	#define OBD_FAIL_LFSCK_DANGLING3	0x1621
	do_facet $SINGLEMDS $LCTL set_param fail_val=$OID fail_loc=0x1621
	ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
	do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0

	# If there is creation after the dangling injection, it may re-use
	# the just released local object (inode) that is referenced by the
	# dangling name entry. It will fail the dangling injection.
	# So before deleting the target object for the dangling name entry,
	# remove some other objects to avoid the target object being reused
	# by some potential creations. LU-7429
	unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"

	rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"

	echo "'ls' should fail because of dangling name entry"
	ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
		error "(6) ls should fail."

	echo "Trigger namespace LFSCK to find out dangling name entry"
	$START_NAMESPACE -r -C ||
		error "(7) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Dump the LFSCK state so the failure can be diagnosed.
		$SHOW_NAMESPACE
		error "(8) unexpected status"
	}

	local repaired
	repaired=$($SHOW_NAMESPACE |
		   awk '/^dangling_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(9) Fail to repair dangling name entry: $repaired"

	repaired=$($SHOW_NAMESPACE |
		   awk '/^multiple_linked_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(10) Fail to drop the former created object: $repaired"

	# "foo" must now resolve to f0's data again.
	local data
	data=$(cat $DIR/$tdir/d0/foo)
	[ "$data" == "dummy" ] ||
		error "(11) The $DIR/$tdir/d0/foo is not recovered: $data"
}
run_test 23b "LFSCK can repair dangling name entry (2)"
# cleanup_23c: common tail of test_23c.
#
# Clears the fail_loc/fail_val injection on $SINGLEMDS, waits (up to 32s)
# for the namespace LFSCK status to become "completed", then stops the
# full debug logging that test_23c started.
cleanup_23c() {
	do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Dump the LFSCK state so the failure can be diagnosed.
		$SHOW_NAMESPACE
		error "(10) unexpected status"
	}

	stop_full_debug_logging
}

# test_23c: dangling name entry whose re-created object gets modified.
#
# Same injection as test_23b (OBD_FAIL_LFSCK_DANGLING3 with the OID of f1),
# but the LFSCK is delayed with OBD_FAIL_LFSCK_DELAY3 so that the test can
# append data to the re-created "foo" before the scan finishes. The test
# then requires dangling_repaired == 1 and that "foo" does NOT read back
# as "dummy".
test_23c() {
	echo "The objectA has multiple hard links, one of them corresponding"
	echo "to the name entry_B. But there is something wrong for the name"
	echo "entry_B and cause entry_B to references non-exist object_C."
	echo "In the first-stage scanning, the LFSCK will think the entry_B"
	echo "as dangling, and re-create the lost object_C. And then others"
	echo "modified the re-created object_C. When the LFSCK comes to the"
	echo "second-stage scanning, it will find that the former re-creating"
	echo "object_C maybe wrong and try to replace the object_C with the"
	echo "real object_A. But because object_C has been modified, so the"
	echo "LFSCK cannot replace it."

	start_full_debug_logging

	check_mount_and_prep

	local parent_fid
	local f0_fid
	local f1_fid

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
	parent_fid="$($LFS path2fid $DIR/$tdir/d0)"
	echo "parent_fid=$parent_fid"

	createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"

	echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
	f0_fid="$($LFS path2fid $DIR/$tdir/d0/f0)"
	echo "f0_fid=$f0_fid"

	echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
	f1_fid="$($LFS path2fid $DIR/$tdir/d0/f1)"
	echo "f1_fid=$f1_fid"

	# Compare the text before the first ':' of both FIDs. The previous
	# check expanded the never-assigned names fid_f0/fid_f1 and used a
	# regex-style pattern in a glob context, so it could never be true.
	if [ "${f0_fid%%:*}" != "${f1_fid%%:*}" ]; then
		# To guarantee that the f0 and f1 are in the same FID seq
		rm -f $DIR/$tdir/d0/f0 ||
			error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
		echo "dummy" > $DIR/$tdir/d0/f0 ||
			error "(3.2) Fail to touch on MDT0"
		f0_fid="$($LFS path2fid $DIR/$tdir/d0/f0)"
		echo "f0_fid=$f0_fid (replaced)"
	fi

	# Second ':'-separated field of f1's FID, passed as fail_val below.
	local oid
	oid=$(awk -F':' '{ print $2 }' <<< "$f1_fid")

	echo "Inject failure stub on MDT0 to simulate dangling name entry"
	#define OBD_FAIL_LFSCK_DANGLING3	0x1621
	do_facet $SINGLEMDS $LCTL set_param fail_val=$oid fail_loc=0x1621
	ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
	do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0

	# If there is creation after the dangling injection, it may re-use
	# the just released local object (inode) that is referenced by the
	# dangling name entry. It will fail the dangling injection.
	# So before deleting the target object for the dangling name entry,
	# remove some other objects to avoid the target object being reused
	# by some potential creations. LU-7429
	unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"

	rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"

	echo "'ls' should fail because of dangling name entry"
	ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
		error "(6) ls should fail."

	#define OBD_FAIL_LFSCK_DELAY3		0x1602
	do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602

	echo "Trigger namespace LFSCK to find out dangling name entry"
	$START_NAMESPACE -r -C ||
		error "(7) Fail to start LFSCK for namespace"

	# Wait for "foo" to show up with size 0.
	wait_update_facet client "stat -c%s $DIR/$tdir/d0/foo" "0" $LTIME || {
		# While unexpected by the test, it is valid for LFSCK to repair
		# the link to the original object before any data is written.
		local size
		size=$(stat -c %s $DIR/$tdir/d0/foo)

		if [[ "$size" == "6" &&
		      "$(<$DIR/$tdir/d0/foo)" == "dummy" ]]; then
			log "LFSCK repaired file prematurely"
			cleanup_23c
			return 0
		fi

		stat $DIR/$tdir/d0/foo
		# Dump the LFSCK state so the failure can be diagnosed.
		$SHOW_NAMESPACE
		error "(8) unexpected size"
	}

	# Modify "foo" while the LFSCK is still delayed.
	echo "data" >> $DIR/$tdir/d0/foo || error "(9) Fail to write"
	cancel_lru_locks osc

	cleanup_23c

	local repaired
	repaired=$($SHOW_NAMESPACE |
		   awk '/^dangling_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(11) Fail to repair dangling name entry: $repaired"

	local data
	data=$(cat $DIR/$tdir/d0/foo)
	[ "$data" != "dummy" ] ||
		error "(12) The $DIR/$tdir/d0/foo should not be recovered"
}
run_test 23c "LFSCK can repair dangling name entry (3)"
# test_24: two MDT-objects back-reference the same name entry.
#
# With OBD_FAIL_LFSCK_MUL_REF armed, d0/dummy/foo is created and removed
# again. After a namespace LFSCK run on all targets the test requires
# multiple_referenced_repaired == 1 and an entry named
# "<cfid>-<pfid>-D-0" under .lustre/lost+found/MDT0000/.
# Skipped with fewer than 2 MDTs or when FILESET is set.
test_24() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
	# "&& return" added to match the other skip checks in this file.
	[ -n "$FILESET" ] && skip "Not functional for FILESET set" && return

	echo "Two MDT-objects back reference the same name entry via their"
	echo "each own linkEA entry, but the name entry only references one"
	echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
	echo "for the MDT-object that is not recognized. If such MDT-object"
	echo "has no other linkEA entry after the removing, then the LFSCK"
	echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"

	mkdir $DIR/$tdir/d0/guard || error "(1) Fail to mkdir guard"
	$LFS path2fid $DIR/$tdir/d0/guard

	mkdir $DIR/$tdir/d0/dummy || error "(2) Fail to mkdir dummy"
	$LFS path2fid $DIR/$tdir/d0/dummy

	# The parent FID used in the orphan's name depends on the backend.
	local pfid
	if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
		pfid=$($LFS path2fid $DIR/$tdir/d0/guard)
	else
		pfid=$($LFS path2fid $DIR/$tdir/d0/dummy)
	fi

	touch $DIR/$tdir/d0/guard/foo ||
		error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
	echo "that references $DIR/$tdir/d0/guard/foo."
	echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
	echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
	echo "there with the same linkEA entry as another MDT-object"
	echo "$DIR/$tdir/d0/guard/foo has"

	#define OBD_FAIL_LFSCK_MUL_REF		0x1622
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
	$LFS mkdir -i 0 $DIR/$tdir/d0/dummy/foo ||
		error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
	$LFS path2fid $DIR/$tdir/d0/dummy/foo
	local cfid
	cfid=$($LFS path2fid $DIR/$tdir/d0/dummy/foo)
	rmdir $DIR/$tdir/d0/dummy/foo ||
		error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	echo "stat $DIR/$tdir/d0/dummy/foo should fail"
	stat $DIR/$tdir/d0/dummy/foo > /dev/null 2>&1 &&
		error "(6) stat successfully unexpectedly"

	echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
	$START_NAMESPACE -A -r ||
		error "(7) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 8

	local repaired
	repaired=$($SHOW_NAMESPACE |
		   awk '/^multiple_referenced_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
	error "(9) Fail to repair multiple referenced name entry: $repaired"

	echo "There should be an orphan under .lustre/lost+found/MDT0000/"
	[ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
		error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"

	local cname="$cfid-$pfid-D-0"
	ls -ail $MOUNT/.lustre/lost+found/MDT0000/$cname ||
		error "(11) .lustre/lost+found/MDT0000/ should not be empty"
}
run_test 24 "LFSCK can repair multiple-referenced name entry"
# test_25: wrong file type stored in a name entry (ldiskfs only).
#
# d0/foo is created while OBD_FAIL_LFSCK_BAD_TYPE is armed. After a
# namespace LFSCK run the test requires bad_file_type_repaired == 1 and
# that listing d0 succeeds.
test_25() {
	[ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
		skip "ldiskfs only test" && return

	echo "The file type in the name entry does not match the file type"
	echo "claimed by the referenced object. Then the LFSCK will update"
	echo "the file type in the name entry."

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "the file type stored in the name entry is wrong."

	#define OBD_FAIL_LFSCK_BAD_TYPE		0x1623
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
	touch $DIR/$tdir/d0/foo || error "(2) Fail to touch $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	echo "Trigger namespace LFSCK to repair bad file type in the name entry"
	$START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		# Dump the LFSCK state so the failure can be diagnosed.
		$SHOW_NAMESPACE
		error "(4) unexpected status"
	}

	local repaired
	repaired=$($SHOW_NAMESPACE |
		   awk '/^bad_file_type_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
	error "(5) Fail to repair bad file type in name entry: $repaired"

	ls -ail $DIR/$tdir/d0 || error "(6) Fail to 'ls' the $DIR/$tdir/d0"
}
run_test 25 "LFSCK can repair bad file type in the name entry"
# test_26a: lost local name entry.
#
# d0/foo (which also has the hard link d0/dummy) is unlinked while
# OBD_FAIL_LFSCK_NO_NAMEENTRY is armed. After a namespace LFSCK run on all
# targets the test requires lost_dirent_repaired == 1, that d0/foo can be
# listed again, and that its FID is unchanged.
test_26a() {
	echo "The local name entry back referenced by the MDT-object is lost."
	echo "The namespace LFSCK will add the missing local name entry back"
	echo "to the normal namespace."

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
	# Remember the FID so it can be compared after the repair.
	local foofid
	foofid=$($LFS path2fid $DIR/$tdir/d0/foo)

	ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rm -f $DIR/$tdir/d0/foo || error "(4) Fail to unlink $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
		error "(5) 'ls' should fail"

	echo "Trigger namespace LFSCK to repair the missing remote name entry"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 7

	local repaired
	repaired=$($SHOW_NAMESPACE |
		   awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(8) Fail to repair lost dirent: $repaired"

	ls -ail $DIR/$tdir/d0/foo ||
		error "(9) Fail to 'ls' $DIR/$tdir/d0/foo"

	local foofid2
	foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
	[ "$foofid" == "$foofid2" ] ||
		error "(10) foo's FID changed: $foofid, $foofid2"
}
run_test 26a "LFSCK can add the missing local name entry back to the namespace"
# test_26b: lost remote name entry (needs >= 2 MDTs).
#
# d0 is created with "-i 1" and d0/foo with "-i 0"; d0/foo is then removed
# while OBD_FAIL_LFSCK_NO_NAMEENTRY is armed. After a namespace LFSCK run
# on all targets the test requires lost_dirent_repaired == 1, that d0/foo
# can be listed again, and that its FID is unchanged.
test_26b() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return

	echo "The remote name entry back referenced by the MDT-object is lost."
	echo "The namespace LFSCK will add the missing remote name entry back"
	echo "to the normal namespace."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	$LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
	# Remember the FID so it can be compared after the repair.
	local foofid
	foofid=$($LFS path2fid $DIR/$tdir/d0/foo)

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
		error "(4) 'ls' should fail"

	echo "Trigger namespace LFSCK to repair the missing remote name entry"
	$START_NAMESPACE -r -A ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	local repaired
	repaired=$($SHOW_NAMESPACE |
		   awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(7) Fail to repair lost dirent: $repaired"

	ls -ail $DIR/$tdir/d0/foo ||
		error "(8) Fail to 'ls' $DIR/$tdir/d0/foo"

	local foofid2
	foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
	[ "$foofid" == "$foofid2" ] ||
		error "(9) foo's FID changed: $foofid, $foofid2"
}
run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
# test_27a: lost local parent directory.
#
# d0/foo and its hard link d0/dummy are unlinked while
# OBD_FAIL_LFSCK_NO_NAMEENTRY is armed, then d0 itself is removed. After a
# namespace LFSCK run on all targets the test requires
# lost_dirent_repaired == 1 and at least one "*-P-*" entry under
# .lustre/lost+found/MDT0000/. Skipped when FILESET is set.
test_27a() {
	# "&& return" added to match the other skip checks in this file.
	[ -n "$FILESET" ] && skip "Not functional for FILESET set" && return

	echo "The local parent referenced by the MDT-object linkEA is lost."
	echo "The namespace LFSCK will re-create the lost parent as orphan."

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
	ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system. And then remove"
	echo "another hard link and the parent directory."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rm -f $DIR/$tdir/d0/foo ||
		error "(4) Fail to unlink $DIR/$tdir/d0/foo"
	rm -f $DIR/$tdir/d0/dummy ||
		error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	rm -rf $DIR/$tdir/d0 || error "(5) Fail to unlink the dir d0"
	ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(6) 'ls' should fail"

	echo "Trigger namespace LFSCK to repair the lost parent"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 7

	local repaired
	repaired=$($SHOW_NAMESPACE |
		   awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(8) Fail to repair lost dirent: $repaired"

	echo "There should be an orphan under .lustre/lost+found/MDT0000/"
	[ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
		error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"

	ls -ail $MOUNT/.lustre/lost+found/MDT0000/

	# Quote the pattern so that find, not the shell, does the matching;
	# unquoted it would be expanded against the current directory first.
	local cname
	cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-P-*')
	[ -n "$cname" ] ||
		error "(10) .lustre/lost+found/MDT0000/ should not be empty"
}
run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
# test_27b: lost remote parent directory (needs >= 2 MDTs).
#
# d0 is created with "-i 1" and d0/foo with "-i 0"; d0/foo is removed while
# OBD_FAIL_LFSCK_NO_NAMEENTRY is armed, then d0 is removed. After a
# namespace LFSCK run on all targets the test requires
# lost_dirent_repaired == 1 and at least one "*-P-*" entry under
# .lustre/lost+found/MDT0001/. Skipped when FILESET is set.
test_27b() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
	# "&& return" added to match the other skip checks in this file.
	[ -n "$FILESET" ] && skip "Not functional for FILESET set" && return

	echo "The remote parent referenced by the MDT-object linkEA is lost."
	echo "The namespace LFSCK will re-create the lost parent as orphan."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	$LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"

	$LFS path2fid $DIR/$tdir/d0

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system. And then remove"
	echo "the parent directory."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	rmdir $DIR/$tdir/d0 || error "(4) Fail to unlink the dir d0"
	ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(5) 'ls' should fail"

	echo "Trigger namespace LFSCK to repair the missing remote name entry"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 7

	local repaired
	repaired=$($SHOW_NAMESPACE |
		   awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(8) Fail to repair lost dirent: $repaired"

	ls -ail $MOUNT/.lustre/lost+found/

	echo "There should be an orphan under .lustre/lost+found/MDT0001/"
	[ -d $MOUNT/.lustre/lost+found/MDT0001 ] ||
		error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"

	ls -ail $MOUNT/.lustre/lost+found/MDT0001/

	# Quote the pattern so that find, not the shell, does the matching;
	# unquoted it would be expanded against the current directory first.
	local cname
	cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-P-*')
	[ -n "$cname" ] ||
		error "(10) .lustre/lost+found/MDT0001/ should not be empty"
}
run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
# test_28: orphan handling is skipped on an MDT that failed verification
# (needs >= 2 MDTs).
#
# d1/a1 and d2/a2 are removed while OBD_FAIL_LFSCK_NO_NAMEENTRY is armed on
# mds1 and mds2. The first namespace LFSCK runs with
# OBD_FAIL_LFSCK_BAD_NETWORK armed on mds2 and is expected to end "partial"
# on mds1 (0 lost dirents repaired) and "completed" on mds2 (1 repaired).
# A second, clean run is expected to repair 1 on mds1 and 0 on mds2.
test_28() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return

	echo "The target name entry is lost. The LFSCK should insert the"
	echo "orphan MDT-object under .lustre/lost+found/MDTxxxx. But if"
	echo "the MDT (on which the orphan MDT-object resides) has ever"
	echo "failed to respond some name entry verification during the"
	echo "first stage-scanning, then the LFSCK should skip to handle"
	echo "orphan MDT-object on this MDT. But other MDTs should not"
	echo "be affected."

	check_mount_and_prep
	$LFS mkdir -i 0 $DIR/$tdir/d1
	$LFS mkdir -i 1 $DIR/$tdir/d1/a1
	$LFS mkdir -i 1 $DIR/$tdir/d1/a2

	$LFS mkdir -i 1 $DIR/$tdir/d2
	$LFS mkdir -i 0 $DIR/$tdir/d2/a1
	$LFS mkdir -i 0 $DIR/$tdir/d2/a2

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "d1/a1's name entry will be removed, but the d1/a1's object"
	echo "and its linkEA are kept in the system. And the case that"
	echo "d2/a2's name entry will be removed, but the d2/a2's object"
	echo "and its linkEA are kept in the system."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet mds1 $LCTL set_param fail_loc=0x1624
	do_facet mds2 $LCTL set_param fail_loc=0x1624
	rmdir $DIR/$tdir/d1/a1 || error "(1) Fail to rmdir $DIR/$tdir/d1/a1"
	rmdir $DIR/$tdir/d2/a2 || error "(2) Fail to rmdir $DIR/$tdir/d2/a2"
	do_facet mds1 $LCTL set_param fail_loc=0
	do_facet mds2 $LCTL set_param fail_loc=0

	cancel_lru_locks mdc
	cancel_lru_locks osc

	echo "Inject failure, to simulate the MDT0 fail to handle"
	echo "MDT1 LFSCK request during the first-stage scanning."
	#define OBD_FAIL_LFSCK_BAD_NETWORK	0x161c
	do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0

	echo "Trigger namespace LFSCK on all devices to find out orphan object"
	$START_NAMESPACE -r -A ||
		error "(3) Fail to start LFSCK for namespace"

	wait_update_facet mds1 "$LCTL get_param -n \
		mdd.$(facet_svc mds1).lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "partial" 32 || {
		error "(4) mds1 is not the expected 'partial'"
	}

	wait_update_facet mds2 "$LCTL get_param -n \
		mdd.$(facet_svc mds2).lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
		error "(5) mds2 is not the expected 'completed'"
	}

	do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0

	# One declaration is enough; the variable is reused for every check.
	local repaired

	repaired=$(do_facet mds1 $LCTL get_param -n \
		   mdd.$(facet_svc mds1).lfsck_namespace |
		   awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 0 ] ||
		error "(6) Expect 0 fixed on mds1, but got: $repaired"

	repaired=$(do_facet mds2 $LCTL get_param -n \
		   mdd.$(facet_svc mds2).lfsck_namespace |
		   awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(7) Expect 1 fixed on mds2, but got: $repaired"

	echo "Trigger namespace LFSCK on all devices again to cleanup"
	$START_NAMESPACE -r -A ||
		error "(8) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 9

	repaired=$(do_facet mds1 $LCTL get_param -n \
		   mdd.$(facet_svc mds1).lfsck_namespace |
		   awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(10) Expect 1 fixed on mds1, but got: $repaired"

	repaired=$(do_facet mds2 $LCTL get_param -n \
		   mdd.$(facet_svc mds2).lfsck_namespace |
		   awk '/^lost_dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 0 ] ||
		error "(11) Expect 0 fixed on mds2, but got: $repaired"
}
run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
# test_29a: nlink larger than the number of known name entries.
#
# A hard link is created while OBD_FAIL_LFSCK_MORE_NLINK is armed, after
# which stat is expected to report 3 links. After a namespace LFSCK run
# the test requires nlinks_repaired == 1 and a link count of 2.
# The test is currently disabled (see the note at its run_test line).
test_29a() {
	echo "The object's nlink attribute is larger than the object's known"
	echo "name entries count. The LFSCK will repair the object's nlink"
	echo "attribute to match the known name entries count"

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"

	echo "Inject failure stub on MDT0 to simulate the case that foo's"
	echo "nlink attribute is larger than its name entries count."

	#define OBD_FAIL_LFSCK_MORE_NLINK	0x1625
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
	ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	# Confirm the injection took effect before running the LFSCK.
	cancel_lru_locks mdc
	local count
	count=$(stat --format=%h $DIR/$tdir/d0/foo)
	[ "$count" -eq 3 ] || error "(4) Cannot inject error: $count"

	echo "Trigger namespace LFSCK to repair the nlink count"
	$START_NAMESPACE -r -A ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	local repaired
	repaired=$($SHOW_NAMESPACE |
		   awk '/^nlinks_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(7) Fail to repair nlink count: $repaired"

	cancel_lru_locks mdc
	count=$(stat --format=%h $DIR/$tdir/d0/foo)
	[ "$count" -eq 2 ] || error "(8) Fail to repair nlink count: $count"
}
# Disable 29a, we only allow nlink to be updated if the known linkEA
# entries is larger than nlink count.
#
#run_test 29a "LFSCK can repair bad nlink count (1)"
# test_29b: nlink smaller than the number of known name entries.
#
# A hard link is created while OBD_FAIL_LFSCK_LESS_NLINK is armed, after
# which stat is expected to report 1 link. After a namespace LFSCK run
# the test requires nlinks_repaired == 1 and a link count of 2.
test_29b() {
	echo "The object's nlink attribute is smaller than the object's known"
	echo "name entries count. The LFSCK will repair the object's nlink"
	echo "attribute to match the known name entries count"

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"

	echo "Inject failure stub on MDT0 to simulate the case that foo's"
	echo "nlink attribute is smaller than its name entries count."

	#define OBD_FAIL_LFSCK_LESS_NLINK	0x1626
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
	ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	# Confirm the injection took effect before running the LFSCK.
	cancel_lru_locks mdc
	local count
	count=$(stat --format=%h $DIR/$tdir/d0/foo)
	[ "$count" -eq 1 ] || error "(4) Cannot inject error: $count"

	echo "Trigger namespace LFSCK to repair the nlink count"
	$START_NAMESPACE -r -A ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	local repaired
	repaired=$($SHOW_NAMESPACE |
		   awk '/^nlinks_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(7) Fail to repair nlink count: $repaired"

	cancel_lru_locks mdc
	count=$(stat --format=%h $DIR/$tdir/d0/foo)
	[ "$count" -eq 2 ] || error "(8) Fail to repair nlink count: $count"
}
run_test 29b "LFSCK can repair bad nlink count (2)"
# test_29c: linkEA size limitation and the overflow mark.
#
# 150 hard links to guard/f0 are created under foo/, then 100 of them are
# removed. With >= 2 MDTs, migrating guard/ is expected to fail (f0 keeps
# its FID) both before and after the removal. After a namespace LFSCK run
# the test requires linkea_overflow_cleared == 1 and nlinks_repaired == 0,
# and (with >= 2 MDTs) that the migration now succeeds and changes the FID.
test_29c() {
	echo "The namespace LFSCK will create many hard links to the target"
	echo "file as to exceed the linkEA size limitation. Under such case"
	echo "the linkEA will be marked as overflow that will prevent the"
	echo "target file to be migrated. Then remove some hard links to"
	echo "make the left hard links to be held within the linkEA size"
	echo "limitation. But before the namespace LFSCK adding all the"
	echo "missed linkEA entries back, the overflow mark (timestamp)"
	echo "will not be cleared."

	check_mount_and_prep

	mkdir -p $DIR/$tdir/guard || error "(0.1) Fail to mkdir"
	$LFS mkdir -i $((MDSCOUNT - 1)) $DIR/$tdir/foo ||
		error "(0.2) Fail to mkdir"
	touch $DIR/$tdir/guard/f0 || error "(1) Fail to create"
	local oldfid
	oldfid=$($LFS path2fid $DIR/$tdir/guard/f0)
	# Declared once; re-read after every migration attempt below.
	local newfid

	# define MAX_LINKEA_SIZE	4096
	# sizeof(link_ea_header) = 24
	# sizeof(link_ea_entry) = 18
	# nlink_min=$(((MAX_LINKEA_SIZE - sizeof(link_ea_header)) /
	#	      (sizeof(link_ea_entry) + name_length))
	# If the average name length is 12 bytes, then 150 hard links
	# is totally enough to overflow the linkEA
	echo "Create 150 hard links should succeed although the linkEA overflow"
	createmany -l $DIR/$tdir/guard/f0 $DIR/$tdir/foo/ttttttttttt 150 ||
		error "(2) Fail to hard link"

	cancel_lru_locks mdc
	if [ $MDSCOUNT -ge 2 ]; then
		$LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null &&
			error "(3.1) Migrate should fail"

		echo "The object with linkEA overflow should NOT be migrated"
		newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
		[ "$newfid" == "$oldfid" ] ||
			error "(3.2) Migrate should fail: $newfid != $oldfid"
	fi

	# Remove 100 hard links, then the linkEA should have space
	# to hold the missed linkEA entries.
	echo "Remove 100 hard links to save space for the missed linkEA entries"
	unlinkmany $DIR/$tdir/foo/ttttttttttt 100 || error "(4) Fail to unlink"

	if [ $MDSCOUNT -ge 2 ]; then
		$LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null &&
			error "(5.1) Migrate should fail"

		# The overflow timestamp is still there, so migration will fail.
		newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
		[ "$newfid" == "$oldfid" ] ||
			error "(5.2) Migrate should fail: $newfid != $oldfid"
	fi

	# sleep 3 seconds to guarantee that the overflow is recognized
	sleep 3

	echo "Trigger namespace LFSCK to clear the overflow timestamp"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 7

	local repaired
	repaired=$($SHOW_NAMESPACE |
		   awk '/^linkea_overflow_cleared/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(8) Fail to clear linkea overflow: $repaired"

	repaired=$($SHOW_NAMESPACE |
		   awk '/^nlinks_repaired/ { print $2 }')
	[ "$repaired" -eq 0 ] ||
		error "(9) Unexpected nlink repaired: $repaired"

	if [ $MDSCOUNT -ge 2 ]; then
		$LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
			error "(10.1) Migrate failure"

		# Migration should succeed after clear the overflow timestamp.
		newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
		[ "$newfid" != "$oldfid" ] ||
			error "(10.2) Migrate should succeed"

		ls -l $DIR/$tdir/foo > /dev/null ||
			error "(11) 'ls' failed after migration"
	fi

	rm -f $DIR/$tdir/guard/f0 || error "(12) Fail to unlink f0"
	rm -rf $DIR/$tdir/foo || error "(13) Fail to rmdir foo"
}
run_test 29c "verify linkEA size limitation"
# test_30: recover orphans from the backend /lost+found (ldiskfs only).
#
# foo/d0 is created while OBD_FAIL_LFSCK_NO_LINKEA is armed; d0/d1, d0/f1,
# d0 and foo/f0 are then removed while OBD_FAIL_LFSCK_NO_NAMEENTRY is
# armed. The client is unmounted, MDT0 is stopped, e2fsck is run on it and
# it is restarted. After a namespace LFSCK run the test requires
# local_lost_found_moved >= 4, that foo/f0 is visible again, and that d0
# (with d1 and f1 inside) appears as "<cfid>-<pfid>-D-0" under
# .lustre/lost+found/MDT0000/. Skipped when FILESET is set.
test_30() {
	[ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
		skip "ldiskfs only test" && return
	# "&& return" added to match the other skip checks in this file.
	[ -n "$FILESET" ] && skip "Not functional for FILESET set" && return

	echo "The namespace LFSCK will move the orphans from backend"
	echo "/lost+found directory to normal client visible namespace"
	echo "or to global visible ./lustre/lost+found/MDTxxxx/ directory"

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/foo || error "(1) Fail to mkdir foo"
	touch $DIR/$tdir/foo/f0 || error "(2) Fail to touch f1"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "directory d0 has no linkEA entry, then the LFSCK will"
	echo "move it into .lustre/lost+found/MDTxxxx/ later."

	#define OBD_FAIL_LFSCK_NO_LINKEA	0x161d
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
	mkdir $DIR/$tdir/foo/d0 || error "(3) Fail to mkdir d0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	# Both FIDs are needed later to build the expected orphan name.
	local pfid
	local cfid
	pfid=$($LFS path2fid $DIR/$tdir/foo)
	cfid=$($LFS path2fid $DIR/$tdir/foo/d0)

	touch $DIR/$tdir/foo/d0/f1 || error "(4) Fail to touch f1"
	mkdir $DIR/$tdir/foo/d0/d1 || error "(5) Fail to mkdir d1"

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "object's name entry will be removed, but not destroy the"
	echo "object. Then backend e2fsck will handle it as orphan and"
	echo "add them into the backend /lost+found directory."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rmdir $DIR/$tdir/foo/d0/d1 || error "(6) Fail to rmdir d1"
	rm -f $DIR/$tdir/foo/d0/f1 || error "(7) Fail to unlink f1"
	rmdir $DIR/$tdir/foo/d0 || error "(8) Fail to rmdir d0"
	rm -f $DIR/$tdir/foo/f0 || error "(9) Fail to unlink f0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	umount_client $MOUNT || error "(10) Fail to stop client!"

	stop $SINGLEMDS || error "(11) Fail to stop MDT0"

	run_e2fsck $(facet_host $SINGLEMDS) $MDT_DEVNAME "-y" ||
		error "(12) Fail to run e2fsck"

	start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
		error "(13) Fail to start MDT0"

	echo "Trigger namespace LFSCK to recover backend orphans"
	$START_NAMESPACE -r -A ||
		error "(14) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 15

	local repaired
	repaired=$($SHOW_NAMESPACE |
		   awk '/^local_lost_found_moved/ { print $2 }')
	[ "$repaired" -ge 4 ] ||
		error "(16) Fail to recover backend orphans: $repaired"

	mount_client $MOUNT || error "(17) Fail to start client!"

	stat $DIR/$tdir/foo/f0 || error "(18) f0 is not recovered"

	ls -ail $MOUNT/.lustre/lost+found/

	echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
	[ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
		error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"

	ls -ail $MOUNT/.lustre/lost+found/MDT0000/

	local cname=$MOUNT/.lustre/lost+found/MDT0000/${cfid}-${pfid}-D-0
	# cname is a constructed path and therefore never empty, so check
	# that the orphan directory really exists instead.
	[ -d "$cname" ] || error "(20) d0 is not recovered"

	stat ${cname}/d1 || error "(21) d1 is not recovered"
	stat ${cname}/f1 || error "(22) f1 is not recovered"
}
run_test 30 "LFSCK can recover the orphans from backend /lost+found"
# test_31a: name entries with a bad name hash in a striped directory
# (needs >= 2 MDTs).
#
# $MDSCOUNT sub-directories are created in a striped directory while
# OBD_FAIL_LFSCK_BAD_NAME_HASH is armed on the client with fail_val=0.
# After a namespace LFSCK run the test requires name_hash_repaired >= 1;
# after a client remount every sub-directory must be stat-able and
# removable, and the striped directory itself must be removable.
test_31a() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return

	echo "For the name entry under a striped directory, if the name"
	echo "hash does not match the shard, then the LFSCK will repair"
	echo "the bad name entry"

	check_mount_and_prep

	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"

	echo "Inject failure stub on client to simulate the case that"
	echo "some name entry should be inserted into other non-first"
	echo "shard, but inserted into the first shard by wrong"

	#define OBD_FAIL_LFSCK_BAD_NAME_HASH	0x1628
	$LCTL set_param fail_loc=0x1628 fail_val=0
	createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
		error "(2) Fail to create file under striped directory"
	$LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger namespace LFSCK to repair bad name hash"
	$START_NAMESPACE -r -A ||
		error "(3) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 4

	local repaired
	repaired=$($SHOW_NAMESPACE |
		   awk '/^name_hash_repaired/ { print $2 }')
	[ "$repaired" -ge 1 ] ||
		error "(5) Fail to repair bad name hash: $repaired"

	# Remount the client before verifying the repaired entries.
	umount_client $MOUNT || error "(6) umount failed"
	mount_client $MOUNT || error "(7) mount failed"

	# Keep the loop counter out of the global namespace.
	local i
	for ((i = 0; i < MDSCOUNT; i++)); do
		stat $DIR/$tdir/striped_dir/d$i ||
			error "(8) Fail to stat d$i after LFSCK"
		rmdir $DIR/$tdir/striped_dir/d$i ||
			error "(9) Fail to unlink d$i after LFSCK"
	done

	rmdir $DIR/$tdir/striped_dir ||
		error "(10) Fail to remove the striped directory after LFSCK"
}
run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
# Test 31b: bad name hash in a striped directory (fail_val=1 variant).
# Same flow as the fail_val=0 variant, except that the fault is injected with
# fail_val=1 and the name_hash_repaired counter is read from mds2's
# lfsck_namespace parameter instead of $SHOW_NAMESPACE.
# Skipped when fewer than 2 MDTs are configured.
4758 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4761 echo "For the name entry under a striped directory, if the name"
4762 echo "hash does not match the shard, then the LFSCK will repair"
4763 echo "the bad name entry"
4766 check_mount_and_prep
4768 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4769 error "(1) Fail to create striped directory"
# NOTE(review): "secod" below is a typo for "second"; it is runtime output
# and is intentionally left untouched here.
4771 echo "Inject failure stub on client to simulate the case that"
4772 echo "some name entry should be inserted into other non-second"
4773 echo "shard, but inserted into the secod shard by wrong"
4775 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
# Fault is set locally (plain $LCTL) and cleared right after the creates.
4776 $LCTL set_param fail_loc=0x1628 fail_val=1
4777 createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
4778 error "(2) Fail to create file under striped directory"
4779 $LCTL set_param fail_loc=0 fail_val=0
4781 echo "Trigger namespace LFSCK to repair bad name hash"
4782 $START_NAMESPACE -r -A ||
4783 error "(3) Fail to start LFSCK for namespace"
4785 wait_all_targets_blocked namespace completed 4
# The counter is queried on mds2 here, not on $SINGLEMDS.
4787 local repaired=$(do_facet mds2 $LCTL get_param -n \
4788 mdd.$(facet_svc mds2).lfsck_namespace |
4789 awk '/^name_hash_repaired/ { print $2 }')
4790 [ $repaired -ge 1 ] ||
4791 error "(5) Fail to repair bad name hash: $repaired"
# Remount the client, then verify each entry can be stat'ed and removed.
4793 umount_client $MOUNT || error "(6) umount failed"
4794 mount_client $MOUNT || error "(7) mount failed"
4796 for ((i = 0; i < $MDSCOUNT; i++)); do
4797 stat $DIR/$tdir/striped_dir/d$i ||
4798 error "(8) Fail to stat d$i after LFSCK"
4799 rmdir $DIR/$tdir/striped_dir/d$i ||
4800 error "(9) Fail to unlink d$i after LFSCK"
4803 rmdir $DIR/$tdir/striped_dir ||
4804 error "(10) Fail to remove the striped directory after LFSCK"
4806 run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
# Test 31c: lost master LMV EA, directory untouched afterwards.
# Creates a striped directory while fail_loc 0x1629 is set on $SINGLEMDS,
# runs namespace LFSCK and expects exactly one striped_dirs_repaired. After a
# client remount the directory must list as empty and be removable.
# Skipped when fewer than 2 MDTs are configured.
4809 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4812 echo "For some reason, the master MDT-object of the striped directory"
4813 echo "may lost its master LMV EA. If nobody created files under the"
4814 echo "master directly after the master LMV EA lost, then the LFSCK"
4815 echo "should re-generate the master LMV EA."
4818 check_mount_and_prep
4820 echo "Inject failure stub on MDT0 to simulate the case that the"
4821 echo "master MDT-object of the striped directory lost the LMV EA."
4823 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
# The fault is active only while the striped directory is being created.
4824 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
4825 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4826 error "(1) Fail to create striped directory"
4827 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4829 echo "Trigger namespace LFSCK to re-generate master LMV EA"
4830 $START_NAMESPACE -r -A ||
4831 error "(2) Fail to start LFSCK for namespace"
4833 wait_all_targets_blocked namespace completed 3
# Exactly one repaired striped directory is expected.
4835 local repaired=$($SHOW_NAMESPACE |
4836 awk '/^striped_dirs_repaired/ { print $2 }')
4837 [ $repaired -eq 1 ] ||
4838 error "(4) Fail to re-generate master LMV EA: $repaired"
4840 umount_client $MOUNT || error "(5) umount failed"
4841 mount_client $MOUNT || error "(6) mount failed"
# Any output from ls is treated as a failure of the repair.
4843 local empty=$(ls $DIR/$tdir/striped_dir/)
4844 [ -z "$empty" ] || error "(7) The master LMV EA is not repaired: $empty"
4846 rmdir $DIR/$tdir/striped_dir ||
4847 error "(8) Fail to remove the striped directory after LFSCK"
4849 run_test 31c "Re-generate the lost master LMV EA for striped directory"
# Test 31d: lost master LMV EA, directory modified afterwards.
# Creates a striped directory while fail_loc 0x1629 is set on $SINGLEMDS,
# remounts the client and creates a file in the broken directory, then runs
# namespace LFSCK. Expects striped_dirs_repaired == 0, the existing file to
# remain stat-able, and new creates in the directory to be refused.
# Skipped when fewer than 2 MDTs are configured.
4852 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4855 echo "For some reason, the master MDT-object of the striped directory"
4856 echo "may lost its master LMV EA. If somebody created files under the"
4857 echo "master directly after the master LMV EA lost, then the LFSCK"
4858 echo "should NOT re-generate the master LMV EA, instead, it should"
4859 echo "change the broken striped dirctory as read-only to prevent"
4860 echo "further damage"
4863 check_mount_and_prep
4865 echo "Inject failure stub on MDT0 to simulate the case that the"
4866 echo "master MDT-object of the striped directory lost the LMV EA."
4868 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
# The fault is active only while the striped directory is being created.
4869 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
4870 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4871 error "(1) Fail to create striped directory"
4872 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0
4874 umount_client $MOUNT || error "(2) umount failed"
4875 mount_client $MOUNT || error "(3) mount failed"
# Modify the broken directory before LFSCK runs.
4877 touch $DIR/$tdir/striped_dir/dummy ||
4878 error "(4) Fail to touch under broken striped directory"
4880 echo "Trigger namespace LFSCK to find out the inconsistency"
4881 $START_NAMESPACE -r -A ||
4882 error "(5) Fail to start LFSCK for namespace"
4884 wait_all_targets_blocked namespace completed 6
# No striped directory may be reported as repaired in this scenario.
4886 local repaired=$($SHOW_NAMESPACE |
4887 awk '/^striped_dirs_repaired/ { print $2 }')
4888 [ $repaired -eq 0 ] ||
4889 error "(7) Re-generate master LMV EA unexpected: $repaired"
4891 stat $DIR/$tdir/striped_dir/dummy ||
4892 error "(8) Fail to stat $DIR/$tdir/striped_dir/dummy"
# Negative check: error "(9)" fires only if the touch unexpectedly succeeds.
4894 touch $DIR/$tdir/striped_dir/foo &&
4895 error "(9) The broken striped directory should be read-only"
# chattr -i clears the immutable attribute ahead of the rmdir below.
4897 chattr -i $DIR/$tdir/striped_dir ||
4898 error "(10) Fail to chattr on the broken striped directory"
4900 rmdir $DIR/$tdir/striped_dir ||
4901 error "(11) Fail to remove the striped directory after LFSCK"
4903 run_test 31d "Set broken striped directory (modified after broken) as read-only"
# Test 31e: lost slave LMV EA (fail_val=0 variant).
# Creates a striped directory while fail_loc 0x162a / fail_val=0 is set on
# $SINGLEMDS, runs namespace LFSCK and expects striped_shards_repaired == 1
# as reported by $SHOW_NAMESPACE; the directory must then be removable.
# Skipped when fewer than 2 MDTs are configured.
4906 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4909 echo "For some reason, the slave MDT-object of the striped directory"
4910 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
4911 echo "slave LMV EA."
4914 check_mount_and_prep
4916 echo "Inject failure stub on MDT0 to simulate the case that the"
4917 echo "slave MDT-object (that resides on the same MDT as the master"
4918 echo "MDT-object resides on) lost the LMV EA."
4920 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
# The fault is active only while the striped directory is being created.
4921 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=0
4922 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4923 error "(1) Fail to create striped directory"
4924 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4926 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
4927 $START_NAMESPACE -r -A ||
4928 error "(2) Fail to start LFSCK for namespace"
4930 wait_all_targets_blocked namespace completed 3
# Exactly one repaired shard is expected.
4932 local repaired=$($SHOW_NAMESPACE |
4933 awk '/^striped_shards_repaired/ { print $2 }')
4934 [ $repaired -eq 1 ] ||
4935 error "(4) Fail to re-generate slave LMV EA: $repaired"
4937 rmdir $DIR/$tdir/striped_dir ||
4938 error "(5) Fail to remove the striped directory after LFSCK"
4940 run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
# Test 31f: lost slave LMV EA (fail_val=1 variant).
# Same flow as the fail_val=0 variant, but the fault is injected with
# fail_val=1 and striped_shards_repaired is read from mds2's lfsck_namespace
# parameter. Skipped when fewer than 2 MDTs are configured.
4943 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4946 echo "For some reason, the slave MDT-object of the striped directory"
4947 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
4948 echo "slave LMV EA."
4951 check_mount_and_prep
4953 echo "Inject failure stub on MDT0 to simulate the case that the"
4954 echo "slave MDT-object (that resides on different MDT as the master"
4955 echo "MDT-object resides on) lost the LMV EA."
4957 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
# The fault is active only while the striped directory is being created.
4958 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=1
4959 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4960 error "(1) Fail to create striped directory"
4961 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4963 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
4964 $START_NAMESPACE -r -A ||
4965 error "(2) Fail to start LFSCK for namespace"
4967 wait_all_targets_blocked namespace completed 3
# The counter is queried on mds2 here, not on $SINGLEMDS.
4969 local repaired=$(do_facet mds2 $LCTL get_param -n \
4970 mdd.$(facet_svc mds2).lfsck_namespace |
4971 awk '/^striped_shards_repaired/ { print $2 }')
4972 [ $repaired -eq 1 ] ||
4973 error "(4) Fail to re-generate slave LMV EA: $repaired"
4975 rmdir $DIR/$tdir/striped_dir ||
4976 error "(5) Fail to remove the striped directory after LFSCK"
4978 run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
# Test 31g: corrupted stripe index in a slave LMV EA.
# Creates a striped directory while fail_loc 0x162b / fail_val=0 is set on
# $SINGLEMDS, runs namespace LFSCK and expects striped_shards_repaired == 1.
# After a client remount, creating and removing a file in the directory and
# removing the directory itself must all succeed.
# Skipped when fewer than 2 MDTs are configured.
4981 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4984 echo "For some reason, the stripe index in the slave LMV EA is"
4985 echo "corrupted. The LFSCK should repair the slave LMV EA."
4988 check_mount_and_prep
4990 echo "Inject failure stub on MDT0 to simulate the case that the"
4991 echo "slave LMV EA on the first shard of the striped directory"
4992 echo "claims the same index as the second shard claims"
4994 #define OBD_FAIL_LFSCK_BAD_SLAVE_LMV 0x162b
# The fault is active only while the striped directory is being created.
4995 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162b fail_val=0
4996 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4997 error "(1) Fail to create striped directory"
4998 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5000 echo "Trigger namespace LFSCK to repair the slave LMV EA"
5001 $START_NAMESPACE -r -A ||
5002 error "(2) Fail to start LFSCK for namespace"
5004 wait_all_targets_blocked namespace completed 3
# Exactly one repaired shard is expected.
5006 local repaired=$($SHOW_NAMESPACE |
5007 awk '/^striped_shards_repaired/ { print $2 }')
5008 [ $repaired -eq 1 ] ||
5009 error "(4) Fail to repair slave LMV EA: $repaired"
5011 umount_client $MOUNT || error "(5) umount failed"
5012 mount_client $MOUNT || error "(6) mount failed"
# Functional check on the repaired directory: create, unlink, rmdir.
5014 touch $DIR/$tdir/striped_dir/foo ||
5015 error "(7) Fail to touch file after the LFSCK"
5017 rm -f $DIR/$tdir/striped_dir/foo ||
5018 error "(8) Fail to unlink file after the LFSCK"
5020 rmdir $DIR/$tdir/striped_dir ||
5021 error "(9) Fail to remove the striped directory after LFSCK"
5023 run_test 31g "Repair the corrupted slave LMV EA"
# Test 31h: corrupted shard name entry in a striped directory.
# Creates a striped directory while fail_loc 0x162c / fail_val=0 is set on
# $SINGLEMDS, runs namespace LFSCK and expects dirent_repaired == 1. After a
# client remount, creating and removing a file in the directory and removing
# the directory itself must all succeed.
# Skipped when fewer than 2 MDTs are configured.
5026 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5029 echo "For some reason, the shard's name entry in the striped"
5030 echo "directory may be corrupted. The LFSCK should repair the"
5031 echo "bad shard's name entry."
5034 check_mount_and_prep
5036 echo "Inject failure stub on MDT0 to simulate the case that the"
5037 echo "first shard's name entry in the striped directory claims"
5038 echo "the same index as the second shard's name entry claims."
5040 #define OBD_FAIL_LFSCK_BAD_SLAVE_NAME 0x162c
# The fault is active only while the striped directory is being created.
5041 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162c fail_val=0
5042 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
5043 error "(1) Fail to create striped directory"
5044 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
5046 echo "Trigger namespace LFSCK to repair the shard's name entry"
5047 $START_NAMESPACE -r -A ||
5048 error "(2) Fail to start LFSCK for namespace"
5050 wait_all_targets_blocked namespace completed 3
# Exactly one repaired directory entry is expected.
5052 local repaired=$($SHOW_NAMESPACE |
5053 awk '/^dirent_repaired/ { print $2 }')
5054 [ $repaired -eq 1 ] ||
5055 error "(4) Fail to repair shard's name entry: $repaired"
5057 umount_client $MOUNT || error "(5) umount failed"
5058 mount_client $MOUNT || error "(6) mount failed"
# Functional check on the repaired directory: create, unlink, rmdir.
5060 touch $DIR/$tdir/striped_dir/foo ||
5061 error "(7) Fail to touch file after the LFSCK"
5063 rm -f $DIR/$tdir/striped_dir/foo ||
5064 error "(8) Fail to unlink file after the LFSCK"
5066 rmdir $DIR/$tdir/striped_dir ||
5067 error "(9) Fail to remove the striped directory after LFSCK"
5069 run_test 31h "Repair the corrupted shard's name entry"
# Test 32a: LFSCK can be stopped while an OST is down.
# With the client unmounted, starts layout LFSCK under fail_loc 0x162d
# (fail_val=3), checks that it reports 'scanning-phase1', stops ost1, clears
# the fault, stops LFSCK via $STOP_LFSCK and finally restarts ost1.
5074 umount_client $MOUNT
5076 #define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d
5077 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
5078 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
# The status is sampled once, immediately after the start command.
5080 local STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
5081 [ "$STATUS" == "scanning-phase1" ] ||
5082 error "(2) Expect 'scanning-phase1', but got '$STATUS'"
# Take ost1 down while LFSCK is still expected to be scanning.
5085 stop ost1 > /dev/null || error "(3) Fail to stop OST1!"
5087 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
5091 $STOP_LFSCK || error "(4) Fail to stop LFSCK!"
# Bring ost1 back for whatever runs next.
5093 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
5094 error "(5) Fail to start ost1"
5096 run_test 32a "stop LFSCK when some OST failed"
# Test 32b: LFSCK can be stopped while an MDT is down.
# Creates a directory on MDT index 1 with two striped sub-directories,
# unmounts the client, starts namespace LFSCK under fail_loc 0x162d
# (fail_val=3), waits for 'scanning-phase1' on $SINGLEMDS, stops mds2, clears
# the fault, stops LFSCK via $STOP_LFSCK and finally restarts mds2.
# Skipped when fewer than 2 MDTs are configured.
5100 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5103 $LFS mkdir -i 1 $DIR/$tdir/dp ||
5104 error "(1) Fail to create $DIR/$tdir/dp"
5105 $LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc1 ||
5106 error "(2) Fail to create $DIR/$tdir/dp/dc1"
5107 $LFS mkdir -i 0 -c $MDSCOUNT $DIR/$tdir/dp/dc2 ||
5108 error "(3) Fail to create $DIR/$tdir/dp/dc2"
5109 umount_client $MOUNT
5111 #define OBD_FAIL_LFSCK_ENGINE_DELAY 0x162d
5112 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
5113 $START_NAMESPACE -r -A || error "(4) Fail to start LFSCK for namespace!"
# Poll the status field on $SINGLEMDS; the trailing 32 is passed to
# wait_update_facet (presumably a timeout in seconds -- TODO confirm).
5115 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5116 mdd.${MDT_DEV}.lfsck_namespace |
5117 awk '/^status/ { print \\\$2 }'" "scanning-phase1" 32 || {
5119 error "(5) unexpected status"
# Take mds2 down while LFSCK is still expected to be scanning.
5123 stop mds2 > /dev/null || error "(6) Fail to stop MDT2!"
5125 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
5129 $STOP_LFSCK || error "(7) Fail to stop LFSCK!"
# Bring mds2 back for whatever runs next.
5131 start mds2 $(mdsdevname 2) $MOUNT_OPTS_NOSCRUB > /dev/null ||
5132 error "(8) Fail to start MDT2"
5134 run_test 32b "stop LFSCK when some MDT failed"
# Test 33: the options given to lfsck_start show up in the 'param' field.
# Layout LFSCK started with --dryrun -o -r must report
# 'dryrun,all_targets,orphan'; namespace LFSCK started with -e abort -A -r
# must report 'failout,all_targets'.
5140 $START_LAYOUT --dryrun -o -r ||
5141 error "(1) Fail to start layout LFSCK"
5142 wait_all_targets_blocked layout completed 2
# Exact string comparison against the param line printed by $SHOW_LAYOUT.
5144 local PARAMS=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
5145 [ "$PARAMS" == "dryrun,all_targets,orphan" ] ||
5146 error "(3) Expect 'dryrun,all_targets,orphan', got '$PARAMS'"
5148 $START_NAMESPACE -e abort -A -r ||
5149 error "(4) Fail to start namespace LFSCK"
5150 wait_all_targets_blocked namespace completed 5
# PARAMS is reused for the namespace component.
5152 PARAMS=$($SHOW_NAMESPACE | awk '/^param/ { print $2 }')
5153 [ "$PARAMS" == "failout,all_targets" ] ||
5154 error "(6) Expect 'failout,all_targets', got '$PARAMS'"
5156 run_test 33 "check LFSCK paramters"
# Test 34: namespace LFSCK rebuilds a lost agent object.
# Creates a directory on MDT index 1 while fail_loc 0x1630 is set on
# $SINGLEMDS, runs namespace LFSCK and expects dirent_repaired == 1; a second
# LFSCK run must then report dirent_repaired == 0.
# Skipped with fewer than 2 MDTs or when the $SINGLEMDS backend is not zfs.
5160 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5161 [ $(facet_fstype $SINGLEMDS) != zfs ] &&
5162 skip "Only valid for ZFS backend" && return
5166 #define OBD_FAIL_LFSCK_NO_AGENTOBJ 0x1630
# The fault is active only while the remote directory is being created.
5167 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1630
5168 $LFS mkdir -i 1 $DIR/$tdir/dummy ||
5169 error "(1) Fail to create $DIR/$tdir/dummy"
5171 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# First pass: started without -A, and only $SINGLEMDS is polled.
5172 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
5173 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5174 mdd.${MDT_DEV}.lfsck_namespace |
5175 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5177 error "(3) unexpected status"
5180 local repaired=$($SHOW_NAMESPACE |
5181 awk '/^dirent_repaired/ { print $2 }')
5182 [ $repaired -eq 1 ] ||
5183 error "(4) Fail to repair the lost agent object: $repaired"
# Second pass: nothing should be left to repair.
5185 $START_NAMESPACE -r || error "(5) Fail to start LFSCK for namespace!"
5186 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
5187 mdd.${MDT_DEV}.lfsck_namespace |
5188 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
5190 error "(6) unexpected status"
5193 repaired=$($SHOW_NAMESPACE | awk '/^dirent_repaired/ { print $2 }')
5194 [ $repaired -eq 0 ] ||
5195 error "(7) Unexpected repairing: $repaired"
5197 run_test 34 "LFSCK can rebuild the lost agent object"
# Test 35: namespace LFSCK rebuilds a lost agent entry.
# Creates a directory on MDT index 1 while fail_loc 0x1631 is set on mds2,
# runs namespace LFSCK and expects agent_entries_repaired == 1 on mds2. After
# setupall, a second LFSCK run must report agent_entries_repaired == 0.
# Skipped when fewer than 2 MDTs are configured.
5201 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
5205 #define OBD_FAIL_LFSCK_NO_AGENTENT 0x1631
# The fault is active only while the remote directory is being created.
5206 do_facet mds2 $LCTL set_param fail_loc=0x1631
5207 $LFS mkdir -i 1 $DIR/$tdir/dummy ||
5208 error "(1) Fail to create $DIR/$tdir/dummy"
5211 do_facet mds2 $LCTL set_param fail_loc=0
5212 $START_NAMESPACE -A -r || error "(2) Fail to start LFSCK for namespace!"
# Only mds2 is polled here, so the failure message names MDS2 explicitly;
# no loop variable 'k' is assigned anywhere in this test.
5213 wait_update_facet mds2 "$LCTL get_param -n \
5214 mdd.$(facet_svc mds2).lfsck_namespace |
5215 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
5216 error "(3) MDS2 is not the expected 'completed'"
5218 local repaired=$(do_facet mds2 $LCTL get_param -n \
5219 mdd.$(facet_svc mds2).lfsck_namespace |
5220 awk '/^agent_entries_repaired/ { print $2 }')
5221 [ $repaired -eq 1 ] ||
5222 error "(4) Fail to repair the lost agent entry: $repaired"
5224 echo "stopall to cleanup object cache"
5227 setupall > /dev/null
# Second pass: nothing should be left to repair.
5229 $START_NAMESPACE -A -r || error "(5) Fail to start LFSCK for namespace!"
5230 wait_update_facet mds2 "$LCTL get_param -n \
5231 mdd.$(facet_svc mds2).lfsck_namespace |
5232 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
5233 error "(6) MDS2 is not the expected 'completed'"
5235 repaired=$(do_facet mds2 $LCTL get_param -n \
5236 mdd.$(facet_svc mds2).lfsck_namespace |
5237 awk '/^agent_entries_repaired/ { print $2 }')
5238 [ $repaired -eq 0 ] ||
5239 error "(7) Unexpected repairing: $repaired"
5241 run_test 35 "LFSCK can rebuild the lost agent entry"
# Test 36a: layout LFSCK restores a mirror that was split off a file.
# Creates three 3-mirror files (f0..f2), writes and resyncs them, then splits
# a different mirror off each file while fail_loc 0x1616 is set on mds1.
# Layout LFSCK (-r -o) must complete on every MDT and OST, report
# repaired_orphan == 9 on mds1, and leave every file with 3 mirrors again,
# including the mirror id that was split off.
# Skipped when fewer than 3 OSTs are configured.
5244 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5247 echo "The target MDT-object's LOV EA corrupted as to lose one of the "
5248 echo "mirrors information. The layout LFSCK should rebuild the LOV EA "
5249 echo "with the PFID EA of related OST-object(s) belong to the mirror."
5252 check_mount_and_prep
# Each file gets three mirrors (-N), each mirror made of two components with
# explicitly chosen OST indices (-o).
5254 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5255 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f0 ||
5256 error "(0) Fail to create mirror file $DIR/$tdir/f0"
5257 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5258 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f1 ||
5259 error "(1) Fail to create mirror file $DIR/$tdir/f1"
5260 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5261 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f2 ||
5262 error "(2) Fail to create mirror file $DIR/$tdir/f2"
# Write 4 MiB to each file, then resync its mirrors.
5264 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5265 error "(3) Fail to write $DIR/$tdir/f0"
5266 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=4 ||
5267 error "(4) Fail to write $DIR/$tdir/f1"
5268 dd if=/dev/zero of=$DIR/$tdir/f2 bs=1M count=4 ||
5269 error "(5) Fail to write $DIR/$tdir/f2"
5271 $LFS mirror resync $DIR/$tdir/f0 ||
5272 error "(6) Fail to resync $DIR/$tdir/f0"
5273 $LFS mirror resync $DIR/$tdir/f1 ||
5274 error "(7) Fail to resync $DIR/$tdir/f1"
5275 $LFS mirror resync $DIR/$tdir/f2 ||
5276 error "(8) Fail to resync $DIR/$tdir/f2"
5278 cancel_lru_locks mdc
5279 cancel_lru_locks osc
# Print the layouts before the corruption, for the test log.
5281 $LFS getstripe $DIR/$tdir/f0 ||
5282 error "(9) Fail to getstripe for $DIR/$tdir/f0"
5283 $LFS getstripe $DIR/$tdir/f1 ||
5284 error "(10) Fail to getstripe for $DIR/$tdir/f1"
5285 $LFS getstripe $DIR/$tdir/f2 ||
5286 error "(11) Fail to getstripe for $DIR/$tdir/f2"
5288 echo "Inject failure, to simulate the case of missing one mirror in LOV"
5289 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
# The fault stays set on mds1 for the three splits below.
5290 do_facet mds1 $LCTL set_param fail_loc=0x1616
5292 $LFS mirror split --mirror-id 1 -d $DIR/$tdir/f0 ||
5293 error "(12) Fail to split 1st mirror from $DIR/$tdir/f0"
5294 $LFS mirror split --mirror-id 2 -d $DIR/$tdir/f1 ||
5295 error "(13) Fail to split 2nd mirror from $DIR/$tdir/f1"
5296 $LFS mirror split --mirror-id 3 -d $DIR/$tdir/f2 ||
5297 error "(14) Fail to split 3rd mirror from $DIR/$tdir/f2"
5301 do_facet mds1 $LCTL set_param fail_loc=0
# Negative checks: error fires only if grep still finds the split mirror id.
5303 $LFS getstripe $DIR/$tdir/f0 | grep "lcme_mirror_id:.*1" &&
5304 error "(15) The 1st of mirror is not destroyed"
5305 $LFS getstripe $DIR/$tdir/f1 | grep "lcme_mirror_id:.*2" &&
5306 error "(16) The 2nd of mirror is not destroyed"
5307 $LFS getstripe $DIR/$tdir/f2 | grep "lcme_mirror_id:.*3" &&
5308 error "(17) The 3rd of mirror is not destroyed"
# NOTE(review): no 'local mirrors' declaration is visible in this view --
# confirm the variable does not leak into the caller's scope.
5312 mirrors=$($LFS getstripe -N $DIR/$tdir/f0)
5313 [ $mirrors -eq 2 ] || error "(18) $DIR/$tdir/f0 has $mirrors mirrors"
5314 mirrors=$($LFS getstripe -N $DIR/$tdir/f1)
5315 [ $mirrors -eq 2 ] || error "(19) $DIR/$tdir/f1 has $mirrors mirrors"
5316 mirrors=$($LFS getstripe -N $DIR/$tdir/f2)
5317 [ $mirrors -eq 2 ] || error "(20) $DIR/$tdir/f2 has $mirrors mirrors"
5319 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5320 $START_LAYOUT -r -o || error "(21) Fail to start LFSCK for layout!"
5322 for k in $(seq $MDSCOUNT); do
5323 # The LFSCK status query internal is 30 seconds. For the case
5324 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5325 # time to guarantee the status sync up.
5326 wait_update_facet mds${k} "$LCTL get_param -n \
5327 mdd.$(facet_svc mds${k}).lfsck_layout |
5328 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5329 error "(22) MDS${k} is not the expected 'completed'"
# OSTs are sampled once each (no polling), after all MDTs have completed.
5332 for k in $(seq $OSTCOUNT); do
5333 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5334 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5335 awk '/^status/ { print $2 }')
5336 [ "$cur_status" == "completed" ] ||
5337 error "(23) OST${k} Expect 'completed', but got '$cur_status'"
# 9 repaired orphans are expected on mds1 (presumably 3 OST-objects for each
# of the 3 split-off mirrors -- TODO confirm).
5340 local repaired=$(do_facet mds1 $LCTL get_param -n \
5341 mdd.$(facet_svc mds1).lfsck_layout |
5342 awk '/^repaired_orphan/ { print $2 }')
5343 [ $repaired -eq 9 ] ||
5344 error "(24) Expect 9 fixed on mds1, but got: $repaired"
5346 mirrors=$($LFS getstripe -N $DIR/$tdir/f0)
5347 [ $mirrors -eq 3 ] || error "(25) $DIR/$tdir/f0 has $mirrors mirrors"
5348 mirrors=$($LFS getstripe -N $DIR/$tdir/f1)
5349 [ $mirrors -eq 3 ] || error "(26) $DIR/$tdir/f1 has $mirrors mirrors"
5350 mirrors=$($LFS getstripe -N $DIR/$tdir/f2)
5351 [ $mirrors -eq 3 ] || error "(27) $DIR/$tdir/f2 has $mirrors mirrors"
# On failure the full layout is printed before error is raised.
5353 $LFS getstripe $DIR/$tdir/f0 | grep "lcme_mirror_id:.*1" || {
5354 $LFS getstripe $DIR/$tdir/f0
5355 error "(28) The 1st of mirror is not recovered"
5358 $LFS getstripe $DIR/$tdir/f1 | grep "lcme_mirror_id:.*2" || {
5359 $LFS getstripe $DIR/$tdir/f1
5360 error "(29) The 2nd of mirror is not recovered"
5363 $LFS getstripe $DIR/$tdir/f2 | grep "lcme_mirror_id:.*3" || {
5364 $LFS getstripe $DIR/$tdir/f2
5365 error "(30) The 3rd of mirror is not recovered"
5368 run_test 36a "rebuild LOV EA for mirrored file (1)"
# Test 36b: layout LFSCK rebuilds a mirrored file whose MDT-object is lost.
# Creates a 3-mirror file, records its FID, writes and resyncs it, then
# removes it while fail_loc 0x1616 is set on mds1. Layout LFSCK (-r -o) must
# complete on every MDT and OST, report repaired_orphan == 9 on mds1, and
# produce .lustre/lost+found/MDT0000/<fid>-R-0 with 3 mirrors and 6
# components. Skipped when $FILESET is set or fewer than 3 OSTs exist.
5371 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
5372 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5375 echo "The mirrored file lost its MDT-object, but relatd OST-objects "
5376 echo "are still there. The layout LFSCK should rebuild the LOV EA "
5377 echo "with the PFID EA of related OST-object(s) belong to the file. "
5380 check_mount_and_prep
5382 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5383 -N -E 3M -o 2,0 -E -1 -o 1 $DIR/$tdir/f0 ||
5384 error "(0) Fail to create mirror file $DIR/$tdir/f0"
# The FID is captured now because it is needed to build the lost+found name
# after the file has been removed.
5386 local fid=$($LFS path2fid $DIR/$tdir/f0)
5388 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5389 error "(1) Fail to write $DIR/$tdir/f0"
5390 $LFS mirror resync $DIR/$tdir/f0 ||
5391 error "(2) Fail to resync $DIR/$tdir/f0"
5393 cancel_lru_locks mdc
5394 cancel_lru_locks osc
5396 $LFS getstripe $DIR/$tdir/f0 ||
5397 error "(3) Fail to getstripe for $DIR/$tdir/f0"
5399 echo "Inject failure, to simulate the case of missing the MDT-object"
5400 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
# The fault is active only while the file is being removed.
5401 do_facet mds1 $LCTL set_param fail_loc=0x1616
5402 rm -f $DIR/$tdir/f0 || error "(4) Fail to remove $DIR/$tdir/f0"
5406 do_facet mds1 $LCTL set_param fail_loc=0
5408 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5409 $START_LAYOUT -r -o || error "(5) Fail to start LFSCK for layout!"
5411 for k in $(seq $MDSCOUNT); do
5412 # The LFSCK status query internal is 30 seconds. For the case
5413 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5414 # time to guarantee the status sync up.
5415 wait_update_facet mds${k} "$LCTL get_param -n \
5416 mdd.$(facet_svc mds${k}).lfsck_layout |
5417 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5418 error "(6) MDS${k} is not the expected 'completed'"
# OSTs are sampled once each (no polling), after all MDTs have completed.
5421 for k in $(seq $OSTCOUNT); do
5422 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5423 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5424 awk '/^status/ { print $2 }')
5425 [ "$cur_status" == "completed" ] ||
5426 error "(7) OST${k} Expect 'completed', but got '$cur_status'"
5429 local count=$(do_facet mds1 $LCTL get_param -n \
5430 mdd.$(facet_svc mds1).lfsck_layout |
5431 awk '/^repaired_orphan/ { print $2 }')
5432 [ $count -eq 9 ] || error "(8) Expect 9 fixed on mds1, but got: $count"
# 'count' is reused below for the mirror count and the component count of
# the recovered object; the messages still name the original path.
5434 local name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
5435 count=$($LFS getstripe --mirror-count $name)
5436 [ $count -eq 3 ] || error "(9) $DIR/$tdir/f0 has $count mirrors"
5438 count=$($LFS getstripe --component-count $name)
5439 [ $count -eq 6 ] || error "(10) $DIR/$tdir/f0 has $count components"
# On failure the full layout is printed before error is raised.
5441 $LFS getstripe $name | grep "lcme_mirror_id:.*1" || {
5442 $LFS getstripe $name
5443 error "(11) The 1st of mirror is not recovered"
5446 $LFS getstripe $name | grep "lcme_mirror_id:.*2" || {
5447 $LFS getstripe $name
5448 error "(12) The 2nd of mirror is not recovered"
5451 $LFS getstripe $name | grep "lcme_mirror_id:.*3" || {
5452 $LFS getstripe $name
5453 error "(13) The 3rd of mirror is not recovered"
5456 run_test 36b "rebuild LOV EA for mirrored file (2)"
# Test 36c: layout LFSCK rebuilds a modified, not-yet-resynced mirrored file
# whose MDT-object is lost. The file is written and resynced, written again,
# its lcme_flags are saved from the head and the tail of the getstripe
# output, and it is then removed while fail_loc 0x1616 is set on mds1.
# Layout LFSCK (-r -o) must complete on every MDT and OST, report
# repaired_orphan == 6 on mds1, and produce
# .lustre/lost+found/MDT0000/<fid>-R-0 with 2 mirrors, 4 components and the
# saved flags. Skipped when $FILESET is set or fewer than 3 OSTs exist.
5459 [ -n "$FILESET" ] && skip "Not functional for FILESET set"
5460 [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
5463 echo "The mirrored file has been modified, not resynced yet, then "
5464 echo "lost its MDT-object, but relatd OST-objects are still there. "
5465 echo "The layout LFSCK should rebuild the LOV EA and relatd status "
5466 echo "with the PFID EA of related OST-object(s) belong to the file. "
5469 check_mount_and_prep
5471 $LFS setstripe -N -E 1M -o 0,1 -E -1 -o 2 -N -E 2M -o 1,2 -E -1 -o 0 \
5473 error "(0) Fail to create mirror file $DIR/$tdir/f0"
# The FID is captured now because it is needed to build the lost+found name
# after the file has been removed.
5475 local fid=$($LFS path2fid $DIR/$tdir/f0)
5477 # The 1st dd && resync makes all related OST-objects have been written
5478 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5479 error "(1.1) Fail to write $DIR/$tdir/f0"
5480 $LFS mirror resync $DIR/$tdir/f0 ||
5481 error "(1.2) Fail to resync $DIR/$tdir/f0"
5482 # The 2nd dd makes one mirror to be stale
5483 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=4 ||
5484 error "(1.3) Fail to write $DIR/$tdir/f0"
5486 cancel_lru_locks mdc
5487 cancel_lru_locks osc
5489 $LFS getstripe $DIR/$tdir/f0 ||
5490 error "(2) Fail to getstripe for $DIR/$tdir/f0"
# Save lcme_flags from the first and the last 10 lines of the layout dump so
# they can be compared with the rebuilt layout later.
5492 local saved_flags1=$($LFS getstripe $DIR/$tdir/f0 | head -n 10 |
5493 awk '/lcme_flags/ { print $2 }')
5494 local saved_flags2=$($LFS getstripe $DIR/$tdir/f0 | tail -n 10 |
5495 awk '/lcme_flags/ { print $2 }')
5497 echo "Inject failure, to simulate the case of missing the MDT-object"
5498 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
# The fault is active only while the file is being removed.
5499 do_facet mds1 $LCTL set_param fail_loc=0x1616
5500 rm -f $DIR/$tdir/f0 || error "(3) Fail to remove $DIR/$tdir/f0"
5504 do_facet mds1 $LCTL set_param fail_loc=0
5506 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
5507 $START_LAYOUT -r -o || error "(4) Fail to start LFSCK for layout!"
5509 for k in $(seq $MDSCOUNT); do
5510 # The LFSCK status query internal is 30 seconds. For the case
5511 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
5512 # time to guarantee the status sync up.
5513 wait_update_facet mds${k} "$LCTL get_param -n \
5514 mdd.$(facet_svc mds${k}).lfsck_layout |
5515 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
5516 error "(5) MDS${k} is not the expected 'completed'"
# OSTs are sampled once each (no polling), after all MDTs have completed.
5519 for k in $(seq $OSTCOUNT); do
5520 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
5521 obdfilter.$(facet_svc ost${k}).lfsck_layout |
5522 awk '/^status/ { print $2 }')
5523 [ "$cur_status" == "completed" ] ||
5524 error "(6) OST${k} Expect 'completed', but got '$cur_status'"
# The check is for 6 repaired orphans; the failure message states the same
# number so that a failing log is not misleading.
5527 local count=$(do_facet mds1 $LCTL get_param -n \
5528 mdd.$(facet_svc mds1).lfsck_layout |
5529 awk '/^repaired_orphan/ { print $2 }')
5530 [ $count -eq 6 ] || error "(7) Expect 6 fixed on mds1, but got: $count"
# 'count' is reused below for the mirror and component counts of the
# recovered object; the messages still name the original path.
5532 local name=$MOUNT/.lustre/lost+found/MDT0000/${fid}-R-0
5533 count=$($LFS getstripe --mirror-count $name)
5534 [ $count -eq 2 ] || error "(8) $DIR/$tdir/f0 has $count mirrors"
5536 count=$($LFS getstripe --component-count $name)
5537 [ $count -eq 4 ] || error "(9) $DIR/$tdir/f0 has $count components"
# Flags at the head and tail of the rebuilt layout must match the saved ones;
# on mismatch the full layout is printed before error is raised.
5539 local flags=$($LFS getstripe $name | head -n 10 |
5540 awk '/lcme_flags/ { print $2 }')
5541 [ "$flags" == "$saved_flags1" ] || {
5542 $LFS getstripe $name
5543 error "(10) expect flags $saved_flags1, got $flags"
5546 flags=$($LFS getstripe $name | tail -n 10 |
5547 awk '/lcme_flags/ { print $2 }')
5548 [ "$flags" == "$saved_flags2" ] || {
5549 $LFS getstripe $name
5550 error "(11) expect flags $saved_flags2, got $flags"
5553 run_test 36c "rebuild LOV EA for mirrored file (3)"
# Test 37: namespace LFSCK must cope with an orphan directory.
# Creates directory d0 on MDT index 0, runs multiop_bg_pause on it with the
# D_c command string, then starts namespace LFSCK (-r -A) and waits for it to
# complete.
5559 local t_dir="$DIR/$tdir/d0"
5560 check_mount_and_prep
5562 $LFS mkdir -i 0 $t_dir || error "(2) Fail to mkdir $t_dir on MDT0"
5563 multiop_bg_pause $t_dir D_c || { error "multiop failed: $?"; return 1; }
# NOTE(review): the assignment of $PID is not visible in this view
# (presumably the background multiop PID -- TODO confirm). Also confirm that
# the kill after error is reachable, i.e. that error returns.
5567 $START_NAMESPACE -r -A || {
5568 error "(3) Fail to start LFSCK for namespace!"; kill -USR1 $PID; }
5570 wait_all_targets_blocked namespace completed 4
5575 run_test 37 "LFSCK must skip a ORPHAN"
# Test 38: LFSCK and foreign files do not break each other.
# Creates a foreign file (type daos, flags 0xda05, value "<uuid1>@<uuid2>"),
# verifies its foreign LOV EA fields via lfs getstripe, then runs namespace
# and layout LFSCK and expects the global namespace_repaired and
# layout_repaired counters to be 0. Afterwards the same EA checks are
# repeated, restriping / reading / writing the file must fail, and the file
# must be removable. Skipped when $MDS1_VERSION is not newer than 2.12.51.
5579 [[ $MDS1_VERSION -le $(version_code 2.12.51) ]] &&
5580 skip "Need MDS version newer than 2.12.51"
5582 test_mkdir $DIR/$tdir
# Two random UUIDs form the opaque foreign value.
5583 local uuid1=$(cat /proc/sys/kernel/random/uuid)
5584 local uuid2=$(cat /proc/sys/kernel/random/uuid)
5586 # create foreign file
5587 $LFS setstripe --foreign=daos --flags 0xda05 \
5588 -x "${uuid1}@${uuid2}" $DIR/$tdir/$tfile ||
5589 error "$DIR/$tdir/$tfile: create failed"
5591 $LFS getstripe -v $DIR/$tdir/$tfile |
5592 grep "lfm_magic:.*0x0BD70BD0" ||
5593 error "$DIR/$tdir/$tfile: invalid LOV EA foreign magic"
5594 # lfm_length is LOV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
5595 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_length:.*73" ||
5596 error "$DIR/$tdir/$tfile: invalid LOV EA foreign size"
5597 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_type:.*daos" ||
5598 error "$DIR/$tdir/$tfile: invalid LOV EA foreign type"
5599 $LFS getstripe -v $DIR/$tdir/$tfile |
5600 grep "lfm_flags:.*0x0000DA05" ||
5601 error "$DIR/$tdir/$tfile: invalid LOV EA foreign flags"
5602 $LFS getstripe $DIR/$tdir/$tfile |
5603 grep "lfm_value:.*${uuid1}@${uuid2}" ||
5604 error "$DIR/$tdir/$tfile: invalid LOV EA foreign value"
5606 # modify striping should fail
5607 $LFS setstripe -c 2 $DIR/$tdir/$tfile &&
5608 error "$DIR/$tdir/$tfile: setstripe should fail"
5610 $START_NAMESPACE -r -A || error "Fail to start LFSCK for namespace"
5612 wait_all_targets_blocked namespace completed 1
5614 # check that "global" namespace_repaired == 0 !!!
5615 local repaired=$(do_facet mds1 \
5616 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
5617 awk '/^namespace_repaired/ { print \\\$2 }'")
5618 [ $repaired -eq 0 ] ||
5619 error "(2) Expect no namespace repair, but got: $repaired"
5621 $START_LAYOUT -A -r || error "Fail to start LFSCK for layout"
5623 wait_all_targets_blocked layout completed 2
5625 # check that "global" layout_repaired == 0 !!!
5626 local repaired=$(do_facet mds1 \
5627 "$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
5628 awk '/^layout_repaired/ { print \\\$2 }'")
5629 [ $repaired -eq 0 ] ||
5630 error "(2) Expect no layout repair, but got: $repaired"
5632 echo "post-lfsck checks of foreign file"
5634 $LFS getstripe -v $DIR/$tdir/$tfile |
5635 grep "lfm_magic:.*0x0BD70BD0" ||
5636 error "$DIR/$tdir/$tfile: invalid LOV EA foreign magic"
5637 # lfm_length is LOV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
5638 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_length:.*73" ||
5639 error "$DIR/$tdir/$tfile: invalid LOV EA foreign size"
5640 $LFS getstripe -v $DIR/$tdir/$tfile | grep "lfm_type:.*daos" ||
5641 error "$DIR/$tdir/$tfile: invalid LOV EA foreign type"
5642 $LFS getstripe -v $DIR/$tdir/$tfile |
5643 grep "lfm_flags:.*0x0000DA05" ||
5644 error "$DIR/$tdir/$tfile: invalid LOV EA foreign flags"
5645 $LFS getstripe $DIR/$tdir/$tfile |
5646 grep "lfm_value:.*${uuid1}@${uuid2}" ||
5647 error "$DIR/$tdir/$tfile: invalid LOV EA foreign value"
5649 # modify striping should fail
5650 $LFS setstripe -c 2 $DIR/$tdir/$tfile &&
5651 error "$DIR/$tdir/$tfile: setstripe should fail"
# Reading and writing a foreign file must both fail. The read check has to
# call error explicitly: without it the message string itself would be run
# as a command ("command not found") and a successful read would go
# unreported.
5654 cat $DIR/$tdir/$tfile && error "$DIR/$tdir/$tfile: read should fail"
5655 cat /etc/passwd > $DIR/$tdir/$tfile &&
5656 error "$DIR/$tdir/$tfile: write should fail"
5658 #remove foreign file
5659 rm $DIR/$tdir/$tfile ||
5660 error "$DIR/$tdir/$tfile: remove of foreign file has failed"
5662 run_test 38 "LFSCK does not break foreign file and reverse is also true"
# Body of test 39: a directory carrying a foreign LMV EA must come through
# namespace and layout LFSCK unchanged, and LFSCK must report zero repairs.
# The same set of EA/access checks is run before and after the scans.
[[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.12.51) ]] &&
	skip "Need MDS version newer than 2.12.51"

test_mkdir $DIR/$tdir
local uuid1=$(cat /proc/sys/kernel/random/uuid)
local uuid2=$(cat /proc/sys/kernel/random/uuid)
# the foreign directory every check below operates on
local fdir=$DIR/$tdir/${tdir}2

# create foreign dir
$LFS mkdir --foreign=daos --xattr="${uuid1}@${uuid2}" --flags=0xda05 \
	$fdir ||
	error "$fdir: create failed"

echo "pre-lfsck checks of foreign dir"

$LFS getdirstripe -v $fdir |
	grep "lfm_magic:.*0x0CD50CD0" ||
	error "$fdir: invalid LMV EA magic"
# lfm_length is LMV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
# - sizeof(lfm_type) - sizeof(lfm_flags); presumably 36 + 1 + 36 here,
# i.e. the two UUIDs joined by '@'
$LFS getdirstripe -v $fdir | grep "lfm_length:.*73" ||
	error "$fdir: invalid LMV EA size"
$LFS getdirstripe -v $fdir | grep "lfm_type:.*daos" ||
	error "$fdir: invalid LMV EA type"
$LFS getdirstripe -v $fdir |
	grep "lfm_flags:.*0x0000DA05" ||
	error "$fdir: invalid LMV EA flags"
$LFS getdirstripe $fdir |
	grep "lfm_value.*${uuid1}@${uuid2}" ||
	error "$fdir: invalid LMV EA value"

# file create in dir should fail; without the explicit error call an
# unexpected success would only try to execute the message as a command
touch $fdir/$tfile &&
	error "$fdir: file create should fail"

# chmod should work
chmod 777 $fdir ||
	error "$fdir: chmod failed"

# chown should work
chown $RUNAS_ID:$RUNAS_GID $fdir ||
	error "$fdir: chown failed"

$START_NAMESPACE -r -A || error "Fail to start LFSCK for namespace"

wait_all_targets_blocked namespace completed 1

# check that "global" namespace_repaired == 0 !!!
local ns_repaired=$(do_facet mds1 \
	"$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
	 awk '/^namespace_repaired/ { print \\\$2 }'")
# keep the plain [ ] test: an empty result must fail the comparison
[ "$ns_repaired" -eq 0 ] ||
	error "(2) Expect nothing to be repaired, but got: $ns_repaired"

$START_LAYOUT -A -r || error "Fail to start LFSCK for layout"

wait_all_targets_blocked layout completed 2

# check that "global" layout_repaired == 0 !!!
local layout_repaired=$(do_facet mds1 \
	"$LCTL lfsck_query -t all -M ${FSNAME}-MDT0000 |
	 awk '/^layout_repaired/ { print \\\$2 }'")
[ "$layout_repaired" -eq 0 ] ||
	error "(2) Expect no layout repair, but got: $layout_repaired"

echo "post-lfsck checks of foreign dir"

$LFS getdirstripe -v $fdir |
	grep "lfm_magic:.*0x0CD50CD0" ||
	error "$fdir: invalid LMV EA magic"
# lfm_length is LMV EA size - sizeof(lfm_magic) - sizeof(lfm_length)
# - sizeof(lfm_type) - sizeof(lfm_flags)
$LFS getdirstripe -v $fdir | grep "lfm_length:.*73" ||
	error "$fdir: invalid LMV EA size"
$LFS getdirstripe -v $fdir | grep "lfm_type:.*daos" ||
	error "$fdir: invalid LMV EA type"
$LFS getdirstripe -v $fdir |
	grep "lfm_flags:.*0x0000DA05" ||
	error "$fdir: invalid LMV EA flags"
$LFS getdirstripe $fdir |
	grep "lfm_value.*${uuid1}@${uuid2}" ||
	error "$fdir: invalid LMV EA value"

# file create in dir should still fail after LFSCK
touch $fdir/$tfile &&
	error "$fdir: file create should fail"

# chmod should still work
chmod 777 $fdir ||
	error "$fdir: chmod failed"

# chown should still work
chown $RUNAS_ID:$RUNAS_GID $fdir ||
	error "$fdir: chown failed"

# remove foreign dir
rmdir $fdir ||
	error "$fdir: remove of foreign dir has failed"
run_test 39 "LFSCK does not break foreign dir and reverse is also true"

# Put back the target count and sizes that were saved at the top of this
# script before it shrank them to speed up formatting.
OSTCOUNT="$SAVED_OSTCOUNT"
OSTSIZE="$SAVED_OSTSIZE"
MDSSIZE="$SAVED_MDSSIZE"

# With REFORMAT set for this single call, let the framework clean up and set
# Lustre up again (presumably so the restored values take effect), then run
# the framework's final check-and-cleanup step.
REFORMAT=yes cleanup_and_setup_lustre
check_and_cleanup_lustre