3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
11 # Bug numbers for the tests excluded through ALWAYS_EXCEPT.
12 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT"
14 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
15 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
# LUSTRE defaults to the parent of this script's directory.  The test
# framework and the configuration file (CONFIG, defaulting to the cfg script
# named after $NAME) are sourced from that tree.
17 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
18 . $LUSTRE/tests/test-framework.sh
20 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
# Leave with status 0 when require_dsh_mds fails.
23 require_dsh_mds || exit 0
# Skip when check_versions fails, and when the MDS is older than 2.3.60.
27 if ! check_versions; then
28 skip "It is NOT necessary to test lfsck under interoperation mode"
32 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.3.60) ]] &&
33 skip "Need MDS version at least 2.3.60" && exit 0
# Keep the incoming values in SAVED_* (OSTCOUNT is capped below).
37 SAVED_MDSSIZE=${MDSSIZE}
38 SAVED_OSTSIZE=${OSTSIZE}
39 SAVED_OSTCOUNT=${OSTCOUNT}
40 # Use small MDS and OST sizes to speed up formatting.
41 # Do not make MDSSIZE/OSTSIZE too small: that affects the default journal size.
44 # There is no need for many OSTs; cap the count to reduce the format/start/stop overhead.
45 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
47 # Build up a clean test environment.
# Add tests to ALWAYS_EXCEPT depending on the MDS/OST version codes.
51 [[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.4.90) ]] &&
52 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2c"
54 [[ $(lustre_version_code ost1) -lt $(version_code 2.5.55) ]] &&
55 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 11 12 13 14 15 16 17 18 19 20 21"
57 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.50) ]] &&
58 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2d 2e 3 22 23 24 25 26 27 28 29 30 31"
60 # DNE does not support striped directories on a zfs-based backend yet.
61 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
62 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 31"
# Target names, plus command strings that the tests expand unquoted
# (e.g. "$START_NAMESPACE -r") to start, stop and inspect LFSCK on the
# first MDT / first OST.
66 MDT_DEV="${FSNAME}-MDT0000"
67 OST_DEV="${FSNAME}-OST0000"
68 MDT_DEVNAME=$(mdsdevname ${SINGLEMDS//mds/})
69 START_NAMESPACE="do_facet $SINGLEMDS \
70 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
71 START_LAYOUT="do_facet $SINGLEMDS \
72 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
73 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
74 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
75 SHOW_NAMESPACE="do_facet $SINGLEMDS \
76 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
77 SHOW_LAYOUT="do_facet $SINGLEMDS \
78 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
79 SHOW_LAYOUT_ON_OST="do_facet ost1 \
80 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# Mount option sets passed to "start" when (re)mounting the MDT.
81 MOUNT_OPTS_SCRUB="-o user_xattr"
82 MOUNT_OPTS_NOSCRUB="-o user_xattr,noscrub"
83 MOUNT_OPTS_SKIP_LFSCK="-o user_xattr,skip_lfsck"
# Populate $DIR/$tdir for the LFSCK tests: copy the test scripts in, then use
# createmany to make $ndirs "d<i>" directories and "f<i>" entries, $nfiles
# entries under each d<i>, and $ndirs "e<i>" directories.  When $igif is
# non-empty, fail_loc=0x1504 is set on $SINGLEMDS first, and a "dummy" file is
# touched before the fail_loc is cleared again.
# NOTE(review): ndirs, nfiles and igif are assigned on lines not visible here,
# and the exact nesting of the if/for bodies should be confirmed there.
92 echo "preparing... $nfiles * $ndirs files will be created $(date)."
93 if [ ! -z $igif ]; then
94 #define OBD_FAIL_FID_IGIF 0x1504
95 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
98 cp $LUSTRE/tests/*.sh $DIR/$tdir/
99 if [ $ndirs -gt 0 ]; then
100 createmany -d $DIR/$tdir/d $ndirs
101 createmany -m $DIR/$tdir/f $ndirs
102 if [ $nfiles -gt 0 ]; then
103 for ((i = 0; i < $ndirs; i++)); do
104 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
105 /dev/null || error "createmany $nfiles"
108 createmany -d $DIR/$tdir/e $ndirs
111 if [ ! -z $igif ]; then
112 touch $DIR/$tdir/dummy
113 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
116 echo "prepared $(date)."
# Check MDT0's backing device with run_e2fsck.
# Returns immediately unless the $SINGLEMDS backend is ldiskfs.  Otherwise
# stops $SINGLEMDS, calls run_e2fsck with "-n" on the device reported by
# "mdsdevname 1" (presumably e2fsck's no-change mode -- confirm in
# run_e2fsck), and restarts the MDT with $MOUNT_OPTS_NOSCRUB.
# Raises error (0)/(2)/(3) on stop failure, detected inconsistency, or
# start failure respectively.
119 run_e2fsck_on_mdt0() {
120 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] && return
122 stop $SINGLEMDS > /dev/null || error "(0) Fail to the stop MDT0"
123 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n" |
125 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n"
126 error "(2) Detected inconsistency on MDT0"
128 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
129 error "(3) Fail to start MDT0"
# Run "lfsck_query -t $com ... -w" on mds1 against ${FSNAME}-MDT0000, take
# the second field of the "${com}_mdts_${status}" line, and raise error
# "($err)" (after dumping the plain query output) unless that count equals
# $MDSCOUNT.
# NOTE(review): com, status and err are assigned on lines not visible here;
# "-w" presumably makes the query wait for LFSCK to finish -- confirm.
132 wait_all_targets_blocked() {
137 local count=$(do_facet mds1 \
138 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 -w |
139 awk '/^${com}_mdts_${status}/ { print \\\$2 }'")
140 [[ $count -eq $MDSCOUNT ]] || {
141 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
142 error "($err) only $count of $MDSCOUNT MDTs are in ${status}"
# Polling variant: wait_update_facet repeats the lfsck_query on mds1 until
# the "${com}_mdts_${status}" count equals $MDSCOUNT, bounded by $LTIME.
# On failure the query output is dumped and error "($err)" is raised.
# NOTE(review): com, status, err and LTIME are set on lines not visible here.
151 wait_update_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 |
152 awk '/^${com}_mdts_${status}/ { print \\\$2 }'" \
153 "$MDSCOUNT" $LTIME || {
154 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
155 error "($err) some MDTs are not in ${status}"
# Steps for the test labelled 0 by the run_test line at the end of this block:
# start, stop and restart the namespace LFSCK by hand, checking the reported
# status after each step; then verify that nothing was repaired and that
# success_count grows by one after another complete run.
162 #define OBD_FAIL_LFSCK_DELAY1 0x1600
163 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
164 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
166 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
168 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
169 [ "$STATUS" == "scanning-phase1" ] ||
170 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
# Stop while scanning and confirm the status changes to "stopped".
172 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
174 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
175 [ "$STATUS" == "stopped" ] ||
176 error "(6) Expect 'stopped', but got '$STATUS'"
# Start again (without -r this time) and expect scanning to continue.
178 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
180 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
181 [ "$STATUS" == "scanning-phase1" ] ||
182 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the injected delay and wait for the status to read "completed".
184 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
185 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
186 mdd.${MDT_DEV}.lfsck_namespace |
187 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
189 error "(9) unexpected status"
192 local repaired=$($SHOW_NAMESPACE |
193 awk '/^updated_phase1/ { print $2 }')
194 [ $repaired -eq 0 ] ||
195 error "(10) Expect nothing to be repaired, but got: $repaired"
# Record success_count, run once more with -r, and expect it to be one higher.
197 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
198 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
199 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
200 mdd.${MDT_DEV}.lfsck_namespace |
201 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
203 error "(12) unexpected status"
206 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
207 [ $((scanned1 + 1)) -eq $scanned2 ] ||
208 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
210 echo "stopall, should NOT crash LU-3649"
211 stopall || error "(14) Fail to stopall"
213 run_test 0 "Control LFSCK manually"
# Steps for the test labelled 1a below: skipped unless the MDT backend is
# ldiskfs.  A "dummy" file is created while fail_loc=0x1501 is set, the
# namespace LFSCK is run to completion, exactly one repair is expected, and
# the directory is then listed from a client with fail_loc=0x1505 set.
216 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
217 skip "OI Scrub not implemented for ZFS" && return
221 #define OBD_FAIL_FID_INDIR 0x1501
222 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
223 touch $DIR/$tdir/dummy
225 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
227 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
228 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
229 mdd.${MDT_DEV}.lfsck_namespace |
230 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
232 error "(4) unexpected status"
# Prefer the dirent_repaired counter; fall back to updated_phase1 when the
# server does not report it.
235 local repaired=$($SHOW_NAMESPACE |
236 awk '/^dirent_repaired/ { print $2 }')
237 # for interop with old server
238 [ -z "$repaired" ] &&
239 repaired=$($SHOW_NAMESPACE |
240 awk '/^updated_phase1/ { print $2 }')
242 [ $repaired -eq 1 ] ||
243 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
247 mount_client $MOUNT || error "(6) Fail to start client!"
249 #define OBD_FAIL_FID_LOOKUP 0x1505
250 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
251 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
253 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
255 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
# Steps for the test labelled 1b below: skipped unless the MDT backend is
# ldiskfs.  A "dummy" file is created with fail_loc=0x1502 set, the namespace
# LFSCK is run with fail_loc=0x1506 set, exactly one repair is expected, and
# the file is then stat'ed from a client with fail_loc=0x1505 set.
259 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
260 skip "OI Scrub not implemented for ZFS" && return
264 #define OBD_FAIL_FID_INLMA 0x1502
265 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
266 touch $DIR/$tdir/dummy
268 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
270 #define OBD_FAIL_FID_NOLMA 0x1506
271 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
272 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
273 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
274 mdd.${MDT_DEV}.lfsck_namespace |
275 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
277 error "(4) unexpected status"
# Prefer the dirent_repaired counter; fall back to updated_phase1 when the
# server does not report it.
280 local repaired=$($SHOW_NAMESPACE |
281 awk '/^dirent_repaired/ { print $2 }')
282 # for interop with old server
283 [ -z "$repaired" ] &&
284 repaired=$($SHOW_NAMESPACE |
285 awk '/^updated_phase1/ { print $2 }')
287 [ $repaired -eq 1 ] ||
288 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
290 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
293 mount_client $MOUNT || error "(6) Fail to start client!"
295 #define OBD_FAIL_FID_LOOKUP 0x1505
296 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
297 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
299 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
301 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
# Steps for the test labelled 2a below: create "dummy" with fail_loc=0x1603
# set, run the namespace LFSCK to completion, expect exactly one linkEA
# repair, then check from a client that the file has one link and that
# path2fid followed by fid2path returns the original path.
306 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
307 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
308 touch $DIR/$tdir/dummy
310 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
312 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
313 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
314 mdd.${MDT_DEV}.lfsck_namespace |
315 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
317 error "(4) unexpected status"
# Prefer the linkea_repaired counter; fall back to updated_phase2 when the
# server does not report it.
320 local repaired=$($SHOW_NAMESPACE |
321 awk '/^linkea_repaired/ { print $2 }')
322 # for interop with old server
323 [ -z "$repaired" ] &&
324 repaired=$($SHOW_NAMESPACE |
325 awk '/^updated_phase2/ { print $2 }')
327 [ $repaired -eq 1 ] ||
328 error "(5) Fail to repair crashed linkEA: $repaired"
332 mount_client $MOUNT || error "(6) Fail to start client!"
334 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
335 error "(7) Fail to stat $DIR/$tdir/dummy"
# Round trip: path -> FID -> path must give back the same name.
337 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
338 local dummyname=$($LFS fid2path $DIR $dummyfid)
339 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
340 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
342 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
# Steps for the test labelled 2b below: create "dummy" with fail_loc=0x1604
# set, run the namespace LFSCK to completion, expect updated_phase2 to be 1,
# then check from a client that the file has one link and that path2fid
# followed by fid2path returns the original path.
348 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
349 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
350 touch $DIR/$tdir/dummy
352 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
354 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
355 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
356 mdd.${MDT_DEV}.lfsck_namespace |
357 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
359 error "(4) unexpected status"
362 local repaired=$($SHOW_NAMESPACE |
363 awk '/^updated_phase2/ { print $2 }')
364 [ $repaired -eq 1 ] ||
365 error "(5) Fail to repair crashed linkEA: $repaired"
369 mount_client $MOUNT || error "(6) Fail to start client!"
371 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
372 error "(7) Fail to stat $DIR/$tdir/dummy"
# Round trip: path -> FID -> path must give back the same name.
374 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
375 local dummyname=$($LFS fid2path $DIR $dummyfid)
376 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
377 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
379 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
# Steps for the test labelled 2c below: create "dummy" with fail_loc=0x1605
# set, run the namespace LFSCK to completion, expect updated_phase2 to be 1,
# then check from a client that the file has one link and that path2fid
# followed by fid2path returns the original path.
385 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
386 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
387 touch $DIR/$tdir/dummy
389 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
391 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
392 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
393 mdd.${MDT_DEV}.lfsck_namespace |
394 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
396 error "(4) unexpected status"
399 local repaired=$($SHOW_NAMESPACE |
400 awk '/^updated_phase2/ { print $2 }')
401 [ $repaired -eq 1 ] ||
402 error "(5) Fail to repair crashed linkEA: $repaired"
406 mount_client $MOUNT || error "(6) Fail to start client!"
408 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
409 error "(7) Fail to stat $DIR/$tdir/dummy"
# Round trip: path -> FID -> path must give back the same name.
411 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
412 local dummyname=$($LFS fid2path $DIR $dummyfid)
413 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
414 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
416 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
# Steps for the test labelled 2d below: create "dummy" with fail_loc=0x161d
# set, run the namespace LFSCK to completion, expect linkea_repaired to be 1,
# then check from a client that the file has one link and that path2fid
# followed by fid2path returns the original path.
422 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
423 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
424 touch $DIR/$tdir/dummy
426 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
428 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
429 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
430 mdd.${MDT_DEV}.lfsck_namespace |
431 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
433 error "(4) unexpected status"
436 local repaired=$($SHOW_NAMESPACE |
437 awk '/^linkea_repaired/ { print $2 }')
438 [ $repaired -eq 1 ] ||
439 error "(5) Fail to repair crashed linkEA: $repaired"
443 mount_client $MOUNT || error "(6) Fail to start client!"
445 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
446 error "(7) Fail to stat $DIR/$tdir/dummy"
# Round trip: path -> FID -> path must give back the same name.
448 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
449 local dummyname=$($LFS fid2path $DIR $dummyfid)
450 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
451 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
453 run_test 2d "LFSCK can recover the missing linkEA entry"
# Steps for the test labelled 2e below: needs at least two MDTs.  Directory
# d0 is made with "-i 1" and d0/d1 with "-i 0" while fail_loc=0x1603 is set;
# the namespace LFSCK is started with "-r -A", all targets are awaited via
# wait_all_targets_blocked, one linkEA repair is expected, and the
# path2fid/fid2path round trip on d0/d1 must return the original path.
457 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
461 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
463 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
464 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
465 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
466 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
468 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
470 wait_all_targets_blocked namespace completed 4
472 local repaired=$($SHOW_NAMESPACE |
473 awk '/^linkea_repaired/ { print $2 }')
474 [ $repaired -eq 1 ] ||
475 error "(5) Fail to repair crashed linkEA: $repaired"
477 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
478 local name=$($LFS fid2path $DIR $fid)
479 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
480 error "(6) Fail to repair linkEA: $fid $name"
482 run_test 2e "namespace LFSCK can verify remote object linkEA"
# Steps for the test labelled 3 below: build hard links to d0/f0 and d0/f1
# under "dummy", create edir/f0 and edir/f1, then add hard links w0 and w1
# with fail_loc=0x1603 and 0x1604 set respectively.  After the namespace
# LFSCK completes, checked_phase2 must be at least 4 and
# multiple_linked_repaired at least 2.
488 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
489 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
490 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
492 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
493 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
494 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
496 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
497 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
498 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
500 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
501 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
502 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
504 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
506 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
507 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
508 mdd.${MDT_DEV}.lfsck_namespace |
509 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
511 error "(10) unexpected status"
514 local checked=$($SHOW_NAMESPACE |
515 awk '/^checked_phase2/ { print $2 }')
516 [ $checked -ge 4 ] ||
517 error "(11) Fail to check multiple-linked object: $checked"
519 local repaired=$($SHOW_NAMESPACE |
520 awk '/^multiple_linked_repaired/ { print $2 }')
521 [ $repaired -ge 2 ] ||
522 error "(12) Fail to repair multiple-linked object: $repaired"
524 run_test 3 "LFSCK can verify multiple-linked objects"
# Steps for the test labelled 4 below: skipped unless the MDT backend is
# ldiskfs.  The client is unmounted, the MDS stopped, mds_backup_restore is
# run, and the MDS restarted with $MOUNT_OPTS_NOSCRUB.  The namespace LFSCK
# is then started under an injected delay; its flags must read
# "inconsistent" while scanning-phase1, be empty after completion, and at
# least 9 repairs are expected.  Finally the directory is listed from a
# client with fail_loc=0x1505 set.
528 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
529 skip "OI Scrub not implemented for ZFS" && return
532 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
533 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
535 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
536 echo "start $SINGLEMDS with disabling OI scrub"
537 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
538 error "(2) Fail to start MDS!"
540 #define OBD_FAIL_LFSCK_DELAY2 0x1601
541 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
542 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
543 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
544 mdd.${MDT_DEV}.lfsck_namespace |
545 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
547 error "(5) unexpected status"
550 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
551 [ "$STATUS" == "scanning-phase1" ] ||
552 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the injected delay and wait for the status to read "completed".
554 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
555 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
556 mdd.${MDT_DEV}.lfsck_namespace |
557 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
559 error "(7) unexpected status"
# NOTE(review): FLAGS is assigned without "local" here.
562 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
563 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
# Prefer the dirent_repaired counter; fall back to updated_phase1 when the
# server does not report it.
565 local repaired=$($SHOW_NAMESPACE |
566 awk '/^dirent_repaired/ { print $2 }')
567 # for interop with old server
568 [ -z "$repaired" ] &&
569 repaired=$($SHOW_NAMESPACE |
570 awk '/^updated_phase1/ { print $2 }')
572 [ $repaired -ge 9 ] ||
573 error "(9) Fail to re-generate FID-in-dirent: $repaired"
577 mount_client $MOUNT || error "(10) Fail to start client!"
579 #define OBD_FAIL_FID_LOOKUP 0x1505
580 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
581 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
582 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
584 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
# Steps for the test labelled 5 below: skipped unless the MDT backend is
# ldiskfs.  Same flow as the backup/restore test above, except that
# mds_backup_restore receives an extra argument "1", the expected flags while
# scanning are "inconsistent,upgrade", at least 2 repairs are expected, and
# the final client checks also stat "dummy" and verify its
# path2fid/fid2path round trip.
588 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
589 skip "OI Scrub not implemented for ZFS" && return
592 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
593 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
595 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
596 echo "start $SINGLEMDS with disabling OI scrub"
597 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
598 error "(2) Fail to start MDS!"
600 #define OBD_FAIL_LFSCK_DELAY2 0x1601
601 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
602 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
603 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
604 mdd.${MDT_DEV}.lfsck_namespace |
605 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
607 error "(5) unexpected status"
610 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
611 [ "$STATUS" == "scanning-phase1" ] ||
612 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the injected delay and wait for the status to read "completed".
614 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
615 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
616 mdd.${MDT_DEV}.lfsck_namespace |
617 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
619 error "(7) unexpected status"
# NOTE(review): FLAGS is assigned without "local" here.
622 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
623 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
# Prefer the dirent_repaired counter; fall back to updated_phase1 when the
# server does not report it.
625 local repaired=$($SHOW_NAMESPACE |
626 awk '/^dirent_repaired/ { print $2 }')
627 # for interop with old server
628 [ -z "$repaired" ] &&
629 repaired=$($SHOW_NAMESPACE |
630 awk '/^updated_phase1/ { print $2 }')
632 [ $repaired -ge 2 ] ||
633 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
637 mount_client $MOUNT || error "(10) Fail to start client!"
639 #define OBD_FAIL_FID_LOOKUP 0x1505
640 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
641 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
643 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
645 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
646 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
647 local dummyname=$($LFS fid2path $DIR $dummyfid)
648 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
649 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
651 run_test 5 "LFSCK can handle IGIF object upgrading"
# Steps for the test labelled 6a below: start the namespace LFSCK under an
# injected delay, force it into the "failed" state with fail_loc=0x80001608,
# and record last_checkpoint_position as POS0.  After restarting (without
# -r), latest_start_position (POS1) must be greater than POS0; the run is
# then allowed to complete.
# NOTE(review): the tails of the POS0/POS1 pipelines and the sleep mentioned
# in the comment below are on lines not visible here.
656 #define OBD_FAIL_LFSCK_DELAY1 0x1600
657 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
658 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
660 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
661 [ "$STATUS" == "scanning-phase1" ] ||
662 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
664 # Sleep 3 sec to guarantee at least one object processed by LFSCK
666 # Fail the LFSCK to guarantee there is at least one checkpoint
667 #define OBD_FAIL_LFSCK_FATAL1 0x1608
668 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
669 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
670 mdd.${MDT_DEV}.lfsck_namespace |
671 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
673 error "(4) unexpected status"
676 local POS0=$($SHOW_NAMESPACE |
677 awk '/^last_checkpoint_position/ { print $2 }' |
680 #define OBD_FAIL_LFSCK_DELAY1 0x1600
681 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
682 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
684 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
685 [ "$STATUS" == "scanning-phase1" ] ||
686 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
688 local POS1=$($SHOW_NAMESPACE |
689 awk '/^latest_start_position/ { print $2 }' |
691 [[ $POS0 -lt $POS1 ]] ||
692 error "(7) Expect larger than: $POS0, but got $POS1"
694 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
695 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
696 mdd.${MDT_DEV}.lfsck_namespace |
697 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
699 error "(8) unexpected status"
702 run_test 6a "LFSCK resumes from last checkpoint (1)"
# Steps for the test labelled 6b below: same resume-from-checkpoint flow as
# 6a, but using fail_loc 0x1601 for the delay and 0x80001609 for the forced
# failure.  Two values are read from each position line (field 2 as O_POS*,
# field 4 as D_POS*); when either D_POS value is "N/A" the O_POS values are
# compared, otherwise the D_POS values are.
# NOTE(review): the tails of the O_POS0/O_POS1 pipelines, the "else" between
# the two comparisons, and the sleep mentioned below are on lines not visible
# here.
707 #define OBD_FAIL_LFSCK_DELAY2 0x1601
708 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
709 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
711 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
712 [ "$STATUS" == "scanning-phase1" ] ||
713 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
715 # Sleep 5 sec to guarantee that we are in the directory scanning
717 # Fail the LFSCK to guarantee there is at least one checkpoint
718 #define OBD_FAIL_LFSCK_FATAL2 0x1609
719 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
720 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
721 mdd.${MDT_DEV}.lfsck_namespace |
722 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
724 error "(4) unexpected status"
727 local O_POS0=$($SHOW_NAMESPACE |
728 awk '/^last_checkpoint_position/ { print $2 }' |
731 local D_POS0=$($SHOW_NAMESPACE |
732 awk '/^last_checkpoint_position/ { print $4 }')
734 #define OBD_FAIL_LFSCK_DELAY2 0x1601
735 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
736 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
738 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
739 [ "$STATUS" == "scanning-phase1" ] ||
740 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
742 local O_POS1=$($SHOW_NAMESPACE |
743 awk '/^latest_start_position/ { print $2 }' |
745 local D_POS1=$($SHOW_NAMESPACE |
746 awk '/^latest_start_position/ { print $4 }')
748 if [ "$D_POS0" == "N/A" -o "$D_POS1" == "N/A" ]; then
749 [[ $O_POS0 -lt $O_POS1 ]] ||
750 error "(7.1) $O_POS1 is not larger than $O_POS0"
752 [[ $D_POS0 -lt $D_POS1 ]] ||
753 error "(7.2) $D_POS1 is not larger than $D_POS0"
756 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
757 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
758 mdd.${MDT_DEV}.lfsck_namespace |
759 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
761 error "(8) unexpected status"
764 run_test 6b "LFSCK resumes from last checkpoint (2)"
# Steps for the test labelled 7a below: start the namespace LFSCK under an
# injected delay, stop the MDS while the status is scanning-phase1, clear the
# fail_loc, restart the MDS with $MOUNT_OPTS_SCRUB, and expect the LFSCK
# status to reach "completed" without starting it again by hand.
771 #define OBD_FAIL_LFSCK_DELAY2 0x1601
772 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
773 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
775 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
776 [ "$STATUS" == "scanning-phase1" ] ||
777 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
779 # Sleep 3 sec to guarantee at least one object processed by LFSCK
781 echo "stop $SINGLEMDS"
782 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop MDS!"
784 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
785 echo "start $SINGLEMDS"
786 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
787 error "(5) Fail to start MDS!"
789 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
790 mdd.${MDT_DEV}.lfsck_namespace |
791 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
793 error "(6) unexpected status"
796 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
# Steps for the test labelled 7b below: create 20 "dummy<i>" files with
# fail_loc=0x1604 set, start the namespace LFSCK with fail_loc=0x1602 and
# wait for scanning-phase2, stop the MDS, clear the fail_loc, restart the MDS
# with $MOUNT_OPTS_SCRUB, and expect the status to reach "completed" without
# starting LFSCK again by hand.
802 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
803 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
804 for ((i = 0; i < 20; i++)); do
805 touch $DIR/$tdir/dummy${i}
808 #define OBD_FAIL_LFSCK_DELAY3 0x1602
809 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
810 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
811 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
812 mdd.${MDT_DEV}.lfsck_namespace |
813 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
815 error "(4) unexpected status"
819 echo "stop $SINGLEMDS"
820 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop MDS!"
822 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
823 echo "start $SINGLEMDS"
824 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
825 error "(6) Fail to start MDS!"
827 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
828 mdd.${MDT_DEV}.lfsck_namespace |
829 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
831 error "(7) unexpected status"
834 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
# Steps for the test labelled 8 below ("LFSCK state machine"): walk the
# namespace LFSCK through the statuses init -> scanning-phase1 -> stopped ->
# scanning-phase1 -> failed -> scanning-phase1 -> crashed -> scanning-phase1
# -> paused -> paused (after a skip_lfsck mount) -> scanning-phase2 ->
# completed, asserting the reported status (and finally the flags) at each
# step.
# NOTE(review): the "timer" variable used by the recovery-wait loops, and the
# loop bodies' sleep/increment and "done" lines, are not visible here.
839 formatall > /dev/null
845 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
846 [ "$STATUS" == "init" ] ||
847 error "(2) Expect 'init', but got '$STATUS'"
# Create one directory and five files with linkEA fault injection active.
849 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
850 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
851 mkdir $DIR/$tdir/crashed
853 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
854 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
855 for ((i = 0; i < 5; i++)); do
856 touch $DIR/$tdir/dummy${i}
859 umount_client $MOUNT || error "(3) Fail to stop client!"
# init -> scanning-phase1 -> stopped -> scanning-phase1.
861 #define OBD_FAIL_LFSCK_DELAY2 0x1601
862 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
863 $START_NAMESPACE || error "(4) Fail to start LFSCK for namespace!"
865 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
866 [ "$STATUS" == "scanning-phase1" ] ||
867 error "(5) Expect 'scanning-phase1', but got '$STATUS'"
869 $STOP_LFSCK || error "(6) Fail to stop LFSCK!"
871 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
872 [ "$STATUS" == "stopped" ] ||
873 error "(7) Expect 'stopped', but got '$STATUS'"
875 $START_NAMESPACE || error "(8) Fail to start LFSCK for namespace!"
877 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
878 [ "$STATUS" == "scanning-phase1" ] ||
879 error "(9) Expect 'scanning-phase1', but got '$STATUS'"
# scanning-phase1 -> failed, then start again.
881 #define OBD_FAIL_LFSCK_FATAL2 0x1609
882 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
883 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
884 mdd.${MDT_DEV}.lfsck_namespace |
885 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
887 error "(10) unexpected status"
890 #define OBD_FAIL_LFSCK_DELAY1 0x1600
891 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
892 $START_NAMESPACE || error "(11) Fail to start LFSCK for namespace!"
894 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
895 [ "$STATUS" == "scanning-phase1" ] ||
896 error "(12) Expect 'scanning-phase1', but got '$STATUS'"
# Stop the MDS with fail_loc=0x160a set, then restart it with fail_loc=0x160b
# set; the status is expected to read "crashed" afterwards.
898 #define OBD_FAIL_LFSCK_CRASH 0x160a
899 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
902 echo "stop $SINGLEMDS"
903 stop $SINGLEMDS > /dev/null || error "(13) Fail to stop MDS!"
905 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
906 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
908 echo "start $SINGLEMDS"
909 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
910 error "(14) Fail to start MDS!"
# Poll recovery_status until it is no longer RECOVERING, bounded by
# max_recovery_time.
912 local timeout=$(max_recovery_time)
915 while [ $timer -lt $timeout ]; do
916 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
917 mdt.${MDT_DEV}.recovery_status |
918 awk '/^status/ { print \\\$2 }'")
919 [ "$STATUS" != "RECOVERING" ] && break;
924 [ $timer != $timeout ] ||
925 error "(14.1) recovery timeout"
927 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
928 [ "$STATUS" == "crashed" ] ||
929 error "(15) Expect 'crashed', but got '$STATUS'"
# crashed -> scanning-phase1, then a plain MDS stop/start (again with
# fail_loc=0x160b) is expected to leave the status at "paused".
931 #define OBD_FAIL_LFSCK_DELAY2 0x1601
932 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
933 $START_NAMESPACE || error "(16) Fail to start LFSCK for namespace!"
935 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
936 [ "$STATUS" == "scanning-phase1" ] ||
937 error "(17) Expect 'scanning-phase1', but got '$STATUS'"
939 echo "stop $SINGLEMDS"
940 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop MDS!"
942 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
943 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
945 echo "start $SINGLEMDS"
946 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
947 error "(19) Fail to start MDS!"
950 while [ $timer -lt $timeout ]; do
951 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
952 mdt.${MDT_DEV}.recovery_status |
953 awk '/^status/ { print \\\$2 }'")
954 [ "$STATUS" != "RECOVERING" ] && break;
959 [ $timer != $timeout ] ||
960 error "(19.1) recovery timeout"
962 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
963 [ "$STATUS" == "paused" ] ||
964 error "(20) Expect 'paused', but got '$STATUS'"
# Remount with $MOUNT_OPTS_SKIP_LFSCK: the status must still be "paused".
966 echo "stop $SINGLEMDS"
967 stop $SINGLEMDS > /dev/null || error "(20.1) Fail to stop MDS!"
969 echo "start $SINGLEMDS without resume LFSCK"
970 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SKIP_LFSCK > /dev/null ||
971 error "(20.2) Fail to start MDS!"
974 while [ $timer -lt $timeout ]; do
975 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
976 mdt.${MDT_DEV}.recovery_status |
977 awk '/^status/ { print \\\$2 }'")
978 [ "$STATUS" != "RECOVERING" ] && break;
983 [ $timer != $timeout ] ||
984 error "(20.3) recovery timeout"
986 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
987 [ "$STATUS" == "paused" ] ||
988 error "(20.4) Expect 'paused', but got '$STATUS'"
# paused -> scanning-phase2 (flags "scanned-once,inconsistent") -> completed
# (flags empty).
990 #define OBD_FAIL_LFSCK_DELAY3 0x1602
991 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
993 $START_NAMESPACE || error "(21) Fail to start LFSCK for namespace!"
994 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
995 mdd.${MDT_DEV}.lfsck_namespace |
996 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
998 error "(22) unexpected status"
1001 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1002 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
1003 error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
1005 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1006 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1007 mdd.${MDT_DEV}.lfsck_namespace |
1008 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1010 error "(24) unexpected status"
1013 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1014 [ -z "$FLAGS" ] || error "(25) Expect empty flags, but got '$FLAGS'"
1016 run_test 8 "LFSCK state machine"
# Steps for the test labelled 9a below ("LFSCK speed control (1)"): skipped
# when /proc/cpuinfo has no "processor ...: 1" line or the MDS is older than
# 2.7.50.  Creates 5000 files, starts the layout LFSCK with "-s 100", and
# checks average_speed_phase1 against an upper bound derived from that limit.
# The limit is then raised to 300 through lfsck_speed_limit and the average
# is checked against lower and upper bounds; finally the limit is set to 0
# and the run is awaited until "completed".
# NOTE(review): RUN_TIME1, RUN_TIME2 and TIME_DIFF are used below but are not
# assigned on any line visible here (nor are the sleeps that presumably
# separate the measurements) -- confirm their definitions.
1019 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1020 skip "Testing on UP system, the speed may be inaccurate."
1024 [[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.50) ]] ||
1025 { skip "Need MDS version >= 2.7.50"; return; }
1027 check_mount_and_prep
1028 $LFS mkdir -i 0 $DIR/$tdir/lfsck || error "(1) Fail to mkdir lfsck"
1029 $LFS setstripe -c 1 -i -1 $DIR/$tdir/lfsck
1030 createmany -o $DIR/$tdir/lfsck/f 5000
1032 local BASE_SPEED1=100
1034 $START_LAYOUT -r -s $BASE_SPEED1 || error "(2) Fail to start LFSCK!"
# NOTE(review): STATUS is assigned without "local" here.
1037 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
1038 [ "$STATUS" == "scanning-phase1" ] ||
1039 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
1041 local SPEED=$($SHOW_LAYOUT |
1042 awk '/^average_speed_phase1/ { print $2 }')
1044 # There may be time error, normally it should be less than 2 seconds.
1045 # We allow another 20% schedule error.
1047 # MAX_MARGIN = 1.2 = 12 / 10
1048 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1049 RUN_TIME1 * 12 / 10))
# Exceeding the upper bound is reported through error_ignore (LU-9887).
1050 [ $SPEED -lt $MAX_SPEED ] || {
1052 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1053 error_ignore LU-9887 "(4) Speed $SPEED, expected < $MAX_SPEED"
1056 # adjust speed limit
1057 local BASE_SPEED2=300
1059 do_facet $SINGLEMDS \
1060 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1063 SPEED=$($SHOW_LAYOUT | awk '/^average_speed_phase1/ { print $2 }')
1064 # MIN_MARGIN = 0.8 = 8 / 10
1065 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1066 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1067 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
1068 [ $SPEED -gt $MIN_SPEED ] || {
1069 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1070 error_ignore LU-5624 \
1071 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
1074 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
1078 # MAX_MARGIN = 1.2 = 12 / 10
1079 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1080 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1081 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
1082 [ $SPEED -lt $MAX_SPEED ] || {
1084 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1085 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1086 error_ignore LU-9887 "(6) Speed $SPEED, expected < $MAX_SPEED"
# Set the limit to 0 (presumably "no limit" -- confirm) and wait for the
# layout LFSCK to finish.
1089 do_facet $SINGLEMDS \
1090 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1092 wait_update_facet $SINGLEMDS \
1093 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout |
1094 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
1095 error "(7) Failed to get expected 'completed'"
1097 run_test 9a "LFSCK speed control (1)"
# Test 9b, registered at the end of this block as LFSCK speed control (2).
# Runs a namespace LFSCK under two successive speed limits and compares the
# average_speed_phase2 value taken from $SHOW_NAMESPACE against bounds
# computed from those limits with a 20% margin either way.
# NOTE(review): this listing looks incomplete -- the function header, the
# closing fi/done/} tokens and the definitions of RUN_TIME1, RUN_TIME2 and
# TIME_DIFF are not visible here; confirm against the full file.
#
# Skip when /proc/cpuinfo has no line matching processor.*: 1.
1100 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1101 skip "Testing on UP system, the speed may be inaccurate."
# Skip when the version code reported for $SINGLEMDS is below 2.7.50.
1105 [[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.50) ]] ||
1106 { skip "Need MDS version >= 2.7.50"; return; }
# Create the directories and files while fail_loc 0x1604 is set on the MDS.
1110 echo "Preparing another 50 * 50 files (with error) at $(date)."
1111 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1112 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1113 createmany -d $DIR/$tdir/d 50
1114 createmany -m $DIR/$tdir/f 50
1115 for ((i = 0; i < 50; i++)); do
1116 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
# Run one namespace LFSCK with fail_loc 0x160c set and wait until its status
# reads stopped, then clear fail_loc.
1119 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
1120 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1121 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
1122 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1123 mdd.${MDT_DEV}.lfsck_namespace |
1124 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1126 error "(5) unexpected status"
1129 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1130 echo "Prepared at $(date)."
# Start again (without -r) under the first speed limit; the scan is expected
# to be in phase 2 when it is sampled.
1132 local BASE_SPEED1=50
1134 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
1137 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1138 [ "$STATUS" == "scanning-phase2" ] ||
1139 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
1141 local SPEED=$($SHOW_NAMESPACE |
1142 awk '/^average_speed_phase2/ { print $2 }')
1143 # There may be time error, normally it should be less than 2 seconds.
1144 # We allow another 20% schedule error.
1146 # MAX_MARGIN = 1.2 = 12 / 10
1147 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1148 RUN_TIME1 * 12 / 10))
# A speed at or above MAX_SPEED is reported through error_ignore, tagged
# LU-9887.
1149 [ $SPEED -lt $MAX_SPEED ] || {
1151 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1152 error_ignore LU-9887 "(8) Speed $SPEED, expected < $MAX_SPEED"
1155 # adjust speed limit
1156 local BASE_SPEED2=150
1158 do_facet $SINGLEMDS \
1159 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1162 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
1163 # MIN_MARGIN = 0.8 = 8 / 10
1164 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1165 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1166 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
# Too slow: when the MDS backend is not ldiskfs the (9.1) message goes
# through error_ignore tagged LU-5624.
# NOTE(review): the command that reports the (9.2) message is not visible
# in this listing.
1167 [ $SPEED -gt $MIN_SPEED ] || {
1168 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1169 error_ignore LU-5624 \
1170 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1173 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
1177 # MAX_MARGIN = 1.2 = 12 / 10
1178 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1179 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1180 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
1181 [ $SPEED -lt $MAX_SPEED ] || {
1183 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1184 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1185 error_ignore LU-9887 "(10) Speed $SPEED, expected < $MAX_SPEED"
# Set lfsck_speed_limit to 0 and wait for the namespace LFSCK status to read
# completed.
1188 do_facet $SINGLEMDS \
1189 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1190 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1191 mdd.${MDT_DEV}.lfsck_namespace |
1192 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1194 error "(11) unexpected status"
1197 run_test 9b "LFSCK speed control (2)"
# Test 10, registered at the end of this block as System is available
# during LFSCK scanning.
# Builds a tree under two different fail_loc settings, starts a rate-limited
# namespace LFSCK, and checks that ordinary file operations succeed while
# the scan is still in scanning-phase1.
# NOTE(review): this listing looks incomplete -- the function header and the
# closing done/} tokens are not visible here; confirm against the full file.
#
# Only runs when the MDS backend is ldiskfs.
1201 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
1202 skip "lookup(..)/linkea on ZFS issue" && return
# Even-numbered d<i>/f<i> entries are created with fail_loc 0x1603 set.
1206 echo "Preparing more files with error at $(date)."
1207 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1208 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
1210 for ((i = 0; i < 1000; i = $((i+2)))); do
1211 mkdir -p $DIR/$tdir/d${i}
1212 touch $DIR/$tdir/f${i}
1213 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
# Odd-numbered entries are created with fail_loc 0x1604 set instead.
1216 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1217 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1219 for ((i = 1; i < 1000; i = $((i+2)))); do
1220 mkdir -p $DIR/$tdir/d${i}
1221 touch $DIR/$tdir/f${i}
1222 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1225 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1226 echo "Prepared at $(date)."
# Add a hard link to f200 after fail_loc has been cleared, then remount the
# client.
1228 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
1230 umount_client $MOUNT
1231 mount_client $MOUNT || error "(3) Fail to start client!"
# Start the namespace LFSCK with -s 100 and confirm it is in phase 1.
1233 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1236 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1237 [ "$STATUS" == "scanning-phase1" ] ||
1238 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Each of the following operations must succeed while the scan is running:
# recursive listing, create, mkdir, unlink, recursive remove, rename,
# hard link and symbolic link.
1240 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1242 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1244 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1246 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1248 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1250 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1252 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1254 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1255 error "(14) Fail to softlink!"
# The scan must still be in phase 1 after those operations.
1257 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1258 [ "$STATUS" == "scanning-phase1" ] ||
1259 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
# Set lfsck_speed_limit to 0 and wait for the status to read completed.
1261 do_facet $SINGLEMDS \
1262 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1263 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1264 mdd.${MDT_DEV}.lfsck_namespace |
1265 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1267 error "(16) unexpected status"
1270 run_test 10 "System is available during LFSCK scanning"
# Remove the LAST_ID file and the d0/0 entry under O/<idx> from the backend
# filesystem of ost<ost>, using a local mount of that backend.
# Returns 1 when mount_fstype fails and 2 when unmount_fstype fails.
# NOTE(review): the assignments of ost and idx and the closing brace are not
# visible in this listing; the visible caller passes two arguments (1 0),
# presumably the OST number followed by idx -- confirm against the full file.
1273 ost_remove_lastid() {
1276 local rcmd="do_facet ost${ost}"
1278 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1280 # step 1: local mount
1281 mount_fstype ost${ost} || return 1
1282 # step 2: remove the specified LAST_ID
1283 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
# The exit status of the rm above is not checked; only the unmount is.
1285 unmount_fstype ost${ost} || return 2
# Test 11a, registered at the end of this block as LFSCK can rebuild lost
# last_id.
# Removes LAST_ID on ost1, starts a layout LFSCK on that OST and expects the
# crashed_lastid flag to appear first and an empty flags field at the end.
# NOTE(review): this listing looks incomplete -- the function header, the
# closing } tokens and whatever runs between the file creation and the
# LAST_ID removal are not visible here; confirm against the full file.
1289 check_mount_and_prep
1290 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1291 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
1296 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
# Bring ost1 up with $MOUNT_OPTS_NOSCRUB as its mount options.
1298 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1299 error "(2) Fail to start ost1"
# fail_loc 0x160e with fail_val 3 is set on ost1 before the LFSCK starts, so
# the crashed_lastid flag can be observed before the scan finishes
# (presumably by delaying the scan -- confirm).
1301 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1302 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1304 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1305 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
1307 wait_update_facet ost1 "$LCTL get_param -n \
1308 obdfilter.${OST_DEV}.lfsck_layout |
1309 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1311 error "(5) unexpected status"
# Clear the fail injection and wait for the scan to complete.
1314 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1316 wait_update_facet ost1 "$LCTL get_param -n \
1317 obdfilter.${OST_DEV}.lfsck_layout |
1318 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1320 error "(6) unexpected status"
# After completion the flags field must be empty.
1323 echo "the LAST_ID(s) should have been rebuilt"
1324 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1325 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
1327 run_test 11a "LFSCK can rebuild lost last_id"
# Test 11b, registered at the end of this block as LFSCK can rebuild crashed
# last_id.
# Creates files while fail_loc 0x160d is set on ost1, records the last_id
# value, restarts ost1, checks that the value read after restart is smaller,
# then runs a layout LFSCK on the OST and expects the recorded value to be
# reported again after one more restart.
# NOTE(review): this listing looks incomplete -- the function header and the
# closing done/} tokens are not visible here; confirm against the full file.
1330 check_mount_and_prep
1331 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1333 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1334 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1335 do_facet ost1 $LCTL set_param fail_loc=0x160d
# Create 32 more files than the count returned by precreated_ost_obj_count.
1337 local count=$(precreated_ost_obj_count 0 0)
1339 createmany -o $DIR/$tdir/f $((count + 32))
# seq is read from the osp prealloc_last_seq parameter on mds1; lastid1 is
# the second colon-separated field of the last_id line matching seq on ost1.
1341 local proc_path="${FSNAME}-OST0000-osc-MDT0000"
1342 local seq=$(do_facet mds1 $LCTL get_param -n \
1343 osp.${proc_path}.prealloc_last_seq)
1344 local lastid1=$(do_facet ost1 "lctl get_param -n \
1345 obdfilter.${ost1_svc}.last_id" | grep $seq |
1346 awk -F: '{ print $2 }')
1348 umount_client $MOUNT
1349 stop ost1 || error "(1) Fail to stop ost1"
# ost1 is restarted with fail_loc 0x215 set.
1351 #define OBD_FAIL_OST_ENOSPC 0x215
1352 do_facet ost1 $LCTL set_param fail_loc=0x215
1354 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1355 error "(2) Fail to start ost1"
# Poll for up to 60 iterations until a last_id value for seq can be read.
# NOTE(review): lastid2 is tested unquoted in the -z check below.
1357 for ((i = 0; i < 60; i++)); do
1358 lastid2=$(do_facet ost1 "lctl get_param -n \
1359 obdfilter.${ost1_svc}.last_id" | grep $seq |
1360 awk -F: '{ print $2 }')
1361 [ ! -z $lastid2 ] && break;
1365 echo "the on-disk LAST_ID should be smaller than the expected one"
1366 [ $lastid1 -gt $lastid2 ] ||
1367 error "(4) expect lastid1 [ $lastid1 ] > lastid2 [ $lastid2 ]"
1369 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1370 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1372 wait_update_facet ost1 "$LCTL get_param -n \
1373 obdfilter.${OST_DEV}.lfsck_layout |
1374 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1376 error "(6) unexpected status"
# Restart ost1 once more and expect last_id for seq to equal lastid1.
1379 stop ost1 || error "(7) Fail to stop ost1"
1381 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1382 error "(8) Fail to start ost1"
1384 echo "the on-disk LAST_ID should have been rebuilt"
1385 wait_update_facet ost1 "$LCTL get_param -n \
1386 obdfilter.${ost1_svc}.last_id | grep $seq |
1387 awk -F: '{ print \\\$2 }'" "$lastid1" 60 || {
1388 do_facet ost1 $LCTL get_param -n \
1389 obdfilter.${ost1_svc}.last_id
1390 error "(9) expect lastid1 $seq:$lastid1"
# Clear the fail injection and stop everything.
1393 do_facet ost1 $LCTL set_param fail_loc=0
1394 stopall || error "(10) Fail to stopall"
1396 run_test 11b "LFSCK can rebuild crashed last_id"
# Test 12a, registered at the end of this block as single command to trigger
# LFSCK on all devices.
# For the namespace type and then the layout type: start with -A and -s 1,
# expect scanning-phase1 on all targets, stop with -A, expect stopped,
# restart with -s 0 and expect completed.
# The last argument given to wait_all_targets and wait_all_targets_blocked
# follows the numbering used in the error messages here; presumably it is
# the step number those helpers report -- confirm.
# NOTE(review): this listing looks incomplete -- the function header and the
# closing done/} tokens are not visible here; confirm against the full file.
#
# Requires at least two MDTs.
1399 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# One directory per MDT (index k - 1), each filled with 100 files.
1401 check_mount_and_prep
1402 for k in $(seq $MDSCOUNT); do
1403 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1404 createmany -o $DIR/$tdir/${k}/f 100 ||
1405 error "(0) Fail to create 100 files."
1408 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1409 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1410 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1412 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1413 wait_all_targets namespace scanning-phase1 3
1415 echo "Stop namespace LFSCK on all targets by single lctl command."
1416 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1417 error "(4) Fail to stop LFSCK on all devices!"
1419 echo "All the LFSCK targets should be in 'stopped' status."
1420 wait_all_targets_blocked namespace stopped 5
1422 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1423 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1424 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1426 echo "All the LFSCK targets should be in 'completed' status."
1427 wait_all_targets_blocked namespace completed 7
# The layout half of the test runs between start_full_debug_logging and
# stop_full_debug_logging.
1429 start_full_debug_logging
1431 echo "Start layout LFSCK on all targets by single command (-s 1)."
1432 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1433 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1435 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1436 wait_all_targets layout scanning-phase1 9
1438 echo "Stop layout LFSCK on all targets by single lctl command."
1439 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1440 error "(10) Fail to stop LFSCK on all devices!"
1442 echo "All the LFSCK targets should be in 'stopped' status."
1443 wait_all_targets_blocked layout stopped 11
# Additionally read the layout LFSCK status directly from every OST.
1445 for k in $(seq $OSTCOUNT); do
1446 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1447 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1448 awk '/^status/ { print $2 }')
1449 [ "$STATUS" == "stopped" ] ||
1450 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1453 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1454 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1455 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1457 echo "All the LFSCK targets should be in 'completed' status."
1458 wait_all_targets_blocked layout completed 14
1460 stop_full_debug_logging
1462 run_test 12a "single command to trigger LFSCK on all devices"
# Test 12b, registered at the end of this block as auto detect Lustre device.
# Starts LFSCK with -A -r but without -M and waits for both the namespace
# and layout types to complete on all targets. When mds1 lists more than one
# device whose third lctl dl column contains mdt, it also checks that
# starting a layout LFSCK with neither -M nor -A fails.
# NOTE(review): this listing looks incomplete -- the function header and the
# closing fi/} tokens are not visible here; confirm against the full file.
1465 check_mount_and_prep
1467 echo "Start LFSCK without '-M' specified."
1468 do_facet mds1 $LCTL lfsck_start -A -r ||
1469 error "(0) Fail to start LFSCK without '-M'"
1471 wait_all_targets_blocked namespace completed 1
1472 wait_all_targets_blocked layout completed 2
# Count the mdt entries in the device list on mds1.
1474 local count=$(do_facet mds1 $LCTL dl |
1475 awk '{ print $3 }' | grep mdt | wc -l)
1476 if [ $count -gt 1 ]; then
1478 echo "Start layout LFSCK on the node with multipe targets,"
1479 echo "but not specify '-M'/'-A' option. Should get failure."
# NOTE(review): in the chain below the trailing true also runs when error
# returns non-zero, so the chain itself always ends with status 0.
1481 do_facet mds1 $LCTL lfsck_start -t layout -r &&
1482 error "(3) Start layout LFSCK should fail" || true
1485 run_test 12b "auto detect Lustre device"
# Test 13, registered at the end of this block as LFSCK can repair crashed
# lmm_oi.
# Creates two files while fail_loc 0x160f is set on the MDS (one through
# createmany, one composite file through lfs setstripe -E), runs a layout
# LFSCK and expects repaired_others to equal 2.
# NOTE(review): this listing looks incomplete -- the function header and the
# closing } tokens are not visible here; confirm against the full file.
1489 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1490 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1491 echo "MDT-object FID."
1494 check_mount_and_prep
1496 echo "Inject failure stub to simulate bad lmm_oi"
1497 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1498 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
1499 createmany -o $DIR/$tdir/f 1
1500 $LFS setstripe -E 1M -E -1 $DIR/$tdir/f1 ||
1501 error "(0) Fail to create PFL $DIR/$tdir/f1"
1502 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1504 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1505 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1507 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1508 mdd.${MDT_DEV}.lfsck_layout |
1509 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1511 error "(2) unexpected status"
# Both files created under the fail_loc are expected to be counted.
1514 local repaired=$($SHOW_LAYOUT |
1515 awk '/^repaired_others/ { print $2 }')
1516 [ $repaired -eq 2 ] ||
1517 error "(3) Fail to repair crashed lmm_oi: $repaired"
1519 run_test 13 "LFSCK can repair crashed lmm_oi"
# Test 14a, registered at the end of this block as LFSCK can repair
# MDT-object with dangling LOV EA reference (1).
# Creates files while fail_loc 0x1610 is set on ost1, runs a layout LFSCK
# once with -r only (stat on the guard files must still fail afterwards) and
# once with -r -c (stat must then report size 0), checking that
# repaired_dangling is at least 32 after each run.
# NOTE(review): this listing looks incomplete -- the function header, the
# closing done/} tokens and the commands between the stopall echo and
# setupall are not visible here; confirm against the full file.
1523 echo "The OST-object referenced by the MDT-object should be there;"
1524 echo "otherwise, the LFSCK should re-create the missing OST-object."
1525 echo "without '--delay-create-ostobj' option."
1528 check_mount_and_prep
1529 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1531 echo "Inject failure stub to simulate dangling referenced MDT-object"
1532 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1533 do_facet ost1 $LCTL set_param fail_loc=0x1610
# With the fail_loc set: create 16 more plain files than the precreated
# count, guard0, 16 composite files, and guard1.
1534 local count=$(precreated_ost_obj_count 0 0)
1536 createmany -o $DIR/$tdir/f $((count + 16)) ||
1537 error "(0.1) Fail to create $DIR/$tdir/fx"
1538 touch $DIR/$tdir/guard0
1540 for ((i = 0; i < 16; i++)); do
1541 $LFS setstripe -E 512K -S 256K -o 0 -E 2M \
1542 $DIR/$tdir/f_comp${i} ||
1543 error "(0.2) Fail to create $DIR/$tdir/f_comp${i}"
1545 touch $DIR/$tdir/guard1
1547 do_facet ost1 $LCTL set_param fail_loc=0
1549 start_full_debug_logging
1551 # exhaust other pre-created dangling cases
1552 count=$(precreated_ost_obj_count 0 0)
1553 createmany -o $DIR/$tdir/a $count ||
1554 error "(0.5) Fail to create $count files."
1556 echo "'ls' should fail because of dangling referenced MDT-object"
1557 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
# First pass: start with -r only.
1559 echo "Trigger layout LFSCK to find out dangling reference"
1560 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1562 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1563 mdd.${MDT_DEV}.lfsck_layout |
1564 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1566 error "(3) unexpected status"
1569 local repaired=$($SHOW_LAYOUT |
1570 awk '/^repaired_dangling/ { print $2 }')
1571 [ $repaired -ge 32 ] ||
1572 error "(4) Fail to repair dangling reference: $repaired"
1574 echo "'stat' should fail because of not repair dangling by default"
1575 stat $DIR/$tdir/guard0 > /dev/null 2>&1 &&
1576 error "(5.1) stat should fail"
1577 stat $DIR/$tdir/guard1 > /dev/null 2>&1 &&
1578 error "(5.2) stat should fail"
# Second pass: start with -r -c.
1580 echo "Trigger layout LFSCK to repair dangling reference"
1581 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1583 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1584 mdd.${MDT_DEV}.lfsck_layout |
1585 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1587 error "(7) unexpected status"
1590 # There may be some async LFSCK updates in processing, wait for
1591 # a while until the target reparation has been done. LU-4970.
1593 echo "'stat' should success after layout LFSCK repairing"
1594 wait_update_facet client "stat $DIR/$tdir/guard0 |
1595 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1596 stat $DIR/$tdir/guard0
1598 error "(8.1) unexpected size"
1601 wait_update_facet client "stat $DIR/$tdir/guard1 |
1602 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1603 stat $DIR/$tdir/guard1
1605 error "(8.2) unexpected size"
1608 repaired=$($SHOW_LAYOUT |
1609 awk '/^repaired_dangling/ { print $2 }')
1610 [ $repaired -ge 32 ] ||
1611 error "(9) Fail to repair dangling reference: $repaired"
1613 stop_full_debug_logging
1615 echo "stopall to cleanup object cache"
1618 setupall > /dev/null
1620 run_test 14a "LFSCK can repair MDT-object with dangling LOV EA reference (1)"
# Test 14b, registered at the end of this block as LFSCK can repair
# MDT-object with dangling LOV EA reference (2).
# Creates files while fail_loc 0x1610 is set on ost1, then runs a layout
# LFSCK with -r -o -d (stat on guard must still fail) and again with
# -r -o -c -d (stat must then report size 0), checking that
# repaired_dangling is at least 32 after each run.
# NOTE(review): this listing looks incomplete -- the function header, the
# closing } tokens and the commands between the stopall echo and setupall
# are not visible here; confirm against the full file.
1624 echo "The OST-object referenced by the MDT-object should be there;"
1625 echo "otherwise, the LFSCK should re-create the missing OST-object."
1626 echo "with '--delay-create-ostobj' option."
1629 check_mount_and_prep
1630 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1632 echo "Inject failure stub to simulate dangling referenced MDT-object"
1633 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1634 do_facet ost1 $LCTL set_param fail_loc=0x1610
# With the fail_loc set: create 31 more files than the precreated count,
# plus guard.
1635 local count=$(precreated_ost_obj_count 0 0)
1637 createmany -o $DIR/$tdir/f $((count + 31))
1638 touch $DIR/$tdir/guard
1639 do_facet ost1 $LCTL set_param fail_loc=0
1641 start_full_debug_logging
1643 # exhaust other pre-created dangling cases
1644 count=$(precreated_ost_obj_count 0 0)
1645 createmany -o $DIR/$tdir/a $count ||
1646 error "(0) Fail to create $count files."
1648 echo "'ls' should fail because of dangling referenced MDT-object"
1649 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
# First pass: start with -r -o -d.
1651 echo "Trigger layout LFSCK to find out dangling reference"
1652 $START_LAYOUT -r -o -d || error "(2) Fail to start LFSCK for layout!"
1654 wait_all_targets_blocked layout completed 3
1656 local repaired=$($SHOW_LAYOUT |
1657 awk '/^repaired_dangling/ { print $2 }')
1658 [ $repaired -ge 32 ] ||
1659 error "(4) Fail to repair dangling reference: $repaired"
1661 echo "'stat' should fail because of not repair dangling by default"
1662 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
# Second pass: start with -r -o -c -d.
1664 echo "Trigger layout LFSCK to repair dangling reference"
1665 $START_LAYOUT -r -o -c -d || error "(6) Fail to start LFSCK for layout!"
1667 wait_all_targets_blocked layout completed 7
1669 # There may be some async LFSCK updates in processing, wait for
1670 # a while until the target reparation has been done. LU-4970.
1672 echo "'stat' should success after layout LFSCK repairing"
1673 wait_update_facet client "stat $DIR/$tdir/guard |
1674 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1675 stat $DIR/$tdir/guard
1677 error "(8) unexpected size"
1680 repaired=$($SHOW_LAYOUT |
1681 awk '/^repaired_dangling/ { print $2 }')
1682 [ $repaired -ge 32 ] ||
1683 error "(9) Fail to repair dangling reference: $repaired"
1685 stop_full_debug_logging
1687 echo "stopall to cleanup object cache"
1690 setupall > /dev/null
1692 run_test 14b "LFSCK can repair MDT-object with dangling LOV EA reference (2)"
# Test 15a, registered at the end of this block as LFSCK can repair
# unmatched MDT-object/OST-object pairs (1).
# Writes f0 and a composite file f1 while fail_loc 0x1611 is set on all OST
# nodes, runs a layout LFSCK and expects repaired_unmatched_pair to be at
# least 3.
# NOTE(review): this listing looks incomplete -- the function header, the
# closing } tokens and the final line of the lfs setstripe command (its
# continuation is not visible) are missing here; confirm against the full
# file.
1696 echo "If the OST-object referenced by the MDT-object back points"
1697 echo "to some non-exist MDT-object, then the LFSCK should repair"
1698 echo "the OST-object to back point to the right MDT-object."
1701 check_mount_and_prep
1702 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1704 echo "Inject failure stub to make the OST-object to back point to"
1705 echo "non-exist MDT-object."
1706 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
1708 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
1709 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1710 $LFS setstripe -E 1M -S 256K -c 1 -E -1 -S 512K -c $OSTCOUNT \
1712 error "(0) Fail to create PFL $DIR/$tdir/f1"
1713 # 'dd' will trigger punch RPC firstly on every OST-objects.
1714 # So even though some OST-object will not be write by 'dd',
1715 # as long as it is allocated (may be NOT allocated in pfl_3b)
1716 # its layout information will be set also.
1717 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4K count=257
1718 cancel_lru_locks osc
1719 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1721 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1722 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1724 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1725 mdd.${MDT_DEV}.lfsck_layout |
1726 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1728 error "(2) unexpected status"
# The check is a lower bound (at least 3), not an exact count.
1731 local repaired=$($SHOW_LAYOUT |
1732 awk '/^repaired_unmatched_pair/ { print $2 }')
1733 [ $repaired -ge 3 ] ||
1734 error "(3) Fail to repair unmatched pair: $repaired"
1736 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# Test 15b, registered at the end of this block as LFSCK can repair
# unmatched MDT-object/OST-object pairs (2).
# Writes a guard file normally, then writes 0/f0 and a composite file f1
# while fail_loc 0x1612 is set on all OST nodes, runs a layout LFSCK and
# expects repaired_unmatched_pair to equal 4.
# NOTE(review): this listing looks incomplete -- the function header, the
# closing } tokens, the initial definition of stripes and the final line of
# the lfs setstripe command are not visible here; confirm against the full
# file.
1740 echo "If the OST-object referenced by the MDT-object back points"
1741 echo "to other MDT-object that doesn't recognize the OST-object,"
1742 echo "then the LFSCK should repair it to back point to the right"
1743 echo "MDT-object (the first one)."
1746 check_mount_and_prep
1747 mkdir -p $DIR/$tdir/0
1748 $LFS setstripe -c 1 -i 0 $DIR/$tdir/0
1749 dd if=/dev/zero of=$DIR/$tdir/0/guard bs=1M count=1
1750 cancel_lru_locks osc
1752 echo "Inject failure stub to make the OST-object to back point to"
1753 echo "other MDT-object"
# stripes is set to 2 when at least two OSTs are configured.
1756 [ $OSTCOUNT -ge 2 ] && stripes=2
1758 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1759 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1612
1760 dd if=/dev/zero of=$DIR/$tdir/0/f0 bs=1M count=1
1761 $LFS setstripe -E 1M -S 256K -c $stripes -E 2M -S 512K -c 1 \
1763 error "(0) Fail to create PFL $DIR/$tdir/f1"
1764 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2
1765 cancel_lru_locks osc
1766 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1768 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1769 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1771 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1772 mdd.${MDT_DEV}.lfsck_layout |
1773 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1775 error "(2) unexpected status"
# The check is an exact count of 4.
1778 local repaired=$($SHOW_LAYOUT |
1779 awk '/^repaired_unmatched_pair/ { print $2 }')
1780 [ $repaired -eq 4 ] ||
1781 error "(3) Fail to repair unmatched pair: $repaired"
1783 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# Test 15c, registered at the end of this block as LFSCK can repair
# unmatched MDT-object/OST-object pairs (3).
# Starts a directory migration in the background with fail_loc 0x1803 and
# fail_val 5 set on mds2, runs a layout LFSCK on all targets meanwhile, and
# expects repaired_unmatched_pair to be 1 and repaired_multiple_referenced
# to be 0.
# NOTE(review): this listing looks incomplete -- the function header and
# closing brace are not visible here; confirm against the full file.
#
# Requires at least two MDTs, and is skipped when the MDS version code is
# 2.7.55 or later (the skip message points to LU-6437).
1786 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
1788 [ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.55) ] &&
1789 skip "Skip the test after 2.7.55 see LU-6437" && return
1792 echo "According to current metadata migration implementation,"
1793 echo "before the old MDT-object is removed, both the new MDT-object"
1794 echo "and old MDT-object will reference the same LOV layout. Then if"
1795 echo "the layout LFSCK finds the new MDT-object by race, it will"
1796 echo "regard related OST-object(s) as multiple referenced case, and"
1797 echo "will try to create new OST-object(s) for the new MDT-object."
1798 echo "To avoid such trouble, the layout LFSCK needs to lock the old"
1799 echo "MDT-object before confirm the multiple referenced case."
# a1 is created on MDT index 1 and holds a single 1M file.
1802 check_mount_and_prep
1803 $LFS mkdir -i 1 $DIR/$tdir/a1
1804 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1805 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=1
1806 cancel_lru_locks osc
1808 echo "Inject failure stub on MDT1 to delay the migration"
1810 #define OBD_FAIL_MIGRATE_DELAY 0x1803
1811 do_facet mds2 $LCTL set_param fail_val=5 fail_loc=0x1803
# The migration to MDT index 0 runs in the background.
1812 echo "Migrate $DIR/$tdir/a1 from MDT1 to MDT0 with delay"
1813 $LFS migrate -m 0 $DIR/$tdir/a1 &
1816 echo "Trigger layout LFSCK to race with the migration"
1817 $START_LAYOUT -A -r || error "(1) Fail to start layout LFSCK!"
1819 wait_all_targets_blocked layout completed 2
1821 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
1822 local repaired=$($SHOW_LAYOUT |
1823 awk '/^repaired_unmatched_pair/ { print $2 }')
1824 [ $repaired -eq 1 ] ||
1825 error "(3) Fail to repair unmatched pair: $repaired"
# No multiple-reference repair is expected from the race.
1827 repaired=$($SHOW_LAYOUT |
1828 awk '/^repaired_multiple_referenced/ { print $2 }')
1829 [ $repaired -eq 0 ] ||
1830 error "(4) Unexpectedly repaird multiple references: $repaired"
1832 run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)"
# Test 16, registered at the end of this block as LFSCK can repair
# inconsistent MDT-object/OST-object owner.
# Runs chown 1.1 on f0 while fail_loc 0x1613 is set on the MDS, then runs a
# layout LFSCK and expects repaired_inconsistent_owner to equal 1.
# NOTE(review): this listing looks incomplete -- the function header and the
# closing } tokens are not visible here; confirm against the full file.
1836 echo "If the OST-object's owner information does not match the owner"
1837 echo "information stored in the MDT-object, then the LFSCK trust the"
1838 echo "MDT-object and update the OST-object's owner information."
1841 check_mount_and_prep
1842 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1843 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1844 cancel_lru_locks osc
1846 echo "Inject failure stub to skip OST-object owner changing"
1847 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
1848 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
1849 chown 1.1 $DIR/$tdir/f0
1850 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1852 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
1855 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1857 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1858 mdd.${MDT_DEV}.lfsck_layout |
1859 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1861 error "(2) unexpected status"
# Exactly one owner repair is expected, for the single chowned file.
1864 local repaired=$($SHOW_LAYOUT |
1865 awk '/^repaired_inconsistent_owner/ { print $2 }')
1866 [ $repaired -eq 1 ] ||
1867 error "(3) Fail to repair inconsistent owner: $repaired"
1869 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# Test 17, registered at the end of this block as LFSCK can repair multiple
# references.
# With fail_loc 0x1614 set on the MDS it writes guard (1M) and creates f0
# and a composite f1, checks that f0 and f1 both show guard size 1048576,
# runs a layout LFSCK expecting repaired_multiple_referenced to equal 2, and
# then verifies that writing 2M to f0 and f1 leaves guard at 1048576 bytes.
# NOTE(review): this listing looks incomplete -- the function header, the
# closing } tokens, the end of the third echo sentence and the final line of
# the lfs setstripe command are not visible here; confirm against the full
# file.
1873 echo "If more than one MDT-objects reference the same OST-object,"
1874 echo "and the OST-object only recognizes one MDT-object, then the"
1875 echo "LFSCK should create new OST-objects for such non-recognized"
1879 check_mount_and_prep
1880 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1882 echo "Inject failure stub to make two MDT-objects to refernce"
1883 echo "the OST-object"
1885 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
1886 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
# Lock caches (mdc and osc) are cancelled after each creation step.
1887 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1888 cancel_lru_locks mdc
1889 cancel_lru_locks osc
1891 createmany -o $DIR/$tdir/f 1
1892 cancel_lru_locks mdc
1893 cancel_lru_locks osc
1895 $LFS setstripe -E 2M -S 256K -o 0 -E 4M -S 512K -o 0 \
1897 error "(0) Fail to create PFL $DIR/$tdir/f1"
1898 cancel_lru_locks mdc
1899 cancel_lru_locks osc
1900 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# Before the repair, f0 and f1 are expected to show 1048576 bytes, which is
# the amount written to guard. The size is the fifth column of ls -l.
1902 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
1903 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard use the same OST-objects"
1904 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
1905 [ $size -eq 1048576 ] ||
1906 error "(1.1) f0 (wrong) size should be 1048576, but got $size"
1908 size=$(ls -l $DIR/$tdir/f1 | awk '{ print $5 }')
1909 [ $size -eq 1048576 ] ||
1910 error "(1.2) f1 (wrong) size should be 1048576, but got $size"
1912 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
1915 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1917 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1918 mdd.${MDT_DEV}.lfsck_layout |
1919 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1921 error "(3) unexpected status"
1924 local repaired=$($SHOW_LAYOUT |
1925 awk '/^repaired_multiple_referenced/ { print $2 }')
1926 [ $repaired -eq 2 ] ||
1927 error "(4) Fail to repair multiple references: $repaired"
# After the repair, growing f0 or f1 to 2M must not change the size of guard.
1929 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
1930 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
1931 error "(5) Fail to write f0."
1932 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1933 [ $size -eq 1048576 ] ||
1934 error "(6) guard size should be 1048576, but got $size"
1936 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard should use diff OST-objects"
1937 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2 ||
1938 error "(7) Fail to write f1."
1939 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1940 [ $size -eq 1048576 ] ||
1941 error "(8) guard size should be 1048576, but got $size"
1943 run_test 17 "LFSCK can repair multiple references"
# Top-level statement: add the cache flag to the debug parameter (+cache) via
# $LCTL on the node running this script, discarding the command output.
1945 $LCTL set_param debug=+cache > /dev/null
# Test 18a, registered at the end of this block as Find out orphan
# OST-object and repair it (1).
# Writes a1/f1 (plus a2/f2 when there are at least two MDTs) and a composite
# file f3, then runs chown 1.1 on each while fail_loc 0x1615 is set on the
# owning MDS. It checks that the reported sizes become wrong, runs a layout
# LFSCK with -r -o, checks repaired_orphan on each MDS (3 on mds1, 2 on
# mds2) and finally checks that the sizes match the saved values again.
# Sizes are taken from the sixth column of ls -il.
# NOTE(review): this listing looks incomplete -- the function header and the
# closing fi/done/} tokens are not visible here; confirm against the full
# file.
1949 echo "The target MDT-object is there, but related stripe information"
1950 echo "is lost or partly lost. The LFSCK should regenerate the missing"
1951 echo "layout EA entries."
1954 check_mount_and_prep
1955 $LFS mkdir -i 0 $DIR/$tdir/a1
1956 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1957 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
1959 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
# FID and stripe information are printed for the log before the damage.
1961 $LFS path2fid $DIR/$tdir/a1/f1
1962 $LFS getstripe $DIR/$tdir/a1/f1
1964 if [ $MDSCOUNT -ge 2 ]; then
1965 $LFS mkdir -i 1 $DIR/$tdir/a2
1966 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
1967 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1968 $LFS path2fid $DIR/$tdir/a2/f2
1969 $LFS getstripe $DIR/$tdir/a2/f2
1972 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
1973 error "(0) Fail to create PFL $DIR/$tdir/f3"
1975 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
1977 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
1979 $LFS path2fid $DIR/$tdir/f3
1980 $LFS getstripe $DIR/$tdir/f3
1982 cancel_lru_locks osc
# chown is the operation performed on each file while the fail_loc is set.
1984 echo "Inject failure, to make the MDT-object lost its layout EA"
1985 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
1986 do_facet mds1 $LCTL set_param fail_loc=0x1615
1987 chown 1.1 $DIR/$tdir/a1/f1
1989 if [ $MDSCOUNT -ge 2 ]; then
1990 do_facet mds2 $LCTL set_param fail_loc=0x1615
1991 chown 1.1 $DIR/$tdir/a2/f2
1994 chown 1.1 $DIR/$tdir/f3
1999 do_facet mds1 $LCTL set_param fail_loc=0
2000 if [ $MDSCOUNT -ge 2 ]; then
2001 do_facet mds2 $LCTL set_param fail_loc=0
2004 cancel_lru_locks mdc
2005 cancel_lru_locks osc
# f2 is compared against saved_size1; f1 and f2 were both written with
# bs=1M count=2.
2007 echo "The file size should be incorrect since layout EA is lost"
2008 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2009 [ "$cur_size" != "$saved_size1" ] ||
2010 error "(1) Expect incorrect file1 size"
2012 if [ $MDSCOUNT -ge 2 ]; then
2013 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2014 [ "$cur_size" != "$saved_size1" ] ||
2015 error "(2) Expect incorrect file2 size"
2018 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2019 [ "$cur_size" != "$saved_size2" ] ||
2020 error "(1.2) Expect incorrect file3 size"
2022 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2023 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
2025 for k in $(seq $MDSCOUNT); do
2026 # The LFSCK status query internal is 30 seconds. For the case
2027 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2028 # time to guarantee the status sync up.
2029 wait_update_facet mds${k} "$LCTL get_param -n \
2030 mdd.$(facet_svc mds${k}).lfsck_layout |
2031 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2032 error "(4) MDS${k} is not the expected 'completed'"
# Every OST must also report completed; this is a single read, not a wait.
2035 for k in $(seq $OSTCOUNT); do
2036 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2037 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2038 awk '/^status/ { print $2 }')
2039 [ "$cur_status" == "completed" ] ||
2040 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2043 local repaired=$(do_facet mds1 $LCTL get_param -n \
2044 mdd.$(facet_svc mds1).lfsck_layout |
2045 awk '/^repaired_orphan/ { print $2 }')
2046 [ $repaired -eq 3 ] ||
2047 error "(6.1) Expect 3 fixed on mds1, but got: $repaired"
2049 if [ $MDSCOUNT -ge 2 ]; then
2050 repaired=$(do_facet mds2 $LCTL get_param -n \
2051 mdd.$(facet_svc mds2).lfsck_layout |
2052 awk '/^repaired_orphan/ { print $2 }')
2053 [ $repaired -eq 2 ] ||
2054 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
# FID and stripe information are printed again after the repair.
2057 $LFS path2fid $DIR/$tdir/a1/f1
2058 $LFS getstripe $DIR/$tdir/a1/f1
2060 if [ $MDSCOUNT -ge 2 ]; then
2061 $LFS path2fid $DIR/$tdir/a2/f2
2062 $LFS getstripe $DIR/$tdir/a2/f2
2065 $LFS path2fid $DIR/$tdir/f3
2066 $LFS getstripe $DIR/$tdir/f3
2068 echo "The file size should be correct after layout LFSCK scanning"
2069 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2070 [ "$cur_size" == "$saved_size1" ] ||
2071 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2073 if [ $MDSCOUNT -ge 2 ]; then
2074 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2075 [ "$cur_size" == "$saved_size1" ] ||
2076 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# NOTE(review): message (9) below says file1 although the check is on f3.
2079 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2080 [ "$cur_size" == "$saved_size2" ] ||
2081 error "(9) Expect file1 size $saved_size2, but got $cur_size"
2083 run_test 18a "Find out orphan OST-object and repair it (1)"
# Test 18b (registered by the run_test call on the last line of this block;
# the enclosing function header is not part of this listing).
# Files are created and then removed while fail_loc 0x1616
# (OBD_FAIL_LFSCK_LOST_MDTOBJ) is set on the MDS. Layout LFSCK is started
# with "-r -o" and must report repaired_orphan == 3 on mds1 (== 2 on mds2
# when MDSCOUNT >= 2). The <fid>-R-0 entries under
# $MOUNT/.lustre/lost+found/MDTxxxx are then moved back to the original
# paths and the file sizes are compared with the values saved up front.
2087 echo "The target MDT-object is lost. The LFSCK should re-create the"
2088 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
2089 echo "can move it back to normal namespace manually."
2092 check_mount_and_prep
2093 $LFS mkdir -i 0 $DIR/$tdir/a1
2094 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2095 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# "ls -il" prints the inode number first, so the size is the 6th column.
2096 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
# The FID is the name prefix of the lost+found entry that is moved back later.
2097 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2099 $LFS getstripe $DIR/$tdir/a1/f1
2101 if [ $MDSCOUNT -ge 2 ]; then
2102 $LFS mkdir -i 1 $DIR/$tdir/a2
2103 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2104 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
# Declared local (like fid1/fid3) so the value does not leak to global scope.
2105 local fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
2107 $LFS getstripe $DIR/$tdir/a2/f2
2110 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2111 error "(0) Fail to create PFL $DIR/$tdir/f3"
2113 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2115 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2116 local fid3=$($LFS path2fid $DIR/$tdir/f3)
2118 $LFS getstripe $DIR/$tdir/f3
2120 cancel_lru_locks osc
2122 echo "Inject failure, to simulate the case of missing the MDT-object"
2123 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2124 do_facet mds1 $LCTL set_param fail_loc=0x1616
2125 rm -f $DIR/$tdir/a1/f1
2127 if [ $MDSCOUNT -ge 2 ]; then
2128 do_facet mds2 $LCTL set_param fail_loc=0x1616
2129 rm -f $DIR/$tdir/a2/f2
# Clear the fault injection again before LFSCK is started.
2137 do_facet mds1 $LCTL set_param fail_loc=0
2138 if [ $MDSCOUNT -ge 2 ]; then
2139 do_facet mds2 $LCTL set_param fail_loc=0
2142 cancel_lru_locks mdc
2143 cancel_lru_locks osc
2145 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2146 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# Poll every MDT until its lfsck_layout status line reads "completed".
2148 for k in $(seq $MDSCOUNT); do
2149 # The LFSCK status query interval is 30 seconds. In case some
2150 # LFSCK_NOTIFY RPCs fail or get lost, wait long enough to
2151 # guarantee that the status is synced up.
2152 wait_update_facet mds${k} "$LCTL get_param -n \
2153 mdd.$(facet_svc mds${k}).lfsck_layout |
2154 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2155 error "(2) MDS${k} is not the expected 'completed'"
# The OSTs are checked once, without polling.
2158 for k in $(seq $OSTCOUNT); do
2159 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2160 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2161 awk '/^status/ { print $2 }')
2162 [ "$cur_status" == "completed" ] ||
2163 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
2166 local repaired=$(do_facet mds1 $LCTL get_param -n \
2167 mdd.$(facet_svc mds1).lfsck_layout |
2168 awk '/^repaired_orphan/ { print $2 }')
2169 [ $repaired -eq 3 ] ||
2170 error "(4.1) Expect 3 fixed on mds1, but got: $repaired"
2172 if [ $MDSCOUNT -ge 2 ]; then
2173 repaired=$(do_facet mds2 $LCTL get_param -n \
2174 mdd.$(facet_svc mds2).lfsck_layout |
2175 awk '/^repaired_orphan/ { print $2 }')
2176 [ $repaired -eq 2 ] ||
2177 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
2180 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2181 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2182 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2184 if [ $MDSCOUNT -ge 2 ]; then
2185 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
2186 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
# Distinct failure label: "(5)" is already used for the f1 move above.
2189 mv $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0 $DIR/$tdir/f3 ||
2190 error "(6.1) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
2192 $LFS path2fid $DIR/$tdir/a1/f1
2193 $LFS getstripe $DIR/$tdir/a1/f1
2195 if [ $MDSCOUNT -ge 2 ]; then
2196 $LFS path2fid $DIR/$tdir/a2/f2
2197 $LFS getstripe $DIR/$tdir/a2/f2
2200 $LFS path2fid $DIR/$tdir/f3
2201 $LFS getstripe $DIR/$tdir/f3
2203 echo "The file size should be correct after layout LFSCK scanning"
2204 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2205 [ "$cur_size" == "$saved_size1" ] ||
2206 error "(7) Expect file1 size $saved_size1, but got $cur_size"
# f2 is compared against saved_size1; no separate size is saved for f2 here.
2208 if [ $MDSCOUNT -ge 2 ]; then
2209 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2210 [ "$cur_size" == "$saved_size1" ] ||
2211 error "(8) Expect file2 size $saved_size1, but got $cur_size"
2214 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2215 [ "$cur_size" == "$saved_size2" ] ||
2216 error "(9) Expect file3 size $saved_size2, but got $cur_size"
2218 run_test 18b "Find out orphan OST-object and repair it (2)"
# Test 18c (registered by the run_test call on the last line of this block;
# the enclosing function header is not part of this listing).
# Files are written while fail_loc 0x1617 (OBD_FAIL_LFSCK_NOPFID) is set on
# all OST nodes, then removed while fail_loc 0x1616
# (OBD_FAIL_LFSCK_LOST_MDTOBJ) is set on the MDS. After layout LFSCK
# ("-r -o") the test checks the repaired_orphan counters, requires "*-N-*"
# entries under lost+found/MDT0000 and rejects them under
# lost+found/MDT0001.
2222 echo "The target MDT-object is lost, and the OST-object FID is missing."
2223 echo "The LFSCK should re-create the MDT-object with new FID under the "
2224 echo "directory .lustre/lost+found/MDTxxxx."
2227 check_mount_and_prep
2228 $LFS mkdir -i 0 $DIR/$tdir/a1
2229 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2231 echo "Inject failure, to simulate the case of missing parent FID"
2232 #define OBD_FAIL_LFSCK_NOPFID 0x1617
2233 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1617
2235 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2236 $LFS getstripe $DIR/$tdir/a1/f1
2238 if [ $MDSCOUNT -ge 2 ]; then
2239 $LFS mkdir -i 1 $DIR/$tdir/a2
2240 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a2
2241 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2242 $LFS getstripe $DIR/$tdir/a2/f2
2245 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2246 error "(0) Fail to create PFL $DIR/$tdir/f3"
2248 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2249 $LFS getstripe $DIR/$tdir/f3
2251 cancel_lru_locks osc
# The OST-side fault injection is cleared once all files have been written.
2252 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
2254 echo "Inject failure, to simulate the case of missing the MDT-object"
2255 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2256 do_facet mds1 $LCTL set_param fail_loc=0x1616
2257 rm -f $DIR/$tdir/a1/f1
2259 if [ $MDSCOUNT -ge 2 ]; then
2260 do_facet mds2 $LCTL set_param fail_loc=0x1616
2261 rm -f $DIR/$tdir/a2/f2
2269 do_facet mds1 $LCTL set_param fail_loc=0
2270 if [ $MDSCOUNT -ge 2 ]; then
2271 do_facet mds2 $LCTL set_param fail_loc=0
2274 cancel_lru_locks mdc
2275 cancel_lru_locks osc
2277 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2278 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# Poll every MDT until its lfsck_layout status line reads "completed".
2280 for k in $(seq $MDSCOUNT); do
2281 # The LFSCK status query interval is 30 seconds. In case some
2282 # LFSCK_NOTIFY RPCs fail or get lost, wait long enough to
2283 # guarantee that the status is synced up.
2284 wait_update_facet mds${k} "$LCTL get_param -n \
2285 mdd.$(facet_svc mds${k}).lfsck_layout |
2286 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2287 error "(2) MDS${k} is not the expected 'completed'"
2290 for k in $(seq $OSTCOUNT); do
2291 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2292 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2293 awk '/^status/ { print $2 }')
2294 [ "$cur_status" == "completed" ] ||
2295 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# NOTE(review): $expected is assigned by lines that are not part of this
# listing (they follow this MDSCOUNT check) - confirm its values there.
2298 if [ $MDSCOUNT -ge 2 ]; then
2304 local repaired=$(do_facet mds1 $LCTL get_param -n \
2305 mdd.$(facet_svc mds1).lfsck_layout |
2306 awk '/^repaired_orphan/ { print $2 }')
2307 [ $repaired -eq $expected ] ||
2308 error "(4) Expect $expected fixed on mds1, but got: $repaired"
2310 if [ $MDSCOUNT -ge 2 ]; then
2311 repaired=$(do_facet mds2 $LCTL get_param -n \
2312 mdd.$(facet_svc mds2).lfsck_layout |
2313 awk '/^repaired_orphan/ { print $2 }')
2314 [ $repaired -eq 0 ] ||
2315 error "(5) Expect 0 fixed on mds2, but got: $repaired"
2318 ls -ail $MOUNT/.lustre/lost+found/
2320 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
2321 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
# The pattern is quoted so that find receives it verbatim; unquoted, the
# shell would expand it against the current directory first. cname is
# declared local so it does not leak to global scope.
2322 local cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-N-*')
2324 error "(6) .lustre/lost+found/MDT0001/ should be empty"
2327 echo "There should be some stub under .lustre/lost+found/MDT0000/"
2328 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2329 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# "local" is repeated because the assignment above only runs when the
# MDT0001 directory exists.
2331 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-N-*')
2332 [ ! -z "$cname" ] ||
2333 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2335 run_test 18c "Find out orphan OST-object and repair it (3)"
# Test 18d (registered by the run_test call on the last line of this block;
# the enclosing function header is not part of this listing).
# Four files are created under a1 (f4 with a PFL layout). With fail_loc
# 0x1618 (OBD_FAIL_LFSCK_CHANGE_STRIPE) set on $SINGLEMDS, f2 and f4 are
# chown'ed and f1 and f3 are removed. Layout LFSCK is then run with
# "-r -o -c -d" and must report repaired_orphan == 2 and
# repaired_dangling == 0. The sizes of f2 and f4 must equal the values saved
# before the injection.
2339 echo "The target MDT-object layout EA is corrupted, but the right"
2340 echo "OST-object is still alive as orphan. The layout LFSCK will"
2341 echo "not create new OST-object to occupy such slot."
2344 check_mount_and_prep
2346 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2347 echo "guard" > $DIR/$tdir/a1/f1
2348 echo "foo" > $DIR/$tdir/a1/f2
2350 echo "guard" > $DIR/$tdir/a1/f3
2351 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2352 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2353 echo "foo" > $DIR/$tdir/a1/f4
# "ls -il" prints the inode number first, so the size is the 6th column.
2355 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2356 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
# Print FID and layout of every file (output only, nothing is checked here).
2357 $LFS path2fid $DIR/$tdir/a1/f1
2358 $LFS getstripe $DIR/$tdir/a1/f1
2359 $LFS path2fid $DIR/$tdir/a1/f2
2360 $LFS getstripe $DIR/$tdir/a1/f2
2361 $LFS path2fid $DIR/$tdir/a1/f3
2362 $LFS getstripe $DIR/$tdir/a1/f3
2363 $LFS path2fid $DIR/$tdir/a1/f4
2364 $LFS getstripe $DIR/$tdir/a1/f4
2365 cancel_lru_locks osc
2367 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2368 echo "to reference the same OST-object (which is f1's OST-obejct)."
2369 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2370 echo "dangling reference case, but f2's old OST-object is there."
2372 echo "The failure also makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2373 echo "to reference the same OST-object (which is f3's OST-obejct)."
2374 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2375 echo "dangling reference case, but f4's old OST-object is there."
2378 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2379 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2380 chown 1.1 $DIR/$tdir/a1/f2
2381 chown 1.1 $DIR/$tdir/a1/f4
2382 rm -f $DIR/$tdir/a1/f1
2383 rm -f $DIR/$tdir/a1/f3
2386 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# NOTE(review): only setupall is visible here; the matching stop step
# announced by the echo is not part of this listing.
2388 echo "stopall to cleanup object cache"
2391 setupall > /dev/null
2393 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2394 $START_LAYOUT -r -o -c -d || error "(2) Fail to start LFSCK for layout!"
# Poll every MDT until its lfsck_layout status line reads "completed".
2396 for k in $(seq $MDSCOUNT); do
2397 # The LFSCK status query interval is 30 seconds. In case some
2398 # LFSCK_NOTIFY RPCs fail or get lost, wait long enough to
2399 # guarantee that the status is synced up.
2400 wait_update_facet mds${k} "$LCTL get_param -n \
2401 mdd.$(facet_svc mds${k}).lfsck_layout |
2402 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2403 error "(3) MDS${k} is not the expected 'completed'"
# The OSTs are checked once, without polling.
2406 for k in $(seq $OSTCOUNT); do
2407 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2408 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2409 awk '/^status/ { print $2 }')
2410 [ "$cur_status" == "completed" ] ||
2411 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
2414 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2415 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2416 awk '/^repaired_orphan/ { print $2 }')
2417 [ $repaired -eq 2 ] ||
2418 error "(5) Expect 2 orphans have been fixed, but got: $repaired"
# The same variable is reused for the repaired_dangling counter.
2420 repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2421 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2422 awk '/^repaired_dangling/ { print $2 }')
2423 [ $repaired -eq 0 ] ||
2424 error "(6) Expect 0 dangling has been fixed, but got: $repaired"
2426 echo "The file size should be correct after layout LFSCK scanning"
2427 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2428 [ "$cur_size" == "$saved_size1" ] ||
2429 error "(7) Expect file2 size $saved_size1, but got $cur_size"
2431 cur_size=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2432 [ "$cur_size" == "$saved_size2" ] ||
2433 error "(8) Expect file4 size $saved_size2, but got $cur_size"
# The file contents are only printed; they are not compared with anything.
2435 echo "The LFSCK should find back the original data."
2436 cat $DIR/$tdir/a1/f2
2437 $LFS path2fid $DIR/$tdir/a1/f2
2438 $LFS getstripe $DIR/$tdir/a1/f2
2439 cat $DIR/$tdir/a1/f4
2440 $LFS path2fid $DIR/$tdir/a1/f4
2441 $LFS getstripe $DIR/$tdir/a1/f4
2443 run_test 18d "Find out orphan OST-object and repair it (4)"
# Test 18e (registered by the run_test call on the last line of this block;
# the enclosing function header is not part of this listing).
# Same setup as the 0x1618 (OBD_FAIL_LFSCK_CHANGE_STRIPE) injection in the
# previous test, but layout LFSCK ("-r -o -c") is started with fail_loc
# 0x1602 (OBD_FAIL_LFSCK_DELAY3, fail_val=10) set. Once mds1 reports
# "scanning-phase2", new data is appended to f2 and f4. After completion the
# test requires repaired_orphan == 2 and two "*-C-*" entries under
# lost+found/MDT0000 whose sizes match one of the saved sizes.
2447 echo "The target MDT-object layout EA slot is occpuied by some new"
2448 echo "created OST-object when repair dangling reference case. Such"
2449 echo "conflict OST-object has been modified by others. To keep the"
2450 echo "new data, the LFSCK will create a new file to refernece this"
2451 echo "old orphan OST-object."
2454 check_mount_and_prep
2456 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2457 echo "guard" > $DIR/$tdir/a1/f1
2458 echo "foo" > $DIR/$tdir/a1/f2
2460 echo "guard" > $DIR/$tdir/a1/f3
2461 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2462 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2463 echo "foo" > $DIR/$tdir/a1/f4
# "ls -il" prints the inode number first, so the size is the 6th column.
2465 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2466 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2468 $LFS path2fid $DIR/$tdir/a1/f1
2469 $LFS getstripe $DIR/$tdir/a1/f1
2470 $LFS path2fid $DIR/$tdir/a1/f2
2471 $LFS getstripe $DIR/$tdir/a1/f2
2472 $LFS path2fid $DIR/$tdir/a1/f3
2473 $LFS getstripe $DIR/$tdir/a1/f3
2474 $LFS path2fid $DIR/$tdir/a1/f4
2475 $LFS getstripe $DIR/$tdir/a1/f4
2476 cancel_lru_locks osc
2478 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2479 echo "to reference the same OST-object (which is f1's OST-obejct)."
2480 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2481 echo "dangling reference case, but f2's old OST-object is there."
2483 echo "Also the failure makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2484 echo "to reference the same OST-object (which is f3's OST-obejct)."
2485 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2486 echo "dangling reference case, but f4's old OST-object is there."
2489 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2490 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2491 chown 1.1 $DIR/$tdir/a1/f2
2492 chown 1.1 $DIR/$tdir/a1/f4
2493 rm -f $DIR/$tdir/a1/f1
2494 rm -f $DIR/$tdir/a1/f3
2497 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2499 echo "stopall to cleanup object cache"
2502 setupall > /dev/null
2504 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2505 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
2507 start_full_debug_logging
2509 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2510 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
# Wait for the second scanning phase before modifying f2/f4 below.
2512 wait_update_facet mds1 "$LCTL get_param -n \
2513 mdd.$(facet_svc mds1).lfsck_layout |
2514 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2515 error "(3) MDS1 is not the expected 'scanning-phase2'"
2517 # to guarantee all updates are synced.
2521 echo "Write new data to f2/f4 to modify the new created OST-object."
2522 echo "dummy" >> $DIR/$tdir/a1/f2 || error "write a1/f2 failed"
2523 echo "dummy" >> $DIR/$tdir/a1/f4 || error "write a1/f4 failed"
# Clear the delay injection so the scan can finish.
2525 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
2527 for k in $(seq $MDSCOUNT); do
2528 # The LFSCK status query interval is 30 seconds. In case some
2529 # LFSCK_NOTIFY RPCs fail or get lost, wait long enough to
2530 # guarantee that the status is synced up.
2531 wait_update_facet mds${k} "$LCTL get_param -n \
2532 mdd.$(facet_svc mds${k}).lfsck_layout |
2533 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2534 error "(4) MDS${k} is not the expected 'completed'"
2537 for k in $(seq $OSTCOUNT); do
2538 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2539 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2540 awk '/^status/ { print $2 }')
2541 [ "$cur_status" == "completed" ] ||
2542 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2545 stop_full_debug_logging
2547 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2548 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2549 awk '/^repaired_orphan/ { print $2 }')
2550 [ $repaired -eq 2 ] ||
2551 error "(6) Expect 2 orphans have been fixed, but got: $repaired"
2553 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2554 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2555 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# Here the glob is meant to be expanded by the shell (arguments to ls).
2557 local count=$(ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-* | wc -l)
2558 if [ $count -ne 2 ]; then
2559 ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-*
2560 error "(8) Expect 2 stubs under lost+found, but got $count"
2563 echo "The stub file should keep the original f2 or f4 data"
# The find pattern is quoted so that find receives it verbatim; unquoted,
# the shell would expand it against the current directory first. cname is
# declared local so it does not leak to global scope.
2564 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-C-*' | head -n 1)
2565 local cur_size=$(ls -il $cname | awk '{ print $6 }')
2566 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2567 error "(9) Got unexpected $cur_size"
2570 $LFS path2fid $cname
2571 $LFS getstripe $cname
# Second stub: the last entry in find's output.
2573 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-C-*' | tail -n 1)
2574 cur_size=$(ls -il $cname | awk '{ print $6 }')
2575 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2576 error "(10) Got unexpected $cur_size"
2579 $LFS path2fid $cname
2580 $LFS getstripe $cname
# The file contents are only printed; they are not compared with anything.
2582 echo "The f2/f4 should contains new data."
2583 cat $DIR/$tdir/a1/f2
2584 $LFS path2fid $DIR/$tdir/a1/f2
2585 $LFS getstripe $DIR/$tdir/a1/f2
2586 cat $DIR/$tdir/a1/f4
2587 $LFS path2fid $DIR/$tdir/a1/f4
2588 $LFS getstripe $DIR/$tdir/a1/f4
2590 run_test 18e "Find out orphan OST-object and repair it (5)"
# Test 18f (registered by the run_test call on the last line of this block;
# the enclosing function header is not part of this listing). Skipped when
# fewer than 2 OSTs are configured.
# Files are removed while fail_loc 0x1616 (OBD_FAIL_LFSCK_LOST_MDTOBJ) is
# set on the MDS. A first layout LFSCK run is made with fail_loc 0x161c
# (OBD_FAIL_LFSCK_BAD_NETWORK, fail_val=0) on mds1: the MDTs and ost1 must
# end as "partial", ost2 as "completed", and repaired_orphan must be 1 per
# MDT. A second run without the injection must end as "completed"
# everywhere with repaired_orphan == 2 per MDT.
2593 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
2596 echo "The target MDT-object is lost. The LFSCK should re-create the"
2597 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2598 echo "to verify some OST-object(s) during the first stage-scanning,"
2599 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2600 echo "should not be affected."
2603 check_mount_and_prep
2604 $LFS mkdir -i 0 $DIR/$tdir/a1
2605 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2606 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2607 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2608 $LFS mkdir -i 0 $DIR/$tdir/a2
2609 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
2610 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2611 $LFS getstripe $DIR/$tdir/a1/f1
2612 $LFS getstripe $DIR/$tdir/a2/f2
# Same file set again in directories created with "-i 1" when there are at
# least two MDTs.
2614 if [ $MDSCOUNT -ge 2 ]; then
2615 $LFS mkdir -i 1 $DIR/$tdir/a3
2616 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a3
2617 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2618 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2619 $LFS mkdir -i 1 $DIR/$tdir/a4
2620 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
2621 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2622 $LFS getstripe $DIR/$tdir/a3/f3
2623 $LFS getstripe $DIR/$tdir/a4/f4
2626 cancel_lru_locks osc
2628 echo "Inject failure, to simulate the case of missing the MDT-object"
2629 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2630 do_facet mds1 $LCTL set_param fail_loc=0x1616
2631 rm -f $DIR/$tdir/a1/f1
2632 rm -f $DIR/$tdir/a2/f2
2634 if [ $MDSCOUNT -ge 2 ]; then
2635 do_facet mds2 $LCTL set_param fail_loc=0x1616
2636 rm -f $DIR/$tdir/a3/f3
2637 rm -f $DIR/$tdir/a4/f4
2643 do_facet mds1 $LCTL set_param fail_loc=0
2644 if [ $MDSCOUNT -ge 2 ]; then
2645 do_facet mds2 $LCTL set_param fail_loc=0
2648 cancel_lru_locks mdc
2649 cancel_lru_locks osc
2651 echo "Inject failure, to simulate the OST0 fail to handle"
2652 echo "MDT0 LFSCK request during the first-stage scanning."
2653 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
2654 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
2656 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2657 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# First run: every MDT is expected to finish as "partial".
2659 for k in $(seq $MDSCOUNT); do
2660 # The LFSCK status query interval is 30 seconds. In case some
2661 # LFSCK_NOTIFY RPCs fail or get lost, wait long enough to
2662 # guarantee that the status is synced up.
2663 wait_update_facet mds${k} "$LCTL get_param -n \
2664 mdd.$(facet_svc mds${k}).lfsck_layout |
2665 awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
2666 error "(2) MDS${k} is not the expected 'partial'"
2669 wait_update_facet ost1 "$LCTL get_param -n \
2670 obdfilter.$(facet_svc ost1).lfsck_layout |
2671 awk '/^status/ { print \\\$2 }'" "partial" $LTIME || {
2672 error "(3) OST1 is not the expected 'partial'"
2675 wait_update_facet ost2 "$LCTL get_param -n \
2676 obdfilter.$(facet_svc ost2).lfsck_layout |
2677 awk '/^status/ { print \\\$2 }'" "completed" $LTIME || {
2678 error "(4) OST2 is not the expected 'completed'"
2681 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
2683 local repaired=$(do_facet mds1 $LCTL get_param -n \
2684 mdd.$(facet_svc mds1).lfsck_layout |
2685 awk '/^repaired_orphan/ { print $2 }')
2686 [ $repaired -eq 1 ] ||
2687 error "(5) Expect 1 fixed on mds1, but got: $repaired"
2689 if [ $MDSCOUNT -ge 2 ]; then
2690 repaired=$(do_facet mds2 $LCTL get_param -n \
2691 mdd.$(facet_svc mds2).lfsck_layout |
2692 awk '/^repaired_orphan/ { print $2 }')
2693 [ $repaired -eq 1 ] ||
2694 error "(6) Expect 1 fixed on mds2, but got: $repaired"
2697 echo "Trigger layout LFSCK on all devices again to cleanup"
2698 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
# Second run, without fault injection: everything must end as "completed".
2700 for k in $(seq $MDSCOUNT); do
2701 # The LFSCK status query interval is 30 seconds. In case some
2702 # LFSCK_NOTIFY RPCs fail or get lost, wait long enough to
2703 # guarantee that the status is synced up.
2704 wait_update_facet mds${k} "$LCTL get_param -n \
2705 mdd.$(facet_svc mds${k}).lfsck_layout |
2706 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2707 error "(8) MDS${k} is not the expected 'completed'"
2710 for k in $(seq $OSTCOUNT); do
# Declared local so the status does not leak to global scope.
2711 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2712 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2713 awk '/^status/ { print $2 }')
2714 [ "$cur_status" == "completed" ] ||
2715 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
2719 local repaired=$(do_facet mds1 $LCTL get_param -n \
2720 mdd.$(facet_svc mds1).lfsck_layout |
2721 awk '/^repaired_orphan/ { print $2 }')
2722 [ $repaired -eq 2 ] ||
2723 error "(10) Expect 2 fixed on mds1, but got: $repaired"
2725 if [ $MDSCOUNT -ge 2 ]; then
2726 repaired=$(do_facet mds2 $LCTL get_param -n \
2727 mdd.$(facet_svc mds2).lfsck_layout |
2728 awk '/^repaired_orphan/ { print $2 }')
2729 [ $repaired -eq 2 ] ||
2730 error "(11) Expect 2 fixed on mds2, but got: $repaired"
2733 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# Test 18g (registered by the run_test call on the last line of this block;
# the enclosing function header is not part of this listing).
# a1/f1 is created with "-c -1" and written with count=$OSTCOUNT 1M blocks,
# then removed while fail_loc 0x162e (OBD_FAIL_LFSCK_LOST_MDTOBJ2) is set
# on mds1. After layout LFSCK ("-r -o") the test requires
# repaired_orphan == $OSTCOUNT on mds1 and moves the <fid>-R-0 entry from
# lost+found/MDT0000 back to a1/f1.
2737 echo "The target MDT-object is lost, but related OI mapping is there"
2738 echo "The LFSCK should recreate the lost MDT-object without affected"
2739 echo "by the stale OI mapping."
2742 check_mount_and_prep
2743 $LFS mkdir -i 0 $DIR/$tdir/a1
2744 $LFS setstripe -c -1 -i 0 -S 1M $DIR/$tdir/a1
2745 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=$OSTCOUNT
# The FID is the name prefix of the lost+found entry that is moved back later.
2746 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2748 $LFS getstripe $DIR/$tdir/a1/f1
2749 cancel_lru_locks osc
2751 echo "Inject failure to simulate lost MDT-object but keep OI mapping"
2752 #define OBD_FAIL_LFSCK_LOST_MDTOBJ2 0x162e
2753 do_facet mds1 $LCTL set_param fail_loc=0x162e
2754 rm -f $DIR/$tdir/a1/f1
2756 do_facet mds1 $LCTL set_param fail_loc=0
2757 cancel_lru_locks mdc
2758 cancel_lru_locks osc
2760 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2761 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# Poll every MDT until its lfsck_layout status line reads "completed".
2763 for k in $(seq $MDSCOUNT); do
2764 # The LFSCK status query interval is 30 seconds. In case some
2765 # LFSCK_NOTIFY RPCs fail or get lost, wait long enough to
2766 # guarantee that the status is synced up.
2767 wait_update_facet mds${k} "$LCTL get_param -n \
2768 mdd.$(facet_svc mds${k}).lfsck_layout |
2769 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2770 error "(2) MDS${k} is not the expected 'completed'"
# The OSTs are checked once, without polling.
2773 for k in $(seq $OSTCOUNT); do
2774 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2775 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2776 awk '/^status/ { print $2 }')
2777 [ "$cur_status" == "completed" ] ||
2778 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
2781 local repaired=$(do_facet mds1 $LCTL get_param -n \
2782 mdd.$(facet_svc mds1).lfsck_layout |
2783 awk '/^repaired_orphan/ { print $2 }')
2784 [ $repaired -eq $OSTCOUNT ] ||
2785 error "(4) Expect $OSTCOUNT fixed, but got: $repaired"
2787 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2788 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2789 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2791 $LFS path2fid $DIR/$tdir/a1/f1
2792 $LFS getstripe $DIR/$tdir/a1/f1
2794 run_test 18g "Find out orphan OST-object and repair it (7)"
# Test 18h (registered by the run_test call on the last line of this block;
# the enclosing function header is not part of this listing).
# A PFL file f0 is written and copied to "guard". With fail_loc 0x162f
# (OBD_FAIL_LFSCK_BAD_PFL_RANGE) set on $SINGLEMDS the file is chown'ed,
# after which a write to f0 is expected to fail. Layout LFSCK ("-r -o")
# must then report repaired_orphan == 2, f0 must still be identical to
# guard, and writing to f0 must succeed again.
2798 echo "The PFL extent crashed. During the first cycle LFSCK scanning,"
2799 echo "the layout LFSCK will keep the bad PFL file(s) there without"
2800 echo "scanning its OST-object(s). Then in the second stage scanning,"
2801 echo "the OST will return related OST-object(s) to the MDT as orphan."
2802 echo "And then the LFSCK on the MDT can rebuild the PFL extent with"
2803 echo "the 'orphan(s)' stripe information."
2806 check_mount_and_prep
2808 $LFS setstripe -E 2M -c 1 -E -1 $DIR/$tdir/f0 ||
2809 error "(0) Fail to create PFL $DIR/$tdir/f0"
2811 cat $LUSTRE/tests/test-framework.sh > $DIR/$tdir/f0 ||
2812 error "(1.1) Fail to write $DIR/$tdir/f0"
# Second copy of the same data, written at a 2M offset (bs=1M seek=2).
2814 dd if=$LUSTRE/tests/test-framework.sh of=$DIR/$tdir/f0 bs=1M seek=2 ||
2815 error "(1.2) Fail to write $DIR/$tdir/f0"
# Reference copy used by the diff after the repair.
2817 cp $DIR/$tdir/f0 $DIR/$tdir/guard
2819 echo "Inject failure stub to simulate bad PFL extent range"
2820 #define OBD_FAIL_LFSCK_BAD_PFL_RANGE 0x162f
2821 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162f
2823 chown 1.1 $DIR/$tdir/f0
2825 cancel_lru_locks mdc
2826 cancel_lru_locks osc
2827 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Inverted check: the test fails if this write succeeds.
2829 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 &&
2830 error "(2) Write to bad PFL file should fail"
2832 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
2833 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
# Poll every MDT until its lfsck_layout status line reads "completed".
2835 for k in $(seq $MDSCOUNT); do
2836 # The LFSCK status query interval is 30 seconds. In case some
2837 # LFSCK_NOTIFY RPCs fail or get lost, wait long enough to
2838 # guarantee that the status is synced up.
2839 wait_update_facet mds${k} "$LCTL get_param -n \
2840 mdd.$(facet_svc mds${k}).lfsck_layout |
2841 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2842 error "(4.1) MDS${k} is not the expected 'completed'"
2845 for k in $(seq $OSTCOUNT); do
# Declared local so the status does not leak to global scope.
2846 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2847 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2848 awk '/^status/ { print $2 }')
2849 [ "$cur_status" == "completed" ] ||
2850 error "(4.2) OST${k} Expect 'completed', but got '$cur_status'"
# NOTE(review): $SHOW_LAYOUT is defined outside this listing - presumably it
# prints the lfsck_layout parameter of the MDT; confirm.
2854 local repaired=$($SHOW_LAYOUT |
2855 awk '/^repaired_orphan/ { print $2 }')
2856 [ $repaired -eq 2 ] ||
2857 error "(5) Fail to repair crashed PFL range: $repaired"
2859 echo "Data in $DIR/$tdir/f0 should not be broken"
2860 diff $DIR/$tdir/f0 $DIR/$tdir/guard ||
2861 error "(6) Data in $DIR/$tdir/f0 is broken"
2863 echo "Write should succeed after LFSCK repairing the bad PFL range"
2864 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 ||
2865 error "(7) Write should succeed after LFSCK"
2867 run_test 18h "LFSCK can repair crashed PFL extent range"
# Remove the "cache" flag from the local debug mask (the leading "-" drops a
# flag instead of replacing the mask); the command's output is discarded.
2869 $LCTL set_param debug=-cache > /dev/null
# Test 19a (registered by the run_test call on the last line of this block;
# the enclosing function header is not part of this listing).
# Files are written with lfsck_verify_pfid set to 0 on OST0000, then the
# parameter is set to 1 and fail_loc 0x1619 (OBD_FAIL_LFSCK_INVALID_PFID) is
# set on the local node. Reading a0 and a1 must then fail.
2872 check_mount_and_prep
2873 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2875 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2876 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
2878 echo "foo1" > $DIR/$tdir/a0
2879 $LFS setstripe -E 512K -S 512K -o 0 -E -1 -S 1M $DIR/$tdir/a1 ||
2880 error "(0) Fail to create PFL $DIR/$tdir/a1"
2881 echo "foo2" > $DIR/$tdir/a1
2882 echo "guard" > $DIR/$tdir/a2
2883 cancel_lru_locks osc
2885 echo "Inject failure, then client will offer wrong parent FID when read"
2886 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2887 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
# Unlike the other injections in this file, this one is set with a plain
# $LCTL call, i.e. on the node running the script.
2889 #define OBD_FAIL_LFSCK_INVALID_PFID 0x1619
2890 $LCTL set_param fail_loc=0x1619
2892 echo "Read RPC with wrong parent FID should be denied"
# Inverted checks: the test fails if either read succeeds.
2893 cat $DIR/$tdir/a0 && error "(3.1) Read a0 should be denied!"
2894 cat $DIR/$tdir/a1 && error "(3.2) Read a1 should be denied!"
2895 $LCTL set_param fail_loc=0
2897 run_test 19a "OST-object inconsistency self detect"
# Test 19b (registered by the run_test call on the last line of this block;
# the enclosing function header is not part of this listing).
# f0 and the PFL file f1 are written while fail_loc 0x1611
# (OBD_FAIL_LFSCK_UNMATCHED_PAIR1) is set on all OST nodes and
# lfsck_verify_pfid is 0. The "repaired" counter read from lfsck_verify_pfid
# on ost1 must be 0 at that point. After lfsck_verify_pfid is set to 1 and
# both files are read, the counter must be 2 (one per file).
2900 check_mount_and_prep
2901 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2903 echo "Inject failure stub to make the OST-object to back point to"
2904 echo "non-exist MDT-object"
2906 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2907 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
2909 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
2910 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
2911 echo "foo1" > $DIR/$tdir/f0
2912 $LFS setstripe -E 1M -S 1M -o 0 -E 4M -S 256K $DIR/$tdir/f1 ||
2913 error "(0) Fail to create PFL $DIR/$tdir/f1"
2914 echo "foo2" > $DIR/$tdir/f1
2915 cancel_lru_locks osc
2916 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
2918 do_facet ost1 $LCTL set_param -n \
2919 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
2920 echo "Nothing should be fixed since self detect and repair is disabled"
2921 local repaired=$(do_facet ost1 $LCTL get_param -n \
2922 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2923 awk '/^repaired/ { print $2 }')
2924 [ $repaired -eq 0 ] ||
2925 error "(1) Expected 0 repaired, but got $repaired"
2927 echo "Read RPC with right parent FID should be accepted,"
2928 echo "and cause parent FID on OST to be fixed"
2930 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2931 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2933 cat $DIR/$tdir/f0 || error "(2.1) Read f0 should not be denied!"
2934 cat $DIR/$tdir/f1 || error "(2.2) Read f1 should not be denied!"
2936 repaired=$(do_facet ost1 $LCTL get_param -n \
2937 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2938 awk '/^repaired/ { print $2 }')
# The failure message states the same count that the check enforces.
2939 [ $repaired -eq 2 ] ||
2940 error "(3) Expected 2 repaired, but got $repaired"
2942 run_test 19b "OST-object inconsistency self repair"
# Pattern strings for layouts with / without a LOV EA hole.
# NOTE(review): the consumers are outside this listing; presumably these are
# compared against the pattern printed by "lfs getstripe" - confirm.
2944 PATTERN_WITH_HOLE="40000001"
2945 PATTERN_WITHOUT_HOLE="1"
# test_20a: orphan handling with a dummy LOV EA slot.
# Files f0..f2 (and f3 when OSTCOUNT > 2) are written under a1, then removed
# while fail_loc 0x1616 / 0x161a (with fail_val selecting the stripe) is set
# on mds1.  Layout LFSCK (-r -o) is then run and each file is expected to
# show up as .lustre/lost+found/MDT0000/<fid>-R-0, with a hole pattern when
# one of its stripes was lost.
2948 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
2951 echo "The target MDT-object and some of its OST-object are lost."
2952 echo "The LFSCK should find out the left OST-objects and re-create"
2953 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
2954 echo "with the partial OST-objects (LOV EA hole)."
2956 echo "New client can access the file with LOV EA hole via normal"
2957 echo "system tools or commands without crash the system."
2959 echo "For old client, even though it cannot access the file with"
2960 echo "LOV EA hole, it should not cause the system crash."
2963 check_mount_and_prep
2964 $LFS mkdir -i 0 $DIR/$tdir/a1
2965 if [ $OSTCOUNT -gt 2 ]; then
2966 $LFS setstripe -c 3 -i 0 -S 1M $DIR/$tdir/a1
2969 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a1
# NOTE(review): bcount is not assigned on any line visible here; the notes
# below describe the intended block distribution per stripe - confirm.
2973 # 256 blocks on the stripe0.
2974 # 1 block on the stripe1 for 2 OSTs case.
2975 # 256 blocks on the stripe1 for other cases.
2976 # 1 block on the stripe2 if OSTs > 2
2977 dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
2978 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
2979 dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount
# Remember the FIDs: the recovered orphans are looked up by "<fid>-R-0".
2981 local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
2982 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2983 local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)
2986 $LFS getstripe $DIR/$tdir/a1/f0
2988 $LFS getstripe $DIR/$tdir/a1/f1
2990 $LFS getstripe $DIR/$tdir/a1/f2
2992 if [ $OSTCOUNT -gt 2 ]; then
2993 dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
2994 fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
2996 $LFS getstripe $DIR/$tdir/a1/f3
2999 cancel_lru_locks osc
3001 echo "Inject failure..."
3002 echo "To simulate f0 lost MDT-object"
3003 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
3004 do_facet mds1 $LCTL set_param fail_loc=0x1616
3005 rm -f $DIR/$tdir/a1/f0
3007 echo "To simulate f1 lost MDT-object and OST-object0"
3008 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
3009 do_facet mds1 $LCTL set_param fail_loc=0x161a
3010 rm -f $DIR/$tdir/a1/f1
# fail_loc stays 0x161a; only fail_val changes for the following removals.
3012 echo "To simulate f2 lost MDT-object and OST-object1"
3013 do_facet mds1 $LCTL set_param fail_val=1
3014 rm -f $DIR/$tdir/a1/f2
3016 if [ $OSTCOUNT -gt 2 ]; then
3017 echo "To simulate f3 lost MDT-object and OST-object2"
3018 do_facet mds1 $LCTL set_param fail_val=2
3019 rm -f $DIR/$tdir/a1/f3
# The client is unmounted here and mounted again after LFSCK has finished.
3022 umount_client $MOUNT
3025 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
3027 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3028 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
3030 for k in $(seq $MDSCOUNT); do
3031 # The LFSCK status query internal is 30 seconds. For the case
3032 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3033 # time to guarantee the status sync up.
3034 wait_update_facet mds${k} "$LCTL get_param -n \
3035 mdd.$(facet_svc mds${k}).lfsck_layout |
3036 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3037 error "(2) MDS${k} is not the expected 'completed'"
# The OSTs are checked once, without waiting.
3040 for k in $(seq $OSTCOUNT); do
3041 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3042 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3043 awk '/^status/ { print $2 }')
3044 [ "$cur_status" == "completed" ] ||
3045 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan on mds1: 9 with more than 2 OSTs, otherwise 4.
3048 local repaired=$(do_facet mds1 $LCTL get_param -n \
3049 mdd.$(facet_svc mds1).lfsck_layout |
3050 awk '/^repaired_orphan/ { print $2 }')
3051 if [ $OSTCOUNT -gt 2 ]; then
3052 [ $repaired -eq 9 ] ||
3053 error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
3055 [ $repaired -eq 4 ] ||
3056 error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
3059 mount_client $MOUNT || error "(5.0) Fail to start client!"
# NOTE(review): LOV_PATTERN_F_HOLE is assigned but not referenced in the
# visible part of this test - confirm whether it is still needed.
3061 LOV_PATTERN_F_HOLE=0x40000000
3064 # ${fid0}-R-0 is the old f0
3066 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
3067 echo "Check $name, which is the old f0"
3069 $LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"
3071 local pattern=$($LFS getstripe -L $name)
3072 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3073 error "(5.2) NOT expect pattern flag hole, but got $pattern"
3075 local stripes=$($LFS getstripe -c $name)
3076 if [ $OSTCOUNT -gt 2 ]; then
3077 [ $stripes -eq 3 ] ||
3078 error "(5.3.1) expect the stripe count is 3, but got $stripes"
3080 [ $stripes -eq 2 ] ||
3081 error "(5.3.2) expect the stripe count is 2, but got $stripes"
3084 local size=$(stat $name | awk '/Size:/ { print $2 }')
3085 [ $size -eq $((4096 * $bcount)) ] ||
3086 error "(5.4) expect the size $((4096 * $bcount)), but got $size"
# f0 lost no stripe, so read, append, chown, touch and unlink must all work.
3088 cat $name > /dev/null || error "(5.5) cannot read $name"
3090 echo "dummy" >> $name || error "(5.6) cannot write $name"
3092 chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"
3094 touch $name || error "(5.8) cannot touch $name"
3096 rm -f $name || error "(5.9) cannot unlink $name"
3099 # ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
3101 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3102 if [ $OSTCOUNT -gt 2 ]; then
3103 echo "Check $name, it contains the old f1's stripe1 and stripe2"
3105 echo "Check $name, it contains the old f1's stripe1"
3108 $LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"
3110 pattern=$($LFS getstripe -L $name)
3111 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3112 error "(6.2) expect pattern flag hole, but got $pattern"
3114 stripes=$($LFS getstripe -c $name)
3115 if [ $OSTCOUNT -gt 2 ]; then
3116 [ $stripes -eq 3 ] ||
3117 error "(6.3.1) expect the stripe count is 3, but got $stripes"
3119 [ $stripes -eq 2 ] ||
3120 error "(6.3.2) expect the stripe count is 2, but got $stripes"
3123 size=$(stat $name | awk '/Size:/ { print $2 }')
3124 [ $size -eq $((4096 * $bcount)) ] ||
3125 error "(6.4) expect the size $((4096 * $bcount)), but got $size"
3127 cat $name > /dev/null && error "(6.5) normal read $name should fail"
# dd continues past read errors (noerror) and pads every block (sync); the
# "Input/output error" lines in its output are counted as failed reads.
3129 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3130 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3133 [ $failures -eq 256 ] ||
3134 error "(6.6) expect 256 IO failures, but get $failures"
3136 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3137 [ $size -eq $((4096 * $bcount)) ] ||
3138 error "(6.7) expect the size $((4096 * $bcount)), but got $size"
# Block 0 is expected to be refused as part of the hole, block 300 accepted.
3140 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
3141 error "(6.8) write to the LOV EA hole should fail"
3143 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
3144 error "(6.9) write to normal stripe should NOT fail"
3146 echo "foo" >> $name && error "(6.10) append write $name should fail"
3148 chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"
3150 touch $name || error "(6.12) cannot touch $name"
3152 rm -f $name || error "(6.13) cannot unlink $name"
3155 # ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
3157 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
3158 if [ $OSTCOUNT -gt 2 ]; then
3159 echo "Check $name, it contains the old f2's stripe0 and stripe2"
3161 echo "Check $name, it contains the old f2's stripe0"
3164 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
3166 pattern=$($LFS getstripe -L $name)
3167 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3168 error "(7.2) expect pattern flag hole, but got $pattern"
3170 stripes=$($LFS getstripe -c $name)
3171 size=$(stat $name | awk '/Size:/ { print $2 }')
# The first group of checks (7.x.1 tags) is for more than 2 OSTs; the group
# starting at (7.3.2) is for exactly 2 OSTs.
3172 if [ $OSTCOUNT -gt 2 ]; then
3173 [ $stripes -eq 3 ] ||
3174 error "(7.3.1) expect the stripe count is 3, but got $stripes"
3176 [ $size -eq $((4096 * $bcount)) ] ||
3177 error "(7.4.1) expect size $((4096 * $bcount)), but got $size"
3179 cat $name > /dev/null &&
3180 error "(7.5.1) normal read $name should fail"
3182 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3183 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3185 [ $failures -eq 256 ] ||
3186 error "(7.6) expect 256 IO failures, but get $failures"
3188 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3189 [ $size -eq $((4096 * $bcount)) ] ||
3190 error "(7.7) expect the size $((4096 * $bcount)), but got $size"
3192 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3193 seek=300 && error "(7.8.0) write to the LOV EA hole should fail"
3195 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
3196 error "(7.8.1) write to normal stripe should NOT fail"
3198 echo "foo" >> $name &&
3199 error "(7.8.3) append write $name should fail"
3201 chown $RUNAS_ID:$RUNAS_GID $name ||
3202 error "(7.9.1) cannot chown on $name"
3204 touch $name || error "(7.10.1) cannot touch $name"
3206 [ $stripes -eq 2 ] ||
3207 error "(7.3.2) expect the stripe count is 2, but got $stripes"
3210 [ $size -eq $((4096 * (256 + 0))) ] ||
3211 error "(7.4.2) expect the size $((4096 * 256)), but got $size"
3213 cat $name > /dev/null &&
3214 error "(7.5.2) normal read $name should fail"
3216 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3217 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3218 [ $failures -eq 256 ] ||
3219 error "(7.6.2) expect 256 IO failures, but get $failures"
3222 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3223 [ $size -eq $((4096 * $bcount)) ] ||
3224 error "(7.7.2) expect the size $((4096 * $bcount)), got $size"
3226 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3227 seek=256 && error "(7.8.2) write to the LOV EA hole should fail"
3229 chown $RUNAS_ID:$RUNAS_GID $name ||
3230 error "(7.9.2) cannot chown on $name"
3232 touch $name || error "(7.10.2) cannot touch $name"
3235 rm -f $name || error "(7.11) cannot unlink $name"
# f3 only exists when there are more than 2 OSTs.
3237 [ $OSTCOUNT -le 2 ] && return
3240 # ${fid3}-R-0 should contains the old f3's stripe0 and stripe1
3242 name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
3243 echo "Check $name, which contains the old f3's stripe0 and stripe1"
3245 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
3247 pattern=$($LFS getstripe -L $name)
3248 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3249 error "(8.2) expect pattern flag hole, but got $pattern"
3251 stripes=$($LFS getstripe -c $name)
3252 [ $stripes -eq 3 ] ||
3253 error "(8.3) expect the stripe count is 3, but got $stripes"
3255 size=$(stat $name | awk '/Size:/ { print $2 }')
3257 [ $size -eq $((4096 * (256 + 256 + 0))) ] ||
3258 error "(8.4) expect the size $((4096 * 512)), but got $size"
3260 cat $name > /dev/null &&
3261 error "(8.5) normal read $name should fail"
3263 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3264 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3266 [ $failures -eq 256 ] ||
3267 error "(8.6) expect 256 IO failures, but get $failures"
3270 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3271 [ $size -eq $((4096 * $bcount)) ] ||
3272 error "(8.7) expect the size $((4096 * $bcount)), but got $size"
3274 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3275 seek=512 && error "(8.8) write to the LOV EA hole should fail"
3277 chown $RUNAS_ID:$RUNAS_GID $name ||
3278 error "(8.9) cannot chown on $name"
3280 touch $name || error "(8.10) cannot touch $name"
3282 rm -f $name || error "(8.11) cannot unlink $name"
3284 run_test 20a "Handle the orphan with dummy LOV EA slot properly"
# test_20b: PFL variant of the dummy-LOV-EA-slot orphan test.
# f0..f2 are two-component PFL files ([0,2M) and [2M,EOF), 2 stripes each).
# fail_loc 0x1616 / 0x161a (plus fail_val) is set on the MDS to simulate the
# losses announced by the echo lines, layout LFSCK (-r -o) is run, and the
# recovered .lustre/lost+found/MDT0000/<fid>-R-0 files are checked per
# component with "getstripe -I1" / "-I2".
# Failure tags are numbered per section: 7.x for f0, 8.x for f1, 9.x for f2.
3287 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
3290 echo "The target MDT-object and some of its OST-object are lost."
3291 echo "The LFSCK should find out the left OST-objects and re-create"
3292 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
3293 echo "with the partial OST-objects (LOV EA hole)."
3295 echo "New client can access the file with LOV EA hole via normal"
3296 echo "system tools or commands without crash the system - PFL case."
3299 check_mount_and_prep
3301 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f0 ||
3302 error "(0) Fail to create PFL file $DIR/$tdir/f0"
3303 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f1 ||
3304 error "(1) Fail to create PFL file $DIR/$tdir/f1"
3305 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f2 ||
3306 error "(2) Fail to create PFL file $DIR/$tdir/f2"
# 769 blocks of 4KB: 3MB plus one block, so both components hold data.
3308 local bcount=$((256 * 3 + 1))
3310 dd if=/dev/zero of=$DIR/$tdir/f0 bs=4096 count=$bcount
3311 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4096 count=$bcount
3312 dd if=/dev/zero of=$DIR/$tdir/f2 bs=4096 count=$bcount
# Remember the FIDs: the recovered orphans are looked up by "<fid>-R-0".
3314 local fid0=$($LFS path2fid $DIR/$tdir/f0)
3315 local fid1=$($LFS path2fid $DIR/$tdir/f1)
3316 local fid2=$($LFS path2fid $DIR/$tdir/f2)
3319 $LFS getstripe $DIR/$tdir/f0
3321 $LFS getstripe $DIR/$tdir/f1
3323 $LFS getstripe $DIR/$tdir/f2
3325 cancel_lru_locks mdc
3326 cancel_lru_locks osc
3328 echo "Inject failure..."
3329 echo "To simulate f0 lost MDT-object"
3330 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
3331 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1616
3334 echo "To simulate the case of f1 lost MDT-object and "
3335 echo "the first OST-object in each PFL component"
3336 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
3337 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161a
3340 echo "To simulate the case of f2 lost MDT-object and "
3341 echo "the second OST-object in each PFL component"
3342 do_facet $SINGLEMDS $LCTL set_param fail_val=1
3347 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
3349 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3350 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
3352 for k in $(seq $MDSCOUNT); do
3353 # The LFSCK status query internal is 30 seconds. For the case
3354 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3355 # time to guarantee the status sync up.
3356 wait_update_facet mds${k} "$LCTL get_param -n \
3357 mdd.$(facet_svc mds${k}).lfsck_layout |
3358 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3359 error "(4) MDS${k} is not the expected 'completed'"
# The OSTs are checked once, without waiting.
3362 for k in $(seq $OSTCOUNT); do
3363 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3364 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3365 awk '/^status/ { print $2 }')
3366 [ "$cur_status" == "completed" ] ||
3367 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
3370 local repaired=$(do_facet mds1 $LCTL get_param -n \
3371 mdd.$(facet_svc mds1).lfsck_layout |
3372 awk '/^repaired_orphan/ { print $2 }')
3373 [ $repaired -eq 8 ] ||
3374 error "(6) Expect 8 fixed on mds1, but got: $repaired"
3377 # ${fid0}-R-0 is the old f0
3379 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
3380 echo "Check $name, which is the old f0"
3382 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
3384 local pattern=$($LFS getstripe -L -I1 $name)
3385 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3386 error "(7.2.1) NOT expect pattern flag hole, but got $pattern"
3388 pattern=$($LFS getstripe -L -I2 $name)
3389 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3390 error "(7.2.2) NOT expect pattern flag hole, but got $pattern"
3392 local stripes=$($LFS getstripe -c -I1 $name)
3393 [ $stripes -eq 2 ] ||
3394 error "(7.3.1) expect 2 stripes, but got $stripes"
3396 stripes=$($LFS getstripe -c -I2 $name)
3397 [ $stripes -eq 2 ] ||
3398 error "(7.3.2) expect 2 stripes, but got $stripes"
# Component extents must match the layout used at creation:
# COMP1 = [0, 2097152), COMP2 = [2097152, EOF).
3400 local e_start=$($LFS getstripe -I1 $name |
3401 awk '/lcme_extent.e_start:/ { print $2 }')
3402 [ $e_start -eq 0 ] ||
3403 error "(7.4.1) expect the COMP1 start at 0, got $e_start"
3405 local e_end=$($LFS getstripe -I1 $name |
3406 awk '/lcme_extent.e_end:/ { print $2 }')
3407 [ $e_end -eq 2097152 ] ||
3408 error "(7.4.2) expect the COMP1 end at 2097152, got $e_end"
3410 e_start=$($LFS getstripe -I2 $name |
3411 awk '/lcme_extent.e_start:/ { print $2 }')
3412 [ $e_start -eq 2097152 ] ||
3413 error "(7.5.1) expect the COMP2 start at 2097152, got $e_start"
3415 e_end=$($LFS getstripe -I2 $name |
3416 awk '/lcme_extent.e_end:/ { print $2 }')
3417 [ "$e_end" = "EOF" ] ||
3418 error "(7.5.2) expect the COMP2 end at (EOF), got $e_end"
3420 local size=$(stat $name | awk '/Size:/ { print $2 }')
3421 [ $size -eq $((4096 * $bcount)) ] ||
3422 error "(7.6) expect the size $((4096 * $bcount)), but got $size"
# f0 lost no stripe, so read, append, chown, touch and unlink must all work.
3424 cat $name > /dev/null || error "(7.7) cannot read $name"
3426 echo "dummy" >> $name || error "(7.8) cannot write $name"
3428 chown $RUNAS_ID:$RUNAS_GID $name || error "(7.9) cannot chown on $name"
3430 touch $name || error "(7.10) cannot touch $name"
3432 rm -f $name || error "(7.11) cannot unlink $name"
3435 # ${fid1}-R-0 contains the old f1's second stripe in each COMP
3437 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3438 echo "Check $name, it contains f1's second OST-object in each COMP"
3440 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
3442 pattern=$($LFS getstripe -L -I1 $name)
3443 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3444 error "(8.2.1) expect pattern flag hole, but got $pattern"
3446 pattern=$($LFS getstripe -L -I2 $name)
3447 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3448 error "(8.2.2) expect pattern flag hole, but got $pattern"
# COMP1 check is tagged 8.3.1 so it can be told apart from the COMP2 check.
3450 stripes=$($LFS getstripe -c -I1 $name)
3451 [ $stripes -eq 2 ] ||
3452 error "(8.3.1) expect 2 stripes, but got $stripes"
3454 stripes=$($LFS getstripe -c -I2 $name)
3455 [ $stripes -eq 2 ] ||
3456 error "(8.3.2) expect 2 stripes, but got $stripes"
3458 e_start=$($LFS getstripe -I1 $name |
3459 awk '/lcme_extent.e_start:/ { print $2 }')
3460 [ $e_start -eq 0 ] ||
3461 error "(8.4.1) expect the COMP1 start at 0, got $e_start"
3463 e_end=$($LFS getstripe -I1 $name |
3464 awk '/lcme_extent.e_end:/ { print $2 }')
3465 [ $e_end -eq 2097152 ] ||
3466 error "(8.4.2) expect the COMP1 end at 2097152, got $e_end"
3468 e_start=$($LFS getstripe -I2 $name |
3469 awk '/lcme_extent.e_start:/ { print $2 }')
3470 [ $e_start -eq 2097152 ] ||
3471 error "(8.5.1) expect the COMP2 start at 2097152, got $e_start"
3473 e_end=$($LFS getstripe -I2 $name |
3474 awk '/lcme_extent.e_end:/ { print $2 }')
3475 [ "$e_end" = "EOF" ] ||
3476 error "(8.5.2) expect the COMP2 end at (EOF), got $e_end"
3478 size=$(stat $name | awk '/Size:/ { print $2 }')
3479 [ $size -eq $((4096 * $bcount)) ] ||
3480 error "(8.6) expect the size $((4096 * $bcount)), but got $size"
3482 cat $name > /dev/null && error "(8.7) normal read $name should fail"
# dd continues past read errors (noerror) and pads every block (sync); the
# "Input/output error" lines in its output are counted as failed reads.
3484 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3485 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3487 # The first stripe in each COMP was lost
3488 [ $failures -eq 512 ] ||
3489 error "(8.8) expect 512 IO failures, but get $failures"
3491 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3492 [ $size -eq $((4096 * $bcount)) ] ||
3493 error "(8.9) expect the size $((4096 * $bcount)), but got $size"
# Block 0 is expected to be refused as part of the hole, block 300 accepted.
3495 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
3496 error "(8.10) write to the LOV EA hole should fail"
3498 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
3499 error "(8.11) write to normal stripe should NOT fail"
3501 echo "foo" >> $name && error "(8.12) append write $name should fail"
3503 chown $RUNAS_ID:$RUNAS_GID $name || error "(8.13) cannot chown on $name"
3505 touch $name || error "(8.14) cannot touch $name"
3507 rm -f $name || error "(8.15) cannot unlink $name"
3510 # ${fid2}-R-0 contains the old f2's first stripe in each COMP
3512 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
3513 echo "Check $name, it contains f2's first stripe in each COMP"
3515 $LFS getstripe -v $name || error "(9.1) cannot getstripe on $name"
3517 pattern=$($LFS getstripe -L -I1 $name)
3518 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3519 error "(9.2.1) expect pattern flag hole, but got $pattern"
3521 pattern=$($LFS getstripe -L -I2 $name)
3522 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3523 error "(9.2.2) expect pattern flag hole, but got $pattern"
# COMP1 check is tagged 9.3.1 so it can be told apart from the COMP2 check.
3525 stripes=$($LFS getstripe -c -I1 $name)
3526 [ $stripes -eq 2 ] ||
3527 error "(9.3.1) expect 2 stripes, but got $stripes"
3529 stripes=$($LFS getstripe -c -I2 $name)
3530 [ $stripes -eq 2 ] ||
3531 error "(9.3.2) expect 2 stripes, but got $stripes"
3533 e_start=$($LFS getstripe -I1 $name |
3534 awk '/lcme_extent.e_start:/ { print $2 }')
3535 [ $e_start -eq 0 ] ||
3536 error "(9.4.1) expect the COMP1 start at 0, got $e_start"
3538 e_end=$($LFS getstripe -I1 $name |
3539 awk '/lcme_extent.e_end:/ { print $2 }')
3540 [ $e_end -eq 2097152 ] ||
3541 error "(9.4.2) expect the COMP1 end at 2097152, got $e_end"
3543 e_start=$($LFS getstripe -I2 $name |
3544 awk '/lcme_extent.e_start:/ { print $2 }')
3545 [ $e_start -eq 2097152 ] ||
3546 error "(9.5.1) expect the COMP2 start at 2097152, got $e_start"
3548 e_end=$($LFS getstripe -I2 $name |
3549 awk '/lcme_extent.e_end:/ { print $2 }')
3550 [ "$e_end" = "EOF" ] ||
3551 error "(9.5.2) expect the COMP2 end at (EOF), got $e_end"
3553 size=$(stat $name | awk '/Size:/ { print $2 }')
3554 # The second stripe in COMP was lost, so we do not know there
3555 # have ever been some data before. 'stat' will regard it as
3556 # no data on the lost stripe.
3558 [ $size -eq $((4096 * $bcount)) ] ||
3559 error "(9.6) expect size $((4096 * $bcount)), but got $size"
3561 cat $name > /dev/null &&
3562 error "(9.7) normal read $name should fail"
# The check is for 512 failed reads; the message reports the same number.
3564 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3565 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3566 [ $failures -eq 512 ] ||
3567 error "(9.8) expect 512 IO failures, but get $failures"
3569 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3570 # The second stripe in COMP was lost, so we do not know there
3571 # have ever been some data before. Since 'dd' skip failure,
3572 # it will regard the lost stripe contains data.
3574 [ $size -eq $((4096 * $bcount)) ] ||
3575 error "(9.9) expect the size $((4096 * $bcount)), but got $size"
# Here block 300 is expected to be refused as part of the hole and block 0
# accepted - the reverse of the f1 case.
3577 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3578 seek=300 && error "(9.10) write to the LOV EA hole should fail"
3580 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
3581 error "(9.11) write to normal stripe should NOT fail"
3583 echo "foo" >> $name &&
3584 error "(9.12) append write $name should fail"
3586 chown $RUNAS_ID:$RUNAS_GID $name ||
3587 error "(9.13) cannot chown on $name"
3589 touch $name || error "(9.14) cannot touch $name"
3591 rm -f $name || error "(9.15) cannot unlink $name"
3593 run_test 20b "Handle the orphan with dummy LOV EA slot properly - PFL case"
# test_21: start LFSCK on MDT0000 without "-t" and verify that both the
# namespace and the layout component report 'scanning-phase1', then stop it.
# Skipped when the MDS is older than 2.5.59.
3596 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.5.59) ]] &&
3597 skip "ignore the test if MDS is older than 2.5.59" && return
3599 check_mount_and_prep
3600 createmany -o $DIR/$tdir/f 100 || error "(0) Fail to create 100 files"
# NOTE(review): "-s 1" presumably throttles the scan so that it is still in
# phase 1 when the status is read below - confirm against lctl lfsck_start.
3602 echo "Start all LFSCK components by default (-s 1)"
3603 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -s 1 -r ||
3604 error "Fail to start LFSCK"
3606 echo "namespace LFSCK should be in 'scanning-phase1' status"
3607 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
3608 [ "$STATUS" == "scanning-phase1" ] ||
3609 error "Expect namespace 'scanning-phase1', but got '$STATUS'"
3611 echo "layout LFSCK should be in 'scanning-phase1' status"
3612 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
3613 [ "$STATUS" == "scanning-phase1" ] ||
3614 error "Expect layout 'scanning-phase1', but got '$STATUS'"
3616 echo "Stop all LFSCK components by default"
3617 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 ||
3618 error "Fail to stop LFSCK"
3620 run_test 21 "run all LFSCK components by default"
# test_22a: namespace LFSCK repairs a child directory whose ".." entry
# references a parent that no longer exists.
# foo/dummy is created on MDT0 while fail_loc 0x161e is set, "guard" is then
# removed, and LFSCK (-A -r) is expected to report exactly one
# unmatched_pairs_repaired, after which "ls" on the child must succeed.
3623 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# The quotes around .. are escaped so that they are actually printed; an
# unescaped "".."  closes and reopens the string and drops them.
3626 echo "The parent_A references the child directory via some name entry,"
3627 echo "but the child directory back references another parent_B via its"
3628 echo "\"..\" name entry. The parent_B does not exist. Then the namespace"
3629 echo "LFSCK will repair the child directory's \"..\" name entry."
3632 check_mount_and_prep
3634 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
3635 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
3637 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
3638 echo "The dummy's dotdot name entry references the guard."
3639 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
3640 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
3641 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
3642 error "(3) Fail to mkdir on MDT0"
3643 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Removing guard leaves dummy's ".." pointing at a non-existent parent.
3645 rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"
3647 echo "Trigger namespace LFSCK to repair unmatched pairs"
3648 $START_NAMESPACE -A -r ||
3649 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): wait_all_targets_blocked is defined elsewhere; the trailing
# 6 looks like the failure tag it reports (tags here jump from 5 to 7).
3651 wait_all_targets_blocked namespace completed 6
3653 local repaired=$($SHOW_NAMESPACE |
3654 awk '/^unmatched_pairs_repaired/ { print $2 }')
3655 [ $repaired -eq 1 ] ||
3656 error "(7) Fail to repair unmatched pairs: $repaired"
3658 echo "'ls' should success after namespace LFSCK repairing"
3659 ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
3660 error "(8) ls should success."
3662 run_test 22a "LFSCK can repair unmatched pairs (1)"
# test_22b: namespace LFSCK repairs a child directory whose ".." entry and
# linkEA reference an existing parent that has no matching name entry.
# foo/dummy is created on MDT0 while fail_loc 0x161e is set; fid2path on its
# FID must not resolve to foo/dummy before LFSCK (-A -r) and must resolve to
# it afterwards, with exactly one unmatched_pairs_repaired reported.
3665 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# The quotes around .. are escaped so that they are actually printed; an
# unescaped "".."  closes and reopens the string and drops them.
3668 echo "The parent_A references the child directory via the name entry_B,"
3669 echo "but the child directory back references another parent_C via its"
3670 echo "\"..\" name entry. The parent_C exists, but there is no the name"
3671 echo "entry_B under the parent_C. Then the namespace LFSCK will repair"
3672 echo "the child directory's \"..\" name entry and its linkEA."
3675 check_mount_and_prep
3677 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
3678 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
3680 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
3681 echo "and bad linkEA. The dummy's dotdot name entry references the"
3682 echo "guard. The dummy's linkEA references n non-exist name entry."
3683 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
3684 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
3685 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
3686 error "(3) Fail to mkdir on MDT0"
3687 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3689 local dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
3690 echo "fid2path should NOT work on the dummy's FID $dummyfid"
3691 local dummyname=$($LFS fid2path $DIR $dummyfid)
3692 [ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
3693 error "(4) fid2path works unexpectedly."
3695 echo "Trigger namespace LFSCK to repair unmatched pairs"
3696 $START_NAMESPACE -A -r ||
3697 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): wait_all_targets_blocked is defined elsewhere; the trailing
# 6 looks like the failure tag it reports (tags here jump from 5 to 7).
3699 wait_all_targets_blocked namespace completed 6
3701 local repaired=$($SHOW_NAMESPACE |
3702 awk '/^unmatched_pairs_repaired/ { print $2 }')
3703 [ $repaired -eq 1 ] ||
3704 error "(7) Fail to repair unmatched pairs: $repaired"
3706 echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
# dummyname is already declared local above; only re-assign it here.
3707 dummyname=$($LFS fid2path $DIR $dummyfid)
3708 [ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
3709 error "(8) fid2path does not work"
3711 run_test 22b "LFSCK can repair unmatched pairs (2)"
# test_23a: namespace LFSCK handling of a dangling name entry across MDTs.
# d0 lives on MDT0 and d0/d1 on MDT1; d1 is removed while fail_loc 0x1620 is
# set on mds2.  A first LFSCK run (-A -r) must count one dangling entry but
# leave "ls" failing; a second run with -C must make "ls" succeed.
3714 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3717 echo "The name entry is there, but the MDT-object for such name "
3718 echo "entry does not exist. The namespace LFSCK should find out "
3719 echo "and repair the inconsistency as required."
3722 check_mount_and_prep
3724 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3725 $LFS mkdir -i 1 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT1"
3727 echo "Inject failure stub on MDT1 to simulate dangling name entry"
3728 #define OBD_FAIL_LFSCK_DANGLING2 0x1620
3729 do_facet mds2 $LCTL set_param fail_loc=0x1620
3730 rmdir $DIR/$tdir/d0/d1 || error "(3) Fail to rmdir d1"
3731 do_facet mds2 $LCTL set_param fail_loc=0
3733 echo "'ls' should fail because of dangling name entry"
3734 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(4) ls should fail."
# First pass: no -C, so the dangling entry is counted but "ls" must still
# fail afterwards.
3736 echo "Trigger namespace LFSCK to find out dangling name entry"
3737 $START_NAMESPACE -A -r ||
3738 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): wait_all_targets_blocked is defined elsewhere; the trailing
# number looks like the failure tag it reports (6 here, 10 below).
3740 wait_all_targets_blocked namespace completed 6
3742 local repaired=$($SHOW_NAMESPACE |
3743 awk '/^dangling_repaired/ { print $2 }')
3744 [ $repaired -eq 1 ] ||
3745 error "(7) Fail to repair dangling name entry: $repaired"
3747 echo "'ls' should fail because not re-create MDT-object by default"
3748 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(8) ls should fail."
# Second pass: with -C, after which "ls" must succeed.
3750 echo "Trigger namespace LFSCK again to repair dangling name entry"
3751 $START_NAMESPACE -A -r -C ||
3752 error "(9) Fail to start LFSCK for namespace"
3754 wait_all_targets_blocked namespace completed 10
3756 repaired=$($SHOW_NAMESPACE |
3757 awk '/^dangling_repaired/ { print $2 }')
3758 [ $repaired -eq 1 ] ||
3759 error "(11) Fail to repair dangling name entry: $repaired"
3761 echo "'ls' should success after namespace LFSCK repairing"
3762 ls -ail $DIR/$tdir/d0/d1 > /dev/null || error "(12) ls should success."
3764 run_test 23a "LFSCK can repair dangling name entry (1)"
# test_23b: a hard link "foo" to f0 is created while fail_loc 0x1621 is set
# with fail_val = OID of f1, then f1 is removed so that "foo" is dangling.
# Namespace LFSCK (-r -C) is expected to report one dangling_repaired and one
# multiple_linked_repaired, after which "foo" must contain f0's data.
3768 echo "The objectA has multiple hard links, one of them corresponding"
3769 echo "to the name entry_B. But there is something wrong for the name"
3770 echo "entry_B and cause entry_B to references non-exist object_C."
3771 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3772 echo "as dangling, and re-create the lost object_C. When the LFSCK"
3773 echo "comes to the second-stage scanning, it will find that the"
3774 echo "former re-creating object_C is not proper, and will try to"
3775 echo "replace the object_C with the real object_A."
3778 check_mount_and_prep
3780 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3781 $LFS path2fid $DIR/$tdir/d0
# Filler files t0..t9; they are unlinked again before f1 is removed (see the
# LU-7429 note further down).
3783 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
3785 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3786 $LFS path2fid $DIR/$tdir/d0/f0
3788 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
3789 $LFS path2fid $DIR/$tdir/d0/f1
# The first ':'-separated field of each FID is taken as its sequence.
3791 local SEQ0=$($LFS path2fid $DIR/$tdir/d0/f0 | awk -F':' '{print $1}')
3792 local SEQ1=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $1}')
3794 if [ "$SEQ0" != "$SEQ1" ]; then
3795 # To guarantee that the f0 and f1 are in the same FID seq
3796 rm -f $DIR/$tdir/d0/f0 ||
3797 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
3798 echo "dummy" > $DIR/$tdir/d0/f0 ||
3799 error "(3.2) Fail to touch on MDT0"
3800 $LFS path2fid $DIR/$tdir/d0/f0
# Second FID field of f1, re-printed through printf %d as a decimal number
# before being passed as fail_val.
3803 local OID=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $2}')
3804 OID=$(printf %d $OID)
3806 echo "Inject failure stub on MDT0 to simulate dangling name entry"
3807 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
3808 do_facet $SINGLEMDS $LCTL set_param fail_val=$OID fail_loc=0x1621
3809 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
3810 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3812 # If there is creation after the dangling injection, it may re-use
3813 # the just released local object (inode) that is referenced by the
3814 # dangling name entry. It will fail the dangling injection.
3815 # So before deleting the target object for the dangling name entry,
3816 # remove some other objects to avoid the target object being reused
3817 # by some potential creations. LU-7429
3818 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
3820 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
3822 echo "'ls' should fail because of dangling name entry"
3823 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3824 error "(6) ls should fail."
3826 echo "Trigger namespace LFSCK to find out dangling name entry"
3827 $START_NAMESPACE -r -C ||
3828 error "(7) Fail to start LFSCK for namespace"
3830 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3831 mdd.${MDT_DEV}.lfsck_namespace |
3832 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3834 error "(8) unexpected status"
3837 local repaired=$($SHOW_NAMESPACE |
3838 awk '/^dangling_repaired/ { print $2 }')
3839 [ $repaired -eq 1 ] ||
3840 error "(9) Fail to repair dangling name entry: $repaired"
3842 repaired=$($SHOW_NAMESPACE |
3843 awk '/^multiple_linked_repaired/ { print $2 }')
3844 [ $repaired -eq 1 ] ||
3845 error "(10) Fail to drop the former created object: $repaired"
# "foo" was linked to f0, whose content is "dummy".
3847 local data=$(cat $DIR/$tdir/d0/foo)
3848 [ "$data" == "dummy" ] ||
3849 error "(11) The $DIR/$tdir/d0/foo is not recovered: $data"
3851 run_test 23b "LFSCK can repair dangling name entry (2)"
# Test 23c: inject a dangling name entry "foo", start the namespace LFSCK
# with an injected delay, modify the object LFSCK re-created for "foo" from
# the client, and then verify that LFSCK reports exactly one dangling repair
# and that "foo" does not hold the original "dummy" content.
# NOTE(review): the function header and block terminators (fi, closing
# braces) are not visible in this listing; only comments were added.
3855 echo "The objectA has multiple hard links, one of them corresponding"
3856 echo "to the name entry_B. But there is something wrong for the name"
3857 echo "entry_B and cause entry_B to references non-exist object_C."
3858 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3859 echo "as dangling, and re-create the lost object_C. And then others"
3860 echo "modified the re-created object_C. When the LFSCK comes to the"
3861 echo "second-stage scanning, it will find that the former re-creating"
3862 echo "object_C maybe wrong and try to replace the object_C with the"
3863 echo "real object_A. But because object_C has been modified, so the"
3864 echo "LFSCK cannot replace it."
3867 start_full_debug_logging
3869 check_mount_and_prep
3871 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3872 $LFS path2fid $DIR/$tdir/d0
3874 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
3876 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3877 $LFS path2fid $DIR/$tdir/d0/f0
3879 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
3880 $LFS path2fid $DIR/$tdir/d0/f1
# Compare the first ':'-separated field (the sequence) of the two FIDs.
3882 local SEQ0=$($LFS path2fid $DIR/$tdir/d0/f0 | awk -F':' '{print $1}')
3883 local SEQ1=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $1}')
3885 if [ "$SEQ0" != "$SEQ1" ]; then
3886 # To guarantee that the f0 and f1 are in the same FID seq
3887 rm -f $DIR/$tdir/d0/f0 ||
3888 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
3889 echo "dummy" > $DIR/$tdir/d0/f0 ||
3890 error "(3.2) Fail to touch on MDT0"
3891 $LFS path2fid $DIR/$tdir/d0/f0
# OID is the second ':'-separated field of f1's FID, converted to decimal
# by printf so it can be passed as fail_val.
3894 local OID=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $2}')
3895 OID=$(printf %d $OID)
3897 echo "Inject failure stub on MDT0 to simulate dangling name entry"
3898 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
# The fail_loc is only active while the hard link "foo" is created.
3899 do_facet $SINGLEMDS $LCTL set_param fail_val=$OID fail_loc=0x1621
3900 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
3901 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3903 # If there is creation after the dangling injection, it may re-use
3904 # the just released local object (inode) that is referenced by the
3905 # dangling name entry. It will fail the dangling injection.
3906 # So before deleting the target object for the dangling name entry,
3907 # remove some other objects to avoid the target object being reused
3908 # by some potential creations. LU-7429
3909 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
3911 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
3913 echo "'ls' should fail because of dangling name entry"
3914 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3915 error "(6) ls should fail."
# NOTE(review): fail_val=10 with DELAY3 presumably slows LFSCK down so the
# client can modify the re-created object before scanning ends - confirm.
3917 #define OBD_FAIL_LFSCK_DELAY3 0x1602
3918 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
3920 echo "Trigger namespace LFSCK to find out dangling name entry"
3921 $START_NAMESPACE -r -C ||
3922 error "(7) Fail to start LFSCK for namespace"
# Wait (up to $LTIME) until stat on the client reports size 0 for "foo".
3924 wait_update_facet client "stat $DIR/$tdir/d0/foo |
3925 awk '/Size/ { print \\\$2 }'" "0" $LTIME || {
3926 stat $DIR/$tdir/d0/foo
3928 error "(8) unexpected size"
# Modify the object behind "foo" while the LFSCK delay is still armed.
3931 echo "data" >> $DIR/$tdir/d0/foo || error "(9) Fail to write"
3932 cancel_lru_locks osc
3934 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3935 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3936 mdd.${MDT_DEV}.lfsck_namespace |
3937 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3939 error "(10) unexpected status"
3942 stop_full_debug_logging
3944 local repaired=$($SHOW_NAMESPACE |
3945 awk '/^dangling_repaired/ { print $2 }')
3946 [ $repaired -eq 1 ] ||
3947 error "(11) Fail to repair dangling name entry: $repaired"
# The content must NOT be "dummy": "foo" is expected to keep the modified
# re-created object instead of being switched back to f0's object.
3949 local data=$(cat $DIR/$tdir/d0/foo)
3950 [ "$data" != "dummy" ] ||
3951 error "(12) The $DIR/$tdir/d0/foo should not be recovered"
3953 run_test 23c "LFSCK can repair dangling name entry (3)"
# Test 24: create an MDT-object whose linkEA duplicates another object's
# name entry, remove its own name entry, then verify that the namespace
# LFSCK counts one multiple-referenced repair and that an entry named
# "$cfid-$pfid-D-0" exists under .lustre/lost+found/MDT0000/.
# Skipped when fewer than 2 MDTs are configured.
# NOTE(review): the function header and block terminators are not visible in
# this listing; only comments were added.
3956 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3959 echo "Two MDT-objects back reference the same name entry via their"
3960 echo "each own linkEA entry, but the name entry only references one"
3961 echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
3962 echo "for the MDT-object that is not recognized. If such MDT-object"
3963 echo "has no other linkEA entry after the removing, then the LFSCK"
3964 echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."
3967 check_mount_and_prep
3969 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3971 mkdir $DIR/$tdir/d0/guard || error "(1) Fail to mkdir guard"
3972 $LFS path2fid $DIR/$tdir/d0/guard
3974 mkdir $DIR/$tdir/d0/dummy || error "(2) Fail to mkdir dummy"
3975 $LFS path2fid $DIR/$tdir/d0/dummy
# pfid is the parent FID used later to build the expected orphan name.
# NOTE(review): an "else" between the two assignments is presumably among
# the lines missing from this listing (guard for non-ldiskfs, dummy for
# ldiskfs) - confirm against the full file.
3978 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
3979 pfid=$($LFS path2fid $DIR/$tdir/d0/guard)
3981 pfid=$($LFS path2fid $DIR/$tdir/d0/dummy)
3984 touch $DIR/$tdir/d0/guard/foo ||
3985 error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"
3987 echo "Inject failure stub on MDT0 to simulate the case that"
3988 echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
3989 echo "that references $DIR/$tdir/d0/guard/foo."
3990 echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
3991 echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
3992 echo "there with the same linkEA entry as another MDT-object"
3993 echo "$DIR/$tdir/d0/guard/foo has"
# The fail_loc stays set across both the mkdir and the rmdir of dummy/foo;
# the child FID is captured in between, while the name entry still exists.
3995 #define OBD_FAIL_LFSCK_MUL_REF 0x1622
3996 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
3997 $LFS mkdir -i 0 $DIR/$tdir/d0/dummy/foo ||
3998 error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
3999 $LFS path2fid $DIR/$tdir/d0/dummy/foo
4000 local cfid=$($LFS path2fid $DIR/$tdir/d0/dummy/foo)
4001 rmdir $DIR/$tdir/d0/dummy/foo ||
4002 error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
4003 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4005 echo "stat $DIR/$tdir/d0/dummy/foo should fail"
4006 stat $DIR/$tdir/d0/dummy/foo > /dev/null 2>&1 &&
4007 error "(6) stat successfully unexpectedly"
4009 echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
4010 $START_NAMESPACE -A -r ||
4011 error "(7) Fail to start LFSCK for namespace"
# NOTE(review): presumably blocks until namespace LFSCK is "completed" on
# all targets; the trailing number looks like the error label - confirm.
4013 wait_all_targets_blocked namespace completed 8
4015 local repaired=$($SHOW_NAMESPACE |
4016 awk '/^multiple_referenced_repaired/ { print $2 }')
4017 [ $repaired -eq 1 ] ||
4018 error "(9) Fail to repair multiple referenced name entry: $repaired"
4020 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
4021 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4022 error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# Expected orphan name: <child FID>-<parent FID>-D-0.
4024 local cname="$cfid-$pfid-D-0"
4025 ls -ail $MOUNT/.lustre/lost+found/MDT0000/$cname ||
4026 error "(11) .lustre/lost+found/MDT0000/ should not be empty"
4028 run_test 24 "LFSCK can repair multiple-referenced name entry"
# Test 25: create a file while fail_loc 0x1623 (OBD_FAIL_LFSCK_BAD_TYPE) is
# set on $SINGLEMDS, run the namespace LFSCK, and verify that exactly one
# bad_file_type repair is reported and that the directory can be listed.
# Skipped unless the MDT backend is ldiskfs.
# NOTE(review): the function header and block terminators are not visible in
# this listing; only comments were added.
4031 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
4032 skip "ldiskfs only test" && return
4035 echo "The file type in the name entry does not match the file type"
4036 echo "claimed by the referenced object. Then the LFSCK will update"
4037 echo "the file type in the name entry."
4040 check_mount_and_prep
4042 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4044 echo "Inject failure stub on MDT0 to simulate the case that"
4045 echo "the file type stored in the name entry is wrong."
# The fail_loc is only active while "foo" is created.
4047 #define OBD_FAIL_LFSCK_BAD_TYPE 0x1623
4048 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
4049 touch $DIR/$tdir/d0/foo || error "(2) Fail to touch $DIR/$tdir/d0/foo"
4050 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4052 echo "Trigger namespace LFSCK to repair bad file type in the name entry"
4053 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace"
# Poll the "status" field of lfsck_namespace on $SINGLEMDS until it reads
# "completed" (third argument 32 is passed to wait_update_facet).
4055 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4056 mdd.${MDT_DEV}.lfsck_namespace |
4057 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4059 error "(4) unexpected status"
4062 local repaired=$($SHOW_NAMESPACE |
4063 awk '/^bad_file_type_repaired/ { print $2 }')
4064 [ $repaired -eq 1 ] ||
4065 error "(5) Fail to repair bad file type in name entry: $repaired"
4067 ls -ail $DIR/$tdir/d0 || error "(6) Fail to 'ls' the $DIR/$tdir/d0"
4069 run_test 25 "LFSCK can repair bad file type in the name entry"
# Test 26a: remove foo's name entry while fail_loc 0x1624
# (OBD_FAIL_LFSCK_NO_NAMEENTRY) is set, so - per the messages below - the
# object and its linkEA stay behind. The namespace LFSCK must then report
# one lost_dirent repair and "foo" must come back with the same FID.
# NOTE(review): the function header and block terminators are not visible in
# this listing.
4073 echo "The local name entry back referenced by the MDT-object is lost."
4074 echo "The namespace LFSCK will add the missing local name entry back"
4075 echo "to the normal namespace."
4078 check_mount_and_prep
4080 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4081 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
# Remember the FID so it can be compared after the repair.
4082 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
4084 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
4085 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4087 echo "Inject failure stub on MDT0 to simulate the case that"
4088 echo "foo's name entry will be removed, but the foo's object"
4089 echo "and its linkEA are kept in the system."
4091 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4092 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4093 rm -f $DIR/$tdir/d0/foo || error "(4) Fail to unlink $DIR/$tdir/d0/foo"
4094 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4096 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4097 error "(5) 'ls' should fail"
# This test covers the local name entry; the message says so to keep the
# log distinguishable from the remote variant (26b).
4099 echo "Trigger namespace LFSCK to repair the missing local name entry"
4100 $START_NAMESPACE -r -A ||
4101 error "(6) Fail to start LFSCK for namespace"
4103 wait_all_targets_blocked namespace completed 7
4105 local repaired=$($SHOW_NAMESPACE |
4106 awk '/^lost_dirent_repaired/ { print $2 }')
4107 [ $repaired -eq 1 ] ||
4108 error "(8) Fail to repair lost dirent: $repaired"
4110 ls -ail $DIR/$tdir/d0/foo ||
4111 error "(9) Fail to 'ls' $DIR/$tdir/d0/foo"
# The restored name entry must reference the original object.
4113 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
4114 [ "$foofid" == "$foofid2" ] ||
4115 error "(10) foo's FID changed: $foofid, $foofid2"
4117 run_test 26a "LFSCK can add the missing local name entry back to the namespace"
# Test 26b: remote variant of the lost-name-entry test. d0 is created with
# "-i 1" and d0/foo with "-i 0"; foo's name entry is removed while fail_loc
# 0x1624 is set, then the namespace LFSCK must report one lost_dirent
# repair and "foo" must come back with the same FID.
# Skipped when fewer than 2 MDTs are configured.
# NOTE(review): the function header and block terminators are not visible in
# this listing; only comments were added.
4120 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4123 echo "The remote name entry back referenced by the MDT-object is lost."
4124 echo "The namespace LFSCK will add the missing remote name entry back"
4125 echo "to the normal namespace."
4128 check_mount_and_prep
4130 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4131 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
# Remember the FID so it can be compared after the repair.
4132 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
4134 echo "Inject failure stub on MDT0 to simulate the case that"
4135 echo "foo's name entry will be removed, but the foo's object"
4136 echo "and its linkEA are kept in the system."
4138 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4139 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4140 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
4141 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4143 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4144 error "(4) 'ls' should fail"
4146 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4147 $START_NAMESPACE -r -A ||
4148 error "(5) Fail to start LFSCK for namespace"
4150 wait_all_targets_blocked namespace completed 6
4152 local repaired=$($SHOW_NAMESPACE |
4153 awk '/^lost_dirent_repaired/ { print $2 }')
4154 [ $repaired -eq 1 ] ||
4155 error "(7) Fail to repair lost dirent: $repaired"
4157 ls -ail $DIR/$tdir/d0/foo ||
4158 error "(8) Fail to 'ls' $DIR/$tdir/d0/foo"
# The restored name entry must reference the original object.
4160 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
4161 [ "$foofid" == "$foofid2" ] ||
4162 error "(9) foo's FID changed: $foofid, $foofid2"
4164 run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
# Test 27a: unlink foo and its hard link while fail_loc 0x1624
# (OBD_FAIL_LFSCK_NO_NAMEENTRY) is set, remove the parent d0, then run the
# namespace LFSCK. One lost_dirent repair is expected and an entry matching
# "*-P-*" must appear under .lustre/lost+found/MDT0000/.
# NOTE(review): the function header and block terminators are not visible in
# this listing.
4168 echo "The local parent referenced by the MDT-object linkEA is lost."
4169 echo "The namespace LFSCK will re-create the lost parent as orphan."
4172 check_mount_and_prep
4174 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4175 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4176 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
4177 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4179 echo "Inject failure stub on MDT0 to simulate the case that"
4180 echo "foo's name entry will be removed, but the foo's object"
4181 echo "and its linkEA are kept in the system. And then remove"
4182 echo "another hard link and the parent directory."
# Both unlinks happen while the fail_loc is set.
4184 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4185 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4186 rm -f $DIR/$tdir/d0/foo ||
4187 error "(4) Fail to unlink $DIR/$tdir/d0/foo"
4188 rm -f $DIR/$tdir/d0/dummy ||
4189 error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
4190 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4192 rm -rf $DIR/$tdir/d0 || error "(5) Fail to unlink the dir d0"
4193 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(6) 'ls' should fail"
4195 echo "Trigger namespace LFSCK to repair the lost parent"
4196 $START_NAMESPACE -r -A ||
4197 error "(6) Fail to start LFSCK for namespace"
4199 wait_all_targets_blocked namespace completed 7
4201 local repaired=$($SHOW_NAMESPACE |
4202 awk '/^lost_dirent_repaired/ { print $2 }')
4203 [ $repaired -eq 1 ] ||
4204 error "(8) Fail to repair lost dirent: $repaired"
4206 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
4207 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4208 error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4210 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# The pattern is quoted so that find receives it verbatim; unquoted, the
# shell could expand it against files in the current directory first.
4212 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-P-*')
4213 [ ! -z "$cname" ] ||
4214 error "(10) .lustre/lost+found/MDT0000/ should not be empty"
4216 run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
# Test 27b: remote variant of the lost-parent test. d0 is created with
# "-i 1" and d0/foo with "-i 0"; foo is removed while fail_loc 0x1624
# (OBD_FAIL_LFSCK_NO_NAMEENTRY) is set, then d0 is removed. After the
# namespace LFSCK, one lost_dirent repair is expected and an entry matching
# "*-P-*" must appear under .lustre/lost+found/MDT0001/.
# Skipped when fewer than 2 MDTs are configured.
# NOTE(review): the function header and block terminators are not visible in
# this listing.
4219 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4222 echo "The remote parent referenced by the MDT-object linkEA is lost."
4223 echo "The namespace LFSCK will re-create the lost parent as orphan."
4226 check_mount_and_prep
4228 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4229 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
4231 $LFS path2fid $DIR/$tdir/d0
4233 echo "Inject failure stub on MDT0 to simulate the case that"
4234 echo "foo's name entry will be removed, but the foo's object"
4235 echo "and its linkEA are kept in the system. And then remove"
4236 echo "the parent directory."
4238 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4239 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4240 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
4241 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4243 rmdir $DIR/$tdir/d0 || error "(4) Fail to unlink the dir d0"
4244 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(5) 'ls' should fail"
4246 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4247 $START_NAMESPACE -r -A ||
4248 error "(6) Fail to start LFSCK for namespace"
4250 wait_all_targets_blocked namespace completed 7
4252 local repaired=$($SHOW_NAMESPACE |
4253 awk '/^lost_dirent_repaired/ { print $2 }')
4254 [ $repaired -eq 1 ] ||
4255 error "(8) Fail to repair lost dirent: $repaired"
4257 ls -ail $MOUNT/.lustre/lost+found/
4259 echo "There should be an orphan under .lustre/lost+found/MDT0001/"
4260 [ -d $MOUNT/.lustre/lost+found/MDT0001 ] ||
4261 error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"
4263 ls -ail $MOUNT/.lustre/lost+found/MDT0001/
# The pattern is quoted so that find receives it verbatim; unquoted, the
# shell could expand it against files in the current directory first.
4265 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-P-*')
4266 [ ! -z "$cname" ] ||
4267 error "(10) .lustre/lost+found/MDT0001/ should not be empty"
4269 run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
# Test 28: lose one name entry on each of two MDTs, then run the namespace
# LFSCK twice. In the first run fail_loc 0x161c (OBD_FAIL_LFSCK_BAD_NETWORK)
# is set on mds2; mds1 is expected to end "partial" with 0 lost_dirent
# repairs while mds2 ends "completed" with 1. The second run, without the
# failure, expects 1 repair on mds1 and 0 on mds2.
# Skipped when fewer than 2 MDTs are configured.
# NOTE(review): the function header and block terminators are not visible in
# this listing; only comments were added.
4272 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4275 echo "The target name entry is lost. The LFSCK should insert the"
4276 echo "orphan MDT-object under .lustre/lost+found/MDTxxxx. But if"
4277 echo "the MDT (on which the orphan MDT-object resides) has ever"
4278 echo "failed to respond some name entry verification during the"
4279 echo "first stage-scanning, then the LFSCK should skip to handle"
4280 echo "orphan MDT-object on this MDT. But other MDTs should not"
4284 check_mount_and_prep
# Two cross-MDT layouts: d1 created with "-i 0" and children with "-i 1",
# d2 created with "-i 1" and children with "-i 0".
4285 $LFS mkdir -i 0 $DIR/$tdir/d1
4286 $LFS mkdir -i 1 $DIR/$tdir/d1/a1
4287 $LFS mkdir -i 1 $DIR/$tdir/d1/a2
4289 $LFS mkdir -i 1 $DIR/$tdir/d2
4290 $LFS mkdir -i 0 $DIR/$tdir/d2/a1
4291 $LFS mkdir -i 0 $DIR/$tdir/d2/a2
4293 echo "Inject failure stub on MDT0 to simulate the case that"
4294 echo "d1/a1's name entry will be removed, but the d1/a1's object"
4295 echo "and its linkEA are kept in the system. And the case that"
4296 echo "d2/a2's name entry will be removed, but the d2/a2's object"
4297 echo "and its linkEA are kept in the system."
# The fail_loc is set on both MDS facets while the two rmdirs run.
4299 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4300 do_facet mds1 $LCTL set_param fail_loc=0x1624
4301 do_facet mds2 $LCTL set_param fail_loc=0x1624
4302 rmdir $DIR/$tdir/d1/a1 || error "(1) Fail to rmdir $DIR/$tdir/d1/a1"
4303 rmdir $DIR/$tdir/d2/a2 || error "(2) Fail to rmdir $DIR/$tdir/d2/a2"
4304 do_facet mds1 $LCTL set_param fail_loc=0
4305 do_facet mds2 $LCTL set_param fail_loc=0
4307 cancel_lru_locks mdc
4308 cancel_lru_locks osc
4310 echo "Inject failure, to simulate the MDT0 fail to handle"
4311 echo "MDT1 LFSCK request during the first-stage scanning."
4312 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
4313 do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0
4315 echo "Trigger namespace LFSCK on all devices to find out orphan object"
4316 $START_NAMESPACE -r -A ||
4317 error "(3) Fail to start LFSCK for namespace"
# First run: mds1 must reach "partial", mds2 must reach "completed".
4319 wait_update_facet mds1 "$LCTL get_param -n \
4320 mdd.$(facet_svc mds1).lfsck_namespace |
4321 awk '/^status/ { print \\\$2 }'" "partial" 32 || {
4322 error "(4) mds1 is not the expected 'partial'"
4325 wait_update_facet mds2 "$LCTL get_param -n \
4326 mdd.$(facet_svc mds2).lfsck_namespace |
4327 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4328 error "(5) mds2 is not the expected 'completed'"
4331 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
4333 local repaired=$(do_facet mds1 $LCTL get_param -n \
4334 mdd.$(facet_svc mds1).lfsck_namespace |
4335 awk '/^lost_dirent_repaired/ { print $2 }')
4336 [ $repaired -eq 0 ] ||
4337 error "(6) Expect 0 fixed on mds1, but got: $repaired"
4339 repaired=$(do_facet mds2 $LCTL get_param -n \
4340 mdd.$(facet_svc mds2).lfsck_namespace |
4341 awk '/^lost_dirent_repaired/ { print $2 }')
4342 [ $repaired -eq 1 ] ||
4343 error "(7) Expect 1 fixed on mds2, but got: $repaired"
# Second run, with the failure injection cleared: the expectations swap.
4345 echo "Trigger namespace LFSCK on all devices again to cleanup"
4346 $START_NAMESPACE -r -A ||
4347 error "(8) Fail to start LFSCK for namespace"
4349 wait_all_targets_blocked namespace completed 9
4351 local repaired=$(do_facet mds1 $LCTL get_param -n \
4352 mdd.$(facet_svc mds1).lfsck_namespace |
4353 awk '/^lost_dirent_repaired/ { print $2 }')
4354 [ $repaired -eq 1 ] ||
4355 error "(10) Expect 1 fixed on mds1, but got: $repaired"
4357 repaired=$(do_facet mds2 $LCTL get_param -n \
4358 mdd.$(facet_svc mds2).lfsck_namespace |
4359 awk '/^lost_dirent_repaired/ { print $2 }')
4360 [ $repaired -eq 0 ] ||
4361 error "(11) Expect 0 fixed on mds2, but got: $repaired"
4363 run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
# Test 29a (currently not registered, see the commented-out run_test at the
# end): hard-link foo while fail_loc 0x1625 (OBD_FAIL_LFSCK_MORE_NLINK) is
# set so that stat reports nlink 3, then expect the namespace LFSCK to
# report one nlinks repair and stat to report nlink 2.
# NOTE(review): the function header and block terminators are not visible in
# this listing; only comments were added.
4367 echo "The object's nlink attribute is larger than the object's known"
4368 echo "name entries count. The LFSCK will repair the object's nlink"
4369 echo "attribute to match the known name entries count"
4372 check_mount_and_prep
4374 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4375 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4377 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4378 echo "nlink attribute is larger than its name entries count."
4380 #define OBD_FAIL_LFSCK_MORE_NLINK 0x1625
4381 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
4382 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4383 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4384 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Drop cached locks before stat, then confirm the injection took effect
# (two names, but nlink must read 3).
4386 cancel_lru_locks mdc
4387 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
4388 [ $count -eq 3 ] || error "(4) Cannot inject error: $count"
4390 echo "Trigger namespace LFSCK to repair the nlink count"
4391 $START_NAMESPACE -r -A ||
4392 error "(5) Fail to start LFSCK for namespace"
4394 wait_all_targets_blocked namespace completed 6
4396 local repaired=$($SHOW_NAMESPACE |
4397 awk '/^nlinks_repaired/ { print $2 }')
4398 [ $repaired -eq 1 ] ||
4399 error "(7) Fail to repair nlink count: $repaired"
4401 cancel_lru_locks mdc
4402 count=$(stat --format=%h $DIR/$tdir/d0/foo)
4403 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
4405 # 29a is disabled: nlink is only allowed to be updated when the number
4406 # of known linkEA entries is larger than the nlink count.
4408 #run_test 29a "LFSCK can repair bad nlink count (1)"
# Test 29b: hard-link foo while fail_loc 0x1626 (OBD_FAIL_LFSCK_LESS_NLINK)
# is set so that stat reports nlink 1, then expect the namespace LFSCK to
# report one nlinks repair and stat to report nlink 2.
# NOTE(review): the function header and block terminators are not visible in
# this listing; only comments were added.
4412 echo "The object's nlink attribute is smaller than the object's known"
4413 echo "name entries count. The LFSCK will repair the object's nlink"
4414 echo "attribute to match the known name entries count"
4417 check_mount_and_prep
4419 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4420 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4422 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4423 echo "nlink attribute is smaller than its name entries count."
4425 #define OBD_FAIL_LFSCK_LESS_NLINK 0x1626
4426 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
4427 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4428 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4429 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Drop cached locks before stat, then confirm the injection took effect
# (two names, but nlink must read 1).
4431 cancel_lru_locks mdc
4432 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
4433 [ $count -eq 1 ] || error "(4) Cannot inject error: $count"
4435 echo "Trigger namespace LFSCK to repair the nlink count"
4436 $START_NAMESPACE -r -A ||
4437 error "(5) Fail to start LFSCK for namespace"
4439 wait_all_targets_blocked namespace completed 6
4441 local repaired=$($SHOW_NAMESPACE |
4442 awk '/^nlinks_repaired/ { print $2 }')
4443 [ $repaired -eq 1 ] ||
4444 error "(7) Fail to repair nlink count: $repaired"
4446 cancel_lru_locks mdc
4447 count=$(stat --format=%h $DIR/$tdir/d0/foo)
4448 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
4450 run_test 29b "LFSCK can repair bad nlink count (2)"
# Test 29c: create 150 hard links to guard/f0 to overflow its linkEA, remove
# 100 of them, and verify that the namespace LFSCK reports one
# linkea_overflow_cleared and zero nlinks_repaired. With 2 or more MDTs the
# test also checks, via f0's FID, that "lfs migrate" of the guard directory
# leaves f0 in place before the LFSCK run and moves it afterwards.
# NOTE(review): the function header and block terminators (fi) are not
# visible in this listing; only comments were added.
4455 echo "The namespace LFSCK will create many hard links to the target"
4456 echo "file as to exceed the linkEA size limitation. Under such case"
4457 echo "the linkEA will be marked as overflow that will prevent the"
4458 echo "target file to be migrated. Then remove some hard links to"
4459 echo "make the left hard links to be held within the linkEA size"
4460 echo "limitation. But before the namespace LFSCK adding all the"
4461 echo "missed linkEA entries back, the overflow mark (timestamp)"
4462 echo "will not be cleared."
4465 check_mount_and_prep
4467 mkdir -p $DIR/$tdir/guard || error "(0.1) Fail to mkdir"
4468 $LFS mkdir -i $((MDSCOUNT - 1)) $DIR/$tdir/foo ||
4469 error "(0.2) Fail to mkdir"
4470 touch $DIR/$tdir/guard/f0 || error "(1) Fail to create"
# Baseline FID; an unchanged FID later means the file was not migrated.
4471 local oldfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4473 # define MAX_LINKEA_SIZE 4096
4474 # sizeof(link_ea_header) = 24
4475 # sizeof(link_ea_entry) = 18
4476 # nlink_min=$(((MAX_LINKEA_SIZE - sizeof(link_ea_header)) /
4477 # (sizeof(link_ea_entry) + name_length))
4478 # With an average name length of 12 bytes that is about 135 entries,
4479 # so 150 hard links are more than enough to overflow the linkEA.
4480 echo "Create 150 hard links should succeed although the linkEA overflow"
4481 createmany -l $DIR/$tdir/guard/f0 $DIR/$tdir/foo/ttttttttttt 150 ||
4482 error "(2) Fail to hard link"
4484 cancel_lru_locks mdc
4485 if [ $MDSCOUNT -ge 2 ]; then
4486 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
4487 error "(3.1) Migrate failure"
4489 echo "The object with linkEA overflow should NOT be migrated"
4490 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4491 [ "$newfid" == "$oldfid" ] ||
4492 error "(3.2) Migrate should fail: $newfid != $oldfid"
4495 # Remove 100 hard links, then the linkEA should have space
4496 # to hold the missed linkEA entries.
4497 echo "Remove 100 hard links to save space for the missed linkEA entries"
4498 unlinkmany $DIR/$tdir/foo/ttttttttttt 100 || error "(4) Fail to unlink"
4500 if [ $MDSCOUNT -ge 2 ]; then
4501 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
4502 error "(5.1) Migrate failure"
4504 # The overflow timestamp is still there, so migration will fail.
4505 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4506 [ "$newfid" == "$oldfid" ] ||
4507 error "(5.2) Migrate should fail: $newfid != $oldfid"
# NOTE(review): the sleep announced by the next comment is not among the
# lines visible in this listing - confirm against the full file.
4510 # sleep 3 seconds to guarantee that the overflow is recognized
4513 echo "Trigger namespace LFSCK to clear the overflow timestamp"
4514 $START_NAMESPACE -r -A ||
4515 error "(6) Fail to start LFSCK for namespace"
4517 wait_all_targets_blocked namespace completed 7
4519 local repaired=$($SHOW_NAMESPACE |
4520 awk '/^linkea_overflow_cleared/ { print $2 }')
4521 [ $repaired -eq 1 ] ||
4522 error "(8) Fail to clear linkea overflow: $repaired"
4524 repaired=$($SHOW_NAMESPACE |
4525 awk '/^nlinks_repaired/ { print $2 }')
4526 [ $repaired -eq 0 ] ||
4527 error "(9) Unexpected nlink repaired: $repaired"
4529 if [ $MDSCOUNT -ge 2 ]; then
4530 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
4531 error "(10.1) Migrate failure"
4533 # Migration should succeed after clear the overflow timestamp.
4534 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4535 [ "$newfid" != "$oldfid" ] ||
4536 error "(10.2) Migrate should succeed"
4538 ls -l $DIR/$tdir/foo > /dev/null ||
4539 error "(11) 'ls' failed after migration"
4542 rm -f $DIR/$tdir/guard/f0 || error "(12) Fail to unlink f0"
4543 rm -rf $DIR/$tdir/foo || error "(13) Fail to rmdir foo"
4545 run_test 29c "verify linkEA size limitation"
# Test 30: remove several name entries while fail_loc 0x1624
# (OBD_FAIL_LFSCK_NO_NAMEENTRY) is set, stop the MDT and run e2fsck on its
# device, restart it, and run the namespace LFSCK. At least 4
# local_lost_found_moved items are expected; foo/f0 must be reachable again
# and d0 (created without linkEA via fail_loc 0x161d) must show up, together
# with its d1 and f1 children, under .lustre/lost+found/MDT0000/.
# Skipped unless the MDT backend is ldiskfs.
# NOTE(review): the function header and block terminators are not visible in
# this listing.
4548 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
4549 skip "ldiskfs only test" && return
4552 echo "The namespace LFSCK will move the orphans from backend"
4553 echo "/lost+found directory to normal client visible namespace"
4554 echo "or to global visible ./lustre/lost+found/MDTxxxx/ directory"
4557 check_mount_and_prep
4559 $LFS mkdir -i 0 $DIR/$tdir/foo || error "(1) Fail to mkdir foo"
4560 touch $DIR/$tdir/foo/f0 || error "(2) Fail to touch f0"
4562 echo "Inject failure stub on MDT0 to simulate the case that"
4563 echo "directory d0 has no linkEA entry, then the LFSCK will"
4564 echo "move it into .lustre/lost+found/MDTxxxx/ later."
# The fail_loc is only active while d0 is created.
4566 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
4567 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
4568 mkdir $DIR/$tdir/foo/d0 || error "(3) Fail to mkdir d0"
4569 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4571 touch $DIR/$tdir/foo/d0/f1 || error "(4) Fail to touch f1"
4572 mkdir $DIR/$tdir/foo/d0/d1 || error "(5) Fail to mkdir d1"
4574 echo "Inject failure stub on MDT0 to simulate the case that the"
4575 echo "object's name entry will be removed, but not destroy the"
4576 echo "object. Then backend e2fsck will handle it as orphan and"
4577 echo "add them into the backend /lost+found directory."
# All four removals happen while the fail_loc is set.
4579 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4580 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4581 rmdir $DIR/$tdir/foo/d0/d1 || error "(6) Fail to rmdir d1"
4582 rm -f $DIR/$tdir/foo/d0/f1 || error "(7) Fail to unlink f1"
4583 rmdir $DIR/$tdir/foo/d0 || error "(8) Fail to rmdir d0"
4584 rm -f $DIR/$tdir/foo/f0 || error "(9) Fail to unlink f0"
4585 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# The client is unmounted and the MDT stopped before e2fsck runs on the
# MDT device.
4587 umount_client $MOUNT || error "(10) Fail to stop client!"
4589 stop $SINGLEMDS || error "(11) Fail to stop MDT0"
4592 run_e2fsck $(facet_host $SINGLEMDS) $MDT_DEVNAME "-y" ||
4593 error "(12) Fail to run e2fsck"
4595 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
4596 error "(13) Fail to start MDT0"
4598 echo "Trigger namespace LFSCK to recover backend orphans"
4599 $START_NAMESPACE -r -A ||
4600 error "(14) Fail to start LFSCK for namespace"
4602 wait_all_targets_blocked namespace completed 15
4604 local repaired=$($SHOW_NAMESPACE |
4605 awk '/^local_lost_found_moved/ { print $2 }')
4606 [ $repaired -ge 4 ] ||
4607 error "(16) Fail to recover backend orphans: $repaired"
4609 mount_client $MOUNT || error "(17) Fail to start client!"
4611 stat $DIR/$tdir/foo/f0 || error "(18) f0 is not recovered"
4613 ls -ail $MOUNT/.lustre/lost+found/
4615 echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
4616 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4617 error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4619 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# The pattern is quoted so that find receives it verbatim; unquoted, the
# shell could expand it against files in the current directory first.
4621 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-*-D-*')
4622 [ ! -z "$cname" ] || error "(20) d0 is not recovered"
4624 stat ${cname}/d1 || error "(21) d0 is not recovered"
4625 stat ${cname}/f1 || error "(22) f1 is not recovered"
4627 run_test 30 "LFSCK can recover the orphans from backend /lost+found"
# Test 31a: create $MDSCOUNT sub-directories in a striped directory while
# the client-side fail_loc 0x1628 (OBD_FAIL_LFSCK_BAD_NAME_HASH) is set with
# fail_val=0, run the namespace LFSCK, and expect at least one
# name_hash_repaired. After a client remount every d$i must be stat-able
# and removable, and the striped directory itself must be removable.
# Skipped when fewer than 2 MDTs are configured.
# NOTE(review): the function header and block terminators (done) are not
# visible in this listing; only comments were added.
4630 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4633 echo "For the name entry under a striped directory, if the name"
4634 echo "hash does not match the shard, then the LFSCK will repair"
4635 echo "the bad name entry"
4638 check_mount_and_prep
4640 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4641 error "(1) Fail to create striped directory"
4643 echo "Inject failure stub on client to simulate the case that"
4644 echo "some name entry should be inserted into other non-first"
4645 echo "shard, but inserted into the first shard by wrong"
# Set locally with $LCTL (no do_facet), i.e. on the node running the test.
4647 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
4648 $LCTL set_param fail_loc=0x1628 fail_val=0
4649 createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
4650 error "(2) Fail to create file under striped directory"
4651 $LCTL set_param fail_loc=0 fail_val=0
4653 echo "Trigger namespace LFSCK to repair bad name hash"
4654 $START_NAMESPACE -r -A ||
4655 error "(3) Fail to start LFSCK for namespace"
4657 wait_all_targets_blocked namespace completed 4
4659 local repaired=$($SHOW_NAMESPACE |
4660 awk '/^name_hash_repaired/ { print $2 }')
4661 [ $repaired -ge 1 ] ||
4662 error "(5) Fail to repair bad name hash: $repaired"
# Remount the client before checking the repaired entries.
4664 umount_client $MOUNT || error "(6) umount failed"
4665 mount_client $MOUNT || error "(7) mount failed"
4667 for ((i = 0; i < $MDSCOUNT; i++)); do
4668 stat $DIR/$tdir/striped_dir/d$i ||
4669 error "(8) Fail to stat d$i after LFSCK"
4670 rmdir $DIR/$tdir/striped_dir/d$i ||
4671 error "(9) Fail to unlink d$i after LFSCK"
4674 rmdir $DIR/$tdir/striped_dir ||
4675 error "(10) Fail to remove the striped directory after LFSCK"
4677 run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
# Test 31b: same scenario as the bad-name-hash test registered as 31a, but
# fail_val=1 is used for the injection and the name_hash_repaired counter is
# read from mds2's lfsck_namespace instead of $SHOW_NAMESPACE.
# Skipped when fewer than 2 MDTs are configured.
# NOTE(review): the function header and block terminators (done) are not
# visible in this listing; only comments were added.
4680 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4683 echo "For the name entry under a striped directory, if the name"
4684 echo "hash does not match the shard, then the LFSCK will repair"
4685 echo "the bad name entry"
4688 check_mount_and_prep
4690 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4691 error "(1) Fail to create striped directory"
4693 echo "Inject failure stub on client to simulate the case that"
4694 echo "some name entry should be inserted into other non-second"
4695 echo "shard, but inserted into the secod shard by wrong"
# Set locally with $LCTL (no do_facet), i.e. on the node running the test.
4697 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
4698 $LCTL set_param fail_loc=0x1628 fail_val=1
4699 createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
4700 error "(2) Fail to create file under striped directory"
4701 $LCTL set_param fail_loc=0 fail_val=0
4703 echo "Trigger namespace LFSCK to repair bad name hash"
4704 $START_NAMESPACE -r -A ||
4705 error "(3) Fail to start LFSCK for namespace"
4707 wait_all_targets_blocked namespace completed 4
# The repair counter is read from mds2 here.
4709 local repaired=$(do_facet mds2 $LCTL get_param -n \
4710 mdd.$(facet_svc mds2).lfsck_namespace |
4711 awk '/^name_hash_repaired/ { print $2 }')
4712 [ $repaired -ge 1 ] ||
4713 error "(5) Fail to repair bad name hash: $repaired"
# Remount the client before checking the repaired entries.
4715 umount_client $MOUNT || error "(6) umount failed"
4716 mount_client $MOUNT || error "(7) mount failed"
4718 for ((i = 0; i < $MDSCOUNT; i++)); do
4719 stat $DIR/$tdir/striped_dir/d$i ||
4720 error "(8) Fail to stat d$i after LFSCK"
4721 rmdir $DIR/$tdir/striped_dir/d$i ||
4722 error "(9) Fail to unlink d$i after LFSCK"
4725 rmdir $DIR/$tdir/striped_dir ||
4726 error "(10) Fail to remove the striped directory after LFSCK"
4728 run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
# Body of the test registered below as 31c. The striped directory is created
# while fail_loc 0x1629 is set on $SINGLEMDS, namespace LFSCK is run, and the
# striped_dirs_repaired counter must be exactly 1.
# Skipped when fewer than 2 MDTs are configured.
4731 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# Describe the scenario in the test output.
4734 echo "For some reason, the master MDT-object of the striped directory"
4735 echo "may lost its master LMV EA. If nobody created files under the"
4736 echo "master directly after the master LMV EA lost, then the LFSCK"
4737 echo "should re-generate the master LMV EA."
4740 check_mount_and_prep
4742 echo "Inject failure stub on MDT0 to simulate the case that the"
4743 echo "master MDT-object of the striped directory lost the LMV EA."
# The stub is armed on $SINGLEMDS only for the duration of the setdirstripe
# call and cleared immediately afterwards.
4745 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
4746 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
4747 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4748 error "(1) Fail to create striped directory"
4749 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4751 echo "Trigger namespace LFSCK to re-generate master LMV EA"
4752 $START_NAMESPACE -r -A ||
4753 error "(2) Fail to start LFSCK for namespace"
# NOTE(review): the trailing 3 is presumably the step number reported by the
# helper on failure (this body has no step (3) error) -- confirm.
4755 wait_all_targets_blocked namespace completed 3
# $SHOW_NAMESPACE is defined outside this chunk; its output is parsed for the
# striped_dirs_repaired line.
4757 local repaired=$($SHOW_NAMESPACE |
4758 awk '/^striped_dirs_repaired/ { print $2 }')
4759 [ $repaired -eq 1 ] ||
4760 error "(4) Fail to re-generate master LMV EA: $repaired"
# Remount the client, then verify the directory.
4762 umount_client $MOUNT || error "(5) umount failed"
4763 mount_client $MOUNT || error "(6) mount failed"
# The directory listing must be empty; any output from ls is treated as a
# sign that the master LMV EA was not repaired.
4765 local empty=$(ls $DIR/$tdir/striped_dir/)
4766 [ -z "$empty" ] || error "(7) The master LMV EA is not repaired: $empty"
4768 rmdir $DIR/$tdir/striped_dir ||
4769 error "(8) Fail to remove the striped directory after LFSCK"
# Hand the test to run_test under id 31c with its one-line description.
4771 run_test 31c "Re-generate the lost master LMV EA for striped directory"
# Body of the test registered below as 31d. Same injection as 31c (fail_loc
# 0x1629 on $SINGLEMDS), but a file is created under the directory before the
# LFSCK run. The test then expects striped_dirs_repaired to be 0 and further
# file creation in the directory to fail.
# Skipped when fewer than 2 MDTs are configured.
4774 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# Describe the scenario in the test output.
4777 echo "For some reason, the master MDT-object of the striped directory"
4778 echo "may lost its master LMV EA. If somebody created files under the"
4779 echo "master directly after the master LMV EA lost, then the LFSCK"
4780 echo "should NOT re-generate the master LMV EA, instead, it should"
4781 echo "change the broken striped dirctory as read-only to prevent"
4782 echo "further damage"
4785 check_mount_and_prep
4787 echo "Inject failure stub on MDT0 to simulate the case that the"
4788 echo "master MDT-object of the striped directory lost the LMV EA."
# The stub is armed on $SINGLEMDS only for the duration of the setdirstripe
# call and cleared immediately afterwards.
4790 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
4791 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
4792 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4793 error "(1) Fail to create striped directory"
4794 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0
# Remount the client, then modify the broken directory before LFSCK runs.
4796 umount_client $MOUNT || error "(2) umount failed"
4797 mount_client $MOUNT || error "(3) mount failed"
4799 touch $DIR/$tdir/striped_dir/dummy ||
4800 error "(4) Fail to touch under broken striped directory"
4802 echo "Trigger namespace LFSCK to find out the inconsistency"
4803 $START_NAMESPACE -r -A ||
4804 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): the trailing 6 is presumably the step number reported by the
# helper on failure (this body has no step (6) error) -- confirm.
4806 wait_all_targets_blocked namespace completed 6
# Here a repair count of 0 is the expected outcome.
4808 local repaired=$($SHOW_NAMESPACE |
4809 awk '/^striped_dirs_repaired/ { print $2 }')
4810 [ $repaired -eq 0 ] ||
4811 error "(7) Re-generate master LMV EA unexpected: $repaired"
# The file created before the LFSCK run must still be reachable.
4813 stat $DIR/$tdir/striped_dir/dummy ||
4814 error "(8) Fail to stat $DIR/$tdir/striped_dir/dummy"
# Inverted check: a successful touch is the failure case.
4816 touch $DIR/$tdir/striped_dir/foo &&
4817 error "(9) The broken striped directory should be read-only"
# Clear the immutable attribute before attempting the removal below.
4819 chattr -i $DIR/$tdir/striped_dir ||
4820 error "(10) Fail to chattr on the broken striped directory"
4822 rmdir $DIR/$tdir/striped_dir ||
4823 error "(11) Fail to remove the striped directory after LFSCK"
# Hand the test to run_test under id 31d with its one-line description.
4825 run_test 31d "Set broken striped directory (modified after broken) as read-only"
# Body of the test registered below as 31e. The striped directory is created
# while fail_loc 0x162a with fail_val=0 is set on $SINGLEMDS, namespace LFSCK
# is run, and the striped_shards_repaired counter must be exactly 1.
# Skipped when fewer than 2 MDTs are configured.
4828 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# Describe the scenario in the test output.
4831 echo "For some reason, the slave MDT-object of the striped directory"
4832 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
4833 echo "slave LMV EA."
4836 check_mount_and_prep
4838 echo "Inject failure stub on MDT0 to simulate the case that the"
4839 echo "slave MDT-object (that resides on the same MDT as the master"
4840 echo "MDT-object resides on) lost the LMV EA."
# fail_val=0 is what distinguishes this test from 31f, which uses fail_val=1.
# presumably fail_val selects the affected shard -- TODO confirm
4842 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
4843 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=0
4844 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4845 error "(1) Fail to create striped directory"
4846 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4848 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
4849 $START_NAMESPACE -r -A ||
4850 error "(2) Fail to start LFSCK for namespace"
# NOTE(review): the trailing 3 is presumably the step number reported by the
# helper on failure (this body has no step (3) error) -- confirm.
4852 wait_all_targets_blocked namespace completed 3
# $SHOW_NAMESPACE is defined outside this chunk; its output is parsed for the
# striped_shards_repaired line.
4854 local repaired=$($SHOW_NAMESPACE |
4855 awk '/^striped_shards_repaired/ { print $2 }')
4856 [ $repaired -eq 1 ] ||
4857 error "(4) Fail to re-generate slave LMV EA: $repaired"
4859 rmdir $DIR/$tdir/striped_dir ||
4860 error "(5) Fail to remove the striped directory after LFSCK"
# Hand the test to run_test under id 31e with its one-line description.
4862 run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
# Body of the test registered below as 31f. Same flow as 31e but with
# fail_val=1, and the striped_shards_repaired counter is read from mds2
# instead of through $SHOW_NAMESPACE; it must be exactly 1.
# Skipped when fewer than 2 MDTs are configured.
4865 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# Describe the scenario in the test output.
4868 echo "For some reason, the slave MDT-object of the striped directory"
4869 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
4870 echo "slave LMV EA."
4873 check_mount_and_prep
4875 echo "Inject failure stub on MDT0 to simulate the case that the"
4876 echo "slave MDT-object (that resides on different MDT as the master"
4877 echo "MDT-object resides on) lost the LMV EA."
# The stub is still set on $SINGLEMDS; only fail_val differs from 31e.
# presumably fail_val selects the affected shard -- TODO confirm
4879 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
4880 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=1
4881 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4882 error "(1) Fail to create striped directory"
4883 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4885 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
4886 $START_NAMESPACE -r -A ||
4887 error "(2) Fail to start LFSCK for namespace"
# NOTE(review): the trailing 3 is presumably the step number reported by the
# helper on failure (this body has no step (3) error) -- confirm.
4889 wait_all_targets_blocked namespace completed 3
# Query the lfsck_namespace parameter of the mds2 service directly.
4891 local repaired=$(do_facet mds2 $LCTL get_param -n \
4892 mdd.$(facet_svc mds2).lfsck_namespace |
4893 awk '/^striped_shards_repaired/ { print $2 }')
4894 [ $repaired -eq 1 ] ||
4895 error "(4) Fail to re-generate slave LMV EA: $repaired"
4897 rmdir $DIR/$tdir/striped_dir ||
4898 error "(5) Fail to remove the striped directory after LFSCK"
# Hand the test to run_test under id 31f with its one-line description.
4900 run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
# Body of the test registered below as 31g. The striped directory is created
# while fail_loc 0x162b with fail_val=0 is set on $SINGLEMDS, namespace LFSCK
# is run, and striped_shards_repaired must be exactly 1. After a client
# remount the directory must accept file creation and removal.
# Skipped when fewer than 2 MDTs are configured.
4903 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# Describe the scenario in the test output.
4906 echo "For some reason, the stripe index in the slave LMV EA is"
4907 echo "corrupted. The LFSCK should repair the slave LMV EA."
4910 check_mount_and_prep
4912 echo "Inject failure stub on MDT0 to simulate the case that the"
4913 echo "slave LMV EA on the first shard of the striped directory"
4914 echo "claims the same index as the second shard claims"
# The stub is armed on $SINGLEMDS only for the duration of the setdirstripe
# call and cleared immediately afterwards.
4916 #define OBD_FAIL_LFSCK_BAD_SLAVE_LMV 0x162b
4917 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162b fail_val=0
4918 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4919 error "(1) Fail to create striped directory"
4920 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4922 echo "Trigger namespace LFSCK to repair the slave LMV EA"
4923 $START_NAMESPACE -r -A ||
4924 error "(2) Fail to start LFSCK for namespace"
# NOTE(review): the trailing 3 is presumably the step number reported by the
# helper on failure (this body has no step (3) error) -- confirm.
4926 wait_all_targets_blocked namespace completed 3
# $SHOW_NAMESPACE is defined outside this chunk; its output is parsed for the
# striped_shards_repaired line.
4928 local repaired=$($SHOW_NAMESPACE |
4929 awk '/^striped_shards_repaired/ { print $2 }')
4930 [ $repaired -eq 1 ] ||
4931 error "(4) Fail to repair slave LMV EA: $repaired"
# Remount the client, then check the directory is usable again.
4933 umount_client $MOUNT || error "(5) umount failed"
4934 mount_client $MOUNT || error "(6) mount failed"
4936 touch $DIR/$tdir/striped_dir/foo ||
4937 error "(7) Fail to touch file after the LFSCK"
4939 rm -f $DIR/$tdir/striped_dir/foo ||
4940 error "(8) Fail to unlink file after the LFSCK"
4942 rmdir $DIR/$tdir/striped_dir ||
4943 error "(9) Fail to remove the striped directory after LFSCK"
# Hand the test to run_test under id 31g with its one-line description.
4945 run_test 31g "Repair the corrupted slave LMV EA"
# Body of the test registered below as 31h. The striped directory is created
# while fail_loc 0x162c with fail_val=0 is set on $SINGLEMDS, namespace LFSCK
# is run, and the dirent_repaired counter must be exactly 1. After a client
# remount the directory must accept file creation and removal.
# Skipped when fewer than 2 MDTs are configured.
4948 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# Describe the scenario in the test output.
4951 echo "For some reason, the shard's name entry in the striped"
4952 echo "directory may be corrupted. The LFSCK should repair the"
4953 echo "bad shard's name entry."
4956 check_mount_and_prep
4958 echo "Inject failure stub on MDT0 to simulate the case that the"
4959 echo "first shard's name entry in the striped directory claims"
4960 echo "the same index as the second shard's name entry claims."
# The stub is armed on $SINGLEMDS only for the duration of the setdirstripe
# call and cleared immediately afterwards.
4962 #define OBD_FAIL_LFSCK_BAD_SLAVE_NAME 0x162c
4963 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162c fail_val=0
4964 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4965 error "(1) Fail to create striped directory"
4966 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4968 echo "Trigger namespace LFSCK to repair the shard's name entry"
4969 $START_NAMESPACE -r -A ||
4970 error "(2) Fail to start LFSCK for namespace"
# NOTE(review): the trailing 3 is presumably the step number reported by the
# helper on failure (this body has no step (3) error) -- confirm.
4972 wait_all_targets_blocked namespace completed 3
# Unlike 31e-31g, the counter checked here is dirent_repaired.
4974 local repaired=$($SHOW_NAMESPACE |
4975 awk '/^dirent_repaired/ { print $2 }')
4976 [ $repaired -eq 1 ] ||
4977 error "(4) Fail to repair shard's name entry: $repaired"
# Remount the client, then check the directory is usable again.
4979 umount_client $MOUNT || error "(5) umount failed"
4980 mount_client $MOUNT || error "(6) mount failed"
4982 touch $DIR/$tdir/striped_dir/foo ||
4983 error "(7) Fail to touch file after the LFSCK"
4985 rm -f $DIR/$tdir/striped_dir/foo ||
4986 error "(8) Fail to unlink file after the LFSCK"
4988 rmdir $DIR/$tdir/striped_dir ||
4989 error "(9) Fail to remove the striped directory after LFSCK"
# Hand the test to run_test under id 31h with its one-line description.
4991 run_test 31h "Repair the corrupted shard's name entry"
# Body of the test registered below as 32. Layout LFSCK is started with a
# failure stub armed on $SINGLEMDS, ost1 is stopped while the scan is still in
# phase 1, and the test then requires that the LFSCK can still be stopped.
# The client is unmounted first; the umount result is not checked here.
4996 umount_client $MOUNT
# NOTE(review): the meaning of fail_val=3 for this fail_loc is not visible
# here -- confirm against the OBD_FAIL_LFSCK_ASSISTANT_DIRECT handler.
4998 #define OBD_FAIL_LFSCK_ASSISTANT_DIRECT 0x162d
4999 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
5000 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
# Right after start, the reported status must be scanning-phase1.
# $SHOW_LAYOUT is defined outside this chunk.
5002 local STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
5003 [ "$STATUS" == "scanning-phase1" ] ||
5004 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
# Take ost1 down while the scan is in progress; stdout of stop is discarded.
5007 stop ost1 > /dev/null || error "(4) Fail to stop OST1!"
# Clear the failure stub before asking the LFSCK to stop.
5009 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
5013 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
# Hand the test to run_test under id 32 with its one-line description.
5015 run_test 32 "stop LFSCK when some OST failed"
# Body of the test registered below as 33. It starts a layout LFSCK and then
# a namespace LFSCK with specific options and checks that the param line of
# each status output matches the expected comma-separated string.
# Layout run: --dryrun -o -r.
5021 $START_LAYOUT --dryrun -o -r ||
5022 error "(1) Fail to start layout LFSCK"
# NOTE(review): the trailing 2 is presumably the step number reported by the
# helper on failure (this body has no step (2) error) -- confirm.
5023 wait_all_targets_blocked layout completed 2
# NOTE(review): all_targets is expected although -A is not passed above;
# presumably -o implies it -- confirm.
5025 local PARAMS=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
5026 [ "$PARAMS" == "dryrun,all_targets,orphan" ] ||
5027 error "(3) Expect 'dryrun,all_targets,orphan', got '$PARAMS'"
# Namespace run: -e abort -A -r; the expected param string is
# failout,all_targets.
5029 $START_NAMESPACE -e abort -A -r ||
5030 error "(4) Fail to start namespace LFSCK"
# NOTE(review): as above, the trailing 5 presumably stands in for the missing
# step (5) error -- confirm.
5031 wait_all_targets_blocked namespace completed 5
5033 PARAMS=$($SHOW_NAMESPACE | awk '/^param/ { print $2 }')
5034 [ "$PARAMS" == "failout,all_targets" ] ||
5035 error "(6) Expect 'failout,all_targets', got '$PARAMS'"
# Hand the test to run_test under id 33 with its one-line description.
5037 run_test 33 "check LFSCK paramters"
# MDSSIZE, OSTSIZE and OSTCOUNT were saved into the SAVED_* variables near the
# top of this script before the tests adjusted them; put the originals back.
5039 # restore MDS/OST size and OST count
5040 MDSSIZE=${SAVED_MDSSIZE}
5041 OSTSIZE=${SAVED_OSTSIZE}
5042 OSTCOUNT=${SAVED_OSTCOUNT}
5044 # finally, clean up the system