# Script preamble: builds the list of excluded tests, loads the test
# framework and configuration, and gates tests on server version and
# backend filesystem type.
# NOTE(review): the number leading each line skips values, so statements
# that sat between the lines shown here are not visible in this excerpt.
3 # Run selected tests by setting ONLY, or by passing them as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
11 # Bug numbers for the tests excluded through ALWAYS_EXCEPT
12 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT"
14 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
15 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
# LUSTRE defaults to the parent of the directory that holds this script.
17 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
18 . $LUSTRE/tests/test-framework.sh
# CONFIG defaults to (and is assigned) $LUSTRE/tests/cfg/$NAME.sh.
20 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
# Leave with status 0 when require_dsh_mds fails.
23 require_dsh_mds || exit 0
27 if ! check_versions; then
28 skip "It is NOT necessary to test lfsck under interoperation mode"
# An MDS older than 2.3.60 causes a skip followed by exit 0.
32 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.3.60) ]] &&
33 skip "Need MDS version at least 2.3.60" && exit 0
# Keep the configured sizes and OST count in SAVED_* variables.
37 SAVED_MDSSIZE=${MDSSIZE}
38 SAVED_OSTSIZE=${OSTSIZE}
39 SAVED_OSTCOUNT=${OSTCOUNT}
40 # use small MDS + OST sizes to speed up formatting
41 # do not make MDSSIZE/OSTSIZE too small, because they affect the default journal size
44 # there is no need for many OSTs; fewer OSTs reduce the format/start/stop overhead
45 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
47 # build up a clean test environment.
# Test 2c is excluded when the MDS version is 2.4.90 or older.
51 [[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.4.90) ]] &&
52 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2c"
# Tests 11-21 are excluded when ost1 is older than 2.5.55.
54 [[ $(lustre_version_code ost1) -lt $(version_code 2.5.55) ]] &&
55 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 11 12 13 14 15 16 17 18 19 20 21"
# Tests 2d, 2e, 3 and 22-31 are excluded when the MDS is older than 2.6.50.
57 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.50) ]] &&
58 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2d 2e 3 22 23 24 25 26 27 28 29 30 31"
60 # DNE does not support striped directory on zfs-based backend yet.
61 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
62 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 31"
# Target names and pre-built command strings shared by the tests below.
# The command strings are expanded unquoted at the call sites (for example
# `$START_NAMESPACE -r`), which lets callers append extra options.
66 MDT_DEV="${FSNAME}-MDT0000"
67 OST_DEV="${FSNAME}-OST0000"
# Device name of the MDT; the "mds" text is removed from $SINGLEMDS to form
# the argument given to mdsdevname.
68 MDT_DEVNAME=$(mdsdevname ${SINGLEMDS//mds/})
# Start namespace / layout LFSCK on the MDT, or layout LFSCK on ost1.
69 START_NAMESPACE="do_facet $SINGLEMDS \
70 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
71 START_LAYOUT="do_facet $SINGLEMDS \
72 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
73 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
74 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
# Dump the lfsck_namespace / lfsck_layout parameter files; the tests pipe
# this output through awk to pick out single fields such as "status".
75 SHOW_NAMESPACE="do_facet $SINGLEMDS \
76 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
77 SHOW_LAYOUT="do_facet $SINGLEMDS \
78 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
79 SHOW_LAYOUT_ON_OST="do_facet ost1 \
80 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# Mount option sets passed to `start`: the plain set, one with "noscrub"
# (used where the tests echo "with disabling OI scrub") and one with
# "skip_lfsck" (used where the tests echo "without resume LFSCK").
81 MOUNT_OPTS_SCRUB="-o user_xattr"
82 MOUNT_OPTS_NOSCRUB="-o user_xattr,noscrub"
83 MOUNT_OPTS_SKIP_LFSCK="-o user_xattr,skip_lfsck"
# Test-data preparation body.
# NOTE(review): the enclosing function header and the assignments of
# ndirs, nfiles and igif are not visible in this excerpt.
# Visible behaviour: copies the test scripts into $DIR/$tdir, then runs
# createmany for $ndirs "d" entries (-d), $ndirs "f" entries (-m), $nfiles
# "f" entries inside each d${i}, and $ndirs "e" entries (-d).
92 echo "preparing... $nfiles * $ndirs files will be created $(date)."
# With a non-empty igif, the whole population runs with fail_loc 0x1504 set.
93 if [ ! -z $igif ]; then
94 #define OBD_FAIL_FID_IGIF 0x1504
95 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
98 cp $LUSTRE/tests/*.sh $DIR/$tdir/
99 if [ $ndirs -gt 0 ]; then
100 createmany -d $DIR/$tdir/d $ndirs
101 createmany -m $DIR/$tdir/f $ndirs
102 if [ $nfiles -gt 0 ]; then
103 for ((i = 0; i < $ndirs; i++)); do
104 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
105 /dev/null || error "createmany $nfiles"
108 createmany -d $DIR/$tdir/e $ndirs
# In the igif case a "dummy" file is touched before fail_loc is cleared.
111 if [ ! -z $igif ]; then
112 touch $DIR/$tdir/dummy
113 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
116 echo "prepared $(date)."
# run_e2fsck_on_mdt0: check MDT0 with run_e2fsck while the MDT is stopped.
# Returns immediately unless the MDS backend is ldiskfs. Otherwise it stops
# $SINGLEMDS, calls run_e2fsck with "-n" on the device from `mdsdevname 1`,
# and finally starts the MDT again with $MOUNT_OPTS_NOSCRUB.
# Failures to stop or start are reported through error (0) and (3).
# NOTE(review): the line that consumes the pipe opened on line 123 is not
# visible, so the exact condition that leads to error (2) cannot be seen.
119 run_e2fsck_on_mdt0() {
120 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] && return
122 stop $SINGLEMDS > /dev/null || error "(0) Fail to the stop MDT0"
123 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n" |
125 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n"
126 error "(2) Detected inconsistency on MDT0"
128 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
129 error "(3) Fail to start MDT0"
# wait_all_targets_blocked: run `lctl lfsck_query -w` on mds1 for LFSCK
# component $com and require that the "${com}_mdts_${status}" counter
# equals $MDSCOUNT. On mismatch the plain query output is printed and
# error is raised with the tag $err.
# NOTE(review): the assignments of com, status and err are not visible;
# the call `wait_all_targets_blocked namespace completed 4` elsewhere in
# the file suggests they are the three positional arguments - confirm.
132 wait_all_targets_blocked() {
137 local count=$(do_facet mds1 \
138 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 -w |
139 awk '/^${com}_mdts_${status}/ { print \\\$2 }'")
140 [[ $count -eq $MDSCOUNT ]] || {
141 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
142 error "($err) only $count of $MDSCOUNT MDTs are in ${status}"
# NOTE(review): the embedded numbering jumps from 142 to 151 here, so the
# lines below probably belong to a second helper whose header is not
# visible. They poll the same counter through wait_update_facet (without
# -w) until it equals $MDSCOUNT, using $LTIME as the last argument, and
# raise error "($err) some MDTs are not in ${status}" otherwise.
151 wait_update_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 |
152 awk '/^${com}_mdts_${status}/ { print \\\$2 }'" \
153 "$MDSCOUNT" $LTIME || {
154 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
155 error "($err) some MDTs are not in ${status}"
# Body registered by `run_test 0`: manual control of the namespace LFSCK.
# Visible flow:
#   1. set fail_loc 0x1600 (OBD_FAIL_LFSCK_DELAY1) with fail_val=3, start
#      with -r and expect status "scanning-phase1";
#   2. stop LFSCK and expect "stopped";
#   3. start again (no -r) and expect "scanning-phase1";
#   4. clear fail_loc/fail_val and wait for "completed";
#   5. updated_phase1 must be 0;
#   6. a second -r run must complete and success_count must grow by one;
#   7. stopall must succeed.
# NOTE(review): the function header and data preparation are not visible.
162 #define OBD_FAIL_LFSCK_DELAY1 0x1600
163 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
164 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
166 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
168 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
169 [ "$STATUS" == "scanning-phase1" ] ||
170 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
172 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
174 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
175 [ "$STATUS" == "stopped" ] ||
176 error "(6) Expect 'stopped', but got '$STATUS'"
178 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
180 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
181 [ "$STATUS" == "scanning-phase1" ] ||
182 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
# Remove the injected fault, then poll the status field until "completed".
184 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
185 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
186 mdd.${MDT_DEV}.lfsck_namespace |
187 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
189 error "(9) unexpected status"
192 local repaired=$($SHOW_NAMESPACE |
193 awk '/^updated_phase1/ { print $2 }')
194 [ $repaired -eq 0 ] ||
195 error "(10) Expect nothing to be repaired, but got: $repaired"
# Record success_count, rerun with -r, and expect exactly one more success.
197 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
198 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
199 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
200 mdd.${MDT_DEV}.lfsck_namespace |
201 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
203 error "(12) unexpected status"
206 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
207 [ $((scanned1 + 1)) -eq $scanned2 ] ||
208 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
210 echo "stopall, should NOT crash LU-3649"
211 stopall || error "(14) Fail to stopall"
213 run_test 0 "Control LFSCK manually"
# Body registered by `run_test 1a`: repair of a crashed FID-in-dirent.
# Skipped unless the MDS backend is ldiskfs. A "dummy" file is touched
# while fail_loc 0x1501 (OBD_FAIL_FID_INDIR) is set; a namespace LFSCK
# started with -r must complete and report exactly one repair. Afterwards
# the client is mounted and `ls` must succeed with fail_loc 0x1505
# (OBD_FAIL_FID_LOOKUP) set.
# NOTE(review): the function header and preparation steps are not visible.
216 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
217 skip "OI Scrub not implemented for ZFS" && return
221 #define OBD_FAIL_FID_INDIR 0x1501
222 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
223 touch $DIR/$tdir/dummy
225 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
227 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
228 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
229 mdd.${MDT_DEV}.lfsck_namespace |
230 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
232 error "(4) unexpected status"
# Prefer the dirent_repaired counter; when that field is absent the
# updated_phase1 counter is read instead.
235 local repaired=$($SHOW_NAMESPACE |
236 awk '/^dirent_repaired/ { print $2 }')
237 # for interop with old server
238 [ -z "$repaired" ] &&
239 repaired=$($SHOW_NAMESPACE |
240 awk '/^updated_phase1/ { print $2 }')
242 [ $repaired -eq 1 ] ||
243 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
247 mount_client $MOUNT || error "(6) Fail to start client!"
249 #define OBD_FAIL_FID_LOOKUP 0x1505
250 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
251 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
253 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
255 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
# Body registered by `run_test 1b`: repair of a missing FID-in-LMA.
# Skipped unless the MDS backend is ldiskfs. A "dummy" file is touched
# while fail_loc 0x1502 (OBD_FAIL_FID_INLMA) is set. The namespace LFSCK
# then runs with fail_loc 0x1506 (OBD_FAIL_FID_NOLMA) set and must report
# exactly one repair. Finally `stat` on the file must succeed with
# fail_loc 0x1505 (OBD_FAIL_FID_LOOKUP) set.
# NOTE(review): the function header and preparation steps are not visible.
259 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
260 skip "OI Scrub not implemented for ZFS" && return
264 #define OBD_FAIL_FID_INLMA 0x1502
265 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
266 touch $DIR/$tdir/dummy
268 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
270 #define OBD_FAIL_FID_NOLMA 0x1506
271 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
272 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
273 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
274 mdd.${MDT_DEV}.lfsck_namespace |
275 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
277 error "(4) unexpected status"
# Prefer dirent_repaired; fall back to updated_phase1 when it is absent.
280 local repaired=$($SHOW_NAMESPACE |
281 awk '/^dirent_repaired/ { print $2 }')
282 # for interop with old server
283 [ -z "$repaired" ] &&
284 repaired=$($SHOW_NAMESPACE |
285 awk '/^updated_phase1/ { print $2 }')
287 [ $repaired -eq 1 ] ||
288 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
290 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
293 mount_client $MOUNT || error "(6) Fail to start client!"
295 #define OBD_FAIL_FID_LOOKUP 0x1505
296 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
297 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
299 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
301 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
# Body registered by `run_test 2a`: repair of a crashed linkEA entry.
# A "dummy" file is touched while fail_loc 0x1603
# (OBD_FAIL_LFSCK_LINKEA_CRASH) is set; a namespace LFSCK started with -r
# must complete and report exactly one repair. The file must then show
# "Links: 1" in stat output, and `lfs fid2path` on its FID must return the
# original path.
# NOTE(review): the function header and preparation steps are not visible.
306 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
307 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
308 touch $DIR/$tdir/dummy
310 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
312 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
313 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
314 mdd.${MDT_DEV}.lfsck_namespace |
315 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
317 error "(4) unexpected status"
# Prefer linkea_repaired; fall back to updated_phase2 when it is absent.
320 local repaired=$($SHOW_NAMESPACE |
321 awk '/^linkea_repaired/ { print $2 }')
322 # for interop with old server
323 [ -z "$repaired" ] &&
324 repaired=$($SHOW_NAMESPACE |
325 awk '/^updated_phase2/ { print $2 }')
327 [ $repaired -eq 1 ] ||
328 error "(5) Fail to repair crashed linkEA: $repaired"
332 mount_client $MOUNT || error "(6) Fail to start client!"
334 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
335 error "(7) Fail to stat $DIR/$tdir/dummy"
# Round trip: path -> FID -> path must give back the same path.
337 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
338 local dummyname=$($LFS fid2path $DIR $dummyfid)
339 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
340 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
342 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
# Body registered by `run_test 2b`: removal of an invalid linkEA entry.
# A "dummy" file is touched while fail_loc 0x1604
# (OBD_FAIL_LFSCK_LINKEA_MORE) is set; a namespace LFSCK started with -r
# must complete with updated_phase2 equal to 1. The file must then show
# "Links: 1", and `lfs fid2path` on its FID must return the original path.
# NOTE(review): the function header and preparation steps are not visible.
348 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
349 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
350 touch $DIR/$tdir/dummy
352 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
354 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
355 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
356 mdd.${MDT_DEV}.lfsck_namespace |
357 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
359 error "(4) unexpected status"
362 local repaired=$($SHOW_NAMESPACE |
363 awk '/^updated_phase2/ { print $2 }')
364 [ $repaired -eq 1 ] ||
365 error "(5) Fail to repair crashed linkEA: $repaired"
369 mount_client $MOUNT || error "(6) Fail to start client!"
371 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
372 error "(7) Fail to stat $DIR/$tdir/dummy"
# Round trip: path -> FID -> path must give back the same path.
374 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
375 local dummyname=$($LFS fid2path $DIR $dummyfid)
376 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
377 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
379 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
# Body registered by `run_test 2c`: removal of a repeated linkEA entry.
# Same sequence as the 2b body, but the file is created with fail_loc
# 0x1605 (OBD_FAIL_LFSCK_LINKEA_MORE2) set: run namespace LFSCK with -r,
# expect updated_phase2 equal to 1, "Links: 1" in stat output, and a
# matching path from `lfs fid2path`.
# NOTE(review): the function header and preparation steps are not visible.
385 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
386 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
387 touch $DIR/$tdir/dummy
389 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
391 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
392 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
393 mdd.${MDT_DEV}.lfsck_namespace |
394 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
396 error "(4) unexpected status"
399 local repaired=$($SHOW_NAMESPACE |
400 awk '/^updated_phase2/ { print $2 }')
401 [ $repaired -eq 1 ] ||
402 error "(5) Fail to repair crashed linkEA: $repaired"
406 mount_client $MOUNT || error "(6) Fail to start client!"
408 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
409 error "(7) Fail to stat $DIR/$tdir/dummy"
# Round trip: path -> FID -> path must give back the same path.
411 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
412 local dummyname=$($LFS fid2path $DIR $dummyfid)
413 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
414 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
416 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
# Body registered by `run_test 2d`: recovery of a missing linkEA entry.
# A "dummy" file is touched while fail_loc 0x161d
# (OBD_FAIL_LFSCK_NO_LINKEA) is set; a namespace LFSCK started with -r
# must complete with linkea_repaired equal to 1. The file must then show
# "Links: 1", and `lfs fid2path` on its FID must return the original path.
# NOTE(review): the function header and preparation steps are not visible.
422 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
423 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
424 touch $DIR/$tdir/dummy
426 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
428 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
429 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
430 mdd.${MDT_DEV}.lfsck_namespace |
431 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
433 error "(4) unexpected status"
436 local repaired=$($SHOW_NAMESPACE |
437 awk '/^linkea_repaired/ { print $2 }')
438 [ $repaired -eq 1 ] ||
439 error "(5) Fail to repair crashed linkEA: $repaired"
443 mount_client $MOUNT || error "(6) Fail to start client!"
445 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
446 error "(7) Fail to stat $DIR/$tdir/dummy"
# Round trip: path -> FID -> path must give back the same path.
448 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
449 local dummyname=$($LFS fid2path $DIR $dummyfid)
450 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
451 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
453 run_test 2d "LFSCK can recover the missing linkEA entry"
# Body registered by `run_test 2e`: linkEA verification for a remote object.
# Requires at least two MDTs. Directory d0 is created with `-i 1` and its
# child d1 with `-i 0` while fail_loc 0x1603 (OBD_FAIL_LFSCK_LINKEA_CRASH)
# is set. A namespace LFSCK started with "-r -A" must reach "completed" on
# all MDTs (checked by wait_all_targets_blocked) and report linkea_repaired
# equal to 1; `lfs fid2path` on d1's FID must return the original path.
# NOTE(review): the function header and preparation steps are not visible.
457 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
461 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
463 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
464 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
465 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
466 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
468 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
470 wait_all_targets_blocked namespace completed 4
472 local repaired=$($SHOW_NAMESPACE |
473 awk '/^linkea_repaired/ { print $2 }')
474 [ $repaired -eq 1 ] ||
475 error "(5) Fail to repair crashed linkEA: $repaired"
# Round trip: path -> FID -> path must give back the same path.
477 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
478 local name=$($LFS fid2path $DIR $fid)
479 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
480 error "(6) Fail to repair linkEA: $fid $name"
482 run_test 2e "namespace LFSCK can verify remote object linkEA"
# Body registered by `run_test 3`: verification of multiple-linked objects.
# Two existing files (d0/f0, d0/f1) get clean hard links under "dummy".
# Two new files under "edir" get hard links created while a fault is
# injected: w0 with fail_loc 0x1603 (OBD_FAIL_LFSCK_LINKEA_CRASH) and w1
# with fail_loc 0x1604 (OBD_FAIL_LFSCK_LINKEA_MORE). After a namespace
# LFSCK started with -r completes, checked_phase2 must be at least 4 and
# multiple_linked_repaired at least 2.
# NOTE(review): the function header and preparation steps are not visible.
488 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
489 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
490 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
492 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
493 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
494 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
496 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
497 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
498 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
500 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
501 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
502 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
504 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
506 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
507 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
508 mdd.${MDT_DEV}.lfsck_namespace |
509 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
511 error "(10) unexpected status"
514 local checked=$($SHOW_NAMESPACE |
515 awk '/^checked_phase2/ { print $2 }')
516 [ $checked -ge 4 ] ||
517 error "(11) Fail to check multiple-linked object: $checked"
519 local repaired=$($SHOW_NAMESPACE |
520 awk '/^multiple_linked_repaired/ { print $2 }')
521 [ $repaired -ge 2 ] ||
522 error "(12) Fail to repair multiple-linked object: $repaired"
524 run_test 3 "LFSCK can verify multiple-linked objects"
# Body registered by `run_test 4`: FID-in-dirent rebuild after an MDT
# file-level backup/restore.
# Skipped unless the MDS backend is ldiskfs. The client is unmounted, the
# MDS stopped, mds_backup_restore run, and the MDS started with the
# noscrub mount options. With fail_loc 0x1601 (OBD_FAIL_LFSCK_DELAY2,
# fail_val=1) set, a namespace LFSCK started with -r must show flags
# "inconsistent" while in "scanning-phase1". After the fault is cleared it
# must complete with empty flags and at least 9 repairs. Finally `ls` must
# succeed with fail_loc 0x1505 (OBD_FAIL_FID_LOOKUP) set.
# NOTE(review): the function header and preparation steps are not visible.
528 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
529 skip "OI Scrub not implemented for ZFS" && return
532 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
533 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
535 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
536 echo "start $SINGLEMDS with disabling OI scrub"
537 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
538 error "(2) Fail to start MDS!"
540 #define OBD_FAIL_LFSCK_DELAY2 0x1601
541 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
542 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
# Poll the "flags" field (not "status") until it reads "inconsistent".
543 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
544 mdd.${MDT_DEV}.lfsck_namespace |
545 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
547 error "(5) unexpected status"
550 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
551 [ "$STATUS" == "scanning-phase1" ] ||
552 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
554 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
555 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
556 mdd.${MDT_DEV}.lfsck_namespace |
557 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
559 error "(7) unexpected status"
# NOTE(review): FLAGS is assigned without a visible `local` declaration.
562 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
563 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
# Prefer dirent_repaired; fall back to updated_phase1 when it is absent.
565 local repaired=$($SHOW_NAMESPACE |
566 awk '/^dirent_repaired/ { print $2 }')
567 # for interop with old server
568 [ -z "$repaired" ] &&
569 repaired=$($SHOW_NAMESPACE |
570 awk '/^updated_phase1/ { print $2 }')
572 [ $repaired -ge 9 ] ||
573 error "(9) Fail to re-generate FID-in-dirent: $repaired"
577 mount_client $MOUNT || error "(10) Fail to start client!"
579 #define OBD_FAIL_FID_LOOKUP 0x1505
580 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
581 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
582 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
584 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
# Body registered by `run_test 5`: upgrading of IGIF objects.
# Skipped unless the MDS backend is ldiskfs. Same outline as the test 4
# body, with these visible differences: mds_backup_restore receives an
# extra argument "1", the expected flags while scanning are
# "inconsistent,upgrade", at least 2 repairs are required, and the final
# checks are `stat` on the dummy file, `ls` on the directory, and an
# `lfs path2fid` / `lfs fid2path` round trip.
# NOTE(review): the function header and preparation steps are not visible.
588 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
589 skip "OI Scrub not implemented for ZFS" && return
592 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
593 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
595 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
596 echo "start $SINGLEMDS with disabling OI scrub"
597 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
598 error "(2) Fail to start MDS!"
600 #define OBD_FAIL_LFSCK_DELAY2 0x1601
601 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
602 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
# Poll the "flags" field until it reads "inconsistent,upgrade".
603 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
604 mdd.${MDT_DEV}.lfsck_namespace |
605 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
607 error "(5) unexpected status"
610 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
611 [ "$STATUS" == "scanning-phase1" ] ||
612 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
614 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
615 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
616 mdd.${MDT_DEV}.lfsck_namespace |
617 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
619 error "(7) unexpected status"
# NOTE(review): FLAGS is assigned without a visible `local` declaration.
622 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
623 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
# Prefer dirent_repaired; fall back to updated_phase1 when it is absent.
625 local repaired=$($SHOW_NAMESPACE |
626 awk '/^dirent_repaired/ { print $2 }')
627 # for interop with old server
628 [ -z "$repaired" ] &&
629 repaired=$($SHOW_NAMESPACE |
630 awk '/^updated_phase1/ { print $2 }')
632 [ $repaired -ge 2 ] ||
633 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
637 mount_client $MOUNT || error "(10) Fail to start client!"
639 #define OBD_FAIL_FID_LOOKUP 0x1505
640 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
641 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
643 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
645 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Round trip: path -> FID -> path must give back the same path.
646 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
647 local dummyname=$($LFS fid2path $DIR $dummyfid)
648 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
649 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
651 run_test 5 "LFSCK can handle IGIF object upgrading"
# Body registered by `run_test 6a`: resume from the last checkpoint (1).
# With fail_loc 0x1600 (OBD_FAIL_LFSCK_DELAY1, fail_val=1) set, a namespace
# LFSCK started with -r must be in "scanning-phase1". fail_loc 0x80001608
# (OBD_FAIL_LFSCK_FATAL1) then drives it to "failed", and the value of
# last_checkpoint_position is saved as POS0. LFSCK is started again without
# -r under the same delay; its latest_start_position (POS1) must be greater
# than POS0. After the fault is cleared the run must reach "completed".
# NOTE(review): the function header, the sleep announced by the comment on
# line 664, and the tail of both POS pipelines are not visible.
656 #define OBD_FAIL_LFSCK_DELAY1 0x1600
657 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
658 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
660 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
661 [ "$STATUS" == "scanning-phase1" ] ||
662 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
664 # Sleep 3 sec to guarantee at least one object processed by LFSCK
666 # Fail the LFSCK to guarantee there is at least one checkpoint
667 #define OBD_FAIL_LFSCK_FATAL1 0x1608
668 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
669 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
670 mdd.${MDT_DEV}.lfsck_namespace |
671 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
673 error "(4) unexpected status"
676 local POS0=$($SHOW_NAMESPACE |
677 awk '/^last_checkpoint_position/ { print $2 }' |
680 #define OBD_FAIL_LFSCK_DELAY1 0x1600
681 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
682 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
684 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
685 [ "$STATUS" == "scanning-phase1" ] ||
686 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
688 local POS1=$($SHOW_NAMESPACE |
689 awk '/^latest_start_position/ { print $2 }' |
691 [[ $POS0 -lt $POS1 ]] ||
692 error "(7) Expect larger than: $POS0, but got $POS1"
694 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
695 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
696 mdd.${MDT_DEV}.lfsck_namespace |
697 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
699 error "(8) unexpected status"
702 run_test 6a "LFSCK resumes from last checkpoint (1)"
# Body registered by `run_test 6b`: resume from the last checkpoint (2).
# Same outline as the 6a body, using fail_loc 0x1601
# (OBD_FAIL_LFSCK_DELAY2) and 0x80001609 (OBD_FAIL_LFSCK_FATAL2). Two
# values are read from each position line: awk field 2 (O_POS*) and awk
# field 4 (D_POS*). When either D_POS value is "N/A" the O_POS values are
# compared; the D_POS comparison on line 752 belongs to the other branch.
# NOTE(review): the function header, the sleep announced by the comment on
# line 715, the tail of both O_POS pipelines and the else/fi lines are not
# visible.
707 #define OBD_FAIL_LFSCK_DELAY2 0x1601
708 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
709 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
711 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
712 [ "$STATUS" == "scanning-phase1" ] ||
713 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
715 # Sleep 5 sec to guarantee that we are in the directory scanning
717 # Fail the LFSCK to guarantee there is at least one checkpoint
718 #define OBD_FAIL_LFSCK_FATAL2 0x1609
719 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
720 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
721 mdd.${MDT_DEV}.lfsck_namespace |
722 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
724 error "(4) unexpected status"
727 local O_POS0=$($SHOW_NAMESPACE |
728 awk '/^last_checkpoint_position/ { print $2 }' |
731 local D_POS0=$($SHOW_NAMESPACE |
732 awk '/^last_checkpoint_position/ { print $4 }')
734 #define OBD_FAIL_LFSCK_DELAY2 0x1601
735 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
736 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
738 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
739 [ "$STATUS" == "scanning-phase1" ] ||
740 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
742 local O_POS1=$($SHOW_NAMESPACE |
743 awk '/^latest_start_position/ { print $2 }' |
745 local D_POS1=$($SHOW_NAMESPACE |
746 awk '/^latest_start_position/ { print $4 }')
748 if [ "$D_POS0" == "N/A" -o "$D_POS1" == "N/A" ]; then
749 [[ $O_POS0 -lt $O_POS1 ]] ||
750 error "(7.1) $O_POS1 is not larger than $O_POS0"
752 [[ $D_POS0 -lt $D_POS1 ]] ||
753 error "(7.2) $D_POS1 is not larger than $D_POS0"
756 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
757 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
758 mdd.${MDT_DEV}.lfsck_namespace |
759 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
761 error "(8) unexpected status"
764 run_test 6b "LFSCK resumes from last checkpoint (2)"
# Body registered by `run_test 7a`: automatic LFSCK restart after an MDS
# remount (1).
# With fail_loc 0x1601 (OBD_FAIL_LFSCK_DELAY2, fail_val=1) set, a namespace
# LFSCK started with -r must be in "scanning-phase1". The MDS is then
# stopped without stopping LFSCK, the fault is cleared, and the MDS is
# started with $MOUNT_OPTS_SCRUB; the status must reach "completed" with
# no further lfsck_start call.
# NOTE(review): the function header and the sleep announced by the comment
# on line 779 are not visible.
771 #define OBD_FAIL_LFSCK_DELAY2 0x1601
772 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
773 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
775 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
776 [ "$STATUS" == "scanning-phase1" ] ||
777 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
779 # Sleep 3 sec to guarantee at least one object processed by LFSCK
781 echo "stop $SINGLEMDS"
782 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop MDS!"
784 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
785 echo "start $SINGLEMDS"
786 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
787 error "(5) Fail to start MDS!"
789 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
790 mdd.${MDT_DEV}.lfsck_namespace |
791 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
793 error "(6) unexpected status"
796 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
# Body registered by `run_test 7b`: automatic LFSCK restart after an MDS
# remount (2).
# Twenty files dummy0..dummy19 are touched while fail_loc 0x1604
# (OBD_FAIL_LFSCK_LINKEA_MORE) is set. With fail_loc 0x1602
# (OBD_FAIL_LFSCK_DELAY3, fail_val=1) set, a namespace LFSCK started with
# -r must reach "scanning-phase2". The MDS is then stopped, the fault
# cleared, and the MDS started with $MOUNT_OPTS_SCRUB; the status must
# reach "completed" with no further lfsck_start call.
# NOTE(review): the function header and the loop's closing line are not
# visible.
802 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
803 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
804 for ((i = 0; i < 20; i++)); do
805 touch $DIR/$tdir/dummy${i}
808 #define OBD_FAIL_LFSCK_DELAY3 0x1602
809 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
810 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
811 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
812 mdd.${MDT_DEV}.lfsck_namespace |
813 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
815 error "(4) unexpected status"
819 echo "stop $SINGLEMDS"
820 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop MDS!"
822 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
823 echo "start $SINGLEMDS"
824 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
825 error "(6) Fail to start MDS!"
827 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
828 mdd.${MDT_DEV}.lfsck_namespace |
829 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
831 error "(7) unexpected status"
834 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
# Body registered by `run_test 8`: walk through the LFSCK status values.
# Statuses asserted by the visible lines, in order:
#   init            - after formatall (the setup between is not visible);
#   scanning-phase1 - started under OBD_FAIL_LFSCK_DELAY2 (fail_val=2);
#   stopped         - after lfsck_stop;
#   scanning-phase1 - started again;
#   failed          - after fail_loc 0x80001609 (OBD_FAIL_LFSCK_FATAL2);
#   scanning-phase1 - started under OBD_FAIL_LFSCK_DELAY1;
#   crashed         - after an MDS stop/start with OBD_FAIL_LFSCK_CRASH set
#                     before the stop and OBD_FAIL_LFSCK_NO_AUTO set before
#                     the start;
#   scanning-phase1 - started under OBD_FAIL_LFSCK_DELAY2;
#   paused          - after an MDS stop/start with OBD_FAIL_LFSCK_NO_AUTO;
#   paused          - after an MDS stop/start with the skip_lfsck options;
#   scanning-phase2 - started under OBD_FAIL_LFSCK_DELAY3, with flags
#                     "scanned-once,inconsistent";
#   completed       - after the fault is cleared, with empty flags.
# After each MDS start the body polls mdt.*.recovery_status until it is no
# longer "RECOVERING", bounded by max_recovery_time.
# NOTE(review): the function header, the `timer` initialisation/increment,
# and the loop/brace closers are not visible in this excerpt.
839 formatall > /dev/null
845 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
846 [ "$STATUS" == "init" ] ||
847 error "(2) Expect 'init', but got '$STATUS'"
# Create one directory and five files, each under a linkEA fault injection.
849 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
850 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
851 mkdir $DIR/$tdir/crashed
853 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
854 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
855 for ((i = 0; i < 5; i++)); do
856 touch $DIR/$tdir/dummy${i}
859 umount_client $MOUNT || error "(3) Fail to stop client!"
861 #define OBD_FAIL_LFSCK_DELAY2 0x1601
862 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
863 $START_NAMESPACE || error "(4) Fail to start LFSCK for namespace!"
865 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
866 [ "$STATUS" == "scanning-phase1" ] ||
867 error "(5) Expect 'scanning-phase1', but got '$STATUS'"
869 $STOP_LFSCK || error "(6) Fail to stop LFSCK!"
871 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
872 [ "$STATUS" == "stopped" ] ||
873 error "(7) Expect 'stopped', but got '$STATUS'"
875 $START_NAMESPACE || error "(8) Fail to start LFSCK for namespace!"
877 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
878 [ "$STATUS" == "scanning-phase1" ] ||
879 error "(9) Expect 'scanning-phase1', but got '$STATUS'"
881 #define OBD_FAIL_LFSCK_FATAL2 0x1609
882 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
883 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
884 mdd.${MDT_DEV}.lfsck_namespace |
885 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
887 error "(10) unexpected status"
890 #define OBD_FAIL_LFSCK_DELAY1 0x1600
891 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
892 $START_NAMESPACE || error "(11) Fail to start LFSCK for namespace!"
894 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
895 [ "$STATUS" == "scanning-phase1" ] ||
896 error "(12) Expect 'scanning-phase1', but got '$STATUS'"
898 #define OBD_FAIL_LFSCK_CRASH 0x160a
899 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
902 echo "stop $SINGLEMDS"
903 stop $SINGLEMDS > /dev/null || error "(13) Fail to stop MDS!"
905 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
906 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
908 echo "start $SINGLEMDS"
909 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
910 error "(14) Fail to start MDS!"
# Wait for MDT recovery to leave the RECOVERING state, bounded by $timeout.
912 local timeout=$(max_recovery_time)
915 while [ $timer -lt $timeout ]; do
916 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
917 mdt.${MDT_DEV}.recovery_status |
918 awk '/^status/ { print \\\$2 }'")
919 [ "$STATUS" != "RECOVERING" ] && break;
924 [ $timer != $timeout ] ||
925 error "(14.1) recovery timeout"
927 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
928 [ "$STATUS" == "crashed" ] ||
929 error "(15) Expect 'crashed', but got '$STATUS'"
931 #define OBD_FAIL_LFSCK_DELAY2 0x1601
932 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
933 $START_NAMESPACE || error "(16) Fail to start LFSCK for namespace!"
935 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
936 [ "$STATUS" == "scanning-phase1" ] ||
937 error "(17) Expect 'scanning-phase1', but got '$STATUS'"
939 echo "stop $SINGLEMDS"
940 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop MDS!"
942 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
943 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
945 echo "start $SINGLEMDS"
946 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
947 error "(19) Fail to start MDS!"
# Same recovery wait as above.
950 while [ $timer -lt $timeout ]; do
951 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
952 mdt.${MDT_DEV}.recovery_status |
953 awk '/^status/ { print \\\$2 }'")
954 [ "$STATUS" != "RECOVERING" ] && break;
959 [ $timer != $timeout ] ||
960 error "(19.1) recovery timeout"
962 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
963 [ "$STATUS" == "paused" ] ||
964 error "(20) Expect 'paused', but got '$STATUS'"
966 echo "stop $SINGLEMDS"
967 stop $SINGLEMDS > /dev/null || error "(20.1) Fail to stop MDS!"
969 echo "start $SINGLEMDS without resume LFSCK"
970 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SKIP_LFSCK > /dev/null ||
971 error "(20.2) Fail to start MDS!"
# Same recovery wait as above.
974 while [ $timer -lt $timeout ]; do
975 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
976 mdt.${MDT_DEV}.recovery_status |
977 awk '/^status/ { print \\\$2 }'")
978 [ "$STATUS" != "RECOVERING" ] && break;
983 [ $timer != $timeout ] ||
984 error "(20.3) recovery timeout"
986 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
987 [ "$STATUS" == "paused" ] ||
988 error "(20.4) Expect 'paused', but got '$STATUS'"
990 #define OBD_FAIL_LFSCK_DELAY3 0x1602
991 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
993 $START_NAMESPACE || error "(21) Fail to start LFSCK for namespace!"
994 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
995 mdd.${MDT_DEV}.lfsck_namespace |
996 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
998 error "(22) unexpected status"
1001 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1002 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
1003 error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
1005 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1006 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1007 mdd.${MDT_DEV}.lfsck_namespace |
1008 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1010 error "(24) unexpected status"
1013 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1014 [ -z "$FLAGS" ] || error "(25) Expect empty flags, but got '$FLAGS'"
1016 run_test 8 "LFSCK state machine"
# Body registered by `run_test 9a`: LFSCK speed control (1).
# Skipped when /proc/cpuinfo has no "processor ... : 1" entry, and when
# $server_version is below 2.7.50. The body creates 5000 files under
# $DIR/$tdir/lfsck, starts the layout LFSCK with "-r -s $BASE_SPEED1"
# (100), and checks average_speed_phase1 against an upper bound. It then
# writes $BASE_SPEED2 (300) to lfsck_speed_limit and checks the average
# against a lower and an upper bound. Finally the limit is set to 0 and
# the run must reach "completed".
# All bounds use integer arithmetic with a 20% margin (x12/10, x8/10).
# NOTE(review): the function header, the assignments of server_version,
# RUN_TIME1, RUN_TIME2 and TIME_DIFF, and any sleeps between measurements
# are not visible in this excerpt.
1019 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1020 skip "Testing on UP system, the speed may be inaccurate."
1024 [[ $server_version -ge $(version_code 2.7.50) ]] ||
1025 { skip "Need MDS version >= 2.7.50"; return; }
1027 check_mount_and_prep
1028 $LFS mkdir -i 0 $DIR/$tdir/lfsck || error "(1) Fail to mkdir lfsck"
1029 $LFS setstripe -c 1 -i -1 $DIR/$tdir/lfsck
1030 createmany -o $DIR/$tdir/lfsck/f 5000
1032 local BASE_SPEED1=100
1034 $START_LAYOUT -r -s $BASE_SPEED1 || error "(2) Fail to start LFSCK!"
# NOTE(review): STATUS is assigned without a visible `local` declaration.
1037 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
1038 [ "$STATUS" == "scanning-phase1" ] ||
1039 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
1041 local SPEED=$($SHOW_LAYOUT |
1042 awk '/^average_speed_phase1/ { print $2 }')
1044 # There may be time error, normally it should be less than 2 seconds.
1045 # We allow another 20% schedule error.
1047 # MAX_MARGIN = 1.2 = 12 / 10
1048 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1049 RUN_TIME1 * 12 / 10))
1050 [ $SPEED -lt $MAX_SPEED ] ||
1051 error "(4) Got speed $SPEED, expected less than $MAX_SPEED"
1053 # adjust speed limit
1054 local BASE_SPEED2=300
1056 do_facet $SINGLEMDS \
1057 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1060 SPEED=$($SHOW_LAYOUT | awk '/^average_speed_phase1/ { print $2 }')
1061 # MIN_MARGIN = 0.8 = 8 / 10
1062 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1063 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1064 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
# On a non-ldiskfs backend a too-low speed is reported via error_ignore
# (LU-5624).
1065 [ $SPEED -gt $MIN_SPEED ] || {
1066 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1067 error_ignore LU-5624 \
1068 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
1071 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
1075 # MAX_MARGIN = 1.2 = 12 / 10
1076 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1077 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1078 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
1079 [ $SPEED -lt $MAX_SPEED ] ||
1080 error "(6) Got speed $SPEED, expected less than $MAX_SPEED"
# Set the speed limit to 0 and wait for the layout LFSCK to complete.
1082 do_facet $SINGLEMDS \
1083 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1085 wait_update_facet $SINGLEMDS \
1086 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout |
1087 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
1088 error "(7) Failed to get expected 'completed'"
1090 run_test 9a "LFSCK speed control (1)"
# test_9b body: same speed-limit check as "LFSCK speed control (1)", but
# against average_speed_phase2 of the namespace LFSCK.
# NOTE(review): server_version, RUN_TIME1, RUN_TIME2, TIME_DIFF and
# SHOW_NAMESPACE are not assigned in the lines visible here - presumably
# set elsewhere in the file; confirm.

# Skip when /proc/cpuinfo has no "processor ... : 1" entry.
1093 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1094 skip "Testing on UP system, the speed may be inaccurate."
1098 [[ $server_version -ge $(version_code 2.7.50) ]] ||
1099 { skip "Need MDS version >= 2.7.50"; return; }
1103 echo "Preparing another 50 * 50 files (with error) at $(date)."
1104 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
# All creations below happen while fail_loc 0x1604 is set on the MDS.
1105 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1106 createmany -d $DIR/$tdir/d 50
1107 createmany -m $DIR/$tdir/f 50
1108 for ((i = 0; i < 50; i++)); do
1109 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
1112 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
# With fail_loc 0x160c set, a reset (-r) namespace scan is started and is
# expected to reach status "stopped" within 10.
1113 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1114 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
1115 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1116 mdd.${MDT_DEV}.lfsck_namespace |
1117 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1119 error "(5) unexpected status"
1122 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1123 echo "Prepared at $(date)."
1125 local BASE_SPEED1=50
# Start again without -r, limited to BASE_SPEED1; the scan is expected to
# be in "scanning-phase2" when sampled.
1127 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
1130 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1131 [ "$STATUS" == "scanning-phase2" ] ||
1132 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
1134 local SPEED=$($SHOW_NAMESPACE |
1135 awk '/^average_speed_phase2/ { print $2 }')
1136 # The elapsed time may be off, normally by less than 2 seconds.
1137 # We allow another 20% for scheduling error.
1139 # MAX_MARGIN = 1.2 = 12 / 10
1140 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1141 RUN_TIME1 * 12 / 10))
1142 [ $SPEED -lt $MAX_SPEED ] ||
1143 error "(8) Got speed $SPEED, expected less than $MAX_SPEED"
1145 # adjust speed limit
1146 local BASE_SPEED2=150
1148 do_facet $SINGLEMDS \
1149 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1152 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
1153 # MIN_MARGIN = 0.8 = 8 / 10
# Lower bound: both limits weighted by their run times (each shortened by
# TIME_DIFF), divided by the total run time, then scaled by 8 / 10.
1154 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1155 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1156 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
# On a non-ldiskfs MDS a too-low speed goes through error_ignore (LU-5624).
# NOTE(review): the other branch carrying message (9.2) is only partly
# visible here - presumably a plain error; confirm.
1157 [ $SPEED -gt $MIN_SPEED ] || {
1158 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1159 error_ignore LU-5624 \
1160 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1163 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
1167 # MAX_MARGIN = 1.2 = 12 / 10
1168 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1169 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1170 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
1171 [ $SPEED -lt $MAX_SPEED ] ||
1172 error "(10) Got speed $SPEED, expected less than $MAX_SPEED"
# Set the speed limit to 0 and wait (up to 32) for status "completed".
1174 do_facet $SINGLEMDS \
1175 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1176 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1177 mdd.${MDT_DEV}.lfsck_namespace |
1178 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1180 error "(11) unexpected status"
1183 run_test 9b "LFSCK speed control (2)"
# test_10 body: run ordinary client operations (ls, touch, mkdir, unlink,
# rm, mv, hard link, symlink) while a speed-limited namespace LFSCK is in
# "scanning-phase1", then let the scan run to "completed".
# NOTE(review): SHOW_NAMESPACE is not assigned in the lines visible here.

# Only run when the MDS backend is ldiskfs.
1187 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
1188 skip "lookup(..)/linkea on ZFS issue" && return
1192 echo "Preparing more files with error at $(date)."
1193 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1194 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
# Even indices 0, 2, ..., 998 are created under fail_loc 0x1603.
1196 for ((i = 0; i < 1000; i = $((i+2)))); do
1197 mkdir -p $DIR/$tdir/d${i}
1198 touch $DIR/$tdir/f${i}
1199 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1202 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1203 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
# Odd indices 1, 3, ..., 999 are created under fail_loc 0x1604.
1205 for ((i = 1; i < 1000; i = $((i+2)))); do
1206 mkdir -p $DIR/$tdir/d${i}
1207 touch $DIR/$tdir/f${i}
1208 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1211 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1212 echo "Prepared at $(date)."
1214 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
# Remount the client before starting the scan.
1216 umount_client $MOUNT
1217 mount_client $MOUNT || error "(3) Fail to start client!"
# Reset scan limited to speed 100 so it is still in phase 1 during the
# operations below (checked before and after them).
1219 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1222 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1223 [ "$STATUS" == "scanning-phase1" ] ||
1224 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
1226 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1228 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1230 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1232 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1234 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1236 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1238 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1240 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1241 error "(14) Fail to softlink!"
1243 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1244 [ "$STATUS" == "scanning-phase1" ] ||
1245 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
# Set the speed limit to 0 and wait (up to 32) for status "completed".
1247 do_facet $SINGLEMDS \
1248 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1249 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1250 mdd.${MDT_DEV}.lfsck_namespace |
1251 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1253 error "(16) unexpected status"
1256 run_test 10 "System is available during LFSCK scanning"
# Remove the LAST_ID file (and d0/0) under O/${idx} on the backing
# filesystem of ost${ost}, by mounting the target locally, deleting the
# files with rm -fv, and unmounting again.
# Returns 1 if the local mount fails and 2 if the unmount fails.
# NOTE(review): the assignments of ${ost} and ${idx} are not visible here;
# presumably they come from $1 and $2 (the visible caller passes "1 0") -
# confirm.
1259 ost_remove_lastid() {
# Run the removal on the OST's node through do_facet.
1262 local rcmd="do_facet ost${ost}"
1264 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1266 # step 1: local mount
1267 mount_fstype ost${ost} || return 1
1268 # step 2: remove the specified LAST_ID
# The brace expansion yields .../O/${idx}/LAST_ID and .../O/${idx}/d0/0.
1269 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
1271 unmount_fstype ost${ost} || return 2
# test_11a body: remove LAST_ID on ost1, restart the OST, and check that a
# layout LFSCK started on the OST first reports the "crashed_lastid" flag
# and ends "completed" with empty flags.
# NOTE(review): the steps between file creation and ost_remove_lastid are
# not visible here, and SHOW_LAYOUT_ON_OST is not assigned in the visible
# lines.
1275 check_mount_and_prep
1276 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1277 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
1282 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
1284 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1285 error "(2) Fail to start ost1"
1287 #define OBD_FAIL_LFSCK_DELAY4 0x160e
# fail_loc 0x160e with fail_val=3 is set while the flag is sampled below,
# and cleared afterwards.
1288 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1290 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1291 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
1293 wait_update_facet ost1 "$LCTL get_param -n \
1294 obdfilter.${OST_DEV}.lfsck_layout |
1295 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1297 error "(5) unexpected status"
1300 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1302 wait_update_facet ost1 "$LCTL get_param -n \
1303 obdfilter.${OST_DEV}.lfsck_layout |
1304 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1306 error "(6) unexpected status"
1309 echo "the LAST_ID(s) should have been rebuilt"
1310 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1311 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
1313 run_test 11a "LFSCK can rebuild lost last_id"
# test_11b body: create objects on ost1 while fail_loc 0x160d is set,
# restart the OST, check that the last_id it reports is smaller than the one
# recorded before the restart, then run a layout LFSCK on the OST and check
# that after another restart last_id equals the recorded value again.
# NOTE(review): ost1_svc is not assigned in the lines visible here.
1316 check_mount_and_prep
1317 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1319 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1320 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1321 do_facet ost1 $LCTL set_param fail_loc=0x160d
1323 local count=$(precreated_ost_obj_count 0 0)
# Create 32 more files than the count returned above.
1325 createmany -o $DIR/$tdir/f $((count + 32))
# Record the last_id reported for the sequence read from
# prealloc_last_seq; the output is "<seq>:<id>" split on ':'.
1327 local proc_path="${FSNAME}-OST0000-osc-MDT0000"
1328 local seq=$(do_facet mds1 $LCTL get_param -n \
1329 osp.${proc_path}.prealloc_last_seq)
1330 local lastid1=$(do_facet ost1 "lctl get_param -n \
1331 obdfilter.${ost1_svc}.last_id" | grep $seq |
1332 awk -F: '{ print $2 }')
1334 umount_client $MOUNT
1335 stop ost1 || error "(1) Fail to stop ost1"
1337 #define OBD_FAIL_OST_ENOSPC 0x215
1338 do_facet ost1 $LCTL set_param fail_loc=0x215
1340 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1341 error "(2) Fail to start ost1"
# Poll up to 60 times until last_id for $seq is readable again.
# NOTE(review): the rest of the loop body is not visible here.
1343 for ((i = 0; i < 60; i++)); do
1344 lastid2=$(do_facet ost1 "lctl get_param -n \
1345 obdfilter.${ost1_svc}.last_id" | grep $seq |
1346 awk -F: '{ print $2 }')
1347 [ ! -z $lastid2 ] && break;
1351 echo "the on-disk LAST_ID should be smaller than the expected one"
1352 [ $lastid1 -gt $lastid2 ] ||
1353 error "(4) expect lastid1 [ $lastid1 ] > lastid2 [ $lastid2 ]"
1355 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1356 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1358 wait_update_facet ost1 "$LCTL get_param -n \
1359 obdfilter.${OST_DEV}.lfsck_layout |
1360 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1362 error "(6) unexpected status"
# Restart the OST again before comparing last_id with lastid1.
1365 stop ost1 || error "(7) Fail to stop ost1"
1367 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1368 error "(8) Fail to start ost1"
1370 echo "the on-disk LAST_ID should have been rebuilt"
# On timeout, dump the raw last_id output before failing.
1371 wait_update_facet ost1 "$LCTL get_param -n \
1372 obdfilter.${ost1_svc}.last_id | grep $seq |
1373 awk -F: '{ print \\\$2 }'" "$lastid1" 60 || {
1374 do_facet ost1 $LCTL get_param -n \
1375 obdfilter.${ost1_svc}.last_id
1376 error "(9) expect lastid1 $seq:$lastid1"
1379 do_facet ost1 $LCTL set_param fail_loc=0
1380 stopall || error "(10) Fail to stopall"
1382 run_test 11b "LFSCK can rebuild crashed last_id"
# test_12a body: drive namespace and then layout LFSCK on all targets with
# a single "lfsck_start -A" / "lfsck_stop -A" issued on mds1, checking the
# status reached after each start, stop and restart.
# NOTE(review): wait_all_targets and wait_all_targets_blocked are defined
# outside the visible lines; their last argument looks like the step number
# used in error reports - confirm.
1385 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
1387 check_mount_and_prep
# One directory per MDT (index k - 1), each holding 100 files.
1388 for k in $(seq $MDSCOUNT); do
1389 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1390 createmany -o $DIR/$tdir/${k}/f 100 ||
1391 error "(0) Fail to create 100 files."
1394 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1395 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1396 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1398 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1399 wait_all_targets namespace scanning-phase1 3
1401 echo "Stop namespace LFSCK on all targets by single lctl command."
1402 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1403 error "(4) Fail to stop LFSCK on all devices!"
1405 echo "All the LFSCK targets should be in 'stopped' status."
1406 wait_all_targets_blocked namespace stopped 5
1408 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1409 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1410 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1412 echo "All the LFSCK targets should be in 'completed' status."
1413 wait_all_targets_blocked namespace completed 7
# The layout half of the test runs with full debug logging enabled.
1415 start_full_debug_logging
1417 echo "Start layout LFSCK on all targets by single command (-s 1)."
1418 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1419 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1421 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1422 wait_all_targets layout scanning-phase1 9
1424 echo "Stop layout LFSCK on all targets by single lctl command."
1425 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1426 error "(10) Fail to stop LFSCK on all devices!"
1428 echo "All the LFSCK targets should be in 'stopped' status."
1429 wait_all_targets_blocked layout stopped 11
# Additionally read the layout LFSCK status directly from every OST.
1431 for k in $(seq $OSTCOUNT); do
1432 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1433 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1434 awk '/^status/ { print $2 }')
1435 [ "$STATUS" == "stopped" ] ||
1436 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1439 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1440 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1441 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1443 echo "All the LFSCK targets should be in 'completed' status."
1444 wait_all_targets_blocked layout completed 14
1446 stop_full_debug_logging
1448 run_test 12a "single command to trigger LFSCK on all devices"
# test_12b body: start LFSCK on mds1 without '-M' and expect both the
# namespace and layout scans to complete; then, if mds1 lists more than one
# mdt device, expect "lfsck_start -t layout -r" without '-M'/'-A' to fail.
1451 check_mount_and_prep
1453 echo "Start LFSCK without '-M' specified."
1454 do_facet mds1 $LCTL lfsck_start -A -r ||
1455 error "(0) Fail to start LFSCK without '-M'"
1457 wait_all_targets_blocked namespace completed 1
1458 wait_all_targets_blocked layout completed 2
# Count the lines of "lctl dl" on mds1 whose third column contains "mdt".
1460 local count=$(do_facet mds1 $LCTL dl |
1461 awk '{ print $3 }' | grep mdt | wc -l)
1462 if [ $count -gt 1 ]; then
1464 echo "Start layout LFSCK on the node with multipe targets,"
1465 echo "but not specify '-M'/'-A' option. Should get failure."
# Success of lfsck_start is the failure case here; the trailing "|| true"
# keeps the list's status zero when lfsck_start fails as expected.
1467 do_facet mds1 $LCTL lfsck_start -t layout -r &&
1468 error "(3) Start layout LFSCK should fail" || true
1471 run_test 12b "auto detect Lustre device"
# test_13 body: create one plain file and one PFL file while fail_loc
# 0x160f is set on the MDS, run a layout LFSCK, and expect exactly 2 in its
# repaired_others counter.
# NOTE(review): SHOW_LAYOUT is not assigned in the lines visible here.
1475 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1476 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1477 echo "MDT-object FID."
1480 check_mount_and_prep
1482 echo "Inject failure stub to simulate bad lmm_oi"
1483 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1484 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
# f0 comes from createmany; f1 is a two-component (1M, then to EOF) file.
1485 createmany -o $DIR/$tdir/f 1
1486 $LFS setstripe -E 1M -E -1 $DIR/$tdir/f1 ||
1487 error "(0) Fail to create PFL $DIR/$tdir/f1"
1488 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1490 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1491 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1493 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1494 mdd.${MDT_DEV}.lfsck_layout |
1495 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1497 error "(2) unexpected status"
1500 local repaired=$($SHOW_LAYOUT |
1501 awk '/^repaired_others/ { print $2 }')
1502 [ $repaired -eq 2 ] ||
1503 error "(3) Fail to repair crashed lmm_oi: $repaired"
1505 run_test 13 "LFSCK can repair crashed lmm_oi"
# test_14a body: create files while fail_loc 0x1610 is set on ost1, check
# that 'ls' fails, run a layout LFSCK without -c and expect at least 32
# repaired_dangling while 'stat' on the guard files still fails, then run
# it again with -c and expect the guard files to stat with size 0.
# NOTE(review): SHOW_LAYOUT is not assigned in the lines visible here.
1509 echo "The OST-object referenced by the MDT-object should be there;"
1510 echo "otherwise, the LFSCK should re-create the missing OST-object."
1511 echo "without '--delay-create-ostobj' option."
1514 check_mount_and_prep
1515 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1517 echo "Inject failure stub to simulate dangling referenced MDT-object"
1518 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1519 do_facet ost1 $LCTL set_param fail_loc=0x1610
1520 local count=$(precreated_ost_obj_count 0 0)
# Create 16 more plain files than the count returned above, then guard0.
1522 createmany -o $DIR/$tdir/f $((count + 16)) ||
1523 error "(0.1) Fail to create $DIR/$tdir/fx"
1524 touch $DIR/$tdir/guard0
# Then 16 two-component files (512K / 2M) on OST index 0, then guard1.
1526 for ((i = 0; i < 16; i++)); do
1527 $LFS setstripe -E 512K -S 256K -o 0 -E 2M \
1528 $DIR/$tdir/f_comp${i} ||
1529 error "(0.2) Fail to create $DIR/$tdir/f_comp${i}"
1531 touch $DIR/$tdir/guard1
1533 do_facet ost1 $LCTL set_param fail_loc=0
1535 start_full_debug_logging
1537 # exhaust other pre-created dangling cases
1538 count=$(precreated_ost_obj_count 0 0)
1539 createmany -o $DIR/$tdir/a $count ||
1540 error "(0.5) Fail to create $count files."
1542 echo "'ls' should fail because of dangling referenced MDT-object"
1543 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
1545 echo "Trigger layout LFSCK to find out dangling reference"
1546 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1548 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1549 mdd.${MDT_DEV}.lfsck_layout |
1550 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1552 error "(3) unexpected status"
1555 local repaired=$($SHOW_LAYOUT |
1556 awk '/^repaired_dangling/ { print $2 }')
1557 [ $repaired -ge 32 ] ||
1558 error "(4) Fail to repair dangling reference: $repaired"
1560 echo "'stat' should fail because of not repair dangling by default"
1561 stat $DIR/$tdir/guard0 > /dev/null 2>&1 &&
1562 error "(5.1) stat should fail"
1563 stat $DIR/$tdir/guard1 > /dev/null 2>&1 &&
1564 error "(5.2) stat should fail"
1566 echo "Trigger layout LFSCK to repair dangling reference"
# Second pass adds -c to the start command.
1567 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1569 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1570 mdd.${MDT_DEV}.lfsck_layout |
1571 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1573 error "(7) unexpected status"
1576 # There may be some async LFSCK updates still in progress; wait for
1577 # a while until the target repair has been done. LU-4970.
1579 echo "'stat' should success after layout LFSCK repairing"
# Poll (up to 32) until stat reports "0" after "Size"; on timeout show the
# raw stat output before failing.
1580 wait_update_facet client "stat $DIR/$tdir/guard0 |
1581 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1582 stat $DIR/$tdir/guard0
1584 error "(8.1) unexpected size"
1587 wait_update_facet client "stat $DIR/$tdir/guard1 |
1588 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1589 stat $DIR/$tdir/guard1
1591 error "(8.2) unexpected size"
1594 repaired=$($SHOW_LAYOUT |
1595 awk '/^repaired_dangling/ { print $2 }')
1596 [ $repaired -ge 32 ] ||
1597 error "(9) Fail to repair dangling reference: $repaired"
1599 stop_full_debug_logging
1601 echo "stopall to cleanup object cache"
# NOTE(review): the stop step announced above is not visible here.
1604 setupall > /dev/null
1606 run_test 14a "LFSCK can repair MDT-object with dangling LOV EA reference (1)"
# test_14b body: same dangling-reference scenario as
# "... dangling LOV EA reference (1)", but the layout LFSCK is started with
# the extra -o and -d options and a single guard file is used.
# NOTE(review): SHOW_LAYOUT is not assigned in the lines visible here.
1610 echo "The OST-object referenced by the MDT-object should be there;"
1611 echo "otherwise, the LFSCK should re-create the missing OST-object."
1612 echo "with '--delay-create-ostobj' option."
1615 check_mount_and_prep
1616 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1618 echo "Inject failure stub to simulate dangling referenced MDT-object"
1619 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1620 do_facet ost1 $LCTL set_param fail_loc=0x1610
1621 local count=$(precreated_ost_obj_count 0 0)
# Create 31 more files than the count returned above, then the guard file.
1623 createmany -o $DIR/$tdir/f $((count + 31))
1624 touch $DIR/$tdir/guard
1625 do_facet ost1 $LCTL set_param fail_loc=0
1627 start_full_debug_logging
1629 # exhaust other pre-created dangling cases
1630 count=$(precreated_ost_obj_count 0 0)
1631 createmany -o $DIR/$tdir/a $count ||
1632 error "(0) Fail to create $count files."
1634 echo "'ls' should fail because of dangling referenced MDT-object"
1635 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
1637 echo "Trigger layout LFSCK to find out dangling reference"
1638 $START_LAYOUT -r -o -d || error "(2) Fail to start LFSCK for layout!"
1640 wait_all_targets_blocked layout completed 3
1642 local repaired=$($SHOW_LAYOUT |
1643 awk '/^repaired_dangling/ { print $2 }')
1644 [ $repaired -ge 32 ] ||
1645 error "(4) Fail to repair dangling reference: $repaired"
1647 echo "'stat' should fail because of not repair dangling by default"
1648 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
1650 echo "Trigger layout LFSCK to repair dangling reference"
# Second pass adds -c to the start command.
1651 $START_LAYOUT -r -o -c -d || error "(6) Fail to start LFSCK for layout!"
1653 wait_all_targets_blocked layout completed 7
1655 # There may be some async LFSCK updates still in progress; wait for
1656 # a while until the target repair has been done. LU-4970.
1658 echo "'stat' should success after layout LFSCK repairing"
# Poll (up to 32) until stat reports "0" after "Size"; on timeout show the
# raw stat output before failing.
1659 wait_update_facet client "stat $DIR/$tdir/guard |
1660 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1661 stat $DIR/$tdir/guard
1663 error "(8) unexpected size"
1666 repaired=$($SHOW_LAYOUT |
1667 awk '/^repaired_dangling/ { print $2 }')
1668 [ $repaired -ge 32 ] ||
1669 error "(9) Fail to repair dangling reference: $repaired"
1671 stop_full_debug_logging
1673 echo "stopall to cleanup object cache"
# NOTE(review): the stop step announced above is not visible here.
1676 setupall > /dev/null
1678 run_test 14b "LFSCK can repair MDT-object with dangling LOV EA reference (2)"
# test_15a body: write a plain file and a PFL file while fail_loc 0x1611
# is set on all OST nodes, run a layout LFSCK, and expect at least 3 in its
# repaired_unmatched_pair counter.
# NOTE(review): SHOW_LAYOUT is not assigned in the lines visible here.
1682 echo "If the OST-object referenced by the MDT-object back points"
1683 echo "to some non-exist MDT-object, then the LFSCK should repair"
1684 echo "the OST-object to back point to the right MDT-object."
1687 check_mount_and_prep
1688 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1690 echo "Inject failure stub to make the OST-object to back point to"
1691 echo "non-exist MDT-object."
1692 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
1694 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
1695 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
# NOTE(review): the continuation line naming the PFL file is not visible
# here; the error message suggests it is $DIR/$tdir/f1 - confirm.
1696 $LFS setstripe -E 1M -S 256K -c 1 -E -1 -S 512K -c $OSTCOUNT \
1698 error "(0) Fail to create PFL $DIR/$tdir/f1"
1699 # 'dd' first triggers a punch RPC on every OST-object.
1700 # So even though some OST-objects will not be written by 'dd',
1701 # as long as an object is allocated (it may NOT be allocated in pfl_3b)
1702 # its layout information will be set as well.
1703 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4K count=257
1704 cancel_lru_locks osc
1705 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1707 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1708 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1710 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1711 mdd.${MDT_DEV}.lfsck_layout |
1712 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1714 error "(2) unexpected status"
1717 local repaired=$($SHOW_LAYOUT |
1718 awk '/^repaired_unmatched_pair/ { print $2 }')
1719 [ $repaired -ge 3 ] ||
1720 error "(3) Fail to repair unmatched pair: $repaired"
1722 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# test_15b body: write a guard file normally, then write a plain file and
# a PFL file while fail_loc 0x1612 is set on all OST nodes, run a layout
# LFSCK, and expect exactly 4 in its repaired_unmatched_pair counter.
# NOTE(review): SHOW_LAYOUT and the initial value of 'stripes' are not
# assigned in the lines visible here.
1726 echo "If the OST-object referenced by the MDT-object back points"
1727 echo "to other MDT-object that doesn't recognize the OST-object,"
1728 echo "then the LFSCK should repair it to back point to the right"
1729 echo "MDT-object (the first one)."
1732 check_mount_and_prep
1733 mkdir -p $DIR/$tdir/0
1734 $LFS setstripe -c 1 -i 0 $DIR/$tdir/0
1735 dd if=/dev/zero of=$DIR/$tdir/0/guard bs=1M count=1
1736 cancel_lru_locks osc
1738 echo "Inject failure stub to make the OST-object to back point to"
1739 echo "other MDT-object"
# Use 2 stripes for the first PFL component when at least 2 OSTs exist.
1742 [ $OSTCOUNT -ge 2 ] && stripes=2
1744 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1745 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1612
1746 dd if=/dev/zero of=$DIR/$tdir/0/f0 bs=1M count=1
# NOTE(review): the continuation line naming the PFL file is not visible
# here; the error message suggests it is $DIR/$tdir/f1 - confirm.
1747 $LFS setstripe -E 1M -S 256K -c $stripes -E 2M -S 512K -c 1 \
1749 error "(0) Fail to create PFL $DIR/$tdir/f1"
1750 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2
1751 cancel_lru_locks osc
1752 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1754 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1755 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1757 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1758 mdd.${MDT_DEV}.lfsck_layout |
1759 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1761 error "(2) unexpected status"
1764 local repaired=$($SHOW_LAYOUT |
1765 awk '/^repaired_unmatched_pair/ { print $2 }')
1766 [ $repaired -eq 4 ] ||
1767 error "(3) Fail to repair unmatched pair: $repaired"
1769 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# test_15c body: start a directory migration in the background with
# fail_loc 0x1803 / fail_val=5 set on mds2, run a layout LFSCK on all
# targets meanwhile, and expect repaired_unmatched_pair == 1 and
# repaired_multiple_referenced == 0.
# Skipped with fewer than 2 MDTs, and when the MDS version is >= 2.7.55.
# NOTE(review): SHOW_LAYOUT is not assigned in the lines visible here.
1772 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
1774 [ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.55) ] &&
1775 skip "Skip the test after 2.7.55 see LU-6437" && return
1778 echo "According to current metadata migration implementation,"
1779 echo "before the old MDT-object is removed, both the new MDT-object"
1780 echo "and old MDT-object will reference the same LOV layout. Then if"
1781 echo "the layout LFSCK finds the new MDT-object by race, it will"
1782 echo "regard related OST-object(s) as multiple referenced case, and"
1783 echo "will try to create new OST-object(s) for the new MDT-object."
1784 echo "To avoid such trouble, the layout LFSCK needs to lock the old"
1785 echo "MDT-object before confirm the multiple referenced case."
1788 check_mount_and_prep
# Directory a1 lives on MDT index 1; its file f1 is striped on OST index 0.
1789 $LFS mkdir -i 1 $DIR/$tdir/a1
1790 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1791 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=1
1792 cancel_lru_locks osc
1794 echo "Inject failure stub on MDT1 to delay the migration"
1796 #define OBD_FAIL_MIGRATE_DELAY 0x1803
1797 do_facet mds2 $LCTL set_param fail_val=5 fail_loc=0x1803
1798 echo "Migrate $DIR/$tdir/a1 from MDT1 to MDT0 with delay"
# The migration runs in the background so the LFSCK below overlaps it.
1799 $LFS migrate -m 0 $DIR/$tdir/a1 &
1802 echo "Trigger layout LFSCK to race with the migration"
1803 $START_LAYOUT -A -r || error "(1) Fail to start layout LFSCK!"
1805 wait_all_targets_blocked layout completed 2
1807 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
1808 local repaired=$($SHOW_LAYOUT |
1809 awk '/^repaired_unmatched_pair/ { print $2 }')
1810 [ $repaired -eq 1 ] ||
1811 error "(3) Fail to repair unmatched pair: $repaired"
1813 repaired=$($SHOW_LAYOUT |
1814 awk '/^repaired_multiple_referenced/ { print $2 }')
1815 [ $repaired -eq 0 ] ||
1816 error "(4) Unexpectedly repaird multiple references: $repaired"
1818 run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)"
# test_16 body: chown a file to 1.1 while fail_loc 0x1613 is set on the
# MDS, run a layout LFSCK, and expect exactly 1 in its
# repaired_inconsistent_owner counter.
# NOTE(review): SHOW_LAYOUT is not assigned in the lines visible here.
1822 echo "If the OST-object's owner information does not match the owner"
1823 echo "information stored in the MDT-object, then the LFSCK trust the"
1824 echo "MDT-object and update the OST-object's owner information."
1827 check_mount_and_prep
1828 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1829 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1830 cancel_lru_locks osc
1832 echo "Inject failure stub to skip OST-object owner changing"
1833 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
1834 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
1835 chown 1.1 $DIR/$tdir/f0
1836 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1838 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
1841 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1843 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1844 mdd.${MDT_DEV}.lfsck_layout |
1845 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1847 error "(2) unexpected status"
1850 local repaired=$($SHOW_LAYOUT |
1851 awk '/^repaired_inconsistent_owner/ { print $2 }')
1852 [ $repaired -eq 1 ] ||
1853 error "(3) Fail to repair inconsistent owner: $repaired"
1855 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# test_17 body: with fail_loc 0x1614 set on the MDS, create a 1 MiB guard
# file, a plain file f0 and a PFL file f1; check that f0 and f1 both report
# the guard's size (1048576); run a layout LFSCK and expect exactly 2 in
# repaired_multiple_referenced; finally overwrite f0 and f1 with 2 MiB each
# and check that the guard's size is still 1048576.
# NOTE(review): SHOW_LAYOUT is not assigned in the lines visible here.
1859 echo "If more than one MDT-objects reference the same OST-object,"
1860 echo "and the OST-object only recognizes one MDT-object, then the"
1861 echo "LFSCK should create new OST-objects for such non-recognized"
1865 check_mount_and_prep
1866 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1868 echo "Inject failure stub to make two MDT-objects to refernce"
1869 echo "the OST-object"
1871 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
1872 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
1873 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1874 cancel_lru_locks mdc
1875 cancel_lru_locks osc
1877 createmany -o $DIR/$tdir/f 1
1878 cancel_lru_locks mdc
1879 cancel_lru_locks osc
# NOTE(review): the continuation line naming the PFL file is not visible
# here; the error message suggests it is $DIR/$tdir/f1 - confirm.
1881 $LFS setstripe -E 2M -S 256K -o 0 -E 4M -S 512K -o 0 \
1883 error "(0) Fail to create PFL $DIR/$tdir/f1"
1884 cancel_lru_locks mdc
1885 cancel_lru_locks osc
1886 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1888 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
1889 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard use the same OST-objects"
# Field 5 of "ls -l" is the file size in bytes.
1890 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
1891 [ $size -eq 1048576 ] ||
1892 error "(1.1) f0 (wrong) size should be 1048576, but got $size"
1894 size=$(ls -l $DIR/$tdir/f1 | awk '{ print $5 }')
1895 [ $size -eq 1048576 ] ||
1896 error "(1.2) f1 (wrong) size should be 1048576, but got $size"
1898 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
1901 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1903 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1904 mdd.${MDT_DEV}.lfsck_layout |
1905 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1907 error "(3) unexpected status"
1910 local repaired=$($SHOW_LAYOUT |
1911 awk '/^repaired_multiple_referenced/ { print $2 }')
1912 [ $repaired -eq 2 ] ||
1913 error "(4) Fail to repair multiple references: $repaired"
# After the repair, growing f0 or f1 must leave the guard's size unchanged.
1915 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
1916 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
1917 error "(5) Fail to write f0."
1918 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1919 [ $size -eq 1048576 ] ||
1920 error "(6) guard size should be 1048576, but got $size"
1922 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard should use diff OST-objects"
1923 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2 ||
1924 error "(7) Fail to write f1."
1925 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1926 [ $size -eq 1048576 ] ||
1927 error "(8) guard size should be 1048576, but got $size"
1929 run_test 17 "LFSCK can repair multiple references"
# Add "cache" to the debug parameter via lctl set_param; the command is run
# directly (no do_facet wrapper) and its output is discarded.
1931 $LCTL set_param debug=+cache > /dev/null
# test_18a body: write files f1 (under a1 on MDT index 0), f2 (under a2 on
# MDT index 1, only with >= 2 MDTs) and the PFL file f3, record their sizes,
# then chown each file while fail_loc 0x1615 is set on the owning MDS.
# The reported sizes are then expected to differ from the recorded ones.
# A layout LFSCK started with -r -o must finish "completed" on every MDT and
# OST, report 3 repaired_orphan on mds1 (2 on mds2), and the file sizes must
# match the recorded ones again.
# NOTE(review): LTIME is not assigned in the lines visible here.
1935 echo "The target MDT-object is there, but related stripe information"
1936 echo "is lost or partly lost. The LFSCK should regenerate the missing"
1937 echo "layout EA entries."
1940 check_mount_and_prep
1941 $LFS mkdir -i 0 $DIR/$tdir/a1
1942 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1943 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# Field 6 of "ls -il" is the file size (the inode number is field 1).
1945 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1947 $LFS path2fid $DIR/$tdir/a1/f1
1948 $LFS getstripe $DIR/$tdir/a1/f1
1950 if [ $MDSCOUNT -ge 2 ]; then
1951 $LFS mkdir -i 1 $DIR/$tdir/a2
1952 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
1953 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1954 $LFS path2fid $DIR/$tdir/a2/f2
1955 $LFS getstripe $DIR/$tdir/a2/f2
1958 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
1959 error "(0) Fail to create PFL $DIR/$tdir/f3"
1961 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
1963 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
1965 $LFS path2fid $DIR/$tdir/f3
1966 $LFS getstripe $DIR/$tdir/f3
1968 cancel_lru_locks osc
1970 echo "Inject failure, to make the MDT-object lost its layout EA"
1971 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
# Each chown below runs while fail_loc 0x1615 is set on the file's MDS.
1972 do_facet mds1 $LCTL set_param fail_loc=0x1615
1973 chown 1.1 $DIR/$tdir/a1/f1
1975 if [ $MDSCOUNT -ge 2 ]; then
1976 do_facet mds2 $LCTL set_param fail_loc=0x1615
1977 chown 1.1 $DIR/$tdir/a2/f2
1980 chown 1.1 $DIR/$tdir/f3
1985 do_facet mds1 $LCTL set_param fail_loc=0
1986 if [ $MDSCOUNT -ge 2 ]; then
1987 do_facet mds2 $LCTL set_param fail_loc=0
1990 cancel_lru_locks mdc
1991 cancel_lru_locks osc
1993 echo "The file size should be incorrect since layout EA is lost"
1994 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1995 [ "$cur_size" != "$saved_size1" ] ||
1996 error "(1) Expect incorrect file1 size"
# f2 was written with the same dd parameters as f1, so it is compared
# against saved_size1 as well.
1998 if [ $MDSCOUNT -ge 2 ]; then
1999 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2000 [ "$cur_size" != "$saved_size1" ] ||
2001 error "(2) Expect incorrect file2 size"
2004 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2005 [ "$cur_size" != "$saved_size2" ] ||
2006 error "(1.2) Expect incorrect file3 size"
2008 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2009 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
2011 for k in $(seq $MDSCOUNT); do
2012 # The LFSCK status query interval is 30 seconds. For the case
2013 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2014 # time to guarantee the status sync up.
2015 wait_update_facet mds${k} "$LCTL get_param -n \
2016 mdd.$(facet_svc mds${k}).lfsck_layout |
2017 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2018 error "(4) MDS${k} is not the expected 'completed'"
# The OSTs are sampled once each (no waiting) after the MDTs completed.
2021 for k in $(seq $OSTCOUNT); do
2022 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2023 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2024 awk '/^status/ { print $2 }')
2025 [ "$cur_status" == "completed" ] ||
2026 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2029 local repaired=$(do_facet mds1 $LCTL get_param -n \
2030 mdd.$(facet_svc mds1).lfsck_layout |
2031 awk '/^repaired_orphan/ { print $2 }')
2032 [ $repaired -eq 3 ] ||
2033 error "(6.1) Expect 3 fixed on mds1, but got: $repaired"
2035 if [ $MDSCOUNT -ge 2 ]; then
2036 repaired=$(do_facet mds2 $LCTL get_param -n \
2037 mdd.$(facet_svc mds2).lfsck_layout |
2038 awk '/^repaired_orphan/ { print $2 }')
2039 [ $repaired -eq 2 ] ||
2040 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
2043 $LFS path2fid $DIR/$tdir/a1/f1
2044 $LFS getstripe $DIR/$tdir/a1/f1
2046 if [ $MDSCOUNT -ge 2 ]; then
2047 $LFS path2fid $DIR/$tdir/a2/f2
2048 $LFS getstripe $DIR/$tdir/a2/f2
2051 $LFS path2fid $DIR/$tdir/f3
2052 $LFS getstripe $DIR/$tdir/f3
2054 echo "The file size should be correct after layout LFSCK scanning"
2055 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2056 [ "$cur_size" == "$saved_size1" ] ||
2057 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2059 if [ $MDSCOUNT -ge 2 ]; then
2060 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2061 [ "$cur_size" == "$saved_size1" ] ||
2062 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# This check is on f3, so the message names file3 (it used to say file1).
2065 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2066 [ "$cur_size" == "$saved_size2" ] ||
2067 error "(9) Expect file3 size $saved_size2, but got $cur_size"
2069 run_test 18a "Find out orphan OST-object and repair it (1)"
2073 echo "The target MDT-object is lost. The LFSCK should re-create the"
2074 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
2075 echo "can move it back to normal namespace manually."
2078 check_mount_and_prep
2079 $LFS mkdir -i 0 $DIR/$tdir/a1
2080 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2081 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2082 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2083 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2085 $LFS getstripe $DIR/$tdir/a1/f1
2087 if [ $MDSCOUNT -ge 2 ]; then
2088 $LFS mkdir -i 1 $DIR/$tdir/a2
2089 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2090 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2091 fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
2093 $LFS getstripe $DIR/$tdir/a2/f2
2096 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2097 error "(0) Fail to create PFL $DIR/$tdir/f3"
2099 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2101 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2102 local fid3=$($LFS path2fid $DIR/$tdir/f3)
2104 $LFS getstripe $DIR/$tdir/f3
2106 cancel_lru_locks osc
2108 echo "Inject failure, to simulate the case of missing the MDT-object"
2109 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2110 do_facet mds1 $LCTL set_param fail_loc=0x1616
2111 rm -f $DIR/$tdir/a1/f1
2113 if [ $MDSCOUNT -ge 2 ]; then
2114 do_facet mds2 $LCTL set_param fail_loc=0x1616
2115 rm -f $DIR/$tdir/a2/f2
2123 do_facet mds1 $LCTL set_param fail_loc=0
2124 if [ $MDSCOUNT -ge 2 ]; then
2125 do_facet mds2 $LCTL set_param fail_loc=0
2128 cancel_lru_locks mdc
2129 cancel_lru_locks osc
2131 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2132 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2134 for k in $(seq $MDSCOUNT); do
2135 # The LFSCK status query interval is 30 seconds. If some
2136 # LFSCK_NOTIFY RPCs fail or are lost, wait long enough to
2137 # guarantee that the status is synced up.
2138 wait_update_facet mds${k} "$LCTL get_param -n \
2139 mdd.$(facet_svc mds${k}).lfsck_layout |
2140 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2141 error "(2) MDS${k} is not the expected 'completed'"
2144 for k in $(seq $OSTCOUNT); do
2145 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2146 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2147 awk '/^status/ { print $2 }')
2148 [ "$cur_status" == "completed" ] ||
2149 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
2152 local repaired=$(do_facet mds1 $LCTL get_param -n \
2153 mdd.$(facet_svc mds1).lfsck_layout |
2154 awk '/^repaired_orphan/ { print $2 }')
2155 [ $repaired -eq 3 ] ||
2156 error "(4.1) Expect 3 fixed on mds1, but got: $repaired"
2158 if [ $MDSCOUNT -ge 2 ]; then
2159 repaired=$(do_facet mds2 $LCTL get_param -n \
2160 mdd.$(facet_svc mds2).lfsck_layout |
2161 awk '/^repaired_orphan/ { print $2 }')
2162 [ $repaired -eq 2 ] ||
2163 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
2166 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2167 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2168 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2170 if [ $MDSCOUNT -ge 2 ]; then
2171 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
2172 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
# Move the recovered PFL file f3 back into the namespace. The failure tag is
# distinct from the "(5)" used for the f1 move so a failure is unambiguous.
2175 mv $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0 $DIR/$tdir/f3 ||
2176 error "(6.1) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
2178 $LFS path2fid $DIR/$tdir/a1/f1
2179 $LFS getstripe $DIR/$tdir/a1/f1
2181 if [ $MDSCOUNT -ge 2 ]; then
2182 $LFS path2fid $DIR/$tdir/a2/f2
2183 $LFS getstripe $DIR/$tdir/a2/f2
2186 $LFS path2fid $DIR/$tdir/f3
2187 $LFS getstripe $DIR/$tdir/f3
2189 echo "The file size should be correct after layout LFSCK scanning"
2190 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2191 [ "$cur_size" == "$saved_size1" ] ||
2192 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2194 if [ $MDSCOUNT -ge 2 ]; then
2195 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2196 [ "$cur_size" == "$saved_size1" ] ||
2197 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# Verify the size of the PFL file f3 against the size saved before the
# failure injection; the failure message names file3, the file being checked.
2200 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2201 [ "$cur_size" == "$saved_size2" ] ||
2202 error "(9) Expect file3 size $saved_size2, but got $cur_size"
2204 run_test 18b "Find out orphan OST-object and repair it (2)"
2208 echo "The target MDT-object is lost, and the OST-object FID is missing."
2209 echo "The LFSCK should re-create the MDT-object with new FID under the "
2210 echo "directory .lustre/lost+found/MDTxxxx."
2213 check_mount_and_prep
2214 $LFS mkdir -i 0 $DIR/$tdir/a1
2215 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2217 echo "Inject failure, to simulate the case of missing parent FID"
2218 #define OBD_FAIL_LFSCK_NOPFID 0x1617
2219 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1617
2221 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2222 $LFS getstripe $DIR/$tdir/a1/f1
2224 if [ $MDSCOUNT -ge 2 ]; then
2225 $LFS mkdir -i 1 $DIR/$tdir/a2
2226 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a2
2227 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2228 $LFS getstripe $DIR/$tdir/a2/f2
2231 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2232 error "(0) Fail to create PFL $DIR/$tdir/f3"
2234 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2235 $LFS getstripe $DIR/$tdir/f3
2237 cancel_lru_locks osc
2238 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
2240 echo "Inject failure, to simulate the case of missing the MDT-object"
2241 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2242 do_facet mds1 $LCTL set_param fail_loc=0x1616
2243 rm -f $DIR/$tdir/a1/f1
2245 if [ $MDSCOUNT -ge 2 ]; then
2246 do_facet mds2 $LCTL set_param fail_loc=0x1616
2247 rm -f $DIR/$tdir/a2/f2
2255 do_facet mds1 $LCTL set_param fail_loc=0
2256 if [ $MDSCOUNT -ge 2 ]; then
2257 do_facet mds2 $LCTL set_param fail_loc=0
2260 cancel_lru_locks mdc
2261 cancel_lru_locks osc
2263 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2264 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2266 for k in $(seq $MDSCOUNT); do
2267 # The LFSCK status query interval is 30 seconds. If some
2268 # LFSCK_NOTIFY RPCs fail or are lost, wait long enough to
2269 # guarantee that the status is synced up.
2270 wait_update_facet mds${k} "$LCTL get_param -n \
2271 mdd.$(facet_svc mds${k}).lfsck_layout |
2272 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2273 error "(2) MDS${k} is not the expected 'completed'"
2276 for k in $(seq $OSTCOUNT); do
2277 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2278 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2279 awk '/^status/ { print $2 }')
2280 [ "$cur_status" == "completed" ] ||
2281 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
2284 if [ $MDSCOUNT -ge 2 ]; then
2290 local repaired=$(do_facet mds1 $LCTL get_param -n \
2291 mdd.$(facet_svc mds1).lfsck_layout |
2292 awk '/^repaired_orphan/ { print $2 }')
2293 [ $repaired -eq $expected ] ||
2294 error "(4) Expect $expected fixed on mds1, but got: $repaired"
2296 if [ $MDSCOUNT -ge 2 ]; then
2297 repaired=$(do_facet mds2 $LCTL get_param -n \
2298 mdd.$(facet_svc mds2).lfsck_layout |
2299 awk '/^repaired_orphan/ { print $2 }')
2300 [ $repaired -eq 0 ] ||
2301 error "(5) Expect 0 fixed on mds2, but got: $repaired"
2304 ls -ail $MOUNT/.lustre/lost+found/
2306 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
2307 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
# Look for "-N-" stubs under MDT0001. The pattern is quoted so that find,
# not the shell, matches it (an unquoted glob could expand against the cwd).
2308 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-N-*')
2310 error "(6) .lustre/lost+found/MDT0001/ should be empty"
2313 echo "There should be some stub under .lustre/lost+found/MDT0000/"
2314 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2315 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# At least one "-N-" stub must exist under MDT0000. The pattern is quoted so
# that find, not the shell, matches it (an unquoted glob could expand
# against the cwd).
2317 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-N-*')
2318 [ ! -z "$cname" ] ||
2319 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2321 run_test 18c "Find out orphan OST-object and repair it (3)"
2325 echo "The target MDT-object layout EA is corrupted, but the right"
2326 echo "OST-object is still alive as orphan. The layout LFSCK will"
2327 echo "not create new OST-object to occupy such slot."
2330 check_mount_and_prep
2332 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2333 echo "guard" > $DIR/$tdir/a1/f1
2334 echo "foo" > $DIR/$tdir/a1/f2
2336 echo "guard" > $DIR/$tdir/a1/f3
2337 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2338 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2339 echo "foo" > $DIR/$tdir/a1/f4
2341 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2342 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2343 $LFS path2fid $DIR/$tdir/a1/f1
2344 $LFS getstripe $DIR/$tdir/a1/f1
2345 $LFS path2fid $DIR/$tdir/a1/f2
2346 $LFS getstripe $DIR/$tdir/a1/f2
2347 $LFS path2fid $DIR/$tdir/a1/f3
2348 $LFS getstripe $DIR/$tdir/a1/f3
2349 $LFS path2fid $DIR/$tdir/a1/f4
2350 $LFS getstripe $DIR/$tdir/a1/f4
2351 cancel_lru_locks osc
2353 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2354 echo "to reference the same OST-object (which is f1's OST-obejct)."
2355 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2356 echo "dangling reference case, but f2's old OST-object is there."
2358 echo "The failure also makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2359 echo "to reference the same OST-object (which is f3's OST-obejct)."
2360 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2361 echo "dangling reference case, but f4's old OST-object is there."
2364 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2365 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2366 chown 1.1 $DIR/$tdir/a1/f2
2367 chown 1.1 $DIR/$tdir/a1/f4
2368 rm -f $DIR/$tdir/a1/f1
2369 rm -f $DIR/$tdir/a1/f3
2372 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2374 echo "stopall to cleanup object cache"
2377 setupall > /dev/null
2379 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2380 $START_LAYOUT -r -o -c -d || error "(2) Fail to start LFSCK for layout!"
2382 for k in $(seq $MDSCOUNT); do
2383 # The LFSCK status query interval is 30 seconds. If some
2384 # LFSCK_NOTIFY RPCs fail or are lost, wait long enough to
2385 # guarantee that the status is synced up.
2386 wait_update_facet mds${k} "$LCTL get_param -n \
2387 mdd.$(facet_svc mds${k}).lfsck_layout |
2388 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2389 error "(3) MDS${k} is not the expected 'completed'"
2392 for k in $(seq $OSTCOUNT); do
2393 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2394 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2395 awk '/^status/ { print $2 }')
2396 [ "$cur_status" == "completed" ] ||
2397 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
2400 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2401 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2402 awk '/^repaired_orphan/ { print $2 }')
2403 [ $repaired -eq 2 ] ||
2404 error "(5) Expect 2 orphans have been fixed, but got: $repaired"
2406 repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2407 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2408 awk '/^repaired_dangling/ { print $2 }')
2409 [ $repaired -eq 0 ] ||
2410 error "(6) Expect 0 dangling has been fixed, but got: $repaired"
2412 echo "The file size should be correct after layout LFSCK scanning"
2413 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2414 [ "$cur_size" == "$saved_size1" ] ||
2415 error "(7) Expect file2 size $saved_size1, but got $cur_size"
2417 cur_size=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2418 [ "$cur_size" == "$saved_size2" ] ||
2419 error "(8) Expect file4 size $saved_size2, but got $cur_size"
2421 echo "The LFSCK should find back the original data."
2422 cat $DIR/$tdir/a1/f2
2423 $LFS path2fid $DIR/$tdir/a1/f2
2424 $LFS getstripe $DIR/$tdir/a1/f2
2425 cat $DIR/$tdir/a1/f4
2426 $LFS path2fid $DIR/$tdir/a1/f4
2427 $LFS getstripe $DIR/$tdir/a1/f4
2429 run_test 18d "Find out orphan OST-object and repair it (4)"
2433 echo "The target MDT-object layout EA slot is occpuied by some new"
2434 echo "created OST-object when repair dangling reference case. Such"
2435 echo "conflict OST-object has been modified by others. To keep the"
2436 echo "new data, the LFSCK will create a new file to refernece this"
2437 echo "old orphan OST-object."
2440 check_mount_and_prep
2442 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2443 echo "guard" > $DIR/$tdir/a1/f1
2444 echo "foo" > $DIR/$tdir/a1/f2
2446 echo "guard" > $DIR/$tdir/a1/f3
2447 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2448 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2449 echo "foo" > $DIR/$tdir/a1/f4
2451 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2452 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2454 $LFS path2fid $DIR/$tdir/a1/f1
2455 $LFS getstripe $DIR/$tdir/a1/f1
2456 $LFS path2fid $DIR/$tdir/a1/f2
2457 $LFS getstripe $DIR/$tdir/a1/f2
2458 $LFS path2fid $DIR/$tdir/a1/f3
2459 $LFS getstripe $DIR/$tdir/a1/f3
2460 $LFS path2fid $DIR/$tdir/a1/f4
2461 $LFS getstripe $DIR/$tdir/a1/f4
2462 cancel_lru_locks osc
2464 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2465 echo "to reference the same OST-object (which is f1's OST-obejct)."
2466 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2467 echo "dangling reference case, but f2's old OST-object is there."
2469 echo "Also the failure makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2470 echo "to reference the same OST-object (which is f3's OST-obejct)."
2471 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2472 echo "dangling reference case, but f4's old OST-object is there."
2475 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2476 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2477 chown 1.1 $DIR/$tdir/a1/f2
2478 chown 1.1 $DIR/$tdir/a1/f4
2479 rm -f $DIR/$tdir/a1/f1
2480 rm -f $DIR/$tdir/a1/f3
2483 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2485 echo "stopall to cleanup object cache"
2488 setupall > /dev/null
2490 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2491 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
2493 start_full_debug_logging
2495 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2496 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
2498 wait_update_facet mds1 "$LCTL get_param -n \
2499 mdd.$(facet_svc mds1).lfsck_layout |
2500 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2501 error "(3) MDS1 is not the expected 'scanning-phase2'"
2503 # to guarantee all updates are synced.
2507 echo "Write new data to f2/f4 to modify the new created OST-object."
2508 echo "dummy" >> $DIR/$tdir/a1/f2 || error "write a1/f2 failed"
2509 echo "dummy" >> $DIR/$tdir/a1/f4 || error "write a1/f4 failed"
2511 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
2513 for k in $(seq $MDSCOUNT); do
2514 # The LFSCK status query interval is 30 seconds. If some
2515 # LFSCK_NOTIFY RPCs fail or are lost, wait long enough to
2516 # guarantee that the status is synced up.
2517 wait_update_facet mds${k} "$LCTL get_param -n \
2518 mdd.$(facet_svc mds${k}).lfsck_layout |
2519 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2520 error "(4) MDS${k} is not the expected 'completed'"
2523 for k in $(seq $OSTCOUNT); do
2524 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2525 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2526 awk '/^status/ { print $2 }')
2527 [ "$cur_status" == "completed" ] ||
2528 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2531 stop_full_debug_logging
2533 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2534 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2535 awk '/^repaired_orphan/ { print $2 }')
2536 [ $repaired -eq 2 ] ||
2537 error "(6) Expect 2 orphans have been fixed, but got: $repaired"
2539 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2540 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2541 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
2543 local count=$(ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-* | wc -l)
2544 if [ $count -ne 2 ]; then
2545 ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-*
2546 error "(8) Expect 2 stubs under lost+found, but got $count"
2549 echo "The stub file should keep the original f2 or f4 data"
# Pick the first "-C-" stub under MDT0000. The pattern is quoted so that
# find, not the shell, matches it (an unquoted glob could expand against the cwd).
2550 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-C-*' | head -n 1)
2551 local cur_size=$(ls -il $cname | awk '{ print $6 }')
2552 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2553 error "(9) Got unexpected $cur_size"
2556 $LFS path2fid $cname
2557 $LFS getstripe $cname
# Pick the last "-C-" stub under MDT0000. The pattern is quoted so that
# find, not the shell, matches it (an unquoted glob could expand against the cwd).
2559 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-C-*' | tail -n 1)
2560 cur_size=$(ls -il $cname | awk '{ print $6 }')
2561 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2562 error "(10) Got unexpected $cur_size"
2565 $LFS path2fid $cname
2566 $LFS getstripe $cname
2568 echo "The f2/f4 should contains new data."
2569 cat $DIR/$tdir/a1/f2
2570 $LFS path2fid $DIR/$tdir/a1/f2
2571 $LFS getstripe $DIR/$tdir/a1/f2
2572 cat $DIR/$tdir/a1/f4
2573 $LFS path2fid $DIR/$tdir/a1/f4
2574 $LFS getstripe $DIR/$tdir/a1/f4
2576 run_test 18e "Find out orphan OST-object and repair it (5)"
2579 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
2582 echo "The target MDT-object is lost. The LFSCK should re-create the"
2583 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2584 echo "to verify some OST-object(s) during the first stage-scanning,"
2585 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2586 echo "should not be affected."
2589 check_mount_and_prep
2590 $LFS mkdir -i 0 $DIR/$tdir/a1
2591 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2592 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2593 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2594 $LFS mkdir -i 0 $DIR/$tdir/a2
2595 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
2596 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2597 $LFS getstripe $DIR/$tdir/a1/f1
2598 $LFS getstripe $DIR/$tdir/a2/f2
2600 if [ $MDSCOUNT -ge 2 ]; then
2601 $LFS mkdir -i 1 $DIR/$tdir/a3
2602 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a3
2603 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2604 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2605 $LFS mkdir -i 1 $DIR/$tdir/a4
2606 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
2607 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2608 $LFS getstripe $DIR/$tdir/a3/f3
2609 $LFS getstripe $DIR/$tdir/a4/f4
2612 cancel_lru_locks osc
2614 echo "Inject failure, to simulate the case of missing the MDT-object"
2615 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2616 do_facet mds1 $LCTL set_param fail_loc=0x1616
2617 rm -f $DIR/$tdir/a1/f1
2618 rm -f $DIR/$tdir/a2/f2
2620 if [ $MDSCOUNT -ge 2 ]; then
2621 do_facet mds2 $LCTL set_param fail_loc=0x1616
2622 rm -f $DIR/$tdir/a3/f3
2623 rm -f $DIR/$tdir/a4/f4
2629 do_facet mds1 $LCTL set_param fail_loc=0
2630 if [ $MDSCOUNT -ge 2 ]; then
2631 do_facet mds2 $LCTL set_param fail_loc=0
2634 cancel_lru_locks mdc
2635 cancel_lru_locks osc
2637 echo "Inject failure, to simulate the OST0 fail to handle"
2638 echo "MDT0 LFSCK request during the first-stage scanning."
2639 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
2640 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
2642 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2643 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2645 for k in $(seq $MDSCOUNT); do
2646 # The LFSCK status query interval is 30 seconds. If some
2647 # LFSCK_NOTIFY RPCs fail or are lost, wait long enough to
2648 # guarantee that the status is synced up.
2649 wait_update_facet mds${k} "$LCTL get_param -n \
2650 mdd.$(facet_svc mds${k}).lfsck_layout |
2651 awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
2652 error "(2) MDS${k} is not the expected 'partial'"
2655 wait_update_facet ost1 "$LCTL get_param -n \
2656 obdfilter.$(facet_svc ost1).lfsck_layout |
2657 awk '/^status/ { print \\\$2 }'" "partial" $LTIME || {
2658 error "(3) OST1 is not the expected 'partial'"
2661 wait_update_facet ost2 "$LCTL get_param -n \
2662 obdfilter.$(facet_svc ost2).lfsck_layout |
2663 awk '/^status/ { print \\\$2 }'" "completed" $LTIME || {
2664 error "(4) OST2 is not the expected 'completed'"
2667 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
2669 local repaired=$(do_facet mds1 $LCTL get_param -n \
2670 mdd.$(facet_svc mds1).lfsck_layout |
2671 awk '/^repaired_orphan/ { print $2 }')
2672 [ $repaired -eq 1 ] ||
2673 error "(5) Expect 1 fixed on mds{1}, but got: $repaired"
2675 if [ $MDSCOUNT -ge 2 ]; then
2676 repaired=$(do_facet mds2 $LCTL get_param -n \
2677 mdd.$(facet_svc mds2).lfsck_layout |
2678 awk '/^repaired_orphan/ { print $2 }')
2679 [ $repaired -eq 1 ] ||
2680 error "(6) Expect 1 fixed on mds{2}, but got: $repaired"
2683 echo "Trigger layout LFSCK on all devices again to cleanup"
2684 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
2686 for k in $(seq $MDSCOUNT); do
2687 # The LFSCK status query interval is 30 seconds. If some
2688 # LFSCK_NOTIFY RPCs fail or are lost, wait long enough to
2689 # guarantee that the status is synced up.
2690 wait_update_facet mds${k} "$LCTL get_param -n \
2691 mdd.$(facet_svc mds${k}).lfsck_layout |
2692 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2693 error "(8) MDS${k} is not the expected 'completed'"
2696 for k in $(seq $OSTCOUNT); do
2697 cur_status=$(do_facet ost${k} $LCTL get_param -n \
2698 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2699 awk '/^status/ { print $2 }')
2700 [ "$cur_status" == "completed" ] ||
2701 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
2705 local repaired=$(do_facet mds1 $LCTL get_param -n \
2706 mdd.$(facet_svc mds1).lfsck_layout |
2707 awk '/^repaired_orphan/ { print $2 }')
2708 [ $repaired -eq 2 ] ||
2709 error "(10) Expect 2 fixed on mds{1}, but got: $repaired"
2711 if [ $MDSCOUNT -ge 2 ]; then
2712 repaired=$(do_facet mds2 $LCTL get_param -n \
2713 mdd.$(facet_svc mds2).lfsck_layout |
2714 awk '/^repaired_orphan/ { print $2 }')
2715 [ $repaired -eq 2 ] ||
2716 error "(11) Expect 2 fixed on mds{2}, but got: $repaired"
2719 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
2723 echo "The target MDT-object is lost, but related OI mapping is there"
2724 echo "The LFSCK should recreate the lost MDT-object without affected"
2725 echo "by the stale OI mapping."
2728 check_mount_and_prep
2729 $LFS mkdir -i 0 $DIR/$tdir/a1
2730 $LFS setstripe -c -1 -i 0 -S 1M $DIR/$tdir/a1
2731 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=$OSTCOUNT
2732 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2734 $LFS getstripe $DIR/$tdir/a1/f1
2735 cancel_lru_locks osc
2737 echo "Inject failure to simulate lost MDT-object but keep OI mapping"
2738 #define OBD_FAIL_LFSCK_LOST_MDTOBJ2 0x162e
2739 do_facet mds1 $LCTL set_param fail_loc=0x162e
2740 rm -f $DIR/$tdir/a1/f1
2742 do_facet mds1 $LCTL set_param fail_loc=0
2743 cancel_lru_locks mdc
2744 cancel_lru_locks osc
2746 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2747 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2749 for k in $(seq $MDSCOUNT); do
2750 # The LFSCK status query interval is 30 seconds. If some
2751 # LFSCK_NOTIFY RPCs fail or are lost, wait long enough to
2752 # guarantee that the status is synced up.
2753 wait_update_facet mds${k} "$LCTL get_param -n \
2754 mdd.$(facet_svc mds${k}).lfsck_layout |
2755 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2756 error "(2) MDS${k} is not the expected 'completed'"
2759 for k in $(seq $OSTCOUNT); do
2760 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2761 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2762 awk '/^status/ { print $2 }')
2763 [ "$cur_status" == "completed" ] ||
2764 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
2767 local repaired=$(do_facet mds1 $LCTL get_param -n \
2768 mdd.$(facet_svc mds1).lfsck_layout |
2769 awk '/^repaired_orphan/ { print $2 }')
2770 [ $repaired -eq $OSTCOUNT ] ||
2771 error "(4) Expect $OSTCOUNT fixed, but got: $repaired"
2773 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2774 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2775 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2777 $LFS path2fid $DIR/$tdir/a1/f1
2778 $LFS getstripe $DIR/$tdir/a1/f1
2780 run_test 18g "Find out orphan OST-object and repair it (7)"
2784 echo "The PFL extent crashed. During the first cycle LFSCK scanning,"
2785 echo "the layout LFSCK will keep the bad PFL file(s) there without"
2786 echo "scanning its OST-object(s). Then in the second stage scanning,"
2787 echo "the OST will return related OST-object(s) to the MDT as orphan."
2788 echo "And then the LFSCK on the MDT can rebuild the PFL extent with"
2789 echo "the 'orphan(s)' stripe information."
2792 check_mount_and_prep
2794 $LFS setstripe -E 2M -c 1 -E -1 $DIR/$tdir/f0 ||
2795 error "(0) Fail to create PFL $DIR/$tdir/f0"
2797 cat $LUSTRE/tests/test-framework.sh > $DIR/$tdir/f0 ||
2798 error "(1.1) Fail to write $DIR/$tdir/f0"
2800 dd if=$LUSTRE/tests/test-framework.sh of=$DIR/$tdir/f0 bs=1M seek=2 ||
2801 error "(1.2) Fail to write $DIR/$tdir/f0"
2803 cp $DIR/$tdir/f0 $DIR/$tdir/guard
2805 echo "Inject failure stub to simulate bad PFL extent range"
2806 #define OBD_FAIL_LFSCK_BAD_PFL_RANGE 0x162f
2807 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162f
2809 chown 1.1 $DIR/$tdir/f0
2811 cancel_lru_locks mdc
2812 cancel_lru_locks osc
2813 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2815 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 &&
2816 error "(2) Write to bad PFL file should fail"
2818 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
2819 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
2821 for k in $(seq $MDSCOUNT); do
2822 # The LFSCK status query interval is 30 seconds. If some
2823 # LFSCK_NOTIFY RPCs fail or are lost, wait long enough to
2824 # guarantee that the status is synced up.
2825 wait_update_facet mds${k} "$LCTL get_param -n \
2826 mdd.$(facet_svc mds${k}).lfsck_layout |
2827 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2828 error "(4.1) MDS${k} is not the expected 'completed'"
2831 for k in $(seq $OSTCOUNT); do
2832 cur_status=$(do_facet ost${k} $LCTL get_param -n \
2833 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2834 awk '/^status/ { print $2 }')
2835 [ "$cur_status" == "completed" ] ||
2836 error "(4.2) OST${k} Expect 'completed', but got '$cur_status'"
2840 local repaired=$($SHOW_LAYOUT |
2841 awk '/^repaired_orphan/ { print $2 }')
2842 [ $repaired -eq 2 ] ||
2843 error "(5) Fail to repair crashed PFL range: $repaired"
2845 echo "Data in $DIR/$tdir/f0 should not be broken"
2846 diff $DIR/$tdir/f0 $DIR/$tdir/guard ||
2847 error "(6) Data in $DIR/$tdir/f0 is broken"
2849 echo "Write should succeed after LFSCK repairing the bad PFL range"
2850 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 ||
2851 error "(7) Write should succeed after LFSCK"
2853 run_test 18h "LFSCK can repair crashed PFL extent range"
2855 $LCTL set_param debug=-cache > /dev/null
2858 check_mount_and_prep
2859 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2861 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2862 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
2864 echo "foo1" > $DIR/$tdir/a0
2865 $LFS setstripe -E 512K -S 512K -o 0 -E -1 -S 1M $DIR/$tdir/a1 ||
2866 error "(0) Fail to create PFL $DIR/$tdir/a1"
2867 echo "foo2" > $DIR/$tdir/a1
2868 echo "guard" > $DIR/$tdir/a2
2869 cancel_lru_locks osc
2871 echo "Inject failure, then client will offer wrong parent FID when read"
2872 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2873 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2875 #define OBD_FAIL_LFSCK_INVALID_PFID 0x1619
2876 $LCTL set_param fail_loc=0x1619
2878 echo "Read RPC with wrong parent FID should be denied"
2879 cat $DIR/$tdir/a0 && error "(3.1) Read a0 should be denied!"
2880 cat $DIR/$tdir/a1 && error "(3.2) Read a1 should be denied!"
2881 $LCTL set_param fail_loc=0
2883 run_test 19a "OST-object inconsistency self detect"
2886 check_mount_and_prep
2887 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2889 echo "Inject failure stub to make the OST-object to back point to"
2890 echo "non-exist MDT-object"
2892 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2893 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
2895 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
2896 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
2897 echo "foo1" > $DIR/$tdir/f0
2898 $LFS setstripe -E 1M -S 1M -o 0 -E 4M -S 256K $DIR/$tdir/f1 ||
2899 error "(0) Fail to create PFL $DIR/$tdir/f1"
2900 echo "foo2" > $DIR/$tdir/f1
2901 cancel_lru_locks osc
2902 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
2904 do_facet ost1 $LCTL set_param -n \
2905 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
2906 echo "Nothing should be fixed since self detect and repair is disabled"
2907 local repaired=$(do_facet ost1 $LCTL get_param -n \
2908 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2909 awk '/^repaired/ { print $2 }')
2910 [ $repaired -eq 0 ] ||
2911 error "(1) Expected 0 repaired, but got $repaired"
2913 echo "Read RPC with right parent FID should be accepted,"
2914 echo "and cause parent FID on OST to be fixed"
2916 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2917 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2919 cat $DIR/$tdir/f0 || error "(2.1) Read f0 should not be denied!"
2920 cat $DIR/$tdir/f1 || error "(2.2) Read f1 should not be denied!"
# Re-read the repaired counter on ost1 after f0 and f1 have both been read.
# The check requires exactly 2, so the failure message states 2 as well.
2922 repaired=$(do_facet ost1 $LCTL get_param -n \
2923 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2924 awk '/^repaired/ { print $2 }')
2925 [ $repaired -eq 2 ] ||
2926 error "(3) Expected 2 repaired, but got $repaired"
2928 run_test 19b "OST-object inconsistency self repair"
2930 PATTERN_WITH_HOLE="40000001"
2931 PATTERN_WITHOUT_HOLE="1"
# Test 20a: files lose their MDT-object (and, for f1..f3, one OST-object as
# well); layout LFSCK must re-create them under .lustre/lost+found/MDT0000/
# as <FID>-R-0, marking a missing stripe as a LOV EA hole, and the recovered
# files must behave sanely with ordinary tools.
# Needs at least 2 OSTs; f3 is only used when OSTCOUNT > 2.
2934 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
2937 echo "The target MDT-object and some of its OST-object are lost."
2938 echo "The LFSCK should find out the left OST-objects and re-create"
2939 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
2940 echo "with the partial OST-objects (LOV EA hole)."
2942 echo "New client can access the file with LOV EA hole via normal"
2943 echo "system tools or commands without crash the system."
2945 echo "For old client, even though it cannot access the file with"
2946 echo "LOV EA hole, it should not cause the system crash."
2949 check_mount_and_prep
# Create a1 on MDT index 0 and give it a default layout with 1M stripes
# starting at OST index 0; '-c 3' is used when OSTCOUNT > 2 and '-c 2' is the
# setting for the 2-OST case.
2950 $LFS mkdir -i 0 $DIR/$tdir/a1
2951 if [ $OSTCOUNT -gt 2 ]; then
2952 $LFS setstripe -c 3 -i 0 -S 1M $DIR/$tdir/a1
2955 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a1
2959 # 256 blocks on the stripe0.
2960 # 1 block on the stripe1 for 2 OSTs case.
2961 # 256 blocks on the stripe1 for other cases.
2962 # 1 block on the stripe2 if OSTs > 2
2963 dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
2964 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
2965 dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount
# Record the FIDs now: the recovered orphans are looked up below by the name
# <FID>-R-0 after the original names have been removed.
2967 local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
2968 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2969 local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)
2972 $LFS getstripe $DIR/$tdir/a1/f0
2974 $LFS getstripe $DIR/$tdir/a1/f1
2976 $LFS getstripe $DIR/$tdir/a1/f2
2978 if [ $OSTCOUNT -gt 2 ]; then
2979 dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
2980 fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
2982 $LFS getstripe $DIR/$tdir/a1/f3
2985 cancel_lru_locks osc
# Fault injection on mds1: each 'rm' below runs with a fail_loc (and, for
# f2/f3, a fail_val selecting the stripe index named in the echo) so that
# the removal leaves orphan OST-objects behind.
2987 echo "Inject failure..."
2988 echo "To simulate f0 lost MDT-object"
2989 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2990 do_facet mds1 $LCTL set_param fail_loc=0x1616
2991 rm -f $DIR/$tdir/a1/f0
2993 echo "To simulate f1 lost MDT-object and OST-object0"
2994 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
2995 do_facet mds1 $LCTL set_param fail_loc=0x161a
2996 rm -f $DIR/$tdir/a1/f1
2998 echo "To simulate f2 lost MDT-object and OST-object1"
2999 do_facet mds1 $LCTL set_param fail_val=1
3000 rm -f $DIR/$tdir/a1/f2
3002 if [ $OSTCOUNT -gt 2 ]; then
3003 echo "To simulate f3 lost MDT-object and OST-object2"
3004 do_facet mds1 $LCTL set_param fail_val=2
3005 rm -f $DIR/$tdir/a1/f3
# Unmount the client and clear the fault injection before LFSCK is started.
3008 umount_client $MOUNT
3011 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
3013 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3014 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
3016 for k in $(seq $MDSCOUNT); do
3017 # The LFSCK status query interval is 30 seconds. For the case
3018 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3019 # time to guarantee the status sync up.
3020 wait_update_facet mds${k} "$LCTL get_param -n \
3021 mdd.$(facet_svc mds${k}).lfsck_layout |
3022 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3023 error "(2) MDS${k} is not the expected 'completed'"
# Unlike the MDTs, the OSTs are sampled once rather than polled.
3026 for k in $(seq $OSTCOUNT); do
3027 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3028 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3029 awk '/^status/ { print $2 }')
3030 [ "$cur_status" == "completed" ] ||
3031 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Check the repaired_orphan counter reported by mds1: 9 when OSTCOUNT > 2,
# 4 in the 2-OST case.
# NOTE(review): presumably one per surviving OST-object (3+2+2+2 with three
# stripes, 2+1+1 with two) -- confirm.
3034 local repaired=$(do_facet mds1 $LCTL get_param -n \
3035 mdd.$(facet_svc mds1).lfsck_layout |
3036 awk '/^repaired_orphan/ { print $2 }')
3037 if [ $OSTCOUNT -gt 2 ]; then
3038 [ $repaired -eq 9 ] ||
3039 error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
3041 [ $repaired -eq 4 ] ||
3042 error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
3045 mount_client $MOUNT || error "(5.0) Fail to start client!"
# NOTE(review): LOV_PATTERN_F_HOLE is assigned without 'local' and is not
# referenced in the visible part of this test.
3047 LOV_PATTERN_F_HOLE=0x40000000
3050 # ${fid0}-R-0 is the old f0
# f0 lost only its MDT-object, so the recovered file must have a complete
# layout and accept read, append, chown, touch and unlink.
3052 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
3053 echo "Check $name, which is the old f0"
3055 $LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"
3057 local pattern=$($LFS getstripe -L $name)
3058 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3059 error "(5.2) NOT expect pattern flag hole, but got $pattern"
3061 local stripes=$($LFS getstripe -c $name)
3062 if [ $OSTCOUNT -gt 2 ]; then
3063 [ $stripes -eq 3 ] ||
3064 error "(5.3.1) expect the stripe count is 3, but got $stripes"
3066 [ $stripes -eq 2 ] ||
3067 error "(5.3.2) expect the stripe count is 2, but got $stripes"
3070 local size=$(stat $name | awk '/Size:/ { print $2 }')
3071 [ $size -eq $((4096 * $bcount)) ] ||
3072 error "(5.4) expect the size $((4096 * $bcount)), but got $size"
3074 cat $name > /dev/null || error "(5.5) cannot read $name"
3076 echo "dummy" >> $name || error "(5.6) cannot write $name"
3078 chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"
3080 touch $name || error "(5.8) cannot touch $name"
3082 rm -f $name || error "(5.9) cannot unlink $name"
3085 # ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
3087 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3088 if [ $OSTCOUNT -gt 2 ]; then
3089 echo "Check $name, it contains the old f1's stripe1 and stripe2"
3091 echo "Check $name, it contains the old f1's stripe1"
3094 $LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"
3096 pattern=$($LFS getstripe -L $name)
3097 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3098 error "(6.2) expect pattern flag hole, but got $pattern"
3100 stripes=$($LFS getstripe -c $name)
3101 if [ $OSTCOUNT -gt 2 ]; then
3102 [ $stripes -eq 3 ] ||
3103 error "(6.3.1) expect the stripe count is 3, but got $stripes"
3105 [ $stripes -eq 2 ] ||
3106 error "(6.3.2) expect the stripe count is 2, but got $stripes"
3109 size=$(stat $name | awk '/Size:/ { print $2 }')
3110 [ $size -eq $((4096 * $bcount)) ] ||
3111 error "(6.4) expect the size $((4096 * $bcount)), but got $size"
3113 cat $name > /dev/null && error "(6.5) normal read $name should fail"
# Copy the file with conv=noerror so dd keeps going past read failures, and
# count the "Input/output error" lines; 256 blocks of 4096 bytes equal one
# 1M stripe.
3115 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3116 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3119 [ $failures -eq 256 ] ||
3120 error "(6.6) expect 256 IO failures, but get $failures"
3122 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3123 [ $size -eq $((4096 * $bcount)) ] ||
3124 error "(6.7) expect the size $((4096 * $bcount)), but got $size"
# Block 0 is expected to hit the hole (write must fail); block 300 is
# expected to land on a surviving stripe (write must succeed).
3126 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
3127 error "(6.8) write to the LOV EA hole should fail"
3129 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
3130 error "(6.9) write to normal stripe should NOT fail"
3132 echo "foo" >> $name && error "(6.10) append write $name should fail"
3134 chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"
3136 touch $name || error "(6.12) cannot touch $name"
3138 rm -f $name || error "(6.13) cannot unlink $name"
3141 # ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
3143 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
3144 if [ $OSTCOUNT -gt 2 ]; then
3145 echo "Check $name, it contains the old f2's stripe0 and stripe2"
3147 echo "Check $name, it contains the old f2's stripe0"
3150 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
3152 pattern=$($LFS getstripe -L $name)
3153 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3154 error "(7.2) expect pattern flag hole, but got $pattern"
3156 stripes=$($LFS getstripe -c $name)
3157 size=$(stat $name | awk '/Size:/ { print $2 }')
# The checks labelled 7.x.1 / 7.8.0 / 7.8.1 / 7.8.3 belong to the
# OSTCOUNT > 2 case; those labelled 7.x.2 belong to the 2-OST case, where the
# lost stripe1 is the last stripe and the visible size is only 256 blocks.
3158 if [ $OSTCOUNT -gt 2 ]; then
3159 [ $stripes -eq 3 ] ||
3160 error "(7.3.1) expect the stripe count is 3, but got $stripes"
3162 [ $size -eq $((4096 * $bcount)) ] ||
3163 error "(7.4.1) expect size $((4096 * $bcount)), but got $size"
3165 cat $name > /dev/null &&
3166 error "(7.5.1) normal read $name should fail"
3168 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3169 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3171 [ $failures -eq 256 ] ||
3172 error "(7.6) expect 256 IO failures, but get $failures"
3174 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3175 [ $size -eq $((4096 * $bcount)) ] ||
3176 error "(7.7) expect the size $((4096 * $bcount)), but got $size"
3178 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3179 seek=300 && error "(7.8.0) write to the LOV EA hole should fail"
3181 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
3182 error "(7.8.1) write to normal stripe should NOT fail"
3184 echo "foo" >> $name &&
3185 error "(7.8.3) append write $name should fail"
3187 chown $RUNAS_ID:$RUNAS_GID $name ||
3188 error "(7.9.1) cannot chown on $name"
3190 touch $name || error "(7.10.1) cannot touch $name"
3192 [ $stripes -eq 2 ] ||
3193 error "(7.3.2) expect the stripe count is 2, but got $stripes"
3196 [ $size -eq $((4096 * (256 + 0))) ] ||
3197 error "(7.4.2) expect the size $((4096 * 256)), but got $size"
3199 cat $name > /dev/null &&
3200 error "(7.5.2) normal read $name should fail"
3202 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3203 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3204 [ $failures -eq 256 ] ||
3205 error "(7.6.2) expect 256 IO failures, but get $failures"
3208 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3209 [ $size -eq $((4096 * $bcount)) ] ||
3210 error "(7.7.2) expect the size $((4096 * $bcount)), got $size"
3212 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3213 seek=256 && error "(7.8.2) write to the LOV EA hole should fail"
3215 chown $RUNAS_ID:$RUNAS_GID $name ||
3216 error "(7.9.2) cannot chown on $name"
3218 touch $name || error "(7.10.2) cannot touch $name"
3221 rm -f $name || error "(7.11) cannot unlink $name"
# The f3 checks below only apply when a third stripe exists.
3223 [ $OSTCOUNT -le 2 ] && return
3226 # ${fid3}-R-0 should contain the old f3's stripe0 and stripe1
3228 name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
3229 echo "Check $name, which contains the old f3's stripe0 and stripe1"
3231 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
3233 pattern=$($LFS getstripe -L $name)
3234 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3235 error "(8.2) expect pattern flag hole, but got $pattern"
3237 stripes=$($LFS getstripe -c $name)
3238 [ $stripes -eq 3 ] ||
3239 error "(8.3) expect the stripe count is 3, but got $stripes"
3241 size=$(stat $name | awk '/Size:/ { print $2 }')
3243 [ $size -eq $((4096 * (256 + 256 + 0))) ] ||
3244 error "(8.4) expect the size $((4096 * 512)), but got $size"
3246 cat $name > /dev/null &&
3247 error "(8.5) normal read $name should fail"
3249 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3250 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3252 [ $failures -eq 256 ] ||
3253 error "(8.6) expect 256 IO failures, but get $failures"
3256 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3257 [ $size -eq $((4096 * $bcount)) ] ||
3258 error "(8.7) expect the size $((4096 * $bcount)), but got $size"
3260 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3261 seek=512 && error "(8.8) write to the LOV EA hole should fail"
3263 chown $RUNAS_ID:$RUNAS_GID $name ||
3264 error "(8.9) cannot chown on $name"
3266 touch $name || error "(8.10) cannot touch $name"
3268 rm -f $name || error "(8.11) cannot unlink $name"
3270 run_test 20a "Handle the orphan with dummy LOV EA slot properly"
# Test 20b: PFL variant of the LOV-EA-hole orphan test.  Three files with two
# components each ([0, 2M) and [2M, EOF), 2 stripes of 1M per component) lose
# their MDT-object; f1 additionally loses the first and f2 the second
# OST-object of every component.  Layout LFSCK must re-create them under
# .lustre/lost+found/MDT0000/ as <FID>-R-0 and the results are checked
# component by component.  Needs at least 2 OSTs.
# Every failure label is unique and matches the section it is raised in, so
# a failing step can be located from the message alone.
3273 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
3276 echo "The target MDT-object and some of its OST-object are lost."
3277 echo "The LFSCK should find out the left OST-objects and re-create"
3278 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
3279 echo "with the partial OST-objects (LOV EA hole)."
3281 echo "New client can access the file with LOV EA hole via normal"
3282 echo "system tools or commands without crash the system - PFL case."
3285 check_mount_and_prep
3287 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f0 ||
3288 error "(0) Fail to create PFL file $DIR/$tdir/f0"
3289 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f1 ||
3290 error "(1) Fail to create PFL file $DIR/$tdir/f1"
3291 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f2 ||
3292 error "(2) Fail to create PFL file $DIR/$tdir/f2"
# 769 blocks of 4096 bytes: 3M plus one block, so both components hold data.
3294 local bcount=$((256 * 3 + 1))
3296 dd if=/dev/zero of=$DIR/$tdir/f0 bs=4096 count=$bcount
3297 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4096 count=$bcount
3298 dd if=/dev/zero of=$DIR/$tdir/f2 bs=4096 count=$bcount
# Record the FIDs now: the recovered orphans are looked up below by the name
# <FID>-R-0.
3300 local fid0=$($LFS path2fid $DIR/$tdir/f0)
3301 local fid1=$($LFS path2fid $DIR/$tdir/f1)
3302 local fid2=$($LFS path2fid $DIR/$tdir/f2)
3305 $LFS getstripe $DIR/$tdir/f0
3307 $LFS getstripe $DIR/$tdir/f1
3309 $LFS getstripe $DIR/$tdir/f2
3311 cancel_lru_locks mdc
3312 cancel_lru_locks osc
# Fault injection on $SINGLEMDS; the fail_loc/fail_val pairs select which
# objects are reported as lost in the echo lines.
3314 echo "Inject failure..."
3315 echo "To simulate f0 lost MDT-object"
3316 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
3317 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1616
3320 echo "To simulate the case of f1 lost MDT-object and "
3321 echo "the first OST-object in each PFL component"
3322 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
3323 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161a
3326 echo "To simulate the case of f2 lost MDT-object and "
3327 echo "the second OST-object in each PFL component"
3328 do_facet $SINGLEMDS $LCTL set_param fail_val=1
3333 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
3335 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3336 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
3338 for k in $(seq $MDSCOUNT); do
3339 # The LFSCK status query interval is 30 seconds. For the case
3340 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3341 # time to guarantee the status sync up.
3342 wait_update_facet mds${k} "$LCTL get_param -n \
3343 mdd.$(facet_svc mds${k}).lfsck_layout |
3344 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3345 error "(4) MDS${k} is not the expected 'completed'"
# Unlike the MDTs, the OSTs are sampled once rather than polled.
3348 for k in $(seq $OSTCOUNT); do
3349 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3350 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3351 awk '/^status/ { print $2 }')
3352 [ "$cur_status" == "completed" ] ||
3353 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# NOTE(review): 8 presumably equals the surviving OST-objects (4 for f0,
# 2 each for f1 and f2) -- confirm.
3356 local repaired=$(do_facet mds1 $LCTL get_param -n \
3357 mdd.$(facet_svc mds1).lfsck_layout |
3358 awk '/^repaired_orphan/ { print $2 }')
3359 [ $repaired -eq 8 ] ||
3360 error "(6) Expect 8 fixed on mds1, but got: $repaired"
3363 # ${fid0}-R-0 is the old f0
# f0 lost only its MDT-object: both components must be complete (no hole),
# keep their extents, and the file must accept normal I/O.
3365 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
3366 echo "Check $name, which is the old f0"
3368 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
3370 local pattern=$($LFS getstripe -L -I1 $name)
3371 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3372 error "(7.2.1) NOT expect pattern flag hole, but got $pattern"
3374 pattern=$($LFS getstripe -L -I2 $name)
3375 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3376 error "(7.2.2) NOT expect pattern flag hole, but got $pattern"
3378 local stripes=$($LFS getstripe -c -I1 $name)
3379 [ $stripes -eq 2 ] ||
3380 error "(7.3.1) expect 2 stripes, but got $stripes"
3382 stripes=$($LFS getstripe -c -I2 $name)
3383 [ $stripes -eq 2 ] ||
3384 error "(7.3.2) expect 2 stripes, but got $stripes"
3386 local e_start=$($LFS getstripe -I1 $name |
3387 awk '/lcme_extent.e_start:/ { print $2 }')
3388 [ $e_start -eq 0 ] ||
3389 error "(7.4.1) expect the COMP1 start at 0, got $e_start"
3391 local e_end=$($LFS getstripe -I1 $name |
3392 awk '/lcme_extent.e_end:/ { print $2 }')
3393 [ $e_end -eq 2097152 ] ||
3394 error "(7.4.2) expect the COMP1 end at 2097152, got $e_end"
3396 e_start=$($LFS getstripe -I2 $name |
3397 awk '/lcme_extent.e_start:/ { print $2 }')
3398 [ $e_start -eq 2097152 ] ||
3399 error "(7.5.1) expect the COMP2 start at 2097152, got $e_start"
3401 e_end=$($LFS getstripe -I2 $name |
3402 awk '/lcme_extent.e_end:/ { print $2 }')
3403 [ "$e_end" = "EOF" ] ||
3404 error "(7.5.2) expect the COMP2 end at (EOF), got $e_end"
3406 local size=$(stat $name | awk '/Size:/ { print $2 }')
3407 [ $size -eq $((4096 * $bcount)) ] ||
3408 error "(7.6) expect the size $((4096 * $bcount)), but got $size"
3410 cat $name > /dev/null || error "(7.7) cannot read $name"
3412 echo "dummy" >> $name || error "(7.8) cannot write $name"
3414 chown $RUNAS_ID:$RUNAS_GID $name || error "(7.9) cannot chown on $name"
3416 touch $name || error "(7.10) cannot touch $name"
3418 rm -f $name || error "(7.11) cannot unlink $name"
3421 # ${fid1}-R-0 contains the old f1's second stripe in each COMP
3423 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3424 echo "Check $name, it contains f1's second OST-object in each COMP"
3426 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
3428 pattern=$($LFS getstripe -L -I1 $name)
3429 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3430 error "(8.2.1) expect pattern flag hole, but got $pattern"
3432 pattern=$($LFS getstripe -L -I2 $name)
3433 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3434 error "(8.2.2) expect pattern flag hole, but got $pattern"
3436 stripes=$($LFS getstripe -c -I1 $name)
3437 [ $stripes -eq 2 ] ||
3438 error "(8.3.1) expect 2 stripes, but got $stripes"
3440 stripes=$($LFS getstripe -c -I2 $name)
3441 [ $stripes -eq 2 ] ||
3442 error "(8.3.2) expect 2 stripes, but got $stripes"
3444 e_start=$($LFS getstripe -I1 $name |
3445 awk '/lcme_extent.e_start:/ { print $2 }')
3446 [ $e_start -eq 0 ] ||
3447 error "(8.4.1) expect the COMP1 start at 0, got $e_start"
3449 e_end=$($LFS getstripe -I1 $name |
3450 awk '/lcme_extent.e_end:/ { print $2 }')
3451 [ $e_end -eq 2097152 ] ||
3452 error "(8.4.2) expect the COMP1 end at 2097152, got $e_end"
3454 e_start=$($LFS getstripe -I2 $name |
3455 awk '/lcme_extent.e_start:/ { print $2 }')
3456 [ $e_start -eq 2097152 ] ||
3457 error "(8.5.1) expect the COMP2 start at 2097152, got $e_start"
3459 e_end=$($LFS getstripe -I2 $name |
3460 awk '/lcme_extent.e_end:/ { print $2 }')
3461 [ "$e_end" = "EOF" ] ||
3462 error "(8.5.2) expect the COMP2 end at (EOF), got $e_end"
3464 size=$(stat $name | awk '/Size:/ { print $2 }')
3465 [ $size -eq $((4096 * $bcount)) ] ||
3466 error "(8.6) expect the size $((4096 * $bcount)), but got $size"
3468 cat $name > /dev/null && error "(8.7) normal read $name should fail"
# Copy with conv=noerror so dd continues past read failures, and count the
# "Input/output error" lines it reports.
3470 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3471 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3473 # The first stripe in each COMP was lost
3474 [ $failures -eq 512 ] ||
3475 error "(8.8) expect 512 IO failures, but get $failures"
3477 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3478 [ $size -eq $((4096 * $bcount)) ] ||
3479 error "(8.9) expect the size $((4096 * $bcount)), but got $size"
3481 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
3482 error "(8.10) write to the LOV EA hole should fail"
3484 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
3485 error "(8.11) write to normal stripe should NOT fail"
3487 echo "foo" >> $name && error "(8.12) append write $name should fail"
3489 chown $RUNAS_ID:$RUNAS_GID $name || error "(8.13) cannot chown on $name"
3491 touch $name || error "(8.14) cannot touch $name"
3493 rm -f $name || error "(8.15) cannot unlink $name"
3496 # ${fid2}-R-0 contains the old f2's first stripe in each COMP
3498 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
3499 echo "Check $name, it contains f2's first stripe in each COMP"
3501 $LFS getstripe -v $name || error "(9.1) cannot getstripe on $name"
3503 pattern=$($LFS getstripe -L -I1 $name)
3504 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3505 error "(9.2.1) expect pattern flag hole, but got $pattern"
3507 pattern=$($LFS getstripe -L -I2 $name)
3508 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3509 error "(9.2.2) expect pattern flag hole, but got $pattern"
3511 stripes=$($LFS getstripe -c -I1 $name)
3512 [ $stripes -eq 2 ] ||
3513 error "(9.3.1) expect 2 stripes, but got $stripes"
3515 stripes=$($LFS getstripe -c -I2 $name)
3516 [ $stripes -eq 2 ] ||
3517 error "(9.3.2) expect 2 stripes, but got $stripes"
3519 e_start=$($LFS getstripe -I1 $name |
3520 awk '/lcme_extent.e_start:/ { print $2 }')
3521 [ $e_start -eq 0 ] ||
3522 error "(9.4.1) expect the COMP1 start at 0, got $e_start"
3524 e_end=$($LFS getstripe -I1 $name |
3525 awk '/lcme_extent.e_end:/ { print $2 }')
3526 [ $e_end -eq 2097152 ] ||
3527 error "(9.4.2) expect the COMP1 end at 2097152, got $e_end"
3529 e_start=$($LFS getstripe -I2 $name |
3530 awk '/lcme_extent.e_start:/ { print $2 }')
3531 [ $e_start -eq 2097152 ] ||
3532 error "(9.5.1) expect the COMP2 start at 2097152, got $e_start"
3534 e_end=$($LFS getstripe -I2 $name |
3535 awk '/lcme_extent.e_end:/ { print $2 }')
3536 [ "$e_end" = "EOF" ] ||
3537 error "(9.5.2) expect the COMP2 end at (EOF), got $e_end"
3539 size=$(stat $name | awk '/Size:/ { print $2 }')
3540 # The second stripe in COMP was lost, so we do not know there
3541 # have ever been some data before. 'stat' will regard it as
3542 # no data on the lost stripe.
3544 [ $size -eq $((4096 * $bcount)) ] ||
3545 error "(9.6) expect size $((4096 * $bcount)), but got $size"
3547 cat $name > /dev/null &&
3548 error "(9.7) normal read $name should fail"
3550 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3551 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3552 [ $failures -eq 512 ] ||
3553 error "(9.8) expect 512 IO failures, but get $failures"
3555 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3556 # The second stripe in COMP was lost, so we do not know there
3557 # have ever been some data before. Since 'dd' skip failure,
3558 # it will regard the lost stripe contains data.
3560 [ $size -eq $((4096 * $bcount)) ] ||
3561 error "(9.9) expect the size $((4096 * $bcount)), but got $size"
# For f2 the hole is the second stripe, so block 300 must fail and block 0
# must succeed -- the mirror image of the f1 checks above.
3563 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3564 seek=300 && error "(9.10) write to the LOV EA hole should fail"
3566 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
3567 error "(9.11) write to normal stripe should NOT fail"
3569 echo "foo" >> $name &&
3570 error "(9.12) append write $name should fail"
3572 chown $RUNAS_ID:$RUNAS_GID $name ||
3573 error "(9.13) cannot chown on $name"
3575 touch $name || error "(9.14) cannot touch $name"
3577 rm -f $name || error "(9.15) cannot unlink $name"
3579 run_test 20b "Handle the orphan with dummy LOV EA slot properly - PFL case"
# Test 21: starting LFSCK on MDT0000 without a '-t' type must bring up both
# the namespace and the layout component; both are expected to report
# 'scanning-phase1' right after the start.  Skipped when the MDS is older
# than 2.5.59.
3582 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.5.59) ]] &&
3583 skip "ignore the test if MDS is older than 2.5.59" && return
3585 check_mount_and_prep
3586 createmany -o $DIR/$tdir/f 100 || error "(0) Fail to create 100 files"
# NOTE(review): '-s 1' presumably limits the scan speed so that LFSCK is
# still in phase 1 when the status is sampled below -- confirm.
3588 echo "Start all LFSCK components by default (-s 1)"
3589 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -s 1 -r ||
3590 error "Fail to start LFSCK"
# $SHOW_NAMESPACE and $SHOW_LAYOUT are defined outside this block; their
# output is expected to contain a line starting with 'status'.
3592 echo "namespace LFSCK should be in 'scanning-phase1' status"
3593 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
3594 [ "$STATUS" == "scanning-phase1" ] ||
3595 error "Expect namespace 'scanning-phase1', but got '$STATUS'"
3597 echo "layout LFSCK should be in 'scanning-phase1' status"
3598 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
3599 [ "$STATUS" == "scanning-phase1" ] ||
3600 error "Expect layout 'scanning-phase1', but got '$STATUS'"
3602 echo "Stop all LFSCK components by default"
3603 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 ||
3604 error "Fail to stop LFSCK"
3606 run_test 21 "run all LFSCK components by default"
# Test 22a: a directory created on MDT0 under a parent on MDT1 gets a bad
# ".." entry through fault injection (it points at 'guard', which is then
# removed).  Namespace LFSCK must repair exactly one unmatched pair, after
# which the directory can be listed.  Needs at least 2 MDTs.
3609 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# The inner quotes around ".." are escaped so that they appear in the log.
3612 echo "The parent_A references the child directory via some name entry,"
3613 echo "but the child directory back references another parent_B via its"
3614 echo "\"..\" name entry. The parent_B does not exist. Then the namespace"
3615 echo "LFSCK will repair the child directory's \"..\" name entry."
3618 check_mount_and_prep
3620 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
3621 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
# The fail_loc is active only while 'dummy' is created, then cleared.
3623 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
3624 echo "The dummy's dotdot name entry references the guard."
3625 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
3626 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
3627 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
3628 error "(3) Fail to mkdir on MDT0"
3629 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Removing 'guard' leaves dummy's ".." pointing at a parent that no longer
# exists.
3631 rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"
3633 echo "Trigger namespace LFSCK to repair unmatched pairs"
3634 $START_NAMESPACE -A -r ||
3635 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): the trailing 6 is presumably the step number that the helper
# uses in its own failure message (the labels here jump from 5 to 7) --
# confirm against wait_all_targets_blocked.
3637 wait_all_targets_blocked namespace completed 6
3639 local repaired=$($SHOW_NAMESPACE |
3640 awk '/^unmatched_pairs_repaired/ { print $2 }')
3641 [ $repaired -eq 1 ] ||
3642 error "(7) Fail to repair unmatched pairs: $repaired"
3644 echo "'ls' should success after namespace LFSCK repairing"
3645 ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
3646 error "(8) ls should success."
3648 run_test 22a "LFSCK can repair unmatched pairs (1)"
# Test 22b: like 22a, but the parent referenced by the bad ".." entry
# ('guard') still exists and merely lacks the matching name entry, so the
# child's linkEA is wrong as well.  Before the repair 'lfs fid2path' must not
# resolve dummy's FID to its real path; after namespace LFSCK has repaired
# one unmatched pair it must.  Needs at least 2 MDTs.
3651 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# The inner quotes around ".." are escaped so that they appear in the log.
3654 echo "The parent_A references the child directory via the name entry_B,"
3655 echo "but the child directory back references another parent_C via its"
3656 echo "\"..\" name entry. The parent_C exists, but there is no the name"
3657 echo "entry_B under the parent_C. Then the namespace LFSCK will repair"
3658 echo "the child directory's \"..\" name entry and its linkEA."
3661 check_mount_and_prep
3663 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
3664 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
# The fail_loc is active only while 'dummy' is created, then cleared.
3666 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
3667 echo "and bad linkEA. The dummy's dotdot name entry references the"
3668 echo "guard. The dummy's linkEA references n non-exist name entry."
3669 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
3670 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
3671 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
3672 error "(3) Fail to mkdir on MDT0"
3673 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Keep the FID so the same object can be resolved again after the repair.
3675 local dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
3676 echo "fid2path should NOT work on the dummy's FID $dummyfid"
3677 local dummyname=$($LFS fid2path $DIR $dummyfid)
3678 [ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
3679 error "(4) fid2path works unexpectedly."
3681 echo "Trigger namespace LFSCK to repair unmatched pairs"
3682 $START_NAMESPACE -A -r ||
3683 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): the trailing 6 is presumably the step number that the helper
# uses in its own failure message (the labels here jump from 5 to 7) --
# confirm against wait_all_targets_blocked.
3685 wait_all_targets_blocked namespace completed 6
3687 local repaired=$($SHOW_NAMESPACE |
3688 awk '/^unmatched_pairs_repaired/ { print $2 }')
3689 [ $repaired -eq 1 ] ||
3690 error "(7) Fail to repair unmatched pairs: $repaired"
3692 echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
# dummyname is already declared local above; plain assignment is enough.
3693 dummyname=$($LFS fid2path $DIR $dummyfid)
3694 [ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
3695 error "(8) fid2path does not work"
3697 run_test 22b "LFSCK can repair unmatched pairs (2)"
# Test 23a: a name entry on MDT0 (d0/d1) is left dangling because the
# directory object on MDT1 is removed under fault injection.  A first
# namespace LFSCK run must count one dangling entry while 'ls' keeps failing;
# a second run with '-C' must make 'ls' succeed.  Needs at least 2 MDTs.
3700 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3703 echo "The name entry is there, but the MDT-object for such name "
3704 echo "entry does not exist. The namespace LFSCK should find out "
3705 echo "and repair the inconsistency as required."
3708 check_mount_and_prep
3710 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3711 $LFS mkdir -i 1 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT1"
# The fail_loc is set on mds2 only for the duration of the rmdir.
3713 echo "Inject failure stub on MDT1 to simulate dangling name entry"
3714 #define OBD_FAIL_LFSCK_DANGLING2 0x1620
3715 do_facet mds2 $LCTL set_param fail_loc=0x1620
3716 rmdir $DIR/$tdir/d0/d1 || error "(3) Fail to rmdir d1"
3717 do_facet mds2 $LCTL set_param fail_loc=0
3719 echo "'ls' should fail because of dangling name entry"
3720 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(4) ls should fail."
3722 echo "Trigger namespace LFSCK to find out dangling name entry"
3723 $START_NAMESPACE -A -r ||
3724 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): the trailing number passed to wait_all_targets_blocked (6
# here, 10 below) presumably serves as the step number in that helper's
# failure message -- confirm.
3726 wait_all_targets_blocked namespace completed 6
3728 local repaired=$($SHOW_NAMESPACE |
3729 awk '/^dangling_repaired/ { print $2 }')
3730 [ $repaired -eq 1 ] ||
3731 error "(7) Fail to repair dangling name entry: $repaired"
3733 echo "'ls' should fail because not re-create MDT-object by default"
3734 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(8) ls should fail."
# NOTE(review): '-C' presumably asks LFSCK to re-create the missing
# MDT-object, which is why 'ls' is expected to work afterwards -- confirm.
3736 echo "Trigger namespace LFSCK again to repair dangling name entry"
3737 $START_NAMESPACE -A -r -C ||
3738 error "(9) Fail to start LFSCK for namespace"
3740 wait_all_targets_blocked namespace completed 10
3742 repaired=$($SHOW_NAMESPACE |
3743 awk '/^dangling_repaired/ { print $2 }')
3744 [ $repaired -eq 1 ] ||
3745 error "(11) Fail to repair dangling name entry: $repaired"
3747 echo "'ls' should success after namespace LFSCK repairing"
3748 ls -ail $DIR/$tdir/d0/d1 > /dev/null || error "(12) ls should success."
3750 run_test 23a "LFSCK can repair dangling name entry (1)"
# Test 23b: a hard link 'foo' to f0 is created under fault injection so that
# its name entry refers to f1's object instead; f1 is then unlinked, leaving
# 'foo' dangling.  Namespace LFSCK started with '-C' must report one dangling
# repair and one multiple-linked repair, and 'foo' must end up with f0's
# content ("dummy").
3754 echo "The objectA has multiple hard links, one of them corresponding"
3755 echo "to the name entry_B. But there is something wrong for the name"
3756 echo "entry_B and cause entry_B to references non-exist object_C."
3757 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3758 echo "as dangling, and re-create the lost object_C. When the LFSCK"
3759 echo "comes to the second-stage scanning, it will find that the"
3760 echo "former re-creating object_C is not proper, and will try to"
3761 echo "replace the object_C with the real object_A."
3764 check_mount_and_prep
3766 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3767 $LFS path2fid $DIR/$tdir/d0
# Filler files t0..t9; they are unlinked again just before f1 is removed.
3769 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
3771 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3772 $LFS path2fid $DIR/$tdir/d0/f0
3774 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
3775 $LFS path2fid $DIR/$tdir/d0/f1
# Compare the first ':'-separated FID field (the sequence) of f0 and f1.
3777 local SEQ0=$($LFS path2fid $DIR/$tdir/d0/f0 | awk -F':' '{print $1}')
3778 local SEQ1=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $1}')
3780 if [ "$SEQ0" != "$SEQ1" ]; then
3781 # To guarantee that the f0 and f1 are in the same FID seq
3782 rm -f $DIR/$tdir/d0/f0 ||
3783 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
3784 echo "dummy" > $DIR/$tdir/d0/f0 ||
3785 error "(3.2) Fail to touch on MDT0"
3786 $LFS path2fid $DIR/$tdir/d0/f0
# Take the second FID field (the object ID) of f1 and re-express it as a
# decimal number, which is the form passed to fail_val below.
3789 local OID=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $2}')
3790 OID=$(printf %d $OID)
3792 echo "Inject failure stub on MDT0 to simulate dangling name entry"
3793 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
3794 do_facet $SINGLEMDS $LCTL set_param fail_val=$OID fail_loc=0x1621
3795 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
3796 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3798 # If there is creation after the dangling injection, it may re-use
3799 # the just released local object (inode) that is referenced by the
3800 # dangling name entry. It will fail the dangling injection.
3801 # So before deleting the target object for the dangling name entry,
3802 # remove some other objects to avoid the target object being reused
3803 # by some potential creations. LU-7429
3804 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
3806 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
3808 echo "'ls' should fail because of dangling name entry"
3809 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3810 error "(6) ls should fail."
3812 echo "Trigger namespace LFSCK to find out dangling name entry"
3813 $START_NAMESPACE -r -C ||
3814 error "(7) Fail to start LFSCK for namespace"
# Poll the namespace LFSCK status on $SINGLEMDS until it reads 'completed';
# 32 is the limit handed to wait_update_facet.
3816 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3817 mdd.${MDT_DEV}.lfsck_namespace |
3818 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3820 error "(8) unexpected status"
3823 local repaired=$($SHOW_NAMESPACE |
3824 awk '/^dangling_repaired/ { print $2 }')
3825 [ $repaired -eq 1 ] ||
3826 error "(9) Fail to repair dangling name entry: $repaired"
3828 repaired=$($SHOW_NAMESPACE |
3829 awk '/^multiple_linked_repaired/ { print $2 }')
3830 [ $repaired -eq 1 ] ||
3831 error "(10) Fail to drop the former created object: $repaired"
# 'foo' must now show the content written to f0.
3833 local data=$(cat $DIR/$tdir/d0/foo)
3834 [ "$data" == "dummy" ] ||
3835 error "(11) The $DIR/$tdir/d0/foo is not recovered: $data"
3837 run_test 23b "LFSCK can repair dangling name entry (2)"
# test_23c: namespace LFSCK re-creates the object for a dangling name entry,
# but must not replace that object later once a client has modified it.
#
# Steps:
#  1. Create f0 ("dummy") and f1 ("dead") under d0, re-creating f0 if the two
#     files did not land in the same FID sequence.
#  2. Hard link f0 as "foo" with OBD_FAIL_LFSCK_DANGLING3 set and fail_val
#     holding f1's OID, then unlink f1 so that "foo" cannot be listed.
#  3. Start namespace LFSCK with OBD_FAIL_LFSCK_DELAY3 set (fail_val=10),
#     wait until "foo" is visible with size 0, and append to it.
#  4. Clear the fail_loc, wait for LFSCK to complete, and verify that one
#     dangling entry was repaired and "foo" does not hold f0's content.
test_23c() {
	echo "The objectA has multiple hard links, one of them corresponding"
	echo "to the name entry_B. But there is something wrong for the name"
	echo "entry_B and cause entry_B to references non-exist object_C."
	echo "In the first-stage scanning, the LFSCK will think the entry_B"
	echo "as dangling, and re-create the lost object_C. And then others"
	echo "modified the re-created object_C. When the LFSCK comes to the"
	echo "second-stage scanning, it will find that the former re-creating"
	echo "object_C maybe wrong and try to replace the object_C with the"
	echo "real object_A. But because object_C has been modified, so the"
	echo "LFSCK cannot replace it."

	start_full_debug_logging

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
	$LFS path2fid $DIR/$tdir/d0

	createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"

	echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
	$LFS path2fid $DIR/$tdir/d0/f0

	echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
	$LFS path2fid $DIR/$tdir/d0/f1

	local SEQ0=$($LFS path2fid $DIR/$tdir/d0/f0 | awk -F':' '{print $1}')
	local SEQ1=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $1}')

	if [ "$SEQ0" != "$SEQ1" ]; then
		# To guarantee that the f0 and f1 are in the same FID seq
		rm -f $DIR/$tdir/d0/f0 ||
			error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
		echo "dummy" > $DIR/$tdir/d0/f0 ||
			error "(3.2) Fail to touch on MDT0"
		$LFS path2fid $DIR/$tdir/d0/f0
	fi

	# The second FID field is converted to decimal before being handed
	# to fail_val.
	local OID=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $2}')
	OID=$(printf %d $OID)

	echo "Inject failure stub on MDT0 to simulate dangling name entry"
	#define OBD_FAIL_LFSCK_DANGLING3	0x1621
	do_facet $SINGLEMDS $LCTL set_param fail_val=$OID fail_loc=0x1621
	ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
	do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0

	# If there is creation after the dangling injection, it may re-use
	# the just released local object (inode) that is referenced by the
	# dangling name entry. It will fail the dangling injection.
	# So before deleting the target object for the dangling name entry,
	# remove some other objects to avoid the target object being reused
	# by some potential creations. LU-7429
	unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"

	rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"

	echo "'ls' should fail because of dangling name entry"
	ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
		error "(6) ls should fail."

	#define OBD_FAIL_LFSCK_DELAY3		0x1602
	do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602

	echo "Trigger namespace LFSCK to find out dangling name entry"
	$START_NAMESPACE -r -C ||
		error "(7) Fail to start LFSCK for namespace"

	# Wait until the re-created (empty) object is visible on the client.
	wait_update_facet client "stat $DIR/$tdir/d0/foo |
		awk '/Size/ { print \\\$2 }'" "0" $LTIME || {
		stat $DIR/$tdir/d0/foo
		error "(8) unexpected size"
	}

	# Modify the re-created object while LFSCK is still being delayed.
	echo "data" >> $DIR/$tdir/d0/foo || error "(9) Fail to write"
	cancel_lru_locks osc

	do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 ||
		error "(10) unexpected status"

	stop_full_debug_logging

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^dangling_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(11) Fail to repair dangling name entry: $repaired"

	local data=$(cat $DIR/$tdir/d0/foo)
	[ "$data" != "dummy" ] ||
		error "(12) The $DIR/$tdir/d0/foo should not be recovered"
}
run_test 23c "LFSCK can repair dangling name entry (3)"
# test_24: namespace LFSCK repairs a name entry that is back-referenced by
# the linkEA of two MDT-objects.
#
# Requires at least two MDTs. With OBD_FAIL_LFSCK_MUL_REF set, a directory
# d0/dummy/foo is created on MDT0 and its name entry removed again; after
# "lfsck_start -A -r" the test expects multiple_referenced_repaired == 1 and
# an orphan named "<child FID>-<parent FID>-D-0" under
# .lustre/lost+found/MDT0000/. The parent FID used in that name is guard's
# FID on non-ldiskfs backends and dummy's FID on ldiskfs.
test_24() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return

	echo "Two MDT-objects back reference the same name entry via their"
	echo "each own linkEA entry, but the name entry only references one"
	echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
	echo "for the MDT-object that is not recognized. If such MDT-object"
	echo "has no other linkEA entry after the removing, then the LFSCK"
	echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"

	mkdir $DIR/$tdir/d0/guard || error "(1.1) Fail to mkdir guard"
	$LFS path2fid $DIR/$tdir/d0/guard

	mkdir $DIR/$tdir/d0/dummy || error "(2) Fail to mkdir dummy"
	$LFS path2fid $DIR/$tdir/d0/dummy

	# Keep pfid function-local so it cannot leak into later tests.
	local pfid
	if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
		pfid=$($LFS path2fid $DIR/$tdir/d0/guard)
	else
		pfid=$($LFS path2fid $DIR/$tdir/d0/dummy)
	fi

	touch $DIR/$tdir/d0/guard/foo ||
		error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
	echo "that references $DIR/$tdir/d0/guard/foo."
	echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
	echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
	echo "there with the same linkEA entry as another MDT-object"
	echo "$DIR/$tdir/d0/guard/foo has"

	#define OBD_FAIL_LFSCK_MUL_REF		0x1622
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
	$LFS mkdir -i 0 $DIR/$tdir/d0/dummy/foo ||
		error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
	$LFS path2fid $DIR/$tdir/d0/dummy/foo
	local cfid=$($LFS path2fid $DIR/$tdir/d0/dummy/foo)
	rmdir $DIR/$tdir/d0/dummy/foo ||
		error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	echo "stat $DIR/$tdir/d0/dummy/foo should fail"
	stat $DIR/$tdir/d0/dummy/foo > /dev/null 2>&1 &&
		error "(6) stat successfully unexpectedly"

	echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
	$START_NAMESPACE -A -r ||
		error "(7) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 8

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^multiple_referenced_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
	error "(9) Fail to repair multiple referenced name entry: $repaired"

	echo "There should be an orphan under .lustre/lost+found/MDT0000/"
	[ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
		error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"

	local cname="$cfid-$pfid-D-0"
	ls -ail $MOUNT/.lustre/lost+found/MDT0000/$cname ||
		error "(11) .lustre/lost+found/MDT0000/ should not be empty"
}
run_test 24 "LFSCK can repair multiple-referenced name entry"
# test_25: namespace LFSCK repairs a name entry whose stored file type does
# not match the referenced object.
#
# The test is skipped unless the MDS backend is ldiskfs. A file is created
# with OBD_FAIL_LFSCK_BAD_TYPE set, then "lfsck_start -r" is expected to
# finish as "completed" with bad_file_type_repaired == 1, after which the
# directory must still be listable.
test_25() {
	# Quoted so that an empty backend type is a clean mismatch (skip)
	# rather than a malformed test expression.
	[ "$(facet_fstype $SINGLEMDS)" != ldiskfs ] &&
		skip "ldiskfs only test" && return

	echo "The file type in the name entry does not match the file type"
	echo "claimed by the referenced object. Then the LFSCK will update"
	echo "the file type in the name entry."

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "the file type stored in the name entry is wrong."

	#define OBD_FAIL_LFSCK_BAD_TYPE		0x1623
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
	touch $DIR/$tdir/d0/foo || error "(2) Fail to touch $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	echo "Trigger namespace LFSCK to repair bad file type in the name entry"
	$START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace"

	wait_update_facet $SINGLEMDS "$LCTL get_param -n \
		mdd.${MDT_DEV}.lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 ||
		error "(4) unexpected status"

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^bad_file_type_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
	error "(5) Fail to repair bad file type in name entry: $repaired"

	ls -ail $DIR/$tdir/d0 || error "(6) Fail to 'ls' the $DIR/$tdir/d0"
}
run_test 25 "LFSCK can repair bad file type in the name entry"
# test_26a: namespace LFSCK re-inserts a lost local name entry.
#
# foo gets a second hard link (dummy); foo is then unlinked with
# OBD_FAIL_LFSCK_NO_NAMEENTRY set. After "lfsck_start -r -A" the test
# expects lost_dirent_repaired == 1 and foo to be back with its original
# FID.
test_26a() {
	echo "The local name entry back referenced by the MDT-object is lost."
	echo "The namespace LFSCK will add the missing local name entry back"
	echo "to the normal namespace."

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
	local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)

	ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rm -f $DIR/$tdir/d0/foo || error "(4) Fail to unlink $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
		error "(5) 'ls' should fail"

	# This is the local-entry variant; the message used to say "remote".
	echo "Trigger namespace LFSCK to repair the missing local name entry"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 7

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(8) Fail to repair lost dirent: $repaired"

	ls -ail $DIR/$tdir/d0/foo ||
		error "(9) Fail to 'ls' $DIR/$tdir/d0/foo"

	# The restored entry must point at the very same object.
	local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
	[ "$foofid" == "$foofid2" ] ||
		error "(10) foo's FID changed: $foofid, $foofid2"
}
run_test 26a "LFSCK can add the missing local name entry back to the namespace"
# test_26b: namespace LFSCK re-inserts a lost remote name entry.
#
# Requires at least two MDTs. d0 is created with "-i 1" and d0/foo with
# "-i 0"; foo is removed with OBD_FAIL_LFSCK_NO_NAMEENTRY set. After
# "lfsck_start -r -A" the test expects lost_dirent_repaired == 1 and foo to
# be back with its original FID.
test_26b() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return

	echo "The remote name entry back referenced by the MDT-object is lost."
	echo "The namespace LFSCK will add the missing remote name entry back"
	echo "to the normal namespace."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	$LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
	local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
		error "(4) 'ls' should fail"

	echo "Trigger namespace LFSCK to repair the missing remote name entry"
	$START_NAMESPACE -r -A ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(7) Fail to repair lost dirent: $repaired"

	ls -ail $DIR/$tdir/d0/foo ||
		error "(8) Fail to 'ls' $DIR/$tdir/d0/foo"

	# The restored entry must point at the very same object.
	local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
	[ "$foofid" == "$foofid2" ] ||
		error "(9) foo's FID changed: $foofid, $foofid2"
}
run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
# test_27a: namespace LFSCK re-creates a lost local parent directory as an
# orphan.
#
# foo and its hard link dummy are unlinked with OBD_FAIL_LFSCK_NO_NAMEENTRY
# set, then the parent d0 is removed. After "lfsck_start -r -A" the test
# expects lost_dirent_repaired == 1 and at least one "*-P-*" entry under
# .lustre/lost+found/MDT0000/.
test_27a() {
	echo "The local parent referenced by the MDT-object linkEA is lost."
	echo "The namespace LFSCK will re-create the lost parent as orphan."

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
	ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system. And then remove"
	echo "another hard link and the parent directory."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rm -f $DIR/$tdir/d0/foo ||
		error "(4) Fail to unlink $DIR/$tdir/d0/foo"
	rm -f $DIR/$tdir/d0/dummy ||
		error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	# Tags (5.1)/(5.2) keep every failure message in this test unique.
	rm -rf $DIR/$tdir/d0 || error "(5.1) Fail to unlink the dir d0"
	ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 &&
		error "(5.2) 'ls' should fail"

	echo "Trigger namespace LFSCK to repair the lost parent"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 7

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(8) Fail to repair lost dirent: $repaired"

	echo "There should be an orphan under .lustre/lost+found/MDT0000/"
	[ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
		error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"

	ls -ail $MOUNT/.lustre/lost+found/MDT0000/

	# The pattern is quoted so that find, not the shell, evaluates it;
	# unquoted it would be expanded against the current directory first.
	local cname
	cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-P-*')
	[ -n "$cname" ] ||
		error "(10) .lustre/lost+found/MDT0000/ should not be empty"
}
run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
# test_27b: namespace LFSCK re-creates a lost remote parent directory as an
# orphan.
#
# Requires at least two MDTs. d0 is created with "-i 1" and d0/foo with
# "-i 0"; foo is removed with OBD_FAIL_LFSCK_NO_NAMEENTRY set and d0 is
# removed afterwards. After "lfsck_start -r -A" the test expects
# lost_dirent_repaired == 1 and at least one "*-P-*" entry under
# .lustre/lost+found/MDT0001/.
test_27b() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return

	echo "The remote parent referenced by the MDT-object linkEA is lost."
	echo "The namespace LFSCK will re-create the lost parent as orphan."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	$LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"

	$LFS path2fid $DIR/$tdir/d0

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system. And then remove"
	echo "the parent directory."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	rmdir $DIR/$tdir/d0 || error "(4) Fail to unlink the dir d0"
	ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(5) 'ls' should fail"

	# This test is about the lost parent, not a lost name entry.
	echo "Trigger namespace LFSCK to repair the lost remote parent"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 7

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(8) Fail to repair lost dirent: $repaired"

	ls -ail $MOUNT/.lustre/lost+found/

	echo "There should be an orphan under .lustre/lost+found/MDT0001/"
	[ -d $MOUNT/.lustre/lost+found/MDT0001 ] ||
		error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"

	ls -ail $MOUNT/.lustre/lost+found/MDT0001/

	# The pattern is quoted so that find, not the shell, evaluates it;
	# unquoted it would be expanded against the current directory first.
	local cname
	cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-P-*')
	[ -n "$cname" ] ||
		error "(10) .lustre/lost+found/MDT0001/ should not be empty"
}
run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
# test_28: orphan handling is skipped for an MDT that failed during the
# first-stage scan, while the other MDT is still processed.
#
# Requires at least two MDTs. The name entries d1/a1 and d2/a2 are removed
# with OBD_FAIL_LFSCK_NO_NAMEENTRY set on mds1 and mds2. The first LFSCK run
# is started with OBD_FAIL_LFSCK_BAD_NETWORK (fail_val=0) set on mds2 and is
# expected to end "partial" on mds1 (0 lost dirents repaired) and
# "completed" on mds2 (1 repaired). A second, undisturbed run is expected to
# repair 1 on mds1 and 0 on mds2.
test_28() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return

	echo "The target name entry is lost. The LFSCK should insert the"
	echo "orphan MDT-object under .lustre/lost+found/MDTxxxx. But if"
	echo "the MDT (on which the orphan MDT-object resides) has ever"
	echo "failed to respond some name entry verification during the"
	echo "first stage-scanning, then the LFSCK should skip to handle"
	echo "orphan MDT-object on this MDT. But other MDTs should not"
	echo "be affected."

	check_mount_and_prep
	$LFS mkdir -i 0 $DIR/$tdir/d1
	$LFS mkdir -i 1 $DIR/$tdir/d1/a1
	$LFS mkdir -i 1 $DIR/$tdir/d1/a2

	$LFS mkdir -i 1 $DIR/$tdir/d2
	$LFS mkdir -i 0 $DIR/$tdir/d2/a1
	$LFS mkdir -i 0 $DIR/$tdir/d2/a2

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "d1/a1's name entry will be removed, but the d1/a1's object"
	echo "and its linkEA are kept in the system. And the case that"
	echo "d2/a2's name entry will be removed, but the d2/a2's object"
	echo "and its linkEA are kept in the system."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet mds1 $LCTL set_param fail_loc=0x1624
	do_facet mds2 $LCTL set_param fail_loc=0x1624
	rmdir $DIR/$tdir/d1/a1 || error "(1) Fail to rmdir $DIR/$tdir/d1/a1"
	rmdir $DIR/$tdir/d2/a2 || error "(2) Fail to rmdir $DIR/$tdir/d2/a2"
	do_facet mds1 $LCTL set_param fail_loc=0
	do_facet mds2 $LCTL set_param fail_loc=0

	cancel_lru_locks mdc
	cancel_lru_locks osc

	echo "Inject failure, to simulate the MDT0 fail to handle"
	echo "MDT1 LFSCK request during the first-stage scanning."
	#define OBD_FAIL_LFSCK_BAD_NETWORK	0x161c
	do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0

	echo "Trigger namespace LFSCK on all devices to find out orphan object"
	$START_NAMESPACE -r -A ||
		error "(3) Fail to start LFSCK for namespace"

	wait_update_facet mds1 "$LCTL get_param -n \
		mdd.$(facet_svc mds1).lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "partial" 32 ||
		error "(4) mds1 is not the expected 'partial'"

	wait_update_facet mds2 "$LCTL get_param -n \
		mdd.$(facet_svc mds2).lfsck_namespace |
		awk '/^status/ { print \\\$2 }'" "completed" 32 ||
		error "(5) mds2 is not the expected 'completed'"

	do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0

	# First run: only mds2 is expected to have repaired its lost dirent.
	local repaired=$(do_facet mds1 $LCTL get_param -n \
			 mdd.$(facet_svc mds1).lfsck_namespace |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ $repaired -eq 0 ] ||
		error "(6) Expect 0 fixed on mds1, but got: $repaired"

	repaired=$(do_facet mds2 $LCTL get_param -n \
		   mdd.$(facet_svc mds2).lfsck_namespace |
		   awk '/^lost_dirent_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(7) Expect 1 fixed on mds2, but got: $repaired"

	echo "Trigger namespace LFSCK on all devices again to cleanup"
	$START_NAMESPACE -r -A ||
		error "(8) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 9

	# Second run: the remaining lost dirent is expected on mds1 only.
	repaired=$(do_facet mds1 $LCTL get_param -n \
		   mdd.$(facet_svc mds1).lfsck_namespace |
		   awk '/^lost_dirent_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(10) Expect 1 fixed on mds1, but got: $repaired"

	repaired=$(do_facet mds2 $LCTL get_param -n \
		   mdd.$(facet_svc mds2).lfsck_namespace |
		   awk '/^lost_dirent_repaired/ { print $2 }')
	[ $repaired -eq 0 ] ||
		error "(11) Expect 0 fixed on mds2, but got: $repaired"
}
run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
# test_29a: namespace LFSCK lowers an nlink attribute that is larger than
# the number of known name entries.
#
# A hard link is created with OBD_FAIL_LFSCK_MORE_NLINK set and the test
# requires the resulting link count to be 3. After "lfsck_start -r -A" it
# expects nlinks_repaired == 1 and a link count of 2.
#
# The function is kept for reference only: its run_test registration below
# is commented out.
test_29a() {
	echo "The object's nlink attribute is larger than the object's known"
	echo "name entries count. The LFSCK will repair the object's nlink"
	echo "attribute to match the known name entries count"

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"

	echo "Inject failure stub on MDT0 to simulate the case that foo's"
	echo "nlink attribute is larger than its name entries count."

	#define OBD_FAIL_LFSCK_MORE_NLINK	0x1625
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
	ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	# Drop cached metadata locks before reading the link count.
	cancel_lru_locks mdc
	local count=$(stat --format=%h $DIR/$tdir/d0/foo)
	[ $count -eq 3 ] || error "(4) Cannot inject error: $count"

	echo "Trigger namespace LFSCK to repair the nlink count"
	$START_NAMESPACE -r -A ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^nlinks_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(7) Fail to repair nlink count: $repaired"

	cancel_lru_locks mdc
	count=$(stat --format=%h $DIR/$tdir/d0/foo)
	[ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
}
# Disable 29a, we only allow nlink to be updated if the known linkEA
# entries is larger than nlink count.
#
#run_test 29a "LFSCK can repair bad nlink count (1)"
# test_29b: namespace LFSCK raises an nlink attribute that is smaller than
# the number of known name entries.
#
# A hard link is created with OBD_FAIL_LFSCK_LESS_NLINK set and the test
# requires the resulting link count to be 1. After "lfsck_start -r -A" it
# expects nlinks_repaired == 1 and a link count of 2.
test_29b() {
	echo "The object's nlink attribute is smaller than the object's known"
	echo "name entries count. The LFSCK will repair the object's nlink"
	echo "attribute to match the known name entries count"

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"

	echo "Inject failure stub on MDT0 to simulate the case that foo's"
	echo "nlink attribute is smaller than its name entries count."

	#define OBD_FAIL_LFSCK_LESS_NLINK	0x1626
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
	ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	# Drop cached metadata locks before reading the link count.
	cancel_lru_locks mdc
	local count=$(stat --format=%h $DIR/$tdir/d0/foo)
	[ $count -eq 1 ] || error "(4) Cannot inject error: $count"

	echo "Trigger namespace LFSCK to repair the nlink count"
	$START_NAMESPACE -r -A ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^nlinks_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(7) Fail to repair nlink count: $repaired"

	cancel_lru_locks mdc
	count=$(stat --format=%h $DIR/$tdir/d0/foo)
	[ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
}
run_test 29b "LFSCK can repair bad nlink count (2)"
# test_29c: verify the linkEA size limitation and that namespace LFSCK
# clears the linkEA overflow mark.
#
# 150 hard links to guard/f0 are created under foo, then 100 are removed.
# When at least two MDTs exist, "lfs migrate -m 1" on guard is attempted
# before and after the removal and f0's FID must stay unchanged both times.
# After "lfsck_start -r -A" the test expects linkea_overflow_cleared == 1
# and nlinks_repaired == 0, and (with two or more MDTs) a further migration
# must change f0's FID.
test_29c() {
	echo "The namespace LFSCK will create many hard links to the target"
	echo "file as to exceed the linkEA size limitation. Under such case"
	echo "the linkEA will be marked as overflow that will prevent the"
	echo "target file to be migrated. Then remove some hard links to"
	echo "make the left hard links to be held within the linkEA size"
	echo "limitation. But before the namespace LFSCK adding all the"
	echo "missed linkEA entries back, the overflow mark (timestamp)"
	echo "will not be cleared."

	check_mount_and_prep

	mkdir -p $DIR/$tdir/guard || error "(0.1) Fail to mkdir"
	$LFS mkdir -i $((MDSCOUNT - 1)) $DIR/$tdir/foo ||
		error "(0.2) Fail to mkdir"
	touch $DIR/$tdir/guard/f0 || error "(1) Fail to create"
	local oldfid=$($LFS path2fid $DIR/$tdir/guard/f0)
	local newfid

	# define MAX_LINKEA_SIZE	4096
	# sizeof(link_ea_header) = 24
	# sizeof(link_ea_entry) = 18
	# nlink_min=$(((MAX_LINKEA_SIZE - sizeof(link_ea_header)) /
	#	      (sizeof(link_ea_entry) + name_length))
	# If the average name length is 12 bytes, then 150 hard links
	# is totally enough to overflow the linkEA
	echo "Create 150 hard links should succeed although the linkEA overflow"
	createmany -l $DIR/$tdir/guard/f0 $DIR/$tdir/foo/ttttttttttt 150 ||
		error "(2) Fail to hard link"

	cancel_lru_locks mdc
	if [ $MDSCOUNT -ge 2 ]; then
		$LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
			error "(3.1) Migrate failure"

		echo "The object with linkEA overflow should NOT be migrated"
		newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
		[ "$newfid" == "$oldfid" ] ||
			error "(3.2) Migrate should fail: $newfid != $oldfid"
	fi

	# Remove 100 hard links, then the linkEA should have space
	# to hold the missed linkEA entries.
	echo "Remove 100 hard links to save space for the missed linkEA entries"
	unlinkmany $DIR/$tdir/foo/ttttttttttt 100 || error "(4) Fail to unlink"

	if [ $MDSCOUNT -ge 2 ]; then
		$LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
			error "(5.1) Migrate failure"

		# The overflow timestamp is still there, so migration will fail.
		newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
		[ "$newfid" == "$oldfid" ] ||
			error "(5.2) Migrate should fail: $newfid != $oldfid"
	fi

	# sleep 3 seconds to guarantee that the overflow is recognized
	sleep 3

	echo "Trigger namespace LFSCK to clear the overflow timestamp"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 7

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^linkea_overflow_cleared/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(8) Fail to clear linkea overflow: $repaired"

	repaired=$($SHOW_NAMESPACE |
		   awk '/^nlinks_repaired/ { print $2 }')
	[ $repaired -eq 0 ] ||
		error "(9) Unexpected nlink repaired: $repaired"

	if [ $MDSCOUNT -ge 2 ]; then
		$LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
			error "(10.1) Migrate failure"

		# Migration should succeed after clear the overflow timestamp.
		newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
		[ "$newfid" != "$oldfid" ] ||
			error "(10.2) Migrate should succeed"

		ls -l $DIR/$tdir/foo > /dev/null ||
			error "(11) 'ls' failed after migration"
	fi

	rm -f $DIR/$tdir/guard/f0 || error "(12) Fail to unlink f0"
	rm -rf $DIR/$tdir/foo || error "(13) Fail to rmdir foo"
}
run_test 29c "verify linkEA size limitation"
# test_30: namespace LFSCK recovers orphans that e2fsck placed in the
# backend /lost+found directory.
#
# The test is skipped unless the MDS backend is ldiskfs. Directory foo/d0 is
# created with OBD_FAIL_LFSCK_NO_LINKEA set; d0/d1, d0/f1, d0 and foo/f0 are
# then removed with OBD_FAIL_LFSCK_NO_NAMEENTRY set. The client is
# unmounted, the MDT stopped, e2fsck run with "-y", and the MDT restarted.
# After "lfsck_start -r -A" the test expects local_lost_found_moved >= 4,
# foo/f0 to be reachable again, and a "*-*-D-*" orphan under
# .lustre/lost+found/MDT0000/ that contains d1 and f1.
test_30() {
	# Quoted so that an empty backend type is a clean mismatch (skip)
	# rather than a malformed test expression.
	[ "$(facet_fstype $SINGLEMDS)" != ldiskfs ] &&
		skip "ldiskfs only test" && return

	echo "The namespace LFSCK will move the orphans from backend"
	echo "/lost+found directory to normal client visible namespace"
	echo "or to global visible .lustre/lost+found/MDTxxxx/ directory"

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/foo || error "(1) Fail to mkdir foo"
	touch $DIR/$tdir/foo/f0 || error "(2) Fail to touch f0"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "directory d0 has no linkEA entry, then the LFSCK will"
	echo "move it into .lustre/lost+found/MDTxxxx/ later."

	#define OBD_FAIL_LFSCK_NO_LINKEA	0x161d
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
	mkdir $DIR/$tdir/foo/d0 || error "(3) Fail to mkdir d0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	touch $DIR/$tdir/foo/d0/f1 || error "(4) Fail to touch f1"
	mkdir $DIR/$tdir/foo/d0/d1 || error "(5) Fail to mkdir d1"

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "object's name entry will be removed, but not destroy the"
	echo "object. Then backend e2fsck will handle it as orphan and"
	echo "add them into the backend /lost+found directory."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rmdir $DIR/$tdir/foo/d0/d1 || error "(6) Fail to rmdir d1"
	rm -f $DIR/$tdir/foo/d0/f1 || error "(7) Fail to unlink f1"
	rmdir $DIR/$tdir/foo/d0 || error "(8) Fail to rmdir d0"
	rm -f $DIR/$tdir/foo/f0 || error "(9) Fail to unlink f0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	umount_client $MOUNT || error "(10) Fail to stop client!"

	stop $SINGLEMDS || error "(11) Fail to stop MDT0"

	# e2fsck runs against the MDT device while the target is stopped.
	run_e2fsck $(facet_host $SINGLEMDS) $MDT_DEVNAME "-y" ||
		error "(12) Fail to run e2fsck"

	start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
		error "(13) Fail to start MDT0"

	echo "Trigger namespace LFSCK to recover backend orphans"
	$START_NAMESPACE -r -A ||
		error "(14) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 15

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^local_lost_found_moved/ { print $2 }')
	[ $repaired -ge 4 ] ||
		error "(16) Fail to recover backend orphans: $repaired"

	mount_client $MOUNT || error "(17) Fail to start client!"

	stat $DIR/$tdir/foo/f0 || error "(18) f0 is not recovered"

	ls -ail $MOUNT/.lustre/lost+found/

	echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
	[ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
		error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"

	ls -ail $MOUNT/.lustre/lost+found/MDT0000/

	# The pattern is quoted so that find, not the shell, evaluates it;
	# unquoted it would be expanded against the current directory first.
	local cname
	cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-*-D-*')
	[ -n "$cname" ] || error "(20) d0 is not recovered"

	stat ${cname}/d1 || error "(21) d1 is not recovered"
	stat ${cname}/f1 || error "(22) f1 is not recovered"
}
run_test 30 "LFSCK can recover the orphans from backend /lost+found"
# test_31a: namespace LFSCK repairs name entries of a striped directory
# whose name hash does not match the shard they are stored in (case 1).
#
# Requires at least two MDTs. $MDSCOUNT sub-directories are created on the
# client with OBD_FAIL_LFSCK_BAD_NAME_HASH set and fail_val=0. After
# "lfsck_start -r -A" the test expects name_hash_repaired >= 1, remounts
# the client, and then every sub-directory and the striped directory itself
# must be stat-able and removable.
test_31a() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return

	echo "For the name entry under a striped directory, if the name"
	echo "hash does not match the shard, then the LFSCK will repair"
	echo "the bad name entry"

	check_mount_and_prep

	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"

	echo "Inject failure stub on client to simulate the case that"
	echo "some name entry should be inserted into other non-first"
	echo "shard, but inserted into the first shard by wrong"

	#define OBD_FAIL_LFSCK_BAD_NAME_HASH	0x1628
	$LCTL set_param fail_loc=0x1628 fail_val=0
	createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
		error "(2) Fail to create file under striped directory"
	$LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger namespace LFSCK to repair bad name hash"
	$START_NAMESPACE -r -A ||
		error "(3) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 4

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^name_hash_repaired/ { print $2 }')
	[ $repaired -ge 1 ] ||
		error "(5) Fail to repair bad name hash: $repaired"

	# Remount the client before checking the repaired entries.
	umount_client $MOUNT || error "(6) umount failed"
	mount_client $MOUNT || error "(7) mount failed"

	# Loop counter kept local so it does not leak into other tests.
	local i
	for ((i = 0; i < $MDSCOUNT; i++)); do
		stat $DIR/$tdir/striped_dir/d$i ||
			error "(8) Fail to stat d$i after LFSCK"
		rmdir $DIR/$tdir/striped_dir/d$i ||
			error "(9) Fail to unlink d$i after LFSCK"
	done

	rmdir $DIR/$tdir/striped_dir ||
		error "(10) Fail to remove the striped directory after LFSCK"
}
run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
# test_31b: namespace LFSCK repairs name entries of a striped directory
# whose name hash does not match the shard they are stored in (case 2).
#
# Requires at least two MDTs. Same flow as the fail_val=0 variant, but the
# client-side OBD_FAIL_LFSCK_BAD_NAME_HASH injection uses fail_val=1 and the
# name_hash_repaired counter is read from mds2.
test_31b() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return

	echo "For the name entry under a striped directory, if the name"
	echo "hash does not match the shard, then the LFSCK will repair"
	echo "the bad name entry"

	check_mount_and_prep

	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"

	echo "Inject failure stub on client to simulate the case that"
	echo "some name entry should be inserted into other non-second"
	echo "shard, but inserted into the second shard by wrong"

	#define OBD_FAIL_LFSCK_BAD_NAME_HASH	0x1628
	$LCTL set_param fail_loc=0x1628 fail_val=1
	createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
		error "(2) Fail to create file under striped directory"
	$LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger namespace LFSCK to repair bad name hash"
	$START_NAMESPACE -r -A ||
		error "(3) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 4

	# The repair counter is taken from mds2 in this variant.
	local repaired=$(do_facet mds2 $LCTL get_param -n \
			 mdd.$(facet_svc mds2).lfsck_namespace |
			 awk '/^name_hash_repaired/ { print $2 }')
	[ $repaired -ge 1 ] ||
		error "(5) Fail to repair bad name hash: $repaired"

	# Remount the client before checking the repaired entries.
	umount_client $MOUNT || error "(6) umount failed"
	mount_client $MOUNT || error "(7) mount failed"

	# Loop counter kept local so it does not leak into other tests.
	local i
	for ((i = 0; i < $MDSCOUNT; i++)); do
		stat $DIR/$tdir/striped_dir/d$i ||
			error "(8) Fail to stat d$i after LFSCK"
		rmdir $DIR/$tdir/striped_dir/d$i ||
			error "(9) Fail to unlink d$i after LFSCK"
	done

	rmdir $DIR/$tdir/striped_dir ||
		error "(10) Fail to remove the striped directory after LFSCK"
}
run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
# Test 31c: the master MDT-object of a striped directory loses its LMV EA
# and nothing is created under it afterwards. Needs at least two MDTs.
# The namespace LFSCK is expected to repair exactly one striped directory
# (striped_dirs_repaired == 1); after a client remount the directory must
# list as empty and be removable.
4717 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4720 echo "For some reason, the master MDT-object of the striped directory"
4721 echo "may lost its master LMV EA. If nobody created files under the"
4722 echo "master directly after the master LMV EA lost, then the LFSCK"
4723 echo "should re-generate the master LMV EA."
4726 check_mount_and_prep
4728 echo "Inject failure stub on MDT0 to simulate the case that the"
4729 echo "master MDT-object of the striped directory lost the LMV EA."
# The stub is armed on $SINGLEMDS only around the setdirstripe call.
4731 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
4732 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
4733 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4734 error "(1) Fail to create striped directory"
4735 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4737 echo "Trigger namespace LFSCK to re-generate master LMV EA"
4738 $START_NAMESPACE -r -A ||
4739 error "(2) Fail to start LFSCK for namespace"
# Block until the namespace LFSCK reaches "completed"; the trailing 3 is
# presumably the step number reported on failure - confirm.
4741 wait_all_targets_blocked namespace completed 3
# Exactly one repaired striped directory is expected.
4743 local repaired=$($SHOW_NAMESPACE |
4744 awk '/^striped_dirs_repaired/ { print $2 }')
4745 [ $repaired -eq 1 ] ||
4746 error "(4) Fail to re-generate master LMV EA: $repaired"
# Remount the client before verifying (presumably to drop cached state).
4748 umount_client $MOUNT || error "(5) umount failed"
4749 mount_client $MOUNT || error "(6) mount failed"
# Nothing was created under the directory, so its listing must be empty.
4751 local empty=$(ls $DIR/$tdir/striped_dir/)
4752 [ -z "$empty" ] || error "(7) The master LMV EA is not repaired: $empty"
4754 rmdir $DIR/$tdir/striped_dir ||
4755 error "(8) Fail to remove the striped directory after LFSCK"
4757 run_test 31c "Re-generate the lost master LMV EA for striped directory"
# Test 31d: the master MDT-object of a striped directory loses its LMV EA
# and a file is created under it afterwards. Needs at least two MDTs.
# The namespace LFSCK must NOT re-generate the master LMV EA
# (striped_dirs_repaired == 0); the existing file must stay reachable,
# creating a new file must fail, and the directory can be removed only
# after "chattr -i".
4760 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4763 echo "For some reason, the master MDT-object of the striped directory"
4764 echo "may lost its master LMV EA. If somebody created files under the"
4765 echo "master directly after the master LMV EA lost, then the LFSCK"
4766 echo "should NOT re-generate the master LMV EA, instead, it should"
4767 echo "change the broken striped dirctory as read-only to prevent"
4768 echo "further damage"
4771 check_mount_and_prep
4773 echo "Inject failure stub on MDT0 to simulate the case that the"
4774 echo "master MDT-object of the striped directory lost the LMV EA."
# The stub is armed on $SINGLEMDS only around the setdirstripe call.
4776 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
4777 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
4778 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4779 error "(1) Fail to create striped directory"
4780 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0
# Remount the client, then modify the broken directory before LFSCK runs.
4782 umount_client $MOUNT || error "(2) umount failed"
4783 mount_client $MOUNT || error "(3) mount failed"
4785 touch $DIR/$tdir/striped_dir/dummy ||
4786 error "(4) Fail to touch under broken striped directory"
4788 echo "Trigger namespace LFSCK to find out the inconsistency"
4789 $START_NAMESPACE -r -A ||
4790 error "(5) Fail to start LFSCK for namespace"
# Block until the namespace LFSCK reaches "completed"; the trailing 6 is
# presumably the step number reported on failure - confirm.
4792 wait_all_targets_blocked namespace completed 6
# No striped directory may be counted as repaired in this scenario.
4794 local repaired=$($SHOW_NAMESPACE |
4795 awk '/^striped_dirs_repaired/ { print $2 }')
4796 [ $repaired -eq 0 ] ||
4797 error "(7) Re-generate master LMV EA unexpected: $repaired"
4799 stat $DIR/$tdir/striped_dir/dummy ||
4800 error "(8) Fail to stat $DIR/$tdir/striped_dir/dummy"
# Inverted check: a successful touch is the failure case here.
4802 touch $DIR/$tdir/striped_dir/foo &&
4803 error "(9) The broken striped directory should be read-only"
# Clear the immutable attribute so the directory can be removed.
4805 chattr -i $DIR/$tdir/striped_dir ||
4806 error "(10) Fail to chattr on the broken striped directory"
# NOTE(review): "dummy" created above is not removed by any visible step
# before this rmdir - confirm that rmdir is expected to succeed here.
4808 rmdir $DIR/$tdir/striped_dir ||
4809 error "(11) Fail to remove the striped directory after LFSCK"
4811 run_test 31d "Set broken striped directory (modified after broken) as read-only"
# Test 31e: a slave MDT-object of a striped directory loses its slave LMV
# EA; per the messages below this is the slave on the same MDT as the master
# (fail_val=0). Needs at least two MDTs. The namespace LFSCK is expected to
# report striped_shards_repaired == 1, after which the directory must be
# removable.
4814 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4817 echo "For some reason, the slave MDT-object of the striped directory"
4818 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
4819 echo "slave LMV EA."
4822 check_mount_and_prep
4824 echo "Inject failure stub on MDT0 to simulate the case that the"
4825 echo "slave MDT-object (that resides on the same MDT as the master"
4826 echo "MDT-object resides on) lost the LMV EA."
# The stub is armed on $SINGLEMDS only around the setdirstripe call.
4828 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
4829 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=0
4830 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4831 error "(1) Fail to create striped directory"
4832 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4834 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
4835 $START_NAMESPACE -r -A ||
4836 error "(2) Fail to start LFSCK for namespace"
# Block until the namespace LFSCK reaches "completed"; the trailing 3 is
# presumably the step number reported on failure - confirm.
4838 wait_all_targets_blocked namespace completed 3
# Exactly one repaired shard is expected, read via $SHOW_NAMESPACE.
4840 local repaired=$($SHOW_NAMESPACE |
4841 awk '/^striped_shards_repaired/ { print $2 }')
4842 [ $repaired -eq 1 ] ||
4843 error "(4) Fail to re-generate slave LMV EA: $repaired"
4845 rmdir $DIR/$tdir/striped_dir ||
4846 error "(5) Fail to remove the striped directory after LFSCK"
4848 run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
# Test 31f: a slave MDT-object of a striped directory loses its slave LMV
# EA; per the messages below this is a slave on a different MDT than the
# master (fail_val=1). Needs at least two MDTs. The repair counter
# striped_shards_repaired is read on mds2 and must equal 1, after which the
# directory must be removable.
4851 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4854 echo "For some reason, the slave MDT-object of the striped directory"
4855 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
4856 echo "slave LMV EA."
4859 check_mount_and_prep
4861 echo "Inject failure stub on MDT0 to simulate the case that the"
4862 echo "slave MDT-object (that resides on different MDT as the master"
4863 echo "MDT-object resides on) lost the LMV EA."
# The stub is armed on $SINGLEMDS only around the setdirstripe call.
4865 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
4866 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=1
4867 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4868 error "(1) Fail to create striped directory"
4869 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4871 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
4872 $START_NAMESPACE -r -A ||
4873 error "(2) Fail to start LFSCK for namespace"
# Block until the namespace LFSCK reaches "completed"; the trailing 3 is
# presumably the step number reported on failure - confirm.
4875 wait_all_targets_blocked namespace completed 3
# The counter is read on mds2 here, not via $SHOW_NAMESPACE, presumably
# because the affected shard lives on the second MDT - confirm.
4877 local repaired=$(do_facet mds2 $LCTL get_param -n \
4878 mdd.$(facet_svc mds2).lfsck_namespace |
4879 awk '/^striped_shards_repaired/ { print $2 }')
4880 [ $repaired -eq 1 ] ||
4881 error "(4) Fail to re-generate slave LMV EA: $repaired"
4883 rmdir $DIR/$tdir/striped_dir ||
4884 error "(5) Fail to remove the striped directory after LFSCK"
4886 run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
# Test 31g: the stripe index stored in a slave LMV EA is corrupted (per the
# messages below, the first shard claims the second shard's index). Needs at
# least two MDTs. The namespace LFSCK is expected to report
# striped_shards_repaired == 1; after a client remount, creating and
# removing a file under the directory and removing the directory itself
# must all succeed.
4889 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4892 echo "For some reason, the stripe index in the slave LMV EA is"
4893 echo "corrupted. The LFSCK should repair the slave LMV EA."
4896 check_mount_and_prep
4898 echo "Inject failure stub on MDT0 to simulate the case that the"
4899 echo "slave LMV EA on the first shard of the striped directory"
4900 echo "claims the same index as the second shard claims"
# The stub is armed on $SINGLEMDS only around the setdirstripe call.
4902 #define OBD_FAIL_LFSCK_BAD_SLAVE_LMV 0x162b
4903 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162b fail_val=0
4904 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4905 error "(1) Fail to create striped directory"
4906 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4908 echo "Trigger namespace LFSCK to repair the slave LMV EA"
4909 $START_NAMESPACE -r -A ||
4910 error "(2) Fail to start LFSCK for namespace"
# Block until the namespace LFSCK reaches "completed"; the trailing 3 is
# presumably the step number reported on failure - confirm.
4912 wait_all_targets_blocked namespace completed 3
# Exactly one repaired shard is expected.
4914 local repaired=$($SHOW_NAMESPACE |
4915 awk '/^striped_shards_repaired/ { print $2 }')
4916 [ $repaired -eq 1 ] ||
4917 error "(4) Fail to repair slave LMV EA: $repaired"
# Remount the client before verifying (presumably to drop cached state).
4919 umount_client $MOUNT || error "(5) umount failed"
4920 mount_client $MOUNT || error "(6) mount failed"
# The repaired directory must accept normal create/unlink operations.
4922 touch $DIR/$tdir/striped_dir/foo ||
4923 error "(7) Fail to touch file after the LFSCK"
4925 rm -f $DIR/$tdir/striped_dir/foo ||
4926 error "(8) Fail to unlink file after the LFSCK"
4928 rmdir $DIR/$tdir/striped_dir ||
4929 error "(9) Fail to remove the striped directory after LFSCK"
4931 run_test 31g "Repair the corrupted slave LMV EA"
# Test 31h: a shard's name entry in a striped directory is corrupted (per
# the messages below, the first shard's entry claims the second shard's
# index). Needs at least two MDTs. The namespace LFSCK is expected to report
# dirent_repaired == 1; after a client remount, creating and removing a file
# under the directory and removing the directory itself must all succeed.
4934 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4937 echo "For some reason, the shard's name entry in the striped"
4938 echo "directory may be corrupted. The LFSCK should repair the"
4939 echo "bad shard's name entry."
4942 check_mount_and_prep
4944 echo "Inject failure stub on MDT0 to simulate the case that the"
4945 echo "first shard's name entry in the striped directory claims"
4946 echo "the same index as the second shard's name entry claims."
# The stub is armed on $SINGLEMDS only around the setdirstripe call.
4948 #define OBD_FAIL_LFSCK_BAD_SLAVE_NAME 0x162c
4949 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162c fail_val=0
4950 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4951 error "(1) Fail to create striped directory"
4952 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4954 echo "Trigger namespace LFSCK to repair the shard's name entry"
4955 $START_NAMESPACE -r -A ||
4956 error "(2) Fail to start LFSCK for namespace"
# Block until the namespace LFSCK reaches "completed"; the trailing 3 is
# presumably the step number reported on failure - confirm.
4958 wait_all_targets_blocked namespace completed 3
# This scenario is counted under dirent_repaired, not a striped_* counter.
4960 local repaired=$($SHOW_NAMESPACE |
4961 awk '/^dirent_repaired/ { print $2 }')
4962 [ $repaired -eq 1 ] ||
4963 error "(4) Fail to repair shard's name entry: $repaired"
# Remount the client before verifying (presumably to drop cached state).
4965 umount_client $MOUNT || error "(5) umount failed"
4966 mount_client $MOUNT || error "(6) mount failed"
# The repaired directory must accept normal create/unlink operations.
4968 touch $DIR/$tdir/striped_dir/foo ||
4969 error "(7) Fail to touch file after the LFSCK"
4971 rm -f $DIR/$tdir/striped_dir/foo ||
4972 error "(8) Fail to unlink file after the LFSCK"
4974 rmdir $DIR/$tdir/striped_dir ||
4975 error "(9) Fail to remove the striped directory after LFSCK"
4977 run_test 31h "Repair the corrupted shard's name entry"
# Test 32: stop a layout LFSCK after an OST has been stopped.
# With the client unmounted, a failure stub (0x162d, fail_val=3) is armed on
# $SINGLEMDS and the layout LFSCK is started; it must report status
# "scanning-phase1" before ost1 is stopped. Once the stub is cleared,
# stopping the LFSCK must succeed.
4982 umount_client $MOUNT
4984 #define OBD_FAIL_LFSCK_ASSISTANT_DIRECT 0x162d
4985 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
4986 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
# The scan must still be in its first phase at this point.
4988 local STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
4989 [ "$STATUS" == "scanning-phase1" ] ||
4990 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
# Take ost1 down while the LFSCK is running; its output is discarded.
4993 stop ost1 > /dev/null || error "(4) Fail to stop OST1!"
4995 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
4999 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
5001 run_test 32 "stop LFSCK when some OST failed"
# Test 33: verify that the options given to lfsck_start show up in the
# "param" line of the LFSCK status output, for the layout LFSCK and then
# for the namespace LFSCK.
# Layout run: --dryrun -o -r must be reported as "dryrun,all_targets,orphan".
# -A is not passed here although all_targets is expected; presumably -o
# implies it - confirm.
5007 $START_LAYOUT --dryrun -o -r ||
5008 error "(1) Fail to start layout LFSCK"
5009 wait_all_targets_blocked layout completed 2
5011 local PARAMS=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
5012 [ "$PARAMS" == "dryrun,all_targets,orphan" ] ||
5013 error "(3) Expect 'dryrun,all_targets,orphan', got '$PARAMS'"
# Namespace run: -e abort -A -r must be reported as "failout,all_targets".
5015 $START_NAMESPACE -e abort -A -r ||
5016 error "(4) Fail to start namespace LFSCK"
5017 wait_all_targets_blocked namespace completed 5
5019 PARAMS=$($SHOW_NAMESPACE | awk '/^param/ { print $2 }')
5020 [ "$PARAMS" == "failout,all_targets" ] ||
5021 error "(6) Expect 'failout,all_targets', got '$PARAMS'"
5023 run_test 33 "check LFSCK paramters"
# Put back the MDSSIZE / OSTSIZE / OSTCOUNT values that were saved in the
# SAVED_* variables at the top of the script, before the test setup reduced
# them (small targets to speed up formatting, OSTCOUNT capped at 4).
5025 # restore MDS/OST size
5026 MDSSIZE=${SAVED_MDSSIZE}
5027 OSTSIZE=${SAVED_OSTSIZE}
5028 OSTCOUNT=${SAVED_OSTCOUNT}
5030 # cleanup the system at last