# Script prologue (excerpt: each line carries its original line number and
# some original lines are absent from this listing).  It seeds the list of
# excluded tests, sources the test framework and the configuration file, and
# then extends ALWAYS_EXCEPT based on server versions and backend type.
3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
11 #Bug number for excepting test
12 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT"
14 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
15 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
# LUSTRE defaults to the parent of this script's directory when unset/empty.
17 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
18 . $LUSTRE/tests/test-framework.sh
# ':=' also assigns the default path to CONFIG before the file is sourced.
20 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
23 require_dsh_mds || exit 0
27 if ! check_versions; then
28 skip "It is NOT necessary to test lfsck under interoperation mode"
32 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.3.60) ]] &&
33 skip "Need MDS version at least 2.3.60" && exit 0
# Keep the caller's values; where they are restored is not visible here.
37 SAVED_MDSSIZE=${MDSSIZE}
38 SAVED_OSTSIZE=${OSTSIZE}
39 SAVED_OSTCOUNT=${OSTCOUNT}
40 # use small MDS + OST size to speed formatting time
41 # do not use too small MDSSIZE/OSTSIZE, which affect the default journal size
44 # no need too many OSTs, to reduce the format/start/stop overhead
45 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
47 # build up a clean test environment.
# Append test numbers to ALWAYS_EXCEPT when the MDS/OST version is below
# (or, for 2c, not above) the version named in each check.
51 [[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.4.90) ]] &&
52 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2c"
54 [[ $(lustre_version_code ost1) -lt $(version_code 2.5.55) ]] &&
55 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 11 12 13 14 15 16 17 18 19 20 21"
57 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.50) ]] &&
58 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2d 2e 3 22 23 24 25 26 27 28 29 30 31"
60 # DNE does not support striped directory on zfs-based backend yet.
61 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
62 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 31"
# Target names and pre-built command strings shared by the tests below.
# The command strings are expanded unquoted at their call sites (for example
# `$START_NAMESPACE -r`), which lets callers append extra options.
66 MDT_DEV="${FSNAME}-MDT0000"
67 OST_DEV="${FSNAME}-OST0000"
68 MDT_DEVNAME=$(mdsdevname ${SINGLEMDS//mds/})
69 START_NAMESPACE="do_facet $SINGLEMDS \
70 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
71 START_LAYOUT="do_facet $SINGLEMDS \
72 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
73 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
74 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
75 SHOW_NAMESPACE="do_facet $SINGLEMDS \
76 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
77 SHOW_LAYOUT="do_facet $SINGLEMDS \
78 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
79 SHOW_LAYOUT_ON_OST="do_facet ost1 \
80 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# Mount option sets passed to `start` when the MDT is (re)started by tests.
81 MOUNT_OPTS_SCRUB="-o user_xattr"
82 MOUNT_OPTS_NOSCRUB="-o user_xattr,noscrub"
83 MOUNT_OPTS_SKIP_LFSCK="-o user_xattr,skip_lfsck"
# Body fragment of a file-preparation helper.  Its header and the lines that
# assign nfiles, ndirs and igif are not visible in this excerpt.
# When $igif is non-empty the work is bracketed by fail_loc=0x1504
# (OBD_FAIL_FID_IGIF) and an extra 'dummy' file is touched before fail_loc
# is reset to 0.  Otherwise it copies the test scripts into $DIR/$tdir and
# creates $ndirs directories/files plus $nfiles files per directory.
92 echo "preparing... $nfiles * $ndirs files will be created $(date)."
93 if [ ! -z $igif ]; then
94 #define OBD_FAIL_FID_IGIF 0x1504
95 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
98 cp $LUSTRE/tests/*.sh $DIR/$tdir/
99 if [ $ndirs -gt 0 ]; then
100 createmany -d $DIR/$tdir/d $ndirs
101 createmany -m $DIR/$tdir/f $ndirs
102 if [ $nfiles -gt 0 ]; then
103 for ((i = 0; i < $ndirs; i++)); do
104 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
105 /dev/null || error "createmany $nfiles"
108 createmany -d $DIR/$tdir/e $ndirs
111 if [ ! -z $igif ]; then
112 touch $DIR/$tdir/dummy
113 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
116 echo "prepared $(date)."
# run_e2fsck_on_mdt0: returns immediately unless the MDS backend is ldiskfs.
# Otherwise stops $SINGLEMDS, runs run_e2fsck with "-n" on the MDT device of
# its active host, and starts the MDT again with $MOUNT_OPTS_NOSCRUB.
# Original lines 124 and 127 are not part of this listing, so the condition
# guarding the second run_e2fsck/error "(2)" pair is not visible here.
119 run_e2fsck_on_mdt0() {
120 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] && return
122 stop $SINGLEMDS > /dev/null || error "(0) Fail to the stop MDT0"
123 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n" |
125 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n"
126 error "(2) Detected inconsistency on MDT0"
128 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
129 error "(3) Fail to start MDT0"
# wait_all_targets_blocked: runs `lfsck_query -w` for component $com on
# ${FSNAME}-MDT0000 via mds1, extracts the count reported on the
# "${com}_mdts_${status}" line, and calls error unless that count equals
# $MDSCOUNT (after dumping the plain lfsck_query output for diagnosis).
# The lines that assign com, status and err are not visible in this excerpt;
# callers pass three arguments (e.g. `namespace completed 4`).
132 wait_all_targets_blocked() {
137 local count=$(do_facet mds1 \
138 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 -w |
139 awk '/^${com}_mdts_${status}/ { print \\\$2 }'")
140 [[ $count -eq $MDSCOUNT ]] || {
141 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
142 error "($err) only $count of $MDSCOUNT MDTs are in ${status}"
# Fragment of a second wait helper (its header and variable assignments are
# not visible in this excerpt).  It uses wait_update_facet to poll
# lfsck_query (without -w) until the "${com}_mdts_${status}" count equals
# $MDSCOUNT, passing $LTIME as the last argument (presumably the time limit;
# confirm against wait_update_facet).  On failure it dumps the query output
# and calls error.
151 wait_update_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 |
152 awk '/^${com}_mdts_${status}/ { print \\\$2 }'" \
153 "$MDSCOUNT" $LTIME || {
154 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
155 error "($err) some MDTs are not in ${status}"
# Body of the test registered below with `run_test 0` (the function header
# and some lines are not visible in this excerpt).
# Flow: start namespace LFSCK under fail_loc=0x1600 with fail_val=3, expect
# 'scanning-phase1'; stop it and expect 'stopped'; start again and expect
# 'scanning-phase1'; clear fail_loc and wait for 'completed'.  Then check
# that updated_phase1 is 0 and that a second `-r` run raises success_count
# by exactly one, and finally call stopall.
162 #define OBD_FAIL_LFSCK_DELAY1 0x1600
163 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
164 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
166 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
168 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
169 [ "$STATUS" == "scanning-phase1" ] ||
170 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
172 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
174 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
175 [ "$STATUS" == "stopped" ] ||
176 error "(6) Expect 'stopped', but got '$STATUS'"
# Restart without -r (no reset option) after the explicit stop.
178 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
180 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
181 [ "$STATUS" == "scanning-phase1" ] ||
182 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
184 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
185 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
186 mdd.${MDT_DEV}.lfsck_namespace |
187 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
189 error "(9) unexpected status"
192 local repaired=$($SHOW_NAMESPACE |
193 awk '/^updated_phase1/ { print $2 }')
194 [ $repaired -eq 0 ] ||
195 error "(10) Expect nothing to be repaired, but got: $repaired"
# Record success_count, rerun with -r, and compare after completion.
197 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
198 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
199 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
200 mdd.${MDT_DEV}.lfsck_namespace |
201 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
203 error "(12) unexpected status"
206 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
207 [ $((scanned1 + 1)) -eq $scanned2 ] ||
208 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
210 echo "stopall, should NOT crash LU-3649"
211 stopall || error "(14) Fail to stopall"
213 run_test 0 "Control LFSCK manually"
# Body of the test registered below with `run_test 1a` (function header and
# some lines are not visible in this excerpt).  Skipped when the MDS backend
# is not ldiskfs.
# Flow: touch 'dummy' under fail_loc=0x1501 (OBD_FAIL_FID_INDIR), run
# namespace LFSCK with -r to 'completed', and require exactly one repair
# (dirent_repaired, falling back to updated_phase1 when that field is
# absent).  Then mount the client and list the directory under
# fail_loc=0x1505 (OBD_FAIL_FID_LOOKUP).
216 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
217 skip "OI Scrub not implemented for ZFS" && return
221 #define OBD_FAIL_FID_INDIR 0x1501
222 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
223 touch $DIR/$tdir/dummy
225 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
227 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
228 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
229 mdd.${MDT_DEV}.lfsck_namespace |
230 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
232 error "(4) unexpected status"
235 local repaired=$($SHOW_NAMESPACE |
236 awk '/^dirent_repaired/ { print $2 }')
237 # for interop with old server
238 [ -z "$repaired" ] &&
239 repaired=$($SHOW_NAMESPACE |
240 awk '/^updated_phase1/ { print $2 }')
242 [ $repaired -eq 1 ] ||
243 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
247 mount_client $MOUNT || error "(6) Fail to start client!"
249 #define OBD_FAIL_FID_LOOKUP 0x1505
250 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
251 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
253 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
255 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
# Body of the test registered below with `run_test 1b` (function header and
# some lines are not visible in this excerpt).  Skipped when the MDS backend
# is not ldiskfs.
# Flow: touch 'dummy' under fail_loc=0x1502 (OBD_FAIL_FID_INLMA), then run
# namespace LFSCK with -r while fail_loc=0x1506 (OBD_FAIL_FID_NOLMA) is set
# and wait for 'completed'.  Require exactly one repair (dirent_repaired,
# falling back to updated_phase1), reset fail_loc, mount the client and stat
# 'dummy' under fail_loc=0x1505 (OBD_FAIL_FID_LOOKUP).
259 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
260 skip "OI Scrub not implemented for ZFS" && return
264 #define OBD_FAIL_FID_INLMA 0x1502
265 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
266 touch $DIR/$tdir/dummy
268 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
270 #define OBD_FAIL_FID_NOLMA 0x1506
271 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
272 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
273 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
274 mdd.${MDT_DEV}.lfsck_namespace |
275 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
277 error "(4) unexpected status"
280 local repaired=$($SHOW_NAMESPACE |
281 awk '/^dirent_repaired/ { print $2 }')
282 # for interop with old server
283 [ -z "$repaired" ] &&
284 repaired=$($SHOW_NAMESPACE |
285 awk '/^updated_phase1/ { print $2 }')
287 [ $repaired -eq 1 ] ||
288 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
290 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
293 mount_client $MOUNT || error "(6) Fail to start client!"
295 #define OBD_FAIL_FID_LOOKUP 0x1505
296 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
297 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
299 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
301 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
# Body of the test registered below with `run_test 2a` (function header and
# some lines are not visible in this excerpt).
# Flow: touch 'dummy' under fail_loc=0x1603 (OBD_FAIL_LFSCK_LINKEA_CRASH),
# run namespace LFSCK with -r to 'completed', and require exactly one repair
# (linkea_repaired, falling back to updated_phase2).  Then, from a mounted
# client, check that stat reports "Links: 1" and that path2fid followed by
# fid2path returns the original path.
306 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
307 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
308 touch $DIR/$tdir/dummy
310 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
312 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
313 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
314 mdd.${MDT_DEV}.lfsck_namespace |
315 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
317 error "(4) unexpected status"
320 local repaired=$($SHOW_NAMESPACE |
321 awk '/^linkea_repaired/ { print $2 }')
322 # for interop with old server
323 [ -z "$repaired" ] &&
324 repaired=$($SHOW_NAMESPACE |
325 awk '/^updated_phase2/ { print $2 }')
327 [ $repaired -eq 1 ] ||
328 error "(5) Fail to repair crashed linkEA: $repaired"
332 mount_client $MOUNT || error "(6) Fail to start client!"
334 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
335 error "(7) Fail to stat $DIR/$tdir/dummy"
337 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
338 local dummyname=$($LFS fid2path $DIR $dummyfid)
339 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
340 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
342 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
# Body of the test registered below with `run_test 2b` (function header and
# some lines are not visible in this excerpt).
# Flow: touch 'dummy' under fail_loc=0x1604 (OBD_FAIL_LFSCK_LINKEA_MORE),
# run namespace LFSCK with -r to 'completed', and require updated_phase2 to
# be exactly 1.  Then, from a mounted client, check "Links: 1" and the
# path2fid/fid2path round trip.
348 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
349 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
350 touch $DIR/$tdir/dummy
352 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
354 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
355 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
356 mdd.${MDT_DEV}.lfsck_namespace |
357 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
359 error "(4) unexpected status"
362 local repaired=$($SHOW_NAMESPACE |
363 awk '/^updated_phase2/ { print $2 }')
364 [ $repaired -eq 1 ] ||
365 error "(5) Fail to repair crashed linkEA: $repaired"
369 mount_client $MOUNT || error "(6) Fail to start client!"
371 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
372 error "(7) Fail to stat $DIR/$tdir/dummy"
374 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
375 local dummyname=$($LFS fid2path $DIR $dummyfid)
376 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
377 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
379 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
# Body of the test registered below with `run_test 2c` (function header and
# some lines are not visible in this excerpt).
# Flow: touch 'dummy' under fail_loc=0x1605 (OBD_FAIL_LFSCK_LINKEA_MORE2),
# run namespace LFSCK with -r to 'completed', and require updated_phase2 to
# be exactly 1.  Then, from a mounted client, check "Links: 1" and the
# path2fid/fid2path round trip.
385 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
386 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
387 touch $DIR/$tdir/dummy
389 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
391 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
392 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
393 mdd.${MDT_DEV}.lfsck_namespace |
394 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
396 error "(4) unexpected status"
399 local repaired=$($SHOW_NAMESPACE |
400 awk '/^updated_phase2/ { print $2 }')
401 [ $repaired -eq 1 ] ||
402 error "(5) Fail to repair crashed linkEA: $repaired"
406 mount_client $MOUNT || error "(6) Fail to start client!"
408 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
409 error "(7) Fail to stat $DIR/$tdir/dummy"
411 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
412 local dummyname=$($LFS fid2path $DIR $dummyfid)
413 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
414 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
416 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
# Body of the test registered below with `run_test 2d` (function header and
# some lines are not visible in this excerpt).
# Flow: touch 'dummy' under fail_loc=0x161d (OBD_FAIL_LFSCK_NO_LINKEA), run
# namespace LFSCK with -r to 'completed', and require linkea_repaired to be
# exactly 1.  Then, from a mounted client, check "Links: 1" and the
# path2fid/fid2path round trip.
422 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
423 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
424 touch $DIR/$tdir/dummy
426 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
428 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
429 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
430 mdd.${MDT_DEV}.lfsck_namespace |
431 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
433 error "(4) unexpected status"
436 local repaired=$($SHOW_NAMESPACE |
437 awk '/^linkea_repaired/ { print $2 }')
438 [ $repaired -eq 1 ] ||
439 error "(5) Fail to repair crashed linkEA: $repaired"
443 mount_client $MOUNT || error "(6) Fail to start client!"
445 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
446 error "(7) Fail to stat $DIR/$tdir/dummy"
448 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
449 local dummyname=$($LFS fid2path $DIR $dummyfid)
450 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
451 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
453 run_test 2d "LFSCK can recover the missing linkEA entry"
# Body of the test registered below with `run_test 2e` (function header and
# some lines are not visible in this excerpt).  Skipped with fewer than two
# MDTs.
# Flow: create d0 with `-i 1`, then create d0/d1 with `-i 0` while
# fail_loc=0x1603 (OBD_FAIL_LFSCK_LINKEA_CRASH) is set on $SINGLEMDS.  Start
# namespace LFSCK with `-r -A`, wait via wait_all_targets_blocked for
# 'completed', require linkea_repaired to be exactly 1, and verify the
# path2fid/fid2path round trip for d0/d1.
457 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
461 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
463 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
464 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
465 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
466 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
468 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
470 wait_all_targets_blocked namespace completed 4
472 local repaired=$($SHOW_NAMESPACE |
473 awk '/^linkea_repaired/ { print $2 }')
474 [ $repaired -eq 1 ] ||
475 error "(5) Fail to repair crashed linkEA: $repaired"
477 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
478 local name=$($LFS fid2path $DIR $fid)
479 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
480 error "(6) Fail to repair linkEA: $fid $name"
482 run_test 2e "namespace LFSCK can verify remote object linkEA"
# Body of the test registered below with `run_test 3` (function header and
# some lines are not visible in this excerpt).
# Flow: hard-link d0/f0 and d0/f1 into a new 'dummy' directory, create
# edir/f0 and edir/f1, then hard-link them as w0 under fail_loc=0x1603 and
# w1 under fail_loc=0x1604.  Run namespace LFSCK with -r to 'completed' and
# require checked_phase2 >= 4 and multiple_linked_repaired >= 2.
# The source files d0/f0 and d0/f1 are expected to exist already; their
# creation is not visible in this fragment.
488 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
489 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
490 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
492 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
493 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
494 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
496 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
497 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
498 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
500 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
501 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
502 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
504 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
506 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
507 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
508 mdd.${MDT_DEV}.lfsck_namespace |
509 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
511 error "(10) unexpected status"
514 local checked=$($SHOW_NAMESPACE |
515 awk '/^checked_phase2/ { print $2 }')
516 [ $checked -ge 4 ] ||
517 error "(11) Fail to check multiple-linked object: $checked"
519 local repaired=$($SHOW_NAMESPACE |
520 awk '/^multiple_linked_repaired/ { print $2 }')
521 [ $repaired -ge 2 ] ||
522 error "(12) Fail to repair multiple-linked object: $repaired"
524 run_test 3 "LFSCK can verify multiple-linked objects"
# Body of the test registered below with `run_test 4` (function header and
# some lines are not visible in this excerpt).  Skipped when the MDS backend
# is not ldiskfs.
# Flow: unmount the client, stop the MDS, run mds_backup_restore, and start
# the MDS with $MOUNT_OPTS_NOSCRUB.  Start namespace LFSCK with -r under
# fail_loc=0x1601 (fail_val=1), wait for flags 'inconsistent' and expect
# status 'scanning-phase1'.  Clear fail_loc, wait for 'completed', require
# empty flags and at least 9 repairs (dirent_repaired, falling back to
# updated_phase1).  Finally list the directory from a mounted client under
# fail_loc=0x1505.
528 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
529 skip "OI Scrub not implemented for ZFS" && return
532 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
533 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
535 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
536 echo "start $SINGLEMDS with disabling OI scrub"
537 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
538 error "(2) Fail to start MDS!"
540 #define OBD_FAIL_LFSCK_DELAY2 0x1601
541 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
542 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
543 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
544 mdd.${MDT_DEV}.lfsck_namespace |
545 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
547 error "(5) unexpected status"
550 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
551 [ "$STATUS" == "scanning-phase1" ] ||
552 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
554 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
555 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
556 mdd.${MDT_DEV}.lfsck_namespace |
557 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
559 error "(7) unexpected status"
# NOTE(review): no `local FLAGS` is visible in this fragment, so this
# assignment may set a global variable; confirm against the full function.
562 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
563 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
565 local repaired=$($SHOW_NAMESPACE |
566 awk '/^dirent_repaired/ { print $2 }')
567 # for interop with old server
568 [ -z "$repaired" ] &&
569 repaired=$($SHOW_NAMESPACE |
570 awk '/^updated_phase1/ { print $2 }')
572 [ $repaired -ge 9 ] ||
573 error "(9) Fail to re-generate FID-in-dirent: $repaired"
577 mount_client $MOUNT || error "(10) Fail to start client!"
579 #define OBD_FAIL_FID_LOOKUP 0x1505
580 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
581 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
582 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
584 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
# Body of the test registered below with `run_test 5` (function header and
# some lines are not visible in this excerpt).  Skipped when the MDS backend
# is not ldiskfs.
# Flow: unmount the client, stop the MDS, run `mds_backup_restore
# $SINGLEMDS 1`, and start the MDS with $MOUNT_OPTS_NOSCRUB.  Start
# namespace LFSCK with -r under fail_loc=0x1601 (fail_val=1), wait for flags
# 'inconsistent,upgrade' and expect status 'scanning-phase1'.  Clear
# fail_loc, wait for 'completed', require empty flags and at least 2 repairs
# (dirent_repaired, falling back to updated_phase1).  From a mounted client,
# stat 'dummy' and list the directory under fail_loc=0x1505, then verify the
# path2fid/fid2path round trip.
588 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
589 skip "OI Scrub not implemented for ZFS" && return
592 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
593 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
595 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
596 echo "start $SINGLEMDS with disabling OI scrub"
597 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
598 error "(2) Fail to start MDS!"
600 #define OBD_FAIL_LFSCK_DELAY2 0x1601
601 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
602 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
603 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
604 mdd.${MDT_DEV}.lfsck_namespace |
605 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
607 error "(5) unexpected status"
610 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
611 [ "$STATUS" == "scanning-phase1" ] ||
612 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
614 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
615 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
616 mdd.${MDT_DEV}.lfsck_namespace |
617 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
619 error "(7) unexpected status"
# NOTE(review): no `local FLAGS` is visible in this fragment, so this
# assignment may set a global variable; confirm against the full function.
622 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
623 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
625 local repaired=$($SHOW_NAMESPACE |
626 awk '/^dirent_repaired/ { print $2 }')
627 # for interop with old server
628 [ -z "$repaired" ] &&
629 repaired=$($SHOW_NAMESPACE |
630 awk '/^updated_phase1/ { print $2 }')
632 [ $repaired -ge 2 ] ||
633 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
637 mount_client $MOUNT || error "(10) Fail to start client!"
639 #define OBD_FAIL_FID_LOOKUP 0x1505
640 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
641 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
643 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
645 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
646 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
647 local dummyname=$($LFS fid2path $DIR $dummyfid)
648 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
649 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
651 run_test 5 "LFSCK can handle IGIF object upgrading"
# Body of the test registered below with `run_test 6a` (function header and
# some lines are not visible in this excerpt).
# Flow: start namespace LFSCK with -r under fail_loc=0x1600 (fail_val=1) and
# expect 'scanning-phase1'; set fail_loc=0x80001608 and wait for 'failed';
# read POS0 from last_checkpoint_position.  Restart under fail_loc=0x1600
# without -r, expect 'scanning-phase1', read POS1 from
# latest_start_position and require POS0 < POS1.  Finally clear fail_loc and
# wait for 'completed'.
# The POS0/POS1 pipelines continue on lines not visible in this excerpt.
656 #define OBD_FAIL_LFSCK_DELAY1 0x1600
657 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
658 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
660 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
661 [ "$STATUS" == "scanning-phase1" ] ||
662 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
664 # Sleep 3 sec to guarantee at least one object processed by LFSCK
666 # Fail the LFSCK to guarantee there is at least one checkpoint
667 #define OBD_FAIL_LFSCK_FATAL1 0x1608
668 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
669 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
670 mdd.${MDT_DEV}.lfsck_namespace |
671 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
673 error "(4) unexpected status"
676 local POS0=$($SHOW_NAMESPACE |
677 awk '/^last_checkpoint_position/ { print $2 }' |
680 #define OBD_FAIL_LFSCK_DELAY1 0x1600
681 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
682 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
684 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
685 [ "$STATUS" == "scanning-phase1" ] ||
686 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
688 local POS1=$($SHOW_NAMESPACE |
689 awk '/^latest_start_position/ { print $2 }' |
691 [[ $POS0 -lt $POS1 ]] ||
692 error "(7) Expect larger than: $POS0, but got $POS1"
694 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
695 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
696 mdd.${MDT_DEV}.lfsck_namespace |
697 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
699 error "(8) unexpected status"
702 run_test 6a "LFSCK resumes from last checkpoint (1)"
# Body of the test registered below with `run_test 6b` (function header and
# some lines are not visible in this excerpt).
# Flow: start namespace LFSCK with -r under fail_loc=0x1601 (fail_val=1) and
# expect 'scanning-phase1'; set fail_loc=0x80001609 and wait for 'failed';
# read O_POS0 (field 2) and D_POS0 (field 4) from last_checkpoint_position.
# Restart under fail_loc=0x1601 without -r, read O_POS1/D_POS1 from
# latest_start_position, and compare: if either D_POS value is "N/A" the
# O_POS values must increase, and the D_POS values are compared on the other
# visible path.  Finally clear fail_loc and wait for 'completed'.
# The O_POS pipelines and the branch keywords between the two comparisons
# are on lines not visible in this excerpt.
707 #define OBD_FAIL_LFSCK_DELAY2 0x1601
708 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
709 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
711 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
712 [ "$STATUS" == "scanning-phase1" ] ||
713 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
715 # Sleep 5 sec to guarantee that we are in the directory scanning
717 # Fail the LFSCK to guarantee there is at least one checkpoint
718 #define OBD_FAIL_LFSCK_FATAL2 0x1609
719 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
720 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
721 mdd.${MDT_DEV}.lfsck_namespace |
722 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
724 error "(4) unexpected status"
727 local O_POS0=$($SHOW_NAMESPACE |
728 awk '/^last_checkpoint_position/ { print $2 }' |
731 local D_POS0=$($SHOW_NAMESPACE |
732 awk '/^last_checkpoint_position/ { print $4 }')
734 #define OBD_FAIL_LFSCK_DELAY2 0x1601
735 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
736 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
738 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
739 [ "$STATUS" == "scanning-phase1" ] ||
740 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
742 local O_POS1=$($SHOW_NAMESPACE |
743 awk '/^latest_start_position/ { print $2 }' |
745 local D_POS1=$($SHOW_NAMESPACE |
746 awk '/^latest_start_position/ { print $4 }')
748 if [ "$D_POS0" == "N/A" -o "$D_POS1" == "N/A" ]; then
749 [[ $O_POS0 -lt $O_POS1 ]] ||
750 error "(7.1) $O_POS1 is not larger than $O_POS0"
752 [[ $D_POS0 -lt $D_POS1 ]] ||
753 error "(7.2) $D_POS1 is not larger than $D_POS0"
756 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
757 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
758 mdd.${MDT_DEV}.lfsck_namespace |
759 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
761 error "(8) unexpected status"
764 run_test 6b "LFSCK resumes from last checkpoint (2)"
# Body of the test registered below with `run_test 7a` (function header and
# some lines are not visible in this excerpt).
# Flow: start namespace LFSCK with -r under fail_loc=0x1601 (fail_val=1) and
# expect 'scanning-phase1'; stop the MDS while LFSCK has not been stopped;
# clear fail_loc; start the MDS with $MOUNT_OPTS_SCRUB and wait (limit 30)
# for the namespace status to become 'completed' without issuing another
# lfsck_start.
771 #define OBD_FAIL_LFSCK_DELAY2 0x1601
772 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
773 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
775 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
776 [ "$STATUS" == "scanning-phase1" ] ||
777 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
779 # Sleep 3 sec to guarantee at least one object processed by LFSCK
781 echo "stop $SINGLEMDS"
782 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop MDS!"
784 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
785 echo "start $SINGLEMDS"
786 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
787 error "(5) Fail to start MDS!"
789 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
790 mdd.${MDT_DEV}.lfsck_namespace |
791 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
793 error "(6) unexpected status"
796 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
# Body of the test registered below with `run_test 7b` (function header and
# some lines are not visible in this excerpt).
# Flow: create dummy0..dummy19 under fail_loc=0x1604
# (OBD_FAIL_LFSCK_LINKEA_MORE); start namespace LFSCK with -r under
# fail_loc=0x1602 (fail_val=1) and wait for 'scanning-phase2'.  Stop the
# MDS, clear fail_loc, start the MDS with $MOUNT_OPTS_SCRUB and wait
# (limit 30) for 'completed' without issuing another lfsck_start.
802 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
803 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
804 for ((i = 0; i < 20; i++)); do
805 touch $DIR/$tdir/dummy${i}
808 #define OBD_FAIL_LFSCK_DELAY3 0x1602
809 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
810 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
811 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
812 mdd.${MDT_DEV}.lfsck_namespace |
813 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
815 error "(4) unexpected status"
819 echo "stop $SINGLEMDS"
820 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop MDS!"
822 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
823 echo "start $SINGLEMDS"
824 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
825 error "(6) Fail to start MDS!"
827 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
828 mdd.${MDT_DEV}.lfsck_namespace |
829 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
831 error "(7) unexpected status"
834 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
# Body of the test registered below with `run_test 8` (function header and
# some lines are not visible in this excerpt).  It walks the namespace LFSCK
# status through the following sequence and checks each step:
#   init            after formatall (setup between the two is not visible)
#   scanning-phase1 start under fail_loc=0x1601, fail_val=2
#   stopped         after $STOP_LFSCK
#   scanning-phase1 start again
#   failed          after fail_loc=0x80001609
#   scanning-phase1 start under fail_loc=0x1600
#   crashed         fail_loc=0x160a, MDS stop, fail_loc=0x160b, MDS start
#   scanning-phase1 start under fail_loc=0x1601
#   paused          MDS stop, fail_loc=0x160b, MDS start
#   paused          MDS stop, MDS start with $MOUNT_OPTS_SKIP_LFSCK
#   scanning-phase2 start under fail_loc=0x1602, fail_val=2; flags must be
#                   'scanned-once,inconsistent'
#   completed       after clearing fail_loc; flags must be empty
# Before the scan, a directory is created under fail_loc=0x1603 and five
# files under fail_loc=0x1604, and the client is unmounted.
# After each MDS start, a loop polls mdt.*.recovery_status until it is no
# longer RECOVERING; the timer initialisation/increment lines are not
# visible in this excerpt.
839 formatall > /dev/null
845 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
846 [ "$STATUS" == "init" ] ||
847 error "(2) Expect 'init', but got '$STATUS'"
849 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
850 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
851 mkdir $DIR/$tdir/crashed
853 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
854 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
855 for ((i = 0; i < 5; i++)); do
856 touch $DIR/$tdir/dummy${i}
859 umount_client $MOUNT || error "(3) Fail to stop client!"
861 #define OBD_FAIL_LFSCK_DELAY2 0x1601
862 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
863 $START_NAMESPACE || error "(4) Fail to start LFSCK for namespace!"
865 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
866 [ "$STATUS" == "scanning-phase1" ] ||
867 error "(5) Expect 'scanning-phase1', but got '$STATUS'"
869 $STOP_LFSCK || error "(6) Fail to stop LFSCK!"
871 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
872 [ "$STATUS" == "stopped" ] ||
873 error "(7) Expect 'stopped', but got '$STATUS'"
875 $START_NAMESPACE || error "(8) Fail to start LFSCK for namespace!"
877 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
878 [ "$STATUS" == "scanning-phase1" ] ||
879 error "(9) Expect 'scanning-phase1', but got '$STATUS'"
881 #define OBD_FAIL_LFSCK_FATAL2 0x1609
882 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
883 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
884 mdd.${MDT_DEV}.lfsck_namespace |
885 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
887 error "(10) unexpected status"
890 #define OBD_FAIL_LFSCK_DELAY1 0x1600
891 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
892 $START_NAMESPACE || error "(11) Fail to start LFSCK for namespace!"
894 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
895 [ "$STATUS" == "scanning-phase1" ] ||
896 error "(12) Expect 'scanning-phase1', but got '$STATUS'"
898 #define OBD_FAIL_LFSCK_CRASH 0x160a
899 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
902 echo "stop $SINGLEMDS"
903 stop $SINGLEMDS > /dev/null || error "(13) Fail to stop MDS!"
905 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
906 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
908 echo "start $SINGLEMDS"
909 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
910 error "(14) Fail to start MDS!"
# The same timeout value is reused by all three recovery-wait loops below.
912 local timeout=$(max_recovery_time)
915 while [ $timer -lt $timeout ]; do
916 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
917 mdt.${MDT_DEV}.recovery_status |
918 awk '/^status/ { print \\\$2 }'")
919 [ "$STATUS" != "RECOVERING" ] && break;
924 [ $timer != $timeout ] ||
925 error "(14.1) recovery timeout"
927 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
928 [ "$STATUS" == "crashed" ] ||
929 error "(15) Expect 'crashed', but got '$STATUS'"
931 #define OBD_FAIL_LFSCK_DELAY2 0x1601
932 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
933 $START_NAMESPACE || error "(16) Fail to start LFSCK for namespace!"
935 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
936 [ "$STATUS" == "scanning-phase1" ] ||
937 error "(17) Expect 'scanning-phase1', but got '$STATUS'"
939 echo "stop $SINGLEMDS"
940 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop MDS!"
942 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
943 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
945 echo "start $SINGLEMDS"
946 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
947 error "(19) Fail to start MDS!"
950 while [ $timer -lt $timeout ]; do
951 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
952 mdt.${MDT_DEV}.recovery_status |
953 awk '/^status/ { print \\\$2 }'")
954 [ "$STATUS" != "RECOVERING" ] && break;
959 [ $timer != $timeout ] ||
960 error "(19.1) recovery timeout"
962 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
963 [ "$STATUS" == "paused" ] ||
964 error "(20) Expect 'paused', but got '$STATUS'"
966 echo "stop $SINGLEMDS"
967 stop $SINGLEMDS > /dev/null || error "(20.1) Fail to stop MDS!"
969 echo "start $SINGLEMDS without resume LFSCK"
970 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SKIP_LFSCK > /dev/null ||
971 error "(20.2) Fail to start MDS!"
974 while [ $timer -lt $timeout ]; do
975 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
976 mdt.${MDT_DEV}.recovery_status |
977 awk '/^status/ { print \\\$2 }'")
978 [ "$STATUS" != "RECOVERING" ] && break;
983 [ $timer != $timeout ] ||
984 error "(20.3) recovery timeout"
986 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
987 [ "$STATUS" == "paused" ] ||
988 error "(20.4) Expect 'paused', but got '$STATUS'"
990 #define OBD_FAIL_LFSCK_DELAY3 0x1602
991 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
993 $START_NAMESPACE || error "(21) Fail to start LFSCK for namespace!"
994 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
995 mdd.${MDT_DEV}.lfsck_namespace |
996 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
998 error "(22) unexpected status"
1001 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1002 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
1003 error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
1005 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1006 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1007 mdd.${MDT_DEV}.lfsck_namespace |
1008 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1010 error "(24) unexpected status"
1013 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1014 [ -z "$FLAGS" ] || error "(25) Expect empty flags, but got '$FLAGS'"
1016 run_test 8 "LFSCK state machine"
# Body of the test registered below with `run_test 9a` (function header and
# some lines are not visible in this excerpt).
# Skips when /proc/cpuinfo has no line matching "processor.*: 1".
# Flow: create 5000 files under $DIR/$tdir/lfsck, start layout LFSCK with
# `-r -s $BASE_SPEED1` (100) and expect 'scanning-phase1'; check that
# average_speed_phase1 is below MAX_SPEED (1.3 margin).  Raise
# lfsck_speed_limit to $BASE_SPEED2 (300), then check the average speed
# against MIN_SPEED (0.7 margin) and MAX_SPEED (1.3 margin) computed from
# both intervals.  Finally set the limit to 0 and wait for 'completed'.
# RUN_TIME1, RUN_TIME2 and TIME_DIFF are assigned on lines not visible in
# this excerpt.
1019 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1020 skip "Testing on UP system, the speed may be inaccurate."
1024 check_mount_and_prep
1025 $LFS mkdir -i 0 $DIR/$tdir/lfsck || error "(1) Fail to mkdir lfsck"
1026 $LFS setstripe -c 1 -i -1 $DIR/$tdir/lfsck
1027 createmany -o $DIR/$tdir/lfsck/f 5000
1029 local BASE_SPEED1=100
1031 $START_LAYOUT -r -s $BASE_SPEED1 || error "(2) Fail to start LFSCK!"
# NOTE(review): no `local STATUS` is visible in this fragment, so this
# assignment may set a global variable; confirm against the full function.
1034 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
1035 [ "$STATUS" == "scanning-phase1" ] ||
1036 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
1038 local SPEED=$($SHOW_LAYOUT |
1039 awk '/^average_speed_phase1/ { print $2 }')
1041 # There may be time error, normally it should be less than 2 seconds.
1042 # We allow another 20% schedule error.
1044 # MAX_MARGIN = 1.3 = 13 / 10
1045 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1046 RUN_TIME1 * 13 / 10))
1047 [ $SPEED -lt $MAX_SPEED ] || {
1049 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1050 error "(4) Speed $SPEED, expected < $MAX_SPEED"
1053 # adjust speed limit
1054 local BASE_SPEED2=300
1056 do_facet $SINGLEMDS \
1057 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1060 SPEED=$($SHOW_LAYOUT | awk '/^average_speed_phase1/ { print $2 }')
1061 # MIN_MARGIN = 0.7 = 7 / 10
1062 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1063 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1064 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
1065 [ $SPEED -gt $MIN_SPEED ] || {
1066 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1067 error_ignore LU-5624 \
1068 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
1071 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
1075 # MAX_MARGIN = 1.3 = 13 / 10
1076 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1077 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1078 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1079 [ $SPEED -lt $MAX_SPEED ] || {
1081 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1082 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1083 error "(6) Speed $SPEED, expected < $MAX_SPEED"
1086 do_facet $SINGLEMDS \
1087 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1089 wait_update_facet $SINGLEMDS \
1090 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout |
1091 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
1092 error "(7) Failed to get expected 'completed'"
1094 run_test 9a "LFSCK speed control (1)"
# Test 9b (registered below with run_test 9b): speed control of the namespace
# LFSCK during phase 2.
# Flow: create 50 directories and 50 + 50*50 files while fail_loc 0x1604 is
# set on the MDS, run a first namespace scan with fail_loc 0x160c until it
# reports 'stopped', then restart it with "-s $BASE_SPEED1" and check
# average_speed_phase2 the same way test 9a checks phase 1.
#
# Skip when /proc/cpuinfo has no "processor : 1" entry (uni-processor host).
1097 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1098 skip "Testing on UP system, the speed may be inaccurate."
1104 echo "Preparing another 50 * 50 files (with error) at $(date)."
1105 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1106 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1107 createmany -d $DIR/$tdir/d 50
1108 createmany -m $DIR/$tdir/f 50
# Populate each of the 50 directories with 50 more files.
1109 for ((i = 0; i < 50; i++)); do
1110 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
# First pass: with OBD_FAIL_LFSCK_NO_DOUBLESCAN set, the scan is expected to
# end in 'stopped' (timeout argument 10).
1113 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
1114 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1115 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
1116 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1117 mdd.${MDT_DEV}.lfsck_namespace |
1118 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1120 error "(5) unexpected status"
1123 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1124 echo "Prepared at $(date)."
1126 local BASE_SPEED1=50
# Second pass: started without -r, and expected to be in phase 2 when sampled.
1128 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
1131 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1132 [ "$STATUS" == "scanning-phase2" ] ||
1133 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
1135 local SPEED=$($SHOW_NAMESPACE |
1136 awk '/^average_speed_phase2/ { print $2 }')
1137 # There may be time error, normally it should be less than 2 seconds.
1138 # We allow another 20% schedule error.
# Upper bound: BASE_SPEED1 scaled by (RUN_TIME1 + TIME_DIFF) / RUN_TIME1 and
# then by 13/10, in integer shell arithmetic.
1140 # MAX_MARGIN = 1.3 = 13 / 10
1141 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1142 RUN_TIME1 * 13 / 10))
1143 [ $SPEED -lt $MAX_SPEED ] || {
1145 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1146 error "(8) Speed $SPEED, expected < $MAX_SPEED"
1149 # adjust speed limit
1150 local BASE_SPEED2=150
1152 do_facet $SINGLEMDS \
1153 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1156 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
# Lower bound: both limits weighted by their run times (each shortened by
# TIME_DIFF), divided by the total run time, then scaled by 7/10.
1157 # MIN_MARGIN = 0.7 = 7 / 10
1158 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1159 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1160 (RUN_TIME1 + RUN_TIME2) * 7 / 10))
# A too-low speed on a non-ldiskfs MDS backend is reported through
# error_ignore with ticket LU-5624.
1161 [ $SPEED -gt $MIN_SPEED ] || {
1162 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1163 error_ignore LU-5624 \
1164 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1167 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
# Upper bound for the combined run, with TIME_DIFF added to each run time.
1171 # MAX_MARGIN = 1.3 = 13 / 10
1172 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1173 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1174 (RUN_TIME1 + RUN_TIME2) * 13 / 10))
1175 [ $SPEED -lt $MAX_SPEED ] || {
1177 log "speed1: $BASE_SPEED1 time1: $RUN_TIME1"
1178 log "speed2: $BASE_SPEED2 time2: $RUN_TIME2"
1179 error "(10) Speed $SPEED, expected < $MAX_SPEED"
# Set the limit to 0 (presumably "unlimited" - confirm) and wait for the
# namespace scan to report 'completed' (timeout argument 32).
1182 do_facet $SINGLEMDS \
1183 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1184 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1185 mdd.${MDT_DEV}.lfsck_namespace |
1186 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1188 error "(11) unexpected status"
1191 run_test 9b "LFSCK speed control (2)"
# Test 10 (registered below with run_test 10): ordinary file operations must
# keep working while the namespace LFSCK is in scanning-phase1.
# Flow: build 1000 directories/files under two different fail_loc settings,
# remount the client, start a namespace scan limited with "-s 100", run a set
# of namespace operations, verify the scan is still in phase 1, then set the
# speed limit to 0 and wait for 'completed'.
#
# Only runs when the MDS backend is ldiskfs.
1195 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
1196 skip "lookup(..)/linkea on ZFS issue" && return
1200 echo "Preparing more files with error at $(date)."
1201 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1202 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
# Even-numbered entries (0, 2, ..., 998) are created under fail_loc 0x1603.
1204 for ((i = 0; i < 1000; i = $((i+2)))); do
1205 mkdir -p $DIR/$tdir/d${i}
1206 touch $DIR/$tdir/f${i}
1207 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1210 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1211 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
# Odd-numbered entries (1, 3, ..., 999) are created under fail_loc 0x1604.
1213 for ((i = 1; i < 1000; i = $((i+2)))); do
1214 mkdir -p $DIR/$tdir/d${i}
1215 touch $DIR/$tdir/f${i}
1216 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1219 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1220 echo "Prepared at $(date)."
# Give f200 a second hard link; f200 itself is unlinked during the scan below.
1222 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
# Remount the client before starting the scan.
1224 umount_client $MOUNT
1225 mount_client $MOUNT || error "(3) Fail to start client!"
1227 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1230 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1231 [ "$STATUS" == "scanning-phase1" ] ||
1232 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Operations issued while the scan runs: recursive listing, create, mkdir,
# unlink, recursive remove, rename, hard link and symbolic link.
1234 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1236 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1238 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1240 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1242 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1244 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1246 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1248 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1249 error "(14) Fail to softlink!"
# The scan must not have left phase 1 while the operations above ran.
1251 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1252 [ "$STATUS" == "scanning-phase1" ] ||
1253 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
# Set the speed limit to 0 and wait for 'completed' (timeout argument 32).
1255 do_facet $SINGLEMDS \
1256 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1257 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1258 mdd.${MDT_DEV}.lfsck_namespace |
1259 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1261 error "(16) unexpected status"
1264 run_test 10 "System is available during LFSCK scanning"
# Remove the LAST_ID file (and the d0/0 entry) under O/${idx} from the
# locally mounted backing filesystem of facet ost${ost}.
# ${ost} and ${idx} are presumably the first and second positional arguments
# (the visible caller uses "ost_remove_lastid 1 0") - confirm.
# Returns 1 if mount_fstype fails and 2 if unmount_fstype fails.
1267 ost_remove_lastid() {
# Commands are run on the OST facet through do_facet.
1270 local rcmd="do_facet ost${ost}"
1272 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1274 # step 1: local mount
1275 mount_fstype ost${ost} || return 1
1276 # step 2: remove the specified LAST_ID
# The brace expansion names two paths: .../LAST_ID and .../d0/0.
1277 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
1279 unmount_fstype ost${ost} || return 2
# Test 11a (registered below with run_test 11a): the layout LFSCK on an OST
# rebuilds a LAST_ID file that has been removed.
# Flow: create 64 single-stripe files on OST index 0, remove LAST_ID with
# ost_remove_lastid, start ost1 with $MOUNT_OPTS_NOSCRUB, start the layout
# LFSCK on the OST under fail_loc 0x160e, wait for the 'crashed_lastid' flag,
# clear the fail_loc, wait for 'completed' and check the flags are empty.
1283 check_mount_and_prep
1284 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1285 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
1290 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
1292 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1293 error "(2) Fail to start ost1"
1295 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1296 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1298 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1299 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
# While the fail_loc is set, the OST-side flags should show 'crashed_lastid'
# (timeout argument 60).
1301 wait_update_facet ost1 "$LCTL get_param -n \
1302 obdfilter.${OST_DEV}.lfsck_layout |
1303 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1305 error "(5) unexpected status"
# Clear the fail_loc and wait for the scan to finish.
1308 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1310 wait_update_facet ost1 "$LCTL get_param -n \
1311 obdfilter.${OST_DEV}.lfsck_layout |
1312 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1314 error "(6) unexpected status"
1317 echo "the LAST_ID(s) should have been rebuilt"
# An empty flags field is the success criterion.
1318 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1319 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
1321 run_test 11a "LFSCK can rebuild lost last_id"
# Test 11b (registered below with run_test 11b): the layout LFSCK on an OST
# rebuilds an on-disk LAST_ID that is behind the expected value.
# Flow: with fail_loc 0x160d on ost1, create more files than there are
# pre-created objects and record the in-memory last_id (lastid1); restart
# ost1 under fail_loc 0x215 and read last_id again (lastid2), which must be
# smaller; run the layout LFSCK on the OST; restart ost1 once more and wait
# until last_id equals lastid1.
1324 check_mount_and_prep
1325 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1327 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1328 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1329 do_facet ost1 $LCTL set_param fail_loc=0x160d
# Create 32 files more than the number returned by precreated_ost_obj_count.
1331 local count=$(precreated_ost_obj_count 0 0)
1333 createmany -o $DIR/$tdir/f $((count + 32))
# Read the sequence from the MDT-side osp device, then pick the matching
# "<seq>:<id>" line from the OST's last_id and keep the part after the colon.
1335 local proc_path="${FSNAME}-OST0000-osc-MDT0000"
1336 local seq=$(do_facet mds1 $LCTL get_param -n \
1337 osp.${proc_path}.prealloc_last_seq)
1338 local lastid1=$(do_facet ost1 "lctl get_param -n \
1339 obdfilter.${ost1_svc}.last_id" | grep $seq |
1340 awk -F: '{ print $2 }')
1342 umount_client $MOUNT
1343 stop ost1 || error "(1) Fail to stop ost1"
1345 #define OBD_FAIL_OST_ENOSPC 0x215
1346 do_facet ost1 $LCTL set_param fail_loc=0x215
1348 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1349 error "(2) Fail to start ost1"
# Poll up to 60 times until the restarted OST reports a last_id for $seq.
# NOTE(review): $lastid2 is unquoted in the test below; an empty value still
# evaluates to false, but a multi-word value would make '[' fail.
1351 for ((i = 0; i < 60; i++)); do
1352 lastid2=$(do_facet ost1 "lctl get_param -n \
1353 obdfilter.${ost1_svc}.last_id" | grep $seq |
1354 awk -F: '{ print $2 }')
1355 [ ! -z $lastid2 ] && break;
1359 echo "the on-disk LAST_ID should be smaller than the expected one"
1360 [ $lastid1 -gt $lastid2 ] ||
1361 error "(4) expect lastid1 [ $lastid1 ] > lastid2 [ $lastid2 ]"
1363 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1364 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1366 wait_update_facet ost1 "$LCTL get_param -n \
1367 obdfilter.${OST_DEV}.lfsck_layout |
1368 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1370 error "(6) unexpected status"
# Restart ost1 so that last_id is read again after the LFSCK run.
1373 stop ost1 || error "(7) Fail to stop ost1"
1375 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1376 error "(8) Fail to start ost1"
1378 echo "the on-disk LAST_ID should have been rebuilt"
# Success criterion: last_id for $seq equals lastid1 (timeout argument 60);
# on failure the full last_id output is printed before erroring out.
1379 wait_update_facet ost1 "$LCTL get_param -n \
1380 obdfilter.${ost1_svc}.last_id | grep $seq |
1381 awk -F: '{ print \\\$2 }'" "$lastid1" 60 || {
1382 do_facet ost1 $LCTL get_param -n \
1383 obdfilter.${ost1_svc}.last_id
1384 error "(9) expect lastid1 $seq:$lastid1"
1387 do_facet ost1 $LCTL set_param fail_loc=0
1388 stopall || error "(10) Fail to stopall"
1390 run_test 11b "LFSCK can rebuild crashed last_id"
# Test 12a (registered below with run_test 12a): one lctl command with -A
# starts or stops LFSCK on all targets.
# Flow: create a directory with 100 files on every MDT, then for the namespace
# and the layout component in turn: start with "-A -s 1 -r" and expect
# 'scanning-phase1', stop with "lfsck_stop -A" and expect 'stopped', restart
# with "-A -s 0 -r" and expect 'completed'.
#
# Needs at least two MDTs.
1393 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
1395 check_mount_and_prep
# Directory ${k} is created with "lfs mkdir -i $((k - 1))", one per MDT.
1396 for k in $(seq $MDSCOUNT); do
1397 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1398 createmany -o $DIR/$tdir/${k}/f 100 ||
1399 error "(0) Fail to create 100 files."
1402 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1403 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1404 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1406 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1407 wait_all_targets namespace scanning-phase1 3
1409 echo "Stop namespace LFSCK on all targets by single lctl command."
1410 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1411 error "(4) Fail to stop LFSCK on all devices!"
1413 echo "All the LFSCK targets should be in 'stopped' status."
1414 wait_all_targets_blocked namespace stopped 5
1416 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1417 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1418 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1420 echo "All the LFSCK targets should be in 'completed' status."
1421 wait_all_targets_blocked namespace completed 7
# The layout half of the test runs with full debug logging enabled.
1423 start_full_debug_logging
1425 echo "Start layout LFSCK on all targets by single command (-s 1)."
1426 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1427 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1429 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1430 wait_all_targets layout scanning-phase1 9
1432 echo "Stop layout LFSCK on all targets by single lctl command."
1433 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1434 error "(10) Fail to stop LFSCK on all devices!"
1436 echo "All the LFSCK targets should be in 'stopped' status."
1437 wait_all_targets_blocked layout stopped 11
# Every OST must report 'stopped' for the layout component as well.
1439 for k in $(seq $OSTCOUNT); do
1440 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1441 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1442 awk '/^status/ { print $2 }')
1443 [ "$STATUS" == "stopped" ] ||
1444 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1447 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1448 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1449 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1451 echo "All the LFSCK targets should be in 'completed' status."
1452 wait_all_targets_blocked layout completed 14
1454 stop_full_debug_logging
1456 run_test 12a "single command to trigger LFSCK on all devices"
# Test 12b (registered below with run_test 12b): lfsck_start without -M.
# Flow: "lfsck_start -A -r" with no device named must succeed and both the
# namespace and layout components must reach 'completed'. If mds1 hosts more
# than one MDT device, a start with neither -M nor -A is expected to fail.
1459 check_mount_and_prep
1461 echo "Start LFSCK without '-M' specified."
1462 do_facet mds1 $LCTL lfsck_start -A -r ||
1463 error "(0) Fail to start LFSCK without '-M'"
1465 wait_all_targets_blocked namespace completed 1
1466 wait_all_targets_blocked layout completed 2
# Count the entries in "lctl dl" on mds1 whose third column contains "mdt".
1468 local count=$(do_facet mds1 $LCTL dl |
1469 awk '{ print $3 }' | grep mdt | wc -l)
1470 if [ $count -gt 1 ]; then
1472 echo "Start layout LFSCK on the node with multipe targets,"
1473 echo "but not specify '-M'/'-A' option. Should get failure."
# Success of the command is the error case; the trailing "|| true" keeps the
# expected failure from propagating as a non-zero status.
1475 do_facet mds1 $LCTL lfsck_start -t layout -r &&
1476 error "(3) Start layout LFSCK should fail" || true
1479 run_test 12b "auto detect Lustre device"
# Test 13 (registered below with run_test 13): the layout LFSCK repairs a bad
# lmm_oi in the layout EA.
# Flow: with fail_loc 0x160f on the MDS, create one plain file and one PFL
# file (setstripe -E), clear the fail_loc, run the layout LFSCK and expect
# repaired_others to be exactly 2.
1483 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1484 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1485 echo "MDT-object FID."
1488 check_mount_and_prep
1490 echo "Inject failure stub to simulate bad lmm_oi"
1491 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1492 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
# Two files are created while the fail_loc is active.
1493 createmany -o $DIR/$tdir/f 1
1494 $LFS setstripe -E 1M -E -1 $DIR/$tdir/f1 ||
1495 error "(0) Fail to create PFL $DIR/$tdir/f1"
1496 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1498 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1499 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1501 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1502 mdd.${MDT_DEV}.lfsck_layout |
1503 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1505 error "(2) unexpected status"
# One repair is expected per file created above.
1508 local repaired=$($SHOW_LAYOUT |
1509 awk '/^repaired_others/ { print $2 }')
1510 [ $repaired -eq 2 ] ||
1511 error "(3) Fail to repair crashed lmm_oi: $repaired"
1513 run_test 13 "LFSCK can repair crashed lmm_oi"
# Test 14a (registered below with run_test 14a): dangling LOV EA references,
# without the delayed OST-object creation option.
# Flow: with fail_loc 0x1610 on ost1, create plain and PFL files plus two
# guard files; clear the fail_loc; verify 'ls' fails; run the layout LFSCK
# with "-r" (detects but, per the message below, does not repair) and then
# with "-r -c"; finally verify both guard files can be stat'ed with size 0.
1517 echo "The OST-object referenced by the MDT-object should be there;"
1518 echo "otherwise, the LFSCK should re-create the missing OST-object."
1519 echo "without '--delay-create-ostobj' option."
1522 check_mount_and_prep
1523 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1525 echo "Inject failure stub to simulate dangling referenced MDT-object"
1526 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1527 do_facet ost1 $LCTL set_param fail_loc=0x1610
# Create 16 files more than the number returned by precreated_ost_obj_count.
1528 local count=$(precreated_ost_obj_count 0 0)
1530 createmany -o $DIR/$tdir/f $((count + 16)) ||
1531 error "(0.1) Fail to create $DIR/$tdir/fx"
1532 touch $DIR/$tdir/guard0
# 16 composite (PFL) files with two components each.
1534 for ((i = 0; i < 16; i++)); do
1535 $LFS setstripe -E 512K -S 256K -o 0 -E 2M \
1536 $DIR/$tdir/f_comp${i} ||
1537 error "(0.2) Fail to create $DIR/$tdir/f_comp${i}"
1539 touch $DIR/$tdir/guard1
1541 do_facet ost1 $LCTL set_param fail_loc=0
1543 start_full_debug_logging
1545 # exhaust other pre-created dangling cases
1546 count=$(precreated_ost_obj_count 0 0)
1547 createmany -o $DIR/$tdir/a $count ||
1548 error "(0.5) Fail to create $count files."
1550 echo "'ls' should fail because of dangling referenced MDT-object"
1551 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
1553 echo "Trigger layout LFSCK to find out dangling reference"
1554 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1556 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1557 mdd.${MDT_DEV}.lfsck_layout |
1558 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1560 error "(3) unexpected status"
# At least 32 dangling references must be counted after the first pass.
1563 local repaired=$($SHOW_LAYOUT |
1564 awk '/^repaired_dangling/ { print $2 }')
1565 [ $repaired -ge 32 ] ||
1566 error "(4) Fail to repair dangling reference: $repaired"
1568 echo "'stat' should fail because of not repair dangling by default"
1569 stat $DIR/$tdir/guard0 > /dev/null 2>&1 &&
1570 error "(5.1) stat should fail"
1571 stat $DIR/$tdir/guard1 > /dev/null 2>&1 &&
1572 error "(5.2) stat should fail"
1574 echo "Trigger layout LFSCK to repair dangling reference"
# Second pass adds the -c option.
1575 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1577 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1578 mdd.${MDT_DEV}.lfsck_layout |
1579 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1581 error "(7) unexpected status"
1584 # There may be some async LFSCK updates in processing, wait for
1585 # a while until the target reparation has been done. LU-4970.
1587 echo "'stat' should success after layout LFSCK repairing"
# Poll until stat reports Size 0 for each guard file (timeout argument 32);
# the plain stat output is printed if the wait fails.
1588 wait_update_facet client "stat $DIR/$tdir/guard0 |
1589 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1590 stat $DIR/$tdir/guard0
1592 error "(8.1) unexpected size"
1595 wait_update_facet client "stat $DIR/$tdir/guard1 |
1596 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1597 stat $DIR/$tdir/guard1
1599 error "(8.2) unexpected size"
1602 repaired=$($SHOW_LAYOUT |
1603 awk '/^repaired_dangling/ { print $2 }')
1604 [ $repaired -ge 32 ] ||
1605 error "(9) Fail to repair dangling reference: $repaired"
1607 stop_full_debug_logging
1609 echo "stopall to cleanup object cache"
1612 setupall > /dev/null
1614 run_test 14a "LFSCK can repair MDT-object with dangling LOV EA reference (1)"
# Test 14b (registered below with run_test 14b): dangling LOV EA references,
# with the delayed OST-object creation option (-d).
# Flow: with fail_loc 0x1610 on ost1, create files and one guard file; clear
# the fail_loc; verify 'ls' fails; run the layout LFSCK with "-r -o -d" and
# then with "-r -o -c -d"; finally verify the guard file can be stat'ed with
# size 0.
1618 echo "The OST-object referenced by the MDT-object should be there;"
1619 echo "otherwise, the LFSCK should re-create the missing OST-object."
1620 echo "with '--delay-create-ostobj' option."
1623 check_mount_and_prep
1624 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1626 echo "Inject failure stub to simulate dangling referenced MDT-object"
1627 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1628 do_facet ost1 $LCTL set_param fail_loc=0x1610
# Create 31 files more than the number returned by precreated_ost_obj_count,
# followed by one guard file.
1629 local count=$(precreated_ost_obj_count 0 0)
1631 createmany -o $DIR/$tdir/f $((count + 31))
1632 touch $DIR/$tdir/guard
1633 do_facet ost1 $LCTL set_param fail_loc=0
1635 start_full_debug_logging
1637 # exhaust other pre-created dangling cases
1638 count=$(precreated_ost_obj_count 0 0)
1639 createmany -o $DIR/$tdir/a $count ||
1640 error "(0) Fail to create $count files."
1642 echo "'ls' should fail because of dangling referenced MDT-object"
1643 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
1645 echo "Trigger layout LFSCK to find out dangling reference"
1646 $START_LAYOUT -r -o -d || error "(2) Fail to start LFSCK for layout!"
1648 wait_all_targets_blocked layout completed 3
# At least 32 dangling references must be counted after the first pass.
1650 local repaired=$($SHOW_LAYOUT |
1651 awk '/^repaired_dangling/ { print $2 }')
1652 [ $repaired -ge 32 ] ||
1653 error "(4) Fail to repair dangling reference: $repaired"
1655 echo "'stat' should fail because of not repair dangling by default"
1656 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
1658 echo "Trigger layout LFSCK to repair dangling reference"
# Second pass adds the -c option.
1659 $START_LAYOUT -r -o -c -d || error "(6) Fail to start LFSCK for layout!"
1661 wait_all_targets_blocked layout completed 7
1663 # There may be some async LFSCK updates in processing, wait for
1664 # a while until the target reparation has been done. LU-4970.
1666 echo "'stat' should success after layout LFSCK repairing"
# Poll until stat reports Size 0 for the guard file (timeout argument 32).
1667 wait_update_facet client "stat $DIR/$tdir/guard |
1668 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1669 stat $DIR/$tdir/guard
1671 error "(8) unexpected size"
1674 repaired=$($SHOW_LAYOUT |
1675 awk '/^repaired_dangling/ { print $2 }')
1676 [ $repaired -ge 32 ] ||
1677 error "(9) Fail to repair dangling reference: $repaired"
1679 stop_full_debug_logging
1681 echo "stopall to cleanup object cache"
1684 setupall > /dev/null
1686 run_test 14b "LFSCK can repair MDT-object with dangling LOV EA reference (2)"
# Test 15a (registered below with run_test 15a): OST-objects whose back
# pointer names a non-existent MDT-object.
# Flow: set fail_loc 0x1611 on every OSS node, write a plain file (f0) and a
# PFL file (f1), drop the client's osc locks, clear the fail_loc, run the
# layout LFSCK and expect repaired_unmatched_pair to be at least 3.
1690 echo "If the OST-object referenced by the MDT-object back points"
1691 echo "to some non-exist MDT-object, then the LFSCK should repair"
1692 echo "the OST-object to back point to the right MDT-object."
1695 check_mount_and_prep
1696 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1698 echo "Inject failure stub to make the OST-object to back point to"
1699 echo "non-exist MDT-object."
1700 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
# The fail_loc is applied on all OSS nodes, not only on ost1.
1702 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
1703 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1704 $LFS setstripe -E 1M -S 256K -c 1 -E -1 -S 512K -c $OSTCOUNT \
1706 error "(0) Fail to create PFL $DIR/$tdir/f1"
1707 # 'dd' will trigger punch RPC firstly on every OST-objects.
1708 # So even though some OST-object will not be write by 'dd',
1709 # as long as it is allocated (may be NOT allocated in pfl_3b)
1710 # its layout information will be set also.
1711 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4K count=257
1712 cancel_lru_locks osc
1713 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1715 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1716 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1718 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1719 mdd.${MDT_DEV}.lfsck_layout |
1720 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1722 error "(2) unexpected status"
# Success criterion: at least three unmatched pairs repaired.
1725 local repaired=$($SHOW_LAYOUT |
1726 awk '/^repaired_unmatched_pair/ { print $2 }')
1727 [ $repaired -ge 3 ] ||
1728 error "(3) Fail to repair unmatched pair: $repaired"
1730 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# Test 15b (registered below with run_test 15b): OST-objects whose back
# pointer names another MDT-object that does not recognize them.
# Flow: write a guard file before injecting the failure, set fail_loc 0x1612
# on every OSS node, write a plain file and a PFL file, clear the fail_loc,
# run the layout LFSCK and expect repaired_unmatched_pair to be exactly 4.
1734 echo "If the OST-object referenced by the MDT-object back points"
1735 echo "to other MDT-object that doesn't recognize the OST-object,"
1736 echo "then the LFSCK should repair it to back point to the right"
1737 echo "MDT-object (the first one)."
1740 check_mount_and_prep
1741 mkdir -p $DIR/$tdir/0
1742 $LFS setstripe -c 1 -i 0 $DIR/$tdir/0
# The guard file is written while no fail_loc is set.
1743 dd if=/dev/zero of=$DIR/$tdir/0/guard bs=1M count=1
1744 cancel_lru_locks osc
1746 echo "Inject failure stub to make the OST-object to back point to"
1747 echo "other MDT-object"
# Use two stripes for the first PFL component when at least two OSTs exist.
1750 [ $OSTCOUNT -ge 2 ] && stripes=2
1752 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1753 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1612
1754 dd if=/dev/zero of=$DIR/$tdir/0/f0 bs=1M count=1
1755 $LFS setstripe -E 1M -S 256K -c $stripes -E 2M -S 512K -c 1 \
1757 error "(0) Fail to create PFL $DIR/$tdir/f1"
1758 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2
1759 cancel_lru_locks osc
1760 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1762 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1763 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1765 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1766 mdd.${MDT_DEV}.lfsck_layout |
1767 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1769 error "(2) unexpected status"
# Success criterion: exactly four unmatched pairs repaired.
1772 local repaired=$($SHOW_LAYOUT |
1773 awk '/^repaired_unmatched_pair/ { print $2 }')
1774 [ $repaired -eq 4 ] ||
1775 error "(3) Fail to repair unmatched pair: $repaired"
1777 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# Test 15c (registered below with run_test 15c): the layout LFSCK racing with
# a delayed directory migration.
# Flow: create a directory with one file, set fail_loc 0x1803 with fail_val=5
# on mds2, start "lfs migrate -m 0" in the background, run the layout LFSCK
# on all targets, then expect repaired_unmatched_pair == 1 and
# repaired_multiple_referenced == 0.
#
# Needs at least two MDTs, and is skipped when the MDS version is 2.7.55 or
# newer (LU-6437).
1780 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
1782 [ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.55) ] &&
1783 skip "Skip the test after 2.7.55 see LU-6437" && return
1786 echo "According to current metadata migration implementation,"
1787 echo "before the old MDT-object is removed, both the new MDT-object"
1788 echo "and old MDT-object will reference the same LOV layout. Then if"
1789 echo "the layout LFSCK finds the new MDT-object by race, it will"
1790 echo "regard related OST-object(s) as multiple referenced case, and"
1791 echo "will try to create new OST-object(s) for the new MDT-object."
1792 echo "To avoid such trouble, the layout LFSCK needs to lock the old"
1793 echo "MDT-object before confirm the multiple referenced case."
1796 check_mount_and_prep
# a1 is created with "lfs mkdir -i 1"; its file uses a single stripe on OST 0.
1797 $LFS mkdir -i 1 $DIR/$tdir/a1
1798 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1799 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=1
1800 cancel_lru_locks osc
1802 echo "Inject failure stub on MDT1 to delay the migration"
1804 #define OBD_FAIL_MIGRATE_DELAY 0x1803
1805 do_facet mds2 $LCTL set_param fail_val=5 fail_loc=0x1803
1806 echo "Migrate $DIR/$tdir/a1 from MDT1 to MDT0 with delay"
# The migration runs in the background so that the scan below overlaps it.
1807 $LFS migrate -m 0 $DIR/$tdir/a1 &
1810 echo "Trigger layout LFSCK to race with the migration"
1811 $START_LAYOUT -A -r || error "(1) Fail to start layout LFSCK!"
1813 wait_all_targets_blocked layout completed 2
1815 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
1816 local repaired=$($SHOW_LAYOUT |
1817 awk '/^repaired_unmatched_pair/ { print $2 }')
1818 [ $repaired -eq 1 ] ||
1819 error "(3) Fail to repair unmatched pair: $repaired"
# No multiple-reference repair may have happened because of the race.
1821 repaired=$($SHOW_LAYOUT |
1822 awk '/^repaired_multiple_referenced/ { print $2 }')
1823 [ $repaired -eq 0 ] ||
1824 error "(4) Unexpectedly repaird multiple references: $repaired"
1826 run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)"
# Test 16 (registered below with run_test 16): the layout LFSCK repairs an
# OST-object whose owner differs from the MDT-object's.
# Flow: write one file, set fail_loc 0x1613 on the MDS, chown the file to
# 1.1, clear the fail_loc, run the layout LFSCK and expect
# repaired_inconsistent_owner to be exactly 1.
1830 echo "If the OST-object's owner information does not match the owner"
1831 echo "information stored in the MDT-object, then the LFSCK trust the"
1832 echo "MDT-object and update the OST-object's owner information."
1835 check_mount_and_prep
1836 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1837 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1838 cancel_lru_locks osc
1840 echo "Inject failure stub to skip OST-object owner changing"
1841 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
# The chown is issued only while the fail_loc is set.
1842 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
1843 chown 1.1 $DIR/$tdir/f0
1844 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1846 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
1849 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1851 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1852 mdd.${MDT_DEV}.lfsck_layout |
1853 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1855 error "(2) unexpected status"
# Exactly one owner repair is expected, for f0.
1858 local repaired=$($SHOW_LAYOUT |
1859 awk '/^repaired_inconsistent_owner/ { print $2 }')
1860 [ $repaired -eq 1 ] ||
1861 error "(3) Fail to repair inconsistent owner: $repaired"
1863 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# Test 17 (registered below with run_test 17): the layout LFSCK repairs
# MDT-objects that reference the same OST-object.
# Flow: with fail_loc 0x1614 on the MDS, write 1 MiB to a guard file and
# create f0 and a PFL file f1; both then report the guard's size. After the
# layout LFSCK (repaired_multiple_referenced == 2), writing 2 MiB to f0 and
# f1 must leave the guard's size at 1048576.
1867 echo "If more than one MDT-objects reference the same OST-object,"
1868 echo "and the OST-object only recognizes one MDT-object, then the"
1869 echo "LFSCK should create new OST-objects for such non-recognized"
1873 check_mount_and_prep
1874 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1876 echo "Inject failure stub to make two MDT-objects to refernce"
1877 echo "the OST-object"
1879 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
1880 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
1881 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
# Client mdc and osc locks are dropped after each creation step.
1882 cancel_lru_locks mdc
1883 cancel_lru_locks osc
1885 createmany -o $DIR/$tdir/f 1
1886 cancel_lru_locks mdc
1887 cancel_lru_locks osc
1889 $LFS setstripe -E 2M -S 256K -o 0 -E 4M -S 512K -o 0 \
1891 error "(0) Fail to create PFL $DIR/$tdir/f1"
1892 cancel_lru_locks mdc
1893 cancel_lru_locks osc
1894 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1896 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
1897 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard use the same OST-objects"
# Field 5 of "ls -l" is the file size; f0 and f1 were never written, yet are
# expected to show the guard's 1 MiB.
1898 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
1899 [ $size -eq 1048576 ] ||
1900 error "(1.1) f0 (wrong) size should be 1048576, but got $size"
1902 size=$(ls -l $DIR/$tdir/f1 | awk '{ print $5 }')
1903 [ $size -eq 1048576 ] ||
1904 error "(1.2) f1 (wrong) size should be 1048576, but got $size"
1906 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
1909 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1911 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1912 mdd.${MDT_DEV}.lfsck_layout |
1913 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1915 error "(3) unexpected status"
# Two repairs are expected: one for f0 and one for f1.
1918 local repaired=$($SHOW_LAYOUT |
1919 awk '/^repaired_multiple_referenced/ { print $2 }')
1920 [ $repaired -eq 2 ] ||
1921 error "(4) Fail to repair multiple references: $repaired"
1923 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
# Writing 2 MiB to f0 must not change the guard's size.
1924 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
1925 error "(5) Fail to write f0."
1926 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1927 [ $size -eq 1048576 ] ||
1928 error "(6) guard size should be 1048576, but got $size"
1930 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard should use diff OST-objects"
# Same check after writing 2 MiB to f1.
1931 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2 ||
1932 error "(7) Fail to write f1."
1933 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1934 [ $size -eq 1048576 ] ||
1935 error "(8) guard size should be 1048576, but got $size"
1937 run_test 17 "LFSCK can repair multiple references"
# Add "cache" to the local debug setting via the "+" form of set_param
# (presumably extends the existing debug mask - confirm); output is discarded.
1939 $LCTL set_param debug=+cache > /dev/null
# Test 18a (registered below with run_test 18a): the layout LFSCK regenerates
# layout EA entries that were lost from existing MDT-objects.
# Flow: write a1/f1 (and a2/f2 when at least two MDTs exist) plus a PFL file
# f3, record their sizes, set fail_loc 0x1615 on the MDS(es) and chown the
# files, verify the sizes now differ, run the layout LFSCK with "-r -o", wait
# for every MDT and check every OST for 'completed', check the repaired_orphan
# counters, and verify the sizes match the recorded ones again.
1943 echo "The target MDT-object is there, but related stripe information"
1944 echo "is lost or partly lost. The LFSCK should regenerate the missing"
1945 echo "layout EA entries."
1948 check_mount_and_prep
1949 $LFS mkdir -i 0 $DIR/$tdir/a1
1950 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1951 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# With "ls -il" the inode number is the first column, so field 6 is the size.
1953 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1955 $LFS path2fid $DIR/$tdir/a1/f1
1956 $LFS getstripe $DIR/$tdir/a1/f1
# Second directory and file, only when a second MDT is available.
1958 if [ $MDSCOUNT -ge 2 ]; then
1959 $LFS mkdir -i 1 $DIR/$tdir/a2
1960 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
1961 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1962 $LFS path2fid $DIR/$tdir/a2/f2
1963 $LFS getstripe $DIR/$tdir/a2/f2
1966 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
1967 error "(0) Fail to create PFL $DIR/$tdir/f3"
1969 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
1971 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
1973 $LFS path2fid $DIR/$tdir/f3
1974 $LFS getstripe $DIR/$tdir/f3
1976 cancel_lru_locks osc
1978 echo "Inject failure, to make the MDT-object lost its layout EA"
1979 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
# The chown calls are issued while the fail_loc is set on the MDS(es).
1980 do_facet mds1 $LCTL set_param fail_loc=0x1615
1981 chown 1.1 $DIR/$tdir/a1/f1
1983 if [ $MDSCOUNT -ge 2 ]; then
1984 do_facet mds2 $LCTL set_param fail_loc=0x1615
1985 chown 1.1 $DIR/$tdir/a2/f2
1988 chown 1.1 $DIR/$tdir/f3
1993 do_facet mds1 $LCTL set_param fail_loc=0
1994 if [ $MDSCOUNT -ge 2 ]; then
1995 do_facet mds2 $LCTL set_param fail_loc=0
1998 cancel_lru_locks mdc
1999 cancel_lru_locks osc
2001 echo "The file size should be incorrect since layout EA is lost"
2002 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2003 [ "$cur_size" != "$saved_size1" ] ||
2004 error "(1) Expect incorrect file1 size"
# f2 was written with the same dd parameters as f1, so saved_size1 is reused
# as its reference size.
2006 if [ $MDSCOUNT -ge 2 ]; then
2007 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2008 [ "$cur_size" != "$saved_size1" ] ||
2009 error "(2) Expect incorrect file2 size"
2012 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2013 [ "$cur_size" != "$saved_size2" ] ||
2014 error "(1.2) Expect incorrect file3 size"
2016 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2017 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
2019 for k in $(seq $MDSCOUNT); do
2020 # The LFSCK status query internal is 30 seconds. For the case
2021 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2022 # time to guarantee the status sync up.
2023 wait_update_facet mds${k} "$LCTL get_param -n \
2024 mdd.$(facet_svc mds${k}).lfsck_layout |
2025 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2026 error "(4) MDS${k} is not the expected 'completed'"
# OSTs are checked once each, without waiting.
2029 for k in $(seq $OSTCOUNT); do
2030 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2031 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2032 awk '/^status/ { print $2 }')
2033 [ "$cur_status" == "completed" ] ||
2034 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan counts: 3 on mds1 and, with two MDTs, 2 on mds2.
2037 local repaired=$(do_facet mds1 $LCTL get_param -n \
2038 mdd.$(facet_svc mds1).lfsck_layout |
2039 awk '/^repaired_orphan/ { print $2 }')
2040 [ $repaired -eq 3 ] ||
2041 error "(6.1) Expect 3 fixed on mds1, but got: $repaired"
2043 if [ $MDSCOUNT -ge 2 ]; then
2044 repaired=$(do_facet mds2 $LCTL get_param -n \
2045 mdd.$(facet_svc mds2).lfsck_layout |
2046 awk '/^repaired_orphan/ { print $2 }')
2047 [ $repaired -eq 2 ] ||
2048 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
# Print FID and layout of each file again after the scan.
2051 $LFS path2fid $DIR/$tdir/a1/f1
2052 $LFS getstripe $DIR/$tdir/a1/f1
2054 if [ $MDSCOUNT -ge 2 ]; then
2055 $LFS path2fid $DIR/$tdir/a2/f2
2056 $LFS getstripe $DIR/$tdir/a2/f2
2059 $LFS path2fid $DIR/$tdir/f3
2060 $LFS getstripe $DIR/$tdir/f3
2062 echo "The file size should be correct after layout LFSCK scanning"
2063 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2064 [ "$cur_size" == "$saved_size1" ] ||
2065 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2067 if [ $MDSCOUNT -ge 2 ]; then
2068 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2069 [ "$cur_size" == "$saved_size1" ] ||
2070 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# NOTE(review): the message for error (9) says "file1", but the comparison
# below is for f3 (saved_size2).
2073 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2074 [ "$cur_size" == "$saved_size2" ] ||
2075 error "(9) Expect file1 size $saved_size2, but got $cur_size"
2077 run_test 18a "Find out orphan OST-object and repair it (1)"
# Test 18b body (registered by the run_test call that ends this block).
# Flow: create a1/f1 (plus a2/f2 when MDSCOUNT >= 2) and the PFL file f3,
# remove them while fail_loc 0x1616 (OBD_FAIL_LFSCK_LOST_MDTOBJ) is set on
# the MDS(es), run layout LFSCK with "-r -o", check the repaired_orphan
# counters, then move the re-created objects out of
# .lustre/lost+found/MDTxxxx and verify the file sizes.
2081 echo "The target MDT-object is lost. The LFSCK should re-create the"
2082 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
2083 echo "can move it back to normal namespace manually."
2086 check_mount_and_prep
2087 $LFS mkdir -i 0 $DIR/$tdir/a1
2088 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2089 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2090 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2091 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2093 $LFS getstripe $DIR/$tdir/a1/f1
2095 if [ $MDSCOUNT -ge 2 ]; then
2096 $LFS mkdir -i 1 $DIR/$tdir/a2
2097 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2098 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
# Declared local (like fid1/fid3) so the FID does not leak into other tests.
2099 local fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
2101 $LFS getstripe $DIR/$tdir/a2/f2
2104 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2105 error "(0) Fail to create PFL $DIR/$tdir/f3"
2107 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2109 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2110 local fid3=$($LFS path2fid $DIR/$tdir/f3)
2112 $LFS getstripe $DIR/$tdir/f3
2114 cancel_lru_locks osc
2116 echo "Inject failure, to simulate the case of missing the MDT-object"
2117 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2118 do_facet mds1 $LCTL set_param fail_loc=0x1616
2119 rm -f $DIR/$tdir/a1/f1
2121 if [ $MDSCOUNT -ge 2 ]; then
2122 do_facet mds2 $LCTL set_param fail_loc=0x1616
2123 rm -f $DIR/$tdir/a2/f2
# Clear the injected failure on every MDS that had it set.
2131 do_facet mds1 $LCTL set_param fail_loc=0
2132 if [ $MDSCOUNT -ge 2 ]; then
2133 do_facet mds2 $LCTL set_param fail_loc=0
2136 cancel_lru_locks mdc
2137 cancel_lru_locks osc
2139 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2140 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2142 for k in $(seq $MDSCOUNT); do
2143 # The LFSCK status query interval is 30 seconds. For the case
2144 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2145 # time to guarantee the status sync up.
2146 wait_update_facet mds${k} "$LCTL get_param -n \
2147 mdd.$(facet_svc mds${k}).lfsck_layout |
2148 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2149 error "(2) MDS${k} is not the expected 'completed'"
# Every OST must also report the "completed" status.
2152 for k in $(seq $OSTCOUNT); do
2153 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2154 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2155 awk '/^status/ { print $2 }')
2156 [ "$cur_status" == "completed" ] ||
2157 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Check the repaired_orphan counter reported by each MDS.
2160 local repaired=$(do_facet mds1 $LCTL get_param -n \
2161 mdd.$(facet_svc mds1).lfsck_layout |
2162 awk '/^repaired_orphan/ { print $2 }')
2163 [ $repaired -eq 3 ] ||
2164 error "(4.1) Expect 3 fixed on mds1, but got: $repaired"
2166 if [ $MDSCOUNT -ge 2 ]; then
2167 repaired=$(do_facet mds2 $LCTL get_param -n \
2168 mdd.$(facet_svc mds2).lfsck_layout |
2169 awk '/^repaired_orphan/ { print $2 }')
2170 [ $repaired -eq 2 ] ||
2171 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
# The re-created objects are expected under lost+found as "<fid>-R-0".
2174 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2175 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2176 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2178 if [ $MDSCOUNT -ge 2 ]; then
2179 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
2180 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
# Uses its own failure id so it cannot be confused with the f1 move above.
2183 mv $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0 $DIR/$tdir/f3 ||
2184 error "(6.1) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
2186 $LFS path2fid $DIR/$tdir/a1/f1
2187 $LFS getstripe $DIR/$tdir/a1/f1
2189 if [ $MDSCOUNT -ge 2 ]; then
2190 $LFS path2fid $DIR/$tdir/a2/f2
2191 $LFS getstripe $DIR/$tdir/a2/f2
2194 $LFS path2fid $DIR/$tdir/f3
2195 $LFS getstripe $DIR/$tdir/f3
2197 echo "The file size should be correct after layout LFSCK scanning"
2198 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2199 [ "$cur_size" == "$saved_size1" ] ||
2200 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2202 if [ $MDSCOUNT -ge 2 ]; then
2203 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2204 [ "$cur_size" == "$saved_size1" ] ||
2205 error "(8) Expect file2 size $saved_size1, but got $cur_size"
2208 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2209 [ "$cur_size" == "$saved_size2" ] ||
2210 error "(9) Expect file3 size $saved_size2, but got $cur_size"
2212 run_test 18b "Find out orphan OST-object and repair it (2)"
# Test 18c body (registered by the run_test call that ends this block).
# Flow: create the files while fail_loc 0x1617 (OBD_FAIL_LFSCK_NOPFID) is set
# on the OST nodes, remove them while fail_loc 0x1616
# (OBD_FAIL_LFSCK_LOST_MDTOBJ) is set on the MDS(es), run layout LFSCK with
# "-r -o", then check the repaired_orphan counters and look for "*-N-*"
# stubs under .lustre/lost+found/MDT0000 (and none under MDT0001).
2216 echo "The target MDT-object is lost, and the OST-object FID is missing."
2217 echo "The LFSCK should re-create the MDT-object with new FID under the "
2218 echo "directory .lustre/lost+found/MDTxxxx."
2221 check_mount_and_prep
2222 $LFS mkdir -i 0 $DIR/$tdir/a1
2223 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2225 echo "Inject failure, to simulate the case of missing parent FID"
2226 #define OBD_FAIL_LFSCK_NOPFID 0x1617
2227 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1617
2229 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2230 $LFS getstripe $DIR/$tdir/a1/f1
2232 if [ $MDSCOUNT -ge 2 ]; then
2233 $LFS mkdir -i 1 $DIR/$tdir/a2
2234 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a2
2235 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2236 $LFS getstripe $DIR/$tdir/a2/f2
2239 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2240 error "(0) Fail to create PFL $DIR/$tdir/f3"
2242 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2243 $LFS getstripe $DIR/$tdir/f3
2245 cancel_lru_locks osc
2246 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
2248 echo "Inject failure, to simulate the case of missing the MDT-object"
2249 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2250 do_facet mds1 $LCTL set_param fail_loc=0x1616
2251 rm -f $DIR/$tdir/a1/f1
2253 if [ $MDSCOUNT -ge 2 ]; then
2254 do_facet mds2 $LCTL set_param fail_loc=0x1616
2255 rm -f $DIR/$tdir/a2/f2
# Clear the injected failure on every MDS that had it set.
2263 do_facet mds1 $LCTL set_param fail_loc=0
2264 if [ $MDSCOUNT -ge 2 ]; then
2265 do_facet mds2 $LCTL set_param fail_loc=0
2268 cancel_lru_locks mdc
2269 cancel_lru_locks osc
2271 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2272 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2274 for k in $(seq $MDSCOUNT); do
2275 # The LFSCK status query interval is 30 seconds. For the case
2276 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2277 # time to guarantee the status sync up.
2278 wait_update_facet mds${k} "$LCTL get_param -n \
2279 mdd.$(facet_svc mds${k}).lfsck_layout |
2280 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2281 error "(2) MDS${k} is not the expected 'completed'"
# Every OST must also report the "completed" status.
2284 for k in $(seq $OSTCOUNT); do
2285 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2286 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2287 awk '/^status/ { print $2 }')
2288 [ "$cur_status" == "completed" ] ||
2289 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# NOTE(review): $expected is not assigned on any line visible here; it is
# presumably set inside this MDSCOUNT branch - confirm against the full file.
2292 if [ $MDSCOUNT -ge 2 ]; then
2298 local repaired=$(do_facet mds1 $LCTL get_param -n \
2299 mdd.$(facet_svc mds1).lfsck_layout |
2300 awk '/^repaired_orphan/ { print $2 }')
2301 [ $repaired -eq $expected ] ||
2302 error "(4) Expect $expected fixed on mds1, but got: $repaired"
2304 if [ $MDSCOUNT -ge 2 ]; then
2305 repaired=$(do_facet mds2 $LCTL get_param -n \
2306 mdd.$(facet_svc mds2).lfsck_layout |
2307 awk '/^repaired_orphan/ { print $2 }')
2308 [ $repaired -eq 0 ] ||
2309 error "(5) Expect 0 fixed on mds2, but got: $repaired"
2312 ls -ail $MOUNT/.lustre/lost+found/
2314 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
2315 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
# The pattern is quoted so the shell cannot expand it against the current
# directory before find sees it; cname is local so it does not leak.
2316 local cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-N-*')
2318 error "(6) .lustre/lost+found/MDT0001/ should be empty"
2321 echo "There should be some stub under .lustre/lost+found/MDT0000/"
2322 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2323 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# Declared local again because the MDT0001 branch above may not have run.
2325 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-N-*')
2326 [ ! -z "$cname" ] ||
2327 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2329 run_test 18c "Find out orphan OST-object and repair it (3)"
# Test 18d body (registered by the run_test call that ends this block).
# Flow: create f1..f4 under a1 (f4 is a PFL file), set fail_loc 0x1618
# (OBD_FAIL_LFSCK_CHANGE_STRIPE) on $SINGLEMDS while chown-ing f2/f4 and
# removing f1/f3, restart the setup, run layout LFSCK with "-r -o -c -d",
# then check status, repaired_orphan == 2, repaired_dangling == 0 and that
# the sizes of f2/f4 match the values saved before the injection.
2333 echo "The target MDT-object layout EA is corrupted, but the right"
2334 echo "OST-object is still alive as orphan. The layout LFSCK will"
2335 echo "not create new OST-object to occupy such slot."
2338 check_mount_and_prep
2340 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2341 echo "guard" > $DIR/$tdir/a1/f1
2342 echo "foo" > $DIR/$tdir/a1/f2
2344 echo "guard" > $DIR/$tdir/a1/f3
2345 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2346 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2347 echo "foo" > $DIR/$tdir/a1/f4
# Remember the sizes of f2/f4 for the comparison after the LFSCK run.
2349 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2350 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
# Print FID and layout of every file into the test log.
2351 $LFS path2fid $DIR/$tdir/a1/f1
2352 $LFS getstripe $DIR/$tdir/a1/f1
2353 $LFS path2fid $DIR/$tdir/a1/f2
2354 $LFS getstripe $DIR/$tdir/a1/f2
2355 $LFS path2fid $DIR/$tdir/a1/f3
2356 $LFS getstripe $DIR/$tdir/a1/f3
2357 $LFS path2fid $DIR/$tdir/a1/f4
2358 $LFS getstripe $DIR/$tdir/a1/f4
2359 cancel_lru_locks osc
2361 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2362 echo "to reference the same OST-object (which is f1's OST-obejct)."
2363 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2364 echo "dangling reference case, but f2's old OST-object is there."
2366 echo "The failure also makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2367 echo "to reference the same OST-object (which is f3's OST-obejct)."
2368 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2369 echo "dangling reference case, but f4's old OST-object is there."
2372 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2373 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2374 chown 1.1 $DIR/$tdir/a1/f2
2375 chown 1.1 $DIR/$tdir/a1/f4
2376 rm -f $DIR/$tdir/a1/f1
2377 rm -f $DIR/$tdir/a1/f3
2380 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2382 echo "stopall to cleanup object cache"
2385 setupall > /dev/null
2387 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2388 $START_LAYOUT -r -o -c -d || error "(2) Fail to start LFSCK for layout!"
2390 for k in $(seq $MDSCOUNT); do
2391 # The LFSCK status query interval is 30 seconds. For the case
2392 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2393 # time to guarantee the status sync up.
2394 wait_update_facet mds${k} "$LCTL get_param -n \
2395 mdd.$(facet_svc mds${k}).lfsck_layout |
2396 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2397 error "(3) MDS${k} is not the expected 'completed'"
# Every OST must also report the "completed" status.
2400 for k in $(seq $OSTCOUNT); do
2401 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2402 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2403 awk '/^status/ { print $2 }')
2404 [ "$cur_status" == "completed" ] ||
2405 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
# Two orphans are expected to be repaired and no dangling reference.
2408 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2409 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2410 awk '/^repaired_orphan/ { print $2 }')
2411 [ $repaired -eq 2 ] ||
2412 error "(5) Expect 2 orphans have been fixed, but got: $repaired"
2414 repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2415 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2416 awk '/^repaired_dangling/ { print $2 }')
2417 [ $repaired -eq 0 ] ||
2418 error "(6) Expect 0 dangling has been fixed, but got: $repaired"
2420 echo "The file size should be correct after layout LFSCK scanning"
2421 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2422 [ "$cur_size" == "$saved_size1" ] ||
2423 error "(7) Expect file2 size $saved_size1, but got $cur_size"
2425 cur_size=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2426 [ "$cur_size" == "$saved_size2" ] ||
2427 error "(8) Expect file4 size $saved_size2, but got $cur_size"
2429 echo "The LFSCK should find back the original data."
2430 cat $DIR/$tdir/a1/f2
2431 $LFS path2fid $DIR/$tdir/a1/f2
2432 $LFS getstripe $DIR/$tdir/a1/f2
2433 cat $DIR/$tdir/a1/f4
2434 $LFS path2fid $DIR/$tdir/a1/f4
2435 $LFS getstripe $DIR/$tdir/a1/f4
2437 run_test 18d "Find out orphan OST-object and repair it (4)"
# Test 18e body (registered by the run_test call that ends this block).
# Flow: same file setup and fail_loc 0x1618 injection as the previous test,
# then run layout LFSCK with "-r -o -c" while fail_loc 0x1602
# (OBD_FAIL_LFSCK_DELAY3, fail_val=10) is set, append new data to f2/f4 once
# mds1 reports "scanning-phase2", and finally expect two "*-C-*" stub files
# under .lustre/lost+found/MDT0000 whose sizes match the saved f2/f4 sizes.
2441 echo "The target MDT-object layout EA slot is occpuied by some new"
2442 echo "created OST-object when repair dangling reference case. Such"
2443 echo "conflict OST-object has been modified by others. To keep the"
2444 echo "new data, the LFSCK will create a new file to refernece this"
2445 echo "old orphan OST-object."
2448 check_mount_and_prep
2450 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2451 echo "guard" > $DIR/$tdir/a1/f1
2452 echo "foo" > $DIR/$tdir/a1/f2
2454 echo "guard" > $DIR/$tdir/a1/f3
2455 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2456 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2457 echo "foo" > $DIR/$tdir/a1/f4
# Remember the original sizes of f2/f4; the stub files are compared to them.
2459 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2460 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2462 $LFS path2fid $DIR/$tdir/a1/f1
2463 $LFS getstripe $DIR/$tdir/a1/f1
2464 $LFS path2fid $DIR/$tdir/a1/f2
2465 $LFS getstripe $DIR/$tdir/a1/f2
2466 $LFS path2fid $DIR/$tdir/a1/f3
2467 $LFS getstripe $DIR/$tdir/a1/f3
2468 $LFS path2fid $DIR/$tdir/a1/f4
2469 $LFS getstripe $DIR/$tdir/a1/f4
2470 cancel_lru_locks osc
2472 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2473 echo "to reference the same OST-object (which is f1's OST-obejct)."
2474 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2475 echo "dangling reference case, but f2's old OST-object is there."
2477 echo "Also the failure makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2478 echo "to reference the same OST-object (which is f3's OST-obejct)."
2479 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2480 echo "dangling reference case, but f4's old OST-object is there."
2483 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2484 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2485 chown 1.1 $DIR/$tdir/a1/f2
2486 chown 1.1 $DIR/$tdir/a1/f4
2487 rm -f $DIR/$tdir/a1/f1
2488 rm -f $DIR/$tdir/a1/f3
2491 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2493 echo "stopall to cleanup object cache"
2496 setupall > /dev/null
2498 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2499 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
2501 start_full_debug_logging
2503 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2504 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
# Wait until mds1 reports the second scanning phase before touching f2/f4.
2506 wait_update_facet mds1 "$LCTL get_param -n \
2507 mdd.$(facet_svc mds1).lfsck_layout |
2508 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2509 error "(3) MDS1 is not the expected 'scanning-phase2'"
2511 # to guarantee all updates are synced.
2515 echo "Write new data to f2/f4 to modify the new created OST-object."
2516 echo "dummy" >> $DIR/$tdir/a1/f2 || error "write a1/f2 failed"
2517 echo "dummy" >> $DIR/$tdir/a1/f4 || error "write a1/f4 failed"
2519 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
2521 for k in $(seq $MDSCOUNT); do
2522 # The LFSCK status query interval is 30 seconds. For the case
2523 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2524 # time to guarantee the status sync up.
2525 wait_update_facet mds${k} "$LCTL get_param -n \
2526 mdd.$(facet_svc mds${k}).lfsck_layout |
2527 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2528 error "(4) MDS${k} is not the expected 'completed'"
# Every OST must also report the "completed" status.
2531 for k in $(seq $OSTCOUNT); do
2532 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2533 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2534 awk '/^status/ { print $2 }')
2535 [ "$cur_status" == "completed" ] ||
2536 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2539 stop_full_debug_logging
2541 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2542 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2543 awk '/^repaired_orphan/ { print $2 }')
2544 [ $repaired -eq 2 ] ||
2545 error "(6) Expect 2 orphans have been fixed, but got: $repaired"
2547 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2548 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2549 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
2551 local count=$(ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-* | wc -l)
2552 if [ $count -ne 2 ]; then
2553 ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-*
2554 error "(8) Expect 2 stubs under lost+found, but got $count"
2557 echo "The stub file should keep the original f2 or f4 data"
# The pattern is quoted so the shell cannot expand it against the current
# directory before find sees it; cname is local so it does not leak.
2558 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-C-*' | head -n 1)
2559 local cur_size=$(ls -il $cname | awk '{ print $6 }')
2560 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2561 error "(9) Got unexpected $cur_size"
2564 $LFS path2fid $cname
2565 $LFS getstripe $cname
# Same check for the last matching stub file.
2567 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-C-*' | tail -n 1)
2568 cur_size=$(ls -il $cname | awk '{ print $6 }')
2569 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2570 error "(10) Got unexpected $cur_size"
2573 $LFS path2fid $cname
2574 $LFS getstripe $cname
2576 echo "The f2/f4 should contains new data."
2577 cat $DIR/$tdir/a1/f2
2578 $LFS path2fid $DIR/$tdir/a1/f2
2579 $LFS getstripe $DIR/$tdir/a1/f2
2580 cat $DIR/$tdir/a1/f4
2581 $LFS path2fid $DIR/$tdir/a1/f4
2582 $LFS getstripe $DIR/$tdir/a1/f4
2584 run_test 18e "Find out orphan OST-object and repair it (5)"
# Test 18f body (registered by the run_test call that ends this block).
# Skipped when fewer than 2 OSTs are configured.
# Flow: create files on MDT0 (and MDT1 when MDSCOUNT >= 2), remove them while
# fail_loc 0x1616 (OBD_FAIL_LFSCK_LOST_MDTOBJ) is set, run layout LFSCK with
# fail_loc 0x161c (OBD_FAIL_LFSCK_BAD_NETWORK) set on mds1 and expect
# "partial" on the MDS(es) and ost1 but "completed" on ost2, then clear the
# failure, rerun LFSCK and expect "completed" everywhere.
2587 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
2590 echo "The target MDT-object is lost. The LFSCK should re-create the"
2591 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2592 echo "to verify some OST-object(s) during the first stage-scanning,"
2593 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2594 echo "should not be affected."
2597 check_mount_and_prep
2598 $LFS mkdir -i 0 $DIR/$tdir/a1
2599 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2600 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2601 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2602 $LFS mkdir -i 0 $DIR/$tdir/a2
2603 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
2604 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2605 $LFS getstripe $DIR/$tdir/a1/f1
2606 $LFS getstripe $DIR/$tdir/a2/f2
2608 if [ $MDSCOUNT -ge 2 ]; then
2609 $LFS mkdir -i 1 $DIR/$tdir/a3
2610 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a3
2611 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2612 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2613 $LFS mkdir -i 1 $DIR/$tdir/a4
2614 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
2615 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2616 $LFS getstripe $DIR/$tdir/a3/f3
2617 $LFS getstripe $DIR/$tdir/a4/f4
2620 cancel_lru_locks osc
2622 echo "Inject failure, to simulate the case of missing the MDT-object"
2623 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2624 do_facet mds1 $LCTL set_param fail_loc=0x1616
2625 rm -f $DIR/$tdir/a1/f1
2626 rm -f $DIR/$tdir/a2/f2
2628 if [ $MDSCOUNT -ge 2 ]; then
2629 do_facet mds2 $LCTL set_param fail_loc=0x1616
2630 rm -f $DIR/$tdir/a3/f3
2631 rm -f $DIR/$tdir/a4/f4
# Clear the injected failure on every MDS that had it set.
2637 do_facet mds1 $LCTL set_param fail_loc=0
2638 if [ $MDSCOUNT -ge 2 ]; then
2639 do_facet mds2 $LCTL set_param fail_loc=0
2642 cancel_lru_locks mdc
2643 cancel_lru_locks osc
2645 echo "Inject failure, to simulate the OST0 fail to handle"
2646 echo "MDT0 LFSCK request during the first-stage scanning."
2647 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
2648 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
2650 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2651 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2653 for k in $(seq $MDSCOUNT); do
2654 # The LFSCK status query interval is 30 seconds. For the case
2655 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2656 # time to guarantee the status sync up.
2657 wait_update_facet mds${k} "$LCTL get_param -n \
2658 mdd.$(facet_svc mds${k}).lfsck_layout |
2659 awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
2660 error "(2) MDS${k} is not the expected 'partial'"
# ost1 is expected to end up "partial", ost2 "completed".
2663 wait_update_facet ost1 "$LCTL get_param -n \
2664 obdfilter.$(facet_svc ost1).lfsck_layout |
2665 awk '/^status/ { print \\\$2 }'" "partial" $LTIME || {
2666 error "(3) OST1 is not the expected 'partial'"
2669 wait_update_facet ost2 "$LCTL get_param -n \
2670 obdfilter.$(facet_svc ost2).lfsck_layout |
2671 awk '/^status/ { print \\\$2 }'" "completed" $LTIME || {
2672 error "(4) OST2 is not the expected 'completed'"
2675 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
2677 local repaired=$(do_facet mds1 $LCTL get_param -n \
2678 mdd.$(facet_svc mds1).lfsck_layout |
2679 awk '/^repaired_orphan/ { print $2 }')
2680 [ $repaired -eq 1 ] ||
2681 error "(5) Expect 1 fixed on mds{1}, but got: $repaired"
2683 if [ $MDSCOUNT -ge 2 ]; then
2684 repaired=$(do_facet mds2 $LCTL get_param -n \
2685 mdd.$(facet_svc mds2).lfsck_layout |
2686 awk '/^repaired_orphan/ { print $2 }')
2687 [ $repaired -eq 1 ] ||
2688 error "(6) Expect 1 fixed on mds{2}, but got: $repaired"
2691 echo "Trigger layout LFSCK on all devices again to cleanup"
2692 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
2694 for k in $(seq $MDSCOUNT); do
2695 # The LFSCK status query interval is 30 seconds. For the case
2696 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2697 # time to guarantee the status sync up.
2698 wait_update_facet mds${k} "$LCTL get_param -n \
2699 mdd.$(facet_svc mds${k}).lfsck_layout |
2700 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2701 error "(8) MDS${k} is not the expected 'completed'"
2704 for k in $(seq $OSTCOUNT); do
# Declared local: this is the first use of cur_status in this test, and a
# bare assignment would leak the value into the global scope.
2705 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2706 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2707 awk '/^status/ { print $2 }')
2708 [ "$cur_status" == "completed" ] ||
2709 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
# After the second run each MDS is expected to report 2 repaired orphans.
2713 local repaired=$(do_facet mds1 $LCTL get_param -n \
2714 mdd.$(facet_svc mds1).lfsck_layout |
2715 awk '/^repaired_orphan/ { print $2 }')
2716 [ $repaired -eq 2 ] ||
2717 error "(10) Expect 2 fixed on mds{1}, but got: $repaired"
2719 if [ $MDSCOUNT -ge 2 ]; then
2720 repaired=$(do_facet mds2 $LCTL get_param -n \
2721 mdd.$(facet_svc mds2).lfsck_layout |
2722 awk '/^repaired_orphan/ { print $2 }')
2723 [ $repaired -eq 2 ] ||
2724 error "(11) Expect 2 fixed on mds{2}, but got: $repaired"
2727 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# Test 18g body (registered by the run_test call that ends this block).
# Flow: create a1/f1 striped over all OSTs ("-c -1"), remove it while
# fail_loc 0x162e (OBD_FAIL_LFSCK_LOST_MDTOBJ2) is set on mds1, run layout
# LFSCK with "-r -o", expect repaired_orphan == $OSTCOUNT on mds1, then move
# the "<fid>-R-0" entry from .lustre/lost+found/MDT0000 back to a1/f1.
2731 echo "The target MDT-object is lost, but related OI mapping is there"
2732 echo "The LFSCK should recreate the lost MDT-object without affected"
2733 echo "by the stale OI mapping."
2736 check_mount_and_prep
2737 $LFS mkdir -i 0 $DIR/$tdir/a1
2738 $LFS setstripe -c -1 -i 0 -S 1M $DIR/$tdir/a1
2739 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=$OSTCOUNT
# Saved before the removal; used to build the lost+found entry name below.
2740 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2742 $LFS getstripe $DIR/$tdir/a1/f1
2743 cancel_lru_locks osc
2745 echo "Inject failure to simulate lost MDT-object but keep OI mapping"
2746 #define OBD_FAIL_LFSCK_LOST_MDTOBJ2 0x162e
2747 do_facet mds1 $LCTL set_param fail_loc=0x162e
2748 rm -f $DIR/$tdir/a1/f1
2750 do_facet mds1 $LCTL set_param fail_loc=0
2751 cancel_lru_locks mdc
2752 cancel_lru_locks osc
2754 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2755 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2757 for k in $(seq $MDSCOUNT); do
2758 # The LFSCK status query interval is 30 seconds. For the case
2759 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2760 # time to guarantee the status sync up.
2761 wait_update_facet mds${k} "$LCTL get_param -n \
2762 mdd.$(facet_svc mds${k}).lfsck_layout |
2763 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2764 error "(2) MDS${k} is not the expected 'completed'"
# Every OST must also report the "completed" status.
2767 for k in $(seq $OSTCOUNT); do
2768 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2769 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2770 awk '/^status/ { print $2 }')
2771 [ "$cur_status" == "completed" ] ||
2772 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# One repaired orphan is expected per OST.
2775 local repaired=$(do_facet mds1 $LCTL get_param -n \
2776 mdd.$(facet_svc mds1).lfsck_layout |
2777 awk '/^repaired_orphan/ { print $2 }')
2778 [ $repaired -eq $OSTCOUNT ] ||
2779 error "(4) Expect $OSTCOUNT fixed, but got: $repaired"
2781 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2782 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2783 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2785 $LFS path2fid $DIR/$tdir/a1/f1
2786 $LFS getstripe $DIR/$tdir/a1/f1
2788 run_test 18g "Find out orphan OST-object and repair it (7)"
# Test 18h body (registered by the run_test call that ends this block).
# Flow: create the PFL file f0, fill it and copy it to "guard", chown f0
# while fail_loc 0x162f (OBD_FAIL_LFSCK_BAD_PFL_RANGE) is set on $SINGLEMDS,
# check that a write to f0 now fails, run layout LFSCK with "-r -o", expect
# repaired_orphan == 2, then verify f0 still equals "guard" and is writable.
2792 echo "The PFL extent crashed. During the first cycle LFSCK scanning,"
2793 echo "the layout LFSCK will keep the bad PFL file(s) there without"
2794 echo "scanning its OST-object(s). Then in the second stage scanning,"
2795 echo "the OST will return related OST-object(s) to the MDT as orphan."
2796 echo "And then the LFSCK on the MDT can rebuild the PFL extent with"
2797 echo "the 'orphan(s)' stripe information."
2800 check_mount_and_prep
2802 $LFS setstripe -E 2M -c 1 -E -1 $DIR/$tdir/f0 ||
2803 error "(0) Fail to create PFL $DIR/$tdir/f0"
2805 cat $LUSTRE/tests/test-framework.sh > $DIR/$tdir/f0 ||
2806 error "(1.1) Fail to write $DIR/$tdir/f0"
2808 dd if=$LUSTRE/tests/test-framework.sh of=$DIR/$tdir/f0 bs=1M seek=2 ||
2809 error "(1.2) Fail to write $DIR/$tdir/f0"
# Reference copy used for the data comparison after the repair.
2811 cp $DIR/$tdir/f0 $DIR/$tdir/guard
2813 echo "Inject failure stub to simulate bad PFL extent range"
2814 #define OBD_FAIL_LFSCK_BAD_PFL_RANGE 0x162f
2815 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162f
2817 chown 1.1 $DIR/$tdir/f0
2819 cancel_lru_locks mdc
2820 cancel_lru_locks osc
2821 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# The write is expected to fail here, so success is the error case.
2823 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 &&
2824 error "(2) Write to bad PFL file should fail"
2826 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
2827 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
2829 for k in $(seq $MDSCOUNT); do
2830 # The LFSCK status query interval is 30 seconds. For the case
2831 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2832 # time to guarantee the status sync up.
2833 wait_update_facet mds${k} "$LCTL get_param -n \
2834 mdd.$(facet_svc mds${k}).lfsck_layout |
2835 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2836 error "(4.1) MDS${k} is not the expected 'completed'"
2839 for k in $(seq $OSTCOUNT); do
# Declared local so the status does not leak into the global scope.
2840 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2841 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2842 awk '/^status/ { print $2 }')
2843 [ "$cur_status" == "completed" ] ||
2844 error "(4.2) OST${k} Expect 'completed', but got '$cur_status'"
2848 local repaired=$($SHOW_LAYOUT |
2849 awk '/^repaired_orphan/ { print $2 }')
2850 [ $repaired -eq 2 ] ||
2851 error "(5) Fail to repair crashed PFL range: $repaired"
2853 echo "Data in $DIR/$tdir/f0 should not be broken"
2854 diff $DIR/$tdir/f0 $DIR/$tdir/guard ||
2855 error "(6) Data in $DIR/$tdir/f0 is broken"
2857 echo "Write should succeed after LFSCK repairing the bad PFL range"
2858 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 ||
2859 error "(7) Write should succeed after LFSCK"
2861 run_test 18h "LFSCK can repair crashed PFL extent range"
# Remove the "cache" flag from the debug setting via lctl set_param
# (NOTE(review): "-cache" is presumably the flag-removal syntax - confirm);
# the command's stdout is discarded.
2863 $LCTL set_param debug=-cache > /dev/null
# Test 19a body (registered by the run_test call that ends this block).
# Flow: create a0, the PFL file a1 and a2 with lfsck_verify_pfid set to 0 on
# OST0000, set lfsck_verify_pfid to 1, set fail_loc 0x1619
# (OBD_FAIL_LFSCK_INVALID_PFID) on the client, and expect reads of a0 and a1
# to fail.
2866 check_mount_and_prep
2867 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2869 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2870 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
2872 echo "foo1" > $DIR/$tdir/a0
2873 $LFS setstripe -E 512K -S 512K -o 0 -E -1 -S 1M $DIR/$tdir/a1 ||
2874 error "(0) Fail to create PFL $DIR/$tdir/a1"
2875 echo "foo2" > $DIR/$tdir/a1
2876 echo "guard" > $DIR/$tdir/a2
2877 cancel_lru_locks osc
2879 echo "Inject failure, then client will offer wrong parent FID when read"
2880 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2881 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2883 #define OBD_FAIL_LFSCK_INVALID_PFID 0x1619
# No do_facet/do_nodes here: the fail_loc is set on the node running the test.
2884 $LCTL set_param fail_loc=0x1619
2886 echo "Read RPC with wrong parent FID should be denied"
# A successful read is the failure case for both files.
2887 cat $DIR/$tdir/a0 && error "(3.1) Read a0 should be denied!"
2888 cat $DIR/$tdir/a1 && error "(3.2) Read a1 should be denied!"
2889 $LCTL set_param fail_loc=0
2891 run_test 19a "OST-object inconsistency self detect"
# Test 19b body (registered by the run_test call that ends this block).
# Flow: create f0 and the PFL file f1 while fail_loc 0x1611
# (OBD_FAIL_LFSCK_UNMATCHED_PAIR1) is set on the OST nodes and
# lfsck_verify_pfid is 0, check that the "repaired" counter read from
# lfsck_verify_pfid is 0, then set lfsck_verify_pfid to 1, read both files
# and expect the counter to become 2.
2894 check_mount_and_prep
2895 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2897 echo "Inject failure stub to make the OST-object to back point to"
2898 echo "non-exist MDT-object"
2900 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2901 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
2903 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
2904 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
2905 echo "foo1" > $DIR/$tdir/f0
2906 $LFS setstripe -E 1M -S 1M -o 0 -E 4M -S 256K $DIR/$tdir/f1 ||
2907 error "(0) Fail to create PFL $DIR/$tdir/f1"
2908 echo "foo2" > $DIR/$tdir/f1
2909 cancel_lru_locks osc
2910 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
2912 do_facet ost1 $LCTL set_param -n \
2913 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
2914 echo "Nothing should be fixed since self detect and repair is disabled"
2915 local repaired=$(do_facet ost1 $LCTL get_param -n \
2916 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2917 awk '/^repaired/ { print $2 }')
2918 [ $repaired -eq 0 ] ||
2919 error "(1) Expected 0 repaired, but got $repaired"
2921 echo "Read RPC with right parent FID should be accepted,"
2922 echo "and cause parent FID on OST to be fixed"
2924 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2925 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2927 cat $DIR/$tdir/f0 || error "(2.1) Read f0 should not be denied!"
2928 cat $DIR/$tdir/f1 || error "(2.2) Read f1 should not be denied!"
# Two files (f0 and f1) were read, so two repairs are expected; the error
# message now reports the same number the check enforces.
2930 repaired=$(do_facet ost1 $LCTL get_param -n \
2931 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2932 awk '/^repaired/ { print $2 }')
2933 [ $repaired -eq 2 ] ||
2934 error "(3) Expected 2 repaired, but got $repaired"
2936 run_test 19b "OST-object inconsistency self repair"
2938 PATTERN_WITH_HOLE="40000001"
2939 PATTERN_WITHOUT_HOLE="1"
2942 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
2945 echo "The target MDT-object and some of its OST-object are lost."
2946 echo "The LFSCK should find out the left OST-objects and re-create"
2947 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
2948 echo "with the partial OST-objects (LOV EA hole)."
2950 echo "New client can access the file with LOV EA hole via normal"
2951 echo "system tools or commands without crash the system."
2953 echo "For old client, even though it cannot access the file with"
2954 echo "LOV EA hole, it should not cause the system crash."
2957 check_mount_and_prep
2958 $LFS mkdir -i 0 $DIR/$tdir/a1
2959 if [ $OSTCOUNT -gt 2 ]; then
2960 $LFS setstripe -c 3 -i 0 -S 1M $DIR/$tdir/a1
2963 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a1
2967 # 256 blocks on the stripe0.
2968 # 1 block on the stripe1 for 2 OSTs case.
2969 # 256 blocks on the stripe1 for other cases.
2970 # 1 block on the stripe2 if OSTs > 2
2971 dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
2972 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
2973 dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount
2975 local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
2976 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2977 local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)
2980 $LFS getstripe $DIR/$tdir/a1/f0
2982 $LFS getstripe $DIR/$tdir/a1/f1
2984 $LFS getstripe $DIR/$tdir/a1/f2
2986 if [ $OSTCOUNT -gt 2 ]; then
2987 dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
# Keep fid3 function-local, like fid0..fid2, instead of leaking a global.
local fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
2990 $LFS getstripe $DIR/$tdir/a1/f3
2993 cancel_lru_locks osc
2995 echo "Inject failure..."
2996 echo "To simulate f0 lost MDT-object"
2997 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2998 do_facet mds1 $LCTL set_param fail_loc=0x1616
2999 rm -f $DIR/$tdir/a1/f0
3001 echo "To simulate f1 lost MDT-object and OST-object0"
3002 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
3003 do_facet mds1 $LCTL set_param fail_loc=0x161a
3004 rm -f $DIR/$tdir/a1/f1
3006 echo "To simulate f2 lost MDT-object and OST-object1"
3007 do_facet mds1 $LCTL set_param fail_val=1
3008 rm -f $DIR/$tdir/a1/f2
3010 if [ $OSTCOUNT -gt 2 ]; then
3011 echo "To simulate f3 lost MDT-object and OST-object2"
3012 do_facet mds1 $LCTL set_param fail_val=2
3013 rm -f $DIR/$tdir/a1/f3
3016 umount_client $MOUNT
3019 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
3021 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3022 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
3024 for k in $(seq $MDSCOUNT); do
# The LFSCK status query interval is 30 seconds. For the case
3026 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3027 # time to guarantee the status sync up.
3028 wait_update_facet mds${k} "$LCTL get_param -n \
3029 mdd.$(facet_svc mds${k}).lfsck_layout |
3030 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3031 error "(2) MDS${k} is not the expected 'completed'"
3034 for k in $(seq $OSTCOUNT); do
3035 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3036 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3037 awk '/^status/ { print $2 }')
3038 [ "$cur_status" == "completed" ] ||
3039 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
3042 local repaired=$(do_facet mds1 $LCTL get_param -n \
3043 mdd.$(facet_svc mds1).lfsck_layout |
3044 awk '/^repaired_orphan/ { print $2 }')
3045 if [ $OSTCOUNT -gt 2 ]; then
3046 [ $repaired -eq 9 ] ||
3047 error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
3049 [ $repaired -eq 4 ] ||
3050 error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
3053 mount_client $MOUNT || error "(5.0) Fail to start client!"
3055 LOV_PATTERN_F_HOLE=0x40000000
3058 # ${fid0}-R-0 is the old f0
3060 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
3061 echo "Check $name, which is the old f0"
3063 $LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"
3065 local pattern=$($LFS getstripe -L $name)
3066 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3067 error "(5.2) NOT expect pattern flag hole, but got $pattern"
3069 local stripes=$($LFS getstripe -c $name)
3070 if [ $OSTCOUNT -gt 2 ]; then
3071 [ $stripes -eq 3 ] ||
3072 error "(5.3.1) expect the stripe count is 3, but got $stripes"
3074 [ $stripes -eq 2 ] ||
3075 error "(5.3.2) expect the stripe count is 2, but got $stripes"
3078 local size=$(stat $name | awk '/Size:/ { print $2 }')
3079 [ $size -eq $((4096 * $bcount)) ] ||
3080 error "(5.4) expect the size $((4096 * $bcount)), but got $size"
3082 cat $name > /dev/null || error "(5.5) cannot read $name"
3084 echo "dummy" >> $name || error "(5.6) cannot write $name"
3086 chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"
3088 touch $name || error "(5.8) cannot touch $name"
3090 rm -f $name || error "(5.9) cannot unlink $name"
3093 # ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
3095 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3096 if [ $OSTCOUNT -gt 2 ]; then
3097 echo "Check $name, it contains the old f1's stripe1 and stripe2"
3099 echo "Check $name, it contains the old f1's stripe1"
3102 $LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"
3104 pattern=$($LFS getstripe -L $name)
3105 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3106 error "(6.2) expect pattern flag hole, but got $pattern"
3108 stripes=$($LFS getstripe -c $name)
3109 if [ $OSTCOUNT -gt 2 ]; then
3110 [ $stripes -eq 3 ] ||
3111 error "(6.3.1) expect the stripe count is 3, but got $stripes"
3113 [ $stripes -eq 2 ] ||
3114 error "(6.3.2) expect the stripe count is 2, but got $stripes"
3117 size=$(stat $name | awk '/Size:/ { print $2 }')
3118 [ $size -eq $((4096 * $bcount)) ] ||
3119 error "(6.4) expect the size $((4096 * $bcount)), but got $size"
3121 cat $name > /dev/null && error "(6.5) normal read $name should fail"
3123 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3124 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3127 [ $failures -eq 256 ] ||
3128 error "(6.6) expect 256 IO failures, but get $failures"
3130 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3131 [ $size -eq $((4096 * $bcount)) ] ||
3132 error "(6.7) expect the size $((4096 * $bcount)), but got $size"
3134 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
3135 error "(6.8) write to the LOV EA hole should fail"
3137 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
3138 error "(6.9) write to normal stripe should NOT fail"
3140 echo "foo" >> $name && error "(6.10) append write $name should fail"
3142 chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"
3144 touch $name || error "(6.12) cannot touch $name"
3146 rm -f $name || error "(6.13) cannot unlink $name"
3149 # ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
3151 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
3152 if [ $OSTCOUNT -gt 2 ]; then
3153 echo "Check $name, it contains the old f2's stripe0 and stripe2"
3155 echo "Check $name, it contains the old f2's stripe0"
3158 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
3160 pattern=$($LFS getstripe -L $name)
3161 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3162 error "(7.2) expect pattern flag hole, but got $pattern"
3164 stripes=$($LFS getstripe -c $name)
3165 size=$(stat $name | awk '/Size:/ { print $2 }')
3166 if [ $OSTCOUNT -gt 2 ]; then
3167 [ $stripes -eq 3 ] ||
3168 error "(7.3.1) expect the stripe count is 3, but got $stripes"
3170 [ $size -eq $((4096 * $bcount)) ] ||
3171 error "(7.4.1) expect size $((4096 * $bcount)), but got $size"
3173 cat $name > /dev/null &&
3174 error "(7.5.1) normal read $name should fail"
3176 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3177 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3179 [ $failures -eq 256 ] ||
3180 error "(7.6) expect 256 IO failures, but get $failures"
3182 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3183 [ $size -eq $((4096 * $bcount)) ] ||
3184 error "(7.7) expect the size $((4096 * $bcount)), but got $size"
3186 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3187 seek=300 && error "(7.8.0) write to the LOV EA hole should fail"
3189 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
3190 error "(7.8.1) write to normal stripe should NOT fail"
3192 echo "foo" >> $name &&
3193 error "(7.8.3) append write $name should fail"
3195 chown $RUNAS_ID:$RUNAS_GID $name ||
3196 error "(7.9.1) cannot chown on $name"
3198 touch $name || error "(7.10.1) cannot touch $name"
3200 [ $stripes -eq 2 ] ||
3201 error "(7.3.2) expect the stripe count is 2, but got $stripes"
3204 [ $size -eq $((4096 * (256 + 0))) ] ||
3205 error "(7.4.2) expect the size $((4096 * 256)), but got $size"
3207 cat $name > /dev/null &&
3208 error "(7.5.2) normal read $name should fail"
3210 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3211 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3212 [ $failures -eq 256 ] ||
3213 error "(7.6.2) expect 256 IO failures, but get $failures"
3216 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3217 [ $size -eq $((4096 * $bcount)) ] ||
3218 error "(7.7.2) expect the size $((4096 * $bcount)), got $size"
3220 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3221 seek=256 && error "(7.8.2) write to the LOV EA hole should fail"
3223 chown $RUNAS_ID:$RUNAS_GID $name ||
3224 error "(7.9.2) cannot chown on $name"
3226 touch $name || error "(7.10.2) cannot touch $name"
3229 rm -f $name || error "(7.11) cannot unlink $name"
3231 [ $OSTCOUNT -le 2 ] && return
3234 # ${fid3}-R-0 should contains the old f3's stripe0 and stripe1
3236 name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
3237 echo "Check $name, which contains the old f3's stripe0 and stripe1"
3239 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
3241 pattern=$($LFS getstripe -L $name)
3242 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3243 error "(8.2) expect pattern flag hole, but got $pattern"
3245 stripes=$($LFS getstripe -c $name)
3246 [ $stripes -eq 3 ] ||
3247 error "(8.3) expect the stripe count is 3, but got $stripes"
3249 size=$(stat $name | awk '/Size:/ { print $2 }')
3251 [ $size -eq $((4096 * (256 + 256 + 0))) ] ||
3252 error "(8.4) expect the size $((4096 * 512)), but got $size"
3254 cat $name > /dev/null &&
3255 error "(8.5) normal read $name should fail"
3257 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3258 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3260 [ $failures -eq 256 ] ||
3261 error "(8.6) expect 256 IO failures, but get $failures"
3264 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3265 [ $size -eq $((4096 * $bcount)) ] ||
3266 error "(8.7) expect the size $((4096 * $bcount)), but got $size"
3268 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3269 seek=512 && error "(8.8) write to the LOV EA hole should fail"
3271 chown $RUNAS_ID:$RUNAS_GID $name ||
3272 error "(8.9) cannot chown on $name"
3274 touch $name || error "(8.10) cannot touch $name"
3276 rm -f $name || error "(8.11) cannot unlink $name"
3278 run_test 20a "Handle the orphan with dummy LOV EA slot properly"
3281 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
# Banner describing the scenario; spelling of "directory" corrected.
echo "The target MDT-object and some of its OST-object are lost."
echo "The LFSCK should find out the left OST-objects and re-create"
echo "the MDT-object under the directory .lustre/lost+found/MDTxxxx/"
echo "with the partial OST-objects (LOV EA hole)."
3289 echo "New client can access the file with LOV EA hole via normal"
3290 echo "system tools or commands without crash the system - PFL case."
3293 check_mount_and_prep
3295 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f0 ||
3296 error "(0) Fail to create PFL file $DIR/$tdir/f0"
3297 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f1 ||
3298 error "(1) Fail to create PFL file $DIR/$tdir/f1"
3299 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f2 ||
3300 error "(2) Fail to create PFL file $DIR/$tdir/f2"
3302 local bcount=$((256 * 3 + 1))
3304 dd if=/dev/zero of=$DIR/$tdir/f0 bs=4096 count=$bcount
3305 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4096 count=$bcount
3306 dd if=/dev/zero of=$DIR/$tdir/f2 bs=4096 count=$bcount
3308 local fid0=$($LFS path2fid $DIR/$tdir/f0)
3309 local fid1=$($LFS path2fid $DIR/$tdir/f1)
3310 local fid2=$($LFS path2fid $DIR/$tdir/f2)
3313 $LFS getstripe $DIR/$tdir/f0
3315 $LFS getstripe $DIR/$tdir/f1
3317 $LFS getstripe $DIR/$tdir/f2
3319 cancel_lru_locks mdc
3320 cancel_lru_locks osc
3322 echo "Inject failure..."
3323 echo "To simulate f0 lost MDT-object"
3324 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
3325 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1616
3328 echo "To simulate the case of f1 lost MDT-object and "
3329 echo "the first OST-object in each PFL component"
3330 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
3331 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161a
3334 echo "To simulate the case of f2 lost MDT-object and "
3335 echo "the second OST-object in each PFL component"
3336 do_facet $SINGLEMDS $LCTL set_param fail_val=1
3341 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
3343 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3344 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
3346 for k in $(seq $MDSCOUNT); do
# The LFSCK status query interval is 30 seconds. For the case
3348 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
3349 # time to guarantee the status sync up.
3350 wait_update_facet mds${k} "$LCTL get_param -n \
3351 mdd.$(facet_svc mds${k}).lfsck_layout |
3352 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3353 error "(4) MDS${k} is not the expected 'completed'"
3356 for k in $(seq $OSTCOUNT); do
3357 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3358 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3359 awk '/^status/ { print $2 }')
3360 [ "$cur_status" == "completed" ] ||
3361 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
3364 local repaired=$(do_facet mds1 $LCTL get_param -n \
3365 mdd.$(facet_svc mds1).lfsck_layout |
3366 awk '/^repaired_orphan/ { print $2 }')
3367 [ $repaired -eq 8 ] ||
3368 error "(6) Expect 8 fixed on mds1, but got: $repaired"
3371 # ${fid0}-R-0 is the old f0
3373 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
3374 echo "Check $name, which is the old f0"
3376 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
3378 local pattern=$($LFS getstripe -L -I1 $name)
3379 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3380 error "(7.2.1) NOT expect pattern flag hole, but got $pattern"
3382 pattern=$($LFS getstripe -L -I2 $name)
3383 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3384 error "(7.2.2) NOT expect pattern flag hole, but got $pattern"
3386 local stripes=$($LFS getstripe -c -I1 $name)
3387 [ $stripes -eq 2 ] ||
3388 error "(7.3.1) expect 2 stripes, but got $stripes"
3390 stripes=$($LFS getstripe -c -I2 $name)
3391 [ $stripes -eq 2 ] ||
3392 error "(7.3.2) expect 2 stripes, but got $stripes"
3394 local e_start=$($LFS getstripe -I1 $name |
3395 awk '/lcme_extent.e_start:/ { print $2 }')
3396 [ $e_start -eq 0 ] ||
3397 error "(7.4.1) expect the COMP1 start at 0, got $e_start"
3399 local e_end=$($LFS getstripe -I1 $name |
3400 awk '/lcme_extent.e_end:/ { print $2 }')
3401 [ $e_end -eq 2097152 ] ||
3402 error "(7.4.2) expect the COMP1 end at 2097152, got $e_end"
3404 e_start=$($LFS getstripe -I2 $name |
3405 awk '/lcme_extent.e_start:/ { print $2 }')
3406 [ $e_start -eq 2097152 ] ||
3407 error "(7.5.1) expect the COMP2 start at 2097152, got $e_start"
3409 e_end=$($LFS getstripe -I2 $name |
3410 awk '/lcme_extent.e_end:/ { print $2 }')
3411 [ "$e_end" = "EOF" ] ||
3412 error "(7.5.2) expect the COMP2 end at (EOF), got $e_end"
3414 local size=$(stat $name | awk '/Size:/ { print $2 }')
3415 [ $size -eq $((4096 * $bcount)) ] ||
3416 error "(7.6) expect the size $((4096 * $bcount)), but got $size"
3418 cat $name > /dev/null || error "(7.7) cannot read $name"
3420 echo "dummy" >> $name || error "(7.8) cannot write $name"
3422 chown $RUNAS_ID:$RUNAS_GID $name || error "(7.9) cannot chown on $name"
3424 touch $name || error "(7.10) cannot touch $name"
3426 rm -f $name || error "(7.11) cannot unlink $name"
3429 # ${fid1}-R-0 contains the old f1's second stripe in each COMP
3431 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3432 echo "Check $name, it contains f1's second OST-object in each COMP"
3434 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
3436 pattern=$($LFS getstripe -L -I1 $name)
3437 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3438 error "(8.2.1) expect pattern flag hole, but got $pattern"
3440 pattern=$($LFS getstripe -L -I2 $name)
3441 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3442 error "(8.2.2) expect pattern flag hole, but got $pattern"
# Stripe count of component 1 (-I1). Labelled (8.3.1) so a failure here can be
# told apart from the component 2 check, which uses (8.3.2).
# $name is quoted because the FID-based file name contains '[' and ']'.
stripes=$($LFS getstripe -c -I1 "$name")
[ "$stripes" -eq 2 ] ||
	error "(8.3.1) expect 2 stripes, but got $stripes"
3448 stripes=$($LFS getstripe -c -I2 $name)
3449 [ $stripes -eq 2 ] ||
3450 error "(8.3.2) expect 2 stripes, but got $stripes"
3452 e_start=$($LFS getstripe -I1 $name |
3453 awk '/lcme_extent.e_start:/ { print $2 }')
3454 [ $e_start -eq 0 ] ||
3455 error "(8.4.1) expect the COMP1 start at 0, got $e_start"
3457 e_end=$($LFS getstripe -I1 $name |
3458 awk '/lcme_extent.e_end:/ { print $2 }')
3459 [ $e_end -eq 2097152 ] ||
3460 error "(8.4.2) expect the COMP1 end at 2097152, got $e_end"
3462 e_start=$($LFS getstripe -I2 $name |
3463 awk '/lcme_extent.e_start:/ { print $2 }')
3464 [ $e_start -eq 2097152 ] ||
3465 error "(8.5.1) expect the COMP2 start at 2097152, got $e_start"
3467 e_end=$($LFS getstripe -I2 $name |
3468 awk '/lcme_extent.e_end:/ { print $2 }')
3469 [ "$e_end" = "EOF" ] ||
3470 error "(8.5.2) expect the COMP2 end at (EOF), got $e_end"
3472 size=$(stat $name | awk '/Size:/ { print $2 }')
3473 [ $size -eq $((4096 * $bcount)) ] ||
3474 error "(8.6) expect the size $((4096 * $bcount)), but got $size"
3476 cat $name > /dev/null && error "(8.7) normal read $name should fail"
3478 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3479 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3481 # The first stripe in each COMP was lost
3482 [ $failures -eq 512 ] ||
3483 error "(8.8) expect 512 IO failures, but get $failures"
3485 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3486 [ $size -eq $((4096 * $bcount)) ] ||
3487 error "(8.9) expect the size $((4096 * $bcount)), but got $size"
3489 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
3490 error "(8.10) write to the LOV EA hole should fail"
3492 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
3493 error "(8.11) write to normal stripe should NOT fail"
3495 echo "foo" >> $name && error "(8.12) append write $name should fail"
3497 chown $RUNAS_ID:$RUNAS_GID $name || error "(8.13) cannot chown on $name"
3499 touch $name || error "(8.14) cannot touch $name"
3501 rm -f $name || error "(8.15) cannot unlink $name"
3504 # ${fid2}-R-0 contains the old f2's first stripe in each COMP
3506 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
3507 echo "Check $name, it contains f2's first stripe in each COMP"
3509 $LFS getstripe -v $name || error "(9.1) cannot getstripe on $name"
3511 pattern=$($LFS getstripe -L -I1 $name)
3512 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3513 error "(9.2.1) expect pattern flag hole, but got $pattern"
3515 pattern=$($LFS getstripe -L -I2 $name)
3516 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3517 error "(9.2.2) expect pattern flag hole, but got $pattern"
# Stripe count of component 1 (-I1). Labelled (9.3.1) so a failure here can be
# told apart from the component 2 check, which uses (9.3.2).
# $name is quoted because the FID-based file name contains '[' and ']'.
stripes=$($LFS getstripe -c -I1 "$name")
[ "$stripes" -eq 2 ] ||
	error "(9.3.1) expect 2 stripes, but got $stripes"
3523 stripes=$($LFS getstripe -c -I2 $name)
3524 [ $stripes -eq 2 ] ||
3525 error "(9.3.2) expect 2 stripes, but got $stripes"
3527 e_start=$($LFS getstripe -I1 $name |
3528 awk '/lcme_extent.e_start:/ { print $2 }')
3529 [ $e_start -eq 0 ] ||
3530 error "(9.4.1) expect the COMP1 start at 0, got $e_start"
3532 e_end=$($LFS getstripe -I1 $name |
3533 awk '/lcme_extent.e_end:/ { print $2 }')
3534 [ $e_end -eq 2097152 ] ||
3535 error "(9.4.2) expect the COMP1 end at 2097152, got $e_end"
3537 e_start=$($LFS getstripe -I2 $name |
3538 awk '/lcme_extent.e_start:/ { print $2 }')
3539 [ $e_start -eq 2097152 ] ||
3540 error "(9.5.1) expect the COMP2 start at 2097152, got $e_start"
3542 e_end=$($LFS getstripe -I2 $name |
3543 awk '/lcme_extent.e_end:/ { print $2 }')
3544 [ "$e_end" = "EOF" ] ||
3545 error "(9.5.2) expect the COMP2 end at (EOF), got $e_end"
3547 size=$(stat $name | awk '/Size:/ { print $2 }')
3548 # The second stripe in COMP was lost, so we do not know there
3549 # have ever been some data before. 'stat' will regard it as
3550 # no data on the lost stripe.
3552 [ $size -eq $((4096 * $bcount)) ] ||
3553 error "(9.6) expect size $((4096 * $bcount)), but got $size"
3555 cat $name > /dev/null &&
3556 error "(9.7) normal read $name should fail"
# Count the blocks dd could not read (it continues past errors via noerror).
# The condition requires 512 failures, so the message states 512 as well.
# $name is quoted because the FID-based file name contains '[' and ']'.
failures=$(dd if="$name" of="$DIR/$tdir/dump" conv=sync,noerror \
	bs=4096 2>&1 | grep "Input/output error" | wc -l)
[ "$failures" -eq 512 ] ||
	error "(9.8) expect 512 IO failures, but get $failures"
3563 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3564 # The second stripe in COMP was lost, so we do not know there
3565 # have ever been some data before. Since 'dd' skip failure,
3566 # it will regard the lost stripe contains data.
3568 [ $size -eq $((4096 * $bcount)) ] ||
3569 error "(9.9) expect the size $((4096 * $bcount)), but got $size"
3571 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3572 seek=300 && error "(9.10) write to the LOV EA hole should fail"
3574 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
3575 error "(9.11) write to normal stripe should NOT fail"
3577 echo "foo" >> $name &&
3578 error "(9.12) append write $name should fail"
3580 chown $RUNAS_ID:$RUNAS_GID $name ||
3581 error "(9.13) cannot chown on $name"
3583 touch $name || error "(9.14) cannot touch $name"
# Remove the fid2 orphan; label belongs to the (9.x) series of this section.
# $name is quoted because the FID-based file name contains '[' and ']'.
rm -f "$name" || error "(9.15) cannot unlink $name"
3587 run_test 20b "Handle the orphan with dummy LOV EA slot properly - PFL case"
3590 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.5.59) ]] &&
3591 skip "ignore the test if MDS is older than 2.5.59" && return
3593 check_mount_and_prep
3594 createmany -o $DIR/$tdir/f 100 || error "(0) Fail to create 100 files"
3596 echo "Start all LFSCK components by default (-s 1)"
3597 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -s 1 -r ||
3598 error "Fail to start LFSCK"
3600 echo "namespace LFSCK should be in 'scanning-phase1' status"
3601 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
3602 [ "$STATUS" == "scanning-phase1" ] ||
3603 error "Expect namespace 'scanning-phase1', but got '$STATUS'"
3605 echo "layout LFSCK should be in 'scanning-phase1' status"
3606 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
3607 [ "$STATUS" == "scanning-phase1" ] ||
3608 error "Expect layout 'scanning-phase1', but got '$STATUS'"
3610 echo "Stop all LFSCK components by default"
3611 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 ||
3612 error "Fail to stop LFSCK"
3614 run_test 21 "run all LFSCK components by default"
3617 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# Banner describing the scenario. The inner double quotes around .. are
# escaped so they are printed instead of terminating the string.
echo "The parent_A references the child directory via some name entry,"
echo "but the child directory back references another parent_B via its"
echo "\"..\" name entry. The parent_B does not exist. Then the namespace"
echo "LFSCK will repair the child directory's \"..\" name entry."
3626 check_mount_and_prep
3628 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
3629 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
3631 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
3632 echo "The dummy's dotdot name entry references the guard."
3633 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
3634 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
3635 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
3636 error "(3) Fail to mkdir on MDT0"
3637 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3639 rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"
3641 echo "Trigger namespace LFSCK to repair unmatched pairs"
3642 $START_NAMESPACE -A -r ||
3643 error "(5) Fail to start LFSCK for namespace"
3645 wait_all_targets_blocked namespace completed 6
3647 local repaired=$($SHOW_NAMESPACE |
3648 awk '/^unmatched_pairs_repaired/ { print $2 }')
3649 [ $repaired -eq 1 ] ||
3650 error "(7) Fail to repair unmatched pairs: $repaired"
3652 echo "'ls' should success after namespace LFSCK repairing"
3653 ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
3654 error "(8) ls should success."
3656 run_test 22a "LFSCK can repair unmatched pairs (1)"
3659 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
# Banner describing the scenario. The inner double quotes around .. are
# escaped so they are printed instead of terminating the string.
echo "The parent_A references the child directory via the name entry_B,"
echo "but the child directory back references another parent_C via its"
echo "\"..\" name entry. The parent_C exists, but there is no the name"
echo "entry_B under the parent_C. Then the namespace LFSCK will repair"
echo "the child directory's \"..\" name entry and its linkEA."
3669 check_mount_and_prep
3671 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
3672 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
3674 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
3675 echo "and bad linkEA. The dummy's dotdot name entry references the"
3676 echo "guard. The dummy's linkEA references n non-exist name entry."
3677 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
3678 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
3679 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
3680 error "(3) Fail to mkdir on MDT0"
3681 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3683 local dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
3684 echo "fid2path should NOT work on the dummy's FID $dummyfid"
3685 local dummyname=$($LFS fid2path $DIR $dummyfid)
3686 [ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
3687 error "(4) fid2path works unexpectedly."
3689 echo "Trigger namespace LFSCK to repair unmatched pairs"
3690 $START_NAMESPACE -A -r ||
3691 error "(5) Fail to start LFSCK for namespace"
3693 wait_all_targets_blocked namespace completed 6
3695 local repaired=$($SHOW_NAMESPACE |
3696 awk '/^unmatched_pairs_repaired/ { print $2 }')
3697 [ $repaired -eq 1 ] ||
3698 error "(7) Fail to repair unmatched pairs: $repaired"
3700 echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
3701 local dummyname=$($LFS fid2path $DIR $dummyfid)
3702 [ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
3703 error "(8) fid2path does not work"
3705 run_test 22b "LFSCK can repair unmatched pairs (2)"
3708 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3711 echo "The name entry is there, but the MDT-object for such name "
3712 echo "entry does not exist. The namespace LFSCK should find out "
3713 echo "and repair the inconsistency as required."
3716 check_mount_and_prep
3718 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3719 $LFS mkdir -i 1 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT1"
3721 echo "Inject failure stub on MDT1 to simulate dangling name entry"
3722 #define OBD_FAIL_LFSCK_DANGLING2 0x1620
3723 do_facet mds2 $LCTL set_param fail_loc=0x1620
3724 rmdir $DIR/$tdir/d0/d1 || error "(3) Fail to rmdir d1"
3725 do_facet mds2 $LCTL set_param fail_loc=0
3727 echo "'ls' should fail because of dangling name entry"
3728 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(4) ls should fail."
3730 echo "Trigger namespace LFSCK to find out dangling name entry"
3731 $START_NAMESPACE -A -r ||
3732 error "(5) Fail to start LFSCK for namespace"
3734 wait_all_targets_blocked namespace completed 6
3736 local repaired=$($SHOW_NAMESPACE |
3737 awk '/^dangling_repaired/ { print $2 }')
3738 [ $repaired -eq 1 ] ||
3739 error "(7) Fail to repair dangling name entry: $repaired"
3741 echo "'ls' should fail because not re-create MDT-object by default"
3742 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(8) ls should fail."
3744 echo "Trigger namespace LFSCK again to repair dangling name entry"
3745 $START_NAMESPACE -A -r -C ||
3746 error "(9) Fail to start LFSCK for namespace"
3748 wait_all_targets_blocked namespace completed 10
3750 repaired=$($SHOW_NAMESPACE |
3751 awk '/^dangling_repaired/ { print $2 }')
3752 [ $repaired -eq 1 ] ||
3753 error "(11) Fail to repair dangling name entry: $repaired"
3755 echo "'ls' should success after namespace LFSCK repairing"
3756 ls -ail $DIR/$tdir/d0/d1 > /dev/null || error "(12) ls should success."
3758 run_test 23a "LFSCK can repair dangling name entry (1)"
3762 echo "The objectA has multiple hard links, one of them corresponding"
3763 echo "to the name entry_B. But there is something wrong for the name"
3764 echo "entry_B and cause entry_B to references non-exist object_C."
3765 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3766 echo "as dangling, and re-create the lost object_C. When the LFSCK"
3767 echo "comes to the second-stage scanning, it will find that the"
3768 echo "former re-creating object_C is not proper, and will try to"
3769 echo "replace the object_C with the real object_A."
3772 check_mount_and_prep
3774 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3775 $LFS path2fid $DIR/$tdir/d0
3777 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
3779 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3780 $LFS path2fid $DIR/$tdir/d0/f0
3782 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
3783 $LFS path2fid $DIR/$tdir/d0/f1
3785 local SEQ0=$($LFS path2fid $DIR/$tdir/d0/f0 | awk -F':' '{print $1}')
3786 local SEQ1=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $1}')
3788 if [ "$SEQ0" != "$SEQ1" ]; then
3789 # To guarantee that the f0 and f1 are in the same FID seq
3790 rm -f $DIR/$tdir/d0/f0 ||
3791 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
3792 echo "dummy" > $DIR/$tdir/d0/f0 ||
3793 error "(3.2) Fail to touch on MDT0"
3794 $LFS path2fid $DIR/$tdir/d0/f0
3797 local OID=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $2}')
3798 OID=$(printf %d $OID)
3800 echo "Inject failure stub on MDT0 to simulate dangling name entry"
3801 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
3802 do_facet $SINGLEMDS $LCTL set_param fail_val=$OID fail_loc=0x1621
3803 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
3804 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3806 # If there is creation after the dangling injection, it may re-use
3807 # the just released local object (inode) that is referenced by the
3808 # dangling name entry. It will fail the dangling injection.
3809 # So before deleting the target object for the dangling name entry,
3810 # remove some other objects to avoid the target object being reused
3811 # by some potential creations. LU-7429
3812 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
3814 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
3816 echo "'ls' should fail because of dangling name entry"
3817 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3818 error "(6) ls should fail."
3820 echo "Trigger namespace LFSCK to find out dangling name entry"
3821 $START_NAMESPACE -r -C ||
3822 error "(7) Fail to start LFSCK for namespace"
3824 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3825 mdd.${MDT_DEV}.lfsck_namespace |
3826 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3828 error "(8) unexpected status"
3831 local repaired=$($SHOW_NAMESPACE |
3832 awk '/^dangling_repaired/ { print $2 }')
3833 [ $repaired -eq 1 ] ||
3834 error "(9) Fail to repair dangling name entry: $repaired"
3836 repaired=$($SHOW_NAMESPACE |
3837 awk '/^multiple_linked_repaired/ { print $2 }')
3838 [ $repaired -eq 1 ] ||
3839 error "(10) Fail to drop the former created object: $repaired"
3841 local data=$(cat $DIR/$tdir/d0/foo)
3842 [ "$data" == "dummy" ] ||
3843 error "(11) The $DIR/$tdir/d0/foo is not recovered: $data"
3845 run_test 23b "LFSCK can repair dangling name entry (2)"
# test 23c (scenario is spelled out by the banner echoed below): LFSCK
# re-creates the object behind a dangling name entry, the test then modifies
# that re-created object while LFSCK is still running, and finally checks
# that LFSCK did not replace it with the original hard-linked object.
# NOTE(review): the function header and the closing keywords (fi / }) of
# this block are not visible in this listing -- confirm against the file.
3849 echo "The objectA has multiple hard links, one of them corresponding"
3850 echo "to the name entry_B. But there is something wrong for the name"
3851 echo "entry_B and cause entry_B to references non-exist object_C."
3852 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3853 echo "as dangling, and re-create the lost object_C. And then others"
3854 echo "modified the re-created object_C. When the LFSCK comes to the"
3855 echo "second-stage scanning, it will find that the former re-creating"
3856 echo "object_C maybe wrong and try to replace the object_C with the"
3857 echo "real object_A. But because object_C has been modified, so the"
3858 echo "LFSCK cannot replace it."
3861 start_full_debug_logging
3863 check_mount_and_prep
3865 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3866 $LFS path2fid $DIR/$tdir/d0
# Filler entries with prefix "t"; they are unlinked again right before f1 is
# removed (see the LU-7429 note further down).
3868 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
# f0 holds "dummy" and is the real hard-link target; f1 holds "dead" and is
# the object that gets deleted to leave the name entry dangling.
3870 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3871 $LFS path2fid $DIR/$tdir/d0/f0
3873 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
3874 $LFS path2fid $DIR/$tdir/d0/f1
# First ':'-separated field of each FID (the sequence part).
3876 local SEQ0=$($LFS path2fid $DIR/$tdir/d0/f0 | awk -F':' '{print $1}')
3877 local SEQ1=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $1}')
3879 if [ "$SEQ0" != "$SEQ1" ]; then
3880 # To guarantee that the f0 and f1 are in the same FID seq
3881 rm -f $DIR/$tdir/d0/f0 ||
3882 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
3883 echo "dummy" > $DIR/$tdir/d0/f0 ||
3884 error "(3.2) Fail to touch on MDT0"
3885 $LFS path2fid $DIR/$tdir/d0/f0
# Second field of f1's FID (the object id), converted to decimal by printf
# before it is handed to fail_val.
3888 local OID=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $2}')
3889 OID=$(printf %d $OID)
3891 echo "Inject failure stub on MDT0 to simulate dangling name entry"
3892 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
# The fail_loc is armed only around the hard link creation, then cleared.
3893 do_facet $SINGLEMDS $LCTL set_param fail_val=$OID fail_loc=0x1621
3894 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
3895 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3897 # If there is creation after the dangling injection, it may re-use
3898 # the just released local object (inode) that is referenced by the
3899 # dangling name entry. It will fail the dangling injection.
3900 # So before deleting the target object for the dangling name entry,
3901 # remove some other objects to avoid the target object being reused
3902 # by some potential creations. LU-7429
3903 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
3905 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
3907 echo "'ls' should fail because of dangling name entry"
3908 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3909 error "(6) ls should fail."
# Armed before LFSCK starts and cleared only after the client write below.
# Presumably this delays LFSCK so the write lands between its two stages --
# TODO confirm the exact effect of fail_val=10.
3911 #define OBD_FAIL_LFSCK_DELAY3 0x1602
3912 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
3914 echo "Trigger namespace LFSCK to find out dangling name entry"
3915 $START_NAMESPACE -r -C ||
3916 error "(7) Fail to start LFSCK for namespace"
# Wait until stat on the client reports Size 0 for foo, i.e. the name
# resolves again to an empty object.
3918 wait_update_facet client "stat $DIR/$tdir/d0/foo |
3919 awk '/Size/ { print \\\$2 }'" "0" $LTIME || {
3920 stat $DIR/$tdir/d0/foo
3922 error "(8) unexpected size"
# Modify the re-created object, then drop the cached osc locks.
3925 echo "data" >> $DIR/$tdir/d0/foo || error "(9) Fail to write"
3926 cancel_lru_locks osc
3928 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3929 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3930 mdd.${MDT_DEV}.lfsck_namespace |
3931 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3933 error "(10) unexpected status"
3936 stop_full_debug_logging
3938 local repaired=$($SHOW_NAMESPACE |
3939 awk '/^dangling_repaired/ { print $2 }')
3940 [ $repaired -eq 1 ] ||
3941 error "(11) Fail to repair dangling name entry: $repaired"
# foo must NOT read back as f0's content ("dummy"): the modified re-created
# object is expected to stay in place.
3943 local data=$(cat $DIR/$tdir/d0/foo)
3944 [ "$data" != "dummy" ] ||
3945 error "(12) The $DIR/$tdir/d0/foo should not be recovered"
3947 run_test 23c "LFSCK can repair dangling name entry (3)"
# test 24: two MDT-objects carry a linkEA entry for the same name, but the
# name entry references only one of them. The test expects namespace LFSCK
# to count one multiple_referenced repair and to place the unreferenced
# object under .lustre/lost+found/MDT0000/.
# NOTE(review): the function header and the closing keywords of this block
# are not visible in this listing -- confirm against the file.
3950 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
3953 echo "Two MDT-objects back reference the same name entry via their"
3954 echo "each own linkEA entry, but the name entry only references one"
3955 echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
3956 echo "for the MDT-object that is not recognized. If such MDT-object"
3957 echo "has no other linkEA entry after the removing, then the LFSCK"
3958 echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."
3961 check_mount_and_prep
3963 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3965 mkdir $DIR/$tdir/d0/guard || error "(1) Fail to mkdir guard"
3966 $LFS path2fid $DIR/$tdir/d0/guard
3968 mkdir $DIR/$tdir/d0/dummy || error "(2) Fail to mkdir dummy"
3969 $LFS path2fid $DIR/$tdir/d0/dummy
# pfid is the parent FID used below to build the expected orphan name.
# Only the non-ldiskfs condition is visible here; the second assignment is
# presumably the else branch -- TODO confirm.
3972 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
3973 pfid=$($LFS path2fid $DIR/$tdir/d0/guard)
3975 pfid=$($LFS path2fid $DIR/$tdir/d0/dummy)
3978 touch $DIR/$tdir/d0/guard/foo ||
3979 error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"
3981 echo "Inject failure stub on MDT0 to simulate the case that"
3982 echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
3983 echo "that references $DIR/$tdir/d0/guard/foo."
3984 echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
3985 echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
3986 echo "there with the same linkEA entry as another MDT-object"
3987 echo "$DIR/$tdir/d0/guard/foo has"
3989 #define OBD_FAIL_LFSCK_MUL_REF 0x1622
# The fail_loc stays armed across both the mkdir and the rmdir of dummy/foo;
# cfid records the FID of dummy/foo before its name entry is removed.
3990 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
3991 $LFS mkdir -i 0 $DIR/$tdir/d0/dummy/foo ||
3992 error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
3993 $LFS path2fid $DIR/$tdir/d0/dummy/foo
3994 local cfid=$($LFS path2fid $DIR/$tdir/d0/dummy/foo)
3995 rmdir $DIR/$tdir/d0/dummy/foo ||
3996 error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
3997 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3999 echo "stat $DIR/$tdir/d0/dummy/foo should fail"
4000 stat $DIR/$tdir/d0/dummy/foo > /dev/null 2>&1 &&
4001 error "(6) stat successfully unexpectedly"
4003 echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
4004 $START_NAMESPACE -A -r ||
4005 error "(7) Fail to start LFSCK for namespace"
# Helper defined elsewhere; presumably blocks until namespace LFSCK reports
# "completed" on all targets -- TODO confirm the meaning of the last argument.
4007 wait_all_targets_blocked namespace completed 8
4009 local repaired=$($SHOW_NAMESPACE |
4010 awk '/^multiple_referenced_repaired/ { print $2 }')
4011 [ $repaired -eq 1 ] ||
4012 error "(9) Fail to repair multiple referenced name entry: $repaired"
4014 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
4015 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4016 error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# Expected orphan name: <child FID>-<parent FID>-D-0.
4018 local cname="$cfid-$pfid-D-0"
4019 ls -ail $MOUNT/.lustre/lost+found/MDT0000/$cname ||
4020 error "(11) .lustre/lost+found/MDT0000/ should not be empty"
4022 run_test 24 "LFSCK can repair multiple-referenced name entry"
# test 25 (ldiskfs backends only): create a file while fail_loc 0x1623 is
# armed so its name entry records a wrong file type, then expect namespace
# LFSCK to report exactly one bad_file_type repair.
# NOTE(review): the function header and the closing keywords of this block
# are not visible in this listing -- confirm against the file.
4025 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
4026 skip "ldiskfs only test" && return
4029 echo "The file type in the name entry does not match the file type"
4030 echo "claimed by the referenced object. Then the LFSCK will update"
4031 echo "the file type in the name entry."
4034 check_mount_and_prep
4036 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4038 echo "Inject failure stub on MDT0 to simulate the case that"
4039 echo "the file type stored in the name entry is wrong."
4041 #define OBD_FAIL_LFSCK_BAD_TYPE 0x1623
# Armed only around the creation of foo, then cleared.
4042 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
4043 touch $DIR/$tdir/d0/foo || error "(2) Fail to touch $DIR/$tdir/d0/foo"
4044 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4046 echo "Trigger namespace LFSCK to repair bad file type in the name entry"
4047 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace"
# Poll the MDT's lfsck_namespace status field until it reads "completed".
4049 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4050 mdd.${MDT_DEV}.lfsck_namespace |
4051 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4053 error "(4) unexpected status"
4056 local repaired=$($SHOW_NAMESPACE |
4057 awk '/^bad_file_type_repaired/ { print $2 }')
4058 [ $repaired -eq 1 ] ||
4059 error "(5) Fail to repair bad file type in name entry: $repaired"
# Listing the directory must work after the repair.
4061 ls -ail $DIR/$tdir/d0 || error "(6) Fail to 'ls' the $DIR/$tdir/d0"
4063 run_test 25 "LFSCK can repair bad file type in the name entry"
# test 26a: remove foo's name entry while fail_loc 0x1624 is armed (the
# object and its linkEA are kept, per the banner below), then expect
# namespace LFSCK to re-insert the name entry so that foo is reachable again
# with an unchanged FID.
# NOTE(review): the function header and closing brace of this block are not
# visible in this listing -- confirm against the file.
4067 echo "The local name entry back referenced by the MDT-object is lost."
4068 echo "The namespace LFSCK will add the missing local name entry back"
4069 echo "to the normal namespace."
4072 check_mount_and_prep
4074 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4075 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
# FID recorded before the damage; compared against the FID after repair.
4076 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
# A second hard link ("dummy") is created before foo's entry is removed.
4078 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
4079 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4081 echo "Inject failure stub on MDT0 to simulate the case that"
4082 echo "foo's name entry will be removed, but the foo's object"
4083 echo "and its linkEA are kept in the system."
4085 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4086 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4087 rm -f $DIR/$tdir/d0/foo || error "(4) Fail to unlink $DIR/$tdir/d0/foo"
4088 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4090 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4091 error "(5) 'ls' should fail"
# NOTE(review): the message below says "remote" although this test's banner
# describes a local name entry.
4093 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4094 $START_NAMESPACE -r -A ||
4095 error "(6) Fail to start LFSCK for namespace"
4097 wait_all_targets_blocked namespace completed 7
4099 local repaired=$($SHOW_NAMESPACE |
4100 awk '/^lost_dirent_repaired/ { print $2 }')
4101 [ $repaired -eq 1 ] ||
4102 error "(8) Fail to repair lost dirent: $repaired"
4104 ls -ail $DIR/$tdir/d0/foo ||
4105 error "(9) Fail to 'ls' $DIR/$tdir/d0/foo"
4107 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
4108 [ "$foofid" == "$foofid2" ] ||
4109 error "(10) foo's FID changed: $foofid, $foofid2"
4111 run_test 26a "LFSCK can add the missing local name entry back to the namespace"
# test 26b (needs at least 2 MDTs): same flow as the local case, but the
# parent d0 is created with -i 1 and the child directory foo with -i 0.
# foo's name entry is removed while fail_loc 0x1624 is armed, and namespace
# LFSCK is expected to restore it with an unchanged FID.
# NOTE(review): the function header and closing brace of this block are not
# visible in this listing -- confirm against the file.
4114 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4117 echo "The remote name entry back referenced by the MDT-object is lost."
4118 echo "The namespace LFSCK will add the missing remote name entry back"
4119 echo "to the normal namespace."
4122 check_mount_and_prep
4124 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4125 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
# FID recorded before the damage; compared against the FID after repair.
4126 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
4128 echo "Inject failure stub on MDT0 to simulate the case that"
4129 echo "foo's name entry will be removed, but the foo's object"
4130 echo "and its linkEA are kept in the system."
4132 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4133 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4134 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
4135 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4137 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4138 error "(4) 'ls' should fail"
4140 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4141 $START_NAMESPACE -r -A ||
4142 error "(5) Fail to start LFSCK for namespace"
4144 wait_all_targets_blocked namespace completed 6
4146 local repaired=$($SHOW_NAMESPACE |
4147 awk '/^lost_dirent_repaired/ { print $2 }')
4148 [ $repaired -eq 1 ] ||
4149 error "(7) Fail to repair lost dirent: $repaired"
4151 ls -ail $DIR/$tdir/d0/foo ||
4152 error "(8) Fail to 'ls' $DIR/$tdir/d0/foo"
4154 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
4155 [ "$foofid" == "$foofid2" ] ||
4156 error "(9) foo's FID changed: $foofid, $foofid2"
4158 run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
# test 27a: remove both hard links of a file while fail_loc 0x1624 is armed
# (the object and its linkEA are kept, per the banner below), then remove the
# parent directory d0. Namespace LFSCK is expected to report one lost_dirent
# repair and to leave an entry matching "*-P-*" under
# .lustre/lost+found/MDT0000/.
# NOTE(review): the function header and closing brace of this block are not
# visible in this listing -- confirm against the file.
4162 echo "The local parent referenced by the MDT-object linkEA is lost."
4163 echo "The namespace LFSCK will re-create the lost parent as orphan."
4166 check_mount_and_prep
4168 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4169 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4170 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
4171 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4173 echo "Inject failure stub on MDT0 to simulate the case that"
4174 echo "foo's name entry will be removed, but the foo's object"
4175 echo "and its linkEA are kept in the system. And then remove"
4176 echo "another hard link and the parent directory."
4178 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# The fail_loc stays armed while both names (foo and dummy) are unlinked.
4179 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4180 rm -f $DIR/$tdir/d0/foo ||
4181 error "(4) Fail to unlink $DIR/$tdir/d0/foo"
4182 rm -f $DIR/$tdir/d0/dummy ||
4183 error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
4184 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# The parent itself is removed after the fail_loc has been cleared.
4186 rm -rf $DIR/$tdir/d0 || error "(5) Fail to unlink the dir d0"
4187 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(6) 'ls' should fail"
4189 echo "Trigger namespace LFSCK to repair the lost parent"
4190 $START_NAMESPACE -r -A ||
4191 error "(6) Fail to start LFSCK for namespace"
4193 wait_all_targets_blocked namespace completed 7
4195 local repaired=$($SHOW_NAMESPACE |
4196 awk '/^lost_dirent_repaired/ { print $2 }')
4197 [ $repaired -eq 1 ] ||
4198 error "(8) Fail to repair lost dirent: $repaired"
4200 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
4201 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4202 error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4204 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# The -name pattern is quoted so the shell hands it to find verbatim instead
# of expanding it against the current directory first; cname is kept local
# so it does not leak into later tests.
4206 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-P-*')
4207 [ -n "$cname" ] ||
4208 error "(10) .lustre/lost+found/MDT0000/ should not be empty"
4210 run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
# test 27b (needs at least 2 MDTs): d0 is created with -i 1 and its child
# foo with -i 0. foo's name entry is removed while fail_loc 0x1624 is armed,
# then d0 itself is removed. Namespace LFSCK is expected to report one
# lost_dirent repair and to leave an entry matching "*-P-*" under
# .lustre/lost+found/MDT0001/.
# NOTE(review): the function header and closing brace of this block are not
# visible in this listing -- confirm against the file.
4213 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4216 echo "The remote parent referenced by the MDT-object linkEA is lost."
4217 echo "The namespace LFSCK will re-create the lost parent as orphan."
4220 check_mount_and_prep
4222 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4223 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
4225 $LFS path2fid $DIR/$tdir/d0
4227 echo "Inject failure stub on MDT0 to simulate the case that"
4228 echo "foo's name entry will be removed, but the foo's object"
4229 echo "and its linkEA are kept in the system. And then remove"
4230 echo "the parent directory."
4232 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4233 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4234 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
4235 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# The parent is removed after the fail_loc has been cleared.
4237 rmdir $DIR/$tdir/d0 || error "(4) Fail to unlink the dir d0"
4238 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(5) 'ls' should fail"
4240 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4241 $START_NAMESPACE -r -A ||
4242 error "(6) Fail to start LFSCK for namespace"
4244 wait_all_targets_blocked namespace completed 7
4246 local repaired=$($SHOW_NAMESPACE |
4247 awk '/^lost_dirent_repaired/ { print $2 }')
4248 [ $repaired -eq 1 ] ||
4249 error "(8) Fail to repair lost dirent: $repaired"
4251 ls -ail $MOUNT/.lustre/lost+found/
4253 echo "There should be an orphan under .lustre/lost+found/MDT0001/"
4254 [ -d $MOUNT/.lustre/lost+found/MDT0001 ] ||
4255 error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"
4257 ls -ail $MOUNT/.lustre/lost+found/MDT0001/
# The -name pattern is quoted so the shell hands it to find verbatim instead
# of expanding it against the current directory first; cname is kept local
# so it does not leak into later tests.
4259 local cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-P-*')
4260 [ -n "$cname" ] ||
4261 error "(10) .lustre/lost+found/MDT0001/ should not be empty"
4263 run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
# test 28 (needs at least 2 MDTs): two cross-MDT directory pairs are built,
# one name entry is removed from each with fail_loc 0x1624 armed on both
# MDSes, and fail_loc 0x161c is then armed on mds2 before the first LFSCK
# run. The first run must end "partial" on mds1 (0 repairs) and "completed"
# on mds2 (1 repair); a second run without injection must show 1 repair on
# mds1 and 0 on mds2.
# NOTE(review): the function header, part of the banner and the closing
# keywords of this block are not visible in this listing -- confirm against
# the file.
4266 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4269 echo "The target name entry is lost. The LFSCK should insert the"
4270 echo "orphan MDT-object under .lustre/lost+found/MDTxxxx. But if"
4271 echo "the MDT (on which the orphan MDT-object resides) has ever"
4272 echo "failed to respond some name entry verification during the"
4273 echo "first stage-scanning, then the LFSCK should skip to handle"
4274 echo "orphan MDT-object on this MDT. But other MDTs should not"
4278 check_mount_and_prep
# d1 uses index 0 with children on index 1; d2 is the mirror image.
4279 $LFS mkdir -i 0 $DIR/$tdir/d1
4280 $LFS mkdir -i 1 $DIR/$tdir/d1/a1
4281 $LFS mkdir -i 1 $DIR/$tdir/d1/a2
4283 $LFS mkdir -i 1 $DIR/$tdir/d2
4284 $LFS mkdir -i 0 $DIR/$tdir/d2/a1
4285 $LFS mkdir -i 0 $DIR/$tdir/d2/a2
4287 echo "Inject failure stub on MDT0 to simulate the case that"
4288 echo "d1/a1's name entry will be removed, but the d1/a1's object"
4289 echo "and its linkEA are kept in the system. And the case that"
4290 echo "d2/a2's name entry will be removed, but the d2/a2's object"
4291 echo "and its linkEA are kept in the system."
4293 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# Armed on both MDSes while one entry of each pair is removed.
4294 do_facet mds1 $LCTL set_param fail_loc=0x1624
4295 do_facet mds2 $LCTL set_param fail_loc=0x1624
4296 rmdir $DIR/$tdir/d1/a1 || error "(1) Fail to rmdir $DIR/$tdir/d1/a1"
4297 rmdir $DIR/$tdir/d2/a2 || error "(2) Fail to rmdir $DIR/$tdir/d2/a2"
4298 do_facet mds1 $LCTL set_param fail_loc=0
4299 do_facet mds2 $LCTL set_param fail_loc=0
4301 cancel_lru_locks mdc
4302 cancel_lru_locks osc
4304 echo "Inject failure, to simulate the MDT0 fail to handle"
4305 echo "MDT1 LFSCK request during the first-stage scanning."
4306 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
# This injection stays armed on mds2 for the whole first LFSCK run and is
# cleared only after both status checks below.
4307 do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0
4309 echo "Trigger namespace LFSCK on all devices to find out orphan object"
4310 $START_NAMESPACE -r -A ||
4311 error "(3) Fail to start LFSCK for namespace"
4313 wait_update_facet mds1 "$LCTL get_param -n \
4314 mdd.$(facet_svc mds1).lfsck_namespace |
4315 awk '/^status/ { print \\\$2 }'" "partial" 32 || {
4316 error "(4) mds1 is not the expected 'partial'"
4319 wait_update_facet mds2 "$LCTL get_param -n \
4320 mdd.$(facet_svc mds2).lfsck_namespace |
4321 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4322 error "(5) mds2 is not the expected 'completed'"
4325 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
# First run: mds1 must have repaired nothing, mds2 exactly one lost dirent.
4327 local repaired=$(do_facet mds1 $LCTL get_param -n \
4328 mdd.$(facet_svc mds1).lfsck_namespace |
4329 awk '/^lost_dirent_repaired/ { print $2 }')
4330 [ $repaired -eq 0 ] ||
4331 error "(6) Expect 0 fixed on mds1, but got: $repaired"
4333 repaired=$(do_facet mds2 $LCTL get_param -n \
4334 mdd.$(facet_svc mds2).lfsck_namespace |
4335 awk '/^lost_dirent_repaired/ { print $2 }')
4336 [ $repaired -eq 1 ] ||
4337 error "(7) Expect 1 fixed on mds2, but got: $repaired"
4339 echo "Trigger namespace LFSCK on all devices again to cleanup"
4340 $START_NAMESPACE -r -A ||
4341 error "(8) Fail to start LFSCK for namespace"
4343 wait_all_targets_blocked namespace completed 9
# Second run (no injection): the counts are expected to swap.
4345 local repaired=$(do_facet mds1 $LCTL get_param -n \
4346 mdd.$(facet_svc mds1).lfsck_namespace |
4347 awk '/^lost_dirent_repaired/ { print $2 }')
4348 [ $repaired -eq 1 ] ||
4349 error "(10) Expect 1 fixed on mds1, but got: $repaired"
4351 repaired=$(do_facet mds2 $LCTL get_param -n \
4352 mdd.$(facet_svc mds2).lfsck_namespace |
4353 awk '/^lost_dirent_repaired/ { print $2 }')
4354 [ $repaired -eq 0 ] ||
4355 error "(11) Expect 0 fixed on mds2, but got: $repaired"
4357 run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
# test 29a (currently disabled, see the commented-out run_test at the end):
# hard-link foo while fail_loc 0x1625 is armed so that stat reports 3 links
# although only two names exist, then expect namespace LFSCK to report one
# nlinks repair and stat to report 2 links.
# NOTE(review): the function header and closing brace of this block are not
# visible in this listing -- confirm against the file.
4361 echo "The object's nlink attribute is larger than the object's known"
4362 echo "name entries count. The LFSCK will repair the object's nlink"
4363 echo "attribute to match the known name entries count"
4366 check_mount_and_prep
4368 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4369 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4371 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4372 echo "nlink attribute is larger than its name entries count."
4374 #define OBD_FAIL_LFSCK_MORE_NLINK 0x1625
4375 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
4376 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4377 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4378 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Drop cached mdc locks before reading the link count with stat.
4380 cancel_lru_locks mdc
4381 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
4382 [ $count -eq 3 ] || error "(4) Cannot inject error: $count"
4384 echo "Trigger namespace LFSCK to repair the nlink count"
4385 $START_NAMESPACE -r -A ||
4386 error "(5) Fail to start LFSCK for namespace"
4388 wait_all_targets_blocked namespace completed 6
4390 local repaired=$($SHOW_NAMESPACE |
4391 awk '/^nlinks_repaired/ { print $2 }')
4392 [ $repaired -eq 1 ] ||
4393 error "(7) Fail to repair nlink count: $repaired"
4395 cancel_lru_locks mdc
4396 count=$(stat --format=%h $DIR/$tdir/d0/foo)
4397 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
4399 # Disable 29a, we only allow nlink to be updated if the known linkEA
4400 # entries is larger than nlink count.
4402 #run_test 29a "LFSCK can repair bad nlink count (1)"
# test 29b: hard-link foo while fail_loc 0x1626 is armed so that stat still
# reports 1 link although two names exist, then expect namespace LFSCK to
# report one nlinks repair and stat to report 2 links.
# NOTE(review): the function header and closing brace of this block are not
# visible in this listing -- confirm against the file.
4406 echo "The object's nlink attribute is smaller than the object's known"
4407 echo "name entries count. The LFSCK will repair the object's nlink"
4408 echo "attribute to match the known name entries count"
4411 check_mount_and_prep
4413 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4414 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4416 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4417 echo "nlink attribute is smaller than its name entries count."
4419 #define OBD_FAIL_LFSCK_LESS_NLINK 0x1626
4420 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
4421 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4422 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4423 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Drop cached mdc locks before reading the link count with stat.
4425 cancel_lru_locks mdc
4426 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
4427 [ $count -eq 1 ] || error "(4) Cannot inject error: $count"
4429 echo "Trigger namespace LFSCK to repair the nlink count"
4430 $START_NAMESPACE -r -A ||
4431 error "(5) Fail to start LFSCK for namespace"
4433 wait_all_targets_blocked namespace completed 6
4435 local repaired=$($SHOW_NAMESPACE |
4436 awk '/^nlinks_repaired/ { print $2 }')
4437 [ $repaired -eq 1 ] ||
4438 error "(7) Fail to repair nlink count: $repaired"
4440 cancel_lru_locks mdc
4441 count=$(stat --format=%h $DIR/$tdir/d0/foo)
4442 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
4444 run_test 29b "LFSCK can repair bad nlink count (2)"
# test 29c: create 150 hard links to guard/f0 so its linkEA overflows, check
# that f0's FID is unchanged after "lfs migrate" (when >= 2 MDTs), remove 100
# links and check again, then run namespace LFSCK and expect
# linkea_overflow_cleared == 1 and nlinks_repaired == 0. After that a
# migrate must change f0's FID.
# NOTE(review): the function header, the closing "fi" of each MDSCOUNT
# branch and the sleep announced by the comment further down are not visible
# in this listing -- confirm against the file.
4449 echo "The namespace LFSCK will create many hard links to the target"
4450 echo "file as to exceed the linkEA size limitation. Under such case"
4451 echo "the linkEA will be marked as overflow that will prevent the"
4452 echo "target file to be migrated. Then remove some hard links to"
4453 echo "make the left hard links to be held within the linkEA size"
4454 echo "limitation. But before the namespace LFSCK adding all the"
4455 echo "missed linkEA entries back, the overflow mark (timestamp)"
4456 echo "will not be cleared."
4459 check_mount_and_prep
4461 mkdir -p $DIR/$tdir/guard || error "(0.1) Fail to mkdir"
# foo is placed on the last MDT index (MDSCOUNT - 1).
4462 $LFS mkdir -i $((MDSCOUNT - 1)) $DIR/$tdir/foo ||
4463 error "(0.2) Fail to mkdir"
4464 touch $DIR/$tdir/guard/f0 || error "(1) Fail to create"
# Reference FID; every later migrate check compares against it.
4465 local oldfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4467 # define MAX_LINKEA_SIZE 4096
4468 # sizeof(link_ea_header) = 24
4469 # sizeof(link_ea_entry) = 18
4470 # nlink_min=$(((MAX_LINKEA_SIZE - sizeof(link_ea_header)) /
4471 # (sizeof(link_ea_entry) + name_length))
4472 # If the average name length is 12 bytes, then 150 hard links
4473 # is totally enough to overflow the linkEA
4474 echo "Create 150 hard links should succeed although the linkEA overflow"
4475 createmany -l $DIR/$tdir/guard/f0 $DIR/$tdir/foo/ttttttttttt 150 ||
4476 error "(2) Fail to hard link"
4478 cancel_lru_locks mdc
4479 if [ $MDSCOUNT -ge 2 ]; then
# The migrate command itself must succeed; only f0 is expected to stay put.
4480 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
4481 error "(3.1) Migrate failure"
4483 echo "The object with linkEA overflow should NOT be migrated"
4484 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4485 [ "$newfid" == "$oldfid" ] ||
4486 error "(3.2) Migrate should fail: $newfid != $oldfid"
4489 # Remove 100 hard links, then the linkEA should have space
4490 # to hold the missed linkEA entries.
4491 echo "Remove 100 hard links to save space for the missed linkEA entries"
4492 unlinkmany $DIR/$tdir/foo/ttttttttttt 100 || error "(4) Fail to unlink"
4494 if [ $MDSCOUNT -ge 2 ]; then
4495 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
4496 error "(5.1) Migrate failure"
4498 # The overflow timestamp is still there, so migration will fail.
4499 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4500 [ "$newfid" == "$oldfid" ] ||
4501 error "(5.2) Migrate should fail: $newfid != $oldfid"
4504 # sleep 3 seconds to guarantee that the overflow is recognized
4507 echo "Trigger namespace LFSCK to clear the overflow timestamp"
4508 $START_NAMESPACE -r -A ||
4509 error "(6) Fail to start LFSCK for namespace"
4511 wait_all_targets_blocked namespace completed 7
4513 local repaired=$($SHOW_NAMESPACE |
4514 awk '/^linkea_overflow_cleared/ { print $2 }')
4515 [ $repaired -eq 1 ] ||
4516 error "(8) Fail to clear linkea overflow: $repaired"
# The nlink count itself was never wrong, so no nlink repair is expected.
4518 repaired=$($SHOW_NAMESPACE |
4519 awk '/^nlinks_repaired/ { print $2 }')
4520 [ $repaired -eq 0 ] ||
4521 error "(9) Unexpected nlink repaired: $repaired"
4523 if [ $MDSCOUNT -ge 2 ]; then
4524 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
4525 error "(10.1) Migrate failure"
4527 # Migration should succeed after clear the overflow timestamp.
4528 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4529 [ "$newfid" != "$oldfid" ] ||
4530 error "(10.2) Migrate should succeed"
4532 ls -l $DIR/$tdir/foo > /dev/null ||
4533 error "(11) 'ls' failed after migration"
4536 rm -f $DIR/$tdir/guard/f0 || error "(12) Fail to unlink f0"
4537 rm -rf $DIR/$tdir/foo || error "(13) Fail to rmdir foo"
4539 run_test 29c "verify linkEA size limitation"
# test 30 (ldiskfs backends only): create foo/f0 and foo/d0/{f1,d1}, with d0
# created while fail_loc 0x161d is armed (no linkEA, per the banner below).
# All four name entries are then removed with fail_loc 0x1624 armed, the
# client is unmounted, the MDT is stopped and e2fsck is run on its device.
# After restarting the MDT, namespace LFSCK must report at least 4
# local_lost_found_moved entries, foo/f0 must be reachable again, and d0
# (with d1 and f1 inside) must appear under .lustre/lost+found/MDT0000/.
# NOTE(review): the function header and closing brace of this block are not
# visible in this listing -- confirm against the file.
4542 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
4543 skip "ldiskfs only test" && return
4546 echo "The namespace LFSCK will move the orphans from backend"
4547 echo "/lost+found directory to normal client visible namespace"
4548 echo "or to global visible ./lustre/lost+found/MDTxxxx/ directory"
4551 check_mount_and_prep
4553 $LFS mkdir -i 0 $DIR/$tdir/foo || error "(1) Fail to mkdir foo"
# The failure message names f0, the file this command actually creates.
4554 touch $DIR/$tdir/foo/f0 || error "(2) Fail to touch f0"
4556 echo "Inject failure stub on MDT0 to simulate the case that"
4557 echo "directory d0 has no linkEA entry, then the LFSCK will"
4558 echo "move it into .lustre/lost+found/MDTxxxx/ later."
4560 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
4561 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
4562 mkdir $DIR/$tdir/foo/d0 || error "(3) Fail to mkdir d0"
4563 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4565 touch $DIR/$tdir/foo/d0/f1 || error "(4) Fail to touch f1"
4566 mkdir $DIR/$tdir/foo/d0/d1 || error "(5) Fail to mkdir d1"
4568 echo "Inject failure stub on MDT0 to simulate the case that the"
4569 echo "object's name entry will be removed, but not destroy the"
4570 echo "object. Then backend e2fsck will handle it as orphan and"
4571 echo "add them into the backend /lost+found directory."
4573 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
# Children are removed before their parent d0; f0 is removed last.
4574 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4575 rmdir $DIR/$tdir/foo/d0/d1 || error "(6) Fail to rmdir d1"
4576 rm -f $DIR/$tdir/foo/d0/f1 || error "(7) Fail to unlink f1"
4577 rmdir $DIR/$tdir/foo/d0 || error "(8) Fail to rmdir d0"
4578 rm -f $DIR/$tdir/foo/f0 || error "(9) Fail to unlink f0"
4579 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# e2fsck runs against the MDT device while the client is unmounted and the
# MDT is stopped.
4581 umount_client $MOUNT || error "(10) Fail to stop client!"
4583 stop $SINGLEMDS || error "(11) Fail to stop MDT0"
4586 run_e2fsck $(facet_host $SINGLEMDS) $MDT_DEVNAME "-y" ||
4587 error "(12) Fail to run e2fsck"
4589 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
4590 error "(13) Fail to start MDT0"
4592 echo "Trigger namespace LFSCK to recover backend orphans"
4593 $START_NAMESPACE -r -A ||
4594 error "(14) Fail to start LFSCK for namespace"
4596 wait_all_targets_blocked namespace completed 15
4598 local repaired=$($SHOW_NAMESPACE |
4599 awk '/^local_lost_found_moved/ { print $2 }')
4600 [ $repaired -ge 4 ] ||
4601 error "(16) Fail to recover backend orphans: $repaired"
4603 mount_client $MOUNT || error "(17) Fail to start client!"
4605 stat $DIR/$tdir/foo/f0 || error "(18) f0 is not recovered"
4607 ls -ail $MOUNT/.lustre/lost+found/
4609 echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
4610 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4611 error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4613 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# The -name pattern is quoted so the shell hands it to find verbatim instead
# of expanding it against the current directory first; cname is kept local
# so it does not leak into later tests.
4615 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-*-D-*')
4616 [ -n "$cname" ] || error "(20) d0 is not recovered"
4618 stat ${cname}/d1 || error "(21) d0 is not recovered"
4619 stat ${cname}/f1 || error "(22) f1 is not recovered"
4621 run_test 30 "LFSCK can recover the orphans from backend /lost+found"
# test 31a (needs at least 2 MDTs): create MDSCOUNT sub-directories in a
# striped directory while client-side fail_loc 0x1628 (fail_val=0) is armed,
# then expect namespace LFSCK to report at least one name_hash repair. After
# a client remount every d$i must be stat-able and removable.
# NOTE(review): the function header, part of the banner and the "done" of
# the loop are not visible in this listing -- confirm against the file.
4624 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4627 echo "For the name entry under a striped directory, if the name"
4628 echo "hash does not match the shard, then the LFSCK will repair"
4629 echo "the bad name entry"
4632 check_mount_and_prep
4634 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4635 error "(1) Fail to create striped directory"
4637 echo "Inject failure stub on client to simulate the case that"
4638 echo "some name entry should be inserted into other non-first"
4639 echo "shard, but inserted into the first shard by wrong"
4641 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
# Set with a plain $LCTL (no do_facet), i.e. on the node running the test.
4642 $LCTL set_param fail_loc=0x1628 fail_val=0
4643 createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
4644 error "(2) Fail to create file under striped directory"
4645 $LCTL set_param fail_loc=0 fail_val=0
4647 echo "Trigger namespace LFSCK to repair bad name hash"
4648 $START_NAMESPACE -r -A ||
4649 error "(3) Fail to start LFSCK for namespace"
4651 wait_all_targets_blocked namespace completed 4
4653 local repaired=$($SHOW_NAMESPACE |
4654 awk '/^name_hash_repaired/ { print $2 }')
4655 [ $repaired -ge 1 ] ||
4656 error "(5) Fail to repair bad name hash: $repaired"
# Remount the client before verifying the entries.
4658 umount_client $MOUNT || error "(6) umount failed"
4659 mount_client $MOUNT || error "(7) mount failed"
4661 for ((i = 0; i < $MDSCOUNT; i++)); do
4662 stat $DIR/$tdir/striped_dir/d$i ||
4663 error "(8) Fail to stat d$i after LFSCK"
4664 rmdir $DIR/$tdir/striped_dir/d$i ||
4665 error "(9) Fail to unlink d$i after LFSCK"
4668 rmdir $DIR/$tdir/striped_dir ||
4669 error "(10) Fail to remove the striped directory after LFSCK"
4671 run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
# test 31b (needs at least 2 MDTs): same flow as the first bad-name-hash
# test, but the injection uses fail_val=1 and the repair counter is read
# from mds2's lfsck_namespace instead of $SHOW_NAMESPACE.
# NOTE(review): the function header, part of the banner and the "done" of
# the loop are not visible in this listing -- confirm against the file.
4674 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4677 echo "For the name entry under a striped directory, if the name"
4678 echo "hash does not match the shard, then the LFSCK will repair"
4679 echo "the bad name entry"
4682 check_mount_and_prep
4684 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4685 error "(1) Fail to create striped directory"
4687 echo "Inject failure stub on client to simulate the case that"
4688 echo "some name entry should be inserted into other non-second"
4689 echo "shard, but inserted into the secod shard by wrong"
4691 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
# Set with a plain $LCTL (no do_facet), i.e. on the node running the test.
4692 $LCTL set_param fail_loc=0x1628 fail_val=1
4693 createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
4694 error "(2) Fail to create file under striped directory"
4695 $LCTL set_param fail_loc=0 fail_val=0
4697 echo "Trigger namespace LFSCK to repair bad name hash"
4698 $START_NAMESPACE -r -A ||
4699 error "(3) Fail to start LFSCK for namespace"
4701 wait_all_targets_blocked namespace completed 4
# The repair is expected to be accounted on mds2.
4703 local repaired=$(do_facet mds2 $LCTL get_param -n \
4704 mdd.$(facet_svc mds2).lfsck_namespace |
4705 awk '/^name_hash_repaired/ { print $2 }')
4706 [ $repaired -ge 1 ] ||
4707 error "(5) Fail to repair bad name hash: $repaired"
# Remount the client before verifying the entries.
4709 umount_client $MOUNT || error "(6) umount failed"
4710 mount_client $MOUNT || error "(7) mount failed"
4712 for ((i = 0; i < $MDSCOUNT; i++)); do
4713 stat $DIR/$tdir/striped_dir/d$i ||
4714 error "(8) Fail to stat d$i after LFSCK"
4715 rmdir $DIR/$tdir/striped_dir/d$i ||
4716 error "(9) Fail to unlink d$i after LFSCK"
4719 rmdir $DIR/$tdir/striped_dir ||
4720 error "(10) Fail to remove the striped directory after LFSCK"
4722 run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
# Test 31c: the master MDT-object of a striped directory loses its master
# LMV EA and nothing is created under it afterwards; LFSCK is expected to
# re-generate the EA.
# Flow: arm fail_loc 0x1629 on $SINGLEMDS while the striped directory is
# created, run namespace LFSCK, require striped_dirs_repaired == 1, remount
# the client, check the directory lists as empty, then remove it.
# NOTE(review): no function header/closing brace is visible around these
# statements in this listing -- confirm against the complete file.
4725 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4728 echo "For some reason, the master MDT-object of the striped directory"
4729 echo "may lost its master LMV EA. If nobody created files under the"
4730 echo "master directly after the master LMV EA lost, then the LFSCK"
4731 echo "should re-generate the master LMV EA."
4734 check_mount_and_prep
4736 echo "Inject failure stub on MDT0 to simulate the case that the"
4737 echo "master MDT-object of the striped directory lost the LMV EA."
# The fault is set on the MDS (not the client) and cleared again as soon as
# the striped directory exists.
4739 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
4740 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
4741 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4742 error "(1) Fail to create striped directory"
4743 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4745 echo "Trigger namespace LFSCK to re-generate master LMV EA"
4746 $START_NAMESPACE -r -A ||
4747 error "(2) Fail to start LFSCK for namespace"
# Presumably waits for all targets to reach "completed"; the 3 looks like the
# step number for the helper's own error message -- confirm in the helper.
4749 wait_all_targets_blocked namespace completed 3
# Exactly one repaired striped directory is expected.
# NOTE(review): $SHOW_NAMESPACE is defined outside this chunk; $repaired is
# unquoted, so an empty value makes `[` itself error out (the `||` branch
# still fires).
4751 local repaired=$($SHOW_NAMESPACE |
4752 awk '/^striped_dirs_repaired/ { print $2 }')
4753 [ $repaired -eq 1 ] ||
4754 error "(4) Fail to re-generate master LMV EA: $repaired"
# Remount the client before inspecting the directory.
4756 umount_client $MOUNT || error "(5) umount failed"
4757 mount_client $MOUNT || error "(6) mount failed"
# Nothing was created under the directory, so `ls` must print nothing.
4759 local empty=$(ls $DIR/$tdir/striped_dir/)
4760 [ -z "$empty" ] || error "(7) The master LMV EA is not repaired: $empty"
4762 rmdir $DIR/$tdir/striped_dir ||
4763 error "(8) Fail to remove the striped directory after LFSCK"
# Register the test with the framework under id 31c.
4765 run_test 31c "Re-generate the lost master LMV EA for striped directory"
# Test 31d: the master MDT-object loses its master LMV EA and a file IS
# created under it afterwards; LFSCK must not re-generate the EA and the
# directory is expected to become read-only instead.
# Flow: arm fail_loc 0x1629 on $SINGLEMDS while creating the striped
# directory, remount, touch a file in it, run namespace LFSCK, require
# striped_dirs_repaired == 0, verify the existing file is still stat-able and
# that a new touch fails, then `chattr -i` and remove the directory.
# NOTE(review): no function header/closing brace is visible around these
# statements in this listing -- confirm against the complete file.
4768 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4771 echo "For some reason, the master MDT-object of the striped directory"
4772 echo "may lost its master LMV EA. If somebody created files under the"
4773 echo "master directly after the master LMV EA lost, then the LFSCK"
4774 echo "should NOT re-generate the master LMV EA, instead, it should"
4775 echo "change the broken striped dirctory as read-only to prevent"
4776 echo "further damage"
4779 check_mount_and_prep
4781 echo "Inject failure stub on MDT0 to simulate the case that the"
4782 echo "master MDT-object of the striped directory lost the LMV EA."
# Fault armed on the MDS only while the striped directory is created.
4784 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
4785 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
4786 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4787 error "(1) Fail to create striped directory"
4788 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0
# Remount, then modify the broken directory before LFSCK runs; this is what
# distinguishes the scenario from the "nobody created files" case described
# in the banner text above.
4790 umount_client $MOUNT || error "(2) umount failed"
4791 mount_client $MOUNT || error "(3) mount failed"
4793 touch $DIR/$tdir/striped_dir/dummy ||
4794 error "(4) Fail to touch under broken striped directory"
4796 echo "Trigger namespace LFSCK to find out the inconsistency"
4797 $START_NAMESPACE -r -A ||
4798 error "(5) Fail to start LFSCK for namespace"
# Presumably waits for all targets to reach "completed"; the 6 looks like the
# step number for the helper's own error message -- confirm in the helper.
4800 wait_all_targets_blocked namespace completed 6
# Zero repairs expected: the EA must NOT have been re-generated.
# NOTE(review): $repaired is unquoted; an empty value makes `[` itself error
# out (the `||` branch still fires).
4802 local repaired=$($SHOW_NAMESPACE |
4803 awk '/^striped_dirs_repaired/ { print $2 }')
4804 [ $repaired -eq 0 ] ||
4805 error "(7) Re-generate master LMV EA unexpected: $repaired"
# The pre-existing file must remain readable ...
4807 stat $DIR/$tdir/striped_dir/dummy ||
4808 error "(8) Fail to stat $DIR/$tdir/striped_dir/dummy"
# ... while creating a new one must fail (success here is the error case).
4810 touch $DIR/$tdir/striped_dir/foo &&
4811 error "(9) The broken striped directory should be read-only"
# Clear the immutable attribute so the directory can be cleaned up.
# NOTE(review): `dummy` is not removed in the visible lines before rmdir --
# confirm how the directory is expected to be removable here.
4813 chattr -i $DIR/$tdir/striped_dir ||
4814 error "(10) Fail to chattr on the broken striped directory"
4816 rmdir $DIR/$tdir/striped_dir ||
4817 error "(11) Fail to remove the striped directory after LFSCK"
# Register the test with the framework under id 31d.
4819 run_test 31d "Set broken striped directory (modified after broken) as read-only"
# Test 31e: a slave MDT-object of a striped directory loses its slave LMV
# EA, variant (1): per the banner text, the slave that resides on the same
# MDT as the master object (fail_val=0).
# Flow: arm fail_loc 0x162a on $SINGLEMDS while creating the striped
# directory, run namespace LFSCK, require striped_shards_repaired == 1, then
# remove the directory.
# NOTE(review): no function header/closing brace is visible around these
# statements in this listing -- confirm against the complete file.
4822 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4825 echo "For some reason, the slave MDT-object of the striped directory"
4826 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
4827 echo "slave LMV EA."
4830 check_mount_and_prep
4832 echo "Inject failure stub on MDT0 to simulate the case that the"
4833 echo "slave MDT-object (that resides on the same MDT as the master"
4834 echo "MDT-object resides on) lost the LMV EA."
# Fault armed only while the striped directory is created, then cleared.
4836 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
4837 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=0
4838 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4839 error "(1) Fail to create striped directory"
4840 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4842 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
4843 $START_NAMESPACE -r -A ||
4844 error "(2) Fail to start LFSCK for namespace"
# Presumably waits for all targets to reach "completed"; the 3 looks like the
# step number for the helper's own error message -- confirm in the helper.
4846 wait_all_targets_blocked namespace completed 3
# Exactly one repaired shard is expected.
# NOTE(review): $repaired is unquoted; an empty value makes `[` itself error
# out (the `||` branch still fires).
4848 local repaired=$($SHOW_NAMESPACE |
4849 awk '/^striped_shards_repaired/ { print $2 }')
4850 [ $repaired -eq 1 ] ||
4851 error "(4) Fail to re-generate slave LMV EA: $repaired"
4853 rmdir $DIR/$tdir/striped_dir ||
4854 error "(5) Fail to remove the striped directory after LFSCK"
# Register the test with the framework under id 31e.
4856 run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
# Test 31f: a slave MDT-object of a striped directory loses its slave LMV
# EA, variant (2): per the banner text, a slave that resides on a different
# MDT than the master object (fail_val=1).
# Flow: arm fail_loc 0x162a on $SINGLEMDS while creating the striped
# directory, run namespace LFSCK, require striped_shards_repaired == 1 as
# reported by mds2, then remove the directory.
# NOTE(review): no function header/closing brace is visible around these
# statements in this listing -- confirm against the complete file.
4859 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4862 echo "For some reason, the slave MDT-object of the striped directory"
4863 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
4864 echo "slave LMV EA."
4867 check_mount_and_prep
4869 echo "Inject failure stub on MDT0 to simulate the case that the"
4870 echo "slave MDT-object (that resides on different MDT as the master"
4871 echo "MDT-object resides on) lost the LMV EA."
# Same fail_loc as the fail_val=0 variant; only fail_val differs.
4873 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
4874 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=1
4875 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4876 error "(1) Fail to create striped directory"
4877 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4879 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
4880 $START_NAMESPACE -r -A ||
4881 error "(2) Fail to start LFSCK for namespace"
# Presumably waits for all targets to reach "completed"; the 3 looks like the
# step number for the helper's own error message -- confirm in the helper.
4883 wait_all_targets_blocked namespace completed 3
# The counter is read from mds2 rather than from $SINGLEMDS.
# NOTE(review): $repaired is unquoted; an empty value makes `[` itself error
# out (the `||` branch still fires).
4885 local repaired=$(do_facet mds2 $LCTL get_param -n \
4886 mdd.$(facet_svc mds2).lfsck_namespace |
4887 awk '/^striped_shards_repaired/ { print $2 }')
4888 [ $repaired -eq 1 ] ||
4889 error "(4) Fail to re-generate slave LMV EA: $repaired"
4891 rmdir $DIR/$tdir/striped_dir ||
4892 error "(5) Fail to remove the striped directory after LFSCK"
# Register the test with the framework under id 31f.
4894 run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
# Test 31g: the stripe index stored in a slave LMV EA is corrupted; LFSCK is
# expected to repair it.
# Flow: arm fail_loc 0x162b (fail_val=0) on $SINGLEMDS while creating the
# striped directory, run namespace LFSCK, require
# striped_shards_repaired == 1, remount the client, then check that a file
# can be created and removed in the directory before removing it.
# NOTE(review): no function header/closing brace is visible around these
# statements in this listing -- confirm against the complete file.
4897 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4900 echo "For some reason, the stripe index in the slave LMV EA is"
4901 echo "corrupted. The LFSCK should repair the slave LMV EA."
4904 check_mount_and_prep
4906 echo "Inject failure stub on MDT0 to simulate the case that the"
4907 echo "slave LMV EA on the first shard of the striped directory"
4908 echo "claims the same index as the second shard claims"
# Fault armed only while the striped directory is created, then cleared.
4910 #define OBD_FAIL_LFSCK_BAD_SLAVE_LMV 0x162b
4911 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162b fail_val=0
4912 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4913 error "(1) Fail to create striped directory"
4914 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4916 echo "Trigger namespace LFSCK to repair the slave LMV EA"
4917 $START_NAMESPACE -r -A ||
4918 error "(2) Fail to start LFSCK for namespace"
# Presumably waits for all targets to reach "completed"; the 3 looks like the
# step number for the helper's own error message -- confirm in the helper.
4920 wait_all_targets_blocked namespace completed 3
# Exactly one repaired shard is expected.
# NOTE(review): $repaired is unquoted; an empty value makes `[` itself error
# out (the `||` branch still fires).
4922 local repaired=$($SHOW_NAMESPACE |
4923 awk '/^striped_shards_repaired/ { print $2 }')
4924 [ $repaired -eq 1 ] ||
4925 error "(4) Fail to repair slave LMV EA: $repaired"
# Remount the client before exercising the repaired directory.
4927 umount_client $MOUNT || error "(5) umount failed"
4928 mount_client $MOUNT || error "(6) mount failed"
# The repaired directory must accept both create and unlink.
4930 touch $DIR/$tdir/striped_dir/foo ||
4931 error "(7) Fail to touch file after the LFSCK"
4933 rm -f $DIR/$tdir/striped_dir/foo ||
4934 error "(8) Fail to unlink file after the LFSCK"
4936 rmdir $DIR/$tdir/striped_dir ||
4937 error "(9) Fail to remove the striped directory after LFSCK"
# Register the test with the framework under id 31g.
4939 run_test 31g "Repair the corrupted slave LMV EA"
# Test 31h: a shard's name entry inside a striped directory is corrupted;
# LFSCK is expected to repair the entry.
# Flow: arm fail_loc 0x162c (fail_val=0) on $SINGLEMDS while creating the
# striped directory, run namespace LFSCK, require dirent_repaired == 1,
# remount the client, then check that a file can be created and removed in
# the directory before removing it.
# NOTE(review): no function header/closing brace is visible around these
# statements in this listing -- confirm against the complete file.
4942 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
4945 echo "For some reason, the shard's name entry in the striped"
4946 echo "directory may be corrupted. The LFSCK should repair the"
4947 echo "bad shard's name entry."
4950 check_mount_and_prep
4952 echo "Inject failure stub on MDT0 to simulate the case that the"
4953 echo "first shard's name entry in the striped directory claims"
4954 echo "the same index as the second shard's name entry claims."
# Fault armed only while the striped directory is created, then cleared.
4956 #define OBD_FAIL_LFSCK_BAD_SLAVE_NAME 0x162c
4957 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162c fail_val=0
4958 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4959 error "(1) Fail to create striped directory"
4960 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4962 echo "Trigger namespace LFSCK to repair the shard's name entry"
4963 $START_NAMESPACE -r -A ||
4964 error "(2) Fail to start LFSCK for namespace"
# Presumably waits for all targets to reach "completed"; the 3 looks like the
# step number for the helper's own error message -- confirm in the helper.
4966 wait_all_targets_blocked namespace completed 3
# This scenario is counted under dirent_repaired, not under the
# striped_shards_repaired counter.
# NOTE(review): $repaired is unquoted; an empty value makes `[` itself error
# out (the `||` branch still fires).
4968 local repaired=$($SHOW_NAMESPACE |
4969 awk '/^dirent_repaired/ { print $2 }')
4970 [ $repaired -eq 1 ] ||
4971 error "(4) Fail to repair shard's name entry: $repaired"
# Remount the client before exercising the repaired directory.
4973 umount_client $MOUNT || error "(5) umount failed"
4974 mount_client $MOUNT || error "(6) mount failed"
# The repaired directory must accept both create and unlink.
4976 touch $DIR/$tdir/striped_dir/foo ||
4977 error "(7) Fail to touch file after the LFSCK"
4979 rm -f $DIR/$tdir/striped_dir/foo ||
4980 error "(8) Fail to unlink file after the LFSCK"
4982 rmdir $DIR/$tdir/striped_dir ||
4983 error "(9) Fail to remove the striped directory after LFSCK"
# Register the test with the framework under id 31h.
4985 run_test 31h "Repair the corrupted shard's name entry"
# Test 32: layout LFSCK must still be stoppable after an OST has gone away.
# Flow: unmount the client, arm fail_loc 0x162d (fail_val=3) on $SINGLEMDS,
# start layout LFSCK, require status "scanning-phase1", stop ost1, clear the
# fault, then stop LFSCK.
# NOTE(review): the umount result is not checked and the error numbering
# starts at "(2)"; any earlier setup step is not visible in this listing, and
# nothing visible here restarts ost1 -- confirm against the complete file.
4990 umount_client $MOUNT
# NOTE(review): the fault presumably keeps the scan in phase 1 long enough
# for the status check below -- confirm against the fail_loc definition.
4992 #define OBD_FAIL_LFSCK_ASSISTANT_DIRECT 0x162d
4993 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
4994 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
# LFSCK must still be in its first scanning phase when the OST is stopped.
4996 local STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
4997 [ "$STATUS" == "scanning-phase1" ] ||
4998 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
# Take ost1 down while the scan is in progress.
5001 stop ost1 > /dev/null || error "(4) Fail to stop OST1!"
# Clear the fault before asking LFSCK to stop.
5003 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
5007 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
# Register the test with the framework under id 32.
5009 run_test 32 "stop LFSCK when some OST failed"
# Test 33: check that options given to lfsck_start show up in the "param"
# line of the LFSCK status output.
#   layout    started with `--dryrun -o -r`   -> "dryrun,all_targets,orphan"
#   namespace started with `-e abort -A -r`   -> "failout,all_targets"
# NOTE(review): any setup step preceding these statements is not visible in
# this listing -- confirm against the complete file.
5015 $START_LAYOUT --dryrun -o -r ||
5016 error "(1) Fail to start layout LFSCK"
# Presumably waits for all targets to reach "completed"; the 2 looks like the
# step number for the helper's own error message -- confirm in the helper.
5017 wait_all_targets_blocked layout completed 2
# Second field of the line starting with "param" holds the option list.
5019 local PARAMS=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
5020 [ "$PARAMS" == "dryrun,all_targets,orphan" ] ||
5021 error "(3) Expect 'dryrun,all_targets,orphan', got '$PARAMS'"
# Same check for the namespace scan, with a different option set.
5023 $START_NAMESPACE -e abort -A -r ||
5024 error "(4) Fail to start namespace LFSCK"
5025 wait_all_targets_blocked namespace completed 5
5027 PARAMS=$($SHOW_NAMESPACE | awk '/^param/ { print $2 }')
5028 [ "$PARAMS" == "failout,all_targets" ] ||
5029 error "(6) Expect 'failout,all_targets', got '$PARAMS'"
# Register the test with the framework under id 33.
5031 run_test 33 "check LFSCK paramters"
5033 # Restore the MDS/OST size and OST count that were saved into the SAVED_*
# variables near the top of the script, before they were changed for the
# test run.
5034 MDSSIZE=${SAVED_MDSSIZE}
5035 OSTSIZE=${SAVED_OSTSIZE}
5036 OSTCOUNT=${SAVED_OSTCOUNT}
5038 # Finally, clean up the system.