# Top-level setup for the LFSCK sanity tests: seed the list of excluded
# tests, load the test framework and configuration, bail out early on
# unsupported setups, and extend the exclusion list by server version.
# NOTE(review): the number at the start of each line and the gaps between
# those numbers suggest that this listing omits some lines (for example the
# end of the check_versions conditional) - confirm against the full file.
3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
11 # Bug numbers of the excepted tests
12 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT"
# When SLOW is set to no, clear the EXCEPT_SLOW list.
14 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
15 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
# LUSTRE defaults to the parent of the directory that holds this script.
17 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
18 . $LUSTRE/tests/test-framework.sh
# Source the configuration file; CONFIG is assigned the per-NAME default
# path when it is unset or empty.
20 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
# Leave with status 0 when require_dsh_mds fails.
23 require_dsh_mds || exit 0
# Skip the suite when check_versions fails.
27 if ! check_versions; then
28 skip "It is NOT necessary to test lfsck under interoperation mode"
# Skip and exit when the MDS version is older than 2.3.60.
32 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.3.60) ]] &&
33 skip "Need MDS version at least 2.3.60" && exit 0
# Remember the configured sizes and OST count before they are changed.
37 SAVED_MDSSIZE=${MDSSIZE}
38 SAVED_OSTSIZE=${OSTSIZE}
39 SAVED_OSTCOUNT=${OSTCOUNT}
40 # use small MDS + OST size to speed formatting time
41 # do not make MDSSIZE/OSTSIZE too small, since they affect the default journal size
44 # there is no need for many OSTs; fewer OSTs reduce the format/start/stop overhead
45 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
47 # build up a clean test environment.
# The checks below append test names to ALWAYS_EXCEPT depending on the
# version reported by the MDS or by ost1.
51 [[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.4.90) ]] &&
52 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2c"
54 [[ $(lustre_version_code ost1) -lt $(version_code 2.5.55) ]] &&
55 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 11 12 13 14 15 16 17 18 19 20 21"
57 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.50) ]] &&
58 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2d 2e 3 22 23 24 25 26 27 28 29 30 31"
60 # DNE does not support striped directory on zfs-based backend yet.
61 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
62 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 31"
# Shared names and command strings used by the tests in this file.
# Target names of the first MDT and the first OST of the filesystem.
66 MDT_DEV="${FSNAME}-MDT0000"
67 OST_DEV="${FSNAME}-OST0000"
# Device name for SINGLEMDS; the substring mds is removed from the facet
# name before it is passed to mdsdevname.
68 MDT_DEVNAME=$(mdsdevname ${SINGLEMDS//mds/})
# Command strings that start LFSCK of a given type (-t) on the MDT or on
# the first OST. The tests expand them unquoted and append extra options.
69 START_NAMESPACE="do_facet $SINGLEMDS \
70 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
71 START_LAYOUT="do_facet $SINGLEMDS \
72 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
73 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
# Command string that stops LFSCK on the MDT.
74 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
# Command strings that print the lfsck_namespace / lfsck_layout parameter
# of the MDT (mdd) or of the first OST (obdfilter).
75 SHOW_NAMESPACE="do_facet $SINGLEMDS \
76 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
77 SHOW_LAYOUT="do_facet $SINGLEMDS \
78 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
79 SHOW_LAYOUT_ON_OST="do_facet ost1 \
80 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# Mount option sets passed to start when the tests (re)start the MDT.
81 MOUNT_OPTS_SCRUB="-o user_xattr"
82 MOUNT_OPTS_NOSCRUB="-o user_xattr,noscrub"
83 MOUNT_OPTS_SKIP_LFSCK="-o user_xattr,skip_lfsck"
# Fragment of a helper that populates $DIR/$tdir with files and
# directories, driven by the nfiles, ndirs and igif variables.
# NOTE(review): the function header, the assignments of nfiles, ndirs and
# igif, and the closing fi/done/brace lines are not visible in this
# listing - confirm against the full file.
92 echo "preparing... $nfiles * $ndirs files will be created $(date)."
# When igif is non-empty, set fail_loc 0x1504 on the MDS before creating
# any files.
93 if [ ! -z $igif ]; then
94 #define OBD_FAIL_FID_IGIF 0x1504
95 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
# Copy the shell scripts from the tests directory into the test directory.
98 cp $LUSTRE/tests/*.sh $DIR/$tdir/
# createmany is called with -d for the d and e name prefixes and with -m
# for the f prefixes; presumably -d makes directories and -m makes files -
# TODO confirm against the createmany tool.
99 if [ $ndirs -gt 0 ]; then
100 createmany -d $DIR/$tdir/d $ndirs
101 createmany -m $DIR/$tdir/f $ndirs
# With nfiles greater than zero, also fill each d<i> directory.
102 if [ $nfiles -gt 0 ]; then
103 for ((i = 0; i < $ndirs; i++)); do
104 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
105 /dev/null || error "createmany $nfiles"
108 createmany -d $DIR/$tdir/e $ndirs
# In the igif case, create one more file and then clear fail_loc again.
111 if [ ! -z $igif ]; then
112 touch $DIR/$tdir/dummy
113 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
116 echo "prepared $(date)."
# Stop the MDS, run run_e2fsck with -n against the device of MDT index 1
# (mdsdevname 1), and start the MDS again with the noscrub mount options.
# Returns immediately when the MDS backend is not ldiskfs.
# NOTE(review): the condition that guards error (2) and the closing braces
# are not visible in this listing - confirm against the full file.
119 run_e2fsck_on_mdt0() {
120 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] && return
122 stop $SINGLEMDS > /dev/null || error "(0) Fail to the stop MDT0"
# The first run_e2fsck output is piped into a command that is not visible
# here; the second run and error (2) presumably belong to its failure
# branch - TODO confirm.
123 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n" |
125 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n"
126 error "(2) Detected inconsistency on MDT0"
128 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
129 error "(3) Fail to start MDT0"
# Check that every MDT reports the expected LFSCK status, raising an error
# otherwise. Reads com (LFSCK component), status (expected state) and err
# (tag used in the error message).
# NOTE(review): the assignments of com, status and err are not visible in
# this listing; the one visible caller passes three positional arguments
# (namespace completed 4), so they presumably map to $1 $2 $3 - confirm.
132 wait_all_targets_blocked() {
# count is the second field of the <com>_mdts_<status> line printed by
# lfsck_query -w on mds1.
137 local count=$(do_facet mds1 \
138 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 -w |
139 awk '/^${com}_mdts_${status}/ { print \\\$2 }'")
# When count differs from MDSCOUNT, print the query output and fail.
140 [[ $count -eq $MDSCOUNT ]] || {
141 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
142 error "($err) only $count of $MDSCOUNT MDTs are in ${status}"
# Fragment of a second wait helper (its header is not visible in this
# listing). It hands wait_update_facet an lfsck_query command for mds1,
# the expected value MDSCOUNT and the limit LTIME; presumably this polls
# until the <com>_mdts_<status> count equals MDSCOUNT - TODO confirm.
# On failure it prints the query output and raises an error tagged err.
151 wait_update_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 |
152 awk '/^${com}_mdts_${status}/ { print \\\$2 }'" \
153 "$MDSCOUNT" $LTIME || {
154 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
155 error "($err) some MDTs are not in ${status}"
# Body of the test registered below with run_test 0 (Control LFSCK
# manually): start, stop and restart the namespace LFSCK, check the status
# reported after each step, then check the repair and success counters.
# NOTE(review): the function header, the preparation call and some closing
# lines are not visible in this listing.
# Set fail_loc 0x1600 with fail_val=3, then start with -r (reset).
162 #define OBD_FAIL_LFSCK_DELAY1 0x1600
163 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
164 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
166 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
# The status line must read scanning-phase1 while the delay is active.
168 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
169 [ "$STATUS" == "scanning-phase1" ] ||
170 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
# Stop LFSCK and expect the stopped status.
172 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
174 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
175 [ "$STATUS" == "stopped" ] ||
176 error "(6) Expect 'stopped', but got '$STATUS'"
# Start again without -r and expect scanning-phase1 once more.
178 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
180 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
181 [ "$STATUS" == "scanning-phase1" ] ||
182 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the fail_loc and wait for the completed status.
184 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
185 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
186 mdd.${MDT_DEV}.lfsck_namespace |
187 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
189 error "(9) unexpected status"
# The updated_phase1 counter must be zero: nothing needed a repair.
192 local repaired=$($SHOW_NAMESPACE |
193 awk '/^updated_phase1/ { print $2 }')
194 [ $repaired -eq 0 ] ||
195 error "(10) Expect nothing to be repaired, but got: $repaired"
# A second complete run after a reset must raise success_count by one.
197 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
198 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
199 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
200 mdd.${MDT_DEV}.lfsck_namespace |
201 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
203 error "(12) unexpected status"
206 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
207 [ $((scanned1 + 1)) -eq $scanned2 ] ||
208 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
# Finally stop all targets; the echo refers to LU-3649.
210 echo "stopall, should NOT crash LU-3649"
211 stopall || error "(14) Fail to stopall"
213 run_test 0 "Control LFSCK manually"
# Body of the test registered below with run_test 1a: create a file while
# fail_loc 0x1501 is set, run the namespace LFSCK and expect exactly one
# repaired entry. Skipped when the MDS backend is not ldiskfs.
# NOTE(review): the function header and some lines are not visible here.
216 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
217 skip "OI Scrub not implemented for ZFS" && return
# Create the dummy file with the fail_loc set, then clear the fail_loc.
221 #define OBD_FAIL_FID_INDIR 0x1501
222 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
223 touch $DIR/$tdir/dummy
225 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Run a reset LFSCK and wait for the completed status.
227 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
228 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
229 mdd.${MDT_DEV}.lfsck_namespace |
230 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
232 error "(4) unexpected status"
# Read dirent_repaired, falling back to updated_phase1 when the first
# counter is absent; the value must be 1.
235 local repaired=$($SHOW_NAMESPACE |
236 awk '/^dirent_repaired/ { print $2 }')
237 # for interop with old server
238 [ -z "$repaired" ] &&
239 repaired=$($SHOW_NAMESPACE |
240 awk '/^updated_phase1/ { print $2 }')
242 [ $repaired -eq 1 ] ||
243 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
# Mount the client and list the directory while fail_loc 0x1505 is set.
247 mount_client $MOUNT || error "(6) Fail to start client!"
249 #define OBD_FAIL_FID_LOOKUP 0x1505
250 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
251 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
253 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
255 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
# Body of the test registered below with run_test 1b: create a file while
# fail_loc 0x1502 is set, run the namespace LFSCK with fail_loc 0x1506
# and expect exactly one repaired entry. Skipped on non-ldiskfs backends.
# NOTE(review): the function header and some lines are not visible here.
259 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
260 skip "OI Scrub not implemented for ZFS" && return
# Create the dummy file with the fail_loc set, then clear the fail_loc.
264 #define OBD_FAIL_FID_INLMA 0x1502
265 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
266 touch $DIR/$tdir/dummy
268 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Keep fail_loc 0x1506 set while the LFSCK runs to completion.
270 #define OBD_FAIL_FID_NOLMA 0x1506
271 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
272 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
273 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
274 mdd.${MDT_DEV}.lfsck_namespace |
275 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
277 error "(4) unexpected status"
# Read dirent_repaired, falling back to updated_phase1 when the first
# counter is absent; the value must be 1.
280 local repaired=$($SHOW_NAMESPACE |
281 awk '/^dirent_repaired/ { print $2 }')
282 # for interop with old server
283 [ -z "$repaired" ] &&
284 repaired=$($SHOW_NAMESPACE |
285 awk '/^updated_phase1/ { print $2 }')
287 [ $repaired -eq 1 ] ||
288 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
290 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Mount the client and stat the file while fail_loc 0x1505 is set.
293 mount_client $MOUNT || error "(6) Fail to start client!"
295 #define OBD_FAIL_FID_LOOKUP 0x1505
296 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
297 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
299 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
301 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
# Body of the test registered below with run_test 2a: create a file while
# fail_loc 0x1603 is set, run the namespace LFSCK, expect one repaired
# linkEA and check that the path and FID of the file map to each other.
# NOTE(review): the function header and some lines are not visible here.
306 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
307 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
308 touch $DIR/$tdir/dummy
310 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Run a reset LFSCK and wait for the completed status.
312 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
313 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
314 mdd.${MDT_DEV}.lfsck_namespace |
315 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
317 error "(4) unexpected status"
# Read linkea_repaired, falling back to updated_phase2 when the first
# counter is absent; the value must be 1.
320 local repaired=$($SHOW_NAMESPACE |
321 awk '/^linkea_repaired/ { print $2 }')
322 # for interop with old server
323 [ -z "$repaired" ] &&
324 repaired=$($SHOW_NAMESPACE |
325 awk '/^updated_phase2/ { print $2 }')
327 [ $repaired -eq 1 ] ||
328 error "(5) Fail to repair crashed linkEA: $repaired"
# Mount the client; stat must show a link count of 1.
332 mount_client $MOUNT || error "(6) Fail to start client!"
334 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
335 error "(7) Fail to stat $DIR/$tdir/dummy"
# path2fid followed by fid2path must give back the original path.
337 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
338 local dummyname=$($LFS fid2path $DIR $dummyfid)
339 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
340 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
342 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
# Body of the test registered below with run_test 2b: create a file while
# fail_loc 0x1604 is set, run the namespace LFSCK, expect updated_phase2
# to be 1 and check that the path and FID of the file map to each other.
# NOTE(review): the function header and some lines are not visible here.
348 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
349 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
350 touch $DIR/$tdir/dummy
352 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Run a reset LFSCK and wait for the completed status.
354 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
355 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
356 mdd.${MDT_DEV}.lfsck_namespace |
357 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
359 error "(4) unexpected status"
# Exactly one update is expected in the second phase.
362 local repaired=$($SHOW_NAMESPACE |
363 awk '/^updated_phase2/ { print $2 }')
364 [ $repaired -eq 1 ] ||
365 error "(5) Fail to repair crashed linkEA: $repaired"
# Mount the client; stat must show a link count of 1.
369 mount_client $MOUNT || error "(6) Fail to start client!"
371 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
372 error "(7) Fail to stat $DIR/$tdir/dummy"
# path2fid followed by fid2path must give back the original path.
374 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
375 local dummyname=$($LFS fid2path $DIR $dummyfid)
376 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
377 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
379 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
# Body of the test registered below with run_test 2c: create a file while
# fail_loc 0x1605 is set, run the namespace LFSCK, expect updated_phase2
# to be 1 and check that the path and FID of the file map to each other.
# NOTE(review): the function header and some lines are not visible here.
385 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
386 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
387 touch $DIR/$tdir/dummy
389 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Run a reset LFSCK and wait for the completed status.
391 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
392 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
393 mdd.${MDT_DEV}.lfsck_namespace |
394 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
396 error "(4) unexpected status"
# Exactly one update is expected in the second phase.
399 local repaired=$($SHOW_NAMESPACE |
400 awk '/^updated_phase2/ { print $2 }')
401 [ $repaired -eq 1 ] ||
402 error "(5) Fail to repair crashed linkEA: $repaired"
# Mount the client; stat must show a link count of 1.
406 mount_client $MOUNT || error "(6) Fail to start client!"
408 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
409 error "(7) Fail to stat $DIR/$tdir/dummy"
# path2fid followed by fid2path must give back the original path.
411 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
412 local dummyname=$($LFS fid2path $DIR $dummyfid)
413 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
414 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
416 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
# Body of the test registered below with run_test 2d: create a file while
# fail_loc 0x161d is set, run the namespace LFSCK, expect linkea_repaired
# to be 1 and check that the path and FID of the file map to each other.
# NOTE(review): the function header and some lines are not visible here.
422 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
423 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
424 touch $DIR/$tdir/dummy
426 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Run a reset LFSCK and wait for the completed status.
428 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
429 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
430 mdd.${MDT_DEV}.lfsck_namespace |
431 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
433 error "(4) unexpected status"
# Exactly one linkEA repair is expected.
436 local repaired=$($SHOW_NAMESPACE |
437 awk '/^linkea_repaired/ { print $2 }')
438 [ $repaired -eq 1 ] ||
439 error "(5) Fail to repair crashed linkEA: $repaired"
# Mount the client; stat must show a link count of 1.
443 mount_client $MOUNT || error "(6) Fail to start client!"
445 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
446 error "(7) Fail to stat $DIR/$tdir/dummy"
# path2fid followed by fid2path must give back the original path.
448 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
449 local dummyname=$($LFS fid2path $DIR $dummyfid)
450 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
451 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
453 run_test 2d "LFSCK can recover the missing linkEA entry"
# Body of the test registered below with run_test 2e: create a directory
# on one MDT and a child on another while fail_loc 0x1603 is set, run the
# namespace LFSCK with -r -A and expect one repaired linkEA.
# Skipped when fewer than two MDSes are configured.
# NOTE(review): the function header and some lines are not visible here.
457 [ $MDSCOUNT -lt 2 ] &&
458 skip "We need at least 2 MDSes for this test" && return
# d0 is created with MDT index 1, d1 below it with MDT index 0.
462 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
464 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
465 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
466 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
467 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Start LFSCK and require all MDTs to report the completed status.
469 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
471 wait_all_targets_blocked namespace completed 4
# Exactly one linkEA repair is expected.
473 local repaired=$($SHOW_NAMESPACE |
474 awk '/^linkea_repaired/ { print $2 }')
475 [ $repaired -eq 1 ] ||
476 error "(5) Fail to repair crashed linkEA: $repaired"
# path2fid followed by fid2path must give back the original path.
478 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
479 local name=$($LFS fid2path $DIR $fid)
480 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
481 error "(6) Fail to repair linkEA: $fid $name"
483 run_test 2e "namespace LFSCK can verify remote object linkEA"
# Body of the test registered below with run_test 3: build hard links,
# two of them while fail_loc values are set, run the namespace LFSCK and
# check the checked_phase2 and multiple_linked_repaired counters.
# NOTE(review): the function header and some lines are not visible here;
# d0/f0 and d0/f1 are expected to exist already (created outside this view).
489 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
490 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
491 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
# Create edir with MDT index 0 and two files inside it.
493 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
494 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
495 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
# Link w0 with fail_loc 0x1603 set and w1 with fail_loc 0x1604 set.
497 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
498 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
499 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
501 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
502 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
503 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
505 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Run a reset LFSCK and wait for the completed status.
507 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
508 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
509 mdd.${MDT_DEV}.lfsck_namespace |
510 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
512 error "(10) unexpected status"
# At least 4 objects must have been checked in the second phase.
515 local checked=$($SHOW_NAMESPACE |
516 awk '/^checked_phase2/ { print $2 }')
517 [ $checked -ge 4 ] ||
518 error "(11) Fail to check multiple-linked object: $checked"
# At least 2 multiple-linked objects must have been repaired.
520 local repaired=$($SHOW_NAMESPACE |
521 awk '/^multiple_linked_repaired/ { print $2 }')
522 [ $repaired -ge 2 ] ||
523 error "(12) Fail to repair multiple-linked object: $repaired"
525 run_test 3 "LFSCK can verify multiple-linked objects"
# Body of the test registered below with run_test 4: run
# mds_backup_restore on the MDS, restart it with the noscrub options, run
# the namespace LFSCK and check flags, status and the repair counter.
# Skipped when the MDS backend is not ldiskfs.
# NOTE(review): the function header and some lines are not visible here.
529 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
530 skip "OI Scrub not implemented for ZFS" && return
# Unmount the client and stop the MDS before the backup/restore step.
533 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
534 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
536 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
537 echo "start $SINGLEMDS with disabling OI scrub"
538 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
539 error "(2) Fail to start MDS!"
# With fail_loc 0x1601 and fail_val=1 set, start LFSCK and wait until the
# flags field reads inconsistent.
541 #define OBD_FAIL_LFSCK_DELAY2 0x1601
542 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
543 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
544 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
545 mdd.${MDT_DEV}.lfsck_namespace |
546 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
548 error "(5) unexpected status"
551 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
552 [ "$STATUS" == "scanning-phase1" ] ||
553 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the fail_loc and wait for the completed status.
555 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
556 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
557 mdd.${MDT_DEV}.lfsck_namespace |
558 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
560 error "(7) unexpected status"
# After completion the flags field must be empty.
563 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
564 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
# Read dirent_repaired, falling back to updated_phase1 when the first
# counter is absent; at least 9 repairs are expected.
566 local repaired=$($SHOW_NAMESPACE |
567 awk '/^dirent_repaired/ { print $2 }')
568 # for interop with old server
569 [ -z "$repaired" ] &&
570 repaired=$($SHOW_NAMESPACE |
571 awk '/^updated_phase1/ { print $2 }')
573 [ $repaired -ge 9 ] ||
574 error "(9) Fail to re-generate FID-in-dirent: $repaired"
# Mount the client and list the directory while fail_loc 0x1505 is set.
578 mount_client $MOUNT || error "(10) Fail to start client!"
580 #define OBD_FAIL_FID_LOOKUP 0x1505
581 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
582 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
583 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
585 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
# Body of the test registered below with run_test 5: run
# mds_backup_restore with an extra argument of 1, restart the MDS with the
# noscrub options, run the namespace LFSCK and check flags, status, the
# repair counter and the path/FID mapping of the dummy file.
# Skipped when the MDS backend is not ldiskfs.
# NOTE(review): the function header and some lines are not visible here;
# the meaning of the second mds_backup_restore argument is not shown.
589 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
590 skip "OI Scrub not implemented for ZFS" && return
# Unmount the client and stop the MDS before the backup/restore step.
593 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
594 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
596 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
597 echo "start $SINGLEMDS with disabling OI scrub"
598 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
599 error "(2) Fail to start MDS!"
# With fail_loc 0x1601 and fail_val=1 set, start LFSCK and wait until the
# flags field reads inconsistent,upgrade.
601 #define OBD_FAIL_LFSCK_DELAY2 0x1601
602 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
603 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
604 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
605 mdd.${MDT_DEV}.lfsck_namespace |
606 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
608 error "(5) unexpected status"
611 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
612 [ "$STATUS" == "scanning-phase1" ] ||
613 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Clear the fail_loc and wait for the completed status.
615 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
616 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
617 mdd.${MDT_DEV}.lfsck_namespace |
618 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
620 error "(7) unexpected status"
# After completion the flags field must be empty.
623 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
624 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
# Read dirent_repaired, falling back to updated_phase1 when the first
# counter is absent; at least 2 repairs are expected.
626 local repaired=$($SHOW_NAMESPACE |
627 awk '/^dirent_repaired/ { print $2 }')
628 # for interop with old server
629 [ -z "$repaired" ] &&
630 repaired=$($SHOW_NAMESPACE |
631 awk '/^updated_phase1/ { print $2 }')
633 [ $repaired -ge 2 ] ||
634 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
# Mount the client, then stat the file and list the directory while
# fail_loc 0x1505 is set.
638 mount_client $MOUNT || error "(10) Fail to start client!"
640 #define OBD_FAIL_FID_LOOKUP 0x1505
641 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
642 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
644 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
# path2fid followed by fid2path must give back the original path.
646 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
647 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
648 local dummyname=$($LFS fid2path $DIR $dummyfid)
649 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
650 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
652 run_test 5 "LFSCK can handle IGIF object upgrading"
# Body of the test registered below with run_test 6a: let the namespace
# LFSCK fail during the scan, restart it without a reset, and require the
# new start position to be larger than the last checkpoint position.
# NOTE(review): the function header, the sleep mentioned in the comment
# below and the tails of the POS0/POS1 pipelines are not visible in this
# listing - confirm against the full file.
657 #define OBD_FAIL_LFSCK_DELAY1 0x1600
658 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
659 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
661 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
662 [ "$STATUS" == "scanning-phase1" ] ||
663 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
665 # Sleep 3 sec to guarantee at least one object processed by LFSCK
667 # Fail the LFSCK to guarantee there is at least one checkpoint
668 #define OBD_FAIL_LFSCK_FATAL1 0x1608
669 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
670 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
671 mdd.${MDT_DEV}.lfsck_namespace |
672 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
674 error "(4) unexpected status"
# POS0 is taken from the last_checkpoint_position line.
677 local POS0=$($SHOW_NAMESPACE |
678 awk '/^last_checkpoint_position/ { print $2 }' |
681 #define OBD_FAIL_LFSCK_DELAY1 0x1600
682 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
683 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
685 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
686 [ "$STATUS" == "scanning-phase1" ] ||
687 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# POS1 is taken from the latest_start_position line and must be larger
# than POS0.
689 local POS1=$($SHOW_NAMESPACE |
690 awk '/^latest_start_position/ { print $2 }' |
692 [[ $POS0 -lt $POS1 ]] ||
693 error "(7) Expect larger than: $POS0, but got $POS1"
# Clear the fail_loc and wait for the completed status.
695 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
696 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
697 mdd.${MDT_DEV}.lfsck_namespace |
698 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
700 error "(8) unexpected status"
703 run_test 6a "LFSCK resumes from last checkpoint (1)"
# Body of the test registered below with run_test 6b: same idea as the
# preceding checkpoint test, but with fail_loc 0x1601 and 0x80001609 and
# with two position values (fields 2 and 4 of the position lines).
# NOTE(review): the function header, the sleep mentioned in the comment
# below, the tails of the O_POS pipelines and the else line of the final
# comparison are not visible in this listing - confirm.
708 #define OBD_FAIL_LFSCK_DELAY2 0x1601
709 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
710 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
712 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
713 [ "$STATUS" == "scanning-phase1" ] ||
714 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
716 # Sleep 5 sec to guarantee that we are in the directory scanning
718 # Fail the LFSCK to guarantee there is at least one checkpoint
719 #define OBD_FAIL_LFSCK_FATAL2 0x1609
720 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
721 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
722 mdd.${MDT_DEV}.lfsck_namespace |
723 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
725 error "(4) unexpected status"
# O_POS0 and D_POS0 come from fields 2 and 4 of last_checkpoint_position.
728 local O_POS0=$($SHOW_NAMESPACE |
729 awk '/^last_checkpoint_position/ { print $2 }' |
732 local D_POS0=$($SHOW_NAMESPACE |
733 awk '/^last_checkpoint_position/ { print $4 }')
735 #define OBD_FAIL_LFSCK_DELAY2 0x1601
736 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
737 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
739 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
740 [ "$STATUS" == "scanning-phase1" ] ||
741 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# O_POS1 and D_POS1 come from fields 2 and 4 of latest_start_position.
743 local O_POS1=$($SHOW_NAMESPACE |
744 awk '/^latest_start_position/ { print $2 }' |
746 local D_POS1=$($SHOW_NAMESPACE |
747 awk '/^latest_start_position/ { print $4 }')
# When either D_POS value is N/A the O_POS values are compared; the D_POS
# comparison presumably sits in the else branch - TODO confirm.
749 if [ "$D_POS0" == "N/A" -o "$D_POS1" == "N/A" ]; then
750 [[ $O_POS0 -lt $O_POS1 ]] ||
751 error "(7.1) $O_POS1 is not larger than $O_POS0"
753 [[ $D_POS0 -lt $D_POS1 ]] ||
754 error "(7.2) $D_POS1 is not larger than $D_POS0"
# Clear the fail_loc and wait for the completed status.
757 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
758 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
759 mdd.${MDT_DEV}.lfsck_namespace |
760 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
762 error "(8) unexpected status"
765 run_test 6b "LFSCK resumes from last checkpoint (2)"
# Body of the test registered below with run_test 7a: stop the MDS while
# the namespace LFSCK is in scanning-phase1, start the MDS again and
# expect the LFSCK status to reach completed without an explicit start.
# NOTE(review): the function header, the sleep mentioned in the comment
# below and some closing lines are not visible in this listing.
772 #define OBD_FAIL_LFSCK_DELAY2 0x1601
773 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
774 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
776 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
777 [ "$STATUS" == "scanning-phase1" ] ||
778 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
780 # Sleep 3 sec to guarantee at least one object processed by LFSCK
# Stop the MDS, clear the fail_loc, and start the MDS with the default
# scrub mount options.
782 echo "stop $SINGLEMDS"
783 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop MDS!"
785 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
786 echo "start $SINGLEMDS"
787 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
788 error "(5) Fail to start MDS!"
# No lfsck_start is issued here; the status must still reach completed.
790 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
791 mdd.${MDT_DEV}.lfsck_namespace |
792 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
794 error "(6) unexpected status"
797 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
# Body of the test registered below with run_test 7b: create 20 files
# with fail_loc 0x1604 set, stop the MDS while the namespace LFSCK is in
# scanning-phase2, start the MDS again and expect the status to reach
# completed without an explicit start.
# NOTE(review): the function header and some closing lines (for example
# the done of the loop) are not visible in this listing.
803 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
804 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
805 for ((i = 0; i < 20; i++)); do
806 touch $DIR/$tdir/dummy${i}
# With fail_loc 0x1602 and fail_val=1 set, start LFSCK and wait for the
# scanning-phase2 status.
809 #define OBD_FAIL_LFSCK_DELAY3 0x1602
810 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
811 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
812 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
813 mdd.${MDT_DEV}.lfsck_namespace |
814 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
816 error "(4) unexpected status"
# Stop the MDS, clear the fail_loc, and start the MDS with the default
# scrub mount options.
820 echo "stop $SINGLEMDS"
821 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop MDS!"
823 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
824 echo "start $SINGLEMDS"
825 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
826 error "(6) Fail to start MDS!"
# No lfsck_start is issued here; the status must still reach completed.
828 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
829 mdd.${MDT_DEV}.lfsck_namespace |
830 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
832 error "(7) unexpected status"
835 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
# Body of the test registered below with run_test 8 (LFSCK state machine):
# walk the namespace LFSCK through the init, scanning-phase1, stopped,
# failed, crashed, paused, scanning-phase2 and completed states and check
# the reported status or flags after each transition.
# NOTE(review): the function header, the setup between formatall and the
# first status check, the initialisation and increment of timer in the
# recovery loops and several closing lines are not visible in this
# listing - confirm against the full file.
840 formatall > /dev/null
# Before any LFSCK run the status must read init.
846 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
847 [ "$STATUS" == "init" ] ||
848 error "(2) Expect 'init', but got '$STATUS'"
# Create one directory with fail_loc 0x1603 set and five files with
# fail_loc 0x1604 set, then unmount the client.
850 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
851 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
852 mkdir $DIR/$tdir/crashed
854 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
855 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
856 for ((i = 0; i < 5; i++)); do
857 touch $DIR/$tdir/dummy${i}
860 umount_client $MOUNT || error "(3) Fail to stop client!"
# init to scanning-phase1: start with fail_loc 0x1601 and fail_val=2.
862 #define OBD_FAIL_LFSCK_DELAY2 0x1601
863 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
864 $START_NAMESPACE || error "(4) Fail to start LFSCK for namespace!"
866 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
867 [ "$STATUS" == "scanning-phase1" ] ||
868 error "(5) Expect 'scanning-phase1', but got '$STATUS'"
# scanning-phase1 to stopped.
870 $STOP_LFSCK || error "(6) Fail to stop LFSCK!"
872 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
873 [ "$STATUS" == "stopped" ] ||
874 error "(7) Expect 'stopped', but got '$STATUS'"
# stopped to scanning-phase1.
876 $START_NAMESPACE || error "(8) Fail to start LFSCK for namespace!"
878 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
879 [ "$STATUS" == "scanning-phase1" ] ||
880 error "(9) Expect 'scanning-phase1', but got '$STATUS'"
# scanning-phase1 to failed, using fail_loc 0x80001609.
882 #define OBD_FAIL_LFSCK_FATAL2 0x1609
883 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
884 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
885 mdd.${MDT_DEV}.lfsck_namespace |
886 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
888 error "(10) unexpected status"
# failed to scanning-phase1, with fail_loc 0x1600 set.
891 #define OBD_FAIL_LFSCK_DELAY1 0x1600
892 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
893 $START_NAMESPACE || error "(11) Fail to start LFSCK for namespace!"
895 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
896 [ "$STATUS" == "scanning-phase1" ] ||
897 error "(12) Expect 'scanning-phase1', but got '$STATUS'"
# Set fail_loc 0x160a and stop the MDS.
899 #define OBD_FAIL_LFSCK_CRASH 0x160a
900 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
903 echo "stop $SINGLEMDS"
904 stop $SINGLEMDS > /dev/null || error "(13) Fail to stop MDS!"
# Start the MDS with fail_loc 0x160b set.
906 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
907 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
909 echo "start $SINGLEMDS"
910 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
911 error "(14) Fail to start MDS!"
# Poll recovery_status until it no longer reads RECOVERING or until timer
# reaches the value returned by max_recovery_time.
913 local timeout=$(max_recovery_time)
916 while [ $timer -lt $timeout ]; do
917 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
918 mdt.${MDT_DEV}.recovery_status |
919 awk '/^status/ { print \\\$2 }'")
920 [ "$STATUS" != "RECOVERING" ] && break;
925 [ $timer != $timeout ] ||
926 error "(14.1) recovery timeout"
# After this restart the LFSCK status must read crashed.
928 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
929 [ "$STATUS" == "crashed" ] ||
930 error "(15) Expect 'crashed', but got '$STATUS'"
# crashed to scanning-phase1, with fail_loc 0x1601 set.
932 #define OBD_FAIL_LFSCK_DELAY2 0x1601
933 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
934 $START_NAMESPACE || error "(16) Fail to start LFSCK for namespace!"
936 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
937 [ "$STATUS" == "scanning-phase1" ] ||
938 error "(17) Expect 'scanning-phase1', but got '$STATUS'"
# Stop and start the MDS again with fail_loc 0x160b set.
940 echo "stop $SINGLEMDS"
941 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop MDS!"
943 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
944 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
946 echo "start $SINGLEMDS"
947 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
948 error "(19) Fail to start MDS!"
# Wait for recovery to finish, as above.
951 while [ $timer -lt $timeout ]; do
952 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
953 mdt.${MDT_DEV}.recovery_status |
954 awk '/^status/ { print \\\$2 }'")
955 [ "$STATUS" != "RECOVERING" ] && break;
960 [ $timer != $timeout ] ||
961 error "(19.1) recovery timeout"
# After this restart the LFSCK status must read paused.
963 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
964 [ "$STATUS" == "paused" ] ||
965 error "(20) Expect 'paused', but got '$STATUS'"
# Restart the MDS with the skip_lfsck mount options; the status must stay
# paused.
967 echo "stop $SINGLEMDS"
968 stop $SINGLEMDS > /dev/null || error "(20.1) Fail to stop MDS!"
970 echo "start $SINGLEMDS without resume LFSCK"
971 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SKIP_LFSCK > /dev/null ||
972 error "(20.2) Fail to start MDS!"
975 while [ $timer -lt $timeout ]; do
976 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
977 mdt.${MDT_DEV}.recovery_status |
978 awk '/^status/ { print \\\$2 }'")
979 [ "$STATUS" != "RECOVERING" ] && break;
984 [ $timer != $timeout ] ||
985 error "(20.3) recovery timeout"
987 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
988 [ "$STATUS" == "paused" ] ||
989 error "(20.4) Expect 'paused', but got '$STATUS'"
# paused to scanning-phase2, with fail_loc 0x1602 and fail_val=2 set.
991 #define OBD_FAIL_LFSCK_DELAY3 0x1602
992 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
994 $START_NAMESPACE || error "(21) Fail to start LFSCK for namespace!"
995 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
996 mdd.${MDT_DEV}.lfsck_namespace |
997 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
999 error "(22) unexpected status"
# In the second phase the flags must read scanned-once,inconsistent.
1002 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1003 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
1004 error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
# scanning-phase2 to completed; afterwards the flags must be empty.
1006 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1007 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1008 mdd.${MDT_DEV}.lfsck_namespace |
1009 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1011 error "(24) unexpected status"
1014 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1015 [ -z "$FLAGS" ] || error "(25) Expect empty flags, but got '$FLAGS'"
1017 run_test 8 "LFSCK state machine"
# Body of the test registered below with run_test 9a (LFSCK speed
# control): run the layout LFSCK with a speed limit, raise the limit while
# it runs, and compare average_speed_phase1 with computed bounds.
# NOTE(review): the function header, the assignments of server_version,
# RUN_TIME1, RUN_TIME2 and TIME_DIFF, the sleeps between measurements and
# several closing lines are not visible in this listing - confirm.
# Skip when /proc/cpuinfo has no processor line with index 1.
1020 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1021 skip "Testing on UP system, the speed may be inaccurate."
1025 [[ $server_version -ge $(version_code 2.7.50) ]] ||
1026 { skip "Need MDS version >= 2.7.50"; return; }
# Create 5000 files under a directory made with MDT index 0.
1028 check_mount_and_prep
1029 $LFS mkdir -i 0 $DIR/$tdir/lfsck || error "(1) Fail to mkdir lfsck"
1030 $LFS setstripe -c 1 -i -1 $DIR/$tdir/lfsck
1031 createmany -o $DIR/$tdir/lfsck/f 5000
# Start the layout LFSCK with -s set to the first speed limit.
1033 local BASE_SPEED1=100
1035 $START_LAYOUT -r -s $BASE_SPEED1 || error "(2) Fail to start LFSCK!"
1038 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
1039 [ "$STATUS" == "scanning-phase1" ] ||
1040 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
1042 local SPEED=$($SHOW_LAYOUT |
1043 awk '/^average_speed_phase1/ { print $2 }')
1045 # There may be time error, normally it should be less than 2 seconds.
1046 # We allow another 20% schedule error.
# Upper bound for the first interval, in integer arithmetic.
1048 # MAX_MARGIN = 1.2 = 12 / 10
1049 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1050 RUN_TIME1 * 12 / 10))
1051 [ $SPEED -lt $MAX_SPEED ] ||
1052 error "(4) Got speed $SPEED, expected less than $MAX_SPEED"
1054 # adjust speed limit
1055 local BASE_SPEED2=300
1057 do_facet $SINGLEMDS \
1058 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
# Lower bound over both intervals; on non-ldiskfs backends a miss is
# reported through error_ignore with LU-5624.
1061 SPEED=$($SHOW_LAYOUT | awk '/^average_speed_phase1/ { print $2 }')
1062 # MIN_MARGIN = 0.8 = 8 / 10
1063 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1064 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1065 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
1066 [ $SPEED -gt $MIN_SPEED ] || {
1067 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1068 error_ignore LU-5624 \
1069 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
1072 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
# Upper bound over both intervals.
1076 # MAX_MARGIN = 1.2 = 12 / 10
1077 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1078 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1079 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
1080 [ $SPEED -lt $MAX_SPEED ] ||
1081 error "(6) Got speed $SPEED, expected less than $MAX_SPEED"
# Set the speed limit to 0 and wait for the completed status.
1083 do_facet $SINGLEMDS \
1084 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1086 wait_update_facet $SINGLEMDS \
1087 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout |
1088 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
1089 error "(7) Failed to get expected 'completed'"
1091 run_test 9a "LFSCK speed control (1)"
# Test 9b (registered by the run_test line at the end of this block): checks
# the speed limit of the namespace LFSCK while it is in its second scanning
# phase.
# NOTE(review): the embedded line numbers have gaps, so some original lines
# are not visible in this view: the function header, closing fi/done/} tokens,
# and whatever sets server_version, RUN_TIME1, RUN_TIME2 and TIME_DIFF.
#
# Skip when /proc/cpuinfo on the node running this script has no
# "processor ... : 1" entry (the skip message calls this a UP system).
1094 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1095 skip "Testing on UP system, the speed may be inaccurate."
# Skip unless server_version is at least 2.7.50.
1099 [[ $server_version -ge $(version_code 2.7.50) ]] ||
1100 { skip "Need MDS version >= 2.7.50"; return; }
# Create the test objects while fail_loc 0x1604 is set on the MDS: 50
# directories, 50 files, and 50 more files inside each d${i}.
1104 echo "Preparing another 50 * 50 files (with error) at $(date)."
1105 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1106 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1107 createmany -d $DIR/$tdir/d 50
1108 createmany -m $DIR/$tdir/f 50
1109 for ((i = 0; i < 50; i++)); do
1110 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
# With fail_loc 0x160c set, a namespace LFSCK started with -r is expected to
# end in status "stopped" (last wait_update_facet argument: 10).
1113 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
1114 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1115 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
1116 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1117 mdd.${MDT_DEV}.lfsck_namespace |
1118 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1120 error "(5) unexpected status"
# Clear the failure injection before the measured run.
1123 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1124 echo "Prepared at $(date)."
# Start again, this time without -r, using BASE_SPEED1 as the speed limit.
1126 local BASE_SPEED1=50
1128 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
# The namespace LFSCK is expected to be in phase 2 when sampled.
# NOTE(review): no `local` declaration for STATUS is visible in this view.
1131 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1132 [ "$STATUS" == "scanning-phase2" ] ||
1133 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
# Read the average phase-2 speed from the $SHOW_NAMESPACE output
# ($SHOW_NAMESPACE is defined outside this view).
1135 local SPEED=$($SHOW_NAMESPACE |
1136 awk '/^average_speed_phase2/ { print $2 }')
1137 # There may be time error, normally it should be less than 2 seconds.
1138 # We allow another 20% schedule error.
1140 # MAX_MARGIN = 1.2 = 12 / 10
# Integer arithmetic, evaluated left to right:
# BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / RUN_TIME1 * 12 / 10.
1141 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1142 RUN_TIME1 * 12 / 10))
1143 [ $SPEED -lt $MAX_SPEED ] ||
1144 error "(8) Got speed $SPEED, expected less than $MAX_SPEED"
1146 # adjust speed limit
1147 local BASE_SPEED2=150
1149 do_facet $SINGLEMDS \
1150 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
# Re-sample the average speed after the limit was changed to BASE_SPEED2.
1153 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
1154 # MIN_MARGIN = 0.8 = 8 / 10
# Lower bound: both limits weighted by their run times, each run time reduced
# by TIME_DIFF, divided by the total run time and scaled by 8/10.
1155 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1156 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1157 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
# Below the lower bound: when the MDS backend is not ldiskfs the failure is
# reported through error_ignore with LU-5624. The statement that emits the
# (9.2) message is not visible in this view.
1158 [ $SPEED -gt $MIN_SPEED ] || {
1159 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1160 error_ignore LU-5624 \
1161 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1164 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
1168 # MAX_MARGIN = 1.2 = 12 / 10
# Upper bound: same weighting as above with each run time increased by
# TIME_DIFF, scaled by 12/10.
1169 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1170 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1171 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
1172 [ $SPEED -lt $MAX_SPEED ] ||
1173 error "(10) Got speed $SPEED, expected less than $MAX_SPEED"
# Set lfsck_speed_limit to 0, then wait for the namespace LFSCK status to
# become "completed" (last wait_update_facet argument: 32).
1175 do_facet $SINGLEMDS \
1176 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1177 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1178 mdd.${MDT_DEV}.lfsck_namespace |
1179 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1181 error "(11) unexpected status"
1184 run_test 9b "LFSCK speed control (2)"
# Test 10 (registered by the run_test line at the end of this block): runs
# ordinary file operations on the client while a rate-limited namespace LFSCK
# is in phase 1, and requires every operation to succeed.
# NOTE(review): the embedded line numbers have gaps, so some original lines
# (function header, closing done/} tokens and other short statements) are not
# visible in this view.
#
# Only runs when the MDS backend is ldiskfs.
1188 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
1189 skip "lookup(..)/linkea on ZFS issue" && return
# Even-numbered d${i}/f${i} entries are created while fail_loc 0x1603 is set.
1193 echo "Preparing more files with error at $(date)."
1194 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1195 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
1197 for ((i = 0; i < 1000; i = $((i+2)))); do
1198 mkdir -p $DIR/$tdir/d${i}
1199 touch $DIR/$tdir/f${i}
1200 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
# Odd-numbered d${i}/f${i} entries are created while fail_loc 0x1604 is set.
1203 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1204 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1206 for ((i = 1; i < 1000; i = $((i+2)))); do
1207 mkdir -p $DIR/$tdir/d${i}
1208 touch $DIR/$tdir/f${i}
1209 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
# Clear the failure injection.
1212 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1213 echo "Prepared at $(date)."
# Add a second name for f200 (it is unlinked later, during the scan).
1215 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
# Remount the client before starting the scan.
1217 umount_client $MOUNT
1218 mount_client $MOUNT || error "(3) Fail to start client!"
# Start the namespace LFSCK with -r and a speed limit of 100.
1220 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
# NOTE(review): no `local` declaration for STATUS is visible in this view.
1223 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1224 [ "$STATUS" == "scanning-phase1" ] ||
1225 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Operations issued during the scan: recursive listing, create, mkdir, unlink,
# recursive remove, rename, hard link and symbolic link.
1227 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1229 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1231 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1233 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1235 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1237 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1239 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1241 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1242 error "(14) Fail to softlink!"
# The scan is expected to still be in phase 1 after those operations.
1244 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1245 [ "$STATUS" == "scanning-phase1" ] ||
1246 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
# Set lfsck_speed_limit to 0, then wait for status "completed" (last
# wait_update_facet argument: 32).
1248 do_facet $SINGLEMDS \
1249 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1250 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1251 mdd.${MDT_DEV}.lfsck_namespace |
1252 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1254 error "(16) unexpected status"
1257 run_test 10 "System is available during LFSCK scanning"
# Mounts the backing filesystem of ost${ost} with mount_fstype, removes
# O/${idx}/LAST_ID and O/${idx}/d0/0 under that OST's mount point, and
# unmounts it again.
# Returns 1 if mount_fstype fails and 2 if unmount_fstype fails.
# NOTE(review): the lines assigning `ost` and `idx` are not visible in this
# view (gaps in the embedded line numbers); presumably they come from $1 and
# $2 - confirm. The closing brace and any final return are not visible either.
1260 ost_remove_lastid() {
# Command prefix that runs the given command on facet ost${ost}.
1263 local rcmd="do_facet ost${ost}"
1265 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1267 # step 1: local mount
1268 mount_fstype ost${ost} || return 1
1269 # step 2: remove the specified LAST_ID
# The {LAST_ID,d0/0} brace expansion is performed by the calling shell, so rm
# receives two paths. The rm exit status is not checked.
1270 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
1272 unmount_fstype ost${ost} || return 2
# Test 11a (registered by the run_test line at the end of this block): removes
# LAST_ID on ost1 and expects a layout LFSCK run on the OST to finish with
# empty flags.
# NOTE(review): the embedded line numbers have gaps, so some original lines
# (function header, closing } tokens and the statements between the file
# creation and the LAST_ID removal) are not visible in this view.
#
# Create 64 files with a stripe count of 1 starting at OST index 0.
1276 check_mount_and_prep
1277 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1278 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
# Remove LAST_ID via ost_remove_lastid with arguments 1 and 0.
1283 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
# Start ost1 with $MOUNT_OPTS_NOSCRUB (defined outside this view).
1285 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1286 error "(2) Fail to start ost1"
# fail_loc 0x160e with fail_val=3 is set on ost1 before the LFSCK starts.
1288 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1289 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1291 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1292 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
# Wait for the flags field to read "crashed_lastid" (last wait_update_facet
# argument: 60).
1294 wait_update_facet ost1 "$LCTL get_param -n \
1295 obdfilter.${OST_DEV}.lfsck_layout |
1296 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1298 error "(5) unexpected status"
# Clear the failure injection, then wait for status "completed".
1301 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1303 wait_update_facet ost1 "$LCTL get_param -n \
1304 obdfilter.${OST_DEV}.lfsck_layout |
1305 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1307 error "(6) unexpected status"
# After completion the flags field must be empty.
# NOTE(review): no `local` declaration for FLAGS is visible in this view;
# $SHOW_LAYOUT_ON_OST is defined outside this view.
1310 echo "the LAST_ID(s) should have been rebuilt"
1311 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1312 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
1314 run_test 11a "LFSCK can rebuild lost last_id"
# Test 11b (registered by the run_test line at the end of this block): makes
# the last_id reported by ost1 after a restart smaller than the value recorded
# before it, then expects a layout LFSCK on the OST to restore the recorded
# value.
# NOTE(review): the embedded line numbers have gaps, so some original lines
# (function header, closing done/} tokens and other short statements) are not
# visible in this view.
1317 check_mount_and_prep
1318 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1320 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1321 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1322 do_facet ost1 $LCTL set_param fail_loc=0x160d
# Create 32 more files than precreated_ost_obj_count reports for (0, 0).
1324 local count=$(precreated_ost_obj_count 0 0)
1326 createmany -o $DIR/$tdir/f $((count + 32))
# Read the sequence from the OSP prealloc_last_seq parameter on mds1, then the
# last_id value ost1 reports for that sequence (field after ':').
# ${ost1_svc} is defined outside this view.
1328 local proc_path="${FSNAME}-OST0000-osc-MDT0000"
1329 local seq=$(do_facet mds1 $LCTL get_param -n \
1330 osp.${proc_path}.prealloc_last_seq)
1331 local lastid1=$(do_facet ost1 "lctl get_param -n \
1332 obdfilter.${ost1_svc}.last_id" | grep $seq |
1333 awk -F: '{ print $2 }')
# Unmount the client and stop ost1.
1335 umount_client $MOUNT
1336 stop ost1 || error "(1) Fail to stop ost1"
1338 # stop MDS to forget last precreated object
1339 echo "stop $SINGLEMDS"
1340 stop $SINGLEMDS > /dev/null || error "(11) Fail to stop MDS!"
# NOTE(review): this set_param is issued between the stop and the start of
# $SINGLEMDS - confirm that is intended.
1341 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1342 echo "start $SINGLEMDS"
1343 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
1344 error "(12) Fail to start MDS!"
# fail_loc 0x215 is set on ost1 before ost1 is started again.
1346 #define OBD_FAIL_OST_ENOSPC 0x215
1347 do_facet ost1 $LCTL set_param fail_loc=0x215
1349 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1350 error "(2) Fail to start ost1"
# Poll up to 60 times until ost1 reports a last_id for $seq.
# NOTE(review): $lastid2 is unquoted in the -z test and no `local` declaration
# for lastid2 is visible; any delay between iterations is not visible in this
# view either.
1352 for ((i = 0; i < 60; i++)); do
1353 lastid2=$(do_facet ost1 "lctl get_param -n \
1354 obdfilter.${ost1_svc}.last_id" | grep $seq |
1355 awk -F: '{ print $2 }')
1356 [ ! -z $lastid2 ] && break;
# The value read after the restart must be smaller than the one recorded
# before it.
1360 echo "the on-disk LAST_ID should be smaller than the expected one"
1361 [ $lastid1 -gt $lastid2 ] ||
1362 error "(4) expect lastid1 [ $lastid1 ] > lastid2 [ $lastid2 ]"
1364 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1365 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1367 wait_update_facet ost1 "$LCTL get_param -n \
1368 obdfilter.${OST_DEV}.lfsck_layout |
1369 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1371 error "(6) unexpected status"
# Restart ost1, then wait until it reports $lastid1 again for $seq.
1374 stop ost1 || error "(7) Fail to stop ost1"
1376 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1377 error "(8) Fail to start ost1"
1379 echo "the on-disk LAST_ID should have been rebuilt"
1380 wait_update_facet ost1 "$LCTL get_param -n \
1381 obdfilter.${ost1_svc}.last_id | grep $seq |
1382 awk -F: '{ print \\\$2 }'" "$lastid1" 60 || {
1383 do_facet ost1 $LCTL get_param -n \
1384 obdfilter.${ost1_svc}.last_id
1385 error "(9) expect lastid1 $seq:$lastid1"
# Clear the failure injection and stop all services.
1388 do_facet ost1 $LCTL set_param fail_loc=0
1389 stopall || error "(10) Fail to stopall"
1391 run_test 11b "LFSCK can rebuild crashed last_id"
# Test 12a (registered by the run_test line at the end of this block): starts
# and stops LFSCK on all targets with single lctl commands (-A), first for the
# namespace type and then for the layout type, checking the status after each
# step.
# NOTE(review): the embedded line numbers have gaps, so some original lines
# (function header, closing done/} tokens) are not visible in this view.
# The trailing number passed to wait_all_targets / wait_all_targets_blocked
# continues the (N) numbering of the error messages - presumably an error tag;
# confirm in those helpers.
#
# Needs at least two MDSes.
1394 [ $MDSCOUNT -lt 2 ] &&
1395 skip "We need at least 2 MDSes for test_12a" && return
# One subdirectory per k in 1..MDSCOUNT, created with `lfs mkdir -i (k - 1)`,
# each filled with 100 files.
1397 check_mount_and_prep
1398 for k in $(seq $MDSCOUNT); do
1399 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1400 createmany -o $DIR/$tdir/${k}/f 100 ||
1401 error "(0) Fail to create 100 files."
# Namespace round: start with -s 1, expect scanning-phase1; stop, expect
# stopped; restart with -s 0, expect completed.
1404 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1405 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1406 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1408 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1409 wait_all_targets namespace scanning-phase1 3
1411 echo "Stop namespace LFSCK on all targets by single lctl command."
1412 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1413 error "(4) Fail to stop LFSCK on all devices!"
1415 echo "All the LFSCK targets should be in 'stopped' status."
1416 wait_all_targets_blocked namespace stopped 5
1418 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1419 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1420 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1422 echo "All the LFSCK targets should be in 'completed' status."
1423 wait_all_targets_blocked namespace completed 7
# The layout round runs between start_full_debug_logging and
# stop_full_debug_logging.
1425 start_full_debug_logging
# Layout round: same start / stop / restart sequence as above.
1427 echo "Start layout LFSCK on all targets by single command (-s 1)."
1428 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1429 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1431 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1432 wait_all_targets layout scanning-phase1 9
1434 echo "Stop layout LFSCK on all targets by single lctl command."
1435 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1436 error "(10) Fail to stop LFSCK on all devices!"
1438 echo "All the LFSCK targets should be in 'stopped' status."
1439 wait_all_targets_blocked layout stopped 11
# Additionally read the layout LFSCK status of every OST directly; each must
# be "stopped".
1441 for k in $(seq $OSTCOUNT); do
1442 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1443 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1444 awk '/^status/ { print $2 }')
1445 [ "$STATUS" == "stopped" ] ||
1446 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1449 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1450 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1451 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1453 echo "All the LFSCK targets should be in 'completed' status."
1454 wait_all_targets_blocked layout completed 14
1456 stop_full_debug_logging
1458 run_test 12a "single command to trigger LFSCK on all devices"
# Test 12b (registered by the run_test line at the end of this block): starts
# LFSCK without naming a device via -M and checks the outcome.
# NOTE(review): the embedded line numbers have gaps, so some original lines
# (function header, closing fi/} tokens) are not visible in this view.
1461 check_mount_and_prep
# With -A and no -M, both the namespace and the layout LFSCK are expected to
# reach "completed".
1463 echo "Start LFSCK without '-M' specified."
1464 do_facet mds1 $LCTL lfsck_start -A -r ||
1465 error "(0) Fail to start LFSCK without '-M'"
1467 wait_all_targets_blocked namespace completed 1
1468 wait_all_targets_blocked layout completed 2
# Count the lines of `lctl dl` output on mds1 whose third column contains
# "mdt".
1470 local count=$(do_facet mds1 $LCTL dl |
1471 awk '{ print $3 }' | grep mdt | wc -l)
# When more than one such device is listed, a start without -M and without -A
# must fail.
# NOTE(review): "multipe" in the message below is a typo for "multiple".
# NOTE(review): in `cmd && error ... || true` the trailing `|| true` also
# applies to the status of error itself - confirm that is intended.
1472 if [ $count -gt 1 ]; then
1474 echo "Start layout LFSCK on the node with multipe targets,"
1475 echo "but not specify '-M'/'-A' option. Should get failure."
1477 do_facet mds1 $LCTL lfsck_start -t layout -r &&
1478 error "(3) Start layout LFSCK should fail" || true
1481 run_test 12b "auto detect Lustre device"
# Test 13 (registered by the run_test line at the end of this block): creates
# 32 files while fail_loc 0x160f is set on the MDS and expects the layout
# LFSCK to report exactly 32 in repaired_others.
# NOTE(review): the embedded line numbers have gaps, so some original lines
# (function header, closing } tokens) are not visible in this view.
1485 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1486 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1487 echo "MDT-object FID."
1490 check_mount_and_prep
# The failure injection is active only while the 32 files are created.
1492 echo "Inject failure stub to simulate bad lmm_oi"
1493 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1494 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
1495 createmany -o $DIR/$tdir/f 32
1496 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Run the layout LFSCK with -r and wait for status "completed".
1498 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1499 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1501 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1502 mdd.${MDT_DEV}.lfsck_layout |
1503 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1505 error "(2) unexpected status"
# repaired_others must equal the number of files created above.
1508 local repaired=$($SHOW_LAYOUT |
1509 awk '/^repaired_others/ { print $2 }')
1510 [ $repaired -eq 32 ] ||
1511 error "(3) Fail to repair crashed lmm_oi: $repaired"
1513 run_test 13 "LFSCK can repair crashed lmm_oi"
# Test 14a (registered by the run_test line at the end of this block): creates
# files while fail_loc 0x1610 is set on ost1, runs the layout LFSCK once with
# -r and once with -r -c, and checks repaired_dangling and the accessibility
# of the "guard" file after each run.
# NOTE(review): the embedded line numbers have gaps, so some original lines
# (function header, closing } tokens and other short statements) are not
# visible in this view.
1517 echo "The OST-object referenced by the MDT-object should be there;"
1518 echo "otherwise, the LFSCK should re-create the missing OST-object."
1519 echo "without '--delay-create-ostobj' option."
1522 check_mount_and_prep
1523 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# With the failure injection active, create (count + 31) files plus "guard",
# where count is the value returned by precreated_ost_obj_count 0 0.
1525 echo "Inject failure stub to simulate dangling referenced MDT-object"
1526 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1527 do_facet ost1 $LCTL set_param fail_loc=0x1610
1528 local count=$(precreated_ost_obj_count 0 0)
1530 createmany -o $DIR/$tdir/f $((count + 31))
1531 touch $DIR/$tdir/guard
1532 do_facet ost1 $LCTL set_param fail_loc=0
1534 start_full_debug_logging
1536 # exhaust other pre-created dangling cases
1537 count=$(precreated_ost_obj_count 0 0)
1538 createmany -o $DIR/$tdir/a $count ||
1539 error "(0) Fail to create $count files."
# A directory listing is expected to fail at this point.
1541 echo "'ls' should fail because of dangling referenced MDT-object"
1542 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
# First run: -r only.
1544 echo "Trigger layout LFSCK to find out dangling reference"
1545 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1547 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1548 mdd.${MDT_DEV}.lfsck_layout |
1549 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1551 error "(3) unexpected status"
# At least 32 dangling references must be counted.
1554 local repaired=$($SHOW_LAYOUT |
1555 awk '/^repaired_dangling/ { print $2 }')
1556 [ $repaired -ge 32 ] ||
1557 error "(4) Fail to repair dangling reference: $repaired"
# After the first run, stat on "guard" is still expected to fail.
1559 echo "'stat' should fail because of not repair dangling by default"
1560 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
# Second run: -r plus -c (presumably the create-OST-object option of
# lfsck_start; confirm against the lctl documentation).
1562 echo "Trigger layout LFSCK to repair dangling reference"
1563 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1565 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1566 mdd.${MDT_DEV}.lfsck_layout |
1567 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1569 error "(7) unexpected status"
1572 # There may be some async LFSCK updates in processing, wait for
1573 # a while until the target reparation has been done. LU-4970.
# Wait until stat on "guard" reports a Size of 0; on timeout the stat output
# is printed before the error.
1575 echo "'stat' should success after layout LFSCK repairing"
1576 wait_update_facet client "stat $DIR/$tdir/guard |
1577 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1578 stat $DIR/$tdir/guard
1580 error "(8) unexpected size"
# The second run must again report at least 32 in repaired_dangling.
1583 repaired=$($SHOW_LAYOUT |
1584 awk '/^repaired_dangling/ { print $2 }')
1585 [ $repaired -ge 32 ] ||
1586 error "(9) Fail to repair dangling reference: $repaired"
1588 stop_full_debug_logging
1590 run_test 14a "LFSCK can repair MDT-object with dangling LOV EA reference (1)"
# Test 14b (registered by the run_test line at the end of this block): same
# scenario as the dangling-reference test that uses fail_loc 0x1610, but the
# layout LFSCK is started with -o -d (and -c on the second run) and completion
# is awaited through wait_all_targets_blocked.
# NOTE(review): the embedded line numbers have gaps, so some original lines
# (function header, closing } tokens and other short statements) are not
# visible in this view.
1594 echo "The OST-object referenced by the MDT-object should be there;"
1595 echo "otherwise, the LFSCK should re-create the missing OST-object."
1596 echo "with '--delay-create-ostobj' option."
1599 check_mount_and_prep
1600 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# With the failure injection active, create (count + 31) files plus "guard",
# where count is the value returned by precreated_ost_obj_count 0 0.
1602 echo "Inject failure stub to simulate dangling referenced MDT-object"
1603 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1604 do_facet ost1 $LCTL set_param fail_loc=0x1610
1605 local count=$(precreated_ost_obj_count 0 0)
1607 createmany -o $DIR/$tdir/f $((count + 31))
1608 touch $DIR/$tdir/guard
1609 do_facet ost1 $LCTL set_param fail_loc=0
1611 start_full_debug_logging
1613 # exhaust other pre-created dangling cases
1614 count=$(precreated_ost_obj_count 0 0)
1615 createmany -o $DIR/$tdir/a $count ||
1616 error "(0) Fail to create $count files."
# A directory listing is expected to fail at this point.
1618 echo "'ls' should fail because of dangling referenced MDT-object"
1619 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
# First run: -r -o -d (the introductory message above refers to the
# '--delay-create-ostobj' option; presumably that is -d - confirm).
1621 echo "Trigger layout LFSCK to find out dangling reference"
1622 $START_LAYOUT -r -o -d || error "(2) Fail to start LFSCK for layout!"
1624 wait_all_targets_blocked layout completed 3
# At least 32 dangling references must be counted.
1626 local repaired=$($SHOW_LAYOUT |
1627 awk '/^repaired_dangling/ { print $2 }')
1628 [ $repaired -ge 32 ] ||
1629 error "(4) Fail to repair dangling reference: $repaired"
# After the first run, stat on "guard" is still expected to fail.
1631 echo "'stat' should fail because of not repair dangling by default"
1632 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
# Second run: adds -c to the options of the first run.
1634 echo "Trigger layout LFSCK to repair dangling reference"
1635 $START_LAYOUT -r -o -c -d || error "(6) Fail to start LFSCK for layout!"
1637 wait_all_targets_blocked layout completed 7
1639 # There may be some async LFSCK updates in processing, wait for
1640 # a while until the target reparation has been done. LU-4970.
# Wait until stat on "guard" reports a Size of 0; on timeout the stat output
# is printed before the error.
1642 echo "'stat' should success after layout LFSCK repairing"
1643 wait_update_facet client "stat $DIR/$tdir/guard |
1644 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1645 stat $DIR/$tdir/guard
1647 error "(8) unexpected size"
# The second run must again report at least 32 in repaired_dangling.
1650 repaired=$($SHOW_LAYOUT |
1651 awk '/^repaired_dangling/ { print $2 }')
1652 [ $repaired -ge 32 ] ||
1653 error "(9) Fail to repair dangling reference: $repaired"
1655 stop_full_debug_logging
1657 run_test 14b "LFSCK can repair MDT-object with dangling LOV EA reference (2)"
# Test 15a (registered by the run_test line at the end of this block): writes
# one file while fail_loc 0x1611 is set on ost1 and expects the layout LFSCK
# to report exactly 1 in repaired_unmatched_pair.
# NOTE(review): the embedded line numbers have gaps, so some original lines
# (function header, closing } tokens) are not visible in this view.
1661 echo "If the OST-object referenced by the MDT-object back points"
1662 echo "to some non-exist MDT-object, then the LFSCK should repair"
1663 echo "the OST-object to back point to the right MDT-object."
1666 check_mount_and_prep
1667 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1669 echo "Inject failure stub to make the OST-object to back point to"
1670 echo "non-exist MDT-object."
1671 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
# Write 1 MiB to f0 and cancel the osc LRU locks while the failure injection
# is active, then clear it.
1673 do_facet ost1 $LCTL set_param fail_loc=0x1611
1674 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1675 cancel_lru_locks osc
1676 do_facet ost1 $LCTL set_param fail_loc=0
# Run the layout LFSCK with -r and wait for status "completed".
1678 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1679 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1681 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1682 mdd.${MDT_DEV}.lfsck_layout |
1683 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1685 error "(2) unexpected status"
# Exactly one unmatched pair (f0) must have been repaired.
1688 local repaired=$($SHOW_LAYOUT |
1689 awk '/^repaired_unmatched_pair/ { print $2 }')
1690 [ $repaired -eq 1 ] ||
1691 error "(3) Fail to repair unmatched pair: $repaired"
1693 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# Test 15b (registered by the run_test line at the end of this block): writes
# a "guard" file normally, then writes f0 while fail_loc 0x1612 is set on
# ost1, and expects the layout LFSCK to report exactly 1 in
# repaired_unmatched_pair.
# NOTE(review): the embedded line numbers have gaps, so some original lines
# (function header, closing } tokens) are not visible in this view.
1697 echo "If the OST-object referenced by the MDT-object back points"
1698 echo "to other MDT-object that doesn't recognize the OST-object,"
1699 echo "then the LFSCK should repair it to back point to the right"
1700 echo "MDT-object (the first one)."
1703 check_mount_and_prep
1704 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# "guard" is written before the failure injection is enabled.
1705 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1706 cancel_lru_locks osc
1708 echo "Inject failure stub to make the OST-object to back point to"
1709 echo "other MDT-object"
1711 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
# f0 is written while the failure injection is active.
1712 do_facet ost1 $LCTL set_param fail_loc=0x1612
1713 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1714 cancel_lru_locks osc
1715 do_facet ost1 $LCTL set_param fail_loc=0
# Run the layout LFSCK with -r and wait for status "completed".
1717 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1718 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1720 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1721 mdd.${MDT_DEV}.lfsck_layout |
1722 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1724 error "(2) unexpected status"
# Exactly one unmatched pair must have been repaired.
1727 local repaired=$($SHOW_LAYOUT |
1728 awk '/^repaired_unmatched_pair/ { print $2 }')
1729 [ $repaired -eq 1 ] ||
1730 error "(3) Fail to repair unmatched pair: $repaired"
1732 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# Test 15c (registered by the run_test line at the end of this block): starts
# a directory migration that is delayed through fail_loc 0x1803 on mds2 and
# runs the layout LFSCK on all targets at the same time. It expects 1 in
# repaired_unmatched_pair and 0 in repaired_multiple_referenced.
# NOTE(review): the embedded line numbers have gaps, so some original lines
# (function header, closing } tokens and other short statements) are not
# visible in this view.
#
# Needs at least two MDSes, and only runs on MDS versions below 2.7.55.
1735 [ $MDSCOUNT -lt 2 ] &&
1736 skip "We need at least 2 MDSes for this test" && return
1738 [ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.55) ] &&
1739 skip "Skip the test after 2.7.55 see LU-6437" && return
1742 echo "According to current metadata migration implementation,"
1743 echo "before the old MDT-object is removed, both the new MDT-object"
1744 echo "and old MDT-object will reference the same LOV layout. Then if"
1745 echo "the layout LFSCK finds the new MDT-object by race, it will"
1746 echo "regard related OST-object(s) as multiple referenced case, and"
1747 echo "will try to create new OST-object(s) for the new MDT-object."
1748 echo "To avoid such trouble, the layout LFSCK needs to lock the old"
1749 echo "MDT-object before confirm the multiple referenced case."
# Directory a1 is created with `lfs mkdir -i 1` and holds one 1 MiB file.
1752 check_mount_and_prep
1753 $LFS mkdir -i 1 $DIR/$tdir/a1
1754 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1755 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=1
1756 cancel_lru_locks osc
1758 echo "Inject failure stub on MDT1 to delay the migration"
1760 #define OBD_FAIL_MIGRATE_DELAY 0x1803
1761 do_facet mds2 $LCTL set_param fail_val=5 fail_loc=0x1803
# The migration is started in the background.
# NOTE(review): no wait for the background job and no check of its exit status
# is visible in this view.
1762 echo "Migrate $DIR/$tdir/a1 from MDT1 to MDT0 with delay"
1763 $LFS migrate -m 0 $DIR/$tdir/a1 &
# Start the layout LFSCK on all targets (-A) while the migration is pending.
1766 echo "Trigger layout LFSCK to race with the migration"
1767 $START_LAYOUT -A -r || error "(1) Fail to start layout LFSCK!"
1769 wait_all_targets_blocked layout completed 2
# Clear the failure injection on mds2, then check the two repair counters.
1771 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
1772 local repaired=$($SHOW_LAYOUT |
1773 awk '/^repaired_unmatched_pair/ { print $2 }')
1774 [ $repaired -eq 1 ] ||
1775 error "(3) Fail to repair unmatched pair: $repaired"
# NOTE(review): "repaird" in the message below is a typo for "repaired".
1777 repaired=$($SHOW_LAYOUT |
1778 awk '/^repaired_multiple_referenced/ { print $2 }')
1779 [ $repaired -eq 0 ] ||
1780 error "(4) Unexpectedly repaird multiple references: $repaired"
1782 run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)"
# Test 16 (registered by the run_test line at the end of this block): changes
# the owner of a file while fail_loc 0x1613 is set on the MDS and expects the
# layout LFSCK to report exactly 1 in repaired_inconsistent_owner.
# NOTE(review): the embedded line numbers have gaps, so some original lines
# (function header, closing } tokens) are not visible in this view.
1786 echo "If the OST-object's owner information does not match the owner"
1787 echo "information stored in the MDT-object, then the LFSCK trust the"
1788 echo "MDT-object and update the OST-object's owner information."
# Create a 1 MiB file with a stripe count of 1.
1791 check_mount_and_prep
1792 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1793 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1794 cancel_lru_locks osc
# The chown runs while the failure injection is active.
1796 echo "Inject failure stub to skip OST-object owner changing"
1797 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
1798 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
1799 chown 1.1 $DIR/$tdir/f0
1800 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Run the layout LFSCK with -r and wait for status "completed".
1802 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
1805 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1807 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1808 mdd.${MDT_DEV}.lfsck_layout |
1809 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1811 error "(2) unexpected status"
# Exactly one inconsistent owner must have been repaired.
1814 local repaired=$($SHOW_LAYOUT |
1815 awk '/^repaired_inconsistent_owner/ { print $2 }')
1816 [ $repaired -eq 1 ] ||
1817 error "(3) Fail to repair inconsistent owner: $repaired"
1819 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# Test 17 (registered by the run_test line at the end of this block): creates
# "guard" and f0 while fail_loc 0x1614 is set on the MDS so that both report
# the same size, runs the layout LFSCK, and then checks that writing f0 no
# longer changes the size of "guard".
# NOTE(review): the embedded line numbers have gaps, so some original lines
# (function header, closing } tokens and part of the introductory message) are
# not visible in this view.
1823 echo "If more than one MDT-objects reference the same OST-object,"
1824 echo "and the OST-object only recognizes one MDT-object, then the"
1825 echo "LFSCK should create new OST-objects for such non-recognized"
1829 check_mount_and_prep
1830 $LFS setstripe -c 1 -i 0 $DIR/$tdir
# NOTE(review): "refernce" in the message below is a typo for "reference".
1832 echo "Inject failure stub to make two MDT-objects to refernce"
1833 echo "the OST-object"
1835 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
1836 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
# "guard" receives 1 MiB of data; f0 is created by createmany without any
# data being written to it.
1838 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1839 cancel_lru_locks osc
1841 createmany -o $DIR/$tdir/f 1
1843 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1845 cancel_lru_locks mdc
1846 cancel_lru_locks osc
# f0 is expected to report 1048576 bytes (fifth column of `ls -l`) although
# nothing was written to it.
1848 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
1849 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
1850 [ $size -eq 1048576 ] ||
1851 error "(1) f0 (wrong) size should be 1048576, but got $size"
# NOTE(review): "refenced" in the message below is a typo for "referenced".
1853 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
1856 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1858 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1859 mdd.${MDT_DEV}.lfsck_layout |
1860 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1862 error "(3) unexpected status"
# Exactly one multiple-reference case must have been repaired.
1865 local repaired=$($SHOW_LAYOUT |
1866 awk '/^repaired_multiple_referenced/ { print $2 }')
1867 [ $repaired -eq 1 ] ||
1868 error "(4) Fail to repair multiple references: $repaired"
# Writing 2 MiB to f0 must leave the size of "guard" at 1048576 bytes.
1870 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
1871 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
1872 error "(5) Fail to write f0."
1873 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1874 [ $size -eq 1048576 ] ||
1875 error "(6) guard size should be 1048576, but got $size"
1877 run_test 17 "LFSCK can repair multiple references"
# Top-level statement: adds "cache" to the debug setting via set_param on the
# node running this script; the command's standard output is discarded.
1879 $LCTL set_param debug=+cache > /dev/null
# Test 18a (registered by the run_test line at the end of this block): runs
# chown on files while fail_loc 0x1615 is set on the MDS(es) so that their
# reported size changes, runs the layout LFSCK with -r -o, and expects the
# repaired_orphan counters and the original file sizes afterwards.
# NOTE(review): the embedded line numbers have gaps, so some original lines
# (function header, closing fi/done/} tokens and other short statements) are
# not visible in this view. $LTIME is defined outside this view.
1883 echo "The target MDT-object is there, but related stripe information"
1884 echo "is lost or partly lost. The LFSCK should regenerate the missing"
1885 echo "layout EA entries."
# a1 is created with `lfs mkdir -i 0` and holds f1 (2 MiB, stripe count 1).
1888 check_mount_and_prep
1889 $LFS mkdir -i 0 $DIR/$tdir/a1
1890 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
1891 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# With -i, `ls -il` puts the inode number first, so the size is column 6.
1893 local saved_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
# Print FID and striping of f1 to the test output.
1895 $LFS path2fid $DIR/$tdir/a1/f1
1896 $LFS getstripe $DIR/$tdir/a1/f1
# With two or more MDSes, a2 is created with `lfs mkdir -i 1` and holds f2
# (2 MiB, stripe count 2).
1898 if [ $MDSCOUNT -ge 2 ]; then
1899 $LFS mkdir -i 1 $DIR/$tdir/a2
1900 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
1901 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1902 $LFS path2fid $DIR/$tdir/a2/f2
1903 $LFS getstripe $DIR/$tdir/a2/f2
1906 cancel_lru_locks osc
# The chown calls run while the failure injection is active on the
# respective MDS.
1908 echo "Inject failure, to make the MDT-object lost its layout EA"
1909 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
1910 do_facet mds1 $LCTL set_param fail_loc=0x1615
1911 chown 1.1 $DIR/$tdir/a1/f1
1913 if [ $MDSCOUNT -ge 2 ]; then
1914 do_facet mds2 $LCTL set_param fail_loc=0x1615
1915 chown 1.1 $DIR/$tdir/a2/f2
# Clear the failure injection on every MDS that had it set.
1921 do_facet mds1 $LCTL set_param fail_loc=0
1922 if [ $MDSCOUNT -ge 2 ]; then
1923 do_facet mds2 $LCTL set_param fail_loc=0
1926 cancel_lru_locks mdc
1927 cancel_lru_locks osc
# The sizes now reported must differ from saved_size. f2 is compared against
# the same saved_size because both files were written with identical dd
# parameters.
1929 echo "The file size should be incorrect since layout EA is lost"
1930 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1931 [ "$cur_size" != "$saved_size" ] ||
1932 error "(1) Expect incorrect file1 size"
1934 if [ $MDSCOUNT -ge 2 ]; then
1935 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1936 [ "$cur_size" != "$saved_size" ] ||
1937 error "(2) Expect incorrect file2 size"
# Start the layout LFSCK with -r -o.
1940 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
1941 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
# Wait for every MDS to report "completed" (timeout argument: $LTIME).
1943 for k in $(seq $MDSCOUNT); do
1944 # The LFSCK status query internal is 30 seconds. For the case
1945 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
1946 # time to guarantee the status sync up.
1947 wait_update_facet mds${k} "$LCTL get_param -n \
1948 mdd.$(facet_svc mds${k}).lfsck_layout |
1949 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
1950 error "(4) MDS${k} is not the expected 'completed'"
# Every OST must already report "completed"; this is read once, without
# waiting.
1953 for k in $(seq $OSTCOUNT); do
1954 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
1955 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1956 awk '/^status/ { print $2 }')
1957 [ "$cur_status" == "completed" ] ||
1958 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# mds1 must report exactly 1 repaired orphan.
1961 local repaired=$(do_facet mds1 $LCTL get_param -n \
1962 mdd.$(facet_svc mds1).lfsck_layout |
1963 awk '/^repaired_orphan/ { print $2 }')
1964 [ $repaired -eq 1 ] ||
1965 error "(6.1) Expect 1 fixed on mds1, but got: $repaired"
# mds2 must report exactly 2; f2 was created with a stripe count of 2, so
# presumably one orphan is counted per stripe - confirm.
1967 if [ $MDSCOUNT -ge 2 ]; then
1968 repaired=$(do_facet mds2 $LCTL get_param -n \
1969 mdd.$(facet_svc mds2).lfsck_layout |
1970 awk '/^repaired_orphan/ { print $2 }')
1971 [ $repaired -eq 2 ] ||
1972 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
# Print FID and striping again after the LFSCK run.
1975 $LFS path2fid $DIR/$tdir/a1/f1
1976 $LFS getstripe $DIR/$tdir/a1/f1
1978 if [ $MDSCOUNT -ge 2 ]; then
1979 $LFS path2fid $DIR/$tdir/a2/f2
1980 $LFS getstripe $DIR/$tdir/a2/f2
# The reported sizes must match saved_size again.
1983 echo "The file size should be correct after layout LFSCK scanning"
1984 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1985 [ "$cur_size" == "$saved_size" ] ||
1986 error "(7) Expect file1 size $saved_size, but got $cur_size"
1988 if [ $MDSCOUNT -ge 2 ]; then
1989 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
1990 [ "$cur_size" == "$saved_size" ] ||
1991 error "(8) Expect file2 size $saved_size, but got $cur_size"
1994 run_test 18a "Find out orphan OST-object and repair it (1)"
# Test 18b: a file's MDT-object is lost while its OST-object(s) survive.
# After the layout LFSCK runs, the test expects the file to be re-created
# under .lustre/lost+found/MDTxxxx as <FID>-R-0 and moves it back in place.
# NOTE(review): this listing omits some original lines (see the gaps in the
# embedded numbering), e.g. the function header and closing fi/done lines.
1998 echo "The target MDT-object is lost. The LFSCK should re-create the"
1999 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
2000 echo "can move it back to normal namespace manually."
# Create a 2MB single-stripe file under a directory created with
# "lfs mkdir -i 0", then record its size and FID for the later checks.
2003 check_mount_and_prep
2004 $LFS mkdir -i 0 $DIR/$tdir/a1
2005 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2006 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# Field 6 of "ls -il" output is the file size column.
2007 local saved_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2008 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2010 $LFS getstripe $DIR/$tdir/a1/f1
# With two or more MDTs, repeat with a two-stripe file under a directory
# created with "lfs mkdir -i 1".
2012 if [ $MDSCOUNT -ge 2 ]; then
2013 $LFS mkdir -i 1 $DIR/$tdir/a2
2014 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2015 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
# Keep fid2 function-local, like fid1, so it cannot leak into other tests.
2016 local fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
2018 $LFS getstripe $DIR/$tdir/a2/f2
2021 cancel_lru_locks osc
# Unlink the files while fail_loc 0x1616 is set on the owning MDS.
2023 echo "Inject failure, to simulate the case of missing the MDT-object"
2024 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2025 do_facet mds1 $LCTL set_param fail_loc=0x1616
2026 rm -f $DIR/$tdir/a1/f1
2028 if [ $MDSCOUNT -ge 2 ]; then
2029 do_facet mds2 $LCTL set_param fail_loc=0x1616
2030 rm -f $DIR/$tdir/a2/f2
# Clear the injected failure before starting the LFSCK.
2036 do_facet mds1 $LCTL set_param fail_loc=0
2037 if [ $MDSCOUNT -ge 2 ]; then
2038 do_facet mds2 $LCTL set_param fail_loc=0
2041 cancel_lru_locks mdc
2042 cancel_lru_locks osc
2044 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2045 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# Wait for every MDT to report status "completed".
2047 for k in $(seq $MDSCOUNT); do
2048 # The LFSCK status query interval is 30 seconds. For the case
2049 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2050 # time to guarantee the status sync up.
2051 wait_update_facet mds${k} "$LCTL get_param -n \
2052 mdd.$(facet_svc mds${k}).lfsck_layout |
2053 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2054 error "(2) MDS${k} is not the expected 'completed'"
# Every OST must already report "completed" at this point (no waiting).
2057 for k in $(seq $OSTCOUNT); do
2058 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2059 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2060 awk '/^status/ { print $2 }')
2061 [ "$cur_status" == "completed" ] ||
2062 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# repaired_orphan is expected to be 1 on mds1 (f1 has one stripe) and
# 2 on mds2 (f2 has two stripes).
2065 local repaired=$(do_facet mds1 $LCTL get_param -n \
2066 mdd.$(facet_svc mds1).lfsck_layout |
2067 awk '/^repaired_orphan/ { print $2 }')
2068 [ $repaired -eq 1 ] ||
2069 error "(4.1) Expect 1 fixed on mds1, but got: $repaired"
2071 if [ $MDSCOUNT -ge 2 ]; then
2072 repaired=$(do_facet mds2 $LCTL get_param -n \
2073 mdd.$(facet_svc mds2).lfsck_layout |
2074 awk '/^repaired_orphan/ { print $2 }')
2075 [ $repaired -eq 2 ] ||
2076 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
# Move the re-created files (named <FID>-R-0) back to their original paths.
2079 echo "Move the files from .lustre/lost+found/MDTxxxx to namespace"
2080 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2081 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2083 if [ $MDSCOUNT -ge 2 ]; then
2084 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
2085 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
2088 $LFS path2fid $DIR/$tdir/a1/f1
2089 $LFS getstripe $DIR/$tdir/a1/f1
2091 if [ $MDSCOUNT -ge 2 ]; then
2092 $LFS path2fid $DIR/$tdir/a2/f2
2093 $LFS getstripe $DIR/$tdir/a2/f2
# Both files were written with the same dd parameters, so both are compared
# against the size saved for f1.
2096 echo "The file size should be correct after layout LFSCK scanning"
2097 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2098 [ "$cur_size" == "$saved_size" ] ||
2099 error "(7) Expect file1 size $saved_size, but got $cur_size"
2101 if [ $MDSCOUNT -ge 2 ]; then
2102 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2103 [ "$cur_size" == "$saved_size" ] ||
2104 error "(8) Expect file2 size $saved_size, but got $cur_size"
2107 run_test 18b "Find out orphan OST-object and repair it (2)"
# Test 18c: the MDT-object is lost and the OST-object was written while
# fail_loc 0x1617 (OBD_FAIL_LFSCK_NOPFID) was set on ost1. The test expects
# the layout LFSCK to leave "*-N-*" entries under
# .lustre/lost+found/MDT0000/ and none under MDT0001/.
# NOTE(review): this listing omits some original lines (see the gaps in the
# embedded numbering), e.g. the function header and closing fi/done lines.
2111 echo "The target MDT-object is lost, and the OST-object FID is missing."
2112 echo "The LFSCK should re-create the MDT-object with new FID under the "
2113 echo "directory .lustre/lost+found/MDTxxxx."
2116 check_mount_and_prep
2117 $LFS mkdir -i 0 $DIR/$tdir/a1
2118 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
# The failure is injected on ost1 before any data is written.
2120 echo "Inject failure, to simulate the case of missing parent FID"
2121 #define OBD_FAIL_LFSCK_NOPFID 0x1617
2122 do_facet ost1 $LCTL set_param fail_loc=0x1617
2124 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2125 $LFS getstripe $DIR/$tdir/a1/f1
# With two or more MDTs, create a second file under a directory created with
# "lfs mkdir -i 1"; its single stripe is also placed with "-i 0".
2127 if [ $MDSCOUNT -ge 2 ]; then
2128 $LFS mkdir -i 1 $DIR/$tdir/a2
2129 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a2
2130 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2131 $LFS getstripe $DIR/$tdir/a2/f2
2134 cancel_lru_locks osc
# Unlink the files while fail_loc 0x1616 is set on the owning MDS.
2136 echo "Inject failure, to simulate the case of missing the MDT-object"
2137 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2138 do_facet mds1 $LCTL set_param fail_loc=0x1616
2139 rm -f $DIR/$tdir/a1/f1
2141 if [ $MDSCOUNT -ge 2 ]; then
2142 do_facet mds2 $LCTL set_param fail_loc=0x1616
2143 rm -f $DIR/$tdir/a2/f2
# Clear the MDS-side failure before starting the LFSCK.
2149 do_facet mds1 $LCTL set_param fail_loc=0
2150 if [ $MDSCOUNT -ge 2 ]; then
2151 do_facet mds2 $LCTL set_param fail_loc=0
2154 cancel_lru_locks mdc
2155 cancel_lru_locks osc
2157 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2158 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# Wait for every MDT to report status "completed".
2160 for k in $(seq $MDSCOUNT); do
2161 # The LFSCK status query interval is 30 seconds. For the case
2162 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2163 # time to guarantee the status sync up.
2164 wait_update_facet mds${k} "$LCTL get_param -n \
2165 mdd.$(facet_svc mds${k}).lfsck_layout |
2166 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2167 error "(2) MDS${k} is not the expected 'completed'"
# Every OST must already report "completed" at this point (no waiting).
2170 for k in $(seq $OSTCOUNT); do
2171 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2172 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2173 awk '/^status/ { print $2 }')
2174 [ "$cur_status" == "completed" ] ||
2175 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# NOTE(review): $expected is assigned on lines missing from this listing;
# presumably it depends on MDSCOUNT - confirm against the full file.
2178 if [ $MDSCOUNT -ge 2 ]; then
2184 local repaired=$(do_facet mds1 $LCTL get_param -n \
2185 mdd.$(facet_svc mds1).lfsck_layout |
2186 awk '/^repaired_orphan/ { print $2 }')
2187 [ $repaired -eq $expected ] ||
2188 error "(4) Expect $expected fixed on mds1, but got: $repaired"
2190 if [ $MDSCOUNT -ge 2 ]; then
2191 repaired=$(do_facet mds2 $LCTL get_param -n \
2192 mdd.$(facet_svc mds2).lfsck_layout |
2193 awk '/^repaired_orphan/ { print $2 }')
2194 [ $repaired -eq 0 ] ||
2195 error "(5) Expect 0 fixed on mds2, but got: $repaired"
2198 ls -ail $MOUNT/.lustre/lost+found/
# The -name pattern is quoted so the shell hands it to find unexpanded,
# even when the current directory contains entries matching *-N-*.
2200 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
2201 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
2202 local cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name "*-N-*")
2204 error "(6) .lustre/lost+found/MDT0001/ should be empty"
2207 echo "There should be some stub under .lustre/lost+found/MDT0000/"
2208 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2209 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
2211 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-N-*")
2212 [ ! -z "$cname" ] ||
2213 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2215 run_test 18c "Find out orphan OST-object and repair it (3)"
# Test 18d: f2's layout is redirected to f1's OST-object and f1 is then
# removed, so f2 dangles while its own OST-object survives. The test expects
# the layout LFSCK to report 1 repaired orphan and 0 repaired dangling, and
# f2 to regain its saved size.
# NOTE(review): this listing omits some original lines (see the gaps in the
# embedded numbering), e.g. the function header, the stopall call announced
# by the echo below, and closing fi/done lines.
2219 echo "The target MDT-object layout EA is corrupted, but the right"
2220 echo "OST-object is still alive as orphan. The layout LFSCK will"
2221 echo "not create new OST-object to occupy such slot."
2224 check_mount_and_prep
2226 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2227 echo "guard" > $DIR/$tdir/a1/f1
2228 echo "foo" > $DIR/$tdir/a1/f2
# Field 6 of "ls -il" output is the file size column.
2229 local saved_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2230 $LFS path2fid $DIR/$tdir/a1/f1
2231 $LFS getstripe $DIR/$tdir/a1/f1
2232 $LFS path2fid $DIR/$tdir/a1/f2
2233 $LFS getstripe $DIR/$tdir/a1/f2
2234 cancel_lru_locks osc
2236 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2237 echo "to reference the same OST-object (which is f1's OST-obejct)."
2238 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2239 echo "dangling reference case, but f2's old OST-object is there."
# The chown on f2 runs while fail_loc 0x1618 is set on the MDS; f1 is then
# unlinked before the failure is cleared.
2242 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2243 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2244 chown 1.1 $DIR/$tdir/a1/f2
2245 rm -f $DIR/$tdir/a1/f1
2248 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2250 echo "stopall to cleanup object cache"
2253 setupall > /dev/null
# Before the repair, f2's visible size must differ from the saved one.
2255 echo "The file size should be incorrect since dangling referenced"
2256 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2257 [ "$cur_size" != "$saved_size" ] ||
2258 error "(1) Expect incorrect file2 size"
2260 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2261 $START_LAYOUT -r -o -c -d || error "(2) Fail to start LFSCK for layout!"
# Wait for every MDT to report status "completed".
2263 for k in $(seq $MDSCOUNT); do
2264 # The LFSCK status query interval is 30 seconds. For the case
2265 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2266 # time to guarantee the status sync up.
2267 wait_update_facet mds${k} "$LCTL get_param -n \
2268 mdd.$(facet_svc mds${k}).lfsck_layout |
2269 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2270 error "(3) MDS${k} is not the expected 'completed'"
# Every OST must already report "completed" at this point (no waiting).
2273 for k in $(seq $OSTCOUNT); do
2274 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2275 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2276 awk '/^status/ { print $2 }')
2277 [ "$cur_status" == "completed" ] ||
2278 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
# Exactly one orphan and no dangling reference should be counted as repaired.
2281 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2282 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2283 awk '/^repaired_orphan/ { print $2 }')
2284 [ $repaired -eq 1 ] ||
2285 error "(5) Expect 1 orphan has been fixed, but got: $repaired"
2287 repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2288 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2289 awk '/^repaired_dangling/ { print $2 }')
2290 [ $repaired -eq 0 ] ||
2291 error "(6) Expect 0 dangling has been fixed, but got: $repaired"
2293 echo "The file size should be correct after layout LFSCK scanning"
2294 cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2295 [ "$cur_size" == "$saved_size" ] ||
2296 error "(7) Expect file2 size $saved_size, but got $cur_size"
# Print f2's content, FID and layout to the log; nothing is asserted here.
2298 echo "The LFSCK should find back the original data."
2299 cat $DIR/$tdir/a1/f2
2300 $LFS path2fid $DIR/$tdir/a1/f2
2301 $LFS getstripe $DIR/$tdir/a1/f2
2303 run_test 18d "Find out orphan OST-object and repair it (4)"
# Test 18e: same corruption as 18d, but f2 is written again while the layout
# LFSCK is held in scanning-phase2. The test expects the old OST-object to be
# exposed as a "*-C-*" entry under .lustre/lost+found/MDT0000/ with f2's
# saved size, while f2 keeps the newly written data.
# NOTE(review): this listing omits some original lines (see the gaps in the
# embedded numbering), e.g. the function header, the stopall call announced
# by the echo below, and closing fi/done lines.
2307 echo "The target MDT-object layout EA slot is occpuied by some new"
2308 echo "created OST-object when repair dangling reference case. Such"
2309 echo "conflict OST-object has been modified by others. To keep the"
2310 echo "new data, the LFSCK will create a new file to refernece this"
2311 echo "old orphan OST-object."
2314 check_mount_and_prep
2316 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2317 echo "guard" > $DIR/$tdir/a1/f1
2318 echo "foo" > $DIR/$tdir/a1/f2
# Field 6 of "ls -il" output is the file size column.
2319 local saved_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2320 $LFS path2fid $DIR/$tdir/a1/f1
2321 $LFS getstripe $DIR/$tdir/a1/f1
2322 $LFS path2fid $DIR/$tdir/a1/f2
2323 $LFS getstripe $DIR/$tdir/a1/f2
2324 cancel_lru_locks osc
2326 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2327 echo "to reference the same OST-object (which is f1's OST-obejct)."
2328 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2329 echo "dangling reference case, but f2's old OST-object is there."
# The chown on f2 runs while fail_loc 0x1618 is set on the MDS; f1 is then
# unlinked before the failure is cleared.
2332 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2333 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2334 chown 1.1 $DIR/$tdir/a1/f2
2335 rm -f $DIR/$tdir/a1/f1
2338 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2340 echo "stopall to cleanup object cache"
2343 setupall > /dev/null
# Before the repair, f2's visible size must differ from the saved one.
2345 echo "The file size should be incorrect since dangling referenced"
2346 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2347 [ "$cur_size" != "$saved_size" ] ||
2348 error "(1) Expect incorrect file2 size"
# fail_loc 0x1602 with fail_val=10 stays set until f2 has been rewritten.
2350 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2351 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
2353 start_full_debug_logging
2355 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2356 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
2358 wait_update_facet mds1 "$LCTL get_param -n \
2359 mdd.$(facet_svc mds1).lfsck_layout |
2360 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2361 error "(3) MDS1 is not the expected 'scanning-phase2'"
2363 # to guarantee all updates are synced.
2367 echo "Write new data to f2 to modify the new created OST-object."
2368 echo "dummy" >> $DIR/$tdir/a1/f2
2370 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
# Wait for every MDT to report status "completed".
2372 for k in $(seq $MDSCOUNT); do
2373 # The LFSCK status query interval is 30 seconds. For the case
2374 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2375 # time to guarantee the status sync up.
2376 wait_update_facet mds${k} "$LCTL get_param -n \
2377 mdd.$(facet_svc mds${k}).lfsck_layout |
2378 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2379 error "(4) MDS${k} is not the expected 'completed'"
# Every OST must already report "completed" at this point (no waiting).
2382 for k in $(seq $OSTCOUNT); do
2383 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2384 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2385 awk '/^status/ { print $2 }')
2386 [ "$cur_status" == "completed" ] ||
2387 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2390 stop_full_debug_logging
2392 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2393 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2394 awk '/^repaired_orphan/ { print $2 }')
2395 [ $repaired -eq 1 ] ||
2396 error "(6) Expect 1 orphan has been fixed, but got: $repaired"
2398 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2399 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2400 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# The -name pattern is quoted so the shell hands it to find unexpanded,
# even when the current directory contains entries matching *-C-*.
2402 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-C-*")
2403 [ ! -z "$cname" ] ||
2404 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2406 echo "The stub file should keep the original f2 data"
2407 cur_size=$(ls -il $cname | awk '{ print $6 }')
2408 [ "$cur_size" == "$saved_size" ] ||
2409 error "(9) Expect file2 size $saved_size, but got $cur_size"
2412 $LFS path2fid $cname
2413 $LFS getstripe $cname
# Print f2's content, FID and layout to the log; nothing is asserted here.
2415 echo "The f2 should contains new data."
2416 cat $DIR/$tdir/a1/f2
2417 $LFS path2fid $DIR/$tdir/a1/f2
2418 $LFS getstripe $DIR/$tdir/a1/f2
2420 run_test 18e "Find out orphan OST-object and repair it (5)"
# Test 18f: MDT-objects are lost and fail_loc 0x161c
# (OBD_FAIL_LFSCK_BAD_NETWORK, fail_val=0) is set on mds1 during a first
# layout LFSCK run, which is expected to end as "partial" on the MDTs and on
# ost1 but "completed" on ost2. A second run without the failure is expected
# to complete everywhere and repair the remaining orphans.
# NOTE(review): this listing omits some original lines (see the gaps in the
# embedded numbering), e.g. the function header and closing fi/done/} lines.
2423 [ $OSTCOUNT -lt 2 ] &&
2424 skip "The test needs at least 2 OSTs" && return
2427 echo "The target MDT-object is lost. The LFSCK should re-create the"
2428 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2429 echo "to verify some OST-object(s) during the first stage-scanning,"
2430 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2431 echo "should not be affected."
# a1 holds single-stripe files (guard, f1); a2 holds a two-stripe file (f2).
# All stripes start at OST index 0.
2434 check_mount_and_prep
2435 $LFS mkdir -i 0 $DIR/$tdir/a1
2436 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a1
2437 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2438 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2439 $LFS mkdir -i 0 $DIR/$tdir/a2
2440 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
2441 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2442 $LFS getstripe $DIR/$tdir/a1/f1
2443 $LFS getstripe $DIR/$tdir/a2/f2
# With two or more MDTs, build the same layout under directories created
# with "lfs mkdir -i 1" (a3, a4).
2445 if [ $MDSCOUNT -ge 2 ]; then
2446 $LFS mkdir -i 1 $DIR/$tdir/a3
2447 $LFS setstripe -c 1 -i 0 -S 1M $DIR/$tdir/a3
2448 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2449 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2450 $LFS mkdir -i 1 $DIR/$tdir/a4
2451 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
2452 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2453 $LFS getstripe $DIR/$tdir/a3/f3
2454 $LFS getstripe $DIR/$tdir/a4/f4
2457 cancel_lru_locks osc
# Unlink the files (but not the guard files) while fail_loc 0x1616 is set
# on the owning MDS.
2459 echo "Inject failure, to simulate the case of missing the MDT-object"
2460 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2461 do_facet mds1 $LCTL set_param fail_loc=0x1616
2462 rm -f $DIR/$tdir/a1/f1
2463 rm -f $DIR/$tdir/a2/f2
2465 if [ $MDSCOUNT -ge 2 ]; then
2466 do_facet mds2 $LCTL set_param fail_loc=0x1616
2467 rm -f $DIR/$tdir/a3/f3
2468 rm -f $DIR/$tdir/a4/f4
2474 do_facet mds1 $LCTL set_param fail_loc=0
2475 if [ $MDSCOUNT -ge 2 ]; then
2476 do_facet mds2 $LCTL set_param fail_loc=0
2479 cancel_lru_locks mdc
2480 cancel_lru_locks osc
2482 echo "Inject failure, to simulate the OST0 fail to handle"
2483 echo "MDT0 LFSCK request during the first-stage scanning."
2484 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
2485 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
2487 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2488 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# First run: every MDT is expected to end up in status "partial".
2490 for k in $(seq $MDSCOUNT); do
2491 # The LFSCK status query interval is 30 seconds. For the case
2492 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2493 # time to guarantee the status sync up.
2494 wait_update_facet mds${k} "$LCTL get_param -n \
2495 mdd.$(facet_svc mds${k}).lfsck_layout |
2496 awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
2497 error "(2) MDS${k} is not the expected 'partial'"
2500 wait_update_facet ost1 "$LCTL get_param -n \
2501 obdfilter.$(facet_svc ost1).lfsck_layout |
2502 awk '/^status/ { print \\\$2 }'" "partial" $LTIME || {
2503 error "(3) OST1 is not the expected 'partial'"
2506 wait_update_facet ost2 "$LCTL get_param -n \
2507 obdfilter.$(facet_svc ost2).lfsck_layout |
2508 awk '/^status/ { print \\\$2 }'" "completed" $LTIME || {
2509 error "(4) OST2 is not the expected 'completed'"
2512 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
# After the partial run, one orphan per MDT is expected to be repaired.
2514 local repaired=$(do_facet mds1 $LCTL get_param -n \
2515 mdd.$(facet_svc mds1).lfsck_layout |
2516 awk '/^repaired_orphan/ { print $2 }')
2517 [ $repaired -eq 1 ] ||
2518 error "(5) Expect 1 fixed on mds{1}, but got: $repaired"
2520 if [ $MDSCOUNT -ge 2 ]; then
2521 repaired=$(do_facet mds2 $LCTL get_param -n \
2522 mdd.$(facet_svc mds2).lfsck_layout |
2523 awk '/^repaired_orphan/ { print $2 }')
2524 [ $repaired -eq 1 ] ||
2525 error "(6) Expect 1 fixed on mds{2}, but got: $repaired"
# Second run, without the injected failure.
2528 echo "Trigger layout LFSCK on all devices again to cleanup"
2529 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
2531 for k in $(seq $MDSCOUNT); do
2532 # The LFSCK status query interval is 30 seconds. For the case
2533 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2534 # time to guarantee the status sync up.
2535 wait_update_facet mds${k} "$LCTL get_param -n \
2536 mdd.$(facet_svc mds${k}).lfsck_layout |
2537 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2538 error "(8) MDS${k} is not the expected 'completed'"
# cur_status is declared local so it does not leak into the global scope.
2541 for k in $(seq $OSTCOUNT); do
2542 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2543 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2544 awk '/^status/ { print $2 }')
2545 [ "$cur_status" == "completed" ] ||
2546 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
# After the second run, two orphans per MDT are expected to be repaired.
2550 local repaired=$(do_facet mds1 $LCTL get_param -n \
2551 mdd.$(facet_svc mds1).lfsck_layout |
2552 awk '/^repaired_orphan/ { print $2 }')
2553 [ $repaired -eq 2 ] ||
2554 error "(10) Expect 2 fixed on mds{1}, but got: $repaired"
2556 if [ $MDSCOUNT -ge 2 ]; then
2557 repaired=$(do_facet mds2 $LCTL get_param -n \
2558 mdd.$(facet_svc mds2).lfsck_layout |
2559 awk '/^repaired_orphan/ { print $2 }')
2560 [ $repaired -eq 2 ] ||
2561 error "(11) Expect 2 fixed on mds{2}, but got: $repaired"
2564 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# Test 18g: a file striped over all OSTs is unlinked while fail_loc 0x162e
# (OBD_FAIL_LFSCK_LOST_MDTOBJ2) is set on mds1. The test expects the layout
# LFSCK to repair $OSTCOUNT orphans and to re-create the file as <FID>-R-0
# under .lustre/lost+found/MDT0000/, from where it is moved back.
# NOTE(review): this listing omits some original lines (see the gaps in the
# embedded numbering), e.g. the function header and closing done/} lines.
2568 echo "The target MDT-object is lost, but related OI mapping is there"
2569 echo "The LFSCK should recreate the lost MDT-object without affected"
2570 echo "by the stale OI mapping."
# "-c -1" stripes over all OSTs; dd writes one 1MB block per OST.
2573 check_mount_and_prep
2574 $LFS mkdir -i 0 $DIR/$tdir/a1
2575 $LFS setstripe -c -1 -i 0 -S 1M $DIR/$tdir/a1
2576 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=$OSTCOUNT
2577 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2579 $LFS getstripe $DIR/$tdir/a1/f1
2580 cancel_lru_locks osc
2582 echo "Inject failure to simulate lost MDT-object but keep OI mapping"
2583 #define OBD_FAIL_LFSCK_LOST_MDTOBJ2 0x162e
2584 do_facet mds1 $LCTL set_param fail_loc=0x162e
2585 rm -f $DIR/$tdir/a1/f1
2587 do_facet mds1 $LCTL set_param fail_loc=0
2588 cancel_lru_locks mdc
2589 cancel_lru_locks osc
2591 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2592 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# Wait for every MDT to report status "completed".
2594 for k in $(seq $MDSCOUNT); do
2595 # The LFSCK status query interval is 30 seconds. For the case
2596 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2597 # time to guarantee the status sync up.
2598 wait_update_facet mds${k} "$LCTL get_param -n \
2599 mdd.$(facet_svc mds${k}).lfsck_layout |
2600 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2601 error "(2) MDS${k} is not the expected 'completed'"
# Every OST must already report "completed" at this point (no waiting).
2604 for k in $(seq $OSTCOUNT); do
2605 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2606 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2607 awk '/^status/ { print $2 }')
2608 [ "$cur_status" == "completed" ] ||
2609 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# One repaired orphan is expected per OST.
2612 local repaired=$(do_facet mds1 $LCTL get_param -n \
2613 mdd.$(facet_svc mds1).lfsck_layout |
2614 awk '/^repaired_orphan/ { print $2 }')
2615 [ $repaired -eq $OSTCOUNT ] ||
2616 error "(4) Expect $OSTCOUNT fixed, but got: $repaired"
2618 echo "Move the files from .lustre/lost+found/MDTxxxx to namespace"
2619 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2620 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2622 $LFS path2fid $DIR/$tdir/a1/f1
2623 $LFS getstripe $DIR/$tdir/a1/f1
2625 run_test 18g "Find out orphan OST-object and repair it (7)"
# Top-level statement: removes "cache" from the local debug setting
# (the "-" prefix in debug=-cache); output is discarded.
2627 $LCTL set_param debug=-cache > /dev/null
# Test 19a: with lfsck_verify_pfid set to 1 on OST0000 and fail_loc 0x1619
# (OBD_FAIL_LFSCK_INVALID_PFID) set on the node running this script, reading
# a0 must fail.
# NOTE(review): this listing omits some original lines (see the gaps in the
# embedded numbering), e.g. the function header and closing brace.
2630 check_mount_and_prep
2631 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2633 echo "foo" > $DIR/$tdir/a0
2634 echo "guard" > $DIR/$tdir/a1
2635 cancel_lru_locks osc
2637 echo "Inject failure, then client will offer wrong parent FID when read"
2638 do_facet ost1 $LCTL set_param -n \
2639 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2640 #define OBD_FAIL_LFSCK_INVALID_PFID 0x1619
2641 $LCTL set_param fail_loc=0x1619
# A successful cat is the failure case here.
2643 echo "Read RPC with wrong parent FID should be denied"
2644 cat $DIR/$tdir/a0 && error "(3) Read should be denied!"
2645 $LCTL set_param fail_loc=0
2647 run_test 19a "OST-object inconsistency self detect"
# Test 19b: f0 is written while fail_loc 0x1611
# (OBD_FAIL_LFSCK_UNMATCHED_PAIR1) is set on ost1. The "repaired" value read
# from lfsck_verify_pfid must be 0 at first, and 1 after lfsck_verify_pfid
# is set to 1 and the file is read.
# NOTE(review): this listing omits some original lines (see the gaps in the
# embedded numbering), e.g. the function header and closing brace.
2650 check_mount_and_prep
2651 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2653 echo "Inject failure stub to make the OST-object to back point to"
2654 echo "non-exist MDT-object"
2656 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
2657 do_facet ost1 $LCTL set_param fail_loc=0x1611
2658 echo "foo" > $DIR/$tdir/f0
2659 cancel_lru_locks osc
2660 do_facet ost1 $LCTL set_param fail_loc=0
# First check: the "repaired" counter must still be 0.
2662 echo "Nothing should be fixed since self detect and repair is disabled"
2663 local repaired=$(do_facet ost1 $LCTL get_param -n \
2664 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2665 awk '/^repaired/ { print $2 }')
2666 [ $repaired -eq 0 ] ||
2667 error "(1) Expected 0 repaired, but got $repaired"
2669 echo "Read RPC with right parent FID should be accepted,"
2670 echo "and cause parent FID on OST to be fixed"
# Set lfsck_verify_pfid to 1 on OST0000, then read the file.
2672 do_facet ost1 $LCTL set_param -n \
2673 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2674 cat $DIR/$tdir/f0 || error "(2) Read should not be denied!"
# Second check: the read is expected to have triggered exactly one repair.
2676 repaired=$(do_facet ost1 $LCTL get_param -n \
2677 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2678 awk '/^repaired/ { print $2 }')
2679 [ $repaired -eq 1 ] ||
2680 error "(3) Expected 1 repaired, but got $repaired"
2682 run_test 19b "OST-object inconsistency self repair"
# Values test 20 compares against the output of "lfs getstripe -L":
# one for a layout flagged as having a hole, one for a layout without.
2684 PATTERN_WITH_HOLE="40000001"
2685 PATTERN_WITHOUT_HOLE="1"
# Test 20: files lose their MDT-object and, for f1..f3, one chosen
# OST-object. The test expects the layout LFSCK to re-create them as
# <FID>-R-0 under .lustre/lost+found/MDT0000/, the incomplete ones carrying
# the hole pattern, and then exercises getstripe/stat/read/write/chown/
# touch/unlink on each re-created file.
# NOTE(review): this listing omits some original lines (see the gaps in the
# embedded numbering), e.g. the function header, else/fi/done lines and the
# assignment of $bcount.
2688 [ $OSTCOUNT -lt 2 ] &&
2689 skip "The test needs at least 2 OSTs" && return
2692 echo "The target MDT-object and some of its OST-object are lost."
2693 echo "The LFSCK should find out the left OST-objects and re-create"
2694 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
2695 echo "with the partial OST-objects (LOV EA hole)."
2697 echo "New client can access the file with LOV EA hole via normal"
2698 echo "system tools or commands without crash the system."
2700 echo "For old client, even though it cannot access the file with"
2701 echo "LOV EA hole, it should not cause the system crash."
# Use three stripes when more than two OSTs exist, otherwise two.
2704 check_mount_and_prep
2705 $LFS mkdir -i 0 $DIR/$tdir/a1
2706 if [ $OSTCOUNT -gt 2 ]; then
2707 $LFS setstripe -c 3 -i 0 -S 1M $DIR/$tdir/a1
2710 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a1
# NOTE(review): $bcount (number of 4096-byte blocks per file) is assigned on
# lines missing from this listing.
2714 # 256 blocks on the stripe0.
2715 # 1 block on the stripe1 for 2 OSTs case.
2716 # 256 blocks on the stripe1 for other cases.
2717 # 1 block on the stripe2 if OSTs > 2
2718 dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
2719 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
2720 dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount
2722 local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
2723 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2724 local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)
2727 $LFS getstripe $DIR/$tdir/a1/f0
2729 $LFS getstripe $DIR/$tdir/a1/f1
2731 $LFS getstripe $DIR/$tdir/a1/f2
# f3 only exists when there is a third stripe to lose; fid3 is kept
# function-local like fid0..fid2.
2733 if [ $OSTCOUNT -gt 2 ]; then
2734 dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
2735 local fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
2737 $LFS getstripe $DIR/$tdir/a1/f3
2740 cancel_lru_locks osc
2742 echo "Inject failure..."
2743 echo "To simulate f0 lost MDT-object"
2744 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2745 do_facet mds1 $LCTL set_param fail_loc=0x1616
2746 rm -f $DIR/$tdir/a1/f0
# For f1..f3, fail_loc 0x161a is used; fail_val is changed before each
# unlink (1 for f2, 2 for f3) to pick which OST-object is lost.
2748 echo "To simulate f1 lost MDT-object and OST-object0"
2749 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
2750 do_facet mds1 $LCTL set_param fail_loc=0x161a
2751 rm -f $DIR/$tdir/a1/f1
2753 echo "To simulate f2 lost MDT-object and OST-object1"
2754 do_facet mds1 $LCTL set_param fail_val=1
2755 rm -f $DIR/$tdir/a1/f2
2757 if [ $OSTCOUNT -gt 2 ]; then
2758 echo "To simulate f3 lost MDT-object and OST-object2"
2759 do_facet mds1 $LCTL set_param fail_val=2
2760 rm -f $DIR/$tdir/a1/f3
# The client stays unmounted while the LFSCK runs; it is mounted again
# before the re-created files are inspected.
2763 umount_client $MOUNT
2766 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
2768 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2769 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
# Wait up to $LTIME, like the other layout LFSCK tests in this file.
2771 for k in $(seq $MDSCOUNT); do
2772 # The LFSCK status query interval is 30 seconds. For the case
2773 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2774 # time to guarantee the status sync up.
2775 wait_update_facet mds${k} "$LCTL get_param -n \
2776 mdd.$(facet_svc mds${k}).lfsck_layout |
2777 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2778 error "(2) MDS${k} is not the expected 'completed'"
# Every OST must already report "completed" at this point (no waiting).
2781 for k in $(seq $OSTCOUNT); do
2782 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2783 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2784 awk '/^status/ { print $2 }')
2785 [ "$cur_status" == "completed" ] ||
2786 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repaired_orphan count: 9 with more than two OSTs, otherwise 4.
2789 local repaired=$(do_facet mds1 $LCTL get_param -n \
2790 mdd.$(facet_svc mds1).lfsck_layout |
2791 awk '/^repaired_orphan/ { print $2 }')
2792 if [ $OSTCOUNT -gt 2 ]; then
2793 [ $repaired -eq 9 ] ||
2794 error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
2796 [ $repaired -eq 4 ] ||
2797 error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
2800 mount_client $MOUNT || error "(5.0) Fail to start client!"
# NOTE(review): LOV_PATTERN_F_HOLE is not referenced by any line visible in
# this listing.
2802 LOV_PATTERN_F_HOLE=0x40000000
# Step 5: old f0 - complete layout, every operation must succeed.
2805 # ${fid0}-R-0 is the old f0
2807 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
2808 echo "Check $name, which is the old f0"
2810 $LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"
2812 local pattern=$($LFS getstripe -L $name)
2813 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
2814 error "(5.2) NOT expect pattern flag hole, but got $pattern"
2816 local stripes=$($LFS getstripe -c $name)
2817 if [ $OSTCOUNT -gt 2 ]; then
2818 [ $stripes -eq 3 ] ||
2819 error "(5.3.1) expect the stripe count is 3, but got $stripes"
2821 [ $stripes -eq 2 ] ||
2822 error "(5.3.2) expect the stripe count is 2, but got $stripes"
2825 local size=$(stat $name | awk '/Size:/ { print $2 }')
2826 [ $size -eq $((4096 * $bcount)) ] ||
2827 error "(5.4) expect the size $((4096 * $bcount)), but got $size"
2829 cat $name > /dev/null || error "(5.5) cannot read $name"
2831 echo "dummy" >> $name || error "(5.6) cannot write $name"
2833 chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"
2835 touch $name || error "(5.8) cannot touch $name"
2837 rm -f $name || error "(5.9) cannot unlink $name"
# Step 6: old f1 - stripe0 is the hole.
2840 # ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
2842 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2843 if [ $OSTCOUNT -gt 2 ]; then
2844 echo "Check $name, it contains the old f1's stripe1 and stripe2"
2846 echo "Check $name, it contains the old f1's stripe1"
2849 $LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"
2851 pattern=$($LFS getstripe -L $name)
2852 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
2853 error "(6.2) expect pattern flag hole, but got $pattern"
2855 stripes=$($LFS getstripe -c $name)
2856 if [ $OSTCOUNT -gt 2 ]; then
2857 [ $stripes -eq 3 ] ||
2858 error "(6.3.1) expect the stripe count is 3, but got $stripes"
2860 [ $stripes -eq 2 ] ||
2861 error "(6.3.2) expect the stripe count is 2, but got $stripes"
2864 size=$(stat $name | awk '/Size:/ { print $2 }')
2865 [ $size -eq $((4096 * $bcount)) ] ||
2866 error "(6.4) expect the size $((4096 * $bcount)), but got $size"
2868 cat $name > /dev/null && error "(6.5) normal read $name should fail"
# Copy with conv=sync,noerror so dd continues past read errors, and count
# the "Input/output error" lines it prints.
2870 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
2871 bs=4096 2>&1 | grep "Input/output error" | wc -l)
2874 [ $failures -eq 256 ] ||
2875 error "(6.6) expect 256 IO failures, but get $failures"
2877 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
2878 [ $size -eq $((4096 * $bcount)) ] ||
2879 error "(6.7) expect the size $((4096 * $bcount)), but got $size"
2881 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
2882 error "(6.8) write to the LOV EA hole should fail"
2884 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
2885 error "(6.9) write to normal stripe should NOT fail"
2887 echo "foo" >> $name && error "(6.10) append write $name should fail"
2889 chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"
2891 touch $name || error "(6.12) cannot touch $name"
2893 rm -f $name || error "(6.13) cannot unlink $name"
# Step 7: old f2 - stripe1 is the hole; the expectations differ between the
# three-stripe case (7.x.1 checks) and the two-stripe case (7.x.2 checks).
2896 # ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
2898 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
2899 if [ $OSTCOUNT -gt 2 ]; then
2900 echo "Check $name, it contains the old f2's stripe0 and stripe2"
2902 echo "Check $name, it contains the old f2's stripe0"
2905 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
2907 pattern=$($LFS getstripe -L $name)
2908 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
2909 error "(7.2) expect pattern flag hole, but got $pattern"
2911 stripes=$($LFS getstripe -c $name)
2912 size=$(stat $name | awk '/Size:/ { print $2 }')
2913 if [ $OSTCOUNT -gt 2 ]; then
2914 [ $stripes -eq 3 ] ||
2915 error "(7.3.1) expect the stripe count is 3, but got $stripes"
2917 [ $size -eq $((4096 * $bcount)) ] ||
2918 error "(7.4.1) expect size $((4096 * $bcount)), but got $size"
2920 cat $name > /dev/null &&
2921 error "(7.5.1) normal read $name should fail"
2923 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
2924 bs=4096 2>&1 | grep "Input/output error" | wc -l)
2926 [ $failures -eq 256 ] ||
2927 error "(7.6) expect 256 IO failures, but get $failures"
2929 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
2930 [ $size -eq $((4096 * $bcount)) ] ||
2931 error "(7.7) expect the size $((4096 * $bcount)), but got $size"
2933 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
2934 seek=300 && error "(7.8.0) write to the LOV EA hole should fail"
2936 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
2937 error "(7.8.1) write to normal stripe should NOT fail"
2939 echo "foo" >> $name &&
2940 error "(7.8.3) append write $name should fail"
2942 chown $RUNAS_ID:$RUNAS_GID $name ||
2943 error "(7.9.1) cannot chown on $name"
2945 touch $name || error "(7.10.1) cannot touch $name"
2947 [ $stripes -eq 2 ] ||
2948 error "(7.3.2) expect the stripe count is 2, but got $stripes"
2951 [ $size -eq $((4096 * (256 + 0))) ] ||
2952 error "(7.4.2) expect the size $((4096 * 256)), but got $size"
2954 cat $name > /dev/null &&
2955 error "(7.5.2) normal read $name should fail"
2957 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
2958 bs=4096 2>&1 | grep "Input/output error" | wc -l)
2959 [ $failures -eq 256 ] ||
2960 error "(7.6.2) expect 256 IO failures, but get $failures"
2963 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
2964 [ $size -eq $((4096 * $bcount)) ] ||
2965 error "(7.7.2) expect the size $((4096 * $bcount)), got $size"
2967 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
2968 seek=256 && error "(7.8.2) write to the LOV EA hole should fail"
2970 chown $RUNAS_ID:$RUNAS_GID $name ||
2971 error "(7.9.2) cannot chown on $name"
2973 touch $name || error "(7.10.2) cannot touch $name"
2976 rm -f $name || error "(7.11) cannot unlink $name"
# Step 8 only applies when a third stripe exists.
2978 [ $OSTCOUNT -le 2 ] && return
2981 # ${fid3}-R-0 should contains the old f3's stripe0 and stripe1
2983 name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
2984 echo "Check $name, which contains the old f3's stripe0 and stripe1"
2986 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
2988 pattern=$($LFS getstripe -L $name)
2989 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
2990 error "(8.2) expect pattern flag hole, but got $pattern"
2992 stripes=$($LFS getstripe -c $name)
2993 [ $stripes -eq 3 ] ||
2994 error "(8.3) expect the stripe count is 3, but got $stripes"
2996 size=$(stat $name | awk '/Size:/ { print $2 }')
2998 [ $size -eq $((4096 * (256 + 256 + 0))) ] ||
2999 error "(8.4) expect the size $((4096 * 512)), but got $size"
3001 cat $name > /dev/null &&
3002 error "(8.5) normal read $name should fail"
3004 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3005 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3007 [ $failures -eq 256 ] ||
3008 error "(8.6) expect 256 IO failures, but get $failures"
3011 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3012 [ $size -eq $((4096 * $bcount)) ] ||
3013 error "(8.7) expect the size $((4096 * $bcount)), but got $size"
3015 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3016 seek=512 && error "(8.8) write to the LOV EA hole should fail"
3018 chown $RUNAS_ID:$RUNAS_GID $name ||
3019 error "(8.9) cannot chown on $name"
3021 touch $name || error "(8.10) cannot touch $name"
3023 rm -f $name || error "(8.11) cannot unlink $name"
3025 run_test 20 "Handle the orphan with dummy LOV EA slot properly"
# Test 21: starting LFSCK without a "-t <type>" option should run every
# LFSCK component.
# Flow: create 100 files, start LFSCK on MDT0000 with "-s 1 -r", verify that
# both the namespace and the layout component report "scanning-phase1",
# then stop LFSCK again.
# NOTE(review): the embedded line numbers have gaps right before and after
# this body (3027, 3051); presumably the function header and closing brace
# were dropped from this listing -- confirm against the full file.
# Skipped when the MDS is older than 2.5.59.
3028 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.5.59) ]] &&
3029 skip "ignore the test if MDS is older than 2.5.59" && return
3031 check_mount_and_prep
3032 createmany -o $DIR/$tdir/f 100 || error "(0) Fail to create 100 files"
# No "-t" is passed here, unlike START_NAMESPACE/START_LAYOUT.
# Presumably "-s 1" throttles the scan so that it is still in phase 1 when
# the status is sampled below -- TODO confirm.
3034 echo "Start all LFSCK components by default (-s 1)"
3035 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -s 1 -r ||
3036 error "Fail to start LFSCK"
# The second field of the "status" line is the component's current state.
3038 echo "namespace LFSCK should be in 'scanning-phase1' status"
3039 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
3040 [ "$STATUS" == "scanning-phase1" ] ||
3041 error "Expect namespace 'scanning-phase1', but got '$STATUS'"
3043 echo "layout LFSCK should be in 'scanning-phase1' status"
3044 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
3045 [ "$STATUS" == "scanning-phase1" ] ||
3046 error "Expect layout 'scanning-phase1', but got '$STATUS'"
# Likewise no "-t" on stop: all running components are expected to stop.
3048 echo "Stop all LFSCK components by default"
3049 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 ||
3050 error "Fail to stop LFSCK"
3052 run_test 21 "run all LFSCK components by default"
# Test 22a: namespace LFSCK repairs a child directory whose ".." entry
# references a parent that does not exist.
# Flow: create "guard" and "foo" on MDT1, create "foo/dummy" on MDT0 while
# OBD_FAIL_LFSCK_BAD_PARENT is armed (so, per the messages below, dummy's
# ".." references guard), remove guard, run namespace LFSCK, then expect
# unmatched_pairs_repaired == 1 and a working "ls" on foo/dummy.
# NOTE(review): the embedded line numbers have gaps around this body;
# presumably the function header, closing brace and some short lines were
# dropped from this listing -- confirm against the full file.
# Requires at least two MDSes.
3055 [ $MDSCOUNT -lt 2 ] &&
3056 skip "We need at least 2 MDSes for this test" && return
3059 echo "The parent_A references the child directory via some name entry,"
3060 echo "but the child directory back references another parent_B via its"
# Single quotes are used around .. so that the quoting is actually printed;
# nested double quotes would be removed by the shell.
3061 echo "'..' name entry. The parent_B does not exist. Then the namespace"
3062 echo "LFSCK will repair the child directory's '..' name entry."
3065 check_mount_and_prep
3067 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
3068 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
# The fail_loc is armed only for the duration of the mkdir on MDT0.
3070 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
3071 echo "The dummy's dotdot name entry references the guard."
3072 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
3073 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
3074 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
3075 error "(3) Fail to mkdir on MDT0"
3076 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Removing guard leaves dummy's ".." pointing at a non-existent parent.
3078 rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"
3080 echo "Trigger namespace LFSCK to repair unmatched pairs"
3081 $START_NAMESPACE -A -r ||
3082 error "(5) Fail to start LFSCK for namespace"
# Presumably the trailing 6 is the step label reported if the wait fails
# (there is no explicit step "(6)" in this test) -- TODO confirm.
3084 wait_all_targets_blocked namespace completed 6
3086 local repaired=$($SHOW_NAMESPACE |
3087 awk '/^unmatched_pairs_repaired/ { print $2 }')
3088 [ $repaired -eq 1 ] ||
3089 error "(7) Fail to repair unmatched pairs: $repaired"
3091 echo "'ls' should success after namespace LFSCK repairing"
3092 ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
3093 error "(8) ls should success."
3095 run_test 22a "LFSCK can repair unmatched pairs (1)"
# Test 22b: namespace LFSCK repairs a child directory whose ".." entry and
# linkEA reference a parent that exists but has no matching name entry.
# Flow: create "guard" and "foo" on MDT1, create "foo/dummy" on MDT0 while
# OBD_FAIL_LFSCK_BAD_PARENT is armed, check that fid2path does not resolve
# dummy's FID to its real path, run namespace LFSCK, then expect
# unmatched_pairs_repaired == 1 and fid2path to return the real path.
# NOTE(review): the embedded line numbers have gaps around this body;
# presumably the function header, closing brace and some short lines were
# dropped from this listing -- confirm against the full file.
# Requires at least two MDSes.
3098 [ $MDSCOUNT -lt 2 ] &&
3099 skip "We need at least 2 MDSes for this test" && return
3102 echo "The parent_A references the child directory via the name entry_B,"
3103 echo "but the child directory back references another parent_C via its"
# Single quotes are used around .. so that the quoting is actually printed;
# nested double quotes would be removed by the shell.
3104 echo "'..' name entry. The parent_C exists, but there is no the name"
3105 echo "entry_B under the parent_C. Then the namespace LFSCK will repair"
3106 echo "the child directory's '..' name entry and its linkEA."
3109 check_mount_and_prep
3111 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
3112 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
# The fail_loc is armed only for the duration of the mkdir on MDT0. Unlike
# test 22a, guard is NOT removed afterwards, so the wrong parent exists.
3114 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
3115 echo "and bad linkEA. The dummy's dotdot name entry references the"
3116 echo "guard. The dummy's linkEA references n non-exist name entry."
3117 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
3118 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
3119 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
3120 error "(3) Fail to mkdir on MDT0"
3121 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Before the repair, the FID must not map back to the real path.
3123 local dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
3124 echo "fid2path should NOT work on the dummy's FID $dummyfid"
3125 local dummyname=$($LFS fid2path $DIR $dummyfid)
3126 [ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
3127 error "(4) fid2path works unexpectedly."
3129 echo "Trigger namespace LFSCK to repair unmatched pairs"
3130 $START_NAMESPACE -A -r ||
3131 error "(5) Fail to start LFSCK for namespace"
# Presumably the trailing 6 is the step label reported if the wait fails
# (there is no explicit step "(6)" in this test) -- TODO confirm.
3133 wait_all_targets_blocked namespace completed 6
3135 local repaired=$($SHOW_NAMESPACE |
3136 awk '/^unmatched_pairs_repaired/ { print $2 }')
3137 [ $repaired -eq 1 ] ||
3138 error "(7) Fail to repair unmatched pairs: $repaired"
# dummyname is already declared local above; plain re-assignment suffices.
3140 echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
3141 dummyname=$($LFS fid2path $DIR $dummyfid)
3142 [ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
3143 error "(8) fid2path does not work"
3145 run_test 22b "LFSCK can repair unmatched pairs (2)"
# Test 23a: namespace LFSCK handles a dangling name entry, i.e. a name entry
# whose MDT-object does not exist.
# Flow: create d0 on MDT0 and d0/d1 on MDT1, rmdir d1 while
# OBD_FAIL_LFSCK_DANGLING2 is armed on mds2, then run LFSCK twice:
#   1) without "-C": dangling_repaired must be 1 but "ls" must still fail;
#   2) with "-C":    dangling_repaired must be 1 and "ls" must succeed.
# NOTE(review): the embedded line numbers have gaps around this body;
# presumably the function header, closing brace and some short lines were
# dropped from this listing -- confirm against the full file.
# Requires at least two MDSes.
3148 [ $MDSCOUNT -lt 2 ] &&
3149 skip "We need at least 2 MDSes for this test" && return
3152 echo "The name entry is there, but the MDT-object for such name "
3153 echo "entry does not exist. The namespace LFSCK should find out "
3154 echo "and repair the inconsistency as required."
3157 check_mount_and_prep
3159 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3160 $LFS mkdir -i 1 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT1"
# The fail_loc is armed on mds2 only for the duration of the rmdir.
3162 echo "Inject failure stub on MDT1 to simulate dangling name entry"
3163 #define OBD_FAIL_LFSCK_DANGLING2 0x1620
3164 do_facet mds2 $LCTL set_param fail_loc=0x1620
3165 rmdir $DIR/$tdir/d0/d1 || error "(3) Fail to rmdir d1"
3166 do_facet mds2 $LCTL set_param fail_loc=0
3168 echo "'ls' should fail because of dangling name entry"
3169 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(4) ls should fail."
# First pass: no "-C", so the entry is counted but the object is not
# re-created (see the message before step (8)).
3171 echo "Trigger namespace LFSCK to find out dangling name entry"
3172 $START_NAMESPACE -A -r ||
3173 error "(5) Fail to start LFSCK for namespace"
# Presumably the trailing number is the step label reported if the wait
# fails (steps "(6)" and "(10)" are otherwise absent) -- TODO confirm.
3175 wait_all_targets_blocked namespace completed 6
3177 local repaired=$($SHOW_NAMESPACE |
3178 awk '/^dangling_repaired/ { print $2 }')
3179 [ $repaired -eq 1 ] ||
3180 error "(7) Fail to repair dangling name entry: $repaired"
3182 echo "'ls' should fail because not re-create MDT-object by default"
3183 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(8) ls should fail."
# Second pass: "-C" is added; afterwards the path must be listable again.
3185 echo "Trigger namespace LFSCK again to repair dangling name entry"
3186 $START_NAMESPACE -A -r -C ||
3187 error "(9) Fail to start LFSCK for namespace"
3189 wait_all_targets_blocked namespace completed 10
3191 repaired=$($SHOW_NAMESPACE |
3192 awk '/^dangling_repaired/ { print $2 }')
3193 [ $repaired -eq 1 ] ||
3194 error "(11) Fail to repair dangling name entry: $repaired"
3196 echo "'ls' should success after namespace LFSCK repairing"
3197 ls -ail $DIR/$tdir/d0/d1 > /dev/null || error "(12) ls should success."
3199 run_test 23a "LFSCK can repair dangling name entry (1)"
# Test 23b: a hard link's name entry references a non-existent object; LFSCK
# first re-creates that object and later replaces it with the real one.
# Flow: create f0 ("dummy") and f1 ("dead") in the same FID sequence, hard
# link f0 as "foo" while OBD_FAIL_LFSCK_DANGLING3 is armed with fail_val set
# to f1's OID, delete f1, run namespace LFSCK with "-C", then expect
# dangling_repaired == 1, multiple_linked_repaired == 1 and "foo" to
# contain "dummy".
# NOTE(review): the embedded line numbers have gaps in and around this body;
# presumably the function header, the closers of the "if" and "|| {"
# constructs and some short statements were dropped from this listing --
# confirm against the full file.
3203 echo "The objectA has multiple hard links, one of them corresponding"
3204 echo "to the name entry_B. But there is something wrong for the name"
3205 echo "entry_B and cause entry_B to references non-exist object_C."
3206 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3207 echo "as dangling, and re-create the lost object_C. When the LFSCK"
3208 echo "comes to the second-stage scanning, it will find that the"
3209 echo "former re-creating object_C is not proper, and will try to"
3210 echo "replace the object_C with the real object_A."
3213 check_mount_and_prep
# The bare path2fid calls only print FIDs into the test log.
3215 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3216 $LFS path2fid $DIR/$tdir/d0
# Spare files; they are unlinked before f1 is deleted (see LU-7429 below).
3218 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
3220 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3221 $LFS path2fid $DIR/$tdir/d0/f0
3223 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
3224 $LFS path2fid $DIR/$tdir/d0/f1
# The FID is split on ':'; field 1 is taken as the sequence, field 2 (used
# further down) as the object id.
3226 local SEQ0=$($LFS path2fid $DIR/$tdir/d0/f0 | awk -F':' '{print $1}')
3227 local SEQ1=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $1}')
# If the sequences differ, f0 is re-created once.
3229 if [ "$SEQ0" != "$SEQ1" ]; then
3230 # To guarantee that the f0 and f1 are in the same FID seq
3231 rm -f $DIR/$tdir/d0/f0 ||
3232 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
3233 echo "dummy" > $DIR/$tdir/d0/f0 ||
3234 error "(3.2) Fail to touch on MDT0"
3235 $LFS path2fid $DIR/$tdir/d0/f0
# printf %d converts f1's OID to decimal before it is passed as fail_val.
3238 local OID=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $2}')
3239 OID=$(printf %d $OID)
# The fail_loc/fail_val pair is armed only for the duration of the "ln".
3241 echo "Inject failure stub on MDT0 to simulate dangling name entry"
3242 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
3243 do_facet $SINGLEMDS $LCTL set_param fail_val=$OID fail_loc=0x1621
3244 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
3245 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3247 # If there is creation after the dangling injection, it may re-use
3248 # the just released local object (inode) that is referenced by the
3249 # dangling name entry. It will fail the dangling injection.
3250 # So before deleting the target object for the dangling name entry,
3251 # remove some other objects to avoid the target object being reused
3252 # by some potential creations. LU-7429
3253 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
3255 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
3257 echo "'ls' should fail because of dangling name entry"
3258 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3259 error "(6) ls should fail."
3261 echo "Trigger namespace LFSCK to find out dangling name entry"
3262 $START_NAMESPACE -r -C ||
3263 error "(7) Fail to start LFSCK for namespace"
# Wait for the namespace status on $SINGLEMDS to read "completed".
# Presumably 32 is the maximum wait in seconds -- TODO confirm.
3265 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3266 mdd.${MDT_DEV}.lfsck_namespace |
3267 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3269 error "(8) unexpected status"
3272 local repaired=$($SHOW_NAMESPACE |
3273 awk '/^dangling_repaired/ { print $2 }')
3274 [ $repaired -eq 1 ] ||
3275 error "(9) Fail to repair dangling name entry: $repaired"
3277 repaired=$($SHOW_NAMESPACE |
3278 awk '/^multiple_linked_repaired/ { print $2 }')
3279 [ $repaired -eq 1 ] ||
3280 error "(10) Fail to drop the former created object: $repaired"
# "foo" must expose f0's content again.
3282 local data=$(cat $DIR/$tdir/d0/foo)
3283 [ "$data" == "dummy" ] ||
3284 error "(11) The $DIR/$tdir/d0/foo is not recovered: $data"
3286 run_test 23b "LFSCK can repair dangling name entry (2)"
# Test 23c: same dangling hard-link setup as test 23b, but the object that
# LFSCK re-creates is modified before the second-stage scan, so LFSCK must
# not replace it with the real object.
# Flow: build the dangling entry "foo", arm OBD_FAIL_LFSCK_DELAY3, start
# namespace LFSCK with "-C", wait until "foo" is stat-able with size 0,
# append to it, clear the delay, wait for "completed", then expect
# dangling_repaired == 1 and the content of "foo" to differ from "dummy".
# NOTE(review): the embedded line numbers have gaps in and around this body;
# presumably the function header, the closers of the "if" and "|| {"
# constructs and some short statements were dropped from this listing --
# confirm against the full file.
3290 echo "The objectA has multiple hard links, one of them corresponding"
3291 echo "to the name entry_B. But there is something wrong for the name"
3292 echo "entry_B and cause entry_B to references non-exist object_C."
3293 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3294 echo "as dangling, and re-create the lost object_C. And then others"
3295 echo "modified the re-created object_C. When the LFSCK comes to the"
3296 echo "second-stage scanning, it will find that the former re-creating"
3297 echo "object_C maybe wrong and try to replace the object_C with the"
3298 echo "real object_A. But because object_C has been modified, so the"
3299 echo "LFSCK cannot replace it."
# Paired with stop_full_debug_logging further down.
# NOTE(review): the visible error paths do not call the stop helper first.
3302 start_full_debug_logging
3304 check_mount_and_prep
# The bare path2fid calls only print FIDs into the test log.
3306 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3307 $LFS path2fid $DIR/$tdir/d0
# Spare files; they are unlinked before f1 is deleted (see LU-7429 below).
3309 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
3311 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3312 $LFS path2fid $DIR/$tdir/d0/f0
3314 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
3315 $LFS path2fid $DIR/$tdir/d0/f1
# The FID is split on ':'; field 1 is taken as the sequence, field 2 (used
# further down) as the object id.
3317 local SEQ0=$($LFS path2fid $DIR/$tdir/d0/f0 | awk -F':' '{print $1}')
3318 local SEQ1=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $1}')
# If the sequences differ, f0 is re-created once.
3320 if [ "$SEQ0" != "$SEQ1" ]; then
3321 # To guarantee that the f0 and f1 are in the same FID seq
3322 rm -f $DIR/$tdir/d0/f0 ||
3323 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
3324 echo "dummy" > $DIR/$tdir/d0/f0 ||
3325 error "(3.2) Fail to touch on MDT0"
3326 $LFS path2fid $DIR/$tdir/d0/f0
# printf %d converts f1's OID to decimal before it is passed as fail_val.
3329 local OID=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $2}')
3330 OID=$(printf %d $OID)
# The fail_loc/fail_val pair is armed only for the duration of the "ln".
3332 echo "Inject failure stub on MDT0 to simulate dangling name entry"
3333 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
3334 do_facet $SINGLEMDS $LCTL set_param fail_val=$OID fail_loc=0x1621
3335 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
3336 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3338 # If there is creation after the dangling injection, it may re-use
3339 # the just released local object (inode) that is referenced by the
3340 # dangling name entry. It will fail the dangling injection.
3341 # So before deleting the target object for the dangling name entry,
3342 # remove some other objects to avoid the target object being reused
3343 # by some potential creations. LU-7429
3344 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
3346 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
3348 echo "'ls' should fail because of dangling name entry"
3349 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3350 error "(6) ls should fail."
# Armed before LFSCK starts and cleared only after the write below.
# Presumably it holds LFSCK back long enough for the client to modify the
# re-created object -- TODO confirm the meaning of fail_val=10.
3352 #define OBD_FAIL_LFSCK_DELAY3 0x1602
3353 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
3355 echo "Trigger namespace LFSCK to find out dangling name entry"
3356 $START_NAMESPACE -r -C ||
3357 error "(7) Fail to start LFSCK for namespace"
# Wait until "foo" reports Size 0. $LTIME is defined outside this block.
3359 wait_update_facet client "stat $DIR/$tdir/d0/foo |
3360 awk '/Size/ { print \\\$2 }'" "0" $LTIME || {
3361 stat $DIR/$tdir/d0/foo
3363 error "(8) unexpected size"
# Modify the re-created object while LFSCK is still delayed.
3366 echo "data" >> $DIR/$tdir/d0/foo || error "(9) Fail to write"
3367 cancel_lru_locks osc
# Release the delay and let LFSCK run to completion.
3369 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3370 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3371 mdd.${MDT_DEV}.lfsck_namespace |
3372 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3374 error "(10) unexpected status"
3377 stop_full_debug_logging
3379 local repaired=$($SHOW_NAMESPACE |
3380 awk '/^dangling_repaired/ { print $2 }')
3381 [ $repaired -eq 1 ] ||
3382 error "(11) Fail to repair dangling name entry: $repaired"
# The modified object must have been kept, so f0's content is not expected.
3384 local data=$(cat $DIR/$tdir/d0/foo)
3385 [ "$data" != "dummy" ] ||
3386 error "(12) The $DIR/$tdir/d0/foo should not be recovered"
3388 run_test 23c "LFSCK can repair dangling name entry (3)"
# Test 24: two MDT-objects carry a linkEA entry for the same name entry, but
# the name entry references only one of them. LFSCK is expected to drop the
# bogus linkEA entry and move the unreferenced object to lost+found.
# Flow: create d0 on MDT1 with sub-directories guard and dummy, create
# guard/foo, create and remove dummy/foo on MDT0 while
# OBD_FAIL_LFSCK_MUL_REF is armed, run namespace LFSCK, then expect
# multiple_referenced_repaired == 1 and an orphan named
# "<cfid>-<pfid>-D-0" under .lustre/lost+found/MDT0000.
# NOTE(review): the embedded line numbers have gaps in and around this body;
# presumably the function header, the "else"/"fi" of the pfid selection and
# some short statements were dropped from this listing -- confirm against
# the full file.
# Requires at least two MDSes.
3391 [ $MDSCOUNT -lt 2 ] &&
3392 skip "We need at least 2 MDSes for this test" && return
3395 echo "Two MDT-objects back reference the same name entry via their"
3396 echo "each own linkEA entry, but the name entry only references one"
3397 echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
3398 echo "for the MDT-object that is not recognized. If such MDT-object"
3399 echo "has no other linkEA entry after the removing, then the LFSCK"
3400 echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."
3403 check_mount_and_prep
# NOTE(review): steps "(1)" is used for two different failures below.
3405 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3407 mkdir $DIR/$tdir/d0/guard || error "(1) Fail to mkdir guard"
3408 $LFS path2fid $DIR/$tdir/d0/guard
3410 mkdir $DIR/$tdir/d0/dummy || error "(2) Fail to mkdir dummy"
3411 $LFS path2fid $DIR/$tdir/d0/dummy
# pfid is the parent FID used in the expected orphan name below: guard's FID
# when the MDS backend is not ldiskfs. The second assignment (dummy's FID)
# is presumably the "else" branch for ldiskfs -- TODO confirm.
3414 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
3415 pfid=$($LFS path2fid $DIR/$tdir/d0/guard)
3417 pfid=$($LFS path2fid $DIR/$tdir/d0/dummy)
3420 touch $DIR/$tdir/d0/guard/foo ||
3421 error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"
3423 echo "Inject failure stub on MDT0 to simulate the case that"
3424 echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
3425 echo "that references $DIR/$tdir/d0/guard/foo."
3426 echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
3427 echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
3428 echo "there with the same linkEA entry as another MDT-object"
3429 echo "$DIR/$tdir/d0/guard/foo has"
# The fail_loc stays armed across both the mkdir and the rmdir of dummy/foo.
# cfid records the FID of the object that should end up in lost+found.
3431 #define OBD_FAIL_LFSCK_MUL_REF 0x1622
3432 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
3433 $LFS mkdir -i 0 $DIR/$tdir/d0/dummy/foo ||
3434 error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
3435 $LFS path2fid $DIR/$tdir/d0/dummy/foo
3436 local cfid=$($LFS path2fid $DIR/$tdir/d0/dummy/foo)
3437 rmdir $DIR/$tdir/d0/dummy/foo ||
3438 error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
3439 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3441 echo "stat $DIR/$tdir/d0/dummy/foo should fail"
3442 stat $DIR/$tdir/d0/dummy/foo > /dev/null 2>&1 &&
3443 error "(6) stat successfully unexpectedly"
3445 echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
3446 $START_NAMESPACE -A -r ||
3447 error "(7) Fail to start LFSCK for namespace"
# Presumably the trailing 8 is the step label reported if the wait fails
# (there is no explicit step "(8)" in this test) -- TODO confirm.
3449 wait_all_targets_blocked namespace completed 8
3451 local repaired=$($SHOW_NAMESPACE |
3452 awk '/^multiple_referenced_repaired/ { print $2 }')
3453 [ $repaired -eq 1 ] ||
3454 error "(9) Fail to repair multiple referenced name entry: $repaired"
3456 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
3457 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
3458 error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# Expected orphan name: <child FID>-<parent FID>-D-0.
3460 local cname="$cfid-$pfid-D-0"
3461 ls -ail $MOUNT/.lustre/lost+found/MDT0000/$cname ||
3462 error "(11) .lustre/lost+found/MDT0000/ should not be empty"
3464 run_test 24 "LFSCK can repair multiple-referenced name entry"
# Test 25: the file type recorded in a name entry does not match the type of
# the referenced object; namespace LFSCK should fix the name entry.
# Flow: create d0 on MDT0, create d0/foo while OBD_FAIL_LFSCK_BAD_TYPE is
# armed, run namespace LFSCK, then expect bad_file_type_repaired == 1 and a
# working "ls" on d0.
# NOTE(review): the embedded line numbers have gaps in and around this body;
# presumably the function header, the closer of the "|| {" construct and
# some short statements were dropped from this listing -- confirm against
# the full file.
# The injection is only available on an ldiskfs backend.
3467 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
3468 skip "Only support to inject failure on ldiskfs" && return
3471 echo "The file type in the name entry does not match the file type"
3472 echo "claimed by the referenced object. Then the LFSCK will update"
3473 echo "the file type in the name entry."
3476 check_mount_and_prep
3478 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3480 echo "Inject failure stub on MDT0 to simulate the case that"
3481 echo "the file type stored in the name entry is wrong."
# The fail_loc is armed only for the duration of the "touch".
3483 #define OBD_FAIL_LFSCK_BAD_TYPE 0x1623
3484 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
3485 touch $DIR/$tdir/d0/foo || error "(2) Fail to touch $DIR/$tdir/d0/foo"
3486 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3488 echo "Trigger namespace LFSCK to repair bad file type in the name entry"
3489 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace"
# Wait for the namespace status on $SINGLEMDS to read "completed".
# Presumably 32 is the maximum wait in seconds -- TODO confirm.
3491 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3492 mdd.${MDT_DEV}.lfsck_namespace |
3493 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3495 error "(4) unexpected status"
3498 local repaired=$($SHOW_NAMESPACE |
3499 awk '/^bad_file_type_repaired/ { print $2 }')
3500 [ $repaired -eq 1 ] ||
3501 error "(5) Fail to repair bad file type in name entry: $repaired"
3503 ls -ail $DIR/$tdir/d0 || error "(6) Fail to 'ls' the $DIR/$tdir/d0"
3505 run_test 25 "LFSCK can repair bad file type in the name entry"
# Test 26a: a local name entry that an MDT-object still back-references is
# lost; namespace LFSCK should re-insert it.
# Flow: create d0/foo on MDT0 plus a hard link "dummy", unlink foo while
# OBD_FAIL_LFSCK_NO_NAMEENTRY is armed, run namespace LFSCK, then expect
# lost_dirent_repaired == 1, a working "ls" on foo and an unchanged FID.
# NOTE(review): the embedded line numbers have gaps around this body;
# presumably the function header, closing brace and some short lines were
# dropped from this listing -- confirm against the full file.
3509 echo "The local name entry back referenced by the MDT-object is lost."
3510 echo "The namespace LFSCK will add the missing local name entry back"
3511 echo "to the normal namespace."
3514 check_mount_and_prep
3516 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3517 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
# Remember foo's FID so it can be compared after the repair.
3518 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
3520 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
3521 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3523 echo "Inject failure stub on MDT0 to simulate the case that"
3524 echo "foo's name entry will be removed, but the foo's object"
3525 echo "and its linkEA are kept in the system."
# The fail_loc is armed only for the duration of the "rm".
3527 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
3528 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3529 rm -f $DIR/$tdir/d0/foo || error "(4) Fail to unlink $DIR/$tdir/d0/foo"
3530 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3532 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3533 error "(5) 'ls' should fail"
3535 echo "Trigger namespace LFSCK to repair the missing remote name entry"
3536 $START_NAMESPACE -r -A ||
3537 error "(6) Fail to start LFSCK for namespace"
# Presumably the trailing 7 is the step label reported if the wait fails
# (there is no explicit step "(7)" in this test) -- TODO confirm.
3539 wait_all_targets_blocked namespace completed 7
3541 local repaired=$($SHOW_NAMESPACE |
3542 awk '/^lost_dirent_repaired/ { print $2 }')
3543 [ $repaired -eq 1 ] ||
3544 error "(8) Fail to repair lost dirent: $repaired"
3546 ls -ail $DIR/$tdir/d0/foo ||
3547 error "(9) Fail to 'ls' $DIR/$tdir/d0/foo"
# The restored name must resolve to the same object as before.
3549 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
3550 [ "$foofid" == "$foofid2" ] ||
3551 error "(10) foo's FID changed: $foofid, $foofid2"
3553 run_test 26a "LFSCK can add the missing local name entry back to the namespace"
# Test 26b: a remote name entry that an MDT-object still back-references is
# lost; namespace LFSCK should re-insert it.
# Flow: create d0 on MDT1 and d0/foo on MDT0, rmdir foo while
# OBD_FAIL_LFSCK_NO_NAMEENTRY is armed, run namespace LFSCK, then expect
# lost_dirent_repaired == 1, a working "ls" on foo and an unchanged FID.
# NOTE(review): the embedded line numbers have gaps around this body;
# presumably the function header, closing brace and some short lines were
# dropped from this listing -- confirm against the full file.
# Requires at least two MDSes.
3556 [ $MDSCOUNT -lt 2 ] &&
3557 skip "We need at least 2 MDSes for this test" && return
3560 echo "The remote name entry back referenced by the MDT-object is lost."
3561 echo "The namespace LFSCK will add the missing remote name entry back"
3562 echo "to the normal namespace."
3565 check_mount_and_prep
# Parent and child live on different MDTs, which makes the entry "remote".
3567 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3568 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
# Remember foo's FID so it can be compared after the repair.
3569 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
3571 echo "Inject failure stub on MDT0 to simulate the case that"
3572 echo "foo's name entry will be removed, but the foo's object"
3573 echo "and its linkEA are kept in the system."
# The fail_loc is armed only for the duration of the "rmdir".
3575 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
3576 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3577 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
3578 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3580 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3581 error "(4) 'ls' should fail"
3583 echo "Trigger namespace LFSCK to repair the missing remote name entry"
3584 $START_NAMESPACE -r -A ||
3585 error "(5) Fail to start LFSCK for namespace"
# Presumably the trailing 6 is the step label reported if the wait fails
# (there is no explicit step "(6)" in this test) -- TODO confirm.
3587 wait_all_targets_blocked namespace completed 6
3589 local repaired=$($SHOW_NAMESPACE |
3590 awk '/^lost_dirent_repaired/ { print $2 }')
3591 [ $repaired -eq 1 ] ||
3592 error "(7) Fail to repair lost dirent: $repaired"
3594 ls -ail $DIR/$tdir/d0/foo ||
3595 error "(8) Fail to 'ls' $DIR/$tdir/d0/foo"
# The restored name must resolve to the same object as before.
3597 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
3598 [ "$foofid" == "$foofid2" ] ||
3599 error "(9) foo's FID changed: $foofid, $foofid2"
3601 run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
# Test 27a: the local parent directory referenced by an object's linkEA is
# lost; namespace LFSCK should re-create the parent as an orphan.
# Flow: create d0/foo on MDT0 plus a hard link "dummy", unlink both while
# OBD_FAIL_LFSCK_NO_NAMEENTRY is armed, remove d0, run namespace LFSCK, then
# expect lost_dirent_repaired == 1 and an entry matching "*-P-*" under
# .lustre/lost+found/MDT0000.
# NOTE(review): the embedded line numbers have gaps around this body;
# presumably the function header, closing brace and some short lines were
# dropped from this listing -- confirm against the full file.
# NOTE(review): step labels "(5)" and "(6)" are each used twice below.
3605 echo "The local parent referenced by the MDT-object linkEA is lost."
3606 echo "The namespace LFSCK will re-create the lost parent as orphan."
3609 check_mount_and_prep
3611 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3612 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3613 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
3614 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3616 echo "Inject failure stub on MDT0 to simulate the case that"
3617 echo "foo's name entry will be removed, but the foo's object"
3618 echo "and its linkEA are kept in the system. And then remove"
3619 echo "another hard link and the parent directory."
# The fail_loc stays armed while both names are unlinked.
3621 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
3622 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3623 rm -f $DIR/$tdir/d0/foo ||
3624 error "(4) Fail to unlink $DIR/$tdir/d0/foo"
3625 rm -f $DIR/$tdir/d0/dummy ||
3626 error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
3627 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# With the injection cleared, remove the parent directory itself.
3629 rm -rf $DIR/$tdir/d0 || error "(5) Fail to unlink the dir d0"
3630 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(6) 'ls' should fail"
3632 echo "Trigger namespace LFSCK to repair the lost parent"
3633 $START_NAMESPACE -r -A ||
3634 error "(6) Fail to start LFSCK for namespace"
# Presumably the trailing 7 is the step label reported if the wait fails
# (there is no explicit step "(7)" in this test) -- TODO confirm.
3636 wait_all_targets_blocked namespace completed 7
3638 local repaired=$($SHOW_NAMESPACE |
3639 awk '/^lost_dirent_repaired/ { print $2 }')
3640 [ $repaired -eq 1 ] ||
3641 error "(8) Fail to repair lost dirent: $repaired"
3643 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
3644 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
3645 error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
3647 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# The pattern is quoted so that find receives it literally; unquoted, the
# shell could expand it against the current directory first. cname is kept
# local so it does not leak into later tests.
3649 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-P-*')
3650 [ ! -z "$cname" ] ||
3651 error "(10) .lustre/lost+found/MDT0000/ should not be empty"
3653 run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
# Test 27b: the remote parent directory referenced by an object's linkEA is
# lost; namespace LFSCK should re-create the parent as an orphan.
# Flow: create d0 on MDT1 and d0/foo on MDT0, rmdir foo while
# OBD_FAIL_LFSCK_NO_NAMEENTRY is armed, remove d0, run namespace LFSCK, then
# expect lost_dirent_repaired == 1 and an entry matching "*-P-*" under
# .lustre/lost+found/MDT0001.
# NOTE(review): the embedded line numbers have gaps around this body;
# presumably the function header, closing brace and some short lines were
# dropped from this listing -- confirm against the full file.
# Requires at least two MDSes.
3656 [ $MDSCOUNT -lt 2 ] &&
3657 skip "We need at least 2 MDSes for this test" && return
3660 echo "The remote parent referenced by the MDT-object linkEA is lost."
3661 echo "The namespace LFSCK will re-create the lost parent as orphan."
3664 check_mount_and_prep
3666 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3667 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
# Prints d0's FID into the test log; the value is not captured.
3669 $LFS path2fid $DIR/$tdir/d0
3671 echo "Inject failure stub on MDT0 to simulate the case that"
3672 echo "foo's name entry will be removed, but the foo's object"
3673 echo "and its linkEA are kept in the system. And then remove"
3674 echo "the parent directory."
# The fail_loc is armed only for the duration of the "rmdir" of foo.
3676 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
3677 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
3678 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
3679 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# With the injection cleared, remove the parent directory itself.
3681 rmdir $DIR/$tdir/d0 || error "(4) Fail to unlink the dir d0"
3682 ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(5) 'ls' should fail"
3684 echo "Trigger namespace LFSCK to repair the missing remote name entry"
3685 $START_NAMESPACE -r -A ||
3686 error "(6) Fail to start LFSCK for namespace"
# Presumably the trailing 7 is the step label reported if the wait fails
# (there is no explicit step "(7)" in this test) -- TODO confirm.
3688 wait_all_targets_blocked namespace completed 7
3690 local repaired=$($SHOW_NAMESPACE |
3691 awk '/^lost_dirent_repaired/ { print $2 }')
3692 [ $repaired -eq 1 ] ||
3693 error "(8) Fail to repair lost dirent: $repaired"
3695 ls -ail $MOUNT/.lustre/lost+found/
# The lost parent lived on MDT1, hence the MDT0001 lost+found directory.
3697 echo "There should be an orphan under .lustre/lost+found/MDT0001/"
3698 [ -d $MOUNT/.lustre/lost+found/MDT0001 ] ||
3699 error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"
3701 ls -ail $MOUNT/.lustre/lost+found/MDT0001/
# The pattern is quoted so that find receives it literally; unquoted, the
# shell could expand it against the current directory first. cname is kept
# local so it does not leak into later tests.
3703 local cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-P-*')
3704 [ ! -z "$cname" ] ||
3705 error "(10) .lustre/lost+found/MDT0001/ should not be empty"
3707 run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
# Test 28: when an MDT failed to answer name-entry verification during the
# first-stage scan, LFSCK must skip orphan handling for that MDT only.
# Flow: build d1 (MDT0) with children on MDT1 and d2 (MDT1) with children on
# MDT0, drop the name entries of d1/a1 and d2/a2 while
# OBD_FAIL_LFSCK_NO_NAMEENTRY is armed on both MDSes, arm
# OBD_FAIL_LFSCK_BAD_NETWORK on mds2 and run LFSCK: mds1 must end "partial"
# with 0 repairs, mds2 "completed" with 1 repair. A second, undisturbed run
# must then report 1 repair on mds1 and 0 on mds2.
# NOTE(review): the embedded line numbers have gaps in and around this body;
# presumably the function header, the closers of the "|| {" constructs and
# some short statements were dropped from this listing -- confirm against
# the full file.
# Requires at least two MDTs.
3710 [ $MDSCOUNT -lt 2 ] &&
3711 skip "The test needs at least 2 MDTs" && return
3714 echo "The target name entry is lost. The LFSCK should insert the"
3715 echo "orphan MDT-object under .lustre/lost+found/MDTxxxx. But if"
3716 echo "the MDT (on which the orphan MDT-object resides) has ever"
3717 echo "failed to respond some name entry verification during the"
3718 echo "first stage-scanning, then the LFSCK should skip to handle"
3719 echo "orphan MDT-object on this MDT. But other MDTs should not"
3723 check_mount_and_prep
# Each parent has its children on the other MDT.
# NOTE(review): these mkdir calls are not checked for failure.
3724 $LFS mkdir -i 0 $DIR/$tdir/d1
3725 $LFS mkdir -i 1 $DIR/$tdir/d1/a1
3726 $LFS mkdir -i 1 $DIR/$tdir/d1/a2
3728 $LFS mkdir -i 1 $DIR/$tdir/d2
3729 $LFS mkdir -i 0 $DIR/$tdir/d2/a1
3730 $LFS mkdir -i 0 $DIR/$tdir/d2/a2
3732 echo "Inject failure stub on MDT0 to simulate the case that"
3733 echo "d1/a1's name entry will be removed, but the d1/a1's object"
3734 echo "and its linkEA are kept in the system. And the case that"
3735 echo "d2/a2's name entry will be removed, but the d2/a2's object"
3736 echo "and its linkEA are kept in the system."
# The fail_loc is armed on both MDSes for the duration of the two rmdirs.
3738 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
3739 do_facet mds1 $LCTL set_param fail_loc=0x1624
3740 do_facet mds2 $LCTL set_param fail_loc=0x1624
3741 rmdir $DIR/$tdir/d1/a1 || error "(1) Fail to rmdir $DIR/$tdir/d1/a1"
3742 rmdir $DIR/$tdir/d2/a2 || error "(2) Fail to rmdir $DIR/$tdir/d2/a2"
3743 do_facet mds1 $LCTL set_param fail_loc=0
3744 do_facet mds2 $LCTL set_param fail_loc=0
3746 cancel_lru_locks mdc
3747 cancel_lru_locks osc
# This injection is set on mds2 and stays armed for the whole first run.
3749 echo "Inject failure, to simulate the MDT0 fail to handle"
3750 echo "MDT1 LFSCK request during the first-stage scanning."
3751 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
3752 do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0
3754 echo "Trigger namespace LFSCK on all devices to find out orphan object"
3755 $START_NAMESPACE -r -A ||
3756 error "(3) Fail to start LFSCK for namespace"
# First run: the two MDSes are expected to finish in different states.
# Presumably 32 is the maximum wait in seconds -- TODO confirm.
3758 wait_update_facet mds1 "$LCTL get_param -n \
3759 mdd.$(facet_svc mds1).lfsck_namespace |
3760 awk '/^status/ { print \\\$2 }'" "partial" 32 || {
3761 error "(4) mds1 is not the expected 'partial'"
3764 wait_update_facet mds2 "$LCTL get_param -n \
3765 mdd.$(facet_svc mds2).lfsck_namespace |
3766 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3767 error "(5) mds2 is not the expected 'completed'"
3770 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
# After the first run only mds2 may have repaired its lost dirent.
3772 local repaired=$(do_facet mds1 $LCTL get_param -n \
3773 mdd.$(facet_svc mds1).lfsck_namespace |
3774 awk '/^lost_dirent_repaired/ { print $2 }')
3775 [ $repaired -eq 0 ] ||
3776 error "(6) Expect 0 fixed on mds1, but got: $repaired"
3778 repaired=$(do_facet mds2 $LCTL get_param -n \
3779 mdd.$(facet_svc mds2).lfsck_namespace |
3780 awk '/^lost_dirent_repaired/ { print $2 }')
3781 [ $repaired -eq 1 ] ||
3782 error "(7) Expect 1 fixed on mds2, but got: $repaired"
# Second run without any injection: the entry skipped before is repaired on
# mds1, and mds2 has nothing left to fix.
3784 echo "Trigger namespace LFSCK on all devices again to cleanup"
3785 $START_NAMESPACE -r -A ||
3786 error "(8) Fail to start LFSCK for namespace"
# Presumably the trailing 9 is the step label reported if the wait fails
# (there is no explicit step "(9)" in this test) -- TODO confirm.
3788 wait_all_targets_blocked namespace completed 9
3790 local repaired=$(do_facet mds1 $LCTL get_param -n \
3791 mdd.$(facet_svc mds1).lfsck_namespace |
3792 awk '/^lost_dirent_repaired/ { print $2 }')
3793 [ $repaired -eq 1 ] ||
3794 error "(10) Expect 1 fixed on mds1, but got: $repaired"
3796 repaired=$(do_facet mds2 $LCTL get_param -n \
3797 mdd.$(facet_svc mds2).lfsck_namespace |
3798 awk '/^lost_dirent_repaired/ { print $2 }')
3799 [ $repaired -eq 0 ] ||
3800 error "(11) Expect 0 fixed on mds2, but got: $repaired"
3802 run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
# Test 29a (currently disabled, see the commented-out run_test at the end):
# an object's nlink is larger than its number of known name entries;
# namespace LFSCK should lower nlink to match.
# Flow: create d0/foo on MDT0, hard link it as h1 while
# OBD_FAIL_LFSCK_MORE_NLINK is armed so that stat reports 3 links, run
# namespace LFSCK, then expect nlinks_repaired == 1 and a link count of 2.
# NOTE(review): the embedded line numbers have gaps around this body;
# presumably the function header, closing brace and some short lines were
# dropped from this listing -- confirm against the full file.
3806 echo "The object's nlink attribute is larger than the object's known"
3807 echo "name entries count. The LFSCK will repair the object's nlink"
3808 echo "attribute to match the known name entries count"
3811 check_mount_and_prep
3813 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3814 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3816 echo "Inject failure stub on MDT0 to simulate the case that foo's"
3817 echo "nlink attribute is larger than its name entries count."
# The fail_loc is armed only for the duration of the "ln".
3819 #define OBD_FAIL_LFSCK_MORE_NLINK 0x1625
3820 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
3821 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
3822 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3823 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Two names exist (foo and h1); a count of 3 proves the injection worked.
# Locks are dropped first, presumably so stat does not use cached
# attributes -- TODO confirm.
3825 cancel_lru_locks mdc
3826 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
3827 [ $count -eq 3 ] || error "(4) Cannot inject error: $count"
3829 echo "Trigger namespace LFSCK to repair the nlink count"
3830 $START_NAMESPACE -r -A ||
3831 error "(5) Fail to start LFSCK for namespace"
# Presumably the trailing 6 is the step label reported if the wait fails
# (there is no explicit step "(6)" in this test) -- TODO confirm.
3833 wait_all_targets_blocked namespace completed 6
3835 local repaired=$($SHOW_NAMESPACE |
3836 awk '/^nlinks_repaired/ { print $2 }')
3837 [ $repaired -eq 1 ] ||
3838 error "(7) Fail to repair nlink count: $repaired"
# After the repair the link count must equal the two real names.
3840 cancel_lru_locks mdc
3841 count=$(stat --format=%h $DIR/$tdir/d0/foo)
3842 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
3844 # Disable 29a, we only allow nlink to be updated if the known linkEA
3845 # entries is larger than nlink count.
3847 #run_test 29a "LFSCK can repair bad nlink count (1)"
# Body of test 29b (function header/footer lines are not part of this
# excerpt).
# Flow: create d0 on MDT index 0 with a file foo, hard-link foo as h1 while
# fail_loc=0x1626 is set on $SINGLEMDS, check that stat then reports only 1
# link, run namespace LFSCK, and expect nlinks_repaired == 1 and a link count
# of 2.
3851 echo "The object's nlink attribute is smaller than the object's known"
3852 echo "name entries count. The LFSCK will repair the object's nlink"
3853 echo "attribute to match the known name entries count"
3856 check_mount_and_prep
3858 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3859 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
3861 echo "Inject failure stub on MDT0 to simulate the case that foo's"
3862 echo "nlink attribute is smaller than its name entries count."
# The fail_loc is only active around the hard link and is cleared right after.
3864 #define OBD_FAIL_LFSCK_LESS_NLINK 0x1626
3865 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
3866 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
3867 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
3868 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# NOTE(review): cancel_lru_locks is defined outside this excerpt; presumably
# it drops cached client locks so the stat below is not served from cache.
3870 cancel_lru_locks mdc
3871 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
3872 [ $count -eq 1 ] || error "(4) Cannot inject error: $count"
3874 echo "Trigger namespace LFSCK to repair the nlink count"
3875 $START_NAMESPACE -r -A ||
3876 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): helper defined outside this excerpt; appears to wait for the
# namespace LFSCK to report "completed", 6 being the step tag - confirm.
3878 wait_all_targets_blocked namespace completed 6
# $SHOW_NAMESPACE is defined outside this excerpt; only its nlinks_repaired
# line is consumed here.
3880 local repaired=$($SHOW_NAMESPACE |
3881 awk '/^nlinks_repaired/ { print $2 }')
3882 [ $repaired -eq 1 ] ||
3883 error "(7) Fail to repair nlink count: $repaired"
3885 cancel_lru_locks mdc
3886 count=$(stat --format=%h $DIR/$tdir/d0/foo)
3887 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
3889 run_test 29b "LFSCK can repair bad nlink count (2)"
# Body of test 29c (function header/footer lines and the closing "fi" of the
# if-blocks below are not part of this excerpt).
# Flow:
#   1. Create guard/f0 and a directory foo on MDT index MDSCOUNT-1, and
#      remember f0's FID.
#   2. Create 150 hard links to f0 under foo.
#   3. With >= 2 MDTs, run "lfs migrate -m 1" on guard and require that f0's
#      FID is unchanged.
#   4. Unlink 100 of the links and repeat the migrate check; the FID must
#      still be unchanged.
#   5. Run namespace LFSCK; expect linkea_overflow_cleared == 1 and
#      nlinks_repaired == 0.
#   6. With >= 2 MDTs, migrate again and require that f0's FID has changed.
#   7. Remove f0 and foo.
3894 echo "The namespace LFSCK will create many hard links to the target"
3895 echo "file as to exceed the linkEA size limitation. Under such case"
3896 echo "the linkEA will be marked as overflow that will prevent the"
3897 echo "target file to be migrated. Then remove some hard links to"
3898 echo "make the left hard links to be held within the linkEA size"
3899 echo "limitation. But before the namespace LFSCK adding all the"
3900 echo "missed linkEA entries back, the overflow mark (timestamp)"
3901 echo "will not be cleared."
3904 check_mount_and_prep
3906 mkdir -p $DIR/$tdir/guard || error "(0.1) Fail to mkdir"
3907 $LFS mkdir -i $((MDSCOUNT - 1)) $DIR/$tdir/foo ||
3908 error "(0.2) Fail to mkdir"
3909 touch $DIR/$tdir/guard/f0 || error "(1) Fail to create"
# The FID recorded here is the baseline for every "was it migrated?" check.
3910 local oldfid=$($LFS path2fid $DIR/$tdir/guard/f0)
3912 # define MAX_LINKEA_SIZE 4096
3913 # sizeof(link_ea_header) = 24
3914 # sizeof(link_ea_entry) = 18
3915 # nlink_min=$(((MAX_LINKEA_SIZE - sizeof(link_ea_header)) /
3916 # (sizeof(link_ea_entry) + name_length))
3917 # If the average name length is 12 bytes, then 150 hard links
3918 # is totally enough to overflow the linkEA
3919 echo "Create 150 hard links should succeed although the linkEA overflow"
3920 createmany -l $DIR/$tdir/guard/f0 $DIR/$tdir/foo/ttttttttttt 150 ||
3921 error "(2) Fail to hard link"
3923 cancel_lru_locks mdc
# The migrate command itself must succeed; stderr is discarded. Whether f0
# actually moved is judged only by comparing FIDs.
3924 if [ $MDSCOUNT -ge 2 ]; then
3925 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
3926 error "(3.1) Migrate failure"
3928 echo "The object with linkEA overflow should NOT be migrated"
3929 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
3930 [ "$newfid" == "$oldfid" ] ||
3931 error "(3.2) Migrate should fail: $newfid != $oldfid"
3934 # Remove 100 hard links, then the linkEA should have space
3935 # to hold the missed linkEA entries.
3936 echo "Remove 100 hard links to save space for the missed linkEA entries"
3937 unlinkmany $DIR/$tdir/foo/ttttttttttt 100 || error "(4) Fail to unlink"
3939 if [ $MDSCOUNT -ge 2 ]; then
3940 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
3941 error "(5.1) Migrate failure"
3943 # The overflow timestamp is still there, so migration will fail.
3944 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
3945 [ "$newfid" == "$oldfid" ] ||
3946 error "(5.2) Migrate should fail: $newfid != $oldfid"
# NOTE(review): the sleep this comment refers to is not part of this excerpt.
3949 # sleep 3 seconds to guarantee that the overflow is recognized
3952 echo "Trigger namespace LFSCK to clear the overflow timestamp"
3953 $START_NAMESPACE -r -A ||
3954 error "(6) Fail to start LFSCK for namespace"
# NOTE(review): helper defined outside this excerpt; appears to wait for the
# namespace LFSCK to report "completed", 7 being the step tag - confirm.
3956 wait_all_targets_blocked namespace completed 7
# $SHOW_NAMESPACE is defined outside this excerpt; two counters are read from
# its output: linkea_overflow_cleared and nlinks_repaired.
3958 local repaired=$($SHOW_NAMESPACE |
3959 awk '/^linkea_overflow_cleared/ { print $2 }')
3960 [ $repaired -eq 1 ] ||
3961 error "(8) Fail to clear linkea overflow: $repaired"
3963 repaired=$($SHOW_NAMESPACE |
3964 awk '/^nlinks_repaired/ { print $2 }')
3965 [ $repaired -eq 0 ] ||
3966 error "(9) Unexpected nlink repaired: $repaired"
3968 if [ $MDSCOUNT -ge 2 ]; then
3969 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
3970 error "(10.1) Migrate failure"
3972 # Migration should succeed after clear the overflow timestamp.
3973 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
3974 [ "$newfid" != "$oldfid" ] ||
3975 error "(10.2) Migrate should succeed"
3977 ls -l $DIR/$tdir/foo > /dev/null ||
3978 error "(11) 'ls' failed after migration"
3981 rm -f $DIR/$tdir/guard/f0 || error "(12) Fail to unlink f0"
3982 rm -rf $DIR/$tdir/foo || error "(13) Fail to rmdir foo"
3984 run_test 29c "verify linkEA size limitation"
# Body of test 30 (function header/footer lines are not part of this
# excerpt). Skipped unless the $SINGLEMDS backend is ldiskfs.
# Flow:
#   1. Create foo on MDT index 0 with file f0; create directory d0 under foo
#      while fail_loc=0x161d is set, then populate d0 with f1 and d1.
#   2. With fail_loc=0x1624 set, remove d1, f1, d0 and f0.
#   3. Unmount the client, stop $SINGLEMDS, run e2fsck -y on the MDT device,
#      and restart the MDT with $MOUNT_OPTS_NOSCRUB.
#   4. Run namespace LFSCK and expect local_lost_found_moved >= 4.
#   5. Remount the client; f0 must be reachable at its original path, and a
#      "*-*-D-*" entry under .lustre/lost+found/MDT0000/ must contain d1, f1.
3987 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
3988 skip "Only support backend /lost+found for ldiskfs" && return
3991 echo "The namespace LFSCK will move the orphans from backend"
3992 echo "/lost+found directory to normal client visible namespace"
3993 echo "or to global visible ./lustre/lost+found/MDTxxxx/ directory"
3996 check_mount_and_prep
3998 $LFS mkdir -i 0 $DIR/$tdir/foo || error "(1) Fail to mkdir foo"
# The message names f0, the file that is actually created here.
3999 touch $DIR/$tdir/foo/f0 || error "(2) Fail to touch f0"
4001 echo "Inject failure stub on MDT0 to simulate the case that"
4002 echo "directory d0 has no linkEA entry, then the LFSCK will"
4003 echo "move it into .lustre/lost+found/MDTxxxx/ later."
4005 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
4006 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
4007 mkdir $DIR/$tdir/foo/d0 || error "(3) Fail to mkdir d0"
4008 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4010 touch $DIR/$tdir/foo/d0/f1 || error "(4) Fail to touch f1"
4011 mkdir $DIR/$tdir/foo/d0/d1 || error "(5) Fail to mkdir d1"
4013 echo "Inject failure stub on MDT0 to simulate the case that the"
4014 echo "object's name entry will be removed, but not destroy the"
4015 echo "object. Then backend e2fsck will handle it as orphan and"
4016 echo "add them into the backend /lost+found directory."
4018 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4019 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4020 rmdir $DIR/$tdir/foo/d0/d1 || error "(6) Fail to rmdir d1"
4021 rm -f $DIR/$tdir/foo/d0/f1 || error "(7) Fail to unlink f1"
4022 rmdir $DIR/$tdir/foo/d0 || error "(8) Fail to rmdir d0"
4023 rm -f $DIR/$tdir/foo/f0 || error "(9) Fail to unlink f0"
4024 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4026 umount_client $MOUNT || error "(10) Fail to stop client!"
4028 stop $SINGLEMDS || error "(11) Fail to stop MDT0"
4031 run_e2fsck $(facet_host $SINGLEMDS) $MDT_DEVNAME "-y" ||
4032 error "(12) Fail to run e2fsck"
4034 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
4035 error "(13) Fail to start MDT0"
4037 echo "Trigger namespace LFSCK to recover backend orphans"
4038 $START_NAMESPACE -r -A ||
4039 error "(14) Fail to start LFSCK for namespace"
# NOTE(review): helper defined outside this excerpt; appears to wait for the
# namespace LFSCK to report "completed", 15 being the step tag - confirm.
4041 wait_all_targets_blocked namespace completed 15
# $SHOW_NAMESPACE is defined outside this excerpt; only its
# local_lost_found_moved line is consumed here.
4043 local repaired=$($SHOW_NAMESPACE |
4044 awk '/^local_lost_found_moved/ { print $2 }')
4045 [ $repaired -ge 4 ] ||
4046 error "(16) Fail to recover backend orphans: $repaired"
4048 mount_client $MOUNT || error "(17) Fail to start client!"
4050 stat $DIR/$tdir/foo/f0 || error "(18) f0 is not recovered"
4052 ls -ail $MOUNT/.lustre/lost+found/
4054 echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
4055 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4056 error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
4058 ls -ail $MOUNT/.lustre/lost+found/MDT0000/
# The -name pattern is quoted so that find receives it verbatim; unquoted, the
# shell would expand it first if any file in the current directory matched.
# cname is declared local so it does not leak into the caller's scope.
4060 local cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-*-D-*")
4061 [ -n "$cname" ] || error "(20) d0 is not recovered"
# The message names d1, the entry that is actually checked here.
4063 stat ${cname}/d1 || error "(21) d1 is not recovered"
4064 stat ${cname}/f1 || error "(22) f1 is not recovered"
4066 run_test 30 "LFSCK can recover the orphans from backend /lost+found"
# Body of test 31a (function header/footer lines and the loop's closing
# "done" are not part of this excerpt). Skipped with fewer than 2 MDTs.
# Flow: create a directory striped over all MDTs starting at index 0, create
# $MDSCOUNT sub-directories d0..dN under it while fail_loc=0x1628/fail_val=0
# is set locally (no do_facet, i.e. on the node running the script), run
# namespace LFSCK, expect name_hash_repaired >= 1, remount the client, then
# stat and remove every entry and finally the striped directory.
4069 [ $MDSCOUNT -lt 2 ] &&
4070 skip "The test needs at least 2 MDTs" && return
4073 echo "For the name entry under a striped directory, if the name"
4074 echo "hash does not match the shard, then the LFSCK will repair"
4075 echo "the bad name entry"
4078 check_mount_and_prep
4080 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4081 error "(1) Fail to create striped directory"
4083 echo "Inject failure stub on client to simulate the case that"
4084 echo "some name entry should be inserted into other non-first"
4085 echo "shard, but inserted into the first shard by wrong"
4087 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
4088 $LCTL set_param fail_loc=0x1628 fail_val=0
4089 createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
4090 error "(2) Fail to create file under striped directory"
4091 $LCTL set_param fail_loc=0 fail_val=0
4093 echo "Trigger namespace LFSCK to repair bad name hash"
4094 $START_NAMESPACE -r -A ||
4095 error "(3) Fail to start LFSCK for namespace"
# NOTE(review): helper defined outside this excerpt; appears to wait for the
# namespace LFSCK to report "completed", 4 being the step tag - confirm.
4097 wait_all_targets_blocked namespace completed 4
# $SHOW_NAMESPACE is defined outside this excerpt; only its
# name_hash_repaired line is consumed here.
4099 local repaired=$($SHOW_NAMESPACE |
4100 awk '/^name_hash_repaired/ { print $2 }')
4101 [ $repaired -ge 1 ] ||
4102 error "(5) Fail to repair bad name hash: $repaired"
# Remount before verifying that each entry can be looked up and removed.
4104 umount_client $MOUNT || error "(6) umount failed"
4105 mount_client $MOUNT || error "(7) mount failed"
4107 for ((i = 0; i < $MDSCOUNT; i++)); do
4108 stat $DIR/$tdir/striped_dir/d$i ||
4109 error "(8) Fail to stat d$i after LFSCK"
4110 rmdir $DIR/$tdir/striped_dir/d$i ||
4111 error "(9) Fail to unlink d$i after LFSCK"
4114 rmdir $DIR/$tdir/striped_dir ||
4115 error "(10) Fail to remove the striped directory after LFSCK"
4117 run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
# Body of test 31b (function header/footer lines and the loop's closing
# "done" are not part of this excerpt). Skipped with fewer than 2 MDTs.
# Same scenario as the preceding bad-name-hash test, but with fail_val=1, and
# the name_hash_repaired counter is read from mds2's lfsck_namespace
# parameter instead of $SHOW_NAMESPACE.
4120 [ $MDSCOUNT -lt 2 ] &&
4121 skip "The test needs at least 2 MDTs" && return
4124 echo "For the name entry under a striped directory, if the name"
4125 echo "hash does not match the shard, then the LFSCK will repair"
4126 echo "the bad name entry"
4129 check_mount_and_prep
4131 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4132 error "(1) Fail to create striped directory"
4134 echo "Inject failure stub on client to simulate the case that"
4135 echo "some name entry should be inserted into other non-second"
4136 echo "shard, but inserted into the secod shard by wrong"
# fail_loc/fail_val are set locally (no do_facet) around the creates only.
4138 #define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628
4139 $LCTL set_param fail_loc=0x1628 fail_val=1
4140 createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
4141 error "(2) Fail to create file under striped directory"
4142 $LCTL set_param fail_loc=0 fail_val=0
4144 echo "Trigger namespace LFSCK to repair bad name hash"
4145 $START_NAMESPACE -r -A ||
4146 error "(3) Fail to start LFSCK for namespace"
# NOTE(review): helper defined outside this excerpt; appears to wait for the
# namespace LFSCK to report "completed", 4 being the step tag - confirm.
4148 wait_all_targets_blocked namespace completed 4
# The repair is expected to be accounted on mds2, so query that facet.
4150 local repaired=$(do_facet mds2 $LCTL get_param -n \
4151 mdd.$(facet_svc mds2).lfsck_namespace |
4152 awk '/^name_hash_repaired/ { print $2 }')
4153 [ $repaired -ge 1 ] ||
4154 error "(5) Fail to repair bad name hash: $repaired"
# Remount before verifying that each entry can be looked up and removed.
4156 umount_client $MOUNT || error "(6) umount failed"
4157 mount_client $MOUNT || error "(7) mount failed"
4159 for ((i = 0; i < $MDSCOUNT; i++)); do
4160 stat $DIR/$tdir/striped_dir/d$i ||
4161 error "(8) Fail to stat d$i after LFSCK"
4162 rmdir $DIR/$tdir/striped_dir/d$i ||
4163 error "(9) Fail to unlink d$i after LFSCK"
4166 rmdir $DIR/$tdir/striped_dir ||
4167 error "(10) Fail to remove the striped directory after LFSCK"
4169 run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
# Body of test 31c (function header/footer lines are not part of this
# excerpt). Skipped with fewer than 2 MDTs.
# Flow: create a striped directory while fail_loc=0x1629 is set on
# $SINGLEMDS, run namespace LFSCK, expect striped_dirs_repaired == 1, remount
# the client, check that the directory lists as empty, and remove it.
4172 [ $MDSCOUNT -lt 2 ] &&
4173 skip "The test needs at least 2 MDTs" && return
4176 echo "For some reason, the master MDT-object of the striped directory"
4177 echo "may lost its master LMV EA. If nobody created files under the"
4178 echo "master directly after the master LMV EA lost, then the LFSCK"
4179 echo "should re-generate the master LMV EA."
4182 check_mount_and_prep
4184 echo "Inject failure stub on MDT0 to simulate the case that the"
4185 echo "master MDT-object of the striped directory lost the LMV EA."
# The fail_loc is active only while the striped directory is created.
4187 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
4188 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
4189 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4190 error "(1) Fail to create striped directory"
4191 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4193 echo "Trigger namespace LFSCK to re-generate master LMV EA"
4194 $START_NAMESPACE -r -A ||
4195 error "(2) Fail to start LFSCK for namespace"
# NOTE(review): helper defined outside this excerpt; appears to wait for the
# namespace LFSCK to report "completed", 3 being the step tag - confirm.
4197 wait_all_targets_blocked namespace completed 3
# $SHOW_NAMESPACE is defined outside this excerpt; only its
# striped_dirs_repaired line is consumed here.
4199 local repaired=$($SHOW_NAMESPACE |
4200 awk '/^striped_dirs_repaired/ { print $2 }')
4201 [ $repaired -eq 1 ] ||
4202 error "(4) Fail to re-generate master LMV EA: $repaired"
4204 umount_client $MOUNT || error "(5) umount failed"
4205 mount_client $MOUNT || error "(6) mount failed"
# Any output from ls means the directory is not seen as empty, which this
# test treats as "master LMV EA not repaired".
4207 local empty=$(ls $DIR/$tdir/striped_dir/)
4208 [ -z "$empty" ] || error "(7) The master LMV EA is not repaired: $empty"
4210 rmdir $DIR/$tdir/striped_dir ||
4211 error "(8) Fail to remove the striped directory after LFSCK"
4213 run_test 31c "Re-generate the lost master LMV EA for striped directory"
# Body of test 31d (function header/footer lines are not part of this
# excerpt). Skipped with fewer than 2 MDTs.
# Flow: create a striped directory while fail_loc=0x1629 is set on
# $SINGLEMDS, remount the client, create a file "dummy" in the directory, run
# namespace LFSCK and expect striped_dirs_repaired == 0. Afterwards "dummy"
# must still be stat-able, creating a new file must fail, and the directory
# is removed after "chattr -i".
4216 [ $MDSCOUNT -lt 2 ] &&
4217 skip "The test needs at least 2 MDTs" && return
4220 echo "For some reason, the master MDT-object of the striped directory"
4221 echo "may lost its master LMV EA. If somebody created files under the"
4222 echo "master directly after the master LMV EA lost, then the LFSCK"
4223 echo "should NOT re-generate the master LMV EA, instead, it should"
4224 echo "change the broken striped dirctory as read-only to prevent"
4225 echo "further damage"
4228 check_mount_and_prep
4230 echo "Inject failure stub on MDT0 to simulate the case that the"
4231 echo "master MDT-object of the striped directory lost the LMV EA."
# The fail_loc is active only while the striped directory is created.
4233 #define OBD_FAIL_LFSCK_LOST_MASTER_LMV 0x1629
4234 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1629
4235 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4236 error "(1) Fail to create striped directory"
4237 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0
4239 umount_client $MOUNT || error "(2) umount failed"
4240 mount_client $MOUNT || error "(3) mount failed"
# Modify the directory after the injected damage, before LFSCK runs.
4242 touch $DIR/$tdir/striped_dir/dummy ||
4243 error "(4) Fail to touch under broken striped directory"
4245 echo "Trigger namespace LFSCK to find out the inconsistency"
4246 $START_NAMESPACE -r -A ||
4247 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): helper defined outside this excerpt; appears to wait for the
# namespace LFSCK to report "completed", 6 being the step tag - confirm.
4249 wait_all_targets_blocked namespace completed 6
# $SHOW_NAMESPACE is defined outside this excerpt; only its
# striped_dirs_repaired line is consumed here.
4251 local repaired=$($SHOW_NAMESPACE |
4252 awk '/^striped_dirs_repaired/ { print $2 }')
4253 [ $repaired -eq 0 ] ||
4254 error "(7) Re-generate master LMV EA unexpected: $repaired"
4256 stat $DIR/$tdir/striped_dir/dummy ||
4257 error "(8) Fail to stat $DIR/$tdir/striped_dir/dummy"
# Inverted check: the test fails if this touch succeeds.
4259 touch $DIR/$tdir/striped_dir/foo &&
4260 error "(9) The broken striped directory should be read-only"
# Clear the immutable attribute so the directory can be removed.
4262 chattr -i $DIR/$tdir/striped_dir ||
4263 error "(10) Fail to chattr on the broken striped directory"
4265 rmdir $DIR/$tdir/striped_dir ||
4266 error "(11) Fail to remove the striped directory after LFSCK"
4268 run_test 31d "Set broken striped directory (modified after broken) as read-only"
# Body of test 31e (function header/footer lines are not part of this
# excerpt). Skipped with fewer than 2 MDTs.
# Flow: create a striped directory while fail_loc=0x162a/fail_val=0 is set on
# $SINGLEMDS, run namespace LFSCK, expect striped_shards_repaired == 1 in
# $SHOW_NAMESPACE output, and remove the directory.
4271 [ $MDSCOUNT -lt 2 ] &&
4272 skip "The test needs at least 2 MDTs" && return
4275 echo "For some reason, the slave MDT-object of the striped directory"
4276 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
4277 echo "slave LMV EA."
4280 check_mount_and_prep
4282 echo "Inject failure stub on MDT0 to simulate the case that the"
4283 echo "slave MDT-object (that resides on the same MDT as the master"
4284 echo "MDT-object resides on) lost the LMV EA."
# The fail_loc is active only while the striped directory is created.
4286 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
4287 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=0
4288 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4289 error "(1) Fail to create striped directory"
4290 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4292 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
4293 $START_NAMESPACE -r -A ||
4294 error "(2) Fail to start LFSCK for namespace"
# NOTE(review): helper defined outside this excerpt; appears to wait for the
# namespace LFSCK to report "completed", 3 being the step tag - confirm.
4296 wait_all_targets_blocked namespace completed 3
# $SHOW_NAMESPACE is defined outside this excerpt; only its
# striped_shards_repaired line is consumed here.
4298 local repaired=$($SHOW_NAMESPACE |
4299 awk '/^striped_shards_repaired/ { print $2 }')
4300 [ $repaired -eq 1 ] ||
4301 error "(4) Fail to re-generate slave LMV EA: $repaired"
4303 rmdir $DIR/$tdir/striped_dir ||
4304 error "(5) Fail to remove the striped directory after LFSCK"
4306 run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
# Body of test 31f (function header/footer lines are not part of this
# excerpt). Skipped with fewer than 2 MDTs.
# Flow: create a striped directory while fail_loc=0x162a/fail_val=1 is set on
# $SINGLEMDS, run namespace LFSCK, expect striped_shards_repaired == 1 in
# mds2's lfsck_namespace parameter, and remove the directory.
4309 [ $MDSCOUNT -lt 2 ] &&
4310 skip "The test needs at least 2 MDTs" && return
4313 echo "For some reason, the slave MDT-object of the striped directory"
4314 echo "may lost its slave LMV EA. The LFSCK should re-generate the"
4315 echo "slave LMV EA."
4318 check_mount_and_prep
4320 echo "Inject failure stub on MDT0 to simulate the case that the"
4321 echo "slave MDT-object (that resides on different MDT as the master"
4322 echo "MDT-object resides on) lost the LMV EA."
# The fail_loc is active only while the striped directory is created.
4324 #define OBD_FAIL_LFSCK_LOST_SLAVE_LMV 0x162a
4325 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162a fail_val=1
4326 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4327 error "(1) Fail to create striped directory"
4328 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4330 echo "Trigger namespace LFSCK to re-generate slave LMV EA"
4331 $START_NAMESPACE -r -A ||
4332 error "(2) Fail to start LFSCK for namespace"
# NOTE(review): helper defined outside this excerpt; appears to wait for the
# namespace LFSCK to report "completed", 3 being the step tag - confirm.
4334 wait_all_targets_blocked namespace completed 3
# The repair is expected to be accounted on mds2, so query that facet.
4336 local repaired=$(do_facet mds2 $LCTL get_param -n \
4337 mdd.$(facet_svc mds2).lfsck_namespace |
4338 awk '/^striped_shards_repaired/ { print $2 }')
4339 [ $repaired -eq 1 ] ||
4340 error "(4) Fail to re-generate slave LMV EA: $repaired"
4342 rmdir $DIR/$tdir/striped_dir ||
4343 error "(5) Fail to remove the striped directory after LFSCK"
4345 run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
# Body of test 31g (function header/footer lines are not part of this
# excerpt). Skipped with fewer than 2 MDTs.
# Flow: create a striped directory while fail_loc=0x162b/fail_val=0 is set on
# $SINGLEMDS, run namespace LFSCK, expect striped_shards_repaired == 1,
# remount the client, then verify a file can be created and removed in the
# directory before removing the directory itself.
4348 [ $MDSCOUNT -lt 2 ] &&
4349 skip "The test needs at least 2 MDTs" && return
4352 echo "For some reason, the stripe index in the slave LMV EA is"
4353 echo "corrupted. The LFSCK should repair the slave LMV EA."
4356 check_mount_and_prep
4358 echo "Inject failure stub on MDT0 to simulate the case that the"
4359 echo "slave LMV EA on the first shard of the striped directory"
4360 echo "claims the same index as the second shard claims"
# The fail_loc is active only while the striped directory is created.
4362 #define OBD_FAIL_LFSCK_BAD_SLAVE_LMV 0x162b
4363 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162b fail_val=0
4364 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4365 error "(1) Fail to create striped directory"
4366 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4368 echo "Trigger namespace LFSCK to repair the slave LMV EA"
4369 $START_NAMESPACE -r -A ||
4370 error "(2) Fail to start LFSCK for namespace"
# NOTE(review): helper defined outside this excerpt; appears to wait for the
# namespace LFSCK to report "completed", 3 being the step tag - confirm.
4372 wait_all_targets_blocked namespace completed 3
# $SHOW_NAMESPACE is defined outside this excerpt; only its
# striped_shards_repaired line is consumed here.
4374 local repaired=$($SHOW_NAMESPACE |
4375 awk '/^striped_shards_repaired/ { print $2 }')
4376 [ $repaired -eq 1 ] ||
4377 error "(4) Fail to repair slave LMV EA: $repaired"
4379 umount_client $MOUNT || error "(5) umount failed"
4380 mount_client $MOUNT || error "(6) mount failed"
# Functional check after the repair: create and delete a file in the
# directory.
4382 touch $DIR/$tdir/striped_dir/foo ||
4383 error "(7) Fail to touch file after the LFSCK"
4385 rm -f $DIR/$tdir/striped_dir/foo ||
4386 error "(8) Fail to unlink file after the LFSCK"
4388 rmdir $DIR/$tdir/striped_dir ||
4389 error "(9) Fail to remove the striped directory after LFSCK"
4391 run_test 31g "Repair the corrupted slave LMV EA"
# Body of test 31h (function header/footer lines are not part of this
# excerpt). Skipped with fewer than 2 MDTs.
# Flow: create a striped directory while fail_loc=0x162c/fail_val=0 is set on
# $SINGLEMDS, run namespace LFSCK, expect dirent_repaired == 1, remount the
# client, then verify a file can be created and removed in the directory
# before removing the directory itself.
4394 [ $MDSCOUNT -lt 2 ] &&
4395 skip "The test needs at least 2 MDTs" && return
4398 echo "For some reason, the shard's name entry in the striped"
4399 echo "directory may be corrupted. The LFSCK should repair the"
4400 echo "bad shard's name entry."
4403 check_mount_and_prep
4405 echo "Inject failure stub on MDT0 to simulate the case that the"
4406 echo "first shard's name entry in the striped directory claims"
4407 echo "the same index as the second shard's name entry claims."
# The fail_loc is active only while the striped directory is created.
4409 #define OBD_FAIL_LFSCK_BAD_SLAVE_NAME 0x162c
4410 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162c fail_val=0
4411 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
4412 error "(1) Fail to create striped directory"
4413 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0 fail_val=0
4415 echo "Trigger namespace LFSCK to repair the shard's name entry"
4416 $START_NAMESPACE -r -A ||
4417 error "(2) Fail to start LFSCK for namespace"
# NOTE(review): helper defined outside this excerpt; appears to wait for the
# namespace LFSCK to report "completed", 3 being the step tag - confirm.
4419 wait_all_targets_blocked namespace completed 3
# $SHOW_NAMESPACE is defined outside this excerpt; only its dirent_repaired
# line is consumed here.
4421 local repaired=$($SHOW_NAMESPACE |
4422 awk '/^dirent_repaired/ { print $2 }')
4423 [ $repaired -eq 1 ] ||
4424 error "(4) Fail to repair shard's name entry: $repaired"
4426 umount_client $MOUNT || error "(5) umount failed"
4427 mount_client $MOUNT || error "(6) mount failed"
# Functional check after the repair: create and delete a file in the
# directory.
4429 touch $DIR/$tdir/striped_dir/foo ||
4430 error "(7) Fail to touch file after the LFSCK"
4432 rm -f $DIR/$tdir/striped_dir/foo ||
4433 error "(8) Fail to unlink file after the LFSCK"
4435 rmdir $DIR/$tdir/striped_dir ||
4436 error "(9) Fail to remove the striped directory after LFSCK"
4438 run_test 31h "Repair the corrupted shard's name entry"
# Body of test 32 (function header/footer lines, and whatever precedes step
# "(2)", are not part of this excerpt).
# Flow: unmount the client, set fail_val=3/fail_loc=0x162d on $SINGLEMDS,
# start layout LFSCK with -r and require status "scanning-phase1", stop ost1
# while the LFSCK is in that state, clear the fail_loc, and require that
# stopping the LFSCK succeeds.
4443 umount_client $MOUNT
4445 #define OBD_FAIL_LFSCK_ASSISTANT_DIRECT 0x162d
4446 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x162d
4447 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
# $SHOW_LAYOUT is defined outside this excerpt; only its status line is
# consumed here.
4449 local STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
4450 [ "$STATUS" == "scanning-phase1" ] ||
4451 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
4454 stop ost1 > /dev/null || error "(4) Fail to stop OST1!"
4456 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# The point of the test: lfsck_stop must still succeed with ost1 down.
4460 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
4462 run_test 32 "stop LFSCK when some OST failed"
# Put back the MDSSIZE/OSTSIZE/OSTCOUNT values that were stashed in the
# SAVED_* variables near the top of this script, before the script adjusted
# them for the test run (e.g. capping OSTCOUNT at 4).
4464 # restore MDS/OST size
4465 MDSSIZE=${SAVED_MDSSIZE}
4466 OSTSIZE=${SAVED_OSTSIZE}
4467 OSTCOUNT=${SAVED_OSTCOUNT}
4469 # cleanup the system at last