3 # Run select tests by setting ONLY, or as arguments to the script.
4 # Skip specific tests by setting EXCEPT.
11 # Bug numbers for the tests excluded via ALWAYS_EXCEPT
12 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT"
14 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
15 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
17 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
18 . $LUSTRE/tests/test-framework.sh
20 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
23 require_dsh_mds || exit 0
27 if ! check_versions; then
28 skip "It is NOT necessary to test lfsck under interoperation mode"
32 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.3.60) ]] &&
33 skip "Need MDS version at least 2.3.60" && exit 0
37 SAVED_MDSSIZE=${MDSSIZE}
38 SAVED_OSTSIZE=${OSTSIZE}
39 SAVED_OSTCOUNT=${OSTCOUNT}
40 # use small MDS + OST size to speed formatting time
41 # do not make MDSSIZE/OSTSIZE too small, since that affects the default journal size
44 # there is no need for many OSTs; cap the count to reduce the format/start/stop overhead
45 [ $OSTCOUNT -gt 4 ] && OSTCOUNT=4
47 # build up a clean test environment.
# Extend ALWAYS_EXCEPT (the list of test names to skip) based on the
# server versions and the MDT backend type reported by the framework.
#
# MDS version at or below 2.4.90: skip test 2c.
51 [[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.4.90) ]] &&
52 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2c"
# ost1 version older than 2.5.55: skip tests 11 through 21.
54 [[ $(lustre_version_code ost1) -lt $(version_code 2.5.55) ]] &&
55 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 11 12 13 14 15 16 17 18 19 20 21"
# MDS version older than 2.6.50: skip tests 2d, 2e, 3 and 22 through 31.
57 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.50) ]] &&
58 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 2d 2e 3 22 23 24 25 26 27 28 29 30 31"
60 # DNE does not support striped directory on zfs-based backend yet.
# Any MDT backend other than ldiskfs: skip test 31.
61 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
62 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 31"
# Shared names and command strings used by the tests below.
# The START_*/STOP_*/SHOW_* values are plain strings that the tests expand
# unquoted to run them (e.g. "$START_NAMESPACE -r"), so extra options can be
# appended at the call site.
#
# Device names of the first MDT and the first OST of the filesystem.
66 MDT_DEV="${FSNAME}-MDT0000"
67 OST_DEV="${FSNAME}-OST0000"
# Device path for $SINGLEMDS, looked up by its index (the "mds" prefix is
# stripped from the facet name before calling mdsdevname).
68 MDT_DEVNAME=$(mdsdevname ${SINGLEMDS//mds/})
# Start LFSCK of type "namespace" on MDT_DEV via $SINGLEMDS.
69 START_NAMESPACE="do_facet $SINGLEMDS \
70 $LCTL lfsck_start -M ${MDT_DEV} -t namespace"
# Start LFSCK of type "layout" on MDT_DEV via $SINGLEMDS.
71 START_LAYOUT="do_facet $SINGLEMDS \
72 $LCTL lfsck_start -M ${MDT_DEV} -t layout"
# Start LFSCK of type "layout" on OST_DEV via facet ost1.
73 START_LAYOUT_ON_OST="do_facet ost1 $LCTL lfsck_start -M ${OST_DEV} -t layout"
# Stop LFSCK on MDT_DEV.
74 STOP_LFSCK="do_facet $SINGLEMDS $LCTL lfsck_stop -M ${MDT_DEV}"
# Print the mdd.<MDT_DEV>.lfsck_namespace parameter; tests parse fields such
# as "status" and "flags" from this output with awk.
75 SHOW_NAMESPACE="do_facet $SINGLEMDS \
76 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_namespace"
# Print the mdd.<MDT_DEV>.lfsck_layout parameter.
77 SHOW_LAYOUT="do_facet $SINGLEMDS \
78 $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout"
# Print the obdfilter.<OST_DEV>.lfsck_layout parameter on facet ost1.
79 SHOW_LAYOUT_ON_OST="do_facet ost1 \
80 $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout"
# Mount option strings passed to "start" when (re)mounting the MDT:
# default, with "noscrub" added, and with "skip_lfsck" added.
81 MOUNT_OPTS_SCRUB="-o user_xattr"
82 MOUNT_OPTS_NOSCRUB="-o user_xattr,noscrub"
83 MOUNT_OPTS_SKIP_LFSCK="-o user_xattr,skip_lfsck"
92 echo "preparing... $nfiles * $ndirs files will be created $(date)."
93 if [ ! -z $igif ]; then
94 #define OBD_FAIL_FID_IGIF 0x1504
95 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1504
98 cp $LUSTRE/tests/*.sh $DIR/$tdir/
99 if [ $ndirs -gt 0 ]; then
100 createmany -d $DIR/$tdir/d $ndirs
101 createmany -m $DIR/$tdir/f $ndirs
102 if [ $nfiles -gt 0 ]; then
103 for ((i = 0; i < $ndirs; i++)); do
104 createmany -m $DIR/$tdir/d${i}/f $nfiles > \
105 /dev/null || error "createmany $nfiles"
108 createmany -d $DIR/$tdir/e $ndirs
111 if [ ! -z $igif ]; then
112 touch $DIR/$tdir/dummy
113 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
116 echo "prepared $(date)."
119 run_e2fsck_on_mdt0() {
120 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] && return
122 stop $SINGLEMDS > /dev/null || error "(0) Fail to the stop MDT0"
123 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n" |
125 run_e2fsck $(facet_active_host $SINGLEMDS) $(mdsdevname 1) "-n"
126 error "(2) Detected inconsistency on MDT0"
128 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
129 error "(3) Fail to start MDT0"
132 wait_all_targets_blocked() {
137 local count=$(do_facet mds1 \
138 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 -w |
139 awk '/^${com}_mdts_${status}/ { print \\\$2 }'")
140 [[ $count -eq $MDSCOUNT ]] || {
141 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
142 error "($err) only $count of $MDSCOUNT MDTs are in ${status}"
151 wait_update_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 |
152 awk '/^${com}_mdts_${status}/ { print \\\$2 }'" \
153 "$MDSCOUNT" $LTIME || {
154 do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
155 error "($err) some MDTs are not in ${status}"
162 #define OBD_FAIL_LFSCK_DELAY1 0x1600
163 do_facet $SINGLEMDS $LCTL set_param fail_val=3 fail_loc=0x1600
164 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
166 $SHOW_NAMESPACE || error "Fail to monitor LFSCK (3)"
168 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
169 [ "$STATUS" == "scanning-phase1" ] ||
170 error "(4) Expect 'scanning-phase1', but got '$STATUS'"
172 $STOP_LFSCK || error "(5) Fail to stop LFSCK!"
174 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
175 [ "$STATUS" == "stopped" ] ||
176 error "(6) Expect 'stopped', but got '$STATUS'"
178 $START_NAMESPACE || error "(7) Fail to start LFSCK for namespace!"
180 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
181 [ "$STATUS" == "scanning-phase1" ] ||
182 error "(8) Expect 'scanning-phase1', but got '$STATUS'"
184 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
185 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
186 mdd.${MDT_DEV}.lfsck_namespace |
187 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
189 error "(9) unexpected status"
192 local repaired=$($SHOW_NAMESPACE |
193 awk '/^updated_phase1/ { print $2 }')
194 [ $repaired -eq 0 ] ||
195 error "(10) Expect nothing to be repaired, but got: $repaired"
197 local scanned1=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
198 $START_NAMESPACE -r || error "(11) Fail to reset LFSCK!"
199 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
200 mdd.${MDT_DEV}.lfsck_namespace |
201 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
203 error "(12) unexpected status"
206 local scanned2=$($SHOW_NAMESPACE | awk '/^success_count/ { print $2 }')
207 [ $((scanned1 + 1)) -eq $scanned2 ] ||
208 error "(13) Expect success $((scanned1 + 1)), but got $scanned2"
210 echo "stopall, should NOT crash LU-3649"
211 stopall || error "(14) Fail to stopall"
213 run_test 0 "Control LFSCK manually"
216 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
217 skip "OI Scrub not implemented for ZFS" && return
221 #define OBD_FAIL_FID_INDIR 0x1501
222 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1501
223 touch $DIR/$tdir/dummy
225 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
227 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
228 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
229 mdd.${MDT_DEV}.lfsck_namespace |
230 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
232 error "(4) unexpected status"
235 local repaired=$($SHOW_NAMESPACE |
236 awk '/^dirent_repaired/ { print $2 }')
237 # for interop with old server
238 [ -z "$repaired" ] &&
239 repaired=$($SHOW_NAMESPACE |
240 awk '/^updated_phase1/ { print $2 }')
242 [ $repaired -eq 1 ] ||
243 error "(5) Fail to repair crashed FID-in-dirent: $repaired"
247 mount_client $MOUNT || error "(6) Fail to start client!"
249 #define OBD_FAIL_FID_LOOKUP 0x1505
250 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
251 ls $DIR/$tdir/ > /dev/null || error "(7) no FID-in-dirent."
253 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
255 run_test 1a "LFSCK can find out and repair crashed FID-in-dirent"
259 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
260 skip "OI Scrub not implemented for ZFS" && return
264 #define OBD_FAIL_FID_INLMA 0x1502
265 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1502
266 touch $DIR/$tdir/dummy
268 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
270 #define OBD_FAIL_FID_NOLMA 0x1506
271 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1506
272 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
273 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
274 mdd.${MDT_DEV}.lfsck_namespace |
275 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
277 error "(4) unexpected status"
280 local repaired=$($SHOW_NAMESPACE |
281 awk '/^dirent_repaired/ { print $2 }')
282 # for interop with old server
283 [ -z "$repaired" ] &&
284 repaired=$($SHOW_NAMESPACE |
285 awk '/^updated_phase1/ { print $2 }')
287 [ $repaired -eq 1 ] ||
288 error "(5) Fail to repair the missing FID-in-LMA: $repaired"
290 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
293 mount_client $MOUNT || error "(6) Fail to start client!"
295 #define OBD_FAIL_FID_LOOKUP 0x1505
296 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
297 stat $DIR/$tdir/dummy > /dev/null || error "(7) no FID-in-LMA."
299 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
301 run_test 1b "LFSCK can find out and repair the missing FID-in-LMA"
306 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
307 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
308 touch $DIR/$tdir/dummy
310 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
312 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
313 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
314 mdd.${MDT_DEV}.lfsck_namespace |
315 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
317 error "(4) unexpected status"
320 local repaired=$($SHOW_NAMESPACE |
321 awk '/^linkea_repaired/ { print $2 }')
322 # for interop with old server
323 [ -z "$repaired" ] &&
324 repaired=$($SHOW_NAMESPACE |
325 awk '/^updated_phase2/ { print $2 }')
327 [ $repaired -eq 1 ] ||
328 error "(5) Fail to repair crashed linkEA: $repaired"
332 mount_client $MOUNT || error "(6) Fail to start client!"
334 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
335 error "(7) Fail to stat $DIR/$tdir/dummy"
337 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
338 local dummyname=$($LFS fid2path $DIR $dummyfid)
339 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
340 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
342 run_test 2a "LFSCK can find out and repair crashed linkEA entry"
348 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
349 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
350 touch $DIR/$tdir/dummy
352 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
354 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
355 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
356 mdd.${MDT_DEV}.lfsck_namespace |
357 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
359 error "(4) unexpected status"
362 local repaired=$($SHOW_NAMESPACE |
363 awk '/^updated_phase2/ { print $2 }')
364 [ $repaired -eq 1 ] ||
365 error "(5) Fail to repair crashed linkEA: $repaired"
369 mount_client $MOUNT || error "(6) Fail to start client!"
371 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
372 error "(7) Fail to stat $DIR/$tdir/dummy"
374 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
375 local dummyname=$($LFS fid2path $DIR $dummyfid)
376 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
377 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
379 run_test 2b "LFSCK can find out and remove invalid linkEA entry"
385 #define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
386 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1605
387 touch $DIR/$tdir/dummy
389 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
391 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
392 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
393 mdd.${MDT_DEV}.lfsck_namespace |
394 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
396 error "(4) unexpected status"
399 local repaired=$($SHOW_NAMESPACE |
400 awk '/^updated_phase2/ { print $2 }')
401 [ $repaired -eq 1 ] ||
402 error "(5) Fail to repair crashed linkEA: $repaired"
406 mount_client $MOUNT || error "(6) Fail to start client!"
408 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
409 error "(7) Fail to stat $DIR/$tdir/dummy"
411 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
412 local dummyname=$($LFS fid2path $DIR $dummyfid)
413 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
414 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
416 run_test 2c "LFSCK can find out and remove repeated linkEA entry"
422 #define OBD_FAIL_LFSCK_NO_LINKEA 0x161d
423 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
424 touch $DIR/$tdir/dummy
426 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
428 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
429 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
430 mdd.${MDT_DEV}.lfsck_namespace |
431 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
433 error "(4) unexpected status"
436 local repaired=$($SHOW_NAMESPACE |
437 awk '/^linkea_repaired/ { print $2 }')
438 [ $repaired -eq 1 ] ||
439 error "(5) Fail to repair crashed linkEA: $repaired"
443 mount_client $MOUNT || error "(6) Fail to start client!"
445 stat $DIR/$tdir/dummy | grep "Links: 1" > /dev/null ||
446 error "(7) Fail to stat $DIR/$tdir/dummy"
448 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
449 local dummyname=$($LFS fid2path $DIR $dummyfid)
450 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
451 error "(8) Fail to repair linkEA: $dummyfid $dummyname"
453 run_test 2d "LFSCK can recover the missing linkEA entry"
457 [ $MDSCOUNT -lt 2 ] &&
458 skip "We need at least 2 MDSes for this test" && return
462 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT1"
464 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
465 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
466 $LFS mkdir -i 0 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT0"
467 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
469 $START_NAMESPACE -r -A || error "(3) Fail to start LFSCK for namespace!"
471 wait_all_targets_blocked namespace completed 4
473 local repaired=$($SHOW_NAMESPACE |
474 awk '/^linkea_repaired/ { print $2 }')
475 [ $repaired -eq 1 ] ||
476 error "(5) Fail to repair crashed linkEA: $repaired"
478 local fid=$($LFS path2fid $DIR/$tdir/d0/d1)
479 local name=$($LFS fid2path $DIR $fid)
480 [ "$name" == "$DIR/$tdir/d0/d1" ] ||
481 error "(6) Fail to repair linkEA: $fid $name"
483 run_test 2e "namespace LFSCK can verify remote object linkEA"
489 mkdir $DIR/$tdir/dummy || error "(1) Fail to mkdir"
490 ln $DIR/$tdir/d0/f0 $DIR/$tdir/dummy/f0 || error "(2) Fail to hardlink"
491 ln $DIR/$tdir/d0/f1 $DIR/$tdir/dummy/f1 || error "(3) Fail to hardlink"
493 $LFS mkdir -i 0 $DIR/$tdir/edir || error "(4) Fail to mkdir"
494 touch $DIR/$tdir/edir/f0 || error "(5) Fail to touch"
495 touch $DIR/$tdir/edir/f1 || error "(6) Fail to touch"
497 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
498 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
499 ln $DIR/$tdir/edir/f0 $DIR/$tdir/edir/w0 || error "(7) Fail to hardlink"
501 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
502 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
503 ln $DIR/$tdir/edir/f1 $DIR/$tdir/edir/w1 || error "(8) Fail to hardlink"
505 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
507 $START_NAMESPACE -r || error "(9) Fail to start LFSCK for namespace!"
508 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
509 mdd.${MDT_DEV}.lfsck_namespace |
510 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
512 error "(10) unexpected status"
515 local checked=$($SHOW_NAMESPACE |
516 awk '/^checked_phase2/ { print $2 }')
517 [ $checked -ge 4 ] ||
518 error "(11) Fail to check multiple-linked object: $checked"
520 local repaired=$($SHOW_NAMESPACE |
521 awk '/^multiple_linked_repaired/ { print $2 }')
522 [ $repaired -ge 2 ] ||
523 error "(12) Fail to repair multiple-linked object: $repaired"
525 run_test 3 "LFSCK can verify multiple-linked objects"
529 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
530 skip "OI Scrub not implemented for ZFS" && return
533 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
534 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
536 mds_backup_restore $SINGLEMDS || error "(1) Fail to backup/restore!"
537 echo "start $SINGLEMDS with disabling OI scrub"
538 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
539 error "(2) Fail to start MDS!"
541 #define OBD_FAIL_LFSCK_DELAY2 0x1601
542 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
543 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
544 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
545 mdd.${MDT_DEV}.lfsck_namespace |
546 awk '/^flags/ { print \\\$2 }'" "inconsistent" 32 || {
548 error "(5) unexpected status"
551 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
552 [ "$STATUS" == "scanning-phase1" ] ||
553 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
555 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
556 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
557 mdd.${MDT_DEV}.lfsck_namespace |
558 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
560 error "(7) unexpected status"
563 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
564 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
566 local repaired=$($SHOW_NAMESPACE |
567 awk '/^dirent_repaired/ { print $2 }')
568 # for interop with old server
569 [ -z "$repaired" ] &&
570 repaired=$($SHOW_NAMESPACE |
571 awk '/^updated_phase1/ { print $2 }')
573 [ $repaired -ge 9 ] ||
574 error "(9) Fail to re-generate FID-in-dirent: $repaired"
578 mount_client $MOUNT || error "(10) Fail to start client!"
580 #define OBD_FAIL_FID_LOOKUP 0x1505
581 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
582 ls $DIR/$tdir/ > /dev/null || error "(11) no FID-in-dirent."
583 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
585 run_test 4 "FID-in-dirent can be rebuilt after MDT file-level backup/restore"
589 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
590 skip "OI Scrub not implemented for ZFS" && return
593 cleanup_mount $MOUNT || error "(0.1) Fail to stop client!"
594 stop $SINGLEMDS > /dev/null || error "(0.2) Fail to stop MDS!"
596 mds_backup_restore $SINGLEMDS 1 || error "(1) Fail to backup/restore!"
597 echo "start $SINGLEMDS with disabling OI scrub"
598 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
599 error "(2) Fail to start MDS!"
601 #define OBD_FAIL_LFSCK_DELAY2 0x1601
602 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
603 $START_NAMESPACE -r || error "(4) Fail to start LFSCK for namespace!"
604 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
605 mdd.${MDT_DEV}.lfsck_namespace |
606 awk '/^flags/ { print \\\$2 }'" "inconsistent,upgrade" 32 || {
608 error "(5) unexpected status"
611 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
612 [ "$STATUS" == "scanning-phase1" ] ||
613 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
615 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
616 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
617 mdd.${MDT_DEV}.lfsck_namespace |
618 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
620 error "(7) unexpected status"
623 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
624 [ -z "$FLAGS" ] || error "(8) Expect empty flags, but got '$FLAGS'"
626 local repaired=$($SHOW_NAMESPACE |
627 awk '/^dirent_repaired/ { print $2 }')
628 # for interop with old server
629 [ -z "$repaired" ] &&
630 repaired=$($SHOW_NAMESPACE |
631 awk '/^updated_phase1/ { print $2 }')
633 [ $repaired -ge 2 ] ||
634 error "(9) Fail to generate FID-in-dirent for IGIF: $repaired"
638 mount_client $MOUNT || error "(10) Fail to start client!"
640 #define OBD_FAIL_FID_LOOKUP 0x1505
641 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1505
642 stat $DIR/$tdir/dummy > /dev/null || error "(11) no FID-in-LMA."
644 ls $DIR/$tdir/ > /dev/null || error "(12) no FID-in-dirent."
646 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
647 local dummyfid=$($LFS path2fid $DIR/$tdir/dummy)
648 local dummyname=$($LFS fid2path $DIR $dummyfid)
649 [ "$dummyname" == "$DIR/$tdir/dummy" ] ||
650 error "(13) Fail to generate linkEA: $dummyfid $dummyname"
652 run_test 5 "LFSCK can handle IGIF object upgrading"
657 #define OBD_FAIL_LFSCK_DELAY1 0x1600
658 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
659 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
661 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
662 [ "$STATUS" == "scanning-phase1" ] ||
663 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
665 # Sleep 3 sec to guarantee that at least one object is processed by LFSCK
667 # Fail the LFSCK to guarantee there is at least one checkpoint
668 #define OBD_FAIL_LFSCK_FATAL1 0x1608
669 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001608
670 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
671 mdd.${MDT_DEV}.lfsck_namespace |
672 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
674 error "(4) unexpected status"
677 local POS0=$($SHOW_NAMESPACE |
678 awk '/^last_checkpoint_position/ { print $2 }' |
681 #define OBD_FAIL_LFSCK_DELAY1 0x1600
682 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1600
683 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
685 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
686 [ "$STATUS" == "scanning-phase1" ] ||
687 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
689 local POS1=$($SHOW_NAMESPACE |
690 awk '/^latest_start_position/ { print $2 }' |
692 [[ $POS0 -lt $POS1 ]] ||
693 error "(7) Expect larger than: $POS0, but got $POS1"
695 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
696 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
697 mdd.${MDT_DEV}.lfsck_namespace |
698 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
700 error "(8) unexpected status"
703 run_test 6a "LFSCK resumes from last checkpoint (1)"
708 #define OBD_FAIL_LFSCK_DELAY2 0x1601
709 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
710 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
712 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
713 [ "$STATUS" == "scanning-phase1" ] ||
714 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
716 # Sleep 5 sec to guarantee that we are in the directory scanning
718 # Fail the LFSCK to guarantee there is at least one checkpoint
719 #define OBD_FAIL_LFSCK_FATAL2 0x1609
720 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
721 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
722 mdd.${MDT_DEV}.lfsck_namespace |
723 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
725 error "(4) unexpected status"
728 local O_POS0=$($SHOW_NAMESPACE |
729 awk '/^last_checkpoint_position/ { print $2 }' |
732 local D_POS0=$($SHOW_NAMESPACE |
733 awk '/^last_checkpoint_position/ { print $4 }')
735 #define OBD_FAIL_LFSCK_DELAY2 0x1601
736 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
737 $START_NAMESPACE || error "(5) Fail to start LFSCK for namespace!"
739 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
740 [ "$STATUS" == "scanning-phase1" ] ||
741 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
743 local O_POS1=$($SHOW_NAMESPACE |
744 awk '/^latest_start_position/ { print $2 }' |
746 local D_POS1=$($SHOW_NAMESPACE |
747 awk '/^latest_start_position/ { print $4 }')
749 if [ "$D_POS0" == "N/A" -o "$D_POS1" == "N/A" ]; then
750 [[ $O_POS0 -lt $O_POS1 ]] ||
751 error "(7.1) $O_POS1 is not larger than $O_POS0"
753 [[ $D_POS0 -lt $D_POS1 ]] ||
754 error "(7.2) $D_POS1 is not larger than $D_POS0"
757 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
758 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
759 mdd.${MDT_DEV}.lfsck_namespace |
760 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
762 error "(8) unexpected status"
765 run_test 6b "LFSCK resumes from last checkpoint (2)"
772 #define OBD_FAIL_LFSCK_DELAY2 0x1601
773 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1601
774 $START_NAMESPACE -r || error "(2) Fail to start LFSCK for namespace!"
776 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
777 [ "$STATUS" == "scanning-phase1" ] ||
778 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
780 # Sleep 3 sec to guarantee that at least one object is processed by LFSCK
782 echo "stop $SINGLEMDS"
783 stop $SINGLEMDS > /dev/null || error "(4) Fail to stop MDS!"
785 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
786 echo "start $SINGLEMDS"
787 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
788 error "(5) Fail to start MDS!"
790 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
791 mdd.${MDT_DEV}.lfsck_namespace |
792 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
794 error "(6) unexpected status"
797 run_test 7a "non-stopped LFSCK should auto restarts after MDS remount (1)"
803 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
804 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
805 for ((i = 0; i < 20; i++)); do
806 touch $DIR/$tdir/dummy${i}
809 #define OBD_FAIL_LFSCK_DELAY3 0x1602
810 do_facet $SINGLEMDS $LCTL set_param fail_val=1 fail_loc=0x1602
811 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace!"
812 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
813 mdd.${MDT_DEV}.lfsck_namespace |
814 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
816 error "(4) unexpected status"
820 echo "stop $SINGLEMDS"
821 stop $SINGLEMDS > /dev/null || error "(5) Fail to stop MDS!"
823 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
824 echo "start $SINGLEMDS"
825 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
826 error "(6) Fail to start MDS!"
828 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
829 mdd.${MDT_DEV}.lfsck_namespace |
830 awk '/^status/ { print \\\$2 }'" "completed" 30 || {
832 error "(7) unexpected status"
835 run_test 7b "non-stopped LFSCK should auto restarts after MDS remount (2)"
840 formatall > /dev/null
846 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
847 [ "$STATUS" == "init" ] ||
848 error "(2) Expect 'init', but got '$STATUS'"
850 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
851 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
852 mkdir $DIR/$tdir/crashed
854 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
855 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
856 for ((i = 0; i < 5; i++)); do
857 touch $DIR/$tdir/dummy${i}
860 umount_client $MOUNT || error "(3) Fail to stop client!"
862 #define OBD_FAIL_LFSCK_DELAY2 0x1601
863 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1601
864 $START_NAMESPACE || error "(4) Fail to start LFSCK for namespace!"
866 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
867 [ "$STATUS" == "scanning-phase1" ] ||
868 error "(5) Expect 'scanning-phase1', but got '$STATUS'"
870 $STOP_LFSCK || error "(6) Fail to stop LFSCK!"
872 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
873 [ "$STATUS" == "stopped" ] ||
874 error "(7) Expect 'stopped', but got '$STATUS'"
876 $START_NAMESPACE || error "(8) Fail to start LFSCK for namespace!"
878 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
879 [ "$STATUS" == "scanning-phase1" ] ||
880 error "(9) Expect 'scanning-phase1', but got '$STATUS'"
882 #define OBD_FAIL_LFSCK_FATAL2 0x1609
883 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x80001609
884 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
885 mdd.${MDT_DEV}.lfsck_namespace |
886 awk '/^status/ { print \\\$2 }'" "failed" 32 || {
888 error "(10) unexpected status"
891 #define OBD_FAIL_LFSCK_DELAY1 0x1600
892 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1600
893 $START_NAMESPACE || error "(11) Fail to start LFSCK for namespace!"
895 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
896 [ "$STATUS" == "scanning-phase1" ] ||
897 error "(12) Expect 'scanning-phase1', but got '$STATUS'"
899 #define OBD_FAIL_LFSCK_CRASH 0x160a
900 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160a
903 echo "stop $SINGLEMDS"
904 stop $SINGLEMDS > /dev/null || error "(13) Fail to stop MDS!"
906 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
907 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
909 echo "start $SINGLEMDS"
910 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
911 error "(14) Fail to start MDS!"
913 local timeout=$(max_recovery_time)
916 while [ $timer -lt $timeout ]; do
917 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
918 mdt.${MDT_DEV}.recovery_status |
919 awk '/^status/ { print \\\$2 }'")
920 [ "$STATUS" != "RECOVERING" ] && break;
925 [ $timer != $timeout ] ||
926 error "(14.1) recovery timeout"
928 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
929 [ "$STATUS" == "crashed" ] ||
930 error "(15) Expect 'crashed', but got '$STATUS'"
932 #define OBD_FAIL_LFSCK_DELAY2 0x1601
933 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1601
934 $START_NAMESPACE || error "(16) Fail to start LFSCK for namespace!"
936 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
937 [ "$STATUS" == "scanning-phase1" ] ||
938 error "(17) Expect 'scanning-phase1', but got '$STATUS'"
940 echo "stop $SINGLEMDS"
941 stop $SINGLEMDS > /dev/null || error "(18) Fail to stop MDS!"
943 #define OBD_FAIL_LFSCK_NO_AUTO 0x160b
944 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160b
946 echo "start $SINGLEMDS"
947 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SCRUB > /dev/null ||
948 error "(19) Fail to start MDS!"
951 while [ $timer -lt $timeout ]; do
952 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
953 mdt.${MDT_DEV}.recovery_status |
954 awk '/^status/ { print \\\$2 }'")
955 [ "$STATUS" != "RECOVERING" ] && break;
960 [ $timer != $timeout ] ||
961 error "(19.1) recovery timeout"
963 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
964 [ "$STATUS" == "paused" ] ||
965 error "(20) Expect 'paused', but got '$STATUS'"
967 echo "stop $SINGLEMDS"
968 stop $SINGLEMDS > /dev/null || error "(20.1) Fail to stop MDS!"
970 echo "start $SINGLEMDS without resume LFSCK"
971 start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_SKIP_LFSCK > /dev/null ||
972 error "(20.2) Fail to start MDS!"
975 while [ $timer -lt $timeout ]; do
976 STATUS=$(do_facet $SINGLEMDS "$LCTL get_param -n \
977 mdt.${MDT_DEV}.recovery_status |
978 awk '/^status/ { print \\\$2 }'")
979 [ "$STATUS" != "RECOVERING" ] && break;
984 [ $timer != $timeout ] ||
985 error "(20.3) recovery timeout"
987 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
988 [ "$STATUS" == "paused" ] ||
989 error "(20.4) Expect 'paused', but got '$STATUS'"
991 #define OBD_FAIL_LFSCK_DELAY3 0x1602
992 do_facet $SINGLEMDS $LCTL set_param fail_val=2 fail_loc=0x1602
994 $START_NAMESPACE || error "(21) Fail to start LFSCK for namespace!"
995 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
996 mdd.${MDT_DEV}.lfsck_namespace |
997 awk '/^status/ { print \\\$2 }'" "scanning-phase2" 32 || {
999 error "(22) unexpected status"
1002 local FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1003 [ "$FLAGS" == "scanned-once,inconsistent" ] ||
1004 error "(23) Expect 'scanned-once,inconsistent',but got '$FLAGS'"
1006 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
1007 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1008 mdd.${MDT_DEV}.lfsck_namespace |
1009 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1011 error "(24) unexpected status"
1014 FLAGS=$($SHOW_NAMESPACE | awk '/^flags/ { print $2 }')
1015 [ -z "$FLAGS" ] || error "(25) Expect empty flags, but got '$FLAGS'"
1017 run_test 8 "LFSCK state machine"
1020 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1021 skip "Testing on UP system, the speed may be inaccurate."
1025 [[ $server_version -ge $(version_code 2.7.50) ]] ||
1026 { skip "Need MDS version >= 2.7.50"; return; }
1028 check_mount_and_prep
1029 $LFS mkdir -i 0 $DIR/$tdir/lfsck || error "(1) Fail to mkdir lfsck"
1030 $LFS setstripe -c 1 -i -1 $DIR/$tdir/lfsck
1031 createmany -o $DIR/$tdir/lfsck/f 5000
1033 local BASE_SPEED1=100
1035 $START_LAYOUT -r -s $BASE_SPEED1 || error "(2) Fail to start LFSCK!"
1038 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
1039 [ "$STATUS" == "scanning-phase1" ] ||
1040 error "(3) Expect 'scanning-phase1', but got '$STATUS'"
1042 local SPEED=$($SHOW_LAYOUT |
1043 awk '/^average_speed_phase1/ { print $2 }')
1045 # There may be a timing error; normally it should be less than 2 seconds.
1046 # We allow another 20% schedule error.
1048 # MAX_MARGIN = 1.2 = 12 / 10
1049 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1050 RUN_TIME1 * 12 / 10))
1051 [ $SPEED -lt $MAX_SPEED ] ||
1052 error "(4) Got speed $SPEED, expected less than $MAX_SPEED"
1054 # adjust speed limit
1055 local BASE_SPEED2=300
1057 do_facet $SINGLEMDS \
1058 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1061 SPEED=$($SHOW_LAYOUT | awk '/^average_speed_phase1/ { print $2 }')
1062 # MIN_MARGIN = 0.8 = 8 / 10
1063 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1064 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1065 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
1066 [ $SPEED -gt $MIN_SPEED ] || {
1067 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1068 error_ignore LU-5624 \
1069 "(5.1) Got speed $SPEED, expected more than $MIN_SPEED"
1072 "(5.2) Got speed $SPEED, expected more than $MIN_SPEED"
1076 # MAX_MARGIN = 1.2 = 12 / 10
1077 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1078 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1079 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
1080 [ $SPEED -lt $MAX_SPEED ] ||
1081 error "(6) Got speed $SPEED, expected less than $MAX_SPEED"
1083 do_facet $SINGLEMDS \
1084 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1086 wait_update_facet $SINGLEMDS \
1087 "$LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout |
1088 awk '/^status/ { print \\\$2 }'" "completed" 30 ||
1089 error "(7) Failed to get expected 'completed'"
1091 run_test 9a "LFSCK speed control (1)"
# test_9b body: same speed-limit check as the layout variant, but against
# the second phase (average_speed_phase2) of the namespace LFSCK.
# NOTE(review): server_version, RUN_TIME1, RUN_TIME2, TIME_DIFF and
# SHOW_NAMESPACE are not assigned in the visible lines; confirm them.
# Skip when /proc/cpuinfo shows no "processor : 1" entry (single-CPU node).
1094 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
1095 skip "Testing on UP system, the speed may be inaccurate."
1099 [[ $server_version -ge $(version_code 2.7.50) ]] ||
1100 { skip "Need MDS version >= 2.7.50"; return; }
# Create 50 directories with 50 files each (plus 50 top-level files) while
# fail_loc 0x1604 is set on the MDS.
1104 echo "Preparing another 50 * 50 files (with error) at $(date)."
1105 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1106 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1107 createmany -d $DIR/$tdir/d 50
1108 createmany -m $DIR/$tdir/f 50
1109 for ((i = 0; i < 50; i++)); do
1110 createmany -m $DIR/$tdir/d${i}/f 50 > /dev/null
# Run one scan with fail_loc 0x160c and wait for it to report 'stopped'.
1113 #define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
1114 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160c
1115 $START_NAMESPACE -r || error "(4) Fail to start LFSCK!"
1116 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1117 mdd.${MDT_DEV}.lfsck_namespace |
1118 awk '/^status/ { print \\\$2 }'" "stopped" 10 || {
1120 error "(5) unexpected status"
1123 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1124 echo "Prepared at $(date)."
1126 local BASE_SPEED1=50
# Restart without '-r' at BASE_SPEED1; the scan is expected to be in
# scanning-phase2 when sampled.
1128 $START_NAMESPACE -s $BASE_SPEED1 || error "(6) Fail to start LFSCK!"
1131 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1132 [ "$STATUS" == "scanning-phase2" ] ||
1133 error "(7) Expect 'scanning-phase2', but got '$STATUS'"
1135 local SPEED=$($SHOW_NAMESPACE |
1136 awk '/^average_speed_phase2/ { print $2 }')
1137 # There may be time error, normally it should be less than 2 seconds.
1138 # We allow another 20% schedule error.
1140 # MAX_MARGIN = 1.2 = 12 / 10
# Upper bound, in integer arithmetic:
#   BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / RUN_TIME1 * 12 / 10
1141 local MAX_SPEED=$((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) / \
1142 RUN_TIME1 * 12 / 10))
1143 [ $SPEED -lt $MAX_SPEED ] ||
1144 error "(8) Got speed $SPEED, expected less than $MAX_SPEED"
1146 # adjust speed limit
1147 local BASE_SPEED2=150
# Raise the limit on the running scan via set_param.
1149 do_facet $SINGLEMDS \
1150 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit $BASE_SPEED2
1153 SPEED=$($SHOW_NAMESPACE | awk '/^average_speed_phase2/ { print $2 }')
1154 # MIN_MARGIN = 0.8 = 8 / 10
# Lower bound: time-weighted mean of both limits, each run time shortened
# by TIME_DIFF, then scaled by 8 / 10.
1155 local MIN_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 - TIME_DIFF) + \
1156 BASE_SPEED2 * (RUN_TIME2 - TIME_DIFF)) / \
1157 (RUN_TIME1 + RUN_TIME2) * 8 / 10))
# A too-slow scan is only reported through error_ignore (LU-5624) when the
# MDS backend is not ldiskfs.
# NOTE(review): the command that owns the "(9.2)" message is not visible.
1158 [ $SPEED -gt $MIN_SPEED ] || {
1159 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
1160 error_ignore LU-5624 \
1161 "(9.1) Got speed $SPEED, expected more than $MIN_SPEED"
1164 "(9.2) Got speed $SPEED, expected more than $MIN_SPEED"
1168 # MAX_MARGIN = 1.2 = 12 / 10
# Upper bound for the combined run, same weighting with TIME_DIFF added.
1169 MAX_SPEED=$(((BASE_SPEED1 * (RUN_TIME1 + TIME_DIFF) + \
1170 BASE_SPEED2 * (RUN_TIME2 + TIME_DIFF)) / \
1171 (RUN_TIME1 + RUN_TIME2) * 12 / 10))
1172 [ $SPEED -lt $MAX_SPEED ] ||
1173 error "(10) Got speed $SPEED, expected less than $MAX_SPEED"
# Set the limit to 0 and wait (up to 32) for 'completed'.
1175 do_facet $SINGLEMDS \
1176 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1177 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1178 mdd.${MDT_DEV}.lfsck_namespace |
1179 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1181 error "(11) unexpected status"
1184 run_test 9b "LFSCK speed control (2)"
# test_10 body: run ordinary file operations from the client while a
# rate-limited namespace LFSCK is still in scanning-phase1, and require
# every one of them to succeed.
# Only run on an ldiskfs MDS backend.
1188 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
1189 skip "lookup(..)/linkea on ZFS issue" && return
# Even indices 0..998 are created with fail_loc 0x1603 set, odd indices
# 1..999 with fail_loc 0x1604 set.
1193 echo "Preparing more files with error at $(date)."
1194 #define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
1195 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1603
1197 for ((i = 0; i < 1000; i = $((i+2)))); do
1198 mkdir -p $DIR/$tdir/d${i}
1199 touch $DIR/$tdir/f${i}
1200 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1203 #define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
1204 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1604
1206 for ((i = 1; i < 1000; i = $((i+2)))); do
1207 mkdir -p $DIR/$tdir/d${i}
1208 touch $DIR/$tdir/f${i}
1209 createmany -m $DIR/$tdir/d${i}/f 5 > /dev/null
1212 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1213 echo "Prepared at $(date)."
# Add one hard link after the fault injection has been cleared.
1215 ln $DIR/$tdir/f200 $DIR/$tdir/d200/dummy
# Remount the client before starting the scan.
1217 umount_client $MOUNT
1218 mount_client $MOUNT || error "(3) Fail to start client!"
# Start the scan capped at 100 so it is still running during the operations.
1220 $START_NAMESPACE -r -s 100 || error "(5) Fail to start LFSCK!"
1223 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1224 [ "$STATUS" == "scanning-phase1" ] ||
1225 error "(6) Expect 'scanning-phase1', but got '$STATUS'"
# Operations exercised during the scan: recursive ls, create, mkdir, unlink,
# recursive remove, rename, hard link and symlink.
1227 ls -ailR $MOUNT > /dev/null || error "(7) Fail to ls!"
1229 touch $DIR/$tdir/d198/a0 || error "(8) Fail to touch!"
1231 mkdir $DIR/$tdir/d199/a1 || error "(9) Fail to mkdir!"
1233 unlink $DIR/$tdir/f200 || error "(10) Fail to unlink!"
1235 rm -rf $DIR/$tdir/d201 || error "(11) Fail to rmdir!"
1237 mv $DIR/$tdir/f202 $DIR/$tdir/d203/ || error "(12) Fail to rename!"
1239 ln $DIR/$tdir/f204 $DIR/$tdir/d205/a3 || error "(13) Fail to hardlink!"
1241 ln -s $DIR/$tdir/d206 $DIR/$tdir/d207/a4 ||
1242 error "(14) Fail to softlink!"
# The scan must still be in phase 1 after the operations.
1244 STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
1245 [ "$STATUS" == "scanning-phase1" ] ||
1246 error "(15) Expect 'scanning-phase1', but got '$STATUS'"
# Set the limit to 0 and wait (up to 32) for 'completed'.
1248 do_facet $SINGLEMDS \
1249 $LCTL set_param -n mdd.${MDT_DEV}.lfsck_speed_limit 0
1250 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1251 mdd.${MDT_DEV}.lfsck_namespace |
1252 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1254 error "(16) unexpected status"
1257 run_test 10 "System is available during LFSCK scanning"
# ost_remove_lastid: mount an OST backend locally, delete its LAST_ID file
# and the d0/0 entry under O/<idx>, then unmount it again.
# Returns 1 if the local mount fails and 2 if the unmount fails.
# NOTE(review): the assignments of 'ost' and 'idx' are not visible here;
# the caller passes two positional arguments ("1 0"), so presumably
# ost=$1 and idx=$2 — confirm.
1260 ost_remove_lastid() {
1263 local rcmd="do_facet ost${ost}"
1265 echo "remove LAST_ID on ost${ost}: idx=${idx}"
1267 # step 1: local mount
1268 mount_fstype ost${ost} || return 1
1269 # step 2: remove the specified LAST_ID
1270 ${rcmd} rm -fv $(facet_mntpt ost${ost})/O/${idx}/{LAST_ID,d0/0}
1272 unmount_fstype ost${ost} || return 2
# test_11a body: remove LAST_ID from ost1, then check that a layout LFSCK
# started on the OST first reports the 'crashed_lastid' flag and finally
# completes with no flags left.
# NOTE(review): the step that stops ost1 before the removal, and the
# definition of SHOW_LAYOUT_ON_OST, are not visible in these lines.
1276 check_mount_and_prep
1277 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1278 createmany -o $DIR/$tdir/f 64 || error "(0) Fail to create 64 files."
1283 ost_remove_lastid 1 0 || error "(1) Fail to remove LAST_ID"
# Bring ost1 back with the MOUNT_OPTS_NOSCRUB mount options.
1285 start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB > /dev/null ||
1286 error "(2) Fail to start ost1"
# fail_loc 0x160e with fail_val=3 is set while the flag is polled.
1288 #define OBD_FAIL_LFSCK_DELAY4 0x160e
1289 do_facet ost1 $LCTL set_param fail_val=3 fail_loc=0x160e
1291 echo "trigger LFSCK for layout on ost1 to rebuild the LAST_ID(s)"
1292 $START_LAYOUT_ON_OST -r || error "(4) Fail to start LFSCK on OST!"
1294 wait_update_facet ost1 "$LCTL get_param -n \
1295 obdfilter.${OST_DEV}.lfsck_layout |
1296 awk '/^flags/ { print \\\$2 }'" "crashed_lastid" 60 || {
1298 error "(5) unexpected status"
# Clear the fault injection and wait for the scan to complete.
1301 do_facet ost1 $LCTL set_param fail_val=0 fail_loc=0
1303 wait_update_facet ost1 "$LCTL get_param -n \
1304 obdfilter.${OST_DEV}.lfsck_layout |
1305 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1307 error "(6) unexpected status"
1310 echo "the LAST_ID(s) should have been rebuilt"
1311 FLAGS=$($SHOW_LAYOUT_ON_OST | awk '/^flags/ { print $2 }')
1312 [ -z "$FLAGS" ] || error "(7) Expect empty flags, but got '$FLAGS'"
1314 run_test 11a "LFSCK can rebuild lost last_id"
# test_11b body: leave the on-disk LAST_ID of ost1 behind the in-memory
# value, confirm the gap after a restart, then check that a layout LFSCK on
# the OST brings the on-disk value back to the expected one.
1317 check_mount_and_prep
1318 $SETSTRIPE -c 1 -i 0 $DIR/$tdir
1320 echo "set fail_loc=0x160d to skip the updating LAST_ID on-disk"
1321 #define OBD_FAIL_LFSCK_SKIP_LASTID 0x160d
1322 do_facet ost1 $LCTL set_param fail_loc=0x160d
# Create 32 more files than the precreated object count reported for (0, 0).
1324 local count=$(precreated_ost_obj_count 0 0)
1326 createmany -o $DIR/$tdir/f $((count + 32))
# lastid1: the last_id entry of ost1 matching the sequence that mds1
# reports in osp.*.prealloc_last_seq, sampled before the restart.
1328 local proc_path="${FSNAME}-OST0000-osc-MDT0000"
1329 local seq=$(do_facet mds1 $LCTL get_param -n \
1330 osp.${proc_path}.prealloc_last_seq)
1331 local lastid1=$(do_facet ost1 "lctl get_param -n \
1332 obdfilter.${ost1_svc}.last_id" | grep $seq |
1333 awk -F: '{ print $2 }')
1335 umount_client $MOUNT
1336 stop ost1 || error "(1) Fail to stop ost1"
# fail_loc 0x215 is set on ost1 before it is started again.
1338 #define OBD_FAIL_OST_ENOSPC 0x215
1339 do_facet ost1 $LCTL set_param fail_loc=0x215
1341 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1342 error "(2) Fail to start ost1"
# Poll up to 60 times until last_id for $seq can be read back as lastid2.
# NOTE(review): $lastid2 is unquoted in the test below; with an empty value
# it becomes '[ ! -z ]', which evaluates false, so the loop keeps polling.
1344 for ((i = 0; i < 60; i++)); do
1345 lastid2=$(do_facet ost1 "lctl get_param -n \
1346 obdfilter.${ost1_svc}.last_id" | grep $seq |
1347 awk -F: '{ print $2 }')
1348 [ ! -z $lastid2 ] && break;
1352 echo "the on-disk LAST_ID should be smaller than the expected one"
1353 [ $lastid1 -gt $lastid2 ] ||
1354 error "(4) expect lastid1 [ $lastid1 ] > lastid2 [ $lastid2 ]"
1356 echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
1357 $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
1359 wait_update_facet ost1 "$LCTL get_param -n \
1360 obdfilter.${OST_DEV}.lfsck_layout |
1361 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1363 error "(6) unexpected status"
# Restart ost1 once more so last_id is read after a fresh start.
1366 stop ost1 || error "(7) Fail to stop ost1"
1368 start ost1 $(ostdevname 1) $OST_MOUNT_OPTS ||
1369 error "(8) Fail to start ost1"
1371 echo "the on-disk LAST_ID should have been rebuilt"
# On timeout, dump the full last_id output before failing.
1372 wait_update_facet ost1 "$LCTL get_param -n \
1373 obdfilter.${ost1_svc}.last_id | grep $seq |
1374 awk -F: '{ print \\\$2 }'" "$lastid1" 60 || {
1375 do_facet ost1 $LCTL get_param -n \
1376 obdfilter.${ost1_svc}.last_id
1377 error "(9) expect lastid1 $seq:$lastid1"
1380 do_facet ost1 $LCTL set_param fail_loc=0
1381 stopall || error "(10) Fail to stopall"
1383 run_test 11b "LFSCK can rebuild crashed last_id"
# test_12a body: drive namespace and layout LFSCK on every target with
# single 'lfsck_start -A' / 'lfsck_stop -A' commands issued on mds1, and
# check the status reached after each command.
# NOTE(review): the last argument of wait_all_targets and
# wait_all_targets_blocked follows the numbering of the error() steps;
# presumably it is the step id used in their failure message — confirm.
1386 [ $MDSCOUNT -lt 2 ] &&
1387 skip "We need at least 2 MDSes for test_12a" && return
# One directory with 100 files per MDT (directory k lives on MDT k-1).
1389 check_mount_and_prep
1390 for k in $(seq $MDSCOUNT); do
1391 $LFS mkdir -i $((k - 1)) $DIR/$tdir/${k}
1392 createmany -o $DIR/$tdir/${k}/f 100 ||
1393 error "(0) Fail to create 100 files."
# Namespace LFSCK: start with '-s 1', stop, then restart with '-s 0'.
1396 echo "Start namespace LFSCK on all targets by single command (-s 1)."
1397 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1398 -s 1 -r || error "(2) Fail to start LFSCK on all devices!"
1400 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1401 wait_all_targets namespace scanning-phase1 3
1403 echo "Stop namespace LFSCK on all targets by single lctl command."
1404 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1405 error "(4) Fail to stop LFSCK on all devices!"
1407 echo "All the LFSCK targets should be in 'stopped' status."
1408 wait_all_targets_blocked namespace stopped 5
1410 echo "Re-start namespace LFSCK on all targets by single command (-s 0)."
1411 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t namespace -A \
1412 -s 0 -r || error "(6) Fail to start LFSCK on all devices!"
1414 echo "All the LFSCK targets should be in 'completed' status."
1415 wait_all_targets_blocked namespace completed 7
# Layout LFSCK: same start / stop / restart sequence, with full debug
# logging enabled for its duration.
1417 start_full_debug_logging
1419 echo "Start layout LFSCK on all targets by single command (-s 1)."
1420 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1421 -s 1 -r || error "(8) Fail to start LFSCK on all devices!"
1423 echo "All the LFSCK targets should be in 'scanning-phase1' status."
1424 wait_all_targets layout scanning-phase1 9
1426 echo "Stop layout LFSCK on all targets by single lctl command."
1427 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 -A ||
1428 error "(10) Fail to stop LFSCK on all devices!"
1430 echo "All the LFSCK targets should be in 'stopped' status."
1431 wait_all_targets_blocked layout stopped 11
# The OSTs are checked one by one through obdfilter.*.lfsck_layout.
1433 for k in $(seq $OSTCOUNT); do
1434 local STATUS=$(do_facet ost${k} $LCTL get_param -n \
1435 obdfilter.$(facet_svc ost${k}).lfsck_layout |
1436 awk '/^status/ { print $2 }')
1437 [ "$STATUS" == "stopped" ] ||
1438 error "(12) OST${k} Expect 'stopped', but got '$STATUS'"
1441 echo "Re-start layout LFSCK on all targets by single command (-s 0)."
1442 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -t layout -A \
1443 -s 0 -r || error "(13) Fail to start LFSCK on all devices!"
1445 echo "All the LFSCK targets should be in 'completed' status."
1446 wait_all_targets_blocked layout completed 14
1448 stop_full_debug_logging
1450 run_test 12a "single command to trigger LFSCK on all devices"
# test_12b body: start LFSCK without naming a device ('-M') and check that
# both namespace and layout scans complete; when mds1 hosts more than one
# MDT, a start with neither '-M' nor '-A' is expected to fail.
1453 check_mount_and_prep
1455 echo "Start LFSCK without '-M' specified."
1456 do_facet mds1 $LCTL lfsck_start -A -r ||
1457 error "(0) Fail to start LFSCK without '-M'"
1459 wait_all_targets_blocked namespace completed 1
1460 wait_all_targets_blocked layout completed 2
# Count the 'lctl dl' entries on mds1 whose third column contains "mdt".
1462 local count=$(do_facet mds1 $LCTL dl |
1463 awk '{ print $3 }' | grep mdt | wc -l)
1464 if [ $count -gt 1 ]; then
1466 echo "Start layout LFSCK on the node with multipe targets,"
1467 echo "but not specify '-M'/'-A' option. Should get failure."
# The trailing '|| true' keeps the expected failure of lfsck_start from
# leaving a non-zero status behind.
1469 do_facet mds1 $LCTL lfsck_start -t layout -r &&
1470 error "(3) Start layout LFSCK should fail" || true
1473 run_test 12b "auto detect Lustre device"
# test_13 body: create files while fail_loc 0x160f is set on the MDS, then
# check that the layout LFSCK counts exactly 2 'repaired_others'.
1477 echo "The lmm_oi in layout EA should be consistent with the MDT-object"
1478 echo "FID; otherwise, the LFSCK should re-generate the lmm_oi from the"
1479 echo "MDT-object FID."
1482 check_mount_and_prep
1484 echo "Inject failure stub to simulate bad lmm_oi"
1485 #define OBD_FAIL_LFSCK_BAD_LMMOI 0x160f
1486 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x160f
# Two files are created under the fault: one through createmany and one
# composite-layout (PFL) file, f1.
1487 createmany -o $DIR/$tdir/f 1
1488 $LFS setstripe -E 1M -E -1 $DIR/$tdir/f1 ||
1489 error "(0) Fail to create PFL $DIR/$tdir/f1"
1490 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1492 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
1493 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1495 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1496 mdd.${MDT_DEV}.lfsck_layout |
1497 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1499 error "(2) unexpected status"
# Exactly two repairs are expected.
1502 local repaired=$($SHOW_LAYOUT |
1503 awk '/^repaired_others/ { print $2 }')
1504 [ $repaired -eq 2 ] ||
1505 error "(3) Fail to repair crashed lmm_oi: $repaired"
1507 run_test 13 "LFSCK can repair crashed lmm_oi"
# test_14a body: create files while fail_loc 0x1610 is set on ost1, check
# that a plain layout LFSCK only counts the dangling references, and that a
# second run with '-c' makes the guard files stat-able with size 0.
1511 echo "The OST-object referenced by the MDT-object should be there;"
1512 echo "otherwise, the LFSCK should re-create the missing OST-object."
1513 echo "without '--delay-create-ostobj' option."
1516 check_mount_and_prep
1517 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1519 echo "Inject failure stub to simulate dangling referenced MDT-object"
1520 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1521 do_facet ost1 $LCTL set_param fail_loc=0x1610
1522 local count=$(precreated_ost_obj_count 0 0)
# Plain files: 16 more than the precreated object count, plus guard0.
1524 createmany -o $DIR/$tdir/f $((count + 16)) ||
1525 error "(0.1) Fail to create $DIR/$tdir/fx"
1526 touch $DIR/$tdir/guard0
# 16 composite-layout files, plus guard1.
1528 for ((i = 0; i < 16; i++)); do
1529 $LFS setstripe -E 512K -S 256K -o 0 -E 2M \
1530 $DIR/$tdir/f_comp${i} ||
1531 error "(0.2) Fail to create $DIR/$tdir/f_comp${i}"
1533 touch $DIR/$tdir/guard1
1535 do_facet ost1 $LCTL set_param fail_loc=0
1537 start_full_debug_logging
1539 # exhaust other pre-created dangling cases
1540 count=$(precreated_ost_obj_count 0 0)
1541 createmany -o $DIR/$tdir/a $count ||
1542 error "(0.5) Fail to create $count files."
1544 echo "'ls' should fail because of dangling referenced MDT-object"
1545 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
# First run: no '-c', so the guard files must still fail to stat afterwards.
1547 echo "Trigger layout LFSCK to find out dangling reference"
1548 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1550 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1551 mdd.${MDT_DEV}.lfsck_layout |
1552 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1554 error "(3) unexpected status"
# At least 32 dangling references must be counted.
1557 local repaired=$($SHOW_LAYOUT |
1558 awk '/^repaired_dangling/ { print $2 }')
1559 [ $repaired -ge 32 ] ||
1560 error "(4) Fail to repair dangling reference: $repaired"
1562 echo "'stat' should fail because of not repair dangling by default"
1563 stat $DIR/$tdir/guard0 > /dev/null 2>&1 &&
1564 error "(5.1) stat should fail"
1565 stat $DIR/$tdir/guard1 > /dev/null 2>&1 &&
1566 error "(5.2) stat should fail"
# Second run: '-c' added.
1568 echo "Trigger layout LFSCK to repair dangling reference"
1569 $START_LAYOUT -r -c || error "(6) Fail to start LFSCK for layout!"
1571 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1572 mdd.${MDT_DEV}.lfsck_layout |
1573 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1575 error "(7) unexpected status"
1578 # There may be some async LFSCK updates in processing, wait for
1579 # a while until the target reparation has been done. LU-4970.
# Poll stat until the Size field reads 0; on timeout, print the stat output
# before failing.
1581 echo "'stat' should success after layout LFSCK repairing"
1582 wait_update_facet client "stat $DIR/$tdir/guard0 |
1583 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1584 stat $DIR/$tdir/guard0
1586 error "(8.1) unexpected size"
1589 wait_update_facet client "stat $DIR/$tdir/guard1 |
1590 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1591 stat $DIR/$tdir/guard1
1593 error "(8.2) unexpected size"
1596 repaired=$($SHOW_LAYOUT |
1597 awk '/^repaired_dangling/ { print $2 }')
1598 [ $repaired -ge 32 ] ||
1599 error "(9) Fail to repair dangling reference: $repaired"
1601 stop_full_debug_logging
# NOTE(review): the echo announces a stopall, but only setupall is visible.
1603 echo "stopall to cleanup object cache"
1606 setupall > /dev/null
1608 run_test 14a "LFSCK can repair MDT-object with dangling LOV EA reference (1)"
# test_14b body: same dangling-reference scenario as variant (1), but the
# layout LFSCK is started with '-o -d' and '-o -c -d', and completion is
# awaited through wait_all_targets_blocked.
1612 echo "The OST-object referenced by the MDT-object should be there;"
1613 echo "otherwise, the LFSCK should re-create the missing OST-object."
1614 echo "with '--delay-create-ostobj' option."
1617 check_mount_and_prep
1618 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1620 echo "Inject failure stub to simulate dangling referenced MDT-object"
1621 #define OBD_FAIL_LFSCK_DANGLING 0x1610
1622 do_facet ost1 $LCTL set_param fail_loc=0x1610
1623 local count=$(precreated_ost_obj_count 0 0)
# 31 files beyond the precreated object count, plus one guard file.
1625 createmany -o $DIR/$tdir/f $((count + 31))
1626 touch $DIR/$tdir/guard
1627 do_facet ost1 $LCTL set_param fail_loc=0
1629 start_full_debug_logging
1631 # exhaust other pre-created dangling cases
1632 count=$(precreated_ost_obj_count 0 0)
1633 createmany -o $DIR/$tdir/a $count ||
1634 error "(0) Fail to create $count files."
1636 echo "'ls' should fail because of dangling referenced MDT-object"
1637 ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(1) ls should fail."
# First run: no '-c', so the guard file must still fail to stat afterwards.
1639 echo "Trigger layout LFSCK to find out dangling reference"
1640 $START_LAYOUT -r -o -d || error "(2) Fail to start LFSCK for layout!"
1642 wait_all_targets_blocked layout completed 3
# At least 32 dangling references must be counted.
1644 local repaired=$($SHOW_LAYOUT |
1645 awk '/^repaired_dangling/ { print $2 }')
1646 [ $repaired -ge 32 ] ||
1647 error "(4) Fail to repair dangling reference: $repaired"
1649 echo "'stat' should fail because of not repair dangling by default"
1650 stat $DIR/$tdir/guard > /dev/null 2>&1 && error "(5) stat should fail"
# Second run: '-c' added.
1652 echo "Trigger layout LFSCK to repair dangling reference"
1653 $START_LAYOUT -r -o -c -d || error "(6) Fail to start LFSCK for layout!"
1655 wait_all_targets_blocked layout completed 7
1657 # There may be some async LFSCK updates in processing, wait for
1658 # a while until the target reparation has been done. LU-4970.
# Poll stat until the Size field reads 0; on timeout, print the stat output
# before failing.
1660 echo "'stat' should success after layout LFSCK repairing"
1661 wait_update_facet client "stat $DIR/$tdir/guard |
1662 awk '/Size/ { print \\\$2 }'" "0" 32 || {
1663 stat $DIR/$tdir/guard
1665 error "(8) unexpected size"
1668 repaired=$($SHOW_LAYOUT |
1669 awk '/^repaired_dangling/ { print $2 }')
1670 [ $repaired -ge 32 ] ||
1671 error "(9) Fail to repair dangling reference: $repaired"
1673 stop_full_debug_logging
# NOTE(review): the echo announces a stopall, but only setupall is visible.
1675 echo "stopall to cleanup object cache"
1678 setupall > /dev/null
1680 run_test 14b "LFSCK can repair MDT-object with dangling LOV EA reference (2)"
# test_15a body: write files while fail_loc 0x1611 is set on all OST nodes,
# then check that the layout LFSCK counts at least 3
# 'repaired_unmatched_pair'.
1684 echo "If the OST-object referenced by the MDT-object back points"
1685 echo "to some non-exist MDT-object, then the LFSCK should repair"
1686 echo "the OST-object to back point to the right MDT-object."
1689 check_mount_and_prep
1690 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1692 echo "Inject failure stub to make the OST-object to back point to"
1693 echo "non-exist MDT-object."
1694 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
# The fault is set on every OST node, not only on ost1.
1696 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
1697 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
# NOTE(review): the continuation line carrying the target path and the '||'
# of this setstripe command is not visible; the error text names f1.
1698 $LFS setstripe -E 1M -S 256K -c 1 -E -1 -S 512K -c $OSTCOUNT \
1700 error "(0) Fail to create PFL $DIR/$tdir/f1"
1701 # 'dd' will trigger punch RPC firstly on every OST-objects.
1702 # So even though some OST-object will not be write by 'dd',
1703 # as long as it is allocated (may be NOT allocated in pfl_3b)
1704 # its layout information will be set also.
# 257 blocks of 4K is one block past 1M, the end of the first component.
1705 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4K count=257
1706 cancel_lru_locks osc
1707 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1709 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1710 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1712 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1713 mdd.${MDT_DEV}.lfsck_layout |
1714 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1716 error "(2) unexpected status"
1719 local repaired=$($SHOW_LAYOUT |
1720 awk '/^repaired_unmatched_pair/ { print $2 }')
1721 [ $repaired -ge 3 ] ||
1722 error "(3) Fail to repair unmatched pair: $repaired"
1724 run_test 15a "LFSCK can repair unmatched MDT-object/OST-object pairs (1)"
# test_15b body: write files while fail_loc 0x1612 is set on all OST nodes,
# then check that the layout LFSCK counts exactly 4
# 'repaired_unmatched_pair'.
1728 echo "If the OST-object referenced by the MDT-object back points"
1729 echo "to other MDT-object that doesn't recognize the OST-object,"
1730 echo "then the LFSCK should repair it to back point to the right"
1731 echo "MDT-object (the first one)."
# The guard file is written before the fault is injected.
1734 check_mount_and_prep
1735 mkdir -p $DIR/$tdir/0
1736 $LFS setstripe -c 1 -i 0 $DIR/$tdir/0
1737 dd if=/dev/zero of=$DIR/$tdir/0/guard bs=1M count=1
1738 cancel_lru_locks osc
1740 echo "Inject failure stub to make the OST-object to back point to"
1741 echo "other MDT-object"
# NOTE(review): the initial assignment of 'stripes' is not visible; it is
# raised to 2 here when at least two OSTs exist.
1744 [ $OSTCOUNT -ge 2 ] && stripes=2
1746 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR2 0x1612
1747 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1612
1748 dd if=/dev/zero of=$DIR/$tdir/0/f0 bs=1M count=1
# NOTE(review): the continuation line carrying the target path and the '||'
# of this setstripe command is not visible; the error text names f1.
1749 $LFS setstripe -E 1M -S 256K -c $stripes -E 2M -S 512K -c 1 \
1751 error "(0) Fail to create PFL $DIR/$tdir/f1"
1752 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2
1753 cancel_lru_locks osc
1754 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
1756 echo "Trigger layout LFSCK to find out unmatched pairs and fix them"
1757 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1759 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1760 mdd.${MDT_DEV}.lfsck_layout |
1761 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1763 error "(2) unexpected status"
1766 local repaired=$($SHOW_LAYOUT |
1767 awk '/^repaired_unmatched_pair/ { print $2 }')
1768 [ $repaired -eq 4 ] ||
1769 error "(3) Fail to repair unmatched pair: $repaired"
1771 run_test 15b "LFSCK can repair unmatched MDT-object/OST-object pairs (2)"
# test_15c body: race a layout LFSCK against a delayed directory migration
# and check that it counts one unmatched pair and no multiple-referenced
# repair.
# Needs at least 2 MDTs; skipped on MDS versions >= 2.7.55 (see LU-6437).
1774 [ $MDSCOUNT -lt 2 ] &&
1775 skip "We need at least 2 MDSes for this test" && return
1777 [ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.55) ] &&
1778 skip "Skip the test after 2.7.55 see LU-6437" && return
1781 echo "According to current metadata migration implementation,"
1782 echo "before the old MDT-object is removed, both the new MDT-object"
1783 echo "and old MDT-object will reference the same LOV layout. Then if"
1784 echo "the layout LFSCK finds the new MDT-object by race, it will"
1785 echo "regard related OST-object(s) as multiple referenced case, and"
1786 echo "will try to create new OST-object(s) for the new MDT-object."
1787 echo "To avoid such trouble, the layout LFSCK needs to lock the old"
1788 echo "MDT-object before confirm the multiple referenced case."
# Directory a1 is created on MDT index 1 and holds one 1M file.
1791 check_mount_and_prep
1792 $LFS mkdir -i 1 $DIR/$tdir/a1
1793 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1794 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=1
1795 cancel_lru_locks osc
1797 echo "Inject failure stub on MDT1 to delay the migration"
1799 #define OBD_FAIL_MIGRATE_DELAY 0x1803
1800 do_facet mds2 $LCTL set_param fail_val=5 fail_loc=0x1803
# The migration runs in the background so the LFSCK below overlaps with it.
1801 echo "Migrate $DIR/$tdir/a1 from MDT1 to MDT0 with delay"
1802 $LFS migrate -m 0 $DIR/$tdir/a1 &
1805 echo "Trigger layout LFSCK to race with the migration"
1806 $START_LAYOUT -A -r || error "(1) Fail to start layout LFSCK!"
1808 wait_all_targets_blocked layout completed 2
1810 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
# Expect exactly one unmatched-pair repair...
1811 local repaired=$($SHOW_LAYOUT |
1812 awk '/^repaired_unmatched_pair/ { print $2 }')
1813 [ $repaired -eq 1 ] ||
1814 error "(3) Fail to repair unmatched pair: $repaired"
# ...and no multiple-referenced repair.
1816 repaired=$($SHOW_LAYOUT |
1817 awk '/^repaired_multiple_referenced/ { print $2 }')
1818 [ $repaired -eq 0 ] ||
1819 error "(4) Unexpectedly repaird multiple references: $repaired"
1821 run_test 15c "LFSCK can repair unmatched MDT-object/OST-object pairs (3)"
# test_16 body: chown a file while fail_loc 0x1613 is set on the MDS, then
# check that the layout LFSCK counts exactly one
# 'repaired_inconsistent_owner'.
1825 echo "If the OST-object's owner information does not match the owner"
1826 echo "information stored in the MDT-object, then the LFSCK trust the"
1827 echo "MDT-object and update the OST-object's owner information."
1830 check_mount_and_prep
1831 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1832 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1
1833 cancel_lru_locks osc
1835 echo "Inject failure stub to skip OST-object owner changing"
1836 #define OBD_FAIL_LFSCK_BAD_OWNER 0x1613
# The fault is active only for the duration of the chown.
1837 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1613
1838 chown 1.1 $DIR/$tdir/f0
1839 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
1841 echo "Trigger layout LFSCK to find out inconsistent OST-object owner"
1844 $START_LAYOUT -r || error "(1) Fail to start LFSCK for layout!"
1846 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1847 mdd.${MDT_DEV}.lfsck_layout |
1848 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1850 error "(2) unexpected status"
1853 local repaired=$($SHOW_LAYOUT |
1854 awk '/^repaired_inconsistent_owner/ { print $2 }')
1855 [ $repaired -eq 1 ] ||
1856 error "(3) Fail to repair inconsistent owner: $repaired"
1858 run_test 16 "LFSCK can repair inconsistent MDT-object/OST-object owner"
# test_17 body: create f0 and f1 while fail_loc 0x1614 is set on the MDS so
# that they share OST-objects with 'guard', check that the layout LFSCK
# counts 2 'repaired_multiple_referenced', then confirm that writing f0 and
# f1 no longer changes the size of guard.
1862 echo "If more than one MDT-objects reference the same OST-object,"
1863 echo "and the OST-object only recognizes one MDT-object, then the"
1864 echo "LFSCK should create new OST-objects for such non-recognized"
1868 check_mount_and_prep
1869 $LFS setstripe -c 1 -i 0 $DIR/$tdir
1871 echo "Inject failure stub to make two MDT-objects to refernce"
1872 echo "the OST-object"
1874 #define OBD_FAIL_LFSCK_MULTIPLE_REF 0x1614
1875 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0x1614
# guard is written first (1M); client locks are dropped after each creation.
1876 dd if=/dev/zero of=$DIR/$tdir/guard bs=1M count=1
1877 cancel_lru_locks mdc
1878 cancel_lru_locks osc
1880 createmany -o $DIR/$tdir/f 1
1881 cancel_lru_locks mdc
1882 cancel_lru_locks osc
# NOTE(review): the continuation line carrying the target path and the '||'
# of this setstripe command is not visible; the error text names f1.
1884 $LFS setstripe -E 2M -S 256K -o 0 -E 4M -S 512K -o 0 \
1886 error "(0) Fail to create PFL $DIR/$tdir/f1"
1887 cancel_lru_locks mdc
1888 cancel_lru_locks osc
1889 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
# Before the repair both files report guard's 1M size although neither was
# written.
1891 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard use the same OST-objects"
1892 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard use the same OST-objects"
1893 local size=$(ls -l $DIR/$tdir/f0 | awk '{ print $5 }')
1894 [ $size -eq 1048576 ] ||
1895 error "(1.1) f0 (wrong) size should be 1048576, but got $size"
1897 size=$(ls -l $DIR/$tdir/f1 | awk '{ print $5 }')
1898 [ $size -eq 1048576 ] ||
1899 error "(1.2) f1 (wrong) size should be 1048576, but got $size"
1901 echo "Trigger layout LFSCK to find out multiple refenced MDT-objects"
1904 $START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"
1906 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
1907 mdd.${MDT_DEV}.lfsck_layout |
1908 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
1910 error "(3) unexpected status"
1913 local repaired=$($SHOW_LAYOUT |
1914 awk '/^repaired_multiple_referenced/ { print $2 }')
1915 [ $repaired -eq 2 ] ||
1916 error "(4) Fail to repair multiple references: $repaired"
# After the repair, writing 2M into f0 / f1 must leave guard at 1M.
1918 echo "$DIR/$tdir/f0 and $DIR/$tdir/guard should use diff OST-objects"
1919 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=2 ||
1920 error "(5) Fail to write f0."
1921 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1922 [ $size -eq 1048576 ] ||
1923 error "(6) guard size should be 1048576, but got $size"
1925 echo "$DIR/$tdir/f1 and $DIR/$tdir/guard should use diff OST-objects"
1926 dd if=/dev/zero of=$DIR/$tdir/f1 bs=1M count=2 ||
1927 error "(7) Fail to write f1."
1928 size=$(ls -l $DIR/$tdir/guard | awk '{ print $5 }')
1929 [ $size -eq 1048576 ] ||
1930 error "(8) guard size should be 1048576, but got $size"
1932 run_test 17 "LFSCK can repair multiple references"
# Add 'cache' to the local lctl debug mask before the tests that follow;
# the command output is discarded.
1934 $LCTL set_param debug=+cache > /dev/null
# test_18a body: chown files while fail_loc 0x1615 is set on the MDS(es),
# confirm the reported sizes become wrong, then check that a layout LFSCK
# started with '-o' restores the sizes and counts the expected number of
# 'repaired_orphan' on each MDS.
# NOTE(review): LTIME is used below but not assigned in the visible lines.
1938 echo "The target MDT-object is there, but related stripe information"
1939 echo "is lost or partly lost. The LFSCK should regenerate the missing"
1940 echo "layout EA entries."
# a1/f1: single stripe on MDT index 0, 2M of data; its size column is
# saved as saved_size1.
1943 check_mount_and_prep
1944 $LFS mkdir -i 0 $DIR/$tdir/a1
1945 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
1946 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
1948 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1950 $LFS path2fid $DIR/$tdir/a1/f1
1951 $LFS getstripe $DIR/$tdir/a1/f1
# a2/f2: two stripes, created on MDT index 1; only with 2 or more MDTs.
1953 if [ $MDSCOUNT -ge 2 ]; then
1954 $LFS mkdir -i 1 $DIR/$tdir/a2
1955 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
1956 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
1957 $LFS path2fid $DIR/$tdir/a2/f2
1958 $LFS getstripe $DIR/$tdir/a2/f2
# f3: composite-layout file, 2M of data; its size is saved as saved_size2.
1961 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
1962 error "(0) Fail to create PFL $DIR/$tdir/f3"
1964 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
1966 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
1968 $LFS path2fid $DIR/$tdir/f3
1969 $LFS getstripe $DIR/$tdir/f3
1971 cancel_lru_locks osc
# The chown calls are issued while the fault is set on the relevant MDS.
1973 echo "Inject failure, to make the MDT-object lost its layout EA"
1974 #define OBD_FAIL_LFSCK_LOST_STRIPE 0x1615
1975 do_facet mds1 $LCTL set_param fail_loc=0x1615
1976 chown 1.1 $DIR/$tdir/a1/f1
1978 if [ $MDSCOUNT -ge 2 ]; then
1979 do_facet mds2 $LCTL set_param fail_loc=0x1615
1980 chown 1.1 $DIR/$tdir/a2/f2
1983 chown 1.1 $DIR/$tdir/f3
1988 do_facet mds1 $LCTL set_param fail_loc=0
1989 if [ $MDSCOUNT -ge 2 ]; then
1990 do_facet mds2 $LCTL set_param fail_loc=0
1993 cancel_lru_locks mdc
1994 cancel_lru_locks osc
# Before the repair every file must report a size different from the saved
# one.
# NOTE(review): f2 is compared against saved_size1 (taken from f1); both
# files were written with the same dd size — confirm this is intended.
1996 echo "The file size should be incorrect since layout EA is lost"
1997 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
1998 [ "$cur_size" != "$saved_size1" ] ||
1999 error "(1) Expect incorrect file1 size"
2001 if [ $MDSCOUNT -ge 2 ]; then
2002 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2003 [ "$cur_size" != "$saved_size1" ] ||
2004 error "(2) Expect incorrect file2 size"
2007 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2008 [ "$cur_size" != "$saved_size2" ] ||
2009 error "(1.2) Expect incorrect file3 size"
2011 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2012 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
# Every MDS is polled for 'completed', using $LTIME as the timeout.
2014 for k in $(seq $MDSCOUNT); do
2015 # The LFSCK status query internal is 30 seconds. For the case
2016 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2017 # time to guarantee the status sync up.
2018 wait_update_facet mds${k} "$LCTL get_param -n \
2019 mdd.$(facet_svc mds${k}).lfsck_layout |
2020 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2021 error "(4) MDS${k} is not the expected 'completed'"
# The OSTs are sampled once each, without polling.
2024 for k in $(seq $OSTCOUNT); do
2025 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2026 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2027 awk '/^status/ { print $2 }')
2028 [ "$cur_status" == "completed" ] ||
2029 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
# Expected repair counts: 3 on mds1 and, when present, 2 on mds2.
2032 local repaired=$(do_facet mds1 $LCTL get_param -n \
2033 mdd.$(facet_svc mds1).lfsck_layout |
2034 awk '/^repaired_orphan/ { print $2 }')
2035 [ $repaired -eq 3 ] ||
2036 error "(6.1) Expect 3 fixed on mds1, but got: $repaired"
2038 if [ $MDSCOUNT -ge 2 ]; then
2039 repaired=$(do_facet mds2 $LCTL get_param -n \
2040 mdd.$(facet_svc mds2).lfsck_layout |
2041 awk '/^repaired_orphan/ { print $2 }')
2042 [ $repaired -eq 2 ] ||
2043 error "(6.2) Expect 2 fixed on mds2, but got: $repaired"
# Print FIDs and layouts again after the repair.
2046 $LFS path2fid $DIR/$tdir/a1/f1
2047 $LFS getstripe $DIR/$tdir/a1/f1
2049 if [ $MDSCOUNT -ge 2 ]; then
2050 $LFS path2fid $DIR/$tdir/a2/f2
2051 $LFS getstripe $DIR/$tdir/a2/f2
2054 $LFS path2fid $DIR/$tdir/f3
2055 $LFS getstripe $DIR/$tdir/f3
# After the repair every file must report its saved size again.
2057 echo "The file size should be correct after layout LFSCK scanning"
2058 cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2059 [ "$cur_size" == "$saved_size1" ] ||
2060 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2062 if [ $MDSCOUNT -ge 2 ]; then
2063 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2064 [ "$cur_size" == "$saved_size1" ] ||
2065 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# NOTE(review): the "(9)" message says "file1" although this check is on f3.
2068 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2069 [ "$cur_size" == "$saved_size2" ] ||
2070 error "(9) Expect file1 size $saved_size2, but got $cur_size"
2072 run_test 18a "Find out orphan OST-object and repair it (1)"
# test_18b: files are created on MDT0 (and MDT1 when MDSCOUNT >= 2), then
# removed while fail_loc 0x1616 (OBD_FAIL_LFSCK_LOST_MDTOBJ) is set on the MDS.
# Layout LFSCK is started afterwards; the test expects the repaired_orphan
# counters to match, then moves the <FID>-R-0 entries from
# .lustre/lost+found/MDTxxxx back into the namespace and re-checks the sizes.
2076 echo "The target MDT-object is lost. The LFSCK should re-create the"
2077 echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
2078 echo "can move it back to normal namespace manually."
2081 check_mount_and_prep
2082 $LFS mkdir -i 0 $DIR/$tdir/a1
2083 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2084 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
# Record size and FID up front: the size is compared after the repair and the
# FID is used to build the lost+found entry name (${fid1}-R-0) further down.
2085 local saved_size1=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2086 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2088 $LFS getstripe $DIR/$tdir/a1/f1
2090 if [ $MDSCOUNT -ge 2 ]; then
2091 $LFS mkdir -i 1 $DIR/$tdir/a2
2092 $LFS setstripe -c 2 -i 1 -S 1M $DIR/$tdir/a2
2093 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
# Declared local (like fid1/fid3) so the value does not leak out of the test.
2094 local fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
2096 $LFS getstripe $DIR/$tdir/a2/f2
# f3 is a two-component PFL file, in addition to the plain-layout files above.
2099 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2100 error "(0) Fail to create PFL $DIR/$tdir/f3"
2102 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2104 local saved_size2=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2105 local fid3=$($LFS path2fid $DIR/$tdir/f3)
2107 $LFS getstripe $DIR/$tdir/f3
2109 cancel_lru_locks osc
2111 echo "Inject failure, to simulate the case of missing the MDT-object"
2112 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2113 do_facet mds1 $LCTL set_param fail_loc=0x1616
2114 rm -f $DIR/$tdir/a1/f1
2116 if [ $MDSCOUNT -ge 2 ]; then
2117 do_facet mds2 $LCTL set_param fail_loc=0x1616
2118 rm -f $DIR/$tdir/a2/f2
# Clear the fault injection again before LFSCK is started.
2126 do_facet mds1 $LCTL set_param fail_loc=0
2127 if [ $MDSCOUNT -ge 2 ]; then
2128 do_facet mds2 $LCTL set_param fail_loc=0
2131 cancel_lru_locks mdc
2132 cancel_lru_locks osc
2134 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2135 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2137 for k in $(seq $MDSCOUNT); do
2138 # The LFSCK status query interval is 30 seconds. For the case
2139 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2140 # time to guarantee the status sync up.
2141 wait_update_facet mds${k} "$LCTL get_param -n \
2142 mdd.$(facet_svc mds${k}).lfsck_layout |
2143 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2144 error "(2) MDS${k} is not the expected 'completed'"
# The OSTs are sampled once (no waiting) and must already report 'completed'.
2147 for k in $(seq $OSTCOUNT); do
2148 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2149 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2150 awk '/^status/ { print $2 }')
2151 [ "$cur_status" == "completed" ] ||
2152 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
2155 local repaired=$(do_facet mds1 $LCTL get_param -n \
2156 mdd.$(facet_svc mds1).lfsck_layout |
2157 awk '/^repaired_orphan/ { print $2 }')
2158 [ $repaired -eq 3 ] ||
2159 error "(4.1) Expect 3 fixed on mds1, but got: $repaired"
2161 if [ $MDSCOUNT -ge 2 ]; then
2162 repaired=$(do_facet mds2 $LCTL get_param -n \
2163 mdd.$(facet_svc mds2).lfsck_layout |
2164 awk '/^repaired_orphan/ { print $2 }')
2165 [ $repaired -eq 2 ] ||
2166 error "(4.2) Expect 2 fixed on mds2, but got: $repaired"
2169 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2170 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2171 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2173 if [ $MDSCOUNT -ge 2 ]; then
2174 local name=$MOUNT/.lustre/lost+found/MDT0001/${fid2}-R-0
2175 mv $name $DIR/$tdir/a2/f2 || error "(6) Fail to move $name"
# Uses its own tag "(6.1)": the f1 move above already reports as "(5)".
2178 mv $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0 $DIR/$tdir/f3 ||
2179 error "(6.1) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
2181 $LFS path2fid $DIR/$tdir/a1/f1
2182 $LFS getstripe $DIR/$tdir/a1/f1
2184 if [ $MDSCOUNT -ge 2 ]; then
2185 $LFS path2fid $DIR/$tdir/a2/f2
2186 $LFS getstripe $DIR/$tdir/a2/f2
2189 $LFS path2fid $DIR/$tdir/f3
2190 $LFS getstripe $DIR/$tdir/f3
2192 echo "The file size should be correct after layout LFSCK scanning"
2193 local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
2194 [ "$cur_size" == "$saved_size1" ] ||
2195 error "(7) Expect file1 size $saved_size1, but got $cur_size"
2197 if [ $MDSCOUNT -ge 2 ]; then
2198 cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
2199 [ "$cur_size" == "$saved_size1" ] ||
2200 error "(8) Expect file2 size $saved_size1, but got $cur_size"
# This check is about f3, so the message names file3 (it used to say file1).
2203 cur_size=$(ls -il $DIR/$tdir/f3 | awk '{ print $6 }')
2204 [ "$cur_size" == "$saved_size2" ] ||
2205 error "(9) Expect file3 size $saved_size2, but got $cur_size"
2207 run_test 18b "Find out orphan OST-object and repair it (2)"
# test_18c: files are written while fail_loc 0x1617 (OBD_FAIL_LFSCK_NOPFID) is
# set on the OST nodes, then removed under fail_loc 0x1616
# (OBD_FAIL_LFSCK_LOST_MDTOBJ) on the MDS. After layout LFSCK completes, the
# test checks the repaired_orphan counters and looks for "*-N-*" entries under
# .lustre/lost+found/MDT0000 (and expects none under MDT0001).
2211 echo "The target MDT-object is lost, and the OST-object FID is missing."
2212 echo "The LFSCK should re-create the MDT-object with new FID under the "
2213 echo "directory .lustre/lost+found/MDTxxxx."
2216 check_mount_and_prep
2217 $LFS mkdir -i 0 $DIR/$tdir/a1
2218 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2220 echo "Inject failure, to simulate the case of missing parent FID"
2221 #define OBD_FAIL_LFSCK_NOPFID 0x1617
# The fail_loc stays set on every OST node while the files below are written.
2222 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1617
2224 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2225 $LFS getstripe $DIR/$tdir/a1/f1
2227 if [ $MDSCOUNT -ge 2 ]; then
2228 $LFS mkdir -i 1 $DIR/$tdir/a2
2229 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a2
2230 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2231 $LFS getstripe $DIR/$tdir/a2/f2
2234 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/f3 ||
2235 error "(0) Fail to create PFL $DIR/$tdir/f3"
2237 dd if=/dev/zero of=$DIR/$tdir/f3 bs=1M count=2
2238 $LFS getstripe $DIR/$tdir/f3
2240 cancel_lru_locks osc
2241 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
2243 echo "Inject failure, to simulate the case of missing the MDT-object"
2244 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2245 do_facet mds1 $LCTL set_param fail_loc=0x1616
2246 rm -f $DIR/$tdir/a1/f1
2248 if [ $MDSCOUNT -ge 2 ]; then
2249 do_facet mds2 $LCTL set_param fail_loc=0x1616
2250 rm -f $DIR/$tdir/a2/f2
2258 do_facet mds1 $LCTL set_param fail_loc=0
2259 if [ $MDSCOUNT -ge 2 ]; then
2260 do_facet mds2 $LCTL set_param fail_loc=0
2263 cancel_lru_locks mdc
2264 cancel_lru_locks osc
2266 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2267 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2269 for k in $(seq $MDSCOUNT); do
2270 # The LFSCK status query interval is 30 seconds. For the case
2271 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2272 # time to guarantee the status sync up.
2273 wait_update_facet mds${k} "$LCTL get_param -n \
2274 mdd.$(facet_svc mds${k}).lfsck_layout |
2275 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2276 error "(2) MDS${k} is not the expected 'completed'"
2279 for k in $(seq $OSTCOUNT); do
2280 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2281 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2282 awk '/^status/ { print $2 }')
2283 [ "$cur_status" == "completed" ] ||
2284 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
2287 if [ $MDSCOUNT -ge 2 ]; then
2293 local repaired=$(do_facet mds1 $LCTL get_param -n \
2294 mdd.$(facet_svc mds1).lfsck_layout |
2295 awk '/^repaired_orphan/ { print $2 }')
2296 [ $repaired -eq $expected ] ||
2297 error "(4) Expect $expected fixed on mds1, but got: $repaired"
2299 if [ $MDSCOUNT -ge 2 ]; then
2300 repaired=$(do_facet mds2 $LCTL get_param -n \
2301 mdd.$(facet_svc mds2).lfsck_layout |
2302 awk '/^repaired_orphan/ { print $2 }')
2303 [ $repaired -eq 0 ] ||
2304 error "(5) Expect 0 fixed on mds2, but got: $repaired"
2307 ls -ail $MOUNT/.lustre/lost+found/
2309 echo "There should NOT be some stub under .lustre/lost+found/MDT0001/"
2310 if [ -d $MOUNT/.lustre/lost+found/MDT0001 ]; then
# The -name pattern is quoted so find receives it verbatim; unquoted, the shell
# could expand it against matching names in the current working directory.
2311 cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name '*-N-*')
2313 error "(6) .lustre/lost+found/MDT0001/ should be empty"
2316 echo "There should be some stub under .lustre/lost+found/MDT0000/"
2317 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2318 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
2320 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-N-*')
2321 [ ! -z "$cname" ] ||
2322 error "(8) .lustre/lost+found/MDT0000/ should not be empty"
2324 run_test 18c "Find out orphan OST-object and repair it (3)"
# test_18d: with fail_loc 0x1618 (OBD_FAIL_LFSCK_CHANGE_STRIPE) set on the MDS,
# f2 and f4 are chown'ed and the "guard" files f1 and f3 are removed. After a
# restart, layout LFSCK is run and the test expects 2 repaired orphans,
# 0 repaired dangling references, and unchanged sizes for f2 and f4.
2328 echo "The target MDT-object layout EA is corrupted, but the right"
2329 echo "OST-object is still alive as orphan. The layout LFSCK will"
2330 echo "not create new OST-object to occupy such slot."
2333 check_mount_and_prep
2335 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
# f1/f2 use the directory default layout; f3/f4 form the same pair with f4
# created as a PFL file.
2336 echo "guard" > $DIR/$tdir/a1/f1
2337 echo "foo" > $DIR/$tdir/a1/f2
2339 echo "guard" > $DIR/$tdir/a1/f3
2340 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2341 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2342 echo "foo" > $DIR/$tdir/a1/f4
# Sizes recorded here are the reference values for checks (7) and (8) below.
2344 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2345 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2346 $LFS path2fid $DIR/$tdir/a1/f1
2347 $LFS getstripe $DIR/$tdir/a1/f1
2348 $LFS path2fid $DIR/$tdir/a1/f2
2349 $LFS getstripe $DIR/$tdir/a1/f2
2350 $LFS path2fid $DIR/$tdir/a1/f3
2351 $LFS getstripe $DIR/$tdir/a1/f3
2352 $LFS path2fid $DIR/$tdir/a1/f4
2353 $LFS getstripe $DIR/$tdir/a1/f4
2354 cancel_lru_locks osc
2356 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2357 echo "to reference the same OST-object (which is f1's OST-obejct)."
2358 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2359 echo "dangling reference case, but f2's old OST-object is there."
2361 echo "The failure also makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2362 echo "to reference the same OST-object (which is f3's OST-obejct)."
2363 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2364 echo "dangling reference case, but f4's old OST-object is there."
2367 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2368 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2369 chown 1.1 $DIR/$tdir/a1/f2
2370 chown 1.1 $DIR/$tdir/a1/f4
2371 rm -f $DIR/$tdir/a1/f1
2372 rm -f $DIR/$tdir/a1/f3
2375 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2377 echo "stopall to cleanup object cache"
2380 setupall > /dev/null
2382 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2383 $START_LAYOUT -r -o -c -d || error "(2) Fail to start LFSCK for layout!"
2385 for k in $(seq $MDSCOUNT); do
2386 # The LFSCK status query interval is 30 seconds. For the case
2387 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2388 # time to guarantee the status sync up.
2389 wait_update_facet mds${k} "$LCTL get_param -n \
2390 mdd.$(facet_svc mds${k}).lfsck_layout |
2391 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2392 error "(3) MDS${k} is not the expected 'completed'"
# The OSTs are sampled once (no waiting) and must already report 'completed'.
2395 for k in $(seq $OSTCOUNT); do
2396 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2397 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2398 awk '/^status/ { print $2 }')
2399 [ "$cur_status" == "completed" ] ||
2400 error "(4) OST${k} Expect 'completed', but got '$cur_status'"
2403 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2404 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2405 awk '/^repaired_orphan/ { print $2 }')
2406 [ $repaired -eq 2 ] ||
2407 error "(5) Expect 2 orphans have been fixed, but got: $repaired"
# Same variable, now holding the repaired_dangling counter.
2409 repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2410 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2411 awk '/^repaired_dangling/ { print $2 }')
2412 [ $repaired -eq 0 ] ||
2413 error "(6) Expect 0 dangling has been fixed, but got: $repaired"
2415 echo "The file size should be correct after layout LFSCK scanning"
2416 local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2417 [ "$cur_size" == "$saved_size1" ] ||
2418 error "(7) Expect file2 size $saved_size1, but got $cur_size"
2420 cur_size=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2421 [ "$cur_size" == "$saved_size2" ] ||
2422 error "(8) Expect file4 size $saved_size2, but got $cur_size"
2424 echo "The LFSCK should find back the original data."
2425 cat $DIR/$tdir/a1/f2
2426 $LFS path2fid $DIR/$tdir/a1/f2
2427 $LFS getstripe $DIR/$tdir/a1/f2
2428 cat $DIR/$tdir/a1/f4
2429 $LFS path2fid $DIR/$tdir/a1/f4
2430 $LFS getstripe $DIR/$tdir/a1/f4
2432 run_test 18d "Find out orphan OST-object and repair it (4)"
# test_18e: same preparation as test_18d (fail_loc 0x1618, chown f2/f4, remove
# f1/f3), but layout LFSCK is held back with fail_loc 0x1602
# (OBD_FAIL_LFSCK_DELAY3, fail_val=10) until it reports 'scanning-phase2';
# f2/f4 are appended to in that window. The test then expects 2 repaired
# orphans and two "*-C-*" entries under .lustre/lost+found/MDT0000.
2436 echo "The target MDT-object layout EA slot is occpuied by some new"
2437 echo "created OST-object when repair dangling reference case. Such"
2438 echo "conflict OST-object has been modified by others. To keep the"
2439 echo "new data, the LFSCK will create a new file to refernece this"
2440 echo "old orphan OST-object."
2443 check_mount_and_prep
2445 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2446 echo "guard" > $DIR/$tdir/a1/f1
2447 echo "foo" > $DIR/$tdir/a1/f2
2449 echo "guard" > $DIR/$tdir/a1/f3
2450 $LFS setstripe -E 1M -S 1M -o 0 -E -1 -S 1M $DIR/$tdir/a1/f4 ||
2451 error "(0) Fail to create PFL $DIR/$tdir/a1/f4"
2452 echo "foo" > $DIR/$tdir/a1/f4
# Original sizes of f2/f4; the lost+found entries are compared against them.
2454 local saved_size1=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
2455 local saved_size2=$(ls -il $DIR/$tdir/a1/f4 | awk '{ print $6 }')
2457 $LFS path2fid $DIR/$tdir/a1/f1
2458 $LFS getstripe $DIR/$tdir/a1/f1
2459 $LFS path2fid $DIR/$tdir/a1/f2
2460 $LFS getstripe $DIR/$tdir/a1/f2
2461 $LFS path2fid $DIR/$tdir/a1/f3
2462 $LFS getstripe $DIR/$tdir/a1/f3
2463 $LFS path2fid $DIR/$tdir/a1/f4
2464 $LFS getstripe $DIR/$tdir/a1/f4
2465 cancel_lru_locks osc
2467 echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
2468 echo "to reference the same OST-object (which is f1's OST-obejct)."
2469 echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
2470 echo "dangling reference case, but f2's old OST-object is there."
2472 echo "Also the failure makes $DIR/$tdir/a1/f3 and $DIR/$tdir/a1/f4"
2473 echo "to reference the same OST-object (which is f3's OST-obejct)."
2474 echo "Then drop $DIR/$tdir/a1/f3 and its OST-object, so f4 becomes"
2475 echo "dangling reference case, but f4's old OST-object is there."
2478 #define OBD_FAIL_LFSCK_CHANGE_STRIPE 0x1618
2479 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
2480 chown 1.1 $DIR/$tdir/a1/f2
2481 chown 1.1 $DIR/$tdir/a1/f4
2482 rm -f $DIR/$tdir/a1/f1
2483 rm -f $DIR/$tdir/a1/f3
2486 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
2488 echo "stopall to cleanup object cache"
2491 setupall > /dev/null
2493 #define OBD_FAIL_LFSCK_DELAY3 0x1602
2494 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
2496 start_full_debug_logging
2498 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2499 $START_LAYOUT -r -o -c || error "(2) Fail to start LFSCK for layout!"
# Wait for phase 2 first so the writes below happen while LFSCK is still running.
2501 wait_update_facet mds1 "$LCTL get_param -n \
2502 mdd.$(facet_svc mds1).lfsck_layout |
2503 awk '/^status/ { print \\\$2 }'" "scanning-phase2" $LTIME ||
2504 error "(3) MDS1 is not the expected 'scanning-phase2'"
2506 # to guarantee all updates are synced.
2510 echo "Write new data to f2/f4 to modify the new created OST-object."
2511 echo "dummy" >> $DIR/$tdir/a1/f2 || error "write a1/f2 failed"
2512 echo "dummy" >> $DIR/$tdir/a1/f4 || error "write a1/f4 failed"
# Remove the delay so LFSCK can run to completion.
2514 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
2516 for k in $(seq $MDSCOUNT); do
2517 # The LFSCK status query interval is 30 seconds. For the case
2518 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2519 # time to guarantee the status sync up.
2520 wait_update_facet mds${k} "$LCTL get_param -n \
2521 mdd.$(facet_svc mds${k}).lfsck_layout |
2522 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2523 error "(4) MDS${k} is not the expected 'completed'"
2526 for k in $(seq $OSTCOUNT); do
2527 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2528 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2529 awk '/^status/ { print $2 }')
2530 [ "$cur_status" == "completed" ] ||
2531 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
2534 stop_full_debug_logging
2536 local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
2537 mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
2538 awk '/^repaired_orphan/ { print $2 }')
2539 [ $repaired -eq 2 ] ||
2540 error "(6) Expect 2 orphans have been fixed, but got: $repaired"
2542 echo "There should be stub file under .lustre/lost+found/MDT0000/"
2543 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
2544 error "(7) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
2546 local count=$(ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-* | wc -l)
2547 if [ $count -ne 2 ]; then
2548 ls -l $MOUNT/.lustre/lost+found/MDT0000/*-C-*
2549 error "(8) Expect 2 stubs under lost+found, but got $count"
2552 echo "The stub file should keep the original f2 or f4 data"
# The -name pattern is quoted so find receives it verbatim; unquoted, the shell
# could expand it against matching names in the current working directory.
2553 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-C-*' | head -n 1)
2554 local cur_size=$(ls -il $cname | awk '{ print $6 }')
2555 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2556 error "(9) Got unexpected $cur_size"
2559 $LFS path2fid $cname
2560 $LFS getstripe $cname
# Second entry: the last line of the same find output.
2562 cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name '*-C-*' | tail -n 1)
2563 cur_size=$(ls -il $cname | awk '{ print $6 }')
2564 [ "$cur_size" != "$saved_size1" -a "$cur_size" != "$saved_size2" ] &&
2565 error "(10) Got unexpected $cur_size"
2568 $LFS path2fid $cname
2569 $LFS getstripe $cname
2571 echo "The f2/f4 should contains new data."
2572 cat $DIR/$tdir/a1/f2
2573 $LFS path2fid $DIR/$tdir/a1/f2
2574 $LFS getstripe $DIR/$tdir/a1/f2
2575 cat $DIR/$tdir/a1/f4
2576 $LFS path2fid $DIR/$tdir/a1/f4
2577 $LFS getstripe $DIR/$tdir/a1/f4
2579 run_test 18e "Find out orphan OST-object and repair it (5)"
# test_18f: requires at least 2 OSTs. Files are removed under fail_loc 0x1616
# (OBD_FAIL_LFSCK_LOST_MDTOBJ); the first layout LFSCK run is started with
# fail_loc 0x161c (OBD_FAIL_LFSCK_BAD_NETWORK) on mds1 and is expected to end
# 'partial' on the MDTs and ost1 and 'completed' on ost2, with 1 orphan
# repaired per MDT. A second run without the fault must complete everywhere
# with 2 orphans repaired per MDT.
2582 [ $OSTCOUNT -lt 2 ] &&
2583 skip "The test needs at least 2 OSTs" && return
2586 echo "The target MDT-object is lost. The LFSCK should re-create the"
2587 echo "MDT-object under .lustre/lost+found/MDTxxxx. If some OST fail"
2588 echo "to verify some OST-object(s) during the first stage-scanning,"
2589 echo "the LFSCK should skip orphan OST-objects for such OST. Others"
2590 echo "should not be affected."
2593 check_mount_and_prep
2594 $LFS mkdir -i 0 $DIR/$tdir/a1
2595 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a1
2596 dd if=/dev/zero of=$DIR/$tdir/a1/guard bs=1M count=2
2597 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
2598 $LFS mkdir -i 0 $DIR/$tdir/a2
2599 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a2
2600 dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
2601 $LFS getstripe $DIR/$tdir/a1/f1
2602 $LFS getstripe $DIR/$tdir/a2/f2
# Same layout again under directories created with "-i 1" when a second MDT exists.
2604 if [ $MDSCOUNT -ge 2 ]; then
2605 $LFS mkdir -i 1 $DIR/$tdir/a3
2606 $LFS setstripe -c 1 -i 0 $DIR/$tdir/a3
2607 dd if=/dev/zero of=$DIR/$tdir/a3/guard bs=1M count=2
2608 dd if=/dev/zero of=$DIR/$tdir/a3/f3 bs=1M count=2
2609 $LFS mkdir -i 1 $DIR/$tdir/a4
2610 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a4
2611 dd if=/dev/zero of=$DIR/$tdir/a4/f4 bs=1M count=2
2612 $LFS getstripe $DIR/$tdir/a3/f3
2613 $LFS getstripe $DIR/$tdir/a4/f4
2616 cancel_lru_locks osc
2618 echo "Inject failure, to simulate the case of missing the MDT-object"
2619 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2620 do_facet mds1 $LCTL set_param fail_loc=0x1616
2621 rm -f $DIR/$tdir/a1/f1
2622 rm -f $DIR/$tdir/a2/f2
2624 if [ $MDSCOUNT -ge 2 ]; then
2625 do_facet mds2 $LCTL set_param fail_loc=0x1616
2626 rm -f $DIR/$tdir/a3/f3
2627 rm -f $DIR/$tdir/a4/f4
2633 do_facet mds1 $LCTL set_param fail_loc=0
2634 if [ $MDSCOUNT -ge 2 ]; then
2635 do_facet mds2 $LCTL set_param fail_loc=0
2638 cancel_lru_locks mdc
2639 cancel_lru_locks osc
2641 echo "Inject failure, to simulate the OST0 fail to handle"
2642 echo "MDT0 LFSCK request during the first-stage scanning."
2643 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
2644 do_facet mds1 $LCTL set_param fail_loc=0x161c fail_val=0
2646 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2647 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2649 for k in $(seq $MDSCOUNT); do
2650 # The LFSCK status query interval is 30 seconds. For the case
2651 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2652 # time to guarantee the status sync up.
2653 wait_update_facet mds${k} "$LCTL get_param -n \
2654 mdd.$(facet_svc mds${k}).lfsck_layout |
2655 awk '/^status/ { print \\\$2 }'" "partial" $LTIME ||
2656 error "(2) MDS${k} is not the expected 'partial'"
2659 wait_update_facet ost1 "$LCTL get_param -n \
2660 obdfilter.$(facet_svc ost1).lfsck_layout |
2661 awk '/^status/ { print \\\$2 }'" "partial" $LTIME || {
2662 error "(3) OST1 is not the expected 'partial'"
2665 wait_update_facet ost2 "$LCTL get_param -n \
2666 obdfilter.$(facet_svc ost2).lfsck_layout |
2667 awk '/^status/ { print \\\$2 }'" "completed" $LTIME || {
2668 error "(4) OST2 is not the expected 'completed'"
# Clear the injected network failure before the counters are read.
2671 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
2673 local repaired=$(do_facet mds1 $LCTL get_param -n \
2674 mdd.$(facet_svc mds1).lfsck_layout |
2675 awk '/^repaired_orphan/ { print $2 }')
2676 [ $repaired -eq 1 ] ||
2677 error "(5) Expect 1 fixed on mds{1}, but got: $repaired"
2679 if [ $MDSCOUNT -ge 2 ]; then
2680 repaired=$(do_facet mds2 $LCTL get_param -n \
2681 mdd.$(facet_svc mds2).lfsck_layout |
2682 awk '/^repaired_orphan/ { print $2 }')
2683 [ $repaired -eq 1 ] ||
2684 error "(6) Expect 1 fixed on mds{2}, but got: $repaired"
2687 echo "Trigger layout LFSCK on all devices again to cleanup"
2688 $START_LAYOUT -r -o || error "(7) Fail to start LFSCK for layout!"
2690 for k in $(seq $MDSCOUNT); do
2691 # The LFSCK status query interval is 30 seconds. For the case
2692 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2693 # time to guarantee the status sync up.
2694 wait_update_facet mds${k} "$LCTL get_param -n \
2695 mdd.$(facet_svc mds${k}).lfsck_layout |
2696 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2697 error "(8) MDS${k} is not the expected 'completed'"
2700 for k in $(seq $OSTCOUNT); do
# Declared local (as in the sibling tests) so it does not leak to the caller.
2701 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2702 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2703 awk '/^status/ { print $2 }')
2704 [ "$cur_status" == "completed" ] ||
2705 error "(9) OST${k} Expect 'completed', but got '$cur_status'"
2709 local repaired=$(do_facet mds1 $LCTL get_param -n \
2710 mdd.$(facet_svc mds1).lfsck_layout |
2711 awk '/^repaired_orphan/ { print $2 }')
2712 [ $repaired -eq 2 ] ||
2713 error "(10) Expect 2 fixed on mds{1}, but got: $repaired"
2715 if [ $MDSCOUNT -ge 2 ]; then
2716 repaired=$(do_facet mds2 $LCTL get_param -n \
2717 mdd.$(facet_svc mds2).lfsck_layout |
2718 awk '/^repaired_orphan/ { print $2 }')
2719 [ $repaired -eq 2 ] ||
2720 error "(11) Expect 2 fixed on mds{2}, but got: $repaired"
2723 run_test 18f "Skip the failed OST(s) when handle orphan OST-objects"
# test_18g: a file striped over all OSTs ("-c -1") is removed while fail_loc
# 0x162e (OBD_FAIL_LFSCK_LOST_MDTOBJ2) is set on mds1. After layout LFSCK
# completes, the test expects repaired_orphan == $OSTCOUNT on mds1 and moves
# ${fid1}-R-0 from .lustre/lost+found/MDT0000 back to its original path.
2727 echo "The target MDT-object is lost, but related OI mapping is there"
2728 echo "The LFSCK should recreate the lost MDT-object without affected"
2729 echo "by the stale OI mapping."
2732 check_mount_and_prep
2733 $LFS mkdir -i 0 $DIR/$tdir/a1
2734 $LFS setstripe -c -1 -i 0 -S 1M $DIR/$tdir/a1
# Write $OSTCOUNT blocks of 1M, matching the 1M stripe size set above.
2735 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=$OSTCOUNT
# The FID is saved because the lost+found entry is named after it (see the mv).
2736 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2738 $LFS getstripe $DIR/$tdir/a1/f1
2739 cancel_lru_locks osc
2741 echo "Inject failure to simulate lost MDT-object but keep OI mapping"
2742 #define OBD_FAIL_LFSCK_LOST_MDTOBJ2 0x162e
2743 do_facet mds1 $LCTL set_param fail_loc=0x162e
2744 rm -f $DIR/$tdir/a1/f1
2746 do_facet mds1 $LCTL set_param fail_loc=0
2747 cancel_lru_locks mdc
2748 cancel_lru_locks osc
2750 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
2751 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
2753 for k in $(seq $MDSCOUNT); do
2754 # The LFSCK status query interval is 30 seconds. For the case
2755 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2756 # time to guarantee the status sync up.
2757 wait_update_facet mds${k} "$LCTL get_param -n \
2758 mdd.$(facet_svc mds${k}).lfsck_layout |
2759 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2760 error "(2) MDS${k} is not the expected 'completed'"
# The OSTs are sampled once (no waiting) and must already report 'completed'.
2763 for k in $(seq $OSTCOUNT); do
2764 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2765 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2766 awk '/^status/ { print $2 }')
2767 [ "$cur_status" == "completed" ] ||
2768 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
2771 local repaired=$(do_facet mds1 $LCTL get_param -n \
2772 mdd.$(facet_svc mds1).lfsck_layout |
2773 awk '/^repaired_orphan/ { print $2 }')
2774 [ $repaired -eq $OSTCOUNT ] ||
2775 error "(4) Expect $OSTCOUNT fixed, but got: $repaired"
2777 echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
2778 mv $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0 $DIR/$tdir/a1/f1 ||
2779 error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
2781 $LFS path2fid $DIR/$tdir/a1/f1
2782 $LFS getstripe $DIR/$tdir/a1/f1
2784 run_test 18g "Find out orphan OST-object and repair it (7)"
# test_18h: a PFL file f0 is written and copied to "guard", then chown'ed while
# fail_loc 0x162f (OBD_FAIL_LFSCK_BAD_PFL_RANGE) is set on the MDS. A write to
# f0 is expected to fail before LFSCK; after layout LFSCK the test expects
# repaired_orphan == 2, f0 identical to guard, and writes to succeed again.
2788 echo "The PFL extent crashed. During the first cycle LFSCK scanning,"
2789 echo "the layout LFSCK will keep the bad PFL file(s) there without"
2790 echo "scanning its OST-object(s). Then in the second stage scanning,"
2791 echo "the OST will return related OST-object(s) to the MDT as orphan."
2792 echo "And then the LFSCK on the MDT can rebuild the PFL extent with"
2793 echo "the 'orphan(s)' stripe information."
2796 check_mount_and_prep
2798 $LFS setstripe -E 2M -c 1 -E -1 $DIR/$tdir/f0 ||
2799 error "(0) Fail to create PFL $DIR/$tdir/f0"
# Write from offset 0, then again at a 2M offset (bs=1M seek=2), i.e. past the
# first component's 2M end.
2801 cat $LUSTRE/tests/test-framework.sh > $DIR/$tdir/f0 ||
2802 error "(1.1) Fail to write $DIR/$tdir/f0"
2804 dd if=$LUSTRE/tests/test-framework.sh of=$DIR/$tdir/f0 bs=1M seek=2 ||
2805 error "(1.2) Fail to write $DIR/$tdir/f0"
# Reference copy used by the diff in check (6).
2807 cp $DIR/$tdir/f0 $DIR/$tdir/guard
2809 echo "Inject failure stub to simulate bad PFL extent range"
2810 #define OBD_FAIL_LFSCK_BAD_PFL_RANGE 0x162f
2811 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x162f
2813 chown 1.1 $DIR/$tdir/f0
2815 cancel_lru_locks mdc
2816 cancel_lru_locks osc
2817 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Inverted check: a successful dd here is the failure condition.
2819 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 &&
2820 error "(2) Write to bad PFL file should fail"
2822 echo "Trigger layout LFSCK to find out the bad lmm_oi and fix them"
2823 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
2825 for k in $(seq $MDSCOUNT); do
2826 # The LFSCK status query interval is 30 seconds. For the case
2827 # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
2828 # time to guarantee the status sync up.
2829 wait_update_facet mds${k} "$LCTL get_param -n \
2830 mdd.$(facet_svc mds${k}).lfsck_layout |
2831 awk '/^status/ { print \\\$2 }'" "completed" $LTIME ||
2832 error "(4.1) MDS${k} is not the expected 'completed'"
2835 for k in $(seq $OSTCOUNT); do
# Declared local (as in the sibling tests) so it does not leak to the caller.
2836 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
2837 obdfilter.$(facet_svc ost${k}).lfsck_layout |
2838 awk '/^status/ { print $2 }')
2839 [ "$cur_status" == "completed" ] ||
2840 error "(4.2) OST${k} Expect 'completed', but got '$cur_status'"
2844 local repaired=$($SHOW_LAYOUT |
2845 awk '/^repaired_orphan/ { print $2 }')
2846 [ $repaired -eq 2 ] ||
2847 error "(5) Fail to repair crashed PFL range: $repaired"
2849 echo "Data in $DIR/$tdir/f0 should not be broken"
2850 diff $DIR/$tdir/f0 $DIR/$tdir/guard ||
2851 error "(6) Data in $DIR/$tdir/f0 is broken"
2853 echo "Write should succeed after LFSCK repairing the bad PFL range"
2854 dd if=/dev/zero of=$DIR/$tdir/f0 bs=1M count=1 ||
2855 error "(7) Write should succeed after LFSCK"
2857 run_test 18h "LFSCK can repair crashed PFL extent range"
# Runs between the test_18h registration and the test_19a body; output is discarded.
# NOTE(review): "debug=-cache" presumably removes the 'cache' flag from the
# debug mask on the node running this script — confirm against lctl docs.
2859 $LCTL set_param debug=-cache > /dev/null
# test_19a: files are written with lfsck_verify_pfid set to 0 on OST0000, then
# lfsck_verify_pfid is set to 1 and fail_loc 0x1619
# (OBD_FAIL_LFSCK_INVALID_PFID) is set through a plain $LCTL call (no
# do_facet/do_nodes wrapper). Reading a0 and a1 is then expected to fail.
2862 check_mount_and_prep
2863 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2865 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2866 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
# a0 uses the directory default layout, a1 is a PFL file, a2 is only written.
2868 echo "foo1" > $DIR/$tdir/a0
2869 $LFS setstripe -E 512K -S 512K -o 0 -E -1 -S 1M $DIR/$tdir/a1 ||
2870 error "(0) Fail to create PFL $DIR/$tdir/a1"
2871 echo "foo2" > $DIR/$tdir/a1
2872 echo "guard" > $DIR/$tdir/a2
2873 cancel_lru_locks osc
2875 echo "Inject failure, then client will offer wrong parent FID when read"
2876 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2877 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2879 #define OBD_FAIL_LFSCK_INVALID_PFID 0x1619
2880 $LCTL set_param fail_loc=0x1619
2882 echo "Read RPC with wrong parent FID should be denied"
# Inverted checks: a successful cat is the failure condition.
2883 cat $DIR/$tdir/a0 && error "(3.1) Read a0 should be denied!"
2884 cat $DIR/$tdir/a1 && error "(3.2) Read a1 should be denied!"
2885 $LCTL set_param fail_loc=0
2887 run_test 19a "OST-object inconsistency self detect"
# test_19b: f0 and f1 are written while fail_loc 0x1611
# (OBD_FAIL_LFSCK_UNMATCHED_PAIR1) is set on the OST nodes and
# lfsck_verify_pfid is 0. The 'repaired' counter read from lfsck_verify_pfid
# must be 0 at that point; after lfsck_verify_pfid is set to 1 and both files
# are read, the counter must be 2.
2890 check_mount_and_prep
2891 $LFS setstripe -c 1 -i 0 $DIR/$tdir
2893 echo "Inject failure stub to make the OST-object to back point to"
2894 echo "non-exist MDT-object"
2896 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2897 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
2899 #define OBD_FAIL_LFSCK_UNMATCHED_PAIR1 0x1611
2900 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0x1611
2901 echo "foo1" > $DIR/$tdir/f0
2902 $LFS setstripe -E 1M -S 1M -o 0 -E 4M -S 256K $DIR/$tdir/f1 ||
2903 error "(0) Fail to create PFL $DIR/$tdir/f1"
2904 echo "foo2" > $DIR/$tdir/f1
2905 cancel_lru_locks osc
2906 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
2908 do_facet ost1 $LCTL set_param -n \
2909 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 0
2910 echo "Nothing should be fixed since self detect and repair is disabled"
2911 local repaired=$(do_facet ost1 $LCTL get_param -n \
2912 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2913 awk '/^repaired/ { print $2 }')
2914 [ $repaired -eq 0 ] ||
2915 error "(1) Expected 0 repaired, but got $repaired"
2917 echo "Read RPC with right parent FID should be accepted,"
2918 echo "and cause parent FID on OST to be fixed"
2920 do_nodes $(comma_list $(osts_nodes)) $LCTL set_param -n \
2921 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid 1
2923 cat $DIR/$tdir/f0 || error "(2.1) Read f0 should not be denied!"
2924 cat $DIR/$tdir/f1 || error "(2.2) Read f1 should not be denied!"
# Two files were read (f0 and f1), so two repairs are expected; the failure
# message states the same number as the check.
2926 repaired=$(do_facet ost1 $LCTL get_param -n \
2927 obdfilter.${FSNAME}-OST0000.lfsck_verify_pfid |
2928 awk '/^repaired/ { print $2 }')
2929 [ $repaired -eq 2 ] ||
2930 error "(3) Expected 2 repaired, but got $repaired"
2932 run_test 19b "OST-object inconsistency self repair"
# File-scope constants; their users are outside this part of the file.
# NOTE(review): presumably the layout pattern values reported for files with
# and without a LOV EA hole (cf. the test_20a description) — confirm at the
# point of use.
2934 PATTERN_WITH_HOLE="40000001"
2935 PATTERN_WITHOUT_HOLE="1"
2938 [ $OSTCOUNT -lt 2 ] &&
2939 skip "The test needs at least 2 OSTs" && return
2942 echo "The target MDT-object and some of its OST-object are lost."
2943 echo "The LFSCK should find out the left OST-objects and re-create"
2944 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
2945 echo "with the partial OST-objects (LOV EA hole)."
2947 echo "New client can access the file with LOV EA hole via normal"
2948 echo "system tools or commands without crash the system."
2950 echo "For old client, even though it cannot access the file with"
2951 echo "LOV EA hole, it should not cause the system crash."
2954 check_mount_and_prep
2955 $LFS mkdir -i 0 $DIR/$tdir/a1
2956 if [ $OSTCOUNT -gt 2 ]; then
2957 $LFS setstripe -c 3 -i 0 -S 1M $DIR/$tdir/a1
2960 $LFS setstripe -c 2 -i 0 -S 1M $DIR/$tdir/a1
2964 # 256 blocks on the stripe0.
2965 # 1 block on the stripe1 for 2 OSTs case.
2966 # 256 blocks on the stripe1 for other cases.
2967 # 1 block on the stripe2 if OSTs > 2
2968 dd if=/dev/zero of=$DIR/$tdir/a1/f0 bs=4096 count=$bcount
2969 dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=4096 count=$bcount
2970 dd if=/dev/zero of=$DIR/$tdir/a1/f2 bs=4096 count=$bcount
2972 local fid0=$($LFS path2fid $DIR/$tdir/a1/f0)
2973 local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
2974 local fid2=$($LFS path2fid $DIR/$tdir/a1/f2)
2977 $LFS getstripe $DIR/$tdir/a1/f0
2979 $LFS getstripe $DIR/$tdir/a1/f1
2981 $LFS getstripe $DIR/$tdir/a1/f2
2983 if [ $OSTCOUNT -gt 2 ]; then
2984 dd if=/dev/zero of=$DIR/$tdir/a1/f3 bs=4096 count=$bcount
2985 fid3=$($LFS path2fid $DIR/$tdir/a1/f3)
2987 $LFS getstripe $DIR/$tdir/a1/f3
2990 cancel_lru_locks osc
2992 echo "Inject failure..."
2993 echo "To simulate f0 lost MDT-object"
2994 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
2995 do_facet mds1 $LCTL set_param fail_loc=0x1616
2996 rm -f $DIR/$tdir/a1/f0
2998 echo "To simulate f1 lost MDT-object and OST-object0"
2999 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
3000 do_facet mds1 $LCTL set_param fail_loc=0x161a
3001 rm -f $DIR/$tdir/a1/f1
3003 echo "To simulate f2 lost MDT-object and OST-object1"
3004 do_facet mds1 $LCTL set_param fail_val=1
3005 rm -f $DIR/$tdir/a1/f2
3007 if [ $OSTCOUNT -gt 2 ]; then
3008 echo "To simulate f3 lost MDT-object and OST-object2"
3009 do_facet mds1 $LCTL set_param fail_val=2
3010 rm -f $DIR/$tdir/a1/f3
3013 umount_client $MOUNT
3016 do_facet mds1 $LCTL set_param fail_loc=0 fail_val=0
3018 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3019 $START_LAYOUT -r -o || error "(1) Fail to start LFSCK for layout!"
3021 for k in $(seq $MDSCOUNT); do
3022 # The LFSCK status query interval is 30 seconds. In case some
3023 # LFSCK_NOTIFY RPCs fail or are lost, wait long enough
3024 # to guarantee that the status is synced up.
3025 wait_update_facet mds${k} "$LCTL get_param -n \
3026 mdd.$(facet_svc mds${k}).lfsck_layout |
3027 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3028 error "(2) MDS${k} is not the expected 'completed'"
3031 for k in $(seq $OSTCOUNT); do
3032 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3033 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3034 awk '/^status/ { print $2 }')
3035 [ "$cur_status" == "completed" ] ||
3036 error "(3) OST${k} Expect 'completed', but got '$cur_status'"
3039 local repaired=$(do_facet mds1 $LCTL get_param -n \
3040 mdd.$(facet_svc mds1).lfsck_layout |
3041 awk '/^repaired_orphan/ { print $2 }')
3042 if [ $OSTCOUNT -gt 2 ]; then
3043 [ $repaired -eq 9 ] ||
3044 error "(4.1) Expect 9 fixed on mds1, but got: $repaired"
3046 [ $repaired -eq 4 ] ||
3047 error "(4.2) Expect 4 fixed on mds1, but got: $repaired"
3050 mount_client $MOUNT || error "(5.0) Fail to start client!"
3052 LOV_PATTERN_F_HOLE=0x40000000
3055 # ${fid0}-R-0 is the old f0
3057 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
3058 echo "Check $name, which is the old f0"
3060 $LFS getstripe -v $name || error "(5.1) cannot getstripe on $name"
3062 local pattern=$($LFS getstripe -L $name)
3063 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3064 error "(5.2) NOT expect pattern flag hole, but got $pattern"
3066 local stripes=$($LFS getstripe -c $name)
3067 if [ $OSTCOUNT -gt 2 ]; then
3068 [ $stripes -eq 3 ] ||
3069 error "(5.3.1) expect the stripe count is 3, but got $stripes"
3071 [ $stripes -eq 2 ] ||
3072 error "(5.3.2) expect the stripe count is 2, but got $stripes"
3075 local size=$(stat $name | awk '/Size:/ { print $2 }')
3076 [ $size -eq $((4096 * $bcount)) ] ||
3077 error "(5.4) expect the size $((4096 * $bcount)), but got $size"
3079 cat $name > /dev/null || error "(5.5) cannot read $name"
3081 echo "dummy" >> $name || error "(5.6) cannot write $name"
3083 chown $RUNAS_ID:$RUNAS_GID $name || error "(5.7) cannot chown on $name"
3085 touch $name || error "(5.8) cannot touch $name"
3087 rm -f $name || error "(5.9) cannot unlink $name"
3090 # ${fid1}-R-0 contains the old f1's stripe1 (and stripe2 if OSTs > 2)
3092 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3093 if [ $OSTCOUNT -gt 2 ]; then
3094 echo "Check $name, it contains the old f1's stripe1 and stripe2"
3096 echo "Check $name, it contains the old f1's stripe1"
3099 $LFS getstripe -v $name || error "(6.1) cannot getstripe on $name"
3101 pattern=$($LFS getstripe -L $name)
3102 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3103 error "(6.2) expect pattern flag hole, but got $pattern"
3105 stripes=$($LFS getstripe -c $name)
3106 if [ $OSTCOUNT -gt 2 ]; then
3107 [ $stripes -eq 3 ] ||
3108 error "(6.3.1) expect the stripe count is 3, but got $stripes"
3110 [ $stripes -eq 2 ] ||
3111 error "(6.3.2) expect the stripe count is 2, but got $stripes"
3114 size=$(stat $name | awk '/Size:/ { print $2 }')
3115 [ $size -eq $((4096 * $bcount)) ] ||
3116 error "(6.4) expect the size $((4096 * $bcount)), but got $size"
3118 cat $name > /dev/null && error "(6.5) normal read $name should fail"
3120 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3121 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3124 [ $failures -eq 256 ] ||
3125 error "(6.6) expect 256 IO failures, but get $failures"
3127 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3128 [ $size -eq $((4096 * $bcount)) ] ||
3129 error "(6.7) expect the size $((4096 * $bcount)), but got $size"
3131 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
3132 error "(6.8) write to the LOV EA hole should fail"
3134 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
3135 error "(6.9) write to normal stripe should NOT fail"
3137 echo "foo" >> $name && error "(6.10) append write $name should fail"
3139 chown $RUNAS_ID:$RUNAS_GID $name || error "(6.11) cannot chown on $name"
3141 touch $name || error "(6.12) cannot touch $name"
3143 rm -f $name || error "(6.13) cannot unlink $name"
3146 # ${fid2}-R-0 it contains the old f2's stripe0 (and stripe2 if OSTs > 2)
3148 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
3149 if [ $OSTCOUNT -gt 2 ]; then
3150 echo "Check $name, it contains the old f2's stripe0 and stripe2"
3152 echo "Check $name, it contains the old f2's stripe0"
3155 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
3157 pattern=$($LFS getstripe -L $name)
3158 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3159 error "(7.2) expect pattern flag hole, but got $pattern"
3161 stripes=$($LFS getstripe -c $name)
3162 size=$(stat $name | awk '/Size:/ { print $2 }')
3163 if [ $OSTCOUNT -gt 2 ]; then
3164 [ $stripes -eq 3 ] ||
3165 error "(7.3.1) expect the stripe count is 3, but got $stripes"
3167 [ $size -eq $((4096 * $bcount)) ] ||
3168 error "(7.4.1) expect size $((4096 * $bcount)), but got $size"
3170 cat $name > /dev/null &&
3171 error "(7.5.1) normal read $name should fail"
3173 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3174 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3176 [ $failures -eq 256 ] ||
3177 error "(7.6) expect 256 IO failures, but get $failures"
3179 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3180 [ $size -eq $((4096 * $bcount)) ] ||
3181 error "(7.7) expect the size $((4096 * $bcount)), but got $size"
3183 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3184 seek=300 && error "(7.8.0) write to the LOV EA hole should fail"
3186 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
3187 error "(7.8.1) write to normal stripe should NOT fail"
3189 echo "foo" >> $name &&
3190 error "(7.8.3) append write $name should fail"
3192 chown $RUNAS_ID:$RUNAS_GID $name ||
3193 error "(7.9.1) cannot chown on $name"
3195 touch $name || error "(7.10.1) cannot touch $name"
3197 [ $stripes -eq 2 ] ||
3198 error "(7.3.2) expect the stripe count is 2, but got $stripes"
3201 [ $size -eq $((4096 * (256 + 0))) ] ||
3202 error "(7.4.2) expect the size $((4096 * 256)), but got $size"
3204 cat $name > /dev/null &&
3205 error "(7.5.2) normal read $name should fail"
3207 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3208 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3209 [ $failures -eq 256 ] ||
3210 error "(7.6.2) expect 256 IO failures, but get $failures"
3213 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3214 [ $size -eq $((4096 * $bcount)) ] ||
3215 error "(7.7.2) expect the size $((4096 * $bcount)), got $size"
3217 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3218 seek=256 && error "(7.8.2) write to the LOV EA hole should fail"
3220 chown $RUNAS_ID:$RUNAS_GID $name ||
3221 error "(7.9.2) cannot chown on $name"
3223 touch $name || error "(7.10.2) cannot touch $name"
3226 rm -f $name || error "(7.11) cannot unlink $name"
3228 [ $OSTCOUNT -le 2 ] && return
3231 # ${fid3}-R-0 should contains the old f3's stripe0 and stripe1
3233 name="$MOUNT/.lustre/lost+found/MDT0000/${fid3}-R-0"
3234 echo "Check $name, which contains the old f3's stripe0 and stripe1"
3236 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
3238 pattern=$($LFS getstripe -L $name)
3239 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3240 error "(8.2) expect pattern flag hole, but got $pattern"
3242 stripes=$($LFS getstripe -c $name)
3243 [ $stripes -eq 3 ] ||
3244 error "(8.3) expect the stripe count is 3, but got $stripes"
3246 size=$(stat $name | awk '/Size:/ { print $2 }')
3248 [ $size -eq $((4096 * (256 + 256 + 0))) ] ||
3249 error "(8.4) expect the size $((4096 * 512)), but got $size"
3251 cat $name > /dev/null &&
3252 error "(8.5) normal read $name should fail"
3254 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3255 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3257 [ $failures -eq 256 ] ||
3258 error "(8.6) expect 256 IO failures, but get $failures"
3261 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3262 [ $size -eq $((4096 * $bcount)) ] ||
3263 error "(8.7) expect the size $((4096 * $bcount)), but got $size"
3265 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3266 seek=512 && error "(8.8) write to the LOV EA hole should fail"
3268 chown $RUNAS_ID:$RUNAS_GID $name ||
3269 error "(8.9) cannot chown on $name"
3271 touch $name || error "(8.10) cannot touch $name"
3273 rm -f $name || error "(8.11) cannot unlink $name"
3275 run_test 20a "Handle the orphan with dummy LOV EA slot properly"
3278 [ $OSTCOUNT -lt 2 ] &&
3279 skip "The test needs at least 2 OSTs" && return
3282 echo "The target MDT-object and some of its OST-object are lost."
3283 echo "The LFSCK should find out the left OST-objects and re-create"
3284 echo "the MDT-object under the direcotry .lustre/lost+found/MDTxxxx/"
3285 echo "with the partial OST-objects (LOV EA hole)."
3287 echo "New client can access the file with LOV EA hole via normal"
3288 echo "system tools or commands without crash the system - PFL case."
3291 check_mount_and_prep
3293 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f0 ||
3294 error "(0) Fail to create PFL file $DIR/$tdir/f0"
3295 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f1 ||
3296 error "(1) Fail to create PFL file $DIR/$tdir/f1"
3297 $LFS setstripe -E 2M -S 1M -c 2 -E -1 -S 1M -c 2 $DIR/$tdir/f2 ||
3298 error "(2) Fail to create PFL file $DIR/$tdir/f2"
3300 local bcount=$((256 * 3 + 1))
3302 dd if=/dev/zero of=$DIR/$tdir/f0 bs=4096 count=$bcount
3303 dd if=/dev/zero of=$DIR/$tdir/f1 bs=4096 count=$bcount
3304 dd if=/dev/zero of=$DIR/$tdir/f2 bs=4096 count=$bcount
3306 local fid0=$($LFS path2fid $DIR/$tdir/f0)
3307 local fid1=$($LFS path2fid $DIR/$tdir/f1)
3308 local fid2=$($LFS path2fid $DIR/$tdir/f2)
3311 $LFS getstripe $DIR/$tdir/f0
3313 $LFS getstripe $DIR/$tdir/f1
3315 $LFS getstripe $DIR/$tdir/f2
3317 cancel_lru_locks mdc
3318 cancel_lru_locks osc
3320 echo "Inject failure..."
3321 echo "To simulate f0 lost MDT-object"
3322 #define OBD_FAIL_LFSCK_LOST_MDTOBJ 0x1616
3323 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1616
3326 echo "To simulate the case of f1 lost MDT-object and "
3327 echo "the first OST-object in each PFL component"
3328 #define OBD_FAIL_LFSCK_LOST_SPEOBJ 0x161a
3329 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161a
3332 echo "To simulate the case of f2 lost MDT-object and "
3333 echo "the second OST-object in each PFL component"
3334 do_facet $SINGLEMDS $LCTL set_param fail_val=1
3339 do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0
3341 echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
3342 $START_LAYOUT -r -o || error "(3) Fail to start LFSCK for layout!"
3344 for k in $(seq $MDSCOUNT); do
3345 # The LFSCK status query interval is 30 seconds. In case some
3346 # LFSCK_NOTIFY RPCs fail or are lost, wait long enough
3347 # to guarantee that the status is synced up.
3348 wait_update_facet mds${k} "$LCTL get_param -n \
3349 mdd.$(facet_svc mds${k}).lfsck_layout |
3350 awk '/^status/ { print \\\$2 }'" "completed" 32 ||
3351 error "(4) MDS${k} is not the expected 'completed'"
3354 for k in $(seq $OSTCOUNT); do
3355 local cur_status=$(do_facet ost${k} $LCTL get_param -n \
3356 obdfilter.$(facet_svc ost${k}).lfsck_layout |
3357 awk '/^status/ { print $2 }')
3358 [ "$cur_status" == "completed" ] ||
3359 error "(5) OST${k} Expect 'completed', but got '$cur_status'"
3362 local repaired=$(do_facet mds1 $LCTL get_param -n \
3363 mdd.$(facet_svc mds1).lfsck_layout |
3364 awk '/^repaired_orphan/ { print $2 }')
3365 [ $repaired -eq 8 ] ||
3366 error "(6) Expect 8 fixed on mds1, but got: $repaired"
3369 # ${fid0}-R-0 is the old f0
3371 local name="$MOUNT/.lustre/lost+found/MDT0000/${fid0}-R-0"
3372 echo "Check $name, which is the old f0"
3374 $LFS getstripe -v $name || error "(7.1) cannot getstripe on $name"
3376 local pattern=$($LFS getstripe -L -I1 $name)
3377 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3378 error "(7.2.1) NOT expect pattern flag hole, but got $pattern"
3380 pattern=$($LFS getstripe -L -I2 $name)
3381 [[ "$pattern" = "$PATTERN_WITHOUT_HOLE" ]] ||
3382 error "(7.2.2) NOT expect pattern flag hole, but got $pattern"
3384 local stripes=$($LFS getstripe -c -I1 $name)
3385 [ $stripes -eq 2 ] ||
3386 error "(7.3.1) expect 2 stripes, but got $stripes"
3388 stripes=$($LFS getstripe -c -I2 $name)
3389 [ $stripes -eq 2 ] ||
3390 error "(7.3.2) expect 2 stripes, but got $stripes"
3392 local e_start=$($LFS getstripe -I1 $name |
3393 awk '/lcme_extent.e_start:/ { print $2 }')
3394 [ $e_start -eq 0 ] ||
3395 error "(7.4.1) expect the COMP1 start at 0, got $e_start"
3397 local e_end=$($LFS getstripe -I1 $name |
3398 awk '/lcme_extent.e_end:/ { print $2 }')
3399 [ $e_end -eq 2097152 ] ||
3400 error "(7.4.2) expect the COMP1 end at 2097152, got $e_end"
3402 e_start=$($LFS getstripe -I2 $name |
3403 awk '/lcme_extent.e_start:/ { print $2 }')
3404 [ $e_start -eq 2097152 ] ||
3405 error "(7.5.1) expect the COMP2 start at 2097152, got $e_start"
3407 e_end=$($LFS getstripe -I2 $name |
3408 awk '/lcme_extent.e_end:/ { print $2 }')
3409 [ "$e_end" = "EOF" ] ||
3410 error "(7.5.2) expect the COMP2 end at (EOF), got $e_end"
3412 local size=$(stat $name | awk '/Size:/ { print $2 }')
3413 [ $size -eq $((4096 * $bcount)) ] ||
3414 error "(7.6) expect the size $((4096 * $bcount)), but got $size"
3416 cat $name > /dev/null || error "(7.7) cannot read $name"
3418 echo "dummy" >> $name || error "(7.8) cannot write $name"
3420 chown $RUNAS_ID:$RUNAS_GID $name || error "(7.9) cannot chown on $name"
3422 touch $name || error "(7.10) cannot touch $name"
3424 rm -f $name || error "(7.11) cannot unlink $name"
3427 # ${fid1}-R-0 contains the old f1's second stripe in each COMP
3429 name="$MOUNT/.lustre/lost+found/MDT0000/${fid1}-R-0"
3430 echo "Check $name, it contains f1's second OST-object in each COMP"
3432 $LFS getstripe -v $name || error "(8.1) cannot getstripe on $name"
3434 pattern=$($LFS getstripe -L -I1 $name)
3435 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3436 error "(8.2.1) expect pattern flag hole, but got $pattern"
3438 pattern=$($LFS getstripe -L -I2 $name)
3439 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3440 error "(8.2.2) expect pattern flag hole, but got $pattern"
# Stripe count of component 1 (-I1) of the recovered f1. Tagged (8.3.1) so a
# failure here can be told apart from the component 2 check tagged (8.3.2).
3442 stripes=$($LFS getstripe -c -I1 $name)
3443 [ $stripes -eq 2 ] ||
3444 error "(8.3.1) expect 2 stripes, but got $stripes"
3446 stripes=$($LFS getstripe -c -I2 $name)
3447 [ $stripes -eq 2 ] ||
3448 error "(8.3.2) expect 2 stripes, but got $stripes"
3450 e_start=$($LFS getstripe -I1 $name |
3451 awk '/lcme_extent.e_start:/ { print $2 }')
3452 [ $e_start -eq 0 ] ||
3453 error "(8.4.1) expect the COMP1 start at 0, got $e_start"
3455 e_end=$($LFS getstripe -I1 $name |
3456 awk '/lcme_extent.e_end:/ { print $2 }')
3457 [ $e_end -eq 2097152 ] ||
3458 error "(8.4.2) expect the COMP1 end at 2097152, got $e_end"
3460 e_start=$($LFS getstripe -I2 $name |
3461 awk '/lcme_extent.e_start:/ { print $2 }')
3462 [ $e_start -eq 2097152 ] ||
3463 error "(8.5.1) expect the COMP2 start at 2097152, got $e_start"
3465 e_end=$($LFS getstripe -I2 $name |
3466 awk '/lcme_extent.e_end:/ { print $2 }')
3467 [ "$e_end" = "EOF" ] ||
3468 error "(8.5.2) expect the COMP2 end at (EOF), got $e_end"
3470 size=$(stat $name | awk '/Size:/ { print $2 }')
3471 [ $size -eq $((4096 * $bcount)) ] ||
3472 error "(8.6) expect the size $((4096 * $bcount)), but got $size"
3474 cat $name > /dev/null && error "(8.7) normal read $name should fail"
3476 local failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3477 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3479 # The first stripe in each COMP was lost
3480 [ $failures -eq 512 ] ||
3481 error "(8.8) expect 512 IO failures, but get $failures"
3483 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3484 [ $size -eq $((4096 * $bcount)) ] ||
3485 error "(8.9) expect the size $((4096 * $bcount)), but got $size"
3487 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 &&
3488 error "(8.10) write to the LOV EA hole should fail"
3490 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 seek=300 ||
3491 error "(8.11) write to normal stripe should NOT fail"
3493 echo "foo" >> $name && error "(8.12) append write $name should fail"
3495 chown $RUNAS_ID:$RUNAS_GID $name || error "(8.13) cannot chown on $name"
3497 touch $name || error "(8.14) cannot touch $name"
3499 rm -f $name || error "(8.15) cannot unlink $name"
3502 # ${fid2}-R-0 contains the old f2's first stripe in each COMP
3504 name="$MOUNT/.lustre/lost+found/MDT0000/${fid2}-R-0"
3505 echo "Check $name, it contains f2's first stripe in each COMP"
3507 $LFS getstripe -v $name || error "(9.1) cannot getstripe on $name"
3509 pattern=$($LFS getstripe -L -I1 $name)
3510 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3511 error "(9.2.1) expect pattern flag hole, but got $pattern"
3513 pattern=$($LFS getstripe -L -I2 $name)
3514 [[ "$pattern" = "$PATTERN_WITH_HOLE" ]] ||
3515 error "(9.2.2) expect pattern flag hole, but got $pattern"
# Stripe count of component 1 (-I1) of the recovered f2. Tagged (9.3.1) so a
# failure here can be told apart from the component 2 check tagged (9.3.2).
3517 stripes=$($LFS getstripe -c -I1 $name)
3518 [ $stripes -eq 2 ] ||
3519 error "(9.3.1) expect 2 stripes, but got $stripes"
3521 stripes=$($LFS getstripe -c -I2 $name)
3522 [ $stripes -eq 2 ] ||
3523 error "(9.3.2) expect 2 stripes, but got $stripes"
3525 e_start=$($LFS getstripe -I1 $name |
3526 awk '/lcme_extent.e_start:/ { print $2 }')
3527 [ $e_start -eq 0 ] ||
3528 error "(9.4.1) expect the COMP1 start at 0, got $e_start"
3530 e_end=$($LFS getstripe -I1 $name |
3531 awk '/lcme_extent.e_end:/ { print $2 }')
3532 [ $e_end -eq 2097152 ] ||
3533 error "(9.4.2) expect the COMP1 end at 2097152, got $e_end"
3535 e_start=$($LFS getstripe -I2 $name |
3536 awk '/lcme_extent.e_start:/ { print $2 }')
3537 [ $e_start -eq 2097152 ] ||
3538 error "(9.5.1) expect the COMP2 start at 2097152, got $e_start"
3540 e_end=$($LFS getstripe -I2 $name |
3541 awk '/lcme_extent.e_end:/ { print $2 }')
3542 [ "$e_end" = "EOF" ] ||
3543 error "(9.5.2) expect the COMP2 end at (EOF), got $e_end"
3545 size=$(stat $name | awk '/Size:/ { print $2 }')
3546 # The second stripe in COMP was lost, so we do not know there
3547 # have ever been some data before. 'stat' will regard it as
3548 # no data on the lost stripe.
3550 [ $size -eq $((4096 * $bcount)) ] ||
3551 error "(9.6) expect size $((4096 * $bcount)), but got $size"
3553 cat $name > /dev/null &&
3554 error "(9.7) normal read $name should fail"
# Count the "Input/output error" lines dd reports while reading the recovered
# f2 block by block (conv=noerror keeps dd going after a failed read).
# The check expects 512 failures, so the error text reports 512 too
# (it used to say 256, contradicting the test).
3556 failures=$(dd if=$name of=$DIR/$tdir/dump conv=sync,noerror \
3557 bs=4096 2>&1 | grep "Input/output error" | wc -l)
3558 [ $failures -eq 512 ] ||
3559 error "(9.8) expect 512 IO failures, but get $failures"
3561 size=$(stat $DIR/$tdir/dump | awk '/Size:/ { print $2 }')
3562 # The second stripe in COMP was lost, so we do not know there
3563 # have ever been some data before. Since 'dd' skip failure,
3564 # it will regard the lost stripe contains data.
3566 [ $size -eq $((4096 * $bcount)) ] ||
3567 error "(9.9) expect the size $((4096 * $bcount)), but got $size"
3569 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 \
3570 seek=300 && error "(9.10) write to the LOV EA hole should fail"
3572 dd if=/dev/zero of=$name conv=sync,notrunc bs=4096 count=1 ||
3573 error "(9.11) write to normal stripe should NOT fail"
3575 echo "foo" >> $name &&
3576 error "(9.12) append write $name should fail"
3578 chown $RUNAS_ID:$RUNAS_GID $name ||
3579 error "(9.13) cannot chown on $name"
3581 touch $name || error "(9.14) cannot touch $name"
# Final cleanup of the recovered f2; tagged (9.15) to continue the 9.x series
# used for every other check on this file (it was mislabelled 7.15).
3583 rm -f $name || error "(9.15) cannot unlink $name"
3585 run_test 20b "Handle the orphan with dummy LOV EA slot properly - PFL case"
# Test 21: start LFSCK on MDT0000 without selecting a component type and
# check that both the namespace and the layout components report
# 'scanning-phase1', then stop LFSCK again.
# Skipped when the MDS version is older than 2.5.59.
3588 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.5.59) ]] &&
3589 skip "ignore the test if MDS is older than 2.5.59" && return
3591 check_mount_and_prep
3592 createmany -o $DIR/$tdir/f 100 || error "(0) Fail to create 100 files"
3594 echo "Start all LFSCK components by default (-s 1)"
# NOTE(review): "-s 1" presumably limits the scan speed so that LFSCK is
# still in phase 1 when the status is sampled below -- confirm.
3595 do_facet mds1 $LCTL lfsck_start -M ${FSNAME}-MDT0000 -s 1 -r ||
3596 error "Fail to start LFSCK"
3598 echo "namespace LFSCK should be in 'scanning-phase1' status"
# The second field of the "status" line is the current state.
3599 local STATUS=$($SHOW_NAMESPACE | awk '/^status/ { print $2 }')
3600 [ "$STATUS" == "scanning-phase1" ] ||
3601 error "Expect namespace 'scanning-phase1', but got '$STATUS'"
3603 echo "layout LFSCK should be in 'scanning-phase1' status"
3604 STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
3605 [ "$STATUS" == "scanning-phase1" ] ||
3606 error "Expect layout 'scanning-phase1', but got '$STATUS'"
3608 echo "Stop all LFSCK components by default"
# No -t option is given here either, mirroring the start command above.
3609 do_facet mds1 $LCTL lfsck_stop -M ${FSNAME}-MDT0000 ||
3610 error "Fail to stop LFSCK"
3612 run_test 21 "run all LFSCK components by default"
# Test 22a: a directory whose ".." entry points at a parent that no longer
# exists. "guard" and "foo" are created on MDT1, "foo/dummy" is created on
# MDT0 while fail_loc 0x161e is set, then "guard" is removed. Namespace
# LFSCK is expected to report exactly one unmatched pair repaired, after
# which listing foo/dummy must succeed. Needs at least 2 MDSes.
3615 [ $MDSCOUNT -lt 2 ] &&
3616 skip "We need at least 2 MDSes for this test" && return
3619 echo "The parent_A references the child directory via some name entry,"
3620 echo "but the child directory back references another parent_B via its"
# The inner quotes around .. are escaped so that they are really printed;
# unescaped they just ended and reopened the string and were dropped.
3621 echo "\"..\" name entry. The parent_B does not exist. Then the namespace"
3622 echo "LFSCK will repair the child directory's \"..\" name entry."
3625 check_mount_and_prep
3627 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
3628 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
3630 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
3631 echo "The dummy's dotdot name entry references the guard."
3632 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
3633 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
3634 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
3635 error "(3) Fail to mkdir on MDT0"
# Clear the injection before any further metadata operation.
3636 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
3638 rmdir $DIR/$tdir/guard || error "(4) Fail to rmdir $DIR/$tdir/guard"
3640 echo "Trigger namespace LFSCK to repair unmatched pairs"
3641 $START_NAMESPACE -A -r ||
3642 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): the trailing 6 looks like the error tag used by the helper,
# which would explain why the tags below continue at (7) -- confirm.
3644 wait_all_targets_blocked namespace completed 6
3646 local repaired=$($SHOW_NAMESPACE |
3647 awk '/^unmatched_pairs_repaired/ { print $2 }')
3648 [ $repaired -eq 1 ] ||
3649 error "(7) Fail to repair unmatched pairs: $repaired"
3651 echo "'ls' should success after namespace LFSCK repairing"
3652 ls -ail $DIR/$tdir/foo/dummy > /dev/null ||
3653 error "(8) ls should success."
3655 run_test 22a "LFSCK can repair unmatched pairs (1)"
# Test 22b: a directory whose ".." entry and linkEA are both wrong while the
# referenced parent still exists. "guard" and "foo" are created on MDT1 and
# "foo/dummy" on MDT0 with fail_loc 0x161e set. Before LFSCK, fid2path on
# dummy's FID must not resolve to foo/dummy; after namespace LFSCK reports
# one unmatched pair repaired it must. Needs at least 2 MDSes.
3658 [ $MDSCOUNT -lt 2 ] &&
3659 skip "We need at least 2 MDSes for this test" && return
3662 echo "The parent_A references the child directory via the name entry_B,"
3663 echo "but the child directory back references another parent_C via its"
# The inner quotes around .. are escaped so that they are really printed;
# unescaped they just ended and reopened the string and were dropped.
3664 echo "\"..\" name entry. The parent_C exists, but there is no the name"
3665 echo "entry_B under the parent_C. Then the namespace LFSCK will repair"
3666 echo "the child directory's \"..\" name entry and its linkEA."
3669 check_mount_and_prep
3671 $LFS mkdir -i 1 $DIR/$tdir/guard || error "(1) Fail to mkdir on MDT1"
3672 $LFS mkdir -i 1 $DIR/$tdir/foo || error "(2) Fail to mkdir on MDT1"
3674 echo "Inject failure stub on MDT0 to simulate bad dotdot name entry"
3675 echo "and bad linkEA. The dummy's dotdot name entry references the"
3676 echo "guard. The dummy's linkEA references n non-exist name entry."
3677 #define OBD_FAIL_LFSCK_BAD_PARENT 0x161e
3678 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161e
3679 $LFS mkdir -i 0 $DIR/$tdir/foo/dummy ||
3680 error "(3) Fail to mkdir on MDT0"
3681 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Remember the FID now; it is resolved again after the repair.
3683 local dummyfid=$($LFS path2fid $DIR/$tdir/foo/dummy)
3684 echo "fid2path should NOT work on the dummy's FID $dummyfid"
3685 local dummyname=$($LFS fid2path $DIR $dummyfid)
3686 [ "$dummyname" != "$DIR/$tdir/foo/dummy" ] ||
3687 error "(4) fid2path works unexpectedly."
3689 echo "Trigger namespace LFSCK to repair unmatched pairs"
3690 $START_NAMESPACE -A -r ||
3691 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): the trailing 6 looks like the error tag used by the helper,
# which would explain why the tags below continue at (7) -- confirm.
3693 wait_all_targets_blocked namespace completed 6
3695 local repaired=$($SHOW_NAMESPACE |
3696 awk '/^unmatched_pairs_repaired/ { print $2 }')
3697 [ $repaired -eq 1 ] ||
3698 error "(7) Fail to repair unmatched pairs: $repaired"
3700 echo "fid2path should work on the dummy's FID $dummyfid after LFSCK"
# dummyname is already declared local above, so it is only reassigned here.
3701 dummyname=$($LFS fid2path $DIR $dummyfid)
3702 [ "$dummyname" == "$DIR/$tdir/foo/dummy" ] ||
3703 error "(8) fid2path does not work"
3705 run_test 22b "LFSCK can repair unmatched pairs (2)"
# Test 23a: dangling name entry across MDTs. d0 lives on MDT0 and d0/d1 on
# MDT1; d1 is removed while fail_loc 0x1620 is set on mds2, after which
# 'ls' on d0/d1 is expected to fail. A first namespace LFSCK run must count
# one dangling entry while 'ls' still fails; a second run with -C must
# count one again, and then 'ls' must succeed. Needs at least 2 MDSes.
3708 [ $MDSCOUNT -lt 2 ] &&
3709 skip "We need at least 2 MDSes for this test" && return
3712 echo "The name entry is there, but the MDT-object for such name "
3713 echo "entry does not exist. The namespace LFSCK should find out "
3714 echo "and repair the inconsistency as required."
3717 check_mount_and_prep
3719 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3720 $LFS mkdir -i 1 $DIR/$tdir/d0/d1 || error "(2) Fail to mkdir d1 on MDT1"
3722 echo "Inject failure stub on MDT1 to simulate dangling name entry"
3723 #define OBD_FAIL_LFSCK_DANGLING2 0x1620
3724 do_facet mds2 $LCTL set_param fail_loc=0x1620
3725 rmdir $DIR/$tdir/d0/d1 || error "(3) Fail to rmdir d1"
# Clear the injection once the rmdir has gone through.
3726 do_facet mds2 $LCTL set_param fail_loc=0
3728 echo "'ls' should fail because of dangling name entry"
3729 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(4) ls should fail."
3731 echo "Trigger namespace LFSCK to find out dangling name entry"
3732 $START_NAMESPACE -A -r ||
3733 error "(5) Fail to start LFSCK for namespace"
# NOTE(review): the trailing number looks like the error tag used by the
# helper, matching the gap in the tags around it -- confirm.
3735 wait_all_targets_blocked namespace completed 6
3737 local repaired=$($SHOW_NAMESPACE |
3738 awk '/^dangling_repaired/ { print $2 }')
3739 [ $repaired -eq 1 ] ||
3740 error "(7) Fail to repair dangling name entry: $repaired"
3742 echo "'ls' should fail because not re-create MDT-object by default"
3743 ls -ail $DIR/$tdir/d0/d1 > /dev/null 2>&1 && error "(8) ls should fail."
3745 echo "Trigger namespace LFSCK again to repair dangling name entry"
# Second pass adds -C; only after this run is 'ls' expected to succeed.
3746 $START_NAMESPACE -A -r -C ||
3747 error "(9) Fail to start LFSCK for namespace"
3749 wait_all_targets_blocked namespace completed 10
3751 repaired=$($SHOW_NAMESPACE |
3752 awk '/^dangling_repaired/ { print $2 }')
3753 [ $repaired -eq 1 ] ||
3754 error "(11) Fail to repair dangling name entry: $repaired"
3756 echo "'ls' should success after namespace LFSCK repairing"
3757 ls -ail $DIR/$tdir/d0/d1 > /dev/null || error "(12) ls should success."
3759 run_test 23a "LFSCK can repair dangling name entry (1)"
# Test 23b: dangling name entry that really belongs to a multi-linked file.
# f0 ("dummy") and f1 ("dead") are created in d0; the hard link "foo" to f0
# is created while fail_loc 0x1621 is set with fail_val = f1's OID, and f1
# is then unlinked. Namespace LFSCK with -C must report one
# dangling_repaired and one multiple_linked_repaired, and reading "foo"
# must then return f0's content "dummy".
3763 echo "The objectA has multiple hard links, one of them corresponding"
3764 echo "to the name entry_B. But there is something wrong for the name"
3765 echo "entry_B and cause entry_B to references non-exist object_C."
3766 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3767 echo "as dangling, and re-create the lost object_C. When the LFSCK"
3768 echo "comes to the second-stage scanning, it will find that the"
3769 echo "former re-creating object_C is not proper, and will try to"
3770 echo "replace the object_C with the real object_A."
3773 check_mount_and_prep
3775 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
# The bare path2fid calls below only print FIDs into the test log.
3776 $LFS path2fid $DIR/$tdir/d0
3778 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
3780 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3781 $LFS path2fid $DIR/$tdir/d0/f0
3783 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
3784 $LFS path2fid $DIR/$tdir/d0/f1
# A FID is printed as colon-separated fields; field 1 is taken as the
# sequence here and field 2 as the object ID further down.
3786 local SEQ0=$($LFS path2fid $DIR/$tdir/d0/f0 | awk -F':' '{print $1}')
3787 local SEQ1=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $1}')
3789 if [ "$SEQ0" != "$SEQ1" ]; then
3790 # To guarantee that the f0 and f1 are in the same FID seq
3791 rm -f $DIR/$tdir/d0/f0 ||
3792 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
3793 echo "dummy" > $DIR/$tdir/d0/f0 ||
3794 error "(3.2) Fail to touch on MDT0"
3795 $LFS path2fid $DIR/$tdir/d0/f0
3798 local OID=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $2}')
# printf %d rewrites the OID as a plain decimal number before it is
# handed to fail_val.
3799 OID=$(printf %d $OID)
3801 echo "Inject failure stub on MDT0 to simulate dangling name entry"
3802 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
3803 do_facet $SINGLEMDS $LCTL set_param fail_val=$OID fail_loc=0x1621
3804 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
3805 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3807 # If there is creation after the dangling injection, it may re-use
3808 # the just released local object (inode) that is referenced by the
3809 # dangling name entry. It will fail the dangling injection.
3810 # So before deleting the target object for the dangling name entry,
3811 # remove some other objects to avoid the target object being reused
3812 # by some potential creations. LU-7429
3813 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
3815 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
3817 echo "'ls' should fail because of dangling name entry"
3818 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3819 error "(6) ls should fail."
3821 echo "Trigger namespace LFSCK to find out dangling name entry"
3822 $START_NAMESPACE -r -C ||
3823 error "(7) Fail to start LFSCK for namespace"
# Poll the namespace LFSCK status on the MDS until it reads "completed".
# The \\\$2 escaping keeps $2 literal for awk through the quoting layers.
3825 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3826 mdd.${MDT_DEV}.lfsck_namespace |
3827 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3829 error "(8) unexpected status"
3832 local repaired=$($SHOW_NAMESPACE |
3833 awk '/^dangling_repaired/ { print $2 }')
3834 [ $repaired -eq 1 ] ||
3835 error "(9) Fail to repair dangling name entry: $repaired"
3837 repaired=$($SHOW_NAMESPACE |
3838 awk '/^multiple_linked_repaired/ { print $2 }')
3839 [ $repaired -eq 1 ] ||
3840 error "(10) Fail to drop the former created object: $repaired"
# "foo" must now resolve to f0, whose content was written as "dummy" above.
3842 local data=$(cat $DIR/$tdir/d0/foo)
3843 [ "$data" == "dummy" ] ||
3844 error "(11) The $DIR/$tdir/d0/foo is not recovered: $data"
3846 run_test 23b "LFSCK can repair dangling name entry (2)"
# Test 23c (registered by the run_test call that ends this block).
# Flow exercised below: inject a dangling name entry "foo" on MDT0, start
# the namespace LFSCK with a delay fail_loc armed, append to foo once stat
# reports size 0, then expect dangling_repaired == 1 and foo's content to
# differ from "dummy".
# NOTE(review): the function header/closing brace and the closers of the
# `if` and `|| {` constructs are not visible in this listing - confirm
# against the complete file.
3850 echo "The objectA has multiple hard links, one of them corresponding"
3851 echo "to the name entry_B. But there is something wrong for the name"
3852 echo "entry_B and cause entry_B to references non-exist object_C."
3853 echo "In the first-stage scanning, the LFSCK will think the entry_B"
3854 echo "as dangling, and re-create the lost object_C. And then others"
3855 echo "modified the re-created object_C. When the LFSCK comes to the"
3856 echo "second-stage scanning, it will find that the former re-creating"
3857 echo "object_C maybe wrong and try to replace the object_C with the"
3858 echo "real object_A. But because object_C has been modified, so the"
3859 echo "LFSCK cannot replace it."
3862 start_full_debug_logging
3864 check_mount_and_prep
3866 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0 on MDT0"
3867 $LFS path2fid $DIR/$tdir/d0
# Extra files under d0; they are unlinked again before f1 is removed
# (see the LU-7429 note further down).
3869 createmany -o $DIR/$tdir/d0/t 10 || error "(1.5) Fail to creatmany"
3871 echo "dummy" > $DIR/$tdir/d0/f0 || error "(2) Fail to touch on MDT0"
3872 $LFS path2fid $DIR/$tdir/d0/f0
3874 echo "dead" > $DIR/$tdir/d0/f1 || error "(3) Fail to touch on MDT0"
3875 $LFS path2fid $DIR/$tdir/d0/f1
# First ':'-separated field of each FID; f0 is re-created when they differ.
3877 local SEQ0=$($LFS path2fid $DIR/$tdir/d0/f0 | awk -F':' '{print $1}')
3878 local SEQ1=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $1}')
3880 if [ "$SEQ0" != "$SEQ1" ]; then
3881 # To guarantee that the f0 and f1 are in the same FID seq
3882 rm -f $DIR/$tdir/d0/f0 ||
3883 error "(3.1) Fail to unlink $DIR/$tdir/d0/f0"
3884 echo "dummy" > $DIR/$tdir/d0/f0 ||
3885 error "(3.2) Fail to touch on MDT0"
3886 $LFS path2fid $DIR/$tdir/d0/f0
# Second ':'-separated field of f1's FID, converted to decimal by printf;
# it is handed to the MDS as fail_val for the injection below.
3889 local OID=$($LFS path2fid $DIR/$tdir/d0/f1 | awk -F':' '{print $2}')
3890 OID=$(printf %d $OID)
3892 echo "Inject failure stub on MDT0 to simulate dangling name entry"
3893 #define OBD_FAIL_LFSCK_DANGLING3 0x1621
3894 do_facet $SINGLEMDS $LCTL set_param fail_val=$OID fail_loc=0x1621
3895 ln $DIR/$tdir/d0/f0 $DIR/$tdir/d0/foo || error "(4) Fail to hard link"
3896 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3898 # If there is creation after the dangling injection, it may re-use
3899 # the just released local object (inode) that is referenced by the
3900 # dangling name entry. It will fail the dangling injection.
3901 # So before deleting the target object for the dangling name entry,
3902 # remove some other objects to avoid the target object being reused
3903 # by some potential creations. LU-7429
3904 unlinkmany $DIR/$tdir/d0/t 10 || error "(5.0) Fail to unlinkmany"
3906 rm -f $DIR/$tdir/d0/f1 || error "(5) Fail to unlink $DIR/$tdir/d0/f1"
3908 echo "'ls' should fail because of dangling name entry"
3909 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
3910 error "(6) ls should fail."
# Arm the LFSCK delay fail_loc (fail_val=10) before the scan is started.
3912 #define OBD_FAIL_LFSCK_DELAY3 0x1602
3913 do_facet $SINGLEMDS $LCTL set_param fail_val=10 fail_loc=0x1602
3915 echo "Trigger namespace LFSCK to find out dangling name entry"
3916 $START_NAMESPACE -r -C ||
3917 error "(7) Fail to start LFSCK for namespace"
# Poll until stat reports "0" in the Size field of foo; $LTIME is the
# wait bound passed to wait_update_facet.
3919 wait_update_facet client "stat $DIR/$tdir/d0/foo |
3920 awk '/Size/ { print \\\$2 }'" "0" $LTIME || {
3921 stat $DIR/$tdir/d0/foo
3923 error "(8) unexpected size"
# Modify foo while the delay is still armed, then drop the client's osc
# locks.
3926 echo "data" >> $DIR/$tdir/d0/foo || error "(9) Fail to write"
3927 cancel_lru_locks osc
# Clear the delay and wait for the namespace LFSCK status "completed".
3929 do_facet $SINGLEMDS $LCTL set_param fail_val=0 fail_loc=0
3930 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
3931 mdd.${MDT_DEV}.lfsck_namespace |
3932 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
3934 error "(10) unexpected status"
3937 stop_full_debug_logging
# Exactly one dangling entry must have been repaired ...
3939 local repaired=$($SHOW_NAMESPACE |
3940 awk '/^dangling_repaired/ { print $2 }')
3941 [ $repaired -eq 1 ] ||
3942 error "(11) Fail to repair dangling name entry: $repaired"
# ... but foo must not read back as f0's original content "dummy".
3944 local data=$(cat $DIR/$tdir/d0/foo)
3945 [ "$data" != "dummy" ] ||
3946 error "(12) The $DIR/$tdir/d0/foo should not be recovered"
3948 run_test 23c "LFSCK can repair dangling name entry (3)"
# Test 24 (registered by the run_test call that ends this block).
# Skipped when fewer than 2 MDSes are configured. It creates a directory
# object whose name entry is removed while fail_loc 0x1622 is set, runs the
# namespace LFSCK on all targets, and expects one
# multiple_referenced_repaired plus an orphan entry under
# .lustre/lost+found/MDT0000/.
3951 [ $MDSCOUNT -lt 2 ] &&
3952 skip "We need at least 2 MDSes for this test" && return
3955 echo "Two MDT-objects back reference the same name entry via their"
3956 echo "each own linkEA entry, but the name entry only references one"
3957 echo "MDT-object. The namespace LFSCK will remove the linkEA entry"
3958 echo "for the MDT-object that is not recognized. If such MDT-object"
3959 echo "has no other linkEA entry after the removing, then the LFSCK"
3960 echo "will add it as orphan under the .lustre/lost+found/MDTxxxx/."
3963 check_mount_and_prep
3965 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
3967 mkdir $DIR/$tdir/d0/guard || error "(1) Fail to mkdir guard"
3968 $LFS path2fid $DIR/$tdir/d0/guard
3970 mkdir $DIR/$tdir/d0/dummy || error "(2) Fail to mkdir dummy"
3971 $LFS path2fid $DIR/$tdir/d0/dummy
# pfid is the parent-FID part of the expected orphan name built below.
# NOTE(review): presumably guard's FID is used on non-ldiskfs backends and
# dummy's FID otherwise; the declaration of pfid and the else/fi lines are
# not visible in this listing - confirm.
3974 if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
3975 pfid=$($LFS path2fid $DIR/$tdir/d0/guard)
3977 pfid=$($LFS path2fid $DIR/$tdir/d0/dummy)
3980 touch $DIR/$tdir/d0/guard/foo ||
3981 error "(3) Fail to touch $DIR/$tdir/d0/guard/foo"
3983 echo "Inject failure stub on MDT0 to simulate the case that"
3984 echo "the $DIR/$tdir/d0/dummy/foo has the 'bad' linkEA entry"
3985 echo "that references $DIR/$tdir/d0/guard/foo."
3986 echo "Then remove the name entry $DIR/$tdir/d0/dummy/foo."
3987 echo "So the MDT-object $DIR/$tdir/d0/dummy/foo will be left"
3988 echo "there with the same linkEA entry as another MDT-object"
3989 echo "$DIR/$tdir/d0/guard/foo has"
# dummy/foo is created and removed again while the fail_loc is set; its
# FID is captured in cfid in between.
3991 #define OBD_FAIL_LFSCK_MUL_REF 0x1622
3992 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1622
3993 $LFS mkdir -i 0 $DIR/$tdir/d0/dummy/foo ||
3994 error "(4) Fail to mkdir $DIR/$tdir/d0/dummy/foo"
3995 $LFS path2fid $DIR/$tdir/d0/dummy/foo
3996 local cfid=$($LFS path2fid $DIR/$tdir/d0/dummy/foo)
3997 rmdir $DIR/$tdir/d0/dummy/foo ||
3998 error "(5) Fail to remove $DIR/$tdir/d0/dummy/foo name entry"
3999 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4001 echo "stat $DIR/$tdir/d0/dummy/foo should fail"
4002 stat $DIR/$tdir/d0/dummy/foo > /dev/null 2>&1 &&
4003 error "(6) stat successfully unexpectedly"
4005 echo "Trigger namespace LFSCK to repair multiple-referenced name entry"
4006 $START_NAMESPACE -A -r ||
4007 error "(7) Fail to start LFSCK for namespace"
4009 wait_all_targets_blocked namespace completed 8
4011 local repaired=$($SHOW_NAMESPACE |
4012 awk '/^multiple_referenced_repaired/ { print $2 }')
4013 [ $repaired -eq 1 ] ||
4014 error "(9) Fail to repair multiple referenced name entry: $repaired"
4016 echo "There should be an orphan under .lustre/lost+found/MDT0000/"
4017 [ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
4018 error "(10) $MOUNT/.lustre/lost+found/MDT0000/ should be there"
# The orphan is looked up under the name "<cfid>-<pfid>-D-0".
4020 local cname="$cfid-$pfid-D-0"
4021 ls -ail $MOUNT/.lustre/lost+found/MDT0000/$cname ||
4022 error "(11) .lustre/lost+found/MDT0000/ should not be empty"
4024 run_test 24 "LFSCK can repair multiple-referenced name entry"
# Test 25 (registered by the run_test call that ends this block).
# Runs only when the MDS backend is ldiskfs. A file is created while
# fail_loc 0x1623 is set, the namespace LFSCK is run, and exactly one
# bad_file_type_repaired is expected; the directory must then be listable.
4027 [ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
4028 skip "Only support to inject failure on ldiskfs" && return
4031 echo "The file type in the name entry does not match the file type"
4032 echo "claimed by the referenced object. Then the LFSCK will update"
4033 echo "the file type in the name entry."
4036 check_mount_and_prep
4038 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4040 echo "Inject failure stub on MDT0 to simulate the case that"
4041 echo "the file type stored in the name entry is wrong."
# The fail_loc is only active around the creation of foo.
4043 #define OBD_FAIL_LFSCK_BAD_TYPE 0x1623
4044 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1623
4045 touch $DIR/$tdir/d0/foo || error "(2) Fail to touch $DIR/$tdir/d0/foo"
4046 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4048 echo "Trigger namespace LFSCK to repair bad file type in the name entry"
4049 $START_NAMESPACE -r || error "(3) Fail to start LFSCK for namespace"
# Poll the status field of lfsck_namespace on the MDS until "completed".
# NOTE(review): the closing brace of the `|| {` group is not visible in
# this listing - confirm.
4051 wait_update_facet $SINGLEMDS "$LCTL get_param -n \
4052 mdd.${MDT_DEV}.lfsck_namespace |
4053 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4055 error "(4) unexpected status"
4058 local repaired=$($SHOW_NAMESPACE |
4059 awk '/^bad_file_type_repaired/ { print $2 }')
4060 [ $repaired -eq 1 ] ||
4061 error "(5) Fail to repair bad file type in name entry: $repaired"
4063 ls -ail $DIR/$tdir/d0 || error "(6) Fail to 'ls' the $DIR/$tdir/d0"
4065 run_test 25 "LFSCK can repair bad file type in the name entry"
# Test 26a (registered by the run_test call that ends this block).
# foo is unlinked while fail_loc 0x1624 is set, the namespace LFSCK is run
# on all targets, and the test expects lost_dirent_repaired == 1, foo to be
# listable again, and foo's FID to be unchanged.
4069 echo "The local name entry back referenced by the MDT-object is lost."
4070 echo "The namespace LFSCK will add the missing local name entry back"
4071 echo "to the normal namespace."
4074 check_mount_and_prep
4076 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4077 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
# FID before the injection; compared with the FID after the repair.
4078 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
4080 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
4081 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4083 echo "Inject failure stub on MDT0 to simulate the case that"
4084 echo "foo's name entry will be removed, but the foo's object"
4085 echo "and its linkEA are kept in the system."
4087 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4088 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4089 rm -f $DIR/$tdir/d0/foo || error "(4) Fail to unlink $DIR/$tdir/d0/foo"
4090 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4092 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4093 error "(5) 'ls' should fail"
# NOTE(review): the message below says "remote" although this test
# describes a local name entry - looks copied from test 26b; confirm.
4095 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4096 $START_NAMESPACE -r -A ||
4097 error "(6) Fail to start LFSCK for namespace"
4099 wait_all_targets_blocked namespace completed 7
4101 local repaired=$($SHOW_NAMESPACE |
4102 awk '/^lost_dirent_repaired/ { print $2 }')
4103 [ $repaired -eq 1 ] ||
4104 error "(8) Fail to repair lost dirent: $repaired"
4106 ls -ail $DIR/$tdir/d0/foo ||
4107 error "(9) Fail to 'ls' $DIR/$tdir/d0/foo"
4109 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
4110 [ "$foofid" == "$foofid2" ] ||
4111 error "(10) foo's FID changed: $foofid, $foofid2"
4113 run_test 26a "LFSCK can add the missing local name entry back to the namespace"
# Test 26b (registered by the run_test call that ends this block).
# Skipped when fewer than 2 MDSes are configured. d0 is created with
# `-i 1` and d0/foo with `-i 0`; foo is removed while fail_loc 0x1624 is
# set, the namespace LFSCK is run on all targets, and the test expects
# lost_dirent_repaired == 1, foo to be listable, and its FID unchanged.
4116 [ $MDSCOUNT -lt 2 ] &&
4117 skip "We need at least 2 MDSes for this test" && return
4120 echo "The remote name entry back referenced by the MDT-object is lost."
4121 echo "The namespace LFSCK will add the missing remote name entry back"
4122 echo "to the normal namespace."
4125 check_mount_and_prep
4127 $LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4128 $LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"
# FID before the injection; compared with the FID after the repair.
4129 local foofid=$($LFS path2fid $DIR/$tdir/d0/foo)
4131 echo "Inject failure stub on MDT0 to simulate the case that"
4132 echo "foo's name entry will be removed, but the foo's object"
4133 echo "and its linkEA are kept in the system."
4135 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4136 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
4137 rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
4138 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
4140 ls -ail $DIR/$tdir/d0/foo > /dev/null 2>&1 &&
4141 error "(4) 'ls' should fail"
4143 echo "Trigger namespace LFSCK to repair the missing remote name entry"
4144 $START_NAMESPACE -r -A ||
4145 error "(5) Fail to start LFSCK for namespace"
4147 wait_all_targets_blocked namespace completed 6
4149 local repaired=$($SHOW_NAMESPACE |
4150 awk '/^lost_dirent_repaired/ { print $2 }')
4151 [ $repaired -eq 1 ] ||
4152 error "(7) Fail to repair lost dirent: $repaired"
4154 ls -ail $DIR/$tdir/d0/foo ||
4155 error "(8) Fail to 'ls' $DIR/$tdir/d0/foo"
4157 local foofid2=$($LFS path2fid $DIR/$tdir/d0/foo)
4158 [ "$foofid" == "$foofid2" ] ||
4159 error "(9) foo's FID changed: $foofid, $foofid2"
4161 run_test 26b "LFSCK can add the missing remote name entry back to the namespace"
# Test 27a: the local parent directory referenced by an object's linkEA is
# lost. foo and its hard link are unlinked while fail_loc 0x1624 is set,
# then the parent d0 is removed. After a namespace LFSCK on all targets the
# test expects lost_dirent_repaired == 1 and an entry matching "*-P-*"
# under .lustre/lost+found/MDT0000/.
# Uses the globals DIR, tdir, MOUNT, SINGLEMDS, LCTL, LFS, START_NAMESPACE
# and SHOW_NAMESPACE; failures are reported through error().
test_27a() {
	echo "The local parent referenced by the MDT-object linkEA is lost."
	echo "The namespace LFSCK will re-create the lost parent as orphan."

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
	ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/dummy ||
		error "(3) Fail to hard link to $DIR/$tdir/d0/foo"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system. And then remove"
	echo "another hard link and the parent directory."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rm -f $DIR/$tdir/d0/foo ||
		error "(4) Fail to unlink $DIR/$tdir/d0/foo"
	rm -f $DIR/$tdir/d0/dummy ||
		error "(5) Fail to unlink $DIR/$tdir/d0/dummy"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	rm -rf $DIR/$tdir/d0 || error "(5) Fail to unlink the dir d0"
	ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(6) 'ls' should fail"

	echo "Trigger namespace LFSCK to repair the lost parent"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 7

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(8) Fail to repair lost dirent: $repaired"

	echo "There should be an orphan under .lustre/lost+found/MDT0000/"
	[ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
		error "(9) $MOUNT/.lustre/lost+found/MDT0000/ should be there"

	ls -ail $MOUNT/.lustre/lost+found/MDT0000/

	# Quote the pattern so find receives it verbatim; unquoted, the shell
	# would expand it against the current directory first. cname is kept
	# local so it does not leak into later tests.
	local cname
	cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-P-*")
	[ -n "$cname" ] ||
		error "(10) .lustre/lost+found/MDT0000/ should not be empty"
}
run_test 27a "LFSCK can recreate the lost local parent directory as orphan"
# Test 27b: the remote parent directory referenced by an object's linkEA
# is lost. Skipped when fewer than 2 MDSes are configured. d0 is created
# with `-i 1` and d0/foo with `-i 0`; foo is removed while fail_loc 0x1624
# is set, then d0 is removed. After a namespace LFSCK on all targets the
# test expects lost_dirent_repaired == 1 and an entry matching "*-P-*"
# under .lustre/lost+found/MDT0001/.
# Uses the globals DIR, tdir, MOUNT, MDSCOUNT, SINGLEMDS, LCTL, LFS,
# START_NAMESPACE and SHOW_NAMESPACE; failures go through error().
test_27b() {
	[ $MDSCOUNT -lt 2 ] &&
		skip "We need at least 2 MDSes for this test" && return

	echo "The remote parent referenced by the MDT-object linkEA is lost."
	echo "The namespace LFSCK will re-create the lost parent as orphan."

	check_mount_and_prep

	$LFS mkdir -i 1 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
	$LFS mkdir -i 0 $DIR/$tdir/d0/foo || error "(2) Fail to mkdir foo"

	$LFS path2fid $DIR/$tdir/d0

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "foo's name entry will be removed, but the foo's object"
	echo "and its linkEA are kept in the system. And then remove"
	echo "the parent directory."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rmdir $DIR/$tdir/d0/foo || error "(3) Fail to rmdir $DIR/$tdir/d0/foo"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	rmdir $DIR/$tdir/d0 || error "(4) Fail to unlink the dir d0"
	ls -ail $DIR/$tdir/d0 > /dev/null 2>&1 && error "(5) 'ls' should fail"

	echo "Trigger namespace LFSCK to repair the missing remote name entry"
	$START_NAMESPACE -r -A ||
		error "(6) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 7

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^lost_dirent_repaired/ { print $2 }')
	[ $repaired -eq 1 ] ||
		error "(8) Fail to repair lost dirent: $repaired"

	ls -ail $MOUNT/.lustre/lost+found/

	echo "There should be an orphan under .lustre/lost+found/MDT0001/"
	[ -d $MOUNT/.lustre/lost+found/MDT0001 ] ||
		error "(9) $MOUNT/.lustre/lost+found/MDT0001/ should be there"

	ls -ail $MOUNT/.lustre/lost+found/MDT0001/

	# Quote the pattern so find receives it verbatim; unquoted, the shell
	# would expand it against the current directory first. cname is kept
	# local so it does not leak into later tests.
	local cname
	cname=$(find $MOUNT/.lustre/lost+found/MDT0001/ -name "*-P-*")
	[ -n "$cname" ] ||
		error "(10) .lustre/lost+found/MDT0001/ should not be empty"
}
run_test 27b "LFSCK can recreate the lost remote parent directory as orphan"
# Test 28 (registered by the run_test call that ends this block).
# Skipped when fewer than 2 MDTs are configured. Two name entries (d1/a1
# and d2/a2) are removed while fail_loc 0x1624 is set on mds1 and mds2.
# A first namespace LFSCK runs with fail_loc 0x161c set on mds2 and is
# expected to end "partial" on mds1 and "completed" on mds2, repairing one
# lost dirent on mds2 and none on mds1. A second run without the fail_loc
# is expected to repair one on mds1 and none on mds2.
# NOTE(review): the closing braces of the `|| {` groups are not visible in
# this listing - confirm against the complete file.
4270 [ $MDSCOUNT -lt 2 ] &&
4271 skip "The test needs at least 2 MDTs" && return
4274 echo "The target name entry is lost. The LFSCK should insert the"
4275 echo "orphan MDT-object under .lustre/lost+found/MDTxxxx. But if"
4276 echo "the MDT (on which the orphan MDT-object resides) has ever"
4277 echo "failed to respond some name entry verification during the"
4278 echo "first stage-scanning, then the LFSCK should skip to handle"
4279 echo "orphan MDT-object on this MDT. But other MDTs should not"
4283 check_mount_and_prep
# d1 is created with `-i 0` and its children with `-i 1`; d2 is the
# mirror image (`-i 1` parent, `-i 0` children).
4284 $LFS mkdir -i 0 $DIR/$tdir/d1
4285 $LFS mkdir -i 1 $DIR/$tdir/d1/a1
4286 $LFS mkdir -i 1 $DIR/$tdir/d1/a2
4288 $LFS mkdir -i 1 $DIR/$tdir/d2
4289 $LFS mkdir -i 0 $DIR/$tdir/d2/a1
4290 $LFS mkdir -i 0 $DIR/$tdir/d2/a2
4292 echo "Inject failure stub on MDT0 to simulate the case that"
4293 echo "d1/a1's name entry will be removed, but the d1/a1's object"
4294 echo "and its linkEA are kept in the system. And the case that"
4295 echo "d2/a2's name entry will be removed, but the d2/a2's object"
4296 echo "and its linkEA are kept in the system."
# The fail_loc is set on both MDSes around the two rmdir calls.
4298 #define OBD_FAIL_LFSCK_NO_NAMEENTRY 0x1624
4299 do_facet mds1 $LCTL set_param fail_loc=0x1624
4300 do_facet mds2 $LCTL set_param fail_loc=0x1624
4301 rmdir $DIR/$tdir/d1/a1 || error "(1) Fail to rmdir $DIR/$tdir/d1/a1"
4302 rmdir $DIR/$tdir/d2/a2 || error "(2) Fail to rmdir $DIR/$tdir/d2/a2"
4303 do_facet mds1 $LCTL set_param fail_loc=0
4304 do_facet mds2 $LCTL set_param fail_loc=0
4306 cancel_lru_locks mdc
4307 cancel_lru_locks osc
4309 echo "Inject failure, to simulate the MDT0 fail to handle"
4310 echo "MDT1 LFSCK request during the first-stage scanning."
4311 #define OBD_FAIL_LFSCK_BAD_NETWORK 0x161c
4312 do_facet mds2 $LCTL set_param fail_loc=0x161c fail_val=0
4314 echo "Trigger namespace LFSCK on all devices to find out orphan object"
4315 $START_NAMESPACE -r -A ||
4316 error "(3) Fail to start LFSCK for namespace"
# First run: mds1 must report "partial", mds2 "completed".
4318 wait_update_facet mds1 "$LCTL get_param -n \
4319 mdd.$(facet_svc mds1).lfsck_namespace |
4320 awk '/^status/ { print \\\$2 }'" "partial" 32 || {
4321 error "(4) mds1 is not the expected 'partial'"
4324 wait_update_facet mds2 "$LCTL get_param -n \
4325 mdd.$(facet_svc mds2).lfsck_namespace |
4326 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
4327 error "(5) mds2 is not the expected 'completed'"
4330 do_facet mds2 $LCTL set_param fail_loc=0 fail_val=0
# After the first run: 0 lost dirents repaired on mds1, 1 on mds2.
4332 local repaired=$(do_facet mds1 $LCTL get_param -n \
4333 mdd.$(facet_svc mds1).lfsck_namespace |
4334 awk '/^lost_dirent_repaired/ { print $2 }')
4335 [ $repaired -eq 0 ] ||
4336 error "(6) Expect 0 fixed on mds1, but got: $repaired"
4338 repaired=$(do_facet mds2 $LCTL get_param -n \
4339 mdd.$(facet_svc mds2).lfsck_namespace |
4340 awk '/^lost_dirent_repaired/ { print $2 }')
4341 [ $repaired -eq 1 ] ||
4342 error "(7) Expect 1 fixed on mds2, but got: $repaired"
4344 echo "Trigger namespace LFSCK on all devices again to cleanup"
4345 $START_NAMESPACE -r -A ||
4346 error "(8) Fail to start LFSCK for namespace"
4348 wait_all_targets_blocked namespace completed 9
# After the second run the counts are swapped: 1 on mds1, 0 on mds2.
4350 local repaired=$(do_facet mds1 $LCTL get_param -n \
4351 mdd.$(facet_svc mds1).lfsck_namespace |
4352 awk '/^lost_dirent_repaired/ { print $2 }')
4353 [ $repaired -eq 1 ] ||
4354 error "(10) Expect 1 fixed on mds1, but got: $repaired"
4356 repaired=$(do_facet mds2 $LCTL get_param -n \
4357 mdd.$(facet_svc mds2).lfsck_namespace |
4358 awk '/^lost_dirent_repaired/ { print $2 }')
4359 [ $repaired -eq 0 ] ||
4360 error "(11) Expect 0 fixed on mds2, but got: $repaired"
4362 run_test 28 "Skip the failed MDT(s) when handle orphan MDT-objects"
# Test 29a body. Its run_test registration at the end of this block is
# commented out, so this test is currently disabled (see the note there).
# Flow: a hard link is created while fail_loc 0x1625 is set so that stat
# reports 3 links, the namespace LFSCK is run on all targets, and the test
# expects nlinks_repaired == 1 and a link count of 2 afterwards.
4366 echo "The object's nlink attribute is larger than the object's known"
4367 echo "name entries count. The LFSCK will repair the object's nlink"
4368 echo "attribute to match the known name entries count"
4371 check_mount_and_prep
4373 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4374 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4376 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4377 echo "nlink attribute is larger than its name entries count."
4379 #define OBD_FAIL_LFSCK_MORE_NLINK 0x1625
4380 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1625
4381 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4382 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4383 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Drop the client's mdc locks before reading the link count via stat; the
# injection counts as successful only when 3 links are reported.
4385 cancel_lru_locks mdc
4386 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
4387 [ $count -eq 3 ] || error "(4) Cannot inject error: $count"
4389 echo "Trigger namespace LFSCK to repair the nlink count"
4390 $START_NAMESPACE -r -A ||
4391 error "(5) Fail to start LFSCK for namespace"
4393 wait_all_targets_blocked namespace completed 6
4395 local repaired=$($SHOW_NAMESPACE |
4396 awk '/^nlinks_repaired/ { print $2 }')
4397 [ $repaired -eq 1 ] ||
4398 error "(7) Fail to repair nlink count: $repaired"
# foo plus its one hard link h1: 2 links are expected after the repair.
4400 cancel_lru_locks mdc
4401 count=$(stat --format=%h $DIR/$tdir/d0/foo)
4402 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
4404 # Disable 29a, we only allow nlink to be updated if the known linkEA
4405 # entries is larger than nlink count.
4407 #run_test 29a "LFSCK can repair bad nlink count (1)"
# Test 29b (registered by the run_test call that ends this block).
# Flow: a hard link is created while fail_loc 0x1626 is set so that stat
# still reports 1 link, the namespace LFSCK is run on all targets, and the
# test expects nlinks_repaired == 1 and a link count of 2 afterwards.
4411 echo "The object's nlink attribute is smaller than the object's known"
4412 echo "name entries count. The LFSCK will repair the object's nlink"
4413 echo "attribute to match the known name entries count"
4416 check_mount_and_prep
4418 $LFS mkdir -i 0 $DIR/$tdir/d0 || error "(1) Fail to mkdir d0"
4419 touch $DIR/$tdir/d0/foo || error "(2) Fail to create foo"
4421 echo "Inject failure stub on MDT0 to simulate the case that foo's"
4422 echo "nlink attribute is smaller than its name entries count."
4424 #define OBD_FAIL_LFSCK_LESS_NLINK 0x1626
4425 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1626
4426 ln $DIR/$tdir/d0/foo $DIR/$tdir/d0/h1 ||
4427 error "(3) Fail to hard link to $DIR/$tdir/d0/foo"
4428 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
# Drop the client's mdc locks before reading the link count via stat; the
# injection counts as successful only when 1 link is reported.
4430 cancel_lru_locks mdc
4431 local count=$(stat --format=%h $DIR/$tdir/d0/foo)
4432 [ $count -eq 1 ] || error "(4) Cannot inject error: $count"
4434 echo "Trigger namespace LFSCK to repair the nlink count"
4435 $START_NAMESPACE -r -A ||
4436 error "(5) Fail to start LFSCK for namespace"
4438 wait_all_targets_blocked namespace completed 6
4440 local repaired=$($SHOW_NAMESPACE |
4441 awk '/^nlinks_repaired/ { print $2 }')
4442 [ $repaired -eq 1 ] ||
4443 error "(7) Fail to repair nlink count: $repaired"
# foo plus its one hard link h1: 2 links are expected after the repair.
4445 cancel_lru_locks mdc
4446 count=$(stat --format=%h $DIR/$tdir/d0/foo)
4447 [ $count -eq 2 ] || error "(8) Fail to repair nlink count: $count"
4449 run_test 29b "LFSCK can repair bad nlink count (2)"
# Test 29c (registered by the run_test call that ends this block).
# Flow: 150 hard links to guard/f0 are created under foo, 100 of them are
# removed again, and the namespace LFSCK is run on all targets. The test
# expects linkea_overflow_cleared == 1 and nlinks_repaired == 0. When at
# least 2 MDSes exist, it also checks that `lfs migrate` leaves f0's FID
# unchanged before the LFSCK run and changes it afterwards.
# NOTE(review): the `fi` closers of the three `if` blocks are not visible
# in this listing - confirm against the complete file.
4454 echo "The namespace LFSCK will create many hard links to the target"
4455 echo "file as to exceed the linkEA size limitation. Under such case"
4456 echo "the linkEA will be marked as overflow that will prevent the"
4457 echo "target file to be migrated. Then remove some hard links to"
4458 echo "make the left hard links to be held within the linkEA size"
4459 echo "limitation. But before the namespace LFSCK adding all the"
4460 echo "missed linkEA entries back, the overflow mark (timestamp)"
4461 echo "will not be cleared."
4464 check_mount_and_prep
4466 mkdir -p $DIR/$tdir/guard || error "(0.1) Fail to mkdir"
4467 $LFS mkdir -i $((MDSCOUNT - 1)) $DIR/$tdir/foo ||
4468 error "(0.2) Fail to mkdir"
4469 touch $DIR/$tdir/guard/f0 || error "(1) Fail to create"
# FID of f0 before any migration attempt; every later check compares
# against it.
4470 local oldfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4472 # define MAX_LINKEA_SIZE 4096
4473 # sizeof(link_ea_header) = 24
4474 # sizeof(link_ea_entry) = 18
4475 # nlink_min=$(((MAX_LINKEA_SIZE - sizeof(link_ea_header)) /
4476 # (sizeof(link_ea_entry) + name_length))
4477 # If the average name length is 12 bytes, then 150 hard links
4478 # is totally enough to overflow the linkEA
4479 echo "Create 150 hard links should succeed although the linkEA overflow"
4480 createmany -l $DIR/$tdir/guard/f0 $DIR/$tdir/foo/ttttttttttt 150 ||
4481 error "(2) Fail to hard link"
4483 cancel_lru_locks mdc
# First migration attempt: the command itself must succeed, but f0's FID
# must stay the same.
4484 if [ $MDSCOUNT -ge 2 ]; then
4485 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
4486 error "(3.1) Migrate failure"
4488 echo "The object with linkEA overflow should NOT be migrated"
4489 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4490 [ "$newfid" == "$oldfid" ] ||
4491 error "(3.2) Migrate should fail: $newfid != $oldfid"
4494 # Remove 100 hard links, then the linkEA should have space
4495 # to hold the missed linkEA entries.
4496 echo "Remove 100 hard links to save space for the missed linkEA entries"
4497 unlinkmany $DIR/$tdir/foo/ttttttttttt 100 || error "(4) Fail to unlink"
# Second migration attempt, still before the LFSCK run: the FID must again
# be unchanged.
4499 if [ $MDSCOUNT -ge 2 ]; then
4500 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
4501 error "(5.1) Migrate failure"
4503 # The overflow timestamp is still there, so migration will fail.
4504 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4505 [ "$newfid" == "$oldfid" ] ||
4506 error "(5.2) Migrate should fail: $newfid != $oldfid"
# NOTE(review): no sleep command is visible after the comment below in
# this listing - confirm against the complete file.
4509 # sleep 3 seconds to guarantee that the overflow is recognized
4512 echo "Trigger namespace LFSCK to clear the overflow timestamp"
4513 $START_NAMESPACE -r -A ||
4514 error "(6) Fail to start LFSCK for namespace"
4516 wait_all_targets_blocked namespace completed 7
4518 local repaired=$($SHOW_NAMESPACE |
4519 awk '/^linkea_overflow_cleared/ { print $2 }')
4520 [ $repaired -eq 1 ] ||
4521 error "(8) Fail to clear linkea overflow: $repaired"
4523 repaired=$($SHOW_NAMESPACE |
4524 awk '/^nlinks_repaired/ { print $2 }')
4525 [ $repaired -eq 0 ] ||
4526 error "(9) Unexpected nlink repaired: $repaired"
# Third migration attempt, after the LFSCK run: now f0's FID must change.
4528 if [ $MDSCOUNT -ge 2 ]; then
4529 $LFS migrate -m 1 $DIR/$tdir/guard 2>/dev/null ||
4530 error "(10.1) Migrate failure"
4532 # Migration should succeed after clear the overflow timestamp.
4533 local newfid=$($LFS path2fid $DIR/$tdir/guard/f0)
4534 [ "$newfid" != "$oldfid" ] ||
4535 error "(10.2) Migrate should succeed"
4537 ls -l $DIR/$tdir/foo > /dev/null ||
4538 error "(11) 'ls' failed after migration"
4541 rm -f $DIR/$tdir/guard/f0 || error "(12) Fail to unlink f0"
4542 rm -rf $DIR/$tdir/foo || error "(13) Fail to rmdir foo"
4544 run_test 29c "verify linkEA size limitation"
# Test 30: recover orphans from the backend /lost+found directory.
# Runs only when the MDS backend is ldiskfs. d0 is created while fail_loc
# 0x161d is set; d0, its children d1/f1 and foo/f0 are then removed while
# fail_loc 0x1624 is set. The client is unmounted, the MDS is stopped,
# e2fsck is run on the MDT device, and the MDS is restarted. After a
# namespace LFSCK on all targets the test expects
# local_lost_found_moved >= 4, f0 reachable at its old path, and d0 (with
# d1 and f1 inside) under .lustre/lost+found/MDT0000/ with a name matching
# "*-*-D-*".
# Uses the globals DIR, tdir, MOUNT, SINGLEMDS, MDT_DEVNAME,
# MOUNT_OPTS_NOSCRUB, LCTL, LFS, START_NAMESPACE and SHOW_NAMESPACE;
# failures are reported through error().
test_30() {
	[ $(facet_fstype $SINGLEMDS) != ldiskfs ] &&
		skip "Only support backend /lost+found for ldiskfs" && return

	echo "The namespace LFSCK will move the orphans from backend"
	echo "/lost+found directory to normal client visible namespace"
	echo "or to global visible ./lustre/lost+found/MDTxxxx/ directory"

	check_mount_and_prep

	$LFS mkdir -i 0 $DIR/$tdir/foo || error "(1) Fail to mkdir foo"
	# The file created here is f0, so the message names f0 (it used to
	# say f1).
	touch $DIR/$tdir/foo/f0 || error "(2) Fail to touch f0"

	echo "Inject failure stub on MDT0 to simulate the case that"
	echo "directory d0 has no linkEA entry, then the LFSCK will"
	echo "move it into .lustre/lost+found/MDTxxxx/ later."

	#define OBD_FAIL_LFSCK_NO_LINKEA	0x161d
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x161d
	mkdir $DIR/$tdir/foo/d0 || error "(3) Fail to mkdir d0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	touch $DIR/$tdir/foo/d0/f1 || error "(4) Fail to touch f1"
	mkdir $DIR/$tdir/foo/d0/d1 || error "(5) Fail to mkdir d1"

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "object's name entry will be removed, but not destroy the"
	echo "object. Then backend e2fsck will handle it as orphan and"
	echo "add them into the backend /lost+found directory."

	#define OBD_FAIL_LFSCK_NO_NAMEENTRY	0x1624
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1624
	rmdir $DIR/$tdir/foo/d0/d1 || error "(6) Fail to rmdir d1"
	rm -f $DIR/$tdir/foo/d0/f1 || error "(7) Fail to unlink f1"
	rmdir $DIR/$tdir/foo/d0 || error "(8) Fail to rmdir d0"
	rm -f $DIR/$tdir/foo/f0 || error "(9) Fail to unlink f0"
	do_facet $SINGLEMDS $LCTL set_param fail_loc=0

	umount_client $MOUNT || error "(10) Fail to stop client!"

	stop $SINGLEMDS || error "(11) Fail to stop MDT0"

	run_e2fsck $(facet_host $SINGLEMDS) $MDT_DEVNAME "-y" ||
		error "(12) Fail to run e2fsck"

	start $SINGLEMDS $MDT_DEVNAME $MOUNT_OPTS_NOSCRUB > /dev/null ||
		error "(13) Fail to start MDT0"

	echo "Trigger namespace LFSCK to recover backend orphans"
	$START_NAMESPACE -r -A ||
		error "(14) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 15

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^local_lost_found_moved/ { print $2 }')
	[ $repaired -ge 4 ] ||
		error "(16) Fail to recover backend orphans: $repaired"

	mount_client $MOUNT || error "(17) Fail to start client!"

	stat $DIR/$tdir/foo/f0 || error "(18) f0 is not recovered"

	ls -ail $MOUNT/.lustre/lost+found/

	echo "d0 should become orphan under .lustre/lost+found/MDT0000/"
	[ -d $MOUNT/.lustre/lost+found/MDT0000 ] ||
		error "(19) $MOUNT/.lustre/lost+found/MDT0000/ should be there"

	ls -ail $MOUNT/.lustre/lost+found/MDT0000/

	# Quote the pattern so find receives it verbatim; unquoted, the shell
	# would expand it against the current directory first. cname is kept
	# local so it does not leak into later tests.
	local cname
	cname=$(find $MOUNT/.lustre/lost+found/MDT0000/ -name "*-*-D-*")
	[ -n "$cname" ] || error "(20) d0 is not recovered"

	# This check is about d1 inside the recovered d0, so the message
	# names d1 (it used to say d0).
	stat ${cname}/d1 || error "(21) d1 is not recovered"
	stat ${cname}/f1 || error "(22) f1 is not recovered"
}
run_test 30 "LFSCK can recover the orphans from backend /lost+found"
# Test 31a: repair name entries with a bad name hash in a striped
# directory (fail_val=0 variant). Skipped when fewer than 2 MDTs are
# configured. $MDSCOUNT sub-directories are created in a striped directory
# while fail_loc 0x1628 is set on the client. After a namespace LFSCK on
# all targets the test expects name_hash_repaired >= 1, then remounts the
# client and checks that every sub-directory can be stat'ed and removed.
# Uses the globals DIR, tdir, MOUNT, MDSCOUNT, LCTL, LFS, START_NAMESPACE
# and SHOW_NAMESPACE; failures are reported through error().
test_31a() {
	[ $MDSCOUNT -lt 2 ] &&
		skip "The test needs at least 2 MDTs" && return

	echo "For the name entry under a striped directory, if the name"
	echo "hash does not match the shard, then the LFSCK will repair"
	echo "the bad name entry"

	check_mount_and_prep

	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"

	echo "Inject failure stub on client to simulate the case that"
	echo "some name entry should be inserted into other non-first"
	echo "shard, but inserted into the first shard by wrong"

	#define OBD_FAIL_LFSCK_BAD_NAME_HASH	0x1628
	$LCTL set_param fail_loc=0x1628 fail_val=0
	createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
		error "(2) Fail to create file under striped directory"
	$LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger namespace LFSCK to repair bad name hash"
	$START_NAMESPACE -r -A ||
		error "(3) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 4

	local repaired=$($SHOW_NAMESPACE |
			 awk '/^name_hash_repaired/ { print $2 }')
	[ $repaired -ge 1 ] ||
		error "(5) Fail to repair bad name hash: $repaired"

	umount_client $MOUNT || error "(6) umount failed"
	mount_client $MOUNT || error "(7) mount failed"

	# Keep the loop counter local so it does not leak into later tests.
	local i
	for ((i = 0; i < $MDSCOUNT; i++)); do
		stat $DIR/$tdir/striped_dir/d$i ||
			error "(8) Fail to stat d$i after LFSCK"
		rmdir $DIR/$tdir/striped_dir/d$i ||
			error "(9) Fail to unlink d$i after LFSCK"
	done

	rmdir $DIR/$tdir/striped_dir ||
		error "(10) Fail to remove the striped directory after LFSCK"
}
run_test 31a "The LFSCK can find/repair the name entry with bad name hash (1)"
# Test 31b: repair name entries with a bad name hash in a striped
# directory (fail_val=1 variant). Skipped when fewer than 2 MDTs are
# configured. $MDSCOUNT sub-directories are created in a striped directory
# while fail_loc 0x1628 is set on the client. After a namespace LFSCK on
# all targets the test reads name_hash_repaired from mds2 and expects it
# to be >= 1, then remounts the client and checks that every sub-directory
# can be stat'ed and removed.
# Uses the globals DIR, tdir, MOUNT, MDSCOUNT, LCTL, LFS and
# START_NAMESPACE; failures are reported through error().
test_31b() {
	[ $MDSCOUNT -lt 2 ] &&
		skip "The test needs at least 2 MDTs" && return

	echo "For the name entry under a striped directory, if the name"
	echo "hash does not match the shard, then the LFSCK will repair"
	echo "the bad name entry"

	check_mount_and_prep

	$LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir/striped_dir ||
		error "(1) Fail to create striped directory"

	echo "Inject failure stub on client to simulate the case that"
	echo "some name entry should be inserted into other non-second"
	echo "shard, but inserted into the secod shard by wrong"

	#define OBD_FAIL_LFSCK_BAD_NAME_HASH	0x1628
	$LCTL set_param fail_loc=0x1628 fail_val=1
	createmany -d $DIR/$tdir/striped_dir/d $MDSCOUNT ||
		error "(2) Fail to create file under striped directory"
	$LCTL set_param fail_loc=0 fail_val=0

	echo "Trigger namespace LFSCK to repair bad name hash"
	$START_NAMESPACE -r -A ||
		error "(3) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 4

	# Unlike test 31a, the repair counter is read from mds2.
	local repaired=$(do_facet mds2 $LCTL get_param -n \
			 mdd.$(facet_svc mds2).lfsck_namespace |
			 awk '/^name_hash_repaired/ { print $2 }')
	[ $repaired -ge 1 ] ||
		error "(5) Fail to repair bad name hash: $repaired"

	umount_client $MOUNT || error "(6) umount failed"
	mount_client $MOUNT || error "(7) mount failed"

	# Keep the loop counter local so it does not leak into later tests.
	local i
	for ((i = 0; i < $MDSCOUNT; i++)); do
		stat $DIR/$tdir/striped_dir/d$i ||
			error "(8) Fail to stat d$i after LFSCK"
		rmdir $DIR/$tdir/striped_dir/d$i ||
			error "(9) Fail to unlink d$i after LFSCK"
	done

	rmdir $DIR/$tdir/striped_dir ||
		error "(10) Fail to remove the striped directory after LFSCK"
}
run_test 31b "The LFSCK can find/repair the name entry with bad name hash (2)"
# test_31c: create a striped directory while fail_loc 0x1629 is set on
# $SINGLEMDS (simulating a lost master LMV EA), run the namespace LFSCK on
# all targets, and expect exactly one striped directory to be repaired.
# After a client remount the directory must list as empty and be removable.
#
# Globals read: MDSCOUNT, SINGLEMDS, DIR, tdir, MOUNT, LFS, LCTL,
#               START_NAMESPACE, SHOW_NAMESPACE.
# Skipped when fewer than 2 MDTs are configured.
#
# NOTE(review): the function header and closing brace were absent from this
# copy of the file and are restored here; other short lines may have been
# lost as well -- confirm against the upstream script.
test_31c() {
	# Return unconditionally after skip so that a non-zero status from
	# skip cannot let the test body run on an unsuitable configuration.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	local striped_dir=$DIR/$tdir/striped_dir
	local repaired
	local empty

	echo "For some reason, the master MDT-object of the striped directory"
	echo "may lost its master LMV EA. If nobody created files under the"
	echo "master directly after the master LMV EA lost, then the LFSCK"
	echo "should re-generate the master LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "master MDT-object of the striped directory lost the LMV EA."

	# $LFS, $LCTL, $START_NAMESPACE and $SHOW_NAMESPACE hold command
	# lines and are left unquoted on purpose.
	#define OBD_FAIL_LFSCK_LOST_MASTER_LMV	0x1629
	do_facet "$SINGLEMDS" $LCTL set_param fail_loc=0x1629
	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$striped_dir" ||
		error "(1) Fail to create striped directory"
	do_facet "$SINGLEMDS" $LCTL set_param fail_loc=0

	echo "Trigger namespace LFSCK to re-generate master LMV EA"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 3

	repaired=$($SHOW_NAMESPACE |
		awk '/^striped_dirs_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to re-generate master LMV EA: $repaired"

	# Remount the client before listing the directory.
	umount_client "$MOUNT" || error "(5) umount failed"
	mount_client "$MOUNT" || error "(6) mount failed"

	# Any listing output is treated as a failed repair.
	empty=$(ls "$striped_dir/")
	[ -z "$empty" ] || error "(7) The master LMV EA is not repaired: $empty"

	rmdir "$striped_dir" ||
		error "(8) Fail to remove the striped directory after LFSCK"
}
run_test 31c "Re-generate the lost master LMV EA for striped directory"
# test_31d: create a striped directory while fail_loc 0x1629 is set on
# $SINGLEMDS (simulating a lost master LMV EA), remount the client, create a
# file under the broken directory, then run the namespace LFSCK on all
# targets. The LFSCK must NOT report a repaired striped directory; instead
# further creates under the directory must fail until "chattr -i" is applied.
#
# Globals read: MDSCOUNT, SINGLEMDS, DIR, tdir, MOUNT, LFS, LCTL,
#               START_NAMESPACE, SHOW_NAMESPACE.
# Skipped when fewer than 2 MDTs are configured.
#
# NOTE(review): the function header and closing brace were absent from this
# copy of the file and are restored here; other short lines may have been
# lost as well -- confirm against the upstream script.
test_31d() {
	# Return unconditionally after skip so that a non-zero status from
	# skip cannot let the test body run on an unsuitable configuration.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	local striped_dir=$DIR/$tdir/striped_dir
	local repaired

	echo "For some reason, the master MDT-object of the striped directory"
	echo "may lost its master LMV EA. If somebody created files under the"
	echo "master directly after the master LMV EA lost, then the LFSCK"
	echo "should NOT re-generate the master LMV EA, instead, it should"
	echo "change the broken striped dirctory as read-only to prevent"
	echo "further damage"

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "master MDT-object of the striped directory lost the LMV EA."

	# $LFS, $LCTL, $START_NAMESPACE and $SHOW_NAMESPACE hold command
	# lines and are left unquoted on purpose.
	#define OBD_FAIL_LFSCK_LOST_MASTER_LMV	0x1629
	do_facet "$SINGLEMDS" $LCTL set_param fail_loc=0x1629
	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$striped_dir" ||
		error "(1) Fail to create striped directory"
	do_facet "$SINGLEMDS" $LCTL set_param fail_loc=0x0

	# Remount the client before modifying the broken directory.
	umount_client "$MOUNT" || error "(2) umount failed"
	mount_client "$MOUNT" || error "(3) mount failed"

	touch "$striped_dir/dummy" ||
		error "(4) Fail to touch under broken striped directory"

	echo "Trigger namespace LFSCK to find out the inconsistency"
	$START_NAMESPACE -r -A ||
		error "(5) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 6

	repaired=$($SHOW_NAMESPACE |
		awk '/^striped_dirs_repaired/ { print $2 }')
	[ "$repaired" -eq 0 ] ||
		error "(7) Re-generate master LMV EA unexpected: $repaired"

	stat "$striped_dir/dummy" ||
		error "(8) Fail to stat $striped_dir/dummy"

	# A successful create here means the directory was not protected.
	touch "$striped_dir/foo" &&
		error "(9) The broken striped directory should be read-only"

	chattr -i "$striped_dir" ||
		error "(10) Fail to chattr on the broken striped directory"

	rmdir "$striped_dir" ||
		error "(11) Fail to remove the striped directory after LFSCK"
}
run_test 31d "Set broken striped directory (modified after broken) as read-only"
# test_31e: create a striped directory while fail_loc 0x162a / fail_val 0 is
# set on $SINGLEMDS (simulating a lost slave LMV EA), run the namespace LFSCK
# on all targets, and expect exactly one striped shard to be reported as
# repaired by $SHOW_NAMESPACE.
#
# Globals read: MDSCOUNT, SINGLEMDS, DIR, tdir, LFS, LCTL,
#               START_NAMESPACE, SHOW_NAMESPACE.
# Skipped when fewer than 2 MDTs are configured.
#
# NOTE(review): the function header and closing brace were absent from this
# copy of the file and are restored here; other short lines may have been
# lost as well -- confirm against the upstream script.
test_31e() {
	# Return unconditionally after skip so that a non-zero status from
	# skip cannot let the test body run on an unsuitable configuration.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	local striped_dir=$DIR/$tdir/striped_dir
	local repaired

	echo "For some reason, the slave MDT-object of the striped directory"
	echo "may lost its slave LMV EA. The LFSCK should re-generate the"
	echo "slave LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "slave MDT-object (that resides on the same MDT as the master"
	echo "MDT-object resides on) lost the LMV EA."

	# $LFS, $LCTL, $START_NAMESPACE and $SHOW_NAMESPACE hold command
	# lines and are left unquoted on purpose.
	#define OBD_FAIL_LFSCK_LOST_SLAVE_LMV	0x162a
	do_facet "$SINGLEMDS" $LCTL set_param fail_loc=0x162a fail_val=0
	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$striped_dir" ||
		error "(1) Fail to create striped directory"
	do_facet "$SINGLEMDS" $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to re-generate slave LMV EA"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 3

	repaired=$($SHOW_NAMESPACE |
		awk '/^striped_shards_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to re-generate slave LMV EA: $repaired"

	rmdir "$striped_dir" ||
		error "(5) Fail to remove the striped directory after LFSCK"
}
run_test 31e "Re-generate the lost slave LMV EA for striped directory (1)"
# test_31f: create a striped directory while fail_loc 0x162a / fail_val 1 is
# set on $SINGLEMDS (simulating a lost slave LMV EA), run the namespace LFSCK
# on all targets, and expect exactly one striped shard to be reported as
# repaired in the lfsck_namespace parameter read from mds2.
#
# Globals read: MDSCOUNT, SINGLEMDS, DIR, tdir, LFS, LCTL, START_NAMESPACE.
# Skipped when fewer than 2 MDTs are configured.
#
# NOTE(review): the function header and closing brace were absent from this
# copy of the file and are restored here; other short lines may have been
# lost as well -- confirm against the upstream script.
test_31f() {
	# Return unconditionally after skip so that a non-zero status from
	# skip cannot let the test body run on an unsuitable configuration.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	local striped_dir=$DIR/$tdir/striped_dir
	local repaired

	echo "For some reason, the slave MDT-object of the striped directory"
	echo "may lost its slave LMV EA. The LFSCK should re-generate the"
	echo "slave LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "slave MDT-object (that resides on different MDT as the master"
	echo "MDT-object resides on) lost the LMV EA."

	# $LFS, $LCTL and $START_NAMESPACE hold command lines and are left
	# unquoted on purpose.
	#define OBD_FAIL_LFSCK_LOST_SLAVE_LMV	0x162a
	do_facet "$SINGLEMDS" $LCTL set_param fail_loc=0x162a fail_val=1
	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$striped_dir" ||
		error "(1) Fail to create striped directory"
	do_facet "$SINGLEMDS" $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to re-generate slave LMV EA"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 3

	# The repair counter is read from mds2, not from $SINGLEMDS.
	repaired=$(do_facet mds2 $LCTL get_param -n \
		mdd.$(facet_svc mds2).lfsck_namespace |
		awk '/^striped_shards_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to re-generate slave LMV EA: $repaired"

	rmdir "$striped_dir" ||
		error "(5) Fail to remove the striped directory after LFSCK"
}
run_test 31f "Re-generate the lost slave LMV EA for striped directory (2)"
# test_31g: create a striped directory while fail_loc 0x162b / fail_val 0 is
# set on $SINGLEMDS (simulating a corrupted stripe index in a slave LMV EA),
# run the namespace LFSCK on all targets, and expect exactly one striped
# shard to be repaired. After a client remount, creating and removing a file
# under the directory and removing the directory itself must succeed.
#
# Globals read: MDSCOUNT, SINGLEMDS, DIR, tdir, MOUNT, LFS, LCTL,
#               START_NAMESPACE, SHOW_NAMESPACE.
# Skipped when fewer than 2 MDTs are configured.
#
# NOTE(review): the function header and closing brace were absent from this
# copy of the file and are restored here; other short lines may have been
# lost as well -- confirm against the upstream script.
test_31g() {
	# Return unconditionally after skip so that a non-zero status from
	# skip cannot let the test body run on an unsuitable configuration.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	local striped_dir=$DIR/$tdir/striped_dir
	local repaired

	echo "For some reason, the stripe index in the slave LMV EA is"
	echo "corrupted. The LFSCK should repair the slave LMV EA."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "slave LMV EA on the first shard of the striped directory"
	echo "claims the same index as the second shard claims"

	# $LFS, $LCTL, $START_NAMESPACE and $SHOW_NAMESPACE hold command
	# lines and are left unquoted on purpose.
	#define OBD_FAIL_LFSCK_BAD_SLAVE_LMV	0x162b
	do_facet "$SINGLEMDS" $LCTL set_param fail_loc=0x162b fail_val=0
	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$striped_dir" ||
		error "(1) Fail to create striped directory"
	do_facet "$SINGLEMDS" $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to repair the slave LMV EA"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 3

	repaired=$($SHOW_NAMESPACE |
		awk '/^striped_shards_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to repair slave LMV EA: $repaired"

	# Remount the client before exercising the repaired directory.
	umount_client "$MOUNT" || error "(5) umount failed"
	mount_client "$MOUNT" || error "(6) mount failed"

	touch "$striped_dir/foo" ||
		error "(7) Fail to touch file after the LFSCK"

	rm -f "$striped_dir/foo" ||
		error "(8) Fail to unlink file after the LFSCK"

	rmdir "$striped_dir" ||
		error "(9) Fail to remove the striped directory after LFSCK"
}
run_test 31g "Repair the corrupted slave LMV EA"
# test_31h: create a striped directory while fail_loc 0x162c / fail_val 0 is
# set on $SINGLEMDS (simulating a corrupted shard name entry), run the
# namespace LFSCK on all targets, and expect exactly one directory entry to
# be repaired. After a client remount, creating and removing a file under
# the directory and removing the directory itself must succeed.
#
# Globals read: MDSCOUNT, SINGLEMDS, DIR, tdir, MOUNT, LFS, LCTL,
#               START_NAMESPACE, SHOW_NAMESPACE.
# Skipped when fewer than 2 MDTs are configured.
#
# NOTE(review): the function header and closing brace were absent from this
# copy of the file and are restored here; other short lines may have been
# lost as well -- confirm against the upstream script.
test_31h() {
	# Return unconditionally after skip so that a non-zero status from
	# skip cannot let the test body run on an unsuitable configuration.
	if [ "$MDSCOUNT" -lt 2 ]; then
		skip "The test needs at least 2 MDTs"
		return 0
	fi

	local striped_dir=$DIR/$tdir/striped_dir
	local repaired

	echo "For some reason, the shard's name entry in the striped"
	echo "directory may be corrupted. The LFSCK should repair the"
	echo "bad shard's name entry."

	check_mount_and_prep

	echo "Inject failure stub on MDT0 to simulate the case that the"
	echo "first shard's name entry in the striped directory claims"
	echo "the same index as the second shard's name entry claims."

	# $LFS, $LCTL, $START_NAMESPACE and $SHOW_NAMESPACE hold command
	# lines and are left unquoted on purpose.
	#define OBD_FAIL_LFSCK_BAD_SLAVE_NAME	0x162c
	do_facet "$SINGLEMDS" $LCTL set_param fail_loc=0x162c fail_val=0
	$LFS setdirstripe -i 0 -c "$MDSCOUNT" "$striped_dir" ||
		error "(1) Fail to create striped directory"
	do_facet "$SINGLEMDS" $LCTL set_param fail_loc=0x0 fail_val=0

	echo "Trigger namespace LFSCK to repair the shard's name entry"
	$START_NAMESPACE -r -A ||
		error "(2) Fail to start LFSCK for namespace"

	wait_all_targets_blocked namespace completed 3

	repaired=$($SHOW_NAMESPACE |
		awk '/^dirent_repaired/ { print $2 }')
	[ "$repaired" -eq 1 ] ||
		error "(4) Fail to repair shard's name entry: $repaired"

	# Remount the client before exercising the repaired directory.
	umount_client "$MOUNT" || error "(5) umount failed"
	mount_client "$MOUNT" || error "(6) mount failed"

	touch "$striped_dir/foo" ||
		error "(7) Fail to touch file after the LFSCK"

	rm -f "$striped_dir/foo" ||
		error "(8) Fail to unlink file after the LFSCK"

	rmdir "$striped_dir" ||
		error "(9) Fail to remove the striped directory after LFSCK"
}
run_test 31h "Repair the corrupted shard's name entry"
# test_32: with the client unmounted and fail_loc 0x162d / fail_val 3 set on
# $SINGLEMDS, start the layout LFSCK, require that it reports status
# "scanning-phase1", stop ost1, clear the failure injection and then require
# that the LFSCK can still be stopped.
#
# Globals read: MOUNT, SINGLEMDS, LCTL, START_LAYOUT, SHOW_LAYOUT, STOP_LFSCK.
#
# NOTE(review): the function header and closing brace were absent from this
# copy of the file and are restored here. The error numbering starts at (2),
# so other short statements (setup, delays) may have been lost as well --
# confirm against the upstream script before relying on this body.
test_32() {
	local STATUS

	umount_client "$MOUNT"

	# $LCTL, $START_LAYOUT, $SHOW_LAYOUT and $STOP_LFSCK hold command
	# lines and are left unquoted on purpose.
	#define OBD_FAIL_LFSCK_ASSISTANT_DIRECT	0x162d
	do_facet "$SINGLEMDS" $LCTL set_param fail_val=3 fail_loc=0x162d
	$START_LAYOUT -r || error "(2) Fail to start LFSCK for layout!"

	STATUS=$($SHOW_LAYOUT | awk '/^status/ { print $2 }')
	[[ "$STATUS" == "scanning-phase1" ]] ||
		error "(3) Expect 'scanning-phase1', but got '$STATUS'"

	stop ost1 > /dev/null || error "(4) Fail to stop OST1!"

	do_facet "$SINGLEMDS" $LCTL set_param fail_loc=0 fail_val=0

	$STOP_LFSCK || error "(5) Fail to stop LFSCK!"
}
run_test 32 "stop LFSCK when some OST failed"
# test_33: start the layout LFSCK with "--dryrun -o -r" and the namespace
# LFSCK with "-e abort -A -r", and after each run check that the "param"
# line of the corresponding status output lists the expected parameters.
#
# Globals read: START_LAYOUT, SHOW_LAYOUT, START_NAMESPACE, SHOW_NAMESPACE.
#
# NOTE(review): the function header and closing brace were absent from this
# copy of the file and are restored here; other short lines (e.g. setup)
# may have been lost as well -- confirm against the upstream script.
test_33() {
	local PARAMS

	# The $START_* / $SHOW_* variables hold command lines and are left
	# unquoted on purpose.
	$START_LAYOUT --dryrun -o -r ||
		error "(1) Fail to start layout LFSCK"
	wait_all_targets_blocked layout completed 2

	PARAMS=$($SHOW_LAYOUT | awk '/^param/ { print $2 }')
	[[ "$PARAMS" == "dryrun,all_targets,orphan" ]] ||
		error "(3) Expect 'dryrun,all_targets,orphan', got '$PARAMS'"

	$START_NAMESPACE -e abort -A -r ||
		error "(4) Fail to start namespace LFSCK"
	wait_all_targets_blocked namespace completed 5

	PARAMS=$($SHOW_NAMESPACE | awk '/^param/ { print $2 }')
	[[ "$PARAMS" == "failout,all_targets" ]] ||
		error "(6) Expect 'failout,all_targets', got '$PARAMS'"
}
run_test 33 "check LFSCK paramters"
# Put back the MDS/OST sizing and OST count that were saved into SAVED_*
# at the top of this script before the tests reduced them.
OSTCOUNT="${SAVED_OSTCOUNT}"
OSTSIZE="${SAVED_OSTSIZE}"
MDSSIZE="${SAVED_MDSSIZE}"
5051 # cleanup the system at last